Merge tag 'powerpc-4.3-2' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc...
author    Linus Torvalds <torvalds@linux-foundation.org>
          Fri, 18 Sep 2015 15:01:06 +0000 (08:01 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
          Fri, 18 Sep 2015 15:01:06 +0000 (08:01 -0700)
Pull powerpc fixes from Michael Ellerman:

 - Fix 32-bit TCE table init in kdump kernel from Nish

 - Fix kdump with non-power-of-2 crashkernel= from Nish

 - Abort cxl_pci_enable_device_hook() if PCI channel is offline from
   Andrew

 - Fix to release DRC when configure_connector() fails from Bharata

 - Wire up sys_userfaultfd()

 - Fix race condition in tearing down MSI interrupts from Paul

 - Fix unbalanced pci_dev_get() in cxl_probe() from Daniel

 - Fix cxl build failure due to -Wunused-variable gcc behaviour change
   from Ian

 - Tell the toolchain to use ABI v2 when building an LE boot wrapper
   from Benh

 - Fix THP to recompute hash value after a failed update from Aneesh

 - 32-bit memcpy/memset: only use dcbz once cache is enabled from
   Christophe

* tag 'powerpc-4.3-2' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux:
  powerpc32: memset: only use dcbz once cache is enabled
  powerpc32: memcpy: only use dcbz once cache is enabled
  powerpc/mm: Recompute hash value after a failed update
  powerpc/boot: Specify ABI v2 when building an LE boot wrapper
  cxl: Fix build failure due to -Wunused-variable behaviour change
  cxl: Fix unbalanced pci_dev_get in cxl_probe
  powerpc/MSI: Fix race condition in tearing down MSI interrupts
  powerpc: Wire up sys_userfaultfd()
  powerpc/pseries: Release DRC when configure_connector fails
  cxl: abort cxl_pci_enable_device_hook() if PCI channel is offline
  powerpc/powernv/pci-ioda: fix kdump with non-power-of-2 crashkernel=
  powerpc/powernv/pci-ioda: fix 32-bit TCE table init in kdump kernel
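[Illustration of the idea behind the memcpy/memset items above: on these 32-bit powerpc cores, dcbz zeroes an entire data-cache line at once, but it cannot be used before the data cache has been enabled, so the fast path has to be gated on state set by early boot code. The C sketch below is not the kernel's actual implementation; cache_enabled, CACHE_LINE_SIZE and zero_line_dcbz are hypothetical stand-ins for the real flag, line size and inline-asm dcbz.]

/* Hedged sketch, not the kernel's code: gate the dcbz fast path on a
 * flag that early boot code sets once the data cache is enabled. */
#include <stddef.h>
#include <stdint.h>

#define CACHE_LINE_SIZE 32          /* assumed line size, for illustration */
static int cache_enabled;           /* set after the d-cache is turned on  */

static void zero_line_dcbz(void *p) /* stand-in for inline-asm "dcbz 0,p"  */
{
    __builtin_memset(p, 0, CACHE_LINE_SIZE);
}

void *sketch_memset(void *s, int c, size_t n)
{
    unsigned char *p = s;

    if (c == 0 && cache_enabled) {
        /* align to a cache line, then zero whole lines with dcbz */
        while (n && ((uintptr_t)p & (CACHE_LINE_SIZE - 1))) {
            *p++ = 0;
            n--;
        }
        while (n >= CACHE_LINE_SIZE) {
            zero_line_dcbz(p);
            p += CACHE_LINE_SIZE;
            n -= CACHE_LINE_SIZE;
        }
    }
    while (n--)                     /* byte loop: the tail, nonzero fills, or
                                       the whole buffer before the cache is up */
        *p++ = (unsigned char)c;
    return s;
}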

1934 files changed:
.gitignore
CREDITS
Documentation/ABI/testing/sysfs-hypervisor-pmu [new file with mode: 0644]
Documentation/Changes
Documentation/DMA-API.txt
Documentation/DocBook/device-drivers.tmpl
Documentation/blockdev/zram.txt
Documentation/cgroups/blkio-controller.txt
Documentation/cgroups/unified-hierarchy.txt
Documentation/devicetree/bindings/hwmon/ina209.txt [new file with mode: 0644]
Documentation/devicetree/bindings/hwmon/ina2xx.txt [new file with mode: 0644]
Documentation/devicetree/bindings/hwmon/max6697.txt [new file with mode: 0644]
Documentation/devicetree/bindings/i2c/i2c-cadence.txt
Documentation/devicetree/bindings/i2c/i2c-emev2.txt [new file with mode: 0644]
Documentation/devicetree/bindings/i2c/i2c-lpc2k.txt [new file with mode: 0644]
Documentation/devicetree/bindings/i2c/i2c-mux-reg.txt [new file with mode: 0644]
Documentation/devicetree/bindings/i2c/i2c.txt [new file with mode: 0644]
Documentation/devicetree/bindings/i2c/ina209.txt [deleted file]
Documentation/devicetree/bindings/i2c/ina2xx.txt [deleted file]
Documentation/devicetree/bindings/i2c/max6697.txt [deleted file]
Documentation/devicetree/bindings/i2c/ti,bq32k.txt [deleted file]
Documentation/devicetree/bindings/i2c/trivial-devices.txt
Documentation/devicetree/bindings/input/touchscreen/colibri-vf50-ts.txt [new file with mode: 0644]
Documentation/devicetree/bindings/input/touchscreen/imx6ul_tsc.txt [new file with mode: 0644]
Documentation/devicetree/bindings/iommu/arm,smmu.txt
Documentation/devicetree/bindings/iommu/ti,omap-iommu.txt
Documentation/devicetree/bindings/ipmi.txt [new file with mode: 0644]
Documentation/devicetree/bindings/mmc/arasan,sdhci.txt
Documentation/devicetree/bindings/mmc/fsl-imx-esdhc.txt
Documentation/devicetree/bindings/mmc/sdhci-atmel.txt [new file with mode: 0644]
Documentation/devicetree/bindings/mmc/ti-omap-hsmmc.txt
Documentation/devicetree/bindings/pwm/lpc1850-sct-pwm.txt [new file with mode: 0644]
Documentation/devicetree/bindings/rtc/nxp,lpc1788-rtc.txt [new file with mode: 0644]
Documentation/devicetree/bindings/rtc/rtc-omap.txt
Documentation/devicetree/bindings/rtc/ti,bq32k.txt [new file with mode: 0644]
Documentation/devicetree/bindings/rtc/xlnx-rtc.txt [new file with mode: 0644]
Documentation/devicetree/bindings/soc/qcom/qcom,smd.txt
Documentation/devicetree/bindings/watchdog/atmel-sama5d4-wdt.txt [new file with mode: 0644]
Documentation/devicetree/bindings/watchdog/lpc18xx-wdt.txt [new file with mode: 0644]
Documentation/features/vm/THP/arch-support.txt
Documentation/filesystems/Locking
Documentation/filesystems/dax.txt
Documentation/filesystems/proc.txt
Documentation/gpio/board.txt
Documentation/gpio/consumer.txt
Documentation/hwmon/nct6775
Documentation/i2c/busses/i2c-parport
Documentation/i2c/slave-interface
Documentation/i2c/ten-bit-addresses
Documentation/infiniband/sysfs.txt
Documentation/ioctl/ioctl-number.txt
Documentation/kbuild/kbuild.txt
Documentation/kernel-parameters.txt
Documentation/module-signing.txt
Documentation/security/Smack.txt
Documentation/security/Yama.txt
Documentation/static-keys.txt
Documentation/sysctl/vm.txt
Documentation/sysrq.txt
Documentation/thermal/sysfs-api.txt
Documentation/trace/ftrace.txt
Documentation/virtual/kvm/api.txt
Documentation/vm/00-INDEX
Documentation/vm/hugetlbpage.txt
Documentation/vm/idle_page_tracking.txt [new file with mode: 0644]
Documentation/vm/pagemap.txt
Documentation/vm/zswap.txt
Documentation/watchdog/src/watchdog-test.c
MAINTAINERS
Makefile
arch/Kconfig
arch/alpha/include/asm/dma-mapping.h
arch/alpha/include/asm/io.h
arch/alpha/kernel/pci-noop.c
arch/alpha/kernel/pci_iommu.c
arch/alpha/lib/udelay.c
arch/arc/plat-axs10x/axs10x.c
arch/arm/Kconfig
arch/arm/Makefile
arch/arm/boot/compressed/decompress.c
arch/arm/boot/dts/am4372.dtsi
arch/arm/boot/dts/am437x-gp-evm.dts
arch/arm/boot/dts/am437x-idk-evm.dts
arch/arm/boot/dts/am437x-sk-evm.dts
arch/arm/boot/dts/exynos3250-monk.dts
arch/arm/boot/dts/exynos3250-rinato.dts
arch/arm/boot/dts/exynos3250.dtsi
arch/arm/boot/dts/exynos4.dtsi
arch/arm/boot/dts/exynos4212.dtsi
arch/arm/boot/dts/exynos4412-odroid-common.dtsi
arch/arm/boot/dts/exynos4412-odroidu3.dts
arch/arm/boot/dts/exynos4412-origen.dts
arch/arm/boot/dts/exynos4412-trats2.dts
arch/arm/boot/dts/exynos4412.dtsi
arch/arm/boot/dts/exynos5250-arndale.dts
arch/arm/boot/dts/exynos5250-smdk5250.dts
arch/arm/boot/dts/exynos5250-snow.dts
arch/arm/boot/dts/exynos5250-spring.dts
arch/arm/boot/dts/exynos5250.dtsi
arch/arm/boot/dts/exynos5422-cpus.dtsi [new file with mode: 0644]
arch/arm/boot/dts/exynos5422-odroidxu3-common.dtsi
arch/arm/boot/dts/qcom-apq8064-cm-qs600.dts
arch/arm/boot/dts/qcom-apq8064-ifc6410.dts
arch/arm/boot/dts/qcom-apq8074-dragonboard.dts
arch/arm/boot/dts/qcom-apq8084-ifc6540.dts
arch/arm/boot/dts/qcom-apq8084-mtp.dts
arch/arm/boot/dts/qcom-apq8084.dtsi
arch/arm/boot/dts/qcom-ipq8064-ap148.dts
arch/arm/boot/dts/qcom-ipq8064.dtsi
arch/arm/boot/dts/qcom-msm8660-surf.dts
arch/arm/boot/dts/qcom-msm8660.dtsi
arch/arm/boot/dts/qcom-msm8960-cdp.dts
arch/arm/boot/dts/qcom-msm8960.dtsi
arch/arm/boot/dts/qcom-msm8974-sony-xperia-honami.dts
arch/arm/boot/dts/qcom-msm8974.dtsi
arch/arm/configs/cm_x2xx_defconfig
arch/arm/configs/em_x270_defconfig
arch/arm/configs/exynos_defconfig
arch/arm/configs/magician_defconfig
arch/arm/configs/multi_v7_defconfig
arch/arm/configs/palmz72_defconfig
arch/arm/configs/pcm027_defconfig
arch/arm/configs/trizeps4_defconfig
arch/arm/include/asm/assembler.h
arch/arm/include/asm/bug.h
arch/arm/include/asm/dma-mapping.h
arch/arm/include/asm/domain.h
arch/arm/include/asm/irq.h
arch/arm/include/asm/kvm_host.h
arch/arm/include/asm/memory.h
arch/arm/include/asm/thread_info.h
arch/arm/include/asm/xen/events.h
arch/arm/include/asm/xen/page.h
arch/arm/kernel/process.c
arch/arm/kernel/smp.c
arch/arm/kvm/arm.c
arch/arm/kvm/guest.c
arch/arm/kvm/interrupts.S
arch/arm/kvm/reset.c
arch/arm/mach-clps711x/board-cdb89712.c
arch/arm/mach-exynos/Kconfig
arch/arm/mach-exynos/exynos.c
arch/arm/mach-mmp/include/mach/regs-rtc.h [deleted file]
arch/arm/mach-pxa/devices.c
arch/arm/mach-pxa/pxa27x.c
arch/arm/mach-pxa/pxa3xx.c
arch/arm/mach-sa1100/include/mach/SA-1100.h
arch/arm/mach-shmobile/pm-rcar.c
arch/arm/mm/dma-mapping.c
arch/arm/nwfpe/entry.S
arch/arm/xen/enlighten.c
arch/arm/xen/hypercall.S
arch/arm/xen/mm.c
arch/arm64/Kconfig
arch/arm64/Makefile
arch/arm64/include/asm/dma-mapping.h
arch/arm64/include/asm/hw_breakpoint.h
arch/arm64/include/asm/kvm_arm.h
arch/arm64/include/asm/kvm_asm.h
arch/arm64/include/asm/kvm_host.h
arch/arm64/include/asm/memory.h
arch/arm64/include/asm/pgtable.h
arch/arm64/include/asm/xen/events.h
arch/arm64/include/uapi/asm/kvm.h
arch/arm64/kernel/asm-offsets.c
arch/arm64/kernel/debug-monitors.c
arch/arm64/kernel/head.S
arch/arm64/kernel/hw_breakpoint.c
arch/arm64/kernel/module.c
arch/arm64/kernel/setup.c
arch/arm64/kernel/signal32.c
arch/arm64/kvm/Makefile
arch/arm64/kvm/debug.c [new file with mode: 0644]
arch/arm64/kvm/guest.c
arch/arm64/kvm/handle_exit.c
arch/arm64/kvm/hyp.S
arch/arm64/kvm/reset.c
arch/arm64/kvm/sys_regs.c
arch/arm64/kvm/sys_regs.h
arch/arm64/kvm/sys_regs_generic_v8.c
arch/arm64/kvm/trace.h
arch/arm64/mm/dma-mapping.c
arch/cris/Kconfig
arch/cris/arch-v10/kernel/entry.S
arch/cris/arch-v10/lib/dmacopy.c [deleted file]
arch/cris/arch-v10/lib/old_checksum.c [deleted file]
arch/cris/arch-v32/drivers/Kconfig
arch/cris/arch-v32/drivers/axisflashmap.c
arch/cris/arch-v32/drivers/mach-a3/gpio.c
arch/cris/arch-v32/drivers/mach-fs/gpio.c
arch/cris/arch-v32/kernel/entry.S
arch/cris/arch-v32/kernel/process.c
arch/cris/arch-v32/kernel/signal.c
arch/cris/arch-v32/mach-fs/pinmux.c
arch/cris/configs/artpec_3_defconfig
arch/cris/configs/etraxfs_defconfig
arch/cris/include/arch-v10/arch/elf.h [deleted file]
arch/cris/include/arch-v10/arch/ptrace.h [deleted file]
arch/cris/include/arch-v32/arch/bug.h
arch/cris/include/arch-v32/arch/elf.h [deleted file]
arch/cris/include/arch-v32/arch/irqflags.h
arch/cris/include/arch-v32/arch/ptrace.h [deleted file]
arch/cris/include/asm/Kbuild
arch/cris/include/asm/elf.h [deleted file]
arch/cris/include/asm/mmu_context.h
arch/cris/include/asm/stacktrace.h [new file with mode: 0644]
arch/cris/include/asm/types.h [deleted file]
arch/cris/include/asm/unistd.h
arch/cris/include/uapi/asm/Kbuild
arch/cris/include/uapi/asm/auxvec.h [deleted file]
arch/cris/include/uapi/asm/bitsperlong.h [deleted file]
arch/cris/include/uapi/asm/elf.h [new file with mode: 0644]
arch/cris/include/uapi/asm/elf_v10.h [new file with mode: 0644]
arch/cris/include/uapi/asm/elf_v32.h [new file with mode: 0644]
arch/cris/include/uapi/asm/errno.h [deleted file]
arch/cris/include/uapi/asm/fcntl.h [deleted file]
arch/cris/include/uapi/asm/ioctl.h [deleted file]
arch/cris/include/uapi/asm/ipcbuf.h [deleted file]
arch/cris/include/uapi/asm/kvm_para.h [deleted file]
arch/cris/include/uapi/asm/mman.h [deleted file]
arch/cris/include/uapi/asm/msgbuf.h [deleted file]
arch/cris/include/uapi/asm/poll.h [deleted file]
arch/cris/include/uapi/asm/ptrace.h
arch/cris/include/uapi/asm/ptrace_v10.h [new file with mode: 0644]
arch/cris/include/uapi/asm/ptrace_v32.h [new file with mode: 0644]
arch/cris/include/uapi/asm/resource.h [deleted file]
arch/cris/include/uapi/asm/sembuf.h [deleted file]
arch/cris/include/uapi/asm/shmbuf.h [deleted file]
arch/cris/include/uapi/asm/siginfo.h [deleted file]
arch/cris/include/uapi/asm/socket.h [deleted file]
arch/cris/include/uapi/asm/sockios.h [deleted file]
arch/cris/include/uapi/asm/statfs.h [deleted file]
arch/cris/include/uapi/asm/types.h [deleted file]
arch/cris/include/uapi/asm/unistd.h
arch/cris/kernel/Makefile
arch/cris/kernel/irq.c
arch/cris/kernel/stacktrace.c [new file with mode: 0644]
arch/h8300/boot/compressed/misc.c
arch/h8300/include/asm/dma-mapping.h
arch/hexagon/include/asm/dma-mapping.h
arch/hexagon/include/uapi/asm/signal.h
arch/hexagon/kernel/dma.c
arch/hexagon/kernel/time.c
arch/ia64/Kconfig
arch/ia64/hp/common/sba_iommu.c
arch/ia64/include/asm/dma-mapping.h
arch/ia64/include/asm/io.h
arch/ia64/include/asm/unistd.h
arch/ia64/include/uapi/asm/unistd.h
arch/ia64/kernel/cyclone.c
arch/ia64/kernel/entry.S
arch/ia64/kernel/uncached.c
arch/ia64/mm/init.c
arch/ia64/sn/pci/pci_dma.c
arch/m32r/boot/compressed/misc.c
arch/m68k/Kconfig
arch/m68k/coldfire/m54xx.c
arch/m68k/coldfire/pit.c
arch/metag/include/asm/ftrace.h
arch/metag/kernel/irq.c
arch/microblaze/include/asm/dma-mapping.h
arch/microblaze/include/asm/ftrace.h
arch/microblaze/include/uapi/asm/elf.h
arch/mips/Kconfig
arch/mips/boot/compressed/decompress.c
arch/mips/cavium-octeon/dma-octeon.c
arch/mips/configs/pistachio_defconfig
arch/mips/include/asm/dma-mapping.h
arch/mips/loongson64/common/dma-swiotlb.c
arch/mips/mm/dma-default.c
arch/mips/netlogic/common/nlm-dma.c
arch/nios2/boot/dts/10m50_devboard.dts [new file with mode: 0755]
arch/nios2/configs/10m50_defconfig [new file with mode: 0755]
arch/nios2/kernel/misaligned.c
arch/nios2/kernel/time.c
arch/openrisc/include/asm/dma-mapping.h
arch/parisc/include/asm/io.h
arch/parisc/kernel/irq.c
arch/parisc/kernel/syscall.S
arch/parisc/kernel/time.c
arch/parisc/mm/fault.c
arch/powerpc/Kconfig
arch/powerpc/Makefile
arch/powerpc/include/asm/dma-mapping.h
arch/powerpc/include/asm/ftrace.h
arch/powerpc/include/asm/kvm_book3s.h
arch/powerpc/include/asm/kvm_book3s_asm.h
arch/powerpc/include/asm/kvm_booke.h
arch/powerpc/include/asm/kvm_host.h
arch/powerpc/include/asm/ppc-opcode.h
arch/powerpc/kernel/asm-offsets.c
arch/powerpc/kernel/pci_of_scan.c
arch/powerpc/kvm/Kconfig
arch/powerpc/kvm/book3s.c
arch/powerpc/kvm/book3s_32_mmu_host.c
arch/powerpc/kvm/book3s_64_mmu_host.c
arch/powerpc/kvm/book3s_64_mmu_hv.c
arch/powerpc/kvm/book3s_emulate.c
arch/powerpc/kvm/book3s_hv.c
arch/powerpc/kvm/book3s_hv_builtin.c
arch/powerpc/kvm/book3s_hv_rm_mmu.c
arch/powerpc/kvm/book3s_hv_rm_xics.c
arch/powerpc/kvm/book3s_hv_rmhandlers.S
arch/powerpc/kvm/book3s_paired_singles.c
arch/powerpc/kvm/book3s_segment.S
arch/powerpc/kvm/book3s_xics.c
arch/powerpc/kvm/booke.c
arch/powerpc/kvm/e500_mmu.c
arch/powerpc/kvm/powerpc.c
arch/powerpc/mm/mem.c
arch/powerpc/platforms/cell/ras.c
arch/powerpc/sysdev/axonram.c
arch/s390/Kconfig
arch/s390/boot/compressed/misc.c
arch/s390/include/asm/dma-mapping.h
arch/s390/mm/init.c
arch/s390/pci/pci_dma.c
arch/sh/Kconfig
arch/sh/boot/compressed/misc.c
arch/sh/include/asm/dma-mapping.h
arch/sh/include/asm/ftrace.h
arch/sh/include/asm/io.h
arch/sh/mm/init.c
arch/sparc/include/asm/dma-mapping.h
arch/sparc/include/asm/ftrace.h
arch/sparc/include/asm/pgtable_32.h
arch/sparc/kernel/pci.c
arch/tile/Kconfig
arch/tile/include/asm/dma-mapping.h
arch/tile/mm/init.c
arch/unicore32/boot/compressed/misc.c
arch/unicore32/include/asm/dma-mapping.h
arch/unicore32/include/asm/memory.h
arch/x86/Kconfig
arch/x86/boot/compressed/misc.c
arch/x86/boot/header.S
arch/x86/crypto/ghash-clmulni-intel_glue.c
arch/x86/entry/syscalls/syscall_32.tbl
arch/x86/entry/syscalls/syscall_64.tbl
arch/x86/entry/vsyscall/vsyscall_64.c
arch/x86/include/asm/cacheflush.h
arch/x86/include/asm/cpufeature.h
arch/x86/include/asm/dma-mapping.h
arch/x86/include/asm/ftrace.h
arch/x86/include/asm/io.h
arch/x86/include/asm/kdebug.h
arch/x86/include/asm/paravirt_types.h
arch/x86/include/asm/pmem.h [new file with mode: 0644]
arch/x86/include/asm/qspinlock.h
arch/x86/include/asm/xen/events.h
arch/x86/include/asm/xen/hypercall.h
arch/x86/include/asm/xen/interface.h
arch/x86/include/asm/xen/page.h
arch/x86/include/uapi/asm/e820.h
arch/x86/kernel/Makefile
arch/x86/kernel/alternative.c
arch/x86/kernel/apic/apic.c
arch/x86/kernel/apic/hw_nmi.c
arch/x86/kernel/apic/io_apic.c
arch/x86/kernel/cpu/common.c
arch/x86/kernel/cpu/perf_event_intel.c
arch/x86/kernel/cpu/perf_event_intel_bts.c
arch/x86/kernel/kexec-bzimage64.c
arch/x86/kernel/kvmclock.c
arch/x86/kernel/ldt.c
arch/x86/kernel/pci-dma.c
arch/x86/kernel/pmem.c
arch/x86/kernel/reboot.c
arch/x86/kernel/setup.c
arch/x86/kernel/tsc.c
arch/x86/kernel/vm86_32.c
arch/x86/kernel/vmlinux.lds.S
arch/x86/kvm/emulate.c
arch/x86/kvm/mmu.c
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
arch/x86/mm/init_32.c
arch/x86/mm/init_64.c
arch/x86/mm/mpx.c
arch/x86/mm/numa.c
arch/x86/mm/srat.c
arch/x86/platform/efi/efi.c
arch/x86/platform/uv/uv_nmi.c
arch/x86/xen/Kconfig
arch/x86/xen/Makefile
arch/x86/xen/apic.c
arch/x86/xen/enlighten.c
arch/x86/xen/mmu.c
arch/x86/xen/p2m.c
arch/x86/xen/p2m.h [deleted file]
arch/x86/xen/platform-pci-unplug.c
arch/x86/xen/pmu.c [new file with mode: 0644]
arch/x86/xen/pmu.h [new file with mode: 0644]
arch/x86/xen/setup.c
arch/x86/xen/smp.c
arch/x86/xen/suspend.c
arch/x86/xen/xen-head.S
arch/x86/xen/xen-ops.h
arch/xtensa/include/asm/dma-mapping.h
arch/xtensa/include/asm/io.h
block/bio.c
block/blk-cgroup.c
block/blk-core.c
block/blk-throttle.c
block/blk.h
block/cfq-iosched.c
certs/Kconfig [new file with mode: 0644]
certs/Makefile [new file with mode: 0644]
certs/system_certificates.S [new file with mode: 0644]
certs/system_keyring.c [new file with mode: 0644]
crypto/Kconfig
crypto/asymmetric_keys/Makefile
crypto/asymmetric_keys/asymmetric_type.c
crypto/asymmetric_keys/mscode_parser.c
crypto/asymmetric_keys/pkcs7.asn1
crypto/asymmetric_keys/pkcs7_key_type.c
crypto/asymmetric_keys/pkcs7_parser.c
crypto/asymmetric_keys/pkcs7_parser.h
crypto/asymmetric_keys/pkcs7_trust.c
crypto/asymmetric_keys/pkcs7_verify.c
crypto/asymmetric_keys/public_key.c
crypto/asymmetric_keys/verify_pefile.c
crypto/asymmetric_keys/x509_akid.asn1 [new file with mode: 0644]
crypto/asymmetric_keys/x509_cert_parser.c
crypto/asymmetric_keys/x509_parser.h
crypto/asymmetric_keys/x509_public_key.c
crypto/testmgr.c
drivers/acpi/Kconfig
drivers/acpi/acpi_pnp.c
drivers/acpi/nfit.c
drivers/acpi/nfit.h
drivers/acpi/thermal.c
drivers/android/binder.c
drivers/base/power/domain.c
drivers/base/power/opp.c
drivers/base/property.c
drivers/base/regmap/internal.h
drivers/base/regmap/regcache.c
drivers/base/regmap/regmap-ac97.c
drivers/base/regmap/regmap-debugfs.c
drivers/base/regmap/regmap-i2c.c
drivers/base/regmap/regmap-irq.c
drivers/base/regmap/regmap-mmio.c
drivers/base/regmap/regmap-spi.c
drivers/base/regmap/regmap-spmi.c
drivers/base/regmap/regmap.c
drivers/block/brd.c
drivers/block/rbd.c
drivers/block/virtio_blk.c
drivers/block/xen-blkfront.c
drivers/block/zram/zcomp.c
drivers/block/zram/zram_drv.c
drivers/block/zram/zram_drv.h
drivers/bus/vexpress-config.c
drivers/char/ipmi/ipmi_bt_sm.c
drivers/char/ipmi/ipmi_kcs_sm.c
drivers/char/ipmi/ipmi_msghandler.c
drivers/char/ipmi/ipmi_powernv.c
drivers/char/ipmi/ipmi_si_intf.c
drivers/char/ipmi/ipmi_si_sm.h
drivers/char/ipmi/ipmi_smic_sm.c
drivers/char/ipmi/ipmi_ssif.c
drivers/clk/h8300/clk-h8s2678.c
drivers/clk/hisilicon/Kconfig
drivers/clk/hisilicon/Makefile
drivers/clk/rockchip/clk-rk3188.c
drivers/clk/samsung/clk-exynos4.c
drivers/clk/shmobile/clk-emev2.c
drivers/cpufreq/Kconfig.arm
drivers/cpufreq/Makefile
drivers/cpufreq/cpufreq-dt.c
drivers/cpufreq/cpufreq.c
drivers/cpufreq/exynos-cpufreq.c [deleted file]
drivers/cpufreq/exynos-cpufreq.h [deleted file]
drivers/cpufreq/exynos4x12-cpufreq.c [deleted file]
drivers/cpufreq/exynos5250-cpufreq.c [deleted file]
drivers/cpufreq/intel_pstate.c
drivers/cpuidle/coupled.c
drivers/cpuidle/cpuidle.h
drivers/cpuidle/driver.c
drivers/crypto/Kconfig
drivers/crypto/qat/qat_common/adf_transport_debug.c
drivers/crypto/sunxi-ss/sun4i-ss-cipher.c
drivers/edac/sb_edac.c
drivers/firmware/efi/Kconfig
drivers/gpio/Kconfig
drivers/gpio/gpio-mxc.c
drivers/gpio/gpio-mxs.c
drivers/gpio/gpio-omap.c
drivers/gpio/gpio-sx150x.c
drivers/gpio/gpiolib.c
drivers/gpu/drm/drm_atomic.c
drivers/gpu/drm/drm_dp_helper.c
drivers/gpu/drm/exynos/Kconfig
drivers/gpu/drm/exynos/exynos_drm_g2d.c
drivers/gpu/drm/exynos/exynos_drm_gem.c
drivers/gpu/drm/i915/i915_drv.h
drivers/gpu/drm/i915/i915_gem_execbuffer.c
drivers/gpu/drm/i915/i915_irq.c
drivers/gpu/drm/i915/intel_csr.c
drivers/gpu/drm/i915/intel_display.c
drivers/gpu/drm/i915/intel_dp_mst.c
drivers/gpu/drm/i915/intel_dsi.c
drivers/gpu/drm/i915/intel_pm.c
drivers/gpu/drm/nouveau/nvkm/engine/device/pci.c
drivers/gpu/drm/nouveau/nvkm/engine/gr/nv04.c
drivers/gpu/drm/nouveau/nvkm/subdev/clk/gt215.c
drivers/gpu/drm/qxl/qxl_display.c
drivers/gpu/drm/qxl/qxl_drv.h
drivers/gpu/drm/vgem/vgem_drv.c
drivers/hsi/clients/cmt_speech.c
drivers/hwmon/Kconfig
drivers/hwmon/lm75.c
drivers/hwmon/nct6775.c
drivers/hwmon/ntc_thermistor.c
drivers/hwmon/tmp102.c
drivers/i2c/busses/Kconfig
drivers/i2c/busses/Makefile
drivers/i2c/busses/i2c-cadence.c
drivers/i2c/busses/i2c-designware-core.c
drivers/i2c/busses/i2c-designware-pcidrv.c
drivers/i2c/busses/i2c-emev2.c [new file with mode: 0644]
drivers/i2c/busses/i2c-lpc2k.c [new file with mode: 0644]
drivers/i2c/busses/i2c-mt65xx.c
drivers/i2c/busses/i2c-omap.c
drivers/i2c/busses/i2c-parport.c
drivers/i2c/busses/i2c-parport.h
drivers/i2c/busses/i2c-pxa.c
drivers/i2c/busses/i2c-tegra.c
drivers/i2c/busses/i2c-viperboard.c
drivers/i2c/busses/i2c-xgene-slimpro.c
drivers/i2c/busses/i2c-xiic.c
drivers/i2c/i2c-core.c
drivers/i2c/i2c-slave-eeprom.c
drivers/i2c/muxes/Kconfig
drivers/i2c/muxes/Makefile
drivers/i2c/muxes/i2c-arb-gpio-challenge.c
drivers/i2c/muxes/i2c-mux-gpio.c
drivers/i2c/muxes/i2c-mux-pca9541.c
drivers/i2c/muxes/i2c-mux-pca954x.c
drivers/i2c/muxes/i2c-mux-pinctrl.c
drivers/i2c/muxes/i2c-mux-reg.c [new file with mode: 0644]
drivers/infiniband/Kconfig
drivers/infiniband/core/Makefile
drivers/infiniband/core/cache.c
drivers/infiniband/core/cm.c
drivers/infiniband/core/cma.c
drivers/infiniband/core/core_priv.h
drivers/infiniband/core/device.c
drivers/infiniband/core/mad.c
drivers/infiniband/core/mad_priv.h
drivers/infiniband/core/multicast.c
drivers/infiniband/core/netlink.c
drivers/infiniband/core/roce_gid_mgmt.c [new file with mode: 0644]
drivers/infiniband/core/sa_query.c
drivers/infiniband/core/sysfs.c
drivers/infiniband/core/ucm.c
drivers/infiniband/core/ucma.c
drivers/infiniband/core/user_mad.c
drivers/infiniband/core/uverbs.h
drivers/infiniband/core/uverbs_cmd.c
drivers/infiniband/core/uverbs_main.c
drivers/infiniband/core/verbs.c
drivers/infiniband/hw/Makefile
drivers/infiniband/hw/amso1100/Kbuild [deleted file]
drivers/infiniband/hw/amso1100/Kconfig [deleted file]
drivers/infiniband/hw/amso1100/c2.c [deleted file]
drivers/infiniband/hw/amso1100/c2.h [deleted file]
drivers/infiniband/hw/amso1100/c2_ae.c [deleted file]
drivers/infiniband/hw/amso1100/c2_ae.h [deleted file]
drivers/infiniband/hw/amso1100/c2_alloc.c [deleted file]
drivers/infiniband/hw/amso1100/c2_cm.c [deleted file]
drivers/infiniband/hw/amso1100/c2_cq.c [deleted file]
drivers/infiniband/hw/amso1100/c2_intr.c [deleted file]
drivers/infiniband/hw/amso1100/c2_mm.c [deleted file]
drivers/infiniband/hw/amso1100/c2_mq.c [deleted file]
drivers/infiniband/hw/amso1100/c2_mq.h [deleted file]
drivers/infiniband/hw/amso1100/c2_pd.c [deleted file]
drivers/infiniband/hw/amso1100/c2_provider.c [deleted file]
drivers/infiniband/hw/amso1100/c2_provider.h [deleted file]
drivers/infiniband/hw/amso1100/c2_qp.c [deleted file]
drivers/infiniband/hw/amso1100/c2_rnic.c [deleted file]
drivers/infiniband/hw/amso1100/c2_status.h [deleted file]
drivers/infiniband/hw/amso1100/c2_user.h [deleted file]
drivers/infiniband/hw/amso1100/c2_vq.c [deleted file]
drivers/infiniband/hw/amso1100/c2_vq.h [deleted file]
drivers/infiniband/hw/amso1100/c2_wr.h [deleted file]
drivers/infiniband/hw/cxgb3/iwch_provider.c
drivers/infiniband/hw/cxgb4/cm.c
drivers/infiniband/hw/cxgb4/iw_cxgb4.h
drivers/infiniband/hw/cxgb4/mem.c
drivers/infiniband/hw/cxgb4/provider.c
drivers/infiniband/hw/ehca/Kconfig [deleted file]
drivers/infiniband/hw/ehca/Makefile [deleted file]
drivers/infiniband/hw/ehca/ehca_av.c [deleted file]
drivers/infiniband/hw/ehca/ehca_classes.h [deleted file]
drivers/infiniband/hw/ehca/ehca_classes_pSeries.h [deleted file]
drivers/infiniband/hw/ehca/ehca_cq.c [deleted file]
drivers/infiniband/hw/ehca/ehca_eq.c [deleted file]
drivers/infiniband/hw/ehca/ehca_hca.c [deleted file]
drivers/infiniband/hw/ehca/ehca_irq.c [deleted file]
drivers/infiniband/hw/ehca/ehca_irq.h [deleted file]
drivers/infiniband/hw/ehca/ehca_iverbs.h [deleted file]
drivers/infiniband/hw/ehca/ehca_main.c [deleted file]
drivers/infiniband/hw/ehca/ehca_mcast.c [deleted file]
drivers/infiniband/hw/ehca/ehca_mrmw.c [deleted file]
drivers/infiniband/hw/ehca/ehca_mrmw.h [deleted file]
drivers/infiniband/hw/ehca/ehca_pd.c [deleted file]
drivers/infiniband/hw/ehca/ehca_qes.h [deleted file]
drivers/infiniband/hw/ehca/ehca_qp.c [deleted file]
drivers/infiniband/hw/ehca/ehca_reqs.c [deleted file]
drivers/infiniband/hw/ehca/ehca_sqp.c [deleted file]
drivers/infiniband/hw/ehca/ehca_tools.h [deleted file]
drivers/infiniband/hw/ehca/ehca_uverbs.c [deleted file]
drivers/infiniband/hw/ehca/hcp_if.c [deleted file]
drivers/infiniband/hw/ehca/hcp_if.h [deleted file]
drivers/infiniband/hw/ehca/hcp_phyp.c [deleted file]
drivers/infiniband/hw/ehca/hcp_phyp.h [deleted file]
drivers/infiniband/hw/ehca/hipz_fns.h [deleted file]
drivers/infiniband/hw/ehca/hipz_fns_core.h [deleted file]
drivers/infiniband/hw/ehca/hipz_hw.h [deleted file]
drivers/infiniband/hw/ehca/ipz_pt_fn.c [deleted file]
drivers/infiniband/hw/ehca/ipz_pt_fn.h [deleted file]
drivers/infiniband/hw/ipath/Kconfig [deleted file]
drivers/infiniband/hw/ipath/Makefile [deleted file]
drivers/infiniband/hw/ipath/ipath_common.h [deleted file]
drivers/infiniband/hw/ipath/ipath_cq.c [deleted file]
drivers/infiniband/hw/ipath/ipath_debug.h [deleted file]
drivers/infiniband/hw/ipath/ipath_diag.c [deleted file]
drivers/infiniband/hw/ipath/ipath_dma.c [deleted file]
drivers/infiniband/hw/ipath/ipath_driver.c [deleted file]
drivers/infiniband/hw/ipath/ipath_eeprom.c [deleted file]
drivers/infiniband/hw/ipath/ipath_file_ops.c [deleted file]
drivers/infiniband/hw/ipath/ipath_fs.c [deleted file]
drivers/infiniband/hw/ipath/ipath_iba6110.c [deleted file]
drivers/infiniband/hw/ipath/ipath_init_chip.c [deleted file]
drivers/infiniband/hw/ipath/ipath_intr.c [deleted file]
drivers/infiniband/hw/ipath/ipath_kernel.h [deleted file]
drivers/infiniband/hw/ipath/ipath_keys.c [deleted file]
drivers/infiniband/hw/ipath/ipath_mad.c [deleted file]
drivers/infiniband/hw/ipath/ipath_mmap.c [deleted file]
drivers/infiniband/hw/ipath/ipath_mr.c [deleted file]
drivers/infiniband/hw/ipath/ipath_qp.c [deleted file]
drivers/infiniband/hw/ipath/ipath_rc.c [deleted file]
drivers/infiniband/hw/ipath/ipath_registers.h [deleted file]
drivers/infiniband/hw/ipath/ipath_ruc.c [deleted file]
drivers/infiniband/hw/ipath/ipath_sdma.c [deleted file]
drivers/infiniband/hw/ipath/ipath_srq.c [deleted file]
drivers/infiniband/hw/ipath/ipath_stats.c [deleted file]
drivers/infiniband/hw/ipath/ipath_sysfs.c [deleted file]
drivers/infiniband/hw/ipath/ipath_uc.c [deleted file]
drivers/infiniband/hw/ipath/ipath_ud.c [deleted file]
drivers/infiniband/hw/ipath/ipath_user_pages.c [deleted file]
drivers/infiniband/hw/ipath/ipath_user_sdma.c [deleted file]
drivers/infiniband/hw/ipath/ipath_user_sdma.h [deleted file]
drivers/infiniband/hw/ipath/ipath_verbs.c [deleted file]
drivers/infiniband/hw/ipath/ipath_verbs.h [deleted file]
drivers/infiniband/hw/ipath/ipath_verbs_mcast.c [deleted file]
drivers/infiniband/hw/ipath/ipath_wc_ppc64.c [deleted file]
drivers/infiniband/hw/ipath/ipath_wc_x86_64.c [deleted file]
drivers/infiniband/hw/mlx4/ah.c
drivers/infiniband/hw/mlx4/cq.c
drivers/infiniband/hw/mlx4/mad.c
drivers/infiniband/hw/mlx4/main.c
drivers/infiniband/hw/mlx4/mcg.c
drivers/infiniband/hw/mlx4/mlx4_ib.h
drivers/infiniband/hw/mlx4/mr.c
drivers/infiniband/hw/mlx4/qp.c
drivers/infiniband/hw/mlx4/sysfs.c
drivers/infiniband/hw/mlx5/cq.c
drivers/infiniband/hw/mlx5/main.c
drivers/infiniband/hw/mlx5/mlx5_ib.h
drivers/infiniband/hw/mlx5/mr.c
drivers/infiniband/hw/mlx5/qp.c
drivers/infiniband/hw/mthca/mthca_provider.c
drivers/infiniband/hw/nes/nes_verbs.c
drivers/infiniband/hw/ocrdma/ocrdma.h
drivers/infiniband/hw/ocrdma/ocrdma_main.c
drivers/infiniband/hw/ocrdma/ocrdma_sli.h
drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
drivers/infiniband/hw/qib/qib_file_ops.c
drivers/infiniband/hw/qib/qib_keys.c
drivers/infiniband/hw/qib/qib_mad.h
drivers/infiniband/hw/qib/qib_mmap.c
drivers/infiniband/hw/qib/qib_mr.c
drivers/infiniband/hw/qib/qib_ruc.c
drivers/infiniband/hw/qib/qib_verbs.c
drivers/infiniband/hw/qib/qib_verbs.h
drivers/infiniband/ulp/ipoib/ipoib.h
drivers/infiniband/ulp/ipoib/ipoib_cm.c
drivers/infiniband/ulp/ipoib/ipoib_main.c
drivers/infiniband/ulp/ipoib/ipoib_multicast.c
drivers/infiniband/ulp/ipoib/ipoib_verbs.c
drivers/infiniband/ulp/iser/iscsi_iser.c
drivers/infiniband/ulp/iser/iscsi_iser.h
drivers/infiniband/ulp/iser/iser_initiator.c
drivers/infiniband/ulp/iser/iser_memory.c
drivers/infiniband/ulp/iser/iser_verbs.c
drivers/infiniband/ulp/isert/ib_isert.c
drivers/infiniband/ulp/isert/ib_isert.h
drivers/infiniband/ulp/srp/ib_srp.c
drivers/infiniband/ulp/srp/ib_srp.h
drivers/infiniband/ulp/srpt/ib_srpt.c
drivers/infiniband/ulp/srpt/ib_srpt.h
drivers/input/evdev.c
drivers/input/keyboard/imx_keypad.c
drivers/input/misc/ab8500-ponkey.c
drivers/input/misc/pwm-beeper.c
drivers/input/misc/regulator-haptic.c
drivers/input/misc/sparcspkr.c
drivers/input/misc/xen-kbdfront.c
drivers/input/mouse/elan_i2c_core.c
drivers/input/serio/i8042.c
drivers/input/touchscreen/Kconfig
drivers/input/touchscreen/Makefile
drivers/input/touchscreen/colibri-vf50-ts.c [new file with mode: 0644]
drivers/input/touchscreen/cyttsp4_i2c.c
drivers/input/touchscreen/cyttsp_i2c.c
drivers/input/touchscreen/elants_i2c.c
drivers/input/touchscreen/imx6ul_tsc.c [new file with mode: 0644]
drivers/input/touchscreen/sun4i-ts.c
drivers/iommu/Kconfig
drivers/iommu/amd_iommu.c
drivers/iommu/amd_iommu_init.c
drivers/iommu/amd_iommu_v2.c
drivers/iommu/arm-smmu-v3.c
drivers/iommu/arm-smmu.c
drivers/iommu/dmar.c
drivers/iommu/fsl_pamu.c
drivers/iommu/intel-iommu.c
drivers/iommu/intel_irq_remapping.c
drivers/iommu/io-pgtable-arm.c
drivers/iommu/io-pgtable.c
drivers/iommu/io-pgtable.h
drivers/iommu/ipmmu-vmsa.c
drivers/iommu/irq_remapping.c
drivers/iommu/msm_iommu.c
drivers/iommu/of_iommu.c
drivers/iommu/omap-iommu-debug.c
drivers/iommu/omap-iommu.c
drivers/iommu/omap-iommu.h
drivers/iommu/omap-iopgtable.h
drivers/iommu/tegra-smmu.c
drivers/irqchip/irq-gic-v3.c
drivers/irqchip/irq-gic.c
drivers/isdn/icn/icn.h
drivers/md/Kconfig
drivers/md/dm-mpath.c
drivers/media/platform/omap/Kconfig
drivers/media/platform/omap/omap_vout.c
drivers/media/v4l2-core/Kconfig
drivers/media/v4l2-core/videobuf2-core.c
drivers/media/v4l2-core/videobuf2-dma-contig.c
drivers/media/v4l2-core/videobuf2-dma-sg.c
drivers/media/v4l2-core/videobuf2-memops.c
drivers/media/v4l2-core/videobuf2-vmalloc.c
drivers/memory/tegra/tegra114.c
drivers/memory/tegra/tegra124.c
drivers/memory/tegra/tegra30.c
drivers/misc/eeprom/at24.c
drivers/misc/eeprom/max6875.c
drivers/misc/genwqe/card_dev.c
drivers/misc/mei/wd.c
drivers/misc/sgi-xp/xpc_uv.c
drivers/mmc/card/block.c
drivers/mmc/core/core.c
drivers/mmc/core/host.c
drivers/mmc/host/Kconfig
drivers/mmc/host/Makefile
drivers/mmc/host/android-goldfish.c
drivers/mmc/host/atmel-mci.c
drivers/mmc/host/dw_mmc-rockchip.c
drivers/mmc/host/dw_mmc.c
drivers/mmc/host/omap.c
drivers/mmc/host/omap_hsmmc.c
drivers/mmc/host/pxamci.c
drivers/mmc/host/sdhci-esdhc-imx.c
drivers/mmc/host/sdhci-esdhc.h
drivers/mmc/host/sdhci-msm.c
drivers/mmc/host/sdhci-of-arasan.c
drivers/mmc/host/sdhci-of-at91.c [new file with mode: 0644]
drivers/mmc/host/sdhci-of-esdhc.c
drivers/mmc/host/sdhci-pci.c
drivers/mmc/host/sdhci-sirf.c
drivers/mmc/host/sdhci.c
drivers/mmc/host/sdhci.h
drivers/mmc/host/sh_mmcif.c
drivers/mmc/host/sunxi-mmc.c
drivers/mmc/host/tmio_mmc_pio.c
drivers/mmc/host/usdhi6rol0.c
drivers/mtd/devices/slram.c
drivers/mtd/nand/Makefile
drivers/mtd/nand/diskonchip.c
drivers/mtd/onenand/generic.c
drivers/mtd/spi-nor/spi-nor.c
drivers/net/bonding/bond_options.c
drivers/net/dsa/bcm_sf2.c
drivers/net/dsa/bcm_sf2.h
drivers/net/dsa/mv88e6171.c
drivers/net/ethernet/altera/altera_tse_main.c
drivers/net/ethernet/cavium/liquidio/lio_main.c
drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
drivers/net/ethernet/chelsio/cxgb4/sge.c
drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
drivers/net/ethernet/chelsio/cxgb4/t4fw_version.h
drivers/net/ethernet/davicom/dm9000.c
drivers/net/ethernet/emulex/benet/be_cmds.c
drivers/net/ethernet/ethoc.c
drivers/net/ethernet/freescale/fec_main.c
drivers/net/ethernet/jme.c
drivers/net/ethernet/marvell/mv643xx_eth.c
drivers/net/ethernet/mellanox/mlx4/en_main.c
drivers/net/ethernet/mellanox/mlx4/intf.c
drivers/net/ethernet/mellanox/mlx5/core/fw.c
drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c
drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c
drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_common.c
drivers/net/ethernet/realtek/r8169.c
drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
drivers/net/ethernet/synopsys/Kconfig
drivers/net/ntb_netdev.c
drivers/net/phy/Kconfig
drivers/net/phy/Makefile
drivers/net/phy/fixed_phy.c
drivers/net/phy/microchip.c [new file with mode: 0644]
drivers/net/usb/lan78xx.c
drivers/net/usb/r8152.c
drivers/net/usb/usbnet.c
drivers/net/vxlan.c
drivers/net/wan/sbni.c
drivers/net/wireless/ath/wil6210/debugfs.c
drivers/net/xen-netback/common.h
drivers/net/xen-netback/netback.c
drivers/net/xen-netfront.c
drivers/ntb/hw/intel/ntb_hw_intel.c
drivers/ntb/hw/intel/ntb_hw_intel.h
drivers/ntb/ntb_transport.c
drivers/nvdimm/Kconfig
drivers/nvdimm/Makefile
drivers/nvdimm/btt.c
drivers/nvdimm/btt.h
drivers/nvdimm/btt_devs.c
drivers/nvdimm/claim.c [new file with mode: 0644]
drivers/nvdimm/dimm_devs.c
drivers/nvdimm/e820.c [new file with mode: 0644]
drivers/nvdimm/namespace_devs.c
drivers/nvdimm/nd-core.h
drivers/nvdimm/nd.h
drivers/nvdimm/pfn.h [new file with mode: 0644]
drivers/nvdimm/pfn_devs.c [new file with mode: 0644]
drivers/nvdimm/pmem.c
drivers/nvdimm/region.c
drivers/nvdimm/region_devs.c
drivers/of/irq.c
drivers/parisc/ccio-dma.c
drivers/parisc/lba_pci.c
drivers/parisc/sba_iommu.c
drivers/pci/Kconfig
drivers/pci/pci-driver.c
drivers/pci/probe.c
drivers/pinctrl/core.c
drivers/pinctrl/pinctrl-digicolor.c
drivers/pinctrl/pinmux.c
drivers/pinctrl/qcom/pinctrl-ssbi-gpio.c
drivers/pinctrl/qcom/pinctrl-ssbi-mpp.c
drivers/pinctrl/samsung/pinctrl-s3c24xx.c
drivers/platform/x86/Kconfig
drivers/platform/x86/Makefile
drivers/platform/x86/acer-wmi.c
drivers/platform/x86/acerhdf.c
drivers/platform/x86/asus-laptop.c
drivers/platform/x86/asus-nb-wmi.c
drivers/platform/x86/hp-wireless.c
drivers/platform/x86/hp-wmi.c
drivers/platform/x86/ideapad-laptop.c
drivers/platform/x86/intel_mid_thermal.c
drivers/platform/x86/surfacepro3_button.c [new file with mode: 0644]
drivers/platform/x86/thinkpad_acpi.c
drivers/platform/x86/toshiba_acpi.c
drivers/platform/x86/wmi.c
drivers/pnp/manager.c
drivers/power/charger-manager.c
drivers/power/power_supply_core.c
drivers/power/twl4030_charger.c
drivers/pwm/Kconfig
drivers/pwm/Makefile
drivers/pwm/core.c
drivers/pwm/pwm-atmel-hlcdc.c
drivers/pwm/pwm-atmel-tcb.c
drivers/pwm/pwm-atmel.c
drivers/pwm/pwm-bcm-kona.c
drivers/pwm/pwm-ep93xx.c
drivers/pwm/pwm-imx.c
drivers/pwm/pwm-lpc18xx-sct.c [new file with mode: 0644]
drivers/pwm/pwm-mxs.c
drivers/pwm/pwm-pca9685.c
drivers/pwm/pwm-renesas-tpu.c
drivers/pwm/pwm-rockchip.c
drivers/pwm/pwm-tegra.c
drivers/pwm/pwm-tiecap.c
drivers/pwm/pwm-tiehrpwm.c
drivers/pwm/sysfs.c
drivers/regulator/core.c
drivers/reset/reset-ath79.c
drivers/rtc/Kconfig
drivers/rtc/Makefile
drivers/rtc/class.c
drivers/rtc/interface.c
drivers/rtc/rtc-88pm80x.c
drivers/rtc/rtc-ab-b5ze-s3.c
drivers/rtc/rtc-ab8500.c
drivers/rtc/rtc-abx80x.c
drivers/rtc/rtc-armada38x.c
drivers/rtc/rtc-as3722.c
drivers/rtc/rtc-at91rm9200.c
drivers/rtc/rtc-at91sam9.c
drivers/rtc/rtc-bfin.c
drivers/rtc/rtc-bq32k.c
drivers/rtc/rtc-cmos.c
drivers/rtc/rtc-coh901331.c
drivers/rtc/rtc-core.h
drivers/rtc/rtc-da9063.c
drivers/rtc/rtc-dev.c
drivers/rtc/rtc-ds1305.c
drivers/rtc/rtc-ds1307.c
drivers/rtc/rtc-ds1343.c
drivers/rtc/rtc-ds1374.c
drivers/rtc/rtc-ds1511.c
drivers/rtc/rtc-ds1553.c
drivers/rtc/rtc-ds1685.c
drivers/rtc/rtc-ds1742.c
drivers/rtc/rtc-ds3232.c
drivers/rtc/rtc-fm3130.c
drivers/rtc/rtc-gemini.c
drivers/rtc/rtc-hym8563.c
drivers/rtc/rtc-isl12022.c
drivers/rtc/rtc-isl12057.c
drivers/rtc/rtc-lpc24xx.c [new file with mode: 0644]
drivers/rtc/rtc-m48t59.c
drivers/rtc/rtc-max8997.c
drivers/rtc/rtc-moxart.c
drivers/rtc/rtc-mpc5121.c
drivers/rtc/rtc-mt6397.c
drivers/rtc/rtc-mv.c
drivers/rtc/rtc-omap.c
drivers/rtc/rtc-opal.c
drivers/rtc/rtc-pcf2123.c
drivers/rtc/rtc-pcf2127.c
drivers/rtc/rtc-pcf85063.c
drivers/rtc/rtc-pcf8523.c
drivers/rtc/rtc-pcf8563.c
drivers/rtc/rtc-pcf8583.c
drivers/rtc/rtc-pl031.c
drivers/rtc/rtc-pxa.c
drivers/rtc/rtc-rp5c01.c
drivers/rtc/rtc-rx8025.c
drivers/rtc/rtc-rx8581.c
drivers/rtc/rtc-s3c.c
drivers/rtc/rtc-s5m.c
drivers/rtc/rtc-sa1100.c
drivers/rtc/rtc-sa1100.h [new file with mode: 0644]
drivers/rtc/rtc-sirfsoc.c
drivers/rtc/rtc-stk17ta8.c
drivers/rtc/rtc-sysfs.c
drivers/rtc/rtc-tx4939.c
drivers/rtc/rtc-vt8500.c
drivers/rtc/rtc-zynqmp.c [new file with mode: 0644]
drivers/s390/block/dcssblk.c
drivers/s390/crypto/zcrypt_api.c
drivers/scsi/Makefile
drivers/scsi/aic94xx/aic94xx_init.c
drivers/scsi/aic94xx/aic94xx_sds.c
drivers/scsi/arcmsr/arcmsr_hba.c
drivers/scsi/bfa/bfa_ioc.c
drivers/scsi/device_handler/Kconfig
drivers/scsi/device_handler/Makefile
drivers/scsi/device_handler/scsi_dh.c [deleted file]
drivers/scsi/device_handler/scsi_dh_alua.c
drivers/scsi/device_handler/scsi_dh_emc.c
drivers/scsi/device_handler/scsi_dh_hp_sw.c
drivers/scsi/device_handler/scsi_dh_rdac.c
drivers/scsi/fcoe/fcoe.c
drivers/scsi/ipr.c
drivers/scsi/libiscsi.c
drivers/scsi/lpfc/lpfc_mbox.c
drivers/scsi/mpt2sas/mpt2sas_base.c
drivers/scsi/mpt2sas/mpt2sas_base.h
drivers/scsi/mpt2sas/mpt2sas_ctl.c
drivers/scsi/mpt2sas/mpt2sas_scsih.c
drivers/scsi/mpt2sas/mpt2sas_transport.c
drivers/scsi/mpt3sas/mpi/mpi2.h
drivers/scsi/mpt3sas/mpi/mpi2_cnfg.h
drivers/scsi/mpt3sas/mpi/mpi2_ioc.h
drivers/scsi/mpt3sas/mpi/mpi2_tool.h
drivers/scsi/mpt3sas/mpt3sas_base.c
drivers/scsi/mpt3sas/mpt3sas_base.h
drivers/scsi/mpt3sas/mpt3sas_scsih.c
drivers/scsi/mpt3sas/mpt3sas_transport.c
drivers/scsi/mvsas/mv_init.c
drivers/scsi/pm8001/pm8001_hwi.c
drivers/scsi/pm8001/pm80xx_hwi.c
drivers/scsi/qla2xxx/Kconfig
drivers/scsi/qla2xxx/tcm_qla2xxx.c
drivers/scsi/scsi_common.c
drivers/scsi/scsi_debug.c
drivers/scsi/scsi_dh.c [new file with mode: 0644]
drivers/scsi/scsi_error.c
drivers/scsi/scsi_lib.c
drivers/scsi/scsi_priv.h
drivers/scsi/scsi_sysfs.c
drivers/scsi/scsi_transport_sas.c
drivers/scsi/sun3x_esp.c
drivers/scsi/xen-scsifront.c
drivers/soc/qcom/smd.c
drivers/soc/qcom/smem.c
drivers/staging/Kconfig
drivers/staging/Makefile
drivers/staging/android/ion/ion.c
drivers/staging/board/armadillo800eva.c
drivers/staging/board/board.c
drivers/staging/comedi/comedi_fops.c
drivers/staging/comedi/drivers/ii_pci20kc.c
drivers/staging/rdma/Kconfig [new file with mode: 0644]
drivers/staging/rdma/Makefile [new file with mode: 0644]
drivers/staging/rdma/amso1100/Kbuild [new file with mode: 0644]
drivers/staging/rdma/amso1100/Kconfig [new file with mode: 0644]
drivers/staging/rdma/amso1100/TODO [new file with mode: 0644]
drivers/staging/rdma/amso1100/c2.c [new file with mode: 0644]
drivers/staging/rdma/amso1100/c2.h [new file with mode: 0644]
drivers/staging/rdma/amso1100/c2_ae.c [new file with mode: 0644]
drivers/staging/rdma/amso1100/c2_ae.h [new file with mode: 0644]
drivers/staging/rdma/amso1100/c2_alloc.c [new file with mode: 0644]
drivers/staging/rdma/amso1100/c2_cm.c [new file with mode: 0644]
drivers/staging/rdma/amso1100/c2_cq.c [new file with mode: 0644]
drivers/staging/rdma/amso1100/c2_intr.c [new file with mode: 0644]
drivers/staging/rdma/amso1100/c2_mm.c [new file with mode: 0644]
drivers/staging/rdma/amso1100/c2_mq.c [new file with mode: 0644]
drivers/staging/rdma/amso1100/c2_mq.h [new file with mode: 0644]
drivers/staging/rdma/amso1100/c2_pd.c [new file with mode: 0644]
drivers/staging/rdma/amso1100/c2_provider.c [new file with mode: 0644]
drivers/staging/rdma/amso1100/c2_provider.h [new file with mode: 0644]
drivers/staging/rdma/amso1100/c2_qp.c [new file with mode: 0644]
drivers/staging/rdma/amso1100/c2_rnic.c [new file with mode: 0644]
drivers/staging/rdma/amso1100/c2_status.h [new file with mode: 0644]
drivers/staging/rdma/amso1100/c2_user.h [new file with mode: 0644]
drivers/staging/rdma/amso1100/c2_vq.c [new file with mode: 0644]
drivers/staging/rdma/amso1100/c2_vq.h [new file with mode: 0644]
drivers/staging/rdma/amso1100/c2_wr.h [new file with mode: 0644]
drivers/staging/rdma/ehca/Kconfig [new file with mode: 0644]
drivers/staging/rdma/ehca/Makefile [new file with mode: 0644]
drivers/staging/rdma/ehca/TODO [new file with mode: 0644]
drivers/staging/rdma/ehca/ehca_av.c [new file with mode: 0644]
drivers/staging/rdma/ehca/ehca_classes.h [new file with mode: 0644]
drivers/staging/rdma/ehca/ehca_classes_pSeries.h [new file with mode: 0644]
drivers/staging/rdma/ehca/ehca_cq.c [new file with mode: 0644]
drivers/staging/rdma/ehca/ehca_eq.c [new file with mode: 0644]
drivers/staging/rdma/ehca/ehca_hca.c [new file with mode: 0644]
drivers/staging/rdma/ehca/ehca_irq.c [new file with mode: 0644]
drivers/staging/rdma/ehca/ehca_irq.h [new file with mode: 0644]
drivers/staging/rdma/ehca/ehca_iverbs.h [new file with mode: 0644]
drivers/staging/rdma/ehca/ehca_main.c [new file with mode: 0644]
drivers/staging/rdma/ehca/ehca_mcast.c [new file with mode: 0644]
drivers/staging/rdma/ehca/ehca_mrmw.c [new file with mode: 0644]
drivers/staging/rdma/ehca/ehca_mrmw.h [new file with mode: 0644]
drivers/staging/rdma/ehca/ehca_pd.c [new file with mode: 0644]
drivers/staging/rdma/ehca/ehca_qes.h [new file with mode: 0644]
drivers/staging/rdma/ehca/ehca_qp.c [new file with mode: 0644]
drivers/staging/rdma/ehca/ehca_reqs.c [new file with mode: 0644]
drivers/staging/rdma/ehca/ehca_sqp.c [new file with mode: 0644]
drivers/staging/rdma/ehca/ehca_tools.h [new file with mode: 0644]
drivers/staging/rdma/ehca/ehca_uverbs.c [new file with mode: 0644]
drivers/staging/rdma/ehca/hcp_if.c [new file with mode: 0644]
drivers/staging/rdma/ehca/hcp_if.h [new file with mode: 0644]
drivers/staging/rdma/ehca/hcp_phyp.c [new file with mode: 0644]
drivers/staging/rdma/ehca/hcp_phyp.h [new file with mode: 0644]
drivers/staging/rdma/ehca/hipz_fns.h [new file with mode: 0644]
drivers/staging/rdma/ehca/hipz_fns_core.h [new file with mode: 0644]
drivers/staging/rdma/ehca/hipz_hw.h [new file with mode: 0644]
drivers/staging/rdma/ehca/ipz_pt_fn.c [new file with mode: 0644]
drivers/staging/rdma/ehca/ipz_pt_fn.h [new file with mode: 0644]
drivers/staging/rdma/hfi1/Kconfig [new file with mode: 0644]
drivers/staging/rdma/hfi1/Makefile [new file with mode: 0644]
drivers/staging/rdma/hfi1/TODO [new file with mode: 0644]
drivers/staging/rdma/hfi1/chip.c [new file with mode: 0644]
drivers/staging/rdma/hfi1/chip.h [new file with mode: 0644]
drivers/staging/rdma/hfi1/chip_registers.h [new file with mode: 0644]
drivers/staging/rdma/hfi1/common.h [new file with mode: 0644]
drivers/staging/rdma/hfi1/cq.c [new file with mode: 0644]
drivers/staging/rdma/hfi1/debugfs.c [new file with mode: 0644]
drivers/staging/rdma/hfi1/debugfs.h [new file with mode: 0644]
drivers/staging/rdma/hfi1/device.c [new file with mode: 0644]
drivers/staging/rdma/hfi1/device.h [new file with mode: 0644]
drivers/staging/rdma/hfi1/diag.c [new file with mode: 0644]
drivers/staging/rdma/hfi1/dma.c [new file with mode: 0644]
drivers/staging/rdma/hfi1/driver.c [new file with mode: 0644]
drivers/staging/rdma/hfi1/eprom.c [new file with mode: 0644]
drivers/staging/rdma/hfi1/eprom.h [new file with mode: 0644]
drivers/staging/rdma/hfi1/file_ops.c [new file with mode: 0644]
drivers/staging/rdma/hfi1/firmware.c [new file with mode: 0644]
drivers/staging/rdma/hfi1/hfi.h [new file with mode: 0644]
drivers/staging/rdma/hfi1/init.c [new file with mode: 0644]
drivers/staging/rdma/hfi1/intr.c [new file with mode: 0644]
drivers/staging/rdma/hfi1/iowait.h [new file with mode: 0644]
drivers/staging/rdma/hfi1/keys.c [new file with mode: 0644]
drivers/staging/rdma/hfi1/mad.c [new file with mode: 0644]
drivers/staging/rdma/hfi1/mad.h [new file with mode: 0644]
drivers/staging/rdma/hfi1/mmap.c [new file with mode: 0644]
drivers/staging/rdma/hfi1/mr.c [new file with mode: 0644]
drivers/staging/rdma/hfi1/opa_compat.h [new file with mode: 0644]
drivers/staging/rdma/hfi1/pcie.c [new file with mode: 0644]
drivers/staging/rdma/hfi1/pio.c [new file with mode: 0644]
drivers/staging/rdma/hfi1/pio.h [new file with mode: 0644]
drivers/staging/rdma/hfi1/pio_copy.c [new file with mode: 0644]
drivers/staging/rdma/hfi1/platform_config.h [new file with mode: 0644]
drivers/staging/rdma/hfi1/qp.c [new file with mode: 0644]
drivers/staging/rdma/hfi1/qp.h [new file with mode: 0644]
drivers/staging/rdma/hfi1/qsfp.c [new file with mode: 0644]
drivers/staging/rdma/hfi1/qsfp.h [new file with mode: 0644]
drivers/staging/rdma/hfi1/rc.c [new file with mode: 0644]
drivers/staging/rdma/hfi1/ruc.c [new file with mode: 0644]
drivers/staging/rdma/hfi1/sdma.c [new file with mode: 0644]
drivers/staging/rdma/hfi1/sdma.h [new file with mode: 0644]
drivers/staging/rdma/hfi1/srq.c [new file with mode: 0644]
drivers/staging/rdma/hfi1/sysfs.c [new file with mode: 0644]
drivers/staging/rdma/hfi1/trace.c [new file with mode: 0644]
drivers/staging/rdma/hfi1/trace.h [new file with mode: 0644]
drivers/staging/rdma/hfi1/twsi.c [new file with mode: 0644]
drivers/staging/rdma/hfi1/twsi.h [new file with mode: 0644]
drivers/staging/rdma/hfi1/uc.c [new file with mode: 0644]
drivers/staging/rdma/hfi1/ud.c [new file with mode: 0644]
drivers/staging/rdma/hfi1/user_pages.c [new file with mode: 0644]
drivers/staging/rdma/hfi1/user_sdma.c [new file with mode: 0644]
drivers/staging/rdma/hfi1/user_sdma.h [new file with mode: 0644]
drivers/staging/rdma/hfi1/verbs.c [new file with mode: 0644]
drivers/staging/rdma/hfi1/verbs.h [new file with mode: 0644]
drivers/staging/rdma/hfi1/verbs_mcast.c [new file with mode: 0644]
drivers/staging/rdma/ipath/Kconfig [new file with mode: 0644]
drivers/staging/rdma/ipath/Makefile [new file with mode: 0644]
drivers/staging/rdma/ipath/TODO [new file with mode: 0644]
drivers/staging/rdma/ipath/ipath_common.h [new file with mode: 0644]
drivers/staging/rdma/ipath/ipath_cq.c [new file with mode: 0644]
drivers/staging/rdma/ipath/ipath_debug.h [new file with mode: 0644]
drivers/staging/rdma/ipath/ipath_diag.c [new file with mode: 0644]
drivers/staging/rdma/ipath/ipath_dma.c [new file with mode: 0644]
drivers/staging/rdma/ipath/ipath_driver.c [new file with mode: 0644]
drivers/staging/rdma/ipath/ipath_eeprom.c [new file with mode: 0644]
drivers/staging/rdma/ipath/ipath_file_ops.c [new file with mode: 0644]
drivers/staging/rdma/ipath/ipath_fs.c [new file with mode: 0644]
drivers/staging/rdma/ipath/ipath_iba6110.c [new file with mode: 0644]
drivers/staging/rdma/ipath/ipath_init_chip.c [new file with mode: 0644]
drivers/staging/rdma/ipath/ipath_intr.c [new file with mode: 0644]
drivers/staging/rdma/ipath/ipath_kernel.h [new file with mode: 0644]
drivers/staging/rdma/ipath/ipath_keys.c [new file with mode: 0644]
drivers/staging/rdma/ipath/ipath_mad.c [new file with mode: 0644]
drivers/staging/rdma/ipath/ipath_mmap.c [new file with mode: 0644]
drivers/staging/rdma/ipath/ipath_mr.c [new file with mode: 0644]
drivers/staging/rdma/ipath/ipath_qp.c [new file with mode: 0644]
drivers/staging/rdma/ipath/ipath_rc.c [new file with mode: 0644]
drivers/staging/rdma/ipath/ipath_registers.h [new file with mode: 0644]
drivers/staging/rdma/ipath/ipath_ruc.c [new file with mode: 0644]
drivers/staging/rdma/ipath/ipath_sdma.c [new file with mode: 0644]
drivers/staging/rdma/ipath/ipath_srq.c [new file with mode: 0644]
drivers/staging/rdma/ipath/ipath_stats.c [new file with mode: 0644]
drivers/staging/rdma/ipath/ipath_sysfs.c [new file with mode: 0644]
drivers/staging/rdma/ipath/ipath_uc.c [new file with mode: 0644]
drivers/staging/rdma/ipath/ipath_ud.c [new file with mode: 0644]
drivers/staging/rdma/ipath/ipath_user_pages.c [new file with mode: 0644]
drivers/staging/rdma/ipath/ipath_user_sdma.c [new file with mode: 0644]
drivers/staging/rdma/ipath/ipath_user_sdma.h [new file with mode: 0644]
drivers/staging/rdma/ipath/ipath_verbs.c [new file with mode: 0644]
drivers/staging/rdma/ipath/ipath_verbs.h [new file with mode: 0644]
drivers/staging/rdma/ipath/ipath_verbs_mcast.c [new file with mode: 0644]
drivers/staging/rdma/ipath/ipath_wc_ppc64.c [new file with mode: 0644]
drivers/staging/rdma/ipath/ipath_wc_x86_64.c [new file with mode: 0644]
drivers/staging/unisys/visorbus/visorchannel.c
drivers/staging/unisys/visorbus/visorchipset.c
drivers/target/iscsi/iscsi_target.c
drivers/target/iscsi/iscsi_target.h
drivers/target/iscsi/iscsi_target_configfs.c
drivers/target/iscsi/iscsi_target_device.c
drivers/target/iscsi/iscsi_target_login.c
drivers/target/iscsi/iscsi_target_login.h
drivers/target/iscsi/iscsi_target_nego.c
drivers/target/iscsi/iscsi_target_stat.c
drivers/target/iscsi/iscsi_target_tmr.c
drivers/target/iscsi/iscsi_target_tpg.c
drivers/target/iscsi/iscsi_target_tpg.h
drivers/target/iscsi/iscsi_target_util.c
drivers/target/loopback/tcm_loop.c
drivers/target/target_core_device.c
drivers/target/target_core_fabric_configfs.c
drivers/target/target_core_hba.c
drivers/target/target_core_sbc.c
drivers/target/target_core_spc.c
drivers/target/target_core_tpg.c
drivers/target/target_core_transport.c
drivers/target/target_core_user.c
drivers/target/target_core_xcopy.c
drivers/target/tcm_fc/tfc_cmd.c
drivers/thermal/Kconfig
drivers/thermal/Makefile
drivers/thermal/armada_thermal.c
drivers/thermal/db8500_thermal.c
drivers/thermal/dove_thermal.c
drivers/thermal/fair_share.c
drivers/thermal/gov_bang_bang.c
drivers/thermal/hisi_thermal.c
drivers/thermal/imx_thermal.c
drivers/thermal/int340x_thermal/int3400_thermal.c
drivers/thermal/int340x_thermal/int340x_thermal_zone.c
drivers/thermal/int340x_thermal/int340x_thermal_zone.h
drivers/thermal/int340x_thermal/processor_thermal_device.c
drivers/thermal/intel_pch_thermal.c [new file with mode: 0644]
drivers/thermal/intel_powerclamp.c
drivers/thermal/intel_quark_dts_thermal.c
drivers/thermal/intel_soc_dts_iosf.c
drivers/thermal/kirkwood_thermal.c
drivers/thermal/of-thermal.c
drivers/thermal/power_allocator.c
drivers/thermal/qcom-spmi-temp-alarm.c
drivers/thermal/rcar_thermal.c
drivers/thermal/rockchip_thermal.c
drivers/thermal/samsung/exynos_tmu.c
drivers/thermal/spear_thermal.c
drivers/thermal/st/st_thermal.c
drivers/thermal/step_wise.c
drivers/thermal/tegra_soctherm.c
drivers/thermal/thermal_core.c
drivers/thermal/thermal_hwmon.c
drivers/thermal/ti-soc-thermal/ti-thermal-common.c
drivers/thermal/x86_pkg_temp_thermal.c
drivers/tty/hvc/hvc_xen.c
drivers/tty/serial/8250/8250_core.c
drivers/tty/serial/Kconfig
drivers/tty/serial/amba-pl011.c
drivers/tty/sysrq.c
drivers/video/console/Kconfig
drivers/video/fbdev/Kconfig
drivers/video/fbdev/atmel_lcdfb.c
drivers/video/fbdev/core/fbmon.c
drivers/video/fbdev/core/fbsysfs.c
drivers/video/fbdev/core/modedb.c
drivers/video/fbdev/ocfb.c
drivers/video/fbdev/omap2/displays-new/encoder-opa362.c
drivers/video/fbdev/omap2/omapfb/omapfb-main.c
drivers/video/fbdev/pxa168fb.c
drivers/video/fbdev/s1d13xxxfb.c
drivers/video/fbdev/s3c-fb.c
drivers/video/fbdev/ssd1307fb.c
drivers/video/fbdev/stifb.c
drivers/video/fbdev/udlfb.c
drivers/video/fbdev/vfb.c
drivers/video/fbdev/xen-fbfront.c
drivers/virtio/virtio_balloon.c
drivers/virtio/virtio_mmio.c
drivers/watchdog/Kconfig
drivers/watchdog/Makefile
drivers/watchdog/at91rm9200_wdt.c
drivers/watchdog/at91sam9_wdt.c
drivers/watchdog/at91sam9_wdt.h
drivers/watchdog/bcm2835_wdt.c
drivers/watchdog/bcm47xx_wdt.c
drivers/watchdog/bcm_kona_wdt.c
drivers/watchdog/booke_wdt.c
drivers/watchdog/coh901327_wdt.c
drivers/watchdog/da9052_wdt.c
drivers/watchdog/da9055_wdt.c
drivers/watchdog/da9062_wdt.c
drivers/watchdog/da9063_wdt.c
drivers/watchdog/davinci_wdt.c
drivers/watchdog/digicolor_wdt.c
drivers/watchdog/ep93xx_wdt.c
drivers/watchdog/gpio_wdt.c
drivers/watchdog/ie6xx_wdt.c
drivers/watchdog/imgpdc_wdt.c
drivers/watchdog/intel-mid_wdt.c
drivers/watchdog/jz4740_wdt.c
drivers/watchdog/lpc18xx_wdt.c [new file with mode: 0644]
drivers/watchdog/mena21_wdt.c
drivers/watchdog/menf21bmc_wdt.c
drivers/watchdog/mpc8xxx_wdt.c
drivers/watchdog/mtk_wdt.c
drivers/watchdog/nv_tco.c
drivers/watchdog/omap_wdt.c
drivers/watchdog/orion_wdt.c
drivers/watchdog/pnx4008_wdt.c
drivers/watchdog/qcom-wdt.c
drivers/watchdog/retu_wdt.c
drivers/watchdog/rt2880_wdt.c
drivers/watchdog/s3c2410_wdt.c
drivers/watchdog/sama5d4_wdt.c [new file with mode: 0644]
drivers/watchdog/shwdt.c
drivers/watchdog/sirfsoc_wdt.c
drivers/watchdog/sp805_wdt.c
drivers/watchdog/st_lpc_wdt.c
drivers/watchdog/stmp3xxx_rtc_wdt.c
drivers/watchdog/sunxi_wdt.c
drivers/watchdog/tegra_wdt.c
drivers/watchdog/twl4030_wdt.c
drivers/watchdog/txx9wdt.c
drivers/watchdog/ux500_wdt.c
drivers/watchdog/via_wdt.c
drivers/watchdog/wm831x_wdt.c
drivers/watchdog/wm8350_wdt.c
drivers/xen/Kconfig
drivers/xen/balloon.c
drivers/xen/biomerge.c
drivers/xen/events/events_base.c
drivers/xen/events/events_fifo.c
drivers/xen/gntalloc.c
drivers/xen/gntdev.c
drivers/xen/manage.c
drivers/xen/privcmd.c
drivers/xen/swiotlb-xen.c
drivers/xen/sys-hypervisor.c
drivers/xen/tmem.c
drivers/xen/xenbus/xenbus_client.c
drivers/xen/xenbus/xenbus_dev_backend.c
drivers/xen/xenbus/xenbus_probe.c
drivers/xen/xenfs/Makefile
drivers/xen/xenfs/super.c
drivers/xen/xenfs/xenfs.h
drivers/xen/xenfs/xensyms.c [new file with mode: 0644]
drivers/xen/xlate_mmu.c
fs/affs/super.c
fs/block_dev.c
fs/btrfs/async-thread.c
fs/btrfs/async-thread.h
fs/btrfs/dev-replace.c
fs/btrfs/disk-io.c
fs/btrfs/disk-io.h
fs/btrfs/inode.c
fs/btrfs/scrub.c
fs/btrfs/tree-defrag.c
fs/btrfs/volumes.c
fs/ceph/addr.c
fs/ceph/caps.c
fs/ceph/file.c
fs/ceph/mds_client.c
fs/ceph/mds_client.h
fs/ceph/snap.c
fs/ceph/super.c
fs/cifs/cifs_ioctl.h [new file with mode: 0644]
fs/cifs/cifsfs.c
fs/cifs/cifsfs.h
fs/cifs/cifspdu.h
fs/cifs/cifssmb.c
fs/cifs/file.c
fs/cifs/ioctl.c
fs/cifs/smb2pdu.c
fs/cifs/transport.c
fs/coda/upcall.c
fs/coredump.c
fs/dax.c
fs/debugfs/file.c
fs/ecryptfs/crypto.c
fs/ecryptfs/dentry.c
fs/ext2/file.c
fs/ext2/inode.c
fs/ext4/ext4.h
fs/ext4/file.c
fs/ext4/indirect.c
fs/ext4/inode.c
fs/fs-writeback.c
fs/gfs2/glock.c
fs/gfs2/glops.c
fs/gfs2/incore.h
fs/gfs2/lock_dlm.c
fs/gfs2/lops.c
fs/gfs2/meta_io.c
fs/gfs2/meta_io.h
fs/gfs2/quota.c
fs/gfs2/rgrp.c
fs/gfs2/trace_gfs2.h
fs/gfs2/trans.c
fs/hfs/bnode.c
fs/hfs/brec.c
fs/hfsplus/bnode.c
fs/hugetlbfs/inode.c
fs/kernfs/dir.c
fs/namei.c
fs/nfs/blocklayout/blocklayout.h
fs/nfs/blocklayout/dev.c
fs/nfs/blocklayout/extent_tree.c
fs/nfs/callback.c
fs/nfs/callback_proc.c
fs/nfs/client.c
fs/nfs/delegation.c
fs/nfs/delegation.h
fs/nfs/dir.c
fs/nfs/file.c
fs/nfs/flexfilelayout/flexfilelayout.c
fs/nfs/flexfilelayout/flexfilelayout.h
fs/nfs/flexfilelayout/flexfilelayoutdev.c
fs/nfs/inode.c
fs/nfs/internal.h
fs/nfs/nfs3xdr.c
fs/nfs/nfs42.h
fs/nfs/nfs42xdr.c
fs/nfs/nfs4_fs.h
fs/nfs/nfs4client.c
fs/nfs/nfs4file.c
fs/nfs/nfs4idmap.c
fs/nfs/nfs4proc.c
fs/nfs/nfs4state.c
fs/nfs/nfs4trace.h
fs/nfs/nfs4xdr.c
fs/nfs/pagelist.c
fs/nfs/pnfs.c
fs/nfs/pnfs.h
fs/nfs/pnfs_nfs.c
fs/nfs/super.c
fs/nfs/write.c
fs/nfsd/blocklayoutxdr.c
fs/nfsd/blocklayoutxdr.h
fs/nsfs.c
fs/ocfs2/dlm/dlmrecovery.c
fs/proc/base.c
fs/proc/generic.c
fs/proc/page.c
fs/proc/task_mmu.c
fs/seq_file.c
fs/ufs/balloc.c
fs/userfaultfd.c
fs/xfs/Makefile
fs/xfs/libxfs/xfs_alloc.c
fs/xfs/libxfs/xfs_alloc_btree.c
fs/xfs/libxfs/xfs_attr.c
fs/xfs/libxfs/xfs_attr_leaf.c
fs/xfs/libxfs/xfs_attr_remote.c
fs/xfs/libxfs/xfs_bit.c [new file with mode: 0644]
fs/xfs/libxfs/xfs_bmap.c
fs/xfs/libxfs/xfs_bmap_btree.c
fs/xfs/libxfs/xfs_btree.c
fs/xfs/libxfs/xfs_da_btree.c
fs/xfs/libxfs/xfs_da_format.h
fs/xfs/libxfs/xfs_dir2.c
fs/xfs/libxfs/xfs_dir2_block.c
fs/xfs/libxfs/xfs_dir2_data.c
fs/xfs/libxfs/xfs_dir2_leaf.c
fs/xfs/libxfs/xfs_dir2_node.c
fs/xfs/libxfs/xfs_dquot_buf.c
fs/xfs/libxfs/xfs_format.h
fs/xfs/libxfs/xfs_ialloc.c
fs/xfs/libxfs/xfs_ialloc_btree.c
fs/xfs/libxfs/xfs_inode_buf.c
fs/xfs/libxfs/xfs_sb.c
fs/xfs/libxfs/xfs_symlink_remote.c
fs/xfs/xfs_aops.c
fs/xfs/xfs_bit.c [deleted file]
fs/xfs/xfs_bmap_util.c
fs/xfs/xfs_buf.c
fs/xfs/xfs_buf.h
fs/xfs/xfs_buf_item.c
fs/xfs/xfs_buf_item.h
fs/xfs/xfs_dir2_readdir.c
fs/xfs/xfs_dquot.c
fs/xfs/xfs_extfree_item.c
fs/xfs/xfs_extfree_item.h
fs/xfs/xfs_file.c
fs/xfs/xfs_fsops.c
fs/xfs/xfs_icache.c
fs/xfs/xfs_inode.c
fs/xfs/xfs_inode.h
fs/xfs/xfs_inode_item.c
fs/xfs/xfs_iops.c
fs/xfs/xfs_itable.c
fs/xfs/xfs_log.c
fs/xfs/xfs_log.h
fs/xfs/xfs_log_cil.c
fs/xfs/xfs_log_priv.h
fs/xfs/xfs_log_recover.c
fs/xfs/xfs_mount.c
fs/xfs/xfs_rtalloc.c
fs/xfs/xfs_super.c
fs/xfs/xfs_symlink.c
fs/xfs/xfs_trace.h
fs/xfs/xfs_trans.c
fs/xfs/xfs_trans.h
fs/xfs/xfs_trans_extfree.c
fs/xfs/xfs_trans_priv.h
include/asm-generic/dma-mapping-common.h
include/asm-generic/early_ioremap.h
include/asm-generic/fixmap.h
include/asm-generic/memory_model.h
include/asm-generic/qspinlock.h
include/asm-generic/rtc.h
include/asm-generic/vmlinux.lds.h
include/crypto/pkcs7.h
include/crypto/public_key.h
include/dt-bindings/i2c/i2c.h [new file with mode: 0644]
include/keys/system_keyring.h
include/kvm/arm_arch_timer.h
include/kvm/arm_vgic.h
include/linux/amba/serial.h
include/linux/asn1_ber_bytecode.h
include/linux/audit.h
include/linux/backing-dev.h
include/linux/blk-cgroup.h
include/linux/blkdev.h
include/linux/ceph/ceph_features.h
include/linux/ceph/libceph.h
include/linux/ceph/messenger.h
include/linux/ceph/msgr.h
include/linux/cgroup_subsys.h
include/linux/clockchips.h
include/linux/dax.h [new file with mode: 0644]
include/linux/debugfs.h
include/linux/dmapool.h
include/linux/fb.h
include/linux/fs.h
include/linux/gfp.h
include/linux/huge_mm.h
include/linux/hugetlb.h
include/linux/i2c.h
include/linux/intel-iommu.h
include/linux/io-mapping.h
include/linux/io.h
include/linux/ipmi_smi.h
include/linux/irqchip/arm-gic-v3.h
include/linux/irqchip/arm-gic.h
include/linux/jump_label.h
include/linux/kernfs.h
include/linux/kexec.h
include/linux/kmod.h
include/linux/kvm_host.h
include/linux/libnvdimm.h
include/linux/lsm_audit.h
include/linux/lsm_hooks.h
include/linux/memblock.h
include/linux/memcontrol.h
include/linux/memory_hotplug.h
include/linux/microchipphy.h [new file with mode: 0644]
include/linux/mlx4/device.h
include/linux/mlx4/driver.h
include/linux/mlx5/device.h
include/linux/mlx5/driver.h
include/linux/mm.h
include/linux/mm_types.h
include/linux/mmc/card.h
include/linux/mmc/dw_mmc.h
include/linux/mmc/host.h
include/linux/mmu_notifier.h
include/linux/mmzone.h
include/linux/mtd/map.h
include/linux/netlink.h
include/linux/nfs4.h
include/linux/nfs_fs.h
include/linux/nfs_fs_sb.h
include/linux/nfs_xdr.h
include/linux/nmi.h
include/linux/ntb.h
include/linux/ntb_transport.h
include/linux/oid_registry.h
include/linux/oom.h
include/linux/page-flags.h
include/linux/page-isolation.h
include/linux/page_ext.h
include/linux/page_idle.h [new file with mode: 0644]
include/linux/pci.h
include/linux/platform_data/i2c-mux-reg.h [new file with mode: 0644]
include/linux/platform_data/mmc-esdhc-imx.h
include/linux/pm_opp.h
include/linux/pmem.h
include/linux/poison.h
include/linux/printk.h
include/linux/ptrace.h
include/linux/pwm.h
include/linux/regmap.h
include/linux/reset.h
include/linux/seccomp.h
include/linux/seq_file.h
include/linux/string_helpers.h
include/linux/sunrpc/addr.h
include/linux/sunrpc/auth.h
include/linux/sunrpc/svc_rdma.h
include/linux/sunrpc/xprtrdma.h
include/linux/swap.h
include/linux/swapops.h
include/linux/syscalls.h
include/linux/thermal.h
include/linux/tick.h
include/linux/verify_pefile.h
include/linux/zbud.h
include/linux/zpool.h
include/linux/zsmalloc.h
include/media/videobuf2-memops.h
include/net/addrconf.h
include/net/bonding.h
include/net/fib_rules.h
include/net/mac80211.h
include/net/netfilter/br_netfilter.h
include/net/netfilter/nf_conntrack.h
include/net/netfilter/nf_tables.h
include/net/sock.h
include/rdma/ib_cm.h
include/rdma/ib_mad.h
include/rdma/ib_pack.h
include/rdma/ib_smi.h
include/rdma/ib_verbs.h
include/rdma/opa_port_info.h [new file with mode: 0644]
include/rdma/opa_smi.h
include/rdma/rdma_netlink.h
include/scsi/scsi_common.h
include/scsi/scsi_device.h
include/scsi/scsi_dh.h
include/scsi/scsi_eh.h
include/soc/tegra/mc.h
include/target/iscsi/iscsi_target_core.h
include/target/iscsi/iscsi_target_stat.h
include/target/iscsi/iscsi_transport.h
include/target/target_core_backend.h
include/target/target_core_base.h
include/target/target_core_fabric.h
include/trace/events/kvm.h
include/trace/events/task.h
include/trace/events/thermal_power_allocator.h
include/trace/events/writeback.h
include/uapi/asm-generic/unistd.h
include/uapi/drm/i915_drm.h
include/uapi/linux/Kbuild
include/uapi/linux/audit.h
include/uapi/linux/elf-em.h
include/uapi/linux/if_ether.h
include/uapi/linux/kernel-page-flags.h
include/uapi/linux/kvm.h
include/uapi/linux/membarrier.h [new file with mode: 0644]
include/uapi/linux/ndctl.h
include/uapi/linux/nfs4.h
include/uapi/linux/ptrace.h
include/uapi/linux/target_core_user.h
include/uapi/linux/toshiba.h
include/uapi/rdma/Kbuild
include/uapi/rdma/hfi/Kbuild [new file with mode: 0644]
include/uapi/rdma/hfi/hfi1_user.h [new file with mode: 0644]
include/uapi/rdma/rdma_netlink.h
include/uapi/xen/privcmd.h
include/video/vga.h
include/xen/events.h
include/xen/interface/platform.h
include/xen/interface/xen.h
include/xen/interface/xenpmu.h [new file with mode: 0644]
include/xen/page.h
include/xen/xen-ops.h
init/Kconfig
init/initramfs.c
init/main.c
ipc/msgutil.c
ipc/shm.c
kernel/Makefile
kernel/audit.c
kernel/audit.h
kernel/audit_fsnotify.c [new file with mode: 0644]
kernel/audit_tree.c
kernel/audit_watch.c
kernel/auditfilter.c
kernel/auditsc.c
kernel/bpf/syscall.c
kernel/bpf/verifier.c
kernel/cgroup.c
kernel/cpu_pm.c
kernel/cred.c
kernel/events/core.c
kernel/extable.c
kernel/kexec.c
kernel/kexec_core.c [new file with mode: 0644]
kernel/kexec_file.c [new file with mode: 0644]
kernel/kexec_internal.h [new file with mode: 0644]
kernel/kmod.c
kernel/ksysfs.c
kernel/locking/qspinlock.c
kernel/membarrier.c [new file with mode: 0644]
kernel/memremap.c [new file with mode: 0644]
kernel/module_signing.c
kernel/printk/printk.c
kernel/profile.c
kernel/ptrace.c
kernel/reboot.c
kernel/resource.c
kernel/sched/core.c
kernel/seccomp.c
kernel/sys_ni.c
kernel/sysctl.c
kernel/system_certificates.S [deleted file]
kernel/system_keyring.c [deleted file]
kernel/time/clockevents.c
kernel/time/tick-common.c
kernel/time/tick-sched.c
kernel/time/timekeeping.c
kernel/time/timer_list.c
kernel/trace/ftrace.c
kernel/trace/ring_buffer.c
kernel/trace/trace.c
kernel/trace/trace_events.c
kernel/trace/trace_events_filter.c
kernel/trace/trace_functions_graph.c
kernel/trace/trace_output.c
kernel/trace/trace_stack.c
lib/Kconfig
lib/Makefile
lib/asn1_decoder.c
lib/bitmap.c
lib/decompress_bunzip2.c
lib/decompress_inflate.c
lib/decompress_unlz4.c
lib/decompress_unlzma.c
lib/decompress_unlzo.c
lib/decompress_unxz.c
lib/devres.c
lib/kstrtox.c
lib/nmi_backtrace.c [new file with mode: 0644]
lib/pci_iomap.c
lib/show_mem.c
lib/string_helpers.c
lib/test-kstrtox.c
lib/test_kasan.c
lib/zlib_deflate/deftree.c
lib/zlib_deflate/defutil.h
mm/Kconfig
mm/Makefile
mm/backing-dev.c
mm/bootmem.c
mm/compaction.c
mm/debug.c
mm/dmapool.c
mm/early_ioremap.c
mm/filemap.c
mm/frame_vector.c [new file with mode: 0644]
mm/huge_memory.c
mm/hugetlb.c
mm/hwpoison-inject.c
mm/internal.h
mm/kasan/kasan.c
mm/kmemleak.c
mm/list_lru.c
mm/madvise.c
mm/memblock.c
mm/memcontrol.c
mm/memory-failure.c
mm/memory.c
mm/memory_hotplug.c
mm/mempolicy.c
mm/mempool.c
mm/memtest.c
mm/migrate.c
mm/mmap.c
mm/mmu_notifier.c
mm/nommu.c
mm/oom_kill.c
mm/page-writeback.c
mm/page_alloc.c
mm/page_ext.c
mm/page_idle.c [new file with mode: 0644]
mm/page_isolation.c
mm/rmap.c
mm/shmem.c
mm/slab.c
mm/slab_common.c
mm/slob.c
mm/slub.c
mm/swap.c
mm/swap_state.c
mm/swapfile.c
mm/vmscan.c
mm/zbud.c
mm/zpool.c
mm/zsmalloc.c
mm/zswap.c
net/9p/trans_rdma.c
net/bridge/br_netlink.c
net/bridge/br_vlan.c
net/ceph/ceph_common.c
net/ceph/crypto.c
net/ceph/messenger.c
net/ceph/mon_client.c
net/ceph/osd_client.c
net/ceph/osdmap.c
net/core/fib_rules.c
net/decnet/dn_rules.c
net/ipv4/fib_rules.c
net/ipv4/ipmr.c
net/ipv4/tcp_cubic.c
net/ipv4/tcp_output.c
net/ipv6/addrconf.c
net/ipv6/fib6_rules.c
net/ipv6/ip6mr.c
net/ipv6/route.c
net/mac80211/cfg.c
net/mac80211/mlme.c
net/mac80211/rate.c
net/mac80211/tdls.c
net/mac80211/vht.c
net/netfilter/ipset/ip_set_hash_gen.h
net/netfilter/ipset/ip_set_hash_netnet.c
net/netfilter/ipset/ip_set_hash_netportnet.c
net/netfilter/nf_conntrack_core.c
net/netfilter/nf_synproxy_core.c
net/netfilter/nfnetlink.c
net/netfilter/nfnetlink_queue_core.c
net/netfilter/xt_CT.c
net/netlink/af_netlink.c
net/openvswitch/Kconfig
net/openvswitch/Makefile
net/openvswitch/conntrack.h
net/rds/connection.c
net/rds/ib.c
net/rds/ib.h
net/rds/ib_cm.c
net/rds/ib_recv.c
net/rds/ib_send.c
net/rds/iw.c
net/rds/iw_rdma.c
net/rds/iw_send.c
net/rfkill/core.c
net/sctp/protocol.c
net/sunrpc/auth_unix.c
net/sunrpc/xprtrdma/fmr_ops.c
net/sunrpc/xprtrdma/frwr_ops.c
net/sunrpc/xprtrdma/physical_ops.c
net/sunrpc/xprtrdma/rpc_rdma.c
net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
net/sunrpc/xprtrdma/svc_rdma_transport.c
net/sunrpc/xprtrdma/transport.c
net/sunrpc/xprtrdma/verbs.c
net/sunrpc/xprtrdma/xprt_rdma.h
net/sunrpc/xprtsock.c
net/switchdev/switchdev.c
net/tipc/bcast.c
net/wireless/reg.c
scripts/.gitignore
scripts/Kbuild.include
scripts/Makefile
scripts/Makefile.extrawarn
scripts/Makefile.modinst
scripts/asn1_compiler.c
scripts/basic/fixdep.c
scripts/checkpatch.pl
scripts/coccinelle/api/alloc/pool_zalloc-simple.cocci [new file with mode: 0644]
scripts/coccinelle/api/platform_no_drv_owner.cocci
scripts/coccinelle/api/pm_runtime.cocci
scripts/coccinelle/api/simple_open.cocci
scripts/coccinelle/api/vma_pages.cocci [new file with mode: 0644]
scripts/coccinelle/misc/ifaddr.cocci
scripts/coccinelle/misc/irqf_oneshot.cocci
scripts/coccinelle/misc/returnvar.cocci
scripts/coccinelle/misc/semicolon.cocci
scripts/coccinelle/misc/simple_return.cocci
scripts/extract-cert.c [new file with mode: 0644]
scripts/genksyms/parse.tab.c_shipped
scripts/genksyms/parse.tab.h_shipped
scripts/genksyms/parse.y
scripts/kconfig/confdata.c
scripts/kconfig/merge_config.sh
scripts/kconfig/symbol.c
scripts/kconfig/zconf.gperf
scripts/kconfig/zconf.hash.c_shipped
scripts/kconfig/zconf.l
scripts/kconfig/zconf.lex.c_shipped
scripts/mod/modpost.h
scripts/package/Makefile
scripts/package/builddeb
scripts/package/mkspec
scripts/selinux/mdp/mdp.c
scripts/sign-file [deleted file]
scripts/sign-file.c [new file with mode: 0755]
scripts/stackdelta [new file with mode: 0755]
scripts/stackusage [new file with mode: 0755]
scripts/tags.sh
security/Kconfig
security/device_cgroup.c
security/lsm_audit.c
security/security.c
security/selinux/avc.c
security/selinux/hooks.c
security/selinux/include/avc.h
security/selinux/include/security.h
security/selinux/selinuxfs.c
security/selinux/ss/avtab.c
security/selinux/ss/avtab.h
security/selinux/ss/conditional.c
security/selinux/ss/conditional.h
security/selinux/ss/policydb.c
security/selinux/ss/services.c
security/selinux/ss/services.h
security/smack/smack.h
security/smack/smack_access.c
security/smack/smack_lsm.c
security/smack/smackfs.c
security/yama/Kconfig
security/yama/yama_lsm.c
sound/pci/hda/patch_realtek.c
sound/sparc/amd7930.c
sound/usb/stream.c
tools/perf/builtin-script.c
tools/perf/tests/sw-clock.c
tools/perf/tests/task-exit.c
tools/perf/ui/browsers/hists.c
tools/perf/util/evlist.c
tools/perf/util/evlist.h
tools/perf/util/evsel.c
tools/perf/util/evsel.h
tools/perf/util/header.c
tools/perf/util/intel-bts.c
tools/perf/util/intel-pt.c
tools/perf/util/parse-events.c
tools/perf/util/parse-events.y
tools/testing/nvdimm/Kbuild
tools/testing/nvdimm/test/iomap.c
tools/testing/nvdimm/test/nfit.c
tools/testing/selftests/Makefile
tools/testing/selftests/breakpoints/Makefile
tools/testing/selftests/lib.mk
tools/testing/selftests/membarrier/.gitignore [new file with mode: 0644]
tools/testing/selftests/membarrier/Makefile [new file with mode: 0644]
tools/testing/selftests/membarrier/membarrier_test.c [new file with mode: 0644]
tools/testing/selftests/vm/Makefile
tools/testing/selftests/vm/hugetlbfstest.c [deleted file]
tools/testing/selftests/vm/run_vmtests
tools/testing/selftests/vm/userfaultfd.c
tools/testing/selftests/x86/entry_from_vm86.c
tools/testing/selftests/zram/Makefile [new file with mode: 0644]
tools/testing/selftests/zram/README [new file with mode: 0644]
tools/testing/selftests/zram/zram.sh [new file with mode: 0755]
tools/testing/selftests/zram/zram01.sh [new file with mode: 0755]
tools/testing/selftests/zram/zram02.sh [new file with mode: 0755]
tools/testing/selftests/zram/zram_lib.sh [new file with mode: 0755]
tools/vm/page-types.c
virt/kvm/arm/arch_timer.c
virt/kvm/arm/vgic-v2.c
virt/kvm/arm/vgic-v3.c
virt/kvm/arm/vgic.c
virt/kvm/irqchip.c
virt/kvm/kvm_main.c

index 4ad4a98b884b9c857166d993d167fe4953210201..fd3a355925432f58c1bcf84b16ec3ec8bc8c5457 100644 (file)
@@ -36,6 +36,7 @@
 modules.builtin
 Module.symvers
 *.dwo
+*.su
 
 #
 # Top-level generic files
@@ -44,6 +45,7 @@ Module.symvers
 /TAGS
 /linux
 /vmlinux
+/vmlinux.32
 /vmlinux-gdb.py
 /vmlinuz
 /System.map
@@ -89,6 +91,9 @@ GRTAGS
 GSYMS
 GTAGS
 
+# id-utils files
+ID
+
 *.orig
 *~
 \#*#
@@ -97,6 +102,7 @@ GTAGS
 # Leavings from module signing
 #
 extra_certificates
+signing_key.pem
 signing_key.priv
 signing_key.x509
 x509.genkey
diff --git a/CREDITS b/CREDITS
index bcb8efaa945903abd97fb207718bcdfacf516c55..8207cc62ee9d6079bb55032090ef518ef08f6b04 100644 (file)
--- a/CREDITS
+++ b/CREDITS
@@ -2992,6 +2992,10 @@ S: 2200 Mission College Blvd
 S: Santa Clara, CA 95052
 S: USA
 
+N: Anil Ravindranath
+E: anil_ravindranath@pmc-sierra.com
+D: PMC-Sierra MaxRAID driver
+
 N: Eric S. Raymond
 E: esr@thyrsus.com
 W: http://www.tuxedo.org/~esr/
diff --git a/Documentation/ABI/testing/sysfs-hypervisor-pmu b/Documentation/ABI/testing/sysfs-hypervisor-pmu
new file mode 100644 (file)
index 0000000..224faa1
--- /dev/null
@@ -0,0 +1,23 @@
+What:          /sys/hypervisor/pmu/pmu_mode
+Date:          August 2015
+KernelVersion: 4.3
+Contact:       Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Description:
+               Describes the mode that Xen's performance-monitoring unit (PMU)
+               uses. Accepted values are:
+                       "off"  -- PMU is disabled
+                       "self" -- The guest can profile itself
+                       "hv"   -- The guest can profile itself and, if it is
+                                 privileged (e.g. dom0), the hypervisor
+                       "all"  -- The guest can profile itself, the hypervisor
+                                 and all other guests. Only available to
+                                 privileged guests.
+
+What:           /sys/hypervisor/pmu/pmu_features
+Date:           August 2015
+KernelVersion:  4.3
+Contact:        Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Description:
+               Describes Xen PMU features (as an integer). A set bit indicates
+               that the corresponding feature is enabled. See
+               include/xen/interface/xenpmu.h for available features
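
For illustration only (not part of the patch above), a minimal userspace sketch of reading the pmu_mode node described in this ABI entry; only the sysfs path comes from the entry, everything else is assumed:

    #include <stdio.h>

    int main(void)
    {
            char mode[16];
            FILE *f = fopen("/sys/hypervisor/pmu/pmu_mode", "r");

            if (!f)
                    return 1;       /* node absent: not running under Xen, or PMU support off */
            if (fscanf(f, "%15s", mode) == 1)
                    printf("Xen PMU mode: %s\n", mode);
            fclose(f);
            return 0;
    }
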
index 646cdaa6e9d13304b7e647ccf158518634122feb..6d886300485827846541744075f7919be3d16200 100644 (file)
@@ -43,6 +43,7 @@ o  udev                   081                     # udevd --version
 o  grub                   0.93                    # grub --version || grub-install --version
 o  mcelog                 0.6                     # mcelog --version
 o  iptables               1.4.2                   # iptables -V
+o  openssl & libcrypto    1.0.1k                  # openssl version
 
 
 Kernel compilation
@@ -79,6 +80,17 @@ BC
 You will need bc to build kernels 3.10 and higher
 
 
+OpenSSL
+-------
+
+Module signing and external certificate handling use the OpenSSL program and
+crypto library to do key creation and signature generation.
+
+You will need openssl to build kernels 3.7 and higher if module signing is
+enabled.  You will also need openssl development packages to build kernels 4.3
+and higher.
+
+
 System utilities
 ================
 
@@ -295,6 +307,10 @@ Binutils
 --------
 o  <ftp://ftp.kernel.org/pub/linux/devel/binutils/>
 
+OpenSSL
+-------
+o  <https://www.openssl.org/>
+
 System utilities
 ****************
 
@@ -392,4 +408,3 @@ o  <http://oprofile.sf.net/download/>
 NFS-Utils
 ---------
 o  <http://nfs.sourceforge.net/>
-
index 7eba542eff7c8317d0da92ea4b638076b4f0f996..edccacd4f048a13e8afdb63db7d98ad41667a503 100644 (file)
@@ -104,6 +104,13 @@ crossing restrictions, pass 0 for alloc; passing 4096 says memory allocated
 from this pool must not cross 4KByte boundaries.
 
 
+       void *dma_pool_zalloc(struct dma_pool *pool, gfp_t mem_flags,
+                             dma_addr_t *handle)
+
+Wraps dma_pool_alloc() and also zeroes the returned memory if the
+allocation attempt succeeded.
+
+
        void *dma_pool_alloc(struct dma_pool *pool, gfp_t gfp_flags,
                        dma_addr_t *dma_handle);
 
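
As an aside (not part of the patch), a hedged kernel-side sketch of how dma_pool_zalloc() might be used; the helper name is made up and "pool" is assumed to have been created earlier with dma_pool_create():

    #include <linux/dmapool.h>
    #include <linux/gfp.h>

    /* Hypothetical helper: hand back a zeroed block from an existing pool. */
    static void *example_get_zeroed_block(struct dma_pool *pool,
                                          dma_addr_t *dma_handle)
    {
            /* dma_pool_zalloc() zeroes the block only when allocation succeeds */
            return dma_pool_zalloc(pool, GFP_KERNEL, dma_handle);
    }
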
index abba93f9d64a5c3244d1a2ff710a837d1d243848..1d6008d51b5552f4be9a71f4bc4e568b004fb1eb 100644 (file)
@@ -490,4 +490,31 @@ X!Ilib/fonts/fonts.c
 !Edrivers/hsi/hsi.c
   </chapter>
 
+  <chapter id="pwm">
+    <title>Pulse-Width Modulation (PWM)</title>
+    <para>
+      Pulse-width modulation is a modulation technique primarily used to
+      control power supplied to electrical devices.
+    </para>
+    <para>
+      The PWM framework provides an abstraction for providers and consumers
+      of PWM signals. A controller that provides one or more PWM signals is
+      registered as <structname>struct pwm_chip</structname>. Providers are
+      expected to embed this structure in a driver-specific structure. This
+      structure contains fields that describe a particular chip.
+    </para>
+    <para>
+      A chip exposes one or more PWM signal sources, each of which is exposed
+      as a <structname>struct pwm_device</structname>. Operations can be
+      performed on PWM devices to control the period, duty cycle, polarity
+      and active state of the signal.
+    </para>
+    <para>
+      Note that PWM devices are exclusive resources: they can only be
+      used by one consumer at a time.
+    </para>
+!Iinclude/linux/pwm.h
+!Edrivers/pwm/core.c
+  </chapter>
+
 </book>
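
To make the consumer side of the framework concrete, here is a hedged sketch (not part of the patch) of requesting and configuring a PWM device; "example_start_pwm" and the 1 kHz / 50% duty-cycle numbers are arbitrary, and "dev" is assumed to have an associated PWM lookup:

    #include <linux/err.h>
    #include <linux/pwm.h>

    /* Hypothetical consumer: run the device's PWM at 1 kHz, 50% duty cycle. */
    static int example_start_pwm(struct device *dev)
    {
            struct pwm_device *pwm = pwm_get(dev, NULL);
            int ret;

            if (IS_ERR(pwm))
                    return PTR_ERR(pwm);

            ret = pwm_config(pwm, 500000, 1000000);  /* duty_ns, period_ns */
            if (!ret)
                    ret = pwm_enable(pwm);
            if (ret)
                    pwm_put(pwm);   /* release on failure */
            return ret;
    }
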
index c4de576093affed9a5877bfb484c75c67246ab96..62435bb252660f0e870e71f0f7f3f6ae0449d397 100644 (file)
@@ -144,7 +144,8 @@ mem_used_max      RW    the maximum amount memory zram have consumed to
                         store compressed data
 mem_limit         RW    the maximum amount of memory ZRAM can use to store
                         the compressed data
-num_migrated      RO    the number of objects migrated migrated by compaction
+pages_compacted   RO    the number of pages freed during compaction
+                        (available only via zram<id>/mm_stat node)
 compact           WO    trigger memory compaction
 
 WARNING
index 68b6a6a470b073436b15aa18ef8ba822e39602b4..12686bec37b94491bc6b12a0440d812c6d393012 100644 (file)
@@ -201,7 +201,7 @@ Proportional weight policy files
          specifies the number of bytes.
 
 - blkio.io_serviced
-       - Number of IOs completed to/from the disk by the group. These
+       - Number of IOs (bio) issued to the disk by the group. These
          are further divided by the type of operation - read or write, sync
          or async. First two fields specify the major and minor number of the
          device, third field specifies the operation type and the fourth field
@@ -327,18 +327,11 @@ Note: If both BW and IOPS rules are specified for a device, then IO is
       subjected to both the constraints.
 
 - blkio.throttle.io_serviced
-       - Number of IOs (bio) completed to/from the disk by the group (as
-         seen by throttling policy). These are further divided by the type
-         of operation - read or write, sync or async. First two fields specify
-         the major and minor number of the device, third field specifies the
-         operation type and the fourth field specifies the number of IOs.
-
-         blkio.io_serviced does accounting as seen by CFQ and counts are in
-         number of requests (struct request). On the other hand,
-         blkio.throttle.io_serviced counts number of IO in terms of number
-         of bios as seen by throttling policy.  These bios can later be
-         merged by elevator and total number of requests completed can be
-         lesser.
+       - Number of IOs (bio) issued to the disk by the group. These
+         are further divided by the type of operation - read or write, sync
+         or async. First two fields specify the major and minor number of the
+         device, third field specifies the operation type and the fourth field
+         specifies the number of IOs.
 
 - blkio.throttle.io_service_bytes
        - Number of bytes transferred to/from the disk by the group. These
@@ -347,11 +340,6 @@ Note: If both BW and IOPS rules are specified for a device, then IO is
          device, third field specifies the operation type and the fourth field
          specifies the number of bytes.
 
-         These numbers should roughly be same as blkio.io_service_bytes as
-         updated by CFQ. The difference between two is that
-         blkio.io_service_bytes will not be updated if CFQ is not operating
-         on request queue.
-
 Common files among various policies
 -----------------------------------
 - blkio.reset_stats
index 1ee9caf29e576c0a8a89927fb9cced2ed5521c60..e0975c2cf03ded24709ca7189152094e220023a7 100644 (file)
@@ -27,7 +27,7 @@ CONTENTS
     5-3-1. Format
     5-3-2. Control Knobs
   5-4. Per-Controller Changes
-    5-4-1. blkio
+    5-4-1. io
     5-4-2. cpuset
     5-4-3. memory
 6. Planned Changes
@@ -203,7 +203,7 @@ other issues.  The mapping from nice level to weight isn't obvious or
 universal, and there are various other knobs which simply aren't
 available for tasks.
 
-The blkio controller implicitly creates a hidden leaf node for each
+The io controller implicitly creates a hidden leaf node for each
 cgroup to host the tasks.  The hidden leaf has its own copies of all
 the knobs with "leaf_" prefixed.  While this allows equivalent control
 over internal tasks, it's with serious drawbacks.  It always adds an
@@ -438,9 +438,62 @@ may be specified in any order and not all pairs have to be specified.
 
 5-4. Per-Controller Changes
 
-5-4-1. blkio
+5-4-1. io
 
-- blk-throttle becomes properly hierarchical.
+- blkio is renamed to io.  The interface is overhauled anyway.  The
+  new name is more in line with the other two major controllers, cpu
+  and memory, and better suited given that it may be used for cgroup
+  writeback without involving the block layer.
+
+- Everything including stat is always hierarchical, making separate
+  recursive stat files pointless and, as no internal node can have
+  tasks, leaf weights are meaningless.  The operation model is
+  simplified and the interface is overhauled accordingly.
+
+  io.stat
+
+       The stat file.  The reported stats are from the point where
+       bio's are issued to request_queue.  The stats are counted
+       bios are issued to the request_queue.  The stats are counted
+       independently of which policies are enabled.  Each line in the
+       file uses the following format.  More fields may later be
+       added at the end.
+         $MAJ:$MIN rbytes=$RBYTES wbytes=$WBYTES rios=$RIOS wrios=$WIOS
+
+  io.weight
+
+       The weight setting, currently only available and effective if
+       cfq-iosched is in use for the target device.  The weight is
+       between 1 and 10000 and defaults to 100.  The first line
+       always contains the default weight in the following format to
+       use when per-device setting is missing.
+
+         default $WEIGHT
+
+       Subsequent lines list per-device weights of the following
+       format.
+
+         $MAJ:$MIN $WEIGHT
+
+       Writing "$WEIGHT" or "default $WEIGHT" changes the default
+       setting.  Writing "$MAJ:$MIN $WEIGHT" sets per-device weight
+       while "$MAJ:$MIN default" clears it.
+
+       This file is available only on non-root cgroups.
+
+  io.max
+
+       The maximum bandwidth and/or iops setting, only available if
+       blk-throttle is enabled.  The file is of the following format.
+
+         $MAJ:$MIN rbps=$RBPS wbps=$WBPS riops=$RIOPS wiops=$WIOPS
+
+       ${R|W}BPS are read/write bytes per second and ${R|W}IOPS are
+       read/write IOs per second.  "max" indicates no limit.  Writing
+       to the file follows the same format but the individual
+       settings may be omitted or specified in any order.
+
+       This file is available only on non-root cgroups.
 
 
 5-4-2. cpuset
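
As an illustration of the interface described above (not part of the patch), a small userspace sketch that writes an io.max limit; the cgroup path and the 8:16 device numbers are invented for the example:

    #include <fcntl.h>
    #include <string.h>
    #include <unistd.h>

    /* Hypothetical: cap reads on device 8:16 at 2 MiB/s, leave writes unlimited. */
    static int example_set_io_max(void)
    {
            const char *limit = "8:16 rbps=2097152 wbps=max\n";
            int fd = open("/sys/fs/cgroup/example/io.max", O_WRONLY);

            if (fd < 0)
                    return -1;
            if (write(fd, limit, strlen(limit)) < 0) {
                    close(fd);
                    return -1;
            }
            return close(fd);
    }
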
diff --git a/Documentation/devicetree/bindings/hwmon/ina209.txt b/Documentation/devicetree/bindings/hwmon/ina209.txt
new file mode 100644 (file)
index 0000000..9dd2bee
--- /dev/null
@@ -0,0 +1,18 @@
+ina209 properties
+
+Required properties:
+- compatible: Must be "ti,ina209"
+- reg: I2C address
+
+Optional properties:
+
+- shunt-resistor
+       Shunt resistor value in micro-Ohm
+
+Example:
+
+temp-sensor@4c {
+       compatible = "ti,ina209";
+       reg = <0x4c>;
+       shunt-resistor = <5000>;
+};
diff --git a/Documentation/devicetree/bindings/hwmon/ina2xx.txt b/Documentation/devicetree/bindings/hwmon/ina2xx.txt
new file mode 100644 (file)
index 0000000..a2ad85d
--- /dev/null
@@ -0,0 +1,22 @@
+ina2xx properties
+
+Required properties:
+- compatible: Must be one of the following:
+       - "ti,ina219" for ina219
+       - "ti,ina220" for ina220
+       - "ti,ina226" for ina226
+       - "ti,ina230" for ina230
+- reg: I2C address
+
+Optional properties:
+
+- shunt-resistor
+       Shunt resistor value in micro-Ohm
+
+Example:
+
+ina220@44 {
+       compatible = "ti,ina220";
+       reg = <0x44>;
+       shunt-resistor = <1000>;
+};
diff --git a/Documentation/devicetree/bindings/hwmon/max6697.txt b/Documentation/devicetree/bindings/hwmon/max6697.txt
new file mode 100644 (file)
index 0000000..5f79399
--- /dev/null
@@ -0,0 +1,64 @@
+max6697 properties
+
+Required properties:
+- compatible:
+       Should be one of
+               maxim,max6581
+               maxim,max6602
+               maxim,max6622
+               maxim,max6636
+               maxim,max6689
+               maxim,max6693
+               maxim,max6694
+               maxim,max6697
+               maxim,max6698
+               maxim,max6699
+- reg: I2C address
+
+Optional properties:
+
+- smbus-timeout-disable
+       Set to disable SMBus timeout. If not specified, SMBus timeout will be
+       enabled.
+- extended-range-enable
+       Only valid for MAX6581. Set to enable extended temperature range.
+       Extended temperature will be disabled if not specified.
+- beta-compensation-enable
+       Only valid for MAX6693 and MAX6694. Set to enable beta compensation on
+       remote temperature channel 1.
+       Beta compensation will be disabled if not specified.
+- alert-mask
+       Alert bit mask. Alert disabled for bits set.
+       Select bit 0 for local temperature, bit 1..7 for remote temperatures.
+       If not specified, alert will be enabled for all channels.
+- over-temperature-mask
+       Over-temperature bit mask. Over-temperature reporting disabled for
+       bits set.
+       Select bit 0 for local temperature, bit 1..7 for remote temperatures.
+       If not specified, over-temperature reporting will be enabled for all
+       channels.
+- resistance-cancellation
+       Boolean for all chips other than MAX6581. Set to enable resistance
+       cancellation on remote temperature channel 1.
+       For MAX6581, resistance cancellation is enabled for all channels if
+       specified as a boolean, otherwise as per the specified bit mask.
+       Only supported for remote temperatures (bit 1..7).
+       If not specified, resistance cancellation will be disabled for all
+       channels.
+- transistor-ideality
+       For MAX6581 only. Two values; first is bit mask, second is ideality
+       select value as per MAX6581 data sheet. Select bit 1..7 for remote
+       channels.
+       Transistor ideality will be initialized to default (1.008) if not
+       specified.
+
+Example:
+
+temp-sensor@1a {
+       compatible = "maxim,max6697";
+       reg = <0x1a>;
+       smbus-timeout-disable;
+       resistance-cancellation;
+       alert-mask = <0x72>;
+       over-temperature-mask = <0x7f>;
+};
index 7cb0b5608f495b39da967cdd55fb7aa56e58e083..ebaa90c58c8e727bd7c9fbfcb56cbd8b5e20d345 100644 (file)
@@ -2,7 +2,11 @@ Binding for the Cadence I2C controller
 
 Required properties:
   - reg: Physical base address and size of the controller's register area.
-  - compatible: Compatibility string. Must be 'cdns,i2c-r1p10'.
+  - compatible: Should contain one of:
+               * "cdns,i2c-r1p10"
+               Note:   Use this when Cadence I2C controller version 1.0 is used.
+               * "cdns,i2c-r1p14"
+               Note:   Use this when Cadence I2C controller version 1.4 is used.
   - clocks: Input clock specifier. Refer to common clock bindings.
   - interrupts: Interrupt specifier. Refer to interrupt bindings.
   - #address-cells: Should be 1.
diff --git a/Documentation/devicetree/bindings/i2c/i2c-emev2.txt b/Documentation/devicetree/bindings/i2c/i2c-emev2.txt
new file mode 100644 (file)
index 0000000..5ed1ea1
--- /dev/null
@@ -0,0 +1,22 @@
+Device tree configuration for Renesas EMEV2 IIC controller
+
+Required properties:
+- compatible      : "renesas,iic-emev2"
+- reg             : address start and address range size of device
+- interrupts      : specifier for the IIC controller interrupt
+- clocks          : phandle to the IP core SCLK
+- clock-names     : must be "sclk"
+- #address-cells  : should be <1>
+- #size-cells     : should be <0>
+
+Example:
+
+       iic0: i2c@e0070000 {
+               #address-cells = <1>;
+               #size-cells = <0>;
+               compatible = "renesas,iic-emev2";
+               reg = <0xe0070000 0x28>;
+               interrupts = <0 32 IRQ_TYPE_EDGE_RISING>;
+               clocks = <&iic0_sclk>;
+               clock-names = "sclk";
+       };
diff --git a/Documentation/devicetree/bindings/i2c/i2c-lpc2k.txt b/Documentation/devicetree/bindings/i2c/i2c-lpc2k.txt
new file mode 100644 (file)
index 0000000..4101aa6
--- /dev/null
@@ -0,0 +1,33 @@
+NXP I2C controller for LPC2xxx/178x/18xx/43xx
+
+Required properties:
+ - compatible: must be "nxp,lpc1788-i2c"
+ - reg: physical address and length of the device registers
+ - interrupts: a single interrupt specifier
+ - clocks: clock for the device
+ - #address-cells: should be <1>
+ - #size-cells: should be <0>
+
+Optional properties:
+- clock-frequency: the desired I2C bus clock frequency in Hz; in
+  the absence of this property the default value is used (100 kHz).
+
+Example:
+i2c0: i2c@400a1000 {
+       compatible = "nxp,lpc1788-i2c";
+       reg = <0x400a1000 0x1000>;
+       interrupts = <18>;
+       clocks = <&ccu1 CLK_APB1_I2C0>;
+       #address-cells = <1>;
+       #size-cells = <0>;
+};
+
+&i2c0 {
+       clock-frequency = <400000>;
+
+       lm75@48 {
+               compatible = "nxp,lm75";
+               reg = <0x48>;
+       };
+};
+
diff --git a/Documentation/devicetree/bindings/i2c/i2c-mux-reg.txt b/Documentation/devicetree/bindings/i2c/i2c-mux-reg.txt
new file mode 100644 (file)
index 0000000..688783f
--- /dev/null
@@ -0,0 +1,74 @@
+Register-based I2C Bus Mux
+
+This binding describes an I2C bus multiplexer that uses a single register
+to route the I2C signals.
+
+Required properties:
+- compatible: i2c-mux-reg
+- i2c-parent: The phandle of the I2C bus that this multiplexer's master-side
+  port is connected to.
+* Standard I2C mux properties. See mux.txt in this directory.
+* I2C child bus nodes. See mux.txt in this directory.
+
+Optional properties:
+- reg: this pair of <offset size> specifies the register to control the mux.
+  The <offset size> depends on its parent node. It can be any memory-mapped
+  address. The size must be either 1, 2, or 4 bytes. If reg is omitted, the
+  resource of this device will be used.
+- little-endian: If present, indicates the register is little-endian.
+- big-endian: If present, indicates the register is big-endian.
+  If both little-endian and big-endian are omitted, the endianness of the
+  CPU will be used.
+- write-only: If present, indicates the register is write-only.
+- idle-state: value to set the muxer to when idle. When no value is
+  given, it defaults to the last value used.
+
+Whenever an access is made to a device on a child bus, the value set
+in the relevant node's reg property will be output to the register.
+
+If an idle state is defined, using the idle-state (optional) property,
+whenever an access is not being made to a device on a child bus, the
+register will be set according to the idle value.
+
+If an idle state is not defined, the most recently used value will be
+left programmed into the register.
+
+Example of a mux on a PCIe card where the host is a powerpc SoC (big endian):
+
+       i2c-mux {
+               /* the <offset size> depends on the address translation
+                * of the parent device. If omitted, device resource
+                * will be used instead. The size is to determine
+                * whether iowrite32, iowrite16, or iowrite8 will be used.
+                */
+               reg = <0x6028 0x4>;
+               little-endian;          /* little endian register on PCIe */
+               compatible = "i2c-mux-reg";
+               #address-cells = <1>;
+               #size-cells = <0>;
+               i2c-parent = <&i2c1>;
+               i2c@0 {
+                       reg = <0>;
+                       #address-cells = <1>;
+                       #size-cells = <0>;
+
+                       si5338: clock-generator@70 {
+                               compatible = "silabs,si5338";
+                               reg = <0x70>;
+                               /* other stuff */
+                       };
+               };
+
+               i2c@1 {
+                       /* data is written using iowrite32 */
+                       reg = <1>;
+                       #address-cells = <1>;
+                       #size-cells = <0>;
+
+                       si5338: clock-generator@70 {
+                               compatible = "silabs,si5338";
+                               reg = <0x70>;
+                               /* other stuff */
+                       };
+               };
+       };
diff --git a/Documentation/devicetree/bindings/i2c/i2c.txt b/Documentation/devicetree/bindings/i2c/i2c.txt
new file mode 100644 (file)
index 0000000..8a99150
--- /dev/null
@@ -0,0 +1,45 @@
+Generic device tree bindings for I2C busses
+===========================================
+
+This document describes generic bindings which can be used to describe I2C
+busses in a device tree.
+
+Required properties
+-------------------
+
+- #address-cells  - should be <1>. Read more about addresses below.
+- #size-cells     - should be <0>.
+- compatible      - name of I2C bus controller following generic names
+                   recommended practice.
+
+For other required properties e.g. to describe register sets,
+clocks, etc. check the binding documentation of the specific driver.
+
+The cells properties above define that the address of a child of an I2C bus
+is described by a single value. This is usually a 7 bit address. However,
+flags can be attached to the address. I2C_TEN_BIT_ADDRESS is used to mark a 10
+bit address. It is needed to avoid the ambiguity between e.g. a 7 bit address
+of 0x50 and a 10 bit address of 0x050 which, in theory, can be on the same bus.
+Another flag is I2C_OWN_SLAVE_ADDRESS, which marks addresses on which we
+ourselves listen, acting as slave devices.
+
+Optional properties
+-------------------
+
+These properties may not be supported by all drivers. However, if a driver
+wants to support one of the below features, it should adapt the bindings below.
+
+- clock-frequency      - frequency of bus clock in Hz.
+- wakeup-source                - device can be used as a wakeup source.
+
+- interrupts           - interrupts used by the device.
+- interrupt-names      - "irq" and "wakeup" names are recognized by I2C core,
+                         other names are left to individual drivers.
+
+The binding may contain an optional "interrupts" property describing the
+interrupts used by the device. The I2C core will assign the "irq" interrupt
+(or the very first interrupt if interrupt names are not used) as the primary
+interrupt for the slave.
+
+Also, if the device is marked as a wakeup source, the I2C core will set up the
+"wakeup" interrupt for the device. If no "wakeup" interrupt name is present in
+the binding, the primary interrupt will be used as the wakeup interrupt.
diff --git a/Documentation/devicetree/bindings/i2c/ina209.txt b/Documentation/devicetree/bindings/i2c/ina209.txt
deleted file mode 100644 (file)
index 9dd2bee..0000000
+++ /dev/null
@@ -1,18 +0,0 @@
-ina209 properties
-
-Required properties:
-- compatible: Must be "ti,ina209"
-- reg: I2C address
-
-Optional properties:
-
-- shunt-resistor
-       Shunt resistor value in micro-Ohm
-
-Example:
-
-temp-sensor@4c {
-       compatible = "ti,ina209";
-       reg = <0x4c>;
-       shunt-resistor = <5000>;
-};
diff --git a/Documentation/devicetree/bindings/i2c/ina2xx.txt b/Documentation/devicetree/bindings/i2c/ina2xx.txt
deleted file mode 100644 (file)
index a2ad85d..0000000
+++ /dev/null
@@ -1,22 +0,0 @@
-ina2xx properties
-
-Required properties:
-- compatible: Must be one of the following:
-       - "ti,ina219" for ina219
-       - "ti,ina220" for ina220
-       - "ti,ina226" for ina226
-       - "ti,ina230" for ina230
-- reg: I2C address
-
-Optional properties:
-
-- shunt-resistor
-       Shunt resistor value in micro-Ohm
-
-Example:
-
-ina220@44 {
-       compatible = "ti,ina220";
-       reg = <0x44>;
-       shunt-resistor = <1000>;
-};
diff --git a/Documentation/devicetree/bindings/i2c/max6697.txt b/Documentation/devicetree/bindings/i2c/max6697.txt
deleted file mode 100644 (file)
index 5f79399..0000000
+++ /dev/null
@@ -1,64 +0,0 @@
-max6697 properties
-
-Required properties:
-- compatible:
-       Should be one of
-               maxim,max6581
-               maxim,max6602
-               maxim,max6622
-               maxim,max6636
-               maxim,max6689
-               maxim,max6693
-               maxim,max6694
-               maxim,max6697
-               maxim,max6698
-               maxim,max6699
-- reg: I2C address
-
-Optional properties:
-
-- smbus-timeout-disable
-       Set to disable SMBus timeout. If not specified, SMBus timeout will be
-       enabled.
-- extended-range-enable
-       Only valid for MAX6581. Set to enable extended temperature range.
-       Extended temperature will be disabled if not specified.
-- beta-compensation-enable
-       Only valid for MAX6693 and MX6694. Set to enable beta compensation on
-       remote temperature channel 1.
-       Beta compensation will be disabled if not specified.
-- alert-mask
-       Alert bit mask. Alert disabled for bits set.
-       Select bit 0 for local temperature, bit 1..7 for remote temperatures.
-       If not specified, alert will be enabled for all channels.
-- over-temperature-mask
-       Over-temperature bit mask. Over-temperature reporting disabled for
-       bits set.
-       Select bit 0 for local temperature, bit 1..7 for remote temperatures.
-       If not specified, over-temperature reporting will be enabled for all
-       channels.
-- resistance-cancellation
-       Boolean for all chips other than MAX6581. Set to enable resistance
-       cancellation on remote temperature channel 1.
-       For MAX6581, resistance cancellation enabled for all channels if
-       specified as boolean, otherwise as per bit mask specified.
-       Only supported for remote temperatures (bit 1..7).
-       If not specified, resistance cancellation will be disabled for all
-       channels.
-- transistor-ideality
-       For MAX6581 only. Two values; first is bit mask, second is ideality
-       select value as per MAX6581 data sheet. Select bit 1..7 for remote
-       channels.
-       Transistor ideality will be initialized to default (1.008) if not
-       specified.
-
-Example:
-
-temp-sensor@1a {
-       compatible = "maxim,max6697";
-       reg = <0x1a>;
-       smbus-timeout-disable;
-       resistance-cancellation;
-       alert-mask = <0x72>;
-       over-temperature-mask = <0x7f>;
-};
diff --git a/Documentation/devicetree/bindings/i2c/ti,bq32k.txt b/Documentation/devicetree/bindings/i2c/ti,bq32k.txt
deleted file mode 100644 (file)
index e204906..0000000
+++ /dev/null
@@ -1,18 +0,0 @@
-* TI BQ32000                I2C Serial Real-Time Clock
-
-Required properties:
-- compatible: Should contain "ti,bq32000".
-- reg: I2C address for chip
-
-Optional properties:
-- trickle-resistor-ohms : Selected resistor for trickle charger
-       Values usable are 1120 and 20180
-       Should be given if trickle charger should be enabled
-- trickle-diode-disable : Do not use internal trickle charger diode
-       Should be given if internal trickle charger diode should be disabled
-Example:
-       bq32000: rtc@68 {
-               compatible = "ti,bq32000";
-               trickle-resistor-ohms = <1120>;
-               reg = <0x68>;
-       };
index 00f8652e193a940cab80cd6bf91b8127cdecd547..d77d412cbc685f5796d0ff6820a34970570cce95 100644 (file)
@@ -95,6 +95,8 @@ stm,m41t00            Serial Access TIMEKEEPER
 stm,m41t62             Serial real-time clock (RTC) with alarm
 stm,m41t80             M41T80 - SERIAL ACCESS RTC WITH ALARMS
 taos,tsl2550           Ambient Light Sensor with SMBUS/Two Wire Serial Interface
+ti,ads7828             8-Channels, 12-bit ADC
+ti,ads7830             8-Channels, 8-bit ADC
 ti,tsc2003             I2C Touch-Screen Controller
 ti,tmp102              Low Power Digital Temperature Sensor with SMBUS/Two Wire Serial Interface
 ti,tmp103              Low Power Digital Temperature Sensor with SMBUS/Two Wire Serial Interface
diff --git a/Documentation/devicetree/bindings/input/touchscreen/colibri-vf50-ts.txt b/Documentation/devicetree/bindings/input/touchscreen/colibri-vf50-ts.txt
new file mode 100644 (file)
index 0000000..9d9e930
--- /dev/null
@@ -0,0 +1,36 @@
+* Toradex Colibri VF50 Touchscreen driver
+
+Required Properties:
+- compatible: must be "toradex,vf50-touchscreen"
+- io-channels: adc channels being used by the Colibri VF50 module
+- xp-gpios: FET gate driver for input of X+
+- xm-gpios: FET gate driver for input of X-
+- yp-gpios: FET gate driver for input of Y+
+- ym-gpios: FET gate driver for input of Y-
+- interrupt-parent: phandle for the interrupt controller
+- interrupts: pen irq interrupt for touch detection
+- pinctrl-names: "idle", "default", "gpios"
+- pinctrl-0: pinctrl node for pen/touch detection state pinmux
+- pinctrl-1: pinctrl node for X/Y and pressure measurement (ADC) state pinmux
+- pinctrl-2: pinctrl node for gpios functioning as FET gate drivers
+- vf50-ts-min-pressure: pressure level at which to stop measuring X/Y values
+
+Example:
+
+       touchctrl: vf50_touchctrl {
+               compatible = "toradex,vf50-touchscreen";
+               io-channels = <&adc1 0>,<&adc0 0>,
+                               <&adc0 1>,<&adc1 2>;
+               xp-gpios = <&gpio0 13 GPIO_ACTIVE_LOW>;
+               xm-gpios = <&gpio2 29 GPIO_ACTIVE_HIGH>;
+               yp-gpios = <&gpio0 12 GPIO_ACTIVE_LOW>;
+               ym-gpios = <&gpio0 4 GPIO_ACTIVE_HIGH>;
+               interrupt-parent = <&gpio0>;
+               interrupts = <8 IRQ_TYPE_LEVEL_LOW>;
+               pinctrl-names = "idle","default","gpios";
+               pinctrl-0 = <&pinctrl_touchctrl_idle>;
+               pinctrl-1 = <&pinctrl_touchctrl_default>;
+               pinctrl-2 = <&pinctrl_touchctrl_gpios>;
+               vf50-ts-min-pressure = <200>;
+               status = "disabled";
+       };
diff --git a/Documentation/devicetree/bindings/input/touchscreen/imx6ul_tsc.txt b/Documentation/devicetree/bindings/input/touchscreen/imx6ul_tsc.txt
new file mode 100644 (file)
index 0000000..853dff9
--- /dev/null
@@ -0,0 +1,36 @@
+* Freescale i.MX6UL Touch Controller
+
+Required properties:
+- compatible: must be "fsl,imx6ul-tsc".
+- reg: this touch controller address and the ADC2 address.
+- interrupts: the interrupt of this touch controller and ADC2.
+- clocks: the root clock of touch controller and ADC2.
+- clock-names: must be "tsc" and "adc".
+- xnur-gpio: the X- gpio this controller connects to.
+  This xnur-gpio returns to low once the finger leaves the touch screen (the
+  last touch event the touch controller captures).
+
+Optional properties:
+- measure-delay-time: the value of the measure delay time.
+  Before an X-axis or Y-axis measurement, the screen needs some time for the
+  even potential distribution to become ready.
+  This value depends on the touch screen.
+- pre-charge-time: the time the touch screen needs to precharge.
+  This value depends on the touch screen.
+
+Example:
+       tsc: tsc@02040000 {
+               compatible = "fsl,imx6ul-tsc";
+               reg = <0x02040000 0x4000>, <0x0219c000 0x4000>;
+               interrupts = <GIC_SPI 3 IRQ_TYPE_LEVEL_HIGH>,
+                            <GIC_SPI 101 IRQ_TYPE_LEVEL_HIGH>;
+               clocks = <&clks IMX6UL_CLK_IPG>,
+                        <&clks IMX6UL_CLK_ADC2>;
+               clock-names = "tsc", "adc";
+               pinctrl-names = "default";
+               pinctrl-0 = <&pinctrl_tsc>;
+               xnur-gpio = <&gpio1 3 GPIO_ACTIVE_LOW>;
+               measure-delay-time = <0xfff>;
+               pre-charge-time = <0xffff>;
+               status = "okay";
+       };
index 06760503a819f5fbc15e747626ce3f734b2888ff..718074501fcbc97b5c7b5c33cab2736e433f5220 100644 (file)
@@ -43,6 +43,12 @@ conditions.
 
 ** System MMU optional properties:
 
+- dma-coherent  : Present if page table walks made by the SMMU are
+                  cache coherent with the CPU.
+
+                  NOTE: this only applies to the SMMU itself, not
+                  masters connected upstream of the SMMU.
+
 - calxeda,smmu-secure-config-access : Enable proper handling of buggy
                   implementations that always use secure access to
                   SMMU configuration registers. In this case non-secure
index 42531dc387aa6babaed955c1a821bf6a3eb77b84..869699925fd599e3d34c39155dfbb027d2262dca 100644 (file)
@@ -8,6 +8,11 @@ Required properties:
 - ti,hwmods  : Name of the hwmod associated with the IOMMU instance
 - reg        : Address space for the configuration registers
 - interrupts : Interrupt specifier for the IOMMU instance
+- #iommu-cells : Should be 0. OMAP IOMMUs are all "single-master" devices,
+                 and need no additional data in the pargs specifier. Please
+                 also refer to the generic bindings document for more info
+                 on this property,
+                     Documentation/devicetree/bindings/iommu/iommu.txt
 
 Optional properties:
 - ti,#tlb-entries : Number of entries in the translation look-aside buffer.
@@ -18,6 +23,7 @@ Optional properties:
 Example:
        /* OMAP3 ISP MMU */
        mmu_isp: mmu@480bd400 {
+               #iommu-cells = <0>;
                compatible = "ti,omap2-iommu";
                reg = <0x480bd400 0x80>;
                interrupts = <24>;
diff --git a/Documentation/devicetree/bindings/ipmi.txt b/Documentation/devicetree/bindings/ipmi.txt
new file mode 100644 (file)
index 0000000..d5f1a87
--- /dev/null
@@ -0,0 +1,25 @@
+IPMI device
+
+Required properties:
+- compatible: should be one of ipmi-kcs, ipmi-smic, or ipmi-bt
+- device_type: should be ipmi
+- reg: Address and length of the register set for the device
+
+Optional properties:
+- interrupts: The interrupt for the device.  Without this the interface
+       is polled.
+- reg-size: The size of the register.  Defaults to 1.
+- reg-spacing: The number of bytes between register starts.  Defaults to 1.
+- reg-shift: The amount to shift the registers to the right to get the data
+       into bit zero.
+
+Example:
+
+smic@fff3a000 {
+       compatible = "ipmi-smic";
+       device_type = "ipmi";
+       reg = <0xfff3a000 0x1000>;
+       interrupts = <0 24 4>;
+       reg-size = <4>;
+       reg-spacing = <4>;
+};
index 7e9490313d5add1f0fddf1327a3901625b1d6d55..da541c3631f81880c6e87b62e8ca2cb133d1aec4 100644 (file)
@@ -9,7 +9,7 @@ Device Tree Bindings for the Arasan SDHCI Controller
 
 Required Properties:
   - compatible: Compatibility string. Must be 'arasan,sdhci-8.9a' or
-                'arasan,sdhci-4.9a'
+                'arasan,sdhci-4.9a' or 'arasan,sdhci-5.1'
   - reg: From mmc bindings: Register location and length.
   - clocks: From clock bindings: Handles to clock inputs.
   - clock-names: From clock bindings: Tuple including "clk_xin" and "clk_ahb"
index 211e7785f4d240ec2ffc7258839f740e8b614b78..dca56d6248f5944880c0e73395344217fd420daf 100644 (file)
@@ -15,6 +15,7 @@ Required properties:
               "fsl,imx6q-usdhc"
               "fsl,imx6sl-usdhc"
               "fsl,imx6sx-usdhc"
+              "fsl,imx7d-usdhc"
 
 Optional properties:
 - fsl,wp-controller : Indicate to use controller internal write protection
@@ -27,6 +28,11 @@ Optional properties:
   transparent level shifters on the outputs of the controller. Two cells are
   required, first cell specifies minimum slot voltage (mV), second cell
   specifies maximum slot voltage (mV). Several ranges could be specified.
+- fsl,tuning-step: Specify the increasing delay cell steps in the tuning procedure.
+  The uSDHC uses one delay cell as the default increasing step in the tuning process.
+  This property allows the user to change the tuning step to more than one delay
+  cell, which is useful for some special boards or cards when the default
+  tuning step can't find the proper delay window within the limited tuning retries.
 
 Examples:
 
diff --git a/Documentation/devicetree/bindings/mmc/sdhci-atmel.txt b/Documentation/devicetree/bindings/mmc/sdhci-atmel.txt
new file mode 100644 (file)
index 0000000..1b662d7
--- /dev/null
@@ -0,0 +1,21 @@
+* Atmel SDHCI controller
+
+This file documents the differences between the core properties in
+Documentation/devicetree/bindings/mmc/mmc.txt and the properties used by the
+sdhci-of-at91 driver.
+
+Required properties:
+- compatible:          Must be "atmel,sama5d2-sdhci".
+- clocks:              Phandles to the clocks.
+- clock-names:         Must be "hclock", "multclk" and "baseclk".
+
+
+Example:
+
+sdmmc0: sdio-host@a0000000 {
+       compatible = "atmel,sama5d2-sdhci";
+       reg = <0xa0000000 0x300>;
+       interrupts = <31 IRQ_TYPE_LEVEL_HIGH 0>;
+       clocks = <&sdmmc0_hclk>, <&sdmmc0_gclk>, <&main>;
+       clock-names = "hclock", "multclk", "baseclk";
+};
index 76bf087bc8898fc82f9b7d48c94cce498a85be50..74166a0d460d9f162da02166a251a8be5122bb9c 100644 (file)
@@ -102,7 +102,7 @@ not every application needs SDIO irq, e.g. MMC cards.
                pinctrl-1 = <&mmc1_idle>;
                pinctrl-2 = <&mmc1_sleep>;
                ...
-               interrupts-extended = <&intc 64 &gpio2 28 0>;
+               interrupts-extended = <&intc 64 &gpio2 28 GPIO_ACTIVE_LOW>;
        };
 
        mmc1_idle : pinmux_cirq_pin {
diff --git a/Documentation/devicetree/bindings/pwm/lpc1850-sct-pwm.txt b/Documentation/devicetree/bindings/pwm/lpc1850-sct-pwm.txt
new file mode 100644 (file)
index 0000000..36e49d4
--- /dev/null
@@ -0,0 +1,20 @@
+* NXP LPC18xx State Configurable Timer - Pulse Width Modulator driver
+
+Required properties:
+  - compatible: Should be "nxp,lpc1850-sct-pwm"
+  - reg: Should contain physical base address and length of pwm registers.
+  - clocks: Must contain an entry for each entry in clock-names.
+    See ../clock/clock-bindings.txt for details.
+  - clock-names: Must include the following entries.
+    - pwm: PWM operating clock.
+  - #pwm-cells: Should be 3. See pwm.txt in this directory for the description
+    of the cells format.
+
+Example:
+  pwm: pwm@40000000 {
+    compatible = "nxp,lpc1850-sct-pwm";
+    reg = <0x40000000 0x1000>;
+    clocks =<&ccu1 CLK_CPU_SCT>;
+    clock-names = "pwm";
+    #pwm-cells = <3>;
+  };
diff --git a/Documentation/devicetree/bindings/rtc/nxp,lpc1788-rtc.txt b/Documentation/devicetree/bindings/rtc/nxp,lpc1788-rtc.txt
new file mode 100644 (file)
index 0000000..3c97bd1
--- /dev/null
@@ -0,0 +1,21 @@
+NXP LPC1788 real-time clock
+
+The LPC1788 RTC provides calendar and clock functionality
+together with periodic tick and alarm interrupt support.
+
+Required properties:
+- compatible   : must contain "nxp,lpc1788-rtc"
+- reg          : Specifies base physical address and size of the registers.
+- interrupts   : A single interrupt specifier.
+- clocks       : Must contain clock specifiers for rtc and register clock
+- clock-names  : Must contain "rtc" and "reg"
+  See ../clocks/clock-bindings.txt for details.
+
+Example:
+rtc: rtc@40046000 {
+       compatible = "nxp,lpc1788-rtc";
+       reg = <0x40046000 0x1000>;
+       interrupts = <47>;
+       clocks = <&creg_clk 0>, <&ccu1 CLK_CPU_BUS>;
+       clock-names = "rtc", "reg";
+};
index 43a83668673aace3e76a029c1629398aeca8244f..bf7d11ae9bea68f107936211d8256203f89a6157 100644 (file)
@@ -16,6 +16,8 @@ Required properties:
 Optional properties:
 - system-power-controller: whether the rtc is controlling the system power
   through pmic_power_en
+- clocks: Any internal or external clocks feeding into the rtc
+- clock-names: Corresponding names of the clocks
 
 Example:
 
@@ -26,4 +28,6 @@ rtc@1c23000 {
                      19>;
        interrupt-parent = <&intc>;
        system-power-controller;
+       clocks = <&clk_32k_rtc>, <&clk_32768_ck>;
+       clock-names = "ext-clk", "int-clk";
 };
diff --git a/Documentation/devicetree/bindings/rtc/ti,bq32k.txt b/Documentation/devicetree/bindings/rtc/ti,bq32k.txt
new file mode 100644 (file)
index 0000000..e204906
--- /dev/null
@@ -0,0 +1,18 @@
+* TI BQ32000                I2C Serial Real-Time Clock
+
+Required properties:
+- compatible: Should contain "ti,bq32000".
+- reg: I2C address for chip
+
+Optional properties:
+- trickle-resistor-ohms : Selected resistor for trickle charger
+       Values usable are 1120 and 20180
+       Should be given if trickle charger should be enabled
+- trickle-diode-disable : Do not use internal trickle charger diode
+       Should be given if internal trickle charger diode should be disabled
+Example:
+       bq32000: rtc@68 {
+               compatible = "ti,bq32000";
+               trickle-resistor-ohms = <1120>;
+               reg = <0x68>;
+       };
diff --git a/Documentation/devicetree/bindings/rtc/xlnx-rtc.txt b/Documentation/devicetree/bindings/rtc/xlnx-rtc.txt
new file mode 100644 (file)
index 0000000..0df6f01
--- /dev/null
@@ -0,0 +1,25 @@
+* Xilinx Zynq Ultrascale+ MPSoC Real Time Clock
+
+RTC controller for the Xilinx Zynq MPSoC Real Time Clock
+Separate IRQ lines for seconds and alarm
+
+Required properties:
+- compatible: Should be "xlnx,zynqmp-rtc"
+- reg: Physical base address of the controller and length
+       of memory mapped region.
+- interrupts: IRQ lines for the RTC.
+- interrupt-names: interrupt line names, e.g. "sec", "alarm"
+
+Optional:
+- calibration: calibration value for the 1 sec period, which will
+               be programmed directly to the calibration register
+
+Example:
+rtc: rtc@ffa60000 {
+       compatible = "xlnx,zynqmp-rtc";
+       reg = <0x0 0xffa60000 0x100>;
+       interrupt-parent = <&gic>;
+       interrupts = <0 26 4>, <0 27 4>;
+       interrupt-names = "alarm", "sec";
+       calibration = <0x198233>;
+};
index f65c76db98599253fbf068d89f1fb32e914b4377..97d9b3e1bf399d60a7aadbb8114b704924eb458f 100644 (file)
@@ -37,6 +37,12 @@ The edge is described by the following properties:
        Definition: the identifier of the remote processor in the smd channel
                    allocation table
 
+- qcom,remote-pid:
+       Usage: optional
+       Value type: <u32>
+       Definition: the identifier for the remote processor as known by the rest
+                   of the system.
+
 = SMD DEVICES
 
 In turn, subnodes of the "edges" represent devices tied to SMD channels on that
diff --git a/Documentation/devicetree/bindings/watchdog/atmel-sama5d4-wdt.txt b/Documentation/devicetree/bindings/watchdog/atmel-sama5d4-wdt.txt
new file mode 100644 (file)
index 0000000..f7cc7c0
--- /dev/null
@@ -0,0 +1,35 @@
+* Atmel SAMA5D4 Watchdog Timer (WDT) Controller
+
+Required properties:
+- compatible: "atmel,sama5d4-wdt"
+- reg: base physical address and length of memory mapped region.
+
+Optional properties:
+- timeout-sec: watchdog timeout value (in seconds).
+- interrupts: interrupt number to the CPU.
+- atmel,watchdog-type: should be "hardware" or "software".
+       "hardware": enable watchdog fault reset. A watchdog fault triggers
+                   watchdog reset.
+       "software": enable watchdog fault interrupt. A watchdog fault asserts
+                   watchdog interrupt.
+- atmel,idle-halt: present if you want to stop the watchdog when the CPU is
+                  in idle state.
+       CAUTION: This property should be used with care: it actually stops the
+       watchdog from counting when the CPU is in idle state, therefore the
+       watchdog reset time depends on mean CPU usage and the watchdog will not
+       reset at all if the CPU stops working while in idle state, which is
+       probably not what you want.
+- atmel,dbg-halt: present if you want to stop the watchdog when the CPU is
+                 in debug state.
+
+Example:
+       watchdog@fc068640 {
+               compatible = "atmel,sama5d4-wdt";
+               reg = <0xfc068640 0x10>;
+               interrupts = <4 IRQ_TYPE_LEVEL_HIGH 5>;
+               timeout-sec = <10>;
+               atmel,watchdog-type = "hardware";
+               atmel,dbg-halt;
+               atmel,idle-halt;
+               status = "okay";
+       };
diff --git a/Documentation/devicetree/bindings/watchdog/lpc18xx-wdt.txt b/Documentation/devicetree/bindings/watchdog/lpc18xx-wdt.txt
new file mode 100644 (file)
index 0000000..09f6b24
--- /dev/null
@@ -0,0 +1,19 @@
+* NXP LPC18xx Watchdog Timer (WDT)
+
+Required properties:
+- compatible: Should be "nxp,lpc1850-wwdt"
+- reg: Should contain WDT registers location and length
+- clocks: Must contain an entry for each entry in clock-names.
+- clock-names: Should contain "wdtclk" and "reg"; the watchdog counter
+               clock and register interface clock respectively.
+- interrupts: Should contain WDT interrupt
+
+Examples:
+
+watchdog@40080000 {
+       compatible = "nxp,lpc1850-wwdt";
+       reg = <0x40080000 0x24>;
+       clocks = <&cgu BASE_SAFE_CLK>, <&ccu1 CLK_CPU_WWDT>;
+       clock-names = "wdtclk", "reg";
+       interrupts = <49>;
+};
index 972d02c2a74ccd0ecad5554152ece286f9307b0b..df384e3e845f7b22f121427c6a0c2d352d7648ce 100644 (file)
@@ -20,7 +20,7 @@
     |        ia64: | TODO |
     |        m32r: |  ..  |
     |        m68k: |  ..  |
-    |       metag: |  ..  |
+    |       metag: | TODO |
     |  microblaze: |  ..  |
     |        mips: |  ok  |
     |     mn10300: |  ..  |
index 6a34a0f4d37ccf33248cfd81d2c917d45d8401bc..06d443450f2138fc8595ef43be0e64ebc33f66f7 100644 (file)
@@ -397,7 +397,8 @@ prototypes:
        int (*release) (struct gendisk *, fmode_t);
        int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
        int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
-       int (*direct_access) (struct block_device *, sector_t, void **, unsigned long *);
+       int (*direct_access) (struct block_device *, sector_t, void __pmem **,
+                               unsigned long *);
        int (*media_changed) (struct gendisk *);
        void (*unlock_native_capacity) (struct gendisk *);
        int (*revalidate_disk) (struct gendisk *);
index 7af2851d667c7ab0733ff177c7c1ca91d33bf85e..7bde64014a89716a5242b413e0536cb207864802 100644 (file)
@@ -60,9 +60,10 @@ Filesystem support consists of
 - implementing the direct_IO address space operation, and calling
   dax_do_io() instead of blockdev_direct_IO() if S_DAX is set
 - implementing an mmap file operation for DAX files which sets the
-  VM_MIXEDMAP flag on the VMA, and setting the vm_ops to include handlers
-  for fault and page_mkwrite (which should probably call dax_fault() and
-  dax_mkwrite(), passing the appropriate get_block() callback)
+  VM_MIXEDMAP and VM_HUGEPAGE flags on the VMA, and setting the vm_ops to
+  include handlers for fault, pmd_fault and page_mkwrite (which should
+  probably call dax_fault(), dax_pmd_fault() and dax_mkwrite(), passing the
+  appropriate get_block() callback)
 - calling dax_truncate_page() instead of block_truncate_page() for DAX files
 - calling dax_zero_page_range() instead of zero_user() for DAX files
 - ensuring that there is sufficient locking between reads, writes,
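
A minimal, hypothetical sketch of such a DAX-aware mmap operation is shown
below; the myfs_* names, and the trailing NULL completion argument passed to
the dax helpers, are illustrative assumptions, so the exact dax_fault(),
dax_pmd_fault() and dax_mkwrite() prototypes should be checked against the
headers of the kernel being targeted.

#include <linux/fs.h>
#include <linux/mm.h>

/* The filesystem's own get_block() callback; declared here only to keep the
 * sketch self-contained. */
extern int myfs_get_block(struct inode *inode, sector_t iblock,
                          struct buffer_head *bh_result, int create);

static int myfs_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        /* dax_fault() resolves the faulting offset through get_block(); no
         * unwritten-extent completion callback is used in this sketch. */
        return dax_fault(vma, vmf, myfs_get_block, NULL);
}

static int myfs_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
                              pmd_t *pmd, unsigned int flags)
{
        return dax_pmd_fault(vma, addr, pmd, flags, myfs_get_block, NULL);
}

static int myfs_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        return dax_mkwrite(vma, vmf, myfs_get_block, NULL);
}

static const struct vm_operations_struct myfs_dax_vm_ops = {
        .fault        = myfs_dax_fault,
        .pmd_fault    = myfs_dax_pmd_fault,
        .page_mkwrite = myfs_dax_mkwrite,
};

static int myfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
        file_accessed(file);
        vma->vm_ops = &myfs_dax_vm_ops;
        vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
        return 0;
}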
index 6f7fafde0884e1e991e9b598c1837bdcb573a19d..d411ca63c8b6ce1f00ba624ace1fe5e0045da554 100644 (file)
@@ -424,6 +424,7 @@ Private_Dirty:         0 kB
 Referenced:          892 kB
 Anonymous:             0 kB
 Swap:                  0 kB
+SwapPss:               0 kB
 KernelPageSize:        4 kB
 MMUPageSize:           4 kB
 Locked:              374 kB
@@ -433,16 +434,23 @@ the first of these lines shows the same information as is displayed for the
 mapping in /proc/PID/maps.  The remaining lines show the size of the mapping
 (size), the amount of the mapping that is currently resident in RAM (RSS), the
 process' proportional share of this mapping (PSS), the number of clean and
-dirty private pages in the mapping.  Note that even a page which is part of a
-MAP_SHARED mapping, but has only a single pte mapped, i.e.  is currently used
-by only one process, is accounted as private and not as shared.  "Referenced"
-indicates the amount of memory currently marked as referenced or accessed.
+dirty private pages in the mapping.
+
+The "proportional set size" (PSS) of a process is the count of pages it has
+in memory, where each page is divided by the number of processes sharing it.
+So if a process has 1000 pages all to itself, and 1000 shared with one other
+process, its PSS will be 1500.
+Note that even a page which is part of a MAP_SHARED mapping, but has only
+a single pte mapped, i.e.  is currently used by only one process, is accounted
+as private and not as shared.
+"Referenced" indicates the amount of memory currently marked as referenced or
+accessed.
 "Anonymous" shows the amount of memory that does not belong to any file.  Even
 a mapping associated with a file may contain anonymous pages: when MAP_PRIVATE
 and a page is modified, the file page is replaced by a private anonymous copy.
 "Swap" shows how much would-be-anonymous memory is also used, but out on
 swap.
-
+"SwapPss" shows proportional swap share of this mapping.
 "VmFlags" field deserves a separate description. This member represents the kernel
 flags associated with the particular virtual memory area in two letter encoded
 manner. The codes are the following:
index b80606de545ad809fb147688060b26b51af4d5e1..f59c43b6411b7d1b51b48c9acc1dd4b5f2338337 100644 (file)
@@ -21,8 +21,8 @@ exact way to do it depends on the GPIO controller providing the GPIOs, see the
 device tree bindings for your controller.
 
 GPIOs mappings are defined in the consumer device's node, in a property named
-<function>-gpios, where <function> is the function the driver will request
-through gpiod_get(). For example:
+either <function>-gpios or <function>-gpio, where <function> is the function
+the driver will request through gpiod_get(). For example:
 
        foo_device {
                compatible = "acme,foo";
@@ -31,7 +31,7 @@ through gpiod_get(). For example:
                            <&gpio 16 GPIO_ACTIVE_HIGH>, /* green */
                            <&gpio 17 GPIO_ACTIVE_HIGH>; /* blue */
 
-               power-gpios = <&gpio 1 GPIO_ACTIVE_LOW>;
+               power-gpio = <&gpio 1 GPIO_ACTIVE_LOW>;
        };
 
 This property will make GPIOs 15, 16 and 17 available to the driver under the
@@ -39,15 +39,24 @@ This property will make GPIOs 15, 16 and 17 available to the driver under the
 
        struct gpio_desc *red, *green, *blue, *power;
 
-       red = gpiod_get_index(dev, "led", 0);
-       green = gpiod_get_index(dev, "led", 1);
-       blue = gpiod_get_index(dev, "led", 2);
+       red = gpiod_get_index(dev, "led", 0, GPIOD_OUT_HIGH);
+       green = gpiod_get_index(dev, "led", 1, GPIOD_OUT_HIGH);
+       blue = gpiod_get_index(dev, "led", 2, GPIOD_OUT_HIGH);
 
-       power = gpiod_get(dev, "power");
+       power = gpiod_get(dev, "power", GPIOD_OUT_HIGH);
 
 The led GPIOs will be active-high, while the power GPIO will be active-low (i.e.
 gpiod_is_active_low(power) will be true).
 
+The second parameter of the gpiod_get() functions, the con_id string, has to be
+the <function> prefix of the GPIO suffixes ("gpios" or "gpio", automatically
+looked up by the gpiod functions internally) used in the device tree. With the
+above "led-gpios" example, use the prefix without the "-" as con_id: "led".
+
+Internally, the GPIO subsystem prefixes the GPIO suffix ("gpios" or "gpio")
+with the string passed in con_id to get the resulting string
+(snprintf(... "%s-%s", con_id, gpio_suffixes[])).
+
 ACPI
 ----
 ACPI also supports function names for GPIOs in a similar fashion to DT.
@@ -142,13 +151,14 @@ The driver controlling "foo.0" will then be able to obtain its GPIOs as follows:
 
        struct gpio_desc *red, *green, *blue, *power;
 
-       red = gpiod_get_index(dev, "led", 0);
-       green = gpiod_get_index(dev, "led", 1);
-       blue = gpiod_get_index(dev, "led", 2);
+       red = gpiod_get_index(dev, "led", 0, GPIOD_OUT_HIGH);
+       green = gpiod_get_index(dev, "led", 1, GPIOD_OUT_HIGH);
+       blue = gpiod_get_index(dev, "led", 2, GPIOD_OUT_HIGH);
 
-       power = gpiod_get(dev, "power");
-       gpiod_direction_output(power, 1);
+       power = gpiod_get(dev, "power", GPIOD_OUT_HIGH);
 
-Since the "power" GPIO is mapped as active-low, its actual signal will be 0
-after this code. Contrary to the legacy integer GPIO interface, the active-low
-property is handled during mapping and is thus transparent to GPIO consumers.
+Since the "led" GPIOs are mapped as active-high, this example will switch their
+signals to 1, i.e. enabling the LEDs. And for the "power" GPIO, which is mapped
+as active-low, its actual signal will be 0 after this code. Contrary to the legacy
+integer GPIO interface, the active-low property is handled during mapping and is
+thus transparent to GPIO consumers.
index a206639454ab7acdbf797c2598d818d2dc372544..e000502fde2006aa958a01e5c4fd357c5a6a21f9 100644 (file)
@@ -39,6 +39,9 @@ device that displays digits), an additional index argument can be specified:
                                          const char *con_id, unsigned int idx,
                                          enum gpiod_flags flags)
 
+For a more detailed description of the con_id parameter in the DeviceTree case,
+see Documentation/gpio/board.txt.
+
 The flags parameter is used to optionally specify a direction and initial value
 for the GPIO. Values can be:
 
index f0dd3d2fec96da61b56d010c19a83755d320f52f..76add4c9cd6893f4aa6aeaae427bf6becec6b4a3 100644 (file)
@@ -32,6 +32,10 @@ Supported chips:
     Prefix: 'nct6792'
     Addresses scanned: ISA address retrieved from Super I/O registers
     Datasheet: Available from Nuvoton upon request
+  * Nuvoton NCT6793D
+    Prefix: 'nct6793'
+    Addresses scanned: ISA address retrieved from Super I/O registers
+    Datasheet: Available from Nuvoton upon request
 
 Authors:
         Guenter Roeck <linux@roeck-us.net>
index 0e2d17b460fddc79b4127b8771c6edb58565780a..c3dbb3bfd8141f300346aa262a496b72320254b3 100644 (file)
@@ -20,6 +20,7 @@ It currently supports the following devices:
  * (type=5) Analog Devices evaluation boards: ADM1025, ADM1030, ADM1031
  * (type=6) Barco LPT->DVI (K5800236) adapter
  * (type=7) One For All JP1 parallel port adapter
+ * (type=8) VCT-jig
 
 These devices use different pinout configurations, so you have to tell
 the driver what you have, using the type module parameter. There is no
index 2dee4e2d62df195ee3897e21d0d1e73424a21985..61ed05cd95317f0cb92ce46dd06624ea7936fdb3 100644 (file)
@@ -31,10 +31,13 @@ User manual
 ===========
 
 I2C slave backends behave like standard I2C clients. So, you can instantiate
-them as described in the document 'instantiating-devices'. A quick example for
-instantiating the slave-eeprom driver from userspace at address 0x64 on bus 1:
+them as described in the document 'instantiating-devices'. The only difference
+is that i2c slave backends have their own address space. So, you have to add
+0x1000 to the address you would originally request. An example for
+instantiating the slave-eeprom driver from userspace at the 7 bit address 0x64
+on bus 1:
 
-  # echo slave-24c02 0x64 > /sys/bus/i2c/devices/i2c-1/new_device
+  # echo slave-24c02 0x1064 > /sys/bus/i2c/devices/i2c-1/new_device
 
 Each backend should come with separate documentation to describe its specific
 behaviour and setup.
index cdfe13901b99cb64a9bbc2484b13ca7cd174e878..7b2d11e53a49f14c09190449172d539c4725e503 100644 (file)
@@ -2,6 +2,10 @@ The I2C protocol knows about two kinds of device addresses: normal 7 bit
 addresses, and an extended set of 10 bit addresses. The sets of addresses
 do not intersect: the 7 bit address 0x10 is not the same as the 10 bit
 address 0x10 (though a single device could respond to both of them).
+To avoid ambiguity, the user sees 10 bit addresses mapped to a different
+address space, namely 0xa000-0xa3ff. The leading 0xa (= 10) represents the
+10 bit mode. This is used for creating device names in sysfs. It is also
+needed when instantiating 10 bit devices via the new_device file in sysfs.
 
 I2C messages to and from 10-bit address devices have a different format.
 See the I2C specification for the details.
index ddd519b72ee15fee315f7003b18caa1fc053ead1..9028b025501a6e4fab03a3e4be152c8fbbc6694c 100644 (file)
@@ -64,3 +64,23 @@ MTHCA
     fw_ver   - Firmware version
     hca_type - HCA type: "MT23108", "MT25208 (MT23108 compat mode)",
                or "MT25208"
+
+HFI1
+
+  The hfi1 driver also creates these additional files:
+
+   hw_rev - hardware revision
+   board_id - manufacturing board id
+   tempsense - thermal sense information
+   serial - board serial number
+   nfreectxts - number of free user contexts
+   nctxts - number of allowed contexts (PSM2)
+   chip_reset - diagnostic (root only)
+   boardversion - board version
+   ports/1/
+          CMgtA/
+               cc_settings_bin - CCA tables used by PSM2
+               cc_table_bin
+          sc2v/ - 32 files (0 - 31) used to translate sl->vl
+          sl2sc/ - 32 files (0 - 31) used to translate sl->sc
+          vl2mtu/ - 16 (0 - 15) files used to determine MTU for vl
index 39ac6546d4a42f5a97f027647aa1fb1147ec6fd2..df1b25eb838296f8fbc09d98459ec674f507e4eb 100644 (file)
@@ -265,7 +265,7 @@ Code  Seq#(hex)     Include File            Comments
 's'    all     linux/cdk.h
 't'    00-7F   linux/ppp-ioctl.h
 't'    80-8F   linux/isdn_ppp.h
-'t'    90      linux/toshiba.h
+'t'    90-91   linux/toshiba.h         toshiba and toshiba_acpi SMM
 'u'    00-1F   linux/smb_fs.h          gone
 'u'    20-3F   linux/uvcvideo.h        USB video class host driver
 'v'    00-1F   linux/ext2_fs.h         conflict!
index 6466704d47b5a5d0cf8e139051caa23b0ebc1503..0ff6a466a05b242bbba0a1ce9bf640b6a05fa8e7 100644 (file)
@@ -174,6 +174,11 @@ The output directory is often set using "O=..." on the commandline.
 
 The value can be overridden in which case the default value is ignored.
 
+KBUILD_SIGN_PIN
+--------------------------------------------------
+This variable allows a passphrase or PIN to be passed to the sign-file
+utility when signing kernel modules, if the private key requires one.
+
 KBUILD_MODPOST_WARN
 --------------------------------------------------
 KBUILD_MODPOST_WARN can be set to avoid errors in case of undefined
index f2529286cd66dbd8c68fdf7d90cf75de8c421b3b..22a4b687ea5b4b3cb9d576bfeffaed813256a795 100644 (file)
@@ -2285,6 +2285,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                        The default parameter value of '0' causes the kernel
                        not to attempt recovery of lost locks.
 
+       nfs4.layoutstats_timer =
+                       [NFSv4.2] Change the rate at which the kernel sends
+                       layoutstats to the pNFS metadata server.
+
+                       Setting this value to 0 causes the kernel to use
+                       whatever value is the default set by the layout
+                       driver. A non-zero value sets the minimum interval
+                       in seconds between layoutstats transmissions.
+
        nfsd.nfs4_disable_idmapping=
                        [NFSv4] When set to the default of '1', the NFSv4
                        server will return only numeric uids and gids to
@@ -4097,6 +4106,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                        plus one apbt timer for broadcast timer.
                        x86_intel_mid_timer=apbt_only | lapic_and_apbt
 
+       xen_512gb_limit         [KNL,X86-64,XEN]
+                       Restricts the kernel running paravirtualized under Xen
+                       to use only up to 512 GB of RAM. The reason to do so is
+                       crash analysis tools and Xen tools for doing domain
+                       that crash analysis tools and Xen tools for doing domain
+                       domains.
+
        xen_emul_unplug=                [HW,X86,XEN]
                        Unplug Xen emulated devices
                        Format: [unplug0,][unplug1]
index c72702ec1ded8b0b5d728e6cd0519a5862d2d3c7..a78bf1ffa68cb4c4defe32146fc75f8449a46245 100644 (file)
@@ -89,6 +89,32 @@ This has a number of options available:
      their signatures checked without causing a dependency loop.
 
 
+ (4) "File name or PKCS#11 URI of module signing key" (CONFIG_MODULE_SIG_KEY)
+
+     Setting this option to something other than its default of
+     "certs/signing_key.pem" will disable the autogeneration of signing keys
+     and allow the kernel modules to be signed with a key of your choosing.
+     The string provided should identify a file containing both a private key
+     and its corresponding X.509 certificate in PEM form, or — on systems where
+     the OpenSSL ENGINE_pkcs11 is functional — a PKCS#11 URI as defined by
+     RFC7512. In the latter case, the PKCS#11 URI should reference both a
+     certificate and a private key.
+
+     If the PEM file containing the private key is encrypted, or if the
+     PKCS#11 token requires a PIN, this can be provided at build time by
+     means of the KBUILD_SIGN_PIN variable.
+
+
+ (5) "Additional X.509 keys for default system keyring" (CONFIG_SYSTEM_TRUSTED_KEYS)
+
+     This option can be set to the filename of a PEM-encoded file containing
+     additional certificates which will be included in the system keyring by
+     default.
+
+Note that enabling module signing adds a dependency on the OpenSSL devel
+packages to the kernel build process for the tool that does the signing.
+
+
 =======================
 GENERATING SIGNING KEYS
 =======================
@@ -100,16 +126,16 @@ it can be deleted or stored securely.  The public key gets built into the
 kernel so that it can be used to check the signatures as the modules are
 loaded.
 
-Under normal conditions, the kernel build will automatically generate a new
-keypair using openssl if one does not exist in the files:
+Under normal conditions, when CONFIG_MODULE_SIG_KEY is unchanged from its
+default, the kernel build will automatically generate a new keypair using
+openssl if one does not exist in the file:
 
-       signing_key.priv
-       signing_key.x509
+       certs/signing_key.pem
 
 during the building of vmlinux (the public part of the key needs to be built
 into vmlinux) using parameters in the:
 
-       x509.genkey
+       certs/x509.genkey
 
 file (which is also generated if it does not already exist).
 
@@ -135,8 +161,12 @@ kernel sources tree and the openssl command.  The following is an example to
 generate the public/private key files:
 
        openssl req -new -nodes -utf8 -sha256 -days 36500 -batch -x509 \
-          -config x509.genkey -outform DER -out signing_key.x509 \
-          -keyout signing_key.priv
+          -config x509.genkey -outform PEM -out kernel_key.pem \
+          -keyout kernel_key.pem
+
+The full pathname for the resulting kernel_key.pem file can then be specified
+in the CONFIG_MODULE_SIG_KEY option, and the certificate and key therein will
+be used instead of an autogenerated keypair.
 
 
 =========================
@@ -152,10 +182,9 @@ in a keyring called ".system_keyring" that can be seen by:
        302d2d52 I------     1 perm 1f010000     0     0 asymmetri Fedora kernel signing key: d69a84e6bce3d216b979e9505b3e3ef9a7118079: X509.RSA a7118079 []
        ...
 
-Beyond the public key generated specifically for module signing, any file
-placed in the kernel source root directory or the kernel build root directory
-whose name is suffixed with ".x509" will be assumed to be an X.509 public key
-and will be added to the keyring.
+Beyond the public key generated specifically for module signing, additional
+trusted certificates can be provided in a PEM-encoded file referenced by the
+CONFIG_SYSTEM_TRUSTED_KEYS configuration option.
 
 Further, the architecture code may take public keys from a hardware store and
 add those in also (e.g. from the UEFI key database).
@@ -181,7 +210,7 @@ To manually sign a module, use the scripts/sign-file tool available in
 the Linux kernel source tree.  The script requires 4 arguments:
 
        1.  The hash algorithm (e.g., sha256)
-       2.  The private key filename
+       2.  The private key filename or PKCS#11 URI
        3.  The public key filename
        4.  The kernel module to be signed
 
@@ -194,6 +223,9 @@ The hash algorithm used does not have to match the one configured, but if it
 doesn't, you should make sure that hash algorithm is either built into the
 kernel or can be loaded without requiring itself.
 
+If the private key requires a passphrase or PIN, it can be provided in the
+$KBUILD_SIGN_PIN environment variable.
+
 
 ============================
 SIGNED MODULES AND STRIPPING
index de5e1aeca7fb95f8a34dff42519a339958aad4ff..5e6d07fbed07c5483a73a5d47c9ba8ec67a9e7c6 100644 (file)
@@ -28,6 +28,10 @@ Smack kernels use the CIPSO IP option. Some network
 configurations are intolerant of IP options and can impede
 access to systems that use them as Smack does.
 
+Smack is used in the Tizen operating system. Please
+go to http://wiki.tizen.org for information about how
+Smack is used in Tizen.
+
 The current git repository for Smack user space is:
 
        git://github.com/smack-team/smack.git
@@ -108,6 +112,8 @@ in the smackfs filesystem. This pseudo-filesystem is mounted
 on /sys/fs/smackfs.
 
 access
+       Provided for backward compatibility. The access2 interface
+       is preferred and should be used instead.
        This interface reports whether a subject with the specified
        Smack label has a particular access to an object with a
        specified Smack label. Write a fixed format access rule to
@@ -136,6 +142,8 @@ change-rule
        those in the fourth string. If there is no such rule it will be
        created using the access specified in the third and the fourth strings.
 cipso
+       Provided for backward compatibility. The cipso2 interface
+       is preferred and should be used instead.
        This interface allows a specific CIPSO header to be assigned
        to a Smack label. The format accepted on write is:
                "%24s%4d%4d"["%4d"]...
@@ -157,7 +165,19 @@ direct
 doi
        This contains the CIPSO domain of interpretation used in
        network packets.
+ipv6host
+       This interface allows specific IPv6 internet addresses to be
+       treated as single label hosts. Packets are sent to single
+       label hosts only from processes that have Smack write access
+       to the host label. All packets received from single label hosts
+       are given the specified label. The format accepted on write is:
+               "%h:%h:%h:%h:%h:%h:%h:%h label" or
+               "%h:%h:%h:%h:%h:%h:%h:%h/%d label".
+       The "::" address shortcut is not supported.
+       If label is "-DELETE" a matched entry will be deleted.
 load
+       Provided for backward compatibility. The load2 interface
+       is preferred and should be used instead.
        This interface allows access control rules in addition to
        the system defined rules to be specified. The format accepted
        on write is:
@@ -181,6 +201,8 @@ load2
        permissions that are not allowed. The string "r-x--" would
        specify read and execute access.
 load-self
+       Provided for backward compatibility. The load-self2 interface
+       is preferred and should be used instead.
        This interface allows process specific access rules to be
        defined. These rules are only consulted if access would
        otherwise be permitted, and are intended to provide additional
@@ -205,6 +227,8 @@ netlabel
        received from single label hosts are given the specified
        label. The format accepted on write is:
                "%d.%d.%d.%d label" or "%d.%d.%d.%d/%d label".
+       If the label specified is "-CIPSO" the address is treated
+       as a host that supports CIPSO headers.
 onlycap
        This contains labels processes must have for CAP_MAC_ADMIN
        and CAP_MAC_OVERRIDE to be effective. If this file is empty
@@ -232,7 +256,8 @@ unconfined
        is dangerous and can ruin the proper labeling of your system.
        It should never be used in production.
 
-You can add access rules in /etc/smack/accesses. They take the form:
+If you are using the smackload utility,
+you can add access rules in /etc/smack/accesses. They take the form:
 
     subjectlabel objectlabel access
 
index 227a63f018a27dfcbad74264d68655499ada6b78..d9ee7d7a6c7fdada4a84f95309b879f4ec099d1c 100644 (file)
@@ -1,9 +1,7 @@
-Yama is a Linux Security Module that collects a number of system-wide DAC
-security protections that are not handled by the core kernel itself. To
-select it at boot time, specify "security=yama" (though this will disable
-any other LSM).
-
-Yama is controlled through sysctl in /proc/sys/kernel/yama:
+Yama is a Linux Security Module that collects system-wide DAC security
+protections that are not handled by the core kernel itself. This is
+selectable at build-time with CONFIG_SECURITY_YAMA, and can be controlled
+at run-time through sysctls in /proc/sys/kernel/yama:
 
 - ptrace_scope
 
index f4cb0b2d5cd79048c51cf7b89f803561c2070e28..477927becacba69ee4bdea2203dd796979d14449 100644 (file)
@@ -15,8 +15,8 @@ The updated API replacements are:
 
 DEFINE_STATIC_KEY_TRUE(key);
 DEFINE_STATIC_KEY_FALSE(key);
-static_key_likely()
-statick_key_unlikely()
+static_branch_likely()
+static_branch_unlikely()
 
 0) Abstract
 
index 9c3f2f8054b5f90411b341a6e39a4b060c78804a..a4482fceacecc9443f6cdc12b5083acdeabbcca1 100644 (file)
@@ -349,7 +349,7 @@ zone[i]'s protection[j] is calculated by following expression.
 
 (i < j):
   zone[i]->protection[j]
-  = (total sums of present_pages from zone[i+1] to zone[j] on the node)
+  = (total sums of managed_pages from zone[i+1] to zone[j] on the node)
     / lowmem_reserve_ratio[i];
 (i = j):
    (should not be protected. = 0;
@@ -360,7 +360,7 @@ The default values of lowmem_reserve_ratio[i] are
     256 (if zone[i] means DMA or DMA32 zone)
     32  (others).
 As above expression, they are reciprocal number of ratio.
-256 means 1/256. # of protection pages becomes about "0.39%" of total present
+256 means 1/256. # of protection pages becomes about "0.39%" of total managed
 pages of higher zones on the node.
 
 If you would like to protect more pages, smaller values are effective.
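
As a rough worked example with the default ratio of 256 (the numbers are
purely illustrative): if the zones above ZONE_DMA on a node hold 1,000,000
managed pages in total, then ZONE_DMA's protection value for allocations
aimed at those higher zones is about 1,000,000 / 256 ≈ 3906 pages, i.e.
roughly the 0.39% figure quoted above.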
index 267f39386f99f77de3993f980652c6b28a7c76d3..13f5619b2203e68af6d766f66a8137dd1133d4fa 100644 (file)
@@ -75,7 +75,8 @@ On all -  write a character to /proc/sysrq-trigger.  e.g.:
 
 'e'     - Send a SIGTERM to all processes, except for init.
 
-'f'    - Will call oom_kill to kill a memory hog process.
+'f'    - Will call the oom killer to kill a memory hog process, but do not
+         panic if nothing can be killed.
 
 'g'    - Used by kgdb (kernel debugger)
 
index c1f6864a8c5db86def7b40ca7746e028e99bf37a..10f062ea6bc2b4f04e09ca48ab194071da9f298a 100644 (file)
@@ -180,6 +180,7 @@ Thermal zone device sys I/F, created once it's registered:
     |---temp:                  Current temperature
     |---mode:                  Working mode of the thermal zone
     |---policy:                        Thermal governor used for this zone
+    |---available_policies:    Available thermal governors for this zone
     |---trip_point_[0-*]_temp: Trip point temperature
     |---trip_point_[0-*]_type: Trip point type
     |---trip_point_[0-*]_hyst: Hysteresis value for this trip point
@@ -256,6 +257,10 @@ policy
        One of the various thermal governors used for a particular zone.
        RW, Required
 
+available_policies
+       Available thermal governors which can be used for a particular zone.
+       RO, Required
+
 trip_point_[0-*]_temp
        The temperature above which trip point will be fired.
        Unit: millidegree Celsius
@@ -417,6 +422,7 @@ method, the sys I/F structure will be built like this:
     |---temp:                  37000
     |---mode:                  enabled
     |---policy:                        step_wise
+    |---available_policies:    step_wise fair_share
     |---trip_point_0_temp:     100000
     |---trip_point_0_type:     critical
     |---trip_point_1_temp:     80000
index 87bb4aa6a6b98582e8cd4b97b707a62dc75c12ae..ef621d34ba5bf7a5a3c2d570de2ebaecf0615cc0 100644 (file)
@@ -691,6 +691,8 @@ The above is mostly meaningful for kernel developers.
         The marks are determined by the difference between this
         current trace and the next trace.
          '$' - greater than 1 second
+         '@' - greater than 100 millisecond
+         '*' - greater than 10 millisecond
          '#' - greater than 1000 microsecond
          '!' - greater than 100 microsecond
          '+' - greater than 10 microsecond
@@ -1944,26 +1946,49 @@ want, depending on your needs.
 
   ie:
 
-  0)               |    up_write() {
-  0)   0.646 us    |      _spin_lock_irqsave();
-  0)   0.684 us    |      _spin_unlock_irqrestore();
-  0)   3.123 us    |    }
-  0)   0.548 us    |    fput();
-  0) + 58.628 us   |  }
+  3) # 1837.709 us |          } /* __switch_to */
+  3)               |          finish_task_switch() {
+  3)   0.313 us    |            _raw_spin_unlock_irq();
+  3)   3.177 us    |          }
+  3) # 1889.063 us |        } /* __schedule */
+  3) ! 140.417 us  |      } /* __schedule */
+  3) # 2034.948 us |    } /* schedule */
+  3) * 33998.59 us |  } /* schedule_preempt_disabled */
 
   [...]
 
-  0)               |      putname() {
-  0)               |        kmem_cache_free() {
-  0)   0.518 us    |          __phys_addr();
-  0)   1.757 us    |        }
-  0)   2.861 us    |      }
-  0) ! 115.305 us  |    }
-  0) ! 116.402 us  |  }
+  1)   0.260 us    |              msecs_to_jiffies();
+  1)   0.313 us    |              __rcu_read_unlock();
+  1) + 61.770 us   |            }
+  1) + 64.479 us   |          }
+  1)   0.313 us    |          rcu_bh_qs();
+  1)   0.313 us    |          __local_bh_enable();
+  1) ! 217.240 us  |        }
+  1)   0.365 us    |        idle_cpu();
+  1)               |        rcu_irq_exit() {
+  1)   0.417 us    |          rcu_eqs_enter_common.isra.47();
+  1)   3.125 us    |        }
+  1) ! 227.812 us  |      }
+  1) ! 457.395 us  |    }
+  1) @ 119760.2 us |  }
+
+  [...]
+
+  2)               |    handle_IPI() {
+  1)   6.979 us    |                  }
+  2)   0.417 us    |      scheduler_ipi();
+  1)   9.791 us    |                }
+  1) + 12.917 us   |              }
+  2)   3.490 us    |    }
+  1) + 15.729 us   |            }
+  1) + 18.542 us   |          }
+  2) $ 3594274 us  |  }
 
   + means that the function exceeded 10 usecs.
   ! means that the function exceeded 100 usecs.
   # means that the function exceeded 1000 usecs.
+  * means that the function exceeded 10 msecs.
+  @ means that the function exceeded 100 msecs.
   $ means that the function exceeded 1 sec.
 
 
index a4ebcb712375478e9bb60a7652d49f4d46b9c978..d9ecceea5a02960615fb25fe74f08e6a8a5e0232 100644 (file)
@@ -2671,7 +2671,7 @@ handled.
 4.87 KVM_SET_GUEST_DEBUG
 
 Capability: KVM_CAP_SET_GUEST_DEBUG
-Architectures: x86, s390, ppc
+Architectures: x86, s390, ppc, arm64
 Type: vcpu ioctl
 Parameters: struct kvm_guest_debug (in)
 Returns: 0 on success; -1 on error
@@ -2693,8 +2693,8 @@ when running. Common control bits are:
 The top 16 bits of the control field are architecture specific control
 flags which can include the following:
 
-  - KVM_GUESTDBG_USE_SW_BP:     using software breakpoints [x86]
-  - KVM_GUESTDBG_USE_HW_BP:     using hardware breakpoints [x86, s390]
+  - KVM_GUESTDBG_USE_SW_BP:     using software breakpoints [x86, arm64]
+  - KVM_GUESTDBG_USE_HW_BP:     using hardware breakpoints [x86, s390, arm64]
   - KVM_GUESTDBG_INJECT_DB:     inject DB type exception [x86]
   - KVM_GUESTDBG_INJECT_BP:     inject BP type exception [x86]
   - KVM_GUESTDBG_EXIT_PENDING:  trigger an immediate guest exit [s390]
@@ -2709,6 +2709,11 @@ updated to the correct (supplied) values.
 The second part of the structure is architecture specific and
 typically contains a set of debug registers.
 
+For arm64 the number of debug registers is implementation defined and
+can be determined by querying the KVM_CAP_GUEST_DEBUG_HW_BPS and
+KVM_CAP_GUEST_DEBUG_HW_WPS capabilities which return a positive number
+indicating the number of supported registers.
+
 When debug events exit the main run loop with the reason
 KVM_EXIT_DEBUG with the kvm_debug_exit_arch part of the kvm_run
 structure containing architecture specific debug information.
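
A brief userspace sketch of setting the common control bits described above;
vcpu_fd is assumed to be a vcpu file descriptor obtained via KVM_CREATE_VCPU,
and any architecture specific registers would additionally be filled into
dbg.arch:

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int enable_sw_breakpoints(int vcpu_fd)
{
    struct kvm_guest_debug dbg;

    memset(&dbg, 0, sizeof(dbg));
    /* Guest debugging on, intercept software breakpoints. */
    dbg.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP;

    /* Returns 0 on success, -1 on error, as documented above. */
    return ioctl(vcpu_fd, KVM_SET_GUEST_DEBUG, &dbg);
}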
@@ -3111,11 +3116,13 @@ data_offset describes where the data is located (KVM_EXIT_IO_OUT) or
 where kvm expects application code to place the data for the next
 KVM_RUN invocation (KVM_EXIT_IO_IN).  Data format is a packed array.
 
+               /* KVM_EXIT_DEBUG */
                struct {
                        struct kvm_debug_exit_arch arch;
                } debug;
 
-Unused.
+If the exit_reason is KVM_EXIT_DEBUG, then a vcpu is processing a debug event
+for which architecture specific information is returned.
 
                /* KVM_EXIT_MMIO */
                struct {
index 081c49777abb81e54bc6bfee8b2ae553b3954b3e..6a5e2a102a451b186344361603b43bff4c4ea15c 100644 (file)
@@ -14,6 +14,8 @@ hugetlbpage.txt
        - a brief summary of hugetlbpage support in the Linux kernel.
 hwpoison.txt
        - explains what hwpoison is
+idle_page_tracking.txt
+       - description of the idle page tracking feature.
 ksm.txt
        - how to use the Kernel Samepage Merging feature.
 numa
index 030977fb8d2dcb0cb820b5989b9c0cfedd3c2d83..54dd9b9c6c31aeed967c0b7c8f6ca3e39d289cae 100644 (file)
@@ -329,7 +329,14 @@ Examples
 
 3) hugepage-mmap:  see tools/testing/selftests/vm/hugepage-mmap.c
 
-4) The libhugetlbfs (http://libhugetlbfs.sourceforge.net) library provides a
-   wide range of userspace tools to help with huge page usability, environment
-   setup, and control. Furthermore it provides useful test cases that should be
-   used when modifying code to ensure no regressions are introduced.
+4) The libhugetlbfs (https://github.com/libhugetlbfs/libhugetlbfs) library
+   provides a wide range of userspace tools to help with huge page usability,
+   environment setup, and control.
+
+Kernel development regression testing
+=====================================
+
+The most complete set of hugetlb tests are in the libhugetlbfs repository.
+If you modify any hugetlb related code, use the libhugetlbfs test suite
+to check for regressions.  In addition, if you add any new hugetlb
+functionality, please add appropriate tests to libhugetlbfs.
diff --git a/Documentation/vm/idle_page_tracking.txt b/Documentation/vm/idle_page_tracking.txt
new file mode 100644 (file)
index 0000000..85dcc3b
--- /dev/null
@@ -0,0 +1,98 @@
+MOTIVATION
+
+The idle page tracking feature allows tracking which memory pages are being
+accessed by a workload and which are idle. This information can be useful for
+estimating the workload's working set size, which, in turn, can be taken into
+account when configuring the workload parameters, setting memory cgroup limits,
+or deciding where to place the workload within a compute cluster.
+
+It is enabled by CONFIG_IDLE_PAGE_TRACKING=y.
+
+USER API
+
+The idle page tracking API is located at /sys/kernel/mm/page_idle. Currently,
+it consists of a single read-write file, /sys/kernel/mm/page_idle/bitmap.
+
+The file implements a bitmap where each bit corresponds to a memory page. The
+bitmap is represented by an array of 8-byte integers, and the page at PFN #i is
+mapped to bit #i%64 of array element #i/64, byte order is native. When a bit is
+set, the corresponding page is idle.
+
+A page is considered idle if it has not been accessed since it was marked idle
+(for more details on what "accessed" actually means see the IMPLEMENTATION
+DETAILS section). To mark a page idle one has to set the bit corresponding to
+the page by writing to the file. A value written to the file is OR-ed with the
+current bitmap value.
+
+Only accesses to user memory pages are tracked. These are pages mapped to a
+process address space, page cache and buffer pages, swap cache pages. For other
+page types (e.g. SLAB pages) an attempt to mark a page idle is silently ignored,
+and hence such pages are never reported idle.
+
+For huge pages the idle flag is set only on the head page, so one has to read
+/proc/kpageflags in order to correctly count idle huge pages.
+
+Reading from or writing to /sys/kernel/mm/page_idle/bitmap will return
+-EINVAL if you are not starting the read/write on an 8-byte boundary, or
+if the size of the read/write is not a multiple of 8 bytes. Writing to
+this file beyond max PFN will return -ENXIO.
+
+That said, in order to estimate the amount of pages that are not used by a
+workload one should:
+
+ 1. Mark all the workload's pages as idle by setting corresponding bits in
+    /sys/kernel/mm/page_idle/bitmap. The pages can be found by reading
+    /proc/pid/pagemap if the workload is represented by a process, or by
+    filtering out alien pages using /proc/kpagecgroup in case the workload is
+    placed in a memory cgroup.
+
+ 2. Wait until the workload accesses its working set.
+
+ 3. Read /sys/kernel/mm/page_idle/bitmap and count the number of bits set. If
+    one wants to ignore certain types of pages, e.g. mlocked pages since they
+    are not reclaimable, he or she can filter them out using /proc/kpageflags.
+
+See Documentation/vm/pagemap.txt for more information about /proc/pid/pagemap,
+/proc/kpageflags, and /proc/kpagecgroup.
+
+IMPLEMENTATION DETAILS
+
+The kernel internally keeps track of accesses to user memory pages in order to
+reclaim unreferenced pages first on memory shortage conditions. A page is
+considered referenced if it has been recently accessed via a process address
+space, in which case one or more PTEs it is mapped to will have the Accessed bit
+set, or marked accessed explicitly by the kernel (see mark_page_accessed()). The
+latter happens when:
+
+ - a userspace process reads or writes a page using a system call (e.g. read(2)
+   or write(2))
+
+ - a page that is used for storing filesystem buffers is read or written,
+   because a process needs filesystem metadata stored in it (e.g. lists a
+   directory tree)
+
+ - a page is accessed by a device driver using get_user_pages()
+
+When a dirty page is written to swap or disk as a result of memory reclaim or
+exceeding the dirty memory limit, it is not marked referenced.
+
+The idle memory tracking feature adds a new page flag, the Idle flag. This flag
+is set manually, by writing to /sys/kernel/mm/page_idle/bitmap (see the USER API
+section), and cleared automatically whenever a page is referenced as defined
+above.
+
+When a page is marked idle, the Accessed bit must be cleared in all PTEs it is
+mapped to, otherwise we will not be able to detect accesses to the page coming
+from a process address space. To avoid interference with the reclaimer, which,
+as noted above, uses the Accessed bit to promote actively referenced pages, one
+more page flag is introduced, the Young flag. When the PTE Accessed bit is
+cleared as a result of setting or updating a page's Idle flag, the Young flag
+is set on the page. The reclaimer treats the Young flag as an extra PTE
+Accessed bit and therefore will consider such a page as referenced.
+
+Since the idle memory tracking feature is based on the memory reclaimer logic,
+it only works with pages that are on an LRU list, other pages are silently
+ignored. That means it will ignore a user memory page if it is isolated, but
+since there are usually not many of them, it should not affect the overall
+result noticeably. In order not to stall scanning of the idle page bitmap,
+locked pages may be skipped too.
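
A minimal userspace sketch of steps 1 and 3 above, assuming
CONFIG_IDLE_PAGE_TRACKING=y and sufficient privileges; the starting PFN would
come from /proc/pid/pagemap or /proc/kpagecgroup as described:

#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>

/* Marks the 64 pages covered by one bitmap word as idle, and later re-reads
 * the same word; bits that are still set belong to pages that stayed idle. */
int mark_and_recheck(uint64_t pfn)
{
    int fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDWR);
    uint64_t bits = ~0ULL;              /* set all 64 bits: mark idle */
    off_t off = (pfn / 64) * 8;         /* 8-byte word covering this PFN */

    if (fd < 0)
        return -1;

    /* Writes are OR-ed into the bitmap, so this only sets bits. */
    pwrite(fd, &bits, sizeof(bits), off);

    /* ... let the workload run and touch its working set ... */

    pread(fd, &bits, sizeof(bits), off);
    close(fd);
    return bits == ~0ULL;               /* 1 if the whole word stayed idle */
}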
index 6bfbc172cdb96b437728a34eadf3e257ab456799..0e1e55588b598b45a60e3e476bad0dc8a9a53266 100644 (file)
@@ -5,7 +5,7 @@ pagemap is a new (as of 2.6.25) set of interfaces in the kernel that allow
 userspace programs to examine the page tables and related information by
 reading files in /proc.
 
-There are three components to pagemap:
+There are four components to pagemap:
 
  * /proc/pid/pagemap.  This file lets a userspace process find out which
    physical frame each virtual page is mapped to.  It contains one 64-bit
@@ -16,11 +16,17 @@ There are three components to pagemap:
     * Bits 0-4   swap type if swapped
     * Bits 5-54  swap offset if swapped
     * Bit  55    pte is soft-dirty (see Documentation/vm/soft-dirty.txt)
-    * Bits 56-60 zero
-    * Bit  61    page is file-page or shared-anon
+    * Bit  56    page exclusively mapped (since 4.2)
+    * Bits 57-60 zero
+    * Bit  61    page is file-page or shared-anon (since 3.5)
     * Bit  62    page swapped
     * Bit  63    page present
 
+   Since Linux 4.0 only users with the CAP_SYS_ADMIN capability can get PFNs.
+   In 4.0 and 4.1 opens by unprivileged users fail with -EPERM.  Starting
+   from 4.2 the PFN field is zeroed if the user does not have CAP_SYS_ADMIN.
+   Reason: information about PFNs helps in exploiting the Rowhammer
+   vulnerability.
+
    If the page is not present but in swap, then the PFN contains an
    encoding of the swap file number and the page's offset into the
    swap. Unmapped pages return a null PFN. This allows determining
@@ -64,6 +70,11 @@ There are three components to pagemap:
     22. THP
     23. BALLOON
     24. ZERO_PAGE
+    25. IDLE
+
+ * /proc/kpagecgroup.  This file contains a 64-bit inode number of the
+   memory cgroup each page is charged to, indexed by PFN. Only available when
+   CONFIG_MEMCG is set.
 
 Short descriptions to the page flags:
 
@@ -110,6 +121,12 @@ Short descriptions to the page flags:
 24. ZERO_PAGE
     zero page for pfn_zero or huge_zero page
 
+25. IDLE
+    page has not been accessed since it was marked idle (see
+    Documentation/vm/idle_page_tracking.txt). Note that this flag may be
+    stale in case the page was accessed via a PTE. To make sure the flag
+    is up-to-date one has to read /sys/kernel/mm/page_idle/bitmap first.
+
     [IO related page flags]
  1. ERROR     IO error occurred
  3. UPTODATE  page has up-to-date data
@@ -159,3 +176,8 @@ Other notes:
 Reading from any of the files will return -EINVAL if you are not starting
 the read on an 8-byte boundary (e.g., if you sought an odd number of bytes
 into the file), or if the size of the read is not a multiple of 8 bytes.
+
+Before Linux 3.11 pagemap bits 55-60 were used for "page-shift" (which is
+always 12 on most architectures). Since Linux 3.11 their meaning changes
+after the first clear of soft-dirty bits. Since Linux 4.2 they are used for
+flags unconditionally.
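
A minimal userspace sketch of decoding one /proc/self/pagemap entry using the
bit layout above; CAP_SYS_ADMIN is assumed, otherwise the PFN field reads
back as zero on 4.2 and later kernels:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int dump_pagemap_entry(void *addr)
{
    int fd = open("/proc/self/pagemap", O_RDONLY);
    long psize = sysconf(_SC_PAGESIZE);
    uint64_t e;
    off_t off = ((uintptr_t)addr / psize) * 8;   /* one 64-bit entry per page */

    if (fd < 0)
        return -1;
    if (pread(fd, &e, sizeof(e), off) != sizeof(e)) {
        close(fd);
        return -1;
    }
    close(fd);

    if (e & (1ULL << 63))                        /* bit 63: page present */
        printf("present, pfn=0x%llx, exclusive=%llu\n",
               (unsigned long long)(e & ((1ULL << 55) - 1)),
               (unsigned long long)((e >> 56) & 1));
    else if (e & (1ULL << 62))                   /* bit 62: page swapped */
        printf("swapped, type=%llu, offset=0x%llx\n",
               (unsigned long long)(e & 0x1f),
               (unsigned long long)((e >> 5) & ((1ULL << 50) - 1)));
    else
        printf("not present\n");
    return 0;
}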
index 8458c0861e4e6e01fcd2aac4aa6721e88a6a1f37..89fff7d611ccb533a5c3d375bc94fecf3c2e0687 100644 (file)
@@ -32,7 +32,7 @@ can also be enabled and disabled at runtime using the sysfs interface.
 An example command to enable zswap at runtime, assuming sysfs is mounted
 at /sys, is:
 
-echo 1 > /sys/modules/zswap/parameters/enabled
+echo 1 > /sys/module/zswap/parameters/enabled
 
 When zswap is disabled at runtime it will stop storing pages that are
 being swapped out.  However, it will _not_ immediately write out or fault
@@ -49,14 +49,26 @@ Zswap receives pages for compression through the Frontswap API and is able to
 evict pages from its own compressed pool on an LRU basis and write them back to
 the backing swap device in the case that the compressed pool is full.
 
-Zswap makes use of zbud for the managing the compressed memory pool.  Each
-allocation in zbud is not directly accessible by address.  Rather, a handle is
+Zswap makes use of zpool for managing the compressed memory pool.  Each
+allocation in zpool is not directly accessible by address.  Rather, a handle is
 returned by the allocation routine and that handle must be mapped before being
 accessed.  The compressed memory pool grows on demand and shrinks as compressed
-pages are freed.  The pool is not preallocated.
+pages are freed.  The pool is not preallocated.  By default, a zpool of type
+zbud is created, but it can be selected at boot time by setting the "zpool"
+attribute, e.g. zswap.zpool=zbud.  It can also be changed at runtime using the
+sysfs "zpool" attribute, e.g.
+
+echo zbud > /sys/module/zswap/parameters/zpool
+
+The zbud type zpool allocates exactly 1 page to store 2 compressed pages, which
+means the compression ratio will always be 2:1 or worse (because of half-full
+zbud pages).  The zsmalloc type zpool has a more complex compressed page
+storage method, and it can achieve greater storage densities.  However,
+zsmalloc does not implement compressed page eviction, so once zswap fills up
+it cannot evict the oldest page; it can only reject new pages.
 
 When a swap page is passed from frontswap to zswap, zswap maintains a mapping
-of the swap entry, a combination of the swap type and swap offset, to the zbud
+of the swap entry, a combination of the swap type and swap offset, to the zpool
 handle that references that compressed swap page.  This mapping is achieved
 with a red-black tree per swap type.  The swap offset is the search key for the
 tree nodes.
@@ -74,9 +86,17 @@ controlled policy:
 * max_pool_percent - The maximum percentage of memory that the compressed
     pool can occupy.
 
-Zswap allows the compressor to be selected at kernel boot time by setting the
-“compressor” attribute.  The default compressor is lzo.  e.g.
-zswap.compressor=deflate
+The default compressor is lzo, but it can be selected at boot time by setting
+the “compressor” attribute, e.g. zswap.compressor=lzo.  It can also be changed
+at runtime using the sysfs "compressor" attribute, e.g.
+
+echo lzo > /sys/module/zswap/parameters/compressor
+
+When the zpool and/or compressor parameter is changed at runtime, any existing
+compressed pages are not modified; they are left in their own zpool.  When a
+request is made for a page in an old zpool, it is uncompressed using its
+original compressor.  Once all pages are removed from an old zpool, the zpool
+and its compressor are freed.
 
 A debugfs interface is provided for various statistic about pool size, number
 of pages stored, and various counters for the reasons pages are rejected.
index 3da822967ee0e13d1a2dbcd88d2d75f5a4aca370..fcdde8fc98be3a122ee8eef82c42d828a4de4a7a 100644 (file)
@@ -41,6 +41,7 @@ static void term(int sig)
 int main(int argc, char *argv[])
 {
     int flags;
+    unsigned int ping_rate = 1;
 
     fd = open("/dev/watchdog", O_WRONLY);
 
@@ -63,22 +64,33 @@ int main(int argc, char *argv[])
            fprintf(stderr, "Watchdog card enabled.\n");
            fflush(stderr);
            goto end;
+       } else if (!strncasecmp(argv[1], "-t", 2) && argv[2]) {
+           flags = atoi(argv[2]);
+           ioctl(fd, WDIOC_SETTIMEOUT, &flags);
+           fprintf(stderr, "Watchdog timeout set to %u seconds.\n", flags);
+           fflush(stderr);
+           goto end;
+       } else if (!strncasecmp(argv[1], "-p", 2) && argv[2]) {
+           ping_rate = strtoul(argv[2], NULL, 0);
+           fprintf(stderr, "Watchdog ping rate set to %u seconds.\n", ping_rate);
+           fflush(stderr);
        } else {
-           fprintf(stderr, "-d to disable, -e to enable.\n");
+           fprintf(stderr, "-d to disable, -e to enable, -t <n> to set " \
+               "the timeout,\n-p <n> to set the ping rate, and \n");
            fprintf(stderr, "run by itself to tick the card.\n");
            fflush(stderr);
            goto end;
        }
-    } else {
-       fprintf(stderr, "Watchdog Ticking Away!\n");
-       fflush(stderr);
     }
 
+    fprintf(stderr, "Watchdog Ticking Away!\n");
+    fflush(stderr);
+
     signal(SIGINT, term);
 
     while(1) {
        keep_alive();
-       sleep(1);
+       sleep(ping_rate);
     }
 end:
     close(fd);
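
For reference, a hedged sketch of the keep_alive() helper the program above
relies on; the helper in the actual sample source may differ in detail, but
it boils down to a WDIOC_KEEPALIVE ioctl on the globally opened watchdog fd:

#include <sys/ioctl.h>
#include <linux/watchdog.h>

extern int fd;          /* the /dev/watchdog descriptor opened in main() */

static void keep_alive(void)
{
    int dummy;

    /* Pet the watchdog so it does not reset the machine. */
    ioctl(fd, WDIOC_KEEPALIVE, &dummy);
}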
index 4d8c8e10fb392f989fc79bb96d223bb079ff160b..274f85405584e1249c8b089b789f1a4d11973500 100644 (file)
@@ -1905,6 +1905,12 @@ L:       linux-mtd@lists.infradead.org
 S:     Supported
 F:     drivers/mtd/nand/atmel_nand*
 
+ATMEL SDMMC DRIVER
+M:     Ludovic Desroches <ludovic.desroches@atmel.com>
+L:     linux-mmc@vger.kernel.org
+S:     Supported
+F:     drivers/mmc/host/sdhci-of-at91.c
+
 ATMEL SPI DRIVER
 M:     Nicolas Ferre <nicolas.ferre@atmel.com>
 S:     Supported
@@ -2621,6 +2627,15 @@ S:       Supported
 F:     Documentation/filesystems/ceph.txt
 F:     fs/ceph/
 
+CERTIFICATE HANDLING:
+M:     David Howells <dhowells@redhat.com>
+M:     David Woodhouse <dwmw2@infradead.org>
+L:     keyrings@linux-nfs.org
+S:     Maintained
+F:     Documentation/module-signing.txt
+F:     certs/
+F:     scripts/extract-cert.c
+
 CERTIFIED WIRELESS USB (WUSB) SUBSYSTEM:
 L:     linux-usb@vger.kernel.org
 S:     Orphan
@@ -5326,6 +5341,7 @@ T:        git git://git.kernel.org/pub/scm/linux/kernel/git/dledford/rdma.git
 S:     Supported
 F:     Documentation/infiniband/
 F:     drivers/infiniband/
+F:     drivers/staging/rdma/
 F:     include/uapi/linux/if_infiniband.h
 F:     include/uapi/rdma/
 F:     include/rdma/
@@ -5583,7 +5599,7 @@ IPATH DRIVER
 M:     Mike Marciniszyn <infinipath@intel.com>
 L:     linux-rdma@vger.kernel.org
 S:     Maintained
-F:     drivers/infiniband/hw/ipath/
+F:     drivers/staging/rdma/ipath/
 
 IPMI SUBSYSTEM
 M:     Corey Minyard <minyard@acm.org>
@@ -5994,7 +6010,7 @@ F:        kernel/kexec.c
 
 KEYS/KEYRINGS:
 M:     David Howells <dhowells@redhat.com>
-L:     keyrings@linux-nfs.org
+L:     keyrings@vger.kernel.org
 S:     Maintained
 F:     Documentation/security/keys.txt
 F:     include/linux/key.h
@@ -6006,7 +6022,7 @@ KEYS-TRUSTED
 M:     David Safford <safford@us.ibm.com>
 M:     Mimi Zohar <zohar@linux.vnet.ibm.com>
 L:     linux-security-module@vger.kernel.org
-L:     keyrings@linux-nfs.org
+L:     keyrings@vger.kernel.org
 S:     Supported
 F:     Documentation/security/keys-trusted-encrypted.txt
 F:     include/keys/trusted-type.h
@@ -6017,7 +6033,7 @@ KEYS-ENCRYPTED
 M:     Mimi Zohar <zohar@linux.vnet.ibm.com>
 M:     David Safford <safford@us.ibm.com>
 L:     linux-security-module@vger.kernel.org
-L:     keyrings@linux-nfs.org
+L:     keyrings@vger.kernel.org
 S:     Supported
 F:     Documentation/security/keys-trusted-encrypted.txt
 F:     include/keys/encrypted-type.h
@@ -6220,6 +6236,7 @@ Q:        https://patchwork.kernel.org/project/linux-nvdimm/list/
 S:     Supported
 F:     drivers/nvdimm/pmem.c
 F:     include/linux/pmem.h
+F:     arch/*/include/asm/pmem.h
 
 LINUX FOR IBM pSERIES (RS/6000)
 M:     Paul Mackerras <paulus@au.ibm.com>
@@ -6435,11 +6452,11 @@ F:      drivers/hwmon/ltc4261.c
 LTP (Linux Test Project)
 M:     Mike Frysinger <vapier@gentoo.org>
 M:     Cyril Hrubis <chrubis@suse.cz>
-M:     Wanlong Gao <gaowanlong@cn.fujitsu.com>
+M:     Wanlong Gao <wanlong.gao@gmail.com>
 M:     Jan Stancek <jstancek@redhat.com>
 M:     Stanislav Kholmanskikh <stanislav.kholmanskikh@oracle.com>
 M:     Alexey Kodanev <alexey.kodanev@oracle.com>
-L:     ltp-list@lists.sourceforge.net (subscribers-only)
+L:     ltp@lists.linux.it (subscribers-only)
 W:     http://linux-test-project.github.io/
 T:     git git://github.com/linux-test-project/ltp.git
 S:     Maintained
@@ -6772,6 +6789,14 @@ W:       http://www.mellanox.com
 Q:     http://patchwork.ozlabs.org/project/netdev/list/
 F:     drivers/net/ethernet/mellanox/mlxsw/
 
+MEMBARRIER SUPPORT
+M:     Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+M:     "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
+L:     linux-kernel@vger.kernel.org
+S:     Supported
+F:     kernel/membarrier.c
+F:     include/uapi/linux/membarrier.h
+
 MEMORY MANAGEMENT
 L:     linux-mm@kvack.org
 W:     http://www.linux-mm.org
@@ -6837,6 +6862,12 @@ T:       git git://git.monstr.eu/linux-2.6-microblaze.git
 S:     Supported
 F:     arch/microblaze/
 
+MICROSOFT SURFACE PRO 3 BUTTON DRIVER
+M:     Chen Yu <yu.c.chen@intel.com>
+L:     platform-driver-x86@vger.kernel.org
+S:     Supported
+F:     drivers/platform/x86/surfacepro3_button.c
+
 MICROTEK X6 SCANNER
 M:     Oliver Neukum <oliver@neukum.org>
 S:     Maintained
@@ -7354,7 +7385,7 @@ F:        drivers/scsi/nsp32*
 NIOS2 ARCHITECTURE
 M:     Ley Foon Tan <lftan@altera.com>
 L:     nios2-dev@lists.rocketboards.org (moderated for non-subscribers)
-T:     git git://git.rocketboards.org/linux-socfpga-next.git
+T:     git git://git.kernel.org/pub/scm/linux/kernel/git/lftan/nios2.git
 S:     Maintained
 F:     arch/nios2/
 
@@ -7373,6 +7404,7 @@ NTB DRIVER CORE
 M:     Jon Mason <jdmason@kudzu.us>
 M:     Dave Jiang <dave.jiang@intel.com>
 M:     Allen Hubbe <Allen.Hubbe@emc.com>
+L:     linux-ntb@googlegroups.com
 S:     Supported
 W:     https://github.com/jonmason/ntb/wiki
 T:     git git://github.com/jonmason/ntb.git
@@ -7384,6 +7416,7 @@ F:        include/linux/ntb_transport.h
 NTB INTEL DRIVER
 M:     Jon Mason <jdmason@kudzu.us>
 M:     Dave Jiang <dave.jiang@intel.com>
+L:     linux-ntb@googlegroups.com
 S:     Supported
 W:     https://github.com/jonmason/ntb/wiki
 T:     git git://github.com/jonmason/ntb.git
@@ -8176,10 +8209,9 @@ F:       drivers/hwmon/pmbus/
 F:     include/linux/i2c/pmbus.h
 
 PMC SIERRA MaxRAID DRIVER
-M:     Anil Ravindranath <anil_ravindranath@pmc-sierra.com>
 L:     linux-scsi@vger.kernel.org
 W:     http://www.pmc-sierra.com/
-S:     Supported
+S:     Orphan
 F:     drivers/scsi/pmcraid.*
 
 PMC SIERRA PM8001 DRIVER
@@ -9264,6 +9296,12 @@ T:       git git://git.kernel.org/pub/scm/linux/kernel/git/jj/apparmor-dev.git
 S:     Supported
 F:     security/apparmor/
 
+YAMA SECURITY MODULE
+M:     Kees Cook <keescook@chromium.org>
+T:     git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git yama/tip
+S:     Supported
+F:     security/yama/
+
 SENSABLE PHANTOM
 M:     Jiri Slaby <jirislaby@gmail.com>
 S:     Maintained
@@ -9948,6 +9986,12 @@ M:       Arnaud Patard <arnaud.patard@rtp-net.org>
 S:     Odd Fixes
 F:     drivers/staging/xgifb/
 
+HFI1 DRIVER
+M:     Mike Marciniszyn <infinipath@intel.com>
+L:     linux-rdma@vger.kernel.org
+S:     Supported
+F:     drivers/staging/rdma/hfi1
+
 STARFIRE/DURALAN NETWORK DRIVER
 M:     Ion Badulescu <ionut@badula.org>
 S:     Odd Fixes
@@ -10473,7 +10517,6 @@ F:      drivers/platform/x86/toshiba_haps.c
 
 TOSHIBA SMM DRIVER
 M:     Jonathan Buzzard <jonathan@buzzard.org.uk>
-L:     tlinux-users@tce.toshiba-dme.co.jp
 W:     http://www.buzzard.org.uk/toshiba/
 S:     Maintained
 F:     drivers/char/toshiba.c
index c3615937df3891f8418db7a05656c9969954ee80..1a132ea43ca52588909bb054624177c8f8f63095 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 VERSION = 4
-PATCHLEVEL = 2
+PATCHLEVEL = 3
 SUBLEVEL = 0
-EXTRAVERSION =
+EXTRAVERSION = -rc1
 NAME = Hurr durr I'ma sheep
 
 # *DOCUMENTATION*
@@ -666,14 +666,7 @@ endif
 endif
 KBUILD_CFLAGS += $(stackp-flag)
 
-ifeq ($(shell $(CC) -v 2>&1 | grep -c "clang version"), 1)
-COMPILER := clang
-else
-COMPILER := gcc
-endif
-export COMPILER
-
-ifeq ($(COMPILER),clang)
+ifeq ($(cc-name),clang)
 KBUILD_CPPFLAGS += $(call cc-option,-Qunused-arguments,)
 KBUILD_CPPFLAGS += $(call cc-option,-Wno-unknown-warning-option,)
 KBUILD_CFLAGS += $(call cc-disable-warning, unused-variable)
@@ -875,10 +868,9 @@ INITRD_COMPRESS-$(CONFIG_RD_LZ4)   := lz4
 # export INITRD_COMPRESS := $(INITRD_COMPRESS-y)
 
 ifdef CONFIG_MODULE_SIG_ALL
-MODSECKEY = ./signing_key.priv
-MODPUBKEY = ./signing_key.x509
-export MODPUBKEY
-mod_sign_cmd = perl $(srctree)/scripts/sign-file $(CONFIG_MODULE_SIG_HASH) $(MODSECKEY) $(MODPUBKEY)
+$(eval $(call config_filename,MODULE_SIG_KEY))
+
+mod_sign_cmd = scripts/sign-file $(CONFIG_MODULE_SIG_HASH) $(MODULE_SIG_KEY_SRCPREFIX)$(CONFIG_MODULE_SIG_KEY) certs/signing_key.x509
 else
 mod_sign_cmd = true
 endif
@@ -886,7 +878,7 @@ export mod_sign_cmd
 
 
 ifeq ($(KBUILD_EXTMOD),)
-core-y         += kernel/ mm/ fs/ ipc/ security/ crypto/ block/
+core-y         += kernel/ certs/ mm/ fs/ ipc/ security/ crypto/ block/
 
 vmlinux-dirs   := $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \
                     $(core-y) $(core-m) $(drivers-y) $(drivers-m) \
@@ -1178,8 +1170,8 @@ MRPROPER_DIRS  += include/config usr/include include/generated          \
                  arch/*/include/generated .tmp_objdiff
 MRPROPER_FILES += .config .config.old .version .old_version \
                  Module.symvers tags TAGS cscope* GPATH GTAGS GRTAGS GSYMS \
-                 signing_key.priv signing_key.x509 x509.genkey         \
-                 extra_certificates signing_key.x509.keyid             \
+                 signing_key.pem signing_key.priv signing_key.x509     \
+                 x509.genkey extra_certificates signing_key.x509.keyid \
                  signing_key.x509.signer vmlinux-gdb.py
 
 # clean - Delete most, but leave enough to build external modules
@@ -1433,6 +1425,7 @@ clean: $(clean-dirs)
                \( -name '*.[oas]' -o -name '*.ko' -o -name '.*.cmd' \
                -o -name '*.ko.*' \
                -o -name '*.dwo'  \
+               -o -name '*.su'  \
                -o -name '.*.d' -o -name '.*.tmp' -o -name '*.mod.c' \
                -o -name '*.symtypes' -o -name 'modules.order' \
                -o -name modules.builtin -o -name '.tmp_*.o.*' \
index 8f35649305804c913efe1501e486ddb465e12810..4e949e58b1928363232abac3a69a25413e90652e 100644 (file)
@@ -2,6 +2,9 @@
 # General architecture dependent options
 #
 
+config KEXEC_CORE
+       bool
+
 config OPROFILE
        tristate "OProfile system profiling"
        depends on PROFILING
index dfa32f0613201a92c5c6e5bf81443302deff46ac..72a8ca7796d91a2d2a92d696ce507650678c3998 100644 (file)
@@ -12,42 +12,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 
 #include <asm-generic/dma-mapping-common.h>
 
-#define dma_alloc_coherent(d,s,h,f)    dma_alloc_attrs(d,s,h,f,NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                   dma_addr_t *dma_handle, gfp_t gfp,
-                                   struct dma_attrs *attrs)
-{
-       return get_dma_ops(dev)->alloc(dev, size, dma_handle, gfp, attrs);
-}
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                 void *vaddr, dma_addr_t dma_handle,
-                                 struct dma_attrs *attrs)
-{
-       get_dma_ops(dev)->free(dev, size, vaddr, dma_handle, attrs);
-}
-
-static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-       return get_dma_ops(dev)->mapping_error(dev, dma_addr);
-}
-
-static inline int dma_supported(struct device *dev, u64 mask)
-{
-       return get_dma_ops(dev)->dma_supported(dev, mask);
-}
-
-static inline int dma_set_mask(struct device *dev, u64 mask)
-{
-       return get_dma_ops(dev)->set_dma_mask(dev, mask);
-}
-
-#define dma_alloc_noncoherent(d, s, h, f)      dma_alloc_coherent(d, s, h, f)
-#define dma_free_noncoherent(d, s, v, h)       dma_free_coherent(d, s, v, h)
-
 #define dma_cache_sync(dev, va, size, dir)               ((void)0)
 
 #endif /* _ALPHA_DMA_MAPPING_H */
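
Illustrative sketch, not part of this diff: with the alpha-specific wrappers removed, drivers go through the generic helpers that asm-generic/dma-mapping-common.h now supplies. A minimal, hypothetical usage:

        #include <linux/dma-mapping.h>

        /* Sketch: allocate and free a coherent DMA buffer via the generic helpers. */
        static void *sketch_dma_buf_alloc(struct device *dev, size_t size,
                                          dma_addr_t *handle)
        {
                return dma_alloc_coherent(dev, size, handle, GFP_KERNEL); /* NULL on failure */
        }

        static void sketch_dma_buf_free(struct device *dev, size_t size,
                                        void *cpu_addr, dma_addr_t handle)
        {
                dma_free_coherent(dev, size, cpu_addr, handle);
        }
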
index f05bdb4b1cb97ee8f0d3bbec58fbc7285522c852..ff4049155c840c2fc4bc9e8015b5282dadb57469 100644 (file)
@@ -297,7 +297,9 @@ static inline void __iomem * ioremap_nocache(unsigned long offset,
                                             unsigned long size)
 {
        return ioremap(offset, size);
-} 
+}
+
+#define ioremap_uc ioremap_nocache
 
 static inline void iounmap(volatile void __iomem *addr)
 {
index df24b76f92461a5df780059eea1488e3b50945e7..2b1f4a1e92723bf67450c3d3ba4006e792971c70 100644 (file)
@@ -166,15 +166,6 @@ static int alpha_noop_supported(struct device *dev, u64 mask)
        return mask < 0x00ffffffUL ? 0 : 1;
 }
 
-static int alpha_noop_set_mask(struct device *dev, u64 mask)
-{
-       if (!dev->dma_mask || !dma_supported(dev, mask))
-               return -EIO;
-
-       *dev->dma_mask = mask;
-       return 0;
-}
-
 struct dma_map_ops alpha_noop_ops = {
        .alloc                  = alpha_noop_alloc_coherent,
        .free                   = alpha_noop_free_coherent,
@@ -182,7 +173,6 @@ struct dma_map_ops alpha_noop_ops = {
        .map_sg                 = alpha_noop_map_sg,
        .mapping_error          = alpha_noop_mapping_error,
        .dma_supported          = alpha_noop_supported,
-       .set_dma_mask           = alpha_noop_set_mask,
 };
 
 struct dma_map_ops *dma_ops = &alpha_noop_ops;
index eddee77203431fb9d50d7c0feb313f26d5985d8e..8969bf2dfe3a0d4ff797888d2ce0a4a8785103dc 100644 (file)
@@ -939,16 +939,6 @@ static int alpha_pci_mapping_error(struct device *dev, dma_addr_t dma_addr)
        return dma_addr == 0;
 }
 
-static int alpha_pci_set_mask(struct device *dev, u64 mask)
-{
-       if (!dev->dma_mask ||
-           !pci_dma_supported(alpha_gendev_to_pci(dev), mask))
-               return -EIO;
-
-       *dev->dma_mask = mask;
-       return 0;
-}
-
 struct dma_map_ops alpha_pci_ops = {
        .alloc                  = alpha_pci_alloc_coherent,
        .free                   = alpha_pci_free_coherent,
@@ -958,7 +948,6 @@ struct dma_map_ops alpha_pci_ops = {
        .unmap_sg               = alpha_pci_unmap_sg,
        .mapping_error          = alpha_pci_mapping_error,
        .dma_supported          = alpha_pci_supported,
-       .set_dma_mask           = alpha_pci_set_mask,
 };
 
 struct dma_map_ops *dma_ops = &alpha_pci_ops;
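
Not part of this diff; for illustration only: with .set_dma_mask dropped from alpha_pci_ops, mask changes fall through to the generic dma_set_mask(), which validates the mask against the ops' dma_supported() hook. A hypothetical driver probe step:

        #include <linux/dma-mapping.h>

        /* Sketch: request 32-bit streaming and coherent DMA masks the generic way. */
        static int sketch_setup_dma(struct device *dev)
        {
                return dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32));
        }
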
index 69d52aa37bae2c27ddf568b10b28e43d14af4084..f2d81ff38aa6474f6443412889443cfb4904b3e0 100644 (file)
@@ -30,6 +30,7 @@ __delay(int loops)
                "       bgt %0,1b"
                : "=&r" (tmp), "=r" (loops) : "1"(loops));
 }
+EXPORT_SYMBOL(__delay);
 
 #ifdef CONFIG_SMP
 #define LPJ     cpu_data[smp_processor_id()].loops_per_jiffy
index ad9825d4026aefe0b51d0f85037a2858f1ccda83..0a77b19e1df8db1d37af0346e36a4b53254ca272 100644 (file)
@@ -402,6 +402,8 @@ static void __init axs103_early_init(void)
        unsigned int num_cores = (read_aux_reg(ARC_REG_MCIP_BCR) >> 16) & 0x3F;
        if (num_cores > 2)
                arc_set_core_freq(50 * 1000000);
+       else if (num_cores == 2)
+               arc_set_core_freq(75 * 1000000);
 #endif
 
        switch (arc_get_core_freq()/1000000) {
index 0d1b717e1eca6754672294777f9d7f17b9f08364..72ad724c67ae94cd6682ec15f3834966dd7028c0 100644 (file)
@@ -2020,6 +2020,7 @@ config KEXEC
        bool "Kexec system call (EXPERIMENTAL)"
        depends on (!SMP || PM_SLEEP_SMP)
        depends on !CPU_V7M
+       select KEXEC_CORE
        help
          kexec is a system call that implements the ability to shutdown your
          current kernel, and to start another kernel.  It is like a reboot
index 7451b447cc2d2cb8cc68a9bf59f125f2dd2ce347..2c2b28ee48119771dfa92f353124795d456d770a 100644 (file)
@@ -54,6 +54,14 @@ AS           += -EL
 LD             += -EL
 endif
 
+#
+# The Scalar Replacement of Aggregates (SRA) optimization pass in GCC 4.9 and
+# later may result in code being generated that handles signed short and signed
+# char struct members incorrectly. So disable it.
+# (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65932)
+#
+KBUILD_CFLAGS  += $(call cc-option,-fno-ipa-sra)
+
 # This selects which instruction set is used.
 # Note that GCC does not numerically define an architecture version
 # macro, but instead defines a whole series of macros which makes
index bd245d34952d2ad2392e9f9399654ac479c431f1..a0765e7ed6c7dd2166b2cb95874fc076532d526c 100644 (file)
@@ -57,5 +57,5 @@ extern char * strstr(const char * s1, const char *s2);
 
 int do_decompress(u8 *input, int len, u8 *output, void (*error)(char *x))
 {
-       return decompress(input, len, NULL, NULL, output, NULL, error);
+       return __decompress(input, len, NULL, NULL, output, 0, NULL, error);
 }
index 564900b9fcceb2ca477e78a50e2d815f033ecce4..0447c04a40cc439d90296c0f9d70b541eedf18b8 100644 (file)
                        interrupts = <GIC_SPI 75 IRQ_TYPE_LEVEL_HIGH
                                      GIC_SPI 76 IRQ_TYPE_LEVEL_HIGH>;
                        ti,hwmods = "rtc";
+                       clocks = <&clk_32768_ck>;
+                       clock-names = "int-clk";
                        status = "disabled";
                };
 
index 215775dc69483dba5b526b69eef58cbde3f697b7..22038f21f2283a30cdac9235f0a6a2171813264e 100644 (file)
                clock-frequency = <12000000>;
        };
 
+       /* fixed 32k external oscillator clock */
+       clk_32k_rtc: clk_32k_rtc {
+               #clock-cells = <0>;
+               compatible = "fixed-clock";
+               clock-frequency = <32768>;
+       };
+
        sound0: sound@0 {
                compatible = "simple-audio-card";
                simple-audio-card,name = "AM437x-GP-EVM";
        tx-num-evt = <32>;
        rx-num-evt = <32>;
 };
+
+&rtc {
+       clocks = <&clk_32k_rtc>, <&clk_32768_ck>;
+       clock-names = "ext-clk", "int-clk";
+       status = "okay";
+};
index 378344271746f20446cc8654c52fd2ef46c5cda2..af25801418b49ff322279d5147524c069c5ce9b5 100644 (file)
                        gpios = <&gpio4 2 GPIO_ACTIVE_LOW>;
                };
        };
+
+       /* fixed 32k external oscillator clock */
+       clk_32k_rtc: clk_32k_rtc {
+               #clock-cells = <0>;
+               compatible = "fixed-clock";
+               clock-frequency = <32768>;
+       };
 };
 
 &am43xx_pinmux {
 };
 
 &rtc {
+       clocks = <&clk_32k_rtc>, <&clk_32768_ck>;
+       clock-names = "ext-clk", "int-clk";
        status = "okay";
 };
 
index 22af44894c66f77153832a59a8262a65b4d28b8b..7da7c2da4af13b3bc711f15a9098c10da80d56fa 100644 (file)
                display0 = &lcd0;
        };
 
+       /* fixed 32k external oscillator clock */
+       clk_32k_rtc: clk_32k_rtc {
+               #clock-cells = <0>;
+               compatible = "fixed-clock";
+               clock-frequency = <32768>;
+       };
+
        backlight {
                compatible = "pwm-backlight";
                pwms = <&ecap0 0 50000 PWM_POLARITY_INVERTED>;
 };
 
 &rtc {
+       clocks = <&clk_32k_rtc>, <&clk_32768_ck>;
+       clock-names = "ext-clk", "int-clk";
        status = "okay";
 };
 
index a5863acc5fff36aa28e1be183a85fc37883cd186..540a0adf2be6dcc94da5b487958cacca4348996d 100644 (file)
                min-microvolt = <1100000>;
                max-microvolt = <2700000>;
        };
+
+       thermal-zones {
+               cpu_thermal: cpu-thermal {
+                       cooling-maps {
+                               map0 {
+                                       /* Correspond to 500MHz at freq_table */
+                                       cooling-device = <&cpu0 5 5>;
+                               };
+                               map1 {
+                                       /* Correspond to 200MHz at freq_table */
+                                       cooling-device = <&cpu0 8 8>;
+                               };
+                       };
+               };
+       };
 };
 
 &adc {
        };
 };
 
+&cpu0 {
+       cpu0-supply = <&buck2_reg>;
+};
+
 &exynos_usbphy {
        status = "okay";
 };
index baa9b2f52009eebffbe1a856afb8bbca551ded2f..41a5fafb9aa93a5393cc7b0930f376041f3a5fbb 100644 (file)
                min-microvolt = <1100000>;
                max-microvolt = <2700000>;
        };
+
+       thermal-zones {
+               cpu_thermal: cpu-thermal {
+                       cooling-maps {
+                               map0 {
+                                       /* Corresponds to 500MHz */
+                                       cooling-device = <&cpu0 5 5>;
+                               };
+                               map1 {
+                                       /* Corresponds to 200MHz */
+                                       cooling-device = <&cpu0 8 8>;
+                               };
+                       };
+               };
+       };
 };
 
 &adc {
        };
 };
 
+&cpu0 {
+       cpu0-supply = <&buck2_reg>;
+};
+
 &exynos_usbphy {
        status = "okay";
 };
index 2db99433e17fdad0299b672ae87e49b6444a457e..033def482fc3d71693c48bd5a942eda4a7833bbf 100644 (file)
                        compatible = "arm,cortex-a7";
                        reg = <0>;
                        clock-frequency = <1000000000>;
+                       clocks = <&cmu CLK_ARM_CLK>;
+                       clock-names = "cpu";
+                       #cooling-cells = <2>;
+
+                       operating-points = <
+                               1000000 1150000
+                               900000  1112500
+                               800000  1075000
+                               700000  1037500
+                               600000  1000000
+                               500000  962500
+                               400000  925000
+                               300000  887500
+                               200000  850000
+                               100000  850000
+                       >;
                };
 
                cpu1: cpu@1 {
index b0d52b1a646af0cb1154f7216a81da9b3e64ef6a..98c0a368b7778dc3b13ec9e07d476e5eebff9cbb 100644 (file)
                clocks = <&clock CLK_JPEG>;
                clock-names = "jpeg";
                power-domains = <&pd_cam>;
+               iommus = <&sysmmu_jpeg>;
        };
 
        hdmi: hdmi@12D00000 {
index d9c8efeef208d8d6cf5b39f34506083020d7b0fe..538901123d37ea594e682d6d066b2c3b3bd0064b 100644 (file)
@@ -30,6 +30,9 @@
                        device_type = "cpu";
                        compatible = "arm,cortex-a9";
                        reg = <0xA00>;
+                       clocks = <&clock CLK_ARM_CLK>;
+                       clock-names = "cpu";
+                       operating-points-v2 = <&cpu0_opp_table>;
                        cooling-min-level = <13>;
                        cooling-max-level = <7>;
                        #cooling-cells = <2>; /* min followed by max */
                        device_type = "cpu";
                        compatible = "arm,cortex-a9";
                        reg = <0xA01>;
+                       operating-points-v2 = <&cpu0_opp_table>;
+               };
+       };
+
+       cpu0_opp_table: opp_table0 {
+               compatible = "operating-points-v2";
+               opp-shared;
+
+               opp00 {
+                       opp-hz = /bits/ 64 <200000000>;
+                       opp-microvolt = <900000>;
+                       clock-latency-ns = <200000>;
+               };
+               opp01 {
+                       opp-hz = /bits/ 64 <300000000>;
+                       opp-microvolt = <900000>;
+                       clock-latency-ns = <200000>;
+               };
+               opp02 {
+                       opp-hz = /bits/ 64 <400000000>;
+                       opp-microvolt = <925000>;
+                       clock-latency-ns = <200000>;
+               };
+               opp03 {
+                       opp-hz = /bits/ 64 <500000000>;
+                       opp-microvolt = <950000>;
+                       clock-latency-ns = <200000>;
+               };
+               opp04 {
+                       opp-hz = /bits/ 64 <600000000>;
+                       opp-microvolt = <975000>;
+                       clock-latency-ns = <200000>;
+               };
+               opp05 {
+                       opp-hz = /bits/ 64 <700000000>;
+                       opp-microvolt = <987500>;
+                       clock-latency-ns = <200000>;
+               };
+               opp06 {
+                       opp-hz = /bits/ 64 <800000000>;
+                       opp-microvolt = <1000000>;
+                       clock-latency-ns = <200000>;
+               };
+               opp07 {
+                       opp-hz = /bits/ 64 <900000000>;
+                       opp-microvolt = <1037500>;
+                       clock-latency-ns = <200000>;
+               };
+               opp08 {
+                       opp-hz = /bits/ 64 <1000000000>;
+                       opp-microvolt = <1087500>;
+                       clock-latency-ns = <200000>;
+               };
+               opp09 {
+                       opp-hz = /bits/ 64 <1100000000>;
+                       opp-microvolt = <1137500>;
+                       clock-latency-ns = <200000>;
+               };
+               opp10 {
+                       opp-hz = /bits/ 64 <1200000000>;
+                       opp-microvolt = <1187500>;
+                       clock-latency-ns = <200000>;
+               };
+               opp11 {
+                       opp-hz = /bits/ 64 <1300000000>;
+                       opp-microvolt = <1250000>;
+                       clock-latency-ns = <200000>;
+               };
+               opp12 {
+                       opp-hz = /bits/ 64 <1400000000>;
+                       opp-microvolt = <1287500>;
+                       clock-latency-ns = <200000>;
+               };
+               opp13 {
+                       opp-hz = /bits/ 64 <1500000000>;
+                       opp-microvolt = <1350000>;
+                       clock-latency-ns = <200000>;
+                       turbo-mode;
                };
        };
 };
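
For illustration only (not part of this commit): the cpufreq-dt driver consumes an operating-points-v2 table like the one above through the dev_pm_opp API. A rough, hypothetical lookup of the voltage for a target rate (OPP calls are RCU-protected in this kernel) might look like:

        #include <linux/device.h>
        #include <linux/pm_opp.h>
        #include <linux/rcupdate.h>
        #include <linux/err.h>

        /* Sketch: find the voltage the OPP table prescribes for a target rate. */
        static unsigned long sketch_volt_for_rate(struct device *cpu_dev,
                                                  unsigned long rate_hz)
        {
                struct dev_pm_opp *opp;
                unsigned long volt = 0;

                rcu_read_lock();
                opp = dev_pm_opp_find_freq_ceil(cpu_dev, &rate_hz);
                if (!IS_ERR(opp))
                        volt = dev_pm_opp_get_voltage(opp);
                rcu_read_unlock();

                return volt;    /* 0 if no matching OPP was found */
        }
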
index ca7d168d1dd62004aa45db808741b3458ae1da61..db52841297a5744d29b86fc167b43dcf1c6a1db6 100644 (file)
        };
 };
 
+&cpu0 {
+       cpu0-supply = <&buck2_reg>;
+};
+
 /* RSTN signal for eMMC */
 &sd1_cd {
        samsung,pin-pud = <0>;
index 44684e57ead1e6a60e86731e67f7f5bc0141adfc..8632f35c6c26892fcbea54b71d41cbed29820458 100644 (file)
@@ -13,6 +13,7 @@
 
 /dts-v1/;
 #include "exynos4412-odroid-common.dtsi"
+#include <dt-bindings/gpio/gpio.h>
 
 / {
        model = "Hardkernel ODROID-U3 board based on Exynos4412";
                "Speakers", "SPKL",
                "Speakers", "SPKR";
 };
+
+&spi_1 {
+       pinctrl-names = "default";
+       pinctrl-0 = <&spi1_bus>;
+       cs-gpios = <&gpb 5 GPIO_ACTIVE_HIGH>;
+       status = "okay";
+};
index 84c76310b31288d8542c2a8d1a0965b9a705bca7..9d528af68c1a45ba8b35b4d8c8100abb27afecae 100644 (file)
        };
 };
 
+&cpu0 {
+       cpu0-supply = <&buck2_reg>;
+};
+
 &fimd {
        pinctrl-0 = <&lcd_clk &lcd_data24 &pwm1_out>;
        pinctrl-names = "default";
index 8848400590184c4ff309564f3788a17a2bf4e9ad..2a1ebb76ebe0084af6ff07a8617df2050675af0c 100644 (file)
        status = "okay";
 };
 
+&cpu0 {
+       cpu0-supply = <&buck2_reg>;
+};
+
 &csis_0 {
        status = "okay";
        vddcore-supply = <&ldo8_reg>;
index b78ada70bd051d6ff3cc2bdd5cbb26859fd644eb..ca0e3c15977f13febd2ae550949ae704ecbf33cb 100644 (file)
@@ -30,6 +30,9 @@
                        device_type = "cpu";
                        compatible = "arm,cortex-a9";
                        reg = <0xA00>;
+                       clocks = <&clock CLK_ARM_CLK>;
+                       clock-names = "cpu";
+                       operating-points-v2 = <&cpu0_opp_table>;
                        cooling-min-level = <13>;
                        cooling-max-level = <7>;
                        #cooling-cells = <2>; /* min followed by max */
                        device_type = "cpu";
                        compatible = "arm,cortex-a9";
                        reg = <0xA01>;
+                       operating-points-v2 = <&cpu0_opp_table>;
                };
 
                cpu@A02 {
                        device_type = "cpu";
                        compatible = "arm,cortex-a9";
                        reg = <0xA02>;
+                       operating-points-v2 = <&cpu0_opp_table>;
                };
 
                cpu@A03 {
                        device_type = "cpu";
                        compatible = "arm,cortex-a9";
                        reg = <0xA03>;
+                       operating-points-v2 = <&cpu0_opp_table>;
+               };
+       };
+
+       cpu0_opp_table: opp_table0 {
+               compatible = "operating-points-v2";
+               opp-shared;
+
+               opp00 {
+                       opp-hz = /bits/ 64 <200000000>;
+                       opp-microvolt = <900000>;
+                       clock-latency-ns = <200000>;
+               };
+               opp01 {
+                       opp-hz = /bits/ 64 <300000000>;
+                       opp-microvolt = <900000>;
+                       clock-latency-ns = <200000>;
+               };
+               opp02 {
+                       opp-hz = /bits/ 64 <400000000>;
+                       opp-microvolt = <925000>;
+                       clock-latency-ns = <200000>;
+               };
+               opp03 {
+                       opp-hz = /bits/ 64 <500000000>;
+                       opp-microvolt = <950000>;
+                       clock-latency-ns = <200000>;
+               };
+               opp04 {
+                       opp-hz = /bits/ 64 <600000000>;
+                       opp-microvolt = <975000>;
+                       clock-latency-ns = <200000>;
+               };
+               opp05 {
+                       opp-hz = /bits/ 64 <700000000>;
+                       opp-microvolt = <987500>;
+                       clock-latency-ns = <200000>;
+               };
+               opp06 {
+                       opp-hz = /bits/ 64 <800000000>;
+                       opp-microvolt = <1000000>;
+                       clock-latency-ns = <200000>;
+               };
+               opp07 {
+                       opp-hz = /bits/ 64 <900000000>;
+                       opp-microvolt = <1037500>;
+                       clock-latency-ns = <200000>;
+               };
+               opp08 {
+                       opp-hz = /bits/ 64 <1000000000>;
+                       opp-microvolt = <1087500>;
+                       clock-latency-ns = <200000>;
+               };
+               opp09 {
+                       opp-hz = /bits/ 64 <1100000000>;
+                       opp-microvolt = <1137500>;
+                       clock-latency-ns = <200000>;
+               };
+               opp10 {
+                       opp-hz = /bits/ 64 <1200000000>;
+                       opp-microvolt = <1187500>;
+                       clock-latency-ns = <200000>;
+               };
+               opp11 {
+                       opp-hz = /bits/ 64 <1300000000>;
+                       opp-microvolt = <1250000>;
+                       clock-latency-ns = <200000>;
+               };
+               opp12 {
+                       opp-hz = /bits/ 64 <1400000000>;
+                       opp-microvolt = <1287500>;
+                       clock-latency-ns = <200000>;
+               };
+               opp13 {
+                       opp-hz = /bits/ 64 <1500000000>;
+                       opp-microvolt = <1350000>;
+                       clock-latency-ns = <200000>;
+                       turbo-mode;
                };
        };
 
index 7e728a1b55590abe15e89d109ad0433a86785f17..db3f65f3eb45995d840a7ddc60284b0ff6e85457 100644 (file)
        };
 };
 
+&cpu0 {
+       cpu0-supply = <&buck2_reg>;
+};
+
 &dp {
        status = "okay";
        samsung,color-space = <0>;
index 4fe186d01f8a52b52f9155d76b1496b7d586ed7e..15aea760c1dadee45c631d78c64366cea7739276 100644 (file)
        };
 };
 
+&cpu0 {
+       cpu0-supply = <&buck2_reg>;
+};
+
 &dp {
        samsung,color-space = <0>;
        samsung,dynamic-range = <0>;
index b7f4122df456b05438b8f719adaa9eb95a5dfb5f..0720caab5511112026a1d53156deae2cf6338345 100644 (file)
        };
 };
 
+&cpu0 {
+       cpu0-supply = <&buck2_reg>;
+};
+
 &dp {
        status = "okay";
        pinctrl-names = "default";
        status = "okay";
        samsung,spi-src-clk = <0>;
        num-cs = <1>;
+       cs-gpios = <&gpa2 5 GPIO_ACTIVE_HIGH>;
 };
 
 &usbdrd_dwc3 {
index d03f9b8d376d082308fa9e06c515f69ae83386c2..c1edd6d038a905dffd68e895cdafae25c967f160 100644 (file)
        };
 };
 
+&cpu0 {
+       cpu0-supply = <&buck2_reg>;
+};
+
 &dp {
        status = "okay";
        pinctrl-names = "default";
index 4a1f88300a281b8248cad85646e98dca6b4d47cf..b24610ea8c2a93619bfe75770b63d1b67b61d680 100644 (file)
                        compatible = "arm,cortex-a15";
                        reg = <0>;
                        clock-frequency = <1700000000>;
+                       clocks = <&clock CLK_ARM_CLK>;
+                       clock-names = "cpu";
+                       clock-latency = <140000>;
+
+                       operating-points = <
+                               1700000 1300000
+                               1600000 1250000
+                               1500000 1225000
+                               1400000 1200000
+                               1300000 1150000
+                               1200000 1125000
+                               1100000 1100000
+                               1000000 1075000
+                                900000 1050000
+                                800000 1025000
+                                700000 1012500
+                                600000 1000000
+                                500000  975000
+                                400000  950000
+                                300000  937500
+                                200000  925000
+                       >;
                        cooling-min-level = <15>;
                        cooling-max-level = <9>;
                        #cooling-cells = <2>; /* min followed by max */
diff --git a/arch/arm/boot/dts/exynos5422-cpus.dtsi b/arch/arm/boot/dts/exynos5422-cpus.dtsi
new file mode 100644 (file)
index 0000000..b7f60c8
--- /dev/null
@@ -0,0 +1,81 @@
+/*
+ * SAMSUNG EXYNOS5422 SoC cpu device tree source
+ *
+ * Copyright (c) 2015 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com
+ *
+ * The only difference between EXYNOS5422 and EXYNOS5800 is the CPU ordering:
+ * the EXYNOS5422 boots from a Cortex-A7 core, while the EXYNOS5800 boots from
+ * a Cortex-A15 core.
+ *
+ * EXYNOS5422 based board files can include this file to get a CPU ordering in
+ * which a Cortex-A7 core is cpu0.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+&cpu0 {
+       device_type = "cpu";
+       compatible = "arm,cortex-a7";
+       reg = <0x100>;
+       clock-frequency = <1000000000>;
+       cci-control-port = <&cci_control0>;
+};
+
+&cpu1 {
+       device_type = "cpu";
+       compatible = "arm,cortex-a7";
+       reg = <0x101>;
+       clock-frequency = <1000000000>;
+       cci-control-port = <&cci_control0>;
+};
+
+&cpu2 {
+       device_type = "cpu";
+       compatible = "arm,cortex-a7";
+       reg = <0x102>;
+       clock-frequency = <1000000000>;
+       cci-control-port = <&cci_control0>;
+};
+
+&cpu3 {
+       device_type = "cpu";
+       compatible = "arm,cortex-a7";
+       reg = <0x103>;
+       clock-frequency = <1000000000>;
+       cci-control-port = <&cci_control0>;
+};
+
+&cpu4 {
+       device_type = "cpu";
+       compatible = "arm,cortex-a15";
+       reg = <0x0>;
+       clock-frequency = <1800000000>;
+       cci-control-port = <&cci_control1>;
+};
+
+&cpu5 {
+       device_type = "cpu";
+       compatible = "arm,cortex-a15";
+       reg = <0x1>;
+       clock-frequency = <1800000000>;
+       cci-control-port = <&cci_control1>;
+};
+
+&cpu6 {
+       device_type = "cpu";
+       compatible = "arm,cortex-a15";
+       reg = <0x2>;
+       clock-frequency = <1800000000>;
+       cci-control-port = <&cci_control1>;
+};
+
+&cpu7 {
+       device_type = "cpu";
+       compatible = "arm,cortex-a15";
+       reg = <0x3>;
+       clock-frequency = <1800000000>;
+       cci-control-port = <&cci_control1>;
+};
index 1565667e6f699d171aa769a904a5993698c5203f..79ffdfe712aa4a8ad193d4afd671962edfb73646 100644 (file)
@@ -15,6 +15,7 @@
 #include <dt-bindings/gpio/gpio.h>
 #include <dt-bindings/sound/samsung-i2s.h>
 #include "exynos5800.dtsi"
+#include "exynos5422-cpus.dtsi"
 #include "exynos5422-cpu-thermal.dtsi"
 
 / {
index 34ccb260f12a9e6be49f3a41f20544e293687ad0..47c0282bdfca7ce11a8ce0e05119f8df229b9a4f 100644 (file)
@@ -4,6 +4,14 @@
        model = "CompuLab CM-QS600";
        compatible = "qcom,apq8064-cm-qs600", "qcom,apq8064";
 
+       aliases {
+               serial0 = &gsbi7_serial;
+       };
+
+       chosen {
+               stdout-path = "serial0:115200n8";
+       };
+
        soc {
                pinctrl@800000 {
                        i2c1_pins: i2c1 {
index 88d6655ddaf6dfae95301dfb97d85f46063b05ca..f3100da082b2a3cbe1a1229a1f6be0a21ab4ca5a 100644 (file)
                serial1 = &gsbi6_serial;
        };
 
+       chosen {
+               stdout-path = "serial0:115200n8";
+       };
+
        soc {
                pinctrl@800000 {
                        card_detect: card_detect {
index d484d08163e9415557e17d05e34c567c7e2d9de1..835bdc71c5ba4e7db71642305e541f5e203d4758 100644 (file)
@@ -6,6 +6,14 @@
        model = "Qualcomm APQ8074 Dragonboard";
        compatible = "qcom,apq8074-dragonboard", "qcom,apq8074";
 
+       aliases {
+               serial0 = &blsp1_uart2;
+       };
+
+       chosen {
+               stdout-path = "serial0:115200n8";
+       };
+
        soc {
                serial@f991e000 {
                        status = "ok";
index f7725b96612c65475fec7f67445f4afb54c19e81..c9c2b769554f84d91b0a535e98fb1a070cf6b232 100644 (file)
@@ -5,6 +5,14 @@
        model = "Qualcomm APQ8084/IFC6540";
        compatible = "qcom,apq8084-ifc6540", "qcom,apq8084";
 
+       aliases {
+               serial0 = &blsp2_uart2;
+       };
+
+       chosen {
+               stdout-path = "serial0:115200n8";
+       };
+
        soc {
                serial@f995e000 {
                        status = "okay";
index cb43acfc5d1d9d0773243d42874d5bd1f5243e4f..3016c7048d446cb5ee1e0810b37986663eb20398 100644 (file)
@@ -5,6 +5,14 @@
        model = "Qualcomm APQ 8084-MTP";
        compatible = "qcom,apq8084-mtp", "qcom,apq8084";
 
+       aliases {
+               serial0 = &blsp2_uart2;
+       };
+
+       chosen {
+               stdout-path = "serial0:115200n8";
+       };
+
        soc {
                serial@f995e000 {
                        status = "okay";
index 7084010ee61ba463024a9e4ba5d48caf327e3c05..0554fbd72c40ba78f0c7205cdd050821d8b52b43 100644 (file)
                        interrupts = <0 208 0>;
                };
 
-               serial@f995e000 {
+               blsp2_uart2: serial@f995e000 {
                        compatible = "qcom,msm-uartdm-v1.4", "qcom,msm-uartdm";
                        reg = <0xf995e000 0x1000>;
                        interrupts = <0 114 0x0>;
index 55b2910efd872170ca19a99d867be4a34dd60794..d501382493e3d6eb97f4a4ec3635734240180922 100644 (file)
@@ -4,6 +4,14 @@
        model = "Qualcomm IPQ8064/AP148";
        compatible = "qcom,ipq8064-ap148", "qcom,ipq8064";
 
+       aliases {
+               serial0 = &gsbi4_serial;
+       };
+
+       chosen {
+               stdout-path = "serial0:115200n8";
+       };
+
        reserved-memory {
                #address-cells = <1>;
                #size-cells = <1>;
index 9f727d8eadf6998561e748e0400e46299417f9b3..fa698635eea0d1f859d57c766c679438a416b55d 100644 (file)
 
                        syscon-tcsr = <&tcsr>;
 
-                       serial@16340000 {
+                       gsbi4_serial: serial@16340000 {
                                compatible = "qcom,msm-uartdm-v1.3", "qcom,msm-uartdm";
                                reg = <0x16340000 0x1000>,
                                      <0x16300000 0x1000>;
index e0883c376248073158153de3341b6bae038c58f9..b17f379e8c2afe3f92fe1f905fbe501fccad9118 100644 (file)
@@ -6,6 +6,14 @@
        model = "Qualcomm MSM8660 SURF";
        compatible = "qcom,msm8660-surf", "qcom,msm8660";
 
+       aliases {
+               serial0 = &gsbi12_serial;
+       };
+
+       chosen {
+               stdout-path = "serial0:115200n8";
+       };
+
        soc {
                gsbi@19c00000 {
                        status = "ok";
index ef2fe72b54c91e5b139a38788e5695fe991855cc..e5f7f33aa4677739f9bc1e6d57d969db9b856cf9 100644 (file)
@@ -98,7 +98,7 @@
 
                        syscon-tcsr = <&tcsr>;
 
-                       serial@19c40000 {
+                       gsbi12_serial: serial@19c40000 {
                                compatible = "qcom,msm-uartdm-v1.3", "qcom,msm-uartdm";
                                reg = <0x19c40000 0x1000>,
                                      <0x19c00000 0x1000>;
index fad71d5527b0f80d88e7bc433318d53ca35ec8ca..b72a55462caf1b2aaf99d8a293ad47c79ef7858d 100644 (file)
@@ -6,6 +6,14 @@
        model = "Qualcomm MSM8960 CDP";
        compatible = "qcom,msm8960-cdp", "qcom,msm8960";
 
+       aliases {
+               serial0 = &gsbi5_serial;
+       };
+
+       chosen {
+               stdout-path = "serial0:115200n8";
+       };
+
        soc {
                gsbi@16400000 {
                        status = "ok";
index 2096a94c9b525faa47df980d751763804f10e7c5..134cd91d68ece1034077c0e36f154756f6143cb6 100644 (file)
 
                        syscon-tcsr = <&tcsr>;
 
-                       serial@16440000 {
+                       gsbi5_serial: serial@16440000 {
                                compatible = "qcom,msm-uartdm-v1.3", "qcom,msm-uartdm";
                                reg = <0x16440000 0x1000>,
                                      <0x16400000 0x1000>;
index 9bc72a3356e45ea7c79d43daf9eae826342e8f52..016f9ad9392a9cbad2702834aff6648add614b7b 100644 (file)
@@ -6,6 +6,14 @@
        model = "Sony Xperia Z1";
        compatible = "sony,xperia-honami", "qcom,msm8974";
 
+       aliases {
+               serial0 = &blsp1_uart2;
+       };
+
+       chosen {
+               stdout-path = "serial0:115200n8";
+       };
+
        memory@0 {
                reg = <0 0x40000000>, <0x40000000 0x40000000>;
                device_type = "memory";
index d7c99b894a491c7011e8a95f3800f5585aeb49e4..ab8e5725046809e53b9ef7ce39b1f6ca33d35dcc 100644 (file)
                        hwlocks = <&tcsr_mutex 3>;
                };
 
-               serial@f991e000 {
+               blsp1_uart2: serial@f991e000 {
                        compatible = "qcom,msm-uartdm-v1.4", "qcom,msm-uartdm";
                        reg = <0xf991e000 0x1000>;
                        interrupts = <0 108 0x0>;
index dc01c049a5206c95d32b0a3505f948017c8924d0..3b32d5fd932665be5959a132f6ac87489d7c6a95 100644 (file)
@@ -157,7 +157,7 @@ CONFIG_LEDS_TRIGGERS=y
 CONFIG_LEDS_TRIGGER_HEARTBEAT=y
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_V3020=y
-CONFIG_RTC_DRV_SA1100=y
+CONFIG_RTC_DRV_PXA=y
 CONFIG_EXT2_FS=y
 CONFIG_EXT3_FS=y
 CONFIG_INOTIFY=y
index 4560c9ca6636adb26289191492ffd9781178086b..8e10df7ba1b40b7975888c9a266fe02b25f6b80e 100644 (file)
@@ -157,7 +157,7 @@ CONFIG_LEDS_TRIGGERS=y
 CONFIG_LEDS_TRIGGER_HEARTBEAT=y
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_V3020=y
-CONFIG_RTC_DRV_SA1100=y
+CONFIG_RTC_DRV_PXA=y
 CONFIG_EXT2_FS=y
 CONFIG_EXT3_FS=y
 CONFIG_INOTIFY=y
index 3eaf8fbaf60346330215f43bbfd2a28bd4f10857..1ff2bfa2e183f45087571875197888e81e3cc8ad 100644 (file)
@@ -27,6 +27,8 @@ CONFIG_ARM_APPENDED_DTB=y
 CONFIG_ARM_ATAG_DTB_COMPAT=y
 CONFIG_CMDLINE="root=/dev/ram0 rw ramdisk=8192 initrd=0x41000000,8M console=ttySAC1,115200 init=/linuxrc mem=256M"
 CONFIG_CPU_FREQ=y
+CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
+CONFIG_CPUFREQ_DT=y
 CONFIG_CPU_IDLE=y
 CONFIG_ARM_EXYNOS_CPUIDLE=y
 CONFIG_VFP=y
@@ -94,6 +96,7 @@ CONFIG_CHARGER_MAX14577=y
 CONFIG_CHARGER_MAX77693=y
 CONFIG_CHARGER_TPS65090=y
 CONFIG_SENSORS_LM90=y
+CONFIG_SENSORS_NTC_THERMISTOR=y
 CONFIG_SENSORS_PWM_FAN=y
 CONFIG_SENSORS_INA2XX=y
 CONFIG_THERMAL=y
@@ -144,6 +147,8 @@ CONFIG_SND=y
 CONFIG_SND_SOC=y
 CONFIG_SND_SOC_SAMSUNG=y
 CONFIG_SND_SOC_SNOW=y
+CONFIG_SND_SOC_ODROIDX2=y
+CONFIG_SND_SIMPLE_CARD=y
 CONFIG_USB=y
 CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
 CONFIG_USB_XHCI_HCD=y
index 557dd291288b2975f4cd20575f6c4fabdfffc595..a5b4920cd6d447cb62bc751c2d9f0a0d84cd2625 100644 (file)
@@ -150,7 +150,7 @@ CONFIG_LEDS_TRIGGERS=y
 CONFIG_LEDS_TRIGGER_BACKLIGHT=y
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DEBUG=y
-CONFIG_RTC_DRV_SA1100=y
+CONFIG_RTC_DRV_PXA=y
 CONFIG_EXT2_FS=y
 CONFIG_INOTIFY=y
 CONFIG_MSDOS_FS=m
index f84471d5d2db3ed5a3f8137dfa76fce92c851c29..03deb7fb35e8999522baceb89fbae066d978f7e3 100644 (file)
@@ -362,6 +362,7 @@ CONFIG_POWER_RESET_KEYSTONE=y
 CONFIG_POWER_RESET_RMOBILE=y
 CONFIG_SENSORS_LM90=y
 CONFIG_SENSORS_LM95245=y
+CONFIG_SENSORS_NTC_THERMISTOR=m
 CONFIG_THERMAL=y
 CONFIG_CPU_THERMAL=y
 CONFIG_RCAR_THERMAL=y
@@ -410,7 +411,9 @@ CONFIG_REGULATOR_MAX8907=y
 CONFIG_REGULATOR_MAX8973=y
 CONFIG_REGULATOR_MAX77686=y
 CONFIG_REGULATOR_MAX77693=m
+CONFIG_REGULATOR_MAX77802=m
 CONFIG_REGULATOR_PALMAS=y
+CONFIG_REGULATOR_PBIAS=y
 CONFIG_REGULATOR_PWM=m
 CONFIG_REGULATOR_S2MPS11=y
 CONFIG_REGULATOR_S5M8767=y
@@ -509,8 +512,6 @@ CONFIG_USB_CHIPIDEA_HOST=y
 CONFIG_AB8500_USB=y
 CONFIG_KEYSTONE_USB_PHY=y
 CONFIG_OMAP_USB3=y
-CONFIG_SAMSUNG_USB2PHY=y
-CONFIG_SAMSUNG_USB3PHY=y
 CONFIG_USB_GPIO_VBUS=y
 CONFIG_USB_ISP1301=y
 CONFIG_USB_MXS_PHY=y
@@ -635,6 +636,7 @@ CONFIG_EXTCON=y
 CONFIG_TI_AEMIF=y
 CONFIG_IIO=y
 CONFIG_AT91_ADC=m
+CONFIG_EXYNOS_ADC=m
 CONFIG_XILINX_XADC=y
 CONFIG_AK8975=y
 CONFIG_PWM=y
index 4baa83c1c577526263cde9539ceacdb90659bd87..83c135e19aba0644f0f80882cf10f09760c6b5e0 100644 (file)
@@ -67,7 +67,7 @@ CONFIG_MMC=y
 CONFIG_MMC_DEBUG=y
 CONFIG_MMC_PXA=y
 CONFIG_RTC_CLASS=y
-CONFIG_RTC_DRV_SA1100=y
+CONFIG_RTC_DRV_PXA=y
 CONFIG_EXT2_FS=y
 CONFIG_EXT3_FS=y
 # CONFIG_DNOTIFY is not set
index 0a847d04ddc198ad28e6d254d27a2f0bbb7a5a2f..b5624e325817f859c7483ce8db039b91affed27a 100644 (file)
@@ -82,7 +82,7 @@ CONFIG_MMC=y
 CONFIG_MMC_PXA=y
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_PCF8563=m
-CONFIG_RTC_DRV_SA1100=m
+CONFIG_RTC_DRV_PXA=m
 CONFIG_EXT2_FS=m
 CONFIG_EXT3_FS=m
 # CONFIG_DNOTIFY is not set
index 932ee4e4a13ae895cee081c3eeacfe7ea5d12b3d..4bc870028035da9641c5a26f09c85b46785a1a97 100644 (file)
@@ -177,7 +177,7 @@ CONFIG_NEW_LEDS=y
 CONFIG_RTC_CLASS=y
 # CONFIG_RTC_HCTOSYS is not set
 CONFIG_RTC_DRV_PCF8583=m
-CONFIG_RTC_DRV_SA1100=y
+CONFIG_RTC_DRV_PXA=y
 CONFIG_EXT2_FS=y
 CONFIG_EXT2_FS_XATTR=y
 CONFIG_EXT2_FS_POSIX_ACL=y
index 7bbf325a4f31f12c9d867381853579a495590f89..b2bc8e11471d3ee3e4fcd5f207406222ffa04bf1 100644 (file)
@@ -491,11 +491,6 @@ THUMB(     orr     \reg , \reg , #PSR_T_BIT        )
 #endif
        .endm
 
-       .macro  uaccess_save_and_disable, tmp
-       uaccess_save \tmp
-       uaccess_disable \tmp
-       .endm
-
        .irp    c,,eq,ne,cs,cc,mi,pl,vs,vc,hi,ls,ge,lt,gt,le,hs,lo
        .macro  ret\c, reg
 #if __LINUX_ARM_ARCH__ < 6
index b274bde24905a7503f60d38672346636d093854b..e7335a92144ef1d20d296ebdf26798148b2fa711 100644 (file)
@@ -40,6 +40,7 @@ do {                                                          \
                "2:\t.asciz " #__file "\n"                      \
                ".popsection\n"                                 \
                ".pushsection __bug_table,\"a\"\n"              \
+               ".align 2\n"                                    \
                "3:\t.word 1b, 2b\n"                            \
                "\t.hword " #__line ", 0\n"                     \
                ".popsection");                                 \
index a68b9d8a71fed8ee2357d833a023d2bed9696d69..ccb3aa64640dc350da9de1319d65b2a46c44e2ee 100644 (file)
@@ -8,7 +8,6 @@
 #include <linux/dma-attrs.h>
 #include <linux/dma-debug.h>
 
-#include <asm-generic/dma-coherent.h>
 #include <asm/memory.h>
 
 #include <xen/xen.h>
@@ -39,12 +38,15 @@ static inline void set_dma_ops(struct device *dev, struct dma_map_ops *ops)
        dev->archdata.dma_ops = ops;
 }
 
-#include <asm-generic/dma-mapping-common.h>
+#define HAVE_ARCH_DMA_SUPPORTED 1
+extern int dma_supported(struct device *dev, u64 mask);
 
-static inline int dma_set_mask(struct device *dev, u64 mask)
-{
-       return get_dma_ops(dev)->set_dma_mask(dev, mask);
-}
+/*
+ * Note that while the generic code provides dummy dma_{alloc,free}_noncoherent
+ * implementations, we don't provide a dma_cache_sync function so drivers using
+ * this API are highlighted with build warnings.
+ */
+#include <asm-generic/dma-mapping-common.h>
 
 #ifdef __arch_page_to_dma
 #error Please update to __arch_pfn_to_dma
@@ -167,32 +169,6 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
 
 static inline void dma_mark_clean(void *addr, size_t size) { }
 
-/*
- * DMA errors are defined by all-bits-set in the DMA address.
- */
-static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-       debug_dma_mapping_error(dev, dma_addr);
-       return dma_addr == DMA_ERROR_CODE;
-}
-
-/*
- * Dummy noncoherent implementation.  We don't provide a dma_cache_sync
- * function so drivers using this API are highlighted with build warnings.
- */
-static inline void *dma_alloc_noncoherent(struct device *dev, size_t size,
-               dma_addr_t *handle, gfp_t gfp)
-{
-       return NULL;
-}
-
-static inline void dma_free_noncoherent(struct device *dev, size_t size,
-               void *cpu_addr, dma_addr_t handle)
-{
-}
-
-extern int dma_supported(struct device *dev, u64 mask);
-
 extern int arm_dma_set_mask(struct device *dev, u64 dma_mask);
 
 /**
@@ -209,21 +185,6 @@ extern int arm_dma_set_mask(struct device *dev, u64 dma_mask);
 extern void *arm_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
                           gfp_t gfp, struct dma_attrs *attrs);
 
-#define dma_alloc_coherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                      dma_addr_t *dma_handle, gfp_t flag,
-                                      struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-       void *cpu_addr;
-       BUG_ON(!ops);
-
-       cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs);
-       debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
-       return cpu_addr;
-}
-
 /**
  * arm_dma_free - free memory allocated by arm_dma_alloc
  * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
@@ -241,19 +202,6 @@ static inline void *dma_alloc_attrs(struct device *dev, size_t size,
 extern void arm_dma_free(struct device *dev, size_t size, void *cpu_addr,
                         dma_addr_t handle, struct dma_attrs *attrs);
 
-#define dma_free_coherent(d, s, c, h) dma_free_attrs(d, s, c, h, NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                    void *cpu_addr, dma_addr_t dma_handle,
-                                    struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-       BUG_ON(!ops);
-
-       debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
-       ops->free(dev, size, cpu_addr, dma_handle, attrs);
-}
-
 /**
  * arm_dma_mmap - map a coherent DMA allocation into user space
  * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
index e878129f2fee5dfec3d36bec2e9fcb7d8ef046e4..fc8ba1663601e0743a05b7cda52703df9f9bc07a 100644 (file)
@@ -12,6 +12,7 @@
 
 #ifndef __ASSEMBLY__
 #include <asm/barrier.h>
+#include <asm/thread_info.h>
 #endif
 
 /*
@@ -89,7 +90,8 @@ static inline unsigned int get_domain(void)
 
        asm(
        "mrc    p15, 0, %0, c3, c0      @ get domain"
-        : "=r" (domain));
+        : "=r" (domain)
+        : "m" (current_thread_info()->cpu_domain));
 
        return domain;
 }
@@ -98,7 +100,7 @@ static inline void set_domain(unsigned val)
 {
        asm volatile(
        "mcr    p15, 0, %0, c3, c0      @ set domain"
-         : : "r" (val));
+         : : "r" (val) : "memory");
        isb();
 }
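
A hypothetical sketch, not part of this diff: the new "m" input and "memory" clobber stop the compiler from caching or reordering DACR accesses, so a save/modify/restore sequence like this stays ordered as written:

        #include <asm/domain.h>

        /* Sketch: save and restore the domain access control register. */
        static inline void sketch_save_restore_dacr(void)
        {
                unsigned int dacr = get_domain();       /* re-read, never cached */

                /* ... temporarily change domain access here ... */

                set_domain(dacr);                       /* ordered by the clobber */
        }
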
 
index 53c15dec7af6aa09faee9b1851782f9244365e56..be1d07d59ee9784c7b3dd12a2150a009c7c742ba 100644 (file)
@@ -35,6 +35,11 @@ extern void (*handle_arch_irq)(struct pt_regs *);
 extern void set_handle_irq(void (*handle_irq)(struct pt_regs *));
 #endif
 
+#ifdef CONFIG_SMP
+extern void arch_trigger_all_cpu_backtrace(bool);
+#define arch_trigger_all_cpu_backtrace(x) arch_trigger_all_cpu_backtrace(x)
+#endif
+
 #endif
 
 #endif
index e896d2c196e63b79365e6f637c7b697c56c48f71..dcba0fa5176e990f8a23333f08e34e192351b407 100644 (file)
@@ -231,4 +231,9 @@ static inline void kvm_arch_sync_events(struct kvm *kvm) {}
 static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
 
+static inline void kvm_arm_init_debug(void) {}
+static inline void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) {}
+static inline void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) {}
+static inline void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu) {}
+
 #endif /* __ARM_KVM_HOST_H__ */
index b7f6fb462ea0da21e59e67dbb4e75de729ffdfcc..98d58bb04ac57853910860ae5f124c1fd1cb14bf 100644 (file)
 #define DTCM_OFFSET    UL(0xfffe8000)
 #endif
 
-/*
- * Convert a physical address to a Page Frame Number and back
- */
-#define        __phys_to_pfn(paddr)    ((unsigned long)((paddr) >> PAGE_SHIFT))
-#define        __pfn_to_phys(pfn)      ((phys_addr_t)(pfn) << PAGE_SHIFT)
-
 /*
  * Convert a page to/from a physical address
  */
index d0a1119dcaf38ba92d99354aaeba18bbf0452004..776757d1604ab3901996bb24bb02748e54c2aee7 100644 (file)
@@ -25,7 +25,6 @@
 struct task_struct;
 
 #include <asm/types.h>
-#include <asm/domain.h>
 
 typedef unsigned long mm_segment_t;
 
index 8b1f37bfeeecf3c2d8af09ed15bcbc4cc8fa991f..71e473d05fcce8a2b869b84272dfcd498c3847dc 100644 (file)
@@ -20,4 +20,10 @@ static inline int xen_irqs_disabled(struct pt_regs *regs)
                                                            atomic64_t, \
                                                            counter), (val))
 
+/* Rebind event channel is supported by default */
+static inline bool xen_support_evtchn_rebind(void)
+{
+       return true;
+}
+
 #endif /* _ASM_ARM_XEN_EVENTS_H */
index 1bee8ca124945cdde1226e3cbff5503b9132c526..127956353b0060fc1a4e6d7edacdcc0a7219e201 100644 (file)
@@ -34,7 +34,19 @@ typedef struct xpaddr {
 unsigned long __pfn_to_mfn(unsigned long pfn);
 extern struct rb_root phys_to_mach;
 
-static inline unsigned long pfn_to_mfn(unsigned long pfn)
+/* Pseudo-physical <-> Guest conversion */
+static inline unsigned long pfn_to_gfn(unsigned long pfn)
+{
+       return pfn;
+}
+
+static inline unsigned long gfn_to_pfn(unsigned long gfn)
+{
+       return gfn;
+}
+
+/* Pseudo-physical <-> BUS conversion */
+static inline unsigned long pfn_to_bfn(unsigned long pfn)
 {
        unsigned long mfn;
 
@@ -47,33 +59,21 @@ static inline unsigned long pfn_to_mfn(unsigned long pfn)
        return pfn;
 }
 
-static inline unsigned long mfn_to_pfn(unsigned long mfn)
+static inline unsigned long bfn_to_pfn(unsigned long bfn)
 {
-       return mfn;
+       return bfn;
 }
 
-#define mfn_to_local_pfn(mfn) mfn_to_pfn(mfn)
+#define bfn_to_local_pfn(bfn)  bfn_to_pfn(bfn)
 
-static inline xmaddr_t phys_to_machine(xpaddr_t phys)
-{
-       unsigned offset = phys.paddr & ~PAGE_MASK;
-       return XMADDR(PFN_PHYS(pfn_to_mfn(PFN_DOWN(phys.paddr))) | offset);
-}
-
-static inline xpaddr_t machine_to_phys(xmaddr_t machine)
-{
-       unsigned offset = machine.maddr & ~PAGE_MASK;
-       return XPADDR(PFN_PHYS(mfn_to_pfn(PFN_DOWN(machine.maddr))) | offset);
-}
-/* VIRT <-> MACHINE conversion */
-#define virt_to_machine(v)     (phys_to_machine(XPADDR(__pa(v))))
-#define virt_to_mfn(v)         (pfn_to_mfn(virt_to_pfn(v)))
-#define mfn_to_virt(m)         (__va(mfn_to_pfn(m) << PAGE_SHIFT))
+/* VIRT <-> GUEST conversion */
+#define virt_to_gfn(v)         (pfn_to_gfn(virt_to_pfn(v)))
+#define gfn_to_virt(m)         (__va(gfn_to_pfn(m) << PAGE_SHIFT))
 
+/* Only used in PV code. But ARM guests are always HVM. */
 static inline xmaddr_t arbitrary_virt_to_machine(void *vaddr)
 {
-       /* TODO: assuming it is mapped in the kernel 1:1 */
-       return virt_to_machine(vaddr);
+       BUG();
 }
 
 /* TODO: this shouldn't be here but it is because the frontend drivers
@@ -108,7 +108,7 @@ static inline bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 
 bool xen_arch_need_swiotlb(struct device *dev,
                           unsigned long pfn,
-                          unsigned long mfn);
+                          unsigned long bfn);
 unsigned long xen_get_swiotlb_free_pages(unsigned int order);
 
 #endif /* _ASM_ARM_XEN_PAGE_H */
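
Illustrative only, not part of this commit: after this rename, frontend drivers hand guest frame numbers to the hypervisor interfaces, and on ARM these equal the pseudo-physical frame numbers. A hypothetical grant of a driver-owned buffer ("otherend" and "buf" are made-up names):

        #include <xen/grant_table.h>
        #include <asm/xen/page.h>

        /* Sketch: grant the backend read/write access to one of our pages. */
        static void sketch_grant_page(grant_ref_t ref, domid_t otherend, void *buf)
        {
                gnttab_grant_foreign_access_ref(ref, otherend, virt_to_gfn(buf), 0);
        }
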
index a3089bacb8d822ded284b432f1cb37ecd8b0ff66..7a7c4cea55231b1c793982ab63d6c5598004b14b 100644 (file)
@@ -226,6 +226,7 @@ copy_thread(unsigned long clone_flags, unsigned long stack_start,
 
        memset(&thread->cpu_context, 0, sizeof(struct cpu_context_save));
 
+#ifdef CONFIG_CPU_USE_DOMAINS
        /*
         * Copy the initial value of the domain access control register
         * from the current thread: thread->addr_limit will have been
@@ -233,6 +234,7 @@ copy_thread(unsigned long clone_flags, unsigned long stack_start,
         * kernel/fork.c
         */
        thread->cpu_domain = get_domain();
+#endif
 
        if (likely(!(p->flags & PF_KTHREAD))) {
                *childregs = *current_pt_regs();
index ba0063c539c3fc3436f291c54a04e3cfa4b30793..48185a773852d4ec501702ae3a6471d38b03ce10 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/cpu.h>
 #include <linux/seq_file.h>
 #include <linux/irq.h>
+#include <linux/nmi.h>
 #include <linux/percpu.h>
 #include <linux/clockchips.h>
 #include <linux/completion.h>
@@ -72,6 +73,7 @@ enum ipi_msg_type {
        IPI_CPU_STOP,
        IPI_IRQ_WORK,
        IPI_COMPLETION,
+       IPI_CPU_BACKTRACE = 15,
 };
 
 static DECLARE_COMPLETION(cpu_running);
@@ -643,6 +645,12 @@ void handle_IPI(int ipinr, struct pt_regs *regs)
                irq_exit();
                break;
 
+       case IPI_CPU_BACKTRACE:
+               irq_enter();
+               nmi_cpu_backtrace(regs);
+               irq_exit();
+               break;
+
        default:
                pr_crit("CPU%u: Unknown IPI message 0x%x\n",
                        cpu, ipinr);
@@ -737,3 +745,13 @@ static int __init register_cpufreq_notifier(void)
 core_initcall(register_cpufreq_notifier);
 
 #endif
+
+static void raise_nmi(cpumask_t *mask)
+{
+       smp_cross_call(mask, IPI_CPU_BACKTRACE);
+}
+
+void arch_trigger_all_cpu_backtrace(bool include_self)
+{
+       nmi_trigger_all_cpu_backtrace(include_self, raise_nmi);
+}
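
Sketch for illustration, not part of this diff: with the IPI handler and raise_nmi() hook in place, generic code can request a backtrace of every online CPU, for example from sysrq-l or watchdog paths. A hypothetical caller:

        #include <linux/nmi.h>

        /* Sketch: dump stacks on all online CPUs via the new ARM implementation. */
        static void sketch_dump_all_cpus(void)
        {
                trigger_all_cpu_backtrace();
        }
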
index bc738d2b83929e6a05762ea8b73af6ec1ed86837..ce404a5c30628c72533a62e430c6150a54032fea 100644 (file)
@@ -125,6 +125,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
        if (ret)
                goto out_free_stage2_pgd;
 
+       kvm_vgic_early_init(kvm);
        kvm_timer_init(kvm);
 
        /* Mark the initial VMID generation invalid */
@@ -249,6 +250,7 @@ out:
 
 void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 {
+       kvm_vgic_vcpu_early_init(vcpu);
 }
 
 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
@@ -278,6 +280,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
        /* Set up the timer */
        kvm_timer_vcpu_init(vcpu);
 
+       kvm_arm_reset_debug_ptr(vcpu);
+
        return 0;
 }
 
@@ -301,13 +305,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
        kvm_arm_set_running_vcpu(NULL);
 }
 
-int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
-                                       struct kvm_guest_debug *dbg)
-{
-       return -EINVAL;
-}
-
-
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
                                    struct kvm_mp_state *mp_state)
 {
@@ -528,10 +525,20 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
                if (vcpu->arch.pause)
                        vcpu_pause(vcpu);
 
-               kvm_vgic_flush_hwstate(vcpu);
+               /*
+                * Disarming the background timer must be done in a
+                * preemptible context, as this call may sleep.
+                */
                kvm_timer_flush_hwstate(vcpu);
 
+               /*
+                * Preparing the interrupts to be injected also
+                * involves poking the GIC, which must be done in a
+                * non-preemptible context.
+                */
                preempt_disable();
+               kvm_vgic_flush_hwstate(vcpu);
+
                local_irq_disable();
 
                /*
@@ -544,12 +551,14 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 
                if (ret <= 0 || need_new_vmid_gen(vcpu->kvm)) {
                        local_irq_enable();
+                       kvm_vgic_sync_hwstate(vcpu);
                        preempt_enable();
                        kvm_timer_sync_hwstate(vcpu);
-                       kvm_vgic_sync_hwstate(vcpu);
                        continue;
                }
 
+               kvm_arm_setup_debug(vcpu);
+
                /**************************************************************
                 * Enter the guest
                 */
@@ -564,6 +573,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
                 * Back from guest
                 *************************************************************/
 
+               kvm_arm_clear_debug(vcpu);
+
                /*
                 * We may have taken a host interrupt in HYP mode (ie
                 * while executing the guest). This interrupt is still
@@ -586,11 +597,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
                 */
                kvm_guest_exit();
                trace_kvm_exit(kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
-               preempt_enable();
 
+               kvm_vgic_sync_hwstate(vcpu);
+
+               preempt_enable();
 
                kvm_timer_sync_hwstate(vcpu);
-               kvm_vgic_sync_hwstate(vcpu);
 
                ret = handle_exit(vcpu, run, ret);
        }
@@ -921,6 +933,8 @@ static void cpu_init_hyp_mode(void *dummy)
        vector_ptr = (unsigned long)__kvm_hyp_vector;
 
        __cpu_init_hyp_mode(boot_pgd_ptr, pgd_ptr, hyp_stack_ptr, vector_ptr);
+
+       kvm_arm_init_debug();
 }
 
 static int hyp_init_cpu_notify(struct notifier_block *self,
index d503fbb787d362752b9b6b688b2829e19b675095..96e935bbc38c8b4fd906aeacdc27ca28696b596a 100644 (file)
@@ -290,3 +290,9 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
 {
        return -EINVAL;
 }
+
+int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
+                                       struct kvm_guest_debug *dbg)
+{
+       return -EINVAL;
+}
index 568494dbbbb5b169581b656ed5126a0164b5a621..900ef6dd8f727e9a510b79cd0d9e36dbafa26cfa 100644 (file)
@@ -361,10 +361,6 @@ hyp_hvc:
        @ Check syndrome register
        mrc     p15, 4, r1, c5, c2, 0   @ HSR
        lsr     r0, r1, #HSR_EC_SHIFT
-#ifdef CONFIG_VFPv3
-       cmp     r0, #HSR_EC_CP_0_13
-       beq     switch_to_guest_vfp
-#endif
        cmp     r0, #HSR_EC_HVC
        bne     guest_trap              @ Not HVC instr.
 
@@ -378,7 +374,10 @@ hyp_hvc:
        cmp     r2, #0
        bne     guest_trap              @ Guest called HVC
 
-host_switch_to_hyp:
+       /*
+        * Getting here means host called HVC, we shift parameters and branch
+        * to Hyp function.
+        */
        pop     {r0, r1, r2}
 
        /* Check for __hyp_get_vectors */
@@ -409,6 +408,10 @@ guest_trap:
 
        @ Check if we need the fault information
        lsr     r1, r1, #HSR_EC_SHIFT
+#ifdef CONFIG_VFPv3
+       cmp     r1, #HSR_EC_CP_0_13
+       beq     switch_to_guest_vfp
+#endif
        cmp     r1, #HSR_EC_IABT
        mrceq   p15, 4, r2, c6, c0, 2   @ HIFAR
        beq     2f
@@ -477,7 +480,6 @@ guest_trap:
  */
 #ifdef CONFIG_VFPv3
 switch_to_guest_vfp:
-       load_vcpu                       @ Load VCPU pointer to r0
        push    {r3-r7}
 
        @ NEON/VFP used.  Turn on VFP access.
index f558c073c02378a449a05d337d47a8161ae5c51d..eeb85858d6bbe6dff02ac453ea2f19a889cd8810 100644 (file)
@@ -77,7 +77,5 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
        kvm_reset_coprocs(vcpu);
 
        /* Reset arch_timer context */
-       kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq);
-
-       return 0;
+       return kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq);
 }
index 1ec378c334e5ccdb3e37d1b2230e38227e17e5f0..972abdb100284b379070251b469c2460d35a7ad4 100644 (file)
@@ -95,7 +95,7 @@ static struct physmap_flash_data cdb89712_bootrom_pdata __initdata = {
 
 static struct resource cdb89712_bootrom_resources[] __initdata = {
        DEFINE_RES_NAMED(CS7_PHYS_BASE, SZ_128, "BOOTROM", IORESOURCE_MEM |
-                        IORESOURCE_CACHEABLE | IORESOURCE_READONLY),
+                        IORESOURCE_READONLY),
 };
 
 static struct platform_device cdb89712_bootrom_pdev __initdata = {
index 4c4858c566d8c6e22a8dba63b967d85708a08516..3a10f1a8317ae7a053ed997da88a06ddd5311b57 100644 (file)
@@ -15,6 +15,7 @@ menuconfig ARCH_EXYNOS
        select ARM_AMBA
        select ARM_GIC
        select COMMON_CLK_SAMSUNG
+       select EXYNOS_THERMAL
        select HAVE_ARM_SCU if SMP
        select HAVE_S3C2410_I2C if I2C
        select HAVE_S3C2410_WATCHDOG if WATCHDOG
@@ -24,6 +25,7 @@ menuconfig ARCH_EXYNOS
        select PM_GENERIC_DOMAINS if PM
        select S5P_DEV_MFC
        select SRAM
+       select THERMAL
        select MFD_SYSCON
        help
          Support for SAMSUNG EXYNOS SoCs (EXYNOS4/5)
index 5f8ddcdeeacf1117d92313e6cb34608be136a955..1c47aee31e9cc60aeabc8c504b41c76c2379a435 100644 (file)
@@ -225,7 +225,11 @@ static void __init exynos_init_irq(void)
 }
 
 static const struct of_device_id exynos_cpufreq_matches[] = {
+       { .compatible = "samsung,exynos3250", .data = "cpufreq-dt" },
        { .compatible = "samsung,exynos4210", .data = "cpufreq-dt" },
+       { .compatible = "samsung,exynos4212", .data = "cpufreq-dt" },
+       { .compatible = "samsung,exynos4412", .data = "cpufreq-dt" },
+       { .compatible = "samsung,exynos5250", .data = "cpufreq-dt" },
        { /* sentinel */ }
 };
 
diff --git a/arch/arm/mach-mmp/include/mach/regs-rtc.h b/arch/arm/mach-mmp/include/mach/regs-rtc.h
deleted file mode 100644 (file)
index 5bff886..0000000
+++ /dev/null
@@ -1,23 +0,0 @@
-#ifndef __ASM_MACH_REGS_RTC_H
-#define __ASM_MACH_REGS_RTC_H
-
-#include <mach/addr-map.h>
-
-#define RTC_VIRT_BASE  (APB_VIRT_BASE + 0x10000)
-#define RTC_REG(x)     (*((volatile u32 __iomem *)(RTC_VIRT_BASE + (x))))
-
-/*
- * Real Time Clock
- */
-
-#define RCNR           RTC_REG(0x00)   /* RTC Count Register */
-#define RTAR           RTC_REG(0x04)   /* RTC Alarm Register */
-#define RTSR           RTC_REG(0x08)   /* RTC Status Register */
-#define RTTR           RTC_REG(0x0C)   /* RTC Timer Trim Register */
-
-#define RTSR_HZE       (1 << 3)        /* HZ interrupt enable */
-#define RTSR_ALE       (1 << 2)        /* RTC alarm interrupt enable */
-#define RTSR_HZ                (1 << 1)        /* HZ rising-edge detected */
-#define RTSR_AL                (1 << 0)        /* RTC alarm detected */
-
-#endif /* __ASM_MACH_REGS_RTC_H */
index e6ce669b54af49af62b74940e19f98d549fd5947..c62473235a1332b3c7211a408b9a4501b116c211 100644 (file)
@@ -440,25 +440,11 @@ struct platform_device pxa_device_rtc = {
        .resource       = pxa_rtc_resources,
 };
 
-static struct resource sa1100_rtc_resources[] = {
-       {
-               .start  = IRQ_RTC1Hz,
-               .end    = IRQ_RTC1Hz,
-               .name   = "rtc 1Hz",
-               .flags  = IORESOURCE_IRQ,
-       }, {
-               .start  = IRQ_RTCAlrm,
-               .end    = IRQ_RTCAlrm,
-               .name   = "rtc alarm",
-               .flags  = IORESOURCE_IRQ,
-       },
-};
-
 struct platform_device sa1100_device_rtc = {
        .name           = "sa1100-rtc",
        .id             = -1,
-       .num_resources  = ARRAY_SIZE(sa1100_rtc_resources),
-       .resource       = sa1100_rtc_resources,
+       .num_resources  = ARRAY_SIZE(pxa_rtc_resources),
+       .resource       = pxa_rtc_resources,
 };
 
 static struct resource pxa_ac97_resources[] = {
index e6aae9e8adfbdc334876308306c6bf9e1dedd2cf..221260d5d1092364792e3eb181eea74a67462bc7 100644 (file)
@@ -282,7 +282,6 @@ static struct platform_device *devices[] __initdata = {
        &pxa_device_asoc_ssp2,
        &pxa_device_asoc_ssp3,
        &pxa_device_asoc_platform,
-       &sa1100_device_rtc,
        &pxa_device_rtc,
        &pxa27x_device_ssp1,
        &pxa27x_device_ssp2,
index 165638462a2f21022dbbac3b2065bbc28aea1e3b..ce0f8d6242e2a047429a031f7ab9e6e53193d917 100644 (file)
@@ -394,7 +394,6 @@ static struct platform_device *devices[] __initdata = {
        &pxa_device_asoc_ssp3,
        &pxa_device_asoc_ssp4,
        &pxa_device_asoc_platform,
-       &sa1100_device_rtc,
        &pxa_device_rtc,
        &pxa3xx_device_ssp1,
        &pxa3xx_device_ssp2,
index 0ac6cc08a19c6c5467ecd35cf4403e9b2656e7a2..7972617cca647a8fe2c3f05922606de72a442e65 100644 (file)
 #define OIER_E3        OIER_E (3)      /* match interrupt Enable 3        */
 
 
-/*
- * Real-Time Clock (RTC) control registers
- *
- * Registers
- *    RTAR             Real-Time Clock (RTC) Alarm Register (read/write).
- *    RCNR             Real-Time Clock (RTC) CouNt Register (read/write).
- *    RTTR             Real-Time Clock (RTC) Trim Register (read/write).
- *    RTSR             Real-Time Clock (RTC) Status Register (read/write).
- *
- * Clocks
- *    frtx, Trtx       Frequency, period of the real-time clock crystal
- *                     (32.768 kHz nominal).
- *    frtc, Trtc       Frequency, period of the real-time clock counter
- *                     (1 Hz nominal).
- */
-
-#define RTAR           __REG(0x90010000)  /* RTC Alarm Reg. */
-#define RCNR           __REG(0x90010004)  /* RTC CouNt Reg. */
-#define RTTR           __REG(0x90010008)  /* RTC Trim Reg. */
-#define RTSR           __REG(0x90010010)  /* RTC Status Reg. */
-
-#define RTTR_C         Fld (16, 0)     /* clock divider Count - 1         */
-#define RTTR_D         Fld (10, 16)    /* trim Delete count               */
-                                       /* frtc = (1023*(C + 1) - D)*frtx/ */
-                                       /*        (1023*(C + 1)^2)         */
-                                       /* Trtc = (1023*(C + 1)^2)*Trtx/   */
-                                       /*        (1023*(C + 1) - D)       */
-
-#define RTSR_AL        0x00000001      /* ALarm detected                  */
-#define RTSR_HZ        0x00000002      /* 1 Hz clock detected             */
-#define RTSR_ALE       0x00000004      /* ALarm interrupt Enable          */
-#define RTSR_HZE       0x00000008      /* 1 Hz clock interrupt Enable     */
-
-
 /*
  * Power Manager (PM) control registers
  *
index 4092ad16e0a42a266e5b7675e2d60662fd958417..0af05d288b09c3ab65b33e82713fb7a173a432cb 100644 (file)
@@ -12,7 +12,7 @@
 #include <linux/err.h>
 #include <linux/mm.h>
 #include <linux/spinlock.h>
-#include <asm/io.h>
+#include <linux/io.h>
 #include "pm-rcar.h"
 
 /* SYSC Common */
index bf35abcc7d598c6cf7c44e5e2e13fd044893cff0..e62604384945e513a9b1ed14a2a5a2e3d8950630 100644 (file)
@@ -676,10 +676,6 @@ void *arm_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
                    gfp_t gfp, struct dma_attrs *attrs)
 {
        pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL);
-       void *memory;
-
-       if (dma_alloc_from_coherent(dev, size, handle, &memory))
-               return memory;
 
        return __dma_alloc(dev, size, handle, gfp, prot, false,
                           attrs, __builtin_return_address(0));
@@ -688,11 +684,6 @@ void *arm_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
 static void *arm_coherent_dma_alloc(struct device *dev, size_t size,
        dma_addr_t *handle, gfp_t gfp, struct dma_attrs *attrs)
 {
-       void *memory;
-
-       if (dma_alloc_from_coherent(dev, size, handle, &memory))
-               return memory;
-
        return __dma_alloc(dev, size, handle, gfp, PAGE_KERNEL, true,
                           attrs, __builtin_return_address(0));
 }
@@ -752,9 +743,6 @@ static void __arm_dma_free(struct device *dev, size_t size, void *cpu_addr,
        struct page *page = pfn_to_page(dma_to_pfn(dev, handle));
        bool want_vaddr = !dma_get_attr(DMA_ATTR_NO_KERNEL_MAPPING, attrs);
 
-       if (dma_release_from_coherent(dev, get_order(size), cpu_addr))
-               return;
-
        size = PAGE_ALIGN(size);
 
        if (nommu()) {
index 71df4354765927da922606db930c6665cbd0c7f9..39c20afad7ed9ed3b4b967a54d3a435e07eccf9e 100644 (file)
@@ -95,9 +95,10 @@ emulate:
        reteq   r4                      @ no, return failure
 
 next:
+       uaccess_enable r3
 .Lx1:  ldrt    r6, [r5], #4            @ get the next instruction and
                                        @ increment PC
-
+       uaccess_disable r3
        and     r2, r6, #0x0F000000     @ test for FP insns
        teq     r2, #0x0C000000
        teqne   r2, #0x0D000000
index 6c09cc440a2b24c4c0acfbd3027c74f9a1dc9efe..eeeab074e1542a3567de5dd4d79c5dab72c083da 100644 (file)
@@ -45,46 +45,39 @@ static struct vcpu_info __percpu *xen_vcpu_info;
 unsigned long xen_released_pages;
 struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
 
-/* TODO: to be removed */
-__read_mostly int xen_have_vector_callback;
-EXPORT_SYMBOL_GPL(xen_have_vector_callback);
-
-int xen_platform_pci_unplug = XEN_UNPLUG_ALL;
-EXPORT_SYMBOL_GPL(xen_platform_pci_unplug);
-
 static __read_mostly unsigned int xen_events_irq;
 
 static __initdata struct device_node *xen_node;
 
-int xen_remap_domain_mfn_array(struct vm_area_struct *vma,
+int xen_remap_domain_gfn_array(struct vm_area_struct *vma,
                               unsigned long addr,
-                              xen_pfn_t *mfn, int nr,
+                              xen_pfn_t *gfn, int nr,
                               int *err_ptr, pgprot_t prot,
                               unsigned domid,
                               struct page **pages)
 {
-       return xen_xlate_remap_gfn_array(vma, addr, mfn, nr, err_ptr,
+       return xen_xlate_remap_gfn_array(vma, addr, gfn, nr, err_ptr,
                                         prot, domid, pages);
 }
-EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_array);
+EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_array);
 
 /* Not used by XENFEAT_auto_translated guests. */
-int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
+int xen_remap_domain_gfn_range(struct vm_area_struct *vma,
                               unsigned long addr,
-                              xen_pfn_t mfn, int nr,
+                              xen_pfn_t gfn, int nr,
                               pgprot_t prot, unsigned domid,
                               struct page **pages)
 {
        return -ENOSYS;
 }
-EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
+EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_range);
 
-int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
+int xen_unmap_domain_gfn_range(struct vm_area_struct *vma,
                               int nr, struct page **pages)
 {
        return xen_xlate_unmap_gfn_range(vma, nr, pages);
 }
-EXPORT_SYMBOL_GPL(xen_unmap_domain_mfn_range);
+EXPORT_SYMBOL_GPL(xen_unmap_domain_gfn_range);
 
 static void xen_percpu_init(void)
 {
index f00e080759384afd300398be488c5740f55f1091..10fd99c568c62a9296b4ad7d2cc3584360b1b8a0 100644 (file)
@@ -98,8 +98,23 @@ ENTRY(privcmd_call)
        mov r1, r2
        mov r2, r3
        ldr r3, [sp, #8]
+       /*
+        * Privcmd calls are issued by userspace. We need to allow the
+        * kernel to access userspace memory before issuing the hypercall.
+        */
+       uaccess_enable r4
+
+       /* r4 is loaded only now because it was used as a scratch register above */
        ldr r4, [sp, #4]
        __HVC(XEN_IMM)
+
+       /*
+        * Disable userspace access from the kernel. It is fine to do this
+        * unconditionally, as no set_fs(KERNEL_DS)/set_fs(get_ds()) has been
+        * called beforehand.
+        */
+       uaccess_disable r4
+
        ldm sp!, {r4}
        ret lr
 ENDPROC(privcmd_call);
index 03e75fef15b8254483929f1332de033a1eeeac05..6dd911d1f0ac6c0f2c328abbee593d85d5bd5bcd 100644 (file)
@@ -139,9 +139,9 @@ void __xen_dma_sync_single_for_device(struct device *hwdev,
 
 bool xen_arch_need_swiotlb(struct device *dev,
                           unsigned long pfn,
-                          unsigned long mfn)
+                          unsigned long bfn)
 {
-       return (!hypercall_cflush && (pfn != mfn) && !is_device_dma_coherent(dev));
+       return (!hypercall_cflush && (pfn != bfn) && !is_device_dma_coherent(dev));
 }
 
 int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order,
index 7d95663c0160bd2575199e3836570e46fb4a158d..07d1811aa03fcd1ecd5ee7c260688a59f7ab97e4 100644 (file)
@@ -32,6 +32,7 @@ config ARM64
        select GENERIC_CLOCKEVENTS_BROADCAST
        select GENERIC_CPU_AUTOPROBE
        select GENERIC_EARLY_IOREMAP
+       select GENERIC_IDLE_POLL_SETUP
        select GENERIC_IRQ_PROBE
        select GENERIC_IRQ_SHOW
        select GENERIC_IRQ_SHOW_LEVEL
@@ -331,6 +332,22 @@ config ARM64_ERRATUM_845719
 
          If unsure, say Y.
 
+config ARM64_ERRATUM_843419
+       bool "Cortex-A53: 843419: A load or store might access an incorrect address"
+       depends on MODULES
+       default y
+       help
+         This option builds kernel modules using the large memory model in
+         order to avoid the use of the ADRP instruction, which can cause
+         a subsequent memory access to use an incorrect address on Cortex-A53
+         parts up to r0p4.
+
+         Note that the kernel itself must be linked with a version of ld
+         which fixes potentially affected ADRP instructions through the
+         use of veneers.
+
+         If unsure, say Y.
+
 endmenu
 
 
index 15ff5b4156fd74a041f3ad926efbeae05ba382dc..f9914d7c1bb00b5c4cbe7a19c0f62c8eca54cf81 100644 (file)
@@ -41,6 +41,10 @@ endif
 
 CHECKFLAGS     += -D__aarch64__
 
+ifeq ($(CONFIG_ARM64_ERRATUM_843419), y)
+CFLAGS_MODULE  += -mcmodel=large
+endif
+
 # Default value
 head-y         := arch/arm64/kernel/head.o
 
index f0d6d0bfe55ceceba3339bc9044bed31a159a9cf..cfdb34bedbcd8adeb0a6f012a02459042726030b 100644 (file)
@@ -22,8 +22,6 @@
 #include <linux/types.h>
 #include <linux/vmalloc.h>
 
-#include <asm-generic/dma-coherent.h>
-
 #include <xen/xen.h>
 #include <asm/xen/hypervisor.h>
 
@@ -86,28 +84,6 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t dev_addr)
        return (phys_addr_t)dev_addr;
 }
 
-static inline int dma_mapping_error(struct device *dev, dma_addr_t dev_addr)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-       debug_dma_mapping_error(dev, dev_addr);
-       return ops->mapping_error(dev, dev_addr);
-}
-
-static inline int dma_supported(struct device *dev, u64 mask)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-       return ops->dma_supported(dev, mask);
-}
-
-static inline int dma_set_mask(struct device *dev, u64 mask)
-{
-       if (!dev->dma_mask || !dma_supported(dev, mask))
-               return -EIO;
-       *dev->dma_mask = mask;
-
-       return 0;
-}
-
 static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
 {
        if (!dev->dma_mask)
@@ -120,50 +96,5 @@ static inline void dma_mark_clean(void *addr, size_t size)
 {
 }
 
-#define dma_alloc_coherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL)
-#define dma_free_coherent(d, s, h, f)  dma_free_attrs(d, s, h, f, NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                   dma_addr_t *dma_handle, gfp_t flags,
-                                   struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-       void *vaddr;
-
-       if (dma_alloc_from_coherent(dev, size, dma_handle, &vaddr))
-               return vaddr;
-
-       vaddr = ops->alloc(dev, size, dma_handle, flags, attrs);
-       debug_dma_alloc_coherent(dev, size, *dma_handle, vaddr);
-       return vaddr;
-}
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                 void *vaddr, dma_addr_t dev_addr,
-                                 struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       if (dma_release_from_coherent(dev, get_order(size), vaddr))
-               return;
-
-       debug_dma_free_coherent(dev, size, vaddr, dev_addr);
-       ops->free(dev, size, vaddr, dev_addr, attrs);
-}
-
-/*
- * There is no dma_cache_sync() implementation, so just return NULL here.
- */
-static inline void *dma_alloc_noncoherent(struct device *dev, size_t size,
-                                         dma_addr_t *handle, gfp_t flags)
-{
-       return NULL;
-}
-
-static inline void dma_free_noncoherent(struct device *dev, size_t size,
-                                       void *cpu_addr, dma_addr_t handle)
-{
-}
-
 #endif /* __KERNEL__ */
 #endif /* __ASM_DMA_MAPPING_H */
index 52b484b6aa1a7fec251a72b028f655523c74236a..4c47cb2fbb526f7ae445e4f1b46178ba99934283 100644 (file)
@@ -16,6 +16,8 @@
 #ifndef __ASM_HW_BREAKPOINT_H
 #define __ASM_HW_BREAKPOINT_H
 
+#include <asm/cputype.h>
+
 #ifdef __KERNEL__
 
 struct arch_hw_breakpoint_ctrl {
@@ -132,5 +134,17 @@ static inline void ptrace_hw_copy_thread(struct task_struct *task)
 
 extern struct pmu perf_ops_bp;
 
+/* Determine number of BRP registers available. */
+static inline int get_num_brps(void)
+{
+       return ((read_cpuid(ID_AA64DFR0_EL1) >> 12) & 0xf) + 1;
+}
+
+/* Determine number of WRP registers available. */
+static inline int get_num_wrps(void)
+{
+       return ((read_cpuid(ID_AA64DFR0_EL1) >> 20) & 0xf) + 1;
+}
+
 #endif /* __KERNEL__ */
 #endif /* __ASM_BREAKPOINT_H */
index ac6fafb95fe71e48048fe3831f226853f2f4914d..7605e095217f7c2434594327ae4b3453cb758f42 100644 (file)
 #define HSTR_EL2_TTEE  (1 << 16)
 #define HSTR_EL2_T(x)  (1 << x)
 
+/* Hyp Coprocessor Trap Register Shifts */
+#define CPTR_EL2_TFP_SHIFT 10
+
 /* Hyp Coprocessor Trap Register */
 #define CPTR_EL2_TCPAC (1 << 31)
 #define CPTR_EL2_TTA   (1 << 20)
-#define CPTR_EL2_TFP   (1 << 10)
+#define CPTR_EL2_TFP   (1 << CPTR_EL2_TFP_SHIFT)
 
 /* Hyp Debug Configuration Register bits */
 #define MDCR_EL2_TDRA          (1 << 11)
index 3c5fe685a2d69a55daf68fe40fa782a0843bb2f5..67fa0de3d48324cc19a06871904f619189ed27da 100644 (file)
 #define        CNTKCTL_EL1     20      /* Timer Control Register (EL1) */
 #define        PAR_EL1         21      /* Physical Address Register */
 #define MDSCR_EL1      22      /* Monitor Debug System Control Register */
-#define DBGBCR0_EL1    23      /* Debug Breakpoint Control Registers (0-15) */
-#define DBGBCR15_EL1   38
-#define DBGBVR0_EL1    39      /* Debug Breakpoint Value Registers (0-15) */
-#define DBGBVR15_EL1   54
-#define DBGWCR0_EL1    55      /* Debug Watchpoint Control Registers (0-15) */
-#define DBGWCR15_EL1   70
-#define DBGWVR0_EL1    71      /* Debug Watchpoint Value Registers (0-15) */
-#define DBGWVR15_EL1   86
-#define MDCCINT_EL1    87      /* Monitor Debug Comms Channel Interrupt Enable Reg */
+#define MDCCINT_EL1    23      /* Monitor Debug Comms Channel Interrupt Enable Reg */
 
 /* 32bit specific registers. Keep them at the end of the range */
-#define        DACR32_EL2      88      /* Domain Access Control Register */
-#define        IFSR32_EL2      89      /* Instruction Fault Status Register */
-#define        FPEXC32_EL2     90      /* Floating-Point Exception Control Register */
-#define        DBGVCR32_EL2    91      /* Debug Vector Catch Register */
-#define        TEECR32_EL1     92      /* ThumbEE Configuration Register */
-#define        TEEHBR32_EL1    93      /* ThumbEE Handler Base Register */
-#define        NR_SYS_REGS     94
+#define        DACR32_EL2      24      /* Domain Access Control Register */
+#define        IFSR32_EL2      25      /* Instruction Fault Status Register */
+#define        FPEXC32_EL2     26      /* Floating-Point Exception Control Register */
+#define        DBGVCR32_EL2    27      /* Debug Vector Catch Register */
+#define        TEECR32_EL1     28      /* ThumbEE Configuration Register */
+#define        TEEHBR32_EL1    29      /* ThumbEE Handler Base Register */
+#define        NR_SYS_REGS     30
 
 /* 32bit mapping */
 #define c0_MPIDR       (MPIDR_EL1 * 2) /* MultiProcessor ID Register */
@@ -132,6 +124,8 @@ extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
 
 extern u64 __vgic_v3_get_ich_vtr_el2(void);
 
+extern u32 __kvm_get_mdcr_el2(void);
+
 #endif
 
 #endif /* __ARM_KVM_ASM_H__ */
index 2709db2a7eac78a49fc75c072810b9ac41cc9e0a..415938dc45cff94600625963b71011882f3a3ae4 100644 (file)
@@ -103,15 +103,34 @@ struct kvm_vcpu_arch {
 
        /* HYP configuration */
        u64 hcr_el2;
+       u32 mdcr_el2;
 
        /* Exception Information */
        struct kvm_vcpu_fault_info fault;
 
-       /* Debug state */
+       /* Guest debug state */
        u64 debug_flags;
 
+       /*
+        * We maintain more than a single set of debug registers to support
+        * debugging the guest from the host and to maintain separate host and
+        * guest state during world switches. vcpu_debug_state are the debug
+        * registers of the vcpu as the guest sees them.  host_debug_state are
+        * the host registers which are saved and restored during
+        * world switches. external_debug_state contains the debug register
+        * values we want to use while debugging the guest; it is set via the
+        * KVM_SET_GUEST_DEBUG ioctl.
+        *
+        * debug_ptr points to the set of debug registers that should be loaded
+        * onto the hardware when running the guest.
+        */
+       struct kvm_guest_debug_arch *debug_ptr;
+       struct kvm_guest_debug_arch vcpu_debug_state;
+       struct kvm_guest_debug_arch external_debug_state;
+
        /* Pointer to host CPU context */
        kvm_cpu_context_t *host_cpu_context;
+       struct kvm_guest_debug_arch host_debug_state;
 
        /* VGIC state */
        struct vgic_cpu vgic_cpu;
@@ -122,6 +141,17 @@ struct kvm_vcpu_arch {
         * here.
         */
 
+       /*
+        * Guest registers we preserve during guest debugging.
+        *
+        * These shadow registers are updated by the kvm_handle_sys_reg
+        * trap handler if the guest accesses or updates them while we
+        * are using guest debug.
+        */
+       struct {
+               u32     mdscr_el1;
+       } guest_debug_preserved;
+
        /* Don't run the guest */
        bool pause;
 
@@ -216,15 +246,15 @@ static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr,
                     hyp_stack_ptr, vector_ptr);
 }
 
-struct vgic_sr_vectors {
-       void    *save_vgic;
-       void    *restore_vgic;
-};
-
 static inline void kvm_arch_hardware_disable(void) {}
 static inline void kvm_arch_hardware_unsetup(void) {}
 static inline void kvm_arch_sync_events(struct kvm *kvm) {}
 static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
 
+void kvm_arm_init_debug(void);
+void kvm_arm_setup_debug(struct kvm_vcpu *vcpu);
+void kvm_arm_clear_debug(struct kvm_vcpu *vcpu);
+void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu);
+
 #endif /* __ARM64_KVM_HOST_H__ */
index 44a59c20e77358f98c7ad22c8ace5a1dd07bc7fd..6b4c3ad75a2a99b0760a38436b7d159bfe3209c3 100644 (file)
 #define __virt_to_phys(x)      (((phys_addr_t)(x) - PAGE_OFFSET + PHYS_OFFSET))
 #define __phys_to_virt(x)      ((unsigned long)((x) - PHYS_OFFSET + PAGE_OFFSET))
 
-/*
- * Convert a physical address to a Page Frame Number and back
- */
-#define        __phys_to_pfn(paddr)    ((unsigned long)((paddr) >> PAGE_SHIFT))
-#define        __pfn_to_phys(pfn)      ((phys_addr_t)(pfn) << PAGE_SHIFT)
-
 /*
  * Convert a page to/from a physical address
  */
index 6900b2d953717c721fb2318ef877428542baef17..b0329be95cb129f3b283f3d75e4dfeff64214bff 100644 (file)
  * Software defined PTE bits definition.
  */
 #define PTE_VALID              (_AT(pteval_t, 1) << 0)
+#define PTE_WRITE              (PTE_DBM)                /* same as DBM (51) */
 #define PTE_DIRTY              (_AT(pteval_t, 1) << 55)
 #define PTE_SPECIAL            (_AT(pteval_t, 1) << 56)
-#ifdef CONFIG_ARM64_HW_AFDBM
-#define PTE_WRITE              (PTE_DBM)                /* same as DBM */
-#else
-#define PTE_WRITE              (_AT(pteval_t, 1) << 57)
-#endif
 #define PTE_PROT_NONE          (_AT(pteval_t, 1) << 58) /* only when !PTE_VALID */
 
 /*
@@ -146,7 +142,7 @@ extern struct page *empty_zero_page;
 #define pte_exec(pte)          (!(pte_val(pte) & PTE_UXN))
 
 #ifdef CONFIG_ARM64_HW_AFDBM
-#define pte_hw_dirty(pte)      (!(pte_val(pte) & PTE_RDONLY))
+#define pte_hw_dirty(pte)      (pte_write(pte) && !(pte_val(pte) & PTE_RDONLY))
 #else
 #define pte_hw_dirty(pte)      (0)
 #endif
@@ -238,7 +234,7 @@ extern void __sync_icache_dcache(pte_t pteval, unsigned long addr);
  * When hardware DBM is not present, the software PTE_DIRTY bit is updated via
  * the page fault mechanism. Checking the dirty status of a pte becomes:
  *
- *   PTE_DIRTY || !PTE_RDONLY
+ *   PTE_DIRTY || (PTE_WRITE && !PTE_RDONLY)
  */
 static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
                              pte_t *ptep, pte_t pte)
@@ -503,7 +499,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
                              PTE_PROT_NONE | PTE_WRITE | PTE_TYPE_MASK;
        /* preserve the hardware dirty information */
        if (pte_hw_dirty(pte))
-               newprot |= PTE_DIRTY;
+               pte = pte_mkdirty(pte);
        pte_val(pte) = (pte_val(pte) & ~mask) | (pgprot_val(newprot) & mask);
        return pte;
 }
index 86553213c1322cafd3ca816e8e6e6a762c2bbe3d..4318866d053c513738c5fb3194652ceaf2a7770a 100644 (file)
@@ -18,4 +18,10 @@ static inline int xen_irqs_disabled(struct pt_regs *regs)
 
 #define xchg_xen_ulong(ptr, val) xchg((ptr), (val))
 
+/* Rebind event channel is supported by default */
+static inline bool xen_support_evtchn_rebind(void)
+{
+       return true;
+}
+
 #endif /* _ASM_ARM64_XEN_EVENTS_H */
index d26832022127e822d9912f19ca73fe6ea794a309..0cd7b5947dfcfc635c8f6c7de4480f8b015d8fd6 100644 (file)
@@ -53,14 +53,20 @@ struct kvm_regs {
        struct user_fpsimd_state fp_regs;
 };
 
-/* Supported Processor Types */
+/*
+ * Supported CPU Targets - Adding a new target type is not recommended
+ * unless there are some special registers not supported by the
+ * genericv8 sysreg table.
+ */
 #define KVM_ARM_TARGET_AEM_V8          0
 #define KVM_ARM_TARGET_FOUNDATION_V8   1
 #define KVM_ARM_TARGET_CORTEX_A57      2
 #define KVM_ARM_TARGET_XGENE_POTENZA   3
 #define KVM_ARM_TARGET_CORTEX_A53      4
+/* Generic ARM v8 target */
+#define KVM_ARM_TARGET_GENERIC_V8      5
 
-#define KVM_ARM_NUM_TARGETS            5
+#define KVM_ARM_NUM_TARGETS            6
 
 /* KVM_ARM_SET_DEVICE_ADDR ioctl id encoding */
 #define KVM_ARM_DEVICE_TYPE_SHIFT      0
@@ -100,12 +106,39 @@ struct kvm_sregs {
 struct kvm_fpu {
 };
 
+/*
+ * See v8 ARM ARM D7.3: Debug Registers
+ *
+ * The architectural limit is 16 debug registers of each type, although
+ * in practice there are usually fewer (see ID_AA64DFR0_EL1).
+ *
+ * Although the control registers are architecturally defined as 32
+ * bits wide, we use a 64-bit structure here to keep parity with
+ * KVM_GET/SET_ONE_REG behaviour which treats all system registers as
+ * 64 bit values. It also allows for the possibility of the
+ * architecture expanding the control registers without having to
+ * change the userspace ABI.
+ */
+#define KVM_ARM_MAX_DBG_REGS 16
 struct kvm_guest_debug_arch {
+       __u64 dbg_bcr[KVM_ARM_MAX_DBG_REGS];
+       __u64 dbg_bvr[KVM_ARM_MAX_DBG_REGS];
+       __u64 dbg_wcr[KVM_ARM_MAX_DBG_REGS];
+       __u64 dbg_wvr[KVM_ARM_MAX_DBG_REGS];
 };
 
 struct kvm_debug_exit_arch {
+       __u32 hsr;
+       __u64 far;      /* used for watchpoints */
 };
 
+/*
+ * Architecture specific defines for kvm_guest_debug->control
+ */
+
+#define KVM_GUESTDBG_USE_SW_BP         (1 << 16)
+#define KVM_GUESTDBG_USE_HW            (1 << 17)
+
 struct kvm_sync_regs {
 };
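
For illustration only (not part of this merge): a minimal userspace sketch of
driving this ABI through the generic KVM_SET_GUEST_DEBUG ioctl. The vcpu_fd
below is an assumed file descriptor from an earlier KVM_CREATE_VCPU call, and
only software breakpoints are enabled, so the kvm_guest_debug_arch registers
can stay zeroed.

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Enable guest debugging with software breakpoints on an existing vcpu.
 * vcpu_fd is assumed to come from a prior KVM_CREATE_VCPU call.
 */
static int enable_sw_breakpoints(int vcpu_fd)
{
        struct kvm_guest_debug dbg;

        memset(&dbg, 0, sizeof(dbg));
        dbg.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP;

        /* 0 on success, -1 with errno set otherwise */
        return ioctl(vcpu_fd, KVM_SET_GUEST_DEBUG, &dbg);
}

Calling the same ioctl with control set to 0 disables debugging again, matching
the "If not enabled clear all flags" path in kvm_arch_vcpu_ioctl_set_guest_debug()
further down in this series.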
 
index c99701a34d7b3c9238c80e1e4ebbaeba6e3488c3..8d89cf8dae5556851365e2cee335599b4d8eb359 100644 (file)
@@ -116,17 +116,22 @@ int main(void)
   DEFINE(VCPU_FAR_EL2,         offsetof(struct kvm_vcpu, arch.fault.far_el2));
   DEFINE(VCPU_HPFAR_EL2,       offsetof(struct kvm_vcpu, arch.fault.hpfar_el2));
   DEFINE(VCPU_DEBUG_FLAGS,     offsetof(struct kvm_vcpu, arch.debug_flags));
+  DEFINE(VCPU_DEBUG_PTR,       offsetof(struct kvm_vcpu, arch.debug_ptr));
+  DEFINE(DEBUG_BCR,            offsetof(struct kvm_guest_debug_arch, dbg_bcr));
+  DEFINE(DEBUG_BVR,            offsetof(struct kvm_guest_debug_arch, dbg_bvr));
+  DEFINE(DEBUG_WCR,            offsetof(struct kvm_guest_debug_arch, dbg_wcr));
+  DEFINE(DEBUG_WVR,            offsetof(struct kvm_guest_debug_arch, dbg_wvr));
   DEFINE(VCPU_HCR_EL2,         offsetof(struct kvm_vcpu, arch.hcr_el2));
+  DEFINE(VCPU_MDCR_EL2,        offsetof(struct kvm_vcpu, arch.mdcr_el2));
   DEFINE(VCPU_IRQ_LINES,       offsetof(struct kvm_vcpu, arch.irq_lines));
   DEFINE(VCPU_HOST_CONTEXT,    offsetof(struct kvm_vcpu, arch.host_cpu_context));
+  DEFINE(VCPU_HOST_DEBUG_STATE, offsetof(struct kvm_vcpu, arch.host_debug_state));
   DEFINE(VCPU_TIMER_CNTV_CTL,  offsetof(struct kvm_vcpu, arch.timer_cpu.cntv_ctl));
   DEFINE(VCPU_TIMER_CNTV_CVAL, offsetof(struct kvm_vcpu, arch.timer_cpu.cntv_cval));
   DEFINE(KVM_TIMER_CNTVOFF,    offsetof(struct kvm, arch.timer.cntvoff));
   DEFINE(KVM_TIMER_ENABLED,    offsetof(struct kvm, arch.timer.enabled));
   DEFINE(VCPU_KVM,             offsetof(struct kvm_vcpu, kvm));
   DEFINE(VCPU_VGIC_CPU,                offsetof(struct kvm_vcpu, arch.vgic_cpu));
-  DEFINE(VGIC_SAVE_FN,         offsetof(struct vgic_sr_vectors, save_vgic));
-  DEFINE(VGIC_RESTORE_FN,      offsetof(struct vgic_sr_vectors, restore_vgic));
   DEFINE(VGIC_V2_CPU_HCR,      offsetof(struct vgic_cpu, vgic_v2.vgic_hcr));
   DEFINE(VGIC_V2_CPU_VMCR,     offsetof(struct vgic_cpu, vgic_v2.vgic_vmcr));
   DEFINE(VGIC_V2_CPU_MISR,     offsetof(struct vgic_cpu, vgic_v2.vgic_misr));
index 9b3b62ac9c244ba91b9f7020ac7fe0dbb141c5ec..cebf78661a553775003bfee8ec89f65e33e3ec55 100644 (file)
@@ -134,7 +134,7 @@ static int os_lock_notify(struct notifier_block *self,
                                    unsigned long action, void *data)
 {
        int cpu = (unsigned long)data;
-       if (action == CPU_ONLINE)
+       if ((action & ~CPU_TASKS_FROZEN) == CPU_ONLINE)
                smp_call_function_single(cpu, clear_os_lock, NULL, 1);
        return NOTIFY_OK;
 }
index a055be6125cf592d06e957df0b1125d409c9a5cf..90d09eddd5b27368e358efd44ab552db8330d39c 100644 (file)
@@ -523,6 +523,11 @@ CPU_LE(    movk    x0, #0x30d0, lsl #16    )       // Clear EE and E0E on LE systems
        msr     hstr_el2, xzr                   // Disable CP15 traps to EL2
 #endif
 
+       /* EL2 debug */
+       mrs     x0, pmcr_el0                    // Disable debug access traps
+       ubfx    x0, x0, #11, #5                 // to EL2 and allow access to
+       msr     mdcr_el2, x0                    // all PMU counters from EL1
+
        /* Stage-2 translation */
        msr     vttbr_el2, xzr
 
index 003bc3d50636f585e3777661ad270baedaf1fe15..bba85c8f80373937ef9fe746e1a2ed4fc39f58ee 100644 (file)
@@ -48,18 +48,6 @@ static DEFINE_PER_CPU(int, stepping_kernel_bp);
 static int core_num_brps;
 static int core_num_wrps;
 
-/* Determine number of BRP registers available. */
-static int get_num_brps(void)
-{
-       return ((read_cpuid(ID_AA64DFR0_EL1) >> 12) & 0xf) + 1;
-}
-
-/* Determine number of WRP registers available. */
-static int get_num_wrps(void)
-{
-       return ((read_cpuid(ID_AA64DFR0_EL1) >> 20) & 0xf) + 1;
-}
-
 int hw_breakpoint_slots(int type)
 {
        /*
@@ -884,7 +872,7 @@ static int hw_breakpoint_reset_notify(struct notifier_block *self,
                                                void *hcpu)
 {
        int cpu = (long)hcpu;
-       if (action == CPU_ONLINE)
+       if ((action & ~CPU_TASKS_FROZEN) == CPU_ONLINE)
                smp_call_function_single(cpu, hw_breakpoint_reset, NULL, 1);
        return NOTIFY_OK;
 }
index 67bf4107f6efe8401e1df29ad471ff8aac8cb01d..876eb8df50bf3355ac8432a2ddf2a5c46810a5de 100644 (file)
@@ -332,12 +332,14 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
                        ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 0, 21,
                                             AARCH64_INSN_IMM_ADR);
                        break;
+#ifndef CONFIG_ARM64_ERRATUM_843419
                case R_AARCH64_ADR_PREL_PG_HI21_NC:
                        overflow_check = false;
                case R_AARCH64_ADR_PREL_PG_HI21:
                        ovf = reloc_insn_imm(RELOC_OP_PAGE, loc, val, 12, 21,
                                             AARCH64_INSN_IMM_ADR);
                        break;
+#endif
                case R_AARCH64_ADD_ABS_LO12_NC:
                case R_AARCH64_LDST8_ABS_LO12_NC:
                        overflow_check = false;
index 8884788812433b26aa734980538dea3cbea6405f..6bab21f84a9ff38402e70345016ed50ae8e95e30 100644 (file)
@@ -339,6 +339,67 @@ static void __init request_standard_resources(void)
        }
 }
 
+#ifdef CONFIG_BLK_DEV_INITRD
+/*
+ * Relocate initrd if it is not completely within the linear mapping.
+ * This would be the case if mem= cuts out all or part of it.
+ */
+static void __init relocate_initrd(void)
+{
+       phys_addr_t orig_start = __virt_to_phys(initrd_start);
+       phys_addr_t orig_end = __virt_to_phys(initrd_end);
+       phys_addr_t ram_end = memblock_end_of_DRAM();
+       phys_addr_t new_start;
+       unsigned long size, to_free = 0;
+       void *dest;
+
+       if (orig_end <= ram_end)
+               return;
+
+       /*
+        * Any part of the original initrd that overlaps the linear map
+        * should be freed after relocating.
+        */
+       if (orig_start < ram_end)
+               to_free = ram_end - orig_start;
+
+       size = orig_end - orig_start;
+
+       /* initrd needs to be relocated completely inside linear mapping */
+       new_start = memblock_find_in_range(0, PFN_PHYS(max_pfn),
+                                          size, PAGE_SIZE);
+       if (!new_start)
+               panic("Cannot relocate initrd of size %ld\n", size);
+       memblock_reserve(new_start, size);
+
+       initrd_start = __phys_to_virt(new_start);
+       initrd_end   = initrd_start + size;
+
+       pr_info("Moving initrd from [%llx-%llx] to [%llx-%llx]\n",
+               orig_start, orig_start + size - 1,
+               new_start, new_start + size - 1);
+
+       dest = (void *)initrd_start;
+
+       if (to_free) {
+               memcpy(dest, (void *)__phys_to_virt(orig_start), to_free);
+               dest += to_free;
+       }
+
+       copy_from_early_mem(dest, orig_start + to_free, size - to_free);
+
+       if (to_free) {
+               pr_info("Freeing original RAMDISK from [%llx-%llx]\n",
+                       orig_start, orig_start + to_free - 1);
+               memblock_free(orig_start, to_free);
+       }
+}
+#else
+static inline void __init relocate_initrd(void)
+{
+}
+#endif
+
 u64 __cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = INVALID_HWID };
 
 void __init setup_arch(char **cmdline_p)
@@ -372,6 +433,7 @@ void __init setup_arch(char **cmdline_p)
        acpi_boot_table_init();
 
        paging_init();
+       relocate_initrd();
        request_standard_resources();
 
        early_ioremap_reset();
index 948f0ad2de231b5e3f5efa62e204162cadf26503..71ef6dc89ae509cd299eab3f1959ffe4e8f96a33 100644 (file)
@@ -212,14 +212,32 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
 
 /*
  * VFP save/restore code.
+ *
+ * We have to be careful with endianness, since the fpsimd context-switch
+ * code operates on 128-bit (Q) register values whereas the compat ABI
+ * uses an array of 64-bit (D) registers. Consequently, we need to swap
+ * the two halves of each Q register when running on a big-endian CPU.
  */
+union __fpsimd_vreg {
+       __uint128_t     raw;
+       struct {
+#ifdef __AARCH64EB__
+               u64     hi;
+               u64     lo;
+#else
+               u64     lo;
+               u64     hi;
+#endif
+       };
+};
+
 static int compat_preserve_vfp_context(struct compat_vfp_sigframe __user *frame)
 {
        struct fpsimd_state *fpsimd = &current->thread.fpsimd_state;
        compat_ulong_t magic = VFP_MAGIC;
        compat_ulong_t size = VFP_STORAGE_SIZE;
        compat_ulong_t fpscr, fpexc;
-       int err = 0;
+       int i, err = 0;
 
        /*
         * Save the hardware registers to the fpsimd_state structure.
@@ -235,10 +253,15 @@ static int compat_preserve_vfp_context(struct compat_vfp_sigframe __user *frame)
        /*
         * Now copy the FP registers. Since the registers are packed,
         * we can copy the prefix we want (V0-V15) as it is.
-        * FIXME: Won't work if big endian.
         */
-       err |= __copy_to_user(&frame->ufp.fpregs, fpsimd->vregs,
-                             sizeof(frame->ufp.fpregs));
+       for (i = 0; i < ARRAY_SIZE(frame->ufp.fpregs); i += 2) {
+               union __fpsimd_vreg vreg = {
+                       .raw = fpsimd->vregs[i >> 1],
+               };
+
+               __put_user_error(vreg.lo, &frame->ufp.fpregs[i], err);
+               __put_user_error(vreg.hi, &frame->ufp.fpregs[i + 1], err);
+       }
 
        /* Create an AArch32 fpscr from the fpsr and the fpcr. */
        fpscr = (fpsimd->fpsr & VFP_FPSCR_STAT_MASK) |
@@ -263,7 +286,7 @@ static int compat_restore_vfp_context(struct compat_vfp_sigframe __user *frame)
        compat_ulong_t magic = VFP_MAGIC;
        compat_ulong_t size = VFP_STORAGE_SIZE;
        compat_ulong_t fpscr;
-       int err = 0;
+       int i, err = 0;
 
        __get_user_error(magic, &frame->magic, err);
        __get_user_error(size, &frame->size, err);
@@ -273,12 +296,14 @@ static int compat_restore_vfp_context(struct compat_vfp_sigframe __user *frame)
        if (magic != VFP_MAGIC || size != VFP_STORAGE_SIZE)
                return -EINVAL;
 
-       /*
-        * Copy the FP registers into the start of the fpsimd_state.
-        * FIXME: Won't work if big endian.
-        */
-       err |= __copy_from_user(fpsimd.vregs, frame->ufp.fpregs,
-                               sizeof(frame->ufp.fpregs));
+       /* Copy the FP registers into the start of the fpsimd_state. */
+       for (i = 0; i < ARRAY_SIZE(frame->ufp.fpregs); i += 2) {
+               union __fpsimd_vreg vreg;
+
+               __get_user_error(vreg.lo, &frame->ufp.fpregs[i], err);
+               __get_user_error(vreg.hi, &frame->ufp.fpregs[i + 1], err);
+               fpsimd.vregs[i >> 1] = vreg.raw;
+       }
 
        /* Extract the fpsr and the fpcr from the fpscr */
        __get_user_error(fpscr, &frame->ufp.fpscr, err);
index f90f4aa7f88d9d60ba4e8b53645c7d81d15f3e07..1949fe5f54246a3f14983753570f4e4d0f978e76 100644 (file)
@@ -17,7 +17,7 @@ kvm-$(CONFIG_KVM_ARM_HOST) += $(ARM)/psci.o $(ARM)/perf.o
 
 kvm-$(CONFIG_KVM_ARM_HOST) += emulate.o inject_fault.o regmap.o
 kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o
-kvm-$(CONFIG_KVM_ARM_HOST) += guest.o reset.o sys_regs.o sys_regs_generic_v8.o
+kvm-$(CONFIG_KVM_ARM_HOST) += guest.o debug.o reset.o sys_regs.o sys_regs_generic_v8.o
 
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v2.o
diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c
new file mode 100644 (file)
index 0000000..47e5f0f
--- /dev/null
@@ -0,0 +1,217 @@
+/*
+ * Debug and Guest Debug support
+ *
+ * Copyright (C) 2015 - Linaro Ltd
+ * Author: Alex Bennée <alex.bennee@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/hw_breakpoint.h>
+
+#include <asm/debug-monitors.h>
+#include <asm/kvm_asm.h>
+#include <asm/kvm_arm.h>
+#include <asm/kvm_emulate.h>
+
+#include "trace.h"
+
+/* These are the bits of MDSCR_EL1 we may manipulate */
+#define MDSCR_EL1_DEBUG_MASK   (DBG_MDSCR_SS | \
+                               DBG_MDSCR_KDE | \
+                               DBG_MDSCR_MDE)
+
+static DEFINE_PER_CPU(u32, mdcr_el2);
+
+/**
+ * save/restore_guest_debug_regs
+ *
+ * For some debug operations we need to tweak some guest registers. As
+ * a result we need to save the state of those registers before we
+ * make those modifications.
+ *
+ * Guest access to MDSCR_EL1 is trapped by the hypervisor and handled
+ * after we have restored the preserved value to the main context.
+ */
+static void save_guest_debug_regs(struct kvm_vcpu *vcpu)
+{
+       vcpu->arch.guest_debug_preserved.mdscr_el1 = vcpu_sys_reg(vcpu, MDSCR_EL1);
+
+       trace_kvm_arm_set_dreg32("Saved MDSCR_EL1",
+                               vcpu->arch.guest_debug_preserved.mdscr_el1);
+}
+
+static void restore_guest_debug_regs(struct kvm_vcpu *vcpu)
+{
+       vcpu_sys_reg(vcpu, MDSCR_EL1) = vcpu->arch.guest_debug_preserved.mdscr_el1;
+
+       trace_kvm_arm_set_dreg32("Restored MDSCR_EL1",
+                               vcpu_sys_reg(vcpu, MDSCR_EL1));
+}
+
+/**
+ * kvm_arm_init_debug - grab what we need for debug
+ *
+ * Currently the sole task of this function is to retrieve the initial
+ * value of mdcr_el2 so we can preserve MDCR_EL2.HPMN, which has
+ * presumably been set up by some knowledgeable bootcode.
+ *
+ * It is called once per CPU during CPU hyp initialisation.
+ */
+
+void kvm_arm_init_debug(void)
+{
+       __this_cpu_write(mdcr_el2, kvm_call_hyp(__kvm_get_mdcr_el2));
+}
+
+/**
+ * kvm_arm_reset_debug_ptr - reset the debug ptr to point to the vcpu state
+ */
+
+void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu)
+{
+       vcpu->arch.debug_ptr = &vcpu->arch.vcpu_debug_state;
+}
+
+/**
+ * kvm_arm_setup_debug - set up debug related stuff
+ *
+ * @vcpu:      the vcpu pointer
+ *
+ * This is called before each entry into the hypervisor to set up any
+ * debug-related registers. Currently this just ensures we will trap
+ * access to:
+ *  - Performance monitors (MDCR_EL2_TPM/MDCR_EL2_TPMCR)
+ *  - Debug ROM Address (MDCR_EL2_TDRA)
+ *  - OS related registers (MDCR_EL2_TDOSA)
+ *
+ * Additionally, KVM only traps guest accesses to the debug registers if
+ * the guest is not actively using them (see the KVM_ARM64_DEBUG_DIRTY
+ * flag on vcpu->arch.debug_flags).  Since the guest must not interfere
+ * with the hardware state when debugging the guest, we must ensure that
+ * trapping is enabled whenever we are debugging the guest using the
+ * debug registers.
+ */
+
+void kvm_arm_setup_debug(struct kvm_vcpu *vcpu)
+{
+       bool trap_debug = !(vcpu->arch.debug_flags & KVM_ARM64_DEBUG_DIRTY);
+
+       trace_kvm_arm_setup_debug(vcpu, vcpu->guest_debug);
+
+       vcpu->arch.mdcr_el2 = __this_cpu_read(mdcr_el2) & MDCR_EL2_HPMN_MASK;
+       vcpu->arch.mdcr_el2 |= (MDCR_EL2_TPM |
+                               MDCR_EL2_TPMCR |
+                               MDCR_EL2_TDRA |
+                               MDCR_EL2_TDOSA);
+
+       /* Is Guest debugging in effect? */
+       if (vcpu->guest_debug) {
+               /* Route all software debug exceptions to EL2 */
+               vcpu->arch.mdcr_el2 |= MDCR_EL2_TDE;
+
+               /* Save guest debug state */
+               save_guest_debug_regs(vcpu);
+
+               /*
+                * Single Step (ARM ARM D2.12.3 The software step state
+                * machine)
+                *
+                * If we are doing Single Step we need to manipulate
+                * the guest's MDSCR_EL1.SS and PSTATE.SS. Once the
+                * step has occurred the hypervisor will trap the
+                * debug exception and we return to userspace.
+                *
+                * If the guest attempts to single step its userspace
+                * we would have to deal with a trapped exception
+                * while in the guest kernel. Because this would be
+                * hard to unwind we suppress the guest's ability to
+                * do so by masking MDSCR_EL1.SS.
+                *
+                * This confuses guest debuggers which use
+                * single-step behind the scenes but everything
+                * returns to normal once the host is no longer
+                * debugging the system.
+                */
+               if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
+                       *vcpu_cpsr(vcpu) |=  DBG_SPSR_SS;
+                       vcpu_sys_reg(vcpu, MDSCR_EL1) |= DBG_MDSCR_SS;
+               } else {
+                       vcpu_sys_reg(vcpu, MDSCR_EL1) &= ~DBG_MDSCR_SS;
+               }
+
+               trace_kvm_arm_set_dreg32("SPSR_EL2", *vcpu_cpsr(vcpu));
+
+               /*
+                * HW Breakpoints and watchpoints
+                *
+                * We simply switch the debug_ptr to point to our new
+                * external_debug_state which has been populated by the
+                * debug ioctl. The existing KVM_ARM64_DEBUG_DIRTY
+                * mechanism ensures the registers are updated on the
+                * world switch.
+                */
+               if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) {
+                       /* Enable breakpoints/watchpoints */
+                       vcpu_sys_reg(vcpu, MDSCR_EL1) |= DBG_MDSCR_MDE;
+
+                       vcpu->arch.debug_ptr = &vcpu->arch.external_debug_state;
+                       vcpu->arch.debug_flags |= KVM_ARM64_DEBUG_DIRTY;
+                       trap_debug = true;
+
+                       trace_kvm_arm_set_regset("BKPTS", get_num_brps(),
+                                               &vcpu->arch.debug_ptr->dbg_bcr[0],
+                                               &vcpu->arch.debug_ptr->dbg_bvr[0]);
+
+                       trace_kvm_arm_set_regset("WAPTS", get_num_wrps(),
+                                               &vcpu->arch.debug_ptr->dbg_wcr[0],
+                                               &vcpu->arch.debug_ptr->dbg_wvr[0]);
+               }
+       }
+
+       BUG_ON(!vcpu->guest_debug &&
+               vcpu->arch.debug_ptr != &vcpu->arch.vcpu_debug_state);
+
+       /* Trap debug register access */
+       if (trap_debug)
+               vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA;
+
+       trace_kvm_arm_set_dreg32("MDCR_EL2", vcpu->arch.mdcr_el2);
+       trace_kvm_arm_set_dreg32("MDSCR_EL1", vcpu_sys_reg(vcpu, MDSCR_EL1));
+}
+
+void kvm_arm_clear_debug(struct kvm_vcpu *vcpu)
+{
+       trace_kvm_arm_clear_debug(vcpu->guest_debug);
+
+       if (vcpu->guest_debug) {
+               restore_guest_debug_regs(vcpu);
+
+               /*
+                * If we were using HW debug we need to restore the
+                * debug_ptr to the guest debug state.
+                */
+               if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) {
+                       kvm_arm_reset_debug_ptr(vcpu);
+
+                       trace_kvm_arm_set_regset("BKPTS", get_num_brps(),
+                                               &vcpu->arch.debug_ptr->dbg_bcr[0],
+                                               &vcpu->arch.debug_ptr->dbg_bvr[0]);
+
+                       trace_kvm_arm_set_regset("WAPTS", get_num_wrps(),
+                                               &vcpu->arch.debug_ptr->dbg_wcr[0],
+                                               &vcpu->arch.debug_ptr->dbg_wvr[0]);
+               }
+       }
+}
index 9535bd555d1d47cf190f78306eb879484d70a261..d250160d32bc68ae636c804b9cdfe2499bdddcb9 100644 (file)
@@ -32,6 +32,8 @@
 #include <asm/kvm_emulate.h>
 #include <asm/kvm_coproc.h>
 
+#include "trace.h"
+
 struct kvm_stats_debugfs_item debugfs_entries[] = {
        { NULL }
 };
@@ -293,7 +295,8 @@ int __attribute_const__ kvm_target_cpu(void)
                break;
        };
 
-       return -EINVAL;
+       /* Return a default generic target */
+       return KVM_ARM_TARGET_GENERIC_V8;
 }
 
 int kvm_vcpu_preferred_target(struct kvm_vcpu_init *init)
@@ -331,3 +334,41 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
 {
        return -EINVAL;
 }
+
+#define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE |    \
+                           KVM_GUESTDBG_USE_SW_BP | \
+                           KVM_GUESTDBG_USE_HW | \
+                           KVM_GUESTDBG_SINGLESTEP)
+
+/**
+ * kvm_arch_vcpu_ioctl_set_guest_debug - set up guest debugging
+ * @vcpu:      the vcpu pointer
+ * @dbg:       the ioctl data buffer
+ *
+ * This sets up and enables the VM for guest debugging. Userspace
+ * passes in a control flag to enable different debug types and
+ * potentially other architecture specific information in the rest of
+ * the structure.
+ */
+int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
+                                       struct kvm_guest_debug *dbg)
+{
+       trace_kvm_set_guest_debug(vcpu, dbg->control);
+
+       if (dbg->control & ~KVM_GUESTDBG_VALID_MASK)
+               return -EINVAL;
+
+       if (dbg->control & KVM_GUESTDBG_ENABLE) {
+               vcpu->guest_debug = dbg->control;
+
+               /* Hardware assisted Break and Watch points */
+               if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) {
+                       vcpu->arch.external_debug_state = dbg->arch;
+               }
+
+       } else {
+               /* If not enabled clear all flags */
+               vcpu->guest_debug = 0;
+       }
+       return 0;
+}
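
As a follow-on to the software-breakpoint sketch earlier, a hedged example of
the KVM_GUESTDBG_USE_HW path handled above: userspace supplies the breakpoint
registers in dbg->arch, which this function copies into external_debug_state.
The bcr_value argument is a placeholder; a real debugger would construct a
valid DBGBCR encoding (ARM ARM D7.3), which this sketch does not attempt.

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Program one hardware breakpoint (slot 0) and enable guest debug.
 * vcpu_fd is an assumed, already-created vcpu file descriptor.
 */
static int enable_hw_breakpoint(int vcpu_fd, __u64 addr, __u64 bcr_value)
{
        struct kvm_guest_debug dbg;

        memset(&dbg, 0, sizeof(dbg));
        dbg.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW;

        dbg.arch.dbg_bvr[0] = addr;      /* breakpoint address */
        dbg.arch.dbg_bcr[0] = bcr_value; /* control encoding, not derived here */

        return ioctl(vcpu_fd, KVM_SET_GUEST_DEBUG, &dbg);
}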
index 524fa25671fc85da0a0eafac65b95698b6de3729..68a0759b1375e3d6824b55b0c0548ba21c434ca0 100644 (file)
@@ -82,6 +82,45 @@ static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct kvm_run *run)
        return 1;
 }
 
+/**
+ * kvm_handle_guest_debug - handle a guest debug exception
+ *
+ * @vcpu:      the vcpu pointer
+ * @run:       access to the kvm_run structure for results
+ *
+ * We route all debug exceptions through the same handler. If both the
+ * guest and host are using the same debug facilities it will be up to
+ * userspace to re-inject the correct exception for guest delivery.
+ *
+ * @return: 0 (while setting run->exit_reason), -1 for error
+ */
+static int kvm_handle_guest_debug(struct kvm_vcpu *vcpu, struct kvm_run *run)
+{
+       u32 hsr = kvm_vcpu_get_hsr(vcpu);
+       int ret = 0;
+
+       run->exit_reason = KVM_EXIT_DEBUG;
+       run->debug.arch.hsr = hsr;
+
+       switch (hsr >> ESR_ELx_EC_SHIFT) {
+       case ESR_ELx_EC_WATCHPT_LOW:
+               run->debug.arch.far = vcpu->arch.fault.far_el2;
+               /* fall through */
+       case ESR_ELx_EC_SOFTSTP_LOW:
+       case ESR_ELx_EC_BREAKPT_LOW:
+       case ESR_ELx_EC_BKPT32:
+       case ESR_ELx_EC_BRK64:
+               break;
+       default:
+               kvm_err("%s: un-handled case hsr: %#08x\n",
+                       __func__, (unsigned int) hsr);
+               ret = -1;
+               break;
+       }
+
+       return ret;
+}
+
 static exit_handle_fn arm_exit_handlers[] = {
        [ESR_ELx_EC_WFx]        = kvm_handle_wfx,
        [ESR_ELx_EC_CP15_32]    = kvm_handle_cp15_32,
@@ -96,6 +135,11 @@ static exit_handle_fn arm_exit_handlers[] = {
        [ESR_ELx_EC_SYS64]      = kvm_handle_sys_reg,
        [ESR_ELx_EC_IABT_LOW]   = kvm_handle_guest_abort,
        [ESR_ELx_EC_DABT_LOW]   = kvm_handle_guest_abort,
+       [ESR_ELx_EC_SOFTSTP_LOW]= kvm_handle_guest_debug,
+       [ESR_ELx_EC_WATCHPT_LOW]= kvm_handle_guest_debug,
+       [ESR_ELx_EC_BREAKPT_LOW]= kvm_handle_guest_debug,
+       [ESR_ELx_EC_BKPT32]     = kvm_handle_guest_debug,
+       [ESR_ELx_EC_BRK64]      = kvm_handle_guest_debug,
 };
 
 static exit_handle_fn kvm_get_exit_handler(struct kvm_vcpu *vcpu)
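
Once one of the exception classes above routes to kvm_handle_guest_debug(), the
vcpu's KVM_RUN ioctl returns to userspace with exit_reason set to KVM_EXIT_DEBUG.
A rough sketch of the consumer side, assuming run points at the vcpu's mmap'ed
struct kvm_run:

#include <stdio.h>
#include <linux/kvm.h>

/* Inspect a debug exit after KVM_RUN. Returns 0 if it was a debug event.
 * run is assumed to be the vcpu's mmap'ed kvm_run region.
 */
static int report_debug_exit(struct kvm_run *run)
{
        if (run->exit_reason != KVM_EXIT_DEBUG)
                return -1;

        /* arch.hsr carries the syndrome; arch.far is only meaningful for
         * watchpoint exits, as filled in by kvm_handle_guest_debug().
         */
        printf("debug exit: hsr=%#x far=%#llx\n",
               run->debug.arch.hsr,
               (unsigned long long)run->debug.arch.far);
        return 0;
}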
index 10915aaf0b015b7281efa832e3ea50a90e42394c..37c89ea2c572ed858c0425344b45e0897a89b74f 100644 (file)
        stp     x24, x25, [x3, #160]
 .endm
 
-.macro save_debug
-       // x2: base address for cpu context
-       // x3: tmp register
-
-       mrs     x26, id_aa64dfr0_el1
-       ubfx    x24, x26, #12, #4       // Extract BRPs
-       ubfx    x25, x26, #20, #4       // Extract WRPs
-       mov     w26, #15
-       sub     w24, w26, w24           // How many BPs to skip
-       sub     w25, w26, w25           // How many WPs to skip
-
-       add     x3, x2, #CPU_SYSREG_OFFSET(DBGBCR0_EL1)
-
-       adr     x26, 1f
-       add     x26, x26, x24, lsl #2
-       br      x26
-1:
-       mrs     x20, dbgbcr15_el1
-       mrs     x19, dbgbcr14_el1
-       mrs     x18, dbgbcr13_el1
-       mrs     x17, dbgbcr12_el1
-       mrs     x16, dbgbcr11_el1
-       mrs     x15, dbgbcr10_el1
-       mrs     x14, dbgbcr9_el1
-       mrs     x13, dbgbcr8_el1
-       mrs     x12, dbgbcr7_el1
-       mrs     x11, dbgbcr6_el1
-       mrs     x10, dbgbcr5_el1
-       mrs     x9, dbgbcr4_el1
-       mrs     x8, dbgbcr3_el1
-       mrs     x7, dbgbcr2_el1
-       mrs     x6, dbgbcr1_el1
-       mrs     x5, dbgbcr0_el1
-
-       adr     x26, 1f
-       add     x26, x26, x24, lsl #2
-       br      x26
-
-1:
-       str     x20, [x3, #(15 * 8)]
-       str     x19, [x3, #(14 * 8)]
-       str     x18, [x3, #(13 * 8)]
-       str     x17, [x3, #(12 * 8)]
-       str     x16, [x3, #(11 * 8)]
-       str     x15, [x3, #(10 * 8)]
-       str     x14, [x3, #(9 * 8)]
-       str     x13, [x3, #(8 * 8)]
-       str     x12, [x3, #(7 * 8)]
-       str     x11, [x3, #(6 * 8)]
-       str     x10, [x3, #(5 * 8)]
-       str     x9, [x3, #(4 * 8)]
-       str     x8, [x3, #(3 * 8)]
-       str     x7, [x3, #(2 * 8)]
-       str     x6, [x3, #(1 * 8)]
-       str     x5, [x3, #(0 * 8)]
-
-       add     x3, x2, #CPU_SYSREG_OFFSET(DBGBVR0_EL1)
-
-       adr     x26, 1f
-       add     x26, x26, x24, lsl #2
-       br      x26
+.macro save_debug type
+       // x4: pointer to register set
+       // x5: number of registers to skip
+       // x6..x22 trashed
+
+       adr     x22, 1f
+       add     x22, x22, x5, lsl #2
+       br      x22
 1:
-       mrs     x20, dbgbvr15_el1
-       mrs     x19, dbgbvr14_el1
-       mrs     x18, dbgbvr13_el1
-       mrs     x17, dbgbvr12_el1
-       mrs     x16, dbgbvr11_el1
-       mrs     x15, dbgbvr10_el1
-       mrs     x14, dbgbvr9_el1
-       mrs     x13, dbgbvr8_el1
-       mrs     x12, dbgbvr7_el1
-       mrs     x11, dbgbvr6_el1
-       mrs     x10, dbgbvr5_el1
-       mrs     x9, dbgbvr4_el1
-       mrs     x8, dbgbvr3_el1
-       mrs     x7, dbgbvr2_el1
-       mrs     x6, dbgbvr1_el1
-       mrs     x5, dbgbvr0_el1
-
-       adr     x26, 1f
-       add     x26, x26, x24, lsl #2
-       br      x26
-
-1:
-       str     x20, [x3, #(15 * 8)]
-       str     x19, [x3, #(14 * 8)]
-       str     x18, [x3, #(13 * 8)]
-       str     x17, [x3, #(12 * 8)]
-       str     x16, [x3, #(11 * 8)]
-       str     x15, [x3, #(10 * 8)]
-       str     x14, [x3, #(9 * 8)]
-       str     x13, [x3, #(8 * 8)]
-       str     x12, [x3, #(7 * 8)]
-       str     x11, [x3, #(6 * 8)]
-       str     x10, [x3, #(5 * 8)]
-       str     x9, [x3, #(4 * 8)]
-       str     x8, [x3, #(3 * 8)]
-       str     x7, [x3, #(2 * 8)]
-       str     x6, [x3, #(1 * 8)]
-       str     x5, [x3, #(0 * 8)]
-
-       add     x3, x2, #CPU_SYSREG_OFFSET(DBGWCR0_EL1)
-
-       adr     x26, 1f
-       add     x26, x26, x25, lsl #2
-       br      x26
-1:
-       mrs     x20, dbgwcr15_el1
-       mrs     x19, dbgwcr14_el1
-       mrs     x18, dbgwcr13_el1
-       mrs     x17, dbgwcr12_el1
-       mrs     x16, dbgwcr11_el1
-       mrs     x15, dbgwcr10_el1
-       mrs     x14, dbgwcr9_el1
-       mrs     x13, dbgwcr8_el1
-       mrs     x12, dbgwcr7_el1
-       mrs     x11, dbgwcr6_el1
-       mrs     x10, dbgwcr5_el1
-       mrs     x9, dbgwcr4_el1
-       mrs     x8, dbgwcr3_el1
-       mrs     x7, dbgwcr2_el1
-       mrs     x6, dbgwcr1_el1
-       mrs     x5, dbgwcr0_el1
-
-       adr     x26, 1f
-       add     x26, x26, x25, lsl #2
-       br      x26
-
-1:
-       str     x20, [x3, #(15 * 8)]
-       str     x19, [x3, #(14 * 8)]
-       str     x18, [x3, #(13 * 8)]
-       str     x17, [x3, #(12 * 8)]
-       str     x16, [x3, #(11 * 8)]
-       str     x15, [x3, #(10 * 8)]
-       str     x14, [x3, #(9 * 8)]
-       str     x13, [x3, #(8 * 8)]
-       str     x12, [x3, #(7 * 8)]
-       str     x11, [x3, #(6 * 8)]
-       str     x10, [x3, #(5 * 8)]
-       str     x9, [x3, #(4 * 8)]
-       str     x8, [x3, #(3 * 8)]
-       str     x7, [x3, #(2 * 8)]
-       str     x6, [x3, #(1 * 8)]
-       str     x5, [x3, #(0 * 8)]
-
-       add     x3, x2, #CPU_SYSREG_OFFSET(DBGWVR0_EL1)
-
-       adr     x26, 1f
-       add     x26, x26, x25, lsl #2
-       br      x26
-1:
-       mrs     x20, dbgwvr15_el1
-       mrs     x19, dbgwvr14_el1
-       mrs     x18, dbgwvr13_el1
-       mrs     x17, dbgwvr12_el1
-       mrs     x16, dbgwvr11_el1
-       mrs     x15, dbgwvr10_el1
-       mrs     x14, dbgwvr9_el1
-       mrs     x13, dbgwvr8_el1
-       mrs     x12, dbgwvr7_el1
-       mrs     x11, dbgwvr6_el1
-       mrs     x10, dbgwvr5_el1
-       mrs     x9, dbgwvr4_el1
-       mrs     x8, dbgwvr3_el1
-       mrs     x7, dbgwvr2_el1
-       mrs     x6, dbgwvr1_el1
-       mrs     x5, dbgwvr0_el1
-
-       adr     x26, 1f
-       add     x26, x26, x25, lsl #2
-       br      x26
-
+       mrs     x21, \type\()15_el1
+       mrs     x20, \type\()14_el1
+       mrs     x19, \type\()13_el1
+       mrs     x18, \type\()12_el1
+       mrs     x17, \type\()11_el1
+       mrs     x16, \type\()10_el1
+       mrs     x15, \type\()9_el1
+       mrs     x14, \type\()8_el1
+       mrs     x13, \type\()7_el1
+       mrs     x12, \type\()6_el1
+       mrs     x11, \type\()5_el1
+       mrs     x10, \type\()4_el1
+       mrs     x9, \type\()3_el1
+       mrs     x8, \type\()2_el1
+       mrs     x7, \type\()1_el1
+       mrs     x6, \type\()0_el1
+
+       adr     x22, 1f
+       add     x22, x22, x5, lsl #2
+       br      x22
 1:
-       str     x20, [x3, #(15 * 8)]
-       str     x19, [x3, #(14 * 8)]
-       str     x18, [x3, #(13 * 8)]
-       str     x17, [x3, #(12 * 8)]
-       str     x16, [x3, #(11 * 8)]
-       str     x15, [x3, #(10 * 8)]
-       str     x14, [x3, #(9 * 8)]
-       str     x13, [x3, #(8 * 8)]
-       str     x12, [x3, #(7 * 8)]
-       str     x11, [x3, #(6 * 8)]
-       str     x10, [x3, #(5 * 8)]
-       str     x9, [x3, #(4 * 8)]
-       str     x8, [x3, #(3 * 8)]
-       str     x7, [x3, #(2 * 8)]
-       str     x6, [x3, #(1 * 8)]
-       str     x5, [x3, #(0 * 8)]
-
-       mrs     x21, mdccint_el1
-       str     x21, [x2, #CPU_SYSREG_OFFSET(MDCCINT_EL1)]
+       str     x21, [x4, #(15 * 8)]
+       str     x20, [x4, #(14 * 8)]
+       str     x19, [x4, #(13 * 8)]
+       str     x18, [x4, #(12 * 8)]
+       str     x17, [x4, #(11 * 8)]
+       str     x16, [x4, #(10 * 8)]
+       str     x15, [x4, #(9 * 8)]
+       str     x14, [x4, #(8 * 8)]
+       str     x13, [x4, #(7 * 8)]
+       str     x12, [x4, #(6 * 8)]
+       str     x11, [x4, #(5 * 8)]
+       str     x10, [x4, #(4 * 8)]
+       str     x9, [x4, #(3 * 8)]
+       str     x8, [x4, #(2 * 8)]
+       str     x7, [x4, #(1 * 8)]
+       str     x6, [x4, #(0 * 8)]
 .endm
 
 .macro restore_sysregs
        msr     mdscr_el1,      x25
 .endm
 
-.macro restore_debug
-       // x2: base address for cpu context
-       // x3: tmp register
-
-       mrs     x26, id_aa64dfr0_el1
-       ubfx    x24, x26, #12, #4       // Extract BRPs
-       ubfx    x25, x26, #20, #4       // Extract WRPs
-       mov     w26, #15
-       sub     w24, w26, w24           // How many BPs to skip
-       sub     w25, w26, w25           // How many WPs to skip
-
-       add     x3, x2, #CPU_SYSREG_OFFSET(DBGBCR0_EL1)
+.macro restore_debug type
+       // x4: pointer to register set
+       // x5: number of registers to skip
+       // x6..x22 trashed
 
-       adr     x26, 1f
-       add     x26, x26, x24, lsl #2
-       br      x26
-1:
-       ldr     x20, [x3, #(15 * 8)]
-       ldr     x19, [x3, #(14 * 8)]
-       ldr     x18, [x3, #(13 * 8)]
-       ldr     x17, [x3, #(12 * 8)]
-       ldr     x16, [x3, #(11 * 8)]
-       ldr     x15, [x3, #(10 * 8)]
-       ldr     x14, [x3, #(9 * 8)]
-       ldr     x13, [x3, #(8 * 8)]
-       ldr     x12, [x3, #(7 * 8)]
-       ldr     x11, [x3, #(6 * 8)]
-       ldr     x10, [x3, #(5 * 8)]
-       ldr     x9, [x3, #(4 * 8)]
-       ldr     x8, [x3, #(3 * 8)]
-       ldr     x7, [x3, #(2 * 8)]
-       ldr     x6, [x3, #(1 * 8)]
-       ldr     x5, [x3, #(0 * 8)]
-
-       adr     x26, 1f
-       add     x26, x26, x24, lsl #2
-       br      x26
+       adr     x22, 1f
+       add     x22, x22, x5, lsl #2
+       br      x22
 1:
-       msr     dbgbcr15_el1, x20
-       msr     dbgbcr14_el1, x19
-       msr     dbgbcr13_el1, x18
-       msr     dbgbcr12_el1, x17
-       msr     dbgbcr11_el1, x16
-       msr     dbgbcr10_el1, x15
-       msr     dbgbcr9_el1, x14
-       msr     dbgbcr8_el1, x13
-       msr     dbgbcr7_el1, x12
-       msr     dbgbcr6_el1, x11
-       msr     dbgbcr5_el1, x10
-       msr     dbgbcr4_el1, x9
-       msr     dbgbcr3_el1, x8
-       msr     dbgbcr2_el1, x7
-       msr     dbgbcr1_el1, x6
-       msr     dbgbcr0_el1, x5
-
-       add     x3, x2, #CPU_SYSREG_OFFSET(DBGBVR0_EL1)
-
-       adr     x26, 1f
-       add     x26, x26, x24, lsl #2
-       br      x26
+       ldr     x21, [x4, #(15 * 8)]
+       ldr     x20, [x4, #(14 * 8)]
+       ldr     x19, [x4, #(13 * 8)]
+       ldr     x18, [x4, #(12 * 8)]
+       ldr     x17, [x4, #(11 * 8)]
+       ldr     x16, [x4, #(10 * 8)]
+       ldr     x15, [x4, #(9 * 8)]
+       ldr     x14, [x4, #(8 * 8)]
+       ldr     x13, [x4, #(7 * 8)]
+       ldr     x12, [x4, #(6 * 8)]
+       ldr     x11, [x4, #(5 * 8)]
+       ldr     x10, [x4, #(4 * 8)]
+       ldr     x9, [x4, #(3 * 8)]
+       ldr     x8, [x4, #(2 * 8)]
+       ldr     x7, [x4, #(1 * 8)]
+       ldr     x6, [x4, #(0 * 8)]
+
+       adr     x22, 1f
+       add     x22, x22, x5, lsl #2
+       br      x22
 1:
-       ldr     x20, [x3, #(15 * 8)]
-       ldr     x19, [x3, #(14 * 8)]
-       ldr     x18, [x3, #(13 * 8)]
-       ldr     x17, [x3, #(12 * 8)]
-       ldr     x16, [x3, #(11 * 8)]
-       ldr     x15, [x3, #(10 * 8)]
-       ldr     x14, [x3, #(9 * 8)]
-       ldr     x13, [x3, #(8 * 8)]
-       ldr     x12, [x3, #(7 * 8)]
-       ldr     x11, [x3, #(6 * 8)]
-       ldr     x10, [x3, #(5 * 8)]
-       ldr     x9, [x3, #(4 * 8)]
-       ldr     x8, [x3, #(3 * 8)]
-       ldr     x7, [x3, #(2 * 8)]
-       ldr     x6, [x3, #(1 * 8)]
-       ldr     x5, [x3, #(0 * 8)]
-
-       adr     x26, 1f
-       add     x26, x26, x24, lsl #2
-       br      x26
-1:
-       msr     dbgbvr15_el1, x20
-       msr     dbgbvr14_el1, x19
-       msr     dbgbvr13_el1, x18
-       msr     dbgbvr12_el1, x17
-       msr     dbgbvr11_el1, x16
-       msr     dbgbvr10_el1, x15
-       msr     dbgbvr9_el1, x14
-       msr     dbgbvr8_el1, x13
-       msr     dbgbvr7_el1, x12
-       msr     dbgbvr6_el1, x11
-       msr     dbgbvr5_el1, x10
-       msr     dbgbvr4_el1, x9
-       msr     dbgbvr3_el1, x8
-       msr     dbgbvr2_el1, x7
-       msr     dbgbvr1_el1, x6
-       msr     dbgbvr0_el1, x5
-
-       add     x3, x2, #CPU_SYSREG_OFFSET(DBGWCR0_EL1)
-
-       adr     x26, 1f
-       add     x26, x26, x25, lsl #2
-       br      x26
-1:
-       ldr     x20, [x3, #(15 * 8)]
-       ldr     x19, [x3, #(14 * 8)]
-       ldr     x18, [x3, #(13 * 8)]
-       ldr     x17, [x3, #(12 * 8)]
-       ldr     x16, [x3, #(11 * 8)]
-       ldr     x15, [x3, #(10 * 8)]
-       ldr     x14, [x3, #(9 * 8)]
-       ldr     x13, [x3, #(8 * 8)]
-       ldr     x12, [x3, #(7 * 8)]
-       ldr     x11, [x3, #(6 * 8)]
-       ldr     x10, [x3, #(5 * 8)]
-       ldr     x9, [x3, #(4 * 8)]
-       ldr     x8, [x3, #(3 * 8)]
-       ldr     x7, [x3, #(2 * 8)]
-       ldr     x6, [x3, #(1 * 8)]
-       ldr     x5, [x3, #(0 * 8)]
-
-       adr     x26, 1f
-       add     x26, x26, x25, lsl #2
-       br      x26
-1:
-       msr     dbgwcr15_el1, x20
-       msr     dbgwcr14_el1, x19
-       msr     dbgwcr13_el1, x18
-       msr     dbgwcr12_el1, x17
-       msr     dbgwcr11_el1, x16
-       msr     dbgwcr10_el1, x15
-       msr     dbgwcr9_el1, x14
-       msr     dbgwcr8_el1, x13
-       msr     dbgwcr7_el1, x12
-       msr     dbgwcr6_el1, x11
-       msr     dbgwcr5_el1, x10
-       msr     dbgwcr4_el1, x9
-       msr     dbgwcr3_el1, x8
-       msr     dbgwcr2_el1, x7
-       msr     dbgwcr1_el1, x6
-       msr     dbgwcr0_el1, x5
-
-       add     x3, x2, #CPU_SYSREG_OFFSET(DBGWVR0_EL1)
-
-       adr     x26, 1f
-       add     x26, x26, x25, lsl #2
-       br      x26
-1:
-       ldr     x20, [x3, #(15 * 8)]
-       ldr     x19, [x3, #(14 * 8)]
-       ldr     x18, [x3, #(13 * 8)]
-       ldr     x17, [x3, #(12 * 8)]
-       ldr     x16, [x3, #(11 * 8)]
-       ldr     x15, [x3, #(10 * 8)]
-       ldr     x14, [x3, #(9 * 8)]
-       ldr     x13, [x3, #(8 * 8)]
-       ldr     x12, [x3, #(7 * 8)]
-       ldr     x11, [x3, #(6 * 8)]
-       ldr     x10, [x3, #(5 * 8)]
-       ldr     x9, [x3, #(4 * 8)]
-       ldr     x8, [x3, #(3 * 8)]
-       ldr     x7, [x3, #(2 * 8)]
-       ldr     x6, [x3, #(1 * 8)]
-       ldr     x5, [x3, #(0 * 8)]
-
-       adr     x26, 1f
-       add     x26, x26, x25, lsl #2
-       br      x26
-1:
-       msr     dbgwvr15_el1, x20
-       msr     dbgwvr14_el1, x19
-       msr     dbgwvr13_el1, x18
-       msr     dbgwvr12_el1, x17
-       msr     dbgwvr11_el1, x16
-       msr     dbgwvr10_el1, x15
-       msr     dbgwvr9_el1, x14
-       msr     dbgwvr8_el1, x13
-       msr     dbgwvr7_el1, x12
-       msr     dbgwvr6_el1, x11
-       msr     dbgwvr5_el1, x10
-       msr     dbgwvr4_el1, x9
-       msr     dbgwvr3_el1, x8
-       msr     dbgwvr2_el1, x7
-       msr     dbgwvr1_el1, x6
-       msr     dbgwvr0_el1, x5
-
-       ldr     x21, [x2, #CPU_SYSREG_OFFSET(MDCCINT_EL1)]
-       msr     mdccint_el1, x21
+       msr     \type\()15_el1, x21
+       msr     \type\()14_el1, x20
+       msr     \type\()13_el1, x19
+       msr     \type\()12_el1, x18
+       msr     \type\()11_el1, x17
+       msr     \type\()10_el1, x16
+       msr     \type\()9_el1, x15
+       msr     \type\()8_el1, x14
+       msr     \type\()7_el1, x13
+       msr     \type\()6_el1, x12
+       msr     \type\()5_el1, x11
+       msr     \type\()4_el1, x10
+       msr     \type\()3_el1, x9
+       msr     \type\()2_el1, x8
+       msr     \type\()1_el1, x7
+       msr     \type\()0_el1, x6
 .endm
 
 .macro skip_32bit_state tmp, target
        tbz     \tmp, #KVM_ARM64_DEBUG_DIRTY_SHIFT, \target
 .endm
 
+/*
+ * Branch to target if CPTR_EL2.TFP bit is set (VFP/SIMD trapping enabled)
+ */
+.macro skip_fpsimd_state tmp, target
+       mrs     \tmp, cptr_el2
+       tbnz    \tmp, #CPTR_EL2_TFP_SHIFT, \target
+.endm
+
 .macro compute_debug_state target
        // Compute debug state: If any of KDE, MDE or KVM_ARM64_DEBUG_DIRTY
        // is set, we do a full save/restore cycle and disable trapping.
        add     x3, x2, #CPU_SYSREG_OFFSET(DACR32_EL2)
        mrs     x4, dacr32_el2
        mrs     x5, ifsr32_el2
-       mrs     x6, fpexc32_el2
        stp     x4, x5, [x3]
-       str     x6, [x3, #16]
 
+       skip_fpsimd_state x8, 3f
+       mrs     x6, fpexc32_el2
+       str     x6, [x3, #16]
+3:
        skip_debug_state x8, 2f
        mrs     x7, dbgvcr32_el2
        str     x7, [x3, #24]
 
        add     x3, x2, #CPU_SYSREG_OFFSET(DACR32_EL2)
        ldp     x4, x5, [x3]
-       ldr     x6, [x3, #16]
        msr     dacr32_el2, x4
        msr     ifsr32_el2, x5
-       msr     fpexc32_el2, x6
 
        skip_debug_state x8, 2f
        ldr     x7, [x3, #24]
 
 .macro activate_traps
        ldr     x2, [x0, #VCPU_HCR_EL2]
+
+       /*
+        * We are about to set CPTR_EL2.TFP to trap all floating point
+        * register accesses to EL2, however, the ARM ARM clearly states that
+        * traps are only taken to EL2 if the operation would not otherwise
+        * trap to EL1.  Therefore, always make sure that for 32-bit guests,
+        * we set FPEXC.EN to prevent traps to EL1, when setting the TFP bit.
+        */
+       tbnz    x2, #HCR_RW_SHIFT, 99f // open code skip_32bit_state
+       mov     x3, #(1 << 30)
+       msr     fpexc32_el2, x3
+       isb
+99:
        msr     hcr_el2, x2
        mov     x2, #CPTR_EL2_TTA
+       orr     x2, x2, #CPTR_EL2_TFP
        msr     cptr_el2, x2
 
        mov     x2, #(1 << 15)  // Trap CP15 Cr=15
        msr     hstr_el2, x2
 
-       mrs     x2, mdcr_el2
-       and     x2, x2, #MDCR_EL2_HPMN_MASK
-       orr     x2, x2, #(MDCR_EL2_TPM | MDCR_EL2_TPMCR)
-       orr     x2, x2, #(MDCR_EL2_TDRA | MDCR_EL2_TDOSA)
-
-       // Check for KVM_ARM64_DEBUG_DIRTY, and set debug to trap
-       // if not dirty.
-       ldr     x3, [x0, #VCPU_DEBUG_FLAGS]
-       tbnz    x3, #KVM_ARM64_DEBUG_DIRTY_SHIFT, 1f
-       orr     x2, x2,  #MDCR_EL2_TDA
-1:
+       // Monitor Debug Config - see kvm_arm_setup_debug()
+       ldr     x2, [x0, #VCPU_MDCR_EL2]
        msr     mdcr_el2, x2
 .endm
 
 .macro deactivate_traps
        mov     x2, #HCR_RW
        msr     hcr_el2, x2
-       msr     cptr_el2, xzr
        msr     hstr_el2, xzr
 
        mrs     x2, mdcr_el2
@@ -900,21 +622,101 @@ __restore_sysregs:
        restore_sysregs
        ret
 
+/* Save debug state */
 __save_debug:
-       save_debug
+       // x2: ptr to CPU context
+       // x3: ptr to debug reg struct
+       // x4/x5/x6-22/x24-26: trashed
+
+       mrs     x26, id_aa64dfr0_el1
+       ubfx    x24, x26, #12, #4       // Extract BRPs
+       ubfx    x25, x26, #20, #4       // Extract WRPs
+       mov     w26, #15
+       sub     w24, w26, w24           // How many BPs to skip
+       sub     w25, w26, w25           // How many WPs to skip
+
+       mov     x5, x24
+       add     x4, x3, #DEBUG_BCR
+       save_debug dbgbcr
+       add     x4, x3, #DEBUG_BVR
+       save_debug dbgbvr
+
+       mov     x5, x25
+       add     x4, x3, #DEBUG_WCR
+       save_debug dbgwcr
+       add     x4, x3, #DEBUG_WVR
+       save_debug dbgwvr
+
+       mrs     x21, mdccint_el1
+       str     x21, [x2, #CPU_SYSREG_OFFSET(MDCCINT_EL1)]
        ret
 
+/* Restore debug state */
 __restore_debug:
-       restore_debug
+       // x2: ptr to CPU context
+       // x3: ptr to debug reg struct
+       // x4/x5/x6-22/x24-26: trashed
+
+       mrs     x26, id_aa64dfr0_el1
+       ubfx    x24, x26, #12, #4       // Extract BRPs
+       ubfx    x25, x26, #20, #4       // Extract WRPs
+       mov     w26, #15
+       sub     w24, w26, w24           // How many BPs to skip
+       sub     w25, w26, w25           // How many WPs to skip
+
+       mov     x5, x24
+       add     x4, x3, #DEBUG_BCR
+       restore_debug dbgbcr
+       add     x4, x3, #DEBUG_BVR
+       restore_debug dbgbvr
+
+       mov     x5, x25
+       add     x4, x3, #DEBUG_WCR
+       restore_debug dbgwcr
+       add     x4, x3, #DEBUG_WVR
+       restore_debug dbgwvr
+
+       ldr     x21, [x2, #CPU_SYSREG_OFFSET(MDCCINT_EL1)]
+       msr     mdccint_el1, x21
+
        ret
 
 __save_fpsimd:
+       skip_fpsimd_state x3, 1f
        save_fpsimd
-       ret
+1:     ret
 
 __restore_fpsimd:
+       skip_fpsimd_state x3, 1f
        restore_fpsimd
-       ret
+1:     ret
+
+switch_to_guest_fpsimd:
+       push    x4, lr
+
+       mrs     x2, cptr_el2
+       bic     x2, x2, #CPTR_EL2_TFP
+       msr     cptr_el2, x2
+       isb
+
+       mrs     x0, tpidr_el2
+
+       ldr     x2, [x0, #VCPU_HOST_CONTEXT]
+       kern_hyp_va x2
+       bl __save_fpsimd
+
+       add     x2, x0, #VCPU_CONTEXT
+       bl __restore_fpsimd
+
+       skip_32bit_state x3, 1f
+       ldr     x4, [x2, #CPU_SYSREG_OFFSET(FPEXC32_EL2)]
+       msr     fpexc32_el2, x4
+1:
+       pop     x4, lr
+       pop     x2, x3
+       pop     x0, x1
+
+       eret
 
 /*
  * u64 __kvm_vcpu_run(struct kvm_vcpu *vcpu);
@@ -936,10 +738,10 @@ ENTRY(__kvm_vcpu_run)
        kern_hyp_va x2
 
        save_host_regs
-       bl __save_fpsimd
        bl __save_sysregs
 
        compute_debug_state 1f
+       add     x3, x0, #VCPU_HOST_DEBUG_STATE
        bl      __save_debug
 1:
        activate_traps
@@ -952,9 +754,10 @@ ENTRY(__kvm_vcpu_run)
        add     x2, x0, #VCPU_CONTEXT
 
        bl __restore_sysregs
-       bl __restore_fpsimd
 
        skip_debug_state x3, 1f
+       ldr     x3, [x0, #VCPU_DEBUG_PTR]
+       kern_hyp_va x3
        bl      __restore_debug
 1:
        restore_guest_32bit_state
@@ -975,6 +778,8 @@ __kvm_vcpu_return:
        bl __save_sysregs
 
        skip_debug_state x3, 1f
+       ldr     x3, [x0, #VCPU_DEBUG_PTR]
+       kern_hyp_va x3
        bl      __save_debug
 1:
        save_guest_32bit_state
@@ -991,12 +796,15 @@ __kvm_vcpu_return:
 
        bl __restore_sysregs
        bl __restore_fpsimd
+       /* Clear FPSIMD and Trace trapping */
+       msr     cptr_el2, xzr
 
        skip_debug_state x3, 1f
        // Clear the dirty flag for the next run, as all the state has
        // already been saved. Note that we nuke the whole 64bit word.
        // If we ever add more flags, we'll have to be more careful...
        str     xzr, [x0, #VCPU_DEBUG_FLAGS]
+       add     x3, x0, #VCPU_HOST_DEBUG_STATE
        bl      __restore_debug
 1:
        restore_host_regs
@@ -1199,6 +1007,11 @@ el1_trap:
         * x1: ESR
         * x2: ESR_EC
         */
+
+       /* Guest accessed VFP/SIMD registers, save host, restore Guest */
+       cmp     x2, #ESR_ELx_EC_FP_ASIMD
+       b.eq    switch_to_guest_fpsimd
+
        cmp     x2, #ESR_ELx_EC_DABT_LOW
        mov     x0, #ESR_ELx_EC_IABT_LOW
        ccmp    x2, x0, #4, ne
@@ -1293,4 +1106,10 @@ ENTRY(__kvm_hyp_vector)
        ventry  el1_error_invalid               // Error 32-bit EL1
 ENDPROC(__kvm_hyp_vector)
 
+
+ENTRY(__kvm_get_mdcr_el2)
+       mrs     x0, mdcr_el2
+       ret
+ENDPROC(__kvm_get_mdcr_el2)
+
        .popsection
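
The hyp.S rework above makes FP/SIMD switching lazy: activate_traps sets CPTR_EL2.TFP so the guest's first FP/SIMD access traps to EL2 (el1_trap branches to switch_to_guest_fpsimd), and only then is the host FP state saved and the guest state restored; __kvm_vcpu_run no longer switches eagerly. A toy, userspace-compilable C sketch of that state machine follows -- all names and the structure layout are illustrative, not the kernel's:

#include <stdbool.h>
#include <stdio.h>

/* Toy model of the lazy FP/SIMD switch; purely illustrative. */
struct toy_vcpu {
	bool fp_trap_enabled;	/* models CPTR_EL2.TFP */
	bool guest_fp_loaded;	/* whose FP registers are live */
};

static void guest_entry(struct toy_vcpu *v)
{
	/* On entry, just arm the trap; no FP registers are moved yet. */
	v->fp_trap_enabled = true;
	v->guest_fp_loaded = false;
}

static void guest_uses_fp(struct toy_vcpu *v)
{
	if (v->fp_trap_enabled) {
		/* First FP/SIMD access traps: switch contexts once, then
		 * let the guest use FP freely for the rest of this run. */
		printf("trap: save host FP, restore guest FP\n");
		v->fp_trap_enabled = false;
		v->guest_fp_loaded = true;
	}
}

static void guest_exit(struct toy_vcpu *v)
{
	/* Only write back guest FP state if the guest actually used it. */
	if (v->guest_fp_loaded)
		printf("exit: save guest FP, restore host FP\n");
}

int main(void)
{
	struct toy_vcpu v;

	guest_entry(&v);
	guest_uses_fp(&v);	/* triggers the one-time switch */
	guest_uses_fp(&v);	/* no further trapping */
	guest_exit(&v);
	return 0;
}

The pay-off is that a guest which never touches FP/SIMD during a run skips both the save and the restore entirely.
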
index 0b43265789858cbe71f761eebbc48927834b7fe8..91cf5350b3283232cd6d88aca9af669ce972c697 100644 (file)
@@ -22,6 +22,7 @@
 #include <linux/errno.h>
 #include <linux/kvm_host.h>
 #include <linux/kvm.h>
+#include <linux/hw_breakpoint.h>
 
 #include <kvm/arm_arch_timer.h>
 
@@ -56,6 +57,12 @@ static bool cpu_has_32bit_el1(void)
        return !!(pfr0 & 0x20);
 }
 
+/**
+ * kvm_arch_dev_ioctl_check_extension
+ *
+ * We currently assume that the number of HW registers is uniform
+ * across all CPUs (see cpuinfo_sanity_check).
+ */
 int kvm_arch_dev_ioctl_check_extension(long ext)
 {
        int r;
@@ -64,6 +71,15 @@ int kvm_arch_dev_ioctl_check_extension(long ext)
        case KVM_CAP_ARM_EL1_32BIT:
                r = cpu_has_32bit_el1();
                break;
+       case KVM_CAP_GUEST_DEBUG_HW_BPS:
+               r = get_num_brps();
+               break;
+       case KVM_CAP_GUEST_DEBUG_HW_WPS:
+               r = get_num_wrps();
+               break;
+       case KVM_CAP_SET_GUEST_DEBUG:
+               r = 1;
+               break;
        default:
                r = 0;
        }
@@ -105,7 +121,5 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
        kvm_reset_sys_regs(vcpu);
 
        /* Reset timer */
-       kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq);
-
-       return 0;
+       return kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq);
 }
index c370b4014799697c292a99cf24721a425fcb3790..b41607d270ac83ebd1413a185753bf0b9e2af7f0 100644 (file)
@@ -38,6 +38,8 @@
 
 #include "sys_regs.h"
 
+#include "trace.h"
+
 /*
  * All of this file is extremely similar to the ARM coproc.c, but the
  * types are different. My gut feeling is that it should be pretty
@@ -208,9 +210,217 @@ static bool trap_debug_regs(struct kvm_vcpu *vcpu,
                *vcpu_reg(vcpu, p->Rt) = vcpu_sys_reg(vcpu, r->reg);
        }
 
+       trace_trap_reg(__func__, r->reg, p->is_write, *vcpu_reg(vcpu, p->Rt));
+
+       return true;
+}
+
+/*
+ * reg_to_dbg/dbg_to_reg
+ *
+ * A 32 bit write to a debug register leaves the top bits alone
+ * A 32 bit read from a debug register only returns the bottom bits
+ *
+ * All writes will set the KVM_ARM64_DEBUG_DIRTY flag to ensure the
+ * hyp.S code switches between host and guest values in future.
+ */
+static inline void reg_to_dbg(struct kvm_vcpu *vcpu,
+                             const struct sys_reg_params *p,
+                             u64 *dbg_reg)
+{
+       u64 val = *vcpu_reg(vcpu, p->Rt);
+
+       if (p->is_32bit) {
+               val &= 0xffffffffUL;
+               val |= ((*dbg_reg >> 32) << 32);
+       }
+
+       *dbg_reg = val;
+       vcpu->arch.debug_flags |= KVM_ARM64_DEBUG_DIRTY;
+}
+
+static inline void dbg_to_reg(struct kvm_vcpu *vcpu,
+                             const struct sys_reg_params *p,
+                             u64 *dbg_reg)
+{
+       u64 val = *dbg_reg;
+
+       if (p->is_32bit)
+               val &= 0xffffffffUL;
+
+       *vcpu_reg(vcpu, p->Rt) = val;
+}
+
+static inline bool trap_bvr(struct kvm_vcpu *vcpu,
+                           const struct sys_reg_params *p,
+                           const struct sys_reg_desc *rd)
+{
+       u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg];
+
+       if (p->is_write)
+               reg_to_dbg(vcpu, p, dbg_reg);
+       else
+               dbg_to_reg(vcpu, p, dbg_reg);
+
+       trace_trap_reg(__func__, rd->reg, p->is_write, *dbg_reg);
+
+       return true;
+}
+
+static int set_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+               const struct kvm_one_reg *reg, void __user *uaddr)
+{
+       __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg];
+
+       if (copy_from_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0)
+               return -EFAULT;
+       return 0;
+}
+
+static int get_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+       const struct kvm_one_reg *reg, void __user *uaddr)
+{
+       __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg];
+
+       if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0)
+               return -EFAULT;
+       return 0;
+}
+
+static inline void reset_bvr(struct kvm_vcpu *vcpu,
+                            const struct sys_reg_desc *rd)
+{
+       vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg] = rd->val;
+}
+
+static inline bool trap_bcr(struct kvm_vcpu *vcpu,
+                           const struct sys_reg_params *p,
+                           const struct sys_reg_desc *rd)
+{
+       u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->reg];
+
+       if (p->is_write)
+               reg_to_dbg(vcpu, p, dbg_reg);
+       else
+               dbg_to_reg(vcpu, p, dbg_reg);
+
+       trace_trap_reg(__func__, rd->reg, p->is_write, *dbg_reg);
+
+       return true;
+}
+
+static int set_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+               const struct kvm_one_reg *reg, void __user *uaddr)
+{
+       __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->reg];
+
+       if (copy_from_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0)
+               return -EFAULT;
+
+       return 0;
+}
+
+static int get_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+       const struct kvm_one_reg *reg, void __user *uaddr)
+{
+       __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->reg];
+
+       if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0)
+               return -EFAULT;
+       return 0;
+}
+
+static inline void reset_bcr(struct kvm_vcpu *vcpu,
+                            const struct sys_reg_desc *rd)
+{
+       vcpu->arch.vcpu_debug_state.dbg_bcr[rd->reg] = rd->val;
+}
+
+static inline bool trap_wvr(struct kvm_vcpu *vcpu,
+                           const struct sys_reg_params *p,
+                           const struct sys_reg_desc *rd)
+{
+       u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg];
+
+       if (p->is_write)
+               reg_to_dbg(vcpu, p, dbg_reg);
+       else
+               dbg_to_reg(vcpu, p, dbg_reg);
+
+       trace_trap_reg(__func__, rd->reg, p->is_write,
+               vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg]);
+
        return true;
 }
 
+static int set_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+               const struct kvm_one_reg *reg, void __user *uaddr)
+{
+       __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg];
+
+       if (copy_from_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0)
+               return -EFAULT;
+       return 0;
+}
+
+static int get_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+       const struct kvm_one_reg *reg, void __user *uaddr)
+{
+       __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg];
+
+       if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0)
+               return -EFAULT;
+       return 0;
+}
+
+static inline void reset_wvr(struct kvm_vcpu *vcpu,
+                            const struct sys_reg_desc *rd)
+{
+       vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg] = rd->val;
+}
+
+static inline bool trap_wcr(struct kvm_vcpu *vcpu,
+                           const struct sys_reg_params *p,
+                           const struct sys_reg_desc *rd)
+{
+       u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->reg];
+
+       if (p->is_write)
+               reg_to_dbg(vcpu, p, dbg_reg);
+       else
+               dbg_to_reg(vcpu, p, dbg_reg);
+
+       trace_trap_reg(__func__, rd->reg, p->is_write, *dbg_reg);
+
+       return true;
+}
+
+static int set_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+               const struct kvm_one_reg *reg, void __user *uaddr)
+{
+       __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->reg];
+
+       if (copy_from_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0)
+               return -EFAULT;
+       return 0;
+}
+
+static int get_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+       const struct kvm_one_reg *reg, void __user *uaddr)
+{
+       __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->reg];
+
+       if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0)
+               return -EFAULT;
+       return 0;
+}
+
+static inline void reset_wcr(struct kvm_vcpu *vcpu,
+                            const struct sys_reg_desc *rd)
+{
+       vcpu->arch.vcpu_debug_state.dbg_wcr[rd->reg] = rd->val;
+}
+
 static void reset_amair_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
 {
        u64 amair;
@@ -240,16 +450,16 @@ static void reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
 #define DBG_BCR_BVR_WCR_WVR_EL1(n)                                     \
        /* DBGBVRn_EL1 */                                               \
        { Op0(0b10), Op1(0b000), CRn(0b0000), CRm((n)), Op2(0b100),     \
-         trap_debug_regs, reset_val, (DBGBVR0_EL1 + (n)), 0 },         \
+         trap_bvr, reset_bvr, n, 0, get_bvr, set_bvr },                \
        /* DBGBCRn_EL1 */                                               \
        { Op0(0b10), Op1(0b000), CRn(0b0000), CRm((n)), Op2(0b101),     \
-         trap_debug_regs, reset_val, (DBGBCR0_EL1 + (n)), 0 },         \
+         trap_bcr, reset_bcr, n, 0, get_bcr, set_bcr },                \
        /* DBGWVRn_EL1 */                                               \
        { Op0(0b10), Op1(0b000), CRn(0b0000), CRm((n)), Op2(0b110),     \
-         trap_debug_regs, reset_val, (DBGWVR0_EL1 + (n)), 0 },         \
+         trap_wvr, reset_wvr, n, 0,  get_wvr, set_wvr },               \
        /* DBGWCRn_EL1 */                                               \
        { Op0(0b10), Op1(0b000), CRn(0b0000), CRm((n)), Op2(0b111),     \
-         trap_debug_regs, reset_val, (DBGWCR0_EL1 + (n)), 0 }
+         trap_wcr, reset_wcr, n, 0,  get_wcr, set_wcr }
 
 /*
  * Architected system registers.
@@ -516,28 +726,57 @@ static bool trap_debug32(struct kvm_vcpu *vcpu,
        return true;
 }
 
-#define DBG_BCR_BVR_WCR_WVR(n)                                 \
-       /* DBGBVRn */                                           \
-       { Op1( 0), CRn( 0), CRm((n)), Op2( 4), trap_debug32,    \
-         NULL, (cp14_DBGBVR0 + (n) * 2) },                     \
-       /* DBGBCRn */                                           \
-       { Op1( 0), CRn( 0), CRm((n)), Op2( 5), trap_debug32,    \
-         NULL, (cp14_DBGBCR0 + (n) * 2) },                     \
-       /* DBGWVRn */                                           \
-       { Op1( 0), CRn( 0), CRm((n)), Op2( 6), trap_debug32,    \
-         NULL, (cp14_DBGWVR0 + (n) * 2) },                     \
-       /* DBGWCRn */                                           \
-       { Op1( 0), CRn( 0), CRm((n)), Op2( 7), trap_debug32,    \
-         NULL, (cp14_DBGWCR0 + (n) * 2) }
-
-#define DBGBXVR(n)                                             \
-       { Op1( 0), CRn( 1), CRm((n)), Op2( 1), trap_debug32,    \
-         NULL, cp14_DBGBXVR0 + n * 2 }
+/* AArch32 debug register mappings
+ *
+ * AArch32 DBGBVRn is mapped to DBGBVRn_EL1[31:0]
+ * AArch32 DBGBXVRn is mapped to DBGBVRn_EL1[63:32]
+ *
+ * All control registers and watchpoint value registers are mapped to
+ * the lower 32 bits of their AArch64 equivalents. We share the trap
+ * handlers with the above AArch64 code which checks what mode the
+ * system is in.
+ */
+
+static inline bool trap_xvr(struct kvm_vcpu *vcpu,
+                           const struct sys_reg_params *p,
+                           const struct sys_reg_desc *rd)
+{
+       u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg];
+
+       if (p->is_write) {
+               u64 val = *dbg_reg;
+
+               val &= 0xffffffffUL;
+               val |= *vcpu_reg(vcpu, p->Rt) << 32;
+               *dbg_reg = val;
+
+               vcpu->arch.debug_flags |= KVM_ARM64_DEBUG_DIRTY;
+       } else {
+               *vcpu_reg(vcpu, p->Rt) = *dbg_reg >> 32;
+       }
+
+       trace_trap_reg(__func__, rd->reg, p->is_write, *dbg_reg);
+
+       return true;
+}
+
+#define DBG_BCR_BVR_WCR_WVR(n)                                         \
+       /* DBGBVRn */                                                   \
+       { Op1( 0), CRn( 0), CRm((n)), Op2( 4), trap_bvr, NULL, n },     \
+       /* DBGBCRn */                                                   \
+       { Op1( 0), CRn( 0), CRm((n)), Op2( 5), trap_bcr, NULL, n },     \
+       /* DBGWVRn */                                                   \
+       { Op1( 0), CRn( 0), CRm((n)), Op2( 6), trap_wvr, NULL, n },     \
+       /* DBGWCRn */                                                   \
+       { Op1( 0), CRn( 0), CRm((n)), Op2( 7), trap_wcr, NULL, n }
+
+#define DBGBXVR(n)                                                     \
+       { Op1( 0), CRn( 1), CRm((n)), Op2( 1), trap_xvr, NULL, n }
 
 /*
  * Trapped cp14 registers. We generally ignore most of the external
  * debug, on the principle that they don't really make sense to a
- * guest. Revisit this one day, whould this principle change.
+ * guest. Revisit this one day, would this principle change.
  */
 static const struct sys_reg_desc cp14_regs[] = {
        /* DBGIDR */
@@ -999,6 +1238,8 @@ int kvm_handle_sys_reg(struct kvm_vcpu *vcpu, struct kvm_run *run)
        struct sys_reg_params params;
        unsigned long esr = kvm_vcpu_get_hsr(vcpu);
 
+       trace_kvm_handle_sys_reg(esr);
+
        params.is_aarch32 = false;
        params.is_32bit = false;
        params.Op0 = (esr >> 20) & 3;
@@ -1303,6 +1544,9 @@ int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg
        if (!r)
                return get_invariant_sys_reg(reg->id, uaddr);
 
+       if (r->get_user)
+               return (r->get_user)(vcpu, r, reg, uaddr);
+
        return reg_to_user(uaddr, &vcpu_sys_reg(vcpu, r->reg), reg->id);
 }
 
@@ -1321,6 +1565,9 @@ int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg
        if (!r)
                return set_invariant_sys_reg(reg->id, uaddr);
 
+       if (r->set_user)
+               return (r->set_user)(vcpu, r, reg, uaddr);
+
        return reg_from_user(&vcpu_sys_reg(vcpu, r->reg), uaddr, reg->id);
 }
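
The reg_to_dbg()/dbg_to_reg() helpers added above give the shadowed debug registers width-dependent semantics: a 32-bit write replaces only the low word and leaves the top half intact, while a 32-bit read returns just the low word. The masking can be sketched in plain C without any KVM types (the function names here are made up for illustration):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Mirrors the masking in reg_to_dbg()/dbg_to_reg(). */
static void write_dbg(uint64_t *dbg_reg, uint64_t val, bool is_32bit)
{
	if (is_32bit)
		val = (val & 0xffffffffULL) | (*dbg_reg & ~0xffffffffULL);
	*dbg_reg = val;
}

static uint64_t read_dbg(uint64_t dbg_reg, bool is_32bit)
{
	return is_32bit ? (dbg_reg & 0xffffffffULL) : dbg_reg;
}

int main(void)
{
	uint64_t bvr = 0x1122334455667788ULL;

	write_dbg(&bvr, 0xdeadbeef, true);	/* 32-bit write keeps the top half */
	printf("after 32-bit write: %#llx\n", (unsigned long long)bvr);
	printf("32-bit read:        %#llx\n",
	       (unsigned long long)read_dbg(bvr, true));
	return 0;
}
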
 
index d411e251412c316bb28b7d971b4dbc0c6e996ee3..eaa324e4db4da1149adfe7ae012b895353f05e91 100644 (file)
@@ -55,6 +55,12 @@ struct sys_reg_desc {
 
        /* Value (usually reset value) */
        u64 val;
+
+       /* Custom get/set_user functions, fallback to generic if NULL */
+       int (*get_user)(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+                       const struct kvm_one_reg *reg, void __user *uaddr);
+       int (*set_user)(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+                       const struct kvm_one_reg *reg, void __user *uaddr);
 };
 
 static inline void print_sys_reg_instr(const struct sys_reg_params *p)
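
The struct sys_reg_desc extension above, together with the kvm_arm_sys_reg_get_reg()/set_reg() hunks earlier, follows a familiar pattern: a descriptor table whose entries may carry optional per-register accessors, with a generic path used whenever they are NULL. Only the debug registers need the override here, because their backing store is the per-vcpu debug state rather than the generic sys_regs array. A small self-contained C sketch of that dispatch shape (names are illustrative only):

#include <stddef.h>
#include <stdio.h>

/* Toy descriptor with an optional per-register override. */
struct toy_reg_desc {
	const char *name;
	unsigned long val;
	/* Optional override; fall back to the generic path when NULL. */
	int (*get_user)(const struct toy_reg_desc *rd, unsigned long *out);
};

static int get_special(const struct toy_reg_desc *rd, unsigned long *out)
{
	*out = rd->val | 0x80000000UL;	/* register-specific massaging */
	return 0;
}

static int generic_get(const struct toy_reg_desc *rd, unsigned long *out)
{
	*out = rd->val;
	return 0;
}

static int get_reg(const struct toy_reg_desc *rd, unsigned long *out)
{
	if (rd->get_user)
		return rd->get_user(rd, out);	/* per-register handler */
	return generic_get(rd, out);		/* generic fallback */
}

int main(void)
{
	struct toy_reg_desc plain = { "PLAIN", 0x42, NULL };
	struct toy_reg_desc special = { "SPECIAL", 0x42, get_special };
	unsigned long v;

	get_reg(&plain, &v);
	printf("%s -> %#lx\n", plain.name, v);
	get_reg(&special, &v);
	printf("%s -> %#lx\n", special.name, v);
	return 0;
}
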
index 475fd29293102649ea45ee411c077cca64f37782..1e4576824165502d2a6d5a1caee9e0c1da10c3fa 100644 (file)
@@ -94,6 +94,8 @@ static int __init sys_reg_genericv8_init(void)
                                          &genericv8_target_table);
        kvm_register_target_sys_reg_table(KVM_ARM_TARGET_XGENE_POTENZA,
                                          &genericv8_target_table);
+       kvm_register_target_sys_reg_table(KVM_ARM_TARGET_GENERIC_V8,
+                                         &genericv8_target_table);
 
        return 0;
 }
index 157416e963f2f2e2e2ee83a2656e27fb611f7b76..7fb0008c4fa3d487cc282225b13fbec8fb3fc215 100644 (file)
@@ -44,6 +44,129 @@ TRACE_EVENT(kvm_hvc_arm64,
                  __entry->vcpu_pc, __entry->r0, __entry->imm)
 );
 
+TRACE_EVENT(kvm_arm_setup_debug,
+       TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug),
+       TP_ARGS(vcpu, guest_debug),
+
+       TP_STRUCT__entry(
+               __field(struct kvm_vcpu *, vcpu)
+               __field(__u32, guest_debug)
+       ),
+
+       TP_fast_assign(
+               __entry->vcpu = vcpu;
+               __entry->guest_debug = guest_debug;
+       ),
+
+       TP_printk("vcpu: %p, flags: 0x%08x", __entry->vcpu, __entry->guest_debug)
+);
+
+TRACE_EVENT(kvm_arm_clear_debug,
+       TP_PROTO(__u32 guest_debug),
+       TP_ARGS(guest_debug),
+
+       TP_STRUCT__entry(
+               __field(__u32, guest_debug)
+       ),
+
+       TP_fast_assign(
+               __entry->guest_debug = guest_debug;
+       ),
+
+       TP_printk("flags: 0x%08x", __entry->guest_debug)
+);
+
+TRACE_EVENT(kvm_arm_set_dreg32,
+       TP_PROTO(const char *name, __u32 value),
+       TP_ARGS(name, value),
+
+       TP_STRUCT__entry(
+               __field(const char *, name)
+               __field(__u32, value)
+       ),
+
+       TP_fast_assign(
+               __entry->name = name;
+               __entry->value = value;
+       ),
+
+       TP_printk("%s: 0x%08x", __entry->name, __entry->value)
+);
+
+TRACE_EVENT(kvm_arm_set_regset,
+       TP_PROTO(const char *type, int len, __u64 *control, __u64 *value),
+       TP_ARGS(type, len, control, value),
+       TP_STRUCT__entry(
+               __field(const char *, name)
+               __field(int, len)
+               __array(u64, ctrls, 16)
+               __array(u64, values, 16)
+       ),
+       TP_fast_assign(
+               __entry->name = type;
+               __entry->len = len;
+               memcpy(__entry->ctrls, control, len << 3);
+               memcpy(__entry->values, value, len << 3);
+       ),
+       TP_printk("%d %s CTRL:%s VALUE:%s", __entry->len, __entry->name,
+               __print_array(__entry->ctrls, __entry->len, sizeof(__u64)),
+               __print_array(__entry->values, __entry->len, sizeof(__u64)))
+);
+
+TRACE_EVENT(trap_reg,
+       TP_PROTO(const char *fn, int reg, bool is_write, u64 write_value),
+       TP_ARGS(fn, reg, is_write, write_value),
+
+       TP_STRUCT__entry(
+               __field(const char *, fn)
+               __field(int, reg)
+               __field(bool, is_write)
+               __field(u64, write_value)
+       ),
+
+       TP_fast_assign(
+               __entry->fn = fn;
+               __entry->reg = reg;
+               __entry->is_write = is_write;
+               __entry->write_value = write_value;
+       ),
+
+       TP_printk("%s %s reg %d (0x%08llx)", __entry->fn,  __entry->is_write?"write to":"read from", __entry->reg, __entry->write_value)
+);
+
+TRACE_EVENT(kvm_handle_sys_reg,
+       TP_PROTO(unsigned long hsr),
+       TP_ARGS(hsr),
+
+       TP_STRUCT__entry(
+               __field(unsigned long,  hsr)
+       ),
+
+       TP_fast_assign(
+               __entry->hsr = hsr;
+       ),
+
+       TP_printk("HSR 0x%08lx", __entry->hsr)
+);
+
+TRACE_EVENT(kvm_set_guest_debug,
+       TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug),
+       TP_ARGS(vcpu, guest_debug),
+
+       TP_STRUCT__entry(
+               __field(struct kvm_vcpu *, vcpu)
+               __field(__u32, guest_debug)
+       ),
+
+       TP_fast_assign(
+               __entry->vcpu = vcpu;
+               __entry->guest_debug = guest_debug;
+       ),
+
+       TP_printk("vcpu: %p, flags: 0x%08x", __entry->vcpu, __entry->guest_debug)
+);
+
+
 #endif /* _TRACE_ARM64_KVM_H */
 
 #undef TRACE_INCLUDE_PATH
index 0bcc4bc94b4ad3b4d72d735dc7e88a190c2e5cbe..99224dcebdc51d40cb2dff423280727ec44bacd3 100644 (file)
@@ -100,7 +100,7 @@ static void *__dma_alloc_coherent(struct device *dev, size_t size,
        if (IS_ENABLED(CONFIG_ZONE_DMA) &&
            dev->coherent_dma_mask <= DMA_BIT_MASK(32))
                flags |= GFP_DMA;
-       if (IS_ENABLED(CONFIG_DMA_CMA) && (flags & __GFP_WAIT)) {
+       if (dev_get_cma_area(dev) && (flags & __GFP_WAIT)) {
                struct page *page;
                void *addr;
 
index 0314e325a669345eb8f9aea8d006a0bc66a13e1d..8da5653bd8958605e1dbb2e3e0335aaa24abc9dc 100644 (file)
@@ -36,6 +36,17 @@ config FORCE_MAX_ZONEORDER
        int
        default 6
 
+config TRACE_IRQFLAGS_SUPPORT
+       depends on ETRAX_ARCH_V32
+       def_bool y
+
+config STACKTRACE_SUPPORT
+       def_bool y
+
+config LOCKDEP_SUPPORT
+       depends on ETRAX_ARCH_V32
+       def_bool y
+
 config CRIS
        bool
        default y
@@ -58,6 +69,7 @@ config CRIS
        select CLKSRC_MMIO if ETRAX_ARCH_V32
        select GENERIC_CLOCKEVENTS if ETRAX_ARCH_V32
        select GENERIC_SCHED_CLOCK if ETRAX_ARCH_V32
+       select HAVE_DEBUG_BUGVERBOSE if ETRAX_ARCH_V32
 
 config HZ
        int
index 81570fcd0412815917617ee4a3063c2a76c32976..b5622521dad50b18fc4065e823d7bd9c686279a3 100644 (file)
@@ -955,6 +955,14 @@ sys_call_table:
        .long sys_process_vm_writev
        .long sys_kcmp                  /* 350 */
        .long sys_finit_module
+       .long sys_sched_setattr
+       .long sys_sched_getattr
+       .long sys_renameat2
+       .long sys_seccomp               /* 355 */
+       .long sys_getrandom
+       .long sys_memfd_create
+       .long sys_bpf
+       .long sys_execveat
 
         /*
          * NOTE!! This doesn't have to be exact - we just have
diff --git a/arch/cris/arch-v10/lib/dmacopy.c b/arch/cris/arch-v10/lib/dmacopy.c
deleted file mode 100644 (file)
index 49f5b8c..0000000
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * memcpy for large blocks, using memory-memory DMA channels 6 and 7 in Etrax
- */
-
-#include <asm/svinto.h>
-#include <asm/io.h>
-
-#define D(x)
-
-void *dma_memcpy(void *pdst,
-                const void *psrc,
-                unsigned int pn)
-{
-       static etrax_dma_descr indma, outdma;
-
-       D(printk(KERN_DEBUG "dma_memcpy %d bytes... ", pn));
-
-#if 0
-       *R_GEN_CONFIG = genconfig_shadow =
-               (genconfig_shadow & ~0x3c0000) |
-               IO_STATE(R_GEN_CONFIG, dma6, intdma7) |
-               IO_STATE(R_GEN_CONFIG, dma7, intdma6);
-#endif
-       indma.sw_len = outdma.sw_len = pn;
-       indma.ctrl = d_eol | d_eop;
-       outdma.ctrl = d_eol;
-       indma.buf = psrc;
-       outdma.buf = pdst;
-
-       *R_DMA_CH6_FIRST = &indma;
-       *R_DMA_CH7_FIRST = &outdma;
-       *R_DMA_CH6_CMD = IO_STATE(R_DMA_CH6_CMD, cmd, start);
-       *R_DMA_CH7_CMD = IO_STATE(R_DMA_CH7_CMD, cmd, start);
-
-       while (*R_DMA_CH7_CMD == 1)
-               /* wait for completion */;
-
-       D(printk(KERN_DEBUG "done\n"));
-}
-
-
-
diff --git a/arch/cris/arch-v10/lib/old_checksum.c b/arch/cris/arch-v10/lib/old_checksum.c
deleted file mode 100644 (file)
index 8f79163..0000000
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * INET                An implementation of the TCP/IP protocol suite for the LINUX
- *             operating system.  INET is implemented using the  BSD Socket
- *             interface as the means of communication with the user level.
- *
- *             IP/TCP/UDP checksumming routines
- *
- * Authors:    Jorge Cwik, <jorge@laser.satlink.net>
- *             Arnt Gulbrandsen, <agulbra@nvg.unit.no>
- *             Tom May, <ftom@netcom.com>
- *             Lots of code moved from tcp.c and ip.c; see those files
- *             for more names.
- *
- *             This program is free software; you can redistribute it and/or
- *             modify it under the terms of the GNU General Public License
- *             as published by the Free Software Foundation; either version
- *             2 of the License, or (at your option) any later version.
- */
-
-#include <net/checksum.h>
-#include <net/module.h>
-
-#undef PROFILE_CHECKSUM
-
-#ifdef PROFILE_CHECKSUM
-/* these are just for profiling the checksum code with an oscillioscope.. uh */
-#if 0
-#define BITOFF *((unsigned char *)0xb0000030) = 0xff
-#define BITON *((unsigned char *)0xb0000030) = 0x0
-#endif
-#include <asm/io.h>
-#define CBITON LED_ACTIVE_SET(1)
-#define CBITOFF LED_ACTIVE_SET(0)
-#define BITOFF
-#define BITON
-#else
-#define BITOFF
-#define BITON
-#define CBITOFF
-#define CBITON
-#endif
-
-/*
- * computes a partial checksum, e.g. for TCP/UDP fragments
- */
-
-#include <asm/delay.h>
-
-__wsum csum_partial(const void *p, int len, __wsum __sum)
-{
-       u32 sum = (__force u32)__sum;
-       const u16 *buff = p;
-       /*
-       * Experiments with ethernet and slip connections show that buff
-       * is aligned on either a 2-byte or 4-byte boundary.
-       */
-       const void *endMarker = p + len;
-       const void *marker = endMarker - (len % 16);
-#if 0
-       if((int)buff & 0x3)
-               printk("unaligned buff %p\n", buff);
-       __delay(900); /* extra delay of 90 us to test performance hit */
-#endif
-       BITON;
-       while (buff < marker) {
-               sum += *buff++;
-               sum += *buff++;
-               sum += *buff++;
-               sum += *buff++;
-               sum += *buff++;
-               sum += *buff++;
-               sum += *buff++;
-               sum += *buff++;
-       }
-       marker = endMarker - (len % 2);
-       while (buff < marker)
-               sum += *buff++;
-
-       if (endMarker > buff)
-               sum += *(const u8 *)buff;       /* add extra byte separately */
-
-       BITOFF;
-       return (__force __wsum)sum;
-}
-
-EXPORT_SYMBOL(csum_partial);
index 4fc16b44fff26e5196729e61311464bc2c214b82..e6c523cc40bc824bd1c83735873b25426f0a46bc 100644 (file)
@@ -202,7 +202,7 @@ config ETRAX_PA_CHANGEABLE_DIR
        default "0x00" if ETRAXFS
        default "0x00000000" if !ETRAXFS
        help
-         This is a bitmask (8 bits) with information of what bits in PA that a
+         This is a bitmask with information of what bits in PA that a
          user can change direction on using ioctl's.
          Bit set = changeable.
          You probably want 0 here, but it depends on your hardware.
@@ -213,7 +213,7 @@ config ETRAX_PA_CHANGEABLE_BITS
        default "0x00" if ETRAXFS
        default "0x00000000" if !ETRAXFS
        help
-         This is a bitmask (8 bits) with information of what bits in PA
+         This is a bitmask with information of what bits in PA
          that a user can change the value on using ioctl's.
          Bit set = changeable.
 
@@ -223,7 +223,7 @@ config ETRAX_PB_CHANGEABLE_DIR
        default "0x00000" if ETRAXFS
        default "0x00000000" if !ETRAXFS
        help
-         This is a bitmask (18 bits) with information of what bits in PB
+         This is a bitmask with information of what bits in PB
          that a user can change direction on using ioctl's.
          Bit set = changeable.
          You probably want 0 here, but it depends on your hardware.
@@ -234,7 +234,7 @@ config ETRAX_PB_CHANGEABLE_BITS
        default "0x00000" if ETRAXFS
        default "0x00000000" if !ETRAXFS
        help
-         This is a bitmask (18 bits) with information of what bits in PB
+         This is a bitmask with information of what bits in PB
          that a user can change the value on using ioctl's.
          Bit set = changeable.
 
@@ -244,7 +244,7 @@ config ETRAX_PC_CHANGEABLE_DIR
        default "0x00000" if ETRAXFS
        default "0x00000000" if !ETRAXFS
        help
-         This is a bitmask (18 bits) with information of what bits in PC
+         This is a bitmask with information of what bits in PC
          that a user can change direction on using ioctl's.
          Bit set = changeable.
          You probably want 0 here, but it depends on your hardware.
@@ -253,9 +253,9 @@ config ETRAX_PC_CHANGEABLE_BITS
        hex "PC user changeable bits mask"
        depends on ETRAX_GPIO
        default "0x00000" if ETRAXFS
-       default "0x00000000" if ETRAXFS
+       default "0x00000000" if !ETRAXFS
        help
-         This is a bitmask (18 bits) with information of what bits in PC
+         This is a bitmask with information of what bits in PC
          that a user can change the value on using ioctl's.
          Bit set = changeable.
 
@@ -264,7 +264,7 @@ config ETRAX_PD_CHANGEABLE_DIR
        depends on ETRAX_GPIO && ETRAXFS
        default "0x00000"
        help
-         This is a bitmask (18 bits) with information of what bits in PD
+         This is a bitmask with information of what bits in PD
          that a user can change direction on using ioctl's.
          Bit set = changeable.
          You probably want 0x00000 here, but it depends on your hardware.
index 28dd77144e8fe8e24a0ac94cd309b5ac1587aac4..5387424683ccce2b26e912474a0e2974e6abc558 100644 (file)
@@ -313,6 +313,7 @@ static int __init init_axis_flash(void)
        size_t len;
        int ram_rootfs_partition = -1; /* -1 => no RAM rootfs partition */
        int part;
+       struct mtd_partition *partition;
 
        /* We need a root fs. If it resides in RAM, we need to use an
         * MTDRAM device, so it must be enabled in the kernel config,
@@ -329,7 +330,7 @@ static int __init init_axis_flash(void)
 
        main_mtd = flash_probe();
        if (main_mtd)
-               printk(KERN_INFO "%s: 0x%08x bytes of NOR flash memory.\n",
+               printk(KERN_INFO "%s: 0x%08llx bytes of NOR flash memory.\n",
                       main_mtd->name, main_mtd->size);
 
 #ifdef CONFIG_ETRAX_NANDFLASH
@@ -388,10 +389,10 @@ static int __init init_axis_flash(void)
 #endif
 
        if (main_mtd) {
+               loff_t ptable_sector = CONFIG_ETRAX_PTABLE_SECTOR;
                main_mtd->owner = THIS_MODULE;
                axisflash_mtd = main_mtd;
 
-               loff_t ptable_sector = CONFIG_ETRAX_PTABLE_SECTOR;
 
                /* First partition (rescue) is always set to the default. */
                pidx++;
@@ -517,7 +518,7 @@ static int __init init_axis_flash(void)
        /* Decide whether to use default partition table. */
        /* Only use default table if we actually have a device (main_mtd) */
 
-       struct mtd_partition *partition = &axis_partitions[0];
+       partition = &axis_partitions[0];
        if (main_mtd && !ptable_ok) {
                memcpy(axis_partitions, axis_default_partitions,
                       sizeof(axis_default_partitions));
@@ -580,7 +581,7 @@ static int __init init_axis_flash(void)
                        printk(KERN_INFO "axisflashmap: Adding RAM partition "
                               "for rootfs image.\n");
                        err = mtdram_init_device(mtd_ram,
-                                                (void *)partition[part].offset,
+                                                (void *)(u_int32_t)partition[part].offset,
                                                 partition[part].size,
                                                 partition[part].name);
                        if (err)
index 74f9fe80940c73cd2d66bf9a8e9d5d23c4e5ab38..c92e1da3684db69c867cbbf9f5aa9521891bef54 100644 (file)
@@ -957,7 +957,7 @@ static void __init virtual_gpio_init(void)
 
 static int __init gpio_init(void)
 {
-       int res;
+       int res, res2;
 
        printk(KERN_INFO "ETRAX FS GPIO driver v2.7, (c) 2003-2008 "
                "Axis Communications AB\n");
@@ -977,7 +977,7 @@ static int __init gpio_init(void)
        CRIS_LED_DISK_READ(0);
        CRIS_LED_DISK_WRITE(0);
 
-       int res2 = request_irq(GIO_INTR_VECT, gpio_interrupt,
+       res2 = request_irq(GIO_INTR_VECT, gpio_interrupt,
                IRQF_SHARED, "gpio", &alarmlist);
        if (res2) {
                printk(KERN_ERR "err: irq for gpio\n");
index 009f4ee1bd09535d8a6753462d1fff9c1d84f9fa..72968fbf814b88fcb75d67d39d4dbab817317057 100644 (file)
@@ -425,12 +425,11 @@ gpio_open(struct inode *inode, struct file *filp)
        if (p > GPIO_MINOR_LAST)
                return -EINVAL;
 
-       priv = kmalloc(sizeof(struct gpio_private), GFP_KERNEL);
+       priv = kzalloc(sizeof(struct gpio_private), GFP_KERNEL);
        if (!priv)
                return -ENOMEM;
 
        mutex_lock(&gpio_mutex);
-       memset(priv, 0, sizeof(*priv));
 
        priv->minor = p;
 
index 026a0b21b8f0f4018a6922386e2913c368b37b83..b17a20999f87b2812c553dbe39c0b44a90a8d20e 100644 (file)
@@ -240,6 +240,17 @@ ret_from_sys_call:
 
        .type   _Rexit,@function
 _Rexit:
+#if defined(CONFIG_TRACE_IRQFLAGS)
+       addoq   +PT_ccs, $sp, $acr
+       move.d  [$acr], $r0
+       btstq   15, $r0         ; I1
+       bpl     1f
+       nop
+       jsr     trace_hardirqs_on
+       nop
+1:
+#endif
+
        ;; This epilogue MUST match the prologues in multiple_interrupt, irq.h
        ;; and ptregs.h.
        addq    4, $sp          ; Skip orig_r10.
@@ -875,6 +886,14 @@ sys_call_table:
        .long sys_process_vm_writev
        .long sys_kcmp                  /* 350 */
        .long sys_finit_module
+       .long sys_sched_setattr
+       .long sys_sched_getattr
+       .long sys_renameat2
+       .long sys_seccomp               /* 355 */
+       .long sys_getrandom
+       .long sys_memfd_create
+       .long sys_bpf
+       .long sys_execveat
 
        /*
         * NOTE!! This doesn't have to be exact - we just have
index cebd32e2a8fb912b01c007de6756d5f129a52ca9..c7ce784a393cc24bfaa1a03ba6a486fff9e33363 100644 (file)
@@ -23,9 +23,9 @@ extern void stop_watchdog(void);
 /* We use this if we don't have any better idle routine. */
 void default_idle(void)
 {
+       local_irq_enable();
        /* Halt until exception. */
-       __asm__ volatile("ei    \n\t"
-                        "halt      ");
+       __asm__ volatile("halt");
 }
 
 /*
index 3a36ae6b79d5a30d2723eee234ef9e2063be0f78..150d1d76c29d2e2c038141177145f82cbfc3b03a 100644 (file)
@@ -19,7 +19,6 @@
 #include <asm/processor.h>
 #include <asm/ucontext.h>
 #include <asm/uaccess.h>
-#include <arch/ptrace.h>
 #include <arch/hwregs/cpu_vect.h>
 
 extern unsigned long cris_signal_return_page;
index 05a04708b8eb45b43bfe6cb615e306b472c3de32..d8a3a3c439dd0810bd1593b18587840a92b35ad0 100644 (file)
@@ -46,6 +46,8 @@ static int __crisv32_pinmux_alloc(int port, int first_pin, int last_pin,
                pins[port][i] = mode;
 
        crisv32_pinmux_set(port);
+
+       return 0;
 }
 
 static int crisv32_pinmux_init(void)
@@ -93,6 +95,7 @@ int crisv32_pinmux_alloc_fixed(enum fixed_function function)
        int ret = -EINVAL;
        char saved[sizeof pins];
        unsigned long flags;
+       reg_pinmux_rw_hwprot hwprot;
 
        spin_lock_irqsave(&pinmux_lock, flags);
 
@@ -101,7 +104,7 @@ int crisv32_pinmux_alloc_fixed(enum fixed_function function)
 
        crisv32_pinmux_init();  /* Must be done before we read rw_hwprot */
 
-       reg_pinmux_rw_hwprot hwprot = REG_RD(pinmux, regi_pinmux, rw_hwprot);
+       hwprot = REG_RD(pinmux, regi_pinmux, rw_hwprot);
 
        switch (function) {
        case pinmux_ser1:
@@ -227,6 +230,7 @@ int crisv32_pinmux_dealloc_fixed(enum fixed_function function)
        int ret = -EINVAL;
        char saved[sizeof pins];
        unsigned long flags;
+       reg_pinmux_rw_hwprot hwprot;
 
        spin_lock_irqsave(&pinmux_lock, flags);
 
@@ -235,7 +239,7 @@ int crisv32_pinmux_dealloc_fixed(enum fixed_function function)
 
        crisv32_pinmux_init();  /* Must be done before we read rw_hwprot */
 
-       reg_pinmux_rw_hwprot hwprot = REG_RD(pinmux, regi_pinmux, rw_hwprot);
+       hwprot = REG_RD(pinmux, regi_pinmux, rw_hwprot);
 
        switch (function) {
        case pinmux_ser1:
index 71854d41c5a0238b16c9c6ff20c9953534ecc7c1..70e497e0b03ee0b49f6079b8e791684e79b0f782 100644 (file)
@@ -12,10 +12,6 @@ CONFIG_ETRAX_FAST_TIMER=y
 CONFIG_CRIS_MACH_ARTPEC3=y
 CONFIG_ETRAX_DRAM_SIZE=32
 CONFIG_ETRAX_FLASH1_SIZE=4
-CONFIG_ETRAX_DEF_GIO_PA_OE=1c
-CONFIG_ETRAX_DEF_GIO_PA_OUT=00
-CONFIG_ETRAX_DEF_GIO_PB_OE=00000
-CONFIG_ETRAX_DEF_GIO_PB_OUT=00000
 CONFIG_NET=y
 CONFIG_PACKET=y
 CONFIG_UNIX=y
@@ -42,3 +38,4 @@ CONFIG_JFFS2_FS=y
 CONFIG_CRAMFS=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
+CONFIG_ETRAX_GPIO=y
index 87c7227fecb2d622001d5cbdb7509a8881ae8796..91232680d6c8641bf1598aced914f2790e3bfd0c 100644 (file)
@@ -38,3 +38,4 @@ CONFIG_JFFS2_FS=y
 CONFIG_CRAMFS=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
+CONFIG_ETRAX_GPIO=y
diff --git a/arch/cris/include/arch-v10/arch/elf.h b/arch/cris/include/arch-v10/arch/elf.h
deleted file mode 100644 (file)
index 1eb638a..0000000
+++ /dev/null
@@ -1,83 +0,0 @@
-#ifndef __ASMCRIS_ARCH_ELF_H
-#define __ASMCRIS_ARCH_ELF_H
-
-#include <arch/system.h>
-
-#define ELF_MACH EF_CRIS_VARIANT_ANY_V0_V10
-
-/*
- * This is used to ensure we don't load something for the wrong architecture.
- */
-#define elf_check_arch(x)                      \
- ((x)->e_machine == EM_CRIS                    \
-  && ((((x)->e_flags & EF_CRIS_VARIANT_MASK) == EF_CRIS_VARIANT_ANY_V0_V10     \
-      || (((x)->e_flags & EF_CRIS_VARIANT_MASK) == EF_CRIS_VARIANT_COMMON_V10_V32))))
-
-/*
- * ELF register definitions..
- */
-
-#include <asm/ptrace.h>
-
-/* SVR4/i386 ABI (pages 3-31, 3-32) says that when the program
-   starts (a register; assume first param register for CRIS)
-   contains a pointer to a function which might be
-   registered using `atexit'.  This provides a mean for the
-   dynamic linker to call DT_FINI functions for shared libraries
-   that have been loaded before the code runs.
-
-   A value of 0 tells we have no such handler.  */
-
-/* Explicitly set registers to 0 to increase determinism.  */
-#define ELF_PLAT_INIT(_r, load_addr)   do { \
-       (_r)->r13 = 0; (_r)->r12 = 0; (_r)->r11 = 0; (_r)->r10 = 0; \
-       (_r)->r9 = 0;  (_r)->r8 = 0;  (_r)->r7 = 0;  (_r)->r6 = 0;  \
-       (_r)->r5 = 0;  (_r)->r4 = 0;  (_r)->r3 = 0;  (_r)->r2 = 0;  \
-       (_r)->r1 = 0;  (_r)->r0 = 0;  (_r)->mof = 0; (_r)->srp = 0; \
-} while (0)
-
-/* The additional layer below is because the stack pointer is missing in 
-   the pt_regs struct, but needed in a core dump. pr_reg is a elf_gregset_t,
-   and should be filled in according to the layout of the user_regs_struct
-   struct; regs is a pt_regs struct. We dump all registers, though several are
-   obviously unnecessary. That way there's less need for intelligence at 
-   the receiving end (i.e. gdb). */
-#define ELF_CORE_COPY_REGS(pr_reg, regs)                   \
-       pr_reg[0] = regs->r0;                              \
-       pr_reg[1] = regs->r1;                              \
-       pr_reg[2] = regs->r2;                              \
-       pr_reg[3] = regs->r3;                              \
-       pr_reg[4] = regs->r4;                              \
-       pr_reg[5] = regs->r5;                              \
-       pr_reg[6] = regs->r6;                              \
-       pr_reg[7] = regs->r7;                              \
-       pr_reg[8] = regs->r8;                              \
-       pr_reg[9] = regs->r9;                              \
-       pr_reg[10] = regs->r10;                            \
-       pr_reg[11] = regs->r11;                            \
-       pr_reg[12] = regs->r12;                            \
-       pr_reg[13] = regs->r13;                            \
-       pr_reg[14] = rdusp();               /* sp */       \
-       pr_reg[15] = regs->irp;             /* pc */       \
-       pr_reg[16] = 0;                     /* p0 */       \
-       pr_reg[17] = rdvr();                /* vr */       \
-       pr_reg[18] = 0;                     /* p2 */       \
-       pr_reg[19] = 0;                     /* p3 */       \
-       pr_reg[20] = 0;                     /* p4 */       \
-       pr_reg[21] = (regs->dccr & 0xffff); /* ccr */      \
-       pr_reg[22] = 0;                     /* p6 */       \
-       pr_reg[23] = regs->mof;             /* mof */      \
-       pr_reg[24] = 0;                     /* p8 */       \
-       pr_reg[25] = 0;                     /* ibr */      \
-       pr_reg[26] = 0;                     /* irp */      \
-       pr_reg[27] = regs->srp;             /* srp */      \
-       pr_reg[28] = 0;                     /* bar */      \
-       pr_reg[29] = regs->dccr;            /* dccr */     \
-       pr_reg[30] = 0;                     /* brp */      \
-       pr_reg[31] = rdusp();               /* usp */      \
-       pr_reg[32] = 0;                     /* csrinstr */ \
-       pr_reg[33] = 0;                     /* csraddr */  \
-       pr_reg[34] = 0;                     /* csrdata */
-
-
-#endif
diff --git a/arch/cris/include/arch-v10/arch/ptrace.h b/arch/cris/include/arch-v10/arch/ptrace.h
deleted file mode 100644 (file)
index 1a23273..0000000
+++ /dev/null
@@ -1,118 +0,0 @@
-#ifndef _CRIS_ARCH_PTRACE_H
-#define _CRIS_ARCH_PTRACE_H
-
-/* Frame types */
-
-#define CRIS_FRAME_NORMAL   0 /* normal frame without SBFS stacking */
-#define CRIS_FRAME_BUSFAULT 1 /* frame stacked using SBFS, need RBF return
-                                path */
-
-/* Register numbers in the ptrace system call interface */
-
-#define PT_FRAMETYPE 0
-#define PT_ORIG_R10  1
-#define PT_R13       2
-#define PT_R12       3
-#define PT_R11       4
-#define PT_R10       5
-#define PT_R9        6
-#define PT_R8        7
-#define PT_R7        8
-#define PT_R6        9
-#define PT_R5        10
-#define PT_R4        11
-#define PT_R3        12
-#define PT_R2        13
-#define PT_R1        14
-#define PT_R0        15
-#define PT_MOF       16
-#define PT_DCCR      17
-#define PT_SRP       18
-#define PT_IRP       19    /* This is actually the debugged process' PC */
-#define PT_CSRINSTR  20    /* CPU Status record remnants -
-                             valid if frametype == busfault */
-#define PT_CSRADDR   21
-#define PT_CSRDATA   22
-#define PT_USP       23    /* special case - USP is not in the pt_regs */
-#define PT_MAX       23
-
-/* Condition code bit numbers.  The same numbers apply to CCR of course,
-   but we use DCCR everywhere else, so let's try and be consistent.  */
-#define C_DCCR_BITNR 0
-#define V_DCCR_BITNR 1
-#define Z_DCCR_BITNR 2
-#define N_DCCR_BITNR 3
-#define X_DCCR_BITNR 4
-#define I_DCCR_BITNR 5
-#define B_DCCR_BITNR 6
-#define M_DCCR_BITNR 7
-#define U_DCCR_BITNR 8
-#define P_DCCR_BITNR 9
-#define F_DCCR_BITNR 10
-
-/* pt_regs not only specifices the format in the user-struct during
- * ptrace but is also the frame format used in the kernel prologue/epilogues 
- * themselves
- */
-
-struct pt_regs {
-       unsigned long frametype;  /* type of stackframe */
-       unsigned long orig_r10;
-       /* pushed by movem r13, [sp] in SAVE_ALL, movem pushes backwards */
-       unsigned long r13;
-       unsigned long r12;
-       unsigned long r11;
-       unsigned long r10;
-       unsigned long r9;
-       unsigned long r8;
-       unsigned long r7;
-       unsigned long r6;
-       unsigned long r5;
-       unsigned long r4;
-       unsigned long r3;
-       unsigned long r2;
-       unsigned long r1;
-       unsigned long r0;
-       unsigned long mof;
-       unsigned long dccr;
-       unsigned long srp;
-       unsigned long irp; /* This is actually the debugged process' PC */
-       unsigned long csrinstr;
-       unsigned long csraddr;
-       unsigned long csrdata;
-};
-
-/* switch_stack is the extra stuff pushed onto the stack in _resume (entry.S)
- * when doing a context-switch. it is used (apart from in resume) when a new
- * thread is made and we need to make _resume (which is starting it for the
- * first time) realise what is going on.
- *
- * Actually, the use is very close to the thread struct (TSS) in that both the
- * switch_stack and the TSS are used to keep thread stuff when switching in
- * _resume.
- */
-
-struct switch_stack {
-       unsigned long r9;
-       unsigned long r8;
-       unsigned long r7;
-       unsigned long r6;
-       unsigned long r5;
-       unsigned long r4;
-       unsigned long r3;
-       unsigned long r2;
-       unsigned long r1;
-       unsigned long r0;
-       unsigned long return_ip; /* ip that _resume will return to */
-};
-
-#ifdef __KERNEL__
-
-/* bit 8 is user-mode flag */
-#define user_mode(regs) (((regs)->dccr & 0x100) != 0)
-#define instruction_pointer(regs) ((regs)->irp)
-#define profile_pc(regs) instruction_pointer(regs)
-
-#endif  /*  __KERNEL__  */
-
-#endif
index 0f211e13524841591926e43ae844666d44acf6b6..fb59faaaae0af41ff8e0908a776bdbc3edcd5ab8 100644 (file)
@@ -10,6 +10,7 @@
  * All other stuff is done out-of-band with exception handlers.
  */
 #define BUG()                                                          \
+do {                                                                   \
        __asm__ __volatile__ ("0: break 14\n\t"                         \
                              ".section .fixup,\"ax\"\n"                \
                              "1:\n\t"                                  \
                              ".section __ex_table,\"a\"\n\t"           \
                              ".dword 0b, 1b\n\t"                       \
                              ".previous\n\t"                           \
-                             : : "ri" (__FILE__), "i" (__LINE__))
+                             : : "ri" (__FILE__), "i" (__LINE__));     \
+       unreachable();                          \
+} while (0)
 #else
-#define BUG() __asm__ __volatile__ ("break 14\n\t")
+#define BUG()                                  \
+do {                                           \
+       __asm__ __volatile__ ("break 14\n\t");  \
+       unreachable();                          \
+} while (0)
 #endif
 
 #define HAVE_ARCH_BUG
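
The change above wraps both variants of BUG() in do { ... } while (0) and ends them with unreachable(). The wrapper keeps the macro usable as a single statement (for example as the sole body of an if/else), and unreachable() tells the compiler that control never continues past the trap, which avoids "missing return" warnings and lets dead code be dropped. A minimal user-space sketch of the same pattern, with MY_BUG() as a purely illustrative name:

#include <stdio.h>
#include <stdlib.h>

/* Multi-statement macro: the do/while(0) wrapper makes it parse as one
 * statement, so "if (cond) MY_BUG(); else ..." stays well-formed. */
#define MY_BUG()                                                        \
do {                                                                    \
        fprintf(stderr, "BUG at %s:%d\n", __FILE__, __LINE__);          \
        abort();        /* never returns, analogous to the unreachable() hint */ \
} while (0)

int main(void)
{
        int broken = 1;

        if (broken)
                MY_BUG();
        else
                puts("all good");

        return 0;
}
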
diff --git a/arch/cris/include/arch-v32/arch/elf.h b/arch/cris/include/arch-v32/arch/elf.h
deleted file mode 100644 (file)
index c46d582..0000000
+++ /dev/null
@@ -1,75 +0,0 @@
-#ifndef _ASM_CRIS_ELF_H
-#define _ASM_CRIS_ELF_H
-
-#include <arch/system.h>
-
-#define ELF_CORE_EFLAGS EF_CRIS_VARIANT_V32
-
-/*
- * This is used to ensure we don't load something for the wrong architecture.
- */
-#define elf_check_arch(x)                      \
- ((x)->e_machine == EM_CRIS                    \
-  && ((((x)->e_flags & EF_CRIS_VARIANT_MASK) == EF_CRIS_VARIANT_V32    \
-      || (((x)->e_flags & EF_CRIS_VARIANT_MASK) == EF_CRIS_VARIANT_COMMON_V10_V32))))
-
-/* CRISv32 ELF register definitions. */
-
-#include <asm/ptrace.h>
-
-/* Explicitly zero out registers to increase determinism. */
-#define ELF_PLAT_INIT(_r, load_addr)    do { \
-        (_r)->r13 = 0; (_r)->r12 = 0; (_r)->r11 = 0; (_r)->r10 = 0; \
-        (_r)->r9 = 0;  (_r)->r8 = 0;  (_r)->r7 = 0;  (_r)->r6 = 0;  \
-        (_r)->r5 = 0;  (_r)->r4 = 0;  (_r)->r3 = 0;  (_r)->r2 = 0;  \
-        (_r)->r1 = 0;  (_r)->r0 = 0;  (_r)->mof = 0; (_r)->srp = 0; \
-        (_r)->acr = 0; \
-} while (0)
-
-/*
- * An executable for which elf_read_implies_exec() returns TRUE will
- * have the READ_IMPLIES_EXEC personality flag set automatically.
- */
-#define elf_read_implies_exec_binary(ex, have_pt_gnu_stack)    (!(have_pt_gnu_stack))
-
-/*
- * This is basically a pt_regs with the additional definition
- * of the stack pointer since it's needed in a core dump.
- * pr_regs is a elf_gregset_t and should be filled according
- * to the layout of user_regs_struct.
- */
-#define ELF_CORE_COPY_REGS(pr_reg, regs)                   \
-        pr_reg[0] = regs->r0;                              \
-        pr_reg[1] = regs->r1;                              \
-        pr_reg[2] = regs->r2;                              \
-        pr_reg[3] = regs->r3;                              \
-        pr_reg[4] = regs->r4;                              \
-        pr_reg[5] = regs->r5;                              \
-        pr_reg[6] = regs->r6;                              \
-        pr_reg[7] = regs->r7;                              \
-        pr_reg[8] = regs->r8;                              \
-        pr_reg[9] = regs->r9;                              \
-        pr_reg[10] = regs->r10;                            \
-        pr_reg[11] = regs->r11;                            \
-        pr_reg[12] = regs->r12;                            \
-        pr_reg[13] = regs->r13;                            \
-        pr_reg[14] = rdusp();               /* SP */       \
-        pr_reg[15] = regs->acr;             /* ACR */      \
-        pr_reg[16] = 0;                     /* BZ */       \
-        pr_reg[17] = rdvr();                /* VR */       \
-        pr_reg[18] = 0;                     /* PID */      \
-        pr_reg[19] = regs->srs;             /* SRS */      \
-        pr_reg[20] = 0;                     /* WZ */       \
-        pr_reg[21] = regs->exs;             /* EXS */      \
-        pr_reg[22] = regs->eda;             /* EDA */      \
-        pr_reg[23] = regs->mof;             /* MOF */      \
-        pr_reg[24] = 0;                     /* DZ */       \
-        pr_reg[25] = 0;                     /* EBP */      \
-        pr_reg[26] = regs->erp;             /* ERP */      \
-        pr_reg[27] = regs->srp;             /* SRP */      \
-        pr_reg[28] = 0;                     /* NRP */      \
-        pr_reg[29] = regs->ccs;             /* CCS */      \
-        pr_reg[30] = rdusp();               /* USP */      \
-        pr_reg[31] = regs->spc;             /* SPC */      \
-
-#endif /* _ASM_CRIS_ELF_H */
index 041851f8ec6f9d4aeb5f458b96a1c26ef65908f5..5f6fddf9950997ea56936e43c6ebd070dc8850ca 100644 (file)
@@ -2,7 +2,7 @@
 #define __ASM_CRIS_ARCH_IRQFLAGS_H
 
 #include <linux/types.h>
-#include <arch/ptrace.h>
+#include <asm/ptrace.h>
 
 static inline unsigned long arch_local_save_flags(void)
 {
diff --git a/arch/cris/include/arch-v32/arch/ptrace.h b/arch/cris/include/arch-v32/arch/ptrace.h
deleted file mode 100644 (file)
index 19773d3..0000000
+++ /dev/null
@@ -1,118 +0,0 @@
-#ifndef _CRIS_ARCH_PTRACE_H
-#define _CRIS_ARCH_PTRACE_H
-
-/* Register numbers in the ptrace system call interface */
-
-#define PT_ORIG_R10  0
-#define PT_R0        1
-#define PT_R1        2
-#define PT_R2        3
-#define PT_R3        4
-#define PT_R4        5
-#define PT_R5        6
-#define PT_R6        7
-#define PT_R7        8
-#define PT_R8        9
-#define PT_R9        10
-#define PT_R10       11
-#define PT_R11       12
-#define PT_R12       13
-#define PT_R13       14
-#define PT_ACR       15
-#define PT_SRS       16
-#define PT_MOF       17
-#define PT_SPC       18
-#define PT_CCS       19
-#define PT_SRP       20
-#define PT_ERP       21    /* This is actually the debugged process' PC */
-#define PT_EXS       22
-#define PT_EDA       23
-#define PT_USP       24    /* special case - USP is not in the pt_regs */
-#define PT_PPC       25    /* special case - pseudo PC */
-#define PT_BP        26    /* Base number for BP registers. */
-#define PT_BP_CTRL   26    /* BP control register. */
-#define PT_MAX       40
-
-/* Condition code bit numbers. */
-#define C_CCS_BITNR 0
-#define V_CCS_BITNR 1
-#define Z_CCS_BITNR 2
-#define N_CCS_BITNR 3
-#define X_CCS_BITNR 4
-#define I_CCS_BITNR 5
-#define U_CCS_BITNR 6
-#define P_CCS_BITNR 7
-#define R_CCS_BITNR 8
-#define S_CCS_BITNR 9
-#define M_CCS_BITNR 30
-#define Q_CCS_BITNR 31
-#define CCS_SHIFT   10 /* Shift count for each level in CCS */
-
-/* pt_regs not only specifices the format in the user-struct during
- * ptrace but is also the frame format used in the kernel prologue/epilogues
- * themselves
- */
-
-struct pt_regs {
-       unsigned long orig_r10;
-       /* pushed by movem r13, [sp] in SAVE_ALL. */
-       unsigned long r0;
-       unsigned long r1;
-       unsigned long r2;
-       unsigned long r3;
-       unsigned long r4;
-       unsigned long r5;
-       unsigned long r6;
-       unsigned long r7;
-       unsigned long r8;
-       unsigned long r9;
-       unsigned long r10;
-       unsigned long r11;
-       unsigned long r12;
-       unsigned long r13;
-       unsigned long acr;
-       unsigned long srs;
-       unsigned long mof;
-       unsigned long spc;
-       unsigned long ccs;
-       unsigned long srp;
-       unsigned long erp; /* This is actually the debugged process' PC */
-       /* For debugging purposes; saved only when needed. */
-       unsigned long exs;
-       unsigned long eda;
-};
-
-/* switch_stack is the extra stuff pushed onto the stack in _resume (entry.S)
- * when doing a context-switch. it is used (apart from in resume) when a new
- * thread is made and we need to make _resume (which is starting it for the
- * first time) realise what is going on.
- *
- * Actually, the use is very close to the thread struct (TSS) in that both the
- * switch_stack and the TSS are used to keep thread stuff when switching in
- * _resume.
- */
-
-struct switch_stack {
-       unsigned long r0;
-       unsigned long r1;
-       unsigned long r2;
-       unsigned long r3;
-       unsigned long r4;
-       unsigned long r5;
-       unsigned long r6;
-       unsigned long r7;
-       unsigned long r8;
-       unsigned long r9;
-       unsigned long return_ip; /* ip that _resume will return to */
-};
-
-#ifdef __KERNEL__
-
-#define arch_has_single_step() (1)
-#define user_mode(regs) (((regs)->ccs & (1 << (U_CCS_BITNR + CCS_SHIFT))) != 0)
-#define instruction_pointer(regs) ((regs)->erp)
-#define profile_pc(regs) instruction_pointer(regs)
-
-#endif  /*  __KERNEL__  */
-
-#endif
index ad2244f35bca0c991e8c42702edb613dc8aa5334..b7f68192d15b52cb4e6c34c78eac88a02fa971d5 100644 (file)
@@ -1,14 +1,20 @@
 generic-y += atomic.h
+generic-y += auxvec.h
 generic-y += barrier.h
+generic-y += bitsperlong.h
 generic-y += clkdev.h
 generic-y += cmpxchg.h
 generic-y += cputime.h
 generic-y += device.h
 generic-y += div64.h
+generic-y += errno.h
 generic-y += exec.h
 generic-y += emergency-restart.h
+generic-y += fcntl.h
 generic-y += futex.h
 generic-y += hardirq.h
+generic-y += ioctl.h
+generic-y += ipcbuf.h
 generic-y += irq_regs.h
 generic-y += irq_work.h
 generic-y += kdebug.h
@@ -19,11 +25,22 @@ generic-y += local.h
 generic-y += local64.h
 generic-y += mcs_spinlock.h
 generic-y += mm-arch-hooks.h
+generic-y += mman.h
 generic-y += module.h
+generic-y += msgbuf.h
 generic-y += percpu.h
+generic-y += poll.h
 generic-y += preempt.h
+generic-y += resource.h
 generic-y += sections.h
+generic-y += sembuf.h
+generic-y += shmbuf.h
+generic-y += siginfo.h
+generic-y += socket.h
+generic-y += sockios.h
+generic-y += statfs.h
 generic-y += topology.h
 generic-y += trace_clock.h
+generic-y += types.h
 generic-y += vga.h
 generic-y += xor.h
diff --git a/arch/cris/include/asm/elf.h b/arch/cris/include/asm/elf.h
deleted file mode 100644 (file)
index c2a394f..0000000
+++ /dev/null
@@ -1,89 +0,0 @@
-#ifndef __ASMCRIS_ELF_H
-#define __ASMCRIS_ELF_H
-
-/*
- * ELF register definitions..
- */
-
-#include <asm/user.h>
-
-#define R_CRIS_NONE             0
-#define R_CRIS_8                1
-#define R_CRIS_16               2
-#define R_CRIS_32               3
-#define R_CRIS_8_PCREL          4
-#define R_CRIS_16_PCREL         5
-#define R_CRIS_32_PCREL         6
-#define R_CRIS_GNU_VTINHERIT    7
-#define R_CRIS_GNU_VTENTRY      8
-#define R_CRIS_COPY             9
-#define R_CRIS_GLOB_DAT         10
-#define R_CRIS_JUMP_SLOT        11
-#define R_CRIS_RELATIVE         12
-#define R_CRIS_16_GOT           13
-#define R_CRIS_32_GOT           14
-#define R_CRIS_16_GOTPLT        15
-#define R_CRIS_32_GOTPLT        16
-#define R_CRIS_32_GOTREL        17
-#define R_CRIS_32_PLT_GOTREL    18
-#define R_CRIS_32_PLT_PCREL     19
-
-typedef unsigned long elf_greg_t;
-
-/* Note that NGREG is defined to ELF_NGREG in include/linux/elfcore.h, and is
-   thus exposed to user-space. */
-#define ELF_NGREG (sizeof (struct user_regs_struct) / sizeof(elf_greg_t))
-typedef elf_greg_t elf_gregset_t[ELF_NGREG];
-
-/* A placeholder; CRIS does not have any fp regs.  */
-typedef unsigned long elf_fpregset_t;
-
-/*
- * These are used to set parameters in the core dumps.
- */
-#define ELF_CLASS      ELFCLASS32
-#define ELF_DATA       ELFDATA2LSB
-#define ELF_ARCH       EM_CRIS
-
-#include <arch/elf.h>
-
-/* The master for these definitions is {binutils}/include/elf/cris.h:  */
-/* User symbols in this file have a leading underscore.  */
-#define EF_CRIS_UNDERSCORE             0x00000001
-
-/* This is a mask for different incompatible machine variants.  */
-#define EF_CRIS_VARIANT_MASK           0x0000000e
-
-/* Variant 0; may contain v0..10 object.  */
-#define EF_CRIS_VARIANT_ANY_V0_V10     0x00000000
-
-/* Variant 1; contains v32 object.  */
-#define EF_CRIS_VARIANT_V32            0x00000002
-
-/* Variant 2; contains object compatible with v32 and v10.  */
-#define EF_CRIS_VARIANT_COMMON_V10_V32 0x00000004
-/* End of excerpt from {binutils}/include/elf/cris.h.  */
-
-#define ELF_EXEC_PAGESIZE      8192
-
-/* This is the location that an ET_DYN program is loaded if exec'ed.  Typical
-   use of this is to invoke "./ld.so someprog" to test out a new version of
-   the loader.  We need to make sure that it is out of the way of the program
-   that it will "exec", and that there is sufficient room for the brk.  */
-
-#define ELF_ET_DYN_BASE         (TASK_SIZE / 3 * 2)
-
-/* This yields a mask that user programs can use to figure out what
-   instruction set this CPU supports.  This could be done in user space,
-   but it's not easy, and we've already done it here.  */
-
-#define ELF_HWCAP       (0)
-
-/* This yields a string that ld.so will use to load implementation
-   specific libraries for optimization.  This is more specific in
-   intent than poking at uname or /proc/cpuinfo.
-*/
-
-#define ELF_PLATFORM  (NULL)
-
-#endif
index 1d45fd6365b729416e2e53e00ee40a6c3405ab95..349acfd25d2f6164248104e3390cfdaa24a13b53 100644 (file)
@@ -11,7 +11,14 @@ extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 
 #define deactivate_mm(tsk,mm)  do { } while (0)
 
-#define activate_mm(prev,next) switch_mm((prev),(next),NULL)
+static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
+{
+       unsigned long flags;
+
+       local_irq_save(flags);
+       switch_mm(prev, next, NULL);
+       local_irq_restore(flags);
+}
 
 /* current active pgd - this is similar to other processors pgd 
  * registers like cr3 on the i386
diff --git a/arch/cris/include/asm/stacktrace.h b/arch/cris/include/asm/stacktrace.h
new file mode 100644 (file)
index 0000000..2d90856
--- /dev/null
@@ -0,0 +1,8 @@
+#ifndef __CRIS_STACKTRACE_H
+#define __CRIS_STACKTRACE_H
+
+void walk_stackframe(unsigned long sp,
+                    int (*fn)(unsigned long addr, void *data),
+                    void *data);
+
+#endif
diff --git a/arch/cris/include/asm/types.h b/arch/cris/include/asm/types.h
deleted file mode 100644 (file)
index a3cac77..0000000
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef _ETRAX_TYPES_H
-#define _ETRAX_TYPES_H
-
-#include <uapi/asm/types.h>
-
-/*
- * These aren't exported outside the kernel to avoid name space clashes
- */
-
-#define BITS_PER_LONG 32
-
-#endif
index 0f40fed1ba25852d0d681749ad07d95b4bcca3fa..9c23535821c01c37f1fb404b7e7d8d8f11f3f27b 100644 (file)
@@ -4,7 +4,7 @@
 #include <uapi/asm/unistd.h>
 
 
-#define NR_syscalls 360
+#define NR_syscalls 365
 
 #include <arch/unistd.h>
 
index 01f66b8f15e50b83a75fcd9693d253348a459297..d5564a0ae66adc23b7c42fe31f69ad1f3ac131e6 100644 (file)
@@ -6,6 +6,9 @@ header-y += ../arch-v32/arch/
 header-y += auxvec.h
 header-y += bitsperlong.h
 header-y += byteorder.h
+header-y += elf.h
+header-y += elf_v10.h
+header-y += elf_v32.h
 header-y += errno.h
 header-y += ethernet.h
 header-y += etraxgpio.h
@@ -19,6 +22,8 @@ header-y += param.h
 header-y += poll.h
 header-y += posix_types.h
 header-y += ptrace.h
+header-y += ptrace_v10.h
+header-y += ptrace_v32.h
 header-y += resource.h
 header-y += rs485.h
 header-y += sembuf.h
diff --git a/arch/cris/include/uapi/asm/auxvec.h b/arch/cris/include/uapi/asm/auxvec.h
deleted file mode 100644 (file)
index cb30b01..0000000
+++ /dev/null
@@ -1,4 +0,0 @@
-#ifndef __ASMCRIS_AUXVEC_H
-#define __ASMCRIS_AUXVEC_H
-
-#endif
diff --git a/arch/cris/include/uapi/asm/bitsperlong.h b/arch/cris/include/uapi/asm/bitsperlong.h
deleted file mode 100644 (file)
index 6dc0bb0..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/bitsperlong.h>
diff --git a/arch/cris/include/uapi/asm/elf.h b/arch/cris/include/uapi/asm/elf.h
new file mode 100644 (file)
index 0000000..a5df05b
--- /dev/null
@@ -0,0 +1,90 @@
+#ifndef __ASMCRIS_ELF_H
+#define __ASMCRIS_ELF_H
+
+/*
+ * ELF register definitions..
+ */
+
+#ifdef __arch_v32
+#include <asm/elf_v32.h>
+#else
+#include <asm/elf_v10.h>
+#endif
+
+#define R_CRIS_NONE             0
+#define R_CRIS_8                1
+#define R_CRIS_16               2
+#define R_CRIS_32               3
+#define R_CRIS_8_PCREL          4
+#define R_CRIS_16_PCREL         5
+#define R_CRIS_32_PCREL         6
+#define R_CRIS_GNU_VTINHERIT    7
+#define R_CRIS_GNU_VTENTRY      8
+#define R_CRIS_COPY             9
+#define R_CRIS_GLOB_DAT         10
+#define R_CRIS_JUMP_SLOT        11
+#define R_CRIS_RELATIVE         12
+#define R_CRIS_16_GOT           13
+#define R_CRIS_32_GOT           14
+#define R_CRIS_16_GOTPLT        15
+#define R_CRIS_32_GOTPLT        16
+#define R_CRIS_32_GOTREL        17
+#define R_CRIS_32_PLT_GOTREL    18
+#define R_CRIS_32_PLT_PCREL     19
+
+typedef unsigned long elf_greg_t;
+
+/* Note that NGREG is defined to ELF_NGREG in include/linux/elfcore.h, and is
+   thus exposed to user-space. */
+typedef elf_greg_t elf_gregset_t[ELF_NGREG];
+
+/* A placeholder; CRIS does not have any fp regs.  */
+typedef unsigned long elf_fpregset_t;
+
+/*
+ * These are used to set parameters in the core dumps.
+ */
+#define ELF_CLASS      ELFCLASS32
+#define ELF_DATA       ELFDATA2LSB
+#define ELF_ARCH       EM_CRIS
+
+/* The master for these definitions is {binutils}/include/elf/cris.h:  */
+/* User symbols in this file have a leading underscore.  */
+#define EF_CRIS_UNDERSCORE             0x00000001
+
+/* This is a mask for different incompatible machine variants.  */
+#define EF_CRIS_VARIANT_MASK           0x0000000e
+
+/* Variant 0; may contain v0..10 object.  */
+#define EF_CRIS_VARIANT_ANY_V0_V10     0x00000000
+
+/* Variant 1; contains v32 object.  */
+#define EF_CRIS_VARIANT_V32            0x00000002
+
+/* Variant 2; contains object compatible with v32 and v10.  */
+#define EF_CRIS_VARIANT_COMMON_V10_V32 0x00000004
+/* End of excerpt from {binutils}/include/elf/cris.h.  */
+
+#define ELF_EXEC_PAGESIZE      8192
+
+/* This is the location that an ET_DYN program is loaded if exec'ed.  Typical
+   use of this is to invoke "./ld.so someprog" to test out a new version of
+   the loader.  We need to make sure that it is out of the way of the program
+   that it will "exec", and that there is sufficient room for the brk.  */
+
+#define ELF_ET_DYN_BASE         (TASK_SIZE / 3 * 2)
+
+/* This yields a mask that user programs can use to figure out what
+   instruction set this CPU supports.  This could be done in user space,
+   but it's not easy, and we've already done it here.  */
+
+#define ELF_HWCAP       (0)
+
+/* This yields a string that ld.so will use to load implementation
+   specific libraries for optimization.  This is more specific in
+   intent than poking at uname or /proc/cpuinfo.
+*/
+
+#define ELF_PLATFORM  (NULL)
+
+#endif
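
The exported asm/elf.h no longer pulls in the kernel-internal <arch/elf.h>; it now selects the variant header itself via __arch_v32, and ELF_NGREG moves into those variant headers (35 general registers for CRISv10, 32 for CRISv32), so the size of a dumped register set differs per variant. A small host-side sketch of the resulting sizes (the 4-byte register size reflects CRIS's 32-bit unsigned long and is stated here as an assumption, not taken from this diff):

#include <stdio.h>

#define CRIS_REG_SIZE   4       /* elf_greg_t is a 32-bit unsigned long on CRIS */
#define ELF_NGREG_V10   35      /* from elf_v10.h in this change */
#define ELF_NGREG_V32   32      /* from elf_v32.h in this change */

int main(void)
{
        printf("CRISv10 elf_gregset_t: %d bytes\n", CRIS_REG_SIZE * ELF_NGREG_V10);
        printf("CRISv32 elf_gregset_t: %d bytes\n", CRIS_REG_SIZE * ELF_NGREG_V32);
        return 0;
}
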
diff --git a/arch/cris/include/uapi/asm/elf_v10.h b/arch/cris/include/uapi/asm/elf_v10.h
new file mode 100644 (file)
index 0000000..3ea65ce
--- /dev/null
@@ -0,0 +1,84 @@
+#ifndef __ASMCRIS_ARCH_ELF_H
+#define __ASMCRIS_ARCH_ELF_H
+
+#define ELF_MACH EF_CRIS_VARIANT_ANY_V0_V10
+
+/* Matches struct user_regs_struct */
+#define ELF_NGREG 35
+
+/*
+ * This is used to ensure we don't load something for the wrong architecture.
+ */
+#define elf_check_arch(x)                      \
+ ((x)->e_machine == EM_CRIS                    \
+  && ((((x)->e_flags & EF_CRIS_VARIANT_MASK) == EF_CRIS_VARIANT_ANY_V0_V10     \
+      || (((x)->e_flags & EF_CRIS_VARIANT_MASK) == EF_CRIS_VARIANT_COMMON_V10_V32))))
+
+/*
+ * ELF register definitions..
+ */
+
+#include <asm/ptrace.h>
+
+/* SVR4/i386 ABI (pages 3-31, 3-32) says that when the program
+   starts (a register; assume first param register for CRIS)
+   contains a pointer to a function which might be
+   registered using `atexit'.  This provides a means for the
+   dynamic linker to call DT_FINI functions for shared libraries
+   that have been loaded before the code runs.
+
+   A value of 0 tells we have no such handler.  */
+
+/* Explicitly set registers to 0 to increase determinism.  */
+#define ELF_PLAT_INIT(_r, load_addr)   do { \
+       (_r)->r13 = 0; (_r)->r12 = 0; (_r)->r11 = 0; (_r)->r10 = 0; \
+       (_r)->r9 = 0;  (_r)->r8 = 0;  (_r)->r7 = 0;  (_r)->r6 = 0;  \
+       (_r)->r5 = 0;  (_r)->r4 = 0;  (_r)->r3 = 0;  (_r)->r2 = 0;  \
+       (_r)->r1 = 0;  (_r)->r0 = 0;  (_r)->mof = 0; (_r)->srp = 0; \
+} while (0)
+
+/* The additional layer below is because the stack pointer is missing in 
+   the pt_regs struct, but needed in a core dump. pr_reg is an elf_gregset_t,
+   and should be filled in according to the layout of the user_regs_struct
+   struct; regs is a pt_regs struct. We dump all registers, though several are
+   obviously unnecessary. That way there's less need for intelligence at 
+   the receiving end (i.e. gdb). */
+#define ELF_CORE_COPY_REGS(pr_reg, regs)                   \
+       pr_reg[0] = regs->r0;                              \
+       pr_reg[1] = regs->r1;                              \
+       pr_reg[2] = regs->r2;                              \
+       pr_reg[3] = regs->r3;                              \
+       pr_reg[4] = regs->r4;                              \
+       pr_reg[5] = regs->r5;                              \
+       pr_reg[6] = regs->r6;                              \
+       pr_reg[7] = regs->r7;                              \
+       pr_reg[8] = regs->r8;                              \
+       pr_reg[9] = regs->r9;                              \
+       pr_reg[10] = regs->r10;                            \
+       pr_reg[11] = regs->r11;                            \
+       pr_reg[12] = regs->r12;                            \
+       pr_reg[13] = regs->r13;                            \
+       pr_reg[14] = rdusp();               /* sp */       \
+       pr_reg[15] = regs->irp;             /* pc */       \
+       pr_reg[16] = 0;                     /* p0 */       \
+       pr_reg[17] = rdvr();                /* vr */       \
+       pr_reg[18] = 0;                     /* p2 */       \
+       pr_reg[19] = 0;                     /* p3 */       \
+       pr_reg[20] = 0;                     /* p4 */       \
+       pr_reg[21] = (regs->dccr & 0xffff); /* ccr */      \
+       pr_reg[22] = 0;                     /* p6 */       \
+       pr_reg[23] = regs->mof;             /* mof */      \
+       pr_reg[24] = 0;                     /* p8 */       \
+       pr_reg[25] = 0;                     /* ibr */      \
+       pr_reg[26] = 0;                     /* irp */      \
+       pr_reg[27] = regs->srp;             /* srp */      \
+       pr_reg[28] = 0;                     /* bar */      \
+       pr_reg[29] = regs->dccr;            /* dccr */     \
+       pr_reg[30] = 0;                     /* brp */      \
+       pr_reg[31] = rdusp();               /* usp */      \
+       pr_reg[32] = 0;                     /* csrinstr */ \
+       pr_reg[33] = 0;                     /* csraddr */  \
+       pr_reg[34] = 0;                     /* csrdata */
+
+
+#endif
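
ELF_CORE_COPY_REGS flattens a pt_regs (plus the user stack pointer fetched with rdusp()) into the fixed elf_gregset_t layout gdb expects; slots with no CRISv10 counterpart (p0, p2-p4, p6, p8, ibr, bar, brp and the CSR remnants) are zero-filled so the register numbering stays stable. The generic core-dump path invokes the macro roughly as sketched below (paraphrased from include/linux/elfcore.h, shown only to place the macro in context; it is not part of this diff):

static inline void elf_core_copy_regs(elf_gregset_t *elfregs,
                                      struct pt_regs *regs)
{
#ifdef ELF_CORE_COPY_REGS
        ELF_CORE_COPY_REGS((*elfregs), regs)   /* arch fills every slot */
#else
        BUG_ON(sizeof(*elfregs) != sizeof(*regs));
        *(struct pt_regs *)elfregs = *regs;    /* fallback: raw copy */
#endif
}
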
diff --git a/arch/cris/include/uapi/asm/elf_v32.h b/arch/cris/include/uapi/asm/elf_v32.h
new file mode 100644 (file)
index 0000000..f09fe49
--- /dev/null
@@ -0,0 +1,76 @@
+#ifndef _ASM_CRIS_ELF_H
+#define _ASM_CRIS_ELF_H
+
+#define ELF_CORE_EFLAGS EF_CRIS_VARIANT_V32
+
+/* Matches struct user_regs_struct */
+#define ELF_NGREG 32
+
+/*
+ * This is used to ensure we don't load something for the wrong architecture.
+ */
+#define elf_check_arch(x)                      \
+ ((x)->e_machine == EM_CRIS                    \
+  && ((((x)->e_flags & EF_CRIS_VARIANT_MASK) == EF_CRIS_VARIANT_V32    \
+      || (((x)->e_flags & EF_CRIS_VARIANT_MASK) == EF_CRIS_VARIANT_COMMON_V10_V32))))
+
+/* CRISv32 ELF register definitions. */
+
+#include <asm/ptrace.h>
+
+/* Explicitly zero out registers to increase determinism. */
+#define ELF_PLAT_INIT(_r, load_addr)    do { \
+        (_r)->r13 = 0; (_r)->r12 = 0; (_r)->r11 = 0; (_r)->r10 = 0; \
+        (_r)->r9 = 0;  (_r)->r8 = 0;  (_r)->r7 = 0;  (_r)->r6 = 0;  \
+        (_r)->r5 = 0;  (_r)->r4 = 0;  (_r)->r3 = 0;  (_r)->r2 = 0;  \
+        (_r)->r1 = 0;  (_r)->r0 = 0;  (_r)->mof = 0; (_r)->srp = 0; \
+        (_r)->acr = 0; \
+} while (0)
+
+/*
+ * An executable for which elf_read_implies_exec() returns TRUE will
+ * have the READ_IMPLIES_EXEC personality flag set automatically.
+ */
+#define elf_read_implies_exec_binary(ex, have_pt_gnu_stack)    (!(have_pt_gnu_stack))
+
+/*
+ * This is basically a pt_regs with the additional definition
+ * of the stack pointer since it's needed in a core dump.
+ * pr_reg is an elf_gregset_t and should be filled according
+ * to the layout of user_regs_struct.
+ */
+#define ELF_CORE_COPY_REGS(pr_reg, regs)                   \
+        pr_reg[0] = regs->r0;                              \
+        pr_reg[1] = regs->r1;                              \
+        pr_reg[2] = regs->r2;                              \
+        pr_reg[3] = regs->r3;                              \
+        pr_reg[4] = regs->r4;                              \
+        pr_reg[5] = regs->r5;                              \
+        pr_reg[6] = regs->r6;                              \
+        pr_reg[7] = regs->r7;                              \
+        pr_reg[8] = regs->r8;                              \
+        pr_reg[9] = regs->r9;                              \
+        pr_reg[10] = regs->r10;                            \
+        pr_reg[11] = regs->r11;                            \
+        pr_reg[12] = regs->r12;                            \
+        pr_reg[13] = regs->r13;                            \
+        pr_reg[14] = rdusp();               /* SP */       \
+        pr_reg[15] = regs->acr;             /* ACR */      \
+        pr_reg[16] = 0;                     /* BZ */       \
+        pr_reg[17] = rdvr();                /* VR */       \
+        pr_reg[18] = 0;                     /* PID */      \
+        pr_reg[19] = regs->srs;             /* SRS */      \
+        pr_reg[20] = 0;                     /* WZ */       \
+        pr_reg[21] = regs->exs;             /* EXS */      \
+        pr_reg[22] = regs->eda;             /* EDA */      \
+        pr_reg[23] = regs->mof;             /* MOF */      \
+        pr_reg[24] = 0;                     /* DZ */       \
+        pr_reg[25] = 0;                     /* EBP */      \
+        pr_reg[26] = regs->erp;             /* ERP */      \
+        pr_reg[27] = regs->srp;             /* SRP */      \
+        pr_reg[28] = 0;                     /* NRP */      \
+        pr_reg[29] = regs->ccs;             /* CCS */      \
+        pr_reg[30] = rdusp();               /* USP */      \
+        pr_reg[31] = regs->spc;             /* SPC */      \
+
+#endif /* _ASM_CRIS_ELF_H */
diff --git a/arch/cris/include/uapi/asm/errno.h b/arch/cris/include/uapi/asm/errno.h
deleted file mode 100644 (file)
index 2bf5eb5..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _CRIS_ERRNO_H
-#define _CRIS_ERRNO_H
-
-#include <asm-generic/errno.h>
-
-#endif
diff --git a/arch/cris/include/uapi/asm/fcntl.h b/arch/cris/include/uapi/asm/fcntl.h
deleted file mode 100644 (file)
index 46ab12d..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/fcntl.h>
diff --git a/arch/cris/include/uapi/asm/ioctl.h b/arch/cris/include/uapi/asm/ioctl.h
deleted file mode 100644 (file)
index b279fe0..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/ioctl.h>
diff --git a/arch/cris/include/uapi/asm/ipcbuf.h b/arch/cris/include/uapi/asm/ipcbuf.h
deleted file mode 100644 (file)
index 84c7e51..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/ipcbuf.h>
diff --git a/arch/cris/include/uapi/asm/kvm_para.h b/arch/cris/include/uapi/asm/kvm_para.h
deleted file mode 100644 (file)
index 14fab8f..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/kvm_para.h>
diff --git a/arch/cris/include/uapi/asm/mman.h b/arch/cris/include/uapi/asm/mman.h
deleted file mode 100644 (file)
index 8eebf89..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/mman.h>
diff --git a/arch/cris/include/uapi/asm/msgbuf.h b/arch/cris/include/uapi/asm/msgbuf.h
deleted file mode 100644 (file)
index ada63df..0000000
+++ /dev/null
@@ -1,33 +0,0 @@
-#ifndef _CRIS_MSGBUF_H
-#define _CRIS_MSGBUF_H
-
-/* verbatim copy of asm-i386 version */
-
-/* 
- * The msqid64_ds structure for CRIS architecture.
- * Note extra padding because this structure is passed back and forth
- * between kernel and user space.
- *
- * Pad space is left for:
- * - 64-bit time_t to solve y2038 problem
- * - 2 miscellaneous 32-bit values
- */
-
-struct msqid64_ds {
-       struct ipc64_perm msg_perm;
-       __kernel_time_t msg_stime;      /* last msgsnd time */
-       unsigned long   __unused1;
-       __kernel_time_t msg_rtime;      /* last msgrcv time */
-       unsigned long   __unused2;
-       __kernel_time_t msg_ctime;      /* last change time */
-       unsigned long   __unused3;
-       unsigned long  msg_cbytes;      /* current number of bytes on queue */
-       unsigned long  msg_qnum;        /* number of messages in queue */
-       unsigned long  msg_qbytes;      /* max number of bytes on queue */
-       __kernel_pid_t msg_lspid;       /* pid of last msgsnd */
-       __kernel_pid_t msg_lrpid;       /* last receive pid */
-       unsigned long  __unused4;
-       unsigned long  __unused5;
-};
-
-#endif /* _CRIS_MSGBUF_H */
diff --git a/arch/cris/include/uapi/asm/poll.h b/arch/cris/include/uapi/asm/poll.h
deleted file mode 100644 (file)
index c98509d..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/poll.h>
index c689c9bbbe503375055822f51bd9b0666c929395..bd8946f83ed3a13ffb94c64a53db435b8e32ac67 100644 (file)
@@ -1 +1,5 @@
-#include <arch/ptrace.h>
+#ifdef __arch_v32
+#include <asm/ptrace_v32.h>
+#else
+#include <asm/ptrace_v10.h>
+#endif
diff --git a/arch/cris/include/uapi/asm/ptrace_v10.h b/arch/cris/include/uapi/asm/ptrace_v10.h
new file mode 100644 (file)
index 0000000..1a23273
--- /dev/null
@@ -0,0 +1,118 @@
+#ifndef _CRIS_ARCH_PTRACE_H
+#define _CRIS_ARCH_PTRACE_H
+
+/* Frame types */
+
+#define CRIS_FRAME_NORMAL   0 /* normal frame without SBFS stacking */
+#define CRIS_FRAME_BUSFAULT 1 /* frame stacked using SBFS, need RBF return
+                                path */
+
+/* Register numbers in the ptrace system call interface */
+
+#define PT_FRAMETYPE 0
+#define PT_ORIG_R10  1
+#define PT_R13       2
+#define PT_R12       3
+#define PT_R11       4
+#define PT_R10       5
+#define PT_R9        6
+#define PT_R8        7
+#define PT_R7        8
+#define PT_R6        9
+#define PT_R5        10
+#define PT_R4        11
+#define PT_R3        12
+#define PT_R2        13
+#define PT_R1        14
+#define PT_R0        15
+#define PT_MOF       16
+#define PT_DCCR      17
+#define PT_SRP       18
+#define PT_IRP       19    /* This is actually the debugged process' PC */
+#define PT_CSRINSTR  20    /* CPU Status record remnants -
+                             valid if frametype == busfault */
+#define PT_CSRADDR   21
+#define PT_CSRDATA   22
+#define PT_USP       23    /* special case - USP is not in the pt_regs */
+#define PT_MAX       23
+
+/* Condition code bit numbers.  The same numbers apply to CCR of course,
+   but we use DCCR everywhere else, so let's try and be consistent.  */
+#define C_DCCR_BITNR 0
+#define V_DCCR_BITNR 1
+#define Z_DCCR_BITNR 2
+#define N_DCCR_BITNR 3
+#define X_DCCR_BITNR 4
+#define I_DCCR_BITNR 5
+#define B_DCCR_BITNR 6
+#define M_DCCR_BITNR 7
+#define U_DCCR_BITNR 8
+#define P_DCCR_BITNR 9
+#define F_DCCR_BITNR 10
+
+/* pt_regs not only specifies the format in the user-struct during
+ * ptrace but is also the frame format used in the kernel prologue/epilogues 
+ * themselves
+ */
+
+struct pt_regs {
+       unsigned long frametype;  /* type of stackframe */
+       unsigned long orig_r10;
+       /* pushed by movem r13, [sp] in SAVE_ALL, movem pushes backwards */
+       unsigned long r13;
+       unsigned long r12;
+       unsigned long r11;
+       unsigned long r10;
+       unsigned long r9;
+       unsigned long r8;
+       unsigned long r7;
+       unsigned long r6;
+       unsigned long r5;
+       unsigned long r4;
+       unsigned long r3;
+       unsigned long r2;
+       unsigned long r1;
+       unsigned long r0;
+       unsigned long mof;
+       unsigned long dccr;
+       unsigned long srp;
+       unsigned long irp; /* This is actually the debugged process' PC */
+       unsigned long csrinstr;
+       unsigned long csraddr;
+       unsigned long csrdata;
+};
+
+/* switch_stack is the extra stuff pushed onto the stack in _resume (entry.S)
+ * when doing a context-switch. it is used (apart from in resume) when a new
+ * thread is made and we need to make _resume (which is starting it for the
+ * first time) realise what is going on.
+ *
+ * Actually, the use is very close to the thread struct (TSS) in that both the
+ * switch_stack and the TSS are used to keep thread stuff when switching in
+ * _resume.
+ */
+
+struct switch_stack {
+       unsigned long r9;
+       unsigned long r8;
+       unsigned long r7;
+       unsigned long r6;
+       unsigned long r5;
+       unsigned long r4;
+       unsigned long r3;
+       unsigned long r2;
+       unsigned long r1;
+       unsigned long r0;
+       unsigned long return_ip; /* ip that _resume will return to */
+};
+
+#ifdef __KERNEL__
+
+/* bit 8 is user-mode flag */
+#define user_mode(regs) (((regs)->dccr & 0x100) != 0)
+#define instruction_pointer(regs) ((regs)->irp)
+#define profile_pc(regs) instruction_pointer(regs)
+
+#endif  /*  __KERNEL__  */
+
+#endif
diff --git a/arch/cris/include/uapi/asm/ptrace_v32.h b/arch/cris/include/uapi/asm/ptrace_v32.h
new file mode 100644 (file)
index 0000000..19773d3
--- /dev/null
@@ -0,0 +1,118 @@
+#ifndef _CRIS_ARCH_PTRACE_H
+#define _CRIS_ARCH_PTRACE_H
+
+/* Register numbers in the ptrace system call interface */
+
+#define PT_ORIG_R10  0
+#define PT_R0        1
+#define PT_R1        2
+#define PT_R2        3
+#define PT_R3        4
+#define PT_R4        5
+#define PT_R5        6
+#define PT_R6        7
+#define PT_R7        8
+#define PT_R8        9
+#define PT_R9        10
+#define PT_R10       11
+#define PT_R11       12
+#define PT_R12       13
+#define PT_R13       14
+#define PT_ACR       15
+#define PT_SRS       16
+#define PT_MOF       17
+#define PT_SPC       18
+#define PT_CCS       19
+#define PT_SRP       20
+#define PT_ERP       21    /* This is actually the debugged process' PC */
+#define PT_EXS       22
+#define PT_EDA       23
+#define PT_USP       24    /* special case - USP is not in the pt_regs */
+#define PT_PPC       25    /* special case - pseudo PC */
+#define PT_BP        26    /* Base number for BP registers. */
+#define PT_BP_CTRL   26    /* BP control register. */
+#define PT_MAX       40
+
+/* Condition code bit numbers. */
+#define C_CCS_BITNR 0
+#define V_CCS_BITNR 1
+#define Z_CCS_BITNR 2
+#define N_CCS_BITNR 3
+#define X_CCS_BITNR 4
+#define I_CCS_BITNR 5
+#define U_CCS_BITNR 6
+#define P_CCS_BITNR 7
+#define R_CCS_BITNR 8
+#define S_CCS_BITNR 9
+#define M_CCS_BITNR 30
+#define Q_CCS_BITNR 31
+#define CCS_SHIFT   10 /* Shift count for each level in CCS */
+
+/* pt_regs not only specifies the format in the user-struct during
+ * ptrace but is also the frame format used in the kernel prologue/epilogues
+ * themselves
+ */
+
+struct pt_regs {
+       unsigned long orig_r10;
+       /* pushed by movem r13, [sp] in SAVE_ALL. */
+       unsigned long r0;
+       unsigned long r1;
+       unsigned long r2;
+       unsigned long r3;
+       unsigned long r4;
+       unsigned long r5;
+       unsigned long r6;
+       unsigned long r7;
+       unsigned long r8;
+       unsigned long r9;
+       unsigned long r10;
+       unsigned long r11;
+       unsigned long r12;
+       unsigned long r13;
+       unsigned long acr;
+       unsigned long srs;
+       unsigned long mof;
+       unsigned long spc;
+       unsigned long ccs;
+       unsigned long srp;
+       unsigned long erp; /* This is actually the debugged process' PC */
+       /* For debugging purposes; saved only when needed. */
+       unsigned long exs;
+       unsigned long eda;
+};
+
+/* switch_stack is the extra stuff pushed onto the stack in _resume (entry.S)
+ * when doing a context-switch. it is used (apart from in resume) when a new
+ * thread is made and we need to make _resume (which is starting it for the
+ * first time) realise what is going on.
+ *
+ * Actually, the use is very close to the thread struct (TSS) in that both the
+ * switch_stack and the TSS are used to keep thread stuff when switching in
+ * _resume.
+ */
+
+struct switch_stack {
+       unsigned long r0;
+       unsigned long r1;
+       unsigned long r2;
+       unsigned long r3;
+       unsigned long r4;
+       unsigned long r5;
+       unsigned long r6;
+       unsigned long r7;
+       unsigned long r8;
+       unsigned long r9;
+       unsigned long return_ip; /* ip that _resume will return to */
+};
+
+#ifdef __KERNEL__
+
+#define arch_has_single_step() (1)
+#define user_mode(regs) (((regs)->ccs & (1 << (U_CCS_BITNR + CCS_SHIFT))) != 0)
+#define instruction_pointer(regs) ((regs)->erp)
+#define profile_pc(regs) instruction_pointer(regs)
+
+#endif  /*  __KERNEL__  */
+
+#endif
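
The PT_* indices above are the user-visible register numbers for the ptrace user area; on CRIS the byte offset passed to PTRACE_PEEKUSER/POKEUSER is the index scaled by the 4-byte register size. A hedged user-space sketch reading the traced process' PC (ERP) on CRISv32 follows; the tracing setup is elided and the 4-byte scaling is an assumption carried over from the existing CRIS ptrace code, not something introduced by this diff:

#include <sys/ptrace.h>
#include <sys/types.h>
#include <stdio.h>
#include <errno.h>

#define PT_ERP 21                       /* "the debugged process' PC" */

long read_traced_pc(pid_t pid)
{
        long pc;

        errno = 0;
        /* User-area offset = register index * 4 bytes on CRIS (assumed). */
        pc = ptrace(PTRACE_PEEKUSER, pid, (void *)(PT_ERP * 4), NULL);
        if (pc == -1 && errno)
                perror("PTRACE_PEEKUSER");
        return pc;
}
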
diff --git a/arch/cris/include/uapi/asm/resource.h b/arch/cris/include/uapi/asm/resource.h
deleted file mode 100644 (file)
index b5d2944..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _CRIS_RESOURCE_H
-#define _CRIS_RESOURCE_H
-
-#include <asm-generic/resource.h>
-
-#endif
diff --git a/arch/cris/include/uapi/asm/sembuf.h b/arch/cris/include/uapi/asm/sembuf.h
deleted file mode 100644 (file)
index 7fed984..0000000
+++ /dev/null
@@ -1,25 +0,0 @@
-#ifndef _CRIS_SEMBUF_H
-#define _CRIS_SEMBUF_H
-
-/* 
- * The semid64_ds structure for CRIS architecture.
- * Note extra padding because this structure is passed back and forth
- * between kernel and user space.
- *
- * Pad space is left for:
- * - 64-bit time_t to solve y2038 problem
- * - 2 miscellaneous 32-bit values
- */
-
-struct semid64_ds {
-       struct ipc64_perm sem_perm;             /* permissions .. see ipc.h */
-       __kernel_time_t sem_otime;              /* last semop time */
-       unsigned long   __unused1;
-       __kernel_time_t sem_ctime;              /* last change time */
-       unsigned long   __unused2;
-       unsigned long   sem_nsems;              /* no. of semaphores in array */
-       unsigned long   __unused3;
-       unsigned long   __unused4;
-};
-
-#endif /* _CRIS_SEMBUF_H */
diff --git a/arch/cris/include/uapi/asm/shmbuf.h b/arch/cris/include/uapi/asm/shmbuf.h
deleted file mode 100644 (file)
index 3239e3f..0000000
+++ /dev/null
@@ -1,42 +0,0 @@
-#ifndef _CRIS_SHMBUF_H
-#define _CRIS_SHMBUF_H
-
-/* 
- * The shmid64_ds structure for CRIS architecture (same as for i386)
- * Note extra padding because this structure is passed back and forth
- * between kernel and user space.
- *
- * Pad space is left for:
- * - 64-bit time_t to solve y2038 problem
- * - 2 miscellaneous 32-bit values
- */
-
-struct shmid64_ds {
-       struct ipc64_perm       shm_perm;       /* operation perms */
-       size_t                  shm_segsz;      /* size of segment (bytes) */
-       __kernel_time_t         shm_atime;      /* last attach time */
-       unsigned long           __unused1;
-       __kernel_time_t         shm_dtime;      /* last detach time */
-       unsigned long           __unused2;
-       __kernel_time_t         shm_ctime;      /* last change time */
-       unsigned long           __unused3;
-       __kernel_pid_t          shm_cpid;       /* pid of creator */
-       __kernel_pid_t          shm_lpid;       /* pid of last operator */
-       unsigned long           shm_nattch;     /* no. of current attaches */
-       unsigned long           __unused4;
-       unsigned long           __unused5;
-};
-
-struct shminfo64 {
-       unsigned long   shmmax;
-       unsigned long   shmmin;
-       unsigned long   shmmni;
-       unsigned long   shmseg;
-       unsigned long   shmall;
-       unsigned long   __unused1;
-       unsigned long   __unused2;
-       unsigned long   __unused3;
-       unsigned long   __unused4;
-};
-
-#endif /* _CRIS_SHMBUF_H */
diff --git a/arch/cris/include/uapi/asm/siginfo.h b/arch/cris/include/uapi/asm/siginfo.h
deleted file mode 100644 (file)
index c1cd6d1..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _CRIS_SIGINFO_H
-#define _CRIS_SIGINFO_H
-
-#include <asm-generic/siginfo.h>
-
-#endif
diff --git a/arch/cris/include/uapi/asm/socket.h b/arch/cris/include/uapi/asm/socket.h
deleted file mode 100644 (file)
index e2503d9..0000000
+++ /dev/null
@@ -1,92 +0,0 @@
-#ifndef _ASM_SOCKET_H
-#define _ASM_SOCKET_H
-
-/* almost the same as asm-i386/socket.h */
-
-#include <asm/sockios.h>
-
-/* For setsockoptions(2) */
-#define SOL_SOCKET     1
-
-#define SO_DEBUG       1
-#define SO_REUSEADDR   2
-#define SO_TYPE                3
-#define SO_ERROR       4
-#define SO_DONTROUTE   5
-#define SO_BROADCAST   6
-#define SO_SNDBUF      7
-#define SO_RCVBUF      8
-#define SO_SNDBUFFORCE 32
-#define SO_RCVBUFFORCE 33
-#define SO_KEEPALIVE   9
-#define SO_OOBINLINE   10
-#define SO_NO_CHECK    11
-#define SO_PRIORITY    12
-#define SO_LINGER      13
-#define SO_BSDCOMPAT   14
-#define SO_REUSEPORT   15
-#define SO_PASSCRED    16
-#define SO_PEERCRED    17
-#define SO_RCVLOWAT    18
-#define SO_SNDLOWAT    19
-#define SO_RCVTIMEO    20
-#define SO_SNDTIMEO    21
-
-/* Security levels - as per NRL IPv6 - don't actually do anything */
-#define SO_SECURITY_AUTHENTICATION             22
-#define SO_SECURITY_ENCRYPTION_TRANSPORT       23
-#define SO_SECURITY_ENCRYPTION_NETWORK         24
-
-#define SO_BINDTODEVICE        25
-
-/* Socket filtering */
-#define SO_ATTACH_FILTER        26
-#define SO_DETACH_FILTER        27
-#define SO_GET_FILTER          SO_ATTACH_FILTER
-
-#define SO_PEERNAME            28
-#define SO_TIMESTAMP           29
-#define SCM_TIMESTAMP          SO_TIMESTAMP
-
-#define SO_ACCEPTCONN          30
-
-#define SO_PEERSEC             31
-#define SO_PASSSEC             34
-#define SO_TIMESTAMPNS         35
-#define SCM_TIMESTAMPNS                SO_TIMESTAMPNS
-
-#define SO_MARK                        36
-
-#define SO_TIMESTAMPING                37
-#define SCM_TIMESTAMPING       SO_TIMESTAMPING
-
-#define SO_PROTOCOL            38
-#define SO_DOMAIN              39
-
-#define SO_RXQ_OVFL             40
-
-#define SO_WIFI_STATUS         41
-#define SCM_WIFI_STATUS                SO_WIFI_STATUS
-#define SO_PEEK_OFF            42
-
-/* Instruct lower device to use last 4-bytes of skb data as FCS */
-#define SO_NOFCS               43
-
-#define SO_LOCK_FILTER         44
-
-#define SO_SELECT_ERR_QUEUE    45
-
-#define SO_BUSY_POLL           46
-
-#define SO_MAX_PACING_RATE     47
-
-#define SO_BPF_EXTENSIONS      48
-
-#define SO_INCOMING_CPU                49
-
-#define SO_ATTACH_BPF          50
-#define SO_DETACH_BPF          SO_DETACH_FILTER
-
-#endif /* _ASM_SOCKET_H */
-
-
diff --git a/arch/cris/include/uapi/asm/sockios.h b/arch/cris/include/uapi/asm/sockios.h
deleted file mode 100644 (file)
index cfe7bfe..0000000
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef __ARCH_CRIS_SOCKIOS__
-#define __ARCH_CRIS_SOCKIOS__
-
-/* Socket-level I/O control calls. */
-#define FIOSETOWN      0x8901
-#define SIOCSPGRP      0x8902
-#define FIOGETOWN      0x8903
-#define SIOCGPGRP      0x8904
-#define SIOCATMARK     0x8905
-#define SIOCGSTAMP     0x8906          /* Get stamp (timeval) */
-#define SIOCGSTAMPNS   0x8907          /* Get stamp (timespec) */
-
-#endif
diff --git a/arch/cris/include/uapi/asm/statfs.h b/arch/cris/include/uapi/asm/statfs.h
deleted file mode 100644 (file)
index fdaf921..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _CRIS_STATFS_H
-#define _CRIS_STATFS_H
-
-#include <asm-generic/statfs.h>
-
-#endif
diff --git a/arch/cris/include/uapi/asm/types.h b/arch/cris/include/uapi/asm/types.h
deleted file mode 100644 (file)
index 9ec9d4c..0000000
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/int-ll64.h>
index f3287face443b4094a35231c972912970178e1ec..062b648b27e1b1c707d620cce8615b766e2d30d9 100644 (file)
 #define __NR_process_vm_writev 349
 #define __NR_kcmp              350
 #define __NR_finit_module      351
+#define __NR_sched_setattr     352
+#define __NR_sched_getattr     353
+#define __NR_renameat2         354
+#define __NR_seccomp           355
+#define __NR_getrandom         356
+#define __NR_memfd_create      357
+#define __NR_bpf               358
+#define __NR_execveat          359
 
 #endif /* _UAPI_ASM_CRIS_UNISTD_H_ */
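
Wiring up the numbers means user space can reach the new calls immediately via syscall(2), even before libc grows wrappers. An illustrative use of the freshly assigned getrandom number (356 per this diff; other architectures use different numbers, and with a current glibc you would call getrandom() directly):

#include <unistd.h>
#include <sys/syscall.h>
#include <stdio.h>

#ifndef __NR_getrandom
#define __NR_getrandom 356      /* CRIS numbering from this change */
#endif

int main(void)
{
        unsigned char buf[16];
        long n = syscall(__NR_getrandom, buf, sizeof(buf), 0);

        printf("getrandom: %ld bytes\n", n);
        return 0;
}
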
index edef71f12bb8860c147dfb1ef703d817cef33d9d..5fae398ca9152749155b4e5f3dfd0209a9cab832 100644 (file)
@@ -8,6 +8,7 @@ extra-y := vmlinux.lds
 
 obj-y   := process.o traps.o irq.o ptrace.o setup.o time.o sys_cris.o
 obj-y += devicetree.o
+obj-y += stacktrace.o
 
 obj-$(CONFIG_MODULES)    += crisksyms.o
 obj-$(CONFIG_MODULES)   += module.o
index dd0be5de55d5b6ea27a2410b984e3bba0e5b9e72..694850e8f077afe439960ed173830c274dd7d718 100644 (file)
 asmlinkage void do_IRQ(int irq, struct pt_regs * regs)
 {
        unsigned long sp;
-       struct pt_regs *old_regs = set_irq_regs(regs);
+       struct pt_regs *old_regs;
+
+       trace_hardirqs_off();
+
+       old_regs = set_irq_regs(regs);
        irq_enter();
        sp = rdsp();
        if (unlikely((sp & (PAGE_SIZE - 1)) < (PAGE_SIZE/8))) {
diff --git a/arch/cris/kernel/stacktrace.c b/arch/cris/kernel/stacktrace.c
new file mode 100644 (file)
index 0000000..99838c7
--- /dev/null
@@ -0,0 +1,76 @@
+#include <linux/sched.h>
+#include <linux/stacktrace.h>
+#include <asm/stacktrace.h>
+
+void walk_stackframe(unsigned long sp,
+                    int (*fn)(unsigned long addr, void *data),
+                    void *data)
+{
+       unsigned long high = ALIGN(sp, THREAD_SIZE);
+
+       for (; sp <= high - 4; sp += 4) {
+               unsigned long addr = *(unsigned long *) sp;
+
+               if (!kernel_text_address(addr))
+                       continue;
+
+               if (fn(addr, data))
+                       break;
+       }
+}
+
+struct stack_trace_data {
+       struct stack_trace *trace;
+       unsigned int no_sched_functions;
+       unsigned int skip;
+};
+
+#ifdef CONFIG_STACKTRACE
+
+static int save_trace(unsigned long addr, void *d)
+{
+       struct stack_trace_data *data = d;
+       struct stack_trace *trace = data->trace;
+
+       if (data->no_sched_functions && in_sched_functions(addr))
+               return 0;
+
+       if (data->skip) {
+               data->skip--;
+               return 0;
+       }
+
+       trace->entries[trace->nr_entries++] = addr;
+
+       return trace->nr_entries >= trace->max_entries;
+}
+
+void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
+{
+       struct stack_trace_data data;
+       unsigned long sp;
+
+       data.trace = trace;
+       data.skip = trace->skip;
+
+       if (tsk != current) {
+               data.no_sched_functions = 1;
+               sp = tsk->thread.ksp;
+       } else {
+               data.no_sched_functions = 0;
+               sp = rdsp();
+       }
+
+       walk_stackframe(sp, save_trace, &data);
+       if (trace->nr_entries < trace->max_entries)
+               trace->entries[trace->nr_entries++] = ULONG_MAX;
+}
+
+void save_stack_trace(struct stack_trace *trace)
+{
+       save_stack_trace_tsk(current, trace);
+}
+EXPORT_SYMBOL_GPL(save_stack_trace);
+
+#endif /* CONFIG_STACKTRACE */
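
walk_stackframe() does not follow frame pointers: it scans every word from sp up to the THREAD_SIZE-aligned top of the kernel stack and hands anything that passes kernel_text_address() to the callback, so the resulting traces are conservative and may include stale return addresses that still sit on the stack. A typical consumer of the new save_stack_trace() looks roughly like this sketch (the buffer size and the caller are illustrative, not taken from this diff):

#include <linux/stacktrace.h>

static void dump_my_trace(void)
{
        static unsigned long entries[16];
        struct stack_trace trace = {
                .entries     = entries,
                .max_entries = 16,      /* capacity of entries[] */
                .skip        = 1,       /* drop this helper itself */
        };

        save_stack_trace(&trace);       /* walk the current task's stack */
        print_stack_trace(&trace, 0);   /* printk the collected entries */
}
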
index 704274127c07e9f5c67e074264d82bf08018ddbf..c4f2cfcb117bd6a6b9f1844c3a3313fa5e6be94a 100644 (file)
@@ -70,5 +70,5 @@ void decompress_kernel(void)
        free_mem_ptr = (unsigned long)&_end;
        free_mem_end_ptr = free_mem_ptr + HEAP_SIZE;
 
-       decompress(input_data, input_len, NULL, NULL, output, NULL, error);
+       __decompress(input_data, input_len, NULL, NULL, output, 0, NULL, error);
 }
index 6e67a90902f2894293281d2c9df4070128a880bb..d9b5b806afe6fcecfaf6c727c1a09310ed8867a1 100644 (file)
@@ -1,8 +1,6 @@
 #ifndef _H8300_DMA_MAPPING_H
 #define _H8300_DMA_MAPPING_H
 
-#include <asm-generic/dma-coherent.h>
-
 extern struct dma_map_ops h8300_dma_map_ops;
 
 static inline struct dma_map_ops *get_dma_ops(struct device *dev)
@@ -12,46 +10,4 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 
 #include <asm-generic/dma-mapping-common.h>
 
-static inline int dma_supported(struct device *dev, u64 mask)
-{
-       return 0;
-}
-
-static inline int dma_set_mask(struct device *dev, u64 mask)
-{
-       return 0;
-}
-
-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
-#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
-
-#define dma_alloc_coherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                   dma_addr_t *dma_handle, gfp_t flag,
-                                   struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-       void *memory;
-
-       memory = ops->alloc(dev, size, dma_handle, flag, attrs);
-       return memory;
-}
-
-#define dma_free_coherent(d, s, c, h) dma_free_attrs(d, s, c, h, NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                 void *cpu_addr, dma_addr_t dma_handle,
-                                 struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       ops->free(dev, size, cpu_addr, dma_handle, attrs);
-}
-
-static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-       return 0;
-}
-
 #endif
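
The hand-rolled dma_alloc_attrs()/dma_free_attrs()/dma_mapping_error() wrappers (and the stub dma_supported()/dma_set_mask()) can go because <asm-generic/dma-mapping-common.h> now provides those entry points generically on top of get_dma_ops(). Driver code is unaffected; a typical allocation still reads as in the following sketch (the helper name and the surrounding driver are illustrative):

#include <linux/dma-mapping.h>
#include <linux/gfp.h>

/* Sketch: 'dev' is whatever struct device the driver already owns. */
static void *alloc_ring_buffer(struct device *dev, size_t size,
                               dma_addr_t *dma_handle)
{
        void *cpu_addr = dma_alloc_coherent(dev, size, dma_handle, GFP_KERNEL);

        if (!cpu_addr)
                return NULL;            /* allocation failed */

        /* ... hand *dma_handle to the device, use cpu_addr from the CPU ... */
        return cpu_addr;
}
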
index 16965427f6b4827d1b9807fe98248f80540e910b..268fde8a45756e580ef06da3a2b051c4fb2e9f58 100644 (file)
 
 struct device;
 extern int bad_dma_address;
+#define DMA_ERROR_CODE bad_dma_address
 
 extern struct dma_map_ops *dma_ops;
 
-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
-#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
-
 static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 {
        if (unlikely(dev == NULL))
@@ -45,8 +43,8 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
        return dma_ops;
 }
 
+#define HAVE_ARCH_DMA_SUPPORTED 1
 extern int dma_supported(struct device *dev, u64 mask);
-extern int dma_set_mask(struct device *dev, u64 mask);
 extern int dma_is_consistent(struct device *dev, dma_addr_t dma_handle);
 extern void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
                           enum dma_data_direction direction);
@@ -60,47 +58,4 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
        return addr + size - 1 <= *dev->dma_mask;
 }
 
-static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-       struct dma_map_ops *dma_ops = get_dma_ops(dev);
-
-       if (dma_ops->mapping_error)
-               return dma_ops->mapping_error(dev, dma_addr);
-
-       return (dma_addr == bad_dma_address);
-}
-
-#define dma_alloc_coherent(d,s,h,f)    dma_alloc_attrs(d,s,h,f,NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                   dma_addr_t *dma_handle, gfp_t flag,
-                                   struct dma_attrs *attrs)
-{
-       void *ret;
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       BUG_ON(!dma_ops);
-
-       ret = ops->alloc(dev, size, dma_handle, flag, attrs);
-
-       debug_dma_alloc_coherent(dev, size, *dma_handle, ret);
-
-       return ret;
-}
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                 void *cpu_addr, dma_addr_t dma_handle,
-                                 struct dma_attrs *attrs)
-{
-       struct dma_map_ops *dma_ops = get_dma_ops(dev);
-
-       BUG_ON(!dma_ops);
-
-       dma_ops->free(dev, size, cpu_addr, dma_handle, attrs);
-
-       debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
-}
-
 #endif
index 98106e55ad4ff73299a156b87e527b29bc6384b1..24b998888916a663659f0f4f8fa0befc88f40cdd 100644 (file)
@@ -19,8 +19,6 @@
 #ifndef _ASM_SIGNAL_H
 #define _ASM_SIGNAL_H
 
-#include <uapi/asm/registers.h>
-
 extern unsigned long __rt_sigtramp_template[2];
 
 void do_signal(struct pt_regs *regs);
index b74f9bae31a3b9e81204d3c86b51acd3dcfe65c6..9e3ddf792bd3e00afc44afc567e52cf290b8c54f 100644 (file)
@@ -44,17 +44,6 @@ int dma_supported(struct device *dev, u64 mask)
 }
 EXPORT_SYMBOL(dma_supported);
 
-int dma_set_mask(struct device *dev, u64 mask)
-{
-       if (!dev->dma_mask || !dma_supported(dev, mask))
-               return -EIO;
-
-       *dev->dma_mask = mask;
-
-       return 0;
-}
-EXPORT_SYMBOL(dma_set_mask);
-
 static struct gen_pool *coherent_pool;
 
 
index 17fbf45bf1502ad4c7212346e9333b95d2914937..a6a1d1f8309a40918e0e884715a09380d45c6d18 100644 (file)
@@ -97,20 +97,6 @@ static int set_next_event(unsigned long delta, struct clock_event_device *evt)
        return 0;
 }
 
-/*
- * Sets the mode (periodic, shutdown, oneshot, etc) of a timer.
- */
-static void set_mode(enum clock_event_mode mode,
-       struct clock_event_device *evt)
-{
-       switch (mode) {
-       case CLOCK_EVT_MODE_SHUTDOWN:
-               /* XXX implement me */
-       default:
-               break;
-       }
-}
-
 #ifdef CONFIG_SMP
 /*  Broadcast mechanism  */
 static void broadcast(const struct cpumask *mask)
@@ -119,13 +105,13 @@ static void broadcast(const struct cpumask *mask)
 }
 #endif
 
+/* XXX Implement set_state_shutdown() */
 static struct clock_event_device hexagon_clockevent_dev = {
        .name           = "clockevent",
        .features       = CLOCK_EVT_FEAT_ONESHOT,
        .rating         = 400,
        .irq            = RTOS_TIMER_INT,
        .set_next_event = set_next_event,
-       .set_mode       = set_mode,
 #ifdef CONFIG_SMP
        .broadcast      = broadcast,
 #endif
@@ -146,7 +132,6 @@ void setup_percpu_clockdev(void)
 
        dummy_clock_dev->features = CLOCK_EVT_FEAT_DUMMY;
        dummy_clock_dev->cpumask = cpumask_of(cpu);
-       dummy_clock_dev->mode = CLOCK_EVT_MODE_UNUSED;
 
        clockevents_register_device(dummy_clock_dev);
 }
index 42a91a7aa2b08fa3a9ba4f1de06e07fdb47bfa9d..eb0249e3798112615fd5774d6f30229aa6241e53 100644 (file)
@@ -518,6 +518,7 @@ source "drivers/sn/Kconfig"
 config KEXEC
        bool "kexec system call"
        depends on !IA64_HP_SIM && (!SMP || HOTPLUG_CPU)
+       select KEXEC_CORE
        help
          kexec is a system call that implements the ability to shutdown your
          current kernel, and to start another kernel.  It is like a reboot
index 344387a554066b7c7b455c08ff583ede09d82625..a6d6190c9d24c01b3878b44649911e34322c1894 100644 (file)
@@ -1140,13 +1140,9 @@ sba_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
 
 #ifdef CONFIG_NUMA
        {
-               int node = ioc->node;
                struct page *page;
 
-               if (node == NUMA_NO_NODE)
-                       node = numa_node_id();
-
-               page = alloc_pages_exact_node(node, flags, get_order(size));
+               page = alloc_pages_node(ioc->node, flags, get_order(size));
                if (unlikely(!page))
                        return NULL;
 
index cf3ab7e784b5474be705c6e091431f5538b1ee00..9beccf8010bd6bf8eaa64ab292a753449a0ca609 100644 (file)
@@ -23,60 +23,10 @@ extern void machvec_dma_sync_single(struct device *, dma_addr_t, size_t,
 extern void machvec_dma_sync_sg(struct device *, struct scatterlist *, int,
                                enum dma_data_direction);
 
-#define dma_alloc_coherent(d,s,h,f)    dma_alloc_attrs(d,s,h,f,NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                   dma_addr_t *daddr, gfp_t gfp,
-                                   struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = platform_dma_get_ops(dev);
-       void *caddr;
-
-       caddr = ops->alloc(dev, size, daddr, gfp, attrs);
-       debug_dma_alloc_coherent(dev, size, *daddr, caddr);
-       return caddr;
-}
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                 void *caddr, dma_addr_t daddr,
-                                 struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = platform_dma_get_ops(dev);
-       debug_dma_free_coherent(dev, size, caddr, daddr);
-       ops->free(dev, size, caddr, daddr, attrs);
-}
-
-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
-#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
-
 #define get_dma_ops(dev) platform_dma_get_ops(dev)
 
 #include <asm-generic/dma-mapping-common.h>
 
-static inline int dma_mapping_error(struct device *dev, dma_addr_t daddr)
-{
-       struct dma_map_ops *ops = platform_dma_get_ops(dev);
-       debug_dma_mapping_error(dev, daddr);
-       return ops->mapping_error(dev, daddr);
-}
-
-static inline int dma_supported(struct device *dev, u64 mask)
-{
-       struct dma_map_ops *ops = platform_dma_get_ops(dev);
-       return ops->dma_supported(dev, mask);
-}
-
-static inline int
-dma_set_mask (struct device *dev, u64 mask)
-{
-       if (!dev->dma_mask || !dma_supported(dev, mask))
-               return -EIO;
-       *dev->dma_mask = mask;
-       return 0;
-}
-
 static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
 {
        if (!dev->dma_mask)
index 80a7e34be00984a104923d933eafa7faa0d937fc..9041bbe2b7b42dfe344a7bfe8759ba493955ed94 100644 (file)
@@ -435,6 +435,7 @@ static inline void __iomem * ioremap_cache (unsigned long phys_addr, unsigned lo
 {
        return ioremap(phys_addr, size);
 }
+#define ioremap_cache ioremap_cache
 
 
 /*
index 95c39b95e97e24f1ed3d7a58cf56dbbefc2ff419..99c96a5e6016b50a951ba8dfccf609cb06962232 100644 (file)
@@ -11,7 +11,7 @@
 
 
 
-#define NR_syscalls                    319 /* length of syscall table */
+#define NR_syscalls                    321 /* length of syscall table */
 
 /*
  * The following defines stop scripts/checksyscalls.sh from complaining about
index 461079560c78728848b7631de5efbe700d146620..98e94e19a5a0870fc71b34f85310f475de85697b 100644 (file)
 #define __NR_memfd_create              1340
 #define __NR_bpf                       1341
 #define __NR_execveat                  1342
+#define __NR_userfaultfd               1343
+#define __NR_membarrier                        1344
 
 #endif /* _UAPI_ASM_IA64_UNISTD_H */
index 4826ff957a3d18f6e0266f55dd7519b190ac198e..5fa3848ba22497c34d68e4f0e3f4069f79c73d09 100644 (file)
@@ -4,7 +4,7 @@
 #include <linux/errno.h>
 #include <linux/timex.h>
 #include <linux/clocksource.h>
-#include <asm/io.h>
+#include <linux/io.h>
 
 /* IBM Summit (EXA) Cyclone counter code*/
 #define CYCLONE_CBAR_ADDR 0xFEB00CD0
index ae0de7bf55257682dc11a1cc1fa31bc6cffa3112..37cc7a65cd3ee1fc3304d862776f438db4158b4d 100644 (file)
@@ -1768,5 +1768,7 @@ sys_call_table:
        data8 sys_memfd_create                  // 1340
        data8 sys_bpf
        data8 sys_execveat
+       data8 sys_userfaultfd
+       data8 sys_membarrier
 
        .org sys_call_table + 8*NR_syscalls     // guard against failures to increase NR_syscalls
index 20e8a9b21d7519ebf7d506e4825d83c05629ba37..f3976da36721a94353dc68e6dd737dc3697f7aec 100644 (file)
@@ -97,7 +97,7 @@ static int uncached_add_chunk(struct uncached_pool *uc_pool, int nid)
 
        /* attempt to allocate a granule's worth of cached memory pages */
 
-       page = alloc_pages_exact_node(nid,
+       page = __alloc_pages_node(nid,
                                GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
                                IA64_GRANULE_SHIFT-PAGE_SHIFT);
        if (!page) {
index 97e48b0eefc7c18f54f0d1ee76860eed43a53c4e..1841ef69183d8742db20194334f248b88ffac4a4 100644 (file)
@@ -645,7 +645,7 @@ mem_init (void)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size)
+int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
 {
        pg_data_t *pgdat;
        struct zone *zone;
@@ -656,7 +656,7 @@ int arch_add_memory(int nid, u64 start, u64 size)
        pgdat = NODE_DATA(nid);
 
        zone = pgdat->node_zones +
-               zone_for_memory(nid, start, size, ZONE_NORMAL);
+               zone_for_memory(nid, start, size, ZONE_NORMAL, for_device);
        ret = __add_pages(nid, zone, start_pfn, nr_pages);
 
        if (ret)
index d0853e8e8623e46a4a5e7ad87072e7ca354af0a0..8f59907007cbe3d153329f58c9b67ae81cb8a824 100644 (file)
@@ -92,7 +92,7 @@ static void *sn_dma_alloc_coherent(struct device *dev, size_t size,
         */
        node = pcibus_to_node(pdev->bus);
        if (likely(node >=0)) {
-               struct page *p = alloc_pages_exact_node(node,
+               struct page *p = __alloc_pages_node(node,
                                                flags, get_order(size));
 
                if (likely(p))
index 28a09529f206915fd00633bf846dde48342b8b36..3a76927458681d3785e2ed558a88961ba5b18532 100644 (file)
@@ -86,6 +86,7 @@ decompress_kernel(int mmu_on, unsigned char *zimage_data,
        free_mem_end_ptr = free_mem_ptr + BOOT_HEAP_SIZE;
 
        puts("\nDecompressing Linux... ");
-       decompress(input_data, input_len, NULL, NULL, output_data, NULL, error);
+       __decompress(input_data, input_len, NULL, NULL, output_data, 0,
+                       NULL, error);
        puts("done.\nBooting the kernel.\n");
 }
index 2dd8f63bfbbb7850e7e797eb108f708eb1fc6d54..498b567f007b0a80d1905dc21e8fb503101b3724 100644 (file)
@@ -95,6 +95,7 @@ config MMU_SUN3
 config KEXEC
        bool "kexec system call"
        depends on M68KCLASSIC
+       select KEXEC_CORE
        help
          kexec is a system call that implements the ability to shutdown your
          current kernel, and to start another kernel.  It is like a reboot
index 075aaabd136026a408ee4560ee4870127c6b1b9d..f7836c6a6b60eb24981a5f9f53207a96bff7a6dc 100644 (file)
@@ -25,6 +25,7 @@
 #include <asm/m54xxgpt.h>
 #ifdef CONFIG_MMU
 #include <asm/mmu_context.h>
+#include <linux/pfn.h>
 #endif
 
 /***************************************************************************/
@@ -91,13 +92,13 @@ static void __init mcf54xx_bootmem_alloc(void)
        m68k_memory[0].size = _ramend - _rambase;
 
        /* compute total pages in system */
-       num_pages = (_ramend - _rambase) >> PAGE_SHIFT;
+       num_pages = PFN_DOWN(_ramend - _rambase);
 
        /* page numbers */
        memstart = PAGE_ALIGN(_ramstart);
-       min_low_pfn = _rambase >> PAGE_SHIFT;
-       start_pfn = memstart >> PAGE_SHIFT;
-       max_low_pfn = _ramend >> PAGE_SHIFT;
+       min_low_pfn = PFN_DOWN(_rambase);
+       start_pfn = PFN_DOWN(memstart);
+       max_low_pfn = PFN_DOWN(_ramend);
        high_memory = (void *)_ramend;
 
        m68k_virt_to_node_shift = fls(_ramend - _rambase - 1) - 6;
index 493b3111d4c12b6d96e139a2e96afb84d022f558..d86a9ffb3f13eadde3ee66380945b8932622dc93 100644 (file)
@@ -42,37 +42,28 @@ static u32 pit_cnt;
  * This is also called after resume to bring the PIT into operation again.
  */
 
-static void init_cf_pit_timer(enum clock_event_mode mode,
-                             struct clock_event_device *evt)
+static int cf_pit_set_periodic(struct clock_event_device *evt)
 {
-       switch (mode) {
-       case CLOCK_EVT_MODE_PERIODIC:
-
-               __raw_writew(MCFPIT_PCSR_DISABLE, TA(MCFPIT_PCSR));
-               __raw_writew(PIT_CYCLES_PER_JIFFY, TA(MCFPIT_PMR));
-               __raw_writew(MCFPIT_PCSR_EN | MCFPIT_PCSR_PIE | \
-                               MCFPIT_PCSR_OVW | MCFPIT_PCSR_RLD | \
-                               MCFPIT_PCSR_CLK64, TA(MCFPIT_PCSR));
-               break;
-
-       case CLOCK_EVT_MODE_SHUTDOWN:
-       case CLOCK_EVT_MODE_UNUSED:
-
-               __raw_writew(MCFPIT_PCSR_DISABLE, TA(MCFPIT_PCSR));
-               break;
-
-       case CLOCK_EVT_MODE_ONESHOT:
-
-               __raw_writew(MCFPIT_PCSR_DISABLE, TA(MCFPIT_PCSR));
-               __raw_writew(MCFPIT_PCSR_EN | MCFPIT_PCSR_PIE | \
-                               MCFPIT_PCSR_OVW | MCFPIT_PCSR_CLK64, \
-                               TA(MCFPIT_PCSR));
-               break;
-
-       case CLOCK_EVT_MODE_RESUME:
-               /* Nothing to do here */
-               break;
-       }
+       __raw_writew(MCFPIT_PCSR_DISABLE, TA(MCFPIT_PCSR));
+       __raw_writew(PIT_CYCLES_PER_JIFFY, TA(MCFPIT_PMR));
+       __raw_writew(MCFPIT_PCSR_EN | MCFPIT_PCSR_PIE |
+                    MCFPIT_PCSR_OVW | MCFPIT_PCSR_RLD |
+                    MCFPIT_PCSR_CLK64, TA(MCFPIT_PCSR));
+       return 0;
+}
+
+static int cf_pit_set_oneshot(struct clock_event_device *evt)
+{
+       __raw_writew(MCFPIT_PCSR_DISABLE, TA(MCFPIT_PCSR));
+       __raw_writew(MCFPIT_PCSR_EN | MCFPIT_PCSR_PIE |
+                    MCFPIT_PCSR_OVW | MCFPIT_PCSR_CLK64, TA(MCFPIT_PCSR));
+       return 0;
+}
+
+static int cf_pit_shutdown(struct clock_event_device *evt)
+{
+       __raw_writew(MCFPIT_PCSR_DISABLE, TA(MCFPIT_PCSR));
+       return 0;
 }
 
 /*
@@ -88,12 +79,15 @@ static int cf_pit_next_event(unsigned long delta,
 }
 
 struct clock_event_device cf_pit_clockevent = {
-       .name           = "pit",
-       .features       = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
-       .set_mode       = init_cf_pit_timer,
-       .set_next_event = cf_pit_next_event,
-       .shift          = 32,
-       .irq            = MCF_IRQ_PIT1,
+       .name                   = "pit",
+       .features               = CLOCK_EVT_FEAT_PERIODIC |
+                                 CLOCK_EVT_FEAT_ONESHOT,
+       .set_state_shutdown     = cf_pit_shutdown,
+       .set_state_periodic     = cf_pit_set_periodic,
+       .set_state_oneshot      = cf_pit_set_oneshot,
+       .set_next_event         = cf_pit_next_event,
+       .shift                  = 32,
+       .irq                    = MCF_IRQ_PIT1,
 };
 
 
index 2901f0f7d944ec55dbc47a45adcd27824ec14ba0..a2269d60a945bc3cfc9f088afd0731228a27583b 100644 (file)
@@ -6,7 +6,7 @@
 
 #ifndef __ASSEMBLY__
 extern void mcount_wrapper(void);
-#define MCOUNT_ADDR            ((long)(mcount_wrapper))
+#define MCOUNT_ADDR            ((unsigned long)(mcount_wrapper))
 
 static inline unsigned long ftrace_call_adjust(unsigned long addr)
 {
index 4f8f1f87ef1160fb5cd88a2fc7e771682891b2df..a336094a7a6c943b43dda144b44054d83bb4f54e 100644 (file)
@@ -270,23 +270,25 @@ void migrate_irqs(void)
 
        for_each_active_irq(i) {
                struct irq_data *data = irq_get_irq_data(i);
+               struct cpumask *mask;
                unsigned int newcpu;
 
                if (irqd_is_per_cpu(data))
                        continue;
 
-               if (!cpumask_test_cpu(cpu, data->affinity))
+               mask = irq_data_get_affinity_mask(data);
+               if (!cpumask_test_cpu(cpu, mask))
                        continue;
 
-               newcpu = cpumask_any_and(data->affinity, cpu_online_mask);
+               newcpu = cpumask_any_and(mask, cpu_online_mask);
 
                if (newcpu >= nr_cpu_ids) {
                        pr_info_ratelimited("IRQ%u no longer affine to CPU%u\n",
                                            i, cpu);
 
-                       cpumask_setall(data->affinity);
+                       cpumask_setall(mask);
                }
-               irq_set_affinity(i, data->affinity);
+               irq_set_affinity(i, mask);
        }
 }
 #endif /* CONFIG_HOTPLUG_CPU */
index ab353723076a8d1dde001295cf49cb7ddb5f4195..24b12970c9cff772d24d8b581aeb945b8d917b6b 100644 (file)
@@ -27,7 +27,6 @@
 #include <linux/dma-debug.h>
 #include <linux/dma-attrs.h>
 #include <asm/io.h>
-#include <asm-generic/dma-coherent.h>
 #include <asm/cacheflush.h>
 
 #define DMA_ERROR_CODE         (~(dma_addr_t)0x0)
@@ -45,31 +44,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
        return &dma_direct_ops;
 }
 
-static inline int dma_supported(struct device *dev, u64 mask)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       if (unlikely(!ops))
-               return 0;
-       if (!ops->dma_supported)
-               return 1;
-       return ops->dma_supported(dev, mask);
-}
-
-static inline int dma_set_mask(struct device *dev, u64 dma_mask)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       if (unlikely(ops == NULL))
-               return -EIO;
-       if (ops->set_dma_mask)
-               return ops->set_dma_mask(dev, dma_mask);
-       if (!dev->dma_mask || !dma_supported(dev, dma_mask))
-               return -EIO;
-       *dev->dma_mask = dma_mask;
-       return 0;
-}
-
 #include <asm-generic/dma-mapping-common.h>
 
 static inline void __dma_sync(unsigned long paddr,
@@ -88,50 +62,6 @@ static inline void __dma_sync(unsigned long paddr,
        }
 }
 
-static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       debug_dma_mapping_error(dev, dma_addr);
-       if (ops->mapping_error)
-               return ops->mapping_error(dev, dma_addr);
-
-       return (dma_addr == DMA_ERROR_CODE);
-}
-
-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
-#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
-
-#define dma_alloc_coherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                   dma_addr_t *dma_handle, gfp_t flag,
-                                   struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-       void *memory;
-
-       BUG_ON(!ops);
-
-       memory = ops->alloc(dev, size, dma_handle, flag, attrs);
-
-       debug_dma_alloc_coherent(dev, size, *dma_handle, memory);
-       return memory;
-}
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d, s, c, h, NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                 void *cpu_addr, dma_addr_t dma_handle,
-                                 struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       BUG_ON(!ops);
-       debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
-       ops->free(dev, size, cpu_addr, dma_handle, attrs);
-}
-
 static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
                enum dma_data_direction direction)
 {
index fd2fa2eca62f18fe2541300a8ffc5453632760d7..da0144f40d99d5bc5590838dec4b59e07e0a9b31 100644 (file)
@@ -3,7 +3,7 @@
 
 #ifdef CONFIG_FUNCTION_TRACER
 
-#define MCOUNT_ADDR            ((long)(_mcount))
+#define MCOUNT_ADDR            ((unsigned long)(_mcount))
 #define MCOUNT_INSN_SIZE       8 /* sizeof mcount call */
 
 #ifndef __ASSEMBLY__
index be1731d5e2fa2b600fb17acabe2622b04a0c65fb..e9bcdb6e0086ba2b911621700b7712bc5b442057 100644 (file)
 #ifndef _UAPI_ASM_MICROBLAZE_ELF_H
 #define _UAPI_ASM_MICROBLAZE_ELF_H
 
+#include <linux/elf-em.h>
+
 /*
  * Note there is no "official" ELF designation for Microblaze.
  * I've snaffled the value from the microblaze binutils source code
  * /binutils/microblaze/include/elf/microblaze.h
  */
-#define EM_MICROBLAZE          189
 #define EM_MICROBLAZE_OLD      0xbaab
 #define ELF_ARCH               EM_MICROBLAZE
 
index 752acca8de1fa9f6f73aaf1d218840c04cd1abc7..e3aa5b0b4ef17771fbd2afa1557f29ee6a7a2b3d 100644 (file)
@@ -2597,6 +2597,7 @@ source "kernel/Kconfig.preempt"
 
 config KEXEC
        bool "Kexec system call"
+       select KEXEC_CORE
        help
          kexec is a system call that implements the ability to shutdown your
          current kernel, and to start another kernel.  It is like a reboot
index 54831069a206249444b31e40c4c749a37a667aa2..080cd53bac369158481785fd285733e7d5372e8c 100644 (file)
@@ -111,8 +111,8 @@ void decompress_kernel(unsigned long boot_heap_start)
        puts("\n");
 
        /* Decompress the kernel with according algorithm */
-       decompress((char *)zimage_start, zimage_size, 0, 0,
-                  (void *)VMLINUX_LOAD_ADDRESS_ULL, 0, error);
+       __decompress((char *)zimage_start, zimage_size, 0, 0,
+                  (void *)VMLINUX_LOAD_ADDRESS_ULL, 0, 0, error);
 
        /* FIXME: should we flush cache here? */
        puts("Now, booting the kernel...\n");
index d8960d46417b07ec5e3b0603b2f934efe4c1f54c..2cd45f5f9481cec75b8e32e384bdcc1396ffeb9b 100644 (file)
@@ -161,9 +161,6 @@ static void *octeon_dma_alloc_coherent(struct device *dev, size_t size,
 {
        void *ret;
 
-       if (dma_alloc_from_coherent(dev, size, dma_handle, &ret))
-               return ret;
-
        /* ignore region specifiers */
        gfp &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM);
 
@@ -194,11 +191,6 @@ static void *octeon_dma_alloc_coherent(struct device *dev, size_t size,
 static void octeon_dma_free_coherent(struct device *dev, size_t size,
        void *vaddr, dma_addr_t dma_handle, struct dma_attrs *attrs)
 {
-       int order = get_order(size);
-
-       if (dma_release_from_coherent(dev, order, vaddr))
-               return;
-
        swiotlb_free_coherent(dev, size, vaddr, dma_handle);
 }
 
index 1646cce032c34aa17d41dd8c51fe25a6d803d6ac..642b50946943ccebd0fc24b9e4b6cc06c4a90d63 100644 (file)
@@ -320,7 +320,6 @@ CONFIG_KEYS=y
 CONFIG_SECURITY=y
 CONFIG_SECURITY_NETWORK=y
 CONFIG_SECURITY_YAMA=y
-CONFIG_SECURITY_YAMA_STACKED=y
 CONFIG_DEFAULT_SECURITY_DAC=y
 CONFIG_CRYPTO_AUTHENC=y
 CONFIG_CRYPTO_HMAC=y
index 360b3387182af251713106cb304ac85ca78c5ca5..e604f760c4a076b44255b312b6f45180f06200c4 100644 (file)
@@ -4,7 +4,6 @@
 #include <linux/scatterlist.h>
 #include <asm/dma-coherence.h>
 #include <asm/cache.h>
-#include <asm-generic/dma-coherent.h>
 
 #ifndef CONFIG_SGI_IP27 /* Kludge to fix 2.6.39 build for IP27 */
 #include <dma-coherence.h>
@@ -32,73 +31,7 @@ static inline void dma_mark_clean(void *addr, size_t size) {}
 
 #include <asm-generic/dma-mapping-common.h>
 
-static inline int dma_supported(struct device *dev, u64 mask)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-       return ops->dma_supported(dev, mask);
-}
-
-static inline int dma_mapping_error(struct device *dev, u64 mask)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       debug_dma_mapping_error(dev, mask);
-       return ops->mapping_error(dev, mask);
-}
-
-static inline int
-dma_set_mask(struct device *dev, u64 mask)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       if(!dev->dma_mask || !dma_supported(dev, mask))
-               return -EIO;
-
-       if (ops->set_dma_mask)
-               return ops->set_dma_mask(dev, mask);
-
-       *dev->dma_mask = mask;
-
-       return 0;
-}
-
 extern void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
               enum dma_data_direction direction);
 
-#define dma_alloc_coherent(d,s,h,f)    dma_alloc_attrs(d,s,h,f,NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                   dma_addr_t *dma_handle, gfp_t gfp,
-                                   struct dma_attrs *attrs)
-{
-       void *ret;
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       ret = ops->alloc(dev, size, dma_handle, gfp, attrs);
-
-       debug_dma_alloc_coherent(dev, size, *dma_handle, ret);
-
-       return ret;
-}
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                 void *vaddr, dma_addr_t dma_handle,
-                                 struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       ops->free(dev, size, vaddr, dma_handle, attrs);
-
-       debug_dma_free_coherent(dev, size, vaddr, dma_handle);
-}
-
-
-void *dma_alloc_noncoherent(struct device *dev, size_t size,
-                          dma_addr_t *dma_handle, gfp_t flag);
-
-void dma_free_noncoherent(struct device *dev, size_t size,
-                        void *vaddr, dma_addr_t dma_handle);
-
 #endif /* _ASM_DMA_MAPPING_H */
index 2c6b989c1bc4054c354b65fc0fbded4ac8c52b66..4ffa6fc81c8f78acaf24ae15849ee475a814d37a 100644 (file)
@@ -14,9 +14,6 @@ static void *loongson_dma_alloc_coherent(struct device *dev, size_t size,
 {
        void *ret;
 
-       if (dma_alloc_from_coherent(dev, size, dma_handle, &ret))
-               return ret;
-
        /* ignore region specifiers */
        gfp &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM);
 
@@ -46,11 +43,6 @@ static void *loongson_dma_alloc_coherent(struct device *dev, size_t size,
 static void loongson_dma_free_coherent(struct device *dev, size_t size,
                void *vaddr, dma_addr_t dma_handle, struct dma_attrs *attrs)
 {
-       int order = get_order(size);
-
-       if (dma_release_from_coherent(dev, order, vaddr))
-               return;
-
        swiotlb_free_coherent(dev, size, vaddr, dma_handle);
 }
 
@@ -93,6 +85,9 @@ static void loongson_dma_sync_sg_for_device(struct device *dev,
 
 static int loongson_dma_set_mask(struct device *dev, u64 mask)
 {
+       if (!dev->dma_mask || !dma_supported(dev, mask))
+               return -EIO;
+
        if (mask > DMA_BIT_MASK(loongson_sysconf.dma_mask_bits)) {
                *dev->dma_mask = DMA_BIT_MASK(loongson_sysconf.dma_mask_bits);
                return -EIO;
index 8f23cf08f4baa68d4d73b93b62a396867085f877..a914dc1cb6d1bc339cf44cc0c5aeac887a2e5f74 100644 (file)
@@ -112,7 +112,7 @@ static gfp_t massage_gfp_flags(const struct device *dev, gfp_t gfp)
        return gfp | dma_flag;
 }
 
-void *dma_alloc_noncoherent(struct device *dev, size_t size,
+static void *mips_dma_alloc_noncoherent(struct device *dev, size_t size,
        dma_addr_t * dma_handle, gfp_t gfp)
 {
        void *ret;
@@ -128,7 +128,6 @@ void *dma_alloc_noncoherent(struct device *dev, size_t size,
 
        return ret;
 }
-EXPORT_SYMBOL(dma_alloc_noncoherent);
 
 static void *mips_dma_alloc_coherent(struct device *dev, size_t size,
        dma_addr_t * dma_handle, gfp_t gfp, struct dma_attrs *attrs)
@@ -137,8 +136,12 @@ static void *mips_dma_alloc_coherent(struct device *dev, size_t size,
        struct page *page = NULL;
        unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
 
-       if (dma_alloc_from_coherent(dev, size, dma_handle, &ret))
-               return ret;
+       /*
+        * XXX: seems like the coherent and non-coherent implementations could
+        * be consolidated.
+        */
+       if (dma_get_attr(DMA_ATTR_NON_CONSISTENT, attrs))
+               return mips_dma_alloc_noncoherent(dev, size, dma_handle, gfp);
 
        gfp = massage_gfp_flags(dev, gfp);
 
@@ -164,24 +167,24 @@ static void *mips_dma_alloc_coherent(struct device *dev, size_t size,
 }
 
 
-void dma_free_noncoherent(struct device *dev, size_t size, void *vaddr,
-       dma_addr_t dma_handle)
+static void mips_dma_free_noncoherent(struct device *dev, size_t size,
+               void *vaddr, dma_addr_t dma_handle)
 {
        plat_unmap_dma_mem(dev, dma_handle, size, DMA_BIDIRECTIONAL);
        free_pages((unsigned long) vaddr, get_order(size));
 }
-EXPORT_SYMBOL(dma_free_noncoherent);
 
 static void mips_dma_free_coherent(struct device *dev, size_t size, void *vaddr,
        dma_addr_t dma_handle, struct dma_attrs *attrs)
 {
        unsigned long addr = (unsigned long) vaddr;
-       int order = get_order(size);
        unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
        struct page *page = NULL;
 
-       if (dma_release_from_coherent(dev, order, vaddr))
+       if (dma_get_attr(DMA_ATTR_NON_CONSISTENT, attrs)) {
+               mips_dma_free_noncoherent(dev, size, vaddr, dma_handle);
                return;
+       }
 
        plat_unmap_dma_mem(dev, dma_handle, size, DMA_BIDIRECTIONAL);
 
index f3d4ae87abc7ffebcb2dc1f3827da069b8d12c02..3758715d4ab671af54399d8e2cfe1eeade8a41ac 100644 (file)
@@ -47,11 +47,6 @@ static char *nlm_swiotlb;
 static void *nlm_dma_alloc_coherent(struct device *dev, size_t size,
        dma_addr_t *dma_handle, gfp_t gfp, struct dma_attrs *attrs)
 {
-       void *ret;
-
-       if (dma_alloc_from_coherent(dev, size, dma_handle, &ret))
-               return ret;
-
        /* ignore region specifiers */
        gfp &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM);
 
@@ -69,11 +64,6 @@ static void *nlm_dma_alloc_coherent(struct device *dev, size_t size,
 static void nlm_dma_free_coherent(struct device *dev, size_t size,
        void *vaddr, dma_addr_t dma_handle, struct dma_attrs *attrs)
 {
-       int order = get_order(size);
-
-       if (dma_release_from_coherent(dev, order, vaddr))
-               return;
-
        swiotlb_free_coherent(dev, size, vaddr, dma_handle);
 }
 
diff --git a/arch/nios2/boot/dts/10m50_devboard.dts b/arch/nios2/boot/dts/10m50_devboard.dts
new file mode 100755 (executable)
index 0000000..3e411c6
--- /dev/null
@@ -0,0 +1,248 @@
+/*
+ * Copyright (C) 2015 Altera Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/dts-v1/;
+
+/ {
+       model = "Altera NiosII Max10";
+       compatible = "altr,niosii-max10";
+       #address-cells = <1>;
+       #size-cells = <1>;
+
+       cpus {
+               #address-cells = <1>;
+               #size-cells = <0>;
+
+               cpu: cpu@0 {
+                       device_type = "cpu";
+                       compatible = "altr,nios2-1.1";
+                       reg = <0x00000000>;
+                       interrupt-controller;
+                       #interrupt-cells = <1>;
+                       altr,exception-addr = <0xc8000120>;
+                       altr,fast-tlb-miss-addr = <0xc0000100>;
+                       altr,has-div = <1>;
+                       altr,has-initda = <1>;
+                       altr,has-mmu = <1>;
+                       altr,has-mul = <1>;
+                       altr,implementation = "fast";
+                       altr,pid-num-bits = <8>;
+                       altr,reset-addr = <0xd4000000>;
+                       altr,tlb-num-entries = <256>;
+                       altr,tlb-num-ways = <16>;
+                       altr,tlb-ptr-sz = <8>;
+                       clock-frequency = <75000000>;
+                       dcache-line-size = <32>;
+                       dcache-size = <32768>;
+                       icache-line-size = <32>;
+                       icache-size = <32768>;
+               };
+       };
+
+       memory {
+               device_type = "memory";
+               reg = <0x08000000 0x08000000>,
+                       <0x00000000 0x00000400>;
+       };
+
+       sopc0: sopc@0 {
+               device_type = "soc";
+               ranges;
+               #address-cells = <1>;
+               #size-cells = <1>;
+               compatible = "altr,avalon", "simple-bus";
+               bus-frequency = <75000000>;
+
+               jtag_uart: serial@18001530 {
+                       compatible = "altr,juart-1.0";
+                       reg = <0x18001530 0x00000008>;
+                       interrupt-parent = <&cpu>;
+                       interrupts = <7>;
+               };
+
+               a_16550_uart_0: serial@18001600 {
+                       compatible = "altr,16550-FIFO32", "ns16550a";
+                       reg = <0x18001600 0x00000200>;
+                       interrupt-parent = <&cpu>;
+                       interrupts = <1>;
+                       auto-flow-control = <1>;
+                       clock-frequency = <50000000>;
+                       fifo-size = <32>;
+                       reg-io-width = <4>;
+                       reg-shift = <2>;
+               };
+
+               sysid: sysid@18001528 {
+                       compatible = "altr,sysid-1.0";
+                       reg = <0x18001528 0x00000008>;
+                       id = <4207856382>;
+                       timestamp = <1431309290>;
+               };
+
+               rgmii_0_eth_tse_0: ethernet@400 {
+                       compatible = "altr,tse-msgdma-1.0", "altr,tse-1.0";
+                       reg = <0x00000400 0x00000400>,
+                               <0x00000820 0x00000020>,
+                               <0x00000800 0x00000020>,
+                               <0x000008c0 0x00000008>,
+                               <0x00000840 0x00000020>,
+                               <0x00000860 0x00000020>;
+                       reg-names = "control_port", "rx_csr", "rx_desc", "rx_resp", "tx_csr", "tx_desc";
+                       interrupt-parent = <&cpu>;
+                       interrupts = <2 3>;
+                       interrupt-names = "rx_irq", "tx_irq";
+                       rx-fifo-depth = <8192>;
+                       tx-fifo-depth = <8192>;
+                       address-bits = <48>;
+                       max-frame-size = <1518>;
+                       local-mac-address = [00 00 00 00 00 00];
+                       altr,has-supplementary-unicast;
+                       altr,enable-sup-addr = <1>;
+                       altr,has-hash-multicast-filter;
+                       altr,enable-hash = <1>;
+                       phy-mode = "rgmii-id";
+                       phy-handle = <&phy0>;
+                       rgmii_0_eth_tse_0_mdio: mdio {
+                               compatible = "altr,tse-mdio";
+                               #address-cells = <1>;
+                               #size-cells = <0>;
+                               phy0: ethernet-phy@0 {
+                                       reg = <0>;
+                                       device_type = "ethernet-phy";
+                               };
+                       };
+               };
+
+               enet_pll: clock@0 {
+                       compatible = "altr,pll-1.0";
+                       #clock-cells = <1>;
+
+                       enet_pll_c0: enet_pll_c0 {
+                               compatible = "fixed-clock";
+                               #clock-cells = <0>;
+                               clock-frequency = <125000000>;
+                               clock-output-names = "enet_pll-c0";
+                       };
+
+                       enet_pll_c1: enet_pll_c1 {
+                               compatible = "fixed-clock";
+                               #clock-cells = <0>;
+                               clock-frequency = <25000000>;
+                               clock-output-names = "enet_pll-c1";
+                       };
+
+                       enet_pll_c2: enet_pll_c2 {
+                               compatible = "fixed-clock";
+                               #clock-cells = <0>;
+                               clock-frequency = <2500000>;
+                               clock-output-names = "enet_pll-c2";
+                       };
+               };
+
+               sys_pll: clock@1 {
+                       compatible = "altr,pll-1.0";
+                       #clock-cells = <1>;
+
+                       sys_pll_c0: sys_pll_c0 {
+                               compatible = "fixed-clock";
+                               #clock-cells = <0>;
+                               clock-frequency = <100000000>;
+                               clock-output-names = "sys_pll-c0";
+                       };
+
+                       sys_pll_c1: sys_pll_c1 {
+                               compatible = "fixed-clock";
+                               #clock-cells = <0>;
+                               clock-frequency = <50000000>;
+                               clock-output-names = "sys_pll-c1";
+                       };
+
+                       sys_pll_c2: sys_pll_c2 {
+                               compatible = "fixed-clock";
+                               #clock-cells = <0>;
+                               clock-frequency = <75000000>;
+                               clock-output-names = "sys_pll-c2";
+                       };
+               };
+
+               sys_clk_timer: timer@18001440 {
+                       compatible = "altr,timer-1.0";
+                       reg = <0x18001440 0x00000020>;
+                       interrupt-parent = <&cpu>;
+                       interrupts = <0>;
+                       clock-frequency = <75000000>;
+               };
+
+               led_pio: gpio@180014d0 {
+                       compatible = "altr,pio-1.0";
+                       reg = <0x180014d0 0x00000010>;
+                       altr,gpio-bank-width = <4>;
+                       resetvalue = <15>;
+                       #gpio-cells = <2>;
+                       gpio-controller;
+               };
+
+               button_pio: gpio@180014c0 {
+                       compatible = "altr,pio-1.0";
+                       reg = <0x180014c0 0x00000010>;
+                       interrupt-parent = <&cpu>;
+                       interrupts = <6>;
+                       altr,gpio-bank-width = <3>;
+                       altr,interrupt-type = <2>;
+                       edge_type = <1>;
+                       level_trigger = <0>;
+                       resetvalue = <0>;
+                       #gpio-cells = <2>;
+                       gpio-controller;
+               };
+
+               sys_clk_timer_1: timer@880 {
+                       compatible = "altr,timer-1.0";
+                       reg = <0x00000880 0x00000020>;
+                       interrupt-parent = <&cpu>;
+                       interrupts = <5>;
+                       clock-frequency = <75000000>;
+               };
+
+               fpga_leds: leds {
+                       compatible = "gpio-leds";
+
+                       led_fpga0: fpga0 {
+                               label = "fpga_led0";
+                               gpios = <&led_pio 0 1>;
+                       };
+
+                       led_fpga1: fpga1 {
+                               label = "fpga_led1";
+                               gpios = <&led_pio 1 1>;
+                       };
+
+                       led_fpga2: fpga2 {
+                               label = "fpga_led2";
+                               gpios = <&led_pio 2 1>;
+                       };
+
+                       led_fpga3: fpga3 {
+                               label = "fpga_led3";
+                               gpios = <&led_pio 3 1>;
+                       };
+               };
+       };
+
+       chosen {
+               bootargs = "debug console=ttyS0,115200";
+       };
+};
diff --git a/arch/nios2/configs/10m50_defconfig b/arch/nios2/configs/10m50_defconfig
new file mode 100755 (executable)
index 0000000..8b2a30b
--- /dev/null
@@ -0,0 +1,81 @@
+CONFIG_SYSVIPC=y
+CONFIG_NO_HZ_IDLE=y
+CONFIG_BSD_PROCESS_ACCT=y
+CONFIG_LOG_BUF_SHIFT=14
+CONFIG_SYSCTL_SYSCALL=y
+# CONFIG_ELF_CORE is not set
+# CONFIG_EPOLL is not set
+# CONFIG_SIGNALFD is not set
+# CONFIG_TIMERFD is not set
+# CONFIG_EVENTFD is not set
+# CONFIG_SHMEM is not set
+# CONFIG_AIO is not set
+CONFIG_EMBEDDED=y
+CONFIG_SLAB=y
+CONFIG_MODULES=y
+CONFIG_MODULE_UNLOAD=y
+CONFIG_NIOS2_MEM_BASE=0x8000000
+CONFIG_NIOS2_HW_MUL_SUPPORT=y
+CONFIG_NIOS2_HW_DIV_SUPPORT=y
+CONFIG_CUSTOM_CACHE_SETTINGS=y
+CONFIG_NIOS2_DCACHE_SIZE=0x8000
+CONFIG_NIOS2_ICACHE_SIZE=0x8000
+# CONFIG_NIOS2_CMDLINE_IGNORE_DTB is not set
+CONFIG_NET=y
+CONFIG_PACKET=y
+CONFIG_UNIX=y
+CONFIG_INET=y
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_PNP=y
+CONFIG_IP_PNP_DHCP=y
+CONFIG_IP_PNP_BOOTP=y
+CONFIG_IP_PNP_RARP=y
+# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
+# CONFIG_INET_XFRM_MODE_TUNNEL is not set
+# CONFIG_INET_XFRM_MODE_BEET is not set
+# CONFIG_INET_LRO is not set
+# CONFIG_IPV6 is not set
+# CONFIG_WIRELESS is not set
+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
+CONFIG_DEVTMPFS=y
+CONFIG_DEVTMPFS_MOUNT=y
+# CONFIG_FW_LOADER is not set
+CONFIG_MTD=y
+CONFIG_MTD_CMDLINE_PARTS=y
+CONFIG_MTD_BLOCK=y
+CONFIG_MTD_CFI=y
+CONFIG_MTD_CFI_INTELEXT=y
+CONFIG_MTD_CFI_AMDSTD=y
+CONFIG_BLK_DEV_LOOP=y
+CONFIG_NETDEVICES=y
+CONFIG_ALTERA_TSE=y
+CONFIG_MARVELL_PHY=y
+# CONFIG_WLAN is not set
+# CONFIG_INPUT_MOUSE is not set
+# CONFIG_SERIO_SERPORT is not set
+# CONFIG_VT is not set
+CONFIG_SERIAL_8250=y
+# CONFIG_SERIAL_8250_DEPRECATED_OPTIONS is not set
+CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_SERIAL_OF_PLATFORM=y
+CONFIG_SERIAL_ALTERA_JTAGUART=y
+# CONFIG_HW_RANDOM is not set
+CONFIG_GPIOLIB=y
+CONFIG_GPIO_SYSFS=y
+CONFIG_GPIO_ALTERA=y
+# CONFIG_HWMON is not set
+# CONFIG_USB_SUPPORT is not set
+CONFIG_NEW_LEDS=y
+CONFIG_LEDS_CLASS=y
+CONFIG_LEDS_GPIO=y
+CONFIG_LEDS_TRIGGERS=y
+CONFIG_LEDS_TRIGGER_HEARTBEAT=y
+# CONFIG_DNOTIFY is not set
+# CONFIG_INOTIFY_USER is not set
+CONFIG_JFFS2_FS=y
+CONFIG_NFS_FS=y
+CONFIG_NFS_V3_ACL=y
+CONFIG_ROOT_NFS=y
+CONFIG_SUNRPC_DEBUG=y
+CONFIG_DEBUG_INFO=y
+# CONFIG_ENABLE_WARN_DEPRECATED is not set
index 4e5907a0cabe88d4a797b9cdbc0f0081392c62bd..23e0544e117cef65f8b771fdd2917ed8995c50e8 100644 (file)
@@ -32,8 +32,6 @@
 #define INST_STW       0x15
 #define INST_LDW       0x17
 
-static unsigned long ma_user, ma_kern, ma_skipped, ma_half, ma_word;
-
 static unsigned int ma_usermode;
 #define UM_WARN                0x01
 #define UM_FIXUP       0x02
@@ -53,7 +51,6 @@ static int reg_offsets[32];
 static inline u32 get_reg_val(struct pt_regs *fp, int reg)
 {
        u8 *p = ((u8 *)fp) + reg_offsets[reg];
-
        return *(u32 *)p;
 }
 
@@ -71,14 +68,13 @@ asmlinkage void handle_unaligned_c(struct pt_regs *fp, int cause)
        u32 isn, addr, val;
        int in_kernel;
        u8 a, b, d0, d1, d2, d3;
-       u16 imm16;
+       s16 imm16;
        unsigned int fault;
 
        /* back up one instruction */
        fp->ea -= 4;
 
        if (fixup_exception(fp)) {
-               ma_skipped++;
                return;
        }
 
@@ -103,18 +99,11 @@ asmlinkage void handle_unaligned_c(struct pt_regs *fp, int cause)
                        fault |= __get_user(d1, (u8 *)(addr+1));
                        val = (d1 << 8) | d0;
                        put_reg_val(fp, b, val);
-                       ma_half++;
                        break;
                case INST_STH:
                        val = get_reg_val(fp, b);
                        d1 = val >> 8;
                        d0 = val >> 0;
-
-                       pr_debug("sth: ra=%d (%08x) rb=%d (%08x), imm16 %04x addr %08x val %08x\n",
-                               a, get_reg_val(fp, a),
-                               b, get_reg_val(fp, b),
-                               imm16, addr, val);
-
                        if (in_kernel) {
                                *(u8 *)(addr+0) = d0;
                                *(u8 *)(addr+1) = d1;
@@ -122,14 +111,12 @@ asmlinkage void handle_unaligned_c(struct pt_regs *fp, int cause)
                                fault |= __put_user(d0, (u8 *)(addr+0));
                                fault |= __put_user(d1, (u8 *)(addr+1));
                        }
-                       ma_half++;
                        break;
                case INST_LDH:
                        fault |= __get_user(d0, (u8 *)(addr+0));
                        fault |= __get_user(d1, (u8 *)(addr+1));
                        val = (short)((d1 << 8) | d0);
                        put_reg_val(fp, b, val);
-                       ma_half++;
                        break;
                case INST_STW:
                        val = get_reg_val(fp, b);
@@ -148,7 +135,6 @@ asmlinkage void handle_unaligned_c(struct pt_regs *fp, int cause)
                                fault |= __put_user(d2, (u8 *)(addr+2));
                                fault |= __put_user(d3, (u8 *)(addr+3));
                        }
-                       ma_word++;
                        break;
                case INST_LDW:
                        fault |= __get_user(d0, (u8 *)(addr+0));
@@ -157,7 +143,6 @@ asmlinkage void handle_unaligned_c(struct pt_regs *fp, int cause)
                        fault |= __get_user(d3, (u8 *)(addr+3));
                        val = (d3 << 24) | (d2 << 16) | (d1 << 8) | d0;
                        put_reg_val(fp, b, val);
-                       ma_word++;
                        break;
                }
        }
@@ -186,7 +171,6 @@ asmlinkage void handle_unaligned_c(struct pt_regs *fp, int cause)
         *  note exception and skip bad instruction (return)
         */
        if (in_kernel) {
-               ma_kern++;
                fp->ea += 4;
 
                if (ma_usermode & KM_WARN) {
@@ -200,8 +184,6 @@ asmlinkage void handle_unaligned_c(struct pt_regs *fp, int cause)
                return;
        }
 
-       ma_user++;
-
        /*
         * user mode -
         *  possibly warn,
index 9e3cc8a40ee9feb164842780ef885426681c9466..bbc3f9157f9c4fec3418e8a92c20f204924e0c5b 100644 (file)
@@ -130,7 +130,7 @@ static void nios2_timer_stop(struct nios2_timer *timer)
 }
 
 static void nios2_timer_config(struct nios2_timer *timer, unsigned long period,
-       enum clock_event_mode mode)
+                              bool periodic)
 {
        u16 ctrl;
 
@@ -148,7 +148,7 @@ static void nios2_timer_config(struct nios2_timer *timer, unsigned long period,
        timer_writew(timer, period >> 16, ALTERA_TIMER_PERIODH_REG);
 
        ctrl |= ALTERA_TIMER_CONTROL_START_MSK | ALTERA_TIMER_CONTROL_ITO_MSK;
-       if (mode == CLOCK_EVT_MODE_PERIODIC)
+       if (periodic)
                ctrl |= ALTERA_TIMER_CONTROL_CONT_MSK;
        else
                ctrl &= ~ALTERA_TIMER_CONTROL_CONT_MSK;
@@ -160,32 +160,38 @@ static int nios2_timer_set_next_event(unsigned long delta,
 {
        struct nios2_clockevent_dev *nios2_ced = to_nios2_clkevent(evt);
 
-       nios2_timer_config(&nios2_ced->timer, delta, evt->mode);
+       nios2_timer_config(&nios2_ced->timer, delta, false);
 
        return 0;
 }
 
-static void nios2_timer_set_mode(enum clock_event_mode mode,
-       struct clock_event_device *evt)
+static int nios2_timer_shutdown(struct clock_event_device *evt)
+{
+       struct nios2_clockevent_dev *nios2_ced = to_nios2_clkevent(evt);
+       struct nios2_timer *timer = &nios2_ced->timer;
+
+       nios2_timer_stop(timer);
+       return 0;
+}
+
+static int nios2_timer_set_periodic(struct clock_event_device *evt)
 {
        unsigned long period;
        struct nios2_clockevent_dev *nios2_ced = to_nios2_clkevent(evt);
        struct nios2_timer *timer = &nios2_ced->timer;
 
-       switch (mode) {
-       case CLOCK_EVT_MODE_PERIODIC:
-               period = DIV_ROUND_UP(timer->freq, HZ);
-               nios2_timer_config(timer, period, CLOCK_EVT_MODE_PERIODIC);
-               break;
-       case CLOCK_EVT_MODE_ONESHOT:
-       case CLOCK_EVT_MODE_UNUSED:
-       case CLOCK_EVT_MODE_SHUTDOWN:
-               nios2_timer_stop(timer);
-               break;
-       case CLOCK_EVT_MODE_RESUME:
-               nios2_timer_start(timer);
-               break;
-       }
+       period = DIV_ROUND_UP(timer->freq, HZ);
+       nios2_timer_config(timer, period, true);
+       return 0;
+}
+
+static int nios2_timer_resume(struct clock_event_device *evt)
+{
+       struct nios2_clockevent_dev *nios2_ced = to_nios2_clkevent(evt);
+       struct nios2_timer *timer = &nios2_ced->timer;
+
+       nios2_timer_start(timer);
+       return 0;
 }
 
 irqreturn_t timer_interrupt(int irq, void *dev_id)
@@ -218,7 +224,10 @@ static struct nios2_clockevent_dev nios2_ce = {
                .rating = 250,
                .shift = 32,
                .set_next_event = nios2_timer_set_next_event,
-               .set_mode = nios2_timer_set_mode,
+               .set_state_shutdown = nios2_timer_shutdown,
+               .set_state_periodic = nios2_timer_set_periodic,
+               .set_state_oneshot = nios2_timer_shutdown,
+               .tick_resume = nios2_timer_resume,
        },
 };
 
index fab8628e1b6e70d5c8159ca652992f2556e7eeb7..413bfcf863848fba556078d8ad6ea1cba59cf770 100644 (file)
@@ -23,7 +23,6 @@
  */
 
 #include <linux/dma-debug.h>
-#include <asm-generic/dma-coherent.h>
 #include <linux/kmemcheck.h>
 #include <linux/dma-mapping.h>
 
@@ -36,75 +35,13 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
        return &or1k_dma_map_ops;
 }
 
-#include <asm-generic/dma-mapping-common.h>
-
-#define dma_alloc_coherent(d,s,h,f) dma_alloc_attrs(d,s,h,f,NULL) 
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                   dma_addr_t *dma_handle, gfp_t gfp,
-                                   struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-       void *memory;
-
-       memory = ops->alloc(dev, size, dma_handle, gfp, attrs);
-
-       debug_dma_alloc_coherent(dev, size, *dma_handle, memory);
-
-       return memory;
-}
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                 void *cpu_addr, dma_addr_t dma_handle,
-                                 struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
-
-       ops->free(dev, size, cpu_addr, dma_handle, attrs);
-}
-
-static inline void *dma_alloc_noncoherent(struct device *dev, size_t size,
-                                         dma_addr_t *dma_handle, gfp_t gfp)
-{
-       struct dma_attrs attrs;
-
-       dma_set_attr(DMA_ATTR_NON_CONSISTENT, &attrs);
-
-       return dma_alloc_attrs(dev, size, dma_handle, gfp, &attrs);
-}
-
-static inline void dma_free_noncoherent(struct device *dev, size_t size,
-                                        void *cpu_addr, dma_addr_t dma_handle)
-{
-       struct dma_attrs attrs;
-
-       dma_set_attr(DMA_ATTR_NON_CONSISTENT, &attrs);
-
-       dma_free_attrs(dev, size, cpu_addr, dma_handle, &attrs);
-}
-
+#define HAVE_ARCH_DMA_SUPPORTED 1
 static inline int dma_supported(struct device *dev, u64 dma_mask)
 {
        /* Support 32 bit DMA mask exclusively */
        return dma_mask == DMA_BIT_MASK(32);
 }
 
-static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-       return 0;
-}
-
-static inline int dma_set_mask(struct device *dev, u64 dma_mask)
-{
-       if (!dev->dma_mask || !dma_supported(dev, dma_mask))
-               return -EIO;
-
-       *dev->dma_mask = dma_mask;
+#include <asm-generic/dma-mapping-common.h>
 
-       return 0;
-}
 #endif /* __ASM_OPENRISC_DMA_MAPPING_H */
index 8cd0abf28ffbbe400a4f5e1c408573085184add2..1a16f1d1075fc93a39bef3fad4c1c24096888d5d 100644 (file)
@@ -137,6 +137,8 @@ static inline void __iomem * ioremap(unsigned long offset, unsigned long size)
        return __ioremap(offset, size, _PAGE_NO_CACHE);
 }
 #define ioremap_nocache(off, sz)       ioremap((off), (sz))
+#define ioremap_wc                     ioremap_nocache
+#define ioremap_uc                     ioremap_nocache
 
 extern void iounmap(const volatile void __iomem *addr);
 
index 413ec3c3f9cc509099b25bf3fc60e0de8195ba53..ba5e1c7b1f177d45f743392c0950017622b143a8 100644 (file)
@@ -507,8 +507,8 @@ void do_cpu_irq_mask(struct pt_regs *regs)
        struct pt_regs *old_regs;
        unsigned long eirr_val;
        int irq, cpu = smp_processor_id();
-#ifdef CONFIG_SMP
        struct irq_data *irq_data;
+#ifdef CONFIG_SMP
        cpumask_t dest;
 #endif
 
@@ -521,8 +521,13 @@ void do_cpu_irq_mask(struct pt_regs *regs)
                goto set_out;
        irq = eirr_to_irq(eirr_val);
 
-#ifdef CONFIG_SMP
        irq_data = irq_get_irq_data(irq);
+
+       /* Filter out spurious interrupts, mostly from serial port at bootup */
+       if (unlikely(!irq_desc_has_action(irq_data_to_desc(irq_data))))
+               goto set_out;
+
+#ifdef CONFIG_SMP
        cpumask_copy(&dest, irq_data_get_affinity_mask(irq_data));
        if (irqd_is_per_cpu(irq_data) &&
            !cpumask_test_cpu(smp_processor_id(), &dest)) {
index 7ef22e3387e09f8b63b2a6532b0c995bb12b6e96..0b8d26d3ba43be545308dc70075a4a224a8b8155 100644 (file)
@@ -821,7 +821,7 @@ cas2_action:
        /* 64bit CAS */
 #ifdef CONFIG_64BIT
 19:    ldd,ma  0(%sr3,%r26), %r29
-       sub,  %r29, %r25, %r0
+       sub,*=  %r29, %r25, %r0
        b,n     cas2_end
 20:    std,ma  %r24, 0(%sr3,%r26)
        copy    %r0, %r28
index 70e105d62423f5c8bde7588d6a8a4afa4c67e646..400acac0a304d12b235e05773d9c9f53a87f55b2 100644 (file)
@@ -202,7 +202,6 @@ static struct clocksource clocksource_cr16 = {
        .flags                  = CLOCK_SOURCE_IS_CONTINUOUS,
 };
 
-#ifdef CONFIG_SMP
 int update_cr16_clocksource(void)
 {
        /* since the cr16 cycle counters are not synchronized across CPUs,
@@ -214,12 +213,6 @@ int update_cr16_clocksource(void)
 
        return 0;
 }
-#else
-int update_cr16_clocksource(void)
-{
-       return 0; /* no change */
-}
-#endif /*CONFIG_SMP*/
 
 void __init start_cpu_itimer(void)
 {
@@ -231,20 +224,14 @@ void __init start_cpu_itimer(void)
        per_cpu(cpu_data, cpu).it_value = next_tick;
 }
 
-static struct platform_device rtc_generic_dev = {
-       .name = "rtc-generic",
-       .id = -1,
-};
-
 static int __init rtc_init(void)
 {
-       if (platform_device_register(&rtc_generic_dev) < 0)
-               printk(KERN_ERR "unable to register rtc device...\n");
+       struct platform_device *pdev;
 
-       /* not necessarily an error */
-       return 0;
+       pdev = platform_device_register_simple("rtc-generic", -1, NULL, 0);
+       return PTR_ERR_OR_ZERO(pdev);
 }
-module_init(rtc_init);
+device_initcall(rtc_init);
 
 void read_persistent_clock(struct timespec *ts)
 {
index 15503adddf4f59695d34f3b5adb428250594bf66..a762864ec92e9baf9bf2d2016474c21b94e026b8 100644 (file)
@@ -207,7 +207,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long code,
        int fault;
        unsigned int flags;
 
-       if (pagefault_disabled())
+       if (faulthandler_disabled())
                goto no_context;
 
        tsk = current;
index b447918b9e2c8bd289372d5bcf69d0f3bb1f2aad..9a7057ec21541a09af3cedc4e49350852cba1791 100644 (file)
@@ -420,6 +420,7 @@ config PPC64_SUPPORTS_MEMORY_FAILURE
 config KEXEC
        bool "kexec system call"
        depends on (PPC_BOOK3S || FSL_BOOKE || (44x && !SMP))
+       select KEXEC_CORE
        help
          kexec is a system call that implements the ability to shutdown your
          current kernel, and to start another kernel.  It is like a reboot
index 4ca54fdd8768c2b04da0f72c49614ffb3c130783..b9b4af2af9a5b297f72c930c4272158d9c3d39fd 100644 (file)
@@ -67,7 +67,7 @@ UTS_MACHINE := $(OLDARCH)
 
 ifeq ($(CONFIG_CPU_LITTLE_ENDIAN),y)
 override CC    += -mlittle-endian
-ifneq ($(COMPILER),clang)
+ifneq ($(cc-name),clang)
 override CC    += -mno-strict-align
 endif
 override AS    += -mlittle-endian
@@ -353,7 +353,7 @@ TOUT        := .tmp_gas_check
 # - Require gcc 4.0 or above on 64-bit
 # - gcc-4.2.0 has issues compiling modules on 64-bit
 checkbin:
-       @if test "${COMPILER}" != "clang" \
+       @if test "$(cc-name)" != "clang" \
            && test "$(cc-version)" = "0304" ; then \
                if ! /bin/echo mftb 5 | $(AS) -v -mppc -many -o $(TOUT) >/dev/null 2>&1 ; then \
                        echo -n '*** ${VERSION}.${PATCHLEVEL} kernels no longer build '; \
@@ -362,14 +362,14 @@ checkbin:
                        false; \
                fi ; \
        fi
-       @if test "${COMPILER}" != "clang" \
+       @if test "$(cc-name)" != "clang" \
            && test "$(cc-version)" -lt "0400" \
            && test "x${CONFIG_PPC64}" = "xy" ; then \
                 echo -n "Sorry, GCC v4.0 or above is required to build " ; \
                 echo "the 64-bit powerpc kernel." ; \
                 false ; \
         fi
-       @if test "${COMPILER}" != "clang" \
+       @if test "$(cc-name)" != "clang" \
            && test "$(cc-fullversion)" = "040200" \
            && test "x${CONFIG_MODULES}${CONFIG_PPC64}" = "xyy" ; then \
                echo -n '*** GCC-4.2.0 cannot compile the 64-bit powerpc ' ; \
index 710f60e380e07dbcd55dd6a239cb78f5e3a09c60..7f522c021dc3087af2393b40e603a054f992952b 100644 (file)
@@ -18,7 +18,9 @@
 #include <asm/io.h>
 #include <asm/swiotlb.h>
 
+#ifdef CONFIG_PPC64
 #define DMA_ERROR_CODE         (~(dma_addr_t)0x0)
+#endif
 
 /* Some dma direct funcs must be visible for use in other dma_ops */
 extern void *__dma_direct_alloc_coherent(struct device *dev, size_t size,
@@ -120,71 +122,14 @@ static inline void set_dma_offset(struct device *dev, dma_addr_t off)
 /* this will be removed soon */
 #define flush_write_buffers()
 
-#include <asm-generic/dma-mapping-common.h>
-
-static inline int dma_supported(struct device *dev, u64 mask)
-{
-       struct dma_map_ops *dma_ops = get_dma_ops(dev);
+#define HAVE_ARCH_DMA_SET_MASK 1
+extern int dma_set_mask(struct device *dev, u64 dma_mask);
 
-       if (unlikely(dma_ops == NULL))
-               return 0;
-       if (dma_ops->dma_supported == NULL)
-               return 1;
-       return dma_ops->dma_supported(dev, mask);
-}
+#include <asm-generic/dma-mapping-common.h>
 
-extern int dma_set_mask(struct device *dev, u64 dma_mask);
 extern int __dma_set_mask(struct device *dev, u64 dma_mask);
 extern u64 __dma_get_required_mask(struct device *dev);
 
-#define dma_alloc_coherent(d,s,h,f)    dma_alloc_attrs(d,s,h,f,NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                   dma_addr_t *dma_handle, gfp_t flag,
-                                   struct dma_attrs *attrs)
-{
-       struct dma_map_ops *dma_ops = get_dma_ops(dev);
-       void *cpu_addr;
-
-       BUG_ON(!dma_ops);
-
-       cpu_addr = dma_ops->alloc(dev, size, dma_handle, flag, attrs);
-
-       debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
-
-       return cpu_addr;
-}
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                 void *cpu_addr, dma_addr_t dma_handle,
-                                 struct dma_attrs *attrs)
-{
-       struct dma_map_ops *dma_ops = get_dma_ops(dev);
-
-       BUG_ON(!dma_ops);
-
-       debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
-
-       dma_ops->free(dev, size, cpu_addr, dma_handle, attrs);
-}
-
-static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-       struct dma_map_ops *dma_ops = get_dma_ops(dev);
-
-       debug_dma_mapping_error(dev, dma_addr);
-       if (dma_ops->mapping_error)
-               return dma_ops->mapping_error(dev, dma_addr);
-
-#ifdef CONFIG_PPC64
-       return (dma_addr == DMA_ERROR_CODE);
-#else
-       return 0;
-#endif
-}
-
 static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
 {
 #ifdef CONFIG_SWIOTLB
@@ -210,9 +155,6 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
        return daddr - get_dma_offset(dev);
 }
 
-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
-#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
-
 #define ARCH_HAS_DMA_MMAP_COHERENT
 
 static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
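With the open-coded dma_alloc_attrs()/dma_free_attrs()/dma_mapping_error() wrappers removed, powerpc relies on asm-generic/dma-mapping-common.h plus the HAVE_ARCH_DMA_SET_MASK hook, so callers keep using the generic DMA API unchanged. A minimal, purely illustrative driver fragment (hypothetical function, not part of this patch) showing the paths that now go through the common header:

#include <linux/dma-mapping.h>

/* Hypothetical example: negotiate a 64-bit mask via the arch dma_set_mask()
 * hook declared above, then allocate one coherent page through the generic
 * wrappers provided by asm-generic/dma-mapping-common.h. */
static int example_dma_setup(struct device *dev, void **cpu, dma_addr_t *handle)
{
        int ret;

        ret = dma_set_mask(dev, DMA_BIT_MASK(64));
        if (ret)
                return ret;

        *cpu = dma_alloc_coherent(dev, PAGE_SIZE, handle, GFP_KERNEL);
        if (!*cpu)
                return -ENOMEM;

        return 0;
}
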
index e3661872fbea5b03888aa88c35753bcfc9983403..ef89b14655731006c7adac49afbe26a073782868 100644 (file)
@@ -2,7 +2,7 @@
 #define _ASM_POWERPC_FTRACE
 
 #ifdef CONFIG_FUNCTION_TRACER
-#define MCOUNT_ADDR            ((long)(_mcount))
+#define MCOUNT_ADDR            ((unsigned long)(_mcount))
 #define MCOUNT_INSN_SIZE       4 /* sizeof mcount call */
 
 #ifdef __ASSEMBLY__
index b91e74a817d89742355e547c61b9ce36a243f11b..9fac01cb89c14df3fd4ae8bb51ef51acb56ec54b 100644 (file)
@@ -158,6 +158,7 @@ extern pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t gpa, bool writing,
                        bool *writable);
 extern void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
                        unsigned long *rmap, long pte_index, int realmode);
+extern void kvmppc_update_rmap_change(unsigned long *rmap, unsigned long psize);
 extern void kvmppc_invalidate_hpte(struct kvm *kvm, __be64 *hptep,
                        unsigned long pte_index);
 void kvmppc_clear_ref_hpte(struct kvm *kvm, __be64 *hptep,
@@ -225,12 +226,12 @@ static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
        return vcpu->arch.cr;
 }
 
-static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, u32 val)
+static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, ulong val)
 {
        vcpu->arch.xer = val;
 }
 
-static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu)
+static inline ulong kvmppc_get_xer(struct kvm_vcpu *vcpu)
 {
        return vcpu->arch.xer;
 }
index 5bdfb5dd34002348578b8b9c0a6164a04b3d4820..72b6225aca73d9aa4ec47aa3087cb90b623042be 100644 (file)
 #define XICS_MFRR              0xc
 #define XICS_IPI               2       /* interrupt source # for IPIs */
 
+/* Maximum number of threads per physical core */
+#define MAX_SMT_THREADS                8
+
+/* Maximum number of subcores per physical core */
+#define MAX_SUBCORES           4
+
 #ifdef __ASSEMBLY__
 
 #ifdef CONFIG_KVM_BOOK3S_HANDLER
@@ -65,6 +71,19 @@ kvmppc_resume_\intno:
 
 #else  /*__ASSEMBLY__ */
 
+struct kvmppc_vcore;
+
+/* Struct used for coordinating micro-threading (split-core) mode changes */
+struct kvm_split_mode {
+       unsigned long   rpr;
+       unsigned long   pmmar;
+       unsigned long   ldbar;
+       u8              subcore_size;
+       u8              do_nap;
+       u8              napped[MAX_SMT_THREADS];
+       struct kvmppc_vcore *master_vcs[MAX_SUBCORES];
+};
+
 /*
  * This struct goes in the PACA on 64-bit processors.  It is used
  * to store host state that needs to be saved when we enter a guest
@@ -100,6 +119,7 @@ struct kvmppc_host_state {
        u64 host_spurr;
        u64 host_dscr;
        u64 dec_expires;
+       struct kvm_split_mode *kvm_split_mode;
 #endif
 #ifdef CONFIG_PPC_BOOK3S_64
        u64 cfar;
@@ -112,7 +132,7 @@ struct kvmppc_book3s_shadow_vcpu {
        bool in_use;
        ulong gpr[14];
        u32 cr;
-       u32 xer;
+       ulong xer;
        ulong ctr;
        ulong lr;
        ulong pc;
index 3286f0d6a86c1d7f614989dc676d6ab47735e6d7..bc6e29e4dfd4a125406a5736b995b8fa8500052e 100644 (file)
@@ -54,12 +54,12 @@ static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
        return vcpu->arch.cr;
 }
 
-static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, u32 val)
+static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, ulong val)
 {
        vcpu->arch.xer = val;
 }
 
-static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu)
+static inline ulong kvmppc_get_xer(struct kvm_vcpu *vcpu)
 {
        return vcpu->arch.xer;
 }
index d91f65b28e322808d2d05c260ffa13614c01333b..98eebbf663405c59ebcc3174d8619773f9813ac1 100644 (file)
@@ -205,8 +205,10 @@ struct revmap_entry {
  */
 #define KVMPPC_RMAP_LOCK_BIT   63
 #define KVMPPC_RMAP_RC_SHIFT   32
+#define KVMPPC_RMAP_CHG_SHIFT  48
 #define KVMPPC_RMAP_REFERENCED (HPTE_R_R << KVMPPC_RMAP_RC_SHIFT)
 #define KVMPPC_RMAP_CHANGED    (HPTE_R_C << KVMPPC_RMAP_RC_SHIFT)
+#define KVMPPC_RMAP_CHG_ORDER  (0x3ful << KVMPPC_RMAP_CHG_SHIFT)
 #define KVMPPC_RMAP_PRESENT    0x100000000ul
 #define KVMPPC_RMAP_INDEX      0xfffffffful
 
@@ -278,7 +280,9 @@ struct kvmppc_vcore {
        u16 last_cpu;
        u8 vcore_state;
        u8 in_guest;
+       struct kvmppc_vcore *master_vcore;
        struct list_head runnable_threads;
+       struct list_head preempt_list;
        spinlock_t lock;
        wait_queue_head_t wq;
        spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */
@@ -300,12 +304,21 @@ struct kvmppc_vcore {
 #define VCORE_EXIT_MAP(vc)     ((vc)->entry_exit_map >> 8)
 #define VCORE_IS_EXITING(vc)   (VCORE_EXIT_MAP(vc) != 0)
 
-/* Values for vcore_state */
+/* This bit is used when a vcore exit is triggered from outside the vcore */
+#define VCORE_EXIT_REQ         0x10000
+
+/*
+ * Values for vcore_state.
+ * Note that these are arranged such that lower values
+ * (< VCORE_SLEEPING) don't require stolen time accounting
+ * on load/unload, and higher values do.
+ */
 #define VCORE_INACTIVE 0
-#define VCORE_SLEEPING 1
-#define VCORE_PREEMPT  2
-#define VCORE_RUNNING  3
-#define VCORE_EXITING  4
+#define VCORE_PREEMPT  1
+#define VCORE_PIGGYBACK        2
+#define VCORE_SLEEPING 3
+#define VCORE_RUNNING  4
+#define VCORE_EXITING  5
 
 /*
  * Struct used to manage memory for a virtual processor area
@@ -473,7 +486,7 @@ struct kvm_vcpu_arch {
        ulong ciabr;
        ulong cfar;
        ulong ppr;
-       ulong pspb;
+       u32 pspb;
        ulong fscr;
        ulong shadow_fscr;
        ulong ebbhr;
@@ -619,6 +632,7 @@ struct kvm_vcpu_arch {
        int trap;
        int state;
        int ptid;
+       int thread_cpu;
        bool timer_running;
        wait_queue_head_t cpu_run;
 
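The new KVMPPC_RMAP_CHG_SHIFT/KVMPPC_RMAP_CHG_ORDER definitions pack the largest changed-page order into bits 48-53 of an rmap word; the encoder, kvmppc_update_rmap_change(), appears in a later hunk of this diff. A small decoding sketch (illustrative only, not part of the patch):

/* Hypothetical helper: recover the changed-page order recorded in an rmap
 * word, using the masks defined above. */
static inline unsigned long rmap_changed_page_order(unsigned long rmap)
{
        return (rmap & KVMPPC_RMAP_CHG_ORDER) >> KVMPPC_RMAP_CHG_SHIFT;
}
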
index 8452335661a5d5fd896eba78c4e43b40a4532291..790f5d1d9a4624d6f17bdde78706be7240d29ad2 100644 (file)
 
 /* POWER8 Micro Partition Prefetch (MPP) parameters */
 /* Address mask is common for LOGMPP instruction and MPPR SPR */
-#define PPC_MPPE_ADDRESS_MASK 0xffffffffc000
+#define PPC_MPPE_ADDRESS_MASK 0xffffffffc000ULL
 
 /* Bits 60 and 61 of MPP SPR should be set to one of the following */
 /* Aborting the fetch is indeed setting 00 in the table size bits */
index 810f433731dcac24809dea42f1973c5a967bcae9..221d584d089f9418abfa1087cffd9d0b0117c090 100644 (file)
@@ -511,6 +511,8 @@ int main(void)
        DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa.pinned_addr));
        DEFINE(VCPU_VPA_DIRTY, offsetof(struct kvm_vcpu, arch.vpa.dirty));
        DEFINE(VCPU_HEIR, offsetof(struct kvm_vcpu, arch.emul_inst));
+       DEFINE(VCPU_CPU, offsetof(struct kvm_vcpu, cpu));
+       DEFINE(VCPU_THREAD_CPU, offsetof(struct kvm_vcpu, arch.thread_cpu));
 #endif
 #ifdef CONFIG_PPC_BOOK3S
        DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id));
@@ -673,7 +675,14 @@ int main(void)
        HSTATE_FIELD(HSTATE_DSCR, host_dscr);
        HSTATE_FIELD(HSTATE_DABR, dabr);
        HSTATE_FIELD(HSTATE_DECEXP, dec_expires);
+       HSTATE_FIELD(HSTATE_SPLIT_MODE, kvm_split_mode);
        DEFINE(IPI_PRIORITY, IPI_PRIORITY);
+       DEFINE(KVM_SPLIT_RPR, offsetof(struct kvm_split_mode, rpr));
+       DEFINE(KVM_SPLIT_PMMAR, offsetof(struct kvm_split_mode, pmmar));
+       DEFINE(KVM_SPLIT_LDBAR, offsetof(struct kvm_split_mode, ldbar));
+       DEFINE(KVM_SPLIT_SIZE, offsetof(struct kvm_split_mode, subcore_size));
+       DEFINE(KVM_SPLIT_DO_NAP, offsetof(struct kvm_split_mode, do_nap));
+       DEFINE(KVM_SPLIT_NAPPED, offsetof(struct kvm_split_mode, napped));
 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
 
 #ifdef CONFIG_PPC_BOOK3S_64
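The added DEFINE()/HSTATE_FIELD() entries exist so the real-mode assembly can reach the new kvm_split_mode fields by constant offsets. Roughly, and as a simplified sketch only (the real macro lives in include/linux/kbuild.h), DEFINE() emits marker lines that the build converts into asm-offsets.h constants:

/* Simplified sketch of the asm-offsets mechanism: the inline asm emits
 * "->SYMBOL value" lines that a build script turns into
 * "#define SYMBOL value" in asm-offsets.h, so assembly such as
 * book3s_hv_rmhandlers.S can load fields with constructs like
 * "ld rN, KVM_SPLIT_LDBAR(rM)". */
#define DEFINE(sym, val) \
        asm volatile("\n->" #sym " %0 " #val : : "i" (val))
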
index c8c62c7fc31ce67e6888391d821fbe822e1e1f19..2e710c15893fe0da0295f9cf1ddeb6a827d88fcb 100644 (file)
@@ -102,7 +102,7 @@ static void of_pci_parse_addrs(struct device_node *node, struct pci_dev *dev)
                        res = &dev->resource[(i - PCI_BASE_ADDRESS_0) >> 2];
                } else if (i == dev->rom_base_reg) {
                        res = &dev->resource[PCI_ROM_RESOURCE];
-                       flags |= IORESOURCE_READONLY | IORESOURCE_CACHEABLE;
+                       flags |= IORESOURCE_READONLY;
                } else {
                        printk(KERN_ERR "PCI: bad cfg reg num 0x%x\n", i);
                        continue;
index 3caec2c421058b0065e9d0cfd382e23e9d7102a6..c2024ac9d4e859963c1c15408824b98baa3e92c3 100644 (file)
@@ -74,14 +74,14 @@ config KVM_BOOK3S_64
          If unsure, say N.
 
 config KVM_BOOK3S_64_HV
-       tristate "KVM support for POWER7 and PPC970 using hypervisor mode in host"
+       tristate "KVM for POWER7 and later using hypervisor mode in host"
        depends on KVM_BOOK3S_64 && PPC_POWERNV
        select KVM_BOOK3S_HV_POSSIBLE
        select MMU_NOTIFIER
        select CMA
        ---help---
          Support running unmodified book3s_64 guest kernels in
-         virtual machines on POWER7 and PPC970 processors that have
+         virtual machines on POWER7 and newer processors that have
          hypervisor mode available to the host.
 
          If you say Y here, KVM will use the hardware virtualization
@@ -89,8 +89,8 @@ config KVM_BOOK3S_64_HV
          guest operating systems will run at full hardware speed
          using supervisor and user modes.  However, this also means
          that KVM is not usable under PowerVM (pHyp), is only usable
-         on POWER7 (or later) processors and PPC970-family processors,
-         and cannot emulate a different processor from the host processor.
+         on POWER7 or later processors, and cannot emulate a
+         different processor from the host processor.
 
          If unsure, say N.
 
index 6d6398f4d632d0c4e5e9d2439ce7e8df4f46767f..d75bf325f54a17ebf4e19dda7a85eed271e4f3ed 100644 (file)
@@ -240,7 +240,8 @@ void kvmppc_core_queue_inst_storage(struct kvm_vcpu *vcpu, ulong flags)
        kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_INST_STORAGE);
 }
 
-int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
+static int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu,
+                                        unsigned int priority)
 {
        int deliver = 1;
        int vec = 0;
index 2035d16a9262ac5e547bacc0cbcb4fc862706aa5..d5c9bfeb0c9c7e926aa5a40761959ce5e51f90eb 100644 (file)
@@ -26,6 +26,7 @@
 #include <asm/machdep.h>
 #include <asm/mmu_context.h>
 #include <asm/hw_irq.h>
+#include "book3s.h"
 
 /* #define DEBUG_MMU */
 /* #define DEBUG_SR */
index b982d925c7105f910003ed3e53b5a595558f67ef..79ad35abd1967c0ea68c900f9b5bb83d25a488a6 100644 (file)
@@ -28,6 +28,7 @@
 #include <asm/mmu_context.h>
 #include <asm/hw_irq.h>
 #include "trace_pr.h"
+#include "book3s.h"
 
 #define PTE_SIZE 12
 
index dab68b7af3f280225288c68f8d09eee753d85824..1f9c0a17f445f73b858dcfabb761aa098a6fd781 100644 (file)
@@ -761,6 +761,8 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
                        /* Harvest R and C */
                        rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C);
                        *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT;
+                       if (rcbits & HPTE_R_C)
+                               kvmppc_update_rmap_change(rmapp, psize);
                        if (rcbits & ~rev[i].guest_rpte) {
                                rev[i].guest_rpte = ptel | rcbits;
                                note_hpte_modification(kvm, &rev[i]);
@@ -927,8 +929,12 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
  retry:
        lock_rmap(rmapp);
        if (*rmapp & KVMPPC_RMAP_CHANGED) {
-               *rmapp &= ~KVMPPC_RMAP_CHANGED;
+               long change_order = (*rmapp & KVMPPC_RMAP_CHG_ORDER)
+                       >> KVMPPC_RMAP_CHG_SHIFT;
+               *rmapp &= ~(KVMPPC_RMAP_CHANGED | KVMPPC_RMAP_CHG_ORDER);
                npages_dirty = 1;
+               if (change_order > PAGE_SHIFT)
+                       npages_dirty = 1ul << (change_order - PAGE_SHIFT);
        }
        if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
                unlock_rmap(rmapp);
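The change order stored in the rmap word is what lets kvm_test_clear_dirty_npages() report more than one dirty page when a large page was modified. A worked illustration (assuming 4 KB base pages, i.e. PAGE_SHIFT = 12; not part of the patch):

/* Illustration only: a dirtied 16 MB page (order 24) with 4 KB base pages
 * yields 1ul << (24 - 12) = 4096 dirty pages, matching the logic above. */
static unsigned long example_npages_dirty(long change_order)
{
        if (change_order > PAGE_SHIFT)
                return 1ul << (change_order - PAGE_SHIFT);
        return 1;
}
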
index 5a2bc4b0dfe5a9b83c4d5be3b820bda90e57a641..2afdb9c0937dbd3b7471b88d75e17ca714bba1ae 100644 (file)
@@ -23,6 +23,7 @@
 #include <asm/reg.h>
 #include <asm/switch_to.h>
 #include <asm/time.h>
+#include "book3s.h"
 
 #define OP_19_XOP_RFID         18
 #define OP_19_XOP_RFI          50
index a9f753fb73a816143ec4a013d9424b2acf6dbee4..9754e6815e521c80d6b8cadfe9123ab24b97c014 100644 (file)
@@ -81,6 +81,12 @@ static DECLARE_BITMAP(default_enabled_hcalls, MAX_HCALL_OPCODE/4 + 1);
 #define MPP_BUFFER_ORDER       3
 #endif
 
+static int dynamic_mt_modes = 6;
+module_param(dynamic_mt_modes, int, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(dynamic_mt_modes, "Set of allowed dynamic micro-threading modes: 0 (= none), 2, 4, or 6 (= 2 or 4)");
+static int target_smt_mode;
+module_param(target_smt_mode, int, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)");
 
 static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
@@ -114,7 +120,7 @@ static bool kvmppc_ipi_thread(int cpu)
 
 static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
 {
-       int cpu = vcpu->cpu;
+       int cpu;
        wait_queue_head_t *wqp;
 
        wqp = kvm_arch_vcpu_wq(vcpu);
@@ -123,10 +129,11 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
                ++vcpu->stat.halt_wakeup;
        }
 
-       if (kvmppc_ipi_thread(cpu + vcpu->arch.ptid))
+       if (kvmppc_ipi_thread(vcpu->arch.thread_cpu))
                return;
 
        /* CPU points to the first thread of the core */
+       cpu = vcpu->cpu;
        if (cpu >= 0 && cpu < nr_cpu_ids && cpu_online(cpu))
                smp_send_reschedule(cpu);
 }
@@ -164,6 +171,27 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
  * they should never fail.)
  */
 
+static void kvmppc_core_start_stolen(struct kvmppc_vcore *vc)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&vc->stoltb_lock, flags);
+       vc->preempt_tb = mftb();
+       spin_unlock_irqrestore(&vc->stoltb_lock, flags);
+}
+
+static void kvmppc_core_end_stolen(struct kvmppc_vcore *vc)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&vc->stoltb_lock, flags);
+       if (vc->preempt_tb != TB_NIL) {
+               vc->stolen_tb += mftb() - vc->preempt_tb;
+               vc->preempt_tb = TB_NIL;
+       }
+       spin_unlock_irqrestore(&vc->stoltb_lock, flags);
+}
+
 static void kvmppc_core_vcpu_load_hv(struct kvm_vcpu *vcpu, int cpu)
 {
        struct kvmppc_vcore *vc = vcpu->arch.vcore;
@@ -175,14 +203,9 @@ static void kvmppc_core_vcpu_load_hv(struct kvm_vcpu *vcpu, int cpu)
         * vcpu, and once it is set to this vcpu, only this task
         * ever sets it to NULL.
         */
-       if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE) {
-               spin_lock_irqsave(&vc->stoltb_lock, flags);
-               if (vc->preempt_tb != TB_NIL) {
-                       vc->stolen_tb += mftb() - vc->preempt_tb;
-                       vc->preempt_tb = TB_NIL;
-               }
-               spin_unlock_irqrestore(&vc->stoltb_lock, flags);
-       }
+       if (vc->runner == vcpu && vc->vcore_state >= VCORE_SLEEPING)
+               kvmppc_core_end_stolen(vc);
+
        spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
        if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST &&
            vcpu->arch.busy_preempt != TB_NIL) {
@@ -197,11 +220,9 @@ static void kvmppc_core_vcpu_put_hv(struct kvm_vcpu *vcpu)
        struct kvmppc_vcore *vc = vcpu->arch.vcore;
        unsigned long flags;
 
-       if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE) {
-               spin_lock_irqsave(&vc->stoltb_lock, flags);
-               vc->preempt_tb = mftb();
-               spin_unlock_irqrestore(&vc->stoltb_lock, flags);
-       }
+       if (vc->runner == vcpu && vc->vcore_state >= VCORE_SLEEPING)
+               kvmppc_core_start_stolen(vc);
+
        spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
        if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST)
                vcpu->arch.busy_preempt = mftb();
@@ -214,12 +235,12 @@ static void kvmppc_set_msr_hv(struct kvm_vcpu *vcpu, u64 msr)
        kvmppc_end_cede(vcpu);
 }
 
-void kvmppc_set_pvr_hv(struct kvm_vcpu *vcpu, u32 pvr)
+static void kvmppc_set_pvr_hv(struct kvm_vcpu *vcpu, u32 pvr)
 {
        vcpu->arch.pvr = pvr;
 }
 
-int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat)
+static int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat)
 {
        unsigned long pcr = 0;
        struct kvmppc_vcore *vc = vcpu->arch.vcore;
@@ -259,7 +280,7 @@ int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat)
        return 0;
 }
 
-void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
+static void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
 {
        int r;
 
@@ -292,7 +313,7 @@ void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
               vcpu->arch.last_inst);
 }
 
-struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id)
+static struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id)
 {
        int r;
        struct kvm_vcpu *v, *ret = NULL;
@@ -641,7 +662,8 @@ static int kvm_arch_vcpu_yield_to(struct kvm_vcpu *target)
 
        spin_lock(&vcore->lock);
        if (target->arch.state == KVMPPC_VCPU_RUNNABLE &&
-           vcore->vcore_state != VCORE_INACTIVE)
+           vcore->vcore_state != VCORE_INACTIVE &&
+           vcore->runner)
                target = vcore->runner;
        spin_unlock(&vcore->lock);
 
@@ -1431,6 +1453,7 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core)
        vcore->lpcr = kvm->arch.lpcr;
        vcore->first_vcpuid = core * threads_per_subcore;
        vcore->kvm = kvm;
+       INIT_LIST_HEAD(&vcore->preempt_list);
 
        vcore->mpp_buffer_is_valid = false;
 
@@ -1655,6 +1678,7 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
        spin_unlock(&vcore->lock);
        vcpu->arch.vcore = vcore;
        vcpu->arch.ptid = vcpu->vcpu_id - vcore->first_vcpuid;
+       vcpu->arch.thread_cpu = -1;
 
        vcpu->arch.cpu_type = KVM_CPU_3S_64;
        kvmppc_sanity_check(vcpu);
@@ -1749,6 +1773,7 @@ static int kvmppc_grab_hwthread(int cpu)
 
        /* Ensure the thread won't go into the kernel if it wakes */
        tpaca->kvm_hstate.kvm_vcpu = NULL;
+       tpaca->kvm_hstate.kvm_vcore = NULL;
        tpaca->kvm_hstate.napping = 0;
        smp_wmb();
        tpaca->kvm_hstate.hwthread_req = 1;
@@ -1780,26 +1805,32 @@ static void kvmppc_release_hwthread(int cpu)
        tpaca = &paca[cpu];
        tpaca->kvm_hstate.hwthread_req = 0;
        tpaca->kvm_hstate.kvm_vcpu = NULL;
+       tpaca->kvm_hstate.kvm_vcore = NULL;
+       tpaca->kvm_hstate.kvm_split_mode = NULL;
 }
 
-static void kvmppc_start_thread(struct kvm_vcpu *vcpu)
+static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc)
 {
        int cpu;
        struct paca_struct *tpaca;
-       struct kvmppc_vcore *vc = vcpu->arch.vcore;
+       struct kvmppc_vcore *mvc = vc->master_vcore;
 
-       if (vcpu->arch.timer_running) {
-               hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
-               vcpu->arch.timer_running = 0;
+       cpu = vc->pcpu;
+       if (vcpu) {
+               if (vcpu->arch.timer_running) {
+                       hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
+                       vcpu->arch.timer_running = 0;
+               }
+               cpu += vcpu->arch.ptid;
+               vcpu->cpu = mvc->pcpu;
+               vcpu->arch.thread_cpu = cpu;
        }
-       cpu = vc->pcpu + vcpu->arch.ptid;
        tpaca = &paca[cpu];
-       tpaca->kvm_hstate.kvm_vcore = vc;
-       tpaca->kvm_hstate.ptid = vcpu->arch.ptid;
-       vcpu->cpu = vc->pcpu;
-       /* Order stores to hstate.kvm_vcore etc. before store to kvm_vcpu */
-       smp_wmb();
        tpaca->kvm_hstate.kvm_vcpu = vcpu;
+       tpaca->kvm_hstate.ptid = cpu - mvc->pcpu;
+       /* Order stores to hstate.kvm_vcpu etc. before store to kvm_vcore */
+       smp_wmb();
+       tpaca->kvm_hstate.kvm_vcore = mvc;
        if (cpu != smp_processor_id())
                kvmppc_ipi_thread(cpu);
 }
@@ -1812,12 +1843,12 @@ static void kvmppc_wait_for_nap(void)
        for (loops = 0; loops < 1000000; ++loops) {
                /*
                 * Check if all threads are finished.
-                * We set the vcpu pointer when starting a thread
+                * We set the vcore pointer when starting a thread
                 * and the thread clears it when finished, so we look
-                * for any threads that still have a non-NULL vcpu ptr.
+                * for any threads that still have a non-NULL vcore ptr.
                 */
                for (i = 1; i < threads_per_subcore; ++i)
-                       if (paca[cpu + i].kvm_hstate.kvm_vcpu)
+                       if (paca[cpu + i].kvm_hstate.kvm_vcore)
                                break;
                if (i == threads_per_subcore) {
                        HMT_medium();
@@ -1827,7 +1858,7 @@ static void kvmppc_wait_for_nap(void)
        }
        HMT_medium();
        for (i = 1; i < threads_per_subcore; ++i)
-               if (paca[cpu + i].kvm_hstate.kvm_vcpu)
+               if (paca[cpu + i].kvm_hstate.kvm_vcore)
                        pr_err("KVM: CPU %d seems to be stuck\n", cpu + i);
 }
 
@@ -1890,6 +1921,278 @@ static void kvmppc_start_restoring_l2_cache(const struct kvmppc_vcore *vc)
        mtspr(SPRN_MPPR, mpp_addr | PPC_MPPR_FETCH_WHOLE_TABLE);
 }
 
+/*
+ * A list of virtual cores for each physical CPU.
+ * These are vcores that could run but their runner VCPU tasks are
+ * (or may be) preempted.
+ */
+struct preempted_vcore_list {
+       struct list_head        list;
+       spinlock_t              lock;
+};
+
+static DEFINE_PER_CPU(struct preempted_vcore_list, preempted_vcores);
+
+static void init_vcore_lists(void)
+{
+       int cpu;
+
+       for_each_possible_cpu(cpu) {
+               struct preempted_vcore_list *lp = &per_cpu(preempted_vcores, cpu);
+               spin_lock_init(&lp->lock);
+               INIT_LIST_HEAD(&lp->list);
+       }
+}
+
+static void kvmppc_vcore_preempt(struct kvmppc_vcore *vc)
+{
+       struct preempted_vcore_list *lp = this_cpu_ptr(&preempted_vcores);
+
+       vc->vcore_state = VCORE_PREEMPT;
+       vc->pcpu = smp_processor_id();
+       if (vc->num_threads < threads_per_subcore) {
+               spin_lock(&lp->lock);
+               list_add_tail(&vc->preempt_list, &lp->list);
+               spin_unlock(&lp->lock);
+       }
+
+       /* Start accumulating stolen time */
+       kvmppc_core_start_stolen(vc);
+}
+
+static void kvmppc_vcore_end_preempt(struct kvmppc_vcore *vc)
+{
+       struct preempted_vcore_list *lp;
+
+       kvmppc_core_end_stolen(vc);
+       if (!list_empty(&vc->preempt_list)) {
+               lp = &per_cpu(preempted_vcores, vc->pcpu);
+               spin_lock(&lp->lock);
+               list_del_init(&vc->preempt_list);
+               spin_unlock(&lp->lock);
+       }
+       vc->vcore_state = VCORE_INACTIVE;
+}
+
+/*
+ * This stores information about the virtual cores currently
+ * assigned to a physical core.
+ */
+struct core_info {
+       int             n_subcores;
+       int             max_subcore_threads;
+       int             total_threads;
+       int             subcore_threads[MAX_SUBCORES];
+       struct kvm      *subcore_vm[MAX_SUBCORES];
+       struct list_head vcs[MAX_SUBCORES];
+};
+
+/*
+ * This mapping means subcores 0 and 1 can use threads 0-3 and 4-7
+ * respectively in 2-way micro-threading (split-core) mode.
+ */
+static int subcore_thread_map[MAX_SUBCORES] = { 0, 4, 2, 6 };
+
+static void init_core_info(struct core_info *cip, struct kvmppc_vcore *vc)
+{
+       int sub;
+
+       memset(cip, 0, sizeof(*cip));
+       cip->n_subcores = 1;
+       cip->max_subcore_threads = vc->num_threads;
+       cip->total_threads = vc->num_threads;
+       cip->subcore_threads[0] = vc->num_threads;
+       cip->subcore_vm[0] = vc->kvm;
+       for (sub = 0; sub < MAX_SUBCORES; ++sub)
+               INIT_LIST_HEAD(&cip->vcs[sub]);
+       list_add_tail(&vc->preempt_list, &cip->vcs[0]);
+}
+
+static bool subcore_config_ok(int n_subcores, int n_threads)
+{
+       /* Can only dynamically split if unsplit to begin with */
+       if (n_subcores > 1 && threads_per_subcore < MAX_SMT_THREADS)
+               return false;
+       if (n_subcores > MAX_SUBCORES)
+               return false;
+       if (n_subcores > 1) {
+               if (!(dynamic_mt_modes & 2))
+                       n_subcores = 4;
+               if (n_subcores > 2 && !(dynamic_mt_modes & 4))
+                       return false;
+       }
+
+       return n_subcores * roundup_pow_of_two(n_threads) <= MAX_SMT_THREADS;
+}
+
+static void init_master_vcore(struct kvmppc_vcore *vc)
+{
+       vc->master_vcore = vc;
+       vc->entry_exit_map = 0;
+       vc->in_guest = 0;
+       vc->napping_threads = 0;
+       vc->conferring_threads = 0;
+}
+
+/*
+ * See if the existing subcores can be split into 3 (or fewer) subcores
+ * of at most two threads each, so we can fit in another vcore.  This
+ * assumes there are at most two subcores and at most 6 threads in total.
+ */
+static bool can_split_piggybacked_subcores(struct core_info *cip)
+{
+       int sub, new_sub;
+       int large_sub = -1;
+       int thr;
+       int n_subcores = cip->n_subcores;
+       struct kvmppc_vcore *vc, *vcnext;
+       struct kvmppc_vcore *master_vc = NULL;
+
+       for (sub = 0; sub < cip->n_subcores; ++sub) {
+               if (cip->subcore_threads[sub] <= 2)
+                       continue;
+               if (large_sub >= 0)
+                       return false;
+               large_sub = sub;
+               vc = list_first_entry(&cip->vcs[sub], struct kvmppc_vcore,
+                                     preempt_list);
+               if (vc->num_threads > 2)
+                       return false;
+               n_subcores += (cip->subcore_threads[sub] - 1) >> 1;
+       }
+       if (n_subcores > 3 || large_sub < 0)
+               return false;
+
+       /*
+        * Seems feasible, so go through and move vcores to new subcores.
+        * Note that when we have two or more vcores in one subcore,
+        * all those vcores must have only one thread each.
+        */
+       new_sub = cip->n_subcores;
+       thr = 0;
+       sub = large_sub;
+       list_for_each_entry_safe(vc, vcnext, &cip->vcs[sub], preempt_list) {
+               if (thr >= 2) {
+                       list_del(&vc->preempt_list);
+                       list_add_tail(&vc->preempt_list, &cip->vcs[new_sub]);
+                       /* vc->num_threads must be 1 */
+                       if (++cip->subcore_threads[new_sub] == 1) {
+                               cip->subcore_vm[new_sub] = vc->kvm;
+                               init_master_vcore(vc);
+                               master_vc = vc;
+                               ++cip->n_subcores;
+                       } else {
+                               vc->master_vcore = master_vc;
+                               ++new_sub;
+                       }
+               }
+               thr += vc->num_threads;
+       }
+       cip->subcore_threads[large_sub] = 2;
+       cip->max_subcore_threads = 2;
+
+       return true;
+}
+
+static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
+{
+       int n_threads = vc->num_threads;
+       int sub;
+
+       if (!cpu_has_feature(CPU_FTR_ARCH_207S))
+               return false;
+
+       if (n_threads < cip->max_subcore_threads)
+               n_threads = cip->max_subcore_threads;
+       if (subcore_config_ok(cip->n_subcores + 1, n_threads)) {
+               cip->max_subcore_threads = n_threads;
+       } else if (cip->n_subcores <= 2 && cip->total_threads <= 6 &&
+                  vc->num_threads <= 2) {
+               /*
+                * We may be able to fit another subcore in by
+                * splitting an existing subcore with 3 or 4
+                * threads into two 2-thread subcores, or one
+                * with 5 or 6 threads into three subcores.
+                * We can only do this if those subcores have
+                * piggybacked virtual cores.
+                */
+               if (!can_split_piggybacked_subcores(cip))
+                       return false;
+       } else {
+               return false;
+       }
+
+       sub = cip->n_subcores;
+       ++cip->n_subcores;
+       cip->total_threads += vc->num_threads;
+       cip->subcore_threads[sub] = vc->num_threads;
+       cip->subcore_vm[sub] = vc->kvm;
+       init_master_vcore(vc);
+       list_del(&vc->preempt_list);
+       list_add_tail(&vc->preempt_list, &cip->vcs[sub]);
+
+       return true;
+}
+
+static bool can_piggyback_subcore(struct kvmppc_vcore *pvc,
+                                 struct core_info *cip, int sub)
+{
+       struct kvmppc_vcore *vc;
+       int n_thr;
+
+       vc = list_first_entry(&cip->vcs[sub], struct kvmppc_vcore,
+                             preempt_list);
+
+       /* require same VM and same per-core reg values */
+       if (pvc->kvm != vc->kvm ||
+           pvc->tb_offset != vc->tb_offset ||
+           pvc->pcr != vc->pcr ||
+           pvc->lpcr != vc->lpcr)
+               return false;
+
+       /* P8 guest with > 1 thread per core would see wrong TIR value */
+       if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
+           (vc->num_threads > 1 || pvc->num_threads > 1))
+               return false;
+
+       n_thr = cip->subcore_threads[sub] + pvc->num_threads;
+       if (n_thr > cip->max_subcore_threads) {
+               if (!subcore_config_ok(cip->n_subcores, n_thr))
+                       return false;
+               cip->max_subcore_threads = n_thr;
+       }
+
+       cip->total_threads += pvc->num_threads;
+       cip->subcore_threads[sub] = n_thr;
+       pvc->master_vcore = vc;
+       list_del(&pvc->preempt_list);
+       list_add_tail(&pvc->preempt_list, &cip->vcs[sub]);
+
+       return true;
+}
+
+/*
+ * Work out whether it is possible to piggyback the execution of
+ * vcore *pvc onto the execution of the other vcores described in *cip.
+ */
+static bool can_piggyback(struct kvmppc_vcore *pvc, struct core_info *cip,
+                         int target_threads)
+{
+       int sub;
+
+       if (cip->total_threads + pvc->num_threads > target_threads)
+               return false;
+       for (sub = 0; sub < cip->n_subcores; ++sub)
+               if (cip->subcore_threads[sub] &&
+                   can_piggyback_subcore(pvc, cip, sub))
+                       return true;
+
+       if (can_dynamic_split(pvc, cip))
+               return true;
+
+       return false;
+}
+
 static void prepare_threads(struct kvmppc_vcore *vc)
 {
        struct kvm_vcpu *vcpu, *vnext;
@@ -1909,12 +2212,45 @@ static void prepare_threads(struct kvmppc_vcore *vc)
        }
 }
 
-static void post_guest_process(struct kvmppc_vcore *vc)
+static void collect_piggybacks(struct core_info *cip, int target_threads)
+{
+       struct preempted_vcore_list *lp = this_cpu_ptr(&preempted_vcores);
+       struct kvmppc_vcore *pvc, *vcnext;
+
+       spin_lock(&lp->lock);
+       list_for_each_entry_safe(pvc, vcnext, &lp->list, preempt_list) {
+               if (!spin_trylock(&pvc->lock))
+                       continue;
+               prepare_threads(pvc);
+               if (!pvc->n_runnable) {
+                       list_del_init(&pvc->preempt_list);
+                       if (pvc->runner == NULL) {
+                               pvc->vcore_state = VCORE_INACTIVE;
+                               kvmppc_core_end_stolen(pvc);
+                       }
+                       spin_unlock(&pvc->lock);
+                       continue;
+               }
+               if (!can_piggyback(pvc, cip, target_threads)) {
+                       spin_unlock(&pvc->lock);
+                       continue;
+               }
+               kvmppc_core_end_stolen(pvc);
+               pvc->vcore_state = VCORE_PIGGYBACK;
+               if (cip->total_threads >= target_threads)
+                       break;
+       }
+       spin_unlock(&lp->lock);
+}
+
+static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
 {
+       int still_running = 0;
        u64 now;
        long ret;
        struct kvm_vcpu *vcpu, *vnext;
 
+       spin_lock(&vc->lock);
        now = get_tb();
        list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
                                 arch.run_list) {
@@ -1933,17 +2269,36 @@ static void post_guest_process(struct kvmppc_vcore *vc)
                vcpu->arch.ret = ret;
                vcpu->arch.trap = 0;
 
-               if (vcpu->arch.ceded) {
-                       if (!is_kvmppc_resume_guest(ret))
-                               kvmppc_end_cede(vcpu);
-                       else
+               if (is_kvmppc_resume_guest(vcpu->arch.ret)) {
+                       if (vcpu->arch.pending_exceptions)
+                               kvmppc_core_prepare_to_enter(vcpu);
+                       if (vcpu->arch.ceded)
                                kvmppc_set_timer(vcpu);
-               }
-               if (!is_kvmppc_resume_guest(vcpu->arch.ret)) {
+                       else
+                               ++still_running;
+               } else {
                        kvmppc_remove_runnable(vc, vcpu);
                        wake_up(&vcpu->arch.cpu_run);
                }
        }
+       list_del_init(&vc->preempt_list);
+       if (!is_master) {
+               if (still_running > 0) {
+                       kvmppc_vcore_preempt(vc);
+               } else if (vc->runner) {
+                       vc->vcore_state = VCORE_PREEMPT;
+                       kvmppc_core_start_stolen(vc);
+               } else {
+                       vc->vcore_state = VCORE_INACTIVE;
+               }
+               if (vc->n_runnable > 0 && vc->runner == NULL) {
+                       /* make sure there's a candidate runner awake */
+                       vcpu = list_first_entry(&vc->runnable_threads,
+                                               struct kvm_vcpu, arch.run_list);
+                       wake_up(&vcpu->arch.cpu_run);
+               }
+       }
+       spin_unlock(&vc->lock);
 }
 
 /*
@@ -1955,6 +2310,15 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
        struct kvm_vcpu *vcpu, *vnext;
        int i;
        int srcu_idx;
+       struct core_info core_info;
+       struct kvmppc_vcore *pvc, *vcnext;
+       struct kvm_split_mode split_info, *sip;
+       int split, subcore_size, active;
+       int sub;
+       bool thr0_done;
+       unsigned long cmd_bit, stat_bit;
+       int pcpu, thr;
+       int target_threads;
 
        /*
         * Remove from the list any threads that have a signal pending
@@ -1969,11 +2333,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
        /*
         * Initialize *vc.
         */
-       vc->entry_exit_map = 0;
+       init_master_vcore(vc);
        vc->preempt_tb = TB_NIL;
-       vc->in_guest = 0;
-       vc->napping_threads = 0;
-       vc->conferring_threads = 0;
 
        /*
         * Make sure we are running on primary threads, and that secondary
@@ -1991,24 +2352,120 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
                goto out;
        }
 
+       /*
+        * See if we could run any other vcores on the physical core
+        * along with this one.
+        */
+       init_core_info(&core_info, vc);
+       pcpu = smp_processor_id();
+       target_threads = threads_per_subcore;
+       if (target_smt_mode && target_smt_mode < target_threads)
+               target_threads = target_smt_mode;
+       if (vc->num_threads < target_threads)
+               collect_piggybacks(&core_info, target_threads);
+
+       /* Decide on micro-threading (split-core) mode */
+       subcore_size = threads_per_subcore;
+       cmd_bit = stat_bit = 0;
+       split = core_info.n_subcores;
+       sip = NULL;
+       if (split > 1) {
+               /* threads_per_subcore must be MAX_SMT_THREADS (8) here */
+               if (split == 2 && (dynamic_mt_modes & 2)) {
+                       cmd_bit = HID0_POWER8_1TO2LPAR;
+                       stat_bit = HID0_POWER8_2LPARMODE;
+               } else {
+                       split = 4;
+                       cmd_bit = HID0_POWER8_1TO4LPAR;
+                       stat_bit = HID0_POWER8_4LPARMODE;
+               }
+               subcore_size = MAX_SMT_THREADS / split;
+               sip = &split_info;
+               memset(&split_info, 0, sizeof(split_info));
+               split_info.rpr = mfspr(SPRN_RPR);
+               split_info.pmmar = mfspr(SPRN_PMMAR);
+               split_info.ldbar = mfspr(SPRN_LDBAR);
+               split_info.subcore_size = subcore_size;
+               for (sub = 0; sub < core_info.n_subcores; ++sub)
+                       split_info.master_vcs[sub] =
+                               list_first_entry(&core_info.vcs[sub],
+                                       struct kvmppc_vcore, preempt_list);
+               /* order writes to split_info before kvm_split_mode pointer */
+               smp_wmb();
+       }
+       pcpu = smp_processor_id();
+       for (thr = 0; thr < threads_per_subcore; ++thr)
+               paca[pcpu + thr].kvm_hstate.kvm_split_mode = sip;
+
+       /* Initiate micro-threading (split-core) if required */
+       if (cmd_bit) {
+               unsigned long hid0 = mfspr(SPRN_HID0);
+
+               hid0 |= cmd_bit | HID0_POWER8_DYNLPARDIS;
+               mb();
+               mtspr(SPRN_HID0, hid0);
+               isync();
+               for (;;) {
+                       hid0 = mfspr(SPRN_HID0);
+                       if (hid0 & stat_bit)
+                               break;
+                       cpu_relax();
+               }
+       }
 
-       vc->pcpu = smp_processor_id();
-       list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
-               kvmppc_start_thread(vcpu);
-               kvmppc_create_dtl_entry(vcpu, vc);
-               trace_kvm_guest_enter(vcpu);
+       /* Start all the threads */
+       active = 0;
+       for (sub = 0; sub < core_info.n_subcores; ++sub) {
+               thr = subcore_thread_map[sub];
+               thr0_done = false;
+               active |= 1 << thr;
+               list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list) {
+                       pvc->pcpu = pcpu + thr;
+                       list_for_each_entry(vcpu, &pvc->runnable_threads,
+                                           arch.run_list) {
+                               kvmppc_start_thread(vcpu, pvc);
+                               kvmppc_create_dtl_entry(vcpu, pvc);
+                               trace_kvm_guest_enter(vcpu);
+                               if (!vcpu->arch.ptid)
+                                       thr0_done = true;
+                               active |= 1 << (thr + vcpu->arch.ptid);
+                       }
+                       /*
+                        * We need to start the first thread of each subcore
+                        * even if it doesn't have a vcpu.
+                        */
+                       if (pvc->master_vcore == pvc && !thr0_done)
+                               kvmppc_start_thread(NULL, pvc);
+                       thr += pvc->num_threads;
+               }
        }
 
-       /* Set this explicitly in case thread 0 doesn't have a vcpu */
-       get_paca()->kvm_hstate.kvm_vcore = vc;
-       get_paca()->kvm_hstate.ptid = 0;
+       /*
+        * Ensure that split_info.do_nap is set after setting
+        * the vcore pointer in the PACA of the secondaries.
+        */
+       smp_mb();
+       if (cmd_bit)
+               split_info.do_nap = 1;  /* ask secondaries to nap when done */
+
+       /*
+        * When doing micro-threading, poke the inactive threads as well.
+        * This gets them to the nap instruction after kvm_do_nap,
+        * which reduces the time taken to unsplit later.
+        */
+       if (split > 1)
+               for (thr = 1; thr < threads_per_subcore; ++thr)
+                       if (!(active & (1 << thr)))
+                               kvmppc_ipi_thread(pcpu + thr);
 
        vc->vcore_state = VCORE_RUNNING;
        preempt_disable();
 
        trace_kvmppc_run_core(vc, 0);
 
-       spin_unlock(&vc->lock);
+       for (sub = 0; sub < core_info.n_subcores; ++sub)
+               list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list)
+                       spin_unlock(&pvc->lock);
 
        kvm_guest_enter();
 
@@ -2019,32 +2476,58 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 
        __kvmppc_vcore_entry();
 
-       spin_lock(&vc->lock);
-
        if (vc->mpp_buffer)
                kvmppc_start_saving_l2_cache(vc);
 
-       /* disable sending of IPIs on virtual external irqs */
-       list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
-               vcpu->cpu = -1;
-       /* wait for secondary threads to finish writing their state to memory */
-       kvmppc_wait_for_nap();
-       for (i = 0; i < threads_per_subcore; ++i)
-               kvmppc_release_hwthread(vc->pcpu + i);
+       srcu_read_unlock(&vc->kvm->srcu, srcu_idx);
+
+       spin_lock(&vc->lock);
        /* prevent other vcpu threads from doing kvmppc_start_thread() now */
        vc->vcore_state = VCORE_EXITING;
-       spin_unlock(&vc->lock);
 
-       srcu_read_unlock(&vc->kvm->srcu, srcu_idx);
+       /* wait for secondary threads to finish writing their state to memory */
+       kvmppc_wait_for_nap();
+
+       /* Return to whole-core mode if we split the core earlier */
+       if (split > 1) {
+               unsigned long hid0 = mfspr(SPRN_HID0);
+               unsigned long loops = 0;
+
+               hid0 &= ~HID0_POWER8_DYNLPARDIS;
+               stat_bit = HID0_POWER8_2LPARMODE | HID0_POWER8_4LPARMODE;
+               mb();
+               mtspr(SPRN_HID0, hid0);
+               isync();
+               for (;;) {
+                       hid0 = mfspr(SPRN_HID0);
+                       if (!(hid0 & stat_bit))
+                               break;
+                       cpu_relax();
+                       ++loops;
+               }
+               split_info.do_nap = 0;
+       }
+
+       /* Let secondaries go back to the offline loop */
+       for (i = 0; i < threads_per_subcore; ++i) {
+               kvmppc_release_hwthread(pcpu + i);
+               if (sip && sip->napped[i])
+                       kvmppc_ipi_thread(pcpu + i);
+       }
+
+       spin_unlock(&vc->lock);
 
        /* make sure updates to secondary vcpu structs are visible now */
        smp_mb();
        kvm_guest_exit();
 
-       preempt_enable();
+       for (sub = 0; sub < core_info.n_subcores; ++sub)
+               list_for_each_entry_safe(pvc, vcnext, &core_info.vcs[sub],
+                                        preempt_list)
+                       post_guest_process(pvc, pvc == vc);
 
        spin_lock(&vc->lock);
-       post_guest_process(vc);
+       preempt_enable();
 
  out:
        vc->vcore_state = VCORE_INACTIVE;
@@ -2055,13 +2538,17 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
  * Wait for some other vcpu thread to execute us, and
  * wake us up when we need to handle something in the host.
  */
-static void kvmppc_wait_for_exec(struct kvm_vcpu *vcpu, int wait_state)
+static void kvmppc_wait_for_exec(struct kvmppc_vcore *vc,
+                                struct kvm_vcpu *vcpu, int wait_state)
 {
        DEFINE_WAIT(wait);
 
        prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state);
-       if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE)
+       if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
+               spin_unlock(&vc->lock);
                schedule();
+               spin_lock(&vc->lock);
+       }
        finish_wait(&vcpu->arch.cpu_run, &wait);
 }
 
@@ -2137,9 +2624,21 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
         * this thread straight away and have it join in.
         */
        if (!signal_pending(current)) {
-               if (vc->vcore_state == VCORE_RUNNING && !VCORE_IS_EXITING(vc)) {
+               if (vc->vcore_state == VCORE_PIGGYBACK) {
+                       struct kvmppc_vcore *mvc = vc->master_vcore;
+                       if (spin_trylock(&mvc->lock)) {
+                               if (mvc->vcore_state == VCORE_RUNNING &&
+                                   !VCORE_IS_EXITING(mvc)) {
+                                       kvmppc_create_dtl_entry(vcpu, vc);
+                                       kvmppc_start_thread(vcpu, vc);
+                                       trace_kvm_guest_enter(vcpu);
+                               }
+                               spin_unlock(&mvc->lock);
+                       }
+               } else if (vc->vcore_state == VCORE_RUNNING &&
+                          !VCORE_IS_EXITING(vc)) {
                        kvmppc_create_dtl_entry(vcpu, vc);
-                       kvmppc_start_thread(vcpu);
+                       kvmppc_start_thread(vcpu, vc);
                        trace_kvm_guest_enter(vcpu);
                } else if (vc->vcore_state == VCORE_SLEEPING) {
                        wake_up(&vc->wq);
@@ -2149,10 +2648,11 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 
        while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
               !signal_pending(current)) {
+               if (vc->vcore_state == VCORE_PREEMPT && vc->runner == NULL)
+                       kvmppc_vcore_end_preempt(vc);
+
                if (vc->vcore_state != VCORE_INACTIVE) {
-                       spin_unlock(&vc->lock);
-                       kvmppc_wait_for_exec(vcpu, TASK_INTERRUPTIBLE);
-                       spin_lock(&vc->lock);
+                       kvmppc_wait_for_exec(vc, vcpu, TASK_INTERRUPTIBLE);
                        continue;
                }
                list_for_each_entry_safe(v, vn, &vc->runnable_threads,
@@ -2179,10 +2679,11 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
                if (n_ceded == vc->n_runnable) {
                        kvmppc_vcore_blocked(vc);
                } else if (need_resched()) {
-                       vc->vcore_state = VCORE_PREEMPT;
+                       kvmppc_vcore_preempt(vc);
                        /* Let something else run */
                        cond_resched_lock(&vc->lock);
-                       vc->vcore_state = VCORE_INACTIVE;
+                       if (vc->vcore_state == VCORE_PREEMPT)
+                               kvmppc_vcore_end_preempt(vc);
                } else {
                        kvmppc_run_core(vc);
                }
@@ -2191,11 +2692,8 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 
        while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
               (vc->vcore_state == VCORE_RUNNING ||
-               vc->vcore_state == VCORE_EXITING)) {
-               spin_unlock(&vc->lock);
-               kvmppc_wait_for_exec(vcpu, TASK_UNINTERRUPTIBLE);
-               spin_lock(&vc->lock);
-       }
+               vc->vcore_state == VCORE_EXITING))
+               kvmppc_wait_for_exec(vc, vcpu, TASK_UNINTERRUPTIBLE);
 
        if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
                kvmppc_remove_runnable(vc, vcpu);
@@ -2755,6 +3253,8 @@ static int kvmppc_book3s_init_hv(void)
 
        init_default_hcalls();
 
+       init_vcore_lists();
+
        r = kvmppc_mmu_hv_init();
        return r;
 }
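For reference, the dynamic_mt_modes parameter added near the top of this file is a bitmask: value 2 permits 2-way splits, value 4 permits 4-way splits, and the default of 6 permits both. A deliberately simplified sketch of that check (illustrative only; the real test is subcore_config_ok() above, which also enforces the thread-count limit and can fall back to a 4-way split when 2-way mode is disallowed):

/* Simplified, hypothetical version of the mode check; see subcore_config_ok()
 * for the complete rules. */
static bool example_split_mode_allowed(int modes, int n_subcores)
{
        if (n_subcores <= 1)
                return true;            /* whole core, no split needed */
        if (n_subcores == 2)
                return modes & 2;       /* 2-way micro-threading */
        return modes & 4;               /* 3 or 4 subcores need 4-way mode */
}
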
index ed2589d4593fb710085014bcc81a3d2a0e9e54ff..fd7006bf6b1a1a59a86fe42efcbbb5c9b485e6a1 100644 (file)
@@ -110,14 +110,15 @@ void __init kvm_cma_reserve(void)
 long int kvmppc_rm_h_confer(struct kvm_vcpu *vcpu, int target,
                            unsigned int yield_count)
 {
-       struct kvmppc_vcore *vc = vcpu->arch.vcore;
+       struct kvmppc_vcore *vc = local_paca->kvm_hstate.kvm_vcore;
+       int ptid = local_paca->kvm_hstate.ptid;
        int threads_running;
        int threads_ceded;
        int threads_conferring;
        u64 stop = get_tb() + 10 * tb_ticks_per_usec;
        int rv = H_SUCCESS; /* => don't yield */
 
-       set_bit(vcpu->arch.ptid, &vc->conferring_threads);
+       set_bit(ptid, &vc->conferring_threads);
        while ((get_tb() < stop) && !VCORE_IS_EXITING(vc)) {
                threads_running = VCORE_ENTRY_MAP(vc);
                threads_ceded = vc->napping_threads;
@@ -127,7 +128,7 @@ long int kvmppc_rm_h_confer(struct kvm_vcpu *vcpu, int target,
                        break;
                }
        }
-       clear_bit(vcpu->arch.ptid, &vc->conferring_threads);
+       clear_bit(ptid, &vc->conferring_threads);
        return rv;
 }
 
@@ -238,7 +239,8 @@ void kvmhv_commence_exit(int trap)
 {
        struct kvmppc_vcore *vc = local_paca->kvm_hstate.kvm_vcore;
        int ptid = local_paca->kvm_hstate.ptid;
-       int me, ee;
+       struct kvm_split_mode *sip = local_paca->kvm_hstate.kvm_split_mode;
+       int me, ee, i;
 
        /* Set our bit in the threads-exiting-guest map in the 0xff00
           bits of vcore->entry_exit_map */
@@ -258,4 +260,26 @@ void kvmhv_commence_exit(int trap)
         */
        if (trap != BOOK3S_INTERRUPT_HV_DECREMENTER)
                kvmhv_interrupt_vcore(vc, ee & ~(1 << ptid));
+
+       /*
+        * If we are doing dynamic micro-threading, interrupt the other
+        * subcores to pull them out of their guests too.
+        */
+       if (!sip)
+               return;
+
+       for (i = 0; i < MAX_SUBCORES; ++i) {
+               vc = sip->master_vcs[i];
+               if (!vc)
+                       break;
+               do {
+                       ee = vc->entry_exit_map;
+                       /* Already asked to exit? */
+                       if ((ee >> 8) != 0)
+                               break;
+               } while (cmpxchg(&vc->entry_exit_map, ee,
+                                ee | VCORE_EXIT_REQ) != ee);
+               if ((ee >> 8) == 0)
+                       kvmhv_interrupt_vcore(vc, ee);
+       }
 }
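For context on the cmpxchg() loop above: entry_exit_map keeps the thread entry map in its low byte and the exit map in bits 8-15 (see the VCORE_EXIT_MAP()/VCORE_IS_EXITING() macros earlier in this diff), and the new VCORE_EXIT_REQ bit (0x10000) sits above both, so setting it from another subcore is enough to make the usual exit test fire. A small sketch of that test (illustrative only):

/* Hypothetical standalone version of the VCORE_IS_EXITING() test: any bit
 * above the entry byte (a thread that has started exiting, or the
 * VCORE_EXIT_REQ request bit) makes the vcore count as exiting. */
static inline bool example_vcore_is_exiting(u32 entry_exit_map)
{
        return (entry_exit_map >> 8) != 0;
}
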
index b027a89737b62cee4399e63eb9d61232f156687d..c1df9bb1e413a1ec76222a58cf0bc13c9bfdb280 100644 (file)
@@ -12,6 +12,7 @@
 #include <linux/kvm_host.h>
 #include <linux/hugetlb.h>
 #include <linux/module.h>
+#include <linux/log2.h>
 
 #include <asm/tlbflush.h>
 #include <asm/kvm_ppc.h>
@@ -97,25 +98,52 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
 }
 EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain);
 
+/* Update the changed page order field of an rmap entry */
+void kvmppc_update_rmap_change(unsigned long *rmap, unsigned long psize)
+{
+       unsigned long order;
+
+       if (!psize)
+               return;
+       order = ilog2(psize);
+       order <<= KVMPPC_RMAP_CHG_SHIFT;
+       if (order > (*rmap & KVMPPC_RMAP_CHG_ORDER))
+               *rmap = (*rmap & ~KVMPPC_RMAP_CHG_ORDER) | order;
+}
+EXPORT_SYMBOL_GPL(kvmppc_update_rmap_change);
+
+/* Returns a pointer to the revmap entry for the page mapped by a HPTE */
+static unsigned long *revmap_for_hpte(struct kvm *kvm, unsigned long hpte_v,
+                                     unsigned long hpte_gr)
+{
+       struct kvm_memory_slot *memslot;
+       unsigned long *rmap;
+       unsigned long gfn;
+
+       gfn = hpte_rpn(hpte_gr, hpte_page_size(hpte_v, hpte_gr));
+       memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
+       if (!memslot)
+               return NULL;
+
+       rmap = real_vmalloc_addr(&memslot->arch.rmap[gfn - memslot->base_gfn]);
+       return rmap;
+}
+
 /* Remove this HPTE from the chain for a real page */
 static void remove_revmap_chain(struct kvm *kvm, long pte_index,
                                struct revmap_entry *rev,
                                unsigned long hpte_v, unsigned long hpte_r)
 {
        struct revmap_entry *next, *prev;
-       unsigned long gfn, ptel, head;
-       struct kvm_memory_slot *memslot;
+       unsigned long ptel, head;
        unsigned long *rmap;
        unsigned long rcbits;
 
        rcbits = hpte_r & (HPTE_R_R | HPTE_R_C);
        ptel = rev->guest_rpte |= rcbits;
-       gfn = hpte_rpn(ptel, hpte_page_size(hpte_v, ptel));
-       memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
-       if (!memslot)
+       rmap = revmap_for_hpte(kvm, hpte_v, ptel);
+       if (!rmap)
                return;
-
-       rmap = real_vmalloc_addr(&memslot->arch.rmap[gfn - memslot->base_gfn]);
        lock_rmap(rmap);
 
        head = *rmap & KVMPPC_RMAP_INDEX;
@@ -131,6 +159,8 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
                        *rmap = (*rmap & ~KVMPPC_RMAP_INDEX) | head;
        }
        *rmap |= rcbits << KVMPPC_RMAP_RC_SHIFT;
+       if (rcbits & HPTE_R_C)
+               kvmppc_update_rmap_change(rmap, hpte_page_size(hpte_v, hpte_r));
        unlock_rmap(rmap);
 }
 
@@ -421,14 +451,20 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
        rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
        v = pte & ~HPTE_V_HVLOCK;
        if (v & HPTE_V_VALID) {
-               u64 pte1;
-
-               pte1 = be64_to_cpu(hpte[1]);
                hpte[0] &= ~cpu_to_be64(HPTE_V_VALID);
-               rb = compute_tlbie_rb(v, pte1, pte_index);
+               rb = compute_tlbie_rb(v, be64_to_cpu(hpte[1]), pte_index);
                do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags), true);
-               /* Read PTE low word after tlbie to get final R/C values */
-               remove_revmap_chain(kvm, pte_index, rev, v, pte1);
+               /*
+                * The reference (R) and change (C) bits in a HPT
+                * entry can be set by hardware at any time up until
+                * the HPTE is invalidated and the TLB invalidation
+                * sequence has completed.  This means that when
+                * removing a HPTE, we need to re-read the HPTE after
+                * the invalidation sequence has completed in order to
+                * obtain reliable values of R and C.
+                */
+               remove_revmap_chain(kvm, pte_index, rev, v,
+                                   be64_to_cpu(hpte[1]));
        }
        r = rev->guest_rpte & ~HPTE_GR_RESERVED;
        note_hpte_modification(kvm, rev);
@@ -655,6 +691,105 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
        return H_SUCCESS;
 }
 
+long kvmppc_h_clear_ref(struct kvm_vcpu *vcpu, unsigned long flags,
+                       unsigned long pte_index)
+{
+       struct kvm *kvm = vcpu->kvm;
+       __be64 *hpte;
+       unsigned long v, r, gr;
+       struct revmap_entry *rev;
+       unsigned long *rmap;
+       long ret = H_NOT_FOUND;
+
+       if (pte_index >= kvm->arch.hpt_npte)
+               return H_PARAMETER;
+
+       rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
+       hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
+       while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
+               cpu_relax();
+       v = be64_to_cpu(hpte[0]);
+       r = be64_to_cpu(hpte[1]);
+       if (!(v & (HPTE_V_VALID | HPTE_V_ABSENT)))
+               goto out;
+
+       gr = rev->guest_rpte;
+       if (rev->guest_rpte & HPTE_R_R) {
+               rev->guest_rpte &= ~HPTE_R_R;
+               note_hpte_modification(kvm, rev);
+       }
+       if (v & HPTE_V_VALID) {
+               gr |= r & (HPTE_R_R | HPTE_R_C);
+               if (r & HPTE_R_R) {
+                       kvmppc_clear_ref_hpte(kvm, hpte, pte_index);
+                       rmap = revmap_for_hpte(kvm, v, gr);
+                       if (rmap) {
+                               lock_rmap(rmap);
+                               *rmap |= KVMPPC_RMAP_REFERENCED;
+                               unlock_rmap(rmap);
+                       }
+               }
+       }
+       vcpu->arch.gpr[4] = gr;
+       ret = H_SUCCESS;
+ out:
+       unlock_hpte(hpte, v & ~HPTE_V_HVLOCK);
+       return ret;
+}
+
+long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags,
+                       unsigned long pte_index)
+{
+       struct kvm *kvm = vcpu->kvm;
+       __be64 *hpte;
+       unsigned long v, r, gr;
+       struct revmap_entry *rev;
+       unsigned long *rmap;
+       long ret = H_NOT_FOUND;
+
+       if (pte_index >= kvm->arch.hpt_npte)
+               return H_PARAMETER;
+
+       rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
+       hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
+       while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
+               cpu_relax();
+       v = be64_to_cpu(hpte[0]);
+       r = be64_to_cpu(hpte[1]);
+       if (!(v & (HPTE_V_VALID | HPTE_V_ABSENT)))
+               goto out;
+
+       gr = rev->guest_rpte;
+       if (gr & HPTE_R_C) {
+               rev->guest_rpte &= ~HPTE_R_C;
+               note_hpte_modification(kvm, rev);
+       }
+       if (v & HPTE_V_VALID) {
+               /* need to make it temporarily absent so C is stable */
+               hpte[0] |= cpu_to_be64(HPTE_V_ABSENT);
+               kvmppc_invalidate_hpte(kvm, hpte, pte_index);
+               r = be64_to_cpu(hpte[1]);
+               gr |= r & (HPTE_R_R | HPTE_R_C);
+               if (r & HPTE_R_C) {
+                       unsigned long psize = hpte_page_size(v, r);
+                       hpte[1] = cpu_to_be64(r & ~HPTE_R_C);
+                       eieio();
+                       rmap = revmap_for_hpte(kvm, v, gr);
+                       if (rmap) {
+                               lock_rmap(rmap);
+                               *rmap |= KVMPPC_RMAP_CHANGED;
+                               kvmppc_update_rmap_change(rmap, psize);
+                               unlock_rmap(rmap);
+                       }
+               }
+       }
+       vcpu->arch.gpr[4] = gr;
+       ret = H_SUCCESS;
+ out:
+       unlock_hpte(hpte, v & ~HPTE_V_HVLOCK);
+       return ret;
+}
+
 void kvmppc_invalidate_hpte(struct kvm *kvm, __be64 *hptep,
                        unsigned long pte_index)
 {
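
The H_CLEAR_REF and H_CLEAR_MOD handlers added above follow the existing HPT locking convention: each hash PTE is locked by setting the HPTE_V_HVLOCK bit via try_lock_hpte(), spinning with cpu_relax() until the bit can be taken, and unlocked by storing the word back with the bit cleared. Below is a rough user-space sketch of that bit-lock idea using C11 atomics; HVLOCK_BIT, desc_trylock() and friends are invented names for illustration, not the kernel's own helpers.

/* Rough user-space sketch (not kernel code) of locking a 64-bit descriptor
 * word by setting a lock bit with an atomic compare-and-swap, mirroring the
 * try_lock_hpte()/cpu_relax()/unlock_hpte() pattern used above.  HVLOCK_BIT,
 * desc_trylock() and friends are invented names.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define HVLOCK_BIT (1ULL << 62)		/* assumed free bit, for illustration */

static bool desc_trylock(_Atomic uint64_t *desc)
{
	uint64_t old = atomic_load_explicit(desc, memory_order_relaxed);

	if (old & HVLOCK_BIT)		/* someone else holds the lock */
		return false;
	/* take the lock only if the word has not changed in the meantime */
	return atomic_compare_exchange_strong_explicit(desc, &old,
			old | HVLOCK_BIT,
			memory_order_acquire, memory_order_relaxed);
}

static void desc_lock(_Atomic uint64_t *desc)
{
	while (!desc_trylock(desc))
		;			/* the kernel spins with cpu_relax() */
}

static void desc_unlock(_Atomic uint64_t *desc, uint64_t new_val)
{
	/* write the updated word back with the lock bit cleared */
	atomic_store_explicit(desc, new_val & ~HVLOCK_BIT, memory_order_release);
}
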
index 00e45b6d4f2464a0bde10762f5f995ea43b2296a..24f58076d49e1eada68920615c8ce2c3b1166744 100644 (file)
@@ -67,14 +67,12 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
        }
 
        /* Check if the core is loaded, if not, too hard */
-       cpu = vcpu->cpu;
+       cpu = vcpu->arch.thread_cpu;
        if (cpu < 0 || cpu >= nr_cpu_ids) {
                this_icp->rm_action |= XICS_RM_KICK_VCPU;
                this_icp->rm_kick_target = vcpu;
                return;
        }
-       /* In SMT cpu will always point to thread 0, we adjust it */
-       cpu += vcpu->arch.ptid;
 
        smp_mb();
        kvmhv_rm_send_ipi(cpu);
index faa86e9c05510973001b7d3c64557e8a9cbd212d..2273dcacef39fe9e2257303a18121116044694c6 100644 (file)
@@ -128,6 +128,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
        subf    r4, r4, r3
        mtspr   SPRN_DEC, r4
 
+       /* hwthread_req may have got set by cede or no vcpu, so clear it */
+       li      r0, 0
+       stb     r0, HSTATE_HWTHREAD_REQ(r13)
+
        /*
         * For external and machine check interrupts, we need
         * to call the Linux handler to process the interrupt.
@@ -215,7 +219,6 @@ kvm_novcpu_wakeup:
        ld      r5, HSTATE_KVM_VCORE(r13)
        li      r0, 0
        stb     r0, HSTATE_NAPPING(r13)
-       stb     r0, HSTATE_HWTHREAD_REQ(r13)
 
        /* check the wake reason */
        bl      kvmppc_check_wake_reason
@@ -315,10 +318,10 @@ kvm_start_guest:
        cmpdi   r3, 0
        bge     kvm_no_guest
 
-       /* get vcpu pointer, NULL if we have no vcpu to run */
-       ld      r4,HSTATE_KVM_VCPU(r13)
-       cmpdi   r4,0
-       /* if we have no vcpu to run, go back to sleep */
+       /* get vcore pointer, NULL if we have nothing to run */
+       ld      r5,HSTATE_KVM_VCORE(r13)
+       cmpdi   r5,0
+       /* if we have no vcore to run, go back to sleep */
        beq     kvm_no_guest
 
 kvm_secondary_got_guest:
@@ -327,21 +330,42 @@ kvm_secondary_got_guest:
        ld      r6, PACA_DSCR_DEFAULT(r13)
        std     r6, HSTATE_DSCR(r13)
 
-       /* Order load of vcore, ptid etc. after load of vcpu */
+       /* On thread 0 of a subcore, set HDEC to max */
+       lbz     r4, HSTATE_PTID(r13)
+       cmpwi   r4, 0
+       bne     63f
+       lis     r6, 0x7fff
+       ori     r6, r6, 0xffff
+       mtspr   SPRN_HDEC, r6
+       /* and set per-LPAR registers, if doing dynamic micro-threading */
+       ld      r6, HSTATE_SPLIT_MODE(r13)
+       cmpdi   r6, 0
+       beq     63f
+       ld      r0, KVM_SPLIT_RPR(r6)
+       mtspr   SPRN_RPR, r0
+       ld      r0, KVM_SPLIT_PMMAR(r6)
+       mtspr   SPRN_PMMAR, r0
+       ld      r0, KVM_SPLIT_LDBAR(r6)
+       mtspr   SPRN_LDBAR, r0
+       isync
+63:
+       /* Order load of vcpu after load of vcore */
        lwsync
+       ld      r4, HSTATE_KVM_VCPU(r13)
        bl      kvmppc_hv_entry
 
        /* Back from the guest, go back to nap */
-       /* Clear our vcpu pointer so we don't come back in early */
+       /* Clear our vcpu and vcore pointers so we don't come back in early */
        li      r0, 0
+       std     r0, HSTATE_KVM_VCPU(r13)
        /*
-        * Once we clear HSTATE_KVM_VCPU(r13), the code in
+        * Once we clear HSTATE_KVM_VCORE(r13), the code in
         * kvmppc_run_core() is going to assume that all our vcpu
         * state is visible in memory.  This lwsync makes sure
         * that that is true.
         */
        lwsync
-       std     r0, HSTATE_KVM_VCPU(r13)
+       std     r0, HSTATE_KVM_VCORE(r13)
 
 /*
  * At this point we have finished executing in the guest.
@@ -374,16 +398,71 @@ kvm_no_guest:
        b       power7_wakeup_loss
 
 53:    HMT_LOW
-       ld      r4, HSTATE_KVM_VCPU(r13)
-       cmpdi   r4, 0
+       ld      r5, HSTATE_KVM_VCORE(r13)
+       cmpdi   r5, 0
+       bne     60f
+       ld      r3, HSTATE_SPLIT_MODE(r13)
+       cmpdi   r3, 0
+       beq     kvm_no_guest
+       lbz     r0, KVM_SPLIT_DO_NAP(r3)
+       cmpwi   r0, 0
        beq     kvm_no_guest
        HMT_MEDIUM
+       b       kvm_unsplit_nap
+60:    HMT_MEDIUM
        b       kvm_secondary_got_guest
 
 54:    li      r0, KVM_HWTHREAD_IN_KVM
        stb     r0, HSTATE_HWTHREAD_STATE(r13)
        b       kvm_no_guest
 
+/*
+ * Here the primary thread is trying to return the core to
+ * whole-core mode, so we need to nap.
+ */
+kvm_unsplit_nap:
+       /*
+        * Ensure that secondary doesn't nap when it has
+        * its vcore pointer set.
+        */
+       sync            /* matches smp_mb() before setting split_info.do_nap */
+       ld      r0, HSTATE_KVM_VCORE(r13)
+       cmpdi   r0, 0
+       bne     kvm_no_guest
+       /* clear any pending message */
+BEGIN_FTR_SECTION
+       lis     r6, (PPC_DBELL_SERVER << (63-36))@h
+       PPC_MSGCLR(6)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+       /* Set kvm_split_mode.napped[tid] = 1 */
+       ld      r3, HSTATE_SPLIT_MODE(r13)
+       li      r0, 1
+       lhz     r4, PACAPACAINDEX(r13)
+       clrldi  r4, r4, 61      /* micro-threading => P8 => 8 threads/core */
+       addi    r4, r4, KVM_SPLIT_NAPPED
+       stbx    r0, r3, r4
+       /* Check the do_nap flag again after setting napped[] */
+       sync
+       lbz     r0, KVM_SPLIT_DO_NAP(r3)
+       cmpwi   r0, 0
+       beq     57f
+       li      r3, (LPCR_PECEDH | LPCR_PECE0) >> 4
+       mfspr   r4, SPRN_LPCR
+       rlwimi  r4, r3, 4, (LPCR_PECEDP | LPCR_PECEDH | LPCR_PECE0 | LPCR_PECE1)
+       mtspr   SPRN_LPCR, r4
+       isync
+       std     r0, HSTATE_SCRATCH0(r13)
+       ptesync
+       ld      r0, HSTATE_SCRATCH0(r13)
+1:     cmpd    r0, r0
+       bne     1b
+       nap
+       b       .
+
+57:    li      r0, 0
+       stbx    r0, r3, r4
+       b       kvm_no_guest
+
 /******************************************************************************
  *                                                                            *
  *                               Entry code                                   *
@@ -854,7 +933,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
        cmpwi   r0, 0
        bne     21f
        HMT_LOW
-20:    lbz     r0, VCORE_IN_GUEST(r5)
+20:    lwz     r3, VCORE_ENTRY_EXIT(r5)
+       cmpwi   r3, 0x100
+       bge     no_switch_exit
+       lbz     r0, VCORE_IN_GUEST(r5)
        cmpwi   r0, 0
        beq     20b
        HMT_MEDIUM
@@ -870,7 +952,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
        blt     hdec_soon
 
        ld      r6, VCPU_CTR(r4)
-       lwz     r7, VCPU_XER(r4)
+       ld      r7, VCPU_XER(r4)
 
        mtctr   r6
        mtxer   r7
@@ -985,9 +1067,13 @@ secondary_too_late:
 #endif
 11:    b       kvmhv_switch_to_host
 
+no_switch_exit:
+       HMT_MEDIUM
+       li      r12, 0
+       b       12f
 hdec_soon:
        li      r12, BOOK3S_INTERRUPT_HV_DECREMENTER
-       stw     r12, VCPU_TRAP(r4)
+12:    stw     r12, VCPU_TRAP(r4)
        mr      r9, r4
 #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
        addi    r3, r4, VCPU_TB_RMEXIT
@@ -1103,7 +1189,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
        mfctr   r3
        mfxer   r4
        std     r3, VCPU_CTR(r9)
-       stw     r4, VCPU_XER(r9)
+       std     r4, VCPU_XER(r9)
 
        /* If this is a page table miss then see if it's theirs or ours */
        cmpwi   r12, BOOK3S_INTERRUPT_H_DATA_STORAGE
@@ -1127,6 +1213,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
        cmpwi   r12, BOOK3S_INTERRUPT_H_DOORBELL
        bne     3f
        lbz     r0, HSTATE_HOST_IPI(r13)
+       cmpwi   r0, 0
        beq     4f
        b       guest_exit_cont
 3:
@@ -1176,6 +1263,11 @@ mc_cont:
        ld      r9, HSTATE_KVM_VCPU(r13)
        lwz     r12, VCPU_TRAP(r9)
 
+       /* Stop others sending VCPU interrupts to this physical CPU */
+       li      r0, -1
+       stw     r0, VCPU_CPU(r9)
+       stw     r0, VCPU_THREAD_CPU(r9)
+
        /* Save guest CTRL register, set runlatch to 1 */
        mfspr   r6,SPRN_CTRLF
        stw     r6,VCPU_CTRL(r9)
@@ -1540,12 +1632,17 @@ kvmhv_switch_to_host:
 
        /* Primary thread waits for all the secondaries to exit guest */
 15:    lwz     r3,VCORE_ENTRY_EXIT(r5)
-       srwi    r0,r3,8
+       rlwinm  r0,r3,32-8,0xff
        clrldi  r3,r3,56
        cmpw    r3,r0
        bne     15b
        isync
 
+       /* Did we actually switch to the guest at all? */
+       lbz     r6, VCORE_IN_GUEST(r5)
+       cmpwi   r6, 0
+       beq     19f
+
        /* Primary thread switches back to host partition */
        ld      r6,KVM_HOST_SDR1(r4)
        lwz     r7,KVM_HOST_LPID(r4)
@@ -1589,7 +1686,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 18:
        /* Signal secondary CPUs to continue */
        stb     r0,VCORE_IN_GUEST(r5)
-       lis     r8,0x7fff               /* MAX_INT@h */
+19:    lis     r8,0x7fff               /* MAX_INT@h */
        mtspr   SPRN_HDEC,r8
 
 16:    ld      r8,KVM_HOST_LPCR(r4)
@@ -1675,7 +1772,7 @@ kvmppc_hdsi:
        bl      kvmppc_msr_interrupt
 fast_interrupt_c_return:
 6:     ld      r7, VCPU_CTR(r9)
-       lwz     r8, VCPU_XER(r9)
+       ld      r8, VCPU_XER(r9)
        mtctr   r7
        mtxer   r8
        mr      r4, r9
@@ -1816,8 +1913,8 @@ hcall_real_table:
        .long   DOTSYM(kvmppc_h_remove) - hcall_real_table
        .long   DOTSYM(kvmppc_h_enter) - hcall_real_table
        .long   DOTSYM(kvmppc_h_read) - hcall_real_table
-       .long   0               /* 0x10 - H_CLEAR_MOD */
-       .long   0               /* 0x14 - H_CLEAR_REF */
+       .long   DOTSYM(kvmppc_h_clear_mod) - hcall_real_table
+       .long   DOTSYM(kvmppc_h_clear_ref) - hcall_real_table
        .long   DOTSYM(kvmppc_h_protect) - hcall_real_table
        .long   DOTSYM(kvmppc_h_get_tce) - hcall_real_table
        .long   DOTSYM(kvmppc_h_put_tce) - hcall_real_table
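
Two of the hunks in this file are plain width and masking fixes. XER is a 64-bit register, so the guest XER is now saved and restored with std/ld rather than stw/lwz, and the loop where the primary thread waits for the secondaries now masks the rotated VCORE_ENTRY_EXIT word with 0xff, so that only the two byte-wide fields packed into that word are compared (the old srwi left any higher bytes in the comparison). A small stand-alone sketch of the rotate-and-mask equivalence, with an invented example value:

/* Stand-alone sketch (not kernel code): C equivalents of the rotate-and-mask
 * instructions in the wait loop above.  The value and field names are
 * invented for the example.
 */
#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t entry_exit = 0x00000302;	/* example packed word */

	/* rlwinm r0,r3,32-8,0xff : rotate left 24 (= right 8), keep one byte */
	uint32_t upper_byte = (entry_exit >> 8) & 0xff;

	/* clrldi r3,r3,56 : clear everything but the low byte */
	uint32_t lower_byte = entry_exit & 0xff;

	/* the assembly loop keeps spinning until the two fields are equal;
	 * the old srwi r0,r3,8 would also have compared bits above the byte */
	assert(upper_byte == 0x03 && lower_byte == 0x02);
	return 0;
}
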
index bd6ab1672ae64a9261282c800a6160b20356a950..a759d9adb0b6f8218c38c52520dfe3cdfa70c884 100644 (file)
@@ -352,7 +352,7 @@ static inline u32 inst_get_field(u32 inst, int msb, int lsb)
        return kvmppc_get_field(inst, msb + 32, lsb + 32);
 }
 
-bool kvmppc_inst_is_paired_single(struct kvm_vcpu *vcpu, u32 inst)
+static bool kvmppc_inst_is_paired_single(struct kvm_vcpu *vcpu, u32 inst)
 {
        if (!(vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE))
                return false;
index acee37cde840a3b18a512a09bfc79634a8e26f07..ca8f174289bb43a148bdd9159f71ca645e0c416a 100644 (file)
@@ -123,7 +123,7 @@ no_dcbz32_on:
        PPC_LL  r8, SVCPU_CTR(r3)
        PPC_LL  r9, SVCPU_LR(r3)
        lwz     r10, SVCPU_CR(r3)
-       lwz     r11, SVCPU_XER(r3)
+       PPC_LL  r11, SVCPU_XER(r3)
 
        mtctr   r8
        mtlr    r9
@@ -237,7 +237,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
        mfctr   r8
        mflr    r9
 
-       stw     r5, SVCPU_XER(r13)
+       PPC_STL r5, SVCPU_XER(r13)
        PPC_STL r6, SVCPU_FAULT_DAR(r13)
        stw     r7, SVCPU_FAULT_DSISR(r13)
        PPC_STL r8, SVCPU_CTR(r13)
index c6ca7db646735428fb14bde6af9ed1eb68b98351..905e94a1370f4982060b4d917e6fb5b8202d4fd3 100644 (file)
@@ -41,7 +41,7 @@
  * =======
  *
  * Each ICS has a spin lock protecting the information about the IRQ
- * sources and avoiding simultaneous deliveries if the same interrupt.
+ * sources and avoiding simultaneous deliveries of the same interrupt.
  *
  * ICP operations are done via a single compare & swap transaction
  * (most ICP state fits in the union kvmppc_icp_state)
index cc5842657161580262d10fb49e0bdfde4de497fd..ae458f0fd061efea7cdd569c1bdb32970503659c 100644 (file)
@@ -933,6 +933,7 @@ static void kvmppc_restart_interrupt(struct kvm_vcpu *vcpu,
 #endif
                break;
        case BOOKE_INTERRUPT_CRITICAL:
+               kvmppc_fill_pt_regs(&regs);
                unknown_exception(&regs);
                break;
        case BOOKE_INTERRUPT_DEBUG:
index 50860e919cb81777a0ef4015e48f9bfda2b3e810..29911a07bcdb071d1f94db55ca8e7ce4e2b9e5d3 100644 (file)
@@ -377,7 +377,7 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, gva_t ea)
                        | MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]);
                vcpu->arch.shared->mas1 =
                          (vcpu->arch.shared->mas6 & MAS6_SPID0)
-                       | (vcpu->arch.shared->mas6 & (MAS6_SAS ? MAS1_TS : 0))
+                       | ((vcpu->arch.shared->mas6 & MAS6_SAS) ? MAS1_TS : 0)
                        | (vcpu->arch.shared->mas4 & MAS4_TSIZED(~0));
                vcpu->arch.shared->mas2 &= MAS2_EPN;
                vcpu->arch.shared->mas2 |= vcpu->arch.shared->mas4 &
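
The one-line e500 change above fixes an operator-precedence bug: in mas6 & (MAS6_SAS ? MAS1_TS : 0) the ternary is evaluated first, and since MAS6_SAS is a non-zero constant the whole expression reduces to mas6 & MAS1_TS, which never looks at the SAS bit; the intent is to set MAS1_TS only when SAS is actually set in MAS6. A stand-alone illustration with invented stand-in flag values (FLAG_SAS, FLAG_TS), not the real MAS register layouts:

/* Stand-alone illustration (not kernel code) of the precedence fix above.
 * FLAG_SAS and FLAG_TS are invented stand-ins for MAS6_SAS and MAS1_TS.
 */
#include <stdio.h>

#define FLAG_SAS 0x00000001u	/* bit tested in the source value */
#define FLAG_TS  0x00001000u	/* bit that should be set in the result */

static unsigned int old_way(unsigned int mas6)
{
	/* the ternary binds first; FLAG_SAS is non-zero, so this is
	 * simply mas6 & FLAG_TS, which never looks at the SAS bit */
	return mas6 & (FLAG_SAS ? FLAG_TS : 0);
}

static unsigned int new_way(unsigned int mas6)
{
	/* test the SAS bit, then decide whether to report TS */
	return (mas6 & FLAG_SAS) ? FLAG_TS : 0;
}

int main(void)
{
	/* SAS set: result should contain FLAG_TS, but old_way() returns 0 */
	printf("SAS set:   old=%#x new=%#x\n",
	       old_way(FLAG_SAS), new_way(FLAG_SAS));
	/* SAS clear: result should be 0, but old_way() returns FLAG_TS */
	printf("SAS clear: old=%#x new=%#x\n",
	       old_way(FLAG_TS), new_way(FLAG_TS));
	return 0;
}
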
index e5dde32fe71fc1856cb5ae515150aa7130de3c96..2e51289610e432b420310971afbe374f0e0938cf 100644 (file)
@@ -660,7 +660,7 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
        return kvmppc_core_pending_dec(vcpu);
 }
 
-enum hrtimer_restart kvmppc_decrementer_wakeup(struct hrtimer *timer)
+static enum hrtimer_restart kvmppc_decrementer_wakeup(struct hrtimer *timer)
 {
        struct kvm_vcpu *vcpu;
 
index e1fe333da94680b9d64915c5a13850e6079d8578..22d94c3e6fc43bf7afbd63bfd483074a9e18589a 100644 (file)
@@ -113,7 +113,7 @@ int memory_add_physaddr_to_nid(u64 start)
 }
 #endif
 
-int arch_add_memory(int nid, u64 start, u64 size)
+int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
 {
        struct pglist_data *pgdata;
        struct zone *zone;
@@ -128,7 +128,7 @@ int arch_add_memory(int nid, u64 start, u64 size)
 
        /* this should work for most non-highmem platforms */
        zone = pgdata->node_zones +
-               zone_for_memory(nid, start, size, 0);
+               zone_for_memory(nid, start, size, 0, for_device);
 
        return __add_pages(nid, zone, start_pfn, nr_pages);
 }
index e865d748179b2ac1b0b7550c1d3bebbcae71e049..2d4f60c0119aa72bdf0e63c5fa8c6d49b8c1968a 100644 (file)
@@ -123,7 +123,7 @@ static int __init cbe_ptcal_enable_on_node(int nid, int order)
 
        area->nid = nid;
        area->order = order;
-       area->pages = alloc_pages_exact_node(area->nid,
+       area->pages = __alloc_pages_node(area->nid,
                                                GFP_KERNEL|__GFP_THISNODE,
                                                area->order);
 
index f86250c48b53fa94638e0e50b637cd43f23ece98..d2b79bc336c1036885a289ab227319ef0e6eb81f 100644 (file)
@@ -141,13 +141,14 @@ axon_ram_make_request(struct request_queue *queue, struct bio *bio)
  */
 static long
 axon_ram_direct_access(struct block_device *device, sector_t sector,
-                      void **kaddr, unsigned long *pfn, long size)
+                      void __pmem **kaddr, unsigned long *pfn)
 {
        struct axon_ram_bank *bank = device->bd_disk->private_data;
        loff_t offset = (loff_t)sector << AXON_RAM_SECTOR_SHIFT;
+       void *addr = (void *)(bank->ph_addr + offset);
 
-       *kaddr = (void *)(bank->ph_addr + offset);
-       *pfn = virt_to_phys(*kaddr) >> PAGE_SHIFT;
+       *kaddr = (void __pmem *)addr;
+       *pfn = virt_to_phys(addr) >> PAGE_SHIFT;
 
        return bank->size - offset;
 }
index 4827870f7a6d8c00925b7d052ed68efda20f364e..1d57000b1b24ad6c6946f67ea821385e436391b6 100644 (file)
@@ -48,6 +48,7 @@ config ARCH_SUPPORTS_DEBUG_PAGEALLOC
 
 config KEXEC
        def_bool y
+       select KEXEC_CORE
 
 config AUDIT_ARCH
        def_bool y
index 42506b371b74144886e42a9ec21d43edcf680566..4da604ebf6fd8edd75eb01951913c79991d0eca5 100644 (file)
@@ -167,7 +167,7 @@ unsigned long decompress_kernel(void)
 #endif
 
        puts("Uncompressing Linux... ");
-       decompress(input_data, input_len, NULL, NULL, output, NULL, error);
+       __decompress(input_data, input_len, NULL, NULL, output, 0, NULL, error);
        puts("Ok, booting the kernel.\n");
        return (unsigned long) output;
 }
index 9d395961e71380484e7813f42240e6dacf5e6714..b3fd54d93dd20f85147c9e1e884e08e7358ec914 100644 (file)
@@ -18,27 +18,13 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
        return &s390_dma_ops;
 }
 
-extern int dma_set_mask(struct device *dev, u64 mask);
-
 static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
                                  enum dma_data_direction direction)
 {
 }
 
-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
-#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
-
 #include <asm-generic/dma-mapping-common.h>
 
-static inline int dma_supported(struct device *dev, u64 mask)
-{
-       struct dma_map_ops *dma_ops = get_dma_ops(dev);
-
-       if (dma_ops->dma_supported == NULL)
-               return 1;
-       return dma_ops->dma_supported(dev, mask);
-}
-
 static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
 {
        if (!dev->dma_mask)
@@ -46,45 +32,4 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
        return addr + size - 1 <= *dev->dma_mask;
 }
 
-static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-       struct dma_map_ops *dma_ops = get_dma_ops(dev);
-
-       debug_dma_mapping_error(dev, dma_addr);
-       if (dma_ops->mapping_error)
-               return dma_ops->mapping_error(dev, dma_addr);
-       return dma_addr == DMA_ERROR_CODE;
-}
-
-#define dma_alloc_coherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                   dma_addr_t *dma_handle, gfp_t flags,
-                                   struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-       void *cpu_addr;
-
-       BUG_ON(!ops);
-
-       cpu_addr = ops->alloc(dev, size, dma_handle, flags, attrs);
-       debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
-
-       return cpu_addr;
-}
-
-#define dma_free_coherent(d, s, c, h) dma_free_attrs(d, s, c, h, NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                 void *cpu_addr, dma_addr_t dma_handle,
-                                 struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       BUG_ON(!ops);
-
-       debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
-       ops->free(dev, size, cpu_addr, dma_handle, attrs);
-}
-
 #endif /* _ASM_S390_DMA_MAPPING_H */
index 2963b563621c29e345ca61541aaa472b453d9072..c3c07d3505ba17ebe953d4310508abeb6b63fba1 100644 (file)
@@ -169,7 +169,7 @@ void __init free_initrd_mem(unsigned long start, unsigned long end)
 #endif
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size)
+int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
 {
        unsigned long normal_end_pfn = PFN_DOWN(memblock_end_of_DRAM());
        unsigned long dma_end_pfn = PFN_DOWN(MAX_DMA_ADDRESS);
index 42b76580c8b8a6fa155da58f106d83096089389b..37505b8b4093782bae7e7062e39ea8e8fa50dbc7 100644 (file)
@@ -262,16 +262,6 @@ out:
        spin_unlock_irqrestore(&zdev->iommu_bitmap_lock, flags);
 }
 
-int dma_set_mask(struct device *dev, u64 mask)
-{
-       if (!dev->dma_mask || !dma_supported(dev, mask))
-               return -EIO;
-
-       *dev->dma_mask = mask;
-       return 0;
-}
-EXPORT_SYMBOL_GPL(dma_set_mask);
-
 static dma_addr_t s390_dma_map_pages(struct device *dev, struct page *page,
                                     unsigned long offset, size_t size,
                                     enum dma_data_direction direction,
index 50057fed819ddf3c07a8e16841c65d63cbaa5168..d514df7e04dd4c866597bd0772bfdc6a7dc000f9 100644 (file)
@@ -602,6 +602,7 @@ source kernel/Kconfig.hz
 config KEXEC
        bool "kexec system call (EXPERIMENTAL)"
        depends on SUPERH32 && MMU
+       select KEXEC_CORE
        help
          kexec is a system call that implements the ability to shutdown your
          current kernel, and to start another kernel.  It is like a reboot
index 95470a472d2cf793ddad131f55805385a514e65f..208a9753ab38cd0a532c63b19c379cae8a8494e7 100644 (file)
@@ -132,7 +132,7 @@ void decompress_kernel(void)
 
        puts("Uncompressing Linux... ");
        cache_control(CACHE_ENABLE);
-       decompress(input_data, input_len, NULL, NULL, output, NULL, error);
+       __decompress(input_data, input_len, NULL, NULL, output, 0, NULL, error);
        cache_control(CACHE_DISABLE);
        puts("Ok, booting the kernel.\n");
 }
index b437f2c780b83f6c84d427f18af4416fa9e42fd7..a3745a3fe0290896a2a14450e6e47e8caf30a793 100644 (file)
@@ -9,86 +9,13 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
        return dma_ops;
 }
 
-#include <asm-generic/dma-coherent.h>
-#include <asm-generic/dma-mapping-common.h>
-
-static inline int dma_supported(struct device *dev, u64 mask)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       if (ops->dma_supported)
-               return ops->dma_supported(dev, mask);
-
-       return 1;
-}
-
-static inline int dma_set_mask(struct device *dev, u64 mask)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
+#define DMA_ERROR_CODE 0
 
-       if (!dev->dma_mask || !dma_supported(dev, mask))
-               return -EIO;
-       if (ops->set_dma_mask)
-               return ops->set_dma_mask(dev, mask);
-
-       *dev->dma_mask = mask;
-
-       return 0;
-}
+#include <asm-generic/dma-mapping-common.h>
 
 void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
                    enum dma_data_direction dir);
 
-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
-#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
-
-static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       debug_dma_mapping_error(dev, dma_addr);
-       if (ops->mapping_error)
-               return ops->mapping_error(dev, dma_addr);
-
-       return dma_addr == 0;
-}
-
-#define dma_alloc_coherent(d,s,h,f)    dma_alloc_attrs(d,s,h,f,NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                   dma_addr_t *dma_handle, gfp_t gfp,
-                                   struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-       void *memory;
-
-       if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
-               return memory;
-       if (!ops->alloc)
-               return NULL;
-
-       memory = ops->alloc(dev, size, dma_handle, gfp, attrs);
-       debug_dma_alloc_coherent(dev, size, *dma_handle, memory);
-
-       return memory;
-}
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                 void *vaddr, dma_addr_t dma_handle,
-                                 struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       if (dma_release_from_coherent(dev, get_order(size), vaddr))
-               return;
-
-       debug_dma_free_coherent(dev, size, vaddr, dma_handle);
-       if (ops->free)
-               ops->free(dev, size, vaddr, dma_handle, attrs);
-}
-
 /* arch/sh/mm/consistent.c */
 extern void *dma_generic_alloc_coherent(struct device *dev, size_t size,
                                        dma_addr_t *dma_addr, gfp_t flag,
index e79fb6ebaa42370d7db5295355842bbd3b25edaa..1f157b86eaa7a188cd2203a9eebe5888c259acdb 100644 (file)
@@ -9,7 +9,7 @@
 #ifndef __ASSEMBLY__
 extern void mcount(void);
 
-#define MCOUNT_ADDR            ((long)(mcount))
+#define MCOUNT_ADDR            ((unsigned long)(mcount))
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 #define CALL_ADDR              ((long)(ftrace_call))
index 93ec9066dbef307d46e4111cf572306ee5d013c8..3280a6bfa5036c7db9f51b9f6d65266eedb74603 100644 (file)
@@ -342,6 +342,7 @@ ioremap_cache(phys_addr_t offset, unsigned long size)
 {
        return __ioremap_mode(offset, size, PAGE_KERNEL);
 }
+#define ioremap_cache ioremap_cache
 
 #ifdef CONFIG_HAVE_IOREMAP_PROT
 static inline void __iomem *
index 17f486233db03c4d6d14d4c283b2699a73b60caa..75491862d9009f3ba8ce3478d76960257dc215a3 100644 (file)
@@ -485,7 +485,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 #endif
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size)
+int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
 {
        pg_data_t *pgdat;
        unsigned long start_pfn = PFN_DOWN(start);
@@ -496,7 +496,8 @@ int arch_add_memory(int nid, u64 start, u64 size)
 
        /* We only have ZONE_NORMAL, so this is easy.. */
        ret = __add_pages(nid, pgdat->node_zones +
-                       zone_for_memory(nid, start, size, ZONE_NORMAL),
+                       zone_for_memory(nid, start, size, ZONE_NORMAL,
+                       for_device),
                        start_pfn, nr_pages);
        if (unlikely(ret))
                printk("%s: Failed, __add_pages() == %d\n", __func__, ret);
index 7e064c68c5ec8a0ab538a15947d5c44b2db0a322..a21da597b0b59d49cce2f5fcca68a4274bccbfc6 100644 (file)
@@ -7,11 +7,9 @@
 
 #define DMA_ERROR_CODE (~(dma_addr_t)0x0)
 
+#define HAVE_ARCH_DMA_SUPPORTED 1
 int dma_supported(struct device *dev, u64 mask);
 
-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
-#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
-
 static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
                                  enum dma_data_direction dir)
 {
@@ -39,39 +37,7 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
        return dma_ops;
 }
 
-#include <asm-generic/dma-mapping-common.h>
-
-#define dma_alloc_coherent(d,s,h,f)    dma_alloc_attrs(d,s,h,f,NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                   dma_addr_t *dma_handle, gfp_t flag,
-                                   struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-       void *cpu_addr;
-
-       cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs);
-       debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
-       return cpu_addr;
-}
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                 void *cpu_addr, dma_addr_t dma_handle,
-                                 struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
-       ops->free(dev, size, cpu_addr, dma_handle, attrs);
-}
-
-static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-       debug_dma_mapping_error(dev, dma_addr);
-       return (dma_addr == DMA_ERROR_CODE);
-}
+#define HAVE_ARCH_DMA_SET_MASK 1
 
 static inline int dma_set_mask(struct device *dev, u64 mask)
 {
@@ -86,4 +52,6 @@ static inline int dma_set_mask(struct device *dev, u64 mask)
        return -EINVAL;
 }
 
+#include <asm-generic/dma-mapping-common.h>
+
 #endif
index 9ec94ad116fbdded41d184e4b11f3adb8d0544f7..3192a8e42fd62c6f244a2b5ba5887da395847abd 100644 (file)
@@ -2,7 +2,7 @@
 #define _ASM_SPARC64_FTRACE
 
 #ifdef CONFIG_MCOUNT
-#define MCOUNT_ADDR            ((long)(_mcount))
+#define MCOUNT_ADDR            ((unsigned long)(_mcount))
 #define MCOUNT_INSN_SIZE       4 /* sizeof mcount call */
 
 #ifndef __ASSEMBLY__
index f06b36a00a3b1eda581464d2fe9f39a8980fe17f..91b963a887b781a3b04c0cdc3cfed727ef8e5d73 100644 (file)
@@ -14,7 +14,7 @@
 #include <asm-generic/4level-fixup.h>
 
 #include <linux/spinlock.h>
-#include <linux/swap.h>
+#include <linux/mm_types.h>
 #include <asm/types.h>
 #include <asm/pgtsrmmu.h>
 #include <asm/vaddrs.h>
index 3a14a35592fe928754d7d1d946110b576a4bd95f..b91d7f1461758ba759413042c1529faf4196912b 100644 (file)
@@ -231,8 +231,7 @@ static void pci_parse_of_addrs(struct platform_device *op,
                        res = &dev->resource[(i - PCI_BASE_ADDRESS_0) >> 2];
                } else if (i == dev->rom_base_reg) {
                        res = &dev->resource[PCI_ROM_RESOURCE];
-                       flags |= IORESOURCE_READONLY | IORESOURCE_CACHEABLE
-                             | IORESOURCE_SIZEALIGN;
+                       flags |= IORESOURCE_READONLY | IORESOURCE_SIZEALIGN;
                } else {
                        printk(KERN_ERR "PCI: bad cfg reg num 0x%x\n", i);
                        continue;
index 2ba12d7617234417c4bec81988ae64f688e34704..106c21bd7f449d947094db5fdefce8a9a6e1b142 100644 (file)
@@ -205,6 +205,7 @@ source "kernel/Kconfig.hz"
 
 config KEXEC
        bool "kexec system call"
+       select KEXEC_CORE
        ---help---
          kexec is a system call that implements the ability to shutdown your
          current kernel, and to start another kernel.  It is like a reboot
index 1eae359d83150662a0c5d0ea7b477bfe35f13c2b..96ac6cce4a32c03ead94166ac1190b91ac5b032d 100644 (file)
@@ -59,8 +59,6 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
 
 static inline void dma_mark_clean(void *addr, size_t size) {}
 
-#include <asm-generic/dma-mapping-common.h>
-
 static inline void set_dma_ops(struct device *dev, struct dma_map_ops *ops)
 {
        dev->archdata.dma_ops = ops;
@@ -74,18 +72,9 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
        return addr + size - 1 <= *dev->dma_mask;
 }
 
-static inline int
-dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-       debug_dma_mapping_error(dev, dma_addr);
-       return get_dma_ops(dev)->mapping_error(dev, dma_addr);
-}
+#define HAVE_ARCH_DMA_SET_MASK 1
 
-static inline int
-dma_supported(struct device *dev, u64 mask)
-{
-       return get_dma_ops(dev)->dma_supported(dev, mask);
-}
+#include <asm-generic/dma-mapping-common.h>
 
 static inline int
 dma_set_mask(struct device *dev, u64 mask)
@@ -116,36 +105,6 @@ dma_set_mask(struct device *dev, u64 mask)
        return 0;
 }
 
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                   dma_addr_t *dma_handle, gfp_t flag,
-                                   struct dma_attrs *attrs)
-{
-       struct dma_map_ops *dma_ops = get_dma_ops(dev);
-       void *cpu_addr;
-
-       cpu_addr = dma_ops->alloc(dev, size, dma_handle, flag, attrs);
-
-       debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
-
-       return cpu_addr;
-}
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                 void *cpu_addr, dma_addr_t dma_handle,
-                                 struct dma_attrs *attrs)
-{
-       struct dma_map_ops *dma_ops = get_dma_ops(dev);
-
-       debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
-
-       dma_ops->free(dev, size, cpu_addr, dma_handle, attrs);
-}
-
-#define dma_alloc_coherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL)
-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL)
-#define dma_free_coherent(d, s, v, h) dma_free_attrs(d, s, v, h, NULL)
-#define dma_free_noncoherent(d, s, v, h) dma_free_attrs(d, s, v, h, NULL)
-
 /*
  * dma_alloc_noncoherent() is #defined to return coherent memory,
  * so there's no need to do any flushing here.
index 5bd252e3fdc506a6aa393419252a94261768d400..d4e1fc41d06db21a475b1bdbd6508df5b7f5ab8a 100644 (file)
@@ -863,7 +863,7 @@ void __init mem_init(void)
  * memory to the highmem for now.
  */
 #ifndef CONFIG_NEED_MULTIPLE_NODES
-int arch_add_memory(u64 start, u64 size)
+int arch_add_memory(u64 start, u64 size, bool for_device)
 {
        struct pglist_data *pgdata = &contig_page_data;
        struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1;
index 176d5bda3559de3ffc23bdc667c0ee7be3f6d5bb..5c65dfee278c0319bcd6059a95249431f836d278 100644 (file)
@@ -119,8 +119,8 @@ unsigned long decompress_kernel(unsigned long output_start,
        output_ptr = get_unaligned_le32(tmp);
 
        arch_decomp_puts("Uncompressing Linux...");
-       decompress(input_data, input_data_end - input_data, NULL, NULL,
-                       output_data, NULL, error);
+       __decompress(input_data, input_data_end - input_data, NULL, NULL,
+                       output_data, 0, NULL, error);
        arch_decomp_puts(" done, booting the kernel.\n");
        return output_ptr;
 }
index 366460a817965d2ed7d1d4dceb987784067e87e6..8140e053ccd351332f33d3df8decccfdeb86074b 100644 (file)
@@ -18,8 +18,6 @@
 #include <linux/scatterlist.h>
 #include <linux/swiotlb.h>
 
-#include <asm-generic/dma-coherent.h>
-
 #include <asm/memory.h>
 #include <asm/cacheflush.h>
 
@@ -30,26 +28,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
        return &swiotlb_dma_map_ops;
 }
 
-static inline int dma_supported(struct device *dev, u64 mask)
-{
-       struct dma_map_ops *dma_ops = get_dma_ops(dev);
-
-       if (unlikely(dma_ops == NULL))
-               return 0;
-
-       return dma_ops->dma_supported(dev, mask);
-}
-
-static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-       struct dma_map_ops *dma_ops = get_dma_ops(dev);
-
-       if (dma_ops->mapping_error)
-               return dma_ops->mapping_error(dev, dma_addr);
-
-       return 0;
-}
-
 #include <asm-generic/dma-mapping-common.h>
 
 static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
@@ -72,41 +50,6 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
 
 static inline void dma_mark_clean(void *addr, size_t size) {}
 
-static inline int dma_set_mask(struct device *dev, u64 dma_mask)
-{
-       if (!dev->dma_mask || !dma_supported(dev, dma_mask))
-               return -EIO;
-
-       *dev->dma_mask = dma_mask;
-
-       return 0;
-}
-
-#define dma_alloc_coherent(d,s,h,f)    dma_alloc_attrs(d,s,h,f,NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                   dma_addr_t *dma_handle, gfp_t flag,
-                                   struct dma_attrs *attrs)
-{
-       struct dma_map_ops *dma_ops = get_dma_ops(dev);
-
-       return dma_ops->alloc(dev, size, dma_handle, flag, attrs);
-}
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                 void *cpu_addr, dma_addr_t dma_handle,
-                                 struct dma_attrs *attrs)
-{
-       struct dma_map_ops *dma_ops = get_dma_ops(dev);
-
-       dma_ops->free(dev, size, cpu_addr, dma_handle, attrs);
-}
-
-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
-#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
-
 static inline void dma_cache_sync(struct device *dev, void *vaddr,
                size_t size, enum dma_data_direction direction)
 {
index debafc40200a4faf985c737aefbae80db4806e1c..3bb0a29fd2d7b11739e9a22247f4d2edaa1d9403 100644 (file)
 #define __phys_to_virt(x)      ((x) - PHYS_OFFSET + PAGE_OFFSET)
 #endif
 
-/*
- * Convert a physical address to a Page Frame Number and back
- */
-#define        __phys_to_pfn(paddr)    ((paddr) >> PAGE_SHIFT)
-#define        __pfn_to_phys(pfn)      ((pfn) << PAGE_SHIFT)
-
 /*
  * Convert a page to/from a physical address
  */
index 117e2f373e50d40cee118dcab42a0e86a617b2ee..328c8352480c5dcfd34d72a70d01d6a57e5bb515 100644 (file)
@@ -27,7 +27,8 @@ config X86
        select ARCH_HAS_ELF_RANDOMIZE
        select ARCH_HAS_FAST_MULTIPLIER
        select ARCH_HAS_GCOV_PROFILE_ALL
-       select ARCH_HAS_PMEM_API
+       select ARCH_HAS_PMEM_API                if X86_64
+       select ARCH_HAS_MMIO_FLUSH
        select ARCH_HAS_SG_CHAIN
        select ARCH_HAVE_NMI_SAFE_CMPXCHG
        select ARCH_MIGHT_HAVE_ACPI_PDC         if ACPI
@@ -1005,7 +1006,7 @@ config X86_THERMAL_VECTOR
        depends on X86_MCE_INTEL
 
 config X86_LEGACY_VM86
-       bool "Legacy VM86 support (obsolete)"
+       bool "Legacy VM86 support"
        default n
        depends on X86_32
        ---help---
@@ -1017,19 +1018,20 @@ config X86_LEGACY_VM86
          available to accelerate real mode DOS programs.  However, any
          recent version of DOSEMU, X, or vbetool should be fully
          functional even without kernel VM86 support, as they will all
-         fall back to (pretty well performing) software emulation.
+         fall back to software emulation. Nevertheless, if you are using
+         a 16-bit DOS program where 16-bit performance matters, vm86
+         mode might be faster than emulation and you might want to
+         enable this option.
 
-         Anything that works on a 64-bit kernel is unlikely to need
-         this option, as 64-bit kernels don't, and can't, support V8086
-         mode.  This option is also unrelated to 16-bit protected mode
-         and is not needed to run most 16-bit programs under Wine.
+         Note that any app that works on a 64-bit kernel is unlikely to
+         need this option, as 64-bit kernels don't, and can't, support
+         V8086 mode. This option is also unrelated to 16-bit protected
+         mode and is not needed to run most 16-bit programs under Wine.
 
-         Enabling this option adds considerable attack surface to the
-         kernel and slows down system calls and exception handling.
+         Enabling this option increases the complexity of the kernel
+         and slows down exception handling a tiny bit.
 
-         Unless you use very old userspace or need the last drop of
-         performance in your real mode DOS games and can't use KVM,
-         say N here.
+         If unsure, say N here.
 
 config VM86
        bool
@@ -1450,10 +1452,14 @@ config ILLEGAL_POINTER_VALUE
 
 source "mm/Kconfig"
 
+config X86_PMEM_LEGACY_DEVICE
+       bool
+
 config X86_PMEM_LEGACY
-       bool "Support non-standard NVDIMMs and ADR protected memory"
+       tristate "Support non-standard NVDIMMs and ADR protected memory"
        depends on PHYS_ADDR_T_64BIT
        depends on BLK_DEV
+       select X86_PMEM_LEGACY_DEVICE
        select LIBNVDIMM
        help
          Treat memory marked using the non-standard e820 type of 12 as used
@@ -1749,6 +1755,7 @@ source kernel/Kconfig.hz
 
 config KEXEC
        bool "kexec system call"
+       select KEXEC_CORE
        ---help---
          kexec is a system call that implements the ability to shutdown your
          current kernel, and to start another kernel.  It is like a reboot
@@ -1765,8 +1772,8 @@ config KEXEC
 
 config KEXEC_FILE
        bool "kexec file based system call"
+       select KEXEC_CORE
        select BUILD_BIN2C
-       depends on KEXEC
        depends on X86_64
        depends on CRYPTO=y
        depends on CRYPTO_SHA256=y
index f63797942bb5951adc91bf5bd4d355ff5e48db6e..79dac1758e7c00d8c062be2e3c2b054bc4dfc475 100644 (file)
@@ -448,7 +448,8 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap,
 #endif
 
        debug_putstr("\nDecompressing Linux... ");
-       decompress(input_data, input_len, NULL, NULL, output, NULL, error);
+       __decompress(input_data, input_len, NULL, NULL, output, output_len,
+                       NULL, error);
        parse_elf(output);
        /*
         * 32-bit always performs relocations. 64-bit relocations are only
index 16ef02596db2daf1fa8eadd9c17fd994ec3c21b3..2d6b309c8e9a12ac67ddf9d9cb429cb1fe8a7eae 100644 (file)
@@ -414,7 +414,7 @@ xloadflags:
 # define XLF23 0
 #endif
 
-#if defined(CONFIG_X86_64) && defined(CONFIG_EFI) && defined(CONFIG_KEXEC)
+#if defined(CONFIG_X86_64) && defined(CONFIG_EFI) && defined(CONFIG_KEXEC_CORE)
 # define XLF4 XLF_EFI_KEXEC
 #else
 # define XLF4 0
index 64d7cf1b50e112ab370ac63e59e9ff89d4d677e0..440df0c7a2eef7828b31a3c97db4fa110340d339 100644 (file)
@@ -294,6 +294,7 @@ static struct ahash_alg ghash_async_alg = {
                        .cra_name               = "ghash",
                        .cra_driver_name        = "ghash-clmulni",
                        .cra_priority           = 400,
+                       .cra_ctxsize            = sizeof(struct ghash_async_ctx),
                        .cra_flags              = CRYPTO_ALG_TYPE_AHASH | CRYPTO_ALG_ASYNC,
                        .cra_blocksize          = GHASH_BLOCK_SIZE,
                        .cra_type               = &crypto_ahash_type,
index 477bfa6db370783294e858210a8064b295da3082..7663c455b9f650f67292e5a2c4664606fb1dbacb 100644 (file)
 372    i386    recvmsg                 sys_recvmsg                     compat_sys_recvmsg
 373    i386    shutdown                sys_shutdown
 374    i386    userfaultfd             sys_userfaultfd
+375    i386    membarrier              sys_membarrier
index 81c490634db994ba810984f8f6dab052a54c8139..278842fdf1f6393d58ea0ac09abe60ef09e8ce79 100644 (file)
 321    common  bpf                     sys_bpf
 322    64      execveat                stub_execveat
 323    common  userfaultfd             sys_userfaultfd
+324    common  membarrier              sys_membarrier
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
index 26a46f44e29819c17ba5f5aef2eb9f896b98f878..b160c0c6baed54c38cc0efbf15ff67075ec869c3 100644 (file)
@@ -277,7 +277,7 @@ static const char *gate_vma_name(struct vm_area_struct *vma)
 {
        return "[vsyscall]";
 }
-static struct vm_operations_struct gate_vma_ops = {
+static const struct vm_operations_struct gate_vma_ops = {
        .name = gate_vma_name,
 };
 static struct vm_area_struct gate_vma = {
index 9bf3ea14b9f0a2a814c2eb813939786eadb1de5b..e63aa38e85fb23375ecefa40efe8cd9d76da6c43 100644 (file)
@@ -89,6 +89,8 @@ int set_pages_rw(struct page *page, int numpages);
 
 void clflush_cache_range(void *addr, unsigned int size);
 
+#define mmio_flush_range(addr, size) clflush_cache_range(addr, size)
+
 #ifdef CONFIG_DEBUG_RODATA
 void mark_rodata_ro(void);
 extern const int rodata_test_data;
@@ -109,75 +111,4 @@ static inline int rodata_test(void)
 }
 #endif
 
-#ifdef ARCH_HAS_NOCACHE_UACCESS
-
-/**
- * arch_memcpy_to_pmem - copy data to persistent memory
- * @dst: destination buffer for the copy
- * @src: source buffer for the copy
- * @n: length of the copy in bytes
- *
- * Copy data to persistent memory media via non-temporal stores so that
- * a subsequent arch_wmb_pmem() can flush cpu and memory controller
- * write buffers to guarantee durability.
- */
-static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src,
-               size_t n)
-{
-       int unwritten;
-
-       /*
-        * We are copying between two kernel buffers, if
-        * __copy_from_user_inatomic_nocache() returns an error (page
-        * fault) we would have already reported a general protection fault
-        * before the WARN+BUG.
-        */
-       unwritten = __copy_from_user_inatomic_nocache((void __force *) dst,
-                       (void __user *) src, n);
-       if (WARN(unwritten, "%s: fault copying %p <- %p unwritten: %d\n",
-                               __func__, dst, src, unwritten))
-               BUG();
-}
-
-/**
- * arch_wmb_pmem - synchronize writes to persistent memory
- *
- * After a series of arch_memcpy_to_pmem() operations this drains data
- * from cpu write buffers and any platform (memory controller) buffers
- * to ensure that written data is durable on persistent memory media.
- */
-static inline void arch_wmb_pmem(void)
-{
-       /*
-        * wmb() to 'sfence' all previous writes such that they are
-        * architecturally visible to 'pcommit'.  Note, that we've
-        * already arranged for pmem writes to avoid the cache via
-        * arch_memcpy_to_pmem().
-        */
-       wmb();
-       pcommit_sfence();
-}
-
-static inline bool __arch_has_wmb_pmem(void)
-{
-#ifdef CONFIG_X86_64
-       /*
-        * We require that wmb() be an 'sfence', that is only guaranteed on
-        * 64-bit builds
-        */
-       return static_cpu_has(X86_FEATURE_PCOMMIT);
-#else
-       return false;
-#endif
-}
-#else /* ARCH_HAS_NOCACHE_UACCESS i.e. ARCH=um */
-extern void arch_memcpy_to_pmem(void __pmem *dst, const void *src, size_t n);
-extern void arch_wmb_pmem(void);
-
-static inline bool __arch_has_wmb_pmem(void)
-{
-       return false;
-}
-#endif
-
 #endif /* _ASM_X86_CACHEFLUSH_H */
index 477fc28050e447681e957a470d756205fef81eca..e6cf2ad350d15a8e6ca207a2618c77d820aacc22 100644 (file)
 #define X86_FEATURE_AVX512PF   ( 9*32+26) /* AVX-512 Prefetch */
 #define X86_FEATURE_AVX512ER   ( 9*32+27) /* AVX-512 Exponential and Reciprocal */
 #define X86_FEATURE_AVX512CD   ( 9*32+28) /* AVX-512 Conflict Detection */
+#define X86_FEATURE_SHA_NI     ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */
 
 /* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */
 #define X86_FEATURE_XSAVEOPT   (10*32+ 0) /* XSAVEOPT */
index 1f5b7287d1ad8df92f789003018fec3913b03e1c..953b7263f84466f463d416814be709d22971c701 100644 (file)
@@ -12,7 +12,6 @@
 #include <linux/dma-attrs.h>
 #include <asm/io.h>
 #include <asm/swiotlb.h>
-#include <asm-generic/dma-coherent.h>
 #include <linux/dma-contiguous.h>
 
 #ifdef CONFIG_ISA
@@ -41,24 +40,13 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 #endif
 }
 
-#include <asm-generic/dma-mapping-common.h>
-
-/* Make sure we keep the same behaviour */
-static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-       debug_dma_mapping_error(dev, dma_addr);
-       if (ops->mapping_error)
-               return ops->mapping_error(dev, dma_addr);
-
-       return (dma_addr == DMA_ERROR_CODE);
-}
-
-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
-#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
+bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp);
+#define arch_dma_alloc_attrs arch_dma_alloc_attrs
 
+#define HAVE_ARCH_DMA_SUPPORTED 1
 extern int dma_supported(struct device *hwdev, u64 mask);
-extern int dma_set_mask(struct device *dev, u64 mask);
+
+#include <asm-generic/dma-mapping-common.h>
 
 extern void *dma_generic_alloc_coherent(struct device *dev, size_t size,
                                        dma_addr_t *dma_addr, gfp_t flag,
@@ -125,16 +113,4 @@ static inline gfp_t dma_alloc_coherent_gfp_flags(struct device *dev, gfp_t gfp)
        return gfp;
 }
 
-#define dma_alloc_coherent(d,s,h,f)    dma_alloc_attrs(d,s,h,f,NULL)
-
-void *
-dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
-               gfp_t gfp, struct dma_attrs *attrs);
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
-
-void dma_free_attrs(struct device *dev, size_t size,
-                   void *vaddr, dma_addr_t bus,
-                   struct dma_attrs *attrs);
-
 #endif
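
The dma-mapping diffs above, and the similar ones for the other architectures earlier in this section, all delete per-architecture copies of dma_alloc_attrs(), dma_free_attrs(), dma_mapping_error(), dma_supported() and dma_set_mask() in favour of the shared asm-generic/dma-mapping-common.h; an architecture that still needs its own version defines HAVE_ARCH_DMA_SUPPORTED or HAVE_ARCH_DMA_SET_MASK before including the common header. A simplified sketch of that opt-out pattern follows; the prototypes are invented for the illustration and are not the kernel's actual declarations.

/* Simplified sketch (not the real headers) of the opt-out mechanism: an
 * architecture that wants its own implementation defines the HAVE_ARCH_*
 * macro before the common header is included, and the generic fallback is
 * then compiled out.  Prototypes here are invented for the illustration.
 */

/* "arch header": keeps its own dma_set_mask() */
#define HAVE_ARCH_DMA_SET_MASK 1
static inline int dma_set_mask(unsigned long long *dev_mask,
			       unsigned long long mask)
{
	*dev_mask = mask;	/* arch-specific validation would go here */
	return 0;
}

/* "common header": generic version only when no arch hook is declared */
#ifndef HAVE_ARCH_DMA_SET_MASK
static inline int dma_set_mask(unsigned long long *dev_mask,
			       unsigned long long mask)
{
	*dev_mask = mask;
	return 0;
}
#endif
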
index f45acad3c4b67849922f6d05b6c32e8744e0b258..24938852db3013e0290df0b2ff77c2aafeff73d3 100644 (file)
@@ -3,9 +3,9 @@
 
 #ifdef CONFIG_FUNCTION_TRACER
 #ifdef CC_USING_FENTRY
-# define MCOUNT_ADDR           ((long)(__fentry__))
+# define MCOUNT_ADDR           ((unsigned long)(__fentry__))
 #else
-# define MCOUNT_ADDR           ((long)(mcount))
+# define MCOUNT_ADDR           ((unsigned long)(mcount))
 #endif
 #define MCOUNT_INSN_SIZE       5 /* sizeof mcount call */
 
index 7cfc085b6879babbc47695352835852b4846e650..de25aad0785389c399dd12c843ca2d4d2ff0d112 100644 (file)
@@ -250,12 +250,6 @@ static inline void flush_write_buffers(void)
 #endif
 }
 
-static inline void __pmem *arch_memremap_pmem(resource_size_t offset,
-       unsigned long size)
-{
-       return (void __force __pmem *) ioremap_cache(offset, size);
-}
-
 #endif /* __KERNEL__ */
 
 extern void native_io_delay(void);
index 32ce71375b212cd0fc8fa5d847d09c1d7aa5bc6b..b130d59406fb12ab3a75d5a2a8631b202be50ab3 100644 (file)
@@ -29,7 +29,7 @@ extern void show_trace(struct task_struct *t, struct pt_regs *regs,
 extern void __show_regs(struct pt_regs *regs, int all);
 extern unsigned long oops_begin(void);
 extern void oops_end(unsigned long, struct pt_regs *, int signr);
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 extern int in_crash_kexec;
 #else
 /* no crash dump is ever in progress if no crash kernel can be kexec'd */
index ce029e4fa7c62bc2bf969ed8df9f2330af2abb28..31247b5bff7c8ff86d893851dc9073b72a647cc2 100644 (file)
@@ -97,7 +97,6 @@ struct pv_lazy_ops {
 struct pv_time_ops {
        unsigned long long (*sched_clock)(void);
        unsigned long long (*steal_clock)(int cpu);
-       unsigned long (*get_tsc_khz)(void);
 };
 
 struct pv_cpu_ops {
diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h
new file mode 100644 (file)
index 0000000..d8ce3ec
--- /dev/null
@@ -0,0 +1,153 @@
+/*
+ * Copyright(c) 2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#ifndef __ASM_X86_PMEM_H__
+#define __ASM_X86_PMEM_H__
+
+#include <linux/uaccess.h>
+#include <asm/cacheflush.h>
+#include <asm/cpufeature.h>
+#include <asm/special_insns.h>
+
+#ifdef CONFIG_ARCH_HAS_PMEM_API
+/**
+ * arch_memcpy_to_pmem - copy data to persistent memory
+ * @dst: destination buffer for the copy
+ * @src: source buffer for the copy
+ * @n: length of the copy in bytes
+ *
+ * Copy data to persistent memory media via non-temporal stores so that
+ * a subsequent arch_wmb_pmem() can flush cpu and memory controller
+ * write buffers to guarantee durability.
+ */
+static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src,
+               size_t n)
+{
+       int unwritten;
+
+       /*
+        * We are copying between two kernel buffers, if
+        * __copy_from_user_inatomic_nocache() returns an error (page
+        * fault) we would have already reported a general protection fault
+        * before the WARN+BUG.
+        */
+       unwritten = __copy_from_user_inatomic_nocache((void __force *) dst,
+                       (void __user *) src, n);
+       if (WARN(unwritten, "%s: fault copying %p <- %p unwritten: %d\n",
+                               __func__, dst, src, unwritten))
+               BUG();
+}
+
+/**
+ * arch_wmb_pmem - synchronize writes to persistent memory
+ *
+ * After a series of arch_memcpy_to_pmem() operations this drains data
+ * from cpu write buffers and any platform (memory controller) buffers
+ * to ensure that written data is durable on persistent memory media.
+ */
+static inline void arch_wmb_pmem(void)
+{
+       /*
+        * wmb() to 'sfence' all previous writes such that they are
+        * architecturally visible to 'pcommit'.  Note, that we've
+        * already arranged for pmem writes to avoid the cache via
+        * arch_memcpy_to_pmem().
+        */
+       wmb();
+       pcommit_sfence();
+}
+
+/**
+ * __arch_wb_cache_pmem - write back a cache range with CLWB
+ * @vaddr:     virtual start address
+ * @size:      number of bytes to write back
+ *
+ * Write back a cache range using the CLWB (cache line write back)
+ * instruction.  This function requires explicit ordering with an
+ * arch_wmb_pmem() call.  This API is internal to the x86 PMEM implementation.
+ */
+static inline void __arch_wb_cache_pmem(void *vaddr, size_t size)
+{
+       u16 x86_clflush_size = boot_cpu_data.x86_clflush_size;
+       unsigned long clflush_mask = x86_clflush_size - 1;
+       void *vend = vaddr + size;
+       void *p;
+
+       for (p = (void *)((unsigned long)vaddr & ~clflush_mask);
+            p < vend; p += x86_clflush_size)
+               clwb(p);
+}
+
+/*
+ * copy_from_iter_nocache() on x86 only uses non-temporal stores for iovec
+ * iterators, so for other types (bvec & kvec) we must do a cache write-back.
+ */
+static inline bool __iter_needs_pmem_wb(struct iov_iter *i)
+{
+       return iter_is_iovec(i) == false;
+}
+
+/**
+ * arch_copy_from_iter_pmem - copy data from an iterator to PMEM
+ * @addr:      PMEM destination address
+ * @bytes:     number of bytes to copy
+ * @i:         iterator with source data
+ *
+ * Copy data from the iterator 'i' to the PMEM buffer starting at 'addr'.
+ * This function requires explicit ordering with an arch_wmb_pmem() call.
+ */
+static inline size_t arch_copy_from_iter_pmem(void __pmem *addr, size_t bytes,
+               struct iov_iter *i)
+{
+       void *vaddr = (void __force *)addr;
+       size_t len;
+
+       /* TODO: skip the write-back by always using non-temporal stores */
+       len = copy_from_iter_nocache(vaddr, bytes, i);
+
+       if (__iter_needs_pmem_wb(i))
+               __arch_wb_cache_pmem(vaddr, bytes);
+
+       return len;
+}
+
+/**
+ * arch_clear_pmem - zero a PMEM memory range
+ * @addr:      virtual start address
+ * @size:      number of bytes to zero
+ *
+ * Write zeros into the memory range starting at 'addr' for 'size' bytes.
+ * This function requires explicit ordering with an arch_wmb_pmem() call.
+ */
+static inline void arch_clear_pmem(void __pmem *addr, size_t size)
+{
+       void *vaddr = (void __force *)addr;
+
+       /* TODO: implement the zeroing via non-temporal writes */
+       if (size == PAGE_SIZE && ((unsigned long)vaddr & ~PAGE_MASK) == 0)
+               clear_page(vaddr);
+       else
+               memset(vaddr, 0, size);
+
+       __arch_wb_cache_pmem(vaddr, size);
+}
+
+static inline bool __arch_has_wmb_pmem(void)
+{
+       /*
+        * We require that wmb() be an 'sfence', that is only guaranteed on
+        * 64-bit builds
+        */
+       return static_cpu_has(X86_FEATURE_PCOMMIT);
+}
+#endif /* CONFIG_ARCH_HAS_PMEM_API */
+#endif /* __ASM_X86_PMEM_H__ */
index 9d51fae1cba345e5cf01387a6c16207a6b5be872..eaba0807603009e6ed1612a7049bb35bafeaf31e 100644 (file)
@@ -39,18 +39,27 @@ static inline void queued_spin_unlock(struct qspinlock *lock)
 }
 #endif
 
-#define virt_queued_spin_lock virt_queued_spin_lock
-
-static inline bool virt_queued_spin_lock(struct qspinlock *lock)
+#ifdef CONFIG_PARAVIRT
+#define virt_spin_lock virt_spin_lock
+static inline bool virt_spin_lock(struct qspinlock *lock)
 {
        if (!static_cpu_has(X86_FEATURE_HYPERVISOR))
                return false;
 
-       while (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) != 0)
-               cpu_relax();
+       /*
+        * On hypervisors without PARAVIRT_SPINLOCKS support we fall
+        * back to a Test-and-Set spinlock, because fair locks have
+        * horrible lock 'holder' preemption issues.
+        */
+
+       do {
+               while (atomic_read(&lock->val) != 0)
+                       cpu_relax();
+       } while (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) != 0);
 
        return true;
 }
+#endif /* CONFIG_PARAVIRT */
 
 #include <asm-generic/qspinlock.h>
 
index 608a79d5a4669ebf3a9060f895d0f5f03ab3286c..e6911caf5bbf16ddc46430c44fcd4186258b713c 100644 (file)
@@ -20,4 +20,15 @@ static inline int xen_irqs_disabled(struct pt_regs *regs)
 /* No need for a barrier -- XCHG is a barrier on x86. */
 #define xchg_xen_ulong(ptr, val) xchg((ptr), (val))
 
+extern int xen_have_vector_callback;
+
+/*
+ * Events delivered via platform PCI interrupts are always
+ * routed to vcpu 0 and hence cannot be rebound.
+ */
+static inline bool xen_support_evtchn_rebind(void)
+{
+       return (!xen_hvm_domain() || xen_have_vector_callback);
+}
+
 #endif /* _ASM_X86_XEN_EVENTS_H */
index ca08a27b90b3dbcf64dc69515fdadd3734b7a05e..83aea8055119e2f26beb1c909536609d30e88943 100644 (file)
@@ -465,6 +465,12 @@ HYPERVISOR_tmem_op(
        return _hypercall1(int, tmem_op, op);
 }
 
+static inline int
+HYPERVISOR_xenpmu_op(unsigned int op, void *arg)
+{
+       return _hypercall2(int, xenpmu_op, op, arg);
+}
+
 static inline void
 MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set)
 {
index 3400dbaec3c31eb6e5097715521b6aaaa1cf8740..62ca03ef5c657c68ecb312a9a811ea2916f731c5 100644 (file)
@@ -3,12 +3,38 @@
  *
  * Guest OS interface to x86 Xen.
  *
- * Copyright (c) 2004, K A Fraser
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2004-2006, K A Fraser
  */
 
 #ifndef _ASM_X86_XEN_INTERFACE_H
 #define _ASM_X86_XEN_INTERFACE_H
 
+/*
+ * XEN_GUEST_HANDLE represents a guest pointer, when passed as a field
+ * in a struct in memory.
+ * XEN_GUEST_HANDLE_PARAM represents a guest pointer, when passed as a
+ * hypercall argument.
+ * XEN_GUEST_HANDLE_PARAM and XEN_GUEST_HANDLE are the same on X86 but
+ * they might not be on other architectures.
+ */
 #ifdef __XEN__
 #define __DEFINE_GUEST_HANDLE(name, type) \
     typedef struct { type *p; } __guest_handle_ ## name
@@ -88,13 +114,16 @@ DEFINE_GUEST_HANDLE(xen_ulong_t);
  * start of the GDT because some stupid OSes export hard-coded selector values
  * in their ABI. These hard-coded values are always near the start of the GDT,
  * so Xen places itself out of the way, at the far end of the GDT.
+ *
+ * NB The LDT is set using the MMUEXT_SET_LDT op of HYPERVISOR_mmuext_op
  */
 #define FIRST_RESERVED_GDT_PAGE  14
 #define FIRST_RESERVED_GDT_BYTE  (FIRST_RESERVED_GDT_PAGE * 4096)
 #define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8)
 
 /*
- * Send an array of these to HYPERVISOR_set_trap_table()
+ * Send an array of these to HYPERVISOR_set_trap_table().
+ * Terminate the array with a sentinel entry, with traps[].address==0.
  * The privilege level specifies which modes may enter a trap via a software
  * interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate
  * privilege levels as follows:
@@ -118,10 +147,41 @@ struct trap_info {
 DEFINE_GUEST_HANDLE_STRUCT(trap_info);
 
 struct arch_shared_info {
-    unsigned long max_pfn;                  /* max pfn that appears in table */
-    /* Frame containing list of mfns containing list of mfns containing p2m. */
-    unsigned long pfn_to_mfn_frame_list_list;
-    unsigned long nmi_reason;
+       /*
+        * Number of valid entries in the p2m table(s) anchored at
+        * pfn_to_mfn_frame_list_list and/or p2m_vaddr.
+        */
+       unsigned long max_pfn;
+       /*
+        * Frame containing list of mfns containing list of mfns containing p2m.
+        * A value of 0 indicates it has not yet been set up, ~0 indicates it
+        * has been set to invalid e.g. due to the p2m being too large for the
+        * 3-level p2m tree. In this case the linear mapped p2m list anchored
+        * at p2m_vaddr is to be used.
+        */
+       xen_pfn_t pfn_to_mfn_frame_list_list;
+       unsigned long nmi_reason;
+       /*
+        * Following three fields are valid if p2m_cr3 contains a value
+        * different from 0.
+        * p2m_cr3 is the root of the address space where p2m_vaddr is valid.
+        * p2m_cr3 is in the same format as a cr3 value in the vcpu register
+        * state and holds the folded machine frame number (via xen_pfn_to_cr3)
+        * of a L3 or L4 page table.
+        * p2m_vaddr holds the virtual address of the linear p2m list. All
+        * entries in the range [0...max_pfn[ are accessible via this pointer.
+        * p2m_generation will be incremented by the guest before and after each
+        * change of the mappings of the p2m list. p2m_generation starts at 0
+        * and a value with the least significant bit set indicates that a
+        * mapping update is in progress. This allows guest external software
+        * (e.g. in Dom0) to verify that read mappings are consistent and
+        * whether they have changed since the last check.
+        * Modifying a p2m element in the linear p2m list is allowed via an
+        * atomic write only.
+        */
+       unsigned long p2m_cr3;          /* cr3 value of the p2m address space */
+       unsigned long p2m_vaddr;        /* virtual address of the p2m list */
+       unsigned long p2m_generation;   /* generation count of p2m mapping */
 };
 #endif /* !__ASSEMBLY__ */
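p2m_generation behaves like a seqcount: an odd value means an update is in progress, and a change between two reads means the snapshot raced with one. A sketch of a consistent read by external software; the p2m_list mapping and the function name are hypothetical:

static unsigned long p2m_read_entry(volatile struct arch_shared_info *arch,
                                    const unsigned long *p2m_list,
                                    unsigned long pfn)
{
        unsigned long gen, mfn;

        do {
                gen = arch->p2m_generation;
                smp_rmb();
                mfn = p2m_list[pfn];
                smp_rmb();
        } while ((gen & 1) || gen != arch->p2m_generation);

        return mfn;
}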
 
@@ -137,13 +197,31 @@ struct arch_shared_info {
 /*
  * The following is all CPU context. Note that the fpu_ctxt block is filled
  * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used.
+ *
+ * Also note that when calling DOMCTL_setvcpucontext and VCPU_initialise
+ * for HVM and PVH guests, not all information in this structure is updated:
+ *
+ * - For HVM guests, the structures read include: fpu_ctxt (if
+ * VGCF_I387_VALID is set), flags, user_regs, debugreg[*]
+ *
+ * - PVH guests are the same as HVM guests, but additionally use ctrlreg[3] to
+ * set cr3. All other fields not used should be set to 0.
  */
 struct vcpu_guest_context {
     /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */
     struct { char x[512]; } fpu_ctxt;       /* User-level FPU registers     */
-#define VGCF_I387_VALID (1<<0)
-#define VGCF_HVM_GUEST  (1<<1)
-#define VGCF_IN_KERNEL  (1<<2)
+#define VGCF_I387_VALID                (1<<0)
+#define VGCF_IN_KERNEL                 (1<<2)
+#define _VGCF_i387_valid               0
+#define VGCF_i387_valid                (1<<_VGCF_i387_valid)
+#define _VGCF_in_kernel                2
+#define VGCF_in_kernel                 (1<<_VGCF_in_kernel)
+#define _VGCF_failsafe_disables_events 3
+#define VGCF_failsafe_disables_events  (1<<_VGCF_failsafe_disables_events)
+#define _VGCF_syscall_disables_events  4
+#define VGCF_syscall_disables_events   (1<<_VGCF_syscall_disables_events)
+#define _VGCF_online                   5
+#define VGCF_online                    (1<<_VGCF_online)
     unsigned long flags;                    /* VGCF_* flags                 */
     struct cpu_user_regs user_regs;         /* User-level CPU registers     */
     struct trap_info trap_ctxt[256];        /* Virtual IDT                  */
@@ -172,6 +250,129 @@ struct vcpu_guest_context {
 #endif
 };
 DEFINE_GUEST_HANDLE_STRUCT(vcpu_guest_context);
+
+/* AMD PMU registers and structures */
+struct xen_pmu_amd_ctxt {
+       /*
+        * Offsets to counter and control MSRs (relative to xen_pmu_arch.c.amd).
+        * For PV(H) guests these fields are RO.
+        */
+       uint32_t counters;
+       uint32_t ctrls;
+
+       /* Counter MSRs */
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+       uint64_t regs[];
+#elif defined(__GNUC__)
+       uint64_t regs[0];
+#endif
+};
+
+/* Intel PMU registers and structures */
+struct xen_pmu_cntr_pair {
+       uint64_t counter;
+       uint64_t control;
+};
+
+struct xen_pmu_intel_ctxt {
+       /*
+        * Offsets to fixed and architectural counter MSRs (relative to
+        * xen_pmu_arch.c.intel).
+        * For PV(H) guests these fields are RO.
+        */
+       uint32_t fixed_counters;
+       uint32_t arch_counters;
+
+       /* PMU registers */
+       uint64_t global_ctrl;
+       uint64_t global_ovf_ctrl;
+       uint64_t global_status;
+       uint64_t fixed_ctrl;
+       uint64_t ds_area;
+       uint64_t pebs_enable;
+       uint64_t debugctl;
+
+       /* Fixed and architectural counter MSRs */
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+       uint64_t regs[];
+#elif defined(__GNUC__)
+       uint64_t regs[0];
+#endif
+};
+
+/* Sampled domain's registers */
+struct xen_pmu_regs {
+       uint64_t ip;
+       uint64_t sp;
+       uint64_t flags;
+       uint16_t cs;
+       uint16_t ss;
+       uint8_t cpl;
+       uint8_t pad[3];
+};
+
+/* PMU flags */
+#define PMU_CACHED        (1<<0) /* PMU MSRs are cached in the context */
+#define PMU_SAMPLE_USER           (1<<1) /* Sample is from user or kernel mode */
+#define PMU_SAMPLE_REAL           (1<<2) /* Sample is from realmode */
+#define PMU_SAMPLE_PV     (1<<3) /* Sample from a PV guest */
+
+/*
+ * Architecture-specific information describing state of the processor at
+ * the time of PMU interrupt.
+ * Fields of this structure marked as RW for guest should only be written by
+ * the guest when PMU_CACHED bit in pmu_flags is set (which is done by the
+ * hypervisor during PMU interrupt). Hypervisor will read updated data in
+ * XENPMU_flush hypercall and clear PMU_CACHED bit.
+ */
+struct xen_pmu_arch {
+       union {
+               /*
+                * Processor's registers at the time of interrupt.
+                * WO for hypervisor, RO for guests.
+                */
+               struct xen_pmu_regs regs;
+               /*
+                * Padding for adding new registers to xen_pmu_regs in
+                * the future
+                */
+#define XENPMU_REGS_PAD_SZ  64
+               uint8_t pad[XENPMU_REGS_PAD_SZ];
+       } r;
+
+       /* WO for hypervisor, RO for guest */
+       uint64_t pmu_flags;
+
+       /*
+        * APIC LVTPC register.
+        * RW for both hypervisor and guest.
+        * Only APIC_LVT_MASKED bit is loaded by the hypervisor into hardware
+        * during XENPMU_flush or XENPMU_lvtpc_set.
+        */
+       union {
+               uint32_t lapic_lvtpc;
+               uint64_t pad;
+       } l;
+
+       /*
+        * Vendor-specific PMU registers.
+        * RW for both hypervisor and guest (see exceptions above).
+        * Guest's updates to this field are verified and then loaded by the
+        * hypervisor into hardware during XENPMU_flush
+        */
+       union {
+               struct xen_pmu_amd_ctxt amd;
+               struct xen_pmu_intel_ctxt intel;
+
+               /*
+                * Padding for contexts (fixed parts only, does not include
+                * MSR banks that are specified by offsets)
+                */
+#define XENPMU_CTXT_PAD_SZ  128
+               uint8_t pad[XENPMU_CTXT_PAD_SZ];
+       } c;
+};
+
 #endif /* !__ASSEMBLY__ */
 
 /*
index c44a5d53e464733509de6d3d6347cf9b89889c06..0679e11d2cf7ddf4f7a72bee6db3f4368aff37d1 100644 (file)
@@ -35,9 +35,7 @@ typedef struct xpaddr {
 #define FOREIGN_FRAME(m)       ((m) | FOREIGN_FRAME_BIT)
 #define IDENTITY_FRAME(m)      ((m) | IDENTITY_FRAME_BIT)
 
-/* Maximum amount of memory we can handle in a domain in pages */
-#define MAX_DOMAIN_PAGES                                               \
-    ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE))
+#define P2M_PER_PAGE           (PAGE_SIZE / sizeof(unsigned long))
 
 extern unsigned long *machine_to_phys_mapping;
 extern unsigned long  machine_to_phys_nr;
@@ -48,8 +46,8 @@ extern unsigned long  xen_max_p2m_pfn;
 extern unsigned long get_phys_to_machine(unsigned long pfn);
 extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
 extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
-extern unsigned long set_phys_range_identity(unsigned long pfn_s,
-                                            unsigned long pfn_e);
+extern unsigned long __init set_phys_range_identity(unsigned long pfn_s,
+                                                   unsigned long pfn_e);
 
 extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
                                   struct gnttab_map_grant_ref *kmap_ops,
@@ -103,6 +101,11 @@ static inline unsigned long pfn_to_mfn(unsigned long pfn)
 {
        unsigned long mfn;
 
+       /*
+        * Some x86 code is still using pfn_to_mfn instead of
+        * pfn_to_gfn. This will have to be removed when we figure
+        * out which callers remain.
+        */
        if (xen_feature(XENFEAT_auto_translated_physmap))
                return pfn;
 
@@ -149,6 +152,11 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
 {
        unsigned long pfn;
 
+       /*
+        * Some x86 code is still using mfn_to_pfn instead of
+        * gfn_to_pfn. This will have to be removed when we figure
+        * out which callers remain.
+        */
        if (xen_feature(XENFEAT_auto_translated_physmap))
                return mfn;
 
@@ -178,6 +186,27 @@ static inline xpaddr_t machine_to_phys(xmaddr_t machine)
        return XPADDR(PFN_PHYS(mfn_to_pfn(PFN_DOWN(machine.maddr))) | offset);
 }
 
+/* Pseudo-physical <-> Guest conversion */
+static inline unsigned long pfn_to_gfn(unsigned long pfn)
+{
+       if (xen_feature(XENFEAT_auto_translated_physmap))
+               return pfn;
+       else
+               return pfn_to_mfn(pfn);
+}
+
+static inline unsigned long gfn_to_pfn(unsigned long gfn)
+{
+       if (xen_feature(XENFEAT_auto_translated_physmap))
+               return gfn;
+       else
+               return mfn_to_pfn(gfn);
+}
+
+/* Pseudo-physical <-> Bus conversion */
+#define pfn_to_bfn(pfn)                pfn_to_gfn(pfn)
+#define bfn_to_pfn(bfn)                gfn_to_pfn(bfn)
+
 /*
  * We detect special mappings in one of two ways:
  *  1. If the MFN is an I/O page then Xen will set the m2p entry
@@ -198,7 +227,7 @@ static inline xpaddr_t machine_to_phys(xmaddr_t machine)
  *      require. In all the cases we care about, the FOREIGN_FRAME bit is
  *      masked (e.g., pfn_to_mfn()) so behaviour there is correct.
  */
-static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
+static inline unsigned long bfn_to_local_pfn(unsigned long mfn)
 {
        unsigned long pfn;
 
@@ -217,6 +246,10 @@ static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
 #define virt_to_mfn(v)         (pfn_to_mfn(virt_to_pfn(v)))
 #define mfn_to_virt(m)         (__va(mfn_to_pfn(m) << PAGE_SHIFT))
 
+/* VIRT <-> GUEST conversion */
+#define virt_to_gfn(v)         (pfn_to_gfn(virt_to_pfn(v)))
+#define gfn_to_virt(g)         (__va(gfn_to_pfn(g) << PAGE_SHIFT))
+
 static inline unsigned long pte_mfn(pte_t pte)
 {
        return (pte.pte & PTE_PFN_MASK) >> PAGE_SHIFT;
@@ -264,7 +297,7 @@ void make_lowmem_page_readwrite(void *vaddr);
 
 static inline bool xen_arch_need_swiotlb(struct device *dev,
                                         unsigned long pfn,
-                                        unsigned long mfn)
+                                        unsigned long bfn)
 {
        return false;
 }
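The gfn helpers hide the difference between the two guest types: for auto-translated (HVM) guests a gfn is simply the pfn, while for PV guests it is the mfn. A trivial sketch of handing a kernel buffer's frame number to the hypervisor; the caller name is illustrative only:

static xen_pfn_t buffer_to_frame(void *buf)
{
        /* Correct for both PV (gfn == mfn) and auto-translated (gfn == pfn). */
        return virt_to_gfn(buf);
}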
index 0f457e6eab18ee14b18c0b575bdaf155998677c4..9dafe59cf6e2fe56d6af1e7e0d02fbad90afc009 100644 (file)
@@ -37,7 +37,7 @@
 /*
  * This is a non-standardized way to represent ADR or NVDIMM regions that
  * persist over a reboot.  The kernel will ignore their special capabilities
- * unless the CONFIG_X86_PMEM_LEGACY=y option is set.
+ * unless the CONFIG_X86_PMEM_LEGACY option is set.
  *
  * ( Note that older platforms also used 6 for the same type of memory,
  *   but newer versions switched to 12 as 6 was assigned differently.  Some
index 3c362217634005f137434abe5420cae7cd8b7e5b..b1b78ffe01d060a38c93c3c7486393702edd4ffc 100644 (file)
@@ -71,8 +71,8 @@ obj-$(CONFIG_LIVEPATCH)               += livepatch.o
 obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
 obj-$(CONFIG_FTRACE_SYSCALLS)  += ftrace.o
 obj-$(CONFIG_X86_TSC)          += trace_clock.o
-obj-$(CONFIG_KEXEC)            += machine_kexec_$(BITS).o
-obj-$(CONFIG_KEXEC)            += relocate_kernel_$(BITS).o crash.o
+obj-$(CONFIG_KEXEC_CORE)       += machine_kexec_$(BITS).o
+obj-$(CONFIG_KEXEC_CORE)       += relocate_kernel_$(BITS).o crash.o
 obj-$(CONFIG_KEXEC_FILE)       += kexec-bzimage64.o
 obj-$(CONFIG_CRASH_DUMP)       += crash_dump_$(BITS).o
 obj-y                          += kprobes/
@@ -94,7 +94,7 @@ obj-$(CONFIG_KVM_GUEST)               += kvm.o kvmclock.o
 obj-$(CONFIG_PARAVIRT)         += paravirt.o paravirt_patch_$(BITS).o
 obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
 obj-$(CONFIG_PARAVIRT_CLOCK)   += pvclock.o
-obj-$(CONFIG_X86_PMEM_LEGACY)        += pmem.o
+obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o
 
 obj-$(CONFIG_PCSPKR_PLATFORM)  += pcspeaker.o
 
index c42827eb86cf0c52c36389d0c26dc937776b03bf..25f909362b7a89c42f32c8ebe74fb91239df3ca8 100644 (file)
@@ -338,10 +338,15 @@ done:
 
 static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr)
 {
+       unsigned long flags;
+
        if (instr[0] != 0x90)
                return;
 
+       local_irq_save(flags);
        add_nops(instr + (a->instrlen - a->padlen), a->padlen);
+       sync_core();
+       local_irq_restore(flags);
 
        DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ",
                   instr, a->instrlen - a->padlen, a->padlen);
index 3ca3e46aa405ff606c205e849e9470735405f4cf..24e94ce454e2363e6ad14ae2e4cff6c4ac492ab4 100644 (file)
@@ -336,6 +336,13 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
        apic_write(APIC_LVTT, lvtt_value);
 
        if (lvtt_value & APIC_LVT_TIMER_TSCDEADLINE) {
+               /*
+                * See Intel SDM: TSC-Deadline Mode chapter. In xAPIC mode,
+                * writing to the APIC LVTT and TSC_DEADLINE MSR isn't serialized.
+                * According to Intel, MFENCE can do the serialization here.
+                */
+               asm volatile("mfence" : : : "memory");
+
                printk_once(KERN_DEBUG "TSC deadline timer enabled\n");
                return;
        }
index 6873ab925d00abcf51026f0c33f4380730b52a90..045e424fb3680f67405849be4aaa5355f5b4f3ff 100644 (file)
@@ -28,146 +28,21 @@ u64 hw_nmi_get_sample_period(int watchdog_thresh)
 #endif
 
 #ifdef arch_trigger_all_cpu_backtrace
-/* For reliability, we're prepared to waste bits here. */
-static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
-static cpumask_t printtrace_mask;
-
-#define NMI_BUF_SIZE           4096
-
-struct nmi_seq_buf {
-       unsigned char           buffer[NMI_BUF_SIZE];
-       struct seq_buf          seq;
-};
-
-/* Safe printing in NMI context */
-static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);
-
-/* "in progress" flag of arch_trigger_all_cpu_backtrace */
-static unsigned long backtrace_flag;
-
-static void print_seq_line(struct nmi_seq_buf *s, int start, int end)
+static void nmi_raise_cpu_backtrace(cpumask_t *mask)
 {
-       const char *buf = s->buffer + start;
-
-       printk("%.*s", (end - start) + 1, buf);
+       apic->send_IPI_mask(mask, NMI_VECTOR);
 }
 
 void arch_trigger_all_cpu_backtrace(bool include_self)
 {
-       struct nmi_seq_buf *s;
-       int len;
-       int cpu;
-       int i;
-       int this_cpu = get_cpu();
-
-       if (test_and_set_bit(0, &backtrace_flag)) {
-               /*
-                * If there is already a trigger_all_cpu_backtrace() in progress
-                * (backtrace_flag == 1), don't output double cpu dump infos.
-                */
-               put_cpu();
-               return;
-       }
-
-       cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
-       if (!include_self)
-               cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask));
-
-       cpumask_copy(&printtrace_mask, to_cpumask(backtrace_mask));
-       /*
-        * Set up per_cpu seq_buf buffers that the NMIs running on the other
-        * CPUs will write to.
-        */
-       for_each_cpu(cpu, to_cpumask(backtrace_mask)) {
-               s = &per_cpu(nmi_print_seq, cpu);
-               seq_buf_init(&s->seq, s->buffer, NMI_BUF_SIZE);
-       }
-
-       if (!cpumask_empty(to_cpumask(backtrace_mask))) {
-               pr_info("sending NMI to %s CPUs:\n",
-                       (include_self ? "all" : "other"));
-               apic->send_IPI_mask(to_cpumask(backtrace_mask), NMI_VECTOR);
-       }
-
-       /* Wait for up to 10 seconds for all CPUs to do the backtrace */
-       for (i = 0; i < 10 * 1000; i++) {
-               if (cpumask_empty(to_cpumask(backtrace_mask)))
-                       break;
-               mdelay(1);
-               touch_softlockup_watchdog();
-       }
-
-       /*
-        * Now that all the NMIs have triggered, we can dump out their
-        * back traces safely to the console.
-        */
-       for_each_cpu(cpu, &printtrace_mask) {
-               int last_i = 0;
-
-               s = &per_cpu(nmi_print_seq, cpu);
-               len = seq_buf_used(&s->seq);
-               if (!len)
-                       continue;
-
-               /* Print line by line. */
-               for (i = 0; i < len; i++) {
-                       if (s->buffer[i] == '\n') {
-                               print_seq_line(s, last_i, i);
-                               last_i = i + 1;
-                       }
-               }
-               /* Check if there was a partial line. */
-               if (last_i < len) {
-                       print_seq_line(s, last_i, len - 1);
-                       pr_cont("\n");
-               }
-       }
-
-       clear_bit(0, &backtrace_flag);
-       smp_mb__after_atomic();
-       put_cpu();
-}
-
-/*
- * It is not safe to call printk() directly from NMI handlers.
- * It may be fine if the NMI detected a lock up and we have no choice
- * but to do so, but doing a NMI on all other CPUs to get a back trace
- * can be done with a sysrq-l. We don't want that to lock up, which
- * can happen if the NMI interrupts a printk in progress.
- *
- * Instead, we redirect the vprintk() to this nmi_vprintk() that writes
- * the content into a per cpu seq_buf buffer. Then when the NMIs are
- * all done, we can safely dump the contents of the seq_buf to a printk()
- * from a non NMI context.
- */
-static int nmi_vprintk(const char *fmt, va_list args)
-{
-       struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
-       unsigned int len = seq_buf_used(&s->seq);
-
-       seq_buf_vprintf(&s->seq, fmt, args);
-       return seq_buf_used(&s->seq) - len;
+       nmi_trigger_all_cpu_backtrace(include_self, nmi_raise_cpu_backtrace);
 }
 
 static int
 arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
 {
-       int cpu;
-
-       cpu = smp_processor_id();
-
-       if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
-               printk_func_t printk_func_save = this_cpu_read(printk_func);
-
-               /* Replace printk to write into the NMI seq */
-               this_cpu_write(printk_func, nmi_vprintk);
-               printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
-               show_regs(regs);
-               this_cpu_write(printk_func, printk_func_save);
-
-               cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
+       if (nmi_cpu_backtrace(regs))
                return NMI_HANDLED;
-       }
 
        return NMI_DONE;
 }
index 38a76f826530358c898ae781cebea72069e3277c..5c60bb16262203ab63720ee349384180e9e7d8fb 100644 (file)
@@ -2522,6 +2522,7 @@ void __init setup_ioapic_dest(void)
        int pin, ioapic, irq, irq_entry;
        const struct cpumask *mask;
        struct irq_data *idata;
+       struct irq_chip *chip;
 
        if (skip_ioapic_setup == 1)
                return;
@@ -2545,9 +2546,9 @@ void __init setup_ioapic_dest(void)
                else
                        mask = apic->target_cpus();
 
-               irq_set_affinity(irq, mask);
+               chip = irq_data_get_irq_chip(idata);
+               chip->irq_set_affinity(idata, mask, false);
        }
-
 }
 #endif
 
index 07ce52c22ec843b72177307942bb5785e31458ef..de22ea7ff82f93ea1a8d4a204f10a7c6e8fdef76 100644 (file)
@@ -1110,10 +1110,10 @@ void print_cpu_info(struct cpuinfo_x86 *c)
        else
                printk(KERN_CONT "%d86", c->x86);
 
-       printk(KERN_CONT " (fam: %02x, model: %02x", c->x86, c->x86_model);
+       printk(KERN_CONT " (family: 0x%x, model: 0x%x", c->x86, c->x86_model);
 
        if (c->x86_mask || c->cpuid_level >= 0)
-               printk(KERN_CONT ", stepping: %02x)\n", c->x86_mask);
+               printk(KERN_CONT ", stepping: 0x%x)\n", c->x86_mask);
        else
                printk(KERN_CONT ")\n");
 
index cd9b6d0b10bf408d04956e45c1a2d77bd3f99b07..3fefebfbdf4bb4f68e5a5bf1647550ad0cd71a75 100644 (file)
@@ -2316,9 +2316,12 @@ static struct event_constraint *
 intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
                            struct perf_event *event)
 {
-       struct event_constraint *c1 = cpuc->event_constraint[idx];
+       struct event_constraint *c1 = NULL;
        struct event_constraint *c2;
 
+       if (idx >= 0) /* fake does < 0 */
+               c1 = cpuc->event_constraint[idx];
+
        /*
         * first time only
         * - static constraint: no change across incremental scheduling calls
index 54690e885759dd7b89711d3568e8916495e7b34f..d1c0f254afbeefe61fcfaeeb7625664d7d352918 100644 (file)
@@ -222,6 +222,7 @@ static void __bts_event_start(struct perf_event *event)
        if (!buf || bts_buffer_is_full(buf, bts))
                return;
 
+       event->hw.itrace_started = 1;
        event->hw.state = 0;
 
        if (!buf->snapshot)
index 961e51e9c6f64b2411232f56a43b1f049ad77ea1..0f8a6bbaaa443c9cc1b936755373e2b28c59ab4d 100644 (file)
@@ -533,7 +533,9 @@ static int bzImage64_verify_sig(const char *kernel, unsigned long kernel_len)
        int ret;
 
        ret = verify_pefile_signature(kernel, kernel_len,
-                                     system_trusted_keyring, &trusted);
+                                     system_trusted_keyring,
+                                     VERIFYING_KEXEC_PE_SIGNATURE,
+                                     &trusted);
        if (ret < 0)
                return ret;
        if (!trusted)
index 49487b4880616a225427c99d8eb7c498da36bae3..2c7aafa7070274420a909f4f804e964d2ddc473d 100644 (file)
@@ -200,7 +200,7 @@ static void kvm_setup_secondary_clock(void)
  * kind of shutdown from our side, we unregister the clock by writting anything
  * that does not have the 'enable' bit set in the msr
  */
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 static void kvm_crash_shutdown(struct pt_regs *regs)
 {
        native_write_msr(msr_kvm_system_time, 0, 0);
@@ -259,7 +259,7 @@ void __init kvmclock_init(void)
        x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
        x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
        machine_ops.shutdown  = kvm_shutdown;
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
        machine_ops.crash_shutdown  = kvm_crash_shutdown;
 #endif
        kvm_get_preset_lpj();
index 2bcc0525f1c10e80b3db33df075b3189c4a68239..6acc9dd91f368a1fada3f2d6dd2a0755f1d7b46d 100644 (file)
@@ -58,7 +58,7 @@ static struct ldt_struct *alloc_ldt_struct(int size)
        if (alloc_size > PAGE_SIZE)
                new_ldt->entries = vzalloc(alloc_size);
        else
-               new_ldt->entries = kzalloc(PAGE_SIZE, GFP_KERNEL);
+               new_ldt->entries = (void *)get_zeroed_page(GFP_KERNEL);
 
        if (!new_ldt->entries) {
                kfree(new_ldt);
@@ -95,7 +95,7 @@ static void free_ldt_struct(struct ldt_struct *ldt)
        if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
                vfree(ldt->entries);
        else
-               kfree(ldt->entries);
+               free_page((unsigned long)ldt->entries);
        kfree(ldt);
 }
 
index 353972c1946cd35f378054439a05bed8200f92c9..84b8ef82a159bc7756914b40518d7fa00ed968ff 100644 (file)
@@ -58,17 +58,6 @@ EXPORT_SYMBOL(x86_dma_fallback_dev);
 /* Number of entries preallocated for DMA-API debugging */
 #define PREALLOC_DMA_DEBUG_ENTRIES       65536
 
-int dma_set_mask(struct device *dev, u64 mask)
-{
-       if (!dev->dma_mask || !dma_supported(dev, mask))
-               return -EIO;
-
-       *dev->dma_mask = mask;
-
-       return 0;
-}
-EXPORT_SYMBOL(dma_set_mask);
-
 void __init pci_iommu_alloc(void)
 {
        struct iommu_table_entry *p;
@@ -140,50 +129,19 @@ void dma_generic_free_coherent(struct device *dev, size_t size, void *vaddr,
                free_pages((unsigned long)vaddr, get_order(size));
 }
 
-void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
-                     gfp_t gfp, struct dma_attrs *attrs)
+bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp)
 {
-       struct dma_map_ops *ops = get_dma_ops(dev);
-       void *memory;
-
-       gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
-
-       if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
-               return memory;
-
-       if (!dev)
-               dev = &x86_dma_fallback_dev;
-
-       if (!is_device_dma_capable(dev))
-               return NULL;
-
-       if (!ops->alloc)
-               return NULL;
-
-       memory = ops->alloc(dev, size, dma_handle,
-                           dma_alloc_coherent_gfp_flags(dev, gfp), attrs);
-       debug_dma_alloc_coherent(dev, size, *dma_handle, memory);
-
-       return memory;
-}
-EXPORT_SYMBOL(dma_alloc_attrs);
-
-void dma_free_attrs(struct device *dev, size_t size,
-                   void *vaddr, dma_addr_t bus,
-                   struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       WARN_ON(irqs_disabled());       /* for portability */
+       *gfp = dma_alloc_coherent_gfp_flags(*dev, *gfp);
+       *gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
 
-       if (dma_release_from_coherent(dev, get_order(size), vaddr))
-               return;
+       if (!*dev)
+               *dev = &x86_dma_fallback_dev;
+       if (!is_device_dma_capable(*dev))
+               return false;
+       return true;
 
-       debug_dma_free_coherent(dev, size, vaddr, bus);
-       if (ops->free)
-               ops->free(dev, size, vaddr, bus, attrs);
 }
-EXPORT_SYMBOL(dma_free_attrs);
+EXPORT_SYMBOL(arch_dma_alloc_attrs);
 
 /*
  * See <Documentation/x86/x86_64/boot-options.txt> for the iommu kernel
index 64f90f53bb854fe1cb48b6fe854df27f3241267c..4f00b63d7ff33bea8f0016eaa8393d8cb6972fad 100644 (file)
@@ -3,80 +3,17 @@
  * Copyright (c) 2015, Intel Corporation.
  */
 #include <linux/platform_device.h>
-#include <linux/libnvdimm.h>
 #include <linux/module.h>
-#include <asm/e820.h>
-
-static void e820_pmem_release(struct device *dev)
-{
-       struct nvdimm_bus *nvdimm_bus = dev->platform_data;
-
-       if (nvdimm_bus)
-               nvdimm_bus_unregister(nvdimm_bus);
-}
-
-static struct platform_device e820_pmem = {
-       .name = "e820_pmem",
-       .id = -1,
-       .dev = {
-               .release = e820_pmem_release,
-       },
-};
-
-static const struct attribute_group *e820_pmem_attribute_groups[] = {
-       &nvdimm_bus_attribute_group,
-       NULL,
-};
-
-static const struct attribute_group *e820_pmem_region_attribute_groups[] = {
-       &nd_region_attribute_group,
-       &nd_device_attribute_group,
-       NULL,
-};
 
 static __init int register_e820_pmem(void)
 {
-       static struct nvdimm_bus_descriptor nd_desc;
-       struct device *dev = &e820_pmem.dev;
-       struct nvdimm_bus *nvdimm_bus;
-       int rc, i;
-
-       rc = platform_device_register(&e820_pmem);
-       if (rc)
-               return rc;
-
-       nd_desc.attr_groups = e820_pmem_attribute_groups;
-       nd_desc.provider_name = "e820";
-       nvdimm_bus = nvdimm_bus_register(dev, &nd_desc);
-       if (!nvdimm_bus)
-               goto err;
-       dev->platform_data = nvdimm_bus;
-
-       for (i = 0; i < e820.nr_map; i++) {
-               struct e820entry *ei = &e820.map[i];
-               struct resource res = {
-                       .flags  = IORESOURCE_MEM,
-                       .start  = ei->addr,
-                       .end    = ei->addr + ei->size - 1,
-               };
-               struct nd_region_desc ndr_desc;
-
-               if (ei->type != E820_PRAM)
-                       continue;
-
-               memset(&ndr_desc, 0, sizeof(ndr_desc));
-               ndr_desc.res = &res;
-               ndr_desc.attr_groups = e820_pmem_region_attribute_groups;
-               ndr_desc.numa_node = NUMA_NO_NODE;
-               if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc))
-                       goto err;
-       }
-
-       return 0;
-
- err:
-       dev_err(dev, "failed to register legacy persistent memory ranges\n");
-       platform_device_unregister(&e820_pmem);
-       return -ENXIO;
+       struct platform_device *pdev;
+
+       /*
+        * See drivers/nvdimm/e820.c for the implementation, this is
+        * simply here to trigger the module to load on demand.
+        */
+       pdev = platform_device_alloc("e820_pmem", -1);
+       return platform_device_add(pdev);
 }
 device_initcall(register_e820_pmem);
index 86db4bcd7ce52bcb74a5bf42efcd8e7152488cf1..02693dd9a0790b804a515294714d59ed68688ba8 100644 (file)
@@ -673,7 +673,7 @@ struct machine_ops machine_ops = {
        .emergency_restart = native_machine_emergency_restart,
        .restart = native_machine_restart,
        .halt = native_machine_halt,
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
        .crash_shutdown = native_machine_crash_shutdown,
 #endif
 };
@@ -703,7 +703,7 @@ void machine_halt(void)
        machine_ops.halt();
 }
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 void machine_crash_shutdown(struct pt_regs *regs)
 {
        machine_ops.crash_shutdown(regs);
index b143c2d04420c8ad0ccc0b0b3fb86ab6e3cb26e4..fdb7f2a2d3286013a7ea41d392e48596c90fc672 100644 (file)
@@ -317,15 +317,12 @@ static u64 __init get_ramdisk_size(void)
        return ramdisk_size;
 }
 
-#define MAX_MAP_CHUNK  (NR_FIX_BTMAPS << PAGE_SHIFT)
 static void __init relocate_initrd(void)
 {
        /* Assume only end is not page aligned */
        u64 ramdisk_image = get_ramdisk_image();
        u64 ramdisk_size  = get_ramdisk_size();
        u64 area_size     = PAGE_ALIGN(ramdisk_size);
-       unsigned long slop, clen, mapaddr;
-       char *p, *q;
 
        /* We need to move the initrd down into directly mapped mem */
        relocated_ramdisk = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
@@ -343,25 +340,8 @@ static void __init relocate_initrd(void)
        printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n",
               relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1);
 
-       q = (char *)initrd_start;
-
-       /* Copy the initrd */
-       while (ramdisk_size) {
-               slop = ramdisk_image & ~PAGE_MASK;
-               clen = ramdisk_size;
-               if (clen > MAX_MAP_CHUNK-slop)
-                       clen = MAX_MAP_CHUNK-slop;
-               mapaddr = ramdisk_image & PAGE_MASK;
-               p = early_memremap(mapaddr, clen+slop);
-               memcpy(q, p+slop, clen);
-               early_memunmap(p, clen+slop);
-               q += clen;
-               ramdisk_image += clen;
-               ramdisk_size  -= clen;
-       }
+       copy_from_early_mem((void *)initrd_start, ramdisk_image, ramdisk_size);
 
-       ramdisk_image = get_ramdisk_image();
-       ramdisk_size  = get_ramdisk_size();
        printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to"
                " [mem %#010llx-%#010llx]\n",
                ramdisk_image, ramdisk_image + ramdisk_size - 1,
@@ -498,7 +478,7 @@ static void __init memblock_x86_reserve_range_setup_data(void)
  * --------- Crashkernel reservation ------------------------------
  */
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 
 /*
  * Keep the crash kernel below this limit.  On 32 bits earlier kernels
index c8d52cb4cb6e8b9ee9d81cfc9c0fa3603284ce0e..c3f7602cd0386b2fb0a1a7437263f9c71f27d03c 100644 (file)
@@ -21,6 +21,7 @@
 #include <asm/hypervisor.h>
 #include <asm/nmi.h>
 #include <asm/x86_init.h>
+#include <asm/geode.h>
 
 unsigned int __read_mostly cpu_khz;    /* TSC clocks / usec, not used here */
 EXPORT_SYMBOL(cpu_khz);
@@ -1013,15 +1014,17 @@ EXPORT_SYMBOL_GPL(mark_tsc_unstable);
 
 static void __init check_system_tsc_reliable(void)
 {
-#ifdef CONFIG_MGEODE_LX
-       /* RTSC counts during suspend */
+#if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC)
+       if (is_geode_lx()) {
+               /* RTSC counts during suspend */
 #define RTSC_SUSP 0x100
-       unsigned long res_low, res_high;
+               unsigned long res_low, res_high;
 
-       rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high);
-       /* Geode_LX - the OLPC CPU has a very reliable TSC */
-       if (res_low & RTSC_SUSP)
-               tsc_clocksource_reliable = 1;
+               rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high);
+               /* Geode_LX - the OLPC CPU has a very reliable TSC */
+               if (res_low & RTSC_SUSP)
+                       tsc_clocksource_reliable = 1;
+       }
 #endif
        if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
                tsc_clocksource_reliable = 1;
index abd8b856bd2b50af307caa2eaad953dc80c4ed00..5246193519614dbd8d3a602544e8117376df05f7 100644 (file)
@@ -45,6 +45,7 @@
 #include <linux/audit.h>
 #include <linux/stddef.h>
 #include <linux/slab.h>
+#include <linux/security.h>
 
 #include <asm/uaccess.h>
 #include <asm/io.h>
@@ -232,6 +233,32 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
        struct pt_regs *regs = current_pt_regs();
        unsigned long err = 0;
 
+       err = security_mmap_addr(0);
+       if (err) {
+               /*
+                * vm86 cannot virtualize the address space, so vm86 users
+                * need to manage the low 1MB themselves using mmap.  Given
+                * that BIOS places important data in the first page, vm86
+                * is essentially useless if mmap_min_addr != 0.  DOSEMU,
+                * for example, won't even bother trying to use vm86 if it
+                * can't map a page at virtual address 0.
+                *
+                * To reduce the available kernel attack surface, simply
+                * disallow vm86(old) for users who cannot mmap at va 0.
+                *
+                * The implementation of security_mmap_addr will allow
+                * suitably privileged users to map va 0 even if
+                * vm.mmap_min_addr is set above 0, and we want this
+                * behavior for vm86 as well, as it ensures that legacy
+                * tools like vbetool will not fail just because of
+                * vm.mmap_min_addr.
+                */
+               pr_info_once("Denied a call to vm86(old) from %s[%d] (uid: %d).  Set the vm.mmap_min_addr sysctl to 0 and/or adjust LSM mmap_min_addr policy to enable vm86 if you are using a vm86-based DOS emulator.\n",
+                            current->comm, task_pid_nr(current),
+                            from_kuid_munged(&init_user_ns, current_uid()));
+               return -EPERM;
+       }
+
        if (!vm86) {
                if (!(vm86 = kzalloc(sizeof(*vm86), GFP_KERNEL)))
                        return -ENOMEM;
index 00bf300fd8468db0e5bcd2fd9e32fc4f80e48adb..74e4bf11f562e0354c227518421e2375ec16fafa 100644 (file)
@@ -364,7 +364,7 @@ INIT_PER_CPU(irq_stack_union);
 
 #endif /* CONFIG_X86_32 */
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 #include <asm/kexec.h>
 
 . = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
index e7a4fde5d631031908b6f336d9ecc40e7a3a413d..b372a7557c16c7d8391fffafdf0b1c74b49c4822 100644 (file)
@@ -650,6 +650,7 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
        u16 sel;
 
        la = seg_base(ctxt, addr.seg) + addr.ea;
+       *linear = la;
        *max_size = 0;
        switch (mode) {
        case X86EMUL_MODE_PROT64:
@@ -693,7 +694,6 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
        }
        if (insn_aligned(ctxt, size) && ((la & (size - 1)) != 0))
                return emulate_gp(ctxt, 0);
-       *linear = la;
        return X86EMUL_CONTINUE;
 bad:
        if (addr.seg == VCPU_SREG_SS)
index fb16a8ea3dee026d24a6e47332326226cf3a70e0..69088a1ba5090ffa763a56f5c105560f360de767 100644 (file)
@@ -3309,13 +3309,14 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
 
        walk_shadow_page_lockless_begin(vcpu);
 
-       for (shadow_walk_init(&iterator, vcpu, addr), root = iterator.level;
+       for (shadow_walk_init(&iterator, vcpu, addr),
+                leaf = root = iterator.level;
             shadow_walk_okay(&iterator);
             __shadow_walk_next(&iterator, spte)) {
-               leaf = iterator.level;
                spte = mmu_spte_get_lockless(iterator.sptep);
 
                sptes[leaf - 1] = spte;
+               leaf--;
 
                if (!is_shadow_present_pte(spte))
                        break;
@@ -3329,7 +3330,7 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
        if (reserved) {
                pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n",
                       __func__, addr);
-               while (root >= leaf) {
+               while (root > leaf) {
                        pr_err("------ spte 0x%llx level %d.\n",
                               sptes[root - 1], root);
                        root--;
index 4a4eec30cc08c6924e370a6ab4ac56d9c03c6e69..d01986832afc28ed225b2f414ccb2742e528169c 100644 (file)
@@ -1264,7 +1264,7 @@ static void vmcs_load(struct vmcs *vmcs)
                       vmcs, phys_addr);
 }
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 /*
  * This bitmap is used to indicate whether the vmclear
  * operation is enabled on all cpus. All disabled by
@@ -1302,7 +1302,7 @@ static void crash_vmclear_local_loaded_vmcss(void)
 #else
 static inline void crash_enable_local_vmclear(int cpu) { }
 static inline void crash_disable_local_vmclear(int cpu) { }
-#endif /* CONFIG_KEXEC */
+#endif /* CONFIG_KEXEC_CORE */
 
 static void __loaded_vmcs_clear(void *arg)
 {
@@ -3150,7 +3150,7 @@ static struct vmcs *alloc_vmcs_cpu(int cpu)
        struct page *pages;
        struct vmcs *vmcs;
 
-       pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order);
+       pages = __alloc_pages_node(node, GFP_KERNEL, vmcs_config.order);
        if (!pages)
                return NULL;
        vmcs = page_address(pages);
@@ -10411,7 +10411,7 @@ static int __init vmx_init(void)
        if (r)
                return r;
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
        rcu_assign_pointer(crash_vmclear_loaded_vmcss,
                           crash_vmclear_local_loaded_vmcss);
 #endif
@@ -10421,7 +10421,7 @@ static int __init vmx_init(void)
 
 static void __exit vmx_exit(void)
 {
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
        RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
        synchronize_rcu();
 #endif
index 1e7e76e14e8927ea3c909e4d83e879d63b90a815..a60bdbccff5189b5a98b9a7fcc6a3b9f7ff5eeec 100644 (file)
@@ -5943,6 +5943,7 @@ static void process_smi_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
        put_smstate(u32, buf, offset, process_smi_get_segment_flags(&seg));
 }
 
+#ifdef CONFIG_X86_64
 static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
 {
        struct kvm_segment seg;
@@ -5958,6 +5959,7 @@ static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
        put_smstate(u32, buf, offset + 4, seg.limit);
        put_smstate(u64, buf, offset + 8, seg.base);
 }
+#endif
 
 static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf)
 {
index 68aec42545c2a3e90d3731b217f2cc097cbd8045..7562f42914b4adb2df9c23a0e85493e8378d0543 100644 (file)
@@ -823,11 +823,11 @@ void __init mem_init(void)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size)
+int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
 {
        struct pglist_data *pgdata = NODE_DATA(nid);
        struct zone *zone = pgdata->node_zones +
-               zone_for_memory(nid, start, size, ZONE_HIGHMEM);
+               zone_for_memory(nid, start, size, ZONE_HIGHMEM, for_device);
        unsigned long start_pfn = start >> PAGE_SHIFT;
        unsigned long nr_pages = size >> PAGE_SHIFT;
 
index 3fba623e3ba558553d9740d3a994343a2960428f..30564e2752d361870e91a4e25a5afbb3d029b7d6 100644 (file)
@@ -687,11 +687,11 @@ static void  update_end_of_memory_vars(u64 start, u64 size)
  * Memory is added always to NORMAL zone. This means you will never get
  * additional DMA/DMA32 memory.
  */
-int arch_add_memory(int nid, u64 start, u64 size)
+int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
 {
        struct pglist_data *pgdat = NODE_DATA(nid);
        struct zone *zone = pgdat->node_zones +
-               zone_for_memory(nid, start, size, ZONE_NORMAL);
+               zone_for_memory(nid, start, size, ZONE_NORMAL, for_device);
        unsigned long start_pfn = start >> PAGE_SHIFT;
        unsigned long nr_pages = size >> PAGE_SHIFT;
        int ret;
index db1b0bc5017c9f01b456a97b5b84d6d40a2c03b2..134948b0926f521afe63f9e8c3679835f72cccee 100644 (file)
@@ -42,58 +42,21 @@ static inline unsigned long mpx_bt_size_bytes(struct mm_struct *mm)
  */
 static unsigned long mpx_mmap(unsigned long len)
 {
-       unsigned long ret;
-       unsigned long addr, pgoff;
        struct mm_struct *mm = current->mm;
-       vm_flags_t vm_flags;
-       struct vm_area_struct *vma;
+       unsigned long addr, populate;
 
        /* Only bounds table can be allocated here */
        if (len != mpx_bt_size_bytes(mm))
                return -EINVAL;
 
        down_write(&mm->mmap_sem);
-
-       /* Too many mappings? */
-       if (mm->map_count > sysctl_max_map_count) {
-               ret = -ENOMEM;
-               goto out;
-       }
-
-       /* Obtain the address to map to. we verify (or select) it and ensure
-        * that it represents a valid section of the address space.
-        */
-       addr = get_unmapped_area(NULL, 0, len, 0, MAP_ANONYMOUS | MAP_PRIVATE);
-       if (addr & ~PAGE_MASK) {
-               ret = addr;
-               goto out;
-       }
-
-       vm_flags = VM_READ | VM_WRITE | VM_MPX |
-                       mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
-
-       /* Set pgoff according to addr for anon_vma */
-       pgoff = addr >> PAGE_SHIFT;
-
-       ret = mmap_region(NULL, addr, len, vm_flags, pgoff);
-       if (IS_ERR_VALUE(ret))
-               goto out;
-
-       vma = find_vma(mm, ret);
-       if (!vma) {
-               ret = -ENOMEM;
-               goto out;
-       }
-
-       if (vm_flags & VM_LOCKED) {
-               up_write(&mm->mmap_sem);
-               mm_populate(ret, len);
-               return ret;
-       }
-
-out:
+       addr = do_mmap(NULL, 0, len, PROT_READ | PROT_WRITE,
+                       MAP_ANONYMOUS | MAP_PRIVATE, VM_MPX, 0, &populate);
        up_write(&mm->mmap_sem);
-       return ret;
+       if (populate)
+               mm_populate(addr, populate);
+
+       return addr;
 }
 
 enum reg_type {
index 4053bb58bf92e6c328936aed1c8439b528468e1d..c3b3f653ed0c6c9112297164c5cf535494f19461 100644 (file)
@@ -246,8 +246,10 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
                bi->start = max(bi->start, low);
                bi->end = min(bi->end, high);
 
-               /* and there's no empty block */
-               if (bi->start >= bi->end)
+               /* and there's no empty or non-exist block */
+               if (bi->start >= bi->end ||
+                   !memblock_overlaps_region(&memblock.memory,
+                       bi->start, bi->end - bi->start))
                        numa_remove_memblk_from(i--, mi);
        }
 
index 66338a60aa6ef961c6017731b6fd6d9d169736e1..c2aea63bee2085fdb08c9d4ea8f3accaf4ae2e5e 100644 (file)
@@ -192,10 +192,11 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 
        node_set(node, numa_nodes_parsed);
 
-       pr_info("SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]%s\n",
+       pr_info("SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]%s%s\n",
                node, pxm,
                (unsigned long long) start, (unsigned long long) end - 1,
-               hotpluggable ? " hotplug" : "");
+               hotpluggable ? " hotplug" : "",
+               ma->flags & ACPI_SRAT_MEM_NON_VOLATILE ? " non-volatile" : "");
 
        /* Mark hotplug range in memblock. */
        if (hotpluggable && memblock_mark_hotplug(start, ma->length))
index e4308fe6afe81e4d8be5a42a6cc682174761fe1f..1db84c0758b732b3465fcc896ef98862dabe0f16 100644 (file)
@@ -650,7 +650,7 @@ static void __init get_systab_virt_addr(efi_memory_desc_t *md)
 
 static void __init save_runtime_map(void)
 {
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
        efi_memory_desc_t *md;
        void *tmp, *p, *q = NULL;
        int count = 0;
@@ -748,7 +748,7 @@ static void * __init efi_map_regions(int *count, int *pg_shift)
 
 static void __init kexec_enter_virtual_mode(void)
 {
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
        efi_memory_desc_t *md;
        void *p;
 
index 020c101c255fec8386ba36c13c82ac8ddaf715b3..5c9f63fa6abf24ed575005d7ffb0c3118a728505 100644 (file)
@@ -492,7 +492,7 @@ static void uv_nmi_touch_watchdogs(void)
        touch_nmi_watchdog();
 }
 
-#if defined(CONFIG_KEXEC)
+#if defined(CONFIG_KEXEC_CORE)
 static atomic_t uv_nmi_kexec_failed;
 static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
 {
@@ -519,13 +519,13 @@ static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
        uv_nmi_sync_exit(0);
 }
 
-#else /* !CONFIG_KEXEC */
+#else /* !CONFIG_KEXEC_CORE */
 static inline void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
 {
        if (master)
                pr_err("UV: NMI kdump: KEXEC not supported in this kernel\n");
 }
-#endif /* !CONFIG_KEXEC */
+#endif /* !CONFIG_KEXEC_CORE */
 
 #ifdef CONFIG_KGDB
 #ifdef CONFIG_KGDB_KDB
index 484145368a241207d8aa80a5f758a7d0f3ef54cb..c7b15f3e2cf37096f1041169aa544f706394a886 100644 (file)
@@ -7,6 +7,7 @@ config XEN
        depends on PARAVIRT
        select PARAVIRT_CLOCK
        select XEN_HAVE_PVMMU
+       select XEN_HAVE_VPMU
        depends on X86_64 || (X86_32 && X86_PAE)
        depends on X86_LOCAL_APIC && X86_TSC
        help
@@ -23,14 +24,18 @@ config XEN_PVHVM
        def_bool y
        depends on XEN && PCI && X86_LOCAL_APIC
 
-config XEN_MAX_DOMAIN_MEMORY
-       int
-       default 500 if X86_64
-       default 64 if X86_32
-       depends on XEN
-       help
-         This only affects the sizing of some bss arrays, the unused
-         portions of which are freed.
+config XEN_512GB
+       bool "Limit Xen pv-domain memory to 512GB"
+       depends on XEN && X86_64
+       default y
+       help
+         Limit paravirtualized user domains to 512GB of RAM.
+
+         The Xen tools and crash dump analysis tools might not support
+         pv-domains with more than 512 GB of RAM. This option controls the
+         default setting of the kernel to limit itself to 512 GB.
+         It is always possible to change the default by specifying the
+         boot parameter "xen_512gb_limit".
 
 config XEN_SAVE_RESTORE
        bool
index 4b6e29ac0968c1a76451d3ff773652bc4afed138..e47e52787d32eb9e4b9c7b6cb50b7a11f57dc586 100644 (file)
@@ -13,7 +13,7 @@ CFLAGS_mmu.o                  := $(nostackp)
 obj-y          := enlighten.o setup.o multicalls.o mmu.o irq.o \
                        time.o xen-asm.o xen-asm_$(BITS).o \
                        grant-table.o suspend.o platform-pci-unplug.o \
-                       p2m.o apic.o
+                       p2m.o apic.o pmu.o
 
 obj-$(CONFIG_EVENT_TRACING) += trace.o
 
index 70e060ad879a129effcbc1043222162f5429b7e6..acda713ab5beaba2c986191012a25c875c3a267c 100644 (file)
@@ -7,6 +7,7 @@
 #include <xen/xen.h>
 #include <xen/interface/physdev.h>
 #include "xen-ops.h"
+#include "pmu.h"
 #include "smp.h"
 
 static unsigned int xen_io_apic_read(unsigned apic, unsigned reg)
@@ -72,6 +73,11 @@ static u32 xen_apic_read(u32 reg)
 
 static void xen_apic_write(u32 reg, u32 val)
 {
+       if (reg == APIC_LVTPC) {
+               (void)pmu_apic_update(reg);
+               return;
+       }
+
        /* Warn to see if there's any stray references */
        WARN(1,"register: %x, value: %x\n", reg, val);
 }
index d9cfa452da9d34e2ae7c849680e82755a51fa6ae..30d12afe52ed173b2a81720cd5c89c24e667de2a 100644 (file)
@@ -84,6 +84,7 @@
 #include "mmu.h"
 #include "smp.h"
 #include "multicalls.h"
+#include "pmu.h"
 
 EXPORT_SYMBOL_GPL(hypercall_page);
 
@@ -1010,8 +1011,7 @@ static void xen_write_cr0(unsigned long cr0)
 
 static void xen_write_cr4(unsigned long cr4)
 {
-       cr4 &= ~X86_CR4_PGE;
-       cr4 &= ~X86_CR4_PSE;
+       cr4 &= ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PCE);
 
        native_write_cr4(cr4);
 }
@@ -1030,6 +1030,9 @@ static u64 xen_read_msr_safe(unsigned int msr, int *err)
 {
        u64 val;
 
+       if (pmu_msr_read(msr, &val, err))
+               return val;
+
        val = native_read_msr_safe(msr, err);
        switch (msr) {
        case MSR_IA32_APICBASE:
@@ -1076,7 +1079,8 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
                   Xen console noise. */
 
        default:
-               ret = native_write_msr_safe(msr, low, high);
+               if (!pmu_msr_write(msr, low, high, &ret))
+                       ret = native_write_msr_safe(msr, low, high);
        }
 
        return ret;
@@ -1215,7 +1219,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
        .read_msr = xen_read_msr_safe,
        .write_msr = xen_write_msr_safe,
 
-       .read_pmc = native_read_pmc,
+       .read_pmc = xen_read_pmc,
 
        .iret = xen_iret,
 #ifdef CONFIG_X86_64
@@ -1264,6 +1268,10 @@ static const struct pv_apic_ops xen_apic_ops __initconst = {
 static void xen_reboot(int reason)
 {
        struct sched_shutdown r = { .reason = reason };
+       int cpu;
+
+       for_each_online_cpu(cpu)
+               xen_pmu_finish(cpu);
 
        if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r))
                BUG();
@@ -1607,7 +1615,9 @@ asmlinkage __visible void __init xen_start_kernel(void)
        early_boot_irqs_disabled = true;
 
        xen_raw_console_write("mapping kernel into physical memory\n");
-       xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, xen_start_info->nr_pages);
+       xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base,
+                                  xen_start_info->nr_pages);
+       xen_reserve_special_pages();
 
        /*
         * Modify the cache mode translation tables to match Xen's PAT
index dd151b2045b0e395d58a6b8f5b2f8bac7292dc05..9c479fe4045912d9152315bc1fa4af9ed334304b 100644 (file)
@@ -116,6 +116,7 @@ static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
 DEFINE_PER_CPU(unsigned long, xen_cr3);         /* cr3 stored as physaddr */
 DEFINE_PER_CPU(unsigned long, xen_current_cr3);         /* actual vcpu cr3 */
 
+static phys_addr_t xen_pt_base, xen_pt_size __initdata;
 
 /*
  * Just beyond the highest usermode address.  STACK_TOP_MAX has a
@@ -1093,6 +1094,16 @@ static void xen_exit_mmap(struct mm_struct *mm)
 
 static void xen_post_allocator_init(void);
 
+static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
+{
+       struct mmuext_op op;
+
+       op.cmd = cmd;
+       op.arg1.mfn = pfn_to_mfn(pfn);
+       if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
+               BUG();
+}
+
 #ifdef CONFIG_X86_64
 static void __init xen_cleanhighmap(unsigned long vaddr,
                                    unsigned long vaddr_end)
@@ -1114,6 +1125,83 @@ static void __init xen_cleanhighmap(unsigned long vaddr,
        xen_mc_flush();
 }
 
+/*
+ * Make a page range writeable and free it.
+ */
+static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size)
+{
+       void *vaddr = __va(paddr);
+       void *vaddr_end = vaddr + size;
+
+       for (; vaddr < vaddr_end; vaddr += PAGE_SIZE)
+               make_lowmem_page_readwrite(vaddr);
+
+       memblock_free(paddr, size);
+}
+
+static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin)
+{
+       unsigned long pa = __pa(pgtbl) & PHYSICAL_PAGE_MASK;
+
+       if (unpin)
+               pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(pa));
+       ClearPagePinned(virt_to_page(__va(pa)));
+       xen_free_ro_pages(pa, PAGE_SIZE);
+}
+
+/*
+ * Since it is well isolated we can (and since it is perhaps large we should)
+ * also free the page tables mapping the initial P->M table.
+ */
+static void __init xen_cleanmfnmap(unsigned long vaddr)
+{
+       unsigned long va = vaddr & PMD_MASK;
+       unsigned long pa;
+       pgd_t *pgd = pgd_offset_k(va);
+       pud_t *pud_page = pud_offset(pgd, 0);
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+       unsigned int i;
+       bool unpin;
+
+       unpin = (vaddr == 2 * PGDIR_SIZE);
+       set_pgd(pgd, __pgd(0));
+       do {
+               pud = pud_page + pud_index(va);
+               if (pud_none(*pud)) {
+                       va += PUD_SIZE;
+               } else if (pud_large(*pud)) {
+                       pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
+                       xen_free_ro_pages(pa, PUD_SIZE);
+                       va += PUD_SIZE;
+               } else {
+                       pmd = pmd_offset(pud, va);
+                       if (pmd_large(*pmd)) {
+                               pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
+                               xen_free_ro_pages(pa, PMD_SIZE);
+                       } else if (!pmd_none(*pmd)) {
+                               pte = pte_offset_kernel(pmd, va);
+                               set_pmd(pmd, __pmd(0));
+                               for (i = 0; i < PTRS_PER_PTE; ++i) {
+                                       if (pte_none(pte[i]))
+                                               break;
+                                       pa = pte_pfn(pte[i]) << PAGE_SHIFT;
+                                       xen_free_ro_pages(pa, PAGE_SIZE);
+                               }
+                               xen_cleanmfnmap_free_pgtbl(pte, unpin);
+                       }
+                       va += PMD_SIZE;
+                       if (pmd_index(va))
+                               continue;
+                       set_pud(pud, __pud(0));
+                       xen_cleanmfnmap_free_pgtbl(pmd, unpin);
+               }
+
+       } while (pud_index(va) || pmd_index(va));
+       xen_cleanmfnmap_free_pgtbl(pud_page, unpin);
+}
+
 static void __init xen_pagetable_p2m_free(void)
 {
        unsigned long size;
@@ -1128,18 +1216,31 @@ static void __init xen_pagetable_p2m_free(void)
        /* using __ka address and sticking INVALID_P2M_ENTRY! */
        memset((void *)xen_start_info->mfn_list, 0xff, size);
 
-       /* We should be in __ka space. */
-       BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map);
        addr = xen_start_info->mfn_list;
-       /* We roundup to the PMD, which means that if anybody at this stage is
-        * using the __ka address of xen_start_info or xen_start_info->shared_info
-        * they are in going to crash. Fortunatly we have already revectored
-        * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */
+       /*
+        * We could be in __ka space.
+        * We round up to the PMD, which means that if anybody at this stage is
+        * using the __ka address of xen_start_info or
+        * xen_start_info->shared_info they are going to crash. Fortunately
+        * we have already revectored in xen_setup_kernel_pagetable and in
+        * xen_setup_shared_info.
+        */
        size = roundup(size, PMD_SIZE);
-       xen_cleanhighmap(addr, addr + size);
 
-       size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
-       memblock_free(__pa(xen_start_info->mfn_list), size);
+       if (addr >= __START_KERNEL_map) {
+               xen_cleanhighmap(addr, addr + size);
+               size = PAGE_ALIGN(xen_start_info->nr_pages *
+                                 sizeof(unsigned long));
+               memblock_free(__pa(addr), size);
+       } else {
+               xen_cleanmfnmap(addr);
+       }
+}
+
+static void __init xen_pagetable_cleanhighmap(void)
+{
+       unsigned long size;
+       unsigned long addr;
 
        /* At this stage, cleanup_highmap has already cleaned __ka space
         * from _brk_limit way up to the max_pfn_mapped (which is the end of
@@ -1172,6 +1273,8 @@ static void __init xen_pagetable_p2m_setup(void)
 
 #ifdef CONFIG_X86_64
        xen_pagetable_p2m_free();
+
+       xen_pagetable_cleanhighmap();
 #endif
        /* And revector! Bye bye old array */
        xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
@@ -1461,6 +1564,24 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
 #else /* CONFIG_X86_64 */
 static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
 {
+       unsigned long pfn;
+
+       if (xen_feature(XENFEAT_writable_page_tables) ||
+           xen_feature(XENFEAT_auto_translated_physmap) ||
+           xen_start_info->mfn_list >= __START_KERNEL_map)
+               return pte;
+
+       /*
+        * Pages belonging to the initial p2m list mapped outside the default
+        * address range must be mapped read-only. This region contains the
+        * page tables for mapping the p2m list, too, and page tables MUST be
+        * mapped read-only.
+        */
+       pfn = pte_pfn(pte);
+       if (pfn >= xen_start_info->first_p2m_pfn &&
+           pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames)
+               pte = __pte_ma(pte_val_ma(pte) & ~_PAGE_RW);
+
        return pte;
 }
 #endif /* CONFIG_X86_64 */
@@ -1489,15 +1610,6 @@ static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
        native_set_pte(ptep, pte);
 }
 
-static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
-{
-       struct mmuext_op op;
-       op.cmd = cmd;
-       op.arg1.mfn = pfn_to_mfn(pfn);
-       if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
-               BUG();
-}
-
 /* Early in boot, while setting up the initial pagetable, assume
    everything is pinned. */
 static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
@@ -1815,7 +1927,10 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
         * mappings. Considering that on Xen after the kernel mappings we
         * have the mappings of some pages that don't exist in pfn space, we
         * set max_pfn_mapped to the last real pfn mapped. */
-       max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
+       if (xen_start_info->mfn_list < __START_KERNEL_map)
+               max_pfn_mapped = xen_start_info->first_p2m_pfn;
+       else
+               max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
 
        pt_base = PFN_DOWN(__pa(xen_start_info->pt_base));
        pt_end = pt_base + xen_start_info->nr_pt_frames;
@@ -1855,6 +1970,11 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
        /* Graft it onto L4[511][510] */
        copy_page(level2_kernel_pgt, l2);
 
+       /* Copy the initial P->M table mappings if necessary. */
+       i = pgd_index(xen_start_info->mfn_list);
+       if (i && i < pgd_index(__START_KERNEL_map))
+               init_level4_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i];
+
        if (!xen_feature(XENFEAT_auto_translated_physmap)) {
                /* Make pagetable pieces RO */
                set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
@@ -1894,10 +2014,192 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
                check_pt_base(&pt_base, &pt_end, addr[i]);
 
        /* Our (by three pages) smaller Xen pagetable that we are using */
-       memblock_reserve(PFN_PHYS(pt_base), (pt_end - pt_base) * PAGE_SIZE);
+       xen_pt_base = PFN_PHYS(pt_base);
+       xen_pt_size = (pt_end - pt_base) * PAGE_SIZE;
+       memblock_reserve(xen_pt_base, xen_pt_size);
+
        /* Revector the xen_start_info */
        xen_start_info = (struct start_info *)__va(__pa(xen_start_info));
 }
+
+/*
+ * Read a value from a physical address.
+ */
+static unsigned long __init xen_read_phys_ulong(phys_addr_t addr)
+{
+       unsigned long *vaddr;
+       unsigned long val;
+
+       vaddr = early_memremap_ro(addr, sizeof(val));
+       val = *vaddr;
+       early_memunmap(vaddr, sizeof(val));
+       return val;
+}
+
+/*
+ * Translate a virtual address to a physical one without relying on mapped
+ * page tables.
+ */
+static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
+{
+       phys_addr_t pa;
+       pgd_t pgd;
+       pud_t pud;
+       pmd_t pmd;
+       pte_t pte;
+
+       pa = read_cr3();
+       pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) *
+                                                      sizeof(pgd)));
+       if (!pgd_present(pgd))
+               return 0;
+
+       pa = pgd_val(pgd) & PTE_PFN_MASK;
+       pud = native_make_pud(xen_read_phys_ulong(pa + pud_index(vaddr) *
+                                                      sizeof(pud)));
+       if (!pud_present(pud))
+               return 0;
+       pa = pud_pfn(pud) << PAGE_SHIFT;
+       if (pud_large(pud))
+               return pa + (vaddr & ~PUD_MASK);
+
+       pmd = native_make_pmd(xen_read_phys_ulong(pa + pmd_index(vaddr) *
+                                                      sizeof(pmd)));
+       if (!pmd_present(pmd))
+               return 0;
+       pa = pmd_pfn(pmd) << PAGE_SHIFT;
+       if (pmd_large(pmd))
+               return pa + (vaddr & ~PMD_MASK);
+
+       pte = native_make_pte(xen_read_phys_ulong(pa + pte_index(vaddr) *
+                                                      sizeof(pte)));
+       if (!pte_present(pte))
+               return 0;
+       pa = pte_pfn(pte) << PAGE_SHIFT;
+
+       return pa | (vaddr & ~PAGE_MASK);
+}
+
+/*
+ * Find a new area for the hypervisor supplied p2m list and relocate the p2m to
+ * this area.
+ */
+void __init xen_relocate_p2m(void)
+{
+       phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys;
+       unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end;
+       int n_pte, n_pt, n_pmd, n_pud, idx_pte, idx_pt, idx_pmd, idx_pud;
+       pte_t *pt;
+       pmd_t *pmd;
+       pud_t *pud;
+       pgd_t *pgd;
+       unsigned long *new_p2m;
+
+       size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
+       n_pte = roundup(size, PAGE_SIZE) >> PAGE_SHIFT;
+       n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT;
+       n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT;
+       n_pud = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT;
+       n_frames = n_pte + n_pt + n_pmd + n_pud;
+
+       new_area = xen_find_free_area(PFN_PHYS(n_frames));
+       if (!new_area) {
+               xen_raw_console_write("Can't find new memory area for p2m needed due to E820 map conflict\n");
+               BUG();
+       }
+
+       /*
+        * Setup the page tables for addressing the new p2m list.
+        * We have asked the hypervisor to map the p2m list at the user address
+        * PUD_SIZE. It may have done so, or it may have used a kernel space
+        * address depending on the Xen version.
+        * To avoid any possible virtual address collision, just use
+        * 2 * PUD_SIZE for the new area.
+        */
+       pud_phys = new_area;
+       pmd_phys = pud_phys + PFN_PHYS(n_pud);
+       pt_phys = pmd_phys + PFN_PHYS(n_pmd);
+       p2m_pfn = PFN_DOWN(pt_phys) + n_pt;
+
+       pgd = __va(read_cr3());
+       new_p2m = (unsigned long *)(2 * PGDIR_SIZE);
+       for (idx_pud = 0; idx_pud < n_pud; idx_pud++) {
+               pud = early_memremap(pud_phys, PAGE_SIZE);
+               clear_page(pud);
+               for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD);
+                    idx_pmd++) {
+                       pmd = early_memremap(pmd_phys, PAGE_SIZE);
+                       clear_page(pmd);
+                       for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD);
+                            idx_pt++) {
+                               pt = early_memremap(pt_phys, PAGE_SIZE);
+                               clear_page(pt);
+                               for (idx_pte = 0;
+                                    idx_pte < min(n_pte, PTRS_PER_PTE);
+                                    idx_pte++) {
+                                       set_pte(pt + idx_pte,
+                                               pfn_pte(p2m_pfn, PAGE_KERNEL));
+                                       p2m_pfn++;
+                               }
+                               n_pte -= PTRS_PER_PTE;
+                               early_memunmap(pt, PAGE_SIZE);
+                               make_lowmem_page_readonly(__va(pt_phys));
+                               pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE,
+                                                 PFN_DOWN(pt_phys));
+                               set_pmd(pmd + idx_pt,
+                                       __pmd(_PAGE_TABLE | pt_phys));
+                               pt_phys += PAGE_SIZE;
+                       }
+                       n_pt -= PTRS_PER_PMD;
+                       early_memunmap(pmd, PAGE_SIZE);
+                       make_lowmem_page_readonly(__va(pmd_phys));
+                       pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE,
+                                         PFN_DOWN(pmd_phys));
+                       set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys));
+                       pmd_phys += PAGE_SIZE;
+               }
+               n_pmd -= PTRS_PER_PUD;
+               early_memunmap(pud, PAGE_SIZE);
+               make_lowmem_page_readonly(__va(pud_phys));
+               pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys));
+               set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys));
+               pud_phys += PAGE_SIZE;
+       }
+
+       /* Now copy the old p2m info to the new area. */
+       memcpy(new_p2m, xen_p2m_addr, size);
+       xen_p2m_addr = new_p2m;
+
+       /* Release the old p2m list and set new list info. */
+       p2m_pfn = PFN_DOWN(xen_early_virt_to_phys(xen_start_info->mfn_list));
+       BUG_ON(!p2m_pfn);
+       p2m_pfn_end = p2m_pfn + PFN_DOWN(size);
+
+       if (xen_start_info->mfn_list < __START_KERNEL_map) {
+               pfn = xen_start_info->first_p2m_pfn;
+               pfn_end = xen_start_info->first_p2m_pfn +
+                         xen_start_info->nr_p2m_frames;
+               set_pgd(pgd + 1, __pgd(0));
+       } else {
+               pfn = p2m_pfn;
+               pfn_end = p2m_pfn_end;
+       }
+
+       memblock_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn));
+       while (pfn < pfn_end) {
+               if (pfn == p2m_pfn) {
+                       pfn = p2m_pfn_end;
+                       continue;
+               }
+               make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+               pfn++;
+       }
+
+       xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
+       xen_start_info->first_p2m_pfn =  PFN_DOWN(new_area);
+       xen_start_info->nr_p2m_frames = n_frames;
+}
+
 #else  /* !CONFIG_X86_64 */
 static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
 static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
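
To make the frame-count arithmetic in xen_relocate_p2m() concrete, here is a worked example assuming 4 KiB pages, 8-byte p2m entries and a 16 GiB domain (the numbers are illustrative, not taken from the diff):

    /*
     * nr_pages = 16 GiB / 4 KiB                     = 4,194,304 pfns
     * size     = nr_pages * 8 bytes                 = 32 MiB of p2m data
     * n_pte    = 32 MiB / 4 KiB                     = 8192 leaf frames
     * n_pt     = roundup(32 MiB, 2 MiB) / 2 MiB     = 16 page-table frames
     * n_pmd    = roundup(32 MiB, 1 GiB) / 1 GiB     = 1 PMD frame
     * n_pud    = roundup(32 MiB, 512 GiB) / 512 GiB = 1 PUD frame
     * n_frames = 8192 + 16 + 1 + 1                  = 8210 frames (~32.07 MiB)
     *
     * xen_find_free_area() therefore has to find roughly 32 MiB of E820 RAM
     * that does not conflict with reserved regions.
     */
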
@@ -1938,18 +2240,41 @@ static void __init xen_write_cr3_init(unsigned long cr3)
        pv_mmu_ops.write_cr3 = &xen_write_cr3;
 }
 
+/*
+ * For 32 bit domains xen_start_info->pt_base is the pgd address which might be
+ * not the first page table in the page table pool.
+ * Iterate through the initial page tables to find the real page table base.
+ */
+static phys_addr_t xen_find_pt_base(pmd_t *pmd)
+{
+       phys_addr_t pt_base, paddr;
+       unsigned pmdidx;
+
+       pt_base = min(__pa(xen_start_info->pt_base), __pa(pmd));
+
+       for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++)
+               if (pmd_present(pmd[pmdidx]) && !pmd_large(pmd[pmdidx])) {
+                       paddr = m2p(pmd[pmdidx].pmd);
+                       pt_base = min(pt_base, paddr);
+               }
+
+       return pt_base;
+}
+
 void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
 {
        pmd_t *kernel_pmd;
 
+       kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
+
+       xen_pt_base = xen_find_pt_base(kernel_pmd);
+       xen_pt_size = xen_start_info->nr_pt_frames * PAGE_SIZE;
+
        initial_kernel_pmd =
                extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
 
-       max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
-                                 xen_start_info->nr_pt_frames * PAGE_SIZE +
-                                 512*1024);
+       max_pfn_mapped = PFN_DOWN(xen_pt_base + xen_pt_size + 512 * 1024);
 
-       kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
        copy_page(initial_kernel_pmd, kernel_pmd);
 
        xen_map_identity_early(initial_kernel_pmd, max_pfn);
@@ -1968,11 +2293,33 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
                          PFN_DOWN(__pa(initial_page_table)));
        xen_write_cr3(__pa(initial_page_table));
 
-       memblock_reserve(__pa(xen_start_info->pt_base),
-                        xen_start_info->nr_pt_frames * PAGE_SIZE);
+       memblock_reserve(xen_pt_base, xen_pt_size);
 }
 #endif /* CONFIG_X86_64 */
 
+void __init xen_reserve_special_pages(void)
+{
+       phys_addr_t paddr;
+
+       memblock_reserve(__pa(xen_start_info), PAGE_SIZE);
+       if (xen_start_info->store_mfn) {
+               paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->store_mfn));
+               memblock_reserve(paddr, PAGE_SIZE);
+       }
+       if (!xen_initial_domain()) {
+               paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->console.domU.mfn));
+               memblock_reserve(paddr, PAGE_SIZE);
+       }
+}
+
+void __init xen_pt_check_e820(void)
+{
+       if (xen_is_e820_reserved(xen_pt_base, xen_pt_size)) {
+               xen_raw_console_write("Xen hypervisor allocated page table memory conflicts with E820 map\n");
+               BUG();
+       }
+}
+
 static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
 
 static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
@@ -2465,9 +2812,9 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
        return 0;
 }
 
-static int do_remap_mfn(struct vm_area_struct *vma,
+static int do_remap_gfn(struct vm_area_struct *vma,
                        unsigned long addr,
-                       xen_pfn_t *mfn, int nr,
+                       xen_pfn_t *gfn, int nr,
                        int *err_ptr, pgprot_t prot,
                        unsigned domid,
                        struct page **pages)
@@ -2483,14 +2830,14 @@ static int do_remap_mfn(struct vm_area_struct *vma,
        if (xen_feature(XENFEAT_auto_translated_physmap)) {
 #ifdef CONFIG_XEN_PVH
                /* We need to update the local page tables and the xen HAP */
-               return xen_xlate_remap_gfn_array(vma, addr, mfn, nr, err_ptr,
+               return xen_xlate_remap_gfn_array(vma, addr, gfn, nr, err_ptr,
                                                 prot, domid, pages);
 #else
                return -EINVAL;
 #endif
         }
 
-       rmd.mfn = mfn;
+       rmd.mfn = gfn;
        rmd.prot = prot;
        /* We use the err_ptr to indicate whether we are doing a contiguous
         * mapping or a discontiguous mapping. */
@@ -2518,8 +2865,8 @@ static int do_remap_mfn(struct vm_area_struct *vma,
                                                    batch_left, &done, domid);
 
                        /*
-                        * @err_ptr may be the same buffer as @mfn, so
-                        * only clear it after each chunk of @mfn is
+                        * @err_ptr may be the same buffer as @gfn, so
+                        * only clear it after each chunk of @gfn is
                         * used.
                         */
                        if (err_ptr) {
@@ -2549,19 +2896,19 @@ out:
        return err < 0 ? err : mapped;
 }
 
-int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
+int xen_remap_domain_gfn_range(struct vm_area_struct *vma,
                               unsigned long addr,
-                              xen_pfn_t mfn, int nr,
+                              xen_pfn_t gfn, int nr,
                               pgprot_t prot, unsigned domid,
                               struct page **pages)
 {
-       return do_remap_mfn(vma, addr, &mfn, nr, NULL, prot, domid, pages);
+       return do_remap_gfn(vma, addr, &gfn, nr, NULL, prot, domid, pages);
 }
-EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
+EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_range);
 
-int xen_remap_domain_mfn_array(struct vm_area_struct *vma,
+int xen_remap_domain_gfn_array(struct vm_area_struct *vma,
                               unsigned long addr,
-                              xen_pfn_t *mfn, int nr,
+                              xen_pfn_t *gfn, int nr,
                               int *err_ptr, pgprot_t prot,
                               unsigned domid, struct page **pages)
 {
@@ -2570,13 +2917,13 @@ int xen_remap_domain_mfn_array(struct vm_area_struct *vma,
         * cause of "wrong memory was mapped in".
         */
        BUG_ON(err_ptr == NULL);
-       return do_remap_mfn(vma, addr, mfn, nr, err_ptr, prot, domid, pages);
+       return do_remap_gfn(vma, addr, gfn, nr, err_ptr, prot, domid, pages);
 }
-EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_array);
+EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_array);
 
 
 /* Returns: 0 success */
-int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
+int xen_unmap_domain_gfn_range(struct vm_area_struct *vma,
                               int numpgs, struct page **pages)
 {
        if (!pages || !xen_feature(XENFEAT_auto_translated_physmap))
@@ -2588,4 +2935,4 @@ int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
        return -EINVAL;
 #endif
 }
-EXPORT_SYMBOL_GPL(xen_unmap_domain_mfn_range);
+EXPORT_SYMBOL_GPL(xen_unmap_domain_gfn_range);
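
The renames above (mfn to gfn) do not change behaviour; they make the exported remap API honest about taking guest frame numbers. A hedged usage sketch based only on the signatures shown in this hunk (vma, domid and pages are caller-supplied placeholders, not taken from the diff):

    /* Illustrative only: map nr guest frames of a foreign domain into a
     * userspace VMA, privcmd-style, collecting per-frame errors. */
    xen_pfn_t gfns[16];
    int errs[16];
    int mapped;

    mapped = xen_remap_domain_gfn_array(vma, vma->vm_start, gfns,
                                        ARRAY_SIZE(gfns), errs,
                                        vma->vm_page_prot, domid, pages);
    if (mapped < 0)
            pr_err("remap failed: %d\n", mapped);
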
index 8b7f18e200aa4a453d8ae60d02b85b4141abc677..bfc08b13044b181c5948e5a2f22c205e900e0b47 100644 (file)
 #include <xen/balloon.h>
 #include <xen/grant_table.h>
 
-#include "p2m.h"
 #include "multicalls.h"
 #include "xen-ops.h"
 
+#define P2M_MID_PER_PAGE       (PAGE_SIZE / sizeof(unsigned long *))
+#define P2M_TOP_PER_PAGE       (PAGE_SIZE / sizeof(unsigned long **))
+
+#define MAX_P2M_PFN    (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
+
 #define PMDS_PER_MID_PAGE      (P2M_MID_PER_PAGE / PTRS_PER_PTE)
 
 unsigned long *xen_p2m_addr __read_mostly;
@@ -199,7 +203,8 @@ void __ref xen_build_mfn_list_list(void)
        unsigned int level, topidx, mididx;
        unsigned long *mid_mfn_p;
 
-       if (xen_feature(XENFEAT_auto_translated_physmap))
+       if (xen_feature(XENFEAT_auto_translated_physmap) ||
+           xen_start_info->flags & SIF_VIRT_P2M_4TOOLS)
                return;
 
        /* Pre-initialize p2m_top_mfn to be completely missing */
@@ -260,9 +265,16 @@ void xen_setup_mfn_list_list(void)
 
        BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
 
-       HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
-               virt_to_mfn(p2m_top_mfn);
+       if (xen_start_info->flags & SIF_VIRT_P2M_4TOOLS)
+               HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = ~0UL;
+       else
+               HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
+                       virt_to_mfn(p2m_top_mfn);
        HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
+       HYPERVISOR_shared_info->arch.p2m_generation = 0;
+       HYPERVISOR_shared_info->arch.p2m_vaddr = (unsigned long)xen_p2m_addr;
+       HYPERVISOR_shared_info->arch.p2m_cr3 =
+               xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
 }
 
 /* Set up p2m_top to point to the domain-builder provided p2m pages */
@@ -478,8 +490,12 @@ static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg)
 
                ptechk = lookup_address(vaddr, &level);
                if (ptechk == pte_pg) {
+                       HYPERVISOR_shared_info->arch.p2m_generation++;
+                       wmb(); /* Tools are synchronizing via p2m_generation. */
                        set_pmd(pmdp,
                                __pmd(__pa(pte_newpg[i]) | _KERNPG_TABLE));
+                       wmb(); /* Tools are synchronizing via p2m_generation. */
+                       HYPERVISOR_shared_info->arch.p2m_generation++;
                        pte_newpg[i] = NULL;
                }
 
@@ -505,7 +521,7 @@ static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg)
  */
 static bool alloc_p2m(unsigned long pfn)
 {
-       unsigned topidx, mididx;
+       unsigned topidx;
        unsigned long *top_mfn_p, *mid_mfn;
        pte_t *ptep, *pte_pg;
        unsigned int level;
@@ -513,9 +529,6 @@ static bool alloc_p2m(unsigned long pfn)
        unsigned long addr = (unsigned long)(xen_p2m_addr + pfn);
        unsigned long p2m_pfn;
 
-       topidx = p2m_top_index(pfn);
-       mididx = p2m_mid_index(pfn);
-
        ptep = lookup_address(addr, &level);
        BUG_ON(!ptep || level != PG_LEVEL_4K);
        pte_pg = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1));
@@ -527,7 +540,8 @@ static bool alloc_p2m(unsigned long pfn)
                        return false;
        }
 
-       if (p2m_top_mfn) {
+       if (p2m_top_mfn && pfn < MAX_P2M_PFN) {
+               topidx = p2m_top_index(pfn);
                top_mfn_p = &p2m_top_mfn[topidx];
                mid_mfn = ACCESS_ONCE(p2m_top_mfn_p[topidx]);
 
@@ -577,10 +591,14 @@ static bool alloc_p2m(unsigned long pfn)
                spin_lock_irqsave(&p2m_update_lock, flags);
 
                if (pte_pfn(*ptep) == p2m_pfn) {
+                       HYPERVISOR_shared_info->arch.p2m_generation++;
+                       wmb(); /* Tools are synchronizing via p2m_generation. */
                        set_pte(ptep,
                                pfn_pte(PFN_DOWN(__pa(p2m)), PAGE_KERNEL));
+                       wmb(); /* Tools are synchronizing via p2m_generation. */
+                       HYPERVISOR_shared_info->arch.p2m_generation++;
                        if (mid_mfn)
-                               mid_mfn[mididx] = virt_to_mfn(p2m);
+                               mid_mfn[p2m_mid_index(pfn)] = virt_to_mfn(p2m);
                        p2m = NULL;
                }
 
@@ -630,6 +648,11 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
                return true;
        }
 
+       /*
+        * The interface requires atomic updates on p2m elements.
+        * xen_safe_write_ulong() is using __put_user which does an atomic
+        * store via asm().
+        */
        if (likely(!xen_safe_write_ulong(xen_p2m_addr + pfn, mfn)))
                return true;
 
diff --git a/arch/x86/xen/p2m.h b/arch/x86/xen/p2m.h
deleted file mode 100644 (file)
index ad8aee2..0000000
+++ /dev/null
@@ -1,15 +0,0 @@
-#ifndef _XEN_P2M_H
-#define _XEN_P2M_H
-
-#define P2M_PER_PAGE        (PAGE_SIZE / sizeof(unsigned long))
-#define P2M_MID_PER_PAGE    (PAGE_SIZE / sizeof(unsigned long *))
-#define P2M_TOP_PER_PAGE    (PAGE_SIZE / sizeof(unsigned long **))
-
-#define MAX_P2M_PFN         (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
-
-#define MAX_REMAP_RANGES    10
-
-extern unsigned long __init set_phys_range_identity(unsigned long pfn_s,
-                                      unsigned long pfn_e);
-
-#endif  /* _XEN_P2M_H */
index a8261716d58d6c5de73f9c633f03ffcc3cd4fc23..9586ff32810cfb7e24ee721c69aef44cafd94377 100644 (file)
@@ -68,7 +68,7 @@ static int check_platform_magic(void)
        return 0;
 }
 
-bool xen_has_pv_devices()
+bool xen_has_pv_devices(void)
 {
        if (!xen_domain())
                return false;
diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c
new file mode 100644 (file)
index 0000000..724a087
--- /dev/null
@@ -0,0 +1,570 @@
+#include <linux/types.h>
+#include <linux/interrupt.h>
+
+#include <asm/xen/hypercall.h>
+#include <xen/page.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/vcpu.h>
+#include <xen/interface/xenpmu.h>
+
+#include "xen-ops.h"
+#include "pmu.h"
+
+/* x86_pmu.handle_irq definition */
+#include "../kernel/cpu/perf_event.h"
+
+#define XENPMU_IRQ_PROCESSING    1
+struct xenpmu {
+       /* Shared page between hypervisor and domain */
+       struct xen_pmu_data *xenpmu_data;
+
+       uint8_t flags;
+};
+static DEFINE_PER_CPU(struct xenpmu, xenpmu_shared);
+#define get_xenpmu_data()    (this_cpu_ptr(&xenpmu_shared)->xenpmu_data)
+#define get_xenpmu_flags()   (this_cpu_ptr(&xenpmu_shared)->flags)
+
+/* Macro for computing address of a PMU MSR bank */
+#define field_offset(ctxt, field) ((void *)((uintptr_t)ctxt + \
+                                           (uintptr_t)ctxt->field))
+
+/* AMD PMU */
+#define F15H_NUM_COUNTERS   6
+#define F10H_NUM_COUNTERS   4
+
+static __read_mostly uint32_t amd_counters_base;
+static __read_mostly uint32_t amd_ctrls_base;
+static __read_mostly int amd_msr_step;
+static __read_mostly int k7_counters_mirrored;
+static __read_mostly int amd_num_counters;
+
+/* Intel PMU */
+#define MSR_TYPE_COUNTER            0
+#define MSR_TYPE_CTRL               1
+#define MSR_TYPE_GLOBAL             2
+#define MSR_TYPE_ARCH_COUNTER       3
+#define MSR_TYPE_ARCH_CTRL          4
+
+/* Number of general pmu registers (CPUID.EAX[0xa].EAX[8..15]) */
+#define PMU_GENERAL_NR_SHIFT        8
+#define PMU_GENERAL_NR_BITS         8
+#define PMU_GENERAL_NR_MASK         (((1 << PMU_GENERAL_NR_BITS) - 1) \
+                                    << PMU_GENERAL_NR_SHIFT)
+
+/* Number of fixed pmu registers (CPUID.EDX[0xa].EDX[0..4]) */
+#define PMU_FIXED_NR_SHIFT          0
+#define PMU_FIXED_NR_BITS           5
+#define PMU_FIXED_NR_MASK           (((1 << PMU_FIXED_NR_BITS) - 1) \
+                                    << PMU_FIXED_NR_SHIFT)
+
+/* Alias registers (0x4c1) for full-width writes to PMCs */
+#define MSR_PMC_ALIAS_MASK          (~(MSR_IA32_PERFCTR0 ^ MSR_IA32_PMC0))
+
+#define INTEL_PMC_TYPE_SHIFT        30
+
+static __read_mostly int intel_num_arch_counters, intel_num_fixed_counters;
+
+
+static void xen_pmu_arch_init(void)
+{
+       if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+
+               switch (boot_cpu_data.x86) {
+               case 0x15:
+                       amd_num_counters = F15H_NUM_COUNTERS;
+                       amd_counters_base = MSR_F15H_PERF_CTR;
+                       amd_ctrls_base = MSR_F15H_PERF_CTL;
+                       amd_msr_step = 2;
+                       k7_counters_mirrored = 1;
+                       break;
+               case 0x10:
+               case 0x12:
+               case 0x14:
+               case 0x16:
+               default:
+                       amd_num_counters = F10H_NUM_COUNTERS;
+                       amd_counters_base = MSR_K7_PERFCTR0;
+                       amd_ctrls_base = MSR_K7_EVNTSEL0;
+                       amd_msr_step = 1;
+                       k7_counters_mirrored = 0;
+                       break;
+               }
+       } else {
+               uint32_t eax, ebx, ecx, edx;
+
+               cpuid(0xa, &eax, &ebx, &ecx, &edx);
+
+               intel_num_arch_counters = (eax & PMU_GENERAL_NR_MASK) >>
+                       PMU_GENERAL_NR_SHIFT;
+               intel_num_fixed_counters = (edx & PMU_FIXED_NR_MASK) >>
+                       PMU_FIXED_NR_SHIFT;
+       }
+}
+
+static inline uint32_t get_fam15h_addr(u32 addr)
+{
+       switch (addr) {
+       case MSR_K7_PERFCTR0:
+       case MSR_K7_PERFCTR1:
+       case MSR_K7_PERFCTR2:
+       case MSR_K7_PERFCTR3:
+               return MSR_F15H_PERF_CTR + (addr - MSR_K7_PERFCTR0);
+       case MSR_K7_EVNTSEL0:
+       case MSR_K7_EVNTSEL1:
+       case MSR_K7_EVNTSEL2:
+       case MSR_K7_EVNTSEL3:
+               return MSR_F15H_PERF_CTL + (addr - MSR_K7_EVNTSEL0);
+       default:
+               break;
+       }
+
+       return addr;
+}
+
+static inline bool is_amd_pmu_msr(unsigned int msr)
+{
+       if ((msr >= MSR_F15H_PERF_CTL &&
+            msr < MSR_F15H_PERF_CTR + (amd_num_counters * 2)) ||
+           (msr >= MSR_K7_EVNTSEL0 &&
+            msr < MSR_K7_PERFCTR0 + amd_num_counters))
+               return true;
+
+       return false;
+}
+
+static int is_intel_pmu_msr(u32 msr_index, int *type, int *index)
+{
+       u32 msr_index_pmc;
+
+       switch (msr_index) {
+       case MSR_CORE_PERF_FIXED_CTR_CTRL:
+       case MSR_IA32_DS_AREA:
+       case MSR_IA32_PEBS_ENABLE:
+               *type = MSR_TYPE_CTRL;
+               return true;
+
+       case MSR_CORE_PERF_GLOBAL_CTRL:
+       case MSR_CORE_PERF_GLOBAL_STATUS:
+       case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+               *type = MSR_TYPE_GLOBAL;
+               return true;
+
+       default:
+
+               if ((msr_index >= MSR_CORE_PERF_FIXED_CTR0) &&
+                   (msr_index < MSR_CORE_PERF_FIXED_CTR0 +
+                                intel_num_fixed_counters)) {
+                       *index = msr_index - MSR_CORE_PERF_FIXED_CTR0;
+                       *type = MSR_TYPE_COUNTER;
+                       return true;
+               }
+
+               if ((msr_index >= MSR_P6_EVNTSEL0) &&
+                   (msr_index < MSR_P6_EVNTSEL0 +  intel_num_arch_counters)) {
+                       *index = msr_index - MSR_P6_EVNTSEL0;
+                       *type = MSR_TYPE_ARCH_CTRL;
+                       return true;
+               }
+
+               msr_index_pmc = msr_index & MSR_PMC_ALIAS_MASK;
+               if ((msr_index_pmc >= MSR_IA32_PERFCTR0) &&
+                   (msr_index_pmc < MSR_IA32_PERFCTR0 +
+                                    intel_num_arch_counters)) {
+                       *type = MSR_TYPE_ARCH_COUNTER;
+                       *index = msr_index_pmc - MSR_IA32_PERFCTR0;
+                       return true;
+               }
+               return false;
+       }
+}
+
+static bool xen_intel_pmu_emulate(unsigned int msr, u64 *val, int type,
+                                 int index, bool is_read)
+{
+       uint64_t *reg = NULL;
+       struct xen_pmu_intel_ctxt *ctxt;
+       uint64_t *fix_counters;
+       struct xen_pmu_cntr_pair *arch_cntr_pair;
+       struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+       uint8_t xenpmu_flags = get_xenpmu_flags();
+
+
+       if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING))
+               return false;
+
+       ctxt = &xenpmu_data->pmu.c.intel;
+
+       switch (msr) {
+       case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+               reg = &ctxt->global_ovf_ctrl;
+               break;
+       case MSR_CORE_PERF_GLOBAL_STATUS:
+               reg = &ctxt->global_status;
+               break;
+       case MSR_CORE_PERF_GLOBAL_CTRL:
+               reg = &ctxt->global_ctrl;
+               break;
+       case MSR_CORE_PERF_FIXED_CTR_CTRL:
+               reg = &ctxt->fixed_ctrl;
+               break;
+       default:
+               switch (type) {
+               case MSR_TYPE_COUNTER:
+                       fix_counters = field_offset(ctxt, fixed_counters);
+                       reg = &fix_counters[index];
+                       break;
+               case MSR_TYPE_ARCH_COUNTER:
+                       arch_cntr_pair = field_offset(ctxt, arch_counters);
+                       reg = &arch_cntr_pair[index].counter;
+                       break;
+               case MSR_TYPE_ARCH_CTRL:
+                       arch_cntr_pair = field_offset(ctxt, arch_counters);
+                       reg = &arch_cntr_pair[index].control;
+                       break;
+               default:
+                       return false;
+               }
+       }
+
+       if (reg) {
+               if (is_read)
+                       *val = *reg;
+               else {
+                       *reg = *val;
+
+                       if (msr == MSR_CORE_PERF_GLOBAL_OVF_CTRL)
+                               ctxt->global_status &= (~(*val));
+               }
+               return true;
+       }
+
+       return false;
+}
+
+static bool xen_amd_pmu_emulate(unsigned int msr, u64 *val, bool is_read)
+{
+       uint64_t *reg = NULL;
+       int i, off = 0;
+       struct xen_pmu_amd_ctxt *ctxt;
+       uint64_t *counter_regs, *ctrl_regs;
+       struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+       uint8_t xenpmu_flags = get_xenpmu_flags();
+
+       if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING))
+               return false;
+
+       if (k7_counters_mirrored &&
+           ((msr >= MSR_K7_EVNTSEL0) && (msr <= MSR_K7_PERFCTR3)))
+               msr = get_fam15h_addr(msr);
+
+       ctxt = &xenpmu_data->pmu.c.amd;
+       for (i = 0; i < amd_num_counters; i++) {
+               if (msr == amd_ctrls_base + off) {
+                       ctrl_regs = field_offset(ctxt, ctrls);
+                       reg = &ctrl_regs[i];
+                       break;
+               } else if (msr == amd_counters_base + off) {
+                       counter_regs = field_offset(ctxt, counters);
+                       reg = &counter_regs[i];
+                       break;
+               }
+               off += amd_msr_step;
+       }
+
+       if (reg) {
+               if (is_read)
+                       *val = *reg;
+               else
+                       *reg = *val;
+
+               return true;
+       }
+       return false;
+}
+
+bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err)
+{
+       if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+               if (is_amd_pmu_msr(msr)) {
+                       if (!xen_amd_pmu_emulate(msr, val, 1))
+                               *val = native_read_msr_safe(msr, err);
+                       return true;
+               }
+       } else {
+               int type, index;
+
+               if (is_intel_pmu_msr(msr, &type, &index)) {
+                       if (!xen_intel_pmu_emulate(msr, val, type, index, 1))
+                               *val = native_read_msr_safe(msr, err);
+                       return true;
+               }
+       }
+
+       return false;
+}
+
+bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err)
+{
+       uint64_t val = ((uint64_t)high << 32) | low;
+
+       if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+               if (is_amd_pmu_msr(msr)) {
+                       if (!xen_amd_pmu_emulate(msr, &val, 0))
+                               *err = native_write_msr_safe(msr, low, high);
+                       return true;
+               }
+       } else {
+               int type, index;
+
+               if (is_intel_pmu_msr(msr, &type, &index)) {
+                       if (!xen_intel_pmu_emulate(msr, &val, type, index, 0))
+                               *err = native_write_msr_safe(msr, low, high);
+                       return true;
+               }
+       }
+
+       return false;
+}
+
+static unsigned long long xen_amd_read_pmc(int counter)
+{
+       struct xen_pmu_amd_ctxt *ctxt;
+       uint64_t *counter_regs;
+       struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+       uint8_t xenpmu_flags = get_xenpmu_flags();
+
+       if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) {
+               uint32_t msr;
+               int err;
+
+               msr = amd_counters_base + (counter * amd_msr_step);
+               return native_read_msr_safe(msr, &err);
+       }
+
+       ctxt = &xenpmu_data->pmu.c.amd;
+       counter_regs = field_offset(ctxt, counters);
+       return counter_regs[counter];
+}
+
+static unsigned long long xen_intel_read_pmc(int counter)
+{
+       struct xen_pmu_intel_ctxt *ctxt;
+       uint64_t *fixed_counters;
+       struct xen_pmu_cntr_pair *arch_cntr_pair;
+       struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+       uint8_t xenpmu_flags = get_xenpmu_flags();
+
+       if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) {
+               uint32_t msr;
+               int err;
+
+               if (counter & (1 << INTEL_PMC_TYPE_SHIFT))
+                       msr = MSR_CORE_PERF_FIXED_CTR0 + (counter & 0xffff);
+               else
+                       msr = MSR_IA32_PERFCTR0 + counter;
+
+               return native_read_msr_safe(msr, &err);
+       }
+
+       ctxt = &xenpmu_data->pmu.c.intel;
+       if (counter & (1 << INTEL_PMC_TYPE_SHIFT)) {
+               fixed_counters = field_offset(ctxt, fixed_counters);
+               return fixed_counters[counter & 0xffff];
+       }
+
+       arch_cntr_pair = field_offset(ctxt, arch_counters);
+       return arch_cntr_pair[counter].counter;
+}
+
+unsigned long long xen_read_pmc(int counter)
+{
+       if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+               return xen_amd_read_pmc(counter);
+       else
+               return xen_intel_read_pmc(counter);
+}
+
+int pmu_apic_update(uint32_t val)
+{
+       int ret;
+       struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+
+       if (!xenpmu_data) {
+               pr_warn_once("%s: pmudata not initialized\n", __func__);
+               return -EINVAL;
+       }
+
+       xenpmu_data->pmu.l.lapic_lvtpc = val;
+
+       if (get_xenpmu_flags() & XENPMU_IRQ_PROCESSING)
+               return 0;
+
+       ret = HYPERVISOR_xenpmu_op(XENPMU_lvtpc_set, NULL);
+
+       return ret;
+}
+
+/* perf callbacks */
+static int xen_is_in_guest(void)
+{
+       const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+
+       if (!xenpmu_data) {
+               pr_warn_once("%s: pmudata not initialized\n", __func__);
+               return 0;
+       }
+
+       if (!xen_initial_domain() || (xenpmu_data->domain_id >= DOMID_SELF))
+               return 0;
+
+       return 1;
+}
+
+static int xen_is_user_mode(void)
+{
+       const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+
+       if (!xenpmu_data) {
+               pr_warn_once("%s: pmudata not initialized\n", __func__);
+               return 0;
+       }
+
+       if (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_PV)
+               return (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_USER);
+       else
+               return !!(xenpmu_data->pmu.r.regs.cpl & 3);
+}
+
+static unsigned long xen_get_guest_ip(void)
+{
+       const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+
+       if (!xenpmu_data) {
+               pr_warn_once("%s: pmudata not initialized\n", __func__);
+               return 0;
+       }
+
+       return xenpmu_data->pmu.r.regs.ip;
+}
+
+static struct perf_guest_info_callbacks xen_guest_cbs = {
+       .is_in_guest            = xen_is_in_guest,
+       .is_user_mode           = xen_is_user_mode,
+       .get_guest_ip           = xen_get_guest_ip,
+};
+
+/* Convert registers from Xen's format to Linux' */
+static void xen_convert_regs(const struct xen_pmu_regs *xen_regs,
+                            struct pt_regs *regs, uint64_t pmu_flags)
+{
+       regs->ip = xen_regs->ip;
+       regs->cs = xen_regs->cs;
+       regs->sp = xen_regs->sp;
+
+       if (pmu_flags & PMU_SAMPLE_PV) {
+               if (pmu_flags & PMU_SAMPLE_USER)
+                       regs->cs |= 3;
+               else
+                       regs->cs &= ~3;
+       } else {
+               if (xen_regs->cpl)
+                       regs->cs |= 3;
+               else
+                       regs->cs &= ~3;
+       }
+}
+
+irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id)
+{
+       int err, ret = IRQ_NONE;
+       struct pt_regs regs;
+       const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+       uint8_t xenpmu_flags = get_xenpmu_flags();
+
+       if (!xenpmu_data) {
+               pr_warn_once("%s: pmudata not initialized\n", __func__);
+               return ret;
+       }
+
+       this_cpu_ptr(&xenpmu_shared)->flags =
+               xenpmu_flags | XENPMU_IRQ_PROCESSING;
+       xen_convert_regs(&xenpmu_data->pmu.r.regs, &regs,
+                        xenpmu_data->pmu.pmu_flags);
+       if (x86_pmu.handle_irq(&regs))
+               ret = IRQ_HANDLED;
+
+       /* Write out cached context to HW */
+       err = HYPERVISOR_xenpmu_op(XENPMU_flush, NULL);
+       this_cpu_ptr(&xenpmu_shared)->flags = xenpmu_flags;
+       if (err) {
+               pr_warn_once("%s: failed hypercall, err: %d\n", __func__, err);
+               return IRQ_NONE;
+       }
+
+       return ret;
+}
+
+bool is_xen_pmu(int cpu)
+{
+       return (get_xenpmu_data() != NULL);
+}
+
+void xen_pmu_init(int cpu)
+{
+       int err;
+       struct xen_pmu_params xp;
+       unsigned long pfn;
+       struct xen_pmu_data *xenpmu_data;
+
+       BUILD_BUG_ON(sizeof(struct xen_pmu_data) > PAGE_SIZE);
+
+       if (xen_hvm_domain())
+               return;
+
+       xenpmu_data = (struct xen_pmu_data *)get_zeroed_page(GFP_KERNEL);
+       if (!xenpmu_data) {
+               pr_err("VPMU init: No memory\n");
+               return;
+       }
+       pfn = virt_to_pfn(xenpmu_data);
+
+       xp.val = pfn_to_mfn(pfn);
+       xp.vcpu = cpu;
+       xp.version.maj = XENPMU_VER_MAJ;
+       xp.version.min = XENPMU_VER_MIN;
+       err = HYPERVISOR_xenpmu_op(XENPMU_init, &xp);
+       if (err)
+               goto fail;
+
+       per_cpu(xenpmu_shared, cpu).xenpmu_data = xenpmu_data;
+       per_cpu(xenpmu_shared, cpu).flags = 0;
+
+       if (cpu == 0) {
+               perf_register_guest_info_callbacks(&xen_guest_cbs);
+               xen_pmu_arch_init();
+       }
+
+       return;
+
+fail:
+       pr_warn_once("Could not initialize VPMU for cpu %d, error %d\n",
+               cpu, err);
+       free_pages((unsigned long)xenpmu_data, 0);
+}
+
+void xen_pmu_finish(int cpu)
+{
+       struct xen_pmu_params xp;
+
+       if (xen_hvm_domain())
+               return;
+
+       xp.vcpu = cpu;
+       xp.version.maj = XENPMU_VER_MAJ;
+       xp.version.min = XENPMU_VER_MIN;
+
+       (void)HYPERVISOR_xenpmu_op(XENPMU_finish, &xp);
+
+       free_pages((unsigned long)per_cpu(xenpmu_shared, cpu).xenpmu_data, 0);
+       per_cpu(xenpmu_shared, cpu).xenpmu_data = NULL;
+}
diff --git a/arch/x86/xen/pmu.h b/arch/x86/xen/pmu.h
new file mode 100644 (file)
index 0000000..af5f0ad
--- /dev/null
@@ -0,0 +1,15 @@
+#ifndef __XEN_PMU_H
+#define __XEN_PMU_H
+
+#include <xen/interface/xenpmu.h>
+
+irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id);
+void xen_pmu_init(int cpu);
+void xen_pmu_finish(int cpu);
+bool is_xen_pmu(int cpu);
+bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err);
+bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err);
+int pmu_apic_update(uint32_t reg);
+unsigned long long xen_read_pmc(int counter);
+
+#endif /* __XEN_PMU_H */
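
The header above only declares the per-CPU entry points; the call sites (CPU bring-up/tear-down and the PMU interrupt binding) live in the SMP code, which is not part of this excerpt. A hedged sketch of the expected pairing (bind_pmu_virq() is a hypothetical placeholder):

    static void sketch_cpu_up(int cpu)
    {
            xen_pmu_init(cpu);              /* register the shared VPMU page */
            if (is_xen_pmu(cpu))
                    bind_pmu_virq(cpu);     /* hook up xen_pmu_irq_handler() */
    }

    static void sketch_cpu_down(int cpu)
    {
            xen_pmu_finish(cpu);            /* XENPMU_finish + free the page */
    }
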
index 55f388ef481a40a4020b51ca7dd43b7da56ff97c..f5ef6746d47a0ee36f6b0a11edd0c49cbcf3590a 100644 (file)
 #include <xen/interface/memory.h>
 #include <xen/interface/physdev.h>
 #include <xen/features.h>
+#include <xen/hvc-console.h>
 #include "xen-ops.h"
 #include "vdso.h"
-#include "p2m.h"
 #include "mmu.h"
 
+#define GB(x) ((uint64_t)(x) * 1024 * 1024 * 1024)
+
 /* Amount of extra memory space we add to the e820 ranges */
 struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
 
 /* Number of pages released from the initial allocation. */
 unsigned long xen_released_pages;
 
+/* E820 map used during setting up memory. */
+static struct e820entry xen_e820_map[E820MAX] __initdata;
+static u32 xen_e820_map_entries __initdata;
+
 /*
  * Buffer used to remap identity mapped pages. We only need the virtual space.
  * The physical page behind this address is remapped as needed to different
@@ -64,62 +70,89 @@ static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY;
  */
 #define EXTRA_MEM_RATIO                (10)
 
-static void __init xen_add_extra_mem(phys_addr_t start, phys_addr_t size)
+static bool xen_512gb_limit __initdata = IS_ENABLED(CONFIG_XEN_512GB);
+
+static void __init xen_parse_512gb(void)
+{
+       bool val = false;
+       char *arg;
+
+       arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit");
+       if (!arg)
+               return;
+
+       arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit=");
+       if (!arg)
+               val = true;
+       else if (strtobool(arg + strlen("xen_512gb_limit="), &val))
+               return;
+
+       xen_512gb_limit = val;
+}
+
+static void __init xen_add_extra_mem(unsigned long start_pfn,
+                                    unsigned long n_pfns)
 {
        int i;
 
+       /*
+        * No need to check for a zero size: that should happen rarely, and it
+        * would only write a new entry that is still regarded as unused
+        * because of its zero size.
+        */
        for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
                /* Add new region. */
-               if (xen_extra_mem[i].size == 0) {
-                       xen_extra_mem[i].start = start;
-                       xen_extra_mem[i].size  = size;
+               if (xen_extra_mem[i].n_pfns == 0) {
+                       xen_extra_mem[i].start_pfn = start_pfn;
+                       xen_extra_mem[i].n_pfns = n_pfns;
                        break;
                }
                /* Append to existing region. */
-               if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) {
-                       xen_extra_mem[i].size += size;
+               if (xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns ==
+                   start_pfn) {
+                       xen_extra_mem[i].n_pfns += n_pfns;
                        break;
                }
        }
        if (i == XEN_EXTRA_MEM_MAX_REGIONS)
                printk(KERN_WARNING "Warning: not enough extra memory regions\n");
 
-       memblock_reserve(start, size);
+       memblock_reserve(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
 }
 
-static void __init xen_del_extra_mem(phys_addr_t start, phys_addr_t size)
+static void __init xen_del_extra_mem(unsigned long start_pfn,
+                                    unsigned long n_pfns)
 {
        int i;
-       phys_addr_t start_r, size_r;
+       unsigned long start_r, size_r;
 
        for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
-               start_r = xen_extra_mem[i].start;
-               size_r = xen_extra_mem[i].size;
+               start_r = xen_extra_mem[i].start_pfn;
+               size_r = xen_extra_mem[i].n_pfns;
 
                /* Start of region. */
-               if (start_r == start) {
-                       BUG_ON(size > size_r);
-                       xen_extra_mem[i].start += size;
-                       xen_extra_mem[i].size -= size;
+               if (start_r == start_pfn) {
+                       BUG_ON(n_pfns > size_r);
+                       xen_extra_mem[i].start_pfn += n_pfns;
+                       xen_extra_mem[i].n_pfns -= n_pfns;
                        break;
                }
                /* End of region. */
-               if (start_r + size_r == start + size) {
-                       BUG_ON(size > size_r);
-                       xen_extra_mem[i].size -= size;
+               if (start_r + size_r == start_pfn + n_pfns) {
+                       BUG_ON(n_pfns > size_r);
+                       xen_extra_mem[i].n_pfns -= n_pfns;
                        break;
                }
                /* Mid of region. */
-               if (start > start_r && start < start_r + size_r) {
-                       BUG_ON(start + size > start_r + size_r);
-                       xen_extra_mem[i].size = start - start_r;
+               if (start_pfn > start_r && start_pfn < start_r + size_r) {
+                       BUG_ON(start_pfn + n_pfns > start_r + size_r);
+                       xen_extra_mem[i].n_pfns = start_pfn - start_r;
                        /* Calling memblock_reserve() again is okay. */
-                       xen_add_extra_mem(start + size, start_r + size_r -
-                                         (start + size));
+                       xen_add_extra_mem(start_pfn + n_pfns, start_r + size_r -
+                                         (start_pfn + n_pfns));
                        break;
                }
        }
-       memblock_free(start, size);
+       memblock_free(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
 }
 
 /*
@@ -130,11 +163,10 @@ static void __init xen_del_extra_mem(phys_addr_t start, phys_addr_t size)
 unsigned long __ref xen_chk_extra_mem(unsigned long pfn)
 {
        int i;
-       phys_addr_t addr = PFN_PHYS(pfn);
 
        for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
-               if (addr >= xen_extra_mem[i].start &&
-                   addr < xen_extra_mem[i].start + xen_extra_mem[i].size)
+               if (pfn >= xen_extra_mem[i].start_pfn &&
+                   pfn < xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns)
                        return INVALID_P2M_ENTRY;
        }
 
@@ -150,10 +182,10 @@ void __init xen_inv_extra_mem(void)
        int i;
 
        for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
-               if (!xen_extra_mem[i].size)
+               if (!xen_extra_mem[i].n_pfns)
                        continue;
-               pfn_s = PFN_DOWN(xen_extra_mem[i].start);
-               pfn_e = PFN_UP(xen_extra_mem[i].start + xen_extra_mem[i].size);
+               pfn_s = xen_extra_mem[i].start_pfn;
+               pfn_e = pfn_s + xen_extra_mem[i].n_pfns;
                for (pfn = pfn_s; pfn < pfn_e; pfn++)
                        set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
        }
@@ -164,15 +196,13 @@ void __init xen_inv_extra_mem(void)
  * This function updates min_pfn with the pfn found and returns
  * the size of that range or zero if not found.
  */
-static unsigned long __init xen_find_pfn_range(
-       const struct e820entry *list, size_t map_size,
-       unsigned long *min_pfn)
+static unsigned long __init xen_find_pfn_range(unsigned long *min_pfn)
 {
-       const struct e820entry *entry;
+       const struct e820entry *entry = xen_e820_map;
        unsigned int i;
        unsigned long done = 0;
 
-       for (i = 0, entry = list; i < map_size; i++, entry++) {
+       for (i = 0; i < xen_e820_map_entries; i++, entry++) {
                unsigned long s_pfn;
                unsigned long e_pfn;
 
@@ -221,7 +251,7 @@ static int __init xen_free_mfn(unsigned long mfn)
  * as a fallback if the remapping fails.
  */
 static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
-       unsigned long end_pfn, unsigned long nr_pages, unsigned long *released)
+                       unsigned long end_pfn, unsigned long nr_pages)
 {
        unsigned long pfn, end;
        int ret;
@@ -241,7 +271,7 @@ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
                WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret);
 
                if (ret == 1) {
-                       (*released)++;
+                       xen_released_pages++;
                        if (!__set_phys_to_machine(pfn, INVALID_P2M_ENTRY))
                                break;
                } else
@@ -356,9 +386,8 @@ static void __init xen_do_set_identity_and_remap_chunk(
  * to Xen and not remapped.
  */
 static unsigned long __init xen_set_identity_and_remap_chunk(
-        const struct e820entry *list, size_t map_size, unsigned long start_pfn,
-       unsigned long end_pfn, unsigned long nr_pages, unsigned long remap_pfn,
-       unsigned long *released, unsigned long *remapped)
+       unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages,
+       unsigned long remap_pfn)
 {
        unsigned long pfn;
        unsigned long i = 0;
@@ -379,12 +408,11 @@ static unsigned long __init xen_set_identity_and_remap_chunk(
                if (cur_pfn + size > nr_pages)
                        size = nr_pages - cur_pfn;
 
-               remap_range_size = xen_find_pfn_range(list, map_size,
-                                                     &remap_pfn);
+               remap_range_size = xen_find_pfn_range(&remap_pfn);
                if (!remap_range_size) {
                        pr_warning("Unable to find available pfn range, not remapping identity pages\n");
                        xen_set_identity_and_release_chunk(cur_pfn,
-                               cur_pfn + left, nr_pages, released);
+                                               cur_pfn + left, nr_pages);
                        break;
                }
                /* Adjust size to fit in current e820 RAM region */
@@ -396,7 +424,6 @@ static unsigned long __init xen_set_identity_and_remap_chunk(
                /* Update variables to reflect new mappings. */
                i += size;
                remap_pfn += size;
-               *remapped += size;
        }
 
        /*
@@ -411,15 +438,11 @@ static unsigned long __init xen_set_identity_and_remap_chunk(
        return remap_pfn;
 }
 
-static void __init xen_set_identity_and_remap(
-       const struct e820entry *list, size_t map_size, unsigned long nr_pages,
-       unsigned long *released, unsigned long *remapped)
+static void __init xen_set_identity_and_remap(unsigned long nr_pages)
 {
        phys_addr_t start = 0;
        unsigned long last_pfn = nr_pages;
-       const struct e820entry *entry;
-       unsigned long num_released = 0;
-       unsigned long num_remapped = 0;
+       const struct e820entry *entry = xen_e820_map;
        int i;
 
        /*
@@ -433,9 +456,9 @@ static void __init xen_set_identity_and_remap(
         * example) the DMI tables in a reserved region that begins on
         * a non-page boundary.
         */
-       for (i = 0, entry = list; i < map_size; i++, entry++) {
+       for (i = 0; i < xen_e820_map_entries; i++, entry++) {
                phys_addr_t end = entry->addr + entry->size;
-               if (entry->type == E820_RAM || i == map_size - 1) {
+               if (entry->type == E820_RAM || i == xen_e820_map_entries - 1) {
                        unsigned long start_pfn = PFN_DOWN(start);
                        unsigned long end_pfn = PFN_UP(end);
 
@@ -444,17 +467,13 @@ static void __init xen_set_identity_and_remap(
 
                        if (start_pfn < end_pfn)
                                last_pfn = xen_set_identity_and_remap_chunk(
-                                               list, map_size, start_pfn,
-                                               end_pfn, nr_pages, last_pfn,
-                                               &num_released, &num_remapped);
+                                               start_pfn, end_pfn, nr_pages,
+                                               last_pfn);
                        start = end;
                }
        }
 
-       *released = num_released;
-       *remapped = num_remapped;
-
-       pr_info("Released %ld page(s)\n", num_released);
+       pr_info("Released %ld page(s)\n", xen_released_pages);
 }
 
 /*
@@ -494,7 +513,7 @@ void __init xen_remap_memory(void)
                } else if (pfn_s + len == xen_remap_buf.target_pfn) {
                        len += xen_remap_buf.size;
                } else {
-                       xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len));
+                       xen_del_extra_mem(pfn_s, len);
                        pfn_s = xen_remap_buf.target_pfn;
                        len = xen_remap_buf.size;
                }
@@ -504,19 +523,36 @@ void __init xen_remap_memory(void)
        }
 
        if (pfn_s != ~0UL && len)
-               xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len));
+               xen_del_extra_mem(pfn_s, len);
 
        set_pte_mfn(buf, mfn_save, PAGE_KERNEL);
 
        pr_info("Remapped %ld page(s)\n", remapped);
 }
 
+static unsigned long __init xen_get_pages_limit(void)
+{
+       unsigned long limit;
+
+#ifdef CONFIG_X86_32
+       limit = GB(64) / PAGE_SIZE;
+#else
+       limit = MAXMEM / PAGE_SIZE;
+       if (!xen_initial_domain() && xen_512gb_limit)
+               limit = GB(512) / PAGE_SIZE;
+#endif
+       return limit;
+}
+
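
Editor's note: the new xen_get_pages_limit() turns a byte-size ceiling (64 GB on 32-bit, MAXMEM or optionally 512 GB on 64-bit) into a page-count ceiling. A small hedged sketch of that conversion follows; PAGE_SIZE and GB() are defined locally and only approximate the kernel macros.

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define GB(x)     ((unsigned long long)(x) << 30)   /* bytes in x gigabytes */

/* Convert a byte ceiling into a page-count ceiling, as the limit helper does. */
static unsigned long long pages_limit(unsigned long long bytes)
{
        return bytes / PAGE_SIZE;
}

int main(void)
{
        printf("64GB  -> %llu pages\n", pages_limit(GB(64)));    /* 16777216 */
        printf("512GB -> %llu pages\n", pages_limit(GB(512)));   /* 134217728 */
        return 0;
}
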
 static unsigned long __init xen_get_max_pages(void)
 {
-       unsigned long max_pages = MAX_DOMAIN_PAGES;
+       unsigned long max_pages, limit;
        domid_t domid = DOMID_SELF;
        int ret;
 
+       limit = xen_get_pages_limit();
+       max_pages = limit;
+
        /*
         * For the initial domain we use the maximum reservation as
         * the maximum page.
@@ -532,7 +568,7 @@ static unsigned long __init xen_get_max_pages(void)
                        max_pages = ret;
        }
 
-       return min(max_pages, MAX_DOMAIN_PAGES);
+       return min(max_pages, limit);
 }
 
 static void __init xen_align_and_add_e820_region(phys_addr_t start,
@@ -549,39 +585,188 @@ static void __init xen_align_and_add_e820_region(phys_addr_t start,
        e820_add_region(start, end - start, type);
 }
 
-static void __init xen_ignore_unusable(struct e820entry *list, size_t map_size)
+static void __init xen_ignore_unusable(void)
 {
-       struct e820entry *entry;
+       struct e820entry *entry = xen_e820_map;
        unsigned int i;
 
-       for (i = 0, entry = list; i < map_size; i++, entry++) {
+       for (i = 0; i < xen_e820_map_entries; i++, entry++) {
                if (entry->type == E820_UNUSABLE)
                        entry->type = E820_RAM;
        }
 }
 
+static unsigned long __init xen_count_remap_pages(unsigned long max_pfn)
+{
+       unsigned long extra = 0;
+       unsigned long start_pfn, end_pfn;
+       const struct e820entry *entry = xen_e820_map;
+       int i;
+
+       end_pfn = 0;
+       for (i = 0; i < xen_e820_map_entries; i++, entry++) {
+               start_pfn = PFN_DOWN(entry->addr);
+               /* Handle adjacent regions that meet on a non-page boundary. */
[This insert intentionally left unused]
+               end_pfn = min(end_pfn, start_pfn);
+
+               if (start_pfn >= max_pfn)
+                       return extra + max_pfn - end_pfn;
+
+               /* Add any holes in map to result. */
+               extra += start_pfn - end_pfn;
+
+               end_pfn = PFN_UP(entry->addr + entry->size);
+               end_pfn = min(end_pfn, max_pfn);
+
+               if (entry->type != E820_RAM)
+                       extra += end_pfn - start_pfn;
+       }
+
+       return extra;
+}
+
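
Editor's note: xen_count_remap_pages() makes one pass over the Xen-supplied E820 map and counts, below max_pfn, both the holes between entries and the pages covered by non-RAM entries; that is the number of pages that will later be remapped and must be budgeted for. The following standalone sketch mirrors that accounting over a made-up map; struct map_entry, the E820_RAM value and the sample entries are assumptions for illustration only.

#include <stdio.h>

#define E820_RAM 1

struct map_entry {
        unsigned long start_pfn;   /* first pfn of the entry */
        unsigned long end_pfn;     /* one past the last pfn of the entry */
        int type;
};

/* Count pages below max_pfn that are holes or non-RAM: the remap candidates. */
static unsigned long count_remap_pages(const struct map_entry *map, int n,
                                       unsigned long max_pfn)
{
        unsigned long extra = 0, end_pfn = 0;

        for (int i = 0; i < n; i++) {
                unsigned long start_pfn = map[i].start_pfn;

                if (end_pfn > start_pfn)          /* entries sharing a page */
                        end_pfn = start_pfn;
                if (start_pfn >= max_pfn)
                        return extra + max_pfn - end_pfn;

                extra += start_pfn - end_pfn;     /* hole before this entry */
                end_pfn = map[i].end_pfn;
                if (end_pfn > max_pfn)
                        end_pfn = max_pfn;
                if (map[i].type != E820_RAM)
                        extra += end_pfn - start_pfn;   /* non-RAM pages */
        }
        return extra;
}

int main(void)
{
        const struct map_entry map[] = {
                { 0x000, 0x0a0, E820_RAM },
                { 0x100, 0x120, 0 },              /* reserved: 0x20 pages */
                { 0x120, 0x800, E820_RAM },
        };

        /* hole 0x0a0..0x100 (0x60 pages) + 0x20 reserved pages = 0x80 */
        printf("%#lx\n", count_remap_pages(map, 3, 0x800));
        return 0;
}
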
+bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size)
+{
+       struct e820entry *entry;
+       unsigned mapcnt;
+       phys_addr_t end;
+
+       if (!size)
+               return false;
+
+       end = start + size;
+       entry = xen_e820_map;
+
+       for (mapcnt = 0; mapcnt < xen_e820_map_entries; mapcnt++) {
+               if (entry->type == E820_RAM && entry->addr <= start &&
+                   (entry->addr + entry->size) >= end)
+                       return false;
+
+               entry++;
+       }
+
+       return true;
+}
+
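
Editor's note: xen_is_e820_reserved() declares a range reserved unless a single RAM entry covers it completely. A hedged standalone version of the same test, again with an invented entry layout:

#include <stdbool.h>
#include <stdio.h>

#define E820_RAM 1

struct map_entry {
        unsigned long long addr;
        unsigned long long size;
        int type;
};

/* True unless [start, start + size) lies entirely inside one RAM entry. */
static bool range_is_reserved(const struct map_entry *map, int n,
                              unsigned long long start, unsigned long long size)
{
        unsigned long long end = start + size;

        if (!size)
                return false;

        for (int i = 0; i < n; i++)
                if (map[i].type == E820_RAM && map[i].addr <= start &&
                    map[i].addr + map[i].size >= end)
                        return false;

        return true;
}

int main(void)
{
        const struct map_entry map[] = {
                { 0x00100000, 0x3ff00000, E820_RAM },   /* 1 MiB .. 1 GiB */
        };

        printf("%d\n", range_is_reserved(map, 1, 0x00200000, 0x1000)); /* 0 */
        printf("%d\n", range_is_reserved(map, 1, 0x40000000, 0x1000)); /* 1 */
        return 0;
}
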
+/*
+ * Find a free area of physical memory that is not yet reserved and that lies
+ * within a RAM region of the E820 map.
+ * Used to relocate pre-allocated areas like the initrd or the p2m list which
+ * conflict with the E820 map that is going to be used.
+ * Returns 0 if no area is found; otherwise returns the physical address of
+ * the area, which has already been reserved as a convenience to the caller.
+ */
+phys_addr_t __init xen_find_free_area(phys_addr_t size)
+{
+       unsigned mapcnt;
+       phys_addr_t addr, start;
+       struct e820entry *entry = xen_e820_map;
+
+       for (mapcnt = 0; mapcnt < xen_e820_map_entries; mapcnt++, entry++) {
+               if (entry->type != E820_RAM || entry->size < size)
+                       continue;
+               start = entry->addr;
+               for (addr = start; addr < start + size; addr += PAGE_SIZE) {
+                       if (!memblock_is_reserved(addr))
+                               continue;
+                       start = addr + PAGE_SIZE;
+                       if (start + size > entry->addr + entry->size)
+                               break;
+               }
+               if (addr >= start + size) {
+                       memblock_reserve(start, size);
+                       return start;
+               }
+       }
+
+       return 0;
+}
+
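
Editor's note: xen_find_free_area() does a first-fit scan inside each RAM region, restarting its candidate window one page past every address that memblock reports as reserved, and reserves the window before returning it. The sketch below reproduces that scan for a single region, with a plain bitmap standing in for memblock_is_reserved(); all names and sizes are illustrative.

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE   4096ULL
#define TOTAL_PAGES 1024

static bool reserved[TOTAL_PAGES];   /* stand-in for memblock_is_reserved() */

/* First-fit scan for @size bytes of unreserved space in [base, base + len). */
static unsigned long long find_free_area(unsigned long long base,
                                         unsigned long long len,
                                         unsigned long long size)
{
        unsigned long long start = base, addr;

        for (addr = start; addr < start + size; addr += PAGE_SIZE) {
                if (!reserved[addr / PAGE_SIZE])
                        continue;
                start = addr + PAGE_SIZE;     /* restart window past the hit */
                if (start + size > base + len)
                        return 0;             /* window no longer fits */
        }
        return (addr >= start + size) ? start : 0;
}

int main(void)
{
        reserved[2] = true;                   /* block pages 2 and 5 */
        reserved[5] = true;
        printf("%#llx\n", find_free_area(0, TOTAL_PAGES * PAGE_SIZE,
                                         4 * PAGE_SIZE));   /* expect 0x6000 */
        return 0;
}
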
+/*
+ * Like memcpy, but with physical addresses for dest and src.
+ */
+static void __init xen_phys_memcpy(phys_addr_t dest, phys_addr_t src,
+                                  phys_addr_t n)
+{
+       phys_addr_t dest_off, src_off, dest_len, src_len, len;
+       void *from, *to;
+
+       while (n) {
+               dest_off = dest & ~PAGE_MASK;
+               src_off = src & ~PAGE_MASK;
+               dest_len = n;
+               if (dest_len > (NR_FIX_BTMAPS << PAGE_SHIFT) - dest_off)
+                       dest_len = (NR_FIX_BTMAPS << PAGE_SHIFT) - dest_off;
+               src_len = n;
+               if (src_len > (NR_FIX_BTMAPS << PAGE_SHIFT) - src_off)
+                       src_len = (NR_FIX_BTMAPS << PAGE_SHIFT) - src_off;
+               len = min(dest_len, src_len);
+               to = early_memremap(dest - dest_off, dest_len + dest_off);
+               from = early_memremap(src - src_off, src_len + src_off);
+               memcpy(to, from, len);
+               early_memunmap(to, dest_len + dest_off);
+               early_memunmap(from, src_len + src_off);
+               n -= len;
+               dest += len;
+               src += len;
+       }
+}
+
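
Editor's note: xen_phys_memcpy() can only map a bounded window at a time through early_memremap() (NR_FIX_BTMAPS pages), so each chunk is limited by whatever still fits in the window once the source and destination page offsets are subtracted. Here is a hedged userspace sketch of the same chunking arithmetic, copying inside one flat buffer through an artificially tiny window; the sizes are illustrative only.

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE   16UL              /* tiny page so the chunking is visible */
#define WINDOW_SIZE (4 * PAGE_SIZE)   /* stand-in for NR_FIX_BTMAPS pages */

static unsigned char mem[4096];       /* pretend this is physical memory */

/* Copy n bytes from src to dest, never "mapping" more than WINDOW_SIZE. */
static void chunked_copy(unsigned long dest, unsigned long src, unsigned long n)
{
        while (n) {
                unsigned long dest_off = dest % PAGE_SIZE;
                unsigned long src_off = src % PAGE_SIZE;
                unsigned long dest_len = n, src_len = n, len;

                if (dest_len > WINDOW_SIZE - dest_off)
                        dest_len = WINDOW_SIZE - dest_off;
                if (src_len > WINDOW_SIZE - src_off)
                        src_len = WINDOW_SIZE - src_off;
                len = dest_len < src_len ? dest_len : src_len;

                memcpy(&mem[dest], &mem[src], len);  /* one window-sized chunk */
                n -= len;
                dest += len;
                src += len;
        }
}

int main(void)
{
        memset(&mem[100], 0xab, 300);
        chunked_copy(1000, 100, 300);           /* forces several chunks */
        printf("%d\n", memcmp(&mem[1000], &mem[100], 300) == 0);   /* 1 */
        return 0;
}
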
+/*
+ * Reserve Xen mfn_list.
+ */
+static void __init xen_reserve_xen_mfnlist(void)
+{
+       phys_addr_t start, size;
+
+       if (xen_start_info->mfn_list >= __START_KERNEL_map) {
+               start = __pa(xen_start_info->mfn_list);
+               size = PFN_ALIGN(xen_start_info->nr_pages *
+                                sizeof(unsigned long));
+       } else {
+               start = PFN_PHYS(xen_start_info->first_p2m_pfn);
+               size = PFN_PHYS(xen_start_info->nr_p2m_frames);
+       }
+
+       if (!xen_is_e820_reserved(start, size)) {
+               memblock_reserve(start, size);
+               return;
+       }
+
+#ifdef CONFIG_X86_32
+       /*
+        * Relocating the p2m on a 32-bit system to an arbitrary virtual address
+        * is not supported, so just give up.
+        */
+       xen_raw_console_write("Xen hypervisor allocated p2m list conflicts with E820 map\n");
+       BUG();
+#else
+       xen_relocate_p2m();
+#endif
+}
+
 /**
  * machine_specific_memory_setup - Hook for machine specific memory setup.
  **/
 char * __init xen_memory_setup(void)
 {
-       static struct e820entry map[E820MAX] __initdata;
-
-       unsigned long max_pfn = xen_start_info->nr_pages;
-       phys_addr_t mem_end;
+       unsigned long max_pfn, pfn_s, n_pfns;
+       phys_addr_t mem_end, addr, size, chunk_size;
+       u32 type;
        int rc;
        struct xen_memory_map memmap;
        unsigned long max_pages;
        unsigned long extra_pages = 0;
-       unsigned long remapped_pages;
        int i;
        int op;
 
-       max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
+       xen_parse_512gb();
+       max_pfn = xen_get_pages_limit();
+       max_pfn = min(max_pfn, xen_start_info->nr_pages);
        mem_end = PFN_PHYS(max_pfn);
 
        memmap.nr_entries = E820MAX;
-       set_xen_guest_handle(memmap.buffer, map);
+       set_xen_guest_handle(memmap.buffer, xen_e820_map);
 
        op = xen_initial_domain() ?
                XENMEM_machine_memory_map :
@@ -590,15 +775,16 @@ char * __init xen_memory_setup(void)
        if (rc == -ENOSYS) {
                BUG_ON(xen_initial_domain());
                memmap.nr_entries = 1;
-               map[0].addr = 0ULL;
-               map[0].size = mem_end;
+               xen_e820_map[0].addr = 0ULL;
+               xen_e820_map[0].size = mem_end;
                /* 8MB slack (to balance backend allocations). */
-               map[0].size += 8ULL << 20;
-               map[0].type = E820_RAM;
+               xen_e820_map[0].size += 8ULL << 20;
+               xen_e820_map[0].type = E820_RAM;
                rc = 0;
        }
        BUG_ON(rc);
        BUG_ON(memmap.nr_entries == 0);
+       xen_e820_map_entries = memmap.nr_entries;
 
        /*
         * Xen won't allow a 1:1 mapping to be created to UNUSABLE
@@ -609,24 +795,19 @@ char * __init xen_memory_setup(void)
         * a patch in the future.
         */
        if (xen_initial_domain())
-               xen_ignore_unusable(map, memmap.nr_entries);
+               xen_ignore_unusable();
 
        /* Make sure the Xen-supplied memory map is well-ordered. */
-       sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries);
+       sanitize_e820_map(xen_e820_map, xen_e820_map_entries,
+                         &xen_e820_map_entries);
 
        max_pages = xen_get_max_pages();
-       if (max_pages > max_pfn)
-               extra_pages += max_pages - max_pfn;
 
-       /*
-        * Set identity map on non-RAM pages and prepare remapping the
-        * underlying RAM.
-        */
-       xen_set_identity_and_remap(map, memmap.nr_entries, max_pfn,
-                                  &xen_released_pages, &remapped_pages);
+       /* How many extra pages do we need due to remapping? */
+       max_pages += xen_count_remap_pages(max_pfn);
 
-       extra_pages += xen_released_pages;
-       extra_pages += remapped_pages;
+       if (max_pages > max_pfn)
+               extra_pages += max_pages - max_pfn;
 
        /*
         * Clamp the amount of extra memory to a EXTRA_MEM_RATIO
@@ -635,46 +816,54 @@ char * __init xen_memory_setup(void)
         * is limited to the max size of lowmem, so that it doesn't
         * get completely filled.
         *
+        * Make sure we have no memory above max_pages, as this area
+        * isn't handled by the p2m management.
+        *
         * In principle there could be a problem in lowmem systems if
         * the initial memory is also very large with respect to
         * lowmem, but we won't try to deal with that here.
         */
-       extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
-                         extra_pages);
+       extra_pages = min3(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
+                          extra_pages, max_pages - max_pfn);
        i = 0;
-       while (i < memmap.nr_entries) {
-               phys_addr_t addr = map[i].addr;
-               phys_addr_t size = map[i].size;
-               u32 type = map[i].type;
+       addr = xen_e820_map[0].addr;
+       size = xen_e820_map[0].size;
+       while (i < xen_e820_map_entries) {
+               chunk_size = size;
+               type = xen_e820_map[i].type;
 
                if (type == E820_RAM) {
                        if (addr < mem_end) {
-                               size = min(size, mem_end - addr);
+                               chunk_size = min(size, mem_end - addr);
                        } else if (extra_pages) {
-                               size = min(size, PFN_PHYS(extra_pages));
-                               extra_pages -= PFN_DOWN(size);
-                               xen_add_extra_mem(addr, size);
-                               xen_max_p2m_pfn = PFN_DOWN(addr + size);
+                               chunk_size = min(size, PFN_PHYS(extra_pages));
+                               pfn_s = PFN_UP(addr);
+                               n_pfns = PFN_DOWN(addr + chunk_size) - pfn_s;
+                               extra_pages -= n_pfns;
+                               xen_add_extra_mem(pfn_s, n_pfns);
+                               xen_max_p2m_pfn = pfn_s + n_pfns;
                        } else
                                type = E820_UNUSABLE;
                }
 
-               xen_align_and_add_e820_region(addr, size, type);
+               xen_align_and_add_e820_region(addr, chunk_size, type);
 
-               map[i].addr += size;
-               map[i].size -= size;
-               if (map[i].size == 0)
+               addr += chunk_size;
+               size -= chunk_size;
+               if (size == 0) {
                        i++;
+                       if (i < xen_e820_map_entries) {
+                               addr = xen_e820_map[i].addr;
+                               size = xen_e820_map[i].size;
+                       }
+               }
        }
 
        /*
         * Set the rest as identity mapped, in case PCI BARs are
         * located here.
-        *
-        * PFNs above MAX_P2M_PFN are considered identity mapped as
-        * well.
         */
-       set_phys_range_identity(map[i-1].addr / PAGE_SIZE, ~0ul);
+       set_phys_range_identity(addr / PAGE_SIZE, ~0ul);
 
        /*
         * In domU, the ISA region is normal, usable memory, but we
@@ -684,34 +873,53 @@ char * __init xen_memory_setup(void)
        e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
                        E820_RESERVED);
 
+       sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+
        /*
-        * Reserve Xen bits:
-        *  - mfn_list
-        *  - xen_start_info
-        * See comment above "struct start_info" in <xen/interface/xen.h>
-        * We tried to make the the memblock_reserve more selective so
-        * that it would be clear what region is reserved. Sadly we ran
-        * in the problem wherein on a 64-bit hypervisor with a 32-bit
-        * initial domain, the pt_base has the cr3 value which is not
-        * neccessarily where the pagetable starts! As Jan put it: "
-        * Actually, the adjustment turns out to be correct: The page
-        * tables for a 32-on-64 dom0 get allocated in the order "first L1",
-        * "first L2", "first L3", so the offset to the page table base is
-        * indeed 2. When reading xen/include/public/xen.h's comment
-        * very strictly, this is not a violation (since there nothing is said
-        * that the first thing in the page table space is pointed to by
-        * pt_base; I admit that this seems to be implied though, namely
-        * do I think that it is implied that the page table space is the
-        * range [pt_base, pt_base + nt_pt_frames), whereas that
-        * range here indeed is [pt_base - 2, pt_base - 2 + nt_pt_frames),
-        * which - without a priori knowledge - the kernel would have
-        * difficulty to figure out)." - so lets just fall back to the
-        * easy way and reserve the whole region.
+        * Check whether the kernel itself conflicts with the target E820 map.
+        * Failing now is better than running into weird problems later due
+        * to relocating (and even reusing) pages with kernel text or data.
         */
-       memblock_reserve(__pa(xen_start_info->mfn_list),
-                        xen_start_info->pt_base - xen_start_info->mfn_list);
+       if (xen_is_e820_reserved(__pa_symbol(_text),
+                       __pa_symbol(__bss_stop) - __pa_symbol(_text))) {
+               xen_raw_console_write("Xen hypervisor allocated kernel memory conflicts with E820 map\n");
+               BUG();
+       }
 
-       sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+       /*
+        * Check for a conflict of the hypervisor supplied page tables with
+        * the target E820 map.
+        */
+       xen_pt_check_e820();
+
+       xen_reserve_xen_mfnlist();
+
+       /* Check for a conflict of the initrd with the target E820 map. */
+       if (xen_is_e820_reserved(boot_params.hdr.ramdisk_image,
+                                boot_params.hdr.ramdisk_size)) {
+               phys_addr_t new_area, start, size;
+
+               new_area = xen_find_free_area(boot_params.hdr.ramdisk_size);
+               if (!new_area) {
+                       xen_raw_console_write("Can't find new memory area for initrd needed due to E820 map conflict\n");
+                       BUG();
+               }
+
+               start = boot_params.hdr.ramdisk_image;
+               size = boot_params.hdr.ramdisk_size;
+               xen_phys_memcpy(new_area, start, size);
+               pr_info("initrd moved from [mem %#010llx-%#010llx] to [mem %#010llx-%#010llx]\n",
+                       start, start + size, new_area, new_area + size);
+               memblock_free(start, size);
+               boot_params.hdr.ramdisk_image = new_area;
+               boot_params.ext_ramdisk_image = new_area >> 32;
+       }
+
+       /*
+        * Set identity map on non-RAM pages and prepare remapping the
+        * underlying RAM.
+        */
+       xen_set_identity_and_remap(max_pfn);
 
        return "Xen";
 }
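
Editor's note: when the initrd conflicts with the target E820 map it is copied into a freshly reserved area, the old range is freed, and the new physical address is stored split across the 32-bit ramdisk_image field and the ext_ramdisk_image high word. A small hedged sketch of that low/high split; struct fake_boot_params is a stand-in, only the field names mirror boot_params.

#include <stdint.h>
#include <stdio.h>

/* Stand-in for the two boot_params fields that hold the initrd address. */
struct fake_boot_params {
        uint32_t ramdisk_image;      /* low 32 bits of the physical address */
        uint32_t ext_ramdisk_image;  /* high 32 bits */
};

static void set_ramdisk_addr(struct fake_boot_params *bp, uint64_t new_area)
{
        bp->ramdisk_image = (uint32_t)new_area;   /* truncates to the low bits */
        bp->ext_ramdisk_image = new_area >> 32;
}

int main(void)
{
        struct fake_boot_params bp;
        uint64_t new_area = 0x1234567890ULL;

        set_ramdisk_addr(&bp, new_area);
        printf("low=%#x high=%#x\n", (unsigned)bp.ramdisk_image,
               (unsigned)bp.ext_ramdisk_image);
        printf("reassembled=%#llx\n",                      /* 0x1234567890 */
               ((unsigned long long)bp.ext_ramdisk_image << 32) |
               bp.ramdisk_image);
        return 0;
}
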
@@ -721,26 +929,30 @@ char * __init xen_memory_setup(void)
  */
 char * __init xen_auto_xlated_memory_setup(void)
 {
-       static struct e820entry map[E820MAX] __initdata;
-
        struct xen_memory_map memmap;
        int i;
        int rc;
 
        memmap.nr_entries = E820MAX;
-       set_xen_guest_handle(memmap.buffer, map);
+       set_xen_guest_handle(memmap.buffer, xen_e820_map);
 
        rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
        if (rc < 0)
                panic("No memory map (%d)\n", rc);
 
-       sanitize_e820_map(map, ARRAY_SIZE(map), &memmap.nr_entries);
+       xen_e820_map_entries = memmap.nr_entries;
+
+       sanitize_e820_map(xen_e820_map, ARRAY_SIZE(xen_e820_map),
+                         &xen_e820_map_entries);
 
-       for (i = 0; i < memmap.nr_entries; i++)
-               e820_add_region(map[i].addr, map[i].size, map[i].type);
+       for (i = 0; i < xen_e820_map_entries; i++)
+               e820_add_region(xen_e820_map[i].addr, xen_e820_map[i].size,
+                               xen_e820_map[i].type);
 
-       memblock_reserve(__pa(xen_start_info->mfn_list),
-                        xen_start_info->pt_base - xen_start_info->mfn_list);
+       /* Remove p2m info, it is not needed. */
+       xen_start_info->mfn_list = 0;
+       xen_start_info->first_p2m_pfn = 0;
+       xen_start_info->nr_p2m_frames = 0;
 
        return "Xen";
 }
index 86484384492e97d8d41d030bdb78fd0119a8b2cf..3f4ebf0261f28620a96cdd1cd6366c311c6761b5 100644 (file)
@@ -26,6 +26,7 @@
 
 #include <xen/interface/xen.h>
 #include <xen/interface/vcpu.h>
+#include <xen/interface/xenpmu.h>
 
 #include <asm/xen/interface.h>
 #include <asm/xen/hypercall.h>
@@ -38,6 +39,7 @@
 #include "xen-ops.h"
 #include "mmu.h"
 #include "smp.h"
+#include "pmu.h"
 
 cpumask_var_t xen_cpu_initialized_map;
 
@@ -50,6 +52,7 @@ static DEFINE_PER_CPU(struct xen_common_irq, xen_callfunc_irq) = { .irq = -1 };
 static DEFINE_PER_CPU(struct xen_common_irq, xen_callfuncsingle_irq) = { .irq = -1 };
 static DEFINE_PER_CPU(struct xen_common_irq, xen_irq_work) = { .irq = -1 };
 static DEFINE_PER_CPU(struct xen_common_irq, xen_debug_irq) = { .irq = -1 };
+static DEFINE_PER_CPU(struct xen_common_irq, xen_pmu_irq) = { .irq = -1 };
 
 static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
 static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
@@ -148,11 +151,18 @@ static void xen_smp_intr_free(unsigned int cpu)
                kfree(per_cpu(xen_irq_work, cpu).name);
                per_cpu(xen_irq_work, cpu).name = NULL;
        }
+
+       if (per_cpu(xen_pmu_irq, cpu).irq >= 0) {
+               unbind_from_irqhandler(per_cpu(xen_pmu_irq, cpu).irq, NULL);
+               per_cpu(xen_pmu_irq, cpu).irq = -1;
+               kfree(per_cpu(xen_pmu_irq, cpu).name);
+               per_cpu(xen_pmu_irq, cpu).name = NULL;
+       }
 };
 static int xen_smp_intr_init(unsigned int cpu)
 {
        int rc;
-       char *resched_name, *callfunc_name, *debug_name;
+       char *resched_name, *callfunc_name, *debug_name, *pmu_name;
 
        resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu);
        rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,
@@ -218,6 +228,18 @@ static int xen_smp_intr_init(unsigned int cpu)
        per_cpu(xen_irq_work, cpu).irq = rc;
        per_cpu(xen_irq_work, cpu).name = callfunc_name;
 
+       if (is_xen_pmu(cpu)) {
+               pmu_name = kasprintf(GFP_KERNEL, "pmu%d", cpu);
+               rc = bind_virq_to_irqhandler(VIRQ_XENPMU, cpu,
+                                            xen_pmu_irq_handler,
+                                            IRQF_PERCPU|IRQF_NOBALANCING,
+                                            pmu_name, NULL);
+               if (rc < 0)
+                       goto fail;
+               per_cpu(xen_pmu_irq, cpu).irq = rc;
+               per_cpu(xen_pmu_irq, cpu).name = pmu_name;
+       }
+
        return 0;
 
  fail:
@@ -335,6 +357,8 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
        }
        set_cpu_sibling_map(0);
 
+       xen_pmu_init(0);
+
        if (xen_smp_intr_init(0))
                BUG();
 
@@ -429,7 +453,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
        }
 #endif
        ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
-       ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
+       ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_gfn(swapper_pg_dir));
        if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt))
                BUG();
 
@@ -462,6 +486,8 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
        if (rc)
                return rc;
 
+       xen_pmu_init(cpu);
+
        rc = xen_smp_intr_init(cpu);
        if (rc)
                return rc;
@@ -503,6 +529,7 @@ static void xen_cpu_die(unsigned int cpu)
                xen_smp_intr_free(cpu);
                xen_uninit_lock_cpu(cpu);
                xen_teardown_timer(cpu);
+               xen_pmu_finish(cpu);
        }
 }
 
index 53b4c0811f4f64a72d286fcaa2a9cab9749bacc2..feddabdab4488c54784aa5a64067d09b50b3cb22 100644 (file)
@@ -11,6 +11,7 @@
 
 #include "xen-ops.h"
 #include "mmu.h"
+#include "pmu.h"
 
 static void xen_pv_pre_suspend(void)
 {
@@ -67,16 +68,26 @@ static void xen_pv_post_suspend(int suspend_cancelled)
 
 void xen_arch_pre_suspend(void)
 {
-    if (xen_pv_domain())
-        xen_pv_pre_suspend();
+       int cpu;
+
+       for_each_online_cpu(cpu)
+               xen_pmu_finish(cpu);
+
+       if (xen_pv_domain())
+               xen_pv_pre_suspend();
 }
 
 void xen_arch_post_suspend(int cancelled)
 {
-    if (xen_pv_domain())
-        xen_pv_post_suspend(cancelled);
-    else
-        xen_hvm_post_suspend(cancelled);
+       int cpu;
+
+       if (xen_pv_domain())
+               xen_pv_post_suspend(cancelled);
+       else
+               xen_hvm_post_suspend(cancelled);
+
+       for_each_online_cpu(cpu)
+               xen_pmu_init(cpu);
 }
 
 static void xen_vcpu_notify_restore(void *data)
index 8afdfccf6086349d286ffd53ff0780d54fa6bcec..b65f59a358a220fac788fef178fa3bc2e6dbc8d6 100644 (file)
@@ -104,6 +104,8 @@ ENTRY(hypercall_page)
        ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE,      _ASM_PTR __PAGE_OFFSET)
 #else
        ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE,      _ASM_PTR __START_KERNEL_map)
+       /* Map the p2m table to a 512GB-aligned user address. */
+       ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M,       .quad PGDIR_SIZE)
 #endif
        ELFNOTE(Xen, XEN_ELFNOTE_ENTRY,          _ASM_PTR startup_xen)
        ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
index 2292721b1d103844ade9f8a7f436a649c811f77d..1399423f34183dc3bb899180e264a1198dc93192 100644 (file)
@@ -35,13 +35,20 @@ void xen_build_mfn_list_list(void);
 void xen_setup_machphys_mapping(void);
 void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
 void xen_reserve_top(void);
+void __init xen_reserve_special_pages(void);
+void __init xen_pt_check_e820(void);
 
 void xen_mm_pin_all(void);
 void xen_mm_unpin_all(void);
+#ifdef CONFIG_X86_64
+void __init xen_relocate_p2m(void);
+#endif
 
+bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size);
 unsigned long __ref xen_chk_extra_mem(unsigned long pfn);
 void __init xen_inv_extra_mem(void);
 void __init xen_remap_memory(void);
+phys_addr_t __init xen_find_free_area(phys_addr_t size);
 char * __init xen_memory_setup(void);
 char * xen_auto_xlated_memory_setup(void);
 void __init xen_arch_setup(void);
index f01cb3044e50d310112a94abb597b8ac7aed1b35..4427f38b634e62a90b629fc11d68b25d70c34ba6 100644 (file)
@@ -32,66 +32,6 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 
 #include <asm-generic/dma-mapping-common.h>
 
-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL)
-#define dma_free_noncoherent(d, s, v, h) dma_free_attrs(d, s, v, h, NULL)
-#define dma_alloc_coherent(d, s, h, f) dma_alloc_attrs(d, s, h, f, NULL)
-#define dma_free_coherent(d, s, c, h) dma_free_attrs(d, s, c, h, NULL)
-
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-                                   dma_addr_t *dma_handle, gfp_t gfp,
-                                   struct dma_attrs *attrs)
-{
-       void *ret;
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       if (dma_alloc_from_coherent(dev, size, dma_handle, &ret))
-               return ret;
-
-       ret = ops->alloc(dev, size, dma_handle, gfp, attrs);
-       debug_dma_alloc_coherent(dev, size, *dma_handle, ret);
-
-       return ret;
-}
-
-static inline void dma_free_attrs(struct device *dev, size_t size,
-                                 void *vaddr, dma_addr_t dma_handle,
-                                 struct dma_attrs *attrs)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       if (dma_release_from_coherent(dev, get_order(size), vaddr))
-               return;
-
-       ops->free(dev, size, vaddr, dma_handle, attrs);
-       debug_dma_free_coherent(dev, size, vaddr, dma_handle);
-}
-
-static inline int
-dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-       struct dma_map_ops *ops = get_dma_ops(dev);
-
-       debug_dma_mapping_error(dev, dma_addr);
-       return ops->mapping_error(dev, dma_addr);
-}
-
-static inline int
-dma_supported(struct device *dev, u64 mask)
-{
-       return 1;
-}
-
-static inline int
-dma_set_mask(struct device *dev, u64 mask)
-{
-       if(!dev->dma_mask || !dma_supported(dev, mask))
-               return -EIO;
-
-       *dev->dma_mask = mask;
-
-       return 0;
-}
-
 void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
                    enum dma_data_direction direction);
 
index c39bb6e61911e3577274bc4794a08084d4035aaa..867840f5400f4d2d584505682bf089c7cfe694cc 100644 (file)
@@ -57,6 +57,7 @@ static inline void __iomem *ioremap_cache(unsigned long offset,
        else
                BUG();
 }
+#define ioremap_cache ioremap_cache
 
 #define ioremap_wc ioremap_nocache
 #define ioremap_wt ioremap_nocache
index 515b5434fe2de84f0fe9db306fc16adf7711ae5e..ad3f276d74bcb5a21474c49c786dee82f2f1b9f6 100644 (file)
@@ -1990,7 +1990,7 @@ int bio_associate_current(struct bio *bio)
 
        get_io_context_active(ioc);
        bio->bi_ioc = ioc;
-       bio->bi_css = task_get_css(current, blkio_cgrp_id);
+       bio->bi_css = task_get_css(current, io_cgrp_id);
        return 0;
 }
 EXPORT_SYMBOL_GPL(bio_associate_current);
index d6283b3f5db50674d18ae485970f1e24ed44569d..ac8370cb25157d2ed1cd215f8b95edde0735e3de 100644 (file)
@@ -24,6 +24,7 @@
 #include <linux/genhd.h>
 #include <linux/delay.h>
 #include <linux/atomic.h>
+#include <linux/ctype.h>
 #include <linux/blk-cgroup.h>
 #include "blk.h"
 
@@ -68,9 +69,14 @@ static void blkg_free(struct blkcg_gq *blkg)
                return;
 
        for (i = 0; i < BLKCG_MAX_POLS; i++)
-               kfree(blkg->pd[i]);
+               if (blkg->pd[i])
+                       blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
 
-       blk_exit_rl(&blkg->rl);
+       if (blkg->blkcg != &blkcg_root)
+               blk_exit_rl(&blkg->rl);
+
+       blkg_rwstat_exit(&blkg->stat_ios);
+       blkg_rwstat_exit(&blkg->stat_bytes);
        kfree(blkg);
 }
 
@@ -93,6 +99,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
        if (!blkg)
                return NULL;
 
+       if (blkg_rwstat_init(&blkg->stat_bytes, gfp_mask) ||
+           blkg_rwstat_init(&blkg->stat_ios, gfp_mask))
+               goto err_free;
+
        blkg->q = q;
        INIT_LIST_HEAD(&blkg->q_node);
        blkg->blkcg = blkcg;
@@ -113,7 +123,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
                        continue;
 
                /* alloc per-policy data and attach it to blkg */
-               pd = kzalloc_node(pol->pd_size, gfp_mask, q->node);
+               pd = pol->pd_alloc_fn(gfp_mask, q->node);
                if (!pd)
                        goto err_free;
 
@@ -129,26 +139,11 @@ err_free:
        return NULL;
 }
 
-/**
- * __blkg_lookup - internal version of blkg_lookup()
- * @blkcg: blkcg of interest
- * @q: request_queue of interest
- * @update_hint: whether to update lookup hint with the result or not
- *
- * This is internal version and shouldn't be used by policy
- * implementations.  Looks up blkgs for the @blkcg - @q pair regardless of
- * @q's bypass state.  If @update_hint is %true, the caller should be
- * holding @q->queue_lock and lookup hint is updated on success.
- */
-struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
-                              bool update_hint)
+struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
+                                     struct request_queue *q, bool update_hint)
 {
        struct blkcg_gq *blkg;
 
-       blkg = rcu_dereference(blkcg->blkg_hint);
-       if (blkg && blkg->q == q)
-               return blkg;
-
        /*
         * Hint didn't match.  Look up from the radix tree.  Note that the
         * hint can only be updated under queue_lock as otherwise @blkg
@@ -166,29 +161,11 @@ struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
 
        return NULL;
 }
-
-/**
- * blkg_lookup - lookup blkg for the specified blkcg - q pair
- * @blkcg: blkcg of interest
- * @q: request_queue of interest
- *
- * Lookup blkg for the @blkcg - @q pair.  This function should be called
- * under RCU read lock and is guaranteed to return %NULL if @q is bypassing
- * - see blk_queue_bypass_start() for details.
- */
-struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q)
-{
-       WARN_ON_ONCE(!rcu_read_lock_held());
-
-       if (unlikely(blk_queue_bypass(q)))
-               return NULL;
-       return __blkg_lookup(blkcg, q, false);
-}
-EXPORT_SYMBOL_GPL(blkg_lookup);
+EXPORT_SYMBOL_GPL(blkg_lookup_slowpath);
 
 /*
  * If @new_blkg is %NULL, this function tries to allocate a new one as
- * necessary using %GFP_ATOMIC.  @new_blkg is always consumed on return.
+ * necessary using %GFP_NOWAIT.  @new_blkg is always consumed on return.
  */
 static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
                                    struct request_queue *q,
@@ -203,12 +180,12 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
 
        /* blkg holds a reference to blkcg */
        if (!css_tryget_online(&blkcg->css)) {
-               ret = -EINVAL;
+               ret = -ENODEV;
                goto err_free_blkg;
        }
 
        wb_congested = wb_congested_get_create(&q->backing_dev_info,
-                                              blkcg->css.id, GFP_ATOMIC);
+                                              blkcg->css.id, GFP_NOWAIT);
        if (!wb_congested) {
                ret = -ENOMEM;
                goto err_put_css;
@@ -216,7 +193,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
 
        /* allocate */
        if (!new_blkg) {
-               new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC);
+               new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT);
                if (unlikely(!new_blkg)) {
                        ret = -ENOMEM;
                        goto err_put_congested;
@@ -229,7 +206,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
        if (blkcg_parent(blkcg)) {
                blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
                if (WARN_ON_ONCE(!blkg->parent)) {
-                       ret = -EINVAL;
+                       ret = -ENODEV;
                        goto err_put_congested;
                }
                blkg_get(blkg->parent);
@@ -240,7 +217,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
                struct blkcg_policy *pol = blkcg_policy[i];
 
                if (blkg->pd[i] && pol->pd_init_fn)
-                       pol->pd_init_fn(blkg);
+                       pol->pd_init_fn(blkg->pd[i]);
        }
 
        /* insert */
@@ -254,7 +231,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
                        struct blkcg_policy *pol = blkcg_policy[i];
 
                        if (blkg->pd[i] && pol->pd_online_fn)
-                               pol->pd_online_fn(blkg);
+                               pol->pd_online_fn(blkg->pd[i]);
                }
        }
        blkg->online = true;
@@ -303,7 +280,7 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
         * we shouldn't allow anything to go through for a bypassing queue.
         */
        if (unlikely(blk_queue_bypass(q)))
-               return ERR_PTR(blk_queue_dying(q) ? -EINVAL : -EBUSY);
+               return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY);
 
        blkg = __blkg_lookup(blkcg, q, true);
        if (blkg)
@@ -327,11 +304,11 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
                        return blkg;
        }
 }
-EXPORT_SYMBOL_GPL(blkg_lookup_create);
 
 static void blkg_destroy(struct blkcg_gq *blkg)
 {
        struct blkcg *blkcg = blkg->blkcg;
+       struct blkcg_gq *parent = blkg->parent;
        int i;
 
        lockdep_assert_held(blkg->q->queue_lock);
@@ -345,8 +322,14 @@ static void blkg_destroy(struct blkcg_gq *blkg)
                struct blkcg_policy *pol = blkcg_policy[i];
 
                if (blkg->pd[i] && pol->pd_offline_fn)
-                       pol->pd_offline_fn(blkg);
+                       pol->pd_offline_fn(blkg->pd[i]);
+       }
+
+       if (parent) {
+               blkg_rwstat_add_aux(&parent->stat_bytes, &blkg->stat_bytes);
+               blkg_rwstat_add_aux(&parent->stat_ios, &blkg->stat_ios);
        }
+
        blkg->online = false;
 
        radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
@@ -400,15 +383,6 @@ static void blkg_destroy_all(struct request_queue *q)
 void __blkg_release_rcu(struct rcu_head *rcu_head)
 {
        struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head);
-       int i;
-
-       /* tell policies that this one is being freed */
-       for (i = 0; i < BLKCG_MAX_POLS; i++) {
-               struct blkcg_policy *pol = blkcg_policy[i];
-
-               if (blkg->pd[i] && pol->pd_exit_fn)
-                       pol->pd_exit_fn(blkg);
-       }
 
        /* release the blkcg and parent blkg refs this blkg has been holding */
        css_put(&blkg->blkcg->css);
@@ -472,12 +446,14 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css,
         * anyway.  If you get hit by a race, retry.
         */
        hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
+               blkg_rwstat_reset(&blkg->stat_bytes);
+               blkg_rwstat_reset(&blkg->stat_ios);
+
                for (i = 0; i < BLKCG_MAX_POLS; i++) {
                        struct blkcg_policy *pol = blkcg_policy[i];
 
-                       if (blkcg_policy_enabled(blkg->q, pol) &&
-                           pol->pd_reset_stats_fn)
-                               pol->pd_reset_stats_fn(blkg);
+                       if (blkg->pd[i] && pol->pd_reset_stats_fn)
+                               pol->pd_reset_stats_fn(blkg->pd[i]);
                }
        }
 
@@ -486,13 +462,14 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css,
        return 0;
 }
 
-static const char *blkg_dev_name(struct blkcg_gq *blkg)
+const char *blkg_dev_name(struct blkcg_gq *blkg)
 {
        /* some drivers (floppy) instantiate a queue w/o disk registered */
        if (blkg->q->backing_dev_info.dev)
                return dev_name(blkg->q->backing_dev_info.dev);
        return NULL;
 }
+EXPORT_SYMBOL_GPL(blkg_dev_name);
 
 /**
  * blkcg_print_blkgs - helper for printing per-blkg data
@@ -581,9 +558,10 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
 
        for (i = 0; i < BLKG_RWSTAT_NR; i++)
                seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
-                          (unsigned long long)rwstat->cnt[i]);
+                          (unsigned long long)atomic64_read(&rwstat->aux_cnt[i]));
 
-       v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE];
+       v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) +
+               atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]);
        seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
        return v;
 }
@@ -620,31 +598,122 @@ u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
 }
 EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
 
+static u64 blkg_prfill_rwstat_field(struct seq_file *sf,
+                                   struct blkg_policy_data *pd, int off)
+{
+       struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd->blkg + off);
+
+       return __blkg_prfill_rwstat(sf, pd, &rwstat);
+}
+
+/**
+ * blkg_print_stat_bytes - seq_show callback for blkg->stat_bytes
+ * @sf: seq_file to print to
+ * @v: unused
+ *
+ * To be used as cftype->seq_show to print blkg->stat_bytes.
+ * cftype->private must be set to the blkcg_policy.
+ */
+int blkg_print_stat_bytes(struct seq_file *sf, void *v)
+{
+       blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+                         blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
+                         offsetof(struct blkcg_gq, stat_bytes), true);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(blkg_print_stat_bytes);
+
+/**
+ * blkg_print_stat_ios - seq_show callback for blkg->stat_ios
+ * @sf: seq_file to print to
+ * @v: unused
+ *
+ * To be used as cftype->seq_show to print blkg->stat_ios.  cftype->private
+ * must be set to the blkcg_policy.
+ */
+int blkg_print_stat_ios(struct seq_file *sf, void *v)
+{
+       blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+                         blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
+                         offsetof(struct blkcg_gq, stat_ios), true);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(blkg_print_stat_ios);
+
+static u64 blkg_prfill_rwstat_field_recursive(struct seq_file *sf,
+                                             struct blkg_policy_data *pd,
+                                             int off)
+{
+       struct blkg_rwstat rwstat = blkg_rwstat_recursive_sum(pd->blkg,
+                                                             NULL, off);
+       return __blkg_prfill_rwstat(sf, pd, &rwstat);
+}
+
+/**
+ * blkg_print_stat_bytes_recursive - recursive version of blkg_print_stat_bytes
+ * @sf: seq_file to print to
+ * @v: unused
+ */
+int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v)
+{
+       blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+                         blkg_prfill_rwstat_field_recursive,
+                         (void *)seq_cft(sf)->private,
+                         offsetof(struct blkcg_gq, stat_bytes), true);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(blkg_print_stat_bytes_recursive);
+
+/**
+ * blkg_print_stat_ios_recursive - recursive version of blkg_print_stat_ios
+ * @sf: seq_file to print to
+ * @v: unused
+ */
+int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v)
+{
+       blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+                         blkg_prfill_rwstat_field_recursive,
+                         (void *)seq_cft(sf)->private,
+                         offsetof(struct blkcg_gq, stat_ios), true);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(blkg_print_stat_ios_recursive);
+
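
Editor's note: these helpers pass offsetof(struct blkcg_gq, stat_bytes) or offsetof(struct blkcg_gq, stat_ios) through cftype->private and read the counter via (void *)blkg + off, so a single prfill callback serves both fields. A minimal standalone sketch of that offset-based field access; struct stats and read_field() are invented for illustration.

#include <stddef.h>
#include <stdio.h>

/* Hypothetical stand-in for a struct with two same-typed counters. */
struct stats {
        unsigned long long stat_bytes;
        unsigned long long stat_ios;
};

/* Read whichever counter sits @off bytes into @s, as the prfill helpers do. */
static unsigned long long read_field(const struct stats *s, size_t off)
{
        return *(const unsigned long long *)((const char *)s + off);
}

int main(void)
{
        struct stats s = { .stat_bytes = 4096, .stat_ios = 7 };

        printf("%llu\n", read_field(&s, offsetof(struct stats, stat_bytes)));
        printf("%llu\n", read_field(&s, offsetof(struct stats, stat_ios)));
        return 0;
}
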
 /**
  * blkg_stat_recursive_sum - collect hierarchical blkg_stat
- * @pd: policy private data of interest
- * @off: offset to the blkg_stat in @pd
+ * @blkg: blkg of interest
+ * @pol: blkcg_policy which contains the blkg_stat
+ * @off: offset to the blkg_stat in blkg_policy_data or @blkg
+ *
+ * Collect the blkg_stat specified by @blkg, @pol and @off and all its
+ * online descendants and their aux counts.  The caller must be holding the
+ * queue lock for online tests.
  *
- * Collect the blkg_stat specified by @off from @pd and all its online
- * descendants and return the sum.  The caller must be holding the queue
- * lock for online tests.
+ * If @pol is NULL, blkg_stat is at @off bytes into @blkg; otherwise, it is
+ * at @off bytes into @blkg's blkg_policy_data of the policy.
  */
-u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off)
+u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg,
+                           struct blkcg_policy *pol, int off)
 {
-       struct blkcg_policy *pol = blkcg_policy[pd->plid];
        struct blkcg_gq *pos_blkg;
        struct cgroup_subsys_state *pos_css;
        u64 sum = 0;
 
-       lockdep_assert_held(pd->blkg->q->queue_lock);
+       lockdep_assert_held(blkg->q->queue_lock);
 
        rcu_read_lock();
-       blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) {
-               struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol);
-               struct blkg_stat *stat = (void *)pos_pd + off;
+       blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
+               struct blkg_stat *stat;
+
+               if (!pos_blkg->online)
+                       continue;
+
+               if (pol)
+                       stat = (void *)blkg_to_pd(pos_blkg, pol) + off;
+               else
+                       stat = (void *)blkg + off;
 
-               if (pos_blkg->online)
-                       sum += blkg_stat_read(stat);
+               sum += blkg_stat_read(stat) + atomic64_read(&stat->aux_cnt);
        }
        rcu_read_unlock();
 
@@ -654,37 +723,43 @@ EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum);
 
 /**
  * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat
- * @pd: policy private data of interest
- * @off: offset to the blkg_stat in @pd
+ * @blkg: blkg of interest
+ * @pol: blkcg_policy which contains the blkg_rwstat
+ * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg
+ *
+ * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its
+ * online descendants and their aux counts.  The caller must be holding the
+ * queue lock for online tests.
  *
- * Collect the blkg_rwstat specified by @off from @pd and all its online
- * descendants and return the sum.  The caller must be holding the queue
- * lock for online tests.
+ * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it
+ * is at @off bytes into @blkg's blkg_policy_data of the policy.
  */
-struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd,
-                                            int off)
+struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
+                                            struct blkcg_policy *pol, int off)
 {
-       struct blkcg_policy *pol = blkcg_policy[pd->plid];
        struct blkcg_gq *pos_blkg;
        struct cgroup_subsys_state *pos_css;
        struct blkg_rwstat sum = { };
        int i;
 
-       lockdep_assert_held(pd->blkg->q->queue_lock);
+       lockdep_assert_held(blkg->q->queue_lock);
 
        rcu_read_lock();
-       blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) {
-               struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol);
-               struct blkg_rwstat *rwstat = (void *)pos_pd + off;
-               struct blkg_rwstat tmp;
+       blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
+               struct blkg_rwstat *rwstat;
 
                if (!pos_blkg->online)
                        continue;
 
-               tmp = blkg_rwstat_read(rwstat);
+               if (pol)
+                       rwstat = (void *)blkg_to_pd(pos_blkg, pol) + off;
+               else
+                       rwstat = (void *)pos_blkg + off;
 
                for (i = 0; i < BLKG_RWSTAT_NR; i++)
-                       sum.cnt[i] += tmp.cnt[i];
+                       atomic64_add(atomic64_read(&rwstat->aux_cnt[i]) +
+                               percpu_counter_sum_positive(&rwstat->cpu_cnt[i]),
+                               &sum.aux_cnt[i]);
        }
        rcu_read_unlock();
 
@@ -700,29 +775,34 @@ EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum);
  * @ctx: blkg_conf_ctx to be filled
  *
  * Parse per-blkg config update from @input and initialize @ctx with the
- * result.  @ctx->blkg points to the blkg to be updated and @ctx->v the new
- * value.  This function returns with RCU read lock and queue lock held and
- * must be paired with blkg_conf_finish().
+ * result.  @ctx->blkg points to the blkg to be updated and @ctx->body the
+ * part of @input following MAJ:MIN.  This function returns with RCU read
+ * lock and queue lock held and must be paired with blkg_conf_finish().
  */
 int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
-                  const char *input, struct blkg_conf_ctx *ctx)
+                  char *input, struct blkg_conf_ctx *ctx)
        __acquires(rcu) __acquires(disk->queue->queue_lock)
 {
        struct gendisk *disk;
        struct blkcg_gq *blkg;
        unsigned int major, minor;
-       unsigned long long v;
-       int part, ret;
+       int key_len, part, ret;
+       char *body;
 
-       if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3)
+       if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
                return -EINVAL;
 
+       body = input + key_len;
+       if (!isspace(*body))
+               return -EINVAL;
+       body = skip_spaces(body);
+
        disk = get_gendisk(MKDEV(major, minor), &part);
        if (!disk)
-               return -EINVAL;
+               return -ENODEV;
        if (part) {
                put_disk(disk);
-               return -EINVAL;
+               return -ENODEV;
        }
 
        rcu_read_lock();
@@ -731,7 +811,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
        if (blkcg_policy_enabled(disk->queue, pol))
                blkg = blkg_lookup_create(blkcg, disk->queue);
        else
-               blkg = ERR_PTR(-EINVAL);
+               blkg = ERR_PTR(-EOPNOTSUPP);
 
        if (IS_ERR(blkg)) {
                ret = PTR_ERR(blkg);
@@ -753,7 +833,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 
        ctx->disk = disk;
        ctx->blkg = blkg;
-       ctx->v = v;
+       ctx->body = body;
        return 0;
 }
 EXPORT_SYMBOL_GPL(blkg_conf_prep);
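
Editor's note: blkg_conf_prep() now splits its input into a MAJ:MIN key and an opaque body: sscanf() with %n reports how many characters the key consumed, the next character must be whitespace, and skip_spaces() advances to the body that each policy parses itself. A hedged userspace sketch of the same split; parse_conf() is invented and a small loop stands in for skip_spaces().

#include <ctype.h>
#include <stdio.h>

/* Split "MAJ:MIN <body>" into its parts; returns 0 on success, -1 on error. */
static int parse_conf(char *input, unsigned *major, unsigned *minor,
                      char **body)
{
        int key_len;

        if (sscanf(input, "%u:%u%n", major, minor, &key_len) != 2)
                return -1;

        *body = input + key_len;
        if (!isspace((unsigned char)**body))
                return -1;              /* key must be followed by whitespace */
        while (isspace((unsigned char)**body))
                (*body)++;              /* equivalent of skip_spaces() */
        return 0;
}

int main(void)
{
        char line[] = "8:16  max 1048576";
        unsigned major, minor;
        char *body;

        if (!parse_conf(line, &major, &minor, &body))
                printf("dev %u:%u body='%s'\n", major, minor, body);
        return 0;
}
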
@@ -774,7 +854,54 @@ void blkg_conf_finish(struct blkg_conf_ctx *ctx)
 }
 EXPORT_SYMBOL_GPL(blkg_conf_finish);
 
+static int blkcg_print_stat(struct seq_file *sf, void *v)
+{
+       struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+       struct blkcg_gq *blkg;
+
+       rcu_read_lock();
+
+       hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
+               const char *dname;
+               struct blkg_rwstat rwstat;
+               u64 rbytes, wbytes, rios, wios;
+
+               dname = blkg_dev_name(blkg);
+               if (!dname)
+                       continue;
+
+               spin_lock_irq(blkg->q->queue_lock);
+
+               rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
+                                       offsetof(struct blkcg_gq, stat_bytes));
+               rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
+               wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
+
+               rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
+                                       offsetof(struct blkcg_gq, stat_ios));
+               rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
+               wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
+
+               spin_unlock_irq(blkg->q->queue_lock);
+
+               if (rbytes || wbytes || rios || wios)
+                       seq_printf(sf, "%s rbytes=%llu wbytes=%llu rios=%llu wios=%llu\n",
+                                  dname, rbytes, wbytes, rios, wios);
+       }
+
+       rcu_read_unlock();
+       return 0;
+}
+
 struct cftype blkcg_files[] = {
+       {
+               .name = "stat",
+               .seq_show = blkcg_print_stat,
+       },
+       { }     /* terminate */
+};
+
+struct cftype blkcg_legacy_files[] = {
        {
                .name = "reset_stats",
                .write_u64 = blkcg_reset_stats,
@@ -822,18 +949,19 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css)
 static void blkcg_css_free(struct cgroup_subsys_state *css)
 {
        struct blkcg *blkcg = css_to_blkcg(css);
+       int i;
 
        mutex_lock(&blkcg_pol_mutex);
+
        list_del(&blkcg->all_blkcgs_node);
-       mutex_unlock(&blkcg_pol_mutex);
 
-       if (blkcg != &blkcg_root) {
-               int i;
+       for (i = 0; i < BLKCG_MAX_POLS; i++)
+               if (blkcg->cpd[i])
+                       blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
 
-               for (i = 0; i < BLKCG_MAX_POLS; i++)
-                       kfree(blkcg->pd[i]);
-               kfree(blkcg);
-       }
+       mutex_unlock(&blkcg_pol_mutex);
+
+       kfree(blkcg);
 }
 
 static struct cgroup_subsys_state *
@@ -847,13 +975,12 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 
        if (!parent_css) {
                blkcg = &blkcg_root;
-               goto done;
-       }
-
-       blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
-       if (!blkcg) {
-               ret = ERR_PTR(-ENOMEM);
-               goto free_blkcg;
+       } else {
+               blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
+               if (!blkcg) {
+                       ret = ERR_PTR(-ENOMEM);
+                       goto free_blkcg;
+               }
        }
 
        for (i = 0; i < BLKCG_MAX_POLS ; i++) {
@@ -866,23 +993,23 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
                 * check if the policy requires any specific per-cgroup
                 * data: if it does, allocate and initialize it.
                 */
-               if (!pol || !pol->cpd_size)
+               if (!pol || !pol->cpd_alloc_fn)
                        continue;
 
-               BUG_ON(blkcg->pd[i]);
-               cpd = kzalloc(pol->cpd_size, GFP_KERNEL);
+               cpd = pol->cpd_alloc_fn(GFP_KERNEL);
                if (!cpd) {
                        ret = ERR_PTR(-ENOMEM);
                        goto free_pd_blkcg;
                }
-               blkcg->pd[i] = cpd;
+               blkcg->cpd[i] = cpd;
+               cpd->blkcg = blkcg;
                cpd->plid = i;
-               pol->cpd_init_fn(blkcg);
+               if (pol->cpd_init_fn)
+                       pol->cpd_init_fn(cpd);
        }
 
-done:
        spin_lock_init(&blkcg->lock);
-       INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
+       INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT);
        INIT_HLIST_HEAD(&blkcg->blkg_list);
 #ifdef CONFIG_CGROUP_WRITEBACK
        INIT_LIST_HEAD(&blkcg->cgwb_list);
@@ -894,7 +1021,8 @@ done:
 
 free_pd_blkcg:
        for (i--; i >= 0; i--)
-               kfree(blkcg->pd[i]);
+               if (blkcg->cpd[i])
+                       blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
 free_blkcg:
        kfree(blkcg);
        mutex_unlock(&blkcg_pol_mutex);
@@ -938,7 +1066,7 @@ int blkcg_init_queue(struct request_queue *q)
                radix_tree_preload_end();
 
        if (IS_ERR(blkg)) {
-               kfree(new_blkg);
+               blkg_free(new_blkg);
                return PTR_ERR(blkg);
        }
 
@@ -1015,12 +1143,35 @@ static int blkcg_can_attach(struct cgroup_subsys_state *css,
        return ret;
 }
 
-struct cgroup_subsys blkio_cgrp_subsys = {
+static void blkcg_bind(struct cgroup_subsys_state *root_css)
+{
+       int i;
+
+       mutex_lock(&blkcg_pol_mutex);
+
+       for (i = 0; i < BLKCG_MAX_POLS; i++) {
+               struct blkcg_policy *pol = blkcg_policy[i];
+               struct blkcg *blkcg;
+
+               if (!pol || !pol->cpd_bind_fn)
+                       continue;
+
+               list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node)
+                       if (blkcg->cpd[pol->plid])
+                               pol->cpd_bind_fn(blkcg->cpd[pol->plid]);
+       }
+       mutex_unlock(&blkcg_pol_mutex);
+}
+
+struct cgroup_subsys io_cgrp_subsys = {
        .css_alloc = blkcg_css_alloc,
        .css_offline = blkcg_css_offline,
        .css_free = blkcg_css_free,
        .can_attach = blkcg_can_attach,
-       .legacy_cftypes = blkcg_files,
+       .bind = blkcg_bind,
+       .dfl_cftypes = blkcg_files,
+       .legacy_cftypes = blkcg_legacy_files,
+       .legacy_name = "blkio",
 #ifdef CONFIG_MEMCG
        /*
         * This ensures that, if available, memcg is automatically enabled
@@ -1030,7 +1181,7 @@ struct cgroup_subsys blkio_cgrp_subsys = {
        .depends_on = 1 << memory_cgrp_id,
 #endif
 };
-EXPORT_SYMBOL_GPL(blkio_cgrp_subsys);
+EXPORT_SYMBOL_GPL(io_cgrp_subsys);
 
 /**
  * blkcg_activate_policy - activate a blkcg policy on a request_queue
@@ -1051,65 +1202,54 @@ EXPORT_SYMBOL_GPL(blkio_cgrp_subsys);
 int blkcg_activate_policy(struct request_queue *q,
                          const struct blkcg_policy *pol)
 {
-       LIST_HEAD(pds);
+       struct blkg_policy_data *pd_prealloc = NULL;
        struct blkcg_gq *blkg;
-       struct blkg_policy_data *pd, *nd;
-       int cnt = 0, ret;
+       int ret;
 
        if (blkcg_policy_enabled(q, pol))
                return 0;
 
-       /* count and allocate policy_data for all existing blkgs */
        blk_queue_bypass_start(q);
-       spin_lock_irq(q->queue_lock);
-       list_for_each_entry(blkg, &q->blkg_list, q_node)
-               cnt++;
-       spin_unlock_irq(q->queue_lock);
-
-       /* allocate per-blkg policy data for all existing blkgs */
-       while (cnt--) {
-               pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node);
-               if (!pd) {
+pd_prealloc:
+       if (!pd_prealloc) {
+               pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node);
+               if (!pd_prealloc) {
                        ret = -ENOMEM;
-                       goto out_free;
+                       goto out_bypass_end;
                }
-               list_add_tail(&pd->alloc_node, &pds);
        }
 
-       /*
-        * Install the allocated pds and cpds. With @q bypassing, no new blkg
-        * should have been created while the queue lock was dropped.
-        */
        spin_lock_irq(q->queue_lock);
 
        list_for_each_entry(blkg, &q->blkg_list, q_node) {
-               if (WARN_ON(list_empty(&pds))) {
-                       /* umm... this shouldn't happen, just abort */
-                       ret = -ENOMEM;
-                       goto out_unlock;
-               }
-               pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node);
-               list_del_init(&pd->alloc_node);
+               struct blkg_policy_data *pd;
 
-               /* grab blkcg lock too while installing @pd on @blkg */
-               spin_lock(&blkg->blkcg->lock);
+               if (blkg->pd[pol->plid])
+                       continue;
+
+               pd = pol->pd_alloc_fn(GFP_NOWAIT, q->node);
+               if (!pd)
+                       swap(pd, pd_prealloc);
+               if (!pd) {
+                       spin_unlock_irq(q->queue_lock);
+                       goto pd_prealloc;
+               }
 
                blkg->pd[pol->plid] = pd;
                pd->blkg = blkg;
                pd->plid = pol->plid;
-               pol->pd_init_fn(blkg);
-
-               spin_unlock(&blkg->blkcg->lock);
+               if (pol->pd_init_fn)
+                       pol->pd_init_fn(pd);
        }
 
        __set_bit(pol->plid, q->blkcg_pols);
        ret = 0;
-out_unlock:
+
        spin_unlock_irq(q->queue_lock);
-out_free:
+out_bypass_end:
        blk_queue_bypass_end(q);
-       list_for_each_entry_safe(pd, nd, &pds, alloc_node)
-               kfree(pd);
+       if (pd_prealloc)
+               pol->pd_free_fn(pd_prealloc);
        return ret;
 }
 EXPORT_SYMBOL_GPL(blkcg_activate_policy);
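
Note on the hunk above: blkcg_activate_policy() no longer counts blkgs and bulk-allocates pd's up front. It now allocates each pd with GFP_NOWAIT while holding the queue lock and, when that fails, falls back to a single GFP_KERNEL preallocation made with the lock dropped, then rescans. A user-space distillation of that retry pattern ("item", nowait_alloc() and blocking_alloc() are illustrative stand-ins, not kernel API):

/*
 * Sketch of the prealloc-and-retry pattern used by the rewritten
 * blkcg_activate_policy(): allocate opportunistically without blocking
 * while the lock is held; on failure fall back to a single item that was
 * preallocated with the lock dropped, then refill and rescan.
 */
#include <stdlib.h>
#include <pthread.h>

struct item { void *pd; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/* stand-in for pd_alloc_fn(GFP_NOWAIT, ...): may fail */
static void *nowait_alloc(void) { return rand() % 4 ? malloc(64) : NULL; }
/* stand-in for pd_alloc_fn(GFP_KERNEL, ...): may block, assumed reliable */
static void *blocking_alloc(void) { return malloc(64); }

static int activate(struct item *items, int nr)
{
	void *prealloc = NULL;
	int i;

retry:
	if (!prealloc) {
		prealloc = blocking_alloc();	/* lock not held here */
		if (!prealloc)
			return -1;
	}

	pthread_mutex_lock(&lock);
	for (i = 0; i < nr; i++) {
		void *pd;

		if (items[i].pd)	/* done on an earlier pass, skip */
			continue;

		pd = nowait_alloc();	/* opportunistic, non-blocking */
		if (!pd) {		/* fall back to the preallocation */
			pd = prealloc;
			prealloc = NULL;
		}
		if (!pd) {		/* prealloc already consumed */
			pthread_mutex_unlock(&lock);
			goto retry;	/* refill outside the lock, rescan */
		}
		items[i].pd = pd;
	}
	pthread_mutex_unlock(&lock);

	free(prealloc);			/* unused leftover, if any */
	return 0;
}

int main(void)
{
	struct item items[8] = { { NULL } };
	int ret = activate(items, 8), i;

	for (i = 0; i < 8; i++)
		free(items[i].pd);
	return ret ? 1 : 0;
}

Already-populated entries are skipped on the rescan, which is why the kernel loop checks blkg->pd[pol->plid] before allocating.
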
@@ -1139,13 +1279,12 @@ void blkcg_deactivate_policy(struct request_queue *q,
                /* grab blkcg lock too while removing @pd from @blkg */
                spin_lock(&blkg->blkcg->lock);
 
-               if (pol->pd_offline_fn)
-                       pol->pd_offline_fn(blkg);
-               if (pol->pd_exit_fn)
-                       pol->pd_exit_fn(blkg);
-
-               kfree(blkg->pd[pol->plid]);
-               blkg->pd[pol->plid] = NULL;
+               if (blkg->pd[pol->plid]) {
+                       if (pol->pd_offline_fn)
+                               pol->pd_offline_fn(blkg->pd[pol->plid]);
+                       pol->pd_free_fn(blkg->pd[pol->plid]);
+                       blkg->pd[pol->plid] = NULL;
+               }
 
                spin_unlock(&blkg->blkcg->lock);
        }
@@ -1167,9 +1306,6 @@ int blkcg_policy_register(struct blkcg_policy *pol)
        struct blkcg *blkcg;
        int i, ret;
 
-       if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data)))
-               return -EINVAL;
-
        mutex_lock(&blkcg_pol_register_mutex);
        mutex_lock(&blkcg_pol_mutex);
 
@@ -1186,36 +1322,42 @@ int blkcg_policy_register(struct blkcg_policy *pol)
        blkcg_policy[pol->plid] = pol;
 
        /* allocate and install cpd's */
-       if (pol->cpd_size) {
+       if (pol->cpd_alloc_fn) {
                list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
                        struct blkcg_policy_data *cpd;
 
-                       cpd = kzalloc(pol->cpd_size, GFP_KERNEL);
+                       cpd = pol->cpd_alloc_fn(GFP_KERNEL);
                        if (!cpd) {
                                mutex_unlock(&blkcg_pol_mutex);
                                goto err_free_cpds;
                        }
 
-                       blkcg->pd[pol->plid] = cpd;
+                       blkcg->cpd[pol->plid] = cpd;
+                       cpd->blkcg = blkcg;
                        cpd->plid = pol->plid;
-                       pol->cpd_init_fn(blkcg);
+                       pol->cpd_init_fn(cpd);
                }
        }
 
        mutex_unlock(&blkcg_pol_mutex);
 
        /* everything is in place, add intf files for the new policy */
-       if (pol->cftypes)
-               WARN_ON(cgroup_add_legacy_cftypes(&blkio_cgrp_subsys,
-                                                 pol->cftypes));
+       if (pol->dfl_cftypes)
+               WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys,
+                                              pol->dfl_cftypes));
+       if (pol->legacy_cftypes)
+               WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys,
+                                                 pol->legacy_cftypes));
        mutex_unlock(&blkcg_pol_register_mutex);
        return 0;
 
 err_free_cpds:
-       if (pol->cpd_size) {
+       if (pol->cpd_alloc_fn) {
                list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
-                       kfree(blkcg->pd[pol->plid]);
-                       blkcg->pd[pol->plid] = NULL;
+                       if (blkcg->cpd[pol->plid]) {
+                               pol->cpd_free_fn(blkcg->cpd[pol->plid]);
+                               blkcg->cpd[pol->plid] = NULL;
+                       }
                }
        }
        blkcg_policy[pol->plid] = NULL;
@@ -1242,16 +1384,20 @@ void blkcg_policy_unregister(struct blkcg_policy *pol)
                goto out_unlock;
 
        /* kill the intf files first */
-       if (pol->cftypes)
-               cgroup_rm_cftypes(pol->cftypes);
+       if (pol->dfl_cftypes)
+               cgroup_rm_cftypes(pol->dfl_cftypes);
+       if (pol->legacy_cftypes)
+               cgroup_rm_cftypes(pol->legacy_cftypes);
 
        /* remove cpds and unregister */
        mutex_lock(&blkcg_pol_mutex);
 
-       if (pol->cpd_size) {
+       if (pol->cpd_alloc_fn) {
                list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
-                       kfree(blkcg->pd[pol->plid]);
-                       blkcg->pd[pol->plid] = NULL;
+                       if (blkcg->cpd[pol->plid]) {
+                               pol->cpd_free_fn(blkcg->cpd[pol->plid]);
+                               blkcg->cpd[pol->plid] = NULL;
+                       }
                }
        }
        blkcg_policy[pol->plid] = NULL;
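
Taken together, the blk-cgroup core hunks above replace the pd_size/cpd_size plus kzalloc scheme with per-policy alloc/init/free callbacks, and split the interface files into dfl_cftypes (unified hierarchy) and legacy_cftypes. A minimal sketch of what a policy now provides; the "foo" names, extra fields and omitted callbacks (pd_init_fn, pd_offline_fn, the cftype arrays) are illustrative only, not part of this patch:

/*
 * Kernel-side sketch only: the shape of a policy against the reworked
 * interface shown above.  struct blkcg_policy{,_data} and
 * struct blkg_policy_data come from the in-tree blk-cgroup header.
 */
#include <linux/kernel.h>	/* container_of */
#include <linux/slab.h>		/* kzalloc, kzalloc_node, kfree */

struct foo_group_data {
	struct blkcg_policy_data cpd;	/* must be the first member */
	unsigned int dfl_limit;
};

struct foo_group {
	struct blkg_policy_data pd;	/* must be the first member */
	unsigned int limit;
};

static struct blkcg_policy_data *foo_cpd_alloc(gfp_t gfp)
{
	struct foo_group_data *fgd = kzalloc(sizeof(*fgd), gfp);

	return fgd ? &fgd->cpd : NULL;
}

static void foo_cpd_free(struct blkcg_policy_data *cpd)
{
	kfree(container_of(cpd, struct foo_group_data, cpd));
}

static struct blkg_policy_data *foo_pd_alloc(gfp_t gfp, int node)
{
	struct foo_group *fg = kzalloc_node(sizeof(*fg), gfp, node);

	return fg ? &fg->pd : NULL;
}

static void foo_pd_free(struct blkg_policy_data *pd)
{
	kfree(container_of(pd, struct foo_group, pd));
}

static struct blkcg_policy blkcg_policy_foo = {
	.cpd_alloc_fn	= foo_cpd_alloc,
	.cpd_free_fn	= foo_cpd_free,
	.pd_alloc_fn	= foo_pd_alloc,
	.pd_free_fn	= foo_pd_free,
	/* .dfl_cftypes / .legacy_cftypes would list the interface files */
};

Such a policy is still registered with blkcg_policy_register(), which (per the hunks above) now allocates and initializes a cpd for every existing blkcg and adds both the default-hierarchy and legacy interface files.
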
index 60912e983f16a071458b25baa4eccc2e21769bd5..2eb722d48773cb8a8de49d58b934eed830755da7 100644 (file)
@@ -1888,8 +1888,8 @@ generic_make_request_checks(struct bio *bio)
         */
        create_io_context(GFP_ATOMIC, q->node);
 
-       if (blk_throtl_bio(q, bio))
-               return false;   /* throttled, will be resubmitted later */
+       if (!blkcg_bio_issue_check(q, bio))
+               return false;
 
        trace_block_bio_queue(q, bio);
        return true;
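
generic_make_request_checks() now goes through blkcg_bio_issue_check() instead of calling blk_throtl_bio() directly. That helper is not part of this excerpt; judging from the new blk_throtl_bio(q, blkg, bio) signature and the rcu_read_lock_held() assertion further down, it plausibly resolves the bio's blkg under RCU and hands it to the throttler, roughly along these lines (a sketch only; the real helper also handles blkg creation and the generic stat_bytes/stat_ios accounting, so details may differ):

/*
 * Not part of this diff: a plausible shape for blkcg_bio_issue_check(),
 * inferred from the new blk_throtl_bio(q, blkg, bio) signature.
 */
static inline bool blkcg_bio_issue_check(struct request_queue *q,
					 struct bio *bio)
{
	struct blkcg *blkcg;
	struct blkcg_gq *blkg;
	bool throtl;

	rcu_read_lock();
	blkcg = bio_blkcg(bio);
	blkg = blkg_lookup(blkcg, q);	/* may be NULL; the throttler falls
					   back to q->root_blkg */
	throtl = blk_throtl_bio(q, blkg, bio);
	rcu_read_unlock();

	return !throtl;			/* false means the bio was throttled */
}
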
index b23193518ac7a964a9f0d75b26c6fa844f6eea40..c75a2636dd404840ea13000ea5747e0d44ddd7be 100644 (file)
@@ -83,14 +83,6 @@ enum tg_state_flags {
 
 #define rb_entry_tg(node)      rb_entry((node), struct throtl_grp, rb_node)
 
-/* Per-cpu group stats */
-struct tg_stats_cpu {
-       /* total bytes transferred */
-       struct blkg_rwstat              service_bytes;
-       /* total IOs serviced, post merge */
-       struct blkg_rwstat              serviced;
-};
-
 struct throtl_grp {
        /* must be the first member */
        struct blkg_policy_data pd;
@@ -141,12 +133,6 @@ struct throtl_grp {
        /* When did we start a new slice */
        unsigned long slice_start[2];
        unsigned long slice_end[2];
-
-       /* Per cpu stats pointer */
-       struct tg_stats_cpu __percpu *stats_cpu;
-
-       /* List of tgs waiting for per cpu stats memory to be allocated */
-       struct list_head stats_alloc_node;
 };
 
 struct throtl_data
@@ -168,13 +154,6 @@ struct throtl_data
        struct work_struct dispatch_work;
 };
 
-/* list and work item to allocate percpu group stats */
-static DEFINE_SPINLOCK(tg_stats_alloc_lock);
-static LIST_HEAD(tg_stats_alloc_list);
-
-static void tg_stats_alloc_fn(struct work_struct *);
-static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn);
-
 static void throtl_pending_timer_fn(unsigned long arg);
 
 static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
@@ -192,11 +171,6 @@ static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
        return pd_to_blkg(&tg->pd);
 }
 
-static inline struct throtl_grp *td_root_tg(struct throtl_data *td)
-{
-       return blkg_to_tg(td->queue->root_blkg);
-}
-
 /**
  * sq_to_tg - return the throl_grp the specified service queue belongs to
  * @sq: the throtl_service_queue of interest
@@ -256,53 +230,6 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
        }                                                               \
 } while (0)
 
-static void tg_stats_init(struct tg_stats_cpu *tg_stats)
-{
-       blkg_rwstat_init(&tg_stats->service_bytes);
-       blkg_rwstat_init(&tg_stats->serviced);
-}
-
-/*
- * Worker for allocating per cpu stat for tgs. This is scheduled on the
- * system_wq once there are some groups on the alloc_list waiting for
- * allocation.
- */
-static void tg_stats_alloc_fn(struct work_struct *work)
-{
-       static struct tg_stats_cpu *stats_cpu;  /* this fn is non-reentrant */
-       struct delayed_work *dwork = to_delayed_work(work);
-       bool empty = false;
-
-alloc_stats:
-       if (!stats_cpu) {
-               int cpu;
-
-               stats_cpu = alloc_percpu(struct tg_stats_cpu);
-               if (!stats_cpu) {
-                       /* allocation failed, try again after some time */
-                       schedule_delayed_work(dwork, msecs_to_jiffies(10));
-                       return;
-               }
-               for_each_possible_cpu(cpu)
-                       tg_stats_init(per_cpu_ptr(stats_cpu, cpu));
-       }
-
-       spin_lock_irq(&tg_stats_alloc_lock);
-
-       if (!list_empty(&tg_stats_alloc_list)) {
-               struct throtl_grp *tg = list_first_entry(&tg_stats_alloc_list,
-                                                        struct throtl_grp,
-                                                        stats_alloc_node);
-               swap(tg->stats_cpu, stats_cpu);
-               list_del_init(&tg->stats_alloc_node);
-       }
-
-       empty = list_empty(&tg_stats_alloc_list);
-       spin_unlock_irq(&tg_stats_alloc_lock);
-       if (!empty)
-               goto alloc_stats;
-}
-
 static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
 {
        INIT_LIST_HEAD(&qn->node);
@@ -387,29 +314,46 @@ static struct bio *throtl_pop_queued(struct list_head *queued,
 }
 
 /* init a service_queue, assumes the caller zeroed it */
-static void throtl_service_queue_init(struct throtl_service_queue *sq,
-                                     struct throtl_service_queue *parent_sq)
+static void throtl_service_queue_init(struct throtl_service_queue *sq)
 {
        INIT_LIST_HEAD(&sq->queued[0]);
        INIT_LIST_HEAD(&sq->queued[1]);
        sq->pending_tree = RB_ROOT;
-       sq->parent_sq = parent_sq;
        setup_timer(&sq->pending_timer, throtl_pending_timer_fn,
                    (unsigned long)sq);
 }
 
-static void throtl_service_queue_exit(struct throtl_service_queue *sq)
+static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
 {
-       del_timer_sync(&sq->pending_timer);
+       struct throtl_grp *tg;
+       int rw;
+
+       tg = kzalloc_node(sizeof(*tg), gfp, node);
+       if (!tg)
+               return NULL;
+
+       throtl_service_queue_init(&tg->service_queue);
+
+       for (rw = READ; rw <= WRITE; rw++) {
+               throtl_qnode_init(&tg->qnode_on_self[rw], tg);
+               throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
+       }
+
+       RB_CLEAR_NODE(&tg->rb_node);
+       tg->bps[READ] = -1;
+       tg->bps[WRITE] = -1;
+       tg->iops[READ] = -1;
+       tg->iops[WRITE] = -1;
+
+       return &tg->pd;
 }
 
-static void throtl_pd_init(struct blkcg_gq *blkg)
+static void throtl_pd_init(struct blkg_policy_data *pd)
 {
-       struct throtl_grp *tg = blkg_to_tg(blkg);
+       struct throtl_grp *tg = pd_to_tg(pd);
+       struct blkcg_gq *blkg = tg_to_blkg(tg);
        struct throtl_data *td = blkg->q->td;
-       struct throtl_service_queue *parent_sq;
-       unsigned long flags;
-       int rw;
+       struct throtl_service_queue *sq = &tg->service_queue;
 
        /*
         * If on the default hierarchy, we switch to properly hierarchical
@@ -424,35 +368,10 @@ static void throtl_pd_init(struct blkcg_gq *blkg)
         * Limits of a group don't interact with limits of other groups
         * regardless of the position of the group in the hierarchy.
         */
-       parent_sq = &td->service_queue;
-
+       sq->parent_sq = &td->service_queue;
        if (cgroup_on_dfl(blkg->blkcg->css.cgroup) && blkg->parent)
-               parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
-
-       throtl_service_queue_init(&tg->service_queue, parent_sq);
-
-       for (rw = READ; rw <= WRITE; rw++) {
-               throtl_qnode_init(&tg->qnode_on_self[rw], tg);
-               throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
-       }
-
-       RB_CLEAR_NODE(&tg->rb_node);
+               sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
        tg->td = td;
-
-       tg->bps[READ] = -1;
-       tg->bps[WRITE] = -1;
-       tg->iops[READ] = -1;
-       tg->iops[WRITE] = -1;
-
-       /*
-        * Ugh... We need to perform per-cpu allocation for tg->stats_cpu
-        * but percpu allocator can't be called from IO path.  Queue tg on
-        * tg_stats_alloc_list and allocate from work item.
-        */
-       spin_lock_irqsave(&tg_stats_alloc_lock, flags);
-       list_add(&tg->stats_alloc_node, &tg_stats_alloc_list);
-       schedule_delayed_work(&tg_stats_alloc_work, 0);
-       spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
 }
 
 /*
@@ -470,83 +389,21 @@ static void tg_update_has_rules(struct throtl_grp *tg)
                                    (tg->bps[rw] != -1 || tg->iops[rw] != -1);
 }
 
-static void throtl_pd_online(struct blkcg_gq *blkg)
+static void throtl_pd_online(struct blkg_policy_data *pd)
 {
        /*
         * We don't want new groups to escape the limits of its ancestors.
         * Update has_rules[] after a new group is brought online.
         */
-       tg_update_has_rules(blkg_to_tg(blkg));
-}
-
-static void throtl_pd_exit(struct blkcg_gq *blkg)
-{
-       struct throtl_grp *tg = blkg_to_tg(blkg);
-       unsigned long flags;
-
-       spin_lock_irqsave(&tg_stats_alloc_lock, flags);
-       list_del_init(&tg->stats_alloc_node);
-       spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
-
-       free_percpu(tg->stats_cpu);
-
-       throtl_service_queue_exit(&tg->service_queue);
-}
-
-static void throtl_pd_reset_stats(struct blkcg_gq *blkg)
-{
-       struct throtl_grp *tg = blkg_to_tg(blkg);
-       int cpu;
-
-       if (tg->stats_cpu == NULL)
-               return;
-
-       for_each_possible_cpu(cpu) {
-               struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);
-
-               blkg_rwstat_reset(&sc->service_bytes);
-               blkg_rwstat_reset(&sc->serviced);
-       }
-}
-
-static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td,
-                                          struct blkcg *blkcg)
-{
-       /*
-        * This is the common case when there are no blkcgs.  Avoid lookup
-        * in this case
-        */
-       if (blkcg == &blkcg_root)
-               return td_root_tg(td);
-
-       return blkg_to_tg(blkg_lookup(blkcg, td->queue));
+       tg_update_has_rules(pd_to_tg(pd));
 }
 
-static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
-                                                 struct blkcg *blkcg)
+static void throtl_pd_free(struct blkg_policy_data *pd)
 {
-       struct request_queue *q = td->queue;
-       struct throtl_grp *tg = NULL;
-
-       /*
-        * This is the common case when there are no blkcgs.  Avoid lookup
-        * in this case
-        */
-       if (blkcg == &blkcg_root) {
-               tg = td_root_tg(td);
-       } else {
-               struct blkcg_gq *blkg;
-
-               blkg = blkg_lookup_create(blkcg, q);
-
-               /* if %NULL and @q is alive, fall back to root_tg */
-               if (!IS_ERR(blkg))
-                       tg = blkg_to_tg(blkg);
-               else if (!blk_queue_dying(q))
-                       tg = td_root_tg(td);
-       }
+       struct throtl_grp *tg = pd_to_tg(pd);
 
-       return tg;
+       del_timer_sync(&tg->service_queue.pending_timer);
+       kfree(tg);
 }
 
 static struct throtl_grp *
@@ -956,32 +813,6 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
        return 0;
 }
 
-static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes,
-                                        int rw)
-{
-       struct throtl_grp *tg = blkg_to_tg(blkg);
-       struct tg_stats_cpu *stats_cpu;
-       unsigned long flags;
-
-       /* If per cpu stats are not allocated yet, don't do any accounting. */
-       if (tg->stats_cpu == NULL)
-               return;
-
-       /*
-        * Disabling interrupts to provide mutual exclusion between two
-        * writes on same cpu. It probably is not needed for 64bit. Not
-        * optimizing that case yet.
-        */
-       local_irq_save(flags);
-
-       stats_cpu = this_cpu_ptr(tg->stats_cpu);
-
-       blkg_rwstat_add(&stats_cpu->serviced, rw, 1);
-       blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes);
-
-       local_irq_restore(flags);
-}
-
 static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
 {
        bool rw = bio_data_dir(bio);
@@ -995,17 +826,9 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
         * more than once as a throttled bio will go through blk-throtl the
         * second time when it eventually gets issued.  Set it when a bio
         * is being charged to a tg.
-        *
-        * Dispatch stats aren't recursive and each @bio should only be
-        * accounted by the @tg it was originally associated with.  Let's
-        * update the stats when setting REQ_THROTTLED for the first time
-        * which is guaranteed to be for the @bio's original tg.
         */
-       if (!(bio->bi_rw & REQ_THROTTLED)) {
+       if (!(bio->bi_rw & REQ_THROTTLED))
                bio->bi_rw |= REQ_THROTTLED;
-               throtl_update_dispatch_stats(tg_to_blkg(tg),
-                                            bio->bi_iter.bi_size, bio->bi_rw);
-       }
 }
 
 /**
@@ -1285,34 +1108,6 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
        }
 }
 
-static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
-                               struct blkg_policy_data *pd, int off)
-{
-       struct throtl_grp *tg = pd_to_tg(pd);
-       struct blkg_rwstat rwstat = { }, tmp;
-       int i, cpu;
-
-       if (tg->stats_cpu == NULL)
-               return 0;
-
-       for_each_possible_cpu(cpu) {
-               struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);
-
-               tmp = blkg_rwstat_read((void *)sc + off);
-               for (i = 0; i < BLKG_RWSTAT_NR; i++)
-                       rwstat.cnt[i] += tmp.cnt[i];
-       }
-
-       return __blkg_prfill_rwstat(sf, pd, &rwstat);
-}
-
-static int tg_print_cpu_rwstat(struct seq_file *sf, void *v)
-{
-       blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_cpu_rwstat,
-                         &blkcg_policy_throtl, seq_cft(sf)->private, true);
-       return 0;
-}
-
 static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd,
                              int off)
 {
@@ -1349,31 +1144,11 @@ static int tg_print_conf_uint(struct seq_file *sf, void *v)
        return 0;
 }
 
-static ssize_t tg_set_conf(struct kernfs_open_file *of,
-                          char *buf, size_t nbytes, loff_t off, bool is_u64)
+static void tg_conf_updated(struct throtl_grp *tg)
 {
-       struct blkcg *blkcg = css_to_blkcg(of_css(of));
-       struct blkg_conf_ctx ctx;
-       struct throtl_grp *tg;
-       struct throtl_service_queue *sq;
-       struct blkcg_gq *blkg;
+       struct throtl_service_queue *sq = &tg->service_queue;
        struct cgroup_subsys_state *pos_css;
-       int ret;
-
-       ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
-       if (ret)
-               return ret;
-
-       tg = blkg_to_tg(ctx.blkg);
-       sq = &tg->service_queue;
-
-       if (!ctx.v)
-               ctx.v = -1;
-
-       if (is_u64)
-               *(u64 *)((void *)tg + of_cft(of)->private) = ctx.v;
-       else
-               *(unsigned int *)((void *)tg + of_cft(of)->private) = ctx.v;
+       struct blkcg_gq *blkg;
 
        throtl_log(&tg->service_queue,
                   "limit change rbps=%llu wbps=%llu riops=%u wiops=%u",
@@ -1387,7 +1162,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
         * restrictions in the whole hierarchy and allows them to bypass
         * blk-throttle.
         */
-       blkg_for_each_descendant_pre(blkg, pos_css, ctx.blkg)
+       blkg_for_each_descendant_pre(blkg, pos_css, tg_to_blkg(tg))
                tg_update_has_rules(blkg_to_tg(blkg));
 
        /*
@@ -1405,9 +1180,39 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
                tg_update_disptime(tg);
                throtl_schedule_next_dispatch(sq->parent_sq, true);
        }
+}
+
+static ssize_t tg_set_conf(struct kernfs_open_file *of,
+                          char *buf, size_t nbytes, loff_t off, bool is_u64)
+{
+       struct blkcg *blkcg = css_to_blkcg(of_css(of));
+       struct blkg_conf_ctx ctx;
+       struct throtl_grp *tg;
+       int ret;
+       u64 v;
 
+       ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
+       if (ret)
+               return ret;
+
+       ret = -EINVAL;
+       if (sscanf(ctx.body, "%llu", &v) != 1)
+               goto out_finish;
+       if (!v)
+               v = -1;
+
+       tg = blkg_to_tg(ctx.blkg);
+
+       if (is_u64)
+               *(u64 *)((void *)tg + of_cft(of)->private) = v;
+       else
+               *(unsigned int *)((void *)tg + of_cft(of)->private) = v;
+
+       tg_conf_updated(tg);
+       ret = 0;
+out_finish:
        blkg_conf_finish(&ctx);
-       return nbytes;
+       return ret ?: nbytes;
 }
 
 static ssize_t tg_set_conf_u64(struct kernfs_open_file *of,
@@ -1422,7 +1227,7 @@ static ssize_t tg_set_conf_uint(struct kernfs_open_file *of,
        return tg_set_conf(of, buf, nbytes, off, false);
 }
 
-static struct cftype throtl_files[] = {
+static struct cftype throtl_legacy_files[] = {
        {
                .name = "throttle.read_bps_device",
                .private = offsetof(struct throtl_grp, bps[READ]),
@@ -1449,13 +1254,124 @@ static struct cftype throtl_files[] = {
        },
        {
                .name = "throttle.io_service_bytes",
-               .private = offsetof(struct tg_stats_cpu, service_bytes),
-               .seq_show = tg_print_cpu_rwstat,
+               .private = (unsigned long)&blkcg_policy_throtl,
+               .seq_show = blkg_print_stat_bytes,
        },
        {
                .name = "throttle.io_serviced",
-               .private = offsetof(struct tg_stats_cpu, serviced),
-               .seq_show = tg_print_cpu_rwstat,
+               .private = (unsigned long)&blkcg_policy_throtl,
+               .seq_show = blkg_print_stat_ios,
+       },
+       { }     /* terminate */
+};
+
+static u64 tg_prfill_max(struct seq_file *sf, struct blkg_policy_data *pd,
+                        int off)
+{
+       struct throtl_grp *tg = pd_to_tg(pd);
+       const char *dname = blkg_dev_name(pd->blkg);
+       char bufs[4][21] = { "max", "max", "max", "max" };
+
+       if (!dname)
+               return 0;
+       if (tg->bps[READ] == -1 && tg->bps[WRITE] == -1 &&
+           tg->iops[READ] == -1 && tg->iops[WRITE] == -1)
+               return 0;
+
+       if (tg->bps[READ] != -1)
+               snprintf(bufs[0], sizeof(bufs[0]), "%llu", tg->bps[READ]);
+       if (tg->bps[WRITE] != -1)
+               snprintf(bufs[1], sizeof(bufs[1]), "%llu", tg->bps[WRITE]);
+       if (tg->iops[READ] != -1)
+               snprintf(bufs[2], sizeof(bufs[2]), "%u", tg->iops[READ]);
+       if (tg->iops[WRITE] != -1)
+               snprintf(bufs[3], sizeof(bufs[3]), "%u", tg->iops[WRITE]);
+
+       seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s\n",
+                  dname, bufs[0], bufs[1], bufs[2], bufs[3]);
+       return 0;
+}
+
+static int tg_print_max(struct seq_file *sf, void *v)
+{
+       blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_max,
+                         &blkcg_policy_throtl, seq_cft(sf)->private, false);
+       return 0;
+}
+
+static ssize_t tg_set_max(struct kernfs_open_file *of,
+                         char *buf, size_t nbytes, loff_t off)
+{
+       struct blkcg *blkcg = css_to_blkcg(of_css(of));
+       struct blkg_conf_ctx ctx;
+       struct throtl_grp *tg;
+       u64 v[4];
+       int ret;
+
+       ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
+       if (ret)
+               return ret;
+
+       tg = blkg_to_tg(ctx.blkg);
+
+       v[0] = tg->bps[READ];
+       v[1] = tg->bps[WRITE];
+       v[2] = tg->iops[READ];
+       v[3] = tg->iops[WRITE];
+
+       while (true) {
+               char tok[27];   /* wiops=18446744073709551616 */
+               char *p;
+               u64 val = -1;
+               int len;
+
+               if (sscanf(ctx.body, "%26s%n", tok, &len) != 1)
+                       break;
+               if (tok[0] == '\0')
+                       break;
+               ctx.body += len;
+
+               ret = -EINVAL;
+               p = tok;
+               strsep(&p, "=");
+               if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max")))
+                       goto out_finish;
+
+               ret = -ERANGE;
+               if (!val)
+                       goto out_finish;
+
+               ret = -EINVAL;
+               if (!strcmp(tok, "rbps"))
+                       v[0] = val;
+               else if (!strcmp(tok, "wbps"))
+                       v[1] = val;
+               else if (!strcmp(tok, "riops"))
+                       v[2] = min_t(u64, val, UINT_MAX);
+               else if (!strcmp(tok, "wiops"))
+                       v[3] = min_t(u64, val, UINT_MAX);
+               else
+                       goto out_finish;
+       }
+
+       tg->bps[READ] = v[0];
+       tg->bps[WRITE] = v[1];
+       tg->iops[READ] = v[2];
+       tg->iops[WRITE] = v[3];
+
+       tg_conf_updated(tg);
+       ret = 0;
+out_finish:
+       blkg_conf_finish(&ctx);
+       return ret ?: nbytes;
+}
+
+static struct cftype throtl_files[] = {
+       {
+               .name = "max",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = tg_print_max,
+               .write = tg_set_max,
        },
        { }     /* terminate */
 };
@@ -1468,52 +1384,33 @@ static void throtl_shutdown_wq(struct request_queue *q)
 }
 
 static struct blkcg_policy blkcg_policy_throtl = {
-       .pd_size                = sizeof(struct throtl_grp),
-       .cftypes                = throtl_files,
+       .dfl_cftypes            = throtl_files,
+       .legacy_cftypes         = throtl_legacy_files,
 
+       .pd_alloc_fn            = throtl_pd_alloc,
        .pd_init_fn             = throtl_pd_init,
        .pd_online_fn           = throtl_pd_online,
-       .pd_exit_fn             = throtl_pd_exit,
-       .pd_reset_stats_fn      = throtl_pd_reset_stats,
+       .pd_free_fn             = throtl_pd_free,
 };
 
-bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
+bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
+                   struct bio *bio)
 {
-       struct throtl_data *td = q->td;
        struct throtl_qnode *qn = NULL;
-       struct throtl_grp *tg;
+       struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg);
        struct throtl_service_queue *sq;
        bool rw = bio_data_dir(bio);
-       struct blkcg *blkcg;
        bool throttled = false;
 
+       WARN_ON_ONCE(!rcu_read_lock_held());
+
        /* see throtl_charge_bio() */
-       if (bio->bi_rw & REQ_THROTTLED)
+       if ((bio->bi_rw & REQ_THROTTLED) || !tg->has_rules[rw])
                goto out;
 
-       /*
-        * A throtl_grp pointer retrieved under rcu can be used to access
-        * basic fields like stats and io rates. If a group has no rules,
-        * just update the dispatch stats in lockless manner and return.
-        */
-       rcu_read_lock();
-       blkcg = bio_blkcg(bio);
-       tg = throtl_lookup_tg(td, blkcg);
-       if (tg) {
-               if (!tg->has_rules[rw]) {
-                       throtl_update_dispatch_stats(tg_to_blkg(tg),
-                                       bio->bi_iter.bi_size, bio->bi_rw);
-                       goto out_unlock_rcu;
-               }
-       }
-
-       /*
-        * Either group has not been allocated yet or it is not an unlimited
-        * IO group
-        */
        spin_lock_irq(q->queue_lock);
-       tg = throtl_lookup_create_tg(td, blkcg);
-       if (unlikely(!tg))
+
+       if (unlikely(blk_queue_bypass(q)))
                goto out_unlock;
 
        sq = &tg->service_queue;
@@ -1580,8 +1477,6 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 
 out_unlock:
        spin_unlock_irq(q->queue_lock);
-out_unlock_rcu:
-       rcu_read_unlock();
 out:
        /*
         * As multiple blk-throtls may stack in the same issue path, we
@@ -1667,7 +1562,7 @@ int blk_throtl_init(struct request_queue *q)
                return -ENOMEM;
 
        INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
-       throtl_service_queue_init(&td->service_queue, NULL);
+       throtl_service_queue_init(&td->service_queue);
 
        q->td = td;
        td->queue = q;
index 838188b35a83f03696fba66cbb71728e402403bd..98614ad37c81f22e175d9455967f9f6a98b78676 100644 (file)
@@ -272,15 +272,10 @@ static inline struct io_context *create_io_context(gfp_t gfp_mask, int node)
  * Internal throttling interface
  */
 #ifdef CONFIG_BLK_DEV_THROTTLING
-extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio);
 extern void blk_throtl_drain(struct request_queue *q);
 extern int blk_throtl_init(struct request_queue *q);
 extern void blk_throtl_exit(struct request_queue *q);
 #else /* CONFIG_BLK_DEV_THROTTLING */
-static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
-{
-       return false;
-}
 static inline void blk_throtl_drain(struct request_queue *q) { }
 static inline int blk_throtl_init(struct request_queue *q) { return 0; }
 static inline void blk_throtl_exit(struct request_queue *q) { }
index c62bb2e650b8c741e64ead5c9f32b090cbf19730..04de88463a986384b54fa57593d2c89e7189e95c 100644 (file)
@@ -68,9 +68,9 @@ static struct kmem_cache *cfq_pool;
 #define rb_entry_cfqg(node)    rb_entry((node), struct cfq_group, rb_node)
 
 /* blkio-related constants */
-#define CFQ_WEIGHT_MIN          10
-#define CFQ_WEIGHT_MAX          1000
-#define CFQ_WEIGHT_DEFAULT      500
+#define CFQ_WEIGHT_LEGACY_MIN  10
+#define CFQ_WEIGHT_LEGACY_DFL  500
+#define CFQ_WEIGHT_LEGACY_MAX  1000
 
 struct cfq_ttime {
        unsigned long last_end_request;
@@ -177,10 +177,6 @@ enum wl_type_t {
 
 struct cfqg_stats {
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
-       /* total bytes transferred */
-       struct blkg_rwstat              service_bytes;
-       /* total IOs serviced, post merge */
-       struct blkg_rwstat              serviced;
        /* number of ios merged */
        struct blkg_rwstat              merged;
        /* total time spent on device in ns, may not be accurate w/ queueing */
@@ -189,8 +185,6 @@ struct cfqg_stats {
        struct blkg_rwstat              wait_time;
        /* number of IOs queued up */
        struct blkg_rwstat              queued;
-       /* total sectors transferred */
-       struct blkg_stat                sectors;
        /* total disk time and nr sectors dispatched by this group */
        struct blkg_stat                time;
 #ifdef CONFIG_DEBUG_BLK_CGROUP
@@ -220,7 +214,7 @@ struct cfqg_stats {
 /* Per-cgroup data */
 struct cfq_group_data {
        /* must be the first member */
-       struct blkcg_policy_data pd;
+       struct blkcg_policy_data cpd;
 
        unsigned int weight;
        unsigned int leaf_weight;
@@ -304,7 +298,11 @@ struct cfq_group {
        int dispatched;
        struct cfq_ttime ttime;
        struct cfqg_stats stats;        /* stats for this cfqg */
-       struct cfqg_stats dead_stats;   /* stats pushed from dead children */
+
+       /* async queue for each priority case */
+       struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
+       struct cfq_queue *async_idle_cfqq;
+
 };
 
 struct cfq_io_cq {
@@ -370,12 +368,6 @@ struct cfq_data {
        struct cfq_queue *active_queue;
        struct cfq_io_cq *active_cic;
 
-       /*
-        * async queue for each priority case
-        */
-       struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
-       struct cfq_queue *async_idle_cfqq;
-
        sector_t last_position;
 
        /*
@@ -401,6 +393,7 @@ struct cfq_data {
 };
 
 static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
+static void cfq_put_queue(struct cfq_queue *cfqq);
 
 static struct cfq_rb_root *st_for(struct cfq_group *cfqg,
                                            enum wl_class_t class,
@@ -612,7 +605,7 @@ static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
 static struct cfq_group_data
 *cpd_to_cfqgd(struct blkcg_policy_data *cpd)
 {
-       return cpd ? container_of(cpd, struct cfq_group_data, pd) : NULL;
+       return cpd ? container_of(cpd, struct cfq_group_data, cpd) : NULL;
 }
 
 static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
@@ -693,14 +686,6 @@ static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw)
        blkg_rwstat_add(&cfqg->stats.merged, rw, 1);
 }
 
-static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg,
-                                             uint64_t bytes, int rw)
-{
-       blkg_stat_add(&cfqg->stats.sectors, bytes >> 9);
-       blkg_rwstat_add(&cfqg->stats.serviced, rw, 1);
-       blkg_rwstat_add(&cfqg->stats.service_bytes, rw, bytes);
-}
-
 static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
                        uint64_t start_time, uint64_t io_start_time, int rw)
 {
@@ -718,8 +703,6 @@ static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
 static void cfqg_stats_reset(struct cfqg_stats *stats)
 {
        /* queued stats shouldn't be cleared */
-       blkg_rwstat_reset(&stats->service_bytes);
-       blkg_rwstat_reset(&stats->serviced);
        blkg_rwstat_reset(&stats->merged);
        blkg_rwstat_reset(&stats->service_time);
        blkg_rwstat_reset(&stats->wait_time);
@@ -736,28 +719,26 @@ static void cfqg_stats_reset(struct cfqg_stats *stats)
 }
 
 /* @to += @from */
-static void cfqg_stats_merge(struct cfqg_stats *to, struct cfqg_stats *from)
+static void cfqg_stats_add_aux(struct cfqg_stats *to, struct cfqg_stats *from)
 {
        /* queued stats shouldn't be cleared */
-       blkg_rwstat_merge(&to->service_bytes, &from->service_bytes);
-       blkg_rwstat_merge(&to->serviced, &from->serviced);
-       blkg_rwstat_merge(&to->merged, &from->merged);
-       blkg_rwstat_merge(&to->service_time, &from->service_time);
-       blkg_rwstat_merge(&to->wait_time, &from->wait_time);
-       blkg_stat_merge(&from->time, &from->time);
+       blkg_rwstat_add_aux(&to->merged, &from->merged);
+       blkg_rwstat_add_aux(&to->service_time, &from->service_time);
+       blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
+       blkg_stat_add_aux(&from->time, &from->time);
 #ifdef CONFIG_DEBUG_BLK_CGROUP
-       blkg_stat_merge(&to->unaccounted_time, &from->unaccounted_time);
-       blkg_stat_merge(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
-       blkg_stat_merge(&to->avg_queue_size_samples, &from->avg_queue_size_samples);
-       blkg_stat_merge(&to->dequeue, &from->dequeue);
-       blkg_stat_merge(&to->group_wait_time, &from->group_wait_time);
-       blkg_stat_merge(&to->idle_time, &from->idle_time);
-       blkg_stat_merge(&to->empty_time, &from->empty_time);
+       blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time);
+       blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
+       blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples);
+       blkg_stat_add_aux(&to->dequeue, &from->dequeue);
+       blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
+       blkg_stat_add_aux(&to->idle_time, &from->idle_time);
+       blkg_stat_add_aux(&to->empty_time, &from->empty_time);
 #endif
 }
 
 /*
- * Transfer @cfqg's stats to its parent's dead_stats so that the ancestors'
+ * Transfer @cfqg's stats to its parent's aux counts so that the ancestors'
  * recursive stats can still account for the amount used by this cfqg after
  * it's gone.
  */
@@ -770,10 +751,8 @@ static void cfqg_stats_xfer_dead(struct cfq_group *cfqg)
        if (unlikely(!parent))
                return;
 
-       cfqg_stats_merge(&parent->dead_stats, &cfqg->stats);
-       cfqg_stats_merge(&parent->dead_stats, &cfqg->dead_stats);
+       cfqg_stats_add_aux(&parent->stats, &cfqg->stats);
        cfqg_stats_reset(&cfqg->stats);
-       cfqg_stats_reset(&cfqg->dead_stats);
 }
 
 #else  /* CONFIG_CFQ_GROUP_IOSCHED */
@@ -795,8 +774,6 @@ static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
                        unsigned long time, unsigned long unaccounted_time) { }
 static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw) { }
 static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) { }
-static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg,
-                                             uint64_t bytes, int rw) { }
 static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
                        uint64_t start_time, uint64_t io_start_time, int rw) { }
 
@@ -883,8 +860,7 @@ static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
 
 static void cfq_dispatch_insert(struct request_queue *, struct request *);
 static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync,
-                                      struct cfq_io_cq *cic, struct bio *bio,
-                                      gfp_t gfp_mask);
+                                      struct cfq_io_cq *cic, struct bio *bio);
 
 static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq)
 {
@@ -1546,130 +1522,171 @@ static void cfq_init_cfqg_base(struct cfq_group *cfqg)
 }
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
-static void cfqg_stats_init(struct cfqg_stats *stats)
+static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val,
+                           bool on_dfl, bool reset_dev, bool is_leaf_weight);
+
+static void cfqg_stats_exit(struct cfqg_stats *stats)
 {
-       blkg_rwstat_init(&stats->service_bytes);
-       blkg_rwstat_init(&stats->serviced);
-       blkg_rwstat_init(&stats->merged);
-       blkg_rwstat_init(&stats->service_time);
-       blkg_rwstat_init(&stats->wait_time);
-       blkg_rwstat_init(&stats->queued);
+       blkg_rwstat_exit(&stats->merged);
+       blkg_rwstat_exit(&stats->service_time);
+       blkg_rwstat_exit(&stats->wait_time);
+       blkg_rwstat_exit(&stats->queued);
+       blkg_stat_exit(&stats->time);
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+       blkg_stat_exit(&stats->unaccounted_time);
+       blkg_stat_exit(&stats->avg_queue_size_sum);
+       blkg_stat_exit(&stats->avg_queue_size_samples);
+       blkg_stat_exit(&stats->dequeue);
+       blkg_stat_exit(&stats->group_wait_time);
+       blkg_stat_exit(&stats->idle_time);
+       blkg_stat_exit(&stats->empty_time);
+#endif
+}
 
-       blkg_stat_init(&stats->sectors);
-       blkg_stat_init(&stats->time);
+static int cfqg_stats_init(struct cfqg_stats *stats, gfp_t gfp)
+{
+       if (blkg_rwstat_init(&stats->merged, gfp) ||
+           blkg_rwstat_init(&stats->service_time, gfp) ||
+           blkg_rwstat_init(&stats->wait_time, gfp) ||
+           blkg_rwstat_init(&stats->queued, gfp) ||
+           blkg_stat_init(&stats->time, gfp))
+               goto err;
 
 #ifdef CONFIG_DEBUG_BLK_CGROUP
-       blkg_stat_init(&stats->unaccounted_time);
-       blkg_stat_init(&stats->avg_queue_size_sum);
-       blkg_stat_init(&stats->avg_queue_size_samples);
-       blkg_stat_init(&stats->dequeue);
-       blkg_stat_init(&stats->group_wait_time);
-       blkg_stat_init(&stats->idle_time);
-       blkg_stat_init(&stats->empty_time);
+       if (blkg_stat_init(&stats->unaccounted_time, gfp) ||
+           blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
+           blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
+           blkg_stat_init(&stats->dequeue, gfp) ||
+           blkg_stat_init(&stats->group_wait_time, gfp) ||
+           blkg_stat_init(&stats->idle_time, gfp) ||
+           blkg_stat_init(&stats->empty_time, gfp))
+               goto err;
 #endif
+       return 0;
+err:
+       cfqg_stats_exit(stats);
+       return -ENOMEM;
 }
 
-static void cfq_cpd_init(const struct blkcg *blkcg)
+static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp)
 {
-       struct cfq_group_data *cgd =
-               cpd_to_cfqgd(blkcg->pd[blkcg_policy_cfq.plid]);
+       struct cfq_group_data *cgd;
 
-       if (blkcg == &blkcg_root) {
-               cgd->weight = 2 * CFQ_WEIGHT_DEFAULT;
-               cgd->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT;
-       } else {
-               cgd->weight = CFQ_WEIGHT_DEFAULT;
-               cgd->leaf_weight = CFQ_WEIGHT_DEFAULT;
-       }
+       cgd = kzalloc(sizeof(*cgd), GFP_KERNEL);
+       if (!cgd)
+               return NULL;
+       return &cgd->cpd;
+}
+
+static void cfq_cpd_init(struct blkcg_policy_data *cpd)
+{
+       struct cfq_group_data *cgd = cpd_to_cfqgd(cpd);
+       unsigned int weight = cgroup_on_dfl(blkcg_root.css.cgroup) ?
+                             CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL;
+
+       if (cpd_to_blkcg(cpd) == &blkcg_root)
+               weight *= 2;
+
+       cgd->weight = weight;
+       cgd->leaf_weight = weight;
 }
 
-static void cfq_pd_init(struct blkcg_gq *blkg)
+static void cfq_cpd_free(struct blkcg_policy_data *cpd)
 {
-       struct cfq_group *cfqg = blkg_to_cfqg(blkg);
-       struct cfq_group_data *cgd = blkcg_to_cfqgd(blkg->blkcg);
+       kfree(cpd_to_cfqgd(cpd));
+}
+
+static void cfq_cpd_bind(struct blkcg_policy_data *cpd)
+{
+       struct blkcg *blkcg = cpd_to_blkcg(cpd);
+       bool on_dfl = cgroup_on_dfl(blkcg_root.css.cgroup);
+       unsigned int weight = on_dfl ? CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL;
+
+       if (blkcg == &blkcg_root)
+               weight *= 2;
+
+       WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, false));
+       WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, true));
+}
+
+static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node)
+{
+       struct cfq_group *cfqg;
+
+       cfqg = kzalloc_node(sizeof(*cfqg), gfp, node);
+       if (!cfqg)
+               return NULL;
 
        cfq_init_cfqg_base(cfqg);
+       if (cfqg_stats_init(&cfqg->stats, gfp)) {
+               kfree(cfqg);
+               return NULL;
+       }
+
+       return &cfqg->pd;
+}
+
+static void cfq_pd_init(struct blkg_policy_data *pd)
+{
+       struct cfq_group *cfqg = pd_to_cfqg(pd);
+       struct cfq_group_data *cgd = blkcg_to_cfqgd(pd->blkg->blkcg);
+
        cfqg->weight = cgd->weight;
        cfqg->leaf_weight = cgd->leaf_weight;
-       cfqg_stats_init(&cfqg->stats);
-       cfqg_stats_init(&cfqg->dead_stats);
 }
 
-static void cfq_pd_offline(struct blkcg_gq *blkg)
+static void cfq_pd_offline(struct blkg_policy_data *pd)
 {
+       struct cfq_group *cfqg = pd_to_cfqg(pd);
+       int i;
+
+       for (i = 0; i < IOPRIO_BE_NR; i++) {
+               if (cfqg->async_cfqq[0][i])
+                       cfq_put_queue(cfqg->async_cfqq[0][i]);
+               if (cfqg->async_cfqq[1][i])
+                       cfq_put_queue(cfqg->async_cfqq[1][i]);
+       }
+
+       if (cfqg->async_idle_cfqq)
+               cfq_put_queue(cfqg->async_idle_cfqq);
+
        /*
         * @blkg is going offline and will be ignored by
         * blkg_[rw]stat_recursive_sum().  Transfer stats to the parent so
         * that they don't get lost.  If IOs complete after this point, the
         * stats for them will be lost.  Oh well...
         */
-       cfqg_stats_xfer_dead(blkg_to_cfqg(blkg));
+       cfqg_stats_xfer_dead(cfqg);
 }
 
-/* offset delta from cfqg->stats to cfqg->dead_stats */
-static const int dead_stats_off_delta = offsetof(struct cfq_group, dead_stats) -
-                                       offsetof(struct cfq_group, stats);
-
-/* to be used by recursive prfill, sums live and dead stats recursively */
-static u64 cfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off)
+static void cfq_pd_free(struct blkg_policy_data *pd)
 {
-       u64 sum = 0;
-
-       sum += blkg_stat_recursive_sum(pd, off);
-       sum += blkg_stat_recursive_sum(pd, off + dead_stats_off_delta);
-       return sum;
-}
-
-/* to be used by recursive prfill, sums live and dead rwstats recursively */
-static struct blkg_rwstat cfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd,
-                                                      int off)
-{
-       struct blkg_rwstat a, b;
+       struct cfq_group *cfqg = pd_to_cfqg(pd);
 
-       a = blkg_rwstat_recursive_sum(pd, off);
-       b = blkg_rwstat_recursive_sum(pd, off + dead_stats_off_delta);
-       blkg_rwstat_merge(&a, &b);
-       return a;
+       cfqg_stats_exit(&cfqg->stats);
+       return kfree(cfqg);
 }
 
-static void cfq_pd_reset_stats(struct blkcg_gq *blkg)
+static void cfq_pd_reset_stats(struct blkg_policy_data *pd)
 {
-       struct cfq_group *cfqg = blkg_to_cfqg(blkg);
+       struct cfq_group *cfqg = pd_to_cfqg(pd);
 
        cfqg_stats_reset(&cfqg->stats);
-       cfqg_stats_reset(&cfqg->dead_stats);
 }
 
-/*
- * Search for the cfq group current task belongs to. request_queue lock must
- * be held.
- */
-static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
-                                               struct blkcg *blkcg)
+static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd,
+                                        struct blkcg *blkcg)
 {
-       struct request_queue *q = cfqd->queue;
-       struct cfq_group *cfqg = NULL;
-
-       /* avoid lookup for the common case where there's no blkcg */
-       if (blkcg == &blkcg_root) {
-               cfqg = cfqd->root_group;
-       } else {
-               struct blkcg_gq *blkg;
-
-               blkg = blkg_lookup_create(blkcg, q);
-               if (!IS_ERR(blkg))
-                       cfqg = blkg_to_cfqg(blkg);
-       }
+       struct blkcg_gq *blkg;
 
-       return cfqg;
+       blkg = blkg_lookup(blkcg, cfqd->queue);
+       if (likely(blkg))
+               return blkg_to_cfqg(blkg);
+       return NULL;
 }
 
 static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
 {
-       /* Currently, all async queues are mapped to root group */
-       if (!cfq_cfqq_sync(cfqq))
-               cfqg = cfqq->cfqd->root_group;
-
        cfqq->cfqg = cfqg;
        /* cfqq reference on cfqg */
        cfqg_get(cfqg);
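
In the hunk above, cfqg_stats_init() now takes a gfp mask and can fail, with cfqg_stats_exit() as its undo; on any partial failure the whole set is unwound through the exit helper, which works because the containing cfq_group has been zeroed by kzalloc_node() first. The same init-or-unwind idiom in a simplified user-space form (field names and the three-field layout are illustrative only):

/*
 * Init-or-unwind: every sub-allocation may fail, and the exit helper is
 * safe on a partially initialized (zeroed) object, so init calls it on
 * any failure.
 */
#include <stdlib.h>

struct stats {
	long *merged;
	long *service_time;
	long *wait_time;
};

static void stats_exit(struct stats *s)
{
	free(s->merged);		/* free(NULL) is a no-op, so a   */
	free(s->service_time);		/* partially built object is    */
	free(s->wait_time);		/* unwound correctly            */
}

static int stats_init(struct stats *s)
{
	s->merged = calloc(1, sizeof(*s->merged));
	s->service_time = calloc(1, sizeof(*s->service_time));
	s->wait_time = calloc(1, sizeof(*s->wait_time));

	if (!s->merged || !s->service_time || !s->wait_time) {
		stats_exit(s);
		return -1;
	}
	return 0;
}

int main(void)
{
	struct stats s = { 0 };
	int ret = stats_init(&s);

	if (!ret)
		stats_exit(&s);
	return ret ? 1 : 0;
}
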
@@ -1739,36 +1756,48 @@ static int cfq_print_leaf_weight(struct seq_file *sf, void *v)
 
 static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of,
                                        char *buf, size_t nbytes, loff_t off,
-                                       bool is_leaf_weight)
+                                       bool on_dfl, bool is_leaf_weight)
 {
+       unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN;
+       unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX;
        struct blkcg *blkcg = css_to_blkcg(of_css(of));
        struct blkg_conf_ctx ctx;
        struct cfq_group *cfqg;
        struct cfq_group_data *cfqgd;
        int ret;
+       u64 v;
 
        ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx);
        if (ret)
                return ret;
 
-       ret = -EINVAL;
+       if (sscanf(ctx.body, "%llu", &v) == 1) {
+               /* require "default" on dfl */
+               ret = -ERANGE;
+               if (!v && on_dfl)
+                       goto out_finish;
+       } else if (!strcmp(strim(ctx.body), "default")) {
+               v = 0;
+       } else {
+               ret = -EINVAL;
+               goto out_finish;
+       }
+
        cfqg = blkg_to_cfqg(ctx.blkg);
        cfqgd = blkcg_to_cfqgd(blkcg);
-       if (!cfqg || !cfqgd)
-               goto err;
 
-       if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) {
+       ret = -ERANGE;
+       if (!v || (v >= min && v <= max)) {
                if (!is_leaf_weight) {
-                       cfqg->dev_weight = ctx.v;
-                       cfqg->new_weight = ctx.v ?: cfqgd->weight;
+                       cfqg->dev_weight = v;
+                       cfqg->new_weight = v ?: cfqgd->weight;
                } else {
-                       cfqg->dev_leaf_weight = ctx.v;
-                       cfqg->new_leaf_weight = ctx.v ?: cfqgd->leaf_weight;
+                       cfqg->dev_leaf_weight = v;
+                       cfqg->new_leaf_weight = v ?: cfqgd->leaf_weight;
                }
                ret = 0;
        }
-
-err:
+out_finish:
        blkg_conf_finish(&ctx);
        return ret ?: nbytes;
 }
@@ -1776,25 +1805,27 @@ err:
 static ssize_t cfqg_set_weight_device(struct kernfs_open_file *of,
                                      char *buf, size_t nbytes, loff_t off)
 {
-       return __cfqg_set_weight_device(of, buf, nbytes, off, false);
+       return __cfqg_set_weight_device(of, buf, nbytes, off, false, false);
 }
 
 static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of,
                                           char *buf, size_t nbytes, loff_t off)
 {
-       return __cfqg_set_weight_device(of, buf, nbytes, off, true);
+       return __cfqg_set_weight_device(of, buf, nbytes, off, false, true);
 }
 
-static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
-                           u64 val, bool is_leaf_weight)
+static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val,
+                           bool on_dfl, bool reset_dev, bool is_leaf_weight)
 {
+       unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN;
+       unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX;
        struct blkcg *blkcg = css_to_blkcg(css);
        struct blkcg_gq *blkg;
        struct cfq_group_data *cfqgd;
        int ret = 0;
 
-       if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX)
-               return -EINVAL;
+       if (val < min || val > max)
+               return -ERANGE;
 
        spin_lock_irq(&blkcg->lock);
        cfqgd = blkcg_to_cfqgd(blkcg);
@@ -1815,9 +1846,13 @@ static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
                        continue;
 
                if (!is_leaf_weight) {
+                       if (reset_dev)
+                               cfqg->dev_weight = 0;
                        if (!cfqg->dev_weight)
                                cfqg->new_weight = cfqgd->weight;
                } else {
+                       if (reset_dev)
+                               cfqg->dev_leaf_weight = 0;
                        if (!cfqg->dev_leaf_weight)
                                cfqg->new_leaf_weight = cfqgd->leaf_weight;
                }
@@ -1831,13 +1866,13 @@ out:
 static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
                          u64 val)
 {
-       return __cfq_set_weight(css, cft, val, false);
+       return __cfq_set_weight(css, val, false, false, false);
 }
 
 static int cfq_set_leaf_weight(struct cgroup_subsys_state *css,
                               struct cftype *cft, u64 val)
 {
-       return __cfq_set_weight(css, cft, val, true);
+       return __cfq_set_weight(css, val, false, false, true);
 }
 
 static int cfqg_print_stat(struct seq_file *sf, void *v)
@@ -1857,16 +1892,16 @@ static int cfqg_print_rwstat(struct seq_file *sf, void *v)
 static u64 cfqg_prfill_stat_recursive(struct seq_file *sf,
                                      struct blkg_policy_data *pd, int off)
 {
-       u64 sum = cfqg_stat_pd_recursive_sum(pd, off);
-
+       u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
+                                         &blkcg_policy_cfq, off);
        return __blkg_prfill_u64(sf, pd, sum);
 }
 
 static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf,
                                        struct blkg_policy_data *pd, int off)
 {
-       struct blkg_rwstat sum = cfqg_rwstat_pd_recursive_sum(pd, off);
-
+       struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
+                                                       &blkcg_policy_cfq, off);
        return __blkg_prfill_rwstat(sf, pd, &sum);
 }
 
@@ -1886,6 +1921,40 @@ static int cfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
        return 0;
 }
 
+static u64 cfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd,
+                              int off)
+{
+       u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes);
+
+       return __blkg_prfill_u64(sf, pd, sum >> 9);
+}
+
+static int cfqg_print_stat_sectors(struct seq_file *sf, void *v)
+{
+       blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+                         cfqg_prfill_sectors, &blkcg_policy_cfq, 0, false);
+       return 0;
+}
+
+static u64 cfqg_prfill_sectors_recursive(struct seq_file *sf,
+                                        struct blkg_policy_data *pd, int off)
+{
+       struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,
+                                       offsetof(struct blkcg_gq, stat_bytes));
+       u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
+               atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
+
+       return __blkg_prfill_u64(sf, pd, sum >> 9);
+}
+
+static int cfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v)
+{
+       blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+                         cfqg_prfill_sectors_recursive, &blkcg_policy_cfq, 0,
+                         false);
+       return 0;
+}
+
 #ifdef CONFIG_DEBUG_BLK_CGROUP
 static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
                                      struct blkg_policy_data *pd, int off)
@@ -1912,7 +1981,7 @@ static int cfqg_print_avg_queue_size(struct seq_file *sf, void *v)
 }
 #endif /* CONFIG_DEBUG_BLK_CGROUP */
 
-static struct cftype cfq_blkcg_files[] = {
+static struct cftype cfq_blkcg_legacy_files[] = {
        /* on root, weight is mapped to leaf_weight */
        {
                .name = "weight_device",
@@ -1960,18 +2029,17 @@ static struct cftype cfq_blkcg_files[] = {
        },
        {
                .name = "sectors",
-               .private = offsetof(struct cfq_group, stats.sectors),
-               .seq_show = cfqg_print_stat,
+               .seq_show = cfqg_print_stat_sectors,
        },
        {
                .name = "io_service_bytes",
-               .private = offsetof(struct cfq_group, stats.service_bytes),
-               .seq_show = cfqg_print_rwstat,
+               .private = (unsigned long)&blkcg_policy_cfq,
+               .seq_show = blkg_print_stat_bytes,
        },
        {
                .name = "io_serviced",
-               .private = offsetof(struct cfq_group, stats.serviced),
-               .seq_show = cfqg_print_rwstat,
+               .private = (unsigned long)&blkcg_policy_cfq,
+               .seq_show = blkg_print_stat_ios,
        },
        {
                .name = "io_service_time",
@@ -2002,18 +2070,17 @@ static struct cftype cfq_blkcg_files[] = {
        },
        {
                .name = "sectors_recursive",
-               .private = offsetof(struct cfq_group, stats.sectors),
-               .seq_show = cfqg_print_stat_recursive,
+               .seq_show = cfqg_print_stat_sectors_recursive,
        },
        {
                .name = "io_service_bytes_recursive",
-               .private = offsetof(struct cfq_group, stats.service_bytes),
-               .seq_show = cfqg_print_rwstat_recursive,
+               .private = (unsigned long)&blkcg_policy_cfq,
+               .seq_show = blkg_print_stat_bytes_recursive,
        },
        {
                .name = "io_serviced_recursive",
-               .private = offsetof(struct cfq_group, stats.serviced),
-               .seq_show = cfqg_print_rwstat_recursive,
+               .private = (unsigned long)&blkcg_policy_cfq,
+               .seq_show = blkg_print_stat_ios_recursive,
        },
        {
                .name = "io_service_time_recursive",
@@ -2068,9 +2135,51 @@ static struct cftype cfq_blkcg_files[] = {
 #endif /* CONFIG_DEBUG_BLK_CGROUP */
        { }     /* terminate */
 };
+
+static int cfq_print_weight_on_dfl(struct seq_file *sf, void *v)
+{
+       struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+       struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
+
+       seq_printf(sf, "default %u\n", cgd->weight);
+       blkcg_print_blkgs(sf, blkcg, cfqg_prfill_weight_device,
+                         &blkcg_policy_cfq, 0, false);
+       return 0;
+}
+
+static ssize_t cfq_set_weight_on_dfl(struct kernfs_open_file *of,
+                                    char *buf, size_t nbytes, loff_t off)
+{
+       char *endp;
+       int ret;
+       u64 v;
+
+       buf = strim(buf);
+
+       /* "WEIGHT" or "default WEIGHT" sets the default weight */
+       v = simple_strtoull(buf, &endp, 0);
+       if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) {
+               ret = __cfq_set_weight(of_css(of), v, true, false, false);
+               return ret ?: nbytes;
+       }
+
+       /* "MAJ:MIN WEIGHT" */
+       return __cfqg_set_weight_device(of, buf, nbytes, off, true, false);
+}
+
+static struct cftype cfq_blkcg_files[] = {
+       {
+               .name = "weight",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = cfq_print_weight_on_dfl,
+               .write = cfq_set_weight_on_dfl,
+       },
+       { }     /* terminate */
+};
+
 #else /* GROUP_IOSCHED */
-static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
-                                               struct blkcg *blkcg)
+static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd,
+                                        struct blkcg *blkcg)
 {
        return cfqd->root_group;
 }
@@ -2873,7 +2982,6 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
 
        cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
        cfqq->nr_sectors += blk_rq_sectors(rq);
-       cfqg_stats_update_dispatch(cfqq->cfqg, blk_rq_bytes(rq), rq->cmd_flags);
 }
 
 /*
@@ -3506,14 +3614,14 @@ static void cfq_exit_icq(struct io_cq *icq)
        struct cfq_io_cq *cic = icq_to_cic(icq);
        struct cfq_data *cfqd = cic_to_cfqd(cic);
 
-       if (cic->cfqq[BLK_RW_ASYNC]) {
-               cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]);
-               cic->cfqq[BLK_RW_ASYNC] = NULL;
+       if (cic_to_cfqq(cic, false)) {
+               cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, false));
+               cic_set_cfqq(cic, NULL, false);
        }
 
-       if (cic->cfqq[BLK_RW_SYNC]) {
-               cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]);
-               cic->cfqq[BLK_RW_SYNC] = NULL;
+       if (cic_to_cfqq(cic, true)) {
+               cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, true));
+               cic_set_cfqq(cic, NULL, true);
        }
 }
 
@@ -3572,18 +3680,14 @@ static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio)
        if (unlikely(!cfqd) || likely(cic->ioprio == ioprio))
                return;
 
-       cfqq = cic->cfqq[BLK_RW_ASYNC];
+       cfqq = cic_to_cfqq(cic, false);
        if (cfqq) {
-               struct cfq_queue *new_cfqq;
-               new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio,
-                                        GFP_ATOMIC);
-               if (new_cfqq) {
-                       cic->cfqq[BLK_RW_ASYNC] = new_cfqq;
-                       cfq_put_queue(cfqq);
-               }
+               cfq_put_queue(cfqq);
+               cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio);
+               cic_set_cfqq(cic, cfqq, false);
        }
 
-       cfqq = cic->cfqq[BLK_RW_SYNC];
+       cfqq = cic_to_cfqq(cic, true);
        if (cfqq)
                cfq_mark_cfqq_prio_changed(cfqq);
 
@@ -3614,7 +3718,7 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
 {
        struct cfq_data *cfqd = cic_to_cfqd(cic);
-       struct cfq_queue *sync_cfqq;
+       struct cfq_queue *cfqq;
        uint64_t serial_nr;
 
        rcu_read_lock();
@@ -3628,15 +3732,22 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
        if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr))
                return;
 
-       sync_cfqq = cic_to_cfqq(cic, 1);
-       if (sync_cfqq) {
-               /*
-                * Drop reference to sync queue. A new sync queue will be
-                * assigned in new group upon arrival of a fresh request.
-                */
-               cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup");
-               cic_set_cfqq(cic, NULL, 1);
-               cfq_put_queue(sync_cfqq);
+       /*
+        * Drop reference to queues.  New queues will be assigned in new
+        * group upon arrival of fresh requests.
+        */
+       cfqq = cic_to_cfqq(cic, false);
+       if (cfqq) {
+               cfq_log_cfqq(cfqd, cfqq, "changed cgroup");
+               cic_set_cfqq(cic, NULL, false);
+               cfq_put_queue(cfqq);
+       }
+
+       cfqq = cic_to_cfqq(cic, true);
+       if (cfqq) {
+               cfq_log_cfqq(cfqd, cfqq, "changed cgroup");
+               cic_set_cfqq(cic, NULL, true);
+               cfq_put_queue(cfqq);
        }
 
        cic->blkcg_serial_nr = serial_nr;
@@ -3645,81 +3756,19 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
 static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { }
 #endif  /* CONFIG_CFQ_GROUP_IOSCHED */
 
-static struct cfq_queue *
-cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
-                    struct bio *bio, gfp_t gfp_mask)
-{
-       struct blkcg *blkcg;
-       struct cfq_queue *cfqq, *new_cfqq = NULL;
-       struct cfq_group *cfqg;
-
-retry:
-       rcu_read_lock();
-
-       blkcg = bio_blkcg(bio);
-       cfqg = cfq_lookup_create_cfqg(cfqd, blkcg);
-       if (!cfqg) {
-               cfqq = &cfqd->oom_cfqq;
-               goto out;
-       }
-
-       cfqq = cic_to_cfqq(cic, is_sync);
-
-       /*
-        * Always try a new alloc if we fell back to the OOM cfqq
-        * originally, since it should just be a temporary situation.
-        */
-       if (!cfqq || cfqq == &cfqd->oom_cfqq) {
-               cfqq = NULL;
-               if (new_cfqq) {
-                       cfqq = new_cfqq;
-                       new_cfqq = NULL;
-               } else if (gfp_mask & __GFP_WAIT) {
-                       rcu_read_unlock();
-                       spin_unlock_irq(cfqd->queue->queue_lock);
-                       new_cfqq = kmem_cache_alloc_node(cfq_pool,
-                                       gfp_mask | __GFP_ZERO,
-                                       cfqd->queue->node);
-                       spin_lock_irq(cfqd->queue->queue_lock);
-                       if (new_cfqq)
-                               goto retry;
-                       else
-                               return &cfqd->oom_cfqq;
-               } else {
-                       cfqq = kmem_cache_alloc_node(cfq_pool,
-                                       gfp_mask | __GFP_ZERO,
-                                       cfqd->queue->node);
-               }
-
-               if (cfqq) {
-                       cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
-                       cfq_init_prio_data(cfqq, cic);
-                       cfq_link_cfqq_cfqg(cfqq, cfqg);
-                       cfq_log_cfqq(cfqd, cfqq, "alloced");
-               } else
-                       cfqq = &cfqd->oom_cfqq;
-       }
-out:
-       if (new_cfqq)
-               kmem_cache_free(cfq_pool, new_cfqq);
-
-       rcu_read_unlock();
-       return cfqq;
-}
-
 static struct cfq_queue **
-cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
+cfq_async_queue_prio(struct cfq_group *cfqg, int ioprio_class, int ioprio)
 {
        switch (ioprio_class) {
        case IOPRIO_CLASS_RT:
-               return &cfqd->async_cfqq[0][ioprio];
+               return &cfqg->async_cfqq[0][ioprio];
        case IOPRIO_CLASS_NONE:
                ioprio = IOPRIO_NORM;
                /* fall through */
        case IOPRIO_CLASS_BE:
-               return &cfqd->async_cfqq[1][ioprio];
+               return &cfqg->async_cfqq[1][ioprio];
        case IOPRIO_CLASS_IDLE:
-               return &cfqd->async_idle_cfqq;
+               return &cfqg->async_idle_cfqq;
        default:
                BUG();
        }
@@ -3727,12 +3776,20 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
 
 static struct cfq_queue *
 cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
-             struct bio *bio, gfp_t gfp_mask)
+             struct bio *bio)
 {
        int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
        int ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
        struct cfq_queue **async_cfqq = NULL;
-       struct cfq_queue *cfqq = NULL;
+       struct cfq_queue *cfqq;
+       struct cfq_group *cfqg;
+
+       rcu_read_lock();
+       cfqg = cfq_lookup_cfqg(cfqd, bio_blkcg(bio));
+       if (!cfqg) {
+               cfqq = &cfqd->oom_cfqq;
+               goto out;
+       }
 
        if (!is_sync) {
                if (!ioprio_valid(cic->ioprio)) {
@@ -3740,22 +3797,32 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
                        ioprio = task_nice_ioprio(tsk);
                        ioprio_class = task_nice_ioclass(tsk);
                }
-               async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio);
+               async_cfqq = cfq_async_queue_prio(cfqg, ioprio_class, ioprio);
                cfqq = *async_cfqq;
+               if (cfqq)
+                       goto out;
        }
 
-       if (!cfqq)
-               cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio, gfp_mask);
+       cfqq = kmem_cache_alloc_node(cfq_pool, GFP_NOWAIT | __GFP_ZERO,
+                                    cfqd->queue->node);
+       if (!cfqq) {
+               cfqq = &cfqd->oom_cfqq;
+               goto out;
+       }
 
-       /*
-        * pin the queue now that it's allocated, scheduler exit will prune it
-        */
-       if (!is_sync && !(*async_cfqq)) {
+       cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
+       cfq_init_prio_data(cfqq, cic);
+       cfq_link_cfqq_cfqg(cfqq, cfqg);
+       cfq_log_cfqq(cfqd, cfqq, "alloced");
+
+       if (async_cfqq) {
+               /* a new async queue is created, pin and remember */
                cfqq->ref++;
                *async_cfqq = cfqq;
        }
-
+out:
        cfqq->ref++;
+       rcu_read_unlock();
        return cfqq;
 }
 
@@ -4289,8 +4356,6 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
        const bool is_sync = rq_is_sync(rq);
        struct cfq_queue *cfqq;
 
-       might_sleep_if(gfp_mask & __GFP_WAIT);
-
        spin_lock_irq(q->queue_lock);
 
        check_ioprio_changed(cic, bio);
@@ -4298,7 +4363,9 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
 new_queue:
        cfqq = cic_to_cfqq(cic, is_sync);
        if (!cfqq || cfqq == &cfqd->oom_cfqq) {
-               cfqq = cfq_get_queue(cfqd, is_sync, cic, bio, gfp_mask);
+               if (cfqq)
+                       cfq_put_queue(cfqq);
+               cfqq = cfq_get_queue(cfqd, is_sync, cic, bio);
                cic_set_cfqq(cic, cfqq, is_sync);
        } else {
                /*
@@ -4404,21 +4471,6 @@ static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
        cancel_work_sync(&cfqd->unplug_work);
 }
 
-static void cfq_put_async_queues(struct cfq_data *cfqd)
-{
-       int i;
-
-       for (i = 0; i < IOPRIO_BE_NR; i++) {
-               if (cfqd->async_cfqq[0][i])
-                       cfq_put_queue(cfqd->async_cfqq[0][i]);
-               if (cfqd->async_cfqq[1][i])
-                       cfq_put_queue(cfqd->async_cfqq[1][i]);
-       }
-
-       if (cfqd->async_idle_cfqq)
-               cfq_put_queue(cfqd->async_idle_cfqq);
-}
-
 static void cfq_exit_queue(struct elevator_queue *e)
 {
        struct cfq_data *cfqd = e->elevator_data;
@@ -4431,8 +4483,6 @@ static void cfq_exit_queue(struct elevator_queue *e)
        if (cfqd->active_queue)
                __cfq_slice_expired(cfqd, cfqd->active_queue, 0);
 
-       cfq_put_async_queues(cfqd);
-
        spin_unlock_irq(q->queue_lock);
 
        cfq_shutdown_timer_wq(cfqd);
@@ -4486,9 +4536,9 @@ static int cfq_init_queue(struct request_queue *q, struct elevator_type *e)
                goto out_free;
 
        cfq_init_cfqg_base(cfqd->root_group);
+       cfqd->root_group->weight = 2 * CFQ_WEIGHT_LEGACY_DFL;
+       cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_LEGACY_DFL;
 #endif
-       cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT;
-       cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT;
 
        /*
         * Not strictly needed (since RB_ROOT just clears the node and we
@@ -4499,7 +4549,7 @@ static int cfq_init_queue(struct request_queue *q, struct elevator_type *e)
                cfqd->prio_trees[i] = RB_ROOT;
 
        /*
-        * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues.
+        * Our fallback cfqq if cfq_get_queue() runs into OOM issues.
         * Grab a permanent reference to it, so that the normal code flow
         * will not attempt to free it.  oom_cfqq is linked to root_group
         * but shouldn't hold a reference as it'll never be unlinked.  Lose
@@ -4683,13 +4733,18 @@ static struct elevator_type iosched_cfq = {
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 static struct blkcg_policy blkcg_policy_cfq = {
-       .pd_size                = sizeof(struct cfq_group),
-       .cpd_size               = sizeof(struct cfq_group_data),
-       .cftypes                = cfq_blkcg_files,
+       .dfl_cftypes            = cfq_blkcg_files,
+       .legacy_cftypes         = cfq_blkcg_legacy_files,
 
+       .cpd_alloc_fn           = cfq_cpd_alloc,
        .cpd_init_fn            = cfq_cpd_init,
+       .cpd_free_fn            = cfq_cpd_free,
+       .cpd_bind_fn            = cfq_cpd_bind,
+
+       .pd_alloc_fn            = cfq_pd_alloc,
        .pd_init_fn             = cfq_pd_init,
        .pd_offline_fn          = cfq_pd_offline,
+       .pd_free_fn             = cfq_pd_free,
        .pd_reset_stats_fn      = cfq_pd_reset_stats,
 };
 #endif
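
Editor's note on the block/cfq-iosched.c hunks above: the patch replaces cfq's private byte and I/O counters with the blkg core statistics (blkg_print_stat_bytes/_ios and their recursive variants, with the "sectors" files now derived from stat_bytes), and it adds a default-hierarchy (cgroup v2) "weight" file whose write handler, cfq_set_weight_on_dfl(), accepts "WEIGHT" or "default WEIGHT" to set the group default and "MAJ:MIN WEIGHT" to set a per-device weight. The sketch below is illustrative only and is not part of the patch: a small userspace program writing those two accepted formats. The cgroup mount point, cgroup name, interface-file name and device numbers are assumptions; the exact file name exposed on the unified hierarchy depends on the controller prefix, which this patch does not define.

/*
 * Illustrative sketch only (not from the patch): exercise the two string
 * formats accepted by cfq_set_weight_on_dfl().  The path below is an
 * assumption and the device numbers are made up.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void write_weight(const char *path, const char *val)
{
        int fd = open(path, O_WRONLY);

        if (fd < 0) {
                perror(path);
                return;
        }
        if (write(fd, val, strlen(val)) < 0)
                perror("write");
        close(fd);
}

int main(void)
{
        /* assumed interface-file name under an assumed cgroup mount */
        const char *path = "/sys/fs/cgroup/test/io.weight";

        write_weight(path, "default 500");      /* set the group default weight */
        write_weight(path, "8:16 300");         /* per-device weight for 8:16 */
        return 0;
}
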
diff --git a/certs/Kconfig b/certs/Kconfig
new file mode 100644 (file)
index 0000000..b030b9c
--- /dev/null
@@ -0,0 +1,42 @@
+menu "Certificates for signature checking"
+
+config MODULE_SIG_KEY
+       string "File name or PKCS#11 URI of module signing key"
+       default "certs/signing_key.pem"
+       depends on MODULE_SIG
+       help
+         Provide the file name of a private key/certificate in PEM format,
+         or a PKCS#11 URI according to RFC7512. The file should contain, or
+         the URI should identify, both the certificate and its corresponding
+         private key.
+
+         If this option is unchanged from its default "certs/signing_key.pem",
+         then the kernel will automatically generate the private key and
+         certificate as described in Documentation/module-signing.txt
+
+config SYSTEM_TRUSTED_KEYRING
+       bool "Provide system-wide ring of trusted keys"
+       depends on KEYS
+       help
+         Provide a system keyring to which trusted keys can be added.  Keys in
+         the keyring are considered to be trusted.  Keys may be added at will
+         by the kernel from compiled-in data and from hardware key stores, but
+         userspace may only add extra keys if those keys can be verified by
+         keys already in the keyring.
+
+         Keys in this keyring are used by module signature checking.
+
+config SYSTEM_TRUSTED_KEYS
+       string "Additional X.509 keys for default system keyring"
+       depends on SYSTEM_TRUSTED_KEYRING
+       help
+         If set, this option should be the filename of a PEM-formatted file
+         containing trusted X.509 certificates to be included in the default
+         system keyring. Any certificate used for module signing is implicitly
+         also trusted.
+
+         NOTE: If you previously provided keys for the system keyring in the
+         form of DER-encoded *.x509 files in the top-level build directory,
+         those are no longer used. You will need to set this option instead.
+
+endmenu
diff --git a/certs/Makefile b/certs/Makefile
new file mode 100644 (file)
index 0000000..28ac694
--- /dev/null
@@ -0,0 +1,94 @@
+#
+# Makefile for the linux kernel signature checking certificates.
+#
+
+obj-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += system_keyring.o system_certificates.o
+
+ifeq ($(CONFIG_SYSTEM_TRUSTED_KEYRING),y)
+
+$(eval $(call config_filename,SYSTEM_TRUSTED_KEYS))
+
+# GCC doesn't include .incbin files in -MD generated dependencies (PR#66871)
+$(obj)/system_certificates.o: $(obj)/x509_certificate_list
+
+# Cope with signing_key.x509 existing in $(srctree) not $(objtree)
+AFLAGS_system_certificates.o := -I$(srctree)
+
+quiet_cmd_extract_certs  = EXTRACT_CERTS   $(patsubst "%",%,$(2))
+      cmd_extract_certs  = scripts/extract-cert $(2) $@ || ( rm $@; exit 1)
+
+targets += x509_certificate_list
+$(obj)/x509_certificate_list: scripts/extract-cert $(SYSTEM_TRUSTED_KEYS_SRCPREFIX)$(SYSTEM_TRUSTED_KEYS_FILENAME) FORCE
+       $(call if_changed,extract_certs,$(SYSTEM_TRUSTED_KEYS_SRCPREFIX)$(CONFIG_SYSTEM_TRUSTED_KEYS))
+endif
+
+clean-files := x509_certificate_list .x509.list
+
+ifeq ($(CONFIG_MODULE_SIG),y)
+###############################################################################
+#
+# If module signing is requested, say by allyesconfig, but a key has not been
+# supplied, then one will need to be generated to make sure the build does not
+# fail and that the kernel may be used afterwards.
+#
+###############################################################################
+ifndef CONFIG_MODULE_SIG_HASH
+$(error Could not determine digest type to use from kernel config)
+endif
+
+# We do it this way rather than having a boolean option for enabling an
+# external private key, because 'make randconfig' might enable such a
+# boolean option and we unfortunately can't make it depend on !RANDCONFIG.
+ifeq ($(CONFIG_MODULE_SIG_KEY),"certs/signing_key.pem")
+$(obj)/signing_key.pem: $(obj)/x509.genkey
+       @echo "###"
+       @echo "### Now generating an X.509 key pair to be used for signing modules."
+       @echo "###"
+       @echo "### If this takes a long time, you might wish to run rngd in the"
+       @echo "### background to keep the supply of entropy topped up.  It"
+       @echo "### needs to be run as root, and uses a hardware random"
+       @echo "### number generator if one is available."
+       @echo "###"
+       openssl req -new -nodes -utf8 -$(CONFIG_MODULE_SIG_HASH) -days 36500 \
+               -batch -x509 -config $(obj)/x509.genkey \
+               -outform PEM -out $(obj)/signing_key.pem \
+               -keyout $(obj)/signing_key.pem 2>&1
+       @echo "###"
+       @echo "### Key pair generated."
+       @echo "###"
+
+$(obj)/x509.genkey:
+       @echo Generating X.509 key generation config
+       @echo  >$@ "[ req ]"
+       @echo >>$@ "default_bits = 4096"
+       @echo >>$@ "distinguished_name = req_distinguished_name"
+       @echo >>$@ "prompt = no"
+       @echo >>$@ "string_mask = utf8only"
+       @echo >>$@ "x509_extensions = myexts"
+       @echo >>$@
+       @echo >>$@ "[ req_distinguished_name ]"
+       @echo >>$@ "#O = Unspecified company"
+       @echo >>$@ "CN = Build time autogenerated kernel key"
+       @echo >>$@ "#emailAddress = unspecified.user@unspecified.company"
+       @echo >>$@
+       @echo >>$@ "[ myexts ]"
+       @echo >>$@ "basicConstraints=critical,CA:FALSE"
+       @echo >>$@ "keyUsage=digitalSignature"
+       @echo >>$@ "subjectKeyIdentifier=hash"
+       @echo >>$@ "authorityKeyIdentifier=keyid"
+endif
+
+$(eval $(call config_filename,MODULE_SIG_KEY))
+
+# If CONFIG_MODULE_SIG_KEY isn't a PKCS#11 URI, depend on it
+ifeq ($(patsubst pkcs11:%,%,$(firstword $(MODULE_SIG_KEY_FILENAME))),$(firstword $(MODULE_SIG_KEY_FILENAME)))
+X509_DEP := $(MODULE_SIG_KEY_SRCPREFIX)$(MODULE_SIG_KEY_FILENAME)
+endif
+
+# GCC PR#66871 again.
+$(obj)/system_certificates.o: $(obj)/signing_key.x509
+
+targets += signing_key.x509
+$(obj)/signing_key.x509: scripts/extract-cert $(X509_DEP) FORCE
+       $(call if_changed,extract_certs,$(MODULE_SIG_KEY_SRCPREFIX)$(CONFIG_MODULE_SIG_KEY))
+endif
diff --git a/certs/system_certificates.S b/certs/system_certificates.S
new file mode 100644 (file)
index 0000000..9216e8c
--- /dev/null
@@ -0,0 +1,23 @@
+#include <linux/export.h>
+#include <linux/init.h>
+
+       __INITRODATA
+
+       .align 8
+       .globl VMLINUX_SYMBOL(system_certificate_list)
+VMLINUX_SYMBOL(system_certificate_list):
+__cert_list_start:
+#ifdef CONFIG_MODULE_SIG
+       .incbin "certs/signing_key.x509"
+#endif
+       .incbin "certs/x509_certificate_list"
+__cert_list_end:
+
+       .align 8
+       .globl VMLINUX_SYMBOL(system_certificate_list_size)
+VMLINUX_SYMBOL(system_certificate_list_size):
+#ifdef CONFIG_64BIT
+       .quad __cert_list_end - __cert_list_start
+#else
+       .long __cert_list_end - __cert_list_start
+#endif
diff --git a/certs/system_keyring.c b/certs/system_keyring.c
new file mode 100644 (file)
index 0000000..2570598
--- /dev/null
@@ -0,0 +1,157 @@
+/* System trusted keyring for trusted public keys
+ *
+ * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/cred.h>
+#include <linux/err.h>
+#include <keys/asymmetric-type.h>
+#include <keys/system_keyring.h>
+#include <crypto/pkcs7.h>
+
+struct key *system_trusted_keyring;
+EXPORT_SYMBOL_GPL(system_trusted_keyring);
+
+extern __initconst const u8 system_certificate_list[];
+extern __initconst const unsigned long system_certificate_list_size;
+
+/*
+ * Load the compiled-in keys
+ */
+static __init int system_trusted_keyring_init(void)
+{
+       pr_notice("Initialise system trusted keyring\n");
+
+       system_trusted_keyring =
+               keyring_alloc(".system_keyring",
+                             KUIDT_INIT(0), KGIDT_INIT(0), current_cred(),
+                             ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
+                             KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH),
+                             KEY_ALLOC_NOT_IN_QUOTA, NULL);
+       if (IS_ERR(system_trusted_keyring))
+               panic("Can't allocate system trusted keyring\n");
+
+       set_bit(KEY_FLAG_TRUSTED_ONLY, &system_trusted_keyring->flags);
+       return 0;
+}
+
+/*
+ * Must be initialised before we try and load the keys into the keyring.
+ */
+device_initcall(system_trusted_keyring_init);
+
+/*
+ * Load the compiled-in list of X.509 certificates.
+ */
+static __init int load_system_certificate_list(void)
+{
+       key_ref_t key;
+       const u8 *p, *end;
+       size_t plen;
+
+       pr_notice("Loading compiled-in X.509 certificates\n");
+
+       p = system_certificate_list;
+       end = p + system_certificate_list_size;
+       while (p < end) {
+               /* Each cert begins with an ASN.1 SEQUENCE tag and must be more
+                * than 256 bytes in size.
+                */
+               if (end - p < 4)
+                       goto dodgy_cert;
+               if (p[0] != 0x30 &&
+                   p[1] != 0x82)
+                       goto dodgy_cert;
+               plen = (p[2] << 8) | p[3];
+               plen += 4;
+               if (plen > end - p)
+                       goto dodgy_cert;
+
+               key = key_create_or_update(make_key_ref(system_trusted_keyring, 1),
+                                          "asymmetric",
+                                          NULL,
+                                          p,
+                                          plen,
+                                          ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
+                                          KEY_USR_VIEW | KEY_USR_READ),
+                                          KEY_ALLOC_NOT_IN_QUOTA |
+                                          KEY_ALLOC_TRUSTED);
+               if (IS_ERR(key)) {
+                       pr_err("Problem loading in-kernel X.509 certificate (%ld)\n",
+                              PTR_ERR(key));
+               } else {
+                       set_bit(KEY_FLAG_BUILTIN, &key_ref_to_ptr(key)->flags);
+                       pr_notice("Loaded X.509 cert '%s'\n",
+                                 key_ref_to_ptr(key)->description);
+                       key_ref_put(key);
+               }
+               p += plen;
+       }
+
+       return 0;
+
+dodgy_cert:
+       pr_err("Problem parsing in-kernel X.509 certificate list\n");
+       return 0;
+}
+late_initcall(load_system_certificate_list);
+
+#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
+
+/**
+ * Verify a PKCS#7-based signature on system data.
+ * @data: The data to be verified.
+ * @len: Size of @data.
+ * @raw_pkcs7: The PKCS#7 message that is the signature.
+ * @pkcs7_len: The size of @raw_pkcs7.
+ * @usage: The use to which the key is being put.
+ */
+int system_verify_data(const void *data, unsigned long len,
+                      const void *raw_pkcs7, size_t pkcs7_len,
+                      enum key_being_used_for usage)
+{
+       struct pkcs7_message *pkcs7;
+       bool trusted;
+       int ret;
+
+       pkcs7 = pkcs7_parse_message(raw_pkcs7, pkcs7_len);
+       if (IS_ERR(pkcs7))
+               return PTR_ERR(pkcs7);
+
+       /* The data should be detached - so we need to supply it. */
+       if (pkcs7_supply_detached_data(pkcs7, data, len) < 0) {
+               pr_err("PKCS#7 signature with non-detached data\n");
+               ret = -EBADMSG;
+               goto error;
+       }
+
+       ret = pkcs7_verify(pkcs7, usage);
+       if (ret < 0)
+               goto error;
+
+       ret = pkcs7_validate_trust(pkcs7, system_trusted_keyring, &trusted);
+       if (ret < 0)
+               goto error;
+
+       if (!trusted) {
+               pr_err("PKCS#7 signature not signed with a trusted key\n");
+               ret = -ENOKEY;
+       }
+
+error:
+       pkcs7_free_message(pkcs7);
+       pr_devel("<==%s() = %d\n", __func__, ret);
+       return ret;
+}
+EXPORT_SYMBOL_GPL(system_verify_data);
+
+#endif /* CONFIG_SYSTEM_DATA_VERIFICATION */
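
Editor's note: certs/system_keyring.c ends by exporting system_verify_data(), which parses a PKCS#7/CMS message, attaches the detached data, verifies the contained signature chains and then checks trust against the system keyring. Below is a minimal, illustrative sketch of an in-kernel caller; the wrapper function, the header locations and the choice of VERIFYING_UNSPECIFIED_SIGNATURE are assumptions for illustration. Only system_verify_data() itself and the key_being_used_for usage values come from this series.

/*
 * Illustrative in-kernel caller (not part of the patch): check a detached
 * PKCS#7 signature over a data blob against the system trusted keyring.
 * Header locations are assumed.
 */
#include <linux/kernel.h>
#include <linux/printk.h>
#include <keys/system_keyring.h>        /* system_verify_data() */
#include <crypto/public_key.h>          /* enum key_being_used_for */

static int example_verify_blob(const void *data, unsigned long len,
                               const void *pkcs7, size_t pkcs7_len)
{
        int ret;

        ret = system_verify_data(data, len, pkcs7, pkcs7_len,
                                 VERIFYING_UNSPECIFIED_SIGNATURE);
        if (ret < 0)
                pr_warn("PKCS#7 verification failed: %d\n", ret);
        return ret;
}
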
index b582ea7f78d3f4dd71d8effef93d5db0d3f8cf92..48ee3e175dac258ad7ab1b60c2c76d85f3b9a2d9 100644 (file)
@@ -1635,5 +1635,6 @@ config CRYPTO_HASH_INFO
 
 source "drivers/crypto/Kconfig"
 source crypto/asymmetric_keys/Kconfig
+source certs/Kconfig
 
 endif  # if CRYPTO
index e47fcd9ac5e86f56b85b88c8179d68a41ade078c..cd1406f9b14ab2b45615aed6574cade8c539161c 100644 (file)
@@ -15,15 +15,21 @@ obj-$(CONFIG_PUBLIC_KEY_ALGO_RSA) += rsa.o
 obj-$(CONFIG_X509_CERTIFICATE_PARSER) += x509_key_parser.o
 x509_key_parser-y := \
        x509-asn1.o \
+       x509_akid-asn1.o \
        x509_rsakey-asn1.o \
        x509_cert_parser.o \
        x509_public_key.o
 
-$(obj)/x509_cert_parser.o: $(obj)/x509-asn1.h $(obj)/x509_rsakey-asn1.h
+$(obj)/x509_cert_parser.o: \
+       $(obj)/x509-asn1.h \
+       $(obj)/x509_akid-asn1.h \
+       $(obj)/x509_rsakey-asn1.h
 $(obj)/x509-asn1.o: $(obj)/x509-asn1.c $(obj)/x509-asn1.h
+$(obj)/x509_akid-asn1.o: $(obj)/x509_akid-asn1.c $(obj)/x509_akid-asn1.h
 $(obj)/x509_rsakey-asn1.o: $(obj)/x509_rsakey-asn1.c $(obj)/x509_rsakey-asn1.h
 
 clean-files    += x509-asn1.c x509-asn1.h
+clean-files    += x509_akid-asn1.c x509_akid-asn1.h
 clean-files    += x509_rsakey-asn1.c x509_rsakey-asn1.h
 
 #
index b0e4ed23d6683c54b47d633b1f6e4f01f1de8025..1916680ad81b08fc6db5a0f9f2751dc0526b3e9b 100644 (file)
@@ -12,6 +12,7 @@
  */
 #include <keys/asymmetric-subtype.h>
 #include <keys/asymmetric-parser.h>
+#include <crypto/public_key.h>
 #include <linux/seq_file.h>
 #include <linux/module.h>
 #include <linux/slab.h>
 
 MODULE_LICENSE("GPL");
 
+const char *const key_being_used_for[NR__KEY_BEING_USED_FOR] = {
+       [VERIFYING_MODULE_SIGNATURE]            = "mod sig",
+       [VERIFYING_FIRMWARE_SIGNATURE]          = "firmware sig",
+       [VERIFYING_KEXEC_PE_SIGNATURE]          = "kexec PE sig",
+       [VERIFYING_KEY_SIGNATURE]               = "key sig",
+       [VERIFYING_KEY_SELF_SIGNATURE]          = "key self sig",
+       [VERIFYING_UNSPECIFIED_SIGNATURE]       = "unspec sig",
+};
+EXPORT_SYMBOL_GPL(key_being_used_for);
+
 static LIST_HEAD(asymmetric_key_parsers);
 static DECLARE_RWSEM(asymmetric_key_parsers_sem);
 
index 214a992123cdd54a7f73fd857a0ac5a5360fa949..adcef59eec0b6a58eb8c0ee0ebf7c48a91472c57 100644 (file)
@@ -97,6 +97,15 @@ int mscode_note_digest_algo(void *context, size_t hdrlen,
        case OID_sha256:
                ctx->digest_algo = HASH_ALGO_SHA256;
                break;
+       case OID_sha384:
+               ctx->digest_algo = HASH_ALGO_SHA384;
+               break;
+       case OID_sha512:
+               ctx->digest_algo = HASH_ALGO_SHA512;
+               break;
+       case OID_sha224:
+               ctx->digest_algo = HASH_ALGO_SHA224;
+               break;
 
        case OID__NR:
                sprint_oid(value, vlen, buffer, sizeof(buffer));
index a5a14ef28c869ad5c558742b2260d58a7888f404..1eca740b816ace4680870df9e3a7885b8ba47538 100644 (file)
@@ -1,14 +1,14 @@
 PKCS7ContentInfo ::= SEQUENCE {
-       contentType     ContentType,
+       contentType     ContentType ({ pkcs7_check_content_type }),
        content         [0] EXPLICIT SignedData OPTIONAL
 }
 
 ContentType ::= OBJECT IDENTIFIER ({ pkcs7_note_OID })
 
 SignedData ::= SEQUENCE {
-       version                 INTEGER,
+       version                 INTEGER ({ pkcs7_note_signeddata_version }),
        digestAlgorithms        DigestAlgorithmIdentifiers,
-       contentInfo             ContentInfo,
+       contentInfo             ContentInfo ({ pkcs7_note_content }),
        certificates            CHOICE {
                certSet         [0] IMPLICIT ExtendedCertificatesAndCertificates,
                certSequence    [2] IMPLICIT Certificates
@@ -21,7 +21,7 @@ SignedData ::= SEQUENCE {
 }
 
 ContentInfo ::= SEQUENCE {
-       contentType     ContentType,
+       contentType     ContentType ({ pkcs7_note_OID }),
        content         [0] EXPLICIT Data OPTIONAL
 }
 
@@ -68,8 +68,8 @@ SignerInfos ::= CHOICE {
 }
 
 SignerInfo ::= SEQUENCE {
-       version                 INTEGER,
-       issuerAndSerialNumber   IssuerAndSerialNumber,
+       version                 INTEGER ({ pkcs7_note_signerinfo_version }),
+       sid                     SignerIdentifier, -- CMS variant, not PKCS#7
        digestAlgorithm         DigestAlgorithmIdentifier ({ pkcs7_sig_note_digest_algo }),
        authenticatedAttributes CHOICE {
                aaSet           [0] IMPLICIT SetOfAuthenticatedAttribute
@@ -88,6 +88,12 @@ SignerInfo ::= SEQUENCE {
        } OPTIONAL
 } ({ pkcs7_note_signed_info })
 
+SignerIdentifier ::= CHOICE {
+       -- RFC5652 sec 5.3
+       issuerAndSerialNumber IssuerAndSerialNumber,
+        subjectKeyIdentifier [0] IMPLICIT SubjectKeyIdentifier
+}
+
 IssuerAndSerialNumber ::= SEQUENCE {
        issuer                  Name ({ pkcs7_sig_note_issuer }),
        serialNumber            CertificateSerialNumber ({ pkcs7_sig_note_serial })
@@ -95,6 +101,8 @@ IssuerAndSerialNumber ::= SEQUENCE {
 
 CertificateSerialNumber ::= INTEGER
 
+SubjectKeyIdentifier ::= OCTET STRING ({ pkcs7_sig_note_skid })
+
 SetOfAuthenticatedAttribute ::= SET OF AuthenticatedAttribute
 
 AuthenticatedAttribute ::= SEQUENCE {
@@ -103,7 +111,7 @@ AuthenticatedAttribute ::= SEQUENCE {
 }
 
 UnauthenticatedAttribute ::= SEQUENCE {
-       type                    OBJECT IDENTIFIER ({ pkcs7_note_OID }),
+       type                    OBJECT IDENTIFIER,
        values                  SET OF ANY
 }
 
index 3d13b042da735823b7c6425f93cce9cff82e0abe..e2d0edbbc71acd7be8878817b86e53af5a921e04 100644 (file)
 #include <linux/err.h>
 #include <linux/module.h>
 #include <linux/key-type.h>
+#include <keys/asymmetric-type.h>
 #include <crypto/pkcs7.h>
 #include <keys/user-type.h>
 #include <keys/system_keyring.h>
 #include "pkcs7_parser.h"
 
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("PKCS#7 testing key type");
+
+static unsigned pkcs7_usage;
+module_param_named(usage, pkcs7_usage, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(pkcs7_usage,
+                "Usage to specify when verifying the PKCS#7 message");
+
 /*
  * Preparse a PKCS#7 wrapped and validated data blob.
  */
 static int pkcs7_preparse(struct key_preparsed_payload *prep)
 {
+       enum key_being_used_for usage = pkcs7_usage;
        struct pkcs7_message *pkcs7;
        const void *data, *saved_prep_data;
        size_t datalen, saved_prep_datalen;
@@ -32,6 +42,11 @@ static int pkcs7_preparse(struct key_preparsed_payload *prep)
 
        kenter("");
 
+       if (usage >= NR__KEY_BEING_USED_FOR) {
+               pr_err("Invalid usage type %d\n", usage);
+               return -EINVAL;
+       }
+
        saved_prep_data = prep->data;
        saved_prep_datalen = prep->datalen;
        pkcs7 = pkcs7_parse_message(saved_prep_data, saved_prep_datalen);
@@ -40,7 +55,7 @@ static int pkcs7_preparse(struct key_preparsed_payload *prep)
                goto error;
        }
 
-       ret = pkcs7_verify(pkcs7);
+       ret = pkcs7_verify(pkcs7, usage);
        if (ret < 0)
                goto error_free;
 
index 3bd5a1e4c493e23a78835cb82ff7623fa21fa6c2..758acabf2d819727ecd7f4cf0c6aa21c1c40649f 100644 (file)
@@ -33,6 +33,9 @@ struct pkcs7_parse_context {
        unsigned        raw_serial_size;
        unsigned        raw_issuer_size;
        const void      *raw_issuer;
+       const void      *raw_skid;
+       unsigned        raw_skid_size;
+       bool            expect_skid;
 };
 
 /*
@@ -78,6 +81,30 @@ void pkcs7_free_message(struct pkcs7_message *pkcs7)
 }
 EXPORT_SYMBOL_GPL(pkcs7_free_message);
 
+/*
+ * Check authenticatedAttributes are provided or not provided consistently.
+ */
+static int pkcs7_check_authattrs(struct pkcs7_message *msg)
+{
+       struct pkcs7_signed_info *sinfo;
+       bool want;
+
+       sinfo = msg->signed_infos;
+       if (sinfo->authattrs) {
+               want = true;
+               msg->have_authattrs = true;
+       }
+
+       for (sinfo = sinfo->next; sinfo; sinfo = sinfo->next)
+               if (!!sinfo->authattrs != want)
+                       goto inconsistent;
+       return 0;
+
+inconsistent:
+       pr_warn("Inconsistently supplied authAttrs\n");
+       return -EINVAL;
+}
+
 /**
  * pkcs7_parse_message - Parse a PKCS#7 message
  * @data: The raw binary ASN.1 encoded message to be parsed
@@ -110,6 +137,10 @@ struct pkcs7_message *pkcs7_parse_message(const void *data, size_t datalen)
                goto out;
        }
 
+       ret = pkcs7_check_authattrs(ctx->msg);
+       if (ret < 0)
+               goto out;
+
        msg = ctx->msg;
        ctx->msg = NULL;
 
@@ -198,6 +229,14 @@ int pkcs7_sig_note_digest_algo(void *context, size_t hdrlen,
        case OID_sha256:
                ctx->sinfo->sig.pkey_hash_algo = HASH_ALGO_SHA256;
                break;
+       case OID_sha384:
+               ctx->sinfo->sig.pkey_hash_algo = HASH_ALGO_SHA384;
+               break;
+       case OID_sha512:
+               ctx->sinfo->sig.pkey_hash_algo = HASH_ALGO_SHA512;
+               break;
+       case OID_sha224:
+               ctx->sinfo->sig.pkey_hash_algo = HASH_ALGO_SHA224;
        default:
                printk("Unsupported digest algo: %u\n", ctx->last_oid);
                return -ENOPKG;
@@ -225,6 +264,100 @@ int pkcs7_sig_note_pkey_algo(void *context, size_t hdrlen,
        return 0;
 }
 
+/*
+ * We only support signed data [RFC2315 sec 9].
+ */
+int pkcs7_check_content_type(void *context, size_t hdrlen,
+                            unsigned char tag,
+                            const void *value, size_t vlen)
+{
+       struct pkcs7_parse_context *ctx = context;
+
+       if (ctx->last_oid != OID_signed_data) {
+               pr_warn("Only support pkcs7_signedData type\n");
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+/*
+ * Note the SignedData version
+ */
+int pkcs7_note_signeddata_version(void *context, size_t hdrlen,
+                                 unsigned char tag,
+                                 const void *value, size_t vlen)
+{
+       struct pkcs7_parse_context *ctx = context;
+       unsigned version;
+
+       if (vlen != 1)
+               goto unsupported;
+
+       ctx->msg->version = version = *(const u8 *)value;
+       switch (version) {
+       case 1:
+               /* PKCS#7 SignedData [RFC2315 sec 9.1]
+                * CMS ver 1 SignedData [RFC5652 sec 5.1]
+                */
+               break;
+       case 3:
+               /* CMS ver 3 SignedData [RFC2315 sec 5.1] */
+               break;
+       default:
+               goto unsupported;
+       }
+
+       return 0;
+
+unsupported:
+       pr_warn("Unsupported SignedData version\n");
+       return -EINVAL;
+}
+
+/*
+ * Note the SignerInfo version
+ */
+int pkcs7_note_signerinfo_version(void *context, size_t hdrlen,
+                                 unsigned char tag,
+                                 const void *value, size_t vlen)
+{
+       struct pkcs7_parse_context *ctx = context;
+       unsigned version;
+
+       if (vlen != 1)
+               goto unsupported;
+
+       version = *(const u8 *)value;
+       switch (version) {
+       case 1:
+               /* PKCS#7 SignerInfo [RFC2315 sec 9.2]
+                * CMS ver 1 SignerInfo [RFC5652 sec 5.3]
+                */
+               if (ctx->msg->version != 1)
+                       goto version_mismatch;
+               ctx->expect_skid = false;
+               break;
+       case 3:
+               /* CMS ver 3 SignerInfo [RFC2315 sec 5.3] */
+               if (ctx->msg->version == 1)
+                       goto version_mismatch;
+               ctx->expect_skid = true;
+               break;
+       default:
+               goto unsupported;
+       }
+
+       return 0;
+
+unsupported:
+       pr_warn("Unsupported SignerInfo version\n");
+       return -EINVAL;
+version_mismatch:
+       pr_warn("SignedData-SignerInfo version mismatch\n");
+       return -EBADMSG;
+}
+
 /*
  * Extract a certificate and store it in the context.
  */
@@ -283,6 +416,25 @@ int pkcs7_note_certificate_list(void *context, size_t hdrlen,
        return 0;
 }
 
+/*
+ * Note the content type.
+ */
+int pkcs7_note_content(void *context, size_t hdrlen,
+                      unsigned char tag,
+                      const void *value, size_t vlen)
+{
+       struct pkcs7_parse_context *ctx = context;
+
+       if (ctx->last_oid != OID_data &&
+           ctx->last_oid != OID_msIndirectData) {
+               pr_warn("Unsupported data type %d\n", ctx->last_oid);
+               return -EINVAL;
+       }
+
+       ctx->msg->data_type = ctx->last_oid;
+       return 0;
+}
+
 /*
  * Extract the data from the message and store that and its content type OID in
  * the context.
@@ -298,45 +450,119 @@ int pkcs7_note_data(void *context, size_t hdrlen,
        ctx->msg->data = value;
        ctx->msg->data_len = vlen;
        ctx->msg->data_hdrlen = hdrlen;
-       ctx->msg->data_type = ctx->last_oid;
        return 0;
 }
 
 /*
- * Parse authenticated attributes
+ * Parse authenticated attributes.
  */
 int pkcs7_sig_note_authenticated_attr(void *context, size_t hdrlen,
                                      unsigned char tag,
                                      const void *value, size_t vlen)
 {
        struct pkcs7_parse_context *ctx = context;
+       struct pkcs7_signed_info *sinfo = ctx->sinfo;
+       enum OID content_type;
 
        pr_devel("AuthAttr: %02x %zu [%*ph]\n", tag, vlen, (unsigned)vlen, value);
 
        switch (ctx->last_oid) {
+       case OID_contentType:
+               if (__test_and_set_bit(sinfo_has_content_type, &sinfo->aa_set))
+                       goto repeated;
+               content_type = look_up_OID(value, vlen);
+               if (content_type != ctx->msg->data_type) {
+                       pr_warn("Mismatch between global data type (%d) and sinfo %u (%d)\n",
+                               ctx->msg->data_type, sinfo->index,
+                               content_type);
+                       return -EBADMSG;
+               }
+               return 0;
+
+       case OID_signingTime:
+               if (__test_and_set_bit(sinfo_has_signing_time, &sinfo->aa_set))
+                       goto repeated;
+               /* Should we check that the signing time is consistent
+                * with the signer's X.509 cert?
+                */
+               return x509_decode_time(&sinfo->signing_time,
+                                       hdrlen, tag, value, vlen);
+
        case OID_messageDigest:
+               if (__test_and_set_bit(sinfo_has_message_digest, &sinfo->aa_set))
+                       goto repeated;
                if (tag != ASN1_OTS)
                        return -EBADMSG;
-               ctx->sinfo->msgdigest = value;
-               ctx->sinfo->msgdigest_len = vlen;
+               sinfo->msgdigest = value;
+               sinfo->msgdigest_len = vlen;
+               return 0;
+
+       case OID_smimeCapabilites:
+               if (__test_and_set_bit(sinfo_has_smime_caps, &sinfo->aa_set))
+                       goto repeated;
+               if (ctx->msg->data_type != OID_msIndirectData) {
+                       pr_warn("S/MIME Caps only allowed with Authenticode\n");
+                       return -EKEYREJECTED;
+               }
+               return 0;
+
+               /* Microsoft SpOpusInfo seems to be contain cont[0] 16-bit BE
+                * char URLs and cont[1] 8-bit char URLs.
+                *
+                * Microsoft StatementType seems to contain a list of OIDs that
+                * are also used as extendedKeyUsage types in X.509 certs.
+                */
+       case OID_msSpOpusInfo:
+               if (__test_and_set_bit(sinfo_has_ms_opus_info, &sinfo->aa_set))
+                       goto repeated;
+               goto authenticode_check;
+       case OID_msStatementType:
+               if (__test_and_set_bit(sinfo_has_ms_statement_type, &sinfo->aa_set))
+                       goto repeated;
+       authenticode_check:
+               if (ctx->msg->data_type != OID_msIndirectData) {
+                       pr_warn("Authenticode AuthAttrs only allowed with Authenticode\n");
+                       return -EKEYREJECTED;
+               }
+               /* I'm not sure how to validate these */
                return 0;
        default:
                return 0;
        }
+
+repeated:
+       /* We permit max one item per AuthenticatedAttribute and no repeats */
+       pr_warn("Repeated/multivalue AuthAttrs not permitted\n");
+       return -EKEYREJECTED;
 }
 
 /*
- * Note the set of auth attributes for digestion purposes [RFC2315 9.3]
+ * Note the set of auth attributes for digestion purposes [RFC2315 sec 9.3]
  */
 int pkcs7_sig_note_set_of_authattrs(void *context, size_t hdrlen,
                                    unsigned char tag,
                                    const void *value, size_t vlen)
 {
        struct pkcs7_parse_context *ctx = context;
+       struct pkcs7_signed_info *sinfo = ctx->sinfo;
+
+       if (!test_bit(sinfo_has_content_type, &sinfo->aa_set) ||
+           !test_bit(sinfo_has_message_digest, &sinfo->aa_set) ||
+           (ctx->msg->data_type == OID_msIndirectData &&
+            !test_bit(sinfo_has_ms_opus_info, &sinfo->aa_set))) {
+               pr_warn("Missing required AuthAttr\n");
+               return -EBADMSG;
+       }
+
+       if (ctx->msg->data_type != OID_msIndirectData &&
+           test_bit(sinfo_has_ms_opus_info, &sinfo->aa_set)) {
+               pr_warn("Unexpected Authenticode AuthAttr\n");
+               return -EBADMSG;
+       }
 
        /* We need to switch the 'CONT 0' to a 'SET OF' when we digest */
-       ctx->sinfo->authattrs = value - (hdrlen - 1);
-       ctx->sinfo->authattrs_len = vlen + (hdrlen - 1);
+       sinfo->authattrs = value - (hdrlen - 1);
+       sinfo->authattrs_len = vlen + (hdrlen - 1);
        return 0;
 }
 
@@ -366,6 +592,22 @@ int pkcs7_sig_note_issuer(void *context, size_t hdrlen,
        return 0;
 }
 
+/*
+ * Note the issuing cert's subjectKeyIdentifier
+ */
+int pkcs7_sig_note_skid(void *context, size_t hdrlen,
+                       unsigned char tag,
+                       const void *value, size_t vlen)
+{
+       struct pkcs7_parse_context *ctx = context;
+
+       pr_devel("SKID: %02x %zu [%*ph]\n", tag, vlen, (unsigned)vlen, value);
+
+       ctx->raw_skid = value;
+       ctx->raw_skid_size = vlen;
+       return 0;
+}
+
 /*
  * Note the signature data
  */
@@ -398,14 +640,27 @@ int pkcs7_note_signed_info(void *context, size_t hdrlen,
        struct pkcs7_signed_info *sinfo = ctx->sinfo;
        struct asymmetric_key_id *kid;
 
+       if (ctx->msg->data_type == OID_msIndirectData && !sinfo->authattrs) {
+               pr_warn("Authenticode requires AuthAttrs\n");
+               return -EBADMSG;
+       }
+
        /* Generate cert issuer + serial number key ID */
-       kid = asymmetric_key_generate_id(ctx->raw_serial,
-                                        ctx->raw_serial_size,
-                                        ctx->raw_issuer,
-                                        ctx->raw_issuer_size);
+       if (!ctx->expect_skid) {
+               kid = asymmetric_key_generate_id(ctx->raw_serial,
+                                                ctx->raw_serial_size,
+                                                ctx->raw_issuer,
+                                                ctx->raw_issuer_size);
+       } else {
+               kid = asymmetric_key_generate_id(ctx->raw_skid,
+                                                ctx->raw_skid_size,
+                                                "", 0);
+       }
        if (IS_ERR(kid))
                return PTR_ERR(kid);
 
+       pr_devel("SINFO KID: %u [%*phN]\n", kid->len, kid->len, kid->data);
+
        sinfo->signing_cert_id = kid;
        sinfo->index = ++ctx->sinfo_index;
        *ctx->ppsinfo = sinfo;
index efc7dc9b8f9cfbc2ac9223e141239efb43bd60ae..a66b19ebcf47e4d74bddbb4c7bf475303399bcc3 100644 (file)
@@ -21,9 +21,9 @@
 struct pkcs7_signed_info {
        struct pkcs7_signed_info *next;
        struct x509_certificate *signer; /* Signing certificate (in msg->certs) */
-       unsigned index;
-       bool trusted;
-       bool unsupported_crypto;        /* T if not usable due to missing crypto */
+       unsigned        index;
+       bool            trusted;
+       bool            unsupported_crypto;     /* T if not usable due to missing crypto */
 
        /* Message digest - the digest of the Content Data (or NULL) */
        const void      *msgdigest;
@@ -32,8 +32,18 @@ struct pkcs7_signed_info {
        /* Authenticated Attribute data (or NULL) */
        unsigned        authattrs_len;
        const void      *authattrs;
+       unsigned long   aa_set;
+#define        sinfo_has_content_type          0
+#define        sinfo_has_signing_time          1
+#define        sinfo_has_message_digest        2
+#define sinfo_has_smime_caps           3
+#define        sinfo_has_ms_opus_info          4
+#define        sinfo_has_ms_statement_type     5
+       time64_t        signing_time;
 
-       /* Issuing cert serial number and issuer's name */
+       /* Issuing cert serial number and issuer's name [PKCS#7 or CMS ver 1]
+        * or issuing cert's SKID [CMS ver 3].
+        */
        struct asymmetric_key_id *signing_cert_id;
 
        /* Message signature.
@@ -50,6 +60,8 @@ struct pkcs7_message {
        struct x509_certificate *certs; /* Certificate list */
        struct x509_certificate *crl;   /* Revocation list */
        struct pkcs7_signed_info *signed_infos;
+       u8              version;        /* Version of cert (1 -> PKCS#7 or CMS; 3 -> CMS) */
+       bool            have_authattrs; /* T if have authattrs */
 
        /* Content Data (or NULL) */
        enum OID        data_type;      /* Type of Data */
index 1d29376072da4a502e720fe10fc8fb34f6c4749a..90d6d47965b0826d40e795bc68f563aa2f033856 100644 (file)
@@ -54,7 +54,8 @@ static int pkcs7_validate_trust_one(struct pkcs7_message *pkcs7,
                /* Look to see if this certificate is present in the trusted
                 * keys.
                 */
-               key = x509_request_asymmetric_key(trust_keyring, x509->id,
+               key = x509_request_asymmetric_key(trust_keyring,
+                                                 x509->id, x509->skid,
                                                  false);
                if (!IS_ERR(key)) {
                        /* One of the X.509 certificates in the PKCS#7 message
@@ -85,8 +86,10 @@ static int pkcs7_validate_trust_one(struct pkcs7_message *pkcs7,
        /* No match - see if the root certificate has a signer amongst the
         * trusted keys.
         */
-       if (last && last->authority) {
-               key = x509_request_asymmetric_key(trust_keyring, last->authority,
+       if (last && (last->akid_id || last->akid_skid)) {
+               key = x509_request_asymmetric_key(trust_keyring,
+                                                 last->akid_id,
+                                                 last->akid_skid,
                                                  false);
                if (!IS_ERR(key)) {
                        x509 = last;
@@ -103,6 +106,7 @@ static int pkcs7_validate_trust_one(struct pkcs7_message *pkcs7,
         */
        key = x509_request_asymmetric_key(trust_keyring,
                                          sinfo->signing_cert_id,
+                                         NULL,
                                          false);
        if (!IS_ERR(key)) {
                pr_devel("sinfo %u: Direct signer is key %x\n",
index cd455450b069e3c58d1a202484e3ccba2d5d7da1..d20c0b4b880ed1e6dde6552cfce4c6d85a05b053 100644 (file)
@@ -70,9 +70,15 @@ static int pkcs7_digest(struct pkcs7_message *pkcs7,
         * message digest attribute amongst them which corresponds to the
         * digest we just calculated.
         */
-       if (sinfo->msgdigest) {
+       if (sinfo->authattrs) {
                u8 tag;
 
+               if (!sinfo->msgdigest) {
+                       pr_warn("Sig %u: No messageDigest\n", sinfo->index);
+                       ret = -EKEYREJECTED;
+                       goto error;
+               }
+
                if (sinfo->msgdigest_len != sinfo->sig.digest_size) {
                        pr_debug("Sig %u: Invalid digest size (%u)\n",
                                 sinfo->index, sinfo->msgdigest_len);
@@ -170,6 +176,7 @@ static int pkcs7_verify_sig_chain(struct pkcs7_message *pkcs7,
                                  struct pkcs7_signed_info *sinfo)
 {
        struct x509_certificate *x509 = sinfo->signer, *p;
+       struct asymmetric_key_id *auth;
        int ret;
 
        kenter("");
@@ -187,11 +194,14 @@ static int pkcs7_verify_sig_chain(struct pkcs7_message *pkcs7,
                        goto maybe_missing_crypto_in_x509;
 
                pr_debug("- issuer %s\n", x509->issuer);
-               if (x509->authority)
-                       pr_debug("- authkeyid %*phN\n",
-                                x509->authority->len, x509->authority->data);
-
-               if (!x509->authority ||
+               if (x509->akid_id)
+                       pr_debug("- authkeyid.id %*phN\n",
+                                x509->akid_id->len, x509->akid_id->data);
+               if (x509->akid_skid)
+                       pr_debug("- authkeyid.skid %*phN\n",
+                                x509->akid_skid->len, x509->akid_skid->data);
+
+               if ((!x509->akid_id && !x509->akid_skid) ||
                    strcmp(x509->subject, x509->issuer) == 0) {
                        /* If there's no authority certificate specified, then
                         * the certificate must be self-signed and is the root
@@ -215,21 +225,42 @@ static int pkcs7_verify_sig_chain(struct pkcs7_message *pkcs7,
                /* Look through the X.509 certificates in the PKCS#7 message's
                 * list to see if the next one is there.
                 */
-               pr_debug("- want %*phN\n",
-                        x509->authority->len, x509->authority->data);
-               for (p = pkcs7->certs; p; p = p->next) {
-                       if (!p->skid)
-                               continue;
-                       pr_debug("- cmp [%u] %*phN\n",
-                                p->index, p->skid->len, p->skid->data);
-                       if (asymmetric_key_id_same(p->skid, x509->authority))
-                               goto found_issuer;
+               auth = x509->akid_id;
+               if (auth) {
+                       pr_debug("- want %*phN\n", auth->len, auth->data);
+                       for (p = pkcs7->certs; p; p = p->next) {
+                               pr_debug("- cmp [%u] %*phN\n",
+                                        p->index, p->id->len, p->id->data);
+                               if (asymmetric_key_id_same(p->id, auth))
+                                       goto found_issuer_check_skid;
+                       }
+               } else {
+                       auth = x509->akid_skid;
+                       pr_debug("- want %*phN\n", auth->len, auth->data);
+                       for (p = pkcs7->certs; p; p = p->next) {
+                               if (!p->skid)
+                                       continue;
+                               pr_debug("- cmp [%u] %*phN\n",
+                                        p->index, p->skid->len, p->skid->data);
+                               if (asymmetric_key_id_same(p->skid, auth))
+                                       goto found_issuer;
+                       }
                }
 
                /* We didn't find the root of this chain */
                pr_debug("- top\n");
                return 0;
 
+       found_issuer_check_skid:
+               /* We matched issuer + serialNumber, but if there's an
+                * authKeyId.keyId, that must match the CA subjKeyId also.
+                */
+               if (x509->akid_skid &&
+                   !asymmetric_key_id_same(p->skid, x509->akid_skid)) {
+                       pr_warn("Sig %u: X.509 chain contains auth-skid nonmatch (%u->%u)\n",
+                               sinfo->index, x509->index, p->index);
+                       return -EKEYREJECTED;
+               }
        found_issuer:
                pr_debug("- subject %s\n", p->subject);
                if (p->seen) {
@@ -289,6 +320,18 @@ static int pkcs7_verify_one(struct pkcs7_message *pkcs7,
        pr_devel("Using X.509[%u] for sig %u\n",
                 sinfo->signer->index, sinfo->index);
 
+       /* Check that the PKCS#7 signing time is valid according to the X.509
+        * certificate.  We can't, however, check against the system clock
+        * since that may not have been set yet and may be wrong.
+        */
+       if (test_bit(sinfo_has_signing_time, &sinfo->aa_set)) {
+               if (sinfo->signing_time < sinfo->signer->valid_from ||
+                   sinfo->signing_time > sinfo->signer->valid_to) {
+                       pr_warn("Message signed outside of X.509 validity window\n");
+                       return -EKEYREJECTED;
+               }
+       }
+
        /* Verify the PKCS#7 binary against the key */
        ret = public_key_verify_signature(sinfo->signer->pub, &sinfo->sig);
        if (ret < 0)
@@ -303,6 +346,7 @@ static int pkcs7_verify_one(struct pkcs7_message *pkcs7,
 /**
  * pkcs7_verify - Verify a PKCS#7 message
  * @pkcs7: The PKCS#7 message to be verified
+ * @usage: The use to which the key is being put
  *
  * Verify a PKCS#7 message is internally consistent - that is, the data digest
  * matches the digest in the AuthAttrs and any signature in the message or one
@@ -314,6 +358,9 @@ static int pkcs7_verify_one(struct pkcs7_message *pkcs7,
  *
  * Returns, in order of descending priority:
  *
+ *  (*) -EKEYREJECTED if a key was selected that had a usage restriction at
+ *      odds with the specified usage, or:
+ *
  *  (*) -EKEYREJECTED if a signature failed to match for which we found an
  *     appropriate X.509 certificate, or:
  *
@@ -325,7 +372,8 @@ static int pkcs7_verify_one(struct pkcs7_message *pkcs7,
  *  (*) 0 if all the signature chains that don't incur -ENOPKG can be verified
  *     (note that a signature chain may be of zero length), or:
  */
-int pkcs7_verify(struct pkcs7_message *pkcs7)
+int pkcs7_verify(struct pkcs7_message *pkcs7,
+                enum key_being_used_for usage)
 {
        struct pkcs7_signed_info *sinfo;
        struct x509_certificate *x509;
@@ -334,12 +382,48 @@ int pkcs7_verify(struct pkcs7_message *pkcs7)
 
        kenter("");
 
+       switch (usage) {
+       case VERIFYING_MODULE_SIGNATURE:
+               if (pkcs7->data_type != OID_data) {
+                       pr_warn("Invalid module sig (not pkcs7-data)\n");
+                       return -EKEYREJECTED;
+               }
+               if (pkcs7->have_authattrs) {
+                       pr_warn("Invalid module sig (has authattrs)\n");
+                       return -EKEYREJECTED;
+               }
+               break;
+       case VERIFYING_FIRMWARE_SIGNATURE:
+               if (pkcs7->data_type != OID_data) {
+                       pr_warn("Invalid firmware sig (not pkcs7-data)\n");
+                       return -EKEYREJECTED;
+               }
+               if (!pkcs7->have_authattrs) {
+                       pr_warn("Invalid firmware sig (missing authattrs)\n");
+                       return -EKEYREJECTED;
+               }
+               break;
+       case VERIFYING_KEXEC_PE_SIGNATURE:
+               if (pkcs7->data_type != OID_msIndirectData) {
+                       pr_warn("Invalid kexec sig (not Authenticode)\n");
+                       return -EKEYREJECTED;
+               }
+               /* Authattr presence checked in parser */
+               break;
+       case VERIFYING_UNSPECIFIED_SIGNATURE:
+               if (pkcs7->data_type != OID_data) {
+                       pr_warn("Invalid unspecified sig (not pkcs7-data)\n");
+                       return -EKEYREJECTED;
+               }
+               break;
+       default:
+               return -EINVAL;
+       }
+
        for (n = 0, x509 = pkcs7->certs; x509; x509 = x509->next, n++) {
                ret = x509_get_sig_params(x509);
                if (ret < 0)
                        return ret;
-               pr_debug("X.509[%u] %*phN\n",
-                        n, x509->authority->len, x509->authority->data);
        }
 
        for (sinfo = pkcs7->signed_infos; sinfo; sinfo = sinfo->next) {
@@ -359,3 +443,28 @@ int pkcs7_verify(struct pkcs7_message *pkcs7)
        return enopkg;
 }
 EXPORT_SYMBOL_GPL(pkcs7_verify);
+
+/**
+ * pkcs7_supply_detached_data - Supply the data needed to verify a PKCS#7 message
+ * @pkcs7: The PKCS#7 message
+ * @data: The data to be verified
+ * @datalen: The amount of data
+ *
+ * Supply the detached data needed to verify a PKCS#7 message.  Note that no
+ * attempt to retain/pin the data is made.  That is left to the caller.  The
+ * data will not be modified by pkcs7_verify() and will not be freed when the
+ * PKCS#7 message is freed.
+ *
+ * Returns -EINVAL if data is already supplied in the message, 0 otherwise.
+ */
+int pkcs7_supply_detached_data(struct pkcs7_message *pkcs7,
+                              const void *data, size_t datalen)
+{
+       if (pkcs7->data) {
+               pr_debug("Data already supplied\n");
+               return -EINVAL;
+       }
+       pkcs7->data = data;
+       pkcs7->data_len = datalen;
+       return 0;
+}
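
For orientation, a minimal sketch of how a caller might drive this interface: parse a detached PKCS#7 signature, attach the payload it covers with pkcs7_supply_detached_data(), then verify it for a given usage. The buffer names and the firmware usage are illustrative assumptions, not part of this patch.

	struct pkcs7_message *pkcs7;
	int ret;

	pkcs7 = pkcs7_parse_message(sig, sig_len);	/* detached signature blob (assumed) */
	if (IS_ERR(pkcs7))
		return PTR_ERR(pkcs7);

	/* The payload lives elsewhere and stays owned by the caller. */
	ret = pkcs7_supply_detached_data(pkcs7, payload, payload_len);
	if (ret < 0)
		goto out;

	ret = pkcs7_verify(pkcs7, VERIFYING_FIRMWARE_SIGNATURE);
out:
	pkcs7_free_message(pkcs7);
	return ret;
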
index 2f6e4fb1a1ea14c4a1bcba18618b0bdaf6e1cd9c..81efccbe22d5b21d3e31e44f504fc23cefd32d76 100644 (file)
@@ -39,6 +39,7 @@ EXPORT_SYMBOL_GPL(pkey_algo);
 const char *const pkey_id_type_name[PKEY_ID_TYPE__LAST] = {
        [PKEY_ID_PGP]           = "PGP",
        [PKEY_ID_X509]          = "X509",
+       [PKEY_ID_PKCS7]         = "PKCS#7",
 };
 EXPORT_SYMBOL_GPL(pkey_id_type_name);
 
index 2421f46184ce873076fe3cfc0f195a2d7bf4f902..897b734dabf9ba7ebc7d356707927c789b9175c9 100644 (file)
@@ -393,6 +393,7 @@ error_no_desc:
  * @pebuf: Buffer containing the PE binary image
  * @pelen: Length of the binary image
  * @trust_keyring: Signing certificates to use as starting points
+ * @usage: The use to which the key is being put.
  * @_trusted: Set to true if trustworth, false otherwise
  *
  * Validate that the certificate chain inside the PKCS#7 message inside the PE
@@ -417,7 +418,9 @@ error_no_desc:
  * May also return -ENOMEM.
  */
 int verify_pefile_signature(const void *pebuf, unsigned pelen,
-                           struct key *trusted_keyring, bool *_trusted)
+                           struct key *trusted_keyring,
+                           enum key_being_used_for usage,
+                           bool *_trusted)
 {
        struct pkcs7_message *pkcs7;
        struct pefile_context ctx;
@@ -462,7 +465,7 @@ int verify_pefile_signature(const void *pebuf, unsigned pelen,
        if (ret < 0)
                goto error;
 
-       ret = pkcs7_verify(pkcs7);
+       ret = pkcs7_verify(pkcs7, usage);
        if (ret < 0)
                goto error;
 
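
A hedged sketch of a caller updated for the extra usage argument, here checking a kexec kernel image against the system trusted keyring; the image pointer and length are placeholders.

	bool trusted = false;
	int ret;

	ret = verify_pefile_signature(kernel_image, kernel_len,
				      system_trusted_keyring,
				      VERIFYING_KEXEC_PE_SIGNATURE,
				      &trusted);
	if (ret < 0)
		return ret;
	if (!trusted)
		return -EKEYREJECTED;
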
diff --git a/crypto/asymmetric_keys/x509_akid.asn1 b/crypto/asymmetric_keys/x509_akid.asn1
new file mode 100644 (file)
index 0000000..1a33231
--- /dev/null
@@ -0,0 +1,35 @@
+-- X.509 AuthorityKeyIdentifier
+-- rfc5280 section 4.2.1.1
+
+AuthorityKeyIdentifier ::= SEQUENCE {
+       keyIdentifier                   [0] IMPLICIT KeyIdentifier              OPTIONAL,
+       authorityCertIssuer             [1] IMPLICIT GeneralNames               OPTIONAL,
+       authorityCertSerialNumber       [2] IMPLICIT CertificateSerialNumber    OPTIONAL
+       }
+
+KeyIdentifier ::= OCTET STRING ({ x509_akid_note_kid })
+
+CertificateSerialNumber ::= INTEGER ({ x509_akid_note_serial })
+
+GeneralNames ::= SEQUENCE OF GeneralName
+
+GeneralName ::= CHOICE {
+       otherName                       [0] ANY,
+       rfc822Name                      [1] IA5String,
+       dNSName                         [2] IA5String,
+       x400Address                     [3] ANY,
+       directoryName                   [4] Name ({ x509_akid_note_name }),
+       ediPartyName                    [5] ANY,
+       uniformResourceIdentifier       [6] IA5String,
+       iPAddress                       [7] OCTET STRING,
+       registeredID                    [8] OBJECT IDENTIFIER
+       }
+
+Name ::= SEQUENCE OF RelativeDistinguishedName
+
+RelativeDistinguishedName ::= SET OF AttributeValueAssertion
+
+AttributeValueAssertion ::= SEQUENCE {
+       attributeType           OBJECT IDENTIFIER ({ x509_note_OID }),
+       attributeValue          ANY ({ x509_extract_name_segment })
+       }
index a668d90302d38c541fb5a78aa5442019cc739b21..af71878dc15bfff17dd34200a08be48633ceea9b 100644 (file)
@@ -18,6 +18,7 @@
 #include "public_key.h"
 #include "x509_parser.h"
 #include "x509-asn1.h"
+#include "x509_akid-asn1.h"
 #include "x509_rsakey-asn1.h"
 
 struct x509_parse_context {
@@ -35,6 +36,10 @@ struct x509_parse_context {
        u16             o_offset;               /* Offset of organizationName (O) */
        u16             cn_offset;              /* Offset of commonName (CN) */
        u16             email_offset;           /* Offset of emailAddress */
+       unsigned        raw_akid_size;
+       const void      *raw_akid;              /* Raw authorityKeyId in ASN.1 */
+       const void      *akid_raw_issuer;       /* Raw directoryName in authorityKeyId */
+       unsigned        akid_raw_issuer_size;
 };
 
 /*
@@ -48,7 +53,8 @@ void x509_free_certificate(struct x509_certificate *cert)
                kfree(cert->subject);
                kfree(cert->id);
                kfree(cert->skid);
-               kfree(cert->authority);
+               kfree(cert->akid_id);
+               kfree(cert->akid_skid);
                kfree(cert->sig.digest);
                mpi_free(cert->sig.rsa.s);
                kfree(cert);
@@ -85,6 +91,18 @@ struct x509_certificate *x509_cert_parse(const void *data, size_t datalen)
        if (ret < 0)
                goto error_decode;
 
+       /* Decode the AuthorityKeyIdentifier */
+       if (ctx->raw_akid) {
+               pr_devel("AKID: %u %*phN\n",
+                        ctx->raw_akid_size, ctx->raw_akid_size, ctx->raw_akid);
+               ret = asn1_ber_decoder(&x509_akid_decoder, ctx,
+                                      ctx->raw_akid, ctx->raw_akid_size);
+               if (ret < 0) {
+                       pr_warn("Couldn't decode AuthKeyIdentifier\n");
+                       goto error_decode;
+               }
+       }
+
        /* Decode the public key */
        ret = asn1_ber_decoder(&x509_rsakey_decoder, ctx,
                               ctx->key, ctx->key_size);
@@ -422,7 +440,6 @@ int x509_process_extension(void *context, size_t hdrlen,
        struct x509_parse_context *ctx = context;
        struct asymmetric_key_id *kid;
        const unsigned char *v = value;
-       int i;
 
        pr_debug("Extension: %u\n", ctx->last_oid);
 
@@ -437,9 +454,7 @@ int x509_process_extension(void *context, size_t hdrlen,
 
                ctx->cert->raw_skid_size = vlen;
                ctx->cert->raw_skid = v;
-               kid = asymmetric_key_generate_id(ctx->cert->raw_subject,
-                                                ctx->cert->raw_subject_size,
-                                                v, vlen);
+               kid = asymmetric_key_generate_id(v, vlen, "", 0);
                if (IS_ERR(kid))
                        return PTR_ERR(kid);
                ctx->cert->skid = kid;
@@ -449,117 +464,113 @@ int x509_process_extension(void *context, size_t hdrlen,
 
        if (ctx->last_oid == OID_authorityKeyIdentifier) {
                /* Get hold of the CA key fingerprint */
-               if (ctx->cert->authority || vlen < 5)
-                       return -EBADMSG;
-
-               /* Authority Key Identifier must be a Constructed SEQUENCE */
-               if (v[0] != (ASN1_SEQ | (ASN1_CONS << 5)))
-                       return -EBADMSG;
-
-               /* Authority Key Identifier is not indefinite length */
-               if (unlikely(vlen == ASN1_INDEFINITE_LENGTH))
-                       return -EBADMSG;
-
-               if (vlen < ASN1_INDEFINITE_LENGTH) {
-                       /* Short Form length */
-                       if (v[1] != vlen - 2 ||
-                           v[2] != SEQ_TAG_KEYID ||
-                           v[3] > vlen - 4)
-                               return -EBADMSG;
-
-                       vlen = v[3];
-                       v += 4;
-               } else {
-                       /* Long Form length */
-                       size_t seq_len = 0;
-                       size_t sub = v[1] - ASN1_INDEFINITE_LENGTH;
-
-                       if (sub > 2)
-                               return -EBADMSG;
-
-                       /* calculate the length from subsequent octets */
-                       v += 2;
-                       for (i = 0; i < sub; i++) {
-                               seq_len <<= 8;
-                               seq_len |= v[i];
-                       }
-
-                       if (seq_len != vlen - 2 - sub ||
-                           v[sub] != SEQ_TAG_KEYID ||
-                           v[sub + 1] > vlen - 4 - sub)
-                               return -EBADMSG;
-
-                       vlen = v[sub + 1];
-                       v += (sub + 2);
-               }
-
-               kid = asymmetric_key_generate_id(ctx->cert->raw_issuer,
-                                                ctx->cert->raw_issuer_size,
-                                                v, vlen);
-               if (IS_ERR(kid))
-                       return PTR_ERR(kid);
-               pr_debug("authkeyid %*phN\n", kid->len, kid->data);
-               ctx->cert->authority = kid;
+               ctx->raw_akid = v;
+               ctx->raw_akid_size = vlen;
                return 0;
        }
 
        return 0;
 }
 
-/*
- * Record a certificate time.
+/**
+ * x509_decode_time - Decode an X.509 time ASN.1 object
+ * @_t: The time to fill in
+ * @hdrlen: The length of the object header
+ * @tag: The object tag
+ * @value: The object value
+ * @vlen: The size of the object value
+ *
+ * Decode an ASN.1 universal time or generalised time field into a struct the
+ * kernel can handle and check it for validity.  The time is decoded thus:
+ *
+ *     [RFC5280 §4.1.2.5]
+ *     CAs conforming to this profile MUST always encode certificate validity
+ *     dates through the year 2049 as UTCTime; certificate validity dates in
+ *     2050 or later MUST be encoded as GeneralizedTime.  Conforming
+ *     applications MUST be able to process validity dates that are encoded in
+ *     either UTCTime or GeneralizedTime.
  */
-static int x509_note_time(struct tm *tm,  size_t hdrlen,
-                         unsigned char tag,
-                         const unsigned char *value, size_t vlen)
+int x509_decode_time(time64_t *_t,  size_t hdrlen,
+                    unsigned char tag,
+                    const unsigned char *value, size_t vlen)
 {
+       static const unsigned char month_lengths[] = { 31, 28, 31, 30, 31, 30,
+                                                      31, 31, 30, 31, 30, 31 };
        const unsigned char *p = value;
+       unsigned year, mon, day, hour, min, sec, mon_len;
 
-#define dec2bin(X) ((X) - '0')
+#define dec2bin(X) ({ unsigned char x = (X) - '0'; if (x > 9) goto invalid_time; x; })
 #define DD2bin(P) ({ unsigned x = dec2bin(P[0]) * 10 + dec2bin(P[1]); P += 2; x; })
 
        if (tag == ASN1_UNITIM) {
                /* UTCTime: YYMMDDHHMMSSZ */
                if (vlen != 13)
                        goto unsupported_time;
-               tm->tm_year = DD2bin(p);
-               if (tm->tm_year >= 50)
-                       tm->tm_year += 1900;
+               year = DD2bin(p);
+               if (year >= 50)
+                       year += 1900;
                else
-                       tm->tm_year += 2000;
+                       year += 2000;
        } else if (tag == ASN1_GENTIM) {
                /* GenTime: YYYYMMDDHHMMSSZ */
                if (vlen != 15)
                        goto unsupported_time;
-               tm->tm_year = DD2bin(p) * 100 + DD2bin(p);
+               year = DD2bin(p) * 100 + DD2bin(p);
+               if (year >= 1950 && year <= 2049)
+                       goto invalid_time;
        } else {
                goto unsupported_time;
        }
 
-       tm->tm_year -= 1900;
-       tm->tm_mon  = DD2bin(p) - 1;
-       tm->tm_mday = DD2bin(p);
-       tm->tm_hour = DD2bin(p);
-       tm->tm_min  = DD2bin(p);
-       tm->tm_sec  = DD2bin(p);
+       mon  = DD2bin(p);
+       day = DD2bin(p);
+       hour = DD2bin(p);
+       min  = DD2bin(p);
+       sec  = DD2bin(p);
 
        if (*p != 'Z')
                goto unsupported_time;
 
+       if (year < 1970 ||
+           mon < 1 || mon > 12)
+               goto invalid_time;
+
+       mon_len = month_lengths[mon - 1];
+       if (mon == 2) {
+               if (year % 4 == 0) {
+                       mon_len = 29;
+                       if (year % 100 == 0) {
+                               mon_len = 28;
+                               if (year % 400 == 0)
+                                       mon_len = 29;
+                       }
+               }
+       }
+
+       if (day < 1 || day > mon_len ||
+           hour > 23 || min > 59 || sec > 59)
+               goto invalid_time;
+
+       *_t = mktime64(year, mon, day, hour, min, sec);
        return 0;
 
 unsupported_time:
-       pr_debug("Got unsupported time [tag %02x]: '%*.*s'\n",
-                tag, (int)vlen, (int)vlen, value);
+       pr_debug("Got unsupported time [tag %02x]: '%*phN'\n",
+                tag, (int)vlen, value);
+       return -EBADMSG;
+invalid_time:
+       pr_debug("Got invalid time [tag %02x]: '%*phN'\n",
+                tag, (int)vlen, value);
        return -EBADMSG;
 }
+EXPORT_SYMBOL_GPL(x509_decode_time);
 
 int x509_note_not_before(void *context, size_t hdrlen,
                         unsigned char tag,
                         const void *value, size_t vlen)
 {
        struct x509_parse_context *ctx = context;
-       return x509_note_time(&ctx->cert->valid_from, hdrlen, tag, value, vlen);
+       return x509_decode_time(&ctx->cert->valid_from, hdrlen, tag, value, vlen);
 }
 
 int x509_note_not_after(void *context, size_t hdrlen,
@@ -567,5 +578,71 @@ int x509_note_not_after(void *context, size_t hdrlen,
                        const void *value, size_t vlen)
 {
        struct x509_parse_context *ctx = context;
-       return x509_note_time(&ctx->cert->valid_to, hdrlen, tag, value, vlen);
+       return x509_decode_time(&ctx->cert->valid_to, hdrlen, tag, value, vlen);
+}
+
+/*
+ * Note a key identifier-based AuthorityKeyIdentifier
+ */
+int x509_akid_note_kid(void *context, size_t hdrlen,
+                      unsigned char tag,
+                      const void *value, size_t vlen)
+{
+       struct x509_parse_context *ctx = context;
+       struct asymmetric_key_id *kid;
+
+       pr_debug("AKID: keyid: %*phN\n", (int)vlen, value);
+
+       if (ctx->cert->akid_skid)
+               return 0;
+
+       kid = asymmetric_key_generate_id(value, vlen, "", 0);
+       if (IS_ERR(kid))
+               return PTR_ERR(kid);
+       pr_debug("authkeyid %*phN\n", kid->len, kid->data);
+       ctx->cert->akid_skid = kid;
+       return 0;
+}
+
+/*
+ * Note a directoryName in an AuthorityKeyIdentifier
+ */
+int x509_akid_note_name(void *context, size_t hdrlen,
+                       unsigned char tag,
+                       const void *value, size_t vlen)
+{
+       struct x509_parse_context *ctx = context;
+
+       pr_debug("AKID: name: %*phN\n", (int)vlen, value);
+
+       ctx->akid_raw_issuer = value;
+       ctx->akid_raw_issuer_size = vlen;
+       return 0;
+}
+
+/*
+ * Note a serial number in an AuthorityKeyIdentifier
+ */
+int x509_akid_note_serial(void *context, size_t hdrlen,
+                         unsigned char tag,
+                         const void *value, size_t vlen)
+{
+       struct x509_parse_context *ctx = context;
+       struct asymmetric_key_id *kid;
+
+       pr_debug("AKID: serial: %*phN\n", (int)vlen, value);
+
+       if (!ctx->akid_raw_issuer || ctx->cert->akid_id)
+               return 0;
+
+       kid = asymmetric_key_generate_id(value,
+                                        vlen,
+                                        ctx->akid_raw_issuer,
+                                        ctx->akid_raw_issuer_size);
+       if (IS_ERR(kid))
+               return PTR_ERR(kid);
+
+       pr_debug("authkeyid %*phN\n", kid->len, kid->data);
+       ctx->cert->akid_id = kid;
+       return 0;
 }
index 3dfe6b5d6f0b90433d9aefb2a627d70068f4d928..1de01eaec88490c6c24d669a6f02c6b4fc2e9959 100644 (file)
@@ -19,11 +19,12 @@ struct x509_certificate {
        struct public_key_signature sig;        /* Signature parameters */
        char            *issuer;                /* Name of certificate issuer */
        char            *subject;               /* Name of certificate subject */
-       struct asymmetric_key_id *id;           /* Serial number + issuer */
+       struct asymmetric_key_id *id;           /* Issuer + Serial number */
        struct asymmetric_key_id *skid;         /* Subject + subjectKeyId (optional) */
-       struct asymmetric_key_id *authority;    /* Authority key identifier (optional) */
-       struct tm       valid_from;
-       struct tm       valid_to;
+       struct asymmetric_key_id *akid_id;      /* CA AuthKeyId matching ->id (optional) */
+       struct asymmetric_key_id *akid_skid;    /* CA AuthKeyId matching ->skid (optional) */
+       time64_t        valid_from;
+       time64_t        valid_to;
        const void      *tbs;                   /* Signed data */
        unsigned        tbs_size;               /* Size of signed data */
        unsigned        raw_sig_size;           /* Size of sigature */
@@ -48,6 +49,9 @@ struct x509_certificate {
  */
 extern void x509_free_certificate(struct x509_certificate *cert);
 extern struct x509_certificate *x509_cert_parse(const void *data, size_t datalen);
+extern int x509_decode_time(time64_t *_t,  size_t hdrlen,
+                           unsigned char tag,
+                           const unsigned char *value, size_t vlen);
 
 /*
  * x509_public_key.c
index 24f17e6c590488d66ca01ec1f0e6f93a3f013163..6d88dd15c98da8cada935c0dc937eb5c8db158cd 100644 (file)
@@ -65,23 +65,37 @@ __setup("ca_keys=", ca_keys_setup);
 /**
  * x509_request_asymmetric_key - Request a key by X.509 certificate params.
  * @keyring: The keys to search.
- * @kid: The key ID.
+ * @id: The issuer & serialNumber to look for or NULL.
+ * @skid: The subjectKeyIdentifier to look for or NULL.
  * @partial: Use partial match if true, exact if false.
  *
- * Find a key in the given keyring by subject name and key ID.  These might,
- * for instance, be the issuer name and the authority key ID of an X.509
- * certificate that needs to be verified.
+ * Find a key in the given keyring by identifier.  The preferred identifier is
+ * the issuer + serialNumber and the fallback identifier is the
+ * subjectKeyIdentifier.  If both are given, the lookup is by the former, but
+ * the latter must also match.
  */
 struct key *x509_request_asymmetric_key(struct key *keyring,
-                                       const struct asymmetric_key_id *kid,
+                                       const struct asymmetric_key_id *id,
+                                       const struct asymmetric_key_id *skid,
                                        bool partial)
 {
-       key_ref_t key;
-       char *id, *p;
-
+       struct key *key;
+       key_ref_t ref;
+       const char *lookup;
+       char *req, *p;
+       int len;
+
+       if (id) {
+               lookup = id->data;
+               len = id->len;
+       } else {
+               lookup = skid->data;
+               len = skid->len;
+       }
+
        /* Construct an identifier "id:<keyid>". */
-       p = id = kmalloc(2 + 1 + kid->len * 2 + 1, GFP_KERNEL);
-       if (!id)
+       p = req = kmalloc(2 + 1 + len * 2 + 1, GFP_KERNEL);
+       if (!req)
                return ERR_PTR(-ENOMEM);
 
        if (partial) {
@@ -92,32 +106,48 @@ struct key *x509_request_asymmetric_key(struct key *keyring,
                *p++ = 'x';
        }
        *p++ = ':';
-       p = bin2hex(p, kid->data, kid->len);
+       p = bin2hex(p, lookup, len);
        *p = 0;
 
-       pr_debug("Look up: \"%s\"\n", id);
+       pr_debug("Look up: \"%s\"\n", req);
 
-       key = keyring_search(make_key_ref(keyring, 1),
-                            &key_type_asymmetric, id);
-       if (IS_ERR(key))
-               pr_debug("Request for key '%s' err %ld\n", id, PTR_ERR(key));
-       kfree(id);
+       ref = keyring_search(make_key_ref(keyring, 1),
+                            &key_type_asymmetric, req);
+       if (IS_ERR(ref))
+               pr_debug("Request for key '%s' err %ld\n", req, PTR_ERR(ref));
+       kfree(req);
 
-       if (IS_ERR(key)) {
-               switch (PTR_ERR(key)) {
+       if (IS_ERR(ref)) {
+               switch (PTR_ERR(ref)) {
                        /* Hide some search errors */
                case -EACCES:
                case -ENOTDIR:
                case -EAGAIN:
                        return ERR_PTR(-ENOKEY);
                default:
-                       return ERR_CAST(key);
+                       return ERR_CAST(ref);
+               }
+       }
+
+       key = key_ref_to_ptr(ref);
+       if (id && skid) {
+               const struct asymmetric_key_ids *kids = asymmetric_key_ids(key);
+               if (!kids->id[1]) {
+                       pr_debug("issuer+serial match, but expected SKID missing\n");
+                       goto reject;
+               }
+               if (!asymmetric_key_id_same(skid, kids->id[1])) {
+                       pr_debug("issuer+serial match, but SKID does not\n");
+                       goto reject;
                }
        }
+
+       pr_devel("<==%s() = 0 [%x]\n", __func__, key_serial(key));
+       return key;
 
-       pr_devel("<==%s() = 0 [%x]\n", __func__,
-                key_serial(key_ref_to_ptr(key)));
-       return key_ref_to_ptr(key);
+reject:
+       key_put(key);
+       return ERR_PTR(-EKEYREJECTED);
 }
 EXPORT_SYMBOL_GPL(x509_request_asymmetric_key);
 
@@ -227,10 +257,11 @@ static int x509_validate_trust(struct x509_certificate *cert,
        if (!trust_keyring)
                return -EOPNOTSUPP;
 
-       if (ca_keyid && !asymmetric_key_id_partial(cert->authority, ca_keyid))
+       if (ca_keyid && !asymmetric_key_id_partial(cert->akid_skid, ca_keyid))
                return -EPERM;
 
-       key = x509_request_asymmetric_key(trust_keyring, cert->authority,
+       key = x509_request_asymmetric_key(trust_keyring,
+                                         cert->akid_id, cert->akid_skid,
                                          false);
        if (!IS_ERR(key))  {
                if (!use_builtin_keys
@@ -271,14 +302,7 @@ static int x509_key_preparse(struct key_preparsed_payload *prep)
        }
 
        pr_devel("Cert Key Algo: %s\n", pkey_algo_name[cert->pub->pkey_algo]);
-       pr_devel("Cert Valid From: %04ld-%02d-%02d %02d:%02d:%02d\n",
-                cert->valid_from.tm_year + 1900, cert->valid_from.tm_mon + 1,
-                cert->valid_from.tm_mday, cert->valid_from.tm_hour,
-                cert->valid_from.tm_min,  cert->valid_from.tm_sec);
-       pr_devel("Cert Valid To: %04ld-%02d-%02d %02d:%02d:%02d\n",
-                cert->valid_to.tm_year + 1900, cert->valid_to.tm_mon + 1,
-                cert->valid_to.tm_mday, cert->valid_to.tm_hour,
-                cert->valid_to.tm_min,  cert->valid_to.tm_sec);
+       pr_devel("Cert Valid period: %lld-%lld\n", cert->valid_from, cert->valid_to);
        pr_devel("Cert Signature: %s + %s\n",
                 pkey_algo_name[cert->sig.pkey_algo],
                 hash_algo_name[cert->sig.pkey_hash_algo]);
@@ -287,8 +311,9 @@ static int x509_key_preparse(struct key_preparsed_payload *prep)
        cert->pub->id_type = PKEY_ID_X509;
 
        /* Check the signature on the key if it appears to be self-signed */
-       if (!cert->authority ||
-           asymmetric_key_id_same(cert->skid, cert->authority)) {
+       if ((!cert->akid_skid && !cert->akid_id) ||
+           asymmetric_key_id_same(cert->skid, cert->akid_skid) ||
+           asymmetric_key_id_same(cert->id, cert->akid_id)) {
                ret = x509_check_signature(cert->pub, cert); /* self-signed */
                if (ret < 0)
                        goto error_free_cert;
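
A hedged sketch of the lookup documented for x509_request_asymmetric_key(): prefer the issuer plus serialNumber identifier and fall back to the subjectKeyIdentifier, supplying at least one of the two (the function dereferences @skid when @id is NULL). The keyring and certificate variables are placeholders.

	struct key *key;

	key = x509_request_asymmetric_key(trust_keyring,
					  cert->akid_id,	/* issuer + serialNumber, may be NULL */
					  cert->akid_skid,	/* subjectKeyIdentifier, may be NULL */
					  false);		/* exact match */
	if (IS_ERR(key))
		return PTR_ERR(key);
	/* ... use the signer's key ... */
	key_put(key);
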
index 35c2de13697182ba22190ea6d0c05d46c6b905be..fa18753f5c344de0eba58d9c9c1296255f4178c7 100644 (file)
@@ -940,6 +940,7 @@ static int __test_skcipher(struct crypto_skcipher *tfm, int enc,
        char *xbuf[XBUFSIZE];
        char *xoutbuf[XBUFSIZE];
        int ret = -ENOMEM;
+       unsigned int ivsize = crypto_skcipher_ivsize(tfm);
 
        if (testmgr_alloc_buf(xbuf))
                goto out_nobuf;
@@ -975,7 +976,7 @@ static int __test_skcipher(struct crypto_skcipher *tfm, int enc,
                        continue;
 
                if (template[i].iv)
-                       memcpy(iv, template[i].iv, MAX_IVLEN);
+                       memcpy(iv, template[i].iv, ivsize);
                else
                        memset(iv, 0, MAX_IVLEN);
 
@@ -1051,7 +1052,7 @@ static int __test_skcipher(struct crypto_skcipher *tfm, int enc,
                        continue;
 
                if (template[i].iv)
-                       memcpy(iv, template[i].iv, MAX_IVLEN);
+                       memcpy(iv, template[i].iv, ivsize);
                else
                        memset(iv, 0, MAX_IVLEN);
 
index 54e9729f9634de085d9b9dabf8cf79174b34cdfd..5d1015c26ff4cf9bb20adc92da1cb0f3921c3a14 100644 (file)
@@ -417,6 +417,7 @@ config ACPI_NFIT
        tristate "ACPI NVDIMM Firmware Interface Table (NFIT)"
        depends on PHYS_ADDR_T_64BIT
        depends on BLK_DEV
+       depends on ARCH_HAS_MMIO_FLUSH
        select LIBNVDIMM
        help
          Infrastructure to probe ACPI 6 compliant platforms for
index fb765524cc3d5acc754a60427533f1a15fb78b69..c58940b231d69df0c135bd5f821dbef64ef5755c 100644 (file)
@@ -19,8 +19,6 @@ static const struct acpi_device_id acpi_pnp_device_ids[] = {
        {"PNP0600"},            /* Generic ESDI/IDE/ATA compatible hard disk controller */
        /* floppy */
        {"PNP0700"},
-       /* ipmi_si */
-       {"IPI0001"},
        /* tpm_inf_pnp */
        {"IFX0101"},            /* Infineon TPMs */
        {"IFX0102"},            /* Infineon TPMs */
index cf0fd96a76021778f38a24916b415e02731bfaf0..c1b8d03e262eeeedf21d24f58b528acefe827abb 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/sort.h>
 #include <linux/pmem.h>
 #include <linux/io.h>
+#include <asm/cacheflush.h>
 #include "nfit.h"
 
 /*
@@ -764,9 +765,7 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
        struct acpi_device *adev, *adev_dimm;
        struct device *dev = acpi_desc->dev;
        const u8 *uuid = to_nfit_uuid(NFIT_DEV_DIMM);
-       unsigned long long sta;
-       int i, rc = -ENODEV;
-       acpi_status status;
+       int i;
 
        nfit_mem->dsm_mask = acpi_desc->dimm_dsm_force_en;
        adev = to_acpi_dev(acpi_desc);
@@ -781,25 +780,11 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
                return force_enable_dimms ? 0 : -ENODEV;
        }
 
-       status = acpi_evaluate_integer(adev_dimm->handle, "_STA", NULL, &sta);
-       if (status == AE_NOT_FOUND) {
-               dev_dbg(dev, "%s missing _STA, assuming enabled...\n",
-                               dev_name(&adev_dimm->dev));
-               rc = 0;
-       } else if (ACPI_FAILURE(status))
-               dev_err(dev, "%s failed to retrieve_STA, disabling...\n",
-                               dev_name(&adev_dimm->dev));
-       else if ((sta & ACPI_STA_DEVICE_ENABLED) == 0)
-               dev_info(dev, "%s disabled by firmware\n",
-                               dev_name(&adev_dimm->dev));
-       else
-               rc = 0;
-
        for (i = ND_CMD_SMART; i <= ND_CMD_VENDOR; i++)
                if (acpi_check_dsm(adev_dimm->handle, uuid, 1, 1ULL << i))
                        set_bit(i, &nfit_mem->dsm_mask);
 
-       return force_enable_dimms ? 0 : rc;
+       return 0;
 }
 
 static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc)
@@ -868,6 +853,7 @@ static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc)
        struct acpi_device *adev;
        int i;
 
+       nd_desc->dsm_mask = acpi_desc->bus_dsm_force_en;
        adev = to_acpi_dev(acpi_desc);
        if (!adev)
                return;
@@ -1032,7 +1018,7 @@ static u32 read_blk_stat(struct nfit_blk *nfit_blk, unsigned int bw)
        if (mmio->num_lines)
                offset = to_interleave_offset(offset, mmio);
 
-       return readl(mmio->base + offset);
+       return readl(mmio->addr.base + offset);
 }
 
 static void write_blk_ctl(struct nfit_blk *nfit_blk, unsigned int bw,
@@ -1057,11 +1043,11 @@ static void write_blk_ctl(struct nfit_blk *nfit_blk, unsigned int bw,
        if (mmio->num_lines)
                offset = to_interleave_offset(offset, mmio);
 
-       writeq(cmd, mmio->base + offset);
+       writeq(cmd, mmio->addr.base + offset);
        wmb_blk(nfit_blk);
 
        if (nfit_blk->dimm_flags & ND_BLK_DCR_LATCH)
-               readq(mmio->base + offset);
+               readq(mmio->addr.base + offset);
 }
 
 static int acpi_nfit_blk_single_io(struct nfit_blk *nfit_blk,
@@ -1093,11 +1079,16 @@ static int acpi_nfit_blk_single_io(struct nfit_blk *nfit_blk,
                }
 
                if (rw)
-                       memcpy_to_pmem(mmio->aperture + offset,
+                       memcpy_to_pmem(mmio->addr.aperture + offset,
                                        iobuf + copied, c);
-               else
+               else {
+                       if (nfit_blk->dimm_flags & ND_BLK_READ_FLUSH)
+                               mmio_flush_range((void __force *)
+                                       mmio->addr.aperture + offset, c);
+
                        memcpy_from_pmem(iobuf + copied,
-                                       mmio->aperture + offset, c);
+                                       mmio->addr.aperture + offset, c);
+               }
 
                copied += c;
                len -= c;
@@ -1144,7 +1135,10 @@ static void nfit_spa_mapping_release(struct kref *kref)
 
        WARN_ON(!mutex_is_locked(&acpi_desc->spa_map_mutex));
        dev_dbg(acpi_desc->dev, "%s: SPA%d\n", __func__, spa->range_index);
-       iounmap(spa_map->iomem);
+       if (spa_map->type == SPA_MAP_APERTURE)
+               memunmap((void __force *)spa_map->addr.aperture);
+       else
+               iounmap(spa_map->addr.base);
        release_mem_region(spa->address, spa->length);
        list_del(&spa_map->list);
        kfree(spa_map);
@@ -1190,7 +1184,7 @@ static void __iomem *__nfit_spa_map(struct acpi_nfit_desc *acpi_desc,
        spa_map = find_spa_mapping(acpi_desc, spa);
        if (spa_map) {
                kref_get(&spa_map->kref);
-               return spa_map->iomem;
+               return spa_map->addr.base;
        }
 
        spa_map = kzalloc(sizeof(*spa_map), GFP_KERNEL);
@@ -1206,20 +1200,19 @@ static void __iomem *__nfit_spa_map(struct acpi_nfit_desc *acpi_desc,
        if (!res)
                goto err_mem;
 
-       if (type == SPA_MAP_APERTURE) {
-               /*
-                * TODO: memremap_pmem() support, but that requires cache
-                * flushing when the aperture is moved.
-                */
-               spa_map->iomem = ioremap_wc(start, n);
-       } else
-               spa_map->iomem = ioremap_nocache(start, n);
+       spa_map->type = type;
+       if (type == SPA_MAP_APERTURE)
+               spa_map->addr.aperture = (void __pmem *)memremap(start, n,
+                                                       ARCH_MEMREMAP_PMEM);
+       else
+               spa_map->addr.base = ioremap_nocache(start, n);
+
 
-       if (!spa_map->iomem)
+       if (!spa_map->addr.base)
                goto err_map;
 
        list_add_tail(&spa_map->list, &acpi_desc->spa_maps);
-       return spa_map->iomem;
+       return spa_map->addr.base;
 
  err_map:
        release_mem_region(start, n);
@@ -1282,7 +1275,7 @@ static int acpi_nfit_blk_get_flags(struct nvdimm_bus_descriptor *nd_desc,
                nfit_blk->dimm_flags = flags.flags;
        else if (rc == -ENOTTY) {
                /* fall back to a conservative default */
-               nfit_blk->dimm_flags = ND_BLK_DCR_LATCH;
+               nfit_blk->dimm_flags = ND_BLK_DCR_LATCH | ND_BLK_READ_FLUSH;
                rc = 0;
        } else
                rc = -ENXIO;
@@ -1322,9 +1315,9 @@ static int acpi_nfit_blk_region_enable(struct nvdimm_bus *nvdimm_bus,
        /* map block aperture memory */
        nfit_blk->bdw_offset = nfit_mem->bdw->offset;
        mmio = &nfit_blk->mmio[BDW];
-       mmio->base = nfit_spa_map(acpi_desc, nfit_mem->spa_bdw,
+       mmio->addr.base = nfit_spa_map(acpi_desc, nfit_mem->spa_bdw,
                        SPA_MAP_APERTURE);
-       if (!mmio->base) {
+       if (!mmio->addr.base) {
                dev_dbg(dev, "%s: %s failed to map bdw\n", __func__,
                                nvdimm_name(nvdimm));
                return -ENOMEM;
@@ -1345,9 +1338,9 @@ static int acpi_nfit_blk_region_enable(struct nvdimm_bus *nvdimm_bus,
        nfit_blk->cmd_offset = nfit_mem->dcr->command_offset;
        nfit_blk->stat_offset = nfit_mem->dcr->status_offset;
        mmio = &nfit_blk->mmio[DCR];
-       mmio->base = nfit_spa_map(acpi_desc, nfit_mem->spa_dcr,
+       mmio->addr.base = nfit_spa_map(acpi_desc, nfit_mem->spa_dcr,
                        SPA_MAP_CONTROL);
-       if (!mmio->base) {
+       if (!mmio->addr.base) {
                dev_dbg(dev, "%s: %s failed to map dcr\n", __func__,
                                nvdimm_name(nvdimm));
                return -ENOMEM;
@@ -1379,7 +1372,7 @@ static int acpi_nfit_blk_region_enable(struct nvdimm_bus *nvdimm_bus,
                        return -ENOMEM;
        }
 
-       if (!arch_has_pmem_api() && !nfit_blk->nvdimm_flush)
+       if (!arch_has_wmb_pmem() && !nfit_blk->nvdimm_flush)
                dev_warn(dev, "unable to guarantee persistence of writes\n");
 
        if (mmio->line_size == 0)
@@ -1414,7 +1407,7 @@ static void acpi_nfit_blk_region_disable(struct nvdimm_bus *nvdimm_bus,
        for (i = 0; i < 2; i++) {
                struct nfit_blk_mmio *mmio = &nfit_blk->mmio[i];
 
-               if (mmio->base)
+               if (mmio->addr.base)
                        nfit_spa_unmap(acpi_desc, mmio->spa);
        }
        nd_blk_region_set_provider_data(ndbr, NULL);
index 79b6d83875c1de1aed4bc67e091a4d2e3aa88204..7e740156b9c2996986e0283e2bdb5d84512929ee 100644 (file)
@@ -41,6 +41,7 @@ enum nfit_uuids {
 };
 
 enum {
+       ND_BLK_READ_FLUSH = 1,
        ND_BLK_DCR_LATCH = 2,
 };
 
@@ -107,6 +108,7 @@ struct acpi_nfit_desc {
        struct nvdimm_bus *nvdimm_bus;
        struct device *dev;
        unsigned long dimm_dsm_force_en;
+       unsigned long bus_dsm_force_en;
        int (*blk_do_io)(struct nd_blk_region *ndbr, resource_size_t dpa,
                        void *iobuf, u64 len, int rw);
 };
@@ -116,12 +118,16 @@ enum nd_blk_mmio_selector {
        DCR,
 };
 
+struct nd_blk_addr {
+       union {
+               void __iomem *base;
+               void __pmem  *aperture;
+       };
+};
+
 struct nfit_blk {
        struct nfit_blk_mmio {
-               union {
-                       void __iomem *base;
-                       void __pmem  *aperture;
-               };
+               struct nd_blk_addr addr;
                u64 size;
                u64 base_offset;
                u32 line_size;
@@ -148,7 +154,8 @@ struct nfit_spa_mapping {
        struct acpi_nfit_system_address *spa;
        struct list_head list;
        struct kref kref;
-       void __iomem *iomem;
+       enum spa_map_type type;
+       struct nd_blk_addr addr;
 };
 
 static inline struct nfit_spa_mapping *to_spa_map(struct kref *kref)
index fc28b9f5aa84b0ac80dfe45551558d325011f4e6..30d8518b25fbcfdb0ed11121c8725fb229ca17ba 100644 (file)
@@ -525,8 +525,7 @@ static void acpi_thermal_check(void *data)
 
 /* sys I/F for generic thermal sysfs support */
 
-static int thermal_get_temp(struct thermal_zone_device *thermal,
-                           unsigned long *temp)
+static int thermal_get_temp(struct thermal_zone_device *thermal, int *temp)
 {
        struct acpi_thermal *tz = thermal->devdata;
        int result;
@@ -633,7 +632,7 @@ static int thermal_get_trip_type(struct thermal_zone_device *thermal,
 }
 
 static int thermal_get_trip_temp(struct thermal_zone_device *thermal,
-                                int trip, unsigned long *temp)
+                                int trip, int *temp)
 {
        struct acpi_thermal *tz = thermal->devdata;
        int i;
@@ -686,7 +685,8 @@ static int thermal_get_trip_temp(struct thermal_zone_device *thermal,
 }
 
 static int thermal_get_crit_temp(struct thermal_zone_device *thermal,
-                               unsigned long *temperature) {
+                               int *temperature)
+{
        struct acpi_thermal *tz = thermal->devdata;
 
        if (tz->trips.critical.flags.valid) {
@@ -709,8 +709,8 @@ static int thermal_get_trend(struct thermal_zone_device *thermal,
                return -EINVAL;
 
        if (type == THERMAL_TRIP_ACTIVE) {
-               unsigned long trip_temp;
-               unsigned long temp = DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET(
+               int trip_temp;
+               int temp = DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET(
                                        tz->temperature, tz->kelvin_offset);
                if (thermal_get_trip_temp(thermal, trip, &trip_temp))
                        return -EINVAL;
index 6607f3c6ace1033fd4ca449d579bd2b7638e3963..a39e85f9efa98854768f39b01502274401673957 100644 (file)
@@ -2834,7 +2834,7 @@ static int binder_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        return VM_FAULT_SIGBUS;
 }
 
-static struct vm_operations_struct binder_vm_ops = {
+static const struct vm_operations_struct binder_vm_ops = {
        .open = binder_vma_open,
        .close = binder_vma_close,
        .fault = binder_vm_fault,
index 416720159e96c44291d4ab30be36eb487c0da68e..16550c63d611ad8e484ccdec663ca1cc7142e144 100644 (file)
@@ -212,6 +212,18 @@ static int genpd_power_off(struct generic_pm_domain *genpd, bool timed)
        return ret;
 }
 
+/**
+ * genpd_queue_power_off_work - Queue up the execution of pm_genpd_poweroff().
+ * @genpd: PM domain to power off.
+ *
+ * Queue up the execution of pm_genpd_poweroff() unless it's already been done
+ * before.
+ */
+static void genpd_queue_power_off_work(struct generic_pm_domain *genpd)
+{
+       queue_work(pm_wq, &genpd->power_off_work);
+}
+
 /**
  * __pm_genpd_poweron - Restore power to a given PM domain and its masters.
  * @genpd: PM domain to power up.
@@ -259,8 +271,12 @@ static int __pm_genpd_poweron(struct generic_pm_domain *genpd)
        return 0;
 
  err:
-       list_for_each_entry_continue_reverse(link, &genpd->slave_links, slave_node)
+       list_for_each_entry_continue_reverse(link,
+                                       &genpd->slave_links,
+                                       slave_node) {
                genpd_sd_counter_dec(link->master);
+               genpd_queue_power_off_work(link->master);
+       }
 
        return ret;
 }
@@ -348,18 +364,6 @@ static int genpd_dev_pm_qos_notifier(struct notifier_block *nb,
        return NOTIFY_DONE;
 }
 
-/**
- * genpd_queue_power_off_work - Queue up the execution of pm_genpd_poweroff().
- * @genpd: PM domait to power off.
- *
- * Queue up the execution of pm_genpd_poweroff() unless it's already been done
- * before.
- */
-static void genpd_queue_power_off_work(struct generic_pm_domain *genpd)
-{
-       queue_work(pm_wq, &genpd->power_off_work);
-}
-
 /**
  * pm_genpd_poweroff - Remove power from a given PM domain.
  * @genpd: PM domain to power down.
@@ -1469,6 +1473,13 @@ int pm_genpd_remove_subdomain(struct generic_pm_domain *genpd,
 
        mutex_lock(&genpd->lock);
 
+       if (!list_empty(&subdomain->slave_links) || subdomain->device_count) {
+               pr_warn("%s: unable to remove subdomain %s\n", genpd->name,
+                       subdomain->name);
+               ret = -EBUSY;
+               goto out;
+       }
+
        list_for_each_entry(link, &genpd->master_links, master_node) {
                if (link->slave != subdomain)
                        continue;
@@ -1487,6 +1498,7 @@ int pm_genpd_remove_subdomain(struct generic_pm_domain *genpd,
                break;
        }
 
+out:
        mutex_unlock(&genpd->lock);
 
        return ret;
index eb254497a4944454650fb220e4967f5110cec737..28cd75c535b047f2c4276fed4308fa8159491007 100644 (file)
@@ -340,6 +340,34 @@ unsigned long dev_pm_opp_get_max_clock_latency(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_get_max_clock_latency);
 
+/**
+ * dev_pm_opp_get_suspend_opp() - Get suspend opp
+ * @dev:       device for which we do this operation
+ *
+ * Return: This function returns pointer to the suspend opp if it is
+ * defined and available, otherwise it returns NULL.
+ *
+ * Locking: This function must be called under rcu_read_lock().  The returned
+ * opp is an RCU-protected pointer, so it is only valid for use with
+ * dev_pm_opp_get_{voltage,freq}() while the RCU read-side lock is held; any
+ * value of interest must be extracted before calling rcu_read_unlock() to
+ * maintain the integrity of the pointer.
+ */
+struct dev_pm_opp *dev_pm_opp_get_suspend_opp(struct device *dev)
+{
+       struct device_opp *dev_opp;
+
+       opp_rcu_lockdep_assert();
+
+       dev_opp = _find_device_opp(dev);
+       if (IS_ERR(dev_opp) || !dev_opp->suspend_opp ||
+           !dev_opp->suspend_opp->available)
+               return NULL;
+
+       return dev_opp->suspend_opp;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_get_suspend_opp);
+
 /**
  * dev_pm_opp_get_opp_count() - Get number of opps available in the opp list
  * @dev:       device for which we do this operation
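
A hedged sketch of a dev_pm_opp_get_suspend_opp() caller observing the locking rule above: the OPP pointer is only valid inside the RCU read-side critical section, so extract what you need before unlocking. The device pointer is a placeholder.

	struct dev_pm_opp *opp;
	unsigned long suspend_freq = 0;

	rcu_read_lock();
	opp = dev_pm_opp_get_suspend_opp(dev);
	if (opp)
		suspend_freq = dev_pm_opp_get_freq(opp);
	rcu_read_unlock();
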
index ff03f2348f77efd8cea40d071a15735d42de400f..2d75366c61e03c7188e2839ec1e8d727d3365966 100644 (file)
@@ -611,13 +611,15 @@ static void *device_get_mac_addr(struct device *dev,
 */
 void *device_get_mac_address(struct device *dev, char *addr, int alen)
 {
-       addr = device_get_mac_addr(dev, "mac-address", addr, alen);
-       if (addr)
-               return addr;
+       char *res;
 
-       addr = device_get_mac_addr(dev, "local-mac-address", addr, alen);
-       if (addr)
-               return addr;
+       res = device_get_mac_addr(dev, "mac-address", addr, alen);
+       if (res)
+               return res;
+
+       res = device_get_mac_addr(dev, "local-mac-address", addr, alen);
+       if (res)
+               return res;
 
        return device_get_mac_addr(dev, "address", addr, alen);
 }
index 873ddf91c9d3ec742ceb1c3a9959ac322adc16db..cc557886ab2377a550c1ae529b6ecee23380161f 100644 (file)
@@ -139,11 +139,17 @@ struct regmap {
        struct reg_sequence *patch;
        int patch_regs;
 
-       /* if set, converts bulk rw to single rw */
-       bool use_single_rw;
+       /* if set, converts bulk read to single read */
+       bool use_single_read;
+       /* if set, converts bulk write to single write */
+       bool use_single_write;
        /* if set, the device supports multi write mode */
        bool can_multi_write;
 
+       /* if set, raw reads/writes are limited to this size */
+       size_t max_raw_read;
+       size_t max_raw_write;
+
        struct rb_root range_tree;
        void *selector_work_buf;        /* Scratch buffer used for selector */
 };
index b9862d741a56210885638b3cfc9ce8c92f435254..6f8a13ec32a410838d9c702af143dbdcbf0fff0c 100644 (file)
@@ -729,7 +729,7 @@ int regcache_sync_block(struct regmap *map, void *block,
                        unsigned int block_base, unsigned int start,
                        unsigned int end)
 {
-       if (regmap_can_raw_write(map) && !map->use_single_rw)
+       if (regmap_can_raw_write(map) && !map->use_single_write)
                return regcache_sync_block_raw(map, block, cache_present,
                                               block_base, start, end);
        else
index 8d304e2a943d3c62776267534a13483375d3a200..c03ebfd4c731477ac1efb17fa3b7b15615481dfe 100644 (file)
@@ -78,37 +78,24 @@ static const struct regmap_bus ac97_regmap_bus = {
        .reg_read = regmap_ac97_reg_read,
 };
 
-/**
- * regmap_init_ac97(): Initialise AC'97 register map
- *
- * @ac97: Device that will be interacted with
- * @config: Configuration for register map
- *
- * The return value will be an ERR_PTR() on error or a valid pointer to
- * a struct regmap.
- */
-struct regmap *regmap_init_ac97(struct snd_ac97 *ac97,
-                               const struct regmap_config *config)
+struct regmap *__regmap_init_ac97(struct snd_ac97 *ac97,
+                                 const struct regmap_config *config,
+                                 struct lock_class_key *lock_key,
+                                 const char *lock_name)
 {
-       return regmap_init(&ac97->dev, &ac97_regmap_bus, ac97, config);
+       return __regmap_init(&ac97->dev, &ac97_regmap_bus, ac97, config,
+                            lock_key, lock_name);
 }
-EXPORT_SYMBOL_GPL(regmap_init_ac97);
+EXPORT_SYMBOL_GPL(__regmap_init_ac97);
 
-/**
- * devm_regmap_init_ac97(): Initialise AC'97 register map
- *
- * @ac97: Device that will be interacted with
- * @config: Configuration for register map
- *
- * The return value will be an ERR_PTR() on error or a valid pointer
- * to a struct regmap.  The regmap will be automatically freed by the
- * device management code.
- */
-struct regmap *devm_regmap_init_ac97(struct snd_ac97 *ac97,
-                                    const struct regmap_config *config)
+struct regmap *__devm_regmap_init_ac97(struct snd_ac97 *ac97,
+                                      const struct regmap_config *config,
+                                      struct lock_class_key *lock_key,
+                                      const char *lock_name)
 {
-       return devm_regmap_init(&ac97->dev, &ac97_regmap_bus, ac97, config);
+       return __devm_regmap_init(&ac97->dev, &ac97_regmap_bus, ac97, config,
+                                 lock_key, lock_name);
 }
-EXPORT_SYMBOL_GPL(devm_regmap_init_ac97);
+EXPORT_SYMBOL_GPL(__devm_regmap_init_ac97);
 
 MODULE_LICENSE("GPL v2");
index 5799a0b9e6cc4168168bef46d864fbbe0db4020e..f42f2bac646623fc1db767bae3a5fff0ecf98aac 100644 (file)
@@ -469,6 +469,87 @@ static const struct file_operations regmap_access_fops = {
        .llseek = default_llseek,
 };
 
+static ssize_t regmap_cache_only_write_file(struct file *file,
+                                           const char __user *user_buf,
+                                           size_t count, loff_t *ppos)
+{
+       struct regmap *map = container_of(file->private_data,
+                                         struct regmap, cache_only);
+       ssize_t result;
+       bool was_enabled, require_sync = false;
+       int err;
+
+       map->lock(map->lock_arg);
+
+       was_enabled = map->cache_only;
+
+       result = debugfs_write_file_bool(file, user_buf, count, ppos);
+       if (result < 0) {
+               map->unlock(map->lock_arg);
+               return result;
+       }
+
+       if (map->cache_only && !was_enabled) {
+               dev_warn(map->dev, "debugfs cache_only=Y forced\n");
+               add_taint(TAINT_USER, LOCKDEP_STILL_OK);
+       } else if (!map->cache_only && was_enabled) {
+               dev_warn(map->dev, "debugfs cache_only=N forced: syncing cache\n");
+               require_sync = true;
+       }
+
+       map->unlock(map->lock_arg);
+
+       if (require_sync) {
+               err = regcache_sync(map);
+               if (err)
+                       dev_err(map->dev, "Failed to sync cache %d\n", err);
+       }
+
+       return result;
+}
+
+static const struct file_operations regmap_cache_only_fops = {
+       .open = simple_open,
+       .read = debugfs_read_file_bool,
+       .write = regmap_cache_only_write_file,
+};
+
+static ssize_t regmap_cache_bypass_write_file(struct file *file,
+                                             const char __user *user_buf,
+                                             size_t count, loff_t *ppos)
+{
+       struct regmap *map = container_of(file->private_data,
+                                         struct regmap, cache_bypass);
+       ssize_t result;
+       bool was_enabled;
+
+       map->lock(map->lock_arg);
+
+       was_enabled = map->cache_bypass;
+
+       result = debugfs_write_file_bool(file, user_buf, count, ppos);
+       if (result < 0)
+               goto out;
+
+       if (map->cache_bypass && !was_enabled) {
+               dev_warn(map->dev, "debugfs cache_bypass=Y forced\n");
+               add_taint(TAINT_USER, LOCKDEP_STILL_OK);
+       } else if (!map->cache_bypass && was_enabled) {
+               dev_warn(map->dev, "debugfs cache_bypass=N forced\n");
+       }
+
+out:
+       map->unlock(map->lock_arg);
+
+       return result;
+}
+
+static const struct file_operations regmap_cache_bypass_fops = {
+       .open = simple_open,
+       .read = debugfs_read_file_bool,
+       .write = regmap_cache_bypass_write_file,
+};
+
 void regmap_debugfs_init(struct regmap *map, const char *name)
 {
        struct rb_node *next;
@@ -518,10 +599,11 @@ void regmap_debugfs_init(struct regmap *map, const char *name)
        if (map->max_register || regmap_readable(map, 0)) {
                umode_t registers_mode;
 
-               if (IS_ENABLED(REGMAP_ALLOW_WRITE_DEBUGFS))
-                       registers_mode = 0600;
-               else
-                       registers_mode = 0400;
+#if defined(REGMAP_ALLOW_WRITE_DEBUGFS)
+               registers_mode = 0600;
+#else
+               registers_mode = 0400;
+#endif
 
                debugfs_create_file("registers", registers_mode, map->debugfs,
                                    map, &regmap_map_fops);
@@ -530,12 +612,13 @@ void regmap_debugfs_init(struct regmap *map, const char *name)
        }
 
        if (map->cache_type) {
-               debugfs_create_bool("cache_only", 0400, map->debugfs,
-                                   &map->cache_only);
+               debugfs_create_file("cache_only", 0600, map->debugfs,
+                                   &map->cache_only, &regmap_cache_only_fops);
                debugfs_create_bool("cache_dirty", 0400, map->debugfs,
                                    &map->cache_dirty);
-               debugfs_create_bool("cache_bypass", 0400, map->debugfs,
-                                   &map->cache_bypass);
+               debugfs_create_file("cache_bypass", 0600, map->debugfs,
+                                   &map->cache_bypass,
+                                   &regmap_cache_bypass_fops);
        }
 
        next = rb_first(&map->range_tree);
index 4b76e33110a2d1adb14e661e290f2fddccfe6b42..1a8ec3b2b60181be46beaeb765f530bc9a2f7a83 100644 (file)
@@ -209,11 +209,60 @@ static struct regmap_bus regmap_i2c = {
        .val_format_endian_default = REGMAP_ENDIAN_BIG,
 };
 
+static int regmap_i2c_smbus_i2c_write(void *context, const void *data,
+                                     size_t count)
+{
+       struct device *dev = context;
+       struct i2c_client *i2c = to_i2c_client(dev);
+
+       if (count < 1)
+               return -EINVAL;
+       if (count >= I2C_SMBUS_BLOCK_MAX)
+               return -E2BIG;
+
+       --count;
+       return i2c_smbus_write_i2c_block_data(i2c, ((u8 *)data)[0], count,
+                                             ((u8 *)data + 1));
+}
+
+static int regmap_i2c_smbus_i2c_read(void *context, const void *reg,
+                                    size_t reg_size, void *val,
+                                    size_t val_size)
+{
+       struct device *dev = context;
+       struct i2c_client *i2c = to_i2c_client(dev);
+       int ret;
+
+       if (reg_size != 1 || val_size < 1)
+               return -EINVAL;
+       if (val_size >= I2C_SMBUS_BLOCK_MAX)
+               return -E2BIG;
+
+       ret = i2c_smbus_read_i2c_block_data(i2c, ((u8 *)reg)[0], val_size, val);
+       if (ret == val_size)
+               return 0;
+       else if (ret < 0)
+               return ret;
+       else
+               return -EIO;
+}
+
+static struct regmap_bus regmap_i2c_smbus_i2c_block = {
+       .write = regmap_i2c_smbus_i2c_write,
+       .read = regmap_i2c_smbus_i2c_read,
+       .max_raw_read = I2C_SMBUS_BLOCK_MAX,
+       .max_raw_write = I2C_SMBUS_BLOCK_MAX,
+};
+
 static const struct regmap_bus *regmap_get_i2c_bus(struct i2c_client *i2c,
                                        const struct regmap_config *config)
 {
        if (i2c_check_functionality(i2c->adapter, I2C_FUNC_I2C))
                return &regmap_i2c;
+       else if (config->reg_bits == 8 &&
+                i2c_check_functionality(i2c->adapter,
+                                        I2C_FUNC_SMBUS_I2C_BLOCK))
+               return &regmap_i2c_smbus_i2c_block;
        else if (config->val_bits == 16 && config->reg_bits == 8 &&
                 i2c_check_functionality(i2c->adapter,
                                         I2C_FUNC_SMBUS_WORD_DATA))
@@ -233,47 +282,34 @@ static const struct regmap_bus *regmap_get_i2c_bus(struct i2c_client *i2c,
        return ERR_PTR(-ENOTSUPP);
 }
 
-/**
- * regmap_init_i2c(): Initialise register map
- *
- * @i2c: Device that will be interacted with
- * @config: Configuration for register map
- *
- * The return value will be an ERR_PTR() on error or a valid pointer to
- * a struct regmap.
- */
-struct regmap *regmap_init_i2c(struct i2c_client *i2c,
-                              const struct regmap_config *config)
+struct regmap *__regmap_init_i2c(struct i2c_client *i2c,
+                                const struct regmap_config *config,
+                                struct lock_class_key *lock_key,
+                                const char *lock_name)
 {
        const struct regmap_bus *bus = regmap_get_i2c_bus(i2c, config);
 
        if (IS_ERR(bus))
                return ERR_CAST(bus);
 
-       return regmap_init(&i2c->dev, bus, &i2c->dev, config);
+       return __regmap_init(&i2c->dev, bus, &i2c->dev, config,
+                            lock_key, lock_name);
 }
-EXPORT_SYMBOL_GPL(regmap_init_i2c);
+EXPORT_SYMBOL_GPL(__regmap_init_i2c);
 
-/**
- * devm_regmap_init_i2c(): Initialise managed register map
- *
- * @i2c: Device that will be interacted with
- * @config: Configuration for register map
- *
- * The return value will be an ERR_PTR() on error or a valid pointer
- * to a struct regmap.  The regmap will be automatically freed by the
- * device management code.
- */
-struct regmap *devm_regmap_init_i2c(struct i2c_client *i2c,
-                                   const struct regmap_config *config)
+struct regmap *__devm_regmap_init_i2c(struct i2c_client *i2c,
+                                     const struct regmap_config *config,
+                                     struct lock_class_key *lock_key,
+                                     const char *lock_name)
 {
        const struct regmap_bus *bus = regmap_get_i2c_bus(i2c, config);
 
        if (IS_ERR(bus))
                return ERR_CAST(bus);
 
-       return devm_regmap_init(&i2c->dev, bus, &i2c->dev, config);
+       return __devm_regmap_init(&i2c->dev, bus, &i2c->dev, config,
+                                 lock_key, lock_name);
 }
-EXPORT_SYMBOL_GPL(devm_regmap_init_i2c);
+EXPORT_SYMBOL_GPL(__devm_regmap_init_i2c);
 
 MODULE_LICENSE("GPL");
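
No driver change is needed to use the new SMBus I2C-block path: regmap_get_i2c_bus() above selects it automatically whenever the adapter lacks I2C_FUNC_I2C but advertises I2C_FUNC_SMBUS_I2C_BLOCK and the map uses 8-bit register addresses. A hedged sketch of a client that would transparently benefit (chip name, register layout and probe contents are illustrative only):

static const struct regmap_config foo_regmap_config = {
	.reg_bits = 8,		/* 8-bit addresses are required for this path */
	.val_bits = 8,
	.max_register = 0x7f,
};

static int foo_probe(struct i2c_client *i2c, const struct i2c_device_id *id)
{
	struct regmap *map;

	/* Works over plain I2C adapters and, with this change, also over
	 * SMBus-only adapters that support I2C-block transfers. */
	map = devm_regmap_init_i2c(i2c, &foo_regmap_config);
	if (IS_ERR(map))
		return PTR_ERR(map);

	return regmap_write(map, 0x00, 0x01);	/* illustrative init write */
}
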
index 2597600a5d26d8c115321a8688c871e29b66d2a4..38d1f72d869cf4ceb698067f588f81b96e0102df 100644 (file)
@@ -209,7 +209,7 @@ static irqreturn_t regmap_irq_thread(int irq, void *d)
         * Read in the statuses, using a single bulk read if possible
         * in order to reduce the I/O overheads.
         */
-       if (!map->use_single_rw && map->reg_stride == 1 &&
+       if (!map->use_single_read && map->reg_stride == 1 &&
            data->irq_reg_stride == 1) {
                u8 *buf8 = data->status_reg_buf;
                u16 *buf16 = data->status_reg_buf;
@@ -398,7 +398,7 @@ int regmap_add_irq_chip(struct regmap *map, int irq, int irq_flags,
        else
                d->irq_reg_stride = 1;
 
-       if (!map->use_single_rw && map->reg_stride == 1 &&
+       if (!map->use_single_read && map->reg_stride == 1 &&
            d->irq_reg_stride == 1) {
                d->status_reg_buf = kmalloc(map->format.val_bytes *
                                            chip->num_regs, GFP_KERNEL);
index 04a329a377e96b585dd074e2e16340d954e908a4..426a57e41ac76071e213ebd69d9d454c51f92172 100644 (file)
@@ -296,20 +296,11 @@ err_free:
        return ERR_PTR(ret);
 }
 
-/**
- * regmap_init_mmio_clk(): Initialise register map with register clock
- *
- * @dev: Device that will be interacted with
- * @clk_id: register clock consumer ID
- * @regs: Pointer to memory-mapped IO region
- * @config: Configuration for register map
- *
- * The return value will be an ERR_PTR() on error or a valid pointer to
- * a struct regmap.
- */
-struct regmap *regmap_init_mmio_clk(struct device *dev, const char *clk_id,
-                                   void __iomem *regs,
-                                   const struct regmap_config *config)
+struct regmap *__regmap_init_mmio_clk(struct device *dev, const char *clk_id,
+                                     void __iomem *regs,
+                                     const struct regmap_config *config,
+                                     struct lock_class_key *lock_key,
+                                     const char *lock_name)
 {
        struct regmap_mmio_context *ctx;
 
@@ -317,25 +308,17 @@ struct regmap *regmap_init_mmio_clk(struct device *dev, const char *clk_id,
        if (IS_ERR(ctx))
                return ERR_CAST(ctx);
 
-       return regmap_init(dev, &regmap_mmio, ctx, config);
+       return __regmap_init(dev, &regmap_mmio, ctx, config,
+                            lock_key, lock_name);
 }
-EXPORT_SYMBOL_GPL(regmap_init_mmio_clk);
-
-/**
- * devm_regmap_init_mmio_clk(): Initialise managed register map with clock
- *
- * @dev: Device that will be interacted with
- * @clk_id: register clock consumer ID
- * @regs: Pointer to memory-mapped IO region
- * @config: Configuration for register map
- *
- * The return value will be an ERR_PTR() on error or a valid pointer
- * to a struct regmap.  The regmap will be automatically freed by the
- * device management code.
- */
-struct regmap *devm_regmap_init_mmio_clk(struct device *dev, const char *clk_id,
-                                        void __iomem *regs,
-                                        const struct regmap_config *config)
+EXPORT_SYMBOL_GPL(__regmap_init_mmio_clk);
+
+struct regmap *__devm_regmap_init_mmio_clk(struct device *dev,
+                                          const char *clk_id,
+                                          void __iomem *regs,
+                                          const struct regmap_config *config,
+                                          struct lock_class_key *lock_key,
+                                          const char *lock_name)
 {
        struct regmap_mmio_context *ctx;
 
@@ -343,8 +326,9 @@ struct regmap *devm_regmap_init_mmio_clk(struct device *dev, const char *clk_id,
        if (IS_ERR(ctx))
                return ERR_CAST(ctx);
 
-       return devm_regmap_init(dev, &regmap_mmio, ctx, config);
+       return __devm_regmap_init(dev, &regmap_mmio, ctx, config,
+                                 lock_key, lock_name);
 }
-EXPORT_SYMBOL_GPL(devm_regmap_init_mmio_clk);
+EXPORT_SYMBOL_GPL(__devm_regmap_init_mmio_clk);
 
 MODULE_LICENSE("GPL v2");
index 53d1148e80a05e648d47ef9e0fedc03b0125a234..edd9a839d004dcf98ab46b15c9002c958ad6fe56 100644 (file)
@@ -113,37 +113,24 @@ static struct regmap_bus regmap_spi = {
        .val_format_endian_default = REGMAP_ENDIAN_BIG,
 };
 
-/**
- * regmap_init_spi(): Initialise register map
- *
- * @spi: Device that will be interacted with
- * @config: Configuration for register map
- *
- * The return value will be an ERR_PTR() on error or a valid pointer to
- * a struct regmap.
- */
-struct regmap *regmap_init_spi(struct spi_device *spi,
-                              const struct regmap_config *config)
+struct regmap *__regmap_init_spi(struct spi_device *spi,
+                                const struct regmap_config *config,
+                                struct lock_class_key *lock_key,
+                                const char *lock_name)
 {
-       return regmap_init(&spi->dev, &regmap_spi, &spi->dev, config);
+       return __regmap_init(&spi->dev, &regmap_spi, &spi->dev, config,
+                            lock_key, lock_name);
 }
-EXPORT_SYMBOL_GPL(regmap_init_spi);
+EXPORT_SYMBOL_GPL(__regmap_init_spi);
 
-/**
- * devm_regmap_init_spi(): Initialise register map
- *
- * @spi: Device that will be interacted with
- * @config: Configuration for register map
- *
- * The return value will be an ERR_PTR() on error or a valid pointer
- * to a struct regmap.  The map will be automatically freed by the
- * device management code.
- */
-struct regmap *devm_regmap_init_spi(struct spi_device *spi,
-                                   const struct regmap_config *config)
+struct regmap *__devm_regmap_init_spi(struct spi_device *spi,
+                                     const struct regmap_config *config,
+                                     struct lock_class_key *lock_key,
+                                     const char *lock_name)
 {
-       return devm_regmap_init(&spi->dev, &regmap_spi, &spi->dev, config);
+       return __devm_regmap_init(&spi->dev, &regmap_spi, &spi->dev, config,
+                                 lock_key, lock_name);
 }
-EXPORT_SYMBOL_GPL(devm_regmap_init_spi);
+EXPORT_SYMBOL_GPL(__devm_regmap_init_spi);
 
 MODULE_LICENSE("GPL");
index d7026dc33388aada6003886d14d8d73f264b56a9..7e58f656039900e633729828f5eae5fb1700b93a 100644 (file)
@@ -91,36 +91,25 @@ static struct regmap_bus regmap_spmi_base = {
        .val_format_endian_default      = REGMAP_ENDIAN_NATIVE,
 };
 
-/**
- * regmap_init_spmi_base(): Create regmap for the Base register space
- * @sdev:      SPMI device that will be interacted with
- * @config:    Configuration for register map
- *
- * The return value will be an ERR_PTR() on error or a valid pointer to
- * a struct regmap.
- */
-struct regmap *regmap_init_spmi_base(struct spmi_device *sdev,
-                                    const struct regmap_config *config)
+struct regmap *__regmap_init_spmi_base(struct spmi_device *sdev,
+                                      const struct regmap_config *config,
+                                      struct lock_class_key *lock_key,
+                                      const char *lock_name)
 {
-       return regmap_init(&sdev->dev, &regmap_spmi_base, sdev, config);
+       return __regmap_init(&sdev->dev, &regmap_spmi_base, sdev, config,
+                            lock_key, lock_name);
 }
-EXPORT_SYMBOL_GPL(regmap_init_spmi_base);
+EXPORT_SYMBOL_GPL(__regmap_init_spmi_base);
 
-/**
- * devm_regmap_init_spmi_base(): Create managed regmap for Base register space
- * @sdev:      SPMI device that will be interacted with
- * @config:    Configuration for register map
- *
- * The return value will be an ERR_PTR() on error or a valid pointer
- * to a struct regmap.  The regmap will be automatically freed by the
- * device management code.
- */
-struct regmap *devm_regmap_init_spmi_base(struct spmi_device *sdev,
-                                         const struct regmap_config *config)
+struct regmap *__devm_regmap_init_spmi_base(struct spmi_device *sdev,
+                                           const struct regmap_config *config,
+                                           struct lock_class_key *lock_key,
+                                           const char *lock_name)
 {
-       return devm_regmap_init(&sdev->dev, &regmap_spmi_base, sdev, config);
+       return __devm_regmap_init(&sdev->dev, &regmap_spmi_base, sdev, config,
+                                 lock_key, lock_name);
 }
-EXPORT_SYMBOL_GPL(devm_regmap_init_spmi_base);
+EXPORT_SYMBOL_GPL(__devm_regmap_init_spmi_base);
 
 static int regmap_spmi_ext_read(void *context,
                                const void *reg, size_t reg_size,
@@ -222,35 +211,24 @@ static struct regmap_bus regmap_spmi_ext = {
        .val_format_endian_default      = REGMAP_ENDIAN_NATIVE,
 };
 
-/**
- * regmap_init_spmi_ext(): Create regmap for Ext register space
- * @sdev:      Device that will be interacted with
- * @config:    Configuration for register map
- *
- * The return value will be an ERR_PTR() on error or a valid pointer to
- * a struct regmap.
- */
-struct regmap *regmap_init_spmi_ext(struct spmi_device *sdev,
-                                   const struct regmap_config *config)
+struct regmap *__regmap_init_spmi_ext(struct spmi_device *sdev,
+                                     const struct regmap_config *config,
+                                     struct lock_class_key *lock_key,
+                                     const char *lock_name)
 {
-       return regmap_init(&sdev->dev, &regmap_spmi_ext, sdev, config);
+       return __regmap_init(&sdev->dev, &regmap_spmi_ext, sdev, config,
+                            lock_key, lock_name);
 }
-EXPORT_SYMBOL_GPL(regmap_init_spmi_ext);
+EXPORT_SYMBOL_GPL(__regmap_init_spmi_ext);
 
-/**
- * devm_regmap_init_spmi_ext(): Create managed regmap for Ext register space
- * @sdev:      SPMI device that will be interacted with
- * @config:    Configuration for register map
- *
- * The return value will be an ERR_PTR() on error or a valid pointer
- * to a struct regmap.  The regmap will be automatically freed by the
- * device management code.
- */
-struct regmap *devm_regmap_init_spmi_ext(struct spmi_device *sdev,
-                                    const struct regmap_config *config)
+struct regmap *__devm_regmap_init_spmi_ext(struct spmi_device *sdev,
+                                          const struct regmap_config *config,
+                                          struct lock_class_key *lock_key,
+                                          const char *lock_name)
 {
-       return devm_regmap_init(&sdev->dev, &regmap_spmi_ext, sdev, config);
+       return __devm_regmap_init(&sdev->dev, &regmap_spmi_ext, sdev, config,
+                                 lock_key, lock_name);
 }
-EXPORT_SYMBOL_GPL(devm_regmap_init_spmi_ext);
+EXPORT_SYMBOL_GPL(__devm_regmap_init_spmi_ext);
 
 MODULE_LICENSE("GPL");
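
Every bus wrapper above now forwards a lock_class_key and a lock name into the core so that each regmap's internal mutex or spinlock gets its own lockdep class. The headers are expected to supply those per call site; a rough sketch of such a wrapper macro (an assumption for illustration, the real header may differ):

/* Sketch: a static key declared inside the macro is unique per call site,
 * so lockdep no longer conflates the locks of unrelated regmaps (for
 * example, a regmap created from within another regmap's IRQ handler). */
#define regmap_init_i2c_sketch(i2c, config)				\
({									\
	static struct lock_class_key _key;				\
	__regmap_init_i2c(i2c, config, &_key,				\
			  KBUILD_BASENAME ":" __stringify(__LINE__));	\
})
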
index 0a849eeaf952aefdca5fc7793fe339a5de4b96e2..afaf56200674a29517678af763bc9a084659224b 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/of.h>
 #include <linux/rbtree.h>
 #include <linux/sched.h>
+#include <linux/delay.h>
 
 #define CREATE_TRACE_POINTS
 #include "trace.h"
@@ -93,6 +94,9 @@ bool regmap_writeable(struct regmap *map, unsigned int reg)
 
 bool regmap_readable(struct regmap *map, unsigned int reg)
 {
+       if (!map->reg_read)
+               return false;
+
        if (map->max_register && reg > map->max_register)
                return false;
 
@@ -515,22 +519,12 @@ enum regmap_endian regmap_get_val_endian(struct device *dev,
 }
 EXPORT_SYMBOL_GPL(regmap_get_val_endian);
 
-/**
- * regmap_init(): Initialise register map
- *
- * @dev: Device that will be interacted with
- * @bus: Bus-specific callbacks to use with device
- * @bus_context: Data passed to bus-specific callbacks
- * @config: Configuration for register map
- *
- * The return value will be an ERR_PTR() on error or a valid pointer to
- * a struct regmap.  This function should generally not be called
- * directly, it should be called by bus-specific init functions.
- */
-struct regmap *regmap_init(struct device *dev,
-                          const struct regmap_bus *bus,
-                          void *bus_context,
-                          const struct regmap_config *config)
+struct regmap *__regmap_init(struct device *dev,
+                            const struct regmap_bus *bus,
+                            void *bus_context,
+                            const struct regmap_config *config,
+                            struct lock_class_key *lock_key,
+                            const char *lock_name)
 {
        struct regmap *map;
        int ret = -EINVAL;
@@ -556,10 +550,14 @@ struct regmap *regmap_init(struct device *dev,
                        spin_lock_init(&map->spinlock);
                        map->lock = regmap_lock_spinlock;
                        map->unlock = regmap_unlock_spinlock;
+                       lockdep_set_class_and_name(&map->spinlock,
+                                                  lock_key, lock_name);
                } else {
                        mutex_init(&map->mutex);
                        map->lock = regmap_lock_mutex;
                        map->unlock = regmap_unlock_mutex;
+                       lockdep_set_class_and_name(&map->mutex,
+                                                  lock_key, lock_name);
                }
                map->lock_arg = map;
        }
@@ -573,8 +571,13 @@ struct regmap *regmap_init(struct device *dev,
                map->reg_stride = config->reg_stride;
        else
                map->reg_stride = 1;
-       map->use_single_rw = config->use_single_rw;
-       map->can_multi_write = config->can_multi_write;
+       map->use_single_read = config->use_single_rw || !bus || !bus->read;
+       map->use_single_write = config->use_single_rw || !bus || !bus->write;
+       map->can_multi_write = config->can_multi_write && bus && bus->write;
+       if (bus) {
+               map->max_raw_read = bus->max_raw_read;
+               map->max_raw_write = bus->max_raw_write;
+       }
        map->dev = dev;
        map->bus = bus;
        map->bus_context = bus_context;
@@ -763,7 +766,7 @@ struct regmap *regmap_init(struct device *dev,
                if ((reg_endian != REGMAP_ENDIAN_BIG) ||
                    (val_endian != REGMAP_ENDIAN_BIG))
                        goto err_map;
-               map->use_single_rw = true;
+               map->use_single_write = true;
        }
 
        if (!map->format.format_write &&
@@ -899,30 +902,19 @@ err_map:
 err:
        return ERR_PTR(ret);
 }
-EXPORT_SYMBOL_GPL(regmap_init);
+EXPORT_SYMBOL_GPL(__regmap_init);
 
 static void devm_regmap_release(struct device *dev, void *res)
 {
        regmap_exit(*(struct regmap **)res);
 }
 
-/**
- * devm_regmap_init(): Initialise managed register map
- *
- * @dev: Device that will be interacted with
- * @bus: Bus-specific callbacks to use with device
- * @bus_context: Data passed to bus-specific callbacks
- * @config: Configuration for register map
- *
- * The return value will be an ERR_PTR() on error or a valid pointer
- * to a struct regmap.  This function should generally not be called
- * directly, it should be called by bus-specific init functions.  The
- * map will be automatically freed by the device management code.
- */
-struct regmap *devm_regmap_init(struct device *dev,
-                               const struct regmap_bus *bus,
-                               void *bus_context,
-                               const struct regmap_config *config)
+struct regmap *__devm_regmap_init(struct device *dev,
+                                 const struct regmap_bus *bus,
+                                 void *bus_context,
+                                 const struct regmap_config *config,
+                                 struct lock_class_key *lock_key,
+                                 const char *lock_name)
 {
        struct regmap **ptr, *regmap;
 
@@ -930,7 +922,8 @@ struct regmap *devm_regmap_init(struct device *dev,
        if (!ptr)
                return ERR_PTR(-ENOMEM);
 
-       regmap = regmap_init(dev, bus, bus_context, config);
+       regmap = __regmap_init(dev, bus, bus_context, config,
+                              lock_key, lock_name);
        if (!IS_ERR(regmap)) {
                *ptr = regmap;
                devres_add(dev, ptr);
@@ -940,7 +933,7 @@ struct regmap *devm_regmap_init(struct device *dev,
 
        return regmap;
 }
-EXPORT_SYMBOL_GPL(devm_regmap_init);
+EXPORT_SYMBOL_GPL(__devm_regmap_init);
 
 static void regmap_field_init(struct regmap_field *rm_field,
        struct regmap *regmap, struct reg_field reg_field)
@@ -1382,10 +1375,33 @@ int _regmap_raw_write(struct regmap *map, unsigned int reg,
  */
 bool regmap_can_raw_write(struct regmap *map)
 {
-       return map->bus && map->format.format_val && map->format.format_reg;
+       return map->bus && map->bus->write && map->format.format_val &&
+               map->format.format_reg;
 }
 EXPORT_SYMBOL_GPL(regmap_can_raw_write);
 
+/**
+ * regmap_get_raw_read_max - Get the maximum size we can read
+ *
+ * @map: Map to check.
+ */
+size_t regmap_get_raw_read_max(struct regmap *map)
+{
+       return map->max_raw_read;
+}
+EXPORT_SYMBOL_GPL(regmap_get_raw_read_max);
+
+/**
+ * regmap_get_raw_write_max - Get the maximum size we can write
+ *
+ * @map: Map to check.
+ */
+size_t regmap_get_raw_write_max(struct regmap *map)
+{
+       return map->max_raw_write;
+}
+EXPORT_SYMBOL_GPL(regmap_get_raw_write_max);
+
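
Callers that build their own transfer buffers can use the new getters to respect the bus limits; a hypothetical snippet:

	size_t max = regmap_get_raw_read_max(map);

	/* 0 means the bus advertises no limit; otherwise clamp the request. */
	if (max && len > max)
		len = max;
	ret = regmap_raw_read(map, reg, buf, len);
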
 static int _regmap_bus_formatted_write(void *context, unsigned int reg,
                                       unsigned int val)
 {
@@ -1555,6 +1571,8 @@ int regmap_raw_write(struct regmap *map, unsigned int reg,
                return -EINVAL;
        if (val_len % map->format.val_bytes)
                return -EINVAL;
+       if (map->max_raw_write && map->max_raw_write < val_len)
+               return -E2BIG;
 
        map->lock(map->lock_arg);
 
@@ -1681,6 +1699,7 @@ int regmap_bulk_write(struct regmap *map, unsigned int reg, const void *val,
 {
        int ret = 0, i;
        size_t val_bytes = map->format.val_bytes;
+       size_t total_size = val_bytes * val_count;
 
        if (map->bus && !map->format.parse_inplace)
                return -EINVAL;
@@ -1689,9 +1708,15 @@ int regmap_bulk_write(struct regmap *map, unsigned int reg, const void *val,
 
        /*
         * Some devices don't support bulk write; for
-        * them we have a series of single write operations.
+        * them we have a series of single write operations in the first two if
+        * blocks.
+        *
+        * The first if block is used for memory-mapped I/O. It does not allow,
+        * for example, a val_bytes of 3.
+        * The second one is used for busses which do not have this limitation
+        * and can write arbitrary value lengths.
         */
-       if (!map->bus || map->use_single_rw) {
+       if (!map->bus) {
                map->lock(map->lock_arg);
                for (i = 0; i < val_count; i++) {
                        unsigned int ival;
@@ -1723,6 +1748,38 @@ int regmap_bulk_write(struct regmap *map, unsigned int reg, const void *val,
                }
 out:
                map->unlock(map->lock_arg);
+       } else if (map->use_single_write ||
+                  (map->max_raw_write && map->max_raw_write < total_size)) {
+               int chunk_stride = map->reg_stride;
+               size_t chunk_size = val_bytes;
+               size_t chunk_count = val_count;
+
+               if (!map->use_single_write) {
+                       chunk_size = map->max_raw_write;
+                       if (chunk_size % val_bytes)
+                               chunk_size -= chunk_size % val_bytes;
+                       chunk_count = total_size / chunk_size;
+                       chunk_stride *= chunk_size / val_bytes;
+               }
+
+               map->lock(map->lock_arg);
+               /* Write as many bytes as possible with chunk_size */
+               for (i = 0; i < chunk_count; i++) {
+                       ret = _regmap_raw_write(map,
+                                               reg + (i * chunk_stride),
+                                               val + (i * chunk_size),
+                                               chunk_size);
+                       if (ret)
+                               break;
+               }
+
+               /* Write remaining bytes */
+               if (!ret && chunk_size * i < total_size) {
+                       ret = _regmap_raw_write(map, reg + (i * chunk_stride),
+                                               val + (i * chunk_size),
+                                               total_size - i * chunk_size);
+               }
+               map->unlock(map->lock_arg);
        } else {
                void *wval;
 
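
As a worked example of the chunking above, assume illustrative numbers of val_bytes = 2, val_count = 100 and max_raw_write = 33; the loop then issues six 32-byte writes followed by one 8-byte tail write:

	/* Illustration of the chunk arithmetic in regmap_bulk_write(). */
	size_t val_bytes = 2, val_count = 100;
	size_t total_size = val_bytes * val_count;	/* 200 bytes */
	size_t max_raw_write = 33;			/* example bus limit */
	size_t chunk_size = max_raw_write;

	chunk_size -= chunk_size % val_bytes;		/* 33 -> 32 bytes */
	/* chunk_count = 200 / 32 = 6 full chunks, remainder = 8 bytes,
	 * chunk_stride = reg_stride * (32 / 2) = 16 registers per chunk. */
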
@@ -1752,7 +1809,7 @@ EXPORT_SYMBOL_GPL(regmap_bulk_write);
  *
  * the (register,newvalue) pairs in regs have not been formatted, but
  * they are all in the same page and have been changed to being page
- * relative. The page register has been written if that was neccessary.
+ * relative. The page register has been written if that was necessary.
  */
 static int _regmap_raw_multi_reg_write(struct regmap *map,
                                       const struct reg_sequence *regs,
@@ -1780,8 +1837,8 @@ static int _regmap_raw_multi_reg_write(struct regmap *map,
        u8 = buf;
 
        for (i = 0; i < num_regs; i++) {
-               int reg = regs[i].reg;
-               int val = regs[i].def;
+               unsigned int reg = regs[i].reg;
+               unsigned int val = regs[i].def;
                trace_regmap_hw_write_start(map, reg, 1);
                map->format.format_reg(u8, reg, map->reg_shift);
                u8 += reg_bytes + pad_bytes;
@@ -1819,10 +1876,12 @@ static int _regmap_range_multi_paged_reg_write(struct regmap *map,
        int i, n;
        struct reg_sequence *base;
        unsigned int this_page = 0;
+       unsigned int page_change = 0;
        /*
         * the set of registers is not necessarily in order, but
         * since the order of write must be preserved this algorithm
-        * chops the set each time the page changes
+        * chops the set each time the page changes. This also applies
+        * if there is a delay required at any point in the sequence.
         */
        base = regs;
        for (i = 0, n = 0; i < num_regs; i++, n++) {
@@ -1838,16 +1897,48 @@ static int _regmap_range_multi_paged_reg_write(struct regmap *map,
                                this_page = win_page;
                        if (win_page != this_page) {
                                this_page = win_page;
+                               page_change = 1;
+                       }
+               }
+
+               /* If we have both a page change and a delay, make sure to
+                * write the regs and apply the delay before we change the
+                * page.
+                */
+
+               if (page_change || regs[i].delay_us) {
+
+                               /* For situations where the first write requires
+                                * a delay, we need to make sure we don't call
+                                * raw_multi_reg_write with n=0.
+                                * This can't occur with page breaks, as we
+                                * never write on the first iteration.
+                                */
+                               if (regs[i].delay_us && i == 0)
+                                       n = 1;
+
                                ret = _regmap_raw_multi_reg_write(map, base, n);
                                if (ret != 0)
                                        return ret;
+
+                               if (regs[i].delay_us)
+                                       udelay(regs[i].delay_us);
+
                                base += n;
                                n = 0;
-                       }
-                       ret = _regmap_select_page(map, &base[n].reg, range, 1);
-                       if (ret != 0)
-                               return ret;
+
+                               if (page_change) {
+                                       ret = _regmap_select_page(map,
+                                                                 &base[n].reg,
+                                                                 range, 1);
+                                       if (ret != 0)
+                                               return ret;
+
+                                       page_change = 0;
+                               }
+
                }
+
        }
        if (n > 0)
                return _regmap_raw_multi_reg_write(map, base, n);
@@ -1866,6 +1957,9 @@ static int _regmap_multi_reg_write(struct regmap *map,
                        ret = _regmap_write(map, regs[i].reg, regs[i].def);
                        if (ret != 0)
                                return ret;
+
+                       if (regs[i].delay_us)
+                               udelay(regs[i].delay_us);
                }
                return 0;
        }
@@ -1905,8 +1999,12 @@ static int _regmap_multi_reg_write(struct regmap *map,
        for (i = 0; i < num_regs; i++) {
                unsigned int reg = regs[i].reg;
                struct regmap_range_node *range;
+
+       /* Coalesce all the writes between page breaks or delays
+                * in a sequence
+                */
                range = _regmap_range_lookup(map, reg);
-               if (range) {
+               if (range || regs[i].delay_us) {
                        size_t len = sizeof(struct reg_sequence)*num_regs;
                        struct reg_sequence *base = kmemdup(regs, len,
                                                           GFP_KERNEL);
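
In practice this means a reg_sequence entry may now carry a delay_us; the writes queued so far are flushed and the delay is applied (via udelay()) before the sequence continues. A hypothetical init sequence, with made-up registers and values:

static const struct reg_sequence foo_init_seq[] = {
	{ 0x00, 0x01 },		/* soft reset (illustrative) */
	{ 0x01, 0x80, 250 },	/* enable PLL, then wait 250 us before
				 * the next write in the sequence */
	{ 0x02, 0x3c },
};

	/* in probe(), for example: */
	ret = regmap_multi_reg_write(map, foo_init_seq,
				     ARRAY_SIZE(foo_init_seq));
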
@@ -2062,7 +2160,7 @@ static int _regmap_raw_read(struct regmap *map, unsigned int reg, void *val,
 
        /*
         * Some buses or devices flag reads by setting the high bits in the
-        * register addresss; since it's always the high bits for all
+        * register address; since it's always the high bits for all
         * current formats we can do this here rather than in
         * formatting.  This may break if we get interesting formats.
         */
@@ -2109,8 +2207,6 @@ static int _regmap_read(struct regmap *map, unsigned int reg,
        int ret;
        void *context = _regmap_map_get_context(map);
 
-       WARN_ON(!map->reg_read);
-
        if (!map->cache_bypass) {
                ret = regcache_read(map, reg, val);
                if (ret == 0)
@@ -2191,11 +2287,22 @@ int regmap_raw_read(struct regmap *map, unsigned int reg, void *val,
                return -EINVAL;
        if (reg % map->reg_stride)
                return -EINVAL;
+       if (val_count == 0)
+               return -EINVAL;
 
        map->lock(map->lock_arg);
 
        if (regmap_volatile_range(map, reg, val_count) || map->cache_bypass ||
            map->cache_type == REGCACHE_NONE) {
+               if (!map->bus->read) {
+                       ret = -ENOTSUPP;
+                       goto out;
+               }
+               if (map->max_raw_read && map->max_raw_read < val_len) {
+                       ret = -E2BIG;
+                       goto out;
+               }
+
                /* Physical block read if there's no cache involved */
                ret = _regmap_raw_read(map, reg, val, val_len);
 
@@ -2304,20 +2411,51 @@ int regmap_bulk_read(struct regmap *map, unsigned int reg, void *val,
                 * Some devices do not support bulk read, for
                 * them we have a series of single read operations.
                 */
-               if (map->use_single_rw) {
-                       for (i = 0; i < val_count; i++) {
-                               ret = regmap_raw_read(map,
-                                               reg + (i * map->reg_stride),
-                                               val + (i * val_bytes),
-                                               val_bytes);
-                               if (ret != 0)
-                                       return ret;
-                       }
-               } else {
+               size_t total_size = val_bytes * val_count;
+
+               if (!map->use_single_read &&
+                   (!map->max_raw_read || map->max_raw_read > total_size)) {
                        ret = regmap_raw_read(map, reg, val,
                                              val_bytes * val_count);
                        if (ret != 0)
                                return ret;
+               } else {
+                       /*
+                        * Some devices do not support bulk read or do not
+                        * support large bulk reads; for them we have a series
+                        * of read operations.
+                        */
+                       int chunk_stride = map->reg_stride;
+                       size_t chunk_size = val_bytes;
+                       size_t chunk_count = val_count;
+
+                       if (!map->use_single_read) {
+                               chunk_size = map->max_raw_read;
+                               if (chunk_size % val_bytes)
+                                       chunk_size -= chunk_size % val_bytes;
+                               chunk_count = total_size / chunk_size;
+                               chunk_stride *= chunk_size / val_bytes;
+                       }
+
+                       /* Read bytes that fit into a multiple of chunk_size */
+                       for (i = 0; i < chunk_count; i++) {
+                               ret = regmap_raw_read(map,
+                                                     reg + (i * chunk_stride),
+                                                     val + (i * chunk_size),
+                                                     chunk_size);
+                               if (ret != 0)
+                                       return ret;
+                       }
+
+                       /* Read remaining bytes */
+                       if (chunk_size * i < total_size) {
+                               ret = regmap_raw_read(map,
+                                                     reg + (i * chunk_stride),
+                                                     val + (i * chunk_size),
+                                                     total_size - i * chunk_size);
+                               if (ret != 0)
+                                       return ret;
+                       }
                }
 
                for (i = 0; i < val_count * val_bytes; i += val_bytes)
@@ -2329,7 +2467,34 @@ int regmap_bulk_read(struct regmap *map, unsigned int reg, void *val,
                                          &ival);
                        if (ret != 0)
                                return ret;
-                       map->format.format_val(val + (i * val_bytes), ival, 0);
+
+                       if (map->format.format_val) {
+                               map->format.format_val(val + (i * val_bytes), ival, 0);
+                       } else {
+                               /* Devices providing read and write
+                                * operations can use the bulk I/O
+                                * functions if they define a val_bytes;
+                                * we assume that the values are native
+                                * endian.
+                                */
+                               u32 *u32 = val;
+                               u16 *u16 = val;
+                               u8 *u8 = val;
+
+                               switch (map->format.val_bytes) {
+                               case 4:
+                                       u32[i] = ival;
+                                       break;
+                               case 2:
+                                       u16[i] = ival;
+                                       break;
+                               case 1:
+                                       u8[i] = ival;
+                                       break;
+                               default:
+                                       return -EINVAL;
+                               }
+                       }
                }
        }
 
index f9ab74505e69370f5214c261b0428a51f3e9e4c3..b9794aeeb878cc7a054be313063191399b717159 100644 (file)
@@ -374,7 +374,7 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector,
 
 #ifdef CONFIG_BLK_DEV_RAM_DAX
 static long brd_direct_access(struct block_device *bdev, sector_t sector,
-                       void **kaddr, unsigned long *pfn, long size)
+                       void __pmem **kaddr, unsigned long *pfn)
 {
        struct brd_device *brd = bdev->bd_disk->private_data;
        struct page *page;
@@ -384,13 +384,9 @@ static long brd_direct_access(struct block_device *bdev, sector_t sector,
        page = brd_insert_page(brd, sector);
        if (!page)
                return -ENOSPC;
-       *kaddr = page_address(page);
+       *kaddr = (void __pmem *)page_address(page);
        *pfn = page_to_pfn(page);
 
-       /*
-        * TODO: If size > PAGE_SIZE, we could look to see if the next page in
-        * the file happens to be mapped to the next page of physical RAM.
-        */
        return PAGE_SIZE;
 }
 #else
index 698f761037ce54a6c94be1aeaf0a6179e4c9735b..d93a0372b37b5c7b4cb214e7013e64897c3a9aba 100644 (file)
@@ -4673,7 +4673,10 @@ static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
        }
 
        ret = rbd_dev_v2_snap_context(rbd_dev);
-       dout("rbd_dev_v2_snap_context returned %d\n", ret);
+       if (ret && first_time) {
+               kfree(rbd_dev->header.object_prefix);
+               rbd_dev->header.object_prefix = NULL;
+       }
 
        return ret;
 }
@@ -5154,7 +5157,6 @@ static int rbd_dev_probe_parent(struct rbd_device *rbd_dev)
 out_err:
        if (parent) {
                rbd_dev_unparent(rbd_dev);
-               kfree(rbd_dev->header_name);
                rbd_dev_destroy(parent);
        } else {
                rbd_put_client(rbdc);
index d4d05f064d390772a2f99acbf882eaa983788511..e93899cc6f60be0bd13b45dde3b8d697b7a733c8 100644 (file)
@@ -478,8 +478,7 @@ static int virtblk_get_cache_mode(struct virtio_device *vdev)
                                   struct virtio_blk_config, wce,
                                   &writeback);
        if (err)
-               writeback = virtio_has_feature(vdev, VIRTIO_BLK_F_WCE) ||
-                           virtio_has_feature(vdev, VIRTIO_F_VERSION_1);
+               writeback = virtio_has_feature(vdev, VIRTIO_BLK_F_WCE);
 
        return writeback;
 }
@@ -657,6 +656,7 @@ static int virtblk_probe(struct virtio_device *vdev)
        vblk->disk->private_data = vblk;
        vblk->disk->fops = &virtblk_fops;
        vblk->disk->driverfs_dev = &vdev->dev;
+       vblk->disk->flags |= GENHD_FL_EXT_DEVT;
        vblk->index = index;
 
        /* configure queue flush support */
@@ -840,7 +840,7 @@ static unsigned int features_legacy[] = {
 static unsigned int features[] = {
        VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
        VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE,
-       VIRTIO_BLK_F_TOPOLOGY,
+       VIRTIO_BLK_F_WCE, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE,
        VIRTIO_BLK_F_MQ,
 };
 
index 5f6b3be0a93cc0ba82c105f1c57a5b3ba4382aea..0823a96902f87fa90d2e35a425183ea0de2e0049 100644 (file)
@@ -37,6 +37,7 @@
 
 #include <linux/interrupt.h>
 #include <linux/blkdev.h>
+#include <linux/blk-mq.h>
 #include <linux/hdreg.h>
 #include <linux/cdrom.h>
 #include <linux/module.h>
@@ -147,6 +148,7 @@ struct blkfront_info
        unsigned int feature_persistent:1;
        unsigned int max_indirect_segments;
        int is_ready;
+       struct blk_mq_tag_set tag_set;
 };
 
 static unsigned int nr_minors;
@@ -247,7 +249,7 @@ static struct grant *get_grant(grant_ref_t *gref_head,
                                struct blkfront_info *info)
 {
        struct grant *gnt_list_entry;
-       unsigned long buffer_mfn;
+       unsigned long buffer_gfn;
 
        BUG_ON(list_empty(&info->grants));
        gnt_list_entry = list_first_entry(&info->grants, struct grant,
@@ -266,10 +268,10 @@ static struct grant *get_grant(grant_ref_t *gref_head,
                BUG_ON(!pfn);
                gnt_list_entry->pfn = pfn;
        }
-       buffer_mfn = pfn_to_mfn(gnt_list_entry->pfn);
+       buffer_gfn = pfn_to_gfn(gnt_list_entry->pfn);
        gnttab_grant_foreign_access_ref(gnt_list_entry->gref,
                                        info->xbdev->otherend_id,
-                                       buffer_mfn, 0);
+                                       buffer_gfn, 0);
        return gnt_list_entry;
 }
 
@@ -616,54 +618,41 @@ static inline bool blkif_request_flush_invalid(struct request *req,
                 !(info->feature_flush & REQ_FUA)));
 }
 
-/*
- * do_blkif_request
- *  read a block; request is in a request queue
- */
-static void do_blkif_request(struct request_queue *rq)
+static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx,
+                          const struct blk_mq_queue_data *qd)
 {
-       struct blkfront_info *info = NULL;
-       struct request *req;
-       int queued;
-
-       pr_debug("Entered do_blkif_request\n");
-
-       queued = 0;
+       struct blkfront_info *info = qd->rq->rq_disk->private_data;
 
-       while ((req = blk_peek_request(rq)) != NULL) {
-               info = req->rq_disk->private_data;
-
-               if (RING_FULL(&info->ring))
-                       goto wait;
+       blk_mq_start_request(qd->rq);
+       spin_lock_irq(&info->io_lock);
+       if (RING_FULL(&info->ring))
+               goto out_busy;
 
-               blk_start_request(req);
+       if (blkif_request_flush_invalid(qd->rq, info))
+               goto out_err;
 
-               if (blkif_request_flush_invalid(req, info)) {
-                       __blk_end_request_all(req, -EOPNOTSUPP);
-                       continue;
-               }
+       if (blkif_queue_request(qd->rq))
+               goto out_busy;
 
-               pr_debug("do_blk_req %p: cmd %p, sec %lx, "
-                        "(%u/%u) [%s]\n",
-                        req, req->cmd, (unsigned long)blk_rq_pos(req),
-                        blk_rq_cur_sectors(req), blk_rq_sectors(req),
-                        rq_data_dir(req) ? "write" : "read");
-
-               if (blkif_queue_request(req)) {
-                       blk_requeue_request(rq, req);
-wait:
-                       /* Avoid pointless unplugs. */
-                       blk_stop_queue(rq);
-                       break;
-               }
+       flush_requests(info);
+       spin_unlock_irq(&info->io_lock);
+       return BLK_MQ_RQ_QUEUE_OK;
 
-               queued++;
-       }
+out_err:
+       spin_unlock_irq(&info->io_lock);
+       return BLK_MQ_RQ_QUEUE_ERROR;
 
-       if (queued != 0)
-               flush_requests(info);
+out_busy:
+       spin_unlock_irq(&info->io_lock);
+       blk_mq_stop_hw_queue(hctx);
+       return BLK_MQ_RQ_QUEUE_BUSY;
 }
 
+static struct blk_mq_ops blkfront_mq_ops = {
+       .queue_rq = blkif_queue_rq,
+       .map_queue = blk_mq_map_queue,
+};
+
 static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
                                unsigned int physical_sector_size,
                                unsigned int segments)
@@ -671,9 +660,22 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
        struct request_queue *rq;
        struct blkfront_info *info = gd->private_data;
 
-       rq = blk_init_queue(do_blkif_request, &info->io_lock);
-       if (rq == NULL)
+       memset(&info->tag_set, 0, sizeof(info->tag_set));
+       info->tag_set.ops = &blkfront_mq_ops;
+       info->tag_set.nr_hw_queues = 1;
+       info->tag_set.queue_depth =  BLK_RING_SIZE(info);
+       info->tag_set.numa_node = NUMA_NO_NODE;
+       info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+       info->tag_set.cmd_size = 0;
+       info->tag_set.driver_data = info;
+
+       if (blk_mq_alloc_tag_set(&info->tag_set))
                return -1;
+       rq = blk_mq_init_queue(&info->tag_set);
+       if (IS_ERR(rq)) {
+               blk_mq_free_tag_set(&info->tag_set);
+               return -1;
+       }
 
        queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq);
 
@@ -901,19 +903,15 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
 static void xlvbd_release_gendisk(struct blkfront_info *info)
 {
        unsigned int minor, nr_minors;
-       unsigned long flags;
 
        if (info->rq == NULL)
                return;
 
-       spin_lock_irqsave(&info->io_lock, flags);
-
        /* No more blkif_request(). */
-       blk_stop_queue(info->rq);
+       blk_mq_stop_hw_queues(info->rq);
 
        /* No more gnttab callback work. */
        gnttab_cancel_free_callback(&info->callback);
-       spin_unlock_irqrestore(&info->io_lock, flags);
 
        /* Flush gnttab callback work. Must be done with no locks held. */
        flush_work(&info->work);
@@ -925,20 +923,18 @@ static void xlvbd_release_gendisk(struct blkfront_info *info)
        xlbd_release_minors(minor, nr_minors);
 
        blk_cleanup_queue(info->rq);
+       blk_mq_free_tag_set(&info->tag_set);
        info->rq = NULL;
 
        put_disk(info->gd);
        info->gd = NULL;
 }
 
+/* Must be called with io_lock held */
 static void kick_pending_request_queues(struct blkfront_info *info)
 {
-       if (!RING_FULL(&info->ring)) {
-               /* Re-enable calldowns. */
-               blk_start_queue(info->rq);
-               /* Kick things off immediately. */
-               do_blkif_request(info->rq);
-       }
+       if (!RING_FULL(&info->ring))
+               blk_mq_start_stopped_hw_queues(info->rq, true);
 }
 
 static void blkif_restart_queue(struct work_struct *work)
@@ -963,7 +959,7 @@ static void blkif_free(struct blkfront_info *info, int suspend)
                BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
        /* No more blkif_request(). */
        if (info->rq)
-               blk_stop_queue(info->rq);
+               blk_mq_stop_hw_queues(info->rq);
 
        /* Remove all persistent grants */
        if (!list_empty(&info->grants)) {
@@ -1146,7 +1142,6 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
        RING_IDX i, rp;
        unsigned long flags;
        struct blkfront_info *info = (struct blkfront_info *)dev_id;
-       int error;
 
        spin_lock_irqsave(&info->io_lock, flags);
 
@@ -1187,37 +1182,37 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
                        continue;
                }
 
-               error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
+               req->errors = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
                switch (bret->operation) {
                case BLKIF_OP_DISCARD:
                        if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
                                struct request_queue *rq = info->rq;
                                printk(KERN_WARNING "blkfront: %s: %s op failed\n",
                                           info->gd->disk_name, op_name(bret->operation));
-                               error = -EOPNOTSUPP;
+                               req->errors = -EOPNOTSUPP;
                                info->feature_discard = 0;
                                info->feature_secdiscard = 0;
                                queue_flag_clear(QUEUE_FLAG_DISCARD, rq);
                                queue_flag_clear(QUEUE_FLAG_SECDISCARD, rq);
                        }
-                       __blk_end_request_all(req, error);
+                       blk_mq_complete_request(req);
                        break;
                case BLKIF_OP_FLUSH_DISKCACHE:
                case BLKIF_OP_WRITE_BARRIER:
                        if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
                                printk(KERN_WARNING "blkfront: %s: %s op failed\n",
                                       info->gd->disk_name, op_name(bret->operation));
-                               error = -EOPNOTSUPP;
+                               req->errors = -EOPNOTSUPP;
                        }
                        if (unlikely(bret->status == BLKIF_RSP_ERROR &&
                                     info->shadow[id].req.u.rw.nr_segments == 0)) {
                                printk(KERN_WARNING "blkfront: %s: empty %s op failed\n",
                                       info->gd->disk_name, op_name(bret->operation));
-                               error = -EOPNOTSUPP;
+                               req->errors = -EOPNOTSUPP;
                        }
-                       if (unlikely(error)) {
-                               if (error == -EOPNOTSUPP)
-                                       error = 0;
+                       if (unlikely(req->errors)) {
+                               if (req->errors == -EOPNOTSUPP)
+                                       req->errors = 0;
                                info->feature_flush = 0;
                                xlvbd_flush(info);
                        }
@@ -1228,7 +1223,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
                                dev_dbg(&info->xbdev->dev, "Bad return from blkdev data "
                                        "request: %x\n", bret->status);
 
-                       __blk_end_request_all(req, error);
+                       blk_mq_complete_request(req);
                        break;
                default:
                        BUG();
@@ -1555,28 +1550,6 @@ static int blkif_recover(struct blkfront_info *info)
 
        kfree(copy);
 
-       /*
-        * Empty the queue, this is important because we might have
-        * requests in the queue with more segments than what we
-        * can handle now.
-        */
-       spin_lock_irq(&info->io_lock);
-       while ((req = blk_fetch_request(info->rq)) != NULL) {
-               if (req->cmd_flags &
-                   (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
-                       list_add(&req->queuelist, &requests);
-                       continue;
-               }
-               merge_bio.head = req->bio;
-               merge_bio.tail = req->biotail;
-               bio_list_merge(&bio_list, &merge_bio);
-               req->bio = NULL;
-               if (req->cmd_flags & (REQ_FLUSH | REQ_FUA))
-                       pr_alert("diskcache flush request found!\n");
-               __blk_end_request_all(req, 0);
-       }
-       spin_unlock_irq(&info->io_lock);
-
        xenbus_switch_state(info->xbdev, XenbusStateConnected);
 
        spin_lock_irq(&info->io_lock);
@@ -1591,9 +1564,10 @@ static int blkif_recover(struct blkfront_info *info)
                /* Requeue pending requests (flush or discard) */
                list_del_init(&req->queuelist);
                BUG_ON(req->nr_phys_segments > segs);
-               blk_requeue_request(info->rq, req);
+               blk_mq_requeue_request(req);
        }
        spin_unlock_irq(&info->io_lock);
+       blk_mq_kick_requeue_list(info->rq);
 
        while ((bio = bio_list_pop(&bio_list)) != NULL) {
                /* Traverse the list of pending bios and re-queue them */
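
For context, the blkfront changes above follow the standard blk-mq conversion pattern of this kernel generation: provide a blk_mq_ops with .queue_rq and .map_queue, fill in a blk_mq_tag_set, and create the queue with blk_mq_init_queue() instead of blk_init_queue(). Reduced to a skeleton (the submit helper and queue depth are illustrative, error handling omitted):

static int foo_queue_rq(struct blk_mq_hw_ctx *hctx,
			const struct blk_mq_queue_data *qd)
{
	blk_mq_start_request(qd->rq);
	if (foo_submit(qd->rq)) {		/* hypothetical submit helper */
		blk_mq_stop_hw_queue(hctx);	/* as blkfront does on a full ring */
		return BLK_MQ_RQ_QUEUE_BUSY;
	}
	return BLK_MQ_RQ_QUEUE_OK;
}

static struct blk_mq_ops foo_mq_ops = {
	.queue_rq  = foo_queue_rq,
	.map_queue = blk_mq_map_queue,		/* still required on 4.3-era kernels */
};

	/* queue setup, e.g. in probe: */
	tag_set.ops = &foo_mq_ops;
	tag_set.nr_hw_queues = 1;
	tag_set.queue_depth = 64;		/* illustrative */
	if (blk_mq_alloc_tag_set(&tag_set))
		return -ENOMEM;
	q = blk_mq_init_queue(&tag_set);
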
index 965d1afb0eaa72558a1c6c36869e791733ec7350..5cb13ca3a3acac2aa16a104e9731770066ff03de 100644 (file)
@@ -330,12 +330,14 @@ void zcomp_destroy(struct zcomp *comp)
  * allocate new zcomp and initialize it. return compressing
  * backend pointer or ERR_PTR if things went bad. ERR_PTR(-EINVAL)
  * if requested algorithm is not supported, ERR_PTR(-ENOMEM) in
- * case of allocation error.
+ * case of allocation error, or any other error potentially
+ * returned by functions zcomp_strm_{multi,single}_create.
  */
 struct zcomp *zcomp_create(const char *compress, int max_strm)
 {
        struct zcomp *comp;
        struct zcomp_backend *backend;
+       int error;
 
        backend = find_backend(compress);
        if (!backend)
@@ -347,12 +349,12 @@ struct zcomp *zcomp_create(const char *compress, int max_strm)
 
        comp->backend = backend;
        if (max_strm > 1)
-               zcomp_strm_multi_create(comp, max_strm);
+               error = zcomp_strm_multi_create(comp, max_strm);
        else
-               zcomp_strm_single_create(comp);
-       if (!comp->stream) {
+               error = zcomp_strm_single_create(comp);
+       if (error) {
                kfree(comp);
-               return ERR_PTR(-ENOMEM);
+               return ERR_PTR(error);
        }
        return comp;
 }
index 9c01f5bfa33fc9a0494a1868b0b6ca7dc79b50b9..9fa15bb9d118ee5ad2d9f34e23aab6777d09d105 100644 (file)
@@ -388,7 +388,6 @@ static ssize_t comp_algorithm_store(struct device *dev,
 static ssize_t compact_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t len)
 {
-       unsigned long nr_migrated;
        struct zram *zram = dev_to_zram(dev);
        struct zram_meta *meta;
 
@@ -399,8 +398,7 @@ static ssize_t compact_store(struct device *dev,
        }
 
        meta = zram->meta;
-       nr_migrated = zs_compact(meta->mem_pool);
-       atomic64_add(nr_migrated, &zram->stats.num_migrated);
+       zs_compact(meta->mem_pool);
        up_read(&zram->init_lock);
 
        return len;
@@ -428,26 +426,31 @@ static ssize_t mm_stat_show(struct device *dev,
                struct device_attribute *attr, char *buf)
 {
        struct zram *zram = dev_to_zram(dev);
+       struct zs_pool_stats pool_stats;
        u64 orig_size, mem_used = 0;
        long max_used;
        ssize_t ret;
 
+       memset(&pool_stats, 0x00, sizeof(struct zs_pool_stats));
+
        down_read(&zram->init_lock);
-       if (init_done(zram))
+       if (init_done(zram)) {
                mem_used = zs_get_total_pages(zram->meta->mem_pool);
+               zs_pool_stats(zram->meta->mem_pool, &pool_stats);
+       }
 
        orig_size = atomic64_read(&zram->stats.pages_stored);
        max_used = atomic_long_read(&zram->stats.max_used_pages);
 
        ret = scnprintf(buf, PAGE_SIZE,
-                       "%8llu %8llu %8llu %8lu %8ld %8llu %8llu\n",
+                       "%8llu %8llu %8llu %8lu %8ld %8llu %8lu\n",
                        orig_size << PAGE_SHIFT,
                        (u64)atomic64_read(&zram->stats.compr_data_size),
                        mem_used << PAGE_SHIFT,
                        zram->limit_pages << PAGE_SHIFT,
                        max_used << PAGE_SHIFT,
                        (u64)atomic64_read(&zram->stats.zero_pages),
-                       (u64)atomic64_read(&zram->stats.num_migrated));
+                       pool_stats.pages_compacted);
        up_read(&zram->init_lock);
 
        return ret;
@@ -619,7 +622,7 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
                uncmem = user_mem;
 
        if (!uncmem) {
-               pr_info("Unable to allocate temp memory\n");
+               pr_err("Unable to allocate temp memory\n");
                ret = -ENOMEM;
                goto out_cleanup;
        }
@@ -716,7 +719,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
 
        handle = zs_malloc(meta->mem_pool, clen);
        if (!handle) {
-               pr_info("Error allocating memory for compressed page: %u, size=%zu\n",
+               pr_err("Error allocating memory for compressed page: %u, size=%zu\n",
                        index, clen);
                ret = -ENOMEM;
                goto out;
@@ -1036,7 +1039,7 @@ static ssize_t disksize_store(struct device *dev,
 
        comp = zcomp_create(zram->compressor, zram->max_comp_streams);
        if (IS_ERR(comp)) {
-               pr_info("Cannot initialise %s compressing backend\n",
+               pr_err("Cannot initialise %s compressing backend\n",
                                zram->compressor);
                err = PTR_ERR(comp);
                goto out_free_meta;
@@ -1214,7 +1217,7 @@ static int zram_add(void)
        /* gendisk structure */
        zram->disk = alloc_disk(1);
        if (!zram->disk) {
-               pr_warn("Error allocating disk structure for device %d\n",
+               pr_err("Error allocating disk structure for device %d\n",
                        device_id);
                ret = -ENOMEM;
                goto out_free_queue;
@@ -1263,7 +1266,8 @@ static int zram_add(void)
        ret = sysfs_create_group(&disk_to_dev(zram->disk)->kobj,
                                &zram_disk_attr_group);
        if (ret < 0) {
-               pr_warn("Error creating sysfs group");
+               pr_err("Error creating sysfs group for device %d\n",
+                               device_id);
                goto out_free_disk;
        }
        strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor));
@@ -1403,13 +1407,13 @@ static int __init zram_init(void)
 
        ret = class_register(&zram_control_class);
        if (ret) {
-               pr_warn("Unable to register zram-control class\n");
+               pr_err("Unable to register zram-control class\n");
                return ret;
        }
 
        zram_major = register_blkdev(0, "zram");
        if (zram_major <= 0) {
-               pr_warn("Unable to get major number\n");
+               pr_err("Unable to get major number\n");
                class_unregister(&zram_control_class);
                return -EBUSY;
        }
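
The zram hunks above drop the driver-maintained num_migrated counter and instead report the pages_compacted value that zsmalloc returns through zs_pool_stats(), so the seventh column of mm_stat now comes straight from the allocator. A hedged userspace sketch of reading that column follows; it only assumes the seven-field format printed by mm_stat_show() in the hunk above and the conventional /sys/block/zram0/mm_stat path, nothing beyond this diff.

/* Sketch: print the pages_compacted column of zram's mm_stat. */
#include <stdio.h>

int main(void)
{
	unsigned long long orig, compr, used, limit, zero, compacted;
	long max_used;
	FILE *f = fopen("/sys/block/zram0/mm_stat", "r");

	if (!f)
		return 1;
	if (fscanf(f, "%llu %llu %llu %llu %ld %llu %llu",
		   &orig, &compr, &used, &limit, &max_used,
		   &zero, &compacted) == 7)
		printf("pages compacted: %llu\n", compacted);
	fclose(f);
	return 0;
}
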
index 6dbe2df506bf0bda2a3b02ca63f3bc8a0831e088..8e92339686d7467584220ff1805002953d5e6621 100644 (file)
@@ -78,7 +78,6 @@ struct zram_stats {
        atomic64_t compr_data_size;     /* compressed size of pages stored */
        atomic64_t num_reads;   /* failed + successful */
        atomic64_t num_writes;  /* --do-- */
-       atomic64_t num_migrated;        /* no. of migrated object */
        atomic64_t failed_reads;        /* can happen when memory is too low */
        atomic64_t failed_writes;       /* can happen when memory is too low */
        atomic64_t invalid_io;  /* non-page-aligned I/O requests */
index a64763b6b5fd1ee1e38418bd931f9d7e2066dc4a..6575c0fe6a4ea3a1e3bc1bae99010eefd9d7ff94 100644 (file)
@@ -107,7 +107,7 @@ struct regmap *devm_regmap_init_vexpress_config(struct device *dev)
        if (!res)
                return ERR_PTR(-ENOMEM);
 
-       regmap = bridge->ops->regmap_init(dev, bridge->context);
+       regmap = (bridge->ops->regmap_init)(dev, bridge->context);
        if (IS_ERR(regmap)) {
                devres_free(res);
                return regmap;
index 61e71616689b7c3bd5441230b5af30df79df837c..feafdab734ae20b268e3078bd71822ef382a408e 100644 (file)
@@ -694,7 +694,7 @@ static int bt_size(void)
        return sizeof(struct si_sm_data);
 }
 
-struct si_sm_handlers bt_smi_handlers = {
+const struct si_sm_handlers bt_smi_handlers = {
        .init_data              = bt_init_data,
        .start_transaction      = bt_start_transaction,
        .get_result             = bt_get_result,
index 8c25f596808a9a2b50d9f14b9a750b26fc079b74..1da61af7f576787ad2ab6ca5e11f24247c15ed3d 100644 (file)
@@ -540,7 +540,7 @@ static void kcs_cleanup(struct si_sm_data *kcs)
 {
 }
 
-struct si_sm_handlers kcs_smi_handlers = {
+const struct si_sm_handlers kcs_smi_handlers = {
        .init_data         = init_kcs_data,
        .start_transaction = start_kcs_transaction,
        .get_result        = get_kcs_result,
index bf75f63617731595d958765b7c1582f79452f3cb..e3536da05c88aaddf8ed803feb3b089e47053dbe 100644 (file)
@@ -342,7 +342,7 @@ struct ipmi_smi {
         * an unpreemptible region to use this.  You must fetch the
         * value into a local variable and make sure it is not NULL.
         */
-       struct ipmi_smi_handlers *handlers;
+       const struct ipmi_smi_handlers *handlers;
        void                     *send_info;
 
 #ifdef CONFIG_PROC_FS
@@ -744,7 +744,13 @@ static void deliver_response(struct ipmi_recv_msg *msg)
                        ipmi_inc_stat(intf, unhandled_local_responses);
                }
                ipmi_free_recv_msg(msg);
-       } else {
+       } else if (!oops_in_progress) {
+               /*
+                * If we are running in the panic context, calling the
+                * receive handler doesn't make much sense and carries a
+                * deadlock risk.  For now, simply skip it in that case.
+                */
+
                ipmi_user_t user = msg->user;
                user->handler->ipmi_recv_hndl(msg, user->handler_data);
        }
@@ -1015,7 +1021,7 @@ int ipmi_get_smi_info(int if_num, struct ipmi_smi_info *data)
 {
        int           rv = 0;
        ipmi_smi_t    intf;
-       struct ipmi_smi_handlers *handlers;
+       const struct ipmi_smi_handlers *handlers;
 
        mutex_lock(&ipmi_interfaces_mutex);
        list_for_each_entry_rcu(intf, &ipmi_interfaces, link) {
@@ -1501,7 +1507,7 @@ static struct ipmi_smi_msg *smi_add_send_msg(ipmi_smi_t intf,
 }
 
 
-static void smi_send(ipmi_smi_t intf, struct ipmi_smi_handlers *handlers,
+static void smi_send(ipmi_smi_t intf, const struct ipmi_smi_handlers *handlers,
                     struct ipmi_smi_msg *smi_msg, int priority)
 {
        int run_to_completion = intf->run_to_completion;
@@ -2747,7 +2753,7 @@ void ipmi_poll_interface(ipmi_user_t user)
 }
 EXPORT_SYMBOL(ipmi_poll_interface);
 
-int ipmi_register_smi(struct ipmi_smi_handlers *handlers,
+int ipmi_register_smi(const struct ipmi_smi_handlers *handlers,
                      void                     *send_info,
                      struct ipmi_device_id    *device_id,
                      struct device            *si_dev,
@@ -3959,6 +3965,10 @@ free_msg:
 
        if (!run_to_completion)
                spin_lock_irqsave(&intf->xmit_msgs_lock, flags);
+       /*
+        * We can get an asynchronous event or receive message in addition
+        * to commands we send.
+        */
        if (msg == intf->curr_msg)
                intf->curr_msg = NULL;
        if (!run_to_completion)
@@ -4015,7 +4025,7 @@ static void check_msg_timeout(ipmi_smi_t intf, struct seq_table *ent,
                              unsigned int *waiting_msgs)
 {
        struct ipmi_recv_msg     *msg;
-       struct ipmi_smi_handlers *handlers;
+       const struct ipmi_smi_handlers *handlers;
 
        if (intf->in_shutdown)
                return;
@@ -4082,7 +4092,7 @@ static void check_msg_timeout(ipmi_smi_t intf, struct seq_table *ent,
                                ipmi_inc_stat(intf,
                                              retransmitted_ipmb_commands);
 
-                       smi_send(intf, intf->handlers, smi_msg, 0);
+                       smi_send(intf, handlers, smi_msg, 0);
                } else
                        ipmi_free_smi_msg(smi_msg);
 
@@ -4291,6 +4301,9 @@ static void ipmi_panic_request_and_wait(ipmi_smi_t           intf,
                            0, 1); /* Don't retry, and don't wait. */
        if (rv)
                atomic_sub(2, &panic_done_count);
+       else if (intf->handlers->flush_messages)
+               intf->handlers->flush_messages(intf->send_info);
+
        while (atomic_read(&panic_done_count) != 0)
                ipmi_poll(intf);
 }
@@ -4364,9 +4377,7 @@ static void send_panic_events(char *str)
                        /* Interface is not ready. */
                        continue;
 
-               intf->run_to_completion = 1;
                /* Send the event announcing the panic. */
-               intf->handlers->set_run_to_completion(intf->send_info, 1);
                ipmi_panic_request_and_wait(intf, &addr, &msg);
        }
 
@@ -4506,6 +4517,23 @@ static int panic_event(struct notifier_block *this,
                        /* Interface is not ready. */
                        continue;
 
+               /*
+                * If we were interrupted while locking xmit_msgs_lock or
+                * waiting_rcv_msgs_lock, the corresponding list may be
+                * corrupted.  In this case, drop the items on the list
+                * for safety.
+                */
+               if (!spin_trylock(&intf->xmit_msgs_lock)) {
+                       INIT_LIST_HEAD(&intf->xmit_msgs);
+                       INIT_LIST_HEAD(&intf->hp_xmit_msgs);
+               } else
+                       spin_unlock(&intf->xmit_msgs_lock);
+
+               if (!spin_trylock(&intf->waiting_rcv_msgs_lock))
+                       INIT_LIST_HEAD(&intf->waiting_rcv_msgs);
+               else
+                       spin_unlock(&intf->waiting_rcv_msgs_lock);
+
                intf->run_to_completion = 1;
                intf->handlers->set_run_to_completion(intf->send_info, 1);
        }
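
The panic_event() hunk above relies on spin_trylock(): if a lock cannot be taken at panic time, its holder was interrupted mid-critical-section, so the protected list is reinitialised (dropping its items) rather than waited on. A minimal hedged sketch of that pattern follows; the helper name is invented and this is not the driver's actual code.

/* drop_list_if_locked() is a made-up name for illustration only. */
#include <linux/spinlock.h>
#include <linux/list.h>

static void drop_list_if_locked(spinlock_t *lock, struct list_head *list)
{
	if (!spin_trylock(lock))
		INIT_LIST_HEAD(list);	/* holder was interrupted; discard items */
	else
		spin_unlock(lock);	/* list is consistent, leave it alone */
}
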
index 9b409c0f14f7ccff38fa13f964c72e3d336666a1..6e658aa114f19da873b5d201d6c131b7350311cd 100644 (file)
@@ -143,8 +143,15 @@ static int ipmi_powernv_recv(struct ipmi_smi_powernv *smi)
        pr_devel("%s:   -> %d (size %lld)\n", __func__,
                        rc, rc == 0 ? size : 0);
        if (rc) {
+               /* If this came via the poll and the response was not yet ready */
+               if (rc == OPAL_EMPTY) {
+                       spin_unlock_irqrestore(&smi->msg_lock, flags);
+                       return 0;
+               }
+
+               smi->cur_msg = NULL;
                spin_unlock_irqrestore(&smi->msg_lock, flags);
-               ipmi_free_smi_msg(msg);
+               send_error_reply(smi, msg, IPMI_ERR_UNSPECIFIED);
                return 0;
        }
 
@@ -300,7 +307,6 @@ static const struct of_device_id ipmi_powernv_match[] = {
 static struct platform_driver powernv_ipmi_driver = {
        .driver = {
                .name           = "ipmi-powernv",
-               .owner          = THIS_MODULE,
                .of_match_table = ipmi_powernv_match,
        },
        .probe  = ipmi_powernv_probe,
index 8a45e92ff60c7483349cf1819b9d0ebc576ebd44..654f6f36a071c1411e21373d3cb53e00efadd4f6 100644 (file)
@@ -64,7 +64,6 @@
 #include <linux/dmi.h>
 #include <linux/string.h>
 #include <linux/ctype.h>
-#include <linux/pnp.h>
 #include <linux/of_device.h>
 #include <linux/of_platform.h>
 #include <linux/of_address.h>
@@ -164,7 +163,7 @@ struct smi_info {
        int                    intf_num;
        ipmi_smi_t             intf;
        struct si_sm_data      *si_sm;
-       struct si_sm_handlers  *handlers;
+       const struct si_sm_handlers *handlers;
        enum si_type           si_type;
        spinlock_t             si_lock;
        struct ipmi_smi_msg    *waiting_msg;
@@ -263,9 +262,21 @@ struct smi_info {
        bool supports_event_msg_buff;
 
        /*
-        * Can we clear the global enables receive irq bit?
+        * Can we disable the receive irq bit in the global enables?
+        * There are currently two forms of brokenness: some systems
+        * cannot disable the bit (which is technically within the
+        * spec but a bad idea) and some systems have the bit forced
+        * to zero even though interrupts work (which is clearly
+        * outside the spec).  The next bool tells which form of
+        * brokenness is present.
         */
-       bool cannot_clear_recv_irq_bit;
+       bool cannot_disable_irq;
+
+       /*
+        * Some systems are broken and cannot set the irq enable
+        * bit, even if they support interrupts.
+        */
+       bool irq_enable_broken;
 
        /*
         * Did we get an attention that we did not handle?
@@ -309,9 +320,6 @@ static int num_force_kipmid;
 #ifdef CONFIG_PCI
 static bool pci_registered;
 #endif
-#ifdef CONFIG_ACPI
-static bool pnp_registered;
-#endif
 #ifdef CONFIG_PARISC
 static bool parisc_registered;
 #endif
@@ -558,13 +566,14 @@ static u8 current_global_enables(struct smi_info *smi_info, u8 base,
        if (smi_info->supports_event_msg_buff)
                enables |= IPMI_BMC_EVT_MSG_BUFF;
 
-       if ((smi_info->irq && !smi_info->interrupt_disabled) ||
-           smi_info->cannot_clear_recv_irq_bit)
+       if (((smi_info->irq && !smi_info->interrupt_disabled) ||
+            smi_info->cannot_disable_irq) &&
+           !smi_info->irq_enable_broken)
                enables |= IPMI_BMC_RCV_MSG_INTR;
 
        if (smi_info->supports_event_msg_buff &&
-           smi_info->irq && !smi_info->interrupt_disabled)
-
+           smi_info->irq && !smi_info->interrupt_disabled &&
+           !smi_info->irq_enable_broken)
                enables |= IPMI_BMC_EVT_MSG_INTR;
 
        *irq_on = enables & (IPMI_BMC_EVT_MSG_INTR | IPMI_BMC_RCV_MSG_INTR);
@@ -928,33 +937,36 @@ static void check_start_timer_thread(struct smi_info *smi_info)
        }
 }
 
+static void flush_messages(void *send_info)
+{
+       struct smi_info *smi_info = send_info;
+       enum si_sm_result result;
+
+       /*
+        * Currently, this function is called only in run-to-completion
+        * mode.  This means we are single-threaded, so no locks are needed.
+        */
+       result = smi_event_handler(smi_info, 0);
+       while (result != SI_SM_IDLE) {
+               udelay(SI_SHORT_TIMEOUT_USEC);
+               result = smi_event_handler(smi_info, SI_SHORT_TIMEOUT_USEC);
+       }
+}
+
 static void sender(void                *send_info,
                   struct ipmi_smi_msg *msg)
 {
        struct smi_info   *smi_info = send_info;
-       enum si_sm_result result;
        unsigned long     flags;
 
        debug_timestamp("Enqueue");
 
        if (smi_info->run_to_completion) {
                /*
-                * If we are running to completion, start it and run
-                * transactions until everything is clear.
+                * If we are running to completion, start it.  Upper
+                * layer will call flush_messages to clear it out.
                 */
                smi_info->waiting_msg = msg;
-
-               /*
-                * Run to completion means we are single-threaded, no
-                * need for locks.
-                */
-
-               result = smi_event_handler(smi_info, 0);
-               while (result != SI_SM_IDLE) {
-                       udelay(SI_SHORT_TIMEOUT_USEC);
-                       result = smi_event_handler(smi_info,
-                                                  SI_SHORT_TIMEOUT_USEC);
-               }
                return;
        }
 
@@ -975,17 +987,10 @@ static void sender(void                *send_info,
 static void set_run_to_completion(void *send_info, bool i_run_to_completion)
 {
        struct smi_info   *smi_info = send_info;
-       enum si_sm_result result;
 
        smi_info->run_to_completion = i_run_to_completion;
-       if (i_run_to_completion) {
-               result = smi_event_handler(smi_info, 0);
-               while (result != SI_SM_IDLE) {
-                       udelay(SI_SHORT_TIMEOUT_USEC);
-                       result = smi_event_handler(smi_info,
-                                                  SI_SHORT_TIMEOUT_USEC);
-               }
-       }
+       if (i_run_to_completion)
+               flush_messages(smi_info);
 }
 
 /*
@@ -1258,7 +1263,7 @@ static void set_maintenance_mode(void *send_info, bool enable)
                atomic_set(&smi_info->req_events, 0);
 }
 
-static struct ipmi_smi_handlers handlers = {
+static const struct ipmi_smi_handlers handlers = {
        .owner                  = THIS_MODULE,
        .start_processing       = smi_start_processing,
        .get_smi_info           = get_smi_info,
@@ -1267,6 +1272,7 @@ static struct ipmi_smi_handlers handlers = {
        .set_need_watch         = set_need_watch,
        .set_maintenance_mode   = set_maintenance_mode,
        .set_run_to_completion  = set_run_to_completion,
+       .flush_messages         = flush_messages,
        .poll                   = poll,
 };
 
@@ -1283,14 +1289,14 @@ static int smi_num; /* Used to sequence the SMIs */
 #define DEFAULT_REGSIZE                1
 
 #ifdef CONFIG_ACPI
-static bool          si_tryacpi = 1;
+static bool          si_tryacpi = true;
 #endif
 #ifdef CONFIG_DMI
-static bool          si_trydmi = 1;
+static bool          si_trydmi = true;
 #endif
-static bool          si_tryplatform = 1;
+static bool          si_tryplatform = true;
 #ifdef CONFIG_PCI
-static bool          si_trypci = 1;
+static bool          si_trypci = true;
 #endif
 static bool          si_trydefaults = IS_ENABLED(CONFIG_IPMI_SI_PROBE_DEFAULTS);
 static char          *si_type[SI_MAX_PARMS];
@@ -1446,14 +1452,14 @@ static int std_irq_setup(struct smi_info *info)
        return rv;
 }
 
-static unsigned char port_inb(struct si_sm_io *io, unsigned int offset)
+static unsigned char port_inb(const struct si_sm_io *io, unsigned int offset)
 {
        unsigned int addr = io->addr_data;
 
        return inb(addr + (offset * io->regspacing));
 }
 
-static void port_outb(struct si_sm_io *io, unsigned int offset,
+static void port_outb(const struct si_sm_io *io, unsigned int offset,
                      unsigned char b)
 {
        unsigned int addr = io->addr_data;
@@ -1461,14 +1467,14 @@ static void port_outb(struct si_sm_io *io, unsigned int offset,
        outb(b, addr + (offset * io->regspacing));
 }
 
-static unsigned char port_inw(struct si_sm_io *io, unsigned int offset)
+static unsigned char port_inw(const struct si_sm_io *io, unsigned int offset)
 {
        unsigned int addr = io->addr_data;
 
        return (inw(addr + (offset * io->regspacing)) >> io->regshift) & 0xff;
 }
 
-static void port_outw(struct si_sm_io *io, unsigned int offset,
+static void port_outw(const struct si_sm_io *io, unsigned int offset,
                      unsigned char b)
 {
        unsigned int addr = io->addr_data;
@@ -1476,14 +1482,14 @@ static void port_outw(struct si_sm_io *io, unsigned int offset,
        outw(b << io->regshift, addr + (offset * io->regspacing));
 }
 
-static unsigned char port_inl(struct si_sm_io *io, unsigned int offset)
+static unsigned char port_inl(const struct si_sm_io *io, unsigned int offset)
 {
        unsigned int addr = io->addr_data;
 
        return (inl(addr + (offset * io->regspacing)) >> io->regshift) & 0xff;
 }
 
-static void port_outl(struct si_sm_io *io, unsigned int offset,
+static void port_outl(const struct si_sm_io *io, unsigned int offset,
                      unsigned char b)
 {
        unsigned int addr = io->addr_data;
@@ -1556,49 +1562,52 @@ static int port_setup(struct smi_info *info)
        return 0;
 }
 
-static unsigned char intf_mem_inb(struct si_sm_io *io, unsigned int offset)
+static unsigned char intf_mem_inb(const struct si_sm_io *io,
+                                 unsigned int offset)
 {
        return readb((io->addr)+(offset * io->regspacing));
 }
 
-static void intf_mem_outb(struct si_sm_io *io, unsigned int offset,
-                    unsigned char b)
+static void intf_mem_outb(const struct si_sm_io *io, unsigned int offset,
+                         unsigned char b)
 {
        writeb(b, (io->addr)+(offset * io->regspacing));
 }
 
-static unsigned char intf_mem_inw(struct si_sm_io *io, unsigned int offset)
+static unsigned char intf_mem_inw(const struct si_sm_io *io,
+                                 unsigned int offset)
 {
        return (readw((io->addr)+(offset * io->regspacing)) >> io->regshift)
                & 0xff;
 }
 
-static void intf_mem_outw(struct si_sm_io *io, unsigned int offset,
-                    unsigned char b)
+static void intf_mem_outw(const struct si_sm_io *io, unsigned int offset,
+                         unsigned char b)
 {
        writeb(b << io->regshift, (io->addr)+(offset * io->regspacing));
 }
 
-static unsigned char intf_mem_inl(struct si_sm_io *io, unsigned int offset)
+static unsigned char intf_mem_inl(const struct si_sm_io *io,
+                                 unsigned int offset)
 {
        return (readl((io->addr)+(offset * io->regspacing)) >> io->regshift)
                & 0xff;
 }
 
-static void intf_mem_outl(struct si_sm_io *io, unsigned int offset,
-                    unsigned char b)
+static void intf_mem_outl(const struct si_sm_io *io, unsigned int offset,
+                         unsigned char b)
 {
        writel(b << io->regshift, (io->addr)+(offset * io->regspacing));
 }
 
 #ifdef readq
-static unsigned char mem_inq(struct si_sm_io *io, unsigned int offset)
+static unsigned char mem_inq(const struct si_sm_io *io, unsigned int offset)
 {
        return (readq((io->addr)+(offset * io->regspacing)) >> io->regshift)
                & 0xff;
 }
 
-static void mem_outq(struct si_sm_io *io, unsigned int offset,
+static void mem_outq(const struct si_sm_io *io, unsigned int offset,
                     unsigned char b)
 {
        writeq(b << io->regshift, (io->addr)+(offset * io->regspacing));
@@ -2233,134 +2242,6 @@ static void spmi_find_bmc(void)
                try_init_spmi(spmi);
        }
 }
-
-static int ipmi_pnp_probe(struct pnp_dev *dev,
-                                   const struct pnp_device_id *dev_id)
-{
-       struct acpi_device *acpi_dev;
-       struct smi_info *info;
-       struct resource *res, *res_second;
-       acpi_handle handle;
-       acpi_status status;
-       unsigned long long tmp;
-       int rv = -EINVAL;
-
-       acpi_dev = pnp_acpi_device(dev);
-       if (!acpi_dev)
-               return -ENODEV;
-
-       info = smi_info_alloc();
-       if (!info)
-               return -ENOMEM;
-
-       info->addr_source = SI_ACPI;
-       printk(KERN_INFO PFX "probing via ACPI\n");
-
-       handle = acpi_dev->handle;
-       info->addr_info.acpi_info.acpi_handle = handle;
-
-       /* _IFT tells us the interface type: KCS, BT, etc */
-       status = acpi_evaluate_integer(handle, "_IFT", NULL, &tmp);
-       if (ACPI_FAILURE(status)) {
-               dev_err(&dev->dev, "Could not find ACPI IPMI interface type\n");
-               goto err_free;
-       }
-
-       switch (tmp) {
-       case 1:
-               info->si_type = SI_KCS;
-               break;
-       case 2:
-               info->si_type = SI_SMIC;
-               break;
-       case 3:
-               info->si_type = SI_BT;
-               break;
-       case 4: /* SSIF, just ignore */
-               rv = -ENODEV;
-               goto err_free;
-       default:
-               dev_info(&dev->dev, "unknown IPMI type %lld\n", tmp);
-               goto err_free;
-       }
-
-       res = pnp_get_resource(dev, IORESOURCE_IO, 0);
-       if (res) {
-               info->io_setup = port_setup;
-               info->io.addr_type = IPMI_IO_ADDR_SPACE;
-       } else {
-               res = pnp_get_resource(dev, IORESOURCE_MEM, 0);
-               if (res) {
-                       info->io_setup = mem_setup;
-                       info->io.addr_type = IPMI_MEM_ADDR_SPACE;
-               }
-       }
-       if (!res) {
-               dev_err(&dev->dev, "no I/O or memory address\n");
-               goto err_free;
-       }
-       info->io.addr_data = res->start;
-
-       info->io.regspacing = DEFAULT_REGSPACING;
-       res_second = pnp_get_resource(dev,
-                              (info->io.addr_type == IPMI_IO_ADDR_SPACE) ?
-                                       IORESOURCE_IO : IORESOURCE_MEM,
-                              1);
-       if (res_second) {
-               if (res_second->start > info->io.addr_data)
-                       info->io.regspacing = res_second->start - info->io.addr_data;
-       }
-       info->io.regsize = DEFAULT_REGSPACING;
-       info->io.regshift = 0;
-
-       /* If _GPE exists, use it; otherwise use standard interrupts */
-       status = acpi_evaluate_integer(handle, "_GPE", NULL, &tmp);
-       if (ACPI_SUCCESS(status)) {
-               info->irq = tmp;
-               info->irq_setup = acpi_gpe_irq_setup;
-       } else if (pnp_irq_valid(dev, 0)) {
-               info->irq = pnp_irq(dev, 0);
-               info->irq_setup = std_irq_setup;
-       }
-
-       info->dev = &dev->dev;
-       pnp_set_drvdata(dev, info);
-
-       dev_info(info->dev, "%pR regsize %d spacing %d irq %d\n",
-                res, info->io.regsize, info->io.regspacing,
-                info->irq);
-
-       rv = add_smi(info);
-       if (rv)
-               kfree(info);
-
-       return rv;
-
-err_free:
-       kfree(info);
-       return rv;
-}
-
-static void ipmi_pnp_remove(struct pnp_dev *dev)
-{
-       struct smi_info *info = pnp_get_drvdata(dev);
-
-       cleanup_one_si(info);
-}
-
-static const struct pnp_device_id pnp_dev_table[] = {
-       {"IPI0001", 0},
-       {"", 0},
-};
-
-static struct pnp_driver ipmi_pnp_driver = {
-       .name           = DEVICE_NAME,
-       .probe          = ipmi_pnp_probe,
-       .remove         = ipmi_pnp_remove,
-       .id_table       = pnp_dev_table,
-};
-
-MODULE_DEVICE_TABLE(pnp, pnp_dev_table);
 #endif
 
 #ifdef CONFIG_DMI
@@ -2654,7 +2535,7 @@ static void ipmi_pci_remove(struct pci_dev *pdev)
        pci_disable_device(pdev);
 }
 
-static struct pci_device_id ipmi_pci_devices[] = {
+static const struct pci_device_id ipmi_pci_devices[] = {
        { PCI_DEVICE(PCI_HP_VENDOR_ID, PCI_MMC_DEVICE_ID) },
        { PCI_DEVICE_CLASS(PCI_ERMC_CLASSCODE, PCI_ERMC_CLASSCODE_MASK) },
        { 0, }
@@ -2669,10 +2550,19 @@ static struct pci_driver ipmi_pci_driver = {
 };
 #endif /* CONFIG_PCI */
 
-static const struct of_device_id ipmi_match[];
-static int ipmi_probe(struct platform_device *dev)
-{
 #ifdef CONFIG_OF
+static const struct of_device_id of_ipmi_match[] = {
+       { .type = "ipmi", .compatible = "ipmi-kcs",
+         .data = (void *)(unsigned long) SI_KCS },
+       { .type = "ipmi", .compatible = "ipmi-smic",
+         .data = (void *)(unsigned long) SI_SMIC },
+       { .type = "ipmi", .compatible = "ipmi-bt",
+         .data = (void *)(unsigned long) SI_BT },
+       {},
+};
+
+static int of_ipmi_probe(struct platform_device *dev)
+{
        const struct of_device_id *match;
        struct smi_info *info;
        struct resource resource;
@@ -2683,9 +2573,9 @@ static int ipmi_probe(struct platform_device *dev)
 
        dev_info(&dev->dev, "probing via device tree\n");
 
-       match = of_match_device(ipmi_match, &dev->dev);
+       match = of_match_device(of_ipmi_match, &dev->dev);
        if (!match)
-               return -EINVAL;
+               return -ENODEV;
 
        if (!of_device_is_available(np))
                return -EINVAL;
@@ -2754,33 +2644,160 @@ static int ipmi_probe(struct platform_device *dev)
                kfree(info);
                return ret;
        }
-#endif
        return 0;
 }
+MODULE_DEVICE_TABLE(of, of_ipmi_match);
+#else
+#define of_ipmi_match NULL
+static int of_ipmi_probe(struct platform_device *dev)
+{
+       return -ENODEV;
+}
+#endif
 
-static int ipmi_remove(struct platform_device *dev)
+#ifdef CONFIG_ACPI
+static int acpi_ipmi_probe(struct platform_device *dev)
 {
-#ifdef CONFIG_OF
-       cleanup_one_si(dev_get_drvdata(&dev->dev));
+       struct smi_info *info;
+       struct resource *res, *res_second;
+       acpi_handle handle;
+       acpi_status status;
+       unsigned long long tmp;
+       int rv = -EINVAL;
+
+       handle = ACPI_HANDLE(&dev->dev);
+       if (!handle)
+               return -ENODEV;
+
+       info = smi_info_alloc();
+       if (!info)
+               return -ENOMEM;
+
+       info->addr_source = SI_ACPI;
+       dev_info(&dev->dev, PFX "probing via ACPI\n");
+
+       info->addr_info.acpi_info.acpi_handle = handle;
+
+       /* _IFT tells us the interface type: KCS, BT, etc */
+       status = acpi_evaluate_integer(handle, "_IFT", NULL, &tmp);
+       if (ACPI_FAILURE(status)) {
+               dev_err(&dev->dev, "Could not find ACPI IPMI interface type\n");
+               goto err_free;
+       }
+
+       switch (tmp) {
+       case 1:
+               info->si_type = SI_KCS;
+               break;
+       case 2:
+               info->si_type = SI_SMIC;
+               break;
+       case 3:
+               info->si_type = SI_BT;
+               break;
+       case 4: /* SSIF, just ignore */
+               rv = -ENODEV;
+               goto err_free;
+       default:
+               dev_info(&dev->dev, "unknown IPMI type %lld\n", tmp);
+               goto err_free;
+       }
+
+       res = platform_get_resource(dev, IORESOURCE_IO, 0);
+       if (res) {
+               info->io_setup = port_setup;
+               info->io.addr_type = IPMI_IO_ADDR_SPACE;
+       } else {
+               res = platform_get_resource(dev, IORESOURCE_MEM, 0);
+               if (res) {
+                       info->io_setup = mem_setup;
+                       info->io.addr_type = IPMI_MEM_ADDR_SPACE;
+               }
+       }
+       if (!res) {
+               dev_err(&dev->dev, "no I/O or memory address\n");
+               goto err_free;
+       }
+       info->io.addr_data = res->start;
+
+       info->io.regspacing = DEFAULT_REGSPACING;
+       res_second = platform_get_resource(dev,
+                              (info->io.addr_type == IPMI_IO_ADDR_SPACE) ?
+                                       IORESOURCE_IO : IORESOURCE_MEM,
+                              1);
+       if (res_second) {
+               if (res_second->start > info->io.addr_data)
+                       info->io.regspacing =
+                               res_second->start - info->io.addr_data;
+       }
+       info->io.regsize = DEFAULT_REGSPACING;
+       info->io.regshift = 0;
+
+       /* If _GPE exists, use it; otherwise use standard interrupts */
+       status = acpi_evaluate_integer(handle, "_GPE", NULL, &tmp);
+       if (ACPI_SUCCESS(status)) {
+               info->irq = tmp;
+               info->irq_setup = acpi_gpe_irq_setup;
+       } else {
+               int irq = platform_get_irq(dev, 0);
+
+               if (irq > 0) {
+                       info->irq = irq;
+                       info->irq_setup = std_irq_setup;
+               }
+       }
+
+       info->dev = &dev->dev;
+       platform_set_drvdata(dev, info);
+
+       dev_info(info->dev, "%pR regsize %d spacing %d irq %d\n",
+                res, info->io.regsize, info->io.regspacing,
+                info->irq);
+
+       rv = add_smi(info);
+       if (rv)
+               kfree(info);
+
+       return rv;
+
+err_free:
+       kfree(info);
+       return rv;
+}
+
+static const struct acpi_device_id acpi_ipmi_match[] = {
+       { "IPI0001", 0 },
+       { },
+};
+MODULE_DEVICE_TABLE(acpi, acpi_ipmi_match);
+#else
+static int acpi_ipmi_probe(struct platform_device *dev)
+{
+       return -ENODEV;
+}
 #endif
-       return 0;
+
+static int ipmi_probe(struct platform_device *dev)
+{
+       if (of_ipmi_probe(dev) == 0)
+               return 0;
+
+       return acpi_ipmi_probe(dev);
 }
 
-static const struct of_device_id ipmi_match[] =
+static int ipmi_remove(struct platform_device *dev)
 {
-       { .type = "ipmi", .compatible = "ipmi-kcs",
-         .data = (void *)(unsigned long) SI_KCS },
-       { .type = "ipmi", .compatible = "ipmi-smic",
-         .data = (void *)(unsigned long) SI_SMIC },
-       { .type = "ipmi", .compatible = "ipmi-bt",
-         .data = (void *)(unsigned long) SI_BT },
-       {},
-};
+       struct smi_info *info = dev_get_drvdata(&dev->dev);
+
+       cleanup_one_si(info);
+       return 0;
+}
 
 static struct platform_driver ipmi_driver = {
        .driver = {
                .name = DEVICE_NAME,
-               .of_match_table = ipmi_match,
+               .of_match_table = of_ipmi_match,
+               .acpi_match_table = ACPI_PTR(acpi_ipmi_match),
        },
        .probe          = ipmi_probe,
        .remove         = ipmi_remove,
@@ -2905,12 +2922,7 @@ static int try_get_dev_id(struct smi_info *smi_info)
        return rv;
 }
 
-/*
- * Some BMCs do not support clearing the receive irq bit in the global
- * enables (even if they don't support interrupts on the BMC).  Check
- * for this and handle it properly.
- */
-static void check_clr_rcv_irq(struct smi_info *smi_info)
+static int get_global_enables(struct smi_info *smi_info, u8 *enables)
 {
        unsigned char         msg[3];
        unsigned char         *resp;
@@ -2918,12 +2930,8 @@ static void check_clr_rcv_irq(struct smi_info *smi_info)
        int                   rv;
 
        resp = kmalloc(IPMI_MAX_MSG_LENGTH, GFP_KERNEL);
-       if (!resp) {
-               printk(KERN_WARNING PFX "Out of memory allocating response for"
-                      " global enables command, cannot check recv irq bit"
-                      " handling.\n");
-               return;
-       }
+       if (!resp)
+               return -ENOMEM;
 
        msg[0] = IPMI_NETFN_APP_REQUEST << 2;
        msg[1] = IPMI_GET_BMC_GLOBAL_ENABLES_CMD;
@@ -2931,9 +2939,9 @@ static void check_clr_rcv_irq(struct smi_info *smi_info)
 
        rv = wait_for_msg_done(smi_info);
        if (rv) {
-               printk(KERN_WARNING PFX "Error getting response from get"
-                      " global enables command, cannot check recv irq bit"
-                      " handling.\n");
+               dev_warn(smi_info->dev,
+                        "Error getting response from get global enables command: %d\n",
+                        rv);
                goto out;
        }
 
@@ -2944,27 +2952,44 @@ static void check_clr_rcv_irq(struct smi_info *smi_info)
                        resp[0] != (IPMI_NETFN_APP_REQUEST | 1) << 2 ||
                        resp[1] != IPMI_GET_BMC_GLOBAL_ENABLES_CMD   ||
                        resp[2] != 0) {
-               printk(KERN_WARNING PFX "Invalid return from get global"
-                      " enables command, cannot check recv irq bit"
-                      " handling.\n");
+               dev_warn(smi_info->dev,
+                        "Invalid return from get global enables command: %ld %x %x %x\n",
+                        resp_len, resp[0], resp[1], resp[2]);
                rv = -EINVAL;
                goto out;
+       } else {
+               *enables = resp[3];
        }
 
-       if ((resp[3] & IPMI_BMC_RCV_MSG_INTR) == 0)
-               /* Already clear, should work ok. */
-               goto out;
+out:
+       kfree(resp);
+       return rv;
+}
+
+/*
+ * Returns 1 if it gets an error from the command.
+ */
+static int set_global_enables(struct smi_info *smi_info, u8 enables)
+{
+       unsigned char         msg[3];
+       unsigned char         *resp;
+       unsigned long         resp_len;
+       int                   rv;
+
+       resp = kmalloc(IPMI_MAX_MSG_LENGTH, GFP_KERNEL);
+       if (!resp)
+               return -ENOMEM;
 
        msg[0] = IPMI_NETFN_APP_REQUEST << 2;
        msg[1] = IPMI_SET_BMC_GLOBAL_ENABLES_CMD;
-       msg[2] = resp[3] & ~IPMI_BMC_RCV_MSG_INTR;
+       msg[2] = enables;
        smi_info->handlers->start_transaction(smi_info->si_sm, msg, 3);
 
        rv = wait_for_msg_done(smi_info);
        if (rv) {
-               printk(KERN_WARNING PFX "Error getting response from set"
-                      " global enables command, cannot check recv irq bit"
-                      " handling.\n");
+               dev_warn(smi_info->dev,
+                        "Error getting response from set global enables command: %d\n",
+                        rv);
                goto out;
        }
 
@@ -2974,25 +2999,93 @@ static void check_clr_rcv_irq(struct smi_info *smi_info)
        if (resp_len < 3 ||
                        resp[0] != (IPMI_NETFN_APP_REQUEST | 1) << 2 ||
                        resp[1] != IPMI_SET_BMC_GLOBAL_ENABLES_CMD) {
-               printk(KERN_WARNING PFX "Invalid return from get global"
-                      " enables command, cannot check recv irq bit"
-                      " handling.\n");
+               dev_warn(smi_info->dev,
+                        "Invalid return from set global enables command: %ld %x %x\n",
+                        resp_len, resp[0], resp[1]);
                rv = -EINVAL;
                goto out;
        }
 
-       if (resp[2] != 0) {
+       if (resp[2] != 0)
+               rv = 1;
+
+out:
+       kfree(resp);
+       return rv;
+}
+
+/*
+ * Some BMCs do not support clearing the receive irq bit in the global
+ * enables (even if they don't support interrupts on the BMC).  Check
+ * for this and handle it properly.
+ */
+static void check_clr_rcv_irq(struct smi_info *smi_info)
+{
+       u8 enables = 0;
+       int rv;
+
+       rv = get_global_enables(smi_info, &enables);
+       if (!rv) {
+               if ((enables & IPMI_BMC_RCV_MSG_INTR) == 0)
+                       /* Already clear, should work ok. */
+                       return;
+
+               enables &= ~IPMI_BMC_RCV_MSG_INTR;
+               rv = set_global_enables(smi_info, enables);
+       }
+
+       if (rv < 0) {
+               dev_err(smi_info->dev,
+                       "Cannot check clearing the rcv irq: %d\n", rv);
+               return;
+       }
+
+       if (rv) {
                /*
                 * An error when clearing the receive irq bit means
                 * clearing the bit is not supported.
                 */
-               printk(KERN_WARNING PFX "The BMC does not support clearing"
-                      " the recv irq bit, compensating, but the BMC needs to"
-                      " be fixed.\n");
-               smi_info->cannot_clear_recv_irq_bit = true;
+               dev_warn(smi_info->dev,
+                        "The BMC does not support clearing the recv irq bit, compensating, but the BMC needs to be fixed.\n");
+               smi_info->cannot_disable_irq = true;
+       }
+}
+
+/*
+ * Some BMCs do not support setting the interrupt bits in the global
+ * enables even if they support interrupts.  Clearly bad, but we can
+ * compensate.
+ */
+static void check_set_rcv_irq(struct smi_info *smi_info)
+{
+       u8 enables = 0;
+       int rv;
+
+       if (!smi_info->irq)
+               return;
+
+       rv = get_global_enables(smi_info, &enables);
+       if (!rv) {
+               enables |= IPMI_BMC_RCV_MSG_INTR;
+               rv = set_global_enables(smi_info, enables);
+       }
+
+       if (rv < 0) {
+               dev_err(smi_info->dev,
+                       "Cannot check setting the rcv irq: %d\n", rv);
+               return;
+       }
+
+       if (rv) {
+               /*
+                * An error when setting the receive irq bit means
+                * setting the bit is not supported.
+                */
+               dev_warn(smi_info->dev,
+                        "The BMC does not support setting the recv irq bit, compensating, but the BMC needs to be fixed.\n");
+               smi_info->cannot_disable_irq = true;
+               smi_info->irq_enable_broken = true;
        }
- out:
-       kfree(resp);
 }
 
 static int try_enable_event_buffer(struct smi_info *smi_info)
@@ -3313,6 +3406,12 @@ static void setup_xaction_handlers(struct smi_info *smi_info)
        setup_dell_poweredge_bt_xaction_handler(smi_info);
 }
 
+static void check_for_broken_irqs(struct smi_info *smi_info)
+{
+       check_clr_rcv_irq(smi_info);
+       check_set_rcv_irq(smi_info);
+}
+
 static inline void wait_for_timer_and_thread(struct smi_info *smi_info)
 {
        if (smi_info->thread != NULL)
@@ -3321,7 +3420,7 @@ static inline void wait_for_timer_and_thread(struct smi_info *smi_info)
                del_timer_sync(&smi_info->si_timer);
 }
 
-static struct ipmi_default_vals
+static const struct ipmi_default_vals
 {
        int type;
        int port;
@@ -3490,10 +3589,9 @@ static int try_smi_init(struct smi_info *new_smi)
                goto out_err;
        }
 
-       check_clr_rcv_irq(new_smi);
-
        setup_oem_data_handler(new_smi);
        setup_xaction_handlers(new_smi);
+       check_for_broken_irqs(new_smi);
 
        new_smi->waiting_msg = NULL;
        new_smi->curr_msg = NULL;
@@ -3692,13 +3790,6 @@ static int init_ipmi_si(void)
        }
 #endif
 
-#ifdef CONFIG_ACPI
-       if (si_tryacpi) {
-               pnp_register_driver(&ipmi_pnp_driver);
-               pnp_registered = true;
-       }
-#endif
-
 #ifdef CONFIG_DMI
        if (si_trydmi)
                dmi_find_bmc();
@@ -3850,10 +3941,6 @@ static void cleanup_ipmi_si(void)
        if (pci_registered)
                pci_unregister_driver(&ipmi_pci_driver);
 #endif
-#ifdef CONFIG_ACPI
-       if (pnp_registered)
-               pnp_unregister_driver(&ipmi_pnp_driver);
-#endif
 #ifdef CONFIG_PARISC
        if (parisc_registered)
                unregister_parisc_driver(&ipmi_parisc_driver);
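
The ipmi_si hunks above remove the dedicated PnP driver and handle ACPI-enumerated interfaces from the existing platform driver, with ipmi_probe() trying the OF path first and falling back to acpi_ipmi_probe(). A bare-bones hedged sketch of that single-driver shape follows; every name in it (example_*, the compatible string, the _HID) is a placeholder, not something taken from the driver.

#include <linux/module.h>
#include <linux/platform_device.h>
#include <linux/acpi.h>
#include <linux/of.h>

static int example_probe(struct platform_device *pdev)
{
	/* a real driver would try its firmware-specific paths here */
	return 0;
}

static int example_remove(struct platform_device *pdev)
{
	return 0;
}

static const struct of_device_id example_of_match[] = {
	{ .compatible = "vendor,example" },	/* placeholder compatible */
	{ },
};
MODULE_DEVICE_TABLE(of, example_of_match);

static const struct acpi_device_id example_acpi_match[] = {
	{ "XXXX0000", 0 },			/* placeholder ACPI _HID */
	{ },
};
MODULE_DEVICE_TABLE(acpi, example_acpi_match);

static struct platform_driver example_driver = {
	.driver = {
		.name			= "example",
		.of_match_table		= example_of_match,
		.acpi_match_table	= ACPI_PTR(example_acpi_match),
	},
	.probe	= example_probe,
	.remove	= example_remove,
};
module_platform_driver(example_driver);

MODULE_LICENSE("GPL");
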
index df89f73475fb996770bfeae8f3710038feabb825..a705027c0493f2241a9b917f6ac9f06849eeb9db 100644 (file)
@@ -46,8 +46,8 @@ struct si_sm_data;
  * this interface.
  */
 struct si_sm_io {
-       unsigned char (*inputb)(struct si_sm_io *io, unsigned int offset);
-       void (*outputb)(struct si_sm_io *io,
+       unsigned char (*inputb)(const struct si_sm_io *io, unsigned int offset);
+       void (*outputb)(const struct si_sm_io *io,
                        unsigned int  offset,
                        unsigned char b);
 
@@ -135,7 +135,7 @@ struct si_sm_handlers {
 };
 
 /* Current state machines that we can use. */
-extern struct si_sm_handlers kcs_smi_handlers;
-extern struct si_sm_handlers smic_smi_handlers;
-extern struct si_sm_handlers bt_smi_handlers;
+extern const struct si_sm_handlers kcs_smi_handlers;
+extern const struct si_sm_handlers smic_smi_handlers;
+extern const struct si_sm_handlers bt_smi_handlers;
 
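
Many of the IPMI hunks exist only to thread a const qualifier through the handler-table pointers declared here. A tiny hedged illustration of why the qualifier has to propagate along the chain (all names are invented for the example):

struct example_ops {
	int (*start)(void *ctx);
};

static const struct example_ops example_ops;	/* read-only ops table */

struct example_dev {
	const struct example_ops *ops;	/* must be const-qualified... */
};

static void example_bind(struct example_dev *dev)
{
	dev->ops = &example_ops;	/* ...or this assignment would be rejected */
}
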
index c8e77afa8b961c31bafb4dd929f39255be98274f..8f7c73ff58f23c11c849358ea3d60f1c5831d7fb 100644 (file)
@@ -589,7 +589,7 @@ static int smic_size(void)
        return sizeof(struct si_sm_data);
 }
 
-struct si_sm_handlers smic_smi_handlers = {
+const struct si_sm_handlers smic_smi_handlers = {
        .init_data         = init_smic_data,
        .start_transaction = start_smic_transaction,
        .get_result        = smic_get_result,
index 207689c444a8155540b72280b9c402c027ac361b..877205d2204686b0f39c3c798ce5ecf9fa19d358 100644 (file)
@@ -1136,6 +1136,10 @@ module_param_array(slave_addrs, int, &num_slave_addrs, 0);
 MODULE_PARM_DESC(slave_addrs,
                 "The default IPMB slave address for the controller.");
 
+static bool alerts_broken;
+module_param(alerts_broken, bool, 0);
+MODULE_PARM_DESC(alerts_broken, "Don't enable alerts for the controller.");
+
 /*
  * Bit 0 enables message debugging, bit 1 enables state debugging, and
  * bit 2 enables timing debugging.  This is an array indexed by
@@ -1154,11 +1158,11 @@ static int use_thread;
 module_param(use_thread, int, 0);
 MODULE_PARM_DESC(use_thread, "Use the thread interface.");
 
-static bool ssif_tryacpi = 1;
+static bool ssif_tryacpi = true;
 module_param_named(tryacpi, ssif_tryacpi, bool, 0);
 MODULE_PARM_DESC(tryacpi, "Setting this to zero will disable the default scan of the interfaces identified via ACPI");
 
-static bool ssif_trydmi = 1;
+static bool ssif_trydmi = true;
 module_param_named(trydmi, ssif_trydmi, bool, 0);
 MODULE_PARM_DESC(trydmi, "Setting this to zero will disable the default scan of the interfaces identified via DMI (SMBIOS)");
 
@@ -1582,6 +1586,10 @@ static int ssif_probe(struct i2c_client *client, const struct i2c_device_id *id)
                ssif_info->global_enables |= IPMI_BMC_EVT_MSG_BUFF;
        }
 
+       /* Some systems don't behave well if you enable alerts. */
+       if (alerts_broken)
+               goto found;
+
        msg[0] = IPMI_NETFN_APP_REQUEST << 2;
        msg[1] = IPMI_SET_BMC_GLOBAL_ENABLES_CMD;
        msg[2] = ssif_info->global_enables | IPMI_BMC_RCV_MSG_INTR;
@@ -1787,7 +1795,7 @@ skip_addr:
 }
 
 #ifdef CONFIG_ACPI
-static struct acpi_device_id ssif_acpi_match[] = {
+static const struct acpi_device_id ssif_acpi_match[] = {
        { "IPI0001", 0 },
        { },
 };
index 2a38eb4a25527604ec0cbd867ab5d6fbfbc4b648..6cf38dc1c9291e843f140e187b5dd0f0aadf6d4b 100644 (file)
@@ -8,6 +8,7 @@
 #include <linux/err.h>
 #include <linux/device.h>
 #include <linux/of_address.h>
+#include <linux/slab.h>
 
 static DEFINE_SPINLOCK(clklock);
 
index 2c16807341dce86b386cb662aa1a52418d311665..e434854486127a5287fd7b73f92fa3dc7c5151cc 100644 (file)
@@ -1,6 +1,12 @@
 config COMMON_CLK_HI6220
        bool "Hi6220 Clock Driver"
-       depends on (ARCH_HISI || COMPILE_TEST) && MAILBOX
+       depends on ARCH_HISI || COMPILE_TEST
        default ARCH_HISI
        help
          Build the Hisilicon Hi6220 clock driver based on the common clock framework.
+
+config STUB_CLK_HI6220
+       bool "Hi6220 Stub Clock Driver"
+       depends on COMMON_CLK_HI6220 && MAILBOX
+       help
+         Build the Hisilicon Hi6220 stub clock driver.
index 4a1001a11f04502d1b0b003ec5a7b3785946f313..74dba31590f9a39e122b1aa589cb869b685d3e18 100644 (file)
@@ -7,4 +7,5 @@ obj-y   += clk.o clkgate-separated.o clkdivider-hi6220.o
 obj-$(CONFIG_ARCH_HI3xxx)      += clk-hi3620.o
 obj-$(CONFIG_ARCH_HIP04)       += clk-hip04.o
 obj-$(CONFIG_ARCH_HIX5HD2)     += clk-hix5hd2.o
-obj-$(CONFIG_COMMON_CLK_HI6220)        += clk-hi6220.o clk-hi6220-stub.o
+obj-$(CONFIG_COMMON_CLK_HI6220)        += clk-hi6220.o
+obj-$(CONFIG_STUB_CLK_HI6220)  += clk-hi6220-stub.o
index ed02bbc7b11f303a3d265003074ef022e545ddcf..abb47608713bc39a8fddbdf915f29955b10a497f 100644 (file)
@@ -716,6 +716,8 @@ static const char *const rk3188_critical_clocks[] __initconst = {
        "aclk_cpu",
        "aclk_peri",
        "hclk_peri",
+       "pclk_cpu",
+       "pclk_peri",
 };
 
 static void __init rk3188_common_clk_init(struct device_node *np)
@@ -744,8 +746,6 @@ static void __init rk3188_common_clk_init(struct device_node *np)
 
        rockchip_clk_register_branches(common_clk_branches,
                                  ARRAY_SIZE(common_clk_branches));
-       rockchip_clk_protect_critical(rk3188_critical_clocks,
-                                     ARRAY_SIZE(rk3188_critical_clocks));
 
        rockchip_register_softrst(np, 9, reg_base + RK2928_SOFTRST_CON(0),
                                  ROCKCHIP_SOFTRST_HIWORD_MASK);
@@ -765,6 +765,8 @@ static void __init rk3066a_clk_init(struct device_node *np)
                        mux_armclk_p, ARRAY_SIZE(mux_armclk_p),
                        &rk3066_cpuclk_data, rk3066_cpuclk_rates,
                        ARRAY_SIZE(rk3066_cpuclk_rates));
+       rockchip_clk_protect_critical(rk3188_critical_clocks,
+                                     ARRAY_SIZE(rk3188_critical_clocks));
 }
 CLK_OF_DECLARE(rk3066a_cru, "rockchip,rk3066a-cru", rk3066a_clk_init);
 
@@ -801,6 +803,9 @@ static void __init rk3188a_clk_init(struct device_node *np)
                pr_warn("%s: missing clocks to reparent aclk_cpu_pre to gpll\n",
                        __func__);
        }
+
+       rockchip_clk_protect_critical(rk3188_critical_clocks,
+                                     ARRAY_SIZE(rk3188_critical_clocks));
 }
 CLK_OF_DECLARE(rk3188a_cru, "rockchip,rk3188a-cru", rk3188a_clk_init);
 
index 251f48dcd12d5fa344a80d370bd6177c7f705091..7f370d3e098379e9076210764361ebb5046c739f 100644 (file)
@@ -1398,6 +1398,45 @@ static const struct exynos_cpuclk_cfg_data e4210_armclk_d[] __initconst = {
        {  0 },
 };
 
+static const struct exynos_cpuclk_cfg_data e4212_armclk_d[] __initconst = {
+       { 1500000, E4210_CPU_DIV0(2, 1, 6, 0, 7, 3), E4210_CPU_DIV1(2, 6), },
+       { 1400000, E4210_CPU_DIV0(2, 1, 6, 0, 7, 3), E4210_CPU_DIV1(2, 6), },
+       { 1300000, E4210_CPU_DIV0(2, 1, 5, 0, 7, 3), E4210_CPU_DIV1(2, 5), },
+       { 1200000, E4210_CPU_DIV0(2, 1, 5, 0, 7, 3), E4210_CPU_DIV1(2, 5), },
+       { 1100000, E4210_CPU_DIV0(2, 1, 4, 0, 6, 3), E4210_CPU_DIV1(2, 4), },
+       { 1000000, E4210_CPU_DIV0(1, 1, 4, 0, 5, 2), E4210_CPU_DIV1(2, 4), },
+       {  900000, E4210_CPU_DIV0(1, 1, 3, 0, 5, 2), E4210_CPU_DIV1(2, 3), },
+       {  800000, E4210_CPU_DIV0(1, 1, 3, 0, 5, 2), E4210_CPU_DIV1(2, 3), },
+       {  700000, E4210_CPU_DIV0(1, 1, 3, 0, 4, 2), E4210_CPU_DIV1(2, 3), },
+       {  600000, E4210_CPU_DIV0(1, 1, 3, 0, 4, 2), E4210_CPU_DIV1(2, 3), },
+       {  500000, E4210_CPU_DIV0(1, 1, 3, 0, 4, 2), E4210_CPU_DIV1(2, 3), },
+       {  400000, E4210_CPU_DIV0(1, 1, 3, 0, 4, 2), E4210_CPU_DIV1(2, 3), },
+       {  300000, E4210_CPU_DIV0(1, 1, 2, 0, 4, 2), E4210_CPU_DIV1(2, 3), },
+       {  200000, E4210_CPU_DIV0(1, 1, 1, 0, 3, 1), E4210_CPU_DIV1(2, 3), },
+       {  0 },
+};
+
+#define E4412_CPU_DIV1(cores, hpm, copy)                               \
+               (((cores) << 8) | ((hpm) << 4) | ((copy) << 0))
+
+static const struct exynos_cpuclk_cfg_data e4412_armclk_d[] __initconst = {
+       { 1500000, E4210_CPU_DIV0(2, 1, 6, 0, 7, 3), E4412_CPU_DIV1(7, 0, 6), },
+       { 1400000, E4210_CPU_DIV0(2, 1, 6, 0, 7, 3), E4412_CPU_DIV1(6, 0, 6), },
+       { 1300000, E4210_CPU_DIV0(2, 1, 5, 0, 7, 3), E4412_CPU_DIV1(6, 0, 5), },
+       { 1200000, E4210_CPU_DIV0(2, 1, 5, 0, 7, 3), E4412_CPU_DIV1(5, 0, 5), },
+       { 1100000, E4210_CPU_DIV0(2, 1, 4, 0, 6, 3), E4412_CPU_DIV1(5, 0, 4), },
+       { 1000000, E4210_CPU_DIV0(1, 1, 4, 0, 5, 2), E4412_CPU_DIV1(4, 0, 4), },
+       {  900000, E4210_CPU_DIV0(1, 1, 3, 0, 5, 2), E4412_CPU_DIV1(4, 0, 3), },
+       {  800000, E4210_CPU_DIV0(1, 1, 3, 0, 5, 2), E4412_CPU_DIV1(3, 0, 3), },
+       {  700000, E4210_CPU_DIV0(1, 1, 3, 0, 4, 2), E4412_CPU_DIV1(3, 0, 3), },
+       {  600000, E4210_CPU_DIV0(1, 1, 3, 0, 4, 2), E4412_CPU_DIV1(2, 0, 3), },
+       {  500000, E4210_CPU_DIV0(1, 1, 3, 0, 4, 2), E4412_CPU_DIV1(2, 0, 3), },
+       {  400000, E4210_CPU_DIV0(1, 1, 3, 0, 4, 2), E4412_CPU_DIV1(1, 0, 3), },
+       {  300000, E4210_CPU_DIV0(1, 1, 2, 0, 4, 2), E4412_CPU_DIV1(1, 0, 3), },
+       {  200000, E4210_CPU_DIV0(1, 1, 1, 0, 3, 1), E4412_CPU_DIV1(0, 0, 3), },
+       {  0 },
+};
+
 /* register exynos4 clocks */
 static void __init exynos4_clk_init(struct device_node *np,
                                    enum exynos4_soc soc)
@@ -1491,6 +1530,17 @@ static void __init exynos4_clk_init(struct device_node *np,
                samsung_clk_register_fixed_factor(ctx,
                        exynos4x12_fixed_factor_clks,
                        ARRAY_SIZE(exynos4x12_fixed_factor_clks));
+               if (of_machine_is_compatible("samsung,exynos4412")) {
+                       exynos_register_cpu_clock(ctx, CLK_ARM_CLK, "armclk",
+                               mout_core_p4x12[0], mout_core_p4x12[1], 0x14200,
+                               e4412_armclk_d, ARRAY_SIZE(e4412_armclk_d),
+                               CLK_CPU_NEEDS_DEBUG_ALT_DIV | CLK_CPU_HAS_DIV1);
+               } else {
+                       exynos_register_cpu_clock(ctx, CLK_ARM_CLK, "armclk",
+                               mout_core_p4x12[0], mout_core_p4x12[1], 0x14200,
+                               e4212_armclk_d, ARRAY_SIZE(e4212_armclk_d),
+                               CLK_CPU_NEEDS_DEBUG_ALT_DIV | CLK_CPU_HAS_DIV1);
+               }
        }
 
        samsung_clk_register_alias(ctx, exynos4_aliases,
index 5b60beb7d0ebce09f7f0f34b97620dd7025ba762..a91825471c79acd28f68ea5f0f2c998d54a571d9 100644 (file)
@@ -28,6 +28,8 @@
 #define USIBU1_RSTCTRL 0x0ac
 #define USIBU2_RSTCTRL 0x0b0
 #define USIBU3_RSTCTRL 0x0b4
+#define IIC0_RSTCTRL 0x0dc
+#define IIC1_RSTCTRL 0x0e0
 #define STI_RSTCTRL 0x124
 #define STI_CLKSEL 0x688
 
@@ -66,6 +68,10 @@ static void __init emev2_smu_init(void)
        emev2_smu_write(2, USIBU1_RSTCTRL);
        emev2_smu_write(2, USIBU2_RSTCTRL);
        emev2_smu_write(2, USIBU3_RSTCTRL);
+
+       /* deassert reset for IIC0->IIC1 */
+       emev2_smu_write(1, IIC0_RSTCTRL);
+       emev2_smu_write(1, IIC1_RSTCTRL);
 }
 
 static void __init emev2_smu_clkdiv_init(struct device_node *np)
index 77aa34eae92cbacf67e4027c6068a5551a0b83eb..cd0391e46c6dbc6163819b533ce225f3afe898e8 100644 (file)
@@ -24,55 +24,6 @@ config ARM_VEXPRESS_SPC_CPUFREQ
           This adds the CPUfreq driver support for Versatile Express
          big.LITTLE platforms using SPC for power management.
 
-
-config ARM_EXYNOS_CPUFREQ
-       tristate "SAMSUNG EXYNOS CPUfreq Driver"
-       depends on CPU_EXYNOS4210 || SOC_EXYNOS4212 || SOC_EXYNOS4412 || SOC_EXYNOS5250
-       depends on THERMAL
-       help
-         This adds the CPUFreq driver for Samsung EXYNOS platforms.
-         Supported SoC versions are:
-            Exynos4210, Exynos4212, Exynos4412, and Exynos5250.
-
-         If in doubt, say N.
-
-config ARM_EXYNOS4X12_CPUFREQ
-       bool "SAMSUNG EXYNOS4x12"
-       depends on SOC_EXYNOS4212 || SOC_EXYNOS4412
-       depends on ARM_EXYNOS_CPUFREQ
-       default y
-       help
-         This adds the CPUFreq driver for Samsung EXYNOS4X12
-         SoC (EXYNOS4212 or EXYNOS4412).
-
-         If in doubt, say N.
-
-config ARM_EXYNOS5250_CPUFREQ
-       bool "SAMSUNG EXYNOS5250"
-       depends on SOC_EXYNOS5250
-       depends on ARM_EXYNOS_CPUFREQ
-       default y
-       help
-         This adds the CPUFreq driver for Samsung EXYNOS5250
-         SoC.
-
-         If in doubt, say N.
-
-config ARM_EXYNOS_CPU_FREQ_BOOST_SW
-       bool "EXYNOS Frequency Overclocking - Software"
-       depends on ARM_EXYNOS_CPUFREQ && THERMAL
-       select CPU_FREQ_BOOST_SW
-       select EXYNOS_THERMAL
-       help
-         This driver supports software managed overclocking (BOOST).
-         It allows usage of special frequencies for Samsung Exynos
-         processors if thermal conditions are appropriate.
-
-         It requires, for safe operation, thermal framework with properly
-         defined trip points.
-
-         If in doubt, say N.
-
 config ARM_EXYNOS5440_CPUFREQ
        tristate "SAMSUNG EXYNOS5440"
        depends on SOC_EXYNOS5440
@@ -133,6 +84,7 @@ config ARM_KIRKWOOD_CPUFREQ
 config ARM_MT8173_CPUFREQ
        bool "Mediatek MT8173 CPUFreq support"
        depends on ARCH_MEDIATEK && REGULATOR
+       depends on !CPU_THERMAL || THERMAL=y
        select PM_OPP
        help
          This adds the CPUFreq driver support for Mediatek MT8173 SoC.
index 60a57ca5b22ddd47a74af1cbe99f13626197d9a7..41340384f11f291c4fe93e96e73559926f4f59b1 100644 (file)
@@ -52,10 +52,6 @@ obj-$(CONFIG_ARM_DT_BL_CPUFREQ)              += arm_big_little_dt.o
 
 obj-$(CONFIG_ARCH_DAVINCI)             += davinci-cpufreq.o
 obj-$(CONFIG_UX500_SOC_DB8500)         += dbx500-cpufreq.o
-obj-$(CONFIG_ARM_EXYNOS_CPUFREQ)       += arm-exynos-cpufreq.o
-arm-exynos-cpufreq-y                                   := exynos-cpufreq.o
-arm-exynos-cpufreq-$(CONFIG_ARM_EXYNOS4X12_CPUFREQ)    += exynos4x12-cpufreq.o
-arm-exynos-cpufreq-$(CONFIG_ARM_EXYNOS5250_CPUFREQ)    += exynos5250-cpufreq.o
 obj-$(CONFIG_ARM_EXYNOS5440_CPUFREQ)   += exynos5440-cpufreq.o
 obj-$(CONFIG_ARM_HIGHBANK_CPUFREQ)     += highbank-cpufreq.o
 obj-$(CONFIG_ARM_HISI_ACPU_CPUFREQ)    += hisi-acpu-cpufreq.o
index c3583cdfadbdf2704e0a8a1e83d9e74324940a6e..7c0d70e2a86163b7a102276e1a51d1ceeb1b7b5c 100644 (file)
@@ -196,6 +196,7 @@ static int cpufreq_init(struct cpufreq_policy *policy)
        struct device *cpu_dev;
        struct regulator *cpu_reg;
        struct clk *cpu_clk;
+       struct dev_pm_opp *suspend_opp;
        unsigned long min_uV = ~0, max_uV = 0;
        unsigned int transition_latency;
        bool need_update = false;
@@ -239,6 +240,17 @@ static int cpufreq_init(struct cpufreq_policy *policy)
         */
        of_cpumask_init_opp_table(policy->cpus);
 
+       /*
+        * But we need the OPP table to function, so if it is not there
+        * let's give the platform code a chance to provide it for us.
+        */
+       ret = dev_pm_opp_get_opp_count(cpu_dev);
+       if (ret <= 0) {
+               pr_debug("OPP table is not ready, deferring probe\n");
+               ret = -EPROBE_DEFER;
+               goto out_free_opp;
+       }
+
        if (need_update) {
                struct cpufreq_dt_platform_data *pd = cpufreq_get_driver_data();
 
@@ -249,24 +261,16 @@ static int cpufreq_init(struct cpufreq_policy *policy)
                 * OPP tables are initialized only for policy->cpu, do it for
                 * others as well.
                 */
-               set_cpus_sharing_opps(cpu_dev, policy->cpus);
+               ret = set_cpus_sharing_opps(cpu_dev, policy->cpus);
+               if (ret)
+                       dev_err(cpu_dev, "%s: failed to mark OPPs as shared: %d\n",
+                               __func__, ret);
 
                of_property_read_u32(np, "clock-latency", &transition_latency);
        } else {
                transition_latency = dev_pm_opp_get_max_clock_latency(cpu_dev);
        }
 
-       /*
-        * But we need OPP table to function so if it is not there let's
-        * give platform code chance to provide it for us.
-        */
-       ret = dev_pm_opp_get_opp_count(cpu_dev);
-       if (ret <= 0) {
-               pr_debug("OPP table is not ready, deferring probe\n");
-               ret = -EPROBE_DEFER;
-               goto out_free_opp;
-       }
-
        priv = kzalloc(sizeof(*priv), GFP_KERNEL);
        if (!priv) {
                ret = -ENOMEM;
@@ -300,7 +304,8 @@ static int cpufreq_init(struct cpufreq_policy *policy)
                        rcu_read_unlock();
 
                        tol_uV = opp_uV * priv->voltage_tolerance / 100;
-                       if (regulator_is_supported_voltage(cpu_reg, opp_uV,
+                       if (regulator_is_supported_voltage(cpu_reg,
+                                                          opp_uV - tol_uV,
                                                           opp_uV + tol_uV)) {
                                if (opp_uV < min_uV)
                                        min_uV = opp_uV;
@@ -329,6 +334,13 @@ static int cpufreq_init(struct cpufreq_policy *policy)
        policy->driver_data = priv;
 
        policy->clk = cpu_clk;
+
+       rcu_read_lock();
+       suspend_opp = dev_pm_opp_get_suspend_opp(cpu_dev);
+       if (suspend_opp)
+               policy->suspend_freq = dev_pm_opp_get_freq(suspend_opp) / 1000;
+       rcu_read_unlock();
+
        ret = cpufreq_table_validate_and_show(policy, freq_table);
        if (ret) {
                dev_err(cpu_dev, "%s: invalid frequency table: %d\n", __func__,
@@ -419,6 +431,7 @@ static struct cpufreq_driver dt_cpufreq_driver = {
        .ready = cpufreq_ready,
        .name = "cpufreq-dt",
        .attr = cpufreq_dt_attr,
+       .suspend = cpufreq_generic_suspend,
 };
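
Editorial note: the hunks above teach cpufreq-dt about an optional suspend OPP — its frequency is cached in policy->suspend_freq and cpufreq_generic_suspend() (made tolerant of a missing suspend_freq in the cpufreq core hunk further down) switches to it on system suspend. A minimal sketch of the same lookup, not part of this commit, with cpu_dev as in cpufreq_init() above and error handling trimmed:

	struct dev_pm_opp *opp;
	unsigned long suspend_khz = 0;

	rcu_read_lock();                            /* OPP entries are RCU-protected in this series */
	opp = dev_pm_opp_get_suspend_opp(cpu_dev);  /* NULL when no OPP is marked for suspend */
	if (opp)
		suspend_khz = dev_pm_opp_get_freq(opp) / 1000;  /* Hz -> kHz, as cpufreq expects */
	rcu_read_unlock();
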
 
 static int dt_cpufreq_probe(struct platform_device *pdev)
index b3d9368339af3530baa756441b347df3412a3b64..6633b3fa996e06091089297f05e5bb710b995134 100644 (file)
@@ -239,7 +239,7 @@ int cpufreq_generic_init(struct cpufreq_policy *policy,
 EXPORT_SYMBOL_GPL(cpufreq_generic_init);
 
 /* Only for cpufreq core internal use */
-struct cpufreq_policy *cpufreq_cpu_get_raw(unsigned int cpu)
+static struct cpufreq_policy *cpufreq_cpu_get_raw(unsigned int cpu)
 {
        struct cpufreq_policy *policy = per_cpu(cpufreq_cpu_data, cpu);
 
@@ -1626,8 +1626,8 @@ int cpufreq_generic_suspend(struct cpufreq_policy *policy)
        int ret;
 
        if (!policy->suspend_freq) {
-               pr_err("%s: suspend_freq can't be zero\n", __func__);
-               return -EINVAL;
+               pr_debug("%s: suspend_freq not defined\n", __func__);
+               return 0;
        }
 
        pr_debug("%s: Setting suspend-freq: %u\n", __func__,
@@ -2031,8 +2031,7 @@ static int __cpufreq_governor(struct cpufreq_policy *policy,
                if (!try_module_get(policy->governor->owner))
                        return -EINVAL;
 
-       pr_debug("__cpufreq_governor for CPU %u, event %u\n",
-                policy->cpu, event);
+       pr_debug("%s: for CPU %u, event %u\n", __func__, policy->cpu, event);
 
        mutex_lock(&cpufreq_governor_lock);
        if ((policy->governor_enabled && event == CPUFREQ_GOV_START)
diff --git a/drivers/cpufreq/exynos-cpufreq.c b/drivers/cpufreq/exynos-cpufreq.c
deleted file mode 100644 (file)
index fa3dd84..0000000
+++ /dev/null
@@ -1,239 +0,0 @@
-/*
- * Copyright (c) 2010-2011 Samsung Electronics Co., Ltd.
- *             http://www.samsung.com
- *
- * EXYNOS - CPU frequency scaling support for EXYNOS series
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
-*/
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/err.h>
-#include <linux/clk.h>
-#include <linux/io.h>
-#include <linux/slab.h>
-#include <linux/regulator/consumer.h>
-#include <linux/cpufreq.h>
-#include <linux/platform_device.h>
-#include <linux/of.h>
-#include <linux/cpu_cooling.h>
-#include <linux/cpu.h>
-
-#include "exynos-cpufreq.h"
-
-static struct exynos_dvfs_info *exynos_info;
-static struct thermal_cooling_device *cdev;
-static struct regulator *arm_regulator;
-static unsigned int locking_frequency;
-
-static int exynos_cpufreq_get_index(unsigned int freq)
-{
-       struct cpufreq_frequency_table *freq_table = exynos_info->freq_table;
-       struct cpufreq_frequency_table *pos;
-
-       cpufreq_for_each_entry(pos, freq_table)
-               if (pos->frequency == freq)
-                       break;
-
-       if (pos->frequency == CPUFREQ_TABLE_END)
-               return -EINVAL;
-
-       return pos - freq_table;
-}
-
-static int exynos_cpufreq_scale(unsigned int target_freq)
-{
-       struct cpufreq_frequency_table *freq_table = exynos_info->freq_table;
-       unsigned int *volt_table = exynos_info->volt_table;
-       struct cpufreq_policy *policy = cpufreq_cpu_get(0);
-       unsigned int arm_volt, safe_arm_volt = 0;
-       unsigned int mpll_freq_khz = exynos_info->mpll_freq_khz;
-       struct device *dev = exynos_info->dev;
-       unsigned int old_freq;
-       int index, old_index;
-       int ret = 0;
-
-       old_freq = policy->cur;
-
-       /*
-        * The policy max have been changed so that we cannot get proper
-        * old_index with cpufreq_frequency_table_target(). Thus, ignore
-        * policy and get the index from the raw frequency table.
-        */
-       old_index = exynos_cpufreq_get_index(old_freq);
-       if (old_index < 0) {
-               ret = old_index;
-               goto out;
-       }
-
-       index = exynos_cpufreq_get_index(target_freq);
-       if (index < 0) {
-               ret = index;
-               goto out;
-       }
-
-       /*
-        * ARM clock source will be changed APLL to MPLL temporary
-        * To support this level, need to control regulator for
-        * required voltage level
-        */
-       if (exynos_info->need_apll_change != NULL) {
-               if (exynos_info->need_apll_change(old_index, index) &&
-                  (freq_table[index].frequency < mpll_freq_khz) &&
-                  (freq_table[old_index].frequency < mpll_freq_khz))
-                       safe_arm_volt = volt_table[exynos_info->pll_safe_idx];
-       }
-       arm_volt = volt_table[index];
-
-       /* When the new frequency is higher than current frequency */
-       if ((target_freq > old_freq) && !safe_arm_volt) {
-               /* Firstly, voltage up to increase frequency */
-               ret = regulator_set_voltage(arm_regulator, arm_volt, arm_volt);
-               if (ret) {
-                       dev_err(dev, "failed to set cpu voltage to %d\n",
-                               arm_volt);
-                       return ret;
-               }
-       }
-
-       if (safe_arm_volt) {
-               ret = regulator_set_voltage(arm_regulator, safe_arm_volt,
-                                     safe_arm_volt);
-               if (ret) {
-                       dev_err(dev, "failed to set cpu voltage to %d\n",
-                               safe_arm_volt);
-                       return ret;
-               }
-       }
-
-       exynos_info->set_freq(old_index, index);
-
-       /* When the new frequency is lower than current frequency */
-       if ((target_freq < old_freq) ||
-          ((target_freq > old_freq) && safe_arm_volt)) {
-               /* down the voltage after frequency change */
-               ret = regulator_set_voltage(arm_regulator, arm_volt,
-                               arm_volt);
-               if (ret) {
-                       dev_err(dev, "failed to set cpu voltage to %d\n",
-                               arm_volt);
-                       goto out;
-               }
-       }
-
-out:
-       cpufreq_cpu_put(policy);
-
-       return ret;
-}
-
-static int exynos_target(struct cpufreq_policy *policy, unsigned int index)
-{
-       return exynos_cpufreq_scale(exynos_info->freq_table[index].frequency);
-}
-
-static int exynos_cpufreq_cpu_init(struct cpufreq_policy *policy)
-{
-       policy->clk = exynos_info->cpu_clk;
-       policy->suspend_freq = locking_frequency;
-       return cpufreq_generic_init(policy, exynos_info->freq_table, 100000);
-}
-
-static struct cpufreq_driver exynos_driver = {
-       .flags          = CPUFREQ_STICKY | CPUFREQ_NEED_INITIAL_FREQ_CHECK,
-       .verify         = cpufreq_generic_frequency_table_verify,
-       .target_index   = exynos_target,
-       .get            = cpufreq_generic_get,
-       .init           = exynos_cpufreq_cpu_init,
-       .name           = "exynos_cpufreq",
-       .attr           = cpufreq_generic_attr,
-#ifdef CONFIG_ARM_EXYNOS_CPU_FREQ_BOOST_SW
-       .boost_supported = true,
-#endif
-#ifdef CONFIG_PM
-       .suspend        = cpufreq_generic_suspend,
-#endif
-};
-
-static int exynos_cpufreq_probe(struct platform_device *pdev)
-{
-       struct device_node *cpu0;
-       int ret = -EINVAL;
-
-       exynos_info = kzalloc(sizeof(*exynos_info), GFP_KERNEL);
-       if (!exynos_info)
-               return -ENOMEM;
-
-       exynos_info->dev = &pdev->dev;
-
-       if (of_machine_is_compatible("samsung,exynos4212")) {
-               exynos_info->type = EXYNOS_SOC_4212;
-               ret = exynos4x12_cpufreq_init(exynos_info);
-       } else if (of_machine_is_compatible("samsung,exynos4412")) {
-               exynos_info->type = EXYNOS_SOC_4412;
-               ret = exynos4x12_cpufreq_init(exynos_info);
-       } else if (of_machine_is_compatible("samsung,exynos5250")) {
-               exynos_info->type = EXYNOS_SOC_5250;
-               ret = exynos5250_cpufreq_init(exynos_info);
-       } else {
-               pr_err("%s: Unknown SoC type\n", __func__);
-               ret = -ENODEV;
-       }
-
-       if (ret)
-               goto err_vdd_arm;
-
-       if (exynos_info->set_freq == NULL) {
-               dev_err(&pdev->dev, "No set_freq function (ERR)\n");
-               ret = -EINVAL;
-               goto err_vdd_arm;
-       }
-
-       arm_regulator = regulator_get(NULL, "vdd_arm");
-       if (IS_ERR(arm_regulator)) {
-               dev_err(&pdev->dev, "failed to get resource vdd_arm\n");
-               ret = -EINVAL;
-               goto err_vdd_arm;
-       }
-
-       /* Done here as we want to capture boot frequency */
-       locking_frequency = clk_get_rate(exynos_info->cpu_clk) / 1000;
-
-       ret = cpufreq_register_driver(&exynos_driver);
-       if (ret)
-               goto err_cpufreq_reg;
-
-       cpu0 = of_get_cpu_node(0, NULL);
-       if (!cpu0) {
-               pr_err("failed to find cpu0 node\n");
-               return 0;
-       }
-
-       if (of_find_property(cpu0, "#cooling-cells", NULL)) {
-               cdev = of_cpufreq_cooling_register(cpu0,
-                                                  cpu_present_mask);
-               if (IS_ERR(cdev))
-                       pr_err("running cpufreq without cooling device: %ld\n",
-                              PTR_ERR(cdev));
-       }
-
-       return 0;
-
-err_cpufreq_reg:
-       dev_err(&pdev->dev, "failed to register cpufreq driver\n");
-       regulator_put(arm_regulator);
-err_vdd_arm:
-       kfree(exynos_info);
-       return ret;
-}
-
-static struct platform_driver exynos_cpufreq_platdrv = {
-       .driver = {
-               .name   = "exynos-cpufreq",
-       },
-       .probe = exynos_cpufreq_probe,
-};
-module_platform_driver(exynos_cpufreq_platdrv);
diff --git a/drivers/cpufreq/exynos-cpufreq.h b/drivers/cpufreq/exynos-cpufreq.h
deleted file mode 100644 (file)
index a3855e4..0000000
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2010 Samsung Electronics Co., Ltd.
- *             http://www.samsung.com
- *
- * EXYNOS - CPUFreq support
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
-*/
-
-enum cpufreq_level_index {
-       L0, L1, L2, L3, L4,
-       L5, L6, L7, L8, L9,
-       L10, L11, L12, L13, L14,
-       L15, L16, L17, L18, L19,
-       L20,
-};
-
-enum exynos_soc_type {
-       EXYNOS_SOC_4212,
-       EXYNOS_SOC_4412,
-       EXYNOS_SOC_5250,
-};
-
-#define APLL_FREQ(f, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, m, p, s) \
-       { \
-               .freq = (f) * 1000, \
-               .clk_div_cpu0 = ((a0) | (a1) << 4 | (a2) << 8 | (a3) << 12 | \
-                       (a4) << 16 | (a5) << 20 | (a6) << 24 | (a7) << 28), \
-               .clk_div_cpu1 = (b0 << 0 | b1 << 4 | b2 << 8), \
-               .mps = ((m) << 16 | (p) << 8 | (s)), \
-       }
-
-struct apll_freq {
-       unsigned int freq;
-       u32 clk_div_cpu0;
-       u32 clk_div_cpu1;
-       u32 mps;
-};
-
-struct exynos_dvfs_info {
-       enum exynos_soc_type type;
-       struct device   *dev;
-       unsigned long   mpll_freq_khz;
-       unsigned int    pll_safe_idx;
-       struct clk      *cpu_clk;
-       unsigned int    *volt_table;
-       struct cpufreq_frequency_table  *freq_table;
-       void (*set_freq)(unsigned int, unsigned int);
-       bool (*need_apll_change)(unsigned int, unsigned int);
-       void __iomem    *cmu_regs;
-};
-
-#ifdef CONFIG_ARM_EXYNOS4X12_CPUFREQ
-extern int exynos4x12_cpufreq_init(struct exynos_dvfs_info *);
-#else
-static inline int exynos4x12_cpufreq_init(struct exynos_dvfs_info *info)
-{
-       return -EOPNOTSUPP;
-}
-#endif
-#ifdef CONFIG_ARM_EXYNOS5250_CPUFREQ
-extern int exynos5250_cpufreq_init(struct exynos_dvfs_info *);
-#else
-static inline int exynos5250_cpufreq_init(struct exynos_dvfs_info *info)
-{
-       return -EOPNOTSUPP;
-}
-#endif
-
-#define EXYNOS4_CLKSRC_CPU                     0x14200
-#define EXYNOS4_CLKMUX_STATCPU                 0x14400
-
-#define EXYNOS4_CLKDIV_CPU                     0x14500
-#define EXYNOS4_CLKDIV_CPU1                    0x14504
-#define EXYNOS4_CLKDIV_STATCPU                 0x14600
-#define EXYNOS4_CLKDIV_STATCPU1                        0x14604
-
-#define EXYNOS4_CLKSRC_CPU_MUXCORE_SHIFT       (16)
-#define EXYNOS4_CLKMUX_STATCPU_MUXCORE_MASK    (0x7 << EXYNOS4_CLKSRC_CPU_MUXCORE_SHIFT)
-
-#define EXYNOS5_APLL_LOCK                      0x00000
-#define EXYNOS5_APLL_CON0                      0x00100
-#define EXYNOS5_CLKMUX_STATCPU                 0x00400
-#define EXYNOS5_CLKDIV_CPU0                    0x00500
-#define EXYNOS5_CLKDIV_CPU1                    0x00504
-#define EXYNOS5_CLKDIV_STATCPU0                        0x00600
-#define EXYNOS5_CLKDIV_STATCPU1                        0x00604
diff --git a/drivers/cpufreq/exynos4x12-cpufreq.c b/drivers/cpufreq/exynos4x12-cpufreq.c
deleted file mode 100644 (file)
index 9e78a85..0000000
+++ /dev/null
@@ -1,236 +0,0 @@
-/*
- * Copyright (c) 2010-2012 Samsung Electronics Co., Ltd.
- *             http://www.samsung.com
- *
- * EXYNOS4X12 - CPU frequency scaling support
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
-*/
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/err.h>
-#include <linux/clk.h>
-#include <linux/io.h>
-#include <linux/slab.h>
-#include <linux/cpufreq.h>
-#include <linux/of.h>
-#include <linux/of_address.h>
-
-#include "exynos-cpufreq.h"
-
-static struct clk *cpu_clk;
-static struct clk *moutcore;
-static struct clk *mout_mpll;
-static struct clk *mout_apll;
-static struct exynos_dvfs_info *cpufreq;
-
-static unsigned int exynos4x12_volt_table[] = {
-       1350000, 1287500, 1250000, 1187500, 1137500, 1087500, 1037500,
-       1000000,  987500,  975000,  950000,  925000,  900000,  900000
-};
-
-static struct cpufreq_frequency_table exynos4x12_freq_table[] = {
-       {CPUFREQ_BOOST_FREQ, L0, 1500 * 1000},
-       {0, L1, 1400 * 1000},
-       {0, L2, 1300 * 1000},
-       {0, L3, 1200 * 1000},
-       {0, L4, 1100 * 1000},
-       {0, L5, 1000 * 1000},
-       {0, L6,  900 * 1000},
-       {0, L7,  800 * 1000},
-       {0, L8,  700 * 1000},
-       {0, L9,  600 * 1000},
-       {0, L10, 500 * 1000},
-       {0, L11, 400 * 1000},
-       {0, L12, 300 * 1000},
-       {0, L13, 200 * 1000},
-       {0, 0, CPUFREQ_TABLE_END},
-};
-
-static struct apll_freq *apll_freq_4x12;
-
-static struct apll_freq apll_freq_4212[] = {
-       /*
-        * values:
-        * freq
-        * clock divider for CORE, COREM0, COREM1, PERIPH, ATB, PCLK_DBG, APLL, CORE2
-        * clock divider for COPY, HPM, RESERVED
-        * PLL M, P, S
-        */
-       APLL_FREQ(1500, 0, 3, 7, 0, 6, 1, 2, 0, 6, 2, 0, 250, 4, 0),
-       APLL_FREQ(1400, 0, 3, 7, 0, 6, 1, 2, 0, 6, 2, 0, 175, 3, 0),
-       APLL_FREQ(1300, 0, 3, 7, 0, 5, 1, 2, 0, 5, 2, 0, 325, 6, 0),
-       APLL_FREQ(1200, 0, 3, 7, 0, 5, 1, 2, 0, 5, 2, 0, 200, 4, 0),
-       APLL_FREQ(1100, 0, 3, 6, 0, 4, 1, 2, 0, 4, 2, 0, 275, 6, 0),
-       APLL_FREQ(1000, 0, 2, 5, 0, 4, 1, 1, 0, 4, 2, 0, 125, 3, 0),
-       APLL_FREQ(900,  0, 2, 5, 0, 3, 1, 1, 0, 3, 2, 0, 150, 4, 0),
-       APLL_FREQ(800,  0, 2, 5, 0, 3, 1, 1, 0, 3, 2, 0, 100, 3, 0),
-       APLL_FREQ(700,  0, 2, 4, 0, 3, 1, 1, 0, 3, 2, 0, 175, 3, 1),
-       APLL_FREQ(600,  0, 2, 4, 0, 3, 1, 1, 0, 3, 2, 0, 200, 4, 1),
-       APLL_FREQ(500,  0, 2, 4, 0, 3, 1, 1, 0, 3, 2, 0, 125, 3, 1),
-       APLL_FREQ(400,  0, 2, 4, 0, 3, 1, 1, 0, 3, 2, 0, 100, 3, 1),
-       APLL_FREQ(300,  0, 2, 4, 0, 2, 1, 1, 0, 3, 2, 0, 200, 4, 2),
-       APLL_FREQ(200,  0, 1, 3, 0, 1, 1, 1, 0, 3, 2, 0, 100, 3, 2),
-};
-
-static struct apll_freq apll_freq_4412[] = {
-       /*
-        * values:
-        * freq
-        * clock divider for CORE, COREM0, COREM1, PERIPH, ATB, PCLK_DBG, APLL, CORE2
-        * clock divider for COPY, HPM, CORES
-        * PLL M, P, S
-        */
-       APLL_FREQ(1500, 0, 3, 7, 0, 6, 1, 2, 0, 6, 0, 7, 250, 4, 0),
-       APLL_FREQ(1400, 0, 3, 7, 0, 6, 1, 2, 0, 6, 0, 6, 175, 3, 0),
-       APLL_FREQ(1300, 0, 3, 7, 0, 5, 1, 2, 0, 5, 0, 6, 325, 6, 0),
-       APLL_FREQ(1200, 0, 3, 7, 0, 5, 1, 2, 0, 5, 0, 5, 200, 4, 0),
-       APLL_FREQ(1100, 0, 3, 6, 0, 4, 1, 2, 0, 4, 0, 5, 275, 6, 0),
-       APLL_FREQ(1000, 0, 2, 5, 0, 4, 1, 1, 0, 4, 0, 4, 125, 3, 0),
-       APLL_FREQ(900,  0, 2, 5, 0, 3, 1, 1, 0, 3, 0, 4, 150, 4, 0),
-       APLL_FREQ(800,  0, 2, 5, 0, 3, 1, 1, 0, 3, 0, 3, 100, 3, 0),
-       APLL_FREQ(700,  0, 2, 4, 0, 3, 1, 1, 0, 3, 0, 3, 175, 3, 1),
-       APLL_FREQ(600,  0, 2, 4, 0, 3, 1, 1, 0, 3, 0, 2, 200, 4, 1),
-       APLL_FREQ(500,  0, 2, 4, 0, 3, 1, 1, 0, 3, 0, 2, 125, 3, 1),
-       APLL_FREQ(400,  0, 2, 4, 0, 3, 1, 1, 0, 3, 0, 1, 100, 3, 1),
-       APLL_FREQ(300,  0, 2, 4, 0, 2, 1, 1, 0, 3, 0, 1, 200, 4, 2),
-       APLL_FREQ(200,  0, 1, 3, 0, 1, 1, 1, 0, 3, 0, 0, 100, 3, 2),
-};
-
-static void exynos4x12_set_clkdiv(unsigned int div_index)
-{
-       unsigned int tmp;
-
-       /* Change Divider - CPU0 */
-
-       tmp = apll_freq_4x12[div_index].clk_div_cpu0;
-
-       __raw_writel(tmp, cpufreq->cmu_regs + EXYNOS4_CLKDIV_CPU);
-
-       while (__raw_readl(cpufreq->cmu_regs + EXYNOS4_CLKDIV_STATCPU)
-              & 0x11111111)
-               cpu_relax();
-
-       /* Change Divider - CPU1 */
-       tmp = apll_freq_4x12[div_index].clk_div_cpu1;
-
-       __raw_writel(tmp, cpufreq->cmu_regs + EXYNOS4_CLKDIV_CPU1);
-
-       do {
-               cpu_relax();
-               tmp = __raw_readl(cpufreq->cmu_regs + EXYNOS4_CLKDIV_STATCPU1);
-       } while (tmp != 0x0);
-}
-
-static void exynos4x12_set_apll(unsigned int index)
-{
-       unsigned int tmp, freq = apll_freq_4x12[index].freq;
-
-       /* MUX_CORE_SEL = MPLL, ARMCLK uses MPLL for lock time */
-       clk_set_parent(moutcore, mout_mpll);
-
-       do {
-               cpu_relax();
-               tmp = (__raw_readl(cpufreq->cmu_regs + EXYNOS4_CLKMUX_STATCPU)
-                       >> EXYNOS4_CLKSRC_CPU_MUXCORE_SHIFT);
-               tmp &= 0x7;
-       } while (tmp != 0x2);
-
-       clk_set_rate(mout_apll, freq * 1000);
-
-       /* MUX_CORE_SEL = APLL */
-       clk_set_parent(moutcore, mout_apll);
-
-       do {
-               cpu_relax();
-               tmp = __raw_readl(cpufreq->cmu_regs + EXYNOS4_CLKMUX_STATCPU);
-               tmp &= EXYNOS4_CLKMUX_STATCPU_MUXCORE_MASK;
-       } while (tmp != (0x1 << EXYNOS4_CLKSRC_CPU_MUXCORE_SHIFT));
-}
-
-static void exynos4x12_set_frequency(unsigned int old_index,
-                                 unsigned int new_index)
-{
-       if (old_index > new_index) {
-               exynos4x12_set_clkdiv(new_index);
-               exynos4x12_set_apll(new_index);
-       } else if (old_index < new_index) {
-               exynos4x12_set_apll(new_index);
-               exynos4x12_set_clkdiv(new_index);
-       }
-}
-
-int exynos4x12_cpufreq_init(struct exynos_dvfs_info *info)
-{
-       struct device_node *np;
-       unsigned long rate;
-
-       /*
-        * HACK: This is a temporary workaround to get access to clock
-        * controller registers directly and remove static mappings and
-        * dependencies on platform headers. It is necessary to enable
-        * Exynos multi-platform support and will be removed together with
-        * this whole driver as soon as Exynos gets migrated to use
-        * cpufreq-dt driver.
-        */
-       np = of_find_compatible_node(NULL, NULL, "samsung,exynos4412-clock");
-       if (!np) {
-               pr_err("%s: failed to find clock controller DT node\n",
-                       __func__);
-               return -ENODEV;
-       }
-
-       info->cmu_regs = of_iomap(np, 0);
-       if (!info->cmu_regs) {
-               pr_err("%s: failed to map CMU registers\n", __func__);
-               return -EFAULT;
-       }
-
-       cpu_clk = clk_get(NULL, "armclk");
-       if (IS_ERR(cpu_clk))
-               return PTR_ERR(cpu_clk);
-
-       moutcore = clk_get(NULL, "moutcore");
-       if (IS_ERR(moutcore))
-               goto err_moutcore;
-
-       mout_mpll = clk_get(NULL, "mout_mpll");
-       if (IS_ERR(mout_mpll))
-               goto err_mout_mpll;
-
-       rate = clk_get_rate(mout_mpll) / 1000;
-
-       mout_apll = clk_get(NULL, "mout_apll");
-       if (IS_ERR(mout_apll))
-               goto err_mout_apll;
-
-       if (info->type == EXYNOS_SOC_4212)
-               apll_freq_4x12 = apll_freq_4212;
-       else
-               apll_freq_4x12 = apll_freq_4412;
-
-       info->mpll_freq_khz = rate;
-       /* 800Mhz */
-       info->pll_safe_idx = L7;
-       info->cpu_clk = cpu_clk;
-       info->volt_table = exynos4x12_volt_table;
-       info->freq_table = exynos4x12_freq_table;
-       info->set_freq = exynos4x12_set_frequency;
-
-       cpufreq = info;
-
-       return 0;
-
-err_mout_apll:
-       clk_put(mout_mpll);
-err_mout_mpll:
-       clk_put(moutcore);
-err_moutcore:
-       clk_put(cpu_clk);
-
-       pr_debug("%s: failed initialization\n", __func__);
-       return -EINVAL;
-}
diff --git a/drivers/cpufreq/exynos5250-cpufreq.c b/drivers/cpufreq/exynos5250-cpufreq.c
deleted file mode 100644 (file)
index 3eafdc7..0000000
+++ /dev/null
@@ -1,210 +0,0 @@
-/*
- * Copyright (c) 2010-2012 Samsung Electronics Co., Ltd.
- *             http://www.samsung.com
- *
- * EXYNOS5250 - CPU frequency scaling support
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
-*/
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/err.h>
-#include <linux/clk.h>
-#include <linux/io.h>
-#include <linux/slab.h>
-#include <linux/cpufreq.h>
-#include <linux/of.h>
-#include <linux/of_address.h>
-
-#include "exynos-cpufreq.h"
-
-static struct clk *cpu_clk;
-static struct clk *moutcore;
-static struct clk *mout_mpll;
-static struct clk *mout_apll;
-static struct exynos_dvfs_info *cpufreq;
-
-static unsigned int exynos5250_volt_table[] = {
-       1300000, 1250000, 1225000, 1200000, 1150000,
-       1125000, 1100000, 1075000, 1050000, 1025000,
-       1012500, 1000000,  975000,  950000,  937500,
-       925000
-};
-
-static struct cpufreq_frequency_table exynos5250_freq_table[] = {
-       {0, L0, 1700 * 1000},
-       {0, L1, 1600 * 1000},
-       {0, L2, 1500 * 1000},
-       {0, L3, 1400 * 1000},
-       {0, L4, 1300 * 1000},
-       {0, L5, 1200 * 1000},
-       {0, L6, 1100 * 1000},
-       {0, L7, 1000 * 1000},
-       {0, L8,  900 * 1000},
-       {0, L9,  800 * 1000},
-       {0, L10, 700 * 1000},
-       {0, L11, 600 * 1000},
-       {0, L12, 500 * 1000},
-       {0, L13, 400 * 1000},
-       {0, L14, 300 * 1000},
-       {0, L15, 200 * 1000},
-       {0, 0, CPUFREQ_TABLE_END},
-};
-
-static struct apll_freq apll_freq_5250[] = {
-       /*
-        * values:
-        * freq
-        * clock divider for ARM, CPUD, ACP, PERIPH, ATB, PCLK_DBG, APLL, ARM2
-        * clock divider for COPY, HPM, RESERVED
-        * PLL M, P, S
-        */
-       APLL_FREQ(1700, 0, 3, 7, 7, 7, 3, 5, 0, 0, 2, 0, 425, 6, 0),
-       APLL_FREQ(1600, 0, 3, 7, 7, 7, 1, 4, 0, 0, 2, 0, 200, 3, 0),
-       APLL_FREQ(1500, 0, 2, 7, 7, 7, 1, 4, 0, 0, 2, 0, 250, 4, 0),
-       APLL_FREQ(1400, 0, 2, 7, 7, 6, 1, 4, 0, 0, 2, 0, 175, 3, 0),
-       APLL_FREQ(1300, 0, 2, 7, 7, 6, 1, 3, 0, 0, 2, 0, 325, 6, 0),
-       APLL_FREQ(1200, 0, 2, 7, 7, 5, 1, 3, 0, 0, 2, 0, 200, 4, 0),
-       APLL_FREQ(1100, 0, 3, 7, 7, 5, 1, 3, 0, 0, 2, 0, 275, 6, 0),
-       APLL_FREQ(1000, 0, 1, 7, 7, 4, 1, 2, 0, 0, 2, 0, 125, 3, 0),
-       APLL_FREQ(900,  0, 1, 7, 7, 4, 1, 2, 0, 0, 2, 0, 150, 4, 0),
-       APLL_FREQ(800,  0, 1, 7, 7, 4, 1, 2, 0, 0, 2, 0, 100, 3, 0),
-       APLL_FREQ(700,  0, 1, 7, 7, 3, 1, 1, 0, 0, 2, 0, 175, 3, 1),
-       APLL_FREQ(600,  0, 1, 7, 7, 3, 1, 1, 0, 0, 2, 0, 200, 4, 1),
-       APLL_FREQ(500,  0, 1, 7, 7, 2, 1, 1, 0, 0, 2, 0, 125, 3, 1),
-       APLL_FREQ(400,  0, 1, 7, 7, 2, 1, 1, 0, 0, 2, 0, 100, 3, 1),
-       APLL_FREQ(300,  0, 1, 7, 7, 1, 1, 1, 0, 0, 2, 0, 200, 4, 2),
-       APLL_FREQ(200,  0, 1, 7, 7, 1, 1, 1, 0, 0, 2, 0, 100, 3, 2),
-};
-
-static void set_clkdiv(unsigned int div_index)
-{
-       unsigned int tmp;
-
-       /* Change Divider - CPU0 */
-
-       tmp = apll_freq_5250[div_index].clk_div_cpu0;
-
-       __raw_writel(tmp, cpufreq->cmu_regs + EXYNOS5_CLKDIV_CPU0);
-
-       while (__raw_readl(cpufreq->cmu_regs + EXYNOS5_CLKDIV_STATCPU0)
-              & 0x11111111)
-               cpu_relax();
-
-       /* Change Divider - CPU1 */
-       tmp = apll_freq_5250[div_index].clk_div_cpu1;
-
-       __raw_writel(tmp, cpufreq->cmu_regs + EXYNOS5_CLKDIV_CPU1);
-
-       while (__raw_readl(cpufreq->cmu_regs + EXYNOS5_CLKDIV_STATCPU1) & 0x11)
-               cpu_relax();
-}
-
-static void set_apll(unsigned int index)
-{
-       unsigned int tmp;
-       unsigned int freq = apll_freq_5250[index].freq;
-
-       /* MUX_CORE_SEL = MPLL, ARMCLK uses MPLL for lock time */
-       clk_set_parent(moutcore, mout_mpll);
-
-       do {
-               cpu_relax();
-               tmp = (__raw_readl(cpufreq->cmu_regs + EXYNOS5_CLKMUX_STATCPU)
-                       >> 16);
-               tmp &= 0x7;
-       } while (tmp != 0x2);
-
-       clk_set_rate(mout_apll, freq * 1000);
-
-       /* MUX_CORE_SEL = APLL */
-       clk_set_parent(moutcore, mout_apll);
-
-       do {
-               cpu_relax();
-               tmp = __raw_readl(cpufreq->cmu_regs + EXYNOS5_CLKMUX_STATCPU);
-               tmp &= (0x7 << 16);
-       } while (tmp != (0x1 << 16));
-}
-
-static void exynos5250_set_frequency(unsigned int old_index,
-                                 unsigned int new_index)
-{
-       if (old_index > new_index) {
-               set_clkdiv(new_index);
-               set_apll(new_index);
-       } else if (old_index < new_index) {
-               set_apll(new_index);
-               set_clkdiv(new_index);
-       }
-}
-
-int exynos5250_cpufreq_init(struct exynos_dvfs_info *info)
-{
-       struct device_node *np;
-       unsigned long rate;
-
-       /*
-        * HACK: This is a temporary workaround to get access to clock
-        * controller registers directly and remove static mappings and
-        * dependencies on platform headers. It is necessary to enable
-        * Exynos multi-platform support and will be removed together with
-        * this whole driver as soon as Exynos gets migrated to use
-        * cpufreq-dt driver.
-        */
-       np = of_find_compatible_node(NULL, NULL, "samsung,exynos5250-clock");
-       if (!np) {
-               pr_err("%s: failed to find clock controller DT node\n",
-                       __func__);
-               return -ENODEV;
-       }
-
-       info->cmu_regs = of_iomap(np, 0);
-       if (!info->cmu_regs) {
-               pr_err("%s: failed to map CMU registers\n", __func__);
-               return -EFAULT;
-       }
-
-       cpu_clk = clk_get(NULL, "armclk");
-       if (IS_ERR(cpu_clk))
-               return PTR_ERR(cpu_clk);
-
-       moutcore = clk_get(NULL, "mout_cpu");
-       if (IS_ERR(moutcore))
-               goto err_moutcore;
-
-       mout_mpll = clk_get(NULL, "mout_mpll");
-       if (IS_ERR(mout_mpll))
-               goto err_mout_mpll;
-
-       rate = clk_get_rate(mout_mpll) / 1000;
-
-       mout_apll = clk_get(NULL, "mout_apll");
-       if (IS_ERR(mout_apll))
-               goto err_mout_apll;
-
-       info->mpll_freq_khz = rate;
-       /* 800Mhz */
-       info->pll_safe_idx = L9;
-       info->cpu_clk = cpu_clk;
-       info->volt_table = exynos5250_volt_table;
-       info->freq_table = exynos5250_freq_table;
-       info->set_freq = exynos5250_set_frequency;
-
-       cpufreq = info;
-
-       return 0;
-
-err_mout_apll:
-       clk_put(mout_mpll);
-err_mout_mpll:
-       clk_put(moutcore);
-err_moutcore:
-       clk_put(cpu_clk);
-
-       pr_err("%s: failed initialization\n", __func__);
-       return -EINVAL;
-}
index cddc61939a86a911f1792ba9eb89a955b55a24ce..3af9dd7332e6927d8dd860b5af410fba738bff4a 100644 (file)
@@ -260,24 +260,31 @@ static inline void update_turbo_state(void)
                 cpu->pstate.max_pstate == cpu->pstate.turbo_pstate);
 }
 
-#define PCT_TO_HWP(x) (x * 255 / 100)
 static void intel_pstate_hwp_set(void)
 {
-       int min, max, cpu;
-       u64 value, freq;
+       int min, hw_min, max, hw_max, cpu, range, adj_range;
+       u64 value, cap;
+
+       rdmsrl(MSR_HWP_CAPABILITIES, cap);
+       hw_min = HWP_LOWEST_PERF(cap);
+       hw_max = HWP_HIGHEST_PERF(cap);
+       range = hw_max - hw_min;
 
        get_online_cpus();
 
        for_each_online_cpu(cpu) {
                rdmsrl_on_cpu(cpu, MSR_HWP_REQUEST, &value);
-               min = PCT_TO_HWP(limits.min_perf_pct);
+               adj_range = limits.min_perf_pct * range / 100;
+               min = hw_min + adj_range;
                value &= ~HWP_MIN_PERF(~0L);
                value |= HWP_MIN_PERF(min);
 
-               max = PCT_TO_HWP(limits.max_perf_pct);
+               adj_range = limits.max_perf_pct * range / 100;
+               max = hw_min + adj_range;
                if (limits.no_turbo) {
-                       rdmsrl( MSR_HWP_CAPABILITIES, freq);
-                       max = HWP_GUARANTEED_PERF(freq);
+                       hw_max = HWP_GUARANTEED_PERF(cap);
+                       if (hw_max < max)
+                               max = hw_max;
                }
 
                value &= ~HWP_MAX_PERF(~0L);
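
Editorial note: the rewritten intel_pstate_hwp_set() maps the user percentages onto the hardware's own [lowest, highest] performance range read from MSR_HWP_CAPABILITIES instead of a fixed 0-255 scale. A worked example of the new arithmetic with assumed capability values (illustration only, not from the patch):

	/* Assume HWP_CAPABILITIES reports lowest = 8, highest = 36. */
	int hw_min = 8, hw_max = 36, range = hw_max - hw_min;   /* range = 28 */

	int min = hw_min +  25 * range / 100;   /*  8 +  7 = 15 for min_perf_pct = 25  */
	int max = hw_min + 100 * range / 100;   /*  8 + 28 = 36 for max_perf_pct = 100 */

	/* The old PCT_TO_HWP(25) = 25 * 255 / 100 = 63 already exceeds this CPU's hw_max. */
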
@@ -423,6 +430,8 @@ static ssize_t store_max_perf_pct(struct kobject *a, struct attribute *b,
 
        limits.max_sysfs_pct = clamp_t(int, input, 0 , 100);
        limits.max_perf_pct = min(limits.max_policy_pct, limits.max_sysfs_pct);
+       limits.max_perf_pct = max(limits.min_policy_pct, limits.max_perf_pct);
+       limits.max_perf_pct = max(limits.min_perf_pct, limits.max_perf_pct);
        limits.max_perf = div_fp(int_tofp(limits.max_perf_pct), int_tofp(100));
 
        if (hwp_active)
@@ -442,6 +451,8 @@ static ssize_t store_min_perf_pct(struct kobject *a, struct attribute *b,
 
        limits.min_sysfs_pct = clamp_t(int, input, 0 , 100);
        limits.min_perf_pct = max(limits.min_policy_pct, limits.min_sysfs_pct);
+       limits.min_perf_pct = min(limits.max_policy_pct, limits.min_perf_pct);
+       limits.min_perf_pct = min(limits.max_perf_pct, limits.min_perf_pct);
        limits.min_perf = div_fp(int_tofp(limits.min_perf_pct), int_tofp(100));
 
        if (hwp_active)
@@ -989,12 +1000,19 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy)
 
        limits.min_policy_pct = (policy->min * 100) / policy->cpuinfo.max_freq;
        limits.min_policy_pct = clamp_t(int, limits.min_policy_pct, 0 , 100);
-       limits.min_perf_pct = max(limits.min_policy_pct, limits.min_sysfs_pct);
-       limits.min_perf = div_fp(int_tofp(limits.min_perf_pct), int_tofp(100));
-
        limits.max_policy_pct = (policy->max * 100) / policy->cpuinfo.max_freq;
        limits.max_policy_pct = clamp_t(int, limits.max_policy_pct, 0 , 100);
+
+       /* Normalize user input to [min_policy_pct, max_policy_pct] */
+       limits.min_perf_pct = max(limits.min_policy_pct, limits.min_sysfs_pct);
+       limits.min_perf_pct = min(limits.max_policy_pct, limits.min_perf_pct);
        limits.max_perf_pct = min(limits.max_policy_pct, limits.max_sysfs_pct);
+       limits.max_perf_pct = max(limits.min_policy_pct, limits.max_perf_pct);
+
+       /* Make sure min_perf_pct <= max_perf_pct */
+       limits.min_perf_pct = min(limits.max_perf_pct, limits.min_perf_pct);
+
+       limits.min_perf = div_fp(int_tofp(limits.min_perf_pct), int_tofp(100));
        limits.max_perf = div_fp(int_tofp(limits.max_perf_pct), int_tofp(100));
 
        if (hwp_active)
index 1523e2d745eb59682d18495273c6b76b022bb035..344058f8501a2c2ee888189950b79f615e815a02 100644 (file)
@@ -186,6 +186,28 @@ bool cpuidle_state_is_coupled(struct cpuidle_driver *drv, int state)
        return drv->states[state].flags & CPUIDLE_FLAG_COUPLED;
 }
 
+/**
+ * cpuidle_coupled_state_verify - check if the coupled states are correctly set.
+ * @drv: struct cpuidle_driver for the platform
+ *
+ * Returns 0 for valid state values, a negative error code otherwise:
+ *  * -EINVAL if any coupled state(safe_state_index) is wrongly set.
+ */
+int cpuidle_coupled_state_verify(struct cpuidle_driver *drv)
+{
+       int i;
+
+       for (i = drv->state_count - 1; i >= 0; i--) {
+               if (cpuidle_state_is_coupled(drv, i) &&
+                   (drv->safe_state_index == i ||
+                    drv->safe_state_index < 0 ||
+                    drv->safe_state_index >= drv->state_count))
+                       return -EINVAL;
+       }
+
+       return 0;
+}
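
Editorial note: cpuidle_coupled_state_verify() rejects drivers whose coupled states lack a usable safe_state_index. A hypothetical driver layout that passes the check, with a non-coupled state 0 acting as the safe state (enter callbacks and other fields omitted; not part of this commit):

	static struct cpuidle_driver example_idle_driver = {
		.name             = "example_idle",
		.safe_state_index = 0,          /* valid, and not a coupled state itself */
		.state_count      = 2,
		.states = {
			[0] = { .name = "wfi", .exit_latency = 1,
				.target_residency = 1 },
			[1] = { .name = "off", .exit_latency = 500,
				.target_residency = 1000,
				.flags = CPUIDLE_FLAG_COUPLED },  /* coupled cluster state */
		},
	};
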
+
 /**
  * cpuidle_coupled_set_ready - mark a cpu as ready
  * @coupled: the struct coupled that contains the current cpu
index 178c5ad3d56871a4ce034fa7290a54f7ae3882c5..f87f399b0540c139dad8e58134ebd21e925d4fd9 100644 (file)
@@ -35,6 +35,7 @@ extern void cpuidle_remove_sysfs(struct cpuidle_device *dev);
 
 #ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED
 bool cpuidle_state_is_coupled(struct cpuidle_driver *drv, int state);
+int cpuidle_coupled_state_verify(struct cpuidle_driver *drv);
 int cpuidle_enter_state_coupled(struct cpuidle_device *dev,
                struct cpuidle_driver *drv, int next_state);
 int cpuidle_coupled_register_device(struct cpuidle_device *dev);
@@ -46,6 +47,11 @@ bool cpuidle_state_is_coupled(struct cpuidle_driver *drv, int state)
        return false;
 }
 
+static inline int cpuidle_coupled_state_verify(struct cpuidle_driver *drv)
+{
+       return 0;
+}
+
 static inline int cpuidle_enter_state_coupled(struct cpuidle_device *dev,
                struct cpuidle_driver *drv, int next_state)
 {
index 5db147859b9047db626d66e64bef2897a41822f2..389ade4572beb17c71ff44faf2ce884ad16baf54 100644 (file)
@@ -227,6 +227,10 @@ static int __cpuidle_register_driver(struct cpuidle_driver *drv)
        if (!drv || !drv->state_count)
                return -EINVAL;
 
+       ret = cpuidle_coupled_state_verify(drv);
+       if (ret)
+               return ret;
+
        if (cpuidle_disabled())
                return -ENODEV;
 
index 07bc7aa6b224aeeb7ada29b08aec61966d3b3b2e..d234719065a5f2e39c3e81681dc0bd184e803033 100644 (file)
@@ -461,7 +461,7 @@ config CRYPTO_DEV_QCE
 
 config CRYPTO_DEV_VMX
        bool "Support for VMX cryptographic acceleration instructions"
-       depends on PPC64
+       depends on PPC64 && VSX
        help
          Support for VMX cryptographic acceleration instructions.
 
index e41986967294164f0e226f39faff1e817fb3c590..52340b9bb3873bbf1a327c7b41bbe3f2ffa5b222 100644 (file)
@@ -86,9 +86,7 @@ static int adf_ring_show(struct seq_file *sfile, void *v)
 {
        struct adf_etr_ring_data *ring = sfile->private;
        struct adf_etr_bank_data *bank = ring->bank;
-       uint32_t *msg = v;
        void __iomem *csr = ring->bank->csr_addr;
-       int i, x;
 
        if (v == SEQ_START_TOKEN) {
                int head, tail, empty;
@@ -113,18 +111,8 @@ static int adf_ring_show(struct seq_file *sfile, void *v)
                seq_puts(sfile, "----------- Ring data ------------\n");
                return 0;
        }
-       seq_printf(sfile, "%p:", msg);
-       x = 0;
-       i = 0;
-       for (; i < (ADF_MSG_SIZE_TO_BYTES(ring->msg_size) >> 2); i++) {
-               seq_printf(sfile, " %08X", *(msg + i));
-               if ((ADF_MSG_SIZE_TO_BYTES(ring->msg_size) >> 2) != i + 1 &&
-                   (++x == 8)) {
-                       seq_printf(sfile, "\n%p:", msg + i + 1);
-                       x = 0;
-               }
-       }
-       seq_puts(sfile, "\n");
+       seq_hex_dump(sfile, "", DUMP_PREFIX_ADDRESS, 32, 4,
+                    v, ADF_MSG_SIZE_TO_BYTES(ring->msg_size), false);
        return 0;
 }
 
index e070c316e8b76a7384efa33a6a34ec99fb9a8849..a19ee127edcafd3c70ad9e6ee8a86b1823f020f7 100644 (file)
@@ -104,7 +104,7 @@ static int sun4i_ss_opti_poll(struct ablkcipher_request *areq)
                        sg_miter_next(&mo);
                        oo = 0;
                }
-       } while (mo.length > 0);
+       } while (oleft > 0);
 
        if (areq->info) {
                for (i = 0; i < 4 && i < ivsize / 4; i++) {
index ca7831168298a444d1b2dd55fbdec48a287b5ab1..cf1268ddef0c058982ff45fb0bb0074a06d2f6b7 100644 (file)
@@ -280,6 +280,7 @@ struct sbridge_info {
        u8              max_interleave;
        u8              (*get_node_id)(struct sbridge_pvt *pvt);
        enum mem_type   (*get_memory_type)(struct sbridge_pvt *pvt);
+       enum dev_type   (*get_width)(struct sbridge_pvt *pvt, u32 mtr);
        struct pci_dev  *pci_vtd;
 };
 
@@ -471,6 +472,9 @@ static const struct pci_id_table pci_dev_descr_ibridge_table[] = {
 #define PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TAD2 0x2f6c
 #define PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TAD3 0x2f6d
 #define PCI_DEVICE_ID_INTEL_HASWELL_IMC_DDRIO0 0x2fbd
+#define PCI_DEVICE_ID_INTEL_HASWELL_IMC_DDRIO1 0x2fbf
+#define PCI_DEVICE_ID_INTEL_HASWELL_IMC_DDRIO2 0x2fb9
+#define PCI_DEVICE_ID_INTEL_HASWELL_IMC_DDRIO3 0x2fbb
 static const struct pci_id_descr pci_dev_descr_haswell[] = {
        /* first item must be the HA */
        { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0, 0)             },
@@ -488,6 +492,9 @@ static const struct pci_id_descr pci_dev_descr_haswell[] = {
        { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0_TAD3, 1)        },
 
        { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_DDRIO0, 1)          },
+       { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_DDRIO1, 1)          },
+       { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_DDRIO2, 1)          },
+       { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_DDRIO3, 1)          },
 
        { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TA, 1)          },
        { PCI_DESCR(PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_THERMAL, 1)     },
@@ -762,6 +769,49 @@ out:
        return mtype;
 }
 
+static enum dev_type sbridge_get_width(struct sbridge_pvt *pvt, u32 mtr)
+{
+       /* there's no way to figure out */
+       return DEV_UNKNOWN;
+}
+
+static enum dev_type __ibridge_get_width(u32 mtr)
+{
+       enum dev_type type;
+
+       switch (mtr) {
+       case 3:
+               type = DEV_UNKNOWN;
+               break;
+       case 2:
+               type = DEV_X16;
+               break;
+       case 1:
+               type = DEV_X8;
+               break;
+       case 0:
+               type = DEV_X4;
+               break;
+       }
+
+       return type;
+}
+
+static enum dev_type ibridge_get_width(struct sbridge_pvt *pvt, u32 mtr)
+{
+       /*
+        * ddr3_width in the documentation but also valid for DDR4 on
+        * Haswell
+        */
+       return __ibridge_get_width(GET_BITFIELD(mtr, 7, 8));
+}
+
+static enum dev_type broadwell_get_width(struct sbridge_pvt *pvt, u32 mtr)
+{
+       /* ddr3_width in the documentation but also valid for DDR4 */
+       return __ibridge_get_width(GET_BITFIELD(mtr, 8, 9));
+}
+
 static u8 get_node_id(struct sbridge_pvt *pvt)
 {
        u32 reg;
@@ -966,17 +1016,7 @@ static int get_dimm_config(struct mem_ctl_info *mci)
 
                                dimm->nr_pages = npages;
                                dimm->grain = 32;
-                               switch (banks) {
-                               case 16:
-                                       dimm->dtype = DEV_X16;
-                                       break;
-                               case 8:
-                                       dimm->dtype = DEV_X8;
-                                       break;
-                               case 4:
-                                       dimm->dtype = DEV_X4;
-                                       break;
-                               }
+                               dimm->dtype = pvt->info.get_width(pvt, mtr);
                                dimm->mtype = mtype;
                                dimm->edac_mode = mode;
                                snprintf(dimm->label, sizeof(dimm->label),
@@ -1869,7 +1909,11 @@ static int haswell_mci_bind_devs(struct mem_ctl_info *mci,
                }
                        break;
                case PCI_DEVICE_ID_INTEL_HASWELL_IMC_DDRIO0:
-                       pvt->pci_ddrio = pdev;
+               case PCI_DEVICE_ID_INTEL_HASWELL_IMC_DDRIO1:
+               case PCI_DEVICE_ID_INTEL_HASWELL_IMC_DDRIO2:
+               case PCI_DEVICE_ID_INTEL_HASWELL_IMC_DDRIO3:
+                       if (!pvt->pci_ddrio)
+                               pvt->pci_ddrio = pdev;
                        break;
                case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1:
                        pvt->pci_ha1 = pdev;
@@ -2361,6 +2405,7 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
                pvt->info.interleave_list = ibridge_interleave_list;
                pvt->info.max_interleave = ARRAY_SIZE(ibridge_interleave_list);
                pvt->info.interleave_pkg = ibridge_interleave_pkg;
+               pvt->info.get_width = ibridge_get_width;
                mci->ctl_name = kasprintf(GFP_KERNEL, "Ivy Bridge Socket#%d", mci->mc_idx);
 
                /* Store pci devices at mci for faster access */
@@ -2380,6 +2425,7 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
                pvt->info.interleave_list = sbridge_interleave_list;
                pvt->info.max_interleave = ARRAY_SIZE(sbridge_interleave_list);
                pvt->info.interleave_pkg = sbridge_interleave_pkg;
+               pvt->info.get_width = sbridge_get_width;
                mci->ctl_name = kasprintf(GFP_KERNEL, "Sandy Bridge Socket#%d", mci->mc_idx);
 
                /* Store pci devices at mci for faster access */
@@ -2399,6 +2445,7 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
                pvt->info.interleave_list = ibridge_interleave_list;
                pvt->info.max_interleave = ARRAY_SIZE(ibridge_interleave_list);
                pvt->info.interleave_pkg = ibridge_interleave_pkg;
+               pvt->info.get_width = ibridge_get_width;
                mci->ctl_name = kasprintf(GFP_KERNEL, "Haswell Socket#%d", mci->mc_idx);
 
                /* Store pci devices at mci for faster access */
@@ -2418,6 +2465,7 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
                pvt->info.interleave_list = ibridge_interleave_list;
                pvt->info.max_interleave = ARRAY_SIZE(ibridge_interleave_list);
                pvt->info.interleave_pkg = ibridge_interleave_pkg;
+               pvt->info.get_width = broadwell_get_width;
                mci->ctl_name = kasprintf(GFP_KERNEL, "Broadwell Socket#%d", mci->mc_idx);
 
                /* Store pci devices at mci for faster access */
index 54071c1483400d41e214c0f83512ca1f4600814a..84533e02fbf8ba292cddf960fde5881e1898821c 100644 (file)
@@ -43,7 +43,7 @@ config EFI_VARS_PSTORE_DEFAULT_DISABLE
 
 config EFI_RUNTIME_MAP
        bool "Export efi runtime maps to sysfs"
-       depends on X86 && EFI && KEXEC
+       depends on X86 && EFI && KEXEC_CORE
        default y
        help
          Export efi runtime memory maps to /sys/firmware/efi/runtime-map.
index b4fc9e4d24c6b15857e2a2cd3c1000e0473af8f9..8949b3f6f74d207f0bc935af11dd76cf956b49ba 100644 (file)
@@ -356,7 +356,7 @@ config GPIO_PXA
 
 config GPIO_RCAR
        tristate "Renesas R-Car GPIO"
-       depends on ARM && (ARCH_SHMOBILE || COMPILE_TEST)
+       depends on ARCH_SHMOBILE || COMPILE_TEST
        select GPIOLIB_IRQCHIP
        help
          Say yes here to support GPIO on Renesas R-Car SoCs.
index b752b560126e71c0066fd66f659641155b39ad71..8813abab9736e86246d0ab90740fbbf89c70fbbc 100644 (file)
@@ -339,13 +339,15 @@ static int gpio_set_wake_irq(struct irq_data *d, u32 enable)
        return 0;
 }
 
-static void mxc_gpio_init_gc(struct mxc_gpio_port *port, int irq_base)
+static int mxc_gpio_init_gc(struct mxc_gpio_port *port, int irq_base)
 {
        struct irq_chip_generic *gc;
        struct irq_chip_type *ct;
 
        gc = irq_alloc_generic_chip("gpio-mxc", 1, irq_base,
                                    port->base, handle_level_irq);
+       if (!gc)
+               return -ENOMEM;
        gc->private = port;
 
        ct = gc->chip_types;
@@ -360,6 +362,8 @@ static void mxc_gpio_init_gc(struct mxc_gpio_port *port, int irq_base)
 
        irq_setup_generic_chip(gc, IRQ_MSK(32), IRQ_GC_INIT_NESTED_LOCK,
                               IRQ_NOREQUEST, 0);
+
+       return 0;
 }
 
 static void mxc_gpio_get_hw(struct platform_device *pdev)
@@ -477,12 +481,16 @@ static int mxc_gpio_probe(struct platform_device *pdev)
        }
 
        /* gpio-mxc can be a generic irq chip */
-       mxc_gpio_init_gc(port, irq_base);
+       err = mxc_gpio_init_gc(port, irq_base);
+       if (err < 0)
+               goto out_irqdomain_remove;
 
        list_add_tail(&port->node, &mxc_gpio_ports);
 
        return 0;
 
+out_irqdomain_remove:
+       irq_domain_remove(port->domain);
 out_irqdesc_free:
        irq_free_descs(irq_base, 32);
 out_gpiochip_remove:
index b7f383eb18d91e2f82831e4813b987d74d18d77b..1387385e66978cbf096d033e8db152d88e8b1a8a 100644 (file)
@@ -196,13 +196,16 @@ static int mxs_gpio_set_wake_irq(struct irq_data *d, unsigned int enable)
        return 0;
 }
 
-static void __init mxs_gpio_init_gc(struct mxs_gpio_port *port, int irq_base)
+static int __init mxs_gpio_init_gc(struct mxs_gpio_port *port, int irq_base)
 {
        struct irq_chip_generic *gc;
        struct irq_chip_type *ct;
 
        gc = irq_alloc_generic_chip("gpio-mxs", 1, irq_base,
                                    port->base, handle_level_irq);
+       if (!gc)
+               return -ENOMEM;
+
        gc->private = port;
 
        ct = gc->chip_types;
@@ -216,6 +219,8 @@ static void __init mxs_gpio_init_gc(struct mxs_gpio_port *port, int irq_base)
 
        irq_setup_generic_chip(gc, IRQ_MSK(32), IRQ_GC_INIT_NESTED_LOCK,
                               IRQ_NOREQUEST, 0);
+
+       return 0;
 }
 
 static int mxs_gpio_to_irq(struct gpio_chip *gc, unsigned offset)
@@ -317,7 +322,9 @@ static int mxs_gpio_probe(struct platform_device *pdev)
        }
 
        /* gpio-mxs can be a generic irq chip */
-       mxs_gpio_init_gc(port, irq_base);
+       err = mxs_gpio_init_gc(port, irq_base);
+       if (err < 0)
+               goto out_irqdomain_remove;
 
        /* setup one handler for each entry */
        irq_set_chained_handler_and_data(port->irq, mxs_gpio_irq_handler,
@@ -343,6 +350,8 @@ static int mxs_gpio_probe(struct platform_device *pdev)
 
 out_bgpio_remove:
        bgpio_remove(&port->bgc);
+out_irqdomain_remove:
+       irq_domain_remove(port->domain);
 out_irqdesc_free:
        irq_free_descs(irq_base, 32);
        return err;
index 2ae0d47e955443f7f1aec2f51c522466aa54ce30..072af5239bc14867ab38d69dbe04d9ce68e5d925 100644 (file)
@@ -1098,7 +1098,6 @@ static int omap_gpio_chip_init(struct gpio_bank *bank, struct irq_chip *irqc)
        } else {
                bank->chip.label = "gpio";
                bank->chip.base = gpio;
-               gpio += bank->width;
        }
        bank->chip.ngpio = bank->width;
 
@@ -1108,6 +1107,9 @@ static int omap_gpio_chip_init(struct gpio_bank *bank, struct irq_chip *irqc)
                return ret;
        }
 
+       if (!bank->is_mpuio)
+               gpio += bank->width;
+
 #ifdef CONFIG_ARCH_OMAP1
        /*
         * REVISIT: Once we have OMAP1 supporting SPARSE_IRQ, we can drop
@@ -1253,8 +1255,11 @@ static int omap_gpio_probe(struct platform_device *pdev)
        omap_gpio_mod_init(bank);
 
        ret = omap_gpio_chip_init(bank, irqc);
-       if (ret)
+       if (ret) {
+               pm_runtime_put_sync(bank->dev);
+               pm_runtime_disable(bank->dev);
                return ret;
+       }
 
        omap_gpio_show_rev(bank);
 
index 458d9d7952b840af2d7d2666ef3567dac8244403..9c6b96707c9f286a9bbf13c756beab7d4cedbace 100644 (file)
@@ -706,4 +706,3 @@ module_exit(sx150x_exit);
 MODULE_AUTHOR("Gregory Bean <gbean@codeaurora.org>");
 MODULE_DESCRIPTION("Driver for Semtech SX150X I2C GPIO Expanders");
 MODULE_LICENSE("GPL v2");
-MODULE_ALIAS("i2c:sx150x");
index 980c1f87866ac268b3dc8046faec602e57ca32c9..5db3445552b176d2c11ca8b225ef88b4b80d22eb 100644 (file)
@@ -1174,15 +1174,16 @@ EXPORT_SYMBOL_GPL(gpiod_is_active_low);
  * that the GPIO was actually requested.
  */
 
-static bool _gpiod_get_raw_value(const struct gpio_desc *desc)
+static int _gpiod_get_raw_value(const struct gpio_desc *desc)
 {
        struct gpio_chip        *chip;
-       bool value;
        int offset;
+       int value;
 
        chip = desc->chip;
        offset = gpio_chip_hwgpio(desc);
-       value = chip->get ? chip->get(chip, offset) : false;
+       value = chip->get ? chip->get(chip, offset) : -EIO;
+       value = value < 0 ? value : !!value;
        trace_gpio_value(desc_to_gpio(desc), 1, value);
        return value;
 }
@@ -1192,7 +1193,7 @@ static bool _gpiod_get_raw_value(const struct gpio_desc *desc)
  * @desc: gpio whose value will be returned
  *
  * Return the GPIO's raw value, i.e. the value of the physical line disregarding
- * its ACTIVE_LOW status.
+ * its ACTIVE_LOW status, or negative errno on failure.
  *
  * This function should be called from contexts where we cannot sleep, and will
  * complain if the GPIO chip functions potentially sleep.
@@ -1212,7 +1213,7 @@ EXPORT_SYMBOL_GPL(gpiod_get_raw_value);
  * @desc: gpio whose value will be returned
  *
  * Return the GPIO's logical value, i.e. taking the ACTIVE_LOW status into
- * account.
+ * account, or negative errno on failure.
  *
  * This function should be called from contexts where we cannot sleep, and will
  * complain if the GPIO chip functions potentially sleep.
@@ -1226,6 +1227,9 @@ int gpiod_get_value(const struct gpio_desc *desc)
        WARN_ON(desc->chip->can_sleep);
 
        value = _gpiod_get_raw_value(desc);
+       if (value < 0)
+               return value;
+
        if (test_bit(FLAG_ACTIVE_LOW, &desc->flags))
                value = !value;
 
@@ -1548,7 +1552,7 @@ EXPORT_SYMBOL_GPL(gpiochip_unlock_as_irq);
  * @desc: gpio whose value will be returned
  *
  * Return the GPIO's raw value, i.e. the value of the physical line disregarding
- * its ACTIVE_LOW status.
+ * its ACTIVE_LOW status, or negative errno on failure.
  *
  * This function is to be called from contexts that can sleep.
  */
@@ -1566,7 +1570,7 @@ EXPORT_SYMBOL_GPL(gpiod_get_raw_value_cansleep);
  * @desc: gpio whose value will be returned
  *
  * Return the GPIO's logical value, i.e. taking the ACTIVE_LOW status into
- * account.
+ * account, or negative errno on failure.
  *
  * This function is to be called from contexts that can sleep.
  */
@@ -1579,6 +1583,9 @@ int gpiod_get_value_cansleep(const struct gpio_desc *desc)
                return 0;
 
        value = _gpiod_get_raw_value(desc);
+       if (value < 0)
+               return value;
+
        if (test_bit(FLAG_ACTIVE_LOW, &desc->flags))
                value = !value;
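
Editorial note: with _gpiod_get_raw_value() returning an int, gpiod_get_value() and gpiod_get_value_cansleep() can now propagate a negative errno instead of pretending the line read back low. A small consumer-side sketch (set_led() is a made-up helper, not from the patch):

	int level = gpiod_get_value_cansleep(desc);
	if (level < 0)
		return level;    /* e.g. -EIO when the chip has no .get hook */
	set_led(!!level);
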
 
index 434915448ea0be99b7786c6daf2650cfb93b728a..f7d5166f89b24ef740e854175927ad934652fed4 100644 (file)
@@ -1515,7 +1515,8 @@ retry:
                        copied_props++;
                }
 
-               if (obj->type == DRM_MODE_OBJECT_PLANE && count_props) {
+               if (obj->type == DRM_MODE_OBJECT_PLANE && count_props &&
+                   !(arg->flags & DRM_MODE_ATOMIC_TEST_ONLY)) {
                        plane = obj_to_plane(obj);
                        plane_mask |= (1 << drm_plane_index(plane));
                        plane->old_fb = plane->fb;
@@ -1537,10 +1538,11 @@ retry:
        }
 
        if (arg->flags & DRM_MODE_ATOMIC_TEST_ONLY) {
+               /*
+                * Unlike commit, check_only does not clean up state.
+                * Below we call drm_atomic_state_free for it.
+                */
                ret = drm_atomic_check_only(state);
-               /* _check_only() does not free state, unlike _commit() */
-               if (!ret)
-                       drm_atomic_state_free(state);
        } else if (arg->flags & DRM_MODE_ATOMIC_NONBLOCK) {
                ret = drm_atomic_async_commit(state);
        } else {
@@ -1567,25 +1569,30 @@ out:
                plane->old_fb = NULL;
        }
 
+       if (ret && arg->flags & DRM_MODE_PAGE_FLIP_EVENT) {
+               /*
+                * TEST_ONLY and PAGE_FLIP_EVENT are mutually exclusive,
+                * if they weren't, this code should be called on success
+                * for TEST_ONLY too.
+                */
+
+               for_each_crtc_in_state(state, crtc, crtc_state, i) {
+                       if (!crtc_state->event)
+                               continue;
+
+                       destroy_vblank_event(dev, file_priv,
+                                            crtc_state->event);
+               }
+       }
+
        if (ret == -EDEADLK) {
                drm_atomic_state_clear(state);
                drm_modeset_backoff(&ctx);
                goto retry;
        }
 
-       if (ret) {
-               if (arg->flags & DRM_MODE_PAGE_FLIP_EVENT) {
-                       for_each_crtc_in_state(state, crtc, crtc_state, i) {
-                               if (!crtc_state->event)
-                                       continue;
-
-                               destroy_vblank_event(dev, file_priv,
-                                                    crtc_state->event);
-                       }
-               }
-
+       if (ret || arg->flags & DRM_MODE_ATOMIC_TEST_ONLY)
                drm_atomic_state_free(state);
-       }
 
        drm_modeset_drop_locks(&ctx);
        drm_modeset_acquire_fini(&ctx);
index 80a02a412607f9c1b38e20b5494033b969c9e800..291734e87fca7457da9eb3ec8e0ba80f262bd232 100644 (file)
@@ -159,6 +159,8 @@ int drm_dp_bw_code_to_link_rate(u8 link_bw)
 }
 EXPORT_SYMBOL(drm_dp_bw_code_to_link_rate);
 
+#define AUX_RETRY_INTERVAL 500 /* us */
+
 /**
  * DOC: dp helpers
  *
@@ -213,7 +215,7 @@ static int drm_dp_dpcd_access(struct drm_dp_aux *aux, u8 request,
                        return -EIO;
 
                case DP_AUX_NATIVE_REPLY_DEFER:
-                       usleep_range(400, 500);
+                       usleep_range(AUX_RETRY_INTERVAL, AUX_RETRY_INTERVAL + 100);
                        break;
                }
        }
@@ -422,6 +424,90 @@ static u32 drm_dp_i2c_functionality(struct i2c_adapter *adapter)
               I2C_FUNC_10BIT_ADDR;
 }
 
+#define AUX_PRECHARGE_LEN 10 /* 10 to 16 */
+#define AUX_SYNC_LEN (16 + 4) /* preamble + AUX_SYNC_END */
+#define AUX_STOP_LEN 4
+#define AUX_CMD_LEN 4
+#define AUX_ADDRESS_LEN 20
+#define AUX_REPLY_PAD_LEN 4
+#define AUX_LENGTH_LEN 8
+
+/*
+ * Calculate the duration of the AUX request/reply in usec. Gives the
+ * "best" case estimate, ie. successful while as short as possible.
+ */
+static int drm_dp_aux_req_duration(const struct drm_dp_aux_msg *msg)
+{
+       int len = AUX_PRECHARGE_LEN + AUX_SYNC_LEN + AUX_STOP_LEN +
+               AUX_CMD_LEN + AUX_ADDRESS_LEN + AUX_LENGTH_LEN;
+
+       if ((msg->request & DP_AUX_I2C_READ) == 0)
+               len += msg->size * 8;
+
+       return len;
+}
+
+static int drm_dp_aux_reply_duration(const struct drm_dp_aux_msg *msg)
+{
+       int len = AUX_PRECHARGE_LEN + AUX_SYNC_LEN + AUX_STOP_LEN +
+               AUX_CMD_LEN + AUX_REPLY_PAD_LEN;
+
+       /*
+        * For read we expect what was asked. For writes there will
+        * be 0 or 1 data bytes. Assume 0 for the "best" case.
+        */
+       if (msg->request & DP_AUX_I2C_READ)
+               len += msg->size * 8;
+
+       return len;
+}
+
+#define I2C_START_LEN 1
+#define I2C_STOP_LEN 1
+#define I2C_ADDR_LEN 9 /* ADDRESS + R/W + ACK/NACK */
+#define I2C_DATA_LEN 9 /* DATA + ACK/NACK */
+
+/*
+ * Calculate the length of the i2c transfer in usec, assuming
+ * the i2c bus speed is as specified. Gives the "worst"
+ * case estimate, ie. successful while as long as possible.
+ * Doesn't account for the "MOT" bit, and instead assumes each
+ * message includes a START, ADDRESS and STOP. Neither does it
+ * account for additional random variables such as clock stretching.
+ */
+static int drm_dp_i2c_msg_duration(const struct drm_dp_aux_msg *msg,
+                                  int i2c_speed_khz)
+{
+       /* AUX bitrate is 1MHz, i2c bitrate as specified */
+       return DIV_ROUND_UP((I2C_START_LEN + I2C_ADDR_LEN +
+                            msg->size * I2C_DATA_LEN +
+                            I2C_STOP_LEN) * 1000, i2c_speed_khz);
+}
+
+/*
+ * Determine how many retries should be attempted to successfully transfer
+ * the specified message, based on the estimated durations of the
+ * i2c and AUX transfers.
+ */
+static int drm_dp_i2c_retry_count(const struct drm_dp_aux_msg *msg,
+                             int i2c_speed_khz)
+{
+       int aux_time_us = drm_dp_aux_req_duration(msg) +
+               drm_dp_aux_reply_duration(msg);
+       int i2c_time_us = drm_dp_i2c_msg_duration(msg, i2c_speed_khz);
+
+       return DIV_ROUND_UP(i2c_time_us, aux_time_us + AUX_RETRY_INTERVAL);
+}
+
+/*
+ * FIXME currently assumes 10 kHz as some real world devices seem
+ * to require it. We should query/set the speed via DPCD if supported.
+ */
+static int dp_aux_i2c_speed_khz __read_mostly = 10;
+module_param_unsafe(dp_aux_i2c_speed_khz, int, 0644);
+MODULE_PARM_DESC(dp_aux_i2c_speed_khz,
+                "Assumed speed of the i2c bus in kHz, (1-400, default 10)");
+
 /*
  * Transfer a single I2C-over-AUX message and handle various error conditions,
  * retrying the transaction as appropriate.  It is assumed that the
@@ -434,13 +520,16 @@ static int drm_dp_i2c_do_msg(struct drm_dp_aux *aux, struct drm_dp_aux_msg *msg)
 {
        unsigned int retry, defer_i2c;
        int ret;
-
        /*
         * DP1.2 sections 2.7.7.1.5.6.1 and 2.7.7.1.6.6.1: A DP Source device
         * is required to retry at least seven times upon receiving AUX_DEFER
         * before giving up the AUX transaction.
+        *
+        * We also try to account for the i2c bus speed.
         */
-       for (retry = 0, defer_i2c = 0; retry < (7 + defer_i2c); retry++) {
+       int max_retries = max(7, drm_dp_i2c_retry_count(msg, dp_aux_i2c_speed_khz));
+
+       for (retry = 0, defer_i2c = 0; retry < (max_retries + defer_i2c); retry++) {
                mutex_lock(&aux->hw_mutex);
                ret = aux->transfer(aux, msg);
                mutex_unlock(&aux->hw_mutex);
@@ -476,7 +565,7 @@ static int drm_dp_i2c_do_msg(struct drm_dp_aux *aux, struct drm_dp_aux_msg *msg)
                         * For now just defer for long enough to hopefully be
                         * safe for all use-cases.
                         */
-                       usleep_range(500, 600);
+                       usleep_range(AUX_RETRY_INTERVAL, AUX_RETRY_INTERVAL + 100);
                        continue;
 
                default:
@@ -506,7 +595,7 @@ static int drm_dp_i2c_do_msg(struct drm_dp_aux *aux, struct drm_dp_aux_msg *msg)
                        aux->i2c_defer_count++;
                        if (defer_i2c < 7)
                                defer_i2c++;
-                       usleep_range(400, 500);
+                       usleep_range(AUX_RETRY_INTERVAL, AUX_RETRY_INTERVAL + 100);
                        continue;
 
                default:
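
To make the retry heuristic in this drm_dp_helper hunk concrete, here is a hedged, stand-alone user-space sketch (not kernel code) that recomputes the same estimate from the constants added above; DIV_ROUND_UP and the helper names are redefined locally for illustration. The driver then takes the larger of this estimate and the DP-spec minimum of seven retries.

/* Hedged sketch: recompute the drm_dp_helper retry estimate in user space. */
#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

#define AUX_RETRY_INTERVAL	500			/* us, as in the patch */
#define AUX_REQ_LEN	(10 + 20 + 4 + 4 + 20 + 8)	/* precharge+sync+stop+cmd+addr+len */
#define AUX_REPLY_LEN	(10 + 20 + 4 + 4 + 4)		/* precharge+sync+stop+cmd+reply pad */

static int retries_for_read(int bytes, int i2c_speed_khz)
{
	/* best-case AUX round trip: a read request carries no data,
	 * the reply carries 'bytes' data bytes at 8 bit-times each */
	int aux_us = AUX_REQ_LEN + AUX_REPLY_LEN + bytes * 8;
	/* worst-case i2c time: START + 9-bit address + 9 bits per byte + STOP */
	int i2c_us = DIV_ROUND_UP((1 + 9 + bytes * 9 + 1) * 1000, i2c_speed_khz);

	return DIV_ROUND_UP(i2c_us, aux_us + AUX_RETRY_INTERVAL);
}

int main(void)
{
	/* 1-byte read at the default 10 kHz -> 4, so the spec minimum of 7 wins;
	 * 16-byte read at 10 kHz -> 22, so the bus-speed estimate dominates. */
	printf("1 byte : %d retries\n", retries_for_read(1, 10));
	printf("16 byte: %d retries\n", retries_for_read(16, 10));
	return 0;
}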
index df0b61a60501c46ac6a58da5ae0ec993cbb7f9f4..bd1a4156f647b3b8cf3d26b97ee657c2b4b2bab6 100644 (file)
@@ -77,6 +77,7 @@ config DRM_EXYNOS_VIDI
 config DRM_EXYNOS_G2D
        bool "Exynos DRM G2D"
        depends on DRM_EXYNOS && !VIDEO_SAMSUNG_S5P_G2D
+       select FRAME_VECTOR
        help
          Choose this option if you want to use Exynos G2D for DRM.
 
index 535b4ad6c4b14783a6ce099f81f4775759bd5ee6..3734c34aed16a22938509454cf794e5b4069ac35 100644 (file)
@@ -194,10 +194,8 @@ struct g2d_cmdlist_userptr {
        dma_addr_t              dma_addr;
        unsigned long           userptr;
        unsigned long           size;
-       struct page             **pages;
-       unsigned int            npages;
+       struct frame_vector     *vec;
        struct sg_table         *sgt;
-       struct vm_area_struct   *vma;
        atomic_t                refcount;
        bool                    in_pool;
        bool                    out_of_list;
@@ -367,6 +365,7 @@ static void g2d_userptr_put_dma_addr(struct drm_device *drm_dev,
 {
        struct g2d_cmdlist_userptr *g2d_userptr =
                                        (struct g2d_cmdlist_userptr *)obj;
+       struct page **pages;
 
        if (!obj)
                return;
@@ -386,19 +385,21 @@ out:
        exynos_gem_unmap_sgt_from_dma(drm_dev, g2d_userptr->sgt,
                                        DMA_BIDIRECTIONAL);
 
-       exynos_gem_put_pages_to_userptr(g2d_userptr->pages,
-                                       g2d_userptr->npages,
-                                       g2d_userptr->vma);
+       pages = frame_vector_pages(g2d_userptr->vec);
+       if (!IS_ERR(pages)) {
+               int i;
 
-       exynos_gem_put_vma(g2d_userptr->vma);
+               for (i = 0; i < frame_vector_count(g2d_userptr->vec); i++)
+                       set_page_dirty_lock(pages[i]);
+       }
+       put_vaddr_frames(g2d_userptr->vec);
+       frame_vector_destroy(g2d_userptr->vec);
 
        if (!g2d_userptr->out_of_list)
                list_del_init(&g2d_userptr->list);
 
        sg_free_table(g2d_userptr->sgt);
        kfree(g2d_userptr->sgt);
-
-       drm_free_large(g2d_userptr->pages);
        kfree(g2d_userptr);
 }
 
@@ -412,9 +413,7 @@ static dma_addr_t *g2d_userptr_get_dma_addr(struct drm_device *drm_dev,
        struct exynos_drm_g2d_private *g2d_priv = file_priv->g2d_priv;
        struct g2d_cmdlist_userptr *g2d_userptr;
        struct g2d_data *g2d;
-       struct page **pages;
        struct sg_table *sgt;
-       struct vm_area_struct *vma;
        unsigned long start, end;
        unsigned int npages, offset;
        int ret;
@@ -460,65 +459,40 @@ static dma_addr_t *g2d_userptr_get_dma_addr(struct drm_device *drm_dev,
                return ERR_PTR(-ENOMEM);
 
        atomic_set(&g2d_userptr->refcount, 1);
+       g2d_userptr->size = size;
 
        start = userptr & PAGE_MASK;
        offset = userptr & ~PAGE_MASK;
        end = PAGE_ALIGN(userptr + size);
        npages = (end - start) >> PAGE_SHIFT;
-       g2d_userptr->npages = npages;
-
-       pages = drm_calloc_large(npages, sizeof(struct page *));
-       if (!pages) {
-               DRM_ERROR("failed to allocate pages.\n");
+       g2d_userptr->vec = frame_vector_create(npages);
+       if (!g2d_userptr->vec) {
                ret = -ENOMEM;
                goto err_free;
        }
 
-       down_read(&current->mm->mmap_sem);
-       vma = find_vma(current->mm, userptr);
-       if (!vma) {
-               up_read(&current->mm->mmap_sem);
-               DRM_ERROR("failed to get vm region.\n");
+       ret = get_vaddr_frames(start, npages, true, true, g2d_userptr->vec);
+       if (ret != npages) {
+               DRM_ERROR("failed to get user pages from userptr.\n");
+               if (ret < 0)
+                       goto err_destroy_framevec;
                ret = -EFAULT;
-               goto err_free_pages;
+               goto err_put_framevec;
        }
-
-       if (vma->vm_end < userptr + size) {
-               up_read(&current->mm->mmap_sem);
-               DRM_ERROR("vma is too small.\n");
+       if (frame_vector_to_pages(g2d_userptr->vec) < 0) {
                ret = -EFAULT;
-               goto err_free_pages;
-       }
-
-       g2d_userptr->vma = exynos_gem_get_vma(vma);
-       if (!g2d_userptr->vma) {
-               up_read(&current->mm->mmap_sem);
-               DRM_ERROR("failed to copy vma.\n");
-               ret = -ENOMEM;
-               goto err_free_pages;
-       }
-
-       g2d_userptr->size = size;
-
-       ret = exynos_gem_get_pages_from_userptr(start & PAGE_MASK,
-                                               npages, pages, vma);
-       if (ret < 0) {
-               up_read(&current->mm->mmap_sem);
-               DRM_ERROR("failed to get user pages from userptr.\n");
-               goto err_put_vma;
+               goto err_put_framevec;
        }
 
-       up_read(&current->mm->mmap_sem);
-       g2d_userptr->pages = pages;
-
        sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
        if (!sgt) {
                ret = -ENOMEM;
-               goto err_free_userptr;
+               goto err_put_framevec;
        }
 
-       ret = sg_alloc_table_from_pages(sgt, pages, npages, offset,
-                                       size, GFP_KERNEL);
+       ret = sg_alloc_table_from_pages(sgt,
+                                       frame_vector_pages(g2d_userptr->vec),
+                                       npages, offset, size, GFP_KERNEL);
        if (ret < 0) {
                DRM_ERROR("failed to get sgt from pages.\n");
                goto err_free_sgt;
@@ -553,16 +527,11 @@ err_sg_free_table:
 err_free_sgt:
        kfree(sgt);
 
-err_free_userptr:
-       exynos_gem_put_pages_to_userptr(g2d_userptr->pages,
-                                       g2d_userptr->npages,
-                                       g2d_userptr->vma);
-
-err_put_vma:
-       exynos_gem_put_vma(g2d_userptr->vma);
+err_put_framevec:
+       put_vaddr_frames(g2d_userptr->vec);
 
-err_free_pages:
-       drm_free_large(pages);
+err_destroy_framevec:
+       frame_vector_destroy(g2d_userptr->vec);
 
 err_free:
        kfree(g2d_userptr);
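
The g2d conversion above replaces open-coded VMA copying and get_user_pages() with the frame_vector helpers. For reference, a hedged minimal sketch of the pin/dirty/unpin sequence the new code follows; this is not the driver code itself, error handling is trimmed, and uaddr/npages are hypothetical inputs.

/* Hedged sketch of the frame_vector sequence used by the patch above. */
#include <linux/err.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

static struct frame_vector *pin_user_buffer(unsigned long uaddr, unsigned int npages)
{
	struct frame_vector *vec;
	int ret;

	vec = frame_vector_create(npages);
	if (!vec)
		return ERR_PTR(-ENOMEM);

	ret = get_vaddr_frames(uaddr & PAGE_MASK, npages, true, true, vec);
	if (ret != npages || frame_vector_to_pages(vec) < 0) {
		if (ret > 0)
			put_vaddr_frames(vec);
		frame_vector_destroy(vec);
		return ERR_PTR(ret < 0 ? ret : -EFAULT);
	}
	return vec;	/* frame_vector_pages(vec) now yields struct page ** */
}

static void unpin_user_buffer(struct frame_vector *vec)
{
	struct page **pages = frame_vector_pages(vec);
	int i;

	/* mark pages dirty before dropping the references, as the patch does */
	if (!IS_ERR(pages))
		for (i = 0; i < frame_vector_count(vec); i++)
			set_page_dirty_lock(pages[i]);

	put_vaddr_frames(vec);
	frame_vector_destroy(vec);
}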
index 62b9ea1b07fb005c04da732490c06e91a2996e21..f12fbc36b120065902c50253a4e91e9cc8952df5 100644 (file)
@@ -366,103 +366,6 @@ int exynos_drm_gem_get_ioctl(struct drm_device *dev, void *data,
        return 0;
 }
 
-struct vm_area_struct *exynos_gem_get_vma(struct vm_area_struct *vma)
-{
-       struct vm_area_struct *vma_copy;
-
-       vma_copy = kmalloc(sizeof(*vma_copy), GFP_KERNEL);
-       if (!vma_copy)
-               return NULL;
-
-       if (vma->vm_ops && vma->vm_ops->open)
-               vma->vm_ops->open(vma);
-
-       if (vma->vm_file)
-               get_file(vma->vm_file);
-
-       memcpy(vma_copy, vma, sizeof(*vma));
-
-       vma_copy->vm_mm = NULL;
-       vma_copy->vm_next = NULL;
-       vma_copy->vm_prev = NULL;
-
-       return vma_copy;
-}
-
-void exynos_gem_put_vma(struct vm_area_struct *vma)
-{
-       if (!vma)
-               return;
-
-       if (vma->vm_ops && vma->vm_ops->close)
-               vma->vm_ops->close(vma);
-
-       if (vma->vm_file)
-               fput(vma->vm_file);
-
-       kfree(vma);
-}
-
-int exynos_gem_get_pages_from_userptr(unsigned long start,
-                                               unsigned int npages,
-                                               struct page **pages,
-                                               struct vm_area_struct *vma)
-{
-       int get_npages;
-
-       /* the memory region mmaped with VM_PFNMAP. */
-       if (vma_is_io(vma)) {
-               unsigned int i;
-
-               for (i = 0; i < npages; ++i, start += PAGE_SIZE) {
-                       unsigned long pfn;
-                       int ret = follow_pfn(vma, start, &pfn);
-                       if (ret)
-                               return ret;
-
-                       pages[i] = pfn_to_page(pfn);
-               }
-
-               if (i != npages) {
-                       DRM_ERROR("failed to get user_pages.\n");
-                       return -EINVAL;
-               }
-
-               return 0;
-       }
-
-       get_npages = get_user_pages(current, current->mm, start,
-                                       npages, 1, 1, pages, NULL);
-       get_npages = max(get_npages, 0);
-       if (get_npages != npages) {
-               DRM_ERROR("failed to get user_pages.\n");
-               while (get_npages)
-                       put_page(pages[--get_npages]);
-               return -EFAULT;
-       }
-
-       return 0;
-}
-
-void exynos_gem_put_pages_to_userptr(struct page **pages,
-                                       unsigned int npages,
-                                       struct vm_area_struct *vma)
-{
-       if (!vma_is_io(vma)) {
-               unsigned int i;
-
-               for (i = 0; i < npages; i++) {
-                       set_page_dirty_lock(pages[i]);
-
-                       /*
-                        * undo the reference we took when populating
-                        * the table.
-                        */
-                       put_page(pages[i]);
-               }
-       }
-}
-
 int exynos_gem_map_sgt_with_dma(struct drm_device *drm_dev,
                                struct sg_table *sgt,
                                enum dma_data_direction dir)
index 81adf89b92f1536a5f135eeb349ee75eadc178bf..e1db8de52851b979c31607abf7b6995bd1febe0b 100644 (file)
@@ -1929,6 +1929,8 @@ struct drm_i915_private {
                        struct skl_wm_values skl_hw;
                        struct vlv_wm_values vlv;
                };
+
+               uint8_t max_level;
        } wm;
 
        struct i915_runtime_pm pm;
@@ -3384,13 +3386,13 @@ int intel_freq_opcode(struct drm_i915_private *dev_priv, int val);
 #define I915_READ64(reg)       dev_priv->uncore.funcs.mmio_readq(dev_priv, (reg), true)
 
 #define I915_READ64_2x32(lower_reg, upper_reg) ({                      \
-       u32 upper, lower, tmp;                                          \
-       tmp = I915_READ(upper_reg);                                     \
+       u32 upper, lower, old_upper, loop = 0;                          \
+       upper = I915_READ(upper_reg);                                   \
        do {                                                            \
-               upper = tmp;                                            \
+               old_upper = upper;                                      \
                lower = I915_READ(lower_reg);                           \
-               tmp = I915_READ(upper_reg);                             \
-       } while (upper != tmp);                                         \
+               upper = I915_READ(upper_reg);                           \
+       } while (upper != old_upper && loop++ < 2);                     \
        (u64)upper << 32 | lower; })
 
 #define POSTING_READ(reg)      (void)I915_READ_NOTRACE(reg)
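
The I915_READ64_2x32 change above bounds the upper/lower/upper re-read loop. The general pattern, shown here as a hedged self-contained sketch with a simulated counter instead of real MMIO, is to re-sample the high word until it is stable (giving up after a couple of passes) so a carry between the two 32-bit halves is never observed as a torn value.

/* Hedged sketch: tear-free read of a 64-bit counter split into two 32-bit halves. */
#include <stdint.h>
#include <stdio.h>

static uint64_t fake_counter;	/* hypothetical hardware counter */

static uint32_t read_reg32(int high)
{
	fake_counter++;		/* the counter keeps ticking between reads */
	return high ? (uint32_t)(fake_counter >> 32) : (uint32_t)fake_counter;
}

static uint64_t read_counter64(void)
{
	uint32_t hi, lo, old_hi;
	int loops = 0;

	hi = read_reg32(1);
	do {
		old_hi = hi;
		lo = read_reg32(0);
		hi = read_reg32(1);
		/* if the high word moved, a carry slipped in; retry, bounded like the patch */
	} while (hi != old_hi && loops++ < 2);

	return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
	fake_counter = 0xfffffffeULL;	/* about to carry into the high word */
	printf("%llu\n", (unsigned long long)read_counter64());
	return 0;
}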
index 923a3c4bf0b79c71b8a05c417211ae9507f919aa..a953d4975b8c08d237bac9aa4a7ecd3eaf29c28e 100644 (file)
@@ -1032,6 +1032,7 @@ i915_gem_execbuffer_move_to_active(struct list_head *vmas,
                u32 old_read = obj->base.read_domains;
                u32 old_write = obj->base.write_domain;
 
+               obj->dirty = 1; /* be paranoid  */
                obj->base.write_domain = obj->base.pending_write_domain;
                if (obj->base.write_domain == 0)
                        obj->base.pending_read_domains |= obj->base.read_domains;
@@ -1039,7 +1040,6 @@ i915_gem_execbuffer_move_to_active(struct list_head *vmas,
 
                i915_vma_move_to_active(vma, req);
                if (obj->base.write_domain) {
-                       obj->dirty = 1;
                        i915_gem_request_assign(&obj->last_write_req, req);
 
                        intel_fb_obj_invalidate(obj, ORIGIN_CS);
index b5fb1430c1d7d4b3ee6bda5fa4d115d58d502566..5a244ab9395ba2c9f61278a1293850d5e75bc9c4 100644 (file)
@@ -1558,7 +1558,7 @@ static void i9xx_hpd_irq_handler(struct drm_device *dev)
                u32 hotplug_trigger = hotplug_status & HOTPLUG_INT_STATUS_I915;
 
                intel_get_hpd_pins(&pin_mask, &long_mask, hotplug_trigger,
-                                  hotplug_trigger, hpd_status_g4x,
+                                  hotplug_trigger, hpd_status_i915,
                                   i9xx_port_hotplug_long_detect);
                intel_hpd_irq_handler(dev, pin_mask, long_mask);
        }
index ba1ae031e6fd47ff7873fe739a4efd5e5c5fddac..d0f1b8d833cd2d890e0328df50cf5839cd3fba2f 100644 (file)
@@ -350,7 +350,7 @@ static void finish_csr_load(const struct firmware *fw, void *context)
        }
        csr->mmio_count = dmc_header->mmio_count;
        for (i = 0; i < dmc_header->mmio_count; i++) {
-               if (dmc_header->mmioaddr[i] < CSR_MMIO_START_RANGE &&
+               if (dmc_header->mmioaddr[i] < CSR_MMIO_START_RANGE ||
                        dmc_header->mmioaddr[i] > CSR_MMIO_END_RANGE) {
                        DRM_ERROR(" Firmware has wrong mmio address 0x%x\n",
                                                dmc_header->mmioaddr[i]);
index ca9278be49f7d5f9f259d44ad6566c9a7c18d1ae..8cc9264f78094c8cfcb101c16d61caf40729323e 100644 (file)
@@ -6305,7 +6305,7 @@ static void intel_connector_check_state(struct intel_connector *connector)
                      connector->base.name);
 
        if (connector->get_hw_state(connector)) {
-               struct drm_encoder *encoder = &connector->encoder->base;
+               struct intel_encoder *encoder = connector->encoder;
                struct drm_connector_state *conn_state = connector->base.state;
 
                I915_STATE_WARN(!crtc,
@@ -6317,13 +6317,13 @@ static void intel_connector_check_state(struct intel_connector *connector)
                I915_STATE_WARN(!crtc->state->active,
                      "connector is active, but attached crtc isn't\n");
 
-               if (!encoder)
+               if (!encoder || encoder->type == INTEL_OUTPUT_DP_MST)
                        return;
 
-               I915_STATE_WARN(conn_state->best_encoder != encoder,
+               I915_STATE_WARN(conn_state->best_encoder != &encoder->base,
                        "atomic encoder doesn't match attached encoder\n");
 
-               I915_STATE_WARN(conn_state->crtc != encoder->crtc,
+               I915_STATE_WARN(conn_state->crtc != encoder->base.crtc,
                        "attached encoder crtc differs from connector crtc\n");
        } else {
                I915_STATE_WARN(crtc && crtc->state->active,
index 983553cf8b74e472806bf8ab7b1a55dcbc9d0d73..3e4be5a3becdddf9fd2a23e6be26f02da90a28f2 100644 (file)
@@ -173,6 +173,11 @@ static void intel_mst_pre_enable_dp(struct intel_encoder *encoder)
                return;
        }
 
+       /* MST encoders are bound to a crtc, not to a connector,
+        * force the mapping here for get_hw_state.
+        */
+       found->encoder = encoder;
+
        DRM_DEBUG_KMS("%d\n", intel_dp->active_mst_links);
        intel_mst->port = found->port;
 
@@ -400,7 +405,7 @@ static const struct drm_encoder_funcs intel_dp_mst_enc_funcs = {
 
 static bool intel_dp_mst_get_hw_state(struct intel_connector *connector)
 {
-       if (connector->encoder) {
+       if (connector->encoder && connector->base.state->crtc) {
                enum pipe pipe;
                if (!connector->encoder->get_hw_state(connector->encoder, &pipe))
                        return false;
index 4a601cf90f16c68d694babd065bbae09cbe6e9f7..32a6c7184ca4fcbcc73786e678f634a22453224d 100644 (file)
@@ -1048,11 +1048,7 @@ void intel_dsi_init(struct drm_device *dev)
        intel_connector->unregister = intel_connector_unregister;
 
        /* Pipe A maps to MIPI DSI port A, pipe B maps to MIPI DSI port C */
-       if (dev_priv->vbt.dsi.config->dual_link) {
-               /* XXX: does dual link work on either pipe? */
-               intel_encoder->crtc_mask = (1 << PIPE_A);
-               intel_dsi->ports = ((1 << PORT_A) | (1 << PORT_C));
-       } else if (dev_priv->vbt.dsi.port == DVO_PORT_MIPIA) {
+       if (dev_priv->vbt.dsi.port == DVO_PORT_MIPIA) {
                intel_encoder->crtc_mask = (1 << PIPE_A);
                intel_dsi->ports = (1 << PORT_A);
        } else if (dev_priv->vbt.dsi.port == DVO_PORT_MIPIC) {
@@ -1060,6 +1056,9 @@ void intel_dsi_init(struct drm_device *dev)
                intel_dsi->ports = (1 << PORT_C);
        }
 
+       if (dev_priv->vbt.dsi.config->dual_link)
+               intel_dsi->ports = ((1 << PORT_A) | (1 << PORT_C));
+
        /* Create a DSI host (and a device) for each port. */
        for_each_dsi_port(port, intel_dsi->ports) {
                struct intel_dsi_host *host;
index fff0c22682ee32f947907da7bb27f4fda0463073..ddbb7ed0a193229355700926006578ca5f06b937 100644 (file)
@@ -955,8 +955,6 @@ enum vlv_wm_level {
        VLV_WM_LEVEL_PM2,
        VLV_WM_LEVEL_PM5,
        VLV_WM_LEVEL_DDR_DVFS,
-       CHV_WM_NUM_LEVELS,
-       VLV_WM_NUM_LEVELS = 1,
 };
 
 /* latency must be in 0.1us units. */
@@ -982,9 +980,13 @@ static void vlv_setup_wm_latency(struct drm_device *dev)
        /* all latencies in usec */
        dev_priv->wm.pri_latency[VLV_WM_LEVEL_PM2] = 3;
 
+       dev_priv->wm.max_level = VLV_WM_LEVEL_PM2;
+
        if (IS_CHERRYVIEW(dev_priv)) {
                dev_priv->wm.pri_latency[VLV_WM_LEVEL_PM5] = 12;
                dev_priv->wm.pri_latency[VLV_WM_LEVEL_DDR_DVFS] = 33;
+
+               dev_priv->wm.max_level = VLV_WM_LEVEL_DDR_DVFS;
        }
 }
 
@@ -1137,10 +1139,7 @@ static void vlv_compute_wm(struct intel_crtc *crtc)
        memset(wm_state, 0, sizeof(*wm_state));
 
        wm_state->cxsr = crtc->pipe != PIPE_C && crtc->wm.cxsr_allowed;
-       if (IS_CHERRYVIEW(dev))
-               wm_state->num_levels = CHV_WM_NUM_LEVELS;
-       else
-               wm_state->num_levels = VLV_WM_NUM_LEVELS;
+       wm_state->num_levels = to_i915(dev)->wm.max_level + 1;
 
        wm_state->num_active_planes = 0;
 
@@ -1220,7 +1219,7 @@ static void vlv_compute_wm(struct intel_crtc *crtc)
        }
 
        /* clear any (partially) filled invalid levels */
-       for (level = wm_state->num_levels; level < CHV_WM_NUM_LEVELS; level++) {
+       for (level = wm_state->num_levels; level < to_i915(dev)->wm.max_level + 1; level++) {
                memset(&wm_state->wm[level], 0, sizeof(wm_state->wm[level]));
                memset(&wm_state->sr[level], 0, sizeof(wm_state->sr[level]));
        }
@@ -1324,10 +1323,7 @@ static void vlv_merge_wm(struct drm_device *dev,
        struct intel_crtc *crtc;
        int num_active_crtcs = 0;
 
-       if (IS_CHERRYVIEW(dev))
-               wm->level = VLV_WM_LEVEL_DDR_DVFS;
-       else
-               wm->level = VLV_WM_LEVEL_PM2;
+       wm->level = to_i915(dev)->wm.max_level;
        wm->cxsr = true;
 
        for_each_intel_crtc(dev, crtc) {
@@ -4083,9 +4079,29 @@ void vlv_wm_get_hw_state(struct drm_device *dev)
                if (val & DSP_MAXFIFO_PM5_ENABLE)
                        wm->level = VLV_WM_LEVEL_PM5;
 
+               /*
+                * If DDR DVFS is disabled in the BIOS, Punit
+                * will never ack the request. So if that happens
+                * assume we don't have to enable/disable DDR DVFS
+                * dynamically. To test that just set the REQ_ACK
+                * bit to poke the Punit, but don't change the
+                * HIGH/LOW bits so that we don't actually change
+                * the current state.
+                */
                val = vlv_punit_read(dev_priv, PUNIT_REG_DDR_SETUP2);
-               if ((val & FORCE_DDR_HIGH_FREQ) == 0)
-                       wm->level = VLV_WM_LEVEL_DDR_DVFS;
+               val |= FORCE_DDR_FREQ_REQ_ACK;
+               vlv_punit_write(dev_priv, PUNIT_REG_DDR_SETUP2, val);
+
+               if (wait_for((vlv_punit_read(dev_priv, PUNIT_REG_DDR_SETUP2) &
+                             FORCE_DDR_FREQ_REQ_ACK) == 0, 3)) {
+                       DRM_DEBUG_KMS("Punit not acking DDR DVFS request, "
+                                     "assuming DDR DVFS is disabled\n");
+                       dev_priv->wm.max_level = VLV_WM_LEVEL_PM5;
+               } else {
+                       val = vlv_punit_read(dev_priv, PUNIT_REG_DDR_SETUP2);
+                       if ((val & FORCE_DDR_HIGH_FREQ) == 0)
+                               wm->level = VLV_WM_LEVEL_DDR_DVFS;
+               }
 
                mutex_unlock(&dev_priv->rps.hw_lock);
        }
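
The Punit comment in the hunk above describes a probe technique: request an acknowledgement without changing the requested state, then treat a missing ack within the timeout as "feature disabled". A hedged generic sketch of that request/ack pattern follows; the fake_pmu structure, register names and helpers are illustrative stand-ins, not the i915 or Punit interfaces.

/* Hedged sketch: poke a request-ack bit and poll for it to clear. */
#include <stdbool.h>
#include <stdio.h>

#define REQ_ACK_BIT	(1u << 7)

struct fake_pmu {
	unsigned int setup2;
	bool dvfs_enabled;	/* firmware acks requests only when enabled */
};

static unsigned int pmu_read(struct fake_pmu *pmu)
{
	if (pmu->dvfs_enabled)		/* firmware clears the ack bit when enabled */
		pmu->setup2 &= ~REQ_ACK_BIT;
	return pmu->setup2;
}

static void pmu_write(struct fake_pmu *pmu, unsigned int val)
{
	pmu->setup2 = val;
}

static bool dvfs_available(struct fake_pmu *pmu)
{
	int retries = 3;	/* stands in for the bounded wait_for() in the patch */

	/* set only the ack-request bit; leave the HIGH/LOW request bits alone */
	pmu_write(pmu, pmu_read(pmu) | REQ_ACK_BIT);
	while (retries--)
		if (!(pmu_read(pmu) & REQ_ACK_BIT))
			return true;
	return false;	/* no ack: assume the feature is disabled in firmware */
}

int main(void)
{
	struct fake_pmu on  = { .setup2 = 0, .dvfs_enabled = true };
	struct fake_pmu off = { .setup2 = 0, .dvfs_enabled = false };

	printf("enabled firmware : %s\n", dvfs_available(&on)  ? "acked" : "no ack");
	printf("disabled firmware: %s\n", dvfs_available(&off) ? "acked" : "no ack");
	return 0;
}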
index 9dd1cac81e808e456caa3b5fb31fa6a2bf70a8d4..e8eb14e438f4d9fdea84ba444961c1cf453c7aff 100644 (file)
@@ -689,6 +689,7 @@ nvkm_device_pci_10de_11e3[] = {
 
 static const struct nvkm_device_pci_vendor
 nvkm_device_pci_10de_11fc[] = {
+       { 0x1179, 0x0001, NULL, { .War00C800_0 = true } }, /* Toshiba Tecra W50 */
        { 0x17aa, 0x2211, NULL, { .War00C800_0 = true } }, /* Lenovo W541 */
        { 0x17aa, 0x221e, NULL, { .War00C800_0 = true } }, /* Lenovo W541 */
        {}
index 426ba0025a8d049c57ffa25e29f9a4458ffde891..85c5b7fea5f5e177c347632aae8fb978f18d16c5 100644 (file)
@@ -1048,11 +1048,11 @@ nv04_gr_object_bind(struct nvkm_object *object, struct nvkm_gpuobj *parent,
        if (ret == 0) {
                nvkm_kmap(*pgpuobj);
                nvkm_wo32(*pgpuobj, 0x00, object->oclass);
-               nvkm_wo32(*pgpuobj, 0x04, 0x00000000);
-               nvkm_wo32(*pgpuobj, 0x08, 0x00000000);
 #ifdef __BIG_ENDIAN
-               nvkm_mo32(*pgpuobj, 0x08, 0x00080000, 0x00080000);
+               nvkm_mo32(*pgpuobj, 0x00, 0x00080000, 0x00080000);
 #endif
+               nvkm_wo32(*pgpuobj, 0x04, 0x00000000);
+               nvkm_wo32(*pgpuobj, 0x08, 0x00000000);
                nvkm_wo32(*pgpuobj, 0x0c, 0x00000000);
                nvkm_done(*pgpuobj);
        }
index 07feae620c8d9f301e103a030d5ffb23c8e2120a..c233e3f653ce23168a579c36e07d153182333021 100644 (file)
@@ -326,7 +326,7 @@ gt215_clk_pre(struct nvkm_clk *clk, unsigned long *flags)
                return -EIO;
 
        if (nvkm_msec(device, 2000,
-               u32 tmp = nvkm_rd32(device, 0x002504) & 0x0000003f;
+               u32 tmp = nvkm_rd32(device, 0x00251c) & 0x0000003f;
                if (tmp == 0x0000003f)
                        break;
        ) < 0)
index a8dbb3ef4e3c9602aa7734e3d5984b46ba607fb6..7c6225c84ba6745919377fa9bfbd3506a7d8f7e5 100644 (file)
@@ -160,9 +160,35 @@ static int qxl_add_monitors_config_modes(struct drm_connector *connector,
        *pwidth = head->width;
        *pheight = head->height;
        drm_mode_probed_add(connector, mode);
+       /* remember the last custom size for mode validation */
+       qdev->monitors_config_width = mode->hdisplay;
+       qdev->monitors_config_height = mode->vdisplay;
        return 1;
 }
 
+static struct mode_size {
+       int w;
+       int h;
+} common_modes[] = {
+       { 640,  480},
+       { 720,  480},
+       { 800,  600},
+       { 848,  480},
+       {1024,  768},
+       {1152,  768},
+       {1280,  720},
+       {1280,  800},
+       {1280,  854},
+       {1280,  960},
+       {1280, 1024},
+       {1440,  900},
+       {1400, 1050},
+       {1680, 1050},
+       {1600, 1200},
+       {1920, 1080},
+       {1920, 1200}
+};
+
 static int qxl_add_common_modes(struct drm_connector *connector,
                                 unsigned pwidth,
                                 unsigned pheight)
@@ -170,29 +196,6 @@ static int qxl_add_common_modes(struct drm_connector *connector,
        struct drm_device *dev = connector->dev;
        struct drm_display_mode *mode = NULL;
        int i;
-       struct mode_size {
-               int w;
-               int h;
-       } common_modes[] = {
-               { 640,  480},
-               { 720,  480},
-               { 800,  600},
-               { 848,  480},
-               {1024,  768},
-               {1152,  768},
-               {1280,  720},
-               {1280,  800},
-               {1280,  854},
-               {1280,  960},
-               {1280, 1024},
-               {1440,  900},
-               {1400, 1050},
-               {1680, 1050},
-               {1600, 1200},
-               {1920, 1080},
-               {1920, 1200}
-       };
-
        for (i = 0; i < ARRAY_SIZE(common_modes); i++) {
                mode = drm_cvt_mode(dev, common_modes[i].w, common_modes[i].h,
                                    60, false, false, false);
@@ -823,11 +826,22 @@ static int qxl_conn_get_modes(struct drm_connector *connector)
 static int qxl_conn_mode_valid(struct drm_connector *connector,
                               struct drm_display_mode *mode)
 {
+       struct drm_device *ddev = connector->dev;
+       struct qxl_device *qdev = ddev->dev_private;
+       int i;
+
        /* TODO: is this called for user defined modes? (xrandr --add-mode)
         * TODO: check that the mode fits in the framebuffer */
-       DRM_DEBUG("%s: %dx%d status=%d\n", mode->name, mode->hdisplay,
-                 mode->vdisplay, mode->status);
-       return MODE_OK;
+
+       if (qdev->monitors_config_width == mode->hdisplay &&
+          qdev->monitors_config_height == mode->vdisplay)
+               return MODE_OK;
+
+       for (i = 0; i < ARRAY_SIZE(common_modes); i++) {
+               if (common_modes[i].w == mode->hdisplay && common_modes[i].h == mode->vdisplay)
+                       return MODE_OK;
+       }
+       return MODE_BAD;
 }
 
 static struct drm_encoder *qxl_best_encoder(struct drm_connector *connector)
index d8549690801d20fbc2dfabfed52006fc3752bdf8..01a86948eb8cd6007a1db70785b714df9bc9cbb1 100644 (file)
@@ -325,6 +325,8 @@ struct qxl_device {
        struct work_struct fb_work;
 
        struct drm_property *hotplug_mode_update_property;
+       int monitors_config_width;
+       int monitors_config_height;
 };
 
 /* forward declaration for QXL_INFO_IO */
index 6394547cf67a0b3de9662105bc9e4b604d2f77c6..860062ef88144e6fe4351b386328954be2293201 100644 (file)
@@ -125,7 +125,7 @@ static int vgem_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        }
 }
 
-static struct vm_operations_struct vgem_gem_vm_ops = {
+static const struct vm_operations_struct vgem_gem_vm_ops = {
        .fault = vgem_gem_fault,
        .open = drm_gem_vm_open,
        .close = drm_gem_vm_close,
index d04643f9548bbca84edee48659cfe7fac2600bfa..95638df73d1c328c88a57c077fccf271731b5a4e 100644 (file)
@@ -1110,7 +1110,7 @@ static int cs_char_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        return 0;
 }
 
-static struct vm_operations_struct cs_char_vm_ops = {
+static const struct vm_operations_struct cs_char_vm_ops = {
        .fault  = cs_char_vma_fault,
 };
 
index 500b262b89bb15ee9859caa097fdb2b2f6977849..e13c902e8966977581f1959155b3f83774e94aa5 100644 (file)
@@ -1140,8 +1140,8 @@ config SENSORS_NCT6775
        help
          If you say yes here you get support for the hardware monitoring
          functionality of the Nuvoton NCT6106D, NCT6775F, NCT6776F, NCT6779D,
-         NCT6791D, NCT6792D and compatible Super-I/O chips. This driver
-         replaces the w83627ehf driver for NCT6775F and NCT6776F.
+         NCT6791D, NCT6792D, NCT6793D, and compatible Super-I/O chips. This
+         driver replaces the w83627ehf driver for NCT6775F and NCT6776F.
 
          This driver can also be built as a module.  If so, the module
          will be called nct6775.
index fe41d5ae7cb2c02e8121d80ebab5bdf7a3533cab..e4e57bbafb10eeb621ad03ff03e8985030c43e4a 100644 (file)
@@ -104,7 +104,7 @@ static inline long lm75_reg_to_mc(s16 temp, u8 resolution)
 
 /* sysfs attributes for hwmon */
 
-static int lm75_read_temp(void *dev, long *temp)
+static int lm75_read_temp(void *dev, int *temp)
 {
        struct lm75_data *data = lm75_update_device(dev);
 
index bd1c99deac71b73dadf15615c1e8442027bccee9..8b4fa55e46c6afceb3895515bc0df0cb3b8a4b85 100644 (file)
@@ -39,6 +39,7 @@
  * nct6779d    15      5       5       2+6    0xc560 0xc1    0x5ca3
  * nct6791d    15      6       6       2+6    0xc800 0xc1    0x5ca3
  * nct6792d    15      6       6       2+6    0xc910 0xc1    0x5ca3
+ * nct6793d    15      6       6       2+6    0xd120 0xc1    0x5ca3
  *
  * #temp lists the number of monitored temperature sources (first value) plus
  * the number of directly connectable temperature sensors (second value).
@@ -63,7 +64,7 @@
 
 #define USE_ALTERNATE
 
-enum kinds { nct6106, nct6775, nct6776, nct6779, nct6791, nct6792 };
+enum kinds { nct6106, nct6775, nct6776, nct6779, nct6791, nct6792, nct6793 };
 
 /* used to set data->name = nct6775_device_names[data->sio_kind] */
 static const char * const nct6775_device_names[] = {
@@ -73,6 +74,17 @@ static const char * const nct6775_device_names[] = {
        "nct6779",
        "nct6791",
        "nct6792",
+       "nct6793",
+};
+
+static const char * const nct6775_sio_names[] __initconst = {
+       "NCT6106D",
+       "NCT6775F",
+       "NCT6776D/F",
+       "NCT6779D",
+       "NCT6791D",
+       "NCT6792D",
+       "NCT6793D",
 };
 
 static unsigned short force_id;
@@ -104,6 +116,7 @@ MODULE_PARM_DESC(fan_debounce, "Enable debouncing for fan RPM signal");
 #define SIO_NCT6779_ID         0xc560
 #define SIO_NCT6791_ID         0xc800
 #define SIO_NCT6792_ID         0xc910
+#define SIO_NCT6793_ID         0xd120
 #define SIO_ID_MASK            0xFFF0
 
 enum pwm_enable { off, manual, thermal_cruise, speed_cruise, sf3, sf4 };
@@ -354,6 +367,10 @@ static const u16 NCT6775_REG_TEMP_CRIT[ARRAY_SIZE(nct6775_temp_label) - 1]
 
 /* NCT6776 specific data */
 
+/* STEP_UP_TIME and STEP_DOWN_TIME regs are swapped for all chips but NCT6775 */
+#define NCT6776_REG_FAN_STEP_UP_TIME NCT6775_REG_FAN_STEP_DOWN_TIME
+#define NCT6776_REG_FAN_STEP_DOWN_TIME NCT6775_REG_FAN_STEP_UP_TIME
+
 static const s8 NCT6776_ALARM_BITS[] = {
        0, 1, 2, 3, 8, 21, 20, 16,      /* in0.. in7 */
        17, -1, -1, -1, -1, -1, -1,     /* in8..in14 */
@@ -533,7 +550,7 @@ static const s8 NCT6791_ALARM_BITS[] = {
        4, 5, 13, -1, -1, -1,           /* temp1..temp6 */
        12, 9 };                        /* intrusion0, intrusion1 */
 
-/* NCT6792 specific data */
+/* NCT6792/NCT6793 specific data */
 
 static const u16 NCT6792_REG_TEMP_MON[] = {
        0x73, 0x75, 0x77, 0x79, 0x7b, 0x7d };
@@ -1056,6 +1073,7 @@ static bool is_word_sized(struct nct6775_data *data, u16 reg)
        case nct6779:
        case nct6791:
        case nct6792:
+       case nct6793:
                return reg == 0x150 || reg == 0x153 || reg == 0x155 ||
                  ((reg & 0xfff0) == 0x4b0 && (reg & 0x000f) < 0x0b) ||
                  reg == 0x402 ||
@@ -1407,6 +1425,7 @@ static void nct6775_update_pwm_limits(struct device *dev)
                case nct6779:
                case nct6791:
                case nct6792:
+               case nct6793:
                        reg = nct6775_read_value(data,
                                        data->REG_CRITICAL_PWM_ENABLE[i]);
                        if (reg & data->CRITICAL_PWM_ENABLE_MASK)
@@ -2822,6 +2841,7 @@ store_auto_pwm(struct device *dev, struct device_attribute *attr,
                case nct6779:
                case nct6791:
                case nct6792:
+               case nct6793:
                        nct6775_write_value(data, data->REG_CRITICAL_PWM[nr],
                                            val);
                        reg = nct6775_read_value(data,
@@ -3256,7 +3276,7 @@ nct6775_check_fan_inputs(struct nct6775_data *data)
                pwm4pin = false;
                pwm5pin = false;
                pwm6pin = false;
-       } else {        /* NCT6779D, NCT6791D, or NCT6792D */
+       } else {        /* NCT6779D, NCT6791D, NCT6792D, or NCT6793D */
                regval = superio_inb(sioreg, 0x1c);
 
                fan3pin = !(regval & (1 << 5));
@@ -3269,7 +3289,8 @@ nct6775_check_fan_inputs(struct nct6775_data *data)
 
                fan4min = fan4pin;
 
-               if (data->kind == nct6791 || data->kind == nct6792) {
+               if (data->kind == nct6791 || data->kind == nct6792 ||
+                   data->kind == nct6793) {
                        regval = superio_inb(sioreg, 0x2d);
                        fan6pin = (regval & (1 << 1));
                        pwm6pin = (regval & (1 << 0));
@@ -3528,8 +3549,8 @@ static int nct6775_probe(struct platform_device *pdev)
                data->REG_FAN_PULSES = NCT6776_REG_FAN_PULSES;
                data->FAN_PULSE_SHIFT = NCT6775_FAN_PULSE_SHIFT;
                data->REG_FAN_TIME[0] = NCT6775_REG_FAN_STOP_TIME;
-               data->REG_FAN_TIME[1] = NCT6775_REG_FAN_STEP_UP_TIME;
-               data->REG_FAN_TIME[2] = NCT6775_REG_FAN_STEP_DOWN_TIME;
+               data->REG_FAN_TIME[1] = NCT6776_REG_FAN_STEP_UP_TIME;
+               data->REG_FAN_TIME[2] = NCT6776_REG_FAN_STEP_DOWN_TIME;
                data->REG_TOLERANCE_H = NCT6776_REG_TOLERANCE_H;
                data->REG_PWM[0] = NCT6775_REG_PWM;
                data->REG_PWM[1] = NCT6775_REG_FAN_START_OUTPUT;
@@ -3600,8 +3621,8 @@ static int nct6775_probe(struct platform_device *pdev)
                data->REG_FAN_PULSES = NCT6779_REG_FAN_PULSES;
                data->FAN_PULSE_SHIFT = NCT6775_FAN_PULSE_SHIFT;
                data->REG_FAN_TIME[0] = NCT6775_REG_FAN_STOP_TIME;
-               data->REG_FAN_TIME[1] = NCT6775_REG_FAN_STEP_UP_TIME;
-               data->REG_FAN_TIME[2] = NCT6775_REG_FAN_STEP_DOWN_TIME;
+               data->REG_FAN_TIME[1] = NCT6776_REG_FAN_STEP_UP_TIME;
+               data->REG_FAN_TIME[2] = NCT6776_REG_FAN_STEP_DOWN_TIME;
                data->REG_TOLERANCE_H = NCT6776_REG_TOLERANCE_H;
                data->REG_PWM[0] = NCT6775_REG_PWM;
                data->REG_PWM[1] = NCT6775_REG_FAN_START_OUTPUT;
@@ -3643,6 +3664,7 @@ static int nct6775_probe(struct platform_device *pdev)
                break;
        case nct6791:
        case nct6792:
+       case nct6793:
                data->in_num = 15;
                data->pwm_num = 6;
                data->auto_pwm_num = 4;
@@ -3677,8 +3699,8 @@ static int nct6775_probe(struct platform_device *pdev)
                data->REG_FAN_PULSES = NCT6779_REG_FAN_PULSES;
                data->FAN_PULSE_SHIFT = NCT6775_FAN_PULSE_SHIFT;
                data->REG_FAN_TIME[0] = NCT6775_REG_FAN_STOP_TIME;
-               data->REG_FAN_TIME[1] = NCT6775_REG_FAN_STEP_UP_TIME;
-               data->REG_FAN_TIME[2] = NCT6775_REG_FAN_STEP_DOWN_TIME;
+               data->REG_FAN_TIME[1] = NCT6776_REG_FAN_STEP_UP_TIME;
+               data->REG_FAN_TIME[2] = NCT6776_REG_FAN_STEP_DOWN_TIME;
                data->REG_TOLERANCE_H = NCT6776_REG_TOLERANCE_H;
                data->REG_PWM[0] = NCT6775_REG_PWM;
                data->REG_PWM[1] = NCT6775_REG_FAN_START_OUTPUT;
@@ -3918,6 +3940,7 @@ static int nct6775_probe(struct platform_device *pdev)
        case nct6779:
        case nct6791:
        case nct6792:
+       case nct6793:
                break;
        }
 
@@ -3950,6 +3973,7 @@ static int nct6775_probe(struct platform_device *pdev)
                        break;
                case nct6791:
                case nct6792:
+               case nct6793:
                        tmp |= 0x7e;
                        break;
                }
@@ -4047,7 +4071,8 @@ static int __maybe_unused nct6775_resume(struct device *dev)
        if (reg != data->sio_reg_enable)
                superio_outb(sioreg, SIO_REG_ENABLE, data->sio_reg_enable);
 
-       if (data->kind == nct6791 || data->kind == nct6792)
+       if (data->kind == nct6791 || data->kind == nct6792 ||
+           data->kind == nct6793)
                nct6791_enable_io_mapping(sioreg);
 
        superio_exit(sioreg);
@@ -4106,15 +4131,6 @@ static struct platform_driver nct6775_driver = {
        .probe          = nct6775_probe,
 };
 
-static const char * const nct6775_sio_names[] __initconst = {
-       "NCT6106D",
-       "NCT6775F",
-       "NCT6776D/F",
-       "NCT6779D",
-       "NCT6791D",
-       "NCT6792D",
-};
-
 /* nct6775_find() looks for a '627 in the Super-I/O config space */
 static int __init nct6775_find(int sioaddr, struct nct6775_sio_data *sio_data)
 {
@@ -4150,6 +4166,9 @@ static int __init nct6775_find(int sioaddr, struct nct6775_sio_data *sio_data)
        case SIO_NCT6792_ID:
                sio_data->kind = nct6792;
                break;
+       case SIO_NCT6793_ID:
+               sio_data->kind = nct6793;
+               break;
        default:
                if (val != 0xffff)
                        pr_debug("unsupported chip ID: 0x%04x\n", val);
@@ -4175,7 +4194,8 @@ static int __init nct6775_find(int sioaddr, struct nct6775_sio_data *sio_data)
                superio_outb(sioaddr, SIO_REG_ENABLE, val | 0x01);
        }
 
-       if (sio_data->kind == nct6791 || sio_data->kind == nct6792)
+       if (sio_data->kind == nct6791 || sio_data->kind == nct6792 ||
+           sio_data->kind == nct6793)
                nct6791_enable_io_mapping(sioaddr);
 
        superio_exit(sioaddr);
@@ -4285,7 +4305,7 @@ static void __exit sensors_nct6775_exit(void)
 }
 
 MODULE_AUTHOR("Guenter Roeck <linux@roeck-us.net>");
-MODULE_DESCRIPTION("NCT6106D/NCT6775F/NCT6776F/NCT6779D/NCT6791D/NCT6792D driver");
+MODULE_DESCRIPTION("Driver for NCT6775F and compatible chips");
 MODULE_LICENSE("GPL");
 
 module_init(sensors_nct6775_init);
index dc0b76c5e3028018683b6a34c2c4f58c3906c650..feed30646d91837a5ea9ddab9e5ccd7e812ace47 100644 (file)
@@ -477,7 +477,7 @@ static int ntc_thermistor_get_ohm(struct ntc_data *data)
        return -EINVAL;
 }
 
-static int ntc_read_temp(void *dev, long *temp)
+static int ntc_read_temp(void *dev, int *temp)
 {
        struct ntc_data *data = dev_get_drvdata(dev);
        int ohm;
index 9da2735f14243ed30c1365124195d6dcc9e0baf1..65482624ea2c81a2ef2b5487d6a0470cc90fead7 100644 (file)
@@ -98,7 +98,7 @@ static struct tmp102 *tmp102_update_device(struct device *dev)
        return tmp102;
 }
 
-static int tmp102_read_temp(void *dev, long *temp)
+static int tmp102_read_temp(void *dev, int *temp)
 {
        struct tmp102 *tmp102 = tmp102_update_device(dev);
 
index 577d58d1f1a19881a23063cccf39cf5158d88520..08b86178e8fba99679b5e61977338779204643b0 100644 (file)
@@ -526,6 +526,13 @@ config I2C_EG20T
          ML7213/ML7223/ML7831 is companion chip for Intel Atom E6xx series.
          ML7213/ML7223/ML7831 is completely compatible for Intel EG20T PCH.
 
+config I2C_EMEV2
+       tristate "EMMA Mobile series I2C adapter"
+       depends on HAVE_CLK
+       help
+         If you say yes to this option, support will be included for the
+         I2C interface on the Renesas Electronics EM/EV family of processors.
+
 config I2C_EXYNOS5
        tristate "Exynos5 high-speed I2C driver"
        depends on ARCH_EXYNOS && OF
@@ -612,6 +619,16 @@ config I2C_KEMPLD
          This driver can also be built as a module. If so, the module
          will be called i2c-kempld.
 
+config I2C_LPC2K
+       tristate "I2C bus support for NXP LPC2K/LPC178x/18xx/43xx"
+       depends on OF && (ARCH_LPC18XX || COMPILE_TEST)
+       help
+         This driver supports the I2C interface found on several NXP
+         devices including LPC2xxx, LPC178x/7x and LPC18xx/43xx.
+
+         This driver can also be built as a module.  If so, the module
+         will be called i2c-lpc2k.
+
 config I2C_MESON
        tristate "Amlogic Meson I2C controller"
        depends on ARCH_MESON
@@ -1123,7 +1140,7 @@ config I2C_SIBYTE
 
 config I2C_CROS_EC_TUNNEL
        tristate "ChromeOS EC tunnel I2C bus"
-       depends on CROS_EC_PROTO
+       depends on MFD_CROS_EC
        help
          If you say yes here you get an I2C bus that will tunnel i2c commands
          through to the other side of the ChromeOS EC to the i2c bus
index e5f537c80da08f306304d248a5f358a6b7b9281a..6df3b303bd092bb3bf093dade0bc8e7c32ec51ca 100644 (file)
@@ -48,6 +48,7 @@ i2c-designware-pci-objs := i2c-designware-pcidrv.o
 obj-$(CONFIG_I2C_DIGICOLOR)    += i2c-digicolor.o
 obj-$(CONFIG_I2C_EFM32)                += i2c-efm32.o
 obj-$(CONFIG_I2C_EG20T)                += i2c-eg20t.o
+obj-$(CONFIG_I2C_EMEV2)                += i2c-emev2.o
 obj-$(CONFIG_I2C_EXYNOS5)      += i2c-exynos5.o
 obj-$(CONFIG_I2C_GPIO)         += i2c-gpio.o
 obj-$(CONFIG_I2C_HIGHLANDER)   += i2c-highlander.o
@@ -58,6 +59,7 @@ obj-$(CONFIG_I2C_IMX)         += i2c-imx.o
 obj-$(CONFIG_I2C_IOP3XX)       += i2c-iop3xx.o
 obj-$(CONFIG_I2C_JZ4780)       += i2c-jz4780.o
 obj-$(CONFIG_I2C_KEMPLD)       += i2c-kempld.o
+obj-$(CONFIG_I2C_LPC2K)                += i2c-lpc2k.o
 obj-$(CONFIG_I2C_MESON)                += i2c-meson.o
 obj-$(CONFIG_I2C_MPC)          += i2c-mpc.o
 obj-$(CONFIG_I2C_MT65XX)       += i2c-mt65xx.o
index 2ee78e099d3047de1096ebbb469dc561e589cce3..84deed6571bdf45296b9cb906087d305c9340e55 100644 (file)
@@ -17,6 +17,7 @@
 #include <linux/io.h>
 #include <linux/module.h>
 #include <linux/platform_device.h>
+#include <linux/of.h>
 
 /* Register offsets for the I2C device. */
 #define CDNS_I2C_CR_OFFSET             0x00 /* Control Register, RW */
 
 #define CDNS_I2C_TIMEOUT_MAX   0xFF
 
+#define CDNS_I2C_BROKEN_HOLD_BIT       BIT(0)
+
 #define cdns_i2c_readreg(offset)       readl_relaxed(id->membase + offset)
 #define cdns_i2c_writereg(val, offset) writel_relaxed(val, id->membase + offset)
 
  * @bus_hold_flag:     Flag used in repeated start for clearing HOLD bit
  * @clk:               Pointer to struct clk
  * @clk_rate_change_nb:        Notifier block for clock rate changes
+ * @quirks:            flag for broken hold bit usage in r1p10
  */
 struct cdns_i2c {
        void __iomem *membase;
@@ -154,6 +158,11 @@ struct cdns_i2c {
        unsigned int bus_hold_flag;
        struct clk *clk;
        struct notifier_block clk_rate_change_nb;
+       u32 quirks;
+};
+
+struct cdns_platform_data {
+       u32 quirks;
 };
 
 #define to_cdns_i2c(_nb)       container_of(_nb, struct cdns_i2c, \
@@ -172,6 +181,12 @@ static void cdns_i2c_clear_bus_hold(struct cdns_i2c *id)
                cdns_i2c_writereg(reg & ~CDNS_I2C_CR_HOLD, CDNS_I2C_CR_OFFSET);
 }
 
+static inline bool cdns_is_holdquirk(struct cdns_i2c *id, bool hold_wrkaround)
+{
+       return (hold_wrkaround &&
+               (id->curr_recv_count == CDNS_I2C_FIFO_DEPTH + 1));
+}
+
 /**
  * cdns_i2c_isr - Interrupt handler for the I2C device
  * @irq:       irq number for the I2C device
@@ -186,6 +201,7 @@ static irqreturn_t cdns_i2c_isr(int irq, void *ptr)
 {
        unsigned int isr_status, avail_bytes, updatetx;
        unsigned int bytes_to_send;
+       bool hold_quirk;
        struct cdns_i2c *id = ptr;
        /* Signal completion only after everything is updated */
        int done_flag = 0;
@@ -208,6 +224,8 @@ static irqreturn_t cdns_i2c_isr(int irq, void *ptr)
        if (id->recv_count > id->curr_recv_count)
                updatetx = 1;
 
+       hold_quirk = (id->quirks & CDNS_I2C_BROKEN_HOLD_BIT) && updatetx;
+
        /* When receiving, handle data interrupt and completion interrupt */
        if (id->p_recv_buf &&
            ((isr_status & CDNS_I2C_IXR_COMP) ||
@@ -229,8 +247,7 @@ static irqreturn_t cdns_i2c_isr(int irq, void *ptr)
                        id->recv_count--;
                        id->curr_recv_count--;
 
-                       if (updatetx &&
-                           (id->curr_recv_count == CDNS_I2C_FIFO_DEPTH + 1))
+                       if (cdns_is_holdquirk(id, hold_quirk))
                                break;
                }
 
@@ -241,8 +258,7 @@ static irqreturn_t cdns_i2c_isr(int irq, void *ptr)
                 * maintain transfer size non-zero while performing a large
                 * receive operation.
                 */
-               if (updatetx &&
-                   (id->curr_recv_count == CDNS_I2C_FIFO_DEPTH + 1)) {
+               if (cdns_is_holdquirk(id, hold_quirk)) {
                        /* wait while fifo is full */
                        while (cdns_i2c_readreg(CDNS_I2C_XFER_SIZE_OFFSET) !=
                               (id->curr_recv_count - CDNS_I2C_FIFO_DEPTH))
@@ -264,6 +280,22 @@ static irqreturn_t cdns_i2c_isr(int irq, void *ptr)
                                                  CDNS_I2C_XFER_SIZE_OFFSET);
                                id->curr_recv_count = id->recv_count;
                        }
+               } else if (id->recv_count && !hold_quirk &&
+                                               !id->curr_recv_count) {
+
+                       /* Set the slave address in address register */
+                       cdns_i2c_writereg(id->p_msg->addr & CDNS_I2C_ADDR_MASK,
+                                               CDNS_I2C_ADDR_OFFSET);
+
+                       if (id->recv_count > CDNS_I2C_TRANSFER_SIZE) {
+                               cdns_i2c_writereg(CDNS_I2C_TRANSFER_SIZE,
+                                               CDNS_I2C_XFER_SIZE_OFFSET);
+                               id->curr_recv_count = CDNS_I2C_TRANSFER_SIZE;
+                       } else {
+                               cdns_i2c_writereg(id->recv_count,
+                                               CDNS_I2C_XFER_SIZE_OFFSET);
+                               id->curr_recv_count = id->recv_count;
+                       }
                }
 
                /* Clear hold (if not repeated start) and signal completion */
@@ -535,11 +567,13 @@ static int cdns_i2c_master_xfer(struct i2c_adapter *adap, struct i2c_msg *msgs,
        int ret, count;
        u32 reg;
        struct cdns_i2c *id = adap->algo_data;
+       bool hold_quirk;
 
        /* Check if the bus is free */
        if (cdns_i2c_readreg(CDNS_I2C_SR_OFFSET) & CDNS_I2C_SR_BA)
                return -EAGAIN;
 
+       hold_quirk = !!(id->quirks & CDNS_I2C_BROKEN_HOLD_BIT);
        /*
         * Set the flag to one when multiple messages are to be
         * processed with a repeated start.
@@ -552,7 +586,7 @@ static int cdns_i2c_master_xfer(struct i2c_adapter *adap, struct i2c_msg *msgs,
                 * followed by any other message, an error is returned
                 * indicating that this sequence is not supported.
                 */
-               for (count = 0; count < num - 1; count++) {
+               for (count = 0; (count < num - 1 && hold_quirk); count++) {
                        if (msgs[count].flags & I2C_M_RD) {
                                dev_warn(adap->dev.parent,
                                         "Can't do repeated start after a receive message\n");
@@ -815,6 +849,17 @@ static int __maybe_unused cdns_i2c_resume(struct device *_dev)
 static SIMPLE_DEV_PM_OPS(cdns_i2c_dev_pm_ops, cdns_i2c_suspend,
                         cdns_i2c_resume);
 
+static const struct cdns_platform_data r1p10_i2c_def = {
+       .quirks = CDNS_I2C_BROKEN_HOLD_BIT,
+};
+
+static const struct of_device_id cdns_i2c_of_match[] = {
+       { .compatible = "cdns,i2c-r1p10", .data = &r1p10_i2c_def },
+       { .compatible = "cdns,i2c-r1p14",},
+       { /* end of table */ }
+};
+MODULE_DEVICE_TABLE(of, cdns_i2c_of_match);
+
 /**
  * cdns_i2c_probe - Platform registration call
  * @pdev:      Handle to the platform device structure
@@ -830,6 +875,7 @@ static int cdns_i2c_probe(struct platform_device *pdev)
        struct resource *r_mem;
        struct cdns_i2c *id;
        int ret;
+       const struct of_device_id *match;
 
        id = devm_kzalloc(&pdev->dev, sizeof(*id), GFP_KERNEL);
        if (!id)
@@ -837,6 +883,12 @@ static int cdns_i2c_probe(struct platform_device *pdev)
 
        platform_set_drvdata(pdev, id);
 
+       match = of_match_node(cdns_i2c_of_match, pdev->dev.of_node);
+       if (match && match->data) {
+               const struct cdns_platform_data *data = match->data;
+               id->quirks = data->quirks;
+       }
+
        r_mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
        id->membase = devm_ioremap_resource(&pdev->dev, r_mem);
        if (IS_ERR(id->membase))
@@ -844,6 +896,7 @@ static int cdns_i2c_probe(struct platform_device *pdev)
 
        id->irq = platform_get_irq(pdev, 0);
 
+       id->adap.owner = THIS_MODULE;
        id->adap.dev.of_node = pdev->dev.of_node;
        id->adap.algo = &cdns_i2c_algo;
        id->adap.timeout = CDNS_I2C_TIMEOUT;
@@ -935,12 +988,6 @@ static int cdns_i2c_remove(struct platform_device *pdev)
        return 0;
 }
 
-static const struct of_device_id cdns_i2c_of_match[] = {
-       { .compatible = "cdns,i2c-r1p10", },
-       { /* end of table */ }
-};
-MODULE_DEVICE_TABLE(of, cdns_i2c_of_match);
-
 static struct platform_driver cdns_i2c_drv = {
        .driver = {
                .name  = DRIVER_NAME,
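
The Cadence changes above key the broken-HOLD-bit workaround off the compatible string by attaching platform data to the of_device_id entry and fetching it with of_match_node() in probe. A hedged minimal sketch of that lookup pattern; the driver, compatible strings and structure names here are illustrative only.

/* Hedged sketch: per-compatible quirk flags via of_device_id .data. */
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/of.h>
#include <linux/of_device.h>
#include <linux/platform_device.h>

#define FOO_BROKEN_HOLD_BIT	BIT(0)

struct foo_pdata {
	u32 quirks;
};

static const struct foo_pdata foo_r1p10_pdata = {
	.quirks = FOO_BROKEN_HOLD_BIT,
};

static const struct of_device_id foo_of_match[] = {
	{ .compatible = "vendor,foo-r1p10", .data = &foo_r1p10_pdata },
	{ .compatible = "vendor,foo-r1p14" },	/* newer revision, no quirks */
	{ /* sentinel */ }
};
MODULE_DEVICE_TABLE(of, foo_of_match);

static int foo_probe(struct platform_device *pdev)
{
	const struct of_device_id *match;
	u32 quirks = 0;

	match = of_match_node(foo_of_match, pdev->dev.of_node);
	if (match && match->data)
		quirks = ((const struct foo_pdata *)match->data)->quirks;

	/* ... apply the workaround only when FOO_BROKEN_HOLD_BIT is set ... */
	return 0;
}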
index 6f19a33773fe79dd4ef27643232b2df1e5bba66b..7441cdc1b34a6a67f85fd3bfd71db40b5076734c 100644 (file)
@@ -777,8 +777,7 @@ irqreturn_t i2c_dw_isr(int this_irq, void *dev_id)
 
        enabled = dw_readl(dev, DW_IC_ENABLE);
        stat = dw_readl(dev, DW_IC_RAW_INTR_STAT);
-       dev_dbg(dev->dev, "%s:  %s enabled= 0x%x stat=0x%x\n", __func__,
-               dev->adapter.name, enabled, stat);
+       dev_dbg(dev->dev, "%s: enabled=%#x stat=%#x\n", __func__, enabled, stat);
        if (!enabled || !(stat & ~DW_IC_INTR_ACTIVITY))
                return IRQ_NONE;
 
index 6643d2dc0b250ddbf022c669db4fd2b4b4f848e7..df23e8c30e6f8a37556de068cd816493d5574184 100644 (file)
@@ -260,8 +260,8 @@ static int i2c_dw_pci_probe(struct pci_dev *pdev,
 
        snprintf(adap->name, sizeof(adap->name), "i2c-designware-pci");
 
-       r = devm_request_irq(&pdev->dev, pdev->irq, i2c_dw_isr, IRQF_SHARED,
-                       adap->name, dev);
+       r = devm_request_irq(&pdev->dev, pdev->irq, i2c_dw_isr,
+                       IRQF_SHARED | IRQF_COND_SUSPEND, adap->name, dev);
        if (r) {
                dev_err(&pdev->dev, "failure requesting irq %i\n", dev->irq);
                return r;
diff --git a/drivers/i2c/busses/i2c-emev2.c b/drivers/i2c/busses/i2c-emev2.c
new file mode 100644 (file)
index 0000000..192ef6b
--- /dev/null
@@ -0,0 +1,332 @@
+/*
+ * I2C driver for the Renesas EMEV2 SoC
+ *
+ * Copyright (C) 2015 Wolfram Sang <wsa@sang-engineering.com>
+ * Copyright 2013 Codethink Ltd.
+ * Copyright 2010-2015 Renesas Electronics Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ */
+
+#include <linux/clk.h>
+#include <linux/completion.h>
+#include <linux/device.h>
+#include <linux/i2c.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of_device.h>
+#include <linux/platform_device.h>
+#include <linux/sched.h>
+
+/* I2C Registers */
+#define I2C_OFS_IICACT0                0x00    /* start */
+#define I2C_OFS_IIC0           0x04    /* shift */
+#define I2C_OFS_IICC0          0x08    /* control */
+#define I2C_OFS_SVA0           0x0c    /* slave address */
+#define I2C_OFS_IICCL0         0x10    /* clock select */
+#define I2C_OFS_IICX0          0x14    /* extension */
+#define I2C_OFS_IICS0          0x18    /* status */
+#define I2C_OFS_IICSE0         0x1c    /* status for emulation */
+#define I2C_OFS_IICF0          0x20    /* IIC flag */
+
+/* I2C IICACT0 Masks */
+#define I2C_BIT_IICE0          0x0001
+
+/* I2C IICC0 Masks */
+#define I2C_BIT_LREL0          0x0040
+#define I2C_BIT_WREL0          0x0020
+#define I2C_BIT_SPIE0          0x0010
+#define I2C_BIT_WTIM0          0x0008
+#define I2C_BIT_ACKE0          0x0004
+#define I2C_BIT_STT0           0x0002
+#define I2C_BIT_SPT0           0x0001
+
+/* I2C IICCL0 Masks */
+#define I2C_BIT_SMC0           0x0008
+#define I2C_BIT_DFC0           0x0004
+
+/* I2C IICSE0 Masks */
+#define I2C_BIT_MSTS0          0x0080
+#define I2C_BIT_ALD0           0x0040
+#define I2C_BIT_EXC0           0x0020
+#define I2C_BIT_COI0           0x0010
+#define I2C_BIT_TRC0           0x0008
+#define I2C_BIT_ACKD0          0x0004
+#define I2C_BIT_STD0           0x0002
+#define I2C_BIT_SPD0           0x0001
+
+/* I2C IICF0 Masks */
+#define I2C_BIT_STCF           0x0080
+#define I2C_BIT_IICBSY         0x0040
+#define I2C_BIT_STCEN          0x0002
+#define I2C_BIT_IICRSV         0x0001
+
+struct em_i2c_device {
+       void __iomem *base;
+       struct i2c_adapter adap;
+       struct completion msg_done;
+       struct clk *sclk;
+};
+
+static inline void em_clear_set_bit(struct em_i2c_device *priv, u8 clear, u8 set, u8 reg)
+{
+       writeb((readb(priv->base + reg) & ~clear) | set, priv->base + reg);
+}
+
+static int em_i2c_wait_for_event(struct em_i2c_device *priv)
+{
+       unsigned long time_left;
+       int status;
+
+       reinit_completion(&priv->msg_done);
+
+       time_left = wait_for_completion_timeout(&priv->msg_done, priv->adap.timeout);
+
+       if (!time_left)
+               return -ETIMEDOUT;
+
+       status = readb(priv->base + I2C_OFS_IICSE0);
+       return status & I2C_BIT_ALD0 ? -EAGAIN : status;
+}
+
+static void em_i2c_stop(struct em_i2c_device *priv)
+{
+       /* Send Stop condition */
+       em_clear_set_bit(priv, 0, I2C_BIT_SPT0 | I2C_BIT_SPIE0, I2C_OFS_IICC0);
+
+       /* Wait for stop condition */
+       em_i2c_wait_for_event(priv);
+}
+
+static void em_i2c_reset(struct i2c_adapter *adap)
+{
+       struct em_i2c_device *priv = i2c_get_adapdata(adap);
+       int retr;
+
+       /* If I2C active */
+       if (readb(priv->base + I2C_OFS_IICACT0) & I2C_BIT_IICE0) {
+               /* Disable I2C operation */
+               writeb(0, priv->base + I2C_OFS_IICACT0);
+
+               retr = 1000;
+               while (readb(priv->base + I2C_OFS_IICACT0) == 1 && retr)
+                       retr--;
+               WARN_ON(retr == 0);
+       }
+
+       /* Transfer mode set */
+       writeb(I2C_BIT_DFC0, priv->base + I2C_OFS_IICCL0);
+
+       /* Can issue a start without detecting a stop; reservation disabled. */
+       writeb(I2C_BIT_STCEN | I2C_BIT_IICRSV, priv->base + I2C_OFS_IICF0);
+
+       /* I2C enable, 9 bit interrupt mode */
+       writeb(I2C_BIT_WTIM0, priv->base + I2C_OFS_IICC0);
+
+       /* Enable I2C operation */
+       writeb(I2C_BIT_IICE0, priv->base + I2C_OFS_IICACT0);
+
+       retr = 1000;
+       while (readb(priv->base + I2C_OFS_IICACT0) == 0 && retr)
+               retr--;
+       WARN_ON(retr == 0);
+}
+
+static int __em_i2c_xfer(struct i2c_adapter *adap, struct i2c_msg *msg,
+                               int stop)
+{
+       struct em_i2c_device *priv = i2c_get_adapdata(adap);
+       int count, status, read = !!(msg->flags & I2C_M_RD);
+
+       /* Send start condition */
+       em_clear_set_bit(priv, 0, I2C_BIT_ACKE0 | I2C_BIT_WTIM0, I2C_OFS_IICC0);
+       em_clear_set_bit(priv, 0, I2C_BIT_STT0, I2C_OFS_IICC0);
+
+       /* Send slave address and R/W type */
+       writeb((msg->addr << 1) | read, priv->base + I2C_OFS_IIC0);
+
+       /* Wait for transaction */
+       status = em_i2c_wait_for_event(priv);
+       if (status < 0)
+               goto out_reset;
+
+       /* Received NACK (result of setting slave address and R/W) */
+       if (!(status & I2C_BIT_ACKD0)) {
+               em_i2c_stop(priv);
+               goto out;
+       }
+
+       /* Extra setup for read transactions */
+       if (read) {
+               /* 8 bit interrupt mode */
+               em_clear_set_bit(priv, I2C_BIT_WTIM0, I2C_BIT_ACKE0, I2C_OFS_IICC0);
+               em_clear_set_bit(priv, I2C_BIT_WTIM0, I2C_BIT_WREL0, I2C_OFS_IICC0);
+
+               /* Wait for transaction */
+               status = em_i2c_wait_for_event(priv);
+               if (status < 0)
+                       goto out_reset;
+       }
+
+       /* Send / receive data */
+       for (count = 0; count < msg->len; count++) {
+               if (read) { /* Read transaction */
+                       msg->buf[count] = readb(priv->base + I2C_OFS_IIC0);
+                       em_clear_set_bit(priv, 0, I2C_BIT_WREL0, I2C_OFS_IICC0);
+
+               } else { /* Write transaction */
+                       /* Received NACK */
+                       if (!(status & I2C_BIT_ACKD0)) {
+                               em_i2c_stop(priv);
+                               goto out;
+                       }
+
+                       /* Write data */
+                       writeb(msg->buf[count], priv->base + I2C_OFS_IIC0);
+               }
+
+               /* Wait for R/W transaction */
+               status = em_i2c_wait_for_event(priv);
+               if (status < 0)
+                       goto out_reset;
+       }
+
+       if (stop)
+               em_i2c_stop(priv);
+
+       return count;
+
+out_reset:
+       em_i2c_reset(adap);
+out:
+       return status < 0 ? status : -ENXIO;
+}
+
+static int em_i2c_xfer(struct i2c_adapter *adap, struct i2c_msg *msgs,
+       int num)
+{
+       struct em_i2c_device *priv = i2c_get_adapdata(adap);
+       int ret, i;
+
+       if (readb(priv->base + I2C_OFS_IICF0) & I2C_BIT_IICBSY)
+               return -EAGAIN;
+
+       for (i = 0; i < num; i++) {
+               ret = __em_i2c_xfer(adap, &msgs[i], (i == (num - 1)));
+               if (ret < 0)
+                       return ret;
+       }
+
+       /* I2C transfer completed */
+       return num;
+}
+
+static irqreturn_t em_i2c_irq_handler(int this_irq, void *dev_id)
+{
+       struct em_i2c_device *priv = dev_id;
+
+       complete(&priv->msg_done);
+       return IRQ_HANDLED;
+}
+
+static u32 em_i2c_func(struct i2c_adapter *adap)
+{
+       return I2C_FUNC_I2C | I2C_FUNC_SMBUS_EMUL;
+}
+
+static struct i2c_algorithm em_i2c_algo = {
+       .master_xfer = em_i2c_xfer,
+       .functionality = em_i2c_func,
+};
+
+static int em_i2c_probe(struct platform_device *pdev)
+{
+       struct em_i2c_device *priv;
+       struct resource *r;
+       int irq, ret;
+
+       priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
+       if (!priv)
+               return -ENOMEM;
+
+       r = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       priv->base = devm_ioremap_resource(&pdev->dev, r);
+       if (IS_ERR(priv->base))
+               return PTR_ERR(priv->base);
+
+       strlcpy(priv->adap.name, "EMEV2 I2C", sizeof(priv->adap.name));
+
+       priv->sclk = devm_clk_get(&pdev->dev, "sclk");
+       if (IS_ERR(priv->sclk))
+               return PTR_ERR(priv->sclk);
+
+       clk_prepare_enable(priv->sclk);
+
+       priv->adap.timeout = msecs_to_jiffies(100);
+       priv->adap.retries = 5;
+       priv->adap.dev.parent = &pdev->dev;
+       priv->adap.algo = &em_i2c_algo;
+       priv->adap.owner = THIS_MODULE;
+       priv->adap.dev.of_node = pdev->dev.of_node;
+
+       init_completion(&priv->msg_done);
+
+       platform_set_drvdata(pdev, priv);
+       i2c_set_adapdata(&priv->adap, priv);
+
+       em_i2c_reset(&priv->adap);
+
+       irq = platform_get_irq(pdev, 0);
+       ret = devm_request_irq(&pdev->dev, irq, em_i2c_irq_handler, 0,
+                               "em_i2c", priv);
+       if (ret)
+               goto err_clk;
+
+       ret = i2c_add_adapter(&priv->adap);
+
+       if (ret)
+               goto err_clk;
+
+       dev_info(&pdev->dev, "Added i2c controller %d, irq %d\n", priv->adap.nr, irq);
+
+       return 0;
+
+err_clk:
+       clk_disable_unprepare(priv->sclk);
+       return ret;
+}
+
+static int em_i2c_remove(struct platform_device *dev)
+{
+       struct em_i2c_device *priv = platform_get_drvdata(dev);
+
+       i2c_del_adapter(&priv->adap);
+       clk_disable_unprepare(priv->sclk);
+
+       return 0;
+}
+
+static const struct of_device_id em_i2c_ids[] = {
+       { .compatible = "renesas,iic-emev2", },
+       { }
+};
+
+static struct platform_driver em_i2c_driver = {
+       .probe = em_i2c_probe,
+       .remove = em_i2c_remove,
+       .driver = {
+               .name = "em-i2c",
+               .of_match_table = em_i2c_ids,
+       }
+};
+module_platform_driver(em_i2c_driver);
+
+MODULE_DESCRIPTION("EMEV2 I2C bus driver");
+MODULE_AUTHOR("Ian Molton and Wolfram Sang <wsa@sang-engineering.com>");
+MODULE_LICENSE("GPL v2");
+MODULE_DEVICE_TABLE(of, em_i2c_ids);
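
The adapter registered above is driven entirely through the standard i2c_adapter interface: the I2C core calls em_i2c_xfer() (the .master_xfer hook) once per transfer and requests a stop condition only after the last message. As a minimal sketch of how that path is exercised from user space via i2c-dev (the /dev/i2c-0 node and the 0x50 slave address are assumptions for the example, not part of this patch):

/* Illustrative only: read one byte from a hypothetical EEPROM-style
 * device behind the EMEV2 adapter, using the i2c-dev interface. */
#include <fcntl.h>
#include <linux/i2c.h>
#include <linux/i2c-dev.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	unsigned char reg = 0x00, val = 0;
	struct i2c_msg msgs[2] = {
		{ .addr = 0x50, .flags = 0,        .len = 1, .buf = &reg },
		{ .addr = 0x50, .flags = I2C_M_RD, .len = 1, .buf = &val },
	};
	struct i2c_rdwr_ioctl_data xfer = { .msgs = msgs, .nmsgs = 2 };
	int fd = open("/dev/i2c-0", O_RDWR);	/* adapter added in em_i2c_probe() */

	if (fd < 0 || ioctl(fd, I2C_RDWR, &xfer) < 0)
		return 1;
	printf("read 0x%02x\n", val);
	close(fd);
	return 0;
}

Each i2c_msg in the array becomes one __em_i2c_xfer() call; the write message carries the register offset, and the I2C_M_RD message takes the read path with its 8-bit interrupt mode setup.
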
diff --git a/drivers/i2c/busses/i2c-lpc2k.c b/drivers/i2c/busses/i2c-lpc2k.c
new file mode 100644 (file)
index 0000000..8560a13
--- /dev/null
@@ -0,0 +1,513 @@
+/*
+ * Copyright (C) 2011 NXP Semiconductors
+ *
+ * Code portions referenced from the i2c-pxa and i2c-pnx drivers
+ *
+ * Make SMBus byte and word transactions work on LPC178x/7x
+ * Copyright (c) 2012
+ * Alexander Potashev, Emcraft Systems, aspotashev@emcraft.com
+ * Anton Protopopov, Emcraft Systems, antonp@emcraft.com
+ *
+ * Copyright (C) 2015 Joachim Eastwood <manabian@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <linux/clk.h>
+#include <linux/errno.h>
+#include <linux/i2c.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/platform_device.h>
+#include <linux/sched.h>
+#include <linux/time.h>
+
+/* LPC24xx register offsets and bits */
+#define LPC24XX_I2CONSET       0x00
+#define LPC24XX_I2STAT         0x04
+#define LPC24XX_I2DAT          0x08
+#define LPC24XX_I2ADDR         0x0c
+#define LPC24XX_I2SCLH         0x10
+#define LPC24XX_I2SCLL         0x14
+#define LPC24XX_I2CONCLR       0x18
+
+#define LPC24XX_AA             BIT(2)
+#define LPC24XX_SI             BIT(3)
+#define LPC24XX_STO            BIT(4)
+#define LPC24XX_STA            BIT(5)
+#define LPC24XX_I2EN           BIT(6)
+
+#define LPC24XX_STO_AA         (LPC24XX_STO | LPC24XX_AA)
+#define LPC24XX_CLEAR_ALL      (LPC24XX_AA | LPC24XX_SI | LPC24XX_STO | \
+                                LPC24XX_STA | LPC24XX_I2EN)
+
+/* I2C SCL clock has different duty cycle depending on mode */
+#define I2C_STD_MODE_DUTY              46
+#define I2C_FAST_MODE_DUTY             36
+#define I2C_FAST_MODE_PLUS_DUTY                38
+
+/*
+ * There are 26 possible I2C status codes, but only those applicable
+ * to master mode are listed here and used in this driver.
+ */
+enum {
+       M_BUS_ERROR             = 0x00,
+       M_START                 = 0x08,
+       M_REPSTART              = 0x10,
+       MX_ADDR_W_ACK           = 0x18,
+       MX_ADDR_W_NACK          = 0x20,
+       MX_DATA_W_ACK           = 0x28,
+       MX_DATA_W_NACK          = 0x30,
+       M_DATA_ARB_LOST         = 0x38,
+       MR_ADDR_R_ACK           = 0x40,
+       MR_ADDR_R_NACK          = 0x48,
+       MR_DATA_R_ACK           = 0x50,
+       MR_DATA_R_NACK          = 0x58,
+       M_I2C_IDLE              = 0xf8,
+};
+
+struct lpc2k_i2c {
+       void __iomem            *base;
+       struct clk              *clk;
+       int                     irq;
+       wait_queue_head_t       wait;
+       struct i2c_adapter      adap;
+       struct i2c_msg          *msg;
+       int                     msg_idx;
+       int                     msg_status;
+       int                     is_last;
+};
+
+static void i2c_lpc2k_reset(struct lpc2k_i2c *i2c)
+{
+       /* Will force clear all statuses */
+       writel(LPC24XX_CLEAR_ALL, i2c->base + LPC24XX_I2CONCLR);
+       writel(0, i2c->base + LPC24XX_I2ADDR);
+       writel(LPC24XX_I2EN, i2c->base + LPC24XX_I2CONSET);
+}
+
+static int i2c_lpc2k_clear_arb(struct lpc2k_i2c *i2c)
+{
+       unsigned long timeout = jiffies + msecs_to_jiffies(1000);
+
+       /*
+        * If the transfer needs to abort for some reason, we'll try to
+        * force a stop condition to clear any pending bus conditions
+        */
+       writel(LPC24XX_STO, i2c->base + LPC24XX_I2CONSET);
+
+       /* Wait for status change */
+       while (readl(i2c->base + LPC24XX_I2STAT) != M_I2C_IDLE) {
+               if (time_after(jiffies, timeout)) {
+                       /* Bus was not idle, try to reset adapter */
+                       i2c_lpc2k_reset(i2c);
+                       return -EBUSY;
+               }
+
+               cpu_relax();
+       }
+
+       return 0;
+}
+
+static void i2c_lpc2k_pump_msg(struct lpc2k_i2c *i2c)
+{
+       unsigned char data;
+       u32 status;
+
+       /*
+        * I2C in the LPC2xxx series is basically a state machine.
+        * Just run through the steps based on the current status.
+        */
+       status = readl(i2c->base + LPC24XX_I2STAT);
+
+       switch (status) {
+       case M_START:
+       case M_REPSTART:
+               /* Start bit was just sent out, send out addr and dir */
+               data = i2c->msg->addr << 1;
+               if (i2c->msg->flags & I2C_M_RD)
+                       data |= 1;
+
+               writel(data, i2c->base + LPC24XX_I2DAT);
+               writel(LPC24XX_STA, i2c->base + LPC24XX_I2CONCLR);
+               break;
+
+       case MX_ADDR_W_ACK:
+       case MX_DATA_W_ACK:
+               /*
+                * Address or data was sent out with an ACK. If there is more
+                * data to send, send it now
+                */
+               if (i2c->msg_idx < i2c->msg->len) {
+                       writel(i2c->msg->buf[i2c->msg_idx],
+                              i2c->base + LPC24XX_I2DAT);
+               } else if (i2c->is_last) {
+                       /* Last message, send stop */
+                       writel(LPC24XX_STO_AA, i2c->base + LPC24XX_I2CONSET);
+                       writel(LPC24XX_SI, i2c->base + LPC24XX_I2CONCLR);
+                       i2c->msg_status = 0;
+                       disable_irq_nosync(i2c->irq);
+               } else {
+                       i2c->msg_status = 0;
+                       disable_irq_nosync(i2c->irq);
+               }
+
+               i2c->msg_idx++;
+               break;
+
+       case MR_ADDR_R_ACK:
+               /* Receive first byte from slave */
+               if (i2c->msg->len == 1) {
+                       /* Last byte, return NACK */
+                       writel(LPC24XX_AA, i2c->base + LPC24XX_I2CONCLR);
+               } else {
+                       /* Not last byte, return ACK */
+                       writel(LPC24XX_AA, i2c->base + LPC24XX_I2CONSET);
+               }
+
+               writel(LPC24XX_STA, i2c->base + LPC24XX_I2CONCLR);
+               break;
+
+       case MR_DATA_R_NACK:
+               /*
+                * The I2C controller shows NACK status on reads, so we need
+                * to accept the NACK as an ACK here. This should be OK, as a
+                * real NACK would have been caught on the address write.
+                */
+       case MR_DATA_R_ACK:
+               /* Data was received */
+               if (i2c->msg_idx < i2c->msg->len) {
+                       i2c->msg->buf[i2c->msg_idx] =
+                                       readl(i2c->base + LPC24XX_I2DAT);
+               }
+
+               /* If transfer is done, send STOP */
+               if (i2c->msg_idx >= i2c->msg->len - 1 && i2c->is_last) {
+                       writel(LPC24XX_STO_AA, i2c->base + LPC24XX_I2CONSET);
+                       writel(LPC24XX_SI, i2c->base + LPC24XX_I2CONCLR);
+                       i2c->msg_status = 0;
+               }
+
+               /* Message is done */
+               if (i2c->msg_idx >= i2c->msg->len - 1) {
+                       i2c->msg_status = 0;
+                       disable_irq_nosync(i2c->irq);
+               }
+
+               /*
+                * When only one data byte is left to receive, send NACK to
+                * tell the slave that it will be the last byte transferred.
+                */
+               if (i2c->msg_idx >= i2c->msg->len - 2) {
+                       /* One byte left to receive - NACK */
+                       writel(LPC24XX_AA, i2c->base + LPC24XX_I2CONCLR);
+               } else {
+                       /* More than one byte left to receive - ACK */
+                       writel(LPC24XX_AA, i2c->base + LPC24XX_I2CONSET);
+               }
+
+               writel(LPC24XX_STA, i2c->base + LPC24XX_I2CONCLR);
+               i2c->msg_idx++;
+               break;
+
+       case MX_ADDR_W_NACK:
+       case MX_DATA_W_NACK:
+       case MR_ADDR_R_NACK:
+               /* NACK processing is done */
+               writel(LPC24XX_STO_AA, i2c->base + LPC24XX_I2CONSET);
+               i2c->msg_status = -ENXIO;
+               disable_irq_nosync(i2c->irq);
+               break;
+
+       case M_DATA_ARB_LOST:
+               /* Arbitration lost */
+               i2c->msg_status = -EAGAIN;
+
+               /* Release the I2C bus */
+               writel(LPC24XX_STA | LPC24XX_STO, i2c->base + LPC24XX_I2CONCLR);
+               disable_irq_nosync(i2c->irq);
+               break;
+
+       default:
+               /* Unexpected statuses */
+               i2c->msg_status = -EIO;
+               disable_irq_nosync(i2c->irq);
+               break;
+       }
+
+       /* Exit on failure or all bytes transferred */
+       if (i2c->msg_status != -EBUSY)
+               wake_up(&i2c->wait);
+
+       /*
+        * If `msg_status` is zero, then `lpc2k_process_msg()`
+        * is responsible for clearing the SI flag.
+        */
+       if (i2c->msg_status != 0)
+               writel(LPC24XX_SI, i2c->base + LPC24XX_I2CONCLR);
+}
+
+static int lpc2k_process_msg(struct lpc2k_i2c *i2c, int msgidx)
+{
+       /* A new transfer is kicked off by initiating a start condition */
+       if (!msgidx) {
+               writel(LPC24XX_STA, i2c->base + LPC24XX_I2CONSET);
+       } else {
+               /*
+                * A multi-message I2C transfer continues where the
+                * previous I2C transfer left off and uses the
+                * current condition of the I2C adapter.
+                */
+               if (unlikely(i2c->msg->flags & I2C_M_NOSTART)) {
+                       WARN_ON(i2c->msg->len == 0);
+
+                       if (!(i2c->msg->flags & I2C_M_RD)) {
+                               /* Start transmit of data */
+                               writel(i2c->msg->buf[0],
+                                      i2c->base + LPC24XX_I2DAT);
+                               i2c->msg_idx++;
+                       }
+               } else {
+                       /* Start or repeated start */
+                       writel(LPC24XX_STA, i2c->base + LPC24XX_I2CONSET);
+               }
+
+               writel(LPC24XX_SI, i2c->base + LPC24XX_I2CONCLR);
+       }
+
+       enable_irq(i2c->irq);
+
+       /* Wait for transfer completion */
+       if (wait_event_timeout(i2c->wait, i2c->msg_status != -EBUSY,
+                              msecs_to_jiffies(1000)) == 0) {
+               disable_irq_nosync(i2c->irq);
+
+               return -ETIMEDOUT;
+       }
+
+       return i2c->msg_status;
+}
+
+static int i2c_lpc2k_xfer(struct i2c_adapter *adap, struct i2c_msg *msgs,
+                         int msg_num)
+{
+       struct lpc2k_i2c *i2c = i2c_get_adapdata(adap);
+       int ret, i;
+       u32 stat;
+
+       /* Check for bus idle condition */
+       stat = readl(i2c->base + LPC24XX_I2STAT);
+       if (stat != M_I2C_IDLE) {
+               /* Something is holding the bus, try to clear it */
+               return i2c_lpc2k_clear_arb(i2c);
+       }
+
+       /* Process a single message at a time */
+       for (i = 0; i < msg_num; i++) {
+               /* Save message pointer and current message data index */
+               i2c->msg = &msgs[i];
+               i2c->msg_idx = 0;
+               i2c->msg_status = -EBUSY;
+               i2c->is_last = (i == (msg_num - 1));
+
+               ret = lpc2k_process_msg(i2c, i);
+               if (ret)
+                       return ret;
+       }
+
+       return msg_num;
+}
+
+static irqreturn_t i2c_lpc2k_handler(int irq, void *dev_id)
+{
+       struct lpc2k_i2c *i2c = dev_id;
+
+       if (readl(i2c->base + LPC24XX_I2CONSET) & LPC24XX_SI) {
+               i2c_lpc2k_pump_msg(i2c);
+               return IRQ_HANDLED;
+       }
+
+       return IRQ_NONE;
+}
+
+static u32 i2c_lpc2k_functionality(struct i2c_adapter *adap)
+{
+       /* Only emulated SMBus for now */
+       return I2C_FUNC_I2C | I2C_FUNC_SMBUS_EMUL;
+}
+
+static const struct i2c_algorithm i2c_lpc2k_algorithm = {
+       .master_xfer    = i2c_lpc2k_xfer,
+       .functionality  = i2c_lpc2k_functionality,
+};
+
+static int i2c_lpc2k_probe(struct platform_device *pdev)
+{
+       struct lpc2k_i2c *i2c;
+       struct resource *res;
+       u32 bus_clk_rate;
+       u32 scl_high;
+       u32 clkrate;
+       int ret;
+
+       i2c = devm_kzalloc(&pdev->dev, sizeof(*i2c), GFP_KERNEL);
+       if (!i2c)
+               return -ENOMEM;
+
+       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       i2c->base = devm_ioremap_resource(&pdev->dev, res);
+       if (IS_ERR(i2c->base))
+               return PTR_ERR(i2c->base);
+
+       i2c->irq = platform_get_irq(pdev, 0);
+       if (i2c->irq < 0) {
+               dev_err(&pdev->dev, "can't get interrupt resource\n");
+               return i2c->irq;
+       }
+
+       init_waitqueue_head(&i2c->wait);
+
+       i2c->clk = devm_clk_get(&pdev->dev, NULL);
+       if (IS_ERR(i2c->clk)) {
+               dev_err(&pdev->dev, "error getting clock\n");
+               return PTR_ERR(i2c->clk);
+       }
+
+       ret = clk_prepare_enable(i2c->clk);
+       if (ret) {
+               dev_err(&pdev->dev, "unable to enable clock.\n");
+               return ret;
+       }
+
+       ret = devm_request_irq(&pdev->dev, i2c->irq, i2c_lpc2k_handler, 0,
+                              dev_name(&pdev->dev), i2c);
+       if (ret < 0) {
+               dev_err(&pdev->dev, "can't request interrupt.\n");
+               goto fail_clk;
+       }
+
+       disable_irq_nosync(i2c->irq);
+
+       /* Place controller in a known state */
+       i2c_lpc2k_reset(i2c);
+
+       ret = of_property_read_u32(pdev->dev.of_node, "clock-frequency",
+                                  &bus_clk_rate);
+       if (ret)
+               bus_clk_rate = 100000; /* 100 kHz default clock rate */
+
+       clkrate = clk_get_rate(i2c->clk);
+       if (clkrate == 0) {
+               dev_err(&pdev->dev, "can't get I2C base clock\n");
+               ret = -EINVAL;
+               goto fail_clk;
+       }
+
+       /* Setup I2C dividers to generate clock with proper duty cycle */
+       clkrate = clkrate / bus_clk_rate;
+       if (bus_clk_rate <= 100000)
+               scl_high = (clkrate * I2C_STD_MODE_DUTY) / 100;
+       else if (bus_clk_rate <= 400000)
+               scl_high = (clkrate * I2C_FAST_MODE_DUTY) / 100;
+       else
+               scl_high = (clkrate * I2C_FAST_MODE_PLUS_DUTY) / 100;
+
+       writel(scl_high, i2c->base + LPC24XX_I2SCLH);
+       writel(clkrate - scl_high, i2c->base + LPC24XX_I2SCLL);
+
+       platform_set_drvdata(pdev, i2c);
+
+       i2c_set_adapdata(&i2c->adap, i2c);
+       i2c->adap.owner = THIS_MODULE;
+       strlcpy(i2c->adap.name, "LPC2K I2C adapter", sizeof(i2c->adap.name));
+       i2c->adap.algo = &i2c_lpc2k_algorithm;
+       i2c->adap.dev.parent = &pdev->dev;
+       i2c->adap.dev.of_node = pdev->dev.of_node;
+
+       ret = i2c_add_adapter(&i2c->adap);
+       if (ret < 0) {
+               dev_err(&pdev->dev, "failed to add adapter!\n");
+               goto fail_clk;
+       }
+
+       dev_info(&pdev->dev, "LPC2K I2C adapter\n");
+
+       return 0;
+
+fail_clk:
+       clk_disable_unprepare(i2c->clk);
+       return ret;
+}
+
+static int i2c_lpc2k_remove(struct platform_device *dev)
+{
+       struct lpc2k_i2c *i2c = platform_get_drvdata(dev);
+
+       i2c_del_adapter(&i2c->adap);
+       clk_disable_unprepare(i2c->clk);
+
+       return 0;
+}
+
+#ifdef CONFIG_PM
+static int i2c_lpc2k_suspend(struct device *dev)
+{
+       struct platform_device *pdev = to_platform_device(dev);
+       struct lpc2k_i2c *i2c = platform_get_drvdata(pdev);
+
+       clk_disable(i2c->clk);
+
+       return 0;
+}
+
+static int i2c_lpc2k_resume(struct device *dev)
+{
+       struct platform_device *pdev = to_platform_device(dev);
+       struct lpc2k_i2c *i2c = platform_get_drvdata(pdev);
+
+       clk_enable(i2c->clk);
+       i2c_lpc2k_reset(i2c);
+
+       return 0;
+}
+
+static const struct dev_pm_ops i2c_lpc2k_dev_pm_ops = {
+       .suspend_noirq = i2c_lpc2k_suspend,
+       .resume_noirq = i2c_lpc2k_resume,
+};
+
+#define I2C_LPC2K_DEV_PM_OPS (&i2c_lpc2k_dev_pm_ops)
+#else
+#define I2C_LPC2K_DEV_PM_OPS NULL
+#endif
+
+static const struct of_device_id lpc2k_i2c_match[] = {
+       { .compatible = "nxp,lpc1788-i2c" },
+       {},
+};
+MODULE_DEVICE_TABLE(of, lpc2k_i2c_match);
+
+static struct platform_driver i2c_lpc2k_driver = {
+       .probe  = i2c_lpc2k_probe,
+       .remove = i2c_lpc2k_remove,
+       .driver = {
+               .name           = "lpc2k-i2c",
+               .pm             = I2C_LPC2K_DEV_PM_OPS,
+               .of_match_table = lpc2k_i2c_match,
+       },
+};
+module_platform_driver(i2c_lpc2k_driver);
+
+MODULE_AUTHOR("Kevin Wells <kevin.wells@nxp.com>");
+MODULE_DESCRIPTION("I2C driver for LPC2xxx devices");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:lpc2k-i2c");
index 9920eef74672ff94dd85f6bd01e5df55f4210713..c02e6c018c39f0034dd9dab70c2f61ce290094b8 100644 (file)
@@ -59,6 +59,7 @@
 #define I2C_DMA_START_EN               0x0001
 #define I2C_DMA_INT_FLAG_NONE          0x0000
 #define I2C_DMA_CLR_FLAG               0x0000
+#define I2C_DMA_HARD_RST               0x0002
 
 #define I2C_DEFAULT_SPEED              100000  /* hz */
 #define MAX_FS_MODE_SPEED              400000
@@ -81,6 +82,7 @@ enum DMA_REGS_OFFSET {
        OFFSET_INT_FLAG = 0x0,
        OFFSET_INT_EN = 0x04,
        OFFSET_EN = 0x08,
+       OFFSET_RST = 0x0c,
        OFFSET_CON = 0x18,
        OFFSET_TX_MEM_ADDR = 0x1c,
        OFFSET_RX_MEM_ADDR = 0x20,
@@ -262,6 +264,10 @@ static void mtk_i2c_init_hw(struct mtk_i2c *i2c)
                      I2C_CONTROL_CLK_EXT_EN | I2C_CONTROL_DMA_EN;
        writew(control_reg, i2c->base + OFFSET_CONTROL);
        writew(I2C_DELAY_LEN, i2c->base + OFFSET_DELAY_LEN);
+
+       writel(I2C_DMA_HARD_RST, i2c->pdmabase + OFFSET_RST);
+       udelay(50);
+       writel(I2C_DMA_CLR_FLAG, i2c->pdmabase + OFFSET_RST);
 }
 
 /*
@@ -551,15 +557,22 @@ static irqreturn_t mtk_i2c_irq(int irqno, void *dev_id)
 {
        struct mtk_i2c *i2c = dev_id;
        u16 restart_flag = 0;
+       u16 intr_stat;
 
        if (i2c->dev_comp->auto_restart)
                restart_flag = I2C_RS_TRANSFER;
 
-       i2c->irq_stat = readw(i2c->base + OFFSET_INTR_STAT);
-       writew(restart_flag | I2C_HS_NACKERR | I2C_ACKERR
-               | I2C_TRANSAC_COMP, i2c->base + OFFSET_INTR_STAT);
+       intr_stat = readw(i2c->base + OFFSET_INTR_STAT);
+       writew(intr_stat, i2c->base + OFFSET_INTR_STAT);
 
-       complete(&i2c->msg_complete);
+       /*
+        * When an ACK error occurs, the I2C controller generates two
+        * interrupts: first the ACK error interrupt, then the transfer
+        * complete interrupt. i2c->irq_stat needs to keep both values.
+        */
+       i2c->irq_stat |= intr_stat;
+       if (i2c->irq_stat & (I2C_TRANSAC_COMP | restart_flag))
+               complete(&i2c->msg_complete);
 
        return IRQ_HANDLED;
 }
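
To make the accumulation above concrete, the sequence on an ACK error looks roughly like this (flag names are the ones used by this driver; the timing is schematic):

    IRQ 1: OFFSET_INTR_STAT = I2C_ACKERR        -> irq_stat = I2C_ACKERR, no completion yet
    IRQ 2: OFFSET_INTR_STAT = I2C_TRANSAC_COMP  -> irq_stat = I2C_ACKERR | I2C_TRANSAC_COMP,
                                                   complete(&i2c->msg_complete)

so the code that inspects i2c->irq_stat after the completion still sees the ACK error delivered by the earlier interrupt, instead of having it overwritten as before.
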
index fc9bf7f30e355dfadfcadd4f7d8f187f818e9566..08d26ba61ed3326a8905c99cdb6ee8caaef45727 100644 (file)
@@ -270,35 +270,35 @@ static const u8 reg_map_ip_v2[] = {
        [OMAP_I2C_IP_V2_IRQENABLE_CLR] = 0x30,
 };
 
-static inline void omap_i2c_write_reg(struct omap_i2c_dev *i2c_dev,
+static inline void omap_i2c_write_reg(struct omap_i2c_dev *omap,
                                      int reg, u16 val)
 {
-       writew_relaxed(val, i2c_dev->base +
-                       (i2c_dev->regs[reg] << i2c_dev->reg_shift));
+       writew_relaxed(val, omap->base +
+                       (omap->regs[reg] << omap->reg_shift));
 }
 
-static inline u16 omap_i2c_read_reg(struct omap_i2c_dev *i2c_dev, int reg)
+static inline u16 omap_i2c_read_reg(struct omap_i2c_dev *omap, int reg)
 {
-       return readw_relaxed(i2c_dev->base +
-                               (i2c_dev->regs[reg] << i2c_dev->reg_shift));
+       return readw_relaxed(omap->base +
+                               (omap->regs[reg] << omap->reg_shift));
 }
 
-static void __omap_i2c_init(struct omap_i2c_dev *dev)
+static void __omap_i2c_init(struct omap_i2c_dev *omap)
 {
 
-       omap_i2c_write_reg(dev, OMAP_I2C_CON_REG, 0);
+       omap_i2c_write_reg(omap, OMAP_I2C_CON_REG, 0);
 
        /* Setup clock prescaler to obtain approx 12MHz I2C module clock: */
-       omap_i2c_write_reg(dev, OMAP_I2C_PSC_REG, dev->pscstate);
+       omap_i2c_write_reg(omap, OMAP_I2C_PSC_REG, omap->pscstate);
 
        /* SCL low and high time values */
-       omap_i2c_write_reg(dev, OMAP_I2C_SCLL_REG, dev->scllstate);
-       omap_i2c_write_reg(dev, OMAP_I2C_SCLH_REG, dev->sclhstate);
-       if (dev->rev >= OMAP_I2C_REV_ON_3430_3530)
-               omap_i2c_write_reg(dev, OMAP_I2C_WE_REG, dev->westate);
+       omap_i2c_write_reg(omap, OMAP_I2C_SCLL_REG, omap->scllstate);
+       omap_i2c_write_reg(omap, OMAP_I2C_SCLH_REG, omap->sclhstate);
+       if (omap->rev >= OMAP_I2C_REV_ON_3430_3530)
+               omap_i2c_write_reg(omap, OMAP_I2C_WE_REG, omap->westate);
 
        /* Take the I2C module out of reset: */
-       omap_i2c_write_reg(dev, OMAP_I2C_CON_REG, OMAP_I2C_CON_EN);
+       omap_i2c_write_reg(omap, OMAP_I2C_CON_REG, OMAP_I2C_CON_EN);
 
        /*
         * NOTE: right after setting CON_EN, STAT_BB could be 0 while the
@@ -310,32 +310,32 @@ static void __omap_i2c_init(struct omap_i2c_dev *dev)
         * Don't write to this register if the IE state is 0 as it can
         * cause deadlock.
         */
-       if (dev->iestate)
-               omap_i2c_write_reg(dev, OMAP_I2C_IE_REG, dev->iestate);
+       if (omap->iestate)
+               omap_i2c_write_reg(omap, OMAP_I2C_IE_REG, omap->iestate);
 }
 
-static int omap_i2c_reset(struct omap_i2c_dev *dev)
+static int omap_i2c_reset(struct omap_i2c_dev *omap)
 {
        unsigned long timeout;
        u16 sysc;
 
-       if (dev->rev >= OMAP_I2C_OMAP1_REV_2) {
-               sysc = omap_i2c_read_reg(dev, OMAP_I2C_SYSC_REG);
+       if (omap->rev >= OMAP_I2C_OMAP1_REV_2) {
+               sysc = omap_i2c_read_reg(omap, OMAP_I2C_SYSC_REG);
 
                /* Disable I2C controller before soft reset */
-               omap_i2c_write_reg(dev, OMAP_I2C_CON_REG,
-                       omap_i2c_read_reg(dev, OMAP_I2C_CON_REG) &
+               omap_i2c_write_reg(omap, OMAP_I2C_CON_REG,
+                       omap_i2c_read_reg(omap, OMAP_I2C_CON_REG) &
                                ~(OMAP_I2C_CON_EN));
 
-               omap_i2c_write_reg(dev, OMAP_I2C_SYSC_REG, SYSC_SOFTRESET_MASK);
+               omap_i2c_write_reg(omap, OMAP_I2C_SYSC_REG, SYSC_SOFTRESET_MASK);
                /* For some reason we need to set the EN bit before the
                 * reset done bit gets set. */
                timeout = jiffies + OMAP_I2C_TIMEOUT;
-               omap_i2c_write_reg(dev, OMAP_I2C_CON_REG, OMAP_I2C_CON_EN);
-               while (!(omap_i2c_read_reg(dev, OMAP_I2C_SYSS_REG) &
+               omap_i2c_write_reg(omap, OMAP_I2C_CON_REG, OMAP_I2C_CON_EN);
+               while (!(omap_i2c_read_reg(omap, OMAP_I2C_SYSS_REG) &
                         SYSS_RESETDONE_MASK)) {
                        if (time_after(jiffies, timeout)) {
-                               dev_warn(dev->dev, "timeout waiting "
+                               dev_warn(omap->dev, "timeout waiting "
                                                "for controller reset\n");
                                return -ETIMEDOUT;
                        }
@@ -343,18 +343,18 @@ static int omap_i2c_reset(struct omap_i2c_dev *dev)
                }
 
                /* SYSC register is cleared by the reset; rewrite it */
-               omap_i2c_write_reg(dev, OMAP_I2C_SYSC_REG, sysc);
+               omap_i2c_write_reg(omap, OMAP_I2C_SYSC_REG, sysc);
 
-               if (dev->rev > OMAP_I2C_REV_ON_3430_3530) {
+               if (omap->rev > OMAP_I2C_REV_ON_3430_3530) {
                        /* Schedule I2C-bus monitoring on the next transfer */
-                       dev->bb_valid = 0;
+                       omap->bb_valid = 0;
                }
        }
 
        return 0;
 }
 
-static int omap_i2c_init(struct omap_i2c_dev *dev)
+static int omap_i2c_init(struct omap_i2c_dev *omap)
 {
        u16 psc = 0, scll = 0, sclh = 0;
        u16 fsscll = 0, fssclh = 0, hsscll = 0, hssclh = 0;
@@ -362,23 +362,23 @@ static int omap_i2c_init(struct omap_i2c_dev *dev)
        unsigned long internal_clk = 0;
        struct clk *fclk;
 
-       if (dev->rev >= OMAP_I2C_REV_ON_3430_3530) {
+       if (omap->rev >= OMAP_I2C_REV_ON_3430_3530) {
                /*
                 * Enabling all wakeup sources to stop I2C freezing on
                 * WFI instruction.
                 * REVISIT: Some wkup sources might not be needed.
                 */
-               dev->westate = OMAP_I2C_WE_ALL;
+               omap->westate = OMAP_I2C_WE_ALL;
        }
 
-       if (dev->flags & OMAP_I2C_FLAG_ALWAYS_ARMXOR_CLK) {
+       if (omap->flags & OMAP_I2C_FLAG_ALWAYS_ARMXOR_CLK) {
                /*
                 * The I2C functional clock is the armxor_ck, so there's
                 * no need to get "armxor_ck" separately.  Now, if OMAP2420
                 * always returns 12MHz for the functional clock, we can
                 * do this bit unconditionally.
                 */
-               fclk = clk_get(dev->dev, "fck");
+               fclk = clk_get(omap->dev, "fck");
                fclk_rate = clk_get_rate(fclk);
                clk_put(fclk);
 
@@ -395,7 +395,7 @@ static int omap_i2c_init(struct omap_i2c_dev *dev)
                        psc = fclk_rate / 12000000;
        }
 
-       if (!(dev->flags & OMAP_I2C_FLAG_SIMPLE_CLOCK)) {
+       if (!(omap->flags & OMAP_I2C_FLAG_SIMPLE_CLOCK)) {
 
                /*
                 * HSI2C controller internal clk rate should be 19.2 Mhz for
@@ -403,14 +403,14 @@ static int omap_i2c_init(struct omap_i2c_dev *dev)
                 * to get longer filter period for better noise suppression.
                 * The filter is iclk (fclk for HS) period.
                 */
-               if (dev->speed > 400 ||
-                              dev->flags & OMAP_I2C_FLAG_FORCE_19200_INT_CLK)
+               if (omap->speed > 400 ||
+                              omap->flags & OMAP_I2C_FLAG_FORCE_19200_INT_CLK)
                        internal_clk = 19200;
-               else if (dev->speed > 100)
+               else if (omap->speed > 100)
                        internal_clk = 9600;
                else
                        internal_clk = 4000;
-               fclk = clk_get(dev->dev, "fck");
+               fclk = clk_get(omap->dev, "fck");
                fclk_rate = clk_get_rate(fclk) / 1000;
                clk_put(fclk);
 
@@ -419,7 +419,7 @@ static int omap_i2c_init(struct omap_i2c_dev *dev)
                psc = psc - 1;
 
                /* If configured for High Speed */
-               if (dev->speed > 400) {
+               if (omap->speed > 400) {
                        unsigned long scl;
 
                        /* For first phase of HS mode */
@@ -428,20 +428,20 @@ static int omap_i2c_init(struct omap_i2c_dev *dev)
                        fssclh = (scl / 3) - 5;
 
                        /* For second phase of HS mode */
-                       scl = fclk_rate / dev->speed;
+                       scl = fclk_rate / omap->speed;
                        hsscll = scl - (scl / 3) - 7;
                        hssclh = (scl / 3) - 5;
-               } else if (dev->speed > 100) {
+               } else if (omap->speed > 100) {
                        unsigned long scl;
 
                        /* Fast mode */
-                       scl = internal_clk / dev->speed;
+                       scl = internal_clk / omap->speed;
                        fsscll = scl - (scl / 3) - 7;
                        fssclh = (scl / 3) - 5;
                } else {
                        /* Standard mode */
-                       fsscll = internal_clk / (dev->speed * 2) - 7;
-                       fssclh = internal_clk / (dev->speed * 2) - 5;
+                       fsscll = internal_clk / (omap->speed * 2) - 7;
+                       fssclh = internal_clk / (omap->speed * 2) - 5;
                }
                scll = (hsscll << OMAP_I2C_SCLL_HSSCLL) | fsscll;
                sclh = (hssclh << OMAP_I2C_SCLH_HSSCLH) | fssclh;
@@ -450,25 +450,25 @@ static int omap_i2c_init(struct omap_i2c_dev *dev)
                fclk_rate /= (psc + 1) * 1000;
                if (psc > 2)
                        psc = 2;
-               scll = fclk_rate / (dev->speed * 2) - 7 + psc;
-               sclh = fclk_rate / (dev->speed * 2) - 7 + psc;
+               scll = fclk_rate / (omap->speed * 2) - 7 + psc;
+               sclh = fclk_rate / (omap->speed * 2) - 7 + psc;
        }
 
-       dev->iestate = (OMAP_I2C_IE_XRDY | OMAP_I2C_IE_RRDY |
+       omap->iestate = (OMAP_I2C_IE_XRDY | OMAP_I2C_IE_RRDY |
                        OMAP_I2C_IE_ARDY | OMAP_I2C_IE_NACK |
-                       OMAP_I2C_IE_AL)  | ((dev->fifo_size) ?
+                       OMAP_I2C_IE_AL)  | ((omap->fifo_size) ?
                                (OMAP_I2C_IE_RDR | OMAP_I2C_IE_XDR) : 0);
 
-       dev->pscstate = psc;
-       dev->scllstate = scll;
-       dev->sclhstate = sclh;
+       omap->pscstate = psc;
+       omap->scllstate = scll;
+       omap->sclhstate = sclh;
 
-       if (dev->rev <= OMAP_I2C_REV_ON_3430_3530) {
+       if (omap->rev <= OMAP_I2C_REV_ON_3430_3530) {
                /* Not implemented */
-               dev->bb_valid = 1;
+               omap->bb_valid = 1;
        }
 
-       __omap_i2c_init(dev);
+       __omap_i2c_init(omap);
 
        return 0;
 }
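
As a quick sanity check of the standard-mode branch above, assume a 100 kHz bus (omap->speed = 100), which selects internal_clk = 4000 (kHz):

    fsscll = internal_clk / (speed * 2) - 7 = 4000 / 200 - 7 = 13
    fssclh = internal_clk / (speed * 2) - 5 = 4000 / 200 - 5 = 15

hsscll and hssclh stay zero, so scll = 13 and sclh = 15 are the values __omap_i2c_init() later writes to OMAP_I2C_SCLL_REG and OMAP_I2C_SCLH_REG.
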
@@ -476,14 +476,14 @@ static int omap_i2c_init(struct omap_i2c_dev *dev)
 /*
  * Waiting on Bus Busy
  */
-static int omap_i2c_wait_for_bb(struct omap_i2c_dev *dev)
+static int omap_i2c_wait_for_bb(struct omap_i2c_dev *omap)
 {
        unsigned long timeout;
 
        timeout = jiffies + OMAP_I2C_TIMEOUT;
-       while (omap_i2c_read_reg(dev, OMAP_I2C_STAT_REG) & OMAP_I2C_STAT_BB) {
+       while (omap_i2c_read_reg(omap, OMAP_I2C_STAT_REG) & OMAP_I2C_STAT_BB) {
                if (time_after(jiffies, timeout))
-                       return i2c_recover_bus(&dev->adapter);
+                       return i2c_recover_bus(&omap->adapter);
                msleep(1);
        }
 
@@ -518,19 +518,19 @@ static int omap_i2c_wait_for_bb(struct omap_i2c_dev *dev)
  * 3. Any transfer started in the middle of another master's transfer
  *    results in unpredictable results and data corruption
  */
-static int omap_i2c_wait_for_bb_valid(struct omap_i2c_dev *dev)
+static int omap_i2c_wait_for_bb_valid(struct omap_i2c_dev *omap)
 {
        unsigned long bus_free_timeout = 0;
        unsigned long timeout;
        int bus_free = 0;
        u16 stat, systest;
 
-       if (dev->bb_valid)
+       if (omap->bb_valid)
                return 0;
 
        timeout = jiffies + OMAP_I2C_TIMEOUT;
        while (1) {
-               stat = omap_i2c_read_reg(dev, OMAP_I2C_STAT_REG);
+               stat = omap_i2c_read_reg(omap, OMAP_I2C_STAT_REG);
                /*
                 * We will see a BB or BF event in case the IP has detected any
                 * activity on the I2C bus. Now the IP correctly tracks the bus
@@ -543,7 +543,7 @@ static int omap_i2c_wait_for_bb_valid(struct omap_i2c_dev *dev)
                 * Otherwise, we must look at the signals on the bus to make
                 * the right decision.
                 */
-               systest = omap_i2c_read_reg(dev, OMAP_I2C_SYSTEST_REG);
+               systest = omap_i2c_read_reg(omap, OMAP_I2C_SYSTEST_REG);
                if ((systest & OMAP_I2C_SYSTEST_SCL_I_FUNC) &&
                    (systest & OMAP_I2C_SYSTEST_SDA_I_FUNC)) {
                        if (!bus_free) {
@@ -564,22 +564,22 @@ static int omap_i2c_wait_for_bb_valid(struct omap_i2c_dev *dev)
                }
 
                if (time_after(jiffies, timeout)) {
-                       dev_warn(dev->dev, "timeout waiting for bus ready\n");
+                       dev_warn(omap->dev, "timeout waiting for bus ready\n");
                        return -ETIMEDOUT;
                }
 
                msleep(1);
        }
 
-       dev->bb_valid = 1;
+       omap->bb_valid = 1;
        return 0;
 }
 
-static void omap_i2c_resize_fifo(struct omap_i2c_dev *dev, u8 size, bool is_rx)
+static void omap_i2c_resize_fifo(struct omap_i2c_dev *omap, u8 size, bool is_rx)
 {
        u16             buf;
 
-       if (dev->flags & OMAP_I2C_FLAG_NO_FIFO)
+       if (omap->flags & OMAP_I2C_FLAG_NO_FIFO)
                return;
 
        /*
@@ -589,29 +589,29 @@ static void omap_i2c_resize_fifo(struct omap_i2c_dev *dev, u8 size, bool is_rx)
         * then we might use draining feature to transfer the remaining bytes.
         */
 
-       dev->threshold = clamp(size, (u8) 1, dev->fifo_size);
+       omap->threshold = clamp(size, (u8) 1, omap->fifo_size);
 
-       buf = omap_i2c_read_reg(dev, OMAP_I2C_BUF_REG);
+       buf = omap_i2c_read_reg(omap, OMAP_I2C_BUF_REG);
 
        if (is_rx) {
                /* Clear RX Threshold */
                buf &= ~(0x3f << 8);
-               buf |= ((dev->threshold - 1) << 8) | OMAP_I2C_BUF_RXFIF_CLR;
+               buf |= ((omap->threshold - 1) << 8) | OMAP_I2C_BUF_RXFIF_CLR;
        } else {
                /* Clear TX Threshold */
                buf &= ~0x3f;
-               buf |= (dev->threshold - 1) | OMAP_I2C_BUF_TXFIF_CLR;
+               buf |= (omap->threshold - 1) | OMAP_I2C_BUF_TXFIF_CLR;
        }
 
-       omap_i2c_write_reg(dev, OMAP_I2C_BUF_REG, buf);
+       omap_i2c_write_reg(omap, OMAP_I2C_BUF_REG, buf);
 
-       if (dev->rev < OMAP_I2C_REV_ON_3630)
-               dev->b_hw = 1; /* Enable hardware fixes */
+       if (omap->rev < OMAP_I2C_REV_ON_3630)
+               omap->b_hw = 1; /* Enable hardware fixes */
 
        /* calculate wakeup latency constraint for MPU */
-       if (dev->set_mpu_wkup_lat != NULL)
-               dev->latency = (1000000 * dev->threshold) /
-                       (1000 * dev->speed / 8);
+       if (omap->set_mpu_wkup_lat != NULL)
+               omap->latency = (1000000 * omap->threshold) /
+                       (1000 * omap->speed / 8);
 }
 
 /*
@@ -620,42 +620,42 @@ static void omap_i2c_resize_fifo(struct omap_i2c_dev *dev, u8 size, bool is_rx)
 static int omap_i2c_xfer_msg(struct i2c_adapter *adap,
                             struct i2c_msg *msg, int stop)
 {
-       struct omap_i2c_dev *dev = i2c_get_adapdata(adap);
+       struct omap_i2c_dev *omap = i2c_get_adapdata(adap);
        unsigned long timeout;
        u16 w;
 
-       dev_dbg(dev->dev, "addr: 0x%04x, len: %d, flags: 0x%x, stop: %d\n",
+       dev_dbg(omap->dev, "addr: 0x%04x, len: %d, flags: 0x%x, stop: %d\n",
                msg->addr, msg->len, msg->flags, stop);
 
        if (msg->len == 0)
                return -EINVAL;
 
-       dev->receiver = !!(msg->flags & I2C_M_RD);
-       omap_i2c_resize_fifo(dev, msg->len, dev->receiver);
+       omap->receiver = !!(msg->flags & I2C_M_RD);
+       omap_i2c_resize_fifo(omap, msg->len, omap->receiver);
 
-       omap_i2c_write_reg(dev, OMAP_I2C_SA_REG, msg->addr);
+       omap_i2c_write_reg(omap, OMAP_I2C_SA_REG, msg->addr);
 
        /* REVISIT: Could the STB bit of I2C_CON be used with probing? */
-       dev->buf = msg->buf;
-       dev->buf_len = msg->len;
+       omap->buf = msg->buf;
+       omap->buf_len = msg->len;
 
-       /* make sure writes to dev->buf_len are ordered */
+       /* make sure writes to omap->buf_len are ordered */
        barrier();
 
-       omap_i2c_write_reg(dev, OMAP_I2C_CNT_REG, dev->buf_len);
+       omap_i2c_write_reg(omap, OMAP_I2C_CNT_REG, omap->buf_len);
 
        /* Clear the FIFO Buffers */
-       w = omap_i2c_read_reg(dev, OMAP_I2C_BUF_REG);
+       w = omap_i2c_read_reg(omap, OMAP_I2C_BUF_REG);
        w |= OMAP_I2C_BUF_RXFIF_CLR | OMAP_I2C_BUF_TXFIF_CLR;
-       omap_i2c_write_reg(dev, OMAP_I2C_BUF_REG, w);
+       omap_i2c_write_reg(omap, OMAP_I2C_BUF_REG, w);
 
-       reinit_completion(&dev->cmd_complete);
-       dev->cmd_err = 0;
+       reinit_completion(&omap->cmd_complete);
+       omap->cmd_err = 0;
 
        w = OMAP_I2C_CON_EN | OMAP_I2C_CON_MST | OMAP_I2C_CON_STT;
 
        /* High speed configuration */
-       if (dev->speed > 400)
+       if (omap->speed > 400)
                w |= OMAP_I2C_CON_OPMODE_HS;
 
        if (msg->flags & I2C_M_STOP)
@@ -665,27 +665,27 @@ static int omap_i2c_xfer_msg(struct i2c_adapter *adap,
        if (!(msg->flags & I2C_M_RD))
                w |= OMAP_I2C_CON_TRX;
 
-       if (!dev->b_hw && stop)
+       if (!omap->b_hw && stop)
                w |= OMAP_I2C_CON_STP;
        /*
         * NOTE: STAT_BB bit could become 1 here if another master occupies
         * the bus. The IP successfully completes the transfer once the bus is
         * free again (BB reset to 0).
         */
-       omap_i2c_write_reg(dev, OMAP_I2C_CON_REG, w);
+       omap_i2c_write_reg(omap, OMAP_I2C_CON_REG, w);
 
        /*
         * Don't write stt and stp together on some hardware.
         */
-       if (dev->b_hw && stop) {
+       if (omap->b_hw && stop) {
                unsigned long delay = jiffies + OMAP_I2C_TIMEOUT;
-               u16 con = omap_i2c_read_reg(dev, OMAP_I2C_CON_REG);
+               u16 con = omap_i2c_read_reg(omap, OMAP_I2C_CON_REG);
                while (con & OMAP_I2C_CON_STT) {
-                       con = omap_i2c_read_reg(dev, OMAP_I2C_CON_REG);
+                       con = omap_i2c_read_reg(omap, OMAP_I2C_CON_REG);
 
                        /* Let the user know if i2c is in a bad state */
                        if (time_after(jiffies, delay)) {
-                               dev_err(dev->dev, "controller timed out "
+                               dev_err(omap->dev, "controller timed out "
                                "waiting for start condition to finish\n");
                                return -ETIMEDOUT;
                        }
@@ -694,42 +694,42 @@ static int omap_i2c_xfer_msg(struct i2c_adapter *adap,
 
                w |= OMAP_I2C_CON_STP;
                w &= ~OMAP_I2C_CON_STT;
-               omap_i2c_write_reg(dev, OMAP_I2C_CON_REG, w);
+               omap_i2c_write_reg(omap, OMAP_I2C_CON_REG, w);
        }
 
        /*
         * REVISIT: We should abort the transfer on signals, but the bus goes
         * into arbitration and we're currently unable to recover from it.
         */
-       timeout = wait_for_completion_timeout(&dev->cmd_complete,
+       timeout = wait_for_completion_timeout(&omap->cmd_complete,
                                                OMAP_I2C_TIMEOUT);
        if (timeout == 0) {
-               dev_err(dev->dev, "controller timed out\n");
-               omap_i2c_reset(dev);
-               __omap_i2c_init(dev);
+               dev_err(omap->dev, "controller timed out\n");
+               omap_i2c_reset(omap);
+               __omap_i2c_init(omap);
                return -ETIMEDOUT;
        }
 
-       if (likely(!dev->cmd_err))
+       if (likely(!omap->cmd_err))
                return 0;
 
        /* We have an error */
-       if (dev->cmd_err & (OMAP_I2C_STAT_ROVR | OMAP_I2C_STAT_XUDF)) {
-               omap_i2c_reset(dev);
-               __omap_i2c_init(dev);
+       if (omap->cmd_err & (OMAP_I2C_STAT_ROVR | OMAP_I2C_STAT_XUDF)) {
+               omap_i2c_reset(omap);
+               __omap_i2c_init(omap);
                return -EIO;
        }
 
-       if (dev->cmd_err & OMAP_I2C_STAT_AL)
+       if (omap->cmd_err & OMAP_I2C_STAT_AL)
                return -EAGAIN;
 
-       if (dev->cmd_err & OMAP_I2C_STAT_NACK) {
+       if (omap->cmd_err & OMAP_I2C_STAT_NACK) {
                if (msg->flags & I2C_M_IGNORE_NAK)
                        return 0;
 
-               w = omap_i2c_read_reg(dev, OMAP_I2C_CON_REG);
+               w = omap_i2c_read_reg(omap, OMAP_I2C_CON_REG);
                w |= OMAP_I2C_CON_STP;
-               omap_i2c_write_reg(dev, OMAP_I2C_CON_REG, w);
+               omap_i2c_write_reg(omap, OMAP_I2C_CON_REG, w);
                return -EREMOTEIO;
        }
        return -EIO;
@@ -743,24 +743,24 @@ static int omap_i2c_xfer_msg(struct i2c_adapter *adap,
 static int
 omap_i2c_xfer(struct i2c_adapter *adap, struct i2c_msg msgs[], int num)
 {
-       struct omap_i2c_dev *dev = i2c_get_adapdata(adap);
+       struct omap_i2c_dev *omap = i2c_get_adapdata(adap);
        int i;
        int r;
 
-       r = pm_runtime_get_sync(dev->dev);
+       r = pm_runtime_get_sync(omap->dev);
        if (r < 0)
                goto out;
 
-       r = omap_i2c_wait_for_bb_valid(dev);
+       r = omap_i2c_wait_for_bb_valid(omap);
        if (r < 0)
                goto out;
 
-       r = omap_i2c_wait_for_bb(dev);
+       r = omap_i2c_wait_for_bb(omap);
        if (r < 0)
                goto out;
 
-       if (dev->set_mpu_wkup_lat != NULL)
-               dev->set_mpu_wkup_lat(dev->dev, dev->latency);
+       if (omap->set_mpu_wkup_lat != NULL)
+               omap->set_mpu_wkup_lat(omap->dev, omap->latency);
 
        for (i = 0; i < num; i++) {
                r = omap_i2c_xfer_msg(adap, &msgs[i], (i == (num - 1)));
@@ -771,14 +771,14 @@ omap_i2c_xfer(struct i2c_adapter *adap, struct i2c_msg msgs[], int num)
        if (r == 0)
                r = num;
 
-       omap_i2c_wait_for_bb(dev);
+       omap_i2c_wait_for_bb(omap);
 
-       if (dev->set_mpu_wkup_lat != NULL)
-               dev->set_mpu_wkup_lat(dev->dev, -1);
+       if (omap->set_mpu_wkup_lat != NULL)
+               omap->set_mpu_wkup_lat(omap->dev, -1);
 
 out:
-       pm_runtime_mark_last_busy(dev->dev);
-       pm_runtime_put_autosuspend(dev->dev);
+       pm_runtime_mark_last_busy(omap->dev);
+       pm_runtime_put_autosuspend(omap->dev);
        return r;
 }
 
@@ -790,19 +790,19 @@ omap_i2c_func(struct i2c_adapter *adap)
 }
 
 static inline void
-omap_i2c_complete_cmd(struct omap_i2c_dev *dev, u16 err)
+omap_i2c_complete_cmd(struct omap_i2c_dev *omap, u16 err)
 {
-       dev->cmd_err |= err;
-       complete(&dev->cmd_complete);
+       omap->cmd_err |= err;
+       complete(&omap->cmd_complete);
 }
 
 static inline void
-omap_i2c_ack_stat(struct omap_i2c_dev *dev, u16 stat)
+omap_i2c_ack_stat(struct omap_i2c_dev *omap, u16 stat)
 {
-       omap_i2c_write_reg(dev, OMAP_I2C_STAT_REG, stat);
+       omap_i2c_write_reg(omap, OMAP_I2C_STAT_REG, stat);
 }
 
-static inline void i2c_omap_errata_i207(struct omap_i2c_dev *dev, u16 stat)
+static inline void i2c_omap_errata_i207(struct omap_i2c_dev *omap, u16 stat)
 {
        /*
         * I2C Errata(Errata Nos. OMAP2: 1.67, OMAP3: 1.8)
@@ -813,17 +813,17 @@ static inline void i2c_omap_errata_i207(struct omap_i2c_dev *dev, u16 stat)
         */
        if (stat & OMAP_I2C_STAT_RDR) {
                /* Step 1: If RDR is set, clear it */
-               omap_i2c_ack_stat(dev, OMAP_I2C_STAT_RDR);
+               omap_i2c_ack_stat(omap, OMAP_I2C_STAT_RDR);
 
                /* Step 2: */
-               if (!(omap_i2c_read_reg(dev, OMAP_I2C_STAT_REG)
+               if (!(omap_i2c_read_reg(omap, OMAP_I2C_STAT_REG)
                                                & OMAP_I2C_STAT_BB)) {
 
                        /* Step 3: */
-                       if (omap_i2c_read_reg(dev, OMAP_I2C_STAT_REG)
+                       if (omap_i2c_read_reg(omap, OMAP_I2C_STAT_REG)
                                                & OMAP_I2C_STAT_RDR) {
-                               omap_i2c_ack_stat(dev, OMAP_I2C_STAT_RDR);
-                               dev_dbg(dev->dev, "RDR when bus is busy.\n");
+                               omap_i2c_ack_stat(omap, OMAP_I2C_STAT_RDR);
+                               dev_dbg(omap->dev, "RDR when bus is busy.\n");
                        }
 
                }
@@ -836,50 +836,50 @@ static inline void i2c_omap_errata_i207(struct omap_i2c_dev *dev, u16 stat)
 static irqreturn_t
 omap_i2c_omap1_isr(int this_irq, void *dev_id)
 {
-       struct omap_i2c_dev *dev = dev_id;
+       struct omap_i2c_dev *omap = dev_id;
        u16 iv, w;
 
-       if (pm_runtime_suspended(dev->dev))
+       if (pm_runtime_suspended(omap->dev))
                return IRQ_NONE;
 
-       iv = omap_i2c_read_reg(dev, OMAP_I2C_IV_REG);
+       iv = omap_i2c_read_reg(omap, OMAP_I2C_IV_REG);
        switch (iv) {
        case 0x00:      /* None */
                break;
        case 0x01:      /* Arbitration lost */
-               dev_err(dev->dev, "Arbitration lost\n");
-               omap_i2c_complete_cmd(dev, OMAP_I2C_STAT_AL);
+               dev_err(omap->dev, "Arbitration lost\n");
+               omap_i2c_complete_cmd(omap, OMAP_I2C_STAT_AL);
                break;
        case 0x02:      /* No acknowledgement */
-               omap_i2c_complete_cmd(dev, OMAP_I2C_STAT_NACK);
-               omap_i2c_write_reg(dev, OMAP_I2C_CON_REG, OMAP_I2C_CON_STP);
+               omap_i2c_complete_cmd(omap, OMAP_I2C_STAT_NACK);
+               omap_i2c_write_reg(omap, OMAP_I2C_CON_REG, OMAP_I2C_CON_STP);
                break;
        case 0x03:      /* Register access ready */
-               omap_i2c_complete_cmd(dev, 0);
+               omap_i2c_complete_cmd(omap, 0);
                break;
        case 0x04:      /* Receive data ready */
-               if (dev->buf_len) {
-                       w = omap_i2c_read_reg(dev, OMAP_I2C_DATA_REG);
-                       *dev->buf++ = w;
-                       dev->buf_len--;
-                       if (dev->buf_len) {
-                               *dev->buf++ = w >> 8;
-                               dev->buf_len--;
+               if (omap->buf_len) {
+                       w = omap_i2c_read_reg(omap, OMAP_I2C_DATA_REG);
+                       *omap->buf++ = w;
+                       omap->buf_len--;
+                       if (omap->buf_len) {
+                               *omap->buf++ = w >> 8;
+                               omap->buf_len--;
                        }
                } else
-                       dev_err(dev->dev, "RRDY IRQ while no data requested\n");
+                       dev_err(omap->dev, "RRDY IRQ while no data requested\n");
                break;
        case 0x05:      /* Transmit data ready */
-               if (dev->buf_len) {
-                       w = *dev->buf++;
-                       dev->buf_len--;
-                       if (dev->buf_len) {
-                               w |= *dev->buf++ << 8;
-                               dev->buf_len--;
+               if (omap->buf_len) {
+                       w = *omap->buf++;
+                       omap->buf_len--;
+                       if (omap->buf_len) {
+                               w |= *omap->buf++ << 8;
+                               omap->buf_len--;
                        }
-                       omap_i2c_write_reg(dev, OMAP_I2C_DATA_REG, w);
+                       omap_i2c_write_reg(omap, OMAP_I2C_DATA_REG, w);
                } else
-                       dev_err(dev->dev, "XRDY IRQ while no data to send\n");
+                       dev_err(omap->dev, "XRDY IRQ while no data to send\n");
                break;
        default:
                return IRQ_NONE;
@@ -896,28 +896,28 @@ omap_i2c_omap1_isr(int this_irq, void *dev_id)
  * data to DATA_REG. Otherwise some data bytes can be lost while transferring
  * them from the memory to the I2C interface.
  */
-static int errata_omap3_i462(struct omap_i2c_dev *dev)
+static int errata_omap3_i462(struct omap_i2c_dev *omap)
 {
        unsigned long timeout = 10000;
        u16 stat;
 
        do {
-               stat = omap_i2c_read_reg(dev, OMAP_I2C_STAT_REG);
+               stat = omap_i2c_read_reg(omap, OMAP_I2C_STAT_REG);
                if (stat & OMAP_I2C_STAT_XUDF)
                        break;
 
                if (stat & (OMAP_I2C_STAT_NACK | OMAP_I2C_STAT_AL)) {
-                       omap_i2c_ack_stat(dev, (OMAP_I2C_STAT_XRDY |
+                       omap_i2c_ack_stat(omap, (OMAP_I2C_STAT_XRDY |
                                                        OMAP_I2C_STAT_XDR));
                        if (stat & OMAP_I2C_STAT_NACK) {
-                               dev->cmd_err |= OMAP_I2C_STAT_NACK;
-                               omap_i2c_ack_stat(dev, OMAP_I2C_STAT_NACK);
+                               omap->cmd_err |= OMAP_I2C_STAT_NACK;
+                               omap_i2c_ack_stat(omap, OMAP_I2C_STAT_NACK);
                        }
 
                        if (stat & OMAP_I2C_STAT_AL) {
-                               dev_err(dev->dev, "Arbitration lost\n");
-                               dev->cmd_err |= OMAP_I2C_STAT_AL;
-                               omap_i2c_ack_stat(dev, OMAP_I2C_STAT_AL);
+                               dev_err(omap->dev, "Arbitration lost\n");
+                               omap->cmd_err |= OMAP_I2C_STAT_AL;
+                               omap_i2c_ack_stat(omap, OMAP_I2C_STAT_AL);
                        }
 
                        return -EIO;
@@ -927,61 +927,61 @@ static int errata_omap3_i462(struct omap_i2c_dev *dev)
        } while (--timeout);
 
        if (!timeout) {
-               dev_err(dev->dev, "timeout waiting on XUDF bit\n");
+               dev_err(omap->dev, "timeout waiting on XUDF bit\n");
                return 0;
        }
 
        return 0;
 }
 
-static void omap_i2c_receive_data(struct omap_i2c_dev *dev, u8 num_bytes,
+static void omap_i2c_receive_data(struct omap_i2c_dev *omap, u8 num_bytes,
                bool is_rdr)
 {
        u16             w;
 
        while (num_bytes--) {
-               w = omap_i2c_read_reg(dev, OMAP_I2C_DATA_REG);
-               *dev->buf++ = w;
-               dev->buf_len--;
+               w = omap_i2c_read_reg(omap, OMAP_I2C_DATA_REG);
+               *omap->buf++ = w;
+               omap->buf_len--;
 
                /*
                 * Data reg in 2430, omap3 and
                 * omap4 is 8 bit wide
                 */
-               if (dev->flags & OMAP_I2C_FLAG_16BIT_DATA_REG) {
-                       *dev->buf++ = w >> 8;
-                       dev->buf_len--;
+               if (omap->flags & OMAP_I2C_FLAG_16BIT_DATA_REG) {
+                       *omap->buf++ = w >> 8;
+                       omap->buf_len--;
                }
        }
 }
 
-static int omap_i2c_transmit_data(struct omap_i2c_dev *dev, u8 num_bytes,
+static int omap_i2c_transmit_data(struct omap_i2c_dev *omap, u8 num_bytes,
                bool is_xdr)
 {
        u16             w;
 
        while (num_bytes--) {
-               w = *dev->buf++;
-               dev->buf_len--;
+               w = *omap->buf++;
+               omap->buf_len--;
 
                /*
                 * Data reg in 2430, omap3 and
                 * omap4 is 8 bit wide
                 */
-               if (dev->flags & OMAP_I2C_FLAG_16BIT_DATA_REG) {
-                       w |= *dev->buf++ << 8;
-                       dev->buf_len--;
+               if (omap->flags & OMAP_I2C_FLAG_16BIT_DATA_REG) {
+                       w |= *omap->buf++ << 8;
+                       omap->buf_len--;
                }
 
-               if (dev->errata & I2C_OMAP_ERRATA_I462) {
+               if (omap->errata & I2C_OMAP_ERRATA_I462) {
                        int ret;
 
-                       ret = errata_omap3_i462(dev);
+                       ret = errata_omap3_i462(omap);
                        if (ret < 0)
                                return ret;
                }
 
-               omap_i2c_write_reg(dev, OMAP_I2C_DATA_REG, w);
+               omap_i2c_write_reg(omap, OMAP_I2C_DATA_REG, w);
        }
 
        return 0;
@@ -990,19 +990,19 @@ static int omap_i2c_transmit_data(struct omap_i2c_dev *dev, u8 num_bytes,
 static irqreturn_t
 omap_i2c_isr(int irq, void *dev_id)
 {
-       struct omap_i2c_dev *dev = dev_id;
+       struct omap_i2c_dev *omap = dev_id;
        irqreturn_t ret = IRQ_HANDLED;
        u16 mask;
        u16 stat;
 
-       spin_lock(&dev->lock);
-       mask = omap_i2c_read_reg(dev, OMAP_I2C_IE_REG);
-       stat = omap_i2c_read_reg(dev, OMAP_I2C_STAT_REG);
+       spin_lock(&omap->lock);
+       mask = omap_i2c_read_reg(omap, OMAP_I2C_IE_REG);
+       stat = omap_i2c_read_reg(omap, OMAP_I2C_STAT_REG);
 
        if (stat & mask)
                ret = IRQ_WAKE_THREAD;
 
-       spin_unlock(&dev->lock);
+       spin_unlock(&omap->lock);
 
        return ret;
 }
@@ -1010,20 +1010,20 @@ omap_i2c_isr(int irq, void *dev_id)
 static irqreturn_t
 omap_i2c_isr_thread(int this_irq, void *dev_id)
 {
-       struct omap_i2c_dev *dev = dev_id;
+       struct omap_i2c_dev *omap = dev_id;
        unsigned long flags;
        u16 bits;
        u16 stat;
        int err = 0, count = 0;
 
-       spin_lock_irqsave(&dev->lock, flags);
+       spin_lock_irqsave(&omap->lock, flags);
        do {
-               bits = omap_i2c_read_reg(dev, OMAP_I2C_IE_REG);
-               stat = omap_i2c_read_reg(dev, OMAP_I2C_STAT_REG);
+               bits = omap_i2c_read_reg(omap, OMAP_I2C_IE_REG);
+               stat = omap_i2c_read_reg(omap, OMAP_I2C_STAT_REG);
                stat &= bits;
 
                /* If we're in receiver mode, ignore XDR/XRDY */
-               if (dev->receiver)
+               if (omap->receiver)
                        stat &= ~(OMAP_I2C_STAT_XDR | OMAP_I2C_STAT_XRDY);
                else
                        stat &= ~(OMAP_I2C_STAT_RDR | OMAP_I2C_STAT_RRDY);
@@ -1033,32 +1033,32 @@ omap_i2c_isr_thread(int this_irq, void *dev_id)
                        goto out;
                }
 
-               dev_dbg(dev->dev, "IRQ (ISR = 0x%04x)\n", stat);
+               dev_dbg(omap->dev, "IRQ (ISR = 0x%04x)\n", stat);
                if (count++ == 100) {
-                       dev_warn(dev->dev, "Too much work in one IRQ\n");
+                       dev_warn(omap->dev, "Too much work in one IRQ\n");
                        break;
                }
 
                if (stat & OMAP_I2C_STAT_NACK) {
                        err |= OMAP_I2C_STAT_NACK;
-                       omap_i2c_ack_stat(dev, OMAP_I2C_STAT_NACK);
+                       omap_i2c_ack_stat(omap, OMAP_I2C_STAT_NACK);
                }
 
                if (stat & OMAP_I2C_STAT_AL) {
-                       dev_err(dev->dev, "Arbitration lost\n");
+                       dev_err(omap->dev, "Arbitration lost\n");
                        err |= OMAP_I2C_STAT_AL;
-                       omap_i2c_ack_stat(dev, OMAP_I2C_STAT_AL);
+                       omap_i2c_ack_stat(omap, OMAP_I2C_STAT_AL);
                }
 
                /*
                 * ProDB0017052: Clear ARDY bit twice
                 */
                if (stat & OMAP_I2C_STAT_ARDY)
-                       omap_i2c_ack_stat(dev, OMAP_I2C_STAT_ARDY);
+                       omap_i2c_ack_stat(omap, OMAP_I2C_STAT_ARDY);
 
                if (stat & (OMAP_I2C_STAT_ARDY | OMAP_I2C_STAT_NACK |
                                        OMAP_I2C_STAT_AL)) {
-                       omap_i2c_ack_stat(dev, (OMAP_I2C_STAT_RRDY |
+                       omap_i2c_ack_stat(omap, (OMAP_I2C_STAT_RRDY |
                                                OMAP_I2C_STAT_RDR |
                                                OMAP_I2C_STAT_XRDY |
                                                OMAP_I2C_STAT_XDR |
@@ -1069,28 +1069,28 @@ omap_i2c_isr_thread(int this_irq, void *dev_id)
                if (stat & OMAP_I2C_STAT_RDR) {
                        u8 num_bytes = 1;
 
-                       if (dev->fifo_size)
-                               num_bytes = dev->buf_len;
+                       if (omap->fifo_size)
+                               num_bytes = omap->buf_len;
 
-                       if (dev->errata & I2C_OMAP_ERRATA_I207) {
-                               i2c_omap_errata_i207(dev, stat);
-                               num_bytes = (omap_i2c_read_reg(dev,
+                       if (omap->errata & I2C_OMAP_ERRATA_I207) {
+                               i2c_omap_errata_i207(omap, stat);
+                               num_bytes = (omap_i2c_read_reg(omap,
                                        OMAP_I2C_BUFSTAT_REG) >> 8) & 0x3F;
                        }
 
-                       omap_i2c_receive_data(dev, num_bytes, true);
-                       omap_i2c_ack_stat(dev, OMAP_I2C_STAT_RDR);
+                       omap_i2c_receive_data(omap, num_bytes, true);
+                       omap_i2c_ack_stat(omap, OMAP_I2C_STAT_RDR);
                        continue;
                }
 
                if (stat & OMAP_I2C_STAT_RRDY) {
                        u8 num_bytes = 1;
 
-                       if (dev->threshold)
-                               num_bytes = dev->threshold;
+                       if (omap->threshold)
+                               num_bytes = omap->threshold;
 
-                       omap_i2c_receive_data(dev, num_bytes, false);
-                       omap_i2c_ack_stat(dev, OMAP_I2C_STAT_RRDY);
+                       omap_i2c_receive_data(omap, num_bytes, false);
+                       omap_i2c_ack_stat(omap, OMAP_I2C_STAT_RRDY);
                        continue;
                }
 
@@ -1098,14 +1098,14 @@ omap_i2c_isr_thread(int this_irq, void *dev_id)
                        u8 num_bytes = 1;
                        int ret;
 
-                       if (dev->fifo_size)
-                               num_bytes = dev->buf_len;
+                       if (omap->fifo_size)
+                               num_bytes = omap->buf_len;
 
-                       ret = omap_i2c_transmit_data(dev, num_bytes, true);
+                       ret = omap_i2c_transmit_data(omap, num_bytes, true);
                        if (ret < 0)
                                break;
 
-                       omap_i2c_ack_stat(dev, OMAP_I2C_STAT_XDR);
+                       omap_i2c_ack_stat(omap, OMAP_I2C_STAT_XDR);
                        continue;
                }
 
@@ -1113,36 +1113,36 @@ omap_i2c_isr_thread(int this_irq, void *dev_id)
                        u8 num_bytes = 1;
                        int ret;
 
-                       if (dev->threshold)
-                               num_bytes = dev->threshold;
+                       if (omap->threshold)
+                               num_bytes = omap->threshold;
 
-                       ret = omap_i2c_transmit_data(dev, num_bytes, false);
+                       ret = omap_i2c_transmit_data(omap, num_bytes, false);
                        if (ret < 0)
                                break;
 
-                       omap_i2c_ack_stat(dev, OMAP_I2C_STAT_XRDY);
+                       omap_i2c_ack_stat(omap, OMAP_I2C_STAT_XRDY);
                        continue;
                }
 
                if (stat & OMAP_I2C_STAT_ROVR) {
-                       dev_err(dev->dev, "Receive overrun\n");
+                       dev_err(omap->dev, "Receive overrun\n");
                        err |= OMAP_I2C_STAT_ROVR;
-                       omap_i2c_ack_stat(dev, OMAP_I2C_STAT_ROVR);
+                       omap_i2c_ack_stat(omap, OMAP_I2C_STAT_ROVR);
                        break;
                }
 
                if (stat & OMAP_I2C_STAT_XUDF) {
-                       dev_err(dev->dev, "Transmit underflow\n");
+                       dev_err(omap->dev, "Transmit underflow\n");
                        err |= OMAP_I2C_STAT_XUDF;
-                       omap_i2c_ack_stat(dev, OMAP_I2C_STAT_XUDF);
+                       omap_i2c_ack_stat(omap, OMAP_I2C_STAT_XUDF);
                        break;
                }
        } while (stat);
 
-       omap_i2c_complete_cmd(dev, err);
+       omap_i2c_complete_cmd(omap, err);
 
 out:
-       spin_unlock_irqrestore(&dev->lock, flags);
+       spin_unlock_irqrestore(&omap->lock, flags);
 
        return IRQ_HANDLED;
 }
@@ -1284,7 +1284,7 @@ static struct i2c_bus_recovery_info omap_i2c_bus_recovery_info = {
 static int
 omap_i2c_probe(struct platform_device *pdev)
 {
-       struct omap_i2c_dev     *dev;
+       struct omap_i2c_dev     *omap;
        struct i2c_adapter      *adap;
        struct resource         *mem;
        const struct omap_i2c_bus_platform_data *pdata =
@@ -1302,46 +1302,46 @@ omap_i2c_probe(struct platform_device *pdev)
                return irq;
        }
 
-       dev = devm_kzalloc(&pdev->dev, sizeof(struct omap_i2c_dev), GFP_KERNEL);
-       if (!dev)
+       omap = devm_kzalloc(&pdev->dev, sizeof(struct omap_i2c_dev), GFP_KERNEL);
+       if (!omap)
                return -ENOMEM;
 
        mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-       dev->base = devm_ioremap_resource(&pdev->dev, mem);
-       if (IS_ERR(dev->base))
-               return PTR_ERR(dev->base);
+       omap->base = devm_ioremap_resource(&pdev->dev, mem);
+       if (IS_ERR(omap->base))
+               return PTR_ERR(omap->base);
 
        match = of_match_device(of_match_ptr(omap_i2c_of_match), &pdev->dev);
        if (match) {
                u32 freq = 100000; /* default to 100000 Hz */
 
                pdata = match->data;
-               dev->flags = pdata->flags;
+               omap->flags = pdata->flags;
 
                of_property_read_u32(node, "clock-frequency", &freq);
                /* convert DT freq value in Hz into kHz for speed */
-               dev->speed = freq / 1000;
+               omap->speed = freq / 1000;
        } else if (pdata != NULL) {
-               dev->speed = pdata->clkrate;
-               dev->flags = pdata->flags;
-               dev->set_mpu_wkup_lat = pdata->set_mpu_wkup_lat;
+               omap->speed = pdata->clkrate;
+               omap->flags = pdata->flags;
+               omap->set_mpu_wkup_lat = pdata->set_mpu_wkup_lat;
        }
 
-       dev->dev = &pdev->dev;
-       dev->irq = irq;
+       omap->dev = &pdev->dev;
+       omap->irq = irq;
 
-       spin_lock_init(&dev->lock);
+       spin_lock_init(&omap->lock);
 
-       platform_set_drvdata(pdev, dev);
-       init_completion(&dev->cmd_complete);
+       platform_set_drvdata(pdev, omap);
+       init_completion(&omap->cmd_complete);
 
-       dev->reg_shift = (dev->flags >> OMAP_I2C_FLAG_BUS_SHIFT__SHIFT) & 3;
+       omap->reg_shift = (omap->flags >> OMAP_I2C_FLAG_BUS_SHIFT__SHIFT) & 3;
 
-       pm_runtime_enable(dev->dev);
-       pm_runtime_set_autosuspend_delay(dev->dev, OMAP_I2C_PM_TIMEOUT);
-       pm_runtime_use_autosuspend(dev->dev);
+       pm_runtime_enable(omap->dev);
+       pm_runtime_set_autosuspend_delay(omap->dev, OMAP_I2C_PM_TIMEOUT);
+       pm_runtime_use_autosuspend(omap->dev);
 
-       r = pm_runtime_get_sync(dev->dev);
+       r = pm_runtime_get_sync(omap->dev);
        if (r < 0)
                goto err_free_mem;
 
@@ -1351,42 +1351,42 @@ omap_i2c_probe(struct platform_device *pdev)
         * Also since the omap_i2c_read_reg uses reg_map_ip_* a
         * readw_relaxed is done.
         */
-       rev = readw_relaxed(dev->base + 0x04);
+       rev = readw_relaxed(omap->base + 0x04);
 
-       dev->scheme = OMAP_I2C_SCHEME(rev);
-       switch (dev->scheme) {
+       omap->scheme = OMAP_I2C_SCHEME(rev);
+       switch (omap->scheme) {
        case OMAP_I2C_SCHEME_0:
-               dev->regs = (u8 *)reg_map_ip_v1;
-               dev->rev = omap_i2c_read_reg(dev, OMAP_I2C_REV_REG);
-               minor = OMAP_I2C_REV_SCHEME_0_MAJOR(dev->rev);
-               major = OMAP_I2C_REV_SCHEME_0_MAJOR(dev->rev);
+               omap->regs = (u8 *)reg_map_ip_v1;
+               omap->rev = omap_i2c_read_reg(omap, OMAP_I2C_REV_REG);
+               minor = OMAP_I2C_REV_SCHEME_0_MAJOR(omap->rev);
+               major = OMAP_I2C_REV_SCHEME_0_MAJOR(omap->rev);
                break;
        case OMAP_I2C_SCHEME_1:
                /* FALLTHROUGH */
        default:
-               dev->regs = (u8 *)reg_map_ip_v2;
+               omap->regs = (u8 *)reg_map_ip_v2;
                rev = (rev << 16) |
-                       omap_i2c_read_reg(dev, OMAP_I2C_IP_V2_REVNB_LO);
+                       omap_i2c_read_reg(omap, OMAP_I2C_IP_V2_REVNB_LO);
                minor = OMAP_I2C_REV_SCHEME_1_MINOR(rev);
                major = OMAP_I2C_REV_SCHEME_1_MAJOR(rev);
-               dev->rev = rev;
+               omap->rev = rev;
        }
 
-       dev->errata = 0;
+       omap->errata = 0;
 
-       if (dev->rev >= OMAP_I2C_REV_ON_2430 &&
-                       dev->rev < OMAP_I2C_REV_ON_4430_PLUS)
-               dev->errata |= I2C_OMAP_ERRATA_I207;
+       if (omap->rev >= OMAP_I2C_REV_ON_2430 &&
+                       omap->rev < OMAP_I2C_REV_ON_4430_PLUS)
+               omap->errata |= I2C_OMAP_ERRATA_I207;
 
-       if (dev->rev <= OMAP_I2C_REV_ON_3430_3530)
-               dev->errata |= I2C_OMAP_ERRATA_I462;
+       if (omap->rev <= OMAP_I2C_REV_ON_3430_3530)
+               omap->errata |= I2C_OMAP_ERRATA_I462;
 
-       if (!(dev->flags & OMAP_I2C_FLAG_NO_FIFO)) {
+       if (!(omap->flags & OMAP_I2C_FLAG_NO_FIFO)) {
                u16 s;
 
                /* Set up the fifo size - Get total size */
-               s = (omap_i2c_read_reg(dev, OMAP_I2C_BUFSTAT_REG) >> 14) & 0x3;
-               dev->fifo_size = 0x8 << s;
+               s = (omap_i2c_read_reg(omap, OMAP_I2C_BUFSTAT_REG) >> 14) & 0x3;
+               omap->fifo_size = 0x8 << s;
 
                /*
                 * Set up notification threshold as half the total available
@@ -1394,36 +1394,36 @@ omap_i2c_probe(struct platform_device *pdev)
                 * call back latencies.
                 */
 
-               dev->fifo_size = (dev->fifo_size / 2);
+               omap->fifo_size = (omap->fifo_size / 2);
 
-               if (dev->rev < OMAP_I2C_REV_ON_3630)
-                       dev->b_hw = 1; /* Enable hardware fixes */
+               if (omap->rev < OMAP_I2C_REV_ON_3630)
+                       omap->b_hw = 1; /* Enable hardware fixes */
 
                /* calculate wakeup latency constraint for MPU */
-               if (dev->set_mpu_wkup_lat != NULL)
-                       dev->latency = (1000000 * dev->fifo_size) /
-                                      (1000 * dev->speed / 8);
+               if (omap->set_mpu_wkup_lat != NULL)
+                       omap->latency = (1000000 * omap->fifo_size) /
+                                      (1000 * omap->speed / 8);
        }
 
        /* reset ASAP, clearing any IRQs */
-       omap_i2c_init(dev);
+       omap_i2c_init(omap);
 
-       if (dev->rev < OMAP_I2C_OMAP1_REV_2)
-               r = devm_request_irq(&pdev->dev, dev->irq, omap_i2c_omap1_isr,
-                               IRQF_NO_SUSPEND, pdev->name, dev);
+       if (omap->rev < OMAP_I2C_OMAP1_REV_2)
+               r = devm_request_irq(&pdev->dev, omap->irq, omap_i2c_omap1_isr,
+                               IRQF_NO_SUSPEND, pdev->name, omap);
        else
-               r = devm_request_threaded_irq(&pdev->dev, dev->irq,
+               r = devm_request_threaded_irq(&pdev->dev, omap->irq,
                                omap_i2c_isr, omap_i2c_isr_thread,
                                IRQF_NO_SUSPEND | IRQF_ONESHOT,
-                               pdev->name, dev);
+                               pdev->name, omap);
 
        if (r) {
-               dev_err(dev->dev, "failure requesting irq %i\n", dev->irq);
+               dev_err(omap->dev, "failure requesting irq %i\n", omap->irq);
                goto err_unuse_clocks;
        }
 
-       adap = &dev->adapter;
-       i2c_set_adapdata(adap, dev);
+       adap = &omap->adapter;
+       i2c_set_adapdata(adap, omap);
        adap->owner = THIS_MODULE;
        adap->class = I2C_CLASS_DEPRECATED;
        strlcpy(adap->name, "OMAP I2C adapter", sizeof(adap->name));
@@ -1436,21 +1436,21 @@ omap_i2c_probe(struct platform_device *pdev)
        adap->nr = pdev->id;
        r = i2c_add_numbered_adapter(adap);
        if (r) {
-               dev_err(dev->dev, "failure adding adapter\n");
+               dev_err(omap->dev, "failure adding adapter\n");
                goto err_unuse_clocks;
        }
 
-       dev_info(dev->dev, "bus %d rev%d.%d at %d kHz\n", adap->nr,
-                major, minor, dev->speed);
+       dev_info(omap->dev, "bus %d rev%d.%d at %d kHz\n", adap->nr,
+                major, minor, omap->speed);
 
-       pm_runtime_mark_last_busy(dev->dev);
-       pm_runtime_put_autosuspend(dev->dev);
+       pm_runtime_mark_last_busy(omap->dev);
+       pm_runtime_put_autosuspend(omap->dev);
 
        return 0;
 
 err_unuse_clocks:
-       omap_i2c_write_reg(dev, OMAP_I2C_CON_REG, 0);
-       pm_runtime_put(dev->dev);
+       omap_i2c_write_reg(omap, OMAP_I2C_CON_REG, 0);
+       pm_runtime_put(omap->dev);
        pm_runtime_disable(&pdev->dev);
 err_free_mem:
 
@@ -1459,16 +1459,16 @@ err_free_mem:
 
 static int omap_i2c_remove(struct platform_device *pdev)
 {
-       struct omap_i2c_dev     *dev = platform_get_drvdata(pdev);
+       struct omap_i2c_dev     *omap = platform_get_drvdata(pdev);
        int ret;
 
-       i2c_del_adapter(&dev->adapter);
+       i2c_del_adapter(&omap->adapter);
        ret = pm_runtime_get_sync(&pdev->dev);
        if (ret < 0)
                return ret;
 
-       omap_i2c_write_reg(dev, OMAP_I2C_CON_REG, 0);
-       pm_runtime_put(&pdev->dev);
+       omap_i2c_write_reg(omap, OMAP_I2C_CON_REG, 0);
+       pm_runtime_put_sync(&pdev->dev);
        pm_runtime_disable(&pdev->dev);
        return 0;
 }
@@ -1476,24 +1476,23 @@ static int omap_i2c_remove(struct platform_device *pdev)
 #ifdef CONFIG_PM
 static int omap_i2c_runtime_suspend(struct device *dev)
 {
-       struct platform_device *pdev = to_platform_device(dev);
-       struct omap_i2c_dev *_dev = platform_get_drvdata(pdev);
+       struct omap_i2c_dev *omap = dev_get_drvdata(dev);
 
-       _dev->iestate = omap_i2c_read_reg(_dev, OMAP_I2C_IE_REG);
+       omap->iestate = omap_i2c_read_reg(omap, OMAP_I2C_IE_REG);
 
-       if (_dev->scheme == OMAP_I2C_SCHEME_0)
-               omap_i2c_write_reg(_dev, OMAP_I2C_IE_REG, 0);
+       if (omap->scheme == OMAP_I2C_SCHEME_0)
+               omap_i2c_write_reg(omap, OMAP_I2C_IE_REG, 0);
        else
-               omap_i2c_write_reg(_dev, OMAP_I2C_IP_V2_IRQENABLE_CLR,
+               omap_i2c_write_reg(omap, OMAP_I2C_IP_V2_IRQENABLE_CLR,
                                   OMAP_I2C_IP_V2_INTERRUPTS_MASK);
 
-       if (_dev->rev < OMAP_I2C_OMAP1_REV_2) {
-               omap_i2c_read_reg(_dev, OMAP_I2C_IV_REG); /* Read clears */
+       if (omap->rev < OMAP_I2C_OMAP1_REV_2) {
+               omap_i2c_read_reg(omap, OMAP_I2C_IV_REG); /* Read clears */
        } else {
-               omap_i2c_write_reg(_dev, OMAP_I2C_STAT_REG, _dev->iestate);
+               omap_i2c_write_reg(omap, OMAP_I2C_STAT_REG, omap->iestate);
 
                /* Flush posted write */
-               omap_i2c_read_reg(_dev, OMAP_I2C_STAT_REG);
+               omap_i2c_read_reg(omap, OMAP_I2C_STAT_REG);
        }
 
        pinctrl_pm_select_sleep_state(dev);
@@ -1503,15 +1502,14 @@ static int omap_i2c_runtime_suspend(struct device *dev)
 
 static int omap_i2c_runtime_resume(struct device *dev)
 {
-       struct platform_device *pdev = to_platform_device(dev);
-       struct omap_i2c_dev *_dev = platform_get_drvdata(pdev);
+       struct omap_i2c_dev *omap = dev_get_drvdata(dev);
 
        pinctrl_pm_select_default_state(dev);
 
-       if (!_dev->regs)
+       if (!omap->regs)
                return 0;
 
-       __omap_i2c_init(_dev);
+       __omap_i2c_init(omap);
 
        return 0;
 }
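
Aside from the dev -> omap rename, the runtime PM hunks above also drop the to_platform_device()/platform_get_drvdata() detour in favour of dev_get_drvdata(). Both forms return the same pointer once probe has called platform_set_drvdata(); a minimal sketch of that equivalence follows (the acme names are invented for illustration, not part of the patch):

        #include <linux/device.h>
        #include <linux/errno.h>
        #include <linux/platform_device.h>

        struct acme_priv {
                int dummy;
        };

        static int acme_runtime_suspend(struct device *dev)
        {
                /* Long form used by the old code ... */
                struct acme_priv *a = platform_get_drvdata(to_platform_device(dev));
                /* ... and the short form used by the new code: same pointer. */
                struct acme_priv *b = dev_get_drvdata(dev);

                return a == b ? 0 : -EINVAL;
        }
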
index 9b94c3db80abf2ef5f84fddedbcd5d416a936ac7..a8e54df4aed6f962f1153d8fe7f97f76aa185c55 100644 (file)
@@ -20,6 +20,8 @@
    GNU General Public License for more details.
  * ------------------------------------------------------------------------ */
 
+#define pr_fmt(fmt) "i2c-parport: " fmt
+
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
@@ -176,26 +178,24 @@ static void i2c_parport_attach(struct parport *port)
                        break;
        }
        if (i == MAX_DEVICE) {
-               pr_debug("i2c-parport: Not using parport%d.\n", port->number);
+               pr_debug("Not using parport%d.\n", port->number);
                return;
        }
 
        adapter = kzalloc(sizeof(struct i2c_par), GFP_KERNEL);
-       if (adapter == NULL) {
-               printk(KERN_ERR "i2c-parport: Failed to kzalloc\n");
+       if (!adapter)
                return;
-       }
        memset(&i2c_parport_cb, 0, sizeof(i2c_parport_cb));
        i2c_parport_cb.flags = PARPORT_FLAG_EXCL;
        i2c_parport_cb.irq_func = i2c_parport_irq;
        i2c_parport_cb.private = adapter;
 
-       pr_debug("i2c-parport: attaching to %s\n", port->name);
+       pr_debug("attaching to %s\n", port->name);
        parport_disable_irq(port);
        adapter->pdev = parport_register_dev_model(port, "i2c-parport",
                                                   &i2c_parport_cb, i);
        if (!adapter->pdev) {
-               printk(KERN_ERR "i2c-parport: Unable to register with parport\n");
+               pr_err("Unable to register with parport\n");
                goto err_free;
        }
 
@@ -215,7 +215,8 @@ static void i2c_parport_attach(struct parport *port)
        adapter->adapter.dev.parent = port->physport->dev;
 
        if (parport_claim_or_block(adapter->pdev) < 0) {
-               printk(KERN_ERR "i2c-parport: Could not claim parallel port\n");
+               dev_err(&adapter->pdev->dev,
+                       "Could not claim parallel port\n");
                goto err_unregister;
        }
 
@@ -230,7 +231,7 @@ static void i2c_parport_attach(struct parport *port)
        }
 
        if (i2c_bit_add_bus(&adapter->adapter) < 0) {
-               printk(KERN_ERR "i2c-parport: Unable to register with I2C\n");
+               dev_err(&adapter->pdev->dev, "Unable to register with I2C\n");
                goto err_unregister;
        }
 
@@ -242,8 +243,8 @@ static void i2c_parport_attach(struct parport *port)
                if (adapter->ara)
                        parport_enable_irq(port);
                else
-                       printk(KERN_WARNING "i2c-parport: Failed to register "
-                              "ARA client\n");
+                       dev_warn(&adapter->pdev->dev,
+                                "Failed to register ARA client\n");
        }
 
        /* Add the new adapter to the list */
@@ -298,12 +299,12 @@ static struct parport_driver i2c_parport_driver = {
 static int __init i2c_parport_init(void)
 {
        if (type < 0) {
-               printk(KERN_WARNING "i2c-parport: adapter type unspecified\n");
+               pr_warn("adapter type unspecified\n");
                return -ENODEV;
        }
 
        if (type >= ARRAY_SIZE(adapter_parm)) {
-               printk(KERN_WARNING "i2c-parport: invalid type (%d)\n", type);
+               pr_warn("invalid type (%d)\n", type);
                return -ENODEV;
        }
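
The logging conversion above relies on the pr_fmt() convention: defining it before the first #include keeps the default definition in printk.h from taking effect, so every later pr_err()/pr_warn()/pr_debug() in the file expands with that prefix, which is why the literal "i2c-parport: " strings could be dropped from the messages. A stand-alone sketch of the idiom (the mydrv module is invented for illustration):

        /* Must appear before the first #include so the default pr_fmt is never used. */
        #define pr_fmt(fmt) "mydrv: " fmt

        #include <linux/errno.h>
        #include <linux/kernel.h>
        #include <linux/module.h>

        static int __init mydrv_init(void)
        {
                pr_warn("adapter type unspecified\n");  /* logs "mydrv: adapter type unspecified" */
                return -ENODEV;
        }
        module_init(mydrv_init);

        static void __exit mydrv_exit(void) { }
        module_exit(mydrv_exit);

        MODULE_LICENSE("GPL");
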
 
index 4e129453680515888b8ca31f22dba490efc94d5d..84a6616b072f943024e7b14277ef4fb994a9e32b 100644 (file)
@@ -89,6 +89,13 @@ static const struct adapter_parm adapter_parm[] = {
                .getsda = { 0x80, PORT_STAT, 1 },
                .init   = { 0x04, PORT_DATA, 1 },
        },
+       /* type 8: VCT-jig */
+       {
+               .setsda = { 0x04, PORT_DATA, 1 },
+               .setscl = { 0x01, PORT_DATA, 1 },
+               .getsda = { 0x40, PORT_STAT, 0 },
+               .getscl = { 0x80, PORT_STAT, 1 },
+       },
 };
 
 static int type = -1;
@@ -103,4 +110,5 @@ MODULE_PARM_DESC(type,
        " 5 = ADM1025, ADM1030 and ADM1031 evaluation boards\n"
        " 6 = Barco LPT->DVI (K5800236) adapter\n"
        " 7 = One For All JP1 parallel port adapter\n"
+       " 8 = VCT-jig\n"
 );
index d9c0d6a17ad6c39b3e5fadfd4bc36564562a4478..645e4b79d968155c8f55ec28cb2aa3e5d1953626 100644 (file)
@@ -132,6 +132,7 @@ struct pxa_i2c {
        unsigned int            msg_idx;
        unsigned int            msg_ptr;
        unsigned int            slave_addr;
+       unsigned int            req_slave_addr;
 
        struct i2c_adapter      adap;
        struct clk              *clk;
@@ -253,15 +254,20 @@ static void i2c_pxa_show_state(struct pxa_i2c *i2c, int lno, const char *fname)
 static void i2c_pxa_scream_blue_murder(struct pxa_i2c *i2c, const char *why)
 {
        unsigned int i;
-       printk(KERN_ERR "i2c: error: %s\n", why);
-       printk(KERN_ERR "i2c: msg_num: %d msg_idx: %d msg_ptr: %d\n",
+       struct device *dev = &i2c->adap.dev;
+
+       dev_err(dev, "slave_0x%x error: %s\n",
+               i2c->req_slave_addr >> 1, why);
+       dev_err(dev, "msg_num: %d msg_idx: %d msg_ptr: %d\n",
                i2c->msg_num, i2c->msg_idx, i2c->msg_ptr);
-       printk(KERN_ERR "i2c: ICR: %08x ISR: %08x\n",
-              readl(_ICR(i2c)), readl(_ISR(i2c)));
-       printk(KERN_DEBUG "i2c: log: ");
+       dev_err(dev, "IBMR: %08x IDBR: %08x ICR: %08x ISR: %08x\n",
+               readl(_IBMR(i2c)), readl(_IDBR(i2c)), readl(_ICR(i2c)),
+               readl(_ISR(i2c)));
+       dev_dbg(dev, "log: ");
        for (i = 0; i < i2c->irqlogidx; i++)
-               printk("[%08x:%08x] ", i2c->isrlog[i], i2c->icrlog[i]);
-       printk("\n");
+               pr_debug("[%08x:%08x] ", i2c->isrlog[i], i2c->icrlog[i]);
+
+       pr_debug("\n");
 }
 
 #else /* ifdef DEBUG */
@@ -459,7 +465,7 @@ static void i2c_pxa_reset(struct pxa_i2c *i2c)
        writel(I2C_ISR_INIT, _ISR(i2c));
        writel(readl(_ICR(i2c)) & ~ICR_UR, _ICR(i2c));
 
-       if (i2c->reg_isar)
+       if (i2c->reg_isar && IS_ENABLED(CONFIG_I2C_PXA_SLAVE))
                writel(i2c->slave_addr, _ISAR(i2c));
 
        /* set control register values */
@@ -638,6 +644,7 @@ static inline void i2c_pxa_start_message(struct pxa_i2c *i2c)
         * Step 1: target slave address into IDBR
         */
        writel(i2c_pxa_addr_byte(i2c->msg), _IDBR(i2c));
+       i2c->req_slave_addr = i2c_pxa_addr_byte(i2c->msg);
 
        /*
         * Step 2: initiate the write.
@@ -745,8 +752,10 @@ static int i2c_pxa_do_pio_xfer(struct pxa_i2c *i2c,
        ret = i2c->msg_idx;
 
 out:
-       if (timeout == 0)
+       if (timeout == 0) {
                i2c_pxa_scream_blue_murder(i2c, "timeout");
+               ret = I2C_RETRY;
+       }
 
        return ret;
 }
@@ -949,6 +958,7 @@ static void i2c_pxa_irq_txempty(struct pxa_i2c *i2c, u32 isr)
                 * Write the next address.
                 */
                writel(i2c_pxa_addr_byte(i2c->msg), _IDBR(i2c));
+               i2c->req_slave_addr = i2c_pxa_addr_byte(i2c->msg);
 
                /*
                 * And trigger a repeated start, and send the byte.
@@ -1114,7 +1124,9 @@ static int i2c_pxa_probe_dt(struct platform_device *pdev, struct pxa_i2c *i2c,
                i2c->use_pio = 1;
        if (of_get_property(np, "mrvl,i2c-fast-mode", NULL))
                i2c->fast_mode = 1;
-       *i2c_types = (u32)(of_id->data);
+
+       *i2c_types = (enum pxa_i2c_types)(of_id->data);
+
        return 0;
 }
 
@@ -1146,10 +1158,19 @@ static int i2c_pxa_probe(struct platform_device *dev)
        struct resource *res = NULL;
        int ret, irq;
 
-       i2c = kzalloc(sizeof(struct pxa_i2c), GFP_KERNEL);
-       if (!i2c) {
-               ret = -ENOMEM;
-               goto emalloc;
+       i2c = devm_kzalloc(&dev->dev, sizeof(struct pxa_i2c), GFP_KERNEL);
+       if (!i2c)
+               return -ENOMEM;
+
+       res = platform_get_resource(dev, IORESOURCE_MEM, 0);
+       i2c->reg_base = devm_ioremap_resource(&dev->dev, res);
+       if (IS_ERR(i2c->reg_base))
+               return PTR_ERR(i2c->reg_base);
+
+       irq = platform_get_irq(dev, 0);
+       if (irq < 0) {
+               dev_err(&dev->dev, "no irq resource: %d\n", irq);
+               return irq;
        }
 
        /* Default adapter num to device id; i2c_pxa_probe_dt can override. */
@@ -1159,19 +1180,7 @@ static int i2c_pxa_probe(struct platform_device *dev)
        if (ret > 0)
                ret = i2c_pxa_probe_pdata(dev, i2c, &i2c_type);
        if (ret < 0)
-               goto eclk;
-
-       res = platform_get_resource(dev, IORESOURCE_MEM, 0);
-       irq = platform_get_irq(dev, 0);
-       if (res == NULL || irq < 0) {
-               ret = -ENODEV;
-               goto eclk;
-       }
-
-       if (!request_mem_region(res->start, resource_size(res), res->name)) {
-               ret = -ENOMEM;
-               goto eclk;
-       }
+               return ret;
 
        i2c->adap.owner   = THIS_MODULE;
        i2c->adap.retries = 5;
@@ -1181,16 +1190,10 @@ static int i2c_pxa_probe(struct platform_device *dev)
 
        strlcpy(i2c->adap.name, "pxa_i2c-i2c", sizeof(i2c->adap.name));
 
-       i2c->clk = clk_get(&dev->dev, NULL);
+       i2c->clk = devm_clk_get(&dev->dev, NULL);
        if (IS_ERR(i2c->clk)) {
-               ret = PTR_ERR(i2c->clk);
-               goto eclk;
-       }
-
-       i2c->reg_base = ioremap(res->start, resource_size(res));
-       if (!i2c->reg_base) {
-               ret = -EIO;
-               goto eremap;
+               dev_err(&dev->dev, "failed to get the clk: %ld\n", PTR_ERR(i2c->clk));
+               return PTR_ERR(i2c->clk);
        }
 
        i2c->reg_ibmr = i2c->reg_base + pxa_reg_layout[i2c_type].ibmr;
@@ -1232,10 +1235,13 @@ static int i2c_pxa_probe(struct platform_device *dev)
                i2c->adap.algo = &i2c_pxa_pio_algorithm;
        } else {
                i2c->adap.algo = &i2c_pxa_algorithm;
-               ret = request_irq(irq, i2c_pxa_handler, IRQF_SHARED,
-                                 dev_name(&dev->dev), i2c);
-               if (ret)
+               ret = devm_request_irq(&dev->dev, irq, i2c_pxa_handler,
+                               IRQF_SHARED | IRQF_NO_SUSPEND,
+                               dev_name(&dev->dev), i2c);
+               if (ret) {
+                       dev_err(&dev->dev, "failed to request irq: %d\n", ret);
                        goto ereqirq;
+               }
        }
 
        i2c_pxa_reset(i2c);
@@ -1248,33 +1254,22 @@ static int i2c_pxa_probe(struct platform_device *dev)
 
        ret = i2c_add_numbered_adapter(&i2c->adap);
        if (ret < 0) {
-               printk(KERN_INFO "I2C: Failed to add bus\n");
-               goto eadapt;
+               dev_err(&dev->dev, "failed to add bus: %d\n", ret);
+               goto ereqirq;
        }
 
        platform_set_drvdata(dev, i2c);
 
 #ifdef CONFIG_I2C_PXA_SLAVE
-       printk(KERN_INFO "I2C: %s: PXA I2C adapter, slave address %d\n",
-              dev_name(&i2c->adap.dev), i2c->slave_addr);
+       dev_info(&i2c->adap.dev, " PXA I2C adapter, slave address %d\n",
+               i2c->slave_addr);
 #else
-       printk(KERN_INFO "I2C: %s: PXA I2C adapter\n",
-              dev_name(&i2c->adap.dev));
+       dev_info(&i2c->adap.dev, " PXA I2C adapter\n");
 #endif
        return 0;
 
-eadapt:
-       if (!i2c->use_pio)
-               free_irq(irq, i2c);
 ereqirq:
        clk_disable_unprepare(i2c->clk);
-       iounmap(i2c->reg_base);
-eremap:
-       clk_put(i2c->clk);
-eclk:
-       kfree(i2c);
-emalloc:
-       release_mem_region(res->start, resource_size(res));
        return ret;
 }
 
@@ -1283,15 +1278,8 @@ static int i2c_pxa_remove(struct platform_device *dev)
        struct pxa_i2c *i2c = platform_get_drvdata(dev);
 
        i2c_del_adapter(&i2c->adap);
-       if (!i2c->use_pio)
-               free_irq(i2c->irq, i2c);
 
        clk_disable_unprepare(i2c->clk);
-       clk_put(i2c->clk);
-
-       iounmap(i2c->reg_base);
-       release_mem_region(i2c->iobase, i2c->iosize);
-       kfree(i2c);
 
        return 0;
 }
index 78a3668146967408146c81a213e09d6219543718..b7e1a365542100c6b2bc6bf12184a4e8fd3d4103 100644 (file)
 #define I2C_HEADER_CONTINUE_XFER               (1<<15)
 #define I2C_HEADER_MASTER_ADDR_SHIFT           12
 #define I2C_HEADER_SLAVE_ADDR_SHIFT            1
+
+#define I2C_CONFIG_LOAD                                0x08C
+#define I2C_MSTR_CONFIG_LOAD                   (1 << 0)
+#define I2C_SLV_CONFIG_LOAD                    (1 << 1)
+#define I2C_TIMEOUT_CONFIG_LOAD                        (1 << 2)
+
 /*
  * msg_end_type: The bus control which need to be send at end of transfer.
  * @MSG_END_STOP: Send stop pulse at end of transfer.
@@ -121,6 +127,8 @@ enum msg_end_type {
  * @has_single_clk_source: The i2c controller has single clock source. Tegra30
  *             and earlier Socs has two clock sources i.e. div-clk and
  *             fast-clk.
+ * @has_config_load_reg: Has the config load register to load the new
+ *             configuration.
  * @clk_divisor_hs_mode: Clock divisor in HS mode.
  * @clk_divisor_std_fast_mode: Clock divisor in standard/fast mode. It is
  *             applicable if there is no fast clock source i.e. single clock
@@ -131,8 +139,10 @@ struct tegra_i2c_hw_feature {
        bool has_continue_xfer_support;
        bool has_per_pkt_xfer_complete_irq;
        bool has_single_clk_source;
+       bool has_config_load_reg;
        int clk_divisor_hs_mode;
        int clk_divisor_std_fast_mode;
+       u16 clk_divisor_fast_plus_mode;
 };
 
 /**
@@ -172,6 +182,7 @@ struct tegra_i2c_dev {
        size_t msg_buf_remaining;
        int msg_read;
        u32 bus_clk_rate;
+       u16 clk_divisor_non_hs_mode;
        bool is_suspended;
 };
 
@@ -410,6 +421,7 @@ static int tegra_i2c_init(struct tegra_i2c_dev *i2c_dev)
        u32 val;
        int err = 0;
        u32 clk_divisor;
+       unsigned long timeout = jiffies + HZ;
 
        err = tegra_i2c_clock_enable(i2c_dev);
        if (err < 0) {
@@ -431,7 +443,7 @@ static int tegra_i2c_init(struct tegra_i2c_dev *i2c_dev)
 
        /* Make sure clock divisor programmed correctly */
        clk_divisor = i2c_dev->hw->clk_divisor_hs_mode;
-       clk_divisor |= i2c_dev->hw->clk_divisor_std_fast_mode <<
+       clk_divisor |= i2c_dev->clk_divisor_non_hs_mode <<
                                        I2C_CLK_DIVISOR_STD_FAST_MODE_SHIFT;
        i2c_writel(i2c_dev, clk_divisor, I2C_CLK_DIVISOR);
 
@@ -451,6 +463,18 @@ static int tegra_i2c_init(struct tegra_i2c_dev *i2c_dev)
        if (tegra_i2c_flush_fifos(i2c_dev))
                err = -ETIMEDOUT;
 
+       if (i2c_dev->hw->has_config_load_reg) {
+               i2c_writel(i2c_dev, I2C_MSTR_CONFIG_LOAD, I2C_CONFIG_LOAD);
+               while (i2c_readl(i2c_dev, I2C_CONFIG_LOAD) != 0) {
+                       if (time_after(jiffies, timeout)) {
+                               dev_warn(i2c_dev->dev,
+                                       "timeout waiting for config load\n");
+                               return -ETIMEDOUT;
+                       }
+                       msleep(1);
+               }
+       }
+
        tegra_i2c_clock_disable(i2c_dev);
 
        if (i2c_dev->irq_disabled) {
@@ -681,6 +705,8 @@ static const struct tegra_i2c_hw_feature tegra20_i2c_hw = {
        .has_single_clk_source = false,
        .clk_divisor_hs_mode = 3,
        .clk_divisor_std_fast_mode = 0,
+       .clk_divisor_fast_plus_mode = 0,
+       .has_config_load_reg = false,
 };
 
 static const struct tegra_i2c_hw_feature tegra30_i2c_hw = {
@@ -689,6 +715,8 @@ static const struct tegra_i2c_hw_feature tegra30_i2c_hw = {
        .has_single_clk_source = false,
        .clk_divisor_hs_mode = 3,
        .clk_divisor_std_fast_mode = 0,
+       .clk_divisor_fast_plus_mode = 0,
+       .has_config_load_reg = false,
 };
 
 static const struct tegra_i2c_hw_feature tegra114_i2c_hw = {
@@ -697,10 +725,23 @@ static const struct tegra_i2c_hw_feature tegra114_i2c_hw = {
        .has_single_clk_source = true,
        .clk_divisor_hs_mode = 1,
        .clk_divisor_std_fast_mode = 0x19,
+       .clk_divisor_fast_plus_mode = 0x10,
+       .has_config_load_reg = false,
+};
+
+static const struct tegra_i2c_hw_feature tegra124_i2c_hw = {
+       .has_continue_xfer_support = true,
+       .has_per_pkt_xfer_complete_irq = true,
+       .has_single_clk_source = true,
+       .clk_divisor_hs_mode = 1,
+       .clk_divisor_std_fast_mode = 0x19,
+       .clk_divisor_fast_plus_mode = 0x10,
+       .has_config_load_reg = true,
 };
 
 /* Match table for of_platform binding */
 static const struct of_device_id tegra_i2c_of_match[] = {
+       { .compatible = "nvidia,tegra124-i2c", .data = &tegra124_i2c_hw, },
        { .compatible = "nvidia,tegra114-i2c", .data = &tegra114_i2c_hw, },
        { .compatible = "nvidia,tegra30-i2c", .data = &tegra30_i2c_hw, },
        { .compatible = "nvidia,tegra20-i2c", .data = &tegra20_i2c_hw, },
@@ -793,7 +834,14 @@ static int tegra_i2c_probe(struct platform_device *pdev)
                }
        }
 
-       clk_multiplier *= (i2c_dev->hw->clk_divisor_std_fast_mode + 1);
+       i2c_dev->clk_divisor_non_hs_mode =
+                       i2c_dev->hw->clk_divisor_std_fast_mode;
+       if (i2c_dev->hw->clk_divisor_fast_plus_mode &&
+               (i2c_dev->bus_clk_rate == 1000000))
+               i2c_dev->clk_divisor_non_hs_mode =
+                       i2c_dev->hw->clk_divisor_fast_plus_mode;
+
+       clk_multiplier *= (i2c_dev->clk_divisor_non_hs_mode + 1);
        ret = clk_set_rate(i2c_dev->div_clk,
                           i2c_dev->bus_clk_rate * clk_multiplier);
        if (ret) {
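
The CONFIG_LOAD handling added above is the usual jiffies-deadline polling idiom: write the load bit, then re-read the register until the hardware clears it or roughly a second has passed, sleeping between reads. A self-contained sketch of that idiom, with an invented helper in place of the Tegra register accessors:

        #include <linux/delay.h>
        #include <linux/errno.h>
        #include <linux/io.h>
        #include <linux/jiffies.h>

        /* Illustrative helper, not part of the patch: poll an MMIO register until it reads 0. */
        static int poll_reg_until_zero(void __iomem *reg)
        {
                unsigned long timeout = jiffies + HZ;   /* ~1 s deadline, as in the patch */

                while (readl(reg) != 0) {
                        if (time_after(jiffies, timeout))
                                return -ETIMEDOUT;
                        msleep(1);                      /* sleep, don't busy-wait */
                }

                return 0;
        }
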
index 47e88adf2011e15509a070e663e80236a96f3c61..543456a0a3382df396df4329fb301e657a1bcc7c 100644 (file)
@@ -391,11 +391,11 @@ static int vprbrd_i2c_probe(struct platform_device *pdev)
                        VPRBRD_USB_REQUEST_I2C_FREQ, VPRBRD_USB_TYPE_OUT,
                        0x0000, 0x0000, &vb_i2c->bus_freq_param, 1,
                        VPRBRD_USB_TIMEOUT_MS);
-           if (ret != 1) {
-               dev_err(&pdev->dev,
-                       "failure setting i2c_bus_freq to %d\n", i2c_bus_freq);
-               return -EIO;
-           }
+               if (ret != 1) {
+                       dev_err(&pdev->dev, "failure setting i2c_bus_freq to %d\n",
+                               i2c_bus_freq);
+                       return -EIO;
+               }
        } else {
                dev_err(&pdev->dev,
                        "invalid i2c_bus_freq setting:%d\n", i2c_bus_freq);
index 1c9cb65ac4cf8aeb83becaec867f9a34f28b5a1e..4233f5695352fdc951c54c54dc6fbb9de926d26e 100644 (file)
@@ -198,10 +198,10 @@ static int slimpro_i2c_blkrd(struct slimpro_i2c_dev *ctx, u32 chip, u32 addr,
        int rc;
 
        paddr = dma_map_single(ctx->dev, ctx->dma_buffer, readlen, DMA_FROM_DEVICE);
-       rc = dma_mapping_error(ctx->dev, paddr);
-       if (rc) {
+       if (dma_mapping_error(ctx->dev, paddr)) {
                dev_err(&ctx->adapter.dev, "Error in mapping dma buffer %p\n",
                        ctx->dma_buffer);
+               rc = -ENOMEM;
                goto err;
        }
 
@@ -241,10 +241,10 @@ static int slimpro_i2c_blkwr(struct slimpro_i2c_dev *ctx, u32 chip,
        memcpy(ctx->dma_buffer, data, writelen);
        paddr = dma_map_single(ctx->dev, ctx->dma_buffer, writelen,
                               DMA_TO_DEVICE);
-       rc = dma_mapping_error(ctx->dev, paddr);
-       if (rc) {
+       if (dma_mapping_error(ctx->dev, paddr)) {
                dev_err(&ctx->adapter.dev, "Error in mapping dma buffer %p\n",
                        ctx->dma_buffer);
+               rc = -ENOMEM;
                goto err;
        }
 
index 4dda23f22a67b47502d64151c32c9d0c3063aa85..e23a7b068c601906bb1b245d637b0413c2b2c29f 100644 (file)
@@ -283,7 +283,7 @@ static void xiic_reinit(struct xiic_i2c *i2c)
        /* Enable interrupts */
        xiic_setreg32(i2c, XIIC_DGIER_OFFSET, XIIC_GINTR_ENABLE_MASK);
 
-       xiic_irq_clr_en(i2c, XIIC_INTR_AAS_MASK | XIIC_INTR_ARB_LOST_MASK);
+       xiic_irq_clr_en(i2c, XIIC_INTR_ARB_LOST_MASK);
 }
 
 static void xiic_deinit(struct xiic_i2c *i2c)
@@ -358,8 +358,9 @@ static void xiic_wakeup(struct xiic_i2c *i2c, int code)
        wake_up(&i2c->wait);
 }
 
-static void xiic_process(struct xiic_i2c *i2c)
+static irqreturn_t xiic_process(int irq, void *dev_id)
 {
+       struct xiic_i2c *i2c = dev_id;
        u32 pend, isr, ier;
        u32 clr = 0;
 
@@ -368,6 +369,7 @@ static void xiic_process(struct xiic_i2c *i2c)
         * To find which interrupts are pending; AND interrupts pending with
         * interrupts masked.
         */
+       spin_lock(&i2c->lock);
        isr = xiic_getreg32(i2c, XIIC_IISR_OFFSET);
        ier = xiic_getreg32(i2c, XIIC_IIER_OFFSET);
        pend = isr & ier;
@@ -378,11 +380,6 @@ static void xiic_process(struct xiic_i2c *i2c)
                __func__, xiic_getreg8(i2c, XIIC_SR_REG_OFFSET),
                i2c->tx_msg, i2c->nmsgs);
 
-       /* Do not processes a devices interrupts if the device has no
-        * interrupts pending
-        */
-       if (!pend)
-               return;
 
        /* Service requesting interrupt */
        if ((pend & XIIC_INTR_ARB_LOST_MASK) ||
@@ -402,13 +399,15 @@ static void xiic_process(struct xiic_i2c *i2c)
                 */
                xiic_reinit(i2c);
 
+               if (i2c->rx_msg)
+                       xiic_wakeup(i2c, STATE_ERROR);
                if (i2c->tx_msg)
                        xiic_wakeup(i2c, STATE_ERROR);
-
-       } else if (pend & XIIC_INTR_RX_FULL_MASK) {
+       }
+       if (pend & XIIC_INTR_RX_FULL_MASK) {
                /* Receive register/FIFO is full */
 
-               clr = XIIC_INTR_RX_FULL_MASK;
+               clr |= XIIC_INTR_RX_FULL_MASK;
                if (!i2c->rx_msg) {
                        dev_dbg(i2c->adap.dev.parent,
                                "%s unexpexted RX IRQ\n", __func__);
@@ -441,9 +440,10 @@ static void xiic_process(struct xiic_i2c *i2c)
                                __xiic_start_xfer(i2c);
                        }
                }
-       } else if (pend & XIIC_INTR_BNB_MASK) {
+       }
+       if (pend & XIIC_INTR_BNB_MASK) {
                /* IIC bus has transitioned to not busy */
-               clr = XIIC_INTR_BNB_MASK;
+               clr |= XIIC_INTR_BNB_MASK;
 
                /* The bus is not busy, disable BusNotBusy interrupt */
                xiic_irq_dis(i2c, XIIC_INTR_BNB_MASK);
@@ -456,12 +456,12 @@ static void xiic_process(struct xiic_i2c *i2c)
                        xiic_wakeup(i2c, STATE_DONE);
                else
                        xiic_wakeup(i2c, STATE_ERROR);
-
-       } else if (pend & (XIIC_INTR_TX_EMPTY_MASK | XIIC_INTR_TX_HALF_MASK)) {
+       }
+       if (pend & (XIIC_INTR_TX_EMPTY_MASK | XIIC_INTR_TX_HALF_MASK)) {
                /* Transmit register/FIFO is empty or ½ empty */
 
-               clr = pend &
-                       (XIIC_INTR_TX_EMPTY_MASK | XIIC_INTR_TX_HALF_MASK);
+               clr |= (pend &
+                       (XIIC_INTR_TX_EMPTY_MASK | XIIC_INTR_TX_HALF_MASK));
 
                if (!i2c->tx_msg) {
                        dev_dbg(i2c->adap.dev.parent,
@@ -492,16 +492,13 @@ static void xiic_process(struct xiic_i2c *i2c)
                         * make sure to disable tx half
                         */
                        xiic_irq_dis(i2c, XIIC_INTR_TX_HALF_MASK);
-       } else {
-               /* got IRQ which is not acked */
-               dev_err(i2c->adap.dev.parent, "%s Got unexpected IRQ\n",
-                       __func__);
-               clr = pend;
        }
 out:
        dev_dbg(i2c->adap.dev.parent, "%s clr: 0x%x\n", __func__, clr);
 
        xiic_setreg32(i2c, XIIC_IISR_OFFSET, clr);
+       spin_unlock(&i2c->lock);
+       return IRQ_HANDLED;
 }
 
 static int xiic_bus_busy(struct xiic_i2c *i2c)
@@ -525,7 +522,7 @@ static int xiic_busy(struct xiic_i2c *i2c)
         */
        err = xiic_bus_busy(i2c);
        while (err && tries--) {
-               mdelay(1);
+               msleep(1);
                err = xiic_bus_busy(i2c);
        }
 
@@ -602,19 +599,21 @@ static void xiic_start_send(struct xiic_i2c *i2c)
 static irqreturn_t xiic_isr(int irq, void *dev_id)
 {
        struct xiic_i2c *i2c = dev_id;
-
-       spin_lock(&i2c->lock);
-       /* disable interrupts globally */
-       xiic_setreg32(i2c, XIIC_DGIER_OFFSET, 0);
+       u32 pend, isr, ier;
+       irqreturn_t ret = IRQ_NONE;
+       /* Do not processes a devices interrupts if the device has no
+        * interrupts pending
+        */
 
        dev_dbg(i2c->adap.dev.parent, "%s entry\n", __func__);
 
-       xiic_process(i2c);
-
-       xiic_setreg32(i2c, XIIC_DGIER_OFFSET, XIIC_GINTR_ENABLE_MASK);
-       spin_unlock(&i2c->lock);
+       isr = xiic_getreg32(i2c, XIIC_IISR_OFFSET);
+       ier = xiic_getreg32(i2c, XIIC_IIER_OFFSET);
+       pend = isr & ier;
+       if (pend)
+               ret = IRQ_WAKE_THREAD;
 
-       return IRQ_HANDLED;
+       return ret;
 }
 
 static void __xiic_start_xfer(struct xiic_i2c *i2c)
@@ -663,16 +662,8 @@ static void __xiic_start_xfer(struct xiic_i2c *i2c)
 
 static void xiic_start_xfer(struct xiic_i2c *i2c)
 {
-       unsigned long flags;
-
-       spin_lock_irqsave(&i2c->lock, flags);
-       xiic_reinit(i2c);
-       /* disable interrupts globally */
-       xiic_setreg32(i2c, XIIC_DGIER_OFFSET, 0);
-       spin_unlock_irqrestore(&i2c->lock, flags);
 
        __xiic_start_xfer(i2c);
-       xiic_setreg32(i2c, XIIC_DGIER_OFFSET, XIIC_GINTR_ENABLE_MASK);
 }
 
 static int xiic_xfer(struct i2c_adapter *adap, struct i2c_msg *msgs, int num)
@@ -755,7 +746,10 @@ static int xiic_i2c_probe(struct platform_device *pdev)
        spin_lock_init(&i2c->lock);
        init_waitqueue_head(&i2c->wait);
 
-       ret = devm_request_irq(&pdev->dev, irq, xiic_isr, 0, pdev->name, i2c);
+       ret = devm_request_threaded_irq(&pdev->dev, irq, xiic_isr,
+                                       xiic_process, IRQF_ONESHOT,
+                                       pdev->name, i2c);
+
        if (ret < 0) {
                dev_err(&pdev->dev, "Cannot claim IRQ\n");
                return ret;
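
The net effect of the xiic rework above is the standard hard-IRQ/threaded-IRQ split: the hard handler only checks whether any enabled interrupt is pending and returns IRQ_WAKE_THREAD, while the former xiic_process() body runs as the IRQF_ONESHOT thread, where it can take the lock and msleep(). A generic sketch of that split; the acme structure, register offsets and probe helper are invented for illustration and are not the XIIC driver's API:

        #include <linux/interrupt.h>
        #include <linux/io.h>
        #include <linux/platform_device.h>

        struct acme_i2c {
                void __iomem *base;
        };

        static irqreturn_t acme_hardirq(int irq, void *dev_id)
        {
                struct acme_i2c *i2c = dev_id;
                u32 pend = readl(i2c->base + 0x20) & readl(i2c->base + 0x28);

                if (!pend)
                        return IRQ_NONE;        /* nothing enabled is pending: not ours */

                return IRQ_WAKE_THREAD;         /* defer the real work to the thread */
        }

        static irqreturn_t acme_thread(int irq, void *dev_id)
        {
                struct acme_i2c *i2c = dev_id;

                /* Process context: fine to take sleeping locks or msleep() here. */
                writel(readl(i2c->base + 0x20), i2c->base + 0x20);      /* ack what fired */

                return IRQ_HANDLED;
        }

        static int acme_request_irq(struct platform_device *pdev,
                                    struct acme_i2c *i2c, int irq)
        {
                /* IRQF_ONESHOT keeps the line masked until acme_thread() returns. */
                return devm_request_threaded_irq(&pdev->dev, irq, acme_hardirq,
                                                 acme_thread, IRQF_ONESHOT,
                                                 pdev->name, i2c);
        }
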
index c83e4d13cfc5c402dfdea64df08f399ab486822b..5f89f1e3c2f24fc562a519eb173d33de8c280f42 100644 (file)
@@ -27,6 +27,7 @@
    I2C slave support (c) 2014 by Wolfram Sang <wsa@sang-engineering.com>
  */
 
+#include <dt-bindings/i2c/i2c.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/delay.h>
@@ -47,6 +48,7 @@
 #include <linux/rwsem.h>
 #include <linux/pm_runtime.h>
 #include <linux/pm_domain.h>
+#include <linux/pm_wakeirq.h>
 #include <linux/acpi.h>
 #include <linux/jump_label.h>
 #include <asm/uaccess.h>
@@ -57,6 +59,9 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/i2c.h>
 
+#define I2C_ADDR_OFFSET_TEN_BIT        0xa000
+#define I2C_ADDR_OFFSET_SLAVE  0x1000
+
 /* core_lock protects i2c_adapter_idr, and guarantees
    that device detection, deletion of detected devices, and attach_adapter
    calls are serialized */
@@ -641,11 +646,13 @@ static int i2c_device_probe(struct device *dev)
        if (!client->irq) {
                int irq = -ENOENT;
 
-               if (dev->of_node)
-                       irq = of_irq_get(dev->of_node, 0);
-               else if (ACPI_COMPANION(dev))
+               if (dev->of_node) {
+                       irq = of_irq_get_byname(dev->of_node, "irq");
+                       if (irq == -EINVAL || irq == -ENODATA)
+                               irq = of_irq_get(dev->of_node, 0);
+               } else if (ACPI_COMPANION(dev)) {
                        irq = acpi_dev_gpio_irq_get(ACPI_COMPANION(dev), 0);
-
+               }
                if (irq == -EPROBE_DEFER)
                        return irq;
                if (irq < 0)
@@ -658,23 +665,49 @@ static int i2c_device_probe(struct device *dev)
        if (!driver->probe || !driver->id_table)
                return -ENODEV;
 
-       if (!device_can_wakeup(&client->dev))
-               device_init_wakeup(&client->dev,
-                                       client->flags & I2C_CLIENT_WAKE);
+       if (client->flags & I2C_CLIENT_WAKE) {
+               int wakeirq = -ENOENT;
+
+               if (dev->of_node) {
+                       wakeirq = of_irq_get_byname(dev->of_node, "wakeup");
+                       if (wakeirq == -EPROBE_DEFER)
+                               return wakeirq;
+               }
+
+               device_init_wakeup(&client->dev, true);
+
+               if (wakeirq > 0 && wakeirq != client->irq)
+                       status = dev_pm_set_dedicated_wake_irq(dev, wakeirq);
+               else if (client->irq > 0)
+                       status = dev_pm_set_wake_irq(dev, wakeirq);
+               else
+                       status = 0;
+
+               if (status)
+                       dev_warn(&client->dev, "failed to set up wakeup irq");
+       }
+
        dev_dbg(dev, "probe\n");
 
        status = of_clk_set_defaults(dev->of_node, false);
        if (status < 0)
-               return status;
+               goto err_clear_wakeup_irq;
 
        status = dev_pm_domain_attach(&client->dev, true);
        if (status != -EPROBE_DEFER) {
                status = driver->probe(client, i2c_match_id(driver->id_table,
                                        client));
                if (status)
-                       dev_pm_domain_detach(&client->dev, true);
+                       goto err_detach_pm_domain;
        }
 
+       return 0;
+
+err_detach_pm_domain:
+       dev_pm_domain_detach(&client->dev, true);
+err_clear_wakeup_irq:
+       dev_pm_clear_wake_irq(&client->dev);
+       device_init_wakeup(&client->dev, false);
        return status;
 }
 
@@ -694,6 +727,10 @@ static int i2c_device_remove(struct device *dev)
        }
 
        dev_pm_domain_detach(&client->dev, true);
+
+       dev_pm_clear_wake_irq(&client->dev);
+       device_init_wakeup(&client->dev, false);
+
        return status;
 }
 
@@ -778,17 +815,32 @@ struct i2c_client *i2c_verify_client(struct device *dev)
 EXPORT_SYMBOL(i2c_verify_client);
 
 
+/* Return a unique address which takes the flags of the client into account */
+static unsigned short i2c_encode_flags_to_addr(struct i2c_client *client)
+{
+       unsigned short addr = client->addr;
+
+       /* For some client flags, add an arbitrary offset to avoid collisions */
+       if (client->flags & I2C_CLIENT_TEN)
+               addr |= I2C_ADDR_OFFSET_TEN_BIT;
+
+       if (client->flags & I2C_CLIENT_SLAVE)
+               addr |= I2C_ADDR_OFFSET_SLAVE;
+
+       return addr;
+}
+
 /* This is a permissive address validity check, I2C address map constraints
  * are purposely not enforced, except for the general call address. */
-static int i2c_check_client_addr_validity(const struct i2c_client *client)
+static int i2c_check_addr_validity(unsigned addr, unsigned short flags)
 {
-       if (client->flags & I2C_CLIENT_TEN) {
+       if (flags & I2C_CLIENT_TEN) {
                /* 10-bit address, all values are valid */
-               if (client->addr > 0x3ff)
+               if (addr > 0x3ff)
                        return -EINVAL;
        } else {
                /* 7-bit address, reject the general call address */
-               if (client->addr == 0x00 || client->addr > 0x7f)
+               if (addr == 0x00 || addr > 0x7f)
                        return -EINVAL;
        }
        return 0;
@@ -798,7 +850,7 @@ static int i2c_check_client_addr_validity(const struct i2c_client *client)
  * device uses a reserved address, then it shouldn't be probed. 7-bit
  * addressing is assumed, 10-bit address devices are rare and should be
  * explicitly enumerated. */
-static int i2c_check_addr_validity(unsigned short addr)
+static int i2c_check_7bit_addr_validity_strict(unsigned short addr)
 {
        /*
         * Reserved addresses per I2C specification:
@@ -820,7 +872,7 @@ static int __i2c_check_addr_busy(struct device *dev, void *addrp)
        struct i2c_client       *client = i2c_verify_client(dev);
        int                     addr = *(int *)addrp;
 
-       if (client && client->addr == addr)
+       if (client && i2c_encode_flags_to_addr(client) == addr)
                return -EBUSY;
        return 0;
 }
@@ -923,10 +975,8 @@ static void i2c_dev_set_name(struct i2c_adapter *adap,
                return;
        }
 
-       /* For 10-bit clients, add an arbitrary offset to avoid collisions */
        dev_set_name(&client->dev, "%d-%04x", i2c_adapter_id(adap),
-                    client->addr | ((client->flags & I2C_CLIENT_TEN)
-                                    ? 0xa000 : 0));
+                    i2c_encode_flags_to_addr(client));
 }
 
 /**
@@ -968,8 +1018,7 @@ i2c_new_device(struct i2c_adapter *adap, struct i2c_board_info const *info)
 
        strlcpy(client->name, info->type, sizeof(client->name));
 
-       /* Check for address validity */
-       status = i2c_check_client_addr_validity(client);
+       status = i2c_check_addr_validity(client->addr, client->flags);
        if (status) {
                dev_err(&adap->dev, "Invalid %d-bit I2C address 0x%02hx\n",
                        client->flags & I2C_CLIENT_TEN ? 10 : 7, client->addr);
@@ -977,7 +1026,7 @@ i2c_new_device(struct i2c_adapter *adap, struct i2c_board_info const *info)
        }
 
        /* Check for address business */
-       status = i2c_check_addr_busy(adap, client->addr);
+       status = i2c_check_addr_busy(adap, i2c_encode_flags_to_addr(client));
        if (status)
                goto out_err;
 
@@ -1142,6 +1191,16 @@ i2c_sysfs_new_device(struct device *dev, struct device_attribute *attr,
                return -EINVAL;
        }
 
+       if ((info.addr & I2C_ADDR_OFFSET_TEN_BIT) == I2C_ADDR_OFFSET_TEN_BIT) {
+               info.addr &= ~I2C_ADDR_OFFSET_TEN_BIT;
+               info.flags |= I2C_CLIENT_TEN;
+       }
+
+       if (info.addr & I2C_ADDR_OFFSET_SLAVE) {
+               info.addr &= ~I2C_ADDR_OFFSET_SLAVE;
+               info.flags |= I2C_CLIENT_SLAVE;
+       }
+
        client = i2c_new_device(adap, &info);
        if (!client)
                return -EINVAL;
@@ -1193,7 +1252,7 @@ i2c_sysfs_delete_device(struct device *dev, struct device_attribute *attr,
                          i2c_adapter_depth(adap));
        list_for_each_entry_safe(client, next, &adap->userspace_clients,
                                 detected) {
-               if (client->addr == addr) {
+               if (i2c_encode_flags_to_addr(client) == addr) {
                        dev_info(dev, "%s: Deleting device %s at 0x%02hx\n",
                                 "delete_device", client->name, client->addr);
 
@@ -1273,7 +1332,8 @@ static struct i2c_client *of_i2c_register_device(struct i2c_adapter *adap,
        struct i2c_client *result;
        struct i2c_board_info info = {};
        struct dev_archdata dev_ad = {};
-       const __be32 *addr;
+       const __be32 *addr_be;
+       u32 addr;
        int len;
 
        dev_dbg(&adap->dev, "of_i2c: register %s\n", node->full_name);
@@ -1284,20 +1344,31 @@ static struct i2c_client *of_i2c_register_device(struct i2c_adapter *adap,
                return ERR_PTR(-EINVAL);
        }
 
-       addr = of_get_property(node, "reg", &len);
-       if (!addr || (len < sizeof(*addr))) {
+       addr_be = of_get_property(node, "reg", &len);
+       if (!addr_be || (len < sizeof(*addr_be))) {
                dev_err(&adap->dev, "of_i2c: invalid reg on %s\n",
                        node->full_name);
                return ERR_PTR(-EINVAL);
        }
 
-       info.addr = be32_to_cpup(addr);
-       if (info.addr > (1 << 10) - 1) {
+       addr = be32_to_cpup(addr_be);
+       if (addr & I2C_TEN_BIT_ADDRESS) {
+               addr &= ~I2C_TEN_BIT_ADDRESS;
+               info.flags |= I2C_CLIENT_TEN;
+       }
+
+       if (addr & I2C_OWN_SLAVE_ADDRESS) {
+               addr &= ~I2C_OWN_SLAVE_ADDRESS;
+               info.flags |= I2C_CLIENT_SLAVE;
+       }
+
+       if (i2c_check_addr_validity(addr, info.flags)) {
                dev_err(&adap->dev, "of_i2c: invalid addr=%x on %s\n",
                        info.addr, node->full_name);
                return ERR_PTR(-EINVAL);
        }
 
+       info.addr = addr;
        info.of_node = of_node_get(node);
        info.archdata = &dev_ad;
 
@@ -1371,6 +1442,24 @@ struct i2c_adapter *of_find_i2c_adapter_by_node(struct device_node *node)
        return adapter;
 }
 EXPORT_SYMBOL(of_find_i2c_adapter_by_node);
+
+/* must call i2c_put_adapter() when done with returned i2c_adapter device */
+struct i2c_adapter *of_get_i2c_adapter_by_node(struct device_node *node)
+{
+       struct i2c_adapter *adapter;
+
+       adapter = of_find_i2c_adapter_by_node(node);
+       if (!adapter)
+               return NULL;
+
+       if (!try_module_get(adapter->owner)) {
+               put_device(&adapter->dev);
+               adapter = NULL;
+       }
+
+       return adapter;
+}
+EXPORT_SYMBOL(of_get_i2c_adapter_by_node);
 #else
 static void of_i2c_register_devices(struct i2c_adapter *adap) { }
 #endif /* CONFIG_OF */
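
As the comment on of_get_i2c_adapter_by_node() notes, callers must balance it with i2c_put_adapter(). A minimal consumer sketch (hypothetical driver, mirroring how the mux drivers below resolve their "i2c-parent" phandle):

static struct i2c_adapter *demo_get_parent_adapter(struct device *dev)
{
        struct device_node *np;
        struct i2c_adapter *adap;

        /* Resolve the "i2c-parent" phandle and take both a device and a
         * module reference on the adapter; release both later with
         * i2c_put_adapter(). */
        np = of_parse_phandle(dev->of_node, "i2c-parent", 0);
        if (!np)
                return NULL;

        adap = of_get_i2c_adapter_by_node(np);
        of_node_put(np);

        return adap;    /* NULL means the parent bus is not ready yet */
}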
@@ -2262,14 +2351,14 @@ static int i2c_detect_address(struct i2c_client *temp_client,
        int err;
 
        /* Make sure the address is valid */
-       err = i2c_check_addr_validity(addr);
+       err = i2c_check_7bit_addr_validity_strict(addr);
        if (err) {
                dev_warn(&adapter->dev, "Invalid probe address 0x%02x\n",
                         addr);
                return err;
        }
 
-       /* Skip if already in use */
+       /* Skip if already in use (7 bit, no need to encode flags) */
        if (i2c_check_addr_busy(adapter, addr))
                return 0;
 
@@ -2379,13 +2468,13 @@ i2c_new_probed_device(struct i2c_adapter *adap,
 
        for (i = 0; addr_list[i] != I2C_CLIENT_END; i++) {
                /* Check address validity */
-               if (i2c_check_addr_validity(addr_list[i]) < 0) {
+               if (i2c_check_7bit_addr_validity_strict(addr_list[i]) < 0) {
                        dev_warn(&adap->dev, "Invalid 7-bit address "
                                 "0x%02x\n", addr_list[i]);
                        continue;
                }
 
-               /* Check address availability */
+               /* Check address availability (7 bit, no need to encode flags) */
                if (i2c_check_addr_busy(adap, addr_list[i])) {
                        dev_dbg(&adap->dev, "Address 0x%02x already in "
                                "use, not probing\n", addr_list[i]);
@@ -2413,9 +2502,15 @@ struct i2c_adapter *i2c_get_adapter(int nr)
 
        mutex_lock(&core_lock);
        adapter = idr_find(&i2c_adapter_idr, nr);
-       if (adapter && !try_module_get(adapter->owner))
+       if (!adapter)
+               goto exit;
+
+       if (try_module_get(adapter->owner))
+               get_device(&adapter->dev);
+       else
                adapter = NULL;
 
+ exit:
        mutex_unlock(&core_lock);
        return adapter;
 }
@@ -2423,8 +2518,11 @@ EXPORT_SYMBOL(i2c_get_adapter);
 
 void i2c_put_adapter(struct i2c_adapter *adap)
 {
-       if (adap)
-               module_put(adap->owner);
+       if (!adap)
+               return;
+
+       put_device(&adap->dev);
+       module_put(adap->owner);
 }
 EXPORT_SYMBOL(i2c_put_adapter);
 
@@ -2942,6 +3040,63 @@ trace:
 }
 EXPORT_SYMBOL(i2c_smbus_xfer);
 
+/**
+ * i2c_smbus_read_i2c_block_data_or_emulated - read block or emulate
+ * @client: Handle to slave device
+ * @command: Byte interpreted by slave
+ * @length: Size of data block; SMBus allows at most I2C_SMBUS_BLOCK_MAX bytes
+ * @values: Byte array into which data will be read; big enough to hold
+ *     the data returned by the slave.  SMBus allows at most
+ *     I2C_SMBUS_BLOCK_MAX bytes.
+ *
+ * This executes the SMBus "block read" protocol if supported by the adapter.
+ * If block read is not supported, it emulates it using either word or byte
+ * read protocols depending on availability.
+ *
+ * The addresses of the I2C slave device that are accessed with this function
+ * must be mapped to a linear region, so that a block read will have the same
+ * effect as a byte read. Before using this function you must double-check
+ * that the I2C slave really supports exchanging a block transfer with a byte
+ * transfer.
+ */
+s32 i2c_smbus_read_i2c_block_data_or_emulated(const struct i2c_client *client,
+                                             u8 command, u8 length, u8 *values)
+{
+       u8 i = 0;
+       int status;
+
+       if (length > I2C_SMBUS_BLOCK_MAX)
+               length = I2C_SMBUS_BLOCK_MAX;
+
+       if (i2c_check_functionality(client->adapter, I2C_FUNC_SMBUS_READ_I2C_BLOCK))
+               return i2c_smbus_read_i2c_block_data(client, command, length, values);
+
+       if (!i2c_check_functionality(client->adapter, I2C_FUNC_SMBUS_READ_BYTE_DATA))
+               return -EOPNOTSUPP;
+
+       if (i2c_check_functionality(client->adapter, I2C_FUNC_SMBUS_READ_WORD_DATA)) {
+               while ((i + 2) <= length) {
+                       status = i2c_smbus_read_word_data(client, command + i);
+                       if (status < 0)
+                               return status;
+                       values[i] = status & 0xff;
+                       values[i + 1] = status >> 8;
+                       i += 2;
+               }
+       }
+
+       while (i < length) {
+               status = i2c_smbus_read_byte_data(client, command + i);
+               if (status < 0)
+                       return status;
+               values[i] = status;
+               i++;
+       }
+
+       return i;
+}
+EXPORT_SYMBOL(i2c_smbus_read_i2c_block_data_or_emulated);
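
A hedged usage sketch of the new helper (the client, the 0x10 command and the 16-byte window are made up) showing the intended drop-in use for a block read:

static int demo_read_window(struct i2c_client *client, u8 *buf)
{
        s32 ret;

        /* Reads 16 bytes starting at command 0x10; falls back to word or
         * byte reads on adapters without I2C block read support. */
        ret = i2c_smbus_read_i2c_block_data_or_emulated(client, 0x10, 16, buf);
        if (ret < 0)
                return ret;             /* bus or adapter error */

        return ret == 16 ? 0 : -EIO;    /* short read */
}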
+
 #if IS_ENABLED(CONFIG_I2C_SLAVE)
 int i2c_slave_register(struct i2c_client *client, i2c_slave_cb_t slave_cb)
 {
@@ -2952,9 +3107,13 @@ int i2c_slave_register(struct i2c_client *client, i2c_slave_cb_t slave_cb)
                return -EINVAL;
        }
 
+       if (!(client->flags & I2C_CLIENT_SLAVE))
+               dev_warn(&client->dev, "%s: client slave flag not set. You might see address collisions\n",
+                        __func__);
+
        if (!(client->flags & I2C_CLIENT_TEN)) {
                /* Enforce stricter address checking */
-               ret = i2c_check_addr_validity(client->addr);
+               ret = i2c_check_7bit_addr_validity_strict(client->addr);
                if (ret) {
                        dev_err(&client->dev, "%s: invalid address\n", __func__);
                        return ret;
index 1da44961477953038e78409169f80a3f4884f89a..b2039f94c9d89e3c35b0f50f14c3ed4fe30ad50f 100644 (file)
@@ -157,7 +157,6 @@ MODULE_DEVICE_TABLE(i2c, i2c_slave_eeprom_id);
 static struct i2c_driver i2c_slave_eeprom_driver = {
        .driver = {
                .name = "i2c-slave-eeprom",
-               .owner = THIS_MODULE,
        },
        .probe = i2c_slave_eeprom_probe,
        .remove = i2c_slave_eeprom_remove,
index fdd0769c84a31a02a14d1b7f43dda059efa752b9..f06b0e24673b8732efdc2640246dae6c73809a13 100644 (file)
@@ -61,4 +61,15 @@ config I2C_MUX_PINCTRL
          This driver can also be built as a module. If so, the module will be
          called pinctrl-i2cmux.
 
+config I2C_MUX_REG
+       tristate "Register-based I2C multiplexer"
+       help
+         If you say yes to this option, support will be included for a
+         register-based I2C multiplexer. This driver provides access to
+         I2C busses connected through a MUX, which is controlled
+         by a single register.
+
+         This driver can also be built as a module.  If so, the module
+         will be called i2c-mux-reg.
+
 endmenu
index 465778b5d5dc864d3a97262308df7c35368aec4a..e89799b76a92807d19fd29ed53386708a98a07f8 100644 (file)
@@ -7,5 +7,6 @@ obj-$(CONFIG_I2C_MUX_GPIO)      += i2c-mux-gpio.o
 obj-$(CONFIG_I2C_MUX_PCA9541)  += i2c-mux-pca9541.o
 obj-$(CONFIG_I2C_MUX_PCA954x)  += i2c-mux-pca954x.o
 obj-$(CONFIG_I2C_MUX_PINCTRL)  += i2c-mux-pinctrl.o
+obj-$(CONFIG_I2C_MUX_REG)      += i2c-mux-reg.o
 
 ccflags-$(CONFIG_I2C_DEBUG_BUS) := -DDEBUG
index 5cf1b60b69e2140e674c7245577971c12c3e65b5..402e3a6c671a11ac628d502b7a2833654cd66d68 100644 (file)
@@ -196,7 +196,8 @@ static int i2c_arbitrator_probe(struct platform_device *pdev)
                dev_err(dev, "Cannot parse i2c-parent\n");
                return -EINVAL;
        }
-       arb->parent = of_find_i2c_adapter_by_node(parent_np);
+       arb->parent = of_get_i2c_adapter_by_node(parent_np);
+       of_node_put(parent_np);
        if (!arb->parent) {
                dev_err(dev, "Cannot find parent bus\n");
                return -EPROBE_DEFER;
index 70db99264339ef5c7d6ffa0df9c5b1fdda9dfeaf..b8e11c16d98c609dc653855d57ceb96067e2362f 100644 (file)
@@ -76,6 +76,7 @@ static int i2c_mux_gpio_probe_dt(struct gpiomux *mux,
                return -ENODEV;
        }
        adapter = of_find_i2c_adapter_by_node(adapter_np);
+       of_node_put(adapter_np);
        if (!adapter)
                return -EPROBE_DEFER;
 
index 0c8d4d2cbdaf42a5844a19b13acfd652a3b2f3a9..d0ba424adebc80a3a44a05879f9bb131f5362bfd 100644 (file)
@@ -386,7 +386,6 @@ static int pca9541_remove(struct i2c_client *client)
 static struct i2c_driver pca9541_driver = {
        .driver = {
                   .name = "pca9541",
-                  .owner = THIS_MODULE,
                   },
        .probe = pca9541_probe,
        .remove = pca9541_remove,
index ea4aa9dfcea9674f412cdfae21d8735f2885d650..acfcef3d4068549cd22ebebf5115f8642ad10438 100644 (file)
@@ -300,7 +300,6 @@ static struct i2c_driver pca954x_driver = {
        .driver         = {
                .name   = "pca954x",
                .pm     = &pca954x_pm,
-               .owner  = THIS_MODULE,
        },
        .probe          = pca954x_probe,
        .remove         = pca954x_remove,
index b48378c4b40d64bf707002f6140cc8bd2a0d6da2..b5a982ba88986d67f24802acfd2df6845691078d 100644 (file)
@@ -111,6 +111,7 @@ static int i2c_mux_pinctrl_parse_dt(struct i2c_mux_pinctrl *mux,
                return -ENODEV;
        }
        adapter = of_find_i2c_adapter_by_node(adapter_np);
+       of_node_put(adapter_np);
        if (!adapter) {
                dev_err(mux->dev, "Cannot find parent bus\n");
                return -EPROBE_DEFER;
diff --git a/drivers/i2c/muxes/i2c-mux-reg.c b/drivers/i2c/muxes/i2c-mux-reg.c
new file mode 100644 (file)
index 0000000..5fbd5bd
--- /dev/null
@@ -0,0 +1,290 @@
+/*
+ * I2C multiplexer using a single register
+ *
+ * Copyright 2015 Freescale Semiconductor
+ * York Sun  <yorksun@freescale.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ */
+
+#include <linux/i2c.h>
+#include <linux/i2c-mux.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/of_address.h>
+#include <linux/platform_data/i2c-mux-reg.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+
+struct regmux {
+       struct i2c_adapter *parent;
+       struct i2c_adapter **adap; /* child busses */
+       struct i2c_mux_reg_platform_data data;
+};
+
+static int i2c_mux_reg_set(const struct regmux *mux, unsigned int chan_id)
+{
+       if (!mux->data.reg)
+               return -EINVAL;
+
+       /*
+        * Write to the register, followed by a read to ensure the write is
+        * completed on a "posted" bus, for example PCI or write buffers.
+        * The endianness of reading doesn't matter and the return data
+        * is not used.
+        */
+       switch (mux->data.reg_size) {
+       case 4:
+               if (mux->data.little_endian)
+                       iowrite32(chan_id, mux->data.reg);
+               else
+                       iowrite32be(chan_id, mux->data.reg);
+               if (!mux->data.write_only)
+                       ioread32(mux->data.reg);
+               break;
+       case 2:
+               if (mux->data.little_endian)
+                       iowrite16(chan_id, mux->data.reg);
+               else
+                       iowrite16be(chan_id, mux->data.reg);
+               if (!mux->data.write_only)
+                       ioread16(mux->data.reg);
+               break;
+       case 1:
+               iowrite8(chan_id, mux->data.reg);
+               if (!mux->data.write_only)
+                       ioread8(mux->data.reg);
+               break;
+       }
+
+       return 0;
+}
+
+static int i2c_mux_reg_select(struct i2c_adapter *adap, void *data,
+                             unsigned int chan)
+{
+       struct regmux *mux = data;
+
+       return i2c_mux_reg_set(mux, chan);
+}
+
+static int i2c_mux_reg_deselect(struct i2c_adapter *adap, void *data,
+                               unsigned int chan)
+{
+       struct regmux *mux = data;
+
+       if (mux->data.idle_in_use)
+               return i2c_mux_reg_set(mux, mux->data.idle);
+
+       return 0;
+}
+
+#ifdef CONFIG_OF
+static int i2c_mux_reg_probe_dt(struct regmux *mux,
+                                       struct platform_device *pdev)
+{
+       struct device_node *np = pdev->dev.of_node;
+       struct device_node *adapter_np, *child;
+       struct i2c_adapter *adapter;
+       struct resource res;
+       unsigned *values;
+       int i = 0;
+
+       if (!np)
+               return -ENODEV;
+
+       adapter_np = of_parse_phandle(np, "i2c-parent", 0);
+       if (!adapter_np) {
+               dev_err(&pdev->dev, "Cannot parse i2c-parent\n");
+               return -ENODEV;
+       }
+       adapter = of_find_i2c_adapter_by_node(adapter_np);
+       of_node_put(adapter_np);
+       if (!adapter)
+               return -EPROBE_DEFER;
+
+       mux->parent = adapter;
+       mux->data.parent = i2c_adapter_id(adapter);
+       put_device(&adapter->dev);
+
+       mux->data.n_values = of_get_child_count(np);
+       if (of_find_property(np, "little-endian", NULL)) {
+               mux->data.little_endian = true;
+       } else if (of_find_property(np, "big-endian", NULL)) {
+               mux->data.little_endian = false;
+       } else {
+#if defined(__BYTE_ORDER) ? __BYTE_ORDER == __LITTLE_ENDIAN : \
+       defined(__LITTLE_ENDIAN)
+               mux->data.little_endian = true;
+#elif defined(__BYTE_ORDER) ? __BYTE_ORDER == __BIG_ENDIAN : \
+       defined(__BIG_ENDIAN)
+               mux->data.little_endian = false;
+#else
+#error Endianness not defined?
+#endif
+       }
+       if (of_find_property(np, "write-only", NULL))
+               mux->data.write_only = true;
+       else
+               mux->data.write_only = false;
+
+       values = devm_kzalloc(&pdev->dev,
+                             sizeof(*mux->data.values) * mux->data.n_values,
+                             GFP_KERNEL);
+       if (!values) {
+               dev_err(&pdev->dev, "Cannot allocate values array");
+               return -ENOMEM;
+       }
+
+       for_each_child_of_node(np, child) {
+               of_property_read_u32(child, "reg", values + i);
+               i++;
+       }
+       mux->data.values = values;
+
+       if (!of_property_read_u32(np, "idle-state", &mux->data.idle))
+               mux->data.idle_in_use = true;
+
+       /* map address from "reg" if exists */
+       if (of_address_to_resource(np, 0, &res)) {
+               mux->data.reg_size = resource_size(&res);
+               mux->data.reg = devm_ioremap_resource(&pdev->dev, &res);
+               if (IS_ERR(mux->data.reg))
+                       return PTR_ERR(mux->data.reg);
+       }
+
+       return 0;
+}
+#else
+static int i2c_mux_reg_probe_dt(struct regmux *mux,
+                                       struct platform_device *pdev)
+{
+       return 0;
+}
+#endif
+
+static int i2c_mux_reg_probe(struct platform_device *pdev)
+{
+       struct regmux *mux;
+       struct i2c_adapter *parent;
+       struct resource *res;
+       int (*deselect)(struct i2c_adapter *, void *, u32);
+       unsigned int class;
+       int i, ret, nr;
+
+       mux = devm_kzalloc(&pdev->dev, sizeof(*mux), GFP_KERNEL);
+       if (!mux)
+               return -ENOMEM;
+
+       platform_set_drvdata(pdev, mux);
+
+       if (dev_get_platdata(&pdev->dev)) {
+               memcpy(&mux->data, dev_get_platdata(&pdev->dev),
+                       sizeof(mux->data));
+
+               parent = i2c_get_adapter(mux->data.parent);
+               if (!parent)
+                       return -EPROBE_DEFER;
+
+               mux->parent = parent;
+       } else {
+               ret = i2c_mux_reg_probe_dt(mux, pdev);
+               if (ret < 0) {
+                       dev_err(&pdev->dev, "Error parsing device tree");
+                       return ret;
+               }
+       }
+
+       if (!mux->data.reg) {
+               dev_info(&pdev->dev,
+                       "Register not set, using platform resource\n");
+               res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+               mux->data.reg_size = resource_size(res);
+               mux->data.reg = devm_ioremap_resource(&pdev->dev, res);
+               if (IS_ERR(mux->data.reg))
+                       return PTR_ERR(mux->data.reg);
+       }
+
+       if (mux->data.reg_size != 4 && mux->data.reg_size != 2 &&
+           mux->data.reg_size != 1) {
+               dev_err(&pdev->dev, "Invalid register size\n");
+               return -EINVAL;
+       }
+
+       mux->adap = devm_kzalloc(&pdev->dev,
+                                sizeof(*mux->adap) * mux->data.n_values,
+                                GFP_KERNEL);
+       if (!mux->adap) {
+               dev_err(&pdev->dev, "Cannot allocate i2c_adapter structure");
+               return -ENOMEM;
+       }
+
+       if (mux->data.idle_in_use)
+               deselect = i2c_mux_reg_deselect;
+       else
+               deselect = NULL;
+
+       for (i = 0; i < mux->data.n_values; i++) {
+               nr = mux->data.base_nr ? (mux->data.base_nr + i) : 0;
+               class = mux->data.classes ? mux->data.classes[i] : 0;
+
+               mux->adap[i] = i2c_add_mux_adapter(mux->parent, &pdev->dev, mux,
+                                                  nr, mux->data.values[i],
+                                                  class, i2c_mux_reg_select,
+                                                  deselect);
+               if (!mux->adap[i]) {
+                       ret = -ENODEV;
+                       dev_err(&pdev->dev, "Failed to add adapter %d\n", i);
+                       goto add_adapter_failed;
+               }
+       }
+
+       dev_dbg(&pdev->dev, "%d port mux on %s adapter\n",
+                mux->data.n_values, mux->parent->name);
+
+       return 0;
+
+add_adapter_failed:
+       for (; i > 0; i--)
+               i2c_del_mux_adapter(mux->adap[i - 1]);
+
+       return ret;
+}
+
+static int i2c_mux_reg_remove(struct platform_device *pdev)
+{
+       struct regmux *mux = platform_get_drvdata(pdev);
+       int i;
+
+       for (i = 0; i < mux->data.n_values; i++)
+               i2c_del_mux_adapter(mux->adap[i]);
+
+       i2c_put_adapter(mux->parent);
+
+       return 0;
+}
+
+static const struct of_device_id i2c_mux_reg_of_match[] = {
+       { .compatible = "i2c-mux-reg", },
+       {},
+};
+MODULE_DEVICE_TABLE(of, i2c_mux_reg_of_match);
+
+static struct platform_driver i2c_mux_reg_driver = {
+       .probe  = i2c_mux_reg_probe,
+       .remove = i2c_mux_reg_remove,
+       .driver = {
+               .name   = "i2c-mux-reg",
+       },
+};
+
+module_platform_driver(i2c_mux_reg_driver);
+
+MODULE_DESCRIPTION("Register-based I2C multiplexer driver");
+MODULE_AUTHOR("York Sun <yorksun@freescale.com>");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:i2c-mux-reg");
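
The probe path above accepts platform data as an alternative to a device-tree node. A board-file sketch, assuming struct i2c_mux_reg_platform_data exposes exactly the fields the driver dereferences (all values hypothetical):

#include <linux/kernel.h>
#include <linux/platform_data/i2c-mux-reg.h>

/* Two child channels selected by writing 1 or 2 into an 8-bit register;
 * the register itself would come from IORESOURCE_MEM 0 of the platform
 * device, since .reg is left unset here. */
static unsigned int demo_mux_values[] = { 1, 2 };

static struct i2c_mux_reg_platform_data demo_mux_pdata = {
        .parent         = 0,    /* adapter number of the upstream I2C bus */
        .values         = demo_mux_values,
        .n_values       = ARRAY_SIZE(demo_mux_values),
        .reg_size       = 1,
        .write_only     = true,
        .idle           = 0,    /* written back on deselect */
        .idle_in_use    = true,
};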
index b899531498eb0dc7924e4587f67dbc3f7c116313..aa26f3c3416bbbcd04dac1ec5381fcb081105d5e 100644 (file)
@@ -55,10 +55,7 @@ config INFINIBAND_ADDR_TRANS
        default y
 
 source "drivers/infiniband/hw/mthca/Kconfig"
-source "drivers/infiniband/hw/ipath/Kconfig"
 source "drivers/infiniband/hw/qib/Kconfig"
-source "drivers/infiniband/hw/ehca/Kconfig"
-source "drivers/infiniband/hw/amso1100/Kconfig"
 source "drivers/infiniband/hw/cxgb3/Kconfig"
 source "drivers/infiniband/hw/cxgb4/Kconfig"
 source "drivers/infiniband/hw/mlx4/Kconfig"
index acf73676444593704267ac9176696f95caa52335..d43a8994ac5c129d201b0f164442c618192eea9d 100644 (file)
@@ -9,7 +9,8 @@ obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \
                                        $(user_access-y)
 
 ib_core-y :=                   packer.o ud_header.o verbs.o sysfs.o \
-                               device.o fmr_pool.o cache.o netlink.o
+                               device.o fmr_pool.o cache.o netlink.o \
+                               roce_gid_mgmt.o
 ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
 ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o
 
index 871da832d016a7a9b6305047f6e512bc2a936b68..8f66c67ff0df09380dc7c486dce44d92efac1f18 100644 (file)
@@ -37,6 +37,8 @@
 #include <linux/errno.h>
 #include <linux/slab.h>
 #include <linux/workqueue.h>
+#include <linux/netdevice.h>
+#include <net/addrconf.h>
 
 #include <rdma/ib_cache.h>
 
@@ -47,76 +49,621 @@ struct ib_pkey_cache {
        u16             table[0];
 };
 
-struct ib_gid_cache {
-       int             table_len;
-       union ib_gid    table[0];
-};
-
 struct ib_update_work {
        struct work_struct work;
        struct ib_device  *device;
        u8                 port_num;
 };
 
-int ib_get_cached_gid(struct ib_device *device,
-                     u8                port_num,
-                     int               index,
-                     union ib_gid     *gid)
+union ib_gid zgid;
+EXPORT_SYMBOL(zgid);
+
+static const struct ib_gid_attr zattr;
+
+enum gid_attr_find_mask {
+       GID_ATTR_FIND_MASK_GID          = 1UL << 0,
+       GID_ATTR_FIND_MASK_NETDEV       = 1UL << 1,
+       GID_ATTR_FIND_MASK_DEFAULT      = 1UL << 2,
+};
+
+enum gid_table_entry_props {
+       GID_TABLE_ENTRY_INVALID         = 1UL << 0,
+       GID_TABLE_ENTRY_DEFAULT         = 1UL << 1,
+};
+
+enum gid_table_write_action {
+       GID_TABLE_WRITE_ACTION_ADD,
+       GID_TABLE_WRITE_ACTION_DEL,
+       /* MODIFY only updates the GID table. Currently only used by
+        * ib_cache_update.
+        */
+       GID_TABLE_WRITE_ACTION_MODIFY
+};
+
+struct ib_gid_table_entry {
+       /* This lock protects an entry from being
+        * read and written simultaneously.
+        */
+       rwlock_t            lock;
+       unsigned long       props;
+       union ib_gid        gid;
+       struct ib_gid_attr  attr;
+       void               *context;
+};
+
+struct ib_gid_table {
+       int                  sz;
+       /* In RoCE, adding a GID to the table requires:
+        * (a) Find if this GID already exists.
+        * (b) Find a free space.
+        * (c) Write the new GID
+        *
+        * Delete requires different set of operations:
+        * (a) Find the GID
+        * (b) Delete it.
+        *
+        * Add/delete should be carried out atomically.
+        * This is done by taking this mutex in every
+        * writer. We don't need this lock for IB, as the MAD
+        * layer replaces all entries. All data_vec entries
+        * are locked by this lock.
+        */
+       struct mutex         lock;
+       struct ib_gid_table_entry *data_vec;
+};
+
+static int write_gid(struct ib_device *ib_dev, u8 port,
+                    struct ib_gid_table *table, int ix,
+                    const union ib_gid *gid,
+                    const struct ib_gid_attr *attr,
+                    enum gid_table_write_action action,
+                    bool  default_gid)
 {
-       struct ib_gid_cache *cache;
+       int ret = 0;
+       struct net_device *old_net_dev;
        unsigned long flags;
+
+       /* In the rdma_cap_roce_gid_table() case, this function should be
+        * protected by a sleep-able lock.
+        */
+       write_lock_irqsave(&table->data_vec[ix].lock, flags);
+
+       if (rdma_cap_roce_gid_table(ib_dev, port)) {
+               table->data_vec[ix].props |= GID_TABLE_ENTRY_INVALID;
+               write_unlock_irqrestore(&table->data_vec[ix].lock, flags);
+               /* GID_TABLE_WRITE_ACTION_MODIFY currently isn't supported by
+                * RoCE providers and thus only updates the cache.
+                */
+               if (action == GID_TABLE_WRITE_ACTION_ADD)
+                       ret = ib_dev->add_gid(ib_dev, port, ix, gid, attr,
+                                             &table->data_vec[ix].context);
+               else if (action == GID_TABLE_WRITE_ACTION_DEL)
+                       ret = ib_dev->del_gid(ib_dev, port, ix,
+                                             &table->data_vec[ix].context);
+               write_lock_irqsave(&table->data_vec[ix].lock, flags);
+       }
+
+       old_net_dev = table->data_vec[ix].attr.ndev;
+       if (old_net_dev && old_net_dev != attr->ndev)
+               dev_put(old_net_dev);
+       /* if modify_gid failed, just delete the old gid */
+       if (ret || action == GID_TABLE_WRITE_ACTION_DEL) {
+               gid = &zgid;
+               attr = &zattr;
+               table->data_vec[ix].context = NULL;
+       }
+       if (default_gid)
+               table->data_vec[ix].props |= GID_TABLE_ENTRY_DEFAULT;
+       memcpy(&table->data_vec[ix].gid, gid, sizeof(*gid));
+       memcpy(&table->data_vec[ix].attr, attr, sizeof(*attr));
+       if (table->data_vec[ix].attr.ndev &&
+           table->data_vec[ix].attr.ndev != old_net_dev)
+               dev_hold(table->data_vec[ix].attr.ndev);
+
+       table->data_vec[ix].props &= ~GID_TABLE_ENTRY_INVALID;
+
+       write_unlock_irqrestore(&table->data_vec[ix].lock, flags);
+
+       if (!ret && rdma_cap_roce_gid_table(ib_dev, port)) {
+               struct ib_event event;
+
+               event.device            = ib_dev;
+               event.element.port_num  = port;
+               event.event             = IB_EVENT_GID_CHANGE;
+
+               ib_dispatch_event(&event);
+       }
+       return ret;
+}
+
+static int add_gid(struct ib_device *ib_dev, u8 port,
+                  struct ib_gid_table *table, int ix,
+                  const union ib_gid *gid,
+                  const struct ib_gid_attr *attr,
+                  bool  default_gid) {
+       return write_gid(ib_dev, port, table, ix, gid, attr,
+                        GID_TABLE_WRITE_ACTION_ADD, default_gid);
+}
+
+static int modify_gid(struct ib_device *ib_dev, u8 port,
+                     struct ib_gid_table *table, int ix,
+                     const union ib_gid *gid,
+                     const struct ib_gid_attr *attr,
+                     bool  default_gid) {
+       return write_gid(ib_dev, port, table, ix, gid, attr,
+                        GID_TABLE_WRITE_ACTION_MODIFY, default_gid);
+}
+
+static int del_gid(struct ib_device *ib_dev, u8 port,
+                  struct ib_gid_table *table, int ix,
+                  bool  default_gid) {
+       return write_gid(ib_dev, port, table, ix, &zgid, &zattr,
+                        GID_TABLE_WRITE_ACTION_DEL, default_gid);
+}
+
+static int find_gid(struct ib_gid_table *table, const union ib_gid *gid,
+                   const struct ib_gid_attr *val, bool default_gid,
+                   unsigned long mask)
+{
+       int i;
+
+       for (i = 0; i < table->sz; i++) {
+               unsigned long flags;
+               struct ib_gid_attr *attr = &table->data_vec[i].attr;
+
+               read_lock_irqsave(&table->data_vec[i].lock, flags);
+
+               if (table->data_vec[i].props & GID_TABLE_ENTRY_INVALID)
+                       goto next;
+
+               if (mask & GID_ATTR_FIND_MASK_GID &&
+                   memcmp(gid, &table->data_vec[i].gid, sizeof(*gid)))
+                       goto next;
+
+               if (mask & GID_ATTR_FIND_MASK_NETDEV &&
+                   attr->ndev != val->ndev)
+                       goto next;
+
+               if (mask & GID_ATTR_FIND_MASK_DEFAULT &&
+                   !!(table->data_vec[i].props & GID_TABLE_ENTRY_DEFAULT) !=
+                   default_gid)
+                       goto next;
+
+               read_unlock_irqrestore(&table->data_vec[i].lock, flags);
+               return i;
+next:
+               read_unlock_irqrestore(&table->data_vec[i].lock, flags);
+       }
+
+       return -1;
+}
+
+static void make_default_gid(struct  net_device *dev, union ib_gid *gid)
+{
+       gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL);
+       addrconf_ifid_eui48(&gid->raw[8], dev);
+}
+
+int ib_cache_gid_add(struct ib_device *ib_dev, u8 port,
+                    union ib_gid *gid, struct ib_gid_attr *attr)
+{
+       struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+       struct ib_gid_table *table;
+       int ix;
        int ret = 0;
+       struct net_device *idev;
 
-       if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device))
+       table = ports_table[port - rdma_start_port(ib_dev)];
+
+       if (!memcmp(gid, &zgid, sizeof(*gid)))
                return -EINVAL;
 
-       read_lock_irqsave(&device->cache.lock, flags);
+       if (ib_dev->get_netdev) {
+               idev = ib_dev->get_netdev(ib_dev, port);
+               if (idev && attr->ndev != idev) {
+                       union ib_gid default_gid;
 
-       cache = device->cache.gid_cache[port_num - rdma_start_port(device)];
+                       /* Adding default GIDs is not permitted */
+                       make_default_gid(idev, &default_gid);
+                       if (!memcmp(gid, &default_gid, sizeof(*gid))) {
+                               dev_put(idev);
+                               return -EPERM;
+                       }
+               }
+               if (idev)
+                       dev_put(idev);
+       }
 
-       if (index < 0 || index >= cache->table_len)
-               ret = -EINVAL;
-       else
-               *gid = cache->table[index];
+       mutex_lock(&table->lock);
 
-       read_unlock_irqrestore(&device->cache.lock, flags);
+       ix = find_gid(table, gid, attr, false, GID_ATTR_FIND_MASK_GID |
+                     GID_ATTR_FIND_MASK_NETDEV);
+       if (ix >= 0)
+               goto out_unlock;
 
+       ix = find_gid(table, &zgid, NULL, false, GID_ATTR_FIND_MASK_GID |
+                     GID_ATTR_FIND_MASK_DEFAULT);
+       if (ix < 0) {
+               ret = -ENOSPC;
+               goto out_unlock;
+       }
+
+       add_gid(ib_dev, port, table, ix, gid, attr, false);
+
+out_unlock:
+       mutex_unlock(&table->lock);
        return ret;
 }
-EXPORT_SYMBOL(ib_get_cached_gid);
 
-int ib_find_cached_gid(struct ib_device   *device,
-                      const union ib_gid *gid,
-                      u8                 *port_num,
-                      u16                *index)
+int ib_cache_gid_del(struct ib_device *ib_dev, u8 port,
+                    union ib_gid *gid, struct ib_gid_attr *attr)
 {
-       struct ib_gid_cache *cache;
+       struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+       struct ib_gid_table *table;
+       int ix;
+
+       table = ports_table[port - rdma_start_port(ib_dev)];
+
+       mutex_lock(&table->lock);
+
+       ix = find_gid(table, gid, attr, false,
+                     GID_ATTR_FIND_MASK_GID      |
+                     GID_ATTR_FIND_MASK_NETDEV   |
+                     GID_ATTR_FIND_MASK_DEFAULT);
+       if (ix < 0)
+               goto out_unlock;
+
+       del_gid(ib_dev, port, table, ix, false);
+
+out_unlock:
+       mutex_unlock(&table->lock);
+       return 0;
+}
+
+int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port,
+                                    struct net_device *ndev)
+{
+       struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+       struct ib_gid_table *table;
+       int ix;
+
+       table  = ports_table[port - rdma_start_port(ib_dev)];
+
+       mutex_lock(&table->lock);
+
+       for (ix = 0; ix < table->sz; ix++)
+               if (table->data_vec[ix].attr.ndev == ndev)
+                       del_gid(ib_dev, port, table, ix, false);
+
+       mutex_unlock(&table->lock);
+       return 0;
+}
+
+static int __ib_cache_gid_get(struct ib_device *ib_dev, u8 port, int index,
+                             union ib_gid *gid, struct ib_gid_attr *attr)
+{
+       struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+       struct ib_gid_table *table;
        unsigned long flags;
-       int p, i;
-       int ret = -ENOENT;
 
-       *port_num = -1;
-       if (index)
-               *index = -1;
+       table = ports_table[port - rdma_start_port(ib_dev)];
 
-       read_lock_irqsave(&device->cache.lock, flags);
+       if (index < 0 || index >= table->sz)
+               return -EINVAL;
 
-       for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) {
-               cache = device->cache.gid_cache[p];
-               for (i = 0; i < cache->table_len; ++i) {
-                       if (!memcmp(gid, &cache->table[i], sizeof *gid)) {
-                               *port_num = p + rdma_start_port(device);
-                               if (index)
-                                       *index = i;
-                               ret = 0;
-                               goto found;
-                       }
+       read_lock_irqsave(&table->data_vec[index].lock, flags);
+       if (table->data_vec[index].props & GID_TABLE_ENTRY_INVALID) {
+               read_unlock_irqrestore(&table->data_vec[index].lock, flags);
+               return -EAGAIN;
+       }
+
+       memcpy(gid, &table->data_vec[index].gid, sizeof(*gid));
+       if (attr) {
+               memcpy(attr, &table->data_vec[index].attr, sizeof(*attr));
+               if (attr->ndev)
+                       dev_hold(attr->ndev);
+       }
+
+       read_unlock_irqrestore(&table->data_vec[index].lock, flags);
+       return 0;
+}
+
+static int _ib_cache_gid_table_find(struct ib_device *ib_dev,
+                                   const union ib_gid *gid,
+                                   const struct ib_gid_attr *val,
+                                   unsigned long mask,
+                                   u8 *port, u16 *index)
+{
+       struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+       struct ib_gid_table *table;
+       u8 p;
+       int local_index;
+
+       for (p = 0; p < ib_dev->phys_port_cnt; p++) {
+               table = ports_table[p];
+               local_index = find_gid(table, gid, val, false, mask);
+               if (local_index >= 0) {
+                       if (index)
+                               *index = local_index;
+                       if (port)
+                               *port = p + rdma_start_port(ib_dev);
+                       return 0;
                }
        }
-found:
-       read_unlock_irqrestore(&device->cache.lock, flags);
 
-       return ret;
+       return -ENOENT;
+}
+
+static int ib_cache_gid_find(struct ib_device *ib_dev,
+                            const union ib_gid *gid,
+                            struct net_device *ndev, u8 *port,
+                            u16 *index)
+{
+       unsigned long mask = GID_ATTR_FIND_MASK_GID;
+       struct ib_gid_attr gid_attr_val = {.ndev = ndev};
+
+       if (ndev)
+               mask |= GID_ATTR_FIND_MASK_NETDEV;
+
+       return _ib_cache_gid_table_find(ib_dev, gid, &gid_attr_val,
+                                       mask, port, index);
+}
+
+int ib_cache_gid_find_by_port(struct ib_device *ib_dev,
+                             const union ib_gid *gid,
+                             u8 port, struct net_device *ndev,
+                             u16 *index)
+{
+       int local_index;
+       struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+       struct ib_gid_table *table;
+       unsigned long mask = GID_ATTR_FIND_MASK_GID;
+       struct ib_gid_attr val = {.ndev = ndev};
+
+       if (port < rdma_start_port(ib_dev) ||
+           port > rdma_end_port(ib_dev))
+               return -ENOENT;
+
+       table = ports_table[port - rdma_start_port(ib_dev)];
+
+       if (ndev)
+               mask |= GID_ATTR_FIND_MASK_NETDEV;
+
+       local_index = find_gid(table, gid, &val, false, mask);
+       if (local_index >= 0) {
+               if (index)
+                       *index = local_index;
+               return 0;
+       }
+
+       return -ENOENT;
+}
+
+static struct ib_gid_table *alloc_gid_table(int sz)
+{
+       unsigned int i;
+       struct ib_gid_table *table =
+               kzalloc(sizeof(struct ib_gid_table), GFP_KERNEL);
+       if (!table)
+               return NULL;
+
+       table->data_vec = kcalloc(sz, sizeof(*table->data_vec), GFP_KERNEL);
+       if (!table->data_vec)
+               goto err_free_table;
+
+       mutex_init(&table->lock);
+
+       table->sz = sz;
+
+       for (i = 0; i < sz; i++)
+               rwlock_init(&table->data_vec[i].lock);
+
+       return table;
+
+err_free_table:
+       kfree(table);
+       return NULL;
+}
+
+static void release_gid_table(struct ib_gid_table *table)
+{
+       if (table) {
+               kfree(table->data_vec);
+               kfree(table);
+       }
+}
+
+static void cleanup_gid_table_port(struct ib_device *ib_dev, u8 port,
+                                  struct ib_gid_table *table)
+{
+       int i;
+
+       if (!table)
+               return;
+
+       for (i = 0; i < table->sz; ++i) {
+               if (memcmp(&table->data_vec[i].gid, &zgid,
+                          sizeof(table->data_vec[i].gid)))
+                       del_gid(ib_dev, port, table, i,
+                               table->data_vec[i].props &
+                               GID_ATTR_FIND_MASK_DEFAULT);
+       }
+}
+
+void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port,
+                                 struct net_device *ndev,
+                                 enum ib_cache_gid_default_mode mode)
+{
+       struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+       union ib_gid gid;
+       struct ib_gid_attr gid_attr;
+       struct ib_gid_table *table;
+       int ix;
+       union ib_gid current_gid;
+       struct ib_gid_attr current_gid_attr = {};
+
+       table  = ports_table[port - rdma_start_port(ib_dev)];
+
+       make_default_gid(ndev, &gid);
+       memset(&gid_attr, 0, sizeof(gid_attr));
+       gid_attr.ndev = ndev;
+
+       ix = find_gid(table, NULL, NULL, true, GID_ATTR_FIND_MASK_DEFAULT);
+
+       /* Couldn't find the default GID location */
+       WARN_ON(ix < 0);
+
+       mutex_lock(&table->lock);
+       if (!__ib_cache_gid_get(ib_dev, port, ix,
+                               &current_gid, &current_gid_attr) &&
+           mode == IB_CACHE_GID_DEFAULT_MODE_SET &&
+           !memcmp(&gid, &current_gid, sizeof(gid)) &&
+           !memcmp(&gid_attr, &current_gid_attr, sizeof(gid_attr)))
+               goto unlock;
+
+       if ((memcmp(&current_gid, &zgid, sizeof(current_gid)) ||
+            memcmp(&current_gid_attr, &zattr,
+                   sizeof(current_gid_attr))) &&
+           del_gid(ib_dev, port, table, ix, true)) {
+               pr_warn("ib_cache_gid: can't delete index %d for default gid %pI6\n",
+                       ix, gid.raw);
+               goto unlock;
+       }
+
+       if (mode == IB_CACHE_GID_DEFAULT_MODE_SET)
+               if (add_gid(ib_dev, port, table, ix, &gid, &gid_attr, true))
+                       pr_warn("ib_cache_gid: unable to add default gid %pI6\n",
+                               gid.raw);
+
+unlock:
+       if (current_gid_attr.ndev)
+               dev_put(current_gid_attr.ndev);
+       mutex_unlock(&table->lock);
+}
+
+static int gid_table_reserve_default(struct ib_device *ib_dev, u8 port,
+                                    struct ib_gid_table *table)
+{
+       if (rdma_protocol_roce(ib_dev, port)) {
+               struct ib_gid_table_entry *entry = &table->data_vec[0];
+
+               entry->props |= GID_TABLE_ENTRY_DEFAULT;
+       }
+
+       return 0;
+}
+
+static int _gid_table_setup_one(struct ib_device *ib_dev)
+{
+       u8 port;
+       struct ib_gid_table **table;
+       int err = 0;
+
+       table = kcalloc(ib_dev->phys_port_cnt, sizeof(*table), GFP_KERNEL);
+
+       if (!table) {
+               pr_warn("failed to allocate ib gid cache for %s\n",
+                       ib_dev->name);
+               return -ENOMEM;
+       }
+
+       for (port = 0; port < ib_dev->phys_port_cnt; port++) {
+               u8 rdma_port = port + rdma_start_port(ib_dev);
+
+               table[port] =
+                       alloc_gid_table(
+                               ib_dev->port_immutable[rdma_port].gid_tbl_len);
+               if (!table[port]) {
+                       err = -ENOMEM;
+                       goto rollback_table_setup;
+               }
+
+               err = gid_table_reserve_default(ib_dev,
+                                               port + rdma_start_port(ib_dev),
+                                               table[port]);
+               if (err)
+                       goto rollback_table_setup;
+       }
+
+       ib_dev->cache.gid_cache = table;
+       return 0;
+
+rollback_table_setup:
+       for (port = 0; port < ib_dev->phys_port_cnt; port++) {
+               cleanup_gid_table_port(ib_dev, port + rdma_start_port(ib_dev),
+                                      table[port]);
+               release_gid_table(table[port]);
+       }
+
+       kfree(table);
+       return err;
+}
+
+static void gid_table_release_one(struct ib_device *ib_dev)
+{
+       struct ib_gid_table **table = ib_dev->cache.gid_cache;
+       u8 port;
+
+       if (!table)
+               return;
+
+       for (port = 0; port < ib_dev->phys_port_cnt; port++)
+               release_gid_table(table[port]);
+
+       kfree(table);
+       ib_dev->cache.gid_cache = NULL;
+}
+
+static void gid_table_cleanup_one(struct ib_device *ib_dev)
+{
+       struct ib_gid_table **table = ib_dev->cache.gid_cache;
+       u8 port;
+
+       if (!table)
+               return;
+
+       for (port = 0; port < ib_dev->phys_port_cnt; port++)
+               cleanup_gid_table_port(ib_dev, port + rdma_start_port(ib_dev),
+                                      table[port]);
+}
+
+static int gid_table_setup_one(struct ib_device *ib_dev)
+{
+       int err;
+
+       err = _gid_table_setup_one(ib_dev);
+
+       if (err)
+               return err;
+
+       err = roce_rescan_device(ib_dev);
+
+       if (err) {
+               gid_table_cleanup_one(ib_dev);
+               gid_table_release_one(ib_dev);
+       }
+
+       return err;
+}
+
+int ib_get_cached_gid(struct ib_device *device,
+                     u8                port_num,
+                     int               index,
+                     union ib_gid     *gid)
+{
+       if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device))
+               return -EINVAL;
+
+       return __ib_cache_gid_get(device, port_num, index, gid, NULL);
+}
+EXPORT_SYMBOL(ib_get_cached_gid);
+
+int ib_find_cached_gid(struct ib_device *device,
+                      const union ib_gid *gid,
+                      u8               *port_num,
+                      u16              *index)
+{
+       return ib_cache_gid_find(device, gid, NULL, port_num, index);
 }
 EXPORT_SYMBOL(ib_find_cached_gid);
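
Both exported helpers keep their previous signatures, so existing callers need no changes. A minimal consumer sketch (hypothetical):

static int demo_lookup_gid(struct ib_device *dev, const union ib_gid *gid)
{
        union ib_gid cached;
        u8 port;
        u16 index;
        int ret;

        /* Find which port/index a GID lives at, then read it back
         * through the per-port GID table cache. */
        ret = ib_find_cached_gid(dev, gid, &port, &index);
        if (ret)
                return ret;     /* -ENOENT if the GID is not cached */

        return ib_get_cached_gid(dev, port, index, &cached);
}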
 
@@ -243,9 +790,21 @@ static void ib_cache_update(struct ib_device *device,
 {
        struct ib_port_attr       *tprops = NULL;
        struct ib_pkey_cache      *pkey_cache = NULL, *old_pkey_cache;
-       struct ib_gid_cache       *gid_cache = NULL, *old_gid_cache;
+       struct ib_gid_cache {
+               int             table_len;
+               union ib_gid    table[0];
+       }                         *gid_cache = NULL;
        int                        i;
        int                        ret;
+       struct ib_gid_table       *table;
+       struct ib_gid_table      **ports_table = device->cache.gid_cache;
+       bool                       use_roce_gid_table =
+                                       rdma_cap_roce_gid_table(device, port);
+
+       if (port < rdma_start_port(device) || port > rdma_end_port(device))
+               return;
+
+       table = ports_table[port - rdma_start_port(device)];
 
        tprops = kmalloc(sizeof *tprops, GFP_KERNEL);
        if (!tprops)
@@ -265,12 +824,14 @@ static void ib_cache_update(struct ib_device *device,
 
        pkey_cache->table_len = tprops->pkey_tbl_len;
 
-       gid_cache = kmalloc(sizeof *gid_cache + tprops->gid_tbl_len *
-                           sizeof *gid_cache->table, GFP_KERNEL);
-       if (!gid_cache)
-               goto err;
+       if (!use_roce_gid_table) {
+               gid_cache = kmalloc(sizeof(*gid_cache) + tprops->gid_tbl_len *
+                           sizeof(*gid_cache->table), GFP_KERNEL);
+               if (!gid_cache)
+                       goto err;
 
-       gid_cache->table_len = tprops->gid_tbl_len;
+               gid_cache->table_len = tprops->gid_tbl_len;
+       }
 
        for (i = 0; i < pkey_cache->table_len; ++i) {
                ret = ib_query_pkey(device, port, i, pkey_cache->table + i);
@@ -281,29 +842,36 @@ static void ib_cache_update(struct ib_device *device,
                }
        }
 
-       for (i = 0; i < gid_cache->table_len; ++i) {
-               ret = ib_query_gid(device, port, i, gid_cache->table + i);
-               if (ret) {
-                       printk(KERN_WARNING "ib_query_gid failed (%d) for %s (index %d)\n",
-                              ret, device->name, i);
-                       goto err;
+       if (!use_roce_gid_table) {
+               for (i = 0;  i < gid_cache->table_len; ++i) {
+                       ret = ib_query_gid(device, port, i,
+                                          gid_cache->table + i);
+                       if (ret) {
+                               printk(KERN_WARNING "ib_query_gid failed (%d) for %s (index %d)\n",
+                                      ret, device->name, i);
+                               goto err;
+                       }
                }
        }
 
        write_lock_irq(&device->cache.lock);
 
        old_pkey_cache = device->cache.pkey_cache[port - rdma_start_port(device)];
-       old_gid_cache  = device->cache.gid_cache [port - rdma_start_port(device)];
 
        device->cache.pkey_cache[port - rdma_start_port(device)] = pkey_cache;
-       device->cache.gid_cache [port - rdma_start_port(device)] = gid_cache;
+       if (!use_roce_gid_table) {
+               for (i = 0; i < gid_cache->table_len; i++) {
+                       modify_gid(device, port, table, i, gid_cache->table + i,
+                                  &zattr, false);
+               }
+       }
 
        device->cache.lmc_cache[port - rdma_start_port(device)] = tprops->lmc;
 
        write_unlock_irq(&device->cache.lock);
 
+       kfree(gid_cache);
        kfree(old_pkey_cache);
-       kfree(old_gid_cache);
        kfree(tprops);
        return;
 
@@ -344,85 +912,88 @@ static void ib_cache_event(struct ib_event_handler *handler,
        }
 }
 
-static void ib_cache_setup_one(struct ib_device *device)
+int ib_cache_setup_one(struct ib_device *device)
 {
        int p;
+       int err;
 
        rwlock_init(&device->cache.lock);
 
        device->cache.pkey_cache =
-               kmalloc(sizeof *device->cache.pkey_cache *
-                       (rdma_end_port(device) - rdma_start_port(device) + 1), GFP_KERNEL);
-       device->cache.gid_cache =
-               kmalloc(sizeof *device->cache.gid_cache *
+               kzalloc(sizeof *device->cache.pkey_cache *
                        (rdma_end_port(device) - rdma_start_port(device) + 1), GFP_KERNEL);
-
        device->cache.lmc_cache = kmalloc(sizeof *device->cache.lmc_cache *
                                          (rdma_end_port(device) -
                                           rdma_start_port(device) + 1),
                                          GFP_KERNEL);
-
-       if (!device->cache.pkey_cache || !device->cache.gid_cache ||
+       if (!device->cache.pkey_cache ||
            !device->cache.lmc_cache) {
                printk(KERN_WARNING "Couldn't allocate cache "
                       "for %s\n", device->name);
-               goto err;
+               return -ENOMEM;
        }
 
-       for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) {
-               device->cache.pkey_cache[p] = NULL;
-               device->cache.gid_cache [p] = NULL;
+       err = gid_table_setup_one(device);
+       if (err)
+               /* Allocated memory will be cleaned in the release function */
+               return err;
+
+       for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p)
                ib_cache_update(device, p + rdma_start_port(device));
-       }
 
        INIT_IB_EVENT_HANDLER(&device->cache.event_handler,
                              device, ib_cache_event);
-       if (ib_register_event_handler(&device->cache.event_handler))
-               goto err_cache;
-
-       return;
+       err = ib_register_event_handler(&device->cache.event_handler);
+       if (err)
+               goto err;
 
-err_cache:
-       for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) {
-               kfree(device->cache.pkey_cache[p]);
-               kfree(device->cache.gid_cache[p]);
-       }
+       return 0;
 
 err:
-       kfree(device->cache.pkey_cache);
-       kfree(device->cache.gid_cache);
-       kfree(device->cache.lmc_cache);
+       gid_table_cleanup_one(device);
+       return err;
 }
 
-static void ib_cache_cleanup_one(struct ib_device *device)
+void ib_cache_release_one(struct ib_device *device)
 {
        int p;
 
-       ib_unregister_event_handler(&device->cache.event_handler);
-       flush_workqueue(ib_wq);
-
-       for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) {
-               kfree(device->cache.pkey_cache[p]);
-               kfree(device->cache.gid_cache[p]);
-       }
-
+       /*
+        * The release function frees all the cache elements.
+        * This function should be called as part of freeing
+        * all the device's resources, once the cache can no
+        * longer be accessed.
+        */
+       if (device->cache.pkey_cache)
+               for (p = 0;
+                    p <= rdma_end_port(device) - rdma_start_port(device); ++p)
+                       kfree(device->cache.pkey_cache[p]);
+
+       gid_table_release_one(device);
        kfree(device->cache.pkey_cache);
-       kfree(device->cache.gid_cache);
        kfree(device->cache.lmc_cache);
 }
 
-static struct ib_client cache_client = {
-       .name   = "cache",
-       .add    = ib_cache_setup_one,
-       .remove = ib_cache_cleanup_one
-};
+void ib_cache_cleanup_one(struct ib_device *device)
+{
+       /* The cleanup function unregisters the event handler,
+        * waits for all in-progress workqueue elements and cleans
+        * up the GID cache. This function should be called after
+        * the device was removed from the devices list and all
+        * clients were removed, so the cache exists but is
+        * non-functional and shouldn't be updated anymore.
+        */
+       ib_unregister_event_handler(&device->cache.event_handler);
+       flush_workqueue(ib_wq);
+       gid_table_cleanup_one(device);
+}
 
-int __init ib_cache_setup(void)
+void __init ib_cache_setup(void)
 {
-       return ib_register_client(&cache_client);
+       roce_gid_mgmt_init();
 }
 
 void __exit ib_cache_cleanup(void)
 {
-       ib_unregister_client(&cache_client);
+       roce_gid_mgmt_cleanup();
 }
index 3a972ebf3c0d1170efe280aa7bcf781c831fa98f..ea4db9c1d44fba56ea5798649f3744c78b148e87 100644 (file)
@@ -58,7 +58,7 @@ MODULE_DESCRIPTION("InfiniBand CM");
 MODULE_LICENSE("Dual BSD/GPL");
 
 static void cm_add_one(struct ib_device *device);
-static void cm_remove_one(struct ib_device *device);
+static void cm_remove_one(struct ib_device *device, void *client_data);
 
 static struct ib_client cm_client = {
        .name   = "cm",
@@ -213,13 +213,15 @@ struct cm_id_private {
        spinlock_t lock;        /* Do not acquire inside cm.lock */
        struct completion comp;
        atomic_t refcount;
+       /* Number of clients sharing this ib_cm_id. Only valid for listeners.
+        * Protected by the cm.lock spinlock. */
+       int listen_sharecount;
 
        struct ib_mad_send_buf *msg;
        struct cm_timewait_info *timewait_info;
        /* todo: use alternate port on send failure */
        struct cm_av av;
        struct cm_av alt_av;
-       struct ib_cm_compare_data *compare_data;
 
        void *private_data;
        __be64 tid;
@@ -440,40 +442,6 @@ static struct cm_id_private * cm_acquire_id(__be32 local_id, __be32 remote_id)
        return cm_id_priv;
 }
 
-static void cm_mask_copy(u32 *dst, const u32 *src, const u32 *mask)
-{
-       int i;
-
-       for (i = 0; i < IB_CM_COMPARE_SIZE; i++)
-               dst[i] = src[i] & mask[i];
-}
-
-static int cm_compare_data(struct ib_cm_compare_data *src_data,
-                          struct ib_cm_compare_data *dst_data)
-{
-       u32 src[IB_CM_COMPARE_SIZE];
-       u32 dst[IB_CM_COMPARE_SIZE];
-
-       if (!src_data || !dst_data)
-               return 0;
-
-       cm_mask_copy(src, src_data->data, dst_data->mask);
-       cm_mask_copy(dst, dst_data->data, src_data->mask);
-       return memcmp(src, dst, sizeof(src));
-}
-
-static int cm_compare_private_data(u32 *private_data,
-                                  struct ib_cm_compare_data *dst_data)
-{
-       u32 src[IB_CM_COMPARE_SIZE];
-
-       if (!dst_data)
-               return 0;
-
-       cm_mask_copy(src, private_data, dst_data->mask);
-       return memcmp(src, dst_data->data, sizeof(src));
-}
-
 /*
  * Trivial helpers to strip endian annotation and compare; the
  * endianness doesn't actually matter since we just need a stable
@@ -506,18 +474,14 @@ static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv)
        struct cm_id_private *cur_cm_id_priv;
        __be64 service_id = cm_id_priv->id.service_id;
        __be64 service_mask = cm_id_priv->id.service_mask;
-       int data_cmp;
 
        while (*link) {
                parent = *link;
                cur_cm_id_priv = rb_entry(parent, struct cm_id_private,
                                          service_node);
-               data_cmp = cm_compare_data(cm_id_priv->compare_data,
-                                          cur_cm_id_priv->compare_data);
                if ((cur_cm_id_priv->id.service_mask & service_id) ==
                    (service_mask & cur_cm_id_priv->id.service_id) &&
-                   (cm_id_priv->id.device == cur_cm_id_priv->id.device) &&
-                   !data_cmp)
+                   (cm_id_priv->id.device == cur_cm_id_priv->id.device))
                        return cur_cm_id_priv;
 
                if (cm_id_priv->id.device < cur_cm_id_priv->id.device)
@@ -528,8 +492,6 @@ static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv)
                        link = &(*link)->rb_left;
                else if (be64_gt(service_id, cur_cm_id_priv->id.service_id))
                        link = &(*link)->rb_right;
-               else if (data_cmp < 0)
-                       link = &(*link)->rb_left;
                else
                        link = &(*link)->rb_right;
        }
@@ -539,20 +501,16 @@ static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv)
 }
 
 static struct cm_id_private * cm_find_listen(struct ib_device *device,
-                                            __be64 service_id,
-                                            u32 *private_data)
+                                            __be64 service_id)
 {
        struct rb_node *node = cm.listen_service_table.rb_node;
        struct cm_id_private *cm_id_priv;
-       int data_cmp;
 
        while (node) {
                cm_id_priv = rb_entry(node, struct cm_id_private, service_node);
-               data_cmp = cm_compare_private_data(private_data,
-                                                  cm_id_priv->compare_data);
                if ((cm_id_priv->id.service_mask & service_id) ==
                     cm_id_priv->id.service_id &&
-                   (cm_id_priv->id.device == device) && !data_cmp)
+                   (cm_id_priv->id.device == device))
                        return cm_id_priv;
 
                if (device < cm_id_priv->id.device)
@@ -563,8 +521,6 @@ static struct cm_id_private * cm_find_listen(struct ib_device *device,
                        node = node->rb_left;
                else if (be64_gt(service_id, cm_id_priv->id.service_id))
                        node = node->rb_right;
-               else if (data_cmp < 0)
-                       node = node->rb_left;
                else
                        node = node->rb_right;
        }
@@ -859,9 +815,15 @@ retest:
        spin_lock_irq(&cm_id_priv->lock);
        switch (cm_id->state) {
        case IB_CM_LISTEN:
-               cm_id->state = IB_CM_IDLE;
                spin_unlock_irq(&cm_id_priv->lock);
+
                spin_lock_irq(&cm.lock);
+               if (--cm_id_priv->listen_sharecount > 0) {
+                       /* The id is still shared. */
+                       cm_deref_id(cm_id_priv);
+                       spin_unlock_irq(&cm.lock);
+                       return;
+               }
                rb_erase(&cm_id_priv->service_node, &cm.listen_service_table);
                spin_unlock_irq(&cm.lock);
                break;
@@ -930,7 +892,6 @@ retest:
        wait_for_completion(&cm_id_priv->comp);
        while ((work = cm_dequeue_work(cm_id_priv)) != NULL)
                cm_free_work(work);
-       kfree(cm_id_priv->compare_data);
        kfree(cm_id_priv->private_data);
        kfree(cm_id_priv);
 }
@@ -941,11 +902,23 @@ void ib_destroy_cm_id(struct ib_cm_id *cm_id)
 }
 EXPORT_SYMBOL(ib_destroy_cm_id);
 
-int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
-                struct ib_cm_compare_data *compare_data)
+/**
+ * __ib_cm_listen - Initiates listening on the specified service ID for
+ *   connection and service ID resolution requests.
+ * @cm_id: Connection identifier associated with the listen request.
+ * @service_id: Service identifier matched against incoming connection
+ *   and service ID resolution requests.  The service ID should be specified
+ *   in network byte order.  If set to IB_CM_ASSIGN_SERVICE_ID, the CM will
+ *   assign a service ID to the caller.
+ * @service_mask: Mask applied to service ID used to listen across a
+ *   range of service IDs.  If set to 0, the service ID is matched
+ *   exactly.  This parameter is ignored if %service_id is set to
+ *   IB_CM_ASSIGN_SERVICE_ID.
+ */
+static int __ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id,
+                         __be64 service_mask)
 {
        struct cm_id_private *cm_id_priv, *cur_cm_id_priv;
-       unsigned long flags;
        int ret = 0;
 
        service_mask = service_mask ? service_mask : ~cpu_to_be64(0);
@@ -958,20 +931,9 @@ int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
        if (cm_id->state != IB_CM_IDLE)
                return -EINVAL;
 
-       if (compare_data) {
-               cm_id_priv->compare_data = kzalloc(sizeof *compare_data,
-                                                  GFP_KERNEL);
-               if (!cm_id_priv->compare_data)
-                       return -ENOMEM;
-               cm_mask_copy(cm_id_priv->compare_data->data,
-                            compare_data->data, compare_data->mask);
-               memcpy(cm_id_priv->compare_data->mask, compare_data->mask,
-                      sizeof(compare_data->mask));
-       }
-
        cm_id->state = IB_CM_LISTEN;
+       ++cm_id_priv->listen_sharecount;
 
-       spin_lock_irqsave(&cm.lock, flags);
        if (service_id == IB_CM_ASSIGN_SERVICE_ID) {
                cm_id->service_id = cpu_to_be64(cm.listen_service_id++);
                cm_id->service_mask = ~cpu_to_be64(0);
@@ -980,18 +942,95 @@ int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
                cm_id->service_mask = service_mask;
        }
        cur_cm_id_priv = cm_insert_listen(cm_id_priv);
-       spin_unlock_irqrestore(&cm.lock, flags);
 
        if (cur_cm_id_priv) {
                cm_id->state = IB_CM_IDLE;
-               kfree(cm_id_priv->compare_data);
-               cm_id_priv->compare_data = NULL;
+               --cm_id_priv->listen_sharecount;
                ret = -EBUSY;
        }
        return ret;
 }
+
+int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask)
+{
+       unsigned long flags;
+       int ret;
+
+       spin_lock_irqsave(&cm.lock, flags);
+       ret = __ib_cm_listen(cm_id, service_id, service_mask);
+       spin_unlock_irqrestore(&cm.lock, flags);
+
+       return ret;
+}
 EXPORT_SYMBOL(ib_cm_listen);
 
+/**
+ * ib_cm_insert_listen - Create a new listening ib_cm_id and listen on the given service ID.
+ *
+ * If there's an existing ID listening on that same device and service ID,
+ * return it.
+ *
+ * @device: Device associated with the cm_id.  All related communication will
+ * be associated with the specified device.
+ * @cm_handler: Callback invoked to notify the user of CM events.
+ * @service_id: Service identifier matched against incoming connection
+ *   and service ID resolution requests.  The service ID should be specified
+ *   in network byte order.  If set to IB_CM_ASSIGN_SERVICE_ID, the CM will
+ *   assign a service ID to the caller.
+ *
+ * Callers should call ib_destroy_cm_id when done with the listener ID.
+ */
+struct ib_cm_id *ib_cm_insert_listen(struct ib_device *device,
+                                    ib_cm_handler cm_handler,
+                                    __be64 service_id)
+{
+       struct cm_id_private *cm_id_priv;
+       struct ib_cm_id *cm_id;
+       unsigned long flags;
+       int err = 0;
+
+       /* Create an ID in advance, since the creation may sleep */
+       cm_id = ib_create_cm_id(device, cm_handler, NULL);
+       if (IS_ERR(cm_id))
+               return cm_id;
+
+       spin_lock_irqsave(&cm.lock, flags);
+
+       if (service_id == IB_CM_ASSIGN_SERVICE_ID)
+               goto new_id;
+
+       /* Find an existing ID */
+       cm_id_priv = cm_find_listen(device, service_id);
+       if (cm_id_priv) {
+               if (cm_id->cm_handler != cm_handler || cm_id->context) {
+                       /* Sharing an ib_cm_id with different handlers is not
+                        * supported */
+                       spin_unlock_irqrestore(&cm.lock, flags);
+                       return ERR_PTR(-EINVAL);
+               }
+               atomic_inc(&cm_id_priv->refcount);
+               ++cm_id_priv->listen_sharecount;
+               spin_unlock_irqrestore(&cm.lock, flags);
+
+               ib_destroy_cm_id(cm_id);
+               cm_id = &cm_id_priv->id;
+               return cm_id;
+       }
+
+new_id:
+       /* Use newly created ID */
+       err = __ib_cm_listen(cm_id, service_id, 0);
+
+       spin_unlock_irqrestore(&cm.lock, flags);
+
+       if (err) {
+               ib_destroy_cm_id(cm_id);
+               return ERR_PTR(err);
+       }
+       return cm_id;
+}
+EXPORT_SYMBOL(ib_cm_insert_listen);
+
 static __be64 cm_form_tid(struct cm_id_private *cm_id_priv,
                          enum cm_msg_sequence msg_seq)
 {
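The hunk above introduces shared listen IDs: ib_cm_insert_listen() either creates a new listening ib_cm_id or returns an existing one for the same device and service ID while bumping listen_sharecount, and ib_destroy_cm_id() only unlinks the ID once the last sharer is gone. A minimal usage sketch follows; the caller, function name, and service ID value are hypothetical and not part of this commit:

/* Minimal sketch of a consumer of the new API.  Error handling mirrors the
 * patch above; the 0x10ce service ID is an arbitrary example. */
static int demo_shared_listen(struct ib_device *device, ib_cm_handler handler)
{
	struct ib_cm_id *listen_id;

	listen_id = ib_cm_insert_listen(device, handler, cpu_to_be64(0x10ce));
	if (IS_ERR(listen_id))
		return PTR_ERR(listen_id);

	/* ... incoming REQs/SIDR REQs for this service ID reach handler() ... */

	/* Drops listen_sharecount; the last sharer unlinks and frees the ID. */
	ib_destroy_cm_id(listen_id);
	return 0;
}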
@@ -1268,6 +1307,7 @@ static void cm_format_paths_from_req(struct cm_req_msg *req_msg,
        primary_path->packet_life_time =
                cm_req_get_primary_local_ack_timeout(req_msg);
        primary_path->packet_life_time -= (primary_path->packet_life_time > 0);
+       primary_path->service_id = req_msg->service_id;
 
        if (req_msg->alt_local_lid) {
                memset(alt_path, 0, sizeof *alt_path);
@@ -1289,7 +1329,26 @@ static void cm_format_paths_from_req(struct cm_req_msg *req_msg,
                alt_path->packet_life_time =
                        cm_req_get_alt_local_ack_timeout(req_msg);
                alt_path->packet_life_time -= (alt_path->packet_life_time > 0);
+               alt_path->service_id = req_msg->service_id;
+       }
+}
+
+static u16 cm_get_bth_pkey(struct cm_work *work)
+{
+       struct ib_device *ib_dev = work->port->cm_dev->ib_device;
+       u8 port_num = work->port->port_num;
+       u16 pkey_index = work->mad_recv_wc->wc->pkey_index;
+       u16 pkey;
+       int ret;
+
+       ret = ib_get_cached_pkey(ib_dev, port_num, pkey_index, &pkey);
+       if (ret) {
+               dev_warn_ratelimited(&ib_dev->dev, "ib_cm: Couldn't retrieve pkey for incoming request (port %d, pkey index %d). %d\n",
+                                    port_num, pkey_index, ret);
+               return 0;
        }
+
+       return pkey;
 }
 
 static void cm_format_req_event(struct cm_work *work,
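cm_get_bth_pkey() above resolves the P_Key carried in the request's BTH from the cached P_Key table, warning and falling back to 0 if the lookup fails; the following hunks expose it to consumers as param.req_rcvd.bth_pkey (and later as sidr_req_rcvd.bth_pkey). A hedged sketch of how an event handler might consume it, assuming the field added by this series; the handler is hypothetical and 0xFFFF is the default full-membership P_Key:

/* Illustrative only: refuse connection requests whose BTH P_Key is not the
 * default full-membership key.  Returning an error lets the CM clean up the
 * newly created ID, as in other in-tree handlers. */
static int demo_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
{
	if (event->event == IB_CM_REQ_RECEIVED &&
	    event->param.req_rcvd.bth_pkey != 0xFFFF)
		return -EINVAL;
	return 0;
}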
@@ -1302,6 +1361,7 @@ static void cm_format_req_event(struct cm_work *work,
        req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;
        param = &work->cm_event.param.req_rcvd;
        param->listen_id = listen_id;
+       param->bth_pkey = cm_get_bth_pkey(work);
        param->port = cm_id_priv->av.port->port_num;
        param->primary_path = &work->path[0];
        if (req_msg->alt_local_lid)
@@ -1484,8 +1544,7 @@ static struct cm_id_private * cm_match_req(struct cm_work *work,
 
        /* Find matching listen request. */
        listen_cm_id_priv = cm_find_listen(cm_id_priv->id.device,
-                                          req_msg->service_id,
-                                          req_msg->private_data);
+                                          req_msg->service_id);
        if (!listen_cm_id_priv) {
                cm_cleanup_timewait(cm_id_priv->timewait_info);
                spin_unlock_irq(&cm.lock);
@@ -2992,6 +3051,8 @@ static void cm_format_sidr_req_event(struct cm_work *work,
        param = &work->cm_event.param.sidr_req_rcvd;
        param->pkey = __be16_to_cpu(sidr_req_msg->pkey);
        param->listen_id = listen_id;
+       param->service_id = sidr_req_msg->service_id;
+       param->bth_pkey = cm_get_bth_pkey(work);
        param->port = work->port->port_num;
        work->cm_event.private_data = &sidr_req_msg->private_data;
 }
@@ -3031,8 +3092,7 @@ static int cm_sidr_req_handler(struct cm_work *work)
        }
        cm_id_priv->id.state = IB_CM_SIDR_REQ_RCVD;
        cur_cm_id_priv = cm_find_listen(cm_id->device,
-                                       sidr_req_msg->service_id,
-                                       sidr_req_msg->private_data);
+                                       sidr_req_msg->service_id);
        if (!cur_cm_id_priv) {
                spin_unlock_irq(&cm.lock);
                cm_reject_sidr_req(cm_id_priv, IB_SIDR_UNSUPPORTED);
@@ -3886,9 +3946,9 @@ free:
        kfree(cm_dev);
 }
 
-static void cm_remove_one(struct ib_device *ib_device)
+static void cm_remove_one(struct ib_device *ib_device, void *client_data)
 {
-       struct cm_device *cm_dev;
+       struct cm_device *cm_dev = client_data;
        struct cm_port *port;
        struct ib_port_modify port_modify = {
                .clr_port_cap_mask = IB_PORT_CM_SUP
@@ -3896,7 +3956,6 @@ static void cm_remove_one(struct ib_device *ib_device)
        unsigned long flags;
        int i;
 
-       cm_dev = ib_get_client_data(ib_device, &cm_client);
        if (!cm_dev)
                return;
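The remove() callback in struct ib_client now receives the client's per-device data directly, so teardown paths no longer need ib_get_client_data(). A sketch of a client adapted to the new prototype; the "demo" client and its context type are hypothetical:

/* Whatever add() stored via ib_set_client_data() is handed back here as
 * client_data. */
struct demo_device {
	struct ib_device *ib_dev;
};

static void demo_remove_one(struct ib_device *device, void *client_data)
{
	struct demo_device *demo = client_data;

	if (!demo)	/* add() never attached a context for this device */
		return;
	kfree(demo);
}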
 
index 143ded2bbe7c7fbd8d51a6d952aab1f7cf101b35..b1ab13f3e182bb520cc986512d11e9016ecf1362 100644 (file)
@@ -46,6 +46,8 @@
 
 #include <net/tcp.h>
 #include <net/ipv6.h>
+#include <net/ip_fib.h>
+#include <net/ip6_route.h>
 
 #include <rdma/rdma_cm.h>
 #include <rdma/rdma_cm_ib.h>
@@ -94,7 +96,7 @@ const char *rdma_event_msg(enum rdma_cm_event_type event)
 EXPORT_SYMBOL(rdma_event_msg);
 
 static void cma_add_one(struct ib_device *device);
-static void cma_remove_one(struct ib_device *device);
+static void cma_remove_one(struct ib_device *device, void *client_data);
 
 static struct ib_client cma_client = {
        .name   = "cma",
@@ -113,6 +115,22 @@ static DEFINE_IDR(udp_ps);
 static DEFINE_IDR(ipoib_ps);
 static DEFINE_IDR(ib_ps);
 
+static struct idr *cma_idr(enum rdma_port_space ps)
+{
+       switch (ps) {
+       case RDMA_PS_TCP:
+               return &tcp_ps;
+       case RDMA_PS_UDP:
+               return &udp_ps;
+       case RDMA_PS_IPOIB:
+               return &ipoib_ps;
+       case RDMA_PS_IB:
+               return &ib_ps;
+       default:
+               return NULL;
+       }
+}
+
 struct cma_device {
        struct list_head        list;
        struct ib_device        *device;
@@ -122,11 +140,33 @@ struct cma_device {
 };
 
 struct rdma_bind_list {
-       struct idr              *ps;
+       enum rdma_port_space    ps;
        struct hlist_head       owners;
        unsigned short          port;
 };
 
+static int cma_ps_alloc(enum rdma_port_space ps,
+                       struct rdma_bind_list *bind_list, int snum)
+{
+       struct idr *idr = cma_idr(ps);
+
+       return idr_alloc(idr, bind_list, snum, snum + 1, GFP_KERNEL);
+}
+
+static struct rdma_bind_list *cma_ps_find(enum rdma_port_space ps, int snum)
+{
+       struct idr *idr = cma_idr(ps);
+
+       return idr_find(idr, snum);
+}
+
+static void cma_ps_remove(enum rdma_port_space ps, int snum)
+{
+       struct idr *idr = cma_idr(ps);
+
+       idr_remove(idr, snum);
+}
+
 enum {
        CMA_OPTION_AFONLY,
 };
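With rdma_bind_list now carrying the rdma_port_space enum instead of an idr pointer, the cma_ps_alloc()/cma_ps_find()/cma_ps_remove() helpers above translate the port space into the backing idr on each call. A small round-trip sketch, assuming the caller already holds the CMA "lock" mutex as the existing callers do; the function name and port number are illustrative:

/* Reserve an arbitrary TCP port for a bind list, look it up, and release it. */
static int demo_reserve_port(struct rdma_bind_list *bind_list)
{
	int ret = cma_ps_alloc(RDMA_PS_TCP, bind_list, 1234);

	if (ret < 0)
		return ret;

	WARN_ON(cma_ps_find(RDMA_PS_TCP, 1234) != bind_list);
	cma_ps_remove(RDMA_PS_TCP, 1234);
	return 0;
}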
@@ -225,6 +265,15 @@ struct cma_hdr {
 
 #define CMA_VERSION 0x00
 
+struct cma_req_info {
+       struct ib_device *device;
+       int port;
+       union ib_gid local_gid;
+       __be64 service_id;
+       u16 pkey;
+       bool has_gid:1;
+};
+
 static int cma_comp(struct rdma_id_private *id_priv, enum rdma_cm_state comp)
 {
        unsigned long flags;
@@ -262,7 +311,7 @@ static enum rdma_cm_state cma_exch(struct rdma_id_private *id_priv,
        return old;
 }
 
-static inline u8 cma_get_ip_ver(struct cma_hdr *hdr)
+static inline u8 cma_get_ip_ver(const struct cma_hdr *hdr)
 {
        return hdr->ip_version >> 4;
 }
@@ -870,107 +919,397 @@ static inline int cma_any_port(struct sockaddr *addr)
        return !cma_port(addr);
 }
 
-static void cma_save_ib_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id,
+static void cma_save_ib_info(struct sockaddr *src_addr,
+                            struct sockaddr *dst_addr,
+                            struct rdma_cm_id *listen_id,
                             struct ib_sa_path_rec *path)
 {
        struct sockaddr_ib *listen_ib, *ib;
 
        listen_ib = (struct sockaddr_ib *) &listen_id->route.addr.src_addr;
-       ib = (struct sockaddr_ib *) &id->route.addr.src_addr;
-       ib->sib_family = listen_ib->sib_family;
-       if (path) {
-               ib->sib_pkey = path->pkey;
-               ib->sib_flowinfo = path->flow_label;
-               memcpy(&ib->sib_addr, &path->sgid, 16);
-       } else {
-               ib->sib_pkey = listen_ib->sib_pkey;
-               ib->sib_flowinfo = listen_ib->sib_flowinfo;
-               ib->sib_addr = listen_ib->sib_addr;
-       }
-       ib->sib_sid = listen_ib->sib_sid;
-       ib->sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL);
-       ib->sib_scope_id = listen_ib->sib_scope_id;
-
-       if (path) {
-               ib = (struct sockaddr_ib *) &id->route.addr.dst_addr;
-               ib->sib_family = listen_ib->sib_family;
-               ib->sib_pkey = path->pkey;
-               ib->sib_flowinfo = path->flow_label;
-               memcpy(&ib->sib_addr, &path->dgid, 16);
+       if (src_addr) {
+               ib = (struct sockaddr_ib *)src_addr;
+               ib->sib_family = AF_IB;
+               if (path) {
+                       ib->sib_pkey = path->pkey;
+                       ib->sib_flowinfo = path->flow_label;
+                       memcpy(&ib->sib_addr, &path->sgid, 16);
+                       ib->sib_sid = path->service_id;
+                       ib->sib_scope_id = 0;
+               } else {
+                       ib->sib_pkey = listen_ib->sib_pkey;
+                       ib->sib_flowinfo = listen_ib->sib_flowinfo;
+                       ib->sib_addr = listen_ib->sib_addr;
+                       ib->sib_sid = listen_ib->sib_sid;
+                       ib->sib_scope_id = listen_ib->sib_scope_id;
+               }
+               ib->sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL);
+       }
+       if (dst_addr) {
+               ib = (struct sockaddr_ib *)dst_addr;
+               ib->sib_family = AF_IB;
+               if (path) {
+                       ib->sib_pkey = path->pkey;
+                       ib->sib_flowinfo = path->flow_label;
+                       memcpy(&ib->sib_addr, &path->dgid, 16);
+               }
        }
 }
 
-static __be16 ss_get_port(const struct sockaddr_storage *ss)
-{
-       if (ss->ss_family == AF_INET)
-               return ((struct sockaddr_in *)ss)->sin_port;
-       else if (ss->ss_family == AF_INET6)
-               return ((struct sockaddr_in6 *)ss)->sin6_port;
-       BUG();
-}
-
-static void cma_save_ip4_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id,
-                             struct cma_hdr *hdr)
+static void cma_save_ip4_info(struct sockaddr *src_addr,
+                             struct sockaddr *dst_addr,
+                             struct cma_hdr *hdr,
+                             __be16 local_port)
 {
        struct sockaddr_in *ip4;
 
-       ip4 = (struct sockaddr_in *) &id->route.addr.src_addr;
-       ip4->sin_family = AF_INET;
-       ip4->sin_addr.s_addr = hdr->dst_addr.ip4.addr;
-       ip4->sin_port = ss_get_port(&listen_id->route.addr.src_addr);
+       if (src_addr) {
+               ip4 = (struct sockaddr_in *)src_addr;
+               ip4->sin_family = AF_INET;
+               ip4->sin_addr.s_addr = hdr->dst_addr.ip4.addr;
+               ip4->sin_port = local_port;
+       }
 
-       ip4 = (struct sockaddr_in *) &id->route.addr.dst_addr;
-       ip4->sin_family = AF_INET;
-       ip4->sin_addr.s_addr = hdr->src_addr.ip4.addr;
-       ip4->sin_port = hdr->port;
+       if (dst_addr) {
+               ip4 = (struct sockaddr_in *)dst_addr;
+               ip4->sin_family = AF_INET;
+               ip4->sin_addr.s_addr = hdr->src_addr.ip4.addr;
+               ip4->sin_port = hdr->port;
+       }
 }
 
-static void cma_save_ip6_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id,
-                             struct cma_hdr *hdr)
+static void cma_save_ip6_info(struct sockaddr *src_addr,
+                             struct sockaddr *dst_addr,
+                             struct cma_hdr *hdr,
+                             __be16 local_port)
 {
        struct sockaddr_in6 *ip6;
 
-       ip6 = (struct sockaddr_in6 *) &id->route.addr.src_addr;
-       ip6->sin6_family = AF_INET6;
-       ip6->sin6_addr = hdr->dst_addr.ip6;
-       ip6->sin6_port = ss_get_port(&listen_id->route.addr.src_addr);
+       if (src_addr) {
+               ip6 = (struct sockaddr_in6 *)src_addr;
+               ip6->sin6_family = AF_INET6;
+               ip6->sin6_addr = hdr->dst_addr.ip6;
+               ip6->sin6_port = local_port;
+       }
 
-       ip6 = (struct sockaddr_in6 *) &id->route.addr.dst_addr;
-       ip6->sin6_family = AF_INET6;
-       ip6->sin6_addr = hdr->src_addr.ip6;
-       ip6->sin6_port = hdr->port;
+       if (dst_addr) {
+               ip6 = (struct sockaddr_in6 *)dst_addr;
+               ip6->sin6_family = AF_INET6;
+               ip6->sin6_addr = hdr->src_addr.ip6;
+               ip6->sin6_port = hdr->port;
+       }
 }
 
-static int cma_save_net_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id,
-                            struct ib_cm_event *ib_event)
+static u16 cma_port_from_service_id(__be64 service_id)
 {
-       struct cma_hdr *hdr;
+       return (u16)be64_to_cpu(service_id);
+}
 
-       if (listen_id->route.addr.src_addr.ss_family == AF_IB) {
-               if (ib_event->event == IB_CM_REQ_RECEIVED)
-                       cma_save_ib_info(id, listen_id, ib_event->param.req_rcvd.primary_path);
-               else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED)
-                       cma_save_ib_info(id, listen_id, NULL);
-               return 0;
-       }
+static int cma_save_ip_info(struct sockaddr *src_addr,
+                           struct sockaddr *dst_addr,
+                           struct ib_cm_event *ib_event,
+                           __be64 service_id)
+{
+       struct cma_hdr *hdr;
+       __be16 port;
 
        hdr = ib_event->private_data;
        if (hdr->cma_version != CMA_VERSION)
                return -EINVAL;
 
+       port = htons(cma_port_from_service_id(service_id));
+
        switch (cma_get_ip_ver(hdr)) {
        case 4:
-               cma_save_ip4_info(id, listen_id, hdr);
+               cma_save_ip4_info(src_addr, dst_addr, hdr, port);
                break;
        case 6:
-               cma_save_ip6_info(id, listen_id, hdr);
+               cma_save_ip6_info(src_addr, dst_addr, hdr, port);
+               break;
+       default:
+               return -EAFNOSUPPORT;
+       }
+
+       return 0;
+}
+
+static int cma_save_net_info(struct sockaddr *src_addr,
+                            struct sockaddr *dst_addr,
+                            struct rdma_cm_id *listen_id,
+                            struct ib_cm_event *ib_event,
+                            sa_family_t sa_family, __be64 service_id)
+{
+       if (sa_family == AF_IB) {
+               if (ib_event->event == IB_CM_REQ_RECEIVED)
+                       cma_save_ib_info(src_addr, dst_addr, listen_id,
+                                        ib_event->param.req_rcvd.primary_path);
+               else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED)
+                       cma_save_ib_info(src_addr, dst_addr, listen_id, NULL);
+               return 0;
+       }
+
+       return cma_save_ip_info(src_addr, dst_addr, ib_event, service_id);
+}
+
+static int cma_save_req_info(const struct ib_cm_event *ib_event,
+                            struct cma_req_info *req)
+{
+       const struct ib_cm_req_event_param *req_param =
+               &ib_event->param.req_rcvd;
+       const struct ib_cm_sidr_req_event_param *sidr_param =
+               &ib_event->param.sidr_req_rcvd;
+
+       switch (ib_event->event) {
+       case IB_CM_REQ_RECEIVED:
+               req->device     = req_param->listen_id->device;
+               req->port       = req_param->port;
+               memcpy(&req->local_gid, &req_param->primary_path->sgid,
+                      sizeof(req->local_gid));
+               req->has_gid    = true;
+               req->service_id = req_param->primary_path->service_id;
+               req->pkey       = req_param->bth_pkey;
+               break;
+       case IB_CM_SIDR_REQ_RECEIVED:
+               req->device     = sidr_param->listen_id->device;
+               req->port       = sidr_param->port;
+               req->has_gid    = false;
+               req->service_id = sidr_param->service_id;
+               req->pkey       = sidr_param->bth_pkey;
                break;
        default:
                return -EINVAL;
        }
+
        return 0;
 }
 
+static bool validate_ipv4_net_dev(struct net_device *net_dev,
+                                 const struct sockaddr_in *dst_addr,
+                                 const struct sockaddr_in *src_addr)
+{
+       __be32 daddr = dst_addr->sin_addr.s_addr,
+              saddr = src_addr->sin_addr.s_addr;
+       struct fib_result res;
+       struct flowi4 fl4;
+       int err;
+       bool ret;
+
+       if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
+           ipv4_is_lbcast(daddr) || ipv4_is_zeronet(saddr) ||
+           ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr) ||
+           ipv4_is_loopback(saddr))
+               return false;
+
+       memset(&fl4, 0, sizeof(fl4));
+       fl4.flowi4_iif = net_dev->ifindex;
+       fl4.daddr = daddr;
+       fl4.saddr = saddr;
+
+       rcu_read_lock();
+       err = fib_lookup(dev_net(net_dev), &fl4, &res, 0);
+       if (err)
+               return false;
+
+       ret = FIB_RES_DEV(res) == net_dev;
+       rcu_read_unlock();
+
+       return ret;
+}
+
+static bool validate_ipv6_net_dev(struct net_device *net_dev,
+                                 const struct sockaddr_in6 *dst_addr,
+                                 const struct sockaddr_in6 *src_addr)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+       const int strict = ipv6_addr_type(&dst_addr->sin6_addr) &
+                          IPV6_ADDR_LINKLOCAL;
+       struct rt6_info *rt = rt6_lookup(dev_net(net_dev), &dst_addr->sin6_addr,
+                                        &src_addr->sin6_addr, net_dev->ifindex,
+                                        strict);
+       bool ret;
+
+       if (!rt)
+               return false;
+
+       ret = rt->rt6i_idev->dev == net_dev;
+       ip6_rt_put(rt);
+
+       return ret;
+#else
+       return false;
+#endif
+}
+
+static bool validate_net_dev(struct net_device *net_dev,
+                            const struct sockaddr *daddr,
+                            const struct sockaddr *saddr)
+{
+       const struct sockaddr_in *daddr4 = (const struct sockaddr_in *)daddr;
+       const struct sockaddr_in *saddr4 = (const struct sockaddr_in *)saddr;
+       const struct sockaddr_in6 *daddr6 = (const struct sockaddr_in6 *)daddr;
+       const struct sockaddr_in6 *saddr6 = (const struct sockaddr_in6 *)saddr;
+
+       switch (daddr->sa_family) {
+       case AF_INET:
+               return saddr->sa_family == AF_INET &&
+                      validate_ipv4_net_dev(net_dev, daddr4, saddr4);
+
+       case AF_INET6:
+               return saddr->sa_family == AF_INET6 &&
+                      validate_ipv6_net_dev(net_dev, daddr6, saddr6);
+
+       default:
+               return false;
+       }
+}
+
+static struct net_device *cma_get_net_dev(struct ib_cm_event *ib_event,
+                                         const struct cma_req_info *req)
+{
+       struct sockaddr_storage listen_addr_storage, src_addr_storage;
+       struct sockaddr *listen_addr = (struct sockaddr *)&listen_addr_storage,
+                       *src_addr = (struct sockaddr *)&src_addr_storage;
+       struct net_device *net_dev;
+       const union ib_gid *gid = req->has_gid ? &req->local_gid : NULL;
+       int err;
+
+       err = cma_save_ip_info(listen_addr, src_addr, ib_event,
+                              req->service_id);
+       if (err)
+               return ERR_PTR(err);
+
+       net_dev = ib_get_net_dev_by_params(req->device, req->port, req->pkey,
+                                          gid, listen_addr);
+       if (!net_dev)
+               return ERR_PTR(-ENODEV);
+
+       if (!validate_net_dev(net_dev, listen_addr, src_addr)) {
+               dev_put(net_dev);
+               return ERR_PTR(-EHOSTUNREACH);
+       }
+
+       return net_dev;
+}
+
+static enum rdma_port_space rdma_ps_from_service_id(__be64 service_id)
+{
+       return (be64_to_cpu(service_id) >> 16) & 0xffff;
+}
+
+static bool cma_match_private_data(struct rdma_id_private *id_priv,
+                                  const struct cma_hdr *hdr)
+{
+       struct sockaddr *addr = cma_src_addr(id_priv);
+       __be32 ip4_addr;
+       struct in6_addr ip6_addr;
+
+       if (cma_any_addr(addr) && !id_priv->afonly)
+               return true;
+
+       switch (addr->sa_family) {
+       case AF_INET:
+               ip4_addr = ((struct sockaddr_in *)addr)->sin_addr.s_addr;
+               if (cma_get_ip_ver(hdr) != 4)
+                       return false;
+               if (!cma_any_addr(addr) &&
+                   hdr->dst_addr.ip4.addr != ip4_addr)
+                       return false;
+               break;
+       case AF_INET6:
+               ip6_addr = ((struct sockaddr_in6 *)addr)->sin6_addr;
+               if (cma_get_ip_ver(hdr) != 6)
+                       return false;
+               if (!cma_any_addr(addr) &&
+                   memcmp(&hdr->dst_addr.ip6, &ip6_addr, sizeof(ip6_addr)))
+                       return false;
+               break;
+       case AF_IB:
+               return true;
+       default:
+               return false;
+       }
+
+       return true;
+}
+
+static bool cma_match_net_dev(const struct rdma_id_private *id_priv,
+                             const struct net_device *net_dev)
+{
+       const struct rdma_addr *addr = &id_priv->id.route.addr;
+
+       if (!net_dev)
+               /* This request is an AF_IB request */
+               return addr->src_addr.ss_family == AF_IB;
+
+       return !addr->dev_addr.bound_dev_if ||
+              (net_eq(dev_net(net_dev), &init_net) &&
+               addr->dev_addr.bound_dev_if == net_dev->ifindex);
+}
+
+static struct rdma_id_private *cma_find_listener(
+               const struct rdma_bind_list *bind_list,
+               const struct ib_cm_id *cm_id,
+               const struct ib_cm_event *ib_event,
+               const struct cma_req_info *req,
+               const struct net_device *net_dev)
+{
+       struct rdma_id_private *id_priv, *id_priv_dev;
+
+       if (!bind_list)
+               return ERR_PTR(-EINVAL);
+
+       hlist_for_each_entry(id_priv, &bind_list->owners, node) {
+               if (cma_match_private_data(id_priv, ib_event->private_data)) {
+                       if (id_priv->id.device == cm_id->device &&
+                           cma_match_net_dev(id_priv, net_dev))
+                               return id_priv;
+                       list_for_each_entry(id_priv_dev,
+                                           &id_priv->listen_list,
+                                           listen_list) {
+                               if (id_priv_dev->id.device == cm_id->device &&
+                                   cma_match_net_dev(id_priv_dev, net_dev))
+                                       return id_priv_dev;
+                       }
+               }
+       }
+
+       return ERR_PTR(-EINVAL);
+}
+
+static struct rdma_id_private *cma_id_from_event(struct ib_cm_id *cm_id,
+                                                struct ib_cm_event *ib_event,
+                                                struct net_device **net_dev)
+{
+       struct cma_req_info req;
+       struct rdma_bind_list *bind_list;
+       struct rdma_id_private *id_priv;
+       int err;
+
+       err = cma_save_req_info(ib_event, &req);
+       if (err)
+               return ERR_PTR(err);
+
+       *net_dev = cma_get_net_dev(ib_event, &req);
+       if (IS_ERR(*net_dev)) {
+               if (PTR_ERR(*net_dev) == -EAFNOSUPPORT) {
+                       /* Assuming the protocol is AF_IB */
+                       *net_dev = NULL;
+               } else {
+                       return ERR_CAST(*net_dev);
+               }
+       }
+
+       bind_list = cma_ps_find(rdma_ps_from_service_id(req.service_id),
+                               cma_port_from_service_id(req.service_id));
+       id_priv = cma_find_listener(bind_list, cm_id, ib_event, &req, *net_dev);
+       if (IS_ERR(id_priv)) {
+               dev_put(*net_dev);
+               *net_dev = NULL;
+       }
+
+       return id_priv;
+}
+
 static inline int cma_user_data_offset(struct rdma_id_private *id_priv)
 {
        return cma_family(id_priv) == AF_IB ? 0 : sizeof(struct cma_hdr);
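cma_port_from_service_id() and rdma_ps_from_service_id() above undo the packing done by rdma_get_service_id() for IP-based port spaces: the port space sits in bits 16-31 and the port number in bits 0-15 of the host-order service ID. A worked sketch (hypothetical helper, not part of the patch):

/* Pack a service ID the way rdma_get_service_id() does, then check that the
 * two helpers above recover its parts. */
static void demo_service_id_layout(void)
{
	const u16 port = 4791;	/* arbitrary example port */
	__be64 sid = cpu_to_be64(((u64)RDMA_PS_TCP << 16) + port);

	WARN_ON(rdma_ps_from_service_id(sid) != RDMA_PS_TCP);
	WARN_ON(cma_port_from_service_id(sid) != port);
}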
@@ -1038,7 +1377,7 @@ static void cma_release_port(struct rdma_id_private *id_priv)
        mutex_lock(&lock);
        hlist_del(&id_priv->node);
        if (hlist_empty(&bind_list->owners)) {
-               idr_remove(bind_list->ps, bind_list->port);
+               cma_ps_remove(bind_list->ps, bind_list->port);
                kfree(bind_list);
        }
        mutex_unlock(&lock);
@@ -1216,11 +1555,15 @@ out:
 }
 
 static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,
-                                              struct ib_cm_event *ib_event)
+                                              struct ib_cm_event *ib_event,
+                                              struct net_device *net_dev)
 {
        struct rdma_id_private *id_priv;
        struct rdma_cm_id *id;
        struct rdma_route *rt;
+       const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family;
+       const __be64 service_id =
+                     ib_event->param.req_rcvd.primary_path->service_id;
        int ret;
 
        id = rdma_create_id(listen_id->event_handler, listen_id->context,
@@ -1229,7 +1572,9 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,
                return NULL;
 
        id_priv = container_of(id, struct rdma_id_private, id);
-       if (cma_save_net_info(id, listen_id, ib_event))
+       if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr,
+                             (struct sockaddr *)&id->route.addr.dst_addr,
+                             listen_id, ib_event, ss_family, service_id))
                goto err;
 
        rt = &id->route;
@@ -1243,14 +1588,16 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,
        if (rt->num_paths == 2)
                rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path;
 
-       if (cma_any_addr(cma_src_addr(id_priv))) {
-               rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND;
-               rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid);
-               ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey));
-       } else {
-               ret = cma_translate_addr(cma_src_addr(id_priv), &rt->addr.dev_addr);
+       if (net_dev) {
+               ret = rdma_copy_addr(&rt->addr.dev_addr, net_dev, NULL);
                if (ret)
                        goto err;
+       } else {
+               /* An AF_IB connection */
+               WARN_ON_ONCE(ss_family != AF_IB);
+
+               cma_translate_ib((struct sockaddr_ib *)cma_src_addr(id_priv),
+                                &rt->addr.dev_addr);
        }
        rdma_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid);
 
@@ -1263,10 +1610,12 @@ err:
 }
 
 static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id,
-                                             struct ib_cm_event *ib_event)
+                                             struct ib_cm_event *ib_event,
+                                             struct net_device *net_dev)
 {
        struct rdma_id_private *id_priv;
        struct rdma_cm_id *id;
+       const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family;
        int ret;
 
        id = rdma_create_id(listen_id->event_handler, listen_id->context,
@@ -1275,13 +1624,24 @@ static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id,
                return NULL;
 
        id_priv = container_of(id, struct rdma_id_private, id);
-       if (cma_save_net_info(id, listen_id, ib_event))
+       if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr,
+                             (struct sockaddr *)&id->route.addr.dst_addr,
+                             listen_id, ib_event, ss_family,
+                             ib_event->param.sidr_req_rcvd.service_id))
                goto err;
 
-       if (!cma_any_addr((struct sockaddr *) &id->route.addr.src_addr)) {
-               ret = cma_translate_addr(cma_src_addr(id_priv), &id->route.addr.dev_addr);
+       if (net_dev) {
+               ret = rdma_copy_addr(&id->route.addr.dev_addr, net_dev, NULL);
                if (ret)
                        goto err;
+       } else {
+               /* An AF_IB connection */
+               WARN_ON_ONCE(ss_family != AF_IB);
+
+               if (!cma_any_addr(cma_src_addr(id_priv)))
+                       cma_translate_ib((struct sockaddr_ib *)
+                                               cma_src_addr(id_priv),
+                                        &id->route.addr.dev_addr);
        }
 
        id_priv->state = RDMA_CM_CONNECT;
@@ -1319,25 +1679,33 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
 {
        struct rdma_id_private *listen_id, *conn_id;
        struct rdma_cm_event event;
+       struct net_device *net_dev;
        int offset, ret;
 
-       listen_id = cm_id->context;
-       if (!cma_check_req_qp_type(&listen_id->id, ib_event))
-               return -EINVAL;
+       listen_id = cma_id_from_event(cm_id, ib_event, &net_dev);
+       if (IS_ERR(listen_id))
+               return PTR_ERR(listen_id);
 
-       if (cma_disable_callback(listen_id, RDMA_CM_LISTEN))
-               return -ECONNABORTED;
+       if (!cma_check_req_qp_type(&listen_id->id, ib_event)) {
+               ret = -EINVAL;
+               goto net_dev_put;
+       }
+
+       if (cma_disable_callback(listen_id, RDMA_CM_LISTEN)) {
+               ret = -ECONNABORTED;
+               goto net_dev_put;
+       }
 
        memset(&event, 0, sizeof event);
        offset = cma_user_data_offset(listen_id);
        event.event = RDMA_CM_EVENT_CONNECT_REQUEST;
        if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) {
-               conn_id = cma_new_udp_id(&listen_id->id, ib_event);
+               conn_id = cma_new_udp_id(&listen_id->id, ib_event, net_dev);
                event.param.ud.private_data = ib_event->private_data + offset;
                event.param.ud.private_data_len =
                                IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - offset;
        } else {
-               conn_id = cma_new_conn_id(&listen_id->id, ib_event);
+               conn_id = cma_new_conn_id(&listen_id->id, ib_event, net_dev);
                cma_set_req_event_data(&event, &ib_event->param.req_rcvd,
                                       ib_event->private_data, offset);
        }
@@ -1375,6 +1743,8 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
        mutex_unlock(&conn_id->handler_mutex);
        mutex_unlock(&listen_id->handler_mutex);
        cma_deref_id(conn_id);
+       if (net_dev)
+               dev_put(net_dev);
        return 0;
 
 err3:
@@ -1388,6 +1758,11 @@ err1:
        mutex_unlock(&listen_id->handler_mutex);
        if (conn_id)
                rdma_destroy_id(&conn_id->id);
+
+net_dev_put:
+       if (net_dev)
+               dev_put(net_dev);
+
        return ret;
 }
 
@@ -1400,42 +1775,6 @@ __be64 rdma_get_service_id(struct rdma_cm_id *id, struct sockaddr *addr)
 }
 EXPORT_SYMBOL(rdma_get_service_id);
 
-static void cma_set_compare_data(enum rdma_port_space ps, struct sockaddr *addr,
-                                struct ib_cm_compare_data *compare)
-{
-       struct cma_hdr *cma_data, *cma_mask;
-       __be32 ip4_addr;
-       struct in6_addr ip6_addr;
-
-       memset(compare, 0, sizeof *compare);
-       cma_data = (void *) compare->data;
-       cma_mask = (void *) compare->mask;
-
-       switch (addr->sa_family) {
-       case AF_INET:
-               ip4_addr = ((struct sockaddr_in *) addr)->sin_addr.s_addr;
-               cma_set_ip_ver(cma_data, 4);
-               cma_set_ip_ver(cma_mask, 0xF);
-               if (!cma_any_addr(addr)) {
-                       cma_data->dst_addr.ip4.addr = ip4_addr;
-                       cma_mask->dst_addr.ip4.addr = htonl(~0);
-               }
-               break;
-       case AF_INET6:
-               ip6_addr = ((struct sockaddr_in6 *) addr)->sin6_addr;
-               cma_set_ip_ver(cma_data, 6);
-               cma_set_ip_ver(cma_mask, 0xF);
-               if (!cma_any_addr(addr)) {
-                       cma_data->dst_addr.ip6 = ip6_addr;
-                       memset(&cma_mask->dst_addr.ip6, 0xFF,
-                              sizeof cma_mask->dst_addr.ip6);
-               }
-               break;
-       default:
-               break;
-       }
-}
-
 static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event)
 {
        struct rdma_id_private *id_priv = iw_id->context;
@@ -1589,33 +1928,18 @@ out:
 
 static int cma_ib_listen(struct rdma_id_private *id_priv)
 {
-       struct ib_cm_compare_data compare_data;
        struct sockaddr *addr;
        struct ib_cm_id *id;
        __be64 svc_id;
-       int ret;
 
-       id = ib_create_cm_id(id_priv->id.device, cma_req_handler, id_priv);
+       addr = cma_src_addr(id_priv);
+       svc_id = rdma_get_service_id(&id_priv->id, addr);
+       id = ib_cm_insert_listen(id_priv->id.device, cma_req_handler, svc_id);
        if (IS_ERR(id))
                return PTR_ERR(id);
-
        id_priv->cm_id.ib = id;
 
-       addr = cma_src_addr(id_priv);
-       svc_id = rdma_get_service_id(&id_priv->id, addr);
-       if (cma_any_addr(addr) && !id_priv->afonly)
-               ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, NULL);
-       else {
-               cma_set_compare_data(id_priv->id.ps, addr, &compare_data);
-               ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, &compare_data);
-       }
-
-       if (ret) {
-               ib_destroy_cm_id(id_priv->cm_id.ib);
-               id_priv->cm_id.ib = NULL;
-       }
-
-       return ret;
+       return 0;
 }
 
 static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog)
@@ -2203,8 +2527,11 @@ static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
                src_addr = (struct sockaddr *) &id->route.addr.src_addr;
                src_addr->sa_family = dst_addr->sa_family;
                if (dst_addr->sa_family == AF_INET6) {
-                       ((struct sockaddr_in6 *) src_addr)->sin6_scope_id =
-                               ((struct sockaddr_in6 *) dst_addr)->sin6_scope_id;
+                       struct sockaddr_in6 *src_addr6 = (struct sockaddr_in6 *) src_addr;
+                       struct sockaddr_in6 *dst_addr6 = (struct sockaddr_in6 *) dst_addr;
+                       src_addr6->sin6_scope_id = dst_addr6->sin6_scope_id;
+                       if (ipv6_addr_type(&dst_addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL)
+                               id->route.addr.dev_addr.bound_dev_if = dst_addr6->sin6_scope_id;
                } else if (dst_addr->sa_family == AF_IB) {
                        ((struct sockaddr_ib *) src_addr)->sib_pkey =
                                ((struct sockaddr_ib *) dst_addr)->sib_pkey;
@@ -2325,8 +2652,8 @@ static void cma_bind_port(struct rdma_bind_list *bind_list,
        hlist_add_head(&id_priv->node, &bind_list->owners);
 }
 
-static int cma_alloc_port(struct idr *ps, struct rdma_id_private *id_priv,
-                         unsigned short snum)
+static int cma_alloc_port(enum rdma_port_space ps,
+                         struct rdma_id_private *id_priv, unsigned short snum)
 {
        struct rdma_bind_list *bind_list;
        int ret;
@@ -2335,7 +2662,7 @@ static int cma_alloc_port(struct idr *ps, struct rdma_id_private *id_priv,
        if (!bind_list)
                return -ENOMEM;
 
-       ret = idr_alloc(ps, bind_list, snum, snum + 1, GFP_KERNEL);
+       ret = cma_ps_alloc(ps, bind_list, snum);
        if (ret < 0)
                goto err;
 
@@ -2348,7 +2675,8 @@ err:
        return ret == -ENOSPC ? -EADDRNOTAVAIL : ret;
 }
 
-static int cma_alloc_any_port(struct idr *ps, struct rdma_id_private *id_priv)
+static int cma_alloc_any_port(enum rdma_port_space ps,
+                             struct rdma_id_private *id_priv)
 {
        static unsigned int last_used_port;
        int low, high, remaining;
@@ -2359,7 +2687,7 @@ static int cma_alloc_any_port(struct idr *ps, struct rdma_id_private *id_priv)
        rover = prandom_u32() % remaining + low;
 retry:
        if (last_used_port != rover &&
-           !idr_find(ps, (unsigned short) rover)) {
+           !cma_ps_find(ps, (unsigned short)rover)) {
                int ret = cma_alloc_port(ps, id_priv, rover);
                /*
                 * Remember previously used port number in order to avoid
@@ -2414,7 +2742,8 @@ static int cma_check_port(struct rdma_bind_list *bind_list,
        return 0;
 }
 
-static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv)
+static int cma_use_port(enum rdma_port_space ps,
+                       struct rdma_id_private *id_priv)
 {
        struct rdma_bind_list *bind_list;
        unsigned short snum;
@@ -2424,7 +2753,7 @@ static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv)
        if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
                return -EACCES;
 
-       bind_list = idr_find(ps, snum);
+       bind_list = cma_ps_find(ps, snum);
        if (!bind_list) {
                ret = cma_alloc_port(ps, id_priv, snum);
        } else {
@@ -2447,25 +2776,24 @@ static int cma_bind_listen(struct rdma_id_private *id_priv)
        return ret;
 }
 
-static struct idr *cma_select_inet_ps(struct rdma_id_private *id_priv)
+static enum rdma_port_space cma_select_inet_ps(
+               struct rdma_id_private *id_priv)
 {
        switch (id_priv->id.ps) {
        case RDMA_PS_TCP:
-               return &tcp_ps;
        case RDMA_PS_UDP:
-               return &udp_ps;
        case RDMA_PS_IPOIB:
-               return &ipoib_ps;
        case RDMA_PS_IB:
-               return &ib_ps;
+               return id_priv->id.ps;
        default:
-               return NULL;
+
+               return 0;
        }
 }
 
-static struct idr *cma_select_ib_ps(struct rdma_id_private *id_priv)
+static enum rdma_port_space cma_select_ib_ps(struct rdma_id_private *id_priv)
 {
-       struct idr *ps = NULL;
+       enum rdma_port_space ps = 0;
        struct sockaddr_ib *sib;
        u64 sid_ps, mask, sid;
 
@@ -2475,15 +2803,15 @@ static struct idr *cma_select_ib_ps(struct rdma_id_private *id_priv)
 
        if ((id_priv->id.ps == RDMA_PS_IB) && (sid == (RDMA_IB_IP_PS_IB & mask))) {
                sid_ps = RDMA_IB_IP_PS_IB;
-               ps = &ib_ps;
+               ps = RDMA_PS_IB;
        } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_TCP)) &&
                   (sid == (RDMA_IB_IP_PS_TCP & mask))) {
                sid_ps = RDMA_IB_IP_PS_TCP;
-               ps = &tcp_ps;
+               ps = RDMA_PS_TCP;
        } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_UDP)) &&
                   (sid == (RDMA_IB_IP_PS_UDP & mask))) {
                sid_ps = RDMA_IB_IP_PS_UDP;
-               ps = &udp_ps;
+               ps = RDMA_PS_UDP;
        }
 
        if (ps) {
@@ -2496,7 +2824,7 @@ static struct idr *cma_select_ib_ps(struct rdma_id_private *id_priv)
 
 static int cma_get_port(struct rdma_id_private *id_priv)
 {
-       struct idr *ps;
+       enum rdma_port_space ps;
        int ret;
 
        if (cma_family(id_priv) != AF_IB)
@@ -3551,11 +3879,10 @@ static void cma_process_remove(struct cma_device *cma_dev)
        wait_for_completion(&cma_dev->comp);
 }
 
-static void cma_remove_one(struct ib_device *device)
+static void cma_remove_one(struct ib_device *device, void *client_data)
 {
-       struct cma_device *cma_dev;
+       struct cma_device *cma_dev = client_data;
 
-       cma_dev = ib_get_client_data(device, &cma_client);
        if (!cma_dev)
                return;
 
index 87d1936f5c1caa93e188af91e9aefb1e6484902b..70bb36ebb03b8e91ff2cf89445b0a8f52329e591 100644 (file)
@@ -43,12 +43,58 @@ int  ib_device_register_sysfs(struct ib_device *device,
                                                   u8, struct kobject *));
 void ib_device_unregister_sysfs(struct ib_device *device);
 
-int  ib_sysfs_setup(void);
-void ib_sysfs_cleanup(void);
-
-int  ib_cache_setup(void);
+void ib_cache_setup(void);
 void ib_cache_cleanup(void);
 
 int ib_resolve_eth_l2_attrs(struct ib_qp *qp,
                            struct ib_qp_attr *qp_attr, int *qp_attr_mask);
+
+typedef void (*roce_netdev_callback)(struct ib_device *device, u8 port,
+             struct net_device *idev, void *cookie);
+
+typedef int (*roce_netdev_filter)(struct ib_device *device, u8 port,
+            struct net_device *idev, void *cookie);
+
+void ib_enum_roce_netdev(struct ib_device *ib_dev,
+                        roce_netdev_filter filter,
+                        void *filter_cookie,
+                        roce_netdev_callback cb,
+                        void *cookie);
+void ib_enum_all_roce_netdevs(roce_netdev_filter filter,
+                             void *filter_cookie,
+                             roce_netdev_callback cb,
+                             void *cookie);
+
+int ib_cache_gid_find_by_port(struct ib_device *ib_dev,
+                             const union ib_gid *gid,
+                             u8 port, struct net_device *ndev,
+                             u16 *index);
+
+enum ib_cache_gid_default_mode {
+       IB_CACHE_GID_DEFAULT_MODE_SET,
+       IB_CACHE_GID_DEFAULT_MODE_DELETE
+};
+
+void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port,
+                                 struct net_device *ndev,
+                                 enum ib_cache_gid_default_mode mode);
+
+int ib_cache_gid_add(struct ib_device *ib_dev, u8 port,
+                    union ib_gid *gid, struct ib_gid_attr *attr);
+
+int ib_cache_gid_del(struct ib_device *ib_dev, u8 port,
+                    union ib_gid *gid, struct ib_gid_attr *attr);
+
+int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port,
+                                    struct net_device *ndev);
+
+int roce_gid_mgmt_init(void);
+void roce_gid_mgmt_cleanup(void);
+
+int roce_rescan_device(struct ib_device *ib_dev);
+
+int ib_cache_setup_one(struct ib_device *device);
+void ib_cache_cleanup_one(struct ib_device *device);
+void ib_cache_release_one(struct ib_device *device);
+
 #endif /* _CORE_PRIV_H */
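core_priv.h now exposes the RoCE GID-management plumbing: a filter/callback pair can be walked over every RoCE port of one device (ib_enum_roce_netdev) or of all registered devices (ib_enum_all_roce_netdevs). A sketch of an in-core consumer, assuming the usual one-lower-netdev-per-port model; the function names below are made up:

/* Print every RoCE port whose lower netdev is the given net_device. */
static int demo_match_netdev(struct ib_device *device, u8 port,
			     struct net_device *idev, void *cookie)
{
	return idev == cookie;
}

static void demo_report_port(struct ib_device *device, u8 port,
			     struct net_device *idev, void *cookie)
{
	pr_info("%s: port %u sits on top of %s\n",
		device->name, port, idev->name);
}

static void demo_report_ports(struct net_device *ndev)
{
	ib_enum_all_roce_netdevs(demo_match_netdev, ndev,
				 demo_report_port, NULL);
}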
index 9567756ca4f9f9024032adcc0211938583e96066..17639117afc6ab8637c7e73a3e2a13e6101a9702 100644 (file)
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/mutex.h>
+#include <linux/netdevice.h>
 #include <rdma/rdma_netlink.h>
+#include <rdma/ib_addr.h>
+#include <rdma/ib_cache.h>
 
 #include "core_priv.h"
 
@@ -50,22 +53,34 @@ struct ib_client_data {
        struct list_head  list;
        struct ib_client *client;
        void *            data;
+       /* The device or client is going down. Do not call client or device
+        * callbacks other than remove(). */
+       bool              going_down;
 };
 
 struct workqueue_struct *ib_wq;
 EXPORT_SYMBOL_GPL(ib_wq);
 
+/* The device_list and client_list contain devices and clients after their
+ * registration has completed, and the devices and clients are removed
+ * during unregistration. */
 static LIST_HEAD(device_list);
 static LIST_HEAD(client_list);
 
 /*
- * device_mutex protects access to both device_list and client_list.
- * There's no real point to using multiple locks or something fancier
- * like an rwsem: we always access both lists, and we're always
- * modifying one list or the other list.  In any case this is not a
- * hot path so there's no point in trying to optimize.
+ * device_mutex and lists_rwsem protect access to both device_list and
+ * client_list.  device_mutex protects writer access by device and client
+ * registration / de-registration.  lists_rwsem protects reader access to
+ * these lists.  Iterators of these lists must take lists_rwsem for read,
+ * while updates to the lists must be done holding it for write. A special
+ * case is when device_mutex is held: taking lists_rwsem for read is then
+ * unnecessary, since holding device_mutex already excludes writers.
+ *
+ * lists_rwsem also protects access to the client data list.
  */
 static DEFINE_MUTEX(device_mutex);
+static DECLARE_RWSEM(lists_rwsem);
+
 
 static int ib_device_check_mandatory(struct ib_device *device)
 {
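The comment above describes the new split: device_mutex still serializes registration and unregistration, while lists_rwsem lets readers walk device_list, client_list, and each device's client_data_list without taking device_mutex. A sketch of the reader-side pattern, as a hypothetical helper local to device.c:

/* Walk the registered devices without blocking on device_mutex. */
static void demo_for_each_device(void (*fn)(struct ib_device *dev))
{
	struct ib_device *dev;

	down_read(&lists_rwsem);
	list_for_each_entry(dev, &device_list, core_list)
		fn(dev);
	up_read(&lists_rwsem);
}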
@@ -152,6 +167,36 @@ static int alloc_name(char *name)
        return 0;
 }
 
+static void ib_device_release(struct device *device)
+{
+       struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+       ib_cache_release_one(dev);
+       kfree(dev->port_immutable);
+       kfree(dev);
+}
+
+static int ib_device_uevent(struct device *device,
+                           struct kobj_uevent_env *env)
+{
+       struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+       if (add_uevent_var(env, "NAME=%s", dev->name))
+               return -ENOMEM;
+
+       /*
+        * It would be nice to pass the node GUID with the event...
+        */
+
+       return 0;
+}
+
+static struct class ib_class = {
+       .name    = "infiniband",
+       .dev_release = ib_device_release,
+       .dev_uevent = ib_device_uevent,
+};
+
 /**
  * ib_alloc_device - allocate an IB device struct
  * @size:size of structure to allocate
@@ -164,9 +209,27 @@ static int alloc_name(char *name)
  */
 struct ib_device *ib_alloc_device(size_t size)
 {
-       BUG_ON(size < sizeof (struct ib_device));
+       struct ib_device *device;
+
+       if (WARN_ON(size < sizeof(struct ib_device)))
+               return NULL;
+
+       device = kzalloc(size, GFP_KERNEL);
+       if (!device)
+               return NULL;
+
+       device->dev.class = &ib_class;
+       device_initialize(&device->dev);
+
+       dev_set_drvdata(&device->dev, device);
+
+       INIT_LIST_HEAD(&device->event_handler_list);
+       spin_lock_init(&device->event_handler_lock);
+       spin_lock_init(&device->client_data_lock);
+       INIT_LIST_HEAD(&device->client_data_list);
+       INIT_LIST_HEAD(&device->port_list);
 
-       return kzalloc(size, GFP_KERNEL);
+       return device;
 }
 EXPORT_SYMBOL(ib_alloc_device);
 
@@ -178,13 +241,8 @@ EXPORT_SYMBOL(ib_alloc_device);
  */
 void ib_dealloc_device(struct ib_device *device)
 {
-       if (device->reg_state == IB_DEV_UNINITIALIZED) {
-               kfree(device);
-               return;
-       }
-
-       BUG_ON(device->reg_state != IB_DEV_UNREGISTERED);
-
+       WARN_ON(device->reg_state != IB_DEV_UNREGISTERED &&
+               device->reg_state != IB_DEV_UNINITIALIZED);
        kobject_put(&device->dev.kobj);
 }
 EXPORT_SYMBOL(ib_dealloc_device);
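ib_alloc_device() now fully initializes the embedded struct device and the per-device lists, and ib_dealloc_device() simply drops the last kobject reference; ib_device_release(), added above, then frees the cache, the per-port data, and the device itself. The usual error-path pattern in a driver probe remains valid; the sketch below is illustrative and not taken from this patch:

/* Allocate an ib_device, bail out before registering, and free it again. */
static int demo_probe(void)
{
	struct ib_device *dev = ib_alloc_device(sizeof(*dev));

	if (!dev)
		return -ENOMEM;

	/* ... hardware init fails before ib_register_device() ... */

	ib_dealloc_device(dev);		/* kobject_put() -> ib_device_release() */
	return -ENODEV;
}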
@@ -203,10 +261,13 @@ static int add_client_context(struct ib_device *device, struct ib_client *client
 
        context->client = client;
        context->data   = NULL;
+       context->going_down = false;
 
+       down_write(&lists_rwsem);
        spin_lock_irqsave(&device->client_data_lock, flags);
        list_add(&context->list, &device->client_data_list);
        spin_unlock_irqrestore(&device->client_data_lock, flags);
+       up_write(&lists_rwsem);
 
        return 0;
 }
@@ -219,7 +280,7 @@ static int verify_immutable(const struct ib_device *dev, u8 port)
 
 static int read_port_immutable(struct ib_device *device)
 {
-       int ret = -ENOMEM;
+       int ret;
        u8 start_port = rdma_start_port(device);
        u8 end_port = rdma_end_port(device);
        u8 port;
@@ -235,26 +296,18 @@ static int read_port_immutable(struct ib_device *device)
                                         * (end_port + 1),
                                         GFP_KERNEL);
        if (!device->port_immutable)
-               goto err;
+               return -ENOMEM;
 
        for (port = start_port; port <= end_port; ++port) {
                ret = device->get_port_immutable(device, port,
                                                 &device->port_immutable[port]);
                if (ret)
-                       goto err;
+                       return ret;
 
-               if (verify_immutable(device, port)) {
-                       ret = -EINVAL;
-                       goto err;
-               }
+               if (verify_immutable(device, port))
+                       return -EINVAL;
        }
-
-       ret = 0;
-       goto out;
-err:
-       kfree(device->port_immutable);
-out:
-       return ret;
+       return 0;
 }
 
 /**
@@ -271,6 +324,7 @@ int ib_register_device(struct ib_device *device,
                                            u8, struct kobject *))
 {
        int ret;
+       struct ib_client *client;
 
        mutex_lock(&device_mutex);
 
@@ -285,11 +339,6 @@ int ib_register_device(struct ib_device *device,
                goto out;
        }
 
-       INIT_LIST_HEAD(&device->event_handler_list);
-       INIT_LIST_HEAD(&device->client_data_list);
-       spin_lock_init(&device->event_handler_lock);
-       spin_lock_init(&device->client_data_lock);
-
        ret = read_port_immutable(device);
        if (ret) {
                printk(KERN_WARNING "Couldn't create per port immutable data %s\n",
@@ -297,27 +346,30 @@ int ib_register_device(struct ib_device *device,
                goto out;
        }
 
+       ret = ib_cache_setup_one(device);
+       if (ret) {
+               printk(KERN_WARNING "Couldn't set up InfiniBand P_Key/GID cache\n");
+               goto out;
+       }
+
        ret = ib_device_register_sysfs(device, port_callback);
        if (ret) {
                printk(KERN_WARNING "Couldn't register device %s with driver model\n",
                       device->name);
-               kfree(device->port_immutable);
+               ib_cache_cleanup_one(device);
                goto out;
        }
 
-       list_add_tail(&device->core_list, &device_list);
-
        device->reg_state = IB_DEV_REGISTERED;
 
-       {
-               struct ib_client *client;
-
-               list_for_each_entry(client, &client_list, list)
-                       if (client->add && !add_client_context(device, client))
-                               client->add(device);
-       }
+       list_for_each_entry(client, &client_list, list)
+               if (client->add && !add_client_context(device, client))
+                       client->add(device);
 
- out:
+       down_write(&lists_rwsem);
+       list_add_tail(&device->core_list, &device_list);
+       up_write(&lists_rwsem);
+out:
        mutex_unlock(&device_mutex);
        return ret;
 }
@@ -331,26 +383,37 @@ EXPORT_SYMBOL(ib_register_device);
  */
 void ib_unregister_device(struct ib_device *device)
 {
-       struct ib_client *client;
        struct ib_client_data *context, *tmp;
        unsigned long flags;
 
        mutex_lock(&device_mutex);
 
-       list_for_each_entry_reverse(client, &client_list, list)
-               if (client->remove)
-                       client->remove(device);
-
+       down_write(&lists_rwsem);
        list_del(&device->core_list);
+       spin_lock_irqsave(&device->client_data_lock, flags);
+       list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
+               context->going_down = true;
+       spin_unlock_irqrestore(&device->client_data_lock, flags);
+       downgrade_write(&lists_rwsem);
+
+       list_for_each_entry_safe(context, tmp, &device->client_data_list,
+                                list) {
+               if (context->client->remove)
+                       context->client->remove(device, context->data);
+       }
+       up_read(&lists_rwsem);
 
        mutex_unlock(&device_mutex);
 
        ib_device_unregister_sysfs(device);
+       ib_cache_cleanup_one(device);
 
+       down_write(&lists_rwsem);
        spin_lock_irqsave(&device->client_data_lock, flags);
        list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
                kfree(context);
        spin_unlock_irqrestore(&device->client_data_lock, flags);
+       up_write(&lists_rwsem);
 
        device->reg_state = IB_DEV_UNREGISTERED;
 }
@@ -375,11 +438,14 @@ int ib_register_client(struct ib_client *client)
 
        mutex_lock(&device_mutex);
 
-       list_add_tail(&client->list, &client_list);
        list_for_each_entry(device, &device_list, core_list)
                if (client->add && !add_client_context(device, client))
                        client->add(device);
 
+       down_write(&lists_rwsem);
+       list_add_tail(&client->list, &client_list);
+       up_write(&lists_rwsem);
+
        mutex_unlock(&device_mutex);
 
        return 0;
@@ -402,19 +468,41 @@ void ib_unregister_client(struct ib_client *client)
 
        mutex_lock(&device_mutex);
 
+       down_write(&lists_rwsem);
+       list_del(&client->list);
+       up_write(&lists_rwsem);
+
        list_for_each_entry(device, &device_list, core_list) {
-               if (client->remove)
-                       client->remove(device);
+               struct ib_client_data *found_context = NULL;
 
+               down_write(&lists_rwsem);
                spin_lock_irqsave(&device->client_data_lock, flags);
                list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
                        if (context->client == client) {
-                               list_del(&context->list);
-                               kfree(context);
+                               context->going_down = true;
+                               found_context = context;
+                               break;
                        }
                spin_unlock_irqrestore(&device->client_data_lock, flags);
+               up_write(&lists_rwsem);
+
+               if (client->remove)
+                       client->remove(device, found_context ?
+                                              found_context->data : NULL);
+
+               if (!found_context) {
+                       pr_warn("No client context found for %s/%s\n",
+                               device->name, client->name);
+                       continue;
+               }
+
+               down_write(&lists_rwsem);
+               spin_lock_irqsave(&device->client_data_lock, flags);
+               list_del(&found_context->list);
+               kfree(found_context);
+               spin_unlock_irqrestore(&device->client_data_lock, flags);
+               up_write(&lists_rwsem);
        }
-       list_del(&client->list);
 
        mutex_unlock(&device_mutex);
 }
@@ -590,10 +678,79 @@ EXPORT_SYMBOL(ib_query_port);
 int ib_query_gid(struct ib_device *device,
                 u8 port_num, int index, union ib_gid *gid)
 {
+       if (rdma_cap_roce_gid_table(device, port_num))
+               return ib_get_cached_gid(device, port_num, index, gid);
+
        return device->query_gid(device, port_num, index, gid);
 }
 EXPORT_SYMBOL(ib_query_gid);
 
+/**
+ * ib_enum_roce_netdev - enumerate all RoCE ports
+ * @ib_dev : IB device we want to query
+ * @filter: Should we call the callback?
+ * @filter_cookie: Cookie passed to filter
+ * @cb: Callback to call for each found RoCE port
+ * @cookie: Cookie passed back to the callback
+ *
+ * Enumerates all of the physical RoCE ports of ib_dev
+ * that are associated with a netdevice, and calls the callback
+ * on each port for which the filter function returns non-zero.
+ */
+void ib_enum_roce_netdev(struct ib_device *ib_dev,
+                        roce_netdev_filter filter,
+                        void *filter_cookie,
+                        roce_netdev_callback cb,
+                        void *cookie)
+{
+       u8 port;
+
+       for (port = rdma_start_port(ib_dev); port <= rdma_end_port(ib_dev);
+            port++)
+               if (rdma_protocol_roce(ib_dev, port)) {
+                       struct net_device *idev = NULL;
+
+                       if (ib_dev->get_netdev)
+                               idev = ib_dev->get_netdev(ib_dev, port);
+
+                       if (idev &&
+                           idev->reg_state >= NETREG_UNREGISTERED) {
+                               dev_put(idev);
+                               idev = NULL;
+                       }
+
+                       if (filter(ib_dev, port, idev, filter_cookie))
+                               cb(ib_dev, port, idev, cookie);
+
+                       if (idev)
+                               dev_put(idev);
+               }
+}
+
+/**
+ * ib_enum_all_roce_netdevs - enumerate all RoCE devices
+ * @filter: Should we call the callback?
+ * @filter_cookie: Cookie passed to filter
+ * @cb: Callback to call for each found RoCE port
+ * @cookie: Cookie passed back to the callback
+ *
+ * Enumerates the physical ports of all RoCE devices that are
+ * associated with a netdevice, and calls the callback on each
+ * port for which the filter function returns non-zero.
+ */
+void ib_enum_all_roce_netdevs(roce_netdev_filter filter,
+                             void *filter_cookie,
+                             roce_netdev_callback cb,
+                             void *cookie)
+{
+       struct ib_device *dev;
+
+       down_read(&lists_rwsem);
+       list_for_each_entry(dev, &device_list, core_list)
+               ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie);
+       up_read(&lists_rwsem);
+}
+
 /**
  * ib_query_pkey - Get P_Key table entry
  * @device:Device to query
@@ -673,6 +830,14 @@ int ib_find_gid(struct ib_device *device, union ib_gid *gid,
        int ret, port, i;
 
        for (port = rdma_start_port(device); port <= rdma_end_port(device); ++port) {
+               if (rdma_cap_roce_gid_table(device, port)) {
+                       if (!ib_cache_gid_find_by_port(device, gid, port,
+                                                      NULL, index)) {
+                               *port_num = port;
+                               return 0;
+                       }
+               }
+
                for (i = 0; i < device->port_immutable[port].gid_tbl_len; ++i) {
                        ret = ib_query_gid(device, port, i, &tmp_gid);
                        if (ret)
@@ -729,6 +894,51 @@ int ib_find_pkey(struct ib_device *device,
 }
 EXPORT_SYMBOL(ib_find_pkey);
 
+/**
+ * ib_get_net_dev_by_params() - Return the appropriate net_dev
+ * for a received CM request
+ * @dev:       An RDMA device on which the request has been received.
+ * @port:      Port number on the RDMA device.
+ * @pkey:      The Pkey the request came on.
+ * @gid:       A GID that the net_dev uses to communicate.
+ * @addr:      Contains the IP address that the request specified as its
+ *             destination.
+ */
+struct net_device *ib_get_net_dev_by_params(struct ib_device *dev,
+                                           u8 port,
+                                           u16 pkey,
+                                           const union ib_gid *gid,
+                                           const struct sockaddr *addr)
+{
+       struct net_device *net_dev = NULL;
+       struct ib_client_data *context;
+
+       if (!rdma_protocol_ib(dev, port))
+               return NULL;
+
+       down_read(&lists_rwsem);
+
+       list_for_each_entry(context, &dev->client_data_list, list) {
+               struct ib_client *client = context->client;
+
+               if (context->going_down)
+                       continue;
+
+               if (client->get_net_dev_by_params) {
+                       net_dev = client->get_net_dev_by_params(dev, port, pkey,
+                                                               gid, addr,
+                                                               context->data);
+                       if (net_dev)
+                               break;
+               }
+       }
+
+       up_read(&lists_rwsem);
+
+       return net_dev;
+}
+EXPORT_SYMBOL(ib_get_net_dev_by_params);
+
 static int __init ib_core_init(void)
 {
        int ret;
@@ -737,7 +947,7 @@ static int __init ib_core_init(void)
        if (!ib_wq)
                return -ENOMEM;
 
-       ret = ib_sysfs_setup();
+       ret = class_register(&ib_class);
        if (ret) {
                printk(KERN_WARNING "Couldn't create InfiniBand device class\n");
                goto err;
@@ -749,19 +959,12 @@ static int __init ib_core_init(void)
                goto err_sysfs;
        }
 
-       ret = ib_cache_setup();
-       if (ret) {
-               printk(KERN_WARNING "Couldn't set up InfiniBand P_Key/GID cache\n");
-               goto err_nl;
-       }
+       ib_cache_setup();
 
        return 0;
 
-err_nl:
-       ibnl_cleanup();
-
 err_sysfs:
-       ib_sysfs_cleanup();
+       class_unregister(&ib_class);
 
 err:
        destroy_workqueue(ib_wq);
@@ -772,7 +975,7 @@ static void __exit ib_core_cleanup(void)
 {
        ib_cache_cleanup();
        ibnl_cleanup();
-       ib_sysfs_cleanup();
+       class_unregister(&ib_class);
        /* Make sure that any pending umem accounting work is done. */
        destroy_workqueue(ib_wq);
 }
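
The device.c changes above also alter the ib_client API: remove() now receives the client context directly, so a ULP no longer needs an ib_get_client_data() lookup in its teardown path. A minimal sketch of a client following the new signature, assuming a hypothetical "example" client and context struct (illustration only, not part of this patch; needs <rdma/ib_verbs.h> and <linux/slab.h>):

struct example_ctx {
        struct ib_device *device;
};

static void example_add_one(struct ib_device *device);
static void example_remove_one(struct ib_device *device, void *client_data);

static struct ib_client example_client = {
        .name   = "example",
        .add    = example_add_one,
        .remove = example_remove_one,
};

static void example_add_one(struct ib_device *device)
{
        struct example_ctx *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

        if (!ctx)
                return;

        ctx->device = device;
        /* Stored here; handed back as client_data when remove() runs */
        ib_set_client_data(device, &example_client, ctx);
}

static void example_remove_one(struct ib_device *device, void *client_data)
{
        /* Context arrives directly; no ib_get_client_data() lookup needed */
        kfree(client_data);
}
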
index 786fc51bf04b22b0d9b0fc371f3fdb25ec4c811b..4b5c72311debbe59ae0975ec6d0fa722db13234e 100644 (file)
@@ -338,13 +338,6 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
                goto error1;
        }
 
-       mad_agent_priv->agent.mr = ib_get_dma_mr(port_priv->qp_info[qpn].qp->pd,
-                                                IB_ACCESS_LOCAL_WRITE);
-       if (IS_ERR(mad_agent_priv->agent.mr)) {
-               ret = ERR_PTR(-ENOMEM);
-               goto error2;
-       }
-
        if (mad_reg_req) {
                reg_req = kmemdup(mad_reg_req, sizeof *reg_req, GFP_KERNEL);
                if (!reg_req) {
@@ -429,8 +422,6 @@ error4:
        spin_unlock_irqrestore(&port_priv->reg_lock, flags);
        kfree(reg_req);
 error3:
-       ib_dereg_mr(mad_agent_priv->agent.mr);
-error2:
        kfree(mad_agent_priv);
 error1:
        return ret;
@@ -590,7 +581,6 @@ static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv)
        wait_for_completion(&mad_agent_priv->comp);
 
        kfree(mad_agent_priv->reg_req);
-       ib_dereg_mr(mad_agent_priv->agent.mr);
        kfree(mad_agent_priv);
 }
 
@@ -1038,7 +1028,7 @@ struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent,
 
        mad_send_wr->mad_agent_priv = mad_agent_priv;
        mad_send_wr->sg_list[0].length = hdr_len;
-       mad_send_wr->sg_list[0].lkey = mad_agent->mr->lkey;
+       mad_send_wr->sg_list[0].lkey = mad_agent->qp->pd->local_dma_lkey;
 
        /* OPA MADs don't have to be the full 2048 bytes */
        if (opa && base_version == OPA_MGMT_BASE_VERSION &&
@@ -1047,7 +1037,7 @@ struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent,
        else
                mad_send_wr->sg_list[1].length = mad_size - hdr_len;
 
-       mad_send_wr->sg_list[1].lkey = mad_agent->mr->lkey;
+       mad_send_wr->sg_list[1].lkey = mad_agent->qp->pd->local_dma_lkey;
 
        mad_send_wr->send_wr.wr_id = (unsigned long) mad_send_wr;
        mad_send_wr->send_wr.sg_list = mad_send_wr->sg_list;
@@ -2885,7 +2875,7 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
        struct ib_mad_queue *recv_queue = &qp_info->recv_queue;
 
        /* Initialize common scatter list fields */
-       sg_list.lkey = (*qp_info->port_priv->mr).lkey;
+       sg_list.lkey = qp_info->port_priv->pd->local_dma_lkey;
 
        /* Initialize common receive WR fields */
        recv_wr.next = NULL;
@@ -3201,13 +3191,6 @@ static int ib_mad_port_open(struct ib_device *device,
                goto error4;
        }
 
-       port_priv->mr = ib_get_dma_mr(port_priv->pd, IB_ACCESS_LOCAL_WRITE);
-       if (IS_ERR(port_priv->mr)) {
-               dev_err(&device->dev, "Couldn't get ib_mad DMA MR\n");
-               ret = PTR_ERR(port_priv->mr);
-               goto error5;
-       }
-
        if (has_smi) {
                ret = create_mad_qp(&port_priv->qp_info[0], IB_QPT_SMI);
                if (ret)
@@ -3248,8 +3231,6 @@ error8:
 error7:
        destroy_mad_qp(&port_priv->qp_info[0]);
 error6:
-       ib_dereg_mr(port_priv->mr);
-error5:
        ib_dealloc_pd(port_priv->pd);
 error4:
        ib_destroy_cq(port_priv->cq);
@@ -3284,7 +3265,6 @@ static int ib_mad_port_close(struct ib_device *device, int port_num)
        destroy_workqueue(port_priv->wq);
        destroy_mad_qp(&port_priv->qp_info[1]);
        destroy_mad_qp(&port_priv->qp_info[0]);
-       ib_dereg_mr(port_priv->mr);
        ib_dealloc_pd(port_priv->pd);
        ib_destroy_cq(port_priv->cq);
        cleanup_recv_queue(&port_priv->qp_info[1]);
@@ -3335,7 +3315,7 @@ error:
        }
 }
 
-static void ib_mad_remove_device(struct ib_device *device)
+static void ib_mad_remove_device(struct ib_device *device, void *client_data)
 {
        int i;
 
index 5be89f98928f203ff43208a5fed57f3890b6b82e..4a4f7aad09783de0cfd3fb92031b03890925b760 100644 (file)
@@ -199,7 +199,6 @@ struct ib_mad_port_private {
        int port_num;
        struct ib_cq *cq;
        struct ib_pd *pd;
-       struct ib_mr *mr;
 
        spinlock_t reg_lock;
        struct ib_mad_mgmt_version_table version[MAX_MGMT_VERSION];
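
Together, the mad.c and mad_priv.h hunks above drop the per-port and per-agent DMA MRs in favour of the protection domain's local_dma_lkey. A minimal sketch of the resulting pattern for filling a scatter/gather entry (hypothetical pd/dma_addr/buf_len variables; the buffer is assumed to have been mapped with ib_dma_map_single()):

struct ib_sge sge = {
        .addr   = dma_addr,
        .length = buf_len,
        .lkey   = pd->local_dma_lkey,   /* previously ib_get_dma_mr(pd, ...)->lkey */
};
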
index 2cb865c7ce7a98773f338b1b8c09ffc66db4cebf..d38d8b2b2979ddc2bebb243b98b79a04644fc929 100644 (file)
@@ -43,7 +43,7 @@
 #include "sa.h"
 
 static void mcast_add_one(struct ib_device *device);
-static void mcast_remove_one(struct ib_device *device);
+static void mcast_remove_one(struct ib_device *device, void *client_data);
 
 static struct ib_client mcast_client = {
        .name   = "ib_multicast",
@@ -840,13 +840,12 @@ static void mcast_add_one(struct ib_device *device)
        ib_register_event_handler(&dev->event_handler);
 }
 
-static void mcast_remove_one(struct ib_device *device)
+static void mcast_remove_one(struct ib_device *device, void *client_data)
 {
-       struct mcast_device *dev;
+       struct mcast_device *dev = client_data;
        struct mcast_port *port;
        int i;
 
-       dev = ib_get_client_data(device, &mcast_client);
        if (!dev)
                return;
 
index 23dd5a5c7597122de45eddf8211c736bedae553e..d47df935677966b6b8ceaf3217036ee8a16f4507 100644 (file)
@@ -49,6 +49,14 @@ static DEFINE_MUTEX(ibnl_mutex);
 static struct sock *nls;
 static LIST_HEAD(client_list);
 
+int ibnl_chk_listeners(unsigned int group)
+{
+       if (netlink_has_listeners(nls, group) == 0)
+               return -1;
+       return 0;
+}
+EXPORT_SYMBOL(ibnl_chk_listeners);
+
 int ibnl_add_client(int index, int nops,
                    const struct ibnl_client_cbs cb_table[])
 {
@@ -151,6 +159,23 @@ static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
                            !client->cb_table[op].dump)
                                return -EINVAL;
 
+                       /*
+                        * For responses or local service set_timeout requests,
+                        * there is no need to use netlink_dump_start.
+                        */
+                       if (!(nlh->nlmsg_flags & NLM_F_REQUEST) ||
+                           (index == RDMA_NL_LS &&
+                            op == RDMA_NL_LS_OP_SET_TIMEOUT)) {
+                               struct netlink_callback cb = {
+                                       .skb = skb,
+                                       .nlh = nlh,
+                                       .dump = client->cb_table[op].dump,
+                                       .module = client->cb_table[op].module,
+                               };
+
+                               return cb.dump(skb, &cb);
+                       }
+
                        {
                                struct netlink_dump_control c = {
                                        .dump = client->cb_table[op].dump,
@@ -165,9 +190,39 @@ static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
        return -EINVAL;
 }
 
+static void ibnl_rcv_reply_skb(struct sk_buff *skb)
+{
+       struct nlmsghdr *nlh;
+       int msglen;
+
+       /*
+        * Process responses until there are no more messages or until the
+        * first request is reached. Generally speaking, it is not recommended
+        * to mix responses with requests.
+        */
+       while (skb->len >= nlmsg_total_size(0)) {
+               nlh = nlmsg_hdr(skb);
+
+               if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len)
+                       return;
+
+               /* Handle response only */
+               if (nlh->nlmsg_flags & NLM_F_REQUEST)
+                       return;
+
+               ibnl_rcv_msg(skb, nlh);
+
+               msglen = NLMSG_ALIGN(nlh->nlmsg_len);
+               if (msglen > skb->len)
+                       msglen = skb->len;
+               skb_pull(skb, msglen);
+       }
+}
+
 static void ibnl_rcv(struct sk_buff *skb)
 {
        mutex_lock(&ibnl_mutex);
+       ibnl_rcv_reply_skb(skb);
        netlink_rcv_skb(skb, &ibnl_rcv_msg);
        mutex_unlock(&ibnl_mutex);
 }
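
With the netlink.c changes above, responses and RDMA_NL_LS_OP_SET_TIMEOUT requests are dispatched directly rather than through netlink_dump_start(), and ibnl_chk_listeners() is exported so callers can check whether userspace is subscribed to a multicast group before sending. A hedged sketch of such a check, assuming a hypothetical helper built around the RDMA_NL_GROUP_LS group:

/* Hypothetical helper: is a userspace local-service resolver listening? */
static bool ib_nl_resolver_available(void)
{
        return ibnl_chk_listeners(RDMA_NL_GROUP_LS) == 0;
}
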
diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c
new file mode 100644 (file)
index 0000000..6b24cba
--- /dev/null
@@ -0,0 +1,728 @@
+/*
+ * Copyright (c) 2015, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "core_priv.h"
+
+#include <linux/in.h>
+#include <linux/in6.h>
+
+/* For in6_dev_get/in6_dev_put */
+#include <net/addrconf.h>
+#include <net/bonding.h>
+
+#include <rdma/ib_cache.h>
+#include <rdma/ib_addr.h>
+
+enum gid_op_type {
+       GID_DEL = 0,
+       GID_ADD
+};
+
+struct update_gid_event_work {
+       struct work_struct work;
+       union ib_gid       gid;
+       struct ib_gid_attr gid_attr;
+       enum gid_op_type gid_op;
+};
+
+#define ROCE_NETDEV_CALLBACK_SZ                3
+struct netdev_event_work_cmd {
+       roce_netdev_callback    cb;
+       roce_netdev_filter      filter;
+       struct net_device       *ndev;
+       struct net_device       *filter_ndev;
+};
+
+struct netdev_event_work {
+       struct work_struct              work;
+       struct netdev_event_work_cmd    cmds[ROCE_NETDEV_CALLBACK_SZ];
+};
+
+static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev,
+                      u8 port, union ib_gid *gid,
+                      struct ib_gid_attr *gid_attr)
+{
+       switch (gid_op) {
+       case GID_ADD:
+               ib_cache_gid_add(ib_dev, port, gid, gid_attr);
+               break;
+       case GID_DEL:
+               ib_cache_gid_del(ib_dev, port, gid, gid_attr);
+               break;
+       }
+}
+
+enum bonding_slave_state {
+       BONDING_SLAVE_STATE_ACTIVE      = 1UL << 0,
+       BONDING_SLAVE_STATE_INACTIVE    = 1UL << 1,
+       /* No primary slave or the device isn't a slave in bonding */
+       BONDING_SLAVE_STATE_NA          = 1UL << 2,
+};
+
+static enum bonding_slave_state is_eth_active_slave_of_bonding_rcu(struct net_device *dev,
+                                                                  struct net_device *upper)
+{
+       if (upper && netif_is_bond_master(upper)) {
+               struct net_device *pdev =
+                       bond_option_active_slave_get_rcu(netdev_priv(upper));
+
+               if (pdev)
+                       return dev == pdev ? BONDING_SLAVE_STATE_ACTIVE :
+                               BONDING_SLAVE_STATE_INACTIVE;
+       }
+
+       return BONDING_SLAVE_STATE_NA;
+}
+
+static bool is_upper_dev_rcu(struct net_device *dev, struct net_device *upper)
+{
+       struct net_device *_upper = NULL;
+       struct list_head *iter;
+
+       netdev_for_each_all_upper_dev_rcu(dev, _upper, iter)
+               if (_upper == upper)
+                       break;
+
+       return _upper == upper;
+}
+
+#define REQUIRED_BOND_STATES           (BONDING_SLAVE_STATE_ACTIVE |   \
+                                        BONDING_SLAVE_STATE_NA)
+static int is_eth_port_of_netdev(struct ib_device *ib_dev, u8 port,
+                                struct net_device *rdma_ndev, void *cookie)
+{
+       struct net_device *event_ndev = (struct net_device *)cookie;
+       struct net_device *real_dev;
+       int res;
+
+       if (!rdma_ndev)
+               return 0;
+
+       rcu_read_lock();
+       real_dev = rdma_vlan_dev_real_dev(event_ndev);
+       if (!real_dev)
+               real_dev = event_ndev;
+
+       res = ((is_upper_dev_rcu(rdma_ndev, event_ndev) &&
+              (is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) &
+               REQUIRED_BOND_STATES)) ||
+              real_dev == rdma_ndev);
+
+       rcu_read_unlock();
+       return res;
+}
+
+static int is_eth_port_inactive_slave(struct ib_device *ib_dev, u8 port,
+                                     struct net_device *rdma_ndev, void *cookie)
+{
+       struct net_device *master_dev;
+       int res;
+
+       if (!rdma_ndev)
+               return 0;
+
+       rcu_read_lock();
+       master_dev = netdev_master_upper_dev_get_rcu(rdma_ndev);
+       res = is_eth_active_slave_of_bonding_rcu(rdma_ndev, master_dev) ==
+               BONDING_SLAVE_STATE_INACTIVE;
+       rcu_read_unlock();
+
+       return res;
+}
+
+static int pass_all_filter(struct ib_device *ib_dev, u8 port,
+                          struct net_device *rdma_ndev, void *cookie)
+{
+       return 1;
+}
+
+static int upper_device_filter(struct ib_device *ib_dev, u8 port,
+                              struct net_device *rdma_ndev, void *cookie)
+{
+       struct net_device *event_ndev = (struct net_device *)cookie;
+       int res;
+
+       if (!rdma_ndev)
+               return 0;
+
+       if (rdma_ndev == event_ndev)
+               return 1;
+
+       rcu_read_lock();
+       res = is_upper_dev_rcu(rdma_ndev, event_ndev);
+       rcu_read_unlock();
+
+       return res;
+}
+
+static void update_gid_ip(enum gid_op_type gid_op,
+                         struct ib_device *ib_dev,
+                         u8 port, struct net_device *ndev,
+                         struct sockaddr *addr)
+{
+       union ib_gid gid;
+       struct ib_gid_attr gid_attr;
+
+       rdma_ip2gid(addr, &gid);
+       memset(&gid_attr, 0, sizeof(gid_attr));
+       gid_attr.ndev = ndev;
+
+       update_gid(gid_op, ib_dev, port, &gid, &gid_attr);
+}
+
+static void enum_netdev_default_gids(struct ib_device *ib_dev,
+                                    u8 port, struct net_device *event_ndev,
+                                    struct net_device *rdma_ndev)
+{
+       rcu_read_lock();
+       if (!rdma_ndev ||
+           ((rdma_ndev != event_ndev &&
+             !is_upper_dev_rcu(rdma_ndev, event_ndev)) ||
+            is_eth_active_slave_of_bonding_rcu(rdma_ndev,
+                                               netdev_master_upper_dev_get_rcu(rdma_ndev)) ==
+            BONDING_SLAVE_STATE_INACTIVE)) {
+               rcu_read_unlock();
+               return;
+       }
+       rcu_read_unlock();
+
+       ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev,
+                                    IB_CACHE_GID_DEFAULT_MODE_SET);
+}
+
+static void bond_delete_netdev_default_gids(struct ib_device *ib_dev,
+                                           u8 port,
+                                           struct net_device *event_ndev,
+                                           struct net_device *rdma_ndev)
+{
+       struct net_device *real_dev = rdma_vlan_dev_real_dev(event_ndev);
+
+       if (!rdma_ndev)
+               return;
+
+       if (!real_dev)
+               real_dev = event_ndev;
+
+       rcu_read_lock();
+
+       if (is_upper_dev_rcu(rdma_ndev, event_ndev) &&
+           is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) ==
+           BONDING_SLAVE_STATE_INACTIVE) {
+               rcu_read_unlock();
+
+               ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev,
+                                            IB_CACHE_GID_DEFAULT_MODE_DELETE);
+       } else {
+               rcu_read_unlock();
+       }
+}
+
+static void enum_netdev_ipv4_ips(struct ib_device *ib_dev,
+                                u8 port, struct net_device *ndev)
+{
+       struct in_device *in_dev;
+
+       if (ndev->reg_state >= NETREG_UNREGISTERING)
+               return;
+
+       in_dev = in_dev_get(ndev);
+       if (!in_dev)
+               return;
+
+       for_ifa(in_dev) {
+               struct sockaddr_in ip;
+
+               ip.sin_family = AF_INET;
+               ip.sin_addr.s_addr = ifa->ifa_address;
+               update_gid_ip(GID_ADD, ib_dev, port, ndev,
+                             (struct sockaddr *)&ip);
+       }
+       endfor_ifa(in_dev);
+
+       in_dev_put(in_dev);
+}
+
+static void enum_netdev_ipv6_ips(struct ib_device *ib_dev,
+                                u8 port, struct net_device *ndev)
+{
+       struct inet6_ifaddr *ifp;
+       struct inet6_dev *in6_dev;
+       struct sin6_list {
+               struct list_head        list;
+               struct sockaddr_in6     sin6;
+       };
+       struct sin6_list *sin6_iter;
+       struct sin6_list *sin6_temp;
+       struct ib_gid_attr gid_attr = {.ndev = ndev};
+       LIST_HEAD(sin6_list);
+
+       if (ndev->reg_state >= NETREG_UNREGISTERING)
+               return;
+
+       in6_dev = in6_dev_get(ndev);
+       if (!in6_dev)
+               return;
+
+       read_lock_bh(&in6_dev->lock);
+       list_for_each_entry(ifp, &in6_dev->addr_list, if_list) {
+               struct sin6_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+
+               if (!entry) {
+                       pr_warn("roce_gid_mgmt: couldn't allocate entry for IPv6 update\n");
+                       continue;
+               }
+
+               entry->sin6.sin6_family = AF_INET6;
+               entry->sin6.sin6_addr = ifp->addr;
+               list_add_tail(&entry->list, &sin6_list);
+       }
+       read_unlock_bh(&in6_dev->lock);
+
+       in6_dev_put(in6_dev);
+
+       list_for_each_entry_safe(sin6_iter, sin6_temp, &sin6_list, list) {
+               union ib_gid    gid;
+
+               rdma_ip2gid((struct sockaddr *)&sin6_iter->sin6, &gid);
+               update_gid(GID_ADD, ib_dev, port, &gid, &gid_attr);
+               list_del(&sin6_iter->list);
+               kfree(sin6_iter);
+       }
+}
+
+static void _add_netdev_ips(struct ib_device *ib_dev, u8 port,
+                           struct net_device *ndev)
+{
+       enum_netdev_ipv4_ips(ib_dev, port, ndev);
+       if (IS_ENABLED(CONFIG_IPV6))
+               enum_netdev_ipv6_ips(ib_dev, port, ndev);
+}
+
+static void add_netdev_ips(struct ib_device *ib_dev, u8 port,
+                          struct net_device *rdma_ndev, void *cookie)
+{
+       struct net_device *event_ndev = (struct net_device *)cookie;
+
+       enum_netdev_default_gids(ib_dev, port, event_ndev, rdma_ndev);
+       _add_netdev_ips(ib_dev, port, event_ndev);
+}
+
+static void del_netdev_ips(struct ib_device *ib_dev, u8 port,
+                          struct net_device *rdma_ndev, void *cookie)
+{
+       struct net_device *event_ndev = (struct net_device *)cookie;
+
+       ib_cache_gid_del_all_netdev_gids(ib_dev, port, event_ndev);
+}
+
+static void enum_all_gids_of_dev_cb(struct ib_device *ib_dev,
+                                   u8 port,
+                                   struct net_device *rdma_ndev,
+                                   void *cookie)
+{
+       struct net *net;
+       struct net_device *ndev;
+
+       /* Lock the rtnl to make sure the netdevs do not move under
+        * our feet
+        */
+       rtnl_lock();
+       for_each_net(net)
+               for_each_netdev(net, ndev)
+                       if (is_eth_port_of_netdev(ib_dev, port, rdma_ndev, ndev))
+                               add_netdev_ips(ib_dev, port, rdma_ndev, ndev);
+       rtnl_unlock();
+}
+
+/* This function will rescan all of the network devices in the system
+ * and add their gids, as needed, to the relevant RoCE devices. */
+int roce_rescan_device(struct ib_device *ib_dev)
+{
+       ib_enum_roce_netdev(ib_dev, pass_all_filter, NULL,
+                           enum_all_gids_of_dev_cb, NULL);
+
+       return 0;
+}
+
+static void callback_for_addr_gid_device_scan(struct ib_device *device,
+                                             u8 port,
+                                             struct net_device *rdma_ndev,
+                                             void *cookie)
+{
+       struct update_gid_event_work *parsed = cookie;
+
+       return update_gid(parsed->gid_op, device,
+                         port, &parsed->gid,
+                         &parsed->gid_attr);
+}
+
+static void handle_netdev_upper(struct ib_device *ib_dev, u8 port,
+                               void *cookie,
+                               void (*handle_netdev)(struct ib_device *ib_dev,
+                                                     u8 port,
+                                                     struct net_device *ndev))
+{
+       struct net_device *ndev = (struct net_device *)cookie;
+       struct upper_list {
+               struct list_head list;
+               struct net_device *upper;
+       };
+       struct net_device *upper;
+       struct list_head *iter;
+       struct upper_list *upper_iter;
+       struct upper_list *upper_temp;
+       LIST_HEAD(upper_list);
+
+       rcu_read_lock();
+       netdev_for_each_all_upper_dev_rcu(ndev, upper, iter) {
+               struct upper_list *entry = kmalloc(sizeof(*entry),
+                                                  GFP_ATOMIC);
+
+               if (!entry) {
+                       pr_info("roce_gid_mgmt: couldn't allocate entry to delete ndev\n");
+                       continue;
+               }
+
+               list_add_tail(&entry->list, &upper_list);
+               dev_hold(upper);
+               entry->upper = upper;
+       }
+       rcu_read_unlock();
+
+       handle_netdev(ib_dev, port, ndev);
+       list_for_each_entry_safe(upper_iter, upper_temp, &upper_list,
+                                list) {
+               handle_netdev(ib_dev, port, upper_iter->upper);
+               dev_put(upper_iter->upper);
+               list_del(&upper_iter->list);
+               kfree(upper_iter);
+       }
+}
+
+static void _roce_del_all_netdev_gids(struct ib_device *ib_dev, u8 port,
+                                     struct net_device *event_ndev)
+{
+       ib_cache_gid_del_all_netdev_gids(ib_dev, port, event_ndev);
+}
+
+static void del_netdev_upper_ips(struct ib_device *ib_dev, u8 port,
+                                struct net_device *rdma_ndev, void *cookie)
+{
+       handle_netdev_upper(ib_dev, port, cookie, _roce_del_all_netdev_gids);
+}
+
+static void add_netdev_upper_ips(struct ib_device *ib_dev, u8 port,
+                                struct net_device *rdma_ndev, void *cookie)
+{
+       handle_netdev_upper(ib_dev, port, cookie, _add_netdev_ips);
+}
+
+static void del_netdev_default_ips_join(struct ib_device *ib_dev, u8 port,
+                                       struct net_device *rdma_ndev,
+                                       void *cookie)
+{
+       struct net_device *master_ndev;
+
+       rcu_read_lock();
+       master_ndev = netdev_master_upper_dev_get_rcu(rdma_ndev);
+       if (master_ndev)
+               dev_hold(master_ndev);
+       rcu_read_unlock();
+
+       if (master_ndev) {
+               bond_delete_netdev_default_gids(ib_dev, port, master_ndev,
+                                               rdma_ndev);
+               dev_put(master_ndev);
+       }
+}
+
+static void del_netdev_default_ips(struct ib_device *ib_dev, u8 port,
+                                  struct net_device *rdma_ndev, void *cookie)
+{
+       struct net_device *event_ndev = (struct net_device *)cookie;
+
+       bond_delete_netdev_default_gids(ib_dev, port, event_ndev, rdma_ndev);
+}
+
+/* The following functions operate on all IB devices. netdevice_event and
+ * addr_event execute ib_enum_all_roce_netdevs through a work item.
+ * ib_enum_all_roce_netdevs iterates through all IB devices.
+ */
+
+static void netdevice_event_work_handler(struct work_struct *_work)
+{
+       struct netdev_event_work *work =
+               container_of(_work, struct netdev_event_work, work);
+       unsigned int i;
+
+       for (i = 0; i < ARRAY_SIZE(work->cmds) && work->cmds[i].cb; i++) {
+               ib_enum_all_roce_netdevs(work->cmds[i].filter,
+                                        work->cmds[i].filter_ndev,
+                                        work->cmds[i].cb,
+                                        work->cmds[i].ndev);
+               dev_put(work->cmds[i].ndev);
+               dev_put(work->cmds[i].filter_ndev);
+       }
+
+       kfree(work);
+}
+
+static int netdevice_queue_work(struct netdev_event_work_cmd *cmds,
+                               struct net_device *ndev)
+{
+       unsigned int i;
+       struct netdev_event_work *ndev_work =
+               kmalloc(sizeof(*ndev_work), GFP_KERNEL);
+
+       if (!ndev_work) {
+               pr_warn("roce_gid_mgmt: can't allocate work for netdevice_event\n");
+               return NOTIFY_DONE;
+       }
+
+       memcpy(ndev_work->cmds, cmds, sizeof(ndev_work->cmds));
+       for (i = 0; i < ARRAY_SIZE(ndev_work->cmds) && ndev_work->cmds[i].cb; i++) {
+               if (!ndev_work->cmds[i].ndev)
+                       ndev_work->cmds[i].ndev = ndev;
+               if (!ndev_work->cmds[i].filter_ndev)
+                       ndev_work->cmds[i].filter_ndev = ndev;
+               dev_hold(ndev_work->cmds[i].ndev);
+               dev_hold(ndev_work->cmds[i].filter_ndev);
+       }
+       INIT_WORK(&ndev_work->work, netdevice_event_work_handler);
+
+       queue_work(ib_wq, &ndev_work->work);
+
+       return NOTIFY_DONE;
+}
+
+static const struct netdev_event_work_cmd add_cmd = {
+       .cb = add_netdev_ips, .filter = is_eth_port_of_netdev};
+static const struct netdev_event_work_cmd add_cmd_upper_ips = {
+       .cb = add_netdev_upper_ips, .filter = is_eth_port_of_netdev};
+
+static void netdevice_event_changeupper(struct netdev_notifier_changeupper_info *changeupper_info,
+                                       struct netdev_event_work_cmd *cmds)
+{
+       static const struct netdev_event_work_cmd upper_ips_del_cmd = {
+               .cb = del_netdev_upper_ips, .filter = upper_device_filter};
+       static const struct netdev_event_work_cmd bonding_default_del_cmd = {
+               .cb = del_netdev_default_ips, .filter = is_eth_port_inactive_slave};
+
+       if (changeupper_info->linking == false) {
+               cmds[0] = upper_ips_del_cmd;
+               cmds[0].ndev = changeupper_info->upper_dev;
+               cmds[1] = add_cmd;
+       } else {
+               cmds[0] = bonding_default_del_cmd;
+               cmds[0].ndev = changeupper_info->upper_dev;
+               cmds[1] = add_cmd_upper_ips;
+               cmds[1].ndev = changeupper_info->upper_dev;
+               cmds[1].filter_ndev = changeupper_info->upper_dev;
+       }
+}
+
+static int netdevice_event(struct notifier_block *this, unsigned long event,
+                          void *ptr)
+{
+       static const struct netdev_event_work_cmd del_cmd = {
+               .cb = del_netdev_ips, .filter = pass_all_filter};
+       static const struct netdev_event_work_cmd bonding_default_del_cmd_join = {
+               .cb = del_netdev_default_ips_join, .filter = is_eth_port_inactive_slave};
+       static const struct netdev_event_work_cmd default_del_cmd = {
+               .cb = del_netdev_default_ips, .filter = pass_all_filter};
+       static const struct netdev_event_work_cmd bonding_event_ips_del_cmd = {
+               .cb = del_netdev_upper_ips, .filter = upper_device_filter};
+       struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
+       struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ] = { {NULL} };
+
+       if (ndev->type != ARPHRD_ETHER)
+               return NOTIFY_DONE;
+
+       switch (event) {
+       case NETDEV_REGISTER:
+       case NETDEV_UP:
+               cmds[0] = bonding_default_del_cmd_join;
+               cmds[1] = add_cmd;
+               break;
+
+       case NETDEV_UNREGISTER:
+               if (ndev->reg_state < NETREG_UNREGISTERED)
+                       cmds[0] = del_cmd;
+               else
+                       return NOTIFY_DONE;
+               break;
+
+       case NETDEV_CHANGEADDR:
+               cmds[0] = default_del_cmd;
+               cmds[1] = add_cmd;
+               break;
+
+       case NETDEV_CHANGEUPPER:
+               netdevice_event_changeupper(
+                       container_of(ptr, struct netdev_notifier_changeupper_info, info),
+                       cmds);
+               break;
+
+       case NETDEV_BONDING_FAILOVER:
+               cmds[0] = bonding_event_ips_del_cmd;
+               cmds[1] = bonding_default_del_cmd_join;
+               cmds[2] = add_cmd_upper_ips;
+               break;
+
+       default:
+               return NOTIFY_DONE;
+       }
+
+       return netdevice_queue_work(cmds, ndev);
+}
+
+static void update_gid_event_work_handler(struct work_struct *_work)
+{
+       struct update_gid_event_work *work =
+               container_of(_work, struct update_gid_event_work, work);
+
+       ib_enum_all_roce_netdevs(is_eth_port_of_netdev, work->gid_attr.ndev,
+                                callback_for_addr_gid_device_scan, work);
+
+       dev_put(work->gid_attr.ndev);
+       kfree(work);
+}
+
+static int addr_event(struct notifier_block *this, unsigned long event,
+                     struct sockaddr *sa, struct net_device *ndev)
+{
+       struct update_gid_event_work *work;
+       enum gid_op_type gid_op;
+
+       if (ndev->type != ARPHRD_ETHER)
+               return NOTIFY_DONE;
+
+       switch (event) {
+       case NETDEV_UP:
+               gid_op = GID_ADD;
+               break;
+
+       case NETDEV_DOWN:
+               gid_op = GID_DEL;
+               break;
+
+       default:
+               return NOTIFY_DONE;
+       }
+
+       work = kmalloc(sizeof(*work), GFP_ATOMIC);
+       if (!work) {
+               pr_warn("roce_gid_mgmt: Couldn't allocate work for addr_event\n");
+               return NOTIFY_DONE;
+       }
+
+       INIT_WORK(&work->work, update_gid_event_work_handler);
+
+       rdma_ip2gid(sa, &work->gid);
+       work->gid_op = gid_op;
+
+       memset(&work->gid_attr, 0, sizeof(work->gid_attr));
+       dev_hold(ndev);
+       work->gid_attr.ndev   = ndev;
+
+       queue_work(ib_wq, &work->work);
+
+       return NOTIFY_DONE;
+}
+
+static int inetaddr_event(struct notifier_block *this, unsigned long event,
+                         void *ptr)
+{
+       struct sockaddr_in      in;
+       struct net_device       *ndev;
+       struct in_ifaddr        *ifa = ptr;
+
+       in.sin_family = AF_INET;
+       in.sin_addr.s_addr = ifa->ifa_address;
+       ndev = ifa->ifa_dev->dev;
+
+       return addr_event(this, event, (struct sockaddr *)&in, ndev);
+}
+
+static int inet6addr_event(struct notifier_block *this, unsigned long event,
+                          void *ptr)
+{
+       struct sockaddr_in6     in6;
+       struct net_device       *ndev;
+       struct inet6_ifaddr     *ifa6 = ptr;
+
+       in6.sin6_family = AF_INET6;
+       in6.sin6_addr = ifa6->addr;
+       ndev = ifa6->idev->dev;
+
+       return addr_event(this, event, (struct sockaddr *)&in6, ndev);
+}
+
+static struct notifier_block nb_netdevice = {
+       .notifier_call = netdevice_event
+};
+
+static struct notifier_block nb_inetaddr = {
+       .notifier_call = inetaddr_event
+};
+
+static struct notifier_block nb_inet6addr = {
+       .notifier_call = inet6addr_event
+};
+
+int __init roce_gid_mgmt_init(void)
+{
+       register_inetaddr_notifier(&nb_inetaddr);
+       if (IS_ENABLED(CONFIG_IPV6))
+               register_inet6addr_notifier(&nb_inet6addr);
+       /* We rely on the netdevice notifier to enumerate all
+        * existing devices in the system. Register this notifier
+        * last to make sure we do not miss any IP add/del
+        * callbacks.
+        */
+       register_netdevice_notifier(&nb_netdevice);
+
+       return 0;
+}
+
+void __exit roce_gid_mgmt_cleanup(void)
+{
+       if (IS_ENABLED(CONFIG_IPV6))
+               unregister_inet6addr_notifier(&nb_inet6addr);
+       unregister_inetaddr_notifier(&nb_inetaddr);
+       unregister_netdevice_notifier(&nb_netdevice);
+       /* Ensure all gid deletion tasks complete before we go down,
+        * to avoid any reference to freed memory. By the time
+        * ib-core is removed, all physical devices have been removed,
+        * so no issue with remaining hardware contexts.
+        */
+}
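
roce_gid_mgmt.c relies on the provider's get_netdev() hook, and ib_enum_roce_netdev() above drops the returned reference with dev_put(). A minimal sketch of such a callback, assuming a hypothetical example_dev structure with an RCU-protected per-port net_device array (illustration only, not part of this patch):

static struct net_device *example_get_netdev(struct ib_device *ib_dev,
                                             u8 port_num)
{
        struct example_dev *dev =
                container_of(ib_dev, struct example_dev, ib_dev);
        struct net_device *ndev;

        rcu_read_lock();
        ndev = rcu_dereference(dev->ndev[port_num - 1]);
        if (ndev)
                dev_hold(ndev); /* ib_enum_roce_netdev() will dev_put() it */
        rcu_read_unlock();

        return ndev;
}
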
index ca919f4296664f070f0c63b1765542375818f0aa..8c014b33d8e0a5bc97e6e2408b3ded37ddfd06c2 100644 (file)
 #include <uapi/linux/if_ether.h>
 #include <rdma/ib_pack.h>
 #include <rdma/ib_cache.h>
+#include <rdma/rdma_netlink.h>
+#include <net/netlink.h>
+#include <uapi/rdma/ib_user_sa.h>
+#include <rdma/ib_marshall.h>
 #include "sa.h"
 
 MODULE_AUTHOR("Roland Dreier");
 MODULE_DESCRIPTION("InfiniBand subnet administration query support");
 MODULE_LICENSE("Dual BSD/GPL");
 
+#define IB_SA_LOCAL_SVC_TIMEOUT_MIN            100
+#define IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT                2000
+#define IB_SA_LOCAL_SVC_TIMEOUT_MAX            200000
+static int sa_local_svc_timeout_ms = IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT;
+
 struct ib_sa_sm_ah {
        struct ib_ah        *ah;
        struct kref          ref;
@@ -80,8 +89,16 @@ struct ib_sa_query {
        struct ib_mad_send_buf *mad_buf;
        struct ib_sa_sm_ah     *sm_ah;
        int                     id;
+       u32                     flags;
+       struct list_head        list; /* Local svc request list */
+       u32                     seq; /* Local svc request sequence number */
+       unsigned long           timeout; /* Local svc timeout */
+       u8                      path_use; /* How will the pathrecord be used */
 };
 
+#define IB_SA_ENABLE_LOCAL_SERVICE     0x00000001
+#define IB_SA_CANCEL                   0x00000002
+
 struct ib_sa_service_query {
        void (*callback)(int, struct ib_sa_service_rec *, void *);
        void *context;
@@ -106,8 +123,28 @@ struct ib_sa_mcmember_query {
        struct ib_sa_query sa_query;
 };
 
+static LIST_HEAD(ib_nl_request_list);
+static DEFINE_SPINLOCK(ib_nl_request_lock);
+static atomic_t ib_nl_sa_request_seq;
+static struct workqueue_struct *ib_nl_wq;
+static struct delayed_work ib_nl_timed_work;
+static const struct nla_policy ib_nl_policy[LS_NLA_TYPE_MAX] = {
+       [LS_NLA_TYPE_PATH_RECORD]       = {.type = NLA_BINARY,
+               .len = sizeof(struct ib_path_rec_data)},
+       [LS_NLA_TYPE_TIMEOUT]           = {.type = NLA_U32},
+       [LS_NLA_TYPE_SERVICE_ID]        = {.type = NLA_U64},
+       [LS_NLA_TYPE_DGID]              = {.type = NLA_BINARY,
+               .len = sizeof(struct rdma_nla_ls_gid)},
+       [LS_NLA_TYPE_SGID]              = {.type = NLA_BINARY,
+               .len = sizeof(struct rdma_nla_ls_gid)},
+       [LS_NLA_TYPE_TCLASS]            = {.type = NLA_U8},
+       [LS_NLA_TYPE_PKEY]              = {.type = NLA_U16},
+       [LS_NLA_TYPE_QOS_CLASS]         = {.type = NLA_U16},
+};
+
+
 static void ib_sa_add_one(struct ib_device *device);
-static void ib_sa_remove_one(struct ib_device *device);
+static void ib_sa_remove_one(struct ib_device *device, void *client_data);
 
 static struct ib_client sa_client = {
        .name   = "sa",
@@ -381,6 +418,427 @@ static const struct ib_field guidinfo_rec_table[] = {
          .size_bits    = 512 },
 };
 
+static inline void ib_sa_disable_local_svc(struct ib_sa_query *query)
+{
+       query->flags &= ~IB_SA_ENABLE_LOCAL_SERVICE;
+}
+
+static inline int ib_sa_query_cancelled(struct ib_sa_query *query)
+{
+       return (query->flags & IB_SA_CANCEL);
+}
+
+static void ib_nl_set_path_rec_attrs(struct sk_buff *skb,
+                                    struct ib_sa_query *query)
+{
+       struct ib_sa_path_rec *sa_rec = query->mad_buf->context[1];
+       struct ib_sa_mad *mad = query->mad_buf->mad;
+       ib_sa_comp_mask comp_mask = mad->sa_hdr.comp_mask;
+       u16 val16;
+       u64 val64;
+       struct rdma_ls_resolve_header *header;
+
+       query->mad_buf->context[1] = NULL;
+
+       /* Construct the family header first */
+       header = (struct rdma_ls_resolve_header *)
+               skb_put(skb, NLMSG_ALIGN(sizeof(*header)));
+       memcpy(header->device_name, query->port->agent->device->name,
+              LS_DEVICE_NAME_MAX);
+       header->port_num = query->port->port_num;
+
+       if ((comp_mask & IB_SA_PATH_REC_REVERSIBLE) &&
+           sa_rec->reversible != 0)
+               query->path_use = LS_RESOLVE_PATH_USE_GMP;
+       else
+               query->path_use = LS_RESOLVE_PATH_USE_UNIDIRECTIONAL;
+       header->path_use = query->path_use;
+
+       /* Now build the attributes */
+       if (comp_mask & IB_SA_PATH_REC_SERVICE_ID) {
+               val64 = be64_to_cpu(sa_rec->service_id);
+               nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_SERVICE_ID,
+                       sizeof(val64), &val64);
+       }
+       if (comp_mask & IB_SA_PATH_REC_DGID)
+               nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_DGID,
+                       sizeof(sa_rec->dgid), &sa_rec->dgid);
+       if (comp_mask & IB_SA_PATH_REC_SGID)
+               nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_SGID,
+                       sizeof(sa_rec->sgid), &sa_rec->sgid);
+       if (comp_mask & IB_SA_PATH_REC_TRAFFIC_CLASS)
+               nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_TCLASS,
+                       sizeof(sa_rec->traffic_class), &sa_rec->traffic_class);
+
+       if (comp_mask & IB_SA_PATH_REC_PKEY) {
+               val16 = be16_to_cpu(sa_rec->pkey);
+               nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_PKEY,
+                       sizeof(val16), &val16);
+       }
+       if (comp_mask & IB_SA_PATH_REC_QOS_CLASS) {
+               val16 = be16_to_cpu(sa_rec->qos_class);
+               nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_QOS_CLASS,
+                       sizeof(val16), &val16);
+       }
+}
+
+static int ib_nl_get_path_rec_attrs_len(ib_sa_comp_mask comp_mask)
+{
+       int len = 0;
+
+       if (comp_mask & IB_SA_PATH_REC_SERVICE_ID)
+               len += nla_total_size(sizeof(u64));
+       if (comp_mask & IB_SA_PATH_REC_DGID)
+               len += nla_total_size(sizeof(struct rdma_nla_ls_gid));
+       if (comp_mask & IB_SA_PATH_REC_SGID)
+               len += nla_total_size(sizeof(struct rdma_nla_ls_gid));
+       if (comp_mask & IB_SA_PATH_REC_TRAFFIC_CLASS)
+               len += nla_total_size(sizeof(u8));
+       if (comp_mask & IB_SA_PATH_REC_PKEY)
+               len += nla_total_size(sizeof(u16));
+       if (comp_mask & IB_SA_PATH_REC_QOS_CLASS)
+               len += nla_total_size(sizeof(u16));
+
+       /*
+        * Make sure that at least some of the required comp_mask bits are
+        * set.
+        */
+       if (WARN_ON(len == 0))
+               return len;
+
+       /* Add the family header */
+       len += NLMSG_ALIGN(sizeof(struct rdma_ls_resolve_header));
+
+       return len;
+}
+
+static int ib_nl_send_msg(struct ib_sa_query *query)
+{
+       struct sk_buff *skb = NULL;
+       struct nlmsghdr *nlh;
+       void *data;
+       int ret = 0;
+       struct ib_sa_mad *mad;
+       int len;
+
+       mad = query->mad_buf->mad;
+       len = ib_nl_get_path_rec_attrs_len(mad->sa_hdr.comp_mask);
+       if (len <= 0)
+               return -EMSGSIZE;
+
+       skb = nlmsg_new(len, GFP_KERNEL);
+       if (!skb)
+               return -ENOMEM;
+
+       /* Put nlmsg header only for now */
+       data = ibnl_put_msg(skb, &nlh, query->seq, 0, RDMA_NL_LS,
+                           RDMA_NL_LS_OP_RESOLVE, NLM_F_REQUEST);
+       if (!data) {
+               kfree_skb(skb);
+               return -EMSGSIZE;
+       }
+
+       /* Add attributes */
+       ib_nl_set_path_rec_attrs(skb, query);
+
+       /* Repair the nlmsg header length */
+       nlmsg_end(skb, nlh);
+
+       ret = ibnl_multicast(skb, nlh, RDMA_NL_GROUP_LS, GFP_KERNEL);
+       if (!ret)
+               ret = len;
+       else
+               ret = 0;
+
+       return ret;
+}
+
+static int ib_nl_make_request(struct ib_sa_query *query)
+{
+       unsigned long flags;
+       unsigned long delay;
+       int ret;
+
+       INIT_LIST_HEAD(&query->list);
+       query->seq = (u32)atomic_inc_return(&ib_nl_sa_request_seq);
+
+       spin_lock_irqsave(&ib_nl_request_lock, flags);
+       ret = ib_nl_send_msg(query);
+       if (ret <= 0) {
+               ret = -EIO;
+               goto request_out;
+       } else {
+               ret = 0;
+       }
+
+       delay = msecs_to_jiffies(sa_local_svc_timeout_ms);
+       query->timeout = delay + jiffies;
+       list_add_tail(&query->list, &ib_nl_request_list);
+       /* Start the timeout if this is the only request */
+       if (ib_nl_request_list.next == &query->list)
+               queue_delayed_work(ib_nl_wq, &ib_nl_timed_work, delay);
+
+request_out:
+       spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+
+       return ret;
+}
+
+static int ib_nl_cancel_request(struct ib_sa_query *query)
+{
+       unsigned long flags;
+       struct ib_sa_query *wait_query;
+       int found = 0;
+
+       spin_lock_irqsave(&ib_nl_request_lock, flags);
+       list_for_each_entry(wait_query, &ib_nl_request_list, list) {
+               /* Let the timeout take care of the callback */
+               if (query == wait_query) {
+                       query->flags |= IB_SA_CANCEL;
+                       query->timeout = jiffies;
+                       list_move(&query->list, &ib_nl_request_list);
+                       found = 1;
+                       mod_delayed_work(ib_nl_wq, &ib_nl_timed_work, 1);
+                       break;
+               }
+       }
+       spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+
+       return found;
+}
+
+static void send_handler(struct ib_mad_agent *agent,
+                        struct ib_mad_send_wc *mad_send_wc);
+
+static void ib_nl_process_good_resolve_rsp(struct ib_sa_query *query,
+                                          const struct nlmsghdr *nlh)
+{
+       struct ib_mad_send_wc mad_send_wc;
+       struct ib_sa_mad *mad = NULL;
+       const struct nlattr *head, *curr;
+       struct ib_path_rec_data  *rec;
+       int len, rem;
+       u32 mask = 0;
+       int status = -EIO;
+
+       if (query->callback) {
+               head = (const struct nlattr *) nlmsg_data(nlh);
+               len = nlmsg_len(nlh);
+               switch (query->path_use) {
+               case LS_RESOLVE_PATH_USE_UNIDIRECTIONAL:
+                       mask = IB_PATH_PRIMARY | IB_PATH_OUTBOUND;
+                       break;
+
+               case LS_RESOLVE_PATH_USE_ALL:
+               case LS_RESOLVE_PATH_USE_GMP:
+               default:
+                       mask = IB_PATH_PRIMARY | IB_PATH_GMP |
+                               IB_PATH_BIDIRECTIONAL;
+                       break;
+               }
+               nla_for_each_attr(curr, head, len, rem) {
+                       if (curr->nla_type == LS_NLA_TYPE_PATH_RECORD) {
+                               rec = nla_data(curr);
+                               /*
+                                * Get the first one. In the future, we may
+                                * need to get up to 6 pathrecords.
+                                */
+                               if ((rec->flags & mask) == mask) {
+                                       mad = query->mad_buf->mad;
+                                       mad->mad_hdr.method |=
+                                               IB_MGMT_METHOD_RESP;
+                                       memcpy(mad->data, rec->path_rec,
+                                              sizeof(rec->path_rec));
+                                       status = 0;
+                                       break;
+                               }
+                       }
+               }
+               query->callback(query, status, mad);
+       }
+
+       mad_send_wc.send_buf = query->mad_buf;
+       mad_send_wc.status = IB_WC_SUCCESS;
+       send_handler(query->mad_buf->mad_agent, &mad_send_wc);
+}
+
+static void ib_nl_request_timeout(struct work_struct *work)
+{
+       unsigned long flags;
+       struct ib_sa_query *query;
+       unsigned long delay;
+       struct ib_mad_send_wc mad_send_wc;
+       int ret;
+
+       spin_lock_irqsave(&ib_nl_request_lock, flags);
+       while (!list_empty(&ib_nl_request_list)) {
+               query = list_entry(ib_nl_request_list.next,
+                                  struct ib_sa_query, list);
+
+               if (time_after(query->timeout, jiffies)) {
+                       delay = query->timeout - jiffies;
+                       if ((long)delay <= 0)
+                               delay = 1;
+                       queue_delayed_work(ib_nl_wq, &ib_nl_timed_work, delay);
+                       break;
+               }
+
+               list_del(&query->list);
+               ib_sa_disable_local_svc(query);
+               /* Hold the lock to protect against query cancellation */
+               if (ib_sa_query_cancelled(query))
+                       ret = -1;
+               else
+                       ret = ib_post_send_mad(query->mad_buf, NULL);
+               if (ret) {
+                       mad_send_wc.send_buf = query->mad_buf;
+                       mad_send_wc.status = IB_WC_WR_FLUSH_ERR;
+                       spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+                       send_handler(query->port->agent, &mad_send_wc);
+                       spin_lock_irqsave(&ib_nl_request_lock, flags);
+               }
+       }
+       spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+}
+
+static int ib_nl_handle_set_timeout(struct sk_buff *skb,
+                                   struct netlink_callback *cb)
+{
+       const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh;
+       int timeout, delta, abs_delta;
+       const struct nlattr *attr;
+       unsigned long flags;
+       struct ib_sa_query *query;
+       long delay = 0;
+       struct nlattr *tb[LS_NLA_TYPE_MAX];
+       int ret;
+
+       if (!netlink_capable(skb, CAP_NET_ADMIN))
+               return -EPERM;
+
+       ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh),
+                       nlmsg_len(nlh), ib_nl_policy);
+       attr = (const struct nlattr *)tb[LS_NLA_TYPE_TIMEOUT];
+       if (ret || !attr)
+               goto settimeout_out;
+
+       timeout = *(int *) nla_data(attr);
+       if (timeout < IB_SA_LOCAL_SVC_TIMEOUT_MIN)
+               timeout = IB_SA_LOCAL_SVC_TIMEOUT_MIN;
+       if (timeout > IB_SA_LOCAL_SVC_TIMEOUT_MAX)
+               timeout = IB_SA_LOCAL_SVC_TIMEOUT_MAX;
+
+       delta = timeout - sa_local_svc_timeout_ms;
+       if (delta < 0)
+               abs_delta = -delta;
+       else
+               abs_delta = delta;
+
+       if (delta != 0) {
+               spin_lock_irqsave(&ib_nl_request_lock, flags);
+               sa_local_svc_timeout_ms = timeout;
+               list_for_each_entry(query, &ib_nl_request_list, list) {
+                       if (delta < 0 && abs_delta > query->timeout)
+                               query->timeout = 0;
+                       else
+                               query->timeout += delta;
+
+                       /* Get the new delay from the first entry */
+                       if (!delay) {
+                               delay = query->timeout - jiffies;
+                               if (delay <= 0)
+                                       delay = 1;
+                       }
+               }
+               if (delay)
+                       mod_delayed_work(ib_nl_wq, &ib_nl_timed_work,
+                                        (unsigned long)delay);
+               spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+       }
+
+settimeout_out:
+       return skb->len;
+}
+
+static inline int ib_nl_is_good_resolve_resp(const struct nlmsghdr *nlh)
+{
+       struct nlattr *tb[LS_NLA_TYPE_MAX];
+       int ret;
+
+       if (nlh->nlmsg_flags & RDMA_NL_LS_F_ERR)
+               return 0;
+
+       ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh),
+                       nlmsg_len(nlh), ib_nl_policy);
+       if (ret)
+               return 0;
+
+       return 1;
+}
+
+static int ib_nl_handle_resolve_resp(struct sk_buff *skb,
+                                    struct netlink_callback *cb)
+{
+       const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh;
+       unsigned long flags;
+       struct ib_sa_query *query;
+       struct ib_mad_send_buf *send_buf;
+       struct ib_mad_send_wc mad_send_wc;
+       int found = 0;
+       int ret;
+
+       if (!netlink_capable(skb, CAP_NET_ADMIN))
+               return -EPERM;
+
+       spin_lock_irqsave(&ib_nl_request_lock, flags);
+       list_for_each_entry(query, &ib_nl_request_list, list) {
+               /*
+                * If the query is cancelled, let the timeout routine
+                * take care of it.
+                */
+               if (nlh->nlmsg_seq == query->seq) {
+                       found = !ib_sa_query_cancelled(query);
+                       if (found)
+                               list_del(&query->list);
+                       break;
+               }
+       }
+
+       if (!found) {
+               spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+               goto resp_out;
+       }
+
+       send_buf = query->mad_buf;
+
+       if (!ib_nl_is_good_resolve_resp(nlh)) {
+               /* if the result is a failure, send out the packet via IB */
+               ib_sa_disable_local_svc(query);
+               ret = ib_post_send_mad(query->mad_buf, NULL);
+               spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+               if (ret) {
+                       mad_send_wc.send_buf = send_buf;
+                       mad_send_wc.status = IB_WC_GENERAL_ERR;
+                       send_handler(query->port->agent, &mad_send_wc);
+               }
+       } else {
+               spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+               ib_nl_process_good_resolve_rsp(query, nlh);
+       }
+
+resp_out:
+       return skb->len;
+}
+
+static struct ibnl_client_cbs ib_sa_cb_table[] = {
+       [RDMA_NL_LS_OP_RESOLVE] = {
+               .dump = ib_nl_handle_resolve_resp,
+               .module = THIS_MODULE },
+       [RDMA_NL_LS_OP_SET_TIMEOUT] = {
+               .dump = ib_nl_handle_set_timeout,
+               .module = THIS_MODULE },
+};
+
 static void free_sm_ah(struct kref *kref)
 {
        struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref);
@@ -502,7 +960,13 @@ void ib_sa_cancel_query(int id, struct ib_sa_query *query)
        mad_buf = query->mad_buf;
        spin_unlock_irqrestore(&idr_lock, flags);
 
-       ib_cancel_mad(agent, mad_buf);
+       /*
+        * If the query is still on the netlink request list, schedule
+        * it to be cancelled by the timeout routine. Otherwise, it has been
+        * sent to the MAD layer and has to be cancelled from there.
+        */
+       if (!ib_nl_cancel_request(query))
+               ib_cancel_mad(agent, mad_buf);
 }
 EXPORT_SYMBOL(ib_sa_cancel_query);
 
@@ -639,6 +1103,14 @@ static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask)
        query->mad_buf->context[0] = query;
        query->id = id;
 
+       if (query->flags & IB_SA_ENABLE_LOCAL_SERVICE) {
+               if (!ibnl_chk_listeners(RDMA_NL_GROUP_LS)) {
+                       if (!ib_nl_make_request(query))
+                               return id;
+               }
+               ib_sa_disable_local_svc(query);
+       }
+
        ret = ib_post_send_mad(query->mad_buf, NULL);
        if (ret) {
                spin_lock_irqsave(&idr_lock, flags);
@@ -740,7 +1212,7 @@ int ib_sa_path_rec_get(struct ib_sa_client *client,
        port  = &sa_dev->port[port_num - sa_dev->start_port];
        agent = port->agent;
 
-       query = kmalloc(sizeof *query, gfp_mask);
+       query = kzalloc(sizeof(*query), gfp_mask);
        if (!query)
                return -ENOMEM;
 
@@ -767,6 +1239,9 @@ int ib_sa_path_rec_get(struct ib_sa_client *client,
 
        *sa_query = &query->sa_query;
 
+       query->sa_query.flags |= IB_SA_ENABLE_LOCAL_SERVICE;
+       query->sa_query.mad_buf->context[1] = rec;
+
        ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
        if (ret < 0)
                goto err2;
@@ -862,7 +1337,7 @@ int ib_sa_service_rec_query(struct ib_sa_client *client,
            method != IB_SA_METHOD_DELETE)
                return -EINVAL;
 
-       query = kmalloc(sizeof *query, gfp_mask);
+       query = kzalloc(sizeof(*query), gfp_mask);
        if (!query)
                return -ENOMEM;
 
@@ -954,7 +1429,7 @@ int ib_sa_mcmember_rec_query(struct ib_sa_client *client,
        port  = &sa_dev->port[port_num - sa_dev->start_port];
        agent = port->agent;
 
-       query = kmalloc(sizeof *query, gfp_mask);
+       query = kzalloc(sizeof(*query), gfp_mask);
        if (!query)
                return -ENOMEM;
 
@@ -1051,7 +1526,7 @@ int ib_sa_guid_info_rec_query(struct ib_sa_client *client,
        port  = &sa_dev->port[port_num - sa_dev->start_port];
        agent = port->agent;
 
-       query = kmalloc(sizeof *query, gfp_mask);
+       query = kzalloc(sizeof(*query), gfp_mask);
        if (!query)
                return -ENOMEM;
 
@@ -1221,9 +1696,9 @@ free:
        return;
 }
 
-static void ib_sa_remove_one(struct ib_device *device)
+static void ib_sa_remove_one(struct ib_device *device, void *client_data)
 {
-       struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+       struct ib_sa_device *sa_dev = client_data;
        int i;
 
        if (!sa_dev)
@@ -1251,6 +1726,8 @@ static int __init ib_sa_init(void)
 
        get_random_bytes(&tid, sizeof tid);
 
+       atomic_set(&ib_nl_sa_request_seq, 0);
+
        ret = ib_register_client(&sa_client);
        if (ret) {
                printk(KERN_ERR "Couldn't register ib_sa client\n");
@@ -1263,7 +1740,25 @@ static int __init ib_sa_init(void)
                goto err2;
        }
 
+       ib_nl_wq = create_singlethread_workqueue("ib_nl_sa_wq");
+       if (!ib_nl_wq) {
+               ret = -ENOMEM;
+               goto err3;
+       }
+
+       if (ibnl_add_client(RDMA_NL_LS, RDMA_NL_LS_NUM_OPS,
+                           ib_sa_cb_table)) {
+               pr_err("Failed to add netlink callback\n");
+               ret = -EINVAL;
+               goto err4;
+       }
+       INIT_DELAYED_WORK(&ib_nl_timed_work, ib_nl_request_timeout);
+
        return 0;
+err4:
+       destroy_workqueue(ib_nl_wq);
+err3:
+       mcast_cleanup();
 err2:
        ib_unregister_client(&sa_client);
 err1:
@@ -1272,6 +1767,10 @@ err1:
 
 static void __exit ib_sa_cleanup(void)
 {
+       ibnl_remove_client(RDMA_NL_LS);
+       cancel_delayed_work(&ib_nl_timed_work);
+       flush_workqueue(ib_nl_wq);
+       destroy_workqueue(ib_nl_wq);
        mcast_cleanup();
        ib_unregister_client(&sa_client);
        idr_destroy(&query_idr);
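
The sa.c hunks above add a netlink path for SA queries: each query is parked on ib_nl_request_list with an absolute timeout, the single delayed work item (ib_nl_timed_work) is armed only when the list head requires it, cancellation marks the entry and lets the timeout scan complete it, and expired entries fall back to ib_post_send_mad(). The snippet below is a rough, userspace-style model of that request-list bookkeeping, offered only as a reading aid; the type and function names in it (nl_request, arm_timer, submit, cancel, timeout_scan) are invented for the sketch and are not kernel symbols.

/* Reading-aid model of the pending-request bookkeeping; not kernel code. */
#include <stdbool.h>
#include <stdio.h>

struct nl_request {
	unsigned long timeout;          /* absolute deadline, "jiffies" */
	bool cancelled;
	struct nl_request *next;
};

static struct nl_request *pending;      /* models ib_nl_request_list */

static void arm_timer(unsigned long delay)
{
	/* stands in for queue_delayed_work()/mod_delayed_work() */
	printf("timer armed, fires in %lu\n", delay);
}

static void submit(struct nl_request *req, unsigned long now, unsigned long delay)
{
	struct nl_request **tail = &pending;

	req->timeout = now + delay;
	req->next = NULL;
	while (*tail)
		tail = &(*tail)->next;
	*tail = req;
	if (pending == req)             /* only request: start the timer */
		arm_timer(delay);
}

static void cancel(struct nl_request *req, unsigned long now)
{
	/* mark it and let the timeout scan do the completion */
	req->cancelled = true;
	req->timeout = now;
	arm_timer(1);
}

static void timeout_scan(unsigned long now)
{
	while (pending) {
		struct nl_request *req = pending;

		if (req->timeout > now) {       /* head not due yet: re-arm */
			arm_timer(req->timeout - now);
			break;
		}
		pending = req->next;
		/* the kernel falls back to ib_post_send_mad() here unless
		 * the entry was cancelled */
		printf("expired, cancelled=%d\n", req->cancelled);
	}
}

int main(void)
{
	struct nl_request a = {0}, b = {0};

	submit(&a, 0, 100);
	submit(&b, 0, 200);
	cancel(&b, 50);
	timeout_scan(150);
	return 0;
}
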
index 0b84a9cdfe5b90636d3633dbb42d84ef84c4e98e..34cdd74b0a17ed06228bf78f134c2001c976125a 100644 (file)
@@ -457,29 +457,6 @@ static struct kobj_type port_type = {
        .default_attrs = port_default_attrs
 };
 
-static void ib_device_release(struct device *device)
-{
-       struct ib_device *dev = container_of(device, struct ib_device, dev);
-
-       kfree(dev->port_immutable);
-       kfree(dev);
-}
-
-static int ib_device_uevent(struct device *device,
-                           struct kobj_uevent_env *env)
-{
-       struct ib_device *dev = container_of(device, struct ib_device, dev);
-
-       if (add_uevent_var(env, "NAME=%s", dev->name))
-               return -ENOMEM;
-
-       /*
-        * It would be nice to pass the node GUID with the event...
-        */
-
-       return 0;
-}
-
 static struct attribute **
 alloc_group_attrs(ssize_t (*show)(struct ib_port *,
                                  struct port_attribute *, char *buf),
@@ -702,12 +679,6 @@ static struct device_attribute *ib_class_attributes[] = {
        &dev_attr_node_desc
 };
 
-static struct class ib_class = {
-       .name    = "infiniband",
-       .dev_release = ib_device_release,
-       .dev_uevent = ib_device_uevent,
-};
-
 /* Show a given an attribute in the statistics group */
 static ssize_t show_protocol_stat(const struct device *device,
                            struct device_attribute *attr, char *buf,
@@ -846,14 +817,12 @@ int ib_device_register_sysfs(struct ib_device *device,
        int ret;
        int i;
 
-       class_dev->class      = &ib_class;
-       class_dev->parent     = device->dma_device;
-       dev_set_name(class_dev, "%s", device->name);
-       dev_set_drvdata(class_dev, device);
-
-       INIT_LIST_HEAD(&device->port_list);
+       device->dev.parent = device->dma_device;
+       ret = dev_set_name(class_dev, "%s", device->name);
+       if (ret)
+               return ret;
 
-       ret = device_register(class_dev);
+       ret = device_add(class_dev);
        if (ret)
                goto err;
 
@@ -916,13 +885,3 @@ void ib_device_unregister_sysfs(struct ib_device *device)
 
        device_unregister(&device->dev);
 }
-
-int ib_sysfs_setup(void)
-{
-       return class_register(&ib_class);
-}
-
-void ib_sysfs_cleanup(void)
-{
-       class_unregister(&ib_class);
-}
index 00948107364466cafe28e95557be1fcf829e988c..6b4e8a008bc0418f3ad54728b73e03e6e8d6e1fa 100644 (file)
@@ -109,7 +109,7 @@ enum {
 #define IB_UCM_BASE_DEV MKDEV(IB_UCM_MAJOR, IB_UCM_BASE_MINOR)
 
 static void ib_ucm_add_one(struct ib_device *device);
-static void ib_ucm_remove_one(struct ib_device *device);
+static void ib_ucm_remove_one(struct ib_device *device, void *client_data);
 
 static struct ib_client ucm_client = {
        .name   = "ucm",
@@ -658,8 +658,7 @@ static ssize_t ib_ucm_listen(struct ib_ucm_file *file,
        if (result)
                goto out;
 
-       result = ib_cm_listen(ctx->cm_id, cmd.service_id, cmd.service_mask,
-                             NULL);
+       result = ib_cm_listen(ctx->cm_id, cmd.service_id, cmd.service_mask);
 out:
        ib_ucm_ctx_put(ctx);
        return result;
@@ -1310,9 +1309,9 @@ err:
        return;
 }
 
-static void ib_ucm_remove_one(struct ib_device *device)
+static void ib_ucm_remove_one(struct ib_device *device, void *client_data)
 {
-       struct ib_ucm_device *ucm_dev = ib_get_client_data(device, &ucm_client);
+       struct ib_ucm_device *ucm_dev = client_data;
 
        if (!ucm_dev)
                return;
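
As in sa.c above, the ucm.c hunks switch ib_ucm_remove_one() to the new client-removal convention: the per-device private data is handed to the callback as client_data, so the client no longer calls ib_get_client_data() on a device that is going away. The toy program below sketches that shape under invented names (struct client, struct device, my_remove); it is a reading aid, not the in-kernel ib_client API.

/* Toy illustration of the client_data-passing removal callback; invented names. */
#include <stdio.h>
#include <stdlib.h>

struct device {
	const char *name;
	void *client_data;              /* stored when the client was added */
};

struct client {
	const char *name;
	void (*remove)(struct device *dev, void *client_data);
};

static void my_remove(struct device *dev, void *client_data)
{
	/* the private data arrives as an argument; no registry lookup needed */
	free(client_data);
	printf("client removed from %s\n", dev->name);
}

int main(void)
{
	struct client my_client = { "my_client", my_remove };
	struct device dev = { "dev0", malloc(16) };

	/* the registry would invoke this for each client on device removal */
	my_client.remove(&dev, dev.client_data);
	return 0;
}
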
index 29b21213ea7586129357bd803c7de89096227bf1..a53fc9b01c69957cb6d45433c4a34a0a69a5c367 100644 (file)
@@ -74,6 +74,7 @@ struct ucma_file {
        struct list_head        ctx_list;
        struct list_head        event_list;
        wait_queue_head_t       poll_wait;
+       struct workqueue_struct *close_wq;
 };
 
 struct ucma_context {
@@ -89,6 +90,13 @@ struct ucma_context {
 
        struct list_head        list;
        struct list_head        mc_list;
+       /* marks that the device is in the process of destroying its
+        * internal HW resources; protected by the global mut
+        */
+       int                     closing;
+       /* sync between removal event and id destroy, protected by file mut */
+       int                     destroying;
+       struct work_struct      close_work;
 };
 
 struct ucma_multicast {
@@ -107,6 +115,7 @@ struct ucma_event {
        struct list_head        list;
        struct rdma_cm_id       *cm_id;
        struct rdma_ucm_event_resp resp;
+       struct work_struct      close_work;
 };
 
 static DEFINE_MUTEX(mut);
@@ -132,8 +141,12 @@ static struct ucma_context *ucma_get_ctx(struct ucma_file *file, int id)
 
        mutex_lock(&mut);
        ctx = _ucma_find_context(id, file);
-       if (!IS_ERR(ctx))
-               atomic_inc(&ctx->ref);
+       if (!IS_ERR(ctx)) {
+               if (ctx->closing)
+                       ctx = ERR_PTR(-EIO);
+               else
+                       atomic_inc(&ctx->ref);
+       }
        mutex_unlock(&mut);
        return ctx;
 }
@@ -144,6 +157,28 @@ static void ucma_put_ctx(struct ucma_context *ctx)
                complete(&ctx->comp);
 }
 
+static void ucma_close_event_id(struct work_struct *work)
+{
+       struct ucma_event *uevent_close =  container_of(work, struct ucma_event, close_work);
+
+       rdma_destroy_id(uevent_close->cm_id);
+       kfree(uevent_close);
+}
+
+static void ucma_close_id(struct work_struct *work)
+{
+       struct ucma_context *ctx =  container_of(work, struct ucma_context, close_work);
+
+       /* Once all in-flight tasks are finished, we close all underlying
+        * resources. The context stays alive until its creator explicitly
+        * destroys it.
+        */
+       ucma_put_ctx(ctx);
+       wait_for_completion(&ctx->comp);
+       /* No new events will be generated after destroying the id. */
+       rdma_destroy_id(ctx->cm_id);
+}
+
 static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file)
 {
        struct ucma_context *ctx;
@@ -152,6 +187,7 @@ static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file)
        if (!ctx)
                return NULL;
 
+       INIT_WORK(&ctx->close_work, ucma_close_id);
        atomic_set(&ctx->ref, 1);
        init_completion(&ctx->comp);
        INIT_LIST_HEAD(&ctx->mc_list);
@@ -242,6 +278,44 @@ static void ucma_set_event_context(struct ucma_context *ctx,
        }
 }
 
+/* Called with file->mut locked for the relevant context. */
+static void ucma_removal_event_handler(struct rdma_cm_id *cm_id)
+{
+       struct ucma_context *ctx = cm_id->context;
+       struct ucma_event *con_req_eve;
+       int event_found = 0;
+
+       if (ctx->destroying)
+               return;
+
+       /* Only if the context owns this cm_id can it be queued to be
+        * closed. Otherwise the cm_id is an in-flight one that sits on
+        * this context's event list, pending to be detached and
+        * reattached to its new context as part of ucma_get_event, and
+        * is handled separately below.
+        */
+       if (ctx->cm_id == cm_id) {
+               mutex_lock(&mut);
+               ctx->closing = 1;
+               mutex_unlock(&mut);
+               queue_work(ctx->file->close_wq, &ctx->close_work);
+               return;
+       }
+
+       list_for_each_entry(con_req_eve, &ctx->file->event_list, list) {
+               if (con_req_eve->cm_id == cm_id &&
+                   con_req_eve->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) {
+                       list_del(&con_req_eve->list);
+                       INIT_WORK(&con_req_eve->close_work, ucma_close_event_id);
+                       queue_work(ctx->file->close_wq, &con_req_eve->close_work);
+                       event_found = 1;
+                       break;
+               }
+       }
+       if (!event_found)
+               printk(KERN_ERR "ucma_removal_event_handler: warning: connect request event wasn't found\n");
+}
+
 static int ucma_event_handler(struct rdma_cm_id *cm_id,
                              struct rdma_cm_event *event)
 {
@@ -276,14 +350,21 @@ static int ucma_event_handler(struct rdma_cm_id *cm_id,
                 * We ignore events for new connections until userspace has set
                 * their context.  This can only happen if an error occurs on a
                 * new connection before the user accepts it.  This is okay,
-                * since the accept will just fail later.
+                * since the accept will just fail later. However, we do need
+                * to release the underlying HW resources in case of a device
+                * removal event.
                 */
+               if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL)
+                       ucma_removal_event_handler(cm_id);
+
                kfree(uevent);
                goto out;
        }
 
        list_add_tail(&uevent->list, &ctx->file->event_list);
        wake_up_interruptible(&ctx->file->poll_wait);
+       if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL)
+               ucma_removal_event_handler(cm_id);
 out:
        mutex_unlock(&ctx->file->mut);
        return ret;
@@ -442,9 +523,15 @@ static void ucma_cleanup_mc_events(struct ucma_multicast *mc)
 }
 
 /*
- * We cannot hold file->mut when calling rdma_destroy_id() or we can
- * deadlock.  We also acquire file->mut in ucma_event_handler(), and
- * rdma_destroy_id() will wait until all callbacks have completed.
+ * ucma_free_ctx is called after the underlying rdma CM-ID is destroyed. At
+ * this point, no new events will be reported from the hardware. However, we
+ * still need to cleanup the UCMA context for this ID. Specifically, there
+ * might be events that have not yet been consumed by the user space software.
+ * These might include pending connect requests which we have not completed
+ * processing.  We cannot call rdma_destroy_id while holding the lock of the
+ * context (file->mut), as it might cause a deadlock. We therefore extract all
+ * relevant events from the context pending events list while holding the
+ * mutex. After that we release them as needed.
  */
 static int ucma_free_ctx(struct ucma_context *ctx)
 {
@@ -452,8 +539,6 @@ static int ucma_free_ctx(struct ucma_context *ctx)
        struct ucma_event *uevent, *tmp;
        LIST_HEAD(list);
 
-       /* No new events will be generated after destroying the id. */
-       rdma_destroy_id(ctx->cm_id);
 
        ucma_cleanup_multicast(ctx);
 
@@ -501,10 +586,24 @@ static ssize_t ucma_destroy_id(struct ucma_file *file, const char __user *inbuf,
        if (IS_ERR(ctx))
                return PTR_ERR(ctx);
 
-       ucma_put_ctx(ctx);
-       wait_for_completion(&ctx->comp);
-       resp.events_reported = ucma_free_ctx(ctx);
+       mutex_lock(&ctx->file->mut);
+       ctx->destroying = 1;
+       mutex_unlock(&ctx->file->mut);
 
+       flush_workqueue(ctx->file->close_wq);
+       /* At this point it's guaranteed that there is no inflight
+        * closing task */
+       mutex_lock(&mut);
+       if (!ctx->closing) {
+               mutex_unlock(&mut);
+               ucma_put_ctx(ctx);
+               wait_for_completion(&ctx->comp);
+               rdma_destroy_id(ctx->cm_id);
+       } else {
+               mutex_unlock(&mut);
+       }
+
+       resp.events_reported = ucma_free_ctx(ctx);
        if (copy_to_user((void __user *)(unsigned long)cmd.response,
                         &resp, sizeof(resp)))
                ret = -EFAULT;
@@ -1321,10 +1420,10 @@ static ssize_t ucma_leave_multicast(struct ucma_file *file,
                mc = ERR_PTR(-ENOENT);
        else if (mc->ctx->file != file)
                mc = ERR_PTR(-EINVAL);
-       else {
+       else if (!atomic_inc_not_zero(&mc->ctx->ref))
+               mc = ERR_PTR(-ENXIO);
+       else
                idr_remove(&multicast_idr, mc->id);
-               atomic_inc(&mc->ctx->ref);
-       }
        mutex_unlock(&mut);
 
        if (IS_ERR(mc)) {
@@ -1529,6 +1628,7 @@ static int ucma_open(struct inode *inode, struct file *filp)
        INIT_LIST_HEAD(&file->ctx_list);
        init_waitqueue_head(&file->poll_wait);
        mutex_init(&file->mut);
+       file->close_wq = create_singlethread_workqueue("ucma_close_id");
 
        filp->private_data = file;
        file->filp = filp;
@@ -1543,16 +1643,34 @@ static int ucma_close(struct inode *inode, struct file *filp)
 
        mutex_lock(&file->mut);
        list_for_each_entry_safe(ctx, tmp, &file->ctx_list, list) {
+               ctx->destroying = 1;
                mutex_unlock(&file->mut);
 
                mutex_lock(&mut);
                idr_remove(&ctx_idr, ctx->id);
                mutex_unlock(&mut);
 
+               flush_workqueue(file->close_wq);
+               /* At this point, once the ctx has been marked as destroying
+                * and the workqueue has been flushed, we are safe from any
+                * in-flight handlers that might queue another closing task.
+                */
+               mutex_lock(&mut);
+               if (!ctx->closing) {
+                       mutex_unlock(&mut);
+                       /* rdma_destroy_id ensures that no event handlers are
+                        * inflight for that id before releasing it.
+                        */
+                       rdma_destroy_id(ctx->cm_id);
+               } else {
+                       mutex_unlock(&mut);
+               }
+
                ucma_free_ctx(ctx);
                mutex_lock(&file->mut);
        }
        mutex_unlock(&file->mut);
+       destroy_workqueue(file->close_wq);
        kfree(file);
        return 0;
 }
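
The ucma.c changes above defer CM-ID destruction to a per-file close workqueue when a device-removal event races with teardown: the removal handler sets ctx->closing and queues close_work, while ucma_destroy_id() and ucma_close() mark the context as destroying, flush the workqueue, and call rdma_destroy_id() themselves only if no close was queued. Below is a compressed, lock-free model of just that hand-off decision, with invented helpers; it is a sketch for orientation, not the ucma code.

/* Compressed model of the destroy/close hand-off; invented helpers, no locking. */
#include <stdbool.h>
#include <stdio.h>

struct ctx {
	bool closing;           /* set by the device-removal handler */
	bool destroying;        /* set by the user-initiated destroy path */
	bool id_destroyed;
};

static void close_work(struct ctx *ctx)
{
	/* models the work item queued on the per-file close workqueue */
	ctx->id_destroyed = true;
}

static void destroy_id(struct ctx *ctx, bool removal_raced)
{
	ctx->destroying = true;

	/* models flush_workqueue(): any queued close has finished by now */
	if (removal_raced) {
		ctx->closing = true;
		close_work(ctx);
	}

	/* only destroy here if no removal handler queued a close already */
	if (!ctx->closing)
		ctx->id_destroyed = true;

	printf("destroyed=%d closing=%d\n", ctx->id_destroyed, ctx->closing);
}

int main(void)
{
	struct ctx normal = {0}, raced = {0};

	destroy_id(&normal, false);     /* ordinary teardown */
	destroy_id(&raced, true);       /* raced with device removal */
	return 0;
}
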
index 35567fffaa4e330cf7e1b4963f86daf2132b5574..57f281f8d686224b7a40488330b78ac38943270c 100644 (file)
@@ -133,7 +133,7 @@ static DEFINE_SPINLOCK(port_lock);
 static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS);
 
 static void ib_umad_add_one(struct ib_device *device);
-static void ib_umad_remove_one(struct ib_device *device);
+static void ib_umad_remove_one(struct ib_device *device, void *client_data);
 
 static void ib_umad_release_dev(struct kobject *kobj)
 {
@@ -1322,9 +1322,9 @@ free:
        kobject_put(&umad_dev->kobj);
 }
 
-static void ib_umad_remove_one(struct ib_device *device)
+static void ib_umad_remove_one(struct ib_device *device, void *client_data)
 {
-       struct ib_umad_device *umad_dev = ib_get_client_data(device, &umad_client);
+       struct ib_umad_device *umad_dev = client_data;
        int i;
 
        if (!umad_dev)
index ba365b6d1e8d561d891f358da5f3f05e54800740..3863d33c243d80cde7df7c100205d4578aa58d7a 100644 (file)
  */
 
 struct ib_uverbs_device {
-       struct kref                             ref;
+       atomic_t                                refcount;
        int                                     num_comp_vectors;
        struct completion                       comp;
        struct device                          *dev;
-       struct ib_device                       *ib_dev;
+       struct ib_device        __rcu          *ib_dev;
        int                                     devnum;
        struct cdev                             cdev;
        struct rb_root                          xrcd_tree;
        struct mutex                            xrcd_tree_mutex;
+       struct kobject                          kobj;
+       struct srcu_struct                      disassociate_srcu;
+       struct mutex                            lists_mutex; /* protect lists */
+       struct list_head                        uverbs_file_list;
+       struct list_head                        uverbs_events_file_list;
 };
 
 struct ib_uverbs_event_file {
@@ -105,6 +110,7 @@ struct ib_uverbs_event_file {
        wait_queue_head_t                       poll_wait;
        struct fasync_struct                   *async_queue;
        struct list_head                        event_list;
+       struct list_head                        list;
 };
 
 struct ib_uverbs_file {
@@ -114,6 +120,8 @@ struct ib_uverbs_file {
        struct ib_ucontext                     *ucontext;
        struct ib_event_handler                 event_handler;
        struct ib_uverbs_event_file            *async_file;
+       struct list_head                        list;
+       int                                     is_closed;
 };
 
 struct ib_uverbs_event {
@@ -177,7 +185,9 @@ extern struct idr ib_uverbs_rule_idr;
 void idr_remove_uobj(struct idr *idp, struct ib_uobject *uobj);
 
 struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
+                                       struct ib_device *ib_dev,
                                        int is_async);
+void ib_uverbs_free_async_event_file(struct ib_uverbs_file *uverbs_file);
 struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd);
 
 void ib_uverbs_release_ucq(struct ib_uverbs_file *file,
@@ -212,6 +222,7 @@ struct ib_uverbs_flow_spec {
 
 #define IB_UVERBS_DECLARE_CMD(name)                                    \
        ssize_t ib_uverbs_##name(struct ib_uverbs_file *file,           \
+                                struct ib_device *ib_dev,              \
                                 const char __user *buf, int in_len,    \
                                 int out_len)
 
@@ -253,6 +264,7 @@ IB_UVERBS_DECLARE_CMD(close_xrcd);
 
 #define IB_UVERBS_DECLARE_EX_CMD(name)                         \
        int ib_uverbs_ex_##name(struct ib_uverbs_file *file,    \
+                               struct ib_device *ib_dev,               \
                                struct ib_udata *ucore,         \
                                struct ib_udata *uhw)
 
index bbb02ffe87df97ea2696e97e4baf9ae1718d0355..be4cb9f04be3349f433084b1f95cc817f83ad63e 100644 (file)
@@ -282,13 +282,13 @@ static void put_xrcd_read(struct ib_uobject *uobj)
 }
 
 ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
+                             struct ib_device *ib_dev,
                              const char __user *buf,
                              int in_len, int out_len)
 {
        struct ib_uverbs_get_context      cmd;
        struct ib_uverbs_get_context_resp resp;
        struct ib_udata                   udata;
-       struct ib_device                 *ibdev = file->device->ib_dev;
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
        struct ib_device_attr             dev_attr;
 #endif
@@ -313,13 +313,13 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
                   (unsigned long) cmd.response + sizeof resp,
                   in_len - sizeof cmd, out_len - sizeof resp);
 
-       ucontext = ibdev->alloc_ucontext(ibdev, &udata);
+       ucontext = ib_dev->alloc_ucontext(ib_dev, &udata);
        if (IS_ERR(ucontext)) {
                ret = PTR_ERR(ucontext);
                goto err;
        }
 
-       ucontext->device = ibdev;
+       ucontext->device = ib_dev;
        INIT_LIST_HEAD(&ucontext->pd_list);
        INIT_LIST_HEAD(&ucontext->mr_list);
        INIT_LIST_HEAD(&ucontext->mw_list);
@@ -340,7 +340,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
        ucontext->odp_mrs_count = 0;
        INIT_LIST_HEAD(&ucontext->no_private_counters);
 
-       ret = ib_query_device(ibdev, &dev_attr);
+       ret = ib_query_device(ib_dev, &dev_attr);
        if (ret)
                goto err_free;
        if (!(dev_attr.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING))
@@ -355,7 +355,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
                goto err_free;
        resp.async_fd = ret;
 
-       filp = ib_uverbs_alloc_event_file(file, 1);
+       filp = ib_uverbs_alloc_event_file(file, ib_dev, 1);
        if (IS_ERR(filp)) {
                ret = PTR_ERR(filp);
                goto err_fd;
@@ -367,16 +367,6 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
                goto err_file;
        }
 
-       file->async_file = filp->private_data;
-
-       INIT_IB_EVENT_HANDLER(&file->event_handler, file->device->ib_dev,
-                             ib_uverbs_event_handler);
-       ret = ib_register_event_handler(&file->event_handler);
-       if (ret)
-               goto err_file;
-
-       kref_get(&file->async_file->ref);
-       kref_get(&file->ref);
        file->ucontext = ucontext;
 
        fd_install(resp.async_fd, filp);
@@ -386,6 +376,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
        return in_len;
 
 err_file:
+       ib_uverbs_free_async_event_file(file);
        fput(filp);
 
 err_fd:
@@ -393,7 +384,7 @@ err_fd:
 
 err_free:
        put_pid(ucontext->tgid);
-       ibdev->dealloc_ucontext(ucontext);
+       ib_dev->dealloc_ucontext(ucontext);
 
 err:
        mutex_unlock(&file->mutex);
@@ -401,11 +392,12 @@ err:
 }
 
 static void copy_query_dev_fields(struct ib_uverbs_file *file,
+                                 struct ib_device *ib_dev,
                                  struct ib_uverbs_query_device_resp *resp,
                                  struct ib_device_attr *attr)
 {
        resp->fw_ver            = attr->fw_ver;
-       resp->node_guid         = file->device->ib_dev->node_guid;
+       resp->node_guid         = ib_dev->node_guid;
        resp->sys_image_guid    = attr->sys_image_guid;
        resp->max_mr_size       = attr->max_mr_size;
        resp->page_size_cap     = attr->page_size_cap;
@@ -443,10 +435,11 @@ static void copy_query_dev_fields(struct ib_uverbs_file *file,
        resp->max_srq_sge               = attr->max_srq_sge;
        resp->max_pkeys                 = attr->max_pkeys;
        resp->local_ca_ack_delay        = attr->local_ca_ack_delay;
-       resp->phys_port_cnt             = file->device->ib_dev->phys_port_cnt;
+       resp->phys_port_cnt             = ib_dev->phys_port_cnt;
 }
 
 ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
+                              struct ib_device *ib_dev,
                               const char __user *buf,
                               int in_len, int out_len)
 {
@@ -461,12 +454,12 @@ ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
        if (copy_from_user(&cmd, buf, sizeof cmd))
                return -EFAULT;
 
-       ret = ib_query_device(file->device->ib_dev, &attr);
+       ret = ib_query_device(ib_dev, &attr);
        if (ret)
                return ret;
 
        memset(&resp, 0, sizeof resp);
-       copy_query_dev_fields(file, &resp, &attr);
+       copy_query_dev_fields(file, ib_dev, &resp, &attr);
 
        if (copy_to_user((void __user *) (unsigned long) cmd.response,
                         &resp, sizeof resp))
@@ -476,6 +469,7 @@ ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
 }
 
 ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
+                            struct ib_device *ib_dev,
                             const char __user *buf,
                             int in_len, int out_len)
 {
@@ -490,7 +484,7 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
        if (copy_from_user(&cmd, buf, sizeof cmd))
                return -EFAULT;
 
-       ret = ib_query_port(file->device->ib_dev, cmd.port_num, &attr);
+       ret = ib_query_port(ib_dev, cmd.port_num, &attr);
        if (ret)
                return ret;
 
@@ -515,7 +509,7 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
        resp.active_width    = attr.active_width;
        resp.active_speed    = attr.active_speed;
        resp.phys_state      = attr.phys_state;
-       resp.link_layer      = rdma_port_get_link_layer(file->device->ib_dev,
+       resp.link_layer      = rdma_port_get_link_layer(ib_dev,
                                                        cmd.port_num);
 
        if (copy_to_user((void __user *) (unsigned long) cmd.response,
@@ -526,6 +520,7 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
 }
 
 ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file,
+                          struct ib_device *ib_dev,
                           const char __user *buf,
                           int in_len, int out_len)
 {
@@ -553,15 +548,15 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file,
        init_uobj(uobj, 0, file->ucontext, &pd_lock_class);
        down_write(&uobj->mutex);
 
-       pd = file->device->ib_dev->alloc_pd(file->device->ib_dev,
-                                           file->ucontext, &udata);
+       pd = ib_dev->alloc_pd(ib_dev, file->ucontext, &udata);
        if (IS_ERR(pd)) {
                ret = PTR_ERR(pd);
                goto err;
        }
 
-       pd->device  = file->device->ib_dev;
+       pd->device  = ib_dev;
        pd->uobject = uobj;
+       pd->local_mr = NULL;
        atomic_set(&pd->usecnt, 0);
 
        uobj->object = pd;
@@ -600,11 +595,13 @@ err:
 }
 
 ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
+                            struct ib_device *ib_dev,
                             const char __user *buf,
                             int in_len, int out_len)
 {
        struct ib_uverbs_dealloc_pd cmd;
        struct ib_uobject          *uobj;
+       struct ib_pd               *pd;
        int                         ret;
 
        if (copy_from_user(&cmd, buf, sizeof cmd))
@@ -613,15 +610,20 @@ ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
        uobj = idr_write_uobj(&ib_uverbs_pd_idr, cmd.pd_handle, file->ucontext);
        if (!uobj)
                return -EINVAL;
+       pd = uobj->object;
 
-       ret = ib_dealloc_pd(uobj->object);
-       if (!ret)
-               uobj->live = 0;
-
-       put_uobj_write(uobj);
+       if (atomic_read(&pd->usecnt)) {
+               ret = -EBUSY;
+               goto err_put;
+       }
 
+       ret = pd->device->dealloc_pd(uobj->object);
+       WARN_ONCE(ret, "Infiniband HW driver failed dealloc_pd");
        if (ret)
-               return ret;
+               goto err_put;
+
+       uobj->live = 0;
+       put_uobj_write(uobj);
 
        idr_remove_uobj(&ib_uverbs_pd_idr, uobj);
 
@@ -632,6 +634,10 @@ ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
        put_uobj(uobj);
 
        return in_len;
+
+err_put:
+       put_uobj_write(uobj);
+       return ret;
 }
 
 struct xrcd_table_entry {
@@ -720,6 +726,7 @@ static void xrcd_table_delete(struct ib_uverbs_device *dev,
 }
 
 ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file,
+                           struct ib_device *ib_dev,
                            const char __user *buf, int in_len,
                            int out_len)
 {
@@ -778,15 +785,14 @@ ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file,
        down_write(&obj->uobject.mutex);
 
        if (!xrcd) {
-               xrcd = file->device->ib_dev->alloc_xrcd(file->device->ib_dev,
-                                                       file->ucontext, &udata);
+               xrcd = ib_dev->alloc_xrcd(ib_dev, file->ucontext, &udata);
                if (IS_ERR(xrcd)) {
                        ret = PTR_ERR(xrcd);
                        goto err;
                }
 
                xrcd->inode   = inode;
-               xrcd->device  = file->device->ib_dev;
+               xrcd->device  = ib_dev;
                atomic_set(&xrcd->usecnt, 0);
                mutex_init(&xrcd->tgt_qp_mutex);
                INIT_LIST_HEAD(&xrcd->tgt_qp_list);
@@ -857,6 +863,7 @@ err_tree_mutex_unlock:
 }
 
 ssize_t ib_uverbs_close_xrcd(struct ib_uverbs_file *file,
+                            struct ib_device *ib_dev,
                             const char __user *buf, int in_len,
                             int out_len)
 {
@@ -934,6 +941,7 @@ void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev,
 }
 
 ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
+                        struct ib_device *ib_dev,
                         const char __user *buf, int in_len,
                         int out_len)
 {
@@ -1043,6 +1051,7 @@ err_free:
 }
 
 ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file,
+                          struct ib_device *ib_dev,
                           const char __user *buf, int in_len,
                           int out_len)
 {
@@ -1136,6 +1145,7 @@ put_uobjs:
 }
 
 ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
+                          struct ib_device *ib_dev,
                           const char __user *buf, int in_len,
                           int out_len)
 {
@@ -1174,8 +1184,9 @@ ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
 }
 
 ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file,
-                        const char __user *buf, int in_len,
-                        int out_len)
+                          struct ib_device *ib_dev,
+                          const char __user *buf, int in_len,
+                          int out_len)
 {
        struct ib_uverbs_alloc_mw      cmd;
        struct ib_uverbs_alloc_mw_resp resp;
@@ -1256,8 +1267,9 @@ err_free:
 }
 
 ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file,
-                          const char __user *buf, int in_len,
-                          int out_len)
+                            struct ib_device *ib_dev,
+                            const char __user *buf, int in_len,
+                            int out_len)
 {
        struct ib_uverbs_dealloc_mw cmd;
        struct ib_mw               *mw;
@@ -1294,6 +1306,7 @@ ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file,
 }
 
 ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file,
+                                     struct ib_device *ib_dev,
                                      const char __user *buf, int in_len,
                                      int out_len)
 {
@@ -1313,7 +1326,7 @@ ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file,
                return ret;
        resp.fd = ret;
 
-       filp = ib_uverbs_alloc_event_file(file, 0);
+       filp = ib_uverbs_alloc_event_file(file, ib_dev, 0);
        if (IS_ERR(filp)) {
                put_unused_fd(resp.fd);
                return PTR_ERR(filp);
@@ -1331,6 +1344,7 @@ ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file,
 }
 
 static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file,
+                                       struct ib_device *ib_dev,
                                       struct ib_udata *ucore,
                                       struct ib_udata *uhw,
                                       struct ib_uverbs_ex_create_cq *cmd,
@@ -1379,14 +1393,14 @@ static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file,
        if (cmd_sz > offsetof(typeof(*cmd), flags) + sizeof(cmd->flags))
                attr.flags = cmd->flags;
 
-       cq = file->device->ib_dev->create_cq(file->device->ib_dev, &attr,
+       cq = ib_dev->create_cq(ib_dev, &attr,
                                             file->ucontext, uhw);
        if (IS_ERR(cq)) {
                ret = PTR_ERR(cq);
                goto err_file;
        }
 
-       cq->device        = file->device->ib_dev;
+       cq->device        = ib_dev;
        cq->uobject       = &obj->uobject;
        cq->comp_handler  = ib_uverbs_comp_handler;
        cq->event_handler = ib_uverbs_cq_event_handler;
@@ -1447,6 +1461,7 @@ static int ib_uverbs_create_cq_cb(struct ib_uverbs_file *file,
 }
 
 ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
+                           struct ib_device *ib_dev,
                            const char __user *buf, int in_len,
                            int out_len)
 {
@@ -1475,7 +1490,7 @@ ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
        cmd_ex.comp_vector = cmd.comp_vector;
        cmd_ex.comp_channel = cmd.comp_channel;
 
-       obj = create_cq(file, &ucore, &uhw, &cmd_ex,
+       obj = create_cq(file, ib_dev, &ucore, &uhw, &cmd_ex,
                        offsetof(typeof(cmd_ex), comp_channel) +
                        sizeof(cmd.comp_channel), ib_uverbs_create_cq_cb,
                        NULL);
@@ -1498,6 +1513,7 @@ static int ib_uverbs_ex_create_cq_cb(struct ib_uverbs_file *file,
 }
 
 int ib_uverbs_ex_create_cq(struct ib_uverbs_file *file,
+                        struct ib_device *ib_dev,
                           struct ib_udata *ucore,
                           struct ib_udata *uhw)
 {
@@ -1523,7 +1539,7 @@ int ib_uverbs_ex_create_cq(struct ib_uverbs_file *file,
                             sizeof(resp.response_length)))
                return -ENOSPC;
 
-       obj = create_cq(file, ucore, uhw, &cmd,
+       obj = create_cq(file, ib_dev, ucore, uhw, &cmd,
                        min(ucore->inlen, sizeof(cmd)),
                        ib_uverbs_ex_create_cq_cb, NULL);
 
@@ -1534,6 +1550,7 @@ int ib_uverbs_ex_create_cq(struct ib_uverbs_file *file,
 }
 
 ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file,
+                           struct ib_device *ib_dev,
                            const char __user *buf, int in_len,
                            int out_len)
 {
@@ -1597,6 +1614,7 @@ static int copy_wc_to_user(void __user *dest, struct ib_wc *wc)
 }
 
 ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file,
+                         struct ib_device *ib_dev,
                          const char __user *buf, int in_len,
                          int out_len)
 {
@@ -1648,6 +1666,7 @@ out_put:
 }
 
 ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file,
+                               struct ib_device *ib_dev,
                                const char __user *buf, int in_len,
                                int out_len)
 {
@@ -1670,6 +1689,7 @@ ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file,
 }
 
 ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file,
+                            struct ib_device *ib_dev,
                             const char __user *buf, int in_len,
                             int out_len)
 {
@@ -1722,6 +1742,7 @@ ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file,
 }
 
 ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file,
+                           struct ib_device *ib_dev,
                            const char __user *buf, int in_len,
                            int out_len)
 {
@@ -1917,6 +1938,7 @@ err_put:
 }
 
 ssize_t ib_uverbs_open_qp(struct ib_uverbs_file *file,
+                         struct ib_device *ib_dev,
                          const char __user *buf, int in_len, int out_len)
 {
        struct ib_uverbs_open_qp        cmd;
@@ -2011,6 +2033,7 @@ err_put:
 }
 
 ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file,
+                          struct ib_device *ib_dev,
                           const char __user *buf, int in_len,
                           int out_len)
 {
@@ -2125,6 +2148,7 @@ static int modify_qp_mask(enum ib_qp_type qp_type, int mask)
 }
 
 ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file,
+                           struct ib_device *ib_dev,
                            const char __user *buf, int in_len,
                            int out_len)
 {
@@ -2221,6 +2245,7 @@ out:
 }
 
 ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file,
+                            struct ib_device *ib_dev,
                             const char __user *buf, int in_len,
                             int out_len)
 {
@@ -2279,6 +2304,7 @@ ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file,
 }
 
 ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
+                           struct ib_device *ib_dev,
                            const char __user *buf, int in_len,
                            int out_len)
 {
@@ -2346,6 +2372,12 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
                next->send_flags = user_wr->send_flags;
 
                if (is_ud) {
+                       if (next->opcode != IB_WR_SEND &&
+                           next->opcode != IB_WR_SEND_WITH_IMM) {
+                               ret = -EINVAL;
+                               goto out_put;
+                       }
+
                        next->wr.ud.ah = idr_read_ah(user_wr->wr.ud.ah,
                                                     file->ucontext);
                        if (!next->wr.ud.ah) {
@@ -2385,9 +2417,11 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
                                        user_wr->wr.atomic.compare_add;
                                next->wr.atomic.swap = user_wr->wr.atomic.swap;
                                next->wr.atomic.rkey = user_wr->wr.atomic.rkey;
+                       case IB_WR_SEND:
                                break;
                        default:
-                               break;
+                               ret = -EINVAL;
+                               goto out_put;
                        }
                }
 
@@ -2523,6 +2557,7 @@ err:
 }
 
 ssize_t ib_uverbs_post_recv(struct ib_uverbs_file *file,
+                           struct ib_device *ib_dev,
                            const char __user *buf, int in_len,
                            int out_len)
 {
@@ -2572,6 +2607,7 @@ out:
 }
 
 ssize_t ib_uverbs_post_srq_recv(struct ib_uverbs_file *file,
+                               struct ib_device *ib_dev,
                                const char __user *buf, int in_len,
                                int out_len)
 {
@@ -2621,6 +2657,7 @@ out:
 }
 
 ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file,
+                           struct ib_device *ib_dev,
                            const char __user *buf, int in_len,
                            int out_len)
 {
@@ -2713,6 +2750,7 @@ err:
 }
 
 ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file,
+                            struct ib_device *ib_dev,
                             const char __user *buf, int in_len, int out_len)
 {
        struct ib_uverbs_destroy_ah cmd;
@@ -2749,6 +2787,7 @@ ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file,
 }
 
 ssize_t ib_uverbs_attach_mcast(struct ib_uverbs_file *file,
+                              struct ib_device *ib_dev,
                               const char __user *buf, int in_len,
                               int out_len)
 {
@@ -2796,6 +2835,7 @@ out_put:
 }
 
 ssize_t ib_uverbs_detach_mcast(struct ib_uverbs_file *file,
+                              struct ib_device *ib_dev,
                               const char __user *buf, int in_len,
                               int out_len)
 {
@@ -2876,6 +2916,7 @@ static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec,
 }
 
 int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
+                            struct ib_device *ib_dev,
                             struct ib_udata *ucore,
                             struct ib_udata *uhw)
 {
@@ -3036,6 +3077,7 @@ err_free_attr:
 }
 
 int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file,
+                             struct ib_device *ib_dev,
                              struct ib_udata *ucore,
                              struct ib_udata *uhw)
 {
@@ -3078,6 +3120,7 @@ int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file,
 }
 
 static int __uverbs_create_xsrq(struct ib_uverbs_file *file,
+                               struct ib_device *ib_dev,
                                struct ib_uverbs_create_xsrq *cmd,
                                struct ib_udata *udata)
 {
@@ -3211,6 +3254,7 @@ err:
 }
 
 ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file,
+                            struct ib_device *ib_dev,
                             const char __user *buf, int in_len,
                             int out_len)
 {
@@ -3238,7 +3282,7 @@ ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file,
                   (unsigned long) cmd.response + sizeof resp,
                   in_len - sizeof cmd, out_len - sizeof resp);
 
-       ret = __uverbs_create_xsrq(file, &xcmd, &udata);
+       ret = __uverbs_create_xsrq(file, ib_dev, &xcmd, &udata);
        if (ret)
                return ret;
 
@@ -3246,6 +3290,7 @@ ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file,
 }
 
 ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file,
+                             struct ib_device *ib_dev,
                              const char __user *buf, int in_len, int out_len)
 {
        struct ib_uverbs_create_xsrq     cmd;
@@ -3263,7 +3308,7 @@ ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file,
                   (unsigned long) cmd.response + sizeof resp,
                   in_len - sizeof cmd, out_len - sizeof resp);
 
-       ret = __uverbs_create_xsrq(file, &cmd, &udata);
+       ret = __uverbs_create_xsrq(file, ib_dev, &cmd, &udata);
        if (ret)
                return ret;
 
@@ -3271,6 +3316,7 @@ ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file,
 }
 
 ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file,
+                            struct ib_device *ib_dev,
                             const char __user *buf, int in_len,
                             int out_len)
 {
@@ -3301,6 +3347,7 @@ ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file,
 }
 
 ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file,
+                           struct ib_device *ib_dev,
                            const char __user *buf,
                            int in_len, int out_len)
 {
@@ -3341,6 +3388,7 @@ ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file,
 }
 
 ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
+                             struct ib_device *ib_dev,
                              const char __user *buf, int in_len,
                              int out_len)
 {
@@ -3398,16 +3446,15 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
 }
 
 int ib_uverbs_ex_query_device(struct ib_uverbs_file *file,
+                             struct ib_device *ib_dev,
                              struct ib_udata *ucore,
                              struct ib_udata *uhw)
 {
        struct ib_uverbs_ex_query_device_resp resp;
        struct ib_uverbs_ex_query_device  cmd;
        struct ib_device_attr attr;
-       struct ib_device *device;
        int err;
 
-       device = file->device->ib_dev;
        if (ucore->inlen < sizeof(cmd))
                return -EINVAL;
 
@@ -3428,11 +3475,11 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file,
 
        memset(&attr, 0, sizeof(attr));
 
-       err = device->query_device(device, &attr, uhw);
+       err = ib_dev->query_device(ib_dev, &attr, uhw);
        if (err)
                return err;
 
-       copy_query_dev_fields(file, &resp.base, &attr);
+       copy_query_dev_fields(file, ib_dev, &resp.base, &attr);
        resp.comp_mask = 0;
 
        if (ucore->outlen < resp.response_length + sizeof(resp.odp_caps))
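
Throughout the uverbs_cmd.c hunks above, command handlers gain an explicit struct ib_device *ib_dev parameter in place of repeated file->device->ib_dev dereferences, so the dispatcher can resolve the device pointer once per command (under SRCU, per the __rcu/disassociate_srcu fields added earlier) and, presumably, refuse commands once the device has been disassociated, much as the event-file read path returns -EIO in that case. The fragment below is a minimal userspace sketch of that dispatch shape; the table, lookup, and error value are placeholders, not the kernel's.

/* Userspace sketch of the "handler takes ib_dev explicitly" dispatch shape. */
#include <stdio.h>

struct ib_device { const char *name; };

struct uverbs_file {
	struct ib_device *dev;          /* may become NULL after disassociation */
};

typedef int (*cmd_fn)(struct uverbs_file *file, struct ib_device *ib_dev);

static int cmd_query_device(struct uverbs_file *file, struct ib_device *ib_dev)
{
	(void)file;                     /* handler no longer chases file->dev */
	printf("query on %s\n", ib_dev->name);
	return 0;
}

static cmd_fn cmd_table[] = { cmd_query_device };

static int dispatch(struct uverbs_file *file, unsigned int cmd)
{
	/* resolved once per command; srcu_dereference() plays this role in-kernel */
	struct ib_device *ib_dev = file->dev;

	if (!ib_dev)
		return -5;              /* placeholder for -EIO */
	return cmd_table[cmd](file, ib_dev);
}

int main(void)
{
	struct ib_device dev = { "dev0" };
	struct uverbs_file file = { &dev };

	dispatch(&file, 0);
	file.dev = NULL;                /* simulate disassociation */
	printf("after removal: rc=%d\n", dispatch(&file, 0));
	return 0;
}
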
index f6eef2da7097980b7066c62f1746d7722064f8f3..c29a660c72fe3674cea593f9cdab483f133edd8f 100644 (file)
@@ -79,6 +79,7 @@ static DEFINE_SPINLOCK(map_lock);
 static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES);
 
 static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file,
+                                    struct ib_device *ib_dev,
                                     const char __user *buf, int in_len,
                                     int out_len) = {
        [IB_USER_VERBS_CMD_GET_CONTEXT]         = ib_uverbs_get_context,
@@ -119,6 +120,7 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file,
 };
 
 static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file,
+                                   struct ib_device *ib_dev,
                                    struct ib_udata *ucore,
                                    struct ib_udata *uhw) = {
        [IB_USER_VERBS_EX_CMD_CREATE_FLOW]      = ib_uverbs_ex_create_flow,
@@ -128,16 +130,21 @@ static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file,
 };
 
 static void ib_uverbs_add_one(struct ib_device *device);
-static void ib_uverbs_remove_one(struct ib_device *device);
+static void ib_uverbs_remove_one(struct ib_device *device, void *client_data);
 
-static void ib_uverbs_release_dev(struct kref *ref)
+static void ib_uverbs_release_dev(struct kobject *kobj)
 {
        struct ib_uverbs_device *dev =
-               container_of(ref, struct ib_uverbs_device, ref);
+               container_of(kobj, struct ib_uverbs_device, kobj);
 
-       complete(&dev->comp);
+       cleanup_srcu_struct(&dev->disassociate_srcu);
+       kfree(dev);
 }
 
+static struct kobj_type ib_uverbs_dev_ktype = {
+       .release = ib_uverbs_release_dev,
+};
+
 static void ib_uverbs_release_event_file(struct kref *ref)
 {
        struct ib_uverbs_event_file *file =
@@ -201,9 +208,6 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 {
        struct ib_uobject *uobj, *tmp;
 
-       if (!context)
-               return 0;
-
        context->closing = 1;
 
        list_for_each_entry_safe(uobj, tmp, &context->ah_list, list) {
@@ -303,13 +307,27 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
        return context->device->dealloc_ucontext(context);
 }
 
+static void ib_uverbs_comp_dev(struct ib_uverbs_device *dev)
+{
+       complete(&dev->comp);
+}
+
 static void ib_uverbs_release_file(struct kref *ref)
 {
        struct ib_uverbs_file *file =
                container_of(ref, struct ib_uverbs_file, ref);
+       struct ib_device *ib_dev;
+       int srcu_key;
+
+       srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
+       ib_dev = srcu_dereference(file->device->ib_dev,
+                                 &file->device->disassociate_srcu);
+       if (ib_dev && !ib_dev->disassociate_ucontext)
+               module_put(ib_dev->owner);
+       srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
 
-       module_put(file->device->ib_dev->owner);
-       kref_put(&file->device->ref, ib_uverbs_release_dev);
+       if (atomic_dec_and_test(&file->device->refcount))
+               ib_uverbs_comp_dev(file->device);
 
        kfree(file);
 }
@@ -331,9 +349,19 @@ static ssize_t ib_uverbs_event_read(struct file *filp, char __user *buf,
                        return -EAGAIN;
 
                if (wait_event_interruptible(file->poll_wait,
-                                            !list_empty(&file->event_list)))
+                                            (!list_empty(&file->event_list) ||
+                       /* The barriers built into wait_event_interruptible()
+                        * and wake_up() guarantee this will see the null set
+                        * without using RCU
+                        */
+                                            !file->uverbs_file->device->ib_dev)))
                        return -ERESTARTSYS;
 
+               /* If the device was disassociated and no event exists, set an error */
+               if (list_empty(&file->event_list) &&
+                   !file->uverbs_file->device->ib_dev)
+                       return -EIO;
+
                spin_lock_irq(&file->lock);
        }
 
@@ -396,8 +424,11 @@ static int ib_uverbs_event_close(struct inode *inode, struct file *filp)
 {
        struct ib_uverbs_event_file *file = filp->private_data;
        struct ib_uverbs_event *entry, *tmp;
+       int closed_already = 0;
 
+       mutex_lock(&file->uverbs_file->device->lists_mutex);
        spin_lock_irq(&file->lock);
+       closed_already = file->is_closed;
        file->is_closed = 1;
        list_for_each_entry_safe(entry, tmp, &file->event_list, list) {
                if (entry->counter)
@@ -405,11 +436,15 @@ static int ib_uverbs_event_close(struct inode *inode, struct file *filp)
                kfree(entry);
        }
        spin_unlock_irq(&file->lock);
-
-       if (file->is_async) {
-               ib_unregister_event_handler(&file->uverbs_file->event_handler);
-               kref_put(&file->uverbs_file->ref, ib_uverbs_release_file);
+       if (!closed_already) {
+               list_del(&file->list);
+               if (file->is_async)
+                       ib_unregister_event_handler(&file->uverbs_file->
+                               event_handler);
        }
+       mutex_unlock(&file->uverbs_file->device->lists_mutex);
+
+       kref_put(&file->uverbs_file->ref, ib_uverbs_release_file);
        kref_put(&file->ref, ib_uverbs_release_event_file);
 
        return 0;
@@ -541,13 +576,21 @@ void ib_uverbs_event_handler(struct ib_event_handler *handler,
                                NULL, NULL);
 }
 
+void ib_uverbs_free_async_event_file(struct ib_uverbs_file *file)
+{
+       kref_put(&file->async_file->ref, ib_uverbs_release_event_file);
+       file->async_file = NULL;
+}
+
 struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
+                                       struct ib_device        *ib_dev,
                                        int is_async)
 {
        struct ib_uverbs_event_file *ev_file;
        struct file *filp;
+       int ret;
 
-       ev_file = kmalloc(sizeof *ev_file, GFP_KERNEL);
+       ev_file = kzalloc(sizeof(*ev_file), GFP_KERNEL);
        if (!ev_file)
                return ERR_PTR(-ENOMEM);
 
@@ -556,16 +599,47 @@ struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
        INIT_LIST_HEAD(&ev_file->event_list);
        init_waitqueue_head(&ev_file->poll_wait);
        ev_file->uverbs_file = uverbs_file;
+       kref_get(&ev_file->uverbs_file->ref);
        ev_file->async_queue = NULL;
-       ev_file->is_async    = is_async;
        ev_file->is_closed   = 0;
 
        filp = anon_inode_getfile("[infinibandevent]", &uverbs_event_fops,
                                  ev_file, O_RDONLY);
        if (IS_ERR(filp))
-               kfree(ev_file);
+               goto err_put_refs;
+
+       mutex_lock(&uverbs_file->device->lists_mutex);
+       list_add_tail(&ev_file->list,
+                     &uverbs_file->device->uverbs_events_file_list);
+       mutex_unlock(&uverbs_file->device->lists_mutex);
+
+       if (is_async) {
+               WARN_ON(uverbs_file->async_file);
+               uverbs_file->async_file = ev_file;
+               kref_get(&uverbs_file->async_file->ref);
+               INIT_IB_EVENT_HANDLER(&uverbs_file->event_handler,
+                                     ib_dev,
+                                     ib_uverbs_event_handler);
+               ret = ib_register_event_handler(&uverbs_file->event_handler);
+               if (ret)
+                       goto err_put_file;
+
+               /* At this point the async file is fully set up */
+               ev_file->is_async = 1;
+       }
 
        return filp;
+
+err_put_file:
+       fput(filp);
+       kref_put(&uverbs_file->async_file->ref, ib_uverbs_release_event_file);
+       uverbs_file->async_file = NULL;
+       return ERR_PTR(ret);
+
+err_put_refs:
+       kref_put(&ev_file->uverbs_file->ref, ib_uverbs_release_file);
+       kref_put(&ev_file->ref, ib_uverbs_release_event_file);
+       return filp;
 }
 
 /*
@@ -601,8 +675,11 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
                             size_t count, loff_t *pos)
 {
        struct ib_uverbs_file *file = filp->private_data;
+       struct ib_device *ib_dev;
        struct ib_uverbs_cmd_hdr hdr;
        __u32 flags;
+       int srcu_key;
+       ssize_t ret;
 
        if (count < sizeof hdr)
                return -EINVAL;
@@ -610,6 +687,14 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
        if (copy_from_user(&hdr, buf, sizeof hdr))
                return -EFAULT;
 
+       srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
+       ib_dev = srcu_dereference(file->device->ib_dev,
+                                 &file->device->disassociate_srcu);
+       if (!ib_dev) {
+               ret = -EIO;
+               goto out;
+       }
+
        flags = (hdr.command &
                 IB_USER_VERBS_CMD_FLAGS_MASK) >> IB_USER_VERBS_CMD_FLAGS_SHIFT;
 
@@ -617,26 +702,36 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
                __u32 command;
 
                if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK |
-                                          IB_USER_VERBS_CMD_COMMAND_MASK))
-                       return -EINVAL;
+                                          IB_USER_VERBS_CMD_COMMAND_MASK)) {
+                       ret = -EINVAL;
+                       goto out;
+               }
 
                command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK;
 
                if (command >= ARRAY_SIZE(uverbs_cmd_table) ||
-                   !uverbs_cmd_table[command])
-                       return -EINVAL;
+                   !uverbs_cmd_table[command]) {
+                       ret = -EINVAL;
+                       goto out;
+               }
 
                if (!file->ucontext &&
-                   command != IB_USER_VERBS_CMD_GET_CONTEXT)
-                       return -EINVAL;
+                   command != IB_USER_VERBS_CMD_GET_CONTEXT) {
+                       ret = -EINVAL;
+                       goto out;
+               }
 
-               if (!(file->device->ib_dev->uverbs_cmd_mask & (1ull << command)))
-                       return -ENOSYS;
+               if (!(ib_dev->uverbs_cmd_mask & (1ull << command))) {
+                       ret = -ENOSYS;
+                       goto out;
+               }
 
-               if (hdr.in_words * 4 != count)
-                       return -EINVAL;
+               if (hdr.in_words * 4 != count) {
+                       ret = -EINVAL;
+                       goto out;
+               }
 
-               return uverbs_cmd_table[command](file,
+               ret = uverbs_cmd_table[command](file, ib_dev,
                                                 buf + sizeof(hdr),
                                                 hdr.in_words * 4,
                                                 hdr.out_words * 4);
@@ -647,51 +742,72 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
                struct ib_uverbs_ex_cmd_hdr ex_hdr;
                struct ib_udata ucore;
                struct ib_udata uhw;
-               int err;
                size_t written_count = count;
 
                if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK |
-                                          IB_USER_VERBS_CMD_COMMAND_MASK))
-                       return -EINVAL;
+                                          IB_USER_VERBS_CMD_COMMAND_MASK)) {
+                       ret = -EINVAL;
+                       goto out;
+               }
 
                command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK;
 
                if (command >= ARRAY_SIZE(uverbs_ex_cmd_table) ||
-                   !uverbs_ex_cmd_table[command])
-                       return -ENOSYS;
+                   !uverbs_ex_cmd_table[command]) {
+                       ret = -ENOSYS;
+                       goto out;
+               }
 
-               if (!file->ucontext)
-                       return -EINVAL;
+               if (!file->ucontext) {
+                       ret = -EINVAL;
+                       goto out;
+               }
 
-               if (!(file->device->ib_dev->uverbs_ex_cmd_mask & (1ull << command)))
-                       return -ENOSYS;
+               if (!(ib_dev->uverbs_ex_cmd_mask & (1ull << command))) {
+                       ret = -ENOSYS;
+                       goto out;
+               }
 
-               if (count < (sizeof(hdr) + sizeof(ex_hdr)))
-                       return -EINVAL;
+               if (count < (sizeof(hdr) + sizeof(ex_hdr))) {
+                       ret = -EINVAL;
+                       goto out;
+               }
 
-               if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr)))
-                       return -EFAULT;
+               if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr))) {
+                       ret = -EFAULT;
+                       goto out;
+               }
 
                count -= sizeof(hdr) + sizeof(ex_hdr);
                buf += sizeof(hdr) + sizeof(ex_hdr);
 
-               if ((hdr.in_words + ex_hdr.provider_in_words) * 8 != count)
-                       return -EINVAL;
+               if ((hdr.in_words + ex_hdr.provider_in_words) * 8 != count) {
+                       ret = -EINVAL;
+                       goto out;
+               }
 
-               if (ex_hdr.cmd_hdr_reserved)
-                       return -EINVAL;
+               if (ex_hdr.cmd_hdr_reserved) {
+                       ret = -EINVAL;
+                       goto out;
+               }
 
                if (ex_hdr.response) {
-                       if (!hdr.out_words && !ex_hdr.provider_out_words)
-                               return -EINVAL;
+                       if (!hdr.out_words && !ex_hdr.provider_out_words) {
+                               ret = -EINVAL;
+                               goto out;
+                       }
 
                        if (!access_ok(VERIFY_WRITE,
                                       (void __user *) (unsigned long) ex_hdr.response,
-                                      (hdr.out_words + ex_hdr.provider_out_words) * 8))
-                               return -EFAULT;
+                                      (hdr.out_words + ex_hdr.provider_out_words) * 8)) {
+                               ret = -EFAULT;
+                               goto out;
+                       }
                } else {
-                       if (hdr.out_words || ex_hdr.provider_out_words)
-                               return -EINVAL;
+                       if (hdr.out_words || ex_hdr.provider_out_words) {
+                               ret = -EINVAL;
+                               goto out;
+                       }
                }
 
                INIT_UDATA_BUF_OR_NULL(&ucore, buf, (unsigned long) ex_hdr.response,
@@ -703,27 +819,43 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
                                       ex_hdr.provider_in_words * 8,
                                       ex_hdr.provider_out_words * 8);
 
-               err = uverbs_ex_cmd_table[command](file,
+               ret = uverbs_ex_cmd_table[command](file,
+                                                  ib_dev,
                                                   &ucore,
                                                   &uhw);
-
-               if (err)
-                       return err;
-
-               return written_count;
+               if (!ret)
+                       ret = written_count;
+       } else {
+               ret = -ENOSYS;
        }
 
-       return -ENOSYS;
+out:
+       srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
+       return ret;
 }
 
 static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma)
 {
        struct ib_uverbs_file *file = filp->private_data;
+       struct ib_device *ib_dev;
+       int ret = 0;
+       int srcu_key;
+
+       srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
+       ib_dev = srcu_dereference(file->device->ib_dev,
+                                 &file->device->disassociate_srcu);
+       if (!ib_dev) {
+               ret = -EIO;
+               goto out;
+       }
 
        if (!file->ucontext)
-               return -ENODEV;
+               ret = -ENODEV;
        else
-               return file->device->ib_dev->mmap(file->ucontext, vma);
+               ret = ib_dev->mmap(file->ucontext, vma);
+out:
+       srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
+       return ret;
 }
 
 /*
@@ -740,23 +872,43 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
 {
        struct ib_uverbs_device *dev;
        struct ib_uverbs_file *file;
+       struct ib_device *ib_dev;
        int ret;
+       int module_dependent;
+       int srcu_key;
 
        dev = container_of(inode->i_cdev, struct ib_uverbs_device, cdev);
-       if (dev)
-               kref_get(&dev->ref);
-       else
+       if (!atomic_inc_not_zero(&dev->refcount))
                return -ENXIO;
 
-       if (!try_module_get(dev->ib_dev->owner)) {
-               ret = -ENODEV;
+       srcu_key = srcu_read_lock(&dev->disassociate_srcu);
+       mutex_lock(&dev->lists_mutex);
+       ib_dev = srcu_dereference(dev->ib_dev,
+                                 &dev->disassociate_srcu);
+       if (!ib_dev) {
+               ret = -EIO;
                goto err;
        }
 
-       file = kmalloc(sizeof *file, GFP_KERNEL);
+       /* If the IB device supports disassociate_ucontext, there is no hard
+        * dependency between the uverbs device and its low-level device.
+        */
+       module_dependent = !(ib_dev->disassociate_ucontext);
+
+       if (module_dependent) {
+               if (!try_module_get(ib_dev->owner)) {
+                       ret = -ENODEV;
+                       goto err;
+               }
+       }
+
+       file = kzalloc(sizeof(*file), GFP_KERNEL);
        if (!file) {
                ret = -ENOMEM;
-               goto err_module;
+               if (module_dependent)
+                       goto err_module;
+
+               goto err;
        }
 
        file->device     = dev;
@@ -766,27 +918,47 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
        mutex_init(&file->mutex);
 
        filp->private_data = file;
+       kobject_get(&dev->kobj);
+       list_add_tail(&file->list, &dev->uverbs_file_list);
+       mutex_unlock(&dev->lists_mutex);
+       srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
 
        return nonseekable_open(inode, filp);
 
 err_module:
-       module_put(dev->ib_dev->owner);
+       module_put(ib_dev->owner);
 
 err:
-       kref_put(&dev->ref, ib_uverbs_release_dev);
+       mutex_unlock(&dev->lists_mutex);
+       srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
+       if (atomic_dec_and_test(&dev->refcount))
+               ib_uverbs_comp_dev(dev);
+
        return ret;
 }
 
 static int ib_uverbs_close(struct inode *inode, struct file *filp)
 {
        struct ib_uverbs_file *file = filp->private_data;
-
-       ib_uverbs_cleanup_ucontext(file, file->ucontext);
+       struct ib_uverbs_device *dev = file->device;
+       struct ib_ucontext *ucontext = NULL;
+
+       mutex_lock(&file->device->lists_mutex);
+       ucontext = file->ucontext;
+       file->ucontext = NULL;
+       if (!file->is_closed) {
+               list_del(&file->list);
+               file->is_closed = 1;
+       }
+       mutex_unlock(&file->device->lists_mutex);
+       if (ucontext)
+               ib_uverbs_cleanup_ucontext(file, ucontext);
 
        if (file->async_file)
                kref_put(&file->async_file->ref, ib_uverbs_release_event_file);
 
        kref_put(&file->ref, ib_uverbs_release_file);
+       kobject_put(&dev->kobj);
 
        return 0;
 }
@@ -817,12 +989,21 @@ static struct ib_client uverbs_client = {
 static ssize_t show_ibdev(struct device *device, struct device_attribute *attr,
                          char *buf)
 {
+       int ret = -ENODEV;
+       int srcu_key;
        struct ib_uverbs_device *dev = dev_get_drvdata(device);
+       struct ib_device *ib_dev;
 
        if (!dev)
                return -ENODEV;
 
-       return sprintf(buf, "%s\n", dev->ib_dev->name);
+       srcu_key = srcu_read_lock(&dev->disassociate_srcu);
+       ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu);
+       if (ib_dev)
+               ret = sprintf(buf, "%s\n", ib_dev->name);
+       srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
+
+       return ret;
 }
 static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
 
@@ -830,11 +1011,19 @@ static ssize_t show_dev_abi_version(struct device *device,
                                    struct device_attribute *attr, char *buf)
 {
        struct ib_uverbs_device *dev = dev_get_drvdata(device);
+       int ret = -ENODEV;
+       int srcu_key;
+       struct ib_device *ib_dev;
 
        if (!dev)
                return -ENODEV;
+       srcu_key = srcu_read_lock(&dev->disassociate_srcu);
+       ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu);
+       if (ib_dev)
+               ret = sprintf(buf, "%d\n", ib_dev->uverbs_abi_ver);
+       srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
 
-       return sprintf(buf, "%d\n", dev->ib_dev->uverbs_abi_ver);
+       return ret;
 }
 static DEVICE_ATTR(abi_version, S_IRUGO, show_dev_abi_version, NULL);
 
@@ -874,6 +1063,7 @@ static void ib_uverbs_add_one(struct ib_device *device)
        int devnum;
        dev_t base;
        struct ib_uverbs_device *uverbs_dev;
+       int ret;
 
        if (!device->alloc_ucontext)
                return;
@@ -882,10 +1072,20 @@ static void ib_uverbs_add_one(struct ib_device *device)
        if (!uverbs_dev)
                return;
 
-       kref_init(&uverbs_dev->ref);
+       ret = init_srcu_struct(&uverbs_dev->disassociate_srcu);
+       if (ret) {
+               kfree(uverbs_dev);
+               return;
+       }
+
+       atomic_set(&uverbs_dev->refcount, 1);
        init_completion(&uverbs_dev->comp);
        uverbs_dev->xrcd_tree = RB_ROOT;
        mutex_init(&uverbs_dev->xrcd_tree_mutex);
+       kobject_init(&uverbs_dev->kobj, &ib_uverbs_dev_ktype);
+       mutex_init(&uverbs_dev->lists_mutex);
+       INIT_LIST_HEAD(&uverbs_dev->uverbs_file_list);
+       INIT_LIST_HEAD(&uverbs_dev->uverbs_events_file_list);
 
        spin_lock(&map_lock);
        devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES);
@@ -906,12 +1106,13 @@ static void ib_uverbs_add_one(struct ib_device *device)
        }
        spin_unlock(&map_lock);
 
-       uverbs_dev->ib_dev           = device;
+       rcu_assign_pointer(uverbs_dev->ib_dev, device);
        uverbs_dev->num_comp_vectors = device->num_comp_vectors;
 
        cdev_init(&uverbs_dev->cdev, NULL);
        uverbs_dev->cdev.owner = THIS_MODULE;
        uverbs_dev->cdev.ops = device->mmap ? &uverbs_mmap_fops : &uverbs_fops;
+       uverbs_dev->cdev.kobj.parent = &uverbs_dev->kobj;
        kobject_set_name(&uverbs_dev->cdev.kobj, "uverbs%d", uverbs_dev->devnum);
        if (cdev_add(&uverbs_dev->cdev, base, 1))
                goto err_cdev;
@@ -942,15 +1143,79 @@ err_cdev:
                clear_bit(devnum, overflow_map);
 
 err:
-       kref_put(&uverbs_dev->ref, ib_uverbs_release_dev);
+       if (atomic_dec_and_test(&uverbs_dev->refcount))
+               ib_uverbs_comp_dev(uverbs_dev);
        wait_for_completion(&uverbs_dev->comp);
-       kfree(uverbs_dev);
+       kobject_put(&uverbs_dev->kobj);
        return;
 }
 
-static void ib_uverbs_remove_one(struct ib_device *device)
+static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev,
+                                       struct ib_device *ib_dev)
 {
-       struct ib_uverbs_device *uverbs_dev = ib_get_client_data(device, &uverbs_client);
+       struct ib_uverbs_file *file;
+       struct ib_uverbs_event_file *event_file;
+       struct ib_event event;
+
+       /* Wait for pending/running commands to terminate */
+       synchronize_srcu(&uverbs_dev->disassociate_srcu);
+       event.event = IB_EVENT_DEVICE_FATAL;
+       event.element.port_num = 0;
+       event.device = ib_dev;
+
+       mutex_lock(&uverbs_dev->lists_mutex);
+       while (!list_empty(&uverbs_dev->uverbs_file_list)) {
+               struct ib_ucontext *ucontext;
+
+               file = list_first_entry(&uverbs_dev->uverbs_file_list,
+                                       struct ib_uverbs_file, list);
+               file->is_closed = 1;
+               ucontext = file->ucontext;
+               list_del(&file->list);
+               file->ucontext = NULL;
+               kref_get(&file->ref);
+               mutex_unlock(&uverbs_dev->lists_mutex);
+               /* We must release the mutex before going ahead and calling
+                * disassociate_ucontext. disassociate_ucontext might end up
+                * indirectly calling uverbs_close, for example due to freeing
+                * the resources (e.g. mmput).
+                */
+               ib_uverbs_event_handler(&file->event_handler, &event);
+               if (ucontext) {
+                       ib_dev->disassociate_ucontext(ucontext);
+                       ib_uverbs_cleanup_ucontext(file, ucontext);
+               }
+
+               mutex_lock(&uverbs_dev->lists_mutex);
+               kref_put(&file->ref, ib_uverbs_release_file);
+       }
+
+       while (!list_empty(&uverbs_dev->uverbs_events_file_list)) {
+               event_file = list_first_entry(&uverbs_dev->
+                                             uverbs_events_file_list,
+                                             struct ib_uverbs_event_file,
+                                             list);
+               spin_lock_irq(&event_file->lock);
+               event_file->is_closed = 1;
+               spin_unlock_irq(&event_file->lock);
+
+               list_del(&event_file->list);
+               if (event_file->is_async) {
+                       ib_unregister_event_handler(&event_file->uverbs_file->
+                                                   event_handler);
+                       event_file->uverbs_file->event_handler.device = NULL;
+               }
+
+               wake_up_interruptible(&event_file->poll_wait);
+               kill_fasync(&event_file->async_queue, SIGIO, POLL_IN);
+       }
+       mutex_unlock(&uverbs_dev->lists_mutex);
+}
+
+static void ib_uverbs_remove_one(struct ib_device *device, void *client_data)
+{
+       struct ib_uverbs_device *uverbs_dev = client_data;
+       int wait_clients = 1;
 
        if (!uverbs_dev)
                return;
@@ -964,9 +1229,28 @@ static void ib_uverbs_remove_one(struct ib_device *device)
        else
                clear_bit(uverbs_dev->devnum - IB_UVERBS_MAX_DEVICES, overflow_map);
 
-       kref_put(&uverbs_dev->ref, ib_uverbs_release_dev);
-       wait_for_completion(&uverbs_dev->comp);
-       kfree(uverbs_dev);
+       if (device->disassociate_ucontext) {
+               /* We disassociate HW resources and immediately return.
+                * Userspace will see an EIO errno for all future access.
+                * Upon returning, ib_device may be freed internally and is not
+                * valid any more.
+                * uverbs_device is still available until all clients close
+                * their files, then the uverbs device ref count will be zero
+                * and its resources will be freed.
+                * Note: At this point no more files can be opened since the
+                * cdev was deleted; however, active clients can still issue
+                * commands and close their open files.
+                */
+               rcu_assign_pointer(uverbs_dev->ib_dev, NULL);
+               ib_uverbs_free_hw_resources(uverbs_dev, device);
+               wait_clients = 0;
+       }
+
+       if (atomic_dec_and_test(&uverbs_dev->refcount))
+               ib_uverbs_comp_dev(uverbs_dev);
+       if (wait_clients)
+               wait_for_completion(&uverbs_dev->comp);
+       kobject_put(&uverbs_dev->kobj);
 }
 
 static char *uverbs_devnode(struct device *dev, umode_t *mode)
index bac3fb406a7470edb0deb1d66d6559118fe8ac53..e1f2c9887f3f48ebc20c0931304ae75c5f65c03f 100644 (file)
@@ -213,28 +213,79 @@ EXPORT_SYMBOL(rdma_port_get_link_layer);
 
 /* Protection domains */
 
+/**
+ * ib_alloc_pd - Allocates an unused protection domain.
+ * @device: The device on which to allocate the protection domain.
+ *
+ * A protection domain object provides an association between QPs, shared
+ * receive queues, address handles, memory regions, and memory windows.
+ *
+ * Every PD has a local_dma_lkey which can be used as the lkey value for local
+ * memory operations.
+ */
 struct ib_pd *ib_alloc_pd(struct ib_device *device)
 {
        struct ib_pd *pd;
+       struct ib_device_attr devattr;
+       int rc;
+
+       rc = ib_query_device(device, &devattr);
+       if (rc)
+               return ERR_PTR(rc);
 
        pd = device->alloc_pd(device, NULL, NULL);
+       if (IS_ERR(pd))
+               return pd;
+
+       pd->device = device;
+       pd->uobject = NULL;
+       pd->local_mr = NULL;
+       atomic_set(&pd->usecnt, 0);
+
+       if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)
+               pd->local_dma_lkey = device->local_dma_lkey;
+       else {
+               struct ib_mr *mr;
+
+               mr = ib_get_dma_mr(pd, IB_ACCESS_LOCAL_WRITE);
+               if (IS_ERR(mr)) {
+                       ib_dealloc_pd(pd);
+                       return (struct ib_pd *)mr;
+               }
 
-       if (!IS_ERR(pd)) {
-               pd->device  = device;
-               pd->uobject = NULL;
-               atomic_set(&pd->usecnt, 0);
+               pd->local_mr = mr;
+               pd->local_dma_lkey = pd->local_mr->lkey;
        }
-
        return pd;
 }
 EXPORT_SYMBOL(ib_alloc_pd);
 
-int ib_dealloc_pd(struct ib_pd *pd)
+/**
+ * ib_dealloc_pd - Deallocates a protection domain.
+ * @pd: The protection domain to deallocate.
+ *
+ * It is an error to call this function while any resources in the pd still
+ * exist.  The caller is responsible for synchronously destroying them and
+ * guaranteeing that no new allocations will happen.
+ */
+void ib_dealloc_pd(struct ib_pd *pd)
 {
-       if (atomic_read(&pd->usecnt))
-               return -EBUSY;
+       int ret;
+
+       if (pd->local_mr) {
+               ret = ib_dereg_mr(pd->local_mr);
+               WARN_ON(ret);
+               pd->local_mr = NULL;
+       }
+
+       /* uverbs manipulates usecnt with proper locking, while the kabi
+          requires the caller to guarantee we can't race here. */
+       WARN_ON(atomic_read(&pd->usecnt));
 
-       return pd->device->dealloc_pd(pd);
+       /* Making dealloc_pd a void return is a WIP, no driver should return
+          an error here. */
+       ret = pd->device->dealloc_pd(pd);
+       WARN_ONCE(ret, "Infiniband HW driver failed dealloc_pd");
 }
 EXPORT_SYMBOL(ib_dealloc_pd);
 
@@ -1144,73 +1195,6 @@ struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags)
 }
 EXPORT_SYMBOL(ib_get_dma_mr);
 
-struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd,
-                            struct ib_phys_buf *phys_buf_array,
-                            int num_phys_buf,
-                            int mr_access_flags,
-                            u64 *iova_start)
-{
-       struct ib_mr *mr;
-       int err;
-
-       err = ib_check_mr_access(mr_access_flags);
-       if (err)
-               return ERR_PTR(err);
-
-       if (!pd->device->reg_phys_mr)
-               return ERR_PTR(-ENOSYS);
-
-       mr = pd->device->reg_phys_mr(pd, phys_buf_array, num_phys_buf,
-                                    mr_access_flags, iova_start);
-
-       if (!IS_ERR(mr)) {
-               mr->device  = pd->device;
-               mr->pd      = pd;
-               mr->uobject = NULL;
-               atomic_inc(&pd->usecnt);
-               atomic_set(&mr->usecnt, 0);
-       }
-
-       return mr;
-}
-EXPORT_SYMBOL(ib_reg_phys_mr);
-
-int ib_rereg_phys_mr(struct ib_mr *mr,
-                    int mr_rereg_mask,
-                    struct ib_pd *pd,
-                    struct ib_phys_buf *phys_buf_array,
-                    int num_phys_buf,
-                    int mr_access_flags,
-                    u64 *iova_start)
-{
-       struct ib_pd *old_pd;
-       int ret;
-
-       ret = ib_check_mr_access(mr_access_flags);
-       if (ret)
-               return ret;
-
-       if (!mr->device->rereg_phys_mr)
-               return -ENOSYS;
-
-       if (atomic_read(&mr->usecnt))
-               return -EBUSY;
-
-       old_pd = mr->pd;
-
-       ret = mr->device->rereg_phys_mr(mr, mr_rereg_mask, pd,
-                                       phys_buf_array, num_phys_buf,
-                                       mr_access_flags, iova_start);
-
-       if (!ret && (mr_rereg_mask & IB_MR_REREG_PD)) {
-               atomic_dec(&old_pd->usecnt);
-               atomic_inc(&pd->usecnt);
-       }
-
-       return ret;
-}
-EXPORT_SYMBOL(ib_rereg_phys_mr);
-
 int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr)
 {
        return mr->device->query_mr ?
@@ -1235,54 +1219,28 @@ int ib_dereg_mr(struct ib_mr *mr)
 }
 EXPORT_SYMBOL(ib_dereg_mr);
 
-struct ib_mr *ib_create_mr(struct ib_pd *pd,
-                          struct ib_mr_init_attr *mr_init_attr)
-{
-       struct ib_mr *mr;
-
-       if (!pd->device->create_mr)
-               return ERR_PTR(-ENOSYS);
-
-       mr = pd->device->create_mr(pd, mr_init_attr);
-
-       if (!IS_ERR(mr)) {
-               mr->device  = pd->device;
-               mr->pd      = pd;
-               mr->uobject = NULL;
-               atomic_inc(&pd->usecnt);
-               atomic_set(&mr->usecnt, 0);
-       }
-
-       return mr;
-}
-EXPORT_SYMBOL(ib_create_mr);
-
-int ib_destroy_mr(struct ib_mr *mr)
-{
-       struct ib_pd *pd;
-       int ret;
-
-       if (atomic_read(&mr->usecnt))
-               return -EBUSY;
-
-       pd = mr->pd;
-       ret = mr->device->destroy_mr(mr);
-       if (!ret)
-               atomic_dec(&pd->usecnt);
-
-       return ret;
-}
-EXPORT_SYMBOL(ib_destroy_mr);
-
-struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len)
+/**
+ * ib_alloc_mr() - Allocates a memory region
+ * @pd:            protection domain associated with the region
+ * @mr_type:       memory region type
+ * @max_num_sg:    maximum sg entries available for registration.
+ *
+ * Notes:
+ * Memory registration page/sg lists must not exceed max_num_sg.
+ * For mr_type IB_MR_TYPE_MEM_REG, the total length cannot exceed
+ * max_num_sg * used_page_size.
+ *
+ */
+struct ib_mr *ib_alloc_mr(struct ib_pd *pd,
+                         enum ib_mr_type mr_type,
+                         u32 max_num_sg)
 {
        struct ib_mr *mr;
 
-       if (!pd->device->alloc_fast_reg_mr)
+       if (!pd->device->alloc_mr)
                return ERR_PTR(-ENOSYS);
 
-       mr = pd->device->alloc_fast_reg_mr(pd, max_page_list_len);
-
+       mr = pd->device->alloc_mr(pd, mr_type, max_num_sg);
        if (!IS_ERR(mr)) {
                mr->device  = pd->device;
                mr->pd      = pd;
@@ -1293,7 +1251,7 @@ struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len)
 
        return mr;
 }
-EXPORT_SYMBOL(ib_alloc_fast_reg_mr);
+EXPORT_SYMBOL(ib_alloc_mr);
 
 struct ib_fast_reg_page_list *ib_alloc_fast_reg_page_list(struct ib_device *device,
                                                          int max_page_list_len)
index e900b03531a9a880b94635f221ee6c3a99872047..aded2a5cc2d5d677cabe42707704119e68116c5d 100644 (file)
@@ -1,8 +1,5 @@
 obj-$(CONFIG_INFINIBAND_MTHCA)         += mthca/
-obj-$(CONFIG_INFINIBAND_IPATH)         += ipath/
 obj-$(CONFIG_INFINIBAND_QIB)           += qib/
-obj-$(CONFIG_INFINIBAND_EHCA)          += ehca/
-obj-$(CONFIG_INFINIBAND_AMSO1100)      += amso1100/
 obj-$(CONFIG_INFINIBAND_CXGB3)         += cxgb3/
 obj-$(CONFIG_INFINIBAND_CXGB4)         += cxgb4/
 obj-$(CONFIG_MLX4_INFINIBAND)          += mlx4/
diff --git a/drivers/infiniband/hw/amso1100/Kbuild b/drivers/infiniband/hw/amso1100/Kbuild
deleted file mode 100644 (file)
index 950dfab..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-ccflags-$(CONFIG_INFINIBAND_AMSO1100_DEBUG) := -DDEBUG
-
-obj-$(CONFIG_INFINIBAND_AMSO1100) += iw_c2.o
-
-iw_c2-y := c2.o c2_provider.o c2_rnic.o c2_alloc.o c2_mq.o c2_ae.o c2_vq.o \
-       c2_intr.o c2_cq.o c2_qp.o c2_cm.o c2_mm.o c2_pd.o
diff --git a/drivers/infiniband/hw/amso1100/Kconfig b/drivers/infiniband/hw/amso1100/Kconfig
deleted file mode 100644 (file)
index e6ce5f2..0000000
+++ /dev/null
@@ -1,15 +0,0 @@
-config INFINIBAND_AMSO1100
-       tristate "Ammasso 1100 HCA support"
-       depends on PCI && INET
-       ---help---
-         This is a low-level driver for the Ammasso 1100 host
-         channel adapter (HCA).
-
-config INFINIBAND_AMSO1100_DEBUG
-       bool "Verbose debugging output"
-       depends on INFINIBAND_AMSO1100
-       default n
-       ---help---
-         This option causes the amso1100 driver to produce a bunch of
-         debug messages.  Select this if you are developing the driver
-         or trying to diagnose a problem.
diff --git a/drivers/infiniband/hw/amso1100/c2.c b/drivers/infiniband/hw/amso1100/c2.c
deleted file mode 100644 (file)
index 766a71c..0000000
+++ /dev/null
@@ -1,1241 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/pci.h>
-#include <linux/netdevice.h>
-#include <linux/etherdevice.h>
-#include <linux/inetdevice.h>
-#include <linux/interrupt.h>
-#include <linux/delay.h>
-#include <linux/ethtool.h>
-#include <linux/mii.h>
-#include <linux/if_vlan.h>
-#include <linux/crc32.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/tcp.h>
-#include <linux/init.h>
-#include <linux/dma-mapping.h>
-#include <linux/slab.h>
-#include <linux/prefetch.h>
-
-#include <asm/io.h>
-#include <asm/irq.h>
-#include <asm/byteorder.h>
-
-#include <rdma/ib_smi.h>
-#include "c2.h"
-#include "c2_provider.h"
-
-MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>");
-MODULE_DESCRIPTION("Ammasso AMSO1100 Low-level iWARP Driver");
-MODULE_LICENSE("Dual BSD/GPL");
-MODULE_VERSION(DRV_VERSION);
-
-static const u32 default_msg = NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK
-    | NETIF_MSG_IFUP | NETIF_MSG_IFDOWN;
-
-static int debug = -1;         /* defaults above */
-module_param(debug, int, 0);
-MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)");
-
-static int c2_up(struct net_device *netdev);
-static int c2_down(struct net_device *netdev);
-static int c2_xmit_frame(struct sk_buff *skb, struct net_device *netdev);
-static void c2_tx_interrupt(struct net_device *netdev);
-static void c2_rx_interrupt(struct net_device *netdev);
-static irqreturn_t c2_interrupt(int irq, void *dev_id);
-static void c2_tx_timeout(struct net_device *netdev);
-static int c2_change_mtu(struct net_device *netdev, int new_mtu);
-static void c2_reset(struct c2_port *c2_port);
-
-static struct pci_device_id c2_pci_table[] = {
-       { PCI_DEVICE(0x18b8, 0xb001) },
-       { 0 }
-};
-
-MODULE_DEVICE_TABLE(pci, c2_pci_table);
-
-static void c2_print_macaddr(struct net_device *netdev)
-{
-       pr_debug("%s: MAC %pM, IRQ %u\n", netdev->name, netdev->dev_addr, netdev->irq);
-}
-
-static void c2_set_rxbufsize(struct c2_port *c2_port)
-{
-       struct net_device *netdev = c2_port->netdev;
-
-       if (netdev->mtu > RX_BUF_SIZE)
-               c2_port->rx_buf_size =
-                   netdev->mtu + ETH_HLEN + sizeof(struct c2_rxp_hdr) +
-                   NET_IP_ALIGN;
-       else
-               c2_port->rx_buf_size = sizeof(struct c2_rxp_hdr) + RX_BUF_SIZE;
-}
-
-/*
- * Allocate TX ring elements and chain them together.
- * One-to-one association of adapter descriptors with ring elements.
- */
-static int c2_tx_ring_alloc(struct c2_ring *tx_ring, void *vaddr,
-                           dma_addr_t base, void __iomem * mmio_txp_ring)
-{
-       struct c2_tx_desc *tx_desc;
-       struct c2_txp_desc __iomem *txp_desc;
-       struct c2_element *elem;
-       int i;
-
-       tx_ring->start = kmalloc(sizeof(*elem) * tx_ring->count, GFP_KERNEL);
-       if (!tx_ring->start)
-               return -ENOMEM;
-
-       elem = tx_ring->start;
-       tx_desc = vaddr;
-       txp_desc = mmio_txp_ring;
-       for (i = 0; i < tx_ring->count; i++, elem++, tx_desc++, txp_desc++) {
-               tx_desc->len = 0;
-               tx_desc->status = 0;
-
-               /* Set TXP_HTXD_UNINIT */
-               __raw_writeq((__force u64) cpu_to_be64(0x1122334455667788ULL),
-                            (void __iomem *) txp_desc + C2_TXP_ADDR);
-               __raw_writew(0, (void __iomem *) txp_desc + C2_TXP_LEN);
-               __raw_writew((__force u16) cpu_to_be16(TXP_HTXD_UNINIT),
-                            (void __iomem *) txp_desc + C2_TXP_FLAGS);
-
-               elem->skb = NULL;
-               elem->ht_desc = tx_desc;
-               elem->hw_desc = txp_desc;
-
-               if (i == tx_ring->count - 1) {
-                       elem->next = tx_ring->start;
-                       tx_desc->next_offset = base;
-               } else {
-                       elem->next = elem + 1;
-                       tx_desc->next_offset =
-                           base + (i + 1) * sizeof(*tx_desc);
-               }
-       }
-
-       tx_ring->to_use = tx_ring->to_clean = tx_ring->start;
-
-       return 0;
-}
-
-/*
- * Allocate RX ring elements and chain them together.
- * One-to-one association of adapter descriptors with ring elements.
- */
-static int c2_rx_ring_alloc(struct c2_ring *rx_ring, void *vaddr,
-                           dma_addr_t base, void __iomem * mmio_rxp_ring)
-{
-       struct c2_rx_desc *rx_desc;
-       struct c2_rxp_desc __iomem *rxp_desc;
-       struct c2_element *elem;
-       int i;
-
-       rx_ring->start = kmalloc(sizeof(*elem) * rx_ring->count, GFP_KERNEL);
-       if (!rx_ring->start)
-               return -ENOMEM;
-
-       elem = rx_ring->start;
-       rx_desc = vaddr;
-       rxp_desc = mmio_rxp_ring;
-       for (i = 0; i < rx_ring->count; i++, elem++, rx_desc++, rxp_desc++) {
-               rx_desc->len = 0;
-               rx_desc->status = 0;
-
-               /* Set RXP_HRXD_UNINIT */
-               __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_OK),
-                      (void __iomem *) rxp_desc + C2_RXP_STATUS);
-               __raw_writew(0, (void __iomem *) rxp_desc + C2_RXP_COUNT);
-               __raw_writew(0, (void __iomem *) rxp_desc + C2_RXP_LEN);
-               __raw_writeq((__force u64) cpu_to_be64(0x99aabbccddeeffULL),
-                            (void __iomem *) rxp_desc + C2_RXP_ADDR);
-               __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_UNINIT),
-                            (void __iomem *) rxp_desc + C2_RXP_FLAGS);
-
-               elem->skb = NULL;
-               elem->ht_desc = rx_desc;
-               elem->hw_desc = rxp_desc;
-
-               if (i == rx_ring->count - 1) {
-                       elem->next = rx_ring->start;
-                       rx_desc->next_offset = base;
-               } else {
-                       elem->next = elem + 1;
-                       rx_desc->next_offset =
-                           base + (i + 1) * sizeof(*rx_desc);
-               }
-       }
-
-       rx_ring->to_use = rx_ring->to_clean = rx_ring->start;
-
-       return 0;
-}
-
-/* Setup buffer for receiving */
-static inline int c2_rx_alloc(struct c2_port *c2_port, struct c2_element *elem)
-{
-       struct c2_dev *c2dev = c2_port->c2dev;
-       struct c2_rx_desc *rx_desc = elem->ht_desc;
-       struct sk_buff *skb;
-       dma_addr_t mapaddr;
-       u32 maplen;
-       struct c2_rxp_hdr *rxp_hdr;
-
-       skb = dev_alloc_skb(c2_port->rx_buf_size);
-       if (unlikely(!skb)) {
-               pr_debug("%s: out of memory for receive\n",
-                       c2_port->netdev->name);
-               return -ENOMEM;
-       }
-
-       /* Zero out the rxp hdr in the sk_buff */
-       memset(skb->data, 0, sizeof(*rxp_hdr));
-
-       skb->dev = c2_port->netdev;
-
-       maplen = c2_port->rx_buf_size;
-       mapaddr =
-           pci_map_single(c2dev->pcidev, skb->data, maplen,
-                          PCI_DMA_FROMDEVICE);
-
-       /* Set the sk_buff RXP_header to RXP_HRXD_READY */
-       rxp_hdr = (struct c2_rxp_hdr *) skb->data;
-       rxp_hdr->flags = RXP_HRXD_READY;
-
-       __raw_writew(0, elem->hw_desc + C2_RXP_STATUS);
-       __raw_writew((__force u16) cpu_to_be16((u16) maplen - sizeof(*rxp_hdr)),
-                    elem->hw_desc + C2_RXP_LEN);
-       __raw_writeq((__force u64) cpu_to_be64(mapaddr), elem->hw_desc + C2_RXP_ADDR);
-       __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_READY),
-                    elem->hw_desc + C2_RXP_FLAGS);
-
-       elem->skb = skb;
-       elem->mapaddr = mapaddr;
-       elem->maplen = maplen;
-       rx_desc->len = maplen;
-
-       return 0;
-}
-
-/*
- * Allocate buffers for the Rx ring
- * For receive:  rx_ring.to_clean is next received frame
- */
-static int c2_rx_fill(struct c2_port *c2_port)
-{
-       struct c2_ring *rx_ring = &c2_port->rx_ring;
-       struct c2_element *elem;
-       int ret = 0;
-
-       elem = rx_ring->start;
-       do {
-               if (c2_rx_alloc(c2_port, elem)) {
-                       ret = 1;
-                       break;
-               }
-       } while ((elem = elem->next) != rx_ring->start);
-
-       rx_ring->to_clean = rx_ring->start;
-       return ret;
-}
-
-/* Free all buffers in RX ring, assumes receiver stopped */
-static void c2_rx_clean(struct c2_port *c2_port)
-{
-       struct c2_dev *c2dev = c2_port->c2dev;
-       struct c2_ring *rx_ring = &c2_port->rx_ring;
-       struct c2_element *elem;
-       struct c2_rx_desc *rx_desc;
-
-       elem = rx_ring->start;
-       do {
-               rx_desc = elem->ht_desc;
-               rx_desc->len = 0;
-
-               __raw_writew(0, elem->hw_desc + C2_RXP_STATUS);
-               __raw_writew(0, elem->hw_desc + C2_RXP_COUNT);
-               __raw_writew(0, elem->hw_desc + C2_RXP_LEN);
-               __raw_writeq((__force u64) cpu_to_be64(0x99aabbccddeeffULL),
-                            elem->hw_desc + C2_RXP_ADDR);
-               __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_UNINIT),
-                            elem->hw_desc + C2_RXP_FLAGS);
-
-               if (elem->skb) {
-                       pci_unmap_single(c2dev->pcidev, elem->mapaddr,
-                                        elem->maplen, PCI_DMA_FROMDEVICE);
-                       dev_kfree_skb(elem->skb);
-                       elem->skb = NULL;
-               }
-       } while ((elem = elem->next) != rx_ring->start);
-}
-
-static inline int c2_tx_free(struct c2_dev *c2dev, struct c2_element *elem)
-{
-       struct c2_tx_desc *tx_desc = elem->ht_desc;
-
-       tx_desc->len = 0;
-
-       pci_unmap_single(c2dev->pcidev, elem->mapaddr, elem->maplen,
-                        PCI_DMA_TODEVICE);
-
-       if (elem->skb) {
-               dev_kfree_skb_any(elem->skb);
-               elem->skb = NULL;
-       }
-
-       return 0;
-}
-
-/* Free all buffers in TX ring, assumes transmitter stopped */
-static void c2_tx_clean(struct c2_port *c2_port)
-{
-       struct c2_ring *tx_ring = &c2_port->tx_ring;
-       struct c2_element *elem;
-       struct c2_txp_desc txp_htxd;
-       int retry;
-       unsigned long flags;
-
-       spin_lock_irqsave(&c2_port->tx_lock, flags);
-
-       elem = tx_ring->start;
-
-       do {
-               retry = 0;
-               do {
-                       txp_htxd.flags =
-                           readw(elem->hw_desc + C2_TXP_FLAGS);
-
-                       if (txp_htxd.flags == TXP_HTXD_READY) {
-                               retry = 1;
-                               __raw_writew(0,
-                                            elem->hw_desc + C2_TXP_LEN);
-                               __raw_writeq(0,
-                                            elem->hw_desc + C2_TXP_ADDR);
-                               __raw_writew((__force u16) cpu_to_be16(TXP_HTXD_DONE),
-                                            elem->hw_desc + C2_TXP_FLAGS);
-                               c2_port->netdev->stats.tx_dropped++;
-                               break;
-                       } else {
-                               __raw_writew(0,
-                                            elem->hw_desc + C2_TXP_LEN);
-                               __raw_writeq((__force u64) cpu_to_be64(0x1122334455667788ULL),
-                                            elem->hw_desc + C2_TXP_ADDR);
-                               __raw_writew((__force u16) cpu_to_be16(TXP_HTXD_UNINIT),
-                                            elem->hw_desc + C2_TXP_FLAGS);
-                       }
-
-                       c2_tx_free(c2_port->c2dev, elem);
-
-               } while ((elem = elem->next) != tx_ring->start);
-       } while (retry);
-
-       c2_port->tx_avail = c2_port->tx_ring.count - 1;
-       c2_port->c2dev->cur_tx = tx_ring->to_use - tx_ring->start;
-
-       if (c2_port->tx_avail > MAX_SKB_FRAGS + 1)
-               netif_wake_queue(c2_port->netdev);
-
-       spin_unlock_irqrestore(&c2_port->tx_lock, flags);
-}
-
-/*
- * Process transmit descriptors marked 'DONE' by the firmware,
- * freeing up their unneeded sk_buffs.
- */
-static void c2_tx_interrupt(struct net_device *netdev)
-{
-       struct c2_port *c2_port = netdev_priv(netdev);
-       struct c2_dev *c2dev = c2_port->c2dev;
-       struct c2_ring *tx_ring = &c2_port->tx_ring;
-       struct c2_element *elem;
-       struct c2_txp_desc txp_htxd;
-
-       spin_lock(&c2_port->tx_lock);
-
-       for (elem = tx_ring->to_clean; elem != tx_ring->to_use;
-            elem = elem->next) {
-               txp_htxd.flags =
-                   be16_to_cpu((__force __be16) readw(elem->hw_desc + C2_TXP_FLAGS));
-
-               if (txp_htxd.flags != TXP_HTXD_DONE)
-                       break;
-
-               if (netif_msg_tx_done(c2_port)) {
-                       /* PCI reads are expensive in fast path */
-                       txp_htxd.len =
-                           be16_to_cpu((__force __be16) readw(elem->hw_desc + C2_TXP_LEN));
-                       pr_debug("%s: tx done slot %3Zu status 0x%x len "
-                               "%5u bytes\n",
-                               netdev->name, elem - tx_ring->start,
-                               txp_htxd.flags, txp_htxd.len);
-               }
-
-               c2_tx_free(c2dev, elem);
-               ++(c2_port->tx_avail);
-       }
-
-       tx_ring->to_clean = elem;
-
-       if (netif_queue_stopped(netdev)
-           && c2_port->tx_avail > MAX_SKB_FRAGS + 1)
-               netif_wake_queue(netdev);
-
-       spin_unlock(&c2_port->tx_lock);
-}
-
-static void c2_rx_error(struct c2_port *c2_port, struct c2_element *elem)
-{
-       struct c2_rx_desc *rx_desc = elem->ht_desc;
-       struct c2_rxp_hdr *rxp_hdr = (struct c2_rxp_hdr *) elem->skb->data;
-
-       if (rxp_hdr->status != RXP_HRXD_OK ||
-           rxp_hdr->len > (rx_desc->len - sizeof(*rxp_hdr))) {
-               pr_debug("BAD RXP_HRXD\n");
-               pr_debug("  rx_desc : %p\n", rx_desc);
-               pr_debug("    index : %Zu\n",
-                       elem - c2_port->rx_ring.start);
-               pr_debug("    len   : %u\n", rx_desc->len);
-               pr_debug("  rxp_hdr : %p [PA %p]\n", rxp_hdr,
-                       (void *) __pa((unsigned long) rxp_hdr));
-               pr_debug("    flags : 0x%x\n", rxp_hdr->flags);
-               pr_debug("    status: 0x%x\n", rxp_hdr->status);
-               pr_debug("    len   : %u\n", rxp_hdr->len);
-               pr_debug("    rsvd  : 0x%x\n", rxp_hdr->rsvd);
-       }
-
-       /* Setup the skb for reuse since we're dropping this pkt */
-       elem->skb->data = elem->skb->head;
-       skb_reset_tail_pointer(elem->skb);
-
-       /* Zero out the rxp hdr in the sk_buff */
-       memset(elem->skb->data, 0, sizeof(*rxp_hdr));
-
-       /* Write the descriptor to the adapter's rx ring */
-       __raw_writew(0, elem->hw_desc + C2_RXP_STATUS);
-       __raw_writew(0, elem->hw_desc + C2_RXP_COUNT);
-       __raw_writew((__force u16) cpu_to_be16((u16) elem->maplen - sizeof(*rxp_hdr)),
-                    elem->hw_desc + C2_RXP_LEN);
-       __raw_writeq((__force u64) cpu_to_be64(elem->mapaddr),
-                    elem->hw_desc + C2_RXP_ADDR);
-       __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_READY),
-                    elem->hw_desc + C2_RXP_FLAGS);
-
-       pr_debug("packet dropped\n");
-       c2_port->netdev->stats.rx_dropped++;
-}
-
-static void c2_rx_interrupt(struct net_device *netdev)
-{
-       struct c2_port *c2_port = netdev_priv(netdev);
-       struct c2_dev *c2dev = c2_port->c2dev;
-       struct c2_ring *rx_ring = &c2_port->rx_ring;
-       struct c2_element *elem;
-       struct c2_rx_desc *rx_desc;
-       struct c2_rxp_hdr *rxp_hdr;
-       struct sk_buff *skb;
-       dma_addr_t mapaddr;
-       u32 maplen, buflen;
-       unsigned long flags;
-
-       spin_lock_irqsave(&c2dev->lock, flags);
-
-       /* Begin where we left off */
-       rx_ring->to_clean = rx_ring->start + c2dev->cur_rx;
-
-       for (elem = rx_ring->to_clean; elem->next != rx_ring->to_clean;
-            elem = elem->next) {
-               rx_desc = elem->ht_desc;
-               mapaddr = elem->mapaddr;
-               maplen = elem->maplen;
-               skb = elem->skb;
-               rxp_hdr = (struct c2_rxp_hdr *) skb->data;
-
-               if (rxp_hdr->flags != RXP_HRXD_DONE)
-                       break;
-               buflen = rxp_hdr->len;
-
-               /* Sanity check the RXP header */
-               if (rxp_hdr->status != RXP_HRXD_OK ||
-                   buflen > (rx_desc->len - sizeof(*rxp_hdr))) {
-                       c2_rx_error(c2_port, elem);
-                       continue;
-               }
-
-               /*
-                * Allocate and map a new skb for replenishing the host
-                * RX desc
-                */
-               if (c2_rx_alloc(c2_port, elem)) {
-                       c2_rx_error(c2_port, elem);
-                       continue;
-               }
-
-               /* Unmap the old skb */
-               pci_unmap_single(c2dev->pcidev, mapaddr, maplen,
-                                PCI_DMA_FROMDEVICE);
-
-               prefetch(skb->data);
-
-               /*
-                * Skip past the leading 8 bytes comprising of the
-                * "struct c2_rxp_hdr", prepended by the adapter
-                * to the usual Ethernet header ("struct ethhdr"),
-                * to the start of the raw Ethernet packet.
-                *
-                * Fix up the various fields in the sk_buff before
-                * passing it up to netif_rx(). The transfer size
-                * (in bytes) specified by the adapter len field of
-                * the "struct rxp_hdr_t" does NOT include the
-                * "sizeof(struct c2_rxp_hdr)".
-                */
-               skb->data += sizeof(*rxp_hdr);
-               skb_set_tail_pointer(skb, buflen);
-               skb->len = buflen;
-               skb->protocol = eth_type_trans(skb, netdev);
-
-               netif_rx(skb);
-
-               netdev->stats.rx_packets++;
-               netdev->stats.rx_bytes += buflen;
-       }
-
-       /* Save where we left off */
-       rx_ring->to_clean = elem;
-       c2dev->cur_rx = elem - rx_ring->start;
-       C2_SET_CUR_RX(c2dev, c2dev->cur_rx);
-
-       spin_unlock_irqrestore(&c2dev->lock, flags);
-}
-
-/*
- * Handle netisr0 TX & RX interrupts.
- */
-static irqreturn_t c2_interrupt(int irq, void *dev_id)
-{
-       unsigned int netisr0, dmaisr;
-       int handled = 0;
-       struct c2_dev *c2dev = (struct c2_dev *) dev_id;
-
-       /* Process CCILNET interrupts */
-       netisr0 = readl(c2dev->regs + C2_NISR0);
-       if (netisr0) {
-
-               /*
-                * There is an issue with the firmware that always
-                * provides the status of RX for both TX & RX
-                * interrupts.  So process both queues here.
-                */
-               c2_rx_interrupt(c2dev->netdev);
-               c2_tx_interrupt(c2dev->netdev);
-
-               /* Clear the interrupt */
-               writel(netisr0, c2dev->regs + C2_NISR0);
-               handled++;
-       }
-
-       /* Process RNIC interrupts */
-       dmaisr = readl(c2dev->regs + C2_DISR);
-       if (dmaisr) {
-               writel(dmaisr, c2dev->regs + C2_DISR);
-               c2_rnic_interrupt(c2dev);
-               handled++;
-       }
-
-       if (handled) {
-               return IRQ_HANDLED;
-       } else {
-               return IRQ_NONE;
-       }
-}
-
-static int c2_up(struct net_device *netdev)
-{
-       struct c2_port *c2_port = netdev_priv(netdev);
-       struct c2_dev *c2dev = c2_port->c2dev;
-       struct c2_element *elem;
-       struct c2_rxp_hdr *rxp_hdr;
-       struct in_device *in_dev;
-       size_t rx_size, tx_size;
-       int ret, i;
-       unsigned int netimr0;
-
-       if (netif_msg_ifup(c2_port))
-               pr_debug("%s: enabling interface\n", netdev->name);
-
-       /* Set the Rx buffer size based on MTU */
-       c2_set_rxbufsize(c2_port);
-
-       /* Allocate DMA'able memory for Tx/Rx host descriptor rings */
-       rx_size = c2_port->rx_ring.count * sizeof(struct c2_rx_desc);
-       tx_size = c2_port->tx_ring.count * sizeof(struct c2_tx_desc);
-
-       c2_port->mem_size = tx_size + rx_size;
-       c2_port->mem = pci_zalloc_consistent(c2dev->pcidev, c2_port->mem_size,
-                                            &c2_port->dma);
-       if (c2_port->mem == NULL) {
-               pr_debug("Unable to allocate memory for "
-                       "host descriptor rings\n");
-               return -ENOMEM;
-       }
-
-       /* Create the Rx host descriptor ring */
-       ret = c2_rx_ring_alloc(&c2_port->rx_ring, c2_port->mem, c2_port->dma,
-                              c2dev->mmio_rxp_ring);
-       if (ret) {
-               pr_debug("Unable to create RX ring\n");
-               goto bail0;
-       }
-
-       /* Allocate Rx buffers for the host descriptor ring */
-       if (c2_rx_fill(c2_port)) {
-               pr_debug("Unable to fill RX ring\n");
-               goto bail1;
-       }
-
-       /* Create the Tx host descriptor ring */
-       ret = c2_tx_ring_alloc(&c2_port->tx_ring, c2_port->mem + rx_size,
-                              c2_port->dma + rx_size, c2dev->mmio_txp_ring);
-       if (ret) {
-               pr_debug("Unable to create TX ring\n");
-               goto bail1;
-       }
-
-       /* Set the TX pointer to where we left off */
-       c2_port->tx_avail = c2_port->tx_ring.count - 1;
-       c2_port->tx_ring.to_use = c2_port->tx_ring.to_clean =
-           c2_port->tx_ring.start + c2dev->cur_tx;
-
-       /* missing: Initialize MAC */
-
-       BUG_ON(c2_port->tx_ring.to_use != c2_port->tx_ring.to_clean);
-
-       /* Reset the adapter, ensures the driver is in sync with the RXP */
-       c2_reset(c2_port);
-
-       /* Reset the READY bit in the sk_buff RXP headers & adapter HRXDQ */
-       for (i = 0, elem = c2_port->rx_ring.start; i < c2_port->rx_ring.count;
-            i++, elem++) {
-               rxp_hdr = (struct c2_rxp_hdr *) elem->skb->data;
-               rxp_hdr->flags = 0;
-               __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_READY),
-                            elem->hw_desc + C2_RXP_FLAGS);
-       }
-
-       /* Enable network packets */
-       netif_start_queue(netdev);
-
-       /* Enable IRQ */
-       writel(0, c2dev->regs + C2_IDIS);
-       netimr0 = readl(c2dev->regs + C2_NIMR0);
-       netimr0 &= ~(C2_PCI_HTX_INT | C2_PCI_HRX_INT);
-       writel(netimr0, c2dev->regs + C2_NIMR0);
-
-       /* Tell the stack to ignore arp requests for ipaddrs bound to
-        * other interfaces.  This is needed to prevent the host stack
-        * from responding to arp requests to the ipaddr bound on the
-        * rdma interface.
-        */
-       in_dev = in_dev_get(netdev);
-       IN_DEV_CONF_SET(in_dev, ARP_IGNORE, 1);
-       in_dev_put(in_dev);
-
-       return 0;
-
-      bail1:
-       c2_rx_clean(c2_port);
-       kfree(c2_port->rx_ring.start);
-
-      bail0:
-       pci_free_consistent(c2dev->pcidev, c2_port->mem_size, c2_port->mem,
-                           c2_port->dma);
-
-       return ret;
-}
-
-static int c2_down(struct net_device *netdev)
-{
-       struct c2_port *c2_port = netdev_priv(netdev);
-       struct c2_dev *c2dev = c2_port->c2dev;
-
-       if (netif_msg_ifdown(c2_port))
-               pr_debug("%s: disabling interface\n",
-                       netdev->name);
-
-       /* Wait for all the queued packets to get sent */
-       c2_tx_interrupt(netdev);
-
-       /* Disable network packets */
-       netif_stop_queue(netdev);
-
-       /* Disable IRQs by clearing the interrupt mask */
-       writel(1, c2dev->regs + C2_IDIS);
-       writel(0, c2dev->regs + C2_NIMR0);
-
-       /* missing: Stop transmitter */
-
-       /* missing: Stop receiver */
-
-       /* Reset the adapter, ensures the driver is in sync with the RXP */
-       c2_reset(c2_port);
-
-       /* missing: Turn off LEDs here */
-
-       /* Free all buffers in the host descriptor rings */
-       c2_tx_clean(c2_port);
-       c2_rx_clean(c2_port);
-
-       /* Free the host descriptor rings */
-       kfree(c2_port->rx_ring.start);
-       kfree(c2_port->tx_ring.start);
-       pci_free_consistent(c2dev->pcidev, c2_port->mem_size, c2_port->mem,
-                           c2_port->dma);
-
-       return 0;
-}
-
-static void c2_reset(struct c2_port *c2_port)
-{
-       struct c2_dev *c2dev = c2_port->c2dev;
-       unsigned int cur_rx = c2dev->cur_rx;
-
-       /* Tell the hardware to quiesce */
-       C2_SET_CUR_RX(c2dev, cur_rx | C2_PCI_HRX_QUI);
-
-       /*
-        * The hardware will reset the C2_PCI_HRX_QUI bit once
-        * the RXP is quiesced.  Wait 2 seconds for this.
-        */
-       ssleep(2);
-
-       cur_rx = C2_GET_CUR_RX(c2dev);
-
-       if (cur_rx & C2_PCI_HRX_QUI)
-               pr_debug("c2_reset: failed to quiesce the hardware!\n");
-
-       cur_rx &= ~C2_PCI_HRX_QUI;
-
-       c2dev->cur_rx = cur_rx;
-
-       pr_debug("Current RX: %u\n", c2dev->cur_rx);
-}
-
-static int c2_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
-{
-       struct c2_port *c2_port = netdev_priv(netdev);
-       struct c2_dev *c2dev = c2_port->c2dev;
-       struct c2_ring *tx_ring = &c2_port->tx_ring;
-       struct c2_element *elem;
-       dma_addr_t mapaddr;
-       u32 maplen;
-       unsigned long flags;
-       unsigned int i;
-
-       spin_lock_irqsave(&c2_port->tx_lock, flags);
-
-       if (unlikely(c2_port->tx_avail < (skb_shinfo(skb)->nr_frags + 1))) {
-               netif_stop_queue(netdev);
-               spin_unlock_irqrestore(&c2_port->tx_lock, flags);
-
-               pr_debug("%s: Tx ring full when queue awake!\n",
-                       netdev->name);
-               return NETDEV_TX_BUSY;
-       }
-
-       maplen = skb_headlen(skb);
-       mapaddr =
-           pci_map_single(c2dev->pcidev, skb->data, maplen, PCI_DMA_TODEVICE);
-
-       elem = tx_ring->to_use;
-       elem->skb = skb;
-       elem->mapaddr = mapaddr;
-       elem->maplen = maplen;
-
-       /* Tell HW to xmit */
-       __raw_writeq((__force u64) cpu_to_be64(mapaddr),
-                    elem->hw_desc + C2_TXP_ADDR);
-       __raw_writew((__force u16) cpu_to_be16(maplen),
-                    elem->hw_desc + C2_TXP_LEN);
-       __raw_writew((__force u16) cpu_to_be16(TXP_HTXD_READY),
-                    elem->hw_desc + C2_TXP_FLAGS);
-
-       netdev->stats.tx_packets++;
-       netdev->stats.tx_bytes += maplen;
-
-       /* Loop thru additional data fragments and queue them */
-       if (skb_shinfo(skb)->nr_frags) {
-               for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
-                       const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
-                       maplen = skb_frag_size(frag);
-                       mapaddr = skb_frag_dma_map(&c2dev->pcidev->dev, frag,
-                                                  0, maplen, DMA_TO_DEVICE);
-                       elem = elem->next;
-                       elem->skb = NULL;
-                       elem->mapaddr = mapaddr;
-                       elem->maplen = maplen;
-
-                       /* Tell HW to xmit */
-                       __raw_writeq((__force u64) cpu_to_be64(mapaddr),
-                                    elem->hw_desc + C2_TXP_ADDR);
-                       __raw_writew((__force u16) cpu_to_be16(maplen),
-                                    elem->hw_desc + C2_TXP_LEN);
-                       __raw_writew((__force u16) cpu_to_be16(TXP_HTXD_READY),
-                                    elem->hw_desc + C2_TXP_FLAGS);
-
-                       netdev->stats.tx_packets++;
-                       netdev->stats.tx_bytes += maplen;
-               }
-       }
-
-       tx_ring->to_use = elem->next;
-       c2_port->tx_avail -= (skb_shinfo(skb)->nr_frags + 1);
-
-       if (c2_port->tx_avail <= MAX_SKB_FRAGS + 1) {
-               netif_stop_queue(netdev);
-               if (netif_msg_tx_queued(c2_port))
-                       pr_debug("%s: transmit queue full\n",
-                               netdev->name);
-       }
-
-       spin_unlock_irqrestore(&c2_port->tx_lock, flags);
-
-       netdev->trans_start = jiffies;
-
-       return NETDEV_TX_OK;
-}
-
-static void c2_tx_timeout(struct net_device *netdev)
-{
-       struct c2_port *c2_port = netdev_priv(netdev);
-
-       if (netif_msg_timer(c2_port))
-               pr_debug("%s: tx timeout\n", netdev->name);
-
-       c2_tx_clean(c2_port);
-}
-
-static int c2_change_mtu(struct net_device *netdev, int new_mtu)
-{
-       int ret = 0;
-
-       if (new_mtu < ETH_ZLEN || new_mtu > ETH_JUMBO_MTU)
-               return -EINVAL;
-
-       netdev->mtu = new_mtu;
-
-       if (netif_running(netdev)) {
-               c2_down(netdev);
-
-               c2_up(netdev);
-       }
-
-       return ret;
-}
-
-static const struct net_device_ops c2_netdev = {
-       .ndo_open               = c2_up,
-       .ndo_stop               = c2_down,
-       .ndo_start_xmit         = c2_xmit_frame,
-       .ndo_tx_timeout         = c2_tx_timeout,
-       .ndo_change_mtu         = c2_change_mtu,
-       .ndo_set_mac_address    = eth_mac_addr,
-       .ndo_validate_addr      = eth_validate_addr,
-};
-
-/* Initialize network device */
-static struct net_device *c2_devinit(struct c2_dev *c2dev,
-                                    void __iomem * mmio_addr)
-{
-       struct c2_port *c2_port = NULL;
-       struct net_device *netdev = alloc_etherdev(sizeof(*c2_port));
-
-       if (!netdev) {
-               pr_debug("c2_port etherdev alloc failed\n");
-               return NULL;
-       }
-
-       SET_NETDEV_DEV(netdev, &c2dev->pcidev->dev);
-
-       netdev->netdev_ops = &c2_netdev;
-       netdev->watchdog_timeo = C2_TX_TIMEOUT;
-       netdev->irq = c2dev->pcidev->irq;
-
-       c2_port = netdev_priv(netdev);
-       c2_port->netdev = netdev;
-       c2_port->c2dev = c2dev;
-       c2_port->msg_enable = netif_msg_init(debug, default_msg);
-       c2_port->tx_ring.count = C2_NUM_TX_DESC;
-       c2_port->rx_ring.count = C2_NUM_RX_DESC;
-
-       spin_lock_init(&c2_port->tx_lock);
-
-       /* Copy our 48-bit ethernet hardware address */
-       memcpy_fromio(netdev->dev_addr, mmio_addr + C2_REGS_ENADDR, 6);
-
-       /* Validate the MAC address */
-       if (!is_valid_ether_addr(netdev->dev_addr)) {
-               pr_debug("Invalid MAC Address\n");
-               c2_print_macaddr(netdev);
-               free_netdev(netdev);
-               return NULL;
-       }
-
-       c2dev->netdev = netdev;
-
-       return netdev;
-}
-
-static int c2_probe(struct pci_dev *pcidev, const struct pci_device_id *ent)
-{
-       int ret = 0, i;
-       unsigned long reg0_start, reg0_flags, reg0_len;
-       unsigned long reg2_start, reg2_flags, reg2_len;
-       unsigned long reg4_start, reg4_flags, reg4_len;
-       unsigned kva_map_size;
-       struct net_device *netdev = NULL;
-       struct c2_dev *c2dev = NULL;
-       void __iomem *mmio_regs = NULL;
-
-       printk(KERN_INFO PFX "AMSO1100 Gigabit Ethernet driver v%s loaded\n",
-               DRV_VERSION);
-
-       /* Enable PCI device */
-       ret = pci_enable_device(pcidev);
-       if (ret) {
-               printk(KERN_ERR PFX "%s: Unable to enable PCI device\n",
-                       pci_name(pcidev));
-               goto bail0;
-       }
-
-       reg0_start = pci_resource_start(pcidev, BAR_0);
-       reg0_len = pci_resource_len(pcidev, BAR_0);
-       reg0_flags = pci_resource_flags(pcidev, BAR_0);
-
-       reg2_start = pci_resource_start(pcidev, BAR_2);
-       reg2_len = pci_resource_len(pcidev, BAR_2);
-       reg2_flags = pci_resource_flags(pcidev, BAR_2);
-
-       reg4_start = pci_resource_start(pcidev, BAR_4);
-       reg4_len = pci_resource_len(pcidev, BAR_4);
-       reg4_flags = pci_resource_flags(pcidev, BAR_4);
-
-       pr_debug("BAR0 size = 0x%lX bytes\n", reg0_len);
-       pr_debug("BAR2 size = 0x%lX bytes\n", reg2_len);
-       pr_debug("BAR4 size = 0x%lX bytes\n", reg4_len);
-
-       /* Make sure the PCI base addresses are MMIO resources */
-       if (!(reg0_flags & IORESOURCE_MEM) ||
-           !(reg2_flags & IORESOURCE_MEM) || !(reg4_flags & IORESOURCE_MEM)) {
-               printk(KERN_ERR PFX "PCI regions not an MMIO resource\n");
-               ret = -ENODEV;
-               goto bail1;
-       }
-
-       /* Check for weird/broken PCI region reporting */
-       if ((reg0_len < C2_REG0_SIZE) ||
-           (reg2_len < C2_REG2_SIZE) || (reg4_len < C2_REG4_SIZE)) {
-               printk(KERN_ERR PFX "Invalid PCI region sizes\n");
-               ret = -ENODEV;
-               goto bail1;
-       }
-
-       /* Reserve PCI I/O and memory resources */
-       ret = pci_request_regions(pcidev, DRV_NAME);
-       if (ret) {
-               printk(KERN_ERR PFX "%s: Unable to request regions\n",
-                       pci_name(pcidev));
-               goto bail1;
-       }
-
-       if ((sizeof(dma_addr_t) > 4)) {
-               ret = pci_set_dma_mask(pcidev, DMA_BIT_MASK(64));
-               if (ret < 0) {
-                       printk(KERN_ERR PFX "64b DMA configuration failed\n");
-                       goto bail2;
-               }
-       } else {
-               ret = pci_set_dma_mask(pcidev, DMA_BIT_MASK(32));
-               if (ret < 0) {
-                       printk(KERN_ERR PFX "32b DMA configuration failed\n");
-                       goto bail2;
-               }
-       }
-
-       /* Enables bus-mastering on the device */
-       pci_set_master(pcidev);
-
-       /* Remap the adapter PCI registers in BAR4 */
-       mmio_regs = ioremap_nocache(reg4_start + C2_PCI_REGS_OFFSET,
-                                   sizeof(struct c2_adapter_pci_regs));
-       if (!mmio_regs) {
-               printk(KERN_ERR PFX
-                       "Unable to remap adapter PCI registers in BAR4\n");
-               ret = -EIO;
-               goto bail2;
-       }
-
-       /* Validate PCI regs magic */
-       for (i = 0; i < sizeof(c2_magic); i++) {
-               if (c2_magic[i] != readb(mmio_regs + C2_REGS_MAGIC + i)) {
-                       printk(KERN_ERR PFX "Downlevel Firmware boot loader "
-                               "[%d/%Zd: got 0x%x, exp 0x%x]. Use the cc_flash "
-                              "utility to update your boot loader\n",
-                               i + 1, sizeof(c2_magic),
-                               readb(mmio_regs + C2_REGS_MAGIC + i),
-                               c2_magic[i]);
-                       printk(KERN_ERR PFX "Adapter not claimed\n");
-                       iounmap(mmio_regs);
-                       ret = -EIO;
-                       goto bail2;
-               }
-       }
-
-       /* Validate the adapter version */
-       if (be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_VERS)) != C2_VERSION) {
-               printk(KERN_ERR PFX "Version mismatch "
-                       "[fw=%u, c2=%u], Adapter not claimed\n",
-                       be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_VERS)),
-                       C2_VERSION);
-               ret = -EINVAL;
-               iounmap(mmio_regs);
-               goto bail2;
-       }
-
-       /* Validate the adapter IVN */
-       if (be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_IVN)) != C2_IVN) {
-               printk(KERN_ERR PFX "Downlevel firmware level. You should be using "
-                      "the OpenIB device support kit. "
-                      "[fw=0x%x, c2=0x%x], Adapter not claimed\n",
-                      be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_IVN)),
-                      C2_IVN);
-               ret = -EINVAL;
-               iounmap(mmio_regs);
-               goto bail2;
-       }
-
-       /* Allocate hardware structure */
-       c2dev = (struct c2_dev *) ib_alloc_device(sizeof(*c2dev));
-       if (!c2dev) {
-               printk(KERN_ERR PFX "%s: Unable to alloc hardware struct\n",
-                       pci_name(pcidev));
-               ret = -ENOMEM;
-               iounmap(mmio_regs);
-               goto bail2;
-       }
-
-       memset(c2dev, 0, sizeof(*c2dev));
-       spin_lock_init(&c2dev->lock);
-       c2dev->pcidev = pcidev;
-       c2dev->cur_tx = 0;
-
-       /* Get the last RX index */
-       c2dev->cur_rx =
-           (be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_HRX_CUR)) -
-            0xffffc000) / sizeof(struct c2_rxp_desc);
-
-       /* Request an interrupt line for the driver */
-       ret = request_irq(pcidev->irq, c2_interrupt, IRQF_SHARED, DRV_NAME, c2dev);
-       if (ret) {
-               printk(KERN_ERR PFX "%s: requested IRQ %u is busy\n",
-                       pci_name(pcidev), pcidev->irq);
-               iounmap(mmio_regs);
-               goto bail3;
-       }
-
-       /* Set driver specific data */
-       pci_set_drvdata(pcidev, c2dev);
-
-       /* Initialize network device */
-       netdev = c2_devinit(c2dev, mmio_regs);
-       if (!netdev) {
-               ret = -ENOMEM;
-               iounmap(mmio_regs);
-               goto bail4;
-       }
-
-       /* Save off the actual size prior to unmapping mmio_regs */
-       kva_map_size = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_PCI_WINSIZE));
-
-       /* Unmap the adapter PCI registers in BAR4 */
-       iounmap(mmio_regs);
-
-       /* Register network device */
-       ret = register_netdev(netdev);
-       if (ret) {
-               printk(KERN_ERR PFX "Unable to register netdev, ret = %d\n",
-                       ret);
-               goto bail5;
-       }
-
-       /* Disable network packets */
-       netif_stop_queue(netdev);
-
-       /* Remap the adapter HRXDQ PA space to kernel VA space */
-       c2dev->mmio_rxp_ring = ioremap_nocache(reg4_start + C2_RXP_HRXDQ_OFFSET,
-                                              C2_RXP_HRXDQ_SIZE);
-       if (!c2dev->mmio_rxp_ring) {
-               printk(KERN_ERR PFX "Unable to remap MMIO HRXDQ region\n");
-               ret = -EIO;
-               goto bail6;
-       }
-
-       /* Remap the adapter HTXDQ PA space to kernel VA space */
-       c2dev->mmio_txp_ring = ioremap_nocache(reg4_start + C2_TXP_HTXDQ_OFFSET,
-                                              C2_TXP_HTXDQ_SIZE);
-       if (!c2dev->mmio_txp_ring) {
-               printk(KERN_ERR PFX "Unable to remap MMIO HTXDQ region\n");
-               ret = -EIO;
-               goto bail7;
-       }
-
-       /* Save off the current RX index in the last 4 bytes of the TXP Ring */
-       C2_SET_CUR_RX(c2dev, c2dev->cur_rx);
-
-       /* Remap the PCI registers in adapter BAR0 to kernel VA space */
-       c2dev->regs = ioremap_nocache(reg0_start, reg0_len);
-       if (!c2dev->regs) {
-               printk(KERN_ERR PFX "Unable to remap BAR0\n");
-               ret = -EIO;
-               goto bail8;
-       }
-
-       /* Remap the PCI registers in adapter BAR4 to kernel VA space */
-       c2dev->pa = reg4_start + C2_PCI_REGS_OFFSET;
-       c2dev->kva = ioremap_nocache(reg4_start + C2_PCI_REGS_OFFSET,
-                                    kva_map_size);
-       if (!c2dev->kva) {
-               printk(KERN_ERR PFX "Unable to remap BAR4\n");
-               ret = -EIO;
-               goto bail9;
-       }
-
-       /* Print out the MAC address */
-       c2_print_macaddr(netdev);
-
-       ret = c2_rnic_init(c2dev);
-       if (ret) {
-               printk(KERN_ERR PFX "c2_rnic_init failed: %d\n", ret);
-               goto bail10;
-       }
-
-       ret = c2_register_device(c2dev);
-       if (ret)
-               goto bail10;
-
-       return 0;
-
- bail10:
-       iounmap(c2dev->kva);
-
- bail9:
-       iounmap(c2dev->regs);
-
- bail8:
-       iounmap(c2dev->mmio_txp_ring);
-
- bail7:
-       iounmap(c2dev->mmio_rxp_ring);
-
- bail6:
-       unregister_netdev(netdev);
-
- bail5:
-       free_netdev(netdev);
-
- bail4:
-       free_irq(pcidev->irq, c2dev);
-
- bail3:
-       ib_dealloc_device(&c2dev->ibdev);
-
- bail2:
-       pci_release_regions(pcidev);
-
- bail1:
-       pci_disable_device(pcidev);
-
- bail0:
-       return ret;
-}
-
-static void c2_remove(struct pci_dev *pcidev)
-{
-       struct c2_dev *c2dev = pci_get_drvdata(pcidev);
-       struct net_device *netdev = c2dev->netdev;
-
-       /* Unregister with OpenIB */
-       c2_unregister_device(c2dev);
-
-       /* Clean up the RNIC resources */
-       c2_rnic_term(c2dev);
-
-       /* Remove network device from the kernel */
-       unregister_netdev(netdev);
-
-       /* Free network device */
-       free_netdev(netdev);
-
-       /* Free the interrupt line */
-       free_irq(pcidev->irq, c2dev);
-
-       /* missing: Turn LEDs off here */
-
-       /* Unmap adapter PA space */
-       iounmap(c2dev->kva);
-       iounmap(c2dev->regs);
-       iounmap(c2dev->mmio_txp_ring);
-       iounmap(c2dev->mmio_rxp_ring);
-
-       /* Free the hardware structure */
-       ib_dealloc_device(&c2dev->ibdev);
-
-       /* Release reserved PCI I/O and memory resources */
-       pci_release_regions(pcidev);
-
-       /* Disable PCI device */
-       pci_disable_device(pcidev);
-
-       /* Clear driver specific data */
-       pci_set_drvdata(pcidev, NULL);
-}
-
-static struct pci_driver c2_pci_driver = {
-       .name = DRV_NAME,
-       .id_table = c2_pci_table,
-       .probe = c2_probe,
-       .remove = c2_remove,
-};
-
-module_pci_driver(c2_pci_driver);
diff --git a/drivers/infiniband/hw/amso1100/c2.h b/drivers/infiniband/hw/amso1100/c2.h
deleted file mode 100644 (file)
index d619d73..0000000
+++ /dev/null
@@ -1,547 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __C2_H
-#define __C2_H
-
-#include <linux/netdevice.h>
-#include <linux/spinlock.h>
-#include <linux/kernel.h>
-#include <linux/pci.h>
-#include <linux/dma-mapping.h>
-#include <linux/idr.h>
-
-#include "c2_provider.h"
-#include "c2_mq.h"
-#include "c2_status.h"
-
-#define DRV_NAME     "c2"
-#define DRV_VERSION  "1.1"
-#define PFX          DRV_NAME ": "
-
-#define BAR_0                0
-#define BAR_2                2
-#define BAR_4                4
-
-#define RX_BUF_SIZE         (1536 + 8)
-#define ETH_JUMBO_MTU        9000
-#define C2_MAGIC            "CEPHEUS"
-#define C2_VERSION           4
-#define C2_IVN              (18 & 0x7fffffff)
-
-#define C2_REG0_SIZE        (16 * 1024)
-#define C2_REG2_SIZE        (2 * 1024 * 1024)
-#define C2_REG4_SIZE        (256 * 1024 * 1024)
-#define C2_NUM_TX_DESC       341
-#define C2_NUM_RX_DESC       256
-#define C2_PCI_REGS_OFFSET  (0x10000)
-#define C2_RXP_HRXDQ_OFFSET (((C2_REG4_SIZE)/2))
-#define C2_RXP_HRXDQ_SIZE   (4096)
-#define C2_TXP_HTXDQ_OFFSET (((C2_REG4_SIZE)/2) + C2_RXP_HRXDQ_SIZE)
-#define C2_TXP_HTXDQ_SIZE   (4096)
-#define C2_TX_TIMEOUT      (6*HZ)
-
-/* CEPHEUS */
-static const u8 c2_magic[] = {
-       0x43, 0x45, 0x50, 0x48, 0x45, 0x55, 0x53
-};
-
-enum adapter_pci_regs {
-       C2_REGS_MAGIC = 0x0000,
-       C2_REGS_VERS = 0x0008,
-       C2_REGS_IVN = 0x000C,
-       C2_REGS_PCI_WINSIZE = 0x0010,
-       C2_REGS_Q0_QSIZE = 0x0014,
-       C2_REGS_Q0_MSGSIZE = 0x0018,
-       C2_REGS_Q0_POOLSTART = 0x001C,
-       C2_REGS_Q0_SHARED = 0x0020,
-       C2_REGS_Q1_QSIZE = 0x0024,
-       C2_REGS_Q1_MSGSIZE = 0x0028,
-       C2_REGS_Q1_SHARED = 0x0030,
-       C2_REGS_Q2_QSIZE = 0x0034,
-       C2_REGS_Q2_MSGSIZE = 0x0038,
-       C2_REGS_Q2_SHARED = 0x0040,
-       C2_REGS_ENADDR = 0x004C,
-       C2_REGS_RDMA_ENADDR = 0x0054,
-       C2_REGS_HRX_CUR = 0x006C,
-};
-
-struct c2_adapter_pci_regs {
-       char reg_magic[8];
-       u32 version;
-       u32 ivn;
-       u32 pci_window_size;
-       u32 q0_q_size;
-       u32 q0_msg_size;
-       u32 q0_pool_start;
-       u32 q0_shared;
-       u32 q1_q_size;
-       u32 q1_msg_size;
-       u32 q1_pool_start;
-       u32 q1_shared;
-       u32 q2_q_size;
-       u32 q2_msg_size;
-       u32 q2_pool_start;
-       u32 q2_shared;
-       u32 log_start;
-       u32 log_size;
-       u8 host_enaddr[8];
-       u8 rdma_enaddr[8];
-       u32 crash_entry;
-       u32 crash_ready[2];
-       u32 fw_txd_cur;
-       u32 fw_hrxd_cur;
-       u32 fw_rxd_cur;
-};
-
-enum pci_regs {
-       C2_HISR = 0x0000,
-       C2_DISR = 0x0004,
-       C2_HIMR = 0x0008,
-       C2_DIMR = 0x000C,
-       C2_NISR0 = 0x0010,
-       C2_NISR1 = 0x0014,
-       C2_NIMR0 = 0x0018,
-       C2_NIMR1 = 0x001C,
-       C2_IDIS = 0x0020,
-};
-
-enum {
-       C2_PCI_HRX_INT = 1 << 8,
-       C2_PCI_HTX_INT = 1 << 17,
-       C2_PCI_HRX_QUI = 1 << 31,
-};
-
-/*
- * Cepheus registers in BAR0.
- */
-struct c2_pci_regs {
-       u32 hostisr;
-       u32 dmaisr;
-       u32 hostimr;
-       u32 dmaimr;
-       u32 netisr0;
-       u32 netisr1;
-       u32 netimr0;
-       u32 netimr1;
-       u32 int_disable;
-};
-
-/* TXP flags */
-enum c2_txp_flags {
-       TXP_HTXD_DONE = 0,
-       TXP_HTXD_READY = 1 << 0,
-       TXP_HTXD_UNINIT = 1 << 1,
-};
-
-/* RXP flags */
-enum c2_rxp_flags {
-       RXP_HRXD_UNINIT = 0,
-       RXP_HRXD_READY = 1 << 0,
-       RXP_HRXD_DONE = 1 << 1,
-};
-
-/* RXP status */
-enum c2_rxp_status {
-       RXP_HRXD_ZERO = 0,
-       RXP_HRXD_OK = 1 << 0,
-       RXP_HRXD_BUF_OV = 1 << 1,
-};
-
-/* TXP descriptor fields */
-enum txp_desc {
-       C2_TXP_FLAGS = 0x0000,
-       C2_TXP_LEN = 0x0002,
-       C2_TXP_ADDR = 0x0004,
-};
-
-/* RXP descriptor fields */
-enum rxp_desc {
-       C2_RXP_FLAGS = 0x0000,
-       C2_RXP_STATUS = 0x0002,
-       C2_RXP_COUNT = 0x0004,
-       C2_RXP_LEN = 0x0006,
-       C2_RXP_ADDR = 0x0008,
-};
-
-struct c2_txp_desc {
-       u16 flags;
-       u16 len;
-       u64 addr;
-} __attribute__ ((packed));
-
-struct c2_rxp_desc {
-       u16 flags;
-       u16 status;
-       u16 count;
-       u16 len;
-       u64 addr;
-} __attribute__ ((packed));
-
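-/*
- * 8-byte header the adapter prepends to every received frame; the RX
- * path skips past it before handing the sk_buff to netif_rx() (see
- * c2_rx_interrupt() in c2.c).
- */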
-struct c2_rxp_hdr {
-       u16 flags;
-       u16 status;
-       u16 len;
-       u16 rsvd;
-} __attribute__ ((packed));
-
-struct c2_tx_desc {
-       u32 len;
-       u32 status;
-       dma_addr_t next_offset;
-};
-
-struct c2_rx_desc {
-       u32 len;
-       u32 status;
-       dma_addr_t next_offset;
-};
-
-struct c2_alloc {
-       u32 last;
-       u32 max;
-       spinlock_t lock;
-       unsigned long *table;
-};
-
-struct c2_array {
-       struct {
-               void **page;
-               int used;
-       } *page_list;
-};
-
-/*
- * The MQ shared pointer pool is organized as a linked list of
- * chunks. Each chunk contains a linked list of free shared pointers
- * that can be allocated to a given user mode client.
- *
- */
-struct sp_chunk {
-       struct sp_chunk *next;
-       dma_addr_t dma_addr;
-       DEFINE_DMA_UNMAP_ADDR(mapping);
-       u16 head;
-       u16 shared_ptr[0];
-};
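-
-/*
- * Within a chunk, shared_ptr[] doubles as the free list: each entry
- * holds the index of the next free slot, "head" is the index of the
- * first free slot, and 0xFFFF terminates the list (see
- * c2_alloc_mqsp_chunk() and c2_alloc_mqsp() in c2_alloc.c).
- */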
-
-struct c2_pd_table {
-       u32 last;
-       u32 max;
-       spinlock_t lock;
-       unsigned long *table;
-};
-
-struct c2_qp_table {
-       struct idr idr;
-       spinlock_t lock;
-};
-
-struct c2_element {
-       struct c2_element *next;
-       void *ht_desc;          /* host     descriptor */
-       void __iomem *hw_desc;  /* hardware descriptor */
-       struct sk_buff *skb;
-       dma_addr_t mapaddr;
-       u32 maplen;
-};
-
-struct c2_ring {
-       struct c2_element *to_clean;
-       struct c2_element *to_use;
-       struct c2_element *start;
-       unsigned long count;
-};
-
-struct c2_dev {
-       struct ib_device ibdev;
-       void __iomem *regs;
-       void __iomem *mmio_txp_ring; /* remapped adapter memory for hw rings */
-       void __iomem *mmio_rxp_ring;
-       spinlock_t lock;
-       struct pci_dev *pcidev;
-       struct net_device *netdev;
-       struct net_device *pseudo_netdev;
-       unsigned int cur_tx;
-       unsigned int cur_rx;
-       u32 adapter_handle;
-       int device_cap_flags;
-       void __iomem *kva;      /* KVA device memory */
-       unsigned long pa;       /* PA device memory */
-       void **qptr_array;
-
-       struct kmem_cache *host_msg_cache;
-
-       struct list_head cca_link;              /* adapter list */
-       struct list_head eh_wakeup_list;        /* event wakeup list */
-       wait_queue_head_t req_vq_wo;
-
-       /* Cached RNIC properties */
-       struct ib_device_attr props;
-
-       struct c2_pd_table pd_table;
-       struct c2_qp_table qp_table;
-       int ports;              /* num of GigE ports */
-       int devnum;
-       spinlock_t vqlock;      /* sync vbs req MQ */
-
-       /* Verbs Queues */
-       struct c2_mq req_vq;    /* Verbs Request MQ */
-       struct c2_mq rep_vq;    /* Verbs Reply MQ */
-       struct c2_mq aeq;       /* Async Events MQ */
-
-       /* Kernel client MQs */
-       struct sp_chunk *kern_mqsp_pool;
-
-       /* Device updates these values when posting messages to a host
-        * target queue */
-       u16 req_vq_shared;
-       u16 rep_vq_shared;
-       u16 aeq_shared;
-       u16 irq_claimed;
-
-       /*
-        * Shared host target pages for user-accessible MQs.
-        */
-       int hthead;             /* index of first free entry */
-       void *htpages;          /* kernel vaddr */
-       int htlen;              /* length of htpages memory */
-       void *htuva;            /* user mapped vaddr */
-       spinlock_t htlock;      /* serialize allocation */
-
-       u64 adapter_hint_uva;   /* access to the activity FIFO */
-
-       //      spinlock_t aeq_lock;
-       //      spinlock_t rnic_lock;
-
-       __be16 *hint_count;
-       dma_addr_t hint_count_dma;
-       u16 hints_read;
-
-       int init;               /* TRUE if it's ready */
-       char ae_cache_name[16];
-       char vq_cache_name[16];
-};
-
-struct c2_port {
-       u32 msg_enable;
-       struct c2_dev *c2dev;
-       struct net_device *netdev;
-
-       spinlock_t tx_lock;
-       u32 tx_avail;
-       struct c2_ring tx_ring;
-       struct c2_ring rx_ring;
-
-       void *mem;              /* PCI memory for host rings */
-       dma_addr_t dma;
-       unsigned long mem_size;
-
-       u32 rx_buf_size;
-};
-
-/*
- * Activity FIFO registers in BAR0.
- */
-#define PCI_BAR0_HOST_HINT     0x100
-#define PCI_BAR0_ADAPTER_HINT  0x2000
-
-/*
- * Completion queue (CQ) state flags.
- */
-#define CQ_ARMED       0x01
-#define CQ_WAIT_FOR_DMA        0x80
-
-/*
- * The format of a hint is as follows:
- * Lower 16 bits are the count of hints for the queue.
- * Next 15 bits are the qp_index.
- * Uppermost bit depends on who reads it:
- *    If read by producer, then it means Full (1) or Not-Full (0)
- *    If read by consumer, then it means Empty (1) or Not-Empty (0)
- */
-#define C2_HINT_MAKE(q_index, hint_count) (((q_index) << 16) | hint_count)
-#define C2_HINT_GET_INDEX(hint) (((hint) & 0x7FFF0000) >> 16)
-#define C2_HINT_GET_COUNT(hint) ((hint) & 0x0000FFFF)
-
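-/*
- * Illustrative example (values chosen arbitrarily, not taken from the
- * hardware documentation): C2_HINT_MAKE(5, 3) yields 0x00050003, and
- * C2_HINT_GET_INDEX(0x00050003) == 5, C2_HINT_GET_COUNT(0x00050003) == 3.
- */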
-
-/*
- * The following defines the offset in SDRAM for the c2_adapter_pci_regs_t
- * struct.
- */
-#define C2_ADAPTER_PCI_REGS_OFFSET 0x10000
-
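-/*
- * Fallbacks for platforms that lack native 64-bit MMIO accessors:
- * the 64-bit value is split into two 32-bit accesses, low word at
- * "addr" and high word at "addr + 4".
- */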
-#ifndef readq
-static inline u64 readq(const void __iomem * addr)
-{
-       u64 ret = readl(addr + 4);
-       ret <<= 32;
-       ret |= readl(addr);
-
-       return ret;
-}
-#endif
-
-#ifndef writeq
-static inline void __raw_writeq(u64 val, void __iomem * addr)
-{
-       __raw_writel((u32) (val), addr);
-       __raw_writel((u32) (val >> 32), (addr + 4));
-}
-#endif
-
-#define C2_SET_CUR_RX(c2dev, cur_rx) \
-       __raw_writel((__force u32) cpu_to_be32(cur_rx), c2dev->mmio_txp_ring + 4092)
-
-#define C2_GET_CUR_RX(c2dev) \
-       be32_to_cpu((__force __be32) readl(c2dev->mmio_txp_ring + 4092))
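-
-/*
- * Both macros operate on the last 32-bit word (offset 4092) of the
- * 4 KiB HTXDQ window, where the driver publishes the current RX index
- * for the adapter (see the quiesce handshake in c2_reset()).
- */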
-
-static inline struct c2_dev *to_c2dev(struct ib_device *ibdev)
-{
-       return container_of(ibdev, struct c2_dev, ibdev);
-}
-
-static inline int c2_errno(void *reply)
-{
-       switch (c2_wr_get_result(reply)) {
-       case C2_OK:
-               return 0;
-       case CCERR_NO_BUFS:
-       case CCERR_INSUFFICIENT_RESOURCES:
-       case CCERR_ZERO_RDMA_READ_RESOURCES:
-               return -ENOMEM;
-       case CCERR_MR_IN_USE:
-       case CCERR_QP_IN_USE:
-               return -EBUSY;
-       case CCERR_ADDR_IN_USE:
-               return -EADDRINUSE;
-       case CCERR_ADDR_NOT_AVAIL:
-               return -EADDRNOTAVAIL;
-       case CCERR_CONN_RESET:
-               return -ECONNRESET;
-       case CCERR_NOT_IMPLEMENTED:
-       case CCERR_INVALID_WQE:
-               return -ENOSYS;
-       case CCERR_QP_NOT_PRIVILEGED:
-               return -EPERM;
-       case CCERR_STACK_ERROR:
-               return -EPROTO;
-       case CCERR_ACCESS_VIOLATION:
-       case CCERR_BASE_AND_BOUNDS_VIOLATION:
-               return -EFAULT;
-       case CCERR_STAG_STATE_NOT_INVALID:
-       case CCERR_INVALID_ADDRESS:
-       case CCERR_INVALID_CQ:
-       case CCERR_INVALID_EP:
-       case CCERR_INVALID_MODIFIER:
-       case CCERR_INVALID_MTU:
-       case CCERR_INVALID_PD_ID:
-       case CCERR_INVALID_QP:
-       case CCERR_INVALID_RNIC:
-       case CCERR_INVALID_STAG:
-               return -EINVAL;
-       default:
-               return -EAGAIN;
-       }
-}
-
-/* Device */
-extern int c2_register_device(struct c2_dev *c2dev);
-extern void c2_unregister_device(struct c2_dev *c2dev);
-extern int c2_rnic_init(struct c2_dev *c2dev);
-extern void c2_rnic_term(struct c2_dev *c2dev);
-extern void c2_rnic_interrupt(struct c2_dev *c2dev);
-extern int c2_del_addr(struct c2_dev *c2dev, __be32 inaddr, __be32 inmask);
-extern int c2_add_addr(struct c2_dev *c2dev, __be32 inaddr, __be32 inmask);
-
-/* QPs */
-extern int c2_alloc_qp(struct c2_dev *c2dev, struct c2_pd *pd,
-                      struct ib_qp_init_attr *qp_attrs, struct c2_qp *qp);
-extern void c2_free_qp(struct c2_dev *c2dev, struct c2_qp *qp);
-extern struct ib_qp *c2_get_qp(struct ib_device *device, int qpn);
-extern int c2_qp_modify(struct c2_dev *c2dev, struct c2_qp *qp,
-                       struct ib_qp_attr *attr, int attr_mask);
-extern int c2_qp_set_read_limits(struct c2_dev *c2dev, struct c2_qp *qp,
-                                int ord, int ird);
-extern int c2_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr,
-                       struct ib_send_wr **bad_wr);
-extern int c2_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *ib_wr,
-                          struct ib_recv_wr **bad_wr);
-extern void c2_init_qp_table(struct c2_dev *c2dev);
-extern void c2_cleanup_qp_table(struct c2_dev *c2dev);
-extern void c2_set_qp_state(struct c2_qp *, int);
-extern struct c2_qp *c2_find_qpn(struct c2_dev *c2dev, int qpn);
-
-/* PDs */
-extern int c2_pd_alloc(struct c2_dev *c2dev, int privileged, struct c2_pd *pd);
-extern void c2_pd_free(struct c2_dev *c2dev, struct c2_pd *pd);
-extern int c2_init_pd_table(struct c2_dev *c2dev);
-extern void c2_cleanup_pd_table(struct c2_dev *c2dev);
-
-/* CQs */
-extern int c2_init_cq(struct c2_dev *c2dev, int entries,
-                     struct c2_ucontext *ctx, struct c2_cq *cq);
-extern void c2_free_cq(struct c2_dev *c2dev, struct c2_cq *cq);
-extern void c2_cq_event(struct c2_dev *c2dev, u32 mq_index);
-extern void c2_cq_clean(struct c2_dev *c2dev, struct c2_qp *qp, u32 mq_index);
-extern int c2_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry);
-extern int c2_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags);
-
-/* CM */
-extern int c2_llp_connect(struct iw_cm_id *cm_id,
-                         struct iw_cm_conn_param *iw_param);
-extern int c2_llp_accept(struct iw_cm_id *cm_id,
-                        struct iw_cm_conn_param *iw_param);
-extern int c2_llp_reject(struct iw_cm_id *cm_id, const void *pdata,
-                        u8 pdata_len);
-extern int c2_llp_service_create(struct iw_cm_id *cm_id, int backlog);
-extern int c2_llp_service_destroy(struct iw_cm_id *cm_id);
-
-/* MM */
-extern int c2_nsmr_register_phys_kern(struct c2_dev *c2dev, u64 *addr_list,
-                                     int page_size, int pbl_depth, u32 length,
-                                     u32 off, u64 *va, enum c2_acf acf,
-                                     struct c2_mr *mr);
-extern int c2_stag_dealloc(struct c2_dev *c2dev, u32 stag_index);
-
-/* AE */
-extern void c2_ae_event(struct c2_dev *c2dev, u32 mq_index);
-
-/* MQSP Allocator */
-extern int c2_init_mqsp_pool(struct c2_dev *c2dev, gfp_t gfp_mask,
-                            struct sp_chunk **root);
-extern void c2_free_mqsp_pool(struct c2_dev *c2dev, struct sp_chunk *root);
-extern __be16 *c2_alloc_mqsp(struct c2_dev *c2dev, struct sp_chunk *head,
-                            dma_addr_t *dma_addr, gfp_t gfp_mask);
-extern void c2_free_mqsp(__be16* mqsp);
-#endif
diff --git a/drivers/infiniband/hw/amso1100/c2_ae.c b/drivers/infiniband/hw/amso1100/c2_ae.c
deleted file mode 100644 (file)
index cedda25..0000000
+++ /dev/null
@@ -1,327 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "c2.h"
-#include <rdma/iw_cm.h>
-#include "c2_status.h"
-#include "c2_ae.h"
-
-static int c2_convert_cm_status(u32 c2_status)
-{
-       switch (c2_status) {
-       case C2_CONN_STATUS_SUCCESS:
-               return 0;
-       case C2_CONN_STATUS_REJECTED:
-               return -ENETRESET;
-       case C2_CONN_STATUS_REFUSED:
-               return -ECONNREFUSED;
-       case C2_CONN_STATUS_TIMEDOUT:
-               return -ETIMEDOUT;
-       case C2_CONN_STATUS_NETUNREACH:
-               return -ENETUNREACH;
-       case C2_CONN_STATUS_HOSTUNREACH:
-               return -EHOSTUNREACH;
-       case C2_CONN_STATUS_INVALID_RNIC:
-               return -EINVAL;
-       case C2_CONN_STATUS_INVALID_QP:
-               return -EINVAL;
-       case C2_CONN_STATUS_INVALID_QP_STATE:
-               return -EINVAL;
-       case C2_CONN_STATUS_ADDR_NOT_AVAIL:
-               return -EADDRNOTAVAIL;
-       default:
-               printk(KERN_ERR PFX
-                      "%s - Unable to convert CM status: %d\n",
-                      __func__, c2_status);
-               return -EIO;
-       }
-}
-
-static const char* to_event_str(int event)
-{
-       static const char* event_str[] = {
-               "CCAE_REMOTE_SHUTDOWN",
-               "CCAE_ACTIVE_CONNECT_RESULTS",
-               "CCAE_CONNECTION_REQUEST",
-               "CCAE_LLP_CLOSE_COMPLETE",
-               "CCAE_TERMINATE_MESSAGE_RECEIVED",
-               "CCAE_LLP_CONNECTION_RESET",
-               "CCAE_LLP_CONNECTION_LOST",
-               "CCAE_LLP_SEGMENT_SIZE_INVALID",
-               "CCAE_LLP_INVALID_CRC",
-               "CCAE_LLP_BAD_FPDU",
-               "CCAE_INVALID_DDP_VERSION",
-               "CCAE_INVALID_RDMA_VERSION",
-               "CCAE_UNEXPECTED_OPCODE",
-               "CCAE_INVALID_DDP_QUEUE_NUMBER",
-               "CCAE_RDMA_READ_NOT_ENABLED",
-               "CCAE_RDMA_WRITE_NOT_ENABLED",
-               "CCAE_RDMA_READ_TOO_SMALL",
-               "CCAE_NO_L_BIT",
-               "CCAE_TAGGED_INVALID_STAG",
-               "CCAE_TAGGED_BASE_BOUNDS_VIOLATION",
-               "CCAE_TAGGED_ACCESS_RIGHTS_VIOLATION",
-               "CCAE_TAGGED_INVALID_PD",
-               "CCAE_WRAP_ERROR",
-               "CCAE_BAD_CLOSE",
-               "CCAE_BAD_LLP_CLOSE",
-               "CCAE_INVALID_MSN_RANGE",
-               "CCAE_INVALID_MSN_GAP",
-               "CCAE_IRRQ_OVERFLOW",
-               "CCAE_IRRQ_MSN_GAP",
-               "CCAE_IRRQ_MSN_RANGE",
-               "CCAE_IRRQ_INVALID_STAG",
-               "CCAE_IRRQ_BASE_BOUNDS_VIOLATION",
-               "CCAE_IRRQ_ACCESS_RIGHTS_VIOLATION",
-               "CCAE_IRRQ_INVALID_PD",
-               "CCAE_IRRQ_WRAP_ERROR",
-               "CCAE_CQ_SQ_COMPLETION_OVERFLOW",
-               "CCAE_CQ_RQ_COMPLETION_ERROR",
-               "CCAE_QP_SRQ_WQE_ERROR",
-               "CCAE_QP_LOCAL_CATASTROPHIC_ERROR",
-               "CCAE_CQ_OVERFLOW",
-               "CCAE_CQ_OPERATION_ERROR",
-               "CCAE_SRQ_LIMIT_REACHED",
-               "CCAE_QP_RQ_LIMIT_REACHED",
-               "CCAE_SRQ_CATASTROPHIC_ERROR",
-               "CCAE_RNIC_CATASTROPHIC_ERROR"
-       };
-
-       if (event < CCAE_REMOTE_SHUTDOWN ||
-           event > CCAE_RNIC_CATASTROPHIC_ERROR)
-               return "<invalid event>";
-
-       event -= CCAE_REMOTE_SHUTDOWN;
-       return event_str[event];
-}
-
-static const char *to_qp_state_str(int state)
-{
-       switch (state) {
-       case C2_QP_STATE_IDLE:
-               return "C2_QP_STATE_IDLE";
-       case C2_QP_STATE_CONNECTING:
-               return "C2_QP_STATE_CONNECTING";
-       case C2_QP_STATE_RTS:
-               return "C2_QP_STATE_RTS";
-       case C2_QP_STATE_CLOSING:
-               return "C2_QP_STATE_CLOSING";
-       case C2_QP_STATE_TERMINATE:
-               return "C2_QP_STATE_TERMINATE";
-       case C2_QP_STATE_ERROR:
-               return "C2_QP_STATE_ERROR";
-       default:
-               return "<invalid QP state>";
-       }
-}
-
-void c2_ae_event(struct c2_dev *c2dev, u32 mq_index)
-{
-       struct c2_mq *mq = c2dev->qptr_array[mq_index];
-       union c2wr *wr;
-       void *resource_user_context;
-       struct iw_cm_event cm_event;
-       struct ib_event ib_event;
-       enum c2_resource_indicator resource_indicator;
-       enum c2_event_id event_id;
-       unsigned long flags;
-       int status;
-       struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_event.local_addr;
-       struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_event.remote_addr;
-
-       /*
-        * retrieve the message
-        */
-       wr = c2_mq_consume(mq);
-       if (!wr)
-               return;
-
-       memset(&ib_event, 0, sizeof(ib_event));
-       memset(&cm_event, 0, sizeof(cm_event));
-
-       event_id = c2_wr_get_id(wr);
-       resource_indicator = be32_to_cpu(wr->ae.ae_generic.resource_type);
-       resource_user_context =
-           (void *) (unsigned long) wr->ae.ae_generic.user_context;
-
-       status = cm_event.status = c2_convert_cm_status(c2_wr_get_result(wr));
-
-       pr_debug("event received c2_dev=%p, event_id=%d, "
-               "resource_indicator=%d, user_context=%p, status = %d\n",
-               c2dev, event_id, resource_indicator, resource_user_context,
-               status);
-
-       switch (resource_indicator) {
-       case C2_RES_IND_QP:{
-
-               struct c2_qp *qp = (struct c2_qp *)resource_user_context;
-               struct iw_cm_id *cm_id = qp->cm_id;
-               struct c2wr_ae_active_connect_results *res;
-
-               if (!cm_id) {
-                       pr_debug("event received, but cm_id is <nul>, qp=%p!\n",
-                               qp);
-                       goto ignore_it;
-               }
-               pr_debug("%s: event = %s, user_context=%llx, "
-                       "resource_type=%x, "
-                       "resource=%x, qp_state=%s\n",
-                       __func__,
-                       to_event_str(event_id),
-                       (unsigned long long) wr->ae.ae_generic.user_context,
-                       be32_to_cpu(wr->ae.ae_generic.resource_type),
-                       be32_to_cpu(wr->ae.ae_generic.resource),
-                       to_qp_state_str(be32_to_cpu(wr->ae.ae_generic.qp_state)));
-
-               c2_set_qp_state(qp, be32_to_cpu(wr->ae.ae_generic.qp_state));
-
-               switch (event_id) {
-               case CCAE_ACTIVE_CONNECT_RESULTS:
-                       res = &wr->ae.ae_active_connect_results;
-                       cm_event.event = IW_CM_EVENT_CONNECT_REPLY;
-                       laddr->sin_addr.s_addr = res->laddr;
-                       raddr->sin_addr.s_addr = res->raddr;
-                       laddr->sin_port = res->lport;
-                       raddr->sin_port = res->rport;
-                       if (status == 0) {
-                               cm_event.private_data_len =
-                                       be32_to_cpu(res->private_data_length);
-                               cm_event.private_data = res->private_data;
-                       } else {
-                               spin_lock_irqsave(&qp->lock, flags);
-                               if (qp->cm_id) {
-                                       qp->cm_id->rem_ref(qp->cm_id);
-                                       qp->cm_id = NULL;
-                               }
-                               spin_unlock_irqrestore(&qp->lock, flags);
-                               cm_event.private_data_len = 0;
-                               cm_event.private_data = NULL;
-                       }
-                       if (cm_id->event_handler)
-                               cm_id->event_handler(cm_id, &cm_event);
-                       break;
-               case CCAE_TERMINATE_MESSAGE_RECEIVED:
-               case CCAE_CQ_SQ_COMPLETION_OVERFLOW:
-                       ib_event.device = &c2dev->ibdev;
-                       ib_event.element.qp = &qp->ibqp;
-                       ib_event.event = IB_EVENT_QP_REQ_ERR;
-
-                       if (qp->ibqp.event_handler)
-                               qp->ibqp.event_handler(&ib_event,
-                                                      qp->ibqp.
-                                                      qp_context);
-                       break;
-               case CCAE_BAD_CLOSE:
-               case CCAE_LLP_CLOSE_COMPLETE:
-               case CCAE_LLP_CONNECTION_RESET:
-               case CCAE_LLP_CONNECTION_LOST:
-                       BUG_ON(cm_id->event_handler==(void*)0x6b6b6b6b);
-
-                       spin_lock_irqsave(&qp->lock, flags);
-                       if (qp->cm_id) {
-                               qp->cm_id->rem_ref(qp->cm_id);
-                               qp->cm_id = NULL;
-                       }
-                       spin_unlock_irqrestore(&qp->lock, flags);
-                       cm_event.event = IW_CM_EVENT_CLOSE;
-                       cm_event.status = 0;
-                       if (cm_id->event_handler)
-                               cm_id->event_handler(cm_id, &cm_event);
-                       break;
-               default:
-                       BUG_ON(1);
-                       pr_debug("%s:%d Unexpected event_id=%d on QP=%p, "
-                               "CM_ID=%p\n",
-                               __func__, __LINE__,
-                               event_id, qp, cm_id);
-                       break;
-               }
-               break;
-       }
-
-       case C2_RES_IND_EP:{
-
-               struct c2wr_ae_connection_request *req =
-                       &wr->ae.ae_connection_request;
-               struct iw_cm_id *cm_id =
-                       (struct iw_cm_id *)resource_user_context;
-
-               pr_debug("C2_RES_IND_EP event_id=%d\n", event_id);
-               if (event_id != CCAE_CONNECTION_REQUEST) {
-                       pr_debug("%s: Invalid event_id: %d\n",
-                               __func__, event_id);
-                       break;
-               }
-               cm_event.event = IW_CM_EVENT_CONNECT_REQUEST;
-               cm_event.provider_data = (void*)(unsigned long)req->cr_handle;
-               laddr->sin_addr.s_addr = req->laddr;
-               raddr->sin_addr.s_addr = req->raddr;
-               laddr->sin_port = req->lport;
-               raddr->sin_port = req->rport;
-               cm_event.private_data_len =
-                       be32_to_cpu(req->private_data_length);
-               cm_event.private_data = req->private_data;
-               /*
-                * Until ird/ord negotiation via MPAv2 support is added, send
-                * max supported values
-                */
-               cm_event.ird = cm_event.ord = 128;
-
-               if (cm_id->event_handler)
-                       cm_id->event_handler(cm_id, &cm_event);
-               break;
-       }
-
-       case C2_RES_IND_CQ:{
-               struct c2_cq *cq =
-                   (struct c2_cq *) resource_user_context;
-
-               pr_debug("IB_EVENT_CQ_ERR\n");
-               ib_event.device = &c2dev->ibdev;
-               ib_event.element.cq = &cq->ibcq;
-               ib_event.event = IB_EVENT_CQ_ERR;
-
-               if (cq->ibcq.event_handler)
-                       cq->ibcq.event_handler(&ib_event,
-                                              cq->ibcq.cq_context);
-               break;
-       }
-
-       default:
-               printk(KERN_ERR PFX "Bad resource indicator = %d\n",
-                      resource_indicator);
-               break;
-       }
-
- ignore_it:
-       c2_mq_free(mq);
-}
diff --git a/drivers/infiniband/hw/amso1100/c2_ae.h b/drivers/infiniband/hw/amso1100/c2_ae.h
deleted file mode 100644 (file)
index 3a065c3..0000000
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef _C2_AE_H_
-#define _C2_AE_H_
-
-/*
- * WARNING: If you change this file, also bump C2_IVN_BASE
- * in common/include/clustercore/c2_ivn.h.
- */
-
-/*
- * Asynchronous Event Identifiers
- *
- * These start at 0x80 only so it's obvious from inspection that
- * they are not work-request statuses.  This isn't critical.
- *
- * NOTE: these event id's must fit in eight bits.
- */
-enum c2_event_id {
-       CCAE_REMOTE_SHUTDOWN = 0x80,
-       CCAE_ACTIVE_CONNECT_RESULTS,
-       CCAE_CONNECTION_REQUEST,
-       CCAE_LLP_CLOSE_COMPLETE,
-       CCAE_TERMINATE_MESSAGE_RECEIVED,
-       CCAE_LLP_CONNECTION_RESET,
-       CCAE_LLP_CONNECTION_LOST,
-       CCAE_LLP_SEGMENT_SIZE_INVALID,
-       CCAE_LLP_INVALID_CRC,
-       CCAE_LLP_BAD_FPDU,
-       CCAE_INVALID_DDP_VERSION,
-       CCAE_INVALID_RDMA_VERSION,
-       CCAE_UNEXPECTED_OPCODE,
-       CCAE_INVALID_DDP_QUEUE_NUMBER,
-       CCAE_RDMA_READ_NOT_ENABLED,
-       CCAE_RDMA_WRITE_NOT_ENABLED,
-       CCAE_RDMA_READ_TOO_SMALL,
-       CCAE_NO_L_BIT,
-       CCAE_TAGGED_INVALID_STAG,
-       CCAE_TAGGED_BASE_BOUNDS_VIOLATION,
-       CCAE_TAGGED_ACCESS_RIGHTS_VIOLATION,
-       CCAE_TAGGED_INVALID_PD,
-       CCAE_WRAP_ERROR,
-       CCAE_BAD_CLOSE,
-       CCAE_BAD_LLP_CLOSE,
-       CCAE_INVALID_MSN_RANGE,
-       CCAE_INVALID_MSN_GAP,
-       CCAE_IRRQ_OVERFLOW,
-       CCAE_IRRQ_MSN_GAP,
-       CCAE_IRRQ_MSN_RANGE,
-       CCAE_IRRQ_INVALID_STAG,
-       CCAE_IRRQ_BASE_BOUNDS_VIOLATION,
-       CCAE_IRRQ_ACCESS_RIGHTS_VIOLATION,
-       CCAE_IRRQ_INVALID_PD,
-       CCAE_IRRQ_WRAP_ERROR,
-       CCAE_CQ_SQ_COMPLETION_OVERFLOW,
-       CCAE_CQ_RQ_COMPLETION_ERROR,
-       CCAE_QP_SRQ_WQE_ERROR,
-       CCAE_QP_LOCAL_CATASTROPHIC_ERROR,
-       CCAE_CQ_OVERFLOW,
-       CCAE_CQ_OPERATION_ERROR,
-       CCAE_SRQ_LIMIT_REACHED,
-       CCAE_QP_RQ_LIMIT_REACHED,
-       CCAE_SRQ_CATASTROPHIC_ERROR,
-       CCAE_RNIC_CATASTROPHIC_ERROR
-/* WARNING If you add more id's, make sure their values fit in eight bits. */
-};
-
-/*
- * Resource Indicators and Identifiers
- */
-enum c2_resource_indicator {
-       C2_RES_IND_QP = 1,
-       C2_RES_IND_EP,
-       C2_RES_IND_CQ,
-       C2_RES_IND_SRQ,
-};
-
-#endif /* _C2_AE_H_ */
diff --git a/drivers/infiniband/hw/amso1100/c2_alloc.c b/drivers/infiniband/hw/amso1100/c2_alloc.c
deleted file mode 100644 (file)
index 78d247e..0000000
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
- * Copyright (c) 2004 Topspin Communications.  All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/errno.h>
-#include <linux/bitmap.h>
-
-#include "c2.h"
-
-static int c2_alloc_mqsp_chunk(struct c2_dev *c2dev, gfp_t gfp_mask,
-                              struct sp_chunk **head)
-{
-       int i;
-       struct sp_chunk *new_head;
-       dma_addr_t dma_addr;
-
-       new_head = dma_alloc_coherent(&c2dev->pcidev->dev, PAGE_SIZE,
-                                     &dma_addr, gfp_mask);
-       if (new_head == NULL)
-               return -ENOMEM;
-
-       new_head->dma_addr = dma_addr;
-       dma_unmap_addr_set(new_head, mapping, new_head->dma_addr);
-
-       new_head->next = NULL;
-       new_head->head = 0;
-
-       /* build list where each index is the next free slot */
-       for (i = 0;
-            i < (PAGE_SIZE - sizeof(struct sp_chunk) -
-                 sizeof(u16)) / sizeof(u16) - 1;
-            i++) {
-               new_head->shared_ptr[i] = i + 1;
-       }
-       /* terminate list */
-       new_head->shared_ptr[i] = 0xFFFF;
-
-       *head = new_head;
-       return 0;
-}
-
-int c2_init_mqsp_pool(struct c2_dev *c2dev, gfp_t gfp_mask,
-                     struct sp_chunk **root)
-{
-       return c2_alloc_mqsp_chunk(c2dev, gfp_mask, root);
-}
-
-void c2_free_mqsp_pool(struct c2_dev *c2dev, struct sp_chunk *root)
-{
-       struct sp_chunk *next;
-
-       while (root) {
-               next = root->next;
-               dma_free_coherent(&c2dev->pcidev->dev, PAGE_SIZE, root,
-                                 dma_unmap_addr(root, mapping));
-               root = next;
-       }
-}
-
-__be16 *c2_alloc_mqsp(struct c2_dev *c2dev, struct sp_chunk *head,
-                     dma_addr_t *dma_addr, gfp_t gfp_mask)
-{
-       u16 mqsp;
-
-       while (head) {
-               mqsp = head->head;
-               if (mqsp != 0xFFFF) {
-                       head->head = head->shared_ptr[mqsp];
-                       break;
-               } else if (head->next == NULL) {
-                       if (c2_alloc_mqsp_chunk(c2dev, gfp_mask, &head->next) ==
-                           0) {
-                               head = head->next;
-                               mqsp = head->head;
-                               head->head = head->shared_ptr[mqsp];
-                               break;
-                       } else
-                               return NULL;
-               } else
-                       head = head->next;
-       }
-       if (head) {
-               *dma_addr = head->dma_addr +
-                           ((unsigned long) &(head->shared_ptr[mqsp]) -
-                            (unsigned long) head);
-               pr_debug("%s addr %p dma_addr %llx\n", __func__,
-                        &(head->shared_ptr[mqsp]), (unsigned long long) *dma_addr);
-               return (__force __be16 *) &(head->shared_ptr[mqsp]);
-       }
-       return NULL;
-}
-
-void c2_free_mqsp(__be16 *mqsp)
-{
-       struct sp_chunk *head;
-       u16 idx;
-
-       /* The chunk containing this ptr begins at the page boundary */
-       head = (struct sp_chunk *) ((unsigned long) mqsp & PAGE_MASK);
-
-       /* Link head to new mqsp */
-       *mqsp = (__force __be16) head->head;
-
-       /* Compute the shared_ptr index */
-       idx = ((unsigned long) mqsp & ~PAGE_MASK) >> 1;
-       idx -= (unsigned long) &(((struct sp_chunk *) 0)->shared_ptr[0]) >> 1;
-
-       /* Point this index at the head */
-       head->shared_ptr[idx] = head->head;
-
-       /* Point head at this index */
-       head->head = idx;
-}
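
The c2_alloc.c file removed above implements the shared-pointer (mqsp) pool as an index-linked free list inside each DMA-coherent page: sp_chunk->head holds the first free slot, every free slot in shared_ptr[] stores the index of the next free one, and 0xFFFF terminates the chain. A minimal user-space sketch of that bookkeeping follows; struct pool, pool_alloc, pool_free and NSLOTS are illustrative assumptions, not the driver's API.

/*
 * Sketch of an index-linked free list over a fixed array, the same
 * bookkeeping c2_alloc_mqsp()/c2_free_mqsp() apply to a DMA page.
 */
#include <stdio.h>
#include <stdint.h>

#define NSLOTS   8
#define END_MARK 0xFFFF

struct pool {
	uint16_t head;           /* index of the first free slot */
	uint16_t next[NSLOTS];   /* next[i] = index of the next free slot */
};

static void pool_init(struct pool *p)
{
	int i;

	for (i = 0; i < NSLOTS - 1; i++)
		p->next[i] = i + 1;          /* each slot points at the following one */
	p->next[NSLOTS - 1] = END_MARK;      /* terminate the chain */
	p->head = 0;
}

static int pool_alloc(struct pool *p)
{
	uint16_t idx = p->head;

	if (idx == END_MARK)
		return -1;                   /* pool exhausted */
	p->head = p->next[idx];              /* unlink the slot from the free list */
	return idx;
}

static void pool_free(struct pool *p, uint16_t idx)
{
	p->next[idx] = p->head;              /* link the slot back in at the head */
	p->head = idx;
}

int main(void)
{
	struct pool p;
	int a, b;

	pool_init(&p);
	a = pool_alloc(&p);
	b = pool_alloc(&p);
	printf("allocated slots %d and %d\n", a, b);
	pool_free(&p, a);
	printf("next allocation reuses slot %d\n", pool_alloc(&p));
	return 0;
}

Keeping the links as array indices rather than pointers is what lets the driver hand back a DMA address for each slot by simple offset arithmetic from the page base.
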
diff --git a/drivers/infiniband/hw/amso1100/c2_cm.c b/drivers/infiniband/hw/amso1100/c2_cm.c
deleted file mode 100644 (file)
index 23bfa94..0000000
+++ /dev/null
@@ -1,461 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc.  All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/slab.h>
-
-#include "c2.h"
-#include "c2_wr.h"
-#include "c2_vq.h"
-#include <rdma/iw_cm.h>
-
-int c2_llp_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
-{
-       struct c2_dev *c2dev = to_c2dev(cm_id->device);
-       struct ib_qp *ibqp;
-       struct c2_qp *qp;
-       struct c2wr_qp_connect_req *wr; /* variable size needs a malloc. */
-       struct c2_vq_req *vq_req;
-       int err;
-       struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr;
-
-       if (cm_id->remote_addr.ss_family != AF_INET)
-               return -ENOSYS;
-
-       ibqp = c2_get_qp(cm_id->device, iw_param->qpn);
-       if (!ibqp)
-               return -EINVAL;
-       qp = to_c2qp(ibqp);
-
-       /* Associate QP <--> CM_ID */
-       cm_id->provider_data = qp;
-       cm_id->add_ref(cm_id);
-       qp->cm_id = cm_id;
-
-       /*
-        * only support the max private_data length
-        */
-       if (iw_param->private_data_len > C2_MAX_PRIVATE_DATA_SIZE) {
-               err = -EINVAL;
-               goto bail0;
-       }
-       /*
-        * Set the rdma read limits
-        */
-       err = c2_qp_set_read_limits(c2dev, qp, iw_param->ord, iw_param->ird);
-       if (err)
-               goto bail0;
-
-       /*
-        * Create and send a WR_QP_CONNECT...
-        */
-       wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
-       if (!wr) {
-               err = -ENOMEM;
-               goto bail0;
-       }
-
-       vq_req = vq_req_alloc(c2dev);
-       if (!vq_req) {
-               err = -ENOMEM;
-               goto bail1;
-       }
-
-       c2_wr_set_id(wr, CCWR_QP_CONNECT);
-       wr->hdr.context = 0;
-       wr->rnic_handle = c2dev->adapter_handle;
-       wr->qp_handle = qp->adapter_handle;
-
-       wr->remote_addr = raddr->sin_addr.s_addr;
-       wr->remote_port = raddr->sin_port;
-
-       /*
-        * Move any private data from the caller's buf into
-        * the WR.
-        */
-       if (iw_param->private_data) {
-               wr->private_data_length =
-                       cpu_to_be32(iw_param->private_data_len);
-               memcpy(&wr->private_data[0], iw_param->private_data,
-                      iw_param->private_data_len);
-       } else
-               wr->private_data_length = 0;
-
-       /*
-        * Send WR to adapter.  NOTE: There is no synchronous reply from
-        * the adapter.
-        */
-       err = vq_send_wr(c2dev, (union c2wr *) wr);
-       vq_req_free(c2dev, vq_req);
-
- bail1:
-       kfree(wr);
- bail0:
-       if (err) {
-               /*
-                * If we fail, release reference on QP and
-                * disassociate QP from CM_ID
-                */
-               cm_id->provider_data = NULL;
-               qp->cm_id = NULL;
-               cm_id->rem_ref(cm_id);
-       }
-       return err;
-}
-
-int c2_llp_service_create(struct iw_cm_id *cm_id, int backlog)
-{
-       struct c2_dev *c2dev;
-       struct c2wr_ep_listen_create_req wr;
-       struct c2wr_ep_listen_create_rep *reply;
-       struct c2_vq_req *vq_req;
-       int err;
-       struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr;
-
-       if (cm_id->local_addr.ss_family != AF_INET)
-               return -ENOSYS;
-
-       c2dev = to_c2dev(cm_id->device);
-       if (c2dev == NULL)
-               return -EINVAL;
-
-       /*
-        * Allocate verbs request.
-        */
-       vq_req = vq_req_alloc(c2dev);
-       if (!vq_req)
-               return -ENOMEM;
-
-       /*
-        * Build the WR
-        */
-       c2_wr_set_id(&wr, CCWR_EP_LISTEN_CREATE);
-       wr.hdr.context = (u64) (unsigned long) vq_req;
-       wr.rnic_handle = c2dev->adapter_handle;
-       wr.local_addr = laddr->sin_addr.s_addr;
-       wr.local_port = laddr->sin_port;
-       wr.backlog = cpu_to_be32(backlog);
-       wr.user_context = (u64) (unsigned long) cm_id;
-
-       /*
-        * Reference the request struct.  Dereferenced in the int handler.
-        */
-       vq_req_get(c2dev, vq_req);
-
-       /*
-        * Send WR to adapter
-        */
-       err = vq_send_wr(c2dev, (union c2wr *) & wr);
-       if (err) {
-               vq_req_put(c2dev, vq_req);
-               goto bail0;
-       }
-
-       /*
-        * Wait for reply from adapter
-        */
-       err = vq_wait_for_reply(c2dev, vq_req);
-       if (err)
-               goto bail0;
-
-       /*
-        * Process reply
-        */
-       reply =
-           (struct c2wr_ep_listen_create_rep *) (unsigned long) vq_req->reply_msg;
-       if (!reply) {
-               err = -ENOMEM;
-               goto bail1;
-       }
-
-       if ((err = c2_errno(reply)) != 0)
-               goto bail1;
-
-       /*
-        * Keep the adapter handle. Used in subsequent destroy
-        */
-       cm_id->provider_data = (void*)(unsigned long) reply->ep_handle;
-
-       /*
-        * free vq stuff
-        */
-       vq_repbuf_free(c2dev, reply);
-       vq_req_free(c2dev, vq_req);
-
-       return 0;
-
- bail1:
-       vq_repbuf_free(c2dev, reply);
- bail0:
-       vq_req_free(c2dev, vq_req);
-       return err;
-}
-
-
-int c2_llp_service_destroy(struct iw_cm_id *cm_id)
-{
-
-       struct c2_dev *c2dev;
-       struct c2wr_ep_listen_destroy_req wr;
-       struct c2wr_ep_listen_destroy_rep *reply;
-       struct c2_vq_req *vq_req;
-       int err;
-
-       c2dev = to_c2dev(cm_id->device);
-       if (c2dev == NULL)
-               return -EINVAL;
-
-       /*
-        * Allocate verbs request.
-        */
-       vq_req = vq_req_alloc(c2dev);
-       if (!vq_req)
-               return -ENOMEM;
-
-       /*
-        * Build the WR
-        */
-       c2_wr_set_id(&wr, CCWR_EP_LISTEN_DESTROY);
-       wr.hdr.context = (unsigned long) vq_req;
-       wr.rnic_handle = c2dev->adapter_handle;
-       wr.ep_handle = (u32)(unsigned long)cm_id->provider_data;
-
-       /*
-        * reference the request struct.  dereferenced in the int handler.
-        */
-       vq_req_get(c2dev, vq_req);
-
-       /*
-        * Send WR to adapter
-        */
-       err = vq_send_wr(c2dev, (union c2wr *) & wr);
-       if (err) {
-               vq_req_put(c2dev, vq_req);
-               goto bail0;
-       }
-
-       /*
-        * Wait for reply from adapter
-        */
-       err = vq_wait_for_reply(c2dev, vq_req);
-       if (err)
-               goto bail0;
-
-       /*
-        * Process reply
-        */
-       reply=(struct c2wr_ep_listen_destroy_rep *)(unsigned long)vq_req->reply_msg;
-       if (!reply) {
-               err = -ENOMEM;
-               goto bail0;
-       }
-       if ((err = c2_errno(reply)) != 0)
-               goto bail1;
-
- bail1:
-       vq_repbuf_free(c2dev, reply);
- bail0:
-       vq_req_free(c2dev, vq_req);
-       return err;
-}
-
-int c2_llp_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
-{
-       struct c2_dev *c2dev = to_c2dev(cm_id->device);
-       struct c2_qp *qp;
-       struct ib_qp *ibqp;
-       struct c2wr_cr_accept_req *wr;  /* variable length WR */
-       struct c2_vq_req *vq_req;
-       struct c2wr_cr_accept_rep *reply;       /* VQ Reply msg ptr. */
-       int err;
-
-       ibqp = c2_get_qp(cm_id->device, iw_param->qpn);
-       if (!ibqp)
-               return -EINVAL;
-       qp = to_c2qp(ibqp);
-
-       /* Set the RDMA read limits */
-       err = c2_qp_set_read_limits(c2dev, qp, iw_param->ord, iw_param->ird);
-       if (err)
-               goto bail0;
-
-       /* Allocate verbs request. */
-       vq_req = vq_req_alloc(c2dev);
-       if (!vq_req) {
-               err = -ENOMEM;
-               goto bail0;
-       }
-       vq_req->qp = qp;
-       vq_req->cm_id = cm_id;
-       vq_req->event = IW_CM_EVENT_ESTABLISHED;
-
-       wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
-       if (!wr) {
-               err = -ENOMEM;
-               goto bail1;
-       }
-
-       /* Build the WR */
-       c2_wr_set_id(wr, CCWR_CR_ACCEPT);
-       wr->hdr.context = (unsigned long) vq_req;
-       wr->rnic_handle = c2dev->adapter_handle;
-       wr->ep_handle = (u32) (unsigned long) cm_id->provider_data;
-       wr->qp_handle = qp->adapter_handle;
-
-       /* Replace the cr_handle with the QP after accept */
-       cm_id->provider_data = qp;
-       cm_id->add_ref(cm_id);
-       qp->cm_id = cm_id;
-
-       cm_id->provider_data = qp;
-
-       /* Validate private_data length */
-       if (iw_param->private_data_len > C2_MAX_PRIVATE_DATA_SIZE) {
-               err = -EINVAL;
-               goto bail1;
-       }
-
-       if (iw_param->private_data) {
-               wr->private_data_length = cpu_to_be32(iw_param->private_data_len);
-               memcpy(&wr->private_data[0],
-                      iw_param->private_data, iw_param->private_data_len);
-       } else
-               wr->private_data_length = 0;
-
-       /* Reference the request struct.  Dereferenced in the int handler. */
-       vq_req_get(c2dev, vq_req);
-
-       /* Send WR to adapter */
-       err = vq_send_wr(c2dev, (union c2wr *) wr);
-       if (err) {
-               vq_req_put(c2dev, vq_req);
-               goto bail1;
-       }
-
-       /* Wait for reply from adapter */
-       err = vq_wait_for_reply(c2dev, vq_req);
-       if (err)
-               goto bail1;
-
-       /* Check that reply is present */
-       reply = (struct c2wr_cr_accept_rep *) (unsigned long) vq_req->reply_msg;
-       if (!reply) {
-               err = -ENOMEM;
-               goto bail1;
-       }
-
-       err = c2_errno(reply);
-       vq_repbuf_free(c2dev, reply);
-
-       if (!err)
-               c2_set_qp_state(qp, C2_QP_STATE_RTS);
- bail1:
-       kfree(wr);
-       vq_req_free(c2dev, vq_req);
- bail0:
-       if (err) {
-               /*
-                * If we fail, release reference on QP and
-                * disassociate QP from CM_ID
-                */
-               cm_id->provider_data = NULL;
-               qp->cm_id = NULL;
-               cm_id->rem_ref(cm_id);
-       }
-       return err;
-}
-
-int c2_llp_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len)
-{
-       struct c2_dev *c2dev;
-       struct c2wr_cr_reject_req wr;
-       struct c2_vq_req *vq_req;
-       struct c2wr_cr_reject_rep *reply;
-       int err;
-
-       c2dev = to_c2dev(cm_id->device);
-
-       /*
-        * Allocate verbs request.
-        */
-       vq_req = vq_req_alloc(c2dev);
-       if (!vq_req)
-               return -ENOMEM;
-
-       /*
-        * Build the WR
-        */
-       c2_wr_set_id(&wr, CCWR_CR_REJECT);
-       wr.hdr.context = (unsigned long) vq_req;
-       wr.rnic_handle = c2dev->adapter_handle;
-       wr.ep_handle = (u32) (unsigned long) cm_id->provider_data;
-
-       /*
-        * reference the request struct.  dereferenced in the int handler.
-        */
-       vq_req_get(c2dev, vq_req);
-
-       /*
-        * Send WR to adapter
-        */
-       err = vq_send_wr(c2dev, (union c2wr *) & wr);
-       if (err) {
-               vq_req_put(c2dev, vq_req);
-               goto bail0;
-       }
-
-       /*
-        * Wait for reply from adapter
-        */
-       err = vq_wait_for_reply(c2dev, vq_req);
-       if (err)
-               goto bail0;
-
-       /*
-        * Process reply
-        */
-       reply = (struct c2wr_cr_reject_rep *) (unsigned long)
-               vq_req->reply_msg;
-       if (!reply) {
-               err = -ENOMEM;
-               goto bail0;
-       }
-       err = c2_errno(reply);
-       /*
-        * free vq stuff
-        */
-       vq_repbuf_free(c2dev, reply);
-
- bail0:
-       vq_req_free(c2dev, vq_req);
-       return err;
-}
diff --git a/drivers/infiniband/hw/amso1100/c2_cq.c b/drivers/infiniband/hw/amso1100/c2_cq.c
deleted file mode 100644 (file)
index 1b63185..0000000
+++ /dev/null
@@ -1,440 +0,0 @@
-/*
- * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
- * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
- * Copyright (c) 2005 Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
- * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/gfp.h>
-
-#include "c2.h"
-#include "c2_vq.h"
-#include "c2_status.h"
-
-#define C2_CQ_MSG_SIZE ((sizeof(struct c2wr_ce) + 32-1) & ~(32-1))
-
-static struct c2_cq *c2_cq_get(struct c2_dev *c2dev, int cqn)
-{
-       struct c2_cq *cq;
-       unsigned long flags;
-
-       spin_lock_irqsave(&c2dev->lock, flags);
-       cq = c2dev->qptr_array[cqn];
-       if (!cq) {
-               spin_unlock_irqrestore(&c2dev->lock, flags);
-               return NULL;
-       }
-       atomic_inc(&cq->refcount);
-       spin_unlock_irqrestore(&c2dev->lock, flags);
-       return cq;
-}
-
-static void c2_cq_put(struct c2_cq *cq)
-{
-       if (atomic_dec_and_test(&cq->refcount))
-               wake_up(&cq->wait);
-}
-
-void c2_cq_event(struct c2_dev *c2dev, u32 mq_index)
-{
-       struct c2_cq *cq;
-
-       cq = c2_cq_get(c2dev, mq_index);
-       if (!cq) {
-               printk("discarding events on destroyed CQN=%d\n", mq_index);
-               return;
-       }
-
-       (*cq->ibcq.comp_handler) (&cq->ibcq, cq->ibcq.cq_context);
-       c2_cq_put(cq);
-}
-
-void c2_cq_clean(struct c2_dev *c2dev, struct c2_qp *qp, u32 mq_index)
-{
-       struct c2_cq *cq;
-       struct c2_mq *q;
-
-       cq = c2_cq_get(c2dev, mq_index);
-       if (!cq)
-               return;
-
-       spin_lock_irq(&cq->lock);
-       q = &cq->mq;
-       if (q && !c2_mq_empty(q)) {
-               u16 priv = q->priv;
-               struct c2wr_ce *msg;
-
-               while (priv != be16_to_cpu(*q->shared)) {
-                       msg = (struct c2wr_ce *)
-                               (q->msg_pool.host + priv * q->msg_size);
-                       if (msg->qp_user_context == (u64) (unsigned long) qp) {
-                               msg->qp_user_context = (u64) 0;
-                       }
-                       priv = (priv + 1) % q->q_size;
-               }
-       }
-       spin_unlock_irq(&cq->lock);
-       c2_cq_put(cq);
-}
-
-static inline enum ib_wc_status c2_cqe_status_to_openib(u8 status)
-{
-       switch (status) {
-       case C2_OK:
-               return IB_WC_SUCCESS;
-       case CCERR_FLUSHED:
-               return IB_WC_WR_FLUSH_ERR;
-       case CCERR_BASE_AND_BOUNDS_VIOLATION:
-               return IB_WC_LOC_PROT_ERR;
-       case CCERR_ACCESS_VIOLATION:
-               return IB_WC_LOC_ACCESS_ERR;
-       case CCERR_TOTAL_LENGTH_TOO_BIG:
-               return IB_WC_LOC_LEN_ERR;
-       case CCERR_INVALID_WINDOW:
-               return IB_WC_MW_BIND_ERR;
-       default:
-               return IB_WC_GENERAL_ERR;
-       }
-}
-
-
-static inline int c2_poll_one(struct c2_dev *c2dev,
-                             struct c2_cq *cq, struct ib_wc *entry)
-{
-       struct c2wr_ce *ce;
-       struct c2_qp *qp;
-       int is_recv = 0;
-
-       ce = c2_mq_consume(&cq->mq);
-       if (!ce) {
-               return -EAGAIN;
-       }
-
-       /*
-        * if the qp returned is null then this qp has already
-        * been freed and we are unable to process the completion.
-        * try pulling the next message
-        */
-       while ((qp =
-               (struct c2_qp *) (unsigned long) ce->qp_user_context) == NULL) {
-               c2_mq_free(&cq->mq);
-               ce = c2_mq_consume(&cq->mq);
-               if (!ce)
-                       return -EAGAIN;
-       }
-
-       entry->status = c2_cqe_status_to_openib(c2_wr_get_result(ce));
-       entry->wr_id = ce->hdr.context;
-       entry->qp = &qp->ibqp;
-       entry->wc_flags = 0;
-       entry->slid = 0;
-       entry->sl = 0;
-       entry->src_qp = 0;
-       entry->dlid_path_bits = 0;
-       entry->pkey_index = 0;
-
-       switch (c2_wr_get_id(ce)) {
-       case C2_WR_TYPE_SEND:
-               entry->opcode = IB_WC_SEND;
-               break;
-       case C2_WR_TYPE_RDMA_WRITE:
-               entry->opcode = IB_WC_RDMA_WRITE;
-               break;
-       case C2_WR_TYPE_RDMA_READ:
-               entry->opcode = IB_WC_RDMA_READ;
-               break;
-       case C2_WR_TYPE_BIND_MW:
-               entry->opcode = IB_WC_BIND_MW;
-               break;
-       case C2_WR_TYPE_RECV:
-               entry->byte_len = be32_to_cpu(ce->bytes_rcvd);
-               entry->opcode = IB_WC_RECV;
-               is_recv = 1;
-               break;
-       default:
-               break;
-       }
-
-       /* consume the WQEs */
-       if (is_recv)
-               c2_mq_lconsume(&qp->rq_mq, 1);
-       else
-               c2_mq_lconsume(&qp->sq_mq,
-                              be32_to_cpu(c2_wr_get_wqe_count(ce)) + 1);
-
-       /* free the message */
-       c2_mq_free(&cq->mq);
-
-       return 0;
-}
-
-int c2_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
-{
-       struct c2_dev *c2dev = to_c2dev(ibcq->device);
-       struct c2_cq *cq = to_c2cq(ibcq);
-       unsigned long flags;
-       int npolled, err;
-
-       spin_lock_irqsave(&cq->lock, flags);
-
-       for (npolled = 0; npolled < num_entries; ++npolled) {
-
-               err = c2_poll_one(c2dev, cq, entry + npolled);
-               if (err)
-                       break;
-       }
-
-       spin_unlock_irqrestore(&cq->lock, flags);
-
-       return npolled;
-}
-
-int c2_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags)
-{
-       struct c2_mq_shared __iomem *shared;
-       struct c2_cq *cq;
-       unsigned long flags;
-       int ret = 0;
-
-       cq = to_c2cq(ibcq);
-       shared = cq->mq.peer;
-
-       if ((notify_flags & IB_CQ_SOLICITED_MASK) == IB_CQ_NEXT_COMP)
-               writeb(C2_CQ_NOTIFICATION_TYPE_NEXT, &shared->notification_type);
-       else if ((notify_flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED)
-               writeb(C2_CQ_NOTIFICATION_TYPE_NEXT_SE, &shared->notification_type);
-       else
-               return -EINVAL;
-
-       writeb(CQ_WAIT_FOR_DMA | CQ_ARMED, &shared->armed);
-
-       /*
-        * Now read back shared->armed to make the PCI
-        * write synchronous.  This is necessary for
-        * correct cq notification semantics.
-        */
-       readb(&shared->armed);
-
-       if (notify_flags & IB_CQ_REPORT_MISSED_EVENTS) {
-               spin_lock_irqsave(&cq->lock, flags);
-               ret = !c2_mq_empty(&cq->mq);
-               spin_unlock_irqrestore(&cq->lock, flags);
-       }
-
-       return ret;
-}
-
-static void c2_free_cq_buf(struct c2_dev *c2dev, struct c2_mq *mq)
-{
-       dma_free_coherent(&c2dev->pcidev->dev, mq->q_size * mq->msg_size,
-                         mq->msg_pool.host, dma_unmap_addr(mq, mapping));
-}
-
-static int c2_alloc_cq_buf(struct c2_dev *c2dev, struct c2_mq *mq,
-                          size_t q_size, size_t msg_size)
-{
-       u8 *pool_start;
-
-       if (q_size > SIZE_MAX / msg_size)
-               return -EINVAL;
-
-       pool_start = dma_alloc_coherent(&c2dev->pcidev->dev, q_size * msg_size,
-                                       &mq->host_dma, GFP_KERNEL);
-       if (!pool_start)
-               return -ENOMEM;
-
-       c2_mq_rep_init(mq,
-                      0,               /* index (currently unknown) */
-                      q_size,
-                      msg_size,
-                      pool_start,
-                      NULL,    /* peer (currently unknown) */
-                      C2_MQ_HOST_TARGET);
-
-       dma_unmap_addr_set(mq, mapping, mq->host_dma);
-
-       return 0;
-}
-
-int c2_init_cq(struct c2_dev *c2dev, int entries,
-              struct c2_ucontext *ctx, struct c2_cq *cq)
-{
-       struct c2wr_cq_create_req wr;
-       struct c2wr_cq_create_rep *reply;
-       unsigned long peer_pa;
-       struct c2_vq_req *vq_req;
-       int err;
-
-       might_sleep();
-
-       cq->ibcq.cqe = entries - 1;
-       cq->is_kernel = !ctx;
-
-       /* Allocate a shared pointer */
-       cq->mq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
-                                     &cq->mq.shared_dma, GFP_KERNEL);
-       if (!cq->mq.shared)
-               return -ENOMEM;
-
-       /* Allocate pages for the message pool */
-       err = c2_alloc_cq_buf(c2dev, &cq->mq, entries + 1, C2_CQ_MSG_SIZE);
-       if (err)
-               goto bail0;
-
-       vq_req = vq_req_alloc(c2dev);
-       if (!vq_req) {
-               err = -ENOMEM;
-               goto bail1;
-       }
-
-       memset(&wr, 0, sizeof(wr));
-       c2_wr_set_id(&wr, CCWR_CQ_CREATE);
-       wr.hdr.context = (unsigned long) vq_req;
-       wr.rnic_handle = c2dev->adapter_handle;
-       wr.msg_size = cpu_to_be32(cq->mq.msg_size);
-       wr.depth = cpu_to_be32(cq->mq.q_size);
-       wr.shared_ht = cpu_to_be64(cq->mq.shared_dma);
-       wr.msg_pool = cpu_to_be64(cq->mq.host_dma);
-       wr.user_context = (u64) (unsigned long) (cq);
-
-       vq_req_get(c2dev, vq_req);
-
-       err = vq_send_wr(c2dev, (union c2wr *) & wr);
-       if (err) {
-               vq_req_put(c2dev, vq_req);
-               goto bail2;
-       }
-
-       err = vq_wait_for_reply(c2dev, vq_req);
-       if (err)
-               goto bail2;
-
-       reply = (struct c2wr_cq_create_rep *) (unsigned long) (vq_req->reply_msg);
-       if (!reply) {
-               err = -ENOMEM;
-               goto bail2;
-       }
-
-       if ((err = c2_errno(reply)) != 0)
-               goto bail3;
-
-       cq->adapter_handle = reply->cq_handle;
-       cq->mq.index = be32_to_cpu(reply->mq_index);
-
-       peer_pa = c2dev->pa + be32_to_cpu(reply->adapter_shared);
-       cq->mq.peer = ioremap_nocache(peer_pa, PAGE_SIZE);
-       if (!cq->mq.peer) {
-               err = -ENOMEM;
-               goto bail3;
-       }
-
-       vq_repbuf_free(c2dev, reply);
-       vq_req_free(c2dev, vq_req);
-
-       spin_lock_init(&cq->lock);
-       atomic_set(&cq->refcount, 1);
-       init_waitqueue_head(&cq->wait);
-
-       /*
-        * Use the MQ index allocated by the adapter to
-        * store the CQ in the qptr_array
-        */
-       cq->cqn = cq->mq.index;
-       c2dev->qptr_array[cq->cqn] = cq;
-
-       return 0;
-
-      bail3:
-       vq_repbuf_free(c2dev, reply);
-      bail2:
-       vq_req_free(c2dev, vq_req);
-      bail1:
-       c2_free_cq_buf(c2dev, &cq->mq);
-      bail0:
-       c2_free_mqsp(cq->mq.shared);
-
-       return err;
-}
-
-void c2_free_cq(struct c2_dev *c2dev, struct c2_cq *cq)
-{
-       int err;
-       struct c2_vq_req *vq_req;
-       struct c2wr_cq_destroy_req wr;
-       struct c2wr_cq_destroy_rep *reply;
-
-       might_sleep();
-
-       /* Clear CQ from the qptr array */
-       spin_lock_irq(&c2dev->lock);
-       c2dev->qptr_array[cq->mq.index] = NULL;
-       atomic_dec(&cq->refcount);
-       spin_unlock_irq(&c2dev->lock);
-
-       wait_event(cq->wait, !atomic_read(&cq->refcount));
-
-       vq_req = vq_req_alloc(c2dev);
-       if (!vq_req) {
-               goto bail0;
-       }
-
-       memset(&wr, 0, sizeof(wr));
-       c2_wr_set_id(&wr, CCWR_CQ_DESTROY);
-       wr.hdr.context = (unsigned long) vq_req;
-       wr.rnic_handle = c2dev->adapter_handle;
-       wr.cq_handle = cq->adapter_handle;
-
-       vq_req_get(c2dev, vq_req);
-
-       err = vq_send_wr(c2dev, (union c2wr *) & wr);
-       if (err) {
-               vq_req_put(c2dev, vq_req);
-               goto bail1;
-       }
-
-       err = vq_wait_for_reply(c2dev, vq_req);
-       if (err)
-               goto bail1;
-
-       reply = (struct c2wr_cq_destroy_rep *) (unsigned long) (vq_req->reply_msg);
-       if (reply)
-               vq_repbuf_free(c2dev, reply);
-      bail1:
-       vq_req_free(c2dev, vq_req);
-      bail0:
-       if (cq->is_kernel) {
-               c2_free_cq_buf(c2dev, &cq->mq);
-       }
-
-       return;
-}
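
c2_poll_cq() in the removed c2_cq.c takes the CQ lock and reaps one completion at a time, stopping at the first -EAGAIN and returning only the number of work-completion entries it actually filled in. A hedged, standalone sketch of that polling pattern over a toy completion array; toy_cq, poll_one and poll_cq here are illustrative names, not the driver's API.

/*
 * Poll at most num_entries completions and report how many were filled,
 * mirroring the loop structure of c2_poll_cq()/c2_poll_one().
 */
#include <stdio.h>
#include <errno.h>

struct toy_cq {
	int pending[8];   /* completions waiting to be reaped */
	int count;        /* how many of them are valid */
	int next;         /* next one to hand out */
};

static int poll_one(struct toy_cq *cq, int *entry)
{
	if (cq->next >= cq->count)
		return -EAGAIN;              /* queue drained, caller stops polling */
	*entry = cq->pending[cq->next++];
	return 0;
}

static int poll_cq(struct toy_cq *cq, int num_entries, int *entries)
{
	int npolled;

	for (npolled = 0; npolled < num_entries; ++npolled) {
		if (poll_one(cq, entries + npolled))
			break;               /* stop early once nothing is pending */
	}
	return npolled;                      /* only what was actually reaped */
}

int main(void)
{
	struct toy_cq cq = { .pending = { 42, 43 }, .count = 2, .next = 0 };
	int entries[4];
	int n = poll_cq(&cq, 4, entries);

	printf("polled %d completions, first = %d\n", n, entries[0]);
	return 0;
}
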
diff --git a/drivers/infiniband/hw/amso1100/c2_intr.c b/drivers/infiniband/hw/amso1100/c2_intr.c
deleted file mode 100644 (file)
index 3a17d9b..0000000
+++ /dev/null
@@ -1,219 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "c2.h"
-#include <rdma/iw_cm.h>
-#include "c2_vq.h"
-
-static void handle_mq(struct c2_dev *c2dev, u32 index);
-static void handle_vq(struct c2_dev *c2dev, u32 mq_index);
-
-/*
- * Handle RNIC interrupts
- */
-void c2_rnic_interrupt(struct c2_dev *c2dev)
-{
-       unsigned int mq_index;
-
-       while (c2dev->hints_read != be16_to_cpu(*c2dev->hint_count)) {
-               mq_index = readl(c2dev->regs + PCI_BAR0_HOST_HINT);
-               if (mq_index & 0x80000000) {
-                       break;
-               }
-
-               c2dev->hints_read++;
-               handle_mq(c2dev, mq_index);
-       }
-
-}
-
-/*
- * Top level MQ handler
- */
-static void handle_mq(struct c2_dev *c2dev, u32 mq_index)
-{
-       if (c2dev->qptr_array[mq_index] == NULL) {
-               pr_debug("handle_mq: stray activity for mq_index=%d\n",
-                        mq_index);
-               return;
-       }
-
-       switch (mq_index) {
-       case (0):
-               /*
-                * An index of 0 in the activity queue
-                * indicates the req vq now has messages
-                * available...
-                *
-                * Wake up any waiters waiting on req VQ
-                * message availability.
-                */
-               wake_up(&c2dev->req_vq_wo);
-               break;
-       case (1):
-               handle_vq(c2dev, mq_index);
-               break;
-       case (2):
-               /* We have to purge the VQ in case there are pending
-                * accept reply requests that would result in the
-                * generation of an ESTABLISHED event. If we don't
-                * generate these first, a CLOSE event could end up
-                * being delivered before the ESTABLISHED event.
-                */
-               handle_vq(c2dev, 1);
-
-               c2_ae_event(c2dev, mq_index);
-               break;
-       default:
-               /* There is no event synchronization between CQ events
-                * and AE or CM events. In fact, CQE could be
-                * delivered for all of the I/O up to and including the
-                * FLUSH for a peer disconnect prior to the ESTABLISHED
-                * event being delivered to the app. The reason for this
-                * is that CM events are delivered on a thread, while AE
-                * and CQ events are delivered in interrupt context.
-                */
-               c2_cq_event(c2dev, mq_index);
-               break;
-       }
-
-       return;
-}
-
-/*
- * Handles verbs WR replies.
- */
-static void handle_vq(struct c2_dev *c2dev, u32 mq_index)
-{
-       void *adapter_msg, *reply_msg;
-       struct c2wr_hdr *host_msg;
-       struct c2wr_hdr tmp;
-       struct c2_mq *reply_vq;
-       struct c2_vq_req *req;
-       struct iw_cm_event cm_event;
-       int err;
-
-       reply_vq = (struct c2_mq *) c2dev->qptr_array[mq_index];
-
-       /*
-        * get next msg from mq_index into adapter_msg.
-        * don't free it yet.
-        */
-       adapter_msg = c2_mq_consume(reply_vq);
-       if (adapter_msg == NULL) {
-               return;
-       }
-
-       host_msg = vq_repbuf_alloc(c2dev);
-
-       /*
-        * If we can't get a host buffer, then we'll still
-        * wake up the waiter; we just won't give it the msg.
-        * It is assumed the waiter will deal with this...
-        */
-       if (!host_msg) {
-               pr_debug("handle_vq: no repbufs!\n");
-
-               /*
-                * just copy the WR header into a local variable.
-                * this allows us to still demux on the context
-                */
-               host_msg = &tmp;
-               memcpy(host_msg, adapter_msg, sizeof(tmp));
-               reply_msg = NULL;
-       } else {
-               memcpy(host_msg, adapter_msg, reply_vq->msg_size);
-               reply_msg = host_msg;
-       }
-
-       /*
-        * consume the msg from the MQ
-        */
-       c2_mq_free(reply_vq);
-
-       /*
-        * wakeup the waiter.
-        */
-       req = (struct c2_vq_req *) (unsigned long) host_msg->context;
-       if (req == NULL) {
-               /*
-                * We should never get here, as the adapter should
-                * never send us a reply that we're not expecting.
-                */
-               if (reply_msg != NULL)
-                       vq_repbuf_free(c2dev, host_msg);
-               pr_debug("handle_vq: UNEXPECTEDLY got NULL req\n");
-               return;
-       }
-
-       if (reply_msg)
-               err = c2_errno(reply_msg);
-       else
-               err = -ENOMEM;
-
-       if (!err) switch (req->event) {
-       case IW_CM_EVENT_ESTABLISHED:
-               c2_set_qp_state(req->qp,
-                               C2_QP_STATE_RTS);
-               /*
-                * Until ird/ord negotiation via MPAv2 support is added, send
-                * max supported values
-                */
-               cm_event.ird = cm_event.ord = 128;
-       case IW_CM_EVENT_CLOSE:
-
-               /*
-                * Move the QP to RTS if this is
-                * the established event
-                */
-               cm_event.event = req->event;
-               cm_event.status = 0;
-               cm_event.local_addr = req->cm_id->local_addr;
-               cm_event.remote_addr = req->cm_id->remote_addr;
-               cm_event.private_data = NULL;
-               cm_event.private_data_len = 0;
-               req->cm_id->event_handler(req->cm_id, &cm_event);
-               break;
-       default:
-               break;
-       }
-
-       req->reply_msg = (u64) (unsigned long) (reply_msg);
-       atomic_set(&req->reply_ready, 1);
-       wake_up(&req->wait_object);
-
-       /*
-        * If the request was cancelled, then this put will
-        * free the vq_req memory...and reply_msg!!!
-        */
-       vq_req_put(c2dev, req);
-}
diff --git a/drivers/infiniband/hw/amso1100/c2_mm.c b/drivers/infiniband/hw/amso1100/c2_mm.c
deleted file mode 100644 (file)
index 119c4f3..0000000
+++ /dev/null
@@ -1,377 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include <linux/slab.h>
-
-#include "c2.h"
-#include "c2_vq.h"
-
-#define PBL_VIRT 1
-#define PBL_PHYS 2
-
-/*
- * Send all the PBL messages to convey the remainder of the PBL
- * Wait for the adapter's reply on the last one.
- * This is indicated by setting the MEM_PBL_COMPLETE in the flags.
- *
- * NOTE:  vq_req is _not_ freed by this function.  The VQ Host
- *       Reply buffer _is_ freed by this function.
- */
-static int
-send_pbl_messages(struct c2_dev *c2dev, __be32 stag_index,
-                 unsigned long va, u32 pbl_depth,
-                 struct c2_vq_req *vq_req, int pbl_type)
-{
-       u32 pbe_count;          /* amt that fits in a PBL msg */
-       u32 count;              /* amt in this PBL MSG. */
-       struct c2wr_nsmr_pbl_req *wr;   /* PBL WR ptr */
-       struct c2wr_nsmr_pbl_rep *reply;        /* reply ptr */
-       int err, pbl_virt, pbl_index, i;
-
-       switch (pbl_type) {
-       case PBL_VIRT:
-               pbl_virt = 1;
-               break;
-       case PBL_PHYS:
-               pbl_virt = 0;
-               break;
-       default:
-               return -EINVAL;
-               break;
-       }
-
-       pbe_count = (c2dev->req_vq.msg_size -
-                    sizeof(struct c2wr_nsmr_pbl_req)) / sizeof(u64);
-       wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
-       if (!wr) {
-               return -ENOMEM;
-       }
-       c2_wr_set_id(wr, CCWR_NSMR_PBL);
-
-       /*
-        * Only the last PBL message will generate a reply from the verbs,
-        * so we set the context to 0 indicating there is no kernel verbs
-        * handler blocked awaiting this reply.
-        */
-       wr->hdr.context = 0;
-       wr->rnic_handle = c2dev->adapter_handle;
-       wr->stag_index = stag_index;    /* already swapped */
-       wr->flags = 0;
-       pbl_index = 0;
-       while (pbl_depth) {
-               count = min(pbe_count, pbl_depth);
-               wr->addrs_length = cpu_to_be32(count);
-
-               /*
-                *  If this is the last message, then reference the
-                *  vq request struct because we're going to wait for a reply.
-                *  Also mark this PBL msg as the last one.
-                */
-               if (count == pbl_depth) {
-                       /*
-                        * reference the request struct.  dereferenced in the
-                        * int handler.
-                        */
-                       vq_req_get(c2dev, vq_req);
-                       wr->flags = cpu_to_be32(MEM_PBL_COMPLETE);
-
-                       /*
-                        * This is the last PBL message.
-                        * Set the context to our VQ Request Object so we can
-                        * wait for the reply.
-                        */
-                       wr->hdr.context = (unsigned long) vq_req;
-               }
-
-               /*
-                * If pbl_virt is set then va is a virtual address
-                * that describes a virtually contiguous memory
-                * allocation. The wr needs the start of each virtual page
-                * to be converted to the corresponding physical address
-                * of the page. If pbl_virt is not set then va is an array
-                * of physical addresses and there is no conversion to do.
-                * Just fill in the wr with what is in the array.
-                */
-               for (i = 0; i < count; i++) {
-                       if (pbl_virt) {
-                               va += PAGE_SIZE;
-                       } else {
-                               wr->paddrs[i] =
-                                   cpu_to_be64(((u64 *)va)[pbl_index + i]);
-                       }
-               }
-
-               /*
-                * Send WR to adapter
-                */
-               err = vq_send_wr(c2dev, (union c2wr *) wr);
-               if (err) {
-                       if (count <= pbe_count) {
-                               vq_req_put(c2dev, vq_req);
-                       }
-                       goto bail0;
-               }
-               pbl_depth -= count;
-               pbl_index += count;
-       }
-
-       /*
-        *  Now wait for the reply...
-        */
-       err = vq_wait_for_reply(c2dev, vq_req);
-       if (err) {
-               goto bail0;
-       }
-
-       /*
-        * Process reply
-        */
-       reply = (struct c2wr_nsmr_pbl_rep *) (unsigned long) vq_req->reply_msg;
-       if (!reply) {
-               err = -ENOMEM;
-               goto bail0;
-       }
-
-       err = c2_errno(reply);
-
-       vq_repbuf_free(c2dev, reply);
-      bail0:
-       kfree(wr);
-       return err;
-}
-
-#define C2_PBL_MAX_DEPTH 131072
-int
-c2_nsmr_register_phys_kern(struct c2_dev *c2dev, u64 *addr_list,
-                          int page_size, int pbl_depth, u32 length,
-                          u32 offset, u64 *va, enum c2_acf acf,
-                          struct c2_mr *mr)
-{
-       struct c2_vq_req *vq_req;
-       struct c2wr_nsmr_register_req *wr;
-       struct c2wr_nsmr_register_rep *reply;
-       u16 flags;
-       int i, pbe_count, count;
-       int err;
-
-       if (!va || !length || !addr_list || !pbl_depth)
-               return -EINTR;
-
-       /*
-        * Verify PBL depth is within rnic max
-        */
-       if (pbl_depth > C2_PBL_MAX_DEPTH) {
-               return -EINTR;
-       }
-
-       /*
-        * allocate verbs request object
-        */
-       vq_req = vq_req_alloc(c2dev);
-       if (!vq_req)
-               return -ENOMEM;
-
-       wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
-       if (!wr) {
-               err = -ENOMEM;
-               goto bail0;
-       }
-
-       /*
-        * build the WR
-        */
-       c2_wr_set_id(wr, CCWR_NSMR_REGISTER);
-       wr->hdr.context = (unsigned long) vq_req;
-       wr->rnic_handle = c2dev->adapter_handle;
-
-       flags = (acf | MEM_VA_BASED | MEM_REMOTE);
-
-       /*
-        * compute how many pbes can fit in the message
-        */
-       pbe_count = (c2dev->req_vq.msg_size -
-                    sizeof(struct c2wr_nsmr_register_req)) / sizeof(u64);
-
-       if (pbl_depth <= pbe_count) {
-               flags |= MEM_PBL_COMPLETE;
-       }
-       wr->flags = cpu_to_be16(flags);
-       wr->stag_key = 0;       //stag_key;
-       wr->va = cpu_to_be64(*va);
-       wr->pd_id = mr->pd->pd_id;
-       wr->pbe_size = cpu_to_be32(page_size);
-       wr->length = cpu_to_be32(length);
-       wr->pbl_depth = cpu_to_be32(pbl_depth);
-       wr->fbo = cpu_to_be32(offset);
-       count = min(pbl_depth, pbe_count);
-       wr->addrs_length = cpu_to_be32(count);
-
-       /*
-        * fill out the PBL for this message
-        */
-       for (i = 0; i < count; i++) {
-               wr->paddrs[i] = cpu_to_be64(addr_list[i]);
-       }
-
-       /*
-        * reference the request struct
-        */
-       vq_req_get(c2dev, vq_req);
-
-       /*
-        * send the WR to the adapter
-        */
-       err = vq_send_wr(c2dev, (union c2wr *) wr);
-       if (err) {
-               vq_req_put(c2dev, vq_req);
-               goto bail1;
-       }
-
-       /*
-        * wait for reply from adapter
-        */
-       err = vq_wait_for_reply(c2dev, vq_req);
-       if (err) {
-               goto bail1;
-       }
-
-       /*
-        * process reply
-        */
-       reply =
-           (struct c2wr_nsmr_register_rep *) (unsigned long) (vq_req->reply_msg);
-       if (!reply) {
-               err = -ENOMEM;
-               goto bail1;
-       }
-       if ((err = c2_errno(reply))) {
-               goto bail2;
-       }
-       //*p_pb_entries = be32_to_cpu(reply->pbl_depth);
-       mr->ibmr.lkey = mr->ibmr.rkey = be32_to_cpu(reply->stag_index);
-       vq_repbuf_free(c2dev, reply);
-
-       /*
-        * if there are still more PBEs we need to send them to
-        * the adapter and wait for a reply on the final one.
-        * reuse vq_req for this purpose.
-        */
-       pbl_depth -= count;
-       if (pbl_depth) {
-
-               vq_req->reply_msg = (unsigned long) NULL;
-               atomic_set(&vq_req->reply_ready, 0);
-               err = send_pbl_messages(c2dev,
-                                       cpu_to_be32(mr->ibmr.lkey),
-                                       (unsigned long) &addr_list[i],
-                                       pbl_depth, vq_req, PBL_PHYS);
-               if (err) {
-                       goto bail1;
-               }
-       }
-
-       vq_req_free(c2dev, vq_req);
-       kfree(wr);
-
-       return err;
-
-      bail2:
-       vq_repbuf_free(c2dev, reply);
-      bail1:
-       kfree(wr);
-      bail0:
-       vq_req_free(c2dev, vq_req);
-       return err;
-}
-
-int c2_stag_dealloc(struct c2_dev *c2dev, u32 stag_index)
-{
-       struct c2_vq_req *vq_req;       /* verbs request object */
-       struct c2wr_stag_dealloc_req wr;        /* work request */
-       struct c2wr_stag_dealloc_rep *reply;    /* WR reply  */
-       int err;
-
-
-       /*
-        * allocate verbs request object
-        */
-       vq_req = vq_req_alloc(c2dev);
-       if (!vq_req) {
-               return -ENOMEM;
-       }
-
-       /*
-        * Build the WR
-        */
-       c2_wr_set_id(&wr, CCWR_STAG_DEALLOC);
-       wr.hdr.context = (u64) (unsigned long) vq_req;
-       wr.rnic_handle = c2dev->adapter_handle;
-       wr.stag_index = cpu_to_be32(stag_index);
-
-       /*
-        * reference the request struct.  dereferenced in the int handler.
-        */
-       vq_req_get(c2dev, vq_req);
-
-       /*
-        * Send WR to adapter
-        */
-       err = vq_send_wr(c2dev, (union c2wr *) & wr);
-       if (err) {
-               vq_req_put(c2dev, vq_req);
-               goto bail0;
-       }
-
-       /*
-        * Wait for reply from adapter
-        */
-       err = vq_wait_for_reply(c2dev, vq_req);
-       if (err) {
-               goto bail0;
-       }
-
-       /*
-        * Process reply
-        */
-       reply = (struct c2wr_stag_dealloc_rep *) (unsigned long) vq_req->reply_msg;
-       if (!reply) {
-               err = -ENOMEM;
-               goto bail0;
-       }
-
-       err = c2_errno(reply);
-
-       vq_repbuf_free(c2dev, reply);
-      bail0:
-       vq_req_free(c2dev, vq_req);
-       return err;
-}
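
The memory-registration path removed from c2_mm.c splits a page-buffer list (PBL) that does not fit into one verbs message across several CCWR_NSMR_PBL messages, each carrying at most pbe_count entries, and flags only the final chunk MEM_PBL_COMPLETE so the adapter replies exactly once. A simplified sketch of that chunking loop under assumed names (send_pbl, send_chunk, per_msg), not the driver's wire format.

/*
 * Split `total` page addresses into chunks of at most `per_msg` entries and
 * mark the last chunk, mirroring the MEM_PBL_COMPLETE idea.
 */
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

static void send_chunk(const unsigned long *addrs, int count, int last)
{
	(void)addrs; /* a real message would carry these addresses */
	printf("chunk of %d entries%s\n", count, last ? " (final)" : "");
}

static void send_pbl(const unsigned long *addrs, int total, int per_msg)
{
	int sent = 0;

	while (sent < total) {
		int count = MIN(per_msg, total - sent);

		send_chunk(addrs + sent, count, sent + count == total);
		sent += count;
	}
}

int main(void)
{
	unsigned long addrs[10] = { 0 };

	/* 10 entries, 4 per message: chunks of 4, 4 and a final 2 */
	send_pbl(addrs, 10, 4);
	return 0;
}
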
diff --git a/drivers/infiniband/hw/amso1100/c2_mq.c b/drivers/infiniband/hw/amso1100/c2_mq.c
deleted file mode 100644 (file)
index 0cddc49..0000000
+++ /dev/null
@@ -1,174 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "c2.h"
-#include "c2_mq.h"
-
-void *c2_mq_alloc(struct c2_mq *q)
-{
-       BUG_ON(q->magic != C2_MQ_MAGIC);
-       BUG_ON(q->type != C2_MQ_ADAPTER_TARGET);
-
-       if (c2_mq_full(q)) {
-               return NULL;
-       } else {
-#ifdef DEBUG
-               struct c2wr_hdr *m =
-                   (struct c2wr_hdr *) (q->msg_pool.host + q->priv * q->msg_size);
-#ifdef CCMSGMAGIC
-               BUG_ON(m->magic != be32_to_cpu(~CCWR_MAGIC));
-               m->magic = cpu_to_be32(CCWR_MAGIC);
-#endif
-               return m;
-#else
-               return q->msg_pool.host + q->priv * q->msg_size;
-#endif
-       }
-}
-
-void c2_mq_produce(struct c2_mq *q)
-{
-       BUG_ON(q->magic != C2_MQ_MAGIC);
-       BUG_ON(q->type != C2_MQ_ADAPTER_TARGET);
-
-       if (!c2_mq_full(q)) {
-               q->priv = (q->priv + 1) % q->q_size;
-               q->hint_count++;
-               /* Update peer's offset. */
-               __raw_writew((__force u16) cpu_to_be16(q->priv), &q->peer->shared);
-       }
-}
-
-void *c2_mq_consume(struct c2_mq *q)
-{
-       BUG_ON(q->magic != C2_MQ_MAGIC);
-       BUG_ON(q->type != C2_MQ_HOST_TARGET);
-
-       if (c2_mq_empty(q)) {
-               return NULL;
-       } else {
-#ifdef DEBUG
-               struct c2wr_hdr *m = (struct c2wr_hdr *)
-                   (q->msg_pool.host + q->priv * q->msg_size);
-#ifdef CCMSGMAGIC
-               BUG_ON(m->magic != be32_to_cpu(CCWR_MAGIC));
-#endif
-               return m;
-#else
-               return q->msg_pool.host + q->priv * q->msg_size;
-#endif
-       }
-}
-
-void c2_mq_free(struct c2_mq *q)
-{
-       BUG_ON(q->magic != C2_MQ_MAGIC);
-       BUG_ON(q->type != C2_MQ_HOST_TARGET);
-
-       if (!c2_mq_empty(q)) {
-
-#ifdef CCMSGMAGIC
-               {
-                       struct c2wr_hdr __iomem *m = (struct c2wr_hdr __iomem *)
-                           (q->msg_pool.adapter + q->priv * q->msg_size);
-                       __raw_writel(cpu_to_be32(~CCWR_MAGIC), &m->magic);
-               }
-#endif
-               q->priv = (q->priv + 1) % q->q_size;
-               /* Update peer's offset. */
-               __raw_writew((__force u16) cpu_to_be16(q->priv), &q->peer->shared);
-       }
-}
-
-
-void c2_mq_lconsume(struct c2_mq *q, u32 wqe_count)
-{
-       BUG_ON(q->magic != C2_MQ_MAGIC);
-       BUG_ON(q->type != C2_MQ_ADAPTER_TARGET);
-
-       while (wqe_count--) {
-               BUG_ON(c2_mq_empty(q));
-               *q->shared = cpu_to_be16((be16_to_cpu(*q->shared)+1) % q->q_size);
-       }
-}
-
-#if 0
-u32 c2_mq_count(struct c2_mq *q)
-{
-       s32 count;
-
-       if (q->type == C2_MQ_HOST_TARGET)
-               count = be16_to_cpu(*q->shared) - q->priv;
-       else
-               count = q->priv - be16_to_cpu(*q->shared);
-
-       if (count < 0)
-               count += q->q_size;
-
-       return (u32) count;
-}
-#endif  /*  0  */
-
-void c2_mq_req_init(struct c2_mq *q, u32 index, u32 q_size, u32 msg_size,
-                   u8 __iomem *pool_start, u16 __iomem *peer, u32 type)
-{
-       BUG_ON(!q->shared);
-
-       /* This code assumes the byte swapping has already been done! */
-       q->index = index;
-       q->q_size = q_size;
-       q->msg_size = msg_size;
-       q->msg_pool.adapter = pool_start;
-       q->peer = (struct c2_mq_shared __iomem *) peer;
-       q->magic = C2_MQ_MAGIC;
-       q->type = type;
-       q->priv = 0;
-       q->hint_count = 0;
-       return;
-}
-void c2_mq_rep_init(struct c2_mq *q, u32 index, u32 q_size, u32 msg_size,
-                   u8 *pool_start, u16 __iomem *peer, u32 type)
-{
-       BUG_ON(!q->shared);
-
-       /* This code assumes the byte swapping has already been done! */
-       q->index = index;
-       q->q_size = q_size;
-       q->msg_size = msg_size;
-       q->msg_pool.host = pool_start;
-       q->peer = (struct c2_mq_shared __iomem *) peer;
-       q->magic = C2_MQ_MAGIC;
-       q->type = type;
-       q->priv = 0;
-       q->hint_count = 0;
-       return;
-}
diff --git a/drivers/infiniband/hw/amso1100/c2_mq.h b/drivers/infiniband/hw/amso1100/c2_mq.h
deleted file mode 100644 (file)
index fc1b9a7..0000000
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef _C2_MQ_H_
-#define _C2_MQ_H_
-#include <linux/kernel.h>
-#include <linux/dma-mapping.h>
-#include "c2_wr.h"
-
-enum c2_shared_regs {
-
-       C2_SHARED_ARMED = 0x10,
-       C2_SHARED_NOTIFY = 0x18,
-       C2_SHARED_SHARED = 0x40,
-};
-
-struct c2_mq_shared {
-       u16 unused1;
-       u8 armed;
-       u8 notification_type;
-       u32 unused2;
-       u16 shared;
-       /* Pad to 64 bytes. */
-       u8 pad[64 - sizeof(u16) - 2 * sizeof(u8) - sizeof(u32) - sizeof(u16)];
-};
-
-enum c2_mq_type {
-       C2_MQ_HOST_TARGET = 1,
-       C2_MQ_ADAPTER_TARGET = 2,
-};
-
-/*
- * c2_mq_t is for kernel-mode MQs like the VQs and the AEQ.
- * c2_user_mq_t (which is the same format) is for user-mode MQs...
- */
-#define C2_MQ_MAGIC 0x4d512020 /* 'MQ  ' */
-struct c2_mq {
-       u32 magic;
-       union {
-               u8 *host;
-               u8 __iomem *adapter;
-       } msg_pool;
-       dma_addr_t host_dma;
-       DEFINE_DMA_UNMAP_ADDR(mapping);
-       u16 hint_count;
-       u16 priv;
-       struct c2_mq_shared __iomem *peer;
-       __be16 *shared;
-       dma_addr_t shared_dma;
-       u32 q_size;
-       u32 msg_size;
-       u32 index;
-       enum c2_mq_type type;
-};
-
-static __inline__ int c2_mq_empty(struct c2_mq *q)
-{
-       return q->priv == be16_to_cpu(*q->shared);
-}
-
-static __inline__ int c2_mq_full(struct c2_mq *q)
-{
-       return q->priv == (be16_to_cpu(*q->shared) + q->q_size - 1) % q->q_size;
-}
-
-extern void c2_mq_lconsume(struct c2_mq *q, u32 wqe_count);
-extern void *c2_mq_alloc(struct c2_mq *q);
-extern void c2_mq_produce(struct c2_mq *q);
-extern void *c2_mq_consume(struct c2_mq *q);
-extern void c2_mq_free(struct c2_mq *q);
-extern void c2_mq_req_init(struct c2_mq *q, u32 index, u32 q_size, u32 msg_size,
-                      u8 __iomem *pool_start, u16 __iomem *peer, u32 type);
-extern void c2_mq_rep_init(struct c2_mq *q, u32 index, u32 q_size, u32 msg_size,
-                          u8 *pool_start, u16 __iomem *peer, u32 type);
-
-#endif                         /* _C2_MQ_H_ */
diff --git a/drivers/infiniband/hw/amso1100/c2_pd.c b/drivers/infiniband/hw/amso1100/c2_pd.c
deleted file mode 100644 (file)
index f3e81dc..0000000
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2004 Topspin Communications.  All rights reserved.
- * Copyright (c) 2005 Cisco Systems.  All rights reserved.
- * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/errno.h>
-
-#include "c2.h"
-#include "c2_provider.h"
-
-int c2_pd_alloc(struct c2_dev *c2dev, int privileged, struct c2_pd *pd)
-{
-       u32 obj;
-       int ret = 0;
-
-       spin_lock(&c2dev->pd_table.lock);
-       obj = find_next_zero_bit(c2dev->pd_table.table, c2dev->pd_table.max,
-                                c2dev->pd_table.last);
-       if (obj >= c2dev->pd_table.max)
-               obj = find_first_zero_bit(c2dev->pd_table.table,
-                                         c2dev->pd_table.max);
-       if (obj < c2dev->pd_table.max) {
-               pd->pd_id = obj;
-               __set_bit(obj, c2dev->pd_table.table);
-               c2dev->pd_table.last = obj+1;
-               if (c2dev->pd_table.last >= c2dev->pd_table.max)
-                       c2dev->pd_table.last = 0;
-       } else
-               ret = -ENOMEM;
-       spin_unlock(&c2dev->pd_table.lock);
-       return ret;
-}
-
-void c2_pd_free(struct c2_dev *c2dev, struct c2_pd *pd)
-{
-       spin_lock(&c2dev->pd_table.lock);
-       __clear_bit(pd->pd_id, c2dev->pd_table.table);
-       spin_unlock(&c2dev->pd_table.lock);
-}
-
-int c2_init_pd_table(struct c2_dev *c2dev)
-{
-
-       c2dev->pd_table.last = 0;
-       c2dev->pd_table.max = c2dev->props.max_pd;
-       spin_lock_init(&c2dev->pd_table.lock);
-       c2dev->pd_table.table = kmalloc(BITS_TO_LONGS(c2dev->props.max_pd) *
-                                       sizeof(long), GFP_KERNEL);
-       if (!c2dev->pd_table.table)
-               return -ENOMEM;
-       bitmap_zero(c2dev->pd_table.table, c2dev->props.max_pd);
-       return 0;
-}
-
-void c2_cleanup_pd_table(struct c2_dev *c2dev)
-{
-       kfree(c2dev->pd_table.table);
-}
diff --git a/drivers/infiniband/hw/amso1100/c2_provider.c b/drivers/infiniband/hw/amso1100/c2_provider.c
deleted file mode 100644 (file)
index 25c3f00..0000000
+++ /dev/null
@@ -1,912 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/pci.h>
-#include <linux/netdevice.h>
-#include <linux/etherdevice.h>
-#include <linux/inetdevice.h>
-#include <linux/delay.h>
-#include <linux/ethtool.h>
-#include <linux/mii.h>
-#include <linux/if_vlan.h>
-#include <linux/crc32.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/tcp.h>
-#include <linux/init.h>
-#include <linux/dma-mapping.h>
-#include <linux/if_arp.h>
-#include <linux/vmalloc.h>
-#include <linux/slab.h>
-
-#include <asm/io.h>
-#include <asm/irq.h>
-#include <asm/byteorder.h>
-
-#include <rdma/ib_smi.h>
-#include <rdma/ib_umem.h>
-#include <rdma/ib_user_verbs.h>
-#include "c2.h"
-#include "c2_provider.h"
-#include "c2_user.h"
-
-static int c2_query_device(struct ib_device *ibdev, struct ib_device_attr *props,
-                          struct ib_udata *uhw)
-{
-       struct c2_dev *c2dev = to_c2dev(ibdev);
-
-       pr_debug("%s:%u\n", __func__, __LINE__);
-
-       if (uhw->inlen || uhw->outlen)
-               return -EINVAL;
-
-       *props = c2dev->props;
-       return 0;
-}
-
-static int c2_query_port(struct ib_device *ibdev,
-                        u8 port, struct ib_port_attr *props)
-{
-       pr_debug("%s:%u\n", __func__, __LINE__);
-
-       props->max_mtu = IB_MTU_4096;
-       props->lid = 0;
-       props->lmc = 0;
-       props->sm_lid = 0;
-       props->sm_sl = 0;
-       props->state = IB_PORT_ACTIVE;
-       props->phys_state = 0;
-       props->port_cap_flags =
-           IB_PORT_CM_SUP |
-           IB_PORT_REINIT_SUP |
-           IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP;
-       props->gid_tbl_len = 1;
-       props->pkey_tbl_len = 1;
-       props->qkey_viol_cntr = 0;
-       props->active_width = 1;
-       props->active_speed = IB_SPEED_SDR;
-
-       return 0;
-}
-
-static int c2_query_pkey(struct ib_device *ibdev,
-                        u8 port, u16 index, u16 * pkey)
-{
-       pr_debug("%s:%u\n", __func__, __LINE__);
-       *pkey = 0;
-       return 0;
-}
-
-static int c2_query_gid(struct ib_device *ibdev, u8 port,
-                       int index, union ib_gid *gid)
-{
-       struct c2_dev *c2dev = to_c2dev(ibdev);
-
-       pr_debug("%s:%u\n", __func__, __LINE__);
-       memset(&(gid->raw[0]), 0, sizeof(gid->raw));
-       memcpy(&(gid->raw[0]), c2dev->pseudo_netdev->dev_addr, 6);
-
-       return 0;
-}
-
-/* Allocate the user context data structure. This keeps track
- * of all objects associated with a particular user-mode client.
- */
-static struct ib_ucontext *c2_alloc_ucontext(struct ib_device *ibdev,
-                                            struct ib_udata *udata)
-{
-       struct c2_ucontext *context;
-
-       pr_debug("%s:%u\n", __func__, __LINE__);
-       context = kmalloc(sizeof(*context), GFP_KERNEL);
-       if (!context)
-               return ERR_PTR(-ENOMEM);
-
-       return &context->ibucontext;
-}
-
-static int c2_dealloc_ucontext(struct ib_ucontext *context)
-{
-       pr_debug("%s:%u\n", __func__, __LINE__);
-       kfree(context);
-       return 0;
-}
-
-static int c2_mmap_uar(struct ib_ucontext *context, struct vm_area_struct *vma)
-{
-       pr_debug("%s:%u\n", __func__, __LINE__);
-       return -ENOSYS;
-}
-
-static struct ib_pd *c2_alloc_pd(struct ib_device *ibdev,
-                                struct ib_ucontext *context,
-                                struct ib_udata *udata)
-{
-       struct c2_pd *pd;
-       int err;
-
-       pr_debug("%s:%u\n", __func__, __LINE__);
-
-       pd = kmalloc(sizeof(*pd), GFP_KERNEL);
-       if (!pd)
-               return ERR_PTR(-ENOMEM);
-
-       err = c2_pd_alloc(to_c2dev(ibdev), !context, pd);
-       if (err) {
-               kfree(pd);
-               return ERR_PTR(err);
-       }
-
-       if (context) {
-               if (ib_copy_to_udata(udata, &pd->pd_id, sizeof(__u32))) {
-                       c2_pd_free(to_c2dev(ibdev), pd);
-                       kfree(pd);
-                       return ERR_PTR(-EFAULT);
-               }
-       }
-
-       return &pd->ibpd;
-}
-
-static int c2_dealloc_pd(struct ib_pd *pd)
-{
-       pr_debug("%s:%u\n", __func__, __LINE__);
-       c2_pd_free(to_c2dev(pd->device), to_c2pd(pd));
-       kfree(pd);
-
-       return 0;
-}
-
-static struct ib_ah *c2_ah_create(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
-{
-       pr_debug("%s:%u\n", __func__, __LINE__);
-       return ERR_PTR(-ENOSYS);
-}
-
-static int c2_ah_destroy(struct ib_ah *ah)
-{
-       pr_debug("%s:%u\n", __func__, __LINE__);
-       return -ENOSYS;
-}
-
-static void c2_add_ref(struct ib_qp *ibqp)
-{
-       struct c2_qp *qp;
-       BUG_ON(!ibqp);
-       qp = to_c2qp(ibqp);
-       atomic_inc(&qp->refcount);
-}
-
-static void c2_rem_ref(struct ib_qp *ibqp)
-{
-       struct c2_qp *qp;
-       BUG_ON(!ibqp);
-       qp = to_c2qp(ibqp);
-       if (atomic_dec_and_test(&qp->refcount))
-               wake_up(&qp->wait);
-}
-
-struct ib_qp *c2_get_qp(struct ib_device *device, int qpn)
-{
-       struct c2_dev* c2dev = to_c2dev(device);
-       struct c2_qp *qp;
-
-       qp = c2_find_qpn(c2dev, qpn);
-       pr_debug("%s Returning QP=%p for QPN=%d, device=%p, refcount=%d\n",
-               __func__, qp, qpn, device,
-               (qp?atomic_read(&qp->refcount):0));
-
-       return (qp?&qp->ibqp:NULL);
-}
-
-static struct ib_qp *c2_create_qp(struct ib_pd *pd,
-                                 struct ib_qp_init_attr *init_attr,
-                                 struct ib_udata *udata)
-{
-       struct c2_qp *qp;
-       int err;
-
-       pr_debug("%s:%u\n", __func__, __LINE__);
-
-       if (init_attr->create_flags)
-               return ERR_PTR(-EINVAL);
-
-       switch (init_attr->qp_type) {
-       case IB_QPT_RC:
-               qp = kzalloc(sizeof(*qp), GFP_KERNEL);
-               if (!qp) {
-                       pr_debug("%s: Unable to allocate QP\n", __func__);
-                       return ERR_PTR(-ENOMEM);
-               }
-               spin_lock_init(&qp->lock);
-               if (pd->uobject) {
-                       /* userspace specific */
-               }
-
-               err = c2_alloc_qp(to_c2dev(pd->device),
-                                 to_c2pd(pd), init_attr, qp);
-
-               if (err && pd->uobject) {
-                       /* userspace specific */
-               }
-
-               break;
-       default:
-               pr_debug("%s: Invalid QP type: %d\n", __func__,
-                       init_attr->qp_type);
-               return ERR_PTR(-EINVAL);
-       }
-
-       if (err) {
-               kfree(qp);
-               return ERR_PTR(err);
-       }
-
-       return &qp->ibqp;
-}
-
-static int c2_destroy_qp(struct ib_qp *ib_qp)
-{
-       struct c2_qp *qp = to_c2qp(ib_qp);
-
-       pr_debug("%s:%u qp=%p,qp->state=%d\n",
-               __func__, __LINE__, ib_qp, qp->state);
-       c2_free_qp(to_c2dev(ib_qp->device), qp);
-       kfree(qp);
-       return 0;
-}
-
-static struct ib_cq *c2_create_cq(struct ib_device *ibdev,
-                                 const struct ib_cq_init_attr *attr,
-                                 struct ib_ucontext *context,
-                                 struct ib_udata *udata)
-{
-       int entries = attr->cqe;
-       struct c2_cq *cq;
-       int err;
-
-       if (attr->flags)
-               return ERR_PTR(-EINVAL);
-
-       cq = kmalloc(sizeof(*cq), GFP_KERNEL);
-       if (!cq) {
-               pr_debug("%s: Unable to allocate CQ\n", __func__);
-               return ERR_PTR(-ENOMEM);
-       }
-
-       err = c2_init_cq(to_c2dev(ibdev), entries, NULL, cq);
-       if (err) {
-               pr_debug("%s: error initializing CQ\n", __func__);
-               kfree(cq);
-               return ERR_PTR(err);
-       }
-
-       return &cq->ibcq;
-}
-
-static int c2_destroy_cq(struct ib_cq *ib_cq)
-{
-       struct c2_cq *cq = to_c2cq(ib_cq);
-
-       pr_debug("%s:%u\n", __func__, __LINE__);
-
-       c2_free_cq(to_c2dev(ib_cq->device), cq);
-       kfree(cq);
-
-       return 0;
-}
-
-static inline u32 c2_convert_access(int acc)
-{
-       return (acc & IB_ACCESS_REMOTE_WRITE ? C2_ACF_REMOTE_WRITE : 0) |
-           (acc & IB_ACCESS_REMOTE_READ ? C2_ACF_REMOTE_READ : 0) |
-           (acc & IB_ACCESS_LOCAL_WRITE ? C2_ACF_LOCAL_WRITE : 0) |
-           C2_ACF_LOCAL_READ | C2_ACF_WINDOW_BIND;
-}
-
-static struct ib_mr *c2_reg_phys_mr(struct ib_pd *ib_pd,
-                                   struct ib_phys_buf *buffer_list,
-                                   int num_phys_buf, int acc, u64 * iova_start)
-{
-       struct c2_mr *mr;
-       u64 *page_list;
-       u32 total_len;
-       int err, i, j, k, page_shift, pbl_depth;
-
-       pbl_depth = 0;
-       total_len = 0;
-
-       page_shift = PAGE_SHIFT;
-       /*
-        * If there is only one buffer, we assume this could be a map
-        * of all physical memory, so use a 32KB page size.
-        */
-       if (num_phys_buf == 1)
-               page_shift += 3;
-
-       for (i = 0; i < num_phys_buf; i++) {
-
-               if (buffer_list[i].addr & ~PAGE_MASK) {
-                       pr_debug("Unaligned Memory Buffer: 0x%x\n",
-                               (unsigned int) buffer_list[i].addr);
-                       return ERR_PTR(-EINVAL);
-               }
-
-               if (!buffer_list[i].size) {
-                       pr_debug("Invalid Buffer Size\n");
-                       return ERR_PTR(-EINVAL);
-               }
-
-               total_len += buffer_list[i].size;
-               pbl_depth += ALIGN(buffer_list[i].size,
-                                  (1 << page_shift)) >> page_shift;
-       }
-
-       page_list = vmalloc(sizeof(u64) * pbl_depth);
-       if (!page_list) {
-               pr_debug("couldn't vmalloc page_list of size %zd\n",
-                       (sizeof(u64) * pbl_depth));
-               return ERR_PTR(-ENOMEM);
-       }
-
-       for (i = 0, j = 0; i < num_phys_buf; i++) {
-
-               int naddrs;
-
-               naddrs = ALIGN(buffer_list[i].size,
-                              (1 << page_shift)) >> page_shift;
-               for (k = 0; k < naddrs; k++)
-                       page_list[j++] = (buffer_list[i].addr +
-                                                    (k << page_shift));
-       }
-
-       mr = kmalloc(sizeof(*mr), GFP_KERNEL);
-       if (!mr) {
-               vfree(page_list);
-               return ERR_PTR(-ENOMEM);
-       }
-
-       mr->pd = to_c2pd(ib_pd);
-       mr->umem = NULL;
-       pr_debug("%s - page shift %d, pbl_depth %d, total_len %u, "
-               "*iova_start %llx, first pa %llx, last pa %llx\n",
-               __func__, page_shift, pbl_depth, total_len,
-               (unsigned long long) *iova_start,
-               (unsigned long long) page_list[0],
-               (unsigned long long) page_list[pbl_depth-1]);
-       err = c2_nsmr_register_phys_kern(to_c2dev(ib_pd->device), page_list,
-                                        (1 << page_shift), pbl_depth,
-                                        total_len, 0, iova_start,
-                                        c2_convert_access(acc), mr);
-       vfree(page_list);
-       if (err) {
-               kfree(mr);
-               return ERR_PTR(err);
-       }
-
-       return &mr->ibmr;
-}
-
-static struct ib_mr *c2_get_dma_mr(struct ib_pd *pd, int acc)
-{
-       struct ib_phys_buf bl;
-       u64 kva = 0;
-
-       pr_debug("%s:%u\n", __func__, __LINE__);
-
-       /* AMSO1100 limit */
-       bl.size = 0xffffffff;
-       bl.addr = 0;
-       return c2_reg_phys_mr(pd, &bl, 1, acc, &kva);
-}
-
-static struct ib_mr *c2_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
-                                   u64 virt, int acc, struct ib_udata *udata)
-{
-       u64 *pages;
-       u64 kva = 0;
-       int shift, n, len;
-       int i, k, entry;
-       int err = 0;
-       struct scatterlist *sg;
-       struct c2_pd *c2pd = to_c2pd(pd);
-       struct c2_mr *c2mr;
-
-       pr_debug("%s:%u\n", __func__, __LINE__);
-
-       c2mr = kmalloc(sizeof(*c2mr), GFP_KERNEL);
-       if (!c2mr)
-               return ERR_PTR(-ENOMEM);
-       c2mr->pd = c2pd;
-
-       c2mr->umem = ib_umem_get(pd->uobject->context, start, length, acc, 0);
-       if (IS_ERR(c2mr->umem)) {
-               err = PTR_ERR(c2mr->umem);
-               kfree(c2mr);
-               return ERR_PTR(err);
-       }
-
-       shift = ffs(c2mr->umem->page_size) - 1;
-       n = c2mr->umem->nmap;
-
-       pages = kmalloc(n * sizeof(u64), GFP_KERNEL);
-       if (!pages) {
-               err = -ENOMEM;
-               goto err;
-       }
-
-       i = 0;
-       for_each_sg(c2mr->umem->sg_head.sgl, sg, c2mr->umem->nmap, entry) {
-               len = sg_dma_len(sg) >> shift;
-               for (k = 0; k < len; ++k) {
-                       pages[i++] =
-                               sg_dma_address(sg) +
-                               (c2mr->umem->page_size * k);
-               }
-       }
-
-       kva = virt;
-       err = c2_nsmr_register_phys_kern(to_c2dev(pd->device),
-                                        pages,
-                                        c2mr->umem->page_size,
-                                        i,
-                                        length,
-                                        ib_umem_offset(c2mr->umem),
-                                        &kva,
-                                        c2_convert_access(acc),
-                                        c2mr);
-       kfree(pages);
-       if (err)
-               goto err;
-       return &c2mr->ibmr;
-
-err:
-       ib_umem_release(c2mr->umem);
-       kfree(c2mr);
-       return ERR_PTR(err);
-}
-
-static int c2_dereg_mr(struct ib_mr *ib_mr)
-{
-       struct c2_mr *mr = to_c2mr(ib_mr);
-       int err;
-
-       pr_debug("%s:%u\n", __func__, __LINE__);
-
-       err = c2_stag_dealloc(to_c2dev(ib_mr->device), ib_mr->lkey);
-       if (err)
-               pr_debug("c2_stag_dealloc failed: %d\n", err);
-       else {
-               if (mr->umem)
-                       ib_umem_release(mr->umem);
-               kfree(mr);
-       }
-
-       return err;
-}
-
-static ssize_t show_rev(struct device *dev, struct device_attribute *attr,
-                       char *buf)
-{
-       struct c2_dev *c2dev = container_of(dev, struct c2_dev, ibdev.dev);
-       pr_debug("%s:%u\n", __func__, __LINE__);
-       return sprintf(buf, "%x\n", c2dev->props.hw_ver);
-}
-
-static ssize_t show_fw_ver(struct device *dev, struct device_attribute *attr,
-                          char *buf)
-{
-       struct c2_dev *c2dev = container_of(dev, struct c2_dev, ibdev.dev);
-       pr_debug("%s:%u\n", __func__, __LINE__);
-       return sprintf(buf, "%x.%x.%x\n",
-                      (int) (c2dev->props.fw_ver >> 32),
-                      (int) (c2dev->props.fw_ver >> 16) & 0xffff,
-                      (int) (c2dev->props.fw_ver & 0xffff));
-}
-
-static ssize_t show_hca(struct device *dev, struct device_attribute *attr,
-                       char *buf)
-{
-       pr_debug("%s:%u\n", __func__, __LINE__);
-       return sprintf(buf, "AMSO1100\n");
-}
-
-static ssize_t show_board(struct device *dev, struct device_attribute *attr,
-                         char *buf)
-{
-       pr_debug("%s:%u\n", __func__, __LINE__);
-       return sprintf(buf, "%.*s\n", 32, "AMSO1100 Board ID");
-}
-
-static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
-static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
-static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
-static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL);
-
-static struct device_attribute *c2_dev_attributes[] = {
-       &dev_attr_hw_rev,
-       &dev_attr_fw_ver,
-       &dev_attr_hca_type,
-       &dev_attr_board_id
-};
-
-static int c2_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
-                       int attr_mask, struct ib_udata *udata)
-{
-       int err;
-
-       err =
-           c2_qp_modify(to_c2dev(ibqp->device), to_c2qp(ibqp), attr,
-                        attr_mask);
-
-       return err;
-}
-
-static int c2_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
-{
-       pr_debug("%s:%u\n", __func__, __LINE__);
-       return -ENOSYS;
-}
-
-static int c2_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
-{
-       pr_debug("%s:%u\n", __func__, __LINE__);
-       return -ENOSYS;
-}
-
-static int c2_process_mad(struct ib_device *ibdev,
-                         int mad_flags,
-                         u8 port_num,
-                         const struct ib_wc *in_wc,
-                         const struct ib_grh *in_grh,
-                         const struct ib_mad_hdr *in_mad,
-                         size_t in_mad_size,
-                         struct ib_mad_hdr *out_mad,
-                         size_t *out_mad_size,
-                         u16 *out_mad_pkey_index)
-{
-       pr_debug("%s:%u\n", __func__, __LINE__);
-       return -ENOSYS;
-}
-
-static int c2_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
-{
-       pr_debug("%s:%u\n", __func__, __LINE__);
-
-       /* Request a connection */
-       return c2_llp_connect(cm_id, iw_param);
-}
-
-static int c2_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
-{
-       pr_debug("%s:%u\n", __func__, __LINE__);
-
-       /* Accept the new connection */
-       return c2_llp_accept(cm_id, iw_param);
-}
-
-static int c2_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len)
-{
-       int err;
-
-       pr_debug("%s:%u\n", __func__, __LINE__);
-
-       err = c2_llp_reject(cm_id, pdata, pdata_len);
-       return err;
-}
-
-static int c2_service_create(struct iw_cm_id *cm_id, int backlog)
-{
-       int err;
-
-       pr_debug("%s:%u\n", __func__, __LINE__);
-       err = c2_llp_service_create(cm_id, backlog);
-       pr_debug("%s:%u err=%d\n",
-               __func__, __LINE__,
-               err);
-       return err;
-}
-
-static int c2_service_destroy(struct iw_cm_id *cm_id)
-{
-       int err;
-       pr_debug("%s:%u\n", __func__, __LINE__);
-
-       err = c2_llp_service_destroy(cm_id);
-
-       return err;
-}
-
-static int c2_pseudo_up(struct net_device *netdev)
-{
-       struct in_device *ind;
-       struct c2_dev *c2dev = netdev->ml_priv;
-
-       ind = in_dev_get(netdev);
-       if (!ind)
-               return 0;
-
-       pr_debug("adding...\n");
-       for_ifa(ind) {
-#ifdef DEBUG
-               u8 *ip = (u8 *) & ifa->ifa_address;
-
-               pr_debug("%s: %d.%d.%d.%d\n",
-                      ifa->ifa_label, ip[0], ip[1], ip[2], ip[3]);
-#endif
-               c2_add_addr(c2dev, ifa->ifa_address, ifa->ifa_mask);
-       }
-       endfor_ifa(ind);
-       in_dev_put(ind);
-
-       return 0;
-}
-
-static int c2_pseudo_down(struct net_device *netdev)
-{
-       struct in_device *ind;
-       struct c2_dev *c2dev = netdev->ml_priv;
-
-       ind = in_dev_get(netdev);
-       if (!ind)
-               return 0;
-
-       pr_debug("deleting...\n");
-       for_ifa(ind) {
-#ifdef DEBUG
-               u8 *ip = (u8 *) & ifa->ifa_address;
-
-               pr_debug("%s: %d.%d.%d.%d\n",
-                      ifa->ifa_label, ip[0], ip[1], ip[2], ip[3]);
-#endif
-               c2_del_addr(c2dev, ifa->ifa_address, ifa->ifa_mask);
-       }
-       endfor_ifa(ind);
-       in_dev_put(ind);
-
-       return 0;
-}
-
-static int c2_pseudo_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
-{
-       kfree_skb(skb);
-       return NETDEV_TX_OK;
-}
-
-static int c2_pseudo_change_mtu(struct net_device *netdev, int new_mtu)
-{
-       if (new_mtu < ETH_ZLEN || new_mtu > ETH_JUMBO_MTU)
-               return -EINVAL;
-
-       netdev->mtu = new_mtu;
-
-       /* TODO: Tell rnic about new rdma interface mtu */
-       return 0;
-}
-
-static const struct net_device_ops c2_pseudo_netdev_ops = {
-       .ndo_open               = c2_pseudo_up,
-       .ndo_stop               = c2_pseudo_down,
-       .ndo_start_xmit         = c2_pseudo_xmit_frame,
-       .ndo_change_mtu         = c2_pseudo_change_mtu,
-       .ndo_validate_addr      = eth_validate_addr,
-};
-
-static void setup(struct net_device *netdev)
-{
-       netdev->netdev_ops = &c2_pseudo_netdev_ops;
-
-       netdev->watchdog_timeo = 0;
-       netdev->type = ARPHRD_ETHER;
-       netdev->mtu = 1500;
-       netdev->hard_header_len = ETH_HLEN;
-       netdev->addr_len = ETH_ALEN;
-       netdev->tx_queue_len = 0;
-       netdev->flags |= IFF_NOARP;
-}
-
-static struct net_device *c2_pseudo_netdev_init(struct c2_dev *c2dev)
-{
-       char name[IFNAMSIZ];
-       struct net_device *netdev;
-
-       /* change ethxxx to iwxxx */
-       strcpy(name, "iw");
-       strcat(name, &c2dev->netdev->name[3]);
-       netdev = alloc_netdev(0, name, NET_NAME_UNKNOWN, setup);
-       if (!netdev) {
-               printk(KERN_ERR PFX "%s -  etherdev alloc failed",
-                       __func__);
-               return NULL;
-       }
-
-       netdev->ml_priv = c2dev;
-
-       SET_NETDEV_DEV(netdev, &c2dev->pcidev->dev);
-
-       memcpy_fromio(netdev->dev_addr, c2dev->kva + C2_REGS_RDMA_ENADDR, 6);
-
-       /* Print out the MAC address */
-       pr_debug("%s: MAC %pM\n", netdev->name, netdev->dev_addr);
-
-#if 0
-       /* Disable network packets */
-       netif_stop_queue(netdev);
-#endif
-       return netdev;
-}
-
-static int c2_port_immutable(struct ib_device *ibdev, u8 port_num,
-                            struct ib_port_immutable *immutable)
-{
-       struct ib_port_attr attr;
-       int err;
-
-       err = c2_query_port(ibdev, port_num, &attr);
-       if (err)
-               return err;
-
-       immutable->pkey_tbl_len = attr.pkey_tbl_len;
-       immutable->gid_tbl_len = attr.gid_tbl_len;
-       immutable->core_cap_flags = RDMA_CORE_PORT_IWARP;
-
-       return 0;
-}
-
-int c2_register_device(struct c2_dev *dev)
-{
-       int ret = -ENOMEM;
-       int i;
-
-       /* Register pseudo network device */
-       dev->pseudo_netdev = c2_pseudo_netdev_init(dev);
-       if (!dev->pseudo_netdev)
-               goto out;
-
-       ret = register_netdev(dev->pseudo_netdev);
-       if (ret)
-               goto out_free_netdev;
-
-       pr_debug("%s:%u\n", __func__, __LINE__);
-       strlcpy(dev->ibdev.name, "amso%d", IB_DEVICE_NAME_MAX);
-       dev->ibdev.owner = THIS_MODULE;
-       dev->ibdev.uverbs_cmd_mask =
-           (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
-           (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
-           (1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
-           (1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
-           (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
-           (1ull << IB_USER_VERBS_CMD_REG_MR) |
-           (1ull << IB_USER_VERBS_CMD_DEREG_MR) |
-           (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
-           (1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
-           (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
-           (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) |
-           (1ull << IB_USER_VERBS_CMD_CREATE_QP) |
-           (1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
-           (1ull << IB_USER_VERBS_CMD_POLL_CQ) |
-           (1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
-           (1ull << IB_USER_VERBS_CMD_POST_SEND) |
-           (1ull << IB_USER_VERBS_CMD_POST_RECV);
-
-       dev->ibdev.node_type = RDMA_NODE_RNIC;
-       memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid));
-       memcpy(&dev->ibdev.node_guid, dev->pseudo_netdev->dev_addr, 6);
-       dev->ibdev.phys_port_cnt = 1;
-       dev->ibdev.num_comp_vectors = 1;
-       dev->ibdev.dma_device = &dev->pcidev->dev;
-       dev->ibdev.query_device = c2_query_device;
-       dev->ibdev.query_port = c2_query_port;
-       dev->ibdev.query_pkey = c2_query_pkey;
-       dev->ibdev.query_gid = c2_query_gid;
-       dev->ibdev.alloc_ucontext = c2_alloc_ucontext;
-       dev->ibdev.dealloc_ucontext = c2_dealloc_ucontext;
-       dev->ibdev.mmap = c2_mmap_uar;
-       dev->ibdev.alloc_pd = c2_alloc_pd;
-       dev->ibdev.dealloc_pd = c2_dealloc_pd;
-       dev->ibdev.create_ah = c2_ah_create;
-       dev->ibdev.destroy_ah = c2_ah_destroy;
-       dev->ibdev.create_qp = c2_create_qp;
-       dev->ibdev.modify_qp = c2_modify_qp;
-       dev->ibdev.destroy_qp = c2_destroy_qp;
-       dev->ibdev.create_cq = c2_create_cq;
-       dev->ibdev.destroy_cq = c2_destroy_cq;
-       dev->ibdev.poll_cq = c2_poll_cq;
-       dev->ibdev.get_dma_mr = c2_get_dma_mr;
-       dev->ibdev.reg_phys_mr = c2_reg_phys_mr;
-       dev->ibdev.reg_user_mr = c2_reg_user_mr;
-       dev->ibdev.dereg_mr = c2_dereg_mr;
-       dev->ibdev.get_port_immutable = c2_port_immutable;
-
-       dev->ibdev.alloc_fmr = NULL;
-       dev->ibdev.unmap_fmr = NULL;
-       dev->ibdev.dealloc_fmr = NULL;
-       dev->ibdev.map_phys_fmr = NULL;
-
-       dev->ibdev.attach_mcast = c2_multicast_attach;
-       dev->ibdev.detach_mcast = c2_multicast_detach;
-       dev->ibdev.process_mad = c2_process_mad;
-
-       dev->ibdev.req_notify_cq = c2_arm_cq;
-       dev->ibdev.post_send = c2_post_send;
-       dev->ibdev.post_recv = c2_post_receive;
-
-       dev->ibdev.iwcm = kmalloc(sizeof(*dev->ibdev.iwcm), GFP_KERNEL);
-       if (dev->ibdev.iwcm == NULL) {
-               ret = -ENOMEM;
-               goto out_unregister_netdev;
-       }
-       dev->ibdev.iwcm->add_ref = c2_add_ref;
-       dev->ibdev.iwcm->rem_ref = c2_rem_ref;
-       dev->ibdev.iwcm->get_qp = c2_get_qp;
-       dev->ibdev.iwcm->connect = c2_connect;
-       dev->ibdev.iwcm->accept = c2_accept;
-       dev->ibdev.iwcm->reject = c2_reject;
-       dev->ibdev.iwcm->create_listen = c2_service_create;
-       dev->ibdev.iwcm->destroy_listen = c2_service_destroy;
-
-       ret = ib_register_device(&dev->ibdev, NULL);
-       if (ret)
-               goto out_free_iwcm;
-
-       for (i = 0; i < ARRAY_SIZE(c2_dev_attributes); ++i) {
-               ret = device_create_file(&dev->ibdev.dev,
-                                              c2_dev_attributes[i]);
-               if (ret)
-                       goto out_unregister_ibdev;
-       }
-       goto out;
-
-out_unregister_ibdev:
-       ib_unregister_device(&dev->ibdev);
-out_free_iwcm:
-       kfree(dev->ibdev.iwcm);
-out_unregister_netdev:
-       unregister_netdev(dev->pseudo_netdev);
-out_free_netdev:
-       free_netdev(dev->pseudo_netdev);
-out:
-       pr_debug("%s:%u ret=%d\n", __func__, __LINE__, ret);
-       return ret;
-}
-
-void c2_unregister_device(struct c2_dev *dev)
-{
-       pr_debug("%s:%u\n", __func__, __LINE__);
-       unregister_netdev(dev->pseudo_netdev);
-       free_netdev(dev->pseudo_netdev);
-       ib_unregister_device(&dev->ibdev);
-}
diff --git a/drivers/infiniband/hw/amso1100/c2_provider.h b/drivers/infiniband/hw/amso1100/c2_provider.h
deleted file mode 100644 (file)
index bf18998..0000000
+++ /dev/null
@@ -1,182 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-#ifndef C2_PROVIDER_H
-#define C2_PROVIDER_H
-#include <linux/inetdevice.h>
-
-#include <rdma/ib_verbs.h>
-#include <rdma/ib_pack.h>
-
-#include "c2_mq.h"
-#include <rdma/iw_cm.h>
-
-#define C2_MPT_FLAG_ATOMIC        (1 << 14)
-#define C2_MPT_FLAG_REMOTE_WRITE  (1 << 13)
-#define C2_MPT_FLAG_REMOTE_READ   (1 << 12)
-#define C2_MPT_FLAG_LOCAL_WRITE   (1 << 11)
-#define C2_MPT_FLAG_LOCAL_READ    (1 << 10)
-
-struct c2_buf_list {
-       void *buf;
-       DEFINE_DMA_UNMAP_ADDR(mapping);
-};
-
-
-/* The user context keeps track of objects allocated for a
- * particular user-mode client. */
-struct c2_ucontext {
-       struct ib_ucontext ibucontext;
-};
-
-struct c2_mtt;
-
-/* All objects associated with a PD are kept in the
- * associated user context if present.
- */
-struct c2_pd {
-       struct ib_pd ibpd;
-       u32 pd_id;
-};
-
-struct c2_mr {
-       struct ib_mr ibmr;
-       struct c2_pd *pd;
-       struct ib_umem *umem;
-};
-
-struct c2_av;
-
-enum c2_ah_type {
-       C2_AH_ON_HCA,
-       C2_AH_PCI_POOL,
-       C2_AH_KMALLOC
-};
-
-struct c2_ah {
-       struct ib_ah ibah;
-};
-
-struct c2_cq {
-       struct ib_cq ibcq;
-       spinlock_t lock;
-       atomic_t refcount;
-       int cqn;
-       int is_kernel;
-       wait_queue_head_t wait;
-
-       u32 adapter_handle;
-       struct c2_mq mq;
-};
-
-struct c2_wq {
-       spinlock_t lock;
-};
-struct iw_cm_id;
-struct c2_qp {
-       struct ib_qp ibqp;
-       struct iw_cm_id *cm_id;
-       spinlock_t lock;
-       atomic_t refcount;
-       wait_queue_head_t wait;
-       int qpn;
-
-       u32 adapter_handle;
-       u32 send_sgl_depth;
-       u32 recv_sgl_depth;
-       u32 rdma_write_sgl_depth;
-       u8 state;
-
-       struct c2_mq sq_mq;
-       struct c2_mq rq_mq;
-};
-
-struct c2_cr_query_attrs {
-       u32 local_addr;
-       u32 remote_addr;
-       u16 local_port;
-       u16 remote_port;
-};
-
-static inline struct c2_pd *to_c2pd(struct ib_pd *ibpd)
-{
-       return container_of(ibpd, struct c2_pd, ibpd);
-}
-
-static inline struct c2_ucontext *to_c2ucontext(struct ib_ucontext *ibucontext)
-{
-       return container_of(ibucontext, struct c2_ucontext, ibucontext);
-}
-
-static inline struct c2_mr *to_c2mr(struct ib_mr *ibmr)
-{
-       return container_of(ibmr, struct c2_mr, ibmr);
-}
-
-
-static inline struct c2_ah *to_c2ah(struct ib_ah *ibah)
-{
-       return container_of(ibah, struct c2_ah, ibah);
-}
-
-static inline struct c2_cq *to_c2cq(struct ib_cq *ibcq)
-{
-       return container_of(ibcq, struct c2_cq, ibcq);
-}
-
-static inline struct c2_qp *to_c2qp(struct ib_qp *ibqp)
-{
-       return container_of(ibqp, struct c2_qp, ibqp);
-}
-
-static inline int is_rnic_addr(struct net_device *netdev, u32 addr)
-{
-       struct in_device *ind;
-       int ret = 0;
-
-       ind = in_dev_get(netdev);
-       if (!ind)
-               return 0;
-
-       for_ifa(ind) {
-               if (ifa->ifa_address == addr) {
-                       ret = 1;
-                       break;
-               }
-       }
-       endfor_ifa(ind);
-       in_dev_put(ind);
-       return ret;
-}
-#endif                         /* C2_PROVIDER_H */
diff --git a/drivers/infiniband/hw/amso1100/c2_qp.c b/drivers/infiniband/hw/amso1100/c2_qp.c
deleted file mode 100644 (file)
index 86708de..0000000
+++ /dev/null
@@ -1,1024 +0,0 @@
-/*
- * Copyright (c) 2004 Topspin Communications.  All rights reserved.
- * Copyright (c) 2005 Cisco Systems. All rights reserved.
- * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
- * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-#include <linux/delay.h>
-#include <linux/gfp.h>
-
-#include "c2.h"
-#include "c2_vq.h"
-#include "c2_status.h"
-
-#define C2_MAX_ORD_PER_QP 128
-#define C2_MAX_IRD_PER_QP 128
-
-#define C2_HINT_MAKE(q_index, hint_count) (((q_index) << 16) | hint_count)
-#define C2_HINT_GET_INDEX(hint) (((hint) & 0x7FFF0000) >> 16)
-#define C2_HINT_GET_COUNT(hint) ((hint) & 0x0000FFFF)
-
-#define NO_SUPPORT -1
-static const u8 c2_opcode[] = {
-       [IB_WR_SEND] = C2_WR_TYPE_SEND,
-       [IB_WR_SEND_WITH_IMM] = NO_SUPPORT,
-       [IB_WR_RDMA_WRITE] = C2_WR_TYPE_RDMA_WRITE,
-       [IB_WR_RDMA_WRITE_WITH_IMM] = NO_SUPPORT,
-       [IB_WR_RDMA_READ] = C2_WR_TYPE_RDMA_READ,
-       [IB_WR_ATOMIC_CMP_AND_SWP] = NO_SUPPORT,
-       [IB_WR_ATOMIC_FETCH_AND_ADD] = NO_SUPPORT,
-};
-
-static int to_c2_state(enum ib_qp_state ib_state)
-{
-       switch (ib_state) {
-       case IB_QPS_RESET:
-               return C2_QP_STATE_IDLE;
-       case IB_QPS_RTS:
-               return C2_QP_STATE_RTS;
-       case IB_QPS_SQD:
-               return C2_QP_STATE_CLOSING;
-       case IB_QPS_SQE:
-               return C2_QP_STATE_CLOSING;
-       case IB_QPS_ERR:
-               return C2_QP_STATE_ERROR;
-       default:
-               return -1;
-       }
-}
-
-static int to_ib_state(enum c2_qp_state c2_state)
-{
-       switch (c2_state) {
-       case C2_QP_STATE_IDLE:
-               return IB_QPS_RESET;
-       case C2_QP_STATE_CONNECTING:
-               return IB_QPS_RTR;
-       case C2_QP_STATE_RTS:
-               return IB_QPS_RTS;
-       case C2_QP_STATE_CLOSING:
-               return IB_QPS_SQD;
-       case C2_QP_STATE_ERROR:
-               return IB_QPS_ERR;
-       case C2_QP_STATE_TERMINATE:
-               return IB_QPS_SQE;
-       default:
-               return -1;
-       }
-}
-
-static const char *to_ib_state_str(int ib_state)
-{
-       static const char *state_str[] = {
-               "IB_QPS_RESET",
-               "IB_QPS_INIT",
-               "IB_QPS_RTR",
-               "IB_QPS_RTS",
-               "IB_QPS_SQD",
-               "IB_QPS_SQE",
-               "IB_QPS_ERR"
-       };
-       if (ib_state < IB_QPS_RESET ||
-           ib_state > IB_QPS_ERR)
-               return "<invalid IB QP state>";
-
-       ib_state -= IB_QPS_RESET;
-       return state_str[ib_state];
-}
-
-void c2_set_qp_state(struct c2_qp *qp, int c2_state)
-{
-       int new_state = to_ib_state(c2_state);
-
-       pr_debug("%s: qp[%p] state modify %s --> %s\n",
-              __func__,
-               qp,
-               to_ib_state_str(qp->state),
-               to_ib_state_str(new_state));
-       qp->state = new_state;
-}
-
-#define C2_QP_NO_ATTR_CHANGE 0xFFFFFFFF
-
-int c2_qp_modify(struct c2_dev *c2dev, struct c2_qp *qp,
-                struct ib_qp_attr *attr, int attr_mask)
-{
-       struct c2wr_qp_modify_req wr;
-       struct c2wr_qp_modify_rep *reply;
-       struct c2_vq_req *vq_req;
-       unsigned long flags;
-       u8 next_state;
-       int err;
-
-       pr_debug("%s:%d qp=%p, %s --> %s\n",
-               __func__, __LINE__,
-               qp,
-               to_ib_state_str(qp->state),
-               to_ib_state_str(attr->qp_state));
-
-       vq_req = vq_req_alloc(c2dev);
-       if (!vq_req)
-               return -ENOMEM;
-
-       c2_wr_set_id(&wr, CCWR_QP_MODIFY);
-       wr.hdr.context = (unsigned long) vq_req;
-       wr.rnic_handle = c2dev->adapter_handle;
-       wr.qp_handle = qp->adapter_handle;
-       wr.ord = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
-       wr.ird = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
-       wr.sq_depth = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
-       wr.rq_depth = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
-
-       if (attr_mask & IB_QP_STATE) {
-               /* Ensure the state is valid */
-               if (attr->qp_state < 0 || attr->qp_state > IB_QPS_ERR) {
-                       err = -EINVAL;
-                       goto bail0;
-               }
-
-               wr.next_qp_state = cpu_to_be32(to_c2_state(attr->qp_state));
-
-               if (attr->qp_state == IB_QPS_ERR) {
-                       spin_lock_irqsave(&qp->lock, flags);
-                       if (qp->cm_id && qp->state == IB_QPS_RTS) {
-                               pr_debug("Generating CLOSE event for QP-->ERR, "
-                                       "qp=%p, cm_id=%p\n",qp,qp->cm_id);
-                               /* Generate a CLOSE event */
-                               vq_req->cm_id = qp->cm_id;
-                               vq_req->event = IW_CM_EVENT_CLOSE;
-                       }
-                       spin_unlock_irqrestore(&qp->lock, flags);
-               }
-               next_state =  attr->qp_state;
-
-       } else if (attr_mask & IB_QP_CUR_STATE) {
-
-               if (attr->cur_qp_state != IB_QPS_RTR &&
-                   attr->cur_qp_state != IB_QPS_RTS &&
-                   attr->cur_qp_state != IB_QPS_SQD &&
-                   attr->cur_qp_state != IB_QPS_SQE) {
-                       err = -EINVAL;
-                       goto bail0;
-               } else
-                       wr.next_qp_state =
-                           cpu_to_be32(to_c2_state(attr->cur_qp_state));
-
-               next_state = attr->cur_qp_state;
-
-       } else {
-               err = 0;
-               goto bail0;
-       }
-
-       /* reference the request struct */
-       vq_req_get(c2dev, vq_req);
-
-       err = vq_send_wr(c2dev, (union c2wr *) & wr);
-       if (err) {
-               vq_req_put(c2dev, vq_req);
-               goto bail0;
-       }
-
-       err = vq_wait_for_reply(c2dev, vq_req);
-       if (err)
-               goto bail0;
-
-       reply = (struct c2wr_qp_modify_rep *) (unsigned long) vq_req->reply_msg;
-       if (!reply) {
-               err = -ENOMEM;
-               goto bail0;
-       }
-
-       err = c2_errno(reply);
-       if (!err)
-               qp->state = next_state;
-#ifdef DEBUG
-       else
-               pr_debug("%s: c2_errno=%d\n", __func__, err);
-#endif
-       /*
-        * If we are moving to the error state and generating the CLOSE
-        * event here, then we need to remove the cm_id reference because
-        * the adapter will not generate a close event of its own.
-        */
-       spin_lock_irqsave(&qp->lock, flags);
-       if (vq_req->event==IW_CM_EVENT_CLOSE && qp->cm_id) {
-               qp->cm_id->rem_ref(qp->cm_id);
-               qp->cm_id = NULL;
-       }
-       spin_unlock_irqrestore(&qp->lock, flags);
-
-       vq_repbuf_free(c2dev, reply);
-      bail0:
-       vq_req_free(c2dev, vq_req);
-
-       pr_debug("%s:%d qp=%p, cur_state=%s\n",
-               __func__, __LINE__,
-               qp,
-               to_ib_state_str(qp->state));
-       return err;
-}
-
-int c2_qp_set_read_limits(struct c2_dev *c2dev, struct c2_qp *qp,
-                         int ord, int ird)
-{
-       struct c2wr_qp_modify_req wr;
-       struct c2wr_qp_modify_rep *reply;
-       struct c2_vq_req *vq_req;
-       int err;
-
-       vq_req = vq_req_alloc(c2dev);
-       if (!vq_req)
-               return -ENOMEM;
-
-       c2_wr_set_id(&wr, CCWR_QP_MODIFY);
-       wr.hdr.context = (unsigned long) vq_req;
-       wr.rnic_handle = c2dev->adapter_handle;
-       wr.qp_handle = qp->adapter_handle;
-       wr.ord = cpu_to_be32(ord);
-       wr.ird = cpu_to_be32(ird);
-       wr.sq_depth = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
-       wr.rq_depth = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
-       wr.next_qp_state = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
-
-       /* reference the request struct */
-       vq_req_get(c2dev, vq_req);
-
-       err = vq_send_wr(c2dev, (union c2wr *) & wr);
-       if (err) {
-               vq_req_put(c2dev, vq_req);
-               goto bail0;
-       }
-
-       err = vq_wait_for_reply(c2dev, vq_req);
-       if (err)
-               goto bail0;
-
-       reply = (struct c2wr_qp_modify_rep *) (unsigned long)
-               vq_req->reply_msg;
-       if (!reply) {
-               err = -ENOMEM;
-               goto bail0;
-       }
-
-       err = c2_errno(reply);
-       vq_repbuf_free(c2dev, reply);
-      bail0:
-       vq_req_free(c2dev, vq_req);
-       return err;
-}
-
-static int destroy_qp(struct c2_dev *c2dev, struct c2_qp *qp)
-{
-       struct c2_vq_req *vq_req;
-       struct c2wr_qp_destroy_req wr;
-       struct c2wr_qp_destroy_rep *reply;
-       unsigned long flags;
-       int err;
-
-       /*
-        * Allocate a verb request message
-        */
-       vq_req = vq_req_alloc(c2dev);
-       if (!vq_req) {
-               return -ENOMEM;
-       }
-
-       /*
-        * Initialize the WR
-        */
-       c2_wr_set_id(&wr, CCWR_QP_DESTROY);
-       wr.hdr.context = (unsigned long) vq_req;
-       wr.rnic_handle = c2dev->adapter_handle;
-       wr.qp_handle = qp->adapter_handle;
-
-       /*
-        * Reference the request struct; it is dereferenced in the interrupt handler.
-        */
-       vq_req_get(c2dev, vq_req);
-
-       spin_lock_irqsave(&qp->lock, flags);
-       if (qp->cm_id && qp->state == IB_QPS_RTS) {
-               pr_debug("destroy_qp: generating CLOSE event for QP-->ERR, "
-                       "qp=%p, cm_id=%p\n",qp,qp->cm_id);
-               /* Generate a CLOSE event */
-               vq_req->qp = qp;
-               vq_req->cm_id = qp->cm_id;
-               vq_req->event = IW_CM_EVENT_CLOSE;
-       }
-       spin_unlock_irqrestore(&qp->lock, flags);
-
-       /*
-        * Send WR to adapter
-        */
-       err = vq_send_wr(c2dev, (union c2wr *) & wr);
-       if (err) {
-               vq_req_put(c2dev, vq_req);
-               goto bail0;
-       }
-
-       /*
-        * Wait for reply from adapter
-        */
-       err = vq_wait_for_reply(c2dev, vq_req);
-       if (err) {
-               goto bail0;
-       }
-
-       /*
-        * Process reply
-        */
-       reply = (struct c2wr_qp_destroy_rep *) (unsigned long) (vq_req->reply_msg);
-       if (!reply) {
-               err = -ENOMEM;
-               goto bail0;
-       }
-
-       spin_lock_irqsave(&qp->lock, flags);
-       if (qp->cm_id) {
-               qp->cm_id->rem_ref(qp->cm_id);
-               qp->cm_id = NULL;
-       }
-       spin_unlock_irqrestore(&qp->lock, flags);
-
-       vq_repbuf_free(c2dev, reply);
-      bail0:
-       vq_req_free(c2dev, vq_req);
-       return err;
-}
-
-static int c2_alloc_qpn(struct c2_dev *c2dev, struct c2_qp *qp)
-{
-       int ret;
-
-       idr_preload(GFP_KERNEL);
-       spin_lock_irq(&c2dev->qp_table.lock);
-
-       ret = idr_alloc_cyclic(&c2dev->qp_table.idr, qp, 0, 0, GFP_NOWAIT);
-       if (ret >= 0)
-               qp->qpn = ret;
-
-       spin_unlock_irq(&c2dev->qp_table.lock);
-       idr_preload_end();
-       return ret < 0 ? ret : 0;
-}
-
-static void c2_free_qpn(struct c2_dev *c2dev, int qpn)
-{
-       spin_lock_irq(&c2dev->qp_table.lock);
-       idr_remove(&c2dev->qp_table.idr, qpn);
-       spin_unlock_irq(&c2dev->qp_table.lock);
-}
-
-struct c2_qp *c2_find_qpn(struct c2_dev *c2dev, int qpn)
-{
-       unsigned long flags;
-       struct c2_qp *qp;
-
-       spin_lock_irqsave(&c2dev->qp_table.lock, flags);
-       qp = idr_find(&c2dev->qp_table.idr, qpn);
-       spin_unlock_irqrestore(&c2dev->qp_table.lock, flags);
-       return qp;
-}
-
-int c2_alloc_qp(struct c2_dev *c2dev,
-               struct c2_pd *pd,
-               struct ib_qp_init_attr *qp_attrs, struct c2_qp *qp)
-{
-       struct c2wr_qp_create_req wr;
-       struct c2wr_qp_create_rep *reply;
-       struct c2_vq_req *vq_req;
-       struct c2_cq *send_cq = to_c2cq(qp_attrs->send_cq);
-       struct c2_cq *recv_cq = to_c2cq(qp_attrs->recv_cq);
-       unsigned long peer_pa;
-       u32 q_size, msg_size, mmap_size;
-       void __iomem *mmap;
-       int err;
-
-       err = c2_alloc_qpn(c2dev, qp);
-       if (err)
-               return err;
-       qp->ibqp.qp_num = qp->qpn;
-       qp->ibqp.qp_type = IB_QPT_RC;
-
-       /* Allocate the SQ and RQ shared pointers */
-       qp->sq_mq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
-                                        &qp->sq_mq.shared_dma, GFP_KERNEL);
-       if (!qp->sq_mq.shared) {
-               err = -ENOMEM;
-               goto bail0;
-       }
-
-       qp->rq_mq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
-                                        &qp->rq_mq.shared_dma, GFP_KERNEL);
-       if (!qp->rq_mq.shared) {
-               err = -ENOMEM;
-               goto bail1;
-       }
-
-       /* Allocate the verbs request */
-       vq_req = vq_req_alloc(c2dev);
-       if (vq_req == NULL) {
-               err = -ENOMEM;
-               goto bail2;
-       }
-
-       /* Initialize the work request */
-       memset(&wr, 0, sizeof(wr));
-       c2_wr_set_id(&wr, CCWR_QP_CREATE);
-       wr.hdr.context = (unsigned long) vq_req;
-       wr.rnic_handle = c2dev->adapter_handle;
-       wr.sq_cq_handle = send_cq->adapter_handle;
-       wr.rq_cq_handle = recv_cq->adapter_handle;
-       wr.sq_depth = cpu_to_be32(qp_attrs->cap.max_send_wr + 1);
-       wr.rq_depth = cpu_to_be32(qp_attrs->cap.max_recv_wr + 1);
-       wr.srq_handle = 0;
-       wr.flags = cpu_to_be32(QP_RDMA_READ | QP_RDMA_WRITE | QP_MW_BIND |
-                              QP_ZERO_STAG | QP_RDMA_READ_RESPONSE);
-       wr.send_sgl_depth = cpu_to_be32(qp_attrs->cap.max_send_sge);
-       wr.recv_sgl_depth = cpu_to_be32(qp_attrs->cap.max_recv_sge);
-       wr.rdma_write_sgl_depth = cpu_to_be32(qp_attrs->cap.max_send_sge);
-       wr.shared_sq_ht = cpu_to_be64(qp->sq_mq.shared_dma);
-       wr.shared_rq_ht = cpu_to_be64(qp->rq_mq.shared_dma);
-       wr.ord = cpu_to_be32(C2_MAX_ORD_PER_QP);
-       wr.ird = cpu_to_be32(C2_MAX_IRD_PER_QP);
-       wr.pd_id = pd->pd_id;
-       wr.user_context = (unsigned long) qp;
-
-       vq_req_get(c2dev, vq_req);
-
-       /* Send the WR to the adapter */
-       err = vq_send_wr(c2dev, (union c2wr *) & wr);
-       if (err) {
-               vq_req_put(c2dev, vq_req);
-               goto bail3;
-       }
-
-       /* Wait for the verb reply  */
-       err = vq_wait_for_reply(c2dev, vq_req);
-       if (err) {
-               goto bail3;
-       }
-
-       /* Process the reply */
-       reply = (struct c2wr_qp_create_rep *) (unsigned long) (vq_req->reply_msg);
-       if (!reply) {
-               err = -ENOMEM;
-               goto bail3;
-       }
-
-       if ((err = c2_wr_get_result(reply)) != 0) {
-               goto bail4;
-       }
-
-       /* Fill in the kernel QP struct */
-       atomic_set(&qp->refcount, 1);
-       qp->adapter_handle = reply->qp_handle;
-       qp->state = IB_QPS_RESET;
-       qp->send_sgl_depth = qp_attrs->cap.max_send_sge;
-       qp->rdma_write_sgl_depth = qp_attrs->cap.max_send_sge;
-       qp->recv_sgl_depth = qp_attrs->cap.max_recv_sge;
-       init_waitqueue_head(&qp->wait);
-
-       /* Initialize the SQ MQ */
-       q_size = be32_to_cpu(reply->sq_depth);
-       msg_size = be32_to_cpu(reply->sq_msg_size);
-       peer_pa = c2dev->pa + be32_to_cpu(reply->sq_mq_start);
-       mmap_size = PAGE_ALIGN(sizeof(struct c2_mq_shared) + msg_size * q_size);
-       mmap = ioremap_nocache(peer_pa, mmap_size);
-       if (!mmap) {
-               err = -ENOMEM;
-               goto bail5;
-       }
-
-       c2_mq_req_init(&qp->sq_mq,
-                      be32_to_cpu(reply->sq_mq_index),
-                      q_size,
-                      msg_size,
-                      mmap + sizeof(struct c2_mq_shared),      /* pool start */
-                      mmap,                            /* peer */
-                      C2_MQ_ADAPTER_TARGET);
-
-       /* Initialize the RQ mq */
-       q_size = be32_to_cpu(reply->rq_depth);
-       msg_size = be32_to_cpu(reply->rq_msg_size);
-       peer_pa = c2dev->pa + be32_to_cpu(reply->rq_mq_start);
-       mmap_size = PAGE_ALIGN(sizeof(struct c2_mq_shared) + msg_size * q_size);
-       mmap = ioremap_nocache(peer_pa, mmap_size);
-       if (!mmap) {
-               err = -ENOMEM;
-               goto bail6;
-       }
-
-       c2_mq_req_init(&qp->rq_mq,
-                      be32_to_cpu(reply->rq_mq_index),
-                      q_size,
-                      msg_size,
-                      mmap + sizeof(struct c2_mq_shared),      /* pool start */
-                      mmap,                            /* peer */
-                      C2_MQ_ADAPTER_TARGET);
-
-       vq_repbuf_free(c2dev, reply);
-       vq_req_free(c2dev, vq_req);
-
-       return 0;
-
-      bail6:
-       iounmap(qp->sq_mq.peer);
-      bail5:
-       destroy_qp(c2dev, qp);
-      bail4:
-       vq_repbuf_free(c2dev, reply);
-      bail3:
-       vq_req_free(c2dev, vq_req);
-      bail2:
-       c2_free_mqsp(qp->rq_mq.shared);
-      bail1:
-       c2_free_mqsp(qp->sq_mq.shared);
-      bail0:
-       c2_free_qpn(c2dev, qp->qpn);
-       return err;
-}
-
-static inline void c2_lock_cqs(struct c2_cq *send_cq, struct c2_cq *recv_cq)
-{
-       if (send_cq == recv_cq)
-               spin_lock_irq(&send_cq->lock);
-       else if (send_cq > recv_cq) {
-               spin_lock_irq(&send_cq->lock);
-               spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING);
-       } else {
-               spin_lock_irq(&recv_cq->lock);
-               spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING);
-       }
-}
-
-static inline void c2_unlock_cqs(struct c2_cq *send_cq, struct c2_cq *recv_cq)
-{
-       if (send_cq == recv_cq)
-               spin_unlock_irq(&send_cq->lock);
-       else if (send_cq > recv_cq) {
-               spin_unlock(&recv_cq->lock);
-               spin_unlock_irq(&send_cq->lock);
-       } else {
-               spin_unlock(&send_cq->lock);
-               spin_unlock_irq(&recv_cq->lock);
-       }
-}
-
-void c2_free_qp(struct c2_dev *c2dev, struct c2_qp *qp)
-{
-       struct c2_cq *send_cq;
-       struct c2_cq *recv_cq;
-
-       send_cq = to_c2cq(qp->ibqp.send_cq);
-       recv_cq = to_c2cq(qp->ibqp.recv_cq);
-
-       /*
-        * Lock CQs here, so that CQ polling code can do QP lookup
-        * without taking a lock.
-        */
-       c2_lock_cqs(send_cq, recv_cq);
-       c2_free_qpn(c2dev, qp->qpn);
-       c2_unlock_cqs(send_cq, recv_cq);
-
-       /*
-        * Destroy qp in the rnic...
-        */
-       destroy_qp(c2dev, qp);
-
-       /*
-        * Mark any unreaped CQEs as null and void.
-        */
-       c2_cq_clean(c2dev, qp, send_cq->cqn);
-       if (send_cq != recv_cq)
-               c2_cq_clean(c2dev, qp, recv_cq->cqn);
-       /*
-        * Unmap the MQs and return the shared pointers
-        * to the message pool.
-        */
-       iounmap(qp->sq_mq.peer);
-       iounmap(qp->rq_mq.peer);
-       c2_free_mqsp(qp->sq_mq.shared);
-       c2_free_mqsp(qp->rq_mq.shared);
-
-       atomic_dec(&qp->refcount);
-       wait_event(qp->wait, !atomic_read(&qp->refcount));
-}
-
-/*
- * Function: move_sgl
- *
- * Description:
- * Move an SGL from the user's work request struct into a CCIL Work Request
- * message, swapping to WR byte order and ensuring the total length doesn't
- * overflow.
- *
- * IN:
- * dst         - ptr to CCIL Work Request message SGL memory.
- * src         - ptr to the consumer's SGL memory.
- *
- * OUT: none
- *
- * Return:
- * CCIL status codes.
- */
-static int
-move_sgl(struct c2_data_addr * dst, struct ib_sge *src, int count, u32 * p_len,
-        u8 * actual_count)
-{
-       u32 tot = 0;            /* running total */
-       u8 acount = 0;          /* running total non-0 len sge's */
-
-       while (count > 0) {
-               /*
-                * If the addition of this SGE causes the
-                * total SGL length to exceed 2^32-1, then
-                * fail-n-bail.
-                *
-                * If the current total plus the next element length
-                * wraps around (unsigned overflow), the result will be less
-                * than the current total...
-                */
-               if ((tot + src->length) < tot) {
-                       return -EINVAL;
-               }
-               /*
-                * Bug: 1456 (as well as 1498 & 1643)
-                * Skip over any sge's supplied with len=0
-                */
-               if (src->length) {
-                       tot += src->length;
-                       dst->stag = cpu_to_be32(src->lkey);
-                       dst->to = cpu_to_be64(src->addr);
-                       dst->length = cpu_to_be32(src->length);
-                       dst++;
-                       acount++;
-               }
-               src++;
-               count--;
-       }
-
-       if (acount == 0) {
-               /*
-                * Bug: 1476 (as well as 1498, 1456 and 1643)
-                * Setup the SGL in the WR to make it easier for the RNIC.
-                * This way, the FW doesn't have to deal with special cases.
-                * Setting length=0 should be sufficient.
-                */
-               dst->stag = 0;
-               dst->to = 0;
-               dst->length = 0;
-       }
-
-       *p_len = tot;
-       *actual_count = acount;
-       return 0;
-}
-
-/*
- * Function: c2_activity (private function)
- *
- * Description:
- * Post an mq index to the host->adapter activity fifo.
- *
- * IN:
- * c2dev       - ptr to c2dev structure
- * mq_index    - mq index to post
- * shared      - value most recently written to shared
- *
- * OUT:
- *
- * Return:
- * none
- */
-static inline void c2_activity(struct c2_dev *c2dev, u32 mq_index, u16 shared)
-{
-       /*
-        * First read the register to see if the FIFO is full, and if so,
-        * spin until it's not.  This isn't perfect -- there is no
-        * synchronization among the clients of the register, but in
-        * practice it prevents multiple CPUs from hammering the bus
-        * with PCI RETRY. Note that when this does happen, the card
-        * cannot get on the bus and the card and system hang in a
-        * deadlock -- thus the need for this code. [TOT]
-        */
-       while (readl(c2dev->regs + PCI_BAR0_ADAPTER_HINT) & 0x80000000)
-               udelay(10);
-
-       __raw_writel(C2_HINT_MAKE(mq_index, shared),
-                    c2dev->regs + PCI_BAR0_ADAPTER_HINT);
-}
-
-/*
- * Function: qp_wr_post
- *
- * Description:
- * This in-line function allocates a MQ msg, then moves the host-copy of
- * the completed WR into msg.  Then it posts the message.
- *
- * IN:
- * q           - ptr to user MQ.
- * wr          - ptr to host-copy of the WR.
- * qp          - ptr to user qp
- * size                - Number of bytes to post.  Assumed to be divisible by 4.
- *
- * OUT: none
- *
- * Return:
- * CCIL status codes.
- */
-static int qp_wr_post(struct c2_mq *q, union c2wr * wr, struct c2_qp *qp, u32 size)
-{
-       union c2wr *msg;
-
-       msg = c2_mq_alloc(q);
-       if (msg == NULL) {
-               return -EINVAL;
-       }
-#ifdef CCMSGMAGIC
-       ((c2wr_hdr_t *) wr)->magic = cpu_to_be32(CCWR_MAGIC);
-#endif
-
-       /*
-        * Since all header fields in the WR are the same as the
-        * CQE, set the following so the adapter need not.
-        */
-       c2_wr_set_result(wr, CCERR_PENDING);
-
-       /*
-        * Copy the wr down to the adapter
-        */
-       memcpy((void *) msg, (void *) wr, size);
-
-       c2_mq_produce(q);
-       return 0;
-}
-
-
-int c2_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr,
-                struct ib_send_wr **bad_wr)
-{
-       struct c2_dev *c2dev = to_c2dev(ibqp->device);
-       struct c2_qp *qp = to_c2qp(ibqp);
-       union c2wr wr;
-       unsigned long lock_flags;
-       int err = 0;
-
-       u32 flags;
-       u32 tot_len;
-       u8 actual_sge_count;
-       u32 msg_size;
-
-       if (qp->state > IB_QPS_RTS) {
-               err = -EINVAL;
-               goto out;
-       }
-
-       while (ib_wr) {
-
-               flags = 0;
-               wr.sqwr.sq_hdr.user_hdr.hdr.context = ib_wr->wr_id;
-               if (ib_wr->send_flags & IB_SEND_SIGNALED) {
-                       flags |= SQ_SIGNALED;
-               }
-
-               switch (ib_wr->opcode) {
-               case IB_WR_SEND:
-               case IB_WR_SEND_WITH_INV:
-                       if (ib_wr->opcode == IB_WR_SEND) {
-                               if (ib_wr->send_flags & IB_SEND_SOLICITED)
-                                       c2_wr_set_id(&wr, C2_WR_TYPE_SEND_SE);
-                               else
-                                       c2_wr_set_id(&wr, C2_WR_TYPE_SEND);
-                               wr.sqwr.send.remote_stag = 0;
-                       } else {
-                               if (ib_wr->send_flags & IB_SEND_SOLICITED)
-                                       c2_wr_set_id(&wr, C2_WR_TYPE_SEND_SE_INV);
-                               else
-                                       c2_wr_set_id(&wr, C2_WR_TYPE_SEND_INV);
-                               wr.sqwr.send.remote_stag =
-                                       cpu_to_be32(ib_wr->ex.invalidate_rkey);
-                       }
-
-                       msg_size = sizeof(struct c2wr_send_req) +
-                               sizeof(struct c2_data_addr) * ib_wr->num_sge;
-                       if (ib_wr->num_sge > qp->send_sgl_depth) {
-                               err = -EINVAL;
-                               break;
-                       }
-                       if (ib_wr->send_flags & IB_SEND_FENCE) {
-                               flags |= SQ_READ_FENCE;
-                       }
-                       err = move_sgl((struct c2_data_addr *) & (wr.sqwr.send.data),
-                                      ib_wr->sg_list,
-                                      ib_wr->num_sge,
-                                      &tot_len, &actual_sge_count);
-                       wr.sqwr.send.sge_len = cpu_to_be32(tot_len);
-                       c2_wr_set_sge_count(&wr, actual_sge_count);
-                       break;
-               case IB_WR_RDMA_WRITE:
-                       c2_wr_set_id(&wr, C2_WR_TYPE_RDMA_WRITE);
-                       msg_size = sizeof(struct c2wr_rdma_write_req) +
-                           (sizeof(struct c2_data_addr) * ib_wr->num_sge);
-                       if (ib_wr->num_sge > qp->rdma_write_sgl_depth) {
-                               err = -EINVAL;
-                               break;
-                       }
-                       if (ib_wr->send_flags & IB_SEND_FENCE) {
-                               flags |= SQ_READ_FENCE;
-                       }
-                       wr.sqwr.rdma_write.remote_stag =
-                           cpu_to_be32(ib_wr->wr.rdma.rkey);
-                       wr.sqwr.rdma_write.remote_to =
-                           cpu_to_be64(ib_wr->wr.rdma.remote_addr);
-                       err = move_sgl((struct c2_data_addr *)
-                                      & (wr.sqwr.rdma_write.data),
-                                      ib_wr->sg_list,
-                                      ib_wr->num_sge,
-                                      &tot_len, &actual_sge_count);
-                       wr.sqwr.rdma_write.sge_len = cpu_to_be32(tot_len);
-                       c2_wr_set_sge_count(&wr, actual_sge_count);
-                       break;
-               case IB_WR_RDMA_READ:
-                       c2_wr_set_id(&wr, C2_WR_TYPE_RDMA_READ);
-                       msg_size = sizeof(struct c2wr_rdma_read_req);
-
-                       /* iWARP only supports 1 sge for RDMA reads */
-                       if (ib_wr->num_sge > 1) {
-                               err = -EINVAL;
-                               break;
-                       }
-
-                       /*
-                        * Move the local and remote stag/to/len into the WR.
-                        */
-                       wr.sqwr.rdma_read.local_stag =
-                           cpu_to_be32(ib_wr->sg_list->lkey);
-                       wr.sqwr.rdma_read.local_to =
-                           cpu_to_be64(ib_wr->sg_list->addr);
-                       wr.sqwr.rdma_read.remote_stag =
-                           cpu_to_be32(ib_wr->wr.rdma.rkey);
-                       wr.sqwr.rdma_read.remote_to =
-                           cpu_to_be64(ib_wr->wr.rdma.remote_addr);
-                       wr.sqwr.rdma_read.length =
-                           cpu_to_be32(ib_wr->sg_list->length);
-                       break;
-               default:
-                       /* error */
-                       msg_size = 0;
-                       err = -EINVAL;
-                       break;
-               }
-
-               /*
-                * If we had an error on the last wr build, then
-                * break out.  Possible errors include bogus WR
-                * type, and a bogus SGL length...
-                */
-               if (err) {
-                       break;
-               }
-
-               /*
-                * Store flags
-                */
-               c2_wr_set_flags(&wr, flags);
-
-               /*
-                * Post the puppy!
-                */
-               spin_lock_irqsave(&qp->lock, lock_flags);
-               err = qp_wr_post(&qp->sq_mq, &wr, qp, msg_size);
-               if (err) {
-                       spin_unlock_irqrestore(&qp->lock, lock_flags);
-                       break;
-               }
-
-               /*
-                * Enqueue mq index to activity FIFO.
-                */
-               c2_activity(c2dev, qp->sq_mq.index, qp->sq_mq.hint_count);
-               spin_unlock_irqrestore(&qp->lock, lock_flags);
-
-               ib_wr = ib_wr->next;
-       }
-
-out:
-       if (err)
-               *bad_wr = ib_wr;
-       return err;
-}
-
-int c2_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *ib_wr,
-                   struct ib_recv_wr **bad_wr)
-{
-       struct c2_dev *c2dev = to_c2dev(ibqp->device);
-       struct c2_qp *qp = to_c2qp(ibqp);
-       union c2wr wr;
-       unsigned long lock_flags;
-       int err = 0;
-
-       if (qp->state > IB_QPS_RTS) {
-               err = -EINVAL;
-               goto out;
-       }
-
-       /*
-        * Try and post each work request
-        */
-       while (ib_wr) {
-               u32 tot_len;
-               u8 actual_sge_count;
-
-               if (ib_wr->num_sge > qp->recv_sgl_depth) {
-                       err = -EINVAL;
-                       break;
-               }
-
-               /*
-                * Create local host-copy of the WR
-                */
-               wr.rqwr.rq_hdr.user_hdr.hdr.context = ib_wr->wr_id;
-               c2_wr_set_id(&wr, CCWR_RECV);
-               c2_wr_set_flags(&wr, 0);
-
-               /* sge_count is limited to eight bits. */
-               BUG_ON(ib_wr->num_sge >= 256);
-               err = move_sgl((struct c2_data_addr *) & (wr.rqwr.data),
-                              ib_wr->sg_list,
-                              ib_wr->num_sge, &tot_len, &actual_sge_count);
-               c2_wr_set_sge_count(&wr, actual_sge_count);
-
-               /*
-                * If we had an error on the last wr build, then
-                * break out.  Possible errors include bogus WR
-                * type, and a bogus SGL length...
-                */
-               if (err) {
-                       break;
-               }
-
-               spin_lock_irqsave(&qp->lock, lock_flags);
-               err = qp_wr_post(&qp->rq_mq, &wr, qp, qp->rq_mq.msg_size);
-               if (err) {
-                       spin_unlock_irqrestore(&qp->lock, lock_flags);
-                       break;
-               }
-
-               /*
-                * Enqueue mq index to activity FIFO
-                */
-               c2_activity(c2dev, qp->rq_mq.index, qp->rq_mq.hint_count);
-               spin_unlock_irqrestore(&qp->lock, lock_flags);
-
-               ib_wr = ib_wr->next;
-       }
-
-out:
-       if (err)
-               *bad_wr = ib_wr;
-       return err;
-}
-
-void c2_init_qp_table(struct c2_dev *c2dev)
-{
-       spin_lock_init(&c2dev->qp_table.lock);
-       idr_init(&c2dev->qp_table.idr);
-}
-
-void c2_cleanup_qp_table(struct c2_dev *c2dev)
-{
-       idr_destroy(&c2dev->qp_table.idr);
-}
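For reference, the c2_post_send() path deleted above is reached through the generic ib_post_send() verbs entry point. A minimal sketch of a caller, assuming a kernel of this era, an already-connected RC QP and a registered buffer; post_one_send(), buf_dma, buf_len and lkey are illustrative names, not part of the original driver:

#include <rdma/ib_verbs.h>

/*
 * Sketch only: post one signalled SEND work request.  The QP, the DMA
 * address and the lkey are assumed to come from earlier QP creation
 * and memory registration steps.
 */
static int post_one_send(struct ib_qp *qp, u64 buf_dma, u32 buf_len, u32 lkey)
{
	struct ib_sge sge = {
		.addr   = buf_dma,	/* DMA address of a registered buffer */
		.length = buf_len,
		.lkey   = lkey,
	};
	struct ib_send_wr wr = {
		.wr_id      = 1,			/* echoed back in the completion */
		.sg_list    = &sge,
		.num_sge    = 1,
		.opcode     = IB_WR_SEND,
		.send_flags = IB_SEND_SIGNALED,		/* ask for a CQE */
	};
	struct ib_send_wr *bad_wr;

	/* For an amso1100 device this dispatches to c2_post_send() above. */
	return ib_post_send(qp, &wr, &bad_wr);
}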
diff --git a/drivers/infiniband/hw/amso1100/c2_rnic.c b/drivers/infiniband/hw/amso1100/c2_rnic.c
deleted file mode 100644 (file)
index d2a6d96..0000000
+++ /dev/null
@@ -1,655 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/pci.h>
-#include <linux/netdevice.h>
-#include <linux/etherdevice.h>
-#include <linux/delay.h>
-#include <linux/ethtool.h>
-#include <linux/mii.h>
-#include <linux/if_vlan.h>
-#include <linux/crc32.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/tcp.h>
-#include <linux/init.h>
-#include <linux/dma-mapping.h>
-#include <linux/mm.h>
-#include <linux/inet.h>
-#include <linux/vmalloc.h>
-#include <linux/slab.h>
-
-#include <linux/route.h>
-
-#include <asm/io.h>
-#include <asm/irq.h>
-#include <asm/byteorder.h>
-#include <rdma/ib_smi.h>
-#include "c2.h"
-#include "c2_vq.h"
-
-/* Device capabilities */
-#define C2_MIN_PAGESIZE  1024
-
-#define C2_MAX_MRS       32768
-#define C2_MAX_QPS       16000
-#define C2_MAX_WQE_SZ    256
-#define C2_MAX_QP_WR     ((128*1024)/C2_MAX_WQE_SZ)
-#define C2_MAX_SGES      4
-#define C2_MAX_SGE_RD    1
-#define C2_MAX_CQS       32768
-#define C2_MAX_CQES      4096
-#define C2_MAX_PDS       16384
-
-/*
- * Send the adapter INIT message to the amso1100
- */
-static int c2_adapter_init(struct c2_dev *c2dev)
-{
-       struct c2wr_init_req wr;
-       int err;
-
-       memset(&wr, 0, sizeof(wr));
-       c2_wr_set_id(&wr, CCWR_INIT);
-       wr.hdr.context = 0;
-       wr.hint_count = cpu_to_be64(c2dev->hint_count_dma);
-       wr.q0_host_shared = cpu_to_be64(c2dev->req_vq.shared_dma);
-       wr.q1_host_shared = cpu_to_be64(c2dev->rep_vq.shared_dma);
-       wr.q1_host_msg_pool = cpu_to_be64(c2dev->rep_vq.host_dma);
-       wr.q2_host_shared = cpu_to_be64(c2dev->aeq.shared_dma);
-       wr.q2_host_msg_pool = cpu_to_be64(c2dev->aeq.host_dma);
-
-       /* Post the init message */
-       err = vq_send_wr(c2dev, (union c2wr *) & wr);
-
-       return err;
-}
-
-/*
- * Send the adapter TERM message to the amso1100
- */
-static void c2_adapter_term(struct c2_dev *c2dev)
-{
-       struct c2wr_init_req wr;
-
-       memset(&wr, 0, sizeof(wr));
-       c2_wr_set_id(&wr, CCWR_TERM);
-       wr.hdr.context = 0;
-
-       /* Post the term message */
-       vq_send_wr(c2dev, (union c2wr *) & wr);
-       c2dev->init = 0;
-
-       return;
-}
-
-/*
- * Query the adapter
- */
-static int c2_rnic_query(struct c2_dev *c2dev, struct ib_device_attr *props)
-{
-       struct c2_vq_req *vq_req;
-       struct c2wr_rnic_query_req wr;
-       struct c2wr_rnic_query_rep *reply;
-       int err;
-
-       vq_req = vq_req_alloc(c2dev);
-       if (!vq_req)
-               return -ENOMEM;
-
-       c2_wr_set_id(&wr, CCWR_RNIC_QUERY);
-       wr.hdr.context = (unsigned long) vq_req;
-       wr.rnic_handle = c2dev->adapter_handle;
-
-       vq_req_get(c2dev, vq_req);
-
-       err = vq_send_wr(c2dev, (union c2wr *) &wr);
-       if (err) {
-               vq_req_put(c2dev, vq_req);
-               goto bail1;
-       }
-
-       err = vq_wait_for_reply(c2dev, vq_req);
-       if (err)
-               goto bail1;
-
-       reply =
-           (struct c2wr_rnic_query_rep *) (unsigned long) (vq_req->reply_msg);
-       if (!reply)
-               err = -ENOMEM;
-       else
-               err = c2_errno(reply);
-       if (err)
-               goto bail2;
-
-       props->fw_ver =
-               ((u64)be32_to_cpu(reply->fw_ver_major) << 32) |
-               ((be32_to_cpu(reply->fw_ver_minor) & 0xFFFF) << 16) |
-               (be32_to_cpu(reply->fw_ver_patch) & 0xFFFF);
-       memcpy(&props->sys_image_guid, c2dev->netdev->dev_addr, 6);
-       props->max_mr_size         = 0xFFFFFFFF;
-       props->page_size_cap       = ~(C2_MIN_PAGESIZE-1);
-       props->vendor_id           = be32_to_cpu(reply->vendor_id);
-       props->vendor_part_id      = be32_to_cpu(reply->part_number);
-       props->hw_ver              = be32_to_cpu(reply->hw_version);
-       props->max_qp              = be32_to_cpu(reply->max_qps);
-       props->max_qp_wr           = be32_to_cpu(reply->max_qp_depth);
-       props->device_cap_flags    = c2dev->device_cap_flags;
-       props->max_sge             = C2_MAX_SGES;
-       props->max_sge_rd          = C2_MAX_SGE_RD;
-       props->max_cq              = be32_to_cpu(reply->max_cqs);
-       props->max_cqe             = be32_to_cpu(reply->max_cq_depth);
-       props->max_mr              = be32_to_cpu(reply->max_mrs);
-       props->max_pd              = be32_to_cpu(reply->max_pds);
-       props->max_qp_rd_atom      = be32_to_cpu(reply->max_qp_ird);
-       props->max_ee_rd_atom      = 0;
-       props->max_res_rd_atom     = be32_to_cpu(reply->max_global_ird);
-       props->max_qp_init_rd_atom = be32_to_cpu(reply->max_qp_ord);
-       props->max_ee_init_rd_atom = 0;
-       props->atomic_cap          = IB_ATOMIC_NONE;
-       props->max_ee              = 0;
-       props->max_rdd             = 0;
-       props->max_mw              = be32_to_cpu(reply->max_mws);
-       props->max_raw_ipv6_qp     = 0;
-       props->max_raw_ethy_qp     = 0;
-       props->max_mcast_grp       = 0;
-       props->max_mcast_qp_attach = 0;
-       props->max_total_mcast_qp_attach = 0;
-       props->max_ah              = 0;
-       props->max_fmr             = 0;
-       props->max_map_per_fmr     = 0;
-       props->max_srq             = 0;
-       props->max_srq_wr          = 0;
-       props->max_srq_sge         = 0;
-       props->max_pkeys           = 0;
-       props->local_ca_ack_delay  = 0;
-
- bail2:
-       vq_repbuf_free(c2dev, reply);
-
- bail1:
-       vq_req_free(c2dev, vq_req);
-       return err;
-}
-
-/*
- * Add an IP address to the RNIC interface
- */
-int c2_add_addr(struct c2_dev *c2dev, __be32 inaddr, __be32 inmask)
-{
-       struct c2_vq_req *vq_req;
-       struct c2wr_rnic_setconfig_req *wr;
-       struct c2wr_rnic_setconfig_rep *reply;
-       struct c2_netaddr netaddr;
-       int err, len;
-
-       vq_req = vq_req_alloc(c2dev);
-       if (!vq_req)
-               return -ENOMEM;
-
-       len = sizeof(struct c2_netaddr);
-       wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
-       if (!wr) {
-               err = -ENOMEM;
-               goto bail0;
-       }
-
-       c2_wr_set_id(wr, CCWR_RNIC_SETCONFIG);
-       wr->hdr.context = (unsigned long) vq_req;
-       wr->rnic_handle = c2dev->adapter_handle;
-       wr->option = cpu_to_be32(C2_CFG_ADD_ADDR);
-
-       netaddr.ip_addr = inaddr;
-       netaddr.netmask = inmask;
-       netaddr.mtu = 0;
-
-       memcpy(wr->data, &netaddr, len);
-
-       vq_req_get(c2dev, vq_req);
-
-       err = vq_send_wr(c2dev, (union c2wr *) wr);
-       if (err) {
-               vq_req_put(c2dev, vq_req);
-               goto bail1;
-       }
-
-       err = vq_wait_for_reply(c2dev, vq_req);
-       if (err)
-               goto bail1;
-
-       reply =
-           (struct c2wr_rnic_setconfig_rep *) (unsigned long) (vq_req->reply_msg);
-       if (!reply) {
-               err = -ENOMEM;
-               goto bail1;
-       }
-
-       err = c2_errno(reply);
-       vq_repbuf_free(c2dev, reply);
-
-      bail1:
-       kfree(wr);
-      bail0:
-       vq_req_free(c2dev, vq_req);
-       return err;
-}
-
-/*
- * Delete an IP address from the RNIC interface
- */
-int c2_del_addr(struct c2_dev *c2dev, __be32 inaddr, __be32 inmask)
-{
-       struct c2_vq_req *vq_req;
-       struct c2wr_rnic_setconfig_req *wr;
-       struct c2wr_rnic_setconfig_rep *reply;
-       struct c2_netaddr netaddr;
-       int err, len;
-
-       vq_req = vq_req_alloc(c2dev);
-       if (!vq_req)
-               return -ENOMEM;
-
-       len = sizeof(struct c2_netaddr);
-       wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
-       if (!wr) {
-               err = -ENOMEM;
-               goto bail0;
-       }
-
-       c2_wr_set_id(wr, CCWR_RNIC_SETCONFIG);
-       wr->hdr.context = (unsigned long) vq_req;
-       wr->rnic_handle = c2dev->adapter_handle;
-       wr->option = cpu_to_be32(C2_CFG_DEL_ADDR);
-
-       netaddr.ip_addr = inaddr;
-       netaddr.netmask = inmask;
-       netaddr.mtu = 0;
-
-       memcpy(wr->data, &netaddr, len);
-
-       vq_req_get(c2dev, vq_req);
-
-       err = vq_send_wr(c2dev, (union c2wr *) wr);
-       if (err) {
-               vq_req_put(c2dev, vq_req);
-               goto bail1;
-       }
-
-       err = vq_wait_for_reply(c2dev, vq_req);
-       if (err)
-               goto bail1;
-
-       reply =
-           (struct c2wr_rnic_setconfig_rep *) (unsigned long) (vq_req->reply_msg);
-       if (!reply) {
-               err = -ENOMEM;
-               goto bail1;
-       }
-
-       err = c2_errno(reply);
-       vq_repbuf_free(c2dev, reply);
-
-      bail1:
-       kfree(wr);
-      bail0:
-       vq_req_free(c2dev, vq_req);
-       return err;
-}
-
-/*
- * Open a single RNIC instance to use with all
- * low level openib calls
- */
-static int c2_rnic_open(struct c2_dev *c2dev)
-{
-       struct c2_vq_req *vq_req;
-       union c2wr wr;
-       struct c2wr_rnic_open_rep *reply;
-       int err;
-
-       vq_req = vq_req_alloc(c2dev);
-       if (vq_req == NULL) {
-               return -ENOMEM;
-       }
-
-       memset(&wr, 0, sizeof(wr));
-       c2_wr_set_id(&wr, CCWR_RNIC_OPEN);
-       wr.rnic_open.req.hdr.context = (unsigned long) (vq_req);
-       wr.rnic_open.req.flags = cpu_to_be16(RNIC_PRIV_MODE);
-       wr.rnic_open.req.port_num = cpu_to_be16(0);
-       wr.rnic_open.req.user_context = (unsigned long) c2dev;
-
-       vq_req_get(c2dev, vq_req);
-
-       err = vq_send_wr(c2dev, &wr);
-       if (err) {
-               vq_req_put(c2dev, vq_req);
-               goto bail0;
-       }
-
-       err = vq_wait_for_reply(c2dev, vq_req);
-       if (err) {
-               goto bail0;
-       }
-
-       reply = (struct c2wr_rnic_open_rep *) (unsigned long) (vq_req->reply_msg);
-       if (!reply) {
-               err = -ENOMEM;
-               goto bail0;
-       }
-
-       if ((err = c2_errno(reply)) != 0) {
-               goto bail1;
-       }
-
-       c2dev->adapter_handle = reply->rnic_handle;
-
-      bail1:
-       vq_repbuf_free(c2dev, reply);
-      bail0:
-       vq_req_free(c2dev, vq_req);
-       return err;
-}
-
-/*
- * Close the RNIC instance
- */
-static int c2_rnic_close(struct c2_dev *c2dev)
-{
-       struct c2_vq_req *vq_req;
-       union c2wr wr;
-       struct c2wr_rnic_close_rep *reply;
-       int err;
-
-       vq_req = vq_req_alloc(c2dev);
-       if (vq_req == NULL) {
-               return -ENOMEM;
-       }
-
-       memset(&wr, 0, sizeof(wr));
-       c2_wr_set_id(&wr, CCWR_RNIC_CLOSE);
-       wr.rnic_close.req.hdr.context = (unsigned long) vq_req;
-       wr.rnic_close.req.rnic_handle = c2dev->adapter_handle;
-
-       vq_req_get(c2dev, vq_req);
-
-       err = vq_send_wr(c2dev, &wr);
-       if (err) {
-               vq_req_put(c2dev, vq_req);
-               goto bail0;
-       }
-
-       err = vq_wait_for_reply(c2dev, vq_req);
-       if (err) {
-               goto bail0;
-       }
-
-       reply = (struct c2wr_rnic_close_rep *) (unsigned long) (vq_req->reply_msg);
-       if (!reply) {
-               err = -ENOMEM;
-               goto bail0;
-       }
-
-       if ((err = c2_errno(reply)) != 0) {
-               goto bail1;
-       }
-
-       c2dev->adapter_handle = 0;
-
-      bail1:
-       vq_repbuf_free(c2dev, reply);
-      bail0:
-       vq_req_free(c2dev, vq_req);
-       return err;
-}
-
-/*
- * Called by c2_probe to initialize the RNIC. This principally
- * involves initializing the various limits and resource pools that
- * comprise the RNIC instance.
- */
-int c2_rnic_init(struct c2_dev *c2dev)
-{
-       int err;
-       u32 qsize, msgsize;
-       void *q1_pages;
-       void *q2_pages;
-       void __iomem *mmio_regs;
-
-       /* Device capabilities */
-       c2dev->device_cap_flags =
-           (IB_DEVICE_RESIZE_MAX_WR |
-            IB_DEVICE_CURR_QP_STATE_MOD |
-            IB_DEVICE_SYS_IMAGE_GUID |
-            IB_DEVICE_LOCAL_DMA_LKEY |
-            IB_DEVICE_MEM_WINDOW);
-
-       /* Allocate the qptr_array */
-       c2dev->qptr_array = vzalloc(C2_MAX_CQS * sizeof(void *));
-       if (!c2dev->qptr_array) {
-               return -ENOMEM;
-       }
-
-       /* Initialize the qptr_array */
-       c2dev->qptr_array[0] = (void *) &c2dev->req_vq;
-       c2dev->qptr_array[1] = (void *) &c2dev->rep_vq;
-       c2dev->qptr_array[2] = (void *) &c2dev->aeq;
-
-       /* Initialize data structures */
-       init_waitqueue_head(&c2dev->req_vq_wo);
-       spin_lock_init(&c2dev->vqlock);
-       spin_lock_init(&c2dev->lock);
-
-       /* Allocate MQ shared pointer pool for kernel clients. User
-        * mode client pools are hung off the user context
-        */
-       err = c2_init_mqsp_pool(c2dev, GFP_KERNEL, &c2dev->kern_mqsp_pool);
-       if (err) {
-               goto bail0;
-       }
-
-       /* Allocate shared pointers for Q0, Q1, and Q2 from
-        * the shared pointer pool.
-        */
-
-       c2dev->hint_count = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
-                                            &c2dev->hint_count_dma,
-                                            GFP_KERNEL);
-       c2dev->req_vq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
-                                            &c2dev->req_vq.shared_dma,
-                                            GFP_KERNEL);
-       c2dev->rep_vq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
-                                            &c2dev->rep_vq.shared_dma,
-                                            GFP_KERNEL);
-       c2dev->aeq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
-                                         &c2dev->aeq.shared_dma, GFP_KERNEL);
-       if (!c2dev->hint_count || !c2dev->req_vq.shared ||
-           !c2dev->rep_vq.shared || !c2dev->aeq.shared) {
-               err = -ENOMEM;
-               goto bail1;
-       }
-
-       mmio_regs = c2dev->kva;
-       /* Initialize the Verbs Request Queue */
-       c2_mq_req_init(&c2dev->req_vq, 0,
-                      be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q0_QSIZE)),
-                      be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q0_MSGSIZE)),
-                      mmio_regs +
-                      be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q0_POOLSTART)),
-                      mmio_regs +
-                      be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q0_SHARED)),
-                      C2_MQ_ADAPTER_TARGET);
-
-       /* Initialize the Verbs Reply Queue */
-       qsize = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q1_QSIZE));
-       msgsize = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q1_MSGSIZE));
-       q1_pages = dma_alloc_coherent(&c2dev->pcidev->dev, qsize * msgsize,
-                                     &c2dev->rep_vq.host_dma, GFP_KERNEL);
-       if (!q1_pages) {
-               err = -ENOMEM;
-               goto bail1;
-       }
-       dma_unmap_addr_set(&c2dev->rep_vq, mapping, c2dev->rep_vq.host_dma);
-       pr_debug("%s rep_vq va %p dma %llx\n", __func__, q1_pages,
-                (unsigned long long) c2dev->rep_vq.host_dma);
-       c2_mq_rep_init(&c2dev->rep_vq,
-                  1,
-                  qsize,
-                  msgsize,
-                  q1_pages,
-                  mmio_regs +
-                  be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q1_SHARED)),
-                  C2_MQ_HOST_TARGET);
-
-       /* Initialize the Asynchronous Event Queue */
-       qsize = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q2_QSIZE));
-       msgsize = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q2_MSGSIZE));
-       q2_pages = dma_alloc_coherent(&c2dev->pcidev->dev, qsize * msgsize,
-                                     &c2dev->aeq.host_dma, GFP_KERNEL);
-       if (!q2_pages) {
-               err = -ENOMEM;
-               goto bail2;
-       }
-       dma_unmap_addr_set(&c2dev->aeq, mapping, c2dev->aeq.host_dma);
-       pr_debug("%s aeq va %p dma %llx\n", __func__, q2_pages,
-                (unsigned long long) c2dev->aeq.host_dma);
-       c2_mq_rep_init(&c2dev->aeq,
-                      2,
-                      qsize,
-                      msgsize,
-                      q2_pages,
-                      mmio_regs +
-                      be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q2_SHARED)),
-                      C2_MQ_HOST_TARGET);
-
-       /* Initialize the verbs request allocator */
-       err = vq_init(c2dev);
-       if (err)
-               goto bail3;
-
-       /* Enable interrupts on the adapter */
-       writel(0, c2dev->regs + C2_IDIS);
-
-       /* create the WR init message */
-       err = c2_adapter_init(c2dev);
-       if (err)
-               goto bail4;
-       c2dev->init++;
-
-       /* open an adapter instance */
-       err = c2_rnic_open(c2dev);
-       if (err)
-               goto bail4;
-
-       /* Initialize the cached adapter limits */
-       err = c2_rnic_query(c2dev, &c2dev->props);
-       if (err)
-               goto bail5;
-
-       /* Initialize the PD pool */
-       err = c2_init_pd_table(c2dev);
-       if (err)
-               goto bail5;
-
-       /* Initialize the QP pool */
-       c2_init_qp_table(c2dev);
-       return 0;
-
-      bail5:
-       c2_rnic_close(c2dev);
-      bail4:
-       vq_term(c2dev);
-      bail3:
-       dma_free_coherent(&c2dev->pcidev->dev,
-                         c2dev->aeq.q_size * c2dev->aeq.msg_size,
-                         q2_pages, dma_unmap_addr(&c2dev->aeq, mapping));
-      bail2:
-       dma_free_coherent(&c2dev->pcidev->dev,
-                         c2dev->rep_vq.q_size * c2dev->rep_vq.msg_size,
-                         q1_pages, dma_unmap_addr(&c2dev->rep_vq, mapping));
-      bail1:
-       c2_free_mqsp_pool(c2dev, c2dev->kern_mqsp_pool);
-      bail0:
-       vfree(c2dev->qptr_array);
-
-       return err;
-}
-
-/*
- * Called by c2_remove to cleanup the RNIC resources.
- */
-void c2_rnic_term(struct c2_dev *c2dev)
-{
-
-       /* Close the open adapter instance */
-       c2_rnic_close(c2dev);
-
-       /* Send the TERM message to the adapter */
-       c2_adapter_term(c2dev);
-
-       /* Disable interrupts on the adapter */
-       writel(1, c2dev->regs + C2_IDIS);
-
-       /* Free the QP pool */
-       c2_cleanup_qp_table(c2dev);
-
-       /* Free the PD pool */
-       c2_cleanup_pd_table(c2dev);
-
-       /* Free the verbs request allocator */
-       vq_term(c2dev);
-
-       /* Free the asynchronous event queue */
-       dma_free_coherent(&c2dev->pcidev->dev,
-                         c2dev->aeq.q_size * c2dev->aeq.msg_size,
-                         c2dev->aeq.msg_pool.host,
-                         dma_unmap_addr(&c2dev->aeq, mapping));
-
-       /* Free the verbs reply queue */
-       dma_free_coherent(&c2dev->pcidev->dev,
-                         c2dev->rep_vq.q_size * c2dev->rep_vq.msg_size,
-                         c2dev->rep_vq.msg_pool.host,
-                         dma_unmap_addr(&c2dev->rep_vq, mapping));
-
-       /* Free the MQ shared pointer pool */
-       c2_free_mqsp_pool(c2dev, c2dev->kern_mqsp_pool);
-
-       /* Free the qptr_array */
-       vfree(c2dev->qptr_array);
-
-       return;
-}
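c2_rnic_query() above packs the firmware version into the 64-bit props->fw_ver as major:minor:patch (32/16/16 bits). A small illustrative decoder for that layout, not part of the original driver (c2_decode_fw_ver is a hypothetical helper):

#include <linux/types.h>

/* Unpack the fw_ver layout built by c2_rnic_query() above. */
static void c2_decode_fw_ver(u64 fw_ver, u32 *major, u16 *minor, u16 *patch)
{
	*major = fw_ver >> 32;
	*minor = (fw_ver >> 16) & 0xFFFF;
	*patch = fw_ver & 0xFFFF;
}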
diff --git a/drivers/infiniband/hw/amso1100/c2_status.h b/drivers/infiniband/hw/amso1100/c2_status.h
deleted file mode 100644 (file)
index 6ee4aa9..0000000
+++ /dev/null
@@ -1,158 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef        _C2_STATUS_H_
-#define _C2_STATUS_H_
-
-/*
- * Verbs Status Codes
- */
-enum c2_status {
-       C2_OK = 0,              /* This must be zero */
-       CCERR_INSUFFICIENT_RESOURCES = 1,
-       CCERR_INVALID_MODIFIER = 2,
-       CCERR_INVALID_MODE = 3,
-       CCERR_IN_USE = 4,
-       CCERR_INVALID_RNIC = 5,
-       CCERR_INTERRUPTED_OPERATION = 6,
-       CCERR_INVALID_EH = 7,
-       CCERR_INVALID_CQ = 8,
-       CCERR_CQ_EMPTY = 9,
-       CCERR_NOT_IMPLEMENTED = 10,
-       CCERR_CQ_DEPTH_TOO_SMALL = 11,
-       CCERR_PD_IN_USE = 12,
-       CCERR_INVALID_PD = 13,
-       CCERR_INVALID_SRQ = 14,
-       CCERR_INVALID_ADDRESS = 15,
-       CCERR_INVALID_NETMASK = 16,
-       CCERR_INVALID_QP = 17,
-       CCERR_INVALID_QP_STATE = 18,
-       CCERR_TOO_MANY_WRS_POSTED = 19,
-       CCERR_INVALID_WR_TYPE = 20,
-       CCERR_INVALID_SGL_LENGTH = 21,
-       CCERR_INVALID_SQ_DEPTH = 22,
-       CCERR_INVALID_RQ_DEPTH = 23,
-       CCERR_INVALID_ORD = 24,
-       CCERR_INVALID_IRD = 25,
-       CCERR_QP_ATTR_CANNOT_CHANGE = 26,
-       CCERR_INVALID_STAG = 27,
-       CCERR_QP_IN_USE = 28,
-       CCERR_OUTSTANDING_WRS = 29,
-       CCERR_STAG_IN_USE = 30,
-       CCERR_INVALID_STAG_INDEX = 31,
-       CCERR_INVALID_SGL_FORMAT = 32,
-       CCERR_ADAPTER_TIMEOUT = 33,
-       CCERR_INVALID_CQ_DEPTH = 34,
-       CCERR_INVALID_PRIVATE_DATA_LENGTH = 35,
-       CCERR_INVALID_EP = 36,
-       CCERR_MR_IN_USE = CCERR_STAG_IN_USE,
-       CCERR_FLUSHED = 38,
-       CCERR_INVALID_WQE = 39,
-       CCERR_LOCAL_QP_CATASTROPHIC_ERROR = 40,
-       CCERR_REMOTE_TERMINATION_ERROR = 41,
-       CCERR_BASE_AND_BOUNDS_VIOLATION = 42,
-       CCERR_ACCESS_VIOLATION = 43,
-       CCERR_INVALID_PD_ID = 44,
-       CCERR_WRAP_ERROR = 45,
-       CCERR_INV_STAG_ACCESS_ERROR = 46,
-       CCERR_ZERO_RDMA_READ_RESOURCES = 47,
-       CCERR_QP_NOT_PRIVILEGED = 48,
-       CCERR_STAG_STATE_NOT_INVALID = 49,
-       CCERR_INVALID_PAGE_SIZE = 50,
-       CCERR_INVALID_BUFFER_SIZE = 51,
-       CCERR_INVALID_PBE = 52,
-       CCERR_INVALID_FBO = 53,
-       CCERR_INVALID_LENGTH = 54,
-       CCERR_INVALID_ACCESS_RIGHTS = 55,
-       CCERR_PBL_TOO_BIG = 56,
-       CCERR_INVALID_VA = 57,
-       CCERR_INVALID_REGION = 58,
-       CCERR_INVALID_WINDOW = 59,
-       CCERR_TOTAL_LENGTH_TOO_BIG = 60,
-       CCERR_INVALID_QP_ID = 61,
-       CCERR_ADDR_IN_USE = 62,
-       CCERR_ADDR_NOT_AVAIL = 63,
-       CCERR_NET_DOWN = 64,
-       CCERR_NET_UNREACHABLE = 65,
-       CCERR_CONN_ABORTED = 66,
-       CCERR_CONN_RESET = 67,
-       CCERR_NO_BUFS = 68,
-       CCERR_CONN_TIMEDOUT = 69,
-       CCERR_CONN_REFUSED = 70,
-       CCERR_HOST_UNREACHABLE = 71,
-       CCERR_INVALID_SEND_SGL_DEPTH = 72,
-       CCERR_INVALID_RECV_SGL_DEPTH = 73,
-       CCERR_INVALID_RDMA_WRITE_SGL_DEPTH = 74,
-       CCERR_INSUFFICIENT_PRIVILEGES = 75,
-       CCERR_STACK_ERROR = 76,
-       CCERR_INVALID_VERSION = 77,
-       CCERR_INVALID_MTU = 78,
-       CCERR_INVALID_IMAGE = 79,
-       CCERR_PENDING = 98,     /* not an error; used internally by adapter */
-       CCERR_DEFER = 99,       /* not an error; used internally by adapter */
-       CCERR_FAILED_WRITE = 100,
-       CCERR_FAILED_ERASE = 101,
-       CCERR_FAILED_VERIFICATION = 102,
-       CCERR_NOT_FOUND = 103,
-
-};
-
-/*
- * CCAE_ACTIVE_CONNECT_RESULTS status result codes.
- */
-enum c2_connect_status {
-       C2_CONN_STATUS_SUCCESS = C2_OK,
-       C2_CONN_STATUS_NO_MEM = CCERR_INSUFFICIENT_RESOURCES,
-       C2_CONN_STATUS_TIMEDOUT = CCERR_CONN_TIMEDOUT,
-       C2_CONN_STATUS_REFUSED = CCERR_CONN_REFUSED,
-       C2_CONN_STATUS_NETUNREACH = CCERR_NET_UNREACHABLE,
-       C2_CONN_STATUS_HOSTUNREACH = CCERR_HOST_UNREACHABLE,
-       C2_CONN_STATUS_INVALID_RNIC = CCERR_INVALID_RNIC,
-       C2_CONN_STATUS_INVALID_QP = CCERR_INVALID_QP,
-       C2_CONN_STATUS_INVALID_QP_STATE = CCERR_INVALID_QP_STATE,
-       C2_CONN_STATUS_REJECTED = CCERR_CONN_RESET,
-       C2_CONN_STATUS_ADDR_NOT_AVAIL = CCERR_ADDR_NOT_AVAIL,
-};
-
-/*
- * Flash programming status codes.
- */
-enum c2_flash_status {
-       C2_FLASH_STATUS_SUCCESS = 0x0000,
-       C2_FLASH_STATUS_VERIFY_ERR = 0x0002,
-       C2_FLASH_STATUS_IMAGE_ERR = 0x0004,
-       C2_FLASH_STATUS_ECLBS = 0x0400,
-       C2_FLASH_STATUS_PSLBS = 0x0800,
-       C2_FLASH_STATUS_VPENS = 0x1000,
-};
-
-#endif                         /* _C2_STATUS_H_ */
diff --git a/drivers/infiniband/hw/amso1100/c2_user.h b/drivers/infiniband/hw/amso1100/c2_user.h
deleted file mode 100644 (file)
index 7e9e7ad..0000000
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2005 Topspin Communications.  All rights reserved.
- * Copyright (c) 2005 Cisco Systems.  All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-#ifndef C2_USER_H
-#define C2_USER_H
-
-#include <linux/types.h>
-
-/*
- * Make sure that all structs defined in this file remain laid out so
- * that they pack the same way on 32-bit and 64-bit architectures (to
- * avoid incompatibility between 32-bit userspace and 64-bit kernels).
- * In particular do not use pointer types -- pass pointers in __u64
- * instead.
- */
-
-struct c2_alloc_ucontext_resp {
-       __u32 qp_tab_size;
-       __u32 uarc_size;
-};
-
-struct c2_alloc_pd_resp {
-       __u32 pdn;
-       __u32 reserved;
-};
-
-struct c2_create_cq {
-       __u32 lkey;
-       __u32 pdn;
-       __u64 arm_db_page;
-       __u64 set_db_page;
-       __u32 arm_db_index;
-       __u32 set_db_index;
-};
-
-struct c2_create_cq_resp {
-       __u32 cqn;
-       __u32 reserved;
-};
-
-struct c2_create_qp {
-       __u32 lkey;
-       __u32 reserved;
-       __u64 sq_db_page;
-       __u64 rq_db_page;
-       __u32 sq_db_index;
-       __u32 rq_db_index;
-};
-
-#endif                         /* C2_USER_H */
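The layout rule in the comment above (pass pointers in __u64 fields, never as pointer types) is the usual way to keep a uapi struct identical for 32-bit and 64-bit userspace. A generic illustration of the userspace side, not taken from the amso1100 library (ptr_to_u64 is a hypothetical helper):

#include <stdint.h>
#include <linux/types.h>

/* Pack a userspace pointer into a fixed-width __u64 ABI field. */
static inline __u64 ptr_to_u64(const void *p)
{
	return (__u64)(uintptr_t)p;
}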
diff --git a/drivers/infiniband/hw/amso1100/c2_vq.c b/drivers/infiniband/hw/amso1100/c2_vq.c
deleted file mode 100644 (file)
index 2ec716f..0000000
+++ /dev/null
@@ -1,260 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-
-#include "c2_vq.h"
-#include "c2_provider.h"
-
-/*
- * Verbs Request Objects:
- *
- * VQ Request Objects are allocated by the kernel verbs handlers.
- * They contain a wait object, a refcnt, an atomic bool indicating that the
- * adapter has replied, and a copy of the verb reply work request.
- * A pointer to the VQ Request Object is passed down in the context
- * field of the work request message, and reflected back by the adapter
- * in the verbs reply message.  The function handle_vq() in the interrupt
- * path will use this pointer to:
- *     1) append a copy of the verbs reply message
- *     2) mark that the reply is ready
- *     3) wake up the kernel verbs handler blocked awaiting the reply.
- *
- *
- * The kernel verbs handlers do a "get" to put a 2nd reference on the
- * VQ Request object.  If the kernel verbs handler exits before the adapter
- * can respond, this extra reference will keep the VQ Request object around
- * until the adapter's reply can be processed.  The reason we need this is
- * because a pointer to this object is stuffed into the context field of
- * the verbs work request message, and reflected back in the reply message.
- * It is used in the interrupt handler (handle_vq()) to wake up the appropriate
- * kernel verb handler that is blocked awaiting the verb reply.
- * So handle_vq() will do a "put" on the object when it's done accessing it.
- * NOTE:  If we guarantee that the kernel verb handler will never bail before
- *        getting the reply, then we don't need these refcnts.
- *
- *
- * VQ Request objects are freed by the kernel verbs handlers only
- * after the verb has been processed, or when the adapter fails and
- * does not reply.
- *
- *
- * Verbs Reply Buffers:
- *
- * VQ Reply bufs are local host memory copies of an
- * outstanding Verb Request reply
- * message.  They are always allocated by the kernel verbs handlers, and _may_ be
- * freed by either the kernel verbs handler -or- the interrupt handler.  The
- * kernel verbs handler _must_ free the repbuf, then free the vq request object
- * in that order.
- */
-
-int vq_init(struct c2_dev *c2dev)
-{
-       sprintf(c2dev->vq_cache_name, "c2-vq:dev%c",
-               (char) ('0' + c2dev->devnum));
-       c2dev->host_msg_cache =
-           kmem_cache_create(c2dev->vq_cache_name, c2dev->rep_vq.msg_size, 0,
-                             SLAB_HWCACHE_ALIGN, NULL);
-       if (c2dev->host_msg_cache == NULL) {
-               return -ENOMEM;
-       }
-       return 0;
-}
-
-void vq_term(struct c2_dev *c2dev)
-{
-       kmem_cache_destroy(c2dev->host_msg_cache);
-}
-
-/* vq_req_alloc - allocate a VQ Request Object and initialize it.
- * The refcnt is set to 1.
- */
-struct c2_vq_req *vq_req_alloc(struct c2_dev *c2dev)
-{
-       struct c2_vq_req *r;
-
-       r = kmalloc(sizeof(struct c2_vq_req), GFP_KERNEL);
-       if (r) {
-               init_waitqueue_head(&r->wait_object);
-               r->reply_msg = 0;
-               r->event = 0;
-               r->cm_id = NULL;
-               r->qp = NULL;
-               atomic_set(&r->refcnt, 1);
-               atomic_set(&r->reply_ready, 0);
-       }
-       return r;
-}
-
-
-/* vq_req_free - free the VQ Request Object.  It is assumed the verbs handler
- * has already freed the VQ Reply Buffer if it existed.
- */
-void vq_req_free(struct c2_dev *c2dev, struct c2_vq_req *r)
-{
-       r->reply_msg = 0;
-       if (atomic_dec_and_test(&r->refcnt)) {
-               kfree(r);
-       }
-}
-
-/* vq_req_get - reference a VQ Request Object.  Done
- * only in the kernel verbs handlers.
- */
-void vq_req_get(struct c2_dev *c2dev, struct c2_vq_req *r)
-{
-       atomic_inc(&r->refcnt);
-}
-
-
-/* vq_req_put - dereference and potentially free a VQ Request Object.
- *
- * This is only called by handle_vq() on the
- * interrupt when it is done processing
- * a verb reply message.  If the associated
- * kernel verbs handler has already bailed,
- * then this put will actually free the VQ
- * Request object _and_ the VQ Reply Buffer
- * if it exists.
- */
-void vq_req_put(struct c2_dev *c2dev, struct c2_vq_req *r)
-{
-       if (atomic_dec_and_test(&r->refcnt)) {
-               if (r->reply_msg != 0)
-                       vq_repbuf_free(c2dev,
-                                      (void *) (unsigned long) r->reply_msg);
-               kfree(r);
-       }
-}
-
-
-/*
- * vq_repbuf_alloc - allocate a VQ Reply Buffer.
- */
-void *vq_repbuf_alloc(struct c2_dev *c2dev)
-{
-       return kmem_cache_alloc(c2dev->host_msg_cache, GFP_ATOMIC);
-}
-
-/*
- * vq_send_wr - post a verbs request message to the Verbs Request Queue.
- * If a message is not available in the MQ, then block until one is available.
- * NOTE: handle_mq() on the interrupt context will wake up threads blocked here.
- * When the adapter drains the Verbs Request Queue,
- * it inserts MQ index 0 into the
- * adapter->host activity fifo and interrupts the host.
- */
-int vq_send_wr(struct c2_dev *c2dev, union c2wr *wr)
-{
-       void *msg;
-       wait_queue_t __wait;
-
-       /*
-        * grab adapter vq lock
-        */
-       spin_lock(&c2dev->vqlock);
-
-       /*
-        * allocate msg
-        */
-       msg = c2_mq_alloc(&c2dev->req_vq);
-
-       /*
-        * If we cannot get a msg, then we'll wait.
-        * When a message is available, the int handler will wake_up()
-        * any waiters.
-        */
-       while (msg == NULL) {
-               pr_debug("%s:%d no available msg in VQ, waiting...\n",
-                      __func__, __LINE__);
-               init_waitqueue_entry(&__wait, current);
-               add_wait_queue(&c2dev->req_vq_wo, &__wait);
-               spin_unlock(&c2dev->vqlock);
-               for (;;) {
-                       set_current_state(TASK_INTERRUPTIBLE);
-                       if (!c2_mq_full(&c2dev->req_vq)) {
-                               break;
-                       }
-                       if (!signal_pending(current)) {
-                               schedule_timeout(1 * HZ);       /* 1 second... */
-                               continue;
-                       }
-                       set_current_state(TASK_RUNNING);
-                       remove_wait_queue(&c2dev->req_vq_wo, &__wait);
-                       return -EINTR;
-               }
-               set_current_state(TASK_RUNNING);
-               remove_wait_queue(&c2dev->req_vq_wo, &__wait);
-               spin_lock(&c2dev->vqlock);
-               msg = c2_mq_alloc(&c2dev->req_vq);
-       }
-
-       /*
-        * copy wr into adapter msg
-        */
-       memcpy(msg, wr, c2dev->req_vq.msg_size);
-
-       /*
-        * post msg
-        */
-       c2_mq_produce(&c2dev->req_vq);
-
-       /*
-        * release adapter vq lock
-        */
-       spin_unlock(&c2dev->vqlock);
-       return 0;
-}
-
-
-/*
- * vq_wait_for_reply - block until the adapter posts a Verb Reply Message.
- */
-int vq_wait_for_reply(struct c2_dev *c2dev, struct c2_vq_req *req)
-{
-       if (!wait_event_timeout(req->wait_object,
-                               atomic_read(&req->reply_ready),
-                               60*HZ))
-               return -ETIMEDOUT;
-
-       return 0;
-}
-
-/*
- * vq_repbuf_free - Free a Verbs Reply Buffer.
- */
-void vq_repbuf_free(struct c2_dev *c2dev, void *reply)
-{
-       kmem_cache_free(c2dev->host_msg_cache, reply);
-}
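The refcounting protocol described at the top of this file is followed by every kernel verbs handler in the driver. Condensed from the c2_rnic_open()/c2_rnic_close() callers above, the canonical round trip looks like the sketch below; c2_vq_round_trip_example() is an illustrative wrapper, and the CCWR_RNIC_CLOSE request and its reply struct are simply borrowed from that example:

#include <linux/string.h>
#include "c2_vq.h"	/* pulls in c2.h and c2_wr.h from this driver */

/* Sketch: one verbs-request round trip, using the c2_vq.c API above. */
static int c2_vq_round_trip_example(struct c2_dev *c2dev)
{
	struct c2_vq_req *vq_req;
	union c2wr wr;
	struct c2wr_rnic_close_rep *reply;
	int err;

	vq_req = vq_req_alloc(c2dev);		/* refcnt = 1 */
	if (!vq_req)
		return -ENOMEM;

	memset(&wr, 0, sizeof(wr));
	c2_wr_set_id(&wr, CCWR_RNIC_CLOSE);
	wr.rnic_close.req.hdr.context = (unsigned long) vq_req; /* reflected back in the reply */
	wr.rnic_close.req.rnic_handle = c2dev->adapter_handle;

	vq_req_get(c2dev, vq_req);		/* 2nd ref in case we bail before the reply */

	err = vq_send_wr(c2dev, &wr);
	if (err) {
		vq_req_put(c2dev, vq_req);	/* adapter never saw it; drop the extra ref ourselves */
		goto out;
	}

	err = vq_wait_for_reply(c2dev, vq_req);	/* handle_vq() does the matching put */
	if (err)
		goto out;

	reply = (struct c2wr_rnic_close_rep *) (unsigned long) vq_req->reply_msg;
	err = reply ? c2_errno(reply) : -ENOMEM;
	if (reply)
		vq_repbuf_free(c2dev, reply);	/* free the reply buffer first... */
out:
	vq_req_free(c2dev, vq_req);		/* ...then the request object */
	return err;
}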
diff --git a/drivers/infiniband/hw/amso1100/c2_vq.h b/drivers/infiniband/hw/amso1100/c2_vq.h
deleted file mode 100644 (file)
index 3380562..0000000
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef _C2_VQ_H_
-#define _C2_VQ_H_
-#include <linux/sched.h>
-#include "c2.h"
-#include "c2_wr.h"
-#include "c2_provider.h"
-
-struct c2_vq_req {
-       u64 reply_msg;          /* ptr to reply msg */
-       wait_queue_head_t wait_object;  /* wait object for vq reqs */
-       atomic_t reply_ready;   /* set when reply is ready */
-       atomic_t refcnt;        /* used to cancel WRs... */
-       int event;
-       struct iw_cm_id *cm_id;
-       struct c2_qp *qp;
-};
-
-extern int vq_init(struct c2_dev *c2dev);
-extern void vq_term(struct c2_dev *c2dev);
-
-extern struct c2_vq_req *vq_req_alloc(struct c2_dev *c2dev);
-extern void vq_req_free(struct c2_dev *c2dev, struct c2_vq_req *req);
-extern void vq_req_get(struct c2_dev *c2dev, struct c2_vq_req *req);
-extern void vq_req_put(struct c2_dev *c2dev, struct c2_vq_req *req);
-extern int vq_send_wr(struct c2_dev *c2dev, union c2wr * wr);
-
-extern void *vq_repbuf_alloc(struct c2_dev *c2dev);
-extern void vq_repbuf_free(struct c2_dev *c2dev, void *reply);
-
-extern int vq_wait_for_reply(struct c2_dev *c2dev, struct c2_vq_req *req);
-#endif                         /* _C2_VQ_H_ */
diff --git a/drivers/infiniband/hw/amso1100/c2_wr.h b/drivers/infiniband/hw/amso1100/c2_wr.h
deleted file mode 100644 (file)
index 8d4b4ca..0000000
+++ /dev/null
@@ -1,1520 +0,0 @@
-/*
- * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
- * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef _C2_WR_H_
-#define _C2_WR_H_
-
-#ifdef CCDEBUG
-#define CCWR_MAGIC             0xb07700b0
-#endif
-
-#define C2_QP_NO_ATTR_CHANGE 0xFFFFFFFF
-
-/* Maximum allowed size in bytes of private_data exchange
- * on connect.
- */
-#define C2_MAX_PRIVATE_DATA_SIZE 200
-
-/*
- * These types are shared among the adapter, host, and CCIL consumer.
- */
-enum c2_cq_notification_type {
-       C2_CQ_NOTIFICATION_TYPE_NONE = 1,
-       C2_CQ_NOTIFICATION_TYPE_NEXT,
-       C2_CQ_NOTIFICATION_TYPE_NEXT_SE
-};
-
-enum c2_setconfig_cmd {
-       C2_CFG_ADD_ADDR = 1,
-       C2_CFG_DEL_ADDR = 2,
-       C2_CFG_ADD_ROUTE = 3,
-       C2_CFG_DEL_ROUTE = 4
-};
-
-enum c2_getconfig_cmd {
-       C2_GETCONFIG_ROUTES = 1,
-       C2_GETCONFIG_ADDRS
-};
-
-/*
- *  CCIL Work Request Identifiers
- */
-enum c2wr_ids {
-       CCWR_RNIC_OPEN = 1,
-       CCWR_RNIC_QUERY,
-       CCWR_RNIC_SETCONFIG,
-       CCWR_RNIC_GETCONFIG,
-       CCWR_RNIC_CLOSE,
-       CCWR_CQ_CREATE,
-       CCWR_CQ_QUERY,
-       CCWR_CQ_MODIFY,
-       CCWR_CQ_DESTROY,
-       CCWR_QP_CONNECT,
-       CCWR_PD_ALLOC,
-       CCWR_PD_DEALLOC,
-       CCWR_SRQ_CREATE,
-       CCWR_SRQ_QUERY,
-       CCWR_SRQ_MODIFY,
-       CCWR_SRQ_DESTROY,
-       CCWR_QP_CREATE,
-       CCWR_QP_QUERY,
-       CCWR_QP_MODIFY,
-       CCWR_QP_DESTROY,
-       CCWR_NSMR_STAG_ALLOC,
-       CCWR_NSMR_REGISTER,
-       CCWR_NSMR_PBL,
-       CCWR_STAG_DEALLOC,
-       CCWR_NSMR_REREGISTER,
-       CCWR_SMR_REGISTER,
-       CCWR_MR_QUERY,
-       CCWR_MW_ALLOC,
-       CCWR_MW_QUERY,
-       CCWR_EP_CREATE,
-       CCWR_EP_GETOPT,
-       CCWR_EP_SETOPT,
-       CCWR_EP_DESTROY,
-       CCWR_EP_BIND,
-       CCWR_EP_CONNECT,
-       CCWR_EP_LISTEN,
-       CCWR_EP_SHUTDOWN,
-       CCWR_EP_LISTEN_CREATE,
-       CCWR_EP_LISTEN_DESTROY,
-       CCWR_EP_QUERY,
-       CCWR_CR_ACCEPT,
-       CCWR_CR_REJECT,
-       CCWR_CONSOLE,
-       CCWR_TERM,
-       CCWR_FLASH_INIT,
-       CCWR_FLASH,
-       CCWR_BUF_ALLOC,
-       CCWR_BUF_FREE,
-       CCWR_FLASH_WRITE,
-       CCWR_INIT,              /* WARNING: Don't move this ever again! */
-
-
-
-       /* Add new IDs here */
-
-
-
-       /*
-        * WARNING: CCWR_LAST must always be the last verbs id defined!
-        *          All the preceding IDs are fixed, and must not change.
-        *          You can add new IDs, but must not remove or reorder
-        *          any IDs. If you do, YOU will ruin any hope of
-        *          compatibility between versions.
-        */
-       CCWR_LAST,
-
-       /*
-        * Start over at 1 so that arrays indexed by user wr id's
-        * begin at 1.  This is OK since the verbs and user wr id's
-        * are always used on disjoint sets of queues.
-        */
-       /*
-        * The order of the CCWR_SEND_XX verbs must
-        * match the order of the RDMA_OPs
-        */
-       CCWR_SEND = 1,
-       CCWR_SEND_INV,
-       CCWR_SEND_SE,
-       CCWR_SEND_SE_INV,
-       CCWR_RDMA_WRITE,
-       CCWR_RDMA_READ,
-       CCWR_RDMA_READ_INV,
-       CCWR_MW_BIND,
-       CCWR_NSMR_FASTREG,
-       CCWR_STAG_INVALIDATE,
-       CCWR_RECV,
-       CCWR_NOP,
-       CCWR_UNIMPL,
-/* WARNING: This must always be the last user wr id defined! */
-};
-#define RDMA_SEND_OPCODE_FROM_WR_ID(x)   (x+2)
-
-/*
- * SQ/RQ Work Request Types
- */
-enum c2_wr_type {
-       C2_WR_TYPE_SEND = CCWR_SEND,
-       C2_WR_TYPE_SEND_SE = CCWR_SEND_SE,
-       C2_WR_TYPE_SEND_INV = CCWR_SEND_INV,
-       C2_WR_TYPE_SEND_SE_INV = CCWR_SEND_SE_INV,
-       C2_WR_TYPE_RDMA_WRITE = CCWR_RDMA_WRITE,
-       C2_WR_TYPE_RDMA_READ = CCWR_RDMA_READ,
-       C2_WR_TYPE_RDMA_READ_INV_STAG = CCWR_RDMA_READ_INV,
-       C2_WR_TYPE_BIND_MW = CCWR_MW_BIND,
-       C2_WR_TYPE_FASTREG_NSMR = CCWR_NSMR_FASTREG,
-       C2_WR_TYPE_INV_STAG = CCWR_STAG_INVALIDATE,
-       C2_WR_TYPE_RECV = CCWR_RECV,
-       C2_WR_TYPE_NOP = CCWR_NOP,
-};
-
-struct c2_netaddr {
-       __be32 ip_addr;
-       __be32 netmask;
-       u32 mtu;
-};
-
-struct c2_route {
-       u32 ip_addr;            /* 0 indicates the default route */
-       u32 netmask;            /* netmask associated with dst */
-       u32 flags;
-       union {
-               u32 ipaddr;     /* address of the nexthop interface */
-               u8 enaddr[6];
-       } nexthop;
-};
-
-/*
- * A Scatter Gather Entry.
- */
-struct c2_data_addr {
-       __be32 stag;
-       __be32 length;
-       __be64 to;
-};
-
-/*
- * MR and MW flags used by the consumer, RI, and RNIC.
- */
-enum c2_mm_flags {
-       MEM_REMOTE = 0x0001,    /* allow mw binds with remote access. */
-       MEM_VA_BASED = 0x0002,  /* Not Zero-based */
-       MEM_PBL_COMPLETE = 0x0004,      /* PBL array is complete in this msg */
-       MEM_LOCAL_READ = 0x0008,        /* allow local reads */
-       MEM_LOCAL_WRITE = 0x0010,       /* allow local writes */
-       MEM_REMOTE_READ = 0x0020,       /* allow remote reads */
-       MEM_REMOTE_WRITE = 0x0040,      /* allow remote writes */
-       MEM_WINDOW_BIND = 0x0080,       /* binds allowed */
-       MEM_SHARED = 0x0100,    /* set if MR is shared */
-       MEM_STAG_VALID = 0x0200 /* set if STAG is in valid state */
-};
-
-/*
- * CCIL API ACF flags defined in terms of the low level mem flags.
- * This minimizes translation needed in the user API
- */
-enum c2_acf {
-       C2_ACF_LOCAL_READ = MEM_LOCAL_READ,
-       C2_ACF_LOCAL_WRITE = MEM_LOCAL_WRITE,
-       C2_ACF_REMOTE_READ = MEM_REMOTE_READ,
-       C2_ACF_REMOTE_WRITE = MEM_REMOTE_WRITE,
-       C2_ACF_WINDOW_BIND = MEM_WINDOW_BIND
-};
-
-/*
- * Image types of objects written to flash
- */
-#define C2_FLASH_IMG_BITFILE 1
-#define C2_FLASH_IMG_OPTION_ROM 2
-#define C2_FLASH_IMG_VPD 3
-
-/*
- *  To fix bug 1815 we define the maximum allowable size of the
- *  terminate message (per the IETF spec; refer to the IETF
- *  protocol specification, section 12.1.6, page 64).
- *  The message is prefixed by 20 bytes of DDP info.
- *
- *  Then the message has 6 bytes for the terminate control
- *  and DDP segment length info plus a DDP header (either
- *  14 or 18 bytes) plus 28 bytes for the RDMA header.
- *  Thus the max size is:
- *  20 + (6 + 18 + 28) = 72
- */
-#define C2_MAX_TERMINATE_MESSAGE_SIZE (72)
-
-/*
- * Build String Length.  It must be the same as C2_BUILD_STR_LEN in ccil_api.h
- */
-#define WR_BUILD_STR_LEN 64
-
-/*
- * WARNING:  All of these structs need to align any 64-bit types on
- * 64-bit boundaries!  64-bit types include u64 and __be64.
- */
-
-/*
- * Clustercore Work Request Header.  Be sensitive to field layout
- * and alignment.
- */
-struct c2wr_hdr {
-       /* wqe_count is part of the cqe.  It is put here so the
-        * adapter can write to it while the wr is pending without
-        * clobbering part of the wr.  This word need not be dma'd
-        * from the host to adapter by libccil, but we copy it anyway
-        * to make the memcpy to the adapter better aligned.
-        */
-       __be32 wqe_count;
-
-       /* Put these fields next so that later 32- and 64-bit
-        * quantities are naturally aligned.
-        */
-       u8 id;
-       u8 result;              /* adapter -> host */
-       u8 sge_count;           /* host -> adapter */
-       u8 flags;               /* host -> adapter */
-
-       u64 context;
-#ifdef CCMSGMAGIC
-       u32 magic;
-       u32 pad;
-#endif
-} __attribute__((packed));
-
-/*
- *------------------------ RNIC ------------------------
- */
-
-/*
- * WR_RNIC_OPEN
- */
-
-/*
- * Flags for the RNIC WRs
- */
-enum c2_rnic_flags {
-       RNIC_IRD_STATIC = 0x0001,
-       RNIC_ORD_STATIC = 0x0002,
-       RNIC_QP_STATIC = 0x0004,
-       RNIC_SRQ_SUPPORTED = 0x0008,
-       RNIC_PBL_BLOCK_MODE = 0x0010,
-       RNIC_SRQ_MODEL_ARRIVAL = 0x0020,
-       RNIC_CQ_OVF_DETECTED = 0x0040,
-       RNIC_PRIV_MODE = 0x0080
-};
-
-struct c2wr_rnic_open_req {
-       struct c2wr_hdr hdr;
-       u64 user_context;
-       __be16 flags;           /* See enum c2_rnic_flags */
-       __be16 port_num;
-} __attribute__((packed));
-
-struct c2wr_rnic_open_rep {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-} __attribute__((packed));
-
-union c2wr_rnic_open {
-       struct c2wr_rnic_open_req req;
-       struct c2wr_rnic_open_rep rep;
-} __attribute__((packed));
-
-struct c2wr_rnic_query_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-} __attribute__((packed));
-
-/*
- * WR_RNIC_QUERY
- */
-struct c2wr_rnic_query_rep {
-       struct c2wr_hdr hdr;
-       u64 user_context;
-       __be32 vendor_id;
-       __be32 part_number;
-       __be32 hw_version;
-       __be32 fw_ver_major;
-       __be32 fw_ver_minor;
-       __be32 fw_ver_patch;
-       char fw_ver_build_str[WR_BUILD_STR_LEN];
-       __be32 max_qps;
-       __be32 max_qp_depth;
-       u32 max_srq_depth;
-       u32 max_send_sgl_depth;
-       u32 max_rdma_sgl_depth;
-       __be32 max_cqs;
-       __be32 max_cq_depth;
-       u32 max_cq_event_handlers;
-       __be32 max_mrs;
-       u32 max_pbl_depth;
-       __be32 max_pds;
-       __be32 max_global_ird;
-       u32 max_global_ord;
-       __be32 max_qp_ird;
-       __be32 max_qp_ord;
-       u32 flags;
-       __be32 max_mws;
-       u32 pbe_range_low;
-       u32 pbe_range_high;
-       u32 max_srqs;
-       u32 page_size;
-} __attribute__((packed));
-
-union c2wr_rnic_query {
-       struct c2wr_rnic_query_req req;
-       struct c2wr_rnic_query_rep rep;
-} __attribute__((packed));
-
-/*
- * WR_RNIC_GETCONFIG
- */
-
-struct c2wr_rnic_getconfig_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       u32 option;             /* see c2_getconfig_cmd_t */
-       u64 reply_buf;
-       u32 reply_buf_len;
-} __attribute__((packed)) ;
-
-struct c2wr_rnic_getconfig_rep {
-       struct c2wr_hdr hdr;
-       u32 option;             /* see c2_getconfig_cmd_t */
-       u32 count_len;          /* length of the number of addresses configured */
-} __attribute__((packed)) ;
-
-union c2wr_rnic_getconfig {
-       struct c2wr_rnic_getconfig_req req;
-       struct c2wr_rnic_getconfig_rep rep;
-} __attribute__((packed)) ;
-
-/*
- * WR_RNIC_SETCONFIG
- */
-struct c2wr_rnic_setconfig_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       __be32 option;          /* See c2_setconfig_cmd_t */
-       /* variable data and pad. See c2_netaddr and c2_route */
-       u8 data[0];
-} __attribute__((packed)) ;
-
-struct c2wr_rnic_setconfig_rep {
-       struct c2wr_hdr hdr;
-} __attribute__((packed)) ;
-
-union c2wr_rnic_setconfig {
-       struct c2wr_rnic_setconfig_req req;
-       struct c2wr_rnic_setconfig_rep rep;
-} __attribute__((packed)) ;
-
-/*
- * WR_RNIC_CLOSE
- */
-struct c2wr_rnic_close_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-} __attribute__((packed)) ;
-
-struct c2wr_rnic_close_rep {
-       struct c2wr_hdr hdr;
-} __attribute__((packed)) ;
-
-union c2wr_rnic_close {
-       struct c2wr_rnic_close_req req;
-       struct c2wr_rnic_close_rep rep;
-} __attribute__((packed)) ;
-
-/*
- *------------------------ CQ ------------------------
- */
-struct c2wr_cq_create_req {
-       struct c2wr_hdr hdr;
-       __be64 shared_ht;
-       u64 user_context;
-       __be64 msg_pool;
-       u32 rnic_handle;
-       __be32 msg_size;
-       __be32 depth;
-} __attribute__((packed)) ;
-
-struct c2wr_cq_create_rep {
-       struct c2wr_hdr hdr;
-       __be32 mq_index;
-       __be32 adapter_shared;
-       u32 cq_handle;
-} __attribute__((packed)) ;
-
-union c2wr_cq_create {
-       struct c2wr_cq_create_req req;
-       struct c2wr_cq_create_rep rep;
-} __attribute__((packed)) ;
-
-struct c2wr_cq_modify_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       u32 cq_handle;
-       u32 new_depth;
-       u64 new_msg_pool;
-} __attribute__((packed)) ;
-
-struct c2wr_cq_modify_rep {
-       struct c2wr_hdr hdr;
-} __attribute__((packed)) ;
-
-union c2wr_cq_modify {
-       struct c2wr_cq_modify_req req;
-       struct c2wr_cq_modify_rep rep;
-} __attribute__((packed)) ;
-
-struct c2wr_cq_destroy_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       u32 cq_handle;
-} __attribute__((packed)) ;
-
-struct c2wr_cq_destroy_rep {
-       struct c2wr_hdr hdr;
-} __attribute__((packed)) ;
-
-union c2wr_cq_destroy {
-       struct c2wr_cq_destroy_req req;
-       struct c2wr_cq_destroy_rep rep;
-} __attribute__((packed)) ;
-
-/*
- *------------------------ PD ------------------------
- */
-struct c2wr_pd_alloc_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       u32 pd_id;
-} __attribute__((packed)) ;
-
-struct c2wr_pd_alloc_rep {
-       struct c2wr_hdr hdr;
-} __attribute__((packed)) ;
-
-union c2wr_pd_alloc {
-       struct c2wr_pd_alloc_req req;
-       struct c2wr_pd_alloc_rep rep;
-} __attribute__((packed)) ;
-
-struct c2wr_pd_dealloc_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       u32 pd_id;
-} __attribute__((packed)) ;
-
-struct c2wr_pd_dealloc_rep {
-       struct c2wr_hdr hdr;
-} __attribute__((packed)) ;
-
-union c2wr_pd_dealloc {
-       struct c2wr_pd_dealloc_req req;
-       struct c2wr_pd_dealloc_rep rep;
-} __attribute__((packed)) ;
-
-/*
- *------------------------ SRQ ------------------------
- */
-struct c2wr_srq_create_req {
-       struct c2wr_hdr hdr;
-       u64 shared_ht;
-       u64 user_context;
-       u32 rnic_handle;
-       u32 srq_depth;
-       u32 srq_limit;
-       u32 sgl_depth;
-       u32 pd_id;
-} __attribute__((packed)) ;
-
-struct c2wr_srq_create_rep {
-       struct c2wr_hdr hdr;
-       u32 srq_depth;
-       u32 sgl_depth;
-       u32 msg_size;
-       u32 mq_index;
-       u32 mq_start;
-       u32 srq_handle;
-} __attribute__((packed)) ;
-
-union c2wr_srq_create {
-       struct c2wr_srq_create_req req;
-       struct c2wr_srq_create_rep rep;
-} __attribute__((packed)) ;
-
-struct c2wr_srq_destroy_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       u32 srq_handle;
-} __attribute__((packed)) ;
-
-struct c2wr_srq_destroy_rep {
-       struct c2wr_hdr hdr;
-} __attribute__((packed)) ;
-
-union c2wr_srq_destroy {
-       struct c2wr_srq_destroy_req req;
-       struct c2wr_srq_destroy_rep rep;
-} __attribute__((packed)) ;
-
-/*
- *------------------------ QP ------------------------
- */
-enum c2wr_qp_flags {
-       QP_RDMA_READ = 0x00000001,      /* RDMA read enabled? */
-       QP_RDMA_WRITE = 0x00000002,     /* RDMA write enabled? */
-       QP_MW_BIND = 0x00000004,        /* MWs enabled */
-       QP_ZERO_STAG = 0x00000008,      /* enabled? */
-       QP_REMOTE_TERMINATION = 0x00000010,     /* remote end terminated */
-       QP_RDMA_READ_RESPONSE = 0x00000020      /* Remote RDMA read  */
-           /* enabled? */
-};
-
-struct c2wr_qp_create_req {
-       struct c2wr_hdr hdr;
-       __be64 shared_sq_ht;
-       __be64 shared_rq_ht;
-       u64 user_context;
-       u32 rnic_handle;
-       u32 sq_cq_handle;
-       u32 rq_cq_handle;
-       __be32 sq_depth;
-       __be32 rq_depth;
-       u32 srq_handle;
-       u32 srq_limit;
-       __be32 flags;           /* see enum c2wr_qp_flags */
-       __be32 send_sgl_depth;
-       __be32 recv_sgl_depth;
-       __be32 rdma_write_sgl_depth;
-       __be32 ord;
-       __be32 ird;
-       u32 pd_id;
-} __attribute__((packed)) ;
-
-struct c2wr_qp_create_rep {
-       struct c2wr_hdr hdr;
-       __be32 sq_depth;
-       __be32 rq_depth;
-       u32 send_sgl_depth;
-       u32 recv_sgl_depth;
-       u32 rdma_write_sgl_depth;
-       u32 ord;
-       u32 ird;
-       __be32 sq_msg_size;
-       __be32 sq_mq_index;
-       __be32 sq_mq_start;
-       __be32 rq_msg_size;
-       __be32 rq_mq_index;
-       __be32 rq_mq_start;
-       u32 qp_handle;
-} __attribute__((packed)) ;
-
-union c2wr_qp_create {
-       struct c2wr_qp_create_req req;
-       struct c2wr_qp_create_rep rep;
-} __attribute__((packed)) ;
-
-struct c2wr_qp_query_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       u32 qp_handle;
-} __attribute__((packed)) ;
-
-struct c2wr_qp_query_rep {
-       struct c2wr_hdr hdr;
-       u64 user_context;
-       u32 rnic_handle;
-       u32 sq_depth;
-       u32 rq_depth;
-       u32 send_sgl_depth;
-       u32 rdma_write_sgl_depth;
-       u32 recv_sgl_depth;
-       u32 ord;
-       u32 ird;
-       u16 qp_state;
-       u16 flags;              /* see c2wr_qp_flags_t */
-       u32 qp_id;
-       u32 local_addr;
-       u32 remote_addr;
-       u16 local_port;
-       u16 remote_port;
-       u32 terminate_msg_length;       /* 0 if not present */
-       u8 data[0];
-       /* Terminate Message in-line here. */
-} __attribute__((packed)) ;
-
-union c2wr_qp_query {
-       struct c2wr_qp_query_req req;
-       struct c2wr_qp_query_rep rep;
-} __attribute__((packed)) ;
-
-struct c2wr_qp_modify_req {
-       struct c2wr_hdr hdr;
-       u64 stream_msg;
-       u32 stream_msg_length;
-       u32 rnic_handle;
-       u32 qp_handle;
-       __be32 next_qp_state;
-       __be32 ord;
-       __be32 ird;
-       __be32 sq_depth;
-       __be32 rq_depth;
-       u32 llp_ep_handle;
-} __attribute__((packed)) ;
-
-struct c2wr_qp_modify_rep {
-       struct c2wr_hdr hdr;
-       u32 ord;
-       u32 ird;
-       u32 sq_depth;
-       u32 rq_depth;
-       u32 sq_msg_size;
-       u32 sq_mq_index;
-       u32 sq_mq_start;
-       u32 rq_msg_size;
-       u32 rq_mq_index;
-       u32 rq_mq_start;
-} __attribute__((packed)) ;
-
-union c2wr_qp_modify {
-       struct c2wr_qp_modify_req req;
-       struct c2wr_qp_modify_rep rep;
-} __attribute__((packed)) ;
-
-struct c2wr_qp_destroy_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       u32 qp_handle;
-} __attribute__((packed)) ;
-
-struct c2wr_qp_destroy_rep {
-       struct c2wr_hdr hdr;
-} __attribute__((packed)) ;
-
-union c2wr_qp_destroy {
-       struct c2wr_qp_destroy_req req;
-       struct c2wr_qp_destroy_rep rep;
-} __attribute__((packed)) ;
-
-/*
- * The CCWR_QP_CONNECT msg is posted on the verbs request queue.  It can
- * only be posted when a QP is in IDLE state.  After the connect request is
- * submitted to the LLP, the adapter moves the QP to CONNECT_PENDING state.
- * No synchronous reply from adapter to this WR.  The results of the
- * connection attempt are passed back in the async event
- * CCAE_ACTIVE_CONNECT_RESULTS.  See c2wr_ae_active_connect_results_t
- */
-struct c2wr_qp_connect_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       u32 qp_handle;
-       __be32 remote_addr;
-       __be16 remote_port;
-       u16 pad;
-       __be32 private_data_length;
-       u8 private_data[0];     /* Private data in-line. */
-} __attribute__((packed)) ;
-
-struct c2wr_qp_connect {
-       struct c2wr_qp_connect_req req;
-       /* no synchronous reply.         */
-} __attribute__((packed)) ;
-
-
-/*
- *------------------------ MM ------------------------
- */
-
-struct c2wr_nsmr_stag_alloc_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       u32 pbl_depth;
-       u32 pd_id;
-       u32 flags;
-} __attribute__((packed)) ;
-
-struct c2wr_nsmr_stag_alloc_rep {
-       struct c2wr_hdr hdr;
-       u32 pbl_depth;
-       u32 stag_index;
-} __attribute__((packed)) ;
-
-union c2wr_nsmr_stag_alloc {
-       struct c2wr_nsmr_stag_alloc_req req;
-       struct c2wr_nsmr_stag_alloc_rep rep;
-} __attribute__((packed)) ;
-
-struct c2wr_nsmr_register_req {
-       struct c2wr_hdr hdr;
-       __be64 va;
-       u32 rnic_handle;
-       __be16 flags;
-       u8 stag_key;
-       u8 pad;
-       u32 pd_id;
-       __be32 pbl_depth;
-       __be32 pbe_size;
-       __be32 fbo;
-       __be32 length;
-       __be32 addrs_length;
-       /* array of paddrs (must be aligned on a 64bit boundary) */
-       __be64 paddrs[0];
-} __attribute__((packed)) ;
-
-struct c2wr_nsmr_register_rep {
-       struct c2wr_hdr hdr;
-       u32 pbl_depth;
-       __be32 stag_index;
-} __attribute__((packed)) ;
-
-union c2wr_nsmr_register {
-       struct c2wr_nsmr_register_req req;
-       struct c2wr_nsmr_register_rep rep;
-} __attribute__((packed)) ;
-
-struct c2wr_nsmr_pbl_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       __be32 flags;
-       __be32 stag_index;
-       __be32 addrs_length;
-       /* array of paddrs (must be aligned on a 64bit boundary) */
-       __be64 paddrs[0];
-} __attribute__((packed)) ;
-
-struct c2wr_nsmr_pbl_rep {
-       struct c2wr_hdr hdr;
-} __attribute__((packed)) ;
-
-union c2wr_nsmr_pbl {
-       struct c2wr_nsmr_pbl_req req;
-       struct c2wr_nsmr_pbl_rep rep;
-} __attribute__((packed)) ;
-
-struct c2wr_mr_query_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       u32 stag_index;
-} __attribute__((packed)) ;
-
-struct c2wr_mr_query_rep {
-       struct c2wr_hdr hdr;
-       u8 stag_key;
-       u8 pad[3];
-       u32 pd_id;
-       u32 flags;
-       u32 pbl_depth;
-} __attribute__((packed)) ;
-
-union c2wr_mr_query {
-       struct c2wr_mr_query_req req;
-       struct c2wr_mr_query_rep rep;
-} __attribute__((packed)) ;
-
-struct c2wr_mw_query_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       u32 stag_index;
-} __attribute__((packed)) ;
-
-struct c2wr_mw_query_rep {
-       struct c2wr_hdr hdr;
-       u8 stag_key;
-       u8 pad[3];
-       u32 pd_id;
-       u32 flags;
-} __attribute__((packed)) ;
-
-union c2wr_mw_query {
-       struct c2wr_mw_query_req req;
-       struct c2wr_mw_query_rep rep;
-} __attribute__((packed)) ;
-
-
-struct c2wr_stag_dealloc_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       __be32 stag_index;
-} __attribute__((packed)) ;
-
-struct c2wr_stag_dealloc_rep {
-       struct c2wr_hdr hdr;
-} __attribute__((packed)) ;
-
-union c2wr_stag_dealloc {
-       struct c2wr_stag_dealloc_req req;
-       struct c2wr_stag_dealloc_rep rep;
-} __attribute__((packed)) ;
-
-struct c2wr_nsmr_reregister_req {
-       struct c2wr_hdr hdr;
-       u64 va;
-       u32 rnic_handle;
-       u16 flags;
-       u8 stag_key;
-       u8 pad;
-       u32 stag_index;
-       u32 pd_id;
-       u32 pbl_depth;
-       u32 pbe_size;
-       u32 fbo;
-       u32 length;
-       u32 addrs_length;
-       u32 pad1;
-       /* array of paddrs (must be aligned on a 64bit boundary) */
-       u64 paddrs[0];
-} __attribute__((packed)) ;
-
-struct c2wr_nsmr_reregister_rep {
-       struct c2wr_hdr hdr;
-       u32 pbl_depth;
-       u32 stag_index;
-} __attribute__((packed)) ;
-
-union c2wr_nsmr_reregister {
-       struct c2wr_nsmr_reregister_req req;
-       struct c2wr_nsmr_reregister_rep rep;
-} __attribute__((packed)) ;
-
-struct c2wr_smr_register_req {
-       struct c2wr_hdr hdr;
-       u64 va;
-       u32 rnic_handle;
-       u16 flags;
-       u8 stag_key;
-       u8 pad;
-       u32 stag_index;
-       u32 pd_id;
-} __attribute__((packed)) ;
-
-struct c2wr_smr_register_rep {
-       struct c2wr_hdr hdr;
-       u32 stag_index;
-} __attribute__((packed)) ;
-
-union c2wr_smr_register {
-       struct c2wr_smr_register_req req;
-       struct c2wr_smr_register_rep rep;
-} __attribute__((packed)) ;
-
-struct c2wr_mw_alloc_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       u32 pd_id;
-} __attribute__((packed)) ;
-
-struct c2wr_mw_alloc_rep {
-       struct c2wr_hdr hdr;
-       u32 stag_index;
-} __attribute__((packed)) ;
-
-union c2wr_mw_alloc {
-       struct c2wr_mw_alloc_req req;
-       struct c2wr_mw_alloc_rep rep;
-} __attribute__((packed)) ;
-
-/*
- *------------------------ WRs -----------------------
- */
-
-struct c2wr_user_hdr {
-       struct c2wr_hdr hdr;            /* Has status and WR Type */
-} __attribute__((packed)) ;
-
-enum c2_qp_state {
-       C2_QP_STATE_IDLE = 0x01,
-       C2_QP_STATE_CONNECTING = 0x02,
-       C2_QP_STATE_RTS = 0x04,
-       C2_QP_STATE_CLOSING = 0x08,
-       C2_QP_STATE_TERMINATE = 0x10,
-       C2_QP_STATE_ERROR = 0x20,
-};
-
-/* Completion queue entry. */
-struct c2wr_ce {
-       struct c2wr_hdr hdr;            /* Has status and WR Type */
-       u64 qp_user_context;    /* c2_user_qp_t * */
-       u32 qp_state;           /* Current QP State */
-       u32 handle;             /* QPID or EP Handle */
-       __be32 bytes_rcvd;              /* valid for RECV WCs */
-       u32 stag;
-} __attribute__((packed)) ;
-
-
-/*
- * Flags used for all post-sq WRs.  These must fit in the flags
- * field of the struct c2wr_hdr (eight bits).
- */
-enum {
-       SQ_SIGNALED = 0x01,
-       SQ_READ_FENCE = 0x02,
-       SQ_FENCE = 0x04,
-};
-
-/*
- * Common fields for all post-sq WRs.  Namely the standard header and a
- * secondary header with fields common to all post-sq WRs.
- */
-struct c2_sq_hdr {
-       struct c2wr_user_hdr user_hdr;
-} __attribute__((packed));
-
-/*
- * Same as above but for post-rq WRs.
- */
-struct c2_rq_hdr {
-       struct c2wr_user_hdr user_hdr;
-} __attribute__((packed));
-
-/*
- * use the same struct for all sends.
- */
-struct c2wr_send_req {
-       struct c2_sq_hdr sq_hdr;
-       __be32 sge_len;
-       __be32 remote_stag;
-       u8 data[0];             /* SGE array */
-} __attribute__((packed));
-
-union c2wr_send {
-       struct c2wr_send_req req;
-       struct c2wr_ce rep;
-} __attribute__((packed));
-
-struct c2wr_rdma_write_req {
-       struct c2_sq_hdr sq_hdr;
-       __be64 remote_to;
-       __be32 remote_stag;
-       __be32 sge_len;
-       u8 data[0];             /* SGE array */
-} __attribute__((packed));
-
-union c2wr_rdma_write {
-       struct c2wr_rdma_write_req req;
-       struct c2wr_ce rep;
-} __attribute__((packed));
-
-struct c2wr_rdma_read_req {
-       struct c2_sq_hdr sq_hdr;
-       __be64 local_to;
-       __be64 remote_to;
-       __be32 local_stag;
-       __be32 remote_stag;
-       __be32 length;
-} __attribute__((packed));
-
-union c2wr_rdma_read {
-       struct c2wr_rdma_read_req req;
-       struct c2wr_ce rep;
-} __attribute__((packed));
-
-struct c2wr_mw_bind_req {
-       struct c2_sq_hdr sq_hdr;
-       u64 va;
-       u8 stag_key;
-       u8 pad[3];
-       u32 mw_stag_index;
-       u32 mr_stag_index;
-       u32 length;
-       u32 flags;
-} __attribute__((packed));
-
-union c2wr_mw_bind {
-       struct c2wr_mw_bind_req req;
-       struct c2wr_ce rep;
-} __attribute__((packed));
-
-struct c2wr_nsmr_fastreg_req {
-       struct c2_sq_hdr sq_hdr;
-       u64 va;
-       u8 stag_key;
-       u8 pad[3];
-       u32 stag_index;
-       u32 pbe_size;
-       u32 fbo;
-       u32 length;
-       u32 addrs_length;
-       /* array of paddrs (must be aligned on a 64bit boundary) */
-       u64 paddrs[0];
-} __attribute__((packed));
-
-union c2wr_nsmr_fastreg {
-       struct c2wr_nsmr_fastreg_req req;
-       struct c2wr_ce rep;
-} __attribute__((packed));
-
-struct c2wr_stag_invalidate_req {
-       struct c2_sq_hdr sq_hdr;
-       u8 stag_key;
-       u8 pad[3];
-       u32 stag_index;
-} __attribute__((packed));
-
-union c2wr_stag_invalidate {
-       struct c2wr_stag_invalidate_req req;
-       struct c2wr_ce rep;
-} __attribute__((packed));
-
-union c2wr_sqwr {
-       struct c2_sq_hdr sq_hdr;
-       struct c2wr_send_req send;
-       struct c2wr_send_req send_se;
-       struct c2wr_send_req send_inv;
-       struct c2wr_send_req send_se_inv;
-       struct c2wr_rdma_write_req rdma_write;
-       struct c2wr_rdma_read_req rdma_read;
-       struct c2wr_mw_bind_req mw_bind;
-       struct c2wr_nsmr_fastreg_req nsmr_fastreg;
-       struct c2wr_stag_invalidate_req stag_inv;
-} __attribute__((packed));
-
-
-/*
- * RQ WRs
- */
-struct c2wr_rqwr {
-       struct c2_rq_hdr rq_hdr;
-       u8 data[0];             /* array of SGEs */
-} __attribute__((packed));
-
-union c2wr_recv {
-       struct c2wr_rqwr req;
-       struct c2wr_ce rep;
-} __attribute__((packed));
-
-/*
- * All AEs start with this header.  Most AEs only need to convey the
- * information in the header.  Some, like LLP connection events, need
- * more info.  The union typedef c2wr_ae_t has all the possible AEs.
- *
- * hdr.context is the user_context from the rnic_open WR.  NULL if this
- * is not affiliated with an rnic.
- *
- * hdr.id is the AE identifier (e.g. CCAE_REMOTE_SHUTDOWN,
- * CCAE_LLP_CLOSE_COMPLETE)
- *
- * resource_type is one of:  C2_RES_IND_QP, C2_RES_IND_CQ, C2_RES_IND_SRQ
- *
- * user_context is the context passed down when the host created the resource.
- */
-struct c2wr_ae_hdr {
-       struct c2wr_hdr hdr;
-       u64 user_context;       /* user context for this res. */
-       __be32 resource_type;   /* see enum c2_resource_indicator */
-       __be32 resource;        /* handle for resource */
-       __be32 qp_state;        /* current QP State */
-} __attribute__((packed));
-
-/*
- * After submitting the CCAE_ACTIVE_CONNECT_RESULTS message on the AEQ,
- * the adapter moves the QP into RTS state
- */
-struct c2wr_ae_active_connect_results {
-       struct c2wr_ae_hdr ae_hdr;
-       __be32 laddr;
-       __be32 raddr;
-       __be16 lport;
-       __be16 rport;
-       __be32 private_data_length;
-       u8 private_data[0];     /* data is in-line in the msg. */
-} __attribute__((packed));
-
-/*
- * When connections are established by the stack (and the private data
- * MPA frame is received), the adapter will generate an event to the host.
- * The details of the connection, any private data, and the new connection
- * request handle are passed up via the CCAE_CONNECTION_REQUEST msg on the
- * AE queue:
- */
-struct c2wr_ae_connection_request {
-       struct c2wr_ae_hdr ae_hdr;
-       u32 cr_handle;          /* connreq handle (sock ptr) */
-       __be32 laddr;
-       __be32 raddr;
-       __be16 lport;
-       __be16 rport;
-       __be32 private_data_length;
-       u8 private_data[0];     /* data is in-line in the msg. */
-} __attribute__((packed));
-
-union c2wr_ae {
-       struct c2wr_ae_hdr ae_generic;
-       struct c2wr_ae_active_connect_results ae_active_connect_results;
-       struct c2wr_ae_connection_request ae_connection_request;
-} __attribute__((packed));
-
-struct c2wr_init_req {
-       struct c2wr_hdr hdr;
-       __be64 hint_count;
-       __be64 q0_host_shared;
-       __be64 q1_host_shared;
-       __be64 q1_host_msg_pool;
-       __be64 q2_host_shared;
-       __be64 q2_host_msg_pool;
-} __attribute__((packed));
-
-struct c2wr_init_rep {
-       struct c2wr_hdr hdr;
-} __attribute__((packed));
-
-union c2wr_init {
-       struct c2wr_init_req req;
-       struct c2wr_init_rep rep;
-} __attribute__((packed));
-
-/*
- * For upgrading flash.
- */
-
-struct c2wr_flash_init_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-} __attribute__((packed));
-
-struct c2wr_flash_init_rep {
-       struct c2wr_hdr hdr;
-       u32 adapter_flash_buf_offset;
-       u32 adapter_flash_len;
-} __attribute__((packed));
-
-union c2wr_flash_init {
-       struct c2wr_flash_init_req req;
-       struct c2wr_flash_init_rep rep;
-} __attribute__((packed));
-
-struct c2wr_flash_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       u32 len;
-} __attribute__((packed));
-
-struct c2wr_flash_rep {
-       struct c2wr_hdr hdr;
-       u32 status;
-} __attribute__((packed));
-
-union c2wr_flash {
-       struct c2wr_flash_req req;
-       struct c2wr_flash_rep rep;
-} __attribute__((packed));
-
-struct c2wr_buf_alloc_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       u32 size;
-} __attribute__((packed));
-
-struct c2wr_buf_alloc_rep {
-       struct c2wr_hdr hdr;
-       u32 offset;             /* 0 if mem not available */
-       u32 size;               /* 0 if mem not available */
-} __attribute__((packed));
-
-union c2wr_buf_alloc {
-       struct c2wr_buf_alloc_req req;
-       struct c2wr_buf_alloc_rep rep;
-} __attribute__((packed));
-
-struct c2wr_buf_free_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       u32 offset;             /* Must match value from alloc */
-       u32 size;               /* Must match value from alloc */
-} __attribute__((packed));
-
-struct c2wr_buf_free_rep {
-       struct c2wr_hdr hdr;
-} __attribute__((packed));
-
-union c2wr_buf_free {
-       struct c2wr_buf_free_req req;
-       struct c2wr_ce rep;
-} __attribute__((packed));
-
-struct c2wr_flash_write_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       u32 offset;
-       u32 size;
-       u32 type;
-       u32 flags;
-} __attribute__((packed));
-
-struct c2wr_flash_write_rep {
-       struct c2wr_hdr hdr;
-       u32 status;
-} __attribute__((packed));
-
-union c2wr_flash_write {
-       struct c2wr_flash_write_req req;
-       struct c2wr_flash_write_rep rep;
-} __attribute__((packed));
-
-/*
- * Messages for LLP connection setup.
- */
-
-/*
- * Listen Request.  This allocates a listening endpoint to allow passive
- * connection setup.  Newly established LLP connections are passed up
- * via an AE.  See c2wr_ae_connection_request_t
- */
-struct c2wr_ep_listen_create_req {
-       struct c2wr_hdr hdr;
-       u64 user_context;       /* returned in AEs. */
-       u32 rnic_handle;
-       __be32 local_addr;              /* local addr, or 0  */
-       __be16 local_port;              /* 0 means "pick one" */
-       u16 pad;
-       __be32 backlog;         /* traditional tcp listen backlog */
-} __attribute__((packed));
-
-struct c2wr_ep_listen_create_rep {
-       struct c2wr_hdr hdr;
-       u32 ep_handle;          /* handle to new listening ep */
-       u16 local_port;         /* resulting port... */
-       u16 pad;
-} __attribute__((packed));
-
-union c2wr_ep_listen_create {
-       struct c2wr_ep_listen_create_req req;
-       struct c2wr_ep_listen_create_rep rep;
-} __attribute__((packed));
-
-struct c2wr_ep_listen_destroy_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       u32 ep_handle;
-} __attribute__((packed));
-
-struct c2wr_ep_listen_destroy_rep {
-       struct c2wr_hdr hdr;
-} __attribute__((packed));
-
-union c2wr_ep_listen_destroy {
-       struct c2wr_ep_listen_destroy_req req;
-       struct c2wr_ep_listen_destroy_rep rep;
-} __attribute__((packed));
-
-struct c2wr_ep_query_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       u32 ep_handle;
-} __attribute__((packed));
-
-struct c2wr_ep_query_rep {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       u32 local_addr;
-       u32 remote_addr;
-       u16 local_port;
-       u16 remote_port;
-} __attribute__((packed));
-
-union c2wr_ep_query {
-       struct c2wr_ep_query_req req;
-       struct c2wr_ep_query_rep rep;
-} __attribute__((packed));
-
-
-/*
- * The host passes this down to indicate acceptance of a pending iWARP
- * connection.  The cr_handle was obtained from the CONNECTION_REQUEST
- * AE passed up by the adapter.  See c2wr_ae_connection_request_t.
- */
-struct c2wr_cr_accept_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       u32 qp_handle;          /* QP to bind to this LLP conn */
-       u32 ep_handle;          /* LLP  handle to accept */
-       __be32 private_data_length;
-       u8 private_data[0];     /* data in-line in msg. */
-} __attribute__((packed));
-
-/*
- * The adapter sends a reply when the private data has been successfully
- * submitted to the LLP.
- */
-struct c2wr_cr_accept_rep {
-       struct c2wr_hdr hdr;
-} __attribute__((packed));
-
-union c2wr_cr_accept {
-       struct c2wr_cr_accept_req req;
-       struct c2wr_cr_accept_rep rep;
-} __attribute__((packed));
-
-/*
- * The host sends this down if a given iWARP connection request was
- * rejected by the consumer.  The cr_handle was obtained from a
- * previous c2wr_ae_connection_request_t AE sent by the adapter.
- */
-struct  c2wr_cr_reject_req {
-       struct c2wr_hdr hdr;
-       u32 rnic_handle;
-       u32 ep_handle;          /* LLP handle to reject */
-} __attribute__((packed));
-
-/*
- * Dunno if this is needed, but we'll add it for now.  The adapter will
- * send the reject_reply after the LLP endpoint has been destroyed.
- */
-struct  c2wr_cr_reject_rep {
-       struct c2wr_hdr hdr;
-} __attribute__((packed));
-
-union c2wr_cr_reject {
-       struct c2wr_cr_reject_req req;
-       struct c2wr_cr_reject_rep rep;
-} __attribute__((packed));
-
-/*
- * console command.  Used to implement a debug console over the verbs
- * request and reply queues.
- */
-
-/*
- * Console request message.  It contains:
- *     - message hdr with id = CCWR_CONSOLE
- *     - the physaddr/len of host memory to be used for the reply.
- *     - the command string, e.g. "netstat -s" or "zoneinfo"
- */
-struct c2wr_console_req {
-       struct c2wr_hdr hdr;            /* id = CCWR_CONSOLE */
-       u64 reply_buf;          /* pinned host buf for reply */
-       u32 reply_buf_len;      /* length of reply buffer */
-       u8 command[0];          /* NUL terminated ascii string */
-       /* containing the command req */
-} __attribute__((packed));
-
-/*
- * flags used in the console reply.
- */
-enum c2_console_flags {
-       CONS_REPLY_TRUNCATED = 0x00000001       /* reply was truncated */
-} __attribute__((packed));
-
-/*
- * Console reply message.
- * hdr.result contains the c2_status_t error if the reply was _not_ generated,
- * or C2_OK if the reply was generated.
- */
-struct c2wr_console_rep {
-       struct c2wr_hdr hdr;            /* id = CCWR_CONSOLE */
-       u32 flags;
-} __attribute__((packed));
-
-union c2wr_console {
-       struct c2wr_console_req req;
-       struct c2wr_console_rep rep;
-} __attribute__((packed));
-
-
-/*
- * Giant union with all WRs.  Makes life easier...
- */
-union c2wr {
-       struct c2wr_hdr hdr;
-       struct c2wr_user_hdr user_hdr;
-       union c2wr_rnic_open rnic_open;
-       union c2wr_rnic_query rnic_query;
-       union c2wr_rnic_getconfig rnic_getconfig;
-       union c2wr_rnic_setconfig rnic_setconfig;
-       union c2wr_rnic_close rnic_close;
-       union c2wr_cq_create cq_create;
-       union c2wr_cq_modify cq_modify;
-       union c2wr_cq_destroy cq_destroy;
-       union c2wr_pd_alloc pd_alloc;
-       union c2wr_pd_dealloc pd_dealloc;
-       union c2wr_srq_create srq_create;
-       union c2wr_srq_destroy srq_destroy;
-       union c2wr_qp_create qp_create;
-       union c2wr_qp_query qp_query;
-       union c2wr_qp_modify qp_modify;
-       union c2wr_qp_destroy qp_destroy;
-       struct c2wr_qp_connect qp_connect;
-       union c2wr_nsmr_stag_alloc nsmr_stag_alloc;
-       union c2wr_nsmr_register nsmr_register;
-       union c2wr_nsmr_pbl nsmr_pbl;
-       union c2wr_mr_query mr_query;
-       union c2wr_mw_query mw_query;
-       union c2wr_stag_dealloc stag_dealloc;
-       union c2wr_sqwr sqwr;
-       struct c2wr_rqwr rqwr;
-       struct c2wr_ce ce;
-       union c2wr_ae ae;
-       union c2wr_init init;
-       union c2wr_ep_listen_create ep_listen_create;
-       union c2wr_ep_listen_destroy ep_listen_destroy;
-       union c2wr_cr_accept cr_accept;
-       union c2wr_cr_reject cr_reject;
-       union c2wr_console console;
-       union c2wr_flash_init flash_init;
-       union c2wr_flash flash;
-       union c2wr_buf_alloc buf_alloc;
-       union c2wr_buf_free buf_free;
-       union c2wr_flash_write flash_write;
-} __attribute__((packed));
-
-
-/*
- * Accessors for the wr fields that are packed together tightly to
- * reduce the wr message size.  The wr arguments are void* so that
- * either a struct c2wr*, a struct c2wr_hdr*, or a pointer to any of the types
- * in the struct c2wr union can be passed in.
- */
-static __inline__ u8 c2_wr_get_id(void *wr)
-{
-       return ((struct c2wr_hdr *) wr)->id;
-}
-static __inline__ void c2_wr_set_id(void *wr, u8 id)
-{
-       ((struct c2wr_hdr *) wr)->id = id;
-}
-static __inline__ u8 c2_wr_get_result(void *wr)
-{
-       return ((struct c2wr_hdr *) wr)->result;
-}
-static __inline__ void c2_wr_set_result(void *wr, u8 result)
-{
-       ((struct c2wr_hdr *) wr)->result = result;
-}
-static __inline__ u8 c2_wr_get_flags(void *wr)
-{
-       return ((struct c2wr_hdr *) wr)->flags;
-}
-static __inline__ void c2_wr_set_flags(void *wr, u8 flags)
-{
-       ((struct c2wr_hdr *) wr)->flags = flags;
-}
-static __inline__ u8 c2_wr_get_sge_count(void *wr)
-{
-       return ((struct c2wr_hdr *) wr)->sge_count;
-}
-static __inline__ void c2_wr_set_sge_count(void *wr, u8 sge_count)
-{
-       ((struct c2wr_hdr *) wr)->sge_count = sge_count;
-}
-static __inline__ __be32 c2_wr_get_wqe_count(void *wr)
-{
-       return ((struct c2wr_hdr *) wr)->wqe_count;
-}
-static __inline__ void c2_wr_set_wqe_count(void *wr, u32 wqe_count)
-{
-       ((struct c2wr_hdr *) wr)->wqe_count = wqe_count;
-}
-
-#endif                         /* _C2_WR_H_ */
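
The accessor helpers at the end of this header exist because id, result, sge_count and flags share one tightly packed word of the c2wr_hdr. A hedged sketch of how a signalled SEND would be stamped through them before being copied into the adapter message queue (example_build_send() is a made-up name; the SGE fill-in is omitted):

/*
 * Illustrative sketch only -- not from this commit.
 */
static void example_build_send(union c2wr *wr, u64 user_context, u8 nsge)
{
        c2_wr_set_id(wr, C2_WR_TYPE_SEND);      /* WR type lives in the shared id byte */
        c2_wr_set_flags(wr, SQ_SIGNALED);       /* request a completion entry */
        c2_wr_set_sge_count(wr, nsge);          /* number of c2_data_addr entries */
        wr->sqwr.send.sq_hdr.user_hdr.hdr.context = user_context;
        /* the scatter/gather array itself follows in wr->sqwr.send.data[] */
}
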
index bbbe0184e5922f6dab1fce7c56a92d1422da986e..93308c45f298d921fb3fd25b195a2b5970471fe5 100644 (file)
@@ -800,7 +800,9 @@ static int iwch_dealloc_mw(struct ib_mw *mw)
        return 0;
 }
 
-static struct ib_mr *iwch_alloc_fast_reg_mr(struct ib_pd *pd, int pbl_depth)
+static struct ib_mr *iwch_alloc_mr(struct ib_pd *pd,
+                                  enum ib_mr_type mr_type,
+                                  u32 max_num_sg)
 {
        struct iwch_dev *rhp;
        struct iwch_pd *php;
@@ -809,6 +811,10 @@ static struct ib_mr *iwch_alloc_fast_reg_mr(struct ib_pd *pd, int pbl_depth)
        u32 stag = 0;
        int ret = 0;
 
+       if (mr_type != IB_MR_TYPE_MEM_REG ||
+           max_num_sg > T3_MAX_FASTREG_DEPTH)
+               return ERR_PTR(-EINVAL);
+
        php = to_iwch_pd(pd);
        rhp = php->rhp;
        mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
@@ -816,10 +822,10 @@ static struct ib_mr *iwch_alloc_fast_reg_mr(struct ib_pd *pd, int pbl_depth)
                goto err;
 
        mhp->rhp = rhp;
-       ret = iwch_alloc_pbl(mhp, pbl_depth);
+       ret = iwch_alloc_pbl(mhp, max_num_sg);
        if (ret)
                goto err1;
-       mhp->attr.pbl_size = pbl_depth;
+       mhp->attr.pbl_size = max_num_sg;
        ret = cxio_allocate_stag(&rhp->rdev, &stag, php->pdid,
                                 mhp->attr.pbl_size, mhp->attr.pbl_addr);
        if (ret)
@@ -1443,7 +1449,7 @@ int iwch_register_device(struct iwch_dev *dev)
        dev->ibdev.alloc_mw = iwch_alloc_mw;
        dev->ibdev.bind_mw = iwch_bind_mw;
        dev->ibdev.dealloc_mw = iwch_dealloc_mw;
-       dev->ibdev.alloc_fast_reg_mr = iwch_alloc_fast_reg_mr;
+       dev->ibdev.alloc_mr = iwch_alloc_mr;
        dev->ibdev.alloc_fast_reg_page_list = iwch_alloc_fastreg_pbl;
        dev->ibdev.free_fast_reg_page_list = iwch_free_fastreg_pbl;
        dev->ibdev.attach_mcast = iwch_multicast_attach;
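
The hunks above are part of the tree-wide switch from the provider hook alloc_fast_reg_mr(pd, pbl_depth) to the generic alloc_mr(pd, mr_type, max_num_sg), with the provider now rejecting anything other than IB_MR_TYPE_MEM_REG or a depth beyond its fast-register limit. From a consumer's point of view the call goes through ib_alloc_mr(); a hedged sketch follows, where MAX_PAGES_PER_MR and example_alloc_fr_mr() are made-up names.

#include <rdma/ib_verbs.h>

/* MAX_PAGES_PER_MR is a made-up ULP constant for this sketch. */
#define MAX_PAGES_PER_MR 128

static int example_alloc_fr_mr(struct ib_pd *pd)
{
        struct ib_mr *mr;

        mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, MAX_PAGES_PER_MR);
        if (IS_ERR(mr))
                return PTR_ERR(mr);

        /* ... post fast-register work requests that use this MR ... */

        return ib_dereg_mr(mr);
}
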
index 3ad8dc798f52c9101261882ac19138efb3905b94..debc39d2cbc2a61d66901dd20ef531703717ed29 100644 (file)
@@ -50,6 +50,7 @@
 #include <rdma/ib_addr.h>
 
 #include "iw_cxgb4.h"
+#include "clip_tbl.h"
 
 static char *states[] = {
        "idle",
@@ -115,11 +116,11 @@ module_param(ep_timeout_secs, int, 0644);
 MODULE_PARM_DESC(ep_timeout_secs, "CM Endpoint operation timeout "
                                   "in seconds (default=60)");
 
-static int mpa_rev = 1;
+static int mpa_rev = 2;
 module_param(mpa_rev, int, 0644);
 MODULE_PARM_DESC(mpa_rev, "MPA Revision, 0 supports amso1100, "
                "1 is RFC0544 spec compliant, 2 is IETF MPA Peer Connect Draft"
-               " compliant (default=1)");
+               " compliant (default=2)");
 
 static int markers_enabled;
 module_param(markers_enabled, int, 0644);
@@ -298,6 +299,16 @@ void _c4iw_free_ep(struct kref *kref)
        if (test_bit(QP_REFERENCED, &ep->com.flags))
                deref_qp(ep);
        if (test_bit(RELEASE_RESOURCES, &ep->com.flags)) {
+               if (ep->com.remote_addr.ss_family == AF_INET6) {
+                       struct sockaddr_in6 *sin6 =
+                                       (struct sockaddr_in6 *)
+                                       &ep->com.mapped_local_addr;
+
+                       cxgb4_clip_release(
+                                       ep->com.dev->rdev.lldi.ports[0],
+                                       (const u32 *)&sin6->sin6_addr.s6_addr,
+                                       1);
+               }
                remove_handle(ep->com.dev, &ep->com.dev->hwtid_idr, ep->hwtid);
                cxgb4_remove_tid(ep->com.dev->rdev.lldi.tids, 0, ep->hwtid);
                dst_release(ep->dst);
@@ -442,6 +453,12 @@ static void act_open_req_arp_failure(void *handle, struct sk_buff *skb)
        kfree_skb(skb);
        connect_reply_upcall(ep, -EHOSTUNREACH);
        state_set(&ep->com, DEAD);
+       if (ep->com.remote_addr.ss_family == AF_INET6) {
+               struct sockaddr_in6 *sin6 =
+                       (struct sockaddr_in6 *)&ep->com.mapped_local_addr;
+               cxgb4_clip_release(ep->com.dev->rdev.lldi.ports[0],
+                                  (const u32 *)&sin6->sin6_addr.s6_addr, 1);
+       }
        remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid);
        cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid);
        dst_release(ep->dst);
@@ -640,6 +657,7 @@ static int send_connect(struct c4iw_ep *ep)
        struct sockaddr_in6 *ra6 = (struct sockaddr_in6 *)
                                   &ep->com.mapped_remote_addr;
        int win;
+       int ret;
 
        wrlen = (ep->com.remote_addr.ss_family == AF_INET) ?
                        roundup(sizev4, 16) :
@@ -693,6 +711,11 @@ static int send_connect(struct c4iw_ep *ep)
                opt2 |= CONG_CNTRL_V(CONG_ALG_TAHOE);
                opt2 |= T5_ISS_F;
        }
+
+       if (ep->com.remote_addr.ss_family == AF_INET6)
+               cxgb4_clip_get(ep->com.dev->rdev.lldi.ports[0],
+                              (const u32 *)&la6->sin6_addr.s6_addr, 1);
+
        t4_set_arp_err_handler(skb, ep, act_open_req_arp_failure);
 
        if (is_t4(ep->com.dev->rdev.lldi.adapter_type)) {
@@ -790,7 +813,11 @@ static int send_connect(struct c4iw_ep *ep)
        }
 
        set_bit(ACT_OPEN_REQ, &ep->com.history);
-       return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
+       ret = c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
+       if (ret && ep->com.remote_addr.ss_family == AF_INET6)
+               cxgb4_clip_release(ep->com.dev->rdev.lldi.ports[0],
+                                  (const u32 *)&la6->sin6_addr.s6_addr, 1);
+       return ret;
 }
 
 static void send_mpa_req(struct c4iw_ep *ep, struct sk_buff *skb,
@@ -2091,6 +2118,15 @@ static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
        case CPL_ERR_CONN_EXIST:
                if (ep->retry_count++ < ACT_OPEN_RETRY_COUNT) {
                        set_bit(ACT_RETRY_INUSE, &ep->com.history);
+                       if (ep->com.remote_addr.ss_family == AF_INET6) {
+                               struct sockaddr_in6 *sin6 =
+                                               (struct sockaddr_in6 *)
+                                               &ep->com.mapped_local_addr;
+                               cxgb4_clip_release(
+                                               ep->com.dev->rdev.lldi.ports[0],
+                                               (const u32 *)
+                                               &sin6->sin6_addr.s6_addr, 1);
+                       }
                        remove_handle(ep->com.dev, &ep->com.dev->atid_idr,
                                        atid);
                        cxgb4_free_atid(t, atid);
@@ -2118,6 +2154,12 @@ static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
        connect_reply_upcall(ep, status2errno(status));
        state_set(&ep->com, DEAD);
 
+       if (ep->com.remote_addr.ss_family == AF_INET6) {
+               struct sockaddr_in6 *sin6 =
+                       (struct sockaddr_in6 *)&ep->com.mapped_local_addr;
+               cxgb4_clip_release(ep->com.dev->rdev.lldi.ports[0],
+                                  (const u32 *)&sin6->sin6_addr.s6_addr, 1);
+       }
        if (status && act_open_has_tid(status))
                cxgb4_remove_tid(ep->com.dev->rdev.lldi.tids, 0, GET_TID(rpl));
 
@@ -2302,6 +2344,7 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb)
        struct dst_entry *dst;
        __u8 local_ip[16], peer_ip[16];
        __be16 local_port, peer_port;
+       struct sockaddr_in6 *sin6;
        int err;
        u16 peer_mss = ntohs(req->tcpopt.mss);
        int iptype;
@@ -2400,9 +2443,7 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb)
                sin->sin_port = peer_port;
                sin->sin_addr.s_addr = *(__be32 *)peer_ip;
        } else {
-               struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)
-                       &child_ep->com.mapped_local_addr;
-
+               sin6 = (struct sockaddr_in6 *)&child_ep->com.mapped_local_addr;
                sin6->sin6_family = PF_INET6;
                sin6->sin6_port = local_port;
                memcpy(sin6->sin6_addr.s6_addr, local_ip, 16);
@@ -2436,6 +2477,11 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb)
        insert_handle(dev, &dev->hwtid_idr, child_ep, child_ep->hwtid);
        accept_cr(child_ep, skb, req);
        set_bit(PASS_ACCEPT_REQ, &child_ep->com.history);
+       if (iptype == 6) {
+               sin6 = (struct sockaddr_in6 *)&child_ep->com.mapped_local_addr;
+               cxgb4_clip_get(child_ep->com.dev->rdev.lldi.ports[0],
+                              (const u32 *)&sin6->sin6_addr.s6_addr, 1);
+       }
        goto out;
 reject:
        reject_cr(dev, hwtid, skb);
@@ -2672,6 +2718,15 @@ out:
        if (release)
                release_ep_resources(ep);
        else if (ep->retry_with_mpa_v1) {
+               if (ep->com.remote_addr.ss_family == AF_INET6) {
+                       struct sockaddr_in6 *sin6 =
+                                       (struct sockaddr_in6 *)
+                                       &ep->com.mapped_local_addr;
+                       cxgb4_clip_release(
+                                       ep->com.dev->rdev.lldi.ports[0],
+                                       (const u32 *)&sin6->sin6_addr.s6_addr,
+                                       1);
+               }
                remove_handle(ep->com.dev, &ep->com.dev->hwtid_idr, ep->hwtid);
                cxgb4_remove_tid(ep->com.dev->rdev.lldi.tids, 0, ep->hwtid);
                dst_release(ep->dst);
@@ -2976,7 +3031,7 @@ static int pick_local_ip6addrs(struct c4iw_dev *dev, struct iw_cm_id *cm_id)
        struct sockaddr_in6 *la6 = (struct sockaddr_in6 *)&cm_id->local_addr;
        struct sockaddr_in6 *ra6 = (struct sockaddr_in6 *)&cm_id->remote_addr;
 
-       if (get_lladdr(dev->rdev.lldi.ports[0], &addr, IFA_F_TENTATIVE)) {
+       if (!get_lladdr(dev->rdev.lldi.ports[0], &addr, IFA_F_TENTATIVE)) {
                memcpy(la6->sin6_addr.s6_addr, &addr, 16);
                memcpy(ra6->sin6_addr.s6_addr, &addr, 16);
                return 0;
@@ -3186,6 +3241,9 @@ static int create_server6(struct c4iw_dev *dev, struct c4iw_listen_ep *ep)
                pr_err("cxgb4_create_server6/filter failed err %d stid %d laddr %pI6 lport %d\n",
                       err, ep->stid,
                       sin6->sin6_addr.s6_addr, ntohs(sin6->sin6_port));
+       else
+               cxgb4_clip_get(ep->com.dev->rdev.lldi.ports[0],
+                              (const u32 *)&sin6->sin6_addr.s6_addr, 1);
        return err;
 }
 
@@ -3334,6 +3392,7 @@ int c4iw_destroy_listen(struct iw_cm_id *cm_id)
                        ep->com.dev->rdev.lldi.ports[0], ep->stid,
                        ep->com.dev->rdev.lldi.rxq_ids[0], 0);
        } else {
+               struct sockaddr_in6 *sin6;
                c4iw_init_wr_wait(&ep->com.wr_wait);
                err = cxgb4_remove_server(
                                ep->com.dev->rdev.lldi.ports[0], ep->stid,
@@ -3342,6 +3401,9 @@ int c4iw_destroy_listen(struct iw_cm_id *cm_id)
                        goto done;
                err = c4iw_wait_for_reply(&ep->com.dev->rdev, &ep->com.wr_wait,
                                          0, 0, __func__);
+               sin6 = (struct sockaddr_in6 *)&ep->com.mapped_local_addr;
+               cxgb4_clip_release(ep->com.dev->rdev.lldi.ports[0],
+                                  (const u32 *)&sin6->sin6_addr.s6_addr, 1);
        }
        remove_handle(ep->com.dev, &ep->com.dev->stid_idr, ep->stid);
        cxgb4_free_stid(ep->com.dev->rdev.lldi.tids, ep->stid,
@@ -3461,6 +3523,12 @@ static void active_ofld_conn_reply(struct c4iw_dev *dev, struct sk_buff *skb,
        mutex_unlock(&dev->rdev.stats.lock);
        connect_reply_upcall(ep, status2errno(req->retval));
        state_set(&ep->com, DEAD);
+       if (ep->com.remote_addr.ss_family == AF_INET6) {
+               struct sockaddr_in6 *sin6 =
+                       (struct sockaddr_in6 *)&ep->com.mapped_local_addr;
+               cxgb4_clip_release(ep->com.dev->rdev.lldi.ports[0],
+                                  (const u32 *)&sin6->sin6_addr.s6_addr, 1);
+       }
        remove_handle(dev, &dev->atid_idr, atid);
        cxgb4_free_atid(dev->rdev.lldi.tids, atid);
        dst_release(ep->dst);
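
The AF_INET6 branches added throughout this file keep the hardware CLIP (Compressed Local IP) table balanced: each path that programs an IPv6 local address takes a reference with cxgb4_clip_get(), and every error or teardown path drops it with cxgb4_clip_release(). A hedged sketch of the pattern, factored into a single helper for readability (the commit itself open-codes it at each site; example_clip_ref() is a made-up name):

/*
 * Illustrative helper only -- assumes iw_cxgb4.h and clip_tbl.h.
 */
static void example_clip_ref(struct c4iw_ep *ep, bool get)
{
        struct sockaddr_in6 *sin6;

        if (ep->com.remote_addr.ss_family != AF_INET6)
                return;

        sin6 = (struct sockaddr_in6 *)&ep->com.mapped_local_addr;
        if (get)
                cxgb4_clip_get(ep->com.dev->rdev.lldi.ports[0],
                               (const u32 *)&sin6->sin6_addr.s6_addr, 1);
        else
                cxgb4_clip_release(ep->com.dev->rdev.lldi.ports[0],
                                   (const u32 *)&sin6->sin6_addr.s6_addr, 1);
}
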
index cc77844fada38e2f3b00e51223e38dbbb7e29434..c7bb38c931a555b034e76580484d79851dd7546c 100644 (file)
@@ -970,7 +970,9 @@ void c4iw_free_fastreg_pbl(struct ib_fast_reg_page_list *page_list);
 struct ib_fast_reg_page_list *c4iw_alloc_fastreg_pbl(
                                        struct ib_device *device,
                                        int page_list_len);
-struct ib_mr *c4iw_alloc_fast_reg_mr(struct ib_pd *pd, int pbl_depth);
+struct ib_mr *c4iw_alloc_mr(struct ib_pd *pd,
+                           enum ib_mr_type mr_type,
+                           u32 max_num_sg);
 int c4iw_dealloc_mw(struct ib_mw *mw);
 struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd, enum ib_mw_type type);
 struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start,
index cff815b9170716a01bca790c586b7a9b14c586e7..026b91ebd5e2e6806acae6b81dd41d56d599b4d2 100644 (file)
@@ -853,7 +853,9 @@ int c4iw_dealloc_mw(struct ib_mw *mw)
        return 0;
 }
 
-struct ib_mr *c4iw_alloc_fast_reg_mr(struct ib_pd *pd, int pbl_depth)
+struct ib_mr *c4iw_alloc_mr(struct ib_pd *pd,
+                           enum ib_mr_type mr_type,
+                           u32 max_num_sg)
 {
        struct c4iw_dev *rhp;
        struct c4iw_pd *php;
@@ -862,6 +864,10 @@ struct ib_mr *c4iw_alloc_fast_reg_mr(struct ib_pd *pd, int pbl_depth)
        u32 stag = 0;
        int ret = 0;
 
+       if (mr_type != IB_MR_TYPE_MEM_REG ||
+           max_num_sg > t4_max_fr_depth(use_dsgl))
+               return ERR_PTR(-EINVAL);
+
        php = to_c4iw_pd(pd);
        rhp = php->rhp;
        mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
@@ -871,10 +877,10 @@ struct ib_mr *c4iw_alloc_fast_reg_mr(struct ib_pd *pd, int pbl_depth)
        }
 
        mhp->rhp = rhp;
-       ret = alloc_pbl(mhp, pbl_depth);
+       ret = alloc_pbl(mhp, max_num_sg);
        if (ret)
                goto err1;
-       mhp->attr.pbl_size = pbl_depth;
+       mhp->attr.pbl_size = max_num_sg;
        ret = allocate_stag(&rhp->rdev, &stag, php->pdid,
                                 mhp->attr.pbl_size, mhp->attr.pbl_addr);
        if (ret)
index 6eee3d3855415a2cc5cc376946e3d880453a4b23..7746113552e7b37cb38907c38dc5b4fca7fdebd4 100644 (file)
@@ -556,7 +556,7 @@ int c4iw_register_device(struct c4iw_dev *dev)
        dev->ibdev.alloc_mw = c4iw_alloc_mw;
        dev->ibdev.bind_mw = c4iw_bind_mw;
        dev->ibdev.dealloc_mw = c4iw_dealloc_mw;
-       dev->ibdev.alloc_fast_reg_mr = c4iw_alloc_fast_reg_mr;
+       dev->ibdev.alloc_mr = c4iw_alloc_mr;
        dev->ibdev.alloc_fast_reg_page_list = c4iw_alloc_fastreg_pbl;
        dev->ibdev.free_fast_reg_page_list = c4iw_free_fastreg_pbl;
        dev->ibdev.attach_mcast = c4iw_multicast_attach;
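
The iw_cxgb4.h, mem.c and provider.c hunks above track the core verbs change that replaces the driver-specific alloc_fast_reg_mr hook with the generic alloc_mr entry point: the provider now receives an ib_mr_type plus a max_num_sg count, and c4iw_alloc_mr() rejects anything other than IB_MR_TYPE_MEM_REG or a page-list depth beyond t4_max_fr_depth(). A minimal sketch of how an in-kernel consumer would call the consolidated verb (ib_alloc_mr() and IB_MR_TYPE_MEM_REG are the upstream names at the time of this merge; pd and max_sg are placeholder inputs):

#include <linux/err.h>
#include <rdma/ib_verbs.h>

/* Sketch only: request a registration MR through the consolidated verb. */
static struct ib_mr *alloc_reg_mr_example(struct ib_pd *pd, u32 max_sg)
{
	struct ib_mr *mr;

	/* With the change above, the cxgb4 provider returns -EINVAL for
	 * unsupported MR types or for max_sg beyond its fast-reg depth. */
	mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, max_sg);
	if (IS_ERR(mr))
		pr_warn("MR allocation failed: %ld\n", PTR_ERR(mr));

	return mr;
}
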
diff --git a/drivers/infiniband/hw/ehca/Kconfig b/drivers/infiniband/hw/ehca/Kconfig
deleted file mode 100644 (file)
index 59f807d..0000000
+++ /dev/null
@@ -1,9 +0,0 @@
-config INFINIBAND_EHCA
-       tristate "eHCA support"
-       depends on IBMEBUS
-       ---help---
-       This driver supports the IBM pSeries eHCA InfiniBand adapter.
-
-       To compile the driver as a module, choose M here. The module
-       will be called ib_ehca.
-
diff --git a/drivers/infiniband/hw/ehca/Makefile b/drivers/infiniband/hw/ehca/Makefile
deleted file mode 100644 (file)
index 74d284e..0000000
+++ /dev/null
@@ -1,16 +0,0 @@
-#  Authors: Heiko J Schick <schickhj@de.ibm.com>
-#           Christoph Raisch <raisch@de.ibm.com>
-#           Joachim Fenkes <fenkes@de.ibm.com>
-#
-#  Copyright (c) 2005 IBM Corporation
-#
-#  All rights reserved.
-#
-#  This source code is distributed under a dual license of GPL v2.0 and OpenIB BSD.
-
-obj-$(CONFIG_INFINIBAND_EHCA) += ib_ehca.o
-
-ib_ehca-objs  = ehca_main.o ehca_hca.o ehca_mcast.o ehca_pd.o ehca_av.o ehca_eq.o \
-               ehca_cq.o ehca_qp.o ehca_sqp.o ehca_mrmw.o ehca_reqs.o ehca_irq.o \
-               ehca_uverbs.o ipz_pt_fn.o hcp_if.o hcp_phyp.o
-
diff --git a/drivers/infiniband/hw/ehca/ehca_av.c b/drivers/infiniband/hw/ehca/ehca_av.c
deleted file mode 100644 (file)
index 4659263..0000000
+++ /dev/null
@@ -1,277 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  address vector functions
- *
- *  Authors: Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *           Khadija Souissi <souissik@de.ibm.com>
- *           Reinhard Ernst <rernst@de.ibm.com>
- *           Christoph Raisch <raisch@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/slab.h>
-
-#include "ehca_tools.h"
-#include "ehca_iverbs.h"
-#include "hcp_if.h"
-
-static struct kmem_cache *av_cache;
-
-int ehca_calc_ipd(struct ehca_shca *shca, int port,
-                 enum ib_rate path_rate, u32 *ipd)
-{
-       int path = ib_rate_to_mult(path_rate);
-       int link, ret;
-       struct ib_port_attr pa;
-
-       if (path_rate == IB_RATE_PORT_CURRENT) {
-               *ipd = 0;
-               return 0;
-       }
-
-       if (unlikely(path < 0)) {
-               ehca_err(&shca->ib_device, "Invalid static rate! path_rate=%x",
-                        path_rate);
-               return -EINVAL;
-       }
-
-       ret = ehca_query_port(&shca->ib_device, port, &pa);
-       if (unlikely(ret < 0)) {
-               ehca_err(&shca->ib_device, "Failed to query port  ret=%i", ret);
-               return ret;
-       }
-
-       link = ib_width_enum_to_int(pa.active_width) * pa.active_speed;
-
-       if (path >= link)
-               /* no need to throttle if path faster than link */
-               *ipd = 0;
-       else
-               /* IPD = round((link / path) - 1) */
-               *ipd = ((link + (path >> 1)) / path) - 1;
-
-       return 0;
-}
-
-struct ib_ah *ehca_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
-{
-       int ret;
-       struct ehca_av *av;
-       struct ehca_shca *shca = container_of(pd->device, struct ehca_shca,
-                                             ib_device);
-
-       av = kmem_cache_alloc(av_cache, GFP_KERNEL);
-       if (!av) {
-               ehca_err(pd->device, "Out of memory pd=%p ah_attr=%p",
-                        pd, ah_attr);
-               return ERR_PTR(-ENOMEM);
-       }
-
-       av->av.sl = ah_attr->sl;
-       av->av.dlid = ah_attr->dlid;
-       av->av.slid_path_bits = ah_attr->src_path_bits;
-
-       if (ehca_static_rate < 0) {
-               u32 ipd;
-               if (ehca_calc_ipd(shca, ah_attr->port_num,
-                                 ah_attr->static_rate, &ipd)) {
-                       ret = -EINVAL;
-                       goto create_ah_exit1;
-               }
-               av->av.ipd = ipd;
-       } else
-               av->av.ipd = ehca_static_rate;
-
-       av->av.lnh = ah_attr->ah_flags;
-       av->av.grh.word_0 = EHCA_BMASK_SET(GRH_IPVERSION_MASK, 6);
-       av->av.grh.word_0 |= EHCA_BMASK_SET(GRH_TCLASS_MASK,
-                                           ah_attr->grh.traffic_class);
-       av->av.grh.word_0 |= EHCA_BMASK_SET(GRH_FLOWLABEL_MASK,
-                                           ah_attr->grh.flow_label);
-       av->av.grh.word_0 |= EHCA_BMASK_SET(GRH_HOPLIMIT_MASK,
-                                           ah_attr->grh.hop_limit);
-       av->av.grh.word_0 |= EHCA_BMASK_SET(GRH_NEXTHEADER_MASK, 0x1B);
-       /* set sgid in grh.word_1 */
-       if (ah_attr->ah_flags & IB_AH_GRH) {
-               int rc;
-               struct ib_port_attr port_attr;
-               union ib_gid gid;
-               memset(&port_attr, 0, sizeof(port_attr));
-               rc = ehca_query_port(pd->device, ah_attr->port_num,
-                                    &port_attr);
-               if (rc) { /* invalid port number */
-                       ret = -EINVAL;
-                       ehca_err(pd->device, "Invalid port number "
-                                "ehca_query_port() returned %x "
-                                "pd=%p ah_attr=%p", rc, pd, ah_attr);
-                       goto create_ah_exit1;
-               }
-               memset(&gid, 0, sizeof(gid));
-               rc = ehca_query_gid(pd->device,
-                                   ah_attr->port_num,
-                                   ah_attr->grh.sgid_index, &gid);
-               if (rc) {
-                       ret = -EINVAL;
-                       ehca_err(pd->device, "Failed to retrieve sgid "
-                                "ehca_query_gid() returned %x "
-                                "pd=%p ah_attr=%p", rc, pd, ah_attr);
-                       goto create_ah_exit1;
-               }
-               memcpy(&av->av.grh.word_1, &gid, sizeof(gid));
-       }
-       av->av.pmtu = shca->max_mtu;
-
-       /* dgid comes in grh.word_3 */
-       memcpy(&av->av.grh.word_3, &ah_attr->grh.dgid,
-              sizeof(ah_attr->grh.dgid));
-
-       return &av->ib_ah;
-
-create_ah_exit1:
-       kmem_cache_free(av_cache, av);
-
-       return ERR_PTR(ret);
-}
-
-int ehca_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr)
-{
-       struct ehca_av *av;
-       struct ehca_ud_av new_ehca_av;
-       struct ehca_shca *shca = container_of(ah->pd->device, struct ehca_shca,
-                                             ib_device);
-
-       memset(&new_ehca_av, 0, sizeof(new_ehca_av));
-       new_ehca_av.sl = ah_attr->sl;
-       new_ehca_av.dlid = ah_attr->dlid;
-       new_ehca_av.slid_path_bits = ah_attr->src_path_bits;
-       new_ehca_av.ipd = ah_attr->static_rate;
-       new_ehca_av.lnh = EHCA_BMASK_SET(GRH_FLAG_MASK,
-                                        (ah_attr->ah_flags & IB_AH_GRH) > 0);
-       new_ehca_av.grh.word_0 = EHCA_BMASK_SET(GRH_TCLASS_MASK,
-                                               ah_attr->grh.traffic_class);
-       new_ehca_av.grh.word_0 |= EHCA_BMASK_SET(GRH_FLOWLABEL_MASK,
-                                                ah_attr->grh.flow_label);
-       new_ehca_av.grh.word_0 |= EHCA_BMASK_SET(GRH_HOPLIMIT_MASK,
-                                                ah_attr->grh.hop_limit);
-       new_ehca_av.grh.word_0 |= EHCA_BMASK_SET(GRH_NEXTHEADER_MASK, 0x1b);
-
-       /* set sgid in grh.word_1 */
-       if (ah_attr->ah_flags & IB_AH_GRH) {
-               int rc;
-               struct ib_port_attr port_attr;
-               union ib_gid gid;
-               memset(&port_attr, 0, sizeof(port_attr));
-               rc = ehca_query_port(ah->device, ah_attr->port_num,
-                                    &port_attr);
-               if (rc) { /* invalid port number */
-                       ehca_err(ah->device, "Invalid port number "
-                                "ehca_query_port() returned %x "
-                                "ah=%p ah_attr=%p port_num=%x",
-                                rc, ah, ah_attr, ah_attr->port_num);
-                       return -EINVAL;
-               }
-               memset(&gid, 0, sizeof(gid));
-               rc = ehca_query_gid(ah->device,
-                                   ah_attr->port_num,
-                                   ah_attr->grh.sgid_index, &gid);
-               if (rc) {
-                       ehca_err(ah->device, "Failed to retrieve sgid "
-                                "ehca_query_gid() returned %x "
-                                "ah=%p ah_attr=%p port_num=%x "
-                                "sgid_index=%x",
-                                rc, ah, ah_attr, ah_attr->port_num,
-                                ah_attr->grh.sgid_index);
-                       return -EINVAL;
-               }
-               memcpy(&new_ehca_av.grh.word_1, &gid, sizeof(gid));
-       }
-
-       new_ehca_av.pmtu = shca->max_mtu;
-
-       memcpy(&new_ehca_av.grh.word_3, &ah_attr->grh.dgid,
-              sizeof(ah_attr->grh.dgid));
-
-       av = container_of(ah, struct ehca_av, ib_ah);
-       av->av = new_ehca_av;
-
-       return 0;
-}
-
-int ehca_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr)
-{
-       struct ehca_av *av = container_of(ah, struct ehca_av, ib_ah);
-
-       memcpy(&ah_attr->grh.dgid, &av->av.grh.word_3,
-              sizeof(ah_attr->grh.dgid));
-       ah_attr->sl = av->av.sl;
-
-       ah_attr->dlid = av->av.dlid;
-
-       ah_attr->src_path_bits = av->av.slid_path_bits;
-       ah_attr->static_rate = av->av.ipd;
-       ah_attr->ah_flags = EHCA_BMASK_GET(GRH_FLAG_MASK, av->av.lnh);
-       ah_attr->grh.traffic_class = EHCA_BMASK_GET(GRH_TCLASS_MASK,
-                                                   av->av.grh.word_0);
-       ah_attr->grh.hop_limit = EHCA_BMASK_GET(GRH_HOPLIMIT_MASK,
-                                               av->av.grh.word_0);
-       ah_attr->grh.flow_label = EHCA_BMASK_GET(GRH_FLOWLABEL_MASK,
-                                                av->av.grh.word_0);
-
-       return 0;
-}
-
-int ehca_destroy_ah(struct ib_ah *ah)
-{
-       kmem_cache_free(av_cache, container_of(ah, struct ehca_av, ib_ah));
-
-       return 0;
-}
-
-int ehca_init_av_cache(void)
-{
-       av_cache = kmem_cache_create("ehca_cache_av",
-                                  sizeof(struct ehca_av), 0,
-                                  SLAB_HWCACHE_ALIGN,
-                                  NULL);
-       if (!av_cache)
-               return -ENOMEM;
-       return 0;
-}
-
-void ehca_cleanup_av_cache(void)
-{
-       if (av_cache)
-               kmem_cache_destroy(av_cache);
-}
diff --git a/drivers/infiniband/hw/ehca/ehca_classes.h b/drivers/infiniband/hw/ehca/ehca_classes.h
deleted file mode 100644 (file)
index bd45e0f..0000000
+++ /dev/null
@@ -1,482 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  Struct definition for eHCA internal structures
- *
- *  Authors: Heiko J Schick <schickhj@de.ibm.com>
- *           Christoph Raisch <raisch@de.ibm.com>
- *           Joachim Fenkes <fenkes@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __EHCA_CLASSES_H__
-#define __EHCA_CLASSES_H__
-
-struct ehca_module;
-struct ehca_qp;
-struct ehca_cq;
-struct ehca_eq;
-struct ehca_mr;
-struct ehca_mw;
-struct ehca_pd;
-struct ehca_av;
-
-#include <linux/wait.h>
-#include <linux/mutex.h>
-
-#include <rdma/ib_verbs.h>
-#include <rdma/ib_user_verbs.h>
-
-#ifdef CONFIG_PPC64
-#include "ehca_classes_pSeries.h"
-#endif
-#include "ipz_pt_fn.h"
-#include "ehca_qes.h"
-#include "ehca_irq.h"
-
-#define EHCA_EQE_CACHE_SIZE 20
-#define EHCA_MAX_NUM_QUEUES 0xffff
-
-struct ehca_eqe_cache_entry {
-       struct ehca_eqe *eqe;
-       struct ehca_cq *cq;
-};
-
-struct ehca_eq {
-       u32 length;
-       struct ipz_queue ipz_queue;
-       struct ipz_eq_handle ipz_eq_handle;
-       struct work_struct work;
-       struct h_galpas galpas;
-       int is_initialized;
-       struct ehca_pfeq pf;
-       spinlock_t spinlock;
-       struct tasklet_struct interrupt_task;
-       u32 ist;
-       spinlock_t irq_spinlock;
-       struct ehca_eqe_cache_entry eqe_cache[EHCA_EQE_CACHE_SIZE];
-};
-
-struct ehca_sma_attr {
-       u16 lid, lmc, sm_sl, sm_lid;
-       u16 pkey_tbl_len, pkeys[16];
-};
-
-struct ehca_sport {
-       struct ib_cq *ibcq_aqp1;
-       struct ib_qp *ibqp_sqp[2];
-       /* lock to serialze modify_qp() calls for sqp in normal
-        * and irq path (when event PORT_ACTIVE is received first time)
-        */
-       spinlock_t mod_sqp_lock;
-       enum ib_port_state port_state;
-       struct ehca_sma_attr saved_attr;
-       u32 pma_qp_nr;
-};
-
-#define HCA_CAP_MR_PGSIZE_4K  0x80000000
-#define HCA_CAP_MR_PGSIZE_64K 0x40000000
-#define HCA_CAP_MR_PGSIZE_1M  0x20000000
-#define HCA_CAP_MR_PGSIZE_16M 0x10000000
-
-struct ehca_shca {
-       struct ib_device ib_device;
-       struct platform_device *ofdev;
-       u8 num_ports;
-       int hw_level;
-       struct list_head shca_list;
-       struct ipz_adapter_handle ipz_hca_handle;
-       struct ehca_sport sport[2];
-       struct ehca_eq eq;
-       struct ehca_eq neq;
-       struct ehca_mr *maxmr;
-       struct ehca_pd *pd;
-       struct h_galpas galpas;
-       struct mutex modify_mutex;
-       u64 hca_cap;
-       /* MR pgsize: bit 0-3 means 4K, 64K, 1M, 16M respectively */
-       u32 hca_cap_mr_pgsize;
-       int max_mtu;
-       int max_num_qps;
-       int max_num_cqs;
-       atomic_t num_cqs;
-       atomic_t num_qps;
-};
-
-struct ehca_pd {
-       struct ib_pd ib_pd;
-       struct ipz_pd fw_pd;
-       /* small queue mgmt */
-       struct mutex lock;
-       struct list_head free[2];
-       struct list_head full[2];
-};
-
-enum ehca_ext_qp_type {
-       EQPT_NORMAL    = 0,
-       EQPT_LLQP      = 1,
-       EQPT_SRQBASE   = 2,
-       EQPT_SRQ       = 3,
-};
-
-/* struct to cache modify_qp()'s parms for GSI/SMI qp */
-struct ehca_mod_qp_parm {
-       int mask;
-       struct ib_qp_attr attr;
-};
-
-#define EHCA_MOD_QP_PARM_MAX 4
-
-#define QMAP_IDX_MASK 0xFFFFULL
-
-/* struct for tracking if cqes have been reported to the application */
-struct ehca_qmap_entry {
-       u16 app_wr_id;
-       u8 reported;
-       u8 cqe_req;
-};
-
-struct ehca_queue_map {
-       struct ehca_qmap_entry *map;
-       unsigned int entries;
-       unsigned int tail;
-       unsigned int left_to_poll;
-       unsigned int next_wqe_idx;   /* Idx to first wqe to be flushed */
-};
-
-/* function to calculate the next index for the qmap */
-static inline unsigned int next_index(unsigned int cur_index, unsigned int limit)
-{
-       unsigned int temp = cur_index + 1;
-       return (temp == limit) ? 0 : temp;
-}
-
-struct ehca_qp {
-       union {
-               struct ib_qp ib_qp;
-               struct ib_srq ib_srq;
-       };
-       u32 qp_type;
-       enum ehca_ext_qp_type ext_type;
-       enum ib_qp_state state;
-       struct ipz_queue ipz_squeue;
-       struct ehca_queue_map sq_map;
-       struct ipz_queue ipz_rqueue;
-       struct ehca_queue_map rq_map;
-       struct h_galpas galpas;
-       u32 qkey;
-       u32 real_qp_num;
-       u32 token;
-       spinlock_t spinlock_s;
-       spinlock_t spinlock_r;
-       u32 sq_max_inline_data_size;
-       struct ipz_qp_handle ipz_qp_handle;
-       struct ehca_pfqp pf;
-       struct ib_qp_init_attr init_attr;
-       struct ehca_cq *send_cq;
-       struct ehca_cq *recv_cq;
-       unsigned int sqerr_purgeflag;
-       struct hlist_node list_entries;
-       /* array to cache modify_qp()'s parms for GSI/SMI qp */
-       struct ehca_mod_qp_parm *mod_qp_parm;
-       int mod_qp_parm_idx;
-       /* mmap counter for resources mapped into user space */
-       u32 mm_count_squeue;
-       u32 mm_count_rqueue;
-       u32 mm_count_galpa;
-       /* unsolicited ack circumvention */
-       int unsol_ack_circ;
-       int mtu_shift;
-       u32 message_count;
-       u32 packet_count;
-       atomic_t nr_events; /* events seen */
-       wait_queue_head_t wait_completion;
-       int mig_armed;
-       struct list_head sq_err_node;
-       struct list_head rq_err_node;
-};
-
-#define IS_SRQ(qp) (qp->ext_type == EQPT_SRQ)
-#define HAS_SQ(qp) (qp->ext_type != EQPT_SRQ)
-#define HAS_RQ(qp) (qp->ext_type != EQPT_SRQBASE)
-
-/* must be power of 2 */
-#define QP_HASHTAB_LEN 8
-
-struct ehca_cq {
-       struct ib_cq ib_cq;
-       struct ipz_queue ipz_queue;
-       struct h_galpas galpas;
-       spinlock_t spinlock;
-       u32 cq_number;
-       u32 token;
-       u32 nr_of_entries;
-       struct ipz_cq_handle ipz_cq_handle;
-       struct ehca_pfcq pf;
-       spinlock_t cb_lock;
-       struct hlist_head qp_hashtab[QP_HASHTAB_LEN];
-       struct list_head entry;
-       u32 nr_callbacks;   /* #events assigned to cpu by scaling code */
-       atomic_t nr_events; /* #events seen */
-       wait_queue_head_t wait_completion;
-       spinlock_t task_lock;
-       /* mmap counter for resources mapped into user space */
-       u32 mm_count_queue;
-       u32 mm_count_galpa;
-       struct list_head sqp_err_list;
-       struct list_head rqp_err_list;
-};
-
-enum ehca_mr_flag {
-       EHCA_MR_FLAG_FMR = 0x80000000,   /* FMR, created with ehca_alloc_fmr */
-       EHCA_MR_FLAG_MAXMR = 0x40000000, /* max-MR                           */
-};
-
-struct ehca_mr {
-       union {
-               struct ib_mr ib_mr;     /* must always be first in ehca_mr */
-               struct ib_fmr ib_fmr;   /* must always be first in ehca_mr */
-       } ib;
-       struct ib_umem *umem;
-       spinlock_t mrlock;
-
-       enum ehca_mr_flag flags;
-       u32 num_kpages;         /* number of kernel pages */
-       u32 num_hwpages;        /* number of hw pages to form MR */
-       u64 hwpage_size;        /* hw page size used for this MR */
-       int acl;                /* ACL (stored here for usage in reregister) */
-       u64 *start;             /* virtual start address (stored here for */
-                               /* usage in reregister) */
-       u64 size;               /* size (stored here for usage in reregister) */
-       u32 fmr_page_size;      /* page size for FMR */
-       u32 fmr_max_pages;      /* max pages for FMR */
-       u32 fmr_max_maps;       /* max outstanding maps for FMR */
-       u32 fmr_map_cnt;        /* map counter for FMR */
-       /* fw specific data */
-       struct ipz_mrmw_handle ipz_mr_handle;   /* MR handle for h-calls */
-       struct h_galpas galpas;
-};
-
-struct ehca_mw {
-       struct ib_mw ib_mw;     /* gen2 mw, must always be first in ehca_mw */
-       spinlock_t mwlock;
-
-       u8 never_bound;         /* indication MW was never bound */
-       struct ipz_mrmw_handle ipz_mw_handle;   /* MW handle for h-calls */
-       struct h_galpas galpas;
-};
-
-enum ehca_mr_pgi_type {
-       EHCA_MR_PGI_PHYS   = 1,  /* type of ehca_reg_phys_mr,
-                                 * ehca_rereg_phys_mr,
-                                 * ehca_reg_internal_maxmr */
-       EHCA_MR_PGI_USER   = 2,  /* type of ehca_reg_user_mr */
-       EHCA_MR_PGI_FMR    = 3   /* type of ehca_map_phys_fmr */
-};
-
-struct ehca_mr_pginfo {
-       enum ehca_mr_pgi_type type;
-       u64 num_kpages;
-       u64 kpage_cnt;
-       u64 hwpage_size;     /* hw page size used for this MR */
-       u64 num_hwpages;     /* number of hw pages */
-       u64 hwpage_cnt;      /* counter for hw pages */
-       u64 next_hwpage;     /* next hw page in buffer/chunk/listelem */
-
-       union {
-               struct { /* type EHCA_MR_PGI_PHYS section */
-                       int num_phys_buf;
-                       struct ib_phys_buf *phys_buf_array;
-                       u64 next_buf;
-               } phy;
-               struct { /* type EHCA_MR_PGI_USER section */
-                       struct ib_umem *region;
-                       struct scatterlist *next_sg;
-                       u64 next_nmap;
-               } usr;
-               struct { /* type EHCA_MR_PGI_FMR section */
-                       u64 fmr_pgsize;
-                       u64 *page_list;
-                       u64 next_listelem;
-               } fmr;
-       } u;
-};
-
-/* output parameters for MR/FMR hipz calls */
-struct ehca_mr_hipzout_parms {
-       struct ipz_mrmw_handle handle;
-       u32 lkey;
-       u32 rkey;
-       u64 len;
-       u64 vaddr;
-       u32 acl;
-};
-
-/* output parameters for MW hipz calls */
-struct ehca_mw_hipzout_parms {
-       struct ipz_mrmw_handle handle;
-       u32 rkey;
-};
-
-struct ehca_av {
-       struct ib_ah ib_ah;
-       struct ehca_ud_av av;
-};
-
-struct ehca_ucontext {
-       struct ib_ucontext ib_ucontext;
-};
-
-int ehca_init_pd_cache(void);
-void ehca_cleanup_pd_cache(void);
-int ehca_init_cq_cache(void);
-void ehca_cleanup_cq_cache(void);
-int ehca_init_qp_cache(void);
-void ehca_cleanup_qp_cache(void);
-int ehca_init_av_cache(void);
-void ehca_cleanup_av_cache(void);
-int ehca_init_mrmw_cache(void);
-void ehca_cleanup_mrmw_cache(void);
-int ehca_init_small_qp_cache(void);
-void ehca_cleanup_small_qp_cache(void);
-
-extern rwlock_t ehca_qp_idr_lock;
-extern rwlock_t ehca_cq_idr_lock;
-extern struct idr ehca_qp_idr;
-extern struct idr ehca_cq_idr;
-extern spinlock_t shca_list_lock;
-
-extern int ehca_static_rate;
-extern int ehca_port_act_time;
-extern bool ehca_use_hp_mr;
-extern bool ehca_scaling_code;
-extern int ehca_lock_hcalls;
-extern int ehca_nr_ports;
-extern int ehca_max_cq;
-extern int ehca_max_qp;
-
-struct ipzu_queue_resp {
-       u32 qe_size;      /* queue entry size */
-       u32 act_nr_of_sg;
-       u32 queue_length; /* queue length allocated in bytes */
-       u32 pagesize;
-       u32 toggle_state;
-       u32 offset; /* save offset within a page for small_qp */
-};
-
-struct ehca_create_cq_resp {
-       u32 cq_number;
-       u32 token;
-       struct ipzu_queue_resp ipz_queue;
-       u32 fw_handle_ofs;
-       u32 dummy;
-};
-
-struct ehca_create_qp_resp {
-       u32 qp_num;
-       u32 token;
-       u32 qp_type;
-       u32 ext_type;
-       u32 qkey;
-       /* qp_num assigned by ehca: sqp0/1 may have got different numbers */
-       u32 real_qp_num;
-       u32 fw_handle_ofs;
-       u32 dummy;
-       struct ipzu_queue_resp ipz_squeue;
-       struct ipzu_queue_resp ipz_rqueue;
-};
-
-struct ehca_alloc_cq_parms {
-       u32 nr_cqe;
-       u32 act_nr_of_entries;
-       u32 act_pages;
-       struct ipz_eq_handle eq_handle;
-};
-
-enum ehca_service_type {
-       ST_RC  = 0,
-       ST_UC  = 1,
-       ST_RD  = 2,
-       ST_UD  = 3,
-};
-
-enum ehca_ll_comp_flags {
-       LLQP_SEND_COMP = 0x20,
-       LLQP_RECV_COMP = 0x40,
-       LLQP_COMP_MASK = 0x60,
-};
-
-struct ehca_alloc_queue_parms {
-       /* input parameters */
-       int max_wr;
-       int max_sge;
-       int page_size;
-       int is_small;
-
-       /* output parameters */
-       u16 act_nr_wqes;
-       u8  act_nr_sges;
-       u32 queue_size; /* bytes for small queues, pages otherwise */
-};
-
-struct ehca_alloc_qp_parms {
-       struct ehca_alloc_queue_parms squeue;
-       struct ehca_alloc_queue_parms rqueue;
-
-       /* input parameters */
-       enum ehca_service_type servicetype;
-       int qp_storage;
-       int sigtype;
-       enum ehca_ext_qp_type ext_type;
-       enum ehca_ll_comp_flags ll_comp_flags;
-       int ud_av_l_key_ctl;
-
-       u32 token;
-       struct ipz_eq_handle eq_handle;
-       struct ipz_pd pd;
-       struct ipz_cq_handle send_cq_handle, recv_cq_handle;
-
-       u32 srq_qpn, srq_token, srq_limit;
-
-       /* output parameters */
-       u32 real_qp_num;
-       struct ipz_qp_handle qp_handle;
-       struct h_galpas galpas;
-};
-
-int ehca_cq_assign_qp(struct ehca_cq *cq, struct ehca_qp *qp);
-int ehca_cq_unassign_qp(struct ehca_cq *cq, unsigned int qp_num);
-struct ehca_qp *ehca_cq_get_qp(struct ehca_cq *cq, int qp_num);
-
-#endif
diff --git a/drivers/infiniband/hw/ehca/ehca_classes_pSeries.h b/drivers/infiniband/hw/ehca/ehca_classes_pSeries.h
deleted file mode 100644 (file)
index 689c357..0000000
+++ /dev/null
@@ -1,208 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  pSeries interface definitions
- *
- *  Authors: Waleri Fomin <fomin@de.ibm.com>
- *           Christoph Raisch <raisch@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __EHCA_CLASSES_PSERIES_H__
-#define __EHCA_CLASSES_PSERIES_H__
-
-#include "hcp_phyp.h"
-#include "ipz_pt_fn.h"
-
-
-struct ehca_pfqp {
-       struct ipz_qpt sqpt;
-       struct ipz_qpt rqpt;
-};
-
-struct ehca_pfcq {
-       struct ipz_qpt qpt;
-       u32 cqnr;
-};
-
-struct ehca_pfeq {
-       struct ipz_qpt qpt;
-       struct h_galpa galpa;
-       u32 eqnr;
-};
-
-struct ipz_adapter_handle {
-       u64 handle;
-};
-
-struct ipz_cq_handle {
-       u64 handle;
-};
-
-struct ipz_eq_handle {
-       u64 handle;
-};
-
-struct ipz_qp_handle {
-       u64 handle;
-};
-struct ipz_mrmw_handle {
-       u64 handle;
-};
-
-struct ipz_pd {
-       u32 value;
-};
-
-struct hcp_modify_qp_control_block {
-       u32 qkey;                      /* 00 */
-       u32 rdd;                       /* reliable datagram domain */
-       u32 send_psn;                  /* 02 */
-       u32 receive_psn;               /* 03 */
-       u32 prim_phys_port;            /* 04 */
-       u32 alt_phys_port;             /* 05 */
-       u32 prim_p_key_idx;            /* 06 */
-       u32 alt_p_key_idx;             /* 07 */
-       u32 rdma_atomic_ctrl;          /* 08 */
-       u32 qp_state;                  /* 09 */
-       u32 reserved_10;               /* 10 */
-       u32 rdma_nr_atomic_resp_res;   /* 11 */
-       u32 path_migration_state;      /* 12 */
-       u32 rdma_atomic_outst_dest_qp; /* 13 */
-       u32 dest_qp_nr;                /* 14 */
-       u32 min_rnr_nak_timer_field;   /* 15 */
-       u32 service_level;             /* 16 */
-       u32 send_grh_flag;             /* 17 */
-       u32 retry_count;               /* 18 */
-       u32 timeout;                   /* 19 */
-       u32 path_mtu;                  /* 20 */
-       u32 max_static_rate;           /* 21 */
-       u32 dlid;                      /* 22 */
-       u32 rnr_retry_count;           /* 23 */
-       u32 source_path_bits;          /* 24 */
-       u32 traffic_class;             /* 25 */
-       u32 hop_limit;                 /* 26 */
-       u32 source_gid_idx;            /* 27 */
-       u32 flow_label;                /* 28 */
-       u32 reserved_29;               /* 29 */
-       union {                        /* 30 */
-               u64 dw[2];
-               u8 byte[16];
-       } dest_gid;
-       u32 service_level_al;          /* 34 */
-       u32 send_grh_flag_al;          /* 35 */
-       u32 retry_count_al;            /* 36 */
-       u32 timeout_al;                /* 37 */
-       u32 max_static_rate_al;        /* 38 */
-       u32 dlid_al;                   /* 39 */
-       u32 rnr_retry_count_al;        /* 40 */
-       u32 source_path_bits_al;       /* 41 */
-       u32 traffic_class_al;          /* 42 */
-       u32 hop_limit_al;              /* 43 */
-       u32 source_gid_idx_al;         /* 44 */
-       u32 flow_label_al;             /* 45 */
-       u32 reserved_46;               /* 46 */
-       u32 reserved_47;               /* 47 */
-       union {                        /* 48 */
-               u64 dw[2];
-               u8 byte[16];
-       } dest_gid_al;
-       u32 max_nr_outst_send_wr;      /* 52 */
-       u32 max_nr_outst_recv_wr;      /* 53 */
-       u32 disable_ete_credit_check;  /* 54 */
-       u32 qp_number;                 /* 55 */
-       u64 send_queue_handle;         /* 56 */
-       u64 recv_queue_handle;         /* 58 */
-       u32 actual_nr_sges_in_sq_wqe;  /* 60 */
-       u32 actual_nr_sges_in_rq_wqe;  /* 61 */
-       u32 qp_enable;                 /* 62 */
-       u32 curr_srq_limit;            /* 63 */
-       u64 qp_aff_asyn_ev_log_reg;    /* 64 */
-       u64 shared_rq_hndl;            /* 66 */
-       u64 trigg_doorbell_qp_hndl;    /* 68 */
-       u32 reserved_70_127[58];       /* 70 */
-};
-
-#define MQPCB_MASK_QKEY                         EHCA_BMASK_IBM( 0,  0)
-#define MQPCB_MASK_SEND_PSN                     EHCA_BMASK_IBM( 2,  2)
-#define MQPCB_MASK_RECEIVE_PSN                  EHCA_BMASK_IBM( 3,  3)
-#define MQPCB_MASK_PRIM_PHYS_PORT               EHCA_BMASK_IBM( 4,  4)
-#define MQPCB_PRIM_PHYS_PORT                    EHCA_BMASK_IBM(24, 31)
-#define MQPCB_MASK_ALT_PHYS_PORT                EHCA_BMASK_IBM( 5,  5)
-#define MQPCB_MASK_PRIM_P_KEY_IDX               EHCA_BMASK_IBM( 6,  6)
-#define MQPCB_PRIM_P_KEY_IDX                    EHCA_BMASK_IBM(24, 31)
-#define MQPCB_MASK_ALT_P_KEY_IDX                EHCA_BMASK_IBM( 7,  7)
-#define MQPCB_MASK_RDMA_ATOMIC_CTRL             EHCA_BMASK_IBM( 8,  8)
-#define MQPCB_MASK_QP_STATE                     EHCA_BMASK_IBM( 9,  9)
-#define MQPCB_MASK_RDMA_NR_ATOMIC_RESP_RES      EHCA_BMASK_IBM(11, 11)
-#define MQPCB_MASK_PATH_MIGRATION_STATE         EHCA_BMASK_IBM(12, 12)
-#define MQPCB_MASK_RDMA_ATOMIC_OUTST_DEST_QP    EHCA_BMASK_IBM(13, 13)
-#define MQPCB_MASK_DEST_QP_NR                   EHCA_BMASK_IBM(14, 14)
-#define MQPCB_MASK_MIN_RNR_NAK_TIMER_FIELD      EHCA_BMASK_IBM(15, 15)
-#define MQPCB_MASK_SERVICE_LEVEL                EHCA_BMASK_IBM(16, 16)
-#define MQPCB_MASK_SEND_GRH_FLAG                EHCA_BMASK_IBM(17, 17)
-#define MQPCB_MASK_RETRY_COUNT                  EHCA_BMASK_IBM(18, 18)
-#define MQPCB_MASK_TIMEOUT                      EHCA_BMASK_IBM(19, 19)
-#define MQPCB_MASK_PATH_MTU                     EHCA_BMASK_IBM(20, 20)
-#define MQPCB_MASK_MAX_STATIC_RATE              EHCA_BMASK_IBM(21, 21)
-#define MQPCB_MASK_DLID                         EHCA_BMASK_IBM(22, 22)
-#define MQPCB_MASK_RNR_RETRY_COUNT              EHCA_BMASK_IBM(23, 23)
-#define MQPCB_MASK_SOURCE_PATH_BITS             EHCA_BMASK_IBM(24, 24)
-#define MQPCB_MASK_TRAFFIC_CLASS                EHCA_BMASK_IBM(25, 25)
-#define MQPCB_MASK_HOP_LIMIT                    EHCA_BMASK_IBM(26, 26)
-#define MQPCB_MASK_SOURCE_GID_IDX               EHCA_BMASK_IBM(27, 27)
-#define MQPCB_MASK_FLOW_LABEL                   EHCA_BMASK_IBM(28, 28)
-#define MQPCB_MASK_DEST_GID                     EHCA_BMASK_IBM(30, 30)
-#define MQPCB_MASK_SERVICE_LEVEL_AL             EHCA_BMASK_IBM(31, 31)
-#define MQPCB_MASK_SEND_GRH_FLAG_AL             EHCA_BMASK_IBM(32, 32)
-#define MQPCB_MASK_RETRY_COUNT_AL               EHCA_BMASK_IBM(33, 33)
-#define MQPCB_MASK_TIMEOUT_AL                   EHCA_BMASK_IBM(34, 34)
-#define MQPCB_MASK_MAX_STATIC_RATE_AL           EHCA_BMASK_IBM(35, 35)
-#define MQPCB_MASK_DLID_AL                      EHCA_BMASK_IBM(36, 36)
-#define MQPCB_MASK_RNR_RETRY_COUNT_AL           EHCA_BMASK_IBM(37, 37)
-#define MQPCB_MASK_SOURCE_PATH_BITS_AL          EHCA_BMASK_IBM(38, 38)
-#define MQPCB_MASK_TRAFFIC_CLASS_AL             EHCA_BMASK_IBM(39, 39)
-#define MQPCB_MASK_HOP_LIMIT_AL                 EHCA_BMASK_IBM(40, 40)
-#define MQPCB_MASK_SOURCE_GID_IDX_AL            EHCA_BMASK_IBM(41, 41)
-#define MQPCB_MASK_FLOW_LABEL_AL                EHCA_BMASK_IBM(42, 42)
-#define MQPCB_MASK_DEST_GID_AL                  EHCA_BMASK_IBM(44, 44)
-#define MQPCB_MASK_MAX_NR_OUTST_SEND_WR         EHCA_BMASK_IBM(45, 45)
-#define MQPCB_MASK_MAX_NR_OUTST_RECV_WR         EHCA_BMASK_IBM(46, 46)
-#define MQPCB_MASK_DISABLE_ETE_CREDIT_CHECK     EHCA_BMASK_IBM(47, 47)
-#define MQPCB_MASK_QP_ENABLE                    EHCA_BMASK_IBM(48, 48)
-#define MQPCB_MASK_CURR_SRQ_LIMIT               EHCA_BMASK_IBM(49, 49)
-#define MQPCB_MASK_QP_AFF_ASYN_EV_LOG_REG       EHCA_BMASK_IBM(50, 50)
-#define MQPCB_MASK_SHARED_RQ_HNDL               EHCA_BMASK_IBM(51, 51)
-
-#endif /* __EHCA_CLASSES_PSERIES_H__ */
diff --git a/drivers/infiniband/hw/ehca/ehca_cq.c b/drivers/infiniband/hw/ehca/ehca_cq.c
deleted file mode 100644 (file)
index 9b68b17..0000000
+++ /dev/null
@@ -1,397 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  Completion queue handling
- *
- *  Authors: Waleri Fomin <fomin@de.ibm.com>
- *           Khadija Souissi <souissi@de.ibm.com>
- *           Reinhard Ernst <rernst@de.ibm.com>
- *           Heiko J Schick <schickhj@de.ibm.com>
- *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/slab.h>
-
-#include "ehca_iverbs.h"
-#include "ehca_classes.h"
-#include "ehca_irq.h"
-#include "hcp_if.h"
-
-static struct kmem_cache *cq_cache;
-
-int ehca_cq_assign_qp(struct ehca_cq *cq, struct ehca_qp *qp)
-{
-       unsigned int qp_num = qp->real_qp_num;
-       unsigned int key = qp_num & (QP_HASHTAB_LEN-1);
-       unsigned long flags;
-
-       spin_lock_irqsave(&cq->spinlock, flags);
-       hlist_add_head(&qp->list_entries, &cq->qp_hashtab[key]);
-       spin_unlock_irqrestore(&cq->spinlock, flags);
-
-       ehca_dbg(cq->ib_cq.device, "cq_num=%x real_qp_num=%x",
-                cq->cq_number, qp_num);
-
-       return 0;
-}
-
-int ehca_cq_unassign_qp(struct ehca_cq *cq, unsigned int real_qp_num)
-{
-       int ret = -EINVAL;
-       unsigned int key = real_qp_num & (QP_HASHTAB_LEN-1);
-       struct hlist_node *iter;
-       struct ehca_qp *qp;
-       unsigned long flags;
-
-       spin_lock_irqsave(&cq->spinlock, flags);
-       hlist_for_each(iter, &cq->qp_hashtab[key]) {
-               qp = hlist_entry(iter, struct ehca_qp, list_entries);
-               if (qp->real_qp_num == real_qp_num) {
-                       hlist_del(iter);
-                       ehca_dbg(cq->ib_cq.device,
-                                "removed qp from cq .cq_num=%x real_qp_num=%x",
-                                cq->cq_number, real_qp_num);
-                       ret = 0;
-                       break;
-               }
-       }
-       spin_unlock_irqrestore(&cq->spinlock, flags);
-       if (ret)
-               ehca_err(cq->ib_cq.device,
-                        "qp not found cq_num=%x real_qp_num=%x",
-                        cq->cq_number, real_qp_num);
-
-       return ret;
-}
-
-struct ehca_qp *ehca_cq_get_qp(struct ehca_cq *cq, int real_qp_num)
-{
-       struct ehca_qp *ret = NULL;
-       unsigned int key = real_qp_num & (QP_HASHTAB_LEN-1);
-       struct hlist_node *iter;
-       struct ehca_qp *qp;
-       hlist_for_each(iter, &cq->qp_hashtab[key]) {
-               qp = hlist_entry(iter, struct ehca_qp, list_entries);
-               if (qp->real_qp_num == real_qp_num) {
-                       ret = qp;
-                       break;
-               }
-       }
-       return ret;
-}
-
-struct ib_cq *ehca_create_cq(struct ib_device *device,
-                            const struct ib_cq_init_attr *attr,
-                            struct ib_ucontext *context,
-                            struct ib_udata *udata)
-{
-       int cqe = attr->cqe;
-       static const u32 additional_cqe = 20;
-       struct ib_cq *cq;
-       struct ehca_cq *my_cq;
-       struct ehca_shca *shca =
-               container_of(device, struct ehca_shca, ib_device);
-       struct ipz_adapter_handle adapter_handle;
-       struct ehca_alloc_cq_parms param; /* h_call's out parameters */
-       struct h_galpa gal;
-       void *vpage;
-       u32 counter;
-       u64 rpage, cqx_fec, h_ret;
-       int ipz_rc, i;
-       unsigned long flags;
-
-       if (attr->flags)
-               return ERR_PTR(-EINVAL);
-
-       if (cqe >= 0xFFFFFFFF - 64 - additional_cqe)
-               return ERR_PTR(-EINVAL);
-
-       if (!atomic_add_unless(&shca->num_cqs, 1, shca->max_num_cqs)) {
-               ehca_err(device, "Unable to create CQ, max number of %i "
-                       "CQs reached.", shca->max_num_cqs);
-               ehca_err(device, "To increase the maximum number of CQs "
-                       "use the number_of_cqs module parameter.\n");
-               return ERR_PTR(-ENOSPC);
-       }
-
-       my_cq = kmem_cache_zalloc(cq_cache, GFP_KERNEL);
-       if (!my_cq) {
-               ehca_err(device, "Out of memory for ehca_cq struct device=%p",
-                        device);
-               atomic_dec(&shca->num_cqs);
-               return ERR_PTR(-ENOMEM);
-       }
-
-       memset(&param, 0, sizeof(struct ehca_alloc_cq_parms));
-
-       spin_lock_init(&my_cq->spinlock);
-       spin_lock_init(&my_cq->cb_lock);
-       spin_lock_init(&my_cq->task_lock);
-       atomic_set(&my_cq->nr_events, 0);
-       init_waitqueue_head(&my_cq->wait_completion);
-
-       cq = &my_cq->ib_cq;
-
-       adapter_handle = shca->ipz_hca_handle;
-       param.eq_handle = shca->eq.ipz_eq_handle;
-
-       idr_preload(GFP_KERNEL);
-       write_lock_irqsave(&ehca_cq_idr_lock, flags);
-       my_cq->token = idr_alloc(&ehca_cq_idr, my_cq, 0, 0x2000000, GFP_NOWAIT);
-       write_unlock_irqrestore(&ehca_cq_idr_lock, flags);
-       idr_preload_end();
-
-       if (my_cq->token < 0) {
-               cq = ERR_PTR(-ENOMEM);
-               ehca_err(device, "Can't allocate new idr entry. device=%p",
-                        device);
-               goto create_cq_exit1;
-       }
-
-       /*
-        * CQs maximum depth is 4GB-64, but we need additional 20 as buffer
-        * for receiving errors CQEs.
-        */
-       param.nr_cqe = cqe + additional_cqe;
-       h_ret = hipz_h_alloc_resource_cq(adapter_handle, my_cq, &param);
-
-       if (h_ret != H_SUCCESS) {
-               ehca_err(device, "hipz_h_alloc_resource_cq() failed "
-                        "h_ret=%lli device=%p", h_ret, device);
-               cq = ERR_PTR(ehca2ib_return_code(h_ret));
-               goto create_cq_exit2;
-       }
-
-       ipz_rc = ipz_queue_ctor(NULL, &my_cq->ipz_queue, param.act_pages,
-                               EHCA_PAGESIZE, sizeof(struct ehca_cqe), 0, 0);
-       if (!ipz_rc) {
-               ehca_err(device, "ipz_queue_ctor() failed ipz_rc=%i device=%p",
-                        ipz_rc, device);
-               cq = ERR_PTR(-EINVAL);
-               goto create_cq_exit3;
-       }
-
-       for (counter = 0; counter < param.act_pages; counter++) {
-               vpage = ipz_qpageit_get_inc(&my_cq->ipz_queue);
-               if (!vpage) {
-                       ehca_err(device, "ipz_qpageit_get_inc() "
-                                "returns NULL device=%p", device);
-                       cq = ERR_PTR(-EAGAIN);
-                       goto create_cq_exit4;
-               }
-               rpage = __pa(vpage);
-
-               h_ret = hipz_h_register_rpage_cq(adapter_handle,
-                                                my_cq->ipz_cq_handle,
-                                                &my_cq->pf,
-                                                0,
-                                                0,
-                                                rpage,
-                                                1,
-                                                my_cq->galpas.
-                                                kernel);
-
-               if (h_ret < H_SUCCESS) {
-                       ehca_err(device, "hipz_h_register_rpage_cq() failed "
-                                "ehca_cq=%p cq_num=%x h_ret=%lli counter=%i "
-                                "act_pages=%i", my_cq, my_cq->cq_number,
-                                h_ret, counter, param.act_pages);
-                       cq = ERR_PTR(-EINVAL);
-                       goto create_cq_exit4;
-               }
-
-               if (counter == (param.act_pages - 1)) {
-                       vpage = ipz_qpageit_get_inc(&my_cq->ipz_queue);
-                       if ((h_ret != H_SUCCESS) || vpage) {
-                               ehca_err(device, "Registration of pages not "
-                                        "complete ehca_cq=%p cq_num=%x "
-                                        "h_ret=%lli", my_cq, my_cq->cq_number,
-                                        h_ret);
-                               cq = ERR_PTR(-EAGAIN);
-                               goto create_cq_exit4;
-                       }
-               } else {
-                       if (h_ret != H_PAGE_REGISTERED) {
-                               ehca_err(device, "Registration of page failed "
-                                        "ehca_cq=%p cq_num=%x h_ret=%lli "
-                                        "counter=%i act_pages=%i",
-                                        my_cq, my_cq->cq_number,
-                                        h_ret, counter, param.act_pages);
-                               cq = ERR_PTR(-ENOMEM);
-                               goto create_cq_exit4;
-                       }
-               }
-       }
-
-       ipz_qeit_reset(&my_cq->ipz_queue);
-
-       gal = my_cq->galpas.kernel;
-       cqx_fec = hipz_galpa_load(gal, CQTEMM_OFFSET(cqx_fec));
-       ehca_dbg(device, "ehca_cq=%p cq_num=%x CQX_FEC=%llx",
-                my_cq, my_cq->cq_number, cqx_fec);
-
-       my_cq->ib_cq.cqe = my_cq->nr_of_entries =
-               param.act_nr_of_entries - additional_cqe;
-       my_cq->cq_number = (my_cq->ipz_cq_handle.handle) & 0xffff;
-
-       for (i = 0; i < QP_HASHTAB_LEN; i++)
-               INIT_HLIST_HEAD(&my_cq->qp_hashtab[i]);
-
-       INIT_LIST_HEAD(&my_cq->sqp_err_list);
-       INIT_LIST_HEAD(&my_cq->rqp_err_list);
-
-       if (context) {
-               struct ipz_queue *ipz_queue = &my_cq->ipz_queue;
-               struct ehca_create_cq_resp resp;
-               memset(&resp, 0, sizeof(resp));
-               resp.cq_number = my_cq->cq_number;
-               resp.token = my_cq->token;
-               resp.ipz_queue.qe_size = ipz_queue->qe_size;
-               resp.ipz_queue.act_nr_of_sg = ipz_queue->act_nr_of_sg;
-               resp.ipz_queue.queue_length = ipz_queue->queue_length;
-               resp.ipz_queue.pagesize = ipz_queue->pagesize;
-               resp.ipz_queue.toggle_state = ipz_queue->toggle_state;
-               resp.fw_handle_ofs = (u32)
-                       (my_cq->galpas.user.fw_handle & (PAGE_SIZE - 1));
-               if (ib_copy_to_udata(udata, &resp, sizeof(resp))) {
-                       ehca_err(device, "Copy to udata failed.");
-                       cq = ERR_PTR(-EFAULT);
-                       goto create_cq_exit4;
-               }
-       }
-
-       return cq;
-
-create_cq_exit4:
-       ipz_queue_dtor(NULL, &my_cq->ipz_queue);
-
-create_cq_exit3:
-       h_ret = hipz_h_destroy_cq(adapter_handle, my_cq, 1);
-       if (h_ret != H_SUCCESS)
-               ehca_err(device, "hipz_h_destroy_cq() failed ehca_cq=%p "
-                        "cq_num=%x h_ret=%lli", my_cq, my_cq->cq_number, h_ret);
-
-create_cq_exit2:
-       write_lock_irqsave(&ehca_cq_idr_lock, flags);
-       idr_remove(&ehca_cq_idr, my_cq->token);
-       write_unlock_irqrestore(&ehca_cq_idr_lock, flags);
-
-create_cq_exit1:
-       kmem_cache_free(cq_cache, my_cq);
-
-       atomic_dec(&shca->num_cqs);
-       return cq;
-}
-
-int ehca_destroy_cq(struct ib_cq *cq)
-{
-       u64 h_ret;
-       struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq);
-       int cq_num = my_cq->cq_number;
-       struct ib_device *device = cq->device;
-       struct ehca_shca *shca = container_of(device, struct ehca_shca,
-                                             ib_device);
-       struct ipz_adapter_handle adapter_handle = shca->ipz_hca_handle;
-       unsigned long flags;
-
-       if (cq->uobject) {
-               if (my_cq->mm_count_galpa || my_cq->mm_count_queue) {
-                       ehca_err(device, "Resources still referenced in "
-                                "user space cq_num=%x", my_cq->cq_number);
-                       return -EINVAL;
-               }
-       }
-
-       /*
-        * remove the CQ from the idr first to make sure
-        * no more interrupt tasklets will touch this CQ
-        */
-       write_lock_irqsave(&ehca_cq_idr_lock, flags);
-       idr_remove(&ehca_cq_idr, my_cq->token);
-       write_unlock_irqrestore(&ehca_cq_idr_lock, flags);
-
-       /* now wait until all pending events have completed */
-       wait_event(my_cq->wait_completion, !atomic_read(&my_cq->nr_events));
-
-       /* nobody's using our CQ any longer -- we can destroy it */
-       h_ret = hipz_h_destroy_cq(adapter_handle, my_cq, 0);
-       if (h_ret == H_R_STATE) {
-               /* cq in err: read err data and destroy it forcibly */
-               ehca_dbg(device, "ehca_cq=%p cq_num=%x resource=%llx in err "
-                        "state. Try to delete it forcibly.",
-                        my_cq, cq_num, my_cq->ipz_cq_handle.handle);
-               ehca_error_data(shca, my_cq, my_cq->ipz_cq_handle.handle);
-               h_ret = hipz_h_destroy_cq(adapter_handle, my_cq, 1);
-               if (h_ret == H_SUCCESS)
-                       ehca_dbg(device, "cq_num=%x deleted successfully.",
-                                cq_num);
-       }
-       if (h_ret != H_SUCCESS) {
-               ehca_err(device, "hipz_h_destroy_cq() failed h_ret=%lli "
-                        "ehca_cq=%p cq_num=%x", h_ret, my_cq, cq_num);
-               return ehca2ib_return_code(h_ret);
-       }
-       ipz_queue_dtor(NULL, &my_cq->ipz_queue);
-       kmem_cache_free(cq_cache, my_cq);
-
-       atomic_dec(&shca->num_cqs);
-       return 0;
-}
-
-int ehca_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata)
-{
-       /* TODO: proper resize needs to be done */
-       ehca_err(cq->device, "not implemented yet");
-
-       return -EFAULT;
-}
-
-int ehca_init_cq_cache(void)
-{
-       cq_cache = kmem_cache_create("ehca_cache_cq",
-                                    sizeof(struct ehca_cq), 0,
-                                    SLAB_HWCACHE_ALIGN,
-                                    NULL);
-       if (!cq_cache)
-               return -ENOMEM;
-       return 0;
-}
-
-void ehca_cleanup_cq_cache(void)
-{
-       if (cq_cache)
-               kmem_cache_destroy(cq_cache);
-}
diff --git a/drivers/infiniband/hw/ehca/ehca_eq.c b/drivers/infiniband/hw/ehca/ehca_eq.c
deleted file mode 100644 (file)
index 90da674..0000000
+++ /dev/null
@@ -1,189 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  Event queue handling
- *
- *  Authors: Waleri Fomin <fomin@de.ibm.com>
- *           Khadija Souissi <souissi@de.ibm.com>
- *           Reinhard Ernst <rernst@de.ibm.com>
- *           Heiko J Schick <schickhj@de.ibm.com>
- *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "ehca_classes.h"
-#include "ehca_irq.h"
-#include "ehca_iverbs.h"
-#include "ehca_qes.h"
-#include "hcp_if.h"
-#include "ipz_pt_fn.h"
-
-int ehca_create_eq(struct ehca_shca *shca,
-                  struct ehca_eq *eq,
-                  const enum ehca_eq_type type, const u32 length)
-{
-       int ret;
-       u64 h_ret;
-       u32 nr_pages;
-       u32 i;
-       void *vpage;
-       struct ib_device *ib_dev = &shca->ib_device;
-
-       spin_lock_init(&eq->spinlock);
-       spin_lock_init(&eq->irq_spinlock);
-       eq->is_initialized = 0;
-
-       if (type != EHCA_EQ && type != EHCA_NEQ) {
-               ehca_err(ib_dev, "Invalid EQ type %x. eq=%p", type, eq);
-               return -EINVAL;
-       }
-       if (!length) {
-               ehca_err(ib_dev, "EQ length must not be zero. eq=%p", eq);
-               return -EINVAL;
-       }
-
-       h_ret = hipz_h_alloc_resource_eq(shca->ipz_hca_handle,
-                                        &eq->pf,
-                                        type,
-                                        length,
-                                        &eq->ipz_eq_handle,
-                                        &eq->length,
-                                        &nr_pages, &eq->ist);
-
-       if (h_ret != H_SUCCESS) {
-               ehca_err(ib_dev, "Can't allocate EQ/NEQ. eq=%p", eq);
-               return -EINVAL;
-       }
-
-       ret = ipz_queue_ctor(NULL, &eq->ipz_queue, nr_pages,
-                            EHCA_PAGESIZE, sizeof(struct ehca_eqe), 0, 0);
-       if (!ret) {
-               ehca_err(ib_dev, "Can't allocate EQ pages eq=%p", eq);
-               goto create_eq_exit1;
-       }
-
-       for (i = 0; i < nr_pages; i++) {
-               u64 rpage;
-
-               vpage = ipz_qpageit_get_inc(&eq->ipz_queue);
-               if (!vpage)
-                       goto create_eq_exit2;
-
-               rpage = __pa(vpage);
-               h_ret = hipz_h_register_rpage_eq(shca->ipz_hca_handle,
-                                                eq->ipz_eq_handle,
-                                                &eq->pf,
-                                                0, 0, rpage, 1);
-
-               if (i == (nr_pages - 1)) {
-                       /* last page */
-                       vpage = ipz_qpageit_get_inc(&eq->ipz_queue);
-                       if (h_ret != H_SUCCESS || vpage)
-                               goto create_eq_exit2;
-               } else {
-                       if (h_ret != H_PAGE_REGISTERED)
-                               goto create_eq_exit2;
-               }
-       }
-
-       ipz_qeit_reset(&eq->ipz_queue);
-
-       /* register interrupt handlers and initialize work queues */
-       if (type == EHCA_EQ) {
-               tasklet_init(&eq->interrupt_task, ehca_tasklet_eq, (long)shca);
-
-               ret = ibmebus_request_irq(eq->ist, ehca_interrupt_eq,
-                                         0, "ehca_eq",
-                                         (void *)shca);
-               if (ret < 0)
-                       ehca_err(ib_dev, "Can't map interrupt handler.");
-       } else if (type == EHCA_NEQ) {
-               tasklet_init(&eq->interrupt_task, ehca_tasklet_neq, (long)shca);
-
-               ret = ibmebus_request_irq(eq->ist, ehca_interrupt_neq,
-                                         0, "ehca_neq",
-                                         (void *)shca);
-               if (ret < 0)
-                       ehca_err(ib_dev, "Can't map interrupt handler.");
-       }
-
-       eq->is_initialized = 1;
-
-       return 0;
-
-create_eq_exit2:
-       ipz_queue_dtor(NULL, &eq->ipz_queue);
-
-create_eq_exit1:
-       hipz_h_destroy_eq(shca->ipz_hca_handle, eq);
-
-       return -EINVAL;
-}
-
-void *ehca_poll_eq(struct ehca_shca *shca, struct ehca_eq *eq)
-{
-       unsigned long flags;
-       void *eqe;
-
-       spin_lock_irqsave(&eq->spinlock, flags);
-       eqe = ipz_eqit_eq_get_inc_valid(&eq->ipz_queue);
-       spin_unlock_irqrestore(&eq->spinlock, flags);
-
-       return eqe;
-}
-
-int ehca_destroy_eq(struct ehca_shca *shca, struct ehca_eq *eq)
-{
-       unsigned long flags;
-       u64 h_ret;
-
-       ibmebus_free_irq(eq->ist, (void *)shca);
-
-       spin_lock_irqsave(&shca_list_lock, flags);
-       eq->is_initialized = 0;
-       spin_unlock_irqrestore(&shca_list_lock, flags);
-
-       tasklet_kill(&eq->interrupt_task);
-
-       h_ret = hipz_h_destroy_eq(shca->ipz_hca_handle, eq);
-
-       if (h_ret != H_SUCCESS) {
-               ehca_err(&shca->ib_device, "Can't free EQ resources.");
-               return -EINVAL;
-       }
-       ipz_queue_dtor(NULL, &eq->ipz_queue);
-
-       return 0;
-}
diff --git a/drivers/infiniband/hw/ehca/ehca_hca.c b/drivers/infiniband/hw/ehca/ehca_hca.c
deleted file mode 100644 (file)
index e8b1bb6..0000000
+++ /dev/null
@@ -1,414 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  HCA query functions
- *
- *  Authors: Heiko J Schick <schickhj@de.ibm.com>
- *           Christoph Raisch <raisch@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/gfp.h>
-
-#include "ehca_tools.h"
-#include "ehca_iverbs.h"
-#include "hcp_if.h"
-
-static unsigned int limit_uint(unsigned int value)
-{
-       return min_t(unsigned int, value, INT_MAX);
-}
-
-int ehca_query_device(struct ib_device *ibdev, struct ib_device_attr *props,
-                     struct ib_udata *uhw)
-{
-       int i, ret = 0;
-       struct ehca_shca *shca = container_of(ibdev, struct ehca_shca,
-                                             ib_device);
-       struct hipz_query_hca *rblock;
-
-       static const u32 cap_mapping[] = {
-               IB_DEVICE_RESIZE_MAX_WR,      HCA_CAP_WQE_RESIZE,
-               IB_DEVICE_BAD_PKEY_CNTR,      HCA_CAP_BAD_P_KEY_CTR,
-               IB_DEVICE_BAD_QKEY_CNTR,      HCA_CAP_Q_KEY_VIOL_CTR,
-               IB_DEVICE_RAW_MULTI,          HCA_CAP_RAW_PACKET_MCAST,
-               IB_DEVICE_AUTO_PATH_MIG,      HCA_CAP_AUTO_PATH_MIG,
-               IB_DEVICE_CHANGE_PHY_PORT,    HCA_CAP_SQD_RTS_PORT_CHANGE,
-               IB_DEVICE_UD_AV_PORT_ENFORCE, HCA_CAP_AH_PORT_NR_CHECK,
-               IB_DEVICE_CURR_QP_STATE_MOD,  HCA_CAP_CUR_QP_STATE_MOD,
-               IB_DEVICE_SHUTDOWN_PORT,      HCA_CAP_SHUTDOWN_PORT,
-               IB_DEVICE_INIT_TYPE,          HCA_CAP_INIT_TYPE,
-               IB_DEVICE_PORT_ACTIVE_EVENT,  HCA_CAP_PORT_ACTIVE_EVENT,
-       };
-
-       if (uhw->inlen || uhw->outlen)
-               return -EINVAL;
-
-       rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
-       if (!rblock) {
-               ehca_err(&shca->ib_device, "Can't allocate rblock memory.");
-               return -ENOMEM;
-       }
-
-       if (hipz_h_query_hca(shca->ipz_hca_handle, rblock) != H_SUCCESS) {
-               ehca_err(&shca->ib_device, "Can't query device properties");
-               ret = -EINVAL;
-               goto query_device1;
-       }
-
-       memset(props, 0, sizeof(struct ib_device_attr));
-       props->page_size_cap   = shca->hca_cap_mr_pgsize;
-       props->fw_ver          = rblock->hw_ver;
-       props->max_mr_size     = rblock->max_mr_size;
-       props->vendor_id       = rblock->vendor_id >> 8;
-       props->vendor_part_id  = rblock->vendor_part_id >> 16;
-       props->hw_ver          = rblock->hw_ver;
-       props->max_qp          = limit_uint(rblock->max_qp);
-       props->max_qp_wr       = limit_uint(rblock->max_wqes_wq);
-       props->max_sge         = limit_uint(rblock->max_sge);
-       props->max_sge_rd      = limit_uint(rblock->max_sge_rd);
-       props->max_cq          = limit_uint(rblock->max_cq);
-       props->max_cqe         = limit_uint(rblock->max_cqe);
-       props->max_mr          = limit_uint(rblock->max_mr);
-       props->max_mw          = limit_uint(rblock->max_mw);
-       props->max_pd          = limit_uint(rblock->max_pd);
-       props->max_ah          = limit_uint(rblock->max_ah);
-       props->max_ee          = limit_uint(rblock->max_rd_ee_context);
-       props->max_rdd         = limit_uint(rblock->max_rd_domain);
-       props->max_fmr         = limit_uint(rblock->max_mr);
-       props->max_qp_rd_atom  = limit_uint(rblock->max_rr_qp);
-       props->max_ee_rd_atom  = limit_uint(rblock->max_rr_ee_context);
-       props->max_res_rd_atom = limit_uint(rblock->max_rr_hca);
-       props->max_qp_init_rd_atom = limit_uint(rblock->max_act_wqs_qp);
-       props->max_ee_init_rd_atom = limit_uint(rblock->max_act_wqs_ee_context);
-
-       if (EHCA_BMASK_GET(HCA_CAP_SRQ, shca->hca_cap)) {
-               props->max_srq         = limit_uint(props->max_qp);
-               props->max_srq_wr      = limit_uint(props->max_qp_wr);
-               props->max_srq_sge     = 3;
-       }
-
-       props->max_pkeys           = 16;
-       /* Some FW versions say 0 here; insert sensible value in that case */
-       props->local_ca_ack_delay  = rblock->local_ca_ack_delay ?
-               min_t(u8, rblock->local_ca_ack_delay, 255) : 12;
-       props->max_raw_ipv6_qp     = limit_uint(rblock->max_raw_ipv6_qp);
-       props->max_raw_ethy_qp     = limit_uint(rblock->max_raw_ethy_qp);
-       props->max_mcast_grp       = limit_uint(rblock->max_mcast_grp);
-       props->max_mcast_qp_attach = limit_uint(rblock->max_mcast_qp_attach);
-       props->max_total_mcast_qp_attach
-               = limit_uint(rblock->max_total_mcast_qp_attach);
-
-       /* translate device capabilities */
-       props->device_cap_flags = IB_DEVICE_SYS_IMAGE_GUID |
-               IB_DEVICE_RC_RNR_NAK_GEN | IB_DEVICE_N_NOTIFY_CQ;
-       for (i = 0; i < ARRAY_SIZE(cap_mapping); i += 2)
-               if (rblock->hca_cap_indicators & cap_mapping[i + 1])
-                       props->device_cap_flags |= cap_mapping[i];
-
-query_device1:
-       ehca_free_fw_ctrlblock(rblock);
-
-       return ret;
-}
-
-static enum ib_mtu map_mtu(struct ehca_shca *shca, u32 fw_mtu)
-{
-       switch (fw_mtu) {
-       case 0x1:
-               return IB_MTU_256;
-       case 0x2:
-               return IB_MTU_512;
-       case 0x3:
-               return IB_MTU_1024;
-       case 0x4:
-               return IB_MTU_2048;
-       case 0x5:
-               return IB_MTU_4096;
-       default:
-               ehca_err(&shca->ib_device, "Unknown MTU size: %x.",
-                        fw_mtu);
-               return 0;
-       }
-}
-
-static u8 map_number_of_vls(struct ehca_shca *shca, u32 vl_cap)
-{
-       switch (vl_cap) {
-       case 0x1:
-               return 1;
-       case 0x2:
-               return 2;
-       case 0x3:
-               return 4;
-       case 0x4:
-               return 8;
-       case 0x5:
-               return 15;
-       default:
-               ehca_err(&shca->ib_device, "invalid Vl Capability: %x.",
-                        vl_cap);
-               return 0;
-       }
-}
-
-int ehca_query_port(struct ib_device *ibdev,
-                   u8 port, struct ib_port_attr *props)
-{
-       int ret = 0;
-       u64 h_ret;
-       struct ehca_shca *shca = container_of(ibdev, struct ehca_shca,
-                                             ib_device);
-       struct hipz_query_port *rblock;
-
-       rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
-       if (!rblock) {
-               ehca_err(&shca->ib_device, "Can't allocate rblock memory.");
-               return -ENOMEM;
-       }
-
-       h_ret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(&shca->ib_device, "Can't query port properties");
-               ret = -EINVAL;
-               goto query_port1;
-       }
-
-       memset(props, 0, sizeof(struct ib_port_attr));
-
-       props->active_mtu = props->max_mtu = map_mtu(shca, rblock->max_mtu);
-       props->port_cap_flags  = rblock->capability_mask;
-       props->gid_tbl_len     = rblock->gid_tbl_len;
-       if (rblock->max_msg_sz)
-               props->max_msg_sz      = rblock->max_msg_sz;
-       else
-               props->max_msg_sz      = 0x1 << 31;
-       props->bad_pkey_cntr   = rblock->bad_pkey_cntr;
-       props->qkey_viol_cntr  = rblock->qkey_viol_cntr;
-       props->pkey_tbl_len    = rblock->pkey_tbl_len;
-       props->lid             = rblock->lid;
-       props->sm_lid          = rblock->sm_lid;
-       props->lmc             = rblock->lmc;
-       props->sm_sl           = rblock->sm_sl;
-       props->subnet_timeout  = rblock->subnet_timeout;
-       props->init_type_reply = rblock->init_type_reply;
-       props->max_vl_num      = map_number_of_vls(shca, rblock->vl_cap);
-
-       if (rblock->state && rblock->phys_width) {
-               props->phys_state      = rblock->phys_pstate;
-               props->state           = rblock->phys_state;
-               props->active_width    = rblock->phys_width;
-               props->active_speed    = rblock->phys_speed;
-       } else {
-               /* old firmware releases don't report physical
-                * port info, so use default values
-                */
-               props->phys_state      = 5;
-               props->state           = rblock->state;
-               props->active_width    = IB_WIDTH_12X;
-               props->active_speed    = IB_SPEED_SDR;
-       }
-
-query_port1:
-       ehca_free_fw_ctrlblock(rblock);
-
-       return ret;
-}
-
-int ehca_query_sma_attr(struct ehca_shca *shca,
-                       u8 port, struct ehca_sma_attr *attr)
-{
-       int ret = 0;
-       u64 h_ret;
-       struct hipz_query_port *rblock;
-
-       rblock = ehca_alloc_fw_ctrlblock(GFP_ATOMIC);
-       if (!rblock) {
-               ehca_err(&shca->ib_device, "Can't allocate rblock memory.");
-               return -ENOMEM;
-       }
-
-       h_ret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(&shca->ib_device, "Can't query port properties");
-               ret = -EINVAL;
-               goto query_sma_attr1;
-       }
-
-       memset(attr, 0, sizeof(struct ehca_sma_attr));
-
-       attr->lid    = rblock->lid;
-       attr->lmc    = rblock->lmc;
-       attr->sm_sl  = rblock->sm_sl;
-       attr->sm_lid = rblock->sm_lid;
-
-       attr->pkey_tbl_len = rblock->pkey_tbl_len;
-       memcpy(attr->pkeys, rblock->pkey_entries, sizeof(attr->pkeys));
-
-query_sma_attr1:
-       ehca_free_fw_ctrlblock(rblock);
-
-       return ret;
-}
-
-int ehca_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey)
-{
-       int ret = 0;
-       u64 h_ret;
-       struct ehca_shca *shca;
-       struct hipz_query_port *rblock;
-
-       shca = container_of(ibdev, struct ehca_shca, ib_device);
-       if (index > 16) {
-               ehca_err(&shca->ib_device, "Invalid index: %x.", index);
-               return -EINVAL;
-       }
-
-       rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
-       if (!rblock) {
-               ehca_err(&shca->ib_device,  "Can't allocate rblock memory.");
-               return -ENOMEM;
-       }
-
-       h_ret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(&shca->ib_device, "Can't query port properties");
-               ret = -EINVAL;
-               goto query_pkey1;
-       }
-
-       memcpy(pkey, &rblock->pkey_entries + index, sizeof(u16));
-
-query_pkey1:
-       ehca_free_fw_ctrlblock(rblock);
-
-       return ret;
-}
-
-int ehca_query_gid(struct ib_device *ibdev, u8 port,
-                  int index, union ib_gid *gid)
-{
-       int ret = 0;
-       u64 h_ret;
-       struct ehca_shca *shca = container_of(ibdev, struct ehca_shca,
-                                             ib_device);
-       struct hipz_query_port *rblock;
-
-       if (index < 0 || index > 255) {
-               ehca_err(&shca->ib_device, "Invalid index: %x.", index);
-               return -EINVAL;
-       }
-
-       rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
-       if (!rblock) {
-               ehca_err(&shca->ib_device, "Can't allocate rblock memory.");
-               return -ENOMEM;
-       }
-
-       h_ret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(&shca->ib_device, "Can't query port properties");
-               ret = -EINVAL;
-               goto query_gid1;
-       }
-
-       memcpy(&gid->raw[0], &rblock->gid_prefix, sizeof(u64));
-       memcpy(&gid->raw[8], &rblock->guid_entries[index], sizeof(u64));
-
-query_gid1:
-       ehca_free_fw_ctrlblock(rblock);
-
-       return ret;
-}
-
-static const u32 allowed_port_caps = (
-       IB_PORT_SM | IB_PORT_LED_INFO_SUP | IB_PORT_CM_SUP |
-       IB_PORT_SNMP_TUNNEL_SUP | IB_PORT_DEVICE_MGMT_SUP |
-       IB_PORT_VENDOR_CLASS_SUP);
-
-int ehca_modify_port(struct ib_device *ibdev,
-                    u8 port, int port_modify_mask,
-                    struct ib_port_modify *props)
-{
-       int ret = 0;
-       struct ehca_shca *shca;
-       struct hipz_query_port *rblock;
-       u32 cap;
-       u64 hret;
-
-       shca = container_of(ibdev, struct ehca_shca, ib_device);
-       if ((props->set_port_cap_mask | props->clr_port_cap_mask)
-           & ~allowed_port_caps) {
-               ehca_err(&shca->ib_device, "Non-changeable bits set in masks  "
-                        "set=%x  clr=%x  allowed=%x", props->set_port_cap_mask,
-                        props->clr_port_cap_mask, allowed_port_caps);
-               return -EINVAL;
-       }
-
-       if (mutex_lock_interruptible(&shca->modify_mutex))
-               return -ERESTARTSYS;
-
-       rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
-       if (!rblock) {
-               ehca_err(&shca->ib_device,  "Can't allocate rblock memory.");
-               ret = -ENOMEM;
-               goto modify_port1;
-       }
-
-       hret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock);
-       if (hret != H_SUCCESS) {
-               ehca_err(&shca->ib_device, "Can't query port properties");
-               ret = -EINVAL;
-               goto modify_port2;
-       }
-
-       cap = (rblock->capability_mask | props->set_port_cap_mask)
-               & ~props->clr_port_cap_mask;
-
-       hret = hipz_h_modify_port(shca->ipz_hca_handle, port,
-                                 cap, props->init_type, port_modify_mask);
-       if (hret != H_SUCCESS) {
-               ehca_err(&shca->ib_device, "Modify port failed  h_ret=%lli",
-                        hret);
-               ret = -EINVAL;
-       }
-
-modify_port2:
-       ehca_free_fw_ctrlblock(rblock);
-
-modify_port1:
-       mutex_unlock(&shca->modify_mutex);
-
-       return ret;
-}
diff --git a/drivers/infiniband/hw/ehca/ehca_irq.c b/drivers/infiniband/hw/ehca/ehca_irq.c
deleted file mode 100644 (file)
index 8615d7c..0000000
+++ /dev/null
@@ -1,870 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  Functions for EQs, NEQs and interrupts
- *
- *  Authors: Heiko J Schick <schickhj@de.ibm.com>
- *           Khadija Souissi <souissi@de.ibm.com>
- *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *           Joachim Fenkes <fenkes@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/slab.h>
-#include <linux/smpboot.h>
-
-#include "ehca_classes.h"
-#include "ehca_irq.h"
-#include "ehca_iverbs.h"
-#include "ehca_tools.h"
-#include "hcp_if.h"
-#include "hipz_fns.h"
-#include "ipz_pt_fn.h"
-
-#define EQE_COMPLETION_EVENT   EHCA_BMASK_IBM( 1,  1)
-#define EQE_CQ_QP_NUMBER       EHCA_BMASK_IBM( 8, 31)
-#define EQE_EE_IDENTIFIER      EHCA_BMASK_IBM( 2,  7)
-#define EQE_CQ_NUMBER          EHCA_BMASK_IBM( 8, 31)
-#define EQE_QP_NUMBER          EHCA_BMASK_IBM( 8, 31)
-#define EQE_QP_TOKEN           EHCA_BMASK_IBM(32, 63)
-#define EQE_CQ_TOKEN           EHCA_BMASK_IBM(32, 63)
-
-#define NEQE_COMPLETION_EVENT  EHCA_BMASK_IBM( 1,  1)
-#define NEQE_EVENT_CODE        EHCA_BMASK_IBM( 2,  7)
-#define NEQE_PORT_NUMBER       EHCA_BMASK_IBM( 8, 15)
-#define NEQE_PORT_AVAILABILITY EHCA_BMASK_IBM(16, 16)
-#define NEQE_DISRUPTIVE        EHCA_BMASK_IBM(16, 16)
-#define NEQE_SPECIFIC_EVENT    EHCA_BMASK_IBM(16, 23)
-
-#define ERROR_DATA_LENGTH      EHCA_BMASK_IBM(52, 63)
-#define ERROR_DATA_TYPE        EHCA_BMASK_IBM( 0,  7)
-
-static void queue_comp_task(struct ehca_cq *__cq);
-
-static struct ehca_comp_pool *pool;
-
-static inline void comp_event_callback(struct ehca_cq *cq)
-{
-       if (!cq->ib_cq.comp_handler)
-               return;
-
-       spin_lock(&cq->cb_lock);
-       cq->ib_cq.comp_handler(&cq->ib_cq, cq->ib_cq.cq_context);
-       spin_unlock(&cq->cb_lock);
-
-       return;
-}
-
-static void print_error_data(struct ehca_shca *shca, void *data,
-                            u64 *rblock, int length)
-{
-       u64 type = EHCA_BMASK_GET(ERROR_DATA_TYPE, rblock[2]);
-       u64 resource = rblock[1];
-
-       switch (type) {
-       case 0x1: /* Queue Pair */
-       {
-               struct ehca_qp *qp = (struct ehca_qp *)data;
-
-               /* only print error data if AER is set */
-               if (rblock[6] == 0)
-                       return;
-
-               ehca_err(&shca->ib_device,
-                        "QP 0x%x (resource=%llx) has errors.",
-                        qp->ib_qp.qp_num, resource);
-               break;
-       }
-       case 0x4: /* Completion Queue */
-       {
-               struct ehca_cq *cq = (struct ehca_cq *)data;
-
-               ehca_err(&shca->ib_device,
-                        "CQ 0x%x (resource=%llx) has errors.",
-                        cq->cq_number, resource);
-               break;
-       }
-       default:
-               ehca_err(&shca->ib_device,
-                        "Unknown error type: %llx on %s.",
-                        type, shca->ib_device.name);
-               break;
-       }
-
-       ehca_err(&shca->ib_device, "Error data is available: %llx.", resource);
-       ehca_err(&shca->ib_device, "EHCA ----- error data begin "
-                "---------------------------------------------------");
-       ehca_dmp(rblock, length, "resource=%llx", resource);
-       ehca_err(&shca->ib_device, "EHCA ----- error data end "
-                "----------------------------------------------------");
-
-       return;
-}
-
-int ehca_error_data(struct ehca_shca *shca, void *data,
-                   u64 resource)
-{
-
-       unsigned long ret;
-       u64 *rblock;
-       unsigned long block_count;
-
-       rblock = ehca_alloc_fw_ctrlblock(GFP_ATOMIC);
-       if (!rblock) {
-               ehca_err(&shca->ib_device, "Cannot allocate rblock memory.");
-               ret = -ENOMEM;
-               goto error_data1;
-       }
-
-       /* rblock must be 4K aligned and should be 4K large */
-       ret = hipz_h_error_data(shca->ipz_hca_handle,
-                               resource,
-                               rblock,
-                               &block_count);
-
-       if (ret == H_R_STATE)
-               ehca_err(&shca->ib_device,
-                        "No error data is available: %llx.", resource);
-       else if (ret == H_SUCCESS) {
-               int length;
-
-               length = EHCA_BMASK_GET(ERROR_DATA_LENGTH, rblock[0]);
-
-               if (length > EHCA_PAGESIZE)
-                       length = EHCA_PAGESIZE;
-
-               print_error_data(shca, data, rblock, length);
-       } else
-               ehca_err(&shca->ib_device,
-                        "Error data could not be fetched: %llx", resource);
-
-       ehca_free_fw_ctrlblock(rblock);
-
-error_data1:
-       return ret;
-
-}
-
-static void dispatch_qp_event(struct ehca_shca *shca, struct ehca_qp *qp,
-                             enum ib_event_type event_type)
-{
-       struct ib_event event;
-
-       /* PATH_MIG without the QP ever having been armed is false alarm */
-       if (event_type == IB_EVENT_PATH_MIG && !qp->mig_armed)
-               return;
-
-       event.device = &shca->ib_device;
-       event.event = event_type;
-
-       if (qp->ext_type == EQPT_SRQ) {
-               if (!qp->ib_srq.event_handler)
-                       return;
-
-               event.element.srq = &qp->ib_srq;
-               qp->ib_srq.event_handler(&event, qp->ib_srq.srq_context);
-       } else {
-               if (!qp->ib_qp.event_handler)
-                       return;
-
-               event.element.qp = &qp->ib_qp;
-               qp->ib_qp.event_handler(&event, qp->ib_qp.qp_context);
-       }
-}
-
-static void qp_event_callback(struct ehca_shca *shca, u64 eqe,
-                             enum ib_event_type event_type, int fatal)
-{
-       struct ehca_qp *qp;
-       u32 token = EHCA_BMASK_GET(EQE_QP_TOKEN, eqe);
-
-       read_lock(&ehca_qp_idr_lock);
-       qp = idr_find(&ehca_qp_idr, token);
-       if (qp)
-               atomic_inc(&qp->nr_events);
-       read_unlock(&ehca_qp_idr_lock);
-
-       if (!qp)
-               return;
-
-       if (fatal)
-               ehca_error_data(shca, qp, qp->ipz_qp_handle.handle);
-
-       dispatch_qp_event(shca, qp, fatal && qp->ext_type == EQPT_SRQ ?
-                         IB_EVENT_SRQ_ERR : event_type);
-
-       /*
-        * eHCA only processes one WQE at a time for SRQ base QPs,
-        * so the last WQE has been processed as soon as the QP enters
-        * error state.
-        */
-       if (fatal && qp->ext_type == EQPT_SRQBASE)
-               dispatch_qp_event(shca, qp, IB_EVENT_QP_LAST_WQE_REACHED);
-
-       if (atomic_dec_and_test(&qp->nr_events))
-               wake_up(&qp->wait_completion);
-       return;
-}
-
-static void cq_event_callback(struct ehca_shca *shca,
-                             u64 eqe)
-{
-       struct ehca_cq *cq;
-       u32 token = EHCA_BMASK_GET(EQE_CQ_TOKEN, eqe);
-
-       read_lock(&ehca_cq_idr_lock);
-       cq = idr_find(&ehca_cq_idr, token);
-       if (cq)
-               atomic_inc(&cq->nr_events);
-       read_unlock(&ehca_cq_idr_lock);
-
-       if (!cq)
-               return;
-
-       ehca_error_data(shca, cq, cq->ipz_cq_handle.handle);
-
-       if (atomic_dec_and_test(&cq->nr_events))
-               wake_up(&cq->wait_completion);
-
-       return;
-}
-
-static void parse_identifier(struct ehca_shca *shca, u64 eqe)
-{
-       u8 identifier = EHCA_BMASK_GET(EQE_EE_IDENTIFIER, eqe);
-
-       switch (identifier) {
-       case 0x02: /* path migrated */
-               qp_event_callback(shca, eqe, IB_EVENT_PATH_MIG, 0);
-               break;
-       case 0x03: /* communication established */
-               qp_event_callback(shca, eqe, IB_EVENT_COMM_EST, 0);
-               break;
-       case 0x04: /* send queue drained */
-               qp_event_callback(shca, eqe, IB_EVENT_SQ_DRAINED, 0);
-               break;
-       case 0x05: /* QP error */
-       case 0x06: /* QP error */
-               qp_event_callback(shca, eqe, IB_EVENT_QP_FATAL, 1);
-               break;
-       case 0x07: /* CQ error */
-       case 0x08: /* CQ error */
-               cq_event_callback(shca, eqe);
-               break;
-       case 0x09: /* MRMWPTE error */
-               ehca_err(&shca->ib_device, "MRMWPTE error.");
-               break;
-       case 0x0A: /* port event */
-               ehca_err(&shca->ib_device, "Port event.");
-               break;
-       case 0x0B: /* MR access error */
-               ehca_err(&shca->ib_device, "MR access error.");
-               break;
-       case 0x0C: /* EQ error */
-               ehca_err(&shca->ib_device, "EQ error.");
-               break;
-       case 0x0D: /* P/Q_Key mismatch */
-               ehca_err(&shca->ib_device, "P/Q_Key mismatch.");
-               break;
-       case 0x10: /* sampling complete */
-               ehca_err(&shca->ib_device, "Sampling complete.");
-               break;
-       case 0x11: /* unaffiliated access error */
-               ehca_err(&shca->ib_device, "Unaffiliated access error.");
-               break;
-       case 0x12: /* path migrating */
-               ehca_err(&shca->ib_device, "Path migrating.");
-               break;
-       case 0x13: /* interface trace stopped */
-               ehca_err(&shca->ib_device, "Interface trace stopped.");
-               break;
-       case 0x14: /* first error capture info available */
-               ehca_info(&shca->ib_device, "First error capture available");
-               break;
-       case 0x15: /* SRQ limit reached */
-               qp_event_callback(shca, eqe, IB_EVENT_SRQ_LIMIT_REACHED, 0);
-               break;
-       default:
-               ehca_err(&shca->ib_device, "Unknown identifier: %x on %s.",
-                        identifier, shca->ib_device.name);
-               break;
-       }
-
-       return;
-}
-
-static void dispatch_port_event(struct ehca_shca *shca, int port_num,
-                               enum ib_event_type type, const char *msg)
-{
-       struct ib_event event;
-
-       ehca_info(&shca->ib_device, "port %d %s.", port_num, msg);
-       event.device = &shca->ib_device;
-       event.event = type;
-       event.element.port_num = port_num;
-       ib_dispatch_event(&event);
-}
-
-static void notify_port_conf_change(struct ehca_shca *shca, int port_num)
-{
-       struct ehca_sma_attr  new_attr;
-       struct ehca_sma_attr *old_attr = &shca->sport[port_num - 1].saved_attr;
-
-       ehca_query_sma_attr(shca, port_num, &new_attr);
-
-       if (new_attr.sm_sl  != old_attr->sm_sl ||
-           new_attr.sm_lid != old_attr->sm_lid)
-               dispatch_port_event(shca, port_num, IB_EVENT_SM_CHANGE,
-                                   "SM changed");
-
-       if (new_attr.lid != old_attr->lid ||
-           new_attr.lmc != old_attr->lmc)
-               dispatch_port_event(shca, port_num, IB_EVENT_LID_CHANGE,
-                                   "LID changed");
-
-       if (new_attr.pkey_tbl_len != old_attr->pkey_tbl_len ||
-           memcmp(new_attr.pkeys, old_attr->pkeys,
-                  sizeof(u16) * new_attr.pkey_tbl_len))
-               dispatch_port_event(shca, port_num, IB_EVENT_PKEY_CHANGE,
-                                   "P_Key changed");
-
-       *old_attr = new_attr;
-}
-
-/* replay modify_qp for sqps -- return 0 if all is well, 1 if AQP1 destroyed */
-static int replay_modify_qp(struct ehca_sport *sport)
-{
-       int aqp1_destroyed;
-       unsigned long flags;
-
-       spin_lock_irqsave(&sport->mod_sqp_lock, flags);
-
-       aqp1_destroyed = !sport->ibqp_sqp[IB_QPT_GSI];
-
-       if (sport->ibqp_sqp[IB_QPT_SMI])
-               ehca_recover_sqp(sport->ibqp_sqp[IB_QPT_SMI]);
-       if (!aqp1_destroyed)
-               ehca_recover_sqp(sport->ibqp_sqp[IB_QPT_GSI]);
-
-       spin_unlock_irqrestore(&sport->mod_sqp_lock, flags);
-
-       return aqp1_destroyed;
-}
-
-static void parse_ec(struct ehca_shca *shca, u64 eqe)
-{
-       u8 ec   = EHCA_BMASK_GET(NEQE_EVENT_CODE, eqe);
-       u8 port = EHCA_BMASK_GET(NEQE_PORT_NUMBER, eqe);
-       u8 spec_event;
-       struct ehca_sport *sport = &shca->sport[port - 1];
-
-       switch (ec) {
-       case 0x30: /* port availability change */
-               if (EHCA_BMASK_GET(NEQE_PORT_AVAILABILITY, eqe)) {
-                       /* only replay modify_qp calls in autodetect mode;
-                        * if AQP1 was destroyed, the port is already down
-                        * again and we can drop the event.
-                        */
-                       if (ehca_nr_ports < 0)
-                               if (replay_modify_qp(sport))
-                                       break;
-
-                       sport->port_state = IB_PORT_ACTIVE;
-                       dispatch_port_event(shca, port, IB_EVENT_PORT_ACTIVE,
-                                           "is active");
-                       ehca_query_sma_attr(shca, port, &sport->saved_attr);
-               } else {
-                       sport->port_state = IB_PORT_DOWN;
-                       dispatch_port_event(shca, port, IB_EVENT_PORT_ERR,
-                                           "is inactive");
-               }
-               break;
-       case 0x31:
-               /* port configuration change
-                * disruptive change is caused by
-                * LID, PKEY or SM change
-                */
-               if (EHCA_BMASK_GET(NEQE_DISRUPTIVE, eqe)) {
-                       ehca_warn(&shca->ib_device, "disruptive port "
-                                 "%d configuration change", port);
-
-                       sport->port_state = IB_PORT_DOWN;
-                       dispatch_port_event(shca, port, IB_EVENT_PORT_ERR,
-                                           "is inactive");
-
-                       sport->port_state = IB_PORT_ACTIVE;
-                       dispatch_port_event(shca, port, IB_EVENT_PORT_ACTIVE,
-                                           "is active");
-                       ehca_query_sma_attr(shca, port,
-                                           &sport->saved_attr);
-               } else
-                       notify_port_conf_change(shca, port);
-               break;
-       case 0x32: /* adapter malfunction */
-               ehca_err(&shca->ib_device, "Adapter malfunction.");
-               break;
-       case 0x33:  /* trace stopped */
-               ehca_err(&shca->ib_device, "Traced stopped.");
-               break;
-       case 0x34: /* util async event */
-               spec_event = EHCA_BMASK_GET(NEQE_SPECIFIC_EVENT, eqe);
-               if (spec_event == 0x80) /* client reregister required */
-                       dispatch_port_event(shca, port,
-                                           IB_EVENT_CLIENT_REREGISTER,
-                                           "client reregister req.");
-               else
-                       ehca_warn(&shca->ib_device, "Unknown util async "
-                                 "event %x on port %x", spec_event, port);
-               break;
-       default:
-               ehca_err(&shca->ib_device, "Unknown event code: %x on %s.",
-                        ec, shca->ib_device.name);
-               break;
-       }
-
-       return;
-}
-
-static inline void reset_eq_pending(struct ehca_cq *cq)
-{
-       u64 CQx_EP;
-       struct h_galpa gal = cq->galpas.kernel;
-
-       hipz_galpa_store_cq(gal, cqx_ep, 0x0);
-       CQx_EP = hipz_galpa_load(gal, CQTEMM_OFFSET(cqx_ep));
-
-       return;
-}
-
-irqreturn_t ehca_interrupt_neq(int irq, void *dev_id)
-{
-       struct ehca_shca *shca = (struct ehca_shca*)dev_id;
-
-       tasklet_hi_schedule(&shca->neq.interrupt_task);
-
-       return IRQ_HANDLED;
-}
-
-void ehca_tasklet_neq(unsigned long data)
-{
-       struct ehca_shca *shca = (struct ehca_shca*)data;
-       struct ehca_eqe *eqe;
-       u64 ret;
-
-       eqe = ehca_poll_eq(shca, &shca->neq);
-
-       while (eqe) {
-               if (!EHCA_BMASK_GET(NEQE_COMPLETION_EVENT, eqe->entry))
-                       parse_ec(shca, eqe->entry);
-
-               eqe = ehca_poll_eq(shca, &shca->neq);
-       }
-
-       ret = hipz_h_reset_event(shca->ipz_hca_handle,
-                                shca->neq.ipz_eq_handle, 0xFFFFFFFFFFFFFFFFL);
-
-       if (ret != H_SUCCESS)
-               ehca_err(&shca->ib_device, "Can't clear notification events.");
-
-       return;
-}
-
-irqreturn_t ehca_interrupt_eq(int irq, void *dev_id)
-{
-       struct ehca_shca *shca = (struct ehca_shca*)dev_id;
-
-       tasklet_hi_schedule(&shca->eq.interrupt_task);
-
-       return IRQ_HANDLED;
-}
-
-
-static inline void process_eqe(struct ehca_shca *shca, struct ehca_eqe *eqe)
-{
-       u64 eqe_value;
-       u32 token;
-       struct ehca_cq *cq;
-
-       eqe_value = eqe->entry;
-       ehca_dbg(&shca->ib_device, "eqe_value=%llx", eqe_value);
-       if (EHCA_BMASK_GET(EQE_COMPLETION_EVENT, eqe_value)) {
-               ehca_dbg(&shca->ib_device, "Got completion event");
-               token = EHCA_BMASK_GET(EQE_CQ_TOKEN, eqe_value);
-               read_lock(&ehca_cq_idr_lock);
-               cq = idr_find(&ehca_cq_idr, token);
-               if (cq)
-                       atomic_inc(&cq->nr_events);
-               read_unlock(&ehca_cq_idr_lock);
-               if (cq == NULL) {
-                       ehca_err(&shca->ib_device,
-                                "Invalid eqe for non-existing cq token=%x",
-                                token);
-                       return;
-               }
-               reset_eq_pending(cq);
-               if (ehca_scaling_code)
-                       queue_comp_task(cq);
-               else {
-                       comp_event_callback(cq);
-                       if (atomic_dec_and_test(&cq->nr_events))
-                               wake_up(&cq->wait_completion);
-               }
-       } else {
-               ehca_dbg(&shca->ib_device, "Got non completion event");
-               parse_identifier(shca, eqe_value);
-       }
-}
-
-void ehca_process_eq(struct ehca_shca *shca, int is_irq)
-{
-       struct ehca_eq *eq = &shca->eq;
-       struct ehca_eqe_cache_entry *eqe_cache = eq->eqe_cache;
-       u64 eqe_value, ret;
-       int eqe_cnt, i;
-       int eq_empty = 0;
-
-       spin_lock(&eq->irq_spinlock);
-       if (is_irq) {
-               const int max_query_cnt = 100;
-               int query_cnt = 0;
-               int int_state = 1;
-               do {
-                       int_state = hipz_h_query_int_state(
-                               shca->ipz_hca_handle, eq->ist);
-                       query_cnt++;
-                       iosync();
-               } while (int_state && query_cnt < max_query_cnt);
-               if (unlikely((query_cnt == max_query_cnt)))
-                       ehca_dbg(&shca->ib_device, "int_state=%x query_cnt=%x",
-                                int_state, query_cnt);
-       }
-
-       /* read out all eqes */
-       eqe_cnt = 0;
-       do {
-               u32 token;
-               eqe_cache[eqe_cnt].eqe = ehca_poll_eq(shca, eq);
-               if (!eqe_cache[eqe_cnt].eqe)
-                       break;
-               eqe_value = eqe_cache[eqe_cnt].eqe->entry;
-               if (EHCA_BMASK_GET(EQE_COMPLETION_EVENT, eqe_value)) {
-                       token = EHCA_BMASK_GET(EQE_CQ_TOKEN, eqe_value);
-                       read_lock(&ehca_cq_idr_lock);
-                       eqe_cache[eqe_cnt].cq = idr_find(&ehca_cq_idr, token);
-                       if (eqe_cache[eqe_cnt].cq)
-                               atomic_inc(&eqe_cache[eqe_cnt].cq->nr_events);
-                       read_unlock(&ehca_cq_idr_lock);
-                       if (!eqe_cache[eqe_cnt].cq) {
-                               ehca_err(&shca->ib_device,
-                                        "Invalid eqe for non-existing cq "
-                                        "token=%x", token);
-                               continue;
-                       }
-               } else
-                       eqe_cache[eqe_cnt].cq = NULL;
-               eqe_cnt++;
-       } while (eqe_cnt < EHCA_EQE_CACHE_SIZE);
-       if (!eqe_cnt) {
-               if (is_irq)
-                       ehca_dbg(&shca->ib_device,
-                                "No eqe found for irq event");
-               goto unlock_irq_spinlock;
-       } else if (!is_irq) {
-               ret = hipz_h_eoi(eq->ist);
-               if (ret != H_SUCCESS)
-                       ehca_err(&shca->ib_device,
-                                "bad return code EOI -rc = %lld\n", ret);
-               ehca_dbg(&shca->ib_device, "deadman found %x eqe", eqe_cnt);
-       }
-       if (unlikely(eqe_cnt == EHCA_EQE_CACHE_SIZE))
-               ehca_dbg(&shca->ib_device, "too many eqes for one irq event");
-       /* enable irq for new packets */
-       for (i = 0; i < eqe_cnt; i++) {
-               if (eq->eqe_cache[i].cq)
-                       reset_eq_pending(eq->eqe_cache[i].cq);
-       }
-       /* check eq */
-       spin_lock(&eq->spinlock);
-       eq_empty = (!ipz_eqit_eq_peek_valid(&shca->eq.ipz_queue));
-       spin_unlock(&eq->spinlock);
-       /* call completion handler for cached eqes */
-       for (i = 0; i < eqe_cnt; i++)
-               if (eq->eqe_cache[i].cq) {
-                       if (ehca_scaling_code)
-                               queue_comp_task(eq->eqe_cache[i].cq);
-                       else {
-                               struct ehca_cq *cq = eq->eqe_cache[i].cq;
-                               comp_event_callback(cq);
-                               if (atomic_dec_and_test(&cq->nr_events))
-                                       wake_up(&cq->wait_completion);
-                       }
-               } else {
-                       ehca_dbg(&shca->ib_device, "Got non completion event");
-                       parse_identifier(shca, eq->eqe_cache[i].eqe->entry);
-               }
-       /* poll eq if not empty */
-       if (eq_empty)
-               goto unlock_irq_spinlock;
-       do {
-               struct ehca_eqe *eqe;
-               eqe = ehca_poll_eq(shca, &shca->eq);
-               if (!eqe)
-                       break;
-               process_eqe(shca, eqe);
-       } while (1);
-
-unlock_irq_spinlock:
-       spin_unlock(&eq->irq_spinlock);
-}
-
-void ehca_tasklet_eq(unsigned long data)
-{
-       ehca_process_eq((struct ehca_shca*)data, 1);
-}
-
-static int find_next_online_cpu(struct ehca_comp_pool *pool)
-{
-       int cpu;
-       unsigned long flags;
-
-       WARN_ON_ONCE(!in_interrupt());
-       if (ehca_debug_level >= 3)
-               ehca_dmp(cpu_online_mask, cpumask_size(), "");
-
-       spin_lock_irqsave(&pool->last_cpu_lock, flags);
-       do {
-               cpu = cpumask_next(pool->last_cpu, cpu_online_mask);
-               if (cpu >= nr_cpu_ids)
-                       cpu = cpumask_first(cpu_online_mask);
-               pool->last_cpu = cpu;
-       } while (!per_cpu_ptr(pool->cpu_comp_tasks, cpu)->active);
-       spin_unlock_irqrestore(&pool->last_cpu_lock, flags);
-
-       return cpu;
-}
-
-static void __queue_comp_task(struct ehca_cq *__cq,
-                             struct ehca_cpu_comp_task *cct,
-                             struct task_struct *thread)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&cct->task_lock, flags);
-       spin_lock(&__cq->task_lock);
-
-       if (__cq->nr_callbacks == 0) {
-               __cq->nr_callbacks++;
-               list_add_tail(&__cq->entry, &cct->cq_list);
-               cct->cq_jobs++;
-               wake_up_process(thread);
-       } else
-               __cq->nr_callbacks++;
-
-       spin_unlock(&__cq->task_lock);
-       spin_unlock_irqrestore(&cct->task_lock, flags);
-}
-
-static void queue_comp_task(struct ehca_cq *__cq)
-{
-       int cpu_id;
-       struct ehca_cpu_comp_task *cct;
-       struct task_struct *thread;
-       int cq_jobs;
-       unsigned long flags;
-
-       cpu_id = find_next_online_cpu(pool);
-       BUG_ON(!cpu_online(cpu_id));
-
-       cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu_id);
-       thread = *per_cpu_ptr(pool->cpu_comp_threads, cpu_id);
-       BUG_ON(!cct || !thread);
-
-       spin_lock_irqsave(&cct->task_lock, flags);
-       cq_jobs = cct->cq_jobs;
-       spin_unlock_irqrestore(&cct->task_lock, flags);
-       if (cq_jobs > 0) {
-               cpu_id = find_next_online_cpu(pool);
-               cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu_id);
-               thread = *per_cpu_ptr(pool->cpu_comp_threads, cpu_id);
-               BUG_ON(!cct || !thread);
-       }
-       __queue_comp_task(__cq, cct, thread);
-}
-
-static void run_comp_task(struct ehca_cpu_comp_task *cct)
-{
-       struct ehca_cq *cq;
-
-       while (!list_empty(&cct->cq_list)) {
-               cq = list_entry(cct->cq_list.next, struct ehca_cq, entry);
-               spin_unlock_irq(&cct->task_lock);
-
-               comp_event_callback(cq);
-               if (atomic_dec_and_test(&cq->nr_events))
-                       wake_up(&cq->wait_completion);
-
-               spin_lock_irq(&cct->task_lock);
-               spin_lock(&cq->task_lock);
-               cq->nr_callbacks--;
-               if (!cq->nr_callbacks) {
-                       list_del_init(cct->cq_list.next);
-                       cct->cq_jobs--;
-               }
-               spin_unlock(&cq->task_lock);
-       }
-}
-
-static void comp_task_park(unsigned int cpu)
-{
-       struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
-       struct ehca_cpu_comp_task *target;
-       struct task_struct *thread;
-       struct ehca_cq *cq, *tmp;
-       LIST_HEAD(list);
-
-       spin_lock_irq(&cct->task_lock);
-       cct->cq_jobs = 0;
-       cct->active = 0;
-       list_splice_init(&cct->cq_list, &list);
-       spin_unlock_irq(&cct->task_lock);
-
-       cpu = find_next_online_cpu(pool);
-       target = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
-       thread = *per_cpu_ptr(pool->cpu_comp_threads, cpu);
-       spin_lock_irq(&target->task_lock);
-       list_for_each_entry_safe(cq, tmp, &list, entry) {
-               list_del(&cq->entry);
-               __queue_comp_task(cq, target, thread);
-       }
-       spin_unlock_irq(&target->task_lock);
-}
-
-static void comp_task_stop(unsigned int cpu, bool online)
-{
-       struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
-
-       spin_lock_irq(&cct->task_lock);
-       cct->cq_jobs = 0;
-       cct->active = 0;
-       WARN_ON(!list_empty(&cct->cq_list));
-       spin_unlock_irq(&cct->task_lock);
-}
-
-static int comp_task_should_run(unsigned int cpu)
-{
-       struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
-
-       return cct->cq_jobs;
-}
-
-static void comp_task(unsigned int cpu)
-{
-       struct ehca_cpu_comp_task *cct = this_cpu_ptr(pool->cpu_comp_tasks);
-       int cql_empty;
-
-       spin_lock_irq(&cct->task_lock);
-       cql_empty = list_empty(&cct->cq_list);
-       if (!cql_empty) {
-               __set_current_state(TASK_RUNNING);
-               run_comp_task(cct);
-       }
-       spin_unlock_irq(&cct->task_lock);
-}
-
-static struct smp_hotplug_thread comp_pool_threads = {
-       .thread_should_run      = comp_task_should_run,
-       .thread_fn              = comp_task,
-       .thread_comm            = "ehca_comp/%u",
-       .cleanup                = comp_task_stop,
-       .park                   = comp_task_park,
-};
-
-int ehca_create_comp_pool(void)
-{
-       int cpu, ret = -ENOMEM;
-
-       if (!ehca_scaling_code)
-               return 0;
-
-       pool = kzalloc(sizeof(struct ehca_comp_pool), GFP_KERNEL);
-       if (pool == NULL)
-               return -ENOMEM;
-
-       spin_lock_init(&pool->last_cpu_lock);
-       pool->last_cpu = cpumask_any(cpu_online_mask);
-
-       pool->cpu_comp_tasks = alloc_percpu(struct ehca_cpu_comp_task);
-       if (!pool->cpu_comp_tasks)
-               goto out_pool;
-
-       pool->cpu_comp_threads = alloc_percpu(struct task_struct *);
-       if (!pool->cpu_comp_threads)
-               goto out_tasks;
-
-       for_each_present_cpu(cpu) {
-               struct ehca_cpu_comp_task *cct;
-
-               cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
-               spin_lock_init(&cct->task_lock);
-               INIT_LIST_HEAD(&cct->cq_list);
-       }
-
-       comp_pool_threads.store = pool->cpu_comp_threads;
-       ret = smpboot_register_percpu_thread(&comp_pool_threads);
-       if (ret)
-               goto out_threads;
-
-       pr_info("eHCA scaling code enabled\n");
-       return ret;
-
-out_threads:
-       free_percpu(pool->cpu_comp_threads);
-out_tasks:
-       free_percpu(pool->cpu_comp_tasks);
-out_pool:
-       kfree(pool);
-       return ret;
-}
-
-void ehca_destroy_comp_pool(void)
-{
-       if (!ehca_scaling_code)
-               return;
-
-       smpboot_unregister_percpu_thread(&comp_pool_threads);
-
-       free_percpu(pool->cpu_comp_threads);
-       free_percpu(pool->cpu_comp_tasks);
-       kfree(pool);
-}
diff --git a/drivers/infiniband/hw/ehca/ehca_irq.h b/drivers/infiniband/hw/ehca/ehca_irq.h
deleted file mode 100644 (file)
index 5370199..0000000
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  Function definitions and structs for EQs, NEQs and interrupts
- *
- *  Authors: Heiko J Schick <schickhj@de.ibm.com>
- *           Khadija Souissi <souissi@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __EHCA_IRQ_H
-#define __EHCA_IRQ_H
-
-
-struct ehca_shca;
-
-#include <linux/interrupt.h>
-#include <linux/types.h>
-
-int ehca_error_data(struct ehca_shca *shca, void *data, u64 resource);
-
-irqreturn_t ehca_interrupt_neq(int irq, void *dev_id);
-void ehca_tasklet_neq(unsigned long data);
-
-irqreturn_t ehca_interrupt_eq(int irq, void *dev_id);
-void ehca_tasklet_eq(unsigned long data);
-void ehca_process_eq(struct ehca_shca *shca, int is_irq);
-
-struct ehca_cpu_comp_task {
-       struct list_head cq_list;
-       spinlock_t task_lock;
-       int cq_jobs;
-       int active;
-};
-
-struct ehca_comp_pool {
-       struct ehca_cpu_comp_task __percpu *cpu_comp_tasks;
-       struct task_struct * __percpu *cpu_comp_threads;
-       int last_cpu;
-       spinlock_t last_cpu_lock;
-};
-
-int ehca_create_comp_pool(void);
-void ehca_destroy_comp_pool(void);
-
-#endif
diff --git a/drivers/infiniband/hw/ehca/ehca_iverbs.h b/drivers/infiniband/hw/ehca/ehca_iverbs.h
deleted file mode 100644 (file)
index 80e6a3d..0000000
+++ /dev/null
@@ -1,218 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  Function definitions for internal functions
- *
- *  Authors: Heiko J Schick <schickhj@de.ibm.com>
- *           Dietmar Decker <ddecker@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __EHCA_IVERBS_H__
-#define __EHCA_IVERBS_H__
-
-#include "ehca_classes.h"
-
-int ehca_query_device(struct ib_device *ibdev, struct ib_device_attr *props,
-                     struct ib_udata *uhw);
-
-int ehca_query_port(struct ib_device *ibdev, u8 port,
-                   struct ib_port_attr *props);
-
-enum rdma_protocol_type
-ehca_query_protocol(struct ib_device *device, u8 port_num);
-
-int ehca_query_sma_attr(struct ehca_shca *shca, u8 port,
-                       struct ehca_sma_attr *attr);
-
-int ehca_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 * pkey);
-
-int ehca_query_gid(struct ib_device *ibdev, u8 port, int index,
-                  union ib_gid *gid);
-
-int ehca_modify_port(struct ib_device *ibdev, u8 port, int port_modify_mask,
-                    struct ib_port_modify *props);
-
-struct ib_pd *ehca_alloc_pd(struct ib_device *device,
-                           struct ib_ucontext *context,
-                           struct ib_udata *udata);
-
-int ehca_dealloc_pd(struct ib_pd *pd);
-
-struct ib_ah *ehca_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr);
-
-int ehca_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr);
-
-int ehca_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr);
-
-int ehca_destroy_ah(struct ib_ah *ah);
-
-struct ib_mr *ehca_get_dma_mr(struct ib_pd *pd, int mr_access_flags);
-
-struct ib_mr *ehca_reg_phys_mr(struct ib_pd *pd,
-                              struct ib_phys_buf *phys_buf_array,
-                              int num_phys_buf,
-                              int mr_access_flags, u64 *iova_start);
-
-struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
-                              u64 virt, int mr_access_flags,
-                              struct ib_udata *udata);
-
-int ehca_rereg_phys_mr(struct ib_mr *mr,
-                      int mr_rereg_mask,
-                      struct ib_pd *pd,
-                      struct ib_phys_buf *phys_buf_array,
-                      int num_phys_buf, int mr_access_flags, u64 *iova_start);
-
-int ehca_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr);
-
-int ehca_dereg_mr(struct ib_mr *mr);
-
-struct ib_mw *ehca_alloc_mw(struct ib_pd *pd, enum ib_mw_type type);
-
-int ehca_bind_mw(struct ib_qp *qp, struct ib_mw *mw,
-                struct ib_mw_bind *mw_bind);
-
-int ehca_dealloc_mw(struct ib_mw *mw);
-
-struct ib_fmr *ehca_alloc_fmr(struct ib_pd *pd,
-                             int mr_access_flags,
-                             struct ib_fmr_attr *fmr_attr);
-
-int ehca_map_phys_fmr(struct ib_fmr *fmr,
-                     u64 *page_list, int list_len, u64 iova);
-
-int ehca_unmap_fmr(struct list_head *fmr_list);
-
-int ehca_dealloc_fmr(struct ib_fmr *fmr);
-
-enum ehca_eq_type {
-       EHCA_EQ = 0, /* Event Queue              */
-       EHCA_NEQ     /* Notification Event Queue */
-};
-
-int ehca_create_eq(struct ehca_shca *shca, struct ehca_eq *eq,
-                  enum ehca_eq_type type, const u32 length);
-
-int ehca_destroy_eq(struct ehca_shca *shca, struct ehca_eq *eq);
-
-void *ehca_poll_eq(struct ehca_shca *shca, struct ehca_eq *eq);
-
-
-struct ib_cq *ehca_create_cq(struct ib_device *device,
-                            const struct ib_cq_init_attr *attr,
-                            struct ib_ucontext *context,
-                            struct ib_udata *udata);
-
-int ehca_destroy_cq(struct ib_cq *cq);
-
-int ehca_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata);
-
-int ehca_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc);
-
-int ehca_peek_cq(struct ib_cq *cq, int wc_cnt);
-
-int ehca_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags notify_flags);
-
-struct ib_qp *ehca_create_qp(struct ib_pd *pd,
-                            struct ib_qp_init_attr *init_attr,
-                            struct ib_udata *udata);
-
-int ehca_destroy_qp(struct ib_qp *qp);
-
-int ehca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
-                  struct ib_udata *udata);
-
-int ehca_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr,
-                 int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr);
-
-int ehca_post_send(struct ib_qp *qp, struct ib_send_wr *send_wr,
-                  struct ib_send_wr **bad_send_wr);
-
-int ehca_post_recv(struct ib_qp *qp, struct ib_recv_wr *recv_wr,
-                  struct ib_recv_wr **bad_recv_wr);
-
-int ehca_post_srq_recv(struct ib_srq *srq,
-                      struct ib_recv_wr *recv_wr,
-                      struct ib_recv_wr **bad_recv_wr);
-
-struct ib_srq *ehca_create_srq(struct ib_pd *pd,
-                              struct ib_srq_init_attr *init_attr,
-                              struct ib_udata *udata);
-
-int ehca_modify_srq(struct ib_srq *srq, struct ib_srq_attr *attr,
-                   enum ib_srq_attr_mask attr_mask, struct ib_udata *udata);
-
-int ehca_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr);
-
-int ehca_destroy_srq(struct ib_srq *srq);
-
-u64 ehca_define_sqp(struct ehca_shca *shca, struct ehca_qp *ibqp,
-                   struct ib_qp_init_attr *qp_init_attr);
-
-int ehca_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid);
-
-int ehca_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid);
-
-struct ib_ucontext *ehca_alloc_ucontext(struct ib_device *device,
-                                       struct ib_udata *udata);
-
-int ehca_dealloc_ucontext(struct ib_ucontext *context);
-
-int ehca_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
-
-int ehca_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
-                    const struct ib_wc *in_wc, const struct ib_grh *in_grh,
-                    const struct ib_mad_hdr *in, size_t in_mad_size,
-                    struct ib_mad_hdr *out, size_t *out_mad_size,
-                    u16 *out_mad_pkey_index);
-
-void ehca_poll_eqs(unsigned long data);
-
-int ehca_calc_ipd(struct ehca_shca *shca, int port,
-                 enum ib_rate path_rate, u32 *ipd);
-
-void ehca_add_to_err_list(struct ehca_qp *qp, int on_sq);
-
-#ifdef CONFIG_PPC_64K_PAGES
-void *ehca_alloc_fw_ctrlblock(gfp_t flags);
-void ehca_free_fw_ctrlblock(void *ptr);
-#else
-#define ehca_alloc_fw_ctrlblock(flags) ((void *)get_zeroed_page(flags))
-#define ehca_free_fw_ctrlblock(ptr) free_page((unsigned long)(ptr))
-#endif
-
-void ehca_recover_sqp(struct ib_qp *sqp);
-
-#endif
diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c
deleted file mode 100644 (file)
index 8246418..0000000
--- a/drivers/infiniband/hw/ehca/ehca_main.c
+++ /dev/null
@@ -1,1123 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  module start stop, hca detection
- *
- *  Authors: Heiko J Schick <schickhj@de.ibm.com>
- *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *           Joachim Fenkes <fenkes@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifdef CONFIG_PPC_64K_PAGES
-#include <linux/slab.h>
-#endif
-
-#include <linux/notifier.h>
-#include <linux/memory.h>
-#include <rdma/ib_mad.h>
-#include "ehca_classes.h"
-#include "ehca_iverbs.h"
-#include "ehca_mrmw.h"
-#include "ehca_tools.h"
-#include "hcp_if.h"
-
-#define HCAD_VERSION "0029"
-
-MODULE_LICENSE("Dual BSD/GPL");
-MODULE_AUTHOR("Christoph Raisch <raisch@de.ibm.com>");
-MODULE_DESCRIPTION("IBM eServer HCA InfiniBand Device Driver");
-MODULE_VERSION(HCAD_VERSION);
-
-static bool ehca_open_aqp1    = 0;
-static int ehca_hw_level      = 0;
-static bool ehca_poll_all_eqs = 1;
-
-int ehca_debug_level   = 0;
-int ehca_nr_ports      = -1;
-bool ehca_use_hp_mr    = 0;
-int ehca_port_act_time = 30;
-int ehca_static_rate   = -1;
-bool ehca_scaling_code = 0;
-int ehca_lock_hcalls   = -1;
-int ehca_max_cq        = -1;
-int ehca_max_qp        = -1;
-
-module_param_named(open_aqp1,     ehca_open_aqp1,     bool, S_IRUGO);
-module_param_named(debug_level,   ehca_debug_level,   int,  S_IRUGO);
-module_param_named(hw_level,      ehca_hw_level,      int,  S_IRUGO);
-module_param_named(nr_ports,      ehca_nr_ports,      int,  S_IRUGO);
-module_param_named(use_hp_mr,     ehca_use_hp_mr,     bool, S_IRUGO);
-module_param_named(port_act_time, ehca_port_act_time, int,  S_IRUGO);
-module_param_named(poll_all_eqs,  ehca_poll_all_eqs,  bool, S_IRUGO);
-module_param_named(static_rate,   ehca_static_rate,   int,  S_IRUGO);
-module_param_named(scaling_code,  ehca_scaling_code,  bool, S_IRUGO);
-module_param_named(lock_hcalls,   ehca_lock_hcalls,   bint, S_IRUGO);
-module_param_named(number_of_cqs, ehca_max_cq,        int,  S_IRUGO);
-module_param_named(number_of_qps, ehca_max_qp,        int,  S_IRUGO);
-
-MODULE_PARM_DESC(open_aqp1,
-                "Open AQP1 on startup (default: no)");
-MODULE_PARM_DESC(debug_level,
-                "Amount of debug output (0: none (default), 1: traces, "
-                "2: some dumps, 3: lots)");
-MODULE_PARM_DESC(hw_level,
-                "Hardware level (0: autosensing (default), "
-                "0x10..0x14: eHCA, 0x20..0x23: eHCA2)");
-MODULE_PARM_DESC(nr_ports,
-                "number of connected ports (-1: autodetect (default), "
-                "1: port one only, 2: two ports)");
-MODULE_PARM_DESC(use_hp_mr,
-                "Use high performance MRs (default: no)");
-MODULE_PARM_DESC(port_act_time,
-                "Time to wait for port activation (default: 30 sec)");
-MODULE_PARM_DESC(poll_all_eqs,
-                "Poll all event queues periodically (default: yes)");
-MODULE_PARM_DESC(static_rate,
-                "Set permanent static rate (default: no static rate)");
-MODULE_PARM_DESC(scaling_code,
-                "Enable scaling code (default: no)");
-MODULE_PARM_DESC(lock_hcalls,
-                "Serialize all hCalls made by the driver "
-                "(default: autodetect)");
-MODULE_PARM_DESC(number_of_cqs,
-               "Max number of CQs which can be allocated "
-               "(default: autodetect)");
-MODULE_PARM_DESC(number_of_qps,
-               "Max number of QPs which can be allocated "
-               "(default: autodetect)");
-
-DEFINE_RWLOCK(ehca_qp_idr_lock);
-DEFINE_RWLOCK(ehca_cq_idr_lock);
-DEFINE_IDR(ehca_qp_idr);
-DEFINE_IDR(ehca_cq_idr);
-
-static LIST_HEAD(shca_list); /* list of all registered ehcas */
-DEFINE_SPINLOCK(shca_list_lock);
-
-static struct timer_list poll_eqs_timer;
-
-#ifdef CONFIG_PPC_64K_PAGES
-static struct kmem_cache *ctblk_cache;
-
-void *ehca_alloc_fw_ctrlblock(gfp_t flags)
-{
-       void *ret = kmem_cache_zalloc(ctblk_cache, flags);
-       if (!ret)
-               ehca_gen_err("Out of memory for ctblk");
-       return ret;
-}
-
-void ehca_free_fw_ctrlblock(void *ptr)
-{
-       if (ptr)
-               kmem_cache_free(ctblk_cache, ptr);
-
-}
-#endif
-
-int ehca2ib_return_code(u64 ehca_rc)
-{
-       switch (ehca_rc) {
-       case H_SUCCESS:
-               return 0;
-       case H_RESOURCE:             /* Resource in use */
-       case H_BUSY:
-               return -EBUSY;
-       case H_NOT_ENOUGH_RESOURCES: /* insufficient resources */
-       case H_CONSTRAINED:          /* resource constraint */
-       case H_NO_MEM:
-               return -ENOMEM;
-       default:
-               return -EINVAL;
-       }
-}
-
-static int ehca_create_slab_caches(void)
-{
-       int ret;
-
-       ret = ehca_init_pd_cache();
-       if (ret) {
-               ehca_gen_err("Cannot create PD SLAB cache.");
-               return ret;
-       }
-
-       ret = ehca_init_cq_cache();
-       if (ret) {
-               ehca_gen_err("Cannot create CQ SLAB cache.");
-               goto create_slab_caches2;
-       }
-
-       ret = ehca_init_qp_cache();
-       if (ret) {
-               ehca_gen_err("Cannot create QP SLAB cache.");
-               goto create_slab_caches3;
-       }
-
-       ret = ehca_init_av_cache();
-       if (ret) {
-               ehca_gen_err("Cannot create AV SLAB cache.");
-               goto create_slab_caches4;
-       }
-
-       ret = ehca_init_mrmw_cache();
-       if (ret) {
-               ehca_gen_err("Cannot create MR&MW SLAB cache.");
-               goto create_slab_caches5;
-       }
-
-       ret = ehca_init_small_qp_cache();
-       if (ret) {
-               ehca_gen_err("Cannot create small queue SLAB cache.");
-               goto create_slab_caches6;
-       }
-
-#ifdef CONFIG_PPC_64K_PAGES
-       ctblk_cache = kmem_cache_create("ehca_cache_ctblk",
-                                       EHCA_PAGESIZE, H_CB_ALIGNMENT,
-                                       SLAB_HWCACHE_ALIGN,
-                                       NULL);
-       if (!ctblk_cache) {
-               ehca_gen_err("Cannot create ctblk SLAB cache.");
-               ehca_cleanup_small_qp_cache();
-               ret = -ENOMEM;
-               goto create_slab_caches6;
-       }
-#endif
-       return 0;
-
-create_slab_caches6:
-       ehca_cleanup_mrmw_cache();
-
-create_slab_caches5:
-       ehca_cleanup_av_cache();
-
-create_slab_caches4:
-       ehca_cleanup_qp_cache();
-
-create_slab_caches3:
-       ehca_cleanup_cq_cache();
-
-create_slab_caches2:
-       ehca_cleanup_pd_cache();
-
-       return ret;
-}
-
-static void ehca_destroy_slab_caches(void)
-{
-       ehca_cleanup_small_qp_cache();
-       ehca_cleanup_mrmw_cache();
-       ehca_cleanup_av_cache();
-       ehca_cleanup_qp_cache();
-       ehca_cleanup_cq_cache();
-       ehca_cleanup_pd_cache();
-#ifdef CONFIG_PPC_64K_PAGES
-       if (ctblk_cache)
-               kmem_cache_destroy(ctblk_cache);
-#endif
-}
-
-#define EHCA_HCAAVER  EHCA_BMASK_IBM(32, 39)
-#define EHCA_REVID    EHCA_BMASK_IBM(40, 63)
-
-static struct cap_descr {
-       u64 mask;
-       char *descr;
-} hca_cap_descr[] = {
-       { HCA_CAP_AH_PORT_NR_CHECK, "HCA_CAP_AH_PORT_NR_CHECK" },
-       { HCA_CAP_ATOMIC, "HCA_CAP_ATOMIC" },
-       { HCA_CAP_AUTO_PATH_MIG, "HCA_CAP_AUTO_PATH_MIG" },
-       { HCA_CAP_BAD_P_KEY_CTR, "HCA_CAP_BAD_P_KEY_CTR" },
-       { HCA_CAP_SQD_RTS_PORT_CHANGE, "HCA_CAP_SQD_RTS_PORT_CHANGE" },
-       { HCA_CAP_CUR_QP_STATE_MOD, "HCA_CAP_CUR_QP_STATE_MOD" },
-       { HCA_CAP_INIT_TYPE, "HCA_CAP_INIT_TYPE" },
-       { HCA_CAP_PORT_ACTIVE_EVENT, "HCA_CAP_PORT_ACTIVE_EVENT" },
-       { HCA_CAP_Q_KEY_VIOL_CTR, "HCA_CAP_Q_KEY_VIOL_CTR" },
-       { HCA_CAP_WQE_RESIZE, "HCA_CAP_WQE_RESIZE" },
-       { HCA_CAP_RAW_PACKET_MCAST, "HCA_CAP_RAW_PACKET_MCAST" },
-       { HCA_CAP_SHUTDOWN_PORT, "HCA_CAP_SHUTDOWN_PORT" },
-       { HCA_CAP_RC_LL_QP, "HCA_CAP_RC_LL_QP" },
-       { HCA_CAP_SRQ, "HCA_CAP_SRQ" },
-       { HCA_CAP_UD_LL_QP, "HCA_CAP_UD_LL_QP" },
-       { HCA_CAP_RESIZE_MR, "HCA_CAP_RESIZE_MR" },
-       { HCA_CAP_MINI_QP, "HCA_CAP_MINI_QP" },
-       { HCA_CAP_H_ALLOC_RES_SYNC, "HCA_CAP_H_ALLOC_RES_SYNC" },
-};
-
-static int ehca_sense_attributes(struct ehca_shca *shca)
-{
-       int i, ret = 0;
-       u64 h_ret;
-       struct hipz_query_hca *rblock;
-       struct hipz_query_port *port;
-       const char *loc_code;
-
-       static const u32 pgsize_map[] = {
-               HCA_CAP_MR_PGSIZE_4K,  0x1000,
-               HCA_CAP_MR_PGSIZE_64K, 0x10000,
-               HCA_CAP_MR_PGSIZE_1M,  0x100000,
-               HCA_CAP_MR_PGSIZE_16M, 0x1000000,
-       };
-
-       ehca_gen_dbg("Probing adapter %s...",
-                    shca->ofdev->dev.of_node->full_name);
-       loc_code = of_get_property(shca->ofdev->dev.of_node, "ibm,loc-code",
-                                  NULL);
-       if (loc_code)
-               ehca_gen_dbg(" ... location code=%s", loc_code);
-
-       rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
-       if (!rblock) {
-               ehca_gen_err("Cannot allocate rblock memory.");
-               return -ENOMEM;
-       }
-
-       h_ret = hipz_h_query_hca(shca->ipz_hca_handle, rblock);
-       if (h_ret != H_SUCCESS) {
-               ehca_gen_err("Cannot query device properties. h_ret=%lli",
-                            h_ret);
-               ret = -EPERM;
-               goto sense_attributes1;
-       }
-
-       if (ehca_nr_ports == 1)
-               shca->num_ports = 1;
-       else
-               shca->num_ports = (u8)rblock->num_ports;
-
-       ehca_gen_dbg(" ... found %x ports", rblock->num_ports);
-
-       if (ehca_hw_level == 0) {
-               u32 hcaaver;
-               u32 revid;
-
-               hcaaver = EHCA_BMASK_GET(EHCA_HCAAVER, rblock->hw_ver);
-               revid   = EHCA_BMASK_GET(EHCA_REVID, rblock->hw_ver);
-
-               ehca_gen_dbg(" ... hardware version=%x:%x", hcaaver, revid);
-
-               if (hcaaver == 1) {
-                       if (revid <= 3)
-                               shca->hw_level = 0x10 | (revid + 1);
-                       else
-                               shca->hw_level = 0x14;
-               } else if (hcaaver == 2) {
-                       if (revid == 0)
-                               shca->hw_level = 0x21;
-                       else if (revid == 0x10)
-                               shca->hw_level = 0x22;
-                       else if (revid == 0x20 || revid == 0x21)
-                               shca->hw_level = 0x23;
-               }
-
-               if (!shca->hw_level) {
-                       ehca_gen_warn("unknown hardware version"
-                                     " - assuming default level");
-                       shca->hw_level = 0x22;
-               }
-       } else
-               shca->hw_level = ehca_hw_level;
-       ehca_gen_dbg(" ... hardware level=%x", shca->hw_level);
-
-       shca->hca_cap = rblock->hca_cap_indicators;
-       ehca_gen_dbg(" ... HCA capabilities:");
-       for (i = 0; i < ARRAY_SIZE(hca_cap_descr); i++)
-               if (EHCA_BMASK_GET(hca_cap_descr[i].mask, shca->hca_cap))
-                       ehca_gen_dbg("   %s", hca_cap_descr[i].descr);
-
-       /* Autodetect hCall locking -- the "H_ALLOC_RESOURCE synced" flag is
-        * a firmware property, so it's valid across all adapters
-        */
-       if (ehca_lock_hcalls == -1)
-               ehca_lock_hcalls = !EHCA_BMASK_GET(HCA_CAP_H_ALLOC_RES_SYNC,
-                                       shca->hca_cap);
-
-       /* translate supported MR page sizes; always support 4K */
-       shca->hca_cap_mr_pgsize = EHCA_PAGESIZE;
-       for (i = 0; i < ARRAY_SIZE(pgsize_map); i += 2)
-               if (rblock->memory_page_size_supported & pgsize_map[i])
-                       shca->hca_cap_mr_pgsize |= pgsize_map[i + 1];
-
-       /* Set maximum number of CQs and QPs to calculate EQ size */
-       if (shca->max_num_qps == -1)
-               shca->max_num_qps = min_t(int, rblock->max_qp,
-                                         EHCA_MAX_NUM_QUEUES);
-       else if (shca->max_num_qps < 1 || shca->max_num_qps > rblock->max_qp) {
-               ehca_gen_warn("The requested number of QPs is out of range "
-                             "(1 - %i) specified by HW. Value is set to %i",
-                             rblock->max_qp, rblock->max_qp);
-               shca->max_num_qps = rblock->max_qp;
-       }
-
-       if (shca->max_num_cqs == -1)
-               shca->max_num_cqs = min_t(int, rblock->max_cq,
-                                         EHCA_MAX_NUM_QUEUES);
-       else if (shca->max_num_cqs < 1 || shca->max_num_cqs > rblock->max_cq) {
-               ehca_gen_warn("The requested number of CQs is out of range "
-                             "(1 - %i) specified by HW. Value is set to %i",
-                             rblock->max_cq, rblock->max_cq);
-               shca->max_num_cqs = rblock->max_cq;
-       }
-
-       /* query max MTU from first port -- it's the same for all ports */
-       port = (struct hipz_query_port *)rblock;
-       h_ret = hipz_h_query_port(shca->ipz_hca_handle, 1, port);
-       if (h_ret != H_SUCCESS) {
-               ehca_gen_err("Cannot query port properties. h_ret=%lli",
-                            h_ret);
-               ret = -EPERM;
-               goto sense_attributes1;
-       }
-
-       shca->max_mtu = port->max_mtu;
-
-sense_attributes1:
-       ehca_free_fw_ctrlblock(rblock);
-       return ret;
-}
-
-static int init_node_guid(struct ehca_shca *shca)
-{
-       int ret = 0;
-       struct hipz_query_hca *rblock;
-
-       rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
-       if (!rblock) {
-               ehca_err(&shca->ib_device, "Can't allocate rblock memory.");
-               return -ENOMEM;
-       }
-
-       if (hipz_h_query_hca(shca->ipz_hca_handle, rblock) != H_SUCCESS) {
-               ehca_err(&shca->ib_device, "Can't query device properties");
-               ret = -EINVAL;
-               goto init_node_guid1;
-       }
-
-       memcpy(&shca->ib_device.node_guid, &rblock->node_guid, sizeof(u64));
-
-init_node_guid1:
-       ehca_free_fw_ctrlblock(rblock);
-       return ret;
-}
-
-static int ehca_port_immutable(struct ib_device *ibdev, u8 port_num,
-                              struct ib_port_immutable *immutable)
-{
-       struct ib_port_attr attr;
-       int err;
-
-       err = ehca_query_port(ibdev, port_num, &attr);
-       if (err)
-               return err;
-
-       immutable->pkey_tbl_len = attr.pkey_tbl_len;
-       immutable->gid_tbl_len = attr.gid_tbl_len;
-       immutable->core_cap_flags = RDMA_CORE_PORT_IBA_IB;
-       immutable->max_mad_size = IB_MGMT_MAD_SIZE;
-
-       return 0;
-}
-
-static int ehca_init_device(struct ehca_shca *shca)
-{
-       int ret;
-
-       ret = init_node_guid(shca);
-       if (ret)
-               return ret;
-
-       strlcpy(shca->ib_device.name, "ehca%d", IB_DEVICE_NAME_MAX);
-       shca->ib_device.owner               = THIS_MODULE;
-
-       shca->ib_device.uverbs_abi_ver      = 8;
-       shca->ib_device.uverbs_cmd_mask     =
-               (1ull << IB_USER_VERBS_CMD_GET_CONTEXT)         |
-               (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)        |
-               (1ull << IB_USER_VERBS_CMD_QUERY_PORT)          |
-               (1ull << IB_USER_VERBS_CMD_ALLOC_PD)            |
-               (1ull << IB_USER_VERBS_CMD_DEALLOC_PD)          |
-               (1ull << IB_USER_VERBS_CMD_REG_MR)              |
-               (1ull << IB_USER_VERBS_CMD_DEREG_MR)            |
-               (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
-               (1ull << IB_USER_VERBS_CMD_CREATE_CQ)           |
-               (1ull << IB_USER_VERBS_CMD_DESTROY_CQ)          |
-               (1ull << IB_USER_VERBS_CMD_CREATE_QP)           |
-               (1ull << IB_USER_VERBS_CMD_MODIFY_QP)           |
-               (1ull << IB_USER_VERBS_CMD_QUERY_QP)            |
-               (1ull << IB_USER_VERBS_CMD_DESTROY_QP)          |
-               (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST)        |
-               (1ull << IB_USER_VERBS_CMD_DETACH_MCAST);
-
-       shca->ib_device.node_type           = RDMA_NODE_IB_CA;
-       shca->ib_device.phys_port_cnt       = shca->num_ports;
-       shca->ib_device.num_comp_vectors    = 1;
-       shca->ib_device.dma_device          = &shca->ofdev->dev;
-       shca->ib_device.query_device        = ehca_query_device;
-       shca->ib_device.query_port          = ehca_query_port;
-       shca->ib_device.query_gid           = ehca_query_gid;
-       shca->ib_device.query_pkey          = ehca_query_pkey;
-       /* shca->ib_device.modify_device    = ehca_modify_device    */
-       shca->ib_device.modify_port         = ehca_modify_port;
-       shca->ib_device.alloc_ucontext      = ehca_alloc_ucontext;
-       shca->ib_device.dealloc_ucontext    = ehca_dealloc_ucontext;
-       shca->ib_device.alloc_pd            = ehca_alloc_pd;
-       shca->ib_device.dealloc_pd          = ehca_dealloc_pd;
-       shca->ib_device.create_ah           = ehca_create_ah;
-       /* shca->ib_device.modify_ah        = ehca_modify_ah;       */
-       shca->ib_device.query_ah            = ehca_query_ah;
-       shca->ib_device.destroy_ah          = ehca_destroy_ah;
-       shca->ib_device.create_qp           = ehca_create_qp;
-       shca->ib_device.modify_qp           = ehca_modify_qp;
-       shca->ib_device.query_qp            = ehca_query_qp;
-       shca->ib_device.destroy_qp          = ehca_destroy_qp;
-       shca->ib_device.post_send           = ehca_post_send;
-       shca->ib_device.post_recv           = ehca_post_recv;
-       shca->ib_device.create_cq           = ehca_create_cq;
-       shca->ib_device.destroy_cq          = ehca_destroy_cq;
-       shca->ib_device.resize_cq           = ehca_resize_cq;
-       shca->ib_device.poll_cq             = ehca_poll_cq;
-       /* shca->ib_device.peek_cq          = ehca_peek_cq;         */
-       shca->ib_device.req_notify_cq       = ehca_req_notify_cq;
-       /* shca->ib_device.req_ncomp_notif  = ehca_req_ncomp_notif; */
-       shca->ib_device.get_dma_mr          = ehca_get_dma_mr;
-       shca->ib_device.reg_phys_mr         = ehca_reg_phys_mr;
-       shca->ib_device.reg_user_mr         = ehca_reg_user_mr;
-       shca->ib_device.query_mr            = ehca_query_mr;
-       shca->ib_device.dereg_mr            = ehca_dereg_mr;
-       shca->ib_device.rereg_phys_mr       = ehca_rereg_phys_mr;
-       shca->ib_device.alloc_mw            = ehca_alloc_mw;
-       shca->ib_device.bind_mw             = ehca_bind_mw;
-       shca->ib_device.dealloc_mw          = ehca_dealloc_mw;
-       shca->ib_device.alloc_fmr           = ehca_alloc_fmr;
-       shca->ib_device.map_phys_fmr        = ehca_map_phys_fmr;
-       shca->ib_device.unmap_fmr           = ehca_unmap_fmr;
-       shca->ib_device.dealloc_fmr         = ehca_dealloc_fmr;
-       shca->ib_device.attach_mcast        = ehca_attach_mcast;
-       shca->ib_device.detach_mcast        = ehca_detach_mcast;
-       shca->ib_device.process_mad         = ehca_process_mad;
-       shca->ib_device.mmap                = ehca_mmap;
-       shca->ib_device.dma_ops             = &ehca_dma_mapping_ops;
-       shca->ib_device.get_port_immutable  = ehca_port_immutable;
-
-       if (EHCA_BMASK_GET(HCA_CAP_SRQ, shca->hca_cap)) {
-               shca->ib_device.uverbs_cmd_mask |=
-                       (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) |
-                       (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) |
-                       (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) |
-                       (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ);
-
-               shca->ib_device.create_srq          = ehca_create_srq;
-               shca->ib_device.modify_srq          = ehca_modify_srq;
-               shca->ib_device.query_srq           = ehca_query_srq;
-               shca->ib_device.destroy_srq         = ehca_destroy_srq;
-               shca->ib_device.post_srq_recv       = ehca_post_srq_recv;
-       }
-
-       return ret;
-}
-
-static int ehca_create_aqp1(struct ehca_shca *shca, u32 port)
-{
-       struct ehca_sport *sport = &shca->sport[port - 1];
-       struct ib_cq *ibcq;
-       struct ib_qp *ibqp;
-       struct ib_qp_init_attr qp_init_attr;
-       struct ib_cq_init_attr cq_attr = {};
-       int ret;
-
-       if (sport->ibcq_aqp1) {
-               ehca_err(&shca->ib_device, "AQP1 CQ is already created.");
-               return -EPERM;
-       }
-
-       cq_attr.cqe = 10;
-       ibcq = ib_create_cq(&shca->ib_device, NULL, NULL, (void *)(-1),
-                           &cq_attr);
-       if (IS_ERR(ibcq)) {
-               ehca_err(&shca->ib_device, "Cannot create AQP1 CQ.");
-               return PTR_ERR(ibcq);
-       }
-       sport->ibcq_aqp1 = ibcq;
-
-       if (sport->ibqp_sqp[IB_QPT_GSI]) {
-               ehca_err(&shca->ib_device, "AQP1 QP is already created.");
-               ret = -EPERM;
-               goto create_aqp1;
-       }
-
-       memset(&qp_init_attr, 0, sizeof(struct ib_qp_init_attr));
-       qp_init_attr.send_cq          = ibcq;
-       qp_init_attr.recv_cq          = ibcq;
-       qp_init_attr.sq_sig_type      = IB_SIGNAL_ALL_WR;
-       qp_init_attr.cap.max_send_wr  = 100;
-       qp_init_attr.cap.max_recv_wr  = 100;
-       qp_init_attr.cap.max_send_sge = 2;
-       qp_init_attr.cap.max_recv_sge = 1;
-       qp_init_attr.qp_type          = IB_QPT_GSI;
-       qp_init_attr.port_num         = port;
-       qp_init_attr.qp_context       = NULL;
-       qp_init_attr.event_handler    = NULL;
-       qp_init_attr.srq              = NULL;
-
-       ibqp = ib_create_qp(&shca->pd->ib_pd, &qp_init_attr);
-       if (IS_ERR(ibqp)) {
-               ehca_err(&shca->ib_device, "Cannot create AQP1 QP.");
-               ret = PTR_ERR(ibqp);
-               goto create_aqp1;
-       }
-       sport->ibqp_sqp[IB_QPT_GSI] = ibqp;
-
-       return 0;
-
-create_aqp1:
-       ib_destroy_cq(sport->ibcq_aqp1);
-       return ret;
-}
-
-static int ehca_destroy_aqp1(struct ehca_sport *sport)
-{
-       int ret;
-
-       ret = ib_destroy_qp(sport->ibqp_sqp[IB_QPT_GSI]);
-       if (ret) {
-               ehca_gen_err("Cannot destroy AQP1 QP. ret=%i", ret);
-               return ret;
-       }
-
-       ret = ib_destroy_cq(sport->ibcq_aqp1);
-       if (ret)
-               ehca_gen_err("Cannot destroy AQP1 CQ. ret=%i", ret);
-
-       return ret;
-}
-
-static ssize_t ehca_show_debug_level(struct device_driver *ddp, char *buf)
-{
-       return snprintf(buf, PAGE_SIZE, "%d\n", ehca_debug_level);
-}
-
-static ssize_t ehca_store_debug_level(struct device_driver *ddp,
-                                     const char *buf, size_t count)
-{
-       int value = (*buf) - '0';
-       if (value >= 0 && value <= 9)
-               ehca_debug_level = value;
-       return 1;
-}
-
-static DRIVER_ATTR(debug_level, S_IRUSR | S_IWUSR,
-                  ehca_show_debug_level, ehca_store_debug_level);
-
-static struct attribute *ehca_drv_attrs[] = {
-       &driver_attr_debug_level.attr,
-       NULL
-};
-
-static struct attribute_group ehca_drv_attr_grp = {
-       .attrs = ehca_drv_attrs
-};
-
-static const struct attribute_group *ehca_drv_attr_groups[] = {
-       &ehca_drv_attr_grp,
-       NULL,
-};
-
-#define EHCA_RESOURCE_ATTR(name)                                           \
-static ssize_t  ehca_show_##name(struct device *dev,                       \
-                                struct device_attribute *attr,            \
-                                char *buf)                                \
-{                                                                         \
-       struct ehca_shca *shca;                                            \
-       struct hipz_query_hca *rblock;                                     \
-       int data;                                                          \
-                                                                          \
-       shca = dev_get_drvdata(dev);                                       \
-                                                                          \
-       rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);                      \
-       if (!rblock) {                                                     \
-               dev_err(dev, "Can't allocate rblock memory.\n");           \
-               return 0;                                                  \
-       }                                                                  \
-                                                                          \
-       if (hipz_h_query_hca(shca->ipz_hca_handle, rblock) != H_SUCCESS) { \
-               dev_err(dev, "Can't query device properties\n");           \
-               ehca_free_fw_ctrlblock(rblock);                            \
-               return 0;                                                  \
-       }                                                                  \
-                                                                          \
-       data = rblock->name;                                               \
-       ehca_free_fw_ctrlblock(rblock);                                    \
-                                                                          \
-       if ((strcmp(#name, "num_ports") == 0) && (ehca_nr_ports == 1))     \
-               return snprintf(buf, 256, "1\n");                          \
-       else                                                               \
-               return snprintf(buf, 256, "%d\n", data);                   \
-                                                                          \
-}                                                                         \
-static DEVICE_ATTR(name, S_IRUGO, ehca_show_##name, NULL);
-
-EHCA_RESOURCE_ATTR(num_ports);
-EHCA_RESOURCE_ATTR(hw_ver);
-EHCA_RESOURCE_ATTR(max_eq);
-EHCA_RESOURCE_ATTR(cur_eq);
-EHCA_RESOURCE_ATTR(max_cq);
-EHCA_RESOURCE_ATTR(cur_cq);
-EHCA_RESOURCE_ATTR(max_qp);
-EHCA_RESOURCE_ATTR(cur_qp);
-EHCA_RESOURCE_ATTR(max_mr);
-EHCA_RESOURCE_ATTR(cur_mr);
-EHCA_RESOURCE_ATTR(max_mw);
-EHCA_RESOURCE_ATTR(cur_mw);
-EHCA_RESOURCE_ATTR(max_pd);
-EHCA_RESOURCE_ATTR(max_ah);
-
-static ssize_t ehca_show_adapter_handle(struct device *dev,
-                                       struct device_attribute *attr,
-                                       char *buf)
-{
-       struct ehca_shca *shca = dev_get_drvdata(dev);
-
-       return sprintf(buf, "%llx\n", shca->ipz_hca_handle.handle);
-
-}
-static DEVICE_ATTR(adapter_handle, S_IRUGO, ehca_show_adapter_handle, NULL);
-
-static struct attribute *ehca_dev_attrs[] = {
-       &dev_attr_adapter_handle.attr,
-       &dev_attr_num_ports.attr,
-       &dev_attr_hw_ver.attr,
-       &dev_attr_max_eq.attr,
-       &dev_attr_cur_eq.attr,
-       &dev_attr_max_cq.attr,
-       &dev_attr_cur_cq.attr,
-       &dev_attr_max_qp.attr,
-       &dev_attr_cur_qp.attr,
-       &dev_attr_max_mr.attr,
-       &dev_attr_cur_mr.attr,
-       &dev_attr_max_mw.attr,
-       &dev_attr_cur_mw.attr,
-       &dev_attr_max_pd.attr,
-       &dev_attr_max_ah.attr,
-       NULL
-};
-
-static struct attribute_group ehca_dev_attr_grp = {
-       .attrs = ehca_dev_attrs
-};
-
-static int ehca_probe(struct platform_device *dev)
-{
-       struct ehca_shca *shca;
-       const u64 *handle;
-       struct ib_pd *ibpd;
-       int ret, i, eq_size;
-       unsigned long flags;
-
-       handle = of_get_property(dev->dev.of_node, "ibm,hca-handle", NULL);
-       if (!handle) {
-               ehca_gen_err("Cannot get eHCA handle for adapter: %s.",
-                            dev->dev.of_node->full_name);
-               return -ENODEV;
-       }
-
-       if (!(*handle)) {
-               ehca_gen_err("Wrong eHCA handle for adapter: %s.",
-                            dev->dev.of_node->full_name);
-               return -ENODEV;
-       }
-
-       shca = (struct ehca_shca *)ib_alloc_device(sizeof(*shca));
-       if (!shca) {
-               ehca_gen_err("Cannot allocate shca memory.");
-               return -ENOMEM;
-       }
-
-       mutex_init(&shca->modify_mutex);
-       atomic_set(&shca->num_cqs, 0);
-       atomic_set(&shca->num_qps, 0);
-       shca->max_num_qps = ehca_max_qp;
-       shca->max_num_cqs = ehca_max_cq;
-
-       for (i = 0; i < ARRAY_SIZE(shca->sport); i++)
-               spin_lock_init(&shca->sport[i].mod_sqp_lock);
-
-       shca->ofdev = dev;
-       shca->ipz_hca_handle.handle = *handle;
-       dev_set_drvdata(&dev->dev, shca);
-
-       ret = ehca_sense_attributes(shca);
-       if (ret < 0) {
-               ehca_gen_err("Cannot sense eHCA attributes.");
-               goto probe1;
-       }
-
-       ret = ehca_init_device(shca);
-       if (ret) {
-               ehca_gen_err("Cannot init ehca device struct");
-               goto probe1;
-       }
-
-       eq_size = 2 * shca->max_num_cqs + 4 * shca->max_num_qps;
-       /* create event queues */
-       ret = ehca_create_eq(shca, &shca->eq, EHCA_EQ, eq_size);
-       if (ret) {
-               ehca_err(&shca->ib_device, "Cannot create EQ.");
-               goto probe1;
-       }
-
-       ret = ehca_create_eq(shca, &shca->neq, EHCA_NEQ, 513);
-       if (ret) {
-               ehca_err(&shca->ib_device, "Cannot create NEQ.");
-               goto probe3;
-       }
-
-       /* create internal protection domain */
-       ibpd = ehca_alloc_pd(&shca->ib_device, (void *)(-1), NULL);
-       if (IS_ERR(ibpd)) {
-               ehca_err(&shca->ib_device, "Cannot create internal PD.");
-               ret = PTR_ERR(ibpd);
-               goto probe4;
-       }
-
-       shca->pd = container_of(ibpd, struct ehca_pd, ib_pd);
-       shca->pd->ib_pd.device = &shca->ib_device;
-
-       /* create internal max MR */
-       ret = ehca_reg_internal_maxmr(shca, shca->pd, &shca->maxmr);
-
-       if (ret) {
-               ehca_err(&shca->ib_device, "Cannot create internal MR ret=%i",
-                        ret);
-               goto probe5;
-       }
-
-       ret = ib_register_device(&shca->ib_device, NULL);
-       if (ret) {
-               ehca_err(&shca->ib_device,
-                        "ib_register_device() failed ret=%i", ret);
-               goto probe6;
-       }
-
-       /* create AQP1 for port 1 */
-       if (ehca_open_aqp1 == 1) {
-               shca->sport[0].port_state = IB_PORT_DOWN;
-               ret = ehca_create_aqp1(shca, 1);
-               if (ret) {
-                       ehca_err(&shca->ib_device,
-                                "Cannot create AQP1 for port 1.");
-                       goto probe7;
-               }
-       }
-
-       /* create AQP1 for port 2 */
-       if ((ehca_open_aqp1 == 1) && (shca->num_ports == 2)) {
-               shca->sport[1].port_state = IB_PORT_DOWN;
-               ret = ehca_create_aqp1(shca, 2);
-               if (ret) {
-                       ehca_err(&shca->ib_device,
-                                "Cannot create AQP1 for port 2.");
-                       goto probe8;
-               }
-       }
-
-       ret = sysfs_create_group(&dev->dev.kobj, &ehca_dev_attr_grp);
-       if (ret) /* only complain; we can live without attributes */
-               ehca_err(&shca->ib_device,
-                        "Cannot create device attributes ret=%d", ret);
-
-       spin_lock_irqsave(&shca_list_lock, flags);
-       list_add(&shca->shca_list, &shca_list);
-       spin_unlock_irqrestore(&shca_list_lock, flags);
-
-       return 0;
-
-probe8:
-       ret = ehca_destroy_aqp1(&shca->sport[0]);
-       if (ret)
-               ehca_err(&shca->ib_device,
-                        "Cannot destroy AQP1 for port 1. ret=%i", ret);
-
-probe7:
-       ib_unregister_device(&shca->ib_device);
-
-probe6:
-       ret = ehca_dereg_internal_maxmr(shca);
-       if (ret)
-               ehca_err(&shca->ib_device,
-                        "Cannot destroy internal MR. ret=%x", ret);
-
-probe5:
-       ret = ehca_dealloc_pd(&shca->pd->ib_pd);
-       if (ret)
-               ehca_err(&shca->ib_device,
-                        "Cannot destroy internal PD. ret=%x", ret);
-
-probe4:
-       ret = ehca_destroy_eq(shca, &shca->neq);
-       if (ret)
-               ehca_err(&shca->ib_device,
-                        "Cannot destroy NEQ. ret=%x", ret);
-
-probe3:
-       ret = ehca_destroy_eq(shca, &shca->eq);
-       if (ret)
-               ehca_err(&shca->ib_device,
-                        "Cannot destroy EQ. ret=%x", ret);
-
-probe1:
-       ib_dealloc_device(&shca->ib_device);
-
-       return -EINVAL;
-}
-
-static int ehca_remove(struct platform_device *dev)
-{
-       struct ehca_shca *shca = dev_get_drvdata(&dev->dev);
-       unsigned long flags;
-       int ret;
-
-       sysfs_remove_group(&dev->dev.kobj, &ehca_dev_attr_grp);
-
-       if (ehca_open_aqp1 == 1) {
-               int i;
-               for (i = 0; i < shca->num_ports; i++) {
-                       ret = ehca_destroy_aqp1(&shca->sport[i]);
-                       if (ret)
-                               ehca_err(&shca->ib_device,
-                                        "Cannot destroy AQP1 for port %x "
-                                        "ret=%i", i, ret);
-               }
-       }
-
-       ib_unregister_device(&shca->ib_device);
-
-       ret = ehca_dereg_internal_maxmr(shca);
-       if (ret)
-               ehca_err(&shca->ib_device,
-                        "Cannot destroy internal MR. ret=%i", ret);
-
-       ret = ehca_dealloc_pd(&shca->pd->ib_pd);
-       if (ret)
-               ehca_err(&shca->ib_device,
-                        "Cannot destroy internal PD. ret=%i", ret);
-
-       ret = ehca_destroy_eq(shca, &shca->eq);
-       if (ret)
-               ehca_err(&shca->ib_device, "Cannot destroy EQ. ret=%i", ret);
-
-       ret = ehca_destroy_eq(shca, &shca->neq);
-       if (ret)
-               ehca_err(&shca->ib_device, "Cannot destroy NEQ. ret=%i", ret);
-
-       ib_dealloc_device(&shca->ib_device);
-
-       spin_lock_irqsave(&shca_list_lock, flags);
-       list_del(&shca->shca_list);
-       spin_unlock_irqrestore(&shca_list_lock, flags);
-
-       return ret;
-}
-
-static struct of_device_id ehca_device_table[] =
-{
-       {
-               .name       = "lhca",
-               .compatible = "IBM,lhca",
-       },
-       {},
-};
-MODULE_DEVICE_TABLE(of, ehca_device_table);
-
-static struct platform_driver ehca_driver = {
-       .probe       = ehca_probe,
-       .remove      = ehca_remove,
-       .driver = {
-               .name = "ehca",
-               .owner = THIS_MODULE,
-               .groups = ehca_drv_attr_groups,
-               .of_match_table = ehca_device_table,
-       },
-};
-
-void ehca_poll_eqs(unsigned long data)
-{
-       struct ehca_shca *shca;
-
-       spin_lock(&shca_list_lock);
-       list_for_each_entry(shca, &shca_list, shca_list) {
-               if (shca->eq.is_initialized) {
-                       /* call deadman proc only if eq ptr does not change */
-                       struct ehca_eq *eq = &shca->eq;
-                       int max = 3;
-                       volatile u64 q_ofs, q_ofs2;
-                       unsigned long flags;
-                       spin_lock_irqsave(&eq->spinlock, flags);
-                       q_ofs = eq->ipz_queue.current_q_offset;
-                       spin_unlock_irqrestore(&eq->spinlock, flags);
-                       do {
-                               spin_lock_irqsave(&eq->spinlock, flags);
-                               q_ofs2 = eq->ipz_queue.current_q_offset;
-                               spin_unlock_irqrestore(&eq->spinlock, flags);
-                               max--;
-                       } while (q_ofs == q_ofs2 && max > 0);
-                       if (q_ofs == q_ofs2)
-                               ehca_process_eq(shca, 0);
-               }
-       }
-       mod_timer(&poll_eqs_timer, round_jiffies(jiffies + HZ));
-       spin_unlock(&shca_list_lock);
-}
-
-static int ehca_mem_notifier(struct notifier_block *nb,
-                            unsigned long action, void *data)
-{
-       static unsigned long ehca_dmem_warn_time;
-       unsigned long flags;
-
-       switch (action) {
-       case MEM_CANCEL_OFFLINE:
-       case MEM_CANCEL_ONLINE:
-       case MEM_ONLINE:
-       case MEM_OFFLINE:
-               return NOTIFY_OK;
-       case MEM_GOING_ONLINE:
-       case MEM_GOING_OFFLINE:
-               /* only ok if no hca is attached to the lpar */
-               spin_lock_irqsave(&shca_list_lock, flags);
-               if (list_empty(&shca_list)) {
-                       spin_unlock_irqrestore(&shca_list_lock, flags);
-                       return NOTIFY_OK;
-               } else {
-                       spin_unlock_irqrestore(&shca_list_lock, flags);
-                       if (printk_timed_ratelimit(&ehca_dmem_warn_time,
-                                                  30 * 1000))
-                               ehca_gen_err("DMEM operations are not allowed "
-                                            "in conjunction with eHCA");
-                       return NOTIFY_BAD;
-               }
-       }
-       return NOTIFY_OK;
-}
-
-static struct notifier_block ehca_mem_nb = {
-       .notifier_call = ehca_mem_notifier,
-};
-
-static int __init ehca_module_init(void)
-{
-       int ret;
-
-       printk(KERN_INFO "eHCA Infiniband Device Driver "
-              "(Version " HCAD_VERSION ")\n");
-
-       ret = ehca_create_comp_pool();
-       if (ret) {
-               ehca_gen_err("Cannot create comp pool.");
-               return ret;
-       }
-
-       ret = ehca_create_slab_caches();
-       if (ret) {
-               ehca_gen_err("Cannot create SLAB caches");
-               ret = -ENOMEM;
-               goto module_init1;
-       }
-
-       ret = ehca_create_busmap();
-       if (ret) {
-               ehca_gen_err("Cannot create busmap.");
-               goto module_init2;
-       }
-
-       ret = ibmebus_register_driver(&ehca_driver);
-       if (ret) {
-               ehca_gen_err("Cannot register eHCA device driver");
-               ret = -EINVAL;
-               goto module_init3;
-       }
-
-       ret = register_memory_notifier(&ehca_mem_nb);
-       if (ret) {
-               ehca_gen_err("Failed registering memory add/remove notifier");
-               goto module_init4;
-       }
-
-       if (ehca_poll_all_eqs != 1) {
-               ehca_gen_err("WARNING!!!");
-               ehca_gen_err("It is possible to lose interrupts.");
-       } else {
-               init_timer(&poll_eqs_timer);
-               poll_eqs_timer.function = ehca_poll_eqs;
-               poll_eqs_timer.expires = jiffies + HZ;
-               add_timer(&poll_eqs_timer);
-       }
-
-       return 0;
-
-module_init4:
-       ibmebus_unregister_driver(&ehca_driver);
-
-module_init3:
-       ehca_destroy_busmap();
-
-module_init2:
-       ehca_destroy_slab_caches();
-
-module_init1:
-       ehca_destroy_comp_pool();
-       return ret;
-};
-
-static void __exit ehca_module_exit(void)
-{
-       if (ehca_poll_all_eqs == 1)
-               del_timer_sync(&poll_eqs_timer);
-
-       ibmebus_unregister_driver(&ehca_driver);
-
-       unregister_memory_notifier(&ehca_mem_nb);
-
-       ehca_destroy_busmap();
-
-       ehca_destroy_slab_caches();
-
-       ehca_destroy_comp_pool();
-
-       idr_destroy(&ehca_cq_idr);
-       idr_destroy(&ehca_qp_idr);
-};
-
-module_init(ehca_module_init);
-module_exit(ehca_module_exit);
diff --git a/drivers/infiniband/hw/ehca/ehca_mcast.c b/drivers/infiniband/hw/ehca/ehca_mcast.c
deleted file mode 100644 (file)
index cec1815..0000000
--- a/drivers/infiniband/hw/ehca/ehca_mcast.c
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  mcast  functions
- *
- *  Authors: Khadija Souissi <souissik@de.ibm.com>
- *           Waleri Fomin <fomin@de.ibm.com>
- *           Reinhard Ernst <rernst@de.ibm.com>
- *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *           Heiko J Schick <schickhj@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/module.h>
-#include <linux/err.h>
-#include "ehca_classes.h"
-#include "ehca_tools.h"
-#include "ehca_qes.h"
-#include "ehca_iverbs.h"
-#include "hcp_if.h"
-
-#define MAX_MC_LID 0xFFFE
-#define MIN_MC_LID 0xC000      /* Multicast limits */
-#define EHCA_VALID_MULTICAST_GID(gid)  ((gid)[0] == 0xFF)
-#define EHCA_VALID_MULTICAST_LID(lid) \
-       (((lid) >= MIN_MC_LID) && ((lid) <= MAX_MC_LID))
-
-int ehca_attach_mcast(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
-{
-       struct ehca_qp *my_qp = container_of(ibqp, struct ehca_qp, ib_qp);
-       struct ehca_shca *shca = container_of(ibqp->device, struct ehca_shca,
-                                             ib_device);
-       union ib_gid my_gid;
-       u64 subnet_prefix, interface_id, h_ret;
-
-       if (ibqp->qp_type != IB_QPT_UD) {
-               ehca_err(ibqp->device, "invalid qp_type=%x", ibqp->qp_type);
-               return -EINVAL;
-       }
-
-       if (!(EHCA_VALID_MULTICAST_GID(gid->raw))) {
-               ehca_err(ibqp->device, "invalid multicast gid");
-               return -EINVAL;
-       } else if ((lid < MIN_MC_LID) || (lid > MAX_MC_LID)) {
-               ehca_err(ibqp->device, "invalid multicast lid=%x", lid);
-               return -EINVAL;
-       }
-
-       memcpy(&my_gid, gid->raw, sizeof(union ib_gid));
-
-       subnet_prefix = be64_to_cpu(my_gid.global.subnet_prefix);
-       interface_id = be64_to_cpu(my_gid.global.interface_id);
-       h_ret = hipz_h_attach_mcqp(shca->ipz_hca_handle,
-                                  my_qp->ipz_qp_handle,
-                                  my_qp->galpas.kernel,
-                                  lid, subnet_prefix, interface_id);
-       if (h_ret != H_SUCCESS)
-               ehca_err(ibqp->device,
-                        "ehca_qp=%p qp_num=%x hipz_h_attach_mcqp() failed "
-                        "h_ret=%lli", my_qp, ibqp->qp_num, h_ret);
-
-       return ehca2ib_return_code(h_ret);
-}
-
-int ehca_detach_mcast(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
-{
-       struct ehca_qp *my_qp = container_of(ibqp, struct ehca_qp, ib_qp);
-       struct ehca_shca *shca = container_of(ibqp->pd->device,
-                                             struct ehca_shca, ib_device);
-       union ib_gid my_gid;
-       u64 subnet_prefix, interface_id, h_ret;
-
-       if (ibqp->qp_type != IB_QPT_UD) {
-               ehca_err(ibqp->device, "invalid qp_type %x", ibqp->qp_type);
-               return -EINVAL;
-       }
-
-       if (!(EHCA_VALID_MULTICAST_GID(gid->raw))) {
-               ehca_err(ibqp->device, "invalid multicast gid");
-               return -EINVAL;
-       } else if ((lid < MIN_MC_LID) || (lid > MAX_MC_LID)) {
-               ehca_err(ibqp->device, "invalid multicast lid=%x", lid);
-               return -EINVAL;
-       }
-
-       memcpy(&my_gid, gid->raw, sizeof(union ib_gid));
-
-       subnet_prefix = be64_to_cpu(my_gid.global.subnet_prefix);
-       interface_id = be64_to_cpu(my_gid.global.interface_id);
-       h_ret = hipz_h_detach_mcqp(shca->ipz_hca_handle,
-                                  my_qp->ipz_qp_handle,
-                                  my_qp->galpas.kernel,
-                                  lid, subnet_prefix, interface_id);
-       if (h_ret != H_SUCCESS)
-               ehca_err(ibqp->device,
-                        "ehca_qp=%p qp_num=%x hipz_h_detach_mcqp() failed "
-                        "h_ret=%lli", my_qp, ibqp->qp_num, h_ret);
-
-       return ehca2ib_return_code(h_ret);
-}
diff --git a/drivers/infiniband/hw/ehca/ehca_mrmw.c b/drivers/infiniband/hw/ehca/ehca_mrmw.c
deleted file mode 100644 (file)
index f914b30..0000000
--- a/drivers/infiniband/hw/ehca/ehca_mrmw.c
+++ /dev/null
@@ -1,2593 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  MR/MW functions
- *
- *  Authors: Dietmar Decker <ddecker@de.ibm.com>
- *           Christoph Raisch <raisch@de.ibm.com>
- *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/slab.h>
-#include <rdma/ib_umem.h>
-
-#include "ehca_iverbs.h"
-#include "ehca_mrmw.h"
-#include "hcp_if.h"
-#include "hipz_hw.h"
-
-#define NUM_CHUNKS(length, chunk_size) \
-       (((length) + (chunk_size - 1)) / (chunk_size))
-
-/* max number of rpages (per hcall register_rpages) */
-#define MAX_RPAGES 512
-
-/* DMEM toleration management */
-#define EHCA_SECTSHIFT        SECTION_SIZE_BITS
-#define EHCA_SECTSIZE          (1UL << EHCA_SECTSHIFT)
-#define EHCA_HUGEPAGESHIFT     34
-#define EHCA_HUGEPAGE_SIZE     (1UL << EHCA_HUGEPAGESHIFT)
-#define EHCA_HUGEPAGE_PFN_MASK ((EHCA_HUGEPAGE_SIZE - 1) >> PAGE_SHIFT)
-#define EHCA_INVAL_ADDR        0xFFFFFFFFFFFFFFFFULL
-#define EHCA_DIR_INDEX_SHIFT 13                   /* 8k Entries in 64k block */
-#define EHCA_TOP_INDEX_SHIFT (EHCA_DIR_INDEX_SHIFT * 2)
-#define EHCA_MAP_ENTRIES (1 << EHCA_DIR_INDEX_SHIFT)
-#define EHCA_TOP_MAP_SIZE (0x10000)               /* currently fixed map size */
-#define EHCA_DIR_MAP_SIZE (0x10000)
-#define EHCA_ENT_MAP_SIZE (0x10000)
-#define EHCA_INDEX_MASK (EHCA_MAP_ENTRIES - 1)
-
-static unsigned long ehca_mr_len;
-
-/*
- * Memory map data structures
- */
-struct ehca_dir_bmap {
-       u64 ent[EHCA_MAP_ENTRIES];
-};
-struct ehca_top_bmap {
-       struct ehca_dir_bmap *dir[EHCA_MAP_ENTRIES];
-};
-struct ehca_bmap {
-       struct ehca_top_bmap *top[EHCA_MAP_ENTRIES];
-};
-
-static struct ehca_bmap *ehca_bmap;
-
-static struct kmem_cache *mr_cache;
-static struct kmem_cache *mw_cache;
-
-enum ehca_mr_pgsize {
-       EHCA_MR_PGSIZE4K  = 0x1000L,
-       EHCA_MR_PGSIZE64K = 0x10000L,
-       EHCA_MR_PGSIZE1M  = 0x100000L,
-       EHCA_MR_PGSIZE16M = 0x1000000L
-};
-
-#define EHCA_MR_PGSHIFT4K  12
-#define EHCA_MR_PGSHIFT64K 16
-#define EHCA_MR_PGSHIFT1M  20
-#define EHCA_MR_PGSHIFT16M 24
-
-static u64 ehca_map_vaddr(void *caddr);
-
-static u32 ehca_encode_hwpage_size(u32 pgsize)
-{
-       int log = ilog2(pgsize);
-       WARN_ON(log < 12 || log > 24 || log & 3);
-       return (log - 12) / 4;
-}
-
-static u64 ehca_get_max_hwpage_size(struct ehca_shca *shca)
-{
-       return rounddown_pow_of_two(shca->hca_cap_mr_pgsize);
-}
-
-static struct ehca_mr *ehca_mr_new(void)
-{
-       struct ehca_mr *me;
-
-       me = kmem_cache_zalloc(mr_cache, GFP_KERNEL);
-       if (me)
-               spin_lock_init(&me->mrlock);
-       else
-               ehca_gen_err("alloc failed");
-
-       return me;
-}
-
-static void ehca_mr_delete(struct ehca_mr *me)
-{
-       kmem_cache_free(mr_cache, me);
-}
-
-static struct ehca_mw *ehca_mw_new(void)
-{
-       struct ehca_mw *me;
-
-       me = kmem_cache_zalloc(mw_cache, GFP_KERNEL);
-       if (me)
-               spin_lock_init(&me->mwlock);
-       else
-               ehca_gen_err("alloc failed");
-
-       return me;
-}
-
-static void ehca_mw_delete(struct ehca_mw *me)
-{
-       kmem_cache_free(mw_cache, me);
-}
-
-/*----------------------------------------------------------------------*/
-
-struct ib_mr *ehca_get_dma_mr(struct ib_pd *pd, int mr_access_flags)
-{
-       struct ib_mr *ib_mr;
-       int ret;
-       struct ehca_mr *e_maxmr;
-       struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd);
-       struct ehca_shca *shca =
-               container_of(pd->device, struct ehca_shca, ib_device);
-
-       if (shca->maxmr) {
-               e_maxmr = ehca_mr_new();
-               if (!e_maxmr) {
-                       ehca_err(&shca->ib_device, "out of memory");
-                       ib_mr = ERR_PTR(-ENOMEM);
-                       goto get_dma_mr_exit0;
-               }
-
-               ret = ehca_reg_maxmr(shca, e_maxmr,
-                                    (void *)ehca_map_vaddr((void *)(KERNELBASE + PHYSICAL_START)),
-                                    mr_access_flags, e_pd,
-                                    &e_maxmr->ib.ib_mr.lkey,
-                                    &e_maxmr->ib.ib_mr.rkey);
-               if (ret) {
-                       ehca_mr_delete(e_maxmr);
-                       ib_mr = ERR_PTR(ret);
-                       goto get_dma_mr_exit0;
-               }
-               ib_mr = &e_maxmr->ib.ib_mr;
-       } else {
-               ehca_err(&shca->ib_device, "no internal max-MR exist!");
-               ib_mr = ERR_PTR(-EINVAL);
-               goto get_dma_mr_exit0;
-       }
-
-get_dma_mr_exit0:
-       if (IS_ERR(ib_mr))
-               ehca_err(&shca->ib_device, "h_ret=%li pd=%p mr_access_flags=%x",
-                        PTR_ERR(ib_mr), pd, mr_access_flags);
-       return ib_mr;
-} /* end ehca_get_dma_mr() */
-
-/*----------------------------------------------------------------------*/
-
-struct ib_mr *ehca_reg_phys_mr(struct ib_pd *pd,
-                              struct ib_phys_buf *phys_buf_array,
-                              int num_phys_buf,
-                              int mr_access_flags,
-                              u64 *iova_start)
-{
-       struct ib_mr *ib_mr;
-       int ret;
-       struct ehca_mr *e_mr;
-       struct ehca_shca *shca =
-               container_of(pd->device, struct ehca_shca, ib_device);
-       struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd);
-
-       u64 size;
-
-       if ((num_phys_buf <= 0) || !phys_buf_array) {
-               ehca_err(pd->device, "bad input values: num_phys_buf=%x "
-                        "phys_buf_array=%p", num_phys_buf, phys_buf_array);
-               ib_mr = ERR_PTR(-EINVAL);
-               goto reg_phys_mr_exit0;
-       }
-       if (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) &&
-            !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) ||
-           ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
-            !(mr_access_flags & IB_ACCESS_LOCAL_WRITE))) {
-               /*
-                * Remote Write Access requires Local Write Access
-                * Remote Atomic Access requires Local Write Access
-                */
-               ehca_err(pd->device, "bad input values: mr_access_flags=%x",
-                        mr_access_flags);
-               ib_mr = ERR_PTR(-EINVAL);
-               goto reg_phys_mr_exit0;
-       }
-
-       /* check physical buffer list and calculate size */
-       ret = ehca_mr_chk_buf_and_calc_size(phys_buf_array, num_phys_buf,
-                                           iova_start, &size);
-       if (ret) {
-               ib_mr = ERR_PTR(ret);
-               goto reg_phys_mr_exit0;
-       }
-       if ((size == 0) ||
-           (((u64)iova_start + size) < (u64)iova_start)) {
-               ehca_err(pd->device, "bad input values: size=%llx iova_start=%p",
-                        size, iova_start);
-               ib_mr = ERR_PTR(-EINVAL);
-               goto reg_phys_mr_exit0;
-       }
-
-       e_mr = ehca_mr_new();
-       if (!e_mr) {
-               ehca_err(pd->device, "out of memory");
-               ib_mr = ERR_PTR(-ENOMEM);
-               goto reg_phys_mr_exit0;
-       }
-
-       /* register MR on HCA */
-       if (ehca_mr_is_maxmr(size, iova_start)) {
-               e_mr->flags |= EHCA_MR_FLAG_MAXMR;
-               ret = ehca_reg_maxmr(shca, e_mr, iova_start, mr_access_flags,
-                                    e_pd, &e_mr->ib.ib_mr.lkey,
-                                    &e_mr->ib.ib_mr.rkey);
-               if (ret) {
-                       ib_mr = ERR_PTR(ret);
-                       goto reg_phys_mr_exit1;
-               }
-       } else {
-               struct ehca_mr_pginfo pginfo;
-               u32 num_kpages;
-               u32 num_hwpages;
-               u64 hw_pgsize;
-
-               num_kpages = NUM_CHUNKS(((u64)iova_start % PAGE_SIZE) + size,
-                                       PAGE_SIZE);
-               /* for kernel space we try the largest possible page size */
-               hw_pgsize = ehca_get_max_hwpage_size(shca);
-               num_hwpages = NUM_CHUNKS(((u64)iova_start % hw_pgsize) + size,
-                                        hw_pgsize);
-               memset(&pginfo, 0, sizeof(pginfo));
-               pginfo.type = EHCA_MR_PGI_PHYS;
-               pginfo.num_kpages = num_kpages;
-               pginfo.hwpage_size = hw_pgsize;
-               pginfo.num_hwpages = num_hwpages;
-               pginfo.u.phy.num_phys_buf = num_phys_buf;
-               pginfo.u.phy.phys_buf_array = phys_buf_array;
-               pginfo.next_hwpage =
-                       ((u64)iova_start & ~PAGE_MASK) / hw_pgsize;
-
-               ret = ehca_reg_mr(shca, e_mr, iova_start, size, mr_access_flags,
-                                 e_pd, &pginfo, &e_mr->ib.ib_mr.lkey,
-                                 &e_mr->ib.ib_mr.rkey, EHCA_REG_MR);
-               if (ret) {
-                       ib_mr = ERR_PTR(ret);
-                       goto reg_phys_mr_exit1;
-               }
-       }
-
-       /* successful registration of all pages */
-       return &e_mr->ib.ib_mr;
-
-reg_phys_mr_exit1:
-       ehca_mr_delete(e_mr);
-reg_phys_mr_exit0:
-       if (IS_ERR(ib_mr))
-               ehca_err(pd->device, "h_ret=%li pd=%p phys_buf_array=%p "
-                        "num_phys_buf=%x mr_access_flags=%x iova_start=%p",
-                        PTR_ERR(ib_mr), pd, phys_buf_array,
-                        num_phys_buf, mr_access_flags, iova_start);
-       return ib_mr;
-} /* end ehca_reg_phys_mr() */
-
-/*----------------------------------------------------------------------*/
-
-struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
-                              u64 virt, int mr_access_flags,
-                              struct ib_udata *udata)
-{
-       struct ib_mr *ib_mr;
-       struct ehca_mr *e_mr;
-       struct ehca_shca *shca =
-               container_of(pd->device, struct ehca_shca, ib_device);
-       struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd);
-       struct ehca_mr_pginfo pginfo;
-       int ret, page_shift;
-       u32 num_kpages;
-       u32 num_hwpages;
-       u64 hwpage_size;
-
-       if (!pd) {
-               ehca_gen_err("bad pd=%p", pd);
-               return ERR_PTR(-EFAULT);
-       }
-
-       if (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) &&
-            !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) ||
-           ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
-            !(mr_access_flags & IB_ACCESS_LOCAL_WRITE))) {
-               /*
-                * Remote Write Access requires Local Write Access
-                * Remote Atomic Access requires Local Write Access
-                */
-               ehca_err(pd->device, "bad input values: mr_access_flags=%x",
-                        mr_access_flags);
-               ib_mr = ERR_PTR(-EINVAL);
-               goto reg_user_mr_exit0;
-       }
-
-       if (length == 0 || virt + length < virt) {
-               ehca_err(pd->device, "bad input values: length=%llx "
-                        "virt_base=%llx", length, virt);
-               ib_mr = ERR_PTR(-EINVAL);
-               goto reg_user_mr_exit0;
-       }
-
-       e_mr = ehca_mr_new();
-       if (!e_mr) {
-               ehca_err(pd->device, "out of memory");
-               ib_mr = ERR_PTR(-ENOMEM);
-               goto reg_user_mr_exit0;
-       }
-
-       e_mr->umem = ib_umem_get(pd->uobject->context, start, length,
-                                mr_access_flags, 0);
-       if (IS_ERR(e_mr->umem)) {
-               ib_mr = (void *)e_mr->umem;
-               goto reg_user_mr_exit1;
-       }
-
-       if (e_mr->umem->page_size != PAGE_SIZE) {
-               ehca_err(pd->device, "page size not supported, "
-                        "e_mr->umem->page_size=%x", e_mr->umem->page_size);
-               ib_mr = ERR_PTR(-EINVAL);
-               goto reg_user_mr_exit2;
-       }
-
-       /* determine number of MR pages */
-       num_kpages = NUM_CHUNKS((virt % PAGE_SIZE) + length, PAGE_SIZE);
-       /* select proper hw_pgsize */
-       page_shift = PAGE_SHIFT;
-       if (e_mr->umem->hugetlb) {
-               /* determine page_shift, clamp between 4K and 16M */
-               page_shift = (fls64(length - 1) + 3) & ~3;
-               page_shift = min(max(page_shift, EHCA_MR_PGSHIFT4K),
-                                EHCA_MR_PGSHIFT16M);
-       }
-       hwpage_size = 1UL << page_shift;
-
-       /* now that we have the desired page size, shift until it's
-        * supported, too. 4K is always supported, so this terminates.
-        */
-       while (!(hwpage_size & shca->hca_cap_mr_pgsize))
-               hwpage_size >>= 4;
-
-reg_user_mr_fallback:
-       num_hwpages = NUM_CHUNKS((virt % hwpage_size) + length, hwpage_size);
-       /* register MR on HCA */
-       memset(&pginfo, 0, sizeof(pginfo));
-       pginfo.type = EHCA_MR_PGI_USER;
-       pginfo.hwpage_size = hwpage_size;
-       pginfo.num_kpages = num_kpages;
-       pginfo.num_hwpages = num_hwpages;
-       pginfo.u.usr.region = e_mr->umem;
-       pginfo.next_hwpage = ib_umem_offset(e_mr->umem) / hwpage_size;
-       pginfo.u.usr.next_sg = pginfo.u.usr.region->sg_head.sgl;
-       ret = ehca_reg_mr(shca, e_mr, (u64 *)virt, length, mr_access_flags,
-                         e_pd, &pginfo, &e_mr->ib.ib_mr.lkey,
-                         &e_mr->ib.ib_mr.rkey, EHCA_REG_MR);
-       if (ret == -EINVAL && pginfo.hwpage_size > PAGE_SIZE) {
-               ehca_warn(pd->device, "failed to register mr "
-                         "with hwpage_size=%llx", hwpage_size);
-               ehca_info(pd->device, "try to register mr with "
-                         "kpage_size=%lx", PAGE_SIZE);
-               /*
-                * this means kpages are not contiguous for a hw page
-                * try kernel page size as fallback solution
-                */
-               hwpage_size = PAGE_SIZE;
-               goto reg_user_mr_fallback;
-       }
-       if (ret) {
-               ib_mr = ERR_PTR(ret);
-               goto reg_user_mr_exit2;
-       }
-
-       /* successful registration of all pages */
-       return &e_mr->ib.ib_mr;
-
-reg_user_mr_exit2:
-       ib_umem_release(e_mr->umem);
-reg_user_mr_exit1:
-       ehca_mr_delete(e_mr);
-reg_user_mr_exit0:
-       if (IS_ERR(ib_mr))
-               ehca_err(pd->device, "rc=%li pd=%p mr_access_flags=%x udata=%p",
-                        PTR_ERR(ib_mr), pd, mr_access_flags, udata);
-       return ib_mr;
-} /* end ehca_reg_user_mr() */
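For hugetlb regions, the page_shift computed above is fls64(length - 1) rounded up to a multiple of 4 and clamped to the 4K..16M range, i.e. the smallest of the four supported sizes that covers the region (capped at 16M); the while loop then shifts down by 4 bits at a time until the HCA capability mask admits the size. Two worked examples with hypothetical lengths, assuming all four sizes are supported:

    /* length = 64 KiB: fls64(0xffff) = 16, (16 + 3) & ~3 = 16 -> 64K pages   */
    /* length = 5 MiB:  fls64(0x4fffff) = 23, (23 + 3) & ~3 = 24 -> 16M pages */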
-
-/*----------------------------------------------------------------------*/
-
-int ehca_rereg_phys_mr(struct ib_mr *mr,
-                      int mr_rereg_mask,
-                      struct ib_pd *pd,
-                      struct ib_phys_buf *phys_buf_array,
-                      int num_phys_buf,
-                      int mr_access_flags,
-                      u64 *iova_start)
-{
-       int ret;
-
-       struct ehca_shca *shca =
-               container_of(mr->device, struct ehca_shca, ib_device);
-       struct ehca_mr *e_mr = container_of(mr, struct ehca_mr, ib.ib_mr);
-       u64 new_size;
-       u64 *new_start;
-       u32 new_acl;
-       struct ehca_pd *new_pd;
-       u32 tmp_lkey, tmp_rkey;
-       unsigned long sl_flags;
-       u32 num_kpages = 0;
-       u32 num_hwpages = 0;
-       struct ehca_mr_pginfo pginfo;
-
-       if (!(mr_rereg_mask & IB_MR_REREG_TRANS)) {
-               /* TODO not supported, because PHYP rereg hCall needs pages */
-               ehca_err(mr->device, "rereg without IB_MR_REREG_TRANS not "
-                        "supported yet, mr_rereg_mask=%x", mr_rereg_mask);
-               ret = -EINVAL;
-               goto rereg_phys_mr_exit0;
-       }
-
-       if (mr_rereg_mask & IB_MR_REREG_PD) {
-               if (!pd) {
-                       ehca_err(mr->device, "rereg with bad pd, pd=%p "
-                                "mr_rereg_mask=%x", pd, mr_rereg_mask);
-                       ret = -EINVAL;
-                       goto rereg_phys_mr_exit0;
-               }
-       }
-
-       if ((mr_rereg_mask &
-            ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS)) ||
-           (mr_rereg_mask == 0)) {
-               ret = -EINVAL;
-               goto rereg_phys_mr_exit0;
-       }
-
-       /* check other parameters */
-       if (e_mr == shca->maxmr) {
-               /* should be impossible, however reject to be sure */
-               ehca_err(mr->device, "rereg internal max-MR impossible, mr=%p "
-                        "shca->maxmr=%p mr->lkey=%x",
-                        mr, shca->maxmr, mr->lkey);
-               ret = -EINVAL;
-               goto rereg_phys_mr_exit0;
-       }
-       if (mr_rereg_mask & IB_MR_REREG_TRANS) { /* transl., i.e. addr/size */
-               if (e_mr->flags & EHCA_MR_FLAG_FMR) {
-                       ehca_err(mr->device, "not supported for FMR, mr=%p "
-                                "flags=%x", mr, e_mr->flags);
-                       ret = -EINVAL;
-                       goto rereg_phys_mr_exit0;
-               }
-               if (!phys_buf_array || num_phys_buf <= 0) {
-                       ehca_err(mr->device, "bad input values mr_rereg_mask=%x"
-                                " phys_buf_array=%p num_phys_buf=%x",
-                                mr_rereg_mask, phys_buf_array, num_phys_buf);
-                       ret = -EINVAL;
-                       goto rereg_phys_mr_exit0;
-               }
-       }
-       if ((mr_rereg_mask & IB_MR_REREG_ACCESS) &&     /* change ACL */
-           (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) &&
-             !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) ||
-            ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
-             !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)))) {
-               /*
-                * Remote Write Access requires Local Write Access
-                * Remote Atomic Access requires Local Write Access
-                */
-               ehca_err(mr->device, "bad input values: mr_rereg_mask=%x "
-                        "mr_access_flags=%x", mr_rereg_mask, mr_access_flags);
-               ret = -EINVAL;
-               goto rereg_phys_mr_exit0;
-       }
-
-       /* set requested values dependent on rereg request */
-       spin_lock_irqsave(&e_mr->mrlock, sl_flags);
-       new_start = e_mr->start;
-       new_size = e_mr->size;
-       new_acl = e_mr->acl;
-       new_pd = container_of(mr->pd, struct ehca_pd, ib_pd);
-
-       if (mr_rereg_mask & IB_MR_REREG_TRANS) {
-               u64 hw_pgsize = ehca_get_max_hwpage_size(shca);
-
-               new_start = iova_start; /* change address */
-               /* check physical buffer list and calculate size */
-               ret = ehca_mr_chk_buf_and_calc_size(phys_buf_array,
-                                                   num_phys_buf, iova_start,
-                                                   &new_size);
-               if (ret)
-                       goto rereg_phys_mr_exit1;
-               if ((new_size == 0) ||
-                   (((u64)iova_start + new_size) < (u64)iova_start)) {
-                       ehca_err(mr->device, "bad input values: new_size=%llx "
-                                "iova_start=%p", new_size, iova_start);
-                       ret = -EINVAL;
-                       goto rereg_phys_mr_exit1;
-               }
-               num_kpages = NUM_CHUNKS(((u64)new_start % PAGE_SIZE) +
-                                       new_size, PAGE_SIZE);
-               num_hwpages = NUM_CHUNKS(((u64)new_start % hw_pgsize) +
-                                        new_size, hw_pgsize);
-               memset(&pginfo, 0, sizeof(pginfo));
-               pginfo.type = EHCA_MR_PGI_PHYS;
-               pginfo.num_kpages = num_kpages;
-               pginfo.hwpage_size = hw_pgsize;
-               pginfo.num_hwpages = num_hwpages;
-               pginfo.u.phy.num_phys_buf = num_phys_buf;
-               pginfo.u.phy.phys_buf_array = phys_buf_array;
-               pginfo.next_hwpage =
-                       ((u64)iova_start & ~PAGE_MASK) / hw_pgsize;
-       }
-       if (mr_rereg_mask & IB_MR_REREG_ACCESS)
-               new_acl = mr_access_flags;
-       if (mr_rereg_mask & IB_MR_REREG_PD)
-               new_pd = container_of(pd, struct ehca_pd, ib_pd);
-
-       ret = ehca_rereg_mr(shca, e_mr, new_start, new_size, new_acl,
-                           new_pd, &pginfo, &tmp_lkey, &tmp_rkey);
-       if (ret)
-               goto rereg_phys_mr_exit1;
-
-       /* successful reregistration */
-       if (mr_rereg_mask & IB_MR_REREG_PD)
-               mr->pd = pd;
-       mr->lkey = tmp_lkey;
-       mr->rkey = tmp_rkey;
-
-rereg_phys_mr_exit1:
-       spin_unlock_irqrestore(&e_mr->mrlock, sl_flags);
-rereg_phys_mr_exit0:
-       if (ret)
-               ehca_err(mr->device, "ret=%i mr=%p mr_rereg_mask=%x pd=%p "
-                        "phys_buf_array=%p num_phys_buf=%x mr_access_flags=%x "
-                        "iova_start=%p",
-                        ret, mr, mr_rereg_mask, pd, phys_buf_array,
-                        num_phys_buf, mr_access_flags, iova_start);
-       return ret;
-} /* end ehca_rereg_phys_mr() */
-
-/*----------------------------------------------------------------------*/
-
-int ehca_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr)
-{
-       int ret = 0;
-       u64 h_ret;
-       struct ehca_shca *shca =
-               container_of(mr->device, struct ehca_shca, ib_device);
-       struct ehca_mr *e_mr = container_of(mr, struct ehca_mr, ib.ib_mr);
-       unsigned long sl_flags;
-       struct ehca_mr_hipzout_parms hipzout;
-
-       if ((e_mr->flags & EHCA_MR_FLAG_FMR)) {
-               ehca_err(mr->device, "not supported for FMR, mr=%p e_mr=%p "
-                        "e_mr->flags=%x", mr, e_mr, e_mr->flags);
-               ret = -EINVAL;
-               goto query_mr_exit0;
-       }
-
-       memset(mr_attr, 0, sizeof(struct ib_mr_attr));
-       spin_lock_irqsave(&e_mr->mrlock, sl_flags);
-
-       h_ret = hipz_h_query_mr(shca->ipz_hca_handle, e_mr, &hipzout);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(mr->device, "hipz_mr_query failed, h_ret=%lli mr=%p "
-                        "hca_hndl=%llx mr_hndl=%llx lkey=%x",
-                        h_ret, mr, shca->ipz_hca_handle.handle,
-                        e_mr->ipz_mr_handle.handle, mr->lkey);
-               ret = ehca2ib_return_code(h_ret);
-               goto query_mr_exit1;
-       }
-       mr_attr->pd = mr->pd;
-       mr_attr->device_virt_addr = hipzout.vaddr;
-       mr_attr->size = hipzout.len;
-       mr_attr->lkey = hipzout.lkey;
-       mr_attr->rkey = hipzout.rkey;
-       ehca_mrmw_reverse_map_acl(&hipzout.acl, &mr_attr->mr_access_flags);
-
-query_mr_exit1:
-       spin_unlock_irqrestore(&e_mr->mrlock, sl_flags);
-query_mr_exit0:
-       if (ret)
-               ehca_err(mr->device, "ret=%i mr=%p mr_attr=%p",
-                        ret, mr, mr_attr);
-       return ret;
-} /* end ehca_query_mr() */
-
-/*----------------------------------------------------------------------*/
-
-int ehca_dereg_mr(struct ib_mr *mr)
-{
-       int ret = 0;
-       u64 h_ret;
-       struct ehca_shca *shca =
-               container_of(mr->device, struct ehca_shca, ib_device);
-       struct ehca_mr *e_mr = container_of(mr, struct ehca_mr, ib.ib_mr);
-
-       if ((e_mr->flags & EHCA_MR_FLAG_FMR)) {
-               ehca_err(mr->device, "not supported for FMR, mr=%p e_mr=%p "
-                        "e_mr->flags=%x", mr, e_mr, e_mr->flags);
-               ret = -EINVAL;
-               goto dereg_mr_exit0;
-       } else if (e_mr == shca->maxmr) {
-               /* should be impossible, however reject to be sure */
-               ehca_err(mr->device, "dereg internal max-MR impossible, mr=%p "
-                        "shca->maxmr=%p mr->lkey=%x",
-                        mr, shca->maxmr, mr->lkey);
-               ret = -EINVAL;
-               goto dereg_mr_exit0;
-       }
-
-       /* TODO: BUSY: MR still has bound window(s) */
-       h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_mr);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(mr->device, "hipz_free_mr failed, h_ret=%lli shca=%p "
-                        "e_mr=%p hca_hndl=%llx mr_hndl=%llx mr->lkey=%x",
-                        h_ret, shca, e_mr, shca->ipz_hca_handle.handle,
-                        e_mr->ipz_mr_handle.handle, mr->lkey);
-               ret = ehca2ib_return_code(h_ret);
-               goto dereg_mr_exit0;
-       }
-
-       if (e_mr->umem)
-               ib_umem_release(e_mr->umem);
-
-       /* successful deregistration */
-       ehca_mr_delete(e_mr);
-
-dereg_mr_exit0:
-       if (ret)
-               ehca_err(mr->device, "ret=%i mr=%p", ret, mr);
-       return ret;
-} /* end ehca_dereg_mr() */
-
-/*----------------------------------------------------------------------*/
-
-struct ib_mw *ehca_alloc_mw(struct ib_pd *pd, enum ib_mw_type type)
-{
-       struct ib_mw *ib_mw;
-       u64 h_ret;
-       struct ehca_mw *e_mw;
-       struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd);
-       struct ehca_shca *shca =
-               container_of(pd->device, struct ehca_shca, ib_device);
-       struct ehca_mw_hipzout_parms hipzout;
-
-       if (type != IB_MW_TYPE_1)
-               return ERR_PTR(-EINVAL);
-
-       e_mw = ehca_mw_new();
-       if (!e_mw) {
-               ib_mw = ERR_PTR(-ENOMEM);
-               goto alloc_mw_exit0;
-       }
-
-       h_ret = hipz_h_alloc_resource_mw(shca->ipz_hca_handle, e_mw,
-                                        e_pd->fw_pd, &hipzout);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(pd->device, "hipz_mw_allocate failed, h_ret=%lli "
-                        "shca=%p hca_hndl=%llx mw=%p",
-                        h_ret, shca, shca->ipz_hca_handle.handle, e_mw);
-               ib_mw = ERR_PTR(ehca2ib_return_code(h_ret));
-               goto alloc_mw_exit1;
-       }
-       /* successful MW allocation */
-       e_mw->ipz_mw_handle = hipzout.handle;
-       e_mw->ib_mw.rkey    = hipzout.rkey;
-       return &e_mw->ib_mw;
-
-alloc_mw_exit1:
-       ehca_mw_delete(e_mw);
-alloc_mw_exit0:
-       if (IS_ERR(ib_mw))
-               ehca_err(pd->device, "h_ret=%li pd=%p", PTR_ERR(ib_mw), pd);
-       return ib_mw;
-} /* end ehca_alloc_mw() */
-
-/*----------------------------------------------------------------------*/
-
-int ehca_bind_mw(struct ib_qp *qp,
-                struct ib_mw *mw,
-                struct ib_mw_bind *mw_bind)
-{
-       /* TODO: not yet supported */
-       ehca_gen_err("bind MW currently not supported by HCAD");
-
-       return -EPERM;
-} /* end ehca_bind_mw() */
-
-/*----------------------------------------------------------------------*/
-
-int ehca_dealloc_mw(struct ib_mw *mw)
-{
-       u64 h_ret;
-       struct ehca_shca *shca =
-               container_of(mw->device, struct ehca_shca, ib_device);
-       struct ehca_mw *e_mw = container_of(mw, struct ehca_mw, ib_mw);
-
-       h_ret = hipz_h_free_resource_mw(shca->ipz_hca_handle, e_mw);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(mw->device, "hipz_free_mw failed, h_ret=%lli shca=%p "
-                        "mw=%p rkey=%x hca_hndl=%llx mw_hndl=%llx",
-                        h_ret, shca, mw, mw->rkey, shca->ipz_hca_handle.handle,
-                        e_mw->ipz_mw_handle.handle);
-               return ehca2ib_return_code(h_ret);
-       }
-       /* successful deallocation */
-       ehca_mw_delete(e_mw);
-       return 0;
-} /* end ehca_dealloc_mw() */
-
-/*----------------------------------------------------------------------*/
-
-struct ib_fmr *ehca_alloc_fmr(struct ib_pd *pd,
-                             int mr_access_flags,
-                             struct ib_fmr_attr *fmr_attr)
-{
-       struct ib_fmr *ib_fmr;
-       struct ehca_shca *shca =
-               container_of(pd->device, struct ehca_shca, ib_device);
-       struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd);
-       struct ehca_mr *e_fmr;
-       int ret;
-       u32 tmp_lkey, tmp_rkey;
-       struct ehca_mr_pginfo pginfo;
-       u64 hw_pgsize;
-
-       /* check other parameters */
-       if (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) &&
-            !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) ||
-           ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
-            !(mr_access_flags & IB_ACCESS_LOCAL_WRITE))) {
-               /*
-                * Remote Write Access requires Local Write Access
-                * Remote Atomic Access requires Local Write Access
-                */
-               ehca_err(pd->device, "bad input values: mr_access_flags=%x",
-                        mr_access_flags);
-               ib_fmr = ERR_PTR(-EINVAL);
-               goto alloc_fmr_exit0;
-       }
-       if (mr_access_flags & IB_ACCESS_MW_BIND) {
-               ehca_err(pd->device, "bad input values: mr_access_flags=%x",
-                        mr_access_flags);
-               ib_fmr = ERR_PTR(-EINVAL);
-               goto alloc_fmr_exit0;
-       }
-       if ((fmr_attr->max_pages == 0) || (fmr_attr->max_maps == 0)) {
-               ehca_err(pd->device, "bad input values: fmr_attr->max_pages=%x "
-                        "fmr_attr->max_maps=%x fmr_attr->page_shift=%x",
-                        fmr_attr->max_pages, fmr_attr->max_maps,
-                        fmr_attr->page_shift);
-               ib_fmr = ERR_PTR(-EINVAL);
-               goto alloc_fmr_exit0;
-       }
-
-       hw_pgsize = 1 << fmr_attr->page_shift;
-       if (!(hw_pgsize & shca->hca_cap_mr_pgsize)) {
-               ehca_err(pd->device, "unsupported fmr_attr->page_shift=%x",
-                        fmr_attr->page_shift);
-               ib_fmr = ERR_PTR(-EINVAL);
-               goto alloc_fmr_exit0;
-       }
-
-       e_fmr = ehca_mr_new();
-       if (!e_fmr) {
-               ib_fmr = ERR_PTR(-ENOMEM);
-               goto alloc_fmr_exit0;
-       }
-       e_fmr->flags |= EHCA_MR_FLAG_FMR;
-
-       /* register MR on HCA */
-       memset(&pginfo, 0, sizeof(pginfo));
-       pginfo.hwpage_size = hw_pgsize;
-       /*
-        * pginfo.num_hwpages==0, ie register_rpages() will not be called
-        * but deferred to map_phys_fmr()
-        */
-       ret = ehca_reg_mr(shca, e_fmr, NULL,
-                         fmr_attr->max_pages * (1 << fmr_attr->page_shift),
-                         mr_access_flags, e_pd, &pginfo,
-                         &tmp_lkey, &tmp_rkey, EHCA_REG_MR);
-       if (ret) {
-               ib_fmr = ERR_PTR(ret);
-               goto alloc_fmr_exit1;
-       }
-
-       /* successful */
-       e_fmr->hwpage_size = hw_pgsize;
-       e_fmr->fmr_page_size = 1 << fmr_attr->page_shift;
-       e_fmr->fmr_max_pages = fmr_attr->max_pages;
-       e_fmr->fmr_max_maps = fmr_attr->max_maps;
-       e_fmr->fmr_map_cnt = 0;
-       return &e_fmr->ib.ib_fmr;
-
-alloc_fmr_exit1:
-       ehca_mr_delete(e_fmr);
-alloc_fmr_exit0:
-       return ib_fmr;
-} /* end ehca_alloc_fmr() */
-
-/*----------------------------------------------------------------------*/
-
-int ehca_map_phys_fmr(struct ib_fmr *fmr,
-                     u64 *page_list,
-                     int list_len,
-                     u64 iova)
-{
-       int ret;
-       struct ehca_shca *shca =
-               container_of(fmr->device, struct ehca_shca, ib_device);
-       struct ehca_mr *e_fmr = container_of(fmr, struct ehca_mr, ib.ib_fmr);
-       struct ehca_pd *e_pd = container_of(fmr->pd, struct ehca_pd, ib_pd);
-       struct ehca_mr_pginfo pginfo;
-       u32 tmp_lkey, tmp_rkey;
-
-       if (!(e_fmr->flags & EHCA_MR_FLAG_FMR)) {
-               ehca_err(fmr->device, "not a FMR, e_fmr=%p e_fmr->flags=%x",
-                        e_fmr, e_fmr->flags);
-               ret = -EINVAL;
-               goto map_phys_fmr_exit0;
-       }
-       ret = ehca_fmr_check_page_list(e_fmr, page_list, list_len);
-       if (ret)
-               goto map_phys_fmr_exit0;
-       if (iova % e_fmr->fmr_page_size) {
-               /* only whole-numbered pages */
-               ehca_err(fmr->device, "bad iova, iova=%llx fmr_page_size=%x",
-                        iova, e_fmr->fmr_page_size);
-               ret = -EINVAL;
-               goto map_phys_fmr_exit0;
-       }
-       if (e_fmr->fmr_map_cnt >= e_fmr->fmr_max_maps) {
-               /* HCAD does not limit the maps, however trace this anyway */
-               ehca_info(fmr->device, "map limit exceeded, fmr=%p "
-                         "e_fmr->fmr_map_cnt=%x e_fmr->fmr_max_maps=%x",
-                         fmr, e_fmr->fmr_map_cnt, e_fmr->fmr_max_maps);
-       }
-
-       memset(&pginfo, 0, sizeof(pginfo));
-       pginfo.type = EHCA_MR_PGI_FMR;
-       pginfo.num_kpages = list_len;
-       pginfo.hwpage_size = e_fmr->hwpage_size;
-       pginfo.num_hwpages =
-               list_len * e_fmr->fmr_page_size / pginfo.hwpage_size;
-       pginfo.u.fmr.page_list = page_list;
-       pginfo.next_hwpage =
-               (iova & (e_fmr->fmr_page_size-1)) / pginfo.hwpage_size;
-       pginfo.u.fmr.fmr_pgsize = e_fmr->fmr_page_size;
-
-       ret = ehca_rereg_mr(shca, e_fmr, (u64 *)iova,
-                           list_len * e_fmr->fmr_page_size,
-                           e_fmr->acl, e_pd, &pginfo, &tmp_lkey, &tmp_rkey);
-       if (ret)
-               goto map_phys_fmr_exit0;
-
-       /* successful reregistration */
-       e_fmr->fmr_map_cnt++;
-       e_fmr->ib.ib_fmr.lkey = tmp_lkey;
-       e_fmr->ib.ib_fmr.rkey = tmp_rkey;
-       return 0;
-
-map_phys_fmr_exit0:
-       if (ret)
-               ehca_err(fmr->device, "ret=%i fmr=%p page_list=%p list_len=%x "
-                        "iova=%llx", ret, fmr, page_list, list_len, iova);
-       return ret;
-} /* end ehca_map_phys_fmr() */
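The pginfo arithmetic above turns the caller's FMR page list into hardware pages: each of the list_len FMR pages of fmr_page_size bytes contributes fmr_page_size / hwpage_size hardware pages, and because the iova was already checked to be FMR-page aligned, next_hwpage comes out as 0. A small worked example with hypothetical sizes:

    /* list_len = 4, fmr_page_size = 64 KiB, hwpage_size = 4 KiB:          */
    /*   num_kpages  = 4                                                   */
    /*   num_hwpages = 4 * 65536 / 4096 = 64                               */
    /*   next_hwpage = (iova & 0xffff) / 4096 = 0 (iova is 64 KiB aligned) */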
-
-/*----------------------------------------------------------------------*/
-
-int ehca_unmap_fmr(struct list_head *fmr_list)
-{
-       int ret = 0;
-       struct ib_fmr *ib_fmr;
-       struct ehca_shca *shca = NULL;
-       struct ehca_shca *prev_shca;
-       struct ehca_mr *e_fmr;
-       u32 num_fmr = 0;
-       u32 unmap_fmr_cnt = 0;
-
-       /* check all FMR belong to same SHCA, and check internal flag */
-       list_for_each_entry(ib_fmr, fmr_list, list) {
-               prev_shca = shca;
-               shca = container_of(ib_fmr->device, struct ehca_shca,
-                                   ib_device);
-               e_fmr = container_of(ib_fmr, struct ehca_mr, ib.ib_fmr);
-               if ((shca != prev_shca) && prev_shca) {
-                       ehca_err(&shca->ib_device, "SHCA mismatch, shca=%p "
-                                "prev_shca=%p e_fmr=%p",
-                                shca, prev_shca, e_fmr);
-                       ret = -EINVAL;
-                       goto unmap_fmr_exit0;
-               }
-               if (!(e_fmr->flags & EHCA_MR_FLAG_FMR)) {
-                       ehca_err(&shca->ib_device, "not a FMR, e_fmr=%p "
-                                "e_fmr->flags=%x", e_fmr, e_fmr->flags);
-                       ret = -EINVAL;
-                       goto unmap_fmr_exit0;
-               }
-               num_fmr++;
-       }
-
-       /* loop over all FMRs to unmap */
-       list_for_each_entry(ib_fmr, fmr_list, list) {
-               unmap_fmr_cnt++;
-               e_fmr = container_of(ib_fmr, struct ehca_mr, ib.ib_fmr);
-               shca = container_of(ib_fmr->device, struct ehca_shca,
-                                   ib_device);
-               ret = ehca_unmap_one_fmr(shca, e_fmr);
-               if (ret) {
-                       /* unmap failed, stop unmapping of rest of FMRs */
-                       ehca_err(&shca->ib_device, "unmap of one FMR failed, "
-                                "stop rest, e_fmr=%p num_fmr=%x "
-                                "unmap_fmr_cnt=%x lkey=%x", e_fmr, num_fmr,
-                                unmap_fmr_cnt, e_fmr->ib.ib_fmr.lkey);
-                       goto unmap_fmr_exit0;
-               }
-       }
-
-unmap_fmr_exit0:
-       if (ret)
-               ehca_gen_err("ret=%i fmr_list=%p num_fmr=%x unmap_fmr_cnt=%x",
-                            ret, fmr_list, num_fmr, unmap_fmr_cnt);
-       return ret;
-} /* end ehca_unmap_fmr() */
-
-/*----------------------------------------------------------------------*/
-
-int ehca_dealloc_fmr(struct ib_fmr *fmr)
-{
-       int ret;
-       u64 h_ret;
-       struct ehca_shca *shca =
-               container_of(fmr->device, struct ehca_shca, ib_device);
-       struct ehca_mr *e_fmr = container_of(fmr, struct ehca_mr, ib.ib_fmr);
-
-       if (!(e_fmr->flags & EHCA_MR_FLAG_FMR)) {
-               ehca_err(fmr->device, "not a FMR, e_fmr=%p e_fmr->flags=%x",
-                        e_fmr, e_fmr->flags);
-               ret = -EINVAL;
-               goto free_fmr_exit0;
-       }
-
-       h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_fmr);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(fmr->device, "hipz_free_mr failed, h_ret=%lli e_fmr=%p "
-                        "hca_hndl=%llx fmr_hndl=%llx fmr->lkey=%x",
-                        h_ret, e_fmr, shca->ipz_hca_handle.handle,
-                        e_fmr->ipz_mr_handle.handle, fmr->lkey);
-               ret = ehca2ib_return_code(h_ret);
-               goto free_fmr_exit0;
-       }
-       /* successful deregistration */
-       ehca_mr_delete(e_fmr);
-       return 0;
-
-free_fmr_exit0:
-       if (ret)
-               ehca_err(&shca->ib_device, "ret=%i fmr=%p", ret, fmr);
-       return ret;
-} /* end ehca_dealloc_fmr() */
-
-/*----------------------------------------------------------------------*/
-
-static int ehca_reg_bmap_mr_rpages(struct ehca_shca *shca,
-                                  struct ehca_mr *e_mr,
-                                  struct ehca_mr_pginfo *pginfo);
-
-int ehca_reg_mr(struct ehca_shca *shca,
-               struct ehca_mr *e_mr,
-               u64 *iova_start,
-               u64 size,
-               int acl,
-               struct ehca_pd *e_pd,
-               struct ehca_mr_pginfo *pginfo,
-               u32 *lkey, /*OUT*/
-               u32 *rkey, /*OUT*/
-               enum ehca_reg_type reg_type)
-{
-       int ret;
-       u64 h_ret;
-       u32 hipz_acl;
-       struct ehca_mr_hipzout_parms hipzout;
-
-       ehca_mrmw_map_acl(acl, &hipz_acl);
-       ehca_mrmw_set_pgsize_hipz_acl(pginfo->hwpage_size, &hipz_acl);
-       if (ehca_use_hp_mr == 1)
-               hipz_acl |= 0x00000001;
-
-       h_ret = hipz_h_alloc_resource_mr(shca->ipz_hca_handle, e_mr,
-                                        (u64)iova_start, size, hipz_acl,
-                                        e_pd->fw_pd, &hipzout);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(&shca->ib_device, "hipz_alloc_mr failed, h_ret=%lli "
-                        "hca_hndl=%llx", h_ret, shca->ipz_hca_handle.handle);
-               ret = ehca2ib_return_code(h_ret);
-               goto ehca_reg_mr_exit0;
-       }
-
-       e_mr->ipz_mr_handle = hipzout.handle;
-
-       if (reg_type == EHCA_REG_BUSMAP_MR)
-               ret = ehca_reg_bmap_mr_rpages(shca, e_mr, pginfo);
-       else if (reg_type == EHCA_REG_MR)
-               ret = ehca_reg_mr_rpages(shca, e_mr, pginfo);
-       else
-               ret = -EINVAL;
-
-       if (ret)
-               goto ehca_reg_mr_exit1;
-
-       /* successful registration */
-       e_mr->num_kpages = pginfo->num_kpages;
-       e_mr->num_hwpages = pginfo->num_hwpages;
-       e_mr->hwpage_size = pginfo->hwpage_size;
-       e_mr->start = iova_start;
-       e_mr->size = size;
-       e_mr->acl = acl;
-       *lkey = hipzout.lkey;
-       *rkey = hipzout.rkey;
-       return 0;
-
-ehca_reg_mr_exit1:
-       h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_mr);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(&shca->ib_device, "h_ret=%lli shca=%p e_mr=%p "
-                        "iova_start=%p size=%llx acl=%x e_pd=%p lkey=%x "
-                        "pginfo=%p num_kpages=%llx num_hwpages=%llx ret=%i",
-                        h_ret, shca, e_mr, iova_start, size, acl, e_pd,
-                        hipzout.lkey, pginfo, pginfo->num_kpages,
-                        pginfo->num_hwpages, ret);
-               ehca_err(&shca->ib_device, "internal error in ehca_reg_mr, "
-                        "not recoverable");
-       }
-ehca_reg_mr_exit0:
-       if (ret)
-               ehca_err(&shca->ib_device, "ret=%i shca=%p e_mr=%p "
-                        "iova_start=%p size=%llx acl=%x e_pd=%p pginfo=%p "
-                        "num_kpages=%llx num_hwpages=%llx",
-                        ret, shca, e_mr, iova_start, size, acl, e_pd, pginfo,
-                        pginfo->num_kpages, pginfo->num_hwpages);
-       return ret;
-} /* end ehca_reg_mr() */
-
-/*----------------------------------------------------------------------*/
-
-int ehca_reg_mr_rpages(struct ehca_shca *shca,
-                      struct ehca_mr *e_mr,
-                      struct ehca_mr_pginfo *pginfo)
-{
-       int ret = 0;
-       u64 h_ret;
-       u32 rnum;
-       u64 rpage;
-       u32 i;
-       u64 *kpage;
-
-       if (!pginfo->num_hwpages) /* in case of fmr */
-               return 0;
-
-       kpage = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
-       if (!kpage) {
-               ehca_err(&shca->ib_device, "kpage alloc failed");
-               ret = -ENOMEM;
-               goto ehca_reg_mr_rpages_exit0;
-       }
-
-       /* max MAX_RPAGES ehca mr pages per register call */
-       for (i = 0; i < NUM_CHUNKS(pginfo->num_hwpages, MAX_RPAGES); i++) {
-
-               if (i == NUM_CHUNKS(pginfo->num_hwpages, MAX_RPAGES) - 1) {
-                       rnum = pginfo->num_hwpages % MAX_RPAGES; /* last shot */
-                       if (rnum == 0)
-                               rnum = MAX_RPAGES;      /* last shot is full */
-               } else
-                       rnum = MAX_RPAGES;
-
-               ret = ehca_set_pagebuf(pginfo, rnum, kpage);
-               if (ret) {
-                       ehca_err(&shca->ib_device, "ehca_set_pagebuf "
-                                "bad rc, ret=%i rnum=%x kpage=%p",
-                                ret, rnum, kpage);
-                       goto ehca_reg_mr_rpages_exit1;
-               }
-
-               if (rnum > 1) {
-                       rpage = __pa(kpage);
-                       if (!rpage) {
-                               ehca_err(&shca->ib_device, "kpage=%p i=%x",
-                                        kpage, i);
-                               ret = -EFAULT;
-                               goto ehca_reg_mr_rpages_exit1;
-                       }
-               } else
-                       rpage = *kpage;
-
-               h_ret = hipz_h_register_rpage_mr(
-                       shca->ipz_hca_handle, e_mr,
-                       ehca_encode_hwpage_size(pginfo->hwpage_size),
-                       0, rpage, rnum);
-
-               if (i == NUM_CHUNKS(pginfo->num_hwpages, MAX_RPAGES) - 1) {
-                       /*
-                        * check for 'registration complete'==H_SUCCESS
-                        * and for 'page registered'==H_PAGE_REGISTERED
-                        */
-                       if (h_ret != H_SUCCESS) {
-                               ehca_err(&shca->ib_device, "last "
-                                        "hipz_reg_rpage_mr failed, h_ret=%lli "
-                                        "e_mr=%p i=%x hca_hndl=%llx mr_hndl=%llx"
-                                        " lkey=%x", h_ret, e_mr, i,
-                                        shca->ipz_hca_handle.handle,
-                                        e_mr->ipz_mr_handle.handle,
-                                        e_mr->ib.ib_mr.lkey);
-                               ret = ehca2ib_return_code(h_ret);
-                               break;
-                       } else
-                               ret = 0;
-               } else if (h_ret != H_PAGE_REGISTERED) {
-                       ehca_err(&shca->ib_device, "hipz_reg_rpage_mr failed, "
-                                "h_ret=%lli e_mr=%p i=%x lkey=%x hca_hndl=%llx "
-                                "mr_hndl=%llx", h_ret, e_mr, i,
-                                e_mr->ib.ib_mr.lkey,
-                                shca->ipz_hca_handle.handle,
-                                e_mr->ipz_mr_handle.handle);
-                       ret = ehca2ib_return_code(h_ret);
-                       break;
-               } else
-                       ret = 0;
-       } /* end for(i) */
-
-ehca_reg_mr_rpages_exit1:
-       ehca_free_fw_ctrlblock(kpage);
-ehca_reg_mr_rpages_exit0:
-       if (ret)
-               ehca_err(&shca->ib_device, "ret=%i shca=%p e_mr=%p pginfo=%p "
-                        "num_kpages=%llx num_hwpages=%llx", ret, shca, e_mr,
-                        pginfo, pginfo->num_kpages, pginfo->num_hwpages);
-       return ret;
-} /* end ehca_reg_mr_rpages() */
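Pages are passed to the firmware in batches of at most MAX_RPAGES (512) per hipz_h_register_rpage_mr() call; every batch but the last must return H_PAGE_REGISTERED and the final one H_SUCCESS. For a single page the page address itself is passed, otherwise the address of the kpage list. Batch sizes for two hypothetical page counts:

    /* num_hwpages = 1300: NUM_CHUNKS(1300, 512) = 3 calls,              */
    /*   batches of 512, 512 and 1300 % 512 = 276 pages                  */
    /* num_hwpages = 1024: 2 calls of 512 pages each (last shot is full) */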
-
-/*----------------------------------------------------------------------*/
-
-inline int ehca_rereg_mr_rereg1(struct ehca_shca *shca,
-                               struct ehca_mr *e_mr,
-                               u64 *iova_start,
-                               u64 size,
-                               u32 acl,
-                               struct ehca_pd *e_pd,
-                               struct ehca_mr_pginfo *pginfo,
-                               u32 *lkey, /*OUT*/
-                               u32 *rkey) /*OUT*/
-{
-       int ret;
-       u64 h_ret;
-       u32 hipz_acl;
-       u64 *kpage;
-       u64 rpage;
-       struct ehca_mr_pginfo pginfo_save;
-       struct ehca_mr_hipzout_parms hipzout;
-
-       ehca_mrmw_map_acl(acl, &hipz_acl);
-       ehca_mrmw_set_pgsize_hipz_acl(pginfo->hwpage_size, &hipz_acl);
-
-       kpage = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
-       if (!kpage) {
-               ehca_err(&shca->ib_device, "kpage alloc failed");
-               ret = -ENOMEM;
-               goto ehca_rereg_mr_rereg1_exit0;
-       }
-
-       pginfo_save = *pginfo;
-       ret = ehca_set_pagebuf(pginfo, pginfo->num_hwpages, kpage);
-       if (ret) {
-               ehca_err(&shca->ib_device, "set pagebuf failed, e_mr=%p "
-                        "pginfo=%p type=%x num_kpages=%llx num_hwpages=%llx "
-                        "kpage=%p", e_mr, pginfo, pginfo->type,
-                        pginfo->num_kpages, pginfo->num_hwpages, kpage);
-               goto ehca_rereg_mr_rereg1_exit1;
-       }
-       rpage = __pa(kpage);
-       if (!rpage) {
-               ehca_err(&shca->ib_device, "kpage=%p", kpage);
-               ret = -EFAULT;
-               goto ehca_rereg_mr_rereg1_exit1;
-       }
-       h_ret = hipz_h_reregister_pmr(shca->ipz_hca_handle, e_mr,
-                                     (u64)iova_start, size, hipz_acl,
-                                     e_pd->fw_pd, rpage, &hipzout);
-       if (h_ret != H_SUCCESS) {
-               /*
-                * reregistration unsuccessful, try it again with the 3 hCalls,
-                * e.g. this is required in case H_MR_CONDITION
-                * (MW bound or MR is shared)
-                */
-               ehca_warn(&shca->ib_device, "hipz_h_reregister_pmr failed "
-                         "(Rereg1), h_ret=%lli e_mr=%p", h_ret, e_mr);
-               *pginfo = pginfo_save;
-               ret = -EAGAIN;
-       } else if ((u64 *)hipzout.vaddr != iova_start) {
-               ehca_err(&shca->ib_device, "PHYP changed iova_start in "
-                        "rereg_pmr, iova_start=%p iova_start_out=%llx e_mr=%p "
-                        "mr_handle=%llx lkey=%x lkey_out=%x", iova_start,
-                        hipzout.vaddr, e_mr, e_mr->ipz_mr_handle.handle,
-                        e_mr->ib.ib_mr.lkey, hipzout.lkey);
-               ret = -EFAULT;
-       } else {
-               /*
-                * successful reregistration
-                * note: start and start_out are identical for eServer HCAs
-                */
-               e_mr->num_kpages = pginfo->num_kpages;
-               e_mr->num_hwpages = pginfo->num_hwpages;
-               e_mr->hwpage_size = pginfo->hwpage_size;
-               e_mr->start = iova_start;
-               e_mr->size = size;
-               e_mr->acl = acl;
-               *lkey = hipzout.lkey;
-               *rkey = hipzout.rkey;
-       }
-
-ehca_rereg_mr_rereg1_exit1:
-       ehca_free_fw_ctrlblock(kpage);
-ehca_rereg_mr_rereg1_exit0:
-       if (ret && (ret != -EAGAIN))
-               ehca_err(&shca->ib_device, "ret=%i lkey=%x rkey=%x "
-                        "pginfo=%p num_kpages=%llx num_hwpages=%llx",
-                        ret, *lkey, *rkey, pginfo, pginfo->num_kpages,
-                        pginfo->num_hwpages);
-       return ret;
-} /* end ehca_rereg_mr_rereg1() */
-
-/*----------------------------------------------------------------------*/
-
-int ehca_rereg_mr(struct ehca_shca *shca,
-                 struct ehca_mr *e_mr,
-                 u64 *iova_start,
-                 u64 size,
-                 int acl,
-                 struct ehca_pd *e_pd,
-                 struct ehca_mr_pginfo *pginfo,
-                 u32 *lkey,
-                 u32 *rkey)
-{
-       int ret = 0;
-       u64 h_ret;
-       int rereg_1_hcall = 1; /* 1: use hipz_h_reregister_pmr directly */
-       int rereg_3_hcall = 0; /* 1: use 3 hipz calls for reregistration */
-
-       /* first determine reregistration hCall(s) */
-       if ((pginfo->num_hwpages > MAX_RPAGES) ||
-           (e_mr->num_hwpages > MAX_RPAGES) ||
-           (pginfo->num_hwpages > e_mr->num_hwpages)) {
-               ehca_dbg(&shca->ib_device, "Rereg3 case, "
-                        "pginfo->num_hwpages=%llx e_mr->num_hwpages=%x",
-                        pginfo->num_hwpages, e_mr->num_hwpages);
-               rereg_1_hcall = 0;
-               rereg_3_hcall = 1;
-       }
-
-       if (e_mr->flags & EHCA_MR_FLAG_MAXMR) { /* check for max-MR */
-               rereg_1_hcall = 0;
-               rereg_3_hcall = 1;
-               e_mr->flags &= ~EHCA_MR_FLAG_MAXMR;
-               ehca_err(&shca->ib_device, "Rereg MR for max-MR! e_mr=%p",
-                        e_mr);
-       }
-
-       if (rereg_1_hcall) {
-               ret = ehca_rereg_mr_rereg1(shca, e_mr, iova_start, size,
-                                          acl, e_pd, pginfo, lkey, rkey);
-               if (ret) {
-                       if (ret == -EAGAIN)
-                               rereg_3_hcall = 1;
-                       else
-                               goto ehca_rereg_mr_exit0;
-               }
-       }
-
-       if (rereg_3_hcall) {
-               struct ehca_mr save_mr;
-
-               /* first deregister old MR */
-               h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_mr);
-               if (h_ret != H_SUCCESS) {
-                       ehca_err(&shca->ib_device, "hipz_free_mr failed, "
-                                "h_ret=%lli e_mr=%p hca_hndl=%llx mr_hndl=%llx "
-                                "mr->lkey=%x",
-                                h_ret, e_mr, shca->ipz_hca_handle.handle,
-                                e_mr->ipz_mr_handle.handle,
-                                e_mr->ib.ib_mr.lkey);
-                       ret = ehca2ib_return_code(h_ret);
-                       goto ehca_rereg_mr_exit0;
-               }
-               /* clean ehca_mr_t, without changing struct ib_mr and lock */
-               save_mr = *e_mr;
-               ehca_mr_deletenew(e_mr);
-
-               /* set some MR values */
-               e_mr->flags = save_mr.flags;
-               e_mr->hwpage_size = save_mr.hwpage_size;
-               e_mr->fmr_page_size = save_mr.fmr_page_size;
-               e_mr->fmr_max_pages = save_mr.fmr_max_pages;
-               e_mr->fmr_max_maps = save_mr.fmr_max_maps;
-               e_mr->fmr_map_cnt = save_mr.fmr_map_cnt;
-
-               ret = ehca_reg_mr(shca, e_mr, iova_start, size, acl,
-                                 e_pd, pginfo, lkey, rkey, EHCA_REG_MR);
-               if (ret) {
-                       u32 offset = (u64)(&e_mr->flags) - (u64)e_mr;
-                       memcpy(&e_mr->flags, &(save_mr.flags),
-                              sizeof(struct ehca_mr) - offset);
-                       goto ehca_rereg_mr_exit0;
-               }
-       }
-
-ehca_rereg_mr_exit0:
-       if (ret)
-               ehca_err(&shca->ib_device, "ret=%i shca=%p e_mr=%p "
-                        "iova_start=%p size=%llx acl=%x e_pd=%p pginfo=%p "
-                        "num_kpages=%llx lkey=%x rkey=%x rereg_1_hcall=%x "
-                        "rereg_3_hcall=%x", ret, shca, e_mr, iova_start, size,
-                        acl, e_pd, pginfo, pginfo->num_kpages, *lkey, *rkey,
-                        rereg_1_hcall, rereg_3_hcall);
-       return ret;
-} /* end ehca_rereg_mr() */
-
-/*----------------------------------------------------------------------*/
-
-int ehca_unmap_one_fmr(struct ehca_shca *shca,
-                      struct ehca_mr *e_fmr)
-{
-       int ret = 0;
-       u64 h_ret;
-       struct ehca_pd *e_pd =
-               container_of(e_fmr->ib.ib_fmr.pd, struct ehca_pd, ib_pd);
-       struct ehca_mr save_fmr;
-       u32 tmp_lkey, tmp_rkey;
-       struct ehca_mr_pginfo pginfo;
-       struct ehca_mr_hipzout_parms hipzout;
-
-       if (e_fmr->fmr_max_pages <= MAX_RPAGES) {
-               /*
-                * note: after using rereg hcall with len=0,
-                * rereg hcall must be used again for registering pages
-                */
-               h_ret = hipz_h_reregister_pmr(shca->ipz_hca_handle, e_fmr, 0,
-                                             0, 0, e_pd->fw_pd, 0, &hipzout);
-               if (h_ret == H_SUCCESS) {
-                       /* successful reregistration */
-                       e_fmr->start = NULL;
-                       e_fmr->size = 0;
-                       tmp_lkey = hipzout.lkey;
-                       tmp_rkey = hipzout.rkey;
-                       return 0;
-               }
-               /*
-                * should not happen, because length checked above,
-                * FMRs are not shared and no MW bound to FMRs
-                */
-               ehca_err(&shca->ib_device, "hipz_reregister_pmr failed "
-                        "(Rereg1), h_ret=%lli e_fmr=%p hca_hndl=%llx "
-                        "mr_hndl=%llx lkey=%x lkey_out=%x",
-                        h_ret, e_fmr, shca->ipz_hca_handle.handle,
-                        e_fmr->ipz_mr_handle.handle,
-                        e_fmr->ib.ib_fmr.lkey, hipzout.lkey);
-               /* try free and rereg */
-       }
-
-       /* first free old FMR */
-       h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_fmr);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(&shca->ib_device, "hipz_free_mr failed, "
-                        "h_ret=%lli e_fmr=%p hca_hndl=%llx mr_hndl=%llx "
-                        "lkey=%x",
-                        h_ret, e_fmr, shca->ipz_hca_handle.handle,
-                        e_fmr->ipz_mr_handle.handle,
-                        e_fmr->ib.ib_fmr.lkey);
-               ret = ehca2ib_return_code(h_ret);
-               goto ehca_unmap_one_fmr_exit0;
-       }
-       /* clean ehca_mr_t, without changing lock */
-       save_fmr = *e_fmr;
-       ehca_mr_deletenew(e_fmr);
-
-       /* set some MR values */
-       e_fmr->flags = save_fmr.flags;
-       e_fmr->hwpage_size = save_fmr.hwpage_size;
-       e_fmr->fmr_page_size = save_fmr.fmr_page_size;
-       e_fmr->fmr_max_pages = save_fmr.fmr_max_pages;
-       e_fmr->fmr_max_maps = save_fmr.fmr_max_maps;
-       e_fmr->fmr_map_cnt = save_fmr.fmr_map_cnt;
-       e_fmr->acl = save_fmr.acl;
-
-       memset(&pginfo, 0, sizeof(pginfo));
-       pginfo.type = EHCA_MR_PGI_FMR;
-       ret = ehca_reg_mr(shca, e_fmr, NULL,
-                         (e_fmr->fmr_max_pages * e_fmr->fmr_page_size),
-                         e_fmr->acl, e_pd, &pginfo, &tmp_lkey,
-                         &tmp_rkey, EHCA_REG_MR);
-       if (ret) {
-               u32 offset = (u64)(&e_fmr->flags) - (u64)e_fmr;
-               memcpy(&e_fmr->flags, &(save_fmr.flags),
-                      sizeof(struct ehca_mr) - offset);
-       }
-
-ehca_unmap_one_fmr_exit0:
-       if (ret)
-               ehca_err(&shca->ib_device, "ret=%i tmp_lkey=%x tmp_rkey=%x "
-                        "fmr_max_pages=%x",
-                        ret, tmp_lkey, tmp_rkey, e_fmr->fmr_max_pages);
-       return ret;
-} /* end ehca_unmap_one_fmr() */
-
-/*----------------------------------------------------------------------*/
-
-int ehca_reg_smr(struct ehca_shca *shca,
-                struct ehca_mr *e_origmr,
-                struct ehca_mr *e_newmr,
-                u64 *iova_start,
-                int acl,
-                struct ehca_pd *e_pd,
-                u32 *lkey, /*OUT*/
-                u32 *rkey) /*OUT*/
-{
-       int ret = 0;
-       u64 h_ret;
-       u32 hipz_acl;
-       struct ehca_mr_hipzout_parms hipzout;
-
-       ehca_mrmw_map_acl(acl, &hipz_acl);
-       ehca_mrmw_set_pgsize_hipz_acl(e_origmr->hwpage_size, &hipz_acl);
-
-       h_ret = hipz_h_register_smr(shca->ipz_hca_handle, e_newmr, e_origmr,
-                                   (u64)iova_start, hipz_acl, e_pd->fw_pd,
-                                   &hipzout);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(&shca->ib_device, "hipz_reg_smr failed, h_ret=%lli "
-                        "shca=%p e_origmr=%p e_newmr=%p iova_start=%p acl=%x "
-                        "e_pd=%p hca_hndl=%llx mr_hndl=%llx lkey=%x",
-                        h_ret, shca, e_origmr, e_newmr, iova_start, acl, e_pd,
-                        shca->ipz_hca_handle.handle,
-                        e_origmr->ipz_mr_handle.handle,
-                        e_origmr->ib.ib_mr.lkey);
-               ret = ehca2ib_return_code(h_ret);
-               goto ehca_reg_smr_exit0;
-       }
-       /* successful registration */
-       e_newmr->num_kpages = e_origmr->num_kpages;
-       e_newmr->num_hwpages = e_origmr->num_hwpages;
-       e_newmr->hwpage_size   = e_origmr->hwpage_size;
-       e_newmr->start = iova_start;
-       e_newmr->size = e_origmr->size;
-       e_newmr->acl = acl;
-       e_newmr->ipz_mr_handle = hipzout.handle;
-       *lkey = hipzout.lkey;
-       *rkey = hipzout.rkey;
-       return 0;
-
-ehca_reg_smr_exit0:
-       if (ret)
-               ehca_err(&shca->ib_device, "ret=%i shca=%p e_origmr=%p "
-                        "e_newmr=%p iova_start=%p acl=%x e_pd=%p",
-                        ret, shca, e_origmr, e_newmr, iova_start, acl, e_pd);
-       return ret;
-} /* end ehca_reg_smr() */
-
-/*----------------------------------------------------------------------*/
-static inline void *ehca_calc_sectbase(int top, int dir, int idx)
-{
-       unsigned long ret = idx;
-       ret |= dir << EHCA_DIR_INDEX_SHIFT;
-       ret |= top << EHCA_TOP_INDEX_SHIFT;
-       return __va(ret << SECTION_SIZE_BITS);
-}
-
-#define ehca_bmap_valid(entry) \
-       ((u64)entry != (u64)EHCA_INVAL_ADDR)
-
-static u64 ehca_reg_mr_section(int top, int dir, int idx, u64 *kpage,
-                              struct ehca_shca *shca, struct ehca_mr *mr,
-                              struct ehca_mr_pginfo *pginfo)
-{
-       u64 h_ret = 0;
-       unsigned long page = 0;
-       u64 rpage = __pa(kpage);
-       int page_count;
-
-       void *sectbase = ehca_calc_sectbase(top, dir, idx);
-       if ((unsigned long)sectbase & (pginfo->hwpage_size - 1)) {
-               ehca_err(&shca->ib_device, "reg_mr_section will probably fail:"
-                                          "hwpage_size does not fit to "
-                                          "section start address");
-       }
-       page_count = EHCA_SECTSIZE / pginfo->hwpage_size;
-
-       while (page < page_count) {
-               u64 rnum;
-               for (rnum = 0; (rnum < MAX_RPAGES) && (page < page_count);
-                    rnum++) {
-                       void *pg = sectbase + ((page++) * pginfo->hwpage_size);
-                       kpage[rnum] = __pa(pg);
-               }
-
-               h_ret = hipz_h_register_rpage_mr(shca->ipz_hca_handle, mr,
-                       ehca_encode_hwpage_size(pginfo->hwpage_size),
-                       0, rpage, rnum);
-
-               if ((h_ret != H_SUCCESS) && (h_ret != H_PAGE_REGISTERED)) {
-                       ehca_err(&shca->ib_device, "register_rpage_mr failed");
-                       return h_ret;
-               }
-       }
-       return h_ret;
-}
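-/*
- * ehca_reg_mr_section() hands the firmware at most MAX_RPAGES physical page
- * addresses per hipz_h_register_rpage_mr() hcall.  Rough example, assuming a
- * 16 MB section, 4 KB hw pages and MAX_RPAGES == 512: page_count is 4096, so
- * one section is registered in 4096 / 512 = 8 hcalls.
- */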
-
-static u64 ehca_reg_mr_sections(int top, int dir, u64 *kpage,
-                               struct ehca_shca *shca, struct ehca_mr *mr,
-                               struct ehca_mr_pginfo *pginfo)
-{
-       u64 hret = H_SUCCESS;
-       int idx;
-
-       for (idx = 0; idx < EHCA_MAP_ENTRIES; idx++) {
-               if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]->ent[idx]))
-                       continue;
-
-               hret = ehca_reg_mr_section(top, dir, idx, kpage, shca, mr,
-                                          pginfo);
-               if ((hret != H_SUCCESS) && (hret != H_PAGE_REGISTERED))
-                               return hret;
-       }
-       return hret;
-}
-
-static u64 ehca_reg_mr_dir_sections(int top, u64 *kpage, struct ehca_shca *shca,
-                                   struct ehca_mr *mr,
-                                   struct ehca_mr_pginfo *pginfo)
-{
-       u64 hret = H_SUCCESS;
-       int dir;
-
-       for (dir = 0; dir < EHCA_MAP_ENTRIES; dir++) {
-               if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]))
-                       continue;
-
-               hret = ehca_reg_mr_sections(top, dir, kpage, shca, mr, pginfo);
-               if ((hret != H_SUCCESS) && (hret != H_PAGE_REGISTERED))
-                               return hret;
-       }
-       return hret;
-}
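-/*
- * ehca_reg_mr_dir_sections() and ehca_reg_mr_sections() walk the same
- * top -> dir -> idx hierarchy that ehca_update_busmap() fills in below, so
- * only sections actually backed by system RAM are registered with the
- * firmware; entries still set to EHCA_INVAL_ADDR are skipped.
- */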
-
-/* register internal max-MR to internal SHCA */
-int ehca_reg_internal_maxmr(
-       struct ehca_shca *shca,
-       struct ehca_pd *e_pd,
-       struct ehca_mr **e_maxmr)  /*OUT*/
-{
-       int ret;
-       struct ehca_mr *e_mr;
-       u64 *iova_start;
-       u64 size_maxmr;
-       struct ehca_mr_pginfo pginfo;
-       struct ib_phys_buf ib_pbuf;
-       u32 num_kpages;
-       u32 num_hwpages;
-       u64 hw_pgsize;
-
-       if (!ehca_bmap) {
-               ret = -EFAULT;
-               goto ehca_reg_internal_maxmr_exit0;
-       }
-
-       e_mr = ehca_mr_new();
-       if (!e_mr) {
-               ehca_err(&shca->ib_device, "out of memory");
-               ret = -ENOMEM;
-               goto ehca_reg_internal_maxmr_exit0;
-       }
-       e_mr->flags |= EHCA_MR_FLAG_MAXMR;
-
-       /* register internal max-MR on HCA */
-       size_maxmr = ehca_mr_len;
-       iova_start = (u64 *)ehca_map_vaddr((void *)(KERNELBASE + PHYSICAL_START));
-       ib_pbuf.addr = 0;
-       ib_pbuf.size = size_maxmr;
-       num_kpages = NUM_CHUNKS(((u64)iova_start % PAGE_SIZE) + size_maxmr,
-                               PAGE_SIZE);
-       hw_pgsize = ehca_get_max_hwpage_size(shca);
-       num_hwpages = NUM_CHUNKS(((u64)iova_start % hw_pgsize) + size_maxmr,
-                                hw_pgsize);
-
-       memset(&pginfo, 0, sizeof(pginfo));
-       pginfo.type = EHCA_MR_PGI_PHYS;
-       pginfo.num_kpages = num_kpages;
-       pginfo.num_hwpages = num_hwpages;
-       pginfo.hwpage_size = hw_pgsize;
-       pginfo.u.phy.num_phys_buf = 1;
-       pginfo.u.phy.phys_buf_array = &ib_pbuf;
-
-       ret = ehca_reg_mr(shca, e_mr, iova_start, size_maxmr, 0, e_pd,
-                         &pginfo, &e_mr->ib.ib_mr.lkey,
-                         &e_mr->ib.ib_mr.rkey, EHCA_REG_BUSMAP_MR);
-       if (ret) {
-               ehca_err(&shca->ib_device, "reg of internal max MR failed, "
-                        "e_mr=%p iova_start=%p size_maxmr=%llx num_kpages=%x "
-                        "num_hwpages=%x", e_mr, iova_start, size_maxmr,
-                        num_kpages, num_hwpages);
-               goto ehca_reg_internal_maxmr_exit1;
-       }
-
-       /* successful registration of all pages */
-       e_mr->ib.ib_mr.device = e_pd->ib_pd.device;
-       e_mr->ib.ib_mr.pd = &e_pd->ib_pd;
-       e_mr->ib.ib_mr.uobject = NULL;
-       atomic_inc(&(e_pd->ib_pd.usecnt));
-       atomic_set(&(e_mr->ib.ib_mr.usecnt), 0);
-       *e_maxmr = e_mr;
-       return 0;
-
-ehca_reg_internal_maxmr_exit1:
-       ehca_mr_delete(e_mr);
-ehca_reg_internal_maxmr_exit0:
-       if (ret)
-               ehca_err(&shca->ib_device, "ret=%i shca=%p e_pd=%p e_maxmr=%p",
-                        ret, shca, e_pd, e_maxmr);
-       return ret;
-} /* end ehca_reg_internal_maxmr() */
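-/*
- * Sizing sketch for the internal max-MR above (assumed figures only):
- * NUM_CHUNKS rounds up, so with ehca_mr_len == 6 GB of mapped sections,
- * PAGE_SIZE == 64 KB and a 16 MB hw page size, num_kpages is 98304 and
- * num_hwpages is 384; an unaligned iova_start adds one more chunk to each
- * count.
- */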
-
-/*----------------------------------------------------------------------*/
-
-int ehca_reg_maxmr(struct ehca_shca *shca,
-                  struct ehca_mr *e_newmr,
-                  u64 *iova_start,
-                  int acl,
-                  struct ehca_pd *e_pd,
-                  u32 *lkey,
-                  u32 *rkey)
-{
-       u64 h_ret;
-       struct ehca_mr *e_origmr = shca->maxmr;
-       u32 hipz_acl;
-       struct ehca_mr_hipzout_parms hipzout;
-
-       ehca_mrmw_map_acl(acl, &hipz_acl);
-       ehca_mrmw_set_pgsize_hipz_acl(e_origmr->hwpage_size, &hipz_acl);
-
-       h_ret = hipz_h_register_smr(shca->ipz_hca_handle, e_newmr, e_origmr,
-                                   (u64)iova_start, hipz_acl, e_pd->fw_pd,
-                                   &hipzout);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(&shca->ib_device, "hipz_reg_smr failed, h_ret=%lli "
-                        "e_origmr=%p hca_hndl=%llx mr_hndl=%llx lkey=%x",
-                        h_ret, e_origmr, shca->ipz_hca_handle.handle,
-                        e_origmr->ipz_mr_handle.handle,
-                        e_origmr->ib.ib_mr.lkey);
-               return ehca2ib_return_code(h_ret);
-       }
-       /* successful registration */
-       e_newmr->num_kpages = e_origmr->num_kpages;
-       e_newmr->num_hwpages = e_origmr->num_hwpages;
-       e_newmr->hwpage_size = e_origmr->hwpage_size;
-       e_newmr->start = iova_start;
-       e_newmr->size = e_origmr->size;
-       e_newmr->acl = acl;
-       e_newmr->ipz_mr_handle = hipzout.handle;
-       *lkey = hipzout.lkey;
-       *rkey = hipzout.rkey;
-       return 0;
-} /* end ehca_reg_maxmr() */
-
-/*----------------------------------------------------------------------*/
-
-int ehca_dereg_internal_maxmr(struct ehca_shca *shca)
-{
-       int ret;
-       struct ehca_mr *e_maxmr;
-       struct ib_pd *ib_pd;
-
-       if (!shca->maxmr) {
-               ehca_err(&shca->ib_device, "bad call, shca=%p", shca);
-               ret = -EINVAL;
-               goto ehca_dereg_internal_maxmr_exit0;
-       }
-
-       e_maxmr = shca->maxmr;
-       ib_pd = e_maxmr->ib.ib_mr.pd;
-       shca->maxmr = NULL; /* remove internal max-MR indication from SHCA */
-
-       ret = ehca_dereg_mr(&e_maxmr->ib.ib_mr);
-       if (ret) {
-               ehca_err(&shca->ib_device, "dereg internal max-MR failed, "
-                        "ret=%i e_maxmr=%p shca=%p lkey=%x",
-                        ret, e_maxmr, shca, e_maxmr->ib.ib_mr.lkey);
-               shca->maxmr = e_maxmr;
-               goto ehca_dereg_internal_maxmr_exit0;
-       }
-
-       atomic_dec(&ib_pd->usecnt);
-
-ehca_dereg_internal_maxmr_exit0:
-       if (ret)
-               ehca_err(&shca->ib_device, "ret=%i shca=%p shca->maxmr=%p",
-                        ret, shca, shca->maxmr);
-       return ret;
-} /* end ehca_dereg_internal_maxmr() */
-
-/*----------------------------------------------------------------------*/
-
-/*
- * check physical buffer array of MR verbs for validity and
- * calculate the MR size
- */
-int ehca_mr_chk_buf_and_calc_size(struct ib_phys_buf *phys_buf_array,
-                                 int num_phys_buf,
-                                 u64 *iova_start,
-                                 u64 *size)
-{
-       struct ib_phys_buf *pbuf = phys_buf_array;
-       u64 size_count = 0;
-       u32 i;
-
-       if (num_phys_buf == 0) {
-               ehca_gen_err("bad phys buf array len, num_phys_buf=0");
-               return -EINVAL;
-       }
-       /* check first buffer */
-       if (((u64)iova_start & ~PAGE_MASK) != (pbuf->addr & ~PAGE_MASK)) {
-               ehca_gen_err("iova_start/addr mismatch, iova_start=%p "
-                            "pbuf->addr=%llx pbuf->size=%llx",
-                            iova_start, pbuf->addr, pbuf->size);
-               return -EINVAL;
-       }
-       if (((pbuf->addr + pbuf->size) % PAGE_SIZE) &&
-           (num_phys_buf > 1)) {
-               ehca_gen_err("addr/size mismatch in 1st buf, pbuf->addr=%llx "
-                            "pbuf->size=%llx", pbuf->addr, pbuf->size);
-               return -EINVAL;
-       }
-
-       for (i = 0; i < num_phys_buf; i++) {
-               if ((i > 0) && (pbuf->addr % PAGE_SIZE)) {
-                       ehca_gen_err("bad address, i=%x pbuf->addr=%llx "
-                                    "pbuf->size=%llx",
-                                    i, pbuf->addr, pbuf->size);
-                       return -EINVAL;
-               }
-               if (((i > 0) && /* not 1st */
-                    (i < (num_phys_buf - 1)) &&        /* not last */
-                    (pbuf->size % PAGE_SIZE)) || (pbuf->size == 0)) {
-                       ehca_gen_err("bad size, i=%x pbuf->size=%llx",
-                                    i, pbuf->size);
-                       return -EINVAL;
-               }
-               size_count += pbuf->size;
-               pbuf++;
-       }
-
-       *size = size_count;
-       return 0;
-} /* end ehca_mr_chk_buf_and_calc_size() */
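-/*
- * In short, the checks above require: the first buffer shares its in-page
- * offset with iova_start and, if further buffers follow, ends on a page
- * boundary; every later buffer starts page-aligned; buffers that are neither
- * first nor last are a whole number of pages; and no buffer is empty.
- * *size is simply the sum of all buffer sizes.
- */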
-
-/*----------------------------------------------------------------------*/
-
-/* check the page list of the map FMR verb for validity */
-int ehca_fmr_check_page_list(struct ehca_mr *e_fmr,
-                            u64 *page_list,
-                            int list_len)
-{
-       u32 i;
-       u64 *page;
-
-       if ((list_len == 0) || (list_len > e_fmr->fmr_max_pages)) {
-               ehca_gen_err("bad list_len, list_len=%x "
-                            "e_fmr->fmr_max_pages=%x fmr=%p",
-                            list_len, e_fmr->fmr_max_pages, e_fmr);
-               return -EINVAL;
-       }
-
-       /* each page must be aligned */
-       page = page_list;
-       for (i = 0; i < list_len; i++) {
-               if (*page % e_fmr->fmr_page_size) {
-                       ehca_gen_err("bad page, i=%x *page=%llx page=%p fmr=%p "
-                                    "fmr_page_size=%x", i, *page, page, e_fmr,
-                                    e_fmr->fmr_page_size);
-                       return -EINVAL;
-               }
-               page++;
-       }
-
-       return 0;
-} /* end ehca_fmr_check_page_list() */
-
-/*----------------------------------------------------------------------*/
-
-/* PAGE_SIZE >= pginfo->hwpage_size */
-static int ehca_set_pagebuf_user1(struct ehca_mr_pginfo *pginfo,
-                                 u32 number,
-                                 u64 *kpage)
-{
-       int ret = 0;
-       u64 pgaddr;
-       u32 j = 0;
-       int hwpages_per_kpage = PAGE_SIZE / pginfo->hwpage_size;
-       struct scatterlist **sg = &pginfo->u.usr.next_sg;
-
-       while (*sg != NULL) {
-               pgaddr = page_to_pfn(sg_page(*sg))
-                       << PAGE_SHIFT;
-               *kpage = pgaddr + (pginfo->next_hwpage *
-                                  pginfo->hwpage_size);
-               if (!(*kpage)) {
-                       ehca_gen_err("pgaddr=%llx "
-                                    "sg_dma_address=%llx "
-                                    "entry=%llx next_hwpage=%llx",
-                                    pgaddr, (u64)sg_dma_address(*sg),
-                                    pginfo->u.usr.next_nmap,
-                                    pginfo->next_hwpage);
-                       return -EFAULT;
-               }
-               (pginfo->hwpage_cnt)++;
-               (pginfo->next_hwpage)++;
-               kpage++;
-               if (pginfo->next_hwpage % hwpages_per_kpage == 0) {
-                       (pginfo->kpage_cnt)++;
-                       (pginfo->u.usr.next_nmap)++;
-                       pginfo->next_hwpage = 0;
-                       *sg = sg_next(*sg);
-               }
-               j++;
-               if (j >= number)
-                       break;
-       }
-
-       return ret;
-}
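-/*
- * Here hwpage_size <= PAGE_SIZE, so each kernel page yields
- * hwpages_per_kpage consecutive hw page addresses.  Assumed example:
- * PAGE_SIZE == 64 KB and hwpage_size == 4 KB give 16 hw pages per kernel
- * page; next_hwpage wraps to 0 and the scatterlist advances after every
- * 16th address written to kpage[].
- */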
-
-/*
- * check the given pages for a contiguous layout;
- * the last page address is returned in prev_pgaddr for further checks
- */
-static int ehca_check_kpages_per_ate(struct scatterlist **sg,
-                                    int num_pages,
-                                    u64 *prev_pgaddr)
-{
-       for (; *sg && num_pages > 0; *sg = sg_next(*sg), num_pages--) {
-               u64 pgaddr = page_to_pfn(sg_page(*sg)) << PAGE_SHIFT;
-               if (ehca_debug_level >= 3)
-                       ehca_gen_dbg("chunk_page=%llx value=%016llx", pgaddr,
-                                    *(u64 *)__va(pgaddr));
-               if (pgaddr - PAGE_SIZE != *prev_pgaddr) {
-                       ehca_gen_err("uncontiguous page found pgaddr=%llx "
-                                    "prev_pgaddr=%llx entries_left_in_hwpage=%x",
-                                    pgaddr, *prev_pgaddr, num_pages);
-                       return -EINVAL;
-               }
-               *prev_pgaddr = pgaddr;
-       }
-       return 0;
-}
-
-/* PAGE_SIZE < pginfo->hwpage_size */
-static int ehca_set_pagebuf_user2(struct ehca_mr_pginfo *pginfo,
-                                 u32 number,
-                                 u64 *kpage)
-{
-       int ret = 0;
-       u64 pgaddr, prev_pgaddr;
-       u32 j = 0;
-       int kpages_per_hwpage = pginfo->hwpage_size / PAGE_SIZE;
-       int nr_kpages = kpages_per_hwpage;
-       struct scatterlist **sg = &pginfo->u.usr.next_sg;
-
-       while (*sg != NULL) {
-
-               if (nr_kpages == kpages_per_hwpage) {
-                       pgaddr = (page_to_pfn(sg_page(*sg))
-                                  << PAGE_SHIFT);
-                       *kpage = pgaddr;
-                       if (!(*kpage)) {
-                               ehca_gen_err("pgaddr=%llx entry=%llx",
-                                            pgaddr, pginfo->u.usr.next_nmap);
-                               ret = -EFAULT;
-                               return ret;
-                       }
-                       /*
-                        * The first page in a hwpage must be aligned;
-                        * the first MR page is exempt from this rule.
-                        */
-                       if (pgaddr & (pginfo->hwpage_size - 1)) {
-                               if (pginfo->hwpage_cnt) {
-                                       ehca_gen_err(
-                                               "invalid alignment "
-                                               "pgaddr=%llx entry=%llx "
-                                               "mr_pgsize=%llx",
-                                               pgaddr, pginfo->u.usr.next_nmap,
-                                               pginfo->hwpage_size);
-                                       ret = -EFAULT;
-                                       return ret;
-                               }
-                               /* first MR page */
-                               pginfo->kpage_cnt =
-                                       (pgaddr &
-                                        (pginfo->hwpage_size - 1)) >>
-                                       PAGE_SHIFT;
-                               nr_kpages -= pginfo->kpage_cnt;
-                               *kpage = pgaddr &
-                                        ~(pginfo->hwpage_size - 1);
-                       }
-                       if (ehca_debug_level >= 3) {
-                               u64 val = *(u64 *)__va(pgaddr);
-                               ehca_gen_dbg("kpage=%llx page=%llx "
-                                            "value=%016llx",
-                                            *kpage, pgaddr, val);
-                       }
-                       prev_pgaddr = pgaddr;
-                       *sg = sg_next(*sg);
-                       pginfo->kpage_cnt++;
-                       pginfo->u.usr.next_nmap++;
-                       nr_kpages--;
-                       if (!nr_kpages)
-                               goto next_kpage;
-                       continue;
-               }
-
-               ret = ehca_check_kpages_per_ate(sg, nr_kpages,
-                                               &prev_pgaddr);
-               if (ret)
-                       return ret;
-               pginfo->kpage_cnt += nr_kpages;
-               pginfo->u.usr.next_nmap += nr_kpages;
-
-next_kpage:
-               nr_kpages = kpages_per_hwpage;
-               (pginfo->hwpage_cnt)++;
-               kpage++;
-               j++;
-               if (j >= number)
-                       break;
-       }
-
-       return ret;
-}
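-/*
- * ehca_set_pagebuf_user2() covers the opposite case, hwpage_size > PAGE_SIZE:
- * kpages_per_hwpage kernel pages must be physically contiguous to form one
- * hw page.  Only the very first MR page may start unaligned within a hw
- * page; for it, *kpage is rounded down to the hw page boundary and the
- * number of kernel pages still expected in that hw page is reduced
- * accordingly.
- */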
-
-static int ehca_set_pagebuf_phys(struct ehca_mr_pginfo *pginfo,
-                                u32 number, u64 *kpage)
-{
-       int ret = 0;
-       struct ib_phys_buf *pbuf;
-       u64 num_hw, offs_hw;
-       u32 i = 0;
-
-       /* loop over desired phys_buf_array entries */
-       while (i < number) {
-               pbuf   = pginfo->u.phy.phys_buf_array + pginfo->u.phy.next_buf;
-               num_hw  = NUM_CHUNKS((pbuf->addr % pginfo->hwpage_size) +
-                                    pbuf->size, pginfo->hwpage_size);
-               offs_hw = (pbuf->addr & ~(pginfo->hwpage_size - 1)) /
-                       pginfo->hwpage_size;
-               while (pginfo->next_hwpage < offs_hw + num_hw) {
-                       /* sanity check */
-                       if ((pginfo->kpage_cnt >= pginfo->num_kpages) ||
-                           (pginfo->hwpage_cnt >= pginfo->num_hwpages)) {
-                               ehca_gen_err("kpage_cnt >= num_kpages, "
-                                            "kpage_cnt=%llx num_kpages=%llx "
-                                            "hwpage_cnt=%llx "
-                                            "num_hwpages=%llx i=%x",
-                                            pginfo->kpage_cnt,
-                                            pginfo->num_kpages,
-                                            pginfo->hwpage_cnt,
-                                            pginfo->num_hwpages, i);
-                               return -EFAULT;
-                       }
-                       *kpage = (pbuf->addr & ~(pginfo->hwpage_size - 1)) +
-                                (pginfo->next_hwpage * pginfo->hwpage_size);
-                       if ( !(*kpage) && pbuf->addr ) {
-                               ehca_gen_err("pbuf->addr=%llx pbuf->size=%llx "
-                                            "next_hwpage=%llx", pbuf->addr,
-                                            pbuf->size, pginfo->next_hwpage);
-                               return -EFAULT;
-                       }
-                       (pginfo->hwpage_cnt)++;
-                       (pginfo->next_hwpage)++;
-                       if (PAGE_SIZE >= pginfo->hwpage_size) {
-                               if (pginfo->next_hwpage %
-                                   (PAGE_SIZE / pginfo->hwpage_size) == 0)
-                                       (pginfo->kpage_cnt)++;
-                       } else
-                               pginfo->kpage_cnt += pginfo->hwpage_size /
-                                       PAGE_SIZE;
-                       kpage++;
-                       i++;
-                       if (i >= number) break;
-               }
-               if (pginfo->next_hwpage >= offs_hw + num_hw) {
-                       (pginfo->u.phy.next_buf)++;
-                       pginfo->next_hwpage = 0;
-               }
-       }
-       return ret;
-}
-
-static int ehca_set_pagebuf_fmr(struct ehca_mr_pginfo *pginfo,
-                               u32 number, u64 *kpage)
-{
-       int ret = 0;
-       u64 *fmrlist;
-       u32 i;
-
-       /* loop over desired page_list entries */
-       fmrlist = pginfo->u.fmr.page_list + pginfo->u.fmr.next_listelem;
-       for (i = 0; i < number; i++) {
-               *kpage = (*fmrlist & ~(pginfo->hwpage_size - 1)) +
-                          pginfo->next_hwpage * pginfo->hwpage_size;
-               if ( !(*kpage) ) {
-                       ehca_gen_err("*fmrlist=%llx fmrlist=%p "
-                                    "next_listelem=%llx next_hwpage=%llx",
-                                    *fmrlist, fmrlist,
-                                    pginfo->u.fmr.next_listelem,
-                                    pginfo->next_hwpage);
-                       return -EFAULT;
-               }
-               (pginfo->hwpage_cnt)++;
-               if (pginfo->u.fmr.fmr_pgsize >= pginfo->hwpage_size) {
-                       if (pginfo->next_hwpage %
-                           (pginfo->u.fmr.fmr_pgsize /
-                            pginfo->hwpage_size) == 0) {
-                               (pginfo->kpage_cnt)++;
-                               (pginfo->u.fmr.next_listelem)++;
-                               fmrlist++;
-                               pginfo->next_hwpage = 0;
-                       } else
-                               (pginfo->next_hwpage)++;
-               } else {
-                       unsigned int cnt_per_hwpage = pginfo->hwpage_size /
-                               pginfo->u.fmr.fmr_pgsize;
-                       unsigned int j;
-                       u64 prev = *kpage;
-                       /* check if addresses are contiguous */
-                       for (j = 1; j < cnt_per_hwpage; j++) {
-                               u64 p = fmrlist[j] & ~(pginfo->hwpage_size - 1);
-                               if (prev + pginfo->u.fmr.fmr_pgsize != p) {
-                                       ehca_gen_err("uncontiguous fmr pages "
-                                                    "found prev=%llx p=%llx "
-                                                    "idx=%x", prev, p, i + j);
-                                       return -EINVAL;
-                               }
-                               prev = p;
-                       }
-                       pginfo->kpage_cnt += cnt_per_hwpage;
-                       pginfo->u.fmr.next_listelem += cnt_per_hwpage;
-                       fmrlist += cnt_per_hwpage;
-               }
-               kpage++;
-       }
-       return ret;
-}
-
-/* setup page buffer from page info */
-int ehca_set_pagebuf(struct ehca_mr_pginfo *pginfo,
-                    u32 number,
-                    u64 *kpage)
-{
-       int ret;
-
-       switch (pginfo->type) {
-       case EHCA_MR_PGI_PHYS:
-               ret = ehca_set_pagebuf_phys(pginfo, number, kpage);
-               break;
-       case EHCA_MR_PGI_USER:
-               ret = PAGE_SIZE >= pginfo->hwpage_size ?
-                       ehca_set_pagebuf_user1(pginfo, number, kpage) :
-                       ehca_set_pagebuf_user2(pginfo, number, kpage);
-               break;
-       case EHCA_MR_PGI_FMR:
-               ret = ehca_set_pagebuf_fmr(pginfo, number, kpage);
-               break;
-       default:
-               ehca_gen_err("bad pginfo->type=%x", pginfo->type);
-               ret = -EFAULT;
-               break;
-       }
-       return ret;
-} /* end ehca_set_pagebuf() */
-
-/*----------------------------------------------------------------------*/
-
-/*
- * check whether an MR is a max-MR, i.e. covers the whole of memory;
- * returns 1 if it is a max-MR, else 0
- */
-int ehca_mr_is_maxmr(u64 size,
-                    u64 *iova_start)
-{
-       /* an MR is treated as a max-MR only if it meets the following conditions: */
-       if ((size == ehca_mr_len) &&
-           (iova_start == (void *)ehca_map_vaddr((void *)(KERNELBASE + PHYSICAL_START)))) {
-               ehca_gen_dbg("this is a max-MR");
-               return 1;
-       } else
-               return 0;
-} /* end ehca_mr_is_maxmr() */
-
-/*----------------------------------------------------------------------*/
-
-/* map access control for MR/MW. This routine is used for MR and MW. */
-void ehca_mrmw_map_acl(int ib_acl,
-                      u32 *hipz_acl)
-{
-       *hipz_acl = 0;
-       if (ib_acl & IB_ACCESS_REMOTE_READ)
-               *hipz_acl |= HIPZ_ACCESSCTRL_R_READ;
-       if (ib_acl & IB_ACCESS_REMOTE_WRITE)
-               *hipz_acl |= HIPZ_ACCESSCTRL_R_WRITE;
-       if (ib_acl & IB_ACCESS_REMOTE_ATOMIC)
-               *hipz_acl |= HIPZ_ACCESSCTRL_R_ATOMIC;
-       if (ib_acl & IB_ACCESS_LOCAL_WRITE)
-               *hipz_acl |= HIPZ_ACCESSCTRL_L_WRITE;
-       if (ib_acl & IB_ACCESS_MW_BIND)
-               *hipz_acl |= HIPZ_ACCESSCTRL_MW_BIND;
-} /* end ehca_mrmw_map_acl() */
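-/*
- * Example of the mapping above: an MR registered with
- * IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE gets
- * hipz_acl == HIPZ_ACCESSCTRL_L_WRITE | HIPZ_ACCESSCTRL_R_WRITE, before
- * ehca_mrmw_set_pgsize_hipz_acl() below shifts the encoded hw page size left
- * by 24 bits and ORs it in.
- */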
-
-/*----------------------------------------------------------------------*/
-
-/* sets page size in hipz access control for MR/MW. */
-void ehca_mrmw_set_pgsize_hipz_acl(u32 pgsize, u32 *hipz_acl) /*INOUT*/
-{
-       *hipz_acl |= (ehca_encode_hwpage_size(pgsize) << 24);
-} /* end ehca_mrmw_set_pgsize_hipz_acl() */
-
-/*----------------------------------------------------------------------*/
-
-/*
- * reverse map access control for MR/MW.
- * This routine is used for MR and MW.
- */
-void ehca_mrmw_reverse_map_acl(const u32 *hipz_acl,
-                              int *ib_acl) /*OUT*/
-{
-       *ib_acl = 0;
-       if (*hipz_acl & HIPZ_ACCESSCTRL_R_READ)
-               *ib_acl |= IB_ACCESS_REMOTE_READ;
-       if (*hipz_acl & HIPZ_ACCESSCTRL_R_WRITE)
-               *ib_acl |= IB_ACCESS_REMOTE_WRITE;
-       if (*hipz_acl & HIPZ_ACCESSCTRL_R_ATOMIC)
-               *ib_acl |= IB_ACCESS_REMOTE_ATOMIC;
-       if (*hipz_acl & HIPZ_ACCESSCTRL_L_WRITE)
-               *ib_acl |= IB_ACCESS_LOCAL_WRITE;
-       if (*hipz_acl & HIPZ_ACCESSCTRL_MW_BIND)
-               *ib_acl |= IB_ACCESS_MW_BIND;
-} /* end ehca_mrmw_reverse_map_acl() */
-
-
-/*----------------------------------------------------------------------*/
-
-/*
- * MR destructor and constructor;
- * used by the Reregister MR verb: sets all fields in struct ehca_mr to 0,
- * except struct ib_mr and the spinlock
- */
-void ehca_mr_deletenew(struct ehca_mr *mr)
-{
-       mr->flags = 0;
-       mr->num_kpages = 0;
-       mr->num_hwpages = 0;
-       mr->acl = 0;
-       mr->start = NULL;
-       mr->fmr_page_size = 0;
-       mr->fmr_max_pages = 0;
-       mr->fmr_max_maps = 0;
-       mr->fmr_map_cnt = 0;
-       memset(&mr->ipz_mr_handle, 0, sizeof(mr->ipz_mr_handle));
-       memset(&mr->galpas, 0, sizeof(mr->galpas));
-} /* end ehca_mr_deletenew() */
-
-int ehca_init_mrmw_cache(void)
-{
-       mr_cache = kmem_cache_create("ehca_cache_mr",
-                                    sizeof(struct ehca_mr), 0,
-                                    SLAB_HWCACHE_ALIGN,
-                                    NULL);
-       if (!mr_cache)
-               return -ENOMEM;
-       mw_cache = kmem_cache_create("ehca_cache_mw",
-                                    sizeof(struct ehca_mw), 0,
-                                    SLAB_HWCACHE_ALIGN,
-                                    NULL);
-       if (!mw_cache) {
-               kmem_cache_destroy(mr_cache);
-               mr_cache = NULL;
-               return -ENOMEM;
-       }
-       return 0;
-}
-
-void ehca_cleanup_mrmw_cache(void)
-{
-       if (mr_cache)
-               kmem_cache_destroy(mr_cache);
-       if (mw_cache)
-               kmem_cache_destroy(mw_cache);
-}
-
-static inline int ehca_init_top_bmap(struct ehca_top_bmap *ehca_top_bmap,
-                                    int dir)
-{
-       if (!ehca_bmap_valid(ehca_top_bmap->dir[dir])) {
-               ehca_top_bmap->dir[dir] =
-                       kmalloc(sizeof(struct ehca_dir_bmap), GFP_KERNEL);
-               if (!ehca_top_bmap->dir[dir])
-                       return -ENOMEM;
-               /* Set map block to 0xFF according to EHCA_INVAL_ADDR */
-               memset(ehca_top_bmap->dir[dir], 0xFF, EHCA_ENT_MAP_SIZE);
-       }
-       return 0;
-}
-
-static inline int ehca_init_bmap(struct ehca_bmap *ehca_bmap, int top, int dir)
-{
-       if (!ehca_bmap_valid(ehca_bmap->top[top])) {
-               ehca_bmap->top[top] =
-                       kmalloc(sizeof(struct ehca_top_bmap), GFP_KERNEL);
-               if (!ehca_bmap->top[top])
-                       return -ENOMEM;
-               /* Set map block to 0xFF according to EHCA_INVAL_ADDR */
-               memset(ehca_bmap->top[top], 0xFF, EHCA_DIR_MAP_SIZE);
-       }
-       return ehca_init_top_bmap(ehca_bmap->top[top], dir);
-}
-
-static inline int ehca_calc_index(unsigned long i, unsigned long s)
-{
-       return (i >> s) & EHCA_INDEX_MASK;
-}
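-/*
- * ehca_calc_index() extracts one level index from a linear section number
- * (assuming EHCA_INDEX_MASK == 0xff): shifting by EHCA_TOP_INDEX_SHIFT or
- * EHCA_DIR_INDEX_SHIFT yields the top or dir coordinate, and idx is simply
- * i & EHCA_INDEX_MASK, e.g. i = 0x010203 -> top 0x01, dir 0x02, idx 0x03.
- */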
-
-void ehca_destroy_busmap(void)
-{
-       int top, dir;
-
-       if (!ehca_bmap)
-               return;
-
-       for (top = 0; top < EHCA_MAP_ENTRIES; top++) {
-               if (!ehca_bmap_valid(ehca_bmap->top[top]))
-                       continue;
-               for (dir = 0; dir < EHCA_MAP_ENTRIES; dir++) {
-                       if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]))
-                               continue;
-
-                       kfree(ehca_bmap->top[top]->dir[dir]);
-               }
-
-               kfree(ehca_bmap->top[top]);
-       }
-
-       kfree(ehca_bmap);
-       ehca_bmap = NULL;
-}
-
-static int ehca_update_busmap(unsigned long pfn, unsigned long nr_pages)
-{
-       unsigned long i, start_section, end_section;
-       int top, dir, idx;
-
-       if (!nr_pages)
-               return 0;
-
-       if (!ehca_bmap) {
-               ehca_bmap = kmalloc(sizeof(struct ehca_bmap), GFP_KERNEL);
-               if (!ehca_bmap)
-                       return -ENOMEM;
-               /* Set map block to 0xFF according to EHCA_INVAL_ADDR */
-               memset(ehca_bmap, 0xFF, EHCA_TOP_MAP_SIZE);
-       }
-
-       start_section = (pfn * PAGE_SIZE) / EHCA_SECTSIZE;
-       end_section = ((pfn + nr_pages) * PAGE_SIZE) / EHCA_SECTSIZE;
-       for (i = start_section; i < end_section; i++) {
-               int ret;
-               top = ehca_calc_index(i, EHCA_TOP_INDEX_SHIFT);
-               dir = ehca_calc_index(i, EHCA_DIR_INDEX_SHIFT);
-               idx = i & EHCA_INDEX_MASK;
-
-               ret = ehca_init_bmap(ehca_bmap, top, dir);
-               if (ret) {
-                       ehca_destroy_busmap();
-                       return ret;
-               }
-               ehca_bmap->top[top]->dir[dir]->ent[idx] = ehca_mr_len;
-               ehca_mr_len += EHCA_SECTSIZE;
-       }
-       return 0;
-}
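-/*
- * Each RAM-backed section is assigned the next free EHCA_SECTSIZE-sized slot
- * in a flat, gap-free "bus" address space: ehca_mr_len grows by EHCA_SECTSIZE
- * per section and also becomes the total size of the internal max-MR.
- * ehca_map_vaddr() below translates a kernel virtual address into this
- * compact space with three table lookups.
- */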
-
-static int ehca_is_hugepage(unsigned long pfn)
-{
-       int page_order;
-
-       if (pfn & EHCA_HUGEPAGE_PFN_MASK)
-               return 0;
-
-       page_order = compound_order(pfn_to_page(pfn));
-       if (page_order + PAGE_SHIFT != EHCA_HUGEPAGESHIFT)
-               return 0;
-
-       return 1;
-}
-
-static int ehca_create_busmap_callback(unsigned long initial_pfn,
-                                      unsigned long total_nr_pages, void *arg)
-{
-       int ret;
-       unsigned long pfn, start_pfn, end_pfn, nr_pages;
-
-       if ((total_nr_pages * PAGE_SIZE) < EHCA_HUGEPAGE_SIZE)
-               return ehca_update_busmap(initial_pfn, total_nr_pages);
-
-       /* Given chunk is >= 16GB -> check for hugepages */
-       start_pfn = initial_pfn;
-       end_pfn = initial_pfn + total_nr_pages;
-       pfn = start_pfn;
-
-       while (pfn < end_pfn) {
-               if (ehca_is_hugepage(pfn)) {
-                       /* Add memory found before the hugepage */
-                       nr_pages = pfn - start_pfn;
-                       ret = ehca_update_busmap(start_pfn, nr_pages);
-                       if (ret)
-                               return ret;
-                       /* Skip the hugepage */
-                       pfn += (EHCA_HUGEPAGE_SIZE / PAGE_SIZE);
-                       start_pfn = pfn;
-               } else
-                       pfn += (EHCA_SECTSIZE / PAGE_SIZE);
-       }
-
-       /* Add memory found after the hugepage(s) */
-       nr_pages = pfn - start_pfn;
-       return ehca_update_busmap(start_pfn, nr_pages);
-}
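-/*
- * Hugepages are deliberately left out of the busmap: for chunks of at least
- * EHCA_HUGEPAGE_SIZE the walk above advances in EHCA_SECTSIZE steps, and
- * whenever a hugepage head is found the whole hugepage is skipped, so only
- * the normal-page ranges before and after it reach ehca_update_busmap().
- */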
-
-int ehca_create_busmap(void)
-{
-       int ret;
-
-       ehca_mr_len = 0;
-       ret = walk_system_ram_range(0, 1ULL << MAX_PHYSMEM_BITS, NULL,
-                                  ehca_create_busmap_callback);
-       return ret;
-}
-
-static int ehca_reg_bmap_mr_rpages(struct ehca_shca *shca,
-                                  struct ehca_mr *e_mr,
-                                  struct ehca_mr_pginfo *pginfo)
-{
-       int top;
-       u64 hret = H_SUCCESS; /* initialise: every top-level entry may be invalid */
-       u64 *kpage;
-
-       kpage = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
-       if (!kpage) {
-               ehca_err(&shca->ib_device, "kpage alloc failed");
-               return -ENOMEM;
-       }
-       for (top = 0; top < EHCA_MAP_ENTRIES; top++) {
-               if (!ehca_bmap_valid(ehca_bmap->top[top]))
-                       continue;
-               hret = ehca_reg_mr_dir_sections(top, kpage, shca, e_mr, pginfo);
-               if ((hret != H_PAGE_REGISTERED) && (hret != H_SUCCESS))
-                       break;
-       }
-
-       ehca_free_fw_ctrlblock(kpage);
-
-       if (hret == H_SUCCESS)
-               return 0; /* Everything is fine */
-       else {
-               ehca_err(&shca->ib_device, "ehca_reg_bmap_mr_rpages failed, "
-                                "h_ret=%lli e_mr=%p top=%x lkey=%x "
-                                "hca_hndl=%llx mr_hndl=%llx", hret, e_mr, top,
-                                e_mr->ib.ib_mr.lkey,
-                                shca->ipz_hca_handle.handle,
-                                e_mr->ipz_mr_handle.handle);
-               return ehca2ib_return_code(hret);
-       }
-}
-
-static u64 ehca_map_vaddr(void *caddr)
-{
-       int top, dir, idx;
-       unsigned long abs_addr, offset;
-       u64 entry;
-
-       if (!ehca_bmap)
-               return EHCA_INVAL_ADDR;
-
-       abs_addr = __pa(caddr);
-       top = ehca_calc_index(abs_addr, EHCA_TOP_INDEX_SHIFT + EHCA_SECTSHIFT);
-       if (!ehca_bmap_valid(ehca_bmap->top[top]))
-               return EHCA_INVAL_ADDR;
-
-       dir = ehca_calc_index(abs_addr, EHCA_DIR_INDEX_SHIFT + EHCA_SECTSHIFT);
-       if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]))
-               return EHCA_INVAL_ADDR;
-
-       idx = ehca_calc_index(abs_addr, EHCA_SECTSHIFT);
-
-       entry = ehca_bmap->top[top]->dir[dir]->ent[idx];
-       if (ehca_bmap_valid(entry)) {
-               offset = (unsigned long)caddr & (EHCA_SECTSIZE - 1);
-               return entry | offset;
-       } else
-               return EHCA_INVAL_ADDR;
-}
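-/*
- * Translation sketch for ehca_map_vaddr(), assuming 16 MB sections: the
- * physical address of caddr selects top/dir/idx, the bmap entry holds the
- * section's base in the compact bus space, and the low 24 bits of caddr are
- * kept as the offset; e.g. a caddr 5 KB into its section maps to
- * (entry | 0x1400).
- */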
-
-static int ehca_dma_mapping_error(struct ib_device *dev, u64 dma_addr)
-{
-       return dma_addr == EHCA_INVAL_ADDR;
-}
-
-static u64 ehca_dma_map_single(struct ib_device *dev, void *cpu_addr,
-                              size_t size, enum dma_data_direction direction)
-{
-       if (cpu_addr)
-               return ehca_map_vaddr(cpu_addr);
-       else
-               return EHCA_INVAL_ADDR;
-}
-
-static void ehca_dma_unmap_single(struct ib_device *dev, u64 addr, size_t size,
-                                 enum dma_data_direction direction)
-{
-       /* This is only a stub; nothing to be done here */
-}
-
-static u64 ehca_dma_map_page(struct ib_device *dev, struct page *page,
-                            unsigned long offset, size_t size,
-                            enum dma_data_direction direction)
-{
-       u64 addr;
-
-       if (offset + size > PAGE_SIZE)
-               return EHCA_INVAL_ADDR;
-
-       addr = ehca_map_vaddr(page_address(page));
-       if (!ehca_dma_mapping_error(dev, addr))
-               addr += offset;
-
-       return addr;
-}
-
-static void ehca_dma_unmap_page(struct ib_device *dev, u64 addr, size_t size,
-                               enum dma_data_direction direction)
-{
-       /* This is only a stub; nothing to be done here */
-}
-
-static int ehca_dma_map_sg(struct ib_device *dev, struct scatterlist *sgl,
-                          int nents, enum dma_data_direction direction)
-{
-       struct scatterlist *sg;
-       int i;
-
-       for_each_sg(sgl, sg, nents, i) {
-               u64 addr;
-               addr = ehca_map_vaddr(sg_virt(sg));
-               if (ehca_dma_mapping_error(dev, addr))
-                       return 0;
-
-               sg->dma_address = addr;
-               sg->dma_length = sg->length;
-       }
-       return nents;
-}
-
-static void ehca_dma_unmap_sg(struct ib_device *dev, struct scatterlist *sg,
-                             int nents, enum dma_data_direction direction)
-{
-       /* This is only a stub; nothing to be done here */
-}
-
-static void ehca_dma_sync_single_for_cpu(struct ib_device *dev, u64 addr,
-                                        size_t size,
-                                        enum dma_data_direction dir)
-{
-       dma_sync_single_for_cpu(dev->dma_device, addr, size, dir);
-}
-
-static void ehca_dma_sync_single_for_device(struct ib_device *dev, u64 addr,
-                                           size_t size,
-                                           enum dma_data_direction dir)
-{
-       dma_sync_single_for_device(dev->dma_device, addr, size, dir);
-}
-
-static void *ehca_dma_alloc_coherent(struct ib_device *dev, size_t size,
-                                    u64 *dma_handle, gfp_t flag)
-{
-       struct page *p;
-       void *addr = NULL;
-       u64 dma_addr;
-
-       p = alloc_pages(flag, get_order(size));
-       if (p) {
-               addr = page_address(p);
-               dma_addr = ehca_map_vaddr(addr);
-               if (ehca_dma_mapping_error(dev, dma_addr)) {
-                       free_pages((unsigned long)addr, get_order(size));
-                       return NULL;
-               }
-               if (dma_handle)
-                       *dma_handle = dma_addr;
-               return addr;
-       }
-       return NULL;
-}
-
-static void ehca_dma_free_coherent(struct ib_device *dev, size_t size,
-                                  void *cpu_addr, u64 dma_handle)
-{
-       if (cpu_addr && size)
-               free_pages((unsigned long)cpu_addr, get_order(size));
-}
-
-
-struct ib_dma_mapping_ops ehca_dma_mapping_ops = {
-       .mapping_error          = ehca_dma_mapping_error,
-       .map_single             = ehca_dma_map_single,
-       .unmap_single           = ehca_dma_unmap_single,
-       .map_page               = ehca_dma_map_page,
-       .unmap_page             = ehca_dma_unmap_page,
-       .map_sg                 = ehca_dma_map_sg,
-       .unmap_sg               = ehca_dma_unmap_sg,
-       .sync_single_for_cpu    = ehca_dma_sync_single_for_cpu,
-       .sync_single_for_device = ehca_dma_sync_single_for_device,
-       .alloc_coherent         = ehca_dma_alloc_coherent,
-       .free_coherent          = ehca_dma_free_coherent,
-};
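-/*
- * With these ops, ib_dma_* calls on an ehca device resolve through the
- * busmap rather than the platform DMA API: the "DMA addresses" returned by
- * map_single/map_page/map_sg are the compact bus-space addresses produced by
- * ehca_map_vaddr(), the unmap callbacks are empty stubs because no mapping
- * state is created, and only the sync callbacks forward to the generic
- * dma_sync_single_* helpers.
- */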
diff --git a/drivers/infiniband/hw/ehca/ehca_mrmw.h b/drivers/infiniband/hw/ehca/ehca_mrmw.h
deleted file mode 100644 (file)
index 50d8b51..0000000
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  MR/MW declarations and inline functions
- *
- *  Authors: Dietmar Decker <ddecker@de.ibm.com>
- *           Christoph Raisch <raisch@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef _EHCA_MRMW_H_
-#define _EHCA_MRMW_H_
-
-enum ehca_reg_type {
-       EHCA_REG_MR,
-       EHCA_REG_BUSMAP_MR
-};
-
-int ehca_reg_mr(struct ehca_shca *shca,
-               struct ehca_mr *e_mr,
-               u64 *iova_start,
-               u64 size,
-               int acl,
-               struct ehca_pd *e_pd,
-               struct ehca_mr_pginfo *pginfo,
-               u32 *lkey,
-               u32 *rkey,
-               enum ehca_reg_type reg_type);
-
-int ehca_reg_mr_rpages(struct ehca_shca *shca,
-                      struct ehca_mr *e_mr,
-                      struct ehca_mr_pginfo *pginfo);
-
-int ehca_rereg_mr(struct ehca_shca *shca,
-                 struct ehca_mr *e_mr,
-                 u64 *iova_start,
-                 u64 size,
-                 int mr_access_flags,
-                 struct ehca_pd *e_pd,
-                 struct ehca_mr_pginfo *pginfo,
-                 u32 *lkey,
-                 u32 *rkey);
-
-int ehca_unmap_one_fmr(struct ehca_shca *shca,
-                      struct ehca_mr *e_fmr);
-
-int ehca_reg_smr(struct ehca_shca *shca,
-                struct ehca_mr *e_origmr,
-                struct ehca_mr *e_newmr,
-                u64 *iova_start,
-                int acl,
-                struct ehca_pd *e_pd,
-                u32 *lkey,
-                u32 *rkey);
-
-int ehca_reg_internal_maxmr(struct ehca_shca *shca,
-                           struct ehca_pd *e_pd,
-                           struct ehca_mr **maxmr);
-
-int ehca_reg_maxmr(struct ehca_shca *shca,
-                  struct ehca_mr *e_newmr,
-                  u64 *iova_start,
-                  int acl,
-                  struct ehca_pd *e_pd,
-                  u32 *lkey,
-                  u32 *rkey);
-
-int ehca_dereg_internal_maxmr(struct ehca_shca *shca);
-
-int ehca_mr_chk_buf_and_calc_size(struct ib_phys_buf *phys_buf_array,
-                                 int num_phys_buf,
-                                 u64 *iova_start,
-                                 u64 *size);
-
-int ehca_fmr_check_page_list(struct ehca_mr *e_fmr,
-                            u64 *page_list,
-                            int list_len);
-
-int ehca_set_pagebuf(struct ehca_mr_pginfo *pginfo,
-                    u32 number,
-                    u64 *kpage);
-
-int ehca_mr_is_maxmr(u64 size,
-                    u64 *iova_start);
-
-void ehca_mrmw_map_acl(int ib_acl,
-                      u32 *hipz_acl);
-
-void ehca_mrmw_set_pgsize_hipz_acl(u32 pgsize, u32 *hipz_acl);
-
-void ehca_mrmw_reverse_map_acl(const u32 *hipz_acl,
-                              int *ib_acl);
-
-void ehca_mr_deletenew(struct ehca_mr *mr);
-
-int ehca_create_busmap(void);
-
-void ehca_destroy_busmap(void);
-
-extern struct ib_dma_mapping_ops ehca_dma_mapping_ops;
-#endif  /*_EHCA_MRMW_H_*/
diff --git a/drivers/infiniband/hw/ehca/ehca_pd.c b/drivers/infiniband/hw/ehca/ehca_pd.c
deleted file mode 100644 (file)
index 351577a..0000000
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  PD functions
- *
- *  Authors: Christoph Raisch <raisch@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/slab.h>
-
-#include "ehca_tools.h"
-#include "ehca_iverbs.h"
-
-static struct kmem_cache *pd_cache;
-
-struct ib_pd *ehca_alloc_pd(struct ib_device *device,
-                           struct ib_ucontext *context, struct ib_udata *udata)
-{
-       struct ehca_pd *pd;
-       int i;
-
-       pd = kmem_cache_zalloc(pd_cache, GFP_KERNEL);
-       if (!pd) {
-               ehca_err(device, "device=%p context=%p out of memory",
-                        device, context);
-               return ERR_PTR(-ENOMEM);
-       }
-
-       for (i = 0; i < 2; i++) {
-               INIT_LIST_HEAD(&pd->free[i]);
-               INIT_LIST_HEAD(&pd->full[i]);
-       }
-       mutex_init(&pd->lock);
-
-       /*
-        * Kernel PD: context == NULL
-        * User   PD: context != NULL
-        */
-       if (!context) {
-               /*
-                * After init, kernel PDs always reuse
-                * the one created in ehca_shca_reopen()
-                */
-               struct ehca_shca *shca = container_of(device, struct ehca_shca,
-                                                     ib_device);
-               pd->fw_pd.value = shca->pd->fw_pd.value;
-       } else
-               pd->fw_pd.value = (u64)pd;
-
-       return &pd->ib_pd;
-}
-
-int ehca_dealloc_pd(struct ib_pd *pd)
-{
-       struct ehca_pd *my_pd = container_of(pd, struct ehca_pd, ib_pd);
-       int i, leftovers = 0;
-       struct ipz_small_queue_page *page, *tmp;
-
-       for (i = 0; i < 2; i++) {
-               list_splice(&my_pd->full[i], &my_pd->free[i]);
-               list_for_each_entry_safe(page, tmp, &my_pd->free[i], list) {
-                       leftovers = 1;
-                       free_page(page->page);
-                       kmem_cache_free(small_qp_cache, page);
-               }
-       }
-
-       if (leftovers)
-               ehca_warn(pd->device,
-                         "Some small queue pages were not freed");
-
-       kmem_cache_free(pd_cache, my_pd);
-
-       return 0;
-}
-
-int ehca_init_pd_cache(void)
-{
-       pd_cache = kmem_cache_create("ehca_cache_pd",
-                                    sizeof(struct ehca_pd), 0,
-                                    SLAB_HWCACHE_ALIGN,
-                                    NULL);
-       if (!pd_cache)
-               return -ENOMEM;
-       return 0;
-}
-
-void ehca_cleanup_pd_cache(void)
-{
-       if (pd_cache)
-               kmem_cache_destroy(pd_cache);
-}
diff --git a/drivers/infiniband/hw/ehca/ehca_qes.h b/drivers/infiniband/hw/ehca/ehca_qes.h
deleted file mode 100644 (file)
index 90c4efa..0000000
+++ /dev/null
@@ -1,260 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  Hardware request structures
- *
- *  Authors: Waleri Fomin <fomin@de.ibm.com>
- *           Reinhard Ernst <rernst@de.ibm.com>
- *           Christoph Raisch <raisch@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-
-#ifndef _EHCA_QES_H_
-#define _EHCA_QES_H_
-
-#include "ehca_tools.h"
-
-/* virtual scatter gather entry to specify remote addresses with length */
-struct ehca_vsgentry {
-       u64 vaddr;
-       u32 lkey;
-       u32 length;
-};
-
-#define GRH_FLAG_MASK        EHCA_BMASK_IBM( 7,  7)
-#define GRH_IPVERSION_MASK   EHCA_BMASK_IBM( 0,  3)
-#define GRH_TCLASS_MASK      EHCA_BMASK_IBM( 4, 12)
-#define GRH_FLOWLABEL_MASK   EHCA_BMASK_IBM(13, 31)
-#define GRH_PAYLEN_MASK      EHCA_BMASK_IBM(32, 47)
-#define GRH_NEXTHEADER_MASK  EHCA_BMASK_IBM(48, 55)
-#define GRH_HOPLIMIT_MASK    EHCA_BMASK_IBM(56, 63)
-
-/*
- * Unreliable Datagram Address Vector Format
- * see IBTA Vol1 chapter 8.3 Global Routing Header
- */
-struct ehca_ud_av {
-       u8 sl;
-       u8 lnh;
-       u16 dlid;
-       u8 reserved1;
-       u8 reserved2;
-       u8 reserved3;
-       u8 slid_path_bits;
-       u8 reserved4;
-       u8 ipd;
-       u8 reserved5;
-       u8 pmtu;
-       u32 reserved6;
-       u64 reserved7;
-       union {
-               struct {
-                       u64 word_0; /* always set to 6  */
-                       /*should be 0x1B for IB transport */
-                       u64 word_1;
-                       u64 word_2;
-                       u64 word_3;
-                       u64 word_4;
-               } grh;
-               struct {
-                       u32 wd_0;
-                       u32 wd_1;
-                       /* DWord_1 --> SGID */
-
-                       u32 sgid_wd3;
-                       u32 sgid_wd2;
-
-                       u32 sgid_wd1;
-                       u32 sgid_wd0;
-                       /* DWord_3 --> DGID */
-
-                       u32 dgid_wd3;
-                       u32 dgid_wd2;
-
-                       u32 dgid_wd1;
-                       u32 dgid_wd0;
-               } grh_l;
-       };
-};
-
-/* maximum number of sg entries allowed in a WQE */
-#define MAX_WQE_SG_ENTRIES 252
-
-#define WQE_OPTYPE_SEND             0x80
-#define WQE_OPTYPE_RDMAREAD         0x40
-#define WQE_OPTYPE_RDMAWRITE        0x20
-#define WQE_OPTYPE_CMPSWAP          0x10
-#define WQE_OPTYPE_FETCHADD         0x08
-#define WQE_OPTYPE_BIND             0x04
-
-#define WQE_WRFLAG_REQ_SIGNAL_COM   0x80
-#define WQE_WRFLAG_FENCE            0x40
-#define WQE_WRFLAG_IMM_DATA_PRESENT 0x20
-#define WQE_WRFLAG_SOLIC_EVENT      0x10
-
-#define WQEF_CACHE_HINT             0x80
-#define WQEF_CACHE_HINT_RD_WR       0x40
-#define WQEF_TIMED_WQE              0x20
-#define WQEF_PURGE                  0x08
-#define WQEF_HIGH_NIBBLE            0xF0
-
-#define MW_BIND_ACCESSCTRL_R_WRITE   0x40
-#define MW_BIND_ACCESSCTRL_R_READ    0x20
-#define MW_BIND_ACCESSCTRL_R_ATOMIC  0x10
-
-struct ehca_wqe {
-       u64 work_request_id;
-       u8 optype;
-       u8 wr_flag;
-       u16 pkeyi;
-       u8 wqef;
-       u8 nr_of_data_seg;
-       u16 wqe_provided_slid;
-       u32 destination_qp_number;
-       u32 resync_psn_sqp;
-       u32 local_ee_context_qkey;
-       u32 immediate_data;
-       union {
-               struct {
-                       u64 remote_virtual_address;
-                       u32 rkey;
-                       u32 reserved;
-                       u64 atomic_1st_op_dma_len;
-                       u64 atomic_2nd_op;
-                       struct ehca_vsgentry sg_list[MAX_WQE_SG_ENTRIES];
-
-               } nud;
-               struct {
-                       u64 ehca_ud_av_ptr;
-                       u64 reserved1;
-                       u64 reserved2;
-                       u64 reserved3;
-                       struct ehca_vsgentry sg_list[MAX_WQE_SG_ENTRIES];
-               } ud_avp;
-               struct {
-                       struct ehca_ud_av ud_av;
-                       struct ehca_vsgentry sg_list[MAX_WQE_SG_ENTRIES -
-                                                    2];
-               } ud_av;
-               struct {
-                       u64 reserved0;
-                       u64 reserved1;
-                       u64 reserved2;
-                       u64 reserved3;
-                       struct ehca_vsgentry sg_list[MAX_WQE_SG_ENTRIES];
-               } all_rcv;
-
-               struct {
-                       u64 reserved;
-                       u32 rkey;
-                       u32 old_rkey;
-                       u64 reserved1;
-                       u64 reserved2;
-                       u64 virtual_address;
-                       u32 reserved3;
-                       u32 length;
-                       u32 reserved4;
-                       u16 reserved5;
-                       u8 reserved6;
-                       u8 lr_ctl;
-                       u32 lkey;
-                       u32 reserved7;
-                       u64 reserved8;
-                       u64 reserved9;
-                       u64 reserved10;
-                       u64 reserved11;
-               } bind;
-               struct {
-                       u64 reserved12;
-                       u64 reserved13;
-                       u32 size;
-                       u32 start;
-               } inline_data;
-       } u;
-
-};
-
-#define WC_SEND_RECEIVE EHCA_BMASK_IBM(0, 0)
-#define WC_IMM_DATA     EHCA_BMASK_IBM(1, 1)
-#define WC_GRH_PRESENT  EHCA_BMASK_IBM(2, 2)
-#define WC_SE_BIT       EHCA_BMASK_IBM(3, 3)
-#define WC_STATUS_ERROR_BIT 0x80000000
-#define WC_STATUS_REMOTE_ERROR_FLAGS 0x0000F800
-#define WC_STATUS_PURGE_BIT 0x10
-#define WC_SEND_RECEIVE_BIT 0x80
-
-struct ehca_cqe {
-       u64 work_request_id;
-       u8 optype;
-       u8 w_completion_flags;
-       u16 reserved1;
-       u32 nr_bytes_transferred;
-       u32 immediate_data;
-       u32 local_qp_number;
-       u8 freed_resource_count;
-       u8 service_level;
-       u16 wqe_count;
-       u32 qp_token;
-       u32 qkey_ee_token;
-       u32 remote_qp_number;
-       u16 dlid;
-       u16 rlid;
-       u16 reserved2;
-       u16 pkey_index;
-       u32 cqe_timestamp;
-       u32 wqe_timestamp;
-       u8 wqe_timestamp_valid;
-       u8 reserved3;
-       u8 reserved4;
-       u8 cqe_flags;
-       u32 status;
-};
-
-struct ehca_eqe {
-       u64 entry;
-};
-
-struct ehca_mrte {
-       u64 starting_va;
-       u64 length; /* length of memory region in bytes*/
-       u32 pd;
-       u8 key_instance;
-       u8 pagesize;
-       u8 mr_control;
-       u8 local_remote_access_ctrl;
-       u8 reserved[0x20 - 0x18];
-       u64 at_pointer[4];
-};
-#endif /*_EHCA_QES_H_*/
diff --git a/drivers/infiniband/hw/ehca/ehca_qp.c b/drivers/infiniband/hw/ehca/ehca_qp.c
deleted file mode 100644 (file)
index 2e89356..0000000
+++ /dev/null
@@ -1,2257 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  QP functions
- *
- *  Authors: Joachim Fenkes <fenkes@de.ibm.com>
- *           Stefan Roscher <stefan.roscher@de.ibm.com>
- *           Waleri Fomin <fomin@de.ibm.com>
- *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *           Reinhard Ernst <rernst@de.ibm.com>
- *           Heiko J Schick <schickhj@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/slab.h>
-
-#include "ehca_classes.h"
-#include "ehca_tools.h"
-#include "ehca_qes.h"
-#include "ehca_iverbs.h"
-#include "hcp_if.h"
-#include "hipz_fns.h"
-
-static struct kmem_cache *qp_cache;
-
-/*
- * attributes not supported by query qp
- */
-#define QP_ATTR_QUERY_NOT_SUPPORTED (IB_QP_ACCESS_FLAGS       | \
-                                    IB_QP_EN_SQD_ASYNC_NOTIFY)
-
-/*
- * ehca (internal) qp state values
- */
-enum ehca_qp_state {
-       EHCA_QPS_RESET = 1,
-       EHCA_QPS_INIT = 2,
-       EHCA_QPS_RTR = 3,
-       EHCA_QPS_RTS = 5,
-       EHCA_QPS_SQD = 6,
-       EHCA_QPS_SQE = 8,
-       EHCA_QPS_ERR = 128
-};
-
-/*
- * qp state transitions as defined by IB Arch Rel 1.1 page 431
- */
-enum ib_qp_statetrans {
-       IB_QPST_ANY2RESET,
-       IB_QPST_ANY2ERR,
-       IB_QPST_RESET2INIT,
-       IB_QPST_INIT2RTR,
-       IB_QPST_INIT2INIT,
-       IB_QPST_RTR2RTS,
-       IB_QPST_RTS2SQD,
-       IB_QPST_RTS2RTS,
-       IB_QPST_SQD2RTS,
-       IB_QPST_SQE2RTS,
-       IB_QPST_SQD2SQD,
-       IB_QPST_MAX     /* nr of transitions, this must be last!!! */
-};
-
-/*
- * ib2ehca_qp_state maps IB to ehca qp_state
- * returns ehca qp state corresponding to given ib qp state
- */
-static inline enum ehca_qp_state ib2ehca_qp_state(enum ib_qp_state ib_qp_state)
-{
-       switch (ib_qp_state) {
-       case IB_QPS_RESET:
-               return EHCA_QPS_RESET;
-       case IB_QPS_INIT:
-               return EHCA_QPS_INIT;
-       case IB_QPS_RTR:
-               return EHCA_QPS_RTR;
-       case IB_QPS_RTS:
-               return EHCA_QPS_RTS;
-       case IB_QPS_SQD:
-               return EHCA_QPS_SQD;
-       case IB_QPS_SQE:
-               return EHCA_QPS_SQE;
-       case IB_QPS_ERR:
-               return EHCA_QPS_ERR;
-       default:
-               ehca_gen_err("invalid ib_qp_state=%x", ib_qp_state);
-               return -EINVAL;
-       }
-}
-
-/*
- * ehca2ib_qp_state maps ehca to IB qp_state
- * returns ib qp state corresponding to given ehca qp state
- */
-static inline enum ib_qp_state ehca2ib_qp_state(enum ehca_qp_state
-                                               ehca_qp_state)
-{
-       switch (ehca_qp_state) {
-       case EHCA_QPS_RESET:
-               return IB_QPS_RESET;
-       case EHCA_QPS_INIT:
-               return IB_QPS_INIT;
-       case EHCA_QPS_RTR:
-               return IB_QPS_RTR;
-       case EHCA_QPS_RTS:
-               return IB_QPS_RTS;
-       case EHCA_QPS_SQD:
-               return IB_QPS_SQD;
-       case EHCA_QPS_SQE:
-               return IB_QPS_SQE;
-       case EHCA_QPS_ERR:
-               return IB_QPS_ERR;
-       default:
-               ehca_gen_err("invalid ehca_qp_state=%x", ehca_qp_state);
-               return -EINVAL;
-       }
-}
-
-/*
- * ehca_qp_type used as index for req_attr and opt_attr of
- * struct ehca_modqp_statetrans
- */
-enum ehca_qp_type {
-       QPT_RC = 0,
-       QPT_UC = 1,
-       QPT_UD = 2,
-       QPT_SQP = 3,
-       QPT_MAX
-};
-
-/*
- * ib2ehcaqptype maps IB to ehca qp_type
- * returns ehca qp type corresponding to ib qp type
- */
-static inline enum ehca_qp_type ib2ehcaqptype(enum ib_qp_type ibqptype)
-{
-       switch (ibqptype) {
-       case IB_QPT_SMI:
-       case IB_QPT_GSI:
-               return QPT_SQP;
-       case IB_QPT_RC:
-               return QPT_RC;
-       case IB_QPT_UC:
-               return QPT_UC;
-       case IB_QPT_UD:
-               return QPT_UD;
-       default:
-               ehca_gen_err("Invalid ibqptype=%x", ibqptype);
-               return -EINVAL;
-       }
-}
-
-static inline enum ib_qp_statetrans get_modqp_statetrans(int ib_fromstate,
-                                                        int ib_tostate)
-{
-       int index = -EINVAL;
-       switch (ib_tostate) {
-       case IB_QPS_RESET:
-               index = IB_QPST_ANY2RESET;
-               break;
-       case IB_QPS_INIT:
-               switch (ib_fromstate) {
-               case IB_QPS_RESET:
-                       index = IB_QPST_RESET2INIT;
-                       break;
-               case IB_QPS_INIT:
-                       index = IB_QPST_INIT2INIT;
-                       break;
-               }
-               break;
-       case IB_QPS_RTR:
-               if (ib_fromstate == IB_QPS_INIT)
-                       index = IB_QPST_INIT2RTR;
-               break;
-       case IB_QPS_RTS:
-               switch (ib_fromstate) {
-               case IB_QPS_RTR:
-                       index = IB_QPST_RTR2RTS;
-                       break;
-               case IB_QPS_RTS:
-                       index = IB_QPST_RTS2RTS;
-                       break;
-               case IB_QPS_SQD:
-                       index = IB_QPST_SQD2RTS;
-                       break;
-               case IB_QPS_SQE:
-                       index = IB_QPST_SQE2RTS;
-                       break;
-               }
-               break;
-       case IB_QPS_SQD:
-               if (ib_fromstate == IB_QPS_RTS)
-                       index = IB_QPST_RTS2SQD;
-               break;
-       case IB_QPS_SQE:
-               break;
-       case IB_QPS_ERR:
-               index = IB_QPST_ANY2ERR;
-               break;
-       default:
-               break;
-       }
-       return index;
-}
-
-/*
- * ibqptype2servicetype returns hcp service type corresponding to given
- * ib qp type used by create_qp()
- */
-static inline int ibqptype2servicetype(enum ib_qp_type ibqptype)
-{
-       switch (ibqptype) {
-       case IB_QPT_SMI:
-       case IB_QPT_GSI:
-               return ST_UD;
-       case IB_QPT_RC:
-               return ST_RC;
-       case IB_QPT_UC:
-               return ST_UC;
-       case IB_QPT_UD:
-               return ST_UD;
-       case IB_QPT_RAW_IPV6:
-               return -EINVAL;
-       case IB_QPT_RAW_ETHERTYPE:
-               return -EINVAL;
-       default:
-               ehca_gen_err("Invalid ibqptype=%x", ibqptype);
-               return -EINVAL;
-       }
-}
-
-/*
- * init userspace queue info from ipz_queue data
- */
-static inline void queue2resp(struct ipzu_queue_resp *resp,
-                             struct ipz_queue *queue)
-{
-       resp->qe_size = queue->qe_size;
-       resp->act_nr_of_sg = queue->act_nr_of_sg;
-       resp->queue_length = queue->queue_length;
-       resp->pagesize = queue->pagesize;
-       resp->toggle_state = queue->toggle_state;
-       resp->offset = queue->offset;
-}
-
-/*
- * init_qp_queue initializes/constructs r/squeue and registers queue pages.
- */
-static inline int init_qp_queue(struct ehca_shca *shca,
-                               struct ehca_pd *pd,
-                               struct ehca_qp *my_qp,
-                               struct ipz_queue *queue,
-                               int q_type,
-                               u64 expected_hret,
-                               struct ehca_alloc_queue_parms *parms,
-                               int wqe_size)
-{
-       int ret, cnt, ipz_rc, nr_q_pages;
-       void *vpage;
-       u64 rpage, h_ret;
-       struct ib_device *ib_dev = &shca->ib_device;
-       struct ipz_adapter_handle ipz_hca_handle = shca->ipz_hca_handle;
-
-       if (!parms->queue_size)
-               return 0;
-
-       if (parms->is_small) {
-               nr_q_pages = 1;
-               ipz_rc = ipz_queue_ctor(pd, queue, nr_q_pages,
-                                       128 << parms->page_size,
-                                       wqe_size, parms->act_nr_sges, 1);
-       } else {
-               nr_q_pages = parms->queue_size;
-               ipz_rc = ipz_queue_ctor(pd, queue, nr_q_pages,
-                                       EHCA_PAGESIZE, wqe_size,
-                                       parms->act_nr_sges, 0);
-       }
-
-       if (!ipz_rc) {
-               ehca_err(ib_dev, "Cannot allocate page for queue. ipz_rc=%i",
-                        ipz_rc);
-               return -EBUSY;
-       }
-
-       /* register queue pages */
-       for (cnt = 0; cnt < nr_q_pages; cnt++) {
-               vpage = ipz_qpageit_get_inc(queue);
-               if (!vpage) {
-                       ehca_err(ib_dev, "ipz_qpageit_get_inc() "
-                                "failed p_vpage= %p", vpage);
-                       ret = -EINVAL;
-                       goto init_qp_queue1;
-               }
-               rpage = __pa(vpage);
-
-               h_ret = hipz_h_register_rpage_qp(ipz_hca_handle,
-                                                my_qp->ipz_qp_handle,
-                                                NULL, 0, q_type,
-                                                rpage, parms->is_small ? 0 : 1,
-                                                my_qp->galpas.kernel);
-               if (cnt == (nr_q_pages - 1)) {  /* last page! */
-                       if (h_ret != expected_hret) {
-                               ehca_err(ib_dev, "hipz_qp_register_rpage() "
-                                        "h_ret=%lli", h_ret);
-                               ret = ehca2ib_return_code(h_ret);
-                               goto init_qp_queue1;
-                       }
-                       vpage = ipz_qpageit_get_inc(&my_qp->ipz_rqueue);
-                       if (vpage) {
-                               ehca_err(ib_dev, "ipz_qpageit_get_inc() "
-                                        "should not succeed vpage=%p", vpage);
-                               ret = -EINVAL;
-                               goto init_qp_queue1;
-                       }
-               } else {
-                       if (h_ret != H_PAGE_REGISTERED) {
-                               ehca_err(ib_dev, "hipz_qp_register_rpage() "
-                                        "h_ret=%lli", h_ret);
-                               ret = ehca2ib_return_code(h_ret);
-                               goto init_qp_queue1;
-                       }
-               }
-       }
-
-       ipz_qeit_reset(queue);
-
-       return 0;
-
-init_qp_queue1:
-       ipz_queue_dtor(pd, queue);
-       return ret;
-}
-
-static inline int ehca_calc_wqe_size(int act_nr_sge, int is_llqp)
-{
-       if (is_llqp)
-               return 128 << act_nr_sge;
-       else
-               return offsetof(struct ehca_wqe,
-                               u.nud.sg_list[act_nr_sge]);
-}
-
-static void ehca_determine_small_queue(struct ehca_alloc_queue_parms *queue,
-                                      int req_nr_sge, int is_llqp)
-{
-       u32 wqe_size, q_size;
-       int act_nr_sge = req_nr_sge;
-
-       if (!is_llqp)
-               /* round up #SGEs so WQE size is a power of 2 */
-               for (act_nr_sge = 4; act_nr_sge <= 252;
-                    act_nr_sge = 4 + 2 * act_nr_sge)
-                       if (act_nr_sge >= req_nr_sge)
-                               break;
-
-       wqe_size = ehca_calc_wqe_size(act_nr_sge, is_llqp);
-       q_size = wqe_size * (queue->max_wr + 1);
-
-       if (q_size <= 512)
-               queue->page_size = 2;
-       else if (q_size <= 1024)
-               queue->page_size = 3;
-       else
-               queue->page_size = 0;
-
-       queue->is_small = (queue->page_size != 0);
-}
-
-/* needs to be called with cq->spinlock held */
-void ehca_add_to_err_list(struct ehca_qp *qp, int on_sq)
-{
-       struct list_head *list, *node;
-
-       /* TODO: support low latency QPs */
-       if (qp->ext_type == EQPT_LLQP)
-               return;
-
-       if (on_sq) {
-               list = &qp->send_cq->sqp_err_list;
-               node = &qp->sq_err_node;
-       } else {
-               list = &qp->recv_cq->rqp_err_list;
-               node = &qp->rq_err_node;
-       }
-
-       if (list_empty(node))
-               list_add_tail(node, list);
-
-       return;
-}
-
-static void del_from_err_list(struct ehca_cq *cq, struct list_head *node)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&cq->spinlock, flags);
-
-       if (!list_empty(node))
-               list_del_init(node);
-
-       spin_unlock_irqrestore(&cq->spinlock, flags);
-}
-
-static void reset_queue_map(struct ehca_queue_map *qmap)
-{
-       int i;
-
-       qmap->tail = qmap->entries - 1;
-       qmap->left_to_poll = 0;
-       qmap->next_wqe_idx = 0;
-       for (i = 0; i < qmap->entries; i++) {
-               qmap->map[i].reported = 1;
-               qmap->map[i].cqe_req = 0;
-       }
-}
-
-/*
- * Create an ib_qp struct that is either a QP or an SRQ, depending on
- * the value of the is_srq parameter. If init_attr and srq_init_attr share
- * a field, the value from init_attr is used.
- */
-static struct ehca_qp *internal_create_qp(
-       struct ib_pd *pd,
-       struct ib_qp_init_attr *init_attr,
-       struct ib_srq_init_attr *srq_init_attr,
-       struct ib_udata *udata, int is_srq)
-{
-       struct ehca_qp *my_qp, *my_srq = NULL;
-       struct ehca_pd *my_pd = container_of(pd, struct ehca_pd, ib_pd);
-       struct ehca_shca *shca = container_of(pd->device, struct ehca_shca,
-                                             ib_device);
-       struct ib_ucontext *context = NULL;
-       u64 h_ret;
-       int is_llqp = 0, has_srq = 0, is_user = 0;
-       int qp_type, max_send_sge, max_recv_sge, ret;
-
-       /* h_call's out parameters */
-       struct ehca_alloc_qp_parms parms;
-       u32 swqe_size = 0, rwqe_size = 0, ib_qp_num;
-       unsigned long flags;
-
-       if (!atomic_add_unless(&shca->num_qps, 1, shca->max_num_qps)) {
-               ehca_err(pd->device, "Unable to create QP, max number of %i "
-                        "QPs reached.", shca->max_num_qps);
-               ehca_err(pd->device, "To increase the maximum number of QPs "
-                        "use the number_of_qps module parameter.\n");
-               return ERR_PTR(-ENOSPC);
-       }
-
-       if (init_attr->create_flags) {
-               atomic_dec(&shca->num_qps);
-               return ERR_PTR(-EINVAL);
-       }
-
-       memset(&parms, 0, sizeof(parms));
-       qp_type = init_attr->qp_type;
-
-       if (init_attr->sq_sig_type != IB_SIGNAL_REQ_WR &&
-               init_attr->sq_sig_type != IB_SIGNAL_ALL_WR) {
-               ehca_err(pd->device, "init_attr->sq_sig_type=%x not allowed",
-                        init_attr->sq_sig_type);
-               atomic_dec(&shca->num_qps);
-               return ERR_PTR(-EINVAL);
-       }
-
-       /* save LLQP info */
-       if (qp_type & 0x80) {
-               is_llqp = 1;
-               parms.ext_type = EQPT_LLQP;
-               parms.ll_comp_flags = qp_type & LLQP_COMP_MASK;
-       }
-       qp_type &= 0x1F;
-       init_attr->qp_type &= 0x1F;
-
-       /* handle SRQ base QPs */
-       if (init_attr->srq) {
-               my_srq = container_of(init_attr->srq, struct ehca_qp, ib_srq);
-
-               if (qp_type == IB_QPT_UC) {
-                       ehca_err(pd->device, "UC with SRQ not supported");
-                       atomic_dec(&shca->num_qps);
-                       return ERR_PTR(-EINVAL);
-               }
-
-               has_srq = 1;
-               parms.ext_type = EQPT_SRQBASE;
-               parms.srq_qpn = my_srq->real_qp_num;
-       }
-
-       if (is_llqp && has_srq) {
-               ehca_err(pd->device, "LLQPs can't have an SRQ");
-               atomic_dec(&shca->num_qps);
-               return ERR_PTR(-EINVAL);
-       }
-
-       /* handle SRQs */
-       if (is_srq) {
-               parms.ext_type = EQPT_SRQ;
-               parms.srq_limit = srq_init_attr->attr.srq_limit;
-               if (init_attr->cap.max_recv_sge > 3) {
-                       ehca_err(pd->device, "no more than three SGEs "
-                                "supported for SRQ  pd=%p  max_sge=%x",
-                                pd, init_attr->cap.max_recv_sge);
-                       atomic_dec(&shca->num_qps);
-                       return ERR_PTR(-EINVAL);
-               }
-       }
-
-       /* check QP type */
-       if (qp_type != IB_QPT_UD &&
-           qp_type != IB_QPT_UC &&
-           qp_type != IB_QPT_RC &&
-           qp_type != IB_QPT_SMI &&
-           qp_type != IB_QPT_GSI) {
-               ehca_err(pd->device, "wrong QP Type=%x", qp_type);
-               atomic_dec(&shca->num_qps);
-               return ERR_PTR(-EINVAL);
-       }
-
-       if (is_llqp) {
-               switch (qp_type) {
-               case IB_QPT_RC:
-                       if ((init_attr->cap.max_send_wr > 255) ||
-                           (init_attr->cap.max_recv_wr > 255)) {
-                               ehca_err(pd->device,
-                                        "Invalid Number of max_sq_wr=%x "
-                                        "or max_rq_wr=%x for RC LLQP",
-                                        init_attr->cap.max_send_wr,
-                                        init_attr->cap.max_recv_wr);
-                               atomic_dec(&shca->num_qps);
-                               return ERR_PTR(-EINVAL);
-                       }
-                       break;
-               case IB_QPT_UD:
-                       if (!EHCA_BMASK_GET(HCA_CAP_UD_LL_QP, shca->hca_cap)) {
-                               ehca_err(pd->device, "UD LLQP not supported "
-                                        "by this adapter");
-                               atomic_dec(&shca->num_qps);
-                               return ERR_PTR(-ENOSYS);
-                       }
-                       if (!(init_attr->cap.max_send_sge <= 5
-                           && init_attr->cap.max_send_sge >= 1
-                           && init_attr->cap.max_recv_sge <= 5
-                           && init_attr->cap.max_recv_sge >= 1)) {
-                               ehca_err(pd->device,
-                                        "Invalid Number of max_send_sge=%x "
-                                        "or max_recv_sge=%x for UD LLQP",
-                                        init_attr->cap.max_send_sge,
-                                        init_attr->cap.max_recv_sge);
-                               atomic_dec(&shca->num_qps);
-                               return ERR_PTR(-EINVAL);
-                       } else if (init_attr->cap.max_send_wr > 255) {
-                               ehca_err(pd->device,
-                                        "Invalid Number of "
-                                        "max_send_wr=%x for UD QP_TYPE=%x",
-                                        init_attr->cap.max_send_wr, qp_type);
-                               atomic_dec(&shca->num_qps);
-                               return ERR_PTR(-EINVAL);
-                       }
-                       break;
-               default:
-                       ehca_err(pd->device, "unsupported LL QP Type=%x",
-                                qp_type);
-                       atomic_dec(&shca->num_qps);
-                       return ERR_PTR(-EINVAL);
-               }
-       } else {
-               int max_sge = (qp_type == IB_QPT_UD || qp_type == IB_QPT_SMI
-                              || qp_type == IB_QPT_GSI) ? 250 : 252;
-
-               if (init_attr->cap.max_send_sge > max_sge
-                   || init_attr->cap.max_recv_sge > max_sge) {
-                       ehca_err(pd->device, "Invalid number of SGEs requested "
-                                "send_sge=%x recv_sge=%x max_sge=%x",
-                                init_attr->cap.max_send_sge,
-                                init_attr->cap.max_recv_sge, max_sge);
-                       atomic_dec(&shca->num_qps);
-                       return ERR_PTR(-EINVAL);
-               }
-       }
-
-       my_qp = kmem_cache_zalloc(qp_cache, GFP_KERNEL);
-       if (!my_qp) {
-               ehca_err(pd->device, "pd=%p not enough memory to alloc qp", pd);
-               atomic_dec(&shca->num_qps);
-               return ERR_PTR(-ENOMEM);
-       }
-
-       if (pd->uobject && udata) {
-               is_user = 1;
-               context = pd->uobject->context;
-       }
-
-       atomic_set(&my_qp->nr_events, 0);
-       init_waitqueue_head(&my_qp->wait_completion);
-       spin_lock_init(&my_qp->spinlock_s);
-       spin_lock_init(&my_qp->spinlock_r);
-       my_qp->qp_type = qp_type;
-       my_qp->ext_type = parms.ext_type;
-       my_qp->state = IB_QPS_RESET;
-
-       if (init_attr->recv_cq)
-               my_qp->recv_cq =
-                       container_of(init_attr->recv_cq, struct ehca_cq, ib_cq);
-       if (init_attr->send_cq)
-               my_qp->send_cq =
-                       container_of(init_attr->send_cq, struct ehca_cq, ib_cq);
-
-       idr_preload(GFP_KERNEL);
-       write_lock_irqsave(&ehca_qp_idr_lock, flags);
-
-       ret = idr_alloc(&ehca_qp_idr, my_qp, 0, 0x2000000, GFP_NOWAIT);
-       if (ret >= 0)
-               my_qp->token = ret;
-
-       write_unlock_irqrestore(&ehca_qp_idr_lock, flags);
-       idr_preload_end();
-       if (ret < 0) {
-               if (ret == -ENOSPC) {
-                       ret = -EINVAL;
-                       ehca_err(pd->device, "Invalid number of qp");
-               } else {
-                       ret = -ENOMEM;
-                       ehca_err(pd->device, "Can't allocate new idr entry.");
-               }
-               goto create_qp_exit0;
-       }
-
-       if (has_srq)
-               parms.srq_token = my_qp->token;
-
-       parms.servicetype = ibqptype2servicetype(qp_type);
-       if (parms.servicetype < 0) {
-               ret = -EINVAL;
-               ehca_err(pd->device, "Invalid qp_type=%x", qp_type);
-               goto create_qp_exit1;
-       }
-
-       /* Always signal by WQE so we can hide circ. WQEs */
-       parms.sigtype = HCALL_SIGT_BY_WQE;
-
-       /* UD_AV CIRCUMVENTION */
-       max_send_sge = init_attr->cap.max_send_sge;
-       max_recv_sge = init_attr->cap.max_recv_sge;
-       if (parms.servicetype == ST_UD && !is_llqp) {
-               max_send_sge += 2;
-               max_recv_sge += 2;
-       }
-
-       parms.token = my_qp->token;
-       parms.eq_handle = shca->eq.ipz_eq_handle;
-       parms.pd = my_pd->fw_pd;
-       if (my_qp->send_cq)
-               parms.send_cq_handle = my_qp->send_cq->ipz_cq_handle;
-       if (my_qp->recv_cq)
-               parms.recv_cq_handle = my_qp->recv_cq->ipz_cq_handle;
-
-       parms.squeue.max_wr = init_attr->cap.max_send_wr;
-       parms.rqueue.max_wr = init_attr->cap.max_recv_wr;
-       parms.squeue.max_sge = max_send_sge;
-       parms.rqueue.max_sge = max_recv_sge;
-
-       /* RC QPs need one more SWQE for unsolicited ack circumvention */
-       if (qp_type == IB_QPT_RC)
-               parms.squeue.max_wr++;
-
-       if (EHCA_BMASK_GET(HCA_CAP_MINI_QP, shca->hca_cap)) {
-               if (HAS_SQ(my_qp))
-                       ehca_determine_small_queue(
-                               &parms.squeue, max_send_sge, is_llqp);
-               if (HAS_RQ(my_qp))
-                       ehca_determine_small_queue(
-                               &parms.rqueue, max_recv_sge, is_llqp);
-               parms.qp_storage =
-                       (parms.squeue.is_small || parms.rqueue.is_small);
-       }
-
-       h_ret = hipz_h_alloc_resource_qp(shca->ipz_hca_handle, &parms, is_user);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(pd->device, "h_alloc_resource_qp() failed h_ret=%lli",
-                        h_ret);
-               ret = ehca2ib_return_code(h_ret);
-               goto create_qp_exit1;
-       }
-
-       ib_qp_num = my_qp->real_qp_num = parms.real_qp_num;
-       my_qp->ipz_qp_handle = parms.qp_handle;
-       my_qp->galpas = parms.galpas;
-
-       swqe_size = ehca_calc_wqe_size(parms.squeue.act_nr_sges, is_llqp);
-       rwqe_size = ehca_calc_wqe_size(parms.rqueue.act_nr_sges, is_llqp);
-
-       switch (qp_type) {
-       case IB_QPT_RC:
-               if (is_llqp) {
-                       parms.squeue.act_nr_sges = 1;
-                       parms.rqueue.act_nr_sges = 1;
-               }
-               /* hide the extra WQE */
-               parms.squeue.act_nr_wqes--;
-               break;
-       case IB_QPT_UD:
-       case IB_QPT_GSI:
-       case IB_QPT_SMI:
-               /* UD circumvention */
-               if (is_llqp) {
-                       parms.squeue.act_nr_sges = 1;
-                       parms.rqueue.act_nr_sges = 1;
-               } else {
-                       parms.squeue.act_nr_sges -= 2;
-                       parms.rqueue.act_nr_sges -= 2;
-               }
-
-               if (IB_QPT_GSI == qp_type || IB_QPT_SMI == qp_type) {
-                       parms.squeue.act_nr_wqes = init_attr->cap.max_send_wr;
-                       parms.rqueue.act_nr_wqes = init_attr->cap.max_recv_wr;
-                       parms.squeue.act_nr_sges = init_attr->cap.max_send_sge;
-                       parms.rqueue.act_nr_sges = init_attr->cap.max_recv_sge;
-                       ib_qp_num = (qp_type == IB_QPT_SMI) ? 0 : 1;
-               }
-
-               break;
-
-       default:
-               break;
-       }
-
-       /* initialize r/squeue and register queue pages */
-       if (HAS_SQ(my_qp)) {
-               ret = init_qp_queue(
-                       shca, my_pd, my_qp, &my_qp->ipz_squeue, 0,
-                       HAS_RQ(my_qp) ? H_PAGE_REGISTERED : H_SUCCESS,
-                       &parms.squeue, swqe_size);
-               if (ret) {
-                       ehca_err(pd->device, "Couldn't initialize squeue "
-                                "and pages ret=%i", ret);
-                       goto create_qp_exit2;
-               }
-
-               if (!is_user) {
-                       my_qp->sq_map.entries = my_qp->ipz_squeue.queue_length /
-                               my_qp->ipz_squeue.qe_size;
-                       my_qp->sq_map.map = vmalloc(my_qp->sq_map.entries *
-                                                   sizeof(struct ehca_qmap_entry));
-                       if (!my_qp->sq_map.map) {
-                               ret = -ENOMEM;
-                               ehca_err(pd->device, "Couldn't allocate squeue "
-                                        "map ret=%i", ret);
-                               goto create_qp_exit3;
-                       }
-                       INIT_LIST_HEAD(&my_qp->sq_err_node);
-                       /* to avoid the generation of bogus flush CQEs */
-                       reset_queue_map(&my_qp->sq_map);
-               }
-       }
-
-       if (HAS_RQ(my_qp)) {
-               ret = init_qp_queue(
-                       shca, my_pd, my_qp, &my_qp->ipz_rqueue, 1,
-                       H_SUCCESS, &parms.rqueue, rwqe_size);
-               if (ret) {
-                       ehca_err(pd->device, "Couldn't initialize rqueue "
-                                "and pages ret=%i", ret);
-                       goto create_qp_exit4;
-               }
-               if (!is_user) {
-                       my_qp->rq_map.entries = my_qp->ipz_rqueue.queue_length /
-                               my_qp->ipz_rqueue.qe_size;
-                       my_qp->rq_map.map = vmalloc(my_qp->rq_map.entries *
-                                                   sizeof(struct ehca_qmap_entry));
-                       if (!my_qp->rq_map.map) {
-                               ret = -ENOMEM;
-                               ehca_err(pd->device, "Couldn't allocate rqueue "
-                                        "map ret=%i", ret);
-                               goto create_qp_exit5;
-                       }
-                       INIT_LIST_HEAD(&my_qp->rq_err_node);
-                       /* to avoid the generation of bogus flush CQEs */
-                       reset_queue_map(&my_qp->rq_map);
-               }
-       } else if (init_attr->srq && !is_user) {
-               /* this is a base QP, use the queue map of the SRQ */
-               my_qp->rq_map = my_srq->rq_map;
-               INIT_LIST_HEAD(&my_qp->rq_err_node);
-
-               my_qp->ipz_rqueue = my_srq->ipz_rqueue;
-       }
-
-       if (is_srq) {
-               my_qp->ib_srq.pd = &my_pd->ib_pd;
-               my_qp->ib_srq.device = my_pd->ib_pd.device;
-
-               my_qp->ib_srq.srq_context = init_attr->qp_context;
-               my_qp->ib_srq.event_handler = init_attr->event_handler;
-       } else {
-               my_qp->ib_qp.qp_num = ib_qp_num;
-               my_qp->ib_qp.pd = &my_pd->ib_pd;
-               my_qp->ib_qp.device = my_pd->ib_pd.device;
-
-               my_qp->ib_qp.recv_cq = init_attr->recv_cq;
-               my_qp->ib_qp.send_cq = init_attr->send_cq;
-
-               my_qp->ib_qp.qp_type = qp_type;
-               my_qp->ib_qp.srq = init_attr->srq;
-
-               my_qp->ib_qp.qp_context = init_attr->qp_context;
-               my_qp->ib_qp.event_handler = init_attr->event_handler;
-       }
-
-       init_attr->cap.max_inline_data = 0; /* not supported yet */
-       init_attr->cap.max_recv_sge = parms.rqueue.act_nr_sges;
-       init_attr->cap.max_recv_wr = parms.rqueue.act_nr_wqes;
-       init_attr->cap.max_send_sge = parms.squeue.act_nr_sges;
-       init_attr->cap.max_send_wr = parms.squeue.act_nr_wqes;
-       my_qp->init_attr = *init_attr;
-
-       if (qp_type == IB_QPT_SMI || qp_type == IB_QPT_GSI) {
-               shca->sport[init_attr->port_num - 1].ibqp_sqp[qp_type] =
-                       &my_qp->ib_qp;
-               if (ehca_nr_ports < 0) {
-                       /* alloc array to cache subsequent modify qp parms
-                        * for autodetect mode
-                        */
-                       my_qp->mod_qp_parm =
-                               kzalloc(EHCA_MOD_QP_PARM_MAX *
-                                       sizeof(*my_qp->mod_qp_parm),
-                                       GFP_KERNEL);
-                       if (!my_qp->mod_qp_parm) {
-                               ret = -ENOMEM;
-                               ehca_err(pd->device,
-                                        "Could not alloc mod_qp_parm");
-                               goto create_qp_exit5;
-                       }
-               }
-       }
-
-       /* NOTE: define_apq0() not supported yet */
-       if (qp_type == IB_QPT_GSI) {
-               h_ret = ehca_define_sqp(shca, my_qp, init_attr);
-               if (h_ret != H_SUCCESS) {
-                       kfree(my_qp->mod_qp_parm);
-                       my_qp->mod_qp_parm = NULL;
-                       /* the QP pointer is no longer valid */
-                       shca->sport[init_attr->port_num - 1].ibqp_sqp[qp_type] =
-                               NULL;
-                       ret = ehca2ib_return_code(h_ret);
-                       goto create_qp_exit6;
-               }
-       }
-
-       if (my_qp->send_cq) {
-               ret = ehca_cq_assign_qp(my_qp->send_cq, my_qp);
-               if (ret) {
-                       ehca_err(pd->device,
-                                "Couldn't assign qp to send_cq ret=%i", ret);
-                       goto create_qp_exit7;
-               }
-       }
-
-       /* copy queues, galpa data to user space */
-       if (context && udata) {
-               struct ehca_create_qp_resp resp;
-               memset(&resp, 0, sizeof(resp));
-
-               resp.qp_num = my_qp->real_qp_num;
-               resp.token = my_qp->token;
-               resp.qp_type = my_qp->qp_type;
-               resp.ext_type = my_qp->ext_type;
-               resp.qkey = my_qp->qkey;
-               resp.real_qp_num = my_qp->real_qp_num;
-
-               if (HAS_SQ(my_qp))
-                       queue2resp(&resp.ipz_squeue, &my_qp->ipz_squeue);
-               if (HAS_RQ(my_qp))
-                       queue2resp(&resp.ipz_rqueue, &my_qp->ipz_rqueue);
-               resp.fw_handle_ofs = (u32)
-                       (my_qp->galpas.user.fw_handle & (PAGE_SIZE - 1));
-
-               if (ib_copy_to_udata(udata, &resp, sizeof resp)) {
-                       ehca_err(pd->device, "Copy to udata failed");
-                       ret = -EINVAL;
-                       goto create_qp_exit8;
-               }
-       }
-
-       return my_qp;
-
-create_qp_exit8:
-       ehca_cq_unassign_qp(my_qp->send_cq, my_qp->real_qp_num);
-
-create_qp_exit7:
-       kfree(my_qp->mod_qp_parm);
-
-create_qp_exit6:
-       if (HAS_RQ(my_qp) && !is_user)
-               vfree(my_qp->rq_map.map);
-
-create_qp_exit5:
-       if (HAS_RQ(my_qp))
-               ipz_queue_dtor(my_pd, &my_qp->ipz_rqueue);
-
-create_qp_exit4:
-       if (HAS_SQ(my_qp) && !is_user)
-               vfree(my_qp->sq_map.map);
-
-create_qp_exit3:
-       if (HAS_SQ(my_qp))
-               ipz_queue_dtor(my_pd, &my_qp->ipz_squeue);
-
-create_qp_exit2:
-       hipz_h_destroy_qp(shca->ipz_hca_handle, my_qp);
-
-create_qp_exit1:
-       write_lock_irqsave(&ehca_qp_idr_lock, flags);
-       idr_remove(&ehca_qp_idr, my_qp->token);
-       write_unlock_irqrestore(&ehca_qp_idr_lock, flags);
-
-create_qp_exit0:
-       kmem_cache_free(qp_cache, my_qp);
-       atomic_dec(&shca->num_qps);
-       return ERR_PTR(ret);
-}
-
-struct ib_qp *ehca_create_qp(struct ib_pd *pd,
-                            struct ib_qp_init_attr *qp_init_attr,
-                            struct ib_udata *udata)
-{
-       struct ehca_qp *ret;
-
-       ret = internal_create_qp(pd, qp_init_attr, NULL, udata, 0);
-       return IS_ERR(ret) ? (struct ib_qp *)ret : &ret->ib_qp;
-}
-
-static int internal_destroy_qp(struct ib_device *dev, struct ehca_qp *my_qp,
-                              struct ib_uobject *uobject);
-
-struct ib_srq *ehca_create_srq(struct ib_pd *pd,
-                              struct ib_srq_init_attr *srq_init_attr,
-                              struct ib_udata *udata)
-{
-       struct ib_qp_init_attr qp_init_attr;
-       struct ehca_qp *my_qp;
-       struct ib_srq *ret;
-       struct ehca_shca *shca = container_of(pd->device, struct ehca_shca,
-                                             ib_device);
-       struct hcp_modify_qp_control_block *mqpcb;
-       u64 hret, update_mask;
-
-       if (srq_init_attr->srq_type != IB_SRQT_BASIC)
-               return ERR_PTR(-ENOSYS);
-
-       /* For common attributes, internal_create_qp() takes its info
-        * out of qp_init_attr, so copy all common attrs there.
-        */
-       memset(&qp_init_attr, 0, sizeof(qp_init_attr));
-       qp_init_attr.event_handler = srq_init_attr->event_handler;
-       qp_init_attr.qp_context = srq_init_attr->srq_context;
-       qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR;
-       qp_init_attr.qp_type = IB_QPT_RC;
-       qp_init_attr.cap.max_recv_wr = srq_init_attr->attr.max_wr;
-       qp_init_attr.cap.max_recv_sge = srq_init_attr->attr.max_sge;
-
-       my_qp = internal_create_qp(pd, &qp_init_attr, srq_init_attr, udata, 1);
-       if (IS_ERR(my_qp))
-               return (struct ib_srq *)my_qp;
-
-       /* copy back return values */
-       srq_init_attr->attr.max_wr = qp_init_attr.cap.max_recv_wr;
-       srq_init_attr->attr.max_sge = 3;
-
-       /* drive SRQ into RTR state */
-       mqpcb = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
-       if (!mqpcb) {
-               ehca_err(pd->device, "Could not get zeroed page for mqpcb "
-                        "ehca_qp=%p qp_num=%x ", my_qp, my_qp->real_qp_num);
-               ret = ERR_PTR(-ENOMEM);
-               goto create_srq1;
-       }
-
-       mqpcb->qp_state = EHCA_QPS_INIT;
-       mqpcb->prim_phys_port = 1;
-       update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_STATE, 1);
-       hret = hipz_h_modify_qp(shca->ipz_hca_handle,
-                               my_qp->ipz_qp_handle,
-                               &my_qp->pf,
-                               update_mask,
-                               mqpcb, my_qp->galpas.kernel);
-       if (hret != H_SUCCESS) {
-               ehca_err(pd->device, "Could not modify SRQ to INIT "
-                        "ehca_qp=%p qp_num=%x h_ret=%lli",
-                        my_qp, my_qp->real_qp_num, hret);
-               goto create_srq2;
-       }
-
-       mqpcb->qp_enable = 1;
-       update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_ENABLE, 1);
-       hret = hipz_h_modify_qp(shca->ipz_hca_handle,
-                               my_qp->ipz_qp_handle,
-                               &my_qp->pf,
-                               update_mask,
-                               mqpcb, my_qp->galpas.kernel);
-       if (hret != H_SUCCESS) {
-               ehca_err(pd->device, "Could not enable SRQ "
-                        "ehca_qp=%p qp_num=%x h_ret=%lli",
-                        my_qp, my_qp->real_qp_num, hret);
-               goto create_srq2;
-       }
-
-       mqpcb->qp_state  = EHCA_QPS_RTR;
-       update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_STATE, 1);
-       hret = hipz_h_modify_qp(shca->ipz_hca_handle,
-                               my_qp->ipz_qp_handle,
-                               &my_qp->pf,
-                               update_mask,
-                               mqpcb, my_qp->galpas.kernel);
-       if (hret != H_SUCCESS) {
-               ehca_err(pd->device, "Could not modify SRQ to RTR "
-                        "ehca_qp=%p qp_num=%x h_ret=%lli",
-                        my_qp, my_qp->real_qp_num, hret);
-               goto create_srq2;
-       }
-
-       ehca_free_fw_ctrlblock(mqpcb);
-
-       return &my_qp->ib_srq;
-
-create_srq2:
-       ret = ERR_PTR(ehca2ib_return_code(hret));
-       ehca_free_fw_ctrlblock(mqpcb);
-
-create_srq1:
-       internal_destroy_qp(pd->device, my_qp, my_qp->ib_srq.uobject);
-
-       return ret;
-}
-
-/*
- * prepare_sqe_rts is called by internal_modify_qp() at the sqe -> rts transition;
- * it sets the purge bit of the bad wqe and all subsequent wqes to avoid
- * re-entering sqe. The total number of bad wqes is returned in bad_wqe_cnt.
- */
-static int prepare_sqe_rts(struct ehca_qp *my_qp, struct ehca_shca *shca,
-                          int *bad_wqe_cnt)
-{
-       u64 h_ret;
-       struct ipz_queue *squeue;
-       void *bad_send_wqe_p, *bad_send_wqe_v;
-       u64 q_ofs;
-       struct ehca_wqe *wqe;
-       int qp_num = my_qp->ib_qp.qp_num;
-
-       /* get send wqe pointer */
-       h_ret = hipz_h_disable_and_get_wqe(shca->ipz_hca_handle,
-                                          my_qp->ipz_qp_handle, &my_qp->pf,
-                                          &bad_send_wqe_p, NULL, 2);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(&shca->ib_device, "hipz_h_disable_and_get_wqe() failed"
-                        " ehca_qp=%p qp_num=%x h_ret=%lli",
-                        my_qp, qp_num, h_ret);
-               return ehca2ib_return_code(h_ret);
-       }
-       bad_send_wqe_p = (void *)((u64)bad_send_wqe_p & (~(1L << 63)));
-       ehca_dbg(&shca->ib_device, "qp_num=%x bad_send_wqe_p=%p",
-                qp_num, bad_send_wqe_p);
-       /* convert wqe pointer to vadr */
-       bad_send_wqe_v = __va((u64)bad_send_wqe_p);
-       if (ehca_debug_level >= 2)
-               ehca_dmp(bad_send_wqe_v, 32, "qp_num=%x bad_wqe", qp_num);
-       squeue = &my_qp->ipz_squeue;
-       if (ipz_queue_abs_to_offset(squeue, (u64)bad_send_wqe_p, &q_ofs)) {
-               ehca_err(&shca->ib_device, "failed to get wqe offset qp_num=%x"
-                        " bad_send_wqe_p=%p", qp_num, bad_send_wqe_p);
-               return -EFAULT;
-       }
-
-       /* loop sets wqe's purge bit */
-       wqe = (struct ehca_wqe *)ipz_qeit_calc(squeue, q_ofs);
-       *bad_wqe_cnt = 0;
-       while (wqe->optype != 0xff && wqe->wqef != 0xff) {
-               if (ehca_debug_level >= 2)
-                       ehca_dmp(wqe, 32, "qp_num=%x wqe", qp_num);
-               wqe->nr_of_data_seg = 0; /* suppress data access */
-               wqe->wqef = WQEF_PURGE; /* WQE to be purged */
-               q_ofs = ipz_queue_advance_offset(squeue, q_ofs);
-               wqe = (struct ehca_wqe *)ipz_qeit_calc(squeue, q_ofs);
-               *bad_wqe_cnt = (*bad_wqe_cnt)+1;
-       }
-       /*
-        * the bad wqe will be reprocessed and ignored when poll_cq() is called,
-        * i.e. the nr of wqes with flush error status is one less
-        */
-       ehca_dbg(&shca->ib_device, "qp_num=%x flusherr_wqe_cnt=%x",
-                qp_num, (*bad_wqe_cnt)-1);
-       wqe->wqef = 0;
-
-       return 0;
-}
-
-static int calc_left_cqes(u64 wqe_p, struct ipz_queue *ipz_queue,
-                         struct ehca_queue_map *qmap)
-{
-       void *wqe_v;
-       u64 q_ofs;
-       u32 wqe_idx;
-       unsigned int tail_idx;
-
-       /* convert real to abs address */
-       wqe_p = wqe_p & (~(1UL << 63));
-
-       wqe_v = __va(wqe_p);
-
-       if (ipz_queue_abs_to_offset(ipz_queue, wqe_p, &q_ofs)) {
-               ehca_gen_err("Invalid offset for calculating left cqes "
-                               "wqe_p=%#llx wqe_v=%p\n", wqe_p, wqe_v);
-               return -EFAULT;
-       }
-
-       tail_idx = next_index(qmap->tail, qmap->entries);
-       wqe_idx = q_ofs / ipz_queue->qe_size;
-
-       /* check all processed wqes, whether a cqe is requested or not */
-       while (tail_idx != wqe_idx) {
-               if (qmap->map[tail_idx].cqe_req)
-                       qmap->left_to_poll++;
-               tail_idx = next_index(tail_idx, qmap->entries);
-       }
-       /* save index in queue, where we have to start flushing */
-       qmap->next_wqe_idx = wqe_idx;
-       return 0;
-}
-
-static int check_for_left_cqes(struct ehca_qp *my_qp, struct ehca_shca *shca)
-{
-       u64 h_ret;
-       void *send_wqe_p, *recv_wqe_p;
-       int ret;
-       unsigned long flags;
-       int qp_num = my_qp->ib_qp.qp_num;
-
-       /* this hcall is not supported on base QPs */
-       if (my_qp->ext_type != EQPT_SRQBASE) {
-               /* get send and receive wqe pointer */
-               h_ret = hipz_h_disable_and_get_wqe(shca->ipz_hca_handle,
-                               my_qp->ipz_qp_handle, &my_qp->pf,
-                               &send_wqe_p, &recv_wqe_p, 4);
-               if (h_ret != H_SUCCESS) {
-                       ehca_err(&shca->ib_device, "disable_and_get_wqe() "
-                                "failed ehca_qp=%p qp_num=%x h_ret=%lli",
-                                my_qp, qp_num, h_ret);
-                       return ehca2ib_return_code(h_ret);
-               }
-
-               /*
-                * acquire the lock to ensure that nobody is polling the cq;
-                * otherwise the qmap->tail pointer could be observed in an
-                * inconsistent state.
-                */
-               spin_lock_irqsave(&my_qp->send_cq->spinlock, flags);
-               ret = calc_left_cqes((u64)send_wqe_p, &my_qp->ipz_squeue,
-                               &my_qp->sq_map);
-               spin_unlock_irqrestore(&my_qp->send_cq->spinlock, flags);
-               if (ret)
-                       return ret;
-
-
-               spin_lock_irqsave(&my_qp->recv_cq->spinlock, flags);
-               ret = calc_left_cqes((u64)recv_wqe_p, &my_qp->ipz_rqueue,
-                               &my_qp->rq_map);
-               spin_unlock_irqrestore(&my_qp->recv_cq->spinlock, flags);
-               if (ret)
-                       return ret;
-       } else {
-               spin_lock_irqsave(&my_qp->send_cq->spinlock, flags);
-               my_qp->sq_map.left_to_poll = 0;
-               my_qp->sq_map.next_wqe_idx = next_index(my_qp->sq_map.tail,
-                                                       my_qp->sq_map.entries);
-               spin_unlock_irqrestore(&my_qp->send_cq->spinlock, flags);
-
-               spin_lock_irqsave(&my_qp->recv_cq->spinlock, flags);
-               my_qp->rq_map.left_to_poll = 0;
-               my_qp->rq_map.next_wqe_idx = next_index(my_qp->rq_map.tail,
-                                                       my_qp->rq_map.entries);
-               spin_unlock_irqrestore(&my_qp->recv_cq->spinlock, flags);
-       }
-
-       /* this ensures that flush cqes are generated only for pending wqes */
-       if ((my_qp->sq_map.left_to_poll == 0) &&
-                               (my_qp->rq_map.left_to_poll == 0)) {
-               spin_lock_irqsave(&my_qp->send_cq->spinlock, flags);
-               ehca_add_to_err_list(my_qp, 1);
-               spin_unlock_irqrestore(&my_qp->send_cq->spinlock, flags);
-
-               if (HAS_RQ(my_qp)) {
-                       spin_lock_irqsave(&my_qp->recv_cq->spinlock, flags);
-                       ehca_add_to_err_list(my_qp, 0);
-                       spin_unlock_irqrestore(&my_qp->recv_cq->spinlock,
-                                       flags);
-               }
-       }
-
-       return 0;
-}
-
-/*
- * internal_modify_qp with circumvention to handle aqp0 properly
- * smi_reset2init indicates if this is an internal reset-to-init call for
- * SMI. This flag must always be zero if called from ehca_modify_qp()!
- * This internal function was introduced to avoid recursion in ehca_modify_qp()!
- */
-static int internal_modify_qp(struct ib_qp *ibqp,
-                             struct ib_qp_attr *attr,
-                             int attr_mask, int smi_reset2init)
-{
-       enum ib_qp_state qp_cur_state, qp_new_state;
-       int cnt, qp_attr_idx, ret = 0;
-       enum ib_qp_statetrans statetrans;
-       struct hcp_modify_qp_control_block *mqpcb;
-       struct ehca_qp *my_qp = container_of(ibqp, struct ehca_qp, ib_qp);
-       struct ehca_shca *shca =
-               container_of(ibqp->pd->device, struct ehca_shca, ib_device);
-       u64 update_mask;
-       u64 h_ret;
-       int bad_wqe_cnt = 0;
-       int is_user = 0;
-       int squeue_locked = 0;
-       unsigned long flags = 0;
-
-       /* do query_qp to obtain current attr values */
-       mqpcb = ehca_alloc_fw_ctrlblock(GFP_ATOMIC);
-       if (!mqpcb) {
-               ehca_err(ibqp->device, "Could not get zeroed page for mqpcb "
-                        "ehca_qp=%p qp_num=%x ", my_qp, ibqp->qp_num);
-               return -ENOMEM;
-       }
-
-       h_ret = hipz_h_query_qp(shca->ipz_hca_handle,
-                               my_qp->ipz_qp_handle,
-                               &my_qp->pf,
-                               mqpcb, my_qp->galpas.kernel);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(ibqp->device, "hipz_h_query_qp() failed "
-                        "ehca_qp=%p qp_num=%x h_ret=%lli",
-                        my_qp, ibqp->qp_num, h_ret);
-               ret = ehca2ib_return_code(h_ret);
-               goto modify_qp_exit1;
-       }
-       if (ibqp->uobject)
-               is_user = 1;
-
-       qp_cur_state = ehca2ib_qp_state(mqpcb->qp_state);
-
-       if (qp_cur_state == -EINVAL) {  /* invalid qp state */
-               ret = -EINVAL;
-               ehca_err(ibqp->device, "Invalid current ehca_qp_state=%x "
-                        "ehca_qp=%p qp_num=%x",
-                        mqpcb->qp_state, my_qp, ibqp->qp_num);
-               goto modify_qp_exit1;
-       }
-       /*
-        * circumvention to set aqp0 initial state to init
-        * as expected by IB spec
-        */
-       if (smi_reset2init == 0 &&
-           ibqp->qp_type == IB_QPT_SMI &&
-           qp_cur_state == IB_QPS_RESET &&
-           (attr_mask & IB_QP_STATE) &&
-           attr->qp_state == IB_QPS_INIT) { /* RESET -> INIT */
-               struct ib_qp_attr smiqp_attr = {
-                       .qp_state = IB_QPS_INIT,
-                       .port_num = my_qp->init_attr.port_num,
-                       .pkey_index = 0,
-                       .qkey = 0
-               };
-               int smiqp_attr_mask = IB_QP_STATE | IB_QP_PORT |
-                       IB_QP_PKEY_INDEX | IB_QP_QKEY;
-               int smirc = internal_modify_qp(
-                       ibqp, &smiqp_attr, smiqp_attr_mask, 1);
-               if (smirc) {
-                       ehca_err(ibqp->device, "SMI RESET -> INIT failed. "
-                                "ehca_modify_qp() rc=%i", smirc);
-                       ret = smirc;
-                       goto modify_qp_exit1;
-               }
-               qp_cur_state = IB_QPS_INIT;
-               ehca_dbg(ibqp->device, "SMI RESET -> INIT succeeded");
-       }
-       /* is the transmitted current state equal to the "real" current state? */
-       if ((attr_mask & IB_QP_CUR_STATE) &&
-           qp_cur_state != attr->cur_qp_state) {
-               ret = -EINVAL;
-               ehca_err(ibqp->device,
-                        "Invalid IB_QP_CUR_STATE attr->curr_qp_state=%x <>"
-                        " actual cur_qp_state=%x. ehca_qp=%p qp_num=%x",
-                        attr->cur_qp_state, qp_cur_state, my_qp, ibqp->qp_num);
-               goto modify_qp_exit1;
-       }
-
-       ehca_dbg(ibqp->device, "ehca_qp=%p qp_num=%x current qp_state=%x "
-                "new qp_state=%x attribute_mask=%x",
-                my_qp, ibqp->qp_num, qp_cur_state, attr->qp_state, attr_mask);
-
-       qp_new_state = attr_mask & IB_QP_STATE ? attr->qp_state : qp_cur_state;
-       if (!smi_reset2init &&
-           !ib_modify_qp_is_ok(qp_cur_state, qp_new_state, ibqp->qp_type,
-                               attr_mask, IB_LINK_LAYER_UNSPECIFIED)) {
-               ret = -EINVAL;
-               ehca_err(ibqp->device,
-                        "Invalid qp transition new_state=%x cur_state=%x "
-                        "ehca_qp=%p qp_num=%x attr_mask=%x", qp_new_state,
-                        qp_cur_state, my_qp, ibqp->qp_num, attr_mask);
-               goto modify_qp_exit1;
-       }
-
-       mqpcb->qp_state = ib2ehca_qp_state(qp_new_state);
-       if (mqpcb->qp_state)
-               update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_STATE, 1);
-       else {
-               ret = -EINVAL;
-               ehca_err(ibqp->device, "Invalid new qp state=%x "
-                        "ehca_qp=%p qp_num=%x",
-                        qp_new_state, my_qp, ibqp->qp_num);
-               goto modify_qp_exit1;
-       }
-
-       /* retrieve state transition struct to get req and opt attrs */
-       statetrans = get_modqp_statetrans(qp_cur_state, qp_new_state);
-       if (statetrans < 0) {
-               ret = -EINVAL;
-               ehca_err(ibqp->device, "<INVALID STATE CHANGE> qp_cur_state=%x "
-                        "new_qp_state=%x State_xsition=%x ehca_qp=%p "
-                        "qp_num=%x", qp_cur_state, qp_new_state,
-                        statetrans, my_qp, ibqp->qp_num);
-               goto modify_qp_exit1;
-       }
-
-       qp_attr_idx = ib2ehcaqptype(ibqp->qp_type);
-
-       if (qp_attr_idx < 0) {
-               ret = qp_attr_idx;
-               ehca_err(ibqp->device,
-                        "Invalid QP type=%x ehca_qp=%p qp_num=%x",
-                        ibqp->qp_type, my_qp, ibqp->qp_num);
-               goto modify_qp_exit1;
-       }
-
-       ehca_dbg(ibqp->device,
-                "ehca_qp=%p qp_num=%x <VALID STATE CHANGE> qp_state_xsit=%x",
-                my_qp, ibqp->qp_num, statetrans);
-
-       /* eHCA2 rev2 and higher require the SEND_GRH_FLAG to be set
-        * in non-LL UD QPs.
-        */
-       if ((my_qp->qp_type == IB_QPT_UD) &&
-           (my_qp->ext_type != EQPT_LLQP) &&
-           (statetrans == IB_QPST_INIT2RTR) &&
-           (shca->hw_level >= 0x22)) {
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_GRH_FLAG, 1);
-               mqpcb->send_grh_flag = 1;
-       }
-
-       /* sqe -> rts: set purge bit of bad wqe before actual trans */
-       if ((my_qp->qp_type == IB_QPT_UD ||
-            my_qp->qp_type == IB_QPT_GSI ||
-            my_qp->qp_type == IB_QPT_SMI) &&
-           statetrans == IB_QPST_SQE2RTS) {
-               /* mark next free wqe if kernel */
-               if (!ibqp->uobject) {
-                       struct ehca_wqe *wqe;
-                       /* lock send queue */
-                       spin_lock_irqsave(&my_qp->spinlock_s, flags);
-                       squeue_locked = 1;
-                       /* mark next free wqe */
-                       wqe = (struct ehca_wqe *)
-                               ipz_qeit_get(&my_qp->ipz_squeue);
-                       wqe->optype = wqe->wqef = 0xff;
-                       ehca_dbg(ibqp->device, "qp_num=%x next_free_wqe=%p",
-                                ibqp->qp_num, wqe);
-               }
-               ret = prepare_sqe_rts(my_qp, shca, &bad_wqe_cnt);
-               if (ret) {
-                       ehca_err(ibqp->device, "prepare_sqe_rts() failed "
-                                "ehca_qp=%p qp_num=%x ret=%i",
-                                my_qp, ibqp->qp_num, ret);
-                       goto modify_qp_exit2;
-               }
-       }
-
-       /*
-        * enable RDMA_Atomic_Control if reset->init and reliable connection;
-        * this is necessary since gen2 does not provide that flag,
-        * but pHyp requires it
-        */
-       if (statetrans == IB_QPST_RESET2INIT &&
-           (ibqp->qp_type == IB_QPT_RC || ibqp->qp_type == IB_QPT_UC)) {
-               mqpcb->rdma_atomic_ctrl = 3;
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_RDMA_ATOMIC_CTRL, 1);
-       }
-       /* circumvention: pHyp requires #RDMA/Atomic Resp Res for UC INIT -> RTR */
-       if (statetrans == IB_QPST_INIT2RTR &&
-           (ibqp->qp_type == IB_QPT_UC) &&
-           !(attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)) {
-               mqpcb->rdma_nr_atomic_resp_res = 1; /* default to 1 */
-               update_mask |=
-                       EHCA_BMASK_SET(MQPCB_MASK_RDMA_NR_ATOMIC_RESP_RES, 1);
-       }
-
-       if (attr_mask & IB_QP_PKEY_INDEX) {
-               if (attr->pkey_index >= 16) {
-                       ret = -EINVAL;
-                       ehca_err(ibqp->device, "Invalid pkey_index=%x. "
-                                "ehca_qp=%p qp_num=%x max_pkey_index=f",
-                                attr->pkey_index, my_qp, ibqp->qp_num);
-                       goto modify_qp_exit2;
-               }
-               mqpcb->prim_p_key_idx = attr->pkey_index;
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_PRIM_P_KEY_IDX, 1);
-       }
-       if (attr_mask & IB_QP_PORT) {
-               struct ehca_sport *sport;
-               struct ehca_qp *aqp1;
-               if (attr->port_num < 1 || attr->port_num > shca->num_ports) {
-                       ret = -EINVAL;
-                       ehca_err(ibqp->device, "Invalid port=%x. "
-                                "ehca_qp=%p qp_num=%x num_ports=%x",
-                                attr->port_num, my_qp, ibqp->qp_num,
-                                shca->num_ports);
-                       goto modify_qp_exit2;
-               }
-               sport = &shca->sport[attr->port_num - 1];
-               if (!sport->ibqp_sqp[IB_QPT_GSI]) {
-                       /* should not occur */
-                       ret = -EFAULT;
-                       ehca_err(ibqp->device, "AQP1 was not created for "
-                                "port=%x", attr->port_num);
-                       goto modify_qp_exit2;
-               }
-               aqp1 = container_of(sport->ibqp_sqp[IB_QPT_GSI],
-                                   struct ehca_qp, ib_qp);
-               if (ibqp->qp_type != IB_QPT_GSI &&
-                   ibqp->qp_type != IB_QPT_SMI &&
-                   aqp1->mod_qp_parm) {
-                       /*
-                        * firmware will reject this modify_qp() because
-                        * the port is not yet fully activated/initialized
-                        */
-                       ret = -EFAULT;
-                       ehca_warn(ibqp->device, "Couldn't modify qp port=%x: "
-                                 "either port is being activated (try again) "
-                                 "or cabling issue", attr->port_num);
-                       goto modify_qp_exit2;
-               }
-               mqpcb->prim_phys_port = attr->port_num;
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_PRIM_PHYS_PORT, 1);
-       }
-       if (attr_mask & IB_QP_QKEY) {
-               mqpcb->qkey = attr->qkey;
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_QKEY, 1);
-       }
-       if (attr_mask & IB_QP_AV) {
-               mqpcb->dlid = attr->ah_attr.dlid;
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_DLID, 1);
-               mqpcb->source_path_bits = attr->ah_attr.src_path_bits;
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SOURCE_PATH_BITS, 1);
-               mqpcb->service_level = attr->ah_attr.sl;
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SERVICE_LEVEL, 1);
-
-               if (ehca_calc_ipd(shca, mqpcb->prim_phys_port,
-                                 attr->ah_attr.static_rate,
-                                 &mqpcb->max_static_rate)) {
-                       ret = -EINVAL;
-                       goto modify_qp_exit2;
-               }
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_MAX_STATIC_RATE, 1);
-
-               /*
-                * Always supply the GRH flag, even if it's zero, to give the
-                * hypervisor a clear "yes" or "no" instead of a "perhaps"
-                */
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_GRH_FLAG, 1);
-
-               /*
-                * only if GRH is set may we supply SOURCE_GID_IDX
-                * and DEST_GID; otherwise pHyp will return H_ATTR_PARM
-                */
-               if (attr->ah_attr.ah_flags == IB_AH_GRH) {
-                       mqpcb->send_grh_flag = 1;
-
-                       mqpcb->source_gid_idx = attr->ah_attr.grh.sgid_index;
-                       update_mask |=
-                               EHCA_BMASK_SET(MQPCB_MASK_SOURCE_GID_IDX, 1);
-
-                       for (cnt = 0; cnt < 16; cnt++)
-                               mqpcb->dest_gid.byte[cnt] =
-                                       attr->ah_attr.grh.dgid.raw[cnt];
-
-                       update_mask |= EHCA_BMASK_SET(MQPCB_MASK_DEST_GID, 1);
-                       mqpcb->flow_label = attr->ah_attr.grh.flow_label;
-                       update_mask |= EHCA_BMASK_SET(MQPCB_MASK_FLOW_LABEL, 1);
-                       mqpcb->hop_limit = attr->ah_attr.grh.hop_limit;
-                       update_mask |= EHCA_BMASK_SET(MQPCB_MASK_HOP_LIMIT, 1);
-                       mqpcb->traffic_class = attr->ah_attr.grh.traffic_class;
-                       update_mask |=
-                               EHCA_BMASK_SET(MQPCB_MASK_TRAFFIC_CLASS, 1);
-               }
-       }
-
-       if (attr_mask & IB_QP_PATH_MTU) {
-               /* store ld(MTU) */
-               my_qp->mtu_shift = attr->path_mtu + 7;
-               mqpcb->path_mtu = attr->path_mtu;
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_PATH_MTU, 1);
-       }
-       if (attr_mask & IB_QP_TIMEOUT) {
-               mqpcb->timeout = attr->timeout;
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_TIMEOUT, 1);
-       }
-       if (attr_mask & IB_QP_RETRY_CNT) {
-               mqpcb->retry_count = attr->retry_cnt;
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_RETRY_COUNT, 1);
-       }
-       if (attr_mask & IB_QP_RNR_RETRY) {
-               mqpcb->rnr_retry_count = attr->rnr_retry;
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_RNR_RETRY_COUNT, 1);
-       }
-       if (attr_mask & IB_QP_RQ_PSN) {
-               mqpcb->receive_psn = attr->rq_psn;
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_RECEIVE_PSN, 1);
-       }
-       if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) {
-               mqpcb->rdma_nr_atomic_resp_res = attr->max_dest_rd_atomic < 3 ?
-                       attr->max_dest_rd_atomic : 2;
-               update_mask |=
-                       EHCA_BMASK_SET(MQPCB_MASK_RDMA_NR_ATOMIC_RESP_RES, 1);
-       }
-       if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) {
-               mqpcb->rdma_atomic_outst_dest_qp = attr->max_rd_atomic < 3 ?
-                       attr->max_rd_atomic : 2;
-               update_mask |=
-                       EHCA_BMASK_SET(MQPCB_MASK_RDMA_ATOMIC_OUTST_DEST_QP, 1);
-       }
-       if (attr_mask & IB_QP_ALT_PATH) {
-               if (attr->alt_port_num < 1
-                   || attr->alt_port_num > shca->num_ports) {
-                       ret = -EINVAL;
-                       ehca_err(ibqp->device, "Invalid alt_port=%x. "
-                                "ehca_qp=%p qp_num=%x num_ports=%x",
-                                attr->alt_port_num, my_qp, ibqp->qp_num,
-                                shca->num_ports);
-                       goto modify_qp_exit2;
-               }
-               mqpcb->alt_phys_port = attr->alt_port_num;
-
-               if (attr->alt_pkey_index >= 16) {
-                       ret = -EINVAL;
-                       ehca_err(ibqp->device, "Invalid alt_pkey_index=%x. "
-                                "ehca_qp=%p qp_num=%x max_pkey_index=f",
-                                attr->alt_pkey_index, my_qp, ibqp->qp_num);
-                       goto modify_qp_exit2;
-               }
-               mqpcb->alt_p_key_idx = attr->alt_pkey_index;
-
-               mqpcb->timeout_al = attr->alt_timeout;
-               mqpcb->dlid_al = attr->alt_ah_attr.dlid;
-               mqpcb->source_path_bits_al = attr->alt_ah_attr.src_path_bits;
-               mqpcb->service_level_al = attr->alt_ah_attr.sl;
-
-               if (ehca_calc_ipd(shca, mqpcb->alt_phys_port,
-                                 attr->alt_ah_attr.static_rate,
-                                 &mqpcb->max_static_rate_al)) {
-                       ret = -EINVAL;
-                       goto modify_qp_exit2;
-               }
-
-               /* OpenIB doesn't support alternate retry counts - copy them */
-               mqpcb->retry_count_al = mqpcb->retry_count;
-               mqpcb->rnr_retry_count_al = mqpcb->rnr_retry_count;
-
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_ALT_PHYS_PORT, 1)
-                       | EHCA_BMASK_SET(MQPCB_MASK_ALT_P_KEY_IDX, 1)
-                       | EHCA_BMASK_SET(MQPCB_MASK_TIMEOUT_AL, 1)
-                       | EHCA_BMASK_SET(MQPCB_MASK_DLID_AL, 1)
-                       | EHCA_BMASK_SET(MQPCB_MASK_SOURCE_PATH_BITS_AL, 1)
-                       | EHCA_BMASK_SET(MQPCB_MASK_SERVICE_LEVEL_AL, 1)
-                       | EHCA_BMASK_SET(MQPCB_MASK_MAX_STATIC_RATE_AL, 1)
-                       | EHCA_BMASK_SET(MQPCB_MASK_RETRY_COUNT_AL, 1)
-                       | EHCA_BMASK_SET(MQPCB_MASK_RNR_RETRY_COUNT_AL, 1);
-
-               /*
-                * Always supply the GRH flag, even if it's zero, to give the
-                * hypervisor a clear "yes" or "no" instead of a "perhaps"
-                */
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_GRH_FLAG_AL, 1);
-
-               /*
-                * only if GRH is set may we supply SOURCE_GID_IDX
-                * and DEST_GID; otherwise pHyp will return H_ATTR_PARM
-                */
-               if (attr->alt_ah_attr.ah_flags == IB_AH_GRH) {
-                       mqpcb->send_grh_flag_al = 1;
-
-                       for (cnt = 0; cnt < 16; cnt++)
-                               mqpcb->dest_gid_al.byte[cnt] =
-                                       attr->alt_ah_attr.grh.dgid.raw[cnt];
-                       mqpcb->source_gid_idx_al =
-                               attr->alt_ah_attr.grh.sgid_index;
-                       mqpcb->flow_label_al = attr->alt_ah_attr.grh.flow_label;
-                       mqpcb->hop_limit_al = attr->alt_ah_attr.grh.hop_limit;
-                       mqpcb->traffic_class_al =
-                               attr->alt_ah_attr.grh.traffic_class;
-
-                       update_mask |=
-                               EHCA_BMASK_SET(MQPCB_MASK_SOURCE_GID_IDX_AL, 1)
-                               | EHCA_BMASK_SET(MQPCB_MASK_DEST_GID_AL, 1)
-                               | EHCA_BMASK_SET(MQPCB_MASK_FLOW_LABEL_AL, 1)
-                               | EHCA_BMASK_SET(MQPCB_MASK_HOP_LIMIT_AL, 1)
-                               | EHCA_BMASK_SET(MQPCB_MASK_TRAFFIC_CLASS_AL, 1);
-               }
-       }
-
-       if (attr_mask & IB_QP_MIN_RNR_TIMER) {
-               mqpcb->min_rnr_nak_timer_field = attr->min_rnr_timer;
-               update_mask |=
-                       EHCA_BMASK_SET(MQPCB_MASK_MIN_RNR_NAK_TIMER_FIELD, 1);
-       }
-
-       if (attr_mask & IB_QP_SQ_PSN) {
-               mqpcb->send_psn = attr->sq_psn;
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_PSN, 1);
-       }
-
-       if (attr_mask & IB_QP_DEST_QPN) {
-               mqpcb->dest_qp_nr = attr->dest_qp_num;
-               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_DEST_QP_NR, 1);
-       }
-
-       if (attr_mask & IB_QP_PATH_MIG_STATE) {
-               if (attr->path_mig_state != IB_MIG_REARM
-                   && attr->path_mig_state != IB_MIG_MIGRATED) {
-                       ret = -EINVAL;
-                       ehca_err(ibqp->device, "Invalid mig_state=%x",
-                                attr->path_mig_state);
-                       goto modify_qp_exit2;
-               }
-               mqpcb->path_migration_state = attr->path_mig_state + 1;
-               if (attr->path_mig_state == IB_MIG_REARM)
-                       my_qp->mig_armed = 1;
-               update_mask |=
-                       EHCA_BMASK_SET(MQPCB_MASK_PATH_MIGRATION_STATE, 1);
-       }
-
-       if (attr_mask & IB_QP_CAP) {
-               mqpcb->max_nr_outst_send_wr = attr->cap.max_send_wr+1;
-               update_mask |=
-                       EHCA_BMASK_SET(MQPCB_MASK_MAX_NR_OUTST_SEND_WR, 1);
-               mqpcb->max_nr_outst_recv_wr = attr->cap.max_recv_wr+1;
-               update_mask |=
-                       EHCA_BMASK_SET(MQPCB_MASK_MAX_NR_OUTST_RECV_WR, 1);
-               /* no support for max_send/recv_sge yet */
-       }
-
-       if (ehca_debug_level >= 2)
-               ehca_dmp(mqpcb, 4*70, "qp_num=%x", ibqp->qp_num);
-
-       h_ret = hipz_h_modify_qp(shca->ipz_hca_handle,
-                                my_qp->ipz_qp_handle,
-                                &my_qp->pf,
-                                update_mask,
-                                mqpcb, my_qp->galpas.kernel);
-
-       if (h_ret != H_SUCCESS) {
-               ret = ehca2ib_return_code(h_ret);
-               ehca_err(ibqp->device, "hipz_h_modify_qp() failed h_ret=%lli "
-                        "ehca_qp=%p qp_num=%x", h_ret, my_qp, ibqp->qp_num);
-               goto modify_qp_exit2;
-       }
-
-       if ((my_qp->qp_type == IB_QPT_UD ||
-            my_qp->qp_type == IB_QPT_GSI ||
-            my_qp->qp_type == IB_QPT_SMI) &&
-           statetrans == IB_QPST_SQE2RTS) {
-               /* ring the doorbell to reprocess WQEs */
-               iosync(); /* serialize GAL register access */
-               hipz_update_sqa(my_qp, bad_wqe_cnt-1);
-               ehca_gen_dbg("doorbell for %x wqes", bad_wqe_cnt);
-       }
-
-       if (statetrans == IB_QPST_RESET2INIT ||
-           statetrans == IB_QPST_INIT2INIT) {
-               mqpcb->qp_enable = 1;
-               mqpcb->qp_state = EHCA_QPS_INIT;
-               update_mask = 0;
-               update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_ENABLE, 1);
-
-               h_ret = hipz_h_modify_qp(shca->ipz_hca_handle,
-                                        my_qp->ipz_qp_handle,
-                                        &my_qp->pf,
-                                        update_mask,
-                                        mqpcb,
-                                        my_qp->galpas.kernel);
-
-               if (h_ret != H_SUCCESS) {
-                       ret = ehca2ib_return_code(h_ret);
-                       ehca_err(ibqp->device, "ENABLE in context of "
-                                "RESET_2_INIT failed! Maybe you didn't get "
-                                "a LID h_ret=%lli ehca_qp=%p qp_num=%x",
-                                h_ret, my_qp, ibqp->qp_num);
-                       goto modify_qp_exit2;
-               }
-       }
-       if ((qp_new_state == IB_QPS_ERR) && (qp_cur_state != IB_QPS_ERR)
-           && !is_user) {
-               ret = check_for_left_cqes(my_qp, shca);
-               if (ret)
-                       goto modify_qp_exit2;
-       }
-
-       if (statetrans == IB_QPST_ANY2RESET) {
-               ipz_qeit_reset(&my_qp->ipz_rqueue);
-               ipz_qeit_reset(&my_qp->ipz_squeue);
-
-               if (qp_cur_state == IB_QPS_ERR && !is_user) {
-                       del_from_err_list(my_qp->send_cq, &my_qp->sq_err_node);
-
-                       if (HAS_RQ(my_qp))
-                               del_from_err_list(my_qp->recv_cq,
-                                                 &my_qp->rq_err_node);
-               }
-               if (!is_user)
-                       reset_queue_map(&my_qp->sq_map);
-
-               if (HAS_RQ(my_qp) && !is_user)
-                       reset_queue_map(&my_qp->rq_map);
-       }
-
-       if (attr_mask & IB_QP_QKEY)
-               my_qp->qkey = attr->qkey;
-
-modify_qp_exit2:
-       if (squeue_locked) { /* this means: sqe -> rts */
-               spin_unlock_irqrestore(&my_qp->spinlock_s, flags);
-               my_qp->sqerr_purgeflag = 1;
-       }
-
-modify_qp_exit1:
-       ehca_free_fw_ctrlblock(mqpcb);
-
-       return ret;
-}
-
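The function above repeats a single pattern for every attribute: when a bit is set in attr_mask, the attribute is copied into the firmware control block (mqpcb) and the matching MQPCB_MASK_* bit is recorded in update_mask, which is finally handed to hipz_h_modify_qp(). The following is a minimal, self-contained userspace sketch of that accumulation; the mask and field names are hypothetical stand-ins, not the driver's definitions.

#include <stdint.h>
#include <stdio.h>

/* hypothetical stand-ins for the IB_QP_* attribute-mask bits */
#define ATTR_QKEY     (1u << 0)
#define ATTR_PATH_MTU (1u << 1)

/* hypothetical stand-ins for the MQPCB_MASK_* control-block bits */
#define CB_MASK_QKEY     (1ull << 0)
#define CB_MASK_PATH_MTU (1ull << 1)

struct qp_attr  { uint32_t qkey; uint8_t path_mtu; };
struct ctrl_blk { uint32_t qkey; uint8_t path_mtu; };

/* copy only the attributes selected by attr_mask and report which
 * control-block fields were written, mirroring update_mask above */
static uint64_t fill_ctrl_blk(struct ctrl_blk *cb,
                              const struct qp_attr *attr, uint32_t attr_mask)
{
        uint64_t update_mask = 0;

        if (attr_mask & ATTR_QKEY) {
                cb->qkey = attr->qkey;
                update_mask |= CB_MASK_QKEY;
        }
        if (attr_mask & ATTR_PATH_MTU) {
                cb->path_mtu = attr->path_mtu;
                update_mask |= CB_MASK_PATH_MTU;
        }
        return update_mask;
}

int main(void)
{
        struct qp_attr attr = { .qkey = 0x11223344, .path_mtu = 3 };
        struct ctrl_blk cb = { 0 };
        uint64_t um = fill_ctrl_blk(&cb, &attr, ATTR_QKEY);

        printf("update_mask=%#llx qkey=%#x\n", (unsigned long long)um, cb.qkey);
        return 0;
}

Passing only the accumulated mask means the hypervisor touches exactly the fields the caller asked to change.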
-int ehca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
-                  struct ib_udata *udata)
-{
-       int ret = 0;
-
-       struct ehca_shca *shca = container_of(ibqp->device, struct ehca_shca,
-                                             ib_device);
-       struct ehca_qp *my_qp = container_of(ibqp, struct ehca_qp, ib_qp);
-
-       /* The if-block below caches the qp_attr to be modified for GSI and
-        * SMI qps during their initialization by ib_mad. When the respective
-        * port is activated, i.e. we receive a PORT_ACTIVE event, we replay
-        * the cached sequence of modify calls, see ehca_recover_sqp() below.
-        * Why this is required:
-        * 1) With only one port cabled, older code required it to be port
-        *    one and the module option nr_ports=1 to be given by the user,
-        *    which is very inconvenient for the end user.
-        * 2) Firmware accepts modify_qp() only once the respective port has
-        *    become active. Older code had a 30 second wait loop in
-        *    create_qp()/define_aqp1(), which is not acceptable in practice.
-        *    This code removes that wait loop, see define_aqp1(), and always
-        *    reports all ports to ib_mad and thus to its users; only
-        *    activated ports will then be usable.
-        */
-       if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI) {
-               int port = my_qp->init_attr.port_num;
-               struct ehca_sport *sport = &shca->sport[port - 1];
-               unsigned long flags;
-               spin_lock_irqsave(&sport->mod_sqp_lock, flags);
-               /* cache qp_attr only during init */
-               if (my_qp->mod_qp_parm) {
-                       struct ehca_mod_qp_parm *p;
-                       if (my_qp->mod_qp_parm_idx >= EHCA_MOD_QP_PARM_MAX) {
-                               ehca_err(&shca->ib_device,
-                                        "mod_qp_parm overflow state=%x port=%x"
-                                        " type=%x", attr->qp_state,
-                                        my_qp->init_attr.port_num,
-                                        ibqp->qp_type);
-                               spin_unlock_irqrestore(&sport->mod_sqp_lock,
-                                                      flags);
-                               return -EINVAL;
-                       }
-                       p = &my_qp->mod_qp_parm[my_qp->mod_qp_parm_idx];
-                       p->mask = attr_mask;
-                       p->attr = *attr;
-                       my_qp->mod_qp_parm_idx++;
-                       ehca_dbg(&shca->ib_device,
-                                "Saved qp_attr for state=%x port=%x type=%x",
-                                attr->qp_state, my_qp->init_attr.port_num,
-                                ibqp->qp_type);
-                       spin_unlock_irqrestore(&sport->mod_sqp_lock, flags);
-                       goto out;
-               }
-               spin_unlock_irqrestore(&sport->mod_sqp_lock, flags);
-       }
-
-       ret = internal_modify_qp(ibqp, attr, attr_mask, 0);
-
-out:
-       if ((ret == 0) && (attr_mask & IB_QP_STATE))
-               my_qp->state = attr->qp_state;
-
-       return ret;
-}
-
-void ehca_recover_sqp(struct ib_qp *sqp)
-{
-       struct ehca_qp *my_sqp = container_of(sqp, struct ehca_qp, ib_qp);
-       int port = my_sqp->init_attr.port_num;
-       struct ib_qp_attr attr;
-       struct ehca_mod_qp_parm *qp_parm;
-       int i, qp_parm_idx, ret;
-       unsigned long flags, wr_cnt;
-
-       if (!my_sqp->mod_qp_parm)
-               return;
-       ehca_dbg(sqp->device, "SQP port=%x qp_num=%x", port, sqp->qp_num);
-
-       qp_parm = my_sqp->mod_qp_parm;
-       qp_parm_idx = my_sqp->mod_qp_parm_idx;
-       for (i = 0; i < qp_parm_idx; i++) {
-               attr = qp_parm[i].attr;
-               ret = internal_modify_qp(sqp, &attr, qp_parm[i].mask, 0);
-               if (ret) {
-                       ehca_err(sqp->device, "Could not modify SQP port=%x "
-                                "qp_num=%x ret=%x", port, sqp->qp_num, ret);
-                       goto free_qp_parm;
-               }
-               ehca_dbg(sqp->device, "SQP port=%x qp_num=%x in state=%x",
-                        port, sqp->qp_num, attr.qp_state);
-       }
-
-       /* re-trigger posted recv wrs */
-       wr_cnt =  my_sqp->ipz_rqueue.current_q_offset /
-               my_sqp->ipz_rqueue.qe_size;
-       if (wr_cnt) {
-               spin_lock_irqsave(&my_sqp->spinlock_r, flags);
-               hipz_update_rqa(my_sqp, wr_cnt);
-               spin_unlock_irqrestore(&my_sqp->spinlock_r, flags);
-               ehca_dbg(sqp->device, "doorbell port=%x qp_num=%x wr_cnt=%lx",
-                        port, sqp->qp_num, wr_cnt);
-       }
-
-free_qp_parm:
-       kfree(qp_parm);
-       /* this prevents subsequent calls to modify_qp() from caching qp_attr */
-       my_sqp->mod_qp_parm = NULL;
-}
-
-int ehca_query_qp(struct ib_qp *qp,
-                 struct ib_qp_attr *qp_attr,
-                 int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr)
-{
-       struct ehca_qp *my_qp = container_of(qp, struct ehca_qp, ib_qp);
-       struct ehca_shca *shca = container_of(qp->device, struct ehca_shca,
-                                             ib_device);
-       struct ipz_adapter_handle adapter_handle = shca->ipz_hca_handle;
-       struct hcp_modify_qp_control_block *qpcb;
-       int cnt, ret = 0;
-       u64 h_ret;
-
-       if (qp_attr_mask & QP_ATTR_QUERY_NOT_SUPPORTED) {
-               ehca_err(qp->device, "Invalid attribute mask "
-                        "ehca_qp=%p qp_num=%x qp_attr_mask=%x ",
-                        my_qp, qp->qp_num, qp_attr_mask);
-               return -EINVAL;
-       }
-
-       qpcb = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
-       if (!qpcb) {
-               ehca_err(qp->device, "Out of memory for qpcb "
-                        "ehca_qp=%p qp_num=%x", my_qp, qp->qp_num);
-               return -ENOMEM;
-       }
-
-       h_ret = hipz_h_query_qp(adapter_handle,
-                               my_qp->ipz_qp_handle,
-                               &my_qp->pf,
-                               qpcb, my_qp->galpas.kernel);
-
-       if (h_ret != H_SUCCESS) {
-               ret = ehca2ib_return_code(h_ret);
-               ehca_err(qp->device, "hipz_h_query_qp() failed "
-                        "ehca_qp=%p qp_num=%x h_ret=%lli",
-                        my_qp, qp->qp_num, h_ret);
-               goto query_qp_exit1;
-       }
-
-       qp_attr->cur_qp_state = ehca2ib_qp_state(qpcb->qp_state);
-       qp_attr->qp_state = qp_attr->cur_qp_state;
-
-       if (qp_attr->cur_qp_state == -EINVAL) {
-               ret = -EINVAL;
-               ehca_err(qp->device, "Got invalid ehca_qp_state=%x "
-                        "ehca_qp=%p qp_num=%x",
-                        qpcb->qp_state, my_qp, qp->qp_num);
-               goto query_qp_exit1;
-       }
-
-       if (qp_attr->qp_state == IB_QPS_SQD)
-               qp_attr->sq_draining = 1;
-
-       qp_attr->qkey = qpcb->qkey;
-       qp_attr->path_mtu = qpcb->path_mtu;
-       qp_attr->path_mig_state = qpcb->path_migration_state - 1;
-       qp_attr->rq_psn = qpcb->receive_psn;
-       qp_attr->sq_psn = qpcb->send_psn;
-       qp_attr->min_rnr_timer = qpcb->min_rnr_nak_timer_field;
-       qp_attr->cap.max_send_wr = qpcb->max_nr_outst_send_wr-1;
-       qp_attr->cap.max_recv_wr = qpcb->max_nr_outst_recv_wr-1;
-       /* UD_AV CIRCUMVENTION */
-       if (my_qp->qp_type == IB_QPT_UD) {
-               qp_attr->cap.max_send_sge =
-                       qpcb->actual_nr_sges_in_sq_wqe - 2;
-               qp_attr->cap.max_recv_sge =
-                       qpcb->actual_nr_sges_in_rq_wqe - 2;
-       } else {
-               qp_attr->cap.max_send_sge =
-                       qpcb->actual_nr_sges_in_sq_wqe;
-               qp_attr->cap.max_recv_sge =
-                       qpcb->actual_nr_sges_in_rq_wqe;
-       }
-
-       qp_attr->cap.max_inline_data = my_qp->sq_max_inline_data_size;
-       qp_attr->dest_qp_num = qpcb->dest_qp_nr;
-
-       qp_attr->pkey_index = qpcb->prim_p_key_idx;
-       qp_attr->port_num = qpcb->prim_phys_port;
-       qp_attr->timeout = qpcb->timeout;
-       qp_attr->retry_cnt = qpcb->retry_count;
-       qp_attr->rnr_retry = qpcb->rnr_retry_count;
-
-       qp_attr->alt_pkey_index = qpcb->alt_p_key_idx;
-       qp_attr->alt_port_num = qpcb->alt_phys_port;
-       qp_attr->alt_timeout = qpcb->timeout_al;
-
-       qp_attr->max_dest_rd_atomic = qpcb->rdma_nr_atomic_resp_res;
-       qp_attr->max_rd_atomic = qpcb->rdma_atomic_outst_dest_qp;
-
-       /* primary av */
-       qp_attr->ah_attr.sl = qpcb->service_level;
-
-       if (qpcb->send_grh_flag) {
-               qp_attr->ah_attr.ah_flags = IB_AH_GRH;
-       }
-
-       qp_attr->ah_attr.static_rate = qpcb->max_static_rate;
-       qp_attr->ah_attr.dlid = qpcb->dlid;
-       qp_attr->ah_attr.src_path_bits = qpcb->source_path_bits;
-       qp_attr->ah_attr.port_num = qp_attr->port_num;
-
-       /* primary GRH */
-       qp_attr->ah_attr.grh.traffic_class = qpcb->traffic_class;
-       qp_attr->ah_attr.grh.hop_limit = qpcb->hop_limit;
-       qp_attr->ah_attr.grh.sgid_index = qpcb->source_gid_idx;
-       qp_attr->ah_attr.grh.flow_label = qpcb->flow_label;
-
-       for (cnt = 0; cnt < 16; cnt++)
-               qp_attr->ah_attr.grh.dgid.raw[cnt] =
-                       qpcb->dest_gid.byte[cnt];
-
-       /* alternate AV */
-       qp_attr->alt_ah_attr.sl = qpcb->service_level_al;
-       if (qpcb->send_grh_flag_al) {
-               qp_attr->alt_ah_attr.ah_flags = IB_AH_GRH;
-       }
-
-       qp_attr->alt_ah_attr.static_rate = qpcb->max_static_rate_al;
-       qp_attr->alt_ah_attr.dlid = qpcb->dlid_al;
-       qp_attr->alt_ah_attr.src_path_bits = qpcb->source_path_bits_al;
-
-       /* alternate GRH */
-       qp_attr->alt_ah_attr.grh.traffic_class = qpcb->traffic_class_al;
-       qp_attr->alt_ah_attr.grh.hop_limit = qpcb->hop_limit_al;
-       qp_attr->alt_ah_attr.grh.sgid_index = qpcb->source_gid_idx_al;
-       qp_attr->alt_ah_attr.grh.flow_label = qpcb->flow_label_al;
-
-       for (cnt = 0; cnt < 16; cnt++)
-               qp_attr->alt_ah_attr.grh.dgid.raw[cnt] =
-                       qpcb->dest_gid_al.byte[cnt];
-
-       /* return init attributes given in ehca_create_qp */
-       if (qp_init_attr)
-               *qp_init_attr = my_qp->init_attr;
-
-       if (ehca_debug_level >= 2)
-               ehca_dmp(qpcb, 4*70, "qp_num=%x", qp->qp_num);
-
-query_qp_exit1:
-       ehca_free_fw_ctrlblock(qpcb);
-
-       return ret;
-}
-
-int ehca_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
-                   enum ib_srq_attr_mask attr_mask, struct ib_udata *udata)
-{
-       struct ehca_qp *my_qp =
-               container_of(ibsrq, struct ehca_qp, ib_srq);
-       struct ehca_shca *shca =
-               container_of(ibsrq->pd->device, struct ehca_shca, ib_device);
-       struct hcp_modify_qp_control_block *mqpcb;
-       u64 update_mask;
-       u64 h_ret;
-       int ret = 0;
-
-       mqpcb = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
-       if (!mqpcb) {
-               ehca_err(ibsrq->device, "Could not get zeroed page for mqpcb "
-                        "ehca_qp=%p qp_num=%x ", my_qp, my_qp->real_qp_num);
-               return -ENOMEM;
-       }
-
-       update_mask = 0;
-       if (attr_mask & IB_SRQ_LIMIT) {
-               attr_mask &= ~IB_SRQ_LIMIT;
-               update_mask |=
-                       EHCA_BMASK_SET(MQPCB_MASK_CURR_SRQ_LIMIT, 1)
-                       | EHCA_BMASK_SET(MQPCB_MASK_QP_AFF_ASYN_EV_LOG_REG, 1);
-               mqpcb->curr_srq_limit = attr->srq_limit;
-               mqpcb->qp_aff_asyn_ev_log_reg =
-                       EHCA_BMASK_SET(QPX_AAELOG_RESET_SRQ_LIMIT, 1);
-       }
-
-       /* by now, all bits in attr_mask should have been cleared */
-       if (attr_mask) {
-               ehca_err(ibsrq->device, "invalid attribute mask bits set  "
-                        "attr_mask=%x", attr_mask);
-               ret = -EINVAL;
-               goto modify_srq_exit0;
-       }
-
-       if (ehca_debug_level >= 2)
-               ehca_dmp(mqpcb, 4*70, "qp_num=%x", my_qp->real_qp_num);
-
-       h_ret = hipz_h_modify_qp(shca->ipz_hca_handle, my_qp->ipz_qp_handle,
-                                NULL, update_mask, mqpcb,
-                                my_qp->galpas.kernel);
-
-       if (h_ret != H_SUCCESS) {
-               ret = ehca2ib_return_code(h_ret);
-               ehca_err(ibsrq->device, "hipz_h_modify_qp() failed h_ret=%lli "
-                        "ehca_qp=%p qp_num=%x",
-                        h_ret, my_qp, my_qp->real_qp_num);
-       }
-
-modify_srq_exit0:
-       ehca_free_fw_ctrlblock(mqpcb);
-
-       return ret;
-}
-
-int ehca_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr)
-{
-       struct ehca_qp *my_qp = container_of(srq, struct ehca_qp, ib_srq);
-       struct ehca_shca *shca = container_of(srq->device, struct ehca_shca,
-                                             ib_device);
-       struct ipz_adapter_handle adapter_handle = shca->ipz_hca_handle;
-       struct hcp_modify_qp_control_block *qpcb;
-       int ret = 0;
-       u64 h_ret;
-
-       qpcb = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
-       if (!qpcb) {
-               ehca_err(srq->device, "Out of memory for qpcb "
-                        "ehca_qp=%p qp_num=%x", my_qp, my_qp->real_qp_num);
-               return -ENOMEM;
-       }
-
-       h_ret = hipz_h_query_qp(adapter_handle, my_qp->ipz_qp_handle,
-                               NULL, qpcb, my_qp->galpas.kernel);
-
-       if (h_ret != H_SUCCESS) {
-               ret = ehca2ib_return_code(h_ret);
-               ehca_err(srq->device, "hipz_h_query_qp() failed "
-                        "ehca_qp=%p qp_num=%x h_ret=%lli",
-                        my_qp, my_qp->real_qp_num, h_ret);
-               goto query_srq_exit1;
-       }
-
-       srq_attr->max_wr = qpcb->max_nr_outst_recv_wr - 1;
-       srq_attr->max_sge = 3;
-       srq_attr->srq_limit = qpcb->curr_srq_limit;
-
-       if (ehca_debug_level >= 2)
-               ehca_dmp(qpcb, 4*70, "qp_num=%x", my_qp->real_qp_num);
-
-query_srq_exit1:
-       ehca_free_fw_ctrlblock(qpcb);
-
-       return ret;
-}
-
-static int internal_destroy_qp(struct ib_device *dev, struct ehca_qp *my_qp,
-                              struct ib_uobject *uobject)
-{
-       struct ehca_shca *shca = container_of(dev, struct ehca_shca, ib_device);
-       struct ehca_pd *my_pd = container_of(my_qp->ib_qp.pd, struct ehca_pd,
-                                            ib_pd);
-       struct ehca_sport *sport = &shca->sport[my_qp->init_attr.port_num - 1];
-       u32 qp_num = my_qp->real_qp_num;
-       int ret;
-       u64 h_ret;
-       u8 port_num;
-       int is_user = 0;
-       enum ib_qp_type qp_type;
-       unsigned long flags;
-
-       if (uobject) {
-               is_user = 1;
-               if (my_qp->mm_count_galpa ||
-                   my_qp->mm_count_rqueue || my_qp->mm_count_squeue) {
-                       ehca_err(dev, "Resources still referenced in "
-                                "user space qp_num=%x", qp_num);
-                       return -EINVAL;
-               }
-       }
-
-       if (my_qp->send_cq) {
-               ret = ehca_cq_unassign_qp(my_qp->send_cq, qp_num);
-               if (ret) {
-                       ehca_err(dev, "Couldn't unassign qp from "
-                                "send_cq ret=%i qp_num=%x cq_num=%x", ret,
-                                qp_num, my_qp->send_cq->cq_number);
-                       return ret;
-               }
-       }
-
-       write_lock_irqsave(&ehca_qp_idr_lock, flags);
-       idr_remove(&ehca_qp_idr, my_qp->token);
-       write_unlock_irqrestore(&ehca_qp_idr_lock, flags);
-
-       /*
-        * SRQs will never get into an error list and do not have a recv_cq,
-        * so we need to skip them here.
-        */
-       if (HAS_RQ(my_qp) && !IS_SRQ(my_qp) && !is_user)
-               del_from_err_list(my_qp->recv_cq, &my_qp->rq_err_node);
-
-       if (HAS_SQ(my_qp) && !is_user)
-               del_from_err_list(my_qp->send_cq, &my_qp->sq_err_node);
-
-       /* now wait until all pending events have completed */
-       wait_event(my_qp->wait_completion, !atomic_read(&my_qp->nr_events));
-
-       h_ret = hipz_h_destroy_qp(shca->ipz_hca_handle, my_qp);
-       if (h_ret != H_SUCCESS) {
-               ehca_err(dev, "hipz_h_destroy_qp() failed h_ret=%lli "
-                        "ehca_qp=%p qp_num=%x", h_ret, my_qp, qp_num);
-               return ehca2ib_return_code(h_ret);
-       }
-
-       port_num = my_qp->init_attr.port_num;
-       qp_type  = my_qp->init_attr.qp_type;
-
-       if (qp_type == IB_QPT_SMI || qp_type == IB_QPT_GSI) {
-               spin_lock_irqsave(&sport->mod_sqp_lock, flags);
-               kfree(my_qp->mod_qp_parm);
-               my_qp->mod_qp_parm = NULL;
-               shca->sport[port_num - 1].ibqp_sqp[qp_type] = NULL;
-               spin_unlock_irqrestore(&sport->mod_sqp_lock, flags);
-       }
-
-       /* no support for IB_QPT_SMI yet */
-       if (qp_type == IB_QPT_GSI) {
-               struct ib_event event;
-               ehca_info(dev, "device %s: port %x is inactive.",
-                               shca->ib_device.name, port_num);
-               event.device = &shca->ib_device;
-               event.event = IB_EVENT_PORT_ERR;
-               event.element.port_num = port_num;
-               shca->sport[port_num - 1].port_state = IB_PORT_DOWN;
-               ib_dispatch_event(&event);
-       }
-
-       if (HAS_RQ(my_qp)) {
-               ipz_queue_dtor(my_pd, &my_qp->ipz_rqueue);
-               if (!is_user)
-                       vfree(my_qp->rq_map.map);
-       }
-       if (HAS_SQ(my_qp)) {
-               ipz_queue_dtor(my_pd, &my_qp->ipz_squeue);
-               if (!is_user)
-                       vfree(my_qp->sq_map.map);
-       }
-       kmem_cache_free(qp_cache, my_qp);
-       atomic_dec(&shca->num_qps);
-       return 0;
-}
-
-int ehca_destroy_qp(struct ib_qp *qp)
-{
-       return internal_destroy_qp(qp->device,
-                                  container_of(qp, struct ehca_qp, ib_qp),
-                                  qp->uobject);
-}
-
-int ehca_destroy_srq(struct ib_srq *srq)
-{
-       return internal_destroy_qp(srq->device,
-                                  container_of(srq, struct ehca_qp, ib_srq),
-                                  srq->uobject);
-}
-
-int ehca_init_qp_cache(void)
-{
-       qp_cache = kmem_cache_create("ehca_cache_qp",
-                                    sizeof(struct ehca_qp), 0,
-                                    SLAB_HWCACHE_ALIGN,
-                                    NULL);
-       if (!qp_cache)
-               return -ENOMEM;
-       return 0;
-}
-
-void ehca_cleanup_qp_cache(void)
-{
-       if (qp_cache)
-               kmem_cache_destroy(qp_cache);
-}
diff --git a/drivers/infiniband/hw/ehca/ehca_reqs.c b/drivers/infiniband/hw/ehca/ehca_reqs.c
deleted file mode 100644 (file)
index 47f9498..0000000
+++ /dev/null
@@ -1,953 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  post_send/recv, poll_cq, req_notify
- *
- *  Authors: Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *           Waleri Fomin <fomin@de.ibm.com>
- *           Joachim Fenkes <fenkes@de.ibm.com>
- *           Reinhard Ernst <rernst@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-
-#include "ehca_classes.h"
-#include "ehca_tools.h"
-#include "ehca_qes.h"
-#include "ehca_iverbs.h"
-#include "hcp_if.h"
-#include "hipz_fns.h"
-
-/* in RC traffic, insert an empty RDMA READ after this many packets without one */
-#define ACK_CIRC_THRESHOLD 2000000
-
-static u64 replace_wr_id(u64 wr_id, u16 idx)
-{
-       u64 ret;
-
-       ret = wr_id & ~QMAP_IDX_MASK;
-       ret |= idx & QMAP_IDX_MASK;
-
-       return ret;
-}
-
-static u16 get_app_wr_id(u64 wr_id)
-{
-       return wr_id & QMAP_IDX_MASK;
-}
-
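replace_wr_id() and get_app_wr_id() above split the 64-bit work request ID into the caller's portion and a low-bit queue-map index selected by QMAP_IDX_MASK (defined elsewhere in the driver). Below is a small self-contained round-trip sketch; the mask value is illustrative only.

#include <assert.h>
#include <stdint.h>

#define QMAP_IDX_MASK 0xFFFFull   /* illustrative low-bit mask */

static uint64_t replace_wr_id(uint64_t wr_id, uint16_t idx)
{
        return (wr_id & ~QMAP_IDX_MASK) | (idx & QMAP_IDX_MASK);
}

static uint16_t get_app_wr_id(uint64_t wr_id)
{
        return wr_id & QMAP_IDX_MASK;
}

int main(void)
{
        uint64_t app_wr_id = 0xABCD000000000042ull; /* caller-chosen wr_id */
        uint64_t stored = replace_wr_id(app_wr_id, 7);

        /* the queue-map index rides in the low bits ... */
        assert(get_app_wr_id(stored) == 7);
        /* ... while the high bits of the caller's wr_id are preserved */
        assert((stored & ~QMAP_IDX_MASK) == (app_wr_id & ~QMAP_IDX_MASK));
        return 0;
}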
-static inline int ehca_write_rwqe(struct ipz_queue *ipz_rqueue,
-                                 struct ehca_wqe *wqe_p,
-                                 struct ib_recv_wr *recv_wr,
-                                 u32 rq_map_idx)
-{
-       u8 cnt_ds;
-       if (unlikely((recv_wr->num_sge < 0) ||
-                    (recv_wr->num_sge > ipz_rqueue->act_nr_of_sg))) {
-               ehca_gen_err("Invalid number of WQE SGE. "
-                        "num_sge=%x max_nr_of_sg=%x",
-                        recv_wr->num_sge, ipz_rqueue->act_nr_of_sg);
-               return -EINVAL; /* invalid SG list length */
-       }
-
-       /* clear wqe header until sglist */
-       memset(wqe_p, 0, offsetof(struct ehca_wqe, u.ud_av.sg_list));
-
-       wqe_p->work_request_id = replace_wr_id(recv_wr->wr_id, rq_map_idx);
-       wqe_p->nr_of_data_seg = recv_wr->num_sge;
-
-       for (cnt_ds = 0; cnt_ds < recv_wr->num_sge; cnt_ds++) {
-               wqe_p->u.all_rcv.sg_list[cnt_ds].vaddr =
-                       recv_wr->sg_list[cnt_ds].addr;
-               wqe_p->u.all_rcv.sg_list[cnt_ds].lkey =
-                       recv_wr->sg_list[cnt_ds].lkey;
-               wqe_p->u.all_rcv.sg_list[cnt_ds].length =
-                       recv_wr->sg_list[cnt_ds].length;
-       }
-
-       if (ehca_debug_level >= 3) {
-               ehca_gen_dbg("RECEIVE WQE written into ipz_rqueue=%p",
-                            ipz_rqueue);
-               ehca_dmp(wqe_p, 16*(6 + wqe_p->nr_of_data_seg), "recv wqe");
-       }
-
-       return 0;
-}
-
-#if defined(DEBUG_GSI_SEND_WR)
-
-/* need ib_mad struct */
-#include <rdma/ib_mad.h>
-
-static void trace_send_wr_ud(const struct ib_send_wr *send_wr)
-{
-       int idx = 0;
-       int j;
-       while (send_wr) {
-               struct ib_mad_hdr *mad_hdr = send_wr->wr.ud.mad_hdr;
-               struct ib_sge *sge = send_wr->sg_list;
-               ehca_gen_dbg("send_wr#%x wr_id=%lx num_sge=%x "
-                            "send_flags=%x opcode=%x", idx, send_wr->wr_id,
-                            send_wr->num_sge, send_wr->send_flags,
-                            send_wr->opcode);
-               if (mad_hdr) {
-                       ehca_gen_dbg("send_wr#%x mad_hdr base_version=%x "
-                                    "mgmt_class=%x class_version=%x method=%x "
-                                    "status=%x class_specific=%x tid=%lx "
-                                    "attr_id=%x resv=%x attr_mod=%x",
-                                    idx, mad_hdr->base_version,
-                                    mad_hdr->mgmt_class,
-                                    mad_hdr->class_version, mad_hdr->method,
-                                    mad_hdr->status, mad_hdr->class_specific,
-                                    mad_hdr->tid, mad_hdr->attr_id,
-                                    mad_hdr->resv,
-                                    mad_hdr->attr_mod);
-               }
-               for (j = 0; j < send_wr->num_sge; j++) {
-                       u8 *data = __va(sge->addr);
-                       ehca_gen_dbg("send_wr#%x sge#%x addr=%p length=%x "
-                                    "lkey=%x",
-                                    idx, j, data, sge->length, sge->lkey);
-                       /* assume length is n*16 */
-                       ehca_dmp(data, sge->length, "send_wr#%x sge#%x",
-                                idx, j);
-                       sge++;
-               } /* eof for j */
-               idx++;
-               send_wr = send_wr->next;
-       } /* eof while send_wr */
-}
-
-#endif /* DEBUG_GSI_SEND_WR */
-
-static inline int ehca_write_swqe(struct ehca_qp *qp,
-                                 struct ehca_wqe *wqe_p,
-                                 const struct ib_send_wr *send_wr,
-                                 u32 sq_map_idx,
-                                 int hidden)
-{
-       u32 idx;
-       u64 dma_length;
-       struct ehca_av *my_av;
-       u32 remote_qkey = send_wr->wr.ud.remote_qkey;
-       struct ehca_qmap_entry *qmap_entry = &qp->sq_map.map[sq_map_idx];
-
-       if (unlikely((send_wr->num_sge < 0) ||
-                    (send_wr->num_sge > qp->ipz_squeue.act_nr_of_sg))) {
-               ehca_gen_err("Invalid number of WQE SGE. "
-                        "num_sge=%x max_nr_of_sg=%x",
-                        send_wr->num_sge, qp->ipz_squeue.act_nr_of_sg);
-               return -EINVAL; /* invalid SG list length */
-       }
-
-       /* clear wqe header until sglist */
-       memset(wqe_p, 0, offsetof(struct ehca_wqe, u.ud_av.sg_list));
-
-       wqe_p->work_request_id = replace_wr_id(send_wr->wr_id, sq_map_idx);
-
-       qmap_entry->app_wr_id = get_app_wr_id(send_wr->wr_id);
-       qmap_entry->reported = 0;
-       qmap_entry->cqe_req = 0;
-
-       switch (send_wr->opcode) {
-       case IB_WR_SEND:
-       case IB_WR_SEND_WITH_IMM:
-               wqe_p->optype = WQE_OPTYPE_SEND;
-               break;
-       case IB_WR_RDMA_WRITE:
-       case IB_WR_RDMA_WRITE_WITH_IMM:
-               wqe_p->optype = WQE_OPTYPE_RDMAWRITE;
-               break;
-       case IB_WR_RDMA_READ:
-               wqe_p->optype = WQE_OPTYPE_RDMAREAD;
-               break;
-       default:
-               ehca_gen_err("Invalid opcode=%x", send_wr->opcode);
-               return -EINVAL; /* invalid opcode */
-       }
-
-       wqe_p->wqef = (send_wr->opcode) & WQEF_HIGH_NIBBLE;
-
-       wqe_p->wr_flag = 0;
-
-       if ((send_wr->send_flags & IB_SEND_SIGNALED ||
-           qp->init_attr.sq_sig_type == IB_SIGNAL_ALL_WR)
-           && !hidden) {
-               wqe_p->wr_flag |= WQE_WRFLAG_REQ_SIGNAL_COM;
-               qmap_entry->cqe_req = 1;
-       }
-
-       if (send_wr->opcode == IB_WR_SEND_WITH_IMM ||
-           send_wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) {
-               /* this might not work as long as HW does not support it */
-               wqe_p->immediate_data = be32_to_cpu(send_wr->ex.imm_data);
-               wqe_p->wr_flag |= WQE_WRFLAG_IMM_DATA_PRESENT;
-       }
-
-       wqe_p->nr_of_data_seg = send_wr->num_sge;
-
-       switch (qp->qp_type) {
-       case IB_QPT_SMI:
-       case IB_QPT_GSI:
-               /* no break is intentional here */
-       case IB_QPT_UD:
-               /* IB 1.2 spec C10-15 compliance */
-               if (send_wr->wr.ud.remote_qkey & 0x80000000)
-                       remote_qkey = qp->qkey;
-
-               wqe_p->destination_qp_number = send_wr->wr.ud.remote_qpn << 8;
-               wqe_p->local_ee_context_qkey = remote_qkey;
-               if (unlikely(!send_wr->wr.ud.ah)) {
-                       ehca_gen_err("wr.ud.ah is NULL. qp=%p", qp);
-                       return -EINVAL;
-               }
-               if (unlikely(send_wr->wr.ud.remote_qpn == 0)) {
-                       ehca_gen_err("dest QP# is 0. qp=%x", qp->real_qp_num);
-                       return -EINVAL;
-               }
-               my_av = container_of(send_wr->wr.ud.ah, struct ehca_av, ib_ah);
-               wqe_p->u.ud_av.ud_av = my_av->av;
-
-               /*
-                * omitted check of IB_SEND_INLINE
-                * since HW does not support it
-                */
-               for (idx = 0; idx < send_wr->num_sge; idx++) {
-                       wqe_p->u.ud_av.sg_list[idx].vaddr =
-                               send_wr->sg_list[idx].addr;
-                       wqe_p->u.ud_av.sg_list[idx].lkey =
-                               send_wr->sg_list[idx].lkey;
-                       wqe_p->u.ud_av.sg_list[idx].length =
-                               send_wr->sg_list[idx].length;
-               } /* eof for idx */
-               if (qp->qp_type == IB_QPT_SMI ||
-                   qp->qp_type == IB_QPT_GSI)
-                       wqe_p->u.ud_av.ud_av.pmtu = 1;
-               if (qp->qp_type == IB_QPT_GSI) {
-                       wqe_p->pkeyi = send_wr->wr.ud.pkey_index;
-#ifdef DEBUG_GSI_SEND_WR
-                       trace_send_wr_ud(send_wr);
-#endif /* DEBUG_GSI_SEND_WR */
-               }
-               break;
-
-       case IB_QPT_UC:
-               if (send_wr->send_flags & IB_SEND_FENCE)
-                       wqe_p->wr_flag |= WQE_WRFLAG_FENCE;
-               /* no break is intentional here */
-       case IB_QPT_RC:
-               /* TODO: atomic not implemented */
-               wqe_p->u.nud.remote_virtual_address =
-                       send_wr->wr.rdma.remote_addr;
-               wqe_p->u.nud.rkey = send_wr->wr.rdma.rkey;
-
-               /*
-                * omitted checking of IB_SEND_INLINE
-                * since HW does not support it
-                */
-               dma_length = 0;
-               for (idx = 0; idx < send_wr->num_sge; idx++) {
-                       wqe_p->u.nud.sg_list[idx].vaddr =
-                               send_wr->sg_list[idx].addr;
-                       wqe_p->u.nud.sg_list[idx].lkey =
-                               send_wr->sg_list[idx].lkey;
-                       wqe_p->u.nud.sg_list[idx].length =
-                               send_wr->sg_list[idx].length;
-                       dma_length += send_wr->sg_list[idx].length;
-               } /* eof idx */
-               wqe_p->u.nud.atomic_1st_op_dma_len = dma_length;
-
-               /* unsolicited ack circumvention */
-               if (send_wr->opcode == IB_WR_RDMA_READ) {
-                       /* on RDMA read, switch on and reset counters */
-                       qp->message_count = qp->packet_count = 0;
-                       qp->unsol_ack_circ = 1;
-               } else
-                       /* else estimate #packets */
-                       qp->packet_count += (dma_length >> qp->mtu_shift) + 1;
-
-               break;
-
-       default:
-               ehca_gen_err("Invalid qptype=%x", qp->qp_type);
-               return -EINVAL;
-       }
-
-       if (ehca_debug_level >= 3) {
-               ehca_gen_dbg("SEND WQE written into queue qp=%p ", qp);
-               ehca_dmp(wqe_p, 16*(6 + wqe_p->nr_of_data_seg), "send wqe");
-       }
-       return 0;
-}
-
-/* map_ib_wc_status converts raw cqe_status to ib_wc_status */
-static inline void map_ib_wc_status(u32 cqe_status,
-                                   enum ib_wc_status *wc_status)
-{
-       if (unlikely(cqe_status & WC_STATUS_ERROR_BIT)) {
-               switch (cqe_status & 0x3F) {
-               case 0x01:
-               case 0x21:
-                       *wc_status = IB_WC_LOC_LEN_ERR;
-                       break;
-               case 0x02:
-               case 0x22:
-                       *wc_status = IB_WC_LOC_QP_OP_ERR;
-                       break;
-               case 0x03:
-               case 0x23:
-                       *wc_status = IB_WC_LOC_EEC_OP_ERR;
-                       break;
-               case 0x04:
-               case 0x24:
-                       *wc_status = IB_WC_LOC_PROT_ERR;
-                       break;
-               case 0x05:
-               case 0x25:
-                       *wc_status = IB_WC_WR_FLUSH_ERR;
-                       break;
-               case 0x06:
-                       *wc_status = IB_WC_MW_BIND_ERR;
-                       break;
-               case 0x07: /* remote error - look into bits 20:24 */
-                       switch ((cqe_status
-                                & WC_STATUS_REMOTE_ERROR_FLAGS) >> 11) {
-                       case 0x0:
-                               /*
-                                * PSN Sequence Error!
-                                * couldn't find a matching status!
-                                */
-                               *wc_status = IB_WC_GENERAL_ERR;
-                               break;
-                       case 0x1:
-                               *wc_status = IB_WC_REM_INV_REQ_ERR;
-                               break;
-                       case 0x2:
-                               *wc_status = IB_WC_REM_ACCESS_ERR;
-                               break;
-                       case 0x3:
-                               *wc_status = IB_WC_REM_OP_ERR;
-                               break;
-                       case 0x4:
-                               *wc_status = IB_WC_REM_INV_RD_REQ_ERR;
-                               break;
-                       }
-                       break;
-               case 0x08:
-                       *wc_status = IB_WC_RETRY_EXC_ERR;
-                       break;
-               case 0x09:
-                       *wc_status = IB_WC_RNR_RETRY_EXC_ERR;
-                       break;
-               case 0x0A:
-               case 0x2D:
-                       *wc_status = IB_WC_REM_ABORT_ERR;
-                       break;
-               case 0x0B:
-               case 0x2E:
-                       *wc_status = IB_WC_INV_EECN_ERR;
-                       break;
-               case 0x0C:
-               case 0x2F:
-                       *wc_status = IB_WC_INV_EEC_STATE_ERR;
-                       break;
-               case 0x0D:
-                       *wc_status = IB_WC_BAD_RESP_ERR;
-                       break;
-               case 0x10:
-                       /* WQE purged */
-                       *wc_status = IB_WC_WR_FLUSH_ERR;
-                       break;
-               default:
-                       *wc_status = IB_WC_FATAL_ERR;
-
-               }
-       } else
-               *wc_status = IB_WC_SUCCESS;
-}
-
-static inline int post_one_send(struct ehca_qp *my_qp,
-                        struct ib_send_wr *cur_send_wr,
-                        int hidden)
-{
-       struct ehca_wqe *wqe_p;
-       int ret;
-       u32 sq_map_idx;
-       u64 start_offset = my_qp->ipz_squeue.current_q_offset;
-
-       /* get pointer next to free WQE */
-       wqe_p = ipz_qeit_get_inc(&my_qp->ipz_squeue);
-       if (unlikely(!wqe_p)) {
-               /* too many posted work requests: queue overflow */
-               ehca_err(my_qp->ib_qp.device, "Too many posted WQEs "
-                        "qp_num=%x", my_qp->ib_qp.qp_num);
-               return -ENOMEM;
-       }
-
-       /*
-        * Get the index of the WQE in the send queue. The same index is used
-        * for writing into the sq_map.
-        */
-       sq_map_idx = start_offset / my_qp->ipz_squeue.qe_size;
-
-       /* write a SEND WQE into the QUEUE */
-       ret = ehca_write_swqe(my_qp, wqe_p, cur_send_wr, sq_map_idx, hidden);
-       /*
-        * if something failed,
-        * reset the free entry pointer to the start value
-        */
-       if (unlikely(ret)) {
-               my_qp->ipz_squeue.current_q_offset = start_offset;
-               ehca_err(my_qp->ib_qp.device, "Could not write WQE "
-                        "qp_num=%x", my_qp->ib_qp.qp_num);
-               return -EINVAL;
-       }
-
-       return 0;
-}
-
-int ehca_post_send(struct ib_qp *qp,
-                  struct ib_send_wr *send_wr,
-                  struct ib_send_wr **bad_send_wr)
-{
-       struct ehca_qp *my_qp = container_of(qp, struct ehca_qp, ib_qp);
-       int wqe_cnt = 0;
-       int ret = 0;
-       unsigned long flags;
-
-       /* Reject WR if QP is in RESET, INIT or RTR state */
-       if (unlikely(my_qp->state < IB_QPS_RTS)) {
-               ehca_err(qp->device, "Invalid QP state  qp_state=%d qpn=%x",
-                        my_qp->state, qp->qp_num);
-               ret = -EINVAL;
-               goto out;
-       }
-
-       /* LOCK the QUEUE */
-       spin_lock_irqsave(&my_qp->spinlock_s, flags);
-
-       /* Send an empty extra RDMA read if:
-        *  1) there has been an RDMA read on this connection before
-        *  2) no RDMA read occurred for ACK_CIRC_THRESHOLD link packets
-        *  3) we can be sure that any previous extra RDMA read has been
-        *     processed so we don't overflow the SQ
-        */
-       if (unlikely(my_qp->unsol_ack_circ &&
-                    my_qp->packet_count > ACK_CIRC_THRESHOLD &&
-                    my_qp->message_count > my_qp->init_attr.cap.max_send_wr)) {
-               /* insert an empty RDMA READ to fix up the remote QP state */
-               struct ib_send_wr circ_wr;
-               memset(&circ_wr, 0, sizeof(circ_wr));
-               circ_wr.opcode = IB_WR_RDMA_READ;
-               post_one_send(my_qp, &circ_wr, 1); /* ignore retcode */
-               wqe_cnt++;
-               ehca_dbg(qp->device, "posted circ wr  qp_num=%x", qp->qp_num);
-               my_qp->message_count = my_qp->packet_count = 0;
-       }
-
-       /* loop processes list of send reqs */
-       while (send_wr) {
-               ret = post_one_send(my_qp, send_wr, 0);
-               if (unlikely(ret)) {
-                       goto post_send_exit0;
-               }
-               wqe_cnt++;
-               send_wr = send_wr->next;
-       }
-
-post_send_exit0:
-       iosync(); /* serialize GAL register access */
-       hipz_update_sqa(my_qp, wqe_cnt);
-       if (unlikely(ret || ehca_debug_level >= 2))
-               ehca_dbg(qp->device, "ehca_qp=%p qp_num=%x wqe_cnt=%d ret=%i",
-                        my_qp, qp->qp_num, wqe_cnt, ret);
-       my_qp->message_count += wqe_cnt;
-       spin_unlock_irqrestore(&my_qp->spinlock_s, flags);
-
-out:
-       if (ret)
-               *bad_send_wr = send_wr;
-       return ret;
-}
-
-static int internal_post_recv(struct ehca_qp *my_qp,
-                             struct ib_device *dev,
-                             struct ib_recv_wr *recv_wr,
-                             struct ib_recv_wr **bad_recv_wr)
-{
-       struct ehca_wqe *wqe_p;
-       int wqe_cnt = 0;
-       int ret = 0;
-       u32 rq_map_idx;
-       unsigned long flags;
-       struct ehca_qmap_entry *qmap_entry;
-
-       if (unlikely(!HAS_RQ(my_qp))) {
-               ehca_err(dev, "QP has no RQ  ehca_qp=%p qp_num=%x ext_type=%d",
-                        my_qp, my_qp->real_qp_num, my_qp->ext_type);
-               ret = -ENODEV;
-               goto out;
-       }
-
-       /* LOCK the QUEUE */
-       spin_lock_irqsave(&my_qp->spinlock_r, flags);
-
-       /* loop processes list of recv reqs */
-       while (recv_wr) {
-               u64 start_offset = my_qp->ipz_rqueue.current_q_offset;
-               /* get pointer next to free WQE */
-               wqe_p = ipz_qeit_get_inc(&my_qp->ipz_rqueue);
-               if (unlikely(!wqe_p)) {
-                       /* too many posted work requests: queue overflow */
-                       ret = -ENOMEM;
-                       ehca_err(dev, "Too many posted WQEs "
-                               "qp_num=%x", my_qp->real_qp_num);
-                       goto post_recv_exit0;
-               }
-               /*
-                * Get the index of the WQE in the recv queue. The same index
-                * is used for writing into the rq_map.
-                */
-               rq_map_idx = start_offset / my_qp->ipz_rqueue.qe_size;
-
-               /* write a RECV WQE into the QUEUE */
-               ret = ehca_write_rwqe(&my_qp->ipz_rqueue, wqe_p, recv_wr,
-                               rq_map_idx);
-               /*
-                * if something failed,
-                * reset the free entry pointer to the start value
-                */
-               if (unlikely(ret)) {
-                       my_qp->ipz_rqueue.current_q_offset = start_offset;
-                       ret = -EINVAL;
-                       ehca_err(dev, "Could not write WQE "
-                               "qp_num=%x", my_qp->real_qp_num);
-                       goto post_recv_exit0;
-               }
-
-               qmap_entry = &my_qp->rq_map.map[rq_map_idx];
-               qmap_entry->app_wr_id = get_app_wr_id(recv_wr->wr_id);
-               qmap_entry->reported = 0;
-               qmap_entry->cqe_req = 1;
-
-               wqe_cnt++;
-               recv_wr = recv_wr->next;
-       } /* eof for recv_wr */
-
-post_recv_exit0:
-       iosync(); /* serialize GAL register access */
-       hipz_update_rqa(my_qp, wqe_cnt);
-       if (unlikely(ret || ehca_debug_level >= 2))
-           ehca_dbg(dev, "ehca_qp=%p qp_num=%x wqe_cnt=%d ret=%i",
-                    my_qp, my_qp->real_qp_num, wqe_cnt, ret);
-       spin_unlock_irqrestore(&my_qp->spinlock_r, flags);
-
-out:
-       if (ret)
-               *bad_recv_wr = recv_wr;
-
-       return ret;
-}
-
-int ehca_post_recv(struct ib_qp *qp,
-                  struct ib_recv_wr *recv_wr,
-                  struct ib_recv_wr **bad_recv_wr)
-{
-       struct ehca_qp *my_qp = container_of(qp, struct ehca_qp, ib_qp);
-
-       /* Reject WR if QP is in RESET state */
-       if (unlikely(my_qp->state == IB_QPS_RESET)) {
-               ehca_err(qp->device, "Invalid QP state  qp_state=%d qpn=%x",
-                        my_qp->state, qp->qp_num);
-               *bad_recv_wr = recv_wr;
-               return -EINVAL;
-       }
-
-       return internal_post_recv(my_qp, qp->device, recv_wr, bad_recv_wr);
-}
-
-int ehca_post_srq_recv(struct ib_srq *srq,
-                      struct ib_recv_wr *recv_wr,
-                      struct ib_recv_wr **bad_recv_wr)
-{
-       return internal_post_recv(container_of(srq, struct ehca_qp, ib_srq),
-                                 srq->device, recv_wr, bad_recv_wr);
-}
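
The receive path above implements the usual ib_post_recv() contract: WRs are consumed from the chain until one fails, and *bad_recv_wr is left pointing at the first WR that was not posted. A minimal caller-side sketch of that contract (QP setup and buffer registration are assumed to happen elsewhere; the helper name is illustrative, not part of the driver):

#include <rdma/ib_verbs.h>
#include <linux/printk.h>

/* Post a chain of receive WRs; on failure, bad_wr marks the first WR that
 * was not accepted -- everything before it is already on the receive queue. */
static int post_recv_chain(struct ib_qp *qp, struct ib_recv_wr *wr)
{
	struct ib_recv_wr *bad_wr = NULL;
	int ret = ib_post_recv(qp, wr, &bad_wr);

	if (ret)
		pr_err("ib_post_recv failed (%d) at wr_id=%llu\n", ret,
		       bad_wr ? (unsigned long long)bad_wr->wr_id : 0ULL);

	return ret;
}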
-
-/*
- * ib_wc_opcode table converts ehca wc opcode to ib
- * Since we use zero to indicate invalid opcode, the actual ib opcode must
- * be decremented!!!
- */
-static const u8 ib_wc_opcode[255] = {
-       [0x01] = IB_WC_RECV+1,
-       [0x02] = IB_WC_RECV_RDMA_WITH_IMM+1,
-       [0x04] = IB_WC_BIND_MW+1,
-       [0x08] = IB_WC_FETCH_ADD+1,
-       [0x10] = IB_WC_COMP_SWAP+1,
-       [0x20] = IB_WC_RDMA_WRITE+1,
-       [0x40] = IB_WC_RDMA_READ+1,
-       [0x80] = IB_WC_SEND+1
-};
-
-/* internal function to poll one entry of cq */
-static inline int ehca_poll_cq_one(struct ib_cq *cq, struct ib_wc *wc)
-{
-       int ret = 0, qmap_tail_idx;
-       struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq);
-       struct ehca_cqe *cqe;
-       struct ehca_qp *my_qp;
-       struct ehca_qmap_entry *qmap_entry;
-       struct ehca_queue_map *qmap;
-       int cqe_count = 0, is_error;
-
-repoll:
-       cqe = (struct ehca_cqe *)
-               ipz_qeit_get_inc_valid(&my_cq->ipz_queue);
-       if (!cqe) {
-               ret = -EAGAIN;
-               if (ehca_debug_level >= 3)
-                       ehca_dbg(cq->device, "Completion queue is empty  "
-                                "my_cq=%p cq_num=%x", my_cq, my_cq->cq_number);
-               goto poll_cq_one_exit0;
-       }
-
-       /* prevents loads being reordered across this point */
-       rmb();
-
-       cqe_count++;
-       if (unlikely(cqe->status & WC_STATUS_PURGE_BIT)) {
-               struct ehca_qp *qp;
-               int purgeflag;
-               unsigned long flags;
-
-               qp = ehca_cq_get_qp(my_cq, cqe->local_qp_number);
-               if (!qp) {
-                       ehca_err(cq->device, "cq_num=%x qp_num=%x "
-                                "could not find qp -> ignore cqe",
-                                my_cq->cq_number, cqe->local_qp_number);
-                       ehca_dmp(cqe, 64, "cq_num=%x qp_num=%x",
-                                my_cq->cq_number, cqe->local_qp_number);
-                       /* ignore this purged cqe */
-                       goto repoll;
-               }
-               spin_lock_irqsave(&qp->spinlock_s, flags);
-               purgeflag = qp->sqerr_purgeflag;
-               spin_unlock_irqrestore(&qp->spinlock_s, flags);
-
-               if (purgeflag) {
-                       ehca_dbg(cq->device,
-                                "Got CQE with purged bit qp_num=%x src_qp=%x",
-                                cqe->local_qp_number, cqe->remote_qp_number);
-                       if (ehca_debug_level >= 2)
-                               ehca_dmp(cqe, 64, "qp_num=%x src_qp=%x",
-                                        cqe->local_qp_number,
-                                        cqe->remote_qp_number);
-               /*
-                * Ignore this CQE to avoid duplicate CQEs for the bad WQE
-                * that caused the SQ error, and turn off the purge flag.
-                */
-                       qp->sqerr_purgeflag = 0;
-                       goto repoll;
-               }
-       }
-
-       is_error = cqe->status & WC_STATUS_ERROR_BIT;
-
-       /* trace error CQEs if debug_level >= 1, trace all CQEs if >= 3 */
-       if (unlikely(ehca_debug_level >= 3 || (ehca_debug_level && is_error))) {
-               ehca_dbg(cq->device,
-                        "Received %sCOMPLETION ehca_cq=%p cq_num=%x -----",
-                        is_error ? "ERROR " : "", my_cq, my_cq->cq_number);
-               ehca_dmp(cqe, 64, "ehca_cq=%p cq_num=%x",
-                        my_cq, my_cq->cq_number);
-               ehca_dbg(cq->device,
-                        "ehca_cq=%p cq_num=%x -------------------------",
-                        my_cq, my_cq->cq_number);
-       }
-
-       read_lock(&ehca_qp_idr_lock);
-       my_qp = idr_find(&ehca_qp_idr, cqe->qp_token);
-       read_unlock(&ehca_qp_idr_lock);
-       if (!my_qp)
-               goto repoll;
-       wc->qp = &my_qp->ib_qp;
-
-       qmap_tail_idx = get_app_wr_id(cqe->work_request_id);
-       if (!(cqe->w_completion_flags & WC_SEND_RECEIVE_BIT))
-               /* We got a send completion. */
-               qmap = &my_qp->sq_map;
-       else
-               /* We got a receive completion. */
-               qmap = &my_qp->rq_map;
-
-       /* advance the tail pointer */
-       qmap->tail = qmap_tail_idx;
-
-       if (is_error) {
-               /*
-                * set left_to_poll to 0 because in error state, we will not
-                * get any additional CQEs
-                */
-               my_qp->sq_map.next_wqe_idx = next_index(my_qp->sq_map.tail,
-                                                       my_qp->sq_map.entries);
-               my_qp->sq_map.left_to_poll = 0;
-               ehca_add_to_err_list(my_qp, 1);
-
-               my_qp->rq_map.next_wqe_idx = next_index(my_qp->rq_map.tail,
-                                                       my_qp->rq_map.entries);
-               my_qp->rq_map.left_to_poll = 0;
-               if (HAS_RQ(my_qp))
-                       ehca_add_to_err_list(my_qp, 0);
-       }
-
-       qmap_entry = &qmap->map[qmap_tail_idx];
-       if (qmap_entry->reported) {
-               ehca_warn(cq->device, "Double cqe on qp_num=%#x",
-                               my_qp->real_qp_num);
-               /* found a double cqe, discard it and read next one */
-               goto repoll;
-       }
-
-       wc->wr_id = replace_wr_id(cqe->work_request_id, qmap_entry->app_wr_id);
-       qmap_entry->reported = 1;
-
-       /* if left_to_poll is decremented to 0, add the QP to the error list */
-       if (qmap->left_to_poll > 0) {
-               qmap->left_to_poll--;
-               if ((my_qp->sq_map.left_to_poll == 0) &&
-                               (my_qp->rq_map.left_to_poll == 0)) {
-                       ehca_add_to_err_list(my_qp, 1);
-                       if (HAS_RQ(my_qp))
-                               ehca_add_to_err_list(my_qp, 0);
-               }
-       }
-
-       /* eval ib_wc_opcode */
-       wc->opcode = ib_wc_opcode[cqe->optype]-1;
-       if (unlikely(wc->opcode == -1)) {
-               ehca_err(cq->device, "Invalid cqe->OPType=%x cqe->status=%x "
-                        "ehca_cq=%p cq_num=%x",
-                        cqe->optype, cqe->status, my_cq, my_cq->cq_number);
-               /* dump cqe for other infos */
-               ehca_dmp(cqe, 64, "ehca_cq=%p cq_num=%x",
-                        my_cq, my_cq->cq_number);
-               /* update also queue adder to throw away this entry!!! */
-               goto repoll;
-       }
-
-       /* eval ib_wc_status */
-       if (unlikely(is_error)) {
-               /* complete with errors */
-               map_ib_wc_status(cqe->status, &wc->status);
-               wc->vendor_err = wc->status;
-       } else
-               wc->status = IB_WC_SUCCESS;
-
-       wc->byte_len = cqe->nr_bytes_transferred;
-       wc->pkey_index = cqe->pkey_index;
-       wc->slid = cqe->rlid;
-       wc->dlid_path_bits = cqe->dlid;
-       wc->src_qp = cqe->remote_qp_number;
-       /*
-        * HW has "Immed data present" and "GRH present" in bits 6 and 5.
-        * SW defines those in bits 1 and 0, so we can just shift and mask.
-        */
-       wc->wc_flags = (cqe->w_completion_flags >> 5) & 3;
-       wc->ex.imm_data = cpu_to_be32(cqe->immediate_data);
-       wc->sl = cqe->service_level;
-
-poll_cq_one_exit0:
-       if (cqe_count > 0)
-               hipz_update_feca(my_cq, cqe_count);
-
-       return ret;
-}
-
-static int generate_flush_cqes(struct ehca_qp *my_qp, struct ib_cq *cq,
-                              struct ib_wc *wc, int num_entries,
-                              struct ipz_queue *ipz_queue, int on_sq)
-{
-       int nr = 0;
-       struct ehca_wqe *wqe;
-       u64 offset;
-       struct ehca_queue_map *qmap;
-       struct ehca_qmap_entry *qmap_entry;
-
-       if (on_sq)
-               qmap = &my_qp->sq_map;
-       else
-               qmap = &my_qp->rq_map;
-
-       qmap_entry = &qmap->map[qmap->next_wqe_idx];
-
-       while ((nr < num_entries) && (qmap_entry->reported == 0)) {
-               /* generate flush CQE */
-
-               memset(wc, 0, sizeof(*wc));
-
-               offset = qmap->next_wqe_idx * ipz_queue->qe_size;
-               wqe = (struct ehca_wqe *)ipz_qeit_calc(ipz_queue, offset);
-               if (!wqe) {
-                       ehca_err(cq->device, "Invalid wqe offset=%#llx on "
-                                "qp_num=%#x", offset, my_qp->real_qp_num);
-                       return nr;
-               }
-
-               wc->wr_id = replace_wr_id(wqe->work_request_id,
-                                         qmap_entry->app_wr_id);
-
-               if (on_sq) {
-                       switch (wqe->optype) {
-                       case WQE_OPTYPE_SEND:
-                               wc->opcode = IB_WC_SEND;
-                               break;
-                       case WQE_OPTYPE_RDMAWRITE:
-                               wc->opcode = IB_WC_RDMA_WRITE;
-                               break;
-                       case WQE_OPTYPE_RDMAREAD:
-                               wc->opcode = IB_WC_RDMA_READ;
-                               break;
-                       default:
-                               ehca_err(cq->device, "Invalid optype=%x",
-                                               wqe->optype);
-                               return nr;
-                       }
-               } else
-                       wc->opcode = IB_WC_RECV;
-
-               if (wqe->wr_flag & WQE_WRFLAG_IMM_DATA_PRESENT) {
-                       wc->ex.imm_data = wqe->immediate_data;
-                       wc->wc_flags |= IB_WC_WITH_IMM;
-               }
-
-               wc->status = IB_WC_WR_FLUSH_ERR;
-
-               wc->qp = &my_qp->ib_qp;
-
-               /* mark as reported and advance next_wqe pointer */
-               qmap_entry->reported = 1;
-               qmap->next_wqe_idx = next_index(qmap->next_wqe_idx,
-                                               qmap->entries);
-               qmap_entry = &qmap->map[qmap->next_wqe_idx];
-
-               wc++; nr++;
-       }
-
-       return nr;
-
-}
-
-int ehca_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc)
-{
-       struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq);
-       int nr;
-       struct ehca_qp *err_qp;
-       struct ib_wc *current_wc = wc;
-       int ret = 0;
-       unsigned long flags;
-       int entries_left = num_entries;
-
-       if (num_entries < 1) {
-               ehca_err(cq->device, "Invalid num_entries=%d ehca_cq=%p "
-                        "cq_num=%x", num_entries, my_cq, my_cq->cq_number);
-               ret = -EINVAL;
-               goto poll_cq_exit0;
-       }
-
-       spin_lock_irqsave(&my_cq->spinlock, flags);
-
-       /* generate flush cqes for send queues */
-       list_for_each_entry(err_qp, &my_cq->sqp_err_list, sq_err_node) {
-               nr = generate_flush_cqes(err_qp, cq, current_wc, entries_left,
-                               &err_qp->ipz_squeue, 1);
-               entries_left -= nr;
-               current_wc += nr;
-
-               if (entries_left == 0)
-                       break;
-       }
-
-       /* generate flush cqes for receive queues */
-       list_for_each_entry(err_qp, &my_cq->rqp_err_list, rq_err_node) {
-               nr = generate_flush_cqes(err_qp, cq, current_wc, entries_left,
-                               &err_qp->ipz_rqueue, 0);
-               entries_left -= nr;
-               current_wc += nr;
-
-               if (entries_left == 0)
-                       break;
-       }
-
-       for (nr = 0; nr < entries_left; nr++) {
-               ret = ehca_poll_cq_one(cq, current_wc);
-               if (ret)
-                       break;
-               current_wc++;
-       } /* eof for nr */
-       entries_left -= nr;
-
-       spin_unlock_irqrestore(&my_cq->spinlock, flags);
-       if (ret == -EAGAIN  || !ret)
-               ret = num_entries - entries_left;
-
-poll_cq_exit0:
-       return ret;
-}
-
-int ehca_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags notify_flags)
-{
-       struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq);
-       int ret = 0;
-
-       switch (notify_flags & IB_CQ_SOLICITED_MASK) {
-       case IB_CQ_SOLICITED:
-               hipz_set_cqx_n0(my_cq, 1);
-               break;
-       case IB_CQ_NEXT_COMP:
-               hipz_set_cqx_n1(my_cq, 1);
-               break;
-       default:
-               return -EINVAL;
-       }
-
-       if (notify_flags & IB_CQ_REPORT_MISSED_EVENTS) {
-               unsigned long spl_flags;
-               spin_lock_irqsave(&my_cq->spinlock, spl_flags);
-               ret = ipz_qeit_is_valid(&my_cq->ipz_queue);
-               spin_unlock_irqrestore(&my_cq->spinlock, spl_flags);
-       }
-
-       return ret;
-}
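
ehca_req_notify_cq() above returns a positive value when IB_CQ_REPORT_MISSED_EVENTS is requested and valid CQEs are already queued, which tells the consumer to poll again rather than wait for the next completion event. A minimal consumer-side sketch of that re-arm/re-poll loop, assuming the CQ and its completion handling already exist:

#include <rdma/ib_verbs.h>

/* Drain the CQ, then re-arm it; if re-arming reports missed events,
 * drain again so no completion is lost between poll and re-arm. */
static void drain_and_rearm(struct ib_cq *cq)
{
	struct ib_wc wc;

	do {
		while (ib_poll_cq(cq, 1, &wc) > 0)
			; /* process wc here */
	} while (ib_req_notify_cq(cq, IB_CQ_NEXT_COMP |
				      IB_CQ_REPORT_MISSED_EVENTS) > 0);
}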
diff --git a/drivers/infiniband/hw/ehca/ehca_sqp.c b/drivers/infiniband/hw/ehca/ehca_sqp.c
deleted file mode 100644 (file)
index 376b031..0000000
+++ /dev/null
@@ -1,245 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  SQP functions
- *
- *  Authors: Khadija Souissi <souissi@de.ibm.com>
- *           Heiko J Schick <schickhj@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <rdma/ib_mad.h>
-
-#include "ehca_classes.h"
-#include "ehca_tools.h"
-#include "ehca_iverbs.h"
-#include "hcp_if.h"
-
-#define IB_MAD_STATUS_REDIRECT         cpu_to_be16(0x0002)
-#define IB_MAD_STATUS_UNSUP_VERSION    cpu_to_be16(0x0004)
-#define IB_MAD_STATUS_UNSUP_METHOD     cpu_to_be16(0x0008)
-
-#define IB_PMA_CLASS_PORT_INFO         cpu_to_be16(0x0001)
-
-/**
- * ehca_define_sqp - Defines special queue pair 1 (GSI QP). When the special
- * queue pair is created successfully, the corresponding port becomes active.
- *
- * Defining special queue pair 0 (SMI QP) is not yet supported.
- *
- * @qp_init_attr: Queue pair init attributes with port and queue pair type
- */
-
-u64 ehca_define_sqp(struct ehca_shca *shca,
-                   struct ehca_qp *ehca_qp,
-                   struct ib_qp_init_attr *qp_init_attr)
-{
-       u32 pma_qp_nr, bma_qp_nr;
-       u64 ret;
-       u8 port = qp_init_attr->port_num;
-       int counter;
-
-       shca->sport[port - 1].port_state = IB_PORT_DOWN;
-
-       switch (qp_init_attr->qp_type) {
-       case IB_QPT_SMI:
-               /* function not supported yet */
-               break;
-       case IB_QPT_GSI:
-               ret = hipz_h_define_aqp1(shca->ipz_hca_handle,
-                                        ehca_qp->ipz_qp_handle,
-                                        ehca_qp->galpas.kernel,
-                                        (u32) qp_init_attr->port_num,
-                                        &pma_qp_nr, &bma_qp_nr);
-
-               if (ret != H_SUCCESS) {
-                       ehca_err(&shca->ib_device,
-                                "Can't define AQP1 for port %x. h_ret=%lli",
-                                port, ret);
-                       return ret;
-               }
-               shca->sport[port - 1].pma_qp_nr = pma_qp_nr;
-               ehca_dbg(&shca->ib_device, "port=%x pma_qp_nr=%x",
-                        port, pma_qp_nr);
-               break;
-       default:
-               ehca_err(&shca->ib_device, "invalid qp_type=%x",
-                        qp_init_attr->qp_type);
-               return H_PARAMETER;
-       }
-
-       if (ehca_nr_ports < 0) /* autodetect mode */
-               return H_SUCCESS;
-
-       for (counter = 0;
-            shca->sport[port - 1].port_state != IB_PORT_ACTIVE &&
-                    counter < ehca_port_act_time;
-            counter++) {
-               ehca_dbg(&shca->ib_device, "... wait until port %x is active",
-                        port);
-               msleep_interruptible(1000);
-       }
-
-       if (counter == ehca_port_act_time) {
-               ehca_err(&shca->ib_device, "Port %x is not active.", port);
-               return H_HARDWARE;
-       }
-
-       return H_SUCCESS;
-}
-
-struct ib_perf {
-       struct ib_mad_hdr mad_hdr;
-       u8 reserved[40];
-       u8 data[192];
-} __attribute__ ((packed));
-
-/* TC/SL/FL packed into 32 bits, as in ClassPortInfo */
-struct tcslfl {
-       u32 tc:8;
-       u32 sl:4;
-       u32 fl:20;
-} __attribute__ ((packed));
-
-/* IP Version/TC/FL packed into 32 bits, as in GRH */
-struct vertcfl {
-       u32 ver:4;
-       u32 tc:8;
-       u32 fl:20;
-} __attribute__ ((packed));
-
-static int ehca_process_perf(struct ib_device *ibdev, u8 port_num,
-                            const struct ib_wc *in_wc, const struct ib_grh *in_grh,
-                            const struct ib_mad *in_mad, struct ib_mad *out_mad)
-{
-       const struct ib_perf *in_perf = (const struct ib_perf *)in_mad;
-       struct ib_perf *out_perf = (struct ib_perf *)out_mad;
-       struct ib_class_port_info *poi =
-               (struct ib_class_port_info *)out_perf->data;
-       struct tcslfl *tcslfl =
-               (struct tcslfl *)&poi->redirect_tcslfl;
-       struct ehca_shca *shca =
-               container_of(ibdev, struct ehca_shca, ib_device);
-       struct ehca_sport *sport = &shca->sport[port_num - 1];
-
-       ehca_dbg(ibdev, "method=%x", in_perf->mad_hdr.method);
-
-       *out_mad = *in_mad;
-
-       if (in_perf->mad_hdr.class_version != 1) {
-               ehca_warn(ibdev, "Unsupported class_version=%x",
-                         in_perf->mad_hdr.class_version);
-               out_perf->mad_hdr.status = IB_MAD_STATUS_UNSUP_VERSION;
-               goto perf_reply;
-       }
-
-       switch (in_perf->mad_hdr.method) {
-       case IB_MGMT_METHOD_GET:
-       case IB_MGMT_METHOD_SET:
-               /* set class port info for redirection */
-               out_perf->mad_hdr.attr_id = IB_PMA_CLASS_PORT_INFO;
-               out_perf->mad_hdr.status = IB_MAD_STATUS_REDIRECT;
-               memset(poi, 0, sizeof(*poi));
-               poi->base_version = 1;
-               poi->class_version = 1;
-               poi->resp_time_value = 18;
-
-               /* copy local routing information from WC where applicable */
-               tcslfl->sl         = in_wc->sl;
-               poi->redirect_lid  =
-                       sport->saved_attr.lid | in_wc->dlid_path_bits;
-               poi->redirect_qp   = sport->pma_qp_nr;
-               poi->redirect_qkey = IB_QP1_QKEY;
-
-               ehca_query_pkey(ibdev, port_num, in_wc->pkey_index,
-                               &poi->redirect_pkey);
-
-               /* if request was globally routed, copy route info */
-               if (in_grh) {
-                       const struct vertcfl *vertcfl =
-                               (const struct vertcfl *)&in_grh->version_tclass_flow;
-                       memcpy(poi->redirect_gid, in_grh->dgid.raw,
-                              sizeof(poi->redirect_gid));
-                       tcslfl->tc        = vertcfl->tc;
-                       tcslfl->fl        = vertcfl->fl;
-               } else
-                       /* else only fill in default GID */
-                       ehca_query_gid(ibdev, port_num, 0,
-                                      (union ib_gid *)&poi->redirect_gid);
-
-               ehca_dbg(ibdev, "ehca_pma_lid=%x ehca_pma_qp=%x",
-                        sport->saved_attr.lid, sport->pma_qp_nr);
-               break;
-
-       case IB_MGMT_METHOD_GET_RESP:
-               return IB_MAD_RESULT_FAILURE;
-
-       default:
-               out_perf->mad_hdr.status = IB_MAD_STATUS_UNSUP_METHOD;
-               break;
-       }
-
-perf_reply:
-       out_perf->mad_hdr.method = IB_MGMT_METHOD_GET_RESP;
-
-       return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
-}
-
-int ehca_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
-                    const struct ib_wc *in_wc, const struct ib_grh *in_grh,
-                    const struct ib_mad_hdr *in, size_t in_mad_size,
-                    struct ib_mad_hdr *out, size_t *out_mad_size,
-                    u16 *out_mad_pkey_index)
-{
-       int ret;
-       const struct ib_mad *in_mad = (const struct ib_mad *)in;
-       struct ib_mad *out_mad = (struct ib_mad *)out;
-
-       if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) ||
-                        *out_mad_size != sizeof(*out_mad)))
-               return IB_MAD_RESULT_FAILURE;
-
-       if (!port_num || port_num > ibdev->phys_port_cnt || !in_wc)
-               return IB_MAD_RESULT_FAILURE;
-
-       /* accept only pma request */
-       if (in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_PERF_MGMT)
-               return IB_MAD_RESULT_SUCCESS;
-
-       ehca_dbg(ibdev, "port_num=%x src_qp=%x", port_num, in_wc->src_qp);
-       ret = ehca_process_perf(ibdev, port_num, in_wc, in_grh,
-                               in_mad, out_mad);
-
-       return ret;
-}
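
The tcslfl and vertcfl structs above use C bitfields to split 32-bit words into traffic class, service level/version and flow label. The same GRH split written with explicit shifts, which keeps the layout independent of compiler bitfield ordering (a standalone sketch, not part of the driver; conversion from wire byte order is assumed to have been done already):

#include <stdint.h>

/* GRH version_tclass_flow layout: version in bits 31..28, traffic class in
 * bits 27..20, flow label in bits 19..0. */
struct grh_vtf {
	uint8_t  version;
	uint8_t  tclass;
	uint32_t flow_label;
};

static struct grh_vtf decode_vtf(uint32_t vtf)
{
	struct grh_vtf out = {
		.version    = (vtf >> 28) & 0xf,
		.tclass     = (vtf >> 20) & 0xff,
		.flow_label = vtf & 0xfffff,
	};
	return out;
}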
diff --git a/drivers/infiniband/hw/ehca/ehca_tools.h b/drivers/infiniband/hw/ehca/ehca_tools.h
deleted file mode 100644 (file)
index d280b12..0000000
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  auxiliary functions
- *
- *  Authors: Christoph Raisch <raisch@de.ibm.com>
- *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *           Khadija Souissi <souissik@de.ibm.com>
- *           Waleri Fomin <fomin@de.ibm.com>
- *           Heiko J Schick <schickhj@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-
-#ifndef EHCA_TOOLS_H
-#define EHCA_TOOLS_H
-
-#include <linux/kernel.h>
-#include <linux/spinlock.h>
-#include <linux/delay.h>
-#include <linux/idr.h>
-#include <linux/kthread.h>
-#include <linux/mm.h>
-#include <linux/mman.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/vmalloc.h>
-#include <linux/notifier.h>
-#include <linux/cpu.h>
-#include <linux/device.h>
-
-#include <linux/atomic.h>
-#include <asm/ibmebus.h>
-#include <asm/io.h>
-#include <asm/pgtable.h>
-#include <asm/hvcall.h>
-
-extern int ehca_debug_level;
-
-#define ehca_dbg(ib_dev, format, arg...) \
-       do { \
-               if (unlikely(ehca_debug_level)) \
-                       dev_printk(KERN_DEBUG, (ib_dev)->dma_device, \
-                                  "PU%04x EHCA_DBG:%s " format "\n", \
-                                  raw_smp_processor_id(), __func__, \
-                                  ## arg); \
-       } while (0)
-
-#define ehca_info(ib_dev, format, arg...) \
-       dev_info((ib_dev)->dma_device, "PU%04x EHCA_INFO:%s " format "\n", \
-                raw_smp_processor_id(), __func__, ## arg)
-
-#define ehca_warn(ib_dev, format, arg...) \
-       dev_warn((ib_dev)->dma_device, "PU%04x EHCA_WARN:%s " format "\n", \
-                raw_smp_processor_id(), __func__, ## arg)
-
-#define ehca_err(ib_dev, format, arg...) \
-       dev_err((ib_dev)->dma_device, "PU%04x EHCA_ERR:%s " format "\n", \
-               raw_smp_processor_id(), __func__, ## arg)
-
-/* use this one only if no ib_dev available */
-#define ehca_gen_dbg(format, arg...) \
-       do { \
-               if (unlikely(ehca_debug_level)) \
-                       printk(KERN_DEBUG "PU%04x EHCA_DBG:%s " format "\n", \
-                              raw_smp_processor_id(), __func__, ## arg); \
-       } while (0)
-
-#define ehca_gen_warn(format, arg...) \
-       printk(KERN_INFO "PU%04x EHCA_WARN:%s " format "\n", \
-              raw_smp_processor_id(), __func__, ## arg)
-
-#define ehca_gen_err(format, arg...) \
-       printk(KERN_ERR "PU%04x EHCA_ERR:%s " format "\n", \
-              raw_smp_processor_id(), __func__, ## arg)
-
-/**
- * ehca_dmp - printk a memory block, whose length is n*8 bytes.
- * Each line has the following layout:
- * <format string> adr=X ofs=Y <8 bytes hex> <8 bytes hex>
- */
-#define ehca_dmp(adr, len, format, args...) \
-       do { \
-               unsigned int x; \
-               unsigned int l = (unsigned int)(len); \
-               unsigned char *deb = (unsigned char *)(adr); \
-               for (x = 0; x < l; x += 16) { \
-                       printk(KERN_INFO "EHCA_DMP:%s " format \
-                              " adr=%p ofs=%04x %016llx %016llx\n", \
-                              __func__, ##args, deb, x, \
-                              *((u64 *)&deb[0]), *((u64 *)&deb[8])); \
-                       deb += 16; \
-               } \
-       } while (0)
-
-/* define a bitmask, little endian version */
-#define EHCA_BMASK(pos, length) (((pos) << 16) + (length))
-
-/* define a bitmask, the ibm way... */
-#define EHCA_BMASK_IBM(from, to) (((63 - to) << 16) + ((to) - (from) + 1))
-
-/* internal function, don't use */
-#define EHCA_BMASK_SHIFTPOS(mask) (((mask) >> 16) & 0xffff)
-
-/* internal function, don't use */
-#define EHCA_BMASK_MASK(mask) (~0ULL >> ((64 - (mask)) & 0xffff))
-
-/**
- * EHCA_BMASK_SET - return value shifted and masked by mask
- * variable|=EHCA_BMASK_SET(MY_MASK,0x4711) ORs the bits in variable
- * variable&=~EHCA_BMASK_SET(MY_MASK,-1) clears the bits from the mask
- * in variable
- */
-#define EHCA_BMASK_SET(mask, value) \
-       ((EHCA_BMASK_MASK(mask) & ((u64)(value))) << EHCA_BMASK_SHIFTPOS(mask))
-
-/**
- * EHCA_BMASK_GET - extract a parameter from value by mask
- */
-#define EHCA_BMASK_GET(mask, value) \
-       (EHCA_BMASK_MASK(mask) & (((u64)(value)) >> EHCA_BMASK_SHIFTPOS(mask)))
-
-/* Converts ehca to ib return code */
-int ehca2ib_return_code(u64 ehca_rc);
-
-#endif /* EHCA_TOOLS_H */
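
EHCA_BMASK_IBM() above encodes a field descriptor as a plain integer: the shift position sits in the upper 16 bits and the field width in the lower 16 bits, with IBM bit numbering (bit 0 is the most significant bit of the 64-bit word). A small standalone example of how SET and GET behave for a 3-bit field at IBM bits 13..15, i.e. a shift of 48 (the macro names are local copies made for this sketch):

#include <stdio.h>
#include <stdint.h>

#define BMASK_IBM(from, to)	(((63 - (to)) << 16) + ((to) - (from) + 1))
#define BMASK_SHIFTPOS(m)	(((m) >> 16) & 0xffff)
#define BMASK_MASK(m)		(~0ULL >> ((64 - (m)) & 0xffff))
#define BMASK_SET(m, v)		((BMASK_MASK(m) & ((uint64_t)(v))) << BMASK_SHIFTPOS(m))
#define BMASK_GET(m, v)		(BMASK_MASK(m) & (((uint64_t)(v)) >> BMASK_SHIFTPOS(m)))

#define QP_SERVICE_TYPE		BMASK_IBM(13, 15)	/* width 3, shift 48 */

int main(void)
{
	uint64_t reg = BMASK_SET(QP_SERVICE_TYPE, 5);	/* 5ULL << 48 */

	printf("reg=%#llx service_type=%llu\n",
	       (unsigned long long)reg,
	       (unsigned long long)BMASK_GET(QP_SERVICE_TYPE, reg));
	return 0;
}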
diff --git a/drivers/infiniband/hw/ehca/ehca_uverbs.c b/drivers/infiniband/hw/ehca/ehca_uverbs.c
deleted file mode 100644 (file)
index 1a1d5d9..0000000
+++ /dev/null
@@ -1,309 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  userspace support verbs
- *
- *  Authors: Christoph Raisch <raisch@de.ibm.com>
- *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *           Heiko J Schick <schickhj@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/slab.h>
-
-#include "ehca_classes.h"
-#include "ehca_iverbs.h"
-#include "ehca_mrmw.h"
-#include "ehca_tools.h"
-#include "hcp_if.h"
-
-struct ib_ucontext *ehca_alloc_ucontext(struct ib_device *device,
-                                       struct ib_udata *udata)
-{
-       struct ehca_ucontext *my_context;
-
-       my_context = kzalloc(sizeof *my_context, GFP_KERNEL);
-       if (!my_context) {
-               ehca_err(device, "Out of memory device=%p", device);
-               return ERR_PTR(-ENOMEM);
-       }
-
-       return &my_context->ib_ucontext;
-}
-
-int ehca_dealloc_ucontext(struct ib_ucontext *context)
-{
-       kfree(container_of(context, struct ehca_ucontext, ib_ucontext));
-       return 0;
-}
-
-static void ehca_mm_open(struct vm_area_struct *vma)
-{
-       u32 *count = (u32 *)vma->vm_private_data;
-       if (!count) {
-               ehca_gen_err("Invalid vma struct vm_start=%lx vm_end=%lx",
-                            vma->vm_start, vma->vm_end);
-               return;
-       }
-       (*count)++;
-       if (!(*count))
-               ehca_gen_err("Use count overflow vm_start=%lx vm_end=%lx",
-                            vma->vm_start, vma->vm_end);
-       ehca_gen_dbg("vm_start=%lx vm_end=%lx count=%x",
-                    vma->vm_start, vma->vm_end, *count);
-}
-
-static void ehca_mm_close(struct vm_area_struct *vma)
-{
-       u32 *count = (u32 *)vma->vm_private_data;
-       if (!count) {
-               ehca_gen_err("Invalid vma struct vm_start=%lx vm_end=%lx",
-                            vma->vm_start, vma->vm_end);
-               return;
-       }
-       (*count)--;
-       ehca_gen_dbg("vm_start=%lx vm_end=%lx count=%x",
-                    vma->vm_start, vma->vm_end, *count);
-}
-
-static const struct vm_operations_struct vm_ops = {
-       .open = ehca_mm_open,
-       .close = ehca_mm_close,
-};
-
-static int ehca_mmap_fw(struct vm_area_struct *vma, struct h_galpas *galpas,
-                       u32 *mm_count)
-{
-       int ret;
-       u64 vsize, physical;
-
-       vsize = vma->vm_end - vma->vm_start;
-       if (vsize < EHCA_PAGESIZE) {
-               ehca_gen_err("invalid vsize=%lx", vma->vm_end - vma->vm_start);
-               return -EINVAL;
-       }
-
-       physical = galpas->user.fw_handle;
-       vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-       ehca_gen_dbg("vsize=%llx physical=%llx", vsize, physical);
-       /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */
-       ret = remap_4k_pfn(vma, vma->vm_start, physical >> EHCA_PAGESHIFT,
-                          vma->vm_page_prot);
-       if (unlikely(ret)) {
-               ehca_gen_err("remap_pfn_range() failed ret=%i", ret);
-               return -ENOMEM;
-       }
-
-       vma->vm_private_data = mm_count;
-       (*mm_count)++;
-       vma->vm_ops = &vm_ops;
-
-       return 0;
-}
-
-static int ehca_mmap_queue(struct vm_area_struct *vma, struct ipz_queue *queue,
-                          u32 *mm_count)
-{
-       int ret;
-       u64 start, ofs;
-       struct page *page;
-
-       vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
-       start = vma->vm_start;
-       for (ofs = 0; ofs < queue->queue_length; ofs += PAGE_SIZE) {
-               u64 virt_addr = (u64)ipz_qeit_calc(queue, ofs);
-               page = virt_to_page(virt_addr);
-               ret = vm_insert_page(vma, start, page);
-               if (unlikely(ret)) {
-                       ehca_gen_err("vm_insert_page() failed rc=%i", ret);
-                       return ret;
-               }
-               start += PAGE_SIZE;
-       }
-       vma->vm_private_data = mm_count;
-       (*mm_count)++;
-       vma->vm_ops = &vm_ops;
-
-       return 0;
-}
-
-static int ehca_mmap_cq(struct vm_area_struct *vma, struct ehca_cq *cq,
-                       u32 rsrc_type)
-{
-       int ret;
-
-       switch (rsrc_type) {
-       case 0: /* galpa fw handle */
-               ehca_dbg(cq->ib_cq.device, "cq_num=%x fw", cq->cq_number);
-               ret = ehca_mmap_fw(vma, &cq->galpas, &cq->mm_count_galpa);
-               if (unlikely(ret)) {
-                       ehca_err(cq->ib_cq.device,
-                                "ehca_mmap_fw() failed rc=%i cq_num=%x",
-                                ret, cq->cq_number);
-                       return ret;
-               }
-               break;
-
-       case 1: /* cq queue_addr */
-               ehca_dbg(cq->ib_cq.device, "cq_num=%x queue", cq->cq_number);
-               ret = ehca_mmap_queue(vma, &cq->ipz_queue, &cq->mm_count_queue);
-               if (unlikely(ret)) {
-                       ehca_err(cq->ib_cq.device,
-                                "ehca_mmap_queue() failed rc=%i cq_num=%x",
-                                ret, cq->cq_number);
-                       return ret;
-               }
-               break;
-
-       default:
-               ehca_err(cq->ib_cq.device, "bad resource type=%x cq_num=%x",
-                        rsrc_type, cq->cq_number);
-               return -EINVAL;
-       }
-
-       return 0;
-}
-
-static int ehca_mmap_qp(struct vm_area_struct *vma, struct ehca_qp *qp,
-                       u32 rsrc_type)
-{
-       int ret;
-
-       switch (rsrc_type) {
-       case 0: /* galpa fw handle */
-               ehca_dbg(qp->ib_qp.device, "qp_num=%x fw", qp->ib_qp.qp_num);
-               ret = ehca_mmap_fw(vma, &qp->galpas, &qp->mm_count_galpa);
-               if (unlikely(ret)) {
-                       ehca_err(qp->ib_qp.device,
-                                "ehca_mmap_fw() failed ret=%i qp_num=%x",
-                                ret, qp->ib_qp.qp_num);
-                       return -ENOMEM;
-               }
-               break;
-
-       case 1: /* qp rqueue_addr */
-               ehca_dbg(qp->ib_qp.device, "qp_num=%x rq", qp->ib_qp.qp_num);
-               ret = ehca_mmap_queue(vma, &qp->ipz_rqueue,
-                                     &qp->mm_count_rqueue);
-               if (unlikely(ret)) {
-                       ehca_err(qp->ib_qp.device,
-                                "ehca_mmap_queue(rq) failed rc=%i qp_num=%x",
-                                ret, qp->ib_qp.qp_num);
-                       return ret;
-               }
-               break;
-
-       case 2: /* qp squeue_addr */
-               ehca_dbg(qp->ib_qp.device, "qp_num=%x sq", qp->ib_qp.qp_num);
-               ret = ehca_mmap_queue(vma, &qp->ipz_squeue,
-                                     &qp->mm_count_squeue);
-               if (unlikely(ret)) {
-                       ehca_err(qp->ib_qp.device,
-                                "ehca_mmap_queue(sq) failed rc=%i qp_num=%x",
-                                ret, qp->ib_qp.qp_num);
-                       return ret;
-               }
-               break;
-
-       default:
-               ehca_err(qp->ib_qp.device, "bad resource type=%x qp_num=%x",
-                        rsrc_type, qp->ib_qp.qp_num);
-               return -EINVAL;
-       }
-
-       return 0;
-}
-
-int ehca_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
-{
-       u64 fileoffset = vma->vm_pgoff;
-       u32 idr_handle = fileoffset & 0x1FFFFFF;
-       u32 q_type = (fileoffset >> 27) & 0x1;    /* CQ, QP,...        */
-       u32 rsrc_type = (fileoffset >> 25) & 0x3; /* sq,rq,cmnd_window */
-       u32 ret;
-       struct ehca_cq *cq;
-       struct ehca_qp *qp;
-       struct ib_uobject *uobject;
-
-       switch (q_type) {
-       case  0: /* CQ */
-               read_lock(&ehca_cq_idr_lock);
-               cq = idr_find(&ehca_cq_idr, idr_handle);
-               read_unlock(&ehca_cq_idr_lock);
-
-               /* make sure this mmap really belongs to the authorized user */
-               if (!cq)
-                       return -EINVAL;
-
-               if (!cq->ib_cq.uobject || cq->ib_cq.uobject->context != context)
-                       return -EINVAL;
-
-               ret = ehca_mmap_cq(vma, cq, rsrc_type);
-               if (unlikely(ret)) {
-                       ehca_err(cq->ib_cq.device,
-                                "ehca_mmap_cq() failed rc=%i cq_num=%x",
-                                ret, cq->cq_number);
-                       return ret;
-               }
-               break;
-
-       case 1: /* QP */
-               read_lock(&ehca_qp_idr_lock);
-               qp = idr_find(&ehca_qp_idr, idr_handle);
-               read_unlock(&ehca_qp_idr_lock);
-
-               /* make sure this mmap really belongs to the authorized user */
-               if (!qp)
-                       return -EINVAL;
-
-               uobject = IS_SRQ(qp) ? qp->ib_srq.uobject : qp->ib_qp.uobject;
-               if (!uobject || uobject->context != context)
-                       return -EINVAL;
-
-               ret = ehca_mmap_qp(vma, qp, rsrc_type);
-               if (unlikely(ret)) {
-                       ehca_err(qp->ib_qp.device,
-                                "ehca_mmap_qp() failed rc=%i qp_num=%x",
-                                ret, qp->ib_qp.qp_num);
-                       return ret;
-               }
-               break;
-
-       default:
-               ehca_gen_err("bad queue type %x", q_type);
-               return -EINVAL;
-       }
-
-       return 0;
-}
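
ehca_mmap() above decodes vm_pgoff into an idr handle (bits 0..24), a resource type (bits 25..26) and a queue type (bit 27); userspace picks the matching mmap offset when it maps a CQ or QP resource. A small sketch of composing and decoding such a value, using a hypothetical helper name (the userspace side that actually builds the offset is not part of this diff):

#include <stdint.h>
#include <stdio.h>

/* Bits 0..24: idr handle, bits 25..26: resource type, bit 27: queue type. */
static uint64_t encode_pgoff(uint32_t idr_handle, unsigned int rsrc,
			     unsigned int qtype)
{
	return (idr_handle & 0x1FFFFFFULL) |
	       ((uint64_t)(rsrc & 0x3) << 25) |
	       ((uint64_t)(qtype & 0x1) << 27);
}

int main(void)
{
	uint64_t pgoff = encode_pgoff(42, 1, 1);	/* QP 42, rqueue */

	printf("idr=%llu rsrc=%llu qtype=%llu\n",
	       (unsigned long long)(pgoff & 0x1FFFFFF),
	       (unsigned long long)((pgoff >> 25) & 0x3),
	       (unsigned long long)((pgoff >> 27) & 0x1));
	return 0;
}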
diff --git a/drivers/infiniband/hw/ehca/hcp_if.c b/drivers/infiniband/hw/ehca/hcp_if.c
deleted file mode 100644 (file)
index 89517ff..0000000
+++ /dev/null
@@ -1,949 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  Firmware Infiniband Interface code for POWER
- *
- *  Authors: Christoph Raisch <raisch@de.ibm.com>
- *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *           Joachim Fenkes <fenkes@de.ibm.com>
- *           Gerd Bayer <gerd.bayer@de.ibm.com>
- *           Waleri Fomin <fomin@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <asm/hvcall.h>
-#include "ehca_tools.h"
-#include "hcp_if.h"
-#include "hcp_phyp.h"
-#include "hipz_fns.h"
-#include "ipz_pt_fn.h"
-
-#define H_ALL_RES_QP_ENHANCED_OPS       EHCA_BMASK_IBM(9, 11)
-#define H_ALL_RES_QP_PTE_PIN            EHCA_BMASK_IBM(12, 12)
-#define H_ALL_RES_QP_SERVICE_TYPE       EHCA_BMASK_IBM(13, 15)
-#define H_ALL_RES_QP_STORAGE            EHCA_BMASK_IBM(16, 17)
-#define H_ALL_RES_QP_LL_RQ_CQE_POSTING  EHCA_BMASK_IBM(18, 18)
-#define H_ALL_RES_QP_LL_SQ_CQE_POSTING  EHCA_BMASK_IBM(19, 21)
-#define H_ALL_RES_QP_SIGNALING_TYPE     EHCA_BMASK_IBM(22, 23)
-#define H_ALL_RES_QP_UD_AV_LKEY_CTRL    EHCA_BMASK_IBM(31, 31)
-#define H_ALL_RES_QP_SMALL_SQ_PAGE_SIZE EHCA_BMASK_IBM(32, 35)
-#define H_ALL_RES_QP_SMALL_RQ_PAGE_SIZE EHCA_BMASK_IBM(36, 39)
-#define H_ALL_RES_QP_RESOURCE_TYPE      EHCA_BMASK_IBM(56, 63)
-
-#define H_ALL_RES_QP_MAX_OUTST_SEND_WR  EHCA_BMASK_IBM(0, 15)
-#define H_ALL_RES_QP_MAX_OUTST_RECV_WR  EHCA_BMASK_IBM(16, 31)
-#define H_ALL_RES_QP_MAX_SEND_SGE       EHCA_BMASK_IBM(32, 39)
-#define H_ALL_RES_QP_MAX_RECV_SGE       EHCA_BMASK_IBM(40, 47)
-
-#define H_ALL_RES_QP_UD_AV_LKEY         EHCA_BMASK_IBM(32, 63)
-#define H_ALL_RES_QP_SRQ_QP_TOKEN       EHCA_BMASK_IBM(0, 31)
-#define H_ALL_RES_QP_SRQ_QP_HANDLE      EHCA_BMASK_IBM(0, 64)
-#define H_ALL_RES_QP_SRQ_LIMIT          EHCA_BMASK_IBM(48, 63)
-#define H_ALL_RES_QP_SRQ_QPN            EHCA_BMASK_IBM(40, 63)
-
-#define H_ALL_RES_QP_ACT_OUTST_SEND_WR  EHCA_BMASK_IBM(16, 31)
-#define H_ALL_RES_QP_ACT_OUTST_RECV_WR  EHCA_BMASK_IBM(48, 63)
-#define H_ALL_RES_QP_ACT_SEND_SGE       EHCA_BMASK_IBM(8, 15)
-#define H_ALL_RES_QP_ACT_RECV_SGE       EHCA_BMASK_IBM(24, 31)
-
-#define H_ALL_RES_QP_SQUEUE_SIZE_PAGES  EHCA_BMASK_IBM(0, 31)
-#define H_ALL_RES_QP_RQUEUE_SIZE_PAGES  EHCA_BMASK_IBM(32, 63)
-
-#define H_MP_INIT_TYPE                  EHCA_BMASK_IBM(44, 47)
-#define H_MP_SHUTDOWN                   EHCA_BMASK_IBM(48, 48)
-#define H_MP_RESET_QKEY_CTR             EHCA_BMASK_IBM(49, 49)
-
-#define HCALL4_REGS_FORMAT "r4=%lx r5=%lx r6=%lx r7=%lx"
-#define HCALL7_REGS_FORMAT HCALL4_REGS_FORMAT " r8=%lx r9=%lx r10=%lx"
-#define HCALL9_REGS_FORMAT HCALL7_REGS_FORMAT " r11=%lx r12=%lx"
-
-static DEFINE_SPINLOCK(hcall_lock);
-
-static long ehca_plpar_hcall_norets(unsigned long opcode,
-                                   unsigned long arg1,
-                                   unsigned long arg2,
-                                   unsigned long arg3,
-                                   unsigned long arg4,
-                                   unsigned long arg5,
-                                   unsigned long arg6,
-                                   unsigned long arg7)
-{
-       long ret;
-       int i, sleep_msecs;
-       unsigned long flags = 0;
-
-       if (unlikely(ehca_debug_level >= 2))
-               ehca_gen_dbg("opcode=%lx " HCALL7_REGS_FORMAT,
-                            opcode, arg1, arg2, arg3, arg4, arg5, arg6, arg7);
-
-       for (i = 0; i < 5; i++) {
-               /* serialize hCalls to work around firmware issue */
-               if (ehca_lock_hcalls)
-                       spin_lock_irqsave(&hcall_lock, flags);
-
-               ret = plpar_hcall_norets(opcode, arg1, arg2, arg3, arg4,
-                                        arg5, arg6, arg7);
-
-               if (ehca_lock_hcalls)
-                       spin_unlock_irqrestore(&hcall_lock, flags);
-
-               if (H_IS_LONG_BUSY(ret)) {
-                       sleep_msecs = get_longbusy_msecs(ret);
-                       msleep_interruptible(sleep_msecs);
-                       continue;
-               }
-
-               if (ret < H_SUCCESS)
-                       ehca_gen_err("opcode=%lx ret=%li " HCALL7_REGS_FORMAT,
-                                    opcode, ret, arg1, arg2, arg3,
-                                    arg4, arg5, arg6, arg7);
-               else
-                       if (unlikely(ehca_debug_level >= 2))
-                               ehca_gen_dbg("opcode=%lx ret=%li", opcode, ret);
-
-               return ret;
-       }
-
-       return H_BUSY;
-}
-
-static long ehca_plpar_hcall9(unsigned long opcode,
-                             unsigned long *outs, /* array of 9 outputs */
-                             unsigned long arg1,
-                             unsigned long arg2,
-                             unsigned long arg3,
-                             unsigned long arg4,
-                             unsigned long arg5,
-                             unsigned long arg6,
-                             unsigned long arg7,
-                             unsigned long arg8,
-                             unsigned long arg9)
-{
-       long ret;
-       int i, sleep_msecs;
-       unsigned long flags = 0;
-
-       if (unlikely(ehca_debug_level >= 2))
-               ehca_gen_dbg("INPUT -- opcode=%lx " HCALL9_REGS_FORMAT, opcode,
-                            arg1, arg2, arg3, arg4, arg5,
-                            arg6, arg7, arg8, arg9);
-
-       for (i = 0; i < 5; i++) {
-               /* serialize hCalls to work around firmware issue */
-               if (ehca_lock_hcalls)
-                       spin_lock_irqsave(&hcall_lock, flags);
-
-               ret = plpar_hcall9(opcode, outs,
-                                  arg1, arg2, arg3, arg4, arg5,
-                                  arg6, arg7, arg8, arg9);
-
-               if (ehca_lock_hcalls)
-                       spin_unlock_irqrestore(&hcall_lock, flags);
-
-               if (H_IS_LONG_BUSY(ret)) {
-                       sleep_msecs = get_longbusy_msecs(ret);
-                       msleep_interruptible(sleep_msecs);
-                       continue;
-               }
-
-               if (ret < H_SUCCESS) {
-                       ehca_gen_err("INPUT -- opcode=%lx " HCALL9_REGS_FORMAT,
-                                    opcode, arg1, arg2, arg3, arg4, arg5,
-                                    arg6, arg7, arg8, arg9);
-                       ehca_gen_err("OUTPUT -- ret=%li " HCALL9_REGS_FORMAT,
-                                    ret, outs[0], outs[1], outs[2], outs[3],
-                                    outs[4], outs[5], outs[6], outs[7],
-                                    outs[8]);
-               } else if (unlikely(ehca_debug_level >= 2))
-                       ehca_gen_dbg("OUTPUT -- ret=%li " HCALL9_REGS_FORMAT,
-                                    ret, outs[0], outs[1], outs[2], outs[3],
-                                    outs[4], outs[5], outs[6], outs[7],
-                                    outs[8]);
-               return ret;
-       }
-
-       return H_BUSY;
-}
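
Both hcall wrappers above retry up to five times when the hypervisor returns a long-busy code, sleeping for the interval encoded in the return value and giving up with H_BUSY afterwards. The retry pattern in isolation (a sketch; H_IS_LONG_BUSY(), get_longbusy_msecs() and msleep_interruptible() are the same helpers used above, while issue_hcall() is a placeholder for the real plpar_hcall*() invocation):

#include <linux/delay.h>
#include <asm/hvcall.h>

/* Retry an hcall while the hypervisor reports a long-busy condition. */
static long retry_hcall(long (*issue_hcall)(void))
{
	int i;

	for (i = 0; i < 5; i++) {
		long rc = issue_hcall();

		if (!H_IS_LONG_BUSY(rc))
			return rc;	/* success or a hard error */

		msleep_interruptible(get_longbusy_msecs(rc));
	}

	return H_BUSY;			/* still busy after five attempts */
}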
-
-u64 hipz_h_alloc_resource_eq(const struct ipz_adapter_handle adapter_handle,
-                            struct ehca_pfeq *pfeq,
-                            const u32 neq_control,
-                            const u32 number_of_entries,
-                            struct ipz_eq_handle *eq_handle,
-                            u32 *act_nr_of_entries,
-                            u32 *act_pages,
-                            u32 *eq_ist)
-{
-       u64 ret;
-       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
-       u64 allocate_controls;
-
-       /* resource type */
-       allocate_controls = 3ULL;
-
-       /* ISN is associated */
-       if (neq_control != 1)
-               allocate_controls = (1ULL << (63 - 7)) | allocate_controls;
-       else /* notification event queue */
-               allocate_controls = (1ULL << 63) | allocate_controls;
-
-       ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs,
-                               adapter_handle.handle,  /* r4 */
-                               allocate_controls,      /* r5 */
-                               number_of_entries,      /* r6 */
-                               0, 0, 0, 0, 0, 0);
-       eq_handle->handle = outs[0];
-       *act_nr_of_entries = (u32)outs[3];
-       *act_pages = (u32)outs[4];
-       *eq_ist = (u32)outs[5];
-
-       if (ret == H_NOT_ENOUGH_RESOURCES)
-               ehca_gen_err("Not enough resources. ret=%lli", ret);
-
-       return ret;
-}
-
-u64 hipz_h_reset_event(const struct ipz_adapter_handle adapter_handle,
-                      struct ipz_eq_handle eq_handle,
-                      const u64 event_mask)
-{
-       return ehca_plpar_hcall_norets(H_RESET_EVENTS,
-                                      adapter_handle.handle, /* r4 */
-                                      eq_handle.handle,      /* r5 */
-                                      event_mask,            /* r6 */
-                                      0, 0, 0, 0);
-}
-
-u64 hipz_h_alloc_resource_cq(const struct ipz_adapter_handle adapter_handle,
-                            struct ehca_cq *cq,
-                            struct ehca_alloc_cq_parms *param)
-{
-       int rc;
-       u64 ret;
-       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
-
-       ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs,
-                               adapter_handle.handle,   /* r4  */
-                               2,                       /* r5  */
-                               param->eq_handle.handle, /* r6  */
-                               cq->token,               /* r7  */
-                               param->nr_cqe,           /* r8  */
-                               0, 0, 0, 0);
-       cq->ipz_cq_handle.handle = outs[0];
-       param->act_nr_of_entries = (u32)outs[3];
-       param->act_pages = (u32)outs[4];
-
-       if (ret == H_SUCCESS) {
-               rc = hcp_galpas_ctor(&cq->galpas, 0, outs[5], outs[6]);
-               if (rc) {
-                       ehca_gen_err("Could not establish HW access. rc=%d paddr=%#lx",
-                                    rc, outs[5]);
-
-                       ehca_plpar_hcall_norets(H_FREE_RESOURCE,
-                                               adapter_handle.handle,     /* r4 */
-                                               cq->ipz_cq_handle.handle,  /* r5 */
-                                               0, 0, 0, 0, 0);
-                       ret = H_NO_MEM;
-               }
-       }
-
-       if (ret == H_NOT_ENOUGH_RESOURCES)
-               ehca_gen_err("Not enough resources. ret=%lli", ret);
-
-       return ret;
-}
-
-u64 hipz_h_alloc_resource_qp(const struct ipz_adapter_handle adapter_handle,
-                            struct ehca_alloc_qp_parms *parms, int is_user)
-{
-       int rc;
-       u64 ret;
-       u64 allocate_controls, max_r10_reg, r11, r12;
-       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
-
-       allocate_controls =
-               EHCA_BMASK_SET(H_ALL_RES_QP_ENHANCED_OPS, parms->ext_type)
-               | EHCA_BMASK_SET(H_ALL_RES_QP_PTE_PIN, 0)
-               | EHCA_BMASK_SET(H_ALL_RES_QP_SERVICE_TYPE, parms->servicetype)
-               | EHCA_BMASK_SET(H_ALL_RES_QP_SIGNALING_TYPE, parms->sigtype)
-               | EHCA_BMASK_SET(H_ALL_RES_QP_STORAGE, parms->qp_storage)
-               | EHCA_BMASK_SET(H_ALL_RES_QP_SMALL_SQ_PAGE_SIZE,
-                                parms->squeue.page_size)
-               | EHCA_BMASK_SET(H_ALL_RES_QP_SMALL_RQ_PAGE_SIZE,
-                                parms->rqueue.page_size)
-               | EHCA_BMASK_SET(H_ALL_RES_QP_LL_RQ_CQE_POSTING,
-                                !!(parms->ll_comp_flags & LLQP_RECV_COMP))
-               | EHCA_BMASK_SET(H_ALL_RES_QP_LL_SQ_CQE_POSTING,
-                                !!(parms->ll_comp_flags & LLQP_SEND_COMP))
-               | EHCA_BMASK_SET(H_ALL_RES_QP_UD_AV_LKEY_CTRL,
-                                parms->ud_av_l_key_ctl)
-               | EHCA_BMASK_SET(H_ALL_RES_QP_RESOURCE_TYPE, 1);
-
-       max_r10_reg =
-               EHCA_BMASK_SET(H_ALL_RES_QP_MAX_OUTST_SEND_WR,
-                              parms->squeue.max_wr + 1)
-               | EHCA_BMASK_SET(H_ALL_RES_QP_MAX_OUTST_RECV_WR,
-                                parms->rqueue.max_wr + 1)
-               | EHCA_BMASK_SET(H_ALL_RES_QP_MAX_SEND_SGE,
-                                parms->squeue.max_sge)
-               | EHCA_BMASK_SET(H_ALL_RES_QP_MAX_RECV_SGE,
-                                parms->rqueue.max_sge);
-
-       r11 = EHCA_BMASK_SET(H_ALL_RES_QP_SRQ_QP_TOKEN, parms->srq_token);
-
-       if (parms->ext_type == EQPT_SRQ)
-               r12 = EHCA_BMASK_SET(H_ALL_RES_QP_SRQ_LIMIT, parms->srq_limit);
-       else
-               r12 = EHCA_BMASK_SET(H_ALL_RES_QP_SRQ_QPN, parms->srq_qpn);
-
-       ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs,
-                               adapter_handle.handle,             /* r4  */
-                               allocate_controls,                 /* r5  */
-                               parms->send_cq_handle.handle,
-                               parms->recv_cq_handle.handle,
-                               parms->eq_handle.handle,
-                               ((u64)parms->token << 32) | parms->pd.value,
-                               max_r10_reg, r11, r12);
-
-       parms->qp_handle.handle = outs[0];
-       parms->real_qp_num = (u32)outs[1];
-       parms->squeue.act_nr_wqes =
-               (u16)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_OUTST_SEND_WR, outs[2]);
-       parms->rqueue.act_nr_wqes =
-               (u16)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_OUTST_RECV_WR, outs[2]);
-       parms->squeue.act_nr_sges =
-               (u8)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_SEND_SGE, outs[3]);
-       parms->rqueue.act_nr_sges =
-               (u8)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_RECV_SGE, outs[3]);
-       parms->squeue.queue_size =
-               (u32)EHCA_BMASK_GET(H_ALL_RES_QP_SQUEUE_SIZE_PAGES, outs[4]);
-       parms->rqueue.queue_size =
-               (u32)EHCA_BMASK_GET(H_ALL_RES_QP_RQUEUE_SIZE_PAGES, outs[4]);
-
-       if (ret == H_SUCCESS) {
-               rc = hcp_galpas_ctor(&parms->galpas, is_user, outs[6], outs[6]);
-               if (rc) {
-                       ehca_gen_err("Could not establish HW access. rc=%d paddr=%#lx",
-                                    rc, outs[6]);
-
-                       ehca_plpar_hcall_norets(H_FREE_RESOURCE,
-                                               adapter_handle.handle,     /* r4 */
-                                               parms->qp_handle.handle,  /* r5 */
-                                               0, 0, 0, 0, 0);
-                       ret = H_NO_MEM;
-               }
-       }
-
-       if (ret == H_NOT_ENOUGH_RESOURCES)
-               ehca_gen_err("Not enough resources. ret=%lli", ret);
-
-       return ret;
-}
-
-u64 hipz_h_query_port(const struct ipz_adapter_handle adapter_handle,
-                     const u8 port_id,
-                     struct hipz_query_port *query_port_response_block)
-{
-       u64 ret;
-       u64 r_cb = __pa(query_port_response_block);
-
-       if (r_cb & (EHCA_PAGESIZE-1)) {
-               ehca_gen_err("response block not page aligned");
-               return H_PARAMETER;
-       }
-
-       ret = ehca_plpar_hcall_norets(H_QUERY_PORT,
-                                     adapter_handle.handle, /* r4 */
-                                     port_id,               /* r5 */
-                                     r_cb,                  /* r6 */
-                                     0, 0, 0, 0);
-
-       if (ehca_debug_level >= 2)
-               ehca_dmp(query_port_response_block, 64, "response_block");
-
-       return ret;
-}
-
-u64 hipz_h_modify_port(const struct ipz_adapter_handle adapter_handle,
-                      const u8 port_id, const u32 port_cap,
-                      const u8 init_type, const int modify_mask)
-{
-       u64 port_attributes = port_cap;
-
-       if (modify_mask & IB_PORT_SHUTDOWN)
-               port_attributes |= EHCA_BMASK_SET(H_MP_SHUTDOWN, 1);
-       if (modify_mask & IB_PORT_INIT_TYPE)
-               port_attributes |= EHCA_BMASK_SET(H_MP_INIT_TYPE, init_type);
-       if (modify_mask & IB_PORT_RESET_QKEY_CNTR)
-               port_attributes |= EHCA_BMASK_SET(H_MP_RESET_QKEY_CTR, 1);
-
-       return ehca_plpar_hcall_norets(H_MODIFY_PORT,
-                                      adapter_handle.handle, /* r4 */
-                                      port_id,               /* r5 */
-                                      port_attributes,       /* r6 */
-                                      0, 0, 0, 0);
-}
-
-u64 hipz_h_query_hca(const struct ipz_adapter_handle adapter_handle,
-                    struct hipz_query_hca *query_hca_rblock)
-{
-       u64 r_cb = __pa(query_hca_rblock);
-
-       if (r_cb & (EHCA_PAGESIZE-1)) {
-               ehca_gen_err("response_block=%p not page aligned",
-                            query_hca_rblock);
-               return H_PARAMETER;
-       }
-
-       return ehca_plpar_hcall_norets(H_QUERY_HCA,
-                                      adapter_handle.handle, /* r4 */
-                                      r_cb,                  /* r5 */
-                                      0, 0, 0, 0, 0);
-}
-
-u64 hipz_h_register_rpage(const struct ipz_adapter_handle adapter_handle,
-                         const u8 pagesize,
-                         const u8 queue_type,
-                         const u64 resource_handle,
-                         const u64 logical_address_of_page,
-                         u64 count)
-{
-       return ehca_plpar_hcall_norets(H_REGISTER_RPAGES,
-                                      adapter_handle.handle,      /* r4  */
-                                      (u64)queue_type | ((u64)pagesize) << 8,
-                                      /* r5  */
-                                      resource_handle,            /* r6  */
-                                      logical_address_of_page,    /* r7  */
-                                      count,                      /* r8  */
-                                      0, 0);
-}
-
-u64 hipz_h_register_rpage_eq(const struct ipz_adapter_handle adapter_handle,
-                            const struct ipz_eq_handle eq_handle,
-                            struct ehca_pfeq *pfeq,
-                            const u8 pagesize,
-                            const u8 queue_type,
-                            const u64 logical_address_of_page,
-                            const u64 count)
-{
-       if (count != 1) {
-               ehca_gen_err("Page counter=%llx", count);
-               return H_PARAMETER;
-       }
-       return hipz_h_register_rpage(adapter_handle,
-                                    pagesize,
-                                    queue_type,
-                                    eq_handle.handle,
-                                    logical_address_of_page, count);
-}
-
-u64 hipz_h_query_int_state(const struct ipz_adapter_handle adapter_handle,
-                          u32 ist)
-{
-       u64 ret;
-       ret = ehca_plpar_hcall_norets(H_QUERY_INT_STATE,
-                                     adapter_handle.handle, /* r4 */
-                                     ist,                   /* r5 */
-                                     0, 0, 0, 0, 0);
-
-       if (ret != H_SUCCESS && ret != H_BUSY)
-               ehca_gen_err("Could not query interrupt state.");
-
-       return ret;
-}
-
-u64 hipz_h_register_rpage_cq(const struct ipz_adapter_handle adapter_handle,
-                            const struct ipz_cq_handle cq_handle,
-                            struct ehca_pfcq *pfcq,
-                            const u8 pagesize,
-                            const u8 queue_type,
-                            const u64 logical_address_of_page,
-                            const u64 count,
-                            const struct h_galpa gal)
-{
-       if (count != 1) {
-               ehca_gen_err("Page counter=%llx", count);
-               return H_PARAMETER;
-       }
-
-       return hipz_h_register_rpage(adapter_handle, pagesize, queue_type,
-                                    cq_handle.handle, logical_address_of_page,
-                                    count);
-}
-
-u64 hipz_h_register_rpage_qp(const struct ipz_adapter_handle adapter_handle,
-                            const struct ipz_qp_handle qp_handle,
-                            struct ehca_pfqp *pfqp,
-                            const u8 pagesize,
-                            const u8 queue_type,
-                            const u64 logical_address_of_page,
-                            const u64 count,
-                            const struct h_galpa galpa)
-{
-       if (count > 1) {
-               ehca_gen_err("Page counter=%llx", count);
-               return H_PARAMETER;
-       }
-
-       return hipz_h_register_rpage(adapter_handle, pagesize, queue_type,
-                                    qp_handle.handle, logical_address_of_page,
-                                    count);
-}
-
-u64 hipz_h_disable_and_get_wqe(const struct ipz_adapter_handle adapter_handle,
-                              const struct ipz_qp_handle qp_handle,
-                              struct ehca_pfqp *pfqp,
-                              void **log_addr_next_sq_wqe2processed,
-                              void **log_addr_next_rq_wqe2processed,
-                              int dis_and_get_function_code)
-{
-       u64 ret;
-       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
-
-       ret = ehca_plpar_hcall9(H_DISABLE_AND_GETC, outs,
-                               adapter_handle.handle,     /* r4 */
-                               dis_and_get_function_code, /* r5 */
-                               qp_handle.handle,          /* r6 */
-                               0, 0, 0, 0, 0, 0);
-       if (log_addr_next_sq_wqe2processed)
-               *log_addr_next_sq_wqe2processed = (void *)outs[0];
-       if (log_addr_next_rq_wqe2processed)
-               *log_addr_next_rq_wqe2processed = (void *)outs[1];
-
-       return ret;
-}
-
-u64 hipz_h_modify_qp(const struct ipz_adapter_handle adapter_handle,
-                    const struct ipz_qp_handle qp_handle,
-                    struct ehca_pfqp *pfqp,
-                    const u64 update_mask,
-                    struct hcp_modify_qp_control_block *mqpcb,
-                    struct h_galpa gal)
-{
-       u64 ret;
-       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
-       ret = ehca_plpar_hcall9(H_MODIFY_QP, outs,
-                               adapter_handle.handle, /* r4 */
-                               qp_handle.handle,      /* r5 */
-                               update_mask,           /* r6 */
-                               __pa(mqpcb),           /* r7 */
-                               0, 0, 0, 0, 0);
-
-       if (ret == H_NOT_ENOUGH_RESOURCES)
-               ehca_gen_err("Insufficient resources ret=%lli", ret);
-
-       return ret;
-}
-
-u64 hipz_h_query_qp(const struct ipz_adapter_handle adapter_handle,
-                   const struct ipz_qp_handle qp_handle,
-                   struct ehca_pfqp *pfqp,
-                   struct hcp_modify_qp_control_block *qqpcb,
-                   struct h_galpa gal)
-{
-       return ehca_plpar_hcall_norets(H_QUERY_QP,
-                                      adapter_handle.handle, /* r4 */
-                                      qp_handle.handle,      /* r5 */
-                                      __pa(qqpcb),           /* r6 */
-                                      0, 0, 0, 0);
-}
-
-u64 hipz_h_destroy_qp(const struct ipz_adapter_handle adapter_handle,
-                     struct ehca_qp *qp)
-{
-       u64 ret;
-       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
-
-       ret = hcp_galpas_dtor(&qp->galpas);
-       if (ret) {
-               ehca_gen_err("Could not destruct qp->galpas");
-               return H_RESOURCE;
-       }
-       ret = ehca_plpar_hcall9(H_DISABLE_AND_GETC, outs,
-                               adapter_handle.handle,     /* r4 */
-                               /* function code */
-                               1,                         /* r5 */
-                               qp->ipz_qp_handle.handle,  /* r6 */
-                               0, 0, 0, 0, 0, 0);
-       if (ret == H_HARDWARE)
-               ehca_gen_err("HCA not operational. ret=%lli", ret);
-
-       ret = ehca_plpar_hcall_norets(H_FREE_RESOURCE,
-                                     adapter_handle.handle,     /* r4 */
-                                     qp->ipz_qp_handle.handle,  /* r5 */
-                                     0, 0, 0, 0, 0);
-
-       if (ret == H_RESOURCE)
-               ehca_gen_err("Resource still in use. ret=%lli", ret);
-
-       return ret;
-}
-
-u64 hipz_h_define_aqp0(const struct ipz_adapter_handle adapter_handle,
-                      const struct ipz_qp_handle qp_handle,
-                      struct h_galpa gal,
-                      u32 port)
-{
-       return ehca_plpar_hcall_norets(H_DEFINE_AQP0,
-                                      adapter_handle.handle, /* r4 */
-                                      qp_handle.handle,      /* r5 */
-                                      port,                  /* r6 */
-                                      0, 0, 0, 0);
-}
-
-u64 hipz_h_define_aqp1(const struct ipz_adapter_handle adapter_handle,
-                      const struct ipz_qp_handle qp_handle,
-                      struct h_galpa gal,
-                      u32 port, u32 * pma_qp_nr,
-                      u32 * bma_qp_nr)
-{
-       u64 ret;
-       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
-
-       ret = ehca_plpar_hcall9(H_DEFINE_AQP1, outs,
-                               adapter_handle.handle, /* r4 */
-                               qp_handle.handle,      /* r5 */
-                               port,                  /* r6 */
-                               0, 0, 0, 0, 0, 0);
-       *pma_qp_nr = (u32)outs[0];
-       *bma_qp_nr = (u32)outs[1];
-
-       if (ret == H_ALIAS_EXIST)
-               ehca_gen_err("AQP1 already exists. ret=%lli", ret);
-
-       return ret;
-}
-
-u64 hipz_h_attach_mcqp(const struct ipz_adapter_handle adapter_handle,
-                      const struct ipz_qp_handle qp_handle,
-                      struct h_galpa gal,
-                      u16 mcg_dlid,
-                      u64 subnet_prefix, u64 interface_id)
-{
-       u64 ret;
-
-       ret = ehca_plpar_hcall_norets(H_ATTACH_MCQP,
-                                     adapter_handle.handle,  /* r4 */
-                                     qp_handle.handle,       /* r5 */
-                                     mcg_dlid,               /* r6 */
-                                     interface_id,           /* r7 */
-                                     subnet_prefix,          /* r8 */
-                                     0, 0);
-
-       if (ret == H_NOT_ENOUGH_RESOURCES)
-               ehca_gen_err("Not enough resources. ret=%lli", ret);
-
-       return ret;
-}
-
-u64 hipz_h_detach_mcqp(const struct ipz_adapter_handle adapter_handle,
-                      const struct ipz_qp_handle qp_handle,
-                      struct h_galpa gal,
-                      u16 mcg_dlid,
-                      u64 subnet_prefix, u64 interface_id)
-{
-       return ehca_plpar_hcall_norets(H_DETACH_MCQP,
-                                      adapter_handle.handle, /* r4 */
-                                      qp_handle.handle,      /* r5 */
-                                      mcg_dlid,              /* r6 */
-                                      interface_id,          /* r7 */
-                                      subnet_prefix,         /* r8 */
-                                      0, 0);
-}
-
-u64 hipz_h_destroy_cq(const struct ipz_adapter_handle adapter_handle,
-                     struct ehca_cq *cq,
-                     u8 force_flag)
-{
-       u64 ret;
-
-       ret = hcp_galpas_dtor(&cq->galpas);
-       if (ret) {
-               ehca_gen_err("Could not destruct cq->galpas");
-               return H_RESOURCE;
-       }
-
-       ret = ehca_plpar_hcall_norets(H_FREE_RESOURCE,
-                                     adapter_handle.handle,     /* r4 */
-                                     cq->ipz_cq_handle.handle,  /* r5 */
-                                     force_flag != 0 ? 1L : 0L, /* r6 */
-                                     0, 0, 0, 0);
-
-       if (ret == H_RESOURCE)
-               ehca_gen_err("H_FREE_RESOURCE failed ret=%lli ", ret);
-
-       return ret;
-}
-
-u64 hipz_h_destroy_eq(const struct ipz_adapter_handle adapter_handle,
-                     struct ehca_eq *eq)
-{
-       u64 ret;
-
-       ret = hcp_galpas_dtor(&eq->galpas);
-       if (ret) {
-               ehca_gen_err("Could not destruct eq->galpas");
-               return H_RESOURCE;
-       }
-
-       ret = ehca_plpar_hcall_norets(H_FREE_RESOURCE,
-                                     adapter_handle.handle,     /* r4 */
-                                     eq->ipz_eq_handle.handle,  /* r5 */
-                                     0, 0, 0, 0, 0);
-
-       if (ret == H_RESOURCE)
-               ehca_gen_err("Resource in use. ret=%lli ", ret);
-
-       return ret;
-}
-
-u64 hipz_h_alloc_resource_mr(const struct ipz_adapter_handle adapter_handle,
-                            const struct ehca_mr *mr,
-                            const u64 vaddr,
-                            const u64 length,
-                            const u32 access_ctrl,
-                            const struct ipz_pd pd,
-                            struct ehca_mr_hipzout_parms *outparms)
-{
-       u64 ret;
-       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
-
-       ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs,
-                               adapter_handle.handle,            /* r4 */
-                               5,                                /* r5 */
-                               vaddr,                            /* r6 */
-                               length,                           /* r7 */
-                               (((u64)access_ctrl) << 32ULL),    /* r8 */
-                               pd.value,                         /* r9 */
-                               0, 0, 0);
-       outparms->handle.handle = outs[0];
-       outparms->lkey = (u32)outs[2];
-       outparms->rkey = (u32)outs[3];
-
-       return ret;
-}
-
-u64 hipz_h_register_rpage_mr(const struct ipz_adapter_handle adapter_handle,
-                            const struct ehca_mr *mr,
-                            const u8 pagesize,
-                            const u8 queue_type,
-                            const u64 logical_address_of_page,
-                            const u64 count)
-{
-       u64 ret;
-
-       if (unlikely(ehca_debug_level >= 3)) {
-               if (count > 1) {
-                       u64 *kpage;
-                       int i;
-                       kpage = __va(logical_address_of_page);
-                       for (i = 0; i < count; i++)
-                               ehca_gen_dbg("kpage[%d]=%p",
-                                            i, (void *)kpage[i]);
-               } else
-                       ehca_gen_dbg("kpage=%p",
-                                    (void *)logical_address_of_page);
-       }
-
-       if ((count > 1) && (logical_address_of_page & (EHCA_PAGESIZE-1))) {
-               ehca_gen_err("logical_address_of_page not on a 4k boundary "
-                            "adapter_handle=%llx mr=%p mr_handle=%llx "
-                            "pagesize=%x queue_type=%x "
-                            "logical_address_of_page=%llx count=%llx",
-                            adapter_handle.handle, mr,
-                            mr->ipz_mr_handle.handle, pagesize, queue_type,
-                            logical_address_of_page, count);
-               ret = H_PARAMETER;
-       } else
-               ret = hipz_h_register_rpage(adapter_handle, pagesize,
-                                           queue_type,
-                                           mr->ipz_mr_handle.handle,
-                                           logical_address_of_page, count);
-       return ret;
-}
-
-u64 hipz_h_query_mr(const struct ipz_adapter_handle adapter_handle,
-                   const struct ehca_mr *mr,
-                   struct ehca_mr_hipzout_parms *outparms)
-{
-       u64 ret;
-       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
-
-       ret = ehca_plpar_hcall9(H_QUERY_MR, outs,
-                               adapter_handle.handle,     /* r4 */
-                               mr->ipz_mr_handle.handle,  /* r5 */
-                               0, 0, 0, 0, 0, 0, 0);
-       outparms->len = outs[0];
-       outparms->vaddr = outs[1];
-       outparms->acl  = outs[4] >> 32;
-       outparms->lkey = (u32)(outs[5] >> 32);
-       outparms->rkey = (u32)(outs[5] & (0xffffffff));
-
-       return ret;
-}
-
-u64 hipz_h_free_resource_mr(const struct ipz_adapter_handle adapter_handle,
-                           const struct ehca_mr *mr)
-{
-       return ehca_plpar_hcall_norets(H_FREE_RESOURCE,
-                                      adapter_handle.handle,    /* r4 */
-                                      mr->ipz_mr_handle.handle, /* r5 */
-                                      0, 0, 0, 0, 0);
-}
-
-u64 hipz_h_reregister_pmr(const struct ipz_adapter_handle adapter_handle,
-                         const struct ehca_mr *mr,
-                         const u64 vaddr_in,
-                         const u64 length,
-                         const u32 access_ctrl,
-                         const struct ipz_pd pd,
-                         const u64 mr_addr_cb,
-                         struct ehca_mr_hipzout_parms *outparms)
-{
-       u64 ret;
-       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
-
-       ret = ehca_plpar_hcall9(H_REREGISTER_PMR, outs,
-                               adapter_handle.handle,    /* r4 */
-                               mr->ipz_mr_handle.handle, /* r5 */
-                               vaddr_in,                 /* r6 */
-                               length,                   /* r7 */
-                               /* r8 */
-                               ((((u64)access_ctrl) << 32ULL) | pd.value),
-                               mr_addr_cb,               /* r9 */
-                               0, 0, 0);
-       outparms->vaddr = outs[1];
-       outparms->lkey = (u32)outs[2];
-       outparms->rkey = (u32)outs[3];
-
-       return ret;
-}
-
-u64 hipz_h_register_smr(const struct ipz_adapter_handle adapter_handle,
-                       const struct ehca_mr *mr,
-                       const struct ehca_mr *orig_mr,
-                       const u64 vaddr_in,
-                       const u32 access_ctrl,
-                       const struct ipz_pd pd,
-                       struct ehca_mr_hipzout_parms *outparms)
-{
-       u64 ret;
-       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
-
-       ret = ehca_plpar_hcall9(H_REGISTER_SMR, outs,
-                               adapter_handle.handle,            /* r4 */
-                               orig_mr->ipz_mr_handle.handle,    /* r5 */
-                               vaddr_in,                         /* r6 */
-                               (((u64)access_ctrl) << 32ULL),    /* r7 */
-                               pd.value,                         /* r8 */
-                               0, 0, 0, 0);
-       outparms->handle.handle = outs[0];
-       outparms->lkey = (u32)outs[2];
-       outparms->rkey = (u32)outs[3];
-
-       return ret;
-}
-
-u64 hipz_h_alloc_resource_mw(const struct ipz_adapter_handle adapter_handle,
-                            const struct ehca_mw *mw,
-                            const struct ipz_pd pd,
-                            struct ehca_mw_hipzout_parms *outparms)
-{
-       u64 ret;
-       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
-
-       ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs,
-                               adapter_handle.handle,      /* r4 */
-                               6,                          /* r5 */
-                               pd.value,                   /* r6 */
-                               0, 0, 0, 0, 0, 0);
-       outparms->handle.handle = outs[0];
-       outparms->rkey = (u32)outs[3];
-
-       return ret;
-}
-
-u64 hipz_h_query_mw(const struct ipz_adapter_handle adapter_handle,
-                   const struct ehca_mw *mw,
-                   struct ehca_mw_hipzout_parms *outparms)
-{
-       u64 ret;
-       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
-
-       ret = ehca_plpar_hcall9(H_QUERY_MW, outs,
-                               adapter_handle.handle,    /* r4 */
-                               mw->ipz_mw_handle.handle, /* r5 */
-                               0, 0, 0, 0, 0, 0, 0);
-       outparms->rkey = (u32)outs[3];
-
-       return ret;
-}
-
-u64 hipz_h_free_resource_mw(const struct ipz_adapter_handle adapter_handle,
-                           const struct ehca_mw *mw)
-{
-       return ehca_plpar_hcall_norets(H_FREE_RESOURCE,
-                                      adapter_handle.handle,    /* r4 */
-                                      mw->ipz_mw_handle.handle, /* r5 */
-                                      0, 0, 0, 0, 0);
-}
-
-u64 hipz_h_error_data(const struct ipz_adapter_handle adapter_handle,
-                     const u64 ressource_handle,
-                     void *rblock,
-                     unsigned long *byte_count)
-{
-       u64 r_cb = __pa(rblock);
-
-       if (r_cb & (EHCA_PAGESIZE-1)) {
-               ehca_gen_err("rblock not page aligned.");
-               return H_PARAMETER;
-       }
-
-       return ehca_plpar_hcall_norets(H_ERROR_DATA,
-                                      adapter_handle.handle,
-                                      ressource_handle,
-                                      r_cb,
-                                      0, 0, 0, 0);
-}
-
-u64 hipz_h_eoi(int irq)
-{
-       unsigned long xirr;
-
-       iosync();
-       xirr = (0xffULL << 24) | irq;
-
-       return plpar_hcall_norets(H_EOI, xirr);
-}
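The hcall wrappers above pack several parameters into single 64-bit registers (allocate_controls, max_r10_reg, port_attributes and so on) using bit-range macros that follow IBM bit numbering, where bit 0 is the most significant bit of the 64-bit word. A stand-alone sketch of that packing convention follows; ibm_field_mask() and ibm_field_set() are hypothetical helpers written for illustration only, not the driver's EHCA_BMASK macros.

    #include <stdint.h>
    #include <stdio.h>

    /* IBM bit numbering: bit 0 is the MSB of a 64-bit word. */
    static uint64_t ibm_field_mask(unsigned from, unsigned to)
    {
            unsigned width = to - from + 1;
            unsigned shift = 63 - to;                 /* LSB position of the field */
            uint64_t ones  = (width == 64) ? ~0ULL : ((1ULL << width) - 1);
            return ones << shift;
    }

    static uint64_t ibm_field_set(unsigned from, unsigned to, uint64_t value)
    {
            return (value << (63 - to)) & ibm_field_mask(from, to);
    }

    int main(void)
    {
            /* e.g. an 8-bit field in IBM bits 56..63 (the low byte) plus a
             * single-bit flag in IBM bit 0 (the MSB) */
            uint64_t reg = ibm_field_set(56, 63, 0x2a) | ibm_field_set(0, 0, 1);

            printf("reg = %#llx\n", (unsigned long long)reg);
            return 0;
    }

Under this convention a range such as (48, 63) names the low 16 bits of a register, which matches how the adder and counter fields are laid out in hipz_hw.h further down.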
diff --git a/drivers/infiniband/hw/ehca/hcp_if.h b/drivers/infiniband/hw/ehca/hcp_if.h
deleted file mode 100644 (file)
index a46e514..0000000
+++ /dev/null
@@ -1,265 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  Firmware Infiniband Interface code for POWER
- *
- *  Authors: Christoph Raisch <raisch@de.ibm.com>
- *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *           Gerd Bayer <gerd.bayer@de.ibm.com>
- *           Waleri Fomin <fomin@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __HCP_IF_H__
-#define __HCP_IF_H__
-
-#include "ehca_classes.h"
-#include "ehca_tools.h"
-#include "hipz_hw.h"
-
-/*
- * hipz_h_alloc_resource_eq allocates EQ resources in HW and FW, initializes
- * resources and creates the empty EQPT (ring).
- */
-u64 hipz_h_alloc_resource_eq(const struct ipz_adapter_handle adapter_handle,
-                            struct ehca_pfeq *pfeq,
-                            const u32 neq_control,
-                            const u32 number_of_entries,
-                            struct ipz_eq_handle *eq_handle,
-                            u32 * act_nr_of_entries,
-                            u32 * act_pages,
-                            u32 * eq_ist);
-
-u64 hipz_h_reset_event(const struct ipz_adapter_handle adapter_handle,
-                      struct ipz_eq_handle eq_handle,
-                      const u64 event_mask);
-/*
- * hipz_h_alloc_resource_cq allocates CQ resources in HW and FW, initializes
- * resources and creates the empty CQPT (ring).
- */
-u64 hipz_h_alloc_resource_cq(const struct ipz_adapter_handle adapter_handle,
-                            struct ehca_cq *cq,
-                            struct ehca_alloc_cq_parms *param);
-
-
-/*
- * hipz_h_alloc_resource_qp allocates QP resources in HW and FW,
- * initializes resources and creates empty QPPTs (2 rings).
- */
-u64 hipz_h_alloc_resource_qp(const struct ipz_adapter_handle adapter_handle,
-                            struct ehca_alloc_qp_parms *parms, int is_user);
-
-u64 hipz_h_query_port(const struct ipz_adapter_handle adapter_handle,
-                     const u8 port_id,
-                     struct hipz_query_port *query_port_response_block);
-
-u64 hipz_h_modify_port(const struct ipz_adapter_handle adapter_handle,
-                      const u8 port_id, const u32 port_cap,
-                      const u8 init_type, const int modify_mask);
-
-u64 hipz_h_query_hca(const struct ipz_adapter_handle adapter_handle,
-                    struct hipz_query_hca *query_hca_rblock);
-
-/*
- * hipz_h_register_rpage is the internal helper used by all
- * H_REGISTER_RPAGES hcall wrappers.
- */
-u64 hipz_h_register_rpage(const struct ipz_adapter_handle adapter_handle,
-                         const u8 pagesize,
-                         const u8 queue_type,
-                         const u64 resource_handle,
-                         const u64 logical_address_of_page,
-                         u64 count);
-
-u64 hipz_h_register_rpage_eq(const struct ipz_adapter_handle adapter_handle,
-                            const struct ipz_eq_handle eq_handle,
-                            struct ehca_pfeq *pfeq,
-                            const u8 pagesize,
-                            const u8 queue_type,
-                            const u64 logical_address_of_page,
-                            const u64 count);
-
-u64 hipz_h_query_int_state(const struct ipz_adapter_handle
-                          hcp_adapter_handle,
-                          u32 ist);
-
-u64 hipz_h_register_rpage_cq(const struct ipz_adapter_handle adapter_handle,
-                            const struct ipz_cq_handle cq_handle,
-                            struct ehca_pfcq *pfcq,
-                            const u8 pagesize,
-                            const u8 queue_type,
-                            const u64 logical_address_of_page,
-                            const u64 count,
-                            const struct h_galpa gal);
-
-u64 hipz_h_register_rpage_qp(const struct ipz_adapter_handle adapter_handle,
-                            const struct ipz_qp_handle qp_handle,
-                            struct ehca_pfqp *pfqp,
-                            const u8 pagesize,
-                            const u8 queue_type,
-                            const u64 logical_address_of_page,
-                            const u64 count,
-                            const struct h_galpa galpa);
-
-u64 hipz_h_disable_and_get_wqe(const struct ipz_adapter_handle adapter_handle,
-                              const struct ipz_qp_handle qp_handle,
-                              struct ehca_pfqp *pfqp,
-                              void **log_addr_next_sq_wqe_tb_processed,
-                              void **log_addr_next_rq_wqe_tb_processed,
-                              int dis_and_get_function_code);
-enum hcall_sigt {
-       HCALL_SIGT_NO_CQE = 0,
-       HCALL_SIGT_BY_WQE = 1,
-       HCALL_SIGT_EVERY = 2
-};
-
-u64 hipz_h_modify_qp(const struct ipz_adapter_handle adapter_handle,
-                    const struct ipz_qp_handle qp_handle,
-                    struct ehca_pfqp *pfqp,
-                    const u64 update_mask,
-                    struct hcp_modify_qp_control_block *mqpcb,
-                    struct h_galpa gal);
-
-u64 hipz_h_query_qp(const struct ipz_adapter_handle adapter_handle,
-                   const struct ipz_qp_handle qp_handle,
-                   struct ehca_pfqp *pfqp,
-                   struct hcp_modify_qp_control_block *qqpcb,
-                   struct h_galpa gal);
-
-u64 hipz_h_destroy_qp(const struct ipz_adapter_handle adapter_handle,
-                     struct ehca_qp *qp);
-
-u64 hipz_h_define_aqp0(const struct ipz_adapter_handle adapter_handle,
-                      const struct ipz_qp_handle qp_handle,
-                      struct h_galpa gal,
-                      u32 port);
-
-u64 hipz_h_define_aqp1(const struct ipz_adapter_handle adapter_handle,
-                      const struct ipz_qp_handle qp_handle,
-                      struct h_galpa gal,
-                      u32 port, u32 * pma_qp_nr,
-                      u32 * bma_qp_nr);
-
-u64 hipz_h_attach_mcqp(const struct ipz_adapter_handle adapter_handle,
-                      const struct ipz_qp_handle qp_handle,
-                      struct h_galpa gal,
-                      u16 mcg_dlid,
-                      u64 subnet_prefix, u64 interface_id);
-
-u64 hipz_h_detach_mcqp(const struct ipz_adapter_handle adapter_handle,
-                      const struct ipz_qp_handle qp_handle,
-                      struct h_galpa gal,
-                      u16 mcg_dlid,
-                      u64 subnet_prefix, u64 interface_id);
-
-u64 hipz_h_destroy_cq(const struct ipz_adapter_handle adapter_handle,
-                     struct ehca_cq *cq,
-                     u8 force_flag);
-
-u64 hipz_h_destroy_eq(const struct ipz_adapter_handle adapter_handle,
-                     struct ehca_eq *eq);
-
-/*
- * hipz_h_alloc_resource_mr allocates MR resources in HW and FW and
- * initializes resources.
- */
-u64 hipz_h_alloc_resource_mr(const struct ipz_adapter_handle adapter_handle,
-                            const struct ehca_mr *mr,
-                            const u64 vaddr,
-                            const u64 length,
-                            const u32 access_ctrl,
-                            const struct ipz_pd pd,
-                            struct ehca_mr_hipzout_parms *outparms);
-
-/* hipz_h_register_rpage_mr registers MR resource pages in HW and FW */
-u64 hipz_h_register_rpage_mr(const struct ipz_adapter_handle adapter_handle,
-                            const struct ehca_mr *mr,
-                            const u8 pagesize,
-                            const u8 queue_type,
-                            const u64 logical_address_of_page,
-                            const u64 count);
-
-/* hipz_h_query_mr queries MR in HW and FW */
-u64 hipz_h_query_mr(const struct ipz_adapter_handle adapter_handle,
-                   const struct ehca_mr *mr,
-                   struct ehca_mr_hipzout_parms *outparms);
-
-/* hipz_h_free_resource_mr frees MR resources in HW and FW */
-u64 hipz_h_free_resource_mr(const struct ipz_adapter_handle adapter_handle,
-                           const struct ehca_mr *mr);
-
-/* hipz_h_reregister_pmr reregisters MR in HW and FW */
-u64 hipz_h_reregister_pmr(const struct ipz_adapter_handle adapter_handle,
-                         const struct ehca_mr *mr,
-                         const u64 vaddr_in,
-                         const u64 length,
-                         const u32 access_ctrl,
-                         const struct ipz_pd pd,
-                         const u64 mr_addr_cb,
-                         struct ehca_mr_hipzout_parms *outparms);
-
-/* hipz_h_register_smr registers a shared MR in HW and FW */
-u64 hipz_h_register_smr(const struct ipz_adapter_handle adapter_handle,
-                       const struct ehca_mr *mr,
-                       const struct ehca_mr *orig_mr,
-                       const u64 vaddr_in,
-                       const u32 access_ctrl,
-                       const struct ipz_pd pd,
-                       struct ehca_mr_hipzout_parms *outparms);
-
-/*
- * hipz_h_alloc_resource_mw allocates MW resources in HW and FW and
- * initializes resources.
- */
-u64 hipz_h_alloc_resource_mw(const struct ipz_adapter_handle adapter_handle,
-                            const struct ehca_mw *mw,
-                            const struct ipz_pd pd,
-                            struct ehca_mw_hipzout_parms *outparms);
-
-/* hipz_h_query_mw queries MW in HW and FW */
-u64 hipz_h_query_mw(const struct ipz_adapter_handle adapter_handle,
-                   const struct ehca_mw *mw,
-                   struct ehca_mw_hipzout_parms *outparms);
-
-/* hipz_h_free_resource_mw frees MW resources in HW and FW */
-u64 hipz_h_free_resource_mw(const struct ipz_adapter_handle adapter_handle,
-                           const struct ehca_mw *mw);
-
-u64 hipz_h_error_data(const struct ipz_adapter_handle adapter_handle,
-                     const u64 ressource_handle,
-                     void *rblock,
-                     unsigned long *byte_count);
-u64 hipz_h_eoi(int irq);
-
-#endif /* __HCP_IF_H__ */
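Several of the query calls declared above (hipz_h_query_port, hipz_h_query_hca, hipz_h_error_data) pass the physical address of a response block to firmware and fail with H_PARAMETER unless that block is page aligned. A small user-space sketch of the same alignment rule; the 4 KiB page size and the use of posix_memalign() in place of the kernel page allocator are assumptions made for illustration.

    #define _POSIX_C_SOURCE 200112L
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define EHCA_PAGESIZE 4096UL    /* assumed firmware page size */

    int main(void)
    {
            void *rblock;

            /* The driver hands firmware a whole zeroed page; posix_memalign()
             * gives the same alignment guarantee in user space. */
            if (posix_memalign(&rblock, EHCA_PAGESIZE, EHCA_PAGESIZE))
                    return 1;

            if ((uintptr_t)rblock & (EHCA_PAGESIZE - 1)) {
                    /* the hcall wrappers return H_PARAMETER at this point */
                    fprintf(stderr, "response block not page aligned\n");
                    free(rblock);
                    return 1;
            }

            printf("rblock=%p is page aligned\n", rblock);
            free(rblock);
            return 0;
    }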
diff --git a/drivers/infiniband/hw/ehca/hcp_phyp.c b/drivers/infiniband/hw/ehca/hcp_phyp.c
deleted file mode 100644 (file)
index 077376f..0000000
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *   load store abstraction for ehca register access with tracing
- *
- *  Authors: Christoph Raisch <raisch@de.ibm.com>
- *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "ehca_classes.h"
-#include "hipz_hw.h"
-
-u64 hcall_map_page(u64 physaddr)
-{
-       return (u64)ioremap(physaddr, EHCA_PAGESIZE);
-}
-
-int hcall_unmap_page(u64 mapaddr)
-{
-       iounmap((volatile void __iomem *) mapaddr);
-       return 0;
-}
-
-int hcp_galpas_ctor(struct h_galpas *galpas, int is_user,
-                   u64 paddr_kernel, u64 paddr_user)
-{
-       if (!is_user) {
-               galpas->kernel.fw_handle = hcall_map_page(paddr_kernel);
-               if (!galpas->kernel.fw_handle)
-                       return -ENOMEM;
-       } else
-               galpas->kernel.fw_handle = 0;
-
-       galpas->user.fw_handle = paddr_user;
-
-       return 0;
-}
-
-int hcp_galpas_dtor(struct h_galpas *galpas)
-{
-       if (galpas->kernel.fw_handle) {
-               int ret = hcall_unmap_page(galpas->kernel.fw_handle);
-               if (ret)
-                       return ret;
-       }
-
-       galpas->user.fw_handle = galpas->kernel.fw_handle = 0;
-
-       return 0;
-}
diff --git a/drivers/infiniband/hw/ehca/hcp_phyp.h b/drivers/infiniband/hw/ehca/hcp_phyp.h
deleted file mode 100644 (file)
index d1b0299..0000000
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  Firmware calls
- *
- *  Authors: Christoph Raisch <raisch@de.ibm.com>
- *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *           Waleri Fomin <fomin@de.ibm.com>
- *           Gerd Bayer <gerd.bayer@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __HCP_PHYP_H__
-#define __HCP_PHYP_H__
-
-
-/*
- * eHCA page (mapped into memory)
- * resource to access eHCA register pages in CPU address space
- */
-struct h_galpa {
-       u64 fw_handle;
-       /* for pSeries this is a 64bit memory address where
-          I/O memory is mapped into CPU address space (kv) */
-};
-
-/*
- * resource to access eHCA address space registers, all types
- */
-struct h_galpas {
-       u32 pid;                /* PID of the user space process, for galpa checking */
-       struct h_galpa user;    /* user space accessible resource,
-                                  set to 0 if unused */
-       struct h_galpa kernel;  /* kernel space accessible resource,
-                                  set to 0 if unused */
-};
-
-static inline u64 hipz_galpa_load(struct h_galpa galpa, u32 offset)
-{
-       u64 addr = galpa.fw_handle + offset;
-       return *(volatile u64 __force *)addr;
-}
-
-static inline void hipz_galpa_store(struct h_galpa galpa, u32 offset, u64 value)
-{
-       u64 addr = galpa.fw_handle + offset;
-       *(volatile u64 __force *)addr = value;
-}
-
-int hcp_galpas_ctor(struct h_galpas *galpas, int is_user,
-                   u64 paddr_kernel, u64 paddr_user);
-
-int hcp_galpas_dtor(struct h_galpas *galpas);
-
-u64 hcall_map_page(u64 physaddr);
-
-int hcall_unmap_page(u64 mapaddr);
-
-#endif
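The galpa helpers above expose a firmware register page as a small 64-bit register file: hcp_galpas_ctor() ioremap()s the page into kernel address space and hipz_galpa_load()/hipz_galpa_store() perform volatile 8-byte accesses at fixed offsets. A rough user-space sketch of that access pattern, with an ordinary heap buffer standing in for the mapped firmware page (the struct and helper names are illustrative):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct galpa { uintptr_t base; };          /* stand-in for h_galpa.fw_handle */

    static uint64_t galpa_load(struct galpa g, uint32_t offset)
    {
            return *(volatile uint64_t *)(g.base + offset);
    }

    static void galpa_store(struct galpa g, uint32_t offset, uint64_t value)
    {
            *(volatile uint64_t *)(g.base + offset) = value;
    }

    int main(void)
    {
            /* 4 KiB "register page"; in the driver this would be an
             * ioremap()ed firmware page rather than heap memory. */
            void *page = calloc(1, 4096);
            struct galpa g = { (uintptr_t)page };

            if (!page)
                    return 1;

            galpa_store(g, 0x20, 1);           /* e.g. ring a doorbell register */
            printf("reg@0x20 = %#llx\n", (unsigned long long)galpa_load(g, 0x20));

            free(page);
            return 0;
    }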
diff --git a/drivers/infiniband/hw/ehca/hipz_fns.h b/drivers/infiniband/hw/ehca/hipz_fns.h
deleted file mode 100644 (file)
index 9dac93d..0000000
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  HW abstraction register functions
- *
- *  Authors: Christoph Raisch <raisch@de.ibm.com>
- *           Reinhard Ernst <rernst@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __HIPZ_FNS_H__
-#define __HIPZ_FNS_H__
-
-#include "ehca_classes.h"
-#include "hipz_hw.h"
-
-#include "hipz_fns_core.h"
-
-#define hipz_galpa_store_eq(gal, offset, value) \
-       hipz_galpa_store(gal, EQTEMM_OFFSET(offset), value)
-
-#define hipz_galpa_load_eq(gal, offset) \
-       hipz_galpa_load(gal, EQTEMM_OFFSET(offset))
-
-#define hipz_galpa_store_qped(gal, offset, value) \
-       hipz_galpa_store(gal, QPEDMM_OFFSET(offset), value)
-
-#define hipz_galpa_load_qped(gal, offset) \
-       hipz_galpa_load(gal, QPEDMM_OFFSET(offset))
-
-#define hipz_galpa_store_mrmw(gal, offset, value) \
-       hipz_galpa_store(gal, MRMWMM_OFFSET(offset), value)
-
-#define hipz_galpa_load_mrmw(gal, offset) \
-       hipz_galpa_load(gal, MRMWMM_OFFSET(offset))
-
-#endif
diff --git a/drivers/infiniband/hw/ehca/hipz_fns_core.h b/drivers/infiniband/hw/ehca/hipz_fns_core.h
deleted file mode 100644 (file)
index 868735f..0000000
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  HW abstraction register functions
- *
- *  Authors: Christoph Raisch <raisch@de.ibm.com>
- *           Heiko J Schick <schickhj@de.ibm.com>
- *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
- *           Reinhard Ernst <rernst@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __HIPZ_FNS_CORE_H__
-#define __HIPZ_FNS_CORE_H__
-
-#include "hcp_phyp.h"
-#include "hipz_hw.h"
-
-#define hipz_galpa_store_cq(gal, offset, value) \
-       hipz_galpa_store(gal, CQTEMM_OFFSET(offset), value)
-
-#define hipz_galpa_load_cq(gal, offset) \
-       hipz_galpa_load(gal, CQTEMM_OFFSET(offset))
-
-#define hipz_galpa_store_qp(gal, offset, value) \
-       hipz_galpa_store(gal, QPTEMM_OFFSET(offset), value)
-#define hipz_galpa_load_qp(gal, offset) \
-       hipz_galpa_load(gal, QPTEMM_OFFSET(offset))
-
-static inline void hipz_update_sqa(struct ehca_qp *qp, u16 nr_wqes)
-{
-       /*  ringing doorbell :-) */
-       hipz_galpa_store_qp(qp->galpas.kernel, qpx_sqa,
-                           EHCA_BMASK_SET(QPX_SQADDER, nr_wqes));
-}
-
-static inline void hipz_update_rqa(struct ehca_qp *qp, u16 nr_wqes)
-{
-       /*  ringing doorbell :-) */
-       hipz_galpa_store_qp(qp->galpas.kernel, qpx_rqa,
-                           EHCA_BMASK_SET(QPX_RQADDER, nr_wqes));
-}
-
-static inline void hipz_update_feca(struct ehca_cq *cq, u32 nr_cqes)
-{
-       hipz_galpa_store_cq(cq->galpas.kernel, cqx_feca,
-                           EHCA_BMASK_SET(CQX_FECADDER, nr_cqes));
-}
-
-static inline void hipz_set_cqx_n0(struct ehca_cq *cq, u32 value)
-{
-       u64 cqx_n0_reg;
-
-       hipz_galpa_store_cq(cq->galpas.kernel, cqx_n0,
-                           EHCA_BMASK_SET(CQX_N0_GENERATE_SOLICITED_COMP_EVENT,
-                                          value));
-       cqx_n0_reg = hipz_galpa_load_cq(cq->galpas.kernel, cqx_n0);
-}
-
-static inline void hipz_set_cqx_n1(struct ehca_cq *cq, u32 value)
-{
-       u64 cqx_n1_reg;
-
-       hipz_galpa_store_cq(cq->galpas.kernel, cqx_n1,
-                           EHCA_BMASK_SET(CQX_N1_GENERATE_COMP_EVENT, value));
-       cqx_n1_reg = hipz_galpa_load_cq(cq->galpas.kernel, cqx_n1);
-}
-
-#endif /* __HIPZ_FNS_CORE_H__ */
diff --git a/drivers/infiniband/hw/ehca/hipz_hw.h b/drivers/infiniband/hw/ehca/hipz_hw.h
deleted file mode 100644 (file)
index bf996c7..0000000
+++ /dev/null
@@ -1,414 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  eHCA register definitions
- *
- *  Authors: Waleri Fomin <fomin@de.ibm.com>
- *           Christoph Raisch <raisch@de.ibm.com>
- *           Reinhard Ernst <rernst@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __HIPZ_HW_H__
-#define __HIPZ_HW_H__
-
-#include "ehca_tools.h"
-
-#define EHCA_MAX_MTU 4
-
-/* QP Table Entry Memory Map */
-struct hipz_qptemm {
-       u64 qpx_hcr;
-       u64 qpx_c;
-       u64 qpx_herr;
-       u64 qpx_aer;
-/* 0x20*/
-       u64 qpx_sqa;
-       u64 qpx_sqc;
-       u64 qpx_rqa;
-       u64 qpx_rqc;
-/* 0x40*/
-       u64 qpx_st;
-       u64 qpx_pmstate;
-       u64 qpx_pmfa;
-       u64 qpx_pkey;
-/* 0x60*/
-       u64 qpx_pkeya;
-       u64 qpx_pkeyb;
-       u64 qpx_pkeyc;
-       u64 qpx_pkeyd;
-/* 0x80*/
-       u64 qpx_qkey;
-       u64 qpx_dqp;
-       u64 qpx_dlidp;
-       u64 qpx_portp;
-/* 0xa0*/
-       u64 qpx_slidp;
-       u64 qpx_slidpp;
-       u64 qpx_dlida;
-       u64 qpx_porta;
-/* 0xc0*/
-       u64 qpx_slida;
-       u64 qpx_slidpa;
-       u64 qpx_slvl;
-       u64 qpx_ipd;
-/* 0xe0*/
-       u64 qpx_mtu;
-       u64 qpx_lato;
-       u64 qpx_rlimit;
-       u64 qpx_rnrlimit;
-/* 0x100*/
-       u64 qpx_t;
-       u64 qpx_sqhp;
-       u64 qpx_sqptp;
-       u64 qpx_nspsn;
-/* 0x120*/
-       u64 qpx_nspsnhwm;
-       u64 reserved1;
-       u64 qpx_sdsi;
-       u64 qpx_sdsbc;
-/* 0x140*/
-       u64 qpx_sqwsize;
-       u64 qpx_sqwts;
-       u64 qpx_lsn;
-       u64 qpx_nssn;
-/* 0x160 */
-       u64 qpx_mor;
-       u64 qpx_cor;
-       u64 qpx_sqsize;
-       u64 qpx_erc;
-/* 0x180*/
-       u64 qpx_rnrrc;
-       u64 qpx_ernrwt;
-       u64 qpx_rnrresp;
-       u64 qpx_lmsna;
-/* 0x1a0 */
-       u64 qpx_sqhpc;
-       u64 qpx_sqcptp;
-       u64 qpx_sigt;
-       u64 qpx_wqecnt;
-/* 0x1c0*/
-       u64 qpx_rqhp;
-       u64 qpx_rqptp;
-       u64 qpx_rqsize;
-       u64 qpx_nrr;
-/* 0x1e0*/
-       u64 qpx_rdmac;
-       u64 qpx_nrpsn;
-       u64 qpx_lapsn;
-       u64 qpx_lcr;
-/* 0x200*/
-       u64 qpx_rwc;
-       u64 qpx_rwva;
-       u64 qpx_rdsi;
-       u64 qpx_rdsbc;
-/* 0x220*/
-       u64 qpx_rqwsize;
-       u64 qpx_crmsn;
-       u64 qpx_rdd;
-       u64 qpx_larpsn;
-/* 0x240*/
-       u64 qpx_pd;
-       u64 qpx_scqn;
-       u64 qpx_rcqn;
-       u64 qpx_aeqn;
-/* 0x260*/
-       u64 qpx_aaelog;
-       u64 qpx_ram;
-       u64 qpx_rdmaqe0;
-       u64 qpx_rdmaqe1;
-/* 0x280*/
-       u64 qpx_rdmaqe2;
-       u64 qpx_rdmaqe3;
-       u64 qpx_nrpsnhwm;
-/* 0x298*/
-       u64 reserved[(0x400 - 0x298) / 8];
-/* 0x400 extended data */
-       u64 reserved_ext[(0x500 - 0x400) / 8];
-/* 0x500 */
-       u64 reserved2[(0x1000 - 0x500) / 8];
-/* 0x1000      */
-};
-
-#define QPX_SQADDER EHCA_BMASK_IBM(48, 63)
-#define QPX_RQADDER EHCA_BMASK_IBM(48, 63)
-#define QPX_AAELOG_RESET_SRQ_LIMIT EHCA_BMASK_IBM(3, 3)
-
-#define QPTEMM_OFFSET(x) offsetof(struct hipz_qptemm, x)
-
-/* MRMWPT Entry Memory Map */
-struct hipz_mrmwmm {
-       /* 0x00 */
-       u64 mrx_hcr;
-
-       u64 mrx_c;
-       u64 mrx_herr;
-       u64 mrx_aer;
-       /* 0x20 */
-       u64 mrx_pp;
-       u64 reserved1;
-       u64 reserved2;
-       u64 reserved3;
-       /* 0x40 */
-       u64 reserved4[(0x200 - 0x40) / 8];
-       /* 0x200 */
-       u64 mrx_ctl[64];
-
-};
-
-#define MRMWMM_OFFSET(x) offsetof(struct hipz_mrmwmm, x)
-
-struct hipz_qpedmm {
-       /* 0x00 */
-       u64 reserved0[(0x400) / 8];
-       /* 0x400 */
-       u64 qpedx_phh;
-       u64 qpedx_ppsgp;
-       /* 0x410 */
-       u64 qpedx_ppsgu;
-       u64 qpedx_ppdgp;
-       /* 0x420 */
-       u64 qpedx_ppdgu;
-       u64 qpedx_aph;
-       /* 0x430 */
-       u64 qpedx_apsgp;
-       u64 qpedx_apsgu;
-       /* 0x440 */
-       u64 qpedx_apdgp;
-       u64 qpedx_apdgu;
-       /* 0x450 */
-       u64 qpedx_apav;
-       u64 qpedx_apsav;
-       /* 0x460  */
-       u64 qpedx_hcr;
-       u64 reserved1[4];
-       /* 0x488 */
-       u64 qpedx_rrl0;
-       /* 0x490 */
-       u64 qpedx_rrrkey0;
-       u64 qpedx_rrva0;
-       /* 0x4a0 */
-       u64 reserved2;
-       u64 qpedx_rrl1;
-       /* 0x4b0 */
-       u64 qpedx_rrrkey1;
-       u64 qpedx_rrva1;
-       /* 0x4c0 */
-       u64 reserved3;
-       u64 qpedx_rrl2;
-       /* 0x4d0 */
-       u64 qpedx_rrrkey2;
-       u64 qpedx_rrva2;
-       /* 0x4e0 */
-       u64 reserved4;
-       u64 qpedx_rrl3;
-       /* 0x4f0 */
-       u64 qpedx_rrrkey3;
-       u64 qpedx_rrva3;
-};
-
-#define QPEDMM_OFFSET(x) offsetof(struct hipz_qpedmm, x)
-
-/* CQ Table Entry Memory Map */
-struct hipz_cqtemm {
-       u64 cqx_hcr;
-       u64 cqx_c;
-       u64 cqx_herr;
-       u64 cqx_aer;
-/* 0x20  */
-       u64 cqx_ptp;
-       u64 cqx_tp;
-       u64 cqx_fec;
-       u64 cqx_feca;
-/* 0x40  */
-       u64 cqx_ep;
-       u64 cqx_eq;
-/* 0x50  */
-       u64 reserved1;
-       u64 cqx_n0;
-/* 0x60  */
-       u64 cqx_n1;
-       u64 reserved2[(0x1000 - 0x60) / 8];
-/* 0x1000 */
-};
-
-#define CQX_FEC_CQE_CNT           EHCA_BMASK_IBM(32, 63)
-#define CQX_FECADDER              EHCA_BMASK_IBM(32, 63)
-#define CQX_N0_GENERATE_SOLICITED_COMP_EVENT EHCA_BMASK_IBM(0, 0)
-#define CQX_N1_GENERATE_COMP_EVENT EHCA_BMASK_IBM(0, 0)
-
-#define CQTEMM_OFFSET(x) offsetof(struct hipz_cqtemm, x)
-
-/* EQ Table Entry Memory Map */
-struct hipz_eqtemm {
-       u64 eqx_hcr;
-       u64 eqx_c;
-
-       u64 eqx_herr;
-       u64 eqx_aer;
-/* 0x20 */
-       u64 eqx_ptp;
-       u64 eqx_tp;
-       u64 eqx_ssba;
-       u64 eqx_psba;
-
-/* 0x40 */
-       u64 eqx_cec;
-       u64 eqx_meql;
-       u64 eqx_xisbi;
-       u64 eqx_xisc;
-/* 0x60 */
-       u64 eqx_it;
-
-};
-
-#define EQTEMM_OFFSET(x) offsetof(struct hipz_eqtemm, x)
-
-/* access control defines for MR/MW */
-#define HIPZ_ACCESSCTRL_L_WRITE  0x00800000
-#define HIPZ_ACCESSCTRL_R_WRITE  0x00400000
-#define HIPZ_ACCESSCTRL_R_READ   0x00200000
-#define HIPZ_ACCESSCTRL_R_ATOMIC 0x00100000
-#define HIPZ_ACCESSCTRL_MW_BIND  0x00080000
-
-/* query hca response block */
-struct hipz_query_hca {
-       u32 cur_reliable_dg;
-       u32 cur_qp;
-       u32 cur_cq;
-       u32 cur_eq;
-       u32 cur_mr;
-       u32 cur_mw;
-       u32 cur_ee_context;
-       u32 cur_mcast_grp;
-       u32 cur_qp_attached_mcast_grp;
-       u32 reserved1;
-       u32 cur_ipv6_qp;
-       u32 cur_eth_qp;
-       u32 cur_hp_mr;
-       u32 reserved2[3];
-       u32 max_rd_domain;
-       u32 max_qp;
-       u32 max_cq;
-       u32 max_eq;
-       u32 max_mr;
-       u32 max_hp_mr;
-       u32 max_mw;
-       u32 max_mrwpte;
-       u32 max_special_mrwpte;
-       u32 max_rd_ee_context;
-       u32 max_mcast_grp;
-       u32 max_total_mcast_qp_attach;
-       u32 max_mcast_qp_attach;
-       u32 max_raw_ipv6_qp;
-       u32 max_raw_ethy_qp;
-       u32 internal_clock_frequency;
-       u32 max_pd;
-       u32 max_ah;
-       u32 max_cqe;
-       u32 max_wqes_wq;
-       u32 max_partitions;
-       u32 max_rr_ee_context;
-       u32 max_rr_qp;
-       u32 max_rr_hca;
-       u32 max_act_wqs_ee_context;
-       u32 max_act_wqs_qp;
-       u32 max_sge;
-       u32 max_sge_rd;
-       u32 memory_page_size_supported;
-       u64 max_mr_size;
-       u32 local_ca_ack_delay;
-       u32 num_ports;
-       u32 vendor_id;
-       u32 vendor_part_id;
-       u32 hw_ver;
-       u64 node_guid;
-       u64 hca_cap_indicators;
-       u32 data_counter_register_size;
-       u32 max_shared_rq;
-       u32 max_isns_eq;
-       u32 max_neq;
-} __attribute__ ((packed));
-
-#define HCA_CAP_AH_PORT_NR_CHECK      EHCA_BMASK_IBM( 0,  0)
-#define HCA_CAP_ATOMIC                EHCA_BMASK_IBM( 1,  1)
-#define HCA_CAP_AUTO_PATH_MIG         EHCA_BMASK_IBM( 2,  2)
-#define HCA_CAP_BAD_P_KEY_CTR         EHCA_BMASK_IBM( 3,  3)
-#define HCA_CAP_SQD_RTS_PORT_CHANGE   EHCA_BMASK_IBM( 4,  4)
-#define HCA_CAP_CUR_QP_STATE_MOD      EHCA_BMASK_IBM( 5,  5)
-#define HCA_CAP_INIT_TYPE             EHCA_BMASK_IBM( 6,  6)
-#define HCA_CAP_PORT_ACTIVE_EVENT     EHCA_BMASK_IBM( 7,  7)
-#define HCA_CAP_Q_KEY_VIOL_CTR        EHCA_BMASK_IBM( 8,  8)
-#define HCA_CAP_WQE_RESIZE            EHCA_BMASK_IBM( 9,  9)
-#define HCA_CAP_RAW_PACKET_MCAST      EHCA_BMASK_IBM(10, 10)
-#define HCA_CAP_SHUTDOWN_PORT         EHCA_BMASK_IBM(11, 11)
-#define HCA_CAP_RC_LL_QP              EHCA_BMASK_IBM(12, 12)
-#define HCA_CAP_SRQ                   EHCA_BMASK_IBM(13, 13)
-#define HCA_CAP_UD_LL_QP              EHCA_BMASK_IBM(16, 16)
-#define HCA_CAP_RESIZE_MR             EHCA_BMASK_IBM(17, 17)
-#define HCA_CAP_MINI_QP               EHCA_BMASK_IBM(18, 18)
-#define HCA_CAP_H_ALLOC_RES_SYNC      EHCA_BMASK_IBM(19, 19)
-
-/* query port response block */
-struct hipz_query_port {
-       u32 state;
-       u32 bad_pkey_cntr;
-       u32 lmc;
-       u32 lid;
-       u32 subnet_timeout;
-       u32 qkey_viol_cntr;
-       u32 sm_sl;
-       u32 sm_lid;
-       u32 capability_mask;
-       u32 init_type_reply;
-       u32 pkey_tbl_len;
-       u32 gid_tbl_len;
-       u64 gid_prefix;
-       u32 port_nr;
-       u16 pkey_entries[16];
-       u8  reserved1[32];
-       u32 trent_size;
-       u32 trbuf_size;
-       u64 max_msg_sz;
-       u32 max_mtu;
-       u32 vl_cap;
-       u32 phys_pstate;
-       u32 phys_state;
-       u32 phys_speed;
-       u32 phys_width;
-       u8  reserved2[1884];
-       u64 guid_entries[255];
-} __attribute__ ((packed));
-
-#endif
diff --git a/drivers/infiniband/hw/ehca/ipz_pt_fn.c b/drivers/infiniband/hw/ehca/ipz_pt_fn.c
deleted file mode 100644 (file)
index 7ffc748..0000000
+++ /dev/null
@@ -1,289 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  internal queue handling
- *
- *  Authors: Waleri Fomin <fomin@de.ibm.com>
- *           Reinhard Ernst <rernst@de.ibm.com>
- *           Christoph Raisch <raisch@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/slab.h>
-
-#include "ehca_tools.h"
-#include "ipz_pt_fn.h"
-#include "ehca_classes.h"
-
-#define PAGES_PER_KPAGE (PAGE_SIZE >> EHCA_PAGESHIFT)
-
-struct kmem_cache *small_qp_cache;
-
-void *ipz_qpageit_get_inc(struct ipz_queue *queue)
-{
-       void *ret = ipz_qeit_get(queue);
-       queue->current_q_offset += queue->pagesize;
-       if (queue->current_q_offset > queue->queue_length) {
-               queue->current_q_offset -= queue->pagesize;
-               ret = NULL;
-       }
-       if (((u64)ret) % queue->pagesize) {
-               ehca_gen_err("ERROR!! not at PAGE-Boundary");
-               return NULL;
-       }
-       return ret;
-}
-
-void *ipz_qeit_eq_get_inc(struct ipz_queue *queue)
-{
-       void *ret = ipz_qeit_get(queue);
-       u64 last_entry_in_q = queue->queue_length - queue->qe_size;
-
-       queue->current_q_offset += queue->qe_size;
-       if (queue->current_q_offset > last_entry_in_q) {
-               queue->current_q_offset = 0;
-               queue->toggle_state = (~queue->toggle_state) & 1;
-       }
-
-       return ret;
-}
-
-int ipz_queue_abs_to_offset(struct ipz_queue *queue, u64 addr, u64 *q_offset)
-{
-       int i;
-       for (i = 0; i < queue->queue_length / queue->pagesize; i++) {
-               u64 page = __pa(queue->queue_pages[i]);
-               if (addr >= page && addr < page + queue->pagesize) {
-                       *q_offset = addr - page + i * queue->pagesize;
-                       return 0;
-               }
-       }
-       return -EINVAL;
-}
-
-#if PAGE_SHIFT < EHCA_PAGESHIFT
-#error Kernel pages must be at least as large as eHCA pages (4K)!
-#endif
-
-/*
- * allocate pages for queue:
- * outer loop allocates whole kernel pages (page aligned) and
- * inner loop divides a kernel page into smaller hca queue pages
- */
-static int alloc_queue_pages(struct ipz_queue *queue, const u32 nr_of_pages)
-{
-       int k, f = 0;
-       u8 *kpage;
-
-       while (f < nr_of_pages) {
-               kpage = (u8 *)get_zeroed_page(GFP_KERNEL);
-               if (!kpage)
-                       goto out;
-
-               for (k = 0; k < PAGES_PER_KPAGE && f < nr_of_pages; k++) {
-                       queue->queue_pages[f] = (struct ipz_page *)kpage;
-                       kpage += EHCA_PAGESIZE;
-                       f++;
-               }
-       }
-       return 1;
-
-out:
-       for (f = 0; f < nr_of_pages && queue->queue_pages[f];
-            f += PAGES_PER_KPAGE)
-               free_page((unsigned long)(queue->queue_pages)[f]);
-       return 0;
-}
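The split between kernel pages and eHCA pages driven by PAGES_PER_KPAGE can be made concrete with a small worked example. Assuming a 64K kernel PAGE_SIZE (common on POWER) and the fixed 4K EHCA_PAGESIZE, each kernel page carries 16 eHCA queue pages; the names and numbers below are illustrative only.

#define EXAMPLE_PAGE_SIZE       65536UL        /* assumed 64K kernel page */
#define EXAMPLE_EHCA_PAGESHIFT  12             /* 4K eHCA page */
#define EXAMPLE_PAGES_PER_KPAGE (EXAMPLE_PAGE_SIZE >> EXAMPLE_EHCA_PAGESHIFT)  /* 16 */

/* Kernel pages needed for a queue of nr_of_pages eHCA pages, rounded up. */
static unsigned long example_kpages_needed(unsigned long nr_of_pages)
{
	/* e.g. 33 eHCA pages -> (33 + 15) / 16 = 3 kernel pages */
	return (nr_of_pages + EXAMPLE_PAGES_PER_KPAGE - 1) / EXAMPLE_PAGES_PER_KPAGE;
}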
-
-static int alloc_small_queue_page(struct ipz_queue *queue, struct ehca_pd *pd)
-{
-       int order = ilog2(queue->pagesize) - 9;
-       struct ipz_small_queue_page *page;
-       unsigned long bit;
-
-       mutex_lock(&pd->lock);
-
-       if (!list_empty(&pd->free[order]))
-               page = list_entry(pd->free[order].next,
-                                 struct ipz_small_queue_page, list);
-       else {
-               page = kmem_cache_zalloc(small_qp_cache, GFP_KERNEL);
-               if (!page)
-                       goto out;
-
-               page->page = get_zeroed_page(GFP_KERNEL);
-               if (!page->page) {
-                       kmem_cache_free(small_qp_cache, page);
-                       goto out;
-               }
-
-               list_add(&page->list, &pd->free[order]);
-       }
-
-       bit = find_first_zero_bit(page->bitmap, IPZ_SPAGE_PER_KPAGE >> order);
-       __set_bit(bit, page->bitmap);
-       page->fill++;
-
-       if (page->fill == IPZ_SPAGE_PER_KPAGE >> order)
-               list_move(&page->list, &pd->full[order]);
-
-       mutex_unlock(&pd->lock);
-
-       queue->queue_pages[0] = (void *)(page->page | (bit << (order + 9)));
-       queue->small_page = page;
-       queue->offset = bit << (order + 9);
-       return 1;
-
-out:
-       ehca_err(pd->ib_pd.device, "failed to allocate small queue page");
-       mutex_unlock(&pd->lock);
-       return 0;
-}
-
-static void free_small_queue_page(struct ipz_queue *queue, struct ehca_pd *pd)
-{
-       int order = ilog2(queue->pagesize) - 9;
-       struct ipz_small_queue_page *page = queue->small_page;
-       unsigned long bit;
-       int free_page = 0;
-
-       bit = ((unsigned long)queue->queue_pages[0] & ~PAGE_MASK)
-               >> (order + 9);
-
-       mutex_lock(&pd->lock);
-
-       __clear_bit(bit, page->bitmap);
-       page->fill--;
-
-       if (page->fill == 0) {
-               list_del(&page->list);
-               free_page = 1;
-       }
-
-       if (page->fill == (IPZ_SPAGE_PER_KPAGE >> order) - 1)
-               /* the page was full until we freed the chunk */
-               list_move_tail(&page->list, &pd->free[order]);
-
-       mutex_unlock(&pd->lock);
-
-       if (free_page) {
-               free_page(page->page);
-               kmem_cache_free(small_qp_cache, page);
-       }
-}
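The small-queue path above carves one zeroed kernel page into pagesize-sized chunks of at least 512 bytes; order = ilog2(pagesize) - 9 is the chunk size expressed in 512-byte units, and the chosen bitmap bit turns into a byte offset with bit << (order + 9). A standalone sketch of that arithmetic, purely illustrative:

/* Byte offset of chunk 'bit' for a small queue with the given pagesize. */
static unsigned long example_small_queue_offset(unsigned long bit,
						unsigned int pagesize)
{
	unsigned int order = 0;
	unsigned int sz = pagesize;

	while (sz > 512) {              /* open-coded ilog2(pagesize) - 9 */
		sz >>= 1;
		order++;
	}
	/* e.g. bit = 3 with 1024-byte pages: order = 1, offset = 3 << 10 = 3072 */
	return bit << (order + 9);
}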
-
-int ipz_queue_ctor(struct ehca_pd *pd, struct ipz_queue *queue,
-                  const u32 nr_of_pages, const u32 pagesize,
-                  const u32 qe_size, const u32 nr_of_sg,
-                  int is_small)
-{
-       if (pagesize > PAGE_SIZE) {
-               ehca_gen_err("FATAL ERROR: pagesize=%x "
-                            "is greater than kernel page size", pagesize);
-               return 0;
-       }
-
-       /* init queue fields */
-       queue->queue_length = nr_of_pages * pagesize;
-       queue->pagesize = pagesize;
-       queue->qe_size = qe_size;
-       queue->act_nr_of_sg = nr_of_sg;
-       queue->current_q_offset = 0;
-       queue->toggle_state = 1;
-       queue->small_page = NULL;
-
-       /* allocate queue page pointers */
-       queue->queue_pages = kzalloc(nr_of_pages * sizeof(void *),
-                                    GFP_KERNEL | __GFP_NOWARN);
-       if (!queue->queue_pages) {
-               queue->queue_pages = vzalloc(nr_of_pages * sizeof(void *));
-               if (!queue->queue_pages) {
-                       ehca_gen_err("Couldn't allocate queue page list");
-                       return 0;
-               }
-       }
-
-       /* allocate actual queue pages */
-       if (is_small) {
-               if (!alloc_small_queue_page(queue, pd))
-                       goto ipz_queue_ctor_exit0;
-       } else
-               if (!alloc_queue_pages(queue, nr_of_pages))
-                       goto ipz_queue_ctor_exit0;
-
-       return 1;
-
-ipz_queue_ctor_exit0:
-       ehca_gen_err("Couldn't alloc pages queue=%p "
-                "nr_of_pages=%x",  queue, nr_of_pages);
-       kvfree(queue->queue_pages);
-
-       return 0;
-}
-
-int ipz_queue_dtor(struct ehca_pd *pd, struct ipz_queue *queue)
-{
-       int i, nr_pages;
-
-       if (!queue || !queue->queue_pages) {
-               ehca_gen_dbg("queue or queue_pages is NULL");
-               return 0;
-       }
-
-       if (queue->small_page)
-               free_small_queue_page(queue, pd);
-       else {
-               nr_pages = queue->queue_length / queue->pagesize;
-               for (i = 0; i < nr_pages; i += PAGES_PER_KPAGE)
-                       free_page((unsigned long)queue->queue_pages[i]);
-       }
-
-       kvfree(queue->queue_pages);
-
-       return 1;
-}
-
-int ehca_init_small_qp_cache(void)
-{
-       small_qp_cache = kmem_cache_create("ehca_cache_small_qp",
-                                          sizeof(struct ipz_small_queue_page),
-                                          0, SLAB_HWCACHE_ALIGN, NULL);
-       if (!small_qp_cache)
-               return -ENOMEM;
-
-       return 0;
-}
-
-void ehca_cleanup_small_qp_cache(void)
-{
-       kmem_cache_destroy(small_qp_cache);
-}
diff --git a/drivers/infiniband/hw/ehca/ipz_pt_fn.h b/drivers/infiniband/hw/ehca/ipz_pt_fn.h
deleted file mode 100644 (file)
index a801274..0000000
+++ /dev/null
@@ -1,289 +0,0 @@
-/*
- *  IBM eServer eHCA Infiniband device driver for Linux on POWER
- *
- *  internal queue handling
- *
- *  Authors: Waleri Fomin <fomin@de.ibm.com>
- *           Reinhard Ernst <rernst@de.ibm.com>
- *           Christoph Raisch <raisch@de.ibm.com>
- *
- *  Copyright (c) 2005 IBM Corporation
- *
- *  All rights reserved.
- *
- *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
- *  BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __IPZ_PT_FN_H__
-#define __IPZ_PT_FN_H__
-
-#define EHCA_PAGESHIFT   12
-#define EHCA_PAGESIZE   4096UL
-#define EHCA_PAGEMASK   (~(EHCA_PAGESIZE-1))
-#define EHCA_PT_ENTRIES 512UL
-
-#include "ehca_tools.h"
-#include "ehca_qes.h"
-
-struct ehca_pd;
-struct ipz_small_queue_page;
-
-extern struct kmem_cache *small_qp_cache;
-
-/* struct generic ehca page */
-struct ipz_page {
-       u8 entries[EHCA_PAGESIZE];
-};
-
-#define IPZ_SPAGE_PER_KPAGE (PAGE_SIZE / 512)
-
-struct ipz_small_queue_page {
-       unsigned long page;
-       unsigned long bitmap[IPZ_SPAGE_PER_KPAGE / BITS_PER_LONG];
-       int fill;
-       void *mapped_addr;
-       u32 mmap_count;
-       struct list_head list;
-};
-
-/* struct generic queue in linux kernel virtual memory (kv) */
-struct ipz_queue {
-       u64 current_q_offset;   /* current queue entry */
-
-       struct ipz_page **queue_pages;  /* array of pages belonging to queue */
-       u32 qe_size;            /* queue entry size */
-       u32 act_nr_of_sg;
-       u32 queue_length;       /* queue length allocated in bytes */
-       u32 pagesize;
-       u32 toggle_state;       /* toggle flag - per page */
-       u32 offset; /* save offset within page for small_qp */
-       struct ipz_small_queue_page *small_page;
-};
-
-/*
- * return current Queue Entry for a certain q_offset
- * returns address (kv) of Queue Entry
- */
-static inline void *ipz_qeit_calc(struct ipz_queue *queue, u64 q_offset)
-{
-       struct ipz_page *current_page;
-       if (q_offset >= queue->queue_length)
-               return NULL;
-       current_page = (queue->queue_pages)[q_offset >> EHCA_PAGESHIFT];
-       return &current_page->entries[q_offset & (EHCA_PAGESIZE - 1)];
-}
-
-/*
- * return current Queue Entry
- * returns address (kv) of Queue Entry
- */
-static inline void *ipz_qeit_get(struct ipz_queue *queue)
-{
-       return ipz_qeit_calc(queue, queue->current_q_offset);
-}
-
-/*
- * return current Queue Page, increment Queue Page iterator from
- * page to page in struct ipz_queue; the last increment returns NULL and
- * does NOT wrap
- * returns address (kv) of Queue Page
- * warning don't use in parallel with ipz_qeit_get_inc()
- */
-void *ipz_qpageit_get_inc(struct ipz_queue *queue);
-
-/*
- * return current Queue Entry, increment Queue Entry iterator by one
- * step in struct ipz_queue, will wrap in ringbuffer
- * returns address (kv) of Queue Entry BEFORE increment
- * warning don't use in parallel with ipz_qpageit_get_inc()
- */
-static inline void *ipz_qeit_get_inc(struct ipz_queue *queue)
-{
-       void *ret = ipz_qeit_get(queue);
-       queue->current_q_offset += queue->qe_size;
-       if (queue->current_q_offset >= queue->queue_length) {
-               queue->current_q_offset = 0;
-               /* toggle the valid flag */
-               queue->toggle_state = (~queue->toggle_state) & 1;
-       }
-
-       return ret;
-}
-
-/*
- * return a bool indicating whether current Queue Entry is valid
- */
-static inline int ipz_qeit_is_valid(struct ipz_queue *queue)
-{
-       struct ehca_cqe *cqe = ipz_qeit_get(queue);
-       return ((cqe->cqe_flags >> 7) == (queue->toggle_state & 1));
-}
-
-/*
- * return current Queue Entry, increment Queue Entry iterator by one
- * step in struct ipz_queue, will wrap in ringbuffer
- * returns address (kv) of Queue Entry BEFORE increment
- * returns 0 and does not increment, if wrong valid state
- * warning don't use in parallel with ipz_qpageit_get_inc()
- */
-static inline void *ipz_qeit_get_inc_valid(struct ipz_queue *queue)
-{
-       return ipz_qeit_is_valid(queue) ? ipz_qeit_get_inc(queue) : NULL;
-}
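Putting the toggle-protected iterators together, a consumer can simply drain entries until the valid bit stops matching the queue's toggle_state. The sketch below is not driver code; it assumes the caller already holds whatever lock protects the queue.

static inline int example_drain_valid_cqes(struct ipz_queue *queue)
{
	struct ehca_cqe *cqe;
	int n = 0;

	/* ipz_qeit_get_inc_valid() returns NULL once the valid bit no longer
	 * matches toggle_state, i.e. when the ring has no more new entries */
	while ((cqe = ipz_qeit_get_inc_valid(queue)) != NULL)
		n++;            /* a real consumer would process *cqe here */

	return n;
}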
-
-/*
- * returns and resets Queue Entry iterator
- * returns address (kv) of first Queue Entry
- */
-static inline void *ipz_qeit_reset(struct ipz_queue *queue)
-{
-       queue->current_q_offset = 0;
-       return ipz_qeit_get(queue);
-}
-
-/*
- * return the q_offset corresponding to an absolute address
- */
-int ipz_queue_abs_to_offset(struct ipz_queue *queue, u64 addr, u64 *q_offset);
-
-/*
- * return the next queue offset. don't modify the queue.
- */
-static inline u64 ipz_queue_advance_offset(struct ipz_queue *queue, u64 offset)
-{
-       offset += queue->qe_size;
-       if (offset >= queue->queue_length) offset = 0;
-       return offset;
-}
-
-/* struct generic page table */
-struct ipz_pt {
-       u64 entries[EHCA_PT_ENTRIES];
-};
-
-/* struct page table for a queue, only to be used in pf */
-struct ipz_qpt {
-       /* queue page tables (kv), use u64 because we know the element length */
-       u64 *qpts;
-       u32 n_qpts;
-       u32 n_ptes;       /*  number of page table entries */
-       u64 *current_pte_addr;
-};
-
-/*
- * constructor for an ipz_queue_t, placement new for ipz_queue_t,
- * new for all dependent data structures
- * all QP Tables are the same
- * flow:
- *    allocate+pin queue
- * see ipz_qpt_ctor()
- * returns true if ok, false if out of memory
- */
-int ipz_queue_ctor(struct ehca_pd *pd, struct ipz_queue *queue,
-                  const u32 nr_of_pages, const u32 pagesize,
-                  const u32 qe_size, const u32 nr_of_sg,
-                  int is_small);
-
-/*
- * destructor for an ipz_queue_t
- *  -# free queue
- *  see ipz_queue_ctor()
- *  returns true if ok, false if queue was NULL-ptr or free failed
- */
-int ipz_queue_dtor(struct ehca_pd *pd, struct ipz_queue *queue);
-
-/*
- * constructor for an ipz_qpt_t,
- * placement new for struct ipz_queue, new for all dependent data structures
- * all QP Tables are the same,
- * flow:
- * -# allocate+pin queue
- * -# initialise ptcb
- * -# allocate+pin PTs
- * -# link PTs to a ring, according to HCA Arch, set bit62 if needed
- * -# the ring must have room for exactly nr_of_PTEs
- * see ipz_qpt_ctor()
- */
-void ipz_qpt_ctor(struct ipz_qpt *qpt,
-                 const u32 nr_of_qes,
-                 const u32 pagesize,
-                 const u32 qe_size,
-                 const u8 lowbyte, const u8 toggle,
-                 u32 * act_nr_of_QEs, u32 * act_nr_of_pages);
-
-/*
- * return current Queue Entry, increment Queue Entry iterator by one
- * step in struct ipz_queue, will wrap in ringbuffer
- * returns address (kv) of Queue Entry BEFORE increment
- * warning don't use in parallel with ipz_qpageit_get_inc()
- * warning unpredictable results may occur if steps>act_nr_of_queue_entries
- * fix EQ page problems
- */
-void *ipz_qeit_eq_get_inc(struct ipz_queue *queue);
-
-/*
- * return current Event Queue Entry, increment Queue Entry iterator
- * by one step in struct ipz_queue if valid, will wrap in ringbuffer
- * returns address (kv) of Queue Entry BEFORE increment
- * returns 0 and does not increment, if wrong valid state
- * warning don't use in parallel with ipz_qpageit_get_inc()
- * warning unpredictable results may occur if steps>act_nr_of_queue_entries
- */
-static inline void *ipz_eqit_eq_get_inc_valid(struct ipz_queue *queue)
-{
-       void *ret = ipz_qeit_get(queue);
-       u32 qe = *(u8 *)ret;
-       if ((qe >> 7) != (queue->toggle_state & 1))
-               return NULL;
-       ipz_qeit_eq_get_inc(queue); /* this is a good one */
-       return ret;
-}
-
-static inline void *ipz_eqit_eq_peek_valid(struct ipz_queue *queue)
-{
-       void *ret = ipz_qeit_get(queue);
-       u32 qe = *(u8 *)ret;
-       if ((qe >> 7) != (queue->toggle_state & 1))
-               return NULL;
-       return ret;
-}
-
-/* returns address (GX) of first queue entry */
-static inline u64 ipz_qpt_get_firstpage(struct ipz_qpt *qpt)
-{
-       return be64_to_cpu(qpt->qpts[0]);
-}
-
-/* returns address (kv) of first page of queue page table */
-static inline void *ipz_qpt_get_qpt(struct ipz_qpt *qpt)
-{
-       return qpt->qpts;
-}
-
-#endif                         /* __IPZ_PT_FN_H__ */
diff --git a/drivers/infiniband/hw/ipath/Kconfig b/drivers/infiniband/hw/ipath/Kconfig
deleted file mode 100644 (file)
index 8fe54ff..0000000
+++ /dev/null
@@ -1,14 +0,0 @@
-config INFINIBAND_IPATH
-       tristate "QLogic HTX HCA support"
-       depends on 64BIT && NET && HT_IRQ
-       ---help---
-       This is a driver for the obsolete QLogic Hyper-Transport
-       IB host channel adapter (model QHT7140),
-       including InfiniBand verbs support.  This driver allows these
-       devices to be used with both kernel upper level protocols such
-       as IP-over-InfiniBand as well as with userspace applications
-       (in conjunction with InfiniBand userspace access).
-       For QLogic PCIe QLE based cards, use the QIB driver instead.
-
-       If you have this hardware, you will need to boot with PAT disabled
-       on your x86-64 systems; use the nopat kernel parameter.
diff --git a/drivers/infiniband/hw/ipath/Makefile b/drivers/infiniband/hw/ipath/Makefile
deleted file mode 100644 (file)
index 4496f28..0000000
+++ /dev/null
@@ -1,37 +0,0 @@
-ccflags-y := -DIPATH_IDSTR='"QLogic kernel.org driver"' \
-       -DIPATH_KERN_TYPE=0
-
-obj-$(CONFIG_INFINIBAND_IPATH) += ib_ipath.o
-
-ib_ipath-y := \
-       ipath_cq.o \
-       ipath_diag.o \
-       ipath_dma.o \
-       ipath_driver.o \
-       ipath_eeprom.o \
-       ipath_file_ops.o \
-       ipath_fs.o \
-       ipath_init_chip.o \
-       ipath_intr.o \
-       ipath_keys.o \
-       ipath_mad.o \
-       ipath_mmap.o \
-       ipath_mr.o \
-       ipath_qp.o \
-       ipath_rc.o \
-       ipath_ruc.o \
-       ipath_sdma.o \
-       ipath_srq.o \
-       ipath_stats.o \
-       ipath_sysfs.o \
-       ipath_uc.o \
-       ipath_ud.o \
-       ipath_user_pages.o \
-       ipath_user_sdma.o \
-       ipath_verbs_mcast.o \
-       ipath_verbs.o
-
-ib_ipath-$(CONFIG_HT_IRQ) += ipath_iba6110.o
-
-ib_ipath-$(CONFIG_X86_64) += ipath_wc_x86_64.o
-ib_ipath-$(CONFIG_PPC64) += ipath_wc_ppc64.o
diff --git a/drivers/infiniband/hw/ipath/ipath_common.h b/drivers/infiniband/hw/ipath/ipath_common.h
deleted file mode 100644 (file)
index 28cfe97..0000000
+++ /dev/null
@@ -1,851 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef _IPATH_COMMON_H
-#define _IPATH_COMMON_H
-
-/*
- * This file contains defines, structures, etc. that are used
- * to communicate between kernel and user code.
- */
-
-
-/* This is the IEEE-assigned OUI for QLogic Inc. InfiniPath */
-#define IPATH_SRC_OUI_1 0x00
-#define IPATH_SRC_OUI_2 0x11
-#define IPATH_SRC_OUI_3 0x75
-
-/* version of protocol header (known to chip also). In the long run,
- * we should be able to generate and accept a range of version numbers;
- * for now we only accept one, and it's compiled in.
- */
-#define IPS_PROTO_VERSION 2
-
-/*
- * These are compile time constants that you may want to enable or disable
- * if you are trying to debug problems with code or performance.
- * IPATH_VERBOSE_TRACING define as 1 if you want additional tracing in
- * fastpath code
- * IPATH_TRACE_REGWRITES define as 1 if you want register writes to be
- * traced in fastpath code
- * _IPATH_TRACING define as 0 if you want to remove all tracing in a
- * compilation unit
- * _IPATH_DEBUGGING define as 0 if you want to remove debug prints
- */
-
-/*
- * The value in the BTH QP field that InfiniPath uses to differentiate
- * an infinipath protocol IB packet vs standard IB transport
- */
-#define IPATH_KD_QP 0x656b79
-
-/*
- * valid states passed to ipath_set_linkstate() user call
- */
-#define IPATH_IB_LINKDOWN              0
-#define IPATH_IB_LINKARM               1
-#define IPATH_IB_LINKACTIVE            2
-#define IPATH_IB_LINKDOWN_ONLY         3
-#define IPATH_IB_LINKDOWN_SLEEP                4
-#define IPATH_IB_LINKDOWN_DISABLE      5
-#define IPATH_IB_LINK_LOOPBACK 6 /* enable local loopback */
-#define IPATH_IB_LINK_EXTERNAL 7 /* normal, disable local loopback */
-#define IPATH_IB_LINK_NO_HRTBT 8 /* disable Heartbeat, e.g. for loopback */
-#define IPATH_IB_LINK_HRTBT    9 /* enable heartbeat, normal, non-loopback */
-
-/*
- * These 3 values (SDR and DDR may be ORed for auto-speed
- * negotiation) are used for the 3rd argument to path_f_set_ib_cfg
- * with cmd IPATH_IB_CFG_SPD_ENB, by direct calls or via sysfs.  They
- * are also the possible values for ipath_link_speed_enabled and active
- * The values were chosen to match values used within the IB spec.
- */
-#define IPATH_IB_SDR 1
-#define IPATH_IB_DDR 2
-
-/*
- * stats maintained by the driver.  For now, at least, this is global
- * to all minor devices.
- */
-struct infinipath_stats {
-       /* number of interrupts taken */
-       __u64 sps_ints;
-       /* number of interrupts for errors */
-       __u64 sps_errints;
-       /* number of errors from chip (not incl. packet errors or CRC) */
-       __u64 sps_errs;
-       /* number of packet errors from chip other than CRC */
-       __u64 sps_pkterrs;
-       /* number of packets with CRC errors (ICRC and VCRC) */
-       __u64 sps_crcerrs;
-       /* number of hardware errors reported (parity, etc.) */
-       __u64 sps_hwerrs;
-       /* number of times IB link changed state unexpectedly */
-       __u64 sps_iblink;
-       __u64 sps_unused; /* was fastrcvint, no longer implemented */
-       /* number of kernel (port0) packets received */
-       __u64 sps_port0pkts;
-       /* number of "ethernet" packets sent by driver */
-       __u64 sps_ether_spkts;
-       /* number of "ethernet" packets received by driver */
-       __u64 sps_ether_rpkts;
-       /* number of SMA packets sent by driver. Obsolete. */
-       __u64 sps_sma_spkts;
-       /* number of SMA packets received by driver. Obsolete. */
-       __u64 sps_sma_rpkts;
-       /* number of times all ports rcvhdrq was full and packet dropped */
-       __u64 sps_hdrqfull;
-       /* number of times all ports egrtid was full and packet dropped */
-       __u64 sps_etidfull;
-       /*
-        * number of times we tried to send from driver, but no pio buffers
-        * avail
-        */
-       __u64 sps_nopiobufs;
-       /* number of ports currently open */
-       __u64 sps_ports;
-       /* list of pkeys (other than default) accepted (0 means not set) */
-       __u16 sps_pkeys[4];
-       __u16 sps_unused16[4]; /* available; maintaining compatible layout */
-       /* number of user ports per chip (not IB ports) */
-       __u32 sps_nports;
-       /* not our interrupt, or already handled */
-       __u32 sps_nullintr;
-       /* max number of packets handled per receive call */
-       __u32 sps_maxpkts_call;
-       /* avg number of packets handled per receive call */
-       __u32 sps_avgpkts_call;
-       /* total number of pages locked */
-       __u64 sps_pagelocks;
-       /* total number of pages unlocked */
-       __u64 sps_pageunlocks;
-       /*
-        * Number of packets dropped in kernel other than errors (ether
-        * packets if ipath not configured, etc.)
-        */
-       __u64 sps_krdrops;
-       __u64 sps_txeparity; /* PIO buffer parity error, recovered */
-       /* pad for future growth */
-       __u64 __sps_pad[45];
-};
-
-/*
- * These are the status bits readable (in ascii form, 64bit value)
- * from the "status" sysfs file.
- */
-#define IPATH_STATUS_INITTED       0x1 /* basic initialization done */
-#define IPATH_STATUS_DISABLED      0x2 /* hardware disabled */
-/* Device has been disabled via admin request */
-#define IPATH_STATUS_ADMIN_DISABLED    0x4
-/* Chip has been found and initted */
-#define IPATH_STATUS_CHIP_PRESENT 0x20
-/* IB link is at ACTIVE, usable for data traffic */
-#define IPATH_STATUS_IB_READY     0x40
-/* link is configured, LID, MTU, etc. have been set */
-#define IPATH_STATUS_IB_CONF      0x80
-/* no link established, probably no cable */
-#define IPATH_STATUS_IB_NOCABLE  0x100
-/* A Fatal hardware error has occurred. */
-#define IPATH_STATUS_HWERROR     0x200
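User space can combine these bits when deciding whether the link is usable; per the comment above, the status value is exported through the "status" sysfs file as an ASCII number. The parser below is an illustrative user-space sketch, not driver code.

#include <stdint.h>
#include <stdlib.h>

static int example_ipath_link_usable(const char *status_str)
{
	uint64_t status = strtoull(status_str, NULL, 0);

	return (status & IPATH_STATUS_CHIP_PRESENT) &&
	       (status & IPATH_STATUS_IB_READY) &&
	       !(status & IPATH_STATUS_HWERROR);
}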
-
-/*
- * The list of usermode accessible registers.  Also see Reg_* later in file.
- */
-typedef enum _ipath_ureg {
-       /* (RO)  DMA RcvHdr to be used next. */
-       ur_rcvhdrtail = 0,
-       /* (RW)  RcvHdr entry to be processed next by host. */
-       ur_rcvhdrhead = 1,
-       /* (RO)  Index of next Eager index to use. */
-       ur_rcvegrindextail = 2,
-       /* (RW)  Eager TID to be processed next */
-       ur_rcvegrindexhead = 3,
-       /* For internal use only; max register number. */
-       _IPATH_UregMax
-} ipath_ureg;
-
-/* bit values for spi_runtime_flags */
-#define IPATH_RUNTIME_HT       0x1
-#define IPATH_RUNTIME_PCIE     0x2
-#define IPATH_RUNTIME_FORCE_WC_ORDER   0x4
-#define IPATH_RUNTIME_RCVHDR_COPY      0x8
-#define IPATH_RUNTIME_MASTER   0x10
-#define IPATH_RUNTIME_NODMA_RTAIL 0x80
-#define IPATH_RUNTIME_SDMA           0x200
-#define IPATH_RUNTIME_FORCE_PIOAVAIL 0x400
-#define IPATH_RUNTIME_PIO_REGSWAPPED 0x800
-
-/*
- * This structure is returned by ipath_userinit() immediately after
- * open to get implementation-specific info, and info specific to this
- * instance.
- *
- * This struct must have explicit pad fields where type sizes
- * may result in different alignments between 32 and 64 bit
- * programs, since the 64 bit kernel requires the user code
- * to have matching offsets
- */
-struct ipath_base_info {
-       /* version of hardware, for feature checking. */
-       __u32 spi_hw_version;
-       /* version of software, for feature checking. */
-       __u32 spi_sw_version;
-       /* InfiniPath port assigned, goes into sent packets */
-       __u16 spi_port;
-       __u16 spi_subport;
-       /*
-        * IB MTU; a packet's IB data must be less than this.
-        * The MTU is in bytes, and will be a multiple of 4 bytes.
-        */
-       __u32 spi_mtu;
-       /*
-        * Size of a PIO buffer.  Any given packet's total size must be less
-        * than this (in words).  Included is the starting control word, so
-        * if 513 is returned, then total pkt size is 512 words or less.
-        */
-       __u32 spi_piosize;
-       /* size of the TID cache in infinipath, in entries */
-       __u32 spi_tidcnt;
-       /* size of the TID Eager list in infinipath, in entries */
-       __u32 spi_tidegrcnt;
-       /* size of a single receive header queue entry in words. */
-       __u32 spi_rcvhdrent_size;
-       /*
-        * Count of receive header queue entries allocated.
-        * This may be less than the spu_rcvhdrcnt passed in!
-        */
-       __u32 spi_rcvhdr_cnt;
-
-       /* per-chip and other runtime features bitmap (IPATH_RUNTIME_*) */
-       __u32 spi_runtime_flags;
-
-       /* address where receive buffer queue is mapped into */
-       __u64 spi_rcvhdr_base;
-
-       /* user program. */
-
-       /* base address of eager TID receive buffers. */
-       __u64 spi_rcv_egrbufs;
-
-       /* Allocated by initialization code, not by protocol. */
-
-       /*
-        * Size of each TID buffer in host memory, starting at
-        * spi_rcv_egrbufs.  The buffers are virtually contiguous.
-        */
-       __u32 spi_rcv_egrbufsize;
-       /*
-        * The special QP (queue pair) value that identifies an infinipath
-        * protocol packet from standard IB packets.  More, probably much
-        * more, to be added.
-        */
-       __u32 spi_qpair;
-
-       /*
-        * User register base for init code, not to be used directly by
-        * protocol or applications.
-        */
-       __u64 __spi_uregbase;
-       /*
-        * Maximum buffer size in bytes that can be used in a single TID
-        * entry (assuming the buffer is aligned to this boundary).  This is
-        * the minimum of what the hardware and software support.  Guaranteed
-        * to be a power of 2.
-        */
-       __u32 spi_tid_maxsize;
-       /*
-        * alignment of each pio send buffer (byte count
-        * to add to spi_piobufbase to get to second buffer)
-        */
-       __u32 spi_pioalign;
-       /*
-        * The index of the first pio buffer available to this process;
-        * needed to do lookup in spi_pioavailaddr; not added to
-        * spi_piobufbase.
-        */
-       __u32 spi_pioindex;
-        /* number of buffers mapped for this process */
-       __u32 spi_piocnt;
-
-       /*
-        * Base address of writeonly pio buffers for this process.
-        * Each buffer has spi_piosize words, and is aligned on spi_pioalign
-        * boundaries.  spi_piocnt buffers are mapped from this address
-        */
-       __u64 spi_piobufbase;
-
-       /*
-        * Base address of readonly memory copy of the pioavail registers.
-        * There are 2 bits for each buffer.
-        */
-       __u64 spi_pioavailaddr;
-
-       /*
-        * Address where driver updates a copy of the interface and driver
-        * status (IPATH_STATUS_*) as a 64 bit value.  It's followed by a
-        * string indicating hardware error, if there was one.
-        */
-       __u64 spi_status;
-
-       /* number of chip ports available to user processes */
-       __u32 spi_nports;
-       /* unit number of chip we are using */
-       __u32 spi_unit;
-       /* num bufs in each contiguous set */
-       __u32 spi_rcv_egrperchunk;
-       /* size in bytes of each contiguous set */
-       __u32 spi_rcv_egrchunksize;
-       /* total size of mmap to cover full rcvegrbuffers */
-       __u32 spi_rcv_egrbuftotlen;
-       __u32 spi_filler_for_align;
-       /* address of readonly memory copy of the rcvhdrq tail register. */
-       __u64 spi_rcvhdr_tailaddr;
-
-       /* shared memory pages for subports if port is shared */
-       __u64 spi_subport_uregbase;
-       __u64 spi_subport_rcvegrbuf;
-       __u64 spi_subport_rcvhdr_base;
-
-       /* shared memory page for hardware port if it is shared */
-       __u64 spi_port_uregbase;
-       __u64 spi_port_rcvegrbuf;
-       __u64 spi_port_rcvhdr_base;
-       __u64 spi_port_rcvhdr_tailaddr;
-
-} __attribute__ ((aligned(8)));
-
-
-/*
- * This version number is given to the driver by the user code during
- * initialization in the spu_userversion field of ipath_user_info, so
- * the driver can check for compatibility with user code.
- *
- * The major version changes when data structures
- * change in an incompatible way.  The driver must be the same or higher
- * for initialization to succeed.  In some cases, a higher version
- * driver will not interoperate with older software, and initialization
- * will return an error.
- */
-#define IPATH_USER_SWMAJOR 1
-
-/*
- * Minor version differences are always compatible
- * within a major version; however, if user software is newer
- * than driver software, some new features and/or structure fields
- * may not be implemented; the user code must deal with this if it
- * cares, or it must abort after initialization reports the difference.
- */
-#define IPATH_USER_SWMINOR 6
-
-#define IPATH_USER_SWVERSION ((IPATH_USER_SWMAJOR<<16) | IPATH_USER_SWMINOR)
-
-#define IPATH_KERN_TYPE 0
-
-/*
- * Similarly, this is the kernel version going back to the user.  It's
- * slightly different, in that we want to tell if the driver was built as
- * part of a QLogic release, or is the driver from openfabrics.org,
- * kernel.org, or a standard distribution, for support reasons.
- * The high bit is 0 for non-QLogic and 1 for QLogic-built/supplied.
- *
- * It's returned by the driver to the user code during initialization in the
- * spi_sw_version field of ipath_base_info, so the user code can in turn
- * check for compatibility with the kernel.
-*/
-#define IPATH_KERN_SWVERSION ((IPATH_KERN_TYPE<<31) | IPATH_USER_SWVERSION)
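A user-space library would typically read spi_sw_version out of ipath_base_info and compare it against its own compiled-in constants. The check below is only a sketch of the policy described in the comments above (a matching or newer driver major is required, minor differences are tolerated); it is not the driver's code.

static int example_ipath_version_ok(__u32 spi_sw_version)
{
	/* bit 31 carries the QLogic-build flag, bits 16..30 the major number */
	unsigned int kern_major = (spi_sw_version >> 16) & 0x7fff;

	/* minor differences within a major version are always compatible */
	return kern_major >= IPATH_USER_SWMAJOR;
}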
-
-/*
- * This structure is passed to ipath_userinit() to tell the driver where
- * user code buffers are, sizes, etc.   The offsets and sizes of the
- * fields must remain unchanged, for binary compatibility.  It can
- * be extended if needed, provided userversion is changed so user code can tell
- */
-struct ipath_user_info {
-       /*
-        * version of user software, to detect compatibility issues.
-        * Should be set to IPATH_USER_SWVERSION.
-        */
-       __u32 spu_userversion;
-
-       /* desired number of receive header queue entries */
-       __u32 spu_rcvhdrcnt;
-
-       /* size of struct base_info to write to */
-       __u32 spu_base_info_size;
-
-       /*
-        * number of words in KD protocol header.
-        * This tells InfiniPath how many words to copy to rcvhdrq.  If 0,
-        * kernel uses a default.  Once set, attempts to set any other value
-        * are an error (EAGAIN) until driver is reloaded.
-        */
-       __u32 spu_rcvhdrsize;
-
-       /*
-        * If two or more processes wish to share a port, each process
-        * must set the spu_subport_cnt and spu_subport_id to the same
-        * values.  The only restriction on the spu_subport_id is that
-        * it be unique for a given node.
-        */
-       __u16 spu_subport_cnt;
-       __u16 spu_subport_id;
-
-       __u32 spu_unused; /* kept for compatible layout */
-
-       /*
-        * address of struct base_info to write to
-        */
-       __u64 spu_base_info;
-
-} __attribute__ ((aligned(8)));
-
-/* User commands. */
-
-#define IPATH_CMD_MIN          16
-
-#define __IPATH_CMD_USER_INIT  16      /* old set up userspace (for old user code) */
-#define IPATH_CMD_PORT_INFO    17      /* find out what resources we got */
-#define IPATH_CMD_RECV_CTRL    18      /* control receipt of packets */
-#define IPATH_CMD_TID_UPDATE   19      /* update expected TID entries */
-#define IPATH_CMD_TID_FREE     20      /* free expected TID entries */
-#define IPATH_CMD_SET_PART_KEY 21      /* add partition key */
-#define __IPATH_CMD_SLAVE_INFO 22      /* return info on slave processes (for old user code) */
-#define IPATH_CMD_ASSIGN_PORT  23      /* allocate HCA and port */
-#define IPATH_CMD_USER_INIT    24      /* set up userspace */
-#define IPATH_CMD_UNUSED_1     25
-#define IPATH_CMD_UNUSED_2     26
-#define IPATH_CMD_PIOAVAILUPD  27      /* force an update of PIOAvail reg */
-#define IPATH_CMD_POLL_TYPE    28      /* set the kind of polling we want */
-#define IPATH_CMD_ARMLAUNCH_CTRL       29 /* armlaunch detection control */
-/* 30 is unused */
-#define IPATH_CMD_SDMA_INFLIGHT 31     /* sdma inflight counter request */
-#define IPATH_CMD_SDMA_COMPLETE 32     /* sdma completion counter request */
-
-/*
- * Poll types
- */
-#define IPATH_POLL_TYPE_URGENT  0x01
-#define IPATH_POLL_TYPE_OVERFLOW 0x02
-
-struct ipath_port_info {
-       __u32 num_active;       /* number of active units */
-       __u32 unit;             /* unit (chip) assigned to caller */
-       __u16 port;             /* port on unit assigned to caller */
-       __u16 subport;          /* subport on unit assigned to caller */
-       __u16 num_ports;        /* number of ports available on unit */
-       __u16 num_subports;     /* number of subports opened on port */
-};
-
-struct ipath_tid_info {
-       __u32 tidcnt;
-       /* make structure same size in 32 and 64 bit */
-       __u32 tid__unused;
-       /* virtual address of first page in transfer */
-       __u64 tidvaddr;
-       /* pointer (same size 32/64 bit) to __u16 tid array */
-       __u64 tidlist;
-
-       /*
-        * pointer (same size 32/64 bit) to bitmap of TIDs used
-        * for this call; checked for being large enough at open
-        */
-       __u64 tidmap;
-};
-
-struct ipath_cmd {
-       __u32 type;                     /* command type */
-       union {
-               struct ipath_tid_info tid_info;
-               struct ipath_user_info user_info;
-
-               /*
-                * address in userspace where we should put the sdma
-                * inflight counter
-                */
-               __u64 sdma_inflight;
-               /*
-                * address in userspace where we should put the sdma
-                * completion counter
-                */
-               __u64 sdma_complete;
-               /* address in userspace of struct ipath_port_info to
-                  write result to */
-               __u64 port_info;
-               /* enable/disable receipt of packets */
-               __u32 recv_ctrl;
-               /* enable/disable armlaunch errors (non-zero to enable) */
-               __u32 armlaunch_ctrl;
-               /* partition key to set */
-               __u16 part_key;
-               /* user address of __u32 bitmask of active slaves */
-               __u64 slave_mask_addr;
-               /* type of polling we want */
-               __u16 poll_type;
-       } cmd;
-};
-
-struct ipath_iovec {
-       /* Pointer to data, but same size 32 and 64 bit */
-       __u64 iov_base;
-
-       /*
-        * Length of data; don't need 64 bits, but want
-        * ipath_sendpkt to remain same size as before 32 bit changes, so...
-        */
-       __u64 iov_len;
-};
-
-/*
- * Describes a single packet for send.  Each packet can have one or more
- * buffers, but the total length (exclusive of IB headers) must be less
- * than the MTU, and if using the PIO method, entire packet length,
- * including IB headers, must be less than the ipath_piosize value (words).
- * Use of this necessitates including sys/uio.h
- */
-struct __ipath_sendpkt {
-       __u32 sps_flags;        /* flags for packet (TBD) */
-       __u32 sps_cnt;          /* number of entries to use in sps_iov */
-       /* array of iov's describing packet. TEMPORARY */
-       struct ipath_iovec sps_iov[4];
-};
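A caller describing a packet would fill sps_cnt and the first sps_cnt entries of sps_iov with user-space buffer addresses and lengths. The helper below is hypothetical (hdr/payload and their lengths are placeholders) and only shows how the structure is populated, not how it is submitted to the driver.

static void example_fill_sendpkt(struct __ipath_sendpkt *pkt,
				 void *hdr, __u32 hdr_len,
				 void *payload, __u32 payload_len)
{
	pkt->sps_flags = 0;
	pkt->sps_cnt = 2;                       /* two iovec entries in use */
	pkt->sps_iov[0].iov_base = (__u64)(unsigned long)hdr;
	pkt->sps_iov[0].iov_len = hdr_len;
	pkt->sps_iov[1].iov_base = (__u64)(unsigned long)payload;
	pkt->sps_iov[1].iov_len = payload_len;
}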
-
-/*
- * diagnostics can send a packet by "writing" one of the following
- * two structs to diag data special file
- * The first is the legacy version for backward compatibility
- */
-struct ipath_diag_pkt {
-       __u32 unit;
-       __u64 data;
-       __u32 len;
-};
-
-/* The second diag_pkt struct is the expanded version that allows
- * more control over the packet, specifically, by allowing a custom
- * pbc (+ static rate) qword, so that special modes and deliberate
- * changes to CRCs can be used. The elements were also re-ordered
- * for better alignment and to avoid padding issues.
- */
-struct ipath_diag_xpkt {
-       __u64 data;
-       __u64 pbc_wd;
-       __u32 unit;
-       __u32 len;
-};
-
-/*
- * Data layout in I2C flash (for GUID, etc.)
- * All fields are little-endian binary unless otherwise stated
- */
-#define IPATH_FLASH_VERSION 2
-struct ipath_flash {
-       /* flash layout version (IPATH_FLASH_VERSION) */
-       __u8 if_fversion;
-       /* checksum protecting if_length bytes */
-       __u8 if_csum;
-       /*
-        * valid length (in use, protected by if_csum), including
-        * if_fversion and if_csum themselves
-        */
-       __u8 if_length;
-       /* the GUID, in network order */
-       __u8 if_guid[8];
-       /* number of GUIDs to use, starting from if_guid */
-       __u8 if_numguid;
-       /* the (last 10 characters of) board serial number, in ASCII */
-       char if_serial[12];
-       /* board mfg date (YYYYMMDD ASCII) */
-       char if_mfgdate[8];
-       /* last board rework/test date (YYYYMMDD ASCII) */
-       char if_testdate[8];
-       /* logging of error counts, TBD */
-       __u8 if_errcntp[4];
-       /* powered on hours, updated at driver unload */
-       __u8 if_powerhour[2];
-       /* ASCII free-form comment field */
-       char if_comment[32];
-       /* Backwards compatible prefix for longer QLogic Serial Numbers */
-       char if_sprefix[4];
-       /* 82 bytes used, min flash size is 128 bytes */
-       __u8 if_future[46];
-};
-
-/*
- * These are the counters implemented in the chip, and are listed in order.
- * The InterCaps naming is taken straight from the chip spec.
- */
-struct infinipath_counters {
-       __u64 LBIntCnt;
-       __u64 LBFlowStallCnt;
-       __u64 TxSDmaDescCnt;    /* was Reserved1 */
-       __u64 TxUnsupVLErrCnt;
-       __u64 TxDataPktCnt;
-       __u64 TxFlowPktCnt;
-       __u64 TxDwordCnt;
-       __u64 TxLenErrCnt;
-       __u64 TxMaxMinLenErrCnt;
-       __u64 TxUnderrunCnt;
-       __u64 TxFlowStallCnt;
-       __u64 TxDroppedPktCnt;
-       __u64 RxDroppedPktCnt;
-       __u64 RxDataPktCnt;
-       __u64 RxFlowPktCnt;
-       __u64 RxDwordCnt;
-       __u64 RxLenErrCnt;
-       __u64 RxMaxMinLenErrCnt;
-       __u64 RxICRCErrCnt;
-       __u64 RxVCRCErrCnt;
-       __u64 RxFlowCtrlErrCnt;
-       __u64 RxBadFormatCnt;
-       __u64 RxLinkProblemCnt;
-       __u64 RxEBPCnt;
-       __u64 RxLPCRCErrCnt;
-       __u64 RxBufOvflCnt;
-       __u64 RxTIDFullErrCnt;
-       __u64 RxTIDValidErrCnt;
-       __u64 RxPKeyMismatchCnt;
-       __u64 RxP0HdrEgrOvflCnt;
-       __u64 RxP1HdrEgrOvflCnt;
-       __u64 RxP2HdrEgrOvflCnt;
-       __u64 RxP3HdrEgrOvflCnt;
-       __u64 RxP4HdrEgrOvflCnt;
-       __u64 RxP5HdrEgrOvflCnt;
-       __u64 RxP6HdrEgrOvflCnt;
-       __u64 RxP7HdrEgrOvflCnt;
-       __u64 RxP8HdrEgrOvflCnt;
-       __u64 RxP9HdrEgrOvflCnt;        /* was Reserved6 */
-       __u64 RxP10HdrEgrOvflCnt;       /* was Reserved7 */
-       __u64 RxP11HdrEgrOvflCnt;       /* new for IBA7220 */
-       __u64 RxP12HdrEgrOvflCnt;       /* new for IBA7220 */
-       __u64 RxP13HdrEgrOvflCnt;       /* new for IBA7220 */
-       __u64 RxP14HdrEgrOvflCnt;       /* new for IBA7220 */
-       __u64 RxP15HdrEgrOvflCnt;       /* new for IBA7220 */
-       __u64 RxP16HdrEgrOvflCnt;       /* new for IBA7220 */
-       __u64 IBStatusChangeCnt;
-       __u64 IBLinkErrRecoveryCnt;
-       __u64 IBLinkDownedCnt;
-       __u64 IBSymbolErrCnt;
-       /* The following are new for IBA7220 */
-       __u64 RxVL15DroppedPktCnt;
-       __u64 RxOtherLocalPhyErrCnt;
-       __u64 PcieRetryBufDiagQwordCnt;
-       __u64 ExcessBufferOvflCnt;
-       __u64 LocalLinkIntegrityErrCnt;
-       __u64 RxVlErrCnt;
-       __u64 RxDlidFltrCnt;
-};
-
-/*
- * The next set of defines are for packet headers, and chip register
- * and memory bits that are visible to and/or used by user-mode software
- * The other bits that are used only by the driver or diags are in
- * ipath_registers.h
- */
-
-/* RcvHdrFlags bits */
-#define INFINIPATH_RHF_LENGTH_MASK 0x7FF
-#define INFINIPATH_RHF_LENGTH_SHIFT 0
-#define INFINIPATH_RHF_RCVTYPE_MASK 0x7
-#define INFINIPATH_RHF_RCVTYPE_SHIFT 11
-#define INFINIPATH_RHF_EGRINDEX_MASK 0xFFF
-#define INFINIPATH_RHF_EGRINDEX_SHIFT 16
-#define INFINIPATH_RHF_SEQ_MASK 0xF
-#define INFINIPATH_RHF_SEQ_SHIFT 0
-#define INFINIPATH_RHF_HDRQ_OFFSET_MASK 0x7FF
-#define INFINIPATH_RHF_HDRQ_OFFSET_SHIFT 4
-#define INFINIPATH_RHF_H_ICRCERR   0x80000000
-#define INFINIPATH_RHF_H_VCRCERR   0x40000000
-#define INFINIPATH_RHF_H_PARITYERR 0x20000000
-#define INFINIPATH_RHF_H_LENERR    0x10000000
-#define INFINIPATH_RHF_H_MTUERR    0x08000000
-#define INFINIPATH_RHF_H_IHDRERR   0x04000000
-#define INFINIPATH_RHF_H_TIDERR    0x02000000
-#define INFINIPATH_RHF_H_MKERR     0x01000000
-#define INFINIPATH_RHF_H_IBERR     0x00800000
-#define INFINIPATH_RHF_H_ERR_MASK  0xFF800000
-#define INFINIPATH_RHF_L_USE_EGR   0x80000000
-#define INFINIPATH_RHF_L_SWA       0x00008000
-#define INFINIPATH_RHF_L_SWB       0x00004000
-
-/* infinipath header fields */
-#define INFINIPATH_I_VERS_MASK 0xF
-#define INFINIPATH_I_VERS_SHIFT 28
-#define INFINIPATH_I_PORT_MASK 0xF
-#define INFINIPATH_I_PORT_SHIFT 24
-#define INFINIPATH_I_TID_MASK 0x7FF
-#define INFINIPATH_I_TID_SHIFT 13
-#define INFINIPATH_I_OFFSET_MASK 0x1FFF
-#define INFINIPATH_I_OFFSET_SHIFT 0
-
-/* K_PktFlags bits */
-#define INFINIPATH_KPF_INTR 0x1
-#define INFINIPATH_KPF_SUBPORT_MASK 0x3
-#define INFINIPATH_KPF_SUBPORT_SHIFT 1
-
-#define INFINIPATH_MAX_SUBPORT 4
-
-/* SendPIO per-buffer control */
-#define INFINIPATH_SP_TEST    0x40
-#define INFINIPATH_SP_TESTEBP 0x20
-#define INFINIPATH_SP_TRIGGER_SHIFT  15
-
-/* SendPIOAvail bits */
-#define INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT 1
-#define INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT 0
-
-/* infinipath header format */
-struct ipath_header {
-       /*
-        * Version - 4 bits, Port - 4 bits, TID - 10 bits and Offset -
-        * 14 bits before ECO change ~28 Dec 03.  After that, Vers 4,
-        * Port 4, TID 11, offset 13.
-        */
-       __le32 ver_port_tid_offset;
-       __le16 chksum;
-       __le16 pkt_flags;
-};
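The ver_port_tid_offset word packs the four fields described by the INFINIPATH_I_* masks and shifts earlier in this file (Vers 4 bits, Port 4 bits, TID 11 bits, Offset 13 bits). A hedged sketch of assembling it, assuming the usual kernel byte-order helper is available:

static inline __le32 example_make_ver_port_tid_offset(__u32 vers, __u32 port,
						      __u32 tid, __u32 offset)
{
	__u32 w = ((vers & INFINIPATH_I_VERS_MASK) << INFINIPATH_I_VERS_SHIFT) |
		  ((port & INFINIPATH_I_PORT_MASK) << INFINIPATH_I_PORT_SHIFT) |
		  ((tid & INFINIPATH_I_TID_MASK) << INFINIPATH_I_TID_SHIFT) |
		  ((offset & INFINIPATH_I_OFFSET_MASK) << INFINIPATH_I_OFFSET_SHIFT);

	return cpu_to_le32(w);          /* assumed byte-order helper */
}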
-
-/* infinipath user message header format.
- * This structure contains the first 4 fields common to all protocols
- * that employ infinipath.
- */
-struct ipath_message_header {
-       __be16 lrh[4];
-       __be32 bth[3];
-       /* fields below this point are in host byte order */
-       struct ipath_header iph;
-       __u8 sub_opcode;
-};
-
-/* infinipath ethernet header format */
-struct ether_header {
-       __be16 lrh[4];
-       __be32 bth[3];
-       struct ipath_header iph;
-       __u8 sub_opcode;
-       __u8 cmd;
-       __be16 lid;
-       __u16 mac[3];
-       __u8 frag_num;
-       __u8 seq_num;
-       __le32 len;
-       /* MUST be of word size due to PIO write requirements */
-       __le32 csum;
-       __le16 csum_offset;
-       __le16 flags;
-       __u16 first_2_bytes;
-       __u8 unused[2];         /* currently unused */
-};
-
-
-/* IB - LRH header consts */
-#define IPATH_LRH_GRH 0x0003   /* 1. word of IB LRH - next header: GRH */
-#define IPATH_LRH_BTH 0x0002   /* 1. word of IB LRH - next header: BTH */
-
-/* misc. */
-#define SIZE_OF_CRC 1
-
-#define IPATH_DEFAULT_P_KEY 0xFFFF
-#define IPATH_PERMISSIVE_LID 0xFFFF
-#define IPATH_AETH_CREDIT_SHIFT 24
-#define IPATH_AETH_CREDIT_MASK 0x1F
-#define IPATH_AETH_CREDIT_INVAL 0x1F
-#define IPATH_PSN_MASK 0xFFFFFF
-#define IPATH_MSN_MASK 0xFFFFFF
-#define IPATH_QPN_MASK 0xFFFFFF
-#define IPATH_MULTICAST_LID_BASE 0xC000
-#define IPATH_EAGER_TID_ID INFINIPATH_I_TID_MASK
-#define IPATH_MULTICAST_QPN 0xFFFFFF
-
-/* Receive Header Queue: receive type (from infinipath) */
-#define RCVHQ_RCV_TYPE_EXPECTED  0
-#define RCVHQ_RCV_TYPE_EAGER     1
-#define RCVHQ_RCV_TYPE_NON_KD    2
-#define RCVHQ_RCV_TYPE_ERROR     3
-
-
-/* sub OpCodes - ith4x  */
-#define IPATH_ITH4X_OPCODE_ENCAP 0x81
-#define IPATH_ITH4X_OPCODE_LID_ARP 0x82
-
-#define IPATH_HEADER_QUEUE_WORDS 9
-
-/* functions for extracting fields from rcvhdrq entries for the driver.
- */
-static inline __u32 ipath_hdrget_err_flags(const __le32 * rbuf)
-{
-       return __le32_to_cpu(rbuf[1]) & INFINIPATH_RHF_H_ERR_MASK;
-}
-
-static inline __u32 ipath_hdrget_rcv_type(const __le32 * rbuf)
-{
-       return (__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_RCVTYPE_SHIFT)
-           & INFINIPATH_RHF_RCVTYPE_MASK;
-}
-
-static inline __u32 ipath_hdrget_length_in_bytes(const __le32 * rbuf)
-{
-       return ((__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_LENGTH_SHIFT)
-               & INFINIPATH_RHF_LENGTH_MASK) << 2;
-}
-
-static inline __u32 ipath_hdrget_index(const __le32 * rbuf)
-{
-       return (__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_EGRINDEX_SHIFT)
-           & INFINIPATH_RHF_EGRINDEX_MASK;
-}
-
-static inline __u32 ipath_hdrget_seq(const __le32 *rbuf)
-{
-       return (__le32_to_cpu(rbuf[1]) >> INFINIPATH_RHF_SEQ_SHIFT)
-               & INFINIPATH_RHF_SEQ_MASK;
-}
-
-static inline __u32 ipath_hdrget_offset(const __le32 *rbuf)
-{
-       return (__le32_to_cpu(rbuf[1]) >> INFINIPATH_RHF_HDRQ_OFFSET_SHIFT)
-               & INFINIPATH_RHF_HDRQ_OFFSET_MASK;
-}
-
-static inline __u32 ipath_hdrget_use_egr_buf(const __le32 *rbuf)
-{
-       return __le32_to_cpu(rbuf[0]) & INFINIPATH_RHF_L_USE_EGR;
-}
-
-static inline __u32 ipath_hdrget_ipath_ver(__le32 hdrword)
-{
-       return (__le32_to_cpu(hdrword) >> INFINIPATH_I_VERS_SHIFT)
-           & INFINIPATH_I_VERS_MASK;
-}
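The extraction helpers above are enough to classify a receive-header-queue entry. A small illustrative example (not part of the driver) that accepts only error-free eager receives:

static inline int example_rhf_is_eager_ok(const __le32 *rbuf)
{
	if (ipath_hdrget_err_flags(rbuf))
		return 0;               /* hardware flagged an error */

	return ipath_hdrget_rcv_type(rbuf) == RCVHQ_RCV_TYPE_EAGER &&
	       ipath_hdrget_use_egr_buf(rbuf) != 0;
}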
-
-#endif                         /* _IPATH_COMMON_H */
diff --git a/drivers/infiniband/hw/ipath/ipath_cq.c b/drivers/infiniband/hw/ipath/ipath_cq.c
deleted file mode 100644 (file)
index e9dd911..0000000
+++ /dev/null
@@ -1,483 +0,0 @@
-/*
- * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
- * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/err.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-
-#include "ipath_verbs.h"
-
-/**
- * ipath_cq_enter - add a new entry to the completion queue
- * @cq: completion queue
- * @entry: work completion entry to add
- * @solicited: true if @entry is a solicited entry
- *
- * This may be called with qp->s_lock held.
- */
-void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int solicited)
-{
-       struct ipath_cq_wc *wc;
-       unsigned long flags;
-       u32 head;
-       u32 next;
-
-       spin_lock_irqsave(&cq->lock, flags);
-
-       /*
-        * Note that the head pointer might be writable by user processes.
-        * Take care to verify it is a sane value.
-        */
-       wc = cq->queue;
-       head = wc->head;
-       if (head >= (unsigned) cq->ibcq.cqe) {
-               head = cq->ibcq.cqe;
-               next = 0;
-       } else
-               next = head + 1;
-       if (unlikely(next == wc->tail)) {
-               spin_unlock_irqrestore(&cq->lock, flags);
-               if (cq->ibcq.event_handler) {
-                       struct ib_event ev;
-
-                       ev.device = cq->ibcq.device;
-                       ev.element.cq = &cq->ibcq;
-                       ev.event = IB_EVENT_CQ_ERR;
-                       cq->ibcq.event_handler(&ev, cq->ibcq.cq_context);
-               }
-               return;
-       }
-       if (cq->ip) {
-               wc->uqueue[head].wr_id = entry->wr_id;
-               wc->uqueue[head].status = entry->status;
-               wc->uqueue[head].opcode = entry->opcode;
-               wc->uqueue[head].vendor_err = entry->vendor_err;
-               wc->uqueue[head].byte_len = entry->byte_len;
-               wc->uqueue[head].ex.imm_data = (__u32 __force) entry->ex.imm_data;
-               wc->uqueue[head].qp_num = entry->qp->qp_num;
-               wc->uqueue[head].src_qp = entry->src_qp;
-               wc->uqueue[head].wc_flags = entry->wc_flags;
-               wc->uqueue[head].pkey_index = entry->pkey_index;
-               wc->uqueue[head].slid = entry->slid;
-               wc->uqueue[head].sl = entry->sl;
-               wc->uqueue[head].dlid_path_bits = entry->dlid_path_bits;
-               wc->uqueue[head].port_num = entry->port_num;
-               /* Make sure entry is written before the head index. */
-               smp_wmb();
-       } else
-               wc->kqueue[head] = *entry;
-       wc->head = next;
-
-       if (cq->notify == IB_CQ_NEXT_COMP ||
-           (cq->notify == IB_CQ_SOLICITED && solicited)) {
-               cq->notify = IB_CQ_NONE;
-               cq->triggered++;
-               /*
-                * This will cause send_complete() to be called in
-                * another thread.
-                */
-               tasklet_hi_schedule(&cq->comptask);
-       }
-
-       spin_unlock_irqrestore(&cq->lock, flags);
-
-       if (entry->status != IB_WC_SUCCESS)
-               to_idev(cq->ibcq.device)->n_wqe_errs++;
-}
-
-/**
- * ipath_poll_cq - poll for work completion entries
- * @ibcq: the completion queue to poll
- * @num_entries: the maximum number of entries to return
- * @entry: pointer to array where work completions are placed
- *
- * Returns the number of completion entries polled.
- *
- * This may be called from interrupt context.  Also called by ib_poll_cq()
- * in the generic verbs code.
- */
-int ipath_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
-{
-       struct ipath_cq *cq = to_icq(ibcq);
-       struct ipath_cq_wc *wc;
-       unsigned long flags;
-       int npolled;
-       u32 tail;
-
-       /* The kernel can only poll a kernel completion queue */
-       if (cq->ip) {
-               npolled = -EINVAL;
-               goto bail;
-       }
-
-       spin_lock_irqsave(&cq->lock, flags);
-
-       wc = cq->queue;
-       tail = wc->tail;
-       if (tail > (u32) cq->ibcq.cqe)
-               tail = (u32) cq->ibcq.cqe;
-       for (npolled = 0; npolled < num_entries; ++npolled, ++entry) {
-               if (tail == wc->head)
-                       break;
-               /* The kernel doesn't need an RMB since it has the lock. */
-               *entry = wc->kqueue[tail];
-               if (tail >= cq->ibcq.cqe)
-                       tail = 0;
-               else
-                       tail++;
-       }
-       wc->tail = tail;
-
-       spin_unlock_irqrestore(&cq->lock, flags);
-
-bail:
-       return npolled;
-}
-
-static void send_complete(unsigned long data)
-{
-       struct ipath_cq *cq = (struct ipath_cq *)data;
-
-       /*
-        * The completion handler will most likely rearm the notification
-        * and poll for all pending entries.  If a new completion entry
-        * is added while we are in this routine, tasklet_hi_schedule()
-        * won't call us again until we return so we check triggered to
-        * see if we need to call the handler again.
-        */
-       for (;;) {
-               u8 triggered = cq->triggered;
-
-               cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
-
-               if (cq->triggered == triggered)
-                       return;
-       }
-}
-
-/**
- * ipath_create_cq - create a completion queue
- * @ibdev: the device this completion queue is attached to
- * @attr: creation attributes
- * @context: unused by the InfiniPath driver
- * @udata: unused by the InfiniPath driver
- *
- * Returns a pointer to the completion queue or negative errno values
- * for failure.
- *
- * Called by ib_create_cq() in the generic verbs code.
- */
-struct ib_cq *ipath_create_cq(struct ib_device *ibdev,
-                             const struct ib_cq_init_attr *attr,
-                             struct ib_ucontext *context,
-                             struct ib_udata *udata)
-{
-       int entries = attr->cqe;
-       struct ipath_ibdev *dev = to_idev(ibdev);
-       struct ipath_cq *cq;
-       struct ipath_cq_wc *wc;
-       struct ib_cq *ret;
-       u32 sz;
-
-       if (attr->flags)
-               return ERR_PTR(-EINVAL);
-
-       if (entries < 1 || entries > ib_ipath_max_cqes) {
-               ret = ERR_PTR(-EINVAL);
-               goto done;
-       }
-
-       /* Allocate the completion queue structure. */
-       cq = kmalloc(sizeof(*cq), GFP_KERNEL);
-       if (!cq) {
-               ret = ERR_PTR(-ENOMEM);
-               goto done;
-       }
-
-       /*
-        * Allocate the completion queue entries and head/tail pointers.
-        * This is allocated separately so that it can be resized and
-        * also mapped into user space.
-        * We need to use vmalloc() in order to support mmap and large
-        * numbers of entries.
-        */
-       sz = sizeof(*wc);
-       if (udata && udata->outlen >= sizeof(__u64))
-               sz += sizeof(struct ib_uverbs_wc) * (entries + 1);
-       else
-               sz += sizeof(struct ib_wc) * (entries + 1);
-       wc = vmalloc_user(sz);
-       if (!wc) {
-               ret = ERR_PTR(-ENOMEM);
-               goto bail_cq;
-       }
-
-       /*
-        * Return the address of the WC as the offset to mmap.
-        * See ipath_mmap() for details.
-        */
-       if (udata && udata->outlen >= sizeof(__u64)) {
-               int err;
-
-               cq->ip = ipath_create_mmap_info(dev, sz, context, wc);
-               if (!cq->ip) {
-                       ret = ERR_PTR(-ENOMEM);
-                       goto bail_wc;
-               }
-
-               err = ib_copy_to_udata(udata, &cq->ip->offset,
-                                      sizeof(cq->ip->offset));
-               if (err) {
-                       ret = ERR_PTR(err);
-                       goto bail_ip;
-               }
-       } else
-               cq->ip = NULL;
-
-       spin_lock(&dev->n_cqs_lock);
-       if (dev->n_cqs_allocated == ib_ipath_max_cqs) {
-               spin_unlock(&dev->n_cqs_lock);
-               ret = ERR_PTR(-ENOMEM);
-               goto bail_ip;
-       }
-
-       dev->n_cqs_allocated++;
-       spin_unlock(&dev->n_cqs_lock);
-
-       if (cq->ip) {
-               spin_lock_irq(&dev->pending_lock);
-               list_add(&cq->ip->pending_mmaps, &dev->pending_mmaps);
-               spin_unlock_irq(&dev->pending_lock);
-       }
-
-       /*
-        * ib_create_cq() will initialize cq->ibcq except for cq->ibcq.cqe.
-        * The number of entries must be >= the number requested, or we
-        * return an error.
-        */
-       cq->ibcq.cqe = entries;
-       cq->notify = IB_CQ_NONE;
-       cq->triggered = 0;
-       spin_lock_init(&cq->lock);
-       tasklet_init(&cq->comptask, send_complete, (unsigned long)cq);
-       wc->head = 0;
-       wc->tail = 0;
-       cq->queue = wc;
-
-       ret = &cq->ibcq;
-
-       goto done;
-
-bail_ip:
-       kfree(cq->ip);
-bail_wc:
-       vfree(wc);
-bail_cq:
-       kfree(cq);
-done:
-       return ret;
-}
-
-/**
- * ipath_destroy_cq - destroy a completion queue
- * @ibcq: the completion queue to destroy.
- *
- * Returns 0 for success.
- *
- * Called by ib_destroy_cq() in the generic verbs code.
- */
-int ipath_destroy_cq(struct ib_cq *ibcq)
-{
-       struct ipath_ibdev *dev = to_idev(ibcq->device);
-       struct ipath_cq *cq = to_icq(ibcq);
-
-       tasklet_kill(&cq->comptask);
-       spin_lock(&dev->n_cqs_lock);
-       dev->n_cqs_allocated--;
-       spin_unlock(&dev->n_cqs_lock);
-       if (cq->ip)
-               kref_put(&cq->ip->ref, ipath_release_mmap_info);
-       else
-               vfree(cq->queue);
-       kfree(cq);
-
-       return 0;
-}
-
-/**
- * ipath_req_notify_cq - change the notification type for a completion queue
- * @ibcq: the completion queue
- * @notify_flags: the type of notification to request
- *
- * Returns 0 for success.
- *
- * This may be called from interrupt context.  Also called by
- * ib_req_notify_cq() in the generic verbs code.
- */
-int ipath_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags)
-{
-       struct ipath_cq *cq = to_icq(ibcq);
-       unsigned long flags;
-       int ret = 0;
-
-       spin_lock_irqsave(&cq->lock, flags);
-       /*
-        * Don't change IB_CQ_NEXT_COMP to IB_CQ_SOLICITED but allow
-        * any other transitions (see C11-31 and C11-32 in ch. 11.4.2.2).
-        */
-       if (cq->notify != IB_CQ_NEXT_COMP)
-               cq->notify = notify_flags & IB_CQ_SOLICITED_MASK;
-
-       if ((notify_flags & IB_CQ_REPORT_MISSED_EVENTS) &&
-           cq->queue->head != cq->queue->tail)
-               ret = 1;
-
-       spin_unlock_irqrestore(&cq->lock, flags);
-
-       return ret;
-}
-
-/**
- * ipath_resize_cq - change the size of the CQ
- * @ibcq: the completion queue
- * @cqe: the new number of entries to support
- * @udata: user data (used to return the new mmap offset)
- *
- * Returns 0 for success.
- */
-int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
-{
-       struct ipath_cq *cq = to_icq(ibcq);
-       struct ipath_cq_wc *old_wc;
-       struct ipath_cq_wc *wc;
-       u32 head, tail, n;
-       int ret;
-       u32 sz;
-
-       if (cqe < 1 || cqe > ib_ipath_max_cqes) {
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       /*
-        * Need to use vmalloc() if we want to support large #s of entries.
-        */
-       sz = sizeof(*wc);
-       if (udata && udata->outlen >= sizeof(__u64))
-               sz += sizeof(struct ib_uverbs_wc) * (cqe + 1);
-       else
-               sz += sizeof(struct ib_wc) * (cqe + 1);
-       wc = vmalloc_user(sz);
-       if (!wc) {
-               ret = -ENOMEM;
-               goto bail;
-       }
-
-       /* Check that we can write the offset to mmap. */
-       if (udata && udata->outlen >= sizeof(__u64)) {
-               __u64 offset = 0;
-
-               ret = ib_copy_to_udata(udata, &offset, sizeof(offset));
-               if (ret)
-                       goto bail_free;
-       }
-
-       spin_lock_irq(&cq->lock);
-       /*
-        * Make sure head and tail are sane since they
-        * might be user writable.
-        */
-       old_wc = cq->queue;
-       head = old_wc->head;
-       if (head > (u32) cq->ibcq.cqe)
-               head = (u32) cq->ibcq.cqe;
-       tail = old_wc->tail;
-       if (tail > (u32) cq->ibcq.cqe)
-               tail = (u32) cq->ibcq.cqe;
-       if (head < tail)
-               n = cq->ibcq.cqe + 1 + head - tail;
-       else
-               n = head - tail;
-       if (unlikely((u32)cqe < n)) {
-               ret = -EINVAL;
-               goto bail_unlock;
-       }
-       for (n = 0; tail != head; n++) {
-               if (cq->ip)
-                       wc->uqueue[n] = old_wc->uqueue[tail];
-               else
-                       wc->kqueue[n] = old_wc->kqueue[tail];
-               if (tail == (u32) cq->ibcq.cqe)
-                       tail = 0;
-               else
-                       tail++;
-       }
-       cq->ibcq.cqe = cqe;
-       wc->head = n;
-       wc->tail = 0;
-       cq->queue = wc;
-       spin_unlock_irq(&cq->lock);
-
-       vfree(old_wc);
-
-       if (cq->ip) {
-               struct ipath_ibdev *dev = to_idev(ibcq->device);
-               struct ipath_mmap_info *ip = cq->ip;
-
-               ipath_update_mmap_info(dev, ip, sz, wc);
-
-               /*
-                * Return the offset to mmap.
-                * See ipath_mmap() for details.
-                */
-               if (udata && udata->outlen >= sizeof(__u64)) {
-                       ret = ib_copy_to_udata(udata, &ip->offset,
-                                              sizeof(ip->offset));
-                       if (ret)
-                               goto bail;
-               }
-
-               spin_lock_irq(&dev->pending_lock);
-               if (list_empty(&ip->pending_mmaps))
-                       list_add(&ip->pending_mmaps, &dev->pending_mmaps);
-               spin_unlock_irq(&dev->pending_lock);
-       }
-
-       ret = 0;
-       goto bail;
-
-bail_unlock:
-       spin_unlock_irq(&cq->lock);
-bail_free:
-       vfree(wc);
-bail:
-       return ret;
-}
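
The completion queue implementation removed above is a plain ring buffer: the entry array holds cqe + 1 slots, the producer refuses an entry when advancing head would land on tail, and both indices wrap to zero once they pass cqe. Here is a self-contained userspace sketch of just that index arithmetic, assuming a fixed slot count and int payloads instead of struct ib_wc entries; names like ring_push()/ring_pop() are invented for the example.

#include <stdbool.h>
#include <stdio.h>

#define CQE 4                   /* usable entries; the array holds CQE + 1 slots */

struct ring {
        unsigned int head;      /* next slot to fill */
        unsigned int tail;      /* next slot to drain */
        int slot[CQE + 1];
};

/* Mirrors the enter side: compute next, refuse if it would hit tail. */
static bool ring_push(struct ring *r, int v)
{
        unsigned int next = (r->head >= CQE) ? 0 : r->head + 1;

        if (next == r->tail)    /* overflow: ipath_cq_enter raises IB_EVENT_CQ_ERR here */
                return false;
        r->slot[r->head] = v;
        r->head = next;
        return true;
}

/* Mirrors the poll side: empty once tail catches up with head. */
static bool ring_pop(struct ring *r, int *v)
{
        if (r->tail == r->head)
                return false;
        *v = r->slot[r->tail];
        r->tail = (r->tail >= CQE) ? 0 : r->tail + 1;
        return true;
}

int main(void)
{
        struct ring r = { 0 };
        int v, i;

        for (i = 0; i < 6; i++)
                printf("push %d -> %s\n", i, ring_push(&r, i) ? "ok" : "full");
        while (ring_pop(&r, &v))
                printf("pop %d\n", v);
        return 0;
}
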
diff --git a/drivers/infiniband/hw/ipath/ipath_debug.h b/drivers/infiniband/hw/ipath/ipath_debug.h
deleted file mode 100644 (file)
index 65926cd..0000000
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
- * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef _IPATH_DEBUG_H
-#define _IPATH_DEBUG_H
-
-#ifndef _IPATH_DEBUGGING       /* debugging enabled or not */
-#define _IPATH_DEBUGGING 1
-#endif
-
-#if _IPATH_DEBUGGING
-
-/*
- * Mask values for debugging.  The scheme allows us to compile out any
- * of the debug tracing stuff, and if compiled in, to enable or disable
- * dynamically.  This can be set at modprobe time also:
- *      modprobe infinipath.ko infinipath_debug=7
- */
-
-#define __IPATH_INFO        0x1        /* generic low verbosity stuff */
-#define __IPATH_DBG         0x2        /* generic debug */
-#define __IPATH_TRSAMPLE    0x8        /* generate trace buffer sample entries */
-/* leave some low verbosity spots open */
-#define __IPATH_VERBDBG     0x40       /* very verbose debug */
-#define __IPATH_PKTDBG      0x80       /* print packet data */
-/* print process startup (init)/exit messages */
-#define __IPATH_PROCDBG     0x100
-/* print mmap/fault stuff, not using VDBG any more */
-#define __IPATH_MMDBG       0x200
-#define __IPATH_ERRPKTDBG   0x400
-#define __IPATH_USER_SEND   0x1000     /* use user mode send */
-#define __IPATH_KERNEL_SEND 0x2000     /* use kernel mode send */
-#define __IPATH_EPKTDBG     0x4000     /* print ethernet packet data */
-#define __IPATH_IPATHDBG    0x10000    /* Ethernet (IPATH) gen debug */
-#define __IPATH_IPATHWARN   0x20000    /* Ethernet (IPATH) warnings */
-#define __IPATH_IPATHERR    0x40000    /* Ethernet (IPATH) errors */
-#define __IPATH_IPATHPD     0x80000    /* Ethernet (IPATH) packet dump */
-#define __IPATH_IPATHTABLE  0x100000   /* Ethernet (IPATH) table dump */
-#define __IPATH_LINKVERBDBG 0x200000   /* very verbose linkchange debug */
-
-#else                          /* _IPATH_DEBUGGING */
-
-/*
- * define all of these even with debugging off, for the few places that do
- * if(infinipath_debug & _IPATH_xyzzy), but in a way that will make the
- * compiler eliminate the code
- */
-
-#define __IPATH_INFO      0x0  /* generic low verbosity stuff */
-#define __IPATH_DBG       0x0  /* generic debug */
-#define __IPATH_TRSAMPLE  0x0  /* generate trace buffer sample entries */
-#define __IPATH_VERBDBG   0x0  /* very verbose debug */
-#define __IPATH_PKTDBG    0x0  /* print packet data */
-#define __IPATH_PROCDBG   0x0  /* process startup (init)/exit messages */
-/* print mmap/fault stuff, not using VDBG any more */
-#define __IPATH_MMDBG     0x0
-#define __IPATH_EPKTDBG   0x0  /* print ethernet packet data */
-#define __IPATH_IPATHDBG  0x0  /* Ethernet (IPATH) gen debug on */
-#define __IPATH_IPATHWARN 0x0  /* Ethernet (IPATH) warnings on   */
-#define __IPATH_IPATHERR  0x0  /* Ethernet (IPATH) errors on   */
-#define __IPATH_IPATHPD   0x0  /* Ethernet (IPATH) packet dump on   */
-#define __IPATH_IPATHTABLE 0x0 /* Ethernet (IPATH) table dump on   */
-#define __IPATH_LINKVERBDBG 0x0        /* very verbose linkchange debug */
-
-#endif                         /* _IPATH_DEBUGGING */
-
-#define __IPATH_VERBOSEDBG __IPATH_VERBDBG
-
-#endif                         /* _IPATH_DEBUG_H */
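
The header deleted above implements debug tracing as a bitmask: each message class owns one bit, the whole mask is a module parameter, and the no-debug build defines every bit as 0 so the compiler can discard the call sites. A small userspace sketch of the same gating idea follows, assuming a plain global in place of the module parameter and fprintf() in place of the driver's ipath_cdbg() machinery; the DBG_* names are invented for the example.

#include <stdio.h>

/* One bit per message class, mirroring the __IPATH_* mask layout. */
#define DBG_INFO  0x1
#define DBG_DBG   0x2
#define DBG_PKT   0x80

/* In the driver this is the ipath_debug module parameter. */
static unsigned int debug_mask = DBG_INFO | DBG_PKT;

/* Call sites test the mask; with every bit defined as 0 the branch is dead code. */
#define dbg_printf(bit, ...)                                    \
        do {                                                    \
                if (debug_mask & (bit))                         \
                        fprintf(stderr, __VA_ARGS__);           \
        } while (0)

int main(void)
{
        dbg_printf(DBG_INFO, "info messages are enabled\n");
        dbg_printf(DBG_DBG, "this one is masked off\n");
        dbg_printf(DBG_PKT, "packet-level messages are enabled\n");
        return 0;
}
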
diff --git a/drivers/infiniband/hw/ipath/ipath_diag.c b/drivers/infiniband/hw/ipath/ipath_diag.c
deleted file mode 100644 (file)
index 45802e9..0000000
+++ /dev/null
@@ -1,551 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/*
- * This file contains support for diagnostic functions.  It is accessed by
- * opening the ipath_diag device, normally minor number 129.  Diagnostic use
- * of the InfiniPath chip may render the chip or board unusable until the
- * driver is unloaded, or in some cases, until the system is rebooted.
- *
- * Accesses to the chip through this interface are not similar to going
- * through the /sys/bus/pci resource mmap interface.
- */
-
-#include <linux/io.h>
-#include <linux/pci.h>
-#include <linux/vmalloc.h>
-#include <linux/fs.h>
-#include <linux/export.h>
-#include <asm/uaccess.h>
-
-#include "ipath_kernel.h"
-#include "ipath_common.h"
-
-int ipath_diag_inuse;
-static int diag_set_link;
-
-static int ipath_diag_open(struct inode *in, struct file *fp);
-static int ipath_diag_release(struct inode *in, struct file *fp);
-static ssize_t ipath_diag_read(struct file *fp, char __user *data,
-                              size_t count, loff_t *off);
-static ssize_t ipath_diag_write(struct file *fp, const char __user *data,
-                               size_t count, loff_t *off);
-
-static const struct file_operations diag_file_ops = {
-       .owner = THIS_MODULE,
-       .write = ipath_diag_write,
-       .read = ipath_diag_read,
-       .open = ipath_diag_open,
-       .release = ipath_diag_release,
-       .llseek = default_llseek,
-};
-
-static ssize_t ipath_diagpkt_write(struct file *fp,
-                                  const char __user *data,
-                                  size_t count, loff_t *off);
-
-static const struct file_operations diagpkt_file_ops = {
-       .owner = THIS_MODULE,
-       .write = ipath_diagpkt_write,
-       .llseek = noop_llseek,
-};
-
-static atomic_t diagpkt_count = ATOMIC_INIT(0);
-static struct cdev *diagpkt_cdev;
-static struct device *diagpkt_dev;
-
-int ipath_diag_add(struct ipath_devdata *dd)
-{
-       char name[16];
-       int ret = 0;
-
-       if (atomic_inc_return(&diagpkt_count) == 1) {
-               ret = ipath_cdev_init(IPATH_DIAGPKT_MINOR,
-                                     "ipath_diagpkt", &diagpkt_file_ops,
-                                     &diagpkt_cdev, &diagpkt_dev);
-
-               if (ret) {
-                       ipath_dev_err(dd, "Couldn't create ipath_diagpkt "
-                                     "device: %d", ret);
-                       goto done;
-               }
-       }
-
-       snprintf(name, sizeof(name), "ipath_diag%d", dd->ipath_unit);
-
-       ret = ipath_cdev_init(IPATH_DIAG_MINOR_BASE + dd->ipath_unit, name,
-                             &diag_file_ops, &dd->diag_cdev,
-                             &dd->diag_dev);
-       if (ret)
-               ipath_dev_err(dd, "Couldn't create %s device: %d",
-                             name, ret);
-
-done:
-       return ret;
-}
-
-void ipath_diag_remove(struct ipath_devdata *dd)
-{
-       if (atomic_dec_and_test(&diagpkt_count))
-               ipath_cdev_cleanup(&diagpkt_cdev, &diagpkt_dev);
-
-       ipath_cdev_cleanup(&dd->diag_cdev, &dd->diag_dev);
-}
-
-/**
- * ipath_read_umem64 - read a 64-bit quantity from the chip into user space
- * @dd: the infinipath device
- * @uaddr: the location to store the data in user memory
- * @caddr: the source chip address (full pointer, not offset)
- * @count: number of bytes to copy (multiple of 32 bits)
- *
- * This function also localizes all chip memory accesses.
- * The copy should be written such that we read full cacheline packets
- * from the chip.  This is usually used for a single qword
- *
- * NOTE:  This assumes the chip address is 64-bit aligned.
- */
-static int ipath_read_umem64(struct ipath_devdata *dd, void __user *uaddr,
-                            const void __iomem *caddr, size_t count)
-{
-       const u64 __iomem *reg_addr = caddr;
-       const u64 __iomem *reg_end = reg_addr + (count / sizeof(u64));
-       int ret;
-
-       /* not very efficient, but it works for now */
-       if (reg_addr < dd->ipath_kregbase || reg_end > dd->ipath_kregend) {
-               ret = -EINVAL;
-               goto bail;
-       }
-       while (reg_addr < reg_end) {
-               u64 data = readq(reg_addr);
-               if (copy_to_user(uaddr, &data, sizeof(u64))) {
-                       ret = -EFAULT;
-                       goto bail;
-               }
-               reg_addr++;
-               uaddr += sizeof(u64);
-       }
-       ret = 0;
-bail:
-       return ret;
-}
-
-/**
- * ipath_write_umem64 - write a 64-bit quantity to the chip from user space
- * @dd: the infinipath device
- * @caddr: the destination chip address (full pointer, not offset)
- * @uaddr: the source of the data in user memory
- * @count: the number of bytes to copy (multiple of 32 bits)
- *
- * This is usually used for a single qword
- * NOTE:  This assumes the chip address is 64-bit aligned.
- */
-
-static int ipath_write_umem64(struct ipath_devdata *dd, void __iomem *caddr,
-                             const void __user *uaddr, size_t count)
-{
-       u64 __iomem *reg_addr = caddr;
-       const u64 __iomem *reg_end = reg_addr + (count / sizeof(u64));
-       int ret;
-
-       /* not very efficient, but it works for now */
-       if (reg_addr < dd->ipath_kregbase || reg_end > dd->ipath_kregend) {
-               ret = -EINVAL;
-               goto bail;
-       }
-       while (reg_addr < reg_end) {
-               u64 data;
-               if (copy_from_user(&data, uaddr, sizeof(data))) {
-                       ret = -EFAULT;
-                       goto bail;
-               }
-               writeq(data, reg_addr);
-
-               reg_addr++;
-               uaddr += sizeof(u64);
-       }
-       ret = 0;
-bail:
-       return ret;
-}
-
-/**
- * ipath_read_umem32 - read a 32-bit quantity from the chip into user space
- * @dd: the infinipath device
- * @uaddr: the location to store the data in user memory
- * @caddr: the source chip address (full pointer, not offset)
- * @count: number of bytes to copy
- *
- * read 32 bit values, not 64 bit; for memories that only
- * support 32 bit reads; usually a single dword.
- */
-static int ipath_read_umem32(struct ipath_devdata *dd, void __user *uaddr,
-                            const void __iomem *caddr, size_t count)
-{
-       const u32 __iomem *reg_addr = caddr;
-       const u32 __iomem *reg_end = reg_addr + (count / sizeof(u32));
-       int ret;
-
-       if (reg_addr < (u32 __iomem *) dd->ipath_kregbase ||
-           reg_end > (u32 __iomem *) dd->ipath_kregend) {
-               ret = -EINVAL;
-               goto bail;
-       }
-       /* not very efficient, but it works for now */
-       while (reg_addr < reg_end) {
-               u32 data = readl(reg_addr);
-               if (copy_to_user(uaddr, &data, sizeof(data))) {
-                       ret = -EFAULT;
-                       goto bail;
-               }
-
-               reg_addr++;
-               uaddr += sizeof(u32);
-
-       }
-       ret = 0;
-bail:
-       return ret;
-}
-
-/**
- * ipath_write_umem32 - write a 32-bit quantity to the chip from user space
- * @dd: the infinipath device
- * @caddr: the destination chip address (full pointer, not offset)
- * @uaddr: the source of the data in user memory
- * @count: number of bytes to copy
- *
- * write 32 bit values, not 64 bit; for memories that only
- * support 32 bit write; usually a single dword.
- */
-
-static int ipath_write_umem32(struct ipath_devdata *dd, void __iomem *caddr,
-                             const void __user *uaddr, size_t count)
-{
-       u32 __iomem *reg_addr = caddr;
-       const u32 __iomem *reg_end = reg_addr + (count / sizeof(u32));
-       int ret;
-
-       if (reg_addr < (u32 __iomem *) dd->ipath_kregbase ||
-           reg_end > (u32 __iomem *) dd->ipath_kregend) {
-               ret = -EINVAL;
-               goto bail;
-       }
-       while (reg_addr < reg_end) {
-               u32 data;
-               if (copy_from_user(&data, uaddr, sizeof(data))) {
-                       ret = -EFAULT;
-                       goto bail;
-               }
-               writel(data, reg_addr);
-
-               reg_addr++;
-               uaddr += sizeof(u32);
-       }
-       ret = 0;
-bail:
-       return ret;
-}
-
-static int ipath_diag_open(struct inode *in, struct file *fp)
-{
-       int unit = iminor(in) - IPATH_DIAG_MINOR_BASE;
-       struct ipath_devdata *dd;
-       int ret;
-
-       mutex_lock(&ipath_mutex);
-
-       if (ipath_diag_inuse) {
-               ret = -EBUSY;
-               goto bail;
-       }
-
-       dd = ipath_lookup(unit);
-
-       if (dd == NULL || !(dd->ipath_flags & IPATH_PRESENT) ||
-           !dd->ipath_kregbase) {
-               ret = -ENODEV;
-               goto bail;
-       }
-
-       fp->private_data = dd;
-       ipath_diag_inuse = -2;
-       diag_set_link = 0;
-       ret = 0;
-
-       /* Only expose a way to reset the device if we
-          make it into diag mode. */
-       ipath_expose_reset(&dd->pcidev->dev);
-
-bail:
-       mutex_unlock(&ipath_mutex);
-
-       return ret;
-}
-
-/**
- * ipath_diagpkt_write - write an IB packet
- * @fp: the diag data device file pointer
- * @data: ipath_diag_pkt structure saying where to get the packet
- * @count: size of data to write
- * @off: unused by this code
- */
-static ssize_t ipath_diagpkt_write(struct file *fp,
-                                  const char __user *data,
-                                  size_t count, loff_t *off)
-{
-       u32 __iomem *piobuf;
-       u32 plen, pbufn, maxlen_reserve;
-       struct ipath_diag_pkt odp;
-       struct ipath_diag_xpkt dp;
-       u32 *tmpbuf = NULL;
-       struct ipath_devdata *dd;
-       ssize_t ret = 0;
-       u64 val;
-       u32 l_state, lt_state; /* LinkState, LinkTrainingState */
-
-
-       if (count == sizeof(dp)) {
-               if (copy_from_user(&dp, data, sizeof(dp))) {
-                       ret = -EFAULT;
-                       goto bail;
-               }
-       } else if (count == sizeof(odp)) {
-               if (copy_from_user(&odp, data, sizeof(odp))) {
-                       ret = -EFAULT;
-                       goto bail;
-               }
-               dp.len = odp.len;
-               dp.unit = odp.unit;
-               dp.data = odp.data;
-               dp.pbc_wd = 0;
-       } else {
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       /* send count must be an exact number of dwords */
-       if (dp.len & 3) {
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       plen = dp.len >> 2;
-
-       dd = ipath_lookup(dp.unit);
-       if (!dd || !(dd->ipath_flags & IPATH_PRESENT) ||
-           !dd->ipath_kregbase) {
-               ipath_cdbg(VERBOSE, "illegal unit %u for diag data send\n",
-                          dp.unit);
-               ret = -ENODEV;
-               goto bail;
-       }
-
-       if (ipath_diag_inuse && !diag_set_link &&
-           !(dd->ipath_flags & IPATH_LINKACTIVE)) {
-               diag_set_link = 1;
-               ipath_cdbg(VERBOSE, "Trying to set link active for "
-                          "diag pkt\n");
-               ipath_set_linkstate(dd, IPATH_IB_LINKARM);
-               ipath_set_linkstate(dd, IPATH_IB_LINKACTIVE);
-       }
-
-       if (!(dd->ipath_flags & IPATH_INITTED)) {
-               /* no hardware, freeze, etc. */
-               ipath_cdbg(VERBOSE, "unit %u not usable\n", dd->ipath_unit);
-               ret = -ENODEV;
-               goto bail;
-       }
-       /*
-        * Want to skip check for l_state if using custom PBC,
-        * because we might be trying to force an SM packet out.
-        * As a first cut, skip _all_ state checking in that case.
-        */
-       val = ipath_ib_state(dd, dd->ipath_lastibcstat);
-       lt_state = ipath_ib_linktrstate(dd, dd->ipath_lastibcstat);
-       l_state = ipath_ib_linkstate(dd, dd->ipath_lastibcstat);
-       if (!dp.pbc_wd && (lt_state != INFINIPATH_IBCS_LT_STATE_LINKUP ||
-           (val != dd->ib_init && val != dd->ib_arm &&
-           val != dd->ib_active))) {
-               ipath_cdbg(VERBOSE, "unit %u not ready (state %llx)\n",
-                          dd->ipath_unit, (unsigned long long) val);
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       /*
-        * need total length before first word written, plus 2 Dwords. One Dword
-        * is for padding so we get the full user data when not aligned on
-        * a word boundary. The other Dword is to make sure we have room for the
-        * ICRC which gets tacked on later.
-        */
-       maxlen_reserve = 2 * sizeof(u32);
-       if (dp.len > dd->ipath_ibmaxlen - maxlen_reserve) {
-               ipath_dbg("Pkt len 0x%x > ibmaxlen %x\n",
-                         dp.len, dd->ipath_ibmaxlen);
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       plen = sizeof(u32) + dp.len;
-
-       tmpbuf = vmalloc(plen);
-       if (!tmpbuf) {
-               dev_info(&dd->pcidev->dev, "Unable to allocate tmp buffer, "
-                        "failing\n");
-               ret = -ENOMEM;
-               goto bail;
-       }
-
-       if (copy_from_user(tmpbuf,
-                          (const void __user *) (unsigned long) dp.data,
-                          dp.len)) {
-               ret = -EFAULT;
-               goto bail;
-       }
-
-       plen >>= 2;             /* in dwords */
-
-       piobuf = ipath_getpiobuf(dd, plen, &pbufn);
-       if (!piobuf) {
-               ipath_cdbg(VERBOSE, "No PIO buffers avail for unit %u\n",
-                          dd->ipath_unit);
-               ret = -EBUSY;
-               goto bail;
-       }
-       /* disarm it just to be extra sure */
-       ipath_disarm_piobufs(dd, pbufn, 1);
-
-       if (ipath_debug & __IPATH_PKTDBG)
-               ipath_cdbg(VERBOSE, "unit %u 0x%x+1w pio%d\n",
-                          dd->ipath_unit, plen - 1, pbufn);
-
-       if (dp.pbc_wd == 0)
-               dp.pbc_wd = plen;
-       writeq(dp.pbc_wd, piobuf);
-       /*
-        * Copy all but the trigger word, then flush, so it's written
-        * to chip before trigger word, then write trigger word, then
-        * flush again, so packet is sent.
-        */
-       if (dd->ipath_flags & IPATH_PIO_FLUSH_WC) {
-               ipath_flush_wc();
-               __iowrite32_copy(piobuf + 2, tmpbuf, plen - 1);
-               ipath_flush_wc();
-               __raw_writel(tmpbuf[plen - 1], piobuf + plen + 1);
-       } else
-               __iowrite32_copy(piobuf + 2, tmpbuf, plen);
-
-       ipath_flush_wc();
-
-       ret = sizeof(dp);
-
-bail:
-       vfree(tmpbuf);
-       return ret;
-}
-
-static int ipath_diag_release(struct inode *in, struct file *fp)
-{
-       mutex_lock(&ipath_mutex);
-       ipath_diag_inuse = 0;
-       fp->private_data = NULL;
-       mutex_unlock(&ipath_mutex);
-       return 0;
-}
-
-static ssize_t ipath_diag_read(struct file *fp, char __user *data,
-                              size_t count, loff_t *off)
-{
-       struct ipath_devdata *dd = fp->private_data;
-       void __iomem *kreg_base;
-       ssize_t ret;
-
-       kreg_base = dd->ipath_kregbase;
-
-       if (count == 0)
-               ret = 0;
-       else if ((count % 4) || (*off % 4))
-               /* address or length is not 32-bit aligned, hence invalid */
-               ret = -EINVAL;
-       else if (ipath_diag_inuse < 1 && (*off || count != 8))
-               ret = -EINVAL;  /* prevent cat /dev/ipath_diag* */
-       else if ((count % 8) || (*off % 8))
-               /* address or length not 64-bit aligned; do 32-bit reads */
-               ret = ipath_read_umem32(dd, data, kreg_base + *off, count);
-       else
-               ret = ipath_read_umem64(dd, data, kreg_base + *off, count);
-
-       if (ret >= 0) {
-               *off += count;
-               ret = count;
-               if (ipath_diag_inuse == -2)
-                       ipath_diag_inuse++;
-       }
-
-       return ret;
-}
-
-static ssize_t ipath_diag_write(struct file *fp, const char __user *data,
-                               size_t count, loff_t *off)
-{
-       struct ipath_devdata *dd = fp->private_data;
-       void __iomem *kreg_base;
-       ssize_t ret;
-
-       kreg_base = dd->ipath_kregbase;
-
-       if (count == 0)
-               ret = 0;
-       else if ((count % 4) || (*off % 4))
-               /* address or length is not 32-bit aligned, hence invalid */
-               ret = -EINVAL;
-       else if ((ipath_diag_inuse == -1 && (*off || count != 8)) ||
-                ipath_diag_inuse == -2)  /* read qw off 0, write qw off 0 */
-               ret = -EINVAL;  /* before any other write allowed */
-       else if ((count % 8) || (*off % 8))
-               /* address or length not 64-bit aligned; do 32-bit writes */
-               ret = ipath_write_umem32(dd, kreg_base + *off, data, count);
-       else
-               ret = ipath_write_umem64(dd, kreg_base + *off, data, count);
-
-       if (ret >= 0) {
-               *off += count;
-               ret = count;
-               if (ipath_diag_inuse == -1)
-                       ipath_diag_inuse = 1; /* all read/write OK now */
-       }
-
-       return ret;
-}
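
ipath_diag_read() and ipath_diag_write() above choose a copy width from alignment: offsets or byte counts that are not multiples of 8 fall back to 32-bit chip accesses, and anything not a multiple of 4 is rejected outright. The sketch below reproduces that dispatch in plain userspace C, with memcpy() standing in for the readl()/readq() plus copy_to_user() pairs; the chip[] buffer and the diag_read_sketch() name are invented for the example.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>

static unsigned char chip[64];          /* stand-in for the mapped register space */

/* Copy 'count' bytes at 'off' out of chip[], width chosen like ipath_diag_read(). */
static ssize_t diag_read_sketch(void *dst, size_t off, size_t count)
{
        size_t done;

        if (count == 0)
                return 0;
        if ((count % 4) || (off % 4))
                return -EINVAL;         /* not even 32-bit aligned: reject */

        if ((count % 8) || (off % 8)) {
                /* 32-bit path: one dword at a time (readl() in the driver). */
                for (done = 0; done < count; done += 4)
                        memcpy((char *)dst + done, chip + off + done, 4);
        } else {
                /* 64-bit path: one qword at a time (readq() in the driver). */
                for (done = 0; done < count; done += 8)
                        memcpy((char *)dst + done, chip + off + done, 8);
        }
        return (ssize_t)count;
}

int main(void)
{
        uint64_t out[2];
        size_t i;

        for (i = 0; i < sizeof(chip); i++)
                chip[i] = (unsigned char)i;

        printf("read(0, 16) -> %zd (64-bit path)\n", diag_read_sketch(out, 0, 16));
        printf("read(4, 8)  -> %zd (32-bit path)\n", diag_read_sketch(out, 4, 8));
        printf("read(2, 8)  -> %zd (rejected)\n", diag_read_sketch(out, 2, 8));
        return 0;
}
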
diff --git a/drivers/infiniband/hw/ipath/ipath_dma.c b/drivers/infiniband/hw/ipath/ipath_dma.c
deleted file mode 100644 (file)
index 123a8c0..0000000
+++ /dev/null
@@ -1,179 +0,0 @@
-/*
- * Copyright (c) 2006 QLogic, Corporation. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/scatterlist.h>
-#include <linux/gfp.h>
-#include <rdma/ib_verbs.h>
-
-#include "ipath_verbs.h"
-
-#define BAD_DMA_ADDRESS ((u64) 0)
-
-/*
- * The following functions implement driver specific replacements
- * for the ib_dma_*() functions.
- *
- * These functions return kernel virtual addresses instead of
- * device bus addresses since the driver uses the CPU to copy
- * data instead of using hardware DMA.
- */
-
-static int ipath_mapping_error(struct ib_device *dev, u64 dma_addr)
-{
-       return dma_addr == BAD_DMA_ADDRESS;
-}
-
-static u64 ipath_dma_map_single(struct ib_device *dev,
-                               void *cpu_addr, size_t size,
-                               enum dma_data_direction direction)
-{
-       BUG_ON(!valid_dma_direction(direction));
-       return (u64) cpu_addr;
-}
-
-static void ipath_dma_unmap_single(struct ib_device *dev,
-                                  u64 addr, size_t size,
-                                  enum dma_data_direction direction)
-{
-       BUG_ON(!valid_dma_direction(direction));
-}
-
-static u64 ipath_dma_map_page(struct ib_device *dev,
-                             struct page *page,
-                             unsigned long offset,
-                             size_t size,
-                             enum dma_data_direction direction)
-{
-       u64 addr;
-
-       BUG_ON(!valid_dma_direction(direction));
-
-       if (offset + size > PAGE_SIZE) {
-               addr = BAD_DMA_ADDRESS;
-               goto done;
-       }
-
-       addr = (u64) page_address(page);
-       if (addr)
-               addr += offset;
-       /* TODO: handle highmem pages */
-
-done:
-       return addr;
-}
-
-static void ipath_dma_unmap_page(struct ib_device *dev,
-                                u64 addr, size_t size,
-                                enum dma_data_direction direction)
-{
-       BUG_ON(!valid_dma_direction(direction));
-}
-
-static int ipath_map_sg(struct ib_device *dev, struct scatterlist *sgl,
-                       int nents, enum dma_data_direction direction)
-{
-       struct scatterlist *sg;
-       u64 addr;
-       int i;
-       int ret = nents;
-
-       BUG_ON(!valid_dma_direction(direction));
-
-       for_each_sg(sgl, sg, nents, i) {
-               addr = (u64) page_address(sg_page(sg));
-               /* TODO: handle highmem pages */
-               if (!addr) {
-                       ret = 0;
-                       break;
-               }
-               sg->dma_address = addr + sg->offset;
-#ifdef CONFIG_NEED_SG_DMA_LENGTH
-               sg->dma_length = sg->length;
-#endif
-       }
-       return ret;
-}
-
-static void ipath_unmap_sg(struct ib_device *dev,
-                          struct scatterlist *sg, int nents,
-                          enum dma_data_direction direction)
-{
-       BUG_ON(!valid_dma_direction(direction));
-}
-
-static void ipath_sync_single_for_cpu(struct ib_device *dev,
-                                     u64 addr,
-                                     size_t size,
-                                     enum dma_data_direction dir)
-{
-}
-
-static void ipath_sync_single_for_device(struct ib_device *dev,
-                                        u64 addr,
-                                        size_t size,
-                                        enum dma_data_direction dir)
-{
-}
-
-static void *ipath_dma_alloc_coherent(struct ib_device *dev, size_t size,
-                                     u64 *dma_handle, gfp_t flag)
-{
-       struct page *p;
-       void *addr = NULL;
-
-       p = alloc_pages(flag, get_order(size));
-       if (p)
-               addr = page_address(p);
-       if (dma_handle)
-               *dma_handle = (u64) addr;
-       return addr;
-}
-
-static void ipath_dma_free_coherent(struct ib_device *dev, size_t size,
-                                   void *cpu_addr, u64 dma_handle)
-{
-       free_pages((unsigned long) cpu_addr, get_order(size));
-}
-
-struct ib_dma_mapping_ops ipath_dma_mapping_ops = {
-       .mapping_error = ipath_mapping_error,
-       .map_single = ipath_dma_map_single,
-       .unmap_single = ipath_dma_unmap_single,
-       .map_page = ipath_dma_map_page,
-       .unmap_page = ipath_dma_unmap_page,
-       .map_sg = ipath_map_sg,
-       .unmap_sg = ipath_unmap_sg,
-       .sync_single_for_cpu = ipath_sync_single_for_cpu,
-       .sync_single_for_device = ipath_sync_single_for_device,
-       .alloc_coherent = ipath_dma_alloc_coherent,
-       .free_coherent = ipath_dma_free_coherent
-};
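
ipath_dma.c above fills a table of callbacks (struct ib_dma_mapping_ops) whose "mappings" simply hand back kernel virtual addresses, because the driver moves data with CPU copies rather than real DMA. Below is a self-contained userspace sketch of that ops-table-with-identity-mapping pattern; struct toy_dma_ops and the toy_* functions are invented for the example and are not part of the ib_verbs API.

#include <stdint.h>
#include <stdio.h>

/* Toy stand-in for struct ib_dma_mapping_ops: a table of mapping callbacks. */
struct toy_dma_ops {
        uint64_t (*map_single)(void *cpu_addr, size_t size);
        int (*mapping_error)(uint64_t dma_addr);
};

#define TOY_BAD_DMA_ADDRESS ((uint64_t)0)

/* "Mapping" is the identity: return the CPU address, as the ipath callbacks do. */
static uint64_t toy_map_single(void *cpu_addr, size_t size)
{
        (void)size;
        return (uint64_t)(uintptr_t)cpu_addr;
}

static int toy_mapping_error(uint64_t dma_addr)
{
        return dma_addr == TOY_BAD_DMA_ADDRESS;
}

static const struct toy_dma_ops toy_ops = {
        .map_single = toy_map_single,
        .mapping_error = toy_mapping_error,
};

int main(void)
{
        int payload = 42;
        uint64_t handle = toy_ops.map_single(&payload, sizeof(payload));

        if (toy_ops.mapping_error(handle))
                printf("mapping failed\n");
        else
                printf("handle 0x%llx refers to value %d\n",
                       (unsigned long long)handle, *(int *)(uintptr_t)handle);
        return 0;
}
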
diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/infiniband/hw/ipath/ipath_driver.c
deleted file mode 100644 (file)
index 871dbe5..0000000
+++ /dev/null
@@ -1,2789 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/sched.h>
-#include <linux/spinlock.h>
-#include <linux/idr.h>
-#include <linux/pci.h>
-#include <linux/io.h>
-#include <linux/delay.h>
-#include <linux/netdevice.h>
-#include <linux/vmalloc.h>
-#include <linux/bitmap.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-#ifdef CONFIG_X86_64
-#include <asm/pat.h>
-#endif
-
-#include "ipath_kernel.h"
-#include "ipath_verbs.h"
-
-static void ipath_update_pio_bufs(struct ipath_devdata *);
-
-const char *ipath_get_unit_name(int unit)
-{
-       static char iname[16];
-       snprintf(iname, sizeof iname, "infinipath%u", unit);
-       return iname;
-}
-
-#define DRIVER_LOAD_MSG "QLogic " IPATH_DRV_NAME " loaded: "
-#define PFX IPATH_DRV_NAME ": "
-
-/*
- * The buffer has to be longer than this string, so we can append
- * board/chip information to it in the init code.
- */
-const char ib_ipath_version[] = IPATH_IDSTR "\n";
-
-static struct idr unit_table;
-DEFINE_SPINLOCK(ipath_devs_lock);
-LIST_HEAD(ipath_dev_list);
-
-wait_queue_head_t ipath_state_wait;
-
-unsigned ipath_debug = __IPATH_INFO;
-
-module_param_named(debug, ipath_debug, uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(debug, "mask for debug prints");
-EXPORT_SYMBOL_GPL(ipath_debug);
-
-unsigned ipath_mtu4096 = 1; /* max 4KB IB mtu by default, if supported */
-module_param_named(mtu4096, ipath_mtu4096, uint, S_IRUGO);
-MODULE_PARM_DESC(mtu4096, "enable MTU of 4096 bytes, if supported");
-
-static unsigned ipath_hol_timeout_ms = 13000;
-module_param_named(hol_timeout_ms, ipath_hol_timeout_ms, uint, S_IRUGO);
-MODULE_PARM_DESC(hol_timeout_ms,
-       "duration of user app suspension after link failure");
-
-unsigned ipath_linkrecovery = 1;
-module_param_named(linkrecovery, ipath_linkrecovery, uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(linkrecovery, "enable workaround for link recovery issue");
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("QLogic <support@qlogic.com>");
-MODULE_DESCRIPTION("QLogic InfiniPath driver");
-
-/*
- * Table to translate the LINKTRAININGSTATE portion of
- * IBCStatus to a human-readable form.
- */
-const char *ipath_ibcstatus_str[] = {
-       "Disabled",
-       "LinkUp",
-       "PollActive",
-       "PollQuiet",
-       "SleepDelay",
-       "SleepQuiet",
-       "LState6",              /* unused */
-       "LState7",              /* unused */
-       "CfgDebounce",
-       "CfgRcvfCfg",
-       "CfgWaitRmt",
-       "CfgIdle",
-       "RecovRetrain",
-       "CfgTxRevLane",         /* unused before IBA7220 */
-       "RecovWaitRmt",
-       "RecovIdle",
-       /* below were added for IBA7220 */
-       "CfgEnhanced",
-       "CfgTest",
-       "CfgWaitRmtTest",
-       "CfgWaitCfgEnhanced",
-       "SendTS_T",
-       "SendTstIdles",
-       "RcvTS_T",
-       "SendTst_TS1s",
-       "LTState18", "LTState19", "LTState1A", "LTState1B",
-       "LTState1C", "LTState1D", "LTState1E", "LTState1F"
-};
-
-static void ipath_remove_one(struct pci_dev *);
-static int ipath_init_one(struct pci_dev *, const struct pci_device_id *);
-
-/* Only needed for registration, nothing else needs this info */
-#define PCI_VENDOR_ID_PATHSCALE 0x1fc1
-#define PCI_DEVICE_ID_INFINIPATH_HT 0xd
-
-/* Number of seconds before our card status check...  */
-#define STATUS_TIMEOUT 60
-
-static const struct pci_device_id ipath_pci_tbl[] = {
-       { PCI_DEVICE(PCI_VENDOR_ID_PATHSCALE, PCI_DEVICE_ID_INFINIPATH_HT) },
-       { 0, }
-};
-
-MODULE_DEVICE_TABLE(pci, ipath_pci_tbl);
-
-static struct pci_driver ipath_driver = {
-       .name = IPATH_DRV_NAME,
-       .probe = ipath_init_one,
-       .remove = ipath_remove_one,
-       .id_table = ipath_pci_tbl,
-       .driver = {
-               .groups = ipath_driver_attr_groups,
-       },
-};
-
-static inline void read_bars(struct ipath_devdata *dd, struct pci_dev *dev,
-                            u32 *bar0, u32 *bar1)
-{
-       int ret;
-
-       ret = pci_read_config_dword(dev, PCI_BASE_ADDRESS_0, bar0);
-       if (ret)
-               ipath_dev_err(dd, "failed to read bar0 before enable: "
-                             "error %d\n", -ret);
-
-       ret = pci_read_config_dword(dev, PCI_BASE_ADDRESS_1, bar1);
-       if (ret)
-               ipath_dev_err(dd, "failed to read bar1 before enable: "
-                             "error %d\n", -ret);
-
-       ipath_dbg("Read bar0 %x bar1 %x\n", *bar0, *bar1);
-}
-
-static void ipath_free_devdata(struct pci_dev *pdev,
-                              struct ipath_devdata *dd)
-{
-       unsigned long flags;
-
-       pci_set_drvdata(pdev, NULL);
-
-       if (dd->ipath_unit != -1) {
-               spin_lock_irqsave(&ipath_devs_lock, flags);
-               idr_remove(&unit_table, dd->ipath_unit);
-               list_del(&dd->ipath_list);
-               spin_unlock_irqrestore(&ipath_devs_lock, flags);
-       }
-       vfree(dd);
-}
-
-static struct ipath_devdata *ipath_alloc_devdata(struct pci_dev *pdev)
-{
-       unsigned long flags;
-       struct ipath_devdata *dd;
-       int ret;
-
-       dd = vzalloc(sizeof(*dd));
-       if (!dd) {
-               dd = ERR_PTR(-ENOMEM);
-               goto bail;
-       }
-       dd->ipath_unit = -1;
-
-       idr_preload(GFP_KERNEL);
-       spin_lock_irqsave(&ipath_devs_lock, flags);
-
-       ret = idr_alloc(&unit_table, dd, 0, 0, GFP_NOWAIT);
-       if (ret < 0) {
-               printk(KERN_ERR IPATH_DRV_NAME
-                      ": Could not allocate unit ID: error %d\n", -ret);
-               ipath_free_devdata(pdev, dd);
-               dd = ERR_PTR(ret);
-               goto bail_unlock;
-       }
-       dd->ipath_unit = ret;
-
-       dd->pcidev = pdev;
-       pci_set_drvdata(pdev, dd);
-
-       list_add(&dd->ipath_list, &ipath_dev_list);
-
-bail_unlock:
-       spin_unlock_irqrestore(&ipath_devs_lock, flags);
-       idr_preload_end();
-bail:
-       return dd;
-}
-
-static inline struct ipath_devdata *__ipath_lookup(int unit)
-{
-       return idr_find(&unit_table, unit);
-}
-
-struct ipath_devdata *ipath_lookup(int unit)
-{
-       struct ipath_devdata *dd;
-       unsigned long flags;
-
-       spin_lock_irqsave(&ipath_devs_lock, flags);
-       dd = __ipath_lookup(unit);
-       spin_unlock_irqrestore(&ipath_devs_lock, flags);
-
-       return dd;
-}
-
-int ipath_count_units(int *npresentp, int *nupp, int *maxportsp)
-{
-       int nunits, npresent, nup;
-       struct ipath_devdata *dd;
-       unsigned long flags;
-       int maxports;
-
-       nunits = npresent = nup = maxports = 0;
-
-       spin_lock_irqsave(&ipath_devs_lock, flags);
-
-       list_for_each_entry(dd, &ipath_dev_list, ipath_list) {
-               nunits++;
-               if ((dd->ipath_flags & IPATH_PRESENT) && dd->ipath_kregbase)
-                       npresent++;
-               if (dd->ipath_lid &&
-                   !(dd->ipath_flags & (IPATH_DISABLED | IPATH_LINKDOWN
-                                        | IPATH_LINKUNK)))
-                       nup++;
-               if (dd->ipath_cfgports > maxports)
-                       maxports = dd->ipath_cfgports;
-       }
-
-       spin_unlock_irqrestore(&ipath_devs_lock, flags);
-
-       if (npresentp)
-               *npresentp = npresent;
-       if (nupp)
-               *nupp = nup;
-       if (maxportsp)
-               *maxportsp = maxports;
-
-       return nunits;
-}
-
-/*
- * These next two routines are placeholders in case we don't have per-arch
- * code for controlling write combining.  If explicit control of write
- * combining is not available, performance will probably be awful.
- */
-
-int __attribute__((weak)) ipath_enable_wc(struct ipath_devdata *dd)
-{
-       return -EOPNOTSUPP;
-}
-
-void __attribute__((weak)) ipath_disable_wc(struct ipath_devdata *dd)
-{
-}
-
-/*
- * Perform a PIO buffer bandwidth write test, to verify proper system
- * configuration.  Even when all the setup calls work, occasionally
- * BIOS or other issues can prevent write combining from working, or
- * can cause other bandwidth problems to the chip.
- *
- * This test simply writes the same buffer over and over again, and
- * measures close to the peak bandwidth to the chip (not testing
- * data bandwidth to the wire).   On chips that use an address-based
- * trigger to send packets to the wire, this is easy.  On chips that
- * use a count to trigger, we want to make sure that the packet doesn't
- * go out on the wire, or trigger flow control checks.
- */
-static void ipath_verify_pioperf(struct ipath_devdata *dd)
-{
-       u32 pbnum, cnt, lcnt;
-       u32 __iomem *piobuf;
-       u32 *addr;
-       u64 msecs, emsecs;
-
-       piobuf = ipath_getpiobuf(dd, 0, &pbnum);
-       if (!piobuf) {
-               dev_info(&dd->pcidev->dev,
-                       "No PIObufs for checking perf, skipping\n");
-               return;
-       }
-
-       /*
-        * Enough to give us a reasonable test, less than piobuf size, and
-        * likely multiple of store buffer length.
-        */
-       cnt = 1024;
-
-       addr = vmalloc(cnt);
-       if (!addr) {
-               dev_info(&dd->pcidev->dev,
-                       "Couldn't get memory for checking PIO perf,"
-                       " skipping\n");
-               goto done;
-       }
-
-       preempt_disable();  /* we want reasonably accurate elapsed time */
-       msecs = 1 + jiffies_to_msecs(jiffies);
-       for (lcnt = 0; lcnt < 10000U; lcnt++) {
-               /* wait until we cross msec boundary */
-               if (jiffies_to_msecs(jiffies) >= msecs)
-                       break;
-               udelay(1);
-       }
-
-       ipath_disable_armlaunch(dd);
-
-       /*
-        * length 0, no dwords actually sent, and mark as VL15
-        * on chips where that may matter (due to IB flowcontrol)
-        */
-       if ((dd->ipath_flags & IPATH_HAS_PBC_CNT))
-               writeq(1UL << 63, piobuf);
-       else
-               writeq(0, piobuf);
-       ipath_flush_wc();
-
-       /*
-        * this is only roughly accurate, since even with preempt we
-        * still take interrupts that could take a while.   Running for
-        * >= 5 msec seems to get us "close enough" to accurate values
-        */
-       msecs = jiffies_to_msecs(jiffies);
-       for (emsecs = lcnt = 0; emsecs <= 5UL; lcnt++) {
-               __iowrite32_copy(piobuf + 64, addr, cnt >> 2);
-               emsecs = jiffies_to_msecs(jiffies) - msecs;
-       }
-
-       /* 1 GiB/sec, slightly over IB SDR line rate */
-       if (lcnt < (emsecs * 1024U))
-               ipath_dev_err(dd,
-                       "Performance problem: bandwidth to PIO buffers is "
-                       "only %u MiB/sec\n",
-                       lcnt / (u32) emsecs);
-       else
-               ipath_dbg("PIO buffer bandwidth %u MiB/sec is OK\n",
-                       lcnt / (u32) emsecs);
-
-       preempt_enable();
-
-       vfree(addr);
-
-done:
-       /* disarm piobuf, so it's available again */
-       ipath_disarm_piobufs(dd, pbnum, 1);
-       ipath_enable_armlaunch(dd);
-}
-
-static void cleanup_device(struct ipath_devdata *dd);
-
-static int ipath_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
-{
-       int ret, len, j;
-       struct ipath_devdata *dd;
-       unsigned long long addr;
-       u32 bar0 = 0, bar1 = 0;
-
-#ifdef CONFIG_X86_64
-       if (pat_enabled()) {
-               pr_warn("ipath needs PAT disabled, boot with nopat kernel parameter\n");
-               ret = -ENODEV;
-               goto bail;
-       }
-#endif
-
-       dd = ipath_alloc_devdata(pdev);
-       if (IS_ERR(dd)) {
-               ret = PTR_ERR(dd);
-               printk(KERN_ERR IPATH_DRV_NAME
-                      ": Could not allocate devdata: error %d\n", -ret);
-               goto bail;
-       }
-
-       ipath_cdbg(VERBOSE, "initializing unit #%u\n", dd->ipath_unit);
-
-       ret = pci_enable_device(pdev);
-       if (ret) {
-               /* This can happen iff:
-                *
-                * We did a chip reset, and then failed to reprogram the
-                * BAR, or the chip reset due to an internal error.  We then
-                * unloaded the driver and reloaded it.
-                *
-                * Both reset cases set the BAR back to initial state.  For
-                * the latter case, the AER sticky error bit at offset 0x718
-                * should be set, but the Linux kernel doesn't yet know
-                * about that, it appears.  If the original BAR was retained
-                * in the kernel data structures, this may be OK.
-                */
-               ipath_dev_err(dd, "enable unit %d failed: error %d\n",
-                             dd->ipath_unit, -ret);
-               goto bail_devdata;
-       }
-       addr = pci_resource_start(pdev, 0);
-       len = pci_resource_len(pdev, 0);
-       ipath_cdbg(VERBOSE, "regbase (0) %llx len %d irq %d, vend %x/%x "
-                  "driver_data %lx\n", addr, len, pdev->irq, ent->vendor,
-                  ent->device, ent->driver_data);
-
-       read_bars(dd, pdev, &bar0, &bar1);
-
-       if (!bar1 && !(bar0 & ~0xf)) {
-               if (addr) {
-                       dev_info(&pdev->dev, "BAR is 0 (probable RESET), "
-                                "rewriting as %llx\n", addr);
-                       ret = pci_write_config_dword(
-                               pdev, PCI_BASE_ADDRESS_0, addr);
-                       if (ret) {
-                               ipath_dev_err(dd, "rewrite of BAR0 "
-                                             "failed: err %d\n", -ret);
-                               goto bail_disable;
-                       }
-                       ret = pci_write_config_dword(
-                               pdev, PCI_BASE_ADDRESS_1, addr >> 32);
-                       if (ret) {
-                               ipath_dev_err(dd, "rewrite of BAR1 "
-                                             "failed: err %d\n", -ret);
-                               goto bail_disable;
-                       }
-               } else {
-                       ipath_dev_err(dd, "BAR is 0 (probable RESET), "
-                                     "not usable until reboot\n");
-                       ret = -ENODEV;
-                       goto bail_disable;
-               }
-       }
-
-       ret = pci_request_regions(pdev, IPATH_DRV_NAME);
-       if (ret) {
-               dev_info(&pdev->dev, "pci_request_regions unit %u fails: "
-                        "err %d\n", dd->ipath_unit, -ret);
-               goto bail_disable;
-       }
-
-       ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
-       if (ret) {
-               /*
-                * if the 64 bit setup fails, try 32 bit.  Some systems
-                * do not set up 64 bit maps on systems with 2GB or less
-                * memory installed.
-                */
-               ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
-               if (ret) {
-                       dev_info(&pdev->dev,
-                               "Unable to set DMA mask for unit %u: %d\n",
-                               dd->ipath_unit, ret);
-                       goto bail_regions;
-               }
-               else {
-                       ipath_dbg("No 64bit DMA mask, used 32 bit mask\n");
-                       ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
-                       if (ret)
-                               dev_info(&pdev->dev,
-                                       "Unable to set DMA consistent mask "
-                                       "for unit %u: %d\n",
-                                       dd->ipath_unit, ret);
-
-               }
-       }
-       else {
-               ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
-               if (ret)
-                       dev_info(&pdev->dev,
-                               "Unable to set DMA consistent mask "
-                               "for unit %u: %d\n",
-                               dd->ipath_unit, ret);
-       }
-
-       pci_set_master(pdev);
-
-       /*
-        * Save BARs to rewrite after device reset.  Save all 64 bits of
-        * BAR, just in case.
-        */
-       dd->ipath_pcibar0 = addr;
-       dd->ipath_pcibar1 = addr >> 32;
-       dd->ipath_deviceid = ent->device;       /* save for later use */
-       dd->ipath_vendorid = ent->vendor;
-
-       /* setup the chip-specific functions, as early as possible. */
-       switch (ent->device) {
-       case PCI_DEVICE_ID_INFINIPATH_HT:
-               ipath_init_iba6110_funcs(dd);
-               break;
-
-       default:
-               ipath_dev_err(dd, "Found unknown QLogic deviceid 0x%x, "
-                             "failing\n", ent->device);
-               return -ENODEV;
-       }
-
-       for (j = 0; j < 6; j++) {
-               if (!pdev->resource[j].start)
-                       continue;
-               ipath_cdbg(VERBOSE, "BAR %d %pR, len %llx\n",
-                          j, &pdev->resource[j],
-                          (unsigned long long)pci_resource_len(pdev, j));
-       }
-
-       if (!addr) {
-               ipath_dev_err(dd, "No valid address in BAR 0!\n");
-               ret = -ENODEV;
-               goto bail_regions;
-       }
-
-       dd->ipath_pcirev = pdev->revision;
-
-#if defined(__powerpc__)
-       /* There isn't a generic way to specify writethrough mappings */
-       dd->ipath_kregbase = __ioremap(addr, len,
-               (_PAGE_NO_CACHE|_PAGE_WRITETHRU));
-#else
-       /* XXX: split this properly to enable on PAT */
-       dd->ipath_kregbase = ioremap_nocache(addr, len);
-#endif
-
-       if (!dd->ipath_kregbase) {
-               ipath_dbg("Unable to map io addr %llx to kvirt, failing\n",
-                         addr);
-               ret = -ENOMEM;
-               goto bail_iounmap;
-       }
-       dd->ipath_kregend = (u64 __iomem *)
-               ((void __iomem *)dd->ipath_kregbase + len);
-       dd->ipath_physaddr = addr;      /* used for io_remap, etc. */
-       /* for user mmap */
-       ipath_cdbg(VERBOSE, "mapped io addr %llx to kregbase %p\n",
-                  addr, dd->ipath_kregbase);
-
-       if (dd->ipath_f_bus(dd, pdev))
-               ipath_dev_err(dd, "Failed to setup config space; "
-                             "continuing anyway\n");
-
-       /*
-        * set up our interrupt handler; IRQF_SHARED probably isn't needed,
-        * since MSI interrupts shouldn't be shared, but it won't hurt for now.
-        * Check for irq 0 only after we return from chip-specific bus setup,
-        * since that setup can change the irq.
-        */
-       if (!dd->ipath_irq)
-               ipath_dev_err(dd, "irq is 0, BIOS error?  Interrupts won't "
-                             "work\n");
-       else {
-               ret = request_irq(dd->ipath_irq, ipath_intr, IRQF_SHARED,
-                                 IPATH_DRV_NAME, dd);
-               if (ret) {
-                       ipath_dev_err(dd, "Couldn't setup irq handler, "
-                                     "irq=%d: %d\n", dd->ipath_irq, ret);
-                       goto bail_iounmap;
-               }
-       }
-
-       ret = ipath_init_chip(dd, 0);   /* do the chip-specific init */
-       if (ret)
-               goto bail_irqsetup;
-
-       ret = ipath_enable_wc(dd);
-
-       if (ret)
-               ret = 0;
-
-       ipath_verify_pioperf(dd);
-
-       ipath_device_create_group(&pdev->dev, dd);
-       ipathfs_add_device(dd);
-       ipath_user_add(dd);
-       ipath_diag_add(dd);
-       ipath_register_ib_device(dd);
-
-       goto bail;
-
-bail_irqsetup:
-       cleanup_device(dd);
-
-       if (dd->ipath_irq)
-               dd->ipath_f_free_irq(dd);
-
-       if (dd->ipath_f_cleanup)
-               dd->ipath_f_cleanup(dd);
-
-bail_iounmap:
-       iounmap((volatile void __iomem *) dd->ipath_kregbase);
-
-bail_regions:
-       pci_release_regions(pdev);
-
-bail_disable:
-       pci_disable_device(pdev);
-
-bail_devdata:
-       ipath_free_devdata(pdev, dd);
-
-bail:
-       return ret;
-}
-
-static void cleanup_device(struct ipath_devdata *dd)
-{
-       int port;
-       struct ipath_portdata **tmp;
-       unsigned long flags;
-
-       if (*dd->ipath_statusp & IPATH_STATUS_CHIP_PRESENT) {
-               /* can't do anything more with chip; needs re-init */
-               *dd->ipath_statusp &= ~IPATH_STATUS_CHIP_PRESENT;
-               if (dd->ipath_kregbase) {
-                       /*
-                        * if we haven't already cleaned up before, clear these
-                        * now to ensure any register reads/writes "fail" until
-                        * re-init
-                        */
-                       dd->ipath_kregbase = NULL;
-                       dd->ipath_uregbase = 0;
-                       dd->ipath_sregbase = 0;
-                       dd->ipath_cregbase = 0;
-                       dd->ipath_kregsize = 0;
-               }
-               ipath_disable_wc(dd);
-       }
-
-       if (dd->ipath_spectriggerhit)
-               dev_info(&dd->pcidev->dev, "%lu special trigger hits\n",
-                        dd->ipath_spectriggerhit);
-
-       if (dd->ipath_pioavailregs_dma) {
-               dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
-                                 (void *) dd->ipath_pioavailregs_dma,
-                                 dd->ipath_pioavailregs_phys);
-               dd->ipath_pioavailregs_dma = NULL;
-       }
-       if (dd->ipath_dummy_hdrq) {
-               dma_free_coherent(&dd->pcidev->dev,
-                       dd->ipath_pd[0]->port_rcvhdrq_size,
-                       dd->ipath_dummy_hdrq, dd->ipath_dummy_hdrq_phys);
-               dd->ipath_dummy_hdrq = NULL;
-       }
-
-       if (dd->ipath_pageshadow) {
-               struct page **tmpp = dd->ipath_pageshadow;
-               dma_addr_t *tmpd = dd->ipath_physshadow;
-               int i, cnt = 0;
-
-               ipath_cdbg(VERBOSE, "Unlocking any expTID pages still "
-                          "locked\n");
-               for (port = 0; port < dd->ipath_cfgports; port++) {
-                       int port_tidbase = port * dd->ipath_rcvtidcnt;
-                       int maxtid = port_tidbase + dd->ipath_rcvtidcnt;
-                       for (i = port_tidbase; i < maxtid; i++) {
-                               if (!tmpp[i])
-                                       continue;
-                               pci_unmap_page(dd->pcidev, tmpd[i],
-                                       PAGE_SIZE, PCI_DMA_FROMDEVICE);
-                               ipath_release_user_pages(&tmpp[i], 1);
-                               tmpp[i] = NULL;
-                               cnt++;
-                       }
-               }
-               if (cnt) {
-                       ipath_stats.sps_pageunlocks += cnt;
-                       ipath_cdbg(VERBOSE, "There were still %u expTID "
-                                  "entries locked\n", cnt);
-               }
-               if (ipath_stats.sps_pagelocks ||
-                   ipath_stats.sps_pageunlocks)
-                       ipath_cdbg(VERBOSE, "%llu pages locked, %llu "
-                                  "unlocked via ipath_m{un}lock\n",
-                                  (unsigned long long)
-                                  ipath_stats.sps_pagelocks,
-                                  (unsigned long long)
-                                  ipath_stats.sps_pageunlocks);
-
-               ipath_cdbg(VERBOSE, "Free shadow page tid array at %p\n",
-                          dd->ipath_pageshadow);
-               tmpp = dd->ipath_pageshadow;
-               dd->ipath_pageshadow = NULL;
-               vfree(tmpp);
-
-               dd->ipath_egrtidbase = NULL;
-       }
-
-       /*
-        * free any resources still in use (usually just kernel ports)
-        * at unload; we do this for portcnt, because that's what we allocate.
-        * We acquire lock to be really paranoid that ipath_pd isn't being
-        * accessed from some interrupt-related code (that should not happen,
-        * but best to be sure).
-        */
-       spin_lock_irqsave(&dd->ipath_uctxt_lock, flags);
-       tmp = dd->ipath_pd;
-       dd->ipath_pd = NULL;
-       spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
-       for (port = 0; port < dd->ipath_portcnt; port++) {
-               struct ipath_portdata *pd = tmp[port];
-               tmp[port] = NULL; /* debugging paranoia */
-               ipath_free_pddata(dd, pd);
-       }
-       kfree(tmp);
-}
-
-static void ipath_remove_one(struct pci_dev *pdev)
-{
-       struct ipath_devdata *dd = pci_get_drvdata(pdev);
-
-       ipath_cdbg(VERBOSE, "removing, pdev=%p, dd=%p\n", pdev, dd);
-
-       /*
-        * disable the IB link early, to be sure no new packets arrive, which
-        * complicates the shutdown process
-        */
-       ipath_shutdown_device(dd);
-
-       flush_workqueue(ib_wq);
-
-       if (dd->verbs_dev)
-               ipath_unregister_ib_device(dd->verbs_dev);
-
-       ipath_diag_remove(dd);
-       ipath_user_remove(dd);
-       ipathfs_remove_device(dd);
-       ipath_device_remove_group(&pdev->dev, dd);
-
-       ipath_cdbg(VERBOSE, "Releasing pci memory regions, dd %p, "
-                  "unit %u\n", dd, (u32) dd->ipath_unit);
-
-       cleanup_device(dd);
-
-       /*
-        * turn off rcv, send, and interrupts for all ports; should all
-        * drivers also hard reset the chip here?
-        * free up port 0 (kernel) rcvhdr, egr bufs, and eventually tid bufs
-        * for all versions of the driver, if they were allocated
-        */
-       if (dd->ipath_irq) {
-               ipath_cdbg(VERBOSE, "unit %u free irq %d\n",
-                          dd->ipath_unit, dd->ipath_irq);
-               dd->ipath_f_free_irq(dd);
-       } else
-               ipath_dbg("irq is 0, not doing free_irq "
-                         "for unit %u\n", dd->ipath_unit);
-       /*
-        * we check for NULL here, because it's outside
-        * the kregbase check, and we need to call it
-        * after the free_irq.  Thus it's possible that
-        * the function pointers were never initialized.
-        */
-       if (dd->ipath_f_cleanup)
-               /* clean up chip-specific stuff */
-               dd->ipath_f_cleanup(dd);
-
-       ipath_cdbg(VERBOSE, "Unmapping kregbase %p\n", dd->ipath_kregbase);
-       iounmap((volatile void __iomem *) dd->ipath_kregbase);
-       pci_release_regions(pdev);
-       ipath_cdbg(VERBOSE, "calling pci_disable_device\n");
-       pci_disable_device(pdev);
-
-       ipath_free_devdata(pdev, dd);
-}
-
-/* general driver use */
-DEFINE_MUTEX(ipath_mutex);
-
-static DEFINE_SPINLOCK(ipath_pioavail_lock);
-
-/**
- * ipath_disarm_piobufs - cancel a range of PIO buffers
- * @dd: the infinipath device
- * @first: the first PIO buffer to cancel
- * @cnt: the number of PIO buffers to cancel
- *
- * cancel a range of PIO buffers, used when they might be armed, but
- * not triggered.  Used at init to ensure buffer state, and also at user
- * process close, in case it died while writing to a PIO buffer.
- * Also after errors.
- */
-void ipath_disarm_piobufs(struct ipath_devdata *dd, unsigned first,
-                         unsigned cnt)
-{
-       unsigned i, last = first + cnt;
-       unsigned long flags;
-
-       ipath_cdbg(PKT, "disarm %u PIObufs first=%u\n", cnt, first);
-       for (i = first; i < last; i++) {
-               spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-               /*
-                * The disarm-related bits are write-only, so it
-                * is ok to OR them in with our copy of sendctrl
-                * while we hold the lock.
-                */
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
-                       dd->ipath_sendctrl | INFINIPATH_S_DISARM |
-                       (i << INFINIPATH_S_DISARMPIOBUF_SHIFT));
-               /* can't disarm bufs back-to-back per iba7220 spec */
-               ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-               spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-       }
-       /* on some older chips, update may not happen after cancel */
-       ipath_force_pio_avail_update(dd);
-}
-
-/**
- * ipath_wait_linkstate - wait for an IB link state change to occur
- * @dd: the infinipath device
- * @state: the state to wait for
- * @msecs: the number of milliseconds to wait
- *
- * wait up to msecs milliseconds for an IB link state change to occur.
- * For now, take the easy polling route.  Currently used only by
- * ipath_set_linkstate.  Returns 0 if the state is reached, otherwise
- * -ETIMEDOUT.  state can have multiple state bits set, to match any of
- * several transitions.
- */
-int ipath_wait_linkstate(struct ipath_devdata *dd, u32 state, int msecs)
-{
-       dd->ipath_state_wanted = state;
-       wait_event_interruptible_timeout(ipath_state_wait,
-                                        (dd->ipath_flags & state),
-                                        msecs_to_jiffies(msecs));
-       dd->ipath_state_wanted = 0;
-
-       if (!(dd->ipath_flags & state)) {
-               u64 val;
-               ipath_cdbg(VERBOSE, "Didn't reach linkstate %s within %u"
-                          " ms\n",
-                          /* test INIT ahead of DOWN, both can be set */
-                          (state & IPATH_LINKINIT) ? "INIT" :
-                          ((state & IPATH_LINKDOWN) ? "DOWN" :
-                           ((state & IPATH_LINKARMED) ? "ARM" : "ACTIVE")),
-                          msecs);
-               val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus);
-               ipath_cdbg(VERBOSE, "ibcc=%llx ibcstatus=%llx (%s)\n",
-                          (unsigned long long) ipath_read_kreg64(
-                                  dd, dd->ipath_kregs->kr_ibcctrl),
-                          (unsigned long long) val,
-                          ipath_ibcstatus_str[val & dd->ibcs_lts_mask]);
-       }
-       return (dd->ipath_flags & state) ? 0 : -ETIMEDOUT;
-}
-
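The wait-then-recheck shape of ipath_wait_linkstate() (wait with a timeout, then test the flags one final time and report -ETIMEDOUT if they still do not match) can be sketched in plain C with a polling loop standing in for the kernel wait queue. link_state_is() is a hypothetical predicate, not a driver function:

#include <errno.h>
#include <stdbool.h>
#include <unistd.h>

static int wait_for_state(bool (*link_state_is)(unsigned int),
                          unsigned int state, int msecs)
{
        while (msecs-- > 0) {
                if (link_state_is(state))
                        return 0;
                usleep(1000);           /* poll roughly once per millisecond */
        }
        /* one last check, mirroring the final flags test above */
        return link_state_is(state) ? 0 : -ETIMEDOUT;
}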
-static void decode_sdma_errs(struct ipath_devdata *dd, ipath_err_t err,
-       char *buf, size_t blen)
-{
-       static const struct {
-               ipath_err_t err;
-               const char *msg;
-       } errs[] = {
-               { INFINIPATH_E_SDMAGENMISMATCH, "SDmaGenMismatch" },
-               { INFINIPATH_E_SDMAOUTOFBOUND, "SDmaOutOfBound" },
-               { INFINIPATH_E_SDMATAILOUTOFBOUND, "SDmaTailOutOfBound" },
-               { INFINIPATH_E_SDMABASE, "SDmaBase" },
-               { INFINIPATH_E_SDMA1STDESC, "SDma1stDesc" },
-               { INFINIPATH_E_SDMARPYTAG, "SDmaRpyTag" },
-               { INFINIPATH_E_SDMADWEN, "SDmaDwEn" },
-               { INFINIPATH_E_SDMAMISSINGDW, "SDmaMissingDw" },
-               { INFINIPATH_E_SDMAUNEXPDATA, "SDmaUnexpData" },
-               { INFINIPATH_E_SDMADESCADDRMISALIGN, "SDmaDescAddrMisalign" },
-               { INFINIPATH_E_SENDBUFMISUSE, "SendBufMisuse" },
-               { INFINIPATH_E_SDMADISABLED, "SDmaDisabled" },
-       };
-       int i;
-       int expected;
-       size_t bidx = 0;
-
-       for (i = 0; i < ARRAY_SIZE(errs); i++) {
-               expected = (errs[i].err != INFINIPATH_E_SDMADISABLED) ? 0 :
-                       test_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status);
-               if ((err & errs[i].err) && !expected)
-                       bidx += snprintf(buf + bidx, blen - bidx,
-                                        "%s ", errs[i].msg);
-       }
-}
-
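The mask-and-name table idiom used by decode_sdma_errs() generalizes to any flag word. A self-contained sketch with made-up error bits rather than the driver's INFINIPATH_E_* masks:

#include <stdint.h>
#include <stdio.h>

struct err_name {
        uint64_t mask;
        const char *msg;
};

static void decode_flags(uint64_t err, char *buf, size_t blen)
{
        /* hypothetical error bits, standing in for the driver's masks */
        static const struct err_name errs[] = {
                { 1ull << 0, "ExampleErrA" },
                { 1ull << 1, "ExampleErrB" },
                { 1ull << 2, "ExampleErrC" },
        };
        size_t i, bidx = 0;

        buf[0] = '\0';
        for (i = 0; i < sizeof(errs) / sizeof(errs[0]); i++)
                if ((err & errs[i].mask) && bidx < blen)
                        bidx += snprintf(buf + bidx, blen - bidx, "%s ",
                                         errs[i].msg);
}

int main(void)
{
        char buf[64];

        decode_flags(0x5, buf, sizeof(buf));
        printf("%s\n", buf);            /* prints "ExampleErrA ExampleErrC " */
        return 0;
}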
-/*
- * Decode the error status into strings, deciding whether to always
- * print it or not, depending on "normal packet errors" vs everything
- * else.   Return 1 for "real" errors, otherwise 0 if only packet
- * errors, so the caller can decide what to print with the string.
- */
-int ipath_decode_err(struct ipath_devdata *dd, char *buf, size_t blen,
-       ipath_err_t err)
-{
-       int iserr = 1;
-       *buf = '\0';
-       if (err & INFINIPATH_E_PKTERRS) {
-               if (!(err & ~INFINIPATH_E_PKTERRS))
-                       iserr = 0; // if only packet errors.
-               if (ipath_debug & __IPATH_ERRPKTDBG) {
-                       if (err & INFINIPATH_E_REBP)
-                               strlcat(buf, "EBP ", blen);
-                       if (err & INFINIPATH_E_RVCRC)
-                               strlcat(buf, "VCRC ", blen);
-                       if (err & INFINIPATH_E_RICRC) {
-                               strlcat(buf, "CRC ", blen);
-                               // clear for check below, so only once
-                               err &= INFINIPATH_E_RICRC;
-                       }
-                       if (err & INFINIPATH_E_RSHORTPKTLEN)
-                               strlcat(buf, "rshortpktlen ", blen);
-                       if (err & INFINIPATH_E_SDROPPEDDATAPKT)
-                               strlcat(buf, "sdroppeddatapkt ", blen);
-                       if (err & INFINIPATH_E_SPKTLEN)
-                               strlcat(buf, "spktlen ", blen);
-               }
-               if ((err & INFINIPATH_E_RICRC) &&
-                       !(err&(INFINIPATH_E_RVCRC|INFINIPATH_E_REBP)))
-                       strlcat(buf, "CRC ", blen);
-               if (!iserr)
-                       goto done;
-       }
-       if (err & INFINIPATH_E_RHDRLEN)
-               strlcat(buf, "rhdrlen ", blen);
-       if (err & INFINIPATH_E_RBADTID)
-               strlcat(buf, "rbadtid ", blen);
-       if (err & INFINIPATH_E_RBADVERSION)
-               strlcat(buf, "rbadversion ", blen);
-       if (err & INFINIPATH_E_RHDR)
-               strlcat(buf, "rhdr ", blen);
-       if (err & INFINIPATH_E_SENDSPECIALTRIGGER)
-               strlcat(buf, "sendspecialtrigger ", blen);
-       if (err & INFINIPATH_E_RLONGPKTLEN)
-               strlcat(buf, "rlongpktlen ", blen);
-       if (err & INFINIPATH_E_RMAXPKTLEN)
-               strlcat(buf, "rmaxpktlen ", blen);
-       if (err & INFINIPATH_E_RMINPKTLEN)
-               strlcat(buf, "rminpktlen ", blen);
-       if (err & INFINIPATH_E_SMINPKTLEN)
-               strlcat(buf, "sminpktlen ", blen);
-       if (err & INFINIPATH_E_RFORMATERR)
-               strlcat(buf, "rformaterr ", blen);
-       if (err & INFINIPATH_E_RUNSUPVL)
-               strlcat(buf, "runsupvl ", blen);
-       if (err & INFINIPATH_E_RUNEXPCHAR)
-               strlcat(buf, "runexpchar ", blen);
-       if (err & INFINIPATH_E_RIBFLOW)
-               strlcat(buf, "ribflow ", blen);
-       if (err & INFINIPATH_E_SUNDERRUN)
-               strlcat(buf, "sunderrun ", blen);
-       if (err & INFINIPATH_E_SPIOARMLAUNCH)
-               strlcat(buf, "spioarmlaunch ", blen);
-       if (err & INFINIPATH_E_SUNEXPERRPKTNUM)
-               strlcat(buf, "sunexperrpktnum ", blen);
-       if (err & INFINIPATH_E_SDROPPEDSMPPKT)
-               strlcat(buf, "sdroppedsmppkt ", blen);
-       if (err & INFINIPATH_E_SMAXPKTLEN)
-               strlcat(buf, "smaxpktlen ", blen);
-       if (err & INFINIPATH_E_SUNSUPVL)
-               strlcat(buf, "sunsupVL ", blen);
-       if (err & INFINIPATH_E_INVALIDADDR)
-               strlcat(buf, "invalidaddr ", blen);
-       if (err & INFINIPATH_E_RRCVEGRFULL)
-               strlcat(buf, "rcvegrfull ", blen);
-       if (err & INFINIPATH_E_RRCVHDRFULL)
-               strlcat(buf, "rcvhdrfull ", blen);
-       if (err & INFINIPATH_E_IBSTATUSCHANGED)
-               strlcat(buf, "ibcstatuschg ", blen);
-       if (err & INFINIPATH_E_RIBLOSTLINK)
-               strlcat(buf, "riblostlink ", blen);
-       if (err & INFINIPATH_E_HARDWARE)
-               strlcat(buf, "hardware ", blen);
-       if (err & INFINIPATH_E_RESET)
-               strlcat(buf, "reset ", blen);
-       if (err & INFINIPATH_E_SDMAERRS)
-               decode_sdma_errs(dd, err, buf, blen);
-       if (err & INFINIPATH_E_INVALIDEEPCMD)
-               strlcat(buf, "invalideepromcmd ", blen);
-done:
-       return iserr;
-}
-
-/**
- * get_rhf_errstring - decode RHF errors
- * @err: the err number
- * @msg: the output buffer
- * @len: the length of the output buffer
- *
- * only used in one place now; may want more later
- */
-static void get_rhf_errstring(u32 err, char *msg, size_t len)
-{
-       /* start empty, so with no errors the caller gets an empty string */
-       *msg = '\0';
-
-       if (err & INFINIPATH_RHF_H_ICRCERR)
-               strlcat(msg, "icrcerr ", len);
-       if (err & INFINIPATH_RHF_H_VCRCERR)
-               strlcat(msg, "vcrcerr ", len);
-       if (err & INFINIPATH_RHF_H_PARITYERR)
-               strlcat(msg, "parityerr ", len);
-       if (err & INFINIPATH_RHF_H_LENERR)
-               strlcat(msg, "lenerr ", len);
-       if (err & INFINIPATH_RHF_H_MTUERR)
-               strlcat(msg, "mtuerr ", len);
-       if (err & INFINIPATH_RHF_H_IHDRERR)
-               /* infinipath hdr checksum error */
-               strlcat(msg, "ipathhdrerr ", len);
-       if (err & INFINIPATH_RHF_H_TIDERR)
-               strlcat(msg, "tiderr ", len);
-       if (err & INFINIPATH_RHF_H_MKERR)
-               /* bad port, offset, etc. */
-               strlcat(msg, "invalid ipathhdr ", len);
-       if (err & INFINIPATH_RHF_H_IBERR)
-               strlcat(msg, "iberr ", len);
-       if (err & INFINIPATH_RHF_L_SWA)
-               strlcat(msg, "swA ", len);
-       if (err & INFINIPATH_RHF_L_SWB)
-               strlcat(msg, "swB ", len);
-}
-
-/**
- * ipath_get_egrbuf - get an eager buffer
- * @dd: the infinipath device
- * @bufnum: the eager buffer to get
- *
- * must only be called if ipath_pd[port] is known to be allocated
- */
-static inline void *ipath_get_egrbuf(struct ipath_devdata *dd, u32 bufnum)
-{
-       return dd->ipath_port0_skbinfo ?
-               (void *) dd->ipath_port0_skbinfo[bufnum].skb->data : NULL;
-}
-
-/**
- * ipath_alloc_skb - allocate an skb and buffer with possible constraints
- * @dd: the infinipath device
- * @gfp_mask: the sk_buff GFP mask
- */
-struct sk_buff *ipath_alloc_skb(struct ipath_devdata *dd,
-                               gfp_t gfp_mask)
-{
-       struct sk_buff *skb;
-       u32 len;
-
-       /*
-        * The only fully supported way to handle this is to allocate lots of
-        * extra space, align as needed, and then do skb_reserve().  That wastes
-        * a lot of memory...  I'll have to hack this into infinipath_copy
-        * also.
-        */
-
-       /*
-        * We need 2 extra bytes for ipath_ether data sent in the
-        * key header.  In order to keep everything dword aligned,
-        * we'll reserve 4 bytes.
-        */
-       len = dd->ipath_ibmaxlen + 4;
-
-       if (dd->ipath_flags & IPATH_4BYTE_TID) {
-               /* We need a 2KB multiple alignment, and there is no way
-                * to do it except to allocate extra and then skb_reserve
-                * enough to bring it up to the right alignment.
-                */
-               len += 2047;
-       }
-
-       skb = __dev_alloc_skb(len, gfp_mask);
-       if (!skb) {
-               ipath_dev_err(dd, "Failed to allocate skbuff, length %u\n",
-                             len);
-               goto bail;
-       }
-
-       skb_reserve(skb, 4);
-
-       if (dd->ipath_flags & IPATH_4BYTE_TID) {
-               u32 una = (unsigned long)skb->data & 2047;
-               if (una)
-                       skb_reserve(skb, 2048 - una);
-       }
-
-bail:
-       return skb;
-}
-
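The 2 KB alignment trick in ipath_alloc_skb() (allocate 2047 spare bytes, then skb_reserve() the misaligned remainder) is a general over-allocate-and-align pattern. A minimal sketch outside the skb API; the names here are illustrative:

#include <stdint.h>

#define TID_ALIGN 2048u         /* the 2 KB boundary required above */

/*
 * Given a pointer into a buffer that was allocated with at least
 * TID_ALIGN - 1 spare bytes, return the next TID_ALIGN-aligned address.
 */
static void *align_to_tid(void *raw)
{
        uintptr_t misalign = (uintptr_t)raw & (TID_ALIGN - 1);

        return misalign ? (char *)raw + (TID_ALIGN - misalign) : raw;
}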
-static void ipath_rcv_hdrerr(struct ipath_devdata *dd,
-                            u32 eflags,
-                            u32 l,
-                            u32 etail,
-                            __le32 *rhf_addr,
-                            struct ipath_message_header *hdr)
-{
-       char emsg[128];
-
-       get_rhf_errstring(eflags, emsg, sizeof emsg);
-       ipath_cdbg(PKT, "RHFerrs %x hdrqtail=%x typ=%u "
-                  "tlen=%x opcode=%x egridx=%x: %s\n",
-                  eflags, l,
-                  ipath_hdrget_rcv_type(rhf_addr),
-                  ipath_hdrget_length_in_bytes(rhf_addr),
-                  be32_to_cpu(hdr->bth[0]) >> 24,
-                  etail, emsg);
-
-       /* Count local link integrity errors. */
-       if (eflags & (INFINIPATH_RHF_H_ICRCERR | INFINIPATH_RHF_H_VCRCERR)) {
-               u8 n = (dd->ipath_ibcctrl >>
-                       INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT) &
-                       INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK;
-
-               if (++dd->ipath_lli_counter > n) {
-                       dd->ipath_lli_counter = 0;
-                       dd->ipath_lli_errors++;
-               }
-       }
-}
-
-/*
- * ipath_kreceive - receive a packet
- * @pd: the infinipath port
- *
- * called from interrupt handler for errors or receive interrupt
- */
-void ipath_kreceive(struct ipath_portdata *pd)
-{
-       struct ipath_devdata *dd = pd->port_dd;
-       __le32 *rhf_addr;
-       void *ebuf;
-       const u32 rsize = dd->ipath_rcvhdrentsize;      /* words */
-       const u32 maxcnt = dd->ipath_rcvhdrcnt * rsize; /* words */
-       u32 etail = -1, l, hdrqtail;
-       struct ipath_message_header *hdr;
-       u32 eflags, i, etype, tlen, pkttot = 0, updegr = 0, reloop = 0;
-       static u64 totcalls;    /* stats, may eventually remove */
-       int last;
-
-       l = pd->port_head;
-       rhf_addr = (__le32 *) pd->port_rcvhdrq + l + dd->ipath_rhf_offset;
-       if (dd->ipath_flags & IPATH_NODMA_RTAIL) {
-               u32 seq = ipath_hdrget_seq(rhf_addr);
-
-               if (seq != pd->port_seq_cnt)
-                       goto bail;
-               hdrqtail = 0;
-       } else {
-               hdrqtail = ipath_get_rcvhdrtail(pd);
-               if (l == hdrqtail)
-                       goto bail;
-               smp_rmb();
-       }
-
-reloop:
-       for (last = 0, i = 1; !last; i += !last) {
-               hdr = dd->ipath_f_get_msgheader(dd, rhf_addr);
-               eflags = ipath_hdrget_err_flags(rhf_addr);
-               etype = ipath_hdrget_rcv_type(rhf_addr);
-               /* total length */
-               tlen = ipath_hdrget_length_in_bytes(rhf_addr);
-               ebuf = NULL;
-               if ((dd->ipath_flags & IPATH_NODMA_RTAIL) ?
-                   ipath_hdrget_use_egr_buf(rhf_addr) :
-                   (etype != RCVHQ_RCV_TYPE_EXPECTED)) {
-                       /*
-                        * It turns out that the chip uses an eager buffer
-                        * for all non-expected packets, whether it "needs"
-                        * one or not.  So always get the index, but don't
-                        * set ebuf (so we try to copy data) unless the
-                        * length requires it.
-                        */
-                       etail = ipath_hdrget_index(rhf_addr);
-                       updegr = 1;
-                       if (tlen > sizeof(*hdr) ||
-                           etype == RCVHQ_RCV_TYPE_NON_KD)
-                               ebuf = ipath_get_egrbuf(dd, etail);
-               }
-
-               /*
-                * both tiderr and ipathhdrerr are set for all plain IB
-                * packets; only ipathhdrerr should be set.
-                */
-
-               if (etype != RCVHQ_RCV_TYPE_NON_KD &&
-                   etype != RCVHQ_RCV_TYPE_ERROR &&
-                   ipath_hdrget_ipath_ver(hdr->iph.ver_port_tid_offset) !=
-                   IPS_PROTO_VERSION)
-                       ipath_cdbg(PKT, "Bad InfiniPath protocol version "
-                                  "%x\n", etype);
-
-               if (unlikely(eflags))
-                       ipath_rcv_hdrerr(dd, eflags, l, etail, rhf_addr, hdr);
-               else if (etype == RCVHQ_RCV_TYPE_NON_KD) {
-                       ipath_ib_rcv(dd->verbs_dev, (u32 *)hdr, ebuf, tlen);
-                       if (dd->ipath_lli_counter)
-                               dd->ipath_lli_counter--;
-               } else if (etype == RCVHQ_RCV_TYPE_EAGER) {
-                       u8 opcode = be32_to_cpu(hdr->bth[0]) >> 24;
-                       u32 qp = be32_to_cpu(hdr->bth[1]) & 0xffffff;
-                       ipath_cdbg(PKT, "typ %x, opcode %x (eager, "
-                                  "qp=%x), len %x; ignored\n",
-                                  etype, opcode, qp, tlen);
-               }
-               else if (etype == RCVHQ_RCV_TYPE_EXPECTED)
-                       ipath_dbg("Bug: Expected TID, opcode %x; ignored\n",
-                                 be32_to_cpu(hdr->bth[0]) >> 24);
-               else {
-                       /*
-                        * error packet, type of error unknown.
-                        * Probably type 3, but we don't know, so don't
-                        * even try to print the opcode, etc.
-                        * Usually caused by a "bad packet", that has no
-                        * BTH, when the LRH says it should.
-                        */
-                       ipath_cdbg(ERRPKT, "Error Pkt, but no eflags! egrbuf"
-                                 " %x, len %x hdrq+%x rhf: %Lx\n",
-                                 etail, tlen, l, (unsigned long long)
-                                 le64_to_cpu(*(__le64 *) rhf_addr));
-                       if (ipath_debug & __IPATH_ERRPKTDBG) {
-                               u32 j, *d, dw = rsize-2;
-                               if (rsize > (tlen>>2))
-                                       dw = tlen>>2;
-                               d = (u32 *)hdr;
-                               printk(KERN_DEBUG "EPkt rcvhdr(%x dw):\n",
-                                       dw);
-                               for (j = 0; j < dw; j++)
-                                       printk(KERN_DEBUG "%8x%s", d[j],
-                                               (j%8) == 7 ? "\n" : " ");
-                               printk(KERN_DEBUG ".\n");
-                       }
-               }
-               l += rsize;
-               if (l >= maxcnt)
-                       l = 0;
-               rhf_addr = (__le32 *) pd->port_rcvhdrq +
-                       l + dd->ipath_rhf_offset;
-               if (dd->ipath_flags & IPATH_NODMA_RTAIL) {
-                       u32 seq = ipath_hdrget_seq(rhf_addr);
-
-                       if (++pd->port_seq_cnt > 13)
-                               pd->port_seq_cnt = 1;
-                       if (seq != pd->port_seq_cnt)
-                               last = 1;
-               } else if (l == hdrqtail)
-                       last = 1;
-               /*
-                * update head regs on last packet, and every 16 packets.
-                * Reduce bus traffic, while still trying to prevent
-                * rcvhdrq overflows, for when the queue is nearly full
-                */
-               if (last || !(i & 0xf)) {
-                       u64 lval = l;
-
-                       /* request IBA6120 and 7220 interrupt only on last */
-                       if (last)
-                               lval |= dd->ipath_rhdrhead_intr_off;
-                       ipath_write_ureg(dd, ur_rcvhdrhead, lval,
-                               pd->port_port);
-                       if (updegr) {
-                               ipath_write_ureg(dd, ur_rcvegrindexhead,
-                                                etail, pd->port_port);
-                               updegr = 0;
-                       }
-               }
-       }
-
-       if (!dd->ipath_rhdrhead_intr_off && !reloop &&
-           !(dd->ipath_flags & IPATH_NODMA_RTAIL)) {
-               /* IBA6110 workaround; we can have a race clearing chip
-                * interrupt with another interrupt about to be delivered,
-                * and can clear it before it is delivered on the GPIO
-                * workaround.  By doing the extra check here for the
-                * in-memory tail register updating while we were doing
-                * earlier packets, we "almost" guarantee we have covered
-                * that case.
-                */
-               u32 hqtail = ipath_get_rcvhdrtail(pd);
-               if (hqtail != hdrqtail) {
-                       hdrqtail = hqtail;
-                       reloop = 1; /* loop 1 extra time at most */
-                       goto reloop;
-               }
-       }
-
-       pkttot += i;
-
-       pd->port_head = l;
-
-       if (pkttot > ipath_stats.sps_maxpkts_call)
-               ipath_stats.sps_maxpkts_call = pkttot;
-       ipath_stats.sps_port0pkts += pkttot;
-       ipath_stats.sps_avgpkts_call =
-               ipath_stats.sps_port0pkts / ++totcalls;
-
-bail:;
-}
-
-/**
- * ipath_update_pio_bufs - update shadow copy of the PIO availability map
- * @dd: the infinipath device
- *
- * called whenever our local copy indicates we have run out of send buffers.
- * NOTE: This can be called from interrupt context by some code
- * and from non-interrupt context by ipath_getpiobuf().
- */
-
-static void ipath_update_pio_bufs(struct ipath_devdata *dd)
-{
-       unsigned long flags;
-       int i;
-       const unsigned piobregs = (unsigned)dd->ipath_pioavregs;
-
-       /* If the generation (check) bits have changed, then we update the
-        * busy bit for the corresponding PIO buffer.  This algorithm will
-        * modify positions to the value they already have in some cases
-        * (i.e., no change), but it's faster than changing only the bits
-        * that have changed.
-        *
-        * We would like to do this atomically, to avoid spinlocks in the
-        * critical send path, but that's not really possible, given the
-        * type of changes, and that this routine could be called on
-        * multiple cpu's simultaneously, so we lock in this routine only,
-        * to avoid conflicting updates; all we change is the shadow, and
-        * it's a single 64 bit memory location, so by definition the update
-        * is atomic in terms of what other cpu's can see in testing the
-        * bits.  The spin_lock overhead isn't too bad, since it only
-        * happens when all buffers are in use, so only cpu overhead, not
-        * happens when all buffers are in use, so only cpu overhead, not
-        * latency or bandwidth, is affected.
-       if (!dd->ipath_pioavailregs_dma) {
-               ipath_dbg("Update shadow pioavail, but regs_dma NULL!\n");
-               return;
-       }
-       if (ipath_debug & __IPATH_VERBDBG) {
-               /* only if packet debug and verbose */
-               volatile __le64 *dma = dd->ipath_pioavailregs_dma;
-               unsigned long *shadow = dd->ipath_pioavailshadow;
-
-               ipath_cdbg(PKT, "Refill avail, dma0=%llx shad0=%lx, "
-                          "d1=%llx s1=%lx, d2=%llx s2=%lx, d3=%llx "
-                          "s3=%lx\n",
-                          (unsigned long long) le64_to_cpu(dma[0]),
-                          shadow[0],
-                          (unsigned long long) le64_to_cpu(dma[1]),
-                          shadow[1],
-                          (unsigned long long) le64_to_cpu(dma[2]),
-                          shadow[2],
-                          (unsigned long long) le64_to_cpu(dma[3]),
-                          shadow[3]);
-               if (piobregs > 4)
-                       ipath_cdbg(
-                               PKT, "2nd group, dma4=%llx shad4=%lx, "
-                               "d5=%llx s5=%lx, d6=%llx s6=%lx, "
-                               "d7=%llx s7=%lx\n",
-                               (unsigned long long) le64_to_cpu(dma[4]),
-                               shadow[4],
-                               (unsigned long long) le64_to_cpu(dma[5]),
-                               shadow[5],
-                               (unsigned long long) le64_to_cpu(dma[6]),
-                               shadow[6],
-                               (unsigned long long) le64_to_cpu(dma[7]),
-                               shadow[7]);
-       }
-       spin_lock_irqsave(&ipath_pioavail_lock, flags);
-       for (i = 0; i < piobregs; i++) {
-               u64 pchbusy, pchg, piov, pnew;
-               /*
-                * Chip Errata: bug 6641; even and odd qwords>3 are swapped
-                */
-               if (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS))
-                       piov = le64_to_cpu(dd->ipath_pioavailregs_dma[i ^ 1]);
-               else
-                       piov = le64_to_cpu(dd->ipath_pioavailregs_dma[i]);
-               pchg = dd->ipath_pioavailkernel[i] &
-                       ~(dd->ipath_pioavailshadow[i] ^ piov);
-               pchbusy = pchg << INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT;
-               if (pchg && (pchbusy & dd->ipath_pioavailshadow[i])) {
-                       pnew = dd->ipath_pioavailshadow[i] & ~pchbusy;
-                       pnew |= piov & pchbusy;
-                       dd->ipath_pioavailshadow[i] = pnew;
-               }
-       }
-       spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
-}
-
-/*
- * used to force update of pioavailshadow if we can't get a pio buffer.
- * Needed primarily due to exiting freeze mode after recovering
- * from errors.  Done lazily, because it's safer (known to not
- * be writing pio buffers).
- */
-static void ipath_reset_availshadow(struct ipath_devdata *dd)
-{
-       int i, im;
-       unsigned long flags;
-
-       spin_lock_irqsave(&ipath_pioavail_lock, flags);
-       for (i = 0; i < dd->ipath_pioavregs; i++) {
-               u64 val, oldval;
-               /* deal with 6110 chip bug on high register #s */
-               im = (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) ?
-                       i ^ 1 : i;
-               val = le64_to_cpu(dd->ipath_pioavailregs_dma[im]);
-               /*
-                * busy out the buffers not in the kernel avail list,
-                * without changing the generation bits.
-                */
-               oldval = dd->ipath_pioavailshadow[i];
-               dd->ipath_pioavailshadow[i] = val |
-                       ((~dd->ipath_pioavailkernel[i] <<
-                       INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT) &
-                       0xaaaaaaaaaaaaaaaaULL); /* All BUSY bits in qword */
-               if (oldval != dd->ipath_pioavailshadow[i])
-                       ipath_dbg("shadow[%d] was %Lx, now %lx\n",
-                               i, (unsigned long long) oldval,
-                               dd->ipath_pioavailshadow[i]);
-       }
-       spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
-}
-
-/**
- * ipath_setrcvhdrsize - set the receive header size
- * @dd: the infinipath device
- * @rhdrsize: the receive header size
- *
- * called from user init code, and also layered driver init
- */
-int ipath_setrcvhdrsize(struct ipath_devdata *dd, unsigned rhdrsize)
-{
-       int ret = 0;
-
-       if (dd->ipath_flags & IPATH_RCVHDRSZ_SET) {
-               if (dd->ipath_rcvhdrsize != rhdrsize) {
-                       dev_info(&dd->pcidev->dev,
-                                "Error: can't set protocol header "
-                                "size %u, already %u\n",
-                                rhdrsize, dd->ipath_rcvhdrsize);
-                       ret = -EAGAIN;
-               } else
-                       ipath_cdbg(VERBOSE, "Reuse same protocol header "
-                                  "size %u\n", dd->ipath_rcvhdrsize);
-       } else if (rhdrsize > (dd->ipath_rcvhdrentsize -
-                              (sizeof(u64) / sizeof(u32)))) {
-               ipath_dbg("Error: can't set protocol header size %u "
-                         "(> max %u)\n", rhdrsize,
-                         dd->ipath_rcvhdrentsize -
-                         (u32) (sizeof(u64) / sizeof(u32)));
-               ret = -EOVERFLOW;
-       } else {
-               dd->ipath_flags |= IPATH_RCVHDRSZ_SET;
-               dd->ipath_rcvhdrsize = rhdrsize;
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrsize,
-                                dd->ipath_rcvhdrsize);
-               ipath_cdbg(VERBOSE, "Set protocol header size to %u\n",
-                          dd->ipath_rcvhdrsize);
-       }
-       return ret;
-}
-
-/*
- * debugging code and stats updates if no pio buffers available.
- */
-static noinline void no_pio_bufs(struct ipath_devdata *dd)
-{
-       unsigned long *shadow = dd->ipath_pioavailshadow;
-       __le64 *dma = (__le64 *)dd->ipath_pioavailregs_dma;
-
-       dd->ipath_upd_pio_shadow = 1;
-
-       /*
-        * not atomic, but if we lose a stat count in a while, that's OK
-        */
-       ipath_stats.sps_nopiobufs++;
-       if (!(++dd->ipath_consec_nopiobuf % 100000)) {
-               ipath_force_pio_avail_update(dd); /* at start */
-               ipath_dbg("%u tries no piobufavail ts%lx; dmacopy: "
-                       "%llx %llx %llx %llx\n"
-                       "ipath  shadow:  %lx %lx %lx %lx\n",
-                       dd->ipath_consec_nopiobuf,
-                       (unsigned long)get_cycles(),
-                       (unsigned long long) le64_to_cpu(dma[0]),
-                       (unsigned long long) le64_to_cpu(dma[1]),
-                       (unsigned long long) le64_to_cpu(dma[2]),
-                       (unsigned long long) le64_to_cpu(dma[3]),
-                       shadow[0], shadow[1], shadow[2], shadow[3]);
-               /*
-                * 4 buffers per byte, 4 registers above, cover rest
-                * below
-                */
-               if ((dd->ipath_piobcnt2k + dd->ipath_piobcnt4k) >
-                   (sizeof(shadow[0]) * 4 * 4))
-                       ipath_dbg("2nd group: dmacopy: "
-                                 "%llx %llx %llx %llx\n"
-                                 "ipath  shadow:  %lx %lx %lx %lx\n",
-                                 (unsigned long long)le64_to_cpu(dma[4]),
-                                 (unsigned long long)le64_to_cpu(dma[5]),
-                                 (unsigned long long)le64_to_cpu(dma[6]),
-                                 (unsigned long long)le64_to_cpu(dma[7]),
-                                 shadow[4], shadow[5], shadow[6], shadow[7]);
-
-               /* at end, so update likely happened */
-               ipath_reset_availshadow(dd);
-       }
-}
-
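no_pio_bufs() deliberately reports only every 100000th consecutive failure so a hot path cannot flood the log. The same rate-limiting idiom in isolation; the names are hypothetical:

#include <stdio.h>

/* log only every nth occurrence of a repeated condition */
static void log_every_n(unsigned long *counter, unsigned long n,
                        const char *msg)
{
        if (!(++*counter % n))
                fprintf(stderr, "%s (seen %lu times)\n", msg, *counter);
}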
-/*
- * common code for normal driver pio buffer allocation, and reserved
- * allocation.
- *
- * do appropriate marking as busy, etc.
- * Returns a pointer to the buffer if one is found (its number is returned
- * through pbufnum), or NULL if none is available.
- */
-static u32 __iomem *ipath_getpiobuf_range(struct ipath_devdata *dd,
-       u32 *pbufnum, u32 first, u32 last, u32 firsti)
-{
-       int i, j, updated = 0;
-       unsigned piobcnt;
-       unsigned long flags;
-       unsigned long *shadow = dd->ipath_pioavailshadow;
-       u32 __iomem *buf;
-
-       piobcnt = last - first;
-       if (dd->ipath_upd_pio_shadow) {
-               /*
-                * Minor optimization.  If we had no buffers on last call,
-                * start out by doing the update; continue and do the scan even
-                * if no buffers were updated, to be paranoid
-                */
-               ipath_update_pio_bufs(dd);
-               updated++;
-               i = first;
-       } else
-               i = firsti;
-rescan:
-       /*
-        * while test_and_set_bit() is atomic, we do that and then the
-        * change_bit(), and the pair is not.  See if this is the cause
-        * of the remaining armlaunch errors.
-        */
-       spin_lock_irqsave(&ipath_pioavail_lock, flags);
-       for (j = 0; j < piobcnt; j++, i++) {
-               if (i >= last)
-                       i = first;
-               if (__test_and_set_bit((2 * i) + 1, shadow))
-                       continue;
-               /* flip generation bit */
-               __change_bit(2 * i, shadow);
-               break;
-       }
-       spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
-
-       if (j == piobcnt) {
-               if (!updated) {
-                       /*
-                        * first time through; shadow exhausted, but may be
-                        * buffers available, try an update and then rescan.
-                        */
-                       ipath_update_pio_bufs(dd);
-                       updated++;
-                       i = first;
-                       goto rescan;
-               } else if (updated == 1 && piobcnt <=
-                       ((dd->ipath_sendctrl
-                       >> INFINIPATH_S_UPDTHRESH_SHIFT) &
-                       INFINIPATH_S_UPDTHRESH_MASK)) {
-                       /*
-                        * for chips supporting and using the update
-                        * threshold we need to force an update of the
-                        * in-memory copy if the count is less than the
-                        * threshold, then check one more time.
-                        */
-                       ipath_force_pio_avail_update(dd);
-                       ipath_update_pio_bufs(dd);
-                       updated++;
-                       i = first;
-                       goto rescan;
-               }
-
-               no_pio_bufs(dd);
-               buf = NULL;
-       } else {
-               if (i < dd->ipath_piobcnt2k)
-                       buf = (u32 __iomem *) (dd->ipath_pio2kbase +
-                                              i * dd->ipath_palign);
-               else
-                       buf = (u32 __iomem *)
-                               (dd->ipath_pio4kbase +
-                                (i - dd->ipath_piobcnt2k) * dd->ipath_4kalign);
-               if (pbufnum)
-                       *pbufnum = i;
-       }
-
-       return buf;
-}
-
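Two details of ipath_getpiobuf_range() are worth spelling out: the availability shadow keeps two bits per send buffer (bit 2*i is the generation/check bit that gets flipped, bit 2*i + 1 the busy bit that gets test-and-set), and a winning index is turned into an address by treating the 2 KB buffers as one region followed by the 4 KB buffers. A sketch of both, with hypothetical parameter names:

#include <stddef.h>

/* buffer n maps to these two positions in the availability shadow */
static inline unsigned int gen_bit(unsigned int buf)  { return 2 * buf; }
static inline unsigned int busy_bit(unsigned int buf) { return 2 * buf + 1; }

/* 2 KB buffers start at base2k, spaced align2k apart; 4 KB buffers follow */
static char *pio_buf_addr(char *base2k, char *base4k, unsigned int cnt2k,
                          size_t align2k, size_t align4k, unsigned int i)
{
        if (i < cnt2k)
                return base2k + (size_t)i * align2k;
        return base4k + (size_t)(i - cnt2k) * align4k;
}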
-/**
- * ipath_getpiobuf - find an available pio buffer
- * @dd: the infinipath device
- * @plen: the size of the PIO buffer needed in 32-bit words
- * @pbufnum: the buffer number is placed here
- */
-u32 __iomem *ipath_getpiobuf(struct ipath_devdata *dd, u32 plen, u32 *pbufnum)
-{
-       u32 __iomem *buf;
-       u32 pnum, nbufs;
-       u32 first, lasti;
-
-       if (plen + 1 >= IPATH_SMALLBUF_DWORDS) {
-               first = dd->ipath_piobcnt2k;
-               lasti = dd->ipath_lastpioindexl;
-       } else {
-               first = 0;
-               lasti = dd->ipath_lastpioindex;
-       }
-       nbufs = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
-       buf = ipath_getpiobuf_range(dd, &pnum, first, nbufs, lasti);
-
-       if (buf) {
-               /*
-                * Set next starting place.  It's just an optimization,
-                * it doesn't matter who wins on this, so no locking
-                */
-               if (plen + 1 >= IPATH_SMALLBUF_DWORDS)
-                       dd->ipath_lastpioindexl = pnum + 1;
-               else
-                       dd->ipath_lastpioindex = pnum + 1;
-               if (dd->ipath_upd_pio_shadow)
-                       dd->ipath_upd_pio_shadow = 0;
-               if (dd->ipath_consec_nopiobuf)
-                       dd->ipath_consec_nopiobuf = 0;
-               ipath_cdbg(VERBOSE, "Return piobuf%u %uk @ %p\n",
-                          pnum, (pnum < dd->ipath_piobcnt2k) ? 2 : 4, buf);
-               if (pbufnum)
-                       *pbufnum = pnum;
-
-       }
-       return buf;
-}
-
-/**
- * ipath_chg_pioavailkernel - change which send buffers are available for kernel
- * @dd: the infinipath device
- * @start: the starting send buffer number
- * @len: the number of send buffers
- * @avail: true if the buffers are available for kernel use, false otherwise
- */
-void ipath_chg_pioavailkernel(struct ipath_devdata *dd, unsigned start,
-                             unsigned len, int avail)
-{
-       unsigned long flags;
-       unsigned end, cnt = 0;
-
-       /* There are two bits per send buffer (busy and generation) */
-       start *= 2;
-       end = start + len * 2;
-
-       spin_lock_irqsave(&ipath_pioavail_lock, flags);
-       /* Set or clear the busy bit in the shadow. */
-       while (start < end) {
-               if (avail) {
-                       unsigned long dma;
-                       int i, im;
-                       /*
-                        * the BUSY bit will never be set, because we disarm
-                        * the user buffers before we hand them back to the
-                        * kernel.  We do have to make sure the generation
-                        * bit is set correctly in shadow, since it could
-                        * have changed many times while allocated to user.
-                        * We can't use the bitmap functions on the full
-                        * dma array because it is always little-endian, so
-                        * we have to flip to host-order first.
-                        * BITS_PER_LONG is slightly wrong, since it's
-                        * always 64 bits per register in chip...
-                        * We only work on 64 bit kernels, so that's OK.
-                        */
-                       /* deal with 6110 chip bug on high register #s */
-                       i = start / BITS_PER_LONG;
-                       im = (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) ?
-                               i ^ 1 : i;
-                       __clear_bit(INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT
-                               + start, dd->ipath_pioavailshadow);
-                       dma = (unsigned long) le64_to_cpu(
-                               dd->ipath_pioavailregs_dma[im]);
-                       if (test_bit((INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT
-                               + start) % BITS_PER_LONG, &dma))
-                               __set_bit(INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT
-                                       + start, dd->ipath_pioavailshadow);
-                       else
-                               __clear_bit(INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT
-                                       + start, dd->ipath_pioavailshadow);
-                       __set_bit(start, dd->ipath_pioavailkernel);
-               } else {
-                       __set_bit(start + INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT,
-                               dd->ipath_pioavailshadow);
-                       __clear_bit(start, dd->ipath_pioavailkernel);
-               }
-               start += 2;
-       }
-
-       if (dd->ipath_pioupd_thresh) {
-               end = 2 * (dd->ipath_piobcnt2k + dd->ipath_piobcnt4k);
-               cnt = bitmap_weight(dd->ipath_pioavailkernel, end);
-       }
-       spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
-
-       /*
-        * When moving buffers from kernel to user, if number assigned to
-        * the user is less than the pio update threshold, and threshold
-        * is supported (cnt was computed > 0), drop the update threshold
-        * so we update at least once per allocated number of buffers.
-        * In any case, if the kernel buffers are less than the threshold,
-        * drop the threshold.  We don't bother increasing it, having once
-        * decreased it, since it would typically just cycle back and forth.
-        * If we don't decrease below buffers in use, we can wait a long
-        * time for an update, until some other context uses PIO buffers.
-        */
-       if (!avail && len < cnt)
-               cnt = len;
-       if (cnt < dd->ipath_pioupd_thresh) {
-               dd->ipath_pioupd_thresh = cnt;
-               ipath_dbg("Decreased pio update threshold to %u\n",
-                       dd->ipath_pioupd_thresh);
-               spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-               dd->ipath_sendctrl &= ~(INFINIPATH_S_UPDTHRESH_MASK
-                       << INFINIPATH_S_UPDTHRESH_SHIFT);
-               dd->ipath_sendctrl |= dd->ipath_pioupd_thresh
-                       << INFINIPATH_S_UPDTHRESH_SHIFT;
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
-                       dd->ipath_sendctrl);
-               spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-       }
-}
-
-/**
- * ipath_create_rcvhdrq - create a receive header queue
- * @dd: the infinipath device
- * @pd: the port data
- *
- * this must be contiguous memory (from an i/o perspective), and must be
- * DMA'able (which means for some systems, it will go through an IOMMU,
- * or be forced into a low address range).
- */
-int ipath_create_rcvhdrq(struct ipath_devdata *dd,
-                        struct ipath_portdata *pd)
-{
-       int ret = 0;
-
-       if (!pd->port_rcvhdrq) {
-               dma_addr_t phys_hdrqtail;
-               gfp_t gfp_flags = GFP_USER | __GFP_COMP;
-               int amt = ALIGN(dd->ipath_rcvhdrcnt * dd->ipath_rcvhdrentsize *
-                               sizeof(u32), PAGE_SIZE);
-
-               pd->port_rcvhdrq = dma_alloc_coherent(
-                       &dd->pcidev->dev, amt, &pd->port_rcvhdrq_phys,
-                       gfp_flags);
-
-               if (!pd->port_rcvhdrq) {
-                       ipath_dev_err(dd, "attempt to allocate %d bytes "
-                                     "for port %u rcvhdrq failed\n",
-                                     amt, pd->port_port);
-                       ret = -ENOMEM;
-                       goto bail;
-               }
-
-               if (!(dd->ipath_flags & IPATH_NODMA_RTAIL)) {
-                       pd->port_rcvhdrtail_kvaddr = dma_alloc_coherent(
-                               &dd->pcidev->dev, PAGE_SIZE, &phys_hdrqtail,
-                               GFP_KERNEL);
-                       if (!pd->port_rcvhdrtail_kvaddr) {
-                               ipath_dev_err(dd, "attempt to allocate 1 page "
-                                       "for port %u rcvhdrqtailaddr "
-                                       "failed\n", pd->port_port);
-                               ret = -ENOMEM;
-                               dma_free_coherent(&dd->pcidev->dev, amt,
-                                       pd->port_rcvhdrq,
-                                       pd->port_rcvhdrq_phys);
-                               pd->port_rcvhdrq = NULL;
-                               goto bail;
-                       }
-                       pd->port_rcvhdrqtailaddr_phys = phys_hdrqtail;
-                       ipath_cdbg(VERBOSE, "port %d hdrtailaddr, %llx "
-                                  "physical\n", pd->port_port,
-                                  (unsigned long long) phys_hdrqtail);
-               }
-
-               pd->port_rcvhdrq_size = amt;
-
-               ipath_cdbg(VERBOSE, "%d pages at %p (phys %lx) size=%lu "
-                          "for port %u rcvhdr Q\n",
-                          amt >> PAGE_SHIFT, pd->port_rcvhdrq,
-                          (unsigned long) pd->port_rcvhdrq_phys,
-                          (unsigned long) pd->port_rcvhdrq_size,
-                          pd->port_port);
-       }
-       else
-               ipath_cdbg(VERBOSE, "reuse port %d rcvhdrq @%p %llx phys; "
-                          "hdrtailaddr@%p %llx physical\n",
-                          pd->port_port, pd->port_rcvhdrq,
-                          (unsigned long long) pd->port_rcvhdrq_phys,
-                          pd->port_rcvhdrtail_kvaddr, (unsigned long long)
-                          pd->port_rcvhdrqtailaddr_phys);
-
-       /* clear for security and sanity on each use */
-       memset(pd->port_rcvhdrq, 0, pd->port_rcvhdrq_size);
-       if (pd->port_rcvhdrtail_kvaddr)
-               memset(pd->port_rcvhdrtail_kvaddr, 0, PAGE_SIZE);
-
-       /*
-        * tell chip each time we init it, even if we are re-using previous
-        * memory (we zero the register at process close)
-        */
-       ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdrtailaddr,
-                             pd->port_port, pd->port_rcvhdrqtailaddr_phys);
-       ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdraddr,
-                             pd->port_port, pd->port_rcvhdrq_phys);
-
-bail:
-       return ret;
-}
-
-
-/*
- * Flush all sends that might be in the ready to send state, as well as any
- * that are in the process of being sent.   Used whenever we need to be
- * sure the send side is idle.  Cleans up all buffer state by canceling
- * all pio buffers, and issuing an abort, which cleans up anything in the
- * launch fifo.  The cancel is superfluous on some chip versions, but
- * it's safer to always do it.
- * PIOAvail bits are updated by the chip as if normal send had happened.
- */
-void ipath_cancel_sends(struct ipath_devdata *dd, int restore_sendctrl)
-{
-       unsigned long flags;
-
-       if (dd->ipath_flags & IPATH_IB_AUTONEG_INPROG) {
-               ipath_cdbg(VERBOSE, "Ignore while in autonegotiation\n");
-               goto bail;
-       }
-       /*
-        * If we have SDMA, and it's not disabled, we have to kick off the
-        * abort state machine, provided we aren't already aborting.
-        * If we are in the process of aborting SDMA (!DISABLED, but ABORTING),
-        * we skip the rest of this routine. It is already "in progress"
-        */
-       if (dd->ipath_flags & IPATH_HAS_SEND_DMA) {
-               int skip_cancel;
-               unsigned long *statp = &dd->ipath_sdma_status;
-
-               spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
-               skip_cancel =
-                       test_and_set_bit(IPATH_SDMA_ABORTING, statp)
-                       && !test_bit(IPATH_SDMA_DISABLED, statp);
-               spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-               if (skip_cancel)
-                       goto bail;
-       }
-
-       ipath_dbg("Cancelling all in-progress send buffers\n");
-
-       /* skip armlaunch errs for a while */
-       dd->ipath_lastcancel = jiffies + HZ / 2;
-
-       /*
-        * The abort bit is auto-clearing.  We also don't want pioavail
-        * update happening during this, and we don't want any other
-        * sends going out, so turn those off for the duration.  We read
-        * the scratch register to be sure that cancels and the abort
-        * have taken effect in the chip.  Otherwise the two parts are the
-        * same as in ipath_force_pio_avail_update().
-        */
-       spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-       dd->ipath_sendctrl &= ~(INFINIPATH_S_PIOBUFAVAILUPD
-               | INFINIPATH_S_PIOENABLE);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
-               dd->ipath_sendctrl | INFINIPATH_S_ABORT);
-       ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-       spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-
-       /* disarm all send buffers */
-       ipath_disarm_piobufs(dd, 0,
-               dd->ipath_piobcnt2k + dd->ipath_piobcnt4k);
-
-       if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
-               set_bit(IPATH_SDMA_DISARMED, &dd->ipath_sdma_status);
-
-       if (restore_sendctrl) {
-               /* else done by caller later if needed */
-               spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-               dd->ipath_sendctrl |= INFINIPATH_S_PIOBUFAVAILUPD |
-                       INFINIPATH_S_PIOENABLE;
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
-                       dd->ipath_sendctrl);
-               /* and again, be sure all have hit the chip */
-               ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-               spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-       }
-
-       if ((dd->ipath_flags & IPATH_HAS_SEND_DMA) &&
-           !test_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status) &&
-           test_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status)) {
-               spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
-               /* only wait so long for intr */
-               dd->ipath_sdma_abort_intr_timeout = jiffies + HZ;
-               dd->ipath_sdma_reset_wait = 200;
-               if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
-                       tasklet_hi_schedule(&dd->ipath_sdma_abort_task);
-               spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-       }
-bail:;
-}
-
-/*
- * Force an update of in-memory copy of the pioavail registers, when
- * needed for any of a variety of reasons.  We read the scratch register
- * to make it highly likely that the update will have happened by the
- * time we return.  If already off (as in cancel_sends above), this
- * routine is a nop, on the assumption that the caller will "do the
- * right thing".
- */
-void ipath_force_pio_avail_update(struct ipath_devdata *dd)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-       if (dd->ipath_sendctrl & INFINIPATH_S_PIOBUFAVAILUPD) {
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
-                       dd->ipath_sendctrl & ~INFINIPATH_S_PIOBUFAVAILUPD);
-               ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
-                       dd->ipath_sendctrl);
-               ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-       }
-       spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-}
-
-static void ipath_set_ib_lstate(struct ipath_devdata *dd, int linkcmd,
-                               int linitcmd)
-{
-       u64 mod_wd;
-       static const char *what[4] = {
-               [0] = "NOP",
-               [INFINIPATH_IBCC_LINKCMD_DOWN] = "DOWN",
-               [INFINIPATH_IBCC_LINKCMD_ARMED] = "ARMED",
-               [INFINIPATH_IBCC_LINKCMD_ACTIVE] = "ACTIVE"
-       };
-
-       if (linitcmd == INFINIPATH_IBCC_LINKINITCMD_DISABLE) {
-               /*
-                * If we are told to disable, note that so link-recovery
-                * code does not attempt to bring us back up.
-                */
-               preempt_disable();
-               dd->ipath_flags |= IPATH_IB_LINK_DISABLED;
-               preempt_enable();
-       } else if (linitcmd) {
-               /*
-                * Any other linkinitcmd will lead to LINKDOWN and then
-                * to INIT (if all is well), so clear flag to let
-                * link-recovery code attempt to bring us back up.
-                */
-               preempt_disable();
-               dd->ipath_flags &= ~IPATH_IB_LINK_DISABLED;
-               preempt_enable();
-       }
-
-       mod_wd = (linkcmd << dd->ibcc_lc_shift) |
-               (linitcmd << INFINIPATH_IBCC_LINKINITCMD_SHIFT);
-       ipath_cdbg(VERBOSE,
-               "Moving unit %u to %s (initcmd=0x%x), current ltstate is %s\n",
-               dd->ipath_unit, what[linkcmd], linitcmd,
-               ipath_ibcstatus_str[ipath_ib_linktrstate(dd,
-                       ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus))]);
-
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
-                        dd->ipath_ibcctrl | mod_wd);
-       /* read from chip so write is flushed */
-       (void) ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus);
-}
-
-int ipath_set_linkstate(struct ipath_devdata *dd, u8 newstate)
-{
-       u32 lstate;
-       int ret;
-
-       switch (newstate) {
-       case IPATH_IB_LINKDOWN_ONLY:
-               ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN, 0);
-               /* don't wait */
-               ret = 0;
-               goto bail;
-
-       case IPATH_IB_LINKDOWN:
-               ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN,
-                                       INFINIPATH_IBCC_LINKINITCMD_POLL);
-               /* don't wait */
-               ret = 0;
-               goto bail;
-
-       case IPATH_IB_LINKDOWN_SLEEP:
-               ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN,
-                                       INFINIPATH_IBCC_LINKINITCMD_SLEEP);
-               /* don't wait */
-               ret = 0;
-               goto bail;
-
-       case IPATH_IB_LINKDOWN_DISABLE:
-               ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN,
-                                       INFINIPATH_IBCC_LINKINITCMD_DISABLE);
-               /* don't wait */
-               ret = 0;
-               goto bail;
-
-       case IPATH_IB_LINKARM:
-               if (dd->ipath_flags & IPATH_LINKARMED) {
-                       ret = 0;
-                       goto bail;
-               }
-               if (!(dd->ipath_flags &
-                     (IPATH_LINKINIT | IPATH_LINKACTIVE))) {
-                       ret = -EINVAL;
-                       goto bail;
-               }
-               ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_ARMED, 0);
-
-               /*
-                * Since the port can transition to ACTIVE by receiving
-                * a non VL 15 packet, wait for either state.
-                */
-               lstate = IPATH_LINKARMED | IPATH_LINKACTIVE;
-               break;
-
-       case IPATH_IB_LINKACTIVE:
-               if (dd->ipath_flags & IPATH_LINKACTIVE) {
-                       ret = 0;
-                       goto bail;
-               }
-               if (!(dd->ipath_flags & IPATH_LINKARMED)) {
-                       ret = -EINVAL;
-                       goto bail;
-               }
-               ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_ACTIVE, 0);
-               lstate = IPATH_LINKACTIVE;
-               break;
-
-       case IPATH_IB_LINK_LOOPBACK:
-               dev_info(&dd->pcidev->dev, "Enabling IB local loopback\n");
-               dd->ipath_ibcctrl |= INFINIPATH_IBCC_LOOPBACK;
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
-                                dd->ipath_ibcctrl);
-
-               /* turn heartbeat off, as it causes loopback to fail */
-               dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT,
-                                      IPATH_IB_HRTBT_OFF);
-               /* don't wait */
-               ret = 0;
-               goto bail;
-
-       case IPATH_IB_LINK_EXTERNAL:
-               dev_info(&dd->pcidev->dev,
-                       "Disabling IB local loopback (normal)\n");
-               dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT,
-                                      IPATH_IB_HRTBT_ON);
-               dd->ipath_ibcctrl &= ~INFINIPATH_IBCC_LOOPBACK;
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
-                                dd->ipath_ibcctrl);
-               /* don't wait */
-               ret = 0;
-               goto bail;
-
-       /*
-        * Heartbeat can be explicitly enabled by the user via
-        * the "hrtbt_enable" file, and if disabled, trying to enable here
-        * will have no effect.  Implicit changes (heartbeat off when
-        * loopback on, and vice versa) are included to ease testing.
-        */
-       case IPATH_IB_LINK_HRTBT:
-               ret = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT,
-                       IPATH_IB_HRTBT_ON);
-               goto bail;
-
-       case IPATH_IB_LINK_NO_HRTBT:
-               ret = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT,
-                       IPATH_IB_HRTBT_OFF);
-               goto bail;
-
-       default:
-               ipath_dbg("Invalid linkstate 0x%x requested\n", newstate);
-               ret = -EINVAL;
-               goto bail;
-       }
-       ret = ipath_wait_linkstate(dd, lstate, 2000);
-
-bail:
-       return ret;
-}
-
-/**
- * ipath_set_mtu - set the MTU
- * @dd: the infinipath device
- * @arg: the new MTU
- *
- * we can handle "any" incoming size; the issue here is whether we
- * need to restrict our outgoing size.   For now, we don't do any
- * sanity checking on this, and we don't deal with what happens to
- * programs that are already running when the size changes.
- * NOTE: changing the MTU will usually cause the IBC to go back to
- * link INIT state...
- */
-int ipath_set_mtu(struct ipath_devdata *dd, u16 arg)
-{
-       u32 piosize;
-       int changed = 0;
-       int ret;
-
-       /*
-        * mtu is IB data payload max.  It's the largest power of 2 less
-        * than piosize (or even larger, since it only really controls the
-        * largest we can receive; we can send the max of the mtu and
-        * piosize).  We check that it's one of the valid IB sizes.
-        */
-       if (arg != 256 && arg != 512 && arg != 1024 && arg != 2048 &&
-           (arg != 4096 || !ipath_mtu4096)) {
-               ipath_dbg("Trying to set invalid mtu %u, failing\n", arg);
-               ret = -EINVAL;
-               goto bail;
-       }
-       if (dd->ipath_ibmtu == arg) {
-               ret = 0;        /* same as current */
-               goto bail;
-       }
-
-       piosize = dd->ipath_ibmaxlen;
-       dd->ipath_ibmtu = arg;
-
-       if (arg >= (piosize - IPATH_PIO_MAXIBHDR)) {
-               /* Only if it's not the initial value (or reset to it) */
-               if (piosize != dd->ipath_init_ibmaxlen) {
-                       if (arg > piosize && arg <= dd->ipath_init_ibmaxlen)
-                               piosize = dd->ipath_init_ibmaxlen;
-                       dd->ipath_ibmaxlen = piosize;
-                       changed = 1;
-               }
-       } else if ((arg + IPATH_PIO_MAXIBHDR) != dd->ipath_ibmaxlen) {
-               piosize = arg + IPATH_PIO_MAXIBHDR;
-               ipath_cdbg(VERBOSE, "ibmaxlen was 0x%x, setting to 0x%x "
-                          "(mtu 0x%x)\n", dd->ipath_ibmaxlen, piosize,
-                          arg);
-               dd->ipath_ibmaxlen = piosize;
-               changed = 1;
-       }
-
-       if (changed) {
-               u64 ibc = dd->ipath_ibcctrl, ibdw;
-               /*
-                * update our housekeeping variables, and set IBC max
-                * size, same as init code; max IBC is max we allow in
-                * buffer, less the qword pbc, plus 1 for ICRC, in dwords
-                */
-               dd->ipath_ibmaxlen = piosize - 2 * sizeof(u32);
-               ibdw = (dd->ipath_ibmaxlen >> 2) + 1;
-               ibc &= ~(INFINIPATH_IBCC_MAXPKTLEN_MASK <<
-                        dd->ibcc_mpl_shift);
-               ibc |= ibdw << dd->ibcc_mpl_shift;
-               dd->ipath_ibcctrl = ibc;
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
-                                dd->ipath_ibcctrl);
-               dd->ipath_f_tidtemplate(dd);
-       }
-
-       ret = 0;
-
-bail:
-       return ret;
-}
-
-int ipath_set_lid(struct ipath_devdata *dd, u32 lid, u8 lmc)
-{
-       dd->ipath_lid = lid;
-       dd->ipath_lmc = lmc;
-
-       dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LIDLMC, lid |
-               (~((1U << lmc) - 1)) << 16);
-
-       dev_info(&dd->pcidev->dev, "We got a lid: 0x%x\n", lid);
-
-       return 0;
-}
-
-
-/**
- * ipath_write_kreg_port - write a device's per-port 64-bit kernel register
- * @dd: the infinipath device
- * @regno: the register number to write
- * @port: the port containing the register
- * @value: the value to write
- *
- * Registers that vary with chip implementation constants (per port)
- * use this routine.
- */
-void ipath_write_kreg_port(const struct ipath_devdata *dd, ipath_kreg regno,
-                         unsigned port, u64 value)
-{
-       u16 where;
-
-       if (port < dd->ipath_portcnt &&
-           (regno == dd->ipath_kregs->kr_rcvhdraddr ||
-            regno == dd->ipath_kregs->kr_rcvhdrtailaddr))
-               where = regno + port;
-       else
-               where = -1;
-
-       ipath_write_kreg(dd, where, value);
-}
-
-/*
- * The following routines deal with the "obviously simple" task of
- * overriding the state of the LEDs, which normally indicate link physical
- * and logical status.  The complications arise in dealing with different
- * hardware mappings, the board-dependent routine being called from
- * interrupts, and then there's the requirement to _flash_ them.
- */
-#define LED_OVER_FREQ_SHIFT 8
-#define LED_OVER_FREQ_MASK (0xFF<<LED_OVER_FREQ_SHIFT)
-/* Below is "non-zero" to force override, but both actual LEDs are off */
-#define LED_OVER_BOTH_OFF (8)
-
-static void ipath_run_led_override(unsigned long opaque)
-{
-       struct ipath_devdata *dd = (struct ipath_devdata *)opaque;
-       int timeoff;
-       int pidx;
-       u64 lstate, ltstate, val;
-
-       if (!(dd->ipath_flags & IPATH_INITTED))
-               return;
-
-       pidx = dd->ipath_led_override_phase++ & 1;
-       dd->ipath_led_override = dd->ipath_led_override_vals[pidx];
-       timeoff = dd->ipath_led_override_timeoff;
-
-       /*
-        * The code below potentially restores the LED values per current
-        * status; it should also possibly set up the traffic-blink register,
-        * but we leave that to per-chip functions.
-        */
-       val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus);
-       ltstate = ipath_ib_linktrstate(dd, val);
-       lstate = ipath_ib_linkstate(dd, val);
-
-       dd->ipath_f_setextled(dd, lstate, ltstate);
-       mod_timer(&dd->ipath_led_override_timer, jiffies + timeoff);
-}
-
-void ipath_set_led_override(struct ipath_devdata *dd, unsigned int val)
-{
-       int timeoff, freq;
-
-       if (!(dd->ipath_flags & IPATH_INITTED))
-               return;
-
-       /* First check if we are blinking. If not, use 1 Hz polling */
-       timeoff = HZ;
-       freq = (val & LED_OVER_FREQ_MASK) >> LED_OVER_FREQ_SHIFT;
-
-       if (freq) {
-               /* For blink, set each phase from one nybble of val */
-               dd->ipath_led_override_vals[0] = val & 0xF;
-               dd->ipath_led_override_vals[1] = (val >> 4) & 0xF;
-               timeoff = (HZ << 4)/freq;
-       } else {
-               /* Non-blink set both phases the same. */
-               dd->ipath_led_override_vals[0] = val & 0xF;
-               dd->ipath_led_override_vals[1] = val & 0xF;
-       }
-       dd->ipath_led_override_timeoff = timeoff;
-
-       /*
-        * If the timer has not already been started, do so. Use a "quick"
-        * timeout so the function will be called soon, to look at our request.
-        */
-       if (atomic_inc_return(&dd->ipath_led_override_timer_active) == 1) {
-               /* Need to start timer */
-               init_timer(&dd->ipath_led_override_timer);
-               dd->ipath_led_override_timer.function =
-                                                ipath_run_led_override;
-               dd->ipath_led_override_timer.data = (unsigned long) dd;
-               dd->ipath_led_override_timer.expires = jiffies + 1;
-               add_timer(&dd->ipath_led_override_timer);
-       } else
-               atomic_dec(&dd->ipath_led_override_timer_active);
-}
-
-/**
- * ipath_shutdown_device - shut down a device
- * @dd: the infinipath device
- *
- * This is called to make the device quiet when we are about to
- * unload the driver, and also when the device is administratively
- * disabled.   It does not free any data structures.
- * Everything it does has to be set up again by ipath_init_chip(dd,1)
- */
-void ipath_shutdown_device(struct ipath_devdata *dd)
-{
-       unsigned long flags;
-
-       ipath_dbg("Shutting down the device\n");
-
-       ipath_hol_up(dd); /* make sure user processes aren't suspended */
-
-       dd->ipath_flags |= IPATH_LINKUNK;
-       dd->ipath_flags &= ~(IPATH_INITTED | IPATH_LINKDOWN |
-                            IPATH_LINKINIT | IPATH_LINKARMED |
-                            IPATH_LINKACTIVE);
-       *dd->ipath_statusp &= ~(IPATH_STATUS_IB_CONF |
-                               IPATH_STATUS_IB_READY);
-
-       /* mask interrupts, but not errors */
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL);
-
-       dd->ipath_rcvctrl = 0;
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
-                        dd->ipath_rcvctrl);
-
-       if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
-               teardown_sdma(dd);
-
-       /*
-        * gracefully stop all sends, allowing any in progress to trickle out
-        * first.
-        */
-       spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-       dd->ipath_sendctrl = 0;
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
-       /* flush it */
-       ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-       spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-
-       /*
-        * Wait long enough for anything that's going to trickle out to have
-        * actually done so.
-        */
-       udelay(5);
-
-       dd->ipath_f_setextled(dd, 0, 0); /* make sure LEDs are off */
-
-       ipath_set_ib_lstate(dd, 0, INFINIPATH_IBCC_LINKINITCMD_DISABLE);
-       ipath_cancel_sends(dd, 0);
-
-       /*
-        * we are shutting down, so tell components that care.  We don't do
-        * this on just a link state change; much as with ethernet, a cable
-        * unplug, etc. doesn't change driver state.
-        */
-       signal_ib_event(dd, IB_EVENT_PORT_ERR);
-
-       /* disable IBC */
-       dd->ipath_control &= ~INFINIPATH_C_LINKENABLE;
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
-                        dd->ipath_control | INFINIPATH_C_FREEZEMODE);
-
-       /*
-        * clear SerdesEnable and turn the LEDs off; do this here because
-        * we are unloading, so don't count on interrupts to move things
-        * along.  Turn the LEDs off explicitly for the same reason.
-        */
-       dd->ipath_f_quiet_serdes(dd);
-
-       /* stop all the timers that might still be running */
-       del_timer_sync(&dd->ipath_hol_timer);
-       if (dd->ipath_stats_timer_active) {
-               del_timer_sync(&dd->ipath_stats_timer);
-               dd->ipath_stats_timer_active = 0;
-       }
-       if (dd->ipath_intrchk_timer.data) {
-               del_timer_sync(&dd->ipath_intrchk_timer);
-               dd->ipath_intrchk_timer.data = 0;
-       }
-       if (atomic_read(&dd->ipath_led_override_timer_active)) {
-               del_timer_sync(&dd->ipath_led_override_timer);
-               atomic_set(&dd->ipath_led_override_timer_active, 0);
-       }
-
-       /*
-        * clear all interrupts and errors, so that the next time the driver
-        * is loaded or device is enabled, we know that whatever is set
-        * happened while we were unloaded
-        */
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
-                        ~0ULL & ~INFINIPATH_HWE_MEMBISTFAILED);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, -1LL);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, -1LL);
-
-       ipath_cdbg(VERBOSE, "Flush time and errors to EEPROM\n");
-       ipath_update_eeprom_log(dd);
-}
-
-/**
- * ipath_free_pddata - free a port's allocated data
- * @dd: the infinipath device
- * @pd: the portdata structure
- *
- * free up any allocated data for a port
- * This should not touch anything that would affect a simultaneous
- * re-allocation of port data, because it is called after ipath_mutex
- * is released (and can be called from reinit as well).
- * It should never change any chip state, or global driver state.
- * (The only exception to global state is freeing the port0 port0_skbs.)
- */
-void ipath_free_pddata(struct ipath_devdata *dd, struct ipath_portdata *pd)
-{
-       if (!pd)
-               return;
-
-       if (pd->port_rcvhdrq) {
-               ipath_cdbg(VERBOSE, "free closed port %d rcvhdrq @ %p "
-                          "(size=%lu)\n", pd->port_port, pd->port_rcvhdrq,
-                          (unsigned long) pd->port_rcvhdrq_size);
-               dma_free_coherent(&dd->pcidev->dev, pd->port_rcvhdrq_size,
-                                 pd->port_rcvhdrq, pd->port_rcvhdrq_phys);
-               pd->port_rcvhdrq = NULL;
-               if (pd->port_rcvhdrtail_kvaddr) {
-                       dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
-                                        pd->port_rcvhdrtail_kvaddr,
-                                        pd->port_rcvhdrqtailaddr_phys);
-                       pd->port_rcvhdrtail_kvaddr = NULL;
-               }
-       }
-       if (pd->port_port && pd->port_rcvegrbuf) {
-               unsigned e;
-
-               for (e = 0; e < pd->port_rcvegrbuf_chunks; e++) {
-                       void *base = pd->port_rcvegrbuf[e];
-                       size_t size = pd->port_rcvegrbuf_size;
-
-                       ipath_cdbg(VERBOSE, "egrbuf free(%p, %lu), "
-                                  "chunk %u/%u\n", base,
-                                  (unsigned long) size,
-                                  e, pd->port_rcvegrbuf_chunks);
-                       dma_free_coherent(&dd->pcidev->dev, size,
-                               base, pd->port_rcvegrbuf_phys[e]);
-               }
-               kfree(pd->port_rcvegrbuf);
-               pd->port_rcvegrbuf = NULL;
-               kfree(pd->port_rcvegrbuf_phys);
-               pd->port_rcvegrbuf_phys = NULL;
-               pd->port_rcvegrbuf_chunks = 0;
-       } else if (pd->port_port == 0 && dd->ipath_port0_skbinfo) {
-               unsigned e;
-               struct ipath_skbinfo *skbinfo = dd->ipath_port0_skbinfo;
-
-               dd->ipath_port0_skbinfo = NULL;
-               ipath_cdbg(VERBOSE, "free closed port %d "
-                          "ipath_port0_skbinfo @ %p\n", pd->port_port,
-                          skbinfo);
-               for (e = 0; e < dd->ipath_p0_rcvegrcnt; e++)
-                       if (skbinfo[e].skb) {
-                               pci_unmap_single(dd->pcidev, skbinfo[e].phys,
-                                                dd->ipath_ibmaxlen,
-                                                PCI_DMA_FROMDEVICE);
-                               dev_kfree_skb(skbinfo[e].skb);
-                       }
-               vfree(skbinfo);
-       }
-       kfree(pd->port_tid_pg_list);
-       vfree(pd->subport_uregbase);
-       vfree(pd->subport_rcvegrbuf);
-       vfree(pd->subport_rcvhdr_base);
-       kfree(pd);
-}
-
-static int __init infinipath_init(void)
-{
-       int ret;
-
-       if (ipath_debug & __IPATH_DBG)
-               printk(KERN_INFO DRIVER_LOAD_MSG "%s", ib_ipath_version);
-
-       /*
-        * These must be called before the driver is registered with
-        * the PCI subsystem.
-        */
-       idr_init(&unit_table);
-
-       ret = pci_register_driver(&ipath_driver);
-       if (ret < 0) {
-               printk(KERN_ERR IPATH_DRV_NAME
-                      ": Unable to register driver: error %d\n", -ret);
-               goto bail_unit;
-       }
-
-       ret = ipath_init_ipathfs();
-       if (ret < 0) {
-               printk(KERN_ERR IPATH_DRV_NAME ": Unable to create "
-                      "ipathfs: error %d\n", -ret);
-               goto bail_pci;
-       }
-
-       goto bail;
-
-bail_pci:
-       pci_unregister_driver(&ipath_driver);
-
-bail_unit:
-       idr_destroy(&unit_table);
-
-bail:
-       return ret;
-}
-
-static void __exit infinipath_cleanup(void)
-{
-       ipath_exit_ipathfs();
-
-       ipath_cdbg(VERBOSE, "Unregistering pci driver\n");
-       pci_unregister_driver(&ipath_driver);
-
-       idr_destroy(&unit_table);
-}
-
-/**
- * ipath_reset_device - reset the chip if possible
- * @unit: the device to reset
- *
- * Whether or not reset is successful, we attempt to re-initialize the chip
- * (that is, much like a driver unload/reload).  We clear the INITTED flag
- * so that the various entry points will fail until we reinitialize.  For
- * now, we only allow this if no user ports are open that use chip resources
- */
-int ipath_reset_device(int unit)
-{
-       int ret, i;
-       struct ipath_devdata *dd = ipath_lookup(unit);
-       unsigned long flags;
-
-       if (!dd) {
-               ret = -ENODEV;
-               goto bail;
-       }
-
-       if (atomic_read(&dd->ipath_led_override_timer_active)) {
-               /* Need to stop LED timer, _then_ shut off LEDs */
-               del_timer_sync(&dd->ipath_led_override_timer);
-               atomic_set(&dd->ipath_led_override_timer_active, 0);
-       }
-
-       /* Shut off LEDs after we are sure timer is not running */
-       dd->ipath_led_override = LED_OVER_BOTH_OFF;
-       dd->ipath_f_setextled(dd, 0, 0);
-
-       dev_info(&dd->pcidev->dev, "Reset on unit %u requested\n", unit);
-
-       if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT)) {
-               dev_info(&dd->pcidev->dev, "Invalid unit number %u or "
-                        "not initialized or not present\n", unit);
-               ret = -ENXIO;
-               goto bail;
-       }
-
-       spin_lock_irqsave(&dd->ipath_uctxt_lock, flags);
-       if (dd->ipath_pd)
-               for (i = 1; i < dd->ipath_cfgports; i++) {
-                       if (!dd->ipath_pd[i] || !dd->ipath_pd[i]->port_cnt)
-                               continue;
-                       spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
-                       ipath_dbg("unit %u port %d is in use "
-                                 "(PID %u cmd %s), can't reset\n",
-                                 unit, i,
-                                 pid_nr(dd->ipath_pd[i]->port_pid),
-                                 dd->ipath_pd[i]->port_comm);
-                       ret = -EBUSY;
-                       goto bail;
-               }
-       spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
-
-       if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
-               teardown_sdma(dd);
-
-       dd->ipath_flags &= ~IPATH_INITTED;
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL);
-       ret = dd->ipath_f_reset(dd);
-       if (ret == 1) {
-               ipath_dbg("Reinitializing unit %u after reset attempt\n",
-                         unit);
-               ret = ipath_init_chip(dd, 1);
-       } else
-               ret = -EAGAIN;
-       if (ret)
-               ipath_dev_err(dd, "Reinitialize unit %u after "
-                             "reset failed with %d\n", unit, ret);
-       else
-               dev_info(&dd->pcidev->dev, "Reinitialized unit %u after "
-                        "resetting\n", unit);
-
-bail:
-       return ret;
-}
-
-/*
- * send a signal to all the processes that have the driver open
- * through the normal interfaces (i.e., everything other than the diags
- * interface).  Returns the number of signalled processes.
- */
-static int ipath_signal_procs(struct ipath_devdata *dd, int sig)
-{
-       int i, sub, any = 0;
-       struct pid *pid;
-       unsigned long flags;
-
-       if (!dd->ipath_pd)
-               return 0;
-
-       spin_lock_irqsave(&dd->ipath_uctxt_lock, flags);
-       for (i = 1; i < dd->ipath_cfgports; i++) {
-               if (!dd->ipath_pd[i] || !dd->ipath_pd[i]->port_cnt)
-                       continue;
-               pid = dd->ipath_pd[i]->port_pid;
-               if (!pid)
-                       continue;
-
-               dev_info(&dd->pcidev->dev, "context %d in use "
-                         "(PID %u), sending signal %d\n",
-                         i, pid_nr(pid), sig);
-               kill_pid(pid, sig, 1);
-               any++;
-               for (sub = 0; sub < INFINIPATH_MAX_SUBPORT; sub++) {
-                       pid = dd->ipath_pd[i]->port_subpid[sub];
-                       if (!pid)
-                               continue;
-                       dev_info(&dd->pcidev->dev, "sub-context "
-                               "%d:%d in use (PID %u), sending "
-                               "signal %d\n", i, sub, pid_nr(pid), sig);
-                       kill_pid(pid, sig, 1);
-                       any++;
-               }
-       }
-       spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
-       return any;
-}
-
-static void ipath_hol_signal_down(struct ipath_devdata *dd)
-{
-       if (ipath_signal_procs(dd, SIGSTOP))
-               ipath_dbg("Stopped some processes\n");
-       ipath_cancel_sends(dd, 1);
-}
-
-
-static void ipath_hol_signal_up(struct ipath_devdata *dd)
-{
-       if (ipath_signal_procs(dd, SIGCONT))
-               ipath_dbg("Continued some processes\n");
-}
-
-/*
- * link is down, stop any user processes, and flush pending sends
- * to prevent HoL blocking, then start the HoL timer that
- * periodically continues and then stops the processes, so they can
- * detect link down if they want, and do something about it.
- * Timer may already be running, so use mod_timer, not add_timer.
- */
-void ipath_hol_down(struct ipath_devdata *dd)
-{
-       dd->ipath_hol_state = IPATH_HOL_DOWN;
-       ipath_hol_signal_down(dd);
-       dd->ipath_hol_next = IPATH_HOL_DOWNCONT;
-       dd->ipath_hol_timer.expires = jiffies +
-               msecs_to_jiffies(ipath_hol_timeout_ms);
-       mod_timer(&dd->ipath_hol_timer, dd->ipath_hol_timer.expires);
-}
-
-/*
- * link is up, continue any user processes, and ensure timer
- * is a nop, if running.  Let timer keep running, if set; it
- * will nop when it sees the link is up
- */
-void ipath_hol_up(struct ipath_devdata *dd)
-{
-       ipath_hol_signal_up(dd);
-       dd->ipath_hol_state = IPATH_HOL_UP;
-}
-
-/*
- * toggle the running/not running state of user processes
- * to prevent HoL blocking on chip resources, but still allow
- * user processes to do link down special case handling.
- * Should only be called via the timer.
- */
-void ipath_hol_event(unsigned long opaque)
-{
-       struct ipath_devdata *dd = (struct ipath_devdata *)opaque;
-
-       if (dd->ipath_hol_next == IPATH_HOL_DOWNSTOP
-               && dd->ipath_hol_state != IPATH_HOL_UP) {
-               dd->ipath_hol_next = IPATH_HOL_DOWNCONT;
-               ipath_dbg("Stopping processes\n");
-               ipath_hol_signal_down(dd);
-       } else { /* may do "extra" if also in ipath_hol_up() */
-               dd->ipath_hol_next = IPATH_HOL_DOWNSTOP;
-               ipath_dbg("Continuing processes\n");
-               ipath_hol_signal_up(dd);
-       }
-       if (dd->ipath_hol_state == IPATH_HOL_UP)
-               ipath_dbg("link's up, don't resched timer\n");
-       else {
-               dd->ipath_hol_timer.expires = jiffies +
-                       msecs_to_jiffies(ipath_hol_timeout_ms);
-               mod_timer(&dd->ipath_hol_timer,
-                       dd->ipath_hol_timer.expires);
-       }
-}
-
-int ipath_set_rx_pol_inv(struct ipath_devdata *dd, u8 new_pol_inv)
-{
-       u64 val;
-
-       if (new_pol_inv > INFINIPATH_XGXS_RX_POL_MASK)
-               return -1;
-       if (dd->ipath_rx_pol_inv != new_pol_inv) {
-               dd->ipath_rx_pol_inv = new_pol_inv;
-               val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig);
-               val &= ~(INFINIPATH_XGXS_RX_POL_MASK <<
-                        INFINIPATH_XGXS_RX_POL_SHIFT);
-               val |= ((u64)dd->ipath_rx_pol_inv) <<
-                       INFINIPATH_XGXS_RX_POL_SHIFT;
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, val);
-       }
-       return 0;
-}
-
-/*
- * Disable and enable the armlaunch error.  Used for PIO bandwidth testing on
- * the 7220, which is count-based, rather than trigger-based.  Safe for the
- * driver check, since it's at init.   Not completely safe when used for
- * user-mode checking, since some error checking can be lost, but not
- * particularly risky, and only has problematic side-effects in the face of
- * very buggy user code.  There is no reference counting, but that's also
- * fine, given the intended use.
- */
-void ipath_enable_armlaunch(struct ipath_devdata *dd)
-{
-       dd->ipath_lasterror &= ~INFINIPATH_E_SPIOARMLAUNCH;
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear,
-               INFINIPATH_E_SPIOARMLAUNCH);
-       dd->ipath_errormask |= INFINIPATH_E_SPIOARMLAUNCH;
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
-               dd->ipath_errormask);
-}
-
-void ipath_disable_armlaunch(struct ipath_devdata *dd)
-{
-       /* so don't re-enable if already set */
-       dd->ipath_maskederrs &= ~INFINIPATH_E_SPIOARMLAUNCH;
-       dd->ipath_errormask &= ~INFINIPATH_E_SPIOARMLAUNCH;
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
-               dd->ipath_errormask);
-}
-
-module_init(infinipath_init);
-module_exit(infinipath_cleanup);
diff --git a/drivers/infiniband/hw/ipath/ipath_eeprom.c b/drivers/infiniband/hw/ipath/ipath_eeprom.c
deleted file mode 100644 (file)
index fc71819..0000000
+++ /dev/null
@@ -1,1183 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/delay.h>
-#include <linux/pci.h>
-#include <linux/vmalloc.h>
-
-#include "ipath_kernel.h"
-
-/*
- * InfiniPath I2C driver for a serial eeprom.  This is not a generic
- * I2C interface.  For a start, the device we're using (Atmel AT24C11)
- * doesn't work like a regular I2C device.  It looks like one
- * electrically, but not logically.  Normal I2C devices have a single
- * 7-bit or 10-bit I2C address that they respond to.  Valid 7-bit
- * addresses range from 0x03 to 0x77.  Addresses 0x00 to 0x02 and 0x78
- * to 0x7F are special reserved addresses (e.g. 0x00 is the "general
- * call" address.)  The Atmel device, on the other hand, responds to ALL
- * 7-bit addresses.  It's designed to be the only device on a given I2C
- * bus.  A 7-bit address corresponds to the memory address within the
- * Atmel device itself.
- *
- * Also, the timing requirements mean more than simple software
- * bitbanging, with readbacks from chip to ensure timing (simple udelay
- * is not enough).
- *
- * This all means that accessing the device is specialized enough
- * that using the standard kernel I2C bitbanging interface would be
- * impossible.  For example, the core I2C eeprom driver expects to find
- * a device at one or more of a limited set of addresses only.  It doesn't
- * allow writing to an eeprom.  It also doesn't provide any means of
- * accessing eeprom contents from within the kernel, only via sysfs.
- */
-
-/* Added functionality for IBA7220-based cards */
-#define IPATH_EEPROM_DEV_V1 0xA0
-#define IPATH_EEPROM_DEV_V2 0xA2
-#define IPATH_TEMP_DEV 0x98
-#define IPATH_BAD_DEV (IPATH_EEPROM_DEV_V2+2)
-#define IPATH_NO_DEV (0xFF)
-
-/*
- * The number of I2C chains is proliferating. The table below brings
- * some order to the madness. The basic principle is that the
- * table is scanned from the top, and a "probe" is made to the
- * device probe_dev. If that succeeds, the chain is considered
- * to be of that type, and dd->ipath_i2c_chain_type is set to the
- * index+1 of the entry.
- * The +1 is so static initialization can mean "unknown, do probe."
- */
-static struct i2c_chain_desc {
-       u8 probe_dev;   /* If seen at probe, chain is this type */
-       u8 eeprom_dev;  /* Dev addr (if any) for EEPROM */
-       u8 temp_dev;    /* Dev Addr (if any) for Temp-sense */
-} i2c_chains[] = {
-       { IPATH_BAD_DEV, IPATH_NO_DEV, IPATH_NO_DEV }, /* pre-iba7220 bds */
-       { IPATH_EEPROM_DEV_V1, IPATH_EEPROM_DEV_V1, IPATH_TEMP_DEV}, /* V1 */
-       { IPATH_EEPROM_DEV_V2, IPATH_EEPROM_DEV_V2, IPATH_TEMP_DEV}, /* V2 */
-       { IPATH_NO_DEV }
-};
-
-enum i2c_type {
-       i2c_line_scl = 0,
-       i2c_line_sda
-};
-
-enum i2c_state {
-       i2c_line_low = 0,
-       i2c_line_high
-};
-
-#define READ_CMD 1
-#define WRITE_CMD 0
-
-/**
- * i2c_gpio_set - set a GPIO line
- * @dd: the infinipath device
- * @line: the line to set
- * @new_line_state: the state to set
- *
- * Returns 0 if the line was set to the new state successfully, non-zero
- * on error.
- */
-static int i2c_gpio_set(struct ipath_devdata *dd,
-                       enum i2c_type line,
-                       enum i2c_state new_line_state)
-{
-       u64 out_mask, dir_mask, *gpioval;
-       unsigned long flags = 0;
-
-       gpioval = &dd->ipath_gpio_out;
-
-       if (line == i2c_line_scl) {
-               dir_mask = dd->ipath_gpio_scl;
-               out_mask = (1UL << dd->ipath_gpio_scl_num);
-       } else {
-               dir_mask = dd->ipath_gpio_sda;
-               out_mask = (1UL << dd->ipath_gpio_sda_num);
-       }
-
-       spin_lock_irqsave(&dd->ipath_gpio_lock, flags);
-       if (new_line_state == i2c_line_high) {
-               /* tri-state the output rather than force high */
-               dd->ipath_extctrl &= ~dir_mask;
-       } else {
-               /* config line to be an output */
-               dd->ipath_extctrl |= dir_mask;
-       }
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, dd->ipath_extctrl);
-
-       /* set output as well (no real verify) */
-       if (new_line_state == i2c_line_high)
-               *gpioval |= out_mask;
-       else
-               *gpioval &= ~out_mask;
-
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_out, *gpioval);
-       spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags);
-
-       return 0;
-}
-
-/**
- * i2c_gpio_get - get a GPIO line state
- * @dd: the infinipath device
- * @line: the line to get
- * @curr_statep: where to put the line state
- *
- * Returns 0 if the line was set to the new state successfully, non-zero
- * on error.  curr_state is not set on error.
- */
-static int i2c_gpio_get(struct ipath_devdata *dd,
-                       enum i2c_type line,
-                       enum i2c_state *curr_statep)
-{
-       u64 read_val, mask;
-       int ret;
-       unsigned long flags = 0;
-
-       /* check args */
-       if (curr_statep == NULL) {
-               ret = 1;
-               goto bail;
-       }
-
-       /* config line to be an input */
-       if (line == i2c_line_scl)
-               mask = dd->ipath_gpio_scl;
-       else
-               mask = dd->ipath_gpio_sda;
-
-       spin_lock_irqsave(&dd->ipath_gpio_lock, flags);
-       dd->ipath_extctrl &= ~mask;
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, dd->ipath_extctrl);
-       /*
-        * The value read below is very unlikely to reflect the true input
-        * state if Output Enable actually changed.
-        */
-       read_val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extstatus);
-       spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags);
-
-       if (read_val & mask)
-               *curr_statep = i2c_line_high;
-       else
-               *curr_statep = i2c_line_low;
-
-       ret = 0;
-
-bail:
-       return ret;
-}
-
-/**
- * i2c_wait_for_writes - wait for a write
- * @dd: the infinipath device
- *
- * We use this instead of udelay directly, so we can make sure
- * that previous register writes have been flushed all the way
- * to the chip.  Since we are delaying anyway, the cost doesn't
- * hurt, and makes the bit twiddling more regular
- */
-static void i2c_wait_for_writes(struct ipath_devdata *dd)
-{
-       (void)ipath_read_kreg32(dd, dd->ipath_kregs->kr_scratch);
-       rmb();
-}
-
-static void scl_out(struct ipath_devdata *dd, u8 bit)
-{
-       udelay(1);
-       i2c_gpio_set(dd, i2c_line_scl, bit ? i2c_line_high : i2c_line_low);
-
-       i2c_wait_for_writes(dd);
-}
-
-static void sda_out(struct ipath_devdata *dd, u8 bit)
-{
-       i2c_gpio_set(dd, i2c_line_sda, bit ? i2c_line_high : i2c_line_low);
-
-       i2c_wait_for_writes(dd);
-}
-
-static u8 sda_in(struct ipath_devdata *dd, int wait)
-{
-       enum i2c_state bit;
-
-       if (i2c_gpio_get(dd, i2c_line_sda, &bit))
-               ipath_dbg("get bit failed!\n");
-
-       if (wait)
-               i2c_wait_for_writes(dd);
-
-       return bit == i2c_line_high ? 1U : 0;
-}
-
-/**
- * i2c_ackrcv - see if ack following write is true
- * @dd: the infinipath device
- */
-static int i2c_ackrcv(struct ipath_devdata *dd)
-{
-       u8 ack_received;
-
-       /* AT ENTRY SCL = LOW */
-       /* change direction, ignore data */
-       ack_received = sda_in(dd, 1);
-       scl_out(dd, i2c_line_high);
-       ack_received = sda_in(dd, 1) == 0;
-       scl_out(dd, i2c_line_low);
-       return ack_received;
-}
-
-/**
- * rd_byte - read a byte, leaving ACK, STOP, etc up to caller
- * @dd: the infinipath device
- *
- * Returns byte shifted out of device
- */
-static int rd_byte(struct ipath_devdata *dd)
-{
-       int bit_cntr, data;
-
-       data = 0;
-
-       for (bit_cntr = 7; bit_cntr >= 0; --bit_cntr) {
-               data <<= 1;
-               scl_out(dd, i2c_line_high);
-               data |= sda_in(dd, 0);
-               scl_out(dd, i2c_line_low);
-       }
-       return data;
-}
-
-/**
- * wr_byte - write a byte, one bit at a time
- * @dd: the infinipath device
- * @data: the byte to write
- *
- * Returns 0 if we got the following ack, otherwise 1
- */
-static int wr_byte(struct ipath_devdata *dd, u8 data)
-{
-       int bit_cntr;
-       u8 bit;
-
-       for (bit_cntr = 7; bit_cntr >= 0; bit_cntr--) {
-               bit = (data >> bit_cntr) & 1;
-               sda_out(dd, bit);
-               scl_out(dd, i2c_line_high);
-               scl_out(dd, i2c_line_low);
-       }
-       return (!i2c_ackrcv(dd)) ? 1 : 0;
-}
-
-static void send_ack(struct ipath_devdata *dd)
-{
-       sda_out(dd, i2c_line_low);
-       scl_out(dd, i2c_line_high);
-       scl_out(dd, i2c_line_low);
-       sda_out(dd, i2c_line_high);
-}
-
-/**
- * i2c_startcmd - transmit the start condition, followed by address/cmd
- * @dd: the infinipath device
- * @offset_dir: direction byte
- *
- *      (both clock/data high, clock high, data low while clock is high)
- */
-static int i2c_startcmd(struct ipath_devdata *dd, u8 offset_dir)
-{
-       int res;
-
-       /* issue start sequence */
-       sda_out(dd, i2c_line_high);
-       scl_out(dd, i2c_line_high);
-       sda_out(dd, i2c_line_low);
-       scl_out(dd, i2c_line_low);
-
-       /* issue length and direction byte */
-       res = wr_byte(dd, offset_dir);
-
-       if (res)
-               ipath_cdbg(VERBOSE, "No ack to complete start\n");
-
-       return res;
-}
-
-/**
- * stop_cmd - transmit the stop condition
- * @dd: the infinipath device
- *
- * (both clock/data low, clock high, data high while clock is high)
- */
-static void stop_cmd(struct ipath_devdata *dd)
-{
-       scl_out(dd, i2c_line_low);
-       sda_out(dd, i2c_line_low);
-       scl_out(dd, i2c_line_high);
-       sda_out(dd, i2c_line_high);
-       udelay(2);
-}
-
-/**
- * eeprom_reset - reset I2C communication
- * @dd: the infinipath device
- */
-
-static int eeprom_reset(struct ipath_devdata *dd)
-{
-       int clock_cycles_left = 9;
-       u64 *gpioval = &dd->ipath_gpio_out;
-       int ret;
-       unsigned long flags;
-
-       spin_lock_irqsave(&dd->ipath_gpio_lock, flags);
-       /* Make sure shadows are consistent */
-       dd->ipath_extctrl = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extctrl);
-       *gpioval = ipath_read_kreg64(dd, dd->ipath_kregs->kr_gpio_out);
-       spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags);
-
-       ipath_cdbg(VERBOSE, "Resetting i2c eeprom; initial gpioout reg "
-                  "is %llx\n", (unsigned long long) *gpioval);
-
-       /*
-        * This is to get the i2c into a known state, by first going low,
-        * then tristate sda (and then tristate scl as first thing
-        * in loop)
-        */
-       scl_out(dd, i2c_line_low);
-       sda_out(dd, i2c_line_high);
-
-       /* Clock up to 9 cycles looking for SDA hi, then issue START and STOP */
-       while (clock_cycles_left--) {
-               scl_out(dd, i2c_line_high);
-
-               /* SDA seen high, issue START by dropping it while SCL high */
-               if (sda_in(dd, 0)) {
-                       sda_out(dd, i2c_line_low);
-                       scl_out(dd, i2c_line_low);
-                       /* ATMEL spec says must be followed by STOP. */
-                       scl_out(dd, i2c_line_high);
-                       sda_out(dd, i2c_line_high);
-                       ret = 0;
-                       goto bail;
-               }
-
-               scl_out(dd, i2c_line_low);
-       }
-
-       ret = 1;
-
-bail:
-       return ret;
-}
-
-/*
- * Probe for I2C device at specified address. Returns 0 for "success"
- * to match rest of this file.
- * Leave bus in "reasonable" state for further commands.
- */
-static int i2c_probe(struct ipath_devdata *dd, int devaddr)
-{
-       int ret = 0;
-
-       ret = eeprom_reset(dd);
-       if (ret) {
-               ipath_dev_err(dd, "Failed reset probing device 0x%02X\n",
-                             devaddr);
-               return ret;
-       }
-       /*
-        * Reset no longer leaves bus in start condition, so normal
-        * i2c_startcmd() will do.
-        */
-       ret = i2c_startcmd(dd, devaddr | READ_CMD);
-       if (ret)
-               ipath_cdbg(VERBOSE, "Failed startcmd for device 0x%02X\n",
-                          devaddr);
-       else {
-               /*
-                * Device did respond. Complete a single-byte read, because some
-                * devices apparently cannot handle STOP immediately after they
-                * ACK the start-cmd.
-                */
-               int data;
-               data = rd_byte(dd);
-               stop_cmd(dd);
-               ipath_cdbg(VERBOSE, "Response from device 0x%02X\n", devaddr);
-       }
-       return ret;
-}
-
-/*
- * Returns the "i2c type". This is a pointer to a struct that describes
- * the I2C chain on this board. To minimize impact on struct ipath_devdata,
- * the (small integer) index into the table is actually memoized, rather
- * then the pointer.
- * Memoization is because the type is determined on the first call per chip.
- * An alternative would be to move type determination to early
- * init code.
- */
-static struct i2c_chain_desc *ipath_i2c_type(struct ipath_devdata *dd)
-{
-       int idx;
-
-       /* Get memoized index, from previous successful probes */
-       idx = dd->ipath_i2c_chain_type - 1;
-       if (idx >= 0 && idx < (ARRAY_SIZE(i2c_chains) - 1))
-               goto done;
-
-       idx = 0;
-       while (i2c_chains[idx].probe_dev != IPATH_NO_DEV) {
-               /* if probe succeeds, this is type */
-               if (!i2c_probe(dd, i2c_chains[idx].probe_dev))
-                       break;
-               ++idx;
-       }
-
-       /*
-        * Old EEPROM (first entry) may require a reset after probe,
-        * rather than being able to "start" after "stop"
-        */
-       if (idx == 0)
-               eeprom_reset(dd);
-
-       if (i2c_chains[idx].probe_dev == IPATH_NO_DEV)
-               idx = -1;
-       else
-               dd->ipath_i2c_chain_type = idx + 1;
-done:
-       return (idx >= 0) ? i2c_chains + idx : NULL;
-}
-
-static int ipath_eeprom_internal_read(struct ipath_devdata *dd,
-                                       u8 eeprom_offset, void *buffer, int len)
-{
-       int ret;
-       struct i2c_chain_desc *icd;
-       u8 *bp = buffer;
-
-       ret = 1;
-       icd = ipath_i2c_type(dd);
-       if (!icd)
-               goto bail;
-
-       if (icd->eeprom_dev == IPATH_NO_DEV) {
-               /* legacy not-really-I2C */
-               ipath_cdbg(VERBOSE, "Start command only address\n");
-               eeprom_offset = (eeprom_offset << 1) | READ_CMD;
-               ret = i2c_startcmd(dd, eeprom_offset);
-       } else {
-               /* Actual I2C */
-               ipath_cdbg(VERBOSE, "Start command uses devaddr\n");
-               if (i2c_startcmd(dd, icd->eeprom_dev | WRITE_CMD)) {
-                       ipath_dbg("Failed EEPROM startcmd\n");
-                       stop_cmd(dd);
-                       ret = 1;
-                       goto bail;
-               }
-               ret = wr_byte(dd, eeprom_offset);
-               stop_cmd(dd);
-               if (ret) {
-                       ipath_dev_err(dd, "Failed to write EEPROM address\n");
-                       ret = 1;
-                       goto bail;
-               }
-               ret = i2c_startcmd(dd, icd->eeprom_dev | READ_CMD);
-       }
-       if (ret) {
-               ipath_dbg("Failed startcmd for dev %02X\n", icd->eeprom_dev);
-               stop_cmd(dd);
-               ret = 1;
-               goto bail;
-       }
-
-       /*
-        * eeprom keeps clocking data out as long as we ack, automatically
-        * incrementing the address.
-        */
-       while (len-- > 0) {
-               /* get and store data */
-               *bp++ = rd_byte(dd);
-               /* send ack if not the last byte */
-               if (len)
-                       send_ack(dd);
-       }
-
-       stop_cmd(dd);
-
-       ret = 0;
-
-bail:
-       return ret;
-}
-
-static int ipath_eeprom_internal_write(struct ipath_devdata *dd, u8 eeprom_offset,
-                                      const void *buffer, int len)
-{
-       int sub_len;
-       const u8 *bp = buffer;
-       int max_wait_time, i;
-       int ret;
-       struct i2c_chain_desc *icd;
-
-       ret = 1;
-       icd = ipath_i2c_type(dd);
-       if (!icd)
-               goto bail;
-
-       while (len > 0) {
-               if (icd->eeprom_dev == IPATH_NO_DEV) {
-                       if (i2c_startcmd(dd,
-                                        (eeprom_offset << 1) | WRITE_CMD)) {
-                               ipath_dbg("Failed to start cmd offset %u\n",
-                                       eeprom_offset);
-                               goto failed_write;
-                       }
-               } else {
-                       /* Real I2C */
-                       if (i2c_startcmd(dd, icd->eeprom_dev | WRITE_CMD)) {
-                               ipath_dbg("Failed EEPROM startcmd\n");
-                               goto failed_write;
-                       }
-                       ret = wr_byte(dd, eeprom_offset);
-                       if (ret) {
-                               ipath_dev_err(dd, "Failed to write EEPROM "
-                                             "address\n");
-                               goto failed_write;
-                       }
-               }
-
-               sub_len = min(len, 4);
-               eeprom_offset += sub_len;
-               len -= sub_len;
-
-               for (i = 0; i < sub_len; i++) {
-                       if (wr_byte(dd, *bp++)) {
-                               ipath_dbg("no ack after byte %u/%u (%u "
-                                         "total remain)\n", i, sub_len,
-                                         len + sub_len - i);
-                               goto failed_write;
-                       }
-               }
-
-               stop_cmd(dd);
-
-               /*
-                * Wait for the write to complete by waiting for a successful
-                * read (the chip replies with a zero after the write
-                * cmd completes, and before it writes to the eeprom).
-                * The startcmd for the read will fail the ack until
-                * the writes have completed.  We do this inline to avoid
-                * the debug prints that are in the real read routine
-                * if the startcmd fails.
-                * We also use the proper device address, so it doesn't matter
-                * whether we have a real eeprom_dev; legacy accepts any address.
-                */
-               max_wait_time = 100;
-               while (i2c_startcmd(dd, icd->eeprom_dev | READ_CMD)) {
-                       stop_cmd(dd);
-                       if (!--max_wait_time) {
-                               ipath_dbg("Did not get successful read to "
-                                         "complete write\n");
-                               goto failed_write;
-                       }
-               }
-               /* now read (and ignore) the resulting byte */
-               rd_byte(dd);
-               stop_cmd(dd);
-       }
-
-       ret = 0;
-       goto bail;
-
-failed_write:
-       stop_cmd(dd);
-       ret = 1;
-
-bail:
-       return ret;
-}
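
The wait loop near the end of the write path is standard EEPROM acknowledge polling: once the STOP commits a page write, the part NAKs every start condition until its internal write cycle finishes, so the driver retries the start command a bounded number of times instead of sleeping a fixed worst-case delay. A hedged, bus-agnostic sketch of the same bounded poll; start_cmd()/stop_cond() are simulated stand-ins for the driver's bit-banged primitives, and the "busy for three polls" behaviour is faked for the demo.

#include <stdbool.h>
#include <stdio.h>

/* Simulated bus: pretend the EEPROM stays busy for the first 3 polls. */
static int busy_polls = 3;
static int start_cmd(unsigned char addr) { (void)addr; return busy_polls-- > 0; }
static void stop_cond(void) { }

/* Acknowledge polling: after a write is committed with STOP, the EEPROM
 * NAKs every start condition until its internal write cycle finishes.
 * Poll with bounded retries instead of sleeping a fixed worst-case time. */
static bool wait_write_complete(unsigned char dev_rd_addr, int max_tries)
{
    while (start_cmd(dev_rd_addr)) {   /* non-zero return: NAK, still busy */
        stop_cond();
        if (--max_tries == 0)
            return false;              /* device never came back */
    }
    stop_cond();                       /* ACKed; release the bus */
    return true;
}

int main(void)
{
    printf("write complete: %d\n", wait_write_complete(0xA1, 100));
    return 0;
}
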
-
-/**
- * ipath_eeprom_read - receives bytes from the eeprom via I2C
- * @dd: the infinipath device
- * @eeprom_offset: address to read from
- * @buff: where to store result
- * @len: number of bytes to receive
- */
-int ipath_eeprom_read(struct ipath_devdata *dd, u8 eeprom_offset,
-                       void *buff, int len)
-{
-       int ret;
-
-       ret = mutex_lock_interruptible(&dd->ipath_eep_lock);
-       if (!ret) {
-               ret = ipath_eeprom_internal_read(dd, eeprom_offset, buff, len);
-               mutex_unlock(&dd->ipath_eep_lock);
-       }
-
-       return ret;
-}
-
-/**
- * ipath_eeprom_write - writes data to the eeprom via I2C
- * @dd: the infinipath device
- * @eeprom_offset: where to place data
- * @buff: data to write
- * @len: number of bytes to write
- */
-int ipath_eeprom_write(struct ipath_devdata *dd, u8 eeprom_offset,
-                       const void *buff, int len)
-{
-       int ret;
-
-       ret = mutex_lock_interruptible(&dd->ipath_eep_lock);
-       if (!ret) {
-               ret = ipath_eeprom_internal_write(dd, eeprom_offset, buff, len);
-               mutex_unlock(&dd->ipath_eep_lock);
-       }
-
-       return ret;
-}
-
-static u8 flash_csum(struct ipath_flash *ifp, int adjust)
-{
-       u8 *ip = (u8 *) ifp;
-       u8 csum = 0, len;
-
-       /*
-        * Limit length checksummed to max length of actual data.
-        * Checksum of erased eeprom will still be bad, but we avoid
-        * reading past the end of the buffer we were passed.
-        */
-       len = ifp->if_length;
-       if (len > sizeof(struct ipath_flash))
-               len = sizeof(struct ipath_flash);
-       while (len--)
-               csum += *ip++;
-       csum -= ifp->if_csum;
-       csum = ~csum;
-       if (adjust)
-               ifp->if_csum = csum;
-
-       return csum;
-}
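
flash_csum() is a plain additive checksum: sum every byte of the in-use portion of the image, subtract the stored checksum byte so it does not count itself, and complement the result; the same routine doubles as the updater when adjust is set. A standalone sketch of the same arithmetic over an illustrative layout (struct flash_img below is not the real struct ipath_flash):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Illustrative layout: length byte, checksum byte, then payload. */
struct flash_img {
    uint8_t length;   /* number of bytes covered by the checksum */
    uint8_t csum;     /* stored checksum */
    uint8_t data[14];
};

static uint8_t flash_checksum(const struct flash_img *f)
{
    const uint8_t *p = (const uint8_t *)f;
    uint8_t len = f->length;
    uint8_t sum = 0;

    if (len > sizeof(*f))          /* never read past the buffer */
        len = sizeof(*f);
    while (len--)
        sum += *p++;
    sum -= f->csum;                /* the checksum byte must not count itself */
    return (uint8_t)~sum;          /* complement, as the driver does */
}

int main(void)
{
    struct flash_img f = { .length = sizeof(f) };
    memcpy(f.data, "example", 7);
    f.csum = flash_checksum(&f);                 /* "adjust" mode */
    printf("csum 0x%02X valid=%d\n", f.csum,
           flash_checksum(&f) == f.csum);        /* verify mode */
    return 0;
}
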
-
-/**
- * ipath_get_eeprom_info - get the GUID and other info from the i2c flash device
- * @dd: the infinipath device
- *
- * We have the capability to use the ipath_nguid field, and get
- * the guid from the first chip's flash, to use for all of them.
- */
-void ipath_get_eeprom_info(struct ipath_devdata *dd)
-{
-       void *buf;
-       struct ipath_flash *ifp;
-       __be64 guid;
-       int len, eep_stat;
-       u8 csum, *bguid;
-       int t = dd->ipath_unit;
-       struct ipath_devdata *dd0 = ipath_lookup(0);
-
-       if (t && dd0->ipath_nguid > 1 && t <= dd0->ipath_nguid) {
-               u8 oguid;
-               dd->ipath_guid = dd0->ipath_guid;
-               bguid = (u8 *) & dd->ipath_guid;
-
-               oguid = bguid[7];
-               bguid[7] += t;
-               if (oguid > bguid[7]) {
-                       if (bguid[6] == 0xff) {
-                               if (bguid[5] == 0xff) {
-                                       ipath_dev_err(
-                                               dd,
-                                               "Can't set %s GUID from "
-                                               "base, wraps to OUI!\n",
-                                               ipath_get_unit_name(t));
-                                       dd->ipath_guid = 0;
-                                       goto bail;
-                               }
-                               bguid[5]++;
-                       }
-                       bguid[6]++;
-               }
-               dd->ipath_nguid = 1;
-
-               ipath_dbg("nguid %u, so adding %u to device 0 guid, "
-                         "for %llx\n",
-                         dd0->ipath_nguid, t,
-                         (unsigned long long) be64_to_cpu(dd->ipath_guid));
-               goto bail;
-       }
-
-       /*
-        * read full flash, not just currently used part, since it may have
-        * been written with a newer definition
-        */
-       len = sizeof(struct ipath_flash);
-       buf = vmalloc(len);
-       if (!buf) {
-               ipath_dev_err(dd, "Couldn't allocate memory to read %u "
-                             "bytes from eeprom for GUID\n", len);
-               goto bail;
-       }
-
-       mutex_lock(&dd->ipath_eep_lock);
-       eep_stat = ipath_eeprom_internal_read(dd, 0, buf, len);
-       mutex_unlock(&dd->ipath_eep_lock);
-
-       if (eep_stat) {
-               ipath_dev_err(dd, "Failed reading GUID from eeprom\n");
-               goto done;
-       }
-       ifp = (struct ipath_flash *)buf;
-
-       csum = flash_csum(ifp, 0);
-       if (csum != ifp->if_csum) {
-               dev_info(&dd->pcidev->dev, "Bad I2C flash checksum: "
-                        "0x%x, not 0x%x\n", csum, ifp->if_csum);
-               goto done;
-       }
-       if (*(__be64 *) ifp->if_guid == cpu_to_be64(0) ||
-           *(__be64 *) ifp->if_guid == ~cpu_to_be64(0)) {
-               ipath_dev_err(dd, "Invalid GUID %llx from flash; "
-                             "ignoring\n",
-                             *(unsigned long long *) ifp->if_guid);
-               /* don't allow GUID if all 0 or all 1's */
-               goto done;
-       }
-
-       /* complain, but allow it */
-       if (*(u64 *) ifp->if_guid == 0x100007511000000ULL)
-               dev_info(&dd->pcidev->dev, "Warning, GUID %llx is "
-                        "default, probably not correct!\n",
-                        *(unsigned long long *) ifp->if_guid);
-
-       bguid = ifp->if_guid;
-       if (!bguid[0] && !bguid[1] && !bguid[2]) {
-               /* original incorrect GUID format in flash; fix in
-                * core copy, by shifting up 2 octets; don't need to
-                * change top octet, since both it and shifted are
-                * 0.. */
-               bguid[1] = bguid[3];
-               bguid[2] = bguid[4];
-               bguid[3] = bguid[4] = 0;
-               guid = *(__be64 *) ifp->if_guid;
-               ipath_cdbg(VERBOSE, "Old GUID format in flash, top 3 zero, "
-                          "shifting 2 octets\n");
-       } else
-               guid = *(__be64 *) ifp->if_guid;
-       dd->ipath_guid = guid;
-       dd->ipath_nguid = ifp->if_numguid;
-       /*
-        * Things are slightly complicated by the desire to transparently
-        * support both the Pathscale 10-digit serial number and the QLogic
-        * 13-character version.
-        */
-       if ((ifp->if_fversion > 1) && ifp->if_sprefix[0]
-               && ((u8 *)ifp->if_sprefix)[0] != 0xFF) {
-               /* This board has a Serial-prefix, which is stored
-                * elsewhere for backward-compatibility.
-                */
-               char *snp = dd->ipath_serial;
-               memcpy(snp, ifp->if_sprefix, sizeof ifp->if_sprefix);
-               snp[sizeof ifp->if_sprefix] = '\0';
-               len = strlen(snp);
-               snp += len;
-               len = (sizeof dd->ipath_serial) - len;
-               if (len > sizeof ifp->if_serial) {
-                       len = sizeof ifp->if_serial;
-               }
-               memcpy(snp, ifp->if_serial, len);
-       } else
-               memcpy(dd->ipath_serial, ifp->if_serial,
-                      sizeof ifp->if_serial);
-       if (!strstr(ifp->if_comment, "Tested successfully"))
-               ipath_dev_err(dd, "Board SN %s did not pass functional "
-                       "test: %s\n", dd->ipath_serial,
-                       ifp->if_comment);
-
-       ipath_cdbg(VERBOSE, "Initted GUID to %llx from eeprom\n",
-                  (unsigned long long) be64_to_cpu(dd->ipath_guid));
-
-       memcpy(&dd->ipath_eep_st_errs, &ifp->if_errcntp, IPATH_EEP_LOG_CNT);
-       /*
-        * Power-on (actually "active") hours are kept as little-endian value
-        * in EEPROM, but as seconds in a (possibly as small as 24-bit)
-        * atomic_t while running.
-        */
-       atomic_set(&dd->ipath_active_time, 0);
-       dd->ipath_eep_hrs = ifp->if_powerhour[0] | (ifp->if_powerhour[1] << 8);
-
-done:
-       vfree(buf);
-
-bail:;
-}
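
When a secondary unit borrows its GUID from unit 0, the code above adds the unit number to the low octet of the EUI-64 and propagates any carry into the next two octets, refusing the assignment outright if it would wrap into the OUI. A small sketch of that carry propagation over a bare 8-byte big-endian GUID; the buffer layout and example value are assumptions for illustration only:

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

/* guid[0..2] is the OUI, guid[3..7] the device extension (big-endian). */
static bool guid_add_unit(uint8_t guid[8], unsigned unit)
{
    uint8_t old = guid[7];

    guid[7] += (uint8_t)unit;
    if (old > guid[7]) {               /* carry out of the low octet */
        if (guid[6] == 0xff) {
            if (guid[5] == 0xff)
                return false;          /* would wrap into the OUI: refuse */
            guid[5]++;
        }
        guid[6]++;
    }
    return true;
}

int main(void)
{
    uint8_t guid[8] = { 0x00, 0x11, 0x75, 0x00, 0x00, 0x00, 0xff, 0xfe };
    if (guid_add_unit(guid, 3))
        printf("derived GUID ends ...%02x:%02x:%02x\n",
               guid[5], guid[6], guid[7]);
    return 0;
}
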
-
-/**
- * ipath_update_eeprom_log - copy active-time and error counters to eeprom
- * @dd: the infinipath device
- *
- * Although the time is kept as seconds in the ipath_devdata struct, it is
- * rounded to hours for re-write, as we have only 16 bits in EEPROM.
- * First-cut code reads whole (expected) struct ipath_flash, modifies,
- * re-writes. Future direction: read/write only what we need, assuming
- * that the EEPROM had to have been "good enough" for driver init, and
- * if not, we aren't making it worse.
- *
- */
-
-int ipath_update_eeprom_log(struct ipath_devdata *dd)
-{
-       void *buf;
-       struct ipath_flash *ifp;
-       int len, hi_water;
-       uint32_t new_time, new_hrs;
-       u8 csum;
-       int ret, idx;
-       unsigned long flags;
-
-       /* first, check if we actually need to do anything. */
-       ret = 0;
-       for (idx = 0; idx < IPATH_EEP_LOG_CNT; ++idx) {
-               if (dd->ipath_eep_st_new_errs[idx]) {
-                       ret = 1;
-                       break;
-               }
-       }
-       new_time = atomic_read(&dd->ipath_active_time);
-
-       if (ret == 0 && new_time < 3600)
-               return 0;
-
-       /*
-        * The quick-check above determined that there is something worthy
-        * of logging, so get the current contents and take a more detailed look.
-        * Read the full flash, not just the currently used part, since it may have
-        * been written with a newer definition
-        */
-       len = sizeof(struct ipath_flash);
-       buf = vmalloc(len);
-       ret = 1;
-       if (!buf) {
-               ipath_dev_err(dd, "Couldn't allocate memory to read %u "
-                               "bytes from eeprom for logging\n", len);
-               goto bail;
-       }
-
-       /* Grab semaphore and read current EEPROM. If we get an
-        * error, let go, but if not, keep it until we finish write.
-        */
-       ret = mutex_lock_interruptible(&dd->ipath_eep_lock);
-       if (ret) {
-               ipath_dev_err(dd, "Unable to acquire EEPROM for logging\n");
-               goto free_bail;
-       }
-       ret = ipath_eeprom_internal_read(dd, 0, buf, len);
-       if (ret) {
-               mutex_unlock(&dd->ipath_eep_lock);
-               ipath_dev_err(dd, "Unable to read EEPROM for logging\n");
-               goto free_bail;
-       }
-       ifp = (struct ipath_flash *)buf;
-
-       csum = flash_csum(ifp, 0);
-       if (csum != ifp->if_csum) {
-               mutex_unlock(&dd->ipath_eep_lock);
-               ipath_dev_err(dd, "EEPROM cks err (0x%02X, S/B 0x%02X)\n",
-                               csum, ifp->if_csum);
-               ret = 1;
-               goto free_bail;
-       }
-       hi_water = 0;
-       spin_lock_irqsave(&dd->ipath_eep_st_lock, flags);
-       for (idx = 0; idx < IPATH_EEP_LOG_CNT; ++idx) {
-               int new_val = dd->ipath_eep_st_new_errs[idx];
-               if (new_val) {
-                       /*
-                        * If we have seen any errors, add them to the EEPROM values.
-                        * We need to saturate at 0xFF (255), and we would also
-                        * need to adjust the checksum if we were
-                        * trying to minimize EEPROM traffic.
-                        * Note that we add to the actual current count in EEPROM,
-                        * in case it was altered while we were running.
-                        */
-                       new_val += ifp->if_errcntp[idx];
-                       if (new_val > 0xFF)
-                               new_val = 0xFF;
-                       if (ifp->if_errcntp[idx] != new_val) {
-                               ifp->if_errcntp[idx] = new_val;
-                               hi_water = offsetof(struct ipath_flash,
-                                               if_errcntp) + idx;
-                       }
-                       /*
-                        * update our shadow (used to minimize EEPROM
-                        * traffic), to match what we are about to write.
-                        */
-                       dd->ipath_eep_st_errs[idx] = new_val;
-                       dd->ipath_eep_st_new_errs[idx] = 0;
-               }
-       }
-       /*
-        * Now update active-time.  We would like to round to the nearest hour,
-        * but unless atomic_t is guaranteed to be a proper signed int we cannot,
-        * because we need to account for what we "transfer" to EEPROM: if we
-        * logged an hour at 31 minutes, we would need to set active_time
-        * to -29 minutes to accurately count the _next_ hour.
-        */
-       if (new_time >= 3600) {
-               new_hrs = new_time / 3600;
-               atomic_sub((new_hrs * 3600), &dd->ipath_active_time);
-               new_hrs += dd->ipath_eep_hrs;
-               if (new_hrs > 0xFFFF)
-                       new_hrs = 0xFFFF;
-               dd->ipath_eep_hrs = new_hrs;
-               if ((new_hrs & 0xFF) != ifp->if_powerhour[0]) {
-                       ifp->if_powerhour[0] = new_hrs & 0xFF;
-                       hi_water = offsetof(struct ipath_flash, if_powerhour);
-               }
-               if ((new_hrs >> 8) != ifp->if_powerhour[1]) {
-                       ifp->if_powerhour[1] = new_hrs >> 8;
-                       hi_water = offsetof(struct ipath_flash, if_powerhour)
-                                       + 1;
-               }
-       }
-       /*
-        * There is a tiny possibility that we could somehow fail to write
-        * the EEPROM after updating our shadows, but problems from holding
-        * the spinlock too long are a much bigger issue.
-        */
-       spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags);
-       if (hi_water) {
-               /* we made some change to the data; update cksum and write */
-               csum = flash_csum(ifp, 1);
-               ret = ipath_eeprom_internal_write(dd, 0, buf, hi_water + 1);
-       }
-       mutex_unlock(&dd->ipath_eep_lock);
-       if (ret)
-               ipath_dev_err(dd, "Failed updating EEPROM\n");
-
-free_bail:
-       vfree(buf);
-bail:
-       return ret;
-
-}
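
Two details above are easy to miss: whole hours are "transferred" out of the running seconds counter (only the seconds actually logged are subtracted, so the sub-hour remainder keeps accumulating), and the 16-bit hour field saturates rather than wrapping. A sketch of that transfer arithmetic; the field names are illustrative and the kernel's atomic_t is replaced by a plain counter:

#include <stdint.h>
#include <stdio.h>

/* Move whole hours from the running seconds counter into the logged
 * 16-bit hour count, saturating at 0xFFFF.  The remainder (< 3600 s)
 * stays in *active_seconds and keeps accumulating. */
static void transfer_hours(uint32_t *active_seconds, uint16_t *logged_hours)
{
    uint32_t hrs = *active_seconds / 3600;

    if (!hrs)
        return;                         /* nothing worth logging yet */
    *active_seconds -= hrs * 3600;      /* keep the sub-hour remainder */
    hrs += *logged_hours;
    if (hrs > 0xFFFF)
        hrs = 0xFFFF;                   /* EEPROM field is only 16 bits */
    *logged_hours = (uint16_t)hrs;
}

int main(void)
{
    uint32_t secs = 2 * 3600 + 1234;    /* a bit over two hours of uptime */
    uint16_t hours = 40;
    transfer_hours(&secs, &hours);
    printf("hours=%u remainder=%u s\n", hours, secs);  /* 42, 1234 */
    return 0;
}
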
-
-/**
- * ipath_inc_eeprom_err - increment one of the four error counters
- * that are logged to EEPROM.
- * @dd: the infinipath device
- * @eidx: 0..3, the counter to increment
- * @incr: how much to add
- *
- * Each counter is 8-bits, and saturates at 255 (0xFF). They
- * are copied to the EEPROM (aka flash) whenever ipath_update_eeprom_log()
- * is called, but it can only be called in a context that allows sleep.
- * This function can be called even at interrupt level.
- */
-
-void ipath_inc_eeprom_err(struct ipath_devdata *dd, u32 eidx, u32 incr)
-{
-       uint new_val;
-       unsigned long flags;
-
-       spin_lock_irqsave(&dd->ipath_eep_st_lock, flags);
-       new_val = dd->ipath_eep_st_new_errs[eidx] + incr;
-       if (new_val > 255)
-               new_val = 255;
-       dd->ipath_eep_st_new_errs[eidx] = new_val;
-       spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags);
-       return;
-}
-
-static int ipath_tempsense_internal_read(struct ipath_devdata *dd, u8 regnum)
-{
-       int ret;
-       struct i2c_chain_desc *icd;
-
-       ret = -ENOENT;
-
-       icd = ipath_i2c_type(dd);
-       if (!icd)
-               goto bail;
-
-       if (icd->temp_dev == IPATH_NO_DEV) {
-               /* tempsense only exists on new, real-I2C boards */
-               ret = -ENXIO;
-               goto bail;
-       }
-
-       if (i2c_startcmd(dd, icd->temp_dev | WRITE_CMD)) {
-               ipath_dbg("Failed tempsense startcmd\n");
-               stop_cmd(dd);
-               ret = -ENXIO;
-               goto bail;
-       }
-       ret = wr_byte(dd, regnum);
-       stop_cmd(dd);
-       if (ret) {
-               ipath_dev_err(dd, "Failed tempsense WR command %02X\n",
-                             regnum);
-               ret = -ENXIO;
-               goto bail;
-       }
-       if (i2c_startcmd(dd, icd->temp_dev | READ_CMD)) {
-               ipath_dbg("Failed tempsense RD startcmd\n");
-               stop_cmd(dd);
-               ret = -ENXIO;
-               goto bail;
-       }
-       /*
-        * We can only clock out one byte per command, sensibly
-        */
-       ret = rd_byte(dd);
-       stop_cmd(dd);
-
-bail:
-       return ret;
-}
-
-#define VALID_TS_RD_REG_MASK 0xBF
-
-/**
- * ipath_tempsense_read - read register of temp sensor via I2C
- * @dd: the infinipath device
- * @regnum: register to read from
- *
- * returns reg contents (0..255) or < 0 for error
- */
-int ipath_tempsense_read(struct ipath_devdata *dd, u8 regnum)
-{
-       int ret;
-
-       if (regnum > 7)
-               return -EINVAL;
-
-       /* return a bogus value for (the one) register we do not have */
-       if (!((1 << regnum) & VALID_TS_RD_REG_MASK))
-               return 0;
-
-       ret = mutex_lock_interruptible(&dd->ipath_eep_lock);
-       if (!ret) {
-               ret = ipath_tempsense_internal_read(dd, regnum);
-               mutex_unlock(&dd->ipath_eep_lock);
-       }
-
-       /*
-        * There are three possibilities here:
-        * ret is actual value (0..255)
-        * ret is -ENXIO or -EINVAL from code in this file
-        * ret is -EINTR from mutex_lock_interruptible.
-        */
-       return ret;
-}
-
-static int ipath_tempsense_internal_write(struct ipath_devdata *dd,
-                                         u8 regnum, u8 data)
-{
-       int ret = -ENOENT;
-       struct i2c_chain_desc *icd;
-
-       icd = ipath_i2c_type(dd);
-       if (!icd)
-               goto bail;
-
-       if (icd->temp_dev == IPATH_NO_DEV) {
-               /* tempsense only exists on new, real-I2C boards */
-               ret = -ENXIO;
-               goto bail;
-       }
-       if (i2c_startcmd(dd, icd->temp_dev | WRITE_CMD)) {
-               ipath_dbg("Failed tempsense startcmd\n");
-               stop_cmd(dd);
-               ret = -ENXIO;
-               goto bail;
-       }
-       ret = wr_byte(dd, regnum);
-       if (ret) {
-               stop_cmd(dd);
-               ipath_dev_err(dd, "Failed to write tempsense command %02X\n",
-                             regnum);
-               ret = -ENXIO;
-               goto bail;
-       }
-       ret = wr_byte(dd, data);
-       stop_cmd(dd);
-       ret = i2c_startcmd(dd, icd->temp_dev | READ_CMD);
-       if (ret) {
-               ipath_dev_err(dd, "Failed tempsense data write to %02X\n",
-                             regnum);
-               ret = -ENXIO;
-       }
-
-bail:
-       return ret;
-}
-
-#define VALID_TS_WR_REG_MASK ((1 << 9) | (1 << 0xB) | (1 << 0xD))
-
-/**
- * ipath_tempsense_write - write register of temp sensor via I2C
- * @dd: the infinipath device
- * @regnum: register to write
- * @data: data to write
- *
- * returns 0 for success or < 0 for error
- */
-int ipath_tempsense_write(struct ipath_devdata *dd, u8 regnum, u8 data)
-{
-       int ret;
-
-       if (regnum > 15 || !((1 << regnum) & VALID_TS_WR_REG_MASK))
-               return -EINVAL;
-
-       ret = mutex_lock_interruptible(&dd->ipath_eep_lock);
-       if (!ret) {
-               ret = ipath_tempsense_internal_write(dd, regnum, data);
-               mutex_unlock(&dd->ipath_eep_lock);
-       }
-
-       /*
-        * There are three possibilities here:
-        * ret is 0 for success
-        * ret is -ENXIO or -EINVAL from code in this file
-        * ret is -EINTR from mutex_lock_interruptible.
-        */
-       return ret;
-}
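
Both tempsense wrappers validate the register number against a bitmask allowlist before touching the bus, so the check is one shift and one AND per call (the driver additionally returns a benign zero for the single unreadable register rather than an error). A tiny standalone sketch of the mask check; reg_readable()/reg_writable() are illustrative helpers mirroring the two defines above, not driver functions:

#include <stdbool.h>
#include <stdio.h>

#define RD_OK_MASK 0xBFu                               /* regs 0..7, except 6 */
#define WR_OK_MASK ((1u << 0x9) | (1u << 0xB) | (1u << 0xD))

/* Readable registers live in the 0..7 range covered by the read mask. */
static bool reg_readable(unsigned reg)
{
    return reg <= 7 && ((1u << reg) & RD_OK_MASK);
}

/* Writable registers are the three bits set in the write mask. */
static bool reg_writable(unsigned reg)
{
    return reg <= 15 && ((1u << reg) & WR_OK_MASK);
}

int main(void)
{
    printf("read reg 6 allowed:   %d\n", reg_readable(6));    /* 0 */
    printf("write reg 0xB allowed: %d\n", reg_writable(0xB)); /* 1 */
    return 0;
}
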
diff --git a/drivers/infiniband/hw/ipath/ipath_file_ops.c b/drivers/infiniband/hw/ipath/ipath_file_ops.c
deleted file mode 100644
index 450d159..0000000
+++ /dev/null
@@ -1,2620 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/pci.h>
-#include <linux/poll.h>
-#include <linux/cdev.h>
-#include <linux/swap.h>
-#include <linux/export.h>
-#include <linux/vmalloc.h>
-#include <linux/slab.h>
-#include <linux/highmem.h>
-#include <linux/io.h>
-#include <linux/jiffies.h>
-#include <linux/cpu.h>
-#include <linux/uio.h>
-#include <asm/pgtable.h>
-
-#include "ipath_kernel.h"
-#include "ipath_common.h"
-#include "ipath_user_sdma.h"
-
-static int ipath_open(struct inode *, struct file *);
-static int ipath_close(struct inode *, struct file *);
-static ssize_t ipath_write(struct file *, const char __user *, size_t,
-                          loff_t *);
-static ssize_t ipath_write_iter(struct kiocb *, struct iov_iter *from);
-static unsigned int ipath_poll(struct file *, struct poll_table_struct *);
-static int ipath_mmap(struct file *, struct vm_area_struct *);
-
-/*
- * This is really, really weird shit - write() and writev() here
- * have completely unrelated semantics.  Sucky userland ABI,
- * film at 11.
- */
-static const struct file_operations ipath_file_ops = {
-       .owner = THIS_MODULE,
-       .write = ipath_write,
-       .write_iter = ipath_write_iter,
-       .open = ipath_open,
-       .release = ipath_close,
-       .poll = ipath_poll,
-       .mmap = ipath_mmap,
-       .llseek = noop_llseek,
-};
-
-/*
- * Convert kernel virtual addresses to physical addresses so they don't
- * potentially conflict with the chip addresses used as mmap offsets.
- * It doesn't really matter what mmap offset we use as long as we can
- * interpret it correctly.
- */
-static u64 cvt_kvaddr(void *p)
-{
-       struct page *page;
-       u64 paddr = 0;
-
-       page = vmalloc_to_page(p);
-       if (page)
-               paddr = page_to_pfn(page) << PAGE_SHIFT;
-
-       return paddr;
-}
-
-static int ipath_get_base_info(struct file *fp,
-                              void __user *ubase, size_t ubase_size)
-{
-       struct ipath_portdata *pd = port_fp(fp);
-       int ret = 0;
-       struct ipath_base_info *kinfo = NULL;
-       struct ipath_devdata *dd = pd->port_dd;
-       unsigned subport_cnt;
-       int shared, master;
-       size_t sz;
-
-       subport_cnt = pd->port_subport_cnt;
-       if (!subport_cnt) {
-               shared = 0;
-               master = 0;
-               subport_cnt = 1;
-       } else {
-               shared = 1;
-               master = !subport_fp(fp);
-       }
-
-       sz = sizeof(*kinfo);
-       /* If port sharing is not requested, allow the old size structure */
-       if (!shared)
-               sz -= 7 * sizeof(u64);
-       if (ubase_size < sz) {
-               ipath_cdbg(PROC,
-                          "Base size %zu, need %zu (version mismatch?)\n",
-                          ubase_size, sz);
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       kinfo = kzalloc(sizeof(*kinfo), GFP_KERNEL);
-       if (kinfo == NULL) {
-               ret = -ENOMEM;
-               goto bail;
-       }
-
-       ret = dd->ipath_f_get_base_info(pd, kinfo);
-       if (ret < 0)
-               goto bail;
-
-       kinfo->spi_rcvhdr_cnt = dd->ipath_rcvhdrcnt;
-       kinfo->spi_rcvhdrent_size = dd->ipath_rcvhdrentsize;
-       kinfo->spi_tidegrcnt = dd->ipath_rcvegrcnt;
-       kinfo->spi_rcv_egrbufsize = dd->ipath_rcvegrbufsize;
-       /*
-        * have to mmap whole thing
-        */
-       kinfo->spi_rcv_egrbuftotlen =
-               pd->port_rcvegrbuf_chunks * pd->port_rcvegrbuf_size;
-       kinfo->spi_rcv_egrperchunk = pd->port_rcvegrbufs_perchunk;
-       kinfo->spi_rcv_egrchunksize = kinfo->spi_rcv_egrbuftotlen /
-               pd->port_rcvegrbuf_chunks;
-       kinfo->spi_tidcnt = dd->ipath_rcvtidcnt / subport_cnt;
-       if (master)
-               kinfo->spi_tidcnt += dd->ipath_rcvtidcnt % subport_cnt;
-       /*
-        * for this use, this may be ipath_cfgports summed over all chips that
-        * are configured and present
-        */
-       kinfo->spi_nports = dd->ipath_cfgports;
-       /* unit (chip/board) our port is on */
-       kinfo->spi_unit = dd->ipath_unit;
-       /* for now, only a single page */
-       kinfo->spi_tid_maxsize = PAGE_SIZE;
-
-       /*
-        * Doing this per port, and based on the skip value, etc.  This has
-        * to be the actual buffer size, since the protocol code treats it
-        * as an array.
-        *
-        * These have to be set to user addresses in the user code via mmap.
-        * These values are used on return to user code for the mmap target
-        * addresses only.  For 32 bit, same 44 bit address problem, so use
-        * the physical address, not virtual.  Before 2.6.11, using the
-        * page_address() macro worked, but in 2.6.11, even that returns the
-        * full 64 bit address (upper bits all 1's).  So far, using the
-        * physical addresses (or chip offsets, for chip mapping) works, but
-        * no doubt some future kernel release will change that, and we'll be
-        * on to yet another method of dealing with this.
-        */
-       kinfo->spi_rcvhdr_base = (u64) pd->port_rcvhdrq_phys;
-       kinfo->spi_rcvhdr_tailaddr = (u64) pd->port_rcvhdrqtailaddr_phys;
-       kinfo->spi_rcv_egrbufs = (u64) pd->port_rcvegr_phys;
-       kinfo->spi_pioavailaddr = (u64) dd->ipath_pioavailregs_phys;
-       kinfo->spi_status = (u64) kinfo->spi_pioavailaddr +
-               (void *) dd->ipath_statusp -
-               (void *) dd->ipath_pioavailregs_dma;
-       if (!shared) {
-               kinfo->spi_piocnt = pd->port_piocnt;
-               kinfo->spi_piobufbase = (u64) pd->port_piobufs;
-               kinfo->__spi_uregbase = (u64) dd->ipath_uregbase +
-                       dd->ipath_ureg_align * pd->port_port;
-       } else if (master) {
-               kinfo->spi_piocnt = (pd->port_piocnt / subport_cnt) +
-                                   (pd->port_piocnt % subport_cnt);
-               /* Master's PIO buffers are after all the slave's */
-               kinfo->spi_piobufbase = (u64) pd->port_piobufs +
-                       dd->ipath_palign *
-                       (pd->port_piocnt - kinfo->spi_piocnt);
-       } else {
-               unsigned slave = subport_fp(fp) - 1;
-
-               kinfo->spi_piocnt = pd->port_piocnt / subport_cnt;
-               kinfo->spi_piobufbase = (u64) pd->port_piobufs +
-                       dd->ipath_palign * kinfo->spi_piocnt * slave;
-       }
-
-       if (shared) {
-               kinfo->spi_port_uregbase = (u64) dd->ipath_uregbase +
-                       dd->ipath_ureg_align * pd->port_port;
-               kinfo->spi_port_rcvegrbuf = kinfo->spi_rcv_egrbufs;
-               kinfo->spi_port_rcvhdr_base = kinfo->spi_rcvhdr_base;
-               kinfo->spi_port_rcvhdr_tailaddr = kinfo->spi_rcvhdr_tailaddr;
-
-               kinfo->__spi_uregbase = cvt_kvaddr(pd->subport_uregbase +
-                       PAGE_SIZE * subport_fp(fp));
-
-               kinfo->spi_rcvhdr_base = cvt_kvaddr(pd->subport_rcvhdr_base +
-                       pd->port_rcvhdrq_size * subport_fp(fp));
-               kinfo->spi_rcvhdr_tailaddr = 0;
-               kinfo->spi_rcv_egrbufs = cvt_kvaddr(pd->subport_rcvegrbuf +
-                       pd->port_rcvegrbuf_chunks * pd->port_rcvegrbuf_size *
-                       subport_fp(fp));
-
-               kinfo->spi_subport_uregbase =
-                       cvt_kvaddr(pd->subport_uregbase);
-               kinfo->spi_subport_rcvegrbuf =
-                       cvt_kvaddr(pd->subport_rcvegrbuf);
-               kinfo->spi_subport_rcvhdr_base =
-                       cvt_kvaddr(pd->subport_rcvhdr_base);
-               ipath_cdbg(PROC, "port %u flags %x %llx %llx %llx\n",
-                       kinfo->spi_port, kinfo->spi_runtime_flags,
-                       (unsigned long long) kinfo->spi_subport_uregbase,
-                       (unsigned long long) kinfo->spi_subport_rcvegrbuf,
-                       (unsigned long long) kinfo->spi_subport_rcvhdr_base);
-       }
-
-       /*
-        * All user buffers are 2KB buffers.  If we ever support
-        * giving 4KB buffers to user processes, this will need some
-        * work.
-        */
-       kinfo->spi_pioindex = (kinfo->spi_piobufbase -
-               (dd->ipath_piobufbase & 0xffffffff)) / dd->ipath_palign;
-       kinfo->spi_pioalign = dd->ipath_palign;
-
-       kinfo->spi_qpair = IPATH_KD_QP;
-       /*
-        * user mode PIO buffers are always 2KB, even when 4KB can
-        * be received, and sent via the kernel; this is ibmaxlen
-        * for 2K MTU.
-        */
-       kinfo->spi_piosize = dd->ipath_piosize2k - 2 * sizeof(u32);
-       kinfo->spi_mtu = dd->ipath_ibmaxlen;    /* maxlen, not ibmtu */
-       kinfo->spi_port = pd->port_port;
-       kinfo->spi_subport = subport_fp(fp);
-       kinfo->spi_sw_version = IPATH_KERN_SWVERSION;
-       kinfo->spi_hw_version = dd->ipath_revision;
-
-       if (master) {
-               kinfo->spi_runtime_flags |= IPATH_RUNTIME_MASTER;
-       }
-
-       sz = (ubase_size < sizeof(*kinfo)) ? ubase_size : sizeof(*kinfo);
-       if (copy_to_user(ubase, kinfo, sz))
-               ret = -EFAULT;
-
-bail:
-       kfree(kinfo);
-       return ret;
-}
-
-/**
- * ipath_tid_update - update a port TID
- * @pd: the port
- * @fp: the ipath device file
- * @ti: the TID information
- *
- * The new implementation as of Oct 2004 is that the driver assigns
- * the tid and returns it to the caller.   To make it easier to
- * catch bugs, and to reduce search time, we keep a cursor for
- * each port, walking the shadow tid array to find one that's not
- * in use.
- *
- * For now, if we can't allocate the full list, we fail, although
- * in the long run, we'll allocate as many as we can, and the
- * caller will deal with that by trying the remaining pages later.
- * That means that when we fail, we have to mark the tids as not in
- * use again, in our shadow copy.
- *
- * It's up to the caller to free the tids when they are done.
- * We'll unlock the pages as they free them.
- *
- * Also, right now we are locking one page at a time, but since
- * the intended use of this routine is for a single group of
- * virtually contiguous pages, that should change to improve
- * performance.
- */
-static int ipath_tid_update(struct ipath_portdata *pd, struct file *fp,
-                           const struct ipath_tid_info *ti)
-{
-       int ret = 0, ntids;
-       u32 tid, porttid, cnt, i, tidcnt, tidoff;
-       u16 *tidlist;
-       struct ipath_devdata *dd = pd->port_dd;
-       u64 physaddr;
-       unsigned long vaddr;
-       u64 __iomem *tidbase;
-       unsigned long tidmap[8];
-       struct page **pagep = NULL;
-       unsigned subport = subport_fp(fp);
-
-       if (!dd->ipath_pageshadow) {
-               ret = -ENOMEM;
-               goto done;
-       }
-
-       cnt = ti->tidcnt;
-       if (!cnt) {
-               ipath_dbg("After copyin, tidcnt 0, tidlist %llx\n",
-                         (unsigned long long) ti->tidlist);
-               /*
-                * Should we treat this as success?  Likely a bug.
-                */
-               ret = -EFAULT;
-               goto done;
-       }
-       porttid = pd->port_port * dd->ipath_rcvtidcnt;
-       if (!pd->port_subport_cnt) {
-               tidcnt = dd->ipath_rcvtidcnt;
-               tid = pd->port_tidcursor;
-               tidoff = 0;
-       } else if (!subport) {
-               tidcnt = (dd->ipath_rcvtidcnt / pd->port_subport_cnt) +
-                        (dd->ipath_rcvtidcnt % pd->port_subport_cnt);
-               tidoff = dd->ipath_rcvtidcnt - tidcnt;
-               porttid += tidoff;
-               tid = tidcursor_fp(fp);
-       } else {
-               tidcnt = dd->ipath_rcvtidcnt / pd->port_subport_cnt;
-               tidoff = tidcnt * (subport - 1);
-               porttid += tidoff;
-               tid = tidcursor_fp(fp);
-       }
-       if (cnt > tidcnt) {
-               /* make sure it all fits in port_tid_pg_list */
-               dev_info(&dd->pcidev->dev, "Process tried to allocate %u "
-                        "TIDs, only trying max (%u)\n", cnt, tidcnt);
-               cnt = tidcnt;
-       }
-       pagep = &((struct page **) pd->port_tid_pg_list)[tidoff];
-       tidlist = &((u16 *) &pagep[dd->ipath_rcvtidcnt])[tidoff];
-
-       memset(tidmap, 0, sizeof(tidmap));
-       /* before decrement; chip actual # */
-       ntids = tidcnt;
-       tidbase = (u64 __iomem *) (((char __iomem *) dd->ipath_kregbase) +
-                                  dd->ipath_rcvtidbase +
-                                  porttid * sizeof(*tidbase));
-
-       ipath_cdbg(VERBOSE, "Port%u %u tids, cursor %u, tidbase %p\n",
-                  pd->port_port, cnt, tid, tidbase);
-
-       /* virtual address of first page in transfer */
-       vaddr = ti->tidvaddr;
-       if (!access_ok(VERIFY_WRITE, (void __user *) vaddr,
-                      cnt * PAGE_SIZE)) {
-               ipath_dbg("Fail vaddr %p, %u pages, !access_ok\n",
-                         (void *)vaddr, cnt);
-               ret = -EFAULT;
-               goto done;
-       }
-       ret = ipath_get_user_pages(vaddr, cnt, pagep);
-       if (ret) {
-               if (ret == -EBUSY) {
-                       ipath_dbg("Failed to lock addr %p, %u pages "
-                                 "(already locked)\n",
-                                 (void *) vaddr, cnt);
-                       /*
-                        * for now, continue, and see what happens but with
-                        * the new implementation, this should never happen,
-                        * unless perhaps the user has mpin'ed the pages
-                        * themselves (something we need to test)
-                        */
-                       ret = 0;
-               } else {
-                       dev_info(&dd->pcidev->dev,
-                                "Failed to lock addr %p, %u pages: "
-                                "errno %d\n", (void *) vaddr, cnt, -ret);
-                       goto done;
-               }
-       }
-       for (i = 0; i < cnt; i++, vaddr += PAGE_SIZE) {
-               for (; ntids--; tid++) {
-                       if (tid == tidcnt)
-                               tid = 0;
-                       if (!dd->ipath_pageshadow[porttid + tid])
-                               break;
-               }
-               if (ntids < 0) {
-                       /*
-                        * oops, wrapped all the way through their TIDs,
-                        * and didn't have enough free; see comments at
-                        * start of routine
-                        */
-                       ipath_dbg("Not enough free TIDs for %u pages "
-                                 "(index %d), failing\n", cnt, i);
-                       i--;    /* last tidlist[i] not filled in */
-                       ret = -ENOMEM;
-                       break;
-               }
-               tidlist[i] = tid + tidoff;
-               ipath_cdbg(VERBOSE, "Updating idx %u to TID %u, "
-                          "vaddr %lx\n", i, tid + tidoff, vaddr);
-               /* we "know" system pages and TID pages are same size */
-               dd->ipath_pageshadow[porttid + tid] = pagep[i];
-               dd->ipath_physshadow[porttid + tid] = ipath_map_page(
-                       dd->pcidev, pagep[i], 0, PAGE_SIZE,
-                       PCI_DMA_FROMDEVICE);
-               /*
-                * don't need atomic or it's overhead
-                */
-               __set_bit(tid, tidmap);
-               physaddr = dd->ipath_physshadow[porttid + tid];
-               ipath_stats.sps_pagelocks++;
-               ipath_cdbg(VERBOSE,
-                          "TID %u, vaddr %lx, physaddr %llx pgp %p\n",
-                          tid, vaddr, (unsigned long long) physaddr,
-                          pagep[i]);
-               dd->ipath_f_put_tid(dd, &tidbase[tid], RCVHQ_RCV_TYPE_EXPECTED,
-                                   physaddr);
-               /*
-                * don't check this tid in ipath_portshadow, since we
-                * just filled it in; start with the next one.
-                */
-               tid++;
-       }
-
-       if (ret) {
-               u32 limit;
-       cleanup:
-               /* jump here if copy out of updated info failed... */
-               ipath_dbg("After failure (ret=%d), undo %d of %d entries\n",
-                         -ret, i, cnt);
-               /* same code that's in ipath_free_tid() */
-               limit = sizeof(tidmap) * BITS_PER_BYTE;
-               if (limit > tidcnt)
-                       /* just in case size changes in future */
-                       limit = tidcnt;
-               tid = find_first_bit((const unsigned long *)tidmap, limit);
-               for (; tid < limit; tid++) {
-                       if (!test_bit(tid, tidmap))
-                               continue;
-                       if (dd->ipath_pageshadow[porttid + tid]) {
-                               ipath_cdbg(VERBOSE, "Freeing TID %u\n",
-                                          tid);
-                               dd->ipath_f_put_tid(dd, &tidbase[tid],
-                                                   RCVHQ_RCV_TYPE_EXPECTED,
-                                                   dd->ipath_tidinvalid);
-                               pci_unmap_page(dd->pcidev,
-                                       dd->ipath_physshadow[porttid + tid],
-                                       PAGE_SIZE, PCI_DMA_FROMDEVICE);
-                               dd->ipath_pageshadow[porttid + tid] = NULL;
-                               ipath_stats.sps_pageunlocks++;
-                       }
-               }
-               ipath_release_user_pages(pagep, cnt);
-       } else {
-               /*
-                * Copy the updated array, with ipath_tid's filled in, back
-                * to user.  Since we did the copy in already, this "should
-                * never fail".  If it does, we have to clean up...
-                */
-               if (copy_to_user((void __user *)
-                                (unsigned long) ti->tidlist,
-                                tidlist, cnt * sizeof(*tidlist))) {
-                       ret = -EFAULT;
-                       goto cleanup;
-               }
-               if (copy_to_user((void __user *) (unsigned long) ti->tidmap,
-                                tidmap, sizeof tidmap)) {
-                       ret = -EFAULT;
-                       goto cleanup;
-               }
-               if (tid == tidcnt)
-                       tid = 0;
-               if (!pd->port_subport_cnt)
-                       pd->port_tidcursor = tid;
-               else
-                       tidcursor_fp(fp) = tid;
-       }
-
-done:
-       if (ret)
-               ipath_dbg("Failed to map %u TID pages, failing with %d\n",
-                         ti->tidcnt, -ret);
-       return ret;
-}
-
-/**
- * ipath_tid_free - free a port TID
- * @pd: the port
- * @subport: the subport
- * @ti: the TID info
- *
- * right now we are unlocking one page at a time, but since
- * the intended use of this routine is for a single group of
- * virtually contiguous pages, that should change to improve
- * performance.  We check that the TID is in range for this port
- * but otherwise don't check validity; if user has an error and
- * frees the wrong tid, it's only their own data that can thereby
- * be corrupted.  We do check that the TID was in use, for sanity.
- * We always use our idea of the saved address, not the address that
- * they pass in to us.
- */
-
-static int ipath_tid_free(struct ipath_portdata *pd, unsigned subport,
-                         const struct ipath_tid_info *ti)
-{
-       int ret = 0;
-       u32 tid, porttid, cnt, limit, tidcnt;
-       struct ipath_devdata *dd = pd->port_dd;
-       u64 __iomem *tidbase;
-       unsigned long tidmap[8];
-
-       if (!dd->ipath_pageshadow) {
-               ret = -ENOMEM;
-               goto done;
-       }
-
-       if (copy_from_user(tidmap, (void __user *)(unsigned long)ti->tidmap,
-                          sizeof tidmap)) {
-               ret = -EFAULT;
-               goto done;
-       }
-
-       porttid = pd->port_port * dd->ipath_rcvtidcnt;
-       if (!pd->port_subport_cnt)
-               tidcnt = dd->ipath_rcvtidcnt;
-       else if (!subport) {
-               tidcnt = (dd->ipath_rcvtidcnt / pd->port_subport_cnt) +
-                        (dd->ipath_rcvtidcnt % pd->port_subport_cnt);
-               porttid += dd->ipath_rcvtidcnt - tidcnt;
-       } else {
-               tidcnt = dd->ipath_rcvtidcnt / pd->port_subport_cnt;
-               porttid += tidcnt * (subport - 1);
-       }
-       tidbase = (u64 __iomem *) ((char __iomem *)(dd->ipath_kregbase) +
-                                  dd->ipath_rcvtidbase +
-                                  porttid * sizeof(*tidbase));
-
-       limit = sizeof(tidmap) * BITS_PER_BYTE;
-       if (limit > tidcnt)
-               /* just in case size changes in future */
-               limit = tidcnt;
-       tid = find_first_bit(tidmap, limit);
-       ipath_cdbg(VERBOSE, "Port%u free %u tids; first bit (max=%d) "
-                  "set is %d, porttid %u\n", pd->port_port, ti->tidcnt,
-                  limit, tid, porttid);
-       for (cnt = 0; tid < limit; tid++) {
-               /*
-                * small optimization; if we detect a run of 3 or so without
-                * any set, use find_first_bit again.  That's mainly to
-                * accelerate the case where we wrapped, so we have some at
-                * the beginning, and some at the end, and a big gap
-                * in the middle.
-                */
-               if (!test_bit(tid, tidmap))
-                       continue;
-               cnt++;
-               if (dd->ipath_pageshadow[porttid + tid]) {
-                       struct page *p;
-                       p = dd->ipath_pageshadow[porttid + tid];
-                       dd->ipath_pageshadow[porttid + tid] = NULL;
-                       ipath_cdbg(VERBOSE, "PID %u freeing TID %u\n",
-                                  pid_nr(pd->port_pid), tid);
-                       dd->ipath_f_put_tid(dd, &tidbase[tid],
-                                           RCVHQ_RCV_TYPE_EXPECTED,
-                                           dd->ipath_tidinvalid);
-                       pci_unmap_page(dd->pcidev,
-                               dd->ipath_physshadow[porttid + tid],
-                               PAGE_SIZE, PCI_DMA_FROMDEVICE);
-                       ipath_release_user_pages(&p, 1);
-                       ipath_stats.sps_pageunlocks++;
-               } else
-                       ipath_dbg("Unused tid %u, ignoring\n", tid);
-       }
-       if (cnt != ti->tidcnt)
-               ipath_dbg("passed in tidcnt %d, only %d bits set in map\n",
-                         ti->tidcnt, cnt);
-done:
-       if (ret)
-               ipath_dbg("Failed to unmap %u TID pages, failing with %d\n",
-                         ti->tidcnt, -ret);
-       return ret;
-}
-
-/**
- * ipath_set_part_key - set a partition key
- * @pd: the port
- * @key: the key
- *
- * We can have up to 4 active at a time (other than the default, which is
- * always allowed).  This is somewhat tricky, since multiple ports may set
- * the same key, so we reference count them, and clean up at exit.  All 4
- * partition keys are packed into a single infinipath register.  It's an
- * error for a process to set the same pkey multiple times.  We provide no
- * mechanism to de-allocate a pkey at this time, we may eventually need to
- * do that.  I've used the atomic operations, and no locking, and only make
- * a single pass through what's available.  This should be more than
- * adequate for some time. I'll think about spinlocks or the like if and as
- * it's necessary.
- */
-static int ipath_set_part_key(struct ipath_portdata *pd, u16 key)
-{
-       struct ipath_devdata *dd = pd->port_dd;
-       int i, any = 0, pidx = -1;
-       u16 lkey = key & 0x7FFF;
-       int ret;
-
-       if (lkey == (IPATH_DEFAULT_P_KEY & 0x7FFF)) {
-               /* nothing to do; this key always valid */
-               ret = 0;
-               goto bail;
-       }
-
-       ipath_cdbg(VERBOSE, "p%u try to set pkey %hx, current keys "
-                  "%hx:%x %hx:%x %hx:%x %hx:%x\n",
-                  pd->port_port, key, dd->ipath_pkeys[0],
-                  atomic_read(&dd->ipath_pkeyrefs[0]), dd->ipath_pkeys[1],
-                  atomic_read(&dd->ipath_pkeyrefs[1]), dd->ipath_pkeys[2],
-                  atomic_read(&dd->ipath_pkeyrefs[2]), dd->ipath_pkeys[3],
-                  atomic_read(&dd->ipath_pkeyrefs[3]));
-
-       if (!lkey) {
-               ipath_cdbg(PROC, "p%u tries to set key 0, not allowed\n",
-                          pd->port_port);
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       /*
-        * Set the full membership bit, because it has to be
-        * set in the register or the packet, and it seems
-        * cleaner to set in the register than to force all
-        * callers to set it. (see bug 4331)
-        */
-       key |= 0x8000;
-
-       for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) {
-               if (!pd->port_pkeys[i] && pidx == -1)
-                       pidx = i;
-               if (pd->port_pkeys[i] == key) {
-                       ipath_cdbg(VERBOSE, "p%u tries to set same pkey "
-                                  "(%x) more than once\n",
-                                  pd->port_port, key);
-                       ret = -EEXIST;
-                       goto bail;
-               }
-       }
-       if (pidx == -1) {
-               ipath_dbg("All pkeys for port %u already in use, "
-                         "can't set %x\n", pd->port_port, key);
-               ret = -EBUSY;
-               goto bail;
-       }
-       for (any = i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
-               if (!dd->ipath_pkeys[i]) {
-                       any++;
-                       continue;
-               }
-               if (dd->ipath_pkeys[i] == key) {
-                       atomic_t *pkrefs = &dd->ipath_pkeyrefs[i];
-
-                       if (atomic_inc_return(pkrefs) > 1) {
-                               pd->port_pkeys[pidx] = key;
-                               ipath_cdbg(VERBOSE, "p%u set key %x "
-                                          "matches #%d, count now %d\n",
-                                          pd->port_port, key, i,
-                                          atomic_read(pkrefs));
-                               ret = 0;
-                               goto bail;
-                       } else {
-                               /*
-                                * lost race, decrement count, catch below
-                                */
-                               atomic_dec(pkrefs);
-                               ipath_cdbg(VERBOSE, "Lost race, count was "
-                                          "0, after dec, it's %d\n",
-                                          atomic_read(pkrefs));
-                               any++;
-                       }
-               }
-               if ((dd->ipath_pkeys[i] & 0x7FFF) == lkey) {
-                       /*
-                        * It makes no sense to have both the limited and
-                        * full membership PKEY set at the same time since
-                        * the unlimited one will disable the limited one.
-                        */
-                       ret = -EEXIST;
-                       goto bail;
-               }
-       }
-       if (!any) {
-               ipath_dbg("port %u, all pkeys already in use, "
-                         "can't set %x\n", pd->port_port, key);
-               ret = -EBUSY;
-               goto bail;
-       }
-       for (any = i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
-               if (!dd->ipath_pkeys[i] &&
-                   atomic_inc_return(&dd->ipath_pkeyrefs[i]) == 1) {
-                       u64 pkey;
-
-                       /* for ipathstats, etc. */
-                       ipath_stats.sps_pkeys[i] = lkey;
-                       pd->port_pkeys[pidx] = dd->ipath_pkeys[i] = key;
-                       pkey =
-                               (u64) dd->ipath_pkeys[0] |
-                               ((u64) dd->ipath_pkeys[1] << 16) |
-                               ((u64) dd->ipath_pkeys[2] << 32) |
-                               ((u64) dd->ipath_pkeys[3] << 48);
-                       ipath_cdbg(PROC, "p%u set key %x in #%d, "
-                                  "portidx %d, new pkey reg %llx\n",
-                                  pd->port_port, key, i, pidx,
-                                  (unsigned long long) pkey);
-                       ipath_write_kreg(
-                               dd, dd->ipath_kregs->kr_partitionkey, pkey);
-
-                       ret = 0;
-                       goto bail;
-               }
-       }
-       ipath_dbg("port %u, all pkeys already in use 2nd pass, "
-                 "can't set %x\n", pd->port_port, key);
-       ret = -EBUSY;
-
-bail:
-       return ret;
-}
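
The partition-key table is four 16-bit entries packed into a single 64-bit register, with bit 15 of each entry acting as the full-membership bit, and duplicates are detected on the low 15 bits so a full and a limited form of the same key cannot coexist. A minimal sketch of the packing and the duplicate test; the names and table size here are illustrative, not the driver's:

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

#define NUM_PKEYS   4
#define FULL_MEMBER 0x8000u

/* Pack the four 16-bit partition keys into the 64-bit register layout. */
static uint64_t pack_pkeys(const uint16_t pkeys[NUM_PKEYS])
{
    return (uint64_t)pkeys[0]
         | ((uint64_t)pkeys[1] << 16)
         | ((uint64_t)pkeys[2] << 32)
         | ((uint64_t)pkeys[3] << 48);
}

/* Two keys collide if their low 15 bits match, regardless of the
 * full-membership bit: the full key would override the limited one. */
static bool pkey_conflicts(uint16_t a, uint16_t b)
{
    return (a & 0x7FFF) == (b & 0x7FFF);
}

int main(void)
{
    uint16_t pkeys[NUM_PKEYS] = { 0xFFFF, 0x8001, 0, 0 };
    printf("reg = 0x%016llx\n", (unsigned long long)pack_pkeys(pkeys));
    printf("0x0001 conflicts with 0x8001: %d\n",
           pkey_conflicts(0x0001, 0x8001));
    return 0;
}
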
-
-/**
- * ipath_manage_rcvq - manage a port's receive queue
- * @pd: the port
- * @subport: the subport
- * @start_stop: action to carry out
- *
- * start_stop == 0 disables receive on the port, for use in queue
- * overflow conditions.  start_stop==1 re-enables, to be used to
- * re-init the software copy of the head register
- */
-static int ipath_manage_rcvq(struct ipath_portdata *pd, unsigned subport,
-                            int start_stop)
-{
-       struct ipath_devdata *dd = pd->port_dd;
-
-       ipath_cdbg(PROC, "%sabling rcv for unit %u port %u:%u\n",
-                  start_stop ? "en" : "dis", dd->ipath_unit,
-                  pd->port_port, subport);
-       if (subport)
-               goto bail;
-       /* atomically clear receive enable port. */
-       if (start_stop) {
-               /*
-                * On enable, force in-memory copy of the tail register to
-                * 0, so that protocol code doesn't have to worry about
-                * whether or not the chip has yet updated the in-memory
-                * copy or not on return from the system call. The chip
-                * always resets its tail register back to 0 on a
-                * transition from disabled to enabled.  This could cause a
-                * problem if software was broken, and did the enable w/o
-                * the disable, but eventually the in-memory copy will be
-                * updated and correct itself, even in the face of software
-                * bugs.
-                */
-               if (pd->port_rcvhdrtail_kvaddr)
-                       ipath_clear_rcvhdrtail(pd);
-               set_bit(dd->ipath_r_portenable_shift + pd->port_port,
-                       &dd->ipath_rcvctrl);
-       } else
-               clear_bit(dd->ipath_r_portenable_shift + pd->port_port,
-                         &dd->ipath_rcvctrl);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
-                        dd->ipath_rcvctrl);
-       /* now be sure chip saw it before we return */
-       ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-       if (start_stop) {
-               /*
-                * And try to be sure that tail reg update has happened too.
-                * This should in theory interlock with the RXE changes to
-                * the tail register.  Don't assign it to the tail register
-                * in memory copy, since we could overwrite an update by the
-                * chip if we did.
-                */
-               ipath_read_ureg32(dd, ur_rcvhdrtail, pd->port_port);
-       }
-       /* always; new head should be equal to new tail; see above */
-bail:
-       return 0;
-}
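/*
 * Editor's note: not part of the driver.  A tiny standalone sketch of the
 * rcvctrl bit manipulation used above: each port's receive-enable is one
 * bit in a shadow copy of the register, at (portenable_shift + port).
 * The shift and port number below are made-up illustrative values.
 */
#include <stdio.h>

int main(void)
{
	unsigned long rcvctrl = 0;
	unsigned portenable_shift = 0;  /* assumed chip-specific shift */
	unsigned port = 3;              /* assumed port number */

	rcvctrl |= 1UL << (portenable_shift + port);    /* enable port 3 */
	printf("after enable:  0x%lx\n", rcvctrl);      /* 0x8 */
	rcvctrl &= ~(1UL << (portenable_shift + port)); /* disable port 3 */
	printf("after disable: 0x%lx\n", rcvctrl);      /* 0x0 */
	return 0;
}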
-
-static void ipath_clean_part_key(struct ipath_portdata *pd,
-                                struct ipath_devdata *dd)
-{
-       int i, j, pchanged = 0;
-       u64 oldpkey;
-
-       /* for debugging only */
-       oldpkey = (u64) dd->ipath_pkeys[0] |
-               ((u64) dd->ipath_pkeys[1] << 16) |
-               ((u64) dd->ipath_pkeys[2] << 32) |
-               ((u64) dd->ipath_pkeys[3] << 48);
-
-       for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) {
-               if (!pd->port_pkeys[i])
-                       continue;
-               ipath_cdbg(VERBOSE, "look for key[%d] %hx in pkeys\n", i,
-                          pd->port_pkeys[i]);
-               for (j = 0; j < ARRAY_SIZE(dd->ipath_pkeys); j++) {
-                       /* check for match independent of the global bit */
-                       if ((dd->ipath_pkeys[j] & 0x7fff) !=
-                           (pd->port_pkeys[i] & 0x7fff))
-                               continue;
-                       if (atomic_dec_and_test(&dd->ipath_pkeyrefs[j])) {
-                               ipath_cdbg(VERBOSE, "p%u clear key "
-                                          "%x matches #%d\n",
-                                          pd->port_port,
-                                          pd->port_pkeys[i], j);
-                               ipath_stats.sps_pkeys[j] =
-                                       dd->ipath_pkeys[j] = 0;
-                               pchanged++;
-                       }
-                       else ipath_cdbg(
-                               VERBOSE, "p%u key %x matches #%d, "
-                               "but ref still %d\n", pd->port_port,
-                               pd->port_pkeys[i], j,
-                               atomic_read(&dd->ipath_pkeyrefs[j]));
-                       break;
-               }
-               pd->port_pkeys[i] = 0;
-       }
-       if (pchanged) {
-               u64 pkey = (u64) dd->ipath_pkeys[0] |
-                       ((u64) dd->ipath_pkeys[1] << 16) |
-                       ((u64) dd->ipath_pkeys[2] << 32) |
-                       ((u64) dd->ipath_pkeys[3] << 48);
-               ipath_cdbg(VERBOSE, "p%u old pkey reg %llx, "
-                          "new pkey reg %llx\n", pd->port_port,
-                          (unsigned long long) oldpkey,
-                          (unsigned long long) pkey);
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_partitionkey,
-                                pkey);
-       }
-}
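/*
 * Editor's note: not part of the driver.  A minimal sketch of how the four
 * 16-bit partition keys are packed into the single 64-bit partition-key
 * register written above; the key values below are made up for illustration.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint16_t pkeys[4] = { 0xffff, 0x8001, 0x0000, 0x0000 }; /* assumed */
	uint64_t reg = (uint64_t) pkeys[0] |
		       ((uint64_t) pkeys[1] << 16) |
		       ((uint64_t) pkeys[2] << 32) |
		       ((uint64_t) pkeys[3] << 48);

	/* prints 0x000000008001ffff for the values above */
	printf("partition key register = 0x%016llx\n",
	       (unsigned long long) reg);
	return 0;
}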
-
-/*
- * Initialize the port data with the receive buffer sizes
- * so this can be done while the master port is locked.
- * Otherwise, there is a race with a slave opening the port
- * and seeing these fields uninitialized.
- */
-static void init_user_egr_sizes(struct ipath_portdata *pd)
-{
-       struct ipath_devdata *dd = pd->port_dd;
-       unsigned egrperchunk, egrcnt, size;
-
-       /*
-        * to avoid wasting a lot of memory, we allocate 32KB chunks of
-        * physically contiguous memory, advance through it until used up
-        * and then allocate more.  Of course, we need memory to store those
-        * extra pointers, now.  Started out with 256KB, but under heavy
-        * memory pressure (creating large files and then copying them over
-        * NFS while doing lots of MPI jobs), we hit some allocation
-        * failures, even though we can sleep...  (2.6.10) Still get
-        * failures at 64K.  32K is the lowest we can go without wasting
-        * additional memory.
-        */
-       size = 0x8000;
-       egrperchunk = size / dd->ipath_rcvegrbufsize;
-       egrcnt = dd->ipath_rcvegrcnt;
-       pd->port_rcvegrbuf_chunks = (egrcnt + egrperchunk - 1) / egrperchunk;
-       pd->port_rcvegrbufs_perchunk = egrperchunk;
-       pd->port_rcvegrbuf_size = size;
-}
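/*
 * Editor's note: not part of the driver.  A minimal, standalone sketch of
 * the chunking arithmetic above, assuming an illustrative eager buffer size
 * of 2048 bytes and 2048 eager TIDs per port (the real values come from
 * dd->ipath_rcvegrbufsize and dd->ipath_rcvegrcnt).
 */
#include <stdio.h>

int main(void)
{
	unsigned size = 0x8000;   /* 32KB physically contiguous chunk */
	unsigned egrsize = 2048;  /* assumed eager buffer size */
	unsigned egrcnt = 2048;   /* assumed eager TID count */
	unsigned egrperchunk = size / egrsize;                      /* 16 per chunk */
	unsigned chunks = (egrcnt + egrperchunk - 1) / egrperchunk; /* ceil = 128 */

	printf("%u buffers per 32KB chunk, %u chunks needed\n",
	       egrperchunk, chunks);
	return 0;
}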
-
-/**
- * ipath_create_user_egr - allocate eager TID buffers
- * @pd: the port to allocate TID buffers for
- *
- * This routine is now quite different for user and kernel, because
- * the kernel uses skbs for accelerated network performance.
- * This is the user port version.
- *
- * Allocate the eager TID buffers and program them into infinipath
- * They are no longer completely contiguous; we do multiple allocation
- * calls.
- */
-static int ipath_create_user_egr(struct ipath_portdata *pd)
-{
-       struct ipath_devdata *dd = pd->port_dd;
-       unsigned e, egrcnt, egrperchunk, chunk, egrsize, egroff;
-       size_t size;
-       int ret;
-       gfp_t gfp_flags;
-
-       /*
-        * GFP_USER, but without GFP_FS, so buffer cache can be
-        * coalesced (we hope); otherwise, even at order 4,
-        * heavy filesystem activity makes these fail, and we can
-        * use compound pages.
-        */
-       gfp_flags = __GFP_WAIT | __GFP_IO | __GFP_COMP;
-
-       egrcnt = dd->ipath_rcvegrcnt;
-       /* TID number offset for this port */
-       egroff = (pd->port_port - 1) * egrcnt + dd->ipath_p0_rcvegrcnt;
-       egrsize = dd->ipath_rcvegrbufsize;
-       ipath_cdbg(VERBOSE, "Allocating %d egr buffers, at egrtid "
-                  "offset %x, egrsize %u\n", egrcnt, egroff, egrsize);
-
-       chunk = pd->port_rcvegrbuf_chunks;
-       egrperchunk = pd->port_rcvegrbufs_perchunk;
-       size = pd->port_rcvegrbuf_size;
-       pd->port_rcvegrbuf = kmalloc(chunk * sizeof(pd->port_rcvegrbuf[0]),
-                                    GFP_KERNEL);
-       if (!pd->port_rcvegrbuf) {
-               ret = -ENOMEM;
-               goto bail;
-       }
-       pd->port_rcvegrbuf_phys =
-               kmalloc(chunk * sizeof(pd->port_rcvegrbuf_phys[0]),
-                       GFP_KERNEL);
-       if (!pd->port_rcvegrbuf_phys) {
-               ret = -ENOMEM;
-               goto bail_rcvegrbuf;
-       }
-       for (e = 0; e < pd->port_rcvegrbuf_chunks; e++) {
-
-               pd->port_rcvegrbuf[e] = dma_alloc_coherent(
-                       &dd->pcidev->dev, size, &pd->port_rcvegrbuf_phys[e],
-                       gfp_flags);
-
-               if (!pd->port_rcvegrbuf[e]) {
-                       ret = -ENOMEM;
-                       goto bail_rcvegrbuf_phys;
-               }
-       }
-
-       pd->port_rcvegr_phys = pd->port_rcvegrbuf_phys[0];
-
-       for (e = chunk = 0; chunk < pd->port_rcvegrbuf_chunks; chunk++) {
-               dma_addr_t pa = pd->port_rcvegrbuf_phys[chunk];
-               unsigned i;
-
-               for (i = 0; e < egrcnt && i < egrperchunk; e++, i++) {
-                       dd->ipath_f_put_tid(dd, e + egroff +
-                                           (u64 __iomem *)
-                                           ((char __iomem *)
-                                            dd->ipath_kregbase +
-                                            dd->ipath_rcvegrbase),
-                                           RCVHQ_RCV_TYPE_EAGER, pa);
-                       pa += egrsize;
-               }
-               cond_resched(); /* don't hog the cpu */
-       }
-
-       ret = 0;
-       goto bail;
-
-bail_rcvegrbuf_phys:
-       for (e = 0; e < pd->port_rcvegrbuf_chunks &&
-               pd->port_rcvegrbuf[e]; e++) {
-               dma_free_coherent(&dd->pcidev->dev, size,
-                                 pd->port_rcvegrbuf[e],
-                                 pd->port_rcvegrbuf_phys[e]);
-
-       }
-       kfree(pd->port_rcvegrbuf_phys);
-       pd->port_rcvegrbuf_phys = NULL;
-bail_rcvegrbuf:
-       kfree(pd->port_rcvegrbuf);
-       pd->port_rcvegrbuf = NULL;
-bail:
-       return ret;
-}
-
-
-/* common code for the mappings on dma_alloc_coherent mem */
-static int ipath_mmap_mem(struct vm_area_struct *vma,
-       struct ipath_portdata *pd, unsigned len, int write_ok,
-       void *kvaddr, char *what)
-{
-       struct ipath_devdata *dd = pd->port_dd;
-       unsigned long pfn;
-       int ret;
-
-       if ((vma->vm_end - vma->vm_start) > len) {
-               dev_info(&dd->pcidev->dev,
-                        "FAIL on %s: len %lx > %x\n", what,
-                        vma->vm_end - vma->vm_start, len);
-               ret = -EFAULT;
-               goto bail;
-       }
-
-       if (!write_ok) {
-               if (vma->vm_flags & VM_WRITE) {
-                       dev_info(&dd->pcidev->dev,
-                                "%s must be mapped readonly\n", what);
-                       ret = -EPERM;
-                       goto bail;
-               }
-
-               /* don't allow them to later change with mprotect */
-               vma->vm_flags &= ~VM_MAYWRITE;
-       }
-
-       pfn = virt_to_phys(kvaddr) >> PAGE_SHIFT;
-       ret = remap_pfn_range(vma, vma->vm_start, pfn,
-                             len, vma->vm_page_prot);
-       if (ret)
-               dev_info(&dd->pcidev->dev, "%s port%u mmap of %lx, %x "
-                        "bytes r%c failed: %d\n", what, pd->port_port,
-                        pfn, len, write_ok?'w':'o', ret);
-       else
-               ipath_cdbg(VERBOSE, "%s port%u mmaped %lx, %x bytes "
-                          "r%c\n", what, pd->port_port, pfn, len,
-                          write_ok?'w':'o');
-bail:
-       return ret;
-}
-
-static int mmap_ureg(struct vm_area_struct *vma, struct ipath_devdata *dd,
-                    u64 ureg)
-{
-       unsigned long phys;
-       int ret;
-
-       /*
-        * This is real hardware, so use io_remap.  This is the mechanism
-        * for the user process to update the head registers for their port
-        * in the chip.
-        */
-       if ((vma->vm_end - vma->vm_start) > PAGE_SIZE) {
-               dev_info(&dd->pcidev->dev, "FAIL mmap userreg: reqlen "
-                        "%lx > PAGE\n", vma->vm_end - vma->vm_start);
-               ret = -EFAULT;
-       } else {
-               phys = dd->ipath_physaddr + ureg;
-               vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-
-               vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
-               ret = io_remap_pfn_range(vma, vma->vm_start,
-                                        phys >> PAGE_SHIFT,
-                                        vma->vm_end - vma->vm_start,
-                                        vma->vm_page_prot);
-       }
-       return ret;
-}
-
-static int mmap_piobufs(struct vm_area_struct *vma,
-                       struct ipath_devdata *dd,
-                       struct ipath_portdata *pd,
-                       unsigned piobufs, unsigned piocnt)
-{
-       unsigned long phys;
-       int ret;
-
-       /*
-        * When we map the PIO buffers in the chip, we want to map them as
-        * writeonly, no read possible.   This prevents access to previous
-        * process data, and catches users who might try to read the i/o
-        * space due to a bug.
-        */
-       if ((vma->vm_end - vma->vm_start) > (piocnt * dd->ipath_palign)) {
-               dev_info(&dd->pcidev->dev, "FAIL mmap piobufs: "
-                        "reqlen %lx > PAGE\n",
-                        vma->vm_end - vma->vm_start);
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       phys = dd->ipath_physaddr + piobufs;
-
-#if defined(__powerpc__)
-       /* There isn't a generic way to specify writethrough mappings */
-       pgprot_val(vma->vm_page_prot) |= _PAGE_NO_CACHE;
-       pgprot_val(vma->vm_page_prot) |= _PAGE_WRITETHRU;
-       pgprot_val(vma->vm_page_prot) &= ~_PAGE_GUARDED;
-#endif
-
-       /*
-        * don't allow them to later change to readable with mprotect (for when
-        * not initially mapped readable, as is normally the case)
-        */
-       vma->vm_flags &= ~VM_MAYREAD;
-       vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
-
-       ret = io_remap_pfn_range(vma, vma->vm_start, phys >> PAGE_SHIFT,
-                                vma->vm_end - vma->vm_start,
-                                vma->vm_page_prot);
-bail:
-       return ret;
-}
-
-static int mmap_rcvegrbufs(struct vm_area_struct *vma,
-                          struct ipath_portdata *pd)
-{
-       struct ipath_devdata *dd = pd->port_dd;
-       unsigned long start, size;
-       size_t total_size, i;
-       unsigned long pfn;
-       int ret;
-
-       size = pd->port_rcvegrbuf_size;
-       total_size = pd->port_rcvegrbuf_chunks * size;
-       if ((vma->vm_end - vma->vm_start) > total_size) {
-               dev_info(&dd->pcidev->dev, "FAIL on egr bufs: "
-                        "reqlen %lx > actual %lx\n",
-                        vma->vm_end - vma->vm_start,
-                        (unsigned long) total_size);
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       if (vma->vm_flags & VM_WRITE) {
-               dev_info(&dd->pcidev->dev, "Can't map eager buffers as "
-                        "writable (flags=%lx)\n", vma->vm_flags);
-               ret = -EPERM;
-               goto bail;
-       }
-       /* don't allow them to later change to writeable with mprotect */
-       vma->vm_flags &= ~VM_MAYWRITE;
-
-       start = vma->vm_start;
-
-       for (i = 0; i < pd->port_rcvegrbuf_chunks; i++, start += size) {
-               pfn = virt_to_phys(pd->port_rcvegrbuf[i]) >> PAGE_SHIFT;
-               ret = remap_pfn_range(vma, start, pfn, size,
-                                     vma->vm_page_prot);
-               if (ret < 0)
-                       goto bail;
-       }
-       ret = 0;
-
-bail:
-       return ret;
-}
-
-/*
- * ipath_file_vma_fault - handle a VMA page fault.
- */
-static int ipath_file_vma_fault(struct vm_area_struct *vma,
-                                       struct vm_fault *vmf)
-{
-       struct page *page;
-
-       page = vmalloc_to_page((void *)(vmf->pgoff << PAGE_SHIFT));
-       if (!page)
-               return VM_FAULT_SIGBUS;
-       get_page(page);
-       vmf->page = page;
-
-       return 0;
-}
-
-static const struct vm_operations_struct ipath_file_vm_ops = {
-       .fault = ipath_file_vma_fault,
-};
-
-static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr,
-                      struct ipath_portdata *pd, unsigned subport)
-{
-       unsigned long len;
-       struct ipath_devdata *dd;
-       void *addr;
-       size_t size;
-       int ret = 0;
-
-       /* If the port is not shared, all addresses should be physical */
-       if (!pd->port_subport_cnt)
-               goto bail;
-
-       dd = pd->port_dd;
-       size = pd->port_rcvegrbuf_chunks * pd->port_rcvegrbuf_size;
-
-       /*
-        * Each process has all the subport uregbase, rcvhdrq, and
-        * rcvegrbufs mmapped - as an array for all the processes,
-        * and also separately for this process.
-        */
-       if (pgaddr == cvt_kvaddr(pd->subport_uregbase)) {
-               addr = pd->subport_uregbase;
-               size = PAGE_SIZE * pd->port_subport_cnt;
-       } else if (pgaddr == cvt_kvaddr(pd->subport_rcvhdr_base)) {
-               addr = pd->subport_rcvhdr_base;
-               size = pd->port_rcvhdrq_size * pd->port_subport_cnt;
-       } else if (pgaddr == cvt_kvaddr(pd->subport_rcvegrbuf)) {
-               addr = pd->subport_rcvegrbuf;
-               size *= pd->port_subport_cnt;
-        } else if (pgaddr == cvt_kvaddr(pd->subport_uregbase +
-                                        PAGE_SIZE * subport)) {
-                addr = pd->subport_uregbase + PAGE_SIZE * subport;
-                size = PAGE_SIZE;
-        } else if (pgaddr == cvt_kvaddr(pd->subport_rcvhdr_base +
-                                pd->port_rcvhdrq_size * subport)) {
-                addr = pd->subport_rcvhdr_base +
-                        pd->port_rcvhdrq_size * subport;
-                size = pd->port_rcvhdrq_size;
-        } else if (pgaddr == cvt_kvaddr(pd->subport_rcvegrbuf +
-                               size * subport)) {
-                addr = pd->subport_rcvegrbuf + size * subport;
-                /* rcvegrbufs are read-only on the slave */
-                if (vma->vm_flags & VM_WRITE) {
-                        dev_info(&dd->pcidev->dev,
-                                 "Can't map eager buffers as "
-                                 "writable (flags=%lx)\n", vma->vm_flags);
-                        ret = -EPERM;
-                        goto bail;
-                }
-                /*
-                 * Don't allow permission to later change to writeable
-                 * with mprotect.
-                 */
-                vma->vm_flags &= ~VM_MAYWRITE;
-       } else {
-               goto bail;
-       }
-       len = vma->vm_end - vma->vm_start;
-       if (len > size) {
-               ipath_cdbg(MM, "FAIL: reqlen %lx > %zx\n", len, size);
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       vma->vm_pgoff = (unsigned long) addr >> PAGE_SHIFT;
-       vma->vm_ops = &ipath_file_vm_ops;
-       vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
-       ret = 1;
-
-bail:
-       return ret;
-}
-
-/**
- * ipath_mmap - mmap various structures into user space
- * @fp: the file pointer
- * @vma: the VM area
- *
- * We use this to have a shared buffer between the kernel and the user code
- * for the rcvhdr queue, egr buffers, and the per-port user regs and pio
- * buffers in the chip.  We have the open and close entries so we can bump
- * the ref count and keep the driver from being unloaded while still mapped.
- */
-static int ipath_mmap(struct file *fp, struct vm_area_struct *vma)
-{
-       struct ipath_portdata *pd;
-       struct ipath_devdata *dd;
-       u64 pgaddr, ureg;
-       unsigned piobufs, piocnt;
-       int ret;
-
-       pd = port_fp(fp);
-       if (!pd) {
-               ret = -EINVAL;
-               goto bail;
-       }
-       dd = pd->port_dd;
-
-       /*
-        * This is the ipath_do_user_init() code, mapping the shared buffers
-        * into the user process. The address referred to by vm_pgoff is the
-        * file offset passed via mmap().  For shared ports, this is the
-        * kernel vmalloc() address of the pages to share with the master.
-        * For non-shared or master ports, this is a physical address.
-        * We only do one mmap for each space mapped.
-        */
-       pgaddr = vma->vm_pgoff << PAGE_SHIFT;
-
-       /*
-        * Check for 0 in case one of the allocations failed, but user
-        * called mmap anyway.
-        */
-       if (!pgaddr)  {
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       ipath_cdbg(MM, "pgaddr %llx vm_start=%lx len %lx port %u:%u:%u\n",
-                  (unsigned long long) pgaddr, vma->vm_start,
-                  vma->vm_end - vma->vm_start, dd->ipath_unit,
-                  pd->port_port, subport_fp(fp));
-
-       /*
-        * Physical addresses must fit in 40 bits for our hardware.
-        * Check for kernel virtual addresses first, anything else must
-        * match a HW or memory address.
-        */
-       ret = mmap_kvaddr(vma, pgaddr, pd, subport_fp(fp));
-       if (ret) {
-               if (ret > 0)
-                       ret = 0;
-               goto bail;
-       }
-
-       ureg = dd->ipath_uregbase + dd->ipath_ureg_align * pd->port_port;
-       if (!pd->port_subport_cnt) {
-               /* port is not shared */
-               piocnt = pd->port_piocnt;
-               piobufs = pd->port_piobufs;
-       } else if (!subport_fp(fp)) {
-               /* caller is the master */
-               piocnt = (pd->port_piocnt / pd->port_subport_cnt) +
-                        (pd->port_piocnt % pd->port_subport_cnt);
-               piobufs = pd->port_piobufs +
-                       dd->ipath_palign * (pd->port_piocnt - piocnt);
-       } else {
-               unsigned slave = subport_fp(fp) - 1;
-
-               /* caller is a slave */
-               piocnt = pd->port_piocnt / pd->port_subport_cnt;
-               piobufs = pd->port_piobufs + dd->ipath_palign * piocnt * slave;
-       }
-
-       if (pgaddr == ureg)
-               ret = mmap_ureg(vma, dd, ureg);
-       else if (pgaddr == piobufs)
-               ret = mmap_piobufs(vma, dd, pd, piobufs, piocnt);
-       else if (pgaddr == dd->ipath_pioavailregs_phys)
-               /* in-memory copy of pioavail registers */
-               ret = ipath_mmap_mem(vma, pd, PAGE_SIZE, 0,
-                                    (void *) dd->ipath_pioavailregs_dma,
-                                    "pioavail registers");
-       else if (pgaddr == pd->port_rcvegr_phys)
-               ret = mmap_rcvegrbufs(vma, pd);
-       else if (pgaddr == (u64) pd->port_rcvhdrq_phys)
-               /*
-                * The rcvhdrq itself; readonly except on HT (so have
-                * to allow writable mapping), multiple pages, contiguous
-                * from an i/o perspective.
-                */
-               ret = ipath_mmap_mem(vma, pd, pd->port_rcvhdrq_size, 1,
-                                    pd->port_rcvhdrq,
-                                    "rcvhdrq");
-       else if (pgaddr == (u64) pd->port_rcvhdrqtailaddr_phys)
-               /* in-memory copy of rcvhdrq tail register */
-               ret = ipath_mmap_mem(vma, pd, PAGE_SIZE, 0,
-                                    pd->port_rcvhdrtail_kvaddr,
-                                    "rcvhdrq tail");
-       else
-               ret = -EINVAL;
-
-       vma->vm_private_data = NULL;
-
-       if (ret < 0)
-               dev_info(&dd->pcidev->dev,
-                        "Failure %d on off %llx len %lx\n",
-                        -ret, (unsigned long long)pgaddr,
-                        vma->vm_end - vma->vm_start);
-bail:
-       return ret;
-}
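/*
 * Editor's note: not part of the driver.  A worked example of the PIO
 * buffer split used for a shared port in ipath_mmap() above, assuming
 * (for illustration) 18 PIO buffers on the port and a subport count of 4
 * (one master plus three slaves): the master gets the quotient plus the
 * remainder, each slave gets the quotient.
 */
#include <stdio.h>

int main(void)
{
	unsigned port_piocnt = 18;   /* assumed pd->port_piocnt */
	unsigned subport_cnt = 4;    /* assumed pd->port_subport_cnt */
	unsigned master = port_piocnt / subport_cnt +
			  port_piocnt % subport_cnt;  /* 4 + 2 = 6 */
	unsigned slave = port_piocnt / subport_cnt;   /* 4 each */

	/* 6 + 3 * 4 accounts for all 18 buffers */
	printf("master: %u buffers, each slave: %u buffers\n", master, slave);
	return 0;
}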
-
-static unsigned ipath_poll_hdrqfull(struct ipath_portdata *pd)
-{
-       unsigned pollflag = 0;
-
-       if ((pd->poll_type & IPATH_POLL_TYPE_OVERFLOW) &&
-           pd->port_hdrqfull != pd->port_hdrqfull_poll) {
-               pollflag |= POLLIN | POLLRDNORM;
-               pd->port_hdrqfull_poll = pd->port_hdrqfull;
-       }
-
-       return pollflag;
-}
-
-static unsigned int ipath_poll_urgent(struct ipath_portdata *pd,
-                                     struct file *fp,
-                                     struct poll_table_struct *pt)
-{
-       unsigned pollflag = 0;
-       struct ipath_devdata *dd;
-
-       dd = pd->port_dd;
-
-       /* variable access in ipath_poll_hdrqfull() needs this */
-       rmb();
-       pollflag = ipath_poll_hdrqfull(pd);
-
-       if (pd->port_urgent != pd->port_urgent_poll) {
-               pollflag |= POLLIN | POLLRDNORM;
-               pd->port_urgent_poll = pd->port_urgent;
-       }
-
-       if (!pollflag) {
-               /* this saves a spin_lock/unlock in interrupt handler... */
-               set_bit(IPATH_PORT_WAITING_URG, &pd->port_flag);
-               /* flush waiting flag so don't miss an event... */
-               wmb();
-               poll_wait(fp, &pd->port_wait, pt);
-       }
-
-       return pollflag;
-}
-
-static unsigned int ipath_poll_next(struct ipath_portdata *pd,
-                                   struct file *fp,
-                                   struct poll_table_struct *pt)
-{
-       u32 head;
-       u32 tail;
-       unsigned pollflag = 0;
-       struct ipath_devdata *dd;
-
-       dd = pd->port_dd;
-
-       /* variable access in ipath_poll_hdrqfull() needs this */
-       rmb();
-       pollflag = ipath_poll_hdrqfull(pd);
-
-       head = ipath_read_ureg32(dd, ur_rcvhdrhead, pd->port_port);
-       if (pd->port_rcvhdrtail_kvaddr)
-               tail = ipath_get_rcvhdrtail(pd);
-       else
-               tail = ipath_read_ureg32(dd, ur_rcvhdrtail, pd->port_port);
-
-       if (head != tail)
-               pollflag |= POLLIN | POLLRDNORM;
-       else {
-               /* this saves a spin_lock/unlock in interrupt handler */
-               set_bit(IPATH_PORT_WAITING_RCV, &pd->port_flag);
-               /* flush waiting flag so we don't miss an event */
-               wmb();
-
-               set_bit(pd->port_port + dd->ipath_r_intravail_shift,
-                       &dd->ipath_rcvctrl);
-
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
-                                dd->ipath_rcvctrl);
-
-               if (dd->ipath_rhdrhead_intr_off) /* arm rcv interrupt */
-                       ipath_write_ureg(dd, ur_rcvhdrhead,
-                                        dd->ipath_rhdrhead_intr_off | head,
-                                        pd->port_port);
-
-               poll_wait(fp, &pd->port_wait, pt);
-       }
-
-       return pollflag;
-}
-
-static unsigned int ipath_poll(struct file *fp,
-                              struct poll_table_struct *pt)
-{
-       struct ipath_portdata *pd;
-       unsigned pollflag;
-
-       pd = port_fp(fp);
-       if (!pd)
-               pollflag = 0;
-       else if (pd->poll_type & IPATH_POLL_TYPE_URGENT)
-               pollflag = ipath_poll_urgent(pd, fp, pt);
-       else
-               pollflag = ipath_poll_next(pd, fp, pt);
-
-       return pollflag;
-}
-
-static int ipath_supports_subports(int user_swmajor, int user_swminor)
-{
-       /* no subport implementation prior to software version 1.3 */
-       return (user_swmajor > 1) || (user_swminor >= 3);
-}
-
-static int ipath_compatible_subports(int user_swmajor, int user_swminor)
-{
-       /* this code is written long-hand for clarity */
-       if (IPATH_USER_SWMAJOR != user_swmajor) {
-               /* no promise of compatibility if major mismatch */
-               return 0;
-       }
-       if (IPATH_USER_SWMAJOR == 1) {
-               switch (IPATH_USER_SWMINOR) {
-               case 0:
-               case 1:
-               case 2:
-                       /* no subport implementation so cannot be compatible */
-                       return 0;
-               case 3:
-                       /* 3 is only compatible with itself */
-                       return user_swminor == 3;
-               default:
-                       /* >= 4 are compatible (or are expected to be) */
-                       return user_swminor >= 4;
-               }
-       }
-       /* make no promises yet for future major versions */
-       return 0;
-}
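/*
 * Editor's note: not part of the driver.  A standalone restatement of the
 * compatibility rule above, assuming (for illustration only) a driver user
 * ABI of 1.6; the real constants are IPATH_USER_SWMAJOR/IPATH_USER_SWMINOR.
 */
#include <stdio.h>

#define DRV_SWMAJOR 1   /* assumed driver major */
#define DRV_SWMINOR 6   /* assumed driver minor */

static int compatible_subports(int user_swmajor, int user_swminor)
{
	if (DRV_SWMAJOR != user_swmajor)
		return 0;                  /* major mismatch: never compatible */
	if (DRV_SWMINOR <= 2)
		return 0;                  /* no subport support before 1.3 */
	if (DRV_SWMINOR == 3)
		return user_swminor == 3;  /* 1.3 only pairs with itself */
	return user_swminor >= 4;          /* 1.4+ pairs with any 1.4+ library */
}

int main(void)
{
	printf("1.3 library: %d, 1.4 library: %d, 2.0 library: %d\n",
	       compatible_subports(1, 3),   /* 0 */
	       compatible_subports(1, 4),   /* 1 */
	       compatible_subports(2, 0));  /* 0 */
	return 0;
}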
-
-static int init_subports(struct ipath_devdata *dd,
-                        struct ipath_portdata *pd,
-                        const struct ipath_user_info *uinfo)
-{
-       int ret = 0;
-       unsigned num_subports;
-       size_t size;
-
-       /*
-        * If the user is requesting zero subports,
-        * skip the subport allocation.
-        */
-       if (uinfo->spu_subport_cnt <= 0)
-               goto bail;
-
-       /* Self-consistency check for ipath_compatible_subports() */
-       if (ipath_supports_subports(IPATH_USER_SWMAJOR, IPATH_USER_SWMINOR) &&
-           !ipath_compatible_subports(IPATH_USER_SWMAJOR,
-                                      IPATH_USER_SWMINOR)) {
-               dev_info(&dd->pcidev->dev,
-                        "Inconsistent ipath_compatible_subports()\n");
-               goto bail;
-       }
-
-       /* Check for subport compatibility */
-       if (!ipath_compatible_subports(uinfo->spu_userversion >> 16,
-                                      uinfo->spu_userversion & 0xffff)) {
-               dev_info(&dd->pcidev->dev,
-                        "Mismatched user version (%d.%d) and driver "
-                        "version (%d.%d) while port sharing. Ensure "
-                        "that driver and library are from the same "
-                        "release.\n",
-                        (int) (uinfo->spu_userversion >> 16),
-                        (int) (uinfo->spu_userversion & 0xffff),
-                        IPATH_USER_SWMAJOR,
-                        IPATH_USER_SWMINOR);
-               goto bail;
-       }
-       if (uinfo->spu_subport_cnt > INFINIPATH_MAX_SUBPORT) {
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       num_subports = uinfo->spu_subport_cnt;
-       pd->subport_uregbase = vzalloc(PAGE_SIZE * num_subports);
-       if (!pd->subport_uregbase) {
-               ret = -ENOMEM;
-               goto bail;
-       }
-       /* Note: pd->port_rcvhdrq_size isn't initialized yet. */
-       size = ALIGN(dd->ipath_rcvhdrcnt * dd->ipath_rcvhdrentsize *
-                    sizeof(u32), PAGE_SIZE) * num_subports;
-       pd->subport_rcvhdr_base = vzalloc(size);
-       if (!pd->subport_rcvhdr_base) {
-               ret = -ENOMEM;
-               goto bail_ureg;
-       }
-
-       pd->subport_rcvegrbuf = vzalloc(pd->port_rcvegrbuf_chunks *
-                                       pd->port_rcvegrbuf_size *
-                                       num_subports);
-       if (!pd->subport_rcvegrbuf) {
-               ret = -ENOMEM;
-               goto bail_rhdr;
-       }
-
-       pd->port_subport_cnt = uinfo->spu_subport_cnt;
-       pd->port_subport_id = uinfo->spu_subport_id;
-       pd->active_slaves = 1;
-       set_bit(IPATH_PORT_MASTER_UNINIT, &pd->port_flag);
-       goto bail;
-
-bail_rhdr:
-       vfree(pd->subport_rcvhdr_base);
-bail_ureg:
-       vfree(pd->subport_uregbase);
-       pd->subport_uregbase = NULL;
-bail:
-       return ret;
-}
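/*
 * Editor's note: not part of the driver.  A sketch of the per-subport
 * shared memory sizing done in init_subports() above, with illustrative
 * numbers (4KB pages, 128 rcvhdr entries of 16 32-bit words, 128 eager
 * chunks of 32KB, 4 subports); the real values come from
 * dd->ipath_rcvhdrcnt, dd->ipath_rcvhdrentsize and init_user_egr_sizes().
 */
#include <stdio.h>
#include <stdint.h>

#define EX_PAGE_SIZE 4096UL
#define EX_ALIGN(x, a) ((((x) + (a) - 1) / (a)) * (a))

int main(void)
{
	unsigned long num_subports = 4;
	unsigned long rcvhdrcnt = 128, rcvhdrentsize = 16;  /* 32-bit words */
	unsigned long chunks = 128, chunk_size = 0x8000;

	unsigned long uregbase = EX_PAGE_SIZE * num_subports;
	unsigned long rcvhdr = EX_ALIGN(rcvhdrcnt * rcvhdrentsize *
					sizeof(uint32_t), EX_PAGE_SIZE) *
			       num_subports;
	unsigned long rcvegr = chunks * chunk_size * num_subports;

	printf("uregbase %lu, rcvhdr %lu, rcvegrbuf %lu bytes\n",
	       uregbase, rcvhdr, rcvegr);
	return 0;
}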
-
-static int try_alloc_port(struct ipath_devdata *dd, int port,
-                         struct file *fp,
-                         const struct ipath_user_info *uinfo)
-{
-       struct ipath_portdata *pd;
-       int ret;
-
-       if (!(pd = dd->ipath_pd[port])) {
-               void *ptmp;
-
-               pd = kzalloc(sizeof(struct ipath_portdata), GFP_KERNEL);
-
-               /*
-                * Allocate memory for use in ipath_tid_update() just once
-                * at open, not per call.  Reduces cost of expected send
-                * setup.
-                */
-               ptmp = kmalloc(dd->ipath_rcvtidcnt * sizeof(u16) +
-                              dd->ipath_rcvtidcnt * sizeof(struct page **),
-                              GFP_KERNEL);
-               if (!pd || !ptmp) {
-                       ipath_dev_err(dd, "Unable to allocate portdata "
-                                     "memory, failing open\n");
-                       ret = -ENOMEM;
-                       kfree(pd);
-                       kfree(ptmp);
-                       goto bail;
-               }
-               dd->ipath_pd[port] = pd;
-               dd->ipath_pd[port]->port_port = port;
-               dd->ipath_pd[port]->port_dd = dd;
-               dd->ipath_pd[port]->port_tid_pg_list = ptmp;
-               init_waitqueue_head(&dd->ipath_pd[port]->port_wait);
-       }
-       if (!pd->port_cnt) {
-               pd->userversion = uinfo->spu_userversion;
-               init_user_egr_sizes(pd);
-               if ((ret = init_subports(dd, pd, uinfo)) != 0)
-                       goto bail;
-               ipath_cdbg(PROC, "%s[%u] opened unit:port %u:%u\n",
-                          current->comm, current->pid, dd->ipath_unit,
-                          port);
-               pd->port_cnt = 1;
-               port_fp(fp) = pd;
-               pd->port_pid = get_pid(task_pid(current));
-               strlcpy(pd->port_comm, current->comm, sizeof(pd->port_comm));
-               ipath_stats.sps_ports++;
-               ret = 0;
-       } else
-               ret = -EBUSY;
-
-bail:
-       return ret;
-}
-
-static inline int usable(struct ipath_devdata *dd)
-{
-       return dd &&
-               (dd->ipath_flags & IPATH_PRESENT) &&
-               dd->ipath_kregbase &&
-               dd->ipath_lid &&
-               !(dd->ipath_flags & (IPATH_LINKDOWN | IPATH_DISABLED
-                                    | IPATH_LINKUNK));
-}
-
-static int find_free_port(int unit, struct file *fp,
-                         const struct ipath_user_info *uinfo)
-{
-       struct ipath_devdata *dd = ipath_lookup(unit);
-       int ret, i;
-
-       if (!dd) {
-               ret = -ENODEV;
-               goto bail;
-       }
-
-       if (!usable(dd)) {
-               ret = -ENETDOWN;
-               goto bail;
-       }
-
-       for (i = 1; i < dd->ipath_cfgports; i++) {
-               ret = try_alloc_port(dd, i, fp, uinfo);
-               if (ret != -EBUSY)
-                       goto bail;
-       }
-       ret = -EBUSY;
-
-bail:
-       return ret;
-}
-
-static int find_best_unit(struct file *fp,
-                         const struct ipath_user_info *uinfo)
-{
-       int ret = 0, i, prefunit = -1, devmax;
-       int maxofallports, npresent, nup;
-       int ndev;
-
-       devmax = ipath_count_units(&npresent, &nup, &maxofallports);
-
-       /*
-        * This code is present to allow a knowledgeable person to
-        * specify the layout of processes to processors before opening
-        * this driver, and then we'll assign the process to the "closest"
-        * InfiniPath chip to that processor (we assume reasonable connectivity,
-        * for now).  This code assumes that if affinity has been set
-        * before this point, at most one cpu is set; for now this
-        * is reasonable.  I check for both cpumask_empty() and cpumask_full(),
-        * in case some kernel variant sets none of the bits when no
-        * affinity is set.  2.6.11 and 12 kernels have all present
-        * cpus set.  Some day we'll have to fix it up further to handle
-        * a cpu subset.  This algorithm fails for two HT chips connected
-        * in tunnel fashion.  Eventually this needs real topology
-        * information.  There may be some issues with dual core numbering
-        * as well.  This needs more work prior to release.
-        */
-       if (!cpumask_empty(tsk_cpus_allowed(current)) &&
-           !cpumask_full(tsk_cpus_allowed(current))) {
-               int ncpus = num_online_cpus(), curcpu = -1, nset = 0;
-               get_online_cpus();
-               for_each_online_cpu(i)
-                       if (cpumask_test_cpu(i, tsk_cpus_allowed(current))) {
-                               ipath_cdbg(PROC, "%s[%u] affinity set for "
-                                          "cpu %d/%d\n", current->comm,
-                                          current->pid, i, ncpus);
-                               curcpu = i;
-                               nset++;
-                       }
-               put_online_cpus();
-               if (curcpu != -1 && nset != ncpus) {
-                       if (npresent) {
-                               prefunit = curcpu / (ncpus / npresent);
-                               ipath_cdbg(PROC,"%s[%u] %d chips, %d cpus, "
-                                         "%d cpus/chip, select unit %d\n",
-                                         current->comm, current->pid,
-                                         npresent, ncpus, ncpus / npresent,
-                                         prefunit);
-                       }
-               }
-       }
-
-       /*
-        * user ports start at 1, kernel port is 0
-        * For now, we do round-robin access across all chips
-        */
-
-       if (prefunit != -1)
-               devmax = prefunit + 1;
-recheck:
-       for (i = 1; i < maxofallports; i++) {
-               for (ndev = prefunit != -1 ? prefunit : 0; ndev < devmax;
-                    ndev++) {
-                       struct ipath_devdata *dd = ipath_lookup(ndev);
-
-                       if (!usable(dd))
-                               continue; /* can't use this unit */
-                       if (i >= dd->ipath_cfgports)
-                               /*
-                                * Maxed out on users of this unit. Try
-                                * next.
-                                */
-                               continue;
-                       ret = try_alloc_port(dd, i, fp, uinfo);
-                       if (!ret)
-                               goto done;
-               }
-       }
-
-       if (npresent) {
-               if (nup == 0) {
-                       ret = -ENETDOWN;
-                       ipath_dbg("No ports available (none initialized "
-                                 "and ready)\n");
-               } else {
-                       if (prefunit > 0) {
-                               /* if started above 0, retry from 0 */
-                               ipath_cdbg(PROC,
-                                          "%s[%u] no ports on prefunit "
-                                          "%d, clear and re-check\n",
-                                          current->comm, current->pid,
-                                          prefunit);
-                               devmax = ipath_count_units(NULL, NULL,
-                                                          NULL);
-                               prefunit = -1;
-                               goto recheck;
-                       }
-                       ret = -EBUSY;
-                       ipath_dbg("No ports available\n");
-               }
-       } else {
-               ret = -ENXIO;
-               ipath_dbg("No boards found\n");
-       }
-
-done:
-       return ret;
-}
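/*
 * Editor's note: not part of the driver.  A worked example of the CPU
 * affinity heuristic in find_best_unit() above, assuming (for illustration)
 * 8 online CPUs, 2 InfiniPath chips present, and a process pinned to CPU 5:
 * the preferred unit is curcpu / (ncpus / npresent).
 */
#include <stdio.h>

int main(void)
{
	int ncpus = 8;      /* assumed num_online_cpus() */
	int npresent = 2;   /* assumed chips present */
	int curcpu = 5;     /* assumed single CPU in the affinity mask */
	int prefunit = curcpu / (ncpus / npresent);  /* 5 / 4 = unit 1 */

	printf("preferred unit: %d\n", prefunit);
	return 0;
}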
-
-static int find_shared_port(struct file *fp,
-                           const struct ipath_user_info *uinfo)
-{
-       int devmax, ndev, i;
-       int ret = 0;
-
-       devmax = ipath_count_units(NULL, NULL, NULL);
-
-       for (ndev = 0; ndev < devmax; ndev++) {
-               struct ipath_devdata *dd = ipath_lookup(ndev);
-
-               if (!usable(dd))
-                       continue;
-               for (i = 1; i < dd->ipath_cfgports; i++) {
-                       struct ipath_portdata *pd = dd->ipath_pd[i];
-
-                       /* Skip ports which are not yet open */
-                       if (!pd || !pd->port_cnt)
-                               continue;
-                       /* Skip port if it doesn't match the requested one */
-                       if (pd->port_subport_id != uinfo->spu_subport_id)
-                               continue;
-                       /* Verify the sharing process matches the master */
-                       if (pd->port_subport_cnt != uinfo->spu_subport_cnt ||
-                           pd->userversion != uinfo->spu_userversion ||
-                           pd->port_cnt >= pd->port_subport_cnt) {
-                               ret = -EINVAL;
-                               goto done;
-                       }
-                       port_fp(fp) = pd;
-                       subport_fp(fp) = pd->port_cnt++;
-                       pd->port_subpid[subport_fp(fp)] =
-                               get_pid(task_pid(current));
-                       tidcursor_fp(fp) = 0;
-                       pd->active_slaves |= 1 << subport_fp(fp);
-                       ipath_cdbg(PROC,
-                                  "%s[%u] %u sharing %s[%u] unit:port %u:%u\n",
-                                  current->comm, current->pid,
-                                  subport_fp(fp),
-                                  pd->port_comm, pid_nr(pd->port_pid),
-                                  dd->ipath_unit, pd->port_port);
-                       ret = 1;
-                       goto done;
-               }
-       }
-
-done:
-       return ret;
-}
-
-static int ipath_open(struct inode *in, struct file *fp)
-{
-       /* The real work is performed later in ipath_assign_port() */
-       fp->private_data = kzalloc(sizeof(struct ipath_filedata), GFP_KERNEL);
-       return fp->private_data ? 0 : -ENOMEM;
-}
-
-/* Get port early, so can set affinity prior to memory allocation */
-static int ipath_assign_port(struct file *fp,
-                             const struct ipath_user_info *uinfo)
-{
-       int ret;
-       int i_minor;
-       unsigned swmajor, swminor;
-
-       /* Check to be sure we haven't already initialized this file */
-       if (port_fp(fp)) {
-               ret = -EINVAL;
-               goto done;
-       }
-
-       /* for now, if major version is different, bail */
-       swmajor = uinfo->spu_userversion >> 16;
-       if (swmajor != IPATH_USER_SWMAJOR) {
-               ipath_dbg("User major version %d not same as driver "
-                         "major %d\n", uinfo->spu_userversion >> 16,
-                         IPATH_USER_SWMAJOR);
-               ret = -ENODEV;
-               goto done;
-       }
-
-       swminor = uinfo->spu_userversion & 0xffff;
-       if (swminor != IPATH_USER_SWMINOR)
-               ipath_dbg("User minor version %d not same as driver "
-                         "minor %d\n", swminor, IPATH_USER_SWMINOR);
-
-       mutex_lock(&ipath_mutex);
-
-       if (ipath_compatible_subports(swmajor, swminor) &&
-           uinfo->spu_subport_cnt &&
-           (ret = find_shared_port(fp, uinfo))) {
-               if (ret > 0)
-                       ret = 0;
-               goto done_chk_sdma;
-       }
-
-       i_minor = iminor(file_inode(fp)) - IPATH_USER_MINOR_BASE;
-       ipath_cdbg(VERBOSE, "open on dev %lx (minor %d)\n",
-                  (long)file_inode(fp)->i_rdev, i_minor);
-
-       if (i_minor)
-               ret = find_free_port(i_minor - 1, fp, uinfo);
-       else
-               ret = find_best_unit(fp, uinfo);
-
-done_chk_sdma:
-       if (!ret) {
-               struct ipath_filedata *fd = fp->private_data;
-               const struct ipath_portdata *pd = fd->pd;
-               const struct ipath_devdata *dd = pd->port_dd;
-
-               fd->pq = ipath_user_sdma_queue_create(&dd->pcidev->dev,
-                                                     dd->ipath_unit,
-                                                     pd->port_port,
-                                                     fd->subport);
-
-               if (!fd->pq)
-                       ret = -ENOMEM;
-       }
-
-       mutex_unlock(&ipath_mutex);
-
-done:
-       return ret;
-}
-
-
-static int ipath_do_user_init(struct file *fp,
-                             const struct ipath_user_info *uinfo)
-{
-       int ret;
-       struct ipath_portdata *pd = port_fp(fp);
-       struct ipath_devdata *dd;
-       u32 head32;
-
-       /* Subports don't need to initialize anything since master did it. */
-       if (subport_fp(fp)) {
-               ret = wait_event_interruptible(pd->port_wait,
-                       !test_bit(IPATH_PORT_MASTER_UNINIT, &pd->port_flag));
-               goto done;
-       }
-
-       dd = pd->port_dd;
-
-       if (uinfo->spu_rcvhdrsize) {
-               ret = ipath_setrcvhdrsize(dd, uinfo->spu_rcvhdrsize);
-               if (ret)
-                       goto done;
-       }
-
-       /* for now we do nothing with rcvhdrcnt: uinfo->spu_rcvhdrcnt */
-
-       /* some ports may get extra buffers, calculate that here */
-       if (pd->port_port <= dd->ipath_ports_extrabuf)
-               pd->port_piocnt = dd->ipath_pbufsport + 1;
-       else
-               pd->port_piocnt = dd->ipath_pbufsport;
-
-       /* for right now, kernel piobufs are at end, so port 1 is at 0 */
-       if (pd->port_port <= dd->ipath_ports_extrabuf)
-               pd->port_pio_base = (dd->ipath_pbufsport + 1)
-                       * (pd->port_port - 1);
-       else
-               pd->port_pio_base = dd->ipath_ports_extrabuf +
-                       dd->ipath_pbufsport * (pd->port_port - 1);
-       pd->port_piobufs = dd->ipath_piobufbase +
-               pd->port_pio_base * dd->ipath_palign;
-       ipath_cdbg(VERBOSE, "piobuf base for port %u is 0x%x, piocnt %u,"
-               " first pio %u\n", pd->port_port, pd->port_piobufs,
-               pd->port_piocnt, pd->port_pio_base);
-       ipath_chg_pioavailkernel(dd, pd->port_pio_base, pd->port_piocnt, 0);
-
-       /*
-        * Now allocate the rcvhdr Q and eager TIDs; skip the TID
-        * array for the time being.  If pd->port_port > chip-supported,
-        * we would need extra handling here to overflow
-        * through port 0, someday.
-        */
-       ret = ipath_create_rcvhdrq(dd, pd);
-       if (!ret)
-               ret = ipath_create_user_egr(pd);
-       if (ret)
-               goto done;
-
-       /*
-        * set the eager head register for this port to the current values
-        * of the tail pointers, since we don't know if they were
-        * updated on last use of the port.
-        */
-       head32 = ipath_read_ureg32(dd, ur_rcvegrindextail, pd->port_port);
-       ipath_write_ureg(dd, ur_rcvegrindexhead, head32, pd->port_port);
-       pd->port_lastrcvhdrqtail = -1;
-       ipath_cdbg(VERBOSE, "Wrote port%d egrhead %x from tail regs\n",
-               pd->port_port, head32);
-       pd->port_tidcursor = 0; /* start at beginning after open */
-
-       /* initialize poll variables... */
-       pd->port_urgent = 0;
-       pd->port_urgent_poll = 0;
-       pd->port_hdrqfull_poll = pd->port_hdrqfull;
-
-       /*
-        * Now enable the port for receive.
-        * For chips that are set to DMA the tail register to memory
-        * when it changes (and when the update bit transitions from
-        * 0 to 1), we turn tail updates off and then back on.
-        * This will (very briefly) affect any other open ports, but the
-        * duration is very short, and therefore isn't an issue.  We
-        * explicitly set the in-memory tail copy to 0 beforehand, so we
-        * don't have to wait to be sure the DMA update has happened
-        * (chip resets head/tail to 0 on transition to enable).
-        */
-       set_bit(dd->ipath_r_portenable_shift + pd->port_port,
-               &dd->ipath_rcvctrl);
-       if (!(dd->ipath_flags & IPATH_NODMA_RTAIL)) {
-               if (pd->port_rcvhdrtail_kvaddr)
-                       ipath_clear_rcvhdrtail(pd);
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
-                       dd->ipath_rcvctrl &
-                       ~(1ULL << dd->ipath_r_tailupd_shift));
-       }
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
-                        dd->ipath_rcvctrl);
-       /* Notify any waiting slaves */
-       if (pd->port_subport_cnt) {
-               clear_bit(IPATH_PORT_MASTER_UNINIT, &pd->port_flag);
-               wake_up(&pd->port_wait);
-       }
-done:
-       return ret;
-}
-
-/**
- * unlock_expected_tids - unlock any expected TID entries the port still had in use
- * @pd: port
- *
- * We don't actually update the chip here, because we do a bulk update
- * below, using ipath_f_clear_tids.
- */
-static void unlock_expected_tids(struct ipath_portdata *pd)
-{
-       struct ipath_devdata *dd = pd->port_dd;
-       int port_tidbase = pd->port_port * dd->ipath_rcvtidcnt;
-       int i, cnt = 0, maxtid = port_tidbase + dd->ipath_rcvtidcnt;
-
-       ipath_cdbg(VERBOSE, "Port %u unlocking any locked expTID pages\n",
-                  pd->port_port);
-       for (i = port_tidbase; i < maxtid; i++) {
-               struct page *ps = dd->ipath_pageshadow[i];
-
-               if (!ps)
-                       continue;
-
-               dd->ipath_pageshadow[i] = NULL;
-               pci_unmap_page(dd->pcidev, dd->ipath_physshadow[i],
-                       PAGE_SIZE, PCI_DMA_FROMDEVICE);
-               ipath_release_user_pages_on_close(&ps, 1);
-               cnt++;
-               ipath_stats.sps_pageunlocks++;
-       }
-       if (cnt)
-               ipath_cdbg(VERBOSE, "Port %u locked %u expTID entries\n",
-                          pd->port_port, cnt);
-
-       if (ipath_stats.sps_pagelocks || ipath_stats.sps_pageunlocks)
-               ipath_cdbg(VERBOSE, "%llu pages locked, %llu unlocked\n",
-                          (unsigned long long) ipath_stats.sps_pagelocks,
-                          (unsigned long long)
-                          ipath_stats.sps_pageunlocks);
-}
-
-static int ipath_close(struct inode *in, struct file *fp)
-{
-       int ret = 0;
-       struct ipath_filedata *fd;
-       struct ipath_portdata *pd;
-       struct ipath_devdata *dd;
-       unsigned long flags;
-       unsigned port;
-       struct pid *pid;
-
-       ipath_cdbg(VERBOSE, "close on dev %lx, private data %p\n",
-                  (long)in->i_rdev, fp->private_data);
-
-       mutex_lock(&ipath_mutex);
-
-       fd = fp->private_data;
-       fp->private_data = NULL;
-       pd = fd->pd;
-       if (!pd) {
-               mutex_unlock(&ipath_mutex);
-               goto bail;
-       }
-
-       dd = pd->port_dd;
-
-       /* drain user sdma queue */
-       ipath_user_sdma_queue_drain(dd, fd->pq);
-       ipath_user_sdma_queue_destroy(fd->pq);
-
-       if (--pd->port_cnt) {
-               /*
-                * XXX If the master closes the port before the slave(s),
-                * revoke the mmap for the eager receive queue so
-                * the slave(s) don't wait for receive data forever.
-                */
-               pd->active_slaves &= ~(1 << fd->subport);
-               put_pid(pd->port_subpid[fd->subport]);
-               pd->port_subpid[fd->subport] = NULL;
-               mutex_unlock(&ipath_mutex);
-               goto bail;
-       }
-       /* early; no interrupt users after this */
-       spin_lock_irqsave(&dd->ipath_uctxt_lock, flags);
-       port = pd->port_port;
-       dd->ipath_pd[port] = NULL;
-       pid = pd->port_pid;
-       pd->port_pid = NULL;
-       spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
-
-       if (pd->port_rcvwait_to || pd->port_piowait_to
-           || pd->port_rcvnowait || pd->port_pionowait) {
-               ipath_cdbg(VERBOSE, "port%u, %u rcv, %u pio wait timeo; "
-                          "%u rcv %u, pio already\n",
-                          pd->port_port, pd->port_rcvwait_to,
-                          pd->port_piowait_to, pd->port_rcvnowait,
-                          pd->port_pionowait);
-               pd->port_rcvwait_to = pd->port_piowait_to =
-                       pd->port_rcvnowait = pd->port_pionowait = 0;
-       }
-       if (pd->port_flag) {
-               ipath_cdbg(PROC, "port %u port_flag set: 0x%lx\n",
-                         pd->port_port, pd->port_flag);
-               pd->port_flag = 0;
-       }
-
-       if (dd->ipath_kregbase) {
-               /* atomically clear receive enable port and intr avail. */
-               clear_bit(dd->ipath_r_portenable_shift + port,
-                         &dd->ipath_rcvctrl);
-               clear_bit(pd->port_port + dd->ipath_r_intravail_shift,
-                         &dd->ipath_rcvctrl);
-               ipath_write_kreg( dd, dd->ipath_kregs->kr_rcvctrl,
-                       dd->ipath_rcvctrl);
-               /* and read back from chip to be sure that nothing
-                * else is in flight when we do the rest */
-               (void)ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-
-               /* clean up the pkeys for this port user */
-               ipath_clean_part_key(pd, dd);
-               /*
-                * be paranoid, and never write 0's to these, just use an
-                * unused part of the port 0 tail page.  Of course,
-                * rcvhdraddr points to a large chunk of memory, so this
-                * could still trash things, but at least it won't trash
-                * page 0, and by disabling the port, it should stop "soon",
-                * even if a packet or two is already in flight after we
-                * disabled the port.
-                */
-               ipath_write_kreg_port(dd,
-                       dd->ipath_kregs->kr_rcvhdrtailaddr, port,
-                       dd->ipath_dummy_hdrq_phys);
-               ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdraddr,
-                       pd->port_port, dd->ipath_dummy_hdrq_phys);
-
-               ipath_disarm_piobufs(dd, pd->port_pio_base, pd->port_piocnt);
-               ipath_chg_pioavailkernel(dd, pd->port_pio_base,
-                       pd->port_piocnt, 1);
-
-               dd->ipath_f_clear_tids(dd, pd->port_port);
-
-               if (dd->ipath_pageshadow)
-                       unlock_expected_tids(pd);
-               ipath_stats.sps_ports--;
-               ipath_cdbg(PROC, "%s[%u] closed port %u:%u\n",
-                          pd->port_comm, pid_nr(pid),
-                          dd->ipath_unit, port);
-       }
-
-       put_pid(pid);
-       mutex_unlock(&ipath_mutex);
-       ipath_free_pddata(dd, pd); /* after releasing the mutex */
-
-bail:
-       kfree(fd);
-       return ret;
-}
-
-static int ipath_port_info(struct ipath_portdata *pd, u16 subport,
-                          struct ipath_port_info __user *uinfo)
-{
-       struct ipath_port_info info;
-       int nup;
-       int ret;
-       size_t sz;
-
-       (void) ipath_count_units(NULL, &nup, NULL);
-       info.num_active = nup;
-       info.unit = pd->port_dd->ipath_unit;
-       info.port = pd->port_port;
-       info.subport = subport;
-       /* Don't return new fields if old library opened the port. */
-       if (ipath_supports_subports(pd->userversion >> 16,
-                                   pd->userversion & 0xffff)) {
-               /* Number of user ports available for this device. */
-               info.num_ports = pd->port_dd->ipath_cfgports - 1;
-               info.num_subports = pd->port_subport_cnt;
-               sz = sizeof(info);
-       } else
-               sz = sizeof(info) - 2 * sizeof(u16);
-
-       if (copy_to_user(uinfo, &info, sz)) {
-               ret = -EFAULT;
-               goto bail;
-       }
-       ret = 0;
-
-bail:
-       return ret;
-}
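/*
 * Editor's note: not part of the driver.  A sketch of why the copy above is
 * truncated for old libraries: num_ports and num_subports were appended to
 * struct ipath_port_info later, so a library that predates subports is only
 * given the original prefix of the structure.  The struct layout below is
 * an illustrative stand-in, not the real uapi definition.
 */
#include <stdio.h>
#include <stdint.h>

struct port_info_example {
	uint32_t num_active;
	uint32_t unit;
	uint16_t port;
	uint16_t subport;
	uint16_t num_ports;     /* newer field */
	uint16_t num_subports;  /* newer field */
};

int main(void)
{
	size_t full = sizeof(struct port_info_example);
	size_t old = full - 2 * sizeof(uint16_t);  /* stop before new fields */

	printf("new library copy: %zu bytes, old library copy: %zu bytes\n",
	       full, old);
	return 0;
}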
-
-static int ipath_get_slave_info(struct ipath_portdata *pd,
-                               void __user *slave_mask_addr)
-{
-       int ret = 0;
-
-       if (copy_to_user(slave_mask_addr, &pd->active_slaves, sizeof(u32)))
-               ret = -EFAULT;
-       return ret;
-}
-
-static int ipath_sdma_get_inflight(struct ipath_user_sdma_queue *pq,
-                                  u32 __user *inflightp)
-{
-       const u32 val = ipath_user_sdma_inflight_counter(pq);
-
-       if (put_user(val, inflightp))
-               return -EFAULT;
-
-       return 0;
-}
-
-static int ipath_sdma_get_complete(struct ipath_devdata *dd,
-                                  struct ipath_user_sdma_queue *pq,
-                                  u32 __user *completep)
-{
-       u32 val;
-       int err;
-
-       err = ipath_user_sdma_make_progress(dd, pq);
-       if (err < 0)
-               return err;
-
-       val = ipath_user_sdma_complete_counter(pq);
-       if (put_user(val, completep))
-               return -EFAULT;
-
-       return 0;
-}
-
-static ssize_t ipath_write(struct file *fp, const char __user *data,
-                          size_t count, loff_t *off)
-{
-       const struct ipath_cmd __user *ucmd;
-       struct ipath_portdata *pd;
-       const void __user *src;
-       size_t consumed, copy;
-       struct ipath_cmd cmd;
-       ssize_t ret = 0;
-       void *dest;
-
-       if (count < sizeof(cmd.type)) {
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       ucmd = (const struct ipath_cmd __user *) data;
-
-       if (copy_from_user(&cmd.type, &ucmd->type, sizeof(cmd.type))) {
-               ret = -EFAULT;
-               goto bail;
-       }
-
-       consumed = sizeof(cmd.type);
-
-       switch (cmd.type) {
-       case IPATH_CMD_ASSIGN_PORT:
-       case __IPATH_CMD_USER_INIT:
-       case IPATH_CMD_USER_INIT:
-               copy = sizeof(cmd.cmd.user_info);
-               dest = &cmd.cmd.user_info;
-               src = &ucmd->cmd.user_info;
-               break;
-       case IPATH_CMD_RECV_CTRL:
-               copy = sizeof(cmd.cmd.recv_ctrl);
-               dest = &cmd.cmd.recv_ctrl;
-               src = &ucmd->cmd.recv_ctrl;
-               break;
-       case IPATH_CMD_PORT_INFO:
-               copy = sizeof(cmd.cmd.port_info);
-               dest = &cmd.cmd.port_info;
-               src = &ucmd->cmd.port_info;
-               break;
-       case IPATH_CMD_TID_UPDATE:
-       case IPATH_CMD_TID_FREE:
-               copy = sizeof(cmd.cmd.tid_info);
-               dest = &cmd.cmd.tid_info;
-               src = &ucmd->cmd.tid_info;
-               break;
-       case IPATH_CMD_SET_PART_KEY:
-               copy = sizeof(cmd.cmd.part_key);
-               dest = &cmd.cmd.part_key;
-               src = &ucmd->cmd.part_key;
-               break;
-       case __IPATH_CMD_SLAVE_INFO:
-               copy = sizeof(cmd.cmd.slave_mask_addr);
-               dest = &cmd.cmd.slave_mask_addr;
-               src = &ucmd->cmd.slave_mask_addr;
-               break;
-       case IPATH_CMD_PIOAVAILUPD:     // force an update of PIOAvail reg
-               copy = 0;
-               src = NULL;
-               dest = NULL;
-               break;
-       case IPATH_CMD_POLL_TYPE:
-               copy = sizeof(cmd.cmd.poll_type);
-               dest = &cmd.cmd.poll_type;
-               src = &ucmd->cmd.poll_type;
-               break;
-       case IPATH_CMD_ARMLAUNCH_CTRL:
-               copy = sizeof(cmd.cmd.armlaunch_ctrl);
-               dest = &cmd.cmd.armlaunch_ctrl;
-               src = &ucmd->cmd.armlaunch_ctrl;
-               break;
-       case IPATH_CMD_SDMA_INFLIGHT:
-               copy = sizeof(cmd.cmd.sdma_inflight);
-               dest = &cmd.cmd.sdma_inflight;
-               src = &ucmd->cmd.sdma_inflight;
-               break;
-       case IPATH_CMD_SDMA_COMPLETE:
-               copy = sizeof(cmd.cmd.sdma_complete);
-               dest = &cmd.cmd.sdma_complete;
-               src = &ucmd->cmd.sdma_complete;
-               break;
-       default:
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       if (copy) {
-               if ((count - consumed) < copy) {
-                       ret = -EINVAL;
-                       goto bail;
-               }
-
-               if (copy_from_user(dest, src, copy)) {
-                       ret = -EFAULT;
-                       goto bail;
-               }
-
-               consumed += copy;
-       }
-
-       pd = port_fp(fp);
-       if (!pd && cmd.type != __IPATH_CMD_USER_INIT &&
-               cmd.type != IPATH_CMD_ASSIGN_PORT) {
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       switch (cmd.type) {
-       case IPATH_CMD_ASSIGN_PORT:
-               ret = ipath_assign_port(fp, &cmd.cmd.user_info);
-               if (ret)
-                       goto bail;
-               break;
-       case __IPATH_CMD_USER_INIT:
-               /* backwards compatibility, get port first */
-               ret = ipath_assign_port(fp, &cmd.cmd.user_info);
-               if (ret)
-                       goto bail;
-               /* and fall through to current version. */
-       case IPATH_CMD_USER_INIT:
-               ret = ipath_do_user_init(fp, &cmd.cmd.user_info);
-               if (ret)
-                       goto bail;
-               ret = ipath_get_base_info(
-                       fp, (void __user *) (unsigned long)
-                       cmd.cmd.user_info.spu_base_info,
-                       cmd.cmd.user_info.spu_base_info_size);
-               break;
-       case IPATH_CMD_RECV_CTRL:
-               ret = ipath_manage_rcvq(pd, subport_fp(fp), cmd.cmd.recv_ctrl);
-               break;
-       case IPATH_CMD_PORT_INFO:
-               ret = ipath_port_info(pd, subport_fp(fp),
-                                     (struct ipath_port_info __user *)
-                                     (unsigned long) cmd.cmd.port_info);
-               break;
-       case IPATH_CMD_TID_UPDATE:
-               ret = ipath_tid_update(pd, fp, &cmd.cmd.tid_info);
-               break;
-       case IPATH_CMD_TID_FREE:
-               ret = ipath_tid_free(pd, subport_fp(fp), &cmd.cmd.tid_info);
-               break;
-       case IPATH_CMD_SET_PART_KEY:
-               ret = ipath_set_part_key(pd, cmd.cmd.part_key);
-               break;
-       case __IPATH_CMD_SLAVE_INFO:
-               ret = ipath_get_slave_info(pd,
-                                          (void __user *) (unsigned long)
-                                          cmd.cmd.slave_mask_addr);
-               break;
-       case IPATH_CMD_PIOAVAILUPD:
-               ipath_force_pio_avail_update(pd->port_dd);
-               break;
-       case IPATH_CMD_POLL_TYPE:
-               pd->poll_type = cmd.cmd.poll_type;
-               break;
-       case IPATH_CMD_ARMLAUNCH_CTRL:
-               if (cmd.cmd.armlaunch_ctrl)
-                       ipath_enable_armlaunch(pd->port_dd);
-               else
-                       ipath_disable_armlaunch(pd->port_dd);
-               break;
-       case IPATH_CMD_SDMA_INFLIGHT:
-               ret = ipath_sdma_get_inflight(user_sdma_queue_fp(fp),
-                                             (u32 __user *) (unsigned long)
-                                             cmd.cmd.sdma_inflight);
-               break;
-       case IPATH_CMD_SDMA_COMPLETE:
-               ret = ipath_sdma_get_complete(pd->port_dd,
-                                             user_sdma_queue_fp(fp),
-                                             (u32 __user *) (unsigned long)
-                                             cmd.cmd.sdma_complete);
-               break;
-       }
-
-       if (ret >= 0)
-               ret = consumed;
-
-bail:
-       return ret;
-}
-
-static ssize_t ipath_write_iter(struct kiocb *iocb, struct iov_iter *from)
-{
-       struct file *filp = iocb->ki_filp;
-       struct ipath_filedata *fp = filp->private_data;
-       struct ipath_portdata *pd = port_fp(filp);
-       struct ipath_user_sdma_queue *pq = fp->pq;
-
-       if (!iter_is_iovec(from) || !from->nr_segs)
-               return -EINVAL;
-
-       return ipath_user_sdma_writev(pd->port_dd, pq, from->iov, from->nr_segs);
-}
-
-static struct class *ipath_class;
-
-static int init_cdev(int minor, char *name, const struct file_operations *fops,
-                    struct cdev **cdevp, struct device **devp)
-{
-       const dev_t dev = MKDEV(IPATH_MAJOR, minor);
-       struct cdev *cdev = NULL;
-       struct device *device = NULL;
-       int ret;
-
-       cdev = cdev_alloc();
-       if (!cdev) {
-               printk(KERN_ERR IPATH_DRV_NAME
-                      ": Could not allocate cdev for minor %d, %s\n",
-                      minor, name);
-               ret = -ENOMEM;
-               goto done;
-       }
-
-       cdev->owner = THIS_MODULE;
-       cdev->ops = fops;
-       kobject_set_name(&cdev->kobj, name);
-
-       ret = cdev_add(cdev, dev, 1);
-       if (ret < 0) {
-               printk(KERN_ERR IPATH_DRV_NAME
-                      ": Could not add cdev for minor %d, %s (err %d)\n",
-                      minor, name, -ret);
-               goto err_cdev;
-       }
-
-       device = device_create(ipath_class, NULL, dev, NULL, name);
-
-       if (IS_ERR(device)) {
-               ret = PTR_ERR(device);
-               printk(KERN_ERR IPATH_DRV_NAME ": Could not create "
-                      "device for minor %d, %s (err %d)\n",
-                      minor, name, -ret);
-               goto err_cdev;
-       }
-
-       goto done;
-
-err_cdev:
-       cdev_del(cdev);
-       cdev = NULL;
-
-done:
-       if (ret >= 0) {
-               *cdevp = cdev;
-               *devp = device;
-       } else {
-               *cdevp = NULL;
-               *devp = NULL;
-       }
-
-       return ret;
-}
-
-int ipath_cdev_init(int minor, char *name, const struct file_operations *fops,
-                   struct cdev **cdevp, struct device **devp)
-{
-       return init_cdev(minor, name, fops, cdevp, devp);
-}
-
-static void cleanup_cdev(struct cdev **cdevp,
-                        struct device **devp)
-{
-       struct device *dev = *devp;
-
-       if (dev) {
-               device_unregister(dev);
-               *devp = NULL;
-       }
-
-       if (*cdevp) {
-               cdev_del(*cdevp);
-               *cdevp = NULL;
-       }
-}
-
-void ipath_cdev_cleanup(struct cdev **cdevp,
-                       struct device **devp)
-{
-       cleanup_cdev(cdevp, devp);
-}
-
-static struct cdev *wildcard_cdev;
-static struct device *wildcard_dev;
-
-static const dev_t dev = MKDEV(IPATH_MAJOR, 0);
-
-static int user_init(void)
-{
-       int ret;
-
-       ret = register_chrdev_region(dev, IPATH_NMINORS, IPATH_DRV_NAME);
-       if (ret < 0) {
-               printk(KERN_ERR IPATH_DRV_NAME ": Could not register "
-                      "chrdev region (err %d)\n", -ret);
-               goto done;
-       }
-
-       ipath_class = class_create(THIS_MODULE, IPATH_DRV_NAME);
-
-       if (IS_ERR(ipath_class)) {
-               ret = PTR_ERR(ipath_class);
-               printk(KERN_ERR IPATH_DRV_NAME ": Could not create "
-                      "device class (err %d)\n", -ret);
-               goto bail;
-       }
-
-       goto done;
-bail:
-       unregister_chrdev_region(dev, IPATH_NMINORS);
-done:
-       return ret;
-}
-
-static void user_cleanup(void)
-{
-       if (ipath_class) {
-               class_destroy(ipath_class);
-               ipath_class = NULL;
-       }
-
-       unregister_chrdev_region(dev, IPATH_NMINORS);
-}
-
-static atomic_t user_count = ATOMIC_INIT(0);
-static atomic_t user_setup = ATOMIC_INIT(0);
-
-int ipath_user_add(struct ipath_devdata *dd)
-{
-       char name[10];
-       int ret;
-
-       if (atomic_inc_return(&user_count) == 1) {
-               ret = user_init();
-               if (ret < 0) {
-                       ipath_dev_err(dd, "Unable to set up user support: "
-                                     "error %d\n", -ret);
-                       goto bail;
-               }
-               ret = init_cdev(0, "ipath", &ipath_file_ops, &wildcard_cdev,
-                               &wildcard_dev);
-               if (ret < 0) {
-                       ipath_dev_err(dd, "Could not create wildcard "
-                                     "minor: error %d\n", -ret);
-                       goto bail_user;
-               }
-
-               atomic_set(&user_setup, 1);
-       }
-
-       snprintf(name, sizeof(name), "ipath%d", dd->ipath_unit);
-
-       ret = init_cdev(dd->ipath_unit + 1, name, &ipath_file_ops,
-                       &dd->user_cdev, &dd->user_dev);
-       if (ret < 0)
-               ipath_dev_err(dd, "Could not create user minor %d, %s\n",
-                             dd->ipath_unit + 1, name);
-
-       goto bail;
-
-bail_user:
-       user_cleanup();
-bail:
-       return ret;
-}
-
-void ipath_user_remove(struct ipath_devdata *dd)
-{
-       cleanup_cdev(&dd->user_cdev, &dd->user_dev);
-
-       if (atomic_dec_return(&user_count) == 0) {
-               if (atomic_read(&user_setup) == 0)
-                       goto bail;
-
-               cleanup_cdev(&wildcard_cdev, &wildcard_dev);
-               user_cleanup();
-
-               atomic_set(&user_setup, 0);
-       }
-bail:
-       return;
-}
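
The ipath_write() handler deleted above follows a common character-device command pattern: userspace write()s a tagged command, the driver copies only the fixed-size type field first, sizes the remainder from a switch on that type, and only then copies the selected union member before dispatching. Purely as an illustration of that two-phase copy_from_user() pattern, here is a minimal, hypothetical sketch; struct demo_cmd, demo_write() and the command numbers are invented for this example and are not the ipath ABI.

#include <linux/fs.h>
#include <linux/stddef.h>
#include <linux/types.h>
#include <linux/uaccess.h>

/*
 * Hypothetical command layout illustrating the two-phase copy pattern
 * used by ipath_write() above; this is NOT the ipath ABI.
 */
struct demo_cmd {
	__u32 type;
	union {
		__u64 addr;	/* e.g. a user pointer for an "info" command */
		__u32 flag;	/* e.g. an on/off control */
	} u;
};

static ssize_t demo_write(struct file *fp, const char __user *data,
			  size_t count, loff_t *off)
{
	const struct demo_cmd __user *ucmd = (const struct demo_cmd __user *)data;
	struct demo_cmd cmd;
	size_t payload;

	/* Phase 1: copy only the type tag. */
	if (count < sizeof(cmd.type))
		return -EINVAL;
	if (copy_from_user(&cmd.type, &ucmd->type, sizeof(cmd.type)))
		return -EFAULT;

	/* Size the payload from the type, as the first switch in ipath_write() does. */
	switch (cmd.type) {
	case 1:
		payload = sizeof(cmd.u.addr);
		break;
	case 2:
		payload = sizeof(cmd.u.flag);
		break;
	default:
		return -EINVAL;
	}

	/* Phase 2: copy only the selected union member, then dispatch on it. */
	if (count < offsetof(struct demo_cmd, u) + payload)
		return -EINVAL;
	if (copy_from_user(&cmd.u, &ucmd->u, payload))
		return -EFAULT;

	/* ... dispatch on cmd.type / cmd.u here ... */

	return offsetof(struct demo_cmd, u) + payload;
}

The benefit of the two phases is that the kernel copies no more than the selected command actually uses, and short or malformed writes fail with -EINVAL or -EFAULT before anything is dispatched.
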
diff --git a/drivers/infiniband/hw/ipath/ipath_fs.c b/drivers/infiniband/hw/ipath/ipath_fs.c
deleted file mode 100644 (file)
index 25422a3..0000000
--- a/drivers/infiniband/hw/ipath/ipath_fs.c
+++ /dev/null
@@ -1,422 +0,0 @@
-/*
- * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
- * Copyright (c) 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/mount.h>
-#include <linux/pagemap.h>
-#include <linux/init.h>
-#include <linux/namei.h>
-#include <linux/slab.h>
-
-#include "ipath_kernel.h"
-
-#define IPATHFS_MAGIC 0x726a77
-
-static struct super_block *ipath_super;
-
-static int ipathfs_mknod(struct inode *dir, struct dentry *dentry,
-                        umode_t mode, const struct file_operations *fops,
-                        void *data)
-{
-       int error;
-       struct inode *inode = new_inode(dir->i_sb);
-
-       if (!inode) {
-               error = -EPERM;
-               goto bail;
-       }
-
-       inode->i_ino = get_next_ino();
-       inode->i_mode = mode;
-       inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-       inode->i_private = data;
-       if (S_ISDIR(mode)) {
-               inode->i_op = &simple_dir_inode_operations;
-               inc_nlink(inode);
-               inc_nlink(dir);
-       }
-
-       inode->i_fop = fops;
-
-       d_instantiate(dentry, inode);
-       error = 0;
-
-bail:
-       return error;
-}
-
-static int create_file(const char *name, umode_t mode,
-                      struct dentry *parent, struct dentry **dentry,
-                      const struct file_operations *fops, void *data)
-{
-       int error;
-
-       mutex_lock(&d_inode(parent)->i_mutex);
-       *dentry = lookup_one_len(name, parent, strlen(name));
-       if (!IS_ERR(*dentry))
-               error = ipathfs_mknod(d_inode(parent), *dentry,
-                                     mode, fops, data);
-       else
-               error = PTR_ERR(*dentry);
-       mutex_unlock(&d_inode(parent)->i_mutex);
-
-       return error;
-}
-
-static ssize_t atomic_stats_read(struct file *file, char __user *buf,
-                                size_t count, loff_t *ppos)
-{
-       return simple_read_from_buffer(buf, count, ppos, &ipath_stats,
-                                      sizeof ipath_stats);
-}
-
-static const struct file_operations atomic_stats_ops = {
-       .read = atomic_stats_read,
-       .llseek = default_llseek,
-};
-
-static ssize_t atomic_counters_read(struct file *file, char __user *buf,
-                                   size_t count, loff_t *ppos)
-{
-       struct infinipath_counters counters;
-       struct ipath_devdata *dd;
-
-       dd = file_inode(file)->i_private;
-       dd->ipath_f_read_counters(dd, &counters);
-
-       return simple_read_from_buffer(buf, count, ppos, &counters,
-                                      sizeof counters);
-}
-
-static const struct file_operations atomic_counters_ops = {
-       .read = atomic_counters_read,
-       .llseek = default_llseek,
-};
-
-static ssize_t flash_read(struct file *file, char __user *buf,
-                         size_t count, loff_t *ppos)
-{
-       struct ipath_devdata *dd;
-       ssize_t ret;
-       loff_t pos;
-       char *tmp;
-
-       pos = *ppos;
-
-       if (pos < 0) {
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       if (pos >= sizeof(struct ipath_flash)) {
-               ret = 0;
-               goto bail;
-       }
-
-       if (count > sizeof(struct ipath_flash) - pos)
-               count = sizeof(struct ipath_flash) - pos;
-
-       tmp = kmalloc(count, GFP_KERNEL);
-       if (!tmp) {
-               ret = -ENOMEM;
-               goto bail;
-       }
-
-       dd = file_inode(file)->i_private;
-       if (ipath_eeprom_read(dd, pos, tmp, count)) {
-               ipath_dev_err(dd, "failed to read from flash\n");
-               ret = -ENXIO;
-               goto bail_tmp;
-       }
-
-       if (copy_to_user(buf, tmp, count)) {
-               ret = -EFAULT;
-               goto bail_tmp;
-       }
-
-       *ppos = pos + count;
-       ret = count;
-
-bail_tmp:
-       kfree(tmp);
-
-bail:
-       return ret;
-}
-
-static ssize_t flash_write(struct file *file, const char __user *buf,
-                          size_t count, loff_t *ppos)
-{
-       struct ipath_devdata *dd;
-       ssize_t ret;
-       loff_t pos;
-       char *tmp;
-
-       pos = *ppos;
-
-       if (pos != 0) {
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       if (count != sizeof(struct ipath_flash)) {
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       tmp = kmalloc(count, GFP_KERNEL);
-       if (!tmp) {
-               ret = -ENOMEM;
-               goto bail;
-       }
-
-       if (copy_from_user(tmp, buf, count)) {
-               ret = -EFAULT;
-               goto bail_tmp;
-       }
-
-       dd = file_inode(file)->i_private;
-       if (ipath_eeprom_write(dd, pos, tmp, count)) {
-               ret = -ENXIO;
-               ipath_dev_err(dd, "failed to write to flash\n");
-               goto bail_tmp;
-       }
-
-       *ppos = pos + count;
-       ret = count;
-
-bail_tmp:
-       kfree(tmp);
-
-bail:
-       return ret;
-}
-
-static const struct file_operations flash_ops = {
-       .read = flash_read,
-       .write = flash_write,
-       .llseek = default_llseek,
-};
-
-static int create_device_files(struct super_block *sb,
-                              struct ipath_devdata *dd)
-{
-       struct dentry *dir, *tmp;
-       char unit[10];
-       int ret;
-
-       snprintf(unit, sizeof unit, "%02d", dd->ipath_unit);
-       ret = create_file(unit, S_IFDIR|S_IRUGO|S_IXUGO, sb->s_root, &dir,
-                         &simple_dir_operations, dd);
-       if (ret) {
-               printk(KERN_ERR "create_file(%s) failed: %d\n", unit, ret);
-               goto bail;
-       }
-
-       ret = create_file("atomic_counters", S_IFREG|S_IRUGO, dir, &tmp,
-                         &atomic_counters_ops, dd);
-       if (ret) {
-               printk(KERN_ERR "create_file(%s/atomic_counters) "
-                      "failed: %d\n", unit, ret);
-               goto bail;
-       }
-
-       ret = create_file("flash", S_IFREG|S_IWUSR|S_IRUGO, dir, &tmp,
-                         &flash_ops, dd);
-       if (ret) {
-               printk(KERN_ERR "create_file(%s/flash) "
-                      "failed: %d\n", unit, ret);
-               goto bail;
-       }
-
-bail:
-       return ret;
-}
-
-static int remove_file(struct dentry *parent, char *name)
-{
-       struct dentry *tmp;
-       int ret;
-
-       tmp = lookup_one_len(name, parent, strlen(name));
-
-       if (IS_ERR(tmp)) {
-               ret = PTR_ERR(tmp);
-               goto bail;
-       }
-
-       spin_lock(&tmp->d_lock);
-       if (simple_positive(tmp)) {
-               dget_dlock(tmp);
-               __d_drop(tmp);
-               spin_unlock(&tmp->d_lock);
-               simple_unlink(d_inode(parent), tmp);
-       } else
-               spin_unlock(&tmp->d_lock);
-
-       ret = 0;
-bail:
-       /*
-        * We don't expect clients to care about the return value, but
-        * it's there if they need it.
-        */
-       return ret;
-}
-
-static int remove_device_files(struct super_block *sb,
-                              struct ipath_devdata *dd)
-{
-       struct dentry *dir, *root;
-       char unit[10];
-       int ret;
-
-       root = dget(sb->s_root);
-       mutex_lock(&d_inode(root)->i_mutex);
-       snprintf(unit, sizeof unit, "%02d", dd->ipath_unit);
-       dir = lookup_one_len(unit, root, strlen(unit));
-
-       if (IS_ERR(dir)) {
-               ret = PTR_ERR(dir);
-               printk(KERN_ERR "Lookup of %s failed\n", unit);
-               goto bail;
-       }
-
-       remove_file(dir, "flash");
-       remove_file(dir, "atomic_counters");
-       d_delete(dir);
-       ret = simple_rmdir(d_inode(root), dir);
-
-bail:
-       mutex_unlock(&d_inode(root)->i_mutex);
-       dput(root);
-       return ret;
-}
-
-static int ipathfs_fill_super(struct super_block *sb, void *data,
-                             int silent)
-{
-       struct ipath_devdata *dd, *tmp;
-       unsigned long flags;
-       int ret;
-
-       static struct tree_descr files[] = {
-               [2] = {"atomic_stats", &atomic_stats_ops, S_IRUGO},
-               {""},
-       };
-
-       ret = simple_fill_super(sb, IPATHFS_MAGIC, files);
-       if (ret) {
-               printk(KERN_ERR "simple_fill_super failed: %d\n", ret);
-               goto bail;
-       }
-
-       spin_lock_irqsave(&ipath_devs_lock, flags);
-
-       list_for_each_entry_safe(dd, tmp, &ipath_dev_list, ipath_list) {
-               spin_unlock_irqrestore(&ipath_devs_lock, flags);
-               ret = create_device_files(sb, dd);
-               if (ret)
-                       goto bail;
-               spin_lock_irqsave(&ipath_devs_lock, flags);
-       }
-
-       spin_unlock_irqrestore(&ipath_devs_lock, flags);
-
-bail:
-       return ret;
-}
-
-static struct dentry *ipathfs_mount(struct file_system_type *fs_type,
-                       int flags, const char *dev_name, void *data)
-{
-       struct dentry *ret;
-       ret = mount_single(fs_type, flags, data, ipathfs_fill_super);
-       if (!IS_ERR(ret))
-               ipath_super = ret->d_sb;
-       return ret;
-}
-
-static void ipathfs_kill_super(struct super_block *s)
-{
-       kill_litter_super(s);
-       ipath_super = NULL;
-}
-
-int ipathfs_add_device(struct ipath_devdata *dd)
-{
-       int ret;
-
-       if (ipath_super == NULL) {
-               ret = 0;
-               goto bail;
-       }
-
-       ret = create_device_files(ipath_super, dd);
-
-bail:
-       return ret;
-}
-
-int ipathfs_remove_device(struct ipath_devdata *dd)
-{
-       int ret;
-
-       if (ipath_super == NULL) {
-               ret = 0;
-               goto bail;
-       }
-
-       ret = remove_device_files(ipath_super, dd);
-
-bail:
-       return ret;
-}
-
-static struct file_system_type ipathfs_fs_type = {
-       .owner =        THIS_MODULE,
-       .name =         "ipathfs",
-       .mount =        ipathfs_mount,
-       .kill_sb =      ipathfs_kill_super,
-};
-MODULE_ALIAS_FS("ipathfs");
-
-int __init ipath_init_ipathfs(void)
-{
-       return register_filesystem(&ipathfs_fs_type);
-}
-
-void __exit ipath_exit_ipathfs(void)
-{
-       unregister_filesystem(&ipathfs_fs_type);
-}
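
ipath_fs.c, deleted above, is a small pseudo-filesystem built almost entirely from libfs helpers: simple_fill_super() populates the root with the static atomic_stats entry, mount_single() keeps a single shared superblock so ipathfs_add_device() and ipathfs_remove_device() can add and prune per-device directories at runtime, and kill_litter_super() tears it all down. The stripped-down sketch below shows the same registration skeleton for a hypothetical "demofs"; the names and the single "hello" file are illustrative, only the helper APIs are taken from this kernel series.

#include <linux/module.h>
#include <linux/fs.h>
#include <linux/stat.h>

#define DEMOFS_MAGIC 0x64656d6f

static ssize_t demo_read(struct file *file, char __user *buf,
			 size_t count, loff_t *ppos)
{
	static const char msg[] = "hello from demofs\n";

	return simple_read_from_buffer(buf, count, ppos, msg, sizeof(msg) - 1);
}

static const struct file_operations demo_ops = {
	.read	= demo_read,
	.llseek	= default_llseek,
};

static int demofs_fill_super(struct super_block *sb, void *data, int silent)
{
	/* Index 2 is the first usable slot, as in ipathfs_fill_super(). */
	static struct tree_descr files[] = {
		[2] = { "hello", &demo_ops, S_IRUGO },
		{ "" },
	};

	return simple_fill_super(sb, DEMOFS_MAGIC, files);
}

static struct dentry *demofs_mount(struct file_system_type *fs_type,
				   int flags, const char *dev_name, void *data)
{
	return mount_single(fs_type, flags, data, demofs_fill_super);
}

static struct file_system_type demofs_type = {
	.owner		= THIS_MODULE,
	.name		= "demofs",
	.mount		= demofs_mount,
	.kill_sb	= kill_litter_super,
};
MODULE_ALIAS_FS("demofs");

static int __init demofs_init(void)
{
	return register_filesystem(&demofs_type);
}

static void __exit demofs_exit(void)
{
	unregister_filesystem(&demofs_type);
}

module_init(demofs_init);
module_exit(demofs_exit);
MODULE_LICENSE("GPL");

Once such a module is loaded, a single mount (e.g. mount -t demofs none /mnt) exposes /mnt/hello, which mirrors how ipathfs exposed atomic_stats plus one directory per unit.
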
diff --git a/drivers/infiniband/hw/ipath/ipath_iba6110.c b/drivers/infiniband/hw/ipath/ipath_iba6110.c
deleted file mode 100644 (file)
index 7cc3054..0000000
--- a/drivers/infiniband/hw/ipath/ipath_iba6110.c
+++ /dev/null
@@ -1,1940 +0,0 @@
-/*
- * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
- * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/*
- * This file contains all of the code that is specific to the InfiniPath
- * HT chip.
- */
-
-#include <linux/vmalloc.h>
-#include <linux/pci.h>
-#include <linux/delay.h>
-#include <linux/htirq.h>
-#include <rdma/ib_verbs.h>
-
-#include "ipath_kernel.h"
-#include "ipath_registers.h"
-
-static void ipath_setup_ht_setextled(struct ipath_devdata *, u64, u64);
-
-
-/*
- * This lists the InfiniPath registers, in the actual chip layout.
- * This structure should never be directly accessed.
- *
- * The names are in InterCap form because they're taken straight from
- * the chip specification.  Since they're only used in this file, they
- * don't pollute the rest of the source.
- */
-
-struct _infinipath_do_not_use_kernel_regs {
-       unsigned long long Revision;
-       unsigned long long Control;
-       unsigned long long PageAlign;
-       unsigned long long PortCnt;
-       unsigned long long DebugPortSelect;
-       unsigned long long DebugPort;
-       unsigned long long SendRegBase;
-       unsigned long long UserRegBase;
-       unsigned long long CounterRegBase;
-       unsigned long long Scratch;
-       unsigned long long ReservedMisc1;
-       unsigned long long InterruptConfig;
-       unsigned long long IntBlocked;
-       unsigned long long IntMask;
-       unsigned long long IntStatus;
-       unsigned long long IntClear;
-       unsigned long long ErrorMask;
-       unsigned long long ErrorStatus;
-       unsigned long long ErrorClear;
-       unsigned long long HwErrMask;
-       unsigned long long HwErrStatus;
-       unsigned long long HwErrClear;
-       unsigned long long HwDiagCtrl;
-       unsigned long long MDIO;
-       unsigned long long IBCStatus;
-       unsigned long long IBCCtrl;
-       unsigned long long ExtStatus;
-       unsigned long long ExtCtrl;
-       unsigned long long GPIOOut;
-       unsigned long long GPIOMask;
-       unsigned long long GPIOStatus;
-       unsigned long long GPIOClear;
-       unsigned long long RcvCtrl;
-       unsigned long long RcvBTHQP;
-       unsigned long long RcvHdrSize;
-       unsigned long long RcvHdrCnt;
-       unsigned long long RcvHdrEntSize;
-       unsigned long long RcvTIDBase;
-       unsigned long long RcvTIDCnt;
-       unsigned long long RcvEgrBase;
-       unsigned long long RcvEgrCnt;
-       unsigned long long RcvBufBase;
-       unsigned long long RcvBufSize;
-       unsigned long long RxIntMemBase;
-       unsigned long long RxIntMemSize;
-       unsigned long long RcvPartitionKey;
-       unsigned long long ReservedRcv[10];
-       unsigned long long SendCtrl;
-       unsigned long long SendPIOBufBase;
-       unsigned long long SendPIOSize;
-       unsigned long long SendPIOBufCnt;
-       unsigned long long SendPIOAvailAddr;
-       unsigned long long TxIntMemBase;
-       unsigned long long TxIntMemSize;
-       unsigned long long ReservedSend[9];
-       unsigned long long SendBufferError;
-       unsigned long long SendBufferErrorCONT1;
-       unsigned long long SendBufferErrorCONT2;
-       unsigned long long SendBufferErrorCONT3;
-       unsigned long long ReservedSBE[4];
-       unsigned long long RcvHdrAddr0;
-       unsigned long long RcvHdrAddr1;
-       unsigned long long RcvHdrAddr2;
-       unsigned long long RcvHdrAddr3;
-       unsigned long long RcvHdrAddr4;
-       unsigned long long RcvHdrAddr5;
-       unsigned long long RcvHdrAddr6;
-       unsigned long long RcvHdrAddr7;
-       unsigned long long RcvHdrAddr8;
-       unsigned long long ReservedRHA[7];
-       unsigned long long RcvHdrTailAddr0;
-       unsigned long long RcvHdrTailAddr1;
-       unsigned long long RcvHdrTailAddr2;
-       unsigned long long RcvHdrTailAddr3;
-       unsigned long long RcvHdrTailAddr4;
-       unsigned long long RcvHdrTailAddr5;
-       unsigned long long RcvHdrTailAddr6;
-       unsigned long long RcvHdrTailAddr7;
-       unsigned long long RcvHdrTailAddr8;
-       unsigned long long ReservedRHTA[7];
-       unsigned long long Sync;        /* Software only */
-       unsigned long long Dump;        /* Software only */
-       unsigned long long SimVer;      /* Software only */
-       unsigned long long ReservedSW[5];
-       unsigned long long SerdesConfig0;
-       unsigned long long SerdesConfig1;
-       unsigned long long SerdesStatus;
-       unsigned long long XGXSConfig;
-       unsigned long long ReservedSW2[4];
-};
-
-struct _infinipath_do_not_use_counters {
-       __u64 LBIntCnt;
-       __u64 LBFlowStallCnt;
-       __u64 Reserved1;
-       __u64 TxUnsupVLErrCnt;
-       __u64 TxDataPktCnt;
-       __u64 TxFlowPktCnt;
-       __u64 TxDwordCnt;
-       __u64 TxLenErrCnt;
-       __u64 TxMaxMinLenErrCnt;
-       __u64 TxUnderrunCnt;
-       __u64 TxFlowStallCnt;
-       __u64 TxDroppedPktCnt;
-       __u64 RxDroppedPktCnt;
-       __u64 RxDataPktCnt;
-       __u64 RxFlowPktCnt;
-       __u64 RxDwordCnt;
-       __u64 RxLenErrCnt;
-       __u64 RxMaxMinLenErrCnt;
-       __u64 RxICRCErrCnt;
-       __u64 RxVCRCErrCnt;
-       __u64 RxFlowCtrlErrCnt;
-       __u64 RxBadFormatCnt;
-       __u64 RxLinkProblemCnt;
-       __u64 RxEBPCnt;
-       __u64 RxLPCRCErrCnt;
-       __u64 RxBufOvflCnt;
-       __u64 RxTIDFullErrCnt;
-       __u64 RxTIDValidErrCnt;
-       __u64 RxPKeyMismatchCnt;
-       __u64 RxP0HdrEgrOvflCnt;
-       __u64 RxP1HdrEgrOvflCnt;
-       __u64 RxP2HdrEgrOvflCnt;
-       __u64 RxP3HdrEgrOvflCnt;
-       __u64 RxP4HdrEgrOvflCnt;
-       __u64 RxP5HdrEgrOvflCnt;
-       __u64 RxP6HdrEgrOvflCnt;
-       __u64 RxP7HdrEgrOvflCnt;
-       __u64 RxP8HdrEgrOvflCnt;
-       __u64 Reserved6;
-       __u64 Reserved7;
-       __u64 IBStatusChangeCnt;
-       __u64 IBLinkErrRecoveryCnt;
-       __u64 IBLinkDownedCnt;
-       __u64 IBSymbolErrCnt;
-};
-
-#define IPATH_KREG_OFFSET(field) (offsetof( \
-       struct _infinipath_do_not_use_kernel_regs, field) / sizeof(u64))
-#define IPATH_CREG_OFFSET(field) (offsetof( \
-       struct _infinipath_do_not_use_counters, field) / sizeof(u64))
-
-static const struct ipath_kregs ipath_ht_kregs = {
-       .kr_control = IPATH_KREG_OFFSET(Control),
-       .kr_counterregbase = IPATH_KREG_OFFSET(CounterRegBase),
-       .kr_debugport = IPATH_KREG_OFFSET(DebugPort),
-       .kr_debugportselect = IPATH_KREG_OFFSET(DebugPortSelect),
-       .kr_errorclear = IPATH_KREG_OFFSET(ErrorClear),
-       .kr_errormask = IPATH_KREG_OFFSET(ErrorMask),
-       .kr_errorstatus = IPATH_KREG_OFFSET(ErrorStatus),
-       .kr_extctrl = IPATH_KREG_OFFSET(ExtCtrl),
-       .kr_extstatus = IPATH_KREG_OFFSET(ExtStatus),
-       .kr_gpio_clear = IPATH_KREG_OFFSET(GPIOClear),
-       .kr_gpio_mask = IPATH_KREG_OFFSET(GPIOMask),
-       .kr_gpio_out = IPATH_KREG_OFFSET(GPIOOut),
-       .kr_gpio_status = IPATH_KREG_OFFSET(GPIOStatus),
-       .kr_hwdiagctrl = IPATH_KREG_OFFSET(HwDiagCtrl),
-       .kr_hwerrclear = IPATH_KREG_OFFSET(HwErrClear),
-       .kr_hwerrmask = IPATH_KREG_OFFSET(HwErrMask),
-       .kr_hwerrstatus = IPATH_KREG_OFFSET(HwErrStatus),
-       .kr_ibcctrl = IPATH_KREG_OFFSET(IBCCtrl),
-       .kr_ibcstatus = IPATH_KREG_OFFSET(IBCStatus),
-       .kr_intblocked = IPATH_KREG_OFFSET(IntBlocked),
-       .kr_intclear = IPATH_KREG_OFFSET(IntClear),
-       .kr_interruptconfig = IPATH_KREG_OFFSET(InterruptConfig),
-       .kr_intmask = IPATH_KREG_OFFSET(IntMask),
-       .kr_intstatus = IPATH_KREG_OFFSET(IntStatus),
-       .kr_mdio = IPATH_KREG_OFFSET(MDIO),
-       .kr_pagealign = IPATH_KREG_OFFSET(PageAlign),
-       .kr_partitionkey = IPATH_KREG_OFFSET(RcvPartitionKey),
-       .kr_portcnt = IPATH_KREG_OFFSET(PortCnt),
-       .kr_rcvbthqp = IPATH_KREG_OFFSET(RcvBTHQP),
-       .kr_rcvbufbase = IPATH_KREG_OFFSET(RcvBufBase),
-       .kr_rcvbufsize = IPATH_KREG_OFFSET(RcvBufSize),
-       .kr_rcvctrl = IPATH_KREG_OFFSET(RcvCtrl),
-       .kr_rcvegrbase = IPATH_KREG_OFFSET(RcvEgrBase),
-       .kr_rcvegrcnt = IPATH_KREG_OFFSET(RcvEgrCnt),
-       .kr_rcvhdrcnt = IPATH_KREG_OFFSET(RcvHdrCnt),
-       .kr_rcvhdrentsize = IPATH_KREG_OFFSET(RcvHdrEntSize),
-       .kr_rcvhdrsize = IPATH_KREG_OFFSET(RcvHdrSize),
-       .kr_rcvintmembase = IPATH_KREG_OFFSET(RxIntMemBase),
-       .kr_rcvintmemsize = IPATH_KREG_OFFSET(RxIntMemSize),
-       .kr_rcvtidbase = IPATH_KREG_OFFSET(RcvTIDBase),
-       .kr_rcvtidcnt = IPATH_KREG_OFFSET(RcvTIDCnt),
-       .kr_revision = IPATH_KREG_OFFSET(Revision),
-       .kr_scratch = IPATH_KREG_OFFSET(Scratch),
-       .kr_sendbuffererror = IPATH_KREG_OFFSET(SendBufferError),
-       .kr_sendctrl = IPATH_KREG_OFFSET(SendCtrl),
-       .kr_sendpioavailaddr = IPATH_KREG_OFFSET(SendPIOAvailAddr),
-       .kr_sendpiobufbase = IPATH_KREG_OFFSET(SendPIOBufBase),
-       .kr_sendpiobufcnt = IPATH_KREG_OFFSET(SendPIOBufCnt),
-       .kr_sendpiosize = IPATH_KREG_OFFSET(SendPIOSize),
-       .kr_sendregbase = IPATH_KREG_OFFSET(SendRegBase),
-       .kr_txintmembase = IPATH_KREG_OFFSET(TxIntMemBase),
-       .kr_txintmemsize = IPATH_KREG_OFFSET(TxIntMemSize),
-       .kr_userregbase = IPATH_KREG_OFFSET(UserRegBase),
-       .kr_serdesconfig0 = IPATH_KREG_OFFSET(SerdesConfig0),
-       .kr_serdesconfig1 = IPATH_KREG_OFFSET(SerdesConfig1),
-       .kr_serdesstatus = IPATH_KREG_OFFSET(SerdesStatus),
-       .kr_xgxsconfig = IPATH_KREG_OFFSET(XGXSConfig),
-       /*
-        * These should not be used directly via ipath_write_kreg64();
-        * use them with ipath_write_kreg64_port().
-        */
-       .kr_rcvhdraddr = IPATH_KREG_OFFSET(RcvHdrAddr0),
-       .kr_rcvhdrtailaddr = IPATH_KREG_OFFSET(RcvHdrTailAddr0)
-};
-
-static const struct ipath_cregs ipath_ht_cregs = {
-       .cr_badformatcnt = IPATH_CREG_OFFSET(RxBadFormatCnt),
-       .cr_erricrccnt = IPATH_CREG_OFFSET(RxICRCErrCnt),
-       .cr_errlinkcnt = IPATH_CREG_OFFSET(RxLinkProblemCnt),
-       .cr_errlpcrccnt = IPATH_CREG_OFFSET(RxLPCRCErrCnt),
-       .cr_errpkey = IPATH_CREG_OFFSET(RxPKeyMismatchCnt),
-       .cr_errrcvflowctrlcnt = IPATH_CREG_OFFSET(RxFlowCtrlErrCnt),
-       .cr_err_rlencnt = IPATH_CREG_OFFSET(RxLenErrCnt),
-       .cr_errslencnt = IPATH_CREG_OFFSET(TxLenErrCnt),
-       .cr_errtidfull = IPATH_CREG_OFFSET(RxTIDFullErrCnt),
-       .cr_errtidvalid = IPATH_CREG_OFFSET(RxTIDValidErrCnt),
-       .cr_errvcrccnt = IPATH_CREG_OFFSET(RxVCRCErrCnt),
-       .cr_ibstatuschange = IPATH_CREG_OFFSET(IBStatusChangeCnt),
-       /* calc from Reg_CounterRegBase + offset */
-       .cr_intcnt = IPATH_CREG_OFFSET(LBIntCnt),
-       .cr_invalidrlencnt = IPATH_CREG_OFFSET(RxMaxMinLenErrCnt),
-       .cr_invalidslencnt = IPATH_CREG_OFFSET(TxMaxMinLenErrCnt),
-       .cr_lbflowstallcnt = IPATH_CREG_OFFSET(LBFlowStallCnt),
-       .cr_pktrcvcnt = IPATH_CREG_OFFSET(RxDataPktCnt),
-       .cr_pktrcvflowctrlcnt = IPATH_CREG_OFFSET(RxFlowPktCnt),
-       .cr_pktsendcnt = IPATH_CREG_OFFSET(TxDataPktCnt),
-       .cr_pktsendflowcnt = IPATH_CREG_OFFSET(TxFlowPktCnt),
-       .cr_portovflcnt = IPATH_CREG_OFFSET(RxP0HdrEgrOvflCnt),
-       .cr_rcvebpcnt = IPATH_CREG_OFFSET(RxEBPCnt),
-       .cr_rcvovflcnt = IPATH_CREG_OFFSET(RxBufOvflCnt),
-       .cr_senddropped = IPATH_CREG_OFFSET(TxDroppedPktCnt),
-       .cr_sendstallcnt = IPATH_CREG_OFFSET(TxFlowStallCnt),
-       .cr_sendunderruncnt = IPATH_CREG_OFFSET(TxUnderrunCnt),
-       .cr_wordrcvcnt = IPATH_CREG_OFFSET(RxDwordCnt),
-       .cr_wordsendcnt = IPATH_CREG_OFFSET(TxDwordCnt),
-       .cr_unsupvlcnt = IPATH_CREG_OFFSET(TxUnsupVLErrCnt),
-       .cr_rxdroppktcnt = IPATH_CREG_OFFSET(RxDroppedPktCnt),
-       .cr_iblinkerrrecovcnt = IPATH_CREG_OFFSET(IBLinkErrRecoveryCnt),
-       .cr_iblinkdowncnt = IPATH_CREG_OFFSET(IBLinkDownedCnt),
-       .cr_ibsymbolerrcnt = IPATH_CREG_OFFSET(IBSymbolErrCnt)
-};
-
-/* kr_intstatus, kr_intclear, kr_intmask bits */
-#define INFINIPATH_I_RCVURG_MASK ((1U<<9)-1)
-#define INFINIPATH_I_RCVURG_SHIFT 0
-#define INFINIPATH_I_RCVAVAIL_MASK ((1U<<9)-1)
-#define INFINIPATH_I_RCVAVAIL_SHIFT 12
-
-/* kr_hwerrclear, kr_hwerrmask, kr_hwerrstatus, bits */
-#define INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT 0
-#define INFINIPATH_HWE_HTCMEMPARITYERR_MASK 0x3FFFFFULL
-#define INFINIPATH_HWE_HTCLNKABYTE0CRCERR   0x0000000000800000ULL
-#define INFINIPATH_HWE_HTCLNKABYTE1CRCERR   0x0000000001000000ULL
-#define INFINIPATH_HWE_HTCLNKBBYTE0CRCERR   0x0000000002000000ULL
-#define INFINIPATH_HWE_HTCLNKBBYTE1CRCERR   0x0000000004000000ULL
-#define INFINIPATH_HWE_HTCMISCERR4          0x0000000008000000ULL
-#define INFINIPATH_HWE_HTCMISCERR5          0x0000000010000000ULL
-#define INFINIPATH_HWE_HTCMISCERR6          0x0000000020000000ULL
-#define INFINIPATH_HWE_HTCMISCERR7          0x0000000040000000ULL
-#define INFINIPATH_HWE_HTCBUSTREQPARITYERR  0x0000000080000000ULL
-#define INFINIPATH_HWE_HTCBUSTRESPPARITYERR 0x0000000100000000ULL
-#define INFINIPATH_HWE_HTCBUSIREQPARITYERR  0x0000000200000000ULL
-#define INFINIPATH_HWE_COREPLL_FBSLIP       0x0080000000000000ULL
-#define INFINIPATH_HWE_COREPLL_RFSLIP       0x0100000000000000ULL
-#define INFINIPATH_HWE_HTBPLL_FBSLIP        0x0200000000000000ULL
-#define INFINIPATH_HWE_HTBPLL_RFSLIP        0x0400000000000000ULL
-#define INFINIPATH_HWE_HTAPLL_FBSLIP        0x0800000000000000ULL
-#define INFINIPATH_HWE_HTAPLL_RFSLIP        0x1000000000000000ULL
-#define INFINIPATH_HWE_SERDESPLLFAILED      0x2000000000000000ULL
-
-#define IBA6110_IBCS_LINKTRAININGSTATE_MASK 0xf
-#define IBA6110_IBCS_LINKSTATE_SHIFT 4
-
-/* kr_extstatus bits */
-#define INFINIPATH_EXTS_FREQSEL 0x2
-#define INFINIPATH_EXTS_SERDESSEL 0x4
-#define INFINIPATH_EXTS_MEMBIST_ENDTEST     0x0000000000004000
-#define INFINIPATH_EXTS_MEMBIST_CORRECT     0x0000000000008000
-
-
-/* TID entries (memory), HT-only */
-#define INFINIPATH_RT_ADDR_MASK 0xFFFFFFFFFFULL        /* 40 bits valid */
-#define INFINIPATH_RT_VALID 0x8000000000000000ULL
-#define INFINIPATH_RT_ADDR_SHIFT 0
-#define INFINIPATH_RT_BUFSIZE_MASK 0x3FFFULL
-#define INFINIPATH_RT_BUFSIZE_SHIFT 48
-
-#define INFINIPATH_R_INTRAVAIL_SHIFT 16
-#define INFINIPATH_R_TAILUPD_SHIFT 31
-
-/* kr_xgxsconfig bits */
-#define INFINIPATH_XGXS_RESET          0x7ULL
-
-/*
- * masks and bits that are different in different chips, or present only
- * in one
- */
-static const ipath_err_t infinipath_hwe_htcmemparityerr_mask =
-    INFINIPATH_HWE_HTCMEMPARITYERR_MASK;
-static const ipath_err_t infinipath_hwe_htcmemparityerr_shift =
-    INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT;
-
-static const ipath_err_t infinipath_hwe_htclnkabyte0crcerr =
-    INFINIPATH_HWE_HTCLNKABYTE0CRCERR;
-static const ipath_err_t infinipath_hwe_htclnkabyte1crcerr =
-    INFINIPATH_HWE_HTCLNKABYTE1CRCERR;
-static const ipath_err_t infinipath_hwe_htclnkbbyte0crcerr =
-    INFINIPATH_HWE_HTCLNKBBYTE0CRCERR;
-static const ipath_err_t infinipath_hwe_htclnkbbyte1crcerr =
-    INFINIPATH_HWE_HTCLNKBBYTE1CRCERR;
-
-#define _IPATH_GPIO_SDA_NUM 1
-#define _IPATH_GPIO_SCL_NUM 0
-
-#define IPATH_GPIO_SDA \
-       (1ULL << (_IPATH_GPIO_SDA_NUM+INFINIPATH_EXTC_GPIOOE_SHIFT))
-#define IPATH_GPIO_SCL \
-       (1ULL << (_IPATH_GPIO_SCL_NUM+INFINIPATH_EXTC_GPIOOE_SHIFT))
-
-/* keep the code below somewhat more readable; not used elsewhere */
-#define _IPATH_HTLINK0_CRCBITS (infinipath_hwe_htclnkabyte0crcerr |    \
-                               infinipath_hwe_htclnkabyte1crcerr)
-#define _IPATH_HTLINK1_CRCBITS (infinipath_hwe_htclnkbbyte0crcerr |    \
-                               infinipath_hwe_htclnkbbyte1crcerr)
-#define _IPATH_HTLANE0_CRCBITS (infinipath_hwe_htclnkabyte0crcerr |    \
-                               infinipath_hwe_htclnkbbyte0crcerr)
-#define _IPATH_HTLANE1_CRCBITS (infinipath_hwe_htclnkabyte1crcerr |    \
-                               infinipath_hwe_htclnkbbyte1crcerr)
-
-static void hwerr_crcbits(struct ipath_devdata *dd, ipath_err_t hwerrs,
-                         char *msg, size_t msgl)
-{
-       char bitsmsg[64];
-       ipath_err_t crcbits = hwerrs &
-               (_IPATH_HTLINK0_CRCBITS | _IPATH_HTLINK1_CRCBITS);
-       /* don't check if 8bit HT */
-       if (dd->ipath_flags & IPATH_8BIT_IN_HT0)
-               crcbits &= ~infinipath_hwe_htclnkabyte1crcerr;
-       /* don't check if 8bit HT */
-       if (dd->ipath_flags & IPATH_8BIT_IN_HT1)
-               crcbits &= ~infinipath_hwe_htclnkbbyte1crcerr;
-       /*
-        * we'll want to ignore link errors on a link that is
-        * not in use, if any.  For now, complain about both.
-        */
-       if (crcbits) {
-               u16 ctrl0, ctrl1;
-               snprintf(bitsmsg, sizeof bitsmsg,
-                        "[HT%s lane %s CRC (%llx); powercycle to completely clear]",
-                        !(crcbits & _IPATH_HTLINK1_CRCBITS) ?
-                        "0 (A)" : (!(crcbits & _IPATH_HTLINK0_CRCBITS)
-                                   ? "1 (B)" : "0+1 (A+B)"),
-                        !(crcbits & _IPATH_HTLANE1_CRCBITS) ? "0"
-                        : (!(crcbits & _IPATH_HTLANE0_CRCBITS) ? "1" :
-                           "0+1"), (unsigned long long) crcbits);
-               strlcat(msg, bitsmsg, msgl);
-
-               /*
-                * print extra info for debugging.  slave/primary
-                * config word 4, 8 (link control 0, 1)
-                */
-
-               if (pci_read_config_word(dd->pcidev,
-                                        dd->ipath_ht_slave_off + 0x4,
-                                        &ctrl0))
-                       dev_info(&dd->pcidev->dev, "Couldn't read "
-                                "linkctrl0 of slave/primary "
-                                "config block\n");
-               else if (!(ctrl0 & 1 << 6))
-                       /* not if EOC bit set */
-                       ipath_dbg("HT linkctrl0 0x%x%s%s\n", ctrl0,
-                                 ((ctrl0 >> 8) & 7) ? " CRC" : "",
-                                 ((ctrl0 >> 4) & 1) ? "linkfail" :
-                                 "");
-               if (pci_read_config_word(dd->pcidev,
-                                        dd->ipath_ht_slave_off + 0x8,
-                                        &ctrl1))
-                       dev_info(&dd->pcidev->dev, "Couldn't read "
-                                "linkctrl1 of slave/primary "
-                                "config block\n");
-               else if (!(ctrl1 & 1 << 6))
-                       /* not if EOC bit set */
-                       ipath_dbg("HT linkctrl1 0x%x%s%s\n", ctrl1,
-                                 ((ctrl1 >> 8) & 7) ? " CRC" : "",
-                                 ((ctrl1 >> 4) & 1) ? "linkfail" :
-                                 "");
-
-               /* disable until driver reloaded */
-               dd->ipath_hwerrmask &= ~crcbits;
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
-                                dd->ipath_hwerrmask);
-               ipath_dbg("HT crc errs: %s\n", msg);
-       } else
-               ipath_dbg("ignoring HT crc errors 0x%llx, "
-                         "not in use\n", (unsigned long long)
-                         (hwerrs & (_IPATH_HTLINK0_CRCBITS |
-                                    _IPATH_HTLINK1_CRCBITS)));
-}
-
-/* 6110 specific hardware errors... */
-static const struct ipath_hwerror_msgs ipath_6110_hwerror_msgs[] = {
-       INFINIPATH_HWE_MSG(HTCBUSIREQPARITYERR, "HTC Ireq Parity"),
-       INFINIPATH_HWE_MSG(HTCBUSTREQPARITYERR, "HTC Treq Parity"),
-       INFINIPATH_HWE_MSG(HTCBUSTRESPPARITYERR, "HTC Tresp Parity"),
-       INFINIPATH_HWE_MSG(HTCMISCERR5, "HT core Misc5"),
-       INFINIPATH_HWE_MSG(HTCMISCERR6, "HT core Misc6"),
-       INFINIPATH_HWE_MSG(HTCMISCERR7, "HT core Misc7"),
-       INFINIPATH_HWE_MSG(RXDSYNCMEMPARITYERR, "Rx Dsync"),
-       INFINIPATH_HWE_MSG(SERDESPLLFAILED, "SerDes PLL"),
-};
-
-#define TXE_PIO_PARITY ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF | \
-                       INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC) \
-                       << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT)
-#define RXE_EAGER_PARITY (INFINIPATH_HWE_RXEMEMPARITYERR_EAGERTID \
-                         << INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT)
-
-static void ipath_ht_txe_recover(struct ipath_devdata *dd)
-{
-       ++ipath_stats.sps_txeparity;
-       dev_info(&dd->pcidev->dev,
-               "Recovering from TXE PIO parity error\n");
-}
-
-
-/**
- * ipath_ht_handle_hwerrors - display hardware errors.
- * @dd: the infinipath device
- * @msg: the output buffer
- * @msgl: the size of the output buffer
- *
- * Most hardware errors are catastrophic, but for right now we print
- * them and continue.  We reuse the same message buffer as
- * ipath_handle_errors() to avoid excessive stack use.
- */
-static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg,
-                                    size_t msgl)
-{
-       ipath_err_t hwerrs;
-       u32 bits, ctrl;
-       int isfatal = 0;
-       char bitsmsg[64];
-       int log_idx;
-
-       hwerrs = ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus);
-
-       if (!hwerrs) {
-               ipath_cdbg(VERBOSE, "Called but no hardware errors set\n");
-               /*
-                * better than printing confusing messages.  This seems to be
-                * related to clearing the CRC error, or the PLL error, during
-                * init.
-                */
-               goto bail;
-       } else if (hwerrs == -1LL) {
-               ipath_dev_err(dd, "Read of hardware error status failed "
-                             "(all bits set); ignoring\n");
-               goto bail;
-       }
-       ipath_stats.sps_hwerrs++;
-
-       /* Always clear the error status register, except MEMBISTFAIL,
-        * regardless of whether we continue or stop using the chip.
-        * We want that set so we know it failed, even across driver reload.
-        * We'll still ignore it in the hwerrmask.  We do this partly for
-        * diagnostics, but also for support */
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
-                        hwerrs&~INFINIPATH_HWE_MEMBISTFAILED);
-
-       hwerrs &= dd->ipath_hwerrmask;
-
-       /* We log some errors to EEPROM, check if we have any of those. */
-       for (log_idx = 0; log_idx < IPATH_EEP_LOG_CNT; ++log_idx)
-               if (hwerrs & dd->ipath_eep_st_masks[log_idx].hwerrs_to_log)
-                       ipath_inc_eeprom_err(dd, log_idx, 1);
-
-       /*
-        * make sure we get this much out, unless told to be quiet,
-        * it's a parity error we may recover from,
-        * or it's occurred within the last 5 seconds
-        */
-       if ((hwerrs & ~(dd->ipath_lasthwerror | TXE_PIO_PARITY |
-               RXE_EAGER_PARITY)) ||
-               (ipath_debug & __IPATH_VERBDBG))
-               dev_info(&dd->pcidev->dev, "Hardware error: hwerr=0x%llx "
-                        "(cleared)\n", (unsigned long long) hwerrs);
-       dd->ipath_lasthwerror |= hwerrs;
-
-       if (hwerrs & ~dd->ipath_hwe_bitsextant)
-               ipath_dev_err(dd, "hwerror interrupt with unknown errors "
-                             "%llx set\n", (unsigned long long)
-                             (hwerrs & ~dd->ipath_hwe_bitsextant));
-
-       ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control);
-       if ((ctrl & INFINIPATH_C_FREEZEMODE) && !ipath_diag_inuse) {
-               /*
-                * parity errors in send memory are recoverable,
-                * just cancel the send (if indicated in sendbuffererror),
-                * count the occurrence, unfreeze (if no other handled
-                * hardware error bits are set), and continue. They can
-                * occur if a processor speculative read is done to the PIO
-                * buffer while we are sending a packet, for example.
-                */
-               if (hwerrs & TXE_PIO_PARITY) {
-                       ipath_ht_txe_recover(dd);
-                       hwerrs &= ~TXE_PIO_PARITY;
-               }
-
-               if (!hwerrs) {
-                       ipath_dbg("Clearing freezemode on ignored or "
-                                 "recovered hardware error\n");
-                       ipath_clear_freeze(dd);
-               }
-       }
-
-       *msg = '\0';
-
-       /*
-        * may someday want to decode which bits map to which
-        * functional area for parity errors, etc.
-        */
-       if (hwerrs & (infinipath_hwe_htcmemparityerr_mask
-                     << INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT)) {
-               bits = (u32) ((hwerrs >>
-                              INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT) &
-                             INFINIPATH_HWE_HTCMEMPARITYERR_MASK);
-               snprintf(bitsmsg, sizeof bitsmsg, "[HTC Parity Errs %x] ",
-                        bits);
-               strlcat(msg, bitsmsg, msgl);
-       }
-
-       ipath_format_hwerrors(hwerrs,
-                             ipath_6110_hwerror_msgs,
-                             ARRAY_SIZE(ipath_6110_hwerror_msgs),
-                             msg, msgl);
-
-       if (hwerrs & (_IPATH_HTLINK0_CRCBITS | _IPATH_HTLINK1_CRCBITS))
-               hwerr_crcbits(dd, hwerrs, msg, msgl);
-
-       if (hwerrs & INFINIPATH_HWE_MEMBISTFAILED) {
-               strlcat(msg, "[Memory BIST test failed, InfiniPath hardware unusable]",
-                       msgl);
-               /* ignore from now on, so disable until driver reloaded */
-               dd->ipath_hwerrmask &= ~INFINIPATH_HWE_MEMBISTFAILED;
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
-                                dd->ipath_hwerrmask);
-       }
-#define _IPATH_PLL_FAIL (INFINIPATH_HWE_COREPLL_FBSLIP |       \
-                        INFINIPATH_HWE_COREPLL_RFSLIP |        \
-                        INFINIPATH_HWE_HTBPLL_FBSLIP |         \
-                        INFINIPATH_HWE_HTBPLL_RFSLIP |         \
-                        INFINIPATH_HWE_HTAPLL_FBSLIP |         \
-                        INFINIPATH_HWE_HTAPLL_RFSLIP)
-
-       if (hwerrs & _IPATH_PLL_FAIL) {
-               snprintf(bitsmsg, sizeof bitsmsg,
-                        "[PLL failed (%llx), InfiniPath hardware unusable]",
-                        (unsigned long long) (hwerrs & _IPATH_PLL_FAIL));
-               strlcat(msg, bitsmsg, msgl);
-               /* ignore from now on, so disable until driver reloaded */
-               dd->ipath_hwerrmask &= ~(hwerrs & _IPATH_PLL_FAIL);
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
-                                dd->ipath_hwerrmask);
-       }
-
-       if (hwerrs & INFINIPATH_HWE_SERDESPLLFAILED) {
-               /*
-                * If it occurs, it is left masked since the external
-                * interface is unused.
-                */
-               dd->ipath_hwerrmask &= ~INFINIPATH_HWE_SERDESPLLFAILED;
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
-                                dd->ipath_hwerrmask);
-       }
-
-       if (hwerrs) {
-               /*
-                * If any bits are set that we aren't ignoring, only
-                * make the complaint once, in case it's stuck or
-                * recurring and we get here multiple times; force the
-                * link down so the switch knows, and the LEDs are
-                * turned off.
-                */
-               if (dd->ipath_flags & IPATH_INITTED) {
-                       ipath_set_linkstate(dd, IPATH_IB_LINKDOWN);
-                       ipath_setup_ht_setextled(dd,
-                               INFINIPATH_IBCS_L_STATE_DOWN,
-                               INFINIPATH_IBCS_LT_STATE_DISABLED);
-                       ipath_dev_err(dd, "Fatal Hardware Error (freeze "
-                                         "mode), no longer usable, SN %.16s\n",
-                                         dd->ipath_serial);
-                       isfatal = 1;
-               }
-               *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY;
-               /* mark as having had error */
-               *dd->ipath_statusp |= IPATH_STATUS_HWERROR;
-               /*
-                * mark as not usable, at a minimum until driver
-                * is reloaded, probably until reboot, since no
-                * other reset is possible.
-                */
-               dd->ipath_flags &= ~IPATH_INITTED;
-       }
-       else
-               *msg = 0; /* recovered from all of them */
-       if (*msg)
-               ipath_dev_err(dd, "%s hardware error\n", msg);
-       if (isfatal && !ipath_diag_inuse && dd->ipath_freezemsg)
-               /*
-                * for status file; if no trailing brace is copied,
-                * we'll know it was truncated.
-                */
-               snprintf(dd->ipath_freezemsg,
-                        dd->ipath_freezelen, "{%s}", msg);
-
-bail:;
-}
-
-/**
- * ipath_ht_boardname - fill in the board name
- * @dd: the infinipath device
- * @name: the output buffer
- * @namelen: the size of the output buffer
- *
- * fill in the board name, based on the board revision register
- */
-static int ipath_ht_boardname(struct ipath_devdata *dd, char *name,
-                             size_t namelen)
-{
-       char *n = NULL;
-       u8 boardrev = dd->ipath_boardrev;
-       int ret = 0;
-
-       switch (boardrev) {
-       case 5:
-               /*
-                * original production board; two production levels, with
-                * different serial number ranges.  See ipath_ht_early_init() for
-                * the case where we enable IPATH_GPIO_INTR for the later serial
-                * number range.  The original 112* serial number range is no
-                * longer supported.
-                */
-               n = "InfiniPath_QHT7040";
-               break;
-       case 7:
-               /* small form factor production board */
-               n = "InfiniPath_QHT7140";
-               break;
-       default:                /* don't know, just print the number */
-               ipath_dev_err(dd, "Don't yet know about board "
-                             "with ID %u\n", boardrev);
-               snprintf(name, namelen, "Unknown_InfiniPath_QHT7xxx_%u",
-                        boardrev);
-               break;
-       }
-       if (n)
-               snprintf(name, namelen, "%s", n);
-
-       if (ret) {
-               ipath_dev_err(dd, "Unsupported InfiniPath board %s!\n", name);
-               goto bail;
-       }
-       if (dd->ipath_majrev != 3 || (dd->ipath_minrev < 2 ||
-               dd->ipath_minrev > 4)) {
-               /*
-                * This version of the driver only supports Rev 3.2 - 3.4
-                */
-               ipath_dev_err(dd,
-                             "Unsupported InfiniPath hardware revision %u.%u!\n",
-                             dd->ipath_majrev, dd->ipath_minrev);
-               ret = 1;
-               goto bail;
-       }
-       /*
-        * pkt/word counters are 32 bit, and therefore wrap fast enough
-        * that we snapshot them from a timer, and maintain 64 bit shadow
-        * copies
-        */
-       dd->ipath_flags |= IPATH_32BITCOUNTERS;
-       dd->ipath_flags |= IPATH_GPIO_INTR;
-       if (dd->ipath_lbus_speed != 800)
-               ipath_dev_err(dd,
-                             "Incorrectly configured for HT @ %uMHz\n",
-                             dd->ipath_lbus_speed);
-
-       /*
-        * set here, not in ipath_init_*_funcs because we have to do
-        * it after we can read chip registers.
-        */
-       dd->ipath_ureg_align =
-               ipath_read_kreg32(dd, dd->ipath_kregs->kr_pagealign);
-
-bail:
-       return ret;
-}
-
-static void ipath_check_htlink(struct ipath_devdata *dd)
-{
-       u8 linkerr, link_off, i;
-
-       for (i = 0; i < 2; i++) {
-               link_off = dd->ipath_ht_slave_off + i * 4 + 0xd;
-               if (pci_read_config_byte(dd->pcidev, link_off, &linkerr))
-                       dev_info(&dd->pcidev->dev, "Couldn't read "
-                                "linkerror%d of HT slave/primary block\n",
-                                i);
-               else if (linkerr & 0xf0) {
-                       ipath_cdbg(VERBOSE, "HT linkerr%d bits 0x%x set, "
-                                  "clearing\n", linkerr >> 4, i);
-                       /*
-                        * writing the linkerr bits that are set should
-                        * clear them
-                        */
-                       if (pci_write_config_byte(dd->pcidev, link_off,
-                                                 linkerr))
-                               ipath_dbg("Failed write to clear HT "
-                                         "linkerror%d\n", i);
-                       if (pci_read_config_byte(dd->pcidev, link_off,
-                                                &linkerr))
-                               dev_info(&dd->pcidev->dev,
-                                        "Couldn't reread linkerror%d of "
-                                        "HT slave/primary block\n", i);
-                       else if (linkerr & 0xf0)
-                               dev_info(&dd->pcidev->dev,
-                                        "HT linkerror%d bits 0x%x "
-                                        "couldn't be cleared\n",
-                                        i, linkerr >> 4);
-               }
-       }
-}
-
-static int ipath_setup_ht_reset(struct ipath_devdata *dd)
-{
-       ipath_dbg("No reset possible for this InfiniPath hardware\n");
-       return 0;
-}
-
-#define HT_INTR_DISC_CONFIG  0x80      /* HT interrupt and discovery cap */
-#define HT_INTR_REG_INDEX    2 /* intconfig requires indirect accesses */
-
-/*
- * Bits 13-15 of command==0 is slave/primary block.  Clear any HT CRC
- * errors.  We only bother to do this at load time, because it's OK if
- * it happened before we were loaded (first time after boot/reset),
- * but any time after that, it's fatal anyway.  We also need to skip
- * the upper byte error checks if we are in 8 bit mode, so figure out
- * our width.  For now, at least, also complain if it's 8 bit.
- */
-static void slave_or_pri_blk(struct ipath_devdata *dd, struct pci_dev *pdev,
-                            int pos, u8 cap_type)
-{
-       u8 linkwidth = 0, linkerr, link_a_b_off, link_off;
-       u16 linkctrl = 0;
-       int i;
-
-       dd->ipath_ht_slave_off = pos;
-       /* command word, master_host bit */
-       /* master host || slave */
-       if ((cap_type >> 2) & 1)
-               link_a_b_off = 4;
-       else
-               link_a_b_off = 0;
-       ipath_cdbg(VERBOSE, "HT%u (Link %c) connected to processor\n",
-                  link_a_b_off ? 1 : 0,
-                  link_a_b_off ? 'B' : 'A');
-
-       link_a_b_off += pos;
-
-       /*
-        * check both link control registers; clear both HT CRC sets if
-        * necessary.
-        */
-       for (i = 0; i < 2; i++) {
-               link_off = pos + i * 4 + 0x4;
-               if (pci_read_config_word(pdev, link_off, &linkctrl))
-                       ipath_dev_err(dd, "Couldn't read HT link control%d "
-                                     "register\n", i);
-               else if (linkctrl & (0xf << 8)) {
-                       ipath_cdbg(VERBOSE, "Clear linkctrl%d CRC Error "
-                                  "bits %x\n", i, linkctrl & (0xf << 8));
-                       /*
-                        * now write them back to clear the error.
-                        */
-                       pci_write_config_word(pdev, link_off,
-                                             linkctrl & (0xf << 8));
-               }
-       }
-
-       /*
-        * As with HT CRC bits, same for protocol errors that might occur
-        * during boot.
-        */
-       for (i = 0; i < 2; i++) {
-               link_off = pos + i * 4 + 0xd;
-               if (pci_read_config_byte(pdev, link_off, &linkerr))
-                       dev_info(&pdev->dev, "Couldn't read linkerror%d "
-                                "of HT slave/primary block\n", i);
-               else if (linkerr & 0xf0) {
-                       ipath_cdbg(VERBOSE, "HT linkerr%d bits 0x%x set, "
-                                  "clearing\n", i, linkerr >> 4);
-                       /*
-                        * writing the linkerr bits that are set will clear
-                        * them
-                        */
-                       if (pci_write_config_byte
-                           (pdev, link_off, linkerr))
-                               ipath_dbg("Failed write to clear HT "
-                                         "linkerror%d\n", i);
-                       if (pci_read_config_byte(pdev, link_off, &linkerr))
-                               dev_info(&pdev->dev, "Couldn't reread "
-                                        "linkerror%d of HT slave/primary "
-                                        "block\n", i);
-                       else if (linkerr & 0xf0)
-                               dev_info(&pdev->dev, "HT linkerror%d bits "
-                                        "0x%x couldn't be cleared\n",
-                                        i, linkerr >> 4);
-               }
-       }
-
-       /*
-        * this is just for our link to the host, not devices connected
-        * through tunnel.
-        */
-
-       if (pci_read_config_byte(pdev, link_a_b_off + 7, &linkwidth))
-               ipath_dev_err(dd, "Couldn't read HT link width "
-                             "config register\n");
-       else {
-               u32 width;
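-               /* decode the 3-bit link width field into a width in bits;
-                * unrecognized values fall back to 8 bits */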
-               switch (linkwidth & 7) {
-               case 5:
-                       width = 4;
-                       break;
-               case 4:
-                       width = 2;
-                       break;
-               case 3:
-                       width = 32;
-                       break;
-               case 1:
-                       width = 16;
-                       break;
-               case 0:
-               default:        /* if wrong, assume 8 bit */
-                       width = 8;
-                       break;
-               }
-
-               dd->ipath_lbus_width = width;
-
-               if (linkwidth != 0x11) {
-                       ipath_dev_err(dd, "Not configured for 16 bit HT "
-                                     "(%x)\n", linkwidth);
-                       if (!(linkwidth & 0xf)) {
-                               ipath_dbg("Will ignore HT lane1 errors\n");
-                               dd->ipath_flags |= IPATH_8BIT_IN_HT0;
-                       }
-               }
-       }
-
-       /*
-        * this is just for our link to the host, not devices connected
-        * through tunnel.
-        */
-       if (pci_read_config_byte(pdev, link_a_b_off + 0xd, &linkwidth))
-               ipath_dev_err(dd, "Couldn't read HT link frequency "
-                             "config register\n");
-       else {
-               u32 speed;
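-               /* map the 4-bit link frequency field to a speed in MHz */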
-               switch (linkwidth & 0xf) {
-               case 6:
-                       speed = 1000;
-                       break;
-               case 5:
-                       speed = 800;
-                       break;
-               case 4:
-                       speed = 600;
-                       break;
-               case 3:
-                       speed = 500;
-                       break;
-               case 2:
-                       speed = 400;
-                       break;
-               case 1:
-                       speed = 300;
-                       break;
-               default:
-                       /*
-                        * assume reserved and vendor-specific are 200...
-                        */
-               case 0:
-                       speed = 200;
-                       break;
-               }
-               dd->ipath_lbus_speed = speed;
-       }
-
-       snprintf(dd->ipath_lbus_info, sizeof(dd->ipath_lbus_info),
-               "HyperTransport,%uMHz,x%u\n",
-               dd->ipath_lbus_speed,
-               dd->ipath_lbus_width);
-}
-
-static int ipath_ht_intconfig(struct ipath_devdata *dd)
-{
-       int ret;
-
-       if (dd->ipath_intconfig) {
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_interruptconfig,
-                                dd->ipath_intconfig);  /* interrupt address */
-               ret = 0;
-       } else {
-               ipath_dev_err(dd, "No interrupts enabled, couldn't setup "
-                             "interrupt address\n");
-               ret = -EINVAL;
-       }
-
-       return ret;
-}
-
-static void ipath_ht_irq_update(struct pci_dev *dev, int irq,
-                               struct ht_irq_msg *msg)
-{
-       struct ipath_devdata *dd = pci_get_drvdata(dev);
-       u64 prev_intconfig = dd->ipath_intconfig;
-
-       dd->ipath_intconfig = msg->address_lo;
-       dd->ipath_intconfig |= ((u64) msg->address_hi) << 32;
-
-       /*
-        * If the previous value of dd->ipath_intconfig is zero, we're
-        * getting configured for the first time, and must not program the
-        * intconfig register here (it will be programmed later, when the
-        * hardware is ready).  Otherwise, we should.
-        */
-       if (prev_intconfig)
-               ipath_ht_intconfig(dd);
-}
-
-/**
- * ipath_setup_ht_config - setup the interruptconfig register
- * @dd: the infinipath device
- * @pdev: the PCI device
- *
- * setup the interruptconfig register from the HT config info.
- * Also clear CRC errors in HT linkcontrol, if necessary.
- * This is done only for the real hardware.  It is done before
- * chip address space is initted, so can't touch infinipath registers
- */
-static int ipath_setup_ht_config(struct ipath_devdata *dd,
-                                struct pci_dev *pdev)
-{
-       int pos, ret;
-
-       ret = __ht_create_irq(pdev, 0, ipath_ht_irq_update);
-       if (ret < 0) {
-               ipath_dev_err(dd, "Couldn't create interrupt handler: "
-                             "err %d\n", ret);
-               goto bail;
-       }
-       dd->ipath_irq = ret;
-       ret = 0;
-
-       /*
-        * Handle clearing CRC errors in linkctrl register if necessary.  We
-        * do this early, before we ever enable errors or hardware errors,
-        * mostly to avoid causing the chip to enter freeze mode.
-        */
-       pos = pci_find_capability(pdev, PCI_CAP_ID_HT);
-       if (!pos) {
-               ipath_dev_err(dd, "Couldn't find HyperTransport "
-                             "capability; no interrupts\n");
-               ret = -ENODEV;
-               goto bail;
-       }
-       do {
-               u8 cap_type;
-
-               /*
-                * The HT capability type byte is 3 bytes after the
-                * capability byte.
-                */
-               if (pci_read_config_byte(pdev, pos + 3, &cap_type)) {
-                       dev_info(&pdev->dev, "Couldn't read config "
-                                "command @ %d\n", pos);
-                       continue;
-               }
-               if (!(cap_type & 0xE0))
-                       slave_or_pri_blk(dd, pdev, pos, cap_type);
-       } while ((pos = pci_find_next_capability(pdev, pos,
-                                                PCI_CAP_ID_HT)));
-
-       dd->ipath_flags |= IPATH_SWAP_PIOBUFS;
-
-bail:
-       return ret;
-}
-
-/**
- * ipath_setup_ht_cleanup - clean up any per-chip chip-specific stuff
- * @dd: the infinipath device
- *
- * Called during driver unload.
- * This is currently a nop for the HT chip, not for all chips
- */
-static void ipath_setup_ht_cleanup(struct ipath_devdata *dd)
-{
-}
-
-/**
- * ipath_setup_ht_setextled - set the state of the two external LEDs
- * @dd: the infinipath device
- * @lst: the L state
- * @ltst: the LT state
- *
- * Set the state of the two external LEDs, to indicate physical and
- * logical state of IB link.   For this chip (at least with recommended
- * board pinouts), LED1 is Green (physical state), and LED2 is Yellow
- * (logical state)
- *
- * Note:  We try to match the Mellanox HCA LED behavior as best
- * we can.  Green indicates physical link state is OK (something is
- * plugged in, and we can train).
- * Amber indicates the link is logically up (ACTIVE).
- * Mellanox further blinks the amber LED to indicate data packet
- * activity, but we have no hardware support for that, so it would
- * require waking up every 10-20 msecs and checking the counters
- * on the chip, and then turning the LED off if appropriate.  That's
- * visible overhead, so not something we will do.
- *
- */
-static void ipath_setup_ht_setextled(struct ipath_devdata *dd,
-                                    u64 lst, u64 ltst)
-{
-       u64 extctl;
-       unsigned long flags = 0;
-
-       /* the diags use the LED to indicate diag info, so we leave
-        * the external LED alone when the diags are running */
-       if (ipath_diag_inuse)
-               return;
-
-       /* Allow override of LED display for, e.g. Locating system in rack */
-       if (dd->ipath_led_override) {
-               ltst = (dd->ipath_led_override & IPATH_LED_PHYS)
-                       ? INFINIPATH_IBCS_LT_STATE_LINKUP
-                       : INFINIPATH_IBCS_LT_STATE_DISABLED;
-               lst = (dd->ipath_led_override & IPATH_LED_LOG)
-                       ? INFINIPATH_IBCS_L_STATE_ACTIVE
-                       : INFINIPATH_IBCS_L_STATE_DOWN;
-       }
-
-       spin_lock_irqsave(&dd->ipath_gpio_lock, flags);
-       /*
-        * start by setting both LED control bits to off, then turn
-        * on the appropriate bit(s).
-        */
-       if (dd->ipath_boardrev == 8) { /* LS/X-1 uses different pins */
-               /*
-                * major difference is that INFINIPATH_EXTC_LEDGBLERR_OFF
-                * is inverted,  because it is normally used to indicate
-                * a hardware fault at reset, if there were errors
-                */
-               extctl = (dd->ipath_extctrl & ~INFINIPATH_EXTC_LEDGBLOK_ON)
-                       | INFINIPATH_EXTC_LEDGBLERR_OFF;
-               if (ltst == INFINIPATH_IBCS_LT_STATE_LINKUP)
-                       extctl &= ~INFINIPATH_EXTC_LEDGBLERR_OFF;
-               if (lst == INFINIPATH_IBCS_L_STATE_ACTIVE)
-                       extctl |= INFINIPATH_EXTC_LEDGBLOK_ON;
-       }
-       else {
-               extctl = dd->ipath_extctrl &
-                       ~(INFINIPATH_EXTC_LED1PRIPORT_ON |
-                         INFINIPATH_EXTC_LED2PRIPORT_ON);
-               if (ltst == INFINIPATH_IBCS_LT_STATE_LINKUP)
-                       extctl |= INFINIPATH_EXTC_LED1PRIPORT_ON;
-               if (lst == INFINIPATH_IBCS_L_STATE_ACTIVE)
-                       extctl |= INFINIPATH_EXTC_LED2PRIPORT_ON;
-       }
-       dd->ipath_extctrl = extctl;
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, extctl);
-       spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags);
-}
-
-static void ipath_init_ht_variables(struct ipath_devdata *dd)
-{
-       /*
-        * setup the register offsets, since they are different for each
-        * chip
-        */
-       dd->ipath_kregs = &ipath_ht_kregs;
-       dd->ipath_cregs = &ipath_ht_cregs;
-
-       dd->ipath_gpio_sda_num = _IPATH_GPIO_SDA_NUM;
-       dd->ipath_gpio_scl_num = _IPATH_GPIO_SCL_NUM;
-       dd->ipath_gpio_sda = IPATH_GPIO_SDA;
-       dd->ipath_gpio_scl = IPATH_GPIO_SCL;
-
-       /*
-        * Fill in data for field-values that change in newer chips.
-        * We dynamically specify only the mask for LINKTRAININGSTATE
-        * and only the shift for LINKSTATE, as they are the only ones
-        * that change.  Also precalculate the 3 link states of interest
-        * and the combined mask.
-        */
-       dd->ibcs_ls_shift = IBA6110_IBCS_LINKSTATE_SHIFT;
-       dd->ibcs_lts_mask = IBA6110_IBCS_LINKTRAININGSTATE_MASK;
-       dd->ibcs_mask = (INFINIPATH_IBCS_LINKSTATE_MASK <<
-               dd->ibcs_ls_shift) | dd->ibcs_lts_mask;
-       dd->ib_init = (INFINIPATH_IBCS_LT_STATE_LINKUP <<
-               INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) |
-               (INFINIPATH_IBCS_L_STATE_INIT << dd->ibcs_ls_shift);
-       dd->ib_arm = (INFINIPATH_IBCS_LT_STATE_LINKUP <<
-               INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) |
-               (INFINIPATH_IBCS_L_STATE_ARM << dd->ibcs_ls_shift);
-       dd->ib_active = (INFINIPATH_IBCS_LT_STATE_LINKUP <<
-               INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) |
-               (INFINIPATH_IBCS_L_STATE_ACTIVE << dd->ibcs_ls_shift);
-
-       /*
-        * Fill in data for ibcc field-values that change in newer chips.
-        * We dynamically specify only the mask for LINKINITCMD
-        * and only the shift for LINKCMD and MAXPKTLEN, as they are
-        * the only ones that change.
-        */
-       dd->ibcc_lic_mask = INFINIPATH_IBCC_LINKINITCMD_MASK;
-       dd->ibcc_lc_shift = INFINIPATH_IBCC_LINKCMD_SHIFT;
-       dd->ibcc_mpl_shift = INFINIPATH_IBCC_MAXPKTLEN_SHIFT;
-
-       /* Fill in shifts for RcvCtrl. */
-       dd->ipath_r_portenable_shift = INFINIPATH_R_PORTENABLE_SHIFT;
-       dd->ipath_r_intravail_shift = INFINIPATH_R_INTRAVAIL_SHIFT;
-       dd->ipath_r_tailupd_shift = INFINIPATH_R_TAILUPD_SHIFT;
-       dd->ipath_r_portcfg_shift = 0; /* Not on IBA6110 */
-
-       dd->ipath_i_bitsextant =
-               (INFINIPATH_I_RCVURG_MASK << INFINIPATH_I_RCVURG_SHIFT) |
-               (INFINIPATH_I_RCVAVAIL_MASK <<
-                INFINIPATH_I_RCVAVAIL_SHIFT) |
-               INFINIPATH_I_ERROR | INFINIPATH_I_SPIOSENT |
-               INFINIPATH_I_SPIOBUFAVAIL | INFINIPATH_I_GPIO;
-
-       dd->ipath_e_bitsextant =
-               INFINIPATH_E_RFORMATERR | INFINIPATH_E_RVCRC |
-               INFINIPATH_E_RICRC | INFINIPATH_E_RMINPKTLEN |
-               INFINIPATH_E_RMAXPKTLEN | INFINIPATH_E_RLONGPKTLEN |
-               INFINIPATH_E_RSHORTPKTLEN | INFINIPATH_E_RUNEXPCHAR |
-               INFINIPATH_E_RUNSUPVL | INFINIPATH_E_REBP |
-               INFINIPATH_E_RIBFLOW | INFINIPATH_E_RBADVERSION |
-               INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL |
-               INFINIPATH_E_RBADTID | INFINIPATH_E_RHDRLEN |
-               INFINIPATH_E_RHDR | INFINIPATH_E_RIBLOSTLINK |
-               INFINIPATH_E_SMINPKTLEN | INFINIPATH_E_SMAXPKTLEN |
-               INFINIPATH_E_SUNDERRUN | INFINIPATH_E_SPKTLEN |
-               INFINIPATH_E_SDROPPEDSMPPKT | INFINIPATH_E_SDROPPEDDATAPKT |
-               INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SUNEXPERRPKTNUM |
-               INFINIPATH_E_SUNSUPVL | INFINIPATH_E_IBSTATUSCHANGED |
-               INFINIPATH_E_INVALIDADDR | INFINIPATH_E_RESET |
-               INFINIPATH_E_HARDWARE;
-
-       dd->ipath_hwe_bitsextant =
-               (INFINIPATH_HWE_HTCMEMPARITYERR_MASK <<
-                INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT) |
-               (INFINIPATH_HWE_TXEMEMPARITYERR_MASK <<
-                INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) |
-               (INFINIPATH_HWE_RXEMEMPARITYERR_MASK <<
-                INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) |
-               INFINIPATH_HWE_HTCLNKABYTE0CRCERR |
-               INFINIPATH_HWE_HTCLNKABYTE1CRCERR |
-               INFINIPATH_HWE_HTCLNKBBYTE0CRCERR |
-               INFINIPATH_HWE_HTCLNKBBYTE1CRCERR |
-               INFINIPATH_HWE_HTCMISCERR4 |
-               INFINIPATH_HWE_HTCMISCERR5 | INFINIPATH_HWE_HTCMISCERR6 |
-               INFINIPATH_HWE_HTCMISCERR7 |
-               INFINIPATH_HWE_HTCBUSTREQPARITYERR |
-               INFINIPATH_HWE_HTCBUSTRESPPARITYERR |
-               INFINIPATH_HWE_HTCBUSIREQPARITYERR |
-               INFINIPATH_HWE_RXDSYNCMEMPARITYERR |
-               INFINIPATH_HWE_MEMBISTFAILED |
-               INFINIPATH_HWE_COREPLL_FBSLIP |
-               INFINIPATH_HWE_COREPLL_RFSLIP |
-               INFINIPATH_HWE_HTBPLL_FBSLIP |
-               INFINIPATH_HWE_HTBPLL_RFSLIP |
-               INFINIPATH_HWE_HTAPLL_FBSLIP |
-               INFINIPATH_HWE_HTAPLL_RFSLIP |
-               INFINIPATH_HWE_SERDESPLLFAILED |
-               INFINIPATH_HWE_IBCBUSTOSPCPARITYERR |
-               INFINIPATH_HWE_IBCBUSFRSPCPARITYERR;
-
-       dd->ipath_i_rcvavail_mask = INFINIPATH_I_RCVAVAIL_MASK;
-       dd->ipath_i_rcvurg_mask = INFINIPATH_I_RCVURG_MASK;
-       dd->ipath_i_rcvavail_shift = INFINIPATH_I_RCVAVAIL_SHIFT;
-       dd->ipath_i_rcvurg_shift = INFINIPATH_I_RCVURG_SHIFT;
-
-       /*
-        * EEPROM error log 0 is TXE Parity errors. 1 is RXE Parity.
-        * 2 is Some Misc, 3 is reserved for future.
-        */
-       dd->ipath_eep_st_masks[0].hwerrs_to_log =
-               INFINIPATH_HWE_TXEMEMPARITYERR_MASK <<
-               INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT;
-
-       dd->ipath_eep_st_masks[1].hwerrs_to_log =
-               INFINIPATH_HWE_RXEMEMPARITYERR_MASK <<
-               INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT;
-
-       dd->ipath_eep_st_masks[2].errs_to_log = INFINIPATH_E_RESET;
-
-       dd->delay_mult = 2; /* SDR, 4X, can't change */
-
-       dd->ipath_link_width_supported = IB_WIDTH_1X | IB_WIDTH_4X;
-       dd->ipath_link_speed_supported = IPATH_IB_SDR;
-       dd->ipath_link_width_enabled = IB_WIDTH_4X;
-       dd->ipath_link_speed_enabled = dd->ipath_link_speed_supported;
-       /* these can't change for this chip, so set once */
-       dd->ipath_link_width_active = dd->ipath_link_width_enabled;
-       dd->ipath_link_speed_active = dd->ipath_link_speed_enabled;
-}
-
-/**
- * ipath_ht_init_hwerrors - enable hardware errors
- * @dd: the infinipath device
- *
- * now that we have finished initializing everything that might reasonably
- * cause a hardware error, and cleared those error bits as they occur,
- * we can enable hardware errors in the mask (potentially enabling
- * freeze mode), and enable hardware errors as errors (along with
- * everything else) in errormask
- */
-static void ipath_ht_init_hwerrors(struct ipath_devdata *dd)
-{
-       ipath_err_t val;
-       u64 extsval;
-
-       extsval = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extstatus);
-
-       if (!(extsval & INFINIPATH_EXTS_MEMBIST_ENDTEST))
-               ipath_dev_err(dd, "MemBIST did not complete!\n");
-       if (extsval & INFINIPATH_EXTS_MEMBIST_CORRECT)
-               ipath_dbg("MemBIST corrected\n");
-
-       ipath_check_htlink(dd);
-
-       /* barring bugs, all hwerrors become interrupts, which can cause freeze mode */
-       val = -1LL;
-       /* don't look at crc lane1 if 8 bit */
-       if (dd->ipath_flags & IPATH_8BIT_IN_HT0)
-               val &= ~infinipath_hwe_htclnkabyte1crcerr;
-       /* don't look at crc lane1 if 8 bit */
-       if (dd->ipath_flags & IPATH_8BIT_IN_HT1)
-               val &= ~infinipath_hwe_htclnkbbyte1crcerr;
-
-       /*
-        * disable RXDSYNCMEMPARITY because external serdes is unused,
-        * and therefore the logic will never be used or initialized,
-        * and uninitialized state will normally result in this error
-        * being asserted.  Similarly for the external serdes pll
-        * lock signal.
-        */
-       val &= ~(INFINIPATH_HWE_SERDESPLLFAILED |
-                INFINIPATH_HWE_RXDSYNCMEMPARITYERR);
-
-       /*
-        * Disable MISCERR4 because of an inversion in the HT core
-        * logic checking for errors that cause this bit to be set.
-        * The errata can also cause the protocol error bit to be set
-        * in the HT config space linkerror register(s).
-        */
-       val &= ~INFINIPATH_HWE_HTCMISCERR4;
-
-       /*
-        * PLL ignored because unused MDIO interface has a logic problem
-        */
-       if (dd->ipath_boardrev == 4 || dd->ipath_boardrev == 9)
-               val &= ~INFINIPATH_HWE_SERDESPLLFAILED;
-       dd->ipath_hwerrmask = val;
-}
-
-
-
-
-/**
- * ipath_ht_bringup_serdes - bring up the serdes
- * @dd: the infinipath device
- */
-static int ipath_ht_bringup_serdes(struct ipath_devdata *dd)
-{
-       u64 val, config1;
-       int ret = 0, change = 0;
-
-       ipath_dbg("Trying to bringup serdes\n");
-
-       if (ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus) &
-           INFINIPATH_HWE_SERDESPLLFAILED)
-       {
-               ipath_dbg("At start, serdes PLL failed bit set in "
-                         "hwerrstatus, clearing and continuing\n");
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
-                                INFINIPATH_HWE_SERDESPLLFAILED);
-       }
-
-       val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0);
-       config1 = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig1);
-
-       ipath_cdbg(VERBOSE, "Initial serdes status is config0=%llx "
-                  "config1=%llx, sstatus=%llx xgxs %llx\n",
-                  (unsigned long long) val, (unsigned long long) config1,
-                  (unsigned long long)
-                  ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesstatus),
-                  (unsigned long long)
-                  ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig));
-
-       /* force reset on */
-       val |= INFINIPATH_SERDC0_RESET_PLL
-               /* | INFINIPATH_SERDC0_RESET_MASK */
-               ;
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, val);
-       udelay(15);             /* need pll reset set at least for a bit */
-
-       if (val & INFINIPATH_SERDC0_RESET_PLL) {
-               u64 val2 = val &= ~INFINIPATH_SERDC0_RESET_PLL;
-               /* set lane resets, and tx idle, during pll reset */
-               val2 |= INFINIPATH_SERDC0_RESET_MASK |
-                       INFINIPATH_SERDC0_TXIDLE;
-               ipath_cdbg(VERBOSE, "Clearing serdes PLL reset (writing "
-                          "%llx)\n", (unsigned long long) val2);
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0,
-                                val2);
-               /*
-                * be sure chip saw it
-                */
-               val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-               /*
-                * need pll reset clear at least 11 usec before lane
-                * resets cleared; give it a few more
-                */
-               udelay(15);
-               val = val2;     /* for check below */
-       }
-
-       if (val & (INFINIPATH_SERDC0_RESET_PLL |
-                  INFINIPATH_SERDC0_RESET_MASK |
-                  INFINIPATH_SERDC0_TXIDLE)) {
-               val &= ~(INFINIPATH_SERDC0_RESET_PLL |
-                        INFINIPATH_SERDC0_RESET_MASK |
-                        INFINIPATH_SERDC0_TXIDLE);
-               /* clear them */
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0,
-                                val);
-       }
-
-       val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig);
-       if (val & INFINIPATH_XGXS_RESET) {
-               /* normally true after boot */
-               val &= ~INFINIPATH_XGXS_RESET;
-               change = 1;
-       }
-       if (((val >> INFINIPATH_XGXS_RX_POL_SHIFT) &
-            INFINIPATH_XGXS_RX_POL_MASK) != dd->ipath_rx_pol_inv) {
-               /* need to compensate for Tx inversion in partner */
-               val &= ~(INFINIPATH_XGXS_RX_POL_MASK <<
-                        INFINIPATH_XGXS_RX_POL_SHIFT);
-               val |= dd->ipath_rx_pol_inv <<
-                       INFINIPATH_XGXS_RX_POL_SHIFT;
-               change = 1;
-       }
-       if (change)
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, val);
-
-       val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0);
-
-       /* clear current and de-emphasis bits */
-       config1 &= ~0x0ffffffff00ULL;
-       /* set current to 20ma */
-       config1 |= 0x00000000000ULL;
-       /* set de-emphasis to -5.68dB */
-       config1 |= 0x0cccc000000ULL;
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig1, config1);
-
-       ipath_cdbg(VERBOSE, "After setup: serdes status is config0=%llx "
-                  "config1=%llx, sstatus=%llx xgxs %llx\n",
-                  (unsigned long long) val, (unsigned long long) config1,
-                  (unsigned long long)
-                  ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesstatus),
-                  (unsigned long long)
-                  ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig));
-
-       return ret;             /* for now, say we always succeeded */
-}
-
-/**
- * ipath_ht_quiet_serdes - set serdes to txidle
- * @dd: the infinipath device
- *
- * Called when the driver is being unloaded
- */
-static void ipath_ht_quiet_serdes(struct ipath_devdata *dd)
-{
-       u64 val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0);
-
-       val |= INFINIPATH_SERDC0_TXIDLE;
-       ipath_dbg("Setting TxIdleEn on serdes (config0 = %llx)\n",
-                 (unsigned long long) val);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, val);
-}
-
-/**
- * ipath_ht_put_tid - write a TID to the chip
- * @dd: the infinipath device
- * @tidptr: pointer to the expected TID (in chip) to update
- * @type: RCVHQ_RCV_TYPE_EAGER (1) for eager, RCVHQ_RCV_TYPE_EXPECTED (0) for expected
- * @pa: physical address of in memory buffer; ipath_tidinvalid if freeing
- *
- * This exists as a separate routine to allow for special locking etc.
- * It's used for both the full cleanup on exit, as well as the normal
- * setup and teardown.
- */
-static void ipath_ht_put_tid(struct ipath_devdata *dd,
-                            u64 __iomem *tidptr, u32 type,
-                            unsigned long pa)
-{
-       if (!dd->ipath_kregbase)
-               return;
-
-       if (pa != dd->ipath_tidinvalid) {
-               if (unlikely((pa & ~INFINIPATH_RT_ADDR_MASK))) {
-                       dev_info(&dd->pcidev->dev,
-                                "physaddr %lx has more than "
-                                "40 bits, using only 40!!!\n", pa);
-                       pa &= INFINIPATH_RT_ADDR_MASK;
-               }
-               if (type == RCVHQ_RCV_TYPE_EAGER)
-                       pa |= dd->ipath_tidtemplate;
-               else {
-                       /* in words (fixed, full page).  */
-                       u64 lenvalid = PAGE_SIZE >> 2;
-                       lenvalid <<= INFINIPATH_RT_BUFSIZE_SHIFT;
-                       pa |= lenvalid | INFINIPATH_RT_VALID;
-               }
-       }
-
-       writeq(pa, tidptr);
-}
-
-
-/**
- * ipath_ht_clear_tids - clear all TID entries for a port, expected and eager
- * @dd: the infinipath device
- * @port: the port
- *
- * Used from ipath_close(), and at chip initialization.
- */
-static void ipath_ht_clear_tids(struct ipath_devdata *dd, unsigned port)
-{
-       u64 __iomem *tidbase;
-       int i;
-
-       if (!dd->ipath_kregbase)
-               return;
-
-       ipath_cdbg(VERBOSE, "Invalidate TIDs for port %u\n", port);
-
-       /*
-        * need to invalidate all of the expected TID entries for this
-        * port, so we don't have valid entries that might somehow get
-        * used (early in next use of this port, or through some bug)
-        */
-       tidbase = (u64 __iomem *) ((char __iomem *)(dd->ipath_kregbase) +
-                                  dd->ipath_rcvtidbase +
-                                  port * dd->ipath_rcvtidcnt *
-                                  sizeof(*tidbase));
-       for (i = 0; i < dd->ipath_rcvtidcnt; i++)
-               ipath_ht_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EXPECTED,
-                                dd->ipath_tidinvalid);
-
-       tidbase = (u64 __iomem *) ((char __iomem *)(dd->ipath_kregbase) +
-                                  dd->ipath_rcvegrbase +
-                                  port * dd->ipath_rcvegrcnt *
-                                  sizeof(*tidbase));
-
-       for (i = 0; i < dd->ipath_rcvegrcnt; i++)
-               ipath_ht_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EAGER,
-                                dd->ipath_tidinvalid);
-}
-
-/**
- * ipath_ht_tidtemplate - setup constants for TID updates
- * @dd: the infinipath device
- *
- * We setup stuff that we use a lot, to avoid calculating each time
- */
-static void ipath_ht_tidtemplate(struct ipath_devdata *dd)
-{
-       dd->ipath_tidtemplate = dd->ipath_ibmaxlen >> 2;
-       dd->ipath_tidtemplate <<= INFINIPATH_RT_BUFSIZE_SHIFT;
-       dd->ipath_tidtemplate |= INFINIPATH_RT_VALID;
-
-       /*
-        * work around chip errata bug 7358, by marking invalid tids
-        * as having max length
-        */
-       dd->ipath_tidinvalid = (-1LL & INFINIPATH_RT_BUFSIZE_MASK) <<
-               INFINIPATH_RT_BUFSIZE_SHIFT;
-}
-
-static int ipath_ht_early_init(struct ipath_devdata *dd)
-{
-       u32 __iomem *piobuf;
-       u32 pioincr, val32;
-       int i;
-
-       /*
-        * one cache line; long IB headers will spill over into received
-        * buffer
-        */
-       dd->ipath_rcvhdrentsize = 16;
-       dd->ipath_rcvhdrsize = IPATH_DFLT_RCVHDRSIZE;
-
-       /*
-        * For HT, we allocate a somewhat overly large eager buffer,
-        * such that we can guarantee that we can receive the largest
-        * packet that we can send out.  To truly support a 4KB MTU,
-        * we need to bump this to a large value.  To date, other than
-        * testing, we have never encountered an HCA that can really
-        * send 4KB MTU packets, so we do not handle that (we'll get
-        * errors interrupts if we ever see one).
-        */
-       dd->ipath_rcvegrbufsize = dd->ipath_piosize2k;
-
-       /*
-        * the min() check here is currently a nop, but it may not
-        * always be, depending on just how we do ipath_rcvegrbufsize
-        */
-       dd->ipath_ibmaxlen = min(dd->ipath_piosize2k,
-                                dd->ipath_rcvegrbufsize);
-       dd->ipath_init_ibmaxlen = dd->ipath_ibmaxlen;
-       ipath_ht_tidtemplate(dd);
-
-       /*
-        * zero all the TID entries at startup.  We do this for sanity,
-        * in case of a previous driver crash of some kind, and also
-        * because the chip powers up with these memories in an unknown
-        * state.  Use portcnt, not cfgports, since this is for the
-        * full chip, not for current (possibly different) configuration
-        * value.
-        * Chip Errata bug 6447
-        */
-       for (val32 = 0; val32 < dd->ipath_portcnt; val32++)
-               ipath_ht_clear_tids(dd, val32);
-
-       /*
-        * write the pbc of each buffer, to be sure it's initialized, then
-        * cancel all the buffers, and also abort any packets that might
-        * have been in flight for some reason (the latter is for driver
-        * unload/reload, but isn't a bad idea at first init).  PIO send
-        * isn't enabled at this point, so there is no danger of sending
-        * these out on the wire.
-        * Chip Errata bug 6610
-        */
-       piobuf = (u32 __iomem *) (((char __iomem *)(dd->ipath_kregbase)) +
-                                 dd->ipath_piobufbase);
-       pioincr = dd->ipath_palign / sizeof(*piobuf);
-       for (i = 0; i < dd->ipath_piobcnt2k; i++) {
-               /*
-                * reasonable word count, just to init pbc
-                */
-               writel(16, piobuf);
-               piobuf += pioincr;
-       }
-
-       ipath_get_eeprom_info(dd);
-       if (dd->ipath_boardrev == 5) {
-               /*
-                * Later production QHT7040 has same changes as QHT7140, so
-                * can use GPIO interrupts.  They have serial #'s starting
-                * with 128, rather than 112.
-                */
-               if (dd->ipath_serial[0] == '1' &&
-                   dd->ipath_serial[1] == '2' &&
-                   dd->ipath_serial[2] == '8')
-                       dd->ipath_flags |= IPATH_GPIO_INTR;
-               else {
-                       ipath_dev_err(dd, "Unsupported InfiniPath board "
-                               "(serial number %.16s)!\n",
-                               dd->ipath_serial);
-                       return 1;
-               }
-       }
-
-       if (dd->ipath_minrev >= 4) {
-               /* Rev4+ reports extra errors via internal GPIO pins */
-               dd->ipath_flags |= IPATH_GPIO_ERRINTRS;
-               dd->ipath_gpio_mask |= IPATH_GPIO_ERRINTR_MASK;
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask,
-                                dd->ipath_gpio_mask);
-       }
-
-       return 0;
-}
-
-
-/**
- * ipath_ht_get_base_info - set chip-specific flags for user code
- * @pd: the port data
- * @kbase: ipath_base_info pointer
- *
- * We set the HT flag because some user packet algorithms behave
- * differently over HyperTransport than over PCIe.
- */
-static int ipath_ht_get_base_info(struct ipath_portdata *pd, void *kbase)
-{
-       struct ipath_base_info *kinfo = kbase;
-
-       kinfo->spi_runtime_flags |= IPATH_RUNTIME_HT |
-               IPATH_RUNTIME_PIO_REGSWAPPED;
-
-       if (pd->port_dd->ipath_minrev < 4)
-               kinfo->spi_runtime_flags |= IPATH_RUNTIME_RCVHDR_COPY;
-
-       return 0;
-}
-
-static void ipath_ht_free_irq(struct ipath_devdata *dd)
-{
-       free_irq(dd->ipath_irq, dd);
-       ht_destroy_irq(dd->ipath_irq);
-       dd->ipath_irq = 0;
-       dd->ipath_intconfig = 0;
-}
-
-static struct ipath_message_header *
-ipath_ht_get_msgheader(struct ipath_devdata *dd, __le32 *rhf_addr)
-{
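-       /* the message header starts sizeof(u64) bytes (one qword) past rhf_addr */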
-       return (struct ipath_message_header *)
-               &rhf_addr[sizeof(u64) / sizeof(u32)];
-}
-
-static void ipath_ht_config_ports(struct ipath_devdata *dd, ushort cfgports)
-{
-       dd->ipath_portcnt =
-               ipath_read_kreg32(dd, dd->ipath_kregs->kr_portcnt);
-       dd->ipath_p0_rcvegrcnt =
-               ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrcnt);
-}
-
-static void ipath_ht_read_counters(struct ipath_devdata *dd,
-                                  struct infinipath_counters *cntrs)
-{
-       cntrs->LBIntCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(LBIntCnt));
-       cntrs->LBFlowStallCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(LBFlowStallCnt));
-       cntrs->TxSDmaDescCnt = 0;
-       cntrs->TxUnsupVLErrCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxUnsupVLErrCnt));
-       cntrs->TxDataPktCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxDataPktCnt));
-       cntrs->TxFlowPktCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxFlowPktCnt));
-       cntrs->TxDwordCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxDwordCnt));
-       cntrs->TxLenErrCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxLenErrCnt));
-       cntrs->TxMaxMinLenErrCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxMaxMinLenErrCnt));
-       cntrs->TxUnderrunCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxUnderrunCnt));
-       cntrs->TxFlowStallCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxFlowStallCnt));
-       cntrs->TxDroppedPktCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxDroppedPktCnt));
-       cntrs->RxDroppedPktCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxDroppedPktCnt));
-       cntrs->RxDataPktCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxDataPktCnt));
-       cntrs->RxFlowPktCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxFlowPktCnt));
-       cntrs->RxDwordCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxDwordCnt));
-       cntrs->RxLenErrCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxLenErrCnt));
-       cntrs->RxMaxMinLenErrCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxMaxMinLenErrCnt));
-       cntrs->RxICRCErrCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxICRCErrCnt));
-       cntrs->RxVCRCErrCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxVCRCErrCnt));
-       cntrs->RxFlowCtrlErrCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxFlowCtrlErrCnt));
-       cntrs->RxBadFormatCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxBadFormatCnt));
-       cntrs->RxLinkProblemCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxLinkProblemCnt));
-       cntrs->RxEBPCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxEBPCnt));
-       cntrs->RxLPCRCErrCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxLPCRCErrCnt));
-       cntrs->RxBufOvflCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxBufOvflCnt));
-       cntrs->RxTIDFullErrCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxTIDFullErrCnt));
-       cntrs->RxTIDValidErrCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxTIDValidErrCnt));
-       cntrs->RxPKeyMismatchCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxPKeyMismatchCnt));
-       cntrs->RxP0HdrEgrOvflCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP0HdrEgrOvflCnt));
-       cntrs->RxP1HdrEgrOvflCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP1HdrEgrOvflCnt));
-       cntrs->RxP2HdrEgrOvflCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP2HdrEgrOvflCnt));
-       cntrs->RxP3HdrEgrOvflCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP3HdrEgrOvflCnt));
-       cntrs->RxP4HdrEgrOvflCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP4HdrEgrOvflCnt));
-       cntrs->RxP5HdrEgrOvflCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP5HdrEgrOvflCnt));
-       cntrs->RxP6HdrEgrOvflCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP6HdrEgrOvflCnt));
-       cntrs->RxP7HdrEgrOvflCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP7HdrEgrOvflCnt));
-       cntrs->RxP8HdrEgrOvflCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP8HdrEgrOvflCnt));
-       cntrs->RxP9HdrEgrOvflCnt = 0;
-       cntrs->RxP10HdrEgrOvflCnt = 0;
-       cntrs->RxP11HdrEgrOvflCnt = 0;
-       cntrs->RxP12HdrEgrOvflCnt = 0;
-       cntrs->RxP13HdrEgrOvflCnt = 0;
-       cntrs->RxP14HdrEgrOvflCnt = 0;
-       cntrs->RxP15HdrEgrOvflCnt = 0;
-       cntrs->RxP16HdrEgrOvflCnt = 0;
-       cntrs->IBStatusChangeCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBStatusChangeCnt));
-       cntrs->IBLinkErrRecoveryCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBLinkErrRecoveryCnt));
-       cntrs->IBLinkDownedCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBLinkDownedCnt));
-       cntrs->IBSymbolErrCnt =
-               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBSymbolErrCnt));
-       cntrs->RxVL15DroppedPktCnt = 0;
-       cntrs->RxOtherLocalPhyErrCnt = 0;
-       cntrs->PcieRetryBufDiagQwordCnt = 0;
-       cntrs->ExcessBufferOvflCnt = dd->ipath_overrun_thresh_errs;
-       cntrs->LocalLinkIntegrityErrCnt =
-               (dd->ipath_flags & IPATH_GPIO_ERRINTRS) ?
-               dd->ipath_lli_errs : dd->ipath_lli_errors;
-       cntrs->RxVlErrCnt = 0;
-       cntrs->RxDlidFltrCnt = 0;
-}
-
-
-/* no interrupt fallback for these chips */
-static int ipath_ht_nointr_fallback(struct ipath_devdata *dd)
-{
-       return 0;
-}
-
-
-/*
- * reset the XGXS (between serdes and IBC).  Slightly less intrusive
- * than resetting the IBC or external link state, and useful in some
- * cases to cause some retraining.  To do this right, we reset IBC
- * as well.
- */
-static void ipath_ht_xgxs_reset(struct ipath_devdata *dd)
-{
-       u64 val, prev_val;
-
-       prev_val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig);
-       val = prev_val | INFINIPATH_XGXS_RESET;
-       prev_val &= ~INFINIPATH_XGXS_RESET; /* be sure */
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
-                        dd->ipath_control & ~INFINIPATH_C_LINKENABLE);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, val);
-       ipath_read_kreg32(dd, dd->ipath_kregs->kr_scratch);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, prev_val);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
-                        dd->ipath_control);
-}
-
-
-static int ipath_ht_get_ib_cfg(struct ipath_devdata *dd, int which)
-{
-       int ret;
-
-       switch (which) {
-       case IPATH_IB_CFG_LWID:
-               ret = dd->ipath_link_width_active;
-               break;
-       case IPATH_IB_CFG_SPD:
-               ret = dd->ipath_link_speed_active;
-               break;
-       case IPATH_IB_CFG_LWID_ENB:
-               ret = dd->ipath_link_width_enabled;
-               break;
-       case IPATH_IB_CFG_SPD_ENB:
-               ret = dd->ipath_link_speed_enabled;
-               break;
-       default:
-               ret =  -ENOTSUPP;
-               break;
-       }
-       return ret;
-}
-
-
-/* we assume range checking is already done, if needed */
-static int ipath_ht_set_ib_cfg(struct ipath_devdata *dd, int which, u32 val)
-{
-       int ret = 0;
-
-       if (which == IPATH_IB_CFG_LWID_ENB)
-               dd->ipath_link_width_enabled = val;
-       else if (which == IPATH_IB_CFG_SPD_ENB)
-               dd->ipath_link_speed_enabled = val;
-       else
-               ret = -ENOTSUPP;
-       return ret;
-}
-
-
-static void ipath_ht_config_jint(struct ipath_devdata *dd, u16 a, u16 b)
-{
-}
-
-
-static int ipath_ht_ib_updown(struct ipath_devdata *dd, int ibup, u64 ibcs)
-{
-       ipath_setup_ht_setextled(dd, ipath_ib_linkstate(dd, ibcs),
-               ipath_ib_linktrstate(dd, ibcs));
-       return 0;
-}
-
-
-/**
- * ipath_init_iba6110_funcs - set up the chip-specific function pointers
- * @dd: the infinipath device
- *
- * This is global, and is called directly at init to set up the
- * chip-specific function pointers for later use.
- */
-void ipath_init_iba6110_funcs(struct ipath_devdata *dd)
-{
-       dd->ipath_f_intrsetup = ipath_ht_intconfig;
-       dd->ipath_f_bus = ipath_setup_ht_config;
-       dd->ipath_f_reset = ipath_setup_ht_reset;
-       dd->ipath_f_get_boardname = ipath_ht_boardname;
-       dd->ipath_f_init_hwerrors = ipath_ht_init_hwerrors;
-       dd->ipath_f_early_init = ipath_ht_early_init;
-       dd->ipath_f_handle_hwerrors = ipath_ht_handle_hwerrors;
-       dd->ipath_f_quiet_serdes = ipath_ht_quiet_serdes;
-       dd->ipath_f_bringup_serdes = ipath_ht_bringup_serdes;
-       dd->ipath_f_clear_tids = ipath_ht_clear_tids;
-       dd->ipath_f_put_tid = ipath_ht_put_tid;
-       dd->ipath_f_cleanup = ipath_setup_ht_cleanup;
-       dd->ipath_f_setextled = ipath_setup_ht_setextled;
-       dd->ipath_f_get_base_info = ipath_ht_get_base_info;
-       dd->ipath_f_free_irq = ipath_ht_free_irq;
-       dd->ipath_f_tidtemplate = ipath_ht_tidtemplate;
-       dd->ipath_f_intr_fallback = ipath_ht_nointr_fallback;
-       dd->ipath_f_get_msgheader = ipath_ht_get_msgheader;
-       dd->ipath_f_config_ports = ipath_ht_config_ports;
-       dd->ipath_f_read_counters = ipath_ht_read_counters;
-       dd->ipath_f_xgxs_reset = ipath_ht_xgxs_reset;
-       dd->ipath_f_get_ib_cfg = ipath_ht_get_ib_cfg;
-       dd->ipath_f_set_ib_cfg = ipath_ht_set_ib_cfg;
-       dd->ipath_f_config_jint = ipath_ht_config_jint;
-       dd->ipath_f_ib_updown = ipath_ht_ib_updown;
-
-       /*
-        * initialize chip-specific variables
-        */
-       ipath_init_ht_variables(dd);
-}
diff --git a/drivers/infiniband/hw/ipath/ipath_init_chip.c b/drivers/infiniband/hw/ipath/ipath_init_chip.c
deleted file mode 100644 (file)
index be2a60e..0000000
+++ /dev/null
@@ -1,1066 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/pci.h>
-#include <linux/netdevice.h>
-#include <linux/moduleparam.h>
-#include <linux/slab.h>
-#include <linux/stat.h>
-#include <linux/vmalloc.h>
-
-#include "ipath_kernel.h"
-#include "ipath_common.h"
-
-/*
- * min buffers we want to have per port, after the driver has reserved its own
- */
-#define IPATH_MIN_USER_PORT_BUFCNT 7
-
-/*
- * Number of ports we are configured to use (to allow for more pio
- * buffers per port, etc.)  Zero means use chip value.
- */
-static ushort ipath_cfgports;
-
-module_param_named(cfgports, ipath_cfgports, ushort, S_IRUGO);
-MODULE_PARM_DESC(cfgports, "Set max number of ports to use");
-
-/*
- * Number of buffers reserved for driver (verbs and layered drivers.)
- * Initialized based on number of PIO buffers if not set via module interface.
- * The problem with this is that it's global, but we'll use different
- * numbers for different chip types.
- */
-static ushort ipath_kpiobufs;
-
-static int ipath_set_kpiobufs(const char *val, struct kernel_param *kp);
-
-module_param_call(kpiobufs, ipath_set_kpiobufs, param_get_ushort,
-                 &ipath_kpiobufs, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(kpiobufs, "Set number of PIO buffers for driver");
-
-/**
- * create_port0_egr - allocate the eager TID buffers
- * @dd: the infinipath device
- *
- * This code is now quite different for user and kernel, because
- * the kernel uses skb's, for the accelerated network performance.
- * This is the kernel (port0) version.
- *
- * Allocate the eager TID buffers and program them into infinipath.
- * We use the network layer alloc_skb() allocator to allocate the
- * memory, and either use the buffers as is for things like verbs
- * packets, or pass the buffers up to the ipath layered driver and
- * thence the network layer, replacing them as we do so (see
- * ipath_rcv_layer()).
- */
-static int create_port0_egr(struct ipath_devdata *dd)
-{
-       unsigned e, egrcnt;
-       struct ipath_skbinfo *skbinfo;
-       int ret;
-
-       egrcnt = dd->ipath_p0_rcvegrcnt;
-
-       skbinfo = vmalloc(sizeof(*dd->ipath_port0_skbinfo) * egrcnt);
-       if (skbinfo == NULL) {
-               ipath_dev_err(dd, "allocation error for eager TID "
-                             "skb array\n");
-               ret = -ENOMEM;
-               goto bail;
-       }
-       for (e = 0; e < egrcnt; e++) {
-               /*
-                * This is a bit tricky in that we allocate extra
-                * space for 2 bytes of the 14 byte ethernet header.
-                * These two bytes are passed in the ipath header so
-                * the rest of the data is word aligned.  We allocate
-                * 4 bytes so that the data buffer stays word aligned.
-                * See ipath_kreceive() for more details.
-                */
-               skbinfo[e].skb = ipath_alloc_skb(dd, GFP_KERNEL);
-               if (!skbinfo[e].skb) {
-                       ipath_dev_err(dd, "SKB allocation error for "
-                                     "eager TID %u\n", e);
-                       while (e != 0)
-                               dev_kfree_skb(skbinfo[--e].skb);
-                       vfree(skbinfo);
-                       ret = -ENOMEM;
-                       goto bail;
-               }
-       }
-       /*
-        * After loop above, so we can test non-NULL to see if ready
-        * to use at receive, etc.
-        */
-       dd->ipath_port0_skbinfo = skbinfo;
-
-       for (e = 0; e < egrcnt; e++) {
-               dd->ipath_port0_skbinfo[e].phys =
-                 ipath_map_single(dd->pcidev,
-                                  dd->ipath_port0_skbinfo[e].skb->data,
-                                  dd->ipath_ibmaxlen, PCI_DMA_FROMDEVICE);
-               dd->ipath_f_put_tid(dd, e + (u64 __iomem *)
-                                   ((char __iomem *) dd->ipath_kregbase +
-                                    dd->ipath_rcvegrbase),
-                                   RCVHQ_RCV_TYPE_EAGER,
-                                   dd->ipath_port0_skbinfo[e].phys);
-       }
-
-       ret = 0;
-
-bail:
-       return ret;
-}
-
-static int bringup_link(struct ipath_devdata *dd)
-{
-       u64 val, ibc;
-       int ret = 0;
-
-       /* hold IBC in reset */
-       dd->ipath_control &= ~INFINIPATH_C_LINKENABLE;
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
-                        dd->ipath_control);
-
-       /*
-        * set initial max size pkt IBC will send, including ICRC; it's the
-        * PIO buffer size in dwords, less 1; also see ipath_set_mtu()
-        */
-       val = (dd->ipath_ibmaxlen >> 2) + 1;
-       ibc = val << dd->ibcc_mpl_shift;
-
-       /* flowcontrolwatermark is in units of KBytes */
-       ibc |= 0x5ULL << INFINIPATH_IBCC_FLOWCTRLWATERMARK_SHIFT;
-       /*
-        * How often flowctrl sent.  More or less in usecs; balance against
-        * watermark value, so that in theory senders always get a flow
-        * control update in time to not let the IB link go idle.
-        */
-       ibc |= 0x3ULL << INFINIPATH_IBCC_FLOWCTRLPERIOD_SHIFT;
-       /* max error tolerance */
-       ibc |= 0xfULL << INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT;
-       /* use "real" buffer space for */
-       ibc |= 4ULL << INFINIPATH_IBCC_CREDITSCALE_SHIFT;
-       /* IB credit flow control. */
-       ibc |= 0xfULL << INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT;
-       /* initially come up waiting for TS1, without sending anything. */
-       dd->ipath_ibcctrl = ibc;
-       /*
-        * Want to start out with both LINKCMD and LINKINITCMD in NOP
-        * (0 and 0).  Don't put linkinitcmd in ipath_ibcctrl, want that
-        * to stay a NOP. Flag that we are disabled, for the (unlikely)
-        * case that some recovery path is trying to bring the link up
-        * before we are ready.
-        */
-       ibc |= INFINIPATH_IBCC_LINKINITCMD_DISABLE <<
-               INFINIPATH_IBCC_LINKINITCMD_SHIFT;
-       dd->ipath_flags |= IPATH_IB_LINK_DISABLED;
-       ipath_cdbg(VERBOSE, "Writing 0x%llx to ibcctrl\n",
-                  (unsigned long long) ibc);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, ibc);
-
-       /* be sure chip saw it */
-       val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-
-       ret = dd->ipath_f_bringup_serdes(dd);
-
-       if (ret)
-               dev_info(&dd->pcidev->dev, "Could not initialize SerDes, "
-                        "not usable\n");
-       else {
-               /* enable IBC */
-               dd->ipath_control |= INFINIPATH_C_LINKENABLE;
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
-                                dd->ipath_control);
-       }
-
-       return ret;
-}
-
-static struct ipath_portdata *create_portdata0(struct ipath_devdata *dd)
-{
-       struct ipath_portdata *pd = NULL;
-
-       pd = kzalloc(sizeof(*pd), GFP_KERNEL);
-       if (pd) {
-               pd->port_dd = dd;
-               pd->port_cnt = 1;
-               /* The port 0 pkey table is used by the layer interface. */
-               pd->port_pkeys[0] = IPATH_DEFAULT_P_KEY;
-               pd->port_seq_cnt = 1;
-       }
-       return pd;
-}
-
-static int init_chip_first(struct ipath_devdata *dd)
-{
-       struct ipath_portdata *pd;
-       int ret = 0;
-       u64 val;
-
-       spin_lock_init(&dd->ipath_kernel_tid_lock);
-       spin_lock_init(&dd->ipath_user_tid_lock);
-       spin_lock_init(&dd->ipath_sendctrl_lock);
-       spin_lock_init(&dd->ipath_uctxt_lock);
-       spin_lock_init(&dd->ipath_sdma_lock);
-       spin_lock_init(&dd->ipath_gpio_lock);
-       spin_lock_init(&dd->ipath_eep_st_lock);
-       spin_lock_init(&dd->ipath_sdepb_lock);
-       mutex_init(&dd->ipath_eep_lock);
-
-       /*
-        * skip cfgports stuff because we are not allocating memory,
-        * and we don't want problems if the portcnt changed due to
-        * cfgports.  We do still check and report a difference, if
-        * not same (should be impossible).
-        */
-       dd->ipath_f_config_ports(dd, ipath_cfgports);
-       if (!ipath_cfgports)
-               dd->ipath_cfgports = dd->ipath_portcnt;
-       else if (ipath_cfgports <= dd->ipath_portcnt) {
-               dd->ipath_cfgports = ipath_cfgports;
-               ipath_dbg("Configured to use %u ports out of %u in chip\n",
-                         dd->ipath_cfgports, ipath_read_kreg32(dd,
-                         dd->ipath_kregs->kr_portcnt));
-       } else {
-               dd->ipath_cfgports = dd->ipath_portcnt;
-               ipath_dbg("Tried to configure %u ports; chip "
-                         "only supports %u\n", ipath_cfgports,
-                         ipath_read_kreg32(dd,
-                                 dd->ipath_kregs->kr_portcnt));
-       }
-       /*
-        * Allocate full portcnt array, rather than just cfgports, because
-        * cleanup iterates across all possible ports.
-        */
-       dd->ipath_pd = kzalloc(sizeof(*dd->ipath_pd) * dd->ipath_portcnt,
-                              GFP_KERNEL);
-
-       if (!dd->ipath_pd) {
-               ipath_dev_err(dd, "Unable to allocate portdata array, "
-                             "failing\n");
-               ret = -ENOMEM;
-               goto done;
-       }
-
-       pd = create_portdata0(dd);
-       if (!pd) {
-               ipath_dev_err(dd, "Unable to allocate portdata for port "
-                             "0, failing\n");
-               ret = -ENOMEM;
-               goto done;
-       }
-       dd->ipath_pd[0] = pd;
-
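-       /*
-        * Cache the chip's receive TID/eager counts and PIO buffer
-        * geometry from its configuration registers; later init code
-        * uses these cached values rather than re-reading the chip.
-        */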
-       dd->ipath_rcvtidcnt =
-               ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidcnt);
-       dd->ipath_rcvtidbase =
-               ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidbase);
-       dd->ipath_rcvegrcnt =
-               ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrcnt);
-       dd->ipath_rcvegrbase =
-               ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrbase);
-       dd->ipath_palign =
-               ipath_read_kreg32(dd, dd->ipath_kregs->kr_pagealign);
-       dd->ipath_piobufbase =
-               ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpiobufbase);
-       val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpiosize);
-       dd->ipath_piosize2k = val & ~0U;
-       dd->ipath_piosize4k = val >> 32;
-       if (dd->ipath_piosize4k == 0 && ipath_mtu4096)
-               ipath_mtu4096 = 0; /* 4KB not supported by this chip */
-       dd->ipath_ibmtu = ipath_mtu4096 ? 4096 : 2048;
-       val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpiobufcnt);
-       dd->ipath_piobcnt2k = val & ~0U;
-       dd->ipath_piobcnt4k = val >> 32;
-       dd->ipath_pio2kbase =
-               (u32 __iomem *) (((char __iomem *) dd->ipath_kregbase) +
-                                (dd->ipath_piobufbase & 0xffffffff));
-       if (dd->ipath_piobcnt4k) {
-               dd->ipath_pio4kbase = (u32 __iomem *)
-                       (((char __iomem *) dd->ipath_kregbase) +
-                        (dd->ipath_piobufbase >> 32));
-               /*
-                * 4K buffers take 2 pages; we use roundup just to be
-                * paranoid; we calculate it once here, rather than on
-                * every buffer allocation
-                */
-               dd->ipath_4kalign = ALIGN(dd->ipath_piosize4k,
-                                         dd->ipath_palign);
-               ipath_dbg("%u 2k(%x) piobufs @ %p, %u 4k(%x) @ %p "
-                         "(%x aligned)\n",
-                         dd->ipath_piobcnt2k, dd->ipath_piosize2k,
-                         dd->ipath_pio2kbase, dd->ipath_piobcnt4k,
-                         dd->ipath_piosize4k, dd->ipath_pio4kbase,
-                         dd->ipath_4kalign);
-       } else
-               ipath_dbg("%u 2k piobufs @ %p\n",
-                         dd->ipath_piobcnt2k, dd->ipath_pio2kbase);
-
-done:
-       return ret;
-}
-
-/**
- * init_chip_reset - re-initialize after a reset, or enable
- * @dd: the infinipath device
- *
- * sanity check at least some of the values after reset, and
- * ensure no receive or transmit activity (explicitly, in case
- * the reset failed)
- */
-static int init_chip_reset(struct ipath_devdata *dd)
-{
-       u32 rtmp;
-       int i;
-       unsigned long flags;
-
-       /*
-        * ensure chip does no sends or receives, tail updates, or
-        * pioavail updates while we re-initialize
-        */
-       dd->ipath_rcvctrl &= ~(1ULL << dd->ipath_r_tailupd_shift);
-       for (i = 0; i < dd->ipath_portcnt; i++) {
-               clear_bit(dd->ipath_r_portenable_shift + i,
-                         &dd->ipath_rcvctrl);
-               clear_bit(dd->ipath_r_intravail_shift + i,
-                         &dd->ipath_rcvctrl);
-       }
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
-               dd->ipath_rcvctrl);
-
-       spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-       dd->ipath_sendctrl = 0U; /* no sdma, etc */
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
-       ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-       spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_control, 0ULL);
-
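-       /*
-        * Re-read the sizing registers and warn if any changed across
-        * the reset; the original (pre-reset) values are kept either way.
-        */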
-       rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidcnt);
-       if (rtmp != dd->ipath_rcvtidcnt)
-               dev_info(&dd->pcidev->dev, "tidcnt was %u before "
-                        "reset, now %u, using original\n",
-                        dd->ipath_rcvtidcnt, rtmp);
-       rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidbase);
-       if (rtmp != dd->ipath_rcvtidbase)
-               dev_info(&dd->pcidev->dev, "tidbase was %u before "
-                        "reset, now %u, using original\n",
-                        dd->ipath_rcvtidbase, rtmp);
-       rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrcnt);
-       if (rtmp != dd->ipath_rcvegrcnt)
-               dev_info(&dd->pcidev->dev, "egrcnt was %u before "
-                        "reset, now %u, using original\n",
-                        dd->ipath_rcvegrcnt, rtmp);
-       rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrbase);
-       if (rtmp != dd->ipath_rcvegrbase)
-               dev_info(&dd->pcidev->dev, "egrbase was %u before "
-                        "reset, now %u, using original\n",
-                        dd->ipath_rcvegrbase, rtmp);
-
-       return 0;
-}
-
-static int init_pioavailregs(struct ipath_devdata *dd)
-{
-       int ret;
-
-       dd->ipath_pioavailregs_dma = dma_alloc_coherent(
-               &dd->pcidev->dev, PAGE_SIZE, &dd->ipath_pioavailregs_phys,
-               GFP_KERNEL);
-       if (!dd->ipath_pioavailregs_dma) {
-               ipath_dev_err(dd, "failed to allocate PIOavail reg area "
-                             "in memory\n");
-               ret = -ENOMEM;
-               goto done;
-       }
-
-       /*
-        * we really want L2 cache aligned, but for current CPUs of
-        * interest, they are the same.
-        */
-       dd->ipath_statusp = (u64 *)
-               ((char *)dd->ipath_pioavailregs_dma +
-                ((2 * L1_CACHE_BYTES +
-                  dd->ipath_pioavregs * sizeof(u64)) & ~L1_CACHE_BYTES));
-       /* copy the current value now that it's really allocated */
-       *dd->ipath_statusp = dd->_ipath_status;
-       /*
-        * setup buffer to hold freeze msg, accessible to apps,
-        * following statusp
-        */
-       dd->ipath_freezemsg = (char *)&dd->ipath_statusp[1];
-       /* and its length */
-       dd->ipath_freezelen = L1_CACHE_BYTES - sizeof(dd->ipath_statusp[0]);
-
-       ret = 0;
-
-done:
-       return ret;
-}
-
-/**
- * init_shadow_tids - allocate the shadow TID array
- * @dd: the infinipath device
- *
- * allocate the shadow TID array, so we can ipath_munlock previous
- * entries.  It may make more sense to move the pageshadow to the
- * port data structure, so we only allocate memory for ports actually
- * in use, since we are at 8k per port now.
- */
-static void init_shadow_tids(struct ipath_devdata *dd)
-{
-       struct page **pages;
-       dma_addr_t *addrs;
-
-       pages = vzalloc(dd->ipath_cfgports * dd->ipath_rcvtidcnt *
-                       sizeof(struct page *));
-       if (!pages) {
-               ipath_dev_err(dd, "failed to allocate shadow page * "
-                             "array, no expected sends!\n");
-               dd->ipath_pageshadow = NULL;
-               return;
-       }
-
-       addrs = vmalloc(dd->ipath_cfgports * dd->ipath_rcvtidcnt *
-                       sizeof(dma_addr_t));
-       if (!addrs) {
-               ipath_dev_err(dd, "failed to allocate shadow dma handle "
-                             "array, no expected sends!\n");
-               vfree(pages);
-               dd->ipath_pageshadow = NULL;
-               return;
-       }
-
-       dd->ipath_pageshadow = pages;
-       dd->ipath_physshadow = addrs;
-}
-
-static void enable_chip(struct ipath_devdata *dd, int reinit)
-{
-       u32 val;
-       u64 rcvmask;
-       unsigned long flags;
-       int i;
-
-       if (!reinit)
-               init_waitqueue_head(&ipath_state_wait);
-
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
-                        dd->ipath_rcvctrl);
-
-       spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-       /* Enable PIO send, and update of PIOavail regs to memory. */
-       dd->ipath_sendctrl = INFINIPATH_S_PIOENABLE |
-               INFINIPATH_S_PIOBUFAVAILUPD;
-
-       /*
-        * Set the PIO avail update threshold to host memory
-        * on chips that support it.
-        */
-       if (dd->ipath_pioupd_thresh)
-               dd->ipath_sendctrl |= dd->ipath_pioupd_thresh
-                       << INFINIPATH_S_UPDTHRESH_SHIFT;
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
-       ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-       spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-
-       /*
-        * Enable kernel ports' receive and receive interrupt.
-        * Other ports done as user opens and inits them.
-        */
-       rcvmask = 1ULL;
-       dd->ipath_rcvctrl |= (rcvmask << dd->ipath_r_portenable_shift) |
-               (rcvmask << dd->ipath_r_intravail_shift);
-       if (!(dd->ipath_flags & IPATH_NODMA_RTAIL))
-               dd->ipath_rcvctrl |= (1ULL << dd->ipath_r_tailupd_shift);
-
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
-                        dd->ipath_rcvctrl);
-
-       /*
-        * now ready for use.  this should be cleared whenever we
-        * detect a reset, or initiate one.
-        */
-       dd->ipath_flags |= IPATH_INITTED;
-
-       /*
-        * Init our shadow copies of head from tail values,
-        * and write head values to match.
-        */
-       val = ipath_read_ureg32(dd, ur_rcvegrindextail, 0);
-       ipath_write_ureg(dd, ur_rcvegrindexhead, val, 0);
-
-       /* Initialize so we interrupt on next packet received */
-       ipath_write_ureg(dd, ur_rcvhdrhead,
-                        dd->ipath_rhdrhead_intr_off |
-                        dd->ipath_pd[0]->port_head, 0);
-
-       /*
-        * by now pioavail updates to memory should have occurred, so
-        * copy them into our working/shadow registers; this is in
-        * case something went wrong with abort, but mostly to get the
-        * initial values of the generation bit correct.
-        */
-       for (i = 0; i < dd->ipath_pioavregs; i++) {
-               __le64 pioavail;
-
-               /*
-                * Chip Errata bug 6641; even and odd qwords>3 are swapped.
-                */
-               if (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS))
-                       pioavail = dd->ipath_pioavailregs_dma[i ^ 1];
-               else
-                       pioavail = dd->ipath_pioavailregs_dma[i];
-               /*
-                * don't need to worry about ipath_pioavailkernel here
-                * because we will call ipath_chg_pioavailkernel() later
-                * in initialization, to busy out buffers as needed
-                */
-               dd->ipath_pioavailshadow[i] = le64_to_cpu(pioavail);
-       }
-       /* can get counters, stats, etc. */
-       dd->ipath_flags |= IPATH_PRESENT;
-}
-
-static int init_housekeeping(struct ipath_devdata *dd, int reinit)
-{
-       char boardn[40];
-       int ret = 0;
-
-       /*
-        * have to clear shadow copies of registers at init that are
-        * not otherwise set here, or all kinds of bizarre things
-        * happen with driver on chip reset
-        */
-       dd->ipath_rcvhdrsize = 0;
-
-       /*
-        * Don't clear ipath_flags as 8bit mode was set before
-        * entering this func. However, we do set the linkstate to
-        * unknown, so we can watch for a transition.
-        * PRESENT is set because we want register reads to work,
-        * and the kernel infrastructure saw it in config space;
-        * We clear it if we have failures.
-        */
-       dd->ipath_flags |= IPATH_LINKUNK | IPATH_PRESENT;
-       dd->ipath_flags &= ~(IPATH_LINKACTIVE | IPATH_LINKARMED |
-                            IPATH_LINKDOWN | IPATH_LINKINIT);
-
-       ipath_cdbg(VERBOSE, "Try to read spc chip revision\n");
-       dd->ipath_revision =
-               ipath_read_kreg64(dd, dd->ipath_kregs->kr_revision);
-
-       /*
-        * set up fundamental info we need to use the chip; we assume
-        * if the revision reg and these regs are OK, we don't need to
-        * special case the rest
-        */
-       dd->ipath_sregbase =
-               ipath_read_kreg32(dd, dd->ipath_kregs->kr_sendregbase);
-       dd->ipath_cregbase =
-               ipath_read_kreg32(dd, dd->ipath_kregs->kr_counterregbase);
-       dd->ipath_uregbase =
-               ipath_read_kreg32(dd, dd->ipath_kregs->kr_userregbase);
-       ipath_cdbg(VERBOSE, "ipath_kregbase %p, sendbase %x usrbase %x, "
-                  "cntrbase %x\n", dd->ipath_kregbase, dd->ipath_sregbase,
-                  dd->ipath_uregbase, dd->ipath_cregbase);
-       if ((dd->ipath_revision & 0xffffffff) == 0xffffffff
-           || (dd->ipath_sregbase & 0xffffffff) == 0xffffffff
-           || (dd->ipath_cregbase & 0xffffffff) == 0xffffffff
-           || (dd->ipath_uregbase & 0xffffffff) == 0xffffffff) {
-               ipath_dev_err(dd, "Register read failures from chip, "
-                             "giving up initialization\n");
-               dd->ipath_flags &= ~IPATH_PRESENT;
-               ret = -ENODEV;
-               goto done;
-       }
-
-
-       /* clear diagctrl register, in case diags were running and crashed */
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_hwdiagctrl, 0);
-
-       /* clear the initial reset flag, in case first driver load */
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear,
-                        INFINIPATH_E_RESET);
-
-       ipath_cdbg(VERBOSE, "Revision %llx (PCI %x)\n",
-                  (unsigned long long) dd->ipath_revision,
-                  dd->ipath_pcirev);
-
-       if (((dd->ipath_revision >> INFINIPATH_R_SOFTWARE_SHIFT) &
-            INFINIPATH_R_SOFTWARE_MASK) != IPATH_CHIP_SWVERSION) {
-               ipath_dev_err(dd, "Driver only handles version %d, "
-                             "chip swversion is %d (%llx), failing\n",
-                             IPATH_CHIP_SWVERSION,
-                             (int)(dd->ipath_revision >>
-                                   INFINIPATH_R_SOFTWARE_SHIFT) &
-                             INFINIPATH_R_SOFTWARE_MASK,
-                             (unsigned long long) dd->ipath_revision);
-               ret = -ENOSYS;
-               goto done;
-       }
-       dd->ipath_majrev = (u8) ((dd->ipath_revision >>
-                                 INFINIPATH_R_CHIPREVMAJOR_SHIFT) &
-                                INFINIPATH_R_CHIPREVMAJOR_MASK);
-       dd->ipath_minrev = (u8) ((dd->ipath_revision >>
-                                 INFINIPATH_R_CHIPREVMINOR_SHIFT) &
-                                INFINIPATH_R_CHIPREVMINOR_MASK);
-       dd->ipath_boardrev = (u8) ((dd->ipath_revision >>
-                                   INFINIPATH_R_BOARDID_SHIFT) &
-                                  INFINIPATH_R_BOARDID_MASK);
-
-       ret = dd->ipath_f_get_boardname(dd, boardn, sizeof boardn);
-
-       snprintf(dd->ipath_boardversion, sizeof(dd->ipath_boardversion),
-                "ChipABI %u.%u, %s, InfiniPath%u %u.%u, PCI %u, "
-                "SW Compat %u\n",
-                IPATH_CHIP_VERS_MAJ, IPATH_CHIP_VERS_MIN, boardn,
-                (unsigned)(dd->ipath_revision >> INFINIPATH_R_ARCH_SHIFT) &
-                INFINIPATH_R_ARCH_MASK,
-                dd->ipath_majrev, dd->ipath_minrev, dd->ipath_pcirev,
-                (unsigned)(dd->ipath_revision >>
-                           INFINIPATH_R_SOFTWARE_SHIFT) &
-                INFINIPATH_R_SOFTWARE_MASK);
-
-       ipath_dbg("%s", dd->ipath_boardversion);
-
-       if (ret)
-               goto done;
-
-       if (reinit)
-               ret = init_chip_reset(dd);
-       else
-               ret = init_chip_first(dd);
-
-done:
-       return ret;
-}
-
-static void verify_interrupt(unsigned long opaque)
-{
-       struct ipath_devdata *dd = (struct ipath_devdata *) opaque;
-
-       if (!dd)
-               return; /* being torn down */
-
-       /*
-        * If we don't have any interrupts, let the user know and
-        * don't bother checking again.
-        */
-       if (dd->ipath_int_counter == 0) {
-               if (!dd->ipath_f_intr_fallback(dd))
-                       dev_err(&dd->pcidev->dev, "No interrupts detected, "
-                               "not usable.\n");
-               else /* re-arm the timer to see if fallback works */
-                       mod_timer(&dd->ipath_intrchk_timer, jiffies + HZ/2);
-       } else
-               ipath_cdbg(VERBOSE, "%u interrupts at timer check\n",
-                       dd->ipath_int_counter);
-}
-
-/**
- * ipath_init_chip - do the actual initialization sequence on the chip
- * @dd: the infinipath device
- * @reinit: reinitializing, so don't allocate new memory
- *
- * Do the actual initialization sequence on the chip.  This is done
- * both from the init routine called from the PCI infrastructure, and
- * when we reset the chip, or detect that it was reset internally,
- * or it's administratively re-enabled.
- *
- * Memory allocation here and in called routines is only done in
- * the first case (reinit == 0).  We have to be careful, because even
- * without memory allocation, we need to re-write all the chip registers
- * TIDs, etc. after the reset or enable has completed.
- */
-int ipath_init_chip(struct ipath_devdata *dd, int reinit)
-{
-       int ret = 0;
-       u32 kpiobufs, defkbufs;
-       u32 piobufs, uports;
-       u64 val;
-       struct ipath_portdata *pd;
-       gfp_t gfp_flags = GFP_USER | __GFP_COMP;
-
-       ret = init_housekeeping(dd, reinit);
-       if (ret)
-               goto done;
-
-       /*
-        * We could bump this to allow for full rcvegrcnt + rcvtidcnt,
-        * but then it no longer nicely fits a power of two, and since
-        * we now use routines that backend onto __get_free_pages, the
-        * rest would be wasted.
-        */
-       dd->ipath_rcvhdrcnt = max(dd->ipath_p0_rcvegrcnt, dd->ipath_rcvegrcnt);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrcnt,
-                        dd->ipath_rcvhdrcnt);
-
-       /*
-        * Set up the shadow copies of the piobufavail registers,
-        * which we compare against the chip registers for now, and
-        * the in memory DMA'ed copies of the registers.  This has to
-        * be done early, before we calculate lastport, etc.
-        */
-       piobufs = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
-       /*
-        * calc number of pioavail registers, and save it; we have 2
-        * bits per buffer.
-        */
-       dd->ipath_pioavregs = ALIGN(piobufs, sizeof(u64) * BITS_PER_BYTE / 2)
-               / (sizeof(u64) * BITS_PER_BYTE / 2);
-       uports = dd->ipath_cfgports ? dd->ipath_cfgports - 1 : 0;
-       if (piobufs > 144)
-               defkbufs = 32 + dd->ipath_pioreserved;
-       else
-               defkbufs = 16 + dd->ipath_pioreserved;
-
-       if (ipath_kpiobufs && (ipath_kpiobufs +
-               (uports * IPATH_MIN_USER_PORT_BUFCNT)) > piobufs) {
-               int i = (int) piobufs -
-                       (int) (uports * IPATH_MIN_USER_PORT_BUFCNT);
-               if (i < 1)
-                       i = 1;
-               dev_info(&dd->pcidev->dev, "Allocating %d PIO bufs of "
-                        "%d for kernel leaves too few for %d user ports "
-                        "(%d each); using %u\n", ipath_kpiobufs,
-                        piobufs, uports, IPATH_MIN_USER_PORT_BUFCNT, i);
-               /*
-                * shouldn't change ipath_kpiobufs, because it could be
-                * different for different devices...
-                */
-               kpiobufs = i;
-       } else if (ipath_kpiobufs)
-               kpiobufs = ipath_kpiobufs;
-       else
-               kpiobufs = defkbufs;
-       dd->ipath_lastport_piobuf = piobufs - kpiobufs;
-       dd->ipath_pbufsport =
-               uports ? dd->ipath_lastport_piobuf / uports : 0;
-       /* if not an even divisor, some user ports get extra buffers */
-       dd->ipath_ports_extrabuf = dd->ipath_lastport_piobuf -
-               (dd->ipath_pbufsport * uports);
-       if (dd->ipath_ports_extrabuf)
-               ipath_dbg("%u pbufs/port leaves some unused, add 1 buffer to "
-                       "ports <= %u\n", dd->ipath_pbufsport,
-                       dd->ipath_ports_extrabuf);
-       dd->ipath_lastpioindex = 0;
-       dd->ipath_lastpioindexl = dd->ipath_piobcnt2k;
-       /* ipath_pioavailshadow initialized earlier */
-       ipath_cdbg(VERBOSE, "%d PIO bufs for kernel out of %d total %u "
-                  "each for %u user ports\n", kpiobufs,
-                  piobufs, dd->ipath_pbufsport, uports);
-       ret = dd->ipath_f_early_init(dd);
-       if (ret) {
-               ipath_dev_err(dd, "Early initialization failure\n");
-               goto done;
-       }
-
-       /*
-        * Early_init sets rcvhdrentsize and rcvhdrsize, so this must be
-        * done after early_init.
-        */
-       dd->ipath_hdrqlast =
-               dd->ipath_rcvhdrentsize * (dd->ipath_rcvhdrcnt - 1);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrentsize,
-                        dd->ipath_rcvhdrentsize);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrsize,
-                        dd->ipath_rcvhdrsize);
-
-       if (!reinit) {
-               ret = init_pioavailregs(dd);
-               init_shadow_tids(dd);
-               if (ret)
-                       goto done;
-       }
-
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_sendpioavailaddr,
-                        dd->ipath_pioavailregs_phys);
-
-       /*
-        * this is to detect s/w errors, which the h/w works around by
-        * ignoring the low 6 bits of address, if it wasn't aligned.
-        */
-       val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpioavailaddr);
-       if (val != dd->ipath_pioavailregs_phys) {
-               ipath_dev_err(dd, "Catastrophic software error, "
-                             "SendPIOAvailAddr written as %lx, "
-                             "read back as %llx\n",
-                             (unsigned long) dd->ipath_pioavailregs_phys,
-                             (unsigned long long) val);
-               ret = -EINVAL;
-               goto done;
-       }
-
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvbthqp, IPATH_KD_QP);
-
-       /*
-        * make sure we are not in freeze and PIO send is enabled, so
-        * writes to the pbc happen
-        */
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask, 0ULL);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
-                        ~0ULL&~INFINIPATH_HWE_MEMBISTFAILED);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_control, 0ULL);
-
-       /*
-        * bring up the link before the error clears below, since we expect
-        * serdes pll errors during this, the first time after reset
-        */
-       if (bringup_link(dd)) {
-               dev_info(&dd->pcidev->dev, "Failed to bringup IB link\n");
-               ret = -ENETDOWN;
-               goto done;
-       }
-
-       /*
-        * clear any "expected" hwerrs from reset and/or initialization;
-        * clear any that aren't enabled (at least this once), and then
-        * set the enable mask
-        */
-       dd->ipath_f_init_hwerrors(dd);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
-                        ~0ULL&~INFINIPATH_HWE_MEMBISTFAILED);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
-                        dd->ipath_hwerrmask);
-
-       /* clear all */
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, -1LL);
-       /* enable errors that are masked, at least this first time. */
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
-                        ~dd->ipath_maskederrs);
-       dd->ipath_maskederrs = 0; /* don't re-enable ignored in timer */
-       dd->ipath_errormask =
-               ipath_read_kreg64(dd, dd->ipath_kregs->kr_errormask);
-       /* clear any interrupts up to this point (ints still not enabled) */
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, -1LL);
-
-       dd->ipath_f_tidtemplate(dd);
-
-       /*
-        * Set up the port 0 (kernel) rcvhdr q and egr TIDs.  If doing
-        * re-init, the simplest way to handle this is to free
-        * existing, and re-allocate.
-        * Need to re-create rest of port 0 portdata as well.
-        */
-       pd = dd->ipath_pd[0];
-       if (reinit) {
-               struct ipath_portdata *npd;
-
-               /*
-                * Alloc and init a new ipath_portdata for port 0,
-                * then free the old pd. Could lead to fragmentation, but also
-                * makes later support for hot-swap easier.
-                */
-               npd = create_portdata0(dd);
-               if (npd) {
-                       ipath_free_pddata(dd, pd);
-                       dd->ipath_pd[0] = npd;
-                       pd = npd;
-               } else {
-                       ipath_dev_err(dd, "Unable to allocate portdata"
-                                     " for port 0, failing\n");
-                       ret = -ENOMEM;
-                       goto done;
-               }
-       }
-       ret = ipath_create_rcvhdrq(dd, pd);
-       if (!ret)
-               ret = create_port0_egr(dd);
-       if (ret) {
-               ipath_dev_err(dd, "failed to allocate kernel port's "
-                             "rcvhdrq and/or egr bufs\n");
-               goto done;
-       } else
-               enable_chip(dd, reinit);
-
-       /* after enable_chip, so pioavailshadow setup */
-       ipath_chg_pioavailkernel(dd, 0, piobufs, 1);
-
-       /*
-        * Cancel any possible active sends from early driver load.
-        * Follows early_init because some chips have to initialize
-        * PIO buffers in early_init to avoid false parity errors.
-        * After enable and ipath_chg_pioavailkernel so we can safely
-        * enable pioavail updates and PIOENABLE; packets are now
-        * ready to go out.
-        */
-       ipath_cancel_sends(dd, 1);
-
-       if (!reinit) {
-               /*
-                * Used when we close a port, for DMA already in flight
-                * at close.
-                */
-               dd->ipath_dummy_hdrq = dma_alloc_coherent(
-                       &dd->pcidev->dev, dd->ipath_pd[0]->port_rcvhdrq_size,
-                       &dd->ipath_dummy_hdrq_phys,
-                       gfp_flags);
-               if (!dd->ipath_dummy_hdrq) {
-                       dev_info(&dd->pcidev->dev,
-                               "Couldn't allocate 0x%lx bytes for dummy hdrq\n",
-                               dd->ipath_pd[0]->port_rcvhdrq_size);
-                       /* fallback to just 0'ing */
-                       dd->ipath_dummy_hdrq_phys = 0UL;
-               }
-       }
-
-       /*
-        * cause retrigger of pending interrupts ignored during init,
-        * even if we had errors
-        */
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, 0ULL);
-
-       if (!dd->ipath_stats_timer_active) {
-               /*
-                * first init, or after an admin disable/enable:
-                * set up the stats retrieval timer, even if we had errors
-                * in the last portion of setup
-                */
-               init_timer(&dd->ipath_stats_timer);
-               dd->ipath_stats_timer.function = ipath_get_faststats;
-               dd->ipath_stats_timer.data = (unsigned long) dd;
-               /* every 5 seconds */
-               dd->ipath_stats_timer.expires = jiffies + 5 * HZ;
-               /* takes ~16 seconds to overflow at full IB 4x bandwidth */
-               add_timer(&dd->ipath_stats_timer);
-               dd->ipath_stats_timer_active = 1;
-       }
-
-       /* Set up SendDMA if chip supports it */
-       if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
-               ret = setup_sdma(dd);
-
-       /* Set up HoL state */
-       init_timer(&dd->ipath_hol_timer);
-       dd->ipath_hol_timer.function = ipath_hol_event;
-       dd->ipath_hol_timer.data = (unsigned long)dd;
-       dd->ipath_hol_state = IPATH_HOL_UP;
-
-done:
-       if (!ret) {
-               *dd->ipath_statusp |= IPATH_STATUS_CHIP_PRESENT;
-               if (!dd->ipath_f_intrsetup(dd)) {
-                       /* now we can enable all interrupts from the chip */
-                       ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask,
-                                        -1LL);
-                       /* force re-interrupt of any pending interrupts. */
-                       ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear,
-                                        0ULL);
-                       /* chip is usable; mark it as initialized */
-                       *dd->ipath_statusp |= IPATH_STATUS_INITTED;
-
-                       /*
-                        * setup to verify we get an interrupt, and fallback
-                        * to an alternate if necessary and possible
-                        */
-                       if (!reinit) {
-                               init_timer(&dd->ipath_intrchk_timer);
-                               dd->ipath_intrchk_timer.function =
-                                       verify_interrupt;
-                               dd->ipath_intrchk_timer.data =
-                                       (unsigned long) dd;
-                       }
-                       dd->ipath_intrchk_timer.expires = jiffies + HZ/2;
-                       add_timer(&dd->ipath_intrchk_timer);
-               } else
-                       ipath_dev_err(dd, "No interrupts enabled, couldn't "
-                                     "setup interrupt address\n");
-
-               if (dd->ipath_cfgports > ipath_stats.sps_nports)
-                       /*
-                        * sps_nports is a global, so we set it to
-                        * the highest number of ports of any of the
-                        * chips we find; we never decrement it, at
-                        * least for now.  Since this might have changed
-                        * over disable/enable or prior to reset, always
-                        * do the check and potentially adjust.
-                        */
-                       ipath_stats.sps_nports = dd->ipath_cfgports;
-       } else
-               ipath_dbg("Failed (%d) to initialize chip\n", ret);
-
-       /* if ret is non-zero, we probably should do some cleanup
-          here... */
-       return ret;
-}
-
-static int ipath_set_kpiobufs(const char *str, struct kernel_param *kp)
-{
-       struct ipath_devdata *dd;
-       unsigned long flags;
-       unsigned short val;
-       int ret;
-
-       ret = ipath_parse_ushort(str, &val);
-
-       spin_lock_irqsave(&ipath_devs_lock, flags);
-
-       if (ret < 0)
-               goto bail;
-
-       if (val == 0) {
-               ret = -EINVAL;
-               goto bail;
-       }
-
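-       /*
-        * Validate the requested count against the devices on the list,
-        * updating their lastport_piobuf, before committing the new
-        * ipath_kpiobufs value.
-        */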
-       list_for_each_entry(dd, &ipath_dev_list, ipath_list) {
-               if (dd->ipath_kregbase)
-                       continue;
-               if (val > (dd->ipath_piobcnt2k + dd->ipath_piobcnt4k -
-                          (dd->ipath_cfgports *
-                           IPATH_MIN_USER_PORT_BUFCNT)))
-               {
-                       ipath_dev_err(
-                               dd,
-                               "Allocating %d PIO bufs for kernel leaves "
-                               "too few for %d user ports (%d each)\n",
-                               val, dd->ipath_cfgports - 1,
-                               IPATH_MIN_USER_PORT_BUFCNT);
-                       ret = -EINVAL;
-                       goto bail;
-               }
-               dd->ipath_lastport_piobuf =
-                       dd->ipath_piobcnt2k + dd->ipath_piobcnt4k - val;
-       }
-
-       ipath_kpiobufs = val;
-       ret = 0;
-bail:
-       spin_unlock_irqrestore(&ipath_devs_lock, flags);
-
-       return ret;
-}
diff --git a/drivers/infiniband/hw/ipath/ipath_intr.c b/drivers/infiniband/hw/ipath/ipath_intr.c
deleted file mode 100644 (file)
index 01ba792..0000000
+++ /dev/null
@@ -1,1273 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/pci.h>
-#include <linux/delay.h>
-#include <linux/sched.h>
-
-#include "ipath_kernel.h"
-#include "ipath_verbs.h"
-#include "ipath_common.h"
-
-
-/*
- * Called when we might have an error that is specific to a particular
- * PIO buffer, and may need to cancel that buffer, so it can be re-used.
- */
-void ipath_disarm_senderrbufs(struct ipath_devdata *dd)
-{
-       u32 piobcnt;
-       unsigned long sbuf[4];
-       /*
-        * it's possible that sendbuffererror could have bits set; might
-        * have already done this as a result of hardware error handling
-        */
-       piobcnt = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
-       /* read these before writing errorclear */
-       sbuf[0] = ipath_read_kreg64(
-               dd, dd->ipath_kregs->kr_sendbuffererror);
-       sbuf[1] = ipath_read_kreg64(
-               dd, dd->ipath_kregs->kr_sendbuffererror + 1);
-       if (piobcnt > 128)
-               sbuf[2] = ipath_read_kreg64(
-                       dd, dd->ipath_kregs->kr_sendbuffererror + 2);
-       if (piobcnt > 192)
-               sbuf[3] = ipath_read_kreg64(
-                       dd, dd->ipath_kregs->kr_sendbuffererror + 3);
-       else
-               sbuf[3] = 0;
-
-       if (sbuf[0] || sbuf[1] || (piobcnt > 128 && (sbuf[2] || sbuf[3]))) {
-               int i;
-               if (ipath_debug & (__IPATH_PKTDBG|__IPATH_DBG) &&
-                       time_after(dd->ipath_lastcancel, jiffies)) {
-                       __IPATH_DBG_WHICH(__IPATH_PKTDBG|__IPATH_DBG,
-                                         "SendbufErrs %lx %lx", sbuf[0],
-                                         sbuf[1]);
-                       if (ipath_debug & __IPATH_PKTDBG && piobcnt > 128)
-                               printk(" %lx %lx ", sbuf[2], sbuf[3]);
-                       printk("\n");
-               }
-
-               for (i = 0; i < piobcnt; i++)
-                       if (test_bit(i, sbuf))
-                               ipath_disarm_piobufs(dd, i, 1);
-               /* ignore armlaunch errs for a bit */
-               dd->ipath_lastcancel = jiffies+3;
-       }
-}
-
-
-/* These are all rcv-related errors which we want to count for stats */
-#define E_SUM_PKTERRS \
-       (INFINIPATH_E_RHDRLEN | INFINIPATH_E_RBADTID | \
-        INFINIPATH_E_RBADVERSION | INFINIPATH_E_RHDR | \
-        INFINIPATH_E_RLONGPKTLEN | INFINIPATH_E_RSHORTPKTLEN | \
-        INFINIPATH_E_RMAXPKTLEN | INFINIPATH_E_RMINPKTLEN | \
-        INFINIPATH_E_RFORMATERR | INFINIPATH_E_RUNSUPVL | \
-        INFINIPATH_E_RUNEXPCHAR | INFINIPATH_E_REBP)
-
-/* These are all send-related errors which we want to count for stats */
-#define E_SUM_ERRS \
-       (INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SUNEXPERRPKTNUM | \
-        INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_SDROPPEDSMPPKT | \
-        INFINIPATH_E_SMAXPKTLEN | INFINIPATH_E_SUNSUPVL | \
-        INFINIPATH_E_SMINPKTLEN | INFINIPATH_E_SPKTLEN | \
-        INFINIPATH_E_INVALIDADDR)
-
-/*
- * This is similar to E_SUM_ERRS, but we can't ignore armlaunch, and we
- * don't ignore errors not related to freeze and cancelling buffers.
- * We can't ignore armlaunch because more could arrive while we are still
- * cleaning up, and those need to be cancelled as they happen.
- */
-#define E_SPKT_ERRS_IGNORE \
-        (INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_SDROPPEDSMPPKT | \
-        INFINIPATH_E_SMAXPKTLEN | INFINIPATH_E_SMINPKTLEN | \
-        INFINIPATH_E_SPKTLEN)
-
-/*
- * these are errors that can occur when the link changes state while
- * a packet is being sent or received.  This doesn't cover things
- * like EBP or VCRC that can be the result of the link changing state
- * while a packet is being sent, so we receive a "known bad" packet.
- */
-#define E_SUM_LINK_PKTERRS \
-       (INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_SDROPPEDSMPPKT | \
-        INFINIPATH_E_SMINPKTLEN | INFINIPATH_E_SPKTLEN | \
-        INFINIPATH_E_RSHORTPKTLEN | INFINIPATH_E_RMINPKTLEN | \
-        INFINIPATH_E_RUNEXPCHAR)
-
-static u64 handle_e_sum_errs(struct ipath_devdata *dd, ipath_err_t errs)
-{
-       u64 ignore_this_time = 0;
-
-       ipath_disarm_senderrbufs(dd);
-       if ((errs & E_SUM_LINK_PKTERRS) &&
-           !(dd->ipath_flags & IPATH_LINKACTIVE)) {
-               /*
-                * This can happen when SMA is trying to bring the link
-                * up, but the IB link changes state at the "wrong" time.
-                * The IB logic then complains that the packet isn't
-                * valid.  We don't want to confuse people, so we just
-                * don't print them, except at debug
-                */
-               ipath_dbg("Ignoring packet errors %llx, because link not "
-                         "ACTIVE\n", (unsigned long long) errs);
-               ignore_this_time = errs & E_SUM_LINK_PKTERRS;
-       }
-
-       return ignore_this_time;
-}
-
-/* generic hw error messages... */
-#define INFINIPATH_HWE_TXEMEMPARITYERR_MSG(a) \
-       { \
-               .mask = ( INFINIPATH_HWE_TXEMEMPARITYERR_##a <<    \
-                         INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT ),   \
-               .msg = "TXE " #a " Memory Parity"            \
-       }
-#define INFINIPATH_HWE_RXEMEMPARITYERR_MSG(a) \
-       { \
-               .mask = ( INFINIPATH_HWE_RXEMEMPARITYERR_##a <<    \
-                         INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT ),   \
-               .msg = "RXE " #a " Memory Parity"            \
-       }
-
-static const struct ipath_hwerror_msgs ipath_generic_hwerror_msgs[] = {
-       INFINIPATH_HWE_MSG(IBCBUSFRSPCPARITYERR, "IPATH2IB Parity"),
-       INFINIPATH_HWE_MSG(IBCBUSTOSPCPARITYERR, "IB2IPATH Parity"),
-
-       INFINIPATH_HWE_TXEMEMPARITYERR_MSG(PIOBUF),
-       INFINIPATH_HWE_TXEMEMPARITYERR_MSG(PIOPBC),
-       INFINIPATH_HWE_TXEMEMPARITYERR_MSG(PIOLAUNCHFIFO),
-
-       INFINIPATH_HWE_RXEMEMPARITYERR_MSG(RCVBUF),
-       INFINIPATH_HWE_RXEMEMPARITYERR_MSG(LOOKUPQ),
-       INFINIPATH_HWE_RXEMEMPARITYERR_MSG(EAGERTID),
-       INFINIPATH_HWE_RXEMEMPARITYERR_MSG(EXPTID),
-       INFINIPATH_HWE_RXEMEMPARITYERR_MSG(FLAGBUF),
-       INFINIPATH_HWE_RXEMEMPARITYERR_MSG(DATAINFO),
-       INFINIPATH_HWE_RXEMEMPARITYERR_MSG(HDRINFO),
-};
-
-/**
- * ipath_format_hwmsg - format a single hwerror message
- * @msg: message buffer
- * @msgl: length of message buffer
- * @hwmsg: message to add to message buffer
- */
-static void ipath_format_hwmsg(char *msg, size_t msgl, const char *hwmsg)
-{
-       strlcat(msg, "[", msgl);
-       strlcat(msg, hwmsg, msgl);
-       strlcat(msg, "]", msgl);
-}
-
-/**
- * ipath_format_hwerrors - format hardware error messages for display
- * @hwerrs: hardware errors bit vector
- * @hwerrmsgs: hardware error descriptions
- * @nhwerrmsgs: number of hwerrmsgs
- * @msg: message buffer
- * @msgl: message buffer length
- */
-void ipath_format_hwerrors(u64 hwerrs,
-                          const struct ipath_hwerror_msgs *hwerrmsgs,
-                          size_t nhwerrmsgs,
-                          char *msg, size_t msgl)
-{
-       int i;
-       const int glen =
-           ARRAY_SIZE(ipath_generic_hwerror_msgs);
-
-       for (i = 0; i < glen; i++) {
-               if (hwerrs & ipath_generic_hwerror_msgs[i].mask) {
-                       ipath_format_hwmsg(msg, msgl,
-                                          ipath_generic_hwerror_msgs[i].msg);
-               }
-       }
-
-       for (i = 0; i < nhwerrmsgs; i++) {
-               if (hwerrs & hwerrmsgs[i].mask) {
-                       ipath_format_hwmsg(msg, msgl, hwerrmsgs[i].msg);
-               }
-       }
-}
-
-/* return the strings for the most common link states */
-static char *ib_linkstate(struct ipath_devdata *dd, u64 ibcs)
-{
-       char *ret;
-       u32 state;
-
-       state = ipath_ib_state(dd, ibcs);
-       if (state == dd->ib_init)
-               ret = "Init";
-       else if (state == dd->ib_arm)
-               ret = "Arm";
-       else if (state == dd->ib_active)
-               ret = "Active";
-       else
-               ret = "Down";
-       return ret;
-}
-
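-/* report an IB event (e.g. port active/error) for port 1 of this device */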
-void signal_ib_event(struct ipath_devdata *dd, enum ib_event_type ev)
-{
-       struct ib_event event;
-
-       event.device = &dd->verbs_dev->ibdev;
-       event.element.port_num = 1;
-       event.event = ev;
-       ib_dispatch_event(&event);
-}
-
-static void handle_e_ibstatuschanged(struct ipath_devdata *dd,
-                                    ipath_err_t errs)
-{
-       u32 ltstate, lstate, ibstate, lastlstate;
-       u32 init = dd->ib_init;
-       u32 arm = dd->ib_arm;
-       u32 active = dd->ib_active;
-       const u64 ibcs = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus);
-
-       lstate = ipath_ib_linkstate(dd, ibcs); /* linkstate */
-       ibstate = ipath_ib_state(dd, ibcs);
-       /* linkstate at last interrupt */
-       lastlstate = ipath_ib_linkstate(dd, dd->ipath_lastibcstat);
-       ltstate = ipath_ib_linktrstate(dd, ibcs); /* link training state */
-
-       /*
-        * Since going into a recovery state causes the link state to go
-        * down and since recovery is transitory, it is better if we "miss"
-        * ever seeing the link training state go into recovery (i.e.,
-        * ignore this transition for link state special handling purposes)
-        * without even updating ipath_lastibcstat.
-        */
-       if ((ltstate == INFINIPATH_IBCS_LT_STATE_RECOVERRETRAIN) ||
-           (ltstate == INFINIPATH_IBCS_LT_STATE_RECOVERWAITRMT) ||
-           (ltstate == INFINIPATH_IBCS_LT_STATE_RECOVERIDLE))
-               goto done;
-
-       /*
-        * if linkstate transitions into INIT from any of the various down
-        * states, or if it transitions from any of the up (INIT or better)
-        * states into any of the down states (except link recovery), then
-        * call the chip-specific code to take appropriate actions.
-        */
-       if (lstate >= INFINIPATH_IBCS_L_STATE_INIT &&
-               lastlstate == INFINIPATH_IBCS_L_STATE_DOWN) {
-               /* transitioned to UP */
-               if (dd->ipath_f_ib_updown(dd, 1, ibcs)) {
-                       /* link came up, so we must no longer be disabled */
-                       dd->ipath_flags &= ~IPATH_IB_LINK_DISABLED;
-                       ipath_cdbg(LINKVERB, "LinkUp handled, skipped\n");
-                       goto skip_ibchange; /* chip-code handled */
-               }
-       } else if ((lastlstate >= INFINIPATH_IBCS_L_STATE_INIT ||
-               (dd->ipath_flags & IPATH_IB_FORCE_NOTIFY)) &&
-               ltstate <= INFINIPATH_IBCS_LT_STATE_CFGWAITRMT &&
-               ltstate != INFINIPATH_IBCS_LT_STATE_LINKUP) {
-               int handled;
-               handled = dd->ipath_f_ib_updown(dd, 0, ibcs);
-               dd->ipath_flags &= ~IPATH_IB_FORCE_NOTIFY;
-               if (handled) {
-                       ipath_cdbg(LINKVERB, "LinkDown handled, skipped\n");
-                       goto skip_ibchange; /* chip-code handled */
-               }
-       }
-
-       /*
-        * Significant enough to always print and get into logs, if it was
-        * unexpected.  If it was a requested state change, we'll have
-        * already cleared the flags, so we won't print this warning
-        */
-       if ((ibstate != arm && ibstate != active) &&
-           (dd->ipath_flags & (IPATH_LINKARMED | IPATH_LINKACTIVE))) {
-               dev_info(&dd->pcidev->dev, "Link state changed from %s "
-                        "to %s\n", (dd->ipath_flags & IPATH_LINKARMED) ?
-                        "ARM" : "ACTIVE", ib_linkstate(dd, ibcs));
-       }
-
-       if (ltstate == INFINIPATH_IBCS_LT_STATE_POLLACTIVE ||
-           ltstate == INFINIPATH_IBCS_LT_STATE_POLLQUIET) {
-               u32 lastlts;
-               lastlts = ipath_ib_linktrstate(dd, dd->ipath_lastibcstat);
-               /*
-                * Ignore cycling back and forth from Polling.Active to
-                * Polling.Quiet while waiting for the other end of the link
-                * to come up, except to try and decide if we are connected
-                * to a live IB device or not.  We will cycle back and
-                * forth between them if no cable is plugged in, the other
-                * device is powered off or disabled, etc.
-                */
-               if (lastlts == INFINIPATH_IBCS_LT_STATE_POLLACTIVE ||
-                   lastlts == INFINIPATH_IBCS_LT_STATE_POLLQUIET) {
-                       if (!(dd->ipath_flags & IPATH_IB_AUTONEG_INPROG) &&
-                            (++dd->ipath_ibpollcnt == 40)) {
-                               dd->ipath_flags |= IPATH_NOCABLE;
-                               *dd->ipath_statusp |=
-                                       IPATH_STATUS_IB_NOCABLE;
-                               ipath_cdbg(LINKVERB, "Set NOCABLE\n");
-                       }
-                       ipath_cdbg(LINKVERB, "POLL change to %s (%x)\n",
-                               ipath_ibcstatus_str[ltstate], ibstate);
-                       goto skip_ibchange;
-               }
-       }
-
-       dd->ipath_ibpollcnt = 0; /* not poll*, now */
-       ipath_stats.sps_iblink++;
-
-       if (ibstate != init && dd->ipath_lastlinkrecov && ipath_linkrecovery) {
-               u64 linkrecov;
-               linkrecov = ipath_snap_cntr(dd,
-                       dd->ipath_cregs->cr_iblinkerrrecovcnt);
-               if (linkrecov != dd->ipath_lastlinkrecov) {
-                       ipath_dbg("IB linkrecov up %Lx (%s %s) recov %Lu\n",
-                               (unsigned long long) ibcs,
-                               ib_linkstate(dd, ibcs),
-                               ipath_ibcstatus_str[ltstate],
-                               (unsigned long long) linkrecov);
-                       /* and no more until active again */
-                       dd->ipath_lastlinkrecov = 0;
-                       ipath_set_linkstate(dd, IPATH_IB_LINKDOWN);
-                       goto skip_ibchange;
-               }
-       }
-
-       if (ibstate == init || ibstate == arm || ibstate == active) {
-               *dd->ipath_statusp &= ~IPATH_STATUS_IB_NOCABLE;
-               if (ibstate == init || ibstate == arm) {
-                       *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY;
-                       if (dd->ipath_flags & IPATH_LINKACTIVE)
-                               signal_ib_event(dd, IB_EVENT_PORT_ERR);
-               }
-               if (ibstate == arm) {
-                       dd->ipath_flags |= IPATH_LINKARMED;
-                       dd->ipath_flags &= ~(IPATH_LINKUNK |
-                               IPATH_LINKINIT | IPATH_LINKDOWN |
-                               IPATH_LINKACTIVE | IPATH_NOCABLE);
-                       ipath_hol_down(dd);
-               } else  if (ibstate == init) {
-                       /*
-                        * set INIT and DOWN.  Down is checked by
-                        * most of the other code, but INIT is
-                        * useful to know in a few places.
-                        */
-                       dd->ipath_flags |= IPATH_LINKINIT |
-                               IPATH_LINKDOWN;
-                       dd->ipath_flags &= ~(IPATH_LINKUNK |
-                               IPATH_LINKARMED | IPATH_LINKACTIVE |
-                               IPATH_NOCABLE);
-                       ipath_hol_down(dd);
-               } else {  /* active */
-                       dd->ipath_lastlinkrecov = ipath_snap_cntr(dd,
-                               dd->ipath_cregs->cr_iblinkerrrecovcnt);
-                       *dd->ipath_statusp |=
-                               IPATH_STATUS_IB_READY | IPATH_STATUS_IB_CONF;
-                       dd->ipath_flags |= IPATH_LINKACTIVE;
-                       dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKINIT
-                               | IPATH_LINKDOWN | IPATH_LINKARMED |
-                               IPATH_NOCABLE);
-                       if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
-                               ipath_restart_sdma(dd);
-                       signal_ib_event(dd, IB_EVENT_PORT_ACTIVE);
-                       /* LED active not handled in chip _f_updown */
-                       dd->ipath_f_setextled(dd, lstate, ltstate);
-                       ipath_hol_up(dd);
-               }
-
-               /*
-                * print after we've already done the work, so as not to
-                * delay the state changes and notifications, for debugging
-                */
-               if (lstate == lastlstate)
-                       ipath_cdbg(LINKVERB, "Unchanged from last: %s "
-                               "(%x)\n", ib_linkstate(dd, ibcs), ibstate);
-               else
-                       ipath_cdbg(VERBOSE, "Unit %u: link up to %s %s (%x)\n",
-                                 dd->ipath_unit, ib_linkstate(dd, ibcs),
-                                 ipath_ibcstatus_str[ltstate],  ibstate);
-       } else { /* down */
-               if (dd->ipath_flags & IPATH_LINKACTIVE)
-                       signal_ib_event(dd, IB_EVENT_PORT_ERR);
-               dd->ipath_flags |= IPATH_LINKDOWN;
-               dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKINIT
-                                    | IPATH_LINKACTIVE |
-                                    IPATH_LINKARMED);
-               *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY;
-               dd->ipath_lli_counter = 0;
-
-               if (lastlstate != INFINIPATH_IBCS_L_STATE_DOWN)
-                       ipath_cdbg(VERBOSE, "Unit %u link state down "
-                                  "(state 0x%x), from %s\n",
-                                  dd->ipath_unit, lstate,
-                                  ib_linkstate(dd, dd->ipath_lastibcstat));
-               else
-                       ipath_cdbg(LINKVERB, "Unit %u link state changed "
-                                  "to %s (0x%x) from down (%x)\n",
-                                  dd->ipath_unit,
-                                  ipath_ibcstatus_str[ltstate],
-                                  ibstate, lastlstate);
-       }
-
-skip_ibchange:
-       dd->ipath_lastibcstat = ibcs;
-done:
-       return;
-}
-
-static void handle_supp_msgs(struct ipath_devdata *dd,
-                            unsigned supp_msgs, char *msg, u32 msgsz)
-{
-       /*
-        * Print the message unless it's ibc status change only, which
-        * happens so often we never want to count it.
-        */
-       if (dd->ipath_lasterror & ~INFINIPATH_E_IBSTATUSCHANGED) {
-               int iserr;
-               ipath_err_t mask;
-               iserr = ipath_decode_err(dd, msg, msgsz,
-                                        dd->ipath_lasterror &
-                                        ~INFINIPATH_E_IBSTATUSCHANGED);
-
-               mask = INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL |
-                       INFINIPATH_E_PKTERRS | INFINIPATH_E_SDMADISABLED;
-
-               /* if we're in debug, then don't mask SDMADISABLED msgs */
-               if (ipath_debug & __IPATH_DBG)
-                       mask &= ~INFINIPATH_E_SDMADISABLED;
-
-               if (dd->ipath_lasterror & ~mask)
-                       ipath_dev_err(dd, "Suppressed %u messages for "
-                                     "fast-repeating errors (%s) (%llx)\n",
-                                     supp_msgs, msg,
-                                     (unsigned long long)
-                                     dd->ipath_lasterror);
-               else {
-                       /*
-                        * rcvegrfull and rcvhdrqfull are "normal", for some
-                        * types of processes (mostly benchmarks) that send
-                        * huge numbers of messages, while not processing
-                        * them. So only complain about these at debug
-                        * level.
-                        */
-                       if (iserr)
-                               ipath_dbg("Suppressed %u messages for %s\n",
-                                         supp_msgs, msg);
-                       else
-                               ipath_cdbg(ERRPKT,
-                                       "Suppressed %u messages for %s\n",
-                                         supp_msgs, msg);
-               }
-       }
-}
-
-static unsigned handle_frequent_errors(struct ipath_devdata *dd,
-                                      ipath_err_t errs, char *msg,
-                                      u32 msgsz, int *noprint)
-{
-       unsigned long nc;
-       static unsigned long nextmsg_time;
-       static unsigned nmsgs, supp_msgs;
-
-       /*
-        * Throttle back "fast" messages to no more than 10 per 5 seconds.
-        * This isn't perfect, but it's a reasonable heuristic. If we get
-        * more than 10, give a 6x longer delay.
-        */
-       nc = jiffies;
-       if (nmsgs > 10) {
-               if (time_before(nc, nextmsg_time)) {
-                       *noprint = 1;
-                       if (!supp_msgs++)
-                               nextmsg_time = nc + HZ * 3;
-               } else if (supp_msgs) {
-                       handle_supp_msgs(dd, supp_msgs, msg, msgsz);
-                       supp_msgs = 0;
-                       nmsgs = 0;
-               }
-       } else if (!nmsgs++ || time_after(nc, nextmsg_time))
-               nextmsg_time = nc + HZ / 2;
-
-       return supp_msgs;
-}
-
-static void handle_sdma_errors(struct ipath_devdata *dd, ipath_err_t errs)
-{
-       unsigned long flags;
-       int expected;
-
-       if (ipath_debug & __IPATH_DBG) {
-               char msg[128];
-               ipath_decode_err(dd, msg, sizeof msg, errs &
-                       INFINIPATH_E_SDMAERRS);
-               ipath_dbg("errors %lx (%s)\n", (unsigned long)errs, msg);
-       }
-       if (ipath_debug & __IPATH_VERBDBG) {
-               unsigned long tl, hd, status, lengen;
-               tl = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmatail);
-               hd = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmahead);
-               status = ipath_read_kreg64(dd,
-                       dd->ipath_kregs->kr_senddmastatus);
-               lengen = ipath_read_kreg64(dd,
-                       dd->ipath_kregs->kr_senddmalengen);
-               ipath_cdbg(VERBOSE, "sdma tl 0x%lx hd 0x%lx status 0x%lx "
-                       "lengen 0x%lx\n", tl, hd, status, lengen);
-       }
-
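-       /*
-        * Mark SDMA disabled; cancel sends unless an abort was already
-        * in progress.
-        */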
-       spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
-       __set_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status);
-       expected = test_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status);
-       spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-       if (!expected)
-               ipath_cancel_sends(dd, 1);
-}
-
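-/* handle the SendDMA interrupt causes: SDmaInt and SDmaDisabled */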
-static void handle_sdma_intr(struct ipath_devdata *dd, u64 istat)
-{
-       unsigned long flags;
-       int expected;
-
-       if ((istat & INFINIPATH_I_SDMAINT) &&
-           !test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
-               ipath_sdma_intr(dd);
-
-       if (istat & INFINIPATH_I_SDMADISABLED) {
-               expected = test_bit(IPATH_SDMA_ABORTING,
-                       &dd->ipath_sdma_status);
-               ipath_dbg("%s SDmaDisabled intr\n",
-                       expected ? "expected" : "unexpected");
-               spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
-               __set_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status);
-               spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-               if (!expected)
-                       ipath_cancel_sends(dd, 1);
-               if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
-                       tasklet_hi_schedule(&dd->ipath_sdma_abort_task);
-       }
-}
-
-static int handle_hdrq_full(struct ipath_devdata *dd)
-{
-       int chkerrpkts = 0;
-       u32 hd, tl;
-       u32 i;
-
-       ipath_stats.sps_hdrqfull++;
-       for (i = 0; i < dd->ipath_cfgports; i++) {
-               struct ipath_portdata *pd = dd->ipath_pd[i];
-
-               if (i == 0) {
-                       /*
-                        * For kernel receive queues, we just want to know
-                        * if there are packets in the queue that we can
-                        * process.
-                        */
-                       if (pd->port_head != ipath_get_hdrqtail(pd))
-                               chkerrpkts |= 1 << i;
-                       continue;
-               }
-
-               /* Skip if user context is not open */
-               if (!pd || !pd->port_cnt)
-                       continue;
-
-               /* Don't report the same point multiple times. */
-               if (dd->ipath_flags & IPATH_NODMA_RTAIL)
-                       tl = ipath_read_ureg32(dd, ur_rcvhdrtail, i);
-               else
-                       tl = ipath_get_rcvhdrtail(pd);
-               if (tl == pd->port_lastrcvhdrqtail)
-                       continue;
-
-               hd = ipath_read_ureg32(dd, ur_rcvhdrhead, i);
-               if (hd == (tl + 1) || (!hd && tl == dd->ipath_hdrqlast)) {
-                       pd->port_lastrcvhdrqtail = tl;
-                       pd->port_hdrqfull++;
-                       /* flush hdrqfull so that poll() sees it */
-                       wmb();
-                       wake_up_interruptible(&pd->port_wait);
-               }
-       }
-
-       return chkerrpkts;
-}
-
-static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs)
-{
-       char msg[128];
-       u64 ignore_this_time = 0;
-       u64 iserr = 0;
-       int chkerrpkts = 0, noprint = 0;
-       unsigned supp_msgs;
-       int log_idx;
-
-       /*
-        * don't report errors that are masked, either at init
-        * (not set in ipath_errormask), or temporarily (set in
-        * ipath_maskederrs)
-        */
-       errs &= dd->ipath_errormask & ~dd->ipath_maskederrs;
-
-       supp_msgs = handle_frequent_errors(dd, errs, msg, (u32)sizeof msg,
-               &noprint);
-
-       /* do these first, they are most important */
-       if (errs & INFINIPATH_E_HARDWARE) {
-               /* reuse same msg buf */
-               dd->ipath_f_handle_hwerrors(dd, msg, sizeof msg);
-       } else {
-               u64 mask;
-               for (log_idx = 0; log_idx < IPATH_EEP_LOG_CNT; ++log_idx) {
-                       mask = dd->ipath_eep_st_masks[log_idx].errs_to_log;
-                       if (errs & mask)
-                               ipath_inc_eeprom_err(dd, log_idx, 1);
-               }
-       }
-
-       if (errs & INFINIPATH_E_SDMAERRS)
-               handle_sdma_errors(dd, errs);
-
-       if (!noprint && (errs & ~dd->ipath_e_bitsextant))
-               ipath_dev_err(dd, "error interrupt with unknown errors "
-                             "%llx set\n", (unsigned long long)
-                             (errs & ~dd->ipath_e_bitsextant));
-
-       if (errs & E_SUM_ERRS)
-               ignore_this_time = handle_e_sum_errs(dd, errs);
-       else if ((errs & E_SUM_LINK_PKTERRS) &&
-           !(dd->ipath_flags & IPATH_LINKACTIVE)) {
-               /*
-                * This can happen when SMA is trying to bring the link
-                * up, but the IB link changes state at the "wrong" time.
-                * The IB logic then complains that the packet isn't
-                * valid.  We don't want to confuse people, so we just
-                * don't print them, except at debug
-                */
-               ipath_dbg("Ignoring packet errors %llx, because link not "
-                         "ACTIVE\n", (unsigned long long) errs);
-               ignore_this_time = errs & E_SUM_LINK_PKTERRS;
-       }
-
-       if (supp_msgs == 250000) {
-               int s_iserr;
-               /*
-                * It's not entirely reasonable assuming that the errors set
-                * in the last clear period are all responsible for the
-                * problem, but the alternative is to assume it's the only
-                * ones on this particular interrupt, which also isn't great
-                */
-               dd->ipath_maskederrs |= dd->ipath_lasterror | errs;
-
-               dd->ipath_errormask &= ~dd->ipath_maskederrs;
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
-                                dd->ipath_errormask);
-               s_iserr = ipath_decode_err(dd, msg, sizeof msg,
-                                          dd->ipath_maskederrs);
-
-               if (dd->ipath_maskederrs &
-                   ~(INFINIPATH_E_RRCVEGRFULL |
-                     INFINIPATH_E_RRCVHDRFULL | INFINIPATH_E_PKTERRS))
-                       ipath_dev_err(dd, "Temporarily disabling "
-                           "error(s) %llx reporting; too frequent (%s)\n",
-                               (unsigned long long) dd->ipath_maskederrs,
-                               msg);
-               else {
-                       /*
-                        * rcvegrfull and rcvhdrqfull are "normal",
-                        * for some types of processes (mostly benchmarks)
-                        * that send huge numbers of messages, while not
-                        * processing them.  So only complain about
-                        * these at debug level.
-                        */
-                       if (s_iserr)
-                               ipath_dbg("Temporarily disabling reporting "
-                                   "too frequent queue full errors (%s)\n",
-                                   msg);
-                       else
-                               ipath_cdbg(ERRPKT,
-                                   "Temporarily disabling reporting too"
-                                   " frequent packet errors (%s)\n",
-                                   msg);
-               }
-
-               /*
-                * Re-enable the masked errors after around 3 minutes, in
-                * ipath_get_faststats().  If we have a series of fast
-                * repeating but different errors, the interval will keep
-                * stretching out, but that's OK, as that's pretty
-                * catastrophic.
-                */
-               dd->ipath_unmasktime = jiffies + HZ * 180;
-       }
-
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, errs);
-       if (ignore_this_time)
-               errs &= ~ignore_this_time;
-       if (errs & ~dd->ipath_lasterror) {
-               errs &= ~dd->ipath_lasterror;
-               /* never suppress duplicate hwerrors or ibstatuschange */
-               dd->ipath_lasterror |= errs &
-                       ~(INFINIPATH_E_HARDWARE |
-                         INFINIPATH_E_IBSTATUSCHANGED);
-       }
-
-       if (errs & INFINIPATH_E_SENDSPECIALTRIGGER) {
-               dd->ipath_spectriggerhit++;
-               ipath_dbg("%lu special trigger hits\n",
-                       dd->ipath_spectriggerhit);
-       }
-
-       /* likely due to cancel; so suppress message unless verbose */
-       if ((errs & (INFINIPATH_E_SPKTLEN | INFINIPATH_E_SPIOARMLAUNCH)) &&
-               time_after(dd->ipath_lastcancel, jiffies)) {
-               /* armlaunch takes precedence; it often causes both. */
-               ipath_cdbg(VERBOSE,
-                       "Suppressed %s error (%llx) after sendbuf cancel\n",
-                       (errs &  INFINIPATH_E_SPIOARMLAUNCH) ?
-                       "armlaunch" : "sendpktlen", (unsigned long long)errs);
-               errs &= ~(INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SPKTLEN);
-       }
-
-       if (!errs)
-               return 0;
-
-       if (!noprint) {
-               ipath_err_t mask;
-               /*
-                * The ones we mask off are handled specially below
-                * or above.  Also mask SDMADISABLED by default as it
-                * is too chatty.
-                */
-               mask = INFINIPATH_E_IBSTATUSCHANGED |
-                       INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL |
-                       INFINIPATH_E_HARDWARE | INFINIPATH_E_SDMADISABLED;
-
-               /* if we're in debug, then don't mask SDMADISABLED msgs */
-               if (ipath_debug & __IPATH_DBG)
-                       mask &= ~INFINIPATH_E_SDMADISABLED;
-
-               ipath_decode_err(dd, msg, sizeof msg, errs & ~mask);
-       } else
-               /* so we don't need if (!noprint) at strlcat's below */
-               *msg = 0;
-
-       if (errs & E_SUM_PKTERRS) {
-               ipath_stats.sps_pkterrs++;
-               chkerrpkts = 1;
-       }
-       if (errs & E_SUM_ERRS)
-               ipath_stats.sps_errs++;
-
-       if (errs & (INFINIPATH_E_RICRC | INFINIPATH_E_RVCRC)) {
-               ipath_stats.sps_crcerrs++;
-               chkerrpkts = 1;
-       }
-       iserr = errs & ~(E_SUM_PKTERRS | INFINIPATH_E_PKTERRS);
-
-
-       /*
-        * We don't want to print these two as they happen, or we can make
-        * the situation even worse, because it takes so long to print
-        * messages to serial consoles.  Kernel ports get printed from
-        * fast_stats, no more than every 5 seconds, user ports get printed
-        * on close
-        */
-       if (errs & INFINIPATH_E_RRCVHDRFULL)
-               chkerrpkts |= handle_hdrq_full(dd);
-       if (errs & INFINIPATH_E_RRCVEGRFULL) {
-               struct ipath_portdata *pd = dd->ipath_pd[0];
-
-               /*
-                * since this is of less importance and not likely to
-                * happen without also getting hdrfull, only count
-                * occurrences; don't check each port (or even the kernel
-                * vs user)
-                */
-               ipath_stats.sps_etidfull++;
-               if (pd->port_head != ipath_get_hdrqtail(pd))
-                       chkerrpkts |= 1;
-       }
-
-       /*
-        * do this before IBSTATUSCHANGED, in case both bits set in a single
-        * interrupt; we want the STATUSCHANGE to "win", so our internal
-        * copy of the state machine stays correct
-        */
-       if (errs & INFINIPATH_E_RIBLOSTLINK) {
-               /*
-                * force through block below
-                */
-               errs |= INFINIPATH_E_IBSTATUSCHANGED;
-               ipath_stats.sps_iblink++;
-               dd->ipath_flags |= IPATH_LINKDOWN;
-               dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKINIT
-                                    | IPATH_LINKARMED | IPATH_LINKACTIVE);
-               *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY;
-
-               ipath_dbg("Lost link, link now down (%s)\n",
-                       ipath_ibcstatus_str[ipath_read_kreg64(dd,
-                       dd->ipath_kregs->kr_ibcstatus) & 0xf]);
-       }
-       if (errs & INFINIPATH_E_IBSTATUSCHANGED)
-               handle_e_ibstatuschanged(dd, errs);
-
-       if (errs & INFINIPATH_E_RESET) {
-               if (!noprint)
-                       ipath_dev_err(dd, "Got reset, requires re-init "
-                                     "(unload and reload driver)\n");
-               dd->ipath_flags &= ~IPATH_INITTED;      /* needs re-init */
-               /* mark as having had error */
-               *dd->ipath_statusp |= IPATH_STATUS_HWERROR;
-               *dd->ipath_statusp &= ~IPATH_STATUS_IB_CONF;
-       }
-
-       if (!noprint && *msg) {
-               if (iserr)
-                       ipath_dev_err(dd, "%s error\n", msg);
-       }
-       if (dd->ipath_state_wanted & dd->ipath_flags) {
-               ipath_cdbg(VERBOSE, "driver wanted state %x, iflags now %x, "
-                          "waking\n", dd->ipath_state_wanted,
-                          dd->ipath_flags);
-               wake_up_interruptible(&ipath_state_wait);
-       }
-
-       return chkerrpkts;
-}
-
-/*
- * try to cleanup as much as possible for anything that might have gone
- * wrong while in freeze mode, such as pio buffers being written by user
- * processes (causing armlaunch), send errors due to going into freeze mode,
- * etc., and try to avoid causing extra interrupts while doing so.
- * Forcibly update the in-memory pioavail register copies after cleanup
- * because the chip won't do it while in freeze mode (the register values
- * themselves are kept correct).
- * Make sure that we don't lose any important interrupts by using the chip
- * feature that says that writing 0 to a bit in *clear that is set in
- * *status will cause an interrupt to be generated again (if allowed by
- * the *mask value).
- */
-void ipath_clear_freeze(struct ipath_devdata *dd)
-{
-       /* disable error interrupts, to avoid confusion */
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, 0ULL);
-
-       /* also disable interrupts; errormask is sometimes overwritten */
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL);
-
-       ipath_cancel_sends(dd, 1);
-
-       /* clear the freeze, and be sure chip saw it */
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
-                        dd->ipath_control);
-       ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-
-       /* force in-memory update now we are out of freeze */
-       ipath_force_pio_avail_update(dd);
-
-       /*
-        * force new interrupt if any hwerr, error or interrupt bits are
-        * still set, and clear "safe" send packet errors related to freeze
-        * and cancelling sends.  Re-enable error interrupts before possible
-        * force of re-interrupt on pending interrupts.
-        */
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, 0ULL);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear,
-               E_SPKT_ERRS_IGNORE);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
-               dd->ipath_errormask);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, -1LL);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, 0ULL);
-}
-
-
-/* this is separate to allow for better optimization of ipath_intr() */
-
-static noinline void ipath_bad_intr(struct ipath_devdata *dd, u32 *unexpectp)
-{
-       /*
-        * These sometimes happen during driver init and unload; we don't
-        * want to process any interrupts at that point
-        */
-
-       /* this is just a bandaid, not a fix, if something goes badly
-        * wrong */
-       if (++*unexpectp > 100) {
-               if (++*unexpectp > 105) {
-                       /*
-                        * ok, we must be taking somebody else's interrupts,
-                        * due to a messed up mptable and/or PIRQ table, so
-                        * unregister the interrupt.  We've seen this during
-                        * linuxbios development work, and it may happen in
-                        * the future again.
-                        */
-                       if (dd->pcidev && dd->ipath_irq) {
-                               ipath_dev_err(dd, "Now %u unexpected "
-                                             "interrupts, unregistering "
-                                             "interrupt handler\n",
-                                             *unexpectp);
-                               ipath_dbg("free_irq of irq %d\n",
-                                         dd->ipath_irq);
-                               dd->ipath_f_free_irq(dd);
-                       }
-               }
-               if (ipath_read_ireg(dd, dd->ipath_kregs->kr_intmask)) {
-                       ipath_dev_err(dd, "%u unexpected interrupts, "
-                                     "disabling interrupts completely\n",
-                                     *unexpectp);
-                       /*
-                        * disable all interrupts, something is very wrong
-                        */
-                       ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask,
-                                        0ULL);
-               }
-       } else if (*unexpectp > 1)
-               ipath_dbg("Interrupt when not ready, should not happen, "
-                         "ignoring\n");
-}
-
-static noinline void ipath_bad_regread(struct ipath_devdata *dd)
-{
-       static int allbits;
-
-       /* separate routine, for better optimization of ipath_intr() */
-
-       /*
-        * We print the message and disable interrupts, in hope of
-        * having a better chance of debugging the problem.
-        */
-       ipath_dev_err(dd,
-                     "Read of interrupt status failed (all bits set)\n");
-       if (allbits++) {
-               /* disable all interrupts, something is very wrong */
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL);
-               if (allbits == 2) {
-                       ipath_dev_err(dd, "Still bad interrupt status, "
-                                     "unregistering interrupt\n");
-                       dd->ipath_f_free_irq(dd);
-               } else if (allbits > 2) {
-                       if ((allbits % 10000) == 0)
-                               printk(".");
-               } else
-                       ipath_dev_err(dd, "Disabling interrupts, "
-                                     "multiple errors\n");
-       }
-}
-
-static void handle_layer_pioavail(struct ipath_devdata *dd)
-{
-       unsigned long flags;
-       int ret;
-
-       ret = ipath_ib_piobufavail(dd->verbs_dev);
-       if (ret > 0)
-               goto set;
-
-       return;
-set:
-       spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-       dd->ipath_sendctrl |= INFINIPATH_S_PIOINTBUFAVAIL;
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
-                        dd->ipath_sendctrl);
-       ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-       spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-}
-
-/*
- * Handle receive interrupts for user ports; this means a user
- * process was waiting for a packet to arrive, and didn't want
- * to poll
- */
-static void handle_urcv(struct ipath_devdata *dd, u64 istat)
-{
-       u64 portr;
-       int i;
-       int rcvdint = 0;
-
-       /*
-        * test_and_clear_bit(IPATH_PORT_WAITING_RCV) and
-        * test_and_clear_bit(IPATH_PORT_WAITING_URG) below
-        * would both like timely updates of the bits so that
-        * we don't pass them by unnecessarily.  the rmb()
-        * here ensures that we see them promptly -- the
-        * corresponding wmb()'s are in ipath_poll_urgent()
-        * and ipath_poll_next()...
-        */
-       rmb();
-       portr = ((istat >> dd->ipath_i_rcvavail_shift) &
-                dd->ipath_i_rcvavail_mask) |
-               ((istat >> dd->ipath_i_rcvurg_shift) &
-                dd->ipath_i_rcvurg_mask);
-       for (i = 1; i < dd->ipath_cfgports; i++) {
-               struct ipath_portdata *pd = dd->ipath_pd[i];
-
-               if (portr & (1 << i) && pd && pd->port_cnt) {
-                       if (test_and_clear_bit(IPATH_PORT_WAITING_RCV,
-                                              &pd->port_flag)) {
-                               clear_bit(i + dd->ipath_r_intravail_shift,
-                                         &dd->ipath_rcvctrl);
-                               wake_up_interruptible(&pd->port_wait);
-                               rcvdint = 1;
-                       } else if (test_and_clear_bit(IPATH_PORT_WAITING_URG,
-                                                     &pd->port_flag)) {
-                               pd->port_urgent++;
-                               wake_up_interruptible(&pd->port_wait);
-                       }
-               }
-       }
-       if (rcvdint) {
-               /* only want to take one interrupt, so turn off the rcv
-                * interrupt for all the ports that had rcv_waiting set
-                * (but never for the kernel port)
-                */
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
-                                dd->ipath_rcvctrl);
-       }
-}
-
-irqreturn_t ipath_intr(int irq, void *data)
-{
-       struct ipath_devdata *dd = data;
-       u64 istat, chk0rcv = 0;
-       ipath_err_t estat = 0;
-       irqreturn_t ret;
-       static unsigned unexpected = 0;
-       u64 kportrbits;
-
-       ipath_stats.sps_ints++;
-
-       if (dd->ipath_int_counter != (u32) -1)
-               dd->ipath_int_counter++;
-
-       if (!(dd->ipath_flags & IPATH_PRESENT)) {
-               /*
-                * This return value is not great, but we do not want the
-                * interrupt core code to remove our interrupt handler
-                * because we don't appear to be handling an interrupt
-                * during a chip reset.
-                */
-               return IRQ_HANDLED;
-       }
-
-       /*
-        * this needs to be flags&initted, not statusp, so we keep
-        * taking interrupts even after link goes down, etc.
-        * Also, we *must* clear the interrupt at some point, or we won't
-        * take it again, which can be real bad for errors, etc...
-        */
-
-       if (!(dd->ipath_flags & IPATH_INITTED)) {
-               ipath_bad_intr(dd, &unexpected);
-               ret = IRQ_NONE;
-               goto bail;
-       }
-
-       istat = ipath_read_ireg(dd, dd->ipath_kregs->kr_intstatus);
-
-       if (unlikely(!istat)) {
-               ipath_stats.sps_nullintr++;
-               ret = IRQ_NONE; /* not our interrupt, or already handled */
-               goto bail;
-       }
-       if (unlikely(istat == -1)) {
-               ipath_bad_regread(dd);
-               /* don't know if it was our interrupt or not */
-               ret = IRQ_NONE;
-               goto bail;
-       }
-
-       if (unexpected)
-               unexpected = 0;
-
-       if (unlikely(istat & ~dd->ipath_i_bitsextant))
-               ipath_dev_err(dd,
-                             "interrupt with unknown interrupts %Lx set\n",
-                             (unsigned long long)
-                             istat & ~dd->ipath_i_bitsextant);
-       else if (istat & ~INFINIPATH_I_ERROR) /* errors do own printing */
-               ipath_cdbg(VERBOSE, "intr stat=0x%Lx\n",
-                       (unsigned long long) istat);
-
-       if (istat & INFINIPATH_I_ERROR) {
-               ipath_stats.sps_errints++;
-               estat = ipath_read_kreg64(dd,
-                                         dd->ipath_kregs->kr_errorstatus);
-               if (!estat)
-                       dev_info(&dd->pcidev->dev, "error interrupt (%Lx), "
-                                "but no error bits set!\n",
-                                (unsigned long long) istat);
-               else if (estat == -1LL)
-                       /*
-                        * should we try clearing all, or hope next read
-                        * works?
-                        */
-                       ipath_dev_err(dd, "Read of error status failed "
-                                     "(all bits set); ignoring\n");
-               else
-                       chk0rcv |= handle_errors(dd, estat);
-       }
-
-       if (istat & INFINIPATH_I_GPIO) {
-               /*
-                * GPIO interrupts fall in two broad classes:
-                * GPIO_2 indicates (on some HT4xx boards) that a packet
-                *        has arrived for Port 0. Checking for this
-                *        is controlled by flag IPATH_GPIO_INTR.
-                * GPIO_3..5 on IBA6120 Rev2 and IBA6110 Rev4 chips indicate
-                *        errors that we need to count. Checking for this
-                *        is controlled by flag IPATH_GPIO_ERRINTRS.
-                */
-               u32 gpiostatus;
-               u32 to_clear = 0;
-
-               gpiostatus = ipath_read_kreg32(
-                       dd, dd->ipath_kregs->kr_gpio_status);
-               /* First the error-counter case. */
-               if ((gpiostatus & IPATH_GPIO_ERRINTR_MASK) &&
-                   (dd->ipath_flags & IPATH_GPIO_ERRINTRS)) {
-                       /* want to clear the bits we see asserted. */
-                       to_clear |= (gpiostatus & IPATH_GPIO_ERRINTR_MASK);
-
-                       /*
-                        * Count appropriately, clear bits out of our copy,
-                        * as they have been "handled".
-                        */
-                       if (gpiostatus & (1 << IPATH_GPIO_RXUVL_BIT)) {
-                               ipath_dbg("FlowCtl on UnsupVL\n");
-                               dd->ipath_rxfc_unsupvl_errs++;
-                       }
-                       if (gpiostatus & (1 << IPATH_GPIO_OVRUN_BIT)) {
-                               ipath_dbg("Overrun Threshold exceeded\n");
-                               dd->ipath_overrun_thresh_errs++;
-                       }
-                       if (gpiostatus & (1 << IPATH_GPIO_LLI_BIT)) {
-                               ipath_dbg("Local Link Integrity error\n");
-                               dd->ipath_lli_errs++;
-                       }
-                       gpiostatus &= ~IPATH_GPIO_ERRINTR_MASK;
-               }
-               /* Now the Port0 Receive case */
-               if ((gpiostatus & (1 << IPATH_GPIO_PORT0_BIT)) &&
-                   (dd->ipath_flags & IPATH_GPIO_INTR)) {
-                       /*
-                        * GPIO status bit 2 is set, and we expected it.
-                        * Clear it and note the Port0 packet by setting
-                        * chk0rcv; this probably only happens if a Port0
-                        * pkt arrives at _just_ the wrong time.
-                        */
-                       to_clear |= (1 << IPATH_GPIO_PORT0_BIT);
-                       gpiostatus &= ~(1 << IPATH_GPIO_PORT0_BIT);
-                       chk0rcv = 1;
-               }
-               if (gpiostatus) {
-                       /*
-                        * Some unexpected bits remain. If they could have
-                        * caused the interrupt, complain and clear.
-                        * To avoid repetition of this condition, also clear
-                        * the mask. It is almost certainly due to error.
-                        */
-                       const u32 mask = (u32) dd->ipath_gpio_mask;
-
-                       if (mask & gpiostatus) {
-                               ipath_dbg("Unexpected GPIO IRQ bits %x\n",
-                                 gpiostatus & mask);
-                               to_clear |= (gpiostatus & mask);
-                               dd->ipath_gpio_mask &= ~(gpiostatus & mask);
-                               ipath_write_kreg(dd,
-                                       dd->ipath_kregs->kr_gpio_mask,
-                                       dd->ipath_gpio_mask);
-                       }
-               }
-               if (to_clear) {
-                       ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_clear,
-                                       (u64) to_clear);
-               }
-       }
-
-       /*
-        * Clear the interrupt bits we found set, unless they are receive
-        * related, in which case we already cleared them above, and don't
-        * want to clear them again, because we might lose an interrupt.
-        * Clear it early, so we "know" the chip will have seen this by
-        * the time we process the queue, and will re-interrupt if necessary.
-        * The processor itself won't take the interrupt again until we return.
-        */
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, istat);
-
-       /*
-        * Handle kernel receive queues before checking for pio buffers
-        * available since receives can overflow; piobuf waiters can afford
-        * a few extra cycles, since they were waiting anyway, and users
-        * waiting for receive are at the bottom.
-        */
-       kportrbits = (1ULL << dd->ipath_i_rcvavail_shift) |
-               (1ULL << dd->ipath_i_rcvurg_shift);
-       if (chk0rcv || (istat & kportrbits)) {
-               istat &= ~kportrbits;
-               ipath_kreceive(dd->ipath_pd[0]);
-       }
-
-       if (istat & ((dd->ipath_i_rcvavail_mask << dd->ipath_i_rcvavail_shift) |
-                    (dd->ipath_i_rcvurg_mask << dd->ipath_i_rcvurg_shift)))
-               handle_urcv(dd, istat);
-
-       if (istat & (INFINIPATH_I_SDMAINT | INFINIPATH_I_SDMADISABLED))
-               handle_sdma_intr(dd, istat);
-
-       if (istat & INFINIPATH_I_SPIOBUFAVAIL) {
-               unsigned long flags;
-
-               spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-               dd->ipath_sendctrl &= ~INFINIPATH_S_PIOINTBUFAVAIL;
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
-                                dd->ipath_sendctrl);
-               ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-               spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-
-               /* always process; sdma verbs uses PIO for acks and VL15  */
-               handle_layer_pioavail(dd);
-       }
-
-       ret = IRQ_HANDLED;
-
-bail:
-       return ret;
-}
diff --git a/drivers/infiniband/hw/ipath/ipath_kernel.h b/drivers/infiniband/hw/ipath/ipath_kernel.h
deleted file mode 100644 (file)
index f0f9471..0000000
+++ /dev/null
@@ -1,1373 +0,0 @@
-#ifndef _IPATH_KERNEL_H
-#define _IPATH_KERNEL_H
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/*
- * This header file is the base header file for infinipath kernel code.
- * ipath_user.h serves a similar purpose for user code.
- */
-
-#include <linux/interrupt.h>
-#include <linux/pci.h>
-#include <linux/dma-mapping.h>
-#include <linux/mutex.h>
-#include <linux/list.h>
-#include <linux/scatterlist.h>
-#include <asm/io.h>
-#include <rdma/ib_verbs.h>
-
-#include "ipath_common.h"
-#include "ipath_debug.h"
-#include "ipath_registers.h"
-
-/* only s/w major version of InfiniPath we can handle */
-#define IPATH_CHIP_VERS_MAJ 2U
-
-/* don't care about this except printing */
-#define IPATH_CHIP_VERS_MIN 0U
-
-/* temporary, maybe always */
-extern struct infinipath_stats ipath_stats;
-
-#define IPATH_CHIP_SWVERSION IPATH_CHIP_VERS_MAJ
-/*
- * First-cut criterion for "device is active" is
- * two thousand dwords combined Tx, Rx traffic per
- * 5-second interval. SMA packets are 64 dwords,
- * and occur "a few per second", presumably each way.
- */
-#define IPATH_TRAFFIC_ACTIVE_THRESHOLD (2000)
-/*
- * Struct used to indicate which errors are logged in each of the
- * error-counters that are logged to EEPROM. A counter is incremented
- * _once_ (saturating at 255) for each event with any bits set in
- * the error or hwerror register masks below.
- */
-#define IPATH_EEP_LOG_CNT (4)
-struct ipath_eep_log_mask {
-       u64 errs_to_log;
-       u64 hwerrs_to_log;
-};
-
-struct ipath_portdata {
-       void **port_rcvegrbuf;
-       dma_addr_t *port_rcvegrbuf_phys;
-       /* rcvhdrq base, needs mmap before useful */
-       void *port_rcvhdrq;
-       /* kernel virtual address where hdrqtail is updated */
-       void *port_rcvhdrtail_kvaddr;
-       /*
-        * temp buffer for expected send setup, allocated at open, instead
-        * of each setup call
-        */
-       void *port_tid_pg_list;
-       /* when waiting for rcv or pioavail */
-       wait_queue_head_t port_wait;
-       /*
-        * rcvegr bufs base, physical; must fit in 44 bits so that
-        * mmap64 from 32 bit programs works
-        */
-       dma_addr_t port_rcvegr_phys;
-       /* mmap of hdrq, must fit in 44 bits */
-       dma_addr_t port_rcvhdrq_phys;
-       dma_addr_t port_rcvhdrqtailaddr_phys;
-       /*
-        * number of opens (including slave subports) on this instance
-        * (ignoring forks, dup, etc. for now)
-        */
-       int port_cnt;
-       /*
-        * how much space to leave at start of eager TID entries for
-        * protocol use, on each TID
-        */
-       /* instead of calculating it */
-       unsigned port_port;
-       /* non-zero if port is being shared. */
-       u16 port_subport_cnt;
-       /* non-zero if port is being shared. */
-       u16 port_subport_id;
-       /* number of pio bufs for this port (all procs, if shared) */
-       u32 port_piocnt;
-       /* first pio buffer for this port */
-       u32 port_pio_base;
-       /* chip offset of PIO buffers for this port */
-       u32 port_piobufs;
-       /* how many alloc_pages() chunks in port_rcvegrbuf_pages */
-       u32 port_rcvegrbuf_chunks;
-       /* how many egrbufs per chunk */
-       u32 port_rcvegrbufs_perchunk;
-       /* order for port_rcvegrbuf_pages */
-       size_t port_rcvegrbuf_size;
-       /* rcvhdrq size (for freeing) */
-       size_t port_rcvhdrq_size;
-       /* next expected TID to check when looking for free */
-       u32 port_tidcursor;
-       /* port state flags (e.g. IPATH_PORT_WAITING_RCV) */
-       unsigned long port_flag;
-       /* what happened */
-       unsigned long int_flag;
-       /* WAIT_RCV that timed out, no interrupt */
-       u32 port_rcvwait_to;
-       /* WAIT_PIO that timed out, no interrupt */
-       u32 port_piowait_to;
-       /* WAIT_RCV already happened, no wait */
-       u32 port_rcvnowait;
-       /* WAIT_PIO already happened, no wait */
-       u32 port_pionowait;
-       /* total number of rcvhdrqfull errors */
-       u32 port_hdrqfull;
-       /*
-        * Used to suppress multiple instances of same
-        * port staying stuck at same point.
-        */
-       u32 port_lastrcvhdrqtail;
-       /* saved total number of rcvhdrqfull errors for poll edge trigger */
-       u32 port_hdrqfull_poll;
-       /* total number of polled urgent packets */
-       u32 port_urgent;
-       /* saved total number of polled urgent packets for poll edge trigger */
-       u32 port_urgent_poll;
-       /* pid of process using this port */
-       struct pid *port_pid;
-       struct pid *port_subpid[INFINIPATH_MAX_SUBPORT];
-       /* same size as task_struct .comm[] */
-       char port_comm[16];
-       /* pkeys set by this use of this port */
-       u16 port_pkeys[4];
-       /* so file ops can get at unit */
-       struct ipath_devdata *port_dd;
-       /* A page of memory for rcvhdrhead, rcvegrhead, rcvegrtail * N */
-       void *subport_uregbase;
-       /* An array of pages for the eager receive buffers * N */
-       void *subport_rcvegrbuf;
-       /* An array of pages for the eager header queue entries * N */
-       void *subport_rcvhdr_base;
-       /* The version of the library which opened this port */
-       u32 userversion;
-       /* Bitmask of active slaves */
-       u32 active_slaves;
-       /* Type of packets or conditions we want to poll for */
-       u16 poll_type;
-       /* port rcvhdrq head offset */
-       u32 port_head;
-       /* receive packet sequence counter */
-       u32 port_seq_cnt;
-};
-
-struct sk_buff;
-struct ipath_sge_state;
-struct ipath_verbs_txreq;
-
-/*
- * control information for layered drivers
- */
-struct _ipath_layer {
-       void *l_arg;
-};
-
-struct ipath_skbinfo {
-       struct sk_buff *skb;
-       dma_addr_t phys;
-};
-
-struct ipath_sdma_txreq {
-       int                 flags;
-       int                 sg_count;
-       union {
-               struct scatterlist *sg;
-               void *map_addr;
-       };
-       void              (*callback)(void *, int);
-       void               *callback_cookie;
-       int                 callback_status;
-       u16                 start_idx;  /* sdma private */
-       u16                 next_descq_idx;  /* sdma private */
-       struct list_head    list;       /* sdma private */
-};
-
-struct ipath_sdma_desc {
-       __le64 qw[2];
-};
-
-#define IPATH_SDMA_TXREQ_F_USELARGEBUF  0x1
-#define IPATH_SDMA_TXREQ_F_HEADTOHOST   0x2
-#define IPATH_SDMA_TXREQ_F_INTREQ       0x4
-#define IPATH_SDMA_TXREQ_F_FREEBUF      0x8
-#define IPATH_SDMA_TXREQ_F_FREEDESC     0x10
-#define IPATH_SDMA_TXREQ_F_VL15         0x20
-
-#define IPATH_SDMA_TXREQ_S_OK        0
-#define IPATH_SDMA_TXREQ_S_SENDERROR 1
-#define IPATH_SDMA_TXREQ_S_ABORTED   2
-#define IPATH_SDMA_TXREQ_S_SHUTDOWN  3
-
-#define IPATH_SDMA_STATUS_SCORE_BOARD_DRAIN_IN_PROG    (1ull << 63)
-#define IPATH_SDMA_STATUS_ABORT_IN_PROG                        (1ull << 62)
-#define IPATH_SDMA_STATUS_INTERNAL_SDMA_ENABLE         (1ull << 61)
-#define IPATH_SDMA_STATUS_SCB_EMPTY                    (1ull << 30)
-
-/* max dwords in small buffer packet */
-#define IPATH_SMALLBUF_DWORDS (dd->ipath_piosize2k >> 2)
-
-/*
- * Possible IB config parameters for ipath_f_get/set_ib_cfg()
- */
-#define IPATH_IB_CFG_LIDLMC 0 /* Get/set LID (LS16b) and Mask (MS16b) */
-#define IPATH_IB_CFG_HRTBT 1 /* Get/set Heartbeat off/enable/auto */
-#define IPATH_IB_HRTBT_ON 3 /* Heartbeat enabled, sent every 100msec */
-#define IPATH_IB_HRTBT_OFF 0 /* Heartbeat off */
-#define IPATH_IB_CFG_LWID_ENB 2 /* Get/set allowed Link-width */
-#define IPATH_IB_CFG_LWID 3 /* Get currently active Link-width */
-#define IPATH_IB_CFG_SPD_ENB 4 /* Get/set allowed Link speeds */
-#define IPATH_IB_CFG_SPD 5 /* Get current Link spd */
-#define IPATH_IB_CFG_RXPOL_ENB 6 /* Get/set Auto-RX-polarity enable */
-#define IPATH_IB_CFG_LREV_ENB 7 /* Get/set Auto-Lane-reversal enable */
-#define IPATH_IB_CFG_LINKLATENCY 8 /* Get link latency */
-
-
-struct ipath_devdata {
-       struct list_head ipath_list;
-
-       struct ipath_kregs const *ipath_kregs;
-       struct ipath_cregs const *ipath_cregs;
-
-       /* mem-mapped pointer to base of chip regs */
-       u64 __iomem *ipath_kregbase;
-       /* end of mem-mapped chip space; range checking */
-       u64 __iomem *ipath_kregend;
-       /* physical address of chip for io_remap, etc. */
-       unsigned long ipath_physaddr;
-       /* base of memory alloced for ipath_kregbase, for free */
-       u64 *ipath_kregalloc;
-       /* ipath_cfgports pointers */
-       struct ipath_portdata **ipath_pd;
-       /* sk_buffs used by port 0 eager receive queue */
-       struct ipath_skbinfo *ipath_port0_skbinfo;
-       /* kvirt address of 1st 2k pio buffer */
-       void __iomem *ipath_pio2kbase;
-       /* kvirt address of 1st 4k pio buffer */
-       void __iomem *ipath_pio4kbase;
-       /*
-        * points to area where PIOavail registers will be DMA'ed.
-        * Has to be on a page of its own, because the page will be
-        * mapped into user program space.  This copy is *ONLY* ever
-        * written by DMA, not by the driver!  Need a copy per device
-        * when we get to multiple devices
-        */
-       volatile __le64 *ipath_pioavailregs_dma;
-       /* physical address where updates occur */
-       dma_addr_t ipath_pioavailregs_phys;
-       struct _ipath_layer ipath_layer;
-       /* setup intr */
-       int (*ipath_f_intrsetup)(struct ipath_devdata *);
-       /* fallback to alternate interrupt type if possible */
-       int (*ipath_f_intr_fallback)(struct ipath_devdata *);
-       /* setup on-chip bus config */
-       int (*ipath_f_bus)(struct ipath_devdata *, struct pci_dev *);
-       /* hard reset chip */
-       int (*ipath_f_reset)(struct ipath_devdata *);
-       int (*ipath_f_get_boardname)(struct ipath_devdata *, char *,
-                                    size_t);
-       void (*ipath_f_init_hwerrors)(struct ipath_devdata *);
-       void (*ipath_f_handle_hwerrors)(struct ipath_devdata *, char *,
-                                       size_t);
-       void (*ipath_f_quiet_serdes)(struct ipath_devdata *);
-       int (*ipath_f_bringup_serdes)(struct ipath_devdata *);
-       int (*ipath_f_early_init)(struct ipath_devdata *);
-       void (*ipath_f_clear_tids)(struct ipath_devdata *, unsigned);
-       void (*ipath_f_put_tid)(struct ipath_devdata *, u64 __iomem*,
-                               u32, unsigned long);
-       void (*ipath_f_tidtemplate)(struct ipath_devdata *);
-       void (*ipath_f_cleanup)(struct ipath_devdata *);
-       void (*ipath_f_setextled)(struct ipath_devdata *, u64, u64);
-       /* fill out chip-specific fields */
-       int (*ipath_f_get_base_info)(struct ipath_portdata *, void *);
-       /* free irq */
-       void (*ipath_f_free_irq)(struct ipath_devdata *);
-       struct ipath_message_header *(*ipath_f_get_msgheader)
-                                       (struct ipath_devdata *, __le32 *);
-       void (*ipath_f_config_ports)(struct ipath_devdata *, ushort);
-       int (*ipath_f_get_ib_cfg)(struct ipath_devdata *, int);
-       int (*ipath_f_set_ib_cfg)(struct ipath_devdata *, int, u32);
-       void (*ipath_f_config_jint)(struct ipath_devdata *, u16 , u16);
-       void (*ipath_f_read_counters)(struct ipath_devdata *,
-                                       struct infinipath_counters *);
-       void (*ipath_f_xgxs_reset)(struct ipath_devdata *);
-       /* per chip actions needed for IB Link up/down changes */
-       int (*ipath_f_ib_updown)(struct ipath_devdata *, int, u64);
-
-       unsigned ipath_lastegr_idx;
-       struct ipath_ibdev *verbs_dev;
-       struct timer_list verbs_timer;
-       /* total dwords sent (summed from counter) */
-       u64 ipath_sword;
-       /* total dwords rcvd (summed from counter) */
-       u64 ipath_rword;
-       /* total packets sent (summed from counter) */
-       u64 ipath_spkts;
-       /* total packets rcvd (summed from counter) */
-       u64 ipath_rpkts;
-       /* ipath_statusp initially points to this. */
-       u64 _ipath_status;
-       /* GUID for this interface, in network order */
-       __be64 ipath_guid;
-       /*
-        * aggregate of error bits reported since last cleared, for
-        * limiting of error reporting
-        */
-       ipath_err_t ipath_lasterror;
-       /*
-        * aggregate of error bits reported since last cleared, for
-        * limiting of hwerror reporting
-        */
-       ipath_err_t ipath_lasthwerror;
-       /* errors masked because they occur too fast */
-       ipath_err_t ipath_maskederrs;
-       u64 ipath_lastlinkrecov; /* link recoveries at last ACTIVE */
-       /* these 5 fields are used to establish deltas for IB Symbol
-        * errors and linkrecovery errors. They can be reported on
-        * some chips during link negotiation prior to INIT, and with
-        * DDR when faking DDR negotiations with non-IBTA switches.
-        * The chip counters are adjusted at driver unload if there is
-        * a non-zero delta.
-        */
-       u64 ibdeltainprog;
-       u64 ibsymdelta;
-       u64 ibsymsnap;
-       u64 iblnkerrdelta;
-       u64 iblnkerrsnap;
-
-       /* time in jiffies at which to re-enable maskederrs */
-       unsigned long ipath_unmasktime;
-       /* count of egrfull errors, combined for all ports */
-       u64 ipath_last_tidfull;
-       /* for ipath_qcheck() */
-       u64 ipath_lastport0rcv_cnt;
-       /* template for writing TIDs  */
-       u64 ipath_tidtemplate;
-       /* value to write to free TIDs */
-       u64 ipath_tidinvalid;
-       /* IBA6120 rcv interrupt setup */
-       u64 ipath_rhdrhead_intr_off;
-
-       /* size of memory at ipath_kregbase */
-       u32 ipath_kregsize;
-       /* number of registers used for pioavail */
-       u32 ipath_pioavregs;
-       /* IPATH_POLL, etc. */
-       u32 ipath_flags;
-       /* ipath_flags driver is waiting for */
-       u32 ipath_state_wanted;
-       /* last buffer for user use, first buf for kernel use is this
-        * index. */
-       u32 ipath_lastport_piobuf;
-       /* is a stats timer active */
-       u32 ipath_stats_timer_active;
-       /* number of interrupts for this device -- saturates... */
-       u32 ipath_int_counter;
-       /* dwords sent read from counter */
-       u32 ipath_lastsword;
-       /* dwords received read from counter */
-       u32 ipath_lastrword;
-       /* sent packets read from counter */
-       u32 ipath_lastspkts;
-       /* received packets read from counter */
-       u32 ipath_lastrpkts;
-       /* pio bufs allocated per port */
-       u32 ipath_pbufsport;
-       /* if remainder on bufs/port, ports < extrabuf get 1 extra */
-       u32 ipath_ports_extrabuf;
-       u32 ipath_pioupd_thresh; /* update threshold, some chips */
-       /*
-        * number of ports configured as max; zero is set to number chip
-        * supports, less gives more pio bufs/port, etc.
-        */
-       u32 ipath_cfgports;
-       /* count of port 0 hdrqfull errors */
-       u32 ipath_p0_hdrqfull;
-       /* port 0 number of receive eager buffers */
-       u32 ipath_p0_rcvegrcnt;
-
-       /*
-        * index of last piobuffer we used.  Speeds up searching, by
-        * starting at this point.  Doesn't matter if multiple CPUs use
-        * and update it; the last update is the only write that matters.
-        * Whenever it wraps, we update shadow copies.  Need a copy per
-        * device when we get to multiple devices
-        */
-       u32 ipath_lastpioindex;
-       u32 ipath_lastpioindexl;
-       /* max length of freezemsg */
-       u32 ipath_freezelen;
-       /*
-        * consecutive times we wanted a PIO buffer but were unable to
-        * get one
-        */
-       u32 ipath_consec_nopiobuf;
-       /*
-        * hint that we should update ipath_pioavailshadow before
-        * looking for a PIO buffer
-        */
-       u32 ipath_upd_pio_shadow;
-       /* so we can rewrite it after a chip reset */
-       u32 ipath_pcibar0;
-       /* so we can rewrite it after a chip reset */
-       u32 ipath_pcibar1;
-       u32 ipath_x1_fix_tries;
-       u32 ipath_autoneg_tries;
-       u32 serdes_first_init_done;
-
-       struct ipath_relock {
-               atomic_t ipath_relock_timer_active;
-               struct timer_list ipath_relock_timer;
-               unsigned int ipath_relock_interval; /* in jiffies */
-       } ipath_relock_singleton;
-
-       /* interrupt number */
-       int ipath_irq;
-       /* HT/PCI Vendor ID (here for NodeInfo) */
-       u16 ipath_vendorid;
-       /* HT/PCI Device ID (here for NodeInfo) */
-       u16 ipath_deviceid;
-       /* offset in HT config space of slave/primary interface block */
-       u8 ipath_ht_slave_off;
-       /* for write combining settings */
-       int wc_cookie;
-       /* ref count for each pkey */
-       atomic_t ipath_pkeyrefs[4];
-       /* shadow copy of struct page *'s for exp tid pages */
-       struct page **ipath_pageshadow;
-       /* shadow copy of dma handles for exp tid pages */
-       dma_addr_t *ipath_physshadow;
-       u64 __iomem *ipath_egrtidbase;
-       /* lock to workaround chip bug 9437 and others */
-       spinlock_t ipath_kernel_tid_lock;
-       spinlock_t ipath_user_tid_lock;
-       spinlock_t ipath_sendctrl_lock;
-       /* around ipath_pd and (user ports) port_cnt use (intr vs free) */
-       spinlock_t ipath_uctxt_lock;
-
-       /*
-        * IPATH_STATUS_*,
-        * this address is mapped readonly into user processes so they can
-        * get status cheaply, whenever they want.
-        */
-       u64 *ipath_statusp;
-       /* freeze msg if hw error put chip in freeze */
-       char *ipath_freezemsg;
-       /* pci access data structure */
-       struct pci_dev *pcidev;
-       struct cdev *user_cdev;
-       struct cdev *diag_cdev;
-       struct device *user_dev;
-       struct device *diag_dev;
-       /* timer used to prevent stats overflow, error throttling, etc. */
-       struct timer_list ipath_stats_timer;
-       /* timer to verify interrupts work, and fallback if possible */
-       struct timer_list ipath_intrchk_timer;
-       void *ipath_dummy_hdrq; /* used after port close */
-       dma_addr_t ipath_dummy_hdrq_phys;
-
-       /* SendDMA related entries */
-       spinlock_t            ipath_sdma_lock;
-       unsigned long         ipath_sdma_status;
-       unsigned long         ipath_sdma_abort_jiffies;
-       unsigned long         ipath_sdma_abort_intr_timeout;
-       unsigned long         ipath_sdma_buf_jiffies;
-       struct ipath_sdma_desc *ipath_sdma_descq;
-       u64                   ipath_sdma_descq_added;
-       u64                   ipath_sdma_descq_removed;
-       int                   ipath_sdma_desc_nreserved;
-       u16                   ipath_sdma_descq_cnt;
-       u16                   ipath_sdma_descq_tail;
-       u16                   ipath_sdma_descq_head;
-       u16                   ipath_sdma_next_intr;
-       u16                   ipath_sdma_reset_wait;
-       u8                    ipath_sdma_generation;
-       struct tasklet_struct ipath_sdma_abort_task;
-       struct tasklet_struct ipath_sdma_notify_task;
-       struct list_head      ipath_sdma_activelist;
-       struct list_head      ipath_sdma_notifylist;
-       atomic_t              ipath_sdma_vl15_count;
-       struct timer_list     ipath_sdma_vl15_timer;
-
-       dma_addr_t       ipath_sdma_descq_phys;
-       volatile __le64 *ipath_sdma_head_dma;
-       dma_addr_t       ipath_sdma_head_phys;
-
-       unsigned long ipath_ureg_align; /* user register alignment */
-
-       struct delayed_work ipath_autoneg_work;
-       wait_queue_head_t ipath_autoneg_wait;
-
-       /* HoL blocking / user app forward-progress state */
-       unsigned          ipath_hol_state;
-       unsigned          ipath_hol_next;
-       struct timer_list ipath_hol_timer;
-
-       /*
-        * Shadow copies of registers; size indicates read access size.
-        * Most of them are readonly, but some are write-only registers,
-        * where we manipulate the bits in the shadow copy, and then write
-        * the shadow copy to infinipath.
-        *
-        * We deliberately make most of these 32 bits, since they have
-        * restricted range.  For any that we read, we want to generate 32
-        * bit accesses, since Opteron will generate 2 separate 32 bit HT
-        * transactions for a 64 bit read, and we want to avoid unnecessary
-        * HT transactions.
-        */
-
-       /* This is the 64 bit group */
-
-       /*
-        * shadow of pioavail, check to be sure it's large enough at
-        * init time.
-        */
-       unsigned long ipath_pioavailshadow[8];
-       /* bitmap of send buffers available for the kernel to use with PIO. */
-       unsigned long ipath_pioavailkernel[8];
-       /* shadow of kr_gpio_out, for rmw ops */
-       u64 ipath_gpio_out;
-       /* shadow the gpio mask register */
-       u64 ipath_gpio_mask;
-       /* shadow the gpio output enable, etc... */
-       u64 ipath_extctrl;
-       /* kr_revision shadow */
-       u64 ipath_revision;
-       /*
-        * shadow of ibcctrl, for interrupt handling of link changes,
-        * etc.
-        */
-       u64 ipath_ibcctrl;
-       /*
-        * last ibcstatus, to suppress "duplicate" status change messages,
-        * mostly from 2 to 3
-        */
-       u64 ipath_lastibcstat;
-       /* hwerrmask shadow */
-       ipath_err_t ipath_hwerrmask;
-       ipath_err_t ipath_errormask; /* errormask shadow */
-       /* interrupt config reg shadow */
-       u64 ipath_intconfig;
-       /* kr_sendpiobufbase value */
-       u64 ipath_piobufbase;
-       /* kr_ibcddrctrl shadow */
-       u64 ipath_ibcddrctrl;
-
-       /* these are the "32 bit" regs */
-
-       /*
-        * number of GUIDs in the flash for this interface; may need some
-        * rethinking for setting on other ifaces
-        */
-       u32 ipath_nguid;
-       /*
-        * the following two are 32-bit bitmasks, but {test,clear,set}_bit
-        * all expect bit fields to be "unsigned long"
-        */
-       /* shadow kr_rcvctrl */
-       unsigned long ipath_rcvctrl;
-       /* shadow kr_sendctrl */
-       unsigned long ipath_sendctrl;
-       /* to not count armlaunch after cancel */
-       unsigned long ipath_lastcancel;
-       /* count cases where special trigger was needed (double write) */
-       unsigned long ipath_spectriggerhit;
-
-       /* value we put in kr_rcvhdrcnt */
-       u32 ipath_rcvhdrcnt;
-       /* value we put in kr_rcvhdrsize */
-       u32 ipath_rcvhdrsize;
-       /* value we put in kr_rcvhdrentsize */
-       u32 ipath_rcvhdrentsize;
-       /* offset of last entry in rcvhdrq */
-       u32 ipath_hdrqlast;
-       /* kr_portcnt value */
-       u32 ipath_portcnt;
-       /* kr_pagealign value */
-       u32 ipath_palign;
-       /* number of "2KB" PIO buffers */
-       u32 ipath_piobcnt2k;
-       /* size in bytes of "2KB" PIO buffers */
-       u32 ipath_piosize2k;
-       /* number of "4KB" PIO buffers */
-       u32 ipath_piobcnt4k;
-       /* size in bytes of "4KB" PIO buffers */
-       u32 ipath_piosize4k;
-       u32 ipath_pioreserved; /* reserved for special in-kernel use */
-       /* kr_rcvegrbase value */
-       u32 ipath_rcvegrbase;
-       /* kr_rcvegrcnt value */
-       u32 ipath_rcvegrcnt;
-       /* kr_rcvtidbase value */
-       u32 ipath_rcvtidbase;
-       /* kr_rcvtidcnt value */
-       u32 ipath_rcvtidcnt;
-       /* kr_sendregbase */
-       u32 ipath_sregbase;
-       /* kr_userregbase */
-       u32 ipath_uregbase;
-       /* kr_counterregbase */
-       u32 ipath_cregbase;
-       /* shadow the control register contents */
-       u32 ipath_control;
-       /* PCI revision register (HTC rev on FPGA) */
-       u32 ipath_pcirev;
-
-       /* chip address space used by 4k pio buffers */
-       u32 ipath_4kalign;
-       /* The MTU programmed for this unit */
-       u32 ipath_ibmtu;
-       /*
-        * The max size IB packet, including IB headers, that we can send.
-        * Starts same as ipath_piosize, but is affected when ibmtu is
-        * changed, or by size of eager buffers
-        */
-       u32 ipath_ibmaxlen;
-       /*
-        * ibmaxlen at init time, limited by chip and by receive buffer
-        * size.  Not changed after init.
-        */
-       u32 ipath_init_ibmaxlen;
-       /* size of each rcvegrbuffer */
-       u32 ipath_rcvegrbufsize;
-       /* localbus width (1, 2,4,8,16,32) from config space  */
-       u32 ipath_lbus_width;
-       /* localbus speed (HT: 200,400,800,1000; PCIe 2500) */
-       u32 ipath_lbus_speed;
-       /*
-        * number of sequential ibcstatus changes for polling active/quiet
-        * (i.e., link not coming up).
-        */
-       u32 ipath_ibpollcnt;
-       /* low and high portions of MSI capability/vector */
-       u32 ipath_msi_lo;
-       /* saved after PCIe init for restore after reset */
-       u32 ipath_msi_hi;
-       /* MSI data (vector) saved for restore */
-       u16 ipath_msi_data;
-       /* MLID programmed for this instance */
-       u16 ipath_mlid;
-       /* LID programmed for this instance */
-       u16 ipath_lid;
-       /* list of pkeys programmed; 0 if not set */
-       u16 ipath_pkeys[4];
-       /*
-        * ASCII serial number, from flash; large enough for the original
-        * all-digit strings and the longer QLogic serial number format
-        */
-       u8 ipath_serial[16];
-       /* human readable board version */
-       u8 ipath_boardversion[96];
-       u8 ipath_lbus_info[32]; /* human readable localbus info */
-       /* chip major rev, from ipath_revision */
-       u8 ipath_majrev;
-       /* chip minor rev, from ipath_revision */
-       u8 ipath_minrev;
-       /* board rev, from ipath_revision */
-       u8 ipath_boardrev;
-       /* saved for restore after reset */
-       u8 ipath_pci_cacheline;
-       /* LID mask control */
-       u8 ipath_lmc;
-       /* link width supported */
-       u8 ipath_link_width_supported;
-       /* link speed supported */
-       u8 ipath_link_speed_supported;
-       u8 ipath_link_width_enabled;
-       u8 ipath_link_speed_enabled;
-       u8 ipath_link_width_active;
-       u8 ipath_link_speed_active;
-       /* Rx Polarity inversion (compensate for ~tx on partner) */
-       u8 ipath_rx_pol_inv;
-
-       u8 ipath_r_portenable_shift;
-       u8 ipath_r_intravail_shift;
-       u8 ipath_r_tailupd_shift;
-       u8 ipath_r_portcfg_shift;
-
-       /* unit # of this chip, if present */
-       int ipath_unit;
-
-       /* local link integrity counter */
-       u32 ipath_lli_counter;
-       /* local link integrity errors */
-       u32 ipath_lli_errors;
-       /*
-        * Above counts only cases where _successive_ LocalLinkIntegrity
-        * errors were seen in the receive headers of kern-packets.
-        * Below are the three (monotonically increasing) counters
-        * maintained via GPIO interrupts on iba6120-rev2.
-        */
-       u32 ipath_rxfc_unsupvl_errs;
-       u32 ipath_overrun_thresh_errs;
-       u32 ipath_lli_errs;
-
-       /*
-        * Not all devices managed by a driver instance are the same
-        * type, so these fields must be per-device.
-        */
-       u64 ipath_i_bitsextant;
-       ipath_err_t ipath_e_bitsextant;
-       ipath_err_t ipath_hwe_bitsextant;
-
-       /*
-        * Below should be computable from number of ports,
-        * since they are never modified.
-        */
-       u64 ipath_i_rcvavail_mask;
-       u64 ipath_i_rcvurg_mask;
-       u16 ipath_i_rcvurg_shift;
-       u16 ipath_i_rcvavail_shift;
-
-       /*
-        * Register bits for selecting i2c direction and values, used for
-        * I2C serial flash.
-        */
-       u8 ipath_gpio_sda_num;
-       u8 ipath_gpio_scl_num;
-       u8 ipath_i2c_chain_type;
-       u64 ipath_gpio_sda;
-       u64 ipath_gpio_scl;
-
-       /* lock for doing RMW of shadows/regs for ExtCtrl and GPIO */
-       spinlock_t ipath_gpio_lock;
-
-       /*
-        * IB link and linktraining states and masks that vary per chip in
-        * some way.  Set at init, to avoid each IB status change interrupt
-        */
-       u8 ibcs_ls_shift;
-       u8 ibcs_lts_mask;
-       u32 ibcs_mask;
-       u32 ib_init;
-       u32 ib_arm;
-       u32 ib_active;
-
-       u16 ipath_rhf_offset; /* offset of RHF within receive header entry */
-
-       /*
-        * shift/mask for linkcmd, linkinitcmd, maxpktlen in ibccontrol
-        * reg. Changes for IBA7220
-        */
-       u8 ibcc_lic_mask; /* LinkInitCmd */
-       u8 ibcc_lc_shift; /* LinkCmd */
-       u8 ibcc_mpl_shift; /* Maxpktlen */
-
-       u8 delay_mult;
-
-       /* used to override LED behavior */
-       u8 ipath_led_override;  /* Substituted for normal value, if non-zero */
-       u16 ipath_led_override_timeoff; /* delta to next timer event */
-       u8 ipath_led_override_vals[2]; /* Alternates per blink-frame */
-       u8 ipath_led_override_phase; /* Just counts, LSB picks from vals[] */
-       atomic_t ipath_led_override_timer_active;
-       /* Used to flash LEDs in override mode */
-       struct timer_list ipath_led_override_timer;
-
-       /* Support (including locks) for EEPROM logging of errors and time */
-       /* control access to actual counters, timer */
-       spinlock_t ipath_eep_st_lock;
-       /* control high-level access to EEPROM */
-       struct mutex ipath_eep_lock;
-       /* Below inc'd by ipath_snap_cntrs(), locked by ipath_eep_st_lock */
-       uint64_t ipath_traffic_wds;
-       /* active time is kept in seconds, but logged in hours */
-       atomic_t ipath_active_time;
-       /* Below are nominal shadow of EEPROM, new since last EEPROM update */
-       uint8_t ipath_eep_st_errs[IPATH_EEP_LOG_CNT];
-       uint8_t ipath_eep_st_new_errs[IPATH_EEP_LOG_CNT];
-       uint16_t ipath_eep_hrs;
-       /*
-        * masks for which bits of errs, hwerrs that cause
-        * each of the counters to increment.
-        */
-       struct ipath_eep_log_mask ipath_eep_st_masks[IPATH_EEP_LOG_CNT];
-
-       /* interrupt mitigation reload register info */
-       u16 ipath_jint_idle_ticks;      /* idle clock ticks */
-       u16 ipath_jint_max_packets;     /* max packets across all ports */
-
-       /*
-        * lock for access to SerDes, and flags to sequence preset
-        * versus steady-state. 7220-only at the moment.
-        */
-       spinlock_t ipath_sdepb_lock;
-       u8 ipath_presets_needed; /* Set if presets to be restored next DOWN */
-};
-
-/* ipath_hol_state values (stopping/starting user proc, send flushing) */
-#define IPATH_HOL_UP       0
-#define IPATH_HOL_DOWN     1
-/* ipath_hol_next toggle values, used when hol_state IPATH_HOL_DOWN */
-#define IPATH_HOL_DOWNSTOP 0
-#define IPATH_HOL_DOWNCONT 1
-
-/* bit positions for sdma_status */
-#define IPATH_SDMA_ABORTING  0
-#define IPATH_SDMA_DISARMED  1
-#define IPATH_SDMA_DISABLED  2
-#define IPATH_SDMA_LAYERBUF  3
-#define IPATH_SDMA_RUNNING  30
-#define IPATH_SDMA_SHUTDOWN 31
-
-/* bit combinations that correspond to abort states */
-#define IPATH_SDMA_ABORT_NONE 0
-#define IPATH_SDMA_ABORT_ABORTING (1UL << IPATH_SDMA_ABORTING)
-#define IPATH_SDMA_ABORT_DISARMED ((1UL << IPATH_SDMA_ABORTING) | \
-       (1UL << IPATH_SDMA_DISARMED))
-#define IPATH_SDMA_ABORT_DISABLED ((1UL << IPATH_SDMA_ABORTING) | \
-       (1UL << IPATH_SDMA_DISABLED))
-#define IPATH_SDMA_ABORT_ABORTED ((1UL << IPATH_SDMA_ABORTING) | \
-       (1UL << IPATH_SDMA_DISARMED) | (1UL << IPATH_SDMA_DISABLED))
-#define IPATH_SDMA_ABORT_MASK ((1UL<<IPATH_SDMA_ABORTING) | \
-       (1UL << IPATH_SDMA_DISARMED) | (1UL << IPATH_SDMA_DISABLED))
-
-#define IPATH_SDMA_BUF_NONE 0
-#define IPATH_SDMA_BUF_MASK (1UL<<IPATH_SDMA_LAYERBUF)
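The abort-state combinations above are compared as a group against the low bits of the per-device SDMA status word rather than tested one bit at a time. A minimal sketch of that usage follows (not part of the removed file; the unsigned long ipath_sdma_status field is assumed to live elsewhere in struct ipath_devdata):

/* Sketch only: true while an SDMA abort sequence is still in progress. */
static inline int example_sdma_abort_in_progress(struct ipath_devdata *dd)
{
	unsigned long s = dd->ipath_sdma_status & IPATH_SDMA_ABORT_MASK;

	return s != IPATH_SDMA_ABORT_NONE && s != IPATH_SDMA_ABORT_ABORTED;
}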
-
-/* Private data for file operations */
-struct ipath_filedata {
-       struct ipath_portdata *pd;
-       unsigned subport;
-       unsigned tidcursor;
-       struct ipath_user_sdma_queue *pq;
-};
-extern struct list_head ipath_dev_list;
-extern spinlock_t ipath_devs_lock;
-extern struct ipath_devdata *ipath_lookup(int unit);
-
-int ipath_init_chip(struct ipath_devdata *, int);
-int ipath_enable_wc(struct ipath_devdata *dd);
-void ipath_disable_wc(struct ipath_devdata *dd);
-int ipath_count_units(int *npresentp, int *nupp, int *maxportsp);
-void ipath_shutdown_device(struct ipath_devdata *);
-void ipath_clear_freeze(struct ipath_devdata *);
-
-struct file_operations;
-int ipath_cdev_init(int minor, char *name, const struct file_operations *fops,
-                   struct cdev **cdevp, struct device **devp);
-void ipath_cdev_cleanup(struct cdev **cdevp,
-                       struct device **devp);
-
-int ipath_diag_add(struct ipath_devdata *);
-void ipath_diag_remove(struct ipath_devdata *);
-
-extern wait_queue_head_t ipath_state_wait;
-
-int ipath_user_add(struct ipath_devdata *dd);
-void ipath_user_remove(struct ipath_devdata *dd);
-
-struct sk_buff *ipath_alloc_skb(struct ipath_devdata *dd, gfp_t);
-
-extern int ipath_diag_inuse;
-
-irqreturn_t ipath_intr(int irq, void *devid);
-int ipath_decode_err(struct ipath_devdata *dd, char *buf, size_t blen,
-                    ipath_err_t err);
-#if __IPATH_INFO || __IPATH_DBG
-extern const char *ipath_ibcstatus_str[];
-#endif
-
-/* clean up any per-chip chip-specific stuff */
-void ipath_chip_cleanup(struct ipath_devdata *);
-/* clean up any chip type-specific stuff */
-void ipath_chip_done(void);
-
-void ipath_disarm_piobufs(struct ipath_devdata *, unsigned first,
-                         unsigned cnt);
-void ipath_cancel_sends(struct ipath_devdata *, int);
-
-int ipath_create_rcvhdrq(struct ipath_devdata *, struct ipath_portdata *);
-void ipath_free_pddata(struct ipath_devdata *, struct ipath_portdata *);
-
-int ipath_parse_ushort(const char *str, unsigned short *valp);
-
-void ipath_kreceive(struct ipath_portdata *);
-int ipath_setrcvhdrsize(struct ipath_devdata *, unsigned);
-int ipath_reset_device(int);
-void ipath_get_faststats(unsigned long);
-int ipath_wait_linkstate(struct ipath_devdata *, u32, int);
-int ipath_set_linkstate(struct ipath_devdata *, u8);
-int ipath_set_mtu(struct ipath_devdata *, u16);
-int ipath_set_lid(struct ipath_devdata *, u32, u8);
-int ipath_set_rx_pol_inv(struct ipath_devdata *dd, u8 new_pol_inv);
-void ipath_enable_armlaunch(struct ipath_devdata *);
-void ipath_disable_armlaunch(struct ipath_devdata *);
-void ipath_hol_down(struct ipath_devdata *);
-void ipath_hol_up(struct ipath_devdata *);
-void ipath_hol_event(unsigned long);
-void ipath_toggle_rclkrls(struct ipath_devdata *);
-void ipath_sd7220_clr_ibpar(struct ipath_devdata *);
-void ipath_set_relock_poll(struct ipath_devdata *, int);
-void ipath_shutdown_relock_poll(struct ipath_devdata *);
-
-/* for use in system calls, where we want to know device type, etc. */
-#define port_fp(fp) ((struct ipath_filedata *)(fp)->private_data)->pd
-#define subport_fp(fp) \
-       ((struct ipath_filedata *)(fp)->private_data)->subport
-#define tidcursor_fp(fp) \
-       ((struct ipath_filedata *)(fp)->private_data)->tidcursor
-#define user_sdma_queue_fp(fp) \
-       ((struct ipath_filedata *)(fp)->private_data)->pq
-
-/*
- * values for ipath_flags
- */
-               /* chip can report link latency (IB 1.2) */
-#define IPATH_HAS_LINK_LATENCY 0x1
-               /* The chip is up and initted */
-#define IPATH_INITTED       0x2
-               /* set if any user code has set kr_rcvhdrsize */
-#define IPATH_RCVHDRSZ_SET  0x4
-               /* The chip is present and valid for accesses */
-#define IPATH_PRESENT       0x8
-               /* HT link0 is only 8 bits wide, ignore upper byte crc
-                * errors, etc. */
-#define IPATH_8BIT_IN_HT0   0x10
-               /* HT link1 is only 8 bits wide, ignore upper byte crc
-                * errors, etc. */
-#define IPATH_8BIT_IN_HT1   0x20
-               /* The link is down */
-#define IPATH_LINKDOWN      0x40
-               /* The link level is up (0x11) */
-#define IPATH_LINKINIT      0x80
-               /* The link is in the armed (0x21) state */
-#define IPATH_LINKARMED     0x100
-               /* The link is in the active (0x31) state */
-#define IPATH_LINKACTIVE    0x200
-               /* link current state is unknown */
-#define IPATH_LINKUNK       0x400
-               /* Write combining flush needed for PIO */
-#define IPATH_PIO_FLUSH_WC  0x1000
-               /* no DMA'ed rcvhdrq tail; use RHF sequence numbers instead */
-#define IPATH_NODMA_RTAIL   0x2000
-               /* no IB cable, or no device on IB cable */
-#define IPATH_NOCABLE       0x4000
-               /* Supports port zero per packet receive interrupts via
-                * GPIO */
-#define IPATH_GPIO_INTR     0x8000
-               /* uses the coded 4byte TID, not 8 byte */
-#define IPATH_4BYTE_TID     0x10000
-               /* packet/word counters are 32 bit, else those 4 counters
-                * are 64bit */
-#define IPATH_32BITCOUNTERS 0x20000
-               /* Interrupt register is 64 bits */
-#define IPATH_INTREG_64     0x40000
-               /* can miss port0 rx interrupts */
-#define IPATH_DISABLED      0x80000 /* administratively disabled */
-               /* Use GPIO interrupts for new counters */
-#define IPATH_GPIO_ERRINTRS 0x100000
-#define IPATH_SWAP_PIOBUFS  0x200000
-               /* Supports Send DMA */
-#define IPATH_HAS_SEND_DMA  0x400000
-               /* Supports Send Count (not just word count) in PBC */
-#define IPATH_HAS_PBC_CNT   0x800000
-               /* Suppress heartbeat, even if turning off loopback */
-#define IPATH_NO_HRTBT      0x1000000
-#define IPATH_HAS_THRESH_UPDATE 0x4000000
-#define IPATH_HAS_MULT_IB_SPEED 0x8000000
-#define IPATH_IB_AUTONEG_INPROG 0x10000000
-#define IPATH_IB_AUTONEG_FAILED 0x20000000
-               /* Linkdown-disable set intentionally; do not attempt to bring up */
-#define IPATH_IB_LINK_DISABLED 0x40000000
-#define IPATH_IB_FORCE_NOTIFY 0x80000000 /* force notify on next ib change */
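The flag values above are tested with plain bitwise AND against dd->ipath_flags, exactly as the register accessors later in this header do with IPATH_PRESENT. A hedged sketch of a typical compound check (not from the original header):

/* Sketch only: link usable means armed or active and not administratively
 * disabled. */
static inline int example_link_usable(const struct ipath_devdata *dd)
{
	return (dd->ipath_flags & (IPATH_LINKARMED | IPATH_LINKACTIVE)) &&
		!(dd->ipath_flags & IPATH_DISABLED);
}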
-
-/* Bits in GPIO for the added interrupts */
-#define IPATH_GPIO_PORT0_BIT 2
-#define IPATH_GPIO_RXUVL_BIT 3
-#define IPATH_GPIO_OVRUN_BIT 4
-#define IPATH_GPIO_LLI_BIT 5
-#define IPATH_GPIO_ERRINTR_MASK 0x38
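IPATH_GPIO_ERRINTR_MASK is just the three error-counter GPIO bits OR'd together; the port0 receive bit (bit 2) is handled separately. A quick check of the arithmetic, written out as an equivalent definition (illustrative only):

/* (1 << 3) | (1 << 4) | (1 << 5) == 0x08 | 0x10 | 0x20 == 0x38 */
#define EXAMPLE_GPIO_ERRINTR_MASK \
	((1U << IPATH_GPIO_RXUVL_BIT) | \
	 (1U << IPATH_GPIO_OVRUN_BIT) | \
	 (1U << IPATH_GPIO_LLI_BIT))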
-
-/* portdata flag bit offsets */
-               /* waiting for a packet to arrive */
-#define IPATH_PORT_WAITING_RCV   2
-               /* master has not finished initializing */
-#define IPATH_PORT_MASTER_UNINIT 4
-               /* waiting for an urgent packet to arrive */
-#define IPATH_PORT_WAITING_URG 5
-
-/* free up any allocated data at closes */
-void ipath_free_data(struct ipath_portdata *dd);
-u32 __iomem *ipath_getpiobuf(struct ipath_devdata *, u32, u32 *);
-void ipath_chg_pioavailkernel(struct ipath_devdata *dd, unsigned start,
-                               unsigned len, int avail);
-void ipath_init_iba6110_funcs(struct ipath_devdata *);
-void ipath_get_eeprom_info(struct ipath_devdata *);
-int ipath_update_eeprom_log(struct ipath_devdata *dd);
-void ipath_inc_eeprom_err(struct ipath_devdata *dd, u32 eidx, u32 incr);
-u64 ipath_snap_cntr(struct ipath_devdata *, ipath_creg);
-void ipath_disarm_senderrbufs(struct ipath_devdata *);
-void ipath_force_pio_avail_update(struct ipath_devdata *);
-void signal_ib_event(struct ipath_devdata *dd, enum ib_event_type ev);
-
-/*
- * Set LED override, only the two LSBs have "public" meaning, but
- * any non-zero value substitutes them for the Link and LinkTrain
- * LED states.
- */
-#define IPATH_LED_PHYS 1 /* Physical (linktraining) GREEN LED */
-#define IPATH_LED_LOG 2  /* Logical (link) YELLOW LED */
-void ipath_set_led_override(struct ipath_devdata *dd, unsigned int val);
-
-/* send dma routines */
-int setup_sdma(struct ipath_devdata *);
-void teardown_sdma(struct ipath_devdata *);
-void ipath_restart_sdma(struct ipath_devdata *);
-void ipath_sdma_intr(struct ipath_devdata *);
-int ipath_sdma_verbs_send(struct ipath_devdata *, struct ipath_sge_state *,
-                         u32, struct ipath_verbs_txreq *);
-/* ipath_sdma_lock should be locked before calling this. */
-int ipath_sdma_make_progress(struct ipath_devdata *dd);
-
-/* must be called under ipath_sdma_lock */
-static inline u16 ipath_sdma_descq_freecnt(const struct ipath_devdata *dd)
-{
-       return dd->ipath_sdma_descq_cnt -
-               (dd->ipath_sdma_descq_added - dd->ipath_sdma_descq_removed) -
-               1 - dd->ipath_sdma_desc_nreserved;
-}
-
-static inline void ipath_sdma_desc_reserve(struct ipath_devdata *dd, u16 cnt)
-{
-       dd->ipath_sdma_desc_nreserved += cnt;
-}
-
-static inline void ipath_sdma_desc_unreserve(struct ipath_devdata *dd, u16 cnt)
-{
-       dd->ipath_sdma_desc_nreserved -= cnt;
-}
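Together these helpers give simple descriptor-ring accounting: the free count is the ring size minus outstanding entries and reserved slots, minus one so the head never catches the tail. A hedged sketch of how a send path might use them while holding ipath_sdma_lock (ndesc is a hypothetical per-packet descriptor count, not a name from the driver):

/* Sketch only; caller holds ipath_sdma_lock. */
static int example_reserve_descs(struct ipath_devdata *dd, u16 ndesc)
{
	if (ipath_sdma_descq_freecnt(dd) < ndesc)
		return -EBUSY;		/* ring full, caller retries later */
	ipath_sdma_desc_reserve(dd, ndesc);
	return 0;
}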
-
-/*
- * number of words used for the protocol header if not set by ipath_userinit()
- */
-#define IPATH_DFLT_RCVHDRSIZE 9
-
-int ipath_get_user_pages(unsigned long, size_t, struct page **);
-void ipath_release_user_pages(struct page **, size_t);
-void ipath_release_user_pages_on_close(struct page **, size_t);
-int ipath_eeprom_read(struct ipath_devdata *, u8, void *, int);
-int ipath_eeprom_write(struct ipath_devdata *, u8, const void *, int);
-int ipath_tempsense_read(struct ipath_devdata *, u8 regnum);
-int ipath_tempsense_write(struct ipath_devdata *, u8 regnum, u8 data);
-
-/* these are used for the registers that vary with port */
-void ipath_write_kreg_port(const struct ipath_devdata *, ipath_kreg,
-                          unsigned, u64);
-
-/*
- * We could have a single register get/put routine, that takes a group type,
- * but this is somewhat clearer and cleaner.  It also gives us some error
- * checking.  64 bit register reads should always work, but are inefficient
- * on opteron (the northbridge always generates 2 separate HT 32 bit reads),
- * so we use kreg32 wherever possible.  User register and counter register
- * reads are always 32 bit reads, so only one form of those routines.
- */
-
-/*
- * At the moment, none of the s-registers are writable, so no
- * ipath_write_sreg().
- */
-
-/**
- * ipath_read_ureg32 - read 32-bit virtualized per-port register
- * @dd: device
- * @regno: register number
- * @port: port number
- *
- * Return the contents of a register that is virtualized to be per port.
- * Returns -1 on errors (not distinguishable from valid contents at
- * runtime; we may add a separate error variable at some point).
- */
-static inline u32 ipath_read_ureg32(const struct ipath_devdata *dd,
-                                   ipath_ureg regno, int port)
-{
-       if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT))
-               return 0;
-
-       return readl(regno + (u64 __iomem *)
-                    (dd->ipath_uregbase +
-                     (char __iomem *)dd->ipath_kregbase +
-                     dd->ipath_ureg_align * port));
-}
-
-/**
- * ipath_write_ureg - write 32-bit virtualized per-port register
- * @dd: device
- * @regno: register number
- * @value: value
- * @port: port
- *
- * Write the contents of a register that is virtualized to be per port.
- */
-static inline void ipath_write_ureg(const struct ipath_devdata *dd,
-                                   ipath_ureg regno, u64 value, int port)
-{
-       u64 __iomem *ubase = (u64 __iomem *)
-               (dd->ipath_uregbase + (char __iomem *) dd->ipath_kregbase +
-                dd->ipath_ureg_align * port);
-       if (dd->ipath_kregbase)
-               writeq(value, &ubase[regno]);
-}
-
-static inline u32 ipath_read_kreg32(const struct ipath_devdata *dd,
-                                   ipath_kreg regno)
-{
-       if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT))
-               return -1;
-       return readl((u32 __iomem *) & dd->ipath_kregbase[regno]);
-}
-
-static inline u64 ipath_read_kreg64(const struct ipath_devdata *dd,
-                                   ipath_kreg regno)
-{
-       if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT))
-               return -1;
-
-       return readq(&dd->ipath_kregbase[regno]);
-}
-
-static inline void ipath_write_kreg(const struct ipath_devdata *dd,
-                                   ipath_kreg regno, u64 value)
-{
-       if (dd->ipath_kregbase)
-               writeq(value, &dd->ipath_kregbase[regno]);
-}
-
-static inline u64 ipath_read_creg(const struct ipath_devdata *dd,
-                                 ipath_sreg regno)
-{
-       if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT))
-               return 0;
-
-       return readq(regno + (u64 __iomem *)
-                    (dd->ipath_cregbase +
-                     (char __iomem *)dd->ipath_kregbase));
-}
-
-static inline u32 ipath_read_creg32(const struct ipath_devdata *dd,
-                                        ipath_sreg regno)
-{
-       if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT))
-               return 0;
-       return readl(regno + (u64 __iomem *)
-                    (dd->ipath_cregbase +
-                     (char __iomem *)dd->ipath_kregbase));
-}
-
-static inline void ipath_write_creg(const struct ipath_devdata *dd,
-                                   ipath_creg regno, u64 value)
-{
-       if (dd->ipath_kregbase)
-               writeq(value, regno + (u64 __iomem *)
-                      (dd->ipath_cregbase +
-                       (char __iomem *)dd->ipath_kregbase));
-}
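Writable chip registers are shadowed in struct ipath_devdata (ipath_control, ipath_ibcctrl, and so on), so an update is a read-modify-write of the shadow followed by one ipath_write_kreg(). A sketch of that pattern; kr_control and INFINIPATH_C_FREEZEMODE are assumed from the register definitions elsewhere in the driver, not from this header:

/* Sketch only: toggle a control-register bit via its shadow copy. */
static void example_set_freeze(struct ipath_devdata *dd, int freeze)
{
	if (freeze)
		dd->ipath_control |= INFINIPATH_C_FREEZEMODE;
	else
		dd->ipath_control &= ~INFINIPATH_C_FREEZEMODE;
	ipath_write_kreg(dd, dd->ipath_kregs->kr_control, dd->ipath_control);
}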
-
-static inline void ipath_clear_rcvhdrtail(const struct ipath_portdata *pd)
-{
-       *((u64 *) pd->port_rcvhdrtail_kvaddr) = 0ULL;
-}
-
-static inline u32 ipath_get_rcvhdrtail(const struct ipath_portdata *pd)
-{
-       return (u32) le64_to_cpu(*((volatile __le64 *)
-                               pd->port_rcvhdrtail_kvaddr));
-}
-
-static inline u32 ipath_get_hdrqtail(const struct ipath_portdata *pd)
-{
-       const struct ipath_devdata *dd = pd->port_dd;
-       u32 hdrqtail;
-
-       if (dd->ipath_flags & IPATH_NODMA_RTAIL) {
-               __le32 *rhf_addr;
-               u32 seq;
-
-               rhf_addr = (__le32 *) pd->port_rcvhdrq +
-                       pd->port_head + dd->ipath_rhf_offset;
-               seq = ipath_hdrget_seq(rhf_addr);
-               hdrqtail = pd->port_head;
-               if (seq == pd->port_seq_cnt)
-                       hdrqtail++;
-       } else
-               hdrqtail = ipath_get_rcvhdrtail(pd);
-
-       return hdrqtail;
-}
-
-static inline u64 ipath_read_ireg(const struct ipath_devdata *dd, ipath_kreg r)
-{
-       return (dd->ipath_flags & IPATH_INTREG_64) ?
-               ipath_read_kreg64(dd, r) : ipath_read_kreg32(dd, r);
-}
-
-/*
- * from contents of IBCStatus (or a saved copy), return linkstate
- * Report ACTIVE_DEFER as ACTIVE, because we treat them the same
- * everywhere, anyway (and should be, for almost all purposes).
- */
-static inline u32 ipath_ib_linkstate(struct ipath_devdata *dd, u64 ibcs)
-{
-       u32 state = (u32)(ibcs >> dd->ibcs_ls_shift) &
-               INFINIPATH_IBCS_LINKSTATE_MASK;
-       if (state == INFINIPATH_IBCS_L_STATE_ACT_DEFER)
-               state = INFINIPATH_IBCS_L_STATE_ACTIVE;
-       return state;
-}
-
-/* from contents of IBCStatus (or a saved copy), return linktrainingstate */
-static inline u32 ipath_ib_linktrstate(struct ipath_devdata *dd, u64 ibcs)
-{
-       return (u32)(ibcs >> INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) &
-               dd->ibcs_lts_mask;
-}
-
-/*
- * from contents of IBCStatus (or a saved copy), return the logical link
- * state: a combination of link state and linktraining state (down, active,
- * init, arm, etc.)
- */
-static inline u32 ipath_ib_state(struct ipath_devdata *dd, u64 ibcs)
-{
-       u32 ibs;
-       ibs = (u32)(ibcs >> INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) &
-               dd->ibcs_lts_mask;
-       ibs |= (u32)(ibcs &
-               (INFINIPATH_IBCS_LINKSTATE_MASK << dd->ibcs_ls_shift));
-       return ibs;
-}
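The dd->ib_init, dd->ib_arm and dd->ib_active fields declared earlier hold the chip-specific encodings of these combined states, so a status check reduces to a single comparison. A minimal sketch, assuming those fields were populated at init as their comment states:

/* Sketch only: does this IBCStatus value indicate an ACTIVE link? */
static inline int example_ibcs_active(struct ipath_devdata *dd, u64 ibcs)
{
	return ipath_ib_state(dd, ibcs) == dd->ib_active;
}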
-
-/*
- * sysfs interface.
- */
-
-struct device_driver;
-
-extern const char ib_ipath_version[];
-
-extern const struct attribute_group *ipath_driver_attr_groups[];
-
-int ipath_device_create_group(struct device *, struct ipath_devdata *);
-void ipath_device_remove_group(struct device *, struct ipath_devdata *);
-int ipath_expose_reset(struct device *);
-
-int ipath_init_ipathfs(void);
-void ipath_exit_ipathfs(void);
-int ipathfs_add_device(struct ipath_devdata *);
-int ipathfs_remove_device(struct ipath_devdata *);
-
-/*
- * dma_addr wrappers - all 0's invalid for hw
- */
-dma_addr_t ipath_map_page(struct pci_dev *, struct page *, unsigned long,
-                         size_t, int);
-dma_addr_t ipath_map_single(struct pci_dev *, void *, size_t, int);
-const char *ipath_get_unit_name(int unit);
-
-/*
- * Flush write combining store buffers (if present) and perform a write
- * barrier.
- */
-#if defined(CONFIG_X86_64)
-#define ipath_flush_wc() asm volatile("sfence" ::: "memory")
-#else
-#define ipath_flush_wc() wmb()
-#endif
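On x86-64 the flush is an explicit sfence because the PIO buffers are mapped write-combining; elsewhere a plain write barrier is enough. A hedged sketch of the intended use after filling a PIO buffer (the copy loop is illustrative, not the driver's actual PIO path):

/* Sketch only: copy a packet into a WC-mapped PIO buffer, then flush. */
static inline void example_pio_copy(struct ipath_devdata *dd,
				    u32 __iomem *piobuf,
				    const u32 *data, unsigned ndwords)
{
	unsigned i;

	for (i = 0; i < ndwords; i++)
		writel(data[i], piobuf + i);
	if (dd->ipath_flags & IPATH_PIO_FLUSH_WC)
		ipath_flush_wc();
}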
-
-extern unsigned ipath_debug; /* debugging bit mask */
-extern unsigned ipath_linkrecovery;
-extern unsigned ipath_mtu4096;
-extern struct mutex ipath_mutex;
-
-#define IPATH_DRV_NAME         "ib_ipath"
-#define IPATH_MAJOR            233
-#define IPATH_USER_MINOR_BASE  0
-#define IPATH_DIAGPKT_MINOR    127
-#define IPATH_DIAG_MINOR_BASE  129
-#define IPATH_NMINORS          255
-
-#define ipath_dev_err(dd,fmt,...) \
-       do { \
-               const struct ipath_devdata *__dd = (dd); \
-               if (__dd->pcidev) \
-                       dev_err(&__dd->pcidev->dev, "%s: " fmt, \
-                               ipath_get_unit_name(__dd->ipath_unit), \
-                               ##__VA_ARGS__); \
-               else \
-                       printk(KERN_ERR IPATH_DRV_NAME ": %s: " fmt, \
-                              ipath_get_unit_name(__dd->ipath_unit), \
-                              ##__VA_ARGS__); \
-       } while (0)
-
-#if _IPATH_DEBUGGING
-
-# define __IPATH_DBG_WHICH(which,fmt,...) \
-       do { \
-               if (unlikely(ipath_debug & (which))) \
-                       printk(KERN_DEBUG IPATH_DRV_NAME ": %s: " fmt, \
-                              __func__,##__VA_ARGS__); \
-       } while(0)
-
-# define ipath_dbg(fmt,...) \
-       __IPATH_DBG_WHICH(__IPATH_DBG,fmt,##__VA_ARGS__)
-# define ipath_cdbg(which,fmt,...) \
-       __IPATH_DBG_WHICH(__IPATH_##which##DBG,fmt,##__VA_ARGS__)
-
-#else /* ! _IPATH_DEBUGGING */
-
-# define ipath_dbg(fmt,...)
-# define ipath_cdbg(which,fmt,...)
-
-#endif /* _IPATH_DEBUGGING */
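ipath_dbg() prints when the plain __IPATH_DBG bit is set in the ipath_debug mask, while ipath_cdbg() pastes its class argument into one of the other __IPATH_*DBG bits. A usage sketch; the VERBOSE class name is assumed from the rest of the driver, not defined in this header:

/* Sketch only: both macros prefix the driver name and calling function. */
static void example_debug_prints(struct ipath_devdata *dd)
{
	ipath_dbg("unit %d initialized\n", dd->ipath_unit);
	ipath_cdbg(VERBOSE, "2k PIO bufs: %u, 4k PIO bufs: %u\n",
		   dd->ipath_piobcnt2k, dd->ipath_piobcnt4k);
}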
-
-/*
- * this is used for formatting hw error messages...
- */
-struct ipath_hwerror_msgs {
-       u64 mask;
-       const char *msg;
-};
-
-#define INFINIPATH_HWE_MSG(a, b) { .mask = INFINIPATH_HWE_##a, .msg = b }
-
-/* in ipath_intr.c... */
-void ipath_format_hwerrors(u64 hwerrs,
-                          const struct ipath_hwerror_msgs *hwerrmsgs,
-                          size_t nhwerrmsgs,
-                          char *msg, size_t lmsg);
-
-#endif                         /* _IPATH_KERNEL_H */
diff --git a/drivers/infiniband/hw/ipath/ipath_keys.c b/drivers/infiniband/hw/ipath/ipath_keys.c
deleted file mode 100644 (file)
index c0e933f..0000000
+++ /dev/null
@@ -1,270 +0,0 @@
-/*
- * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
- * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <asm/io.h>
-
-#include "ipath_verbs.h"
-#include "ipath_kernel.h"
-
-/**
- * ipath_alloc_lkey - allocate an lkey
- * @rkt: lkey table in which to allocate the lkey
- * @mr: memory region that this lkey protects
- *
- * Returns 1 if successful, otherwise returns 0.
- */
-
-int ipath_alloc_lkey(struct ipath_lkey_table *rkt, struct ipath_mregion *mr)
-{
-       unsigned long flags;
-       u32 r;
-       u32 n;
-       int ret;
-
-       spin_lock_irqsave(&rkt->lock, flags);
-
-       /* Find the next available LKEY */
-       r = n = rkt->next;
-       for (;;) {
-               if (rkt->table[r] == NULL)
-                       break;
-               r = (r + 1) & (rkt->max - 1);
-               if (r == n) {
-                       spin_unlock_irqrestore(&rkt->lock, flags);
-                       ipath_dbg("LKEY table full\n");
-                       ret = 0;
-                       goto bail;
-               }
-       }
-       rkt->next = (r + 1) & (rkt->max - 1);
-       /*
-        * Make sure lkey is never zero, which is reserved to indicate an
-        * unrestricted LKEY.
-        */
-       rkt->gen++;
-       mr->lkey = (r << (32 - ib_ipath_lkey_table_size)) |
-               ((((1 << (24 - ib_ipath_lkey_table_size)) - 1) & rkt->gen)
-                << 8);
-       if (mr->lkey == 0) {
-               mr->lkey |= 1 << 8;
-               rkt->gen++;
-       }
-       rkt->table[r] = mr;
-       spin_unlock_irqrestore(&rkt->lock, flags);
-
-       ret = 1;
-
-bail:
-       return ret;
-}
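The lkey built above packs the table index into the top (32 - ib_ipath_lkey_table_size) bits and a generation count into bits 8 and up, so a stale handle is rejected even after its slot is reused; zero is skipped because an lkey of 0 denotes an unrestricted kernel region. Recovering the index is a single shift, mirroring ipath_free_lkey() and ipath_lkey_ok() below:

/* Sketch only: the table index lives in the high bits of the lkey. */
static inline u32 example_lkey_to_index(u32 lkey)
{
	return lkey >> (32 - ib_ipath_lkey_table_size);
}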
-
-/**
- * ipath_free_lkey - free an lkey
- * @rkt: table from which to free the lkey
- * @lkey: lkey id to free
- */
-void ipath_free_lkey(struct ipath_lkey_table *rkt, u32 lkey)
-{
-       unsigned long flags;
-       u32 r;
-
-       if (lkey == 0)
-               return;
-       r = lkey >> (32 - ib_ipath_lkey_table_size);
-       spin_lock_irqsave(&rkt->lock, flags);
-       rkt->table[r] = NULL;
-       spin_unlock_irqrestore(&rkt->lock, flags);
-}
-
-/**
- * ipath_lkey_ok - check IB SGE for validity and initialize
- * @rkt: table containing lkey to check SGE against
- * @isge: outgoing internal SGE
- * @sge: SGE to check
- * @acc: access flags
- *
- * Return 1 if valid and successful, otherwise returns 0.
- *
- * Check the IB SGE for validity and initialize our internal version
- * of it.
- */
-int ipath_lkey_ok(struct ipath_qp *qp, struct ipath_sge *isge,
-                 struct ib_sge *sge, int acc)
-{
-       struct ipath_lkey_table *rkt = &to_idev(qp->ibqp.device)->lk_table;
-       struct ipath_mregion *mr;
-       unsigned n, m;
-       size_t off;
-       int ret;
-
-       /*
-        * We use LKEY == zero for kernel virtual addresses
-        * (see ipath_get_dma_mr and ipath_dma.c).
-        */
-       if (sge->lkey == 0) {
-               /* always a kernel port, no locking needed */
-               struct ipath_pd *pd = to_ipd(qp->ibqp.pd);
-
-               if (pd->user) {
-                       ret = 0;
-                       goto bail;
-               }
-               isge->mr = NULL;
-               isge->vaddr = (void *) sge->addr;
-               isge->length = sge->length;
-               isge->sge_length = sge->length;
-               ret = 1;
-               goto bail;
-       }
-       mr = rkt->table[(sge->lkey >> (32 - ib_ipath_lkey_table_size))];
-       if (unlikely(mr == NULL || mr->lkey != sge->lkey ||
-                    qp->ibqp.pd != mr->pd)) {
-               ret = 0;
-               goto bail;
-       }
-
-       off = sge->addr - mr->user_base;
-       if (unlikely(sge->addr < mr->user_base ||
-                    off + sge->length > mr->length ||
-                    (mr->access_flags & acc) != acc)) {
-               ret = 0;
-               goto bail;
-       }
-
-       off += mr->offset;
-       m = 0;
-       n = 0;
-       while (off >= mr->map[m]->segs[n].length) {
-               off -= mr->map[m]->segs[n].length;
-               n++;
-               if (n >= IPATH_SEGSZ) {
-                       m++;
-                       n = 0;
-               }
-       }
-       isge->mr = mr;
-       isge->vaddr = mr->map[m]->segs[n].vaddr + off;
-       isge->length = mr->map[m]->segs[n].length - off;
-       isge->sge_length = sge->length;
-       isge->m = m;
-       isge->n = n;
-
-       ret = 1;
-
-bail:
-       return ret;
-}
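The loop above walks the region's two-level map: each mr->map[m] holds IPATH_SEGSZ segments, and the byte offset is reduced segment by segment until it falls inside one. ipath_rkey_ok() below repeats the same walk for RKEYs. A standalone sketch of just that conversion (everything except the mr fields is hypothetical):

/* Sketch only: byte offset -> (map index, segment index, residual offset). */
static void example_offset_to_segment(struct ipath_mregion *mr, size_t off,
				      unsigned *mp, unsigned *np, size_t *offp)
{
	unsigned m = 0, n = 0;

	while (off >= mr->map[m]->segs[n].length) {
		off -= mr->map[m]->segs[n].length;
		if (++n >= IPATH_SEGSZ) {
			m++;
			n = 0;
		}
	}
	*mp = m;
	*np = n;
	*offp = off;
}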
-
-/**
- * ipath_rkey_ok - check the IB virtual address, length, and RKEY
- * @dev: infiniband device
- * @ss: SGE state
- * @len: length of data
- * @vaddr: virtual address to place data
- * @rkey: rkey to check
- * @acc: access flags
- *
- * Return 1 if successful, otherwise 0.
- */
-int ipath_rkey_ok(struct ipath_qp *qp, struct ipath_sge_state *ss,
-                 u32 len, u64 vaddr, u32 rkey, int acc)
-{
-       struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
-       struct ipath_lkey_table *rkt = &dev->lk_table;
-       struct ipath_sge *sge = &ss->sge;
-       struct ipath_mregion *mr;
-       unsigned n, m;
-       size_t off;
-       int ret;
-
-       /*
-        * We use RKEY == zero for kernel virtual addresses
-        * (see ipath_get_dma_mr and ipath_dma.c).
-        */
-       if (rkey == 0) {
-               /* always a kernel port, no locking needed */
-               struct ipath_pd *pd = to_ipd(qp->ibqp.pd);
-
-               if (pd->user) {
-                       ret = 0;
-                       goto bail;
-               }
-               sge->mr = NULL;
-               sge->vaddr = (void *) vaddr;
-               sge->length = len;
-               sge->sge_length = len;
-               ss->sg_list = NULL;
-               ss->num_sge = 1;
-               ret = 1;
-               goto bail;
-       }
-
-       mr = rkt->table[(rkey >> (32 - ib_ipath_lkey_table_size))];
-       if (unlikely(mr == NULL || mr->lkey != rkey ||
-                    qp->ibqp.pd != mr->pd)) {
-               ret = 0;
-               goto bail;
-       }
-
-       off = vaddr - mr->iova;
-       if (unlikely(vaddr < mr->iova || off + len > mr->length ||
-                    (mr->access_flags & acc) == 0)) {
-               ret = 0;
-               goto bail;
-       }
-
-       off += mr->offset;
-       m = 0;
-       n = 0;
-       while (off >= mr->map[m]->segs[n].length) {
-               off -= mr->map[m]->segs[n].length;
-               n++;
-               if (n >= IPATH_SEGSZ) {
-                       m++;
-                       n = 0;
-               }
-       }
-       sge->mr = mr;
-       sge->vaddr = mr->map[m]->segs[n].vaddr + off;
-       sge->length = mr->map[m]->segs[n].length - off;
-       sge->sge_length = len;
-       sge->m = m;
-       sge->n = n;
-       ss->sg_list = NULL;
-       ss->num_sge = 1;
-
-       ret = 1;
-
-bail:
-       return ret;
-}
diff --git a/drivers/infiniband/hw/ipath/ipath_mad.c b/drivers/infiniband/hw/ipath/ipath_mad.c
deleted file mode 100644 (file)
index ad3a926..0000000
+++ /dev/null
@@ -1,1521 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <rdma/ib_smi.h>
-#include <rdma/ib_pma.h>
-
-#include "ipath_kernel.h"
-#include "ipath_verbs.h"
-#include "ipath_common.h"
-
-#define IB_SMP_UNSUP_VERSION   cpu_to_be16(0x0004)
-#define IB_SMP_UNSUP_METHOD    cpu_to_be16(0x0008)
-#define IB_SMP_UNSUP_METH_ATTR cpu_to_be16(0x000C)
-#define IB_SMP_INVALID_FIELD   cpu_to_be16(0x001C)
-
-static int reply(struct ib_smp *smp)
-{
-       /*
-        * The verbs framework will handle the directed/LID route
-        * packet changes.
-        */
-       smp->method = IB_MGMT_METHOD_GET_RESP;
-       if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
-               smp->status |= IB_SMP_DIRECTION;
-       return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
-}
-
-static int recv_subn_get_nodedescription(struct ib_smp *smp,
-                                        struct ib_device *ibdev)
-{
-       if (smp->attr_mod)
-               smp->status |= IB_SMP_INVALID_FIELD;
-
-       memcpy(smp->data, ibdev->node_desc, sizeof(smp->data));
-
-       return reply(smp);
-}
-
-struct nodeinfo {
-       u8 base_version;
-       u8 class_version;
-       u8 node_type;
-       u8 num_ports;
-       __be64 sys_guid;
-       __be64 node_guid;
-       __be64 port_guid;
-       __be16 partition_cap;
-       __be16 device_id;
-       __be32 revision;
-       u8 local_port_num;
-       u8 vendor_id[3];
-} __attribute__ ((packed));
-
-static int recv_subn_get_nodeinfo(struct ib_smp *smp,
-                                 struct ib_device *ibdev, u8 port)
-{
-       struct nodeinfo *nip = (struct nodeinfo *)&smp->data;
-       struct ipath_devdata *dd = to_idev(ibdev)->dd;
-       u32 vendor, majrev, minrev;
-
-       /* GUID 0 is illegal */
-       if (smp->attr_mod || (dd->ipath_guid == 0))
-               smp->status |= IB_SMP_INVALID_FIELD;
-
-       nip->base_version = 1;
-       nip->class_version = 1;
-       nip->node_type = 1;     /* channel adapter */
-       /*
-        * XXX The num_ports value will need a layer function to get
-        * the value if we ever have more than one IB port on a chip.
-        * We will also need to get the GUID for the port.
-        */
-       nip->num_ports = ibdev->phys_port_cnt;
-       /* This is already in network order */
-       nip->sys_guid = to_idev(ibdev)->sys_image_guid;
-       nip->node_guid = dd->ipath_guid;
-       nip->port_guid = dd->ipath_guid;
-       nip->partition_cap = cpu_to_be16(ipath_get_npkeys(dd));
-       nip->device_id = cpu_to_be16(dd->ipath_deviceid);
-       majrev = dd->ipath_majrev;
-       minrev = dd->ipath_minrev;
-       nip->revision = cpu_to_be32((majrev << 16) | minrev);
-       nip->local_port_num = port;
-       vendor = dd->ipath_vendorid;
-       nip->vendor_id[0] = IPATH_SRC_OUI_1;
-       nip->vendor_id[1] = IPATH_SRC_OUI_2;
-       nip->vendor_id[2] = IPATH_SRC_OUI_3;
-
-       return reply(smp);
-}
-
-static int recv_subn_get_guidinfo(struct ib_smp *smp,
-                                 struct ib_device *ibdev)
-{
-       u32 startgx = 8 * be32_to_cpu(smp->attr_mod);
-       __be64 *p = (__be64 *) smp->data;
-
-       /* 32 blocks of 8 64-bit GUIDs per block */
-
-       memset(smp->data, 0, sizeof(smp->data));
-
-       /*
-        * We only support one GUID for now.  If this changes, the
-        * portinfo.guid_cap field needs to be updated too.
-        */
-       if (startgx == 0) {
-               __be64 g = to_idev(ibdev)->dd->ipath_guid;
-               if (g == 0)
-                       /* GUID 0 is illegal */
-                       smp->status |= IB_SMP_INVALID_FIELD;
-               else
-                       /* The first is a copy of the read-only HW GUID. */
-                       *p = g;
-       } else
-               smp->status |= IB_SMP_INVALID_FIELD;
-
-       return reply(smp);
-}
-
-static void set_link_width_enabled(struct ipath_devdata *dd, u32 w)
-{
-       (void) dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LWID_ENB, w);
-}
-
-static void set_link_speed_enabled(struct ipath_devdata *dd, u32 s)
-{
-       (void) dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_SPD_ENB, s);
-}
-
-static int get_overrunthreshold(struct ipath_devdata *dd)
-{
-       return (dd->ipath_ibcctrl >>
-               INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT) &
-               INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK;
-}
-
-/**
- * set_overrunthreshold - set the overrun threshold
- * @dd: the infinipath device
- * @n: the new threshold
- *
- * Note that this will only take effect when the link state changes.
- */
-static int set_overrunthreshold(struct ipath_devdata *dd, unsigned n)
-{
-       unsigned v;
-
-       v = (dd->ipath_ibcctrl >> INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT) &
-               INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK;
-       if (v != n) {
-               dd->ipath_ibcctrl &=
-                       ~(INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK <<
-                         INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT);
-               dd->ipath_ibcctrl |=
-                       (u64) n << INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT;
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
-                                dd->ipath_ibcctrl);
-       }
-       return 0;
-}
-
-static int get_phyerrthreshold(struct ipath_devdata *dd)
-{
-       return (dd->ipath_ibcctrl >>
-               INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT) &
-               INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK;
-}
-
-/**
- * set_phyerrthreshold - set the physical error threshold
- * @dd: the infinipath device
- * @n: the new threshold
- *
- * Note that this will only take effect when the link state changes.
- */
-static int set_phyerrthreshold(struct ipath_devdata *dd, unsigned n)
-{
-       unsigned v;
-
-       v = (dd->ipath_ibcctrl >> INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT) &
-               INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK;
-       if (v != n) {
-               dd->ipath_ibcctrl &=
-                       ~(INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK <<
-                         INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT);
-               dd->ipath_ibcctrl |=
-                       (u64) n << INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT;
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
-                                dd->ipath_ibcctrl);
-       }
-       return 0;
-}
-
-/**
- * get_linkdowndefaultstate - get the default linkdown state
- * @dd: the infinipath device
- *
- * Returns zero if the default is POLL, 1 if the default is SLEEP.
- */
-static int get_linkdowndefaultstate(struct ipath_devdata *dd)
-{
-       return !!(dd->ipath_ibcctrl & INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE);
-}
-
-static int recv_subn_get_portinfo(struct ib_smp *smp,
-                                 struct ib_device *ibdev, u8 port)
-{
-       struct ipath_ibdev *dev;
-       struct ipath_devdata *dd;
-       struct ib_port_info *pip = (struct ib_port_info *)smp->data;
-       u16 lid;
-       u8 ibcstat;
-       u8 mtu;
-       int ret;
-
-       if (be32_to_cpu(smp->attr_mod) > ibdev->phys_port_cnt) {
-               smp->status |= IB_SMP_INVALID_FIELD;
-               ret = reply(smp);
-               goto bail;
-       }
-
-       dev = to_idev(ibdev);
-       dd = dev->dd;
-
-       /* Clear all fields.  Only set the non-zero fields. */
-       memset(smp->data, 0, sizeof(smp->data));
-
-       /* Only return the mkey if the protection field allows it. */
-       if (smp->method == IB_MGMT_METHOD_SET || dev->mkey == smp->mkey ||
-           dev->mkeyprot == 0)
-               pip->mkey = dev->mkey;
-       pip->gid_prefix = dev->gid_prefix;
-       lid = dd->ipath_lid;
-       pip->lid = lid ? cpu_to_be16(lid) : IB_LID_PERMISSIVE;
-       pip->sm_lid = cpu_to_be16(dev->sm_lid);
-       pip->cap_mask = cpu_to_be32(dev->port_cap_flags);
-       /* pip->diag_code; */
-       pip->mkey_lease_period = cpu_to_be16(dev->mkey_lease_period);
-       pip->local_port_num = port;
-       pip->link_width_enabled = dd->ipath_link_width_enabled;
-       pip->link_width_supported = dd->ipath_link_width_supported;
-       pip->link_width_active = dd->ipath_link_width_active;
-       pip->linkspeed_portstate = dd->ipath_link_speed_supported << 4;
-       ibcstat = dd->ipath_lastibcstat;
-       /* map LinkState to IB portinfo values.  */
-       pip->linkspeed_portstate |= ipath_ib_linkstate(dd, ibcstat) + 1;
-
-       pip->portphysstate_linkdown =
-               (ipath_cvt_physportstate[ibcstat & dd->ibcs_lts_mask] << 4) |
-               (get_linkdowndefaultstate(dd) ? 1 : 2);
-       pip->mkeyprot_resv_lmc = (dev->mkeyprot << 6) | dd->ipath_lmc;
-       pip->linkspeedactive_enabled = (dd->ipath_link_speed_active << 4) |
-               dd->ipath_link_speed_enabled;
-       switch (dd->ipath_ibmtu) {
-       case 4096:
-               mtu = IB_MTU_4096;
-               break;
-       case 2048:
-               mtu = IB_MTU_2048;
-               break;
-       case 1024:
-               mtu = IB_MTU_1024;
-               break;
-       case 512:
-               mtu = IB_MTU_512;
-               break;
-       case 256:
-               mtu = IB_MTU_256;
-               break;
-       default:                /* oops, something is wrong */
-               mtu = IB_MTU_2048;
-               break;
-       }
-       pip->neighbormtu_mastersmsl = (mtu << 4) | dev->sm_sl;
-       pip->vlcap_inittype = 0x10;     /* VLCap = VL0, InitType = 0 */
-       pip->vl_high_limit = dev->vl_high_limit;
-       /* pip->vl_arb_high_cap; // only one VL */
-       /* pip->vl_arb_low_cap; // only one VL */
-       /* InitTypeReply = 0 */
-       /* our mtu cap depends on whether 4K MTU enabled or not */
-       pip->inittypereply_mtucap = ipath_mtu4096 ? IB_MTU_4096 : IB_MTU_2048;
-       /* HCAs ignore VLStallCount and HOQLife */
-       /* pip->vlstallcnt_hoqlife; */
-       pip->operationalvl_pei_peo_fpi_fpo = 0x10;      /* OVLs = 1 */
-       pip->mkey_violations = cpu_to_be16(dev->mkey_violations);
-       /* P_KeyViolations are counted by hardware. */
-       pip->pkey_violations =
-               cpu_to_be16((ipath_get_cr_errpkey(dd) -
-                            dev->z_pkey_violations) & 0xFFFF);
-       pip->qkey_violations = cpu_to_be16(dev->qkey_violations);
-       /* Only the hardware GUID is supported for now */
-       pip->guid_cap = 1;
-       pip->clientrereg_resv_subnetto = dev->subnet_timeout;
-       /* 32.768 usec. response time (guessing) */
-       pip->resv_resptimevalue = 3;
-       pip->localphyerrors_overrunerrors =
-               (get_phyerrthreshold(dd) << 4) |
-               get_overrunthreshold(dd);
-       /* pip->max_credit_hint; */
-       if (dev->port_cap_flags & IB_PORT_LINK_LATENCY_SUP) {
-               u32 v;
-
-               v = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LINKLATENCY);
-               pip->link_roundtrip_latency[0] = v >> 16;
-               pip->link_roundtrip_latency[1] = v >> 8;
-               pip->link_roundtrip_latency[2] = v;
-       }
-
-       ret = reply(smp);
-
-bail:
-       return ret;
-}
-
-/**
- * get_pkeys - return the PKEY table for port 0
- * @dd: the infinipath device
- * @pkeys: the pkey table is placed here
- */
-static int get_pkeys(struct ipath_devdata *dd, u16 * pkeys)
-{
-       /* always a kernel port, no locking needed */
-       struct ipath_portdata *pd = dd->ipath_pd[0];
-
-       memcpy(pkeys, pd->port_pkeys, sizeof(pd->port_pkeys));
-
-       return 0;
-}
-
-static int recv_subn_get_pkeytable(struct ib_smp *smp,
-                                  struct ib_device *ibdev)
-{
-       u32 startpx = 32 * (be32_to_cpu(smp->attr_mod) & 0xffff);
-       u16 *p = (u16 *) smp->data;
-       __be16 *q = (__be16 *) smp->data;
-
-       /* 64 blocks of 32 16-bit P_Key entries */
-
-       memset(smp->data, 0, sizeof(smp->data));
-       if (startpx == 0) {
-               struct ipath_ibdev *dev = to_idev(ibdev);
-               unsigned i, n = ipath_get_npkeys(dev->dd);
-
-               get_pkeys(dev->dd, p);
-
-               for (i = 0; i < n; i++)
-                       q[i] = cpu_to_be16(p[i]);
-       } else
-               smp->status |= IB_SMP_INVALID_FIELD;
-
-       return reply(smp);
-}
-
-static int recv_subn_set_guidinfo(struct ib_smp *smp,
-                                 struct ib_device *ibdev)
-{
-       /* The only GUID we support is the first read-only entry. */
-       return recv_subn_get_guidinfo(smp, ibdev);
-}
-
-/**
- * set_linkdowndefaultstate - set the default linkdown state
- * @dd: the infinipath device
- * @sleep: the new state
- *
- * Note that this will only take effect when the link state changes.
- */
-static int set_linkdowndefaultstate(struct ipath_devdata *dd, int sleep)
-{
-       if (sleep)
-               dd->ipath_ibcctrl |= INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE;
-       else
-               dd->ipath_ibcctrl &= ~INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE;
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
-                        dd->ipath_ibcctrl);
-       return 0;
-}
-
-/**
- * recv_subn_set_portinfo - set port information
- * @smp: the incoming SM packet
- * @ibdev: the infiniband device
- * @port: the port on the device
- *
- * Set Portinfo (see ch. 14.2.5.6).
- */
-static int recv_subn_set_portinfo(struct ib_smp *smp,
-                                 struct ib_device *ibdev, u8 port)
-{
-       struct ib_port_info *pip = (struct ib_port_info *)smp->data;
-       struct ib_event event;
-       struct ipath_ibdev *dev;
-       struct ipath_devdata *dd;
-       char clientrereg = 0;
-       u16 lid, smlid;
-       u8 lwe;
-       u8 lse;
-       u8 state;
-       u16 lstate;
-       u32 mtu;
-       int ret, ore;
-
-       if (be32_to_cpu(smp->attr_mod) > ibdev->phys_port_cnt)
-               goto err;
-
-       dev = to_idev(ibdev);
-       dd = dev->dd;
-       event.device = ibdev;
-       event.element.port_num = port;
-
-       dev->mkey = pip->mkey;
-       dev->gid_prefix = pip->gid_prefix;
-       dev->mkey_lease_period = be16_to_cpu(pip->mkey_lease_period);
-
-       lid = be16_to_cpu(pip->lid);
-       if (dd->ipath_lid != lid ||
-           dd->ipath_lmc != (pip->mkeyprot_resv_lmc & 7)) {
-               /* Must be a valid unicast LID address. */
-               if (lid == 0 || lid >= IPATH_MULTICAST_LID_BASE)
-                       goto err;
-               ipath_set_lid(dd, lid, pip->mkeyprot_resv_lmc & 7);
-               event.event = IB_EVENT_LID_CHANGE;
-               ib_dispatch_event(&event);
-       }
-
-       smlid = be16_to_cpu(pip->sm_lid);
-       if (smlid != dev->sm_lid) {
-               /* Must be a valid unicast LID address. */
-               if (smlid == 0 || smlid >= IPATH_MULTICAST_LID_BASE)
-                       goto err;
-               dev->sm_lid = smlid;
-               event.event = IB_EVENT_SM_CHANGE;
-               ib_dispatch_event(&event);
-       }
-
-       /* Allow 1x or 4x to be set (see 14.2.6.6). */
-       lwe = pip->link_width_enabled;
-       if (lwe) {
-               if (lwe == 0xFF)
-                       lwe = dd->ipath_link_width_supported;
-               else if (lwe >= 16 || (lwe & ~dd->ipath_link_width_supported))
-                       goto err;
-               set_link_width_enabled(dd, lwe);
-       }
-
-       /* Allow 2.5 or 5.0 Gb/s. */
-       lse = pip->linkspeedactive_enabled & 0xF;
-       if (lse) {
-               if (lse == 15)
-                       lse = dd->ipath_link_speed_supported;
-               else if (lse >= 8 || (lse & ~dd->ipath_link_speed_supported))
-                       goto err;
-               set_link_speed_enabled(dd, lse);
-       }
-
-       /* Set link down default state. */
-       switch (pip->portphysstate_linkdown & 0xF) {
-       case 0: /* NOP */
-               break;
-       case 1: /* SLEEP */
-               if (set_linkdowndefaultstate(dd, 1))
-                       goto err;
-               break;
-       case 2: /* POLL */
-               if (set_linkdowndefaultstate(dd, 0))
-                       goto err;
-               break;
-       default:
-               goto err;
-       }
-
-       dev->mkeyprot = pip->mkeyprot_resv_lmc >> 6;
-       dev->vl_high_limit = pip->vl_high_limit;
-
-       switch ((pip->neighbormtu_mastersmsl >> 4) & 0xF) {
-       case IB_MTU_256:
-               mtu = 256;
-               break;
-       case IB_MTU_512:
-               mtu = 512;
-               break;
-       case IB_MTU_1024:
-               mtu = 1024;
-               break;
-       case IB_MTU_2048:
-               mtu = 2048;
-               break;
-       case IB_MTU_4096:
-               if (!ipath_mtu4096)
-                       goto err;
-               mtu = 4096;
-               break;
-       default:
-               /* XXX We have already partially updated our state! */
-               goto err;
-       }
-       ipath_set_mtu(dd, mtu);
-
-       dev->sm_sl = pip->neighbormtu_mastersmsl & 0xF;
-
-       /* We only support VL0 */
-       if (((pip->operationalvl_pei_peo_fpi_fpo >> 4) & 0xF) > 1)
-               goto err;
-
-       if (pip->mkey_violations == 0)
-               dev->mkey_violations = 0;
-
-       /*
-        * Hardware counter can't be reset so snapshot and subtract
-        * later.
-        */
-       if (pip->pkey_violations == 0)
-               dev->z_pkey_violations = ipath_get_cr_errpkey(dd);
-
-       if (pip->qkey_violations == 0)
-               dev->qkey_violations = 0;
-
-       ore = pip->localphyerrors_overrunerrors;
-       if (set_phyerrthreshold(dd, (ore >> 4) & 0xF))
-               goto err;
-
-       if (set_overrunthreshold(dd, (ore & 0xF)))
-               goto err;
-
-       dev->subnet_timeout = pip->clientrereg_resv_subnetto & 0x1F;
-
-       if (pip->clientrereg_resv_subnetto & 0x80) {
-               clientrereg = 1;
-               event.event = IB_EVENT_CLIENT_REREGISTER;
-               ib_dispatch_event(&event);
-       }
-
-       /*
-        * Do the port state change now that the other link parameters
-        * have been set.
-        * Changing the port physical state only makes sense if the link
-        * is down or is being set to down.
-        */
-       state = pip->linkspeed_portstate & 0xF;
-       lstate = (pip->portphysstate_linkdown >> 4) & 0xF;
-       if (lstate && !(state == IB_PORT_DOWN || state == IB_PORT_NOP))
-               goto err;
-
-       /*
-        * Only state changes of DOWN, ARM, and ACTIVE are valid
-        * and must be in the correct state to take effect (see 7.2.6).
-        */
-       switch (state) {
-       case IB_PORT_NOP:
-               if (lstate == 0)
-                       break;
-               /* FALLTHROUGH */
-       case IB_PORT_DOWN:
-               if (lstate == 0)
-                       lstate = IPATH_IB_LINKDOWN_ONLY;
-               else if (lstate == 1)
-                       lstate = IPATH_IB_LINKDOWN_SLEEP;
-               else if (lstate == 2)
-                       lstate = IPATH_IB_LINKDOWN;
-               else if (lstate == 3)
-                       lstate = IPATH_IB_LINKDOWN_DISABLE;
-               else
-                       goto err;
-               ipath_set_linkstate(dd, lstate);
-               if (lstate == IPATH_IB_LINKDOWN_DISABLE) {
-                       ret = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
-                       goto done;
-               }
-               ipath_wait_linkstate(dd, IPATH_LINKINIT | IPATH_LINKARMED |
-                               IPATH_LINKACTIVE, 1000);
-               break;
-       case IB_PORT_ARMED:
-               ipath_set_linkstate(dd, IPATH_IB_LINKARM);
-               break;
-       case IB_PORT_ACTIVE:
-               ipath_set_linkstate(dd, IPATH_IB_LINKACTIVE);
-               break;
-       default:
-               /* XXX We have already partially updated our state! */
-               goto err;
-       }
-
-       ret = recv_subn_get_portinfo(smp, ibdev, port);
-
-       if (clientrereg)
-               pip->clientrereg_resv_subnetto |= 0x80;
-
-       goto done;
-
-err:
-       smp->status |= IB_SMP_INVALID_FIELD;
-       ret = recv_subn_get_portinfo(smp, ibdev, port);
-
-done:
-       return ret;
-}
-
-/**
- * rm_pkey - decrement the reference count for the given PKEY
- * @dd: the infinipath device
- * @key: the PKEY index
- *
- * Return true if this was the last reference and the hardware table entry
- * needs to be changed.
- */
-static int rm_pkey(struct ipath_devdata *dd, u16 key)
-{
-       int i;
-       int ret;
-
-       for (i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
-               if (dd->ipath_pkeys[i] != key)
-                       continue;
-               if (atomic_dec_and_test(&dd->ipath_pkeyrefs[i])) {
-                       dd->ipath_pkeys[i] = 0;
-                       ret = 1;
-                       goto bail;
-               }
-               break;
-       }
-
-       ret = 0;
-
-bail:
-       return ret;
-}
-
-/**
- * add_pkey - add the given PKEY to the hardware table
- * @dd: the infinipath device
- * @key: the PKEY
- *
- * Return an error code if unable to add the entry, zero if no change,
- * or 1 if the hardware PKEY register needs to be updated.
- */
-static int add_pkey(struct ipath_devdata *dd, u16 key)
-{
-       int i;
-       u16 lkey = key & 0x7FFF;
-       int any = 0;
-       int ret;
-
-       if (lkey == 0x7FFF) {
-               ret = 0;
-               goto bail;
-       }
-
-       /* Look for an empty slot or a matching PKEY. */
-       for (i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
-               if (!dd->ipath_pkeys[i]) {
-                       any++;
-                       continue;
-               }
-               /* If it matches exactly, try to increment the ref count */
-               if (dd->ipath_pkeys[i] == key) {
-                       if (atomic_inc_return(&dd->ipath_pkeyrefs[i]) > 1) {
-                               ret = 0;
-                               goto bail;
-                       }
-                       /* Lost the race. Look for an empty slot below. */
-                       atomic_dec(&dd->ipath_pkeyrefs[i]);
-                       any++;
-               }
-               /*
-                * It makes no sense to have both the limited and unlimited
-                * PKEY set at the same time since the unlimited one will
-                * disable the limited one.
-                */
-               if ((dd->ipath_pkeys[i] & 0x7FFF) == lkey) {
-                       ret = -EEXIST;
-                       goto bail;
-               }
-       }
-       if (!any) {
-               ret = -EBUSY;
-               goto bail;
-       }
-       for (i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
-               if (!dd->ipath_pkeys[i] &&
-                   atomic_inc_return(&dd->ipath_pkeyrefs[i]) == 1) {
-                       /* for ipathstats, etc. */
-                       ipath_stats.sps_pkeys[i] = lkey;
-                       dd->ipath_pkeys[i] = key;
-                       ret = 1;
-                       goto bail;
-               }
-       }
-       ret = -EBUSY;
-
-bail:
-       return ret;
-}
-
-/**
- * set_pkeys - set the PKEY table for port 0
- * @dd: the infinipath device
- * @pkeys: the PKEY table
- * @port: the IB port number
- */
-static int set_pkeys(struct ipath_devdata *dd, u16 *pkeys, u8 port)
-{
-       struct ipath_portdata *pd;
-       int i;
-       int changed = 0;
-
-       /* always a kernel port, no locking needed */
-       pd = dd->ipath_pd[0];
-
-       for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) {
-               u16 key = pkeys[i];
-               u16 okey = pd->port_pkeys[i];
-
-               if (key == okey)
-                       continue;
-               /*
-                * The value of this PKEY table entry is changing.
-                * Remove the old entry in the hardware's array of PKEYs.
-                */
-               if (okey & 0x7FFF)
-                       changed |= rm_pkey(dd, okey);
-               if (key & 0x7FFF) {
-                       int ret = add_pkey(dd, key);
-
-                       if (ret < 0)
-                               key = 0;
-                       else
-                               changed |= ret;
-               }
-               pd->port_pkeys[i] = key;
-       }
-       if (changed) {
-               u64 pkey;
-               struct ib_event event;
-
-               pkey = (u64) dd->ipath_pkeys[0] |
-                       ((u64) dd->ipath_pkeys[1] << 16) |
-                       ((u64) dd->ipath_pkeys[2] << 32) |
-                       ((u64) dd->ipath_pkeys[3] << 48);
-               ipath_cdbg(VERBOSE, "p0 new pkey reg %llx\n",
-                          (unsigned long long) pkey);
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_partitionkey,
-                                pkey);
-
-               event.event = IB_EVENT_PKEY_CHANGE;
-               event.device = &dd->verbs_dev->ibdev;
-               event.element.port_num = port;
-               ib_dispatch_event(&event);
-       }
-       return 0;
-}
-
-static int recv_subn_set_pkeytable(struct ib_smp *smp,
-                                  struct ib_device *ibdev, u8 port)
-{
-       u32 startpx = 32 * (be32_to_cpu(smp->attr_mod) & 0xffff);
-       __be16 *p = (__be16 *) smp->data;
-       u16 *q = (u16 *) smp->data;
-       struct ipath_ibdev *dev = to_idev(ibdev);
-       unsigned i, n = ipath_get_npkeys(dev->dd);
-
-       for (i = 0; i < n; i++)
-               q[i] = be16_to_cpu(p[i]);
-
-       if (startpx != 0 || set_pkeys(dev->dd, q, port) != 0)
-               smp->status |= IB_SMP_INVALID_FIELD;
-
-       return recv_subn_get_pkeytable(smp, ibdev);
-}
-
-static int recv_pma_get_classportinfo(struct ib_pma_mad *pmp)
-{
-       struct ib_class_port_info *p =
-               (struct ib_class_port_info *)pmp->data;
-
-       memset(pmp->data, 0, sizeof(pmp->data));
-
-       if (pmp->mad_hdr.attr_mod != 0)
-               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-
-       /* Indicate AllPortSelect is valid (only one port anyway) */
-       p->capability_mask = cpu_to_be16(1 << 8);
-       p->base_version = 1;
-       p->class_version = 1;
-       /*
-        * Expected response time is 4.096 usec. * 2^18 == 1.073741824
-        * sec.
-        */
-       p->resp_time_value = 18;
-
-       return reply((struct ib_smp *) pmp);
-}
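[Editor's note, not part of the original diff] The resp_time_value of 18 set above follows the usual IBA encoding of 4.096 usec * 2^N; a minimal sketch of that conversion, using a hypothetical helper with kernel integer types, would be:

/* Sketch only: decode a ClassPortInfo resp_time_value into nanoseconds. */
static inline u64 resp_time_ns(u8 resp_time_value)
{
	/* 4.096 usec == 4096 ns, scaled by 2^resp_time_value */
	return 4096ULL << resp_time_value;	/* 18 -> 1073741824 ns (~1.07 s) */
}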
-
-/*
- * The PortSamplesControl.CounterMasks field is an array of 3-bit fields
- * which specify the N'th counter's capabilities. See ch. 16.1.3.2.
- * We support 5 counters which only count the mandatory quantities.
- */
-#define COUNTER_MASK(q, n) (q << ((9 - n) * 3))
-#define COUNTER_MASK0_9 cpu_to_be32(COUNTER_MASK(1, 0) | \
-                                   COUNTER_MASK(1, 1) | \
-                                   COUNTER_MASK(1, 2) | \
-                                   COUNTER_MASK(1, 3) | \
-                                   COUNTER_MASK(1, 4))
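[Editor's note, not part of the original diff] For reference, expanding the macro above with the sample capability value 1 for counters 0 through 4 gives the following host-order constant (a worked example, not code from the driver):

/*
 * COUNTER_MASK(1, n) places a 3-bit field of value 1 at bit (9 - n) * 3,
 * i.e. bits 27, 24, 21, 18 and 15 for n = 0..4, so COUNTER_MASK0_9 is
 * 0x08000000 | 0x01000000 | 0x00200000 | 0x00040000 | 0x00008000
 * == 0x09248000 before the cpu_to_be32() conversion.
 */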
-
-static int recv_pma_get_portsamplescontrol(struct ib_pma_mad *pmp,
-                                          struct ib_device *ibdev, u8 port)
-{
-       struct ib_pma_portsamplescontrol *p =
-               (struct ib_pma_portsamplescontrol *)pmp->data;
-       struct ipath_ibdev *dev = to_idev(ibdev);
-       struct ipath_cregs const *crp = dev->dd->ipath_cregs;
-       unsigned long flags;
-       u8 port_select = p->port_select;
-
-       memset(pmp->data, 0, sizeof(pmp->data));
-
-       p->port_select = port_select;
-       if (pmp->mad_hdr.attr_mod != 0 ||
-           (port_select != port && port_select != 0xFF))
-               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-       /*
-        * Ticks are 10x the link transfer period which for 2.5Gbs is 4
-        * nsec.  0 == 4 nsec., 1 == 8 nsec., ..., 255 == 1020 nsec.  Sample
-        * intervals are counted in ticks.  Since we use Linux timers, that
-        * count in jiffies, we can't sample for less than 1000 ticks if HZ
-        * == 1000 (4000 ticks if HZ is 250).  link_speed_active returns 2 for
-        * DDR, 1 for SDR, set the tick to 1 for DDR, 0 for SDR on chips that
-        * have hardware support for delaying packets.
-        */
-       if (crp->cr_psstat)
-               p->tick = dev->dd->ipath_link_speed_active - 1;
-       else
-               p->tick = 250;          /* 1 usec. */
-       p->counter_width = 4;   /* 32 bit counters */
-       p->counter_mask0_9 = COUNTER_MASK0_9;
-       spin_lock_irqsave(&dev->pending_lock, flags);
-       if (crp->cr_psstat)
-               p->sample_status = ipath_read_creg32(dev->dd, crp->cr_psstat);
-       else
-               p->sample_status = dev->pma_sample_status;
-       p->sample_start = cpu_to_be32(dev->pma_sample_start);
-       p->sample_interval = cpu_to_be32(dev->pma_sample_interval);
-       p->tag = cpu_to_be16(dev->pma_tag);
-       p->counter_select[0] = dev->pma_counter_select[0];
-       p->counter_select[1] = dev->pma_counter_select[1];
-       p->counter_select[2] = dev->pma_counter_select[2];
-       p->counter_select[3] = dev->pma_counter_select[3];
-       p->counter_select[4] = dev->pma_counter_select[4];
-       spin_unlock_irqrestore(&dev->pending_lock, flags);
-
-       return reply((struct ib_smp *) pmp);
-}
-
-static int recv_pma_set_portsamplescontrol(struct ib_pma_mad *pmp,
-                                          struct ib_device *ibdev, u8 port)
-{
-       struct ib_pma_portsamplescontrol *p =
-               (struct ib_pma_portsamplescontrol *)pmp->data;
-       struct ipath_ibdev *dev = to_idev(ibdev);
-       struct ipath_cregs const *crp = dev->dd->ipath_cregs;
-       unsigned long flags;
-       u8 status;
-       int ret;
-
-       if (pmp->mad_hdr.attr_mod != 0 ||
-           (p->port_select != port && p->port_select != 0xFF)) {
-               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-               ret = reply((struct ib_smp *) pmp);
-               goto bail;
-       }
-
-       spin_lock_irqsave(&dev->pending_lock, flags);
-       if (crp->cr_psstat)
-               status = ipath_read_creg32(dev->dd, crp->cr_psstat);
-       else
-               status = dev->pma_sample_status;
-       if (status == IB_PMA_SAMPLE_STATUS_DONE) {
-               dev->pma_sample_start = be32_to_cpu(p->sample_start);
-               dev->pma_sample_interval = be32_to_cpu(p->sample_interval);
-               dev->pma_tag = be16_to_cpu(p->tag);
-               dev->pma_counter_select[0] = p->counter_select[0];
-               dev->pma_counter_select[1] = p->counter_select[1];
-               dev->pma_counter_select[2] = p->counter_select[2];
-               dev->pma_counter_select[3] = p->counter_select[3];
-               dev->pma_counter_select[4] = p->counter_select[4];
-               if (crp->cr_psstat) {
-                       ipath_write_creg(dev->dd, crp->cr_psinterval,
-                                        dev->pma_sample_interval);
-                       ipath_write_creg(dev->dd, crp->cr_psstart,
-                                        dev->pma_sample_start);
-               } else
-                       dev->pma_sample_status = IB_PMA_SAMPLE_STATUS_STARTED;
-       }
-       spin_unlock_irqrestore(&dev->pending_lock, flags);
-
-       ret = recv_pma_get_portsamplescontrol(pmp, ibdev, port);
-
-bail:
-       return ret;
-}
-
-static u64 get_counter(struct ipath_ibdev *dev,
-                      struct ipath_cregs const *crp,
-                      __be16 sel)
-{
-       u64 ret;
-
-       switch (sel) {
-       case IB_PMA_PORT_XMIT_DATA:
-               ret = (crp->cr_psxmitdatacount) ?
-                       ipath_read_creg32(dev->dd, crp->cr_psxmitdatacount) :
-                       dev->ipath_sword;
-               break;
-       case IB_PMA_PORT_RCV_DATA:
-               ret = (crp->cr_psrcvdatacount) ?
-                       ipath_read_creg32(dev->dd, crp->cr_psrcvdatacount) :
-                       dev->ipath_rword;
-               break;
-       case IB_PMA_PORT_XMIT_PKTS:
-               ret = (crp->cr_psxmitpktscount) ?
-                       ipath_read_creg32(dev->dd, crp->cr_psxmitpktscount) :
-                       dev->ipath_spkts;
-               break;
-       case IB_PMA_PORT_RCV_PKTS:
-               ret = (crp->cr_psrcvpktscount) ?
-                       ipath_read_creg32(dev->dd, crp->cr_psrcvpktscount) :
-                       dev->ipath_rpkts;
-               break;
-       case IB_PMA_PORT_XMIT_WAIT:
-               ret = (crp->cr_psxmitwaitcount) ?
-                       ipath_read_creg32(dev->dd, crp->cr_psxmitwaitcount) :
-                       dev->ipath_xmit_wait;
-               break;
-       default:
-               ret = 0;
-       }
-
-       return ret;
-}
-
-static int recv_pma_get_portsamplesresult(struct ib_pma_mad *pmp,
-                                         struct ib_device *ibdev)
-{
-       struct ib_pma_portsamplesresult *p =
-               (struct ib_pma_portsamplesresult *)pmp->data;
-       struct ipath_ibdev *dev = to_idev(ibdev);
-       struct ipath_cregs const *crp = dev->dd->ipath_cregs;
-       u8 status;
-       int i;
-
-       memset(pmp->data, 0, sizeof(pmp->data));
-       p->tag = cpu_to_be16(dev->pma_tag);
-       if (crp->cr_psstat)
-               status = ipath_read_creg32(dev->dd, crp->cr_psstat);
-       else
-               status = dev->pma_sample_status;
-       p->sample_status = cpu_to_be16(status);
-       for (i = 0; i < ARRAY_SIZE(dev->pma_counter_select); i++)
-               p->counter[i] = (status != IB_PMA_SAMPLE_STATUS_DONE) ? 0 :
-                   cpu_to_be32(
-                       get_counter(dev, crp, dev->pma_counter_select[i]));
-
-       return reply((struct ib_smp *) pmp);
-}
-
-static int recv_pma_get_portsamplesresult_ext(struct ib_pma_mad *pmp,
-                                             struct ib_device *ibdev)
-{
-       struct ib_pma_portsamplesresult_ext *p =
-               (struct ib_pma_portsamplesresult_ext *)pmp->data;
-       struct ipath_ibdev *dev = to_idev(ibdev);
-       struct ipath_cregs const *crp = dev->dd->ipath_cregs;
-       u8 status;
-       int i;
-
-       memset(pmp->data, 0, sizeof(pmp->data));
-       p->tag = cpu_to_be16(dev->pma_tag);
-       if (crp->cr_psstat)
-               status = ipath_read_creg32(dev->dd, crp->cr_psstat);
-       else
-               status = dev->pma_sample_status;
-       p->sample_status = cpu_to_be16(status);
-       /* 64 bits */
-       p->extended_width = cpu_to_be32(0x80000000);
-       for (i = 0; i < ARRAY_SIZE(dev->pma_counter_select); i++)
-               p->counter[i] = (status != IB_PMA_SAMPLE_STATUS_DONE) ? 0 :
-                   cpu_to_be64(
-                       get_counter(dev, crp, dev->pma_counter_select[i]));
-
-       return reply((struct ib_smp *) pmp);
-}
-
-static int recv_pma_get_portcounters(struct ib_pma_mad *pmp,
-                                    struct ib_device *ibdev, u8 port)
-{
-       struct ib_pma_portcounters *p = (struct ib_pma_portcounters *)
-               pmp->data;
-       struct ipath_ibdev *dev = to_idev(ibdev);
-       struct ipath_verbs_counters cntrs;
-       u8 port_select = p->port_select;
-
-       ipath_get_counters(dev->dd, &cntrs);
-
-       /* Adjust counters for any resets done. */
-       cntrs.symbol_error_counter -= dev->z_symbol_error_counter;
-       cntrs.link_error_recovery_counter -=
-               dev->z_link_error_recovery_counter;
-       cntrs.link_downed_counter -= dev->z_link_downed_counter;
-       cntrs.port_rcv_errors += dev->rcv_errors;
-       cntrs.port_rcv_errors -= dev->z_port_rcv_errors;
-       cntrs.port_rcv_remphys_errors -= dev->z_port_rcv_remphys_errors;
-       cntrs.port_xmit_discards -= dev->z_port_xmit_discards;
-       cntrs.port_xmit_data -= dev->z_port_xmit_data;
-       cntrs.port_rcv_data -= dev->z_port_rcv_data;
-       cntrs.port_xmit_packets -= dev->z_port_xmit_packets;
-       cntrs.port_rcv_packets -= dev->z_port_rcv_packets;
-       cntrs.local_link_integrity_errors -=
-               dev->z_local_link_integrity_errors;
-       cntrs.excessive_buffer_overrun_errors -=
-               dev->z_excessive_buffer_overrun_errors;
-       cntrs.vl15_dropped -= dev->z_vl15_dropped;
-       cntrs.vl15_dropped += dev->n_vl15_dropped;
-
-       memset(pmp->data, 0, sizeof(pmp->data));
-
-       p->port_select = port_select;
-       if (pmp->mad_hdr.attr_mod != 0 ||
-           (port_select != port && port_select != 0xFF))
-               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-
-       if (cntrs.symbol_error_counter > 0xFFFFUL)
-               p->symbol_error_counter = cpu_to_be16(0xFFFF);
-       else
-               p->symbol_error_counter =
-                       cpu_to_be16((u16)cntrs.symbol_error_counter);
-       if (cntrs.link_error_recovery_counter > 0xFFUL)
-               p->link_error_recovery_counter = 0xFF;
-       else
-               p->link_error_recovery_counter =
-                       (u8)cntrs.link_error_recovery_counter;
-       if (cntrs.link_downed_counter > 0xFFUL)
-               p->link_downed_counter = 0xFF;
-       else
-               p->link_downed_counter = (u8)cntrs.link_downed_counter;
-       if (cntrs.port_rcv_errors > 0xFFFFUL)
-               p->port_rcv_errors = cpu_to_be16(0xFFFF);
-       else
-               p->port_rcv_errors =
-                       cpu_to_be16((u16) cntrs.port_rcv_errors);
-       if (cntrs.port_rcv_remphys_errors > 0xFFFFUL)
-               p->port_rcv_remphys_errors = cpu_to_be16(0xFFFF);
-       else
-               p->port_rcv_remphys_errors =
-                       cpu_to_be16((u16)cntrs.port_rcv_remphys_errors);
-       if (cntrs.port_xmit_discards > 0xFFFFUL)
-               p->port_xmit_discards = cpu_to_be16(0xFFFF);
-       else
-               p->port_xmit_discards =
-                       cpu_to_be16((u16)cntrs.port_xmit_discards);
-       if (cntrs.local_link_integrity_errors > 0xFUL)
-               cntrs.local_link_integrity_errors = 0xFUL;
-       if (cntrs.excessive_buffer_overrun_errors > 0xFUL)
-               cntrs.excessive_buffer_overrun_errors = 0xFUL;
-       p->link_overrun_errors = (cntrs.local_link_integrity_errors << 4) |
-               cntrs.excessive_buffer_overrun_errors;
-       if (cntrs.vl15_dropped > 0xFFFFUL)
-               p->vl15_dropped = cpu_to_be16(0xFFFF);
-       else
-               p->vl15_dropped = cpu_to_be16((u16)cntrs.vl15_dropped);
-       if (cntrs.port_xmit_data > 0xFFFFFFFFUL)
-               p->port_xmit_data = cpu_to_be32(0xFFFFFFFF);
-       else
-               p->port_xmit_data = cpu_to_be32((u32)cntrs.port_xmit_data);
-       if (cntrs.port_rcv_data > 0xFFFFFFFFUL)
-               p->port_rcv_data = cpu_to_be32(0xFFFFFFFF);
-       else
-               p->port_rcv_data = cpu_to_be32((u32)cntrs.port_rcv_data);
-       if (cntrs.port_xmit_packets > 0xFFFFFFFFUL)
-               p->port_xmit_packets = cpu_to_be32(0xFFFFFFFF);
-       else
-               p->port_xmit_packets =
-                       cpu_to_be32((u32)cntrs.port_xmit_packets);
-       if (cntrs.port_rcv_packets > 0xFFFFFFFFUL)
-               p->port_rcv_packets = cpu_to_be32(0xFFFFFFFF);
-       else
-               p->port_rcv_packets =
-                       cpu_to_be32((u32) cntrs.port_rcv_packets);
-
-       return reply((struct ib_smp *) pmp);
-}
-
-static int recv_pma_get_portcounters_ext(struct ib_pma_mad *pmp,
-                                        struct ib_device *ibdev, u8 port)
-{
-       struct ib_pma_portcounters_ext *p =
-               (struct ib_pma_portcounters_ext *)pmp->data;
-       struct ipath_ibdev *dev = to_idev(ibdev);
-       u64 swords, rwords, spkts, rpkts, xwait;
-       u8 port_select = p->port_select;
-
-       ipath_snapshot_counters(dev->dd, &swords, &rwords, &spkts,
-                               &rpkts, &xwait);
-
-       /* Adjust counters for any resets done. */
-       swords -= dev->z_port_xmit_data;
-       rwords -= dev->z_port_rcv_data;
-       spkts -= dev->z_port_xmit_packets;
-       rpkts -= dev->z_port_rcv_packets;
-
-       memset(pmp->data, 0, sizeof(pmp->data));
-
-       p->port_select = port_select;
-       if (pmp->mad_hdr.attr_mod != 0 ||
-           (port_select != port && port_select != 0xFF))
-               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
-
-       p->port_xmit_data = cpu_to_be64(swords);
-       p->port_rcv_data = cpu_to_be64(rwords);
-       p->port_xmit_packets = cpu_to_be64(spkts);
-       p->port_rcv_packets = cpu_to_be64(rpkts);
-       p->port_unicast_xmit_packets = cpu_to_be64(dev->n_unicast_xmit);
-       p->port_unicast_rcv_packets = cpu_to_be64(dev->n_unicast_rcv);
-       p->port_multicast_xmit_packets = cpu_to_be64(dev->n_multicast_xmit);
-       p->port_multicast_rcv_packets = cpu_to_be64(dev->n_multicast_rcv);
-
-       return reply((struct ib_smp *) pmp);
-}
-
-static int recv_pma_set_portcounters(struct ib_pma_mad *pmp,
-                                    struct ib_device *ibdev, u8 port)
-{
-       struct ib_pma_portcounters *p = (struct ib_pma_portcounters *)
-               pmp->data;
-       struct ipath_ibdev *dev = to_idev(ibdev);
-       struct ipath_verbs_counters cntrs;
-
-       /*
-        * Since the HW doesn't support clearing counters, we save the
-        * current count and subtract it from future responses.
-        */
-       ipath_get_counters(dev->dd, &cntrs);
-
-       if (p->counter_select & IB_PMA_SEL_SYMBOL_ERROR)
-               dev->z_symbol_error_counter = cntrs.symbol_error_counter;
-
-       if (p->counter_select & IB_PMA_SEL_LINK_ERROR_RECOVERY)
-               dev->z_link_error_recovery_counter =
-                       cntrs.link_error_recovery_counter;
-
-       if (p->counter_select & IB_PMA_SEL_LINK_DOWNED)
-               dev->z_link_downed_counter = cntrs.link_downed_counter;
-
-       if (p->counter_select & IB_PMA_SEL_PORT_RCV_ERRORS)
-               dev->z_port_rcv_errors =
-                       cntrs.port_rcv_errors + dev->rcv_errors;
-
-       if (p->counter_select & IB_PMA_SEL_PORT_RCV_REMPHYS_ERRORS)
-               dev->z_port_rcv_remphys_errors =
-                       cntrs.port_rcv_remphys_errors;
-
-       if (p->counter_select & IB_PMA_SEL_PORT_XMIT_DISCARDS)
-               dev->z_port_xmit_discards = cntrs.port_xmit_discards;
-
-       if (p->counter_select & IB_PMA_SEL_LOCAL_LINK_INTEGRITY_ERRORS)
-               dev->z_local_link_integrity_errors =
-                       cntrs.local_link_integrity_errors;
-
-       if (p->counter_select & IB_PMA_SEL_EXCESSIVE_BUFFER_OVERRUNS)
-               dev->z_excessive_buffer_overrun_errors =
-                       cntrs.excessive_buffer_overrun_errors;
-
-       if (p->counter_select & IB_PMA_SEL_PORT_VL15_DROPPED) {
-               dev->n_vl15_dropped = 0;
-               dev->z_vl15_dropped = cntrs.vl15_dropped;
-       }
-
-       if (p->counter_select & IB_PMA_SEL_PORT_XMIT_DATA)
-               dev->z_port_xmit_data = cntrs.port_xmit_data;
-
-       if (p->counter_select & IB_PMA_SEL_PORT_RCV_DATA)
-               dev->z_port_rcv_data = cntrs.port_rcv_data;
-
-       if (p->counter_select & IB_PMA_SEL_PORT_XMIT_PACKETS)
-               dev->z_port_xmit_packets = cntrs.port_xmit_packets;
-
-       if (p->counter_select & IB_PMA_SEL_PORT_RCV_PACKETS)
-               dev->z_port_rcv_packets = cntrs.port_rcv_packets;
-
-       return recv_pma_get_portcounters(pmp, ibdev, port);
-}
-
-static int recv_pma_set_portcounters_ext(struct ib_pma_mad *pmp,
-                                        struct ib_device *ibdev, u8 port)
-{
-       struct ib_pma_portcounters *p = (struct ib_pma_portcounters *)
-               pmp->data;
-       struct ipath_ibdev *dev = to_idev(ibdev);
-       u64 swords, rwords, spkts, rpkts, xwait;
-
-       ipath_snapshot_counters(dev->dd, &swords, &rwords, &spkts,
-                               &rpkts, &xwait);
-
-       if (p->counter_select & IB_PMA_SELX_PORT_XMIT_DATA)
-               dev->z_port_xmit_data = swords;
-
-       if (p->counter_select & IB_PMA_SELX_PORT_RCV_DATA)
-               dev->z_port_rcv_data = rwords;
-
-       if (p->counter_select & IB_PMA_SELX_PORT_XMIT_PACKETS)
-               dev->z_port_xmit_packets = spkts;
-
-       if (p->counter_select & IB_PMA_SELX_PORT_RCV_PACKETS)
-               dev->z_port_rcv_packets = rpkts;
-
-       if (p->counter_select & IB_PMA_SELX_PORT_UNI_XMIT_PACKETS)
-               dev->n_unicast_xmit = 0;
-
-       if (p->counter_select & IB_PMA_SELX_PORT_UNI_RCV_PACKETS)
-               dev->n_unicast_rcv = 0;
-
-       if (p->counter_select & IB_PMA_SELX_PORT_MULTI_XMIT_PACKETS)
-               dev->n_multicast_xmit = 0;
-
-       if (p->counter_select & IB_PMA_SELX_PORT_MULTI_RCV_PACKETS)
-               dev->n_multicast_rcv = 0;
-
-       return recv_pma_get_portcounters_ext(pmp, ibdev, port);
-}
-
-static int process_subn(struct ib_device *ibdev, int mad_flags,
-                       u8 port_num, const struct ib_mad *in_mad,
-                       struct ib_mad *out_mad)
-{
-       struct ib_smp *smp = (struct ib_smp *)out_mad;
-       struct ipath_ibdev *dev = to_idev(ibdev);
-       int ret;
-
-       *out_mad = *in_mad;
-       if (smp->class_version != 1) {
-               smp->status |= IB_SMP_UNSUP_VERSION;
-               ret = reply(smp);
-               goto bail;
-       }
-
-       /* Is the mkey in the process of expiring? */
-       if (dev->mkey_lease_timeout &&
-           time_after_eq(jiffies, dev->mkey_lease_timeout)) {
-               /* Clear timeout and mkey protection field. */
-               dev->mkey_lease_timeout = 0;
-               dev->mkeyprot = 0;
-       }
-
-       /*
-        * M_Key checking depends on
-        * Portinfo:M_Key_protect_bits
-        */
-       if ((mad_flags & IB_MAD_IGNORE_MKEY) == 0 && dev->mkey != 0 &&
-           dev->mkey != smp->mkey &&
-           (smp->method == IB_MGMT_METHOD_SET ||
-            (smp->method == IB_MGMT_METHOD_GET &&
-             dev->mkeyprot >= 2))) {
-               if (dev->mkey_violations != 0xFFFF)
-                       ++dev->mkey_violations;
-               if (dev->mkey_lease_timeout ||
-                   dev->mkey_lease_period == 0) {
-                       ret = IB_MAD_RESULT_SUCCESS |
-                               IB_MAD_RESULT_CONSUMED;
-                       goto bail;
-               }
-               dev->mkey_lease_timeout = jiffies +
-                       dev->mkey_lease_period * HZ;
-               /* Future: Generate a trap notice. */
-               ret = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
-               goto bail;
-       } else if (dev->mkey_lease_timeout)
-               dev->mkey_lease_timeout = 0;
-
-       switch (smp->method) {
-       case IB_MGMT_METHOD_GET:
-               switch (smp->attr_id) {
-               case IB_SMP_ATTR_NODE_DESC:
-                       ret = recv_subn_get_nodedescription(smp, ibdev);
-                       goto bail;
-               case IB_SMP_ATTR_NODE_INFO:
-                       ret = recv_subn_get_nodeinfo(smp, ibdev, port_num);
-                       goto bail;
-               case IB_SMP_ATTR_GUID_INFO:
-                       ret = recv_subn_get_guidinfo(smp, ibdev);
-                       goto bail;
-               case IB_SMP_ATTR_PORT_INFO:
-                       ret = recv_subn_get_portinfo(smp, ibdev, port_num);
-                       goto bail;
-               case IB_SMP_ATTR_PKEY_TABLE:
-                       ret = recv_subn_get_pkeytable(smp, ibdev);
-                       goto bail;
-               case IB_SMP_ATTR_SM_INFO:
-                       if (dev->port_cap_flags & IB_PORT_SM_DISABLED) {
-                               ret = IB_MAD_RESULT_SUCCESS |
-                                       IB_MAD_RESULT_CONSUMED;
-                               goto bail;
-                       }
-                       if (dev->port_cap_flags & IB_PORT_SM) {
-                               ret = IB_MAD_RESULT_SUCCESS;
-                               goto bail;
-                       }
-                       /* FALLTHROUGH */
-               default:
-                       smp->status |= IB_SMP_UNSUP_METH_ATTR;
-                       ret = reply(smp);
-                       goto bail;
-               }
-
-       case IB_MGMT_METHOD_SET:
-               switch (smp->attr_id) {
-               case IB_SMP_ATTR_GUID_INFO:
-                       ret = recv_subn_set_guidinfo(smp, ibdev);
-                       goto bail;
-               case IB_SMP_ATTR_PORT_INFO:
-                       ret = recv_subn_set_portinfo(smp, ibdev, port_num);
-                       goto bail;
-               case IB_SMP_ATTR_PKEY_TABLE:
-                       ret = recv_subn_set_pkeytable(smp, ibdev, port_num);
-                       goto bail;
-               case IB_SMP_ATTR_SM_INFO:
-                       if (dev->port_cap_flags & IB_PORT_SM_DISABLED) {
-                               ret = IB_MAD_RESULT_SUCCESS |
-                                       IB_MAD_RESULT_CONSUMED;
-                               goto bail;
-                       }
-                       if (dev->port_cap_flags & IB_PORT_SM) {
-                               ret = IB_MAD_RESULT_SUCCESS;
-                               goto bail;
-                       }
-                       /* FALLTHROUGH */
-               default:
-                       smp->status |= IB_SMP_UNSUP_METH_ATTR;
-                       ret = reply(smp);
-                       goto bail;
-               }
-
-       case IB_MGMT_METHOD_TRAP:
-       case IB_MGMT_METHOD_REPORT:
-       case IB_MGMT_METHOD_REPORT_RESP:
-       case IB_MGMT_METHOD_TRAP_REPRESS:
-       case IB_MGMT_METHOD_GET_RESP:
-               /*
-                * The ib_mad module will call us to process responses
-                * before checking for other consumers.
-                * Just tell the caller to process it normally.
-                */
-               ret = IB_MAD_RESULT_SUCCESS;
-               goto bail;
-       default:
-               smp->status |= IB_SMP_UNSUP_METHOD;
-               ret = reply(smp);
-       }
-
-bail:
-       return ret;
-}
-
-static int process_perf(struct ib_device *ibdev, u8 port_num,
-                       const struct ib_mad *in_mad,
-                       struct ib_mad *out_mad)
-{
-       struct ib_pma_mad *pmp = (struct ib_pma_mad *)out_mad;
-       int ret;
-
-       *out_mad = *in_mad;
-       if (pmp->mad_hdr.class_version != 1) {
-               pmp->mad_hdr.status |= IB_SMP_UNSUP_VERSION;
-               ret = reply((struct ib_smp *) pmp);
-               goto bail;
-       }
-
-       switch (pmp->mad_hdr.method) {
-       case IB_MGMT_METHOD_GET:
-               switch (pmp->mad_hdr.attr_id) {
-               case IB_PMA_CLASS_PORT_INFO:
-                       ret = recv_pma_get_classportinfo(pmp);
-                       goto bail;
-               case IB_PMA_PORT_SAMPLES_CONTROL:
-                       ret = recv_pma_get_portsamplescontrol(pmp, ibdev,
-                                                             port_num);
-                       goto bail;
-               case IB_PMA_PORT_SAMPLES_RESULT:
-                       ret = recv_pma_get_portsamplesresult(pmp, ibdev);
-                       goto bail;
-               case IB_PMA_PORT_SAMPLES_RESULT_EXT:
-                       ret = recv_pma_get_portsamplesresult_ext(pmp,
-                                                                ibdev);
-                       goto bail;
-               case IB_PMA_PORT_COUNTERS:
-                       ret = recv_pma_get_portcounters(pmp, ibdev,
-                                                       port_num);
-                       goto bail;
-               case IB_PMA_PORT_COUNTERS_EXT:
-                       ret = recv_pma_get_portcounters_ext(pmp, ibdev,
-                                                           port_num);
-                       goto bail;
-               default:
-                       pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
-                       ret = reply((struct ib_smp *) pmp);
-                       goto bail;
-               }
-
-       case IB_MGMT_METHOD_SET:
-               switch (pmp->mad_hdr.attr_id) {
-               case IB_PMA_PORT_SAMPLES_CONTROL:
-                       ret = recv_pma_set_portsamplescontrol(pmp, ibdev,
-                                                             port_num);
-                       goto bail;
-               case IB_PMA_PORT_COUNTERS:
-                       ret = recv_pma_set_portcounters(pmp, ibdev,
-                                                       port_num);
-                       goto bail;
-               case IB_PMA_PORT_COUNTERS_EXT:
-                       ret = recv_pma_set_portcounters_ext(pmp, ibdev,
-                                                           port_num);
-                       goto bail;
-               default:
-                       pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
-                       ret = reply((struct ib_smp *) pmp);
-                       goto bail;
-               }
-
-       case IB_MGMT_METHOD_GET_RESP:
-               /*
-                * The ib_mad module will call us to process responses
-                * before checking for other consumers.
-                * Just tell the caller to process it normally.
-                */
-               ret = IB_MAD_RESULT_SUCCESS;
-               goto bail;
-       default:
-               pmp->mad_hdr.status |= IB_SMP_UNSUP_METHOD;
-               ret = reply((struct ib_smp *) pmp);
-       }
-
-bail:
-       return ret;
-}
-
-/**
- * ipath_process_mad - process an incoming MAD packet
- * @ibdev: the infiniband device this packet came in on
- * @mad_flags: MAD flags
- * @port_num: the port number this packet came in on
- * @in_wc: the work completion entry for this packet
- * @in_grh: the global route header for this packet
- * @in: the incoming MAD
- * @in_mad_size: size of the incoming MAD
- * @out: any outgoing MAD reply
- * @out_mad_size: size available for the outgoing MAD reply
- * @out_mad_pkey_index: P_Key index for the outgoing MAD (not used here)
- *
- * Returns IB_MAD_RESULT_SUCCESS if this is a MAD that we are not
- * interested in processing.
- *
- * Note that the verbs framework has already done the MAD sanity checks,
- * and hop count/pointer updating for IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE
- * MADs.
- *
- * This is called by the ib_mad module.
- */
-int ipath_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
-                     const struct ib_wc *in_wc, const struct ib_grh *in_grh,
-                     const struct ib_mad_hdr *in, size_t in_mad_size,
-                     struct ib_mad_hdr *out, size_t *out_mad_size,
-                     u16 *out_mad_pkey_index)
-{
-       int ret;
-       const struct ib_mad *in_mad = (const struct ib_mad *)in;
-       struct ib_mad *out_mad = (struct ib_mad *)out;
-
-       if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) ||
-                        *out_mad_size != sizeof(*out_mad)))
-               return IB_MAD_RESULT_FAILURE;
-
-       switch (in_mad->mad_hdr.mgmt_class) {
-       case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
-       case IB_MGMT_CLASS_SUBN_LID_ROUTED:
-               ret = process_subn(ibdev, mad_flags, port_num,
-                                  in_mad, out_mad);
-               goto bail;
-       case IB_MGMT_CLASS_PERF_MGMT:
-               ret = process_perf(ibdev, port_num, in_mad, out_mad);
-               goto bail;
-       default:
-               ret = IB_MAD_RESULT_SUCCESS;
-       }
-
-bail:
-       return ret;
-}
diff --git a/drivers/infiniband/hw/ipath/ipath_mmap.c b/drivers/infiniband/hw/ipath/ipath_mmap.c
deleted file mode 100644 (file)
index e732742..0000000
+++ /dev/null
@@ -1,174 +0,0 @@
-/*
- * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/module.h>
-#include <linux/vmalloc.h>
-#include <linux/slab.h>
-#include <linux/mm.h>
-#include <linux/errno.h>
-#include <asm/pgtable.h>
-
-#include "ipath_verbs.h"
-
-/**
- * ipath_release_mmap_info - free mmap info structure
- * @ref: a pointer to the kref within struct ipath_mmap_info
- */
-void ipath_release_mmap_info(struct kref *ref)
-{
-       struct ipath_mmap_info *ip =
-               container_of(ref, struct ipath_mmap_info, ref);
-       struct ipath_ibdev *dev = to_idev(ip->context->device);
-
-       spin_lock_irq(&dev->pending_lock);
-       list_del(&ip->pending_mmaps);
-       spin_unlock_irq(&dev->pending_lock);
-
-       vfree(ip->obj);
-       kfree(ip);
-}
-
-/*
- * open and close keep track of how many times the CQ is mapped,
- * to avoid releasing it.
- */
-static void ipath_vma_open(struct vm_area_struct *vma)
-{
-       struct ipath_mmap_info *ip = vma->vm_private_data;
-
-       kref_get(&ip->ref);
-}
-
-static void ipath_vma_close(struct vm_area_struct *vma)
-{
-       struct ipath_mmap_info *ip = vma->vm_private_data;
-
-       kref_put(&ip->ref, ipath_release_mmap_info);
-}
-
-static const struct vm_operations_struct ipath_vm_ops = {
-       .open =     ipath_vma_open,
-       .close =    ipath_vma_close,
-};
-
-/**
- * ipath_mmap - create a new mmap region
- * @context: the IB user context of the process making the mmap() call
- * @vma: the VMA to be initialized
- * Return zero if the mmap is OK. Otherwise, return an errno.
- */
-int ipath_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
-{
-       struct ipath_ibdev *dev = to_idev(context->device);
-       unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
-       unsigned long size = vma->vm_end - vma->vm_start;
-       struct ipath_mmap_info *ip, *pp;
-       int ret = -EINVAL;
-
-       /*
-        * Search the device's list of objects waiting for a mmap call.
-        * Normally, this list is very short since a call to create a
-        * CQ, QP, or SRQ is soon followed by a call to mmap().
-        */
-       spin_lock_irq(&dev->pending_lock);
-       list_for_each_entry_safe(ip, pp, &dev->pending_mmaps,
-                                pending_mmaps) {
-               /* Only the creator is allowed to mmap the object */
-               if (context != ip->context || (__u64) offset != ip->offset)
-                       continue;
-               /* Don't allow a mmap larger than the object. */
-               if (size > ip->size)
-                       break;
-
-               list_del_init(&ip->pending_mmaps);
-               spin_unlock_irq(&dev->pending_lock);
-
-               ret = remap_vmalloc_range(vma, ip->obj, 0);
-               if (ret)
-                       goto done;
-               vma->vm_ops = &ipath_vm_ops;
-               vma->vm_private_data = ip;
-               ipath_vma_open(vma);
-               goto done;
-       }
-       spin_unlock_irq(&dev->pending_lock);
-done:
-       return ret;
-}
-
-/*
- * Allocate information for ipath_mmap
- */
-struct ipath_mmap_info *ipath_create_mmap_info(struct ipath_ibdev *dev,
-                                              u32 size,
-                                              struct ib_ucontext *context,
-                                              void *obj)
-{
-       struct ipath_mmap_info *ip;
-
-       ip = kmalloc(sizeof *ip, GFP_KERNEL);
-       if (!ip)
-               goto bail;
-
-       size = PAGE_ALIGN(size);
-
-       spin_lock_irq(&dev->mmap_offset_lock);
-       if (dev->mmap_offset == 0)
-               dev->mmap_offset = PAGE_SIZE;
-       ip->offset = dev->mmap_offset;
-       dev->mmap_offset += size;
-       spin_unlock_irq(&dev->mmap_offset_lock);
-
-       INIT_LIST_HEAD(&ip->pending_mmaps);
-       ip->size = size;
-       ip->context = context;
-       ip->obj = obj;
-       kref_init(&ip->ref);
-
-bail:
-       return ip;
-}
-
-void ipath_update_mmap_info(struct ipath_ibdev *dev,
-                           struct ipath_mmap_info *ip,
-                           u32 size, void *obj)
-{
-       size = PAGE_ALIGN(size);
-
-       spin_lock_irq(&dev->mmap_offset_lock);
-       if (dev->mmap_offset == 0)
-               dev->mmap_offset = PAGE_SIZE;
-       ip->offset = dev->mmap_offset;
-       dev->mmap_offset += size;
-       spin_unlock_irq(&dev->mmap_offset_lock);
-
-       ip->size = size;
-       ip->obj = obj;
-}
diff --git a/drivers/infiniband/hw/ipath/ipath_mr.c b/drivers/infiniband/hw/ipath/ipath_mr.c
deleted file mode 100644 (file)
index c7278f6..0000000
+++ /dev/null
@@ -1,425 +0,0 @@
-/*
- * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
- * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/slab.h>
-
-#include <rdma/ib_umem.h>
-#include <rdma/ib_pack.h>
-#include <rdma/ib_smi.h>
-
-#include "ipath_verbs.h"
-
-/* Fast memory region */
-struct ipath_fmr {
-       struct ib_fmr ibfmr;
-       u8 page_shift;
-       struct ipath_mregion mr;        /* must be last */
-};
-
-static inline struct ipath_fmr *to_ifmr(struct ib_fmr *ibfmr)
-{
-       return container_of(ibfmr, struct ipath_fmr, ibfmr);
-}
-
-/**
- * ipath_get_dma_mr - get a DMA memory region
- * @pd: protection domain for this memory region
- * @acc: access flags
- *
- * Returns the memory region on success, otherwise returns an errno.
- * Note that all DMA addresses should be created via the
- * struct ib_dma_mapping_ops functions (see ipath_dma.c).
- */
-struct ib_mr *ipath_get_dma_mr(struct ib_pd *pd, int acc)
-{
-       struct ipath_mr *mr;
-       struct ib_mr *ret;
-
-       mr = kzalloc(sizeof *mr, GFP_KERNEL);
-       if (!mr) {
-               ret = ERR_PTR(-ENOMEM);
-               goto bail;
-       }
-
-       mr->mr.access_flags = acc;
-       ret = &mr->ibmr;
-
-bail:
-       return ret;
-}
-
-static struct ipath_mr *alloc_mr(int count,
-                                struct ipath_lkey_table *lk_table)
-{
-       struct ipath_mr *mr;
-       int m, i = 0;
-
-       /* Allocate struct plus pointers to first level page tables. */
-       m = (count + IPATH_SEGSZ - 1) / IPATH_SEGSZ;
-       mr = kmalloc(sizeof *mr + m * sizeof mr->mr.map[0], GFP_KERNEL);
-       if (!mr)
-               goto done;
-
-       /* Allocate first level page tables. */
-       for (; i < m; i++) {
-               mr->mr.map[i] = kmalloc(sizeof *mr->mr.map[0], GFP_KERNEL);
-               if (!mr->mr.map[i])
-                       goto bail;
-       }
-       mr->mr.mapsz = m;
-
-       /*
-        * ib_reg_phys_mr() will initialize mr->ibmr except for
-        * lkey and rkey.
-        */
-       if (!ipath_alloc_lkey(lk_table, &mr->mr))
-               goto bail;
-       mr->ibmr.rkey = mr->ibmr.lkey = mr->mr.lkey;
-
-       goto done;
-
-bail:
-       while (i) {
-               i--;
-               kfree(mr->mr.map[i]);
-       }
-       kfree(mr);
-       mr = NULL;
-
-done:
-       return mr;
-}
-
-/**
- * ipath_reg_phys_mr - register a physical memory region
- * @pd: protection domain for this memory region
- * @buffer_list: pointer to the list of physical buffers to register
- * @num_phys_buf: the number of physical buffers to register
- * @iova_start: the starting address passed over IB which maps to this MR
- *
- * Returns the memory region on success, otherwise returns an errno.
- */
-struct ib_mr *ipath_reg_phys_mr(struct ib_pd *pd,
-                               struct ib_phys_buf *buffer_list,
-                               int num_phys_buf, int acc, u64 *iova_start)
-{
-       struct ipath_mr *mr;
-       int n, m, i;
-       struct ib_mr *ret;
-
-       mr = alloc_mr(num_phys_buf, &to_idev(pd->device)->lk_table);
-       if (mr == NULL) {
-               ret = ERR_PTR(-ENOMEM);
-               goto bail;
-       }
-
-       mr->mr.pd = pd;
-       mr->mr.user_base = *iova_start;
-       mr->mr.iova = *iova_start;
-       mr->mr.length = 0;
-       mr->mr.offset = 0;
-       mr->mr.access_flags = acc;
-       mr->mr.max_segs = num_phys_buf;
-       mr->umem = NULL;
-
-       m = 0;
-       n = 0;
-       for (i = 0; i < num_phys_buf; i++) {
-               mr->mr.map[m]->segs[n].vaddr = (void *) buffer_list[i].addr;
-               mr->mr.map[m]->segs[n].length = buffer_list[i].size;
-               mr->mr.length += buffer_list[i].size;
-               n++;
-               if (n == IPATH_SEGSZ) {
-                       m++;
-                       n = 0;
-               }
-       }
-
-       ret = &mr->ibmr;
-
-bail:
-       return ret;
-}
-
-/**
- * ipath_reg_user_mr - register a userspace memory region
- * @pd: protection domain for this memory region
- * @start: starting userspace address
- * @length: length of region to register
- * @virt_addr: virtual address to use (from HCA's point of view)
- * @mr_access_flags: access flags for this memory region
- * @udata: unused by the InfiniPath driver
- *
- * Returns the memory region on success, otherwise returns an errno.
- */
-struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
-                               u64 virt_addr, int mr_access_flags,
-                               struct ib_udata *udata)
-{
-       struct ipath_mr *mr;
-       struct ib_umem *umem;
-       int n, m, entry;
-       struct scatterlist *sg;
-       struct ib_mr *ret;
-
-       if (length == 0) {
-               ret = ERR_PTR(-EINVAL);
-               goto bail;
-       }
-
-       umem = ib_umem_get(pd->uobject->context, start, length,
-                          mr_access_flags, 0);
-       if (IS_ERR(umem))
-               return (void *) umem;
-
-       n = umem->nmap;
-       mr = alloc_mr(n, &to_idev(pd->device)->lk_table);
-       if (!mr) {
-               ret = ERR_PTR(-ENOMEM);
-               ib_umem_release(umem);
-               goto bail;
-       }
-
-       mr->mr.pd = pd;
-       mr->mr.user_base = start;
-       mr->mr.iova = virt_addr;
-       mr->mr.length = length;
-       mr->mr.offset = ib_umem_offset(umem);
-       mr->mr.access_flags = mr_access_flags;
-       mr->mr.max_segs = n;
-       mr->umem = umem;
-
-       m = 0;
-       n = 0;
-       for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
-               void *vaddr;
-
-               vaddr = page_address(sg_page(sg));
-               if (!vaddr) {
-                       ret = ERR_PTR(-EINVAL);
-                       goto bail;
-               }
-               mr->mr.map[m]->segs[n].vaddr = vaddr;
-               mr->mr.map[m]->segs[n].length = umem->page_size;
-               n++;
-               if (n == IPATH_SEGSZ) {
-                       m++;
-                       n = 0;
-               }
-       }
-       ret = &mr->ibmr;
-
-bail:
-       return ret;
-}
-
-/**
- * ipath_dereg_mr - unregister and free a memory region
- * @ibmr: the memory region to free
- *
- * Returns 0 on success.
- *
- * Note that this is called to free MRs created by ipath_get_dma_mr()
- * or ipath_reg_user_mr().
- */
-int ipath_dereg_mr(struct ib_mr *ibmr)
-{
-       struct ipath_mr *mr = to_imr(ibmr);
-       int i;
-
-       ipath_free_lkey(&to_idev(ibmr->device)->lk_table, ibmr->lkey);
-       i = mr->mr.mapsz;
-       while (i) {
-               i--;
-               kfree(mr->mr.map[i]);
-       }
-
-       if (mr->umem)
-               ib_umem_release(mr->umem);
-
-       kfree(mr);
-       return 0;
-}
-
-/**
- * ipath_alloc_fmr - allocate a fast memory region
- * @pd: the protection domain for this memory region
- * @mr_access_flags: access flags for this memory region
- * @fmr_attr: fast memory region attributes
- *
- * Returns the memory region on success, otherwise returns an errno.
- */
-struct ib_fmr *ipath_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
-                              struct ib_fmr_attr *fmr_attr)
-{
-       struct ipath_fmr *fmr;
-       int m, i = 0;
-       struct ib_fmr *ret;
-
-       /* Allocate struct plus pointers to first level page tables. */
-       m = (fmr_attr->max_pages + IPATH_SEGSZ - 1) / IPATH_SEGSZ;
-       fmr = kmalloc(sizeof *fmr + m * sizeof fmr->mr.map[0], GFP_KERNEL);
-       if (!fmr)
-               goto bail;
-
-       /* Allocate first level page tables. */
-       for (; i < m; i++) {
-               fmr->mr.map[i] = kmalloc(sizeof *fmr->mr.map[0],
-                                        GFP_KERNEL);
-               if (!fmr->mr.map[i])
-                       goto bail;
-       }
-       fmr->mr.mapsz = m;
-
-       /*
-        * ib_alloc_fmr() will initialize fmr->ibfmr except for lkey &
-        * rkey.
-        */
-       if (!ipath_alloc_lkey(&to_idev(pd->device)->lk_table, &fmr->mr))
-               goto bail;
-       fmr->ibfmr.rkey = fmr->ibfmr.lkey = fmr->mr.lkey;
-       /*
-        * Resources are allocated but no valid mapping (RKEY can't be
-        * used).
-        */
-       fmr->mr.pd = pd;
-       fmr->mr.user_base = 0;
-       fmr->mr.iova = 0;
-       fmr->mr.length = 0;
-       fmr->mr.offset = 0;
-       fmr->mr.access_flags = mr_access_flags;
-       fmr->mr.max_segs = fmr_attr->max_pages;
-       fmr->page_shift = fmr_attr->page_shift;
-
-       ret = &fmr->ibfmr;
-       goto done;
-
-bail:
-       while (i)
-               kfree(fmr->mr.map[--i]);
-       kfree(fmr);
-       ret = ERR_PTR(-ENOMEM);
-
-done:
-       return ret;
-}
-
-/**
- * ipath_map_phys_fmr - set up a fast memory region
- * @ibfmr: the fast memory region to set up
- * @page_list: the list of pages to associate with the fast memory region
- * @list_len: the number of pages to associate with the fast memory region
- * @iova: the virtual address of the start of the fast memory region
- *
- * This may be called from interrupt context.
- */
-
-int ipath_map_phys_fmr(struct ib_fmr *ibfmr, u64 * page_list,
-                      int list_len, u64 iova)
-{
-       struct ipath_fmr *fmr = to_ifmr(ibfmr);
-       struct ipath_lkey_table *rkt;
-       unsigned long flags;
-       int m, n, i;
-       u32 ps;
-       int ret;
-
-       if (list_len > fmr->mr.max_segs) {
-               ret = -EINVAL;
-               goto bail;
-       }
-       rkt = &to_idev(ibfmr->device)->lk_table;
-       spin_lock_irqsave(&rkt->lock, flags);
-       fmr->mr.user_base = iova;
-       fmr->mr.iova = iova;
-       ps = 1 << fmr->page_shift;
-       fmr->mr.length = list_len * ps;
-       m = 0;
-       n = 0;
-       for (i = 0; i < list_len; i++) {
-               fmr->mr.map[m]->segs[n].vaddr = (void *) page_list[i];
-               fmr->mr.map[m]->segs[n].length = ps;
-               if (++n == IPATH_SEGSZ) {
-                       m++;
-                       n = 0;
-               }
-       }
-       spin_unlock_irqrestore(&rkt->lock, flags);
-       ret = 0;
-
-bail:
-       return ret;
-}
-
-/**
- * ipath_unmap_fmr - unmap fast memory regions
- * @fmr_list: the list of fast memory regions to unmap
- *
- * Returns 0 on success.
- */
-int ipath_unmap_fmr(struct list_head *fmr_list)
-{
-       struct ipath_fmr *fmr;
-       struct ipath_lkey_table *rkt;
-       unsigned long flags;
-
-       list_for_each_entry(fmr, fmr_list, ibfmr.list) {
-               rkt = &to_idev(fmr->ibfmr.device)->lk_table;
-               spin_lock_irqsave(&rkt->lock, flags);
-               fmr->mr.user_base = 0;
-               fmr->mr.iova = 0;
-               fmr->mr.length = 0;
-               spin_unlock_irqrestore(&rkt->lock, flags);
-       }
-       return 0;
-}
-
-/**
- * ipath_dealloc_fmr - deallocate a fast memory region
- * @ibfmr: the fast memory region to deallocate
- *
- * Returns 0 on success.
- */
-int ipath_dealloc_fmr(struct ib_fmr *ibfmr)
-{
-       struct ipath_fmr *fmr = to_ifmr(ibfmr);
-       int i;
-
-       ipath_free_lkey(&to_idev(ibfmr->device)->lk_table, ibfmr->lkey);
-       i = fmr->mr.mapsz;
-       while (i)
-               kfree(fmr->mr.map[--i]);
-       kfree(fmr);
-       return 0;
-}
diff --git a/drivers/infiniband/hw/ipath/ipath_qp.c b/drivers/infiniband/hw/ipath/ipath_qp.c
deleted file mode 100644 (file)
index face876..0000000
+++ /dev/null
@@ -1,1080 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/err.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-
-#include "ipath_verbs.h"
-#include "ipath_kernel.h"
-
-#define BITS_PER_PAGE          (PAGE_SIZE*BITS_PER_BYTE)
-#define BITS_PER_PAGE_MASK     (BITS_PER_PAGE-1)
-#define mk_qpn(qpt, map, off)  (((map) - (qpt)->map) * BITS_PER_PAGE + \
-                                (off))
-#define find_next_offset(map, off) find_next_zero_bit((map)->page, \
-                                                     BITS_PER_PAGE, off)
-
-/*
- * Convert the AETH credit code into the number of credits.
- */
-static u32 credit_table[31] = {
-       0,                      /* 0 */
-       1,                      /* 1 */
-       2,                      /* 2 */
-       3,                      /* 3 */
-       4,                      /* 4 */
-       6,                      /* 5 */
-       8,                      /* 6 */
-       12,                     /* 7 */
-       16,                     /* 8 */
-       24,                     /* 9 */
-       32,                     /* A */
-       48,                     /* B */
-       64,                     /* C */
-       96,                     /* D */
-       128,                    /* E */
-       192,                    /* F */
-       256,                    /* 10 */
-       384,                    /* 11 */
-       512,                    /* 12 */
-       768,                    /* 13 */
-       1024,                   /* 14 */
-       1536,                   /* 15 */
-       2048,                   /* 16 */
-       3072,                   /* 17 */
-       4096,                   /* 18 */
-       6144,                   /* 19 */
-       8192,                   /* 1A */
-       12288,                  /* 1B */
-       16384,                  /* 1C */
-       24576,                  /* 1D */
-       32768                   /* 1E */
-};
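
The table above is the decode side of the 5-bit AETH credit code: codes 0-3 stand for themselves, and from code 4 upward each pair of codes alternates between 4<<k and 6<<k, so the representable credit count roughly doubles every two codes. A minimal sketch that reproduces the same values from that closed form (aeth_credit_value() is illustrative only, not a driver function):

	/* Regenerates credit_table[] above from the pattern it encodes. */
	static u32 aeth_credit_value(u32 code)
	{
		if (code < 4)
			return code;                    /* 0..3 encode themselves */
		if (code & 1)
			return 6u << ((code - 5) / 2);  /* 5, 7, 9, ... -> 6, 12, 24, ... */
		return 4u << ((code - 4) / 2);          /* 4, 6, 8, ... -> 4, 8, 16, ... */
	}
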
-
-
-static void get_map_page(struct ipath_qp_table *qpt, struct qpn_map *map)
-{
-       unsigned long page = get_zeroed_page(GFP_KERNEL);
-       unsigned long flags;
-
-       /*
-        * Free the page if someone raced with us installing it.
-        */
-
-       spin_lock_irqsave(&qpt->lock, flags);
-       if (map->page)
-               free_page(page);
-       else
-               map->page = (void *)page;
-       spin_unlock_irqrestore(&qpt->lock, flags);
-}
-
-
-static int alloc_qpn(struct ipath_qp_table *qpt, enum ib_qp_type type)
-{
-       u32 i, offset, max_scan, qpn;
-       struct qpn_map *map;
-       u32 ret = -1;
-
-       if (type == IB_QPT_SMI)
-               ret = 0;
-       else if (type == IB_QPT_GSI)
-               ret = 1;
-
-       if (ret != -1) {
-               map = &qpt->map[0];
-               if (unlikely(!map->page)) {
-                       get_map_page(qpt, map);
-                       if (unlikely(!map->page)) {
-                               ret = -ENOMEM;
-                               goto bail;
-                       }
-               }
-               if (!test_and_set_bit(ret, map->page))
-                       atomic_dec(&map->n_free);
-               else
-                       ret = -EBUSY;
-               goto bail;
-       }
-
-       qpn = qpt->last + 1;
-       if (qpn >= QPN_MAX)
-               qpn = 2;
-       offset = qpn & BITS_PER_PAGE_MASK;
-       map = &qpt->map[qpn / BITS_PER_PAGE];
-       max_scan = qpt->nmaps - !offset;
-       for (i = 0;;) {
-               if (unlikely(!map->page)) {
-                       get_map_page(qpt, map);
-                       if (unlikely(!map->page))
-                               break;
-               }
-               if (likely(atomic_read(&map->n_free))) {
-                       do {
-                               if (!test_and_set_bit(offset, map->page)) {
-                                       atomic_dec(&map->n_free);
-                                       qpt->last = qpn;
-                                       ret = qpn;
-                                       goto bail;
-                               }
-                               offset = find_next_offset(map, offset);
-                               qpn = mk_qpn(qpt, map, offset);
-                               /*
-                                * This test differs from alloc_pidmap().
-                                * If find_next_offset() does find a zero
-                                * bit, we don't need to check for QPN
-                                * wrapping around past our starting QPN.
-                                * We just need to be sure we don't loop
-                                * forever.
-                                */
-                       } while (offset < BITS_PER_PAGE && qpn < QPN_MAX);
-               }
-               /*
-                * In order to keep the number of pages allocated to a
-                * minimum, we scan all the existing pages before increasing
-                * the size of the bitmap table.
-                */
-               if (++i > max_scan) {
-                       if (qpt->nmaps == QPNMAP_ENTRIES)
-                               break;
-                       map = &qpt->map[qpt->nmaps++];
-                       offset = 0;
-               } else if (map < &qpt->map[qpt->nmaps]) {
-                       ++map;
-                       offset = 0;
-               } else {
-                       map = &qpt->map[0];
-                       offset = 2;
-               }
-               qpn = mk_qpn(qpt, map, offset);
-       }
-
-       ret = -ENOMEM;
-
-bail:
-       return ret;
-}
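
For reference, the bitmap bookkeeping above treats the QPN space as an array of pages of BITS_PER_PAGE bits each: the page index is qpn / BITS_PER_PAGE, the bit offset is qpn & BITS_PER_PAGE_MASK, and mk_qpn() is the inverse of that split. A minimal sketch of the round trip, assuming the macros defined at the top of this file (qpn_split_example() is illustrative, not part of the driver):

	static void qpn_split_example(struct ipath_qp_table *qpt, u32 qpn)
	{
		struct qpn_map *map = &qpt->map[qpn / BITS_PER_PAGE];
		u32 offset = qpn & BITS_PER_PAGE_MASK;

		/* mk_qpn() recombines the page index and bit offset into the QPN. */
		WARN_ON(mk_qpn(qpt, map, offset) != qpn);
	}
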
-
-static void free_qpn(struct ipath_qp_table *qpt, u32 qpn)
-{
-       struct qpn_map *map;
-
-       map = qpt->map + qpn / BITS_PER_PAGE;
-       if (map->page)
-               clear_bit(qpn & BITS_PER_PAGE_MASK, map->page);
-       atomic_inc(&map->n_free);
-}
-
-/**
- * ipath_alloc_qpn - allocate a QP number
- * @qpt: the QP table
- * @qp: the QP
- * @type: the QP type (IB_QPT_SMI and IB_QPT_GSI are special)
- *
- * Allocate the next available QPN and put the QP into the hash table.
- * The hash table holds a reference to the QP.
- */
-static int ipath_alloc_qpn(struct ipath_qp_table *qpt, struct ipath_qp *qp,
-                          enum ib_qp_type type)
-{
-       unsigned long flags;
-       int ret;
-
-       ret = alloc_qpn(qpt, type);
-       if (ret < 0)
-               goto bail;
-       qp->ibqp.qp_num = ret;
-
-       /* Add the QP to the hash table. */
-       spin_lock_irqsave(&qpt->lock, flags);
-
-       ret %= qpt->max;
-       qp->next = qpt->table[ret];
-       qpt->table[ret] = qp;
-       atomic_inc(&qp->refcount);
-
-       spin_unlock_irqrestore(&qpt->lock, flags);
-       ret = 0;
-
-bail:
-       return ret;
-}
-
-/**
- * ipath_free_qp - remove a QP from the QP table
- * @qpt: the QP table
- * @qp: the QP to remove
- *
- * Remove the QP from the table so it can't be found asynchronously by
- * the receive interrupt routine.
- */
-static void ipath_free_qp(struct ipath_qp_table *qpt, struct ipath_qp *qp)
-{
-       struct ipath_qp *q, **qpp;
-       unsigned long flags;
-
-       spin_lock_irqsave(&qpt->lock, flags);
-
-       /* Remove QP from the hash table. */
-       qpp = &qpt->table[qp->ibqp.qp_num % qpt->max];
-       for (; (q = *qpp) != NULL; qpp = &q->next) {
-               if (q == qp) {
-                       *qpp = qp->next;
-                       qp->next = NULL;
-                       atomic_dec(&qp->refcount);
-                       break;
-               }
-       }
-
-       spin_unlock_irqrestore(&qpt->lock, flags);
-}
-
-/**
- * ipath_free_all_qps - check for QPs still in use
- * @qpt: the QP table to empty
- *
- * There should not be any QPs still in use.
- * Free memory for table.
- */
-unsigned ipath_free_all_qps(struct ipath_qp_table *qpt)
-{
-       unsigned long flags;
-       struct ipath_qp *qp;
-       u32 n, qp_inuse = 0;
-
-       spin_lock_irqsave(&qpt->lock, flags);
-       for (n = 0; n < qpt->max; n++) {
-               qp = qpt->table[n];
-               qpt->table[n] = NULL;
-
-               for (; qp; qp = qp->next)
-                       qp_inuse++;
-       }
-       spin_unlock_irqrestore(&qpt->lock, flags);
-
-       for (n = 0; n < ARRAY_SIZE(qpt->map); n++)
-               if (qpt->map[n].page)
-                       free_page((unsigned long) qpt->map[n].page);
-       return qp_inuse;
-}
-
-/**
- * ipath_lookup_qpn - return the QP with the given QPN
- * @qpt: the QP table
- * @qpn: the QP number to look up
- *
- * The caller is responsible for decrementing the QP reference count
- * when done.
- */
-struct ipath_qp *ipath_lookup_qpn(struct ipath_qp_table *qpt, u32 qpn)
-{
-       unsigned long flags;
-       struct ipath_qp *qp;
-
-       spin_lock_irqsave(&qpt->lock, flags);
-
-       for (qp = qpt->table[qpn % qpt->max]; qp; qp = qp->next) {
-               if (qp->ibqp.qp_num == qpn) {
-                       atomic_inc(&qp->refcount);
-                       break;
-               }
-       }
-
-       spin_unlock_irqrestore(&qpt->lock, flags);
-       return qp;
-}
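
The lookup takes a reference under qpt->lock, so a caller must drop it once finished with the QP; ipath_destroy_qp() below waits on qp->wait for the count to reach zero. A hedged sketch of the expected caller pattern (error handling and the actual use of the QP are elided):

	struct ipath_qp *qp = ipath_lookup_qpn(qpt, qpn);

	if (qp) {
		/* ... use qp ... */
		if (atomic_dec_and_test(&qp->refcount))
			wake_up(&qp->wait);
	}
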
-
-/**
- * ipath_reset_qp - initialize the QP state to the reset state
- * @qp: the QP to reset
- * @type: the QP type
- */
-static void ipath_reset_qp(struct ipath_qp *qp, enum ib_qp_type type)
-{
-       qp->remote_qpn = 0;
-       qp->qkey = 0;
-       qp->qp_access_flags = 0;
-       atomic_set(&qp->s_dma_busy, 0);
-       qp->s_flags &= IPATH_S_SIGNAL_REQ_WR;
-       qp->s_hdrwords = 0;
-       qp->s_wqe = NULL;
-       qp->s_pkt_delay = 0;
-       qp->s_draining = 0;
-       qp->s_psn = 0;
-       qp->r_psn = 0;
-       qp->r_msn = 0;
-       if (type == IB_QPT_RC) {
-               qp->s_state = IB_OPCODE_RC_SEND_LAST;
-               qp->r_state = IB_OPCODE_RC_SEND_LAST;
-       } else {
-               qp->s_state = IB_OPCODE_UC_SEND_LAST;
-               qp->r_state = IB_OPCODE_UC_SEND_LAST;
-       }
-       qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE;
-       qp->r_nak_state = 0;
-       qp->r_aflags = 0;
-       qp->r_flags = 0;
-       qp->s_rnr_timeout = 0;
-       qp->s_head = 0;
-       qp->s_tail = 0;
-       qp->s_cur = 0;
-       qp->s_last = 0;
-       qp->s_ssn = 1;
-       qp->s_lsn = 0;
-       memset(qp->s_ack_queue, 0, sizeof(qp->s_ack_queue));
-       qp->r_head_ack_queue = 0;
-       qp->s_tail_ack_queue = 0;
-       qp->s_num_rd_atomic = 0;
-       if (qp->r_rq.wq) {
-               qp->r_rq.wq->head = 0;
-               qp->r_rq.wq->tail = 0;
-       }
-}
-
-/**
- * ipath_error_qp - put a QP into the error state
- * @qp: the QP to put into the error state
- * @err: the receive completion error to signal if a RWQE is active
- *
- * Flushes both send and receive work queues.
- * Returns true if last WQE event should be generated.
- * The QP s_lock should be held and interrupts disabled.
- * If we are already in error state, just return.
- */
-
-int ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err)
-{
-       struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
-       struct ib_wc wc;
-       int ret = 0;
-
-       if (qp->state == IB_QPS_ERR)
-               goto bail;
-
-       qp->state = IB_QPS_ERR;
-
-       spin_lock(&dev->pending_lock);
-       if (!list_empty(&qp->timerwait))
-               list_del_init(&qp->timerwait);
-       if (!list_empty(&qp->piowait))
-               list_del_init(&qp->piowait);
-       spin_unlock(&dev->pending_lock);
-
-       /* Schedule the sending tasklet to drain the send work queue. */
-       if (qp->s_last != qp->s_head)
-               ipath_schedule_send(qp);
-
-       memset(&wc, 0, sizeof(wc));
-       wc.qp = &qp->ibqp;
-       wc.opcode = IB_WC_RECV;
-
-       if (test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags)) {
-               wc.wr_id = qp->r_wr_id;
-               wc.status = err;
-               ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
-       }
-       wc.status = IB_WC_WR_FLUSH_ERR;
-
-       if (qp->r_rq.wq) {
-               struct ipath_rwq *wq;
-               u32 head;
-               u32 tail;
-
-               spin_lock(&qp->r_rq.lock);
-
-               /* sanity check pointers before trusting them */
-               wq = qp->r_rq.wq;
-               head = wq->head;
-               if (head >= qp->r_rq.size)
-                       head = 0;
-               tail = wq->tail;
-               if (tail >= qp->r_rq.size)
-                       tail = 0;
-               while (tail != head) {
-                       wc.wr_id = get_rwqe_ptr(&qp->r_rq, tail)->wr_id;
-                       if (++tail >= qp->r_rq.size)
-                               tail = 0;
-                       ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
-               }
-               wq->tail = tail;
-
-               spin_unlock(&qp->r_rq.lock);
-       } else if (qp->ibqp.event_handler)
-               ret = 1;
-
-bail:
-       return ret;
-}
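
The flush loop above walks the receive ring from tail to head, posting one IB_WC_WR_FLUSH_ERR completion per outstanding RWQE, so the number of completions generated equals the ring occupancy. A small illustrative helper for that wrap-around count (not part of the driver), given the sanitized head/tail indices and the ring size r_rq.size:

	static u32 rwqe_outstanding(u32 head, u32 tail, u32 size)
	{
		return head >= tail ? head - tail : head + size - tail;
	}
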
-
-/**
- * ipath_modify_qp - modify the attributes of a queue pair
- * @ibqp: the queue pair whose attributes we're modifying
- * @attr: the new attributes
- * @attr_mask: the mask of attributes to modify
- * @udata: user data for ipathverbs.so
- *
- * Returns 0 on success, otherwise returns an errno.
- */
-int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
-                   int attr_mask, struct ib_udata *udata)
-{
-       struct ipath_ibdev *dev = to_idev(ibqp->device);
-       struct ipath_qp *qp = to_iqp(ibqp);
-       enum ib_qp_state cur_state, new_state;
-       int lastwqe = 0;
-       int ret;
-
-       spin_lock_irq(&qp->s_lock);
-
-       cur_state = attr_mask & IB_QP_CUR_STATE ?
-               attr->cur_qp_state : qp->state;
-       new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
-
-       if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type,
-                               attr_mask, IB_LINK_LAYER_UNSPECIFIED))
-               goto inval;
-
-       if (attr_mask & IB_QP_AV) {
-               if (attr->ah_attr.dlid == 0 ||
-                   attr->ah_attr.dlid >= IPATH_MULTICAST_LID_BASE)
-                       goto inval;
-
-               if ((attr->ah_attr.ah_flags & IB_AH_GRH) &&
-                   (attr->ah_attr.grh.sgid_index > 1))
-                       goto inval;
-       }
-
-       if (attr_mask & IB_QP_PKEY_INDEX)
-               if (attr->pkey_index >= ipath_get_npkeys(dev->dd))
-                       goto inval;
-
-       if (attr_mask & IB_QP_MIN_RNR_TIMER)
-               if (attr->min_rnr_timer > 31)
-                       goto inval;
-
-       if (attr_mask & IB_QP_PORT)
-               if (attr->port_num == 0 ||
-                   attr->port_num > ibqp->device->phys_port_cnt)
-                       goto inval;
-
-       /*
-        * don't allow invalid Path MTU values, or values greater than 2048
-        * unless we are configured for a 4KB MTU
-        */
-       if ((attr_mask & IB_QP_PATH_MTU) &&
-               (ib_mtu_enum_to_int(attr->path_mtu) == -1 ||
-               (attr->path_mtu > IB_MTU_2048 && !ipath_mtu4096)))
-               goto inval;
-
-       if (attr_mask & IB_QP_PATH_MIG_STATE)
-               if (attr->path_mig_state != IB_MIG_MIGRATED &&
-                   attr->path_mig_state != IB_MIG_REARM)
-                       goto inval;
-
-       if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
-               if (attr->max_dest_rd_atomic > IPATH_MAX_RDMA_ATOMIC)
-                       goto inval;
-
-       switch (new_state) {
-       case IB_QPS_RESET:
-               if (qp->state != IB_QPS_RESET) {
-                       qp->state = IB_QPS_RESET;
-                       spin_lock(&dev->pending_lock);
-                       if (!list_empty(&qp->timerwait))
-                               list_del_init(&qp->timerwait);
-                       if (!list_empty(&qp->piowait))
-                               list_del_init(&qp->piowait);
-                       spin_unlock(&dev->pending_lock);
-                       qp->s_flags &= ~IPATH_S_ANY_WAIT;
-                       spin_unlock_irq(&qp->s_lock);
-                       /* Stop the sending tasklet */
-                       tasklet_kill(&qp->s_task);
-                       wait_event(qp->wait_dma, !atomic_read(&qp->s_dma_busy));
-                       spin_lock_irq(&qp->s_lock);
-               }
-               ipath_reset_qp(qp, ibqp->qp_type);
-               break;
-
-       case IB_QPS_SQD:
-               qp->s_draining = qp->s_last != qp->s_cur;
-               qp->state = new_state;
-               break;
-
-       case IB_QPS_SQE:
-               if (qp->ibqp.qp_type == IB_QPT_RC)
-                       goto inval;
-               qp->state = new_state;
-               break;
-
-       case IB_QPS_ERR:
-               lastwqe = ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
-               break;
-
-       default:
-               qp->state = new_state;
-               break;
-       }
-
-       if (attr_mask & IB_QP_PKEY_INDEX)
-               qp->s_pkey_index = attr->pkey_index;
-
-       if (attr_mask & IB_QP_DEST_QPN)
-               qp->remote_qpn = attr->dest_qp_num;
-
-       if (attr_mask & IB_QP_SQ_PSN) {
-               qp->s_psn = qp->s_next_psn = attr->sq_psn;
-               qp->s_last_psn = qp->s_next_psn - 1;
-       }
-
-       if (attr_mask & IB_QP_RQ_PSN)
-               qp->r_psn = attr->rq_psn;
-
-       if (attr_mask & IB_QP_ACCESS_FLAGS)
-               qp->qp_access_flags = attr->qp_access_flags;
-
-       if (attr_mask & IB_QP_AV) {
-               qp->remote_ah_attr = attr->ah_attr;
-               qp->s_dmult = ipath_ib_rate_to_mult(attr->ah_attr.static_rate);
-       }
-
-       if (attr_mask & IB_QP_PATH_MTU)
-               qp->path_mtu = attr->path_mtu;
-
-       if (attr_mask & IB_QP_RETRY_CNT)
-               qp->s_retry = qp->s_retry_cnt = attr->retry_cnt;
-
-       if (attr_mask & IB_QP_RNR_RETRY) {
-               qp->s_rnr_retry = attr->rnr_retry;
-               if (qp->s_rnr_retry > 7)
-                       qp->s_rnr_retry = 7;
-               qp->s_rnr_retry_cnt = qp->s_rnr_retry;
-       }
-
-       if (attr_mask & IB_QP_MIN_RNR_TIMER)
-               qp->r_min_rnr_timer = attr->min_rnr_timer;
-
-       if (attr_mask & IB_QP_TIMEOUT)
-               qp->timeout = attr->timeout;
-
-       if (attr_mask & IB_QP_QKEY)
-               qp->qkey = attr->qkey;
-
-       if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
-               qp->r_max_rd_atomic = attr->max_dest_rd_atomic;
-
-       if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC)
-               qp->s_max_rd_atomic = attr->max_rd_atomic;
-
-       spin_unlock_irq(&qp->s_lock);
-
-       if (lastwqe) {
-               struct ib_event ev;
-
-               ev.device = qp->ibqp.device;
-               ev.element.qp = &qp->ibqp;
-               ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
-               qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
-       }
-       ret = 0;
-       goto bail;
-
-inval:
-       spin_unlock_irq(&qp->s_lock);
-       ret = -EINVAL;
-
-bail:
-       return ret;
-}
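
ipath_modify_qp() is reached through the core ib_modify_qp() verb; for example, a consumer tearing down a connection moves its QP into the error state (which triggers the IB_QPS_ERR case above) with a call along these lines. This is a minimal consumer-side sketch, not ipath-specific code:

	#include <rdma/ib_verbs.h>

	static int move_qp_to_error(struct ib_qp *qp)
	{
		struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };

		return ib_modify_qp(qp, &attr, IB_QP_STATE);
	}
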
-
-int ipath_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
-                  int attr_mask, struct ib_qp_init_attr *init_attr)
-{
-       struct ipath_qp *qp = to_iqp(ibqp);
-
-       attr->qp_state = qp->state;
-       attr->cur_qp_state = attr->qp_state;
-       attr->path_mtu = qp->path_mtu;
-       attr->path_mig_state = 0;
-       attr->qkey = qp->qkey;
-       attr->rq_psn = qp->r_psn;
-       attr->sq_psn = qp->s_next_psn;
-       attr->dest_qp_num = qp->remote_qpn;
-       attr->qp_access_flags = qp->qp_access_flags;
-       attr->cap.max_send_wr = qp->s_size - 1;
-       attr->cap.max_recv_wr = qp->ibqp.srq ? 0 : qp->r_rq.size - 1;
-       attr->cap.max_send_sge = qp->s_max_sge;
-       attr->cap.max_recv_sge = qp->r_rq.max_sge;
-       attr->cap.max_inline_data = 0;
-       attr->ah_attr = qp->remote_ah_attr;
-       memset(&attr->alt_ah_attr, 0, sizeof(attr->alt_ah_attr));
-       attr->pkey_index = qp->s_pkey_index;
-       attr->alt_pkey_index = 0;
-       attr->en_sqd_async_notify = 0;
-       attr->sq_draining = qp->s_draining;
-       attr->max_rd_atomic = qp->s_max_rd_atomic;
-       attr->max_dest_rd_atomic = qp->r_max_rd_atomic;
-       attr->min_rnr_timer = qp->r_min_rnr_timer;
-       attr->port_num = 1;
-       attr->timeout = qp->timeout;
-       attr->retry_cnt = qp->s_retry_cnt;
-       attr->rnr_retry = qp->s_rnr_retry_cnt;
-       attr->alt_port_num = 0;
-       attr->alt_timeout = 0;
-
-       init_attr->event_handler = qp->ibqp.event_handler;
-       init_attr->qp_context = qp->ibqp.qp_context;
-       init_attr->send_cq = qp->ibqp.send_cq;
-       init_attr->recv_cq = qp->ibqp.recv_cq;
-       init_attr->srq = qp->ibqp.srq;
-       init_attr->cap = attr->cap;
-       if (qp->s_flags & IPATH_S_SIGNAL_REQ_WR)
-               init_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
-       else
-               init_attr->sq_sig_type = IB_SIGNAL_ALL_WR;
-       init_attr->qp_type = qp->ibqp.qp_type;
-       init_attr->port_num = 1;
-       return 0;
-}
-
-/**
- * ipath_compute_aeth - compute the AETH (syndrome + MSN)
- * @qp: the queue pair to compute the AETH for
- *
- * Returns the AETH.
- */
-__be32 ipath_compute_aeth(struct ipath_qp *qp)
-{
-       u32 aeth = qp->r_msn & IPATH_MSN_MASK;
-
-       if (qp->ibqp.srq) {
-               /*
-                * Shared receive queues don't generate credits.
-                * Set the credit field to the invalid value.
-                */
-               aeth |= IPATH_AETH_CREDIT_INVAL << IPATH_AETH_CREDIT_SHIFT;
-       } else {
-               u32 min, max, x;
-               u32 credits;
-               struct ipath_rwq *wq = qp->r_rq.wq;
-               u32 head;
-               u32 tail;
-
-               /* sanity check pointers before trusting them */
-               head = wq->head;
-               if (head >= qp->r_rq.size)
-                       head = 0;
-               tail = wq->tail;
-               if (tail >= qp->r_rq.size)
-                       tail = 0;
-               /*
-                * Compute the number of credits available (RWQEs).
-                * XXX Not holding the r_rq.lock here so there is a small
-                * chance that the pair of reads are not atomic.
-                */
-               credits = head - tail;
-               if ((int)credits < 0)
-                       credits += qp->r_rq.size;
-               /*
-                * Binary search the credit table to find the code to
-                * use.
-                */
-               min = 0;
-               max = 31;
-               for (;;) {
-                       x = (min + max) / 2;
-                       if (credit_table[x] == credits)
-                               break;
-                       if (credit_table[x] > credits)
-                               max = x;
-                       else if (min == x)
-                               break;
-                       else
-                               min = x;
-               }
-               aeth |= x << IPATH_AETH_CREDIT_SHIFT;
-       }
-       return cpu_to_be32(aeth);
-}
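
Worked example (illustrative): with wq->head == 5, wq->tail == 2 and r_rq.size == 8, credits is 3 and the binary search stops with x == 3 since credit_table[3] == 3, so the returned AETH carries credit code 3 alongside the 24-bit MSN. In general the loop picks the largest code whose table value does not exceed the available credit count, which is what ipath_get_credit() at the end of this file decodes on the sending side.
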
-
-/**
- * ipath_create_qp - create a queue pair for a device
- * @ibpd: the protection domain whose device we create the queue pair for
- * @init_attr: the attributes of the queue pair
- * @udata: unused by InfiniPath
- *
- * Returns the queue pair on success, otherwise returns an errno.
- *
- * Called by the ib_create_qp() core verbs function.
- */
-struct ib_qp *ipath_create_qp(struct ib_pd *ibpd,
-                             struct ib_qp_init_attr *init_attr,
-                             struct ib_udata *udata)
-{
-       struct ipath_qp *qp;
-       int err;
-       struct ipath_swqe *swq = NULL;
-       struct ipath_ibdev *dev;
-       size_t sz;
-       size_t sg_list_sz;
-       struct ib_qp *ret;
-
-       if (init_attr->create_flags) {
-               ret = ERR_PTR(-EINVAL);
-               goto bail;
-       }
-
-       if (init_attr->cap.max_send_sge > ib_ipath_max_sges ||
-           init_attr->cap.max_send_wr > ib_ipath_max_qp_wrs) {
-               ret = ERR_PTR(-EINVAL);
-               goto bail;
-       }
-
-       /* Check receive queue parameters if no SRQ is specified. */
-       if (!init_attr->srq) {
-               if (init_attr->cap.max_recv_sge > ib_ipath_max_sges ||
-                   init_attr->cap.max_recv_wr > ib_ipath_max_qp_wrs) {
-                       ret = ERR_PTR(-EINVAL);
-                       goto bail;
-               }
-               if (init_attr->cap.max_send_sge +
-                   init_attr->cap.max_send_wr +
-                   init_attr->cap.max_recv_sge +
-                   init_attr->cap.max_recv_wr == 0) {
-                       ret = ERR_PTR(-EINVAL);
-                       goto bail;
-               }
-       }
-
-       switch (init_attr->qp_type) {
-       case IB_QPT_UC:
-       case IB_QPT_RC:
-       case IB_QPT_UD:
-       case IB_QPT_SMI:
-       case IB_QPT_GSI:
-               sz = sizeof(struct ipath_sge) *
-                       init_attr->cap.max_send_sge +
-                       sizeof(struct ipath_swqe);
-               swq = vmalloc((init_attr->cap.max_send_wr + 1) * sz);
-               if (swq == NULL) {
-                       ret = ERR_PTR(-ENOMEM);
-                       goto bail;
-               }
-               sz = sizeof(*qp);
-               sg_list_sz = 0;
-               if (init_attr->srq) {
-                       struct ipath_srq *srq = to_isrq(init_attr->srq);
-
-                       if (srq->rq.max_sge > 1)
-                               sg_list_sz = sizeof(*qp->r_sg_list) *
-                                       (srq->rq.max_sge - 1);
-               } else if (init_attr->cap.max_recv_sge > 1)
-                       sg_list_sz = sizeof(*qp->r_sg_list) *
-                               (init_attr->cap.max_recv_sge - 1);
-               qp = kmalloc(sz + sg_list_sz, GFP_KERNEL);
-               if (!qp) {
-                       ret = ERR_PTR(-ENOMEM);
-                       goto bail_swq;
-               }
-               if (sg_list_sz && (init_attr->qp_type == IB_QPT_UD ||
-                   init_attr->qp_type == IB_QPT_SMI ||
-                   init_attr->qp_type == IB_QPT_GSI)) {
-                       qp->r_ud_sg_list = kmalloc(sg_list_sz, GFP_KERNEL);
-                       if (!qp->r_ud_sg_list) {
-                               ret = ERR_PTR(-ENOMEM);
-                               goto bail_qp;
-                       }
-               } else
-                       qp->r_ud_sg_list = NULL;
-               if (init_attr->srq) {
-                       sz = 0;
-                       qp->r_rq.size = 0;
-                       qp->r_rq.max_sge = 0;
-                       qp->r_rq.wq = NULL;
-                       init_attr->cap.max_recv_wr = 0;
-                       init_attr->cap.max_recv_sge = 0;
-               } else {
-                       qp->r_rq.size = init_attr->cap.max_recv_wr + 1;
-                       qp->r_rq.max_sge = init_attr->cap.max_recv_sge;
-                       sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) +
-                               sizeof(struct ipath_rwqe);
-                       qp->r_rq.wq = vmalloc_user(sizeof(struct ipath_rwq) +
-                                             qp->r_rq.size * sz);
-                       if (!qp->r_rq.wq) {
-                               ret = ERR_PTR(-ENOMEM);
-                               goto bail_sg_list;
-                       }
-               }
-
-               /*
-                * ib_create_qp() will initialize qp->ibqp
-                * except for qp->ibqp.qp_num.
-                */
-               spin_lock_init(&qp->s_lock);
-               spin_lock_init(&qp->r_rq.lock);
-               atomic_set(&qp->refcount, 0);
-               init_waitqueue_head(&qp->wait);
-               init_waitqueue_head(&qp->wait_dma);
-               tasklet_init(&qp->s_task, ipath_do_send, (unsigned long)qp);
-               INIT_LIST_HEAD(&qp->piowait);
-               INIT_LIST_HEAD(&qp->timerwait);
-               qp->state = IB_QPS_RESET;
-               qp->s_wq = swq;
-               qp->s_size = init_attr->cap.max_send_wr + 1;
-               qp->s_max_sge = init_attr->cap.max_send_sge;
-               if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR)
-                       qp->s_flags = IPATH_S_SIGNAL_REQ_WR;
-               else
-                       qp->s_flags = 0;
-               dev = to_idev(ibpd->device);
-               err = ipath_alloc_qpn(&dev->qp_table, qp,
-                                     init_attr->qp_type);
-               if (err) {
-                       ret = ERR_PTR(err);
-                       vfree(qp->r_rq.wq);
-                       goto bail_sg_list;
-               }
-               qp->ip = NULL;
-               qp->s_tx = NULL;
-               ipath_reset_qp(qp, init_attr->qp_type);
-               break;
-
-       default:
-               /* Don't support raw QPs */
-               ret = ERR_PTR(-ENOSYS);
-               goto bail;
-       }
-
-       init_attr->cap.max_inline_data = 0;
-
-       /*
-        * Return the address of the RWQ as the offset to mmap.
-        * See ipath_mmap() for details.
-        */
-       if (udata && udata->outlen >= sizeof(__u64)) {
-               if (!qp->r_rq.wq) {
-                       __u64 offset = 0;
-
-                       err = ib_copy_to_udata(udata, &offset,
-                                              sizeof(offset));
-                       if (err) {
-                               ret = ERR_PTR(err);
-                               goto bail_ip;
-                       }
-               } else {
-                       u32 s = sizeof(struct ipath_rwq) +
-                               qp->r_rq.size * sz;
-
-                       qp->ip =
-                           ipath_create_mmap_info(dev, s,
-                                                  ibpd->uobject->context,
-                                                  qp->r_rq.wq);
-                       if (!qp->ip) {
-                               ret = ERR_PTR(-ENOMEM);
-                               goto bail_ip;
-                       }
-
-                       err = ib_copy_to_udata(udata, &(qp->ip->offset),
-                                              sizeof(qp->ip->offset));
-                       if (err) {
-                               ret = ERR_PTR(err);
-                               goto bail_ip;
-                       }
-               }
-       }
-
-       spin_lock(&dev->n_qps_lock);
-       if (dev->n_qps_allocated == ib_ipath_max_qps) {
-               spin_unlock(&dev->n_qps_lock);
-               ret = ERR_PTR(-ENOMEM);
-               goto bail_ip;
-       }
-
-       dev->n_qps_allocated++;
-       spin_unlock(&dev->n_qps_lock);
-
-       if (qp->ip) {
-               spin_lock_irq(&dev->pending_lock);
-               list_add(&qp->ip->pending_mmaps, &dev->pending_mmaps);
-               spin_unlock_irq(&dev->pending_lock);
-       }
-
-       ret = &qp->ibqp;
-       goto bail;
-
-bail_ip:
-       if (qp->ip)
-               kref_put(&qp->ip->ref, ipath_release_mmap_info);
-       else
-               vfree(qp->r_rq.wq);
-       ipath_free_qp(&dev->qp_table, qp);
-       free_qpn(&dev->qp_table, qp->ibqp.qp_num);
-bail_sg_list:
-       kfree(qp->r_ud_sg_list);
-bail_qp:
-       kfree(qp);
-bail_swq:
-       vfree(swq);
-bail:
-       return ret;
-}
-
-/**
- * ipath_destroy_qp - destroy a queue pair
- * @ibqp: the queue pair to destroy
- *
- * Returns 0 on success.
- *
- * Note that this can be called while the QP is actively sending or
- * receiving!
- */
-int ipath_destroy_qp(struct ib_qp *ibqp)
-{
-       struct ipath_qp *qp = to_iqp(ibqp);
-       struct ipath_ibdev *dev = to_idev(ibqp->device);
-
-       /* Make sure HW and driver activity is stopped. */
-       spin_lock_irq(&qp->s_lock);
-       if (qp->state != IB_QPS_RESET) {
-               qp->state = IB_QPS_RESET;
-               spin_lock(&dev->pending_lock);
-               if (!list_empty(&qp->timerwait))
-                       list_del_init(&qp->timerwait);
-               if (!list_empty(&qp->piowait))
-                       list_del_init(&qp->piowait);
-               spin_unlock(&dev->pending_lock);
-               qp->s_flags &= ~IPATH_S_ANY_WAIT;
-               spin_unlock_irq(&qp->s_lock);
-               /* Stop the sending tasklet */
-               tasklet_kill(&qp->s_task);
-               wait_event(qp->wait_dma, !atomic_read(&qp->s_dma_busy));
-       } else
-               spin_unlock_irq(&qp->s_lock);
-
-       ipath_free_qp(&dev->qp_table, qp);
-
-       if (qp->s_tx) {
-               atomic_dec(&qp->refcount);
-               if (qp->s_tx->txreq.flags & IPATH_SDMA_TXREQ_F_FREEBUF)
-                       kfree(qp->s_tx->txreq.map_addr);
-               spin_lock_irq(&dev->pending_lock);
-               list_add(&qp->s_tx->txreq.list, &dev->txreq_free);
-               spin_unlock_irq(&dev->pending_lock);
-               qp->s_tx = NULL;
-       }
-
-       wait_event(qp->wait, !atomic_read(&qp->refcount));
-
-       /* all user's cleaned up, mark it available */
-       free_qpn(&dev->qp_table, qp->ibqp.qp_num);
-       spin_lock(&dev->n_qps_lock);
-       dev->n_qps_allocated--;
-       spin_unlock(&dev->n_qps_lock);
-
-       if (qp->ip)
-               kref_put(&qp->ip->ref, ipath_release_mmap_info);
-       else
-               vfree(qp->r_rq.wq);
-       kfree(qp->r_ud_sg_list);
-       vfree(qp->s_wq);
-       kfree(qp);
-       return 0;
-}
-
-/**
- * ipath_init_qp_table - initialize the QP table for a device
- * @idev: the device whose QP table we're initializing
- * @size: the size of the QP table
- *
- * Returns 0 on success, otherwise returns an errno.
- */
-int ipath_init_qp_table(struct ipath_ibdev *idev, int size)
-{
-       int i;
-       int ret;
-
-       idev->qp_table.last = 1;        /* QPN 0 and 1 are special. */
-       idev->qp_table.max = size;
-       idev->qp_table.nmaps = 1;
-       idev->qp_table.table = kzalloc(size * sizeof(*idev->qp_table.table),
-                                      GFP_KERNEL);
-       if (idev->qp_table.table == NULL) {
-               ret = -ENOMEM;
-               goto bail;
-       }
-
-       for (i = 0; i < ARRAY_SIZE(idev->qp_table.map); i++) {
-               atomic_set(&idev->qp_table.map[i].n_free, BITS_PER_PAGE);
-               idev->qp_table.map[i].page = NULL;
-       }
-
-       ret = 0;
-
-bail:
-       return ret;
-}
-
-/**
- * ipath_get_credit - flush the send work queue of a QP
- * @qp: the qp whose send work queue to flush
- * @aeth: the Acknowledge Extended Transport Header
- *
- * The QP s_lock should be held.
- */
-void ipath_get_credit(struct ipath_qp *qp, u32 aeth)
-{
-       u32 credit = (aeth >> IPATH_AETH_CREDIT_SHIFT) & IPATH_AETH_CREDIT_MASK;
-
-       /*
-        * If the credit is invalid, we can send
-        * as many packets as we like.  Otherwise, we have to
-        * honor the credit field.
-        */
-       if (credit == IPATH_AETH_CREDIT_INVAL)
-               qp->s_lsn = (u32) -1;
-       else if (qp->s_lsn != (u32) -1) {
-               /* Compute new LSN (i.e., MSN + credit) */
-               credit = (aeth + credit_table[credit]) & IPATH_MSN_MASK;
-               if (ipath_cmp24(credit, qp->s_lsn) > 0)
-                       qp->s_lsn = credit;
-       }
-
-       /* Restart sending if it was blocked due to lack of credits. */
-       if ((qp->s_flags & IPATH_S_WAIT_SSN_CREDIT) &&
-           qp->s_cur != qp->s_head &&
-           (qp->s_lsn == (u32) -1 ||
-            ipath_cmp24(get_swqe_ptr(qp, qp->s_cur)->ssn,
-                        qp->s_lsn + 1) <= 0))
-               ipath_schedule_send(qp);
-}
diff --git a/drivers/infiniband/hw/ipath/ipath_rc.c b/drivers/infiniband/hw/ipath/ipath_rc.c
deleted file mode 100644 (file)
index 79b3dbc..0000000
+++ /dev/null
@@ -1,1969 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/io.h>
-
-#include "ipath_verbs.h"
-#include "ipath_kernel.h"
-
-/* cut down ridiculously long IB macro names */
-#define OP(x) IB_OPCODE_RC_##x
-
-static u32 restart_sge(struct ipath_sge_state *ss, struct ipath_swqe *wqe,
-                      u32 psn, u32 pmtu)
-{
-       u32 len;
-
-       len = ((psn - wqe->psn) & IPATH_PSN_MASK) * pmtu;
-       ss->sge = wqe->sg_list[0];
-       ss->sg_list = wqe->sg_list + 1;
-       ss->num_sge = wqe->wr.num_sge;
-       ipath_skip_sge(ss, len);
-       return wqe->length - len;
-}
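
Worked example (illustrative): if a request is restarted at PSN 0x13 for a WQE whose first PSN is 0x10, with a path MTU of 2048 bytes, restart_sge() skips (0x13 - 0x10) * 2048 = 6144 bytes already covered by earlier packets and returns wqe->length - 6144 as the length still to be (re)sent.
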
-
-/**
- * ipath_init_restart - initialize the qp->s_sge after a restart
- * @qp: the QP whose SGE we're restarting
- * @wqe: the work queue to initialize the QP's SGE from
- *
- * The QP s_lock should be held and interrupts disabled.
- */
-static void ipath_init_restart(struct ipath_qp *qp, struct ipath_swqe *wqe)
-{
-       struct ipath_ibdev *dev;
-
-       qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn,
-                               ib_mtu_enum_to_int(qp->path_mtu));
-       dev = to_idev(qp->ibqp.device);
-       spin_lock(&dev->pending_lock);
-       if (list_empty(&qp->timerwait))
-               list_add_tail(&qp->timerwait,
-                             &dev->pending[dev->pending_index]);
-       spin_unlock(&dev->pending_lock);
-}
-
-/**
- * ipath_make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
- * @qp: a pointer to the QP
- * @ohdr: a pointer to the IB header being constructed
- * @pmtu: the path MTU
- *
- * Return 1 if constructed; otherwise, return 0.
- * Note that we are on the responder's side of the QP context.
- * Note the QP s_lock must be held.
- */
-static int ipath_make_rc_ack(struct ipath_ibdev *dev, struct ipath_qp *qp,
-                            struct ipath_other_headers *ohdr, u32 pmtu)
-{
-       struct ipath_ack_entry *e;
-       u32 hwords;
-       u32 len;
-       u32 bth0;
-       u32 bth2;
-
-       /* Don't send an ACK if we aren't supposed to. */
-       if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
-               goto bail;
-
-       /* header size in 32-bit words LRH+BTH = (8+12)/4. */
-       hwords = 5;
-
-       switch (qp->s_ack_state) {
-       case OP(RDMA_READ_RESPONSE_LAST):
-       case OP(RDMA_READ_RESPONSE_ONLY):
-       case OP(ATOMIC_ACKNOWLEDGE):
-               /*
-                * We can increment the tail pointer now that the last
-                * response has been sent instead of only being
-                * constructed.
-                */
-               if (++qp->s_tail_ack_queue > IPATH_MAX_RDMA_ATOMIC)
-                       qp->s_tail_ack_queue = 0;
-               /* FALLTHROUGH */
-       case OP(SEND_ONLY):
-       case OP(ACKNOWLEDGE):
-               /* Check for no next entry in the queue. */
-               if (qp->r_head_ack_queue == qp->s_tail_ack_queue) {
-                       if (qp->s_flags & IPATH_S_ACK_PENDING)
-                               goto normal;
-                       qp->s_ack_state = OP(ACKNOWLEDGE);
-                       goto bail;
-               }
-
-               e = &qp->s_ack_queue[qp->s_tail_ack_queue];
-               if (e->opcode == OP(RDMA_READ_REQUEST)) {
-                       /* Copy SGE state in case we need to resend */
-                       qp->s_ack_rdma_sge = e->rdma_sge;
-                       qp->s_cur_sge = &qp->s_ack_rdma_sge;
-                       len = e->rdma_sge.sge.sge_length;
-                       if (len > pmtu) {
-                               len = pmtu;
-                               qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
-                       } else {
-                               qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
-                               e->sent = 1;
-                       }
-                       ohdr->u.aeth = ipath_compute_aeth(qp);
-                       hwords++;
-                       qp->s_ack_rdma_psn = e->psn;
-                       bth2 = qp->s_ack_rdma_psn++ & IPATH_PSN_MASK;
-               } else {
-                       /* COMPARE_SWAP or FETCH_ADD */
-                       qp->s_cur_sge = NULL;
-                       len = 0;
-                       qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
-                       ohdr->u.at.aeth = ipath_compute_aeth(qp);
-                       ohdr->u.at.atomic_ack_eth[0] =
-                               cpu_to_be32(e->atomic_data >> 32);
-                       ohdr->u.at.atomic_ack_eth[1] =
-                               cpu_to_be32(e->atomic_data);
-                       hwords += sizeof(ohdr->u.at) / sizeof(u32);
-                       bth2 = e->psn;
-                       e->sent = 1;
-               }
-               bth0 = qp->s_ack_state << 24;
-               break;
-
-       case OP(RDMA_READ_RESPONSE_FIRST):
-               qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
-               /* FALLTHROUGH */
-       case OP(RDMA_READ_RESPONSE_MIDDLE):
-               len = qp->s_ack_rdma_sge.sge.sge_length;
-               if (len > pmtu)
-                       len = pmtu;
-               else {
-                       ohdr->u.aeth = ipath_compute_aeth(qp);
-                       hwords++;
-                       qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
-                       qp->s_ack_queue[qp->s_tail_ack_queue].sent = 1;
-               }
-               bth0 = qp->s_ack_state << 24;
-               bth2 = qp->s_ack_rdma_psn++ & IPATH_PSN_MASK;
-               break;
-
-       default:
-       normal:
-               /*
-                * Send a regular ACK.
-                * Set the s_ack_state so we wait until after sending
-                * the ACK before setting s_ack_state to ACKNOWLEDGE
-                * (see above).
-                */
-               qp->s_ack_state = OP(SEND_ONLY);
-               qp->s_flags &= ~IPATH_S_ACK_PENDING;
-               qp->s_cur_sge = NULL;
-               if (qp->s_nak_state)
-                       ohdr->u.aeth =
-                               cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) |
-                                           (qp->s_nak_state <<
-                                            IPATH_AETH_CREDIT_SHIFT));
-               else
-                       ohdr->u.aeth = ipath_compute_aeth(qp);
-               hwords++;
-               len = 0;
-               bth0 = OP(ACKNOWLEDGE) << 24;
-               bth2 = qp->s_ack_psn & IPATH_PSN_MASK;
-       }
-       qp->s_hdrwords = hwords;
-       qp->s_cur_size = len;
-       ipath_make_ruc_header(dev, qp, ohdr, bth0, bth2);
-       return 1;
-
-bail:
-       return 0;
-}
-
-/**
- * ipath_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
- * @qp: a pointer to the QP
- *
- * Return 1 if constructed; otherwise, return 0.
- */
-int ipath_make_rc_req(struct ipath_qp *qp)
-{
-       struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
-       struct ipath_other_headers *ohdr;
-       struct ipath_sge_state *ss;
-       struct ipath_swqe *wqe;
-       u32 hwords;
-       u32 len;
-       u32 bth0;
-       u32 bth2;
-       u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
-       char newreq;
-       unsigned long flags;
-       int ret = 0;
-
-       ohdr = &qp->s_hdr.u.oth;
-       if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
-               ohdr = &qp->s_hdr.u.l.oth;
-
-       /*
-        * The lock is needed to synchronize between the sending tasklet,
-        * the receive interrupt handler, and timeout resends.
-        */
-       spin_lock_irqsave(&qp->s_lock, flags);
-
-       /* Sending responses has higher priority than sending requests. */
-       if ((qp->r_head_ack_queue != qp->s_tail_ack_queue ||
-            (qp->s_flags & IPATH_S_ACK_PENDING) ||
-            qp->s_ack_state != OP(ACKNOWLEDGE)) &&
-           ipath_make_rc_ack(dev, qp, ohdr, pmtu))
-               goto done;
-
-       if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)) {
-               if (!(ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND))
-                       goto bail;
-               /* We are in the error state, flush the work request. */
-               if (qp->s_last == qp->s_head)
-                       goto bail;
-               /* If DMAs are in progress, we can't flush immediately. */
-               if (atomic_read(&qp->s_dma_busy)) {
-                       qp->s_flags |= IPATH_S_WAIT_DMA;
-                       goto bail;
-               }
-               wqe = get_swqe_ptr(qp, qp->s_last);
-               ipath_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
-               goto done;
-       }
-
-       /* Leave BUSY set until RNR timeout. */
-       if (qp->s_rnr_timeout) {
-               qp->s_flags |= IPATH_S_WAITING;
-               goto bail;
-       }
-
-       /* header size in 32-bit words LRH+BTH = (8+12)/4. */
-       hwords = 5;
-       bth0 = 1 << 22; /* Set M bit */
-
-       /* Send a request. */
-       wqe = get_swqe_ptr(qp, qp->s_cur);
-       switch (qp->s_state) {
-       default:
-               if (!(ib_ipath_state_ops[qp->state] &
-                   IPATH_PROCESS_NEXT_SEND_OK))
-                       goto bail;
-               /*
-                * Resend an old request or start a new one.
-                *
-                * We keep track of the current SWQE so that
-                * we don't reset the "furthest progress" state
-                * if we need to back up.
-                */
-               newreq = 0;
-               if (qp->s_cur == qp->s_tail) {
-                       /* Check if send work queue is empty. */
-                       if (qp->s_tail == qp->s_head)
-                               goto bail;
-                       /*
-                        * If a fence is requested, wait for previous
-                        * RDMA read and atomic operations to finish.
-                        */
-                       if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
-                           qp->s_num_rd_atomic) {
-                               qp->s_flags |= IPATH_S_FENCE_PENDING;
-                               goto bail;
-                       }
-                       wqe->psn = qp->s_next_psn;
-                       newreq = 1;
-               }
-               /*
-                * Note that we have to be careful not to modify the
-                * original work request since we may need to resend
-                * it.
-                */
-               len = wqe->length;
-               ss = &qp->s_sge;
-               bth2 = 0;
-               switch (wqe->wr.opcode) {
-               case IB_WR_SEND:
-               case IB_WR_SEND_WITH_IMM:
-                       /* If no credit, return. */
-                       if (qp->s_lsn != (u32) -1 &&
-                           ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) {
-                               qp->s_flags |= IPATH_S_WAIT_SSN_CREDIT;
-                               goto bail;
-                       }
-                       wqe->lpsn = wqe->psn;
-                       if (len > pmtu) {
-                               wqe->lpsn += (len - 1) / pmtu;
-                               qp->s_state = OP(SEND_FIRST);
-                               len = pmtu;
-                               break;
-                       }
-                       if (wqe->wr.opcode == IB_WR_SEND)
-                               qp->s_state = OP(SEND_ONLY);
-                       else {
-                               qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
-                               /* Immediate data comes after the BTH */
-                               ohdr->u.imm_data = wqe->wr.ex.imm_data;
-                               hwords += 1;
-                       }
-                       if (wqe->wr.send_flags & IB_SEND_SOLICITED)
-                               bth0 |= 1 << 23;
-                       bth2 = 1 << 31; /* Request ACK. */
-                       if (++qp->s_cur == qp->s_size)
-                               qp->s_cur = 0;
-                       break;
-
-               case IB_WR_RDMA_WRITE:
-                       if (newreq && qp->s_lsn != (u32) -1)
-                               qp->s_lsn++;
-                       /* FALLTHROUGH */
-               case IB_WR_RDMA_WRITE_WITH_IMM:
-                       /* If no credit, return. */
-                       if (qp->s_lsn != (u32) -1 &&
-                           ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) {
-                               qp->s_flags |= IPATH_S_WAIT_SSN_CREDIT;
-                               goto bail;
-                       }
-                       ohdr->u.rc.reth.vaddr =
-                               cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
-                       ohdr->u.rc.reth.rkey =
-                               cpu_to_be32(wqe->wr.wr.rdma.rkey);
-                       ohdr->u.rc.reth.length = cpu_to_be32(len);
-                       hwords += sizeof(struct ib_reth) / sizeof(u32);
-                       wqe->lpsn = wqe->psn;
-                       if (len > pmtu) {
-                               wqe->lpsn += (len - 1) / pmtu;
-                               qp->s_state = OP(RDMA_WRITE_FIRST);
-                               len = pmtu;
-                               break;
-                       }
-                       if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
-                               qp->s_state = OP(RDMA_WRITE_ONLY);
-                       else {
-                               qp->s_state =
-                                       OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
-                               /* Immediate data comes after RETH */
-                               ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
-                               hwords += 1;
-                               if (wqe->wr.send_flags & IB_SEND_SOLICITED)
-                                       bth0 |= 1 << 23;
-                       }
-                       bth2 = 1 << 31; /* Request ACK. */
-                       if (++qp->s_cur == qp->s_size)
-                               qp->s_cur = 0;
-                       break;
-
-               case IB_WR_RDMA_READ:
-                       /*
-                        * Don't allow more operations to be started
-                        * than the QP limits allow.
-                        */
-                       if (newreq) {
-                               if (qp->s_num_rd_atomic >=
-                                   qp->s_max_rd_atomic) {
-                                       qp->s_flags |= IPATH_S_RDMAR_PENDING;
-                                       goto bail;
-                               }
-                               qp->s_num_rd_atomic++;
-                               if (qp->s_lsn != (u32) -1)
-                                       qp->s_lsn++;
-                               /*
-                                * Adjust s_next_psn to count the
-                                * expected number of responses.
-                                */
-                               if (len > pmtu)
-                                       qp->s_next_psn += (len - 1) / pmtu;
-                               wqe->lpsn = qp->s_next_psn++;
-                       }
-                       ohdr->u.rc.reth.vaddr =
-                               cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
-                       ohdr->u.rc.reth.rkey =
-                               cpu_to_be32(wqe->wr.wr.rdma.rkey);
-                       ohdr->u.rc.reth.length = cpu_to_be32(len);
-                       qp->s_state = OP(RDMA_READ_REQUEST);
-                       hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
-                       ss = NULL;
-                       len = 0;
-                       if (++qp->s_cur == qp->s_size)
-                               qp->s_cur = 0;
-                       break;
-
-               case IB_WR_ATOMIC_CMP_AND_SWP:
-               case IB_WR_ATOMIC_FETCH_AND_ADD:
-                       /*
-                        * Don't allow more operations to be started
-                        * than the QP limits allow.
-                        */
-                       if (newreq) {
-                               if (qp->s_num_rd_atomic >=
-                                   qp->s_max_rd_atomic) {
-                                       qp->s_flags |= IPATH_S_RDMAR_PENDING;
-                                       goto bail;
-                               }
-                               qp->s_num_rd_atomic++;
-                               if (qp->s_lsn != (u32) -1)
-                                       qp->s_lsn++;
-                               wqe->lpsn = wqe->psn;
-                       }
-                       if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
-                               qp->s_state = OP(COMPARE_SWAP);
-                               ohdr->u.atomic_eth.swap_data = cpu_to_be64(
-                                       wqe->wr.wr.atomic.swap);
-                               ohdr->u.atomic_eth.compare_data = cpu_to_be64(
-                                       wqe->wr.wr.atomic.compare_add);
-                       } else {
-                               qp->s_state = OP(FETCH_ADD);
-                               ohdr->u.atomic_eth.swap_data = cpu_to_be64(
-                                       wqe->wr.wr.atomic.compare_add);
-                               ohdr->u.atomic_eth.compare_data = 0;
-                       }
-                       ohdr->u.atomic_eth.vaddr[0] = cpu_to_be32(
-                               wqe->wr.wr.atomic.remote_addr >> 32);
-                       ohdr->u.atomic_eth.vaddr[1] = cpu_to_be32(
-                               wqe->wr.wr.atomic.remote_addr);
-                       ohdr->u.atomic_eth.rkey = cpu_to_be32(
-                               wqe->wr.wr.atomic.rkey);
-                       hwords += sizeof(struct ib_atomic_eth) / sizeof(u32);
-                       ss = NULL;
-                       len = 0;
-                       if (++qp->s_cur == qp->s_size)
-                               qp->s_cur = 0;
-                       break;
-
-               default:
-                       goto bail;
-               }
-               qp->s_sge.sge = wqe->sg_list[0];
-               qp->s_sge.sg_list = wqe->sg_list + 1;
-               qp->s_sge.num_sge = wqe->wr.num_sge;
-               qp->s_len = wqe->length;
-               if (newreq) {
-                       qp->s_tail++;
-                       if (qp->s_tail >= qp->s_size)
-                               qp->s_tail = 0;
-               }
-               bth2 |= qp->s_psn & IPATH_PSN_MASK;
-               if (wqe->wr.opcode == IB_WR_RDMA_READ)
-                       qp->s_psn = wqe->lpsn + 1;
-               else {
-                       qp->s_psn++;
-                       if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
-                               qp->s_next_psn = qp->s_psn;
-               }
-               /*
-                * Put the QP on the pending list so lost ACKs will cause
-                * a retry.  More than one request can be pending so the
-                * QP may already be on the dev->pending list.
-                */
-               spin_lock(&dev->pending_lock);
-               if (list_empty(&qp->timerwait))
-                       list_add_tail(&qp->timerwait,
-                                     &dev->pending[dev->pending_index]);
-               spin_unlock(&dev->pending_lock);
-               break;
-
-       case OP(RDMA_READ_RESPONSE_FIRST):
-               /*
-                * This case can only happen if a send is restarted.
-                * See ipath_restart_rc().
-                */
-               ipath_init_restart(qp, wqe);
-               /* FALLTHROUGH */
-       case OP(SEND_FIRST):
-               qp->s_state = OP(SEND_MIDDLE);
-               /* FALLTHROUGH */
-       case OP(SEND_MIDDLE):
-               bth2 = qp->s_psn++ & IPATH_PSN_MASK;
-               if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
-                       qp->s_next_psn = qp->s_psn;
-               ss = &qp->s_sge;
-               len = qp->s_len;
-               if (len > pmtu) {
-                       len = pmtu;
-                       break;
-               }
-               if (wqe->wr.opcode == IB_WR_SEND)
-                       qp->s_state = OP(SEND_LAST);
-               else {
-                       qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
-                       /* Immediate data comes after the BTH */
-                       ohdr->u.imm_data = wqe->wr.ex.imm_data;
-                       hwords += 1;
-               }
-               if (wqe->wr.send_flags & IB_SEND_SOLICITED)
-                       bth0 |= 1 << 23;
-               bth2 |= 1 << 31;        /* Request ACK. */
-               qp->s_cur++;
-               if (qp->s_cur >= qp->s_size)
-                       qp->s_cur = 0;
-               break;
-
-       case OP(RDMA_READ_RESPONSE_LAST):
-               /*
-                * This case can only happen if a RDMA write is restarted.
-                * See ipath_restart_rc().
-                */
-               ipath_init_restart(qp, wqe);
-               /* FALLTHROUGH */
-       case OP(RDMA_WRITE_FIRST):
-               qp->s_state = OP(RDMA_WRITE_MIDDLE);
-               /* FALLTHROUGH */
-       case OP(RDMA_WRITE_MIDDLE):
-               bth2 = qp->s_psn++ & IPATH_PSN_MASK;
-               if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
-                       qp->s_next_psn = qp->s_psn;
-               ss = &qp->s_sge;
-               len = qp->s_len;
-               if (len > pmtu) {
-                       len = pmtu;
-                       break;
-               }
-               if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
-                       qp->s_state = OP(RDMA_WRITE_LAST);
-               else {
-                       qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
-                       /* Immediate data comes after the BTH */
-                       ohdr->u.imm_data = wqe->wr.ex.imm_data;
-                       hwords += 1;
-                       if (wqe->wr.send_flags & IB_SEND_SOLICITED)
-                               bth0 |= 1 << 23;
-               }
-               bth2 |= 1 << 31;        /* Request ACK. */
-               qp->s_cur++;
-               if (qp->s_cur >= qp->s_size)
-                       qp->s_cur = 0;
-               break;
-
-       case OP(RDMA_READ_RESPONSE_MIDDLE):
-               /*
-                * This case can only happen if a RDMA read is restarted.
-                * See ipath_restart_rc().
-                */
-               ipath_init_restart(qp, wqe);
-               len = ((qp->s_psn - wqe->psn) & IPATH_PSN_MASK) * pmtu;
-               ohdr->u.rc.reth.vaddr =
-                       cpu_to_be64(wqe->wr.wr.rdma.remote_addr + len);
-               ohdr->u.rc.reth.rkey =
-                       cpu_to_be32(wqe->wr.wr.rdma.rkey);
-               ohdr->u.rc.reth.length = cpu_to_be32(qp->s_len);
-               qp->s_state = OP(RDMA_READ_REQUEST);
-               hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
-               bth2 = qp->s_psn & IPATH_PSN_MASK;
-               qp->s_psn = wqe->lpsn + 1;
-               ss = NULL;
-               len = 0;
-               qp->s_cur++;
-               if (qp->s_cur == qp->s_size)
-                       qp->s_cur = 0;
-               break;
-       }
-       if (ipath_cmp24(qp->s_psn, qp->s_last_psn + IPATH_PSN_CREDIT - 1) >= 0)
-               bth2 |= 1 << 31;        /* Request ACK. */
-       qp->s_len -= len;
-       qp->s_hdrwords = hwords;
-       qp->s_cur_sge = ss;
-       qp->s_cur_size = len;
-       ipath_make_ruc_header(dev, qp, ohdr, bth0 | (qp->s_state << 24), bth2);
-done:
-       ret = 1;
-       goto unlock;
-
-bail:
-       qp->s_flags &= ~IPATH_S_BUSY;
-unlock:
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-       return ret;
-}
-
-/**
- * send_rc_ack - Construct an ACK packet and send it
- * @qp: a pointer to the QP
- *
- * This is called from ipath_rc_rcv() and only uses the receive
- * side QP state.
- * Note that RDMA reads and atomics are handled in the
- * send side QP state and tasklet.
- */
-static void send_rc_ack(struct ipath_qp *qp)
-{
-       struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
-       struct ipath_devdata *dd;
-       u16 lrh0;
-       u32 bth0;
-       u32 hwords;
-       u32 __iomem *piobuf;
-       struct ipath_ib_header hdr;
-       struct ipath_other_headers *ohdr;
-       unsigned long flags;
-
-       spin_lock_irqsave(&qp->s_lock, flags);
-
-       /* Don't send ACK or NAK if a RDMA read or atomic is pending. */
-       if (qp->r_head_ack_queue != qp->s_tail_ack_queue ||
-           (qp->s_flags & IPATH_S_ACK_PENDING) ||
-           qp->s_ack_state != OP(ACKNOWLEDGE))
-               goto queue_ack;
-
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-
-       /* Don't try to send ACKs if the link isn't ACTIVE */
-       dd = dev->dd;
-       if (!(dd->ipath_flags & IPATH_LINKACTIVE))
-               goto done;
-
-       piobuf = ipath_getpiobuf(dd, 0, NULL);
-       if (!piobuf) {
-               /*
-                * We are out of PIO buffers at the moment.
-                * Pass responsibility for sending the ACK to the
-                * send tasklet so that when a PIO buffer becomes
-                * available, the ACK is sent ahead of other outgoing
-                * packets.
-                */
-               spin_lock_irqsave(&qp->s_lock, flags);
-               goto queue_ack;
-       }
-
-       /* Construct the header. */
-       ohdr = &hdr.u.oth;
-       lrh0 = IPATH_LRH_BTH;
-       /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4. */
-       hwords = 6;
-       if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
-               hwords += ipath_make_grh(dev, &hdr.u.l.grh,
-                                        &qp->remote_ah_attr.grh,
-                                        hwords, 0);
-               ohdr = &hdr.u.l.oth;
-               lrh0 = IPATH_LRH_GRH;
-       }
-       /* read pkey_index w/o lock (it's atomic) */
-       bth0 = ipath_get_pkey(dd, qp->s_pkey_index) |
-               (OP(ACKNOWLEDGE) << 24) | (1 << 22);
-       if (qp->r_nak_state)
-               ohdr->u.aeth = cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) |
-                                           (qp->r_nak_state <<
-                                            IPATH_AETH_CREDIT_SHIFT));
-       else
-               ohdr->u.aeth = ipath_compute_aeth(qp);
-       lrh0 |= qp->remote_ah_attr.sl << 4;
-       hdr.lrh[0] = cpu_to_be16(lrh0);
-       hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
-       hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
-       hdr.lrh[3] = cpu_to_be16(dd->ipath_lid |
-                                qp->remote_ah_attr.src_path_bits);
-       ohdr->bth[0] = cpu_to_be32(bth0);
-       ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
-       ohdr->bth[2] = cpu_to_be32(qp->r_ack_psn & IPATH_PSN_MASK);
-
-       writeq(hwords + 1, piobuf);
-
-       if (dd->ipath_flags & IPATH_PIO_FLUSH_WC) {
-               u32 *hdrp = (u32 *) &hdr;
-
-               ipath_flush_wc();
-               __iowrite32_copy(piobuf + 2, hdrp, hwords - 1);
-               ipath_flush_wc();
-               __raw_writel(hdrp[hwords - 1], piobuf + hwords + 1);
-       } else
-               __iowrite32_copy(piobuf + 2, (u32 *) &hdr, hwords);
-
-       ipath_flush_wc();
-
-       dev->n_unicast_xmit++;
-       goto done;
-
-queue_ack:
-       if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK) {
-               dev->n_rc_qacks++;
-               qp->s_flags |= IPATH_S_ACK_PENDING;
-               qp->s_nak_state = qp->r_nak_state;
-               qp->s_ack_psn = qp->r_ack_psn;
-
-               /* Schedule the send tasklet. */
-               ipath_schedule_send(qp);
-       }
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-done:
-       return;
-}
-
-/**
- * reset_psn - reset the QP state to send starting from PSN
- * @qp: the QP
- * @psn: the packet sequence number to restart at
- *
- * This is called from ipath_rc_rcv() to process an incoming RC ACK
- * for the given QP.
- * Called at interrupt level with the QP s_lock held.
- */
-static void reset_psn(struct ipath_qp *qp, u32 psn)
-{
-       u32 n = qp->s_last;
-       struct ipath_swqe *wqe = get_swqe_ptr(qp, n);
-       u32 opcode;
-
-       qp->s_cur = n;
-
-       /*
-        * If we are starting the request from the beginning,
-        * let the normal send code handle initialization.
-        */
-       if (ipath_cmp24(psn, wqe->psn) <= 0) {
-               qp->s_state = OP(SEND_LAST);
-               goto done;
-       }
-
-       /* Find the work request opcode corresponding to the given PSN. */
-       opcode = wqe->wr.opcode;
-       for (;;) {
-               int diff;
-
-               if (++n == qp->s_size)
-                       n = 0;
-               if (n == qp->s_tail)
-                       break;
-               wqe = get_swqe_ptr(qp, n);
-               diff = ipath_cmp24(psn, wqe->psn);
-               if (diff < 0)
-                       break;
-               qp->s_cur = n;
-               /*
-                * If we are starting the request from the beginning,
-                * let the normal send code handle initialization.
-                */
-               if (diff == 0) {
-                       qp->s_state = OP(SEND_LAST);
-                       goto done;
-               }
-               opcode = wqe->wr.opcode;
-       }
-
-       /*
-        * Set the state to restart in the middle of a request.
-        * Don't change the s_sge, s_cur_sge, or s_cur_size.
-        * See ipath_make_rc_req().
-        */
-       switch (opcode) {
-       case IB_WR_SEND:
-       case IB_WR_SEND_WITH_IMM:
-               qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
-               break;
-
-       case IB_WR_RDMA_WRITE:
-       case IB_WR_RDMA_WRITE_WITH_IMM:
-               qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
-               break;
-
-       case IB_WR_RDMA_READ:
-               qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
-               break;
-
-       default:
-               /*
-                * This case shouldn't happen since it's only
-                * one PSN per request.
-                */
-               qp->s_state = OP(SEND_LAST);
-       }
-done:
-       qp->s_psn = psn;
-}
-
-/**
- * ipath_restart_rc - back up requester to resend the last un-ACKed request
- * @qp: the QP to restart
- * @psn: packet sequence number for the request
- *
- * The QP s_lock should be held and interrupts disabled.
- */
-void ipath_restart_rc(struct ipath_qp *qp, u32 psn)
-{
-       struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last);
-       struct ipath_ibdev *dev;
-
-       if (qp->s_retry == 0) {
-               ipath_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
-               ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
-               goto bail;
-       }
-       qp->s_retry--;
-
-       /*
-        * Remove the QP from the timeout queue.
-        * Note: it may already have been removed by ipath_ib_timer().
-        */
-       dev = to_idev(qp->ibqp.device);
-       spin_lock(&dev->pending_lock);
-       if (!list_empty(&qp->timerwait))
-               list_del_init(&qp->timerwait);
-       if (!list_empty(&qp->piowait))
-               list_del_init(&qp->piowait);
-       spin_unlock(&dev->pending_lock);
-
-       if (wqe->wr.opcode == IB_WR_RDMA_READ)
-               dev->n_rc_resends++;
-       else
-               dev->n_rc_resends += (qp->s_psn - psn) & IPATH_PSN_MASK;
-
-       reset_psn(qp, psn);
-       ipath_schedule_send(qp);
-
-bail:
-       return;
-}
-
-static inline void update_last_psn(struct ipath_qp *qp, u32 psn)
-{
-       qp->s_last_psn = psn;
-}
-
-/**
- * do_rc_ack - process an incoming RC ACK
- * @qp: the QP the ACK came in on
- * @psn: the packet sequence number of the ACK
- * @opcode: the opcode of the request that resulted in the ACK
- *
- * This is called from ipath_rc_rcv_resp() to process an incoming RC ACK
- * for the given QP.
- * Called at interrupt level with the QP s_lock held and interrupts disabled.
- * Returns 1 if OK, 0 if current operation should be aborted (NAK).
- */
-static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode,
-                    u64 val)
-{
-       struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
-       struct ib_wc wc;
-       enum ib_wc_status status;
-       struct ipath_swqe *wqe;
-       int ret = 0;
-       u32 ack_psn;
-       int diff;
-
-       /*
-        * Remove the QP from the timeout queue (or RNR timeout queue).
-        * If ipath_ib_timer() has already removed it,
-        * it's OK since we hold the QP s_lock and ipath_restart_rc()
-        * just won't find anything to restart if we ACK everything.
-        */
-       spin_lock(&dev->pending_lock);
-       if (!list_empty(&qp->timerwait))
-               list_del_init(&qp->timerwait);
-       spin_unlock(&dev->pending_lock);
-
-       /*
-        * Note that NAKs implicitly ACK outstanding SEND and RDMA write
-        * requests and implicitly NAK RDMA read and atomic requests issued
-        * before the NAK'ed request.  The MSN won't include the NAK'ed
-        * request but will include any ACK'ed request(s).
-        */
-       ack_psn = psn;
-       if (aeth >> 29)
-               ack_psn--;
-       wqe = get_swqe_ptr(qp, qp->s_last);
-
-       /*
-        * The MSN might be for a later WQE than the PSN indicates so
-        * only complete WQEs that the PSN finishes.
-        */
-       while ((diff = ipath_cmp24(ack_psn, wqe->lpsn)) >= 0) {
-               /*
-                * RDMA_READ_RESPONSE_ONLY is a special case since
-                * we want to generate completion events for everything
-                * before the RDMA read, copy the data, then generate
-                * the completion for the read.
-                */
-               if (wqe->wr.opcode == IB_WR_RDMA_READ &&
-                   opcode == OP(RDMA_READ_RESPONSE_ONLY) &&
-                   diff == 0) {
-                       ret = 1;
-                       goto bail;
-               }
-               /*
-                * If this request is a RDMA read or atomic, and the ACK is
-                * for a later operation, this ACK NAKs the RDMA read or
-                * atomic.  In other words, only a RDMA_READ_LAST or ONLY
-                * can ACK a RDMA read and likewise for atomic ops.  Note
-                * that the NAK case can only happen if relaxed ordering is
-                * used and requests are sent after an RDMA read or atomic
-                * is sent but before the response is received.
-                */
-               if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
-                    (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
-                   ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
-                     wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
-                    (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
-                       /*
-                        * The last valid PSN seen is the previous
-                        * request's.
-                        */
-                       update_last_psn(qp, wqe->psn - 1);
-                       /* Retry this request. */
-                       ipath_restart_rc(qp, wqe->psn);
-                       /*
-                        * No need to process the ACK/NAK since we are
-                        * restarting an earlier request.
-                        */
-                       goto bail;
-               }
-               if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
-                   wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
-                       *(u64 *) wqe->sg_list[0].vaddr = val;
-               if (qp->s_num_rd_atomic &&
-                   (wqe->wr.opcode == IB_WR_RDMA_READ ||
-                    wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
-                    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) {
-                       qp->s_num_rd_atomic--;
-                       /* Restart sending task if fence is complete */
-                       if (((qp->s_flags & IPATH_S_FENCE_PENDING) &&
-                            !qp->s_num_rd_atomic) ||
-                           qp->s_flags & IPATH_S_RDMAR_PENDING)
-                               ipath_schedule_send(qp);
-               }
-               /* Post a send completion queue entry if requested. */
-               if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) ||
-                   (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
-                       memset(&wc, 0, sizeof wc);
-                       wc.wr_id = wqe->wr.wr_id;
-                       wc.status = IB_WC_SUCCESS;
-                       wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
-                       wc.byte_len = wqe->length;
-                       wc.qp = &qp->ibqp;
-                       wc.src_qp = qp->remote_qpn;
-                       wc.slid = qp->remote_ah_attr.dlid;
-                       wc.sl = qp->remote_ah_attr.sl;
-                       ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
-               }
-               qp->s_retry = qp->s_retry_cnt;
-               /*
-                * If we are completing a request which is in the process of
-                * being resent, we can stop resending it since we know the
-                * responder has already seen it.
-                */
-               if (qp->s_last == qp->s_cur) {
-                       if (++qp->s_cur >= qp->s_size)
-                               qp->s_cur = 0;
-                       qp->s_last = qp->s_cur;
-                       if (qp->s_last == qp->s_tail)
-                               break;
-                       wqe = get_swqe_ptr(qp, qp->s_cur);
-                       qp->s_state = OP(SEND_LAST);
-                       qp->s_psn = wqe->psn;
-               } else {
-                       if (++qp->s_last >= qp->s_size)
-                               qp->s_last = 0;
-                       if (qp->state == IB_QPS_SQD && qp->s_last == qp->s_cur)
-                               qp->s_draining = 0;
-                       if (qp->s_last == qp->s_tail)
-                               break;
-                       wqe = get_swqe_ptr(qp, qp->s_last);
-               }
-       }
-
-       switch (aeth >> 29) {
-       case 0:         /* ACK */
-               dev->n_rc_acks++;
-               /* If this is a partial ACK, reset the retransmit timer. */
-               if (qp->s_last != qp->s_tail) {
-                       spin_lock(&dev->pending_lock);
-                       if (list_empty(&qp->timerwait))
-                               list_add_tail(&qp->timerwait,
-                                       &dev->pending[dev->pending_index]);
-                       spin_unlock(&dev->pending_lock);
-                       /*
-                        * If we get a partial ACK for a resent operation,
-                        * we can stop resending the earlier packets and
-                        * continue with the next packet the receiver wants.
-                        */
-                       if (ipath_cmp24(qp->s_psn, psn) <= 0) {
-                               reset_psn(qp, psn + 1);
-                               ipath_schedule_send(qp);
-                       }
-               } else if (ipath_cmp24(qp->s_psn, psn) <= 0) {
-                       qp->s_state = OP(SEND_LAST);
-                       qp->s_psn = psn + 1;
-               }
-               ipath_get_credit(qp, aeth);
-               qp->s_rnr_retry = qp->s_rnr_retry_cnt;
-               qp->s_retry = qp->s_retry_cnt;
-               update_last_psn(qp, psn);
-               ret = 1;
-               goto bail;
-
-       case 1:         /* RNR NAK */
-               dev->n_rnr_naks++;
-               if (qp->s_last == qp->s_tail)
-                       goto bail;
-               if (qp->s_rnr_retry == 0) {
-                       status = IB_WC_RNR_RETRY_EXC_ERR;
-                       goto class_b;
-               }
-               if (qp->s_rnr_retry_cnt < 7)
-                       qp->s_rnr_retry--;
-
-               /* The last valid PSN is the previous PSN. */
-               update_last_psn(qp, psn - 1);
-
-               if (wqe->wr.opcode == IB_WR_RDMA_READ)
-                       dev->n_rc_resends++;
-               else
-                       dev->n_rc_resends +=
-                               (qp->s_psn - psn) & IPATH_PSN_MASK;
-
-               reset_psn(qp, psn);
-
-               qp->s_rnr_timeout =
-                       ib_ipath_rnr_table[(aeth >> IPATH_AETH_CREDIT_SHIFT) &
-                                          IPATH_AETH_CREDIT_MASK];
-               ipath_insert_rnr_queue(qp);
-               ipath_schedule_send(qp);
-               goto bail;
-
-       case 3:         /* NAK */
-               if (qp->s_last == qp->s_tail)
-                       goto bail;
-               /* The last valid PSN is the previous PSN. */
-               update_last_psn(qp, psn - 1);
-               switch ((aeth >> IPATH_AETH_CREDIT_SHIFT) &
-                       IPATH_AETH_CREDIT_MASK) {
-               case 0: /* PSN sequence error */
-                       dev->n_seq_naks++;
-                       /*
-                        * Back up to the responder's expected PSN.
-                        * Note that we might get a NAK in the middle of an
-                        * RDMA READ response which terminates the RDMA
-                        * READ.
-                        */
-                       ipath_restart_rc(qp, psn);
-                       break;
-
-               case 1: /* Invalid Request */
-                       status = IB_WC_REM_INV_REQ_ERR;
-                       dev->n_other_naks++;
-                       goto class_b;
-
-               case 2: /* Remote Access Error */
-                       status = IB_WC_REM_ACCESS_ERR;
-                       dev->n_other_naks++;
-                       goto class_b;
-
-               case 3: /* Remote Operation Error */
-                       status = IB_WC_REM_OP_ERR;
-                       dev->n_other_naks++;
-               class_b:
-                       ipath_send_complete(qp, wqe, status);
-                       ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
-                       break;
-
-               default:
-                       /* Ignore other reserved NAK error codes */
-                       goto reserved;
-               }
-               qp->s_rnr_retry = qp->s_rnr_retry_cnt;
-               goto bail;
-
-       default:                /* 2: reserved */
-       reserved:
-               /* Ignore reserved NAK codes. */
-               goto bail;
-       }
-
-bail:
-       return ret;
-}
-
-/**
- * ipath_rc_rcv_resp - process an incoming RC response packet
- * @dev: the device this packet came in on
- * @ohdr: the other headers for this packet
- * @data: the packet data
- * @tlen: the packet length
- * @qp: the QP for this packet
- * @opcode: the opcode for this packet
- * @psn: the packet sequence number for this packet
- * @hdrsize: the header length
- * @pmtu: the path MTU
- * @header_in_data: true if part of the header data is in the data buffer
- *
- * This is called from ipath_rc_rcv() to process an incoming RC response
- * packet for the given QP.
- * Called at interrupt level.
- */
-static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev,
-                                    struct ipath_other_headers *ohdr,
-                                    void *data, u32 tlen,
-                                    struct ipath_qp *qp,
-                                    u32 opcode,
-                                    u32 psn, u32 hdrsize, u32 pmtu,
-                                    int header_in_data)
-{
-       struct ipath_swqe *wqe;
-       enum ib_wc_status status;
-       unsigned long flags;
-       int diff;
-       u32 pad;
-       u32 aeth;
-       u64 val;
-
-       spin_lock_irqsave(&qp->s_lock, flags);
-
-       /* Double check we can process this now that we hold the s_lock. */
-       if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
-               goto ack_done;
-
-       /* Ignore invalid responses. */
-       if (ipath_cmp24(psn, qp->s_next_psn) >= 0)
-               goto ack_done;
-
-       /* Ignore duplicate responses. */
-       diff = ipath_cmp24(psn, qp->s_last_psn);
-       if (unlikely(diff <= 0)) {
-               /* Update credits for "ghost" ACKs */
-               if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {
-                       if (!header_in_data)
-                               aeth = be32_to_cpu(ohdr->u.aeth);
-                       else {
-                               aeth = be32_to_cpu(((__be32 *) data)[0]);
-                               data += sizeof(__be32);
-                       }
-                       if ((aeth >> 29) == 0)
-                               ipath_get_credit(qp, aeth);
-               }
-               goto ack_done;
-       }
-
-       if (unlikely(qp->s_last == qp->s_tail))
-               goto ack_done;
-       wqe = get_swqe_ptr(qp, qp->s_last);
-       status = IB_WC_SUCCESS;
-
-       switch (opcode) {
-       case OP(ACKNOWLEDGE):
-       case OP(ATOMIC_ACKNOWLEDGE):
-       case OP(RDMA_READ_RESPONSE_FIRST):
-               if (!header_in_data)
-                       aeth = be32_to_cpu(ohdr->u.aeth);
-               else {
-                       aeth = be32_to_cpu(((__be32 *) data)[0]);
-                       data += sizeof(__be32);
-               }
-               if (opcode == OP(ATOMIC_ACKNOWLEDGE)) {
-                       if (!header_in_data) {
-                               __be32 *p = ohdr->u.at.atomic_ack_eth;
-
-                               val = ((u64) be32_to_cpu(p[0]) << 32) |
-                                       be32_to_cpu(p[1]);
-                       } else
-                               val = be64_to_cpu(((__be64 *) data)[0]);
-               } else
-                       val = 0;
-               if (!do_rc_ack(qp, aeth, psn, opcode, val) ||
-                   opcode != OP(RDMA_READ_RESPONSE_FIRST))
-                       goto ack_done;
-               hdrsize += 4;
-               wqe = get_swqe_ptr(qp, qp->s_last);
-               if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
-                       goto ack_op_err;
-               qp->r_flags &= ~IPATH_R_RDMAR_SEQ;
-               /*
-                * If this is a response to a resent RDMA read, we
-                * have to be careful to copy the data to the right
-                * location.
-                */
-               qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
-                                                 wqe, psn, pmtu);
-               goto read_middle;
-
-       case OP(RDMA_READ_RESPONSE_MIDDLE):
-               /* no AETH, no ACK */
-               if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) {
-                       dev->n_rdma_seq++;
-                       if (qp->r_flags & IPATH_R_RDMAR_SEQ)
-                               goto ack_done;
-                       qp->r_flags |= IPATH_R_RDMAR_SEQ;
-                       ipath_restart_rc(qp, qp->s_last_psn + 1);
-                       goto ack_done;
-               }
-               if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
-                       goto ack_op_err;
-       read_middle:
-               if (unlikely(tlen != (hdrsize + pmtu + 4)))
-                       goto ack_len_err;
-               if (unlikely(pmtu >= qp->s_rdma_read_len))
-                       goto ack_len_err;
-
-               /* We got a response so update the timeout. */
-               spin_lock(&dev->pending_lock);
-               if (qp->s_rnr_timeout == 0 && !list_empty(&qp->timerwait))
-                       list_move_tail(&qp->timerwait,
-                                      &dev->pending[dev->pending_index]);
-               spin_unlock(&dev->pending_lock);
-
-               if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE))
-                       qp->s_retry = qp->s_retry_cnt;
-
-               /*
-                * Update the RDMA receive state but do the copy w/o
-                * holding the locks and blocking interrupts.
-                */
-               qp->s_rdma_read_len -= pmtu;
-               update_last_psn(qp, psn);
-               spin_unlock_irqrestore(&qp->s_lock, flags);
-               ipath_copy_sge(&qp->s_rdma_read_sge, data, pmtu);
-               goto bail;
-
-       case OP(RDMA_READ_RESPONSE_ONLY):
-               if (!header_in_data)
-                       aeth = be32_to_cpu(ohdr->u.aeth);
-               else
-                       aeth = be32_to_cpu(((__be32 *) data)[0]);
-               if (!do_rc_ack(qp, aeth, psn, opcode, 0))
-                       goto ack_done;
-               /* Get the number of bytes the message was padded by. */
-               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
-               /*
-                * Check that the data size is >= 0 && <= pmtu.
-                * Remember to account for the AETH header (4) and
-                * ICRC (4).
-                */
-               if (unlikely(tlen < (hdrsize + pad + 8)))
-                       goto ack_len_err;
-               /*
-                * If this is a response to a resent RDMA read, we
-                * have to be careful to copy the data to the right
-                * location.
-                */
-               wqe = get_swqe_ptr(qp, qp->s_last);
-               qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
-                                                 wqe, psn, pmtu);
-               goto read_last;
-
-       case OP(RDMA_READ_RESPONSE_LAST):
-               /* ACKs READ req. */
-               if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) {
-                       dev->n_rdma_seq++;
-                       if (qp->r_flags & IPATH_R_RDMAR_SEQ)
-                               goto ack_done;
-                       qp->r_flags |= IPATH_R_RDMAR_SEQ;
-                       ipath_restart_rc(qp, qp->s_last_psn + 1);
-                       goto ack_done;
-               }
-               if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
-                       goto ack_op_err;
-               /* Get the number of bytes the message was padded by. */
-               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
-               /*
-                * Check that the data size is >= 1 && <= pmtu.
-                * Remember to account for the AETH header (4) and
-                * ICRC (4).
-                */
-               if (unlikely(tlen <= (hdrsize + pad + 8)))
-                       goto ack_len_err;
-       read_last:
-               tlen -= hdrsize + pad + 8;
-               if (unlikely(tlen != qp->s_rdma_read_len))
-                       goto ack_len_err;
-               if (!header_in_data)
-                       aeth = be32_to_cpu(ohdr->u.aeth);
-               else {
-                       aeth = be32_to_cpu(((__be32 *) data)[0]);
-                       data += sizeof(__be32);
-               }
-               ipath_copy_sge(&qp->s_rdma_read_sge, data, tlen);
-               (void) do_rc_ack(qp, aeth, psn,
-                                OP(RDMA_READ_RESPONSE_LAST), 0);
-               goto ack_done;
-       }
-
-ack_op_err:
-       status = IB_WC_LOC_QP_OP_ERR;
-       goto ack_err;
-
-ack_len_err:
-       status = IB_WC_LOC_LEN_ERR;
-ack_err:
-       ipath_send_complete(qp, wqe, status);
-       ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
-ack_done:
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-bail:
-       return;
-}
-
-/**
- * ipath_rc_rcv_error - process an incoming duplicate or error RC packet
- * @dev: the device this packet came in on
- * @ohdr: the other headers for this packet
- * @data: the packet data
- * @qp: the QP for this packet
- * @opcode: the opcode for this packet
- * @psn: the packet sequence number for this packet
- * @diff: the difference between the PSN and the expected PSN
- * @header_in_data: true if part of the header data is in the data buffer
- *
- * This is called from ipath_rc_rcv() to process an unexpected
- * incoming RC packet for the given QP.
- * Called at interrupt level.
- * Return 1 if no more processing is needed; otherwise return 0 to
- * schedule a response to be sent.
- */
-static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev,
-                                    struct ipath_other_headers *ohdr,
-                                    void *data,
-                                    struct ipath_qp *qp,
-                                    u32 opcode,
-                                    u32 psn,
-                                    int diff,
-                                    int header_in_data)
-{
-       struct ipath_ack_entry *e;
-       u8 i, prev;
-       int old_req;
-       unsigned long flags;
-
-       if (diff > 0) {
-               /*
-                * Packet sequence error.
-                * A NAK will ACK earlier sends and RDMA writes.
-                * Don't queue the NAK if we already sent one.
-                */
-               if (!qp->r_nak_state) {
-                       qp->r_nak_state = IB_NAK_PSN_ERROR;
-                       /* Use the expected PSN. */
-                       qp->r_ack_psn = qp->r_psn;
-                       goto send_ack;
-               }
-               goto done;
-       }
-
-       /*
-        * Handle a duplicate request.  Don't re-execute SEND, RDMA
-        * write or atomic op.  Don't NAK errors, just silently drop
-        * the duplicate request.  Note that r_sge, r_len, and
-        * r_rcv_len may be in use so don't modify them.
-        *
-        * We are supposed to ACK the earliest duplicate PSN but we
-        * can coalesce an outstanding duplicate ACK.  We have to
-        * send the earliest so that RDMA reads can be restarted at
-        * the requester's expected PSN.
-        *
-        * First, find where this duplicate PSN falls within the
-        * ACKs previously sent.
-        */
-       psn &= IPATH_PSN_MASK;
-       e = NULL;
-       old_req = 1;
-
-       spin_lock_irqsave(&qp->s_lock, flags);
-       /* Double check we can process this now that we hold the s_lock. */
-       if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
-               goto unlock_done;
-
-       for (i = qp->r_head_ack_queue; ; i = prev) {
-               if (i == qp->s_tail_ack_queue)
-                       old_req = 0;
-               if (i)
-                       prev = i - 1;
-               else
-                       prev = IPATH_MAX_RDMA_ATOMIC;
-               if (prev == qp->r_head_ack_queue) {
-                       e = NULL;
-                       break;
-               }
-               e = &qp->s_ack_queue[prev];
-               if (!e->opcode) {
-                       e = NULL;
-                       break;
-               }
-               if (ipath_cmp24(psn, e->psn) >= 0) {
-                       if (prev == qp->s_tail_ack_queue)
-                               old_req = 0;
-                       break;
-               }
-       }
-       switch (opcode) {
-       case OP(RDMA_READ_REQUEST): {
-               struct ib_reth *reth;
-               u32 offset;
-               u32 len;
-
-               /*
-                * If we didn't find the RDMA read request in the ack queue,
-                * or the send tasklet is already backed up to send an
-                * earlier entry, we can ignore this request.
-                */
-               if (!e || e->opcode != OP(RDMA_READ_REQUEST) || old_req)
-                       goto unlock_done;
-               /* RETH comes after BTH */
-               if (!header_in_data)
-                       reth = &ohdr->u.rc.reth;
-               else {
-                       reth = (struct ib_reth *)data;
-                       data += sizeof(*reth);
-               }
-               /*
-                * Address range must be a subset of the original
-                * request and start on pmtu boundaries.
-                * We reuse the old ack_queue slot since the requester
-                * should not back up and request an earlier PSN for the
-                * same request.
-                */
-               offset = ((psn - e->psn) & IPATH_PSN_MASK) *
-                       ib_mtu_enum_to_int(qp->path_mtu);
-               len = be32_to_cpu(reth->length);
-               if (unlikely(offset + len > e->rdma_sge.sge.sge_length))
-                       goto unlock_done;
-               if (len != 0) {
-                       u32 rkey = be32_to_cpu(reth->rkey);
-                       u64 vaddr = be64_to_cpu(reth->vaddr);
-                       int ok;
-
-                       ok = ipath_rkey_ok(qp, &e->rdma_sge,
-                                          len, vaddr, rkey,
-                                          IB_ACCESS_REMOTE_READ);
-                       if (unlikely(!ok))
-                               goto unlock_done;
-               } else {
-                       e->rdma_sge.sg_list = NULL;
-                       e->rdma_sge.num_sge = 0;
-                       e->rdma_sge.sge.mr = NULL;
-                       e->rdma_sge.sge.vaddr = NULL;
-                       e->rdma_sge.sge.length = 0;
-                       e->rdma_sge.sge.sge_length = 0;
-               }
-               e->psn = psn;
-               qp->s_ack_state = OP(ACKNOWLEDGE);
-               qp->s_tail_ack_queue = prev;
-               break;
-       }
-
-       case OP(COMPARE_SWAP):
-       case OP(FETCH_ADD): {
-               /*
-                * If we didn't find the atomic request in the ack queue
-                * or the send tasklet is already backed up to send an
-                * earlier entry, we can ignore this request.
-                */
-               if (!e || e->opcode != (u8) opcode || old_req)
-                       goto unlock_done;
-               qp->s_ack_state = OP(ACKNOWLEDGE);
-               qp->s_tail_ack_queue = prev;
-               break;
-       }
-
-       default:
-               if (old_req)
-                       goto unlock_done;
-               /*
-                * Resend the most recent ACK if this request is
-                * after all the previous RDMA reads and atomics.
-                */
-               if (i == qp->r_head_ack_queue) {
-                       spin_unlock_irqrestore(&qp->s_lock, flags);
-                       qp->r_nak_state = 0;
-                       qp->r_ack_psn = qp->r_psn - 1;
-                       goto send_ack;
-               }
-               /*
-                * Try to send a simple ACK to work around a Mellanox bug
-                * which doesn't accept a RDMA read response or atomic
-                * response as an ACK for earlier SENDs or RDMA writes.
-                */
-               if (qp->r_head_ack_queue == qp->s_tail_ack_queue &&
-                   !(qp->s_flags & IPATH_S_ACK_PENDING) &&
-                   qp->s_ack_state == OP(ACKNOWLEDGE)) {
-                       spin_unlock_irqrestore(&qp->s_lock, flags);
-                       qp->r_nak_state = 0;
-                       qp->r_ack_psn = qp->s_ack_queue[i].psn - 1;
-                       goto send_ack;
-               }
-               /*
-                * Resend the RDMA read or atomic op which
-                * ACKs this duplicate request.
-                */
-               qp->s_ack_state = OP(ACKNOWLEDGE);
-               qp->s_tail_ack_queue = i;
-               break;
-       }
-       qp->r_nak_state = 0;
-       ipath_schedule_send(qp);
-
-unlock_done:
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-done:
-       return 1;
-
-send_ack:
-       return 0;
-}
-
-void ipath_rc_error(struct ipath_qp *qp, enum ib_wc_status err)
-{
-       unsigned long flags;
-       int lastwqe;
-
-       spin_lock_irqsave(&qp->s_lock, flags);
-       lastwqe = ipath_error_qp(qp, err);
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-
-       if (lastwqe) {
-               struct ib_event ev;
-
-               ev.device = qp->ibqp.device;
-               ev.element.qp = &qp->ibqp;
-               ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
-               qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
-       }
-}
-
-static inline void ipath_update_ack_queue(struct ipath_qp *qp, unsigned n)
-{
-       unsigned next;
-
-       next = n + 1;
-       if (next > IPATH_MAX_RDMA_ATOMIC)
-               next = 0;
-       if (n == qp->s_tail_ack_queue) {
-               qp->s_tail_ack_queue = next;
-               qp->s_ack_state = OP(ACKNOWLEDGE);
-       }
-}
-
-/**
- * ipath_rc_rcv - process an incoming RC packet
- * @dev: the device this packet came in on
- * @hdr: the header of this packet
- * @has_grh: true if the header has a GRH
- * @data: the packet data
- * @tlen: the packet length
- * @qp: the QP for this packet
- *
- * This is called from ipath_qp_rcv() to process an incoming RC packet
- * for the given QP.
- * Called at interrupt level.
- */
-void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
-                 int has_grh, void *data, u32 tlen, struct ipath_qp *qp)
-{
-       struct ipath_other_headers *ohdr;
-       u32 opcode;
-       u32 hdrsize;
-       u32 psn;
-       u32 pad;
-       struct ib_wc wc;
-       u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
-       int diff;
-       struct ib_reth *reth;
-       int header_in_data;
-       unsigned long flags;
-
-       /* Validate the SLID. See Ch. 9.6.1.5 */
-       if (unlikely(be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid))
-               goto done;
-
-       /* Check for GRH */
-       if (!has_grh) {
-               ohdr = &hdr->u.oth;
-               hdrsize = 8 + 12;       /* LRH + BTH */
-               psn = be32_to_cpu(ohdr->bth[2]);
-               header_in_data = 0;
-       } else {
-               ohdr = &hdr->u.l.oth;
-               hdrsize = 8 + 40 + 12;  /* LRH + GRH + BTH */
-               /*
-                * The header with GRH is 60 bytes and the core driver sets
-                * the eager header buffer size to 56 bytes, so the last 4
-                * bytes of the BTH header (PSN) are in the data buffer.
-                */
-               header_in_data = dev->dd->ipath_rcvhdrentsize == 16;
-               if (header_in_data) {
-                       psn = be32_to_cpu(((__be32 *) data)[0]);
-                       data += sizeof(__be32);
-               } else
-                       psn = be32_to_cpu(ohdr->bth[2]);
-       }
-
-       /*
-        * Process responses (ACKs) before anything else.  Note that the
-        * packet sequence number will be for something in the send work
-        * queue rather than the expected receive packet sequence number.
-        * In other words, this QP is the requester.
-        */
-       opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
-       if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
-           opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
-               ipath_rc_rcv_resp(dev, ohdr, data, tlen, qp, opcode, psn,
-                                 hdrsize, pmtu, header_in_data);
-               goto done;
-       }
-
-       /* Compute 24 bits worth of difference. */
-       diff = ipath_cmp24(psn, qp->r_psn);
-       if (unlikely(diff)) {
-               if (ipath_rc_rcv_error(dev, ohdr, data, qp, opcode,
-                                      psn, diff, header_in_data))
-                       goto done;
-               goto send_ack;
-       }
-
-       /* Check for opcode sequence errors. */
-       switch (qp->r_state) {
-       case OP(SEND_FIRST):
-       case OP(SEND_MIDDLE):
-               if (opcode == OP(SEND_MIDDLE) ||
-                   opcode == OP(SEND_LAST) ||
-                   opcode == OP(SEND_LAST_WITH_IMMEDIATE))
-                       break;
-               goto nack_inv;
-
-       case OP(RDMA_WRITE_FIRST):
-       case OP(RDMA_WRITE_MIDDLE):
-               if (opcode == OP(RDMA_WRITE_MIDDLE) ||
-                   opcode == OP(RDMA_WRITE_LAST) ||
-                   opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
-                       break;
-               goto nack_inv;
-
-       default:
-               if (opcode == OP(SEND_MIDDLE) ||
-                   opcode == OP(SEND_LAST) ||
-                   opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
-                   opcode == OP(RDMA_WRITE_MIDDLE) ||
-                   opcode == OP(RDMA_WRITE_LAST) ||
-                   opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
-                       goto nack_inv;
-               /*
-                * Note that it is up to the requester to not send a new
-                * RDMA read or atomic operation before receiving an ACK
-                * for the previous operation.
-                */
-               break;
-       }
-
-       memset(&wc, 0, sizeof wc);
-
-       /* OK, process the packet. */
-       switch (opcode) {
-       case OP(SEND_FIRST):
-               if (!ipath_get_rwqe(qp, 0))
-                       goto rnr_nak;
-               qp->r_rcv_len = 0;
-               /* FALLTHROUGH */
-       case OP(SEND_MIDDLE):
-       case OP(RDMA_WRITE_MIDDLE):
-       send_middle:
-               /* Check for invalid length PMTU or posted rwqe len. */
-               if (unlikely(tlen != (hdrsize + pmtu + 4)))
-                       goto nack_inv;
-               qp->r_rcv_len += pmtu;
-               if (unlikely(qp->r_rcv_len > qp->r_len))
-                       goto nack_inv;
-               ipath_copy_sge(&qp->r_sge, data, pmtu);
-               break;
-
-       case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
-               /* consume RWQE */
-               if (!ipath_get_rwqe(qp, 1))
-                       goto rnr_nak;
-               goto send_last_imm;
-
-       case OP(SEND_ONLY):
-       case OP(SEND_ONLY_WITH_IMMEDIATE):
-               if (!ipath_get_rwqe(qp, 0))
-                       goto rnr_nak;
-               qp->r_rcv_len = 0;
-               if (opcode == OP(SEND_ONLY))
-                       goto send_last;
-               /* FALLTHROUGH */
-       case OP(SEND_LAST_WITH_IMMEDIATE):
-       send_last_imm:
-               if (header_in_data) {
-                       wc.ex.imm_data = *(__be32 *) data;
-                       data += sizeof(__be32);
-               } else {
-                       /* Immediate data comes after BTH */
-                       wc.ex.imm_data = ohdr->u.imm_data;
-               }
-               hdrsize += 4;
-               wc.wc_flags = IB_WC_WITH_IMM;
-               /* FALLTHROUGH */
-       case OP(SEND_LAST):
-       case OP(RDMA_WRITE_LAST):
-       send_last:
-               /* Get the number of bytes the message was padded by. */
-               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
-               /* Check for invalid length. */
-               /* XXX LAST len should be >= 1 */
-               if (unlikely(tlen < (hdrsize + pad + 4)))
-                       goto nack_inv;
-               /* Don't count the CRC. */
-               tlen -= (hdrsize + pad + 4);
-               wc.byte_len = tlen + qp->r_rcv_len;
-               if (unlikely(wc.byte_len > qp->r_len))
-                       goto nack_inv;
-               ipath_copy_sge(&qp->r_sge, data, tlen);
-               qp->r_msn++;
-               if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags))
-                       break;
-               wc.wr_id = qp->r_wr_id;
-               wc.status = IB_WC_SUCCESS;
-               if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) ||
-                   opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
-                       wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
-               else
-                       wc.opcode = IB_WC_RECV;
-               wc.qp = &qp->ibqp;
-               wc.src_qp = qp->remote_qpn;
-               wc.slid = qp->remote_ah_attr.dlid;
-               wc.sl = qp->remote_ah_attr.sl;
-               /* Signal completion event if the solicited bit is set. */
-               ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
-                              (ohdr->bth[0] &
-                               cpu_to_be32(1 << 23)) != 0);
-               break;
-
-       case OP(RDMA_WRITE_FIRST):
-       case OP(RDMA_WRITE_ONLY):
-       case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
-               if (unlikely(!(qp->qp_access_flags &
-                              IB_ACCESS_REMOTE_WRITE)))
-                       goto nack_inv;
-               /* consume RWQE */
-               /* RETH comes after BTH */
-               if (!header_in_data)
-                       reth = &ohdr->u.rc.reth;
-               else {
-                       reth = (struct ib_reth *)data;
-                       data += sizeof(*reth);
-               }
-               hdrsize += sizeof(*reth);
-               qp->r_len = be32_to_cpu(reth->length);
-               qp->r_rcv_len = 0;
-               if (qp->r_len != 0) {
-                       u32 rkey = be32_to_cpu(reth->rkey);
-                       u64 vaddr = be64_to_cpu(reth->vaddr);
-                       int ok;
-
-                       /* Check rkey & NAK */
-                       ok = ipath_rkey_ok(qp, &qp->r_sge,
-                                          qp->r_len, vaddr, rkey,
-                                          IB_ACCESS_REMOTE_WRITE);
-                       if (unlikely(!ok))
-                               goto nack_acc;
-               } else {
-                       qp->r_sge.sg_list = NULL;
-                       qp->r_sge.sge.mr = NULL;
-                       qp->r_sge.sge.vaddr = NULL;
-                       qp->r_sge.sge.length = 0;
-                       qp->r_sge.sge.sge_length = 0;
-               }
-               if (opcode == OP(RDMA_WRITE_FIRST))
-                       goto send_middle;
-               else if (opcode == OP(RDMA_WRITE_ONLY))
-                       goto send_last;
-               if (!ipath_get_rwqe(qp, 1))
-                       goto rnr_nak;
-               goto send_last_imm;
-
-       case OP(RDMA_READ_REQUEST): {
-               struct ipath_ack_entry *e;
-               u32 len;
-               u8 next;
-
-               if (unlikely(!(qp->qp_access_flags &
-                              IB_ACCESS_REMOTE_READ)))
-                       goto nack_inv;
-               next = qp->r_head_ack_queue + 1;
-               if (next > IPATH_MAX_RDMA_ATOMIC)
-                       next = 0;
-               spin_lock_irqsave(&qp->s_lock, flags);
-               /* Double check we can process this while holding the s_lock. */
-               if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
-                       goto unlock;
-               if (unlikely(next == qp->s_tail_ack_queue)) {
-                       if (!qp->s_ack_queue[next].sent)
-                               goto nack_inv_unlck;
-                       ipath_update_ack_queue(qp, next);
-               }
-               e = &qp->s_ack_queue[qp->r_head_ack_queue];
-               /* RETH comes after BTH */
-               if (!header_in_data)
-                       reth = &ohdr->u.rc.reth;
-               else {
-                       reth = (struct ib_reth *)data;
-                       data += sizeof(*reth);
-               }
-               len = be32_to_cpu(reth->length);
-               if (len) {
-                       u32 rkey = be32_to_cpu(reth->rkey);
-                       u64 vaddr = be64_to_cpu(reth->vaddr);
-                       int ok;
-
-                       /* Check rkey & NAK */
-                       ok = ipath_rkey_ok(qp, &e->rdma_sge, len, vaddr,
-                                          rkey, IB_ACCESS_REMOTE_READ);
-                       if (unlikely(!ok))
-                               goto nack_acc_unlck;
-                       /*
-                        * Update the next expected PSN.  We add 1 later
-                        * below, so only add the remainder here.
-                        */
-                       if (len > pmtu)
-                               qp->r_psn += (len - 1) / pmtu;
-               } else {
-                       e->rdma_sge.sg_list = NULL;
-                       e->rdma_sge.num_sge = 0;
-                       e->rdma_sge.sge.mr = NULL;
-                       e->rdma_sge.sge.vaddr = NULL;
-                       e->rdma_sge.sge.length = 0;
-                       e->rdma_sge.sge.sge_length = 0;
-               }
-               e->opcode = opcode;
-               e->sent = 0;
-               e->psn = psn;
-               /*
-                * We need to increment the MSN here instead of when we
-                * finish sending the result since a duplicate request would
-                * increment it more than once.
-                */
-               qp->r_msn++;
-               qp->r_psn++;
-               qp->r_state = opcode;
-               qp->r_nak_state = 0;
-               qp->r_head_ack_queue = next;
-
-               /* Schedule the send tasklet. */
-               ipath_schedule_send(qp);
-
-               goto unlock;
-       }
-
-       case OP(COMPARE_SWAP):
-       case OP(FETCH_ADD): {
-               struct ib_atomic_eth *ateth;
-               struct ipath_ack_entry *e;
-               u64 vaddr;
-               atomic64_t *maddr;
-               u64 sdata;
-               u32 rkey;
-               u8 next;
-
-               if (unlikely(!(qp->qp_access_flags &
-                              IB_ACCESS_REMOTE_ATOMIC)))
-                       goto nack_inv;
-               next = qp->r_head_ack_queue + 1;
-               if (next > IPATH_MAX_RDMA_ATOMIC)
-                       next = 0;
-               spin_lock_irqsave(&qp->s_lock, flags);
-               /* Double check we can process this while holding the s_lock. */
-               if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
-                       goto unlock;
-               if (unlikely(next == qp->s_tail_ack_queue)) {
-                       if (!qp->s_ack_queue[next].sent)
-                               goto nack_inv_unlck;
-                       ipath_update_ack_queue(qp, next);
-               }
-               if (!header_in_data)
-                       ateth = &ohdr->u.atomic_eth;
-               else
-                       ateth = (struct ib_atomic_eth *)data;
-               vaddr = ((u64) be32_to_cpu(ateth->vaddr[0]) << 32) |
-                       be32_to_cpu(ateth->vaddr[1]);
-               if (unlikely(vaddr & (sizeof(u64) - 1)))
-                       goto nack_inv_unlck;
-               rkey = be32_to_cpu(ateth->rkey);
-               /* Check rkey & NAK */
-               if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge,
-                                           sizeof(u64), vaddr, rkey,
-                                           IB_ACCESS_REMOTE_ATOMIC)))
-                       goto nack_acc_unlck;
-               /* Perform atomic OP and save result. */
-               maddr = (atomic64_t *) qp->r_sge.sge.vaddr;
-               sdata = be64_to_cpu(ateth->swap_data);
-               e = &qp->s_ack_queue[qp->r_head_ack_queue];
-               e->atomic_data = (opcode == OP(FETCH_ADD)) ?
-                       (u64) atomic64_add_return(sdata, maddr) - sdata :
-                       (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,
-                                     be64_to_cpu(ateth->compare_data),
-                                     sdata);
-               e->opcode = opcode;
-               e->sent = 0;
-               e->psn = psn & IPATH_PSN_MASK;
-               qp->r_msn++;
-               qp->r_psn++;
-               qp->r_state = opcode;
-               qp->r_nak_state = 0;
-               qp->r_head_ack_queue = next;
-
-               /* Schedule the send tasklet. */
-               ipath_schedule_send(qp);
-
-               goto unlock;
-       }
-
-       default:
-               /* NAK unknown opcodes. */
-               goto nack_inv;
-       }
-       qp->r_psn++;
-       qp->r_state = opcode;
-       qp->r_ack_psn = psn;
-       qp->r_nak_state = 0;
-       /* Send an ACK if requested or required. */
-       if (psn & (1 << 31))
-               goto send_ack;
-       goto done;
-
-rnr_nak:
-       qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer;
-       qp->r_ack_psn = qp->r_psn;
-       goto send_ack;
-
-nack_inv_unlck:
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-nack_inv:
-       ipath_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
-       qp->r_nak_state = IB_NAK_INVALID_REQUEST;
-       qp->r_ack_psn = qp->r_psn;
-       goto send_ack;
-
-nack_acc_unlck:
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-nack_acc:
-       ipath_rc_error(qp, IB_WC_LOC_PROT_ERR);
-       qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
-       qp->r_ack_psn = qp->r_psn;
-send_ack:
-       send_rc_ack(qp);
-       goto done;
-
-unlock:
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-done:
-       return;
-}
diff --git a/drivers/infiniband/hw/ipath/ipath_registers.h b/drivers/infiniband/hw/ipath/ipath_registers.h
deleted file mode 100644 (file)
index 8f44d0c..0000000
+++ /dev/null
@@ -1,512 +0,0 @@
-/*
- * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
- * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef _IPATH_REGISTERS_H
-#define _IPATH_REGISTERS_H
-
-/*
- * This file should only be included by kernel source, and by the diags.  It
- * defines the registers, and their contents, for InfiniPath chips.
- */
-
-/*
- * These are the InfiniPath register and buffer bit definitions
- * that are visible to software and needed only by the kernel
- * and diag code.  A few that are visible to protocol and user
- * code are in ipath_common.h.  Some bits are specific
- * to a given chip implementation and have been moved to the
- * chip-specific source file.
- */
-
-/* kr_revision bits */
-#define INFINIPATH_R_CHIPREVMINOR_MASK 0xFF
-#define INFINIPATH_R_CHIPREVMINOR_SHIFT 0
-#define INFINIPATH_R_CHIPREVMAJOR_MASK 0xFF
-#define INFINIPATH_R_CHIPREVMAJOR_SHIFT 8
-#define INFINIPATH_R_ARCH_MASK 0xFF
-#define INFINIPATH_R_ARCH_SHIFT 16
-#define INFINIPATH_R_SOFTWARE_MASK 0xFF
-#define INFINIPATH_R_SOFTWARE_SHIFT 24
-#define INFINIPATH_R_BOARDID_MASK 0xFF
-#define INFINIPATH_R_BOARDID_SHIFT 32
-
-/* kr_control bits */
-#define INFINIPATH_C_FREEZEMODE 0x00000002
-#define INFINIPATH_C_LINKENABLE 0x00000004
-
-/* kr_sendctrl bits */
-#define INFINIPATH_S_DISARMPIOBUF_SHIFT 16
-#define INFINIPATH_S_UPDTHRESH_SHIFT 24
-#define INFINIPATH_S_UPDTHRESH_MASK 0x1f
-
-#define IPATH_S_ABORT          0
-#define IPATH_S_PIOINTBUFAVAIL 1
-#define IPATH_S_PIOBUFAVAILUPD 2
-#define IPATH_S_PIOENABLE      3
-#define IPATH_S_SDMAINTENABLE  9
-#define IPATH_S_SDMASINGLEDESCRIPTOR   10
-#define IPATH_S_SDMAENABLE     11
-#define IPATH_S_SDMAHALT       12
-#define IPATH_S_DISARM         31
-
-#define INFINIPATH_S_ABORT             (1U << IPATH_S_ABORT)
-#define INFINIPATH_S_PIOINTBUFAVAIL    (1U << IPATH_S_PIOINTBUFAVAIL)
-#define INFINIPATH_S_PIOBUFAVAILUPD    (1U << IPATH_S_PIOBUFAVAILUPD)
-#define INFINIPATH_S_PIOENABLE         (1U << IPATH_S_PIOENABLE)
-#define INFINIPATH_S_SDMAINTENABLE     (1U << IPATH_S_SDMAINTENABLE)
-#define INFINIPATH_S_SDMASINGLEDESCRIPTOR \
-                                       (1U << IPATH_S_SDMASINGLEDESCRIPTOR)
-#define INFINIPATH_S_SDMAENABLE                (1U << IPATH_S_SDMAENABLE)
-#define INFINIPATH_S_SDMAHALT          (1U << IPATH_S_SDMAHALT)
-#define INFINIPATH_S_DISARM            (1U << IPATH_S_DISARM)
-
-/* kr_rcvctrl bits that are the same on multiple chips */
-#define INFINIPATH_R_PORTENABLE_SHIFT 0
-#define INFINIPATH_R_QPMAP_ENABLE (1ULL << 38)
-
-/* kr_intstatus, kr_intclear, kr_intmask bits */
-#define INFINIPATH_I_SDMAINT           0x8000000000000000ULL
-#define INFINIPATH_I_SDMADISABLED      0x4000000000000000ULL
-#define INFINIPATH_I_ERROR             0x0000000080000000ULL
-#define INFINIPATH_I_SPIOSENT          0x0000000040000000ULL
-#define INFINIPATH_I_SPIOBUFAVAIL      0x0000000020000000ULL
-#define INFINIPATH_I_GPIO              0x0000000010000000ULL
-#define INFINIPATH_I_JINT              0x0000000004000000ULL
-
-/* kr_errorstatus, kr_errorclear, kr_errormask bits */
-#define INFINIPATH_E_RFORMATERR                        0x0000000000000001ULL
-#define INFINIPATH_E_RVCRC                     0x0000000000000002ULL
-#define INFINIPATH_E_RICRC                     0x0000000000000004ULL
-#define INFINIPATH_E_RMINPKTLEN                        0x0000000000000008ULL
-#define INFINIPATH_E_RMAXPKTLEN                        0x0000000000000010ULL
-#define INFINIPATH_E_RLONGPKTLEN               0x0000000000000020ULL
-#define INFINIPATH_E_RSHORTPKTLEN              0x0000000000000040ULL
-#define INFINIPATH_E_RUNEXPCHAR                        0x0000000000000080ULL
-#define INFINIPATH_E_RUNSUPVL                  0x0000000000000100ULL
-#define INFINIPATH_E_REBP                      0x0000000000000200ULL
-#define INFINIPATH_E_RIBFLOW                   0x0000000000000400ULL
-#define INFINIPATH_E_RBADVERSION               0x0000000000000800ULL
-#define INFINIPATH_E_RRCVEGRFULL               0x0000000000001000ULL
-#define INFINIPATH_E_RRCVHDRFULL               0x0000000000002000ULL
-#define INFINIPATH_E_RBADTID                   0x0000000000004000ULL
-#define INFINIPATH_E_RHDRLEN                   0x0000000000008000ULL
-#define INFINIPATH_E_RHDR                      0x0000000000010000ULL
-#define INFINIPATH_E_RIBLOSTLINK               0x0000000000020000ULL
-#define INFINIPATH_E_SENDSPECIALTRIGGER                0x0000000008000000ULL
-#define INFINIPATH_E_SDMADISABLED              0x0000000010000000ULL
-#define INFINIPATH_E_SMINPKTLEN                        0x0000000020000000ULL
-#define INFINIPATH_E_SMAXPKTLEN                        0x0000000040000000ULL
-#define INFINIPATH_E_SUNDERRUN                 0x0000000080000000ULL
-#define INFINIPATH_E_SPKTLEN                   0x0000000100000000ULL
-#define INFINIPATH_E_SDROPPEDSMPPKT            0x0000000200000000ULL
-#define INFINIPATH_E_SDROPPEDDATAPKT           0x0000000400000000ULL
-#define INFINIPATH_E_SPIOARMLAUNCH             0x0000000800000000ULL
-#define INFINIPATH_E_SUNEXPERRPKTNUM           0x0000001000000000ULL
-#define INFINIPATH_E_SUNSUPVL                  0x0000002000000000ULL
-#define INFINIPATH_E_SENDBUFMISUSE             0x0000004000000000ULL
-#define INFINIPATH_E_SDMAGENMISMATCH           0x0000008000000000ULL
-#define INFINIPATH_E_SDMAOUTOFBOUND            0x0000010000000000ULL
-#define INFINIPATH_E_SDMATAILOUTOFBOUND                0x0000020000000000ULL
-#define INFINIPATH_E_SDMABASE                  0x0000040000000000ULL
-#define INFINIPATH_E_SDMA1STDESC               0x0000080000000000ULL
-#define INFINIPATH_E_SDMARPYTAG                        0x0000100000000000ULL
-#define INFINIPATH_E_SDMADWEN                  0x0000200000000000ULL
-#define INFINIPATH_E_SDMAMISSINGDW             0x0000400000000000ULL
-#define INFINIPATH_E_SDMAUNEXPDATA             0x0000800000000000ULL
-#define INFINIPATH_E_IBSTATUSCHANGED           0x0001000000000000ULL
-#define INFINIPATH_E_INVALIDADDR               0x0002000000000000ULL
-#define INFINIPATH_E_RESET                     0x0004000000000000ULL
-#define INFINIPATH_E_HARDWARE                  0x0008000000000000ULL
-#define INFINIPATH_E_SDMADESCADDRMISALIGN      0x0010000000000000ULL
-#define INFINIPATH_E_INVALIDEEPCMD             0x0020000000000000ULL
-
-/*
- * this is used to print "common" packet errors only when the
- * __IPATH_ERRPKTDBG bit is set in ipath_debug.
- */
-#define INFINIPATH_E_PKTERRS ( INFINIPATH_E_SPKTLEN \
-               | INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_RVCRC \
-               | INFINIPATH_E_RICRC | INFINIPATH_E_RSHORTPKTLEN \
-               | INFINIPATH_E_REBP )
-
-/* Convenience for decoding Send DMA errors */
-#define INFINIPATH_E_SDMAERRS ( \
-       INFINIPATH_E_SDMAGENMISMATCH | INFINIPATH_E_SDMAOUTOFBOUND | \
-       INFINIPATH_E_SDMATAILOUTOFBOUND | INFINIPATH_E_SDMABASE | \
-       INFINIPATH_E_SDMA1STDESC | INFINIPATH_E_SDMARPYTAG | \
-       INFINIPATH_E_SDMADWEN | INFINIPATH_E_SDMAMISSINGDW | \
-       INFINIPATH_E_SDMAUNEXPDATA | \
-       INFINIPATH_E_SDMADESCADDRMISALIGN | \
-       INFINIPATH_E_SDMADISABLED | \
-       INFINIPATH_E_SENDBUFMISUSE)
-
-/* kr_hwerrclear, kr_hwerrmask, kr_hwerrstatus, bits */
-/* TXEMEMPARITYERR bit 0: PIObuf, 1: PIOpbc, 2: launchfifo
- * RXEMEMPARITYERR bit 0: rcvbuf, 1: lookupq, 2:  expTID, 3: eagerTID
- *             bit 4: flag buffer, 5: datainfo, 6: header info */
-#define INFINIPATH_HWE_TXEMEMPARITYERR_MASK 0xFULL
-#define INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT 40
-#define INFINIPATH_HWE_RXEMEMPARITYERR_MASK 0x7FULL
-#define INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT 44
-#define INFINIPATH_HWE_IBCBUSTOSPCPARITYERR 0x4000000000000000ULL
-#define INFINIPATH_HWE_IBCBUSFRSPCPARITYERR 0x8000000000000000ULL
-/* txe mem parity errors (shift by INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) */
-#define INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF  0x1ULL
-#define INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC  0x2ULL
-#define INFINIPATH_HWE_TXEMEMPARITYERR_PIOLAUNCHFIFO 0x4ULL
-/* rxe mem parity errors (shift by INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) */
-#define INFINIPATH_HWE_RXEMEMPARITYERR_RCVBUF   0x01ULL
-#define INFINIPATH_HWE_RXEMEMPARITYERR_LOOKUPQ  0x02ULL
-#define INFINIPATH_HWE_RXEMEMPARITYERR_EXPTID   0x04ULL
-#define INFINIPATH_HWE_RXEMEMPARITYERR_EAGERTID 0x08ULL
-#define INFINIPATH_HWE_RXEMEMPARITYERR_FLAGBUF  0x10ULL
-#define INFINIPATH_HWE_RXEMEMPARITYERR_DATAINFO 0x20ULL
-#define INFINIPATH_HWE_RXEMEMPARITYERR_HDRINFO  0x40ULL
-/* waldo specific -- find the rest in ipath_6110.c */
-#define INFINIPATH_HWE_RXDSYNCMEMPARITYERR  0x0000000400000000ULL
-/* 6120/7220 specific -- find the rest in ipath_6120.c and ipath_7220.c */
-#define INFINIPATH_HWE_MEMBISTFAILED   0x0040000000000000ULL
-
-/* kr_hwdiagctrl bits */
-#define INFINIPATH_DC_FORCETXEMEMPARITYERR_MASK 0xFULL
-#define INFINIPATH_DC_FORCETXEMEMPARITYERR_SHIFT 40
-#define INFINIPATH_DC_FORCERXEMEMPARITYERR_MASK 0x7FULL
-#define INFINIPATH_DC_FORCERXEMEMPARITYERR_SHIFT 44
-#define INFINIPATH_DC_FORCERXDSYNCMEMPARITYERR  0x0000000400000000ULL
-#define INFINIPATH_DC_COUNTERDISABLE            0x1000000000000000ULL
-#define INFINIPATH_DC_COUNTERWREN               0x2000000000000000ULL
-#define INFINIPATH_DC_FORCEIBCBUSTOSPCPARITYERR 0x4000000000000000ULL
-#define INFINIPATH_DC_FORCEIBCBUSFRSPCPARITYERR 0x8000000000000000ULL
-
-/* kr_ibcctrl bits */
-#define INFINIPATH_IBCC_FLOWCTRLPERIOD_MASK 0xFFULL
-#define INFINIPATH_IBCC_FLOWCTRLPERIOD_SHIFT 0
-#define INFINIPATH_IBCC_FLOWCTRLWATERMARK_MASK 0xFFULL
-#define INFINIPATH_IBCC_FLOWCTRLWATERMARK_SHIFT 8
-#define INFINIPATH_IBCC_LINKINITCMD_MASK 0x3ULL
-#define INFINIPATH_IBCC_LINKINITCMD_DISABLE 1
-/* cycle through TS1/TS2 till OK */
-#define INFINIPATH_IBCC_LINKINITCMD_POLL 2
-/* wait for TS1, then go on */
-#define INFINIPATH_IBCC_LINKINITCMD_SLEEP 3
-#define INFINIPATH_IBCC_LINKINITCMD_SHIFT 16
-#define INFINIPATH_IBCC_LINKCMD_MASK 0x3ULL
-#define INFINIPATH_IBCC_LINKCMD_DOWN 1         /* move to 0x11 */
-#define INFINIPATH_IBCC_LINKCMD_ARMED 2                /* move to 0x21 */
-#define INFINIPATH_IBCC_LINKCMD_ACTIVE 3       /* move to 0x31 */
-#define INFINIPATH_IBCC_LINKCMD_SHIFT 18
-#define INFINIPATH_IBCC_MAXPKTLEN_MASK 0x7FFULL
-#define INFINIPATH_IBCC_MAXPKTLEN_SHIFT 20
-#define INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK 0xFULL
-#define INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT 32
-#define INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK 0xFULL
-#define INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT 36
-#define INFINIPATH_IBCC_CREDITSCALE_MASK 0x7ULL
-#define INFINIPATH_IBCC_CREDITSCALE_SHIFT 40
-#define INFINIPATH_IBCC_LOOPBACK             0x8000000000000000ULL
-#define INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE 0x4000000000000000ULL
-
-/* kr_ibcstatus bits */
-#define INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT 0
-#define INFINIPATH_IBCS_LINKSTATE_MASK 0x7
-
-#define INFINIPATH_IBCS_TXREADY       0x40000000
-#define INFINIPATH_IBCS_TXCREDITOK    0x80000000
-/* link training states (shift by
-   INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) */
-#define INFINIPATH_IBCS_LT_STATE_DISABLED      0x00
-#define INFINIPATH_IBCS_LT_STATE_LINKUP                0x01
-#define INFINIPATH_IBCS_LT_STATE_POLLACTIVE    0x02
-#define INFINIPATH_IBCS_LT_STATE_POLLQUIET     0x03
-#define INFINIPATH_IBCS_LT_STATE_SLEEPDELAY    0x04
-#define INFINIPATH_IBCS_LT_STATE_SLEEPQUIET    0x05
-#define INFINIPATH_IBCS_LT_STATE_CFGDEBOUNCE   0x08
-#define INFINIPATH_IBCS_LT_STATE_CFGRCVFCFG    0x09
-#define INFINIPATH_IBCS_LT_STATE_CFGWAITRMT    0x0a
-#define INFINIPATH_IBCS_LT_STATE_CFGIDLE       0x0b
-#define INFINIPATH_IBCS_LT_STATE_RECOVERRETRAIN        0x0c
-#define INFINIPATH_IBCS_LT_STATE_RECOVERWAITRMT        0x0e
-#define INFINIPATH_IBCS_LT_STATE_RECOVERIDLE   0x0f
-/* link state machine states (shift by ibcs_ls_shift) */
-#define INFINIPATH_IBCS_L_STATE_DOWN           0x0
-#define INFINIPATH_IBCS_L_STATE_INIT           0x1
-#define INFINIPATH_IBCS_L_STATE_ARM            0x2
-#define INFINIPATH_IBCS_L_STATE_ACTIVE         0x3
-#define INFINIPATH_IBCS_L_STATE_ACT_DEFER      0x4
-
-
-/* kr_extstatus bits */
-#define INFINIPATH_EXTS_SERDESPLLLOCK 0x1
-#define INFINIPATH_EXTS_GPIOIN_MASK 0xFFFFULL
-#define INFINIPATH_EXTS_GPIOIN_SHIFT 48
-
-/* kr_extctrl bits */
-#define INFINIPATH_EXTC_GPIOINVERT_MASK 0xFFFFULL
-#define INFINIPATH_EXTC_GPIOINVERT_SHIFT 32
-#define INFINIPATH_EXTC_GPIOOE_MASK 0xFFFFULL
-#define INFINIPATH_EXTC_GPIOOE_SHIFT 48
-#define INFINIPATH_EXTC_SERDESENABLE         0x80000000ULL
-#define INFINIPATH_EXTC_SERDESCONNECT        0x40000000ULL
-#define INFINIPATH_EXTC_SERDESENTRUNKING     0x20000000ULL
-#define INFINIPATH_EXTC_SERDESDISRXFIFO      0x10000000ULL
-#define INFINIPATH_EXTC_SERDESENPLPBK1       0x08000000ULL
-#define INFINIPATH_EXTC_SERDESENPLPBK2       0x04000000ULL
-#define INFINIPATH_EXTC_SERDESENENCDEC       0x02000000ULL
-#define INFINIPATH_EXTC_LED1SECPORT_ON       0x00000020ULL
-#define INFINIPATH_EXTC_LED2SECPORT_ON       0x00000010ULL
-#define INFINIPATH_EXTC_LED1PRIPORT_ON       0x00000008ULL
-#define INFINIPATH_EXTC_LED2PRIPORT_ON       0x00000004ULL
-#define INFINIPATH_EXTC_LEDGBLOK_ON          0x00000002ULL
-#define INFINIPATH_EXTC_LEDGBLERR_OFF        0x00000001ULL
-
-/* kr_partitionkey bits */
-#define INFINIPATH_PKEY_SIZE 16
-#define INFINIPATH_PKEY_MASK 0xFFFF
-#define INFINIPATH_PKEY_DEFAULT_PKEY 0xFFFF
-
-/* kr_serdesconfig0 bits */
-#define INFINIPATH_SERDC0_RESET_MASK  0xfULL   /* overall reset bits */
-#define INFINIPATH_SERDC0_RESET_PLL   0x10000000ULL    /* pll reset */
-/* tx idle enables (per lane) */
-#define INFINIPATH_SERDC0_TXIDLE      0xF000ULL
-/* rx detect enables (per lane) */
-#define INFINIPATH_SERDC0_RXDETECT_EN 0xF0000ULL
-/* L1 Power down; use with RXDETECT, otherwise not used on IB side */
-#define INFINIPATH_SERDC0_L1PWR_DN      0xF0ULL
-
-/* common kr_xgxsconfig bits (or safe in all, even if not implemented) */
-#define INFINIPATH_XGXS_RX_POL_SHIFT 19
-#define INFINIPATH_XGXS_RX_POL_MASK 0xfULL
-
-
-/*
- * IPATH_PIO_MAXIBHDR is the max IB header size allowed for in our
- * PIO send buffers.  This is well beyond anything currently
- * defined in the InfiniBand spec.
- */
-#define IPATH_PIO_MAXIBHDR 128
-
-typedef u64 ipath_err_t;
-
-/* The following change with the type of device, so
- * need to be part of the ipath_devdata struct, or
- * we could have problems plugging in devices of
- * different types (e.g. one HT, one PCIE)
- * in one system, to be managed by one driver.
- * On the other hand, this file may also be included
- * by other code, so leave the declarations here
- * temporarily. Minor footprint issue if common-model
- * linker used, none if C89+ linker used.
- */
-
-/* mask of defined bits for various registers */
-extern u64 infinipath_i_bitsextant;
-extern ipath_err_t infinipath_e_bitsextant, infinipath_hwe_bitsextant;
-
-/* masks that are different in various chips, or only exist in some chips */
-extern u32 infinipath_i_rcvavail_mask, infinipath_i_rcvurg_mask;
-
-/*
- * These are the infinipath general register numbers (not offsets).
- * The kernel registers are used directly, those beyond the kernel
- * registers are calculated from one of the base registers.  The use of
- * an integer type doesn't allow type-checking as thorough as, say,
- * an enum but allows for better hiding of chip differences.
- */
-typedef const u16 ipath_kreg,  /* infinipath general registers */
- ipath_creg,                   /* infinipath counter registers */
- ipath_sreg;                   /* kernel-only, infinipath send registers */
-
-/*
- * These are the chip registers common to all infinipath chips, and
- * used both by the kernel and the diagnostics or other user code.
- * They are all implemented such that 64 bit accesses work.
- * Some implement no more than 32 bits.  Because 64 bit reads
- * require 2 HT cmds on opteron, we access those with 32 bit
- * reads for efficiency (they are written as 64 bits, since
- * the extra 32 bits are nearly free on writes, and it slightly reduces
- * complexity).  The rest are all accessed as 64 bits.
- */
-struct ipath_kregs {
-       /* These are the 32 bit group */
-       ipath_kreg kr_control;
-       ipath_kreg kr_counterregbase;
-       ipath_kreg kr_intmask;
-       ipath_kreg kr_intstatus;
-       ipath_kreg kr_pagealign;
-       ipath_kreg kr_portcnt;
-       ipath_kreg kr_rcvtidbase;
-       ipath_kreg kr_rcvtidcnt;
-       ipath_kreg kr_rcvegrbase;
-       ipath_kreg kr_rcvegrcnt;
-       ipath_kreg kr_scratch;
-       ipath_kreg kr_sendctrl;
-       ipath_kreg kr_sendpiobufbase;
-       ipath_kreg kr_sendpiobufcnt;
-       ipath_kreg kr_sendpiosize;
-       ipath_kreg kr_sendregbase;
-       ipath_kreg kr_userregbase;
-       /* These are the 64 bit group */
-       ipath_kreg kr_debugport;
-       ipath_kreg kr_debugportselect;
-       ipath_kreg kr_errorclear;
-       ipath_kreg kr_errormask;
-       ipath_kreg kr_errorstatus;
-       ipath_kreg kr_extctrl;
-       ipath_kreg kr_extstatus;
-       ipath_kreg kr_gpio_clear;
-       ipath_kreg kr_gpio_mask;
-       ipath_kreg kr_gpio_out;
-       ipath_kreg kr_gpio_status;
-       ipath_kreg kr_hwdiagctrl;
-       ipath_kreg kr_hwerrclear;
-       ipath_kreg kr_hwerrmask;
-       ipath_kreg kr_hwerrstatus;
-       ipath_kreg kr_ibcctrl;
-       ipath_kreg kr_ibcstatus;
-       ipath_kreg kr_intblocked;
-       ipath_kreg kr_intclear;
-       ipath_kreg kr_interruptconfig;
-       ipath_kreg kr_mdio;
-       ipath_kreg kr_partitionkey;
-       ipath_kreg kr_rcvbthqp;
-       ipath_kreg kr_rcvbufbase;
-       ipath_kreg kr_rcvbufsize;
-       ipath_kreg kr_rcvctrl;
-       ipath_kreg kr_rcvhdrcnt;
-       ipath_kreg kr_rcvhdrentsize;
-       ipath_kreg kr_rcvhdrsize;
-       ipath_kreg kr_rcvintmembase;
-       ipath_kreg kr_rcvintmemsize;
-       ipath_kreg kr_revision;
-       ipath_kreg kr_sendbuffererror;
-       ipath_kreg kr_sendpioavailaddr;
-       ipath_kreg kr_serdesconfig0;
-       ipath_kreg kr_serdesconfig1;
-       ipath_kreg kr_serdesstatus;
-       ipath_kreg kr_txintmembase;
-       ipath_kreg kr_txintmemsize;
-       ipath_kreg kr_xgxsconfig;
-       ipath_kreg kr_ibpllcfg;
-       /* use these two (and the following N ports) only with
-        * ipath_k*_kreg64_port(); not *kreg64() */
-       ipath_kreg kr_rcvhdraddr;
-       ipath_kreg kr_rcvhdrtailaddr;
-
-       /* remaining registers are not present on all types of infinipath
-          chips  */
-       ipath_kreg kr_rcvpktledcnt;
-       ipath_kreg kr_pcierbuftestreg0;
-       ipath_kreg kr_pcierbuftestreg1;
-       ipath_kreg kr_pcieq0serdesconfig0;
-       ipath_kreg kr_pcieq0serdesconfig1;
-       ipath_kreg kr_pcieq0serdesstatus;
-       ipath_kreg kr_pcieq1serdesconfig0;
-       ipath_kreg kr_pcieq1serdesconfig1;
-       ipath_kreg kr_pcieq1serdesstatus;
-       ipath_kreg kr_hrtbt_guid;
-       ipath_kreg kr_ibcddrctrl;
-       ipath_kreg kr_ibcddrstatus;
-       ipath_kreg kr_jintreload;
-
-       /* send dma related regs */
-       ipath_kreg kr_senddmabase;
-       ipath_kreg kr_senddmalengen;
-       ipath_kreg kr_senddmatail;
-       ipath_kreg kr_senddmahead;
-       ipath_kreg kr_senddmaheadaddr;
-       ipath_kreg kr_senddmabufmask0;
-       ipath_kreg kr_senddmabufmask1;
-       ipath_kreg kr_senddmabufmask2;
-       ipath_kreg kr_senddmastatus;
-
-       /* SerDes related regs (IBA7220-only) */
-       ipath_kreg kr_ibserdesctrl;
-       ipath_kreg kr_ib_epbacc;
-       ipath_kreg kr_ib_epbtrans;
-       ipath_kreg kr_pcie_epbacc;
-       ipath_kreg kr_pcie_epbtrans;
-       ipath_kreg kr_ib_ddsrxeq;
-};
-
-struct ipath_cregs {
-       ipath_creg cr_badformatcnt;
-       ipath_creg cr_erricrccnt;
-       ipath_creg cr_errlinkcnt;
-       ipath_creg cr_errlpcrccnt;
-       ipath_creg cr_errpkey;
-       ipath_creg cr_errrcvflowctrlcnt;
-       ipath_creg cr_err_rlencnt;
-       ipath_creg cr_errslencnt;
-       ipath_creg cr_errtidfull;
-       ipath_creg cr_errtidvalid;
-       ipath_creg cr_errvcrccnt;
-       ipath_creg cr_ibstatuschange;
-       ipath_creg cr_intcnt;
-       ipath_creg cr_invalidrlencnt;
-       ipath_creg cr_invalidslencnt;
-       ipath_creg cr_lbflowstallcnt;
-       ipath_creg cr_iblinkdowncnt;
-       ipath_creg cr_iblinkerrrecovcnt;
-       ipath_creg cr_ibsymbolerrcnt;
-       ipath_creg cr_pktrcvcnt;
-       ipath_creg cr_pktrcvflowctrlcnt;
-       ipath_creg cr_pktsendcnt;
-       ipath_creg cr_pktsendflowcnt;
-       ipath_creg cr_portovflcnt;
-       ipath_creg cr_rcvebpcnt;
-       ipath_creg cr_rcvovflcnt;
-       ipath_creg cr_rxdroppktcnt;
-       ipath_creg cr_senddropped;
-       ipath_creg cr_sendstallcnt;
-       ipath_creg cr_sendunderruncnt;
-       ipath_creg cr_unsupvlcnt;
-       ipath_creg cr_wordrcvcnt;
-       ipath_creg cr_wordsendcnt;
-       ipath_creg cr_vl15droppedpktcnt;
-       ipath_creg cr_rxotherlocalphyerrcnt;
-       ipath_creg cr_excessbufferovflcnt;
-       ipath_creg cr_locallinkintegrityerrcnt;
-       ipath_creg cr_rxvlerrcnt;
-       ipath_creg cr_rxdlidfltrcnt;
-       ipath_creg cr_psstat;
-       ipath_creg cr_psstart;
-       ipath_creg cr_psinterval;
-       ipath_creg cr_psrcvdatacount;
-       ipath_creg cr_psrcvpktscount;
-       ipath_creg cr_psxmitdatacount;
-       ipath_creg cr_psxmitpktscount;
-       ipath_creg cr_psxmitwaitcount;
-};
-
-#endif                         /* _IPATH_REGISTERS_H */
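The header removed above is mostly mask/shift pairs: a field is extracted by shifting the 64-bit register value right by its *_SHIFT constant and masking with the matching *_MASK. An illustrative decode of kr_revision using those constants (decode_revision() and struct ipath_rev are hypothetical, not part of the driver):

#include <linux/types.h>

struct ipath_rev {
	u8 minor;
	u8 major;
	u8 arch;
	u8 sw;
	u8 boardid;
};

/* Pull the individual fields out of a raw kr_revision value. */
static void decode_revision(u64 kr_revision, struct ipath_rev *r)
{
	r->minor   = (kr_revision >> INFINIPATH_R_CHIPREVMINOR_SHIFT) &
		     INFINIPATH_R_CHIPREVMINOR_MASK;
	r->major   = (kr_revision >> INFINIPATH_R_CHIPREVMAJOR_SHIFT) &
		     INFINIPATH_R_CHIPREVMAJOR_MASK;
	r->arch    = (kr_revision >> INFINIPATH_R_ARCH_SHIFT) &
		     INFINIPATH_R_ARCH_MASK;
	r->sw      = (kr_revision >> INFINIPATH_R_SOFTWARE_SHIFT) &
		     INFINIPATH_R_SOFTWARE_MASK;
	r->boardid = (kr_revision >> INFINIPATH_R_BOARDID_SHIFT) &
		     INFINIPATH_R_BOARDID_MASK;
}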
diff --git a/drivers/infiniband/hw/ipath/ipath_ruc.c b/drivers/infiniband/hw/ipath/ipath_ruc.c
deleted file mode 100644 (file)
index 1f95bba..0000000
+++ /dev/null
@@ -1,734 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/sched.h>
-#include <linux/spinlock.h>
-
-#include "ipath_verbs.h"
-#include "ipath_kernel.h"
-
-/*
- * Convert the AETH RNR timeout code into the number of milliseconds.
- */
-const u32 ib_ipath_rnr_table[32] = {
-       656,                    /* 0 */
-       1,                      /* 1 */
-       1,                      /* 2 */
-       1,                      /* 3 */
-       1,                      /* 4 */
-       1,                      /* 5 */
-       1,                      /* 6 */
-       1,                      /* 7 */
-       1,                      /* 8 */
-       1,                      /* 9 */
-       1,                      /* A */
-       1,                      /* B */
-       1,                      /* C */
-       1,                      /* D */
-       2,                      /* E */
-       2,                      /* F */
-       3,                      /* 10 */
-       4,                      /* 11 */
-       6,                      /* 12 */
-       8,                      /* 13 */
-       11,                     /* 14 */
-       16,                     /* 15 */
-       21,                     /* 16 */
-       31,                     /* 17 */
-       41,                     /* 18 */
-       62,                     /* 19 */
-       82,                     /* 1A */
-       123,                    /* 1B */
-       164,                    /* 1C */
-       246,                    /* 1D */
-       328,                    /* 1E */
-       492                     /* 1F */
-};
-
-/**
- * ipath_insert_rnr_queue - put QP on the RNR timeout list for the device
- * @qp: the QP
- *
- * Called with the QP s_lock held and interrupts disabled.
- * XXX Use a simple list for now.  We might need a priority
- * queue if we have lots of QPs waiting for RNR timeouts
- * but that should be rare.
- */
-void ipath_insert_rnr_queue(struct ipath_qp *qp)
-{
-       struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
-
-       /* We already did a spin_lock_irqsave(), so just use spin_lock */
-       spin_lock(&dev->pending_lock);
-       if (list_empty(&dev->rnrwait))
-               list_add(&qp->timerwait, &dev->rnrwait);
-       else {
-               struct list_head *l = &dev->rnrwait;
-               struct ipath_qp *nqp = list_entry(l->next, struct ipath_qp,
-                                                 timerwait);
-
-               while (qp->s_rnr_timeout >= nqp->s_rnr_timeout) {
-                       qp->s_rnr_timeout -= nqp->s_rnr_timeout;
-                       l = l->next;
-                       if (l->next == &dev->rnrwait) {
-                               nqp = NULL;
-                               break;
-                       }
-                       nqp = list_entry(l->next, struct ipath_qp,
-                                        timerwait);
-               }
-               if (nqp)
-                       nqp->s_rnr_timeout -= qp->s_rnr_timeout;
-               list_add(&qp->timerwait, l);
-       }
-       spin_unlock(&dev->pending_lock);
-}
-
-/**
- * ipath_init_sge - Validate a RWQE and fill in the SGE state
- * @qp: the QP
- *
- * Return 1 if OK.
- */
-int ipath_init_sge(struct ipath_qp *qp, struct ipath_rwqe *wqe,
-                  u32 *lengthp, struct ipath_sge_state *ss)
-{
-       int i, j, ret;
-       struct ib_wc wc;
-
-       *lengthp = 0;
-       for (i = j = 0; i < wqe->num_sge; i++) {
-               if (wqe->sg_list[i].length == 0)
-                       continue;
-               /* Check LKEY */
-               if (!ipath_lkey_ok(qp, j ? &ss->sg_list[j - 1] : &ss->sge,
-                                  &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE))
-                       goto bad_lkey;
-               *lengthp += wqe->sg_list[i].length;
-               j++;
-       }
-       ss->num_sge = j;
-       ret = 1;
-       goto bail;
-
-bad_lkey:
-       memset(&wc, 0, sizeof(wc));
-       wc.wr_id = wqe->wr_id;
-       wc.status = IB_WC_LOC_PROT_ERR;
-       wc.opcode = IB_WC_RECV;
-       wc.qp = &qp->ibqp;
-       /* Signal solicited completion event. */
-       ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
-       ret = 0;
-bail:
-       return ret;
-}
-
-/**
- * ipath_get_rwqe - copy the next RWQE into the QP's RWQE
- * @qp: the QP
- * @wr_id_only: update qp->r_wr_id only, not qp->r_sge
- *
- * Return 0 if no RWQE is available, otherwise return 1.
- *
- * Can be called from interrupt level.
- */
-int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only)
-{
-       unsigned long flags;
-       struct ipath_rq *rq;
-       struct ipath_rwq *wq;
-       struct ipath_srq *srq;
-       struct ipath_rwqe *wqe;
-       void (*handler)(struct ib_event *, void *);
-       u32 tail;
-       int ret;
-
-       if (qp->ibqp.srq) {
-               srq = to_isrq(qp->ibqp.srq);
-               handler = srq->ibsrq.event_handler;
-               rq = &srq->rq;
-       } else {
-               srq = NULL;
-               handler = NULL;
-               rq = &qp->r_rq;
-       }
-
-       spin_lock_irqsave(&rq->lock, flags);
-       if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) {
-               ret = 0;
-               goto unlock;
-       }
-
-       wq = rq->wq;
-       tail = wq->tail;
-       /* Validate tail before using it since it is user writable. */
-       if (tail >= rq->size)
-               tail = 0;
-       do {
-               if (unlikely(tail == wq->head)) {
-                       ret = 0;
-                       goto unlock;
-               }
-               /* Make sure entry is read after head index is read. */
-               smp_rmb();
-               wqe = get_rwqe_ptr(rq, tail);
-               if (++tail >= rq->size)
-                       tail = 0;
-               if (wr_id_only)
-                       break;
-               qp->r_sge.sg_list = qp->r_sg_list;
-       } while (!ipath_init_sge(qp, wqe, &qp->r_len, &qp->r_sge));
-       qp->r_wr_id = wqe->wr_id;
-       wq->tail = tail;
-
-       ret = 1;
-       set_bit(IPATH_R_WRID_VALID, &qp->r_aflags);
-       if (handler) {
-               u32 n;
-
-               /*
-                * validate head pointer value and compute
-                * the number of remaining WQEs.
-                */
-               n = wq->head;
-               if (n >= rq->size)
-                       n = 0;
-               if (n < tail)
-                       n += rq->size - tail;
-               else
-                       n -= tail;
-               if (n < srq->limit) {
-                       struct ib_event ev;
-
-                       srq->limit = 0;
-                       spin_unlock_irqrestore(&rq->lock, flags);
-                       ev.device = qp->ibqp.device;
-                       ev.element.srq = qp->ibqp.srq;
-                       ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
-                       handler(&ev, srq->ibsrq.srq_context);
-                       goto bail;
-               }
-       }
-unlock:
-       spin_unlock_irqrestore(&rq->lock, flags);
-bail:
-       return ret;
-}
-
-/**
- * ipath_ruc_loopback - handle UC and RC loopback requests
- * @sqp: the sending QP
- *
- * This is called from ipath_do_send() to
- * forward a WQE addressed to the same HCA.
- * Note that although we are single threaded due to the tasklet, we still
- * have to protect against post_send().  We don't have to worry about
- * receive interrupts since this is a connected protocol and all packets
- * will pass through here.
- */
-static void ipath_ruc_loopback(struct ipath_qp *sqp)
-{
-       struct ipath_ibdev *dev = to_idev(sqp->ibqp.device);
-       struct ipath_qp *qp;
-       struct ipath_swqe *wqe;
-       struct ipath_sge *sge;
-       unsigned long flags;
-       struct ib_wc wc;
-       u64 sdata;
-       atomic64_t *maddr;
-       enum ib_wc_status send_status;
-
-       /*
-        * Note that we check the responder QP state after
-        * checking the requester's state.
-        */
-       qp = ipath_lookup_qpn(&dev->qp_table, sqp->remote_qpn);
-
-       spin_lock_irqsave(&sqp->s_lock, flags);
-
-       /* Return if we are already busy processing a work request. */
-       if ((sqp->s_flags & (IPATH_S_BUSY | IPATH_S_ANY_WAIT)) ||
-           !(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_OR_FLUSH_SEND))
-               goto unlock;
-
-       sqp->s_flags |= IPATH_S_BUSY;
-
-again:
-       if (sqp->s_last == sqp->s_head)
-               goto clr_busy;
-       wqe = get_swqe_ptr(sqp, sqp->s_last);
-
-       /* Return if it is not OK to start a new work request. */
-       if (!(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_NEXT_SEND_OK)) {
-               if (!(ib_ipath_state_ops[sqp->state] & IPATH_FLUSH_SEND))
-                       goto clr_busy;
-               /* We are in the error state, flush the work request. */
-               send_status = IB_WC_WR_FLUSH_ERR;
-               goto flush_send;
-       }
-
-       /*
-        * We can rely on the entry not changing without the s_lock
-        * being held until we update s_last.
-        * We increment s_cur to indicate s_last is in progress.
-        */
-       if (sqp->s_last == sqp->s_cur) {
-               if (++sqp->s_cur >= sqp->s_size)
-                       sqp->s_cur = 0;
-       }
-       spin_unlock_irqrestore(&sqp->s_lock, flags);
-
-       if (!qp || !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) {
-               dev->n_pkt_drops++;
-               /*
-                * For RC, the requester would timeout and retry so
-                * shortcut the timeouts and just signal too many retries.
-                */
-               if (sqp->ibqp.qp_type == IB_QPT_RC)
-                       send_status = IB_WC_RETRY_EXC_ERR;
-               else
-                       send_status = IB_WC_SUCCESS;
-               goto serr;
-       }
-
-       memset(&wc, 0, sizeof wc);
-       send_status = IB_WC_SUCCESS;
-
-       sqp->s_sge.sge = wqe->sg_list[0];
-       sqp->s_sge.sg_list = wqe->sg_list + 1;
-       sqp->s_sge.num_sge = wqe->wr.num_sge;
-       sqp->s_len = wqe->length;
-       switch (wqe->wr.opcode) {
-       case IB_WR_SEND_WITH_IMM:
-               wc.wc_flags = IB_WC_WITH_IMM;
-               wc.ex.imm_data = wqe->wr.ex.imm_data;
-               /* FALLTHROUGH */
-       case IB_WR_SEND:
-               if (!ipath_get_rwqe(qp, 0))
-                       goto rnr_nak;
-               break;
-
-       case IB_WR_RDMA_WRITE_WITH_IMM:
-               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
-                       goto inv_err;
-               wc.wc_flags = IB_WC_WITH_IMM;
-               wc.ex.imm_data = wqe->wr.ex.imm_data;
-               if (!ipath_get_rwqe(qp, 1))
-                       goto rnr_nak;
-               /* FALLTHROUGH */
-       case IB_WR_RDMA_WRITE:
-               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
-                       goto inv_err;
-               if (wqe->length == 0)
-                       break;
-               if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge, wqe->length,
-                                           wqe->wr.wr.rdma.remote_addr,
-                                           wqe->wr.wr.rdma.rkey,
-                                           IB_ACCESS_REMOTE_WRITE)))
-                       goto acc_err;
-               break;
-
-       case IB_WR_RDMA_READ:
-               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
-                       goto inv_err;
-               if (unlikely(!ipath_rkey_ok(qp, &sqp->s_sge, wqe->length,
-                                           wqe->wr.wr.rdma.remote_addr,
-                                           wqe->wr.wr.rdma.rkey,
-                                           IB_ACCESS_REMOTE_READ)))
-                       goto acc_err;
-               qp->r_sge.sge = wqe->sg_list[0];
-               qp->r_sge.sg_list = wqe->sg_list + 1;
-               qp->r_sge.num_sge = wqe->wr.num_sge;
-               break;
-
-       case IB_WR_ATOMIC_CMP_AND_SWP:
-       case IB_WR_ATOMIC_FETCH_AND_ADD:
-               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
-                       goto inv_err;
-               if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge, sizeof(u64),
-                                           wqe->wr.wr.atomic.remote_addr,
-                                           wqe->wr.wr.atomic.rkey,
-                                           IB_ACCESS_REMOTE_ATOMIC)))
-                       goto acc_err;
-               /* Perform atomic OP and save result. */
-               maddr = (atomic64_t *) qp->r_sge.sge.vaddr;
-               sdata = wqe->wr.wr.atomic.compare_add;
-               *(u64 *) sqp->s_sge.sge.vaddr =
-                       (wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ?
-                       (u64) atomic64_add_return(sdata, maddr) - sdata :
-                       (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,
-                                     sdata, wqe->wr.wr.atomic.swap);
-               goto send_comp;
-
-       default:
-               send_status = IB_WC_LOC_QP_OP_ERR;
-               goto serr;
-       }
-
-       sge = &sqp->s_sge.sge;
-       while (sqp->s_len) {
-               u32 len = sqp->s_len;
-
-               if (len > sge->length)
-                       len = sge->length;
-               if (len > sge->sge_length)
-                       len = sge->sge_length;
-               BUG_ON(len == 0);
-               ipath_copy_sge(&qp->r_sge, sge->vaddr, len);
-               sge->vaddr += len;
-               sge->length -= len;
-               sge->sge_length -= len;
-               if (sge->sge_length == 0) {
-                       if (--sqp->s_sge.num_sge)
-                               *sge = *sqp->s_sge.sg_list++;
-               } else if (sge->length == 0 && sge->mr != NULL) {
-                       if (++sge->n >= IPATH_SEGSZ) {
-                               if (++sge->m >= sge->mr->mapsz)
-                                       break;
-                               sge->n = 0;
-                       }
-                       sge->vaddr =
-                               sge->mr->map[sge->m]->segs[sge->n].vaddr;
-                       sge->length =
-                               sge->mr->map[sge->m]->segs[sge->n].length;
-               }
-               sqp->s_len -= len;
-       }
-
-       if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags))
-               goto send_comp;
-
-       if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM)
-               wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
-       else
-               wc.opcode = IB_WC_RECV;
-       wc.wr_id = qp->r_wr_id;
-       wc.status = IB_WC_SUCCESS;
-       wc.byte_len = wqe->length;
-       wc.qp = &qp->ibqp;
-       wc.src_qp = qp->remote_qpn;
-       wc.slid = qp->remote_ah_attr.dlid;
-       wc.sl = qp->remote_ah_attr.sl;
-       wc.port_num = 1;
-       /* Signal completion event if the solicited bit is set. */
-       ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
-                      wqe->wr.send_flags & IB_SEND_SOLICITED);
-
-send_comp:
-       spin_lock_irqsave(&sqp->s_lock, flags);
-flush_send:
-       sqp->s_rnr_retry = sqp->s_rnr_retry_cnt;
-       ipath_send_complete(sqp, wqe, send_status);
-       goto again;
-
-rnr_nak:
-       /* Handle RNR NAK */
-       if (qp->ibqp.qp_type == IB_QPT_UC)
-               goto send_comp;
-       /*
-        * Note: we don't need the s_lock held since the BUSY flag
-        * makes this single threaded.
-        */
-       if (sqp->s_rnr_retry == 0) {
-               send_status = IB_WC_RNR_RETRY_EXC_ERR;
-               goto serr;
-       }
-       if (sqp->s_rnr_retry_cnt < 7)
-               sqp->s_rnr_retry--;
-       spin_lock_irqsave(&sqp->s_lock, flags);
-       if (!(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_RECV_OK))
-               goto clr_busy;
-       sqp->s_flags |= IPATH_S_WAITING;
-       dev->n_rnr_naks++;
-       sqp->s_rnr_timeout = ib_ipath_rnr_table[qp->r_min_rnr_timer];
-       ipath_insert_rnr_queue(sqp);
-       goto clr_busy;
-
-inv_err:
-       send_status = IB_WC_REM_INV_REQ_ERR;
-       wc.status = IB_WC_LOC_QP_OP_ERR;
-       goto err;
-
-acc_err:
-       send_status = IB_WC_REM_ACCESS_ERR;
-       wc.status = IB_WC_LOC_PROT_ERR;
-err:
-       /* responder goes to error state */
-       ipath_rc_error(qp, wc.status);
-
-serr:
-       spin_lock_irqsave(&sqp->s_lock, flags);
-       ipath_send_complete(sqp, wqe, send_status);
-       if (sqp->ibqp.qp_type == IB_QPT_RC) {
-               int lastwqe = ipath_error_qp(sqp, IB_WC_WR_FLUSH_ERR);
-
-               sqp->s_flags &= ~IPATH_S_BUSY;
-               spin_unlock_irqrestore(&sqp->s_lock, flags);
-               if (lastwqe) {
-                       struct ib_event ev;
-
-                       ev.device = sqp->ibqp.device;
-                       ev.element.qp = &sqp->ibqp;
-                       ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
-                       sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context);
-               }
-               goto done;
-       }
-clr_busy:
-       sqp->s_flags &= ~IPATH_S_BUSY;
-unlock:
-       spin_unlock_irqrestore(&sqp->s_lock, flags);
-done:
-       if (qp && atomic_dec_and_test(&qp->refcount))
-               wake_up(&qp->wait);
-}
-
-static void want_buffer(struct ipath_devdata *dd, struct ipath_qp *qp)
-{
-       if (!(dd->ipath_flags & IPATH_HAS_SEND_DMA) ||
-           qp->ibqp.qp_type == IB_QPT_SMI) {
-               unsigned long flags;
-
-               spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-               dd->ipath_sendctrl |= INFINIPATH_S_PIOINTBUFAVAIL;
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
-                                dd->ipath_sendctrl);
-               ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-               spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-       }
-}
-
-/**
- * ipath_no_bufs_available - tell the layer driver we need buffers
- * @qp: the QP that caused the problem
- * @dev: the device we ran out of buffers on
- *
- * Called when we run out of PIO buffers.
- * If we are now in the error state, return zero to flush the
- * send work request.
- */
-static int ipath_no_bufs_available(struct ipath_qp *qp,
-                                   struct ipath_ibdev *dev)
-{
-       unsigned long flags;
-       int ret = 1;
-
-       /*
-        * Note that as soon as want_buffer() is called and
-        * possibly before it returns, ipath_ib_piobufavail()
-        * could be called. Therefore, put QP on the piowait list before
-        * enabling the PIO avail interrupt.
-        */
-       spin_lock_irqsave(&qp->s_lock, flags);
-       if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) {
-               dev->n_piowait++;
-               qp->s_flags |= IPATH_S_WAITING;
-               qp->s_flags &= ~IPATH_S_BUSY;
-               spin_lock(&dev->pending_lock);
-               if (list_empty(&qp->piowait))
-                       list_add_tail(&qp->piowait, &dev->piowait);
-               spin_unlock(&dev->pending_lock);
-       } else
-               ret = 0;
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-       if (ret)
-               want_buffer(dev->dd, qp);
-       return ret;
-}
-
-/**
- * ipath_make_grh - construct a GRH header
- * @dev: a pointer to the ipath device
- * @hdr: a pointer to the GRH header being constructed
- * @grh: the global route address to send to
- * @hwords: the number of 32 bit words of header being sent
- * @nwords: the number of 32 bit words of data being sent
- *
- * Return the size of the header in 32 bit words.
- */
-u32 ipath_make_grh(struct ipath_ibdev *dev, struct ib_grh *hdr,
-                  struct ib_global_route *grh, u32 hwords, u32 nwords)
-{
-       hdr->version_tclass_flow =
-               cpu_to_be32((6 << 28) |
-                           (grh->traffic_class << 20) |
-                           grh->flow_label);
-       hdr->paylen = cpu_to_be16((hwords - 2 + nwords + SIZE_OF_CRC) << 2);
-       /* next_hdr is defined by C8-7 in ch. 8.4.1 */
-       hdr->next_hdr = 0x1B;
-       hdr->hop_limit = grh->hop_limit;
-       /* The SGID is 32-bit aligned. */
-       hdr->sgid.global.subnet_prefix = dev->gid_prefix;
-       hdr->sgid.global.interface_id = dev->dd->ipath_guid;
-       hdr->dgid = grh->dgid;
-
-       /* GRH header size in 32-bit words. */
-       return sizeof(struct ib_grh) / sizeof(u32);
-}
-
-void ipath_make_ruc_header(struct ipath_ibdev *dev, struct ipath_qp *qp,
-                          struct ipath_other_headers *ohdr,
-                          u32 bth0, u32 bth2)
-{
-       u16 lrh0;
-       u32 nwords;
-       u32 extra_bytes;
-
-       /* Construct the header. */
-       extra_bytes = -qp->s_cur_size & 3;
-       nwords = (qp->s_cur_size + extra_bytes) >> 2;
-       lrh0 = IPATH_LRH_BTH;
-       if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
-               qp->s_hdrwords += ipath_make_grh(dev, &qp->s_hdr.u.l.grh,
-                                                &qp->remote_ah_attr.grh,
-                                                qp->s_hdrwords, nwords);
-               lrh0 = IPATH_LRH_GRH;
-       }
-       lrh0 |= qp->remote_ah_attr.sl << 4;
-       qp->s_hdr.lrh[0] = cpu_to_be16(lrh0);
-       qp->s_hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
-       qp->s_hdr.lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC);
-       qp->s_hdr.lrh[3] = cpu_to_be16(dev->dd->ipath_lid |
-                                      qp->remote_ah_attr.src_path_bits);
-       bth0 |= ipath_get_pkey(dev->dd, qp->s_pkey_index);
-       bth0 |= extra_bytes << 20;
-       ohdr->bth[0] = cpu_to_be32(bth0 | (1 << 22));
-       ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
-       ohdr->bth[2] = cpu_to_be32(bth2);
-}
-
-/**
- * ipath_do_send - perform a send on a QP
- * @data: contains a pointer to the QP
- *
- * Process entries in the send work queue until credit or queue is
- * exhausted.  Only allow one CPU to send a packet per QP (tasklet).
- * Otherwise, two threads could send packets out of order.
- */
-void ipath_do_send(unsigned long data)
-{
-       struct ipath_qp *qp = (struct ipath_qp *)data;
-       struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
-       int (*make_req)(struct ipath_qp *qp);
-       unsigned long flags;
-
-       if ((qp->ibqp.qp_type == IB_QPT_RC ||
-            qp->ibqp.qp_type == IB_QPT_UC) &&
-           qp->remote_ah_attr.dlid == dev->dd->ipath_lid) {
-               ipath_ruc_loopback(qp);
-               goto bail;
-       }
-
-       if (qp->ibqp.qp_type == IB_QPT_RC)
-              make_req = ipath_make_rc_req;
-       else if (qp->ibqp.qp_type == IB_QPT_UC)
-              make_req = ipath_make_uc_req;
-       else
-              make_req = ipath_make_ud_req;
-
-       spin_lock_irqsave(&qp->s_lock, flags);
-
-       /* Return if we are already busy processing a work request. */
-       if ((qp->s_flags & (IPATH_S_BUSY | IPATH_S_ANY_WAIT)) ||
-           !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_OR_FLUSH_SEND)) {
-               spin_unlock_irqrestore(&qp->s_lock, flags);
-               goto bail;
-       }
-
-       qp->s_flags |= IPATH_S_BUSY;
-
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-
-again:
-       /* Check for a constructed packet to be sent. */
-       if (qp->s_hdrwords != 0) {
-               /*
-                * If no PIO bufs are available, return.  An interrupt will
-                * call ipath_ib_piobufavail() when one is available.
-                */
-               if (ipath_verbs_send(qp, &qp->s_hdr, qp->s_hdrwords,
-                                    qp->s_cur_sge, qp->s_cur_size)) {
-                       if (ipath_no_bufs_available(qp, dev))
-                               goto bail;
-               }
-               dev->n_unicast_xmit++;
-               /* Record that we sent the packet and s_hdr is empty. */
-               qp->s_hdrwords = 0;
-       }
-
-       if (make_req(qp))
-               goto again;
-
-bail:;
-}
-
-/*
- * This should be called with s_lock held.
- */
-void ipath_send_complete(struct ipath_qp *qp, struct ipath_swqe *wqe,
-                        enum ib_wc_status status)
-{
-       u32 old_last, last;
-
-       if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_OR_FLUSH_SEND))
-               return;
-
-       /* See ch. 11.2.4.1 and 10.7.3.1 */
-       if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) ||
-           (wqe->wr.send_flags & IB_SEND_SIGNALED) ||
-           status != IB_WC_SUCCESS) {
-               struct ib_wc wc;
-
-               memset(&wc, 0, sizeof wc);
-               wc.wr_id = wqe->wr.wr_id;
-               wc.status = status;
-               wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
-               wc.qp = &qp->ibqp;
-               if (status == IB_WC_SUCCESS)
-                       wc.byte_len = wqe->length;
-               ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc,
-                              status != IB_WC_SUCCESS);
-       }
-
-       old_last = last = qp->s_last;
-       if (++last >= qp->s_size)
-               last = 0;
-       qp->s_last = last;
-       if (qp->s_cur == old_last)
-               qp->s_cur = last;
-       if (qp->s_tail == old_last)
-               qp->s_tail = last;
-       if (qp->state == IB_QPS_SQD && last == qp->s_cur)
-               qp->s_draining = 0;
-}
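ipath_ruc_loopback() above handles an RNR NAK by converting the peer's 5-bit AETH RNR timer code into milliseconds via ib_ipath_rnr_table[qp->r_min_rnr_timer] and then queueing the QP with ipath_insert_rnr_queue(). A tiny illustrative wrapper for that lookup (rnr_code_to_msecs() is a hypothetical name; it assumes the table declared at the top of this file is in scope):

/* Map a 5-bit AETH RNR NAK timer code onto a delay in milliseconds. */
static u32 rnr_code_to_msecs(u8 aeth_rnr_code)
{
	return ib_ipath_rnr_table[aeth_rnr_code & 0x1f]; /* 32-entry table */
}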
diff --git a/drivers/infiniband/hw/ipath/ipath_sdma.c b/drivers/infiniband/hw/ipath/ipath_sdma.c
deleted file mode 100644 (file)
index 17a5177..0000000
+++ /dev/null
@@ -1,818 +0,0 @@
-/*
- * Copyright (c) 2007, 2008 QLogic Corporation. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/spinlock.h>
-#include <linux/gfp.h>
-
-#include "ipath_kernel.h"
-#include "ipath_verbs.h"
-#include "ipath_common.h"
-
-#define SDMA_DESCQ_SZ PAGE_SIZE /* 256 entries per 4KB page */
-
-static void vl15_watchdog_enq(struct ipath_devdata *dd)
-{
-       /* ipath_sdma_lock must already be held */
-       if (atomic_inc_return(&dd->ipath_sdma_vl15_count) == 1) {
-               unsigned long interval = (HZ + 19) / 20;
-               dd->ipath_sdma_vl15_timer.expires = jiffies + interval;
-               add_timer(&dd->ipath_sdma_vl15_timer);
-       }
-}
-
-static void vl15_watchdog_deq(struct ipath_devdata *dd)
-{
-       /* ipath_sdma_lock must already be held */
-       if (atomic_dec_return(&dd->ipath_sdma_vl15_count) != 0) {
-               unsigned long interval = (HZ + 19) / 20;
-               mod_timer(&dd->ipath_sdma_vl15_timer, jiffies + interval);
-       } else {
-               del_timer(&dd->ipath_sdma_vl15_timer);
-       }
-}
-
-static void vl15_watchdog_timeout(unsigned long opaque)
-{
-       struct ipath_devdata *dd = (struct ipath_devdata *)opaque;
-
-       if (atomic_read(&dd->ipath_sdma_vl15_count) != 0) {
-               ipath_dbg("vl15 watchdog timeout - clearing\n");
-               ipath_cancel_sends(dd, 1);
-               ipath_hol_down(dd);
-       } else {
-               ipath_dbg("vl15 watchdog timeout - "
-                         "condition already cleared\n");
-       }
-}
-
-static void unmap_desc(struct ipath_devdata *dd, unsigned head)
-{
-       __le64 *descqp = &dd->ipath_sdma_descq[head].qw[0];
-       u64 desc[2];
-       dma_addr_t addr;
-       size_t len;
-
-       desc[0] = le64_to_cpu(descqp[0]);
-       desc[1] = le64_to_cpu(descqp[1]);
-
-       addr = (desc[1] << 32) | (desc[0] >> 32);
-       len = (desc[0] >> 14) & (0x7ffULL << 2);
-       dma_unmap_single(&dd->pcidev->dev, addr, len, DMA_TO_DEVICE);
-}
-
-/*
- * ipath_sdma_lock should be locked before calling this.
- */
-int ipath_sdma_make_progress(struct ipath_devdata *dd)
-{
-       struct list_head *lp = NULL;
-       struct ipath_sdma_txreq *txp = NULL;
-       u16 dmahead;
-       u16 start_idx = 0;
-       int progress = 0;
-
-       if (!list_empty(&dd->ipath_sdma_activelist)) {
-               lp = dd->ipath_sdma_activelist.next;
-               txp = list_entry(lp, struct ipath_sdma_txreq, list);
-               start_idx = txp->start_idx;
-       }
-
-       /*
-        * Read the SDMA head register in order to know that the
-        * interrupt clear has been written to the chip.
-        * Otherwise, we may not get an interrupt for the last
-        * descriptor in the queue.
-        */
-       dmahead = (u16)ipath_read_kreg32(dd, dd->ipath_kregs->kr_senddmahead);
-       /* sanity check return value for error handling (chip reset, etc.) */
-       if (dmahead >= dd->ipath_sdma_descq_cnt)
-               goto done;
-
-       while (dd->ipath_sdma_descq_head != dmahead) {
-               if (txp && txp->flags & IPATH_SDMA_TXREQ_F_FREEDESC &&
-                   dd->ipath_sdma_descq_head == start_idx) {
-                       unmap_desc(dd, dd->ipath_sdma_descq_head);
-                       start_idx++;
-                       if (start_idx == dd->ipath_sdma_descq_cnt)
-                               start_idx = 0;
-               }
-
-               /* increment free count and head */
-               dd->ipath_sdma_descq_removed++;
-               if (++dd->ipath_sdma_descq_head == dd->ipath_sdma_descq_cnt)
-                       dd->ipath_sdma_descq_head = 0;
-
-               if (txp && txp->next_descq_idx == dd->ipath_sdma_descq_head) {
-                       /* move to notify list */
-                       if (txp->flags & IPATH_SDMA_TXREQ_F_VL15)
-                               vl15_watchdog_deq(dd);
-                       list_move_tail(lp, &dd->ipath_sdma_notifylist);
-                       if (!list_empty(&dd->ipath_sdma_activelist)) {
-                               lp = dd->ipath_sdma_activelist.next;
-                               txp = list_entry(lp, struct ipath_sdma_txreq,
-                                                list);
-                               start_idx = txp->start_idx;
-                       } else {
-                               lp = NULL;
-                               txp = NULL;
-                       }
-               }
-               progress = 1;
-       }
-
-       if (progress)
-               tasklet_hi_schedule(&dd->ipath_sdma_notify_task);
-
-done:
-       return progress;
-}
-
-static void ipath_sdma_notify(struct ipath_devdata *dd, struct list_head *list)
-{
-       struct ipath_sdma_txreq *txp, *txp_next;
-
-       list_for_each_entry_safe(txp, txp_next, list, list) {
-               list_del_init(&txp->list);
-
-               if (txp->callback)
-                       (*txp->callback)(txp->callback_cookie,
-                                        txp->callback_status);
-       }
-}
-
-static void sdma_notify_taskbody(struct ipath_devdata *dd)
-{
-       unsigned long flags;
-       struct list_head list;
-
-       INIT_LIST_HEAD(&list);
-
-       spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
-
-       list_splice_init(&dd->ipath_sdma_notifylist, &list);
-
-       spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-
-       ipath_sdma_notify(dd, &list);
-
-       /*
-        * The IB verbs layer needs to see the callback before getting
-        * the call to ipath_ib_piobufavail() because the callback
-        * handles releasing resources the next send will need.
-        * Otherwise, we could do these calls in
-        * ipath_sdma_make_progress().
-        */
-       ipath_ib_piobufavail(dd->verbs_dev);
-}
-
-static void sdma_notify_task(unsigned long opaque)
-{
-       struct ipath_devdata *dd = (struct ipath_devdata *)opaque;
-
-       if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
-               sdma_notify_taskbody(dd);
-}
-
-static void dump_sdma_state(struct ipath_devdata *dd)
-{
-       unsigned long reg;
-
-       reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmastatus);
-       ipath_cdbg(VERBOSE, "kr_senddmastatus: 0x%016lx\n", reg);
-
-       reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendctrl);
-       ipath_cdbg(VERBOSE, "kr_sendctrl: 0x%016lx\n", reg);
-
-       reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmabufmask0);
-       ipath_cdbg(VERBOSE, "kr_senddmabufmask0: 0x%016lx\n", reg);
-
-       reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmabufmask1);
-       ipath_cdbg(VERBOSE, "kr_senddmabufmask1: 0x%016lx\n", reg);
-
-       reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmabufmask2);
-       ipath_cdbg(VERBOSE, "kr_senddmabufmask2: 0x%016lx\n", reg);
-
-       reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmatail);
-       ipath_cdbg(VERBOSE, "kr_senddmatail: 0x%016lx\n", reg);
-
-       reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmahead);
-       ipath_cdbg(VERBOSE, "kr_senddmahead: 0x%016lx\n", reg);
-}
-
-static void sdma_abort_task(unsigned long opaque)
-{
-       struct ipath_devdata *dd = (struct ipath_devdata *) opaque;
-       u64 status;
-       unsigned long flags;
-
-       if (test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
-               return;
-
-       spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
-
-       status = dd->ipath_sdma_status & IPATH_SDMA_ABORT_MASK;
-
-       /* nothing to do */
-       if (status == IPATH_SDMA_ABORT_NONE)
-               goto unlock;
-
-       /* ipath_sdma_abort() is done, waiting for interrupt */
-       if (status == IPATH_SDMA_ABORT_DISARMED) {
-               if (time_before(jiffies, dd->ipath_sdma_abort_intr_timeout))
-                       goto resched_noprint;
-               /* give up, intr got lost somewhere */
-               ipath_dbg("give up waiting for SDMADISABLED intr\n");
-               __set_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status);
-               status = IPATH_SDMA_ABORT_ABORTED;
-       }
-
-       /* everything is stopped, time to clean up and restart */
-       if (status == IPATH_SDMA_ABORT_ABORTED) {
-               struct ipath_sdma_txreq *txp, *txpnext;
-               u64 hwstatus;
-               int notify = 0;
-
-               hwstatus = ipath_read_kreg64(dd,
-                               dd->ipath_kregs->kr_senddmastatus);
-
-               if ((hwstatus & (IPATH_SDMA_STATUS_SCORE_BOARD_DRAIN_IN_PROG |
-                                IPATH_SDMA_STATUS_ABORT_IN_PROG             |
-                                IPATH_SDMA_STATUS_INTERNAL_SDMA_ENABLE)) ||
-                   !(hwstatus & IPATH_SDMA_STATUS_SCB_EMPTY)) {
-                       if (dd->ipath_sdma_reset_wait > 0) {
-                               /* not done shutting down sdma */
-                               --dd->ipath_sdma_reset_wait;
-                               goto resched;
-                       }
-                       ipath_cdbg(VERBOSE, "gave up waiting for quiescent "
-                               "status after SDMA reset, continuing\n");
-                       dump_sdma_state(dd);
-               }
-
-               /* dequeue all "sent" requests */
-               list_for_each_entry_safe(txp, txpnext,
-                                        &dd->ipath_sdma_activelist, list) {
-                       txp->callback_status = IPATH_SDMA_TXREQ_S_ABORTED;
-                       if (txp->flags & IPATH_SDMA_TXREQ_F_VL15)
-                               vl15_watchdog_deq(dd);
-                       list_move_tail(&txp->list, &dd->ipath_sdma_notifylist);
-                       notify = 1;
-               }
-               if (notify)
-                       tasklet_hi_schedule(&dd->ipath_sdma_notify_task);
-
-               /* reset our notion of head and tail */
-               dd->ipath_sdma_descq_tail = 0;
-               dd->ipath_sdma_descq_head = 0;
-               dd->ipath_sdma_head_dma[0] = 0;
-               dd->ipath_sdma_generation = 0;
-               dd->ipath_sdma_descq_removed = dd->ipath_sdma_descq_added;
-
-               /* Reset SendDmaLenGen */
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmalengen,
-                       (u64) dd->ipath_sdma_descq_cnt | (1ULL << 18));
-
-               /* done with sdma state for a bit */
-               spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-
-               /*
-                * Don't restart sdma here (with the exception
-                * below). Wait until link is up to ACTIVE.  VL15 MADs
-                * used to bring the link up use PIO, and multiple link
-                * transitions otherwise cause the sdma engine to be
-                * stopped and started multiple times.
-                * The disable is done here, including the shadow,
-                * so the state is kept consistent.
-                * See ipath_restart_sdma() for the actual starting
-                * of sdma.
-                */
-               spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-               dd->ipath_sendctrl &= ~INFINIPATH_S_SDMAENABLE;
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
-                                dd->ipath_sendctrl);
-               ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-               spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-
-               /* make sure I see next message */
-               dd->ipath_sdma_abort_jiffies = 0;
-
-               /*
-                * Not everything that takes SDMA offline is a link
-                * status change.  If the link was up, restart SDMA.
-                */
-               if (dd->ipath_flags & IPATH_LINKACTIVE)
-                       ipath_restart_sdma(dd);
-
-               goto done;
-       }
-
-resched:
-       /*
-        * for now, keep spinning
-        * JAG - this is bad to just have default be a loop without
-        * state change
-        */
-       if (time_after(jiffies, dd->ipath_sdma_abort_jiffies)) {
-               ipath_dbg("looping with status 0x%08lx\n",
-                         dd->ipath_sdma_status);
-               dd->ipath_sdma_abort_jiffies = jiffies + 5 * HZ;
-       }
-resched_noprint:
-       spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-       if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
-               tasklet_hi_schedule(&dd->ipath_sdma_abort_task);
-       return;
-
-unlock:
-       spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-done:
-       return;
-}
-
-/*
- * This is called from interrupt context.
- */
-void ipath_sdma_intr(struct ipath_devdata *dd)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
-
-       (void) ipath_sdma_make_progress(dd);
-
-       spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-}
-
-static int alloc_sdma(struct ipath_devdata *dd)
-{
-       int ret = 0;
-
-       /* Allocate memory for SendDMA descriptor FIFO */
-       dd->ipath_sdma_descq = dma_alloc_coherent(&dd->pcidev->dev,
-               SDMA_DESCQ_SZ, &dd->ipath_sdma_descq_phys, GFP_KERNEL);
-
-       if (!dd->ipath_sdma_descq) {
-               ipath_dev_err(dd, "failed to allocate SendDMA descriptor "
-                       "FIFO memory\n");
-               ret = -ENOMEM;
-               goto done;
-       }
-
-       dd->ipath_sdma_descq_cnt =
-               SDMA_DESCQ_SZ / sizeof(struct ipath_sdma_desc);
-
-       /* Allocate memory for DMA of head register to memory */
-       dd->ipath_sdma_head_dma = dma_alloc_coherent(&dd->pcidev->dev,
-               PAGE_SIZE, &dd->ipath_sdma_head_phys, GFP_KERNEL);
-       if (!dd->ipath_sdma_head_dma) {
-               ipath_dev_err(dd, "failed to allocate SendDMA head memory\n");
-               ret = -ENOMEM;
-               goto cleanup_descq;
-       }
-       dd->ipath_sdma_head_dma[0] = 0;
-
-       init_timer(&dd->ipath_sdma_vl15_timer);
-       dd->ipath_sdma_vl15_timer.function = vl15_watchdog_timeout;
-       dd->ipath_sdma_vl15_timer.data = (unsigned long)dd;
-       atomic_set(&dd->ipath_sdma_vl15_count, 0);
-
-       goto done;
-
-cleanup_descq:
-       dma_free_coherent(&dd->pcidev->dev, SDMA_DESCQ_SZ,
-               (void *)dd->ipath_sdma_descq, dd->ipath_sdma_descq_phys);
-       dd->ipath_sdma_descq = NULL;
-       dd->ipath_sdma_descq_phys = 0;
-done:
-       return ret;
-}
-
-int setup_sdma(struct ipath_devdata *dd)
-{
-       int ret = 0;
-       unsigned i, n;
-       u64 tmp64;
-       u64 senddmabufmask[3] = { 0 };
-       unsigned long flags;
-
-       ret = alloc_sdma(dd);
-       if (ret)
-               goto done;
-
-       if (!dd->ipath_sdma_descq) {
-               ipath_dev_err(dd, "SendDMA memory not allocated\n");
-               goto done;
-       }
-
-       /*
-        * Set initial status as if we had been up, then gone down.
-        * This lets initial start on transition to ACTIVE be the
-        * same as restart after link flap.
-        */
-       dd->ipath_sdma_status = IPATH_SDMA_ABORT_ABORTED;
-       dd->ipath_sdma_abort_jiffies = 0;
-       dd->ipath_sdma_generation = 0;
-       dd->ipath_sdma_descq_tail = 0;
-       dd->ipath_sdma_descq_head = 0;
-       dd->ipath_sdma_descq_removed = 0;
-       dd->ipath_sdma_descq_added = 0;
-
-       /* Set SendDmaBase */
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabase,
-                        dd->ipath_sdma_descq_phys);
-       /* Set SendDmaLenGen */
-       tmp64 = dd->ipath_sdma_descq_cnt;
-       tmp64 |= 1<<18; /* enable generation checking */
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmalengen, tmp64);
-       /* Set SendDmaTail */
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail,
-                        dd->ipath_sdma_descq_tail);
-       /* Set SendDmaHeadAddr */
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmaheadaddr,
-                        dd->ipath_sdma_head_phys);
-
-       /*
-        * Reserve all the former "kernel" piobufs, using high number range
-        * so we get as many 4K buffers as possible
-        */
-       n = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
-       i = dd->ipath_lastport_piobuf + dd->ipath_pioreserved;
-       ipath_chg_pioavailkernel(dd, i, n - i, 0);
-       for (; i < n; ++i) {
-               unsigned word = i / 64;
-               unsigned bit = i & 63;
-               BUG_ON(word >= 3);
-               senddmabufmask[word] |= 1ULL << bit;
-       }
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask0,
-                        senddmabufmask[0]);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask1,
-                        senddmabufmask[1]);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask2,
-                        senddmabufmask[2]);
-
-       INIT_LIST_HEAD(&dd->ipath_sdma_activelist);
-       INIT_LIST_HEAD(&dd->ipath_sdma_notifylist);
-
-       tasklet_init(&dd->ipath_sdma_notify_task, sdma_notify_task,
-                    (unsigned long) dd);
-       tasklet_init(&dd->ipath_sdma_abort_task, sdma_abort_task,
-                    (unsigned long) dd);
-
-       /*
-        * No use to turn on SDMA here, as link is probably not ACTIVE
-        * Just mark it RUNNING and enable the interrupt, and let the
-        * ipath_restart_sdma() on link transition to ACTIVE actually
-        * enable it.
-        */
-       spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-       dd->ipath_sendctrl |= INFINIPATH_S_SDMAINTENABLE;
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
-       ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-       __set_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status);
-       spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-
-done:
-       return ret;
-}
-
-void teardown_sdma(struct ipath_devdata *dd)
-{
-       struct ipath_sdma_txreq *txp, *txpnext;
-       unsigned long flags;
-       dma_addr_t sdma_head_phys = 0;
-       dma_addr_t sdma_descq_phys = 0;
-       void *sdma_descq = NULL;
-       void *sdma_head_dma = NULL;
-
-       spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
-       __clear_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status);
-       __set_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status);
-       __set_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status);
-       spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-
-       tasklet_kill(&dd->ipath_sdma_abort_task);
-       tasklet_kill(&dd->ipath_sdma_notify_task);
-
-       /* turn off sdma */
-       spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-       dd->ipath_sendctrl &= ~INFINIPATH_S_SDMAENABLE;
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
-               dd->ipath_sendctrl);
-       ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-       spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-
-       spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
-       /* dequeue all "sent" requests */
-       list_for_each_entry_safe(txp, txpnext, &dd->ipath_sdma_activelist,
-                                list) {
-               txp->callback_status = IPATH_SDMA_TXREQ_S_SHUTDOWN;
-               if (txp->flags & IPATH_SDMA_TXREQ_F_VL15)
-                       vl15_watchdog_deq(dd);
-               list_move_tail(&txp->list, &dd->ipath_sdma_notifylist);
-       }
-       spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-
-       sdma_notify_taskbody(dd);
-
-       del_timer_sync(&dd->ipath_sdma_vl15_timer);
-
-       spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
-
-       dd->ipath_sdma_abort_jiffies = 0;
-
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabase, 0);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmalengen, 0);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail, 0);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmaheadaddr, 0);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask0, 0);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask1, 0);
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask2, 0);
-
-       if (dd->ipath_sdma_head_dma) {
-               sdma_head_dma = (void *) dd->ipath_sdma_head_dma;
-               sdma_head_phys = dd->ipath_sdma_head_phys;
-               dd->ipath_sdma_head_dma = NULL;
-               dd->ipath_sdma_head_phys = 0;
-       }
-
-       if (dd->ipath_sdma_descq) {
-               sdma_descq = dd->ipath_sdma_descq;
-               sdma_descq_phys = dd->ipath_sdma_descq_phys;
-               dd->ipath_sdma_descq = NULL;
-               dd->ipath_sdma_descq_phys = 0;
-       }
-
-       spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-
-       if (sdma_head_dma)
-               dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
-                                 sdma_head_dma, sdma_head_phys);
-
-       if (sdma_descq)
-               dma_free_coherent(&dd->pcidev->dev, SDMA_DESCQ_SZ,
-                                 sdma_descq, sdma_descq_phys);
-}
-
-/*
- * [Re]start SDMA, if we use it, and it's not already OK.
- * This is called on transition to link ACTIVE, either the first or
- * subsequent times.
- */
-void ipath_restart_sdma(struct ipath_devdata *dd)
-{
-       unsigned long flags;
-       int needed = 1;
-
-       if (!(dd->ipath_flags & IPATH_HAS_SEND_DMA))
-               goto bail;
-
-       /*
-        * First, make sure we should, which is to say,
-        * check that we are "RUNNING" (not in teardown)
-        * and not "SHUTDOWN"
-        */
-       spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
-       if (!test_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status)
-               || test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
-                       needed = 0;
-       else {
-               __clear_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status);
-               __clear_bit(IPATH_SDMA_DISARMED, &dd->ipath_sdma_status);
-               __clear_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status);
-       }
-       spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-       if (!needed) {
-               ipath_dbg("invalid attempt to restart SDMA, status 0x%08lx\n",
-                       dd->ipath_sdma_status);
-               goto bail;
-       }
-       spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
-       /*
-        * First clear, just to be safe. Enable is only done
-        * in chip on 0->1 transition
-        */
-       dd->ipath_sendctrl &= ~INFINIPATH_S_SDMAENABLE;
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
-       ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-       dd->ipath_sendctrl |= INFINIPATH_S_SDMAENABLE;
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
-       ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
-       spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
-
-       /* notify upper layers */
-       ipath_ib_piobufavail(dd->verbs_dev);
-
-bail:
-       return;
-}
-
-static inline void make_sdma_desc(struct ipath_devdata *dd,
-       u64 *sdmadesc, u64 addr, u64 dwlen, u64 dwoffset)
-{
-       WARN_ON(addr & 3);
-       /* SDmaPhyAddr[47:32] */
-       sdmadesc[1] = addr >> 32;
-       /* SDmaPhyAddr[31:0] */
-       sdmadesc[0] = (addr & 0xfffffffcULL) << 32;
-       /* SDmaGeneration[1:0] */
-       sdmadesc[0] |= (dd->ipath_sdma_generation & 3ULL) << 30;
-       /* SDmaDwordCount[10:0] */
-       sdmadesc[0] |= (dwlen & 0x7ffULL) << 16;
-       /* SDmaBufOffset[12:2] */
-       sdmadesc[0] |= dwoffset & 0x7ffULL;
-}
-
-/*
- * This function queues one IB packet onto the send DMA queue per call.
- * The caller is responsible for checking:
- * 1) The number of send DMA descriptor entries is less than the size of
- *    the descriptor queue.
- * 2) The IB SGE addresses and lengths are 32-bit aligned
- *    (except possibly the last SGE's length)
- * 3) The SGE addresses are suitable for passing to dma_map_single().
- */
-int ipath_sdma_verbs_send(struct ipath_devdata *dd,
-       struct ipath_sge_state *ss, u32 dwords,
-       struct ipath_verbs_txreq *tx)
-{
-
-       unsigned long flags;
-       struct ipath_sge *sge;
-       int ret = 0;
-       u16 tail;
-       __le64 *descqp;
-       u64 sdmadesc[2];
-       u32 dwoffset;
-       dma_addr_t addr;
-
-       if ((tx->map_len + (dwords<<2)) > dd->ipath_ibmaxlen) {
-               ipath_dbg("packet size %X > ibmax %X, fail\n",
-                       tx->map_len + (dwords<<2), dd->ipath_ibmaxlen);
-               ret = -EMSGSIZE;
-               goto fail;
-       }
-
-       spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
-
-retry:
-       if (unlikely(test_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status))) {
-               ret = -EBUSY;
-               goto unlock;
-       }
-
-       if (tx->txreq.sg_count > ipath_sdma_descq_freecnt(dd)) {
-               if (ipath_sdma_make_progress(dd))
-                       goto retry;
-               ret = -ENOBUFS;
-               goto unlock;
-       }
-
-       addr = dma_map_single(&dd->pcidev->dev, tx->txreq.map_addr,
-                             tx->map_len, DMA_TO_DEVICE);
-       if (dma_mapping_error(&dd->pcidev->dev, addr))
-               goto ioerr;
-
-       dwoffset = tx->map_len >> 2;
-       make_sdma_desc(dd, sdmadesc, (u64) addr, dwoffset, 0);
-
-       /* SDmaFirstDesc */
-       sdmadesc[0] |= 1ULL << 12;
-       if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_USELARGEBUF)
-               sdmadesc[0] |= 1ULL << 14;      /* SDmaUseLargeBuf */
-
-       /* write to the descq */
-       tail = dd->ipath_sdma_descq_tail;
-       descqp = &dd->ipath_sdma_descq[tail].qw[0];
-       *descqp++ = cpu_to_le64(sdmadesc[0]);
-       *descqp++ = cpu_to_le64(sdmadesc[1]);
-
-       if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_FREEDESC)
-               tx->txreq.start_idx = tail;
-
-       /* increment the tail */
-       if (++tail == dd->ipath_sdma_descq_cnt) {
-               tail = 0;
-               descqp = &dd->ipath_sdma_descq[0].qw[0];
-               ++dd->ipath_sdma_generation;
-       }
-
-       sge = &ss->sge;
-       while (dwords) {
-               u32 dw;
-               u32 len;
-
-               len = dwords << 2;
-               if (len > sge->length)
-                       len = sge->length;
-               if (len > sge->sge_length)
-                       len = sge->sge_length;
-               BUG_ON(len == 0);
-               dw = (len + 3) >> 2;
-               addr = dma_map_single(&dd->pcidev->dev, sge->vaddr, dw << 2,
-                                     DMA_TO_DEVICE);
-               if (dma_mapping_error(&dd->pcidev->dev, addr))
-                       goto unmap;
-               make_sdma_desc(dd, sdmadesc, (u64) addr, dw, dwoffset);
-               /* SDmaUseLargeBuf has to be set in every descriptor */
-               if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_USELARGEBUF)
-                       sdmadesc[0] |= 1ULL << 14;
-               /* write to the descq */
-               *descqp++ = cpu_to_le64(sdmadesc[0]);
-               *descqp++ = cpu_to_le64(sdmadesc[1]);
-
-               /* increment the tail */
-               if (++tail == dd->ipath_sdma_descq_cnt) {
-                       tail = 0;
-                       descqp = &dd->ipath_sdma_descq[0].qw[0];
-                       ++dd->ipath_sdma_generation;
-               }
-               sge->vaddr += len;
-               sge->length -= len;
-               sge->sge_length -= len;
-               if (sge->sge_length == 0) {
-                       if (--ss->num_sge)
-                               *sge = *ss->sg_list++;
-               } else if (sge->length == 0 && sge->mr != NULL) {
-                       if (++sge->n >= IPATH_SEGSZ) {
-                               if (++sge->m >= sge->mr->mapsz)
-                                       break;
-                               sge->n = 0;
-                       }
-                       sge->vaddr =
-                               sge->mr->map[sge->m]->segs[sge->n].vaddr;
-                       sge->length =
-                               sge->mr->map[sge->m]->segs[sge->n].length;
-               }
-
-               dwoffset += dw;
-               dwords -= dw;
-       }
-
-       if (!tail)
-               descqp = &dd->ipath_sdma_descq[dd->ipath_sdma_descq_cnt].qw[0];
-       descqp -= 2;
-       /* SDmaLastDesc */
-       descqp[0] |= cpu_to_le64(1ULL << 11);
-       if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_INTREQ) {
-               /* SDmaIntReq */
-               descqp[0] |= cpu_to_le64(1ULL << 15);
-       }
-
-       /* Commit writes to memory and advance the tail on the chip */
-       wmb();
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail, tail);
-
-       tx->txreq.next_descq_idx = tail;
-       tx->txreq.callback_status = IPATH_SDMA_TXREQ_S_OK;
-       dd->ipath_sdma_descq_tail = tail;
-       dd->ipath_sdma_descq_added += tx->txreq.sg_count;
-       list_add_tail(&tx->txreq.list, &dd->ipath_sdma_activelist);
-       if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_VL15)
-               vl15_watchdog_enq(dd);
-       goto unlock;
-
-unmap:
-       while (tail != dd->ipath_sdma_descq_tail) {
-               if (!tail)
-                       tail = dd->ipath_sdma_descq_cnt - 1;
-               else
-                       tail--;
-               unmap_desc(dd, tail);
-       }
-ioerr:
-       ret = -EIO;
-unlock:
-       spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-fail:
-       return ret;
-}
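The make_sdma_desc() helper and the send path above build each 16-byte SDMA descriptor by packing the physical address, generation number, dword count and buffer offset into two 64-bit words, with the first/last/interrupt bits OR-ed in afterwards. A minimal standalone sketch of that same bit packing, using invented address, length and generation values and no chip access, might look like this:

/*
 * Illustrative sketch of the SDMA descriptor packing performed by
 * make_sdma_desc() and ipath_sdma_verbs_send(); the sample address,
 * dword length, offset and generation values are made up.
 */
#include <stdio.h>
#include <stdint.h>

static void pack_sdma_desc(uint64_t desc[2], uint64_t addr,
                           uint64_t dwlen, uint64_t dwoffset,
                           unsigned generation)
{
        desc[1] = addr >> 32;                           /* SDmaPhyAddr[47:32] */
        desc[0] = (addr & 0xfffffffcULL) << 32;         /* SDmaPhyAddr[31:0] */
        desc[0] |= (uint64_t)(generation & 3) << 30;    /* SDmaGeneration[1:0] */
        desc[0] |= (dwlen & 0x7ffULL) << 16;            /* SDmaDwordCount[10:0] */
        desc[0] |= dwoffset & 0x7ffULL;                 /* SDmaBufOffset[12:2] */
}

int main(void)
{
        uint64_t desc[2];

        /* hypothetical DMA address, 64 dwords, offset 0, generation 1 */
        pack_sdma_desc(desc, 0x12345678f000ULL, 64, 0, 1);
        desc[0] |= 1ULL << 12;  /* SDmaFirstDesc, as set in the send path */
        desc[0] |= 1ULL << 11;  /* SDmaLastDesc for a single-descriptor packet */

        printf("qw0 = 0x%016llx\nqw1 = 0x%016llx\n",
               (unsigned long long)desc[0], (unsigned long long)desc[1]);
        return 0;
}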
diff --git a/drivers/infiniband/hw/ipath/ipath_srq.c b/drivers/infiniband/hw/ipath/ipath_srq.c
deleted file mode 100644 (file)
index 2627198..0000000
+++ /dev/null
@@ -1,380 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/err.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-
-#include "ipath_verbs.h"
-
-/**
- * ipath_post_srq_receive - post a receive on a shared receive queue
- * @ibsrq: the SRQ to post the receive on
- * @wr: the list of work requests to post
- * @bad_wr: the first WR to cause a problem is put here
- *
- * This may be called from interrupt context.
- */
-int ipath_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
-                          struct ib_recv_wr **bad_wr)
-{
-       struct ipath_srq *srq = to_isrq(ibsrq);
-       struct ipath_rwq *wq;
-       unsigned long flags;
-       int ret;
-
-       for (; wr; wr = wr->next) {
-               struct ipath_rwqe *wqe;
-               u32 next;
-               int i;
-
-               if ((unsigned) wr->num_sge > srq->rq.max_sge) {
-                       *bad_wr = wr;
-                       ret = -EINVAL;
-                       goto bail;
-               }
-
-               spin_lock_irqsave(&srq->rq.lock, flags);
-               wq = srq->rq.wq;
-               next = wq->head + 1;
-               if (next >= srq->rq.size)
-                       next = 0;
-               if (next == wq->tail) {
-                       spin_unlock_irqrestore(&srq->rq.lock, flags);
-                       *bad_wr = wr;
-                       ret = -ENOMEM;
-                       goto bail;
-               }
-
-               wqe = get_rwqe_ptr(&srq->rq, wq->head);
-               wqe->wr_id = wr->wr_id;
-               wqe->num_sge = wr->num_sge;
-               for (i = 0; i < wr->num_sge; i++)
-                       wqe->sg_list[i] = wr->sg_list[i];
-               /* Make sure queue entry is written before the head index. */
-               smp_wmb();
-               wq->head = next;
-               spin_unlock_irqrestore(&srq->rq.lock, flags);
-       }
-       ret = 0;
-
-bail:
-       return ret;
-}
-
-/**
- * ipath_create_srq - create a shared receive queue
- * @ibpd: the protection domain of the SRQ to create
- * @srq_init_attr: the attributes of the SRQ
- * @udata: data from libipathverbs when creating a user SRQ
- */
-struct ib_srq *ipath_create_srq(struct ib_pd *ibpd,
-                               struct ib_srq_init_attr *srq_init_attr,
-                               struct ib_udata *udata)
-{
-       struct ipath_ibdev *dev = to_idev(ibpd->device);
-       struct ipath_srq *srq;
-       u32 sz;
-       struct ib_srq *ret;
-
-       if (srq_init_attr->srq_type != IB_SRQT_BASIC) {
-               ret = ERR_PTR(-ENOSYS);
-               goto done;
-       }
-
-       if (srq_init_attr->attr.max_wr == 0) {
-               ret = ERR_PTR(-EINVAL);
-               goto done;
-       }
-
-       if ((srq_init_attr->attr.max_sge > ib_ipath_max_srq_sges) ||
-           (srq_init_attr->attr.max_wr > ib_ipath_max_srq_wrs)) {
-               ret = ERR_PTR(-EINVAL);
-               goto done;
-       }
-
-       srq = kmalloc(sizeof(*srq), GFP_KERNEL);
-       if (!srq) {
-               ret = ERR_PTR(-ENOMEM);
-               goto done;
-       }
-
-       /*
-        * Need to use vmalloc() if we want to support large #s of entries.
-        */
-       srq->rq.size = srq_init_attr->attr.max_wr + 1;
-       srq->rq.max_sge = srq_init_attr->attr.max_sge;
-       sz = sizeof(struct ib_sge) * srq->rq.max_sge +
-               sizeof(struct ipath_rwqe);
-       srq->rq.wq = vmalloc_user(sizeof(struct ipath_rwq) + srq->rq.size * sz);
-       if (!srq->rq.wq) {
-               ret = ERR_PTR(-ENOMEM);
-               goto bail_srq;
-       }
-
-       /*
-        * Return the address of the RWQ as the offset to mmap.
-        * See ipath_mmap() for details.
-        */
-       if (udata && udata->outlen >= sizeof(__u64)) {
-               int err;
-               u32 s = sizeof(struct ipath_rwq) + srq->rq.size * sz;
-
-               srq->ip =
-                   ipath_create_mmap_info(dev, s,
-                                          ibpd->uobject->context,
-                                          srq->rq.wq);
-               if (!srq->ip) {
-                       ret = ERR_PTR(-ENOMEM);
-                       goto bail_wq;
-               }
-
-               err = ib_copy_to_udata(udata, &srq->ip->offset,
-                                      sizeof(srq->ip->offset));
-               if (err) {
-                       ret = ERR_PTR(err);
-                       goto bail_ip;
-               }
-       } else
-               srq->ip = NULL;
-
-       /*
-        * ib_create_srq() will initialize srq->ibsrq.
-        */
-       spin_lock_init(&srq->rq.lock);
-       srq->rq.wq->head = 0;
-       srq->rq.wq->tail = 0;
-       srq->limit = srq_init_attr->attr.srq_limit;
-
-       spin_lock(&dev->n_srqs_lock);
-       if (dev->n_srqs_allocated == ib_ipath_max_srqs) {
-               spin_unlock(&dev->n_srqs_lock);
-               ret = ERR_PTR(-ENOMEM);
-               goto bail_ip;
-       }
-
-       dev->n_srqs_allocated++;
-       spin_unlock(&dev->n_srqs_lock);
-
-       if (srq->ip) {
-               spin_lock_irq(&dev->pending_lock);
-               list_add(&srq->ip->pending_mmaps, &dev->pending_mmaps);
-               spin_unlock_irq(&dev->pending_lock);
-       }
-
-       ret = &srq->ibsrq;
-       goto done;
-
-bail_ip:
-       kfree(srq->ip);
-bail_wq:
-       vfree(srq->rq.wq);
-bail_srq:
-       kfree(srq);
-done:
-       return ret;
-}
-
-/**
- * ipath_modify_srq - modify a shared receive queue
- * @ibsrq: the SRQ to modify
- * @attr: the new attributes of the SRQ
- * @attr_mask: indicates which attributes to modify
- * @udata: user data for ipathverbs.so
- */
-int ipath_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
-                    enum ib_srq_attr_mask attr_mask,
-                    struct ib_udata *udata)
-{
-       struct ipath_srq *srq = to_isrq(ibsrq);
-       struct ipath_rwq *wq;
-       int ret = 0;
-
-       if (attr_mask & IB_SRQ_MAX_WR) {
-               struct ipath_rwq *owq;
-               struct ipath_rwqe *p;
-               u32 sz, size, n, head, tail;
-
-               /* Check that the requested sizes are below the limits. */
-               if ((attr->max_wr > ib_ipath_max_srq_wrs) ||
-                   ((attr_mask & IB_SRQ_LIMIT) ?
-                    attr->srq_limit : srq->limit) > attr->max_wr) {
-                       ret = -EINVAL;
-                       goto bail;
-               }
-
-               sz = sizeof(struct ipath_rwqe) +
-                       srq->rq.max_sge * sizeof(struct ib_sge);
-               size = attr->max_wr + 1;
-               wq = vmalloc_user(sizeof(struct ipath_rwq) + size * sz);
-               if (!wq) {
-                       ret = -ENOMEM;
-                       goto bail;
-               }
-
-               /* Check that we can write the offset to mmap. */
-               if (udata && udata->inlen >= sizeof(__u64)) {
-                       __u64 offset_addr;
-                       __u64 offset = 0;
-
-                       ret = ib_copy_from_udata(&offset_addr, udata,
-                                                sizeof(offset_addr));
-                       if (ret)
-                               goto bail_free;
-                       udata->outbuf =
-                               (void __user *) (unsigned long) offset_addr;
-                       ret = ib_copy_to_udata(udata, &offset,
-                                              sizeof(offset));
-                       if (ret)
-                               goto bail_free;
-               }
-
-               spin_lock_irq(&srq->rq.lock);
-               /*
-                * validate head pointer value and compute
-                * the number of remaining WQEs.
-                */
-               owq = srq->rq.wq;
-               head = owq->head;
-               if (head >= srq->rq.size)
-                       head = 0;
-               tail = owq->tail;
-               if (tail >= srq->rq.size)
-                       tail = 0;
-               n = head;
-               if (n < tail)
-                       n += srq->rq.size - tail;
-               else
-                       n -= tail;
-               if (size <= n) {
-                       ret = -EINVAL;
-                       goto bail_unlock;
-               }
-               n = 0;
-               p = wq->wq;
-               while (tail != head) {
-                       struct ipath_rwqe *wqe;
-                       int i;
-
-                       wqe = get_rwqe_ptr(&srq->rq, tail);
-                       p->wr_id = wqe->wr_id;
-                       p->num_sge = wqe->num_sge;
-                       for (i = 0; i < wqe->num_sge; i++)
-                               p->sg_list[i] = wqe->sg_list[i];
-                       n++;
-                       p = (struct ipath_rwqe *)((char *) p + sz);
-                       if (++tail >= srq->rq.size)
-                               tail = 0;
-               }
-               srq->rq.wq = wq;
-               srq->rq.size = size;
-               wq->head = n;
-               wq->tail = 0;
-               if (attr_mask & IB_SRQ_LIMIT)
-                       srq->limit = attr->srq_limit;
-               spin_unlock_irq(&srq->rq.lock);
-
-               vfree(owq);
-
-               if (srq->ip) {
-                       struct ipath_mmap_info *ip = srq->ip;
-                       struct ipath_ibdev *dev = to_idev(srq->ibsrq.device);
-                       u32 s = sizeof(struct ipath_rwq) + size * sz;
-
-                       ipath_update_mmap_info(dev, ip, s, wq);
-
-                       /*
-                        * Return the offset to mmap.
-                        * See ipath_mmap() for details.
-                        */
-                       if (udata && udata->inlen >= sizeof(__u64)) {
-                               ret = ib_copy_to_udata(udata, &ip->offset,
-                                                      sizeof(ip->offset));
-                               if (ret)
-                                       goto bail;
-                       }
-
-                       spin_lock_irq(&dev->pending_lock);
-                       if (list_empty(&ip->pending_mmaps))
-                               list_add(&ip->pending_mmaps,
-                                        &dev->pending_mmaps);
-                       spin_unlock_irq(&dev->pending_lock);
-               }
-       } else if (attr_mask & IB_SRQ_LIMIT) {
-               spin_lock_irq(&srq->rq.lock);
-               if (attr->srq_limit >= srq->rq.size)
-                       ret = -EINVAL;
-               else
-                       srq->limit = attr->srq_limit;
-               spin_unlock_irq(&srq->rq.lock);
-       }
-       goto bail;
-
-bail_unlock:
-       spin_unlock_irq(&srq->rq.lock);
-bail_free:
-       vfree(wq);
-bail:
-       return ret;
-}
-
-int ipath_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr)
-{
-       struct ipath_srq *srq = to_isrq(ibsrq);
-
-       attr->max_wr = srq->rq.size - 1;
-       attr->max_sge = srq->rq.max_sge;
-       attr->srq_limit = srq->limit;
-       return 0;
-}
-
-/**
- * ipath_destroy_srq - destroy a shared receive queue
- * @ibsrq: the SRQ to destroy
- */
-int ipath_destroy_srq(struct ib_srq *ibsrq)
-{
-       struct ipath_srq *srq = to_isrq(ibsrq);
-       struct ipath_ibdev *dev = to_idev(ibsrq->device);
-
-       spin_lock(&dev->n_srqs_lock);
-       dev->n_srqs_allocated--;
-       spin_unlock(&dev->n_srqs_lock);
-       if (srq->ip)
-               kref_put(&srq->ip->ref, ipath_release_mmap_info);
-       else
-               vfree(srq->rq.wq);
-       kfree(srq);
-
-       return 0;
-}
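ipath_modify_srq() above decides whether a resize can succeed by counting the receive work requests still outstanding between the ring's tail and head, then requiring the new queue to be strictly larger than that count. A minimal standalone sketch of that occupancy arithmetic, with made-up head, tail and size values, is:

/*
 * Illustrative sketch of the circular-queue occupancy calculation used
 * by ipath_modify_srq() when copying outstanding WQEs into the resized
 * ring; the sample indices are invented.
 */
#include <stdio.h>

/* number of posted-but-unconsumed entries in a ring of 'size' slots */
static unsigned ring_count(unsigned head, unsigned tail, unsigned size)
{
        return head >= tail ? head - tail : head + size - tail;
}

int main(void)
{
        unsigned size = 8;

        /* head has wrapped past the end of the ring */
        printf("in use: %u of %u usable slots\n", ring_count(2, 6, size), size - 1);
        /* head ahead of tail, no wrap */
        printf("in use: %u of %u usable slots\n", ring_count(5, 1, size), size - 1);
        return 0;
}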
diff --git a/drivers/infiniband/hw/ipath/ipath_stats.c b/drivers/infiniband/hw/ipath/ipath_stats.c
deleted file mode 100644 (file)
index f63e143..0000000
+++ /dev/null
@@ -1,347 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "ipath_kernel.h"
-
-struct infinipath_stats ipath_stats;
-
-/**
- * ipath_snap_cntr - snapshot a chip counter
- * @dd: the infinipath device
- * @creg: the counter to snapshot
- *
- * called from add_timer and user counter read calls, to deal with
- * counters that wrap in "human time".  The words sent and received, and
- * the packets sent and received are all that we worry about.  For now,
- * at least, we don't worry about error counters, because if they wrap
- * that quickly, we probably don't care.  We may eventually just make this
- * handle all the counters.  word counters can wrap in about 20 seconds
- * of full bandwidth traffic, packet counters in a few hours.
- */
-
-u64 ipath_snap_cntr(struct ipath_devdata *dd, ipath_creg creg)
-{
-       u32 val, reg64 = 0;
-       u64 val64;
-       unsigned long t0, t1;
-       u64 ret;
-
-       t0 = jiffies;
-       /* If fast increment counters are only 32 bits, snapshot them,
-        * and maintain them as 64bit values in the driver */
-       if (!(dd->ipath_flags & IPATH_32BITCOUNTERS) &&
-           (creg == dd->ipath_cregs->cr_wordsendcnt ||
-            creg == dd->ipath_cregs->cr_wordrcvcnt ||
-            creg == dd->ipath_cregs->cr_pktsendcnt ||
-            creg == dd->ipath_cregs->cr_pktrcvcnt)) {
-               val64 = ipath_read_creg(dd, creg);
-               val = val64 == ~0ULL ? ~0U : 0;
-               reg64 = 1;
-       } else                  /* val64 just to keep gcc quiet... */
-               val64 = val = ipath_read_creg32(dd, creg);
-       /*
-        * See if a second has passed.  This is just a way to detect things
-        * that are quite broken.  Normally this should take just a few
-        * cycles (the check is for long enough that we don't care if we get
-        * pre-empted.)  An Opteron HT O read timeout is 4 seconds with
-        * normal NB values
-        */
-       t1 = jiffies;
-       if (time_before(t0 + HZ, t1) && val == -1) {
-               ipath_dev_err(dd, "Error!  Read counter 0x%x timed out\n",
-                             creg);
-               ret = 0ULL;
-               goto bail;
-       }
-       if (reg64) {
-               ret = val64;
-               goto bail;
-       }
-
-       if (creg == dd->ipath_cregs->cr_wordsendcnt) {
-               if (val != dd->ipath_lastsword) {
-                       dd->ipath_sword += val - dd->ipath_lastsword;
-                       dd->ipath_lastsword = val;
-               }
-               val64 = dd->ipath_sword;
-       } else if (creg == dd->ipath_cregs->cr_wordrcvcnt) {
-               if (val != dd->ipath_lastrword) {
-                       dd->ipath_rword += val - dd->ipath_lastrword;
-                       dd->ipath_lastrword = val;
-               }
-               val64 = dd->ipath_rword;
-       } else if (creg == dd->ipath_cregs->cr_pktsendcnt) {
-               if (val != dd->ipath_lastspkts) {
-                       dd->ipath_spkts += val - dd->ipath_lastspkts;
-                       dd->ipath_lastspkts = val;
-               }
-               val64 = dd->ipath_spkts;
-       } else if (creg == dd->ipath_cregs->cr_pktrcvcnt) {
-               if (val != dd->ipath_lastrpkts) {
-                       dd->ipath_rpkts += val - dd->ipath_lastrpkts;
-                       dd->ipath_lastrpkts = val;
-               }
-               val64 = dd->ipath_rpkts;
-       } else if (creg == dd->ipath_cregs->cr_ibsymbolerrcnt) {
-               if (dd->ibdeltainprog)
-                       val64 -= val64 - dd->ibsymsnap;
-               val64 -= dd->ibsymdelta;
-       } else if (creg == dd->ipath_cregs->cr_iblinkerrrecovcnt) {
-               if (dd->ibdeltainprog)
-                       val64 -= val64 - dd->iblnkerrsnap;
-               val64 -= dd->iblnkerrdelta;
-       } else
-               val64 = (u64) val;
-
-       ret = val64;
-
-bail:
-       return ret;
-}
-
-/**
- * ipath_qcheck - print delta of egrfull/hdrqfull errors for kernel ports
- * @dd: the infinipath device
- *
- * print the delta of egrfull/hdrqfull errors for kernel ports no more than
- * every 5 seconds.  User processes are printed at close, but kernel doesn't
- * close, so...  Separate routine so it may be called from other places
- * someday, and so the function name is meaningful when printed by _IPATH_INFO
- */
-static void ipath_qcheck(struct ipath_devdata *dd)
-{
-       static u64 last_tot_hdrqfull;
-       struct ipath_portdata *pd = dd->ipath_pd[0];
-       size_t blen = 0;
-       char buf[128];
-       u32 hdrqtail;
-
-       *buf = 0;
-       if (pd->port_hdrqfull != dd->ipath_p0_hdrqfull) {
-               blen = snprintf(buf, sizeof buf, "port 0 hdrqfull %u",
-                               pd->port_hdrqfull -
-                               dd->ipath_p0_hdrqfull);
-               dd->ipath_p0_hdrqfull = pd->port_hdrqfull;
-       }
-       if (ipath_stats.sps_etidfull != dd->ipath_last_tidfull) {
-               blen += snprintf(buf + blen, sizeof buf - blen,
-                                "%srcvegrfull %llu",
-                                blen ? ", " : "",
-                                (unsigned long long)
-                                (ipath_stats.sps_etidfull -
-                                 dd->ipath_last_tidfull));
-               dd->ipath_last_tidfull = ipath_stats.sps_etidfull;
-       }
-
-       /*
-        * this is actually the number of hdrq full interrupts, not actual
-        * events, but at the moment that's mostly what I'm interested in.
-        * Actual count, etc. is in the counters, if needed.  For production
-        * users this won't ordinarily be printed.
-        */
-
-       if ((ipath_debug & (__IPATH_PKTDBG | __IPATH_DBG)) &&
-           ipath_stats.sps_hdrqfull != last_tot_hdrqfull) {
-               blen += snprintf(buf + blen, sizeof buf - blen,
-                                "%shdrqfull %llu (all ports)",
-                                blen ? ", " : "",
-                                (unsigned long long)
-                                (ipath_stats.sps_hdrqfull -
-                                 last_tot_hdrqfull));
-               last_tot_hdrqfull = ipath_stats.sps_hdrqfull;
-       }
-       if (blen)
-               ipath_dbg("%s\n", buf);
-
-       hdrqtail = ipath_get_hdrqtail(pd);
-       if (pd->port_head != hdrqtail) {
-               if (dd->ipath_lastport0rcv_cnt ==
-                   ipath_stats.sps_port0pkts) {
-                       ipath_cdbg(PKT, "missing rcv interrupts? "
-                                  "port0 hd=%x tl=%x; port0pkts %llx; write"
-                                  " hd (w/intr)\n",
-                                  pd->port_head, hdrqtail,
-                                  (unsigned long long)
-                                  ipath_stats.sps_port0pkts);
-                       ipath_write_ureg(dd, ur_rcvhdrhead, hdrqtail |
-                               dd->ipath_rhdrhead_intr_off, pd->port_port);
-               }
-               dd->ipath_lastport0rcv_cnt = ipath_stats.sps_port0pkts;
-       }
-}
-
-static void ipath_chk_errormask(struct ipath_devdata *dd)
-{
-       static u32 fixed;
-       u32 ctrl;
-       unsigned long errormask;
-       unsigned long hwerrs;
-
-       if (!dd->ipath_errormask || !(dd->ipath_flags & IPATH_INITTED))
-               return;
-
-       errormask = ipath_read_kreg64(dd, dd->ipath_kregs->kr_errormask);
-
-       if (errormask == dd->ipath_errormask)
-               return;
-       fixed++;
-
-       hwerrs = ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus);
-       ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control);
-
-       ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
-               dd->ipath_errormask);
-
-       if ((hwerrs & dd->ipath_hwerrmask) ||
-               (ctrl & INFINIPATH_C_FREEZEMODE)) {
-               /* force re-interrupt of pending events, just in case */
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, 0ULL);
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, 0ULL);
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, 0ULL);
-               dev_info(&dd->pcidev->dev,
-                       "errormask fixed(%u) %lx -> %lx, ctrl %x hwerr %lx\n",
-                       fixed, errormask, (unsigned long)dd->ipath_errormask,
-                       ctrl, hwerrs);
-       } else
-               ipath_dbg("errormask fixed(%u) %lx -> %lx, no freeze\n",
-                       fixed, errormask,
-                       (unsigned long)dd->ipath_errormask);
-}
-
-
-/**
- * ipath_get_faststats - get word counters from chip before they overflow
- * @opaque - contains a pointer to the infinipath device ipath_devdata
- *
- * called from add_timer
- */
-void ipath_get_faststats(unsigned long opaque)
-{
-       struct ipath_devdata *dd = (struct ipath_devdata *) opaque;
-       int i;
-       static unsigned cnt;
-       unsigned long flags;
-       u64 traffic_wds;
-
-       /*
-        * don't access the chip while running diags, or memory diags can
-        * fail
-        */
-       if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_INITTED) ||
-           ipath_diag_inuse)
-               /* but re-arm the timer, for diags case; won't hurt other */
-               goto done;
-
-       /*
-        * We now try to maintain a "active timer", based on traffic
-        * exceeding a threshold, so we need to check the word-counts
-        * even if they are 64-bit.
-        */
-       traffic_wds = ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordsendcnt) +
-               ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordrcvcnt);
-       spin_lock_irqsave(&dd->ipath_eep_st_lock, flags);
-       traffic_wds -= dd->ipath_traffic_wds;
-       dd->ipath_traffic_wds += traffic_wds;
-       if (traffic_wds  >= IPATH_TRAFFIC_ACTIVE_THRESHOLD)
-               atomic_add(5, &dd->ipath_active_time); /* S/B #define */
-       spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags);
-
-       if (dd->ipath_flags & IPATH_32BITCOUNTERS) {
-               ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktsendcnt);
-               ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktrcvcnt);
-       }
-
-       ipath_qcheck(dd);
-
-       /*
-        * deal with repeat error suppression.  Doesn't really matter if
-        * last error was almost a full interval ago, or just a few usecs
-        * ago; still won't get more than 2 per interval.  We may want
-        * longer intervals for this eventually, could do with mod, counter
-        * or separate timer.  Also see code in ipath_handle_errors() and
-        * ipath_handle_hwerrors().
-        */
-
-       if (dd->ipath_lasterror)
-               dd->ipath_lasterror = 0;
-       if (dd->ipath_lasthwerror)
-               dd->ipath_lasthwerror = 0;
-       if (dd->ipath_maskederrs
-           && time_after(jiffies, dd->ipath_unmasktime)) {
-               char ebuf[256];
-               int iserr;
-               iserr = ipath_decode_err(dd, ebuf, sizeof ebuf,
-                                        dd->ipath_maskederrs);
-               if (dd->ipath_maskederrs &
-                   ~(INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL |
-                     INFINIPATH_E_PKTERRS))
-                       ipath_dev_err(dd, "Re-enabling masked errors "
-                                     "(%s)\n", ebuf);
-               else {
-                       /*
-                        * rcvegrfull and rcvhdrqfull are "normal", for some
-                        * types of processes (mostly benchmarks) that send
-                        * huge numbers of messages, while not processing
-                        * them.  So only complain about these at debug
-                        * level.
-                        */
-                       if (iserr)
-                               ipath_dbg(
-                                       "Re-enabling queue full errors (%s)\n",
-                                       ebuf);
-                       else
-                               ipath_cdbg(ERRPKT, "Re-enabling packet"
-                                       " problem interrupt (%s)\n", ebuf);
-               }
-
-               /* re-enable masked errors */
-               dd->ipath_errormask |= dd->ipath_maskederrs;
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
-                                dd->ipath_errormask);
-               dd->ipath_maskederrs = 0;
-       }
-
-       /* limit qfull messages to ~one per minute per port */
-       if ((++cnt & 0x10)) {
-               for (i = (int) dd->ipath_cfgports; --i >= 0; ) {
-                       struct ipath_portdata *pd = dd->ipath_pd[i];
-
-                       if (pd && pd->port_lastrcvhdrqtail != -1)
-                               pd->port_lastrcvhdrqtail = -1;
-               }
-       }
-
-       ipath_chk_errormask(dd);
-done:
-       mod_timer(&dd->ipath_stats_timer, jiffies + HZ * 5);
-}
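ipath_snap_cntr() above folds the fast-wrapping 32-bit hardware word and packet counters into 64-bit software totals by adding the modulo-2^32 delta between consecutive readings. A minimal standalone sketch of that shadowing, with invented register readings, is:

/*
 * Illustrative sketch of the 32-bit counter shadowing done by
 * ipath_snap_cntr(): the hardware counter wraps, so a 64-bit software
 * total accumulates the wrapped delta on every snapshot.  The sample
 * readings are made up.
 */
#include <stdio.h>
#include <stdint.h>

struct shadow_cntr {
        uint64_t total;   /* 64-bit value maintained in software */
        uint32_t last;    /* last raw 32-bit hardware reading */
};

static uint64_t snap(struct shadow_cntr *c, uint32_t raw)
{
        if (raw != c->last) {
                c->total += (uint32_t)(raw - c->last); /* wraps safely mod 2^32 */
                c->last = raw;
        }
        return c->total;
}

int main(void)
{
        struct shadow_cntr words = { 0, 0 };

        snap(&words, 0xfffffff0u);             /* counter almost about to wrap */
        printf("total words: %llu\n",          /* reading after the wrap */
               (unsigned long long)snap(&words, 0x10u));
        return 0;
}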
diff --git a/drivers/infiniband/hw/ipath/ipath_sysfs.c b/drivers/infiniband/hw/ipath/ipath_sysfs.c
deleted file mode 100644 (file)
index 75558f3..0000000
+++ /dev/null
@@ -1,1238 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/ctype.h>
-#include <linux/stat.h>
-
-#include "ipath_kernel.h"
-#include "ipath_verbs.h"
-#include "ipath_common.h"
-
-/**
- * ipath_parse_ushort - parse an unsigned short value in an arbitrary base
- * @str: the string containing the number
- * @valp: where to put the result
- *
- * returns the number of bytes consumed, or negative value on error
- */
-int ipath_parse_ushort(const char *str, unsigned short *valp)
-{
-       unsigned long val;
-       char *end;
-       int ret;
-
-       if (!isdigit(str[0])) {
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       val = simple_strtoul(str, &end, 0);
-
-       if (val > 0xffff) {
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       *valp = val;
-
-       ret = end + 1 - str;
-       if (ret == 0)
-               ret = -EINVAL;
-
-bail:
-       return ret;
-}
-
-static ssize_t show_version(struct device_driver *dev, char *buf)
-{
-       /* The string printed here is already newline-terminated. */
-       return scnprintf(buf, PAGE_SIZE, "%s", ib_ipath_version);
-}
-
-static ssize_t show_num_units(struct device_driver *dev, char *buf)
-{
-       return scnprintf(buf, PAGE_SIZE, "%d\n",
-                        ipath_count_units(NULL, NULL, NULL));
-}
-
-static ssize_t show_status(struct device *dev,
-                          struct device_attribute *attr,
-                          char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       ssize_t ret;
-
-       if (!dd->ipath_statusp) {
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       ret = scnprintf(buf, PAGE_SIZE, "0x%llx\n",
-                       (unsigned long long) *(dd->ipath_statusp));
-
-bail:
-       return ret;
-}
-
-static const char *ipath_status_str[] = {
-       "Initted",
-       "Disabled",
-       "Admin_Disabled",
-       "", /* This used to be the old "OIB_SMA" status. */
-       "", /* This used to be the old "SMA" status. */
-       "Present",
-       "IB_link_up",
-       "IB_configured",
-       "NoIBcable",
-       "Fatal_Hardware_Error",
-       NULL,
-};
-
-static ssize_t show_status_str(struct device *dev,
-                              struct device_attribute *attr,
-                              char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       int i, any;
-       u64 s;
-       ssize_t ret;
-
-       if (!dd->ipath_statusp) {
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       s = *(dd->ipath_statusp);
-       *buf = '\0';
-       for (any = i = 0; s && ipath_status_str[i]; i++) {
-               if (s & 1) {
-                       if (any && strlcat(buf, " ", PAGE_SIZE) >=
-                           PAGE_SIZE)
-                               /* overflow */
-                               break;
-                       if (strlcat(buf, ipath_status_str[i],
-                                   PAGE_SIZE) >= PAGE_SIZE)
-                               break;
-                       any = 1;
-               }
-               s >>= 1;
-       }
-       if (any)
-               strlcat(buf, "\n", PAGE_SIZE);
-
-       ret = strlen(buf);
-
-bail:
-       return ret;
-}
-
-static ssize_t show_boardversion(struct device *dev,
-                              struct device_attribute *attr,
-                              char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       /* The string printed here is already newline-terminated. */
-       return scnprintf(buf, PAGE_SIZE, "%s", dd->ipath_boardversion);
-}
-
-static ssize_t show_localbus_info(struct device *dev,
-                              struct device_attribute *attr,
-                              char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       /* The string printed here is already newline-terminated. */
-       return scnprintf(buf, PAGE_SIZE, "%s", dd->ipath_lbus_info);
-}
-
-static ssize_t show_lmc(struct device *dev,
-                       struct device_attribute *attr,
-                       char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-
-       return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_lmc);
-}
-
-static ssize_t store_lmc(struct device *dev,
-                        struct device_attribute *attr,
-                        const char *buf,
-                        size_t count)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       u16 lmc = 0;
-       int ret;
-
-       ret = ipath_parse_ushort(buf, &lmc);
-       if (ret < 0)
-               goto invalid;
-
-       if (lmc > 7) {
-               ret = -EINVAL;
-               goto invalid;
-       }
-
-       ipath_set_lid(dd, dd->ipath_lid, lmc);
-
-       goto bail;
-invalid:
-       ipath_dev_err(dd, "attempt to set invalid LMC %u\n", lmc);
-bail:
-       return ret;
-}
-
-static ssize_t show_lid(struct device *dev,
-                       struct device_attribute *attr,
-                       char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-
-       return scnprintf(buf, PAGE_SIZE, "0x%x\n", dd->ipath_lid);
-}
-
-static ssize_t store_lid(struct device *dev,
-                        struct device_attribute *attr,
-                         const char *buf,
-                         size_t count)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       u16 lid = 0;
-       int ret;
-
-       ret = ipath_parse_ushort(buf, &lid);
-       if (ret < 0)
-               goto invalid;
-
-       if (lid == 0 || lid >= IPATH_MULTICAST_LID_BASE) {
-               ret = -EINVAL;
-               goto invalid;
-       }
-
-       ipath_set_lid(dd, lid, dd->ipath_lmc);
-
-       goto bail;
-invalid:
-       ipath_dev_err(dd, "attempt to set invalid LID 0x%x\n", lid);
-bail:
-       return ret;
-}
-
-static ssize_t show_mlid(struct device *dev,
-                        struct device_attribute *attr,
-                        char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-
-       return scnprintf(buf, PAGE_SIZE, "0x%x\n", dd->ipath_mlid);
-}
-
-static ssize_t store_mlid(struct device *dev,
-                        struct device_attribute *attr,
-                         const char *buf,
-                         size_t count)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       u16 mlid;
-       int ret;
-
-       ret = ipath_parse_ushort(buf, &mlid);
-       if (ret < 0 || mlid < IPATH_MULTICAST_LID_BASE)
-               goto invalid;
-
-       dd->ipath_mlid = mlid;
-
-       goto bail;
-invalid:
-       ipath_dev_err(dd, "attempt to set invalid MLID\n");
-bail:
-       return ret;
-}
-
-static ssize_t show_guid(struct device *dev,
-                        struct device_attribute *attr,
-                        char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       u8 *guid;
-
-       guid = (u8 *) & (dd->ipath_guid);
-
-       return scnprintf(buf, PAGE_SIZE,
-                        "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n",
-                        guid[0], guid[1], guid[2], guid[3],
-                        guid[4], guid[5], guid[6], guid[7]);
-}
-
-static ssize_t store_guid(struct device *dev,
-                        struct device_attribute *attr,
-                         const char *buf,
-                         size_t count)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       ssize_t ret;
-       unsigned short guid[8];
-       __be64 new_guid;
-       u8 *ng;
-       int i;
-
-       if (sscanf(buf, "%hx:%hx:%hx:%hx:%hx:%hx:%hx:%hx",
-                  &guid[0], &guid[1], &guid[2], &guid[3],
-                  &guid[4], &guid[5], &guid[6], &guid[7]) != 8)
-               goto invalid;
-
-       ng = (u8 *) &new_guid;
-
-       for (i = 0; i < 8; i++) {
-               if (guid[i] > 0xff)
-                       goto invalid;
-               ng[i] = guid[i];
-       }
-
-       if (new_guid == 0)
-               goto invalid;
-
-       dd->ipath_guid = new_guid;
-       dd->ipath_nguid = 1;
-       if (dd->verbs_dev)
-               dd->verbs_dev->ibdev.node_guid = new_guid;
-
-       ret = strlen(buf);
-       goto bail;
-
-invalid:
-       ipath_dev_err(dd, "attempt to set invalid GUID\n");
-       ret = -EINVAL;
-
-bail:
-       return ret;
-}
-
-static ssize_t show_nguid(struct device *dev,
-                         struct device_attribute *attr,
-                         char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-
-       return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_nguid);
-}
-
-static ssize_t show_nports(struct device *dev,
-                          struct device_attribute *attr,
-                          char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-
-       /* Return the number of user ports available. */
-       return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_cfgports - 1);
-}
-
-static ssize_t show_serial(struct device *dev,
-                          struct device_attribute *attr,
-                          char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-
-       buf[sizeof dd->ipath_serial] = '\0';
-       memcpy(buf, dd->ipath_serial, sizeof dd->ipath_serial);
-       strcat(buf, "\n");
-       return strlen(buf);
-}
-
-static ssize_t show_unit(struct device *dev,
-                        struct device_attribute *attr,
-                        char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-
-       return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_unit);
-}
-
-static ssize_t show_jint_max_packets(struct device *dev,
-                                    struct device_attribute *attr,
-                                    char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-
-       return scnprintf(buf, PAGE_SIZE, "%hu\n", dd->ipath_jint_max_packets);
-}
-
-static ssize_t store_jint_max_packets(struct device *dev,
-                                     struct device_attribute *attr,
-                                     const char *buf,
-                                     size_t count)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       u16 v = 0;
-       int ret;
-
-       ret = ipath_parse_ushort(buf, &v);
-       if (ret < 0)
-               ipath_dev_err(dd, "invalid jint_max_packets.\n");
-       else
-               dd->ipath_f_config_jint(dd, dd->ipath_jint_idle_ticks, v);
-
-       return ret;
-}
-
-static ssize_t show_jint_idle_ticks(struct device *dev,
-                                   struct device_attribute *attr,
-                                   char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-
-       return scnprintf(buf, PAGE_SIZE, "%hu\n", dd->ipath_jint_idle_ticks);
-}
-
-static ssize_t store_jint_idle_ticks(struct device *dev,
-                                    struct device_attribute *attr,
-                                    const char *buf,
-                                    size_t count)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       u16 v = 0;
-       int ret;
-
-       ret = ipath_parse_ushort(buf, &v);
-       if (ret < 0)
-               ipath_dev_err(dd, "invalid jint_idle_ticks.\n");
-       else
-               dd->ipath_f_config_jint(dd, v, dd->ipath_jint_max_packets);
-
-       return ret;
-}
-
-#define DEVICE_COUNTER(name, attr) \
-       static ssize_t show_counter_##name(struct device *dev, \
-                                          struct device_attribute *attr, \
-                                          char *buf) \
-       { \
-               struct ipath_devdata *dd = dev_get_drvdata(dev); \
-               return scnprintf(\
-                       buf, PAGE_SIZE, "%llu\n", (unsigned long long) \
-                       ipath_snap_cntr( \
-                               dd, offsetof(struct infinipath_counters, \
-                                            attr) / sizeof(u64)));     \
-       } \
-       static DEVICE_ATTR(name, S_IRUGO, show_counter_##name, NULL);
-
-DEVICE_COUNTER(ib_link_downeds, IBLinkDownedCnt);
-DEVICE_COUNTER(ib_link_err_recoveries, IBLinkErrRecoveryCnt);
-DEVICE_COUNTER(ib_status_changes, IBStatusChangeCnt);
-DEVICE_COUNTER(ib_symbol_errs, IBSymbolErrCnt);
-DEVICE_COUNTER(lb_flow_stalls, LBFlowStallCnt);
-DEVICE_COUNTER(lb_ints, LBIntCnt);
-DEVICE_COUNTER(rx_bad_formats, RxBadFormatCnt);
-DEVICE_COUNTER(rx_buf_ovfls, RxBufOvflCnt);
-DEVICE_COUNTER(rx_data_pkts, RxDataPktCnt);
-DEVICE_COUNTER(rx_dropped_pkts, RxDroppedPktCnt);
-DEVICE_COUNTER(rx_dwords, RxDwordCnt);
-DEVICE_COUNTER(rx_ebps, RxEBPCnt);
-DEVICE_COUNTER(rx_flow_ctrl_errs, RxFlowCtrlErrCnt);
-DEVICE_COUNTER(rx_flow_pkts, RxFlowPktCnt);
-DEVICE_COUNTER(rx_icrc_errs, RxICRCErrCnt);
-DEVICE_COUNTER(rx_len_errs, RxLenErrCnt);
-DEVICE_COUNTER(rx_link_problems, RxLinkProblemCnt);
-DEVICE_COUNTER(rx_lpcrc_errs, RxLPCRCErrCnt);
-DEVICE_COUNTER(rx_max_min_len_errs, RxMaxMinLenErrCnt);
-DEVICE_COUNTER(rx_p0_hdr_egr_ovfls, RxP0HdrEgrOvflCnt);
-DEVICE_COUNTER(rx_p1_hdr_egr_ovfls, RxP1HdrEgrOvflCnt);
-DEVICE_COUNTER(rx_p2_hdr_egr_ovfls, RxP2HdrEgrOvflCnt);
-DEVICE_COUNTER(rx_p3_hdr_egr_ovfls, RxP3HdrEgrOvflCnt);
-DEVICE_COUNTER(rx_p4_hdr_egr_ovfls, RxP4HdrEgrOvflCnt);
-DEVICE_COUNTER(rx_p5_hdr_egr_ovfls, RxP5HdrEgrOvflCnt);
-DEVICE_COUNTER(rx_p6_hdr_egr_ovfls, RxP6HdrEgrOvflCnt);
-DEVICE_COUNTER(rx_p7_hdr_egr_ovfls, RxP7HdrEgrOvflCnt);
-DEVICE_COUNTER(rx_p8_hdr_egr_ovfls, RxP8HdrEgrOvflCnt);
-DEVICE_COUNTER(rx_pkey_mismatches, RxPKeyMismatchCnt);
-DEVICE_COUNTER(rx_tid_full_errs, RxTIDFullErrCnt);
-DEVICE_COUNTER(rx_tid_valid_errs, RxTIDValidErrCnt);
-DEVICE_COUNTER(rx_vcrc_errs, RxVCRCErrCnt);
-DEVICE_COUNTER(tx_data_pkts, TxDataPktCnt);
-DEVICE_COUNTER(tx_dropped_pkts, TxDroppedPktCnt);
-DEVICE_COUNTER(tx_dwords, TxDwordCnt);
-DEVICE_COUNTER(tx_flow_pkts, TxFlowPktCnt);
-DEVICE_COUNTER(tx_flow_stalls, TxFlowStallCnt);
-DEVICE_COUNTER(tx_len_errs, TxLenErrCnt);
-DEVICE_COUNTER(tx_max_min_len_errs, TxMaxMinLenErrCnt);
-DEVICE_COUNTER(tx_underruns, TxUnderrunCnt);
-DEVICE_COUNTER(tx_unsup_vl_errs, TxUnsupVLErrCnt);
-
-static struct attribute *dev_counter_attributes[] = {
-       &dev_attr_ib_link_downeds.attr,
-       &dev_attr_ib_link_err_recoveries.attr,
-       &dev_attr_ib_status_changes.attr,
-       &dev_attr_ib_symbol_errs.attr,
-       &dev_attr_lb_flow_stalls.attr,
-       &dev_attr_lb_ints.attr,
-       &dev_attr_rx_bad_formats.attr,
-       &dev_attr_rx_buf_ovfls.attr,
-       &dev_attr_rx_data_pkts.attr,
-       &dev_attr_rx_dropped_pkts.attr,
-       &dev_attr_rx_dwords.attr,
-       &dev_attr_rx_ebps.attr,
-       &dev_attr_rx_flow_ctrl_errs.attr,
-       &dev_attr_rx_flow_pkts.attr,
-       &dev_attr_rx_icrc_errs.attr,
-       &dev_attr_rx_len_errs.attr,
-       &dev_attr_rx_link_problems.attr,
-       &dev_attr_rx_lpcrc_errs.attr,
-       &dev_attr_rx_max_min_len_errs.attr,
-       &dev_attr_rx_p0_hdr_egr_ovfls.attr,
-       &dev_attr_rx_p1_hdr_egr_ovfls.attr,
-       &dev_attr_rx_p2_hdr_egr_ovfls.attr,
-       &dev_attr_rx_p3_hdr_egr_ovfls.attr,
-       &dev_attr_rx_p4_hdr_egr_ovfls.attr,
-       &dev_attr_rx_p5_hdr_egr_ovfls.attr,
-       &dev_attr_rx_p6_hdr_egr_ovfls.attr,
-       &dev_attr_rx_p7_hdr_egr_ovfls.attr,
-       &dev_attr_rx_p8_hdr_egr_ovfls.attr,
-       &dev_attr_rx_pkey_mismatches.attr,
-       &dev_attr_rx_tid_full_errs.attr,
-       &dev_attr_rx_tid_valid_errs.attr,
-       &dev_attr_rx_vcrc_errs.attr,
-       &dev_attr_tx_data_pkts.attr,
-       &dev_attr_tx_dropped_pkts.attr,
-       &dev_attr_tx_dwords.attr,
-       &dev_attr_tx_flow_pkts.attr,
-       &dev_attr_tx_flow_stalls.attr,
-       &dev_attr_tx_len_errs.attr,
-       &dev_attr_tx_max_min_len_errs.attr,
-       &dev_attr_tx_underruns.attr,
-       &dev_attr_tx_unsup_vl_errs.attr,
-       NULL
-};
-
-static struct attribute_group dev_counter_attr_group = {
-       .name = "counters",
-       .attrs = dev_counter_attributes
-};
-
-static ssize_t store_reset(struct device *dev,
-                        struct device_attribute *attr,
-                         const char *buf,
-                         size_t count)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       int ret;
-
-       if (count < 5 || memcmp(buf, "reset", 5)) {
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       if (dd->ipath_flags & IPATH_DISABLED) {
-               /*
-                * post-reset init would re-enable interrupts, etc.
-                * so don't allow reset on disabled devices.  Not
-                * perfect error, but about the best choice.
-                */
-               dev_info(dev,"Unit %d is disabled, can't reset\n",
-                        dd->ipath_unit);
-               ret = -EINVAL;
-               goto bail;
-       }
-       ret = ipath_reset_device(dd->ipath_unit);
-bail:
-       return ret<0 ? ret : count;
-}
-
-static ssize_t store_link_state(struct device *dev,
-                        struct device_attribute *attr,
-                         const char *buf,
-                         size_t count)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       int ret, r;
-       u16 state;
-
-       ret = ipath_parse_ushort(buf, &state);
-       if (ret < 0)
-               goto invalid;
-
-       r = ipath_set_linkstate(dd, state);
-       if (r < 0) {
-               ret = r;
-               goto bail;
-       }
-
-       goto bail;
-invalid:
-       ipath_dev_err(dd, "attempt to set invalid link state\n");
-bail:
-       return ret;
-}
-
-static ssize_t show_mtu(struct device *dev,
-                        struct device_attribute *attr,
-                        char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_ibmtu);
-}
-
-static ssize_t store_mtu(struct device *dev,
-                        struct device_attribute *attr,
-                         const char *buf,
-                         size_t count)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       ssize_t ret;
-       u16 mtu = 0;
-       int r;
-
-       ret = ipath_parse_ushort(buf, &mtu);
-       if (ret < 0)
-               goto invalid;
-
-       r = ipath_set_mtu(dd, mtu);
-       if (r < 0)
-               ret = r;
-
-       goto bail;
-invalid:
-       ipath_dev_err(dd, "attempt to set invalid MTU\n");
-bail:
-       return ret;
-}
-
-static ssize_t show_enabled(struct device *dev,
-                        struct device_attribute *attr,
-                        char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       return scnprintf(buf, PAGE_SIZE, "%u\n",
-                        (dd->ipath_flags & IPATH_DISABLED) ? 0 : 1);
-}
-
-static ssize_t store_enabled(struct device *dev,
-                        struct device_attribute *attr,
-                         const char *buf,
-                         size_t count)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       ssize_t ret;
-       u16 enable = 0;
-
-       ret = ipath_parse_ushort(buf, &enable);
-       if (ret < 0) {
-               ipath_dev_err(dd, "attempt to use non-numeric on enable\n");
-               goto bail;
-       }
-
-       if (enable) {
-               if (!(dd->ipath_flags & IPATH_DISABLED))
-                       goto bail;
-
-               dev_info(dev, "Enabling unit %d\n", dd->ipath_unit);
-               /* same as post-reset */
-               ret = ipath_init_chip(dd, 1);
-               if (ret)
-                       ipath_dev_err(dd, "Failed to enable unit %d\n",
-                                     dd->ipath_unit);
-               else {
-                       dd->ipath_flags &= ~IPATH_DISABLED;
-                       *dd->ipath_statusp &= ~IPATH_STATUS_ADMIN_DISABLED;
-               }
-       }
-       else if (!(dd->ipath_flags & IPATH_DISABLED)) {
-               dev_info(dev, "Disabling unit %d\n", dd->ipath_unit);
-               ipath_shutdown_device(dd);
-               dd->ipath_flags |= IPATH_DISABLED;
-               *dd->ipath_statusp |= IPATH_STATUS_ADMIN_DISABLED;
-       }
-
-bail:
-       return ret;
-}
-
-static ssize_t store_rx_pol_inv(struct device *dev,
-                         struct device_attribute *attr,
-                         const char *buf,
-                         size_t count)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       int ret, r;
-       u16 val;
-
-       ret = ipath_parse_ushort(buf, &val);
-       if (ret < 0)
-               goto invalid;
-
-       r = ipath_set_rx_pol_inv(dd, val);
-       if (r < 0) {
-               ret = r;
-               goto bail;
-       }
-
-       goto bail;
-invalid:
-       ipath_dev_err(dd, "attempt to set invalid Rx Polarity invert\n");
-bail:
-       return ret;
-}
-
-static ssize_t store_led_override(struct device *dev,
-                         struct device_attribute *attr,
-                         const char *buf,
-                         size_t count)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       int ret;
-       u16 val;
-
-       ret = ipath_parse_ushort(buf, &val);
-       if (ret > 0)
-               ipath_set_led_override(dd, val);
-       else
-               ipath_dev_err(dd, "attempt to set invalid LED override\n");
-       return ret;
-}
-
-static ssize_t show_logged_errs(struct device *dev,
-                               struct device_attribute *attr,
-                               char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       int idx, count;
-
-       /* force consistency with actual EEPROM */
-       if (ipath_update_eeprom_log(dd) != 0)
-               return -ENXIO;
-
-       count = 0;
-       for (idx = 0; idx < IPATH_EEP_LOG_CNT; ++idx) {
-               count += scnprintf(buf + count, PAGE_SIZE - count, "%d%c",
-                       dd->ipath_eep_st_errs[idx],
-                       idx == (IPATH_EEP_LOG_CNT - 1) ? '\n' : ' ');
-       }
-
-       return count;
-}
-
-/*
- * New sysfs entries to control various IB config. These all turn into
- * accesses via ipath_f_get/set_ib_cfg.
- *
- * Get/Set heartbeat enable. Or of 1=enabled, 2=auto
- */
-static ssize_t show_hrtbt_enb(struct device *dev,
-                        struct device_attribute *attr,
-                        char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       int ret;
-
-       ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_HRTBT);
-       if (ret >= 0)
-               ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
-       return ret;
-}
-
-static ssize_t store_hrtbt_enb(struct device *dev,
-                         struct device_attribute *attr,
-                         const char *buf,
-                         size_t count)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       int ret, r;
-       u16 val;
-
-       ret = ipath_parse_ushort(buf, &val);
-       if (ret >= 0 && val > 3)
-               ret = -EINVAL;
-       if (ret < 0) {
-               ipath_dev_err(dd, "attempt to set invalid Heartbeat enable\n");
-               goto bail;
-       }
-
-       /*
-        * Set the "intentional" heartbeat enable per either of
-        * "Enable" and "Auto", as these are normally set together.
-        * This bit is consulted when leaving loopback mode,
-        * because entering loopback mode overrides it and automatically
-        * disables heartbeat.
-        */
-       r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT, val);
-       if (r < 0)
-               ret = r;
-       else if (val == IPATH_IB_HRTBT_OFF)
-               dd->ipath_flags |= IPATH_NO_HRTBT;
-       else
-               dd->ipath_flags &= ~IPATH_NO_HRTBT;
-
-bail:
-       return ret;
-}
-
-/*
- * Get/Set Link-widths enabled. Or of 1=1x, 2=4x (this is human/IB centric,
- * _not_ the particular encoding of any given chip)
- */
-static ssize_t show_lwid_enb(struct device *dev,
-                        struct device_attribute *attr,
-                        char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       int ret;
-
-       ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LWID_ENB);
-       if (ret >= 0)
-               ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
-       return ret;
-}
-
-static ssize_t store_lwid_enb(struct device *dev,
-                         struct device_attribute *attr,
-                         const char *buf,
-                         size_t count)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       int ret, r;
-       u16 val;
-
-       ret = ipath_parse_ushort(buf, &val);
-       if (ret >= 0 && (val == 0 || val > 3))
-               ret = -EINVAL;
-       if (ret < 0) {
-               ipath_dev_err(dd,
-                       "attempt to set invalid Link Width (enable)\n");
-               goto bail;
-       }
-
-       r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LWID_ENB, val);
-       if (r < 0)
-               ret = r;
-
-bail:
-       return ret;
-}
-
-/* Get current link width */
-static ssize_t show_lwid(struct device *dev,
-                        struct device_attribute *attr,
-                        char *buf)
-
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       int ret;
-
-       ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LWID);
-       if (ret >= 0)
-               ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
-       return ret;
-}
-
-/*
- * Get/Set Link-speeds enabled. Or of 1=SDR 2=DDR.
- */
-static ssize_t show_spd_enb(struct device *dev,
-                        struct device_attribute *attr,
-                        char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       int ret;
-
-       ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_SPD_ENB);
-       if (ret >= 0)
-               ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
-       return ret;
-}
-
-static ssize_t store_spd_enb(struct device *dev,
-                         struct device_attribute *attr,
-                         const char *buf,
-                         size_t count)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       int ret, r;
-       u16 val;
-
-       ret = ipath_parse_ushort(buf, &val);
-       if (ret >= 0 && (val == 0 || val > (IPATH_IB_SDR | IPATH_IB_DDR)))
-               ret = -EINVAL;
-       if (ret < 0) {
-               ipath_dev_err(dd,
-                       "attempt to set invalid Link Speed (enable)\n");
-               goto bail;
-       }
-
-       r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_SPD_ENB, val);
-       if (r < 0)
-               ret = r;
-
-bail:
-       return ret;
-}
-
-/* Get current link speed */
-static ssize_t show_spd(struct device *dev,
-                        struct device_attribute *attr,
-                        char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       int ret;
-
-       ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_SPD);
-       if (ret >= 0)
-               ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
-       return ret;
-}
-
-/*
- * Get/Set RX polarity-invert enable. 0=no, 1=yes.
- */
-static ssize_t show_rx_polinv_enb(struct device *dev,
-                        struct device_attribute *attr,
-                        char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       int ret;
-
-       ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_RXPOL_ENB);
-       if (ret >= 0)
-               ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
-       return ret;
-}
-
-static ssize_t store_rx_polinv_enb(struct device *dev,
-                         struct device_attribute *attr,
-                         const char *buf,
-                         size_t count)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       int ret, r;
-       u16 val;
-
-       ret = ipath_parse_ushort(buf, &val);
-       if (ret >= 0 && val > 1) {
-               ipath_dev_err(dd,
-                       "attempt to set invalid Rx Polarity (enable)\n");
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_RXPOL_ENB, val);
-       if (r < 0)
-               ret = r;
-
-bail:
-       return ret;
-}
-
-/*
- * Get/Set RX lane-reversal enable. 0=no, 1=yes.
- */
-static ssize_t show_lanerev_enb(struct device *dev,
-                        struct device_attribute *attr,
-                        char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       int ret;
-
-       ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LREV_ENB);
-       if (ret >= 0)
-               ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
-       return ret;
-}
-
-static ssize_t store_lanerev_enb(struct device *dev,
-                         struct device_attribute *attr,
-                         const char *buf,
-                         size_t count)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       int ret, r;
-       u16 val;
-
-       ret = ipath_parse_ushort(buf, &val);
-       if (ret >= 0 && val > 1) {
-               ret = -EINVAL;
-               ipath_dev_err(dd,
-                       "attempt to set invalid Lane reversal (enable)\n");
-               goto bail;
-       }
-
-       r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LREV_ENB, val);
-       if (r < 0)
-               ret = r;
-
-bail:
-       return ret;
-}
-
-static DRIVER_ATTR(num_units, S_IRUGO, show_num_units, NULL);
-static DRIVER_ATTR(version, S_IRUGO, show_version, NULL);
-
-static struct attribute *driver_attributes[] = {
-       &driver_attr_num_units.attr,
-       &driver_attr_version.attr,
-       NULL
-};
-
-static struct attribute_group driver_attr_group = {
-       .attrs = driver_attributes
-};
-
-static ssize_t store_tempsense(struct device *dev,
-                              struct device_attribute *attr,
-                              const char *buf,
-                              size_t count)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       int ret, stat;
-       u16 val;
-
-       ret = ipath_parse_ushort(buf, &val);
-       if (ret <= 0) {
-               ipath_dev_err(dd, "attempt to set invalid tempsense config\n");
-               goto bail;
-       }
-       /* If anything but the highest limit, enable T_CRIT_A "interrupt" */
-       stat = ipath_tempsense_write(dd, 9, (val == 0x7f7f) ? 0x80 : 0);
-       if (stat) {
-               ipath_dev_err(dd, "Unable to set tempsense config\n");
-               ret = -1;
-               goto bail;
-       }
-       stat = ipath_tempsense_write(dd, 0xB, (u8) (val & 0xFF));
-       if (stat) {
-               ipath_dev_err(dd, "Unable to set local Tcrit\n");
-               ret = -1;
-               goto bail;
-       }
-       stat = ipath_tempsense_write(dd, 0xD, (u8) (val >> 8));
-       if (stat) {
-               ipath_dev_err(dd, "Unable to set remote Tcrit\n");
-               ret = -1;
-               goto bail;
-       }
-
-bail:
-       return ret;
-}
-
-/*
- * dump tempsense regs. in decimal, to ease shell-scripts.
- */
-static ssize_t show_tempsense(struct device *dev,
-                             struct device_attribute *attr,
-                             char *buf)
-{
-       struct ipath_devdata *dd = dev_get_drvdata(dev);
-       int ret;
-       int idx;
-       u8 regvals[8];
-
-       ret = -ENXIO;
-       for (idx = 0; idx < 8; ++idx) {
-               if (idx == 6)
-                       continue;
-               ret = ipath_tempsense_read(dd, idx);
-               if (ret < 0)
-                       break;
-               regvals[idx] = ret;
-       }
-       if (idx == 8)
-               ret = scnprintf(buf, PAGE_SIZE, "%d %d %02X %02X %d %d\n",
-                       *(signed char *)(regvals),
-                       *(signed char *)(regvals + 1),
-                       regvals[2], regvals[3],
-                       *(signed char *)(regvals + 5),
-                       *(signed char *)(regvals + 7));
-       return ret;
-}
-
-const struct attribute_group *ipath_driver_attr_groups[] = {
-       &driver_attr_group,
-       NULL,
-};
-
-static DEVICE_ATTR(guid, S_IWUSR | S_IRUGO, show_guid, store_guid);
-static DEVICE_ATTR(lmc, S_IWUSR | S_IRUGO, show_lmc, store_lmc);
-static DEVICE_ATTR(lid, S_IWUSR | S_IRUGO, show_lid, store_lid);
-static DEVICE_ATTR(link_state, S_IWUSR, NULL, store_link_state);
-static DEVICE_ATTR(mlid, S_IWUSR | S_IRUGO, show_mlid, store_mlid);
-static DEVICE_ATTR(mtu, S_IWUSR | S_IRUGO, show_mtu, store_mtu);
-static DEVICE_ATTR(enabled, S_IWUSR | S_IRUGO, show_enabled, store_enabled);
-static DEVICE_ATTR(nguid, S_IRUGO, show_nguid, NULL);
-static DEVICE_ATTR(nports, S_IRUGO, show_nports, NULL);
-static DEVICE_ATTR(reset, S_IWUSR, NULL, store_reset);
-static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL);
-static DEVICE_ATTR(status, S_IRUGO, show_status, NULL);
-static DEVICE_ATTR(status_str, S_IRUGO, show_status_str, NULL);
-static DEVICE_ATTR(boardversion, S_IRUGO, show_boardversion, NULL);
-static DEVICE_ATTR(unit, S_IRUGO, show_unit, NULL);
-static DEVICE_ATTR(rx_pol_inv, S_IWUSR, NULL, store_rx_pol_inv);
-static DEVICE_ATTR(led_override, S_IWUSR, NULL, store_led_override);
-static DEVICE_ATTR(logged_errors, S_IRUGO, show_logged_errs, NULL);
-static DEVICE_ATTR(localbus_info, S_IRUGO, show_localbus_info, NULL);
-static DEVICE_ATTR(jint_max_packets, S_IWUSR | S_IRUGO,
-                  show_jint_max_packets, store_jint_max_packets);
-static DEVICE_ATTR(jint_idle_ticks, S_IWUSR | S_IRUGO,
-                  show_jint_idle_ticks, store_jint_idle_ticks);
-static DEVICE_ATTR(tempsense, S_IWUSR | S_IRUGO,
-                  show_tempsense, store_tempsense);
-
-static struct attribute *dev_attributes[] = {
-       &dev_attr_guid.attr,
-       &dev_attr_lmc.attr,
-       &dev_attr_lid.attr,
-       &dev_attr_link_state.attr,
-       &dev_attr_mlid.attr,
-       &dev_attr_mtu.attr,
-       &dev_attr_nguid.attr,
-       &dev_attr_nports.attr,
-       &dev_attr_serial.attr,
-       &dev_attr_status.attr,
-       &dev_attr_status_str.attr,
-       &dev_attr_boardversion.attr,
-       &dev_attr_unit.attr,
-       &dev_attr_enabled.attr,
-       &dev_attr_rx_pol_inv.attr,
-       &dev_attr_led_override.attr,
-       &dev_attr_logged_errors.attr,
-       &dev_attr_tempsense.attr,
-       &dev_attr_localbus_info.attr,
-       NULL
-};
-
-static struct attribute_group dev_attr_group = {
-       .attrs = dev_attributes
-};
-
-static DEVICE_ATTR(hrtbt_enable, S_IWUSR | S_IRUGO, show_hrtbt_enb,
-                  store_hrtbt_enb);
-static DEVICE_ATTR(link_width_enable, S_IWUSR | S_IRUGO, show_lwid_enb,
-                  store_lwid_enb);
-static DEVICE_ATTR(link_width, S_IRUGO, show_lwid, NULL);
-static DEVICE_ATTR(link_speed_enable, S_IWUSR | S_IRUGO, show_spd_enb,
-                  store_spd_enb);
-static DEVICE_ATTR(link_speed, S_IRUGO, show_spd, NULL);
-static DEVICE_ATTR(rx_pol_inv_enable, S_IWUSR | S_IRUGO, show_rx_polinv_enb,
-                  store_rx_polinv_enb);
-static DEVICE_ATTR(rx_lane_rev_enable, S_IWUSR | S_IRUGO, show_lanerev_enb,
-                  store_lanerev_enb);
-
-static struct attribute *dev_ibcfg_attributes[] = {
-       &dev_attr_hrtbt_enable.attr,
-       &dev_attr_link_width_enable.attr,
-       &dev_attr_link_width.attr,
-       &dev_attr_link_speed_enable.attr,
-       &dev_attr_link_speed.attr,
-       &dev_attr_rx_pol_inv_enable.attr,
-       &dev_attr_rx_lane_rev_enable.attr,
-       NULL
-};
-
-static struct attribute_group dev_ibcfg_attr_group = {
-       .attrs = dev_ibcfg_attributes
-};
-
-/**
- * ipath_expose_reset - create a device reset file
- * @dev: the device structure
- *
- * Only expose a file that lets us reset the device after someone
- * enters diag mode.  A device reset is quite likely to crash the
- * machine entirely, so we don't want to normally make it
- * available.
- *
- * Called with ipath_mutex held.
- */
-int ipath_expose_reset(struct device *dev)
-{
-       static int exposed;
-       int ret;
-
-       if (!exposed) {
-               ret = device_create_file(dev, &dev_attr_reset);
-               exposed = 1;
-       }
-       else
-               ret = 0;
-
-       return ret;
-}
-
-int ipath_device_create_group(struct device *dev, struct ipath_devdata *dd)
-{
-       int ret;
-
-       ret = sysfs_create_group(&dev->kobj, &dev_attr_group);
-       if (ret)
-               goto bail;
-
-       ret = sysfs_create_group(&dev->kobj, &dev_counter_attr_group);
-       if (ret)
-               goto bail_attrs;
-
-       if (dd->ipath_flags & IPATH_HAS_MULT_IB_SPEED) {
-               ret = device_create_file(dev, &dev_attr_jint_idle_ticks);
-               if (ret)
-                       goto bail_counter;
-               ret = device_create_file(dev, &dev_attr_jint_max_packets);
-               if (ret)
-                       goto bail_idle;
-
-               ret = sysfs_create_group(&dev->kobj, &dev_ibcfg_attr_group);
-               if (ret)
-                       goto bail_max;
-       }
-
-       return 0;
-
-bail_max:
-       device_remove_file(dev, &dev_attr_jint_max_packets);
-bail_idle:
-       device_remove_file(dev, &dev_attr_jint_idle_ticks);
-bail_counter:
-       sysfs_remove_group(&dev->kobj, &dev_counter_attr_group);
-bail_attrs:
-       sysfs_remove_group(&dev->kobj, &dev_attr_group);
-bail:
-       return ret;
-}
-
-void ipath_device_remove_group(struct device *dev, struct ipath_devdata *dd)
-{
-       sysfs_remove_group(&dev->kobj, &dev_counter_attr_group);
-
-       if (dd->ipath_flags & IPATH_HAS_MULT_IB_SPEED) {
-               sysfs_remove_group(&dev->kobj, &dev_ibcfg_attr_group);
-               device_remove_file(dev, &dev_attr_jint_idle_ticks);
-               device_remove_file(dev, &dev_attr_jint_max_packets);
-       }
-
-       sysfs_remove_group(&dev->kobj, &dev_attr_group);
-
-       device_remove_file(dev, &dev_attr_reset);
-}
diff --git a/drivers/infiniband/hw/ipath/ipath_uc.c b/drivers/infiniband/hw/ipath/ipath_uc.c
deleted file mode 100644 (file)
index 22e6099..0000000
+++ /dev/null
@@ -1,547 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "ipath_verbs.h"
-#include "ipath_kernel.h"
-
-/* cut down ridiculously long IB macro names */
-#define OP(x) IB_OPCODE_UC_##x
-
-/**
- * ipath_make_uc_req - construct a request packet (SEND, RDMA write)
- * @qp: a pointer to the QP
- *
- * Return 1 if constructed; otherwise, return 0.
- */
-int ipath_make_uc_req(struct ipath_qp *qp)
-{
-       struct ipath_other_headers *ohdr;
-       struct ipath_swqe *wqe;
-       unsigned long flags;
-       u32 hwords;
-       u32 bth0;
-       u32 len;
-       u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
-       int ret = 0;
-
-       spin_lock_irqsave(&qp->s_lock, flags);
-
-       if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)) {
-               if (!(ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND))
-                       goto bail;
-               /* We are in the error state, flush the work request. */
-               if (qp->s_last == qp->s_head)
-                       goto bail;
-               /* If DMAs are in progress, we can't flush immediately. */
-               if (atomic_read(&qp->s_dma_busy)) {
-                       qp->s_flags |= IPATH_S_WAIT_DMA;
-                       goto bail;
-               }
-               wqe = get_swqe_ptr(qp, qp->s_last);
-               ipath_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
-               goto done;
-       }
-
-       ohdr = &qp->s_hdr.u.oth;
-       if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
-               ohdr = &qp->s_hdr.u.l.oth;
-
-       /* header size in 32-bit words LRH+BTH = (8+12)/4. */
-       hwords = 5;
-       bth0 = 1 << 22; /* Set M bit */
-
-       /* Get the next send request. */
-       wqe = get_swqe_ptr(qp, qp->s_cur);
-       qp->s_wqe = NULL;
-       switch (qp->s_state) {
-       default:
-               if (!(ib_ipath_state_ops[qp->state] &
-                   IPATH_PROCESS_NEXT_SEND_OK))
-                       goto bail;
-               /* Check if send work queue is empty. */
-               if (qp->s_cur == qp->s_head)
-                       goto bail;
-               /*
-                * Start a new request.
-                */
-               qp->s_psn = wqe->psn = qp->s_next_psn;
-               qp->s_sge.sge = wqe->sg_list[0];
-               qp->s_sge.sg_list = wqe->sg_list + 1;
-               qp->s_sge.num_sge = wqe->wr.num_sge;
-               qp->s_len = len = wqe->length;
-               switch (wqe->wr.opcode) {
-               case IB_WR_SEND:
-               case IB_WR_SEND_WITH_IMM:
-                       if (len > pmtu) {
-                               qp->s_state = OP(SEND_FIRST);
-                               len = pmtu;
-                               break;
-                       }
-                       if (wqe->wr.opcode == IB_WR_SEND)
-                               qp->s_state = OP(SEND_ONLY);
-                       else {
-                               qp->s_state =
-                                       OP(SEND_ONLY_WITH_IMMEDIATE);
-                               /* Immediate data comes after the BTH */
-                               ohdr->u.imm_data = wqe->wr.ex.imm_data;
-                               hwords += 1;
-                       }
-                       if (wqe->wr.send_flags & IB_SEND_SOLICITED)
-                               bth0 |= 1 << 23;
-                       qp->s_wqe = wqe;
-                       if (++qp->s_cur >= qp->s_size)
-                               qp->s_cur = 0;
-                       break;
-
-               case IB_WR_RDMA_WRITE:
-               case IB_WR_RDMA_WRITE_WITH_IMM:
-                       ohdr->u.rc.reth.vaddr =
-                               cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
-                       ohdr->u.rc.reth.rkey =
-                               cpu_to_be32(wqe->wr.wr.rdma.rkey);
-                       ohdr->u.rc.reth.length = cpu_to_be32(len);
-                       hwords += sizeof(struct ib_reth) / 4;
-                       if (len > pmtu) {
-                               qp->s_state = OP(RDMA_WRITE_FIRST);
-                               len = pmtu;
-                               break;
-                       }
-                       if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
-                               qp->s_state = OP(RDMA_WRITE_ONLY);
-                       else {
-                               qp->s_state =
-                                       OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
-                               /* Immediate data comes after the RETH */
-                               ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
-                               hwords += 1;
-                               if (wqe->wr.send_flags & IB_SEND_SOLICITED)
-                                       bth0 |= 1 << 23;
-                       }
-                       qp->s_wqe = wqe;
-                       if (++qp->s_cur >= qp->s_size)
-                               qp->s_cur = 0;
-                       break;
-
-               default:
-                       goto bail;
-               }
-               break;
-
-       case OP(SEND_FIRST):
-               qp->s_state = OP(SEND_MIDDLE);
-               /* FALLTHROUGH */
-       case OP(SEND_MIDDLE):
-               len = qp->s_len;
-               if (len > pmtu) {
-                       len = pmtu;
-                       break;
-               }
-               if (wqe->wr.opcode == IB_WR_SEND)
-                       qp->s_state = OP(SEND_LAST);
-               else {
-                       qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
-                       /* Immediate data comes after the BTH */
-                       ohdr->u.imm_data = wqe->wr.ex.imm_data;
-                       hwords += 1;
-               }
-               if (wqe->wr.send_flags & IB_SEND_SOLICITED)
-                       bth0 |= 1 << 23;
-               qp->s_wqe = wqe;
-               if (++qp->s_cur >= qp->s_size)
-                       qp->s_cur = 0;
-               break;
-
-       case OP(RDMA_WRITE_FIRST):
-               qp->s_state = OP(RDMA_WRITE_MIDDLE);
-               /* FALLTHROUGH */
-       case OP(RDMA_WRITE_MIDDLE):
-               len = qp->s_len;
-               if (len > pmtu) {
-                       len = pmtu;
-                       break;
-               }
-               if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
-                       qp->s_state = OP(RDMA_WRITE_LAST);
-               else {
-                       qp->s_state =
-                               OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
-                       /* Immediate data comes after the BTH */
-                       ohdr->u.imm_data = wqe->wr.ex.imm_data;
-                       hwords += 1;
-                       if (wqe->wr.send_flags & IB_SEND_SOLICITED)
-                               bth0 |= 1 << 23;
-               }
-               qp->s_wqe = wqe;
-               if (++qp->s_cur >= qp->s_size)
-                       qp->s_cur = 0;
-               break;
-       }
-       qp->s_len -= len;
-       qp->s_hdrwords = hwords;
-       qp->s_cur_sge = &qp->s_sge;
-       qp->s_cur_size = len;
-       ipath_make_ruc_header(to_idev(qp->ibqp.device),
-                             qp, ohdr, bth0 | (qp->s_state << 24),
-                             qp->s_next_psn++ & IPATH_PSN_MASK);
-done:
-       ret = 1;
-       goto unlock;
-
-bail:
-       qp->s_flags &= ~IPATH_S_BUSY;
-unlock:
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-       return ret;
-}
-
-/**
- * ipath_uc_rcv - handle an incoming UC packet
- * @dev: the device the packet came in on
- * @hdr: the header of the packet
- * @has_grh: true if the packet has a GRH
- * @data: the packet data
- * @tlen: the length of the packet
- * @qp: the QP for this packet.
- *
- * This is called from ipath_qp_rcv() to process an incoming UC packet
- * for the given QP.
- * Called at interrupt level.
- */
-void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
-                 int has_grh, void *data, u32 tlen, struct ipath_qp *qp)
-{
-       struct ipath_other_headers *ohdr;
-       int opcode;
-       u32 hdrsize;
-       u32 psn;
-       u32 pad;
-       struct ib_wc wc;
-       u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
-       struct ib_reth *reth;
-       int header_in_data;
-
-       /* Validate the SLID. See Ch. 9.6.1.5 */
-       if (unlikely(be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid))
-               goto done;
-
-       /* Check for GRH */
-       if (!has_grh) {
-               ohdr = &hdr->u.oth;
-               hdrsize = 8 + 12;       /* LRH + BTH */
-               psn = be32_to_cpu(ohdr->bth[2]);
-               header_in_data = 0;
-       } else {
-               ohdr = &hdr->u.l.oth;
-               hdrsize = 8 + 40 + 12;  /* LRH + GRH + BTH */
-               /*
-                * The header with GRH is 60 bytes and the
-                * core driver sets the eager header buffer
-                * size to 56 bytes so the last 4 bytes of
-                * the BTH header (PSN) is in the data buffer.
-                */
-               header_in_data = dev->dd->ipath_rcvhdrentsize == 16;
-               if (header_in_data) {
-                       psn = be32_to_cpu(((__be32 *) data)[0]);
-                       data += sizeof(__be32);
-               } else
-                       psn = be32_to_cpu(ohdr->bth[2]);
-       }
-       /*
-        * The opcode is in the low byte when it's in network order
-        * (top byte when in host order).
-        */
-       opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
-
-       memset(&wc, 0, sizeof wc);
-
-       /* Compare the PSN versus the expected PSN. */
-       if (unlikely(ipath_cmp24(psn, qp->r_psn) != 0)) {
-               /*
-                * Handle a sequence error.
-                * Silently drop any current message.
-                */
-               qp->r_psn = psn;
-       inv:
-               qp->r_state = OP(SEND_LAST);
-               switch (opcode) {
-               case OP(SEND_FIRST):
-               case OP(SEND_ONLY):
-               case OP(SEND_ONLY_WITH_IMMEDIATE):
-                       goto send_first;
-
-               case OP(RDMA_WRITE_FIRST):
-               case OP(RDMA_WRITE_ONLY):
-               case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
-                       goto rdma_first;
-
-               default:
-                       dev->n_pkt_drops++;
-                       goto done;
-               }
-       }
-
-       /* Check for opcode sequence errors. */
-       switch (qp->r_state) {
-       case OP(SEND_FIRST):
-       case OP(SEND_MIDDLE):
-               if (opcode == OP(SEND_MIDDLE) ||
-                   opcode == OP(SEND_LAST) ||
-                   opcode == OP(SEND_LAST_WITH_IMMEDIATE))
-                       break;
-               goto inv;
-
-       case OP(RDMA_WRITE_FIRST):
-       case OP(RDMA_WRITE_MIDDLE):
-               if (opcode == OP(RDMA_WRITE_MIDDLE) ||
-                   opcode == OP(RDMA_WRITE_LAST) ||
-                   opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
-                       break;
-               goto inv;
-
-       default:
-               if (opcode == OP(SEND_FIRST) ||
-                   opcode == OP(SEND_ONLY) ||
-                   opcode == OP(SEND_ONLY_WITH_IMMEDIATE) ||
-                   opcode == OP(RDMA_WRITE_FIRST) ||
-                   opcode == OP(RDMA_WRITE_ONLY) ||
-                   opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
-                       break;
-               goto inv;
-       }
-
-       /* OK, process the packet. */
-       switch (opcode) {
-       case OP(SEND_FIRST):
-       case OP(SEND_ONLY):
-       case OP(SEND_ONLY_WITH_IMMEDIATE):
-       send_first:
-               if (qp->r_flags & IPATH_R_REUSE_SGE) {
-                       qp->r_flags &= ~IPATH_R_REUSE_SGE;
-                       qp->r_sge = qp->s_rdma_read_sge;
-               } else if (!ipath_get_rwqe(qp, 0)) {
-                       dev->n_pkt_drops++;
-                       goto done;
-               }
-               /* Save the WQE so we can reuse it in case of an error. */
-               qp->s_rdma_read_sge = qp->r_sge;
-               qp->r_rcv_len = 0;
-               if (opcode == OP(SEND_ONLY))
-                       goto send_last;
-               else if (opcode == OP(SEND_ONLY_WITH_IMMEDIATE))
-                       goto send_last_imm;
-               /* FALLTHROUGH */
-       case OP(SEND_MIDDLE):
-               /* Check for invalid length PMTU or posted rwqe len. */
-               if (unlikely(tlen != (hdrsize + pmtu + 4))) {
-                       qp->r_flags |= IPATH_R_REUSE_SGE;
-                       dev->n_pkt_drops++;
-                       goto done;
-               }
-               qp->r_rcv_len += pmtu;
-               if (unlikely(qp->r_rcv_len > qp->r_len)) {
-                       qp->r_flags |= IPATH_R_REUSE_SGE;
-                       dev->n_pkt_drops++;
-                       goto done;
-               }
-               ipath_copy_sge(&qp->r_sge, data, pmtu);
-               break;
-
-       case OP(SEND_LAST_WITH_IMMEDIATE):
-       send_last_imm:
-               if (header_in_data) {
-                       wc.ex.imm_data = *(__be32 *) data;
-                       data += sizeof(__be32);
-               } else {
-                       /* Immediate data comes after BTH */
-                       wc.ex.imm_data = ohdr->u.imm_data;
-               }
-               hdrsize += 4;
-               wc.wc_flags = IB_WC_WITH_IMM;
-               /* FALLTHROUGH */
-       case OP(SEND_LAST):
-       send_last:
-               /* Get the number of bytes the message was padded by. */
-               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
-               /* Check for invalid length. */
-               /* XXX LAST len should be >= 1 */
-               if (unlikely(tlen < (hdrsize + pad + 4))) {
-                       qp->r_flags |= IPATH_R_REUSE_SGE;
-                       dev->n_pkt_drops++;
-                       goto done;
-               }
-               /* Don't count the CRC. */
-               tlen -= (hdrsize + pad + 4);
-               wc.byte_len = tlen + qp->r_rcv_len;
-               if (unlikely(wc.byte_len > qp->r_len)) {
-                       qp->r_flags |= IPATH_R_REUSE_SGE;
-                       dev->n_pkt_drops++;
-                       goto done;
-               }
-               wc.opcode = IB_WC_RECV;
-       last_imm:
-               ipath_copy_sge(&qp->r_sge, data, tlen);
-               wc.wr_id = qp->r_wr_id;
-               wc.status = IB_WC_SUCCESS;
-               wc.qp = &qp->ibqp;
-               wc.src_qp = qp->remote_qpn;
-               wc.slid = qp->remote_ah_attr.dlid;
-               wc.sl = qp->remote_ah_attr.sl;
-               /* Signal completion event if the solicited bit is set. */
-               ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
-                              (ohdr->bth[0] &
-                               cpu_to_be32(1 << 23)) != 0);
-               break;
-
-       case OP(RDMA_WRITE_FIRST):
-       case OP(RDMA_WRITE_ONLY):
-       case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): /* consume RWQE */
-       rdma_first:
-               /* RETH comes after BTH */
-               if (!header_in_data)
-                       reth = &ohdr->u.rc.reth;
-               else {
-                       reth = (struct ib_reth *)data;
-                       data += sizeof(*reth);
-               }
-               hdrsize += sizeof(*reth);
-               qp->r_len = be32_to_cpu(reth->length);
-               qp->r_rcv_len = 0;
-               if (qp->r_len != 0) {
-                       u32 rkey = be32_to_cpu(reth->rkey);
-                       u64 vaddr = be64_to_cpu(reth->vaddr);
-                       int ok;
-
-                       /* Check rkey */
-                       ok = ipath_rkey_ok(qp, &qp->r_sge, qp->r_len,
-                                          vaddr, rkey,
-                                          IB_ACCESS_REMOTE_WRITE);
-                       if (unlikely(!ok)) {
-                               dev->n_pkt_drops++;
-                               goto done;
-                       }
-               } else {
-                       qp->r_sge.sg_list = NULL;
-                       qp->r_sge.sge.mr = NULL;
-                       qp->r_sge.sge.vaddr = NULL;
-                       qp->r_sge.sge.length = 0;
-                       qp->r_sge.sge.sge_length = 0;
-               }
-               if (unlikely(!(qp->qp_access_flags &
-                              IB_ACCESS_REMOTE_WRITE))) {
-                       dev->n_pkt_drops++;
-                       goto done;
-               }
-               if (opcode == OP(RDMA_WRITE_ONLY))
-                       goto rdma_last;
-               else if (opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
-                       goto rdma_last_imm;
-               /* FALLTHROUGH */
-       case OP(RDMA_WRITE_MIDDLE):
-               /* Check for invalid length PMTU or posted rwqe len. */
-               if (unlikely(tlen != (hdrsize + pmtu + 4))) {
-                       dev->n_pkt_drops++;
-                       goto done;
-               }
-               qp->r_rcv_len += pmtu;
-               if (unlikely(qp->r_rcv_len > qp->r_len)) {
-                       dev->n_pkt_drops++;
-                       goto done;
-               }
-               ipath_copy_sge(&qp->r_sge, data, pmtu);
-               break;
-
-       case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
-       rdma_last_imm:
-               if (header_in_data) {
-                       wc.ex.imm_data = *(__be32 *) data;
-                       data += sizeof(__be32);
-               } else {
-                       /* Immediate data comes after BTH */
-                       wc.ex.imm_data = ohdr->u.imm_data;
-               }
-               hdrsize += 4;
-               wc.wc_flags = IB_WC_WITH_IMM;
-
-               /* Get the number of bytes the message was padded by. */
-               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
-               /* Check for invalid length. */
-               /* XXX LAST len should be >= 1 */
-               if (unlikely(tlen < (hdrsize + pad + 4))) {
-                       dev->n_pkt_drops++;
-                       goto done;
-               }
-               /* Don't count the CRC. */
-               tlen -= (hdrsize + pad + 4);
-               if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) {
-                       dev->n_pkt_drops++;
-                       goto done;
-               }
-               if (qp->r_flags & IPATH_R_REUSE_SGE)
-                       qp->r_flags &= ~IPATH_R_REUSE_SGE;
-               else if (!ipath_get_rwqe(qp, 1)) {
-                       dev->n_pkt_drops++;
-                       goto done;
-               }
-               wc.byte_len = qp->r_len;
-               wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
-               goto last_imm;
-
-       case OP(RDMA_WRITE_LAST):
-       rdma_last:
-               /* Get the number of bytes the message was padded by. */
-               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
-               /* Check for invalid length. */
-               /* XXX LAST len should be >= 1 */
-               if (unlikely(tlen < (hdrsize + pad + 4))) {
-                       dev->n_pkt_drops++;
-                       goto done;
-               }
-               /* Don't count the CRC. */
-               tlen -= (hdrsize + pad + 4);
-               if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) {
-                       dev->n_pkt_drops++;
-                       goto done;
-               }
-               ipath_copy_sge(&qp->r_sge, data, tlen);
-               break;
-
-       default:
-               /* Drop packet for unknown opcodes. */
-               dev->n_pkt_drops++;
-               goto done;
-       }
-       qp->r_psn++;
-       qp->r_state = opcode;
-done:
-       return;
-}
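ipath_uc_rcv() above repeatedly derives the payload size as tlen - (hdrsize + pad + 4), taking the pad count from BTH bits 20-21 and excluding the trailing 4-byte CRC, and drops the packet whenever that arithmetic cannot hold. A minimal standalone sketch of that bookkeeping (illustrative userspace code mirroring the checks in the removed function, not driver code):

/*
 * Payload-length bookkeeping as used in ipath_uc_rcv() above:
 * payload = tlen - (hdrsize + pad + 4), where pad comes from BTH
 * bits 20-21 and the final 4 bytes of the packet are the CRC.
 */
#include <stdint.h>
#include <stdio.h>

/* Extract the pad count (0-3 bytes) from the first BTH word (host order). */
static uint32_t bth_pad(uint32_t bth0)
{
	return (bth0 >> 20) & 3;
}

/* Return the payload length, or -1 if the packet is too short to be valid. */
static int payload_len(uint32_t tlen, uint32_t hdrsize, uint32_t bth0)
{
	uint32_t pad = bth_pad(bth0);

	if (tlen < hdrsize + pad + 4)	/* header + pad + CRC */
		return -1;
	return (int)(tlen - (hdrsize + pad + 4));
}

int main(void)
{
	/* Example: LRH+BTH header (8 + 12 bytes), one pad byte encoded in bth0. */
	uint32_t bth0 = 1u << 20;

	printf("payload = %d\n", payload_len(256, 20, bth0));
	return 0;
}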
diff --git a/drivers/infiniband/hw/ipath/ipath_ud.c b/drivers/infiniband/hw/ipath/ipath_ud.c
deleted file mode 100644 (file)
index e8a2a91..0000000
+++ /dev/null
@@ -1,580 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/sched.h>
-#include <rdma/ib_smi.h>
-
-#include "ipath_verbs.h"
-#include "ipath_kernel.h"
-
-/**
- * ipath_ud_loopback - handle send on loopback QPs
- * @sqp: the sending QP
- * @swqe: the send work request
- *
- * This is called from ipath_make_ud_req() to forward a WQE addressed
- * to the same HCA.
- * Note that the receive interrupt handler may be calling ipath_ud_rcv()
- * while this is being called.
- */
-static void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_swqe *swqe)
-{
-       struct ipath_ibdev *dev = to_idev(sqp->ibqp.device);
-       struct ipath_qp *qp;
-       struct ib_ah_attr *ah_attr;
-       unsigned long flags;
-       struct ipath_rq *rq;
-       struct ipath_srq *srq;
-       struct ipath_sge_state rsge;
-       struct ipath_sge *sge;
-       struct ipath_rwq *wq;
-       struct ipath_rwqe *wqe;
-       void (*handler)(struct ib_event *, void *);
-       struct ib_wc wc;
-       u32 tail;
-       u32 rlen;
-       u32 length;
-
-       qp = ipath_lookup_qpn(&dev->qp_table, swqe->wr.wr.ud.remote_qpn);
-       if (!qp || !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) {
-               dev->n_pkt_drops++;
-               goto done;
-       }
-
-       /*
-        * Check that the qkey matches (except for QP0, see 9.6.1.4.1).
-        * Qkeys with the high order bit set mean use the
-        * qkey from the QP context instead of the WR (see 10.2.5).
-        */
-       if (unlikely(qp->ibqp.qp_num &&
-                    ((int) swqe->wr.wr.ud.remote_qkey < 0 ?
-                     sqp->qkey : swqe->wr.wr.ud.remote_qkey) != qp->qkey)) {
-               /* XXX OK to lose a count once in a while. */
-               dev->qkey_violations++;
-               dev->n_pkt_drops++;
-               goto drop;
-       }
-
-       /*
-        * A GRH is expected to precede the data even if not
-        * present on the wire.
-        */
-       length = swqe->length;
-       memset(&wc, 0, sizeof wc);
-       wc.byte_len = length + sizeof(struct ib_grh);
-
-       if (swqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
-               wc.wc_flags = IB_WC_WITH_IMM;
-               wc.ex.imm_data = swqe->wr.ex.imm_data;
-       }
-
-       /*
-        * This would be a lot simpler if we could call ipath_get_rwqe()
-        * but that uses state that the receive interrupt handler uses
-        * so we would need to lock out receive interrupts while doing
-        * local loopback.
-        */
-       if (qp->ibqp.srq) {
-               srq = to_isrq(qp->ibqp.srq);
-               handler = srq->ibsrq.event_handler;
-               rq = &srq->rq;
-       } else {
-               srq = NULL;
-               handler = NULL;
-               rq = &qp->r_rq;
-       }
-
-       /*
-        * Get the next work request entry to find where to put the data.
-        * Note that it is safe to drop the lock after changing rq->tail
-        * since ipath_post_receive() won't fill the empty slot.
-        */
-       spin_lock_irqsave(&rq->lock, flags);
-       wq = rq->wq;
-       tail = wq->tail;
-       /* Validate tail before using it since it is user writable. */
-       if (tail >= rq->size)
-               tail = 0;
-       if (unlikely(tail == wq->head)) {
-               spin_unlock_irqrestore(&rq->lock, flags);
-               dev->n_pkt_drops++;
-               goto drop;
-       }
-       wqe = get_rwqe_ptr(rq, tail);
-       rsge.sg_list = qp->r_ud_sg_list;
-       if (!ipath_init_sge(qp, wqe, &rlen, &rsge)) {
-               spin_unlock_irqrestore(&rq->lock, flags);
-               dev->n_pkt_drops++;
-               goto drop;
-       }
-       /* Silently drop packets which are too big. */
-       if (wc.byte_len > rlen) {
-               spin_unlock_irqrestore(&rq->lock, flags);
-               dev->n_pkt_drops++;
-               goto drop;
-       }
-       if (++tail >= rq->size)
-               tail = 0;
-       wq->tail = tail;
-       wc.wr_id = wqe->wr_id;
-       if (handler) {
-               u32 n;
-
-               /*
-                * validate head pointer value and compute
-                * the number of remaining WQEs.
-                */
-               n = wq->head;
-               if (n >= rq->size)
-                       n = 0;
-               if (n < tail)
-                       n += rq->size - tail;
-               else
-                       n -= tail;
-               if (n < srq->limit) {
-                       struct ib_event ev;
-
-                       srq->limit = 0;
-                       spin_unlock_irqrestore(&rq->lock, flags);
-                       ev.device = qp->ibqp.device;
-                       ev.element.srq = qp->ibqp.srq;
-                       ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
-                       handler(&ev, srq->ibsrq.srq_context);
-               } else
-                       spin_unlock_irqrestore(&rq->lock, flags);
-       } else
-               spin_unlock_irqrestore(&rq->lock, flags);
-
-       ah_attr = &to_iah(swqe->wr.wr.ud.ah)->attr;
-       if (ah_attr->ah_flags & IB_AH_GRH) {
-               ipath_copy_sge(&rsge, &ah_attr->grh, sizeof(struct ib_grh));
-               wc.wc_flags |= IB_WC_GRH;
-       } else
-               ipath_skip_sge(&rsge, sizeof(struct ib_grh));
-       sge = swqe->sg_list;
-       while (length) {
-               u32 len = sge->length;
-
-               if (len > length)
-                       len = length;
-               if (len > sge->sge_length)
-                       len = sge->sge_length;
-               BUG_ON(len == 0);
-               ipath_copy_sge(&rsge, sge->vaddr, len);
-               sge->vaddr += len;
-               sge->length -= len;
-               sge->sge_length -= len;
-               if (sge->sge_length == 0) {
-                       if (--swqe->wr.num_sge)
-                               sge++;
-               } else if (sge->length == 0 && sge->mr != NULL) {
-                       if (++sge->n >= IPATH_SEGSZ) {
-                               if (++sge->m >= sge->mr->mapsz)
-                                       break;
-                               sge->n = 0;
-                       }
-                       sge->vaddr =
-                               sge->mr->map[sge->m]->segs[sge->n].vaddr;
-                       sge->length =
-                               sge->mr->map[sge->m]->segs[sge->n].length;
-               }
-               length -= len;
-       }
-       wc.status = IB_WC_SUCCESS;
-       wc.opcode = IB_WC_RECV;
-       wc.qp = &qp->ibqp;
-       wc.src_qp = sqp->ibqp.qp_num;
-       /* XXX do we know which pkey matched? Only needed for GSI. */
-       wc.pkey_index = 0;
-       wc.slid = dev->dd->ipath_lid |
-               (ah_attr->src_path_bits &
-                ((1 << dev->dd->ipath_lmc) - 1));
-       wc.sl = ah_attr->sl;
-       wc.dlid_path_bits =
-               ah_attr->dlid & ((1 << dev->dd->ipath_lmc) - 1);
-       wc.port_num = 1;
-       /* Signal completion event if the solicited bit is set. */
-       ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
-                      swqe->wr.send_flags & IB_SEND_SOLICITED);
-drop:
-       if (atomic_dec_and_test(&qp->refcount))
-               wake_up(&qp->wait);
-done:;
-}
-
-/**
- * ipath_make_ud_req - construct a UD request packet
- * @qp: the QP
- *
- * Return 1 if constructed; otherwise, return 0.
- */
-int ipath_make_ud_req(struct ipath_qp *qp)
-{
-       struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
-       struct ipath_other_headers *ohdr;
-       struct ib_ah_attr *ah_attr;
-       struct ipath_swqe *wqe;
-       unsigned long flags;
-       u32 nwords;
-       u32 extra_bytes;
-       u32 bth0;
-       u16 lrh0;
-       u16 lid;
-       int ret = 0;
-       int next_cur;
-
-       spin_lock_irqsave(&qp->s_lock, flags);
-
-       if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_NEXT_SEND_OK)) {
-               if (!(ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND))
-                       goto bail;
-               /* We are in the error state, flush the work request. */
-               if (qp->s_last == qp->s_head)
-                       goto bail;
-               /* If DMAs are in progress, we can't flush immediately. */
-               if (atomic_read(&qp->s_dma_busy)) {
-                       qp->s_flags |= IPATH_S_WAIT_DMA;
-                       goto bail;
-               }
-               wqe = get_swqe_ptr(qp, qp->s_last);
-               ipath_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
-               goto done;
-       }
-
-       if (qp->s_cur == qp->s_head)
-               goto bail;
-
-       wqe = get_swqe_ptr(qp, qp->s_cur);
-       next_cur = qp->s_cur + 1;
-       if (next_cur >= qp->s_size)
-               next_cur = 0;
-
-       /* Construct the header. */
-       ah_attr = &to_iah(wqe->wr.wr.ud.ah)->attr;
-       if (ah_attr->dlid >= IPATH_MULTICAST_LID_BASE) {
-               if (ah_attr->dlid != IPATH_PERMISSIVE_LID)
-                       dev->n_multicast_xmit++;
-               else
-                       dev->n_unicast_xmit++;
-       } else {
-               dev->n_unicast_xmit++;
-               lid = ah_attr->dlid & ~((1 << dev->dd->ipath_lmc) - 1);
-               if (unlikely(lid == dev->dd->ipath_lid)) {
-                       /*
-                        * If DMAs are in progress, we can't generate
-                        * a completion for the loopback packet since
-                        * it would be out of order.
-                        * XXX Instead of waiting, we could queue a
-                        * zero length descriptor so we get a callback.
-                        */
-                       if (atomic_read(&qp->s_dma_busy)) {
-                               qp->s_flags |= IPATH_S_WAIT_DMA;
-                               goto bail;
-                       }
-                       qp->s_cur = next_cur;
-                       spin_unlock_irqrestore(&qp->s_lock, flags);
-                       ipath_ud_loopback(qp, wqe);
-                       spin_lock_irqsave(&qp->s_lock, flags);
-                       ipath_send_complete(qp, wqe, IB_WC_SUCCESS);
-                       goto done;
-               }
-       }
-
-       qp->s_cur = next_cur;
-       extra_bytes = -wqe->length & 3;
-       nwords = (wqe->length + extra_bytes) >> 2;
-
-       /* header size in 32-bit words LRH+BTH+DETH = (8+12+8)/4. */
-       qp->s_hdrwords = 7;
-       qp->s_cur_size = wqe->length;
-       qp->s_cur_sge = &qp->s_sge;
-       qp->s_dmult = ah_attr->static_rate;
-       qp->s_wqe = wqe;
-       qp->s_sge.sge = wqe->sg_list[0];
-       qp->s_sge.sg_list = wqe->sg_list + 1;
-       qp->s_sge.num_sge = wqe->wr.num_sge;
-
-       if (ah_attr->ah_flags & IB_AH_GRH) {
-               /* Header size in 32-bit words. */
-               qp->s_hdrwords += ipath_make_grh(dev, &qp->s_hdr.u.l.grh,
-                                                &ah_attr->grh,
-                                                qp->s_hdrwords, nwords);
-               lrh0 = IPATH_LRH_GRH;
-               ohdr = &qp->s_hdr.u.l.oth;
-               /*
-                * Don't worry about sending to locally attached multicast
-                * QPs.  The spec leaves that behaviour unspecified.
-                */
-       } else {
-               /* Header size in 32-bit words. */
-               lrh0 = IPATH_LRH_BTH;
-               ohdr = &qp->s_hdr.u.oth;
-       }
-       if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
-               qp->s_hdrwords++;
-               ohdr->u.ud.imm_data = wqe->wr.ex.imm_data;
-               bth0 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE << 24;
-       } else
-               bth0 = IB_OPCODE_UD_SEND_ONLY << 24;
-       lrh0 |= ah_attr->sl << 4;
-       if (qp->ibqp.qp_type == IB_QPT_SMI)
-               lrh0 |= 0xF000; /* Set VL (see ch. 13.5.3.1) */
-       qp->s_hdr.lrh[0] = cpu_to_be16(lrh0);
-       qp->s_hdr.lrh[1] = cpu_to_be16(ah_attr->dlid);  /* DEST LID */
-       qp->s_hdr.lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords +
-                                          SIZE_OF_CRC);
-       lid = dev->dd->ipath_lid;
-       if (lid) {
-               lid |= ah_attr->src_path_bits &
-                       ((1 << dev->dd->ipath_lmc) - 1);
-               qp->s_hdr.lrh[3] = cpu_to_be16(lid);
-       } else
-               qp->s_hdr.lrh[3] = IB_LID_PERMISSIVE;
-       if (wqe->wr.send_flags & IB_SEND_SOLICITED)
-               bth0 |= 1 << 23;
-       bth0 |= extra_bytes << 20;
-       bth0 |= qp->ibqp.qp_type == IB_QPT_SMI ? IPATH_DEFAULT_P_KEY :
-               ipath_get_pkey(dev->dd, qp->s_pkey_index);
-       ohdr->bth[0] = cpu_to_be32(bth0);
-       /*
-        * Use the multicast QP if the destination LID is a multicast LID.
-        */
-       ohdr->bth[1] = ah_attr->dlid >= IPATH_MULTICAST_LID_BASE &&
-               ah_attr->dlid != IPATH_PERMISSIVE_LID ?
-               cpu_to_be32(IPATH_MULTICAST_QPN) :
-               cpu_to_be32(wqe->wr.wr.ud.remote_qpn);
-       ohdr->bth[2] = cpu_to_be32(qp->s_next_psn++ & IPATH_PSN_MASK);
-       /*
-        * Qkeys with the high order bit set mean use the
-        * qkey from the QP context instead of the WR (see 10.2.5).
-        */
-       ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->wr.wr.ud.remote_qkey < 0 ?
-                                        qp->qkey : wqe->wr.wr.ud.remote_qkey);
-       ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num);
-
-done:
-       ret = 1;
-       goto unlock;
-
-bail:
-       qp->s_flags &= ~IPATH_S_BUSY;
-unlock:
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-       return ret;
-}
-
-/**
- * ipath_ud_rcv - receive an incoming UD packet
- * @dev: the device the packet came in on
- * @hdr: the packet header
- * @has_grh: true if the packet has a GRH
- * @data: the packet data
- * @tlen: the packet length
- * @qp: the QP the packet came on
- *
- * This is called from ipath_qp_rcv() to process an incoming UD packet
- * for the given QP.
- * Called at interrupt level.
- */
-void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
-                 int has_grh, void *data, u32 tlen, struct ipath_qp *qp)
-{
-       struct ipath_other_headers *ohdr;
-       int opcode;
-       u32 hdrsize;
-       u32 pad;
-       struct ib_wc wc;
-       u32 qkey;
-       u32 src_qp;
-       u16 dlid;
-       int header_in_data;
-
-       /* Check for GRH */
-       if (!has_grh) {
-               ohdr = &hdr->u.oth;
-               hdrsize = 8 + 12 + 8;   /* LRH + BTH + DETH */
-               qkey = be32_to_cpu(ohdr->u.ud.deth[0]);
-               src_qp = be32_to_cpu(ohdr->u.ud.deth[1]);
-               header_in_data = 0;
-       } else {
-               ohdr = &hdr->u.l.oth;
-               hdrsize = 8 + 40 + 12 + 8; /* LRH + GRH + BTH + DETH */
-               /*
-                * The header with GRH is 68 bytes and the core driver sets
-                * the eager header buffer size to 56 bytes, so the last 12
-                * bytes of the IB header are in the data buffer.
-                */
-               header_in_data = dev->dd->ipath_rcvhdrentsize == 16;
-               if (header_in_data) {
-                       qkey = be32_to_cpu(((__be32 *) data)[1]);
-                       src_qp = be32_to_cpu(((__be32 *) data)[2]);
-                       data += 12;
-               } else {
-                       qkey = be32_to_cpu(ohdr->u.ud.deth[0]);
-                       src_qp = be32_to_cpu(ohdr->u.ud.deth[1]);
-               }
-       }
-       src_qp &= IPATH_QPN_MASK;
-
-       /*
-        * Check that the permissive LID is only used on QP0
-        * and the QKEY matches (see 9.6.1.4.1 and 9.6.1.5.1).
-        */
-       if (qp->ibqp.qp_num) {
-               if (unlikely(hdr->lrh[1] == IB_LID_PERMISSIVE ||
-                            hdr->lrh[3] == IB_LID_PERMISSIVE)) {
-                       dev->n_pkt_drops++;
-                       goto bail;
-               }
-               if (unlikely(qkey != qp->qkey)) {
-                       /* XXX OK to lose a count once in a while. */
-                       dev->qkey_violations++;
-                       dev->n_pkt_drops++;
-                       goto bail;
-               }
-       } else if (hdr->lrh[1] == IB_LID_PERMISSIVE ||
-                  hdr->lrh[3] == IB_LID_PERMISSIVE) {
-               struct ib_smp *smp = (struct ib_smp *) data;
-
-               if (smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
-                       dev->n_pkt_drops++;
-                       goto bail;
-               }
-       }
-
-       /*
-        * The opcode is in the low byte when it's in network order
-        * (top byte when in host order).
-        */
-       opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
-       if (qp->ibqp.qp_num > 1 &&
-           opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) {
-               if (header_in_data) {
-                       wc.ex.imm_data = *(__be32 *) data;
-                       data += sizeof(__be32);
-               } else
-                       wc.ex.imm_data = ohdr->u.ud.imm_data;
-               wc.wc_flags = IB_WC_WITH_IMM;
-               hdrsize += sizeof(u32);
-       } else if (opcode == IB_OPCODE_UD_SEND_ONLY) {
-               wc.ex.imm_data = 0;
-               wc.wc_flags = 0;
-       } else {
-               dev->n_pkt_drops++;
-               goto bail;
-       }
-
-       /* Get the number of bytes the message was padded by. */
-       pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
-       if (unlikely(tlen < (hdrsize + pad + 4))) {
-               /* Drop incomplete packets. */
-               dev->n_pkt_drops++;
-               goto bail;
-       }
-       tlen -= hdrsize + pad + 4;
-
-       /* Drop invalid MAD packets (see 13.5.3.1). */
-       if (unlikely((qp->ibqp.qp_num == 0 &&
-                     (tlen != 256 ||
-                      (be16_to_cpu(hdr->lrh[0]) >> 12) != 15)) ||
-                    (qp->ibqp.qp_num == 1 &&
-                     (tlen != 256 ||
-                      (be16_to_cpu(hdr->lrh[0]) >> 12) == 15)))) {
-               dev->n_pkt_drops++;
-               goto bail;
-       }
-
-       /*
-        * A GRH is expected to precede the data even if not
-        * present on the wire.
-        */
-       wc.byte_len = tlen + sizeof(struct ib_grh);
-
-       /*
-        * Get the next work request entry to find where to put the data.
-        */
-       if (qp->r_flags & IPATH_R_REUSE_SGE)
-               qp->r_flags &= ~IPATH_R_REUSE_SGE;
-       else if (!ipath_get_rwqe(qp, 0)) {
-               /*
-                * Count VL15 packets dropped due to no receive buffer.
-                * Otherwise, count them as buffer overruns since usually,
-                * the HW will be able to receive packets even if there are
-                * no QPs with posted receive buffers.
-                */
-               if (qp->ibqp.qp_num == 0)
-                       dev->n_vl15_dropped++;
-               else
-                       dev->rcv_errors++;
-               goto bail;
-       }
-       /* Silently drop packets which are too big. */
-       if (wc.byte_len > qp->r_len) {
-               qp->r_flags |= IPATH_R_REUSE_SGE;
-               dev->n_pkt_drops++;
-               goto bail;
-       }
-       if (has_grh) {
-               ipath_copy_sge(&qp->r_sge, &hdr->u.l.grh,
-                              sizeof(struct ib_grh));
-               wc.wc_flags |= IB_WC_GRH;
-       } else
-               ipath_skip_sge(&qp->r_sge, sizeof(struct ib_grh));
-       ipath_copy_sge(&qp->r_sge, data,
-                      wc.byte_len - sizeof(struct ib_grh));
-       if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags))
-               goto bail;
-       wc.wr_id = qp->r_wr_id;
-       wc.status = IB_WC_SUCCESS;
-       wc.opcode = IB_WC_RECV;
-       wc.vendor_err = 0;
-       wc.qp = &qp->ibqp;
-       wc.src_qp = src_qp;
-       /* XXX do we know which pkey matched? Only needed for GSI. */
-       wc.pkey_index = 0;
-       wc.slid = be16_to_cpu(hdr->lrh[3]);
-       wc.sl = (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF;
-       dlid = be16_to_cpu(hdr->lrh[1]);
-       /*
-        * Save the LMC lower bits if the destination LID is a unicast LID.
-        */
-       wc.dlid_path_bits = dlid >= IPATH_MULTICAST_LID_BASE ? 0 :
-               dlid & ((1 << dev->dd->ipath_lmc) - 1);
-       wc.port_num = 1;
-       /* Signal completion event if the solicited bit is set. */
-       ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
-                      (ohdr->bth[0] &
-                       cpu_to_be32(1 << 23)) != 0);
-
-bail:;
-}
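Both ipath_ud_loopback() and ipath_make_ud_req() above apply the IBTA rule that a work-request Q_Key with the high-order bit set means "use the Q_Key from the QP context instead" (see 10.2.5), and they test that bit by casting to a signed int. A standalone sketch of that selection follows — the helper name and sample values are hypothetical:

#include <stdint.h>
#include <stdio.h>

/*
 * If the high-order bit of the work request's Q_Key is set, use the
 * Q_Key from the QP context instead; the signed cast tests that bit.
 */
static uint32_t effective_qkey(uint32_t wr_qkey, uint32_t qp_qkey)
{
	return (int32_t)wr_qkey < 0 ? qp_qkey : wr_qkey;
}

int main(void)
{
	/* sample values only */
	printf("0x%x\n", effective_qkey(0x00001234, 0xdeadbeef)); /* 0x1234 */
	printf("0x%x\n", effective_qkey(0x80000000, 0xdeadbeef)); /* 0xdeadbeef */
	return 0;
}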
diff --git a/drivers/infiniband/hw/ipath/ipath_user_pages.c b/drivers/infiniband/hw/ipath/ipath_user_pages.c
deleted file mode 100644 (file)
index 1da1252..0000000
+++ /dev/null
@@ -1,229 +0,0 @@
-/*
- * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
- * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/mm.h>
-#include <linux/device.h>
-#include <linux/slab.h>
-#include <linux/sched.h>
-
-#include "ipath_kernel.h"
-
-static void __ipath_release_user_pages(struct page **p, size_t num_pages,
-                                  int dirty)
-{
-       size_t i;
-
-       for (i = 0; i < num_pages; i++) {
-               ipath_cdbg(MM, "%lu/%lu put_page %p\n", (unsigned long) i,
-                          (unsigned long) num_pages, p[i]);
-               if (dirty)
-                       set_page_dirty_lock(p[i]);
-               put_page(p[i]);
-       }
-}
-
-/* call with current->mm->mmap_sem held */
-static int __ipath_get_user_pages(unsigned long start_page, size_t num_pages,
-                                 struct page **p)
-{
-       unsigned long lock_limit;
-       size_t got;
-       int ret;
-
-       lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
-       if (num_pages > lock_limit) {
-               ret = -ENOMEM;
-               goto bail;
-       }
-
-       ipath_cdbg(VERBOSE, "pin %lx pages from vaddr %lx\n",
-                  (unsigned long) num_pages, start_page);
-
-       for (got = 0; got < num_pages; got += ret) {
-               ret = get_user_pages(current, current->mm,
-                                    start_page + got * PAGE_SIZE,
-                                    num_pages - got, 1, 1,
-                                    p + got, NULL);
-               if (ret < 0)
-                       goto bail_release;
-       }
-
-       current->mm->pinned_vm += num_pages;
-
-       ret = 0;
-       goto bail;
-
-bail_release:
-       __ipath_release_user_pages(p, got, 0);
-bail:
-       return ret;
-}
-
-/**
- * ipath_map_page - a safety wrapper around pci_map_page()
- *
- * A dma_addr of all 0's is interpreted by the chip as "disabled".
- * Unfortunately, it can also be a valid dma_addr returned on some
- * architectures.
- *
- * The powerpc iommu assigns dma_addrs in ascending order, so we don't
- * have to bother with retries or mapping a dummy page to ensure we
- * don't just get the same mapping again.
- *
- * I'm sure we won't be so lucky with other IOMMUs, so FIXME.
- */
-dma_addr_t ipath_map_page(struct pci_dev *hwdev, struct page *page,
-       unsigned long offset, size_t size, int direction)
-{
-       dma_addr_t phys;
-
-       phys = pci_map_page(hwdev, page, offset, size, direction);
-
-       if (phys == 0) {
-               pci_unmap_page(hwdev, phys, size, direction);
-               phys = pci_map_page(hwdev, page, offset, size, direction);
-               /*
-                * FIXME: If we get 0 again, we should keep this page,
-                * map another, then free the 0 page.
-                */
-       }
-
-       return phys;
-}
-
-/**
- * ipath_map_single - a safety wrapper around pci_map_single()
- *
- * Same idea as ipath_map_page().
- */
-dma_addr_t ipath_map_single(struct pci_dev *hwdev, void *ptr, size_t size,
-       int direction)
-{
-       dma_addr_t phys;
-
-       phys = pci_map_single(hwdev, ptr, size, direction);
-
-       if (phys == 0) {
-               pci_unmap_single(hwdev, phys, size, direction);
-               phys = pci_map_single(hwdev, ptr, size, direction);
-               /*
-                * FIXME: If we get 0 again, we should keep this page,
-                * map another, then free the 0 page.
-                */
-       }
-
-       return phys;
-}
-
-/**
- * ipath_get_user_pages - lock user pages into memory
- * @start_page: the start page
- * @num_pages: the number of pages
- * @p: the output page structures
- *
- * This function takes a given start page (page aligned user virtual
- * address) and pins it and the following specified number of pages.  For
- * now, num_pages is always 1, but that will probably change at some point
- * (because caller is doing expected sends on a single virtually contiguous
- * buffer, so we can do all pages at once).
- */
-int ipath_get_user_pages(unsigned long start_page, size_t num_pages,
-                        struct page **p)
-{
-       int ret;
-
-       down_write(&current->mm->mmap_sem);
-
-       ret = __ipath_get_user_pages(start_page, num_pages, p);
-
-       up_write(&current->mm->mmap_sem);
-
-       return ret;
-}
-
-void ipath_release_user_pages(struct page **p, size_t num_pages)
-{
-       down_write(&current->mm->mmap_sem);
-
-       __ipath_release_user_pages(p, num_pages, 1);
-
-       current->mm->pinned_vm -= num_pages;
-
-       up_write(&current->mm->mmap_sem);
-}
-
-struct ipath_user_pages_work {
-       struct work_struct work;
-       struct mm_struct *mm;
-       unsigned long num_pages;
-};
-
-static void user_pages_account(struct work_struct *_work)
-{
-       struct ipath_user_pages_work *work =
-               container_of(_work, struct ipath_user_pages_work, work);
-
-       down_write(&work->mm->mmap_sem);
-       work->mm->pinned_vm -= work->num_pages;
-       up_write(&work->mm->mmap_sem);
-       mmput(work->mm);
-       kfree(work);
-}
-
-void ipath_release_user_pages_on_close(struct page **p, size_t num_pages)
-{
-       struct ipath_user_pages_work *work;
-       struct mm_struct *mm;
-
-       __ipath_release_user_pages(p, num_pages, 1);
-
-       mm = get_task_mm(current);
-       if (!mm)
-               return;
-
-       work = kmalloc(sizeof(*work), GFP_KERNEL);
-       if (!work)
-               goto bail_mm;
-
-       INIT_WORK(&work->work, user_pages_account);
-       work->mm = mm;
-       work->num_pages = num_pages;
-
-       queue_work(ib_wq, &work->work);
-       return;
-
-bail_mm:
-       mmput(mm);
-       return;
-}
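The deleted ipath_user_pages.c pins user pages for the hardware, refusing any request whose page count exceeds RLIMIT_MEMLOCK converted to pages and charging the result to mm->pinned_vm. A small user-space sketch of the same limit arithmetic (it performs no pinning, only reports the page budget):

#include <stdio.h>
#include <sys/resource.h>
#include <unistd.h>

int main(void)
{
	struct rlimit rl;
	long page_size = sysconf(_SC_PAGESIZE);

	/* RLIMIT_MEMLOCK in bytes, converted to a page budget */
	if (getrlimit(RLIMIT_MEMLOCK, &rl) != 0) {
		perror("getrlimit");
		return 1;
	}
	printf("RLIMIT_MEMLOCK: %llu bytes -> %llu pages of %ld bytes\n",
	       (unsigned long long)rl.rlim_cur,
	       (unsigned long long)(rl.rlim_cur / (unsigned long long)page_size),
	       page_size);
	return 0;
}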
diff --git a/drivers/infiniband/hw/ipath/ipath_user_sdma.c b/drivers/infiniband/hw/ipath/ipath_user_sdma.c
deleted file mode 100644 (file)
index cc04b7b..0000000
+++ /dev/null
@@ -1,875 +0,0 @@
-/*
- * Copyright (c) 2007, 2008 QLogic Corporation. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include <linux/mm.h>
-#include <linux/types.h>
-#include <linux/device.h>
-#include <linux/dmapool.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/list.h>
-#include <linux/highmem.h>
-#include <linux/io.h>
-#include <linux/uio.h>
-#include <linux/rbtree.h>
-#include <linux/spinlock.h>
-#include <linux/delay.h>
-
-#include "ipath_kernel.h"
-#include "ipath_user_sdma.h"
-
-/* minimum size of header */
-#define IPATH_USER_SDMA_MIN_HEADER_LENGTH      64
-/* expected size of headers (for dma_pool) */
-#define IPATH_USER_SDMA_EXP_HEADER_LENGTH      64
-/* length mask in PBC (lower 11 bits) */
-#define IPATH_PBC_LENGTH_MASK                  ((1 << 11) - 1)
-
-struct ipath_user_sdma_pkt {
-       u8 naddr;               /* dimension of addr (1..3) ... */
-       u32 counter;            /* sdma pkts queued counter for this entry */
-       u64 added;              /* global descq number of entries */
-
-       struct {
-               u32 offset;                     /* offset for kvaddr, addr */
-               u32 length;                     /* length in page */
-               u8  put_page;                   /* should we put_page? */
-               u8  dma_mapped;                 /* is page dma_mapped? */
-               struct page *page;              /* may be NULL (coherent mem) */
-               void *kvaddr;                   /* FIXME: only for pio hack */
-               dma_addr_t addr;
-       } addr[4];   /* max pages, any more and we coalesce */
-       struct list_head list;  /* list element */
-};
-
-struct ipath_user_sdma_queue {
-       /*
-        * pkts sent to dma engine are queued on this
-        * list head.  the type of each element of this
-        * list is struct ipath_user_sdma_pkt...
-        */
-       struct list_head sent;
-
-       /* headers with expected length are allocated from here... */
-       char header_cache_name[64];
-       struct dma_pool *header_cache;
-
-       /* packets are allocated from the slab cache... */
-       char pkt_slab_name[64];
-       struct kmem_cache *pkt_slab;
-
-       /* as packets go on the queued queue, they are counted... */
-       u32 counter;
-       u32 sent_counter;
-
-       /* dma page table */
-       struct rb_root dma_pages_root;
-
-       /* protect everything above... */
-       struct mutex lock;
-};
-
-struct ipath_user_sdma_queue *
-ipath_user_sdma_queue_create(struct device *dev, int unit, int port, int sport)
-{
-       struct ipath_user_sdma_queue *pq =
-               kmalloc(sizeof(struct ipath_user_sdma_queue), GFP_KERNEL);
-
-       if (!pq)
-               goto done;
-
-       pq->counter = 0;
-       pq->sent_counter = 0;
-       INIT_LIST_HEAD(&pq->sent);
-
-       mutex_init(&pq->lock);
-
-       snprintf(pq->pkt_slab_name, sizeof(pq->pkt_slab_name),
-                "ipath-user-sdma-pkts-%u-%02u.%02u", unit, port, sport);
-       pq->pkt_slab = kmem_cache_create(pq->pkt_slab_name,
-                                        sizeof(struct ipath_user_sdma_pkt),
-                                        0, 0, NULL);
-
-       if (!pq->pkt_slab)
-               goto err_kfree;
-
-       snprintf(pq->header_cache_name, sizeof(pq->header_cache_name),
-                "ipath-user-sdma-headers-%u-%02u.%02u", unit, port, sport);
-       pq->header_cache = dma_pool_create(pq->header_cache_name,
-                                          dev,
-                                          IPATH_USER_SDMA_EXP_HEADER_LENGTH,
-                                          4, 0);
-       if (!pq->header_cache)
-               goto err_slab;
-
-       pq->dma_pages_root = RB_ROOT;
-
-       goto done;
-
-err_slab:
-       kmem_cache_destroy(pq->pkt_slab);
-err_kfree:
-       kfree(pq);
-       pq = NULL;
-
-done:
-       return pq;
-}
-
-static void ipath_user_sdma_init_frag(struct ipath_user_sdma_pkt *pkt,
-                                     int i, size_t offset, size_t len,
-                                     int put_page, int dma_mapped,
-                                     struct page *page,
-                                     void *kvaddr, dma_addr_t dma_addr)
-{
-       pkt->addr[i].offset = offset;
-       pkt->addr[i].length = len;
-       pkt->addr[i].put_page = put_page;
-       pkt->addr[i].dma_mapped = dma_mapped;
-       pkt->addr[i].page = page;
-       pkt->addr[i].kvaddr = kvaddr;
-       pkt->addr[i].addr = dma_addr;
-}
-
-static void ipath_user_sdma_init_header(struct ipath_user_sdma_pkt *pkt,
-                                       u32 counter, size_t offset,
-                                       size_t len, int dma_mapped,
-                                       struct page *page,
-                                       void *kvaddr, dma_addr_t dma_addr)
-{
-       pkt->naddr = 1;
-       pkt->counter = counter;
-       ipath_user_sdma_init_frag(pkt, 0, offset, len, 0, dma_mapped, page,
-                                 kvaddr, dma_addr);
-}
-
-/* we have too many pages in the iovec, coalesce to a single page */
-static int ipath_user_sdma_coalesce(const struct ipath_devdata *dd,
-                                   struct ipath_user_sdma_pkt *pkt,
-                                   const struct iovec *iov,
-                                   unsigned long niov) {
-       int ret = 0;
-       struct page *page = alloc_page(GFP_KERNEL);
-       void *mpage_save;
-       char *mpage;
-       int i;
-       int len = 0;
-       dma_addr_t dma_addr;
-
-       if (!page) {
-               ret = -ENOMEM;
-               goto done;
-       }
-
-       mpage = kmap(page);
-       mpage_save = mpage;
-       for (i = 0; i < niov; i++) {
-               int cfur;
-
-               cfur = copy_from_user(mpage,
-                                     iov[i].iov_base, iov[i].iov_len);
-               if (cfur) {
-                       ret = -EFAULT;
-                       goto free_unmap;
-               }
-
-               mpage += iov[i].iov_len;
-               len += iov[i].iov_len;
-       }
-
-       dma_addr = dma_map_page(&dd->pcidev->dev, page, 0, len,
-                               DMA_TO_DEVICE);
-       if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) {
-               ret = -ENOMEM;
-               goto free_unmap;
-       }
-
-       ipath_user_sdma_init_frag(pkt, 1, 0, len, 0, 1, page, mpage_save,
-                                 dma_addr);
-       pkt->naddr = 2;
-
-       goto done;
-
-free_unmap:
-       kunmap(page);
-       __free_page(page);
-done:
-       return ret;
-}
-
-/* how many pages in this iovec element? */
-static int ipath_user_sdma_num_pages(const struct iovec *iov)
-{
-       const unsigned long addr  = (unsigned long) iov->iov_base;
-       const unsigned long  len  = iov->iov_len;
-       const unsigned long spage = addr & PAGE_MASK;
-       const unsigned long epage = (addr + len - 1) & PAGE_MASK;
-
-       return 1 + ((epage - spage) >> PAGE_SHIFT);
-}
-
-/* truncate length to page boundary */
-static int ipath_user_sdma_page_length(unsigned long addr, unsigned long len)
-{
-       const unsigned long offset = addr & ~PAGE_MASK;
-
-       return ((offset + len) > PAGE_SIZE) ? (PAGE_SIZE - offset) : len;
-}
-
-static void ipath_user_sdma_free_pkt_frag(struct device *dev,
-                                         struct ipath_user_sdma_queue *pq,
-                                         struct ipath_user_sdma_pkt *pkt,
-                                         int frag)
-{
-       const int i = frag;
-
-       if (pkt->addr[i].page) {
-               if (pkt->addr[i].dma_mapped)
-                       dma_unmap_page(dev,
-                                      pkt->addr[i].addr,
-                                      pkt->addr[i].length,
-                                      DMA_TO_DEVICE);
-
-               if (pkt->addr[i].kvaddr)
-                       kunmap(pkt->addr[i].page);
-
-               if (pkt->addr[i].put_page)
-                       put_page(pkt->addr[i].page);
-               else
-                       __free_page(pkt->addr[i].page);
-       } else if (pkt->addr[i].kvaddr)
-               /* free coherent mem from cache... */
-               dma_pool_free(pq->header_cache,
-                             pkt->addr[i].kvaddr, pkt->addr[i].addr);
-}
-
-/* return number of pages pinned... */
-static int ipath_user_sdma_pin_pages(const struct ipath_devdata *dd,
-                                    struct ipath_user_sdma_pkt *pkt,
-                                    unsigned long addr, int tlen, int npages)
-{
-       struct page *pages[2];
-       int j;
-       int ret;
-
-       ret = get_user_pages_fast(addr, npages, 0, pages);
-       if (ret != npages) {
-               int i;
-
-               for (i = 0; i < ret; i++)
-                       put_page(pages[i]);
-
-               ret = -ENOMEM;
-               goto done;
-       }
-
-       for (j = 0; j < npages; j++) {
-               /* map the pages... */
-               const int flen =
-                       ipath_user_sdma_page_length(addr, tlen);
-               dma_addr_t dma_addr =
-                       dma_map_page(&dd->pcidev->dev,
-                                    pages[j], 0, flen, DMA_TO_DEVICE);
-               unsigned long fofs = addr & ~PAGE_MASK;
-
-               if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) {
-                       ret = -ENOMEM;
-                       goto done;
-               }
-
-               ipath_user_sdma_init_frag(pkt, pkt->naddr, fofs, flen, 1, 1,
-                                         pages[j], kmap(pages[j]),
-                                         dma_addr);
-
-               pkt->naddr++;
-               addr += flen;
-               tlen -= flen;
-       }
-
-done:
-       return ret;
-}
-
-static int ipath_user_sdma_pin_pkt(const struct ipath_devdata *dd,
-                                  struct ipath_user_sdma_queue *pq,
-                                  struct ipath_user_sdma_pkt *pkt,
-                                  const struct iovec *iov,
-                                  unsigned long niov)
-{
-       int ret = 0;
-       unsigned long idx;
-
-       for (idx = 0; idx < niov; idx++) {
-               const int npages = ipath_user_sdma_num_pages(iov + idx);
-               const unsigned long addr = (unsigned long) iov[idx].iov_base;
-
-               ret = ipath_user_sdma_pin_pages(dd, pkt,
-                                               addr, iov[idx].iov_len,
-                                               npages);
-               if (ret < 0)
-                       goto free_pkt;
-       }
-
-       goto done;
-
-free_pkt:
-       for (idx = 0; idx < pkt->naddr; idx++)
-               ipath_user_sdma_free_pkt_frag(&dd->pcidev->dev, pq, pkt, idx);
-
-done:
-       return ret;
-}
-
-static int ipath_user_sdma_init_payload(const struct ipath_devdata *dd,
-                                       struct ipath_user_sdma_queue *pq,
-                                       struct ipath_user_sdma_pkt *pkt,
-                                       const struct iovec *iov,
-                                       unsigned long niov, int npages)
-{
-       int ret = 0;
-
-       if (npages >= ARRAY_SIZE(pkt->addr))
-               ret = ipath_user_sdma_coalesce(dd, pkt, iov, niov);
-       else
-               ret = ipath_user_sdma_pin_pkt(dd, pq, pkt, iov, niov);
-
-       return ret;
-}
-
-/* free a packet list */
-static void ipath_user_sdma_free_pkt_list(struct device *dev,
-                                         struct ipath_user_sdma_queue *pq,
-                                         struct list_head *list)
-{
-       struct ipath_user_sdma_pkt *pkt, *pkt_next;
-
-       list_for_each_entry_safe(pkt, pkt_next, list, list) {
-               int i;
-
-               for (i = 0; i < pkt->naddr; i++)
-                       ipath_user_sdma_free_pkt_frag(dev, pq, pkt, i);
-
-               kmem_cache_free(pq->pkt_slab, pkt);
-       }
-}
-
-/*
- * copy headers, coalesce etc -- pq->lock must be held
- *
- * we queue all the packets to list, returning the
- * number of iovec entries consumed.  list must be empty
- * initially; if there is an error we clean it up...
- */
-static int ipath_user_sdma_queue_pkts(const struct ipath_devdata *dd,
-                                     struct ipath_user_sdma_queue *pq,
-                                     struct list_head *list,
-                                     const struct iovec *iov,
-                                     unsigned long niov,
-                                     int maxpkts)
-{
-       unsigned long idx = 0;
-       int ret = 0;
-       int npkts = 0;
-       struct page *page = NULL;
-       __le32 *pbc;
-       dma_addr_t dma_addr;
-       struct ipath_user_sdma_pkt *pkt = NULL;
-       size_t len;
-       size_t nw;
-       u32 counter = pq->counter;
-       int dma_mapped = 0;
-
-       while (idx < niov && npkts < maxpkts) {
-               const unsigned long addr = (unsigned long) iov[idx].iov_base;
-               const unsigned long idx_save = idx;
-               unsigned pktnw;
-               unsigned pktnwc;
-               int nfrags = 0;
-               int npages = 0;
-               int cfur;
-
-               dma_mapped = 0;
-               len = iov[idx].iov_len;
-               nw = len >> 2;
-               page = NULL;
-
-               pkt = kmem_cache_alloc(pq->pkt_slab, GFP_KERNEL);
-               if (!pkt) {
-                       ret = -ENOMEM;
-                       goto free_list;
-               }
-
-               if (len < IPATH_USER_SDMA_MIN_HEADER_LENGTH ||
-                   len > PAGE_SIZE || len & 3 || addr & 3) {
-                       ret = -EINVAL;
-                       goto free_pkt;
-               }
-
-               if (len == IPATH_USER_SDMA_EXP_HEADER_LENGTH)
-                       pbc = dma_pool_alloc(pq->header_cache, GFP_KERNEL,
-                                            &dma_addr);
-               else
-                       pbc = NULL;
-
-               if (!pbc) {
-                       page = alloc_page(GFP_KERNEL);
-                       if (!page) {
-                               ret = -ENOMEM;
-                               goto free_pkt;
-                       }
-                       pbc = kmap(page);
-               }
-
-               cfur = copy_from_user(pbc, iov[idx].iov_base, len);
-               if (cfur) {
-                       ret = -EFAULT;
-                       goto free_pbc;
-               }
-
-               /*
-                * this assignment is a bit strange.  it's because the
-                * pbc counts the number of 32 bit words in the full
-                * packet _except_ the first word of the pbc itself...
-                */
-               pktnwc = nw - 1;
-
-               /*
-                * pktnw computation yields the number of 32 bit words
-                * that the caller has indicated in the PBC.  note that
-                * this is one less than the total number of words that
-                * goes to the send DMA engine as the first 32 bit word
-                * of the PBC itself is not counted.  Armed with this count,
-                * we can verify that the packet is consistent with the
-                * iovec lengths.
-                */
-               pktnw = le32_to_cpu(*pbc) & IPATH_PBC_LENGTH_MASK;
-               if (pktnw < pktnwc || pktnw > pktnwc + (PAGE_SIZE >> 2)) {
-                       ret = -EINVAL;
-                       goto free_pbc;
-               }
-
-
-               idx++;
-               while (pktnwc < pktnw && idx < niov) {
-                       const size_t slen = iov[idx].iov_len;
-                       const unsigned long faddr =
-                               (unsigned long) iov[idx].iov_base;
-
-                       if (slen & 3 || faddr & 3 || !slen ||
-                           slen > PAGE_SIZE) {
-                               ret = -EINVAL;
-                               goto free_pbc;
-                       }
-
-                       npages++;
-                       if ((faddr & PAGE_MASK) !=
-                           ((faddr + slen - 1) & PAGE_MASK))
-                               npages++;
-
-                       pktnwc += slen >> 2;
-                       idx++;
-                       nfrags++;
-               }
-
-               if (pktnwc != pktnw) {
-                       ret = -EINVAL;
-                       goto free_pbc;
-               }
-
-               if (page) {
-                       dma_addr = dma_map_page(&dd->pcidev->dev,
-                                               page, 0, len, DMA_TO_DEVICE);
-                       if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) {
-                               ret = -ENOMEM;
-                               goto free_pbc;
-                       }
-
-                       dma_mapped = 1;
-               }
-
-               ipath_user_sdma_init_header(pkt, counter, 0, len, dma_mapped,
-                                           page, pbc, dma_addr);
-
-               if (nfrags) {
-                       ret = ipath_user_sdma_init_payload(dd, pq, pkt,
-                                                          iov + idx_save + 1,
-                                                          nfrags, npages);
-                       if (ret < 0)
-                               goto free_pbc_dma;
-               }
-
-               counter++;
-               npkts++;
-
-               list_add_tail(&pkt->list, list);
-       }
-
-       ret = idx;
-       goto done;
-
-free_pbc_dma:
-       if (dma_mapped)
-               dma_unmap_page(&dd->pcidev->dev, dma_addr, len, DMA_TO_DEVICE);
-free_pbc:
-       if (page) {
-               kunmap(page);
-               __free_page(page);
-       } else
-               dma_pool_free(pq->header_cache, pbc, dma_addr);
-free_pkt:
-       kmem_cache_free(pq->pkt_slab, pkt);
-free_list:
-       ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, list);
-done:
-       return ret;
-}
-
-static void ipath_user_sdma_set_complete_counter(struct ipath_user_sdma_queue *pq,
-                                                u32 c)
-{
-       pq->sent_counter = c;
-}
-
-/* try to clean out queue -- needs pq->lock */
-static int ipath_user_sdma_queue_clean(const struct ipath_devdata *dd,
-                                      struct ipath_user_sdma_queue *pq)
-{
-       struct list_head free_list;
-       struct ipath_user_sdma_pkt *pkt;
-       struct ipath_user_sdma_pkt *pkt_prev;
-       int ret = 0;
-
-       INIT_LIST_HEAD(&free_list);
-
-       list_for_each_entry_safe(pkt, pkt_prev, &pq->sent, list) {
-               s64 descd = dd->ipath_sdma_descq_removed - pkt->added;
-
-               if (descd < 0)
-                       break;
-
-               list_move_tail(&pkt->list, &free_list);
-
-               /* one more packet cleaned */
-               ret++;
-       }
-
-       if (!list_empty(&free_list)) {
-               u32 counter;
-
-               pkt = list_entry(free_list.prev,
-                                struct ipath_user_sdma_pkt, list);
-               counter = pkt->counter;
-
-               ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &free_list);
-               ipath_user_sdma_set_complete_counter(pq, counter);
-       }
-
-       return ret;
-}
-
-void ipath_user_sdma_queue_destroy(struct ipath_user_sdma_queue *pq)
-{
-       if (!pq)
-               return;
-
-       kmem_cache_destroy(pq->pkt_slab);
-       dma_pool_destroy(pq->header_cache);
-       kfree(pq);
-}
-
-/* clean descriptor queue, returns > 0 if some elements cleaned */
-static int ipath_user_sdma_hwqueue_clean(struct ipath_devdata *dd)
-{
-       int ret;
-       unsigned long flags;
-
-       spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
-       ret = ipath_sdma_make_progress(dd);
-       spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-
-       return ret;
-}
-
-/* we're in close, drain packets so that we can clean up successfully... */
-void ipath_user_sdma_queue_drain(struct ipath_devdata *dd,
-                                struct ipath_user_sdma_queue *pq)
-{
-       int i;
-
-       if (!pq)
-               return;
-
-       for (i = 0; i < 100; i++) {
-               mutex_lock(&pq->lock);
-               if (list_empty(&pq->sent)) {
-                       mutex_unlock(&pq->lock);
-                       break;
-               }
-               ipath_user_sdma_hwqueue_clean(dd);
-               ipath_user_sdma_queue_clean(dd, pq);
-               mutex_unlock(&pq->lock);
-               msleep(10);
-       }
-
-       if (!list_empty(&pq->sent)) {
-               struct list_head free_list;
-
-               printk(KERN_INFO "drain: lists not empty: forcing!\n");
-               INIT_LIST_HEAD(&free_list);
-               mutex_lock(&pq->lock);
-               list_splice_init(&pq->sent, &free_list);
-               ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &free_list);
-               mutex_unlock(&pq->lock);
-       }
-}
-
-static inline __le64 ipath_sdma_make_desc0(struct ipath_devdata *dd,
-                                          u64 addr, u64 dwlen, u64 dwoffset)
-{
-       return cpu_to_le64(/* SDmaPhyAddr[31:0] */
-                          ((addr & 0xfffffffcULL) << 32) |
-                          /* SDmaGeneration[1:0] */
-                          ((dd->ipath_sdma_generation & 3ULL) << 30) |
-                          /* SDmaDwordCount[10:0] */
-                          ((dwlen & 0x7ffULL) << 16) |
-                          /* SDmaBufOffset[12:2] */
-                          (dwoffset & 0x7ffULL));
-}
-
-static inline __le64 ipath_sdma_make_first_desc0(__le64 descq)
-{
-       return descq | cpu_to_le64(1ULL << 12);
-}
-
-static inline __le64 ipath_sdma_make_last_desc0(__le64 descq)
-{
-                                             /* last */  /* dma head */
-       return descq | cpu_to_le64(1ULL << 11 | 1ULL << 13);
-}
-
-static inline __le64 ipath_sdma_make_desc1(u64 addr)
-{
-       /* SDmaPhyAddr[47:32] */
-       return cpu_to_le64(addr >> 32);
-}
-
-static void ipath_user_sdma_send_frag(struct ipath_devdata *dd,
-                                     struct ipath_user_sdma_pkt *pkt, int idx,
-                                     unsigned ofs, u16 tail)
-{
-       const u64 addr = (u64) pkt->addr[idx].addr +
-               (u64) pkt->addr[idx].offset;
-       const u64 dwlen = (u64) pkt->addr[idx].length / 4;
-       __le64 *descqp;
-       __le64 descq0;
-
-       descqp = &dd->ipath_sdma_descq[tail].qw[0];
-
-       descq0 = ipath_sdma_make_desc0(dd, addr, dwlen, ofs);
-       if (idx == 0)
-               descq0 = ipath_sdma_make_first_desc0(descq0);
-       if (idx == pkt->naddr - 1)
-               descq0 = ipath_sdma_make_last_desc0(descq0);
-
-       descqp[0] = descq0;
-       descqp[1] = ipath_sdma_make_desc1(addr);
-}
-
-/* pq->lock must be held, get packets on the wire... */
-static int ipath_user_sdma_push_pkts(struct ipath_devdata *dd,
-                                    struct ipath_user_sdma_queue *pq,
-                                    struct list_head *pktlist)
-{
-       int ret = 0;
-       unsigned long flags;
-       u16 tail;
-
-       if (list_empty(pktlist))
-               return 0;
-
-       if (unlikely(!(dd->ipath_flags & IPATH_LINKACTIVE)))
-               return -ECOMM;
-
-       spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
-
-       if (unlikely(dd->ipath_sdma_status & IPATH_SDMA_ABORT_MASK)) {
-               ret = -ECOMM;
-               goto unlock;
-       }
-
-       tail = dd->ipath_sdma_descq_tail;
-       while (!list_empty(pktlist)) {
-               struct ipath_user_sdma_pkt *pkt =
-                       list_entry(pktlist->next, struct ipath_user_sdma_pkt,
-                                  list);
-               int i;
-               unsigned ofs = 0;
-               u16 dtail = tail;
-
-               if (pkt->naddr > ipath_sdma_descq_freecnt(dd))
-                       goto unlock_check_tail;
-
-               for (i = 0; i < pkt->naddr; i++) {
-                       ipath_user_sdma_send_frag(dd, pkt, i, ofs, tail);
-                       ofs += pkt->addr[i].length >> 2;
-
-                       if (++tail == dd->ipath_sdma_descq_cnt) {
-                               tail = 0;
-                               ++dd->ipath_sdma_generation;
-                       }
-               }
-
-               if ((ofs<<2) > dd->ipath_ibmaxlen) {
-                       ipath_dbg("packet size %X > ibmax %X, fail\n",
-                               ofs<<2, dd->ipath_ibmaxlen);
-                       ret = -EMSGSIZE;
-                       goto unlock;
-               }
-
-               /*
-                * if the packet is >= 2KB mtu equivalent, we have to use
-                * the large buffers, and have to mark each descriptor as
-                * part of a large buffer packet.
-                */
-               if (ofs >= IPATH_SMALLBUF_DWORDS) {
-                       for (i = 0; i < pkt->naddr; i++) {
-                               dd->ipath_sdma_descq[dtail].qw[0] |=
-                                       cpu_to_le64(1ULL << 14);
-                               if (++dtail == dd->ipath_sdma_descq_cnt)
-                                       dtail = 0;
-                       }
-               }
-
-               dd->ipath_sdma_descq_added += pkt->naddr;
-               pkt->added = dd->ipath_sdma_descq_added;
-               list_move_tail(&pkt->list, &pq->sent);
-               ret++;
-       }
-
-unlock_check_tail:
-       /* advance the tail on the chip if necessary */
-       if (dd->ipath_sdma_descq_tail != tail) {
-               wmb();
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail, tail);
-               dd->ipath_sdma_descq_tail = tail;
-       }
-
-unlock:
-       spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
-
-       return ret;
-}
-
-int ipath_user_sdma_writev(struct ipath_devdata *dd,
-                          struct ipath_user_sdma_queue *pq,
-                          const struct iovec *iov,
-                          unsigned long dim)
-{
-       int ret = 0;
-       struct list_head list;
-       int npkts = 0;
-
-       INIT_LIST_HEAD(&list);
-
-       mutex_lock(&pq->lock);
-
-       if (dd->ipath_sdma_descq_added != dd->ipath_sdma_descq_removed) {
-               ipath_user_sdma_hwqueue_clean(dd);
-               ipath_user_sdma_queue_clean(dd, pq);
-       }
-
-       while (dim) {
-               const int mxp = 8;
-
-               ret = ipath_user_sdma_queue_pkts(dd, pq, &list, iov, dim, mxp);
-               if (ret <= 0)
-                       goto done_unlock;
-               else {
-                       dim -= ret;
-                       iov += ret;
-               }
-
-               /* force packets onto the sdma hw queue... */
-               if (!list_empty(&list)) {
-                       /*
-                        * lazily clean hw queue.  the 4 is a guess of about
-                        * how many sdma descriptors a packet will take (it
-                        * doesn't have to be perfect).
-                        */
-                       if (ipath_sdma_descq_freecnt(dd) < ret * 4) {
-                               ipath_user_sdma_hwqueue_clean(dd);
-                               ipath_user_sdma_queue_clean(dd, pq);
-                       }
-
-                       ret = ipath_user_sdma_push_pkts(dd, pq, &list);
-                       if (ret < 0)
-                               goto done_unlock;
-                       else {
-                               npkts += ret;
-                               pq->counter += ret;
-
-                               if (!list_empty(&list))
-                                       goto done_unlock;
-                       }
-               }
-       }
-
-done_unlock:
-       if (!list_empty(&list))
-               ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &list);
-       mutex_unlock(&pq->lock);
-
-       return (ret < 0) ? ret : npkts;
-}
-
-int ipath_user_sdma_make_progress(struct ipath_devdata *dd,
-                                 struct ipath_user_sdma_queue *pq)
-{
-       int ret = 0;
-
-       mutex_lock(&pq->lock);
-       ipath_user_sdma_hwqueue_clean(dd);
-       ret = ipath_user_sdma_queue_clean(dd, pq);
-       mutex_unlock(&pq->lock);
-
-       return ret;
-}
-
-u32 ipath_user_sdma_complete_counter(const struct ipath_user_sdma_queue *pq)
-{
-       return pq->sent_counter;
-}
-
-u32 ipath_user_sdma_inflight_counter(struct ipath_user_sdma_queue *pq)
-{
-       return pq->counter;
-}
-
diff --git a/drivers/infiniband/hw/ipath/ipath_user_sdma.h b/drivers/infiniband/hw/ipath/ipath_user_sdma.h
deleted file mode 100644 (file)
index fc76316..0000000
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2007, 2008 QLogic Corporation. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include <linux/device.h>
-
-struct ipath_user_sdma_queue;
-
-struct ipath_user_sdma_queue *
-ipath_user_sdma_queue_create(struct device *dev, int unit, int port, int sport);
-void ipath_user_sdma_queue_destroy(struct ipath_user_sdma_queue *pq);
-
-int ipath_user_sdma_writev(struct ipath_devdata *dd,
-                          struct ipath_user_sdma_queue *pq,
-                          const struct iovec *iov,
-                          unsigned long dim);
-
-int ipath_user_sdma_make_progress(struct ipath_devdata *dd,
-                                 struct ipath_user_sdma_queue *pq);
-
-void ipath_user_sdma_queue_drain(struct ipath_devdata *dd,
-                                struct ipath_user_sdma_queue *pq);
-
-u32 ipath_user_sdma_complete_counter(const struct ipath_user_sdma_queue *pq);
-u32 ipath_user_sdma_inflight_counter(struct ipath_user_sdma_queue *pq);
diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.c b/drivers/infiniband/hw/ipath/ipath_verbs.c
deleted file mode 100644 (file)
index 30ba49c..0000000
+++ /dev/null
@@ -1,2364 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <rdma/ib_mad.h>
-#include <rdma/ib_user_verbs.h>
-#include <linux/io.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <linux/utsname.h>
-#include <linux/rculist.h>
-
-#include "ipath_kernel.h"
-#include "ipath_verbs.h"
-#include "ipath_common.h"
-
-static unsigned int ib_ipath_qp_table_size = 251;
-module_param_named(qp_table_size, ib_ipath_qp_table_size, uint, S_IRUGO);
-MODULE_PARM_DESC(qp_table_size, "QP table size");
-
-unsigned int ib_ipath_lkey_table_size = 12;
-module_param_named(lkey_table_size, ib_ipath_lkey_table_size, uint,
-                  S_IRUGO);
-MODULE_PARM_DESC(lkey_table_size,
-                "LKEY table size in bits (2^n, 1 <= n <= 23)");
-
-static unsigned int ib_ipath_max_pds = 0xFFFF;
-module_param_named(max_pds, ib_ipath_max_pds, uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(max_pds,
-                "Maximum number of protection domains to support");
-
-static unsigned int ib_ipath_max_ahs = 0xFFFF;
-module_param_named(max_ahs, ib_ipath_max_ahs, uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(max_ahs, "Maximum number of address handles to support");
-
-unsigned int ib_ipath_max_cqes = 0x2FFFF;
-module_param_named(max_cqes, ib_ipath_max_cqes, uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(max_cqes,
-                "Maximum number of completion queue entries to support");
-
-unsigned int ib_ipath_max_cqs = 0x1FFFF;
-module_param_named(max_cqs, ib_ipath_max_cqs, uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(max_cqs, "Maximum number of completion queues to support");
-
-unsigned int ib_ipath_max_qp_wrs = 0x3FFF;
-module_param_named(max_qp_wrs, ib_ipath_max_qp_wrs, uint,
-                  S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(max_qp_wrs, "Maximum number of QP WRs to support");
-
-unsigned int ib_ipath_max_qps = 16384;
-module_param_named(max_qps, ib_ipath_max_qps, uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(max_qps, "Maximum number of QPs to support");
-
-unsigned int ib_ipath_max_sges = 0x60;
-module_param_named(max_sges, ib_ipath_max_sges, uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(max_sges, "Maximum number of SGEs to support");
-
-unsigned int ib_ipath_max_mcast_grps = 16384;
-module_param_named(max_mcast_grps, ib_ipath_max_mcast_grps, uint,
-                  S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(max_mcast_grps,
-                "Maximum number of multicast groups to support");
-
-unsigned int ib_ipath_max_mcast_qp_attached = 16;
-module_param_named(max_mcast_qp_attached, ib_ipath_max_mcast_qp_attached,
-                  uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(max_mcast_qp_attached,
-                "Maximum number of attached QPs to support");
-
-unsigned int ib_ipath_max_srqs = 1024;
-module_param_named(max_srqs, ib_ipath_max_srqs, uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(max_srqs, "Maximum number of SRQs to support");
-
-unsigned int ib_ipath_max_srq_sges = 128;
-module_param_named(max_srq_sges, ib_ipath_max_srq_sges,
-                  uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(max_srq_sges, "Maximum number of SRQ SGEs to support");
-
-unsigned int ib_ipath_max_srq_wrs = 0x1FFFF;
-module_param_named(max_srq_wrs, ib_ipath_max_srq_wrs,
-                  uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs support");
-
-static unsigned int ib_ipath_disable_sma;
-module_param_named(disable_sma, ib_ipath_disable_sma, uint, S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(disable_sma, "Disable the SMA");
-
-/*
- * Note that it is OK to post send work requests in the SQE and ERR
- * states; ipath_do_send() will process them and generate error
- * completions as per IB 1.2 C10-96.
- */
-const int ib_ipath_state_ops[IB_QPS_ERR + 1] = {
-       [IB_QPS_RESET] = 0,
-       [IB_QPS_INIT] = IPATH_POST_RECV_OK,
-       [IB_QPS_RTR] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK,
-       [IB_QPS_RTS] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
-           IPATH_POST_SEND_OK | IPATH_PROCESS_SEND_OK |
-           IPATH_PROCESS_NEXT_SEND_OK,
-       [IB_QPS_SQD] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
-           IPATH_POST_SEND_OK | IPATH_PROCESS_SEND_OK,
-       [IB_QPS_SQE] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
-           IPATH_POST_SEND_OK | IPATH_FLUSH_SEND,
-       [IB_QPS_ERR] = IPATH_POST_RECV_OK | IPATH_FLUSH_RECV |
-           IPATH_POST_SEND_OK | IPATH_FLUSH_SEND,
-};
-
-struct ipath_ucontext {
-       struct ib_ucontext ibucontext;
-};
-
-static inline struct ipath_ucontext *to_iucontext(struct ib_ucontext
-                                                 *ibucontext)
-{
-       return container_of(ibucontext, struct ipath_ucontext, ibucontext);
-}
-
-/*
- * Translate ib_wr_opcode into ib_wc_opcode.
- */
-const enum ib_wc_opcode ib_ipath_wc_opcode[] = {
-       [IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE,
-       [IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
-       [IB_WR_SEND] = IB_WC_SEND,
-       [IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
-       [IB_WR_RDMA_READ] = IB_WC_RDMA_READ,
-       [IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP,
-       [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD
-};
-
-/*
- * System image GUID.
- */
-static __be64 sys_image_guid;
-
-/**
- * ipath_copy_sge - copy data to SGE memory
- * @ss: the SGE state
- * @data: the data to copy
- * @length: the length of the data
- */
-void ipath_copy_sge(struct ipath_sge_state *ss, void *data, u32 length)
-{
-       struct ipath_sge *sge = &ss->sge;
-
-       while (length) {
-               u32 len = sge->length;
-
-               if (len > length)
-                       len = length;
-               if (len > sge->sge_length)
-                       len = sge->sge_length;
-               BUG_ON(len == 0);
-               memcpy(sge->vaddr, data, len);
-               sge->vaddr += len;
-               sge->length -= len;
-               sge->sge_length -= len;
-               if (sge->sge_length == 0) {
-                       if (--ss->num_sge)
-                               *sge = *ss->sg_list++;
-               } else if (sge->length == 0 && sge->mr != NULL) {
-                       if (++sge->n >= IPATH_SEGSZ) {
-                               if (++sge->m >= sge->mr->mapsz)
-                                       break;
-                               sge->n = 0;
-                       }
-                       sge->vaddr =
-                               sge->mr->map[sge->m]->segs[sge->n].vaddr;
-                       sge->length =
-                               sge->mr->map[sge->m]->segs[sge->n].length;
-               }
-               data += len;
-               length -= len;
-       }
-}
-
-/**
- * ipath_skip_sge - skip over SGE memory - XXX almost dup of prev func
- * @ss: the SGE state
- * @length: the number of bytes to skip
- */
-void ipath_skip_sge(struct ipath_sge_state *ss, u32 length)
-{
-       struct ipath_sge *sge = &ss->sge;
-
-       while (length) {
-               u32 len = sge->length;
-
-               if (len > length)
-                       len = length;
-               if (len > sge->sge_length)
-                       len = sge->sge_length;
-               BUG_ON(len == 0);
-               sge->vaddr += len;
-               sge->length -= len;
-               sge->sge_length -= len;
-               if (sge->sge_length == 0) {
-                       if (--ss->num_sge)
-                               *sge = *ss->sg_list++;
-               } else if (sge->length == 0 && sge->mr != NULL) {
-                       if (++sge->n >= IPATH_SEGSZ) {
-                               if (++sge->m >= sge->mr->mapsz)
-                                       break;
-                               sge->n = 0;
-                       }
-                       sge->vaddr =
-                               sge->mr->map[sge->m]->segs[sge->n].vaddr;
-                       sge->length =
-                               sge->mr->map[sge->m]->segs[sge->n].length;
-               }
-               length -= len;
-       }
-}
-
-/*
- * Count the number of DMA descriptors needed to send length bytes of data.
- * Don't modify the ipath_sge_state to get the count.
- * Return zero if any of the segments is not aligned.
- */
-static u32 ipath_count_sge(struct ipath_sge_state *ss, u32 length)
-{
-       struct ipath_sge *sg_list = ss->sg_list;
-       struct ipath_sge sge = ss->sge;
-       u8 num_sge = ss->num_sge;
-       u32 ndesc = 1;  /* count the header */
-
-       while (length) {
-               u32 len = sge.length;
-
-               if (len > length)
-                       len = length;
-               if (len > sge.sge_length)
-                       len = sge.sge_length;
-               BUG_ON(len == 0);
-               if (((long) sge.vaddr & (sizeof(u32) - 1)) ||
-                   (len != length && (len & (sizeof(u32) - 1)))) {
-                       ndesc = 0;
-                       break;
-               }
-               ndesc++;
-               sge.vaddr += len;
-               sge.length -= len;
-               sge.sge_length -= len;
-               if (sge.sge_length == 0) {
-                       if (--num_sge)
-                               sge = *sg_list++;
-               } else if (sge.length == 0 && sge.mr != NULL) {
-                       if (++sge.n >= IPATH_SEGSZ) {
-                               if (++sge.m >= sge.mr->mapsz)
-                                       break;
-                               sge.n = 0;
-                       }
-                       sge.vaddr =
-                               sge.mr->map[sge.m]->segs[sge.n].vaddr;
-                       sge.length =
-                               sge.mr->map[sge.m]->segs[sge.n].length;
-               }
-               length -= len;
-       }
-       return ndesc;
-}
-
-/*
- * Copy from the SGEs to the data buffer.
- */
-static void ipath_copy_from_sge(void *data, struct ipath_sge_state *ss,
-                               u32 length)
-{
-       struct ipath_sge *sge = &ss->sge;
-
-       while (length) {
-               u32 len = sge->length;
-
-               if (len > length)
-                       len = length;
-               if (len > sge->sge_length)
-                       len = sge->sge_length;
-               BUG_ON(len == 0);
-               memcpy(data, sge->vaddr, len);
-               sge->vaddr += len;
-               sge->length -= len;
-               sge->sge_length -= len;
-               if (sge->sge_length == 0) {
-                       if (--ss->num_sge)
-                               *sge = *ss->sg_list++;
-               } else if (sge->length == 0 && sge->mr != NULL) {
-                       if (++sge->n >= IPATH_SEGSZ) {
-                               if (++sge->m >= sge->mr->mapsz)
-                                       break;
-                               sge->n = 0;
-                       }
-                       sge->vaddr =
-                               sge->mr->map[sge->m]->segs[sge->n].vaddr;
-                       sge->length =
-                               sge->mr->map[sge->m]->segs[sge->n].length;
-               }
-               data += len;
-               length -= len;
-       }
-}
-
-/**
- * ipath_post_one_send - post one RC, UC, or UD send work request
- * @qp: the QP to post on
- * @wr: the work request to send
- */
-static int ipath_post_one_send(struct ipath_qp *qp, struct ib_send_wr *wr)
-{
-       struct ipath_swqe *wqe;
-       u32 next;
-       int i;
-       int j;
-       int acc;
-       int ret;
-       unsigned long flags;
-       struct ipath_devdata *dd = to_idev(qp->ibqp.device)->dd;
-
-       spin_lock_irqsave(&qp->s_lock, flags);
-
-       if (qp->ibqp.qp_type != IB_QPT_SMI &&
-           !(dd->ipath_flags & IPATH_LINKACTIVE)) {
-               ret = -ENETDOWN;
-               goto bail;
-       }
-
-       /* Check that state is OK to post send. */
-       if (unlikely(!(ib_ipath_state_ops[qp->state] & IPATH_POST_SEND_OK)))
-               goto bail_inval;
-
-       /* IB spec says that num_sge == 0 is OK. */
-       if (wr->num_sge > qp->s_max_sge)
-               goto bail_inval;
-
-       /*
-        * Don't allow RDMA reads or atomic operations on UC or
-        * undefined operations.
-        * Make sure buffer is large enough to hold the result for atomics.
-        */
-       if (qp->ibqp.qp_type == IB_QPT_UC) {
-               if ((unsigned) wr->opcode >= IB_WR_RDMA_READ)
-                       goto bail_inval;
-       } else if (qp->ibqp.qp_type == IB_QPT_UD) {
-               /* Check UD opcode */
-               if (wr->opcode != IB_WR_SEND &&
-                   wr->opcode != IB_WR_SEND_WITH_IMM)
-                       goto bail_inval;
-               /* Check UD destination address PD */
-               if (qp->ibqp.pd != wr->wr.ud.ah->pd)
-                       goto bail_inval;
-       } else if ((unsigned) wr->opcode > IB_WR_ATOMIC_FETCH_AND_ADD)
-               goto bail_inval;
-       else if (wr->opcode >= IB_WR_ATOMIC_CMP_AND_SWP &&
-                  (wr->num_sge == 0 ||
-                   wr->sg_list[0].length < sizeof(u64) ||
-                   wr->sg_list[0].addr & (sizeof(u64) - 1)))
-               goto bail_inval;
-       else if (wr->opcode >= IB_WR_RDMA_READ && !qp->s_max_rd_atomic)
-               goto bail_inval;
-
-       next = qp->s_head + 1;
-       if (next >= qp->s_size)
-               next = 0;
-       if (next == qp->s_last) {
-               ret = -ENOMEM;
-               goto bail;
-       }
-
-       wqe = get_swqe_ptr(qp, qp->s_head);
-       wqe->wr = *wr;
-       wqe->length = 0;
-       if (wr->num_sge) {
-               acc = wr->opcode >= IB_WR_RDMA_READ ?
-                       IB_ACCESS_LOCAL_WRITE : 0;
-               for (i = 0, j = 0; i < wr->num_sge; i++) {
-                       u32 length = wr->sg_list[i].length;
-                       int ok;
-
-                       if (length == 0)
-                               continue;
-                       ok = ipath_lkey_ok(qp, &wqe->sg_list[j],
-                                          &wr->sg_list[i], acc);
-                       if (!ok)
-                               goto bail_inval;
-                       wqe->length += length;
-                       j++;
-               }
-               wqe->wr.num_sge = j;
-       }
-       if (qp->ibqp.qp_type == IB_QPT_UC ||
-           qp->ibqp.qp_type == IB_QPT_RC) {
-               if (wqe->length > 0x80000000U)
-                       goto bail_inval;
-       } else if (wqe->length > to_idev(qp->ibqp.device)->dd->ipath_ibmtu)
-               goto bail_inval;
-       wqe->ssn = qp->s_ssn++;
-       qp->s_head = next;
-
-       ret = 0;
-       goto bail;
-
-bail_inval:
-       ret = -EINVAL;
-bail:
-       spin_unlock_irqrestore(&qp->s_lock, flags);
-       return ret;
-}
-
-/**
- * ipath_post_send - post a send on a QP
- * @ibqp: the QP to post the send on
- * @wr: the list of work requests to post
- * @bad_wr: the first bad WR is put here
- *
- * This may be called from interrupt context.
- */
-static int ipath_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
-                          struct ib_send_wr **bad_wr)
-{
-       struct ipath_qp *qp = to_iqp(ibqp);
-       int err = 0;
-
-       for (; wr; wr = wr->next) {
-               err = ipath_post_one_send(qp, wr);
-               if (err) {
-                       *bad_wr = wr;
-                       goto bail;
-               }
-       }
-
-       /* Try to do the send work in the caller's context. */
-       ipath_do_send((unsigned long) qp);
-
-bail:
-       return err;
-}
-
-/**
- * ipath_post_receive - post a receive on a QP
- * @ibqp: the QP to post the receive on
- * @wr: the WR to post
- * @bad_wr: the first bad WR is put here
- *
- * This may be called from interrupt context.
- */
-static int ipath_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
-                             struct ib_recv_wr **bad_wr)
-{
-       struct ipath_qp *qp = to_iqp(ibqp);
-       struct ipath_rwq *wq = qp->r_rq.wq;
-       unsigned long flags;
-       int ret;
-
-       /* Check that state is OK to post receive. */
-       if (!(ib_ipath_state_ops[qp->state] & IPATH_POST_RECV_OK) || !wq) {
-               *bad_wr = wr;
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       for (; wr; wr = wr->next) {
-               struct ipath_rwqe *wqe;
-               u32 next;
-               int i;
-
-               if ((unsigned) wr->num_sge > qp->r_rq.max_sge) {
-                       *bad_wr = wr;
-                       ret = -EINVAL;
-                       goto bail;
-               }
-
-               spin_lock_irqsave(&qp->r_rq.lock, flags);
-               next = wq->head + 1;
-               if (next >= qp->r_rq.size)
-                       next = 0;
-               if (next == wq->tail) {
-                       spin_unlock_irqrestore(&qp->r_rq.lock, flags);
-                       *bad_wr = wr;
-                       ret = -ENOMEM;
-                       goto bail;
-               }
-
-               wqe = get_rwqe_ptr(&qp->r_rq, wq->head);
-               wqe->wr_id = wr->wr_id;
-               wqe->num_sge = wr->num_sge;
-               for (i = 0; i < wr->num_sge; i++)
-                       wqe->sg_list[i] = wr->sg_list[i];
-               /* Make sure queue entry is written before the head index. */
-               smp_wmb();
-               wq->head = next;
-               spin_unlock_irqrestore(&qp->r_rq.lock, flags);
-       }
-       ret = 0;
-
-bail:
-       return ret;
-}
-
-/**
- * ipath_qp_rcv - processing an incoming packet on a QP
- * @dev: the device the packet came on
- * @hdr: the packet header
- * @has_grh: true if the packet has a GRH
- * @data: the packet data
- * @tlen: the packet length
- * @qp: the QP the packet came on
- *
- * This is called from ipath_ib_rcv() to process an incoming packet
- * for the given QP.
- * Called at interrupt level.
- */
-static void ipath_qp_rcv(struct ipath_ibdev *dev,
-                        struct ipath_ib_header *hdr, int has_grh,
-                        void *data, u32 tlen, struct ipath_qp *qp)
-{
-       /* Check for valid receive state. */
-       if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) {
-               dev->n_pkt_drops++;
-               return;
-       }
-
-       switch (qp->ibqp.qp_type) {
-       case IB_QPT_SMI:
-       case IB_QPT_GSI:
-               if (ib_ipath_disable_sma)
-                       break;
-               /* FALLTHROUGH */
-       case IB_QPT_UD:
-               ipath_ud_rcv(dev, hdr, has_grh, data, tlen, qp);
-               break;
-
-       case IB_QPT_RC:
-               ipath_rc_rcv(dev, hdr, has_grh, data, tlen, qp);
-               break;
-
-       case IB_QPT_UC:
-               ipath_uc_rcv(dev, hdr, has_grh, data, tlen, qp);
-               break;
-
-       default:
-               break;
-       }
-}
-
-/**
- * ipath_ib_rcv - process an incoming packet
- * @arg: the device pointer
- * @rhdr: the header of the packet
- * @data: the packet data
- * @tlen: the packet length
- *
- * This is called from ipath_kreceive() to process an incoming packet at
- * interrupt level. Tlen is the length of the header + data + CRC in bytes.
- */
-void ipath_ib_rcv(struct ipath_ibdev *dev, void *rhdr, void *data,
-                 u32 tlen)
-{
-       struct ipath_ib_header *hdr = rhdr;
-       struct ipath_other_headers *ohdr;
-       struct ipath_qp *qp;
-       u32 qp_num;
-       int lnh;
-       u8 opcode;
-       u16 lid;
-
-       if (unlikely(dev == NULL))
-               goto bail;
-
-       if (unlikely(tlen < 24)) {      /* LRH+BTH+CRC */
-               dev->rcv_errors++;
-               goto bail;
-       }
-
-       /* Check for a valid destination LID (see ch. 7.11.1). */
-       lid = be16_to_cpu(hdr->lrh[1]);
-       if (lid < IPATH_MULTICAST_LID_BASE) {
-               lid &= ~((1 << dev->dd->ipath_lmc) - 1);
-               if (unlikely(lid != dev->dd->ipath_lid)) {
-                       dev->rcv_errors++;
-                       goto bail;
-               }
-       }
-
-       /* Check for GRH */
-       lnh = be16_to_cpu(hdr->lrh[0]) & 3;
-       if (lnh == IPATH_LRH_BTH)
-               ohdr = &hdr->u.oth;
-       else if (lnh == IPATH_LRH_GRH)
-               ohdr = &hdr->u.l.oth;
-       else {
-               dev->rcv_errors++;
-               goto bail;
-       }
-
-       opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0x7f;
-       dev->opstats[opcode].n_bytes += tlen;
-       dev->opstats[opcode].n_packets++;
-
-       /* Get the destination QP number. */
-       qp_num = be32_to_cpu(ohdr->bth[1]) & IPATH_QPN_MASK;
-       if (qp_num == IPATH_MULTICAST_QPN) {
-               struct ipath_mcast *mcast;
-               struct ipath_mcast_qp *p;
-
-               if (lnh != IPATH_LRH_GRH) {
-                       dev->n_pkt_drops++;
-                       goto bail;
-               }
-               mcast = ipath_mcast_find(&hdr->u.l.grh.dgid);
-               if (mcast == NULL) {
-                       dev->n_pkt_drops++;
-                       goto bail;
-               }
-               dev->n_multicast_rcv++;
-               list_for_each_entry_rcu(p, &mcast->qp_list, list)
-                       ipath_qp_rcv(dev, hdr, 1, data, tlen, p->qp);
-               /*
-                * Notify ipath_multicast_detach() if it is waiting for us
-                * to finish.
-                */
-               if (atomic_dec_return(&mcast->refcount) <= 1)
-                       wake_up(&mcast->wait);
-       } else {
-               qp = ipath_lookup_qpn(&dev->qp_table, qp_num);
-               if (qp) {
-                       dev->n_unicast_rcv++;
-                       ipath_qp_rcv(dev, hdr, lnh == IPATH_LRH_GRH, data,
-                                    tlen, qp);
-                       /*
-                        * Notify ipath_destroy_qp() if it is waiting
-                        * for us to finish.
-                        */
-                       if (atomic_dec_and_test(&qp->refcount))
-                               wake_up(&qp->wait);
-               } else
-                       dev->n_pkt_drops++;
-       }
-
-bail:;
-}
-
-/**
- * ipath_ib_timer - verbs timer
- * @arg: the device pointer
- *
- * This is called from ipath_do_rcv_timer() at interrupt level to check for
- * QPs which need retransmits and to collect performance numbers.
- */
-static void ipath_ib_timer(struct ipath_ibdev *dev)
-{
-       struct ipath_qp *resend = NULL;
-       struct ipath_qp *rnr = NULL;
-       struct list_head *last;
-       struct ipath_qp *qp;
-       unsigned long flags;
-
-       if (dev == NULL)
-               return;
-
-       spin_lock_irqsave(&dev->pending_lock, flags);
-       /* Start filling the next pending queue. */
-       if (++dev->pending_index >= ARRAY_SIZE(dev->pending))
-               dev->pending_index = 0;
-       /* Save any requests still in the new queue, they have timed out. */
-       last = &dev->pending[dev->pending_index];
-       while (!list_empty(last)) {
-               qp = list_entry(last->next, struct ipath_qp, timerwait);
-               list_del_init(&qp->timerwait);
-               qp->timer_next = resend;
-               resend = qp;
-               atomic_inc(&qp->refcount);
-       }
-       last = &dev->rnrwait;
-       if (!list_empty(last)) {
-               qp = list_entry(last->next, struct ipath_qp, timerwait);
-               if (--qp->s_rnr_timeout == 0) {
-                       do {
-                               list_del_init(&qp->timerwait);
-                               qp->timer_next = rnr;
-                               rnr = qp;
-                               atomic_inc(&qp->refcount);
-                               if (list_empty(last))
-                                       break;
-                               qp = list_entry(last->next, struct ipath_qp,
-                                               timerwait);
-                       } while (qp->s_rnr_timeout == 0);
-               }
-       }
-       /*
-        * We should only be in the started state if pma_sample_start != 0
-        */
-       if (dev->pma_sample_status == IB_PMA_SAMPLE_STATUS_STARTED &&
-           --dev->pma_sample_start == 0) {
-               dev->pma_sample_status = IB_PMA_SAMPLE_STATUS_RUNNING;
-               ipath_snapshot_counters(dev->dd, &dev->ipath_sword,
-                                       &dev->ipath_rword,
-                                       &dev->ipath_spkts,
-                                       &dev->ipath_rpkts,
-                                       &dev->ipath_xmit_wait);
-       }
-       if (dev->pma_sample_status == IB_PMA_SAMPLE_STATUS_RUNNING) {
-               if (dev->pma_sample_interval == 0) {
-                       u64 ta, tb, tc, td, te;
-
-                       dev->pma_sample_status = IB_PMA_SAMPLE_STATUS_DONE;
-                       ipath_snapshot_counters(dev->dd, &ta, &tb,
-                                               &tc, &td, &te);
-
-                       dev->ipath_sword = ta - dev->ipath_sword;
-                       dev->ipath_rword = tb - dev->ipath_rword;
-                       dev->ipath_spkts = tc - dev->ipath_spkts;
-                       dev->ipath_rpkts = td - dev->ipath_rpkts;
-                       dev->ipath_xmit_wait = te - dev->ipath_xmit_wait;
-               }
-               else
-                       dev->pma_sample_interval--;
-       }
-       spin_unlock_irqrestore(&dev->pending_lock, flags);
-
-       /* XXX What if timer fires again while this is running? */
-       while (resend != NULL) {
-               qp = resend;
-               resend = qp->timer_next;
-
-               spin_lock_irqsave(&qp->s_lock, flags);
-               if (qp->s_last != qp->s_tail &&
-                   ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) {
-                       dev->n_timeouts++;
-                       ipath_restart_rc(qp, qp->s_last_psn + 1);
-               }
-               spin_unlock_irqrestore(&qp->s_lock, flags);
-
-               /* Notify ipath_destroy_qp() if it is waiting. */
-               if (atomic_dec_and_test(&qp->refcount))
-                       wake_up(&qp->wait);
-       }
-       while (rnr != NULL) {
-               qp = rnr;
-               rnr = qp->timer_next;
-
-               spin_lock_irqsave(&qp->s_lock, flags);
-               if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)
-                       ipath_schedule_send(qp);
-               spin_unlock_irqrestore(&qp->s_lock, flags);
-
-               /* Notify ipath_destroy_qp() if it is waiting. */
-               if (atomic_dec_and_test(&qp->refcount))
-                       wake_up(&qp->wait);
-       }
-}
-
-static void update_sge(struct ipath_sge_state *ss, u32 length)
-{
-       struct ipath_sge *sge = &ss->sge;
-
-       sge->vaddr += length;
-       sge->length -= length;
-       sge->sge_length -= length;
-       if (sge->sge_length == 0) {
-               if (--ss->num_sge)
-                       *sge = *ss->sg_list++;
-       } else if (sge->length == 0 && sge->mr != NULL) {
-               if (++sge->n >= IPATH_SEGSZ) {
-                       if (++sge->m >= sge->mr->mapsz)
-                               return;
-                       sge->n = 0;
-               }
-               sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
-               sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
-       }
-}
-
-#ifdef __LITTLE_ENDIAN
-static inline u32 get_upper_bits(u32 data, u32 shift)
-{
-       return data >> shift;
-}
-
-static inline u32 set_upper_bits(u32 data, u32 shift)
-{
-       return data << shift;
-}
-
-static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off)
-{
-       data <<= ((sizeof(u32) - n) * BITS_PER_BYTE);
-       data >>= ((sizeof(u32) - n - off) * BITS_PER_BYTE);
-       return data;
-}
-#else
-static inline u32 get_upper_bits(u32 data, u32 shift)
-{
-       return data << shift;
-}
-
-static inline u32 set_upper_bits(u32 data, u32 shift)
-{
-       return data >> shift;
-}
-
-static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off)
-{
-       data >>= ((sizeof(u32) - n) * BITS_PER_BYTE);
-       data <<= ((sizeof(u32) - n - off) * BITS_PER_BYTE);
-       return data;
-}
-#endif
-
-static void copy_io(u32 __iomem *piobuf, struct ipath_sge_state *ss,
-                   u32 length, unsigned flush_wc)
-{
-       u32 extra = 0;
-       u32 data = 0;
-       u32 last;
-
-       while (1) {
-               u32 len = ss->sge.length;
-               u32 off;
-
-               if (len > length)
-                       len = length;
-               if (len > ss->sge.sge_length)
-                       len = ss->sge.sge_length;
-               BUG_ON(len == 0);
-               /* If the source address is not aligned, try to align it. */
-               off = (unsigned long)ss->sge.vaddr & (sizeof(u32) - 1);
-               if (off) {
-                       u32 *addr = (u32 *)((unsigned long)ss->sge.vaddr &
-                                           ~(sizeof(u32) - 1));
-                       u32 v = get_upper_bits(*addr, off * BITS_PER_BYTE);
-                       u32 y;
-
-                       y = sizeof(u32) - off;
-                       if (len > y)
-                               len = y;
-                       if (len + extra >= sizeof(u32)) {
-                               data |= set_upper_bits(v, extra *
-                                                      BITS_PER_BYTE);
-                               len = sizeof(u32) - extra;
-                               if (len == length) {
-                                       last = data;
-                                       break;
-                               }
-                               __raw_writel(data, piobuf);
-                               piobuf++;
-                               extra = 0;
-                               data = 0;
-                       } else {
-                               /* Clear unused upper bytes */
-                               data |= clear_upper_bytes(v, len, extra);
-                               if (len == length) {
-                                       last = data;
-                                       break;
-                               }
-                               extra += len;
-                       }
-               } else if (extra) {
-                       /* Source address is aligned. */
-                       u32 *addr = (u32 *) ss->sge.vaddr;
-                       int shift = extra * BITS_PER_BYTE;
-                       int ushift = 32 - shift;
-                       u32 l = len;
-
-                       while (l >= sizeof(u32)) {
-                               u32 v = *addr;
-
-                               data |= set_upper_bits(v, shift);
-                               __raw_writel(data, piobuf);
-                               data = get_upper_bits(v, ushift);
-                               piobuf++;
-                               addr++;
-                               l -= sizeof(u32);
-                       }
-                       /*
-                        * We still have 'extra' number of bytes leftover.
-                        */
-                       if (l) {
-                               u32 v = *addr;
-
-                               if (l + extra >= sizeof(u32)) {
-                                       data |= set_upper_bits(v, shift);
-                                       len -= l + extra - sizeof(u32);
-                                       if (len == length) {
-                                               last = data;
-                                               break;
-                                       }
-                                       __raw_writel(data, piobuf);
-                                       piobuf++;
-                                       extra = 0;
-                                       data = 0;
-                               } else {
-                                       /* Clear unused upper bytes */
-                                       data |= clear_upper_bytes(v, l,
-                                                                 extra);
-                                       if (len == length) {
-                                               last = data;
-                                               break;
-                                       }
-                                       extra += l;
-                               }
-                       } else if (len == length) {
-                               last = data;
-                               break;
-                       }
-               } else if (len == length) {
-                       u32 w;
-
-                       /*
-                        * Need to round up for the last dword in the
-                        * packet.
-                        */
-                       w = (len + 3) >> 2;
-                       __iowrite32_copy(piobuf, ss->sge.vaddr, w - 1);
-                       piobuf += w - 1;
-                       last = ((u32 *) ss->sge.vaddr)[w - 1];
-                       break;
-               } else {
-                       u32 w = len >> 2;
-
-                       __iowrite32_copy(piobuf, ss->sge.vaddr, w);
-                       piobuf += w;
-
-                       extra = len & (sizeof(u32) - 1);
-                       if (extra) {
-                               u32 v = ((u32 *) ss->sge.vaddr)[w];
-
-                               /* Clear unused upper bytes */
-                               data = clear_upper_bytes(v, extra, 0);
-                       }
-               }
-               update_sge(ss, len);
-               length -= len;
-       }
-       /* Update address before sending packet. */
-       update_sge(ss, length);
-       if (flush_wc) {
-               /* must flush early everything before trigger word */
-               ipath_flush_wc();
-               __raw_writel(last, piobuf);
-               /* be sure trigger word is written */
-               ipath_flush_wc();
-       } else
-               __raw_writel(last, piobuf);
-}
-
-/*
- * Convert IB rate to delay multiplier.
- */
-unsigned ipath_ib_rate_to_mult(enum ib_rate rate)
-{
-       switch (rate) {
-       case IB_RATE_2_5_GBPS: return 8;
-       case IB_RATE_5_GBPS:   return 4;
-       case IB_RATE_10_GBPS:  return 2;
-       case IB_RATE_20_GBPS:  return 1;
-       default:               return 0;
-       }
-}
-
-/*
- * Convert delay multiplier to IB rate
- */
-static enum ib_rate ipath_mult_to_ib_rate(unsigned mult)
-{
-       switch (mult) {
-       case 8:  return IB_RATE_2_5_GBPS;
-       case 4:  return IB_RATE_5_GBPS;
-       case 2:  return IB_RATE_10_GBPS;
-       case 1:  return IB_RATE_20_GBPS;
-       default: return IB_RATE_PORT_CURRENT;
-       }
-}
-
-static inline struct ipath_verbs_txreq *get_txreq(struct ipath_ibdev *dev)
-{
-       struct ipath_verbs_txreq *tx = NULL;
-       unsigned long flags;
-
-       spin_lock_irqsave(&dev->pending_lock, flags);
-       if (!list_empty(&dev->txreq_free)) {
-               struct list_head *l = dev->txreq_free.next;
-
-               list_del(l);
-               tx = list_entry(l, struct ipath_verbs_txreq, txreq.list);
-       }
-       spin_unlock_irqrestore(&dev->pending_lock, flags);
-       return tx;
-}
-
-static inline void put_txreq(struct ipath_ibdev *dev,
-                            struct ipath_verbs_txreq *tx)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&dev->pending_lock, flags);
-       list_add(&tx->txreq.list, &dev->txreq_free);
-       spin_unlock_irqrestore(&dev->pending_lock, flags);
-}
-
-static void sdma_complete(void *cookie, int status)
-{
-       struct ipath_verbs_txreq *tx = cookie;
-       struct ipath_qp *qp = tx->qp;
-       struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
-       unsigned long flags;
-       enum ib_wc_status ibs = status == IPATH_SDMA_TXREQ_S_OK ?
-               IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR;
-
-       if (atomic_dec_and_test(&qp->s_dma_busy)) {
-               spin_lock_irqsave(&qp->s_lock, flags);
-               if (tx->wqe)
-                       ipath_send_complete(qp, tx->wqe, ibs);
-               if ((ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND &&
-                    qp->s_last != qp->s_head) ||
-                   (qp->s_flags & IPATH_S_WAIT_DMA))
-                       ipath_schedule_send(qp);
-               spin_unlock_irqrestore(&qp->s_lock, flags);
-               wake_up(&qp->wait_dma);
-       } else if (tx->wqe) {
-               spin_lock_irqsave(&qp->s_lock, flags);
-               ipath_send_complete(qp, tx->wqe, ibs);
-               spin_unlock_irqrestore(&qp->s_lock, flags);
-       }
-
-       if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_FREEBUF)
-               kfree(tx->txreq.map_addr);
-       put_txreq(dev, tx);
-
-       if (atomic_dec_and_test(&qp->refcount))
-               wake_up(&qp->wait);
-}
-
-static void decrement_dma_busy(struct ipath_qp *qp)
-{
-       unsigned long flags;
-
-       if (atomic_dec_and_test(&qp->s_dma_busy)) {
-               spin_lock_irqsave(&qp->s_lock, flags);
-               if ((ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND &&
-                    qp->s_last != qp->s_head) ||
-                   (qp->s_flags & IPATH_S_WAIT_DMA))
-                       ipath_schedule_send(qp);
-               spin_unlock_irqrestore(&qp->s_lock, flags);
-               wake_up(&qp->wait_dma);
-       }
-}
-
-/*
- * Compute the number of clock cycles of delay before sending the next packet.
- * The multipliers reflect the number of clocks for the fastest rate so
- * one tick at 4xDDR is 8 ticks at 1xSDR.
- * If the destination port will take longer to receive a packet than
- * the outgoing link can send it, we need to delay sending the next packet
- * by the difference in time it takes the receiver to receive and the sender
- * to send this packet.
- * Note that this delay is always correct for UC and RC but not always
- * optimal for UD. For UD, the destination HCA can be different for each
- * packet, in which case, we could send packets to a different destination
- * while "waiting" for the delay. The overhead for doing this without
- * HW support is more than just paying the cost of delaying some packets
- * unnecessarily.
- */
-static inline unsigned ipath_pkt_delay(u32 plen, u8 snd_mult, u8 rcv_mult)
-{
-       return (rcv_mult > snd_mult) ?
-               (plen * (rcv_mult - snd_mult) + 1) >> 1 : 0;
-}
-
-static int ipath_verbs_send_dma(struct ipath_qp *qp,
-                               struct ipath_ib_header *hdr, u32 hdrwords,
-                               struct ipath_sge_state *ss, u32 len,
-                               u32 plen, u32 dwords)
-{
-       struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
-       struct ipath_devdata *dd = dev->dd;
-       struct ipath_verbs_txreq *tx;
-       u32 *piobuf;
-       u32 control;
-       u32 ndesc;
-       int ret;
-
-       tx = qp->s_tx;
-       if (tx) {
-               qp->s_tx = NULL;
-               /* resend previously constructed packet */
-               atomic_inc(&qp->s_dma_busy);
-               ret = ipath_sdma_verbs_send(dd, tx->ss, tx->len, tx);
-               if (ret) {
-                       qp->s_tx = tx;
-                       decrement_dma_busy(qp);
-               }
-               goto bail;
-       }
-
-       tx = get_txreq(dev);
-       if (!tx) {
-               ret = -EBUSY;
-               goto bail;
-       }
-
-       /*
-        * Get the saved delay count we computed for the previous packet
-        * and save the delay count for this packet to be used next time
-        * we get here.
-        */
-       control = qp->s_pkt_delay;
-       qp->s_pkt_delay = ipath_pkt_delay(plen, dd->delay_mult, qp->s_dmult);
-
-       tx->qp = qp;
-       atomic_inc(&qp->refcount);
-       tx->wqe = qp->s_wqe;
-       tx->txreq.callback = sdma_complete;
-       tx->txreq.callback_cookie = tx;
-       tx->txreq.flags = IPATH_SDMA_TXREQ_F_HEADTOHOST |
-               IPATH_SDMA_TXREQ_F_INTREQ | IPATH_SDMA_TXREQ_F_FREEDESC;
-       if (plen + 1 >= IPATH_SMALLBUF_DWORDS)
-               tx->txreq.flags |= IPATH_SDMA_TXREQ_F_USELARGEBUF;
-
-       /* VL15 packets bypass credit check */
-       if ((be16_to_cpu(hdr->lrh[0]) >> 12) == 15) {
-               control |= 1ULL << 31;
-               tx->txreq.flags |= IPATH_SDMA_TXREQ_F_VL15;
-       }
-
-       if (len) {
-               /*
-                * Don't try to DMA if it takes more descriptors than
-                * the queue holds.
-                */
-               ndesc = ipath_count_sge(ss, len);
-               if (ndesc >= dd->ipath_sdma_descq_cnt)
-                       ndesc = 0;
-       } else
-               ndesc = 1;
-       if (ndesc) {
-               tx->hdr.pbc[0] = cpu_to_le32(plen);
-               tx->hdr.pbc[1] = cpu_to_le32(control);
-               memcpy(&tx->hdr.hdr, hdr, hdrwords << 2);
-               tx->txreq.sg_count = ndesc;
-               tx->map_len = (hdrwords + 2) << 2;
-               tx->txreq.map_addr = &tx->hdr;
-               atomic_inc(&qp->s_dma_busy);
-               ret = ipath_sdma_verbs_send(dd, ss, dwords, tx);
-               if (ret) {
-                       /* save ss and length in dwords */
-                       tx->ss = ss;
-                       tx->len = dwords;
-                       qp->s_tx = tx;
-                       decrement_dma_busy(qp);
-               }
-               goto bail;
-       }
-
-       /* Allocate a buffer and copy the header and payload to it. */
-       tx->map_len = (plen + 1) << 2;
-       piobuf = kmalloc(tx->map_len, GFP_ATOMIC);
-       if (unlikely(piobuf == NULL)) {
-               ret = -EBUSY;
-               goto err_tx;
-       }
-       tx->txreq.map_addr = piobuf;
-       tx->txreq.flags |= IPATH_SDMA_TXREQ_F_FREEBUF;
-       tx->txreq.sg_count = 1;
-
-       *piobuf++ = (__force u32) cpu_to_le32(plen);
-       *piobuf++ = (__force u32) cpu_to_le32(control);
-       memcpy(piobuf, hdr, hdrwords << 2);
-       ipath_copy_from_sge(piobuf + hdrwords, ss, len);
-
-       atomic_inc(&qp->s_dma_busy);
-       ret = ipath_sdma_verbs_send(dd, NULL, 0, tx);
-       /*
-        * If we couldn't queue the DMA request, save the info
-        * and try again later rather than destroying the
-        * buffer and undoing the side effects of the copy.
-        */
-       if (ret) {
-               tx->ss = NULL;
-               tx->len = 0;
-               qp->s_tx = tx;
-               decrement_dma_busy(qp);
-       }
-       dev->n_unaligned++;
-       goto bail;
-
-err_tx:
-       if (atomic_dec_and_test(&qp->refcount))
-               wake_up(&qp->wait);
-       put_txreq(dev, tx);
-bail:
-       return ret;
-}
-
-static int ipath_verbs_send_pio(struct ipath_qp *qp,
-                               struct ipath_ib_header *ibhdr, u32 hdrwords,
-                               struct ipath_sge_state *ss, u32 len,
-                               u32 plen, u32 dwords)
-{
-       struct ipath_devdata *dd = to_idev(qp->ibqp.device)->dd;
-       u32 *hdr = (u32 *) ibhdr;
-       u32 __iomem *piobuf;
-       unsigned flush_wc;
-       u32 control;
-       int ret;
-       unsigned long flags;
-
-       piobuf = ipath_getpiobuf(dd, plen, NULL);
-       if (unlikely(piobuf == NULL)) {
-               ret = -EBUSY;
-               goto bail;
-       }
-
-       /*
-        * Get the saved delay count we computed for the previous packet
-        * and save the delay count for this packet to be used next time
-        * we get here.
-        */
-       control = qp->s_pkt_delay;
-       qp->s_pkt_delay = ipath_pkt_delay(plen, dd->delay_mult, qp->s_dmult);
-
-       /* VL15 packets bypass credit check */
-       if ((be16_to_cpu(ibhdr->lrh[0]) >> 12) == 15)
-               control |= 1ULL << 31;
-
-       /*
-        * Write the length to the control qword plus any needed flags.
-        * We have to flush after the PBC for correctness on some cpus
-        * or WC buffer can be written out of order.
-        */
-       writeq(((u64) control << 32) | plen, piobuf);
-       piobuf += 2;
-
-       flush_wc = dd->ipath_flags & IPATH_PIO_FLUSH_WC;
-       if (len == 0) {
-               /*
-                * If there is just the header portion, must flush before
-                * writing last word of header for correctness, and after
-                * the last header word (trigger word).
-                */
-               if (flush_wc) {
-                       ipath_flush_wc();
-                       __iowrite32_copy(piobuf, hdr, hdrwords - 1);
-                       ipath_flush_wc();
-                       __raw_writel(hdr[hdrwords - 1], piobuf + hdrwords - 1);
-                       ipath_flush_wc();
-               } else
-                       __iowrite32_copy(piobuf, hdr, hdrwords);
-               goto done;
-       }
-
-       if (flush_wc)
-               ipath_flush_wc();
-       __iowrite32_copy(piobuf, hdr, hdrwords);
-       piobuf += hdrwords;
-
-       /* The common case is aligned and contained in one segment. */
-       if (likely(ss->num_sge == 1 && len <= ss->sge.length &&
-                  !((unsigned long)ss->sge.vaddr & (sizeof(u32) - 1)))) {
-               u32 *addr = (u32 *) ss->sge.vaddr;
-
-               /* Update address before sending packet. */
-               update_sge(ss, len);
-               if (flush_wc) {
-                       __iowrite32_copy(piobuf, addr, dwords - 1);
-                       /* must flush early everything before trigger word */
-                       ipath_flush_wc();
-                       __raw_writel(addr[dwords - 1], piobuf + dwords - 1);
-                       /* be sure trigger word is written */
-                       ipath_flush_wc();
-               } else
-                       __iowrite32_copy(piobuf, addr, dwords);
-               goto done;
-       }
-       copy_io(piobuf, ss, len, flush_wc);
-done:
-       if (qp->s_wqe) {
-               spin_lock_irqsave(&qp->s_lock, flags);
-               ipath_send_complete(qp, qp->s_wqe, IB_WC_SUCCESS);
-               spin_unlock_irqrestore(&qp->s_lock, flags);
-       }
-       ret = 0;
-bail:
-       return ret;
-}
-
-/**
- * ipath_verbs_send - send a packet
- * @qp: the QP to send on
- * @hdr: the packet header
- * @hdrwords: the number of 32-bit words in the header
- * @ss: the SGE to send
- * @len: the length of the packet in bytes
- */
-int ipath_verbs_send(struct ipath_qp *qp, struct ipath_ib_header *hdr,
-                    u32 hdrwords, struct ipath_sge_state *ss, u32 len)
-{
-       struct ipath_devdata *dd = to_idev(qp->ibqp.device)->dd;
-       u32 plen;
-       int ret;
-       u32 dwords = (len + 3) >> 2;
-
-       /*
-        * Calculate the send buffer trigger address.
-        * The +1 counts for the pbc control dword following the pbc length.
-        */
-       plen = hdrwords + dwords + 1;
-
-       /*
-        * VL15 packets (IB_QPT_SMI) will always use PIO, so we
-        * can defer SDMA restart until link goes ACTIVE without
-        * worrying about just how we got there.
-        */
-       if (qp->ibqp.qp_type == IB_QPT_SMI ||
-           !(dd->ipath_flags & IPATH_HAS_SEND_DMA))
-               ret = ipath_verbs_send_pio(qp, hdr, hdrwords, ss, len,
-                                          plen, dwords);
-       else
-               ret = ipath_verbs_send_dma(qp, hdr, hdrwords, ss, len,
-                                          plen, dwords);
-
-       return ret;
-}
-
-int ipath_snapshot_counters(struct ipath_devdata *dd, u64 *swords,
-                           u64 *rwords, u64 *spkts, u64 *rpkts,
-                           u64 *xmit_wait)
-{
-       int ret;
-
-       if (!(dd->ipath_flags & IPATH_INITTED)) {
-               /* no hardware, freeze, etc. */
-               ret = -EINVAL;
-               goto bail;
-       }
-       *swords = ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordsendcnt);
-       *rwords = ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordrcvcnt);
-       *spkts = ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktsendcnt);
-       *rpkts = ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktrcvcnt);
-       *xmit_wait = ipath_snap_cntr(dd, dd->ipath_cregs->cr_sendstallcnt);
-
-       ret = 0;
-
-bail:
-       return ret;
-}
-
-/**
- * ipath_get_counters - get various chip counters
- * @dd: the infinipath device
- * @cntrs: counters are placed here
- *
- * Return the counters needed by recv_pma_get_portcounters().
- */
-int ipath_get_counters(struct ipath_devdata *dd,
-                      struct ipath_verbs_counters *cntrs)
-{
-       struct ipath_cregs const *crp = dd->ipath_cregs;
-       int ret;
-
-       if (!(dd->ipath_flags & IPATH_INITTED)) {
-               /* no hardware, freeze, etc. */
-               ret = -EINVAL;
-               goto bail;
-       }
-       cntrs->symbol_error_counter =
-               ipath_snap_cntr(dd, crp->cr_ibsymbolerrcnt);
-       cntrs->link_error_recovery_counter =
-               ipath_snap_cntr(dd, crp->cr_iblinkerrrecovcnt);
-       /*
-        * The link downed counter counts when the other side downs the
-        * connection.  We add in the number of times we downed the link
-        * due to local link integrity errors to compensate.
-        */
-       cntrs->link_downed_counter =
-               ipath_snap_cntr(dd, crp->cr_iblinkdowncnt);
-       cntrs->port_rcv_errors =
-               ipath_snap_cntr(dd, crp->cr_rxdroppktcnt) +
-               ipath_snap_cntr(dd, crp->cr_rcvovflcnt) +
-               ipath_snap_cntr(dd, crp->cr_portovflcnt) +
-               ipath_snap_cntr(dd, crp->cr_err_rlencnt) +
-               ipath_snap_cntr(dd, crp->cr_invalidrlencnt) +
-               ipath_snap_cntr(dd, crp->cr_errlinkcnt) +
-               ipath_snap_cntr(dd, crp->cr_erricrccnt) +
-               ipath_snap_cntr(dd, crp->cr_errvcrccnt) +
-               ipath_snap_cntr(dd, crp->cr_errlpcrccnt) +
-               ipath_snap_cntr(dd, crp->cr_badformatcnt) +
-               dd->ipath_rxfc_unsupvl_errs;
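-       /*
-        * Counters this chip doesn't implement (creg offset of zero) fall
-        * back to counts kept in software, or to zero.
-        */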
-       if (crp->cr_rxotherlocalphyerrcnt)
-               cntrs->port_rcv_errors +=
-                       ipath_snap_cntr(dd, crp->cr_rxotherlocalphyerrcnt);
-       if (crp->cr_rxvlerrcnt)
-               cntrs->port_rcv_errors +=
-                       ipath_snap_cntr(dd, crp->cr_rxvlerrcnt);
-       cntrs->port_rcv_remphys_errors =
-               ipath_snap_cntr(dd, crp->cr_rcvebpcnt);
-       cntrs->port_xmit_discards = ipath_snap_cntr(dd, crp->cr_unsupvlcnt);
-       cntrs->port_xmit_data = ipath_snap_cntr(dd, crp->cr_wordsendcnt);
-       cntrs->port_rcv_data = ipath_snap_cntr(dd, crp->cr_wordrcvcnt);
-       cntrs->port_xmit_packets = ipath_snap_cntr(dd, crp->cr_pktsendcnt);
-       cntrs->port_rcv_packets = ipath_snap_cntr(dd, crp->cr_pktrcvcnt);
-       cntrs->local_link_integrity_errors =
-               crp->cr_locallinkintegrityerrcnt ?
-               ipath_snap_cntr(dd, crp->cr_locallinkintegrityerrcnt) :
-               ((dd->ipath_flags & IPATH_GPIO_ERRINTRS) ?
-                dd->ipath_lli_errs : dd->ipath_lli_errors);
-       cntrs->excessive_buffer_overrun_errors =
-               crp->cr_excessbufferovflcnt ?
-               ipath_snap_cntr(dd, crp->cr_excessbufferovflcnt) :
-               dd->ipath_overrun_thresh_errs;
-       cntrs->vl15_dropped = crp->cr_vl15droppedpktcnt ?
-               ipath_snap_cntr(dd, crp->cr_vl15droppedpktcnt) : 0;
-
-       ret = 0;
-
-bail:
-       return ret;
-}
-
-/**
- * ipath_ib_piobufavail - callback when a PIO buffer is available
- * @arg: the device pointer
- *
- * This is called from ipath_intr() at interrupt level when a PIO buffer is
- * available after ipath_verbs_send() returned an error that no buffers were
- * available.  For now we simply reschedule the send tasklet for every
- * waiting QP and always return zero.
- */
-int ipath_ib_piobufavail(struct ipath_ibdev *dev)
-{
-       struct list_head *list;
-       struct ipath_qp *qplist;
-       struct ipath_qp *qp;
-       unsigned long flags;
-
-       if (dev == NULL)
-               goto bail;
-
-       list = &dev->piowait;
-       qplist = NULL;
-
-       spin_lock_irqsave(&dev->pending_lock, flags);
-       while (!list_empty(list)) {
-               qp = list_entry(list->next, struct ipath_qp, piowait);
-               list_del_init(&qp->piowait);
-               qp->pio_next = qplist;
-               qplist = qp;
-               atomic_inc(&qp->refcount);
-       }
-       spin_unlock_irqrestore(&dev->pending_lock, flags);
-
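-       /*
-        * Walk the detached list without holding dev->pending_lock, taking
-        * only each qp->s_lock in turn.
-        */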
-       while (qplist != NULL) {
-               qp = qplist;
-               qplist = qp->pio_next;
-
-               spin_lock_irqsave(&qp->s_lock, flags);
-               if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)
-                       ipath_schedule_send(qp);
-               spin_unlock_irqrestore(&qp->s_lock, flags);
-
-               /* Notify ipath_destroy_qp() if it is waiting. */
-               if (atomic_dec_and_test(&qp->refcount))
-                       wake_up(&qp->wait);
-       }
-
-bail:
-       return 0;
-}
-
-static int ipath_query_device(struct ib_device *ibdev, struct ib_device_attr *props,
-                             struct ib_udata *uhw)
-{
-       struct ipath_ibdev *dev = to_idev(ibdev);
-
-       if (uhw->inlen || uhw->outlen)
-               return -EINVAL;
-
-       memset(props, 0, sizeof(*props));
-
-       props->device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR |
-               IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT |
-               IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN |
-               IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE;
-       props->page_size_cap = PAGE_SIZE;
-       props->vendor_id =
-               IPATH_SRC_OUI_1 << 16 | IPATH_SRC_OUI_2 << 8 | IPATH_SRC_OUI_3;
-       props->vendor_part_id = dev->dd->ipath_deviceid;
-       props->hw_ver = dev->dd->ipath_pcirev;
-
-       props->sys_image_guid = dev->sys_image_guid;
-
-       props->max_mr_size = ~0ull;
-       props->max_qp = ib_ipath_max_qps;
-       props->max_qp_wr = ib_ipath_max_qp_wrs;
-       props->max_sge = ib_ipath_max_sges;
-       props->max_cq = ib_ipath_max_cqs;
-       props->max_ah = ib_ipath_max_ahs;
-       props->max_cqe = ib_ipath_max_cqes;
-       props->max_mr = dev->lk_table.max;
-       props->max_fmr = dev->lk_table.max;
-       props->max_map_per_fmr = 32767;
-       props->max_pd = ib_ipath_max_pds;
-       props->max_qp_rd_atom = IPATH_MAX_RDMA_ATOMIC;
-       props->max_qp_init_rd_atom = 255;
-       /* props->max_res_rd_atom */
-       props->max_srq = ib_ipath_max_srqs;
-       props->max_srq_wr = ib_ipath_max_srq_wrs;
-       props->max_srq_sge = ib_ipath_max_srq_sges;
-       /* props->local_ca_ack_delay */
-       props->atomic_cap = IB_ATOMIC_GLOB;
-       props->max_pkeys = ipath_get_npkeys(dev->dd);
-       props->max_mcast_grp = ib_ipath_max_mcast_grps;
-       props->max_mcast_qp_attach = ib_ipath_max_mcast_qp_attached;
-       props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
-               props->max_mcast_grp;
-
-       return 0;
-}
-
-const u8 ipath_cvt_physportstate[32] = {
-       [INFINIPATH_IBCS_LT_STATE_DISABLED] = IB_PHYSPORTSTATE_DISABLED,
-       [INFINIPATH_IBCS_LT_STATE_LINKUP] = IB_PHYSPORTSTATE_LINKUP,
-       [INFINIPATH_IBCS_LT_STATE_POLLACTIVE] = IB_PHYSPORTSTATE_POLL,
-       [INFINIPATH_IBCS_LT_STATE_POLLQUIET] = IB_PHYSPORTSTATE_POLL,
-       [INFINIPATH_IBCS_LT_STATE_SLEEPDELAY] = IB_PHYSPORTSTATE_SLEEP,
-       [INFINIPATH_IBCS_LT_STATE_SLEEPQUIET] = IB_PHYSPORTSTATE_SLEEP,
-       [INFINIPATH_IBCS_LT_STATE_CFGDEBOUNCE] =
-               IB_PHYSPORTSTATE_CFG_TRAIN,
-       [INFINIPATH_IBCS_LT_STATE_CFGRCVFCFG] =
-               IB_PHYSPORTSTATE_CFG_TRAIN,
-       [INFINIPATH_IBCS_LT_STATE_CFGWAITRMT] =
-               IB_PHYSPORTSTATE_CFG_TRAIN,
-       [INFINIPATH_IBCS_LT_STATE_CFGIDLE] = IB_PHYSPORTSTATE_CFG_TRAIN,
-       [INFINIPATH_IBCS_LT_STATE_RECOVERRETRAIN] =
-               IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
-       [INFINIPATH_IBCS_LT_STATE_RECOVERWAITRMT] =
-               IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
-       [INFINIPATH_IBCS_LT_STATE_RECOVERIDLE] =
-               IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
-       [0x10] = IB_PHYSPORTSTATE_CFG_TRAIN,
-       [0x11] = IB_PHYSPORTSTATE_CFG_TRAIN,
-       [0x12] = IB_PHYSPORTSTATE_CFG_TRAIN,
-       [0x13] = IB_PHYSPORTSTATE_CFG_TRAIN,
-       [0x14] = IB_PHYSPORTSTATE_CFG_TRAIN,
-       [0x15] = IB_PHYSPORTSTATE_CFG_TRAIN,
-       [0x16] = IB_PHYSPORTSTATE_CFG_TRAIN,
-       [0x17] = IB_PHYSPORTSTATE_CFG_TRAIN
-};
-
-u32 ipath_get_cr_errpkey(struct ipath_devdata *dd)
-{
-       return ipath_read_creg32(dd, dd->ipath_cregs->cr_errpkey);
-}
-
-static int ipath_query_port(struct ib_device *ibdev,
-                           u8 port, struct ib_port_attr *props)
-{
-       struct ipath_ibdev *dev = to_idev(ibdev);
-       struct ipath_devdata *dd = dev->dd;
-       enum ib_mtu mtu;
-       u16 lid = dd->ipath_lid;
-       u64 ibcstat;
-
-       memset(props, 0, sizeof(*props));
-       props->lid = lid ? lid : be16_to_cpu(IB_LID_PERMISSIVE);
-       props->lmc = dd->ipath_lmc;
-       props->sm_lid = dev->sm_lid;
-       props->sm_sl = dev->sm_sl;
-       ibcstat = dd->ipath_lastibcstat;
-       /* map LinkState to IB portinfo values.  */
-       props->state = ipath_ib_linkstate(dd, ibcstat) + 1;
-
-       /* See phys_state_show() */
-       props->phys_state = /* MEA: assumes shift == 0 */
-               ipath_cvt_physportstate[dd->ipath_lastibcstat &
-               dd->ibcs_lts_mask];
-       props->port_cap_flags = dev->port_cap_flags;
-       props->gid_tbl_len = 1;
-       props->max_msg_sz = 0x80000000;
-       props->pkey_tbl_len = ipath_get_npkeys(dd);
-       props->bad_pkey_cntr = ipath_get_cr_errpkey(dd) -
-               dev->z_pkey_violations;
-       props->qkey_viol_cntr = dev->qkey_violations;
-       props->active_width = dd->ipath_link_width_active;
-       /* See rate_show() */
-       props->active_speed = dd->ipath_link_speed_active;
-       props->max_vl_num = 1;          /* VLCap = VL0 */
-       props->init_type_reply = 0;
-
-       props->max_mtu = ipath_mtu4096 ? IB_MTU_4096 : IB_MTU_2048;
-       switch (dd->ipath_ibmtu) {
-       case 4096:
-               mtu = IB_MTU_4096;
-               break;
-       case 2048:
-               mtu = IB_MTU_2048;
-               break;
-       case 1024:
-               mtu = IB_MTU_1024;
-               break;
-       case 512:
-               mtu = IB_MTU_512;
-               break;
-       case 256:
-               mtu = IB_MTU_256;
-               break;
-       default:
-               mtu = IB_MTU_2048;
-       }
-       props->active_mtu = mtu;
-       props->subnet_timeout = dev->subnet_timeout;
-
-       return 0;
-}
-
-static int ipath_modify_device(struct ib_device *device,
-                              int device_modify_mask,
-                              struct ib_device_modify *device_modify)
-{
-       int ret;
-
-       if (device_modify_mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID |
-                                  IB_DEVICE_MODIFY_NODE_DESC)) {
-               ret = -EOPNOTSUPP;
-               goto bail;
-       }
-
-       if (device_modify_mask & IB_DEVICE_MODIFY_NODE_DESC)
-               memcpy(device->node_desc, device_modify->node_desc, 64);
-
-       if (device_modify_mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID)
-               to_idev(device)->sys_image_guid =
-                       cpu_to_be64(device_modify->sys_image_guid);
-
-       ret = 0;
-
-bail:
-       return ret;
-}
-
-static int ipath_modify_port(struct ib_device *ibdev,
-                            u8 port, int port_modify_mask,
-                            struct ib_port_modify *props)
-{
-       struct ipath_ibdev *dev = to_idev(ibdev);
-
-       dev->port_cap_flags |= props->set_port_cap_mask;
-       dev->port_cap_flags &= ~props->clr_port_cap_mask;
-       if (port_modify_mask & IB_PORT_SHUTDOWN)
-               ipath_set_linkstate(dev->dd, IPATH_IB_LINKDOWN);
-       if (port_modify_mask & IB_PORT_RESET_QKEY_CNTR)
-               dev->qkey_violations = 0;
-       return 0;
-}
-
-static int ipath_query_gid(struct ib_device *ibdev, u8 port,
-                          int index, union ib_gid *gid)
-{
-       struct ipath_ibdev *dev = to_idev(ibdev);
-       int ret;
-
-       if (index >= 1) {
-               ret = -EINVAL;
-               goto bail;
-       }
-       gid->global.subnet_prefix = dev->gid_prefix;
-       gid->global.interface_id = dev->dd->ipath_guid;
-
-       ret = 0;
-
-bail:
-       return ret;
-}
-
-static struct ib_pd *ipath_alloc_pd(struct ib_device *ibdev,
-                                   struct ib_ucontext *context,
-                                   struct ib_udata *udata)
-{
-       struct ipath_ibdev *dev = to_idev(ibdev);
-       struct ipath_pd *pd;
-       struct ib_pd *ret;
-
-       /*
-        * This is actually totally arbitrary.  Some correctness tests
-        * assume there's a maximum number of PDs that can be allocated.
-        * We don't actually have such a limit, but the tests fail if we
-        * allow more allocations than we report for this value.
-        */
-
-       pd = kmalloc(sizeof *pd, GFP_KERNEL);
-       if (!pd) {
-               ret = ERR_PTR(-ENOMEM);
-               goto bail;
-       }
-
-       spin_lock(&dev->n_pds_lock);
-       if (dev->n_pds_allocated == ib_ipath_max_pds) {
-               spin_unlock(&dev->n_pds_lock);
-               kfree(pd);
-               ret = ERR_PTR(-ENOMEM);
-               goto bail;
-       }
-
-       dev->n_pds_allocated++;
-       spin_unlock(&dev->n_pds_lock);
-
-       /* ib_alloc_pd() will initialize pd->ibpd. */
-       pd->user = udata != NULL;
-
-       ret = &pd->ibpd;
-
-bail:
-       return ret;
-}
-
-static int ipath_dealloc_pd(struct ib_pd *ibpd)
-{
-       struct ipath_pd *pd = to_ipd(ibpd);
-       struct ipath_ibdev *dev = to_idev(ibpd->device);
-
-       spin_lock(&dev->n_pds_lock);
-       dev->n_pds_allocated--;
-       spin_unlock(&dev->n_pds_lock);
-
-       kfree(pd);
-
-       return 0;
-}
-
-/**
- * ipath_create_ah - create an address handle
- * @pd: the protection domain
- * @ah_attr: the attributes of the AH
- *
- * This may be called from interrupt context.
- */
-static struct ib_ah *ipath_create_ah(struct ib_pd *pd,
-                                    struct ib_ah_attr *ah_attr)
-{
-       struct ipath_ah *ah;
-       struct ib_ah *ret;
-       struct ipath_ibdev *dev = to_idev(pd->device);
-       unsigned long flags;
-
-       /* A multicast address requires a GRH (see ch. 8.4.1). */
-       if (ah_attr->dlid >= IPATH_MULTICAST_LID_BASE &&
-           ah_attr->dlid != IPATH_PERMISSIVE_LID &&
-           !(ah_attr->ah_flags & IB_AH_GRH)) {
-               ret = ERR_PTR(-EINVAL);
-               goto bail;
-       }
-
-       if (ah_attr->dlid == 0) {
-               ret = ERR_PTR(-EINVAL);
-               goto bail;
-       }
-
-       if (ah_attr->port_num < 1 ||
-           ah_attr->port_num > pd->device->phys_port_cnt) {
-               ret = ERR_PTR(-EINVAL);
-               goto bail;
-       }
-
-       ah = kmalloc(sizeof *ah, GFP_ATOMIC);
-       if (!ah) {
-               ret = ERR_PTR(-ENOMEM);
-               goto bail;
-       }
-
-       spin_lock_irqsave(&dev->n_ahs_lock, flags);
-       if (dev->n_ahs_allocated == ib_ipath_max_ahs) {
-               spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
-               kfree(ah);
-               ret = ERR_PTR(-ENOMEM);
-               goto bail;
-       }
-
-       dev->n_ahs_allocated++;
-       spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
-
-       /* ib_create_ah() will initialize ah->ibah. */
-       ah->attr = *ah_attr;
-       ah->attr.static_rate = ipath_ib_rate_to_mult(ah_attr->static_rate);
-
-       ret = &ah->ibah;
-
-bail:
-       return ret;
-}
-
-/**
- * ipath_destroy_ah - destroy an address handle
- * @ibah: the AH to destroy
- *
- * This may be called from interrupt context.
- */
-static int ipath_destroy_ah(struct ib_ah *ibah)
-{
-       struct ipath_ibdev *dev = to_idev(ibah->device);
-       struct ipath_ah *ah = to_iah(ibah);
-       unsigned long flags;
-
-       spin_lock_irqsave(&dev->n_ahs_lock, flags);
-       dev->n_ahs_allocated--;
-       spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
-
-       kfree(ah);
-
-       return 0;
-}
-
-static int ipath_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr)
-{
-       struct ipath_ah *ah = to_iah(ibah);
-
-       *ah_attr = ah->attr;
-       ah_attr->static_rate = ipath_mult_to_ib_rate(ah->attr.static_rate);
-
-       return 0;
-}
-
-/**
- * ipath_get_npkeys - return the size of the PKEY table for port 0
- * @dd: the infinipath device
- */
-unsigned ipath_get_npkeys(struct ipath_devdata *dd)
-{
-       return ARRAY_SIZE(dd->ipath_pd[0]->port_pkeys);
-}
-
-/**
- * ipath_get_pkey - return the indexed PKEY from the port PKEY table
- * @dd: the infinipath device
- * @index: the PKEY index
- */
-unsigned ipath_get_pkey(struct ipath_devdata *dd, unsigned index)
-{
-       unsigned ret;
-
-       /* always a kernel port, no locking needed */
-       if (index >= ARRAY_SIZE(dd->ipath_pd[0]->port_pkeys))
-               ret = 0;
-       else
-               ret = dd->ipath_pd[0]->port_pkeys[index];
-
-       return ret;
-}
-
-static int ipath_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
-                           u16 *pkey)
-{
-       struct ipath_ibdev *dev = to_idev(ibdev);
-       int ret;
-
-       if (index >= ipath_get_npkeys(dev->dd)) {
-               ret = -EINVAL;
-               goto bail;
-       }
-
-       *pkey = ipath_get_pkey(dev->dd, index);
-       ret = 0;
-
-bail:
-       return ret;
-}
-
-/**
- * ipath_alloc_ucontext - allocate a ucontext
- * @ibdev: the infiniband device
- * @udata: not used by the InfiniPath driver
- */
-
-static struct ib_ucontext *ipath_alloc_ucontext(struct ib_device *ibdev,
-                                               struct ib_udata *udata)
-{
-       struct ipath_ucontext *context;
-       struct ib_ucontext *ret;
-
-       context = kmalloc(sizeof *context, GFP_KERNEL);
-       if (!context) {
-               ret = ERR_PTR(-ENOMEM);
-               goto bail;
-       }
-
-       ret = &context->ibucontext;
-
-bail:
-       return ret;
-}
-
-static int ipath_dealloc_ucontext(struct ib_ucontext *context)
-{
-       kfree(to_iucontext(context));
-       return 0;
-}
-
-static int ipath_verbs_register_sysfs(struct ib_device *dev);
-
-static void __verbs_timer(unsigned long arg)
-{
-       struct ipath_devdata *dd = (struct ipath_devdata *) arg;
-
-       /* Handle verbs layer timeouts. */
-       ipath_ib_timer(dd->verbs_dev);
-
-       mod_timer(&dd->verbs_timer, jiffies + 1);
-}
-
-static int enable_timer(struct ipath_devdata *dd)
-{
-       /*
-        * Early chips had a design flaw where the chip's and the kernel's
-        * ideas of the tail register don't always agree, so we won't always
-        * get an interrupt on the next packet received.
-        * If the board supports per packet receive interrupts, use it.
-        * Otherwise, the timer function periodically checks for packets
-        * to cover this case.
-        * Either way, the timer is needed for verbs layer related
-        * processing.
-        */
-       if (dd->ipath_flags & IPATH_GPIO_INTR) {
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_debugportselect,
-                                0x2074076542310ULL);
-               /* Enable GPIO bit 2 interrupt */
-               dd->ipath_gpio_mask |= (u64) (1 << IPATH_GPIO_PORT0_BIT);
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask,
-                                dd->ipath_gpio_mask);
-       }
-
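-       /* Poll every jiffy; __verbs_timer() re-arms itself via mod_timer(). */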
-       init_timer(&dd->verbs_timer);
-       dd->verbs_timer.function = __verbs_timer;
-       dd->verbs_timer.data = (unsigned long)dd;
-       dd->verbs_timer.expires = jiffies + 1;
-       add_timer(&dd->verbs_timer);
-
-       return 0;
-}
-
-static int disable_timer(struct ipath_devdata *dd)
-{
-       if (dd->ipath_flags & IPATH_GPIO_INTR) {
-               /* Disable GPIO bit 2 interrupt */
-               dd->ipath_gpio_mask &= ~((u64) (1 << IPATH_GPIO_PORT0_BIT));
-               ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask,
-                                dd->ipath_gpio_mask);
-               /*
-                * We might want to undo changes to debugportselect,
-                * but how?
-                */
-       }
-
-       del_timer_sync(&dd->verbs_timer);
-
-       return 0;
-}
-
-static int ipath_port_immutable(struct ib_device *ibdev, u8 port_num,
-                               struct ib_port_immutable *immutable)
-{
-       struct ib_port_attr attr;
-       int err;
-
-       err = ipath_query_port(ibdev, port_num, &attr);
-       if (err)
-               return err;
-
-       immutable->pkey_tbl_len = attr.pkey_tbl_len;
-       immutable->gid_tbl_len = attr.gid_tbl_len;
-       immutable->core_cap_flags = RDMA_CORE_PORT_IBA_IB;
-       immutable->max_mad_size = IB_MGMT_MAD_SIZE;
-
-       return 0;
-}
-
-/**
- * ipath_register_ib_device - register our device with the infiniband core
- * @dd: the device data structure
- * Return zero on success or a negative errno on failure; dd->verbs_dev is
- * set to the allocated ipath_ibdev, or to NULL on error.
- */
-int ipath_register_ib_device(struct ipath_devdata *dd)
-{
-       struct ipath_verbs_counters cntrs;
-       struct ipath_ibdev *idev;
-       struct ib_device *dev;
-       struct ipath_verbs_txreq *tx;
-       unsigned i;
-       int ret;
-
-       idev = (struct ipath_ibdev *)ib_alloc_device(sizeof *idev);
-       if (idev == NULL) {
-               ret = -ENOMEM;
-               goto bail;
-       }
-
-       dev = &idev->ibdev;
-
-       if (dd->ipath_sdma_descq_cnt) {
-               tx = kmalloc(dd->ipath_sdma_descq_cnt * sizeof *tx,
-                            GFP_KERNEL);
-               if (tx == NULL) {
-                       ret = -ENOMEM;
-                       goto err_tx;
-               }
-       } else
-               tx = NULL;
-       idev->txreq_bufs = tx;
-
-       /* Only need to initialize non-zero fields. */
-       spin_lock_init(&idev->n_pds_lock);
-       spin_lock_init(&idev->n_ahs_lock);
-       spin_lock_init(&idev->n_cqs_lock);
-       spin_lock_init(&idev->n_qps_lock);
-       spin_lock_init(&idev->n_srqs_lock);
-       spin_lock_init(&idev->n_mcast_grps_lock);
-
-       spin_lock_init(&idev->qp_table.lock);
-       spin_lock_init(&idev->lk_table.lock);
-       idev->sm_lid = be16_to_cpu(IB_LID_PERMISSIVE);
-       /* Set the prefix to the default value (see ch. 4.1.1) */
-       idev->gid_prefix = cpu_to_be64(0xfe80000000000000ULL);
-
-       ret = ipath_init_qp_table(idev, ib_ipath_qp_table_size);
-       if (ret)
-               goto err_qp;
-
-       /*
-        * The top ib_ipath_lkey_table_size bits are used to index the
-        * table.  The lower 8 bits can be owned by the user (copied from
-        * the LKEY).  The remaining bits act as a generation number or tag.
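-        * E.g. with ib_ipath_lkey_table_size = 12, bits 31..20 index a
-        * 4096-entry table, bits 7..0 are user-owned, and bits 19..8 are the
-        * generation tag.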
-        */
-       idev->lk_table.max = 1 << ib_ipath_lkey_table_size;
-       idev->lk_table.table = kzalloc(idev->lk_table.max *
-                                      sizeof(*idev->lk_table.table),
-                                      GFP_KERNEL);
-       if (idev->lk_table.table == NULL) {
-               ret = -ENOMEM;
-               goto err_lk;
-       }
-       INIT_LIST_HEAD(&idev->pending_mmaps);
-       spin_lock_init(&idev->pending_lock);
-       idev->mmap_offset = PAGE_SIZE;
-       spin_lock_init(&idev->mmap_offset_lock);
-       INIT_LIST_HEAD(&idev->pending[0]);
-       INIT_LIST_HEAD(&idev->pending[1]);
-       INIT_LIST_HEAD(&idev->pending[2]);
-       INIT_LIST_HEAD(&idev->piowait);
-       INIT_LIST_HEAD(&idev->rnrwait);
-       INIT_LIST_HEAD(&idev->txreq_free);
-       idev->pending_index = 0;
-       idev->port_cap_flags =
-               IB_PORT_SYS_IMAGE_GUID_SUP | IB_PORT_CLIENT_REG_SUP;
-       if (dd->ipath_flags & IPATH_HAS_LINK_LATENCY)
-               idev->port_cap_flags |= IB_PORT_LINK_LATENCY_SUP;
-       idev->pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA;
-       idev->pma_counter_select[1] = IB_PMA_PORT_RCV_DATA;
-       idev->pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS;
-       idev->pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS;
-       idev->pma_counter_select[4] = IB_PMA_PORT_XMIT_WAIT;
-
-       /* Snapshot current HW counters to "clear" them. */
-       ipath_get_counters(dd, &cntrs);
-       idev->z_symbol_error_counter = cntrs.symbol_error_counter;
-       idev->z_link_error_recovery_counter =
-               cntrs.link_error_recovery_counter;
-       idev->z_link_downed_counter = cntrs.link_downed_counter;
-       idev->z_port_rcv_errors = cntrs.port_rcv_errors;
-       idev->z_port_rcv_remphys_errors =
-               cntrs.port_rcv_remphys_errors;
-       idev->z_port_xmit_discards = cntrs.port_xmit_discards;
-       idev->z_port_xmit_data = cntrs.port_xmit_data;
-       idev->z_port_rcv_data = cntrs.port_rcv_data;
-       idev->z_port_xmit_packets = cntrs.port_xmit_packets;
-       idev->z_port_rcv_packets = cntrs.port_rcv_packets;
-       idev->z_local_link_integrity_errors =
-               cntrs.local_link_integrity_errors;
-       idev->z_excessive_buffer_overrun_errors =
-               cntrs.excessive_buffer_overrun_errors;
-       idev->z_vl15_dropped = cntrs.vl15_dropped;
-
-       for (i = 0; i < dd->ipath_sdma_descq_cnt; i++, tx++)
-               list_add(&tx->txreq.list, &idev->txreq_free);
-
-       /*
-        * The system image GUID is supposed to be the same for all
-        * IB HCAs in a single system but since there can be other
-        * device types in the system, we can't be sure this is unique.
-        */
-       if (!sys_image_guid)
-               sys_image_guid = dd->ipath_guid;
-       idev->sys_image_guid = sys_image_guid;
-       idev->ib_unit = dd->ipath_unit;
-       idev->dd = dd;
-
-       strlcpy(dev->name, "ipath%d", IB_DEVICE_NAME_MAX);
-       dev->owner = THIS_MODULE;
-       dev->node_guid = dd->ipath_guid;
-       dev->uverbs_abi_ver = IPATH_UVERBS_ABI_VERSION;
-       dev->uverbs_cmd_mask =
-               (1ull << IB_USER_VERBS_CMD_GET_CONTEXT)         |
-               (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)        |
-               (1ull << IB_USER_VERBS_CMD_QUERY_PORT)          |
-               (1ull << IB_USER_VERBS_CMD_ALLOC_PD)            |
-               (1ull << IB_USER_VERBS_CMD_DEALLOC_PD)          |
-               (1ull << IB_USER_VERBS_CMD_CREATE_AH)           |
-               (1ull << IB_USER_VERBS_CMD_DESTROY_AH)          |
-               (1ull << IB_USER_VERBS_CMD_QUERY_AH)            |
-               (1ull << IB_USER_VERBS_CMD_REG_MR)              |
-               (1ull << IB_USER_VERBS_CMD_DEREG_MR)            |
-               (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
-               (1ull << IB_USER_VERBS_CMD_CREATE_CQ)           |
-               (1ull << IB_USER_VERBS_CMD_RESIZE_CQ)           |
-               (1ull << IB_USER_VERBS_CMD_DESTROY_CQ)          |
-               (1ull << IB_USER_VERBS_CMD_POLL_CQ)             |
-               (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ)       |
-               (1ull << IB_USER_VERBS_CMD_CREATE_QP)           |
-               (1ull << IB_USER_VERBS_CMD_QUERY_QP)            |
-               (1ull << IB_USER_VERBS_CMD_MODIFY_QP)           |
-               (1ull << IB_USER_VERBS_CMD_DESTROY_QP)          |
-               (1ull << IB_USER_VERBS_CMD_POST_SEND)           |
-               (1ull << IB_USER_VERBS_CMD_POST_RECV)           |
-               (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST)        |
-               (1ull << IB_USER_VERBS_CMD_DETACH_MCAST)        |
-               (1ull << IB_USER_VERBS_CMD_CREATE_SRQ)          |
-               (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ)          |
-               (1ull << IB_USER_VERBS_CMD_QUERY_SRQ)           |
-               (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ)         |
-               (1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV);
-       dev->node_type = RDMA_NODE_IB_CA;
-       dev->phys_port_cnt = 1;
-       dev->num_comp_vectors = 1;
-       dev->dma_device = &dd->pcidev->dev;
-       dev->query_device = ipath_query_device;
-       dev->modify_device = ipath_modify_device;
-       dev->query_port = ipath_query_port;
-       dev->modify_port = ipath_modify_port;
-       dev->query_pkey = ipath_query_pkey;
-       dev->query_gid = ipath_query_gid;
-       dev->alloc_ucontext = ipath_alloc_ucontext;
-       dev->dealloc_ucontext = ipath_dealloc_ucontext;
-       dev->alloc_pd = ipath_alloc_pd;
-       dev->dealloc_pd = ipath_dealloc_pd;
-       dev->create_ah = ipath_create_ah;
-       dev->destroy_ah = ipath_destroy_ah;
-       dev->query_ah = ipath_query_ah;
-       dev->create_srq = ipath_create_srq;
-       dev->modify_srq = ipath_modify_srq;
-       dev->query_srq = ipath_query_srq;
-       dev->destroy_srq = ipath_destroy_srq;
-       dev->create_qp = ipath_create_qp;
-       dev->modify_qp = ipath_modify_qp;
-       dev->query_qp = ipath_query_qp;
-       dev->destroy_qp = ipath_destroy_qp;
-       dev->post_send = ipath_post_send;
-       dev->post_recv = ipath_post_receive;
-       dev->post_srq_recv = ipath_post_srq_receive;
-       dev->create_cq = ipath_create_cq;
-       dev->destroy_cq = ipath_destroy_cq;
-       dev->resize_cq = ipath_resize_cq;
-       dev->poll_cq = ipath_poll_cq;
-       dev->req_notify_cq = ipath_req_notify_cq;
-       dev->get_dma_mr = ipath_get_dma_mr;
-       dev->reg_phys_mr = ipath_reg_phys_mr;
-       dev->reg_user_mr = ipath_reg_user_mr;
-       dev->dereg_mr = ipath_dereg_mr;
-       dev->alloc_fmr = ipath_alloc_fmr;
-       dev->map_phys_fmr = ipath_map_phys_fmr;
-       dev->unmap_fmr = ipath_unmap_fmr;
-       dev->dealloc_fmr = ipath_dealloc_fmr;
-       dev->attach_mcast = ipath_multicast_attach;
-       dev->detach_mcast = ipath_multicast_detach;
-       dev->process_mad = ipath_process_mad;
-       dev->mmap = ipath_mmap;
-       dev->dma_ops = &ipath_dma_mapping_ops;
-       dev->get_port_immutable = ipath_port_immutable;
-
-       snprintf(dev->node_desc, sizeof(dev->node_desc),
-                IPATH_IDSTR " %s", init_utsname()->nodename);
-
-       ret = ib_register_device(dev, NULL);
-       if (ret)
-               goto err_reg;
-
-       ret = ipath_verbs_register_sysfs(dev);
-       if (ret)
-               goto err_class;
-
-       enable_timer(dd);
-
-       goto bail;
-
-err_class:
-       ib_unregister_device(dev);
-err_reg:
-       kfree(idev->lk_table.table);
-err_lk:
-       kfree(idev->qp_table.table);
-err_qp:
-       kfree(idev->txreq_bufs);
-err_tx:
-       ib_dealloc_device(dev);
-       ipath_dev_err(dd, "cannot register verbs: %d!\n", -ret);
-       idev = NULL;
-
-bail:
-       dd->verbs_dev = idev;
-       return ret;
-}
-
-void ipath_unregister_ib_device(struct ipath_ibdev *dev)
-{
-       struct ib_device *ibdev = &dev->ibdev;
-       u32 qps_inuse;
-
-       ib_unregister_device(ibdev);
-
-       disable_timer(dev->dd);
-
-       if (!list_empty(&dev->pending[0]) ||
-           !list_empty(&dev->pending[1]) ||
-           !list_empty(&dev->pending[2]))
-               ipath_dev_err(dev->dd, "pending list not empty!\n");
-       if (!list_empty(&dev->piowait))
-               ipath_dev_err(dev->dd, "piowait list not empty!\n");
-       if (!list_empty(&dev->rnrwait))
-               ipath_dev_err(dev->dd, "rnrwait list not empty!\n");
-       if (!ipath_mcast_tree_empty())
-               ipath_dev_err(dev->dd, "multicast table memory leak!\n");
-       /*
-        * Note that ipath_unregister_ib_device() can be called before all
-        * the QPs are destroyed!
-        */
-       qps_inuse = ipath_free_all_qps(&dev->qp_table);
-       if (qps_inuse)
-               ipath_dev_err(dev->dd, "QP memory leak! %u still in use\n",
-                       qps_inuse);
-       kfree(dev->qp_table.table);
-       kfree(dev->lk_table.table);
-       kfree(dev->txreq_bufs);
-       ib_dealloc_device(ibdev);
-}
-
-static ssize_t show_rev(struct device *device, struct device_attribute *attr,
-                       char *buf)
-{
-       struct ipath_ibdev *dev =
-               container_of(device, struct ipath_ibdev, ibdev.dev);
-
-       return sprintf(buf, "%x\n", dev->dd->ipath_pcirev);
-}
-
-static ssize_t show_hca(struct device *device, struct device_attribute *attr,
-                       char *buf)
-{
-       struct ipath_ibdev *dev =
-               container_of(device, struct ipath_ibdev, ibdev.dev);
-       int ret;
-
-       ret = dev->dd->ipath_f_get_boardname(dev->dd, buf, 128);
-       if (ret < 0)
-               goto bail;
-       strcat(buf, "\n");
-       ret = strlen(buf);
-
-bail:
-       return ret;
-}
-
-static ssize_t show_stats(struct device *device, struct device_attribute *attr,
-                         char *buf)
-{
-       struct ipath_ibdev *dev =
-               container_of(device, struct ipath_ibdev, ibdev.dev);
-       int i;
-       int len;
-
-       len = sprintf(buf,
-                     "RC resends  %d\n"
-                     "RC no QACK  %d\n"
-                     "RC ACKs     %d\n"
-                     "RC SEQ NAKs %d\n"
-                     "RC RDMA seq %d\n"
-                     "RC RNR NAKs %d\n"
-                     "RC OTH NAKs %d\n"
-                     "RC timeouts %d\n"
-                     "RC RDMA dup %d\n"
-                     "piobuf wait %d\n"
-                     "unaligned   %d\n"
-                     "PKT drops   %d\n"
-                     "WQE errs    %d\n",
-                     dev->n_rc_resends, dev->n_rc_qacks, dev->n_rc_acks,
-                     dev->n_seq_naks, dev->n_rdma_seq, dev->n_rnr_naks,
-                     dev->n_other_naks, dev->n_timeouts,
-                     dev->n_rdma_dup_busy, dev->n_piowait, dev->n_unaligned,
-                     dev->n_pkt_drops, dev->n_wqe_errs);
-       for (i = 0; i < ARRAY_SIZE(dev->opstats); i++) {
-               const struct ipath_opcode_stats *si = &dev->opstats[i];
-
-               if (!si->n_packets && !si->n_bytes)
-                       continue;
-               len += sprintf(buf + len, "%02x %llu/%llu\n", i,
-                              (unsigned long long) si->n_packets,
-                              (unsigned long long) si->n_bytes);
-       }
-       return len;
-}
-
-static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
-static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
-static DEVICE_ATTR(board_id, S_IRUGO, show_hca, NULL);
-static DEVICE_ATTR(stats, S_IRUGO, show_stats, NULL);
-
-static struct device_attribute *ipath_class_attributes[] = {
-       &dev_attr_hw_rev,
-       &dev_attr_hca_type,
-       &dev_attr_board_id,
-       &dev_attr_stats
-};
-
-static int ipath_verbs_register_sysfs(struct ib_device *dev)
-{
-       int i;
-       int ret;
-
-       for (i = 0; i < ARRAY_SIZE(ipath_class_attributes); ++i) {
-               ret = device_create_file(&dev->dev,
-                                      ipath_class_attributes[i]);
-               if (ret)
-                       goto bail;
-       }
-       return 0;
-bail:
-       for (i = 0; i < ARRAY_SIZE(ipath_class_attributes); ++i)
-               device_remove_file(&dev->dev, ipath_class_attributes[i]);
-       return ret;
-}
diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.h b/drivers/infiniband/hw/ipath/ipath_verbs.h
deleted file mode 100644 (file)
index ec167e5..0000000
+++ /dev/null
@@ -1,939 +0,0 @@
-/*
- * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
- * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef IPATH_VERBS_H
-#define IPATH_VERBS_H
-
-#include <linux/types.h>
-#include <linux/spinlock.h>
-#include <linux/kernel.h>
-#include <linux/interrupt.h>
-#include <linux/kref.h>
-#include <rdma/ib_pack.h>
-#include <rdma/ib_user_verbs.h>
-
-#include "ipath_kernel.h"
-
-#define IPATH_MAX_RDMA_ATOMIC  4
-
-#define QPN_MAX                 (1 << 24)
-#define QPNMAP_ENTRIES          (QPN_MAX / PAGE_SIZE / BITS_PER_BYTE)
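-/*
- * With QPN_MAX = 2^24 and, e.g., 4 KiB pages, QPNMAP_ENTRIES works out to
- * 16777216 / 4096 / 8 = 512 bitmap pages, each page covering 32768 QPNs.
- */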
-
-/*
- * Increment this value if any changes that break userspace ABI
- * compatibility are made.
- */
-#define IPATH_UVERBS_ABI_VERSION       2
-
-/*
- * Define an ib_cq_notify value that is not valid so we know when CQ
- * notifications are armed.
- */
-#define IB_CQ_NONE     (IB_CQ_NEXT_COMP + 1)
-
-/* AETH NAK opcode values */
-#define IB_RNR_NAK                     0x20
-#define IB_NAK_PSN_ERROR               0x60
-#define IB_NAK_INVALID_REQUEST         0x61
-#define IB_NAK_REMOTE_ACCESS_ERROR     0x62
-#define IB_NAK_REMOTE_OPERATIONAL_ERROR 0x63
-#define IB_NAK_INVALID_RD_REQUEST      0x64
-
-/* Flags for checking QP state (see ib_ipath_state_ops[]) */
-#define IPATH_POST_SEND_OK             0x01
-#define IPATH_POST_RECV_OK             0x02
-#define IPATH_PROCESS_RECV_OK          0x04
-#define IPATH_PROCESS_SEND_OK          0x08
-#define IPATH_PROCESS_NEXT_SEND_OK     0x10
-#define IPATH_FLUSH_SEND               0x20
-#define IPATH_FLUSH_RECV               0x40
-#define IPATH_PROCESS_OR_FLUSH_SEND \
-       (IPATH_PROCESS_SEND_OK | IPATH_FLUSH_SEND)
-
-/* IB Performance Manager status values */
-#define IB_PMA_SAMPLE_STATUS_DONE      0x00
-#define IB_PMA_SAMPLE_STATUS_STARTED   0x01
-#define IB_PMA_SAMPLE_STATUS_RUNNING   0x02
-
-/* Mandatory IB performance counter select values. */
-#define IB_PMA_PORT_XMIT_DATA  cpu_to_be16(0x0001)
-#define IB_PMA_PORT_RCV_DATA   cpu_to_be16(0x0002)
-#define IB_PMA_PORT_XMIT_PKTS  cpu_to_be16(0x0003)
-#define IB_PMA_PORT_RCV_PKTS   cpu_to_be16(0x0004)
-#define IB_PMA_PORT_XMIT_WAIT  cpu_to_be16(0x0005)
-
-struct ib_reth {
-       __be64 vaddr;
-       __be32 rkey;
-       __be32 length;
-} __attribute__ ((packed));
-
-struct ib_atomic_eth {
-       __be32 vaddr[2];        /* unaligned so access as 2 32-bit words */
-       __be32 rkey;
-       __be64 swap_data;
-       __be64 compare_data;
-} __attribute__ ((packed));
-
-struct ipath_other_headers {
-       __be32 bth[3];
-       union {
-               struct {
-                       __be32 deth[2];
-                       __be32 imm_data;
-               } ud;
-               struct {
-                       struct ib_reth reth;
-                       __be32 imm_data;
-               } rc;
-               struct {
-                       __be32 aeth;
-                       __be32 atomic_ack_eth[2];
-               } at;
-               __be32 imm_data;
-               __be32 aeth;
-               struct ib_atomic_eth atomic_eth;
-       } u;
-} __attribute__ ((packed));
-
-/*
- * Note that UD packets with a GRH header are 8+40+12+8 = 68 bytes
- * long (72 w/ imm_data).  Only the first 56 bytes of the IB header
- * will be in the eager header buffer.  The remaining 12 or 16 bytes
- * are in the data buffer.
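- * (The 68 bytes are LRH 8 + GRH 40 + BTH 12 + DETH 8; 68 - 56 = 12, or 16
- * when the 4-byte immediate data is included.)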
- */
-struct ipath_ib_header {
-       __be16 lrh[4];
-       union {
-               struct {
-                       struct ib_grh grh;
-                       struct ipath_other_headers oth;
-               } l;
-               struct ipath_other_headers oth;
-       } u;
-} __attribute__ ((packed));
-
-struct ipath_pio_header {
-       __le32 pbc[2];
-       struct ipath_ib_header hdr;
-} __attribute__ ((packed));
-
-/*
- * There is one struct ipath_mcast for each multicast GID.
- * All attached QPs are then stored as a list of
- * struct ipath_mcast_qp.
- */
-struct ipath_mcast_qp {
-       struct list_head list;
-       struct ipath_qp *qp;
-};
-
-struct ipath_mcast {
-       struct rb_node rb_node;
-       union ib_gid mgid;
-       struct list_head qp_list;
-       wait_queue_head_t wait;
-       atomic_t refcount;
-       int n_attached;
-};
-
-/* Protection domain */
-struct ipath_pd {
-       struct ib_pd ibpd;
-       int user;               /* non-zero if created from user space */
-};
-
-/* Address Handle */
-struct ipath_ah {
-       struct ib_ah ibah;
-       struct ib_ah_attr attr;
-};
-
-/*
- * This structure is used by ipath_mmap() to validate an offset
- * when an mmap() request is made.  The vm_area_struct then uses
- * this as its vm_private_data.
- */
-struct ipath_mmap_info {
-       struct list_head pending_mmaps;
-       struct ib_ucontext *context;
-       void *obj;
-       __u64 offset;
-       struct kref ref;
-       unsigned size;
-};
-
-/*
- * This structure is used to contain the head pointer, tail pointer,
- * and completion queue entries as a single memory allocation so
- * it can be mmap'ed into user space.
- */
-struct ipath_cq_wc {
-       u32 head;               /* index of next entry to fill */
-       u32 tail;               /* index of next ib_poll_cq() entry */
-       union {
-               /* these are actually size ibcq.cqe + 1 */
-               struct ib_uverbs_wc uqueue[0];
-               struct ib_wc kqueue[0];
-       };
-};
-
-/*
- * The completion queue structure.
- */
-struct ipath_cq {
-       struct ib_cq ibcq;
-       struct tasklet_struct comptask;
-       spinlock_t lock;
-       u8 notify;
-       u8 triggered;
-       struct ipath_cq_wc *queue;
-       struct ipath_mmap_info *ip;
-};
-
-/*
- * A segment is a linear region of low physical memory.
- * XXX Maybe we should use phys addr here and kmap()/kunmap().
- * Used by the verbs layer.
- */
-struct ipath_seg {
-       void *vaddr;
-       size_t length;
-};
-
-/* The number of ipath_segs that fit in a page. */
-#define IPATH_SEGSZ     (PAGE_SIZE / sizeof (struct ipath_seg))
-
-struct ipath_segarray {
-       struct ipath_seg segs[IPATH_SEGSZ];
-};
-
-struct ipath_mregion {
-       struct ib_pd *pd;       /* shares refcnt of ibmr.pd */
-       u64 user_base;          /* User's address for this region */
-       u64 iova;               /* IB start address of this region */
-       size_t length;
-       u32 lkey;
-       u32 offset;             /* offset (bytes) to start of region */
-       int access_flags;
-       u32 max_segs;           /* number of ipath_segs in all the arrays */
-       u32 mapsz;              /* size of the map array */
-       struct ipath_segarray *map[0];  /* the segments */
-};
-
-/*
- * These keep track of the copy progress within a memory region.
- * Used by the verbs layer.
- */
-struct ipath_sge {
-       struct ipath_mregion *mr;
-       void *vaddr;            /* kernel virtual address of segment */
-       u32 sge_length;         /* length of the SGE */
-       u32 length;             /* remaining length of the segment */
-       u16 m;                  /* current index: mr->map[m] */
-       u16 n;                  /* current index: mr->map[m]->segs[n] */
-};
-
-/* Memory region */
-struct ipath_mr {
-       struct ib_mr ibmr;
-       struct ib_umem *umem;
-       struct ipath_mregion mr;        /* must be last */
-};
-
-/*
- * Send work request queue entry.
- * The size of the sg_list is determined when the QP is created and stored
- * in qp->s_max_sge.
- */
-struct ipath_swqe {
-       struct ib_send_wr wr;   /* don't use wr.sg_list */
-       u32 psn;                /* first packet sequence number */
-       u32 lpsn;               /* last packet sequence number */
-       u32 ssn;                /* send sequence number */
-       u32 length;             /* total length of data in sg_list */
-       struct ipath_sge sg_list[0];
-};
-
-/*
- * Receive work request queue entry.
- * The size of the sg_list is determined when the QP (or SRQ) is created
- * and stored in qp->r_rq.max_sge (or srq->rq.max_sge).
- */
-struct ipath_rwqe {
-       u64 wr_id;
-       u8 num_sge;
-       struct ib_sge sg_list[0];
-};
-
-/*
- * This structure is used to contain the head pointer, tail pointer,
- * and receive work queue entries as a single memory allocation so
- * it can be mmap'ed into user space.
- * Note that the wq array elements are variable size so you can't
- * just index into the array to get the N'th element;
- * use get_rwqe_ptr() instead.
- */
-struct ipath_rwq {
-       u32 head;               /* new work requests posted to the head */
-       u32 tail;               /* receives pull requests from here. */
-       struct ipath_rwqe wq[0];
-};
-
-struct ipath_rq {
-       struct ipath_rwq *wq;
-       spinlock_t lock;
-       u32 size;               /* size of RWQE array */
-       u8 max_sge;
-};
-
-struct ipath_srq {
-       struct ib_srq ibsrq;
-       struct ipath_rq rq;
-       struct ipath_mmap_info *ip;
-       /* send signal when number of RWQEs < limit */
-       u32 limit;
-};
-
-struct ipath_sge_state {
-       struct ipath_sge *sg_list;      /* next SGE to be used if any */
-       struct ipath_sge sge;   /* progress state for the current SGE */
-       u8 num_sge;
-       u8 static_rate;
-};
-
-/*
- * This structure holds the information that the send tasklet needs
- * to send a RDMA read response or atomic operation.
- */
-struct ipath_ack_entry {
-       u8 opcode;
-       u8 sent;
-       u32 psn;
-       union {
-               struct ipath_sge_state rdma_sge;
-               u64 atomic_data;
-       };
-};
-
-/*
- * Variables prefixed with s_ are for the requester (sender).
- * Variables prefixed with r_ are for the responder (receiver).
- * Variables prefixed with ack_ are for responder replies.
- *
- * Common variables are protected by both r_rq.lock and s_lock in that order
- * which only happens in modify_qp() or changing the QP 'state'.
- */
-struct ipath_qp {
-       struct ib_qp ibqp;
-       struct ipath_qp *next;          /* link list for QPN hash table */
-       struct ipath_qp *timer_next;    /* link list for ipath_ib_timer() */
-       struct ipath_qp *pio_next;      /* link for ipath_ib_piobufavail() */
-       struct list_head piowait;       /* link for wait PIO buf */
-       struct list_head timerwait;     /* link for waiting for timeouts */
-       struct ib_ah_attr remote_ah_attr;
-       struct ipath_ib_header s_hdr;   /* next packet header to send */
-       atomic_t refcount;
-       wait_queue_head_t wait;
-       wait_queue_head_t wait_dma;
-       struct tasklet_struct s_task;
-       struct ipath_mmap_info *ip;
-       struct ipath_sge_state *s_cur_sge;
-       struct ipath_verbs_txreq *s_tx;
-       struct ipath_sge_state s_sge;   /* current send request data */
-       struct ipath_ack_entry s_ack_queue[IPATH_MAX_RDMA_ATOMIC + 1];
-       struct ipath_sge_state s_ack_rdma_sge;
-       struct ipath_sge_state s_rdma_read_sge;
-       struct ipath_sge_state r_sge;   /* current receive data */
-       spinlock_t s_lock;
-       atomic_t s_dma_busy;
-       u16 s_pkt_delay;
-       u16 s_hdrwords;         /* size of s_hdr in 32 bit words */
-       u32 s_cur_size;         /* size of send packet in bytes */
-       u32 s_len;              /* total length of s_sge */
-       u32 s_rdma_read_len;    /* total length of s_rdma_read_sge */
-       u32 s_next_psn;         /* PSN for next request */
-       u32 s_last_psn;         /* last response PSN processed */
-       u32 s_psn;              /* current packet sequence number */
-       u32 s_ack_rdma_psn;     /* PSN for sending RDMA read responses */
-       u32 s_ack_psn;          /* PSN for acking sends and RDMA writes */
-       u32 s_rnr_timeout;      /* number of milliseconds for RNR timeout */
-       u32 r_ack_psn;          /* PSN for next ACK or atomic ACK */
-       u64 r_wr_id;            /* ID for current receive WQE */
-       unsigned long r_aflags;
-       u32 r_len;              /* total length of r_sge */
-       u32 r_rcv_len;          /* receive data len processed */
-       u32 r_psn;              /* expected rcv packet sequence number */
-       u32 r_msn;              /* message sequence number */
-       u8 state;               /* QP state */
-       u8 s_state;             /* opcode of last packet sent */
-       u8 s_ack_state;         /* opcode of packet to ACK */
-       u8 s_nak_state;         /* non-zero if NAK is pending */
-       u8 r_state;             /* opcode of last packet received */
-       u8 r_nak_state;         /* non-zero if NAK is pending */
-       u8 r_min_rnr_timer;     /* retry timeout value for RNR NAKs */
-       u8 r_flags;
-       u8 r_max_rd_atomic;     /* max number of RDMA read/atomic to receive */
-       u8 r_head_ack_queue;    /* index into s_ack_queue[] */
-       u8 qp_access_flags;
-       u8 s_max_sge;           /* size of s_wq->sg_list */
-       u8 s_retry_cnt;         /* number of times to retry */
-       u8 s_rnr_retry_cnt;
-       u8 s_retry;             /* requester retry counter */
-       u8 s_rnr_retry;         /* requester RNR retry counter */
-       u8 s_pkey_index;        /* PKEY index to use */
-       u8 s_max_rd_atomic;     /* max number of RDMA read/atomic to send */
-       u8 s_num_rd_atomic;     /* number of RDMA read/atomic pending */
-       u8 s_tail_ack_queue;    /* index into s_ack_queue[] */
-       u8 s_flags;
-       u8 s_dmult;
-       u8 s_draining;
-       u8 timeout;             /* Timeout for this QP */
-       enum ib_mtu path_mtu;
-       u32 remote_qpn;
-       u32 qkey;               /* QKEY for this QP (for UD or RD) */
-       u32 s_size;             /* send work queue size */
-       u32 s_head;             /* new entries added here */
-       u32 s_tail;             /* next entry to process */
-       u32 s_cur;              /* current work queue entry */
-       u32 s_last;             /* last un-ACK'ed entry */
-       u32 s_ssn;              /* SSN of tail entry */
-       u32 s_lsn;              /* limit sequence number (credit) */
-       struct ipath_swqe *s_wq;        /* send work queue */
-       struct ipath_swqe *s_wqe;
-       struct ipath_sge *r_ud_sg_list;
-       struct ipath_rq r_rq;           /* receive work queue */
-       struct ipath_sge r_sg_list[0];  /* verified SGEs */
-};
-
-/*
- * Atomic bit definitions for r_aflags.
- */
-#define IPATH_R_WRID_VALID     0
-
-/*
- * Bit definitions for r_flags.
- */
-#define IPATH_R_REUSE_SGE      0x01
-#define IPATH_R_RDMAR_SEQ      0x02
-
-/*
- * Bit definitions for s_flags.
- *
- * IPATH_S_FENCE_PENDING - waiting for all prior RDMA read or atomic SWQEs
- *                        before processing the next SWQE
- * IPATH_S_RDMAR_PENDING - waiting for any RDMA read or atomic SWQEs
- *                        before processing the next SWQE
- * IPATH_S_WAITING - waiting for RNR timeout or send buffer available.
- * IPATH_S_WAIT_SSN_CREDIT - waiting for RC credits to process next SWQE
- * IPATH_S_WAIT_DMA - waiting for send DMA queue to drain before generating
- *                   next send completion entry not via send DMA.
- */
-#define IPATH_S_SIGNAL_REQ_WR  0x01
-#define IPATH_S_FENCE_PENDING  0x02
-#define IPATH_S_RDMAR_PENDING  0x04
-#define IPATH_S_ACK_PENDING    0x08
-#define IPATH_S_BUSY           0x10
-#define IPATH_S_WAITING                0x20
-#define IPATH_S_WAIT_SSN_CREDIT        0x40
-#define IPATH_S_WAIT_DMA       0x80
-
-#define IPATH_S_ANY_WAIT (IPATH_S_FENCE_PENDING | IPATH_S_RDMAR_PENDING | \
-       IPATH_S_WAITING | IPATH_S_WAIT_SSN_CREDIT | IPATH_S_WAIT_DMA)
-
-#define IPATH_PSN_CREDIT       512
-
-/*
- * Since struct ipath_swqe is not a fixed size, we can't simply index into
- * struct ipath_qp.s_wq.  This function does the array index computation.
- */
-static inline struct ipath_swqe *get_swqe_ptr(struct ipath_qp *qp,
-                                             unsigned n)
-{
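-       /*
-        * Each entry is sizeof(struct ipath_swqe) plus s_max_sge SGEs long,
-        * so entry n starts n such strides past s_wq.
-        */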
-       return (struct ipath_swqe *)((char *)qp->s_wq +
-                                    (sizeof(struct ipath_swqe) +
-                                     qp->s_max_sge *
-                                     sizeof(struct ipath_sge)) * n);
-}
-
-/*
- * Since struct ipath_rwqe is not a fixed size, we can't simply index into
- * struct ipath_rwq.wq.  This function does the array index computation.
- */
-static inline struct ipath_rwqe *get_rwqe_ptr(struct ipath_rq *rq,
-                                             unsigned n)
-{
-       return (struct ipath_rwqe *)
-               ((char *) rq->wq->wq +
-                (sizeof(struct ipath_rwqe) +
-                 rq->max_sge * sizeof(struct ib_sge)) * n);
-}
-
-/*
- * QPN-map pages start out as NULL; they are allocated on first use and
- * never freed.  This way, large bitmaps are not allocated unless large
- * numbers of QPs are used.
- */
-struct qpn_map {
-       atomic_t n_free;
-       void *page;
-};
-
-struct ipath_qp_table {
-       spinlock_t lock;
-       u32 last;               /* last QP number allocated */
-       u32 max;                /* size of the hash table */
-       u32 nmaps;              /* size of the map table */
-       struct ipath_qp **table;
-       /* bit map of free numbers */
-       struct qpn_map map[QPNMAP_ENTRIES];
-};
-
-struct ipath_lkey_table {
-       spinlock_t lock;
-       u32 next;               /* next unused index (speeds search) */
-       u32 gen;                /* generation count */
-       u32 max;                /* size of the table */
-       struct ipath_mregion **table;
-};
-
-struct ipath_opcode_stats {
-       u64 n_packets;          /* number of packets */
-       u64 n_bytes;            /* total number of bytes */
-};
-
-struct ipath_ibdev {
-       struct ib_device ibdev;
-       struct ipath_devdata *dd;
-       struct list_head pending_mmaps;
-       spinlock_t mmap_offset_lock;
-       u32 mmap_offset;
-       int ib_unit;            /* This is the device number */
-       u16 sm_lid;             /* in host order */
-       u8 sm_sl;
-       u8 mkeyprot;
-       /* non-zero when timer is set */
-       unsigned long mkey_lease_timeout;
-
-       /* The following fields are really per port. */
-       struct ipath_qp_table qp_table;
-       struct ipath_lkey_table lk_table;
-       struct list_head pending[3];    /* FIFO of QPs waiting for ACKs */
-       struct list_head piowait;       /* list for wait PIO buf */
-       struct list_head txreq_free;
-       void *txreq_bufs;
-       /* list of QPs waiting for RNR timer */
-       struct list_head rnrwait;
-       spinlock_t pending_lock;
-       __be64 sys_image_guid;  /* in network order */
-       __be64 gid_prefix;      /* in network order */
-       __be64 mkey;
-
-       u32 n_pds_allocated;    /* number of PDs allocated for device */
-       spinlock_t n_pds_lock;
-       u32 n_ahs_allocated;    /* number of AHs allocated for device */
-       spinlock_t n_ahs_lock;
-       u32 n_cqs_allocated;    /* number of CQs allocated for device */
-       spinlock_t n_cqs_lock;
-       u32 n_qps_allocated;    /* number of QPs allocated for device */
-       spinlock_t n_qps_lock;
-       u32 n_srqs_allocated;   /* number of SRQs allocated for device */
-       spinlock_t n_srqs_lock;
-       u32 n_mcast_grps_allocated; /* number of mcast groups allocated */
-       spinlock_t n_mcast_grps_lock;
-
-       u64 ipath_sword;        /* total dwords sent (sample result) */
-       u64 ipath_rword;        /* total dwords received (sample result) */
-       u64 ipath_spkts;        /* total packets sent (sample result) */
-       u64 ipath_rpkts;        /* total packets received (sample result) */
-       /* # of ticks no data sent (sample result) */
-       u64 ipath_xmit_wait;
-       u64 rcv_errors;         /* # of packets with SW detected rcv errs */
-       u64 n_unicast_xmit;     /* total unicast packets sent */
-       u64 n_unicast_rcv;      /* total unicast packets received */
-       u64 n_multicast_xmit;   /* total multicast packets sent */
-       u64 n_multicast_rcv;    /* total multicast packets received */
-       u64 z_symbol_error_counter;             /* starting count for PMA */
-       u64 z_link_error_recovery_counter;      /* starting count for PMA */
-       u64 z_link_downed_counter;              /* starting count for PMA */
-       u64 z_port_rcv_errors;                  /* starting count for PMA */
-       u64 z_port_rcv_remphys_errors;          /* starting count for PMA */
-       u64 z_port_xmit_discards;               /* starting count for PMA */
-       u64 z_port_xmit_data;                   /* starting count for PMA */
-       u64 z_port_rcv_data;                    /* starting count for PMA */
-       u64 z_port_xmit_packets;                /* starting count for PMA */
-       u64 z_port_rcv_packets;                 /* starting count for PMA */
-       u32 z_pkey_violations;                  /* starting count for PMA */
-       u32 z_local_link_integrity_errors;      /* starting count for PMA */
-       u32 z_excessive_buffer_overrun_errors;  /* starting count for PMA */
-       u32 z_vl15_dropped;                     /* starting count for PMA */
-       u32 n_rc_resends;
-       u32 n_rc_acks;
-       u32 n_rc_qacks;
-       u32 n_seq_naks;
-       u32 n_rdma_seq;
-       u32 n_rnr_naks;
-       u32 n_other_naks;
-       u32 n_timeouts;
-       u32 n_pkt_drops;
-       u32 n_vl15_dropped;
-       u32 n_wqe_errs;
-       u32 n_rdma_dup_busy;
-       u32 n_piowait;
-       u32 n_unaligned;
-       u32 port_cap_flags;
-       u32 pma_sample_start;
-       u32 pma_sample_interval;
-       __be16 pma_counter_select[5];
-       u16 pma_tag;
-       u16 qkey_violations;
-       u16 mkey_violations;
-       u16 mkey_lease_period;
-       u16 pending_index;      /* which pending queue is active */
-       u8 pma_sample_status;
-       u8 subnet_timeout;
-       u8 vl_high_limit;
-       struct ipath_opcode_stats opstats[128];
-};
-
-struct ipath_verbs_counters {
-       u64 symbol_error_counter;
-       u64 link_error_recovery_counter;
-       u64 link_downed_counter;
-       u64 port_rcv_errors;
-       u64 port_rcv_remphys_errors;
-       u64 port_xmit_discards;
-       u64 port_xmit_data;
-       u64 port_rcv_data;
-       u64 port_xmit_packets;
-       u64 port_rcv_packets;
-       u32 local_link_integrity_errors;
-       u32 excessive_buffer_overrun_errors;
-       u32 vl15_dropped;
-};
-
-struct ipath_verbs_txreq {
-       struct ipath_qp         *qp;
-       struct ipath_swqe       *wqe;
-       u32                      map_len;
-       u32                      len;
-       struct ipath_sge_state  *ss;
-       struct ipath_pio_header  hdr;
-       struct ipath_sdma_txreq  txreq;
-};
-
-static inline struct ipath_mr *to_imr(struct ib_mr *ibmr)
-{
-       return container_of(ibmr, struct ipath_mr, ibmr);
-}
-
-static inline struct ipath_pd *to_ipd(struct ib_pd *ibpd)
-{
-       return container_of(ibpd, struct ipath_pd, ibpd);
-}
-
-static inline struct ipath_ah *to_iah(struct ib_ah *ibah)
-{
-       return container_of(ibah, struct ipath_ah, ibah);
-}
-
-static inline struct ipath_cq *to_icq(struct ib_cq *ibcq)
-{
-       return container_of(ibcq, struct ipath_cq, ibcq);
-}
-
-static inline struct ipath_srq *to_isrq(struct ib_srq *ibsrq)
-{
-       return container_of(ibsrq, struct ipath_srq, ibsrq);
-}
-
-static inline struct ipath_qp *to_iqp(struct ib_qp *ibqp)
-{
-       return container_of(ibqp, struct ipath_qp, ibqp);
-}
-
-static inline struct ipath_ibdev *to_idev(struct ib_device *ibdev)
-{
-       return container_of(ibdev, struct ipath_ibdev, ibdev);
-}
-
-/*
- * This must be called with s_lock held.
- */
-static inline void ipath_schedule_send(struct ipath_qp *qp)
-{
-       if (qp->s_flags & IPATH_S_ANY_WAIT)
-               qp->s_flags &= ~IPATH_S_ANY_WAIT;
-       if (!(qp->s_flags & IPATH_S_BUSY))
-               tasklet_hi_schedule(&qp->s_task);
-}
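
ipath_schedule_send() must run with the QP's send lock held, as the comment above notes. A minimal, hypothetical caller sketch (assuming the qp->s_lock spinlock this driver uses to guard send-side state) would be:

/* Hypothetical caller sketch only -- not part of the driver sources. */
static void example_kick_sender(struct ipath_qp *qp)
{
	unsigned long flags;

	spin_lock_irqsave(&qp->s_lock, flags);	/* s_lock must be held */
	ipath_schedule_send(qp);
	spin_unlock_irqrestore(&qp->s_lock, flags);
}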
-
-int ipath_process_mad(struct ib_device *ibdev,
-                     int mad_flags,
-                     u8 port_num,
-                     const struct ib_wc *in_wc,
-                     const struct ib_grh *in_grh,
-                     const struct ib_mad_hdr *in, size_t in_mad_size,
-                     struct ib_mad_hdr *out, size_t *out_mad_size,
-                     u16 *out_mad_pkey_index);
-
-/*
- * Compare the lower 24 bits of the two values.
- * Returns an integer less than, equal to, or greater than zero.
- */
-static inline int ipath_cmp24(u32 a, u32 b)
-{
-       return (((int) a) - ((int) b)) << 8;
-}
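
The shift makes the sign of the result reflect the 24-bit circular distance between the two values (PSNs wrap at 2^24): subtracting as 32-bit ints and shifting left by 8 pushes the low 24 bits of the difference into the high bits. A standalone illustration with hypothetical values:

/* Illustrative userspace sketch of the ipath_cmp24() trick above. */
#include <stdio.h>
#include <stdint.h>

static int cmp24(uint32_t a, uint32_t b)
{
	return (((int) a) - ((int) b)) << 8;
}

int main(void)
{
	/* 0x000001 follows 0xffffff once the 24-bit counter wraps. */
	printf("%d\n", cmp24(0x000001, 0xffffff) > 0);	/* 1 */
	printf("%d\n", cmp24(0x000005, 0x000009) < 0);	/* 1 */
	return 0;
}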
-
-struct ipath_mcast *ipath_mcast_find(union ib_gid *mgid);
-
-int ipath_snapshot_counters(struct ipath_devdata *dd, u64 *swords,
-                           u64 *rwords, u64 *spkts, u64 *rpkts,
-                           u64 *xmit_wait);
-
-int ipath_get_counters(struct ipath_devdata *dd,
-                      struct ipath_verbs_counters *cntrs);
-
-int ipath_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
-
-int ipath_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
-
-int ipath_mcast_tree_empty(void);
-
-__be32 ipath_compute_aeth(struct ipath_qp *qp);
-
-struct ipath_qp *ipath_lookup_qpn(struct ipath_qp_table *qpt, u32 qpn);
-
-struct ib_qp *ipath_create_qp(struct ib_pd *ibpd,
-                             struct ib_qp_init_attr *init_attr,
-                             struct ib_udata *udata);
-
-int ipath_destroy_qp(struct ib_qp *ibqp);
-
-int ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err);
-
-int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
-                   int attr_mask, struct ib_udata *udata);
-
-int ipath_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
-                  int attr_mask, struct ib_qp_init_attr *init_attr);
-
-unsigned ipath_free_all_qps(struct ipath_qp_table *qpt);
-
-int ipath_init_qp_table(struct ipath_ibdev *idev, int size);
-
-void ipath_get_credit(struct ipath_qp *qp, u32 aeth);
-
-unsigned ipath_ib_rate_to_mult(enum ib_rate rate);
-
-int ipath_verbs_send(struct ipath_qp *qp, struct ipath_ib_header *hdr,
-                    u32 hdrwords, struct ipath_sge_state *ss, u32 len);
-
-void ipath_copy_sge(struct ipath_sge_state *ss, void *data, u32 length);
-
-void ipath_skip_sge(struct ipath_sge_state *ss, u32 length);
-
-void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
-                 int has_grh, void *data, u32 tlen, struct ipath_qp *qp);
-
-void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
-                 int has_grh, void *data, u32 tlen, struct ipath_qp *qp);
-
-void ipath_restart_rc(struct ipath_qp *qp, u32 psn);
-
-void ipath_rc_error(struct ipath_qp *qp, enum ib_wc_status err);
-
-int ipath_post_ud_send(struct ipath_qp *qp, struct ib_send_wr *wr);
-
-void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
-                 int has_grh, void *data, u32 tlen, struct ipath_qp *qp);
-
-int ipath_alloc_lkey(struct ipath_lkey_table *rkt,
-                    struct ipath_mregion *mr);
-
-void ipath_free_lkey(struct ipath_lkey_table *rkt, u32 lkey);
-
-int ipath_lkey_ok(struct ipath_qp *qp, struct ipath_sge *isge,
-                 struct ib_sge *sge, int acc);
-
-int ipath_rkey_ok(struct ipath_qp *qp, struct ipath_sge_state *ss,
-                 u32 len, u64 vaddr, u32 rkey, int acc);
-
-int ipath_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
-                          struct ib_recv_wr **bad_wr);
-
-struct ib_srq *ipath_create_srq(struct ib_pd *ibpd,
-                               struct ib_srq_init_attr *srq_init_attr,
-                               struct ib_udata *udata);
-
-int ipath_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
-                    enum ib_srq_attr_mask attr_mask,
-                    struct ib_udata *udata);
-
-int ipath_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr);
-
-int ipath_destroy_srq(struct ib_srq *ibsrq);
-
-void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int sig);
-
-int ipath_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry);
-
-struct ib_cq *ipath_create_cq(struct ib_device *ibdev,
-                             const struct ib_cq_init_attr *attr,
-                             struct ib_ucontext *context,
-                             struct ib_udata *udata);
-
-int ipath_destroy_cq(struct ib_cq *ibcq);
-
-int ipath_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags);
-
-int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata);
-
-struct ib_mr *ipath_get_dma_mr(struct ib_pd *pd, int acc);
-
-struct ib_mr *ipath_reg_phys_mr(struct ib_pd *pd,
-                               struct ib_phys_buf *buffer_list,
-                               int num_phys_buf, int acc, u64 *iova_start);
-
-struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
-                               u64 virt_addr, int mr_access_flags,
-                               struct ib_udata *udata);
-
-int ipath_dereg_mr(struct ib_mr *ibmr);
-
-struct ib_fmr *ipath_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
-                              struct ib_fmr_attr *fmr_attr);
-
-int ipath_map_phys_fmr(struct ib_fmr *ibfmr, u64 * page_list,
-                      int list_len, u64 iova);
-
-int ipath_unmap_fmr(struct list_head *fmr_list);
-
-int ipath_dealloc_fmr(struct ib_fmr *ibfmr);
-
-void ipath_release_mmap_info(struct kref *ref);
-
-struct ipath_mmap_info *ipath_create_mmap_info(struct ipath_ibdev *dev,
-                                              u32 size,
-                                              struct ib_ucontext *context,
-                                              void *obj);
-
-void ipath_update_mmap_info(struct ipath_ibdev *dev,
-                           struct ipath_mmap_info *ip,
-                           u32 size, void *obj);
-
-int ipath_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
-
-void ipath_insert_rnr_queue(struct ipath_qp *qp);
-
-int ipath_init_sge(struct ipath_qp *qp, struct ipath_rwqe *wqe,
-                  u32 *lengthp, struct ipath_sge_state *ss);
-
-int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only);
-
-u32 ipath_make_grh(struct ipath_ibdev *dev, struct ib_grh *hdr,
-                  struct ib_global_route *grh, u32 hwords, u32 nwords);
-
-void ipath_make_ruc_header(struct ipath_ibdev *dev, struct ipath_qp *qp,
-                          struct ipath_other_headers *ohdr,
-                          u32 bth0, u32 bth2);
-
-void ipath_do_send(unsigned long data);
-
-void ipath_send_complete(struct ipath_qp *qp, struct ipath_swqe *wqe,
-                        enum ib_wc_status status);
-
-int ipath_make_rc_req(struct ipath_qp *qp);
-
-int ipath_make_uc_req(struct ipath_qp *qp);
-
-int ipath_make_ud_req(struct ipath_qp *qp);
-
-int ipath_register_ib_device(struct ipath_devdata *);
-
-void ipath_unregister_ib_device(struct ipath_ibdev *);
-
-void ipath_ib_rcv(struct ipath_ibdev *, void *, void *, u32);
-
-int ipath_ib_piobufavail(struct ipath_ibdev *);
-
-unsigned ipath_get_npkeys(struct ipath_devdata *);
-
-u32 ipath_get_cr_errpkey(struct ipath_devdata *);
-
-unsigned ipath_get_pkey(struct ipath_devdata *, unsigned);
-
-extern const enum ib_wc_opcode ib_ipath_wc_opcode[];
-
-/*
- * Below converts HCA-specific LinkTrainingState to IB PhysPortState
- * values.
- */
-extern const u8 ipath_cvt_physportstate[];
-#define IB_PHYSPORTSTATE_SLEEP 1
-#define IB_PHYSPORTSTATE_POLL 2
-#define IB_PHYSPORTSTATE_DISABLED 3
-#define IB_PHYSPORTSTATE_CFG_TRAIN 4
-#define IB_PHYSPORTSTATE_LINKUP 5
-#define IB_PHYSPORTSTATE_LINK_ERR_RECOVER 6
-
-extern const int ib_ipath_state_ops[];
-
-extern unsigned int ib_ipath_lkey_table_size;
-
-extern unsigned int ib_ipath_max_cqes;
-
-extern unsigned int ib_ipath_max_cqs;
-
-extern unsigned int ib_ipath_max_qp_wrs;
-
-extern unsigned int ib_ipath_max_qps;
-
-extern unsigned int ib_ipath_max_sges;
-
-extern unsigned int ib_ipath_max_mcast_grps;
-
-extern unsigned int ib_ipath_max_mcast_qp_attached;
-
-extern unsigned int ib_ipath_max_srqs;
-
-extern unsigned int ib_ipath_max_srq_sges;
-
-extern unsigned int ib_ipath_max_srq_wrs;
-
-extern const u32 ib_ipath_rnr_table[];
-
-extern struct ib_dma_mapping_ops ipath_dma_mapping_ops;
-
-#endif                         /* IPATH_VERBS_H */
diff --git a/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c b/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c
deleted file mode 100644 (file)
index 6216ea9..0000000
+++ /dev/null
@@ -1,364 +0,0 @@
-/*
- * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
- * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include <linux/rculist.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-
-#include "ipath_verbs.h"
-
-/*
- * Global table of GID to attached QPs.
- * The table is global to all ipath devices since a send from one QP/device
- * needs to be locally routed to any locally attached QPs on the same
- * or different device.
- */
-static struct rb_root mcast_tree;
-static DEFINE_SPINLOCK(mcast_lock);
-
-/**
- * ipath_mcast_qp_alloc - alloc a struct to link a QP to mcast GID struct
- * @qp: the QP to link
- */
-static struct ipath_mcast_qp *ipath_mcast_qp_alloc(struct ipath_qp *qp)
-{
-       struct ipath_mcast_qp *mqp;
-
-       mqp = kmalloc(sizeof *mqp, GFP_KERNEL);
-       if (!mqp)
-               goto bail;
-
-       mqp->qp = qp;
-       atomic_inc(&qp->refcount);
-
-bail:
-       return mqp;
-}
-
-static void ipath_mcast_qp_free(struct ipath_mcast_qp *mqp)
-{
-       struct ipath_qp *qp = mqp->qp;
-
-       /* Notify ipath_destroy_qp() if it is waiting. */
-       if (atomic_dec_and_test(&qp->refcount))
-               wake_up(&qp->wait);
-
-       kfree(mqp);
-}
-
-/**
- * ipath_mcast_alloc - allocate the multicast GID structure
- * @mgid: the multicast GID
- *
- * A list of QPs will be attached to this structure.
- */
-static struct ipath_mcast *ipath_mcast_alloc(union ib_gid *mgid)
-{
-       struct ipath_mcast *mcast;
-
-       mcast = kmalloc(sizeof *mcast, GFP_KERNEL);
-       if (!mcast)
-               goto bail;
-
-       mcast->mgid = *mgid;
-       INIT_LIST_HEAD(&mcast->qp_list);
-       init_waitqueue_head(&mcast->wait);
-       atomic_set(&mcast->refcount, 0);
-       mcast->n_attached = 0;
-
-bail:
-       return mcast;
-}
-
-static void ipath_mcast_free(struct ipath_mcast *mcast)
-{
-       struct ipath_mcast_qp *p, *tmp;
-
-       list_for_each_entry_safe(p, tmp, &mcast->qp_list, list)
-               ipath_mcast_qp_free(p);
-
-       kfree(mcast);
-}
-
-/**
- * ipath_mcast_find - search the global table for the given multicast GID
- * @mgid: the multicast GID to search for
- *
- * Returns NULL if not found.
- *
- * The caller is responsible for decrementing the reference count if found.
- */
-struct ipath_mcast *ipath_mcast_find(union ib_gid *mgid)
-{
-       struct rb_node *n;
-       unsigned long flags;
-       struct ipath_mcast *mcast;
-
-       spin_lock_irqsave(&mcast_lock, flags);
-       n = mcast_tree.rb_node;
-       while (n) {
-               int ret;
-
-               mcast = rb_entry(n, struct ipath_mcast, rb_node);
-
-               ret = memcmp(mgid->raw, mcast->mgid.raw,
-                            sizeof(union ib_gid));
-               if (ret < 0)
-                       n = n->rb_left;
-               else if (ret > 0)
-                       n = n->rb_right;
-               else {
-                       atomic_inc(&mcast->refcount);
-                       spin_unlock_irqrestore(&mcast_lock, flags);
-                       goto bail;
-               }
-       }
-       spin_unlock_irqrestore(&mcast_lock, flags);
-
-       mcast = NULL;
-
-bail:
-       return mcast;
-}
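
As the comment above notes, a successful lookup returns with the group's reference count raised, and the caller must drop it when done; the wake-up pairs with the wait_event() used when detaching the last QP. A hypothetical caller sketch:

/* Hypothetical usage sketch -- shows the refcount contract only. */
static void example_deliver_to_group(union ib_gid *mgid)
{
	struct ipath_mcast *mcast;

	mcast = ipath_mcast_find(mgid);
	if (!mcast)
		return;			/* no QPs attached to this GID */

	/* ... walk mcast->qp_list under RCU and deliver the packet ... */

	/* Drop the reference taken by ipath_mcast_find(). */
	if (atomic_dec_return(&mcast->refcount) <= 1)
		wake_up(&mcast->wait);
}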
-
-/**
- * ipath_mcast_add - insert mcast GID into table and attach QP struct
- * @mcast: the mcast GID table
- * @mqp: the QP to attach
- *
- * Return zero if both were added.  Return EEXIST if the GID was already in
- * the table but the QP was added.  Return ESRCH if the QP was already
- * attached and neither structure was added.
- */
-static int ipath_mcast_add(struct ipath_ibdev *dev,
-                          struct ipath_mcast *mcast,
-                          struct ipath_mcast_qp *mqp)
-{
-       struct rb_node **n = &mcast_tree.rb_node;
-       struct rb_node *pn = NULL;
-       int ret;
-
-       spin_lock_irq(&mcast_lock);
-
-       while (*n) {
-               struct ipath_mcast *tmcast;
-               struct ipath_mcast_qp *p;
-
-               pn = *n;
-               tmcast = rb_entry(pn, struct ipath_mcast, rb_node);
-
-               ret = memcmp(mcast->mgid.raw, tmcast->mgid.raw,
-                            sizeof(union ib_gid));
-               if (ret < 0) {
-                       n = &pn->rb_left;
-                       continue;
-               }
-               if (ret > 0) {
-                       n = &pn->rb_right;
-                       continue;
-               }
-
-               /* Search the QP list to see if this is already there. */
-               list_for_each_entry_rcu(p, &tmcast->qp_list, list) {
-                       if (p->qp == mqp->qp) {
-                               ret = ESRCH;
-                               goto bail;
-                       }
-               }
-               if (tmcast->n_attached == ib_ipath_max_mcast_qp_attached) {
-                       ret = ENOMEM;
-                       goto bail;
-               }
-
-               tmcast->n_attached++;
-
-               list_add_tail_rcu(&mqp->list, &tmcast->qp_list);
-               ret = EEXIST;
-               goto bail;
-       }
-
-       spin_lock(&dev->n_mcast_grps_lock);
-       if (dev->n_mcast_grps_allocated == ib_ipath_max_mcast_grps) {
-               spin_unlock(&dev->n_mcast_grps_lock);
-               ret = ENOMEM;
-               goto bail;
-       }
-
-       dev->n_mcast_grps_allocated++;
-       spin_unlock(&dev->n_mcast_grps_lock);
-
-       mcast->n_attached++;
-
-       list_add_tail_rcu(&mqp->list, &mcast->qp_list);
-
-       atomic_inc(&mcast->refcount);
-       rb_link_node(&mcast->rb_node, pn, n);
-       rb_insert_color(&mcast->rb_node, &mcast_tree);
-
-       ret = 0;
-
-bail:
-       spin_unlock_irq(&mcast_lock);
-
-       return ret;
-}
-
-int ipath_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
-{
-       struct ipath_qp *qp = to_iqp(ibqp);
-       struct ipath_ibdev *dev = to_idev(ibqp->device);
-       struct ipath_mcast *mcast;
-       struct ipath_mcast_qp *mqp;
-       int ret;
-
-       /*
-        * Allocate data structures since it's better to do this outside of
-        * spin locks and it will most likely be needed.
-        */
-       mcast = ipath_mcast_alloc(gid);
-       if (mcast == NULL) {
-               ret = -ENOMEM;
-               goto bail;
-       }
-       mqp = ipath_mcast_qp_alloc(qp);
-       if (mqp == NULL) {
-               ipath_mcast_free(mcast);
-               ret = -ENOMEM;
-               goto bail;
-       }
-       switch (ipath_mcast_add(dev, mcast, mqp)) {
-       case ESRCH:
-               /* Neither was used: can't attach the same QP twice. */
-               ipath_mcast_qp_free(mqp);
-               ipath_mcast_free(mcast);
-               ret = -EINVAL;
-               goto bail;
-       case EEXIST:            /* The mcast wasn't used */
-               ipath_mcast_free(mcast);
-               break;
-       case ENOMEM:
-               /* Exceeded the maximum number of mcast groups. */
-               ipath_mcast_qp_free(mqp);
-               ipath_mcast_free(mcast);
-               ret = -ENOMEM;
-               goto bail;
-       default:
-               break;
-       }
-
-       ret = 0;
-
-bail:
-       return ret;
-}
-
-int ipath_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
-{
-       struct ipath_qp *qp = to_iqp(ibqp);
-       struct ipath_ibdev *dev = to_idev(ibqp->device);
-       struct ipath_mcast *mcast = NULL;
-       struct ipath_mcast_qp *p, *tmp;
-       struct rb_node *n;
-       int last = 0;
-       int ret;
-
-       spin_lock_irq(&mcast_lock);
-
-       /* Find the GID in the mcast table. */
-       n = mcast_tree.rb_node;
-       while (1) {
-               if (n == NULL) {
-                       spin_unlock_irq(&mcast_lock);
-                       ret = -EINVAL;
-                       goto bail;
-               }
-
-               mcast = rb_entry(n, struct ipath_mcast, rb_node);
-               ret = memcmp(gid->raw, mcast->mgid.raw,
-                            sizeof(union ib_gid));
-               if (ret < 0)
-                       n = n->rb_left;
-               else if (ret > 0)
-                       n = n->rb_right;
-               else
-                       break;
-       }
-
-       /* Search the QP list. */
-       list_for_each_entry_safe(p, tmp, &mcast->qp_list, list) {
-               if (p->qp != qp)
-                       continue;
-               /*
-                * We found it, so remove it, but don't poison the forward
-                * link until we are sure there are no list walkers.
-                */
-               list_del_rcu(&p->list);
-               mcast->n_attached--;
-
-               /* If this was the last attached QP, remove the GID too. */
-               if (list_empty(&mcast->qp_list)) {
-                       rb_erase(&mcast->rb_node, &mcast_tree);
-                       last = 1;
-               }
-               break;
-       }
-
-       spin_unlock_irq(&mcast_lock);
-
-       if (p) {
-               /*
-                * Wait for any list walkers to finish before freeing the
-                * list element.
-                */
-               wait_event(mcast->wait, atomic_read(&mcast->refcount) <= 1);
-               ipath_mcast_qp_free(p);
-       }
-       if (last) {
-               atomic_dec(&mcast->refcount);
-               wait_event(mcast->wait, !atomic_read(&mcast->refcount));
-               ipath_mcast_free(mcast);
-               spin_lock_irq(&dev->n_mcast_grps_lock);
-               dev->n_mcast_grps_allocated--;
-               spin_unlock_irq(&dev->n_mcast_grps_lock);
-       }
-
-       ret = 0;
-
-bail:
-       return ret;
-}
-
-int ipath_mcast_tree_empty(void)
-{
-       return mcast_tree.rb_node == NULL;
-}
diff --git a/drivers/infiniband/hw/ipath/ipath_wc_ppc64.c b/drivers/infiniband/hw/ipath/ipath_wc_ppc64.c
deleted file mode 100644 (file)
index 1a7e20a..0000000
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/*
- * This file is conditionally built on PowerPC only.  Otherwise weak symbol
- * versions of the functions exported from here are used.
- */
-
-#include "ipath_kernel.h"
-
-/**
- * ipath_enable_wc - enable write combining for MMIO writes to the device
- * @dd: infinipath device
- *
- * Nothing to do on PowerPC, so just return without error.
- */
-int ipath_enable_wc(struct ipath_devdata *dd)
-{
-       return 0;
-}
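
The weak-symbol fallback mentioned in the comment above could look roughly like this (a sketch, not taken from the driver sources): a generic definition is marked weak, and an architecture-specific file such as this one supplies the strong definition the linker prefers.

/* Sketch of the weak/strong symbol arrangement; hypothetical placement. */

/* Generic fallback, built everywhere: */
int __attribute__((weak)) ipath_enable_wc(struct ipath_devdata *dd)
{
	return 0;		/* no write combining by default */
}

/* An arch-specific file (like the x86_64 one below) defines the same
 * symbol without the weak attribute; the linker then uses that version.
 */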
diff --git a/drivers/infiniband/hw/ipath/ipath_wc_x86_64.c b/drivers/infiniband/hw/ipath/ipath_wc_x86_64.c
deleted file mode 100644 (file)
index 7b6e4c8..0000000
+++ /dev/null
@@ -1,144 +0,0 @@
-/*
- * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
- * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/*
- * This file is conditionally built on x86_64 only.  Otherwise weak symbol
- * versions of the functions exported from here are used.
- */
-
-#include <linux/pci.h>
-#include <asm/processor.h>
-
-#include "ipath_kernel.h"
-
-/**
- * ipath_enable_wc - enable write combining for MMIO writes to the device
- * @dd: infinipath device
- *
- * This routine is x86_64-specific; it twiddles the CPU's MTRRs to enable
- * write combining.
- */
-int ipath_enable_wc(struct ipath_devdata *dd)
-{
-       int ret = 0;
-       u64 pioaddr, piolen;
-       unsigned bits;
-       const unsigned long addr = pci_resource_start(dd->pcidev, 0);
-       const size_t len = pci_resource_len(dd->pcidev, 0);
-
-       /*
-        * Set the PIO buffers to be WCCOMB, so we get HT bursts to the
-        * chip.  Linux (possibly the hardware) requires it to be on a power
-        * of 2 address matching the length (which has to be a power of 2).
-        * For rev1, that means the base address, for rev2, it will be just
-        * the PIO buffers themselves.
-        * For chips with two sets of buffers, the calculations are
-        * somewhat more complicated; we need to sum, and the piobufbase
-        * register has both offsets, 2K in low 32 bits, 4K in high 32 bits.
-        * The buffers are still packed, so a single range covers both.
-        */
-       if (dd->ipath_piobcnt2k && dd->ipath_piobcnt4k) { /* 2 sizes */
-               unsigned long pio2kbase, pio4kbase;
-               pio2kbase = dd->ipath_piobufbase & 0xffffffffUL;
-               pio4kbase = (dd->ipath_piobufbase >> 32) & 0xffffffffUL;
-               if (pio2kbase < pio4kbase) { /* all, for now */
-                       pioaddr = addr + pio2kbase;
-                       piolen = pio4kbase - pio2kbase +
-                               dd->ipath_piobcnt4k * dd->ipath_4kalign;
-               } else {
-                       pioaddr = addr + pio4kbase;
-                       piolen = pio2kbase - pio4kbase +
-                               dd->ipath_piobcnt2k * dd->ipath_palign;
-               }
-       } else {  /* single buffer size (2K, currently) */
-               pioaddr = addr + dd->ipath_piobufbase;
-               piolen = dd->ipath_piobcnt2k * dd->ipath_palign +
-                       dd->ipath_piobcnt4k * dd->ipath_4kalign;
-       }
-
-       for (bits = 0; !(piolen & (1ULL << bits)); bits++)
-               /* do nothing */ ;
-
-       if (piolen != (1ULL << bits)) {
-               piolen >>= bits;
-               while (piolen >>= 1)
-                       bits++;
-               piolen = 1ULL << (bits + 1);
-       }
-       if (pioaddr & (piolen - 1)) {
-               u64 atmp;
-               ipath_dbg("pioaddr %llx not on right boundary for size "
-                         "%llx, fixing\n",
-                         (unsigned long long) pioaddr,
-                         (unsigned long long) piolen);
-               atmp = pioaddr & ~(piolen - 1);
-               if (atmp < addr || (atmp + piolen) > (addr + len)) {
-                       ipath_dev_err(dd, "No way to align address/size "
-                                     "(%llx/%llx), no WC mtrr\n",
-                                     (unsigned long long) atmp,
-                                     (unsigned long long) piolen << 1);
-                       ret = -ENODEV;
-               } else {
-                       ipath_dbg("changing WC base from %llx to %llx, "
-                                 "len from %llx to %llx\n",
-                                 (unsigned long long) pioaddr,
-                                 (unsigned long long) atmp,
-                                 (unsigned long long) piolen,
-                                 (unsigned long long) piolen << 1);
-                       pioaddr = atmp;
-                       piolen <<= 1;
-               }
-       }
-
-       if (!ret) {
-               dd->wc_cookie = arch_phys_wc_add(pioaddr, piolen);
-               if (dd->wc_cookie < 0) {
-                       ipath_dev_err(dd, "Setting mtrr failed on PIO buffers\n");
-                       ret = -ENODEV;
-               } else if (dd->wc_cookie == 0)
-                       ipath_cdbg(VERBOSE, "Set mtrr for chip to WC not needed\n");
-               else
-                       ipath_cdbg(VERBOSE, "Set mtrr for chip to WC\n");
-       }
-
-       return ret;
-}
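
The fix-up above exists because an MTRR region must have a power-of-two length and a base aligned to that length. A small standalone sketch of the same arithmetic, with hypothetical values (the driver additionally doubles the length when it has to pull the base down, so the aligned region still covers the buffers):

/* Illustrative only; mirrors the rounding in ipath_enable_wc(). */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t pioaddr = 0x9000, piolen = 0x3000;	/* hypothetical */
	unsigned bits;

	/* Find the lowest set bit of the length. */
	for (bits = 0; !(piolen & (1ULL << bits)); bits++)
		;

	if (piolen != (1ULL << bits)) {		/* not a power of two */
		piolen >>= bits;
		while (piolen >>= 1)
			bits++;
		piolen = 1ULL << (bits + 1);	/* 0x3000 -> 0x4000 */
	}
	if (pioaddr & (piolen - 1))
		pioaddr &= ~(piolen - 1);	/* 0x9000 -> 0x8000 */

	printf("addr=%#llx len=%#llx\n",
	       (unsigned long long) pioaddr, (unsigned long long) piolen);
	return 0;
}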
-
-/**
- * ipath_disable_wc - disable write combining for MMIO writes to the device
- * @dd: infinipath device
- */
-void ipath_disable_wc(struct ipath_devdata *dd)
-{
-       arch_phys_wc_del(dd->wc_cookie);
-}
index f50a546224adf09b91ec7f4b1d75b2bbfde5c880..1688a17de4fe1fc6d887d74d703b51dffaff2e08 100644 (file)
@@ -89,7 +89,7 @@ static struct ib_ah *create_iboe_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr
        if (vlan_tag < 0x1000)
                vlan_tag |= (ah_attr->sl & 7) << 13;
        ah->av.eth.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24));
-       ah->av.eth.gid_index = ah_attr->grh.sgid_index;
+       ah->av.eth.gid_index = mlx4_ib_gid_index_to_real_index(ibdev, ah_attr->port_num, ah_attr->grh.sgid_index);
        ah->av.eth.vlan = cpu_to_be16(vlan_tag);
        if (ah_attr->static_rate) {
                ah->av.eth.stat_rate = ah_attr->static_rate + MLX4_STAT_RATE_OFFSET;
@@ -148,9 +148,13 @@ int mlx4_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr)
        enum rdma_link_layer ll;
 
        memset(ah_attr, 0, sizeof *ah_attr);
-       ah_attr->sl = be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28;
        ah_attr->port_num = be32_to_cpu(ah->av.ib.port_pd) >> 24;
        ll = rdma_port_get_link_layer(ibah->device, ah_attr->port_num);
+       if (ll == IB_LINK_LAYER_ETHERNET)
+               ah_attr->sl = be32_to_cpu(ah->av.eth.sl_tclass_flowlabel) >> 29;
+       else
+               ah_attr->sl = be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28;
+
        ah_attr->dlid = ll == IB_LINK_LAYER_INFINIBAND ? be16_to_cpu(ah->av.ib.dlid) : 0;
        if (ah->av.ib.stat_rate)
                ah_attr->static_rate = ah->av.ib.stat_rate - MLX4_STAT_RATE_OFFSET;
index 180a8f7ec82de80fdf69f4226d9bb0d8fff54052..5fd49f9435f9dd8d2496d91394271b6270af2cbd 100644 (file)
@@ -638,7 +638,7 @@ static void mlx4_ib_poll_sw_comp(struct mlx4_ib_cq *cq, int num_entries,
         * simulated FLUSH_ERR completions
         */
        list_for_each_entry(qp, &cq->send_qp_list, cq_send_list) {
-               mlx4_ib_qp_sw_comp(qp, num_entries, wc, npolled, 1);
+               mlx4_ib_qp_sw_comp(qp, num_entries, wc + *npolled, npolled, 1);
                if (*npolled >= num_entries)
                        goto out;
        }
index 68b3dfa922bf3e01ce3c00a60674ca508fd50912..1cd75ff0225193c475c9a9debe2ccedcf9bdb8b8 100644 (file)
@@ -580,7 +580,7 @@ int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port,
 
        list.addr = tun_qp->tx_ring[tun_tx_ix].buf.map;
        list.length = sizeof (struct mlx4_rcv_tunnel_mad);
-       list.lkey = tun_ctx->mr->lkey;
+       list.lkey = tun_ctx->pd->local_dma_lkey;
 
        wr.wr.ud.ah = ah;
        wr.wr.ud.port_num = port;
@@ -1133,7 +1133,7 @@ static int mlx4_ib_post_pv_qp_buf(struct mlx4_ib_demux_pv_ctx *ctx,
 
        sg_list.addr = tun_qp->ring[index].map;
        sg_list.length = size;
-       sg_list.lkey = ctx->mr->lkey;
+       sg_list.lkey = ctx->pd->local_dma_lkey;
 
        recv_wr.next = NULL;
        recv_wr.sg_list = &sg_list;
@@ -1244,7 +1244,7 @@ int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port,
 
        list.addr = sqp->tx_ring[wire_tx_ix].buf.map;
        list.length = sizeof (struct mlx4_mad_snd_buf);
-       list.lkey = sqp_ctx->mr->lkey;
+       list.lkey = sqp_ctx->pd->local_dma_lkey;
 
        wr.wr.ud.ah = ah;
        wr.wr.ud.port_num = port;
@@ -1827,19 +1827,12 @@ static int create_pv_resources(struct ib_device *ibdev, int slave, int port,
                goto err_cq;
        }
 
-       ctx->mr = ib_get_dma_mr(ctx->pd, IB_ACCESS_LOCAL_WRITE);
-       if (IS_ERR(ctx->mr)) {
-               ret = PTR_ERR(ctx->mr);
-               pr_err("Couldn't get tunnel DMA MR (%d)\n", ret);
-               goto err_pd;
-       }
-
        if (ctx->has_smi) {
                ret = create_pv_sqp(ctx, IB_QPT_SMI, create_tun);
                if (ret) {
                        pr_err("Couldn't create %s QP0 (%d)\n",
                               create_tun ? "tunnel for" : "",  ret);
-                       goto err_mr;
+                       goto err_pd;
                }
        }
 
@@ -1876,10 +1869,6 @@ err_qp0:
                ib_destroy_qp(ctx->qp[0].qp);
        ctx->qp[0].qp = NULL;
 
-err_mr:
-       ib_dereg_mr(ctx->mr);
-       ctx->mr = NULL;
-
 err_pd:
        ib_dealloc_pd(ctx->pd);
        ctx->pd = NULL;
@@ -1916,8 +1905,6 @@ static void destroy_pv_resources(struct mlx4_ib_dev *dev, int slave, int port,
                ib_destroy_qp(ctx->qp[1].qp);
                ctx->qp[1].qp = NULL;
                mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_GSI, 1);
-               ib_dereg_mr(ctx->mr);
-               ctx->mr = NULL;
                ib_dealloc_pd(ctx->pd);
                ctx->pd = NULL;
                ib_destroy_cq(ctx->cq);
@@ -2050,8 +2037,6 @@ static void mlx4_ib_free_sqp_ctx(struct mlx4_ib_demux_pv_ctx *sqp_ctx)
                ib_destroy_qp(sqp_ctx->qp[1].qp);
                sqp_ctx->qp[1].qp = NULL;
                mlx4_ib_free_pv_qp_bufs(sqp_ctx, IB_QPT_GSI, 0);
-               ib_dereg_mr(sqp_ctx->mr);
-               sqp_ctx->mr = NULL;
                ib_dealloc_pd(sqp_ctx->pd);
                sqp_ctx->pd = NULL;
                ib_destroy_cq(sqp_ctx->cq);
index 8be6db81646049a741abebe1c4ab982fad7e6011..efecdf0216d85179c05f6e949d0c7597cf7f4f70 100644 (file)
@@ -45,6 +45,9 @@
 #include <rdma/ib_smi.h>
 #include <rdma/ib_user_verbs.h>
 #include <rdma/ib_addr.h>
+#include <rdma/ib_cache.h>
+
+#include <net/bonding.h>
 
 #include <linux/mlx4/driver.h>
 #include <linux/mlx4/cmd.h>
@@ -74,13 +77,6 @@ static const char mlx4_ib_version[] =
        DRV_NAME ": Mellanox ConnectX InfiniBand driver v"
        DRV_VERSION " (" DRV_RELDATE ")\n";
 
-struct update_gid_work {
-       struct work_struct      work;
-       union ib_gid            gids[128];
-       struct mlx4_ib_dev     *dev;
-       int                     port;
-};
-
 static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init);
 
 static struct workqueue_struct *wq;
@@ -93,8 +89,6 @@ static void init_query_mad(struct ib_smp *mad)
        mad->method        = IB_MGMT_METHOD_GET;
 }
 
-static union ib_gid zgid;
-
 static int check_flow_steering_support(struct mlx4_dev *dev)
 {
        int eth_num_ports = 0;
@@ -131,6 +125,237 @@ static int num_ib_ports(struct mlx4_dev *dev)
        return ib_ports;
 }
 
+static struct net_device *mlx4_ib_get_netdev(struct ib_device *device, u8 port_num)
+{
+       struct mlx4_ib_dev *ibdev = to_mdev(device);
+       struct net_device *dev;
+
+       rcu_read_lock();
+       dev = mlx4_get_protocol_dev(ibdev->dev, MLX4_PROT_ETH, port_num);
+
+       if (dev) {
+               if (mlx4_is_bonded(ibdev->dev)) {
+                       struct net_device *upper = NULL;
+
+                       upper = netdev_master_upper_dev_get_rcu(dev);
+                       if (upper) {
+                               struct net_device *active;
+
+                               active = bond_option_active_slave_get_rcu(netdev_priv(upper));
+                               if (active)
+                                       dev = active;
+                       }
+               }
+       }
+       if (dev)
+               dev_hold(dev);
+
+       rcu_read_unlock();
+       return dev;
+}
+
+static int mlx4_ib_update_gids(struct gid_entry *gids,
+                              struct mlx4_ib_dev *ibdev,
+                              u8 port_num)
+{
+       struct mlx4_cmd_mailbox *mailbox;
+       int err;
+       struct mlx4_dev *dev = ibdev->dev;
+       int i;
+       union ib_gid *gid_tbl;
+
+       mailbox = mlx4_alloc_cmd_mailbox(dev);
+       if (IS_ERR(mailbox))
+               return -ENOMEM;
+
+       gid_tbl = mailbox->buf;
+
+       for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i)
+               memcpy(&gid_tbl[i], &gids[i].gid, sizeof(union ib_gid));
+
+       err = mlx4_cmd(dev, mailbox->dma,
+                      MLX4_SET_PORT_GID_TABLE << 8 | port_num,
+                      1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
+                      MLX4_CMD_WRAPPED);
+       if (mlx4_is_bonded(dev))
+               err += mlx4_cmd(dev, mailbox->dma,
+                               MLX4_SET_PORT_GID_TABLE << 8 | 2,
+                               1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
+                               MLX4_CMD_WRAPPED);
+
+       mlx4_free_cmd_mailbox(dev, mailbox);
+       return err;
+}
+
+static int mlx4_ib_add_gid(struct ib_device *device,
+                          u8 port_num,
+                          unsigned int index,
+                          const union ib_gid *gid,
+                          const struct ib_gid_attr *attr,
+                          void **context)
+{
+       struct mlx4_ib_dev *ibdev = to_mdev(device);
+       struct mlx4_ib_iboe *iboe = &ibdev->iboe;
+       struct mlx4_port_gid_table   *port_gid_table;
+       int free = -1, found = -1;
+       int ret = 0;
+       int hw_update = 0;
+       int i;
+       struct gid_entry *gids = NULL;
+
+       if (!rdma_cap_roce_gid_table(device, port_num))
+               return -EINVAL;
+
+       if (port_num > MLX4_MAX_PORTS)
+               return -EINVAL;
+
+       if (!context)
+               return -EINVAL;
+
+       port_gid_table = &iboe->gids[port_num - 1];
+       spin_lock_bh(&iboe->lock);
+       for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i) {
+               if (!memcmp(&port_gid_table->gids[i].gid, gid, sizeof(*gid))) {
+                       found = i;
+                       break;
+               }
+               if (free < 0 && !memcmp(&port_gid_table->gids[i].gid, &zgid, sizeof(*gid)))
+                       free = i; /* HW has space */
+       }
+
+       if (found < 0) {
+               if (free < 0) {
+                       ret = -ENOSPC;
+               } else {
+                       port_gid_table->gids[free].ctx = kmalloc(sizeof(*port_gid_table->gids[free].ctx), GFP_ATOMIC);
+                       if (!port_gid_table->gids[free].ctx) {
+                               ret = -ENOMEM;
+                       } else {
+                               *context = port_gid_table->gids[free].ctx;
+                               memcpy(&port_gid_table->gids[free].gid, gid, sizeof(*gid));
+                               port_gid_table->gids[free].ctx->real_index = free;
+                               port_gid_table->gids[free].ctx->refcount = 1;
+                               hw_update = 1;
+                       }
+               }
+       } else {
+               struct gid_cache_context *ctx = port_gid_table->gids[found].ctx;
+               *context = ctx;
+               ctx->refcount++;
+       }
+       if (!ret && hw_update) {
+               gids = kmalloc(sizeof(*gids) * MLX4_MAX_PORT_GIDS, GFP_ATOMIC);
+               if (!gids) {
+                       ret = -ENOMEM;
+               } else {
+                       for (i = 0; i < MLX4_MAX_PORT_GIDS; i++)
+                               memcpy(&gids[i].gid, &port_gid_table->gids[i].gid, sizeof(union ib_gid));
+               }
+       }
+       spin_unlock_bh(&iboe->lock);
+
+       if (!ret && hw_update) {
+               ret = mlx4_ib_update_gids(gids, ibdev, port_num);
+               kfree(gids);
+       }
+
+       return ret;
+}
+
+static int mlx4_ib_del_gid(struct ib_device *device,
+                          u8 port_num,
+                          unsigned int index,
+                          void **context)
+{
+       struct gid_cache_context *ctx = *context;
+       struct mlx4_ib_dev *ibdev = to_mdev(device);
+       struct mlx4_ib_iboe *iboe = &ibdev->iboe;
+       struct mlx4_port_gid_table   *port_gid_table;
+       int ret = 0;
+       int hw_update = 0;
+       struct gid_entry *gids = NULL;
+
+       if (!rdma_cap_roce_gid_table(device, port_num))
+               return -EINVAL;
+
+       if (port_num > MLX4_MAX_PORTS)
+               return -EINVAL;
+
+       port_gid_table = &iboe->gids[port_num - 1];
+       spin_lock_bh(&iboe->lock);
+       if (ctx) {
+               ctx->refcount--;
+               if (!ctx->refcount) {
+                       unsigned int real_index = ctx->real_index;
+
+                       memcpy(&port_gid_table->gids[real_index].gid, &zgid, sizeof(zgid));
+                       kfree(port_gid_table->gids[real_index].ctx);
+                       port_gid_table->gids[real_index].ctx = NULL;
+                       hw_update = 1;
+               }
+       }
+       if (!ret && hw_update) {
+               int i;
+
+               gids = kmalloc(sizeof(*gids) * MLX4_MAX_PORT_GIDS, GFP_ATOMIC);
+               if (!gids) {
+                       ret = -ENOMEM;
+               } else {
+                       for (i = 0; i < MLX4_MAX_PORT_GIDS; i++)
+                               memcpy(&gids[i].gid, &port_gid_table->gids[i].gid, sizeof(union ib_gid));
+               }
+       }
+       spin_unlock_bh(&iboe->lock);
+
+       if (!ret && hw_update) {
+               ret = mlx4_ib_update_gids(gids, ibdev, port_num);
+               kfree(gids);
+       }
+       return ret;
+}
+
+int mlx4_ib_gid_index_to_real_index(struct mlx4_ib_dev *ibdev,
+                                   u8 port_num, int index)
+{
+       struct mlx4_ib_iboe *iboe = &ibdev->iboe;
+       struct gid_cache_context *ctx = NULL;
+       union ib_gid gid;
+       struct mlx4_port_gid_table   *port_gid_table;
+       int real_index = -EINVAL;
+       int i;
+       int ret;
+       unsigned long flags;
+
+       if (port_num > MLX4_MAX_PORTS)
+               return -EINVAL;
+
+       if (mlx4_is_bonded(ibdev->dev))
+               port_num = 1;
+
+       if (!rdma_cap_roce_gid_table(&ibdev->ib_dev, port_num))
+               return index;
+
+       ret = ib_get_cached_gid(&ibdev->ib_dev, port_num, index, &gid);
+       if (ret)
+               return ret;
+
+       if (!memcmp(&gid, &zgid, sizeof(gid)))
+               return -EINVAL;
+
+       spin_lock_irqsave(&iboe->lock, flags);
+       port_gid_table = &iboe->gids[port_num - 1];
+
+       for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i)
+               if (!memcmp(&port_gid_table->gids[i].gid, &gid, sizeof(gid))) {
+                       ctx = port_gid_table->gids[i].ctx;
+                       break;
+               }
+       if (ctx)
+               real_index = ctx->real_index;
+       spin_unlock_irqrestore(&iboe->lock, flags);
+       return real_index;
+}
+
 static int mlx4_ib_query_device(struct ib_device *ibdev,
                                struct ib_device_attr *props,
                                struct ib_udata *uhw)
@@ -229,6 +454,7 @@ static int mlx4_ib_query_device(struct ib_device *ibdev,
        props->max_qp_wr           = dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE;
        props->max_sge             = min(dev->dev->caps.max_sq_sg,
                                         dev->dev->caps.max_rq_sg);
+       props->max_sge_rd = props->max_sge;
        props->max_cq              = dev->dev->quotas.cq;
        props->max_cqe             = dev->dev->caps.max_cqes;
        props->max_mr              = dev->dev->quotas.mpt;
@@ -414,12 +640,13 @@ static int eth_link_query_port(struct ib_device *ibdev, u8 port,
        props->state            = IB_PORT_DOWN;
        props->phys_state       = state_to_phys_state(props->state);
        props->active_mtu       = IB_MTU_256;
-       if (is_bonded)
-               rtnl_lock(); /* required to get upper dev */
        spin_lock_bh(&iboe->lock);
        ndev = iboe->netdevs[port - 1];
-       if (ndev && is_bonded)
-               ndev = netdev_master_upper_dev_get(ndev);
+       if (ndev && is_bonded) {
+               rcu_read_lock(); /* required to get upper dev */
+               ndev = netdev_master_upper_dev_get_rcu(ndev);
+               rcu_read_unlock();
+       }
        if (!ndev)
                goto out_unlock;
 
@@ -431,8 +658,6 @@ static int eth_link_query_port(struct ib_device *ibdev, u8 port,
        props->phys_state       = state_to_phys_state(props->state);
 out_unlock:
        spin_unlock_bh(&iboe->lock);
-       if (is_bonded)
-               rtnl_unlock();
 out:
        mlx4_free_cmd_mailbox(mdev->dev, mailbox);
        return err;
@@ -515,23 +740,27 @@ out:
        return err;
 }
 
-static int iboe_query_gid(struct ib_device *ibdev, u8 port, int index,
-                         union ib_gid *gid)
-{
-       struct mlx4_ib_dev *dev = to_mdev(ibdev);
-
-       *gid = dev->iboe.gid_table[port - 1][index];
-
-       return 0;
-}
-
 static int mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
                             union ib_gid *gid)
 {
-       if (rdma_port_get_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND)
+       int ret;
+
+       if (rdma_protocol_ib(ibdev, port))
                return __mlx4_ib_query_gid(ibdev, port, index, gid, 0);
-       else
-               return iboe_query_gid(ibdev, port, index, gid);
+
+       if (!rdma_protocol_roce(ibdev, port))
+               return -ENODEV;
+
+       if (!rdma_cap_roce_gid_table(ibdev, port))
+               return -ENODEV;
+
+       ret = ib_get_cached_gid(ibdev, port, index, gid);
+       if (ret == -EAGAIN) {
+               memcpy(gid, &zgid, sizeof(*gid));
+               return 0;
+       }
+
+       return ret;
 }
 
 int __mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
@@ -692,7 +921,7 @@ static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev,
                resp.cqe_size         = dev->dev->caps.cqe_size;
        }
 
-       context = kmalloc(sizeof *context, GFP_KERNEL);
+       context = kzalloc(sizeof(*context), GFP_KERNEL);
        if (!context)
                return ERR_PTR(-ENOMEM);
 
@@ -729,21 +958,143 @@ static int mlx4_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
        return 0;
 }
 
+static void  mlx4_ib_vma_open(struct vm_area_struct *area)
+{
+       /* vma_open is called when a new VMA is created on top of our VMA.
+        * This is done through either mremap flow or split_vma (usually due
+        * to mlock, madvise, munmap, etc.). We do not support a clone of the
+        * vma, as this VMA is strongly hardware related. Therefore we set the
+        * vm_ops of the newly created/cloned VMA to NULL, to prevent it from
+        * calling us again and trying to do incorrect actions. We assume that
+        * the original vma size is exactly a single page that there will be no
+        * "splitting" operations on.
+        */
+       area->vm_ops = NULL;
+}
+
+static void  mlx4_ib_vma_close(struct vm_area_struct *area)
+{
+       struct mlx4_ib_vma_private_data *mlx4_ib_vma_priv_data;
+
+       /* It's guaranteed that all VMAs opened on a FD are closed before the
+        * closing flow (e.g. mlx4_ib_dealloc_ucontext).  However, a sync is
+        * needed with accesses to the vma as part of mlx4_ib_disassociate_ucontext.
+        * with accessing the vma as part of mlx4_ib_disassociate_ucontext.
+        * The close operation is usually called under mm->mmap_sem except when
+        * process is exiting.  The exiting case is handled explicitly as part
+        * of mlx4_ib_disassociate_ucontext.
+        */
+       mlx4_ib_vma_priv_data = (struct mlx4_ib_vma_private_data *)
+                               area->vm_private_data;
+
+       /* set the vma context pointer to null in the mlx4_ib driver's private
+        * data to protect against a race condition in mlx4_ib_disassociate_ucontext().
+        */
+       mlx4_ib_vma_priv_data->vma = NULL;
+}
+
+static const struct vm_operations_struct mlx4_ib_vm_ops = {
+       .open = mlx4_ib_vma_open,
+       .close = mlx4_ib_vma_close
+};
+
+static void mlx4_ib_disassociate_ucontext(struct ib_ucontext *ibcontext)
+{
+       int i;
+       int ret = 0;
+       struct vm_area_struct *vma;
+       struct mlx4_ib_ucontext *context = to_mucontext(ibcontext);
+       struct task_struct *owning_process  = NULL;
+       struct mm_struct   *owning_mm       = NULL;
+
+       owning_process = get_pid_task(ibcontext->tgid, PIDTYPE_PID);
+       if (!owning_process)
+               return;
+
+       owning_mm = get_task_mm(owning_process);
+       if (!owning_mm) {
+               pr_info("no mm, disassociate ucontext is pending task termination\n");
+               while (1) {
+                       /* make sure that task is dead before returning, it may
+                        * prevent a rare case of module down in parallel to a
+                        * call to mlx4_ib_vma_close.
+                        */
+                       put_task_struct(owning_process);
+                       msleep(1);
+                       owning_process = get_pid_task(ibcontext->tgid,
+                                                     PIDTYPE_PID);
+                       if (!owning_process ||
+                           owning_process->state == TASK_DEAD) {
+                               pr_info("disassociate ucontext done, task was terminated\n");
+                               /* in case task was dead need to release the task struct */
+                               if (owning_process)
+                                       put_task_struct(owning_process);
+                               return;
+                       }
+               }
+       }
+
+       /* need to protect from a race on closing the vma as part of
+        * mlx4_ib_vma_close().
+        */
+       down_read(&owning_mm->mmap_sem);
+       for (i = 0; i < HW_BAR_COUNT; i++) {
+               vma = context->hw_bar_info[i].vma;
+               if (!vma)
+                       continue;
+
+               ret = zap_vma_ptes(context->hw_bar_info[i].vma,
+                                  context->hw_bar_info[i].vma->vm_start,
+                                  PAGE_SIZE);
+               if (ret) {
+                       pr_err("Error: zap_vma_ptes failed for index=%d, ret=%d\n", i, ret);
+                       BUG_ON(1);
+               }
+
+               /* context going to be destroyed, should not access ops any more */
+               context->hw_bar_info[i].vma->vm_ops = NULL;
+       }
+
+       up_read(&owning_mm->mmap_sem);
+       mmput(owning_mm);
+       put_task_struct(owning_process);
+}
+
+static void mlx4_ib_set_vma_data(struct vm_area_struct *vma,
+                                struct mlx4_ib_vma_private_data *vma_private_data)
+{
+       vma_private_data->vma = vma;
+       vma->vm_private_data = vma_private_data;
+       vma->vm_ops =  &mlx4_ib_vm_ops;
+}
+
 static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
 {
        struct mlx4_ib_dev *dev = to_mdev(context->device);
+       struct mlx4_ib_ucontext *mucontext = to_mucontext(context);
 
        if (vma->vm_end - vma->vm_start != PAGE_SIZE)
                return -EINVAL;
 
        if (vma->vm_pgoff == 0) {
+               /* We prevent double mmaping on same context */
+               if (mucontext->hw_bar_info[HW_BAR_DB].vma)
+                       return -EINVAL;
+
                vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 
                if (io_remap_pfn_range(vma, vma->vm_start,
                                       to_mucontext(context)->uar.pfn,
                                       PAGE_SIZE, vma->vm_page_prot))
                        return -EAGAIN;
+
+               mlx4_ib_set_vma_data(vma, &mucontext->hw_bar_info[HW_BAR_DB]);
+
        } else if (vma->vm_pgoff == 1 && dev->dev->caps.bf_reg_size != 0) {
+               /* We prevent double mmaping on same context */
+               if (mucontext->hw_bar_info[HW_BAR_BF].vma)
+                       return -EINVAL;
+
                vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
 
                if (io_remap_pfn_range(vma, vma->vm_start,
@@ -751,9 +1102,18 @@ static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
                                       dev->dev->caps.num_uars,
                                       PAGE_SIZE, vma->vm_page_prot))
                        return -EAGAIN;
+
+               mlx4_ib_set_vma_data(vma, &mucontext->hw_bar_info[HW_BAR_BF]);
+
        } else if (vma->vm_pgoff == 3) {
                struct mlx4_clock_params params;
-               int ret = mlx4_get_internal_clock_params(dev->dev, &params);
+               int ret;
+
+               /* We prevent double mmaping on same context */
+               if (mucontext->hw_bar_info[HW_BAR_CLOCK].vma)
+                       return -EINVAL;
+
+               ret = mlx4_get_internal_clock_params(dev->dev, &params);
 
                if (ret)
                        return ret;
@@ -766,6 +1126,9 @@ static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
                                       >> PAGE_SHIFT,
                                       PAGE_SIZE, vma->vm_page_prot))
                        return -EAGAIN;
+
+               mlx4_ib_set_vma_data(vma,
+                                    &mucontext->hw_bar_info[HW_BAR_CLOCK]);
        } else {
                return -EINVAL;
        }
@@ -1547,272 +1910,6 @@ static struct device_attribute *mlx4_class_attributes[] = {
        &dev_attr_board_id
 };
 
-static void mlx4_addrconf_ifid_eui48(u8 *eui, u16 vlan_id,
-                                    struct net_device *dev)
-{
-       memcpy(eui, dev->dev_addr, 3);
-       memcpy(eui + 5, dev->dev_addr + 3, 3);
-       if (vlan_id < 0x1000) {
-               eui[3] = vlan_id >> 8;
-               eui[4] = vlan_id & 0xff;
-       } else {
-               eui[3] = 0xff;
-               eui[4] = 0xfe;
-       }
-       eui[0] ^= 2;
-}
-
-static void update_gids_task(struct work_struct *work)
-{
-       struct update_gid_work *gw = container_of(work, struct update_gid_work, work);
-       struct mlx4_cmd_mailbox *mailbox;
-       union ib_gid *gids;
-       int err;
-       struct mlx4_dev *dev = gw->dev->dev;
-       int is_bonded = mlx4_is_bonded(dev);
-
-       if (!gw->dev->ib_active)
-               return;
-
-       mailbox = mlx4_alloc_cmd_mailbox(dev);
-       if (IS_ERR(mailbox)) {
-               pr_warn("update gid table failed %ld\n", PTR_ERR(mailbox));
-               return;
-       }
-
-       gids = mailbox->buf;
-       memcpy(gids, gw->gids, sizeof gw->gids);
-
-       err = mlx4_cmd(dev, mailbox->dma, MLX4_SET_PORT_GID_TABLE << 8 | gw->port,
-                      MLX4_SET_PORT_ETH_OPCODE, MLX4_CMD_SET_PORT,
-                      MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED);
-       if (err)
-               pr_warn("set port command failed\n");
-       else
-               if ((gw->port == 1) || !is_bonded)
-                       mlx4_ib_dispatch_event(gw->dev,
-                                              is_bonded ? 1 : gw->port,
-                                              IB_EVENT_GID_CHANGE);
-
-       mlx4_free_cmd_mailbox(dev, mailbox);
-       kfree(gw);
-}
-
-static void reset_gids_task(struct work_struct *work)
-{
-       struct update_gid_work *gw =
-                       container_of(work, struct update_gid_work, work);
-       struct mlx4_cmd_mailbox *mailbox;
-       union ib_gid *gids;
-       int err;
-       struct mlx4_dev *dev = gw->dev->dev;
-
-       if (!gw->dev->ib_active)
-               return;
-
-       mailbox = mlx4_alloc_cmd_mailbox(dev);
-       if (IS_ERR(mailbox)) {
-               pr_warn("reset gid table failed\n");
-               goto free;
-       }
-
-       gids = mailbox->buf;
-       memcpy(gids, gw->gids, sizeof(gw->gids));
-
-       if (mlx4_ib_port_link_layer(&gw->dev->ib_dev, gw->port) ==
-                                   IB_LINK_LAYER_ETHERNET) {
-               err = mlx4_cmd(dev, mailbox->dma,
-                              MLX4_SET_PORT_GID_TABLE << 8 | gw->port,
-                              MLX4_SET_PORT_ETH_OPCODE, MLX4_CMD_SET_PORT,
-                              MLX4_CMD_TIME_CLASS_B,
-                              MLX4_CMD_WRAPPED);
-               if (err)
-                       pr_warn("set port %d command failed\n", gw->port);
-       }
-
-       mlx4_free_cmd_mailbox(dev, mailbox);
-free:
-       kfree(gw);
-}
-
-static int update_gid_table(struct mlx4_ib_dev *dev, int port,
-                           union ib_gid *gid, int clear,
-                           int default_gid)
-{
-       struct update_gid_work *work;
-       int i;
-       int need_update = 0;
-       int free = -1;
-       int found = -1;
-       int max_gids;
-
-       if (default_gid) {
-               free = 0;
-       } else {
-               max_gids = dev->dev->caps.gid_table_len[port];
-               for (i = 1; i < max_gids; ++i) {
-                       if (!memcmp(&dev->iboe.gid_table[port - 1][i], gid,
-                                   sizeof(*gid)))
-                               found = i;
-
-                       if (clear) {
-                               if (found >= 0) {
-                                       need_update = 1;
-                                       dev->iboe.gid_table[port - 1][found] =
-                                               zgid;
-                                       break;
-                               }
-                       } else {
-                               if (found >= 0)
-                                       break;
-
-                               if (free < 0 &&
-                                   !memcmp(&dev->iboe.gid_table[port - 1][i],
-                                           &zgid, sizeof(*gid)))
-                                       free = i;
-                       }
-               }
-       }
-
-       if (found == -1 && !clear && free >= 0) {
-               dev->iboe.gid_table[port - 1][free] = *gid;
-               need_update = 1;
-       }
-
-       if (!need_update)
-               return 0;
-
-       work = kzalloc(sizeof(*work), GFP_ATOMIC);
-       if (!work)
-               return -ENOMEM;
-
-       memcpy(work->gids, dev->iboe.gid_table[port - 1], sizeof(work->gids));
-       INIT_WORK(&work->work, update_gids_task);
-       work->port = port;
-       work->dev = dev;
-       queue_work(wq, &work->work);
-
-       return 0;
-}
-
-static void mlx4_make_default_gid(struct  net_device *dev, union ib_gid *gid)
-{
-       gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL);
-       mlx4_addrconf_ifid_eui48(&gid->raw[8], 0xffff, dev);
-}
-
-
-static int reset_gid_table(struct mlx4_ib_dev *dev, u8 port)
-{
-       struct update_gid_work *work;
-
-       work = kzalloc(sizeof(*work), GFP_ATOMIC);
-       if (!work)
-               return -ENOMEM;
-
-       memset(dev->iboe.gid_table[port - 1], 0, sizeof(work->gids));
-       memset(work->gids, 0, sizeof(work->gids));
-       INIT_WORK(&work->work, reset_gids_task);
-       work->dev = dev;
-       work->port = port;
-       queue_work(wq, &work->work);
-       return 0;
-}
-
-static int mlx4_ib_addr_event(int event, struct net_device *event_netdev,
-                             struct mlx4_ib_dev *ibdev, union ib_gid *gid)
-{
-       struct mlx4_ib_iboe *iboe;
-       int port = 0;
-       struct net_device *real_dev = rdma_vlan_dev_real_dev(event_netdev) ?
-                               rdma_vlan_dev_real_dev(event_netdev) :
-                               event_netdev;
-       union ib_gid default_gid;
-
-       mlx4_make_default_gid(real_dev, &default_gid);
-
-       if (!memcmp(gid, &default_gid, sizeof(*gid)))
-               return 0;
-
-       if (event != NETDEV_DOWN && event != NETDEV_UP)
-               return 0;
-
-       if ((real_dev != event_netdev) &&
-           (event == NETDEV_DOWN) &&
-           rdma_link_local_addr((struct in6_addr *)gid))
-               return 0;
-
-       iboe = &ibdev->iboe;
-       spin_lock_bh(&iboe->lock);
-
-       for (port = 1; port <= ibdev->dev->caps.num_ports; ++port)
-               if ((netif_is_bond_master(real_dev) &&
-                    (real_dev == iboe->masters[port - 1])) ||
-                    (!netif_is_bond_master(real_dev) &&
-                    (real_dev == iboe->netdevs[port - 1])))
-                       update_gid_table(ibdev, port, gid,
-                                        event == NETDEV_DOWN, 0);
-
-       spin_unlock_bh(&iboe->lock);
-       return 0;
-
-}
-
-static u8 mlx4_ib_get_dev_port(struct net_device *dev,
-                              struct mlx4_ib_dev *ibdev)
-{
-       u8 port = 0;
-       struct mlx4_ib_iboe *iboe;
-       struct net_device *real_dev = rdma_vlan_dev_real_dev(dev) ?
-                               rdma_vlan_dev_real_dev(dev) : dev;
-
-       iboe = &ibdev->iboe;
-
-       for (port = 1; port <= ibdev->dev->caps.num_ports; ++port)
-               if ((netif_is_bond_master(real_dev) &&
-                    (real_dev == iboe->masters[port - 1])) ||
-                    (!netif_is_bond_master(real_dev) &&
-                    (real_dev == iboe->netdevs[port - 1])))
-                       break;
-
-       if ((port == 0) || (port > ibdev->dev->caps.num_ports))
-               return 0;
-       else
-               return port;
-}
-
-static int mlx4_ib_inet_event(struct notifier_block *this, unsigned long event,
-                               void *ptr)
-{
-       struct mlx4_ib_dev *ibdev;
-       struct in_ifaddr *ifa = ptr;
-       union ib_gid gid;
-       struct net_device *event_netdev = ifa->ifa_dev->dev;
-
-       ipv6_addr_set_v4mapped(ifa->ifa_address, (struct in6_addr *)&gid);
-
-       ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb_inet);
-
-       mlx4_ib_addr_event(event, event_netdev, ibdev, &gid);
-       return NOTIFY_DONE;
-}
-
-#if IS_ENABLED(CONFIG_IPV6)
-static int mlx4_ib_inet6_event(struct notifier_block *this, unsigned long event,
-                               void *ptr)
-{
-       struct mlx4_ib_dev *ibdev;
-       struct inet6_ifaddr *ifa = ptr;
-       union  ib_gid *gid = (union ib_gid *)&ifa->addr;
-       struct net_device *event_netdev = ifa->idev->dev;
-
-       ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb_inet6);
-
-       mlx4_ib_addr_event(event, event_netdev, ibdev, gid);
-       return NOTIFY_DONE;
-}
-#endif
-
 #define MLX4_IB_INVALID_MAC    ((u64)-1)
 static void mlx4_ib_update_qps(struct mlx4_ib_dev *ibdev,
                               struct net_device *dev,
@@ -1871,94 +1968,6 @@ unlock:
        mutex_unlock(&ibdev->qp1_proxy_lock[port - 1]);
 }
 
-static void mlx4_ib_get_dev_addr(struct net_device *dev,
-                                struct mlx4_ib_dev *ibdev, u8 port)
-{
-       struct in_device *in_dev;
-#if IS_ENABLED(CONFIG_IPV6)
-       struct inet6_dev *in6_dev;
-       union ib_gid  *pgid;
-       struct inet6_ifaddr *ifp;
-       union ib_gid default_gid;
-#endif
-       union ib_gid gid;
-
-
-       if ((port == 0) || (port > ibdev->dev->caps.num_ports))
-               return;
-
-       /* IPv4 gids */
-       in_dev = in_dev_get(dev);
-       if (in_dev) {
-               for_ifa(in_dev) {
-                       /*ifa->ifa_address;*/
-                       ipv6_addr_set_v4mapped(ifa->ifa_address,
-                                              (struct in6_addr *)&gid);
-                       update_gid_table(ibdev, port, &gid, 0, 0);
-               }
-               endfor_ifa(in_dev);
-               in_dev_put(in_dev);
-       }
-#if IS_ENABLED(CONFIG_IPV6)
-       mlx4_make_default_gid(dev, &default_gid);
-       /* IPv6 gids */
-       in6_dev = in6_dev_get(dev);
-       if (in6_dev) {
-               read_lock_bh(&in6_dev->lock);
-               list_for_each_entry(ifp, &in6_dev->addr_list, if_list) {
-                       pgid = (union ib_gid *)&ifp->addr;
-                       if (!memcmp(pgid, &default_gid, sizeof(*pgid)))
-                               continue;
-                       update_gid_table(ibdev, port, pgid, 0, 0);
-               }
-               read_unlock_bh(&in6_dev->lock);
-               in6_dev_put(in6_dev);
-       }
-#endif
-}
-
-static void mlx4_ib_set_default_gid(struct mlx4_ib_dev *ibdev,
-                                struct  net_device *dev, u8 port)
-{
-       union ib_gid gid;
-       mlx4_make_default_gid(dev, &gid);
-       update_gid_table(ibdev, port, &gid, 0, 1);
-}
-
-static int mlx4_ib_init_gid_table(struct mlx4_ib_dev *ibdev)
-{
-       struct  net_device *dev;
-       struct mlx4_ib_iboe *iboe = &ibdev->iboe;
-       int i;
-       int err = 0;
-
-       for (i = 1; i <= ibdev->num_ports; ++i) {
-               if (rdma_port_get_link_layer(&ibdev->ib_dev, i) ==
-                   IB_LINK_LAYER_ETHERNET) {
-                       err = reset_gid_table(ibdev, i);
-                       if (err)
-                               goto out;
-               }
-       }
-
-       read_lock(&dev_base_lock);
-       spin_lock_bh(&iboe->lock);
-
-       for_each_netdev(&init_net, dev) {
-               u8 port = mlx4_ib_get_dev_port(dev, ibdev);
-               /* port will be non-zero only for ETH ports */
-               if (port) {
-                       mlx4_ib_set_default_gid(ibdev, dev, port);
-                       mlx4_ib_get_dev_addr(dev, ibdev, port);
-               }
-       }
-
-       spin_unlock_bh(&iboe->lock);
-       read_unlock(&dev_base_lock);
-out:
-       return err;
-}
-
 static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev,
                                 struct net_device *dev,
                                 unsigned long event)
@@ -1968,81 +1977,22 @@ static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev,
        int update_qps_port = -1;
        int port;
 
+       ASSERT_RTNL();
+
        iboe = &ibdev->iboe;
 
        spin_lock_bh(&iboe->lock);
        mlx4_foreach_ib_transport_port(port, ibdev->dev) {
-               enum ib_port_state      port_state = IB_PORT_NOP;
-               struct net_device *old_master = iboe->masters[port - 1];
-               struct net_device *curr_netdev;
-               struct net_device *curr_master;
 
                iboe->netdevs[port - 1] =
                        mlx4_get_protocol_dev(ibdev->dev, MLX4_PROT_ETH, port);
-               if (iboe->netdevs[port - 1])
-                       mlx4_ib_set_default_gid(ibdev,
-                                               iboe->netdevs[port - 1], port);
-               curr_netdev = iboe->netdevs[port - 1];
-
-               if (iboe->netdevs[port - 1] &&
-                   netif_is_bond_slave(iboe->netdevs[port - 1])) {
-                       iboe->masters[port - 1] = netdev_master_upper_dev_get(
-                               iboe->netdevs[port - 1]);
-               } else {
-                       iboe->masters[port - 1] = NULL;
-               }
-               curr_master = iboe->masters[port - 1];
 
                if (dev == iboe->netdevs[port - 1] &&
                    (event == NETDEV_CHANGEADDR || event == NETDEV_REGISTER ||
                     event == NETDEV_UP || event == NETDEV_CHANGE))
                        update_qps_port = port;
 
-               if (curr_netdev) {
-                       port_state = (netif_running(curr_netdev) && netif_carrier_ok(curr_netdev)) ?
-                                               IB_PORT_ACTIVE : IB_PORT_DOWN;
-                       mlx4_ib_set_default_gid(ibdev, curr_netdev, port);
-                       if (curr_master) {
-                               /* if using bonding/team and a slave port is down, we
-                                * don't want the bond IP based gids in the table since
-                                * flows that select port by gid may get the down port.
-                               */
-                               if (port_state == IB_PORT_DOWN &&
-                                   !mlx4_is_bonded(ibdev->dev)) {
-                                       reset_gid_table(ibdev, port);
-                                       mlx4_ib_set_default_gid(ibdev,
-                                                               curr_netdev,
-                                                               port);
-                               } else {
-                                       /* gids from the upper dev (bond/team)
-                                        * should appear in port's gid table
-                                       */
-                                       mlx4_ib_get_dev_addr(curr_master,
-                                                            ibdev, port);
-                               }
-                       }
-                       /* if bonding is used it is possible that we add it to
-                        * masters only after IP address is assigned to the
-                        * net bonding interface.
-                       */
-                       if (curr_master && (old_master != curr_master)) {
-                               reset_gid_table(ibdev, port);
-                               mlx4_ib_set_default_gid(ibdev,
-                                                       curr_netdev, port);
-                               mlx4_ib_get_dev_addr(curr_master, ibdev, port);
-                       }
-
-                       if (!curr_master && (old_master != curr_master)) {
-                               reset_gid_table(ibdev, port);
-                               mlx4_ib_set_default_gid(ibdev,
-                                                       curr_netdev, port);
-                               mlx4_ib_get_dev_addr(curr_netdev, ibdev, port);
-                       }
-               } else {
-                       reset_gid_table(ibdev, port);
-               }
        }
-
        spin_unlock_bh(&iboe->lock);
 
        if (update_qps_port > 0)
@@ -2225,6 +2175,9 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
                                                1 : ibdev->num_ports;
        ibdev->ib_dev.num_comp_vectors  = dev->caps.num_comp_vectors;
        ibdev->ib_dev.dma_device        = &dev->persist->pdev->dev;
+       ibdev->ib_dev.get_netdev        = mlx4_ib_get_netdev;
+       ibdev->ib_dev.add_gid           = mlx4_ib_add_gid;
+       ibdev->ib_dev.del_gid           = mlx4_ib_del_gid;
 
        if (dev->caps.userspace_caps)
                ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION;
@@ -2293,13 +2246,14 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
        ibdev->ib_dev.reg_user_mr       = mlx4_ib_reg_user_mr;
        ibdev->ib_dev.rereg_user_mr     = mlx4_ib_rereg_user_mr;
        ibdev->ib_dev.dereg_mr          = mlx4_ib_dereg_mr;
-       ibdev->ib_dev.alloc_fast_reg_mr = mlx4_ib_alloc_fast_reg_mr;
+       ibdev->ib_dev.alloc_mr          = mlx4_ib_alloc_mr;
        ibdev->ib_dev.alloc_fast_reg_page_list = mlx4_ib_alloc_fast_reg_page_list;
        ibdev->ib_dev.free_fast_reg_page_list  = mlx4_ib_free_fast_reg_page_list;
        ibdev->ib_dev.attach_mcast      = mlx4_ib_mcg_attach;
        ibdev->ib_dev.detach_mcast      = mlx4_ib_mcg_detach;
        ibdev->ib_dev.process_mad       = mlx4_ib_process_mad;
        ibdev->ib_dev.get_port_immutable = mlx4_port_immutable;
+       ibdev->ib_dev.disassociate_ucontext = mlx4_ib_disassociate_ucontext;
 
        if (!mlx4_is_slave(ibdev->dev)) {
                ibdev->ib_dev.alloc_fmr         = mlx4_ib_fmr_alloc;
@@ -2435,26 +2389,6 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
                                goto err_notif;
                        }
                }
-               if (!iboe->nb_inet.notifier_call) {
-                       iboe->nb_inet.notifier_call = mlx4_ib_inet_event;
-                       err = register_inetaddr_notifier(&iboe->nb_inet);
-                       if (err) {
-                               iboe->nb_inet.notifier_call = NULL;
-                               goto err_notif;
-                       }
-               }
-#if IS_ENABLED(CONFIG_IPV6)
-               if (!iboe->nb_inet6.notifier_call) {
-                       iboe->nb_inet6.notifier_call = mlx4_ib_inet6_event;
-                       err = register_inet6addr_notifier(&iboe->nb_inet6);
-                       if (err) {
-                               iboe->nb_inet6.notifier_call = NULL;
-                               goto err_notif;
-                       }
-               }
-#endif
-               if (mlx4_ib_init_gid_table(ibdev))
-                       goto err_notif;
        }
 
        for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) {
@@ -2485,18 +2419,6 @@ err_notif:
                        pr_warn("failure unregistering notifier\n");
                ibdev->iboe.nb.notifier_call = NULL;
        }
-       if (ibdev->iboe.nb_inet.notifier_call) {
-               if (unregister_inetaddr_notifier(&ibdev->iboe.nb_inet))
-                       pr_warn("failure unregistering notifier\n");
-               ibdev->iboe.nb_inet.notifier_call = NULL;
-       }
-#if IS_ENABLED(CONFIG_IPV6)
-       if (ibdev->iboe.nb_inet6.notifier_call) {
-               if (unregister_inet6addr_notifier(&ibdev->iboe.nb_inet6))
-                       pr_warn("failure unregistering notifier\n");
-               ibdev->iboe.nb_inet6.notifier_call = NULL;
-       }
-#endif
        flush_workqueue(wq);
 
        mlx4_ib_close_sriov(ibdev);
@@ -2622,19 +2544,6 @@ static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr)
                kfree(ibdev->ib_uc_qpns_bitmap);
        }
 
-       if (ibdev->iboe.nb_inet.notifier_call) {
-               if (unregister_inetaddr_notifier(&ibdev->iboe.nb_inet))
-                       pr_warn("failure unregistering notifier\n");
-               ibdev->iboe.nb_inet.notifier_call = NULL;
-       }
-#if IS_ENABLED(CONFIG_IPV6)
-       if (ibdev->iboe.nb_inet6.notifier_call) {
-               if (unregister_inet6addr_notifier(&ibdev->iboe.nb_inet6))
-                       pr_warn("failure unregistering notifier\n");
-               ibdev->iboe.nb_inet6.notifier_call = NULL;
-       }
-#endif
-
        iounmap(ibdev->uar_map);
        for (p = 0; p < ibdev->num_ports; ++p)
                if (ibdev->counters[p].index != -1 &&
index ed327e6c8fdca54baf19c3ded92d60776cc1adbf..2d5bccd71fc66d121382326655a4180db1692e0b 100644 (file)
        pr_warn("%s-%d: %16s (port %d): WARNING: " format, __func__, __LINE__,\
        (group)->name, group->demux->port, ## arg)
 
+#define mcg_debug_group(group, format, arg...) \
+       pr_debug("%s-%d: %16s (port %d): WARNING: " format, __func__, __LINE__,\
+                (group)->name, (group)->demux->port, ## arg)
+
 #define mcg_error_group(group, format, arg...) \
        pr_err("  %16s: " format, (group)->name, ## arg)
 
@@ -206,15 +210,16 @@ static int send_mad_to_wire(struct mlx4_ib_demux_ctx *ctx, struct ib_mad *mad)
 {
        struct mlx4_ib_dev *dev = ctx->dev;
        struct ib_ah_attr       ah_attr;
+       unsigned long flags;
 
-       spin_lock(&dev->sm_lock);
+       spin_lock_irqsave(&dev->sm_lock, flags);
        if (!dev->sm_ah[ctx->port - 1]) {
                /* port is not yet Active, sm_ah not ready */
-               spin_unlock(&dev->sm_lock);
+               spin_unlock_irqrestore(&dev->sm_lock, flags);
                return -EAGAIN;
        }
        mlx4_ib_query_ah(dev->sm_ah[ctx->port - 1], &ah_attr);
-       spin_unlock(&dev->sm_lock);
+       spin_unlock_irqrestore(&dev->sm_lock, flags);
        return mlx4_ib_send_to_wire(dev, mlx4_master_func_num(dev->dev),
                                    ctx->port, IB_QPT_GSI, 0, 1, IB_QP1_QKEY,
                                    &ah_attr, NULL, mad);
@@ -961,8 +966,8 @@ int mlx4_ib_mcg_multiplex_handler(struct ib_device *ibdev, int port,
                mutex_lock(&group->lock);
                if (group->func[slave].num_pend_reqs > MAX_PEND_REQS_PER_FUNC) {
                        mutex_unlock(&group->lock);
-                       mcg_warn_group(group, "Port %d, Func %d has too many pending requests (%d), dropping\n",
-                                      port, slave, MAX_PEND_REQS_PER_FUNC);
+                       mcg_debug_group(group, "Port %d, Func %d has too many pending requests (%d), dropping\n",
+                                       port, slave, MAX_PEND_REQS_PER_FUNC);
                        release_group(group, 0);
                        kfree(req);
                        return -ENOMEM;
index 334387f63358a34d4ef1ae31ee068e3f58da3fe8..1e7b23bb2eb0bbb4b95c737585ace30371003494 100644 (file)
@@ -70,11 +70,24 @@ extern int mlx4_ib_sm_guid_assign;
 
 #define MLX4_IB_UC_STEER_QPN_ALIGN 1
 #define MLX4_IB_UC_MAX_NUM_QPS     256
+
+enum hw_bar_type {
+       HW_BAR_BF,
+       HW_BAR_DB,
+       HW_BAR_CLOCK,
+       HW_BAR_COUNT
+};
+
+struct mlx4_ib_vma_private_data {
+       struct vm_area_struct *vma;
+};
+
 struct mlx4_ib_ucontext {
        struct ib_ucontext      ibucontext;
        struct mlx4_uar         uar;
        struct list_head        db_page_list;
        struct mutex            db_page_mutex;
+       struct mlx4_ib_vma_private_data hw_bar_info[HW_BAR_COUNT];
 };
 
 struct mlx4_ib_pd {
@@ -415,7 +428,6 @@ struct mlx4_ib_demux_pv_ctx {
        struct ib_device *ib_dev;
        struct ib_cq *cq;
        struct ib_pd *pd;
-       struct ib_mr *mr;
        struct work_struct work;
        struct workqueue_struct *wq;
        struct mlx4_ib_demux_pv_qp qp[2];
@@ -457,15 +469,26 @@ struct mlx4_ib_sriov {
        struct idr pv_id_table;
 };
 
+struct gid_cache_context {
+       int real_index;
+       int refcount;
+};
+
+struct gid_entry {
+       union ib_gid    gid;
+       struct gid_cache_context *ctx;
+};
+
+struct mlx4_port_gid_table {
+       struct gid_entry gids[MLX4_MAX_PORT_GIDS];
+};
+
 struct mlx4_ib_iboe {
        spinlock_t              lock;
        struct net_device      *netdevs[MLX4_MAX_PORTS];
-       struct net_device      *masters[MLX4_MAX_PORTS];
        atomic64_t              mac[MLX4_MAX_PORTS];
        struct notifier_block   nb;
-       struct notifier_block   nb_inet;
-       struct notifier_block   nb_inet6;
-       union ib_gid            gid_table[MLX4_MAX_PORTS][128];
+       struct mlx4_port_gid_table gids[MLX4_MAX_PORTS];
 };
 
 struct pkey_mgt {
@@ -680,8 +703,9 @@ struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type);
 int mlx4_ib_bind_mw(struct ib_qp *qp, struct ib_mw *mw,
                    struct ib_mw_bind *mw_bind);
 int mlx4_ib_dealloc_mw(struct ib_mw *mw);
-struct ib_mr *mlx4_ib_alloc_fast_reg_mr(struct ib_pd *pd,
-                                       int max_page_list_len);
+struct ib_mr *mlx4_ib_alloc_mr(struct ib_pd *pd,
+                              enum ib_mr_type mr_type,
+                              u32 max_num_sg);
 struct ib_fast_reg_page_list *mlx4_ib_alloc_fast_reg_page_list(struct ib_device *ibdev,
                                                               int page_list_len);
 void mlx4_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list);
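The prototype change above (and the matching driver conversions elsewhere in this diff) replaces the per-driver alloc_fast_reg_mr entry points with the generic alloc_mr verb. From a consumer's point of view the call becomes ib_alloc_mr(); a minimal, hypothetical ULP-side sketch (only the verbs calls are real API, the surrounding names are made up):

#include <linux/err.h>
#include <rdma/ib_verbs.h>

/* Hypothetical consumer: allocate a memory-registration MR that can map up
 * to 32 pages on an existing PD, then release it again. */
static int example_alloc_reg_mr(struct ib_pd *pd)
{
	struct ib_mr *mr;

	mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, 32);
	if (IS_ERR(mr))
		return PTR_ERR(mr);

	/* ... post fast-register work requests that reference mr->lkey ... */

	return ib_dereg_mr(mr);
}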
@@ -838,5 +862,7 @@ int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags,
                          u64 start, u64 length, u64 virt_addr,
                          int mr_access_flags, struct ib_pd *pd,
                          struct ib_udata *udata);
+int mlx4_ib_gid_index_to_real_index(struct mlx4_ib_dev *ibdev,
+                                   u8 port_num, int index);
 
 #endif /* MLX4_IB_H */
index e0d271782d0a0012577a100e4a59c27f17f9366d..2542fd3c1a493e037d6b1e24a4d5b15176ca03f9 100644 (file)
@@ -350,19 +350,24 @@ int mlx4_ib_dealloc_mw(struct ib_mw *ibmw)
        return 0;
 }
 
-struct ib_mr *mlx4_ib_alloc_fast_reg_mr(struct ib_pd *pd,
-                                       int max_page_list_len)
+struct ib_mr *mlx4_ib_alloc_mr(struct ib_pd *pd,
+                              enum ib_mr_type mr_type,
+                              u32 max_num_sg)
 {
        struct mlx4_ib_dev *dev = to_mdev(pd->device);
        struct mlx4_ib_mr *mr;
        int err;
 
+       if (mr_type != IB_MR_TYPE_MEM_REG ||
+           max_num_sg > MLX4_MAX_FAST_REG_PAGES)
+               return ERR_PTR(-EINVAL);
+
        mr = kmalloc(sizeof *mr, GFP_KERNEL);
        if (!mr)
                return ERR_PTR(-ENOMEM);
 
        err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, 0, 0, 0,
-                           max_page_list_len, 0, &mr->mmr);
+                           max_num_sg, 0, &mr->mmr);
        if (err)
                goto err_free;
 
index c5a3a5f0de41f696a8eadd69f79ceecbad4628bd..4ad9be3ad61c0a780be7c0ce9aa098989226c77e 100644 (file)
@@ -1292,14 +1292,18 @@ static int _mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah,
                path->static_rate = 0;
 
        if (ah->ah_flags & IB_AH_GRH) {
-               if (ah->grh.sgid_index >= dev->dev->caps.gid_table_len[port]) {
+               int real_sgid_index = mlx4_ib_gid_index_to_real_index(dev,
+                                                                     port,
+                                                                     ah->grh.sgid_index);
+
+               if (real_sgid_index >= dev->dev->caps.gid_table_len[port]) {
                        pr_err("sgid_index (%u) too large. max is %d\n",
-                              ah->grh.sgid_index, dev->dev->caps.gid_table_len[port] - 1);
+                              real_sgid_index, dev->dev->caps.gid_table_len[port] - 1);
                        return -1;
                }
 
                path->grh_mylmc |= 1 << 7;
-               path->mgid_index = ah->grh.sgid_index;
+               path->mgid_index = real_sgid_index;
                path->hop_limit  = ah->grh.hop_limit;
                path->tclass_flowlabel =
                        cpu_to_be32((ah->grh.traffic_class << 20) |
index 6797108ce8735b7aa11b08b4c09d8b62c29ec364..69fb5ba94d0f226c11a8614b29be4c33df329040 100644 (file)
@@ -640,6 +640,8 @@ static int add_port(struct mlx4_ib_dev *dev, int port_num, int slave)
        struct mlx4_port *p;
        int i;
        int ret;
+       int is_eth = rdma_port_get_link_layer(&dev->ib_dev, port_num) ==
+                       IB_LINK_LAYER_ETHERNET;
 
        p = kzalloc(sizeof *p, GFP_KERNEL);
        if (!p)
@@ -657,7 +659,8 @@ static int add_port(struct mlx4_ib_dev *dev, int port_num, int slave)
 
        p->pkey_group.name  = "pkey_idx";
        p->pkey_group.attrs =
-               alloc_group_attrs(show_port_pkey, store_port_pkey,
+               alloc_group_attrs(show_port_pkey,
+                                 is_eth ? NULL : store_port_pkey,
                                  dev->dev->caps.pkey_table_len[port_num]);
        if (!p->pkey_group.attrs) {
                ret = -ENOMEM;
index 5c9eeea628054db99df2c987478b6bd713dbcf16..2d0dbbf38ceb9f6277bc9c86e726b97a6e33f17d 100644 (file)
@@ -33,6 +33,7 @@
 #include <linux/kref.h>
 #include <rdma/ib_umem.h>
 #include <rdma/ib_user_verbs.h>
+#include <rdma/ib_cache.h>
 #include "mlx5_ib.h"
 #include "user.h"
 
@@ -227,7 +228,14 @@ static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe,
        wc->dlid_path_bits = cqe->ml_path;
        g = (be32_to_cpu(cqe->flags_rqpn) >> 28) & 3;
        wc->wc_flags |= g ? IB_WC_GRH : 0;
-       wc->pkey_index     = be32_to_cpu(cqe->imm_inval_pkey) & 0xffff;
+       if (unlikely(is_qp1(qp->ibqp.qp_type))) {
+               u16 pkey = be32_to_cpu(cqe->imm_inval_pkey) & 0xffff;
+
+               ib_find_cached_pkey(&dev->ib_dev, qp->port, pkey,
+                                   &wc->pkey_index);
+       } else {
+               wc->pkey_index = 0;
+       }
 }
 
 static void dump_cqe(struct mlx5_ib_dev *dev, struct mlx5_err_cqe *cqe)
index 085c24b4b603600bed9497bde1fd3c5811715b6a..41d6911e244e1765a34b77fe067cf4e4ddeab172 100644 (file)
@@ -212,6 +212,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
        int err = -ENOMEM;
        int max_rq_sg;
        int max_sq_sg;
+       u64 min_page_size = 1ull << MLX5_CAP_GEN(mdev, log_pg_sz);
 
        if (uhw->inlen || uhw->outlen)
                return -EINVAL;
@@ -264,7 +265,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
        props->hw_ver              = mdev->pdev->revision;
 
        props->max_mr_size         = ~0ull;
-       props->page_size_cap       = 1ull << MLX5_CAP_GEN(mdev, log_pg_sz);
+       props->page_size_cap       = ~(min_page_size - 1);
        props->max_qp              = 1 << MLX5_CAP_GEN(mdev, log_max_qp);
        props->max_qp_wr           = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz);
        max_rq_sg =  MLX5_CAP_GEN(mdev, max_wqe_sz_rq) /
@@ -273,6 +274,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
                     sizeof(struct mlx5_wqe_ctrl_seg)) /
                     sizeof(struct mlx5_wqe_data_seg);
        props->max_sge = min(max_rq_sg, max_sq_sg);
+       props->max_sge_rd = props->max_sge;
        props->max_cq              = 1 << MLX5_CAP_GEN(mdev, log_max_cq);
        props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_eq_sz)) - 1;
        props->max_mr              = 1 << MLX5_CAP_GEN(mdev, log_max_mkey);
@@ -1121,7 +1123,6 @@ static void destroy_umrc_res(struct mlx5_ib_dev *dev)
 
        mlx5_ib_destroy_qp(dev->umrc.qp);
        ib_destroy_cq(dev->umrc.cq);
-       ib_dereg_mr(dev->umrc.mr);
        ib_dealloc_pd(dev->umrc.pd);
 }
 
@@ -1136,7 +1137,6 @@ static int create_umr_res(struct mlx5_ib_dev *dev)
        struct ib_pd *pd;
        struct ib_cq *cq;
        struct ib_qp *qp;
-       struct ib_mr *mr;
        struct ib_cq_init_attr cq_attr = {};
        int ret;
 
@@ -1154,13 +1154,6 @@ static int create_umr_res(struct mlx5_ib_dev *dev)
                goto error_0;
        }
 
-       mr = ib_get_dma_mr(pd,  IB_ACCESS_LOCAL_WRITE);
-       if (IS_ERR(mr)) {
-               mlx5_ib_dbg(dev, "Couldn't create DMA MR for sync UMR QP\n");
-               ret = PTR_ERR(mr);
-               goto error_1;
-       }
-
        cq_attr.cqe = 128;
        cq = ib_create_cq(&dev->ib_dev, mlx5_umr_cq_handler, NULL, NULL,
                          &cq_attr);
@@ -1218,7 +1211,6 @@ static int create_umr_res(struct mlx5_ib_dev *dev)
 
        dev->umrc.qp = qp;
        dev->umrc.cq = cq;
-       dev->umrc.mr = mr;
        dev->umrc.pd = pd;
 
        sema_init(&dev->umrc.sem, MAX_UMR_WR);
@@ -1240,9 +1232,6 @@ error_3:
        ib_destroy_cq(cq);
 
 error_2:
-       ib_dereg_mr(mr);
-
-error_1:
        ib_dealloc_pd(pd);
 
 error_0:
@@ -1256,10 +1245,18 @@ static int create_dev_resources(struct mlx5_ib_resources *devr)
        struct ib_srq_init_attr attr;
        struct mlx5_ib_dev *dev;
        struct ib_cq_init_attr cq_attr = {.cqe = 1};
+       u32 rsvd_lkey;
        int ret = 0;
 
        dev = container_of(devr, struct mlx5_ib_dev, devr);
 
+       ret = mlx5_core_query_special_context(dev->mdev, &rsvd_lkey);
+       if (ret) {
+               pr_err("Failed to query special context %d\n", ret);
+               return ret;
+       }
+       dev->ib_dev.local_dma_lkey = rsvd_lkey;
+
        devr->p0 = mlx5_ib_alloc_pd(&dev->ib_dev, NULL, NULL);
        if (IS_ERR(devr->p0)) {
                ret = PTR_ERR(devr->p0);
@@ -1421,7 +1418,6 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
        strlcpy(dev->ib_dev.name, "mlx5_%d", IB_DEVICE_NAME_MAX);
        dev->ib_dev.owner               = THIS_MODULE;
        dev->ib_dev.node_type           = RDMA_NODE_IB_CA;
-       dev->ib_dev.local_dma_lkey      = 0 /* not supported for now */;
        dev->num_ports          = MLX5_CAP_GEN(mdev, num_ports);
        dev->ib_dev.phys_port_cnt     = dev->num_ports;
        dev->ib_dev.num_comp_vectors    =
@@ -1490,12 +1486,10 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
        dev->ib_dev.get_dma_mr          = mlx5_ib_get_dma_mr;
        dev->ib_dev.reg_user_mr         = mlx5_ib_reg_user_mr;
        dev->ib_dev.dereg_mr            = mlx5_ib_dereg_mr;
-       dev->ib_dev.destroy_mr          = mlx5_ib_destroy_mr;
        dev->ib_dev.attach_mcast        = mlx5_ib_mcg_attach;
        dev->ib_dev.detach_mcast        = mlx5_ib_mcg_detach;
        dev->ib_dev.process_mad         = mlx5_ib_process_mad;
-       dev->ib_dev.create_mr           = mlx5_ib_create_mr;
-       dev->ib_dev.alloc_fast_reg_mr   = mlx5_ib_alloc_fast_reg_mr;
+       dev->ib_dev.alloc_mr            = mlx5_ib_alloc_mr;
        dev->ib_dev.alloc_fast_reg_page_list = mlx5_ib_alloc_fast_reg_page_list;
        dev->ib_dev.free_fast_reg_page_list  = mlx5_ib_free_fast_reg_page_list;
        dev->ib_dev.check_mr_status     = mlx5_ib_check_mr_status;
index 7cae098364812242d549fc62d5585432051c3328..bb8cda79e8812cf1122feaa70a3f113958858d77 100644 (file)
@@ -349,7 +349,6 @@ struct umr_common {
        struct ib_pd    *pd;
        struct ib_cq    *cq;
        struct ib_qp    *qp;
-       struct ib_mr    *mr;
        /* control access to UMR QP
         */
        struct semaphore        sem;
@@ -573,11 +572,9 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index,
                       int npages, int zap);
 int mlx5_ib_dereg_mr(struct ib_mr *ibmr);
-int mlx5_ib_destroy_mr(struct ib_mr *ibmr);
-struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd,
-                               struct ib_mr_init_attr *mr_init_attr);
-struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd *pd,
-                                       int max_page_list_len);
+struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd,
+                              enum ib_mr_type mr_type,
+                              u32 max_num_sg);
 struct ib_fast_reg_page_list *mlx5_ib_alloc_fast_reg_page_list(struct ib_device *ibdev,
                                                               int page_list_len);
 void mlx5_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list);
@@ -683,6 +680,11 @@ static inline u8 convert_access(int acc)
               MLX5_PERM_LOCAL_READ;
 }
 
+static inline int is_qp1(enum ib_qp_type qp_type)
+{
+       return qp_type == IB_QPT_GSI;
+}
+
 #define MLX5_MAX_UMR_SHIFT 16
 #define MLX5_MAX_UMR_PAGES (1 << MLX5_MAX_UMR_SHIFT)
 
index bc9a0de897cb466d62d69dde0330ca96581953ca..54a15b5d336d00043643c09a99f05b89df2861bd 100644 (file)
@@ -441,9 +441,6 @@ static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order)
                spin_unlock_irq(&ent->lock);
 
                queue_work(cache->wq, &ent->work);
-
-               if (mr)
-                       break;
        }
 
        if (!mr)
@@ -690,12 +687,11 @@ static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr,
                             int access_flags)
 {
        struct mlx5_ib_dev *dev = to_mdev(pd->device);
-       struct ib_mr *mr = dev->umrc.mr;
        struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr->wr.fast_reg;
 
        sg->addr = dma;
        sg->length = ALIGN(sizeof(u64) * n, 64);
-       sg->lkey = mr->lkey;
+       sg->lkey = dev->umrc.pd->local_dma_lkey;
 
        wr->next = NULL;
        wr->send_flags = 0;
@@ -926,7 +922,7 @@ int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages,
                sg.addr = dma;
                sg.length = ALIGN(npages * sizeof(u64),
                                MLX5_UMR_MTT_ALIGNMENT);
-               sg.lkey = dev->umrc.mr->lkey;
+               sg.lkey = dev->umrc.pd->local_dma_lkey;
 
                wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE |
                                MLX5_IB_SEND_UMR_UPDATE_MTT;
@@ -1118,19 +1114,7 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
        return &mr->ibmr;
 
 error:
-       /*
-        * Destroy the umem *before* destroying the MR, to ensure we
-        * will not have any in-flight notifiers when destroying the
-        * MR.
-        *
-        * As the MR is completely invalid to begin with, and this
-        * error path is only taken if we can't push the mr entry into
-        * the pagefault tree, this is safe.
-        */
-
        ib_umem_release(umem);
-       /* Kill the MR, and return an error code. */
-       clean_mr(mr);
        return ERR_PTR(err);
 }
 
@@ -1173,6 +1157,19 @@ static int clean_mr(struct mlx5_ib_mr *mr)
        int umred = mr->umred;
        int err;
 
+       if (mr->sig) {
+               if (mlx5_core_destroy_psv(dev->mdev,
+                                         mr->sig->psv_memory.psv_idx))
+                       mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
+                                    mr->sig->psv_memory.psv_idx);
+               if (mlx5_core_destroy_psv(dev->mdev,
+                                         mr->sig->psv_wire.psv_idx))
+                       mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
+                                    mr->sig->psv_wire.psv_idx);
+               kfree(mr->sig);
+               mr->sig = NULL;
+       }
+
        if (!umred) {
                err = destroy_mkey(dev, mr);
                if (err) {
@@ -1234,14 +1231,15 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
        return 0;
 }
 
-struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd,
-                               struct ib_mr_init_attr *mr_init_attr)
+struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd,
+                              enum ib_mr_type mr_type,
+                              u32 max_num_sg)
 {
        struct mlx5_ib_dev *dev = to_mdev(pd->device);
        struct mlx5_create_mkey_mbox_in *in;
        struct mlx5_ib_mr *mr;
        int access_mode, err;
-       int ndescs = roundup(mr_init_attr->max_reg_descriptors, 4);
+       int ndescs = roundup(max_num_sg, 4);
 
        mr = kzalloc(sizeof(*mr), GFP_KERNEL);
        if (!mr)
@@ -1257,9 +1255,11 @@ struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd,
        in->seg.xlt_oct_size = cpu_to_be32(ndescs);
        in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
        in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn);
-       access_mode = MLX5_ACCESS_MODE_MTT;
 
-       if (mr_init_attr->flags & IB_MR_SIGNATURE_EN) {
+       if (mr_type == IB_MR_TYPE_MEM_REG) {
+               access_mode = MLX5_ACCESS_MODE_MTT;
+               in->seg.log2_page_size = PAGE_SHIFT;
+       } else if (mr_type == IB_MR_TYPE_SIGNATURE) {
                u32 psv_index[2];
 
                in->seg.flags_pd = cpu_to_be32(be32_to_cpu(in->seg.flags_pd) |
@@ -1285,6 +1285,10 @@ struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd,
                mr->sig->sig_err_exists = false;
                /* Next UMR, Arm SIGERR */
                ++mr->sig->sigerr_count;
+       } else {
+               mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type);
+               err = -EINVAL;
+               goto err_free_in;
        }
 
        in->seg.flags = MLX5_PERM_UMR_EN | access_mode;
@@ -1320,80 +1324,6 @@ err_free:
        return ERR_PTR(err);
 }
 
-int mlx5_ib_destroy_mr(struct ib_mr *ibmr)
-{
-       struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
-       struct mlx5_ib_mr *mr = to_mmr(ibmr);
-       int err;
-
-       if (mr->sig) {
-               if (mlx5_core_destroy_psv(dev->mdev,
-                                         mr->sig->psv_memory.psv_idx))
-                       mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
-                                    mr->sig->psv_memory.psv_idx);
-               if (mlx5_core_destroy_psv(dev->mdev,
-                                         mr->sig->psv_wire.psv_idx))
-                       mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
-                                    mr->sig->psv_wire.psv_idx);
-               kfree(mr->sig);
-       }
-
-       err = destroy_mkey(dev, mr);
-       if (err) {
-               mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n",
-                            mr->mmr.key, err);
-               return err;
-       }
-
-       kfree(mr);
-
-       return err;
-}
-
-struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd *pd,
-                                       int max_page_list_len)
-{
-       struct mlx5_ib_dev *dev = to_mdev(pd->device);
-       struct mlx5_create_mkey_mbox_in *in;
-       struct mlx5_ib_mr *mr;
-       int err;
-
-       mr = kzalloc(sizeof(*mr), GFP_KERNEL);
-       if (!mr)
-               return ERR_PTR(-ENOMEM);
-
-       in = kzalloc(sizeof(*in), GFP_KERNEL);
-       if (!in) {
-               err = -ENOMEM;
-               goto err_free;
-       }
-
-       in->seg.status = MLX5_MKEY_STATUS_FREE;
-       in->seg.xlt_oct_size = cpu_to_be32((max_page_list_len + 1) / 2);
-       in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
-       in->seg.flags = MLX5_PERM_UMR_EN | MLX5_ACCESS_MODE_MTT;
-       in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn);
-       /*
-        * TBD not needed - issue 197292 */
-       in->seg.log2_page_size = PAGE_SHIFT;
-
-       err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, sizeof(*in), NULL,
-                                   NULL, NULL);
-       kfree(in);
-       if (err)
-               goto err_free;
-
-       mr->ibmr.lkey = mr->mmr.key;
-       mr->ibmr.rkey = mr->mmr.key;
-       mr->umem = NULL;
-
-       return &mr->ibmr;
-
-err_free:
-       kfree(mr);
-       return ERR_PTR(err);
-}
-
 struct ib_fast_reg_page_list *mlx5_ib_alloc_fast_reg_page_list(struct ib_device *ibdev,
                                                               int page_list_len)
 {
index 203c8a45e095560b146859e464eb0a33933c474a..c745c6c5e10da0b296fd19ef6ee01d7650af44ff 100644 (file)
@@ -76,11 +76,6 @@ static int is_qp0(enum ib_qp_type qp_type)
        return qp_type == IB_QPT_SMI;
 }
 
-static int is_qp1(enum ib_qp_type qp_type)
-{
-       return qp_type == IB_QPT_GSI;
-}
-
 static int is_sqp(enum ib_qp_type qp_type)
 {
        return is_qp0(qp_type) || is_qp1(qp_type);
index 93ae51dcf2ffaefb715363e2706855e66f1ce41b..dc2d48c59e6274c54e84af0d26b4da1b56271b13 100644 (file)
@@ -97,6 +97,7 @@ static int mthca_query_device(struct ib_device *ibdev, struct ib_device_attr *pr
        props->max_qp              = mdev->limits.num_qps - mdev->limits.reserved_qps;
        props->max_qp_wr           = mdev->limits.max_wqes;
        props->max_sge             = mdev->limits.max_sg;
+       props->max_sge_rd          = props->max_sge;
        props->max_cq              = mdev->limits.num_cqs - mdev->limits.reserved_cqs;
        props->max_cqe             = mdev->limits.max_cqes;
        props->max_mr              = mdev->limits.num_mpts - mdev->limits.reserved_mrws;
index fbc43e5f717b024b4c50832e7aab8a229554fc82..44cb513f9a87c0597704422393802cebf70f0f45 100644 (file)
@@ -375,9 +375,11 @@ static int alloc_fast_reg_mr(struct nes_device *nesdev, struct nes_pd *nespd,
 }
 
 /*
- * nes_alloc_fast_reg_mr
+ * nes_alloc_mr
  */
-static struct ib_mr *nes_alloc_fast_reg_mr(struct ib_pd *ibpd, int max_page_list_len)
+static struct ib_mr *nes_alloc_mr(struct ib_pd *ibpd,
+                                 enum ib_mr_type mr_type,
+                                 u32 max_num_sg)
 {
        struct nes_pd *nespd = to_nespd(ibpd);
        struct nes_vnic *nesvnic = to_nesvnic(ibpd->device);
@@ -393,11 +395,18 @@ static struct ib_mr *nes_alloc_fast_reg_mr(struct ib_pd *ibpd, int max_page_list
        u32 stag;
        int ret;
        struct ib_mr *ibmr;
+
+       if (mr_type != IB_MR_TYPE_MEM_REG)
+               return ERR_PTR(-EINVAL);
+
+       if (max_num_sg > (NES_4K_PBL_CHUNK_SIZE / sizeof(u64)))
+               return ERR_PTR(-E2BIG);
+
 /*
  * Note:  Set to always use a fixed length single page entry PBL.  This is to allow
  *      for the fast_reg_mr operation to always know the size of the PBL.
  */
-       if (max_page_list_len > (NES_4K_PBL_CHUNK_SIZE / sizeof(u64)))
+       if (max_num_sg > (NES_4K_PBL_CHUNK_SIZE / sizeof(u64)))
                return ERR_PTR(-E2BIG);
 
        get_random_bytes(&next_stag_index, sizeof(next_stag_index));
@@ -424,7 +433,7 @@ static struct ib_mr *nes_alloc_fast_reg_mr(struct ib_pd *ibpd, int max_page_list
        nes_debug(NES_DBG_MR, "Allocating STag 0x%08X index = 0x%08X\n",
                  stag, stag_index);
 
-       ret = alloc_fast_reg_mr(nesdev, nespd, stag, max_page_list_len);
+       ret = alloc_fast_reg_mr(nesdev, nespd, stag, max_num_sg);
 
        if (ret == 0) {
                nesmr->ibmr.rkey = stag;
@@ -3929,7 +3938,7 @@ struct nes_ib_device *nes_init_ofa_device(struct net_device *netdev)
        nesibdev->ibdev.dealloc_mw = nes_dealloc_mw;
        nesibdev->ibdev.bind_mw = nes_bind_mw;
 
-       nesibdev->ibdev.alloc_fast_reg_mr = nes_alloc_fast_reg_mr;
+       nesibdev->ibdev.alloc_mr = nes_alloc_mr;
        nesibdev->ibdev.alloc_fast_reg_page_list = nes_alloc_fast_reg_page_list;
        nesibdev->ibdev.free_fast_reg_page_list = nes_free_fast_reg_page_list;
 
index 6a36338593cd0a1c09b1ad67c1d7d57f856b0e93..b4091ab48db0bc86d8edf42cb76661c5d6b44b52 100644 (file)
@@ -246,7 +246,6 @@ struct ocrdma_dev {
        u16 base_eqid;
        u16 max_eq;
 
-       union ib_gid *sgid_tbl;
        /* provided synchronization to sgid table for
         * updating gid entries triggered by notifier.
         */
index b119a3413a155574ae1eb1bdd020bb7d77e42822..87aa55df7c8211f6f8fbc441d0133b86878387ec 100644 (file)
@@ -67,8 +67,6 @@ static LIST_HEAD(ocrdma_dev_list);
 static DEFINE_SPINLOCK(ocrdma_devlist_lock);
 static DEFINE_IDR(ocrdma_dev_id);
 
-static union ib_gid ocrdma_zero_sgid;
-
 void ocrdma_get_guid(struct ocrdma_dev *dev, u8 *guid)
 {
        u8 mac_addr[6];
@@ -83,135 +81,6 @@ void ocrdma_get_guid(struct ocrdma_dev *dev, u8 *guid)
        guid[6] = mac_addr[4];
        guid[7] = mac_addr[5];
 }
-
-static bool ocrdma_add_sgid(struct ocrdma_dev *dev, union ib_gid *new_sgid)
-{
-       int i;
-       unsigned long flags;
-
-       memset(&ocrdma_zero_sgid, 0, sizeof(union ib_gid));
-
-
-       spin_lock_irqsave(&dev->sgid_lock, flags);
-       for (i = 0; i < OCRDMA_MAX_SGID; i++) {
-               if (!memcmp(&dev->sgid_tbl[i], &ocrdma_zero_sgid,
-                           sizeof(union ib_gid))) {
-                       /* found free entry */
-                       memcpy(&dev->sgid_tbl[i], new_sgid,
-                              sizeof(union ib_gid));
-                       spin_unlock_irqrestore(&dev->sgid_lock, flags);
-                       return true;
-               } else if (!memcmp(&dev->sgid_tbl[i], new_sgid,
-                                  sizeof(union ib_gid))) {
-                       /* entry already present, no addition is required. */
-                       spin_unlock_irqrestore(&dev->sgid_lock, flags);
-                       return false;
-               }
-       }
-       spin_unlock_irqrestore(&dev->sgid_lock, flags);
-       return false;
-}
-
-static bool ocrdma_del_sgid(struct ocrdma_dev *dev, union ib_gid *sgid)
-{
-       int found = false;
-       int i;
-       unsigned long flags;
-
-
-       spin_lock_irqsave(&dev->sgid_lock, flags);
-       /* first is default sgid, which cannot be deleted. */
-       for (i = 1; i < OCRDMA_MAX_SGID; i++) {
-               if (!memcmp(&dev->sgid_tbl[i], sgid, sizeof(union ib_gid))) {
-                       /* found matching entry */
-                       memset(&dev->sgid_tbl[i], 0, sizeof(union ib_gid));
-                       found = true;
-                       break;
-               }
-       }
-       spin_unlock_irqrestore(&dev->sgid_lock, flags);
-       return found;
-}
-
-static int ocrdma_addr_event(unsigned long event, struct net_device *netdev,
-                            union ib_gid *gid)
-{
-       struct ib_event gid_event;
-       struct ocrdma_dev *dev;
-       bool found = false;
-       bool updated = false;
-       bool is_vlan = false;
-
-       is_vlan = netdev->priv_flags & IFF_802_1Q_VLAN;
-       if (is_vlan)
-               netdev = rdma_vlan_dev_real_dev(netdev);
-
-       rcu_read_lock();
-       list_for_each_entry_rcu(dev, &ocrdma_dev_list, entry) {
-               if (dev->nic_info.netdev == netdev) {
-                       found = true;
-                       break;
-               }
-       }
-       rcu_read_unlock();
-
-       if (!found)
-               return NOTIFY_DONE;
-
-       mutex_lock(&dev->dev_lock);
-       switch (event) {
-       case NETDEV_UP:
-               updated = ocrdma_add_sgid(dev, gid);
-               break;
-       case NETDEV_DOWN:
-               updated = ocrdma_del_sgid(dev, gid);
-               break;
-       default:
-               break;
-       }
-       if (updated) {
-               /* GID table updated, notify the consumers about it */
-               gid_event.device = &dev->ibdev;
-               gid_event.element.port_num = 1;
-               gid_event.event = IB_EVENT_GID_CHANGE;
-               ib_dispatch_event(&gid_event);
-       }
-       mutex_unlock(&dev->dev_lock);
-       return NOTIFY_OK;
-}
-
-static int ocrdma_inetaddr_event(struct notifier_block *notifier,
-                                 unsigned long event, void *ptr)
-{
-       struct in_ifaddr *ifa = ptr;
-       union ib_gid gid;
-       struct net_device *netdev = ifa->ifa_dev->dev;
-
-       ipv6_addr_set_v4mapped(ifa->ifa_address, (struct in6_addr *)&gid);
-       return ocrdma_addr_event(event, netdev, &gid);
-}
-
-static struct notifier_block ocrdma_inetaddr_notifier = {
-       .notifier_call = ocrdma_inetaddr_event
-};
-
-#if IS_ENABLED(CONFIG_IPV6)
-
-static int ocrdma_inet6addr_event(struct notifier_block *notifier,
-                                 unsigned long event, void *ptr)
-{
-       struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr;
-       union  ib_gid *gid = (union ib_gid *)&ifa->addr;
-       struct net_device *netdev = ifa->idev->dev;
-       return ocrdma_addr_event(event, netdev, gid);
-}
-
-static struct notifier_block ocrdma_inet6addr_notifier = {
-       .notifier_call = ocrdma_inet6addr_event
-};
-
-#endif /* IPV6 and VLAN */
-
 static enum rdma_link_layer ocrdma_link_layer(struct ib_device *device,
                                              u8 port_num)
 {
@@ -280,6 +149,9 @@ static int ocrdma_register_device(struct ocrdma_dev *dev)
        dev->ibdev.query_port = ocrdma_query_port;
        dev->ibdev.modify_port = ocrdma_modify_port;
        dev->ibdev.query_gid = ocrdma_query_gid;
+       dev->ibdev.get_netdev = ocrdma_get_netdev;
+       dev->ibdev.add_gid = ocrdma_add_gid;
+       dev->ibdev.del_gid = ocrdma_del_gid;
        dev->ibdev.get_link_layer = ocrdma_link_layer;
        dev->ibdev.alloc_pd = ocrdma_alloc_pd;
        dev->ibdev.dealloc_pd = ocrdma_dealloc_pd;
@@ -309,7 +181,7 @@ static int ocrdma_register_device(struct ocrdma_dev *dev)
        dev->ibdev.dereg_mr = ocrdma_dereg_mr;
        dev->ibdev.reg_user_mr = ocrdma_reg_user_mr;
 
-       dev->ibdev.alloc_fast_reg_mr = ocrdma_alloc_frmr;
+       dev->ibdev.alloc_mr = ocrdma_alloc_mr;
        dev->ibdev.alloc_fast_reg_page_list = ocrdma_alloc_frmr_page_list;
        dev->ibdev.free_fast_reg_page_list = ocrdma_free_frmr_page_list;
 
@@ -342,12 +214,6 @@ static int ocrdma_register_device(struct ocrdma_dev *dev)
 static int ocrdma_alloc_resources(struct ocrdma_dev *dev)
 {
        mutex_init(&dev->dev_lock);
-       dev->sgid_tbl = kzalloc(sizeof(union ib_gid) *
-                               OCRDMA_MAX_SGID, GFP_KERNEL);
-       if (!dev->sgid_tbl)
-               goto alloc_err;
-       spin_lock_init(&dev->sgid_lock);
-
        dev->cq_tbl = kzalloc(sizeof(struct ocrdma_cq *) *
                              OCRDMA_MAX_CQ, GFP_KERNEL);
        if (!dev->cq_tbl)
@@ -379,7 +245,6 @@ static void ocrdma_free_resources(struct ocrdma_dev *dev)
        kfree(dev->stag_arr);
        kfree(dev->qp_tbl);
        kfree(dev->cq_tbl);
-       kfree(dev->sgid_tbl);
 }
 
 /* OCRDMA sysfs interface */
@@ -425,68 +290,6 @@ static void ocrdma_remove_sysfiles(struct ocrdma_dev *dev)
                device_remove_file(&dev->ibdev.dev, ocrdma_attributes[i]);
 }
 
-static void ocrdma_add_default_sgid(struct ocrdma_dev *dev)
-{
-       /* GID Index 0 - Invariant manufacturer-assigned EUI-64 */
-       union ib_gid *sgid = &dev->sgid_tbl[0];
-
-       sgid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL);
-       ocrdma_get_guid(dev, &sgid->raw[8]);
-}
-
-static void ocrdma_init_ipv4_gids(struct ocrdma_dev *dev,
-                                 struct net_device *net)
-{
-       struct in_device *in_dev;
-       union ib_gid gid;
-       in_dev = in_dev_get(net);
-       if (in_dev) {
-               for_ifa(in_dev) {
-                       ipv6_addr_set_v4mapped(ifa->ifa_address,
-                                              (struct in6_addr *)&gid);
-                       ocrdma_add_sgid(dev, &gid);
-               }
-               endfor_ifa(in_dev);
-               in_dev_put(in_dev);
-       }
-}
-
-static void ocrdma_init_ipv6_gids(struct ocrdma_dev *dev,
-                                 struct net_device *net)
-{
-#if IS_ENABLED(CONFIG_IPV6)
-       struct inet6_dev *in6_dev;
-       union ib_gid  *pgid;
-       struct inet6_ifaddr *ifp;
-       in6_dev = in6_dev_get(net);
-       if (in6_dev) {
-               read_lock_bh(&in6_dev->lock);
-               list_for_each_entry(ifp, &in6_dev->addr_list, if_list) {
-                       pgid = (union ib_gid *)&ifp->addr;
-                       ocrdma_add_sgid(dev, pgid);
-               }
-               read_unlock_bh(&in6_dev->lock);
-               in6_dev_put(in6_dev);
-       }
-#endif
-}
-
-static void ocrdma_init_gid_table(struct ocrdma_dev *dev)
-{
-       struct  net_device *net_dev;
-
-       for_each_netdev(&init_net, net_dev) {
-               struct net_device *real_dev = rdma_vlan_dev_real_dev(net_dev) ?
-                               rdma_vlan_dev_real_dev(net_dev) : net_dev;
-
-               if (real_dev == dev->nic_info.netdev) {
-                       ocrdma_add_default_sgid(dev);
-                       ocrdma_init_ipv4_gids(dev, net_dev);
-                       ocrdma_init_ipv6_gids(dev, net_dev);
-               }
-       }
-}
-
 static struct ocrdma_dev *ocrdma_add(struct be_dev_info *dev_info)
 {
        int status = 0, i;
@@ -515,7 +318,6 @@ static struct ocrdma_dev *ocrdma_add(struct be_dev_info *dev_info)
                goto alloc_err;
 
        ocrdma_init_service_level(dev);
-       ocrdma_init_gid_table(dev);
        status = ocrdma_register_device(dev);
        if (status)
                goto alloc_err;
@@ -662,34 +464,12 @@ static struct ocrdma_driver ocrdma_drv = {
        .be_abi_version         = OCRDMA_BE_ROCE_ABI_VERSION,
 };
 
-static void ocrdma_unregister_inet6addr_notifier(void)
-{
-#if IS_ENABLED(CONFIG_IPV6)
-       unregister_inet6addr_notifier(&ocrdma_inet6addr_notifier);
-#endif
-}
-
-static void ocrdma_unregister_inetaddr_notifier(void)
-{
-       unregister_inetaddr_notifier(&ocrdma_inetaddr_notifier);
-}
-
 static int __init ocrdma_init_module(void)
 {
        int status;
 
        ocrdma_init_debugfs();
 
-       status = register_inetaddr_notifier(&ocrdma_inetaddr_notifier);
-       if (status)
-               return status;
-
-#if IS_ENABLED(CONFIG_IPV6)
-       status = register_inet6addr_notifier(&ocrdma_inet6addr_notifier);
-       if (status)
-               goto err_notifier6;
-#endif
-
        status = be_roce_register_driver(&ocrdma_drv);
        if (status)
                goto err_be_reg;
@@ -697,19 +477,13 @@ static int __init ocrdma_init_module(void)
        return 0;
 
 err_be_reg:
-#if IS_ENABLED(CONFIG_IPV6)
-       ocrdma_unregister_inet6addr_notifier();
-err_notifier6:
-#endif
-       ocrdma_unregister_inetaddr_notifier();
+
        return status;
 }
 
 static void __exit ocrdma_exit_module(void)
 {
        be_roce_unregister_driver(&ocrdma_drv);
-       ocrdma_unregister_inet6addr_notifier();
-       ocrdma_unregister_inetaddr_notifier();
        ocrdma_rem_debugfs();
        idr_destroy(&ocrdma_dev_id);
 }
index 80006b24aa118e752f444383fc9f4f3c3bafb191..6a38268bbe9fb6b981e27f2ec42da8fd10adbfc2 100644 (file)
@@ -140,6 +140,8 @@ enum {
        OCRDMA_DB_RQ_SHIFT              = 24
 };
 
+#define OCRDMA_ROUDP_FLAGS_SHIFT       0x03
+
 #define OCRDMA_DB_CQ_RING_ID_MASK       0x3FF  /* bits 0 - 9 */
 #define OCRDMA_DB_CQ_RING_ID_EXT_MASK  0x0C00  /* bits 10-11 of qid at 12-11 */
 /* qid #2 msbits at 12-11 */
index bc84cd462ecf3208e8576ad1ba1084578ddb7c29..1f3affb6a477156dec43694d80b70b3b8339c27d 100644 (file)
@@ -46,6 +46,7 @@
 #include <rdma/iw_cm.h>
 #include <rdma/ib_umem.h>
 #include <rdma/ib_addr.h>
+#include <rdma/ib_cache.h>
 
 #include "ocrdma.h"
 #include "ocrdma_hw.h"
@@ -64,6 +65,7 @@ int ocrdma_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey)
 int ocrdma_query_gid(struct ib_device *ibdev, u8 port,
                     int index, union ib_gid *sgid)
 {
+       int ret;
        struct ocrdma_dev *dev;
 
        dev = get_ocrdma_dev(ibdev);
@@ -71,8 +73,28 @@ int ocrdma_query_gid(struct ib_device *ibdev, u8 port,
        if (index >= OCRDMA_MAX_SGID)
                return -EINVAL;
 
-       memcpy(sgid, &dev->sgid_tbl[index], sizeof(*sgid));
+       ret = ib_get_cached_gid(ibdev, port, index, sgid);
+       if (ret == -EAGAIN) {
+               memcpy(sgid, &zgid, sizeof(*sgid));
+               return 0;
+       }
+
+       return ret;
+}
+
+int ocrdma_add_gid(struct ib_device *device,
+                  u8 port_num,
+                  unsigned int index,
+                  const union ib_gid *gid,
+                  const struct ib_gid_attr *attr,
+                  void **context) {
+       return  0;
+}
 
+int  ocrdma_del_gid(struct ib_device *device,
+                   u8 port_num,
+                   unsigned int index,
+                   void **context) {
        return 0;
 }
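With the driver-private sgid_tbl removed, ocrdma_query_gid() above answers from the core GID cache and maps -EAGAIN (a not-yet-valid cache slot) to the zero GID, while the new add_gid/del_gid hooks are deliberate no-ops because this hardware needs no per-entry programming. A minimal sketch of the same pattern for a hypothetical RoCE driver follows; foo_query_gid is an illustrative name, not part of this series:

#include <rdma/ib_verbs.h>
#include <rdma/ib_cache.h>

/* Sketch only: answer query_gid straight from the core GID cache,
 * treating a not-yet-valid slot (-EAGAIN) as the zero GID. */
static int foo_query_gid(struct ib_device *ibdev, u8 port,
                         int index, union ib_gid *sgid)
{
        int ret = ib_get_cached_gid(ibdev, port, index, sgid);

        if (ret == -EAGAIN) {
                memcpy(sgid, &zgid, sizeof(*sgid));
                return 0;
        }
        return ret;
}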
 
@@ -125,6 +147,24 @@ int ocrdma_query_device(struct ib_device *ibdev, struct ib_device_attr *attr,
        return 0;
 }
 
+struct net_device *ocrdma_get_netdev(struct ib_device *ibdev, u8 port_num)
+{
+       struct ocrdma_dev *dev;
+       struct net_device *ndev = NULL;
+
+       rcu_read_lock();
+
+       dev = get_ocrdma_dev(ibdev);
+       if (dev)
+               ndev = dev->nic_info.netdev;
+       if (ndev)
+               dev_hold(ndev);
+
+       rcu_read_unlock();
+
+       return ndev;
+}
+
 static inline void get_link_speed_and_width(struct ocrdma_dev *dev,
                                            u8 *ib_speed, u8 *ib_width)
 {
@@ -194,7 +234,8 @@ int ocrdma_query_port(struct ib_device *ibdev,
        props->port_cap_flags =
            IB_PORT_CM_SUP |
            IB_PORT_REINIT_SUP |
-           IB_PORT_DEVICE_MGMT_SUP | IB_PORT_VENDOR_CLASS_SUP | IB_PORT_IP_BASED_GIDS;
+           IB_PORT_DEVICE_MGMT_SUP | IB_PORT_VENDOR_CLASS_SUP |
+           IB_PORT_IP_BASED_GIDS;
        props->gid_tbl_len = OCRDMA_MAX_SGID;
        props->pkey_tbl_len = 1;
        props->bad_pkey_cntr = 0;
@@ -2998,21 +3039,26 @@ int ocrdma_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags cq_flags)
        return 0;
 }
 
-struct ib_mr *ocrdma_alloc_frmr(struct ib_pd *ibpd, int max_page_list_len)
+struct ib_mr *ocrdma_alloc_mr(struct ib_pd *ibpd,
+                             enum ib_mr_type mr_type,
+                             u32 max_num_sg)
 {
        int status;
        struct ocrdma_mr *mr;
        struct ocrdma_pd *pd = get_ocrdma_pd(ibpd);
        struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device);
 
-       if (max_page_list_len > dev->attr.max_pages_per_frmr)
+       if (mr_type != IB_MR_TYPE_MEM_REG)
+               return ERR_PTR(-EINVAL);
+
+       if (max_num_sg > dev->attr.max_pages_per_frmr)
                return ERR_PTR(-EINVAL);
 
        mr = kzalloc(sizeof(*mr), GFP_KERNEL);
        if (!mr)
                return ERR_PTR(-ENOMEM);
 
-       status = ocrdma_get_pbl_info(dev, mr, max_page_list_len);
+       status = ocrdma_get_pbl_info(dev, mr, max_num_sg);
        if (status)
                goto pbl_err;
        mr->hwmr.fr_mr = 1;
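The hunk above moves ocrdma from the driver-specific alloc_frmr() entry point to the generic alloc_mr(pd, mr_type, max_num_sg) verb and rejects anything other than IB_MR_TYPE_MEM_REG. A hedged sketch of how a consumer allocates a fast-registration MR through the converted verb (error handling trimmed, identifiers illustrative):

/* Sketch: allocate and later release an MR via the generic verb that
 * replaces the per-driver alloc_fast_reg_mr()/alloc_frmr() hooks. */
struct ib_mr *mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, max_pages);

if (IS_ERR(mr))
        return PTR_ERR(mr);     /* e.g. -EINVAL for an unsupported mr_type */
/* ... build a fast-registration work request using mr ... */
ib_dereg_mr(mr);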
index eaccb2d3cb9ff52f51fe0bbba3334053d519d047..308c16857a5d03e2d3605136abdf8d3b11ad5361 100644 (file)
@@ -63,6 +63,17 @@ ocrdma_query_protocol(struct ib_device *device, u8 port_num);
 void ocrdma_get_guid(struct ocrdma_dev *, u8 *guid);
 int ocrdma_query_gid(struct ib_device *, u8 port,
                     int index, union ib_gid *gid);
+struct net_device *ocrdma_get_netdev(struct ib_device *device, u8 port_num);
+int ocrdma_add_gid(struct ib_device *device,
+                  u8 port_num,
+                  unsigned int index,
+                  const union ib_gid *gid,
+                  const struct ib_gid_attr *attr,
+                  void **context);
+int  ocrdma_del_gid(struct ib_device *device,
+                   u8 port_num,
+                   unsigned int index,
+                   void **context);
 int ocrdma_query_pkey(struct ib_device *, u8 port, u16 index, u16 *pkey);
 
 struct ib_ucontext *ocrdma_alloc_ucontext(struct ib_device *,
@@ -111,7 +122,9 @@ struct ib_mr *ocrdma_reg_kernel_mr(struct ib_pd *,
                                   int num_phys_buf, int acc, u64 *iova_start);
 struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *, u64 start, u64 length,
                                 u64 virt, int acc, struct ib_udata *);
-struct ib_mr *ocrdma_alloc_frmr(struct ib_pd *pd, int max_page_list_len);
+struct ib_mr *ocrdma_alloc_mr(struct ib_pd *pd,
+                             enum ib_mr_type mr_type,
+                             u32 max_num_sg);
 struct ib_fast_reg_page_list *ocrdma_alloc_frmr_page_list(struct ib_device
                                                        *ibdev,
                                                        int page_list_len);
index 725881890c4a217247993f9fbb933ff11bb27e27..e449e394963f00d42cd11ecafbca6081f9011bcd 100644 (file)
@@ -908,7 +908,7 @@ static int qib_file_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        return 0;
 }
 
-static struct vm_operations_struct qib_file_vm_ops = {
+static const struct vm_operations_struct qib_file_vm_ops = {
        .fault = qib_file_vma_fault,
 };
 
index ad843c786e7212d0c89f90264bacb5cb6b8346a0..5afaa218508d222f901252194c5872a18a79be90 100644 (file)
@@ -86,6 +86,10 @@ int qib_alloc_lkey(struct qib_mregion *mr, int dma_region)
         * unrestricted LKEY.
         */
        rkt->gen++;
+       /*
+        * bits are capped in qib_verbs.c to ensure enough bits
+        * for the generation number
+        */
        mr->lkey = (r << (32 - ib_qib_lkey_table_size)) |
                ((((1 << (24 - ib_qib_lkey_table_size)) - 1) & rkt->gen)
                 << 8);
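The comment added above points at the clamp applied later in qib_verbs.c: the LKEY packs the table index into its top ib_qib_lkey_table_size bits and the generation counter into the bits directly above the low byte, so an oversized table squeezes the generation field. A worked sketch of the packing with an assumed table size of 16 bits:

/* Worked example of the packing above, assuming ib_qib_lkey_table_size == 16:
 *   bits 31..16  table index r            (r << (32 - 16))
 *   bits 15..8   generation, 8 bits wide  (((1 << (24 - 16)) - 1) & rkt->gen) << 8
 *   bits  7..0   left clear by this encoding
 * A larger table size narrows the generation field, which is why qib_verbs.c
 * clamps oversized ib_qib_lkey_table_size module parameters (see below). */
u32 lkey = (r << (32 - 16)) | ((((1u << (24 - 16)) - 1) & rkt->gen) << 8);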
index 941d4d50d8e74a76f333498c66ab9eb3da9b6246..57e99dc0d80c868c1eb8e678805ea0a65f31cd0a 100644 (file)
 
 #include <rdma/ib_pma.h>
 
-#define IB_SMP_UNSUP_VERSION    cpu_to_be16(0x0004)
-#define IB_SMP_UNSUP_METHOD     cpu_to_be16(0x0008)
-#define IB_SMP_UNSUP_METH_ATTR  cpu_to_be16(0x000C)
-#define IB_SMP_INVALID_FIELD    cpu_to_be16(0x001C)
+#define IB_SMP_UNSUP_VERSION \
+cpu_to_be16(IB_MGMT_MAD_STATUS_BAD_VERSION)
 
-struct ib_node_info {
-       u8 base_version;
-       u8 class_version;
-       u8 node_type;
-       u8 num_ports;
-       __be64 sys_guid;
-       __be64 node_guid;
-       __be64 port_guid;
-       __be16 partition_cap;
-       __be16 device_id;
-       __be32 revision;
-       u8 local_port_num;
-       u8 vendor_id[3];
-} __packed;
-
-struct ib_mad_notice_attr {
-       u8 generic_type;
-       u8 prod_type_msb;
-       __be16 prod_type_lsb;
-       __be16 trap_num;
-       __be16 issuer_lid;
-       __be16 toggle_count;
-
-       union {
-               struct {
-                       u8      details[54];
-               } raw_data;
-
-               struct {
-                       __be16  reserved;
-                       __be16  lid;            /* where violation happened */
-                       u8      port_num;       /* where violation happened */
-               } __packed ntc_129_131;
-
-               struct {
-                       __be16  reserved;
-                       __be16  lid;            /* LID where change occurred */
-                       u8      reserved2;
-                       u8      local_changes;  /* low bit - local changes */
-                       __be32  new_cap_mask;   /* new capability mask */
-                       u8      reserved3;
-                       u8      change_flags;   /* low 3 bits only */
-               } __packed ntc_144;
-
-               struct {
-                       __be16  reserved;
-                       __be16  lid;            /* lid where sys guid changed */
-                       __be16  reserved2;
-                       __be64  new_sys_guid;
-               } __packed ntc_145;
-
-               struct {
-                       __be16  reserved;
-                       __be16  lid;
-                       __be16  dr_slid;
-                       u8      method;
-                       u8      reserved2;
-                       __be16  attr_id;
-                       __be32  attr_mod;
-                       __be64  mkey;
-                       u8      reserved3;
-                       u8      dr_trunc_hop;
-                       u8      dr_rtn_path[30];
-               } __packed ntc_256;
-
-               struct {
-                       __be16          reserved;
-                       __be16          lid1;
-                       __be16          lid2;
-                       __be32          key;
-                       __be32          sl_qp1; /* SL: high 4 bits */
-                       __be32          qp2;    /* high 8 bits reserved */
-                       union ib_gid    gid1;
-                       union ib_gid    gid2;
-               } __packed ntc_257_258;
-
-       } details;
-};
-
-/*
- * Generic trap/notice types
- */
-#define IB_NOTICE_TYPE_FATAL   0x80
-#define IB_NOTICE_TYPE_URGENT  0x81
-#define IB_NOTICE_TYPE_SECURITY        0x82
-#define IB_NOTICE_TYPE_SM      0x83
-#define IB_NOTICE_TYPE_INFO    0x84
+#define IB_SMP_UNSUP_METHOD \
+cpu_to_be16(IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD)
 
-/*
- * Generic trap/notice producers
- */
-#define IB_NOTICE_PROD_CA              cpu_to_be16(1)
-#define IB_NOTICE_PROD_SWITCH          cpu_to_be16(2)
-#define IB_NOTICE_PROD_ROUTER          cpu_to_be16(3)
-#define IB_NOTICE_PROD_CLASS_MGR       cpu_to_be16(4)
+#define IB_SMP_UNSUP_METH_ATTR \
+cpu_to_be16(IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB)
 
-/*
- * Generic trap/notice numbers
- */
-#define IB_NOTICE_TRAP_LLI_THRESH      cpu_to_be16(129)
-#define IB_NOTICE_TRAP_EBO_THRESH      cpu_to_be16(130)
-#define IB_NOTICE_TRAP_FLOW_UPDATE     cpu_to_be16(131)
-#define IB_NOTICE_TRAP_CAP_MASK_CHG    cpu_to_be16(144)
-#define IB_NOTICE_TRAP_SYS_GUID_CHG    cpu_to_be16(145)
-#define IB_NOTICE_TRAP_BAD_MKEY                cpu_to_be16(256)
-#define IB_NOTICE_TRAP_BAD_PKEY                cpu_to_be16(257)
-#define IB_NOTICE_TRAP_BAD_QKEY                cpu_to_be16(258)
-
-/*
- * Repress trap/notice flags
- */
-#define IB_NOTICE_REPRESS_LLI_THRESH   (1 << 0)
-#define IB_NOTICE_REPRESS_EBO_THRESH   (1 << 1)
-#define IB_NOTICE_REPRESS_FLOW_UPDATE  (1 << 2)
-#define IB_NOTICE_REPRESS_CAP_MASK_CHG (1 << 3)
-#define IB_NOTICE_REPRESS_SYS_GUID_CHG (1 << 4)
-#define IB_NOTICE_REPRESS_BAD_MKEY     (1 << 5)
-#define IB_NOTICE_REPRESS_BAD_PKEY     (1 << 6)
-#define IB_NOTICE_REPRESS_BAD_QKEY     (1 << 7)
-
-/*
- * Generic trap/notice other local changes flags (trap 144).
- */
-#define IB_NOTICE_TRAP_LSE_CHG         0x04    /* Link Speed Enable changed */
-#define IB_NOTICE_TRAP_LWE_CHG         0x02    /* Link Width Enable changed */
-#define IB_NOTICE_TRAP_NODE_DESC_CHG   0x01
-
-/*
- * Generic trap/notice M_Key volation flags in dr_trunc_hop (trap 256).
- */
-#define IB_NOTICE_TRAP_DR_NOTICE       0x80
-#define IB_NOTICE_TRAP_DR_TRUNC                0x40
-
-struct ib_vl_weight_elem {
-       u8      vl;     /* Only low 4 bits, upper 4 bits reserved */
-       u8      weight;
-};
+#define IB_SMP_INVALID_FIELD \
+cpu_to_be16(IB_MGMT_MAD_STATUS_INVALID_ATTRIB_VALUE)
 
 #define IB_VLARB_LOWPRI_0_31    1
 #define IB_VLARB_LOWPRI_32_63   2
index 146cf29a2e1db19a8293f2ecbf3f8348ce1732c2..34927b700b0e67f5ebbe605aceefd205c9651660 100644 (file)
@@ -75,7 +75,7 @@ static void qib_vma_close(struct vm_area_struct *vma)
        kref_put(&ip->ref, qib_release_mmap_info);
 }
 
-static struct vm_operations_struct qib_vm_ops = {
+static const struct vm_operations_struct qib_vm_ops = {
        .open =     qib_vma_open,
        .close =    qib_vma_close,
 };
index c4473db46699b5f367a0fd1b9d9df92bb2723d50..19220dcb9a3b2a1ea00e4aed2d87eef09269a92d 100644 (file)
@@ -327,11 +327,16 @@ out:
  *
  * Return the memory region on success, otherwise return an errno.
  */
-struct ib_mr *qib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len)
+struct ib_mr *qib_alloc_mr(struct ib_pd *pd,
+                          enum ib_mr_type mr_type,
+                          u32 max_num_sg)
 {
        struct qib_mr *mr;
 
-       mr = alloc_mr(max_page_list_len, pd);
+       if (mr_type != IB_MR_TYPE_MEM_REG)
+               return ERR_PTR(-EINVAL);
+
+       mr = alloc_mr(max_num_sg, pd);
        if (IS_ERR(mr))
                return (struct ib_mr *)mr;
 
index f42bd0f47577a4557f47cac58de4a16b678b923d..22e356ca8058af1511d8a2a4af4947bd2c0fc892 100644 (file)
@@ -32,6 +32,7 @@
  */
 
 #include <linux/spinlock.h>
+#include <rdma/ib_smi.h>
 
 #include "qib.h"
 #include "qib_mad.h"
index a05d1a372208a11f837bc10564fa51aec1fb7ab6..3dcc4985b60ff861d5d7e2b5c9a8a9c780bccf2d 100644 (file)
@@ -40,6 +40,7 @@
 #include <linux/rculist.h>
 #include <linux/mm.h>
 #include <linux/random.h>
+#include <linux/vmalloc.h>
 
 #include "qib.h"
 #include "qib_common.h"
@@ -1574,6 +1575,7 @@ static int qib_query_device(struct ib_device *ibdev, struct ib_device_attr *prop
        props->max_qp = ib_qib_max_qps;
        props->max_qp_wr = ib_qib_max_qp_wrs;
        props->max_sge = ib_qib_max_sges;
+       props->max_sge_rd = ib_qib_max_sges;
        props->max_cq = ib_qib_max_cqs;
        props->max_ah = ib_qib_max_ahs;
        props->max_cqe = ib_qib_max_cqes;
@@ -2109,10 +2111,16 @@ int qib_register_ib_device(struct qib_devdata *dd)
         * the LKEY).  The remaining bits act as a generation number or tag.
         */
        spin_lock_init(&dev->lk_table.lock);
+       /* ensure generation is at least 4 bits; see keys.c */
+       if (ib_qib_lkey_table_size > MAX_LKEY_TABLE_BITS) {
+               qib_dev_warn(dd, "lkey bits %u too large, reduced to %u\n",
+                       ib_qib_lkey_table_size, MAX_LKEY_TABLE_BITS);
+               ib_qib_lkey_table_size = MAX_LKEY_TABLE_BITS;
+       }
        dev->lk_table.max = 1 << ib_qib_lkey_table_size;
        lk_tab_size = dev->lk_table.max * sizeof(*dev->lk_table.table);
        dev->lk_table.table = (struct qib_mregion __rcu **)
-               __get_free_pages(GFP_KERNEL, get_order(lk_tab_size));
+               vmalloc(lk_tab_size);
        if (dev->lk_table.table == NULL) {
                ret = -ENOMEM;
                goto err_lk;
@@ -2235,7 +2243,7 @@ int qib_register_ib_device(struct qib_devdata *dd)
        ibdev->reg_phys_mr = qib_reg_phys_mr;
        ibdev->reg_user_mr = qib_reg_user_mr;
        ibdev->dereg_mr = qib_dereg_mr;
-       ibdev->alloc_fast_reg_mr = qib_alloc_fast_reg_mr;
+       ibdev->alloc_mr = qib_alloc_mr;
        ibdev->alloc_fast_reg_page_list = qib_alloc_fast_reg_page_list;
        ibdev->free_fast_reg_page_list = qib_free_fast_reg_page_list;
        ibdev->alloc_fmr = qib_alloc_fmr;
@@ -2286,7 +2294,7 @@ err_tx:
                                        sizeof(struct qib_pio_header),
                                  dev->pio_hdrs, dev->pio_hdrs_phys);
 err_hdrs:
-       free_pages((unsigned long) dev->lk_table.table, get_order(lk_tab_size));
+       vfree(dev->lk_table.table);
 err_lk:
        kfree(dev->qp_table);
 err_qpt:
@@ -2340,8 +2348,7 @@ void qib_unregister_ib_device(struct qib_devdata *dd)
                                        sizeof(struct qib_pio_header),
                                  dev->pio_hdrs, dev->pio_hdrs_phys);
        lk_tab_size = dev->lk_table.max * sizeof(*dev->lk_table.table);
-       free_pages((unsigned long) dev->lk_table.table,
-                  get_order(lk_tab_size));
+       vfree(dev->lk_table.table);
        kfree(dev->qp_table);
 }
 
index 1635572752ce5bb37e05b3059dc194c196031880..a08df70e85038a220a0dce5a3ccc30f9d165bcd5 100644 (file)
@@ -647,6 +647,8 @@ struct qib_qpn_table {
        struct qpn_map map[QPNMAP_ENTRIES];
 };
 
+#define MAX_LKEY_TABLE_BITS 23
+
 struct qib_lkey_table {
        spinlock_t lock; /* protect changes in this struct */
        u32 next;               /* next unused index (speeds search) */
@@ -1032,7 +1034,9 @@ struct ib_mr *qib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 
 int qib_dereg_mr(struct ib_mr *ibmr);
 
-struct ib_mr *qib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len);
+struct ib_mr *qib_alloc_mr(struct ib_pd *pd,
+                          enum ib_mr_type mr_type,
+                          u32 max_entries);
 
 struct ib_fast_reg_page_list *qib_alloc_fast_reg_page_list(
                                struct ib_device *ibdev, int page_list_len);
index 79859c4d43c9c572f4946a364b87fdc74acf40a3..ca2873698d75444066312640a1eb84dc5e2190db 100644 (file)
@@ -342,7 +342,6 @@ struct ipoib_dev_priv {
        u16               pkey;
        u16               pkey_index;
        struct ib_pd     *pd;
-       struct ib_mr     *mr;
        struct ib_cq     *recv_cq;
        struct ib_cq     *send_cq;
        struct ib_qp     *qp;
index ee39be6ccfb0fdd9aa75ad408543e1f0b08ff1d4..c78dc1638030093298c28e13605c86845ce1fcfc 100644 (file)
@@ -332,7 +332,7 @@ static void ipoib_cm_init_rx_wr(struct net_device *dev,
        int i;
 
        for (i = 0; i < priv->cm.num_frags; ++i)
-               sge[i].lkey = priv->mr->lkey;
+               sge[i].lkey = priv->pd->local_dma_lkey;
 
        sge[0].length = IPOIB_CM_HEAD_SIZE;
        for (i = 1; i < priv->cm.num_frags; ++i)
@@ -848,7 +848,7 @@ int ipoib_cm_dev_open(struct net_device *dev)
        }
 
        ret = ib_cm_listen(priv->cm.id, cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num),
-                          0, NULL);
+                          0);
        if (ret) {
                printk(KERN_WARNING "%s: failed to listen on ID 0x%llx\n", priv->ca->name,
                       IPOIB_CM_IETF_ID | priv->qp->qp_num);
index b2943c84a5dda0aecdd8904917f2ebbb02b9b013..36536ce5a3e2f9d51278be970d51bb51ec07232c 100644 (file)
@@ -48,6 +48,9 @@
 
 #include <linux/jhash.h>
 #include <net/arp.h>
+#include <net/addrconf.h>
+#include <linux/inetdevice.h>
+#include <rdma/ib_cache.h>
 
 #define DRV_VERSION "1.0.0"
 
@@ -89,13 +92,18 @@ struct workqueue_struct *ipoib_workqueue;
 struct ib_sa_client ipoib_sa_client;
 
 static void ipoib_add_one(struct ib_device *device);
-static void ipoib_remove_one(struct ib_device *device);
+static void ipoib_remove_one(struct ib_device *device, void *client_data);
 static void ipoib_neigh_reclaim(struct rcu_head *rp);
+static struct net_device *ipoib_get_net_dev_by_params(
+               struct ib_device *dev, u8 port, u16 pkey,
+               const union ib_gid *gid, const struct sockaddr *addr,
+               void *client_data);
 
 static struct ib_client ipoib_client = {
        .name   = "ipoib",
        .add    = ipoib_add_one,
-       .remove = ipoib_remove_one
+       .remove = ipoib_remove_one,
+       .get_net_dev_by_params = ipoib_get_net_dev_by_params,
 };
 
 int ipoib_open(struct net_device *dev)
@@ -222,6 +230,225 @@ static int ipoib_change_mtu(struct net_device *dev, int new_mtu)
        return 0;
 }
 
+/* Called with an RCU read lock taken */
+static bool ipoib_is_dev_match_addr_rcu(const struct sockaddr *addr,
+                                       struct net_device *dev)
+{
+       struct net *net = dev_net(dev);
+       struct in_device *in_dev;
+       struct sockaddr_in *addr_in = (struct sockaddr_in *)addr;
+       struct sockaddr_in6 *addr_in6 = (struct sockaddr_in6 *)addr;
+       __be32 ret_addr;
+
+       switch (addr->sa_family) {
+       case AF_INET:
+               in_dev = in_dev_get(dev);
+               if (!in_dev)
+                       return false;
+
+               ret_addr = inet_confirm_addr(net, in_dev, 0,
+                                            addr_in->sin_addr.s_addr,
+                                            RT_SCOPE_HOST);
+               in_dev_put(in_dev);
+               if (ret_addr)
+                       return true;
+
+               break;
+       case AF_INET6:
+               if (IS_ENABLED(CONFIG_IPV6) &&
+                   ipv6_chk_addr(net, &addr_in6->sin6_addr, dev, 1))
+                       return true;
+
+               break;
+       }
+       return false;
+}
+
+/**
+ * Find the master net_device on top of the given net_device.
+ * @dev: base IPoIB net_device
+ *
+ * Returns the master net_device with a reference held, or the same net_device
+ * if no master exists.
+ */
+static struct net_device *ipoib_get_master_net_dev(struct net_device *dev)
+{
+       struct net_device *master;
+
+       rcu_read_lock();
+       master = netdev_master_upper_dev_get_rcu(dev);
+       if (master)
+               dev_hold(master);
+       rcu_read_unlock();
+
+       if (master)
+               return master;
+
+       dev_hold(dev);
+       return dev;
+}
+
+/**
+ * Find a net_device matching the given address, which is an upper device of
+ * the given net_device.
+ * @addr: IP address to look for.
+ * @dev: base IPoIB net_device
+ *
+ * If found, returns the net_device with a reference held. Otherwise return
+ * NULL.
+ */
+static struct net_device *ipoib_get_net_dev_match_addr(
+               const struct sockaddr *addr, struct net_device *dev)
+{
+       struct net_device *upper,
+                         *result = NULL;
+       struct list_head *iter;
+
+       rcu_read_lock();
+       if (ipoib_is_dev_match_addr_rcu(addr, dev)) {
+               dev_hold(dev);
+               result = dev;
+               goto out;
+       }
+
+       netdev_for_each_all_upper_dev_rcu(dev, upper, iter) {
+               if (ipoib_is_dev_match_addr_rcu(addr, upper)) {
+                       dev_hold(upper);
+                       result = upper;
+                       break;
+               }
+       }
+out:
+       rcu_read_unlock();
+       return result;
+}
+
+/* returns the number of IPoIB netdevs on top of a given ipoib device matching a
+ * pkey_index and address, if one exists.
+ *
+ * @found_net_dev: contains a matching net_device if the return value >= 1,
+ * with a reference held. */
+static int ipoib_match_gid_pkey_addr(struct ipoib_dev_priv *priv,
+                                    const union ib_gid *gid,
+                                    u16 pkey_index,
+                                    const struct sockaddr *addr,
+                                    int nesting,
+                                    struct net_device **found_net_dev)
+{
+       struct ipoib_dev_priv *child_priv;
+       struct net_device *net_dev = NULL;
+       int matches = 0;
+
+       if (priv->pkey_index == pkey_index &&
+           (!gid || !memcmp(gid, &priv->local_gid, sizeof(*gid)))) {
+               if (!addr) {
+                       net_dev = ipoib_get_master_net_dev(priv->dev);
+               } else {
+                       /* Verify the net_device matches the IP address, as
+                        * IPoIB child devices currently share a GID. */
+                       net_dev = ipoib_get_net_dev_match_addr(addr, priv->dev);
+               }
+               if (net_dev) {
+                       if (!*found_net_dev)
+                               *found_net_dev = net_dev;
+                       else
+                               dev_put(net_dev);
+                       ++matches;
+               }
+       }
+
+       /* Check child interfaces */
+       down_read_nested(&priv->vlan_rwsem, nesting);
+       list_for_each_entry(child_priv, &priv->child_intfs, list) {
+               matches += ipoib_match_gid_pkey_addr(child_priv, gid,
+                                                   pkey_index, addr,
+                                                   nesting + 1,
+                                                   found_net_dev);
+               if (matches > 1)
+                       break;
+       }
+       up_read(&priv->vlan_rwsem);
+
+       return matches;
+}
+
+/* Returns the number of matching net_devs found (between 0 and 2). Also
+ * return the matching net_device in the @net_dev parameter, holding a
+ * reference to the net_device, if the number of matches >= 1 */
+static int __ipoib_get_net_dev_by_params(struct list_head *dev_list, u8 port,
+                                        u16 pkey_index,
+                                        const union ib_gid *gid,
+                                        const struct sockaddr *addr,
+                                        struct net_device **net_dev)
+{
+       struct ipoib_dev_priv *priv;
+       int matches = 0;
+
+       *net_dev = NULL;
+
+       list_for_each_entry(priv, dev_list, list) {
+               if (priv->port != port)
+                       continue;
+
+               matches += ipoib_match_gid_pkey_addr(priv, gid, pkey_index,
+                                                    addr, 0, net_dev);
+               if (matches > 1)
+                       break;
+       }
+
+       return matches;
+}
+
+static struct net_device *ipoib_get_net_dev_by_params(
+               struct ib_device *dev, u8 port, u16 pkey,
+               const union ib_gid *gid, const struct sockaddr *addr,
+               void *client_data)
+{
+       struct net_device *net_dev;
+       struct list_head *dev_list = client_data;
+       u16 pkey_index;
+       int matches;
+       int ret;
+
+       if (!rdma_protocol_ib(dev, port))
+               return NULL;
+
+       ret = ib_find_cached_pkey(dev, port, pkey, &pkey_index);
+       if (ret)
+               return NULL;
+
+       if (!dev_list)
+               return NULL;
+
+       /* See if we can find a unique device matching the L2 parameters */
+       matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index,
+                                               gid, NULL, &net_dev);
+
+       switch (matches) {
+       case 0:
+               return NULL;
+       case 1:
+               return net_dev;
+       }
+
+       dev_put(net_dev);
+
+       /* Couldn't find a unique device with L2 parameters only. Use L3
+        * address to uniquely match the net device */
+       matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index,
+                                               gid, addr, &net_dev);
+       switch (matches) {
+       case 0:
+               return NULL;
+       default:
+               dev_warn_ratelimited(&dev->dev,
+                                    "duplicate IP address detected\n");
+               /* Fall through */
+       case 1:
+               return net_dev;
+       }
+}
+
 int ipoib_set_mode(struct net_device *dev, const char *buf)
 {
        struct ipoib_dev_priv *priv = netdev_priv(dev);
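ipoib_get_net_dev_by_params() above resolves the target net_device in two passes: first on the L2 parameters alone (port, pkey_index, GID), and only when that is ambiguous does it retry with the L3 address, warning if duplicates remain. A condensed sketch of that decision flow, using the helpers defined above:

/* Condensed sketch of the two-pass resolution implemented above. */
matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index,
                                        gid, NULL, &net_dev);  /* L2 only */
if (matches == 0)
        return NULL;                    /* nothing on this port/pkey/GID */
if (matches == 1)
        return net_dev;                 /* unique L2 match, reference held */

dev_put(net_dev);                       /* ambiguous: retry including the IP address */
matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index,
                                        gid, addr, &net_dev);
return matches ? net_dev : NULL;        /* >1 still returns one match, after a warning */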
@@ -1715,12 +1942,11 @@ static void ipoib_add_one(struct ib_device *device)
        ib_set_client_data(device, &ipoib_client, dev_list);
 }
 
-static void ipoib_remove_one(struct ib_device *device)
+static void ipoib_remove_one(struct ib_device *device, void *client_data)
 {
        struct ipoib_dev_priv *priv, *tmp;
-       struct list_head *dev_list;
+       struct list_head *dev_list = client_data;
 
-       dev_list = ib_get_client_data(device, &ipoib_client);
        if (!dev_list)
                return;
 
index 0d23e0568deb6fee19247ddbe43fbaabc477edf4..09a1748f9d131423f020020456d61d2f6c44a8b1 100644 (file)
@@ -393,8 +393,13 @@ static int ipoib_mcast_join_complete(int status,
                        goto out_locked;
                }
        } else {
-               if (mcast->logcount++ < 20) {
-                       if (status == -ETIMEDOUT || status == -EAGAIN) {
+               bool silent_fail =
+                   test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) &&
+                   status == -EINVAL;
+
+               if (mcast->logcount < 20) {
+                       if (status == -ETIMEDOUT || status == -EAGAIN ||
+                           silent_fail) {
                                ipoib_dbg_mcast(priv, "%smulticast join failed for %pI6, status %d\n",
                                                test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) ? "sendonly " : "",
                                                mcast->mcmember.mgid.raw, status);
@@ -403,6 +408,9 @@ static int ipoib_mcast_join_complete(int status,
                                                test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) ? "sendonly " : "",
                                           mcast->mcmember.mgid.raw, status);
                        }
+
+                       if (!silent_fail)
+                               mcast->logcount++;
                }
 
                if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) &&
@@ -448,8 +456,7 @@ out_locked:
        return status;
 }
 
-static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast,
-                            int create)
+static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast)
 {
        struct ipoib_dev_priv *priv = netdev_priv(dev);
        struct ib_sa_multicast *multicast;
@@ -471,7 +478,14 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast,
                IB_SA_MCMEMBER_REC_PKEY         |
                IB_SA_MCMEMBER_REC_JOIN_STATE;
 
-       if (create) {
+       if (mcast != priv->broadcast) {
+               /*
+                * RFC 4391:
+                *  The MGID MUST use the same P_Key, Q_Key, SL, MTU,
+                *  and HopLimit as those used in the broadcast-GID.  The rest
+                *  of attributes SHOULD follow the values used in the
+                *  broadcast-GID as well.
+                */
                comp_mask |=
                        IB_SA_MCMEMBER_REC_QKEY                 |
                        IB_SA_MCMEMBER_REC_MTU_SELECTOR         |
@@ -492,6 +506,22 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast,
                rec.sl            = priv->broadcast->mcmember.sl;
                rec.flow_label    = priv->broadcast->mcmember.flow_label;
                rec.hop_limit     = priv->broadcast->mcmember.hop_limit;
+
+               /*
+                * Historically Linux IPoIB has never properly supported SEND
+                * ONLY join. It emulated it by not providing all the required
+                * attributes, which is enough to prevent group creation and
+                * detect if there are full members or not. A major problem
+                * with supporting SEND ONLY is detecting when the group is
+                * auto-destroyed, as IPoIB will cache the MLID.
+                */
+#if 1
+               if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
+                       comp_mask &= ~IB_SA_MCMEMBER_REC_TRAFFIC_CLASS;
+#else
+               if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
+                       rec.join_state = 4;
+#endif
        }
 
        multicast = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port,
@@ -517,7 +547,6 @@ void ipoib_mcast_join_task(struct work_struct *work)
        struct ib_port_attr port_attr;
        unsigned long delay_until = 0;
        struct ipoib_mcast *mcast = NULL;
-       int create = 1;
 
        if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags))
                return;
@@ -566,7 +595,6 @@ void ipoib_mcast_join_task(struct work_struct *work)
                if (IS_ERR_OR_NULL(priv->broadcast->mc) &&
                    !test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags)) {
                        mcast = priv->broadcast;
-                       create = 0;
                        if (mcast->backoff > 1 &&
                            time_before(jiffies, mcast->delay_until)) {
                                delay_until = mcast->delay_until;
@@ -590,12 +618,8 @@ void ipoib_mcast_join_task(struct work_struct *work)
                                /* Found the next unjoined group */
                                init_completion(&mcast->done);
                                set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
-                               if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
-                                       create = 0;
-                               else
-                                       create = 1;
                                spin_unlock_irq(&priv->lock);
-                               ipoib_mcast_join(dev, mcast, create);
+                               ipoib_mcast_join(dev, mcast);
                                spin_lock_irq(&priv->lock);
                        } else if (!delay_until ||
                                 time_before(mcast->delay_until, delay_until))
@@ -618,7 +642,7 @@ out:
        }
        spin_unlock_irq(&priv->lock);
        if (mcast)
-               ipoib_mcast_join(dev, mcast, create);
+               ipoib_mcast_join(dev, mcast);
 }
 
 int ipoib_mcast_start_thread(struct net_device *dev)
index 851c8219d50104105ec8d97a3ba743cb6f59626b..78845b6e8b812737477ce68dcbc6c1712477d23d 100644 (file)
@@ -152,12 +152,6 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
                return -ENODEV;
        }
 
-       priv->mr = ib_get_dma_mr(priv->pd, IB_ACCESS_LOCAL_WRITE);
-       if (IS_ERR(priv->mr)) {
-               printk(KERN_WARNING "%s: ib_get_dma_mr failed\n", ca->name);
-               goto out_free_pd;
-       }
-
        /*
         * the various IPoIB tasks assume they will never race against
         * themselves, so always use a single thread workqueue
@@ -165,7 +159,7 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
        priv->wq = create_singlethread_workqueue("ipoib_wq");
        if (!priv->wq) {
                printk(KERN_WARNING "ipoib: failed to allocate device WQ\n");
-               goto out_free_mr;
+               goto out_free_pd;
        }
 
        size = ipoib_recvq_size + 1;
@@ -225,13 +219,13 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
        priv->dev->dev_addr[3] = (priv->qp->qp_num      ) & 0xff;
 
        for (i = 0; i < MAX_SKB_FRAGS + 1; ++i)
-               priv->tx_sge[i].lkey = priv->mr->lkey;
+               priv->tx_sge[i].lkey = priv->pd->local_dma_lkey;
 
        priv->tx_wr.opcode      = IB_WR_SEND;
        priv->tx_wr.sg_list     = priv->tx_sge;
        priv->tx_wr.send_flags  = IB_SEND_SIGNALED;
 
-       priv->rx_sge[0].lkey = priv->mr->lkey;
+       priv->rx_sge[0].lkey = priv->pd->local_dma_lkey;
 
        priv->rx_sge[0].length = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
        priv->rx_wr.num_sge = 1;
@@ -254,9 +248,6 @@ out_free_wq:
        destroy_workqueue(priv->wq);
        priv->wq = NULL;
 
-out_free_mr:
-       ib_dereg_mr(priv->mr);
-
 out_free_pd:
        ib_dealloc_pd(priv->pd);
 
@@ -289,12 +280,7 @@ void ipoib_transport_dev_cleanup(struct net_device *dev)
                priv->wq = NULL;
        }
 
-       if (ib_dereg_mr(priv->mr))
-               ipoib_warn(priv, "ib_dereg_mr failed\n");
-
-       if (ib_dealloc_pd(priv->pd))
-               ipoib_warn(priv, "ib_dealloc_pd failed\n");
-
+       ib_dealloc_pd(priv->pd);
 }
 
 void ipoib_event(struct ib_event_handler *handler,
index 6a594aac229008418f388433107a959186f96867..1ace5d83a4d761b82ffbe446bbf41a1f051cd66b 100644 (file)
 
 #include "iscsi_iser.h"
 
+MODULE_DESCRIPTION("iSER (iSCSI Extensions for RDMA) Datamover");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Alex Nezhinsky, Dan Bar Dov, Or Gerlitz");
+MODULE_VERSION(DRV_VER);
+
 static struct scsi_host_template iscsi_iser_sht;
 static struct iscsi_transport iscsi_iser_transport;
 static struct scsi_transport_template *iscsi_iser_scsi_transport;
-
-static unsigned int iscsi_max_lun = 512;
-module_param_named(max_lun, iscsi_max_lun, uint, S_IRUGO);
+static struct workqueue_struct *release_wq;
+struct iser_global ig;
 
 int iser_debug_level = 0;
-bool iser_pi_enable = false;
-int iser_pi_guard = 1;
+module_param_named(debug_level, iser_debug_level, int, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0 (default:disabled)");
 
-MODULE_DESCRIPTION("iSER (iSCSI Extensions for RDMA) Datamover");
-MODULE_LICENSE("Dual BSD/GPL");
-MODULE_AUTHOR("Alex Nezhinsky, Dan Bar Dov, Or Gerlitz");
-MODULE_VERSION(DRV_VER);
+static unsigned int iscsi_max_lun = 512;
+module_param_named(max_lun, iscsi_max_lun, uint, S_IRUGO);
+MODULE_PARM_DESC(max_lun, "Max LUNs to allow per session (default:512)");
 
-module_param_named(debug_level, iser_debug_level, int, 0644);
-MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0 (default:disabled)");
+unsigned int iser_max_sectors = ISER_DEF_MAX_SECTORS;
+module_param_named(max_sectors, iser_max_sectors, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(max_sectors, "Max number of sectors in a single scsi command (default:1024)");
 
-module_param_named(pi_enable, iser_pi_enable, bool, 0644);
+bool iser_pi_enable = false;
+module_param_named(pi_enable, iser_pi_enable, bool, S_IRUGO);
 MODULE_PARM_DESC(pi_enable, "Enable T10-PI offload support (default:disabled)");
 
-module_param_named(pi_guard, iser_pi_guard, int, 0644);
+int iser_pi_guard;
+module_param_named(pi_guard, iser_pi_guard, int, S_IRUGO);
 MODULE_PARM_DESC(pi_guard, "T10-PI guard_type [deprecated]");
 
-static struct workqueue_struct *release_wq;
-struct iser_global ig;
-
 /*
  * iscsi_iser_recv() - Process a successfull recv completion
  * @conn:         iscsi connection
@@ -201,10 +204,12 @@ iser_initialize_task_headers(struct iscsi_task *task,
                goto out;
        }
 
+       tx_desc->wr_idx = 0;
+       tx_desc->mapped = true;
        tx_desc->dma_addr = dma_addr;
        tx_desc->tx_sg[0].addr   = tx_desc->dma_addr;
        tx_desc->tx_sg[0].length = ISER_HEADERS_LEN;
-       tx_desc->tx_sg[0].lkey   = device->mr->lkey;
+       tx_desc->tx_sg[0].lkey   = device->pd->local_dma_lkey;
 
        iser_task->iser_conn = iser_conn;
 out:
@@ -360,16 +365,19 @@ iscsi_iser_task_xmit(struct iscsi_task *task)
 static void iscsi_iser_cleanup_task(struct iscsi_task *task)
 {
        struct iscsi_iser_task *iser_task = task->dd_data;
-       struct iser_tx_desc    *tx_desc   = &iser_task->desc;
-       struct iser_conn       *iser_conn         = task->conn->dd_data;
+       struct iser_tx_desc *tx_desc = &iser_task->desc;
+       struct iser_conn *iser_conn = task->conn->dd_data;
        struct iser_device *device = iser_conn->ib_conn.device;
 
        /* DEVICE_REMOVAL event might have already released the device */
        if (!device)
                return;
 
-       ib_dma_unmap_single(device->ib_device,
-               tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE);
+       if (likely(tx_desc->mapped)) {
+               ib_dma_unmap_single(device->ib_device, tx_desc->dma_addr,
+                                   ISER_HEADERS_LEN, DMA_TO_DEVICE);
+               tx_desc->mapped = false;
+       }
 
        /* mgmt tasks do not need special cleanup */
        if (!task->sc)
@@ -622,6 +630,8 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep,
        if (ep) {
                iser_conn = ep->dd_data;
                max_cmds = iser_conn->max_cmds;
+               shost->sg_tablesize = iser_conn->scsi_sg_tablesize;
+               shost->max_sectors = iser_conn->scsi_max_sectors;
 
                mutex_lock(&iser_conn->state_mutex);
                if (iser_conn->state != ISER_CONN_UP) {
@@ -640,6 +650,15 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep,
                                                   SHOST_DIX_GUARD_CRC);
                }
 
+               /*
+                * Limit the sg_tablesize and max_sectors based on the device
+                * max fastreg page list length.
+                */
+               shost->sg_tablesize = min_t(unsigned short, shost->sg_tablesize,
+                       ib_conn->device->dev_attr.max_fast_reg_page_list_len);
+               shost->max_sectors = min_t(unsigned int,
+                       1024, (shost->sg_tablesize * PAGE_SIZE) >> 9);
+
                if (iscsi_host_add(shost,
                                   ib_conn->device->ib_device->dma_device)) {
                        mutex_unlock(&iser_conn->state_mutex);
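The clamp added above keeps the SCSI limits consistent with what the device can fast-register: sg_tablesize is bounded by max_fast_reg_page_list_len and max_sectors is then derived from it. A worked example with assumed values (4 KB pages, a device reporting 256 fast-reg pages):

/* Worked example of the clamp above; the numbers are illustrative only. */
shost->sg_tablesize = min_t(unsigned short, 128 /* current default */,
                            256 /* dev_attr.max_fast_reg_page_list_len */);
                                        /* -> 128 entries */
shost->max_sectors = min_t(unsigned int, 1024,
                           (128 * 4096) >> 9);
                                        /* 128 pages * 4 KB / 512 = 1024 sectors = 512 KB */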
@@ -742,15 +761,9 @@ iscsi_iser_conn_get_stats(struct iscsi_cls_conn *cls_conn, struct iscsi_stats *s
        stats->r2t_pdus = conn->r2t_pdus_cnt; /* always 0 */
        stats->tmfcmd_pdus = conn->tmfcmd_pdus_cnt;
        stats->tmfrsp_pdus = conn->tmfrsp_pdus_cnt;
-       stats->custom_length = 4;
-       strcpy(stats->custom[0].desc, "qp_tx_queue_full");
-       stats->custom[0].value = 0; /* TB iser_conn->qp_tx_queue_full; */
-       strcpy(stats->custom[1].desc, "fmr_map_not_avail");
-       stats->custom[1].value = 0; /* TB iser_conn->fmr_map_not_avail */;
-       strcpy(stats->custom[2].desc, "eh_abort_cnt");
-       stats->custom[2].value = conn->eh_abort_cnt;
-       strcpy(stats->custom[3].desc, "fmr_unalign_cnt");
-       stats->custom[3].value = conn->fmr_unalign_cnt;
+       stats->custom_length = 1;
+       strcpy(stats->custom[0].desc, "fmr_unalign_cnt");
+       stats->custom[0].value = conn->fmr_unalign_cnt;
 }
 
 static int iscsi_iser_get_ep_param(struct iscsi_endpoint *ep,
@@ -839,10 +852,9 @@ failure:
 static int
 iscsi_iser_ep_poll(struct iscsi_endpoint *ep, int timeout_ms)
 {
-       struct iser_conn *iser_conn;
+       struct iser_conn *iser_conn = ep->dd_data;
        int rc;
 
-       iser_conn = ep->dd_data;
        rc = wait_for_completion_interruptible_timeout(&iser_conn->up_completion,
                                                       msecs_to_jiffies(timeout_ms));
        /* if conn establishment failed, return error code to iscsi */
@@ -854,7 +866,7 @@ iscsi_iser_ep_poll(struct iscsi_endpoint *ep, int timeout_ms)
                mutex_unlock(&iser_conn->state_mutex);
        }
 
-       iser_info("ib conn %p rc = %d\n", iser_conn, rc);
+       iser_info("iser conn %p rc = %d\n", iser_conn, rc);
 
        if (rc > 0)
                return 1; /* success, this is the equivalent of POLLOUT */
@@ -876,11 +888,9 @@ iscsi_iser_ep_poll(struct iscsi_endpoint *ep, int timeout_ms)
 static void
 iscsi_iser_ep_disconnect(struct iscsi_endpoint *ep)
 {
-       struct iser_conn *iser_conn;
+       struct iser_conn *iser_conn = ep->dd_data;
 
-       iser_conn = ep->dd_data;
-       iser_info("ep %p iser conn %p state %d\n",
-                 ep, iser_conn, iser_conn->state);
+       iser_info("ep %p iser conn %p\n", ep, iser_conn);
 
        mutex_lock(&iser_conn->state_mutex);
        iser_conn_terminate(iser_conn);
@@ -900,6 +910,7 @@ iscsi_iser_ep_disconnect(struct iscsi_endpoint *ep)
                mutex_unlock(&iser_conn->state_mutex);
                iser_conn_release(iser_conn);
        }
+
        iscsi_destroy_endpoint(ep);
 }
 
@@ -962,8 +973,8 @@ static struct scsi_host_template iscsi_iser_sht = {
        .name                   = "iSCSI Initiator over iSER",
        .queuecommand           = iscsi_queuecommand,
        .change_queue_depth     = scsi_change_queue_depth,
-       .sg_tablesize           = ISCSI_ISER_SG_TABLESIZE,
-       .max_sectors            = 1024,
+       .sg_tablesize           = ISCSI_ISER_DEF_SG_TABLESIZE,
+       .max_sectors            = ISER_DEF_MAX_SECTORS,
        .cmd_per_lun            = ISER_DEF_CMD_PER_LUN,
        .eh_abort_handler       = iscsi_eh_abort,
        .eh_device_reset_handler= iscsi_eh_device_reset,
@@ -1074,7 +1085,7 @@ static void __exit iser_exit(void)
 
        if (!connlist_empty) {
                iser_err("Error cleanup stage completed but we still have iser "
-                        "connections, destroying them anyway.\n");
+                        "connections, destroying them anyway\n");
                list_for_each_entry_safe(iser_conn, n, &ig.connlist,
                                         conn_list) {
                        iser_conn_release(iser_conn);
index 262ba1f8ee507d30111b98b7970b5bbc86e051c6..86f6583485ef3f99c678a5ce1087f45e7e2ba1f1 100644 (file)
 #define SHIFT_4K       12
 #define SIZE_4K        (1ULL << SHIFT_4K)
 #define MASK_4K        (~(SIZE_4K-1))
-                                       /* support up to 512KB in one RDMA */
-#define ISCSI_ISER_SG_TABLESIZE         (0x80000 >> SHIFT_4K)
+
+/* Default support is 512KB I/O size */
+#define ISER_DEF_MAX_SECTORS           1024
+#define ISCSI_ISER_DEF_SG_TABLESIZE    ((ISER_DEF_MAX_SECTORS * 512) >> SHIFT_4K)
+/* Maximum support is 8MB I/O size */
+#define ISCSI_ISER_MAX_SG_TABLESIZE    ((16384 * 512) >> SHIFT_4K)
+
 #define ISER_DEF_XMIT_CMDS_DEFAULT             512
 #if ISCSI_DEF_XMIT_CMDS_MAX > ISER_DEF_XMIT_CMDS_DEFAULT
        #define ISER_DEF_XMIT_CMDS_MAX          ISCSI_DEF_XMIT_CMDS_MAX
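The replacement macros above express the old hard-coded limit in terms of ISER_DEF_MAX_SECTORS and add an 8 MB ceiling; with SHIFT_4K == 12 as defined above the arithmetic works out as follows:

/* Worked arithmetic for the macros above (4 KB pages, SHIFT_4K == 12):
 *   ISCSI_ISER_DEF_SG_TABLESIZE = (1024  * 512) >> 12 =  128 entries  ->  512 KB I/O
 *   ISCSI_ISER_MAX_SG_TABLESIZE = (16384 * 512) >> 12 = 2048 entries  ->    8 MB I/O
 */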
@@ -239,6 +244,7 @@ struct iser_data_buf {
 struct iser_device;
 struct iscsi_iser_task;
 struct iscsi_endpoint;
+struct iser_reg_resources;
 
 /**
  * struct iser_mem_reg - iSER memory registration info
@@ -259,6 +265,14 @@ enum iser_desc_type {
        ISCSI_TX_DATAOUT
 };
 
+/* Maximum number of work requests per task:
+ * Data memory region local invalidate + fast registration
+ * Protection memory region local invalidate + fast registration
+ * Signature memory region local invalidate + fast registration
+ * PDU send
+ */
+#define ISER_MAX_WRS 7
+
 /**
  * struct iser_tx_desc - iSER TX descriptor (for send wr_id)
  *
@@ -270,6 +284,12 @@ enum iser_desc_type {
  *                 sg[1] optionally points to either of immediate data
  *                 unsolicited data-out or control
  * @num_sge:       number sges used on this TX task
+ * @mapped:        Is the task header mapped
+ * @wr_idx:        Current WR index
+ * @wrs:           Array of WRs per task
+ * @data_reg:      Data buffer registration details
+ * @prot_reg:      Protection buffer registration details
+ * @sig_attrs:     Signature attributes
  */
 struct iser_tx_desc {
        struct iser_hdr              iser_header;
@@ -278,6 +298,12 @@ struct iser_tx_desc {
        u64                          dma_addr;
        struct ib_sge                tx_sg[2];
        int                          num_sge;
+       bool                         mapped;
+       u8                           wr_idx;
+       struct ib_send_wr            wrs[ISER_MAX_WRS];
+       struct iser_mem_reg          data_reg;
+       struct iser_mem_reg          prot_reg;
+       struct ib_sig_attrs          sig_attrs;
 };
 
 #define ISER_RX_PAD_SIZE       (256 - (ISER_RX_PAYLOAD_SIZE + \
@@ -323,6 +349,33 @@ struct iser_comp {
        int                      active_qps;
 };
 
+/**
+ * struct iser_reg_ops - Memory registration operations
+ *     per-device registration schemes
+ *
+ * @alloc_reg_res:     Allocate registration resources
+ * @free_reg_res:      Free registration resources
+ * @reg_mem:           Register memory buffers
+ * @unreg_mem:         Un-register memory buffers
+ * @reg_desc_get:      Get a registration descriptor from the pool
+ * @reg_desc_put:      Put a registration descriptor back to the pool
+ */
+struct iser_reg_ops {
+       int            (*alloc_reg_res)(struct ib_conn *ib_conn,
+                                       unsigned cmds_max,
+                                       unsigned int size);
+       void           (*free_reg_res)(struct ib_conn *ib_conn);
+       int            (*reg_mem)(struct iscsi_iser_task *iser_task,
+                                 struct iser_data_buf *mem,
+                                 struct iser_reg_resources *rsc,
+                                 struct iser_mem_reg *reg);
+       void           (*unreg_mem)(struct iscsi_iser_task *iser_task,
+                                   enum iser_data_dir cmd_dir);
+       struct iser_fr_desc * (*reg_desc_get)(struct ib_conn *ib_conn);
+       void           (*reg_desc_put)(struct ib_conn *ib_conn,
+                                      struct iser_fr_desc *desc);
+};
+
 /**
  * struct iser_device - iSER device handle
  *
@@ -336,11 +389,7 @@ struct iser_comp {
  * @comps_used:    Number of completion contexts used, Min between online
  *                 cpus and device max completion vectors
  * @comps:         Dinamically allocated array of completion handlers
- * Memory registration pool Function pointers (FMR or Fastreg):
- *     @iser_alloc_rdma_reg_res: Allocation of memory regions pool
- *     @iser_free_rdma_reg_res:  Free of memory regions pool
- *     @iser_reg_rdma_mem:       Memory registration routine
- *     @iser_unreg_rdma_mem:     Memory deregistration routine
+ * @reg_ops:       Registration ops
  */
 struct iser_device {
        struct ib_device             *ib_device;
@@ -352,54 +401,73 @@ struct iser_device {
        int                          refcount;
        int                          comps_used;
        struct iser_comp             *comps;
-       int                          (*iser_alloc_rdma_reg_res)(struct ib_conn *ib_conn,
-                                                               unsigned cmds_max);
-       void                         (*iser_free_rdma_reg_res)(struct ib_conn *ib_conn);
-       int                          (*iser_reg_rdma_mem)(struct iscsi_iser_task *iser_task,
-                                                         enum iser_data_dir cmd_dir);
-       void                         (*iser_unreg_rdma_mem)(struct iscsi_iser_task *iser_task,
-                                                           enum iser_data_dir cmd_dir);
+       struct iser_reg_ops          *reg_ops;
 };
 
 #define ISER_CHECK_GUARD       0xc0
 #define ISER_CHECK_REFTAG      0x0f
 #define ISER_CHECK_APPTAG      0x30
 
-enum iser_reg_indicator {
-       ISER_DATA_KEY_VALID     = 1 << 0,
-       ISER_PROT_KEY_VALID     = 1 << 1,
-       ISER_SIG_KEY_VALID      = 1 << 2,
-       ISER_FASTREG_PROTECTED  = 1 << 3,
+/**
+ * struct iser_reg_resources - Fast registration resources
+ *
+ * @mr:         memory region
+ * @fmr_pool:   pool of fmrs
+ * @frpl:       fast reg page list used by frwrs
+ * @page_vec:   fast reg page list used by fmr pool
+ * @mr_valid:   is mr valid indicator
+ */
+struct iser_reg_resources {
+       union {
+               struct ib_mr             *mr;
+               struct ib_fmr_pool       *fmr_pool;
+       };
+       union {
+               struct ib_fast_reg_page_list     *frpl;
+               struct iser_page_vec             *page_vec;
+       };
+       u8                                mr_valid:1;
 };
 
 /**
  * struct iser_pi_context - Protection information context
  *
- * @prot_mr:        protection memory region
- * @prot_frpl:      protection fastreg page list
- * @sig_mr:         signature feature enabled memory region
+ * @rsc:             protection buffer registration resources
+ * @sig_mr:          signature enable memory region
+ * @sig_mr_valid:    is sig_mr valid indicator
+ * @sig_protected:   is region protected indicator
  */
 struct iser_pi_context {
-       struct ib_mr                   *prot_mr;
-       struct ib_fast_reg_page_list   *prot_frpl;
+       struct iser_reg_resources       rsc;
        struct ib_mr                   *sig_mr;
+       u8                              sig_mr_valid:1;
+       u8                              sig_protected:1;
 };
 
 /**
- * struct fast_reg_descriptor - Fast registration descriptor
+ * struct iser_fr_desc - Fast registration descriptor
  *
  * @list:           entry in connection fastreg pool
- * @data_mr:        data memory region
- * @data_frpl:      data fastreg page list
+ * @rsc:            data buffer registration resources
  * @pi_ctx:         protection information context
- * @reg_indicators: fast registration indicators
  */
-struct fast_reg_descriptor {
+struct iser_fr_desc {
        struct list_head                  list;
-       struct ib_mr                     *data_mr;
-       struct ib_fast_reg_page_list     *data_frpl;
+       struct iser_reg_resources         rsc;
        struct iser_pi_context           *pi_ctx;
-       u8                                reg_indicators;
+};
+
+/**
+ * struct iser_fr_pool - connection fast registration pool
+ *
+ * @list:                list of fastreg descriptors
+ * @lock:                protects fmr/fastreg pool
+ * @size:                size of the pool
+ */
+struct iser_fr_pool {
+       struct list_head        list;
+       spinlock_t              lock;
+       int                     size;
 };
 
 /**
@@ -415,15 +483,7 @@ struct fast_reg_descriptor {
  * @pi_support:          Indicate device T10-PI support
  * @beacon:              beacon send wr to signal all flush errors were drained
  * @flush_comp:          completes when all connection completions consumed
- * @lock:                protects fmr/fastreg pool
- * @union.fmr:
- *     @pool:            FMR pool for fast registrations
- *     @page_vec:        page vector to hold mapped commands pages
- *                       used for registration
- * @union.fastreg:
- *     @pool:            Fast registration descriptors pool for fast
- *                       registrations
- *     @pool_size:       Size of pool
+ * @fr_pool:             connection fast registration pool
  */
 struct ib_conn {
        struct rdma_cm_id           *cma_id;
@@ -436,17 +496,7 @@ struct ib_conn {
        bool                         pi_support;
        struct ib_send_wr            beacon;
        struct completion            flush_comp;
-       spinlock_t                   lock;
-       union {
-               struct {
-                       struct ib_fmr_pool      *pool;
-                       struct iser_page_vec    *page_vec;
-               } fmr;
-               struct {
-                       struct list_head         pool;
-                       int                      pool_size;
-               } fastreg;
-       };
+       struct iser_fr_pool          fr_pool;
 };
 
 /**
@@ -477,6 +527,8 @@ struct ib_conn {
  * @rx_desc_head:     head of rx_descs cyclic buffer
  * @rx_descs:         rx buffers array (cyclic buffer)
  * @num_rx_descs:     number of rx descriptors
+ * @scsi_sg_tablesize: scsi host sg_tablesize
+ * @scsi_max_sectors: scsi host max sectors
  */
 struct iser_conn {
        struct ib_conn               ib_conn;
@@ -501,6 +553,8 @@ struct iser_conn {
        unsigned int                 rx_desc_head;
        struct iser_rx_desc          *rx_descs;
        u32                          num_rx_descs;
+       unsigned short               scsi_sg_tablesize;
+       unsigned int                 scsi_max_sectors;
 };
 
 /**
@@ -556,6 +610,9 @@ extern struct iser_global ig;
 extern int iser_debug_level;
 extern bool iser_pi_enable;
 extern int iser_pi_guard;
+extern unsigned int iser_max_sectors;
+
+int iser_assign_reg_ops(struct iser_device *device);
 
 int iser_send_control(struct iscsi_conn *conn,
                      struct iscsi_task *task);
@@ -597,10 +654,10 @@ void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task,
                                     struct iser_data_buf *mem,
                                     enum iser_data_dir cmd_dir);
 
-int  iser_reg_rdma_mem_fmr(struct iscsi_iser_task *task,
-                          enum iser_data_dir cmd_dir);
-int  iser_reg_rdma_mem_fastreg(struct iscsi_iser_task *task,
-                              enum iser_data_dir cmd_dir);
+int iser_reg_rdma_mem(struct iscsi_iser_task *task,
+                     enum iser_data_dir dir);
+void iser_unreg_rdma_mem(struct iscsi_iser_task *task,
+                        enum iser_data_dir dir);
 
 int  iser_connect(struct iser_conn *iser_conn,
                  struct sockaddr *src_addr,
@@ -630,15 +687,40 @@ int  iser_initialize_task_headers(struct iscsi_task *task,
                        struct iser_tx_desc *tx_desc);
 int iser_alloc_rx_descriptors(struct iser_conn *iser_conn,
                              struct iscsi_session *session);
-int iser_create_fmr_pool(struct ib_conn *ib_conn, unsigned cmds_max);
+int iser_alloc_fmr_pool(struct ib_conn *ib_conn,
+                       unsigned cmds_max,
+                       unsigned int size);
 void iser_free_fmr_pool(struct ib_conn *ib_conn);
-int iser_create_fastreg_pool(struct ib_conn *ib_conn, unsigned cmds_max);
+int iser_alloc_fastreg_pool(struct ib_conn *ib_conn,
+                           unsigned cmds_max,
+                           unsigned int size);
 void iser_free_fastreg_pool(struct ib_conn *ib_conn);
 u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task,
                             enum iser_data_dir cmd_dir, sector_t *sector);
-struct fast_reg_descriptor *
-iser_reg_desc_get(struct ib_conn *ib_conn);
+struct iser_fr_desc *
+iser_reg_desc_get_fr(struct ib_conn *ib_conn);
 void
-iser_reg_desc_put(struct ib_conn *ib_conn,
-                 struct fast_reg_descriptor *desc);
+iser_reg_desc_put_fr(struct ib_conn *ib_conn,
+                    struct iser_fr_desc *desc);
+struct iser_fr_desc *
+iser_reg_desc_get_fmr(struct ib_conn *ib_conn);
+void
+iser_reg_desc_put_fmr(struct ib_conn *ib_conn,
+                     struct iser_fr_desc *desc);
+
+static inline struct ib_send_wr *
+iser_tx_next_wr(struct iser_tx_desc *tx_desc)
+{
+       struct ib_send_wr *cur_wr = &tx_desc->wrs[tx_desc->wr_idx];
+       struct ib_send_wr *last_wr;
+
+       if (tx_desc->wr_idx) {
+               last_wr = &tx_desc->wrs[tx_desc->wr_idx - 1];
+               last_wr->next = cur_wr;
+       }
+       tx_desc->wr_idx++;
+
+       return cur_wr;
+}
+
 #endif
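The iser_tx_next_wr() helper above is what lets registration work requests ride on the task's send descriptor: each call hands out the next slot in tx_desc->wrs[] and links it to the previously taken slot, so a single ib_post_send() later submits the whole chain. A minimal usage sketch, assuming a prepared tx descriptor and connection; the opcodes and variable names are illustrative only and most WR fields are omitted:

        struct ib_send_wr *wr, *bad_wr;
        int err;

        wr = iser_tx_next_wr(tx_desc);          /* slot 0, nothing to link yet */
        wr->opcode = IB_WR_LOCAL_INV;           /* e.g. invalidate a stale rkey */

        wr = iser_tx_next_wr(tx_desc);          /* slot 1, chained after slot 0 */
        wr->opcode = IB_WR_SEND;
        wr->next = NULL;                        /* terminate the chain */

        /* posting the first WR submits the whole chain */
        err = ib_post_send(ib_conn->qp, &tx_desc->wrs[0], &bad_wr);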
index 3e2118e8ed8798e4c7946d980ce0351e31088f34..d511879d8cdfc862765f871b0876b43307e3b7c7 100644 (file)
@@ -49,7 +49,6 @@ static int iser_prepare_read_cmd(struct iscsi_task *task)
 
 {
        struct iscsi_iser_task *iser_task = task->dd_data;
-       struct iser_device  *device = iser_task->iser_conn->ib_conn.device;
        struct iser_mem_reg *mem_reg;
        int err;
        struct iser_hdr *hdr = &iser_task->desc.iser_header;
@@ -73,7 +72,7 @@ static int iser_prepare_read_cmd(struct iscsi_task *task)
                        return err;
        }
 
-       err = device->iser_reg_rdma_mem(iser_task, ISER_DIR_IN);
+       err = iser_reg_rdma_mem(iser_task, ISER_DIR_IN);
        if (err) {
                iser_err("Failed to set up Data-IN RDMA\n");
                return err;
@@ -103,7 +102,6 @@ iser_prepare_write_cmd(struct iscsi_task *task,
                       unsigned int edtl)
 {
        struct iscsi_iser_task *iser_task = task->dd_data;
-       struct iser_device  *device = iser_task->iser_conn->ib_conn.device;
        struct iser_mem_reg *mem_reg;
        int err;
        struct iser_hdr *hdr = &iser_task->desc.iser_header;
@@ -128,7 +126,7 @@ iser_prepare_write_cmd(struct iscsi_task *task,
                        return err;
        }
 
-       err = device->iser_reg_rdma_mem(iser_task, ISER_DIR_OUT);
+       err = iser_reg_rdma_mem(iser_task, ISER_DIR_OUT);
        if (err != 0) {
                iser_err("Failed to register write cmd RDMA mem\n");
                return err;
@@ -170,13 +168,7 @@ static void iser_create_send_desc(struct iser_conn *iser_conn,
 
        memset(&tx_desc->iser_header, 0, sizeof(struct iser_hdr));
        tx_desc->iser_header.flags = ISER_VER;
-
        tx_desc->num_sge = 1;
-
-       if (tx_desc->tx_sg[0].lkey != device->mr->lkey) {
-               tx_desc->tx_sg[0].lkey = device->mr->lkey;
-               iser_dbg("sdesc %p lkey mismatch, fixing\n", tx_desc);
-       }
 }
 
 static void iser_free_login_buf(struct iser_conn *iser_conn)
@@ -266,7 +258,8 @@ int iser_alloc_rx_descriptors(struct iser_conn *iser_conn,
        iser_conn->qp_max_recv_dtos_mask = session->cmds_max - 1; /* cmds_max is 2^N */
        iser_conn->min_posted_rx = iser_conn->qp_max_recv_dtos >> 2;
 
-       if (device->iser_alloc_rdma_reg_res(ib_conn, session->scsi_cmds_max))
+       if (device->reg_ops->alloc_reg_res(ib_conn, session->scsi_cmds_max,
+                                          iser_conn->scsi_sg_tablesize))
                goto create_rdma_reg_res_failed;
 
        if (iser_alloc_login_buf(iser_conn))
@@ -291,7 +284,7 @@ int iser_alloc_rx_descriptors(struct iser_conn *iser_conn,
                rx_sg = &rx_desc->rx_sg;
                rx_sg->addr   = rx_desc->dma_addr;
                rx_sg->length = ISER_RX_PAYLOAD_SIZE;
-               rx_sg->lkey   = device->mr->lkey;
+               rx_sg->lkey   = device->pd->local_dma_lkey;
        }
 
        iser_conn->rx_desc_head = 0;
@@ -307,7 +300,7 @@ rx_desc_dma_map_failed:
 rx_desc_alloc_fail:
        iser_free_login_buf(iser_conn);
 alloc_login_buf_fail:
-       device->iser_free_rdma_reg_res(ib_conn);
+       device->reg_ops->free_reg_res(ib_conn);
 create_rdma_reg_res_failed:
        iser_err("failed allocating rx descriptors / data buffers\n");
        return -ENOMEM;
@@ -320,8 +313,8 @@ void iser_free_rx_descriptors(struct iser_conn *iser_conn)
        struct ib_conn *ib_conn = &iser_conn->ib_conn;
        struct iser_device *device = ib_conn->device;
 
-       if (device->iser_free_rdma_reg_res)
-               device->iser_free_rdma_reg_res(ib_conn);
+       if (device->reg_ops->free_reg_res)
+               device->reg_ops->free_reg_res(ib_conn);
 
        rx_desc = iser_conn->rx_descs;
        for (i = 0; i < iser_conn->qp_max_recv_dtos; i++, rx_desc++)
@@ -454,7 +447,7 @@ int iser_send_data_out(struct iscsi_conn *conn,
        unsigned long buf_offset;
        unsigned long data_seg_len;
        uint32_t itt;
-       int err = 0;
+       int err;
        struct ib_sge *tx_dsg;
 
        itt = (__force uint32_t)hdr->itt;
@@ -475,7 +468,9 @@ int iser_send_data_out(struct iscsi_conn *conn,
        memcpy(&tx_desc->iscsi_header, hdr, sizeof(struct iscsi_hdr));
 
        /* build the tx desc */
-       iser_initialize_task_headers(task, tx_desc);
+       err = iser_initialize_task_headers(task, tx_desc);
+       if (err)
+               goto send_data_out_error;
 
        mem_reg = &iser_task->rdma_reg[ISER_DIR_OUT];
        tx_dsg = &tx_desc->tx_sg[1];
@@ -502,7 +497,7 @@ int iser_send_data_out(struct iscsi_conn *conn,
 
 send_data_out_error:
        kmem_cache_free(ig.desc_cache, tx_desc);
-       iser_err("conn %p failed err %d\n",conn, err);
+       iser_err("conn %p failed err %d\n", conn, err);
        return err;
 }
 
@@ -543,7 +538,7 @@ int iser_send_control(struct iscsi_conn *conn,
 
                tx_dsg->addr    = iser_conn->login_req_dma;
                tx_dsg->length  = task->data_count;
-               tx_dsg->lkey    = device->mr->lkey;
+               tx_dsg->lkey    = device->pd->local_dma_lkey;
                mdesc->num_sge = 2;
        }
 
@@ -666,7 +661,6 @@ void iser_task_rdma_init(struct iscsi_iser_task *iser_task)
 
 void iser_task_rdma_finalize(struct iscsi_iser_task *iser_task)
 {
-       struct iser_device *device = iser_task->iser_conn->ib_conn.device;
        int is_rdma_data_aligned = 1;
        int is_rdma_prot_aligned = 1;
        int prot_count = scsi_prot_sg_count(iser_task->sc);
@@ -703,7 +697,7 @@ void iser_task_rdma_finalize(struct iscsi_iser_task *iser_task)
        }
 
        if (iser_task->dir[ISER_DIR_IN]) {
-               device->iser_unreg_rdma_mem(iser_task, ISER_DIR_IN);
+               iser_unreg_rdma_mem(iser_task, ISER_DIR_IN);
                if (is_rdma_data_aligned)
                        iser_dma_unmap_task_data(iser_task,
                                                 &iser_task->data[ISER_DIR_IN],
@@ -715,7 +709,7 @@ void iser_task_rdma_finalize(struct iscsi_iser_task *iser_task)
        }
 
        if (iser_task->dir[ISER_DIR_OUT]) {
-               device->iser_unreg_rdma_mem(iser_task, ISER_DIR_OUT);
+               iser_unreg_rdma_mem(iser_task, ISER_DIR_OUT);
                if (is_rdma_data_aligned)
                        iser_dma_unmap_task_data(iser_task,
                                                 &iser_task->data[ISER_DIR_OUT],
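Several hunks above replace device->mr->lkey with device->pd->local_dma_lkey when filling SGEs for locally DMA-mapped buffers (rx descriptors, login buffers), so the driver no longer depends on a separately registered DMA MR for its lkey. A minimal sketch of the resulting pattern; dma_addr and len stand in for whatever the caller mapped and are illustrative names only:

        struct ib_sge sge = {
                .addr   = dma_addr,                     /* from ib_dma_map_single() */
                .length = len,
                .lkey   = device->pd->local_dma_lkey,   /* PD-wide local DMA lkey */
        };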
index f0cdc961eb11b92dd5a80930dab123a7587c8f2c..2493cc748db839b4ec885b82e5292633f242ad01 100644 (file)
 #include <linux/scatterlist.h>
 
 #include "iscsi_iser.h"
+static
+int iser_fast_reg_fmr(struct iscsi_iser_task *iser_task,
+                     struct iser_data_buf *mem,
+                     struct iser_reg_resources *rsc,
+                     struct iser_mem_reg *mem_reg);
+static
+int iser_fast_reg_mr(struct iscsi_iser_task *iser_task,
+                    struct iser_data_buf *mem,
+                    struct iser_reg_resources *rsc,
+                    struct iser_mem_reg *mem_reg);
+
+static struct iser_reg_ops fastreg_ops = {
+       .alloc_reg_res  = iser_alloc_fastreg_pool,
+       .free_reg_res   = iser_free_fastreg_pool,
+       .reg_mem        = iser_fast_reg_mr,
+       .unreg_mem      = iser_unreg_mem_fastreg,
+       .reg_desc_get   = iser_reg_desc_get_fr,
+       .reg_desc_put   = iser_reg_desc_put_fr,
+};
+
+static struct iser_reg_ops fmr_ops = {
+       .alloc_reg_res  = iser_alloc_fmr_pool,
+       .free_reg_res   = iser_free_fmr_pool,
+       .reg_mem        = iser_fast_reg_fmr,
+       .unreg_mem      = iser_unreg_mem_fmr,
+       .reg_desc_get   = iser_reg_desc_get_fmr,
+       .reg_desc_put   = iser_reg_desc_put_fmr,
+};
+
+int iser_assign_reg_ops(struct iser_device *device)
+{
+       struct ib_device_attr *dev_attr = &device->dev_attr;
+
+       /* Assign function handles  - based on FMR support */
+       if (device->ib_device->alloc_fmr && device->ib_device->dealloc_fmr &&
+           device->ib_device->map_phys_fmr && device->ib_device->unmap_fmr) {
+               iser_info("FMR supported, using FMR for registration\n");
+               device->reg_ops = &fmr_ops;
+       } else
+       if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
+               iser_info("FastReg supported, using FastReg for registration\n");
+               device->reg_ops = &fastreg_ops;
+       } else {
+               iser_err("IB device does not support FMRs nor FastRegs, can't register memory\n");
+               return -1;
+       }
+
+       return 0;
+}
 
 static void
 iser_free_bounce_sg(struct iser_data_buf *data)
@@ -146,30 +195,47 @@ iser_copy_to_bounce(struct iser_data_buf *data)
        iser_copy_bounce(data, true);
 }
 
-struct fast_reg_descriptor *
-iser_reg_desc_get(struct ib_conn *ib_conn)
+struct iser_fr_desc *
+iser_reg_desc_get_fr(struct ib_conn *ib_conn)
 {
-       struct fast_reg_descriptor *desc;
+       struct iser_fr_pool *fr_pool = &ib_conn->fr_pool;
+       struct iser_fr_desc *desc;
        unsigned long flags;
 
-       spin_lock_irqsave(&ib_conn->lock, flags);
-       desc = list_first_entry(&ib_conn->fastreg.pool,
-                               struct fast_reg_descriptor, list);
+       spin_lock_irqsave(&fr_pool->lock, flags);
+       desc = list_first_entry(&fr_pool->list,
+                               struct iser_fr_desc, list);
        list_del(&desc->list);
-       spin_unlock_irqrestore(&ib_conn->lock, flags);
+       spin_unlock_irqrestore(&fr_pool->lock, flags);
 
        return desc;
 }
 
 void
-iser_reg_desc_put(struct ib_conn *ib_conn,
-                 struct fast_reg_descriptor *desc)
+iser_reg_desc_put_fr(struct ib_conn *ib_conn,
+                    struct iser_fr_desc *desc)
 {
+       struct iser_fr_pool *fr_pool = &ib_conn->fr_pool;
        unsigned long flags;
 
-       spin_lock_irqsave(&ib_conn->lock, flags);
-       list_add(&desc->list, &ib_conn->fastreg.pool);
-       spin_unlock_irqrestore(&ib_conn->lock, flags);
+       spin_lock_irqsave(&fr_pool->lock, flags);
+       list_add(&desc->list, &fr_pool->list);
+       spin_unlock_irqrestore(&fr_pool->lock, flags);
+}
+
+struct iser_fr_desc *
+iser_reg_desc_get_fmr(struct ib_conn *ib_conn)
+{
+       struct iser_fr_pool *fr_pool = &ib_conn->fr_pool;
+
+       return list_first_entry(&fr_pool->list,
+                               struct iser_fr_desc, list);
+}
+
+void
+iser_reg_desc_put_fmr(struct ib_conn *ib_conn,
+                     struct iser_fr_desc *desc)
+{
 }
 
 /**
@@ -297,7 +363,8 @@ static int iser_sg_to_page_vec(struct iser_data_buf *data,
  * consecutive SG elements are actually fragments of the same physical page.
  */
 static int iser_data_buf_aligned_len(struct iser_data_buf *data,
-                                     struct ib_device *ibdev)
+                                    struct ib_device *ibdev,
+                                    unsigned sg_tablesize)
 {
        struct scatterlist *sg, *sgl, *next_sg = NULL;
        u64 start_addr, end_addr;
@@ -309,6 +376,14 @@ static int iser_data_buf_aligned_len(struct iser_data_buf *data,
        sgl = data->sg;
        start_addr  = ib_sg_dma_address(ibdev, sgl);
 
+       if (unlikely(sgl[0].offset &&
+                    data->data_len >= sg_tablesize * PAGE_SIZE)) {
+               iser_dbg("can't register length %lx with offset %x "
+                        "fall to bounce buffer\n", data->data_len,
+                        sgl[0].offset);
+               return 0;
+       }
+
        for_each_sg(sgl, sg, data->dma_nents, i) {
                if (start_check && !IS_4K_ALIGNED(start_addr))
                        break;
@@ -330,8 +405,11 @@ static int iser_data_buf_aligned_len(struct iser_data_buf *data,
                        break;
        }
        ret_len = (next_sg) ? i : i+1;
-       iser_dbg("Found %d aligned entries out of %d in sg:0x%p\n",
-                ret_len, data->dma_nents, data);
+
+       if (unlikely(ret_len != data->dma_nents))
+               iser_warn("rdma alignment violation (%d/%d aligned)\n",
+                         ret_len, data->dma_nents);
+
        return ret_len;
 }
 
@@ -393,7 +471,7 @@ iser_reg_dma(struct iser_device *device, struct iser_data_buf *mem,
 {
        struct scatterlist *sg = mem->sg;
 
-       reg->sge.lkey = device->mr->lkey;
+       reg->sge.lkey = device->pd->local_dma_lkey;
        reg->rkey = device->mr->rkey;
        reg->sge.addr = ib_sg_dma_address(device->ib_device, &sg[0]);
        reg->sge.length = ib_sg_dma_len(device->ib_device, &sg[0]);
@@ -407,15 +485,12 @@ iser_reg_dma(struct iser_device *device, struct iser_data_buf *mem,
 
 static int fall_to_bounce_buf(struct iscsi_iser_task *iser_task,
                              struct iser_data_buf *mem,
-                             enum iser_data_dir cmd_dir,
-                             int aligned_len)
+                             enum iser_data_dir cmd_dir)
 {
        struct iscsi_conn *iscsi_conn = iser_task->iser_conn->iscsi_conn;
        struct iser_device *device = iser_task->iser_conn->ib_conn.device;
 
        iscsi_conn->fmr_unalign_cnt++;
-       iser_warn("rdma alignment violation (%d/%d aligned) or FMR not supported\n",
-                 aligned_len, mem->size);
 
        if (iser_debug_level > 0)
                iser_data_buf_dump(mem, device->ib_device);
@@ -439,13 +514,15 @@ static int fall_to_bounce_buf(struct iscsi_iser_task *iser_task,
  * returns: 0 on success, errno code on failure
  */
 static
-int iser_reg_page_vec(struct iscsi_iser_task *iser_task,
+int iser_fast_reg_fmr(struct iscsi_iser_task *iser_task,
                      struct iser_data_buf *mem,
-                     struct iser_page_vec *page_vec,
-                     struct iser_mem_reg *mem_reg)
+                     struct iser_reg_resources *rsc,
+                     struct iser_mem_reg *reg)
 {
        struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn;
        struct iser_device *device = ib_conn->device;
+       struct iser_page_vec *page_vec = rsc->page_vec;
+       struct ib_fmr_pool *fmr_pool = rsc->fmr_pool;
        struct ib_pool_fmr *fmr;
        int ret, plen;
 
@@ -461,7 +538,7 @@ int iser_reg_page_vec(struct iscsi_iser_task *iser_task,
                return -EINVAL;
        }
 
-       fmr  = ib_fmr_pool_map_phys(ib_conn->fmr.pool,
+       fmr  = ib_fmr_pool_map_phys(fmr_pool,
                                    page_vec->pages,
                                    page_vec->length,
                                    page_vec->pages[0]);
@@ -471,11 +548,15 @@ int iser_reg_page_vec(struct iscsi_iser_task *iser_task,
                return ret;
        }
 
-       mem_reg->sge.lkey = fmr->fmr->lkey;
-       mem_reg->rkey = fmr->fmr->rkey;
-       mem_reg->sge.addr = page_vec->pages[0] + page_vec->offset;
-       mem_reg->sge.length = page_vec->data_size;
-       mem_reg->mem_h = fmr;
+       reg->sge.lkey = fmr->fmr->lkey;
+       reg->rkey = fmr->fmr->rkey;
+       reg->sge.addr = page_vec->pages[0] + page_vec->offset;
+       reg->sge.length = page_vec->data_size;
+       reg->mem_h = fmr;
+
+       iser_dbg("fmr reg: lkey=0x%x, rkey=0x%x, addr=0x%llx,"
+                " length=0x%x\n", reg->sge.lkey, reg->rkey,
+                reg->sge.addr, reg->sge.length);
 
        return 0;
 }
@@ -505,71 +586,17 @@ void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task,
 void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task,
                            enum iser_data_dir cmd_dir)
 {
+       struct iser_device *device = iser_task->iser_conn->ib_conn.device;
        struct iser_mem_reg *reg = &iser_task->rdma_reg[cmd_dir];
 
        if (!reg->mem_h)
                return;
 
-       iser_reg_desc_put(&iser_task->iser_conn->ib_conn,
-                         reg->mem_h);
+       device->reg_ops->reg_desc_put(&iser_task->iser_conn->ib_conn,
+                                    reg->mem_h);
        reg->mem_h = NULL;
 }
 
-/**
- * iser_reg_rdma_mem_fmr - Registers memory intended for RDMA,
- * using FMR (if possible) obtaining rkey and va
- *
- * returns 0 on success, errno code on failure
- */
-int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *iser_task,
-                         enum iser_data_dir cmd_dir)
-{
-       struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn;
-       struct iser_device   *device = ib_conn->device;
-       struct ib_device     *ibdev = device->ib_device;
-       struct iser_data_buf *mem = &iser_task->data[cmd_dir];
-       struct iser_mem_reg *mem_reg;
-       int aligned_len;
-       int err;
-       int i;
-
-       mem_reg = &iser_task->rdma_reg[cmd_dir];
-
-       aligned_len = iser_data_buf_aligned_len(mem, ibdev);
-       if (aligned_len != mem->dma_nents) {
-               err = fall_to_bounce_buf(iser_task, mem,
-                                        cmd_dir, aligned_len);
-               if (err) {
-                       iser_err("failed to allocate bounce buffer\n");
-                       return err;
-               }
-       }
-
-       /* if there a single dma entry, FMR is not needed */
-       if (mem->dma_nents == 1) {
-               return iser_reg_dma(device, mem, mem_reg);
-       } else { /* use FMR for multiple dma entries */
-               err = iser_reg_page_vec(iser_task, mem, ib_conn->fmr.page_vec,
-                                       mem_reg);
-               if (err && err != -EAGAIN) {
-                       iser_data_buf_dump(mem, ibdev);
-                       iser_err("mem->dma_nents = %d (dlength = 0x%x)\n",
-                                mem->dma_nents,
-                                ntoh24(iser_task->desc.iscsi_header.dlength));
-                       iser_err("page_vec: data_size = 0x%x, length = %d, offset = 0x%x\n",
-                                ib_conn->fmr.page_vec->data_size,
-                                ib_conn->fmr.page_vec->length,
-                                ib_conn->fmr.page_vec->offset);
-                       for (i = 0; i < ib_conn->fmr.page_vec->length; i++)
-                               iser_err("page_vec[%d] = 0x%llx\n", i,
-                                        (unsigned long long)ib_conn->fmr.page_vec->pages[i]);
-               }
-               if (err)
-                       return err;
-       }
-       return 0;
-}
-
 static void
 iser_set_dif_domain(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs,
                    struct ib_sig_domain *domain)
@@ -637,10 +664,11 @@ iser_inv_rkey(struct ib_send_wr *inv_wr, struct ib_mr *mr)
 {
        u32 rkey;
 
-       memset(inv_wr, 0, sizeof(*inv_wr));
        inv_wr->opcode = IB_WR_LOCAL_INV;
        inv_wr->wr_id = ISER_FASTREG_LI_WRID;
        inv_wr->ex.invalidate_rkey = mr->rkey;
+       inv_wr->send_flags = 0;
+       inv_wr->num_sge = 0;
 
        rkey = ib_inc_rkey(mr->rkey);
        ib_update_fast_reg_key(mr, rkey);
@@ -648,61 +676,51 @@ iser_inv_rkey(struct ib_send_wr *inv_wr, struct ib_mr *mr)
 
 static int
 iser_reg_sig_mr(struct iscsi_iser_task *iser_task,
-               struct fast_reg_descriptor *desc,
+               struct iser_pi_context *pi_ctx,
                struct iser_mem_reg *data_reg,
                struct iser_mem_reg *prot_reg,
                struct iser_mem_reg *sig_reg)
 {
-       struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn;
-       struct iser_pi_context *pi_ctx = desc->pi_ctx;
-       struct ib_send_wr sig_wr, inv_wr;
-       struct ib_send_wr *bad_wr, *wr = NULL;
-       struct ib_sig_attrs sig_attrs;
+       struct iser_tx_desc *tx_desc = &iser_task->desc;
+       struct ib_sig_attrs *sig_attrs = &tx_desc->sig_attrs;
+       struct ib_send_wr *wr;
        int ret;
 
-       memset(&sig_attrs, 0, sizeof(sig_attrs));
-       ret = iser_set_sig_attrs(iser_task->sc, &sig_attrs);
+       memset(sig_attrs, 0, sizeof(*sig_attrs));
+       ret = iser_set_sig_attrs(iser_task->sc, sig_attrs);
        if (ret)
                goto err;
 
-       iser_set_prot_checks(iser_task->sc, &sig_attrs.check_mask);
+       iser_set_prot_checks(iser_task->sc, &sig_attrs->check_mask);
 
-       if (!(desc->reg_indicators & ISER_SIG_KEY_VALID)) {
-               iser_inv_rkey(&inv_wr, pi_ctx->sig_mr);
-               wr = &inv_wr;
+       if (!pi_ctx->sig_mr_valid) {
+               wr = iser_tx_next_wr(tx_desc);
+               iser_inv_rkey(wr, pi_ctx->sig_mr);
        }
 
-       memset(&sig_wr, 0, sizeof(sig_wr));
-       sig_wr.opcode = IB_WR_REG_SIG_MR;
-       sig_wr.wr_id = ISER_FASTREG_LI_WRID;
-       sig_wr.sg_list = &data_reg->sge;
-       sig_wr.num_sge = 1;
-       sig_wr.wr.sig_handover.sig_attrs = &sig_attrs;
-       sig_wr.wr.sig_handover.sig_mr = pi_ctx->sig_mr;
+       wr = iser_tx_next_wr(tx_desc);
+       wr->opcode = IB_WR_REG_SIG_MR;
+       wr->wr_id = ISER_FASTREG_LI_WRID;
+       wr->sg_list = &data_reg->sge;
+       wr->num_sge = 1;
+       wr->send_flags = 0;
+       wr->wr.sig_handover.sig_attrs = sig_attrs;
+       wr->wr.sig_handover.sig_mr = pi_ctx->sig_mr;
        if (scsi_prot_sg_count(iser_task->sc))
-               sig_wr.wr.sig_handover.prot = &prot_reg->sge;
-       sig_wr.wr.sig_handover.access_flags = IB_ACCESS_LOCAL_WRITE |
-                                             IB_ACCESS_REMOTE_READ |
-                                             IB_ACCESS_REMOTE_WRITE;
-
-       if (!wr)
-               wr = &sig_wr;
+               wr->wr.sig_handover.prot = &prot_reg->sge;
        else
-               wr->next = &sig_wr;
-
-       ret = ib_post_send(ib_conn->qp, wr, &bad_wr);
-       if (ret) {
-               iser_err("reg_sig_mr failed, ret:%d\n", ret);
-               goto err;
-       }
-       desc->reg_indicators &= ~ISER_SIG_KEY_VALID;
+               wr->wr.sig_handover.prot = NULL;
+       wr->wr.sig_handover.access_flags = IB_ACCESS_LOCAL_WRITE |
+                                          IB_ACCESS_REMOTE_READ |
+                                          IB_ACCESS_REMOTE_WRITE;
+       pi_ctx->sig_mr_valid = 0;
 
        sig_reg->sge.lkey = pi_ctx->sig_mr->lkey;
        sig_reg->rkey = pi_ctx->sig_mr->rkey;
        sig_reg->sge.addr = 0;
        sig_reg->sge.length = scsi_transfer_length(iser_task->sc);
 
-       iser_dbg("sig_sge: lkey: 0x%x, rkey: 0x%x, addr: 0x%llx, length: %u\n",
+       iser_dbg("sig reg: lkey: 0x%x, rkey: 0x%x, addr: 0x%llx, length: %u\n",
                 sig_reg->sge.lkey, sig_reg->rkey, sig_reg->sge.addr,
                 sig_reg->sge.length);
 err:
@@ -711,29 +729,16 @@ err:
 
 static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task,
                            struct iser_data_buf *mem,
-                           struct fast_reg_descriptor *desc,
-                           enum iser_reg_indicator ind,
+                           struct iser_reg_resources *rsc,
                            struct iser_mem_reg *reg)
 {
        struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn;
        struct iser_device *device = ib_conn->device;
-       struct ib_mr *mr;
-       struct ib_fast_reg_page_list *frpl;
-       struct ib_send_wr fastreg_wr, inv_wr;
-       struct ib_send_wr *bad_wr, *wr = NULL;
-       int ret, offset, size, plen;
-
-       /* if there a single dma entry, dma mr suffices */
-       if (mem->dma_nents == 1)
-               return iser_reg_dma(device, mem, reg);
-
-       if (ind == ISER_DATA_KEY_VALID) {
-               mr = desc->data_mr;
-               frpl = desc->data_frpl;
-       } else {
-               mr = desc->pi_ctx->prot_mr;
-               frpl = desc->pi_ctx->prot_frpl;
-       }
+       struct ib_mr *mr = rsc->mr;
+       struct ib_fast_reg_page_list *frpl = rsc->frpl;
+       struct iser_tx_desc *tx_desc = &iser_task->desc;
+       struct ib_send_wr *wr;
+       int offset, size, plen;
 
        plen = iser_sg_to_page_vec(mem, device->ib_device, frpl->page_list,
                                   &offset, &size);
@@ -742,118 +747,151 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task,
                return -EINVAL;
        }
 
-       if (!(desc->reg_indicators & ind)) {
-               iser_inv_rkey(&inv_wr, mr);
-               wr = &inv_wr;
+       if (!rsc->mr_valid) {
+               wr = iser_tx_next_wr(tx_desc);
+               iser_inv_rkey(wr, mr);
        }
 
-       /* Prepare FASTREG WR */
-       memset(&fastreg_wr, 0, sizeof(fastreg_wr));
-       fastreg_wr.wr_id = ISER_FASTREG_LI_WRID;
-       fastreg_wr.opcode = IB_WR_FAST_REG_MR;
-       fastreg_wr.wr.fast_reg.iova_start = frpl->page_list[0] + offset;
-       fastreg_wr.wr.fast_reg.page_list = frpl;
-       fastreg_wr.wr.fast_reg.page_list_len = plen;
-       fastreg_wr.wr.fast_reg.page_shift = SHIFT_4K;
-       fastreg_wr.wr.fast_reg.length = size;
-       fastreg_wr.wr.fast_reg.rkey = mr->rkey;
-       fastreg_wr.wr.fast_reg.access_flags = (IB_ACCESS_LOCAL_WRITE  |
-                                              IB_ACCESS_REMOTE_WRITE |
-                                              IB_ACCESS_REMOTE_READ);
-
-       if (!wr)
-               wr = &fastreg_wr;
-       else
-               wr->next = &fastreg_wr;
-
-       ret = ib_post_send(ib_conn->qp, wr, &bad_wr);
-       if (ret) {
-               iser_err("fast registration failed, ret:%d\n", ret);
-               return ret;
-       }
-       desc->reg_indicators &= ~ind;
+       wr = iser_tx_next_wr(tx_desc);
+       wr->opcode = IB_WR_FAST_REG_MR;
+       wr->wr_id = ISER_FASTREG_LI_WRID;
+       wr->send_flags = 0;
+       wr->wr.fast_reg.iova_start = frpl->page_list[0] + offset;
+       wr->wr.fast_reg.page_list = frpl;
+       wr->wr.fast_reg.page_list_len = plen;
+       wr->wr.fast_reg.page_shift = SHIFT_4K;
+       wr->wr.fast_reg.length = size;
+       wr->wr.fast_reg.rkey = mr->rkey;
+       wr->wr.fast_reg.access_flags = (IB_ACCESS_LOCAL_WRITE  |
+                                       IB_ACCESS_REMOTE_WRITE |
+                                       IB_ACCESS_REMOTE_READ);
+       rsc->mr_valid = 0;
 
        reg->sge.lkey = mr->lkey;
        reg->rkey = mr->rkey;
        reg->sge.addr = frpl->page_list[0] + offset;
        reg->sge.length = size;
 
-       return ret;
+       iser_dbg("fast reg: lkey=0x%x, rkey=0x%x, addr=0x%llx,"
+                " length=0x%x\n", reg->sge.lkey, reg->rkey,
+                reg->sge.addr, reg->sge.length);
+
+       return 0;
 }
 
-/**
- * iser_reg_rdma_mem_fastreg - Registers memory intended for RDMA,
- * using Fast Registration WR (if possible) obtaining rkey and va
- *
- * returns 0 on success, errno code on failure
- */
-int iser_reg_rdma_mem_fastreg(struct iscsi_iser_task *iser_task,
-                             enum iser_data_dir cmd_dir)
+static int
+iser_handle_unaligned_buf(struct iscsi_iser_task *task,
+                         struct iser_data_buf *mem,
+                         enum iser_data_dir dir)
 {
-       struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn;
-       struct iser_device *device = ib_conn->device;
-       struct ib_device *ibdev = device->ib_device;
-       struct iser_data_buf *mem = &iser_task->data[cmd_dir];
-       struct iser_mem_reg *mem_reg = &iser_task->rdma_reg[cmd_dir];
-       struct fast_reg_descriptor *desc = NULL;
+       struct iser_conn *iser_conn = task->iser_conn;
+       struct iser_device *device = iser_conn->ib_conn.device;
        int err, aligned_len;
 
-       aligned_len = iser_data_buf_aligned_len(mem, ibdev);
+       aligned_len = iser_data_buf_aligned_len(mem, device->ib_device,
+                                               iser_conn->scsi_sg_tablesize);
        if (aligned_len != mem->dma_nents) {
-               err = fall_to_bounce_buf(iser_task, mem,
-                                        cmd_dir, aligned_len);
-               if (err) {
-                       iser_err("failed to allocate bounce buffer\n");
+               err = fall_to_bounce_buf(task, mem, dir);
+               if (err)
                        return err;
-               }
        }
 
+       return 0;
+}
+
+static int
+iser_reg_prot_sg(struct iscsi_iser_task *task,
+                struct iser_data_buf *mem,
+                struct iser_fr_desc *desc,
+                struct iser_mem_reg *reg)
+{
+       struct iser_device *device = task->iser_conn->ib_conn.device;
+
+       if (mem->dma_nents == 1)
+               return iser_reg_dma(device, mem, reg);
+
+       return device->reg_ops->reg_mem(task, mem, &desc->pi_ctx->rsc, reg);
+}
+
+static int
+iser_reg_data_sg(struct iscsi_iser_task *task,
+                struct iser_data_buf *mem,
+                struct iser_fr_desc *desc,
+                struct iser_mem_reg *reg)
+{
+       struct iser_device *device = task->iser_conn->ib_conn.device;
+
+       if (mem->dma_nents == 1)
+               return iser_reg_dma(device, mem, reg);
+
+       return device->reg_ops->reg_mem(task, mem, &desc->rsc, reg);
+}
+
+int iser_reg_rdma_mem(struct iscsi_iser_task *task,
+                     enum iser_data_dir dir)
+{
+       struct ib_conn *ib_conn = &task->iser_conn->ib_conn;
+       struct iser_device *device = ib_conn->device;
+       struct iser_data_buf *mem = &task->data[dir];
+       struct iser_mem_reg *reg = &task->rdma_reg[dir];
+       struct iser_mem_reg *data_reg;
+       struct iser_fr_desc *desc = NULL;
+       int err;
+
+       err = iser_handle_unaligned_buf(task, mem, dir);
+       if (unlikely(err))
+               return err;
+
        if (mem->dma_nents != 1 ||
-           scsi_get_prot_op(iser_task->sc) != SCSI_PROT_NORMAL) {
-               desc = iser_reg_desc_get(ib_conn);
-               mem_reg->mem_h = desc;
+           scsi_get_prot_op(task->sc) != SCSI_PROT_NORMAL) {
+               desc = device->reg_ops->reg_desc_get(ib_conn);
+               reg->mem_h = desc;
        }
 
-       err = iser_fast_reg_mr(iser_task, mem, desc,
-                              ISER_DATA_KEY_VALID, mem_reg);
-       if (err)
+       if (scsi_get_prot_op(task->sc) == SCSI_PROT_NORMAL)
+               data_reg = reg;
+       else
+               data_reg = &task->desc.data_reg;
+
+       err = iser_reg_data_sg(task, mem, desc, data_reg);
+       if (unlikely(err))
                goto err_reg;
 
-       if (scsi_get_prot_op(iser_task->sc) != SCSI_PROT_NORMAL) {
-               struct iser_mem_reg prot_reg;
-
-               memset(&prot_reg, 0, sizeof(prot_reg));
-               if (scsi_prot_sg_count(iser_task->sc)) {
-                       mem = &iser_task->prot[cmd_dir];
-                       aligned_len = iser_data_buf_aligned_len(mem, ibdev);
-                       if (aligned_len != mem->dma_nents) {
-                               err = fall_to_bounce_buf(iser_task, mem,
-                                                        cmd_dir, aligned_len);
-                               if (err) {
-                                       iser_err("failed to allocate bounce buffer\n");
-                                       return err;
-                               }
-                       }
+       if (scsi_get_prot_op(task->sc) != SCSI_PROT_NORMAL) {
+               struct iser_mem_reg *prot_reg = &task->desc.prot_reg;
 
-                       err = iser_fast_reg_mr(iser_task, mem, desc,
-                                              ISER_PROT_KEY_VALID, &prot_reg);
-                       if (err)
+               if (scsi_prot_sg_count(task->sc)) {
+                       mem = &task->prot[dir];
+                       err = iser_handle_unaligned_buf(task, mem, dir);
+                       if (unlikely(err))
                                goto err_reg;
-               }
 
-               err = iser_reg_sig_mr(iser_task, desc, mem_reg,
-                                     &prot_reg, mem_reg);
-               if (err) {
-                       iser_err("Failed to register signature mr\n");
-                       return err;
+                       err = iser_reg_prot_sg(task, mem, desc, prot_reg);
+                       if (unlikely(err))
+                               goto err_reg;
                }
-               desc->reg_indicators |= ISER_FASTREG_PROTECTED;
+
+               err = iser_reg_sig_mr(task, desc->pi_ctx, data_reg,
+                                     prot_reg, reg);
+               if (unlikely(err))
+                       goto err_reg;
+
+               desc->pi_ctx->sig_protected = 1;
        }
 
        return 0;
+
 err_reg:
        if (desc)
-               iser_reg_desc_put(ib_conn, desc);
+               device->reg_ops->reg_desc_put(ib_conn, desc);
 
        return err;
 }
+
+void iser_unreg_rdma_mem(struct iscsi_iser_task *task,
+                        enum iser_data_dir dir)
+{
+       struct iser_device *device = task->iser_conn->ib_conn.device;
+
+       device->reg_ops->unreg_mem(task, dir);
+}
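Taken together, the hunks above make iser_reg_rdma_mem() drive registration purely through the per-device reg_ops table. A condensed sketch of the non-protected data path (error handling and the signature/PI branch omitted; this paraphrases the flow above rather than adding patch code):

        struct iser_mem_reg *reg = &task->rdma_reg[dir];
        struct iser_fr_desc *desc;
        int err;

        if (mem->dma_nents == 1)                        /* single SG entry: plain DMA lkey */
                return iser_reg_dma(device, mem, reg);

        desc = device->reg_ops->reg_desc_get(ib_conn);  /* pop a descriptor from fr_pool */
        reg->mem_h = desc;
        err = device->reg_ops->reg_mem(task, mem, &desc->rsc, reg);
        if (err)
                device->reg_ops->reg_desc_put(ib_conn, desc);   /* return it on failure */
        return err;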
index 5c9f565ea0e88840c3c3c7a031e20180c3c8025d..ae70cc1463ac2b75d7eae512bf3224e90ba2d59f 100644 (file)
@@ -87,25 +87,9 @@ static int iser_create_device_ib_res(struct iser_device *device)
                return ret;
        }
 
-       /* Assign function handles  - based on FMR support */
-       if (device->ib_device->alloc_fmr && device->ib_device->dealloc_fmr &&
-           device->ib_device->map_phys_fmr && device->ib_device->unmap_fmr) {
-               iser_info("FMR supported, using FMR for registration\n");
-               device->iser_alloc_rdma_reg_res = iser_create_fmr_pool;
-               device->iser_free_rdma_reg_res = iser_free_fmr_pool;
-               device->iser_reg_rdma_mem = iser_reg_rdma_mem_fmr;
-               device->iser_unreg_rdma_mem = iser_unreg_mem_fmr;
-       } else
-       if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
-               iser_info("FastReg supported, using FastReg for registration\n");
-               device->iser_alloc_rdma_reg_res = iser_create_fastreg_pool;
-               device->iser_free_rdma_reg_res = iser_free_fastreg_pool;
-               device->iser_reg_rdma_mem = iser_reg_rdma_mem_fastreg;
-               device->iser_unreg_rdma_mem = iser_unreg_mem_fastreg;
-       } else {
-               iser_err("IB device does not support FMRs nor FastRegs, can't register memory\n");
-               return -1;
-       }
+       ret = iser_assign_reg_ops(device);
+       if (ret)
+               return ret;
 
        device->comps_used = min_t(int, num_online_cpus(),
                                 device->ib_device->num_comp_vectors);
@@ -201,7 +185,7 @@ static void iser_free_device_ib_res(struct iser_device *device)
 
        (void)ib_unregister_event_handler(&device->event_handler);
        (void)ib_dereg_mr(device->mr);
-       (void)ib_dealloc_pd(device->pd);
+       ib_dealloc_pd(device->pd);
 
        kfree(device->comps);
        device->comps = NULL;
@@ -211,28 +195,40 @@ static void iser_free_device_ib_res(struct iser_device *device)
 }
 
 /**
- * iser_create_fmr_pool - Creates FMR pool and page_vector
+ * iser_alloc_fmr_pool - Creates FMR pool and page_vector
  *
  * returns 0 on success, or errno code on failure
  */
-int iser_create_fmr_pool(struct ib_conn *ib_conn, unsigned cmds_max)
+int iser_alloc_fmr_pool(struct ib_conn *ib_conn,
+                       unsigned cmds_max,
+                       unsigned int size)
 {
        struct iser_device *device = ib_conn->device;
+       struct iser_fr_pool *fr_pool = &ib_conn->fr_pool;
+       struct iser_page_vec *page_vec;
+       struct iser_fr_desc *desc;
+       struct ib_fmr_pool *fmr_pool;
        struct ib_fmr_pool_param params;
-       int ret = -ENOMEM;
+       int ret;
 
-       ib_conn->fmr.page_vec = kmalloc(sizeof(*ib_conn->fmr.page_vec) +
-                                       (sizeof(u64)*(ISCSI_ISER_SG_TABLESIZE + 1)),
-                                       GFP_KERNEL);
-       if (!ib_conn->fmr.page_vec)
-               return ret;
+       INIT_LIST_HEAD(&fr_pool->list);
+       spin_lock_init(&fr_pool->lock);
+
+       desc = kzalloc(sizeof(*desc), GFP_KERNEL);
+       if (!desc)
+               return -ENOMEM;
 
-       ib_conn->fmr.page_vec->pages = (u64 *)(ib_conn->fmr.page_vec + 1);
+       page_vec = kmalloc(sizeof(*page_vec) + (sizeof(u64) * size),
+                          GFP_KERNEL);
+       if (!page_vec) {
+               ret = -ENOMEM;
+               goto err_frpl;
+       }
+
+       page_vec->pages = (u64 *)(page_vec + 1);
 
        params.page_shift        = SHIFT_4K;
-       /* when the first/last SG element are not start/end *
-        * page aligned, the map whould be of N+1 pages     */
-       params.max_pages_per_fmr = ISCSI_ISER_SG_TABLESIZE + 1;
+       params.max_pages_per_fmr = size;
        /* make the pool size twice the max number of SCSI commands *
         * the ML is expected to queue, watermark for unmap at 50%  */
        params.pool_size         = cmds_max * 2;
@@ -243,23 +239,25 @@ int iser_create_fmr_pool(struct ib_conn *ib_conn, unsigned cmds_max)
                                    IB_ACCESS_REMOTE_WRITE |
                                    IB_ACCESS_REMOTE_READ);
 
-       ib_conn->fmr.pool = ib_create_fmr_pool(device->pd, &params);
-       if (!IS_ERR(ib_conn->fmr.pool))
-               return 0;
-
-       /* no FMR => no need for page_vec */
-       kfree(ib_conn->fmr.page_vec);
-       ib_conn->fmr.page_vec = NULL;
-
-       ret = PTR_ERR(ib_conn->fmr.pool);
-       ib_conn->fmr.pool = NULL;
-       if (ret != -ENOSYS) {
+       fmr_pool = ib_create_fmr_pool(device->pd, &params);
+       if (IS_ERR(fmr_pool)) {
+               ret = PTR_ERR(fmr_pool);
                iser_err("FMR allocation failed, err %d\n", ret);
-               return ret;
-       } else {
-               iser_warn("FMRs are not supported, using unaligned mode\n");
-               return 0;
+               goto err_fmr;
        }
+
+       desc->rsc.page_vec = page_vec;
+       desc->rsc.fmr_pool = fmr_pool;
+       list_add(&desc->list, &fr_pool->list);
+
+       return 0;
+
+err_fmr:
+       kfree(page_vec);
+err_frpl:
+       kfree(desc);
+
+       return ret;
 }
 
 /**
@@ -267,26 +265,68 @@ int iser_create_fmr_pool(struct ib_conn *ib_conn, unsigned cmds_max)
  */
 void iser_free_fmr_pool(struct ib_conn *ib_conn)
 {
+       struct iser_fr_pool *fr_pool = &ib_conn->fr_pool;
+       struct iser_fr_desc *desc;
+
+       desc = list_first_entry(&fr_pool->list,
+                               struct iser_fr_desc, list);
+       list_del(&desc->list);
+
        iser_info("freeing conn %p fmr pool %p\n",
-                 ib_conn, ib_conn->fmr.pool);
+                 ib_conn, desc->rsc.fmr_pool);
+
+       ib_destroy_fmr_pool(desc->rsc.fmr_pool);
+       kfree(desc->rsc.page_vec);
+       kfree(desc);
+}
+
+static int
+iser_alloc_reg_res(struct ib_device *ib_device,
+                  struct ib_pd *pd,
+                  struct iser_reg_resources *res,
+                  unsigned int size)
+{
+       int ret;
+
+       res->frpl = ib_alloc_fast_reg_page_list(ib_device, size);
+       if (IS_ERR(res->frpl)) {
+               ret = PTR_ERR(res->frpl);
+               iser_err("Failed to allocate ib_fast_reg_page_list err=%d\n",
+                        ret);
+               return PTR_ERR(res->frpl);
+       }
+
+       res->mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, size);
+       if (IS_ERR(res->mr)) {
+               ret = PTR_ERR(res->mr);
+               iser_err("Failed to allocate ib_fast_reg_mr err=%d\n", ret);
+               goto fast_reg_mr_failure;
+       }
+       res->mr_valid = 1;
 
-       if (ib_conn->fmr.pool != NULL)
-               ib_destroy_fmr_pool(ib_conn->fmr.pool);
+       return 0;
+
+fast_reg_mr_failure:
+       ib_free_fast_reg_page_list(res->frpl);
 
-       ib_conn->fmr.pool = NULL;
+       return ret;
+}
 
-       kfree(ib_conn->fmr.page_vec);
-       ib_conn->fmr.page_vec = NULL;
+static void
+iser_free_reg_res(struct iser_reg_resources *rsc)
+{
+       ib_dereg_mr(rsc->mr);
+       ib_free_fast_reg_page_list(rsc->frpl);
 }
 
 static int
-iser_alloc_pi_ctx(struct ib_device *ib_device, struct ib_pd *pd,
-                 struct fast_reg_descriptor *desc)
+iser_alloc_pi_ctx(struct ib_device *ib_device,
+                 struct ib_pd *pd,
+                 struct iser_fr_desc *desc,
+                 unsigned int size)
 {
        struct iser_pi_context *pi_ctx = NULL;
-       struct ib_mr_init_attr mr_init_attr = {.max_reg_descriptors = 2,
-                                              .flags = IB_MR_SIGNATURE_EN};
-       int ret = 0;
+       int ret;
 
        desc->pi_ctx = kzalloc(sizeof(*desc->pi_ctx), GFP_KERNEL);
        if (!desc->pi_ctx)
@@ -294,36 +334,25 @@ iser_alloc_pi_ctx(struct ib_device *ib_device, struct ib_pd *pd,
 
        pi_ctx = desc->pi_ctx;
 
-       pi_ctx->prot_frpl = ib_alloc_fast_reg_page_list(ib_device,
-                                           ISCSI_ISER_SG_TABLESIZE);
-       if (IS_ERR(pi_ctx->prot_frpl)) {
-               ret = PTR_ERR(pi_ctx->prot_frpl);
-               goto prot_frpl_failure;
-       }
-
-       pi_ctx->prot_mr = ib_alloc_fast_reg_mr(pd,
-                                       ISCSI_ISER_SG_TABLESIZE + 1);
-       if (IS_ERR(pi_ctx->prot_mr)) {
-               ret = PTR_ERR(pi_ctx->prot_mr);
-               goto prot_mr_failure;
+       ret = iser_alloc_reg_res(ib_device, pd, &pi_ctx->rsc, size);
+       if (ret) {
+               iser_err("failed to allocate reg_resources\n");
+               goto alloc_reg_res_err;
        }
-       desc->reg_indicators |= ISER_PROT_KEY_VALID;
 
-       pi_ctx->sig_mr = ib_create_mr(pd, &mr_init_attr);
+       pi_ctx->sig_mr = ib_alloc_mr(pd, IB_MR_TYPE_SIGNATURE, 2);
        if (IS_ERR(pi_ctx->sig_mr)) {
                ret = PTR_ERR(pi_ctx->sig_mr);
                goto sig_mr_failure;
        }
-       desc->reg_indicators |= ISER_SIG_KEY_VALID;
-       desc->reg_indicators &= ~ISER_FASTREG_PROTECTED;
+       pi_ctx->sig_mr_valid = 1;
+       desc->pi_ctx->sig_protected = 0;
 
        return 0;
 
 sig_mr_failure:
-       ib_dereg_mr(desc->pi_ctx->prot_mr);
-prot_mr_failure:
-       ib_free_fast_reg_page_list(desc->pi_ctx->prot_frpl);
-prot_frpl_failure:
+       iser_free_reg_res(&pi_ctx->rsc);
+alloc_reg_res_err:
        kfree(desc->pi_ctx);
 
        return ret;
@@ -332,82 +361,71 @@ prot_frpl_failure:
 static void
 iser_free_pi_ctx(struct iser_pi_context *pi_ctx)
 {
-       ib_free_fast_reg_page_list(pi_ctx->prot_frpl);
-       ib_dereg_mr(pi_ctx->prot_mr);
-       ib_destroy_mr(pi_ctx->sig_mr);
+       iser_free_reg_res(&pi_ctx->rsc);
+       ib_dereg_mr(pi_ctx->sig_mr);
        kfree(pi_ctx);
 }
 
-static int
-iser_create_fastreg_desc(struct ib_device *ib_device, struct ib_pd *pd,
-                        bool pi_enable, struct fast_reg_descriptor *desc)
+static struct iser_fr_desc *
+iser_create_fastreg_desc(struct ib_device *ib_device,
+                        struct ib_pd *pd,
+                        bool pi_enable,
+                        unsigned int size)
 {
+       struct iser_fr_desc *desc;
        int ret;
 
-       desc->data_frpl = ib_alloc_fast_reg_page_list(ib_device,
-                                                     ISCSI_ISER_SG_TABLESIZE + 1);
-       if (IS_ERR(desc->data_frpl)) {
-               ret = PTR_ERR(desc->data_frpl);
-               iser_err("Failed to allocate ib_fast_reg_page_list err=%d\n",
-                        ret);
-               return PTR_ERR(desc->data_frpl);
-       }
+       desc = kzalloc(sizeof(*desc), GFP_KERNEL);
+       if (!desc)
+               return ERR_PTR(-ENOMEM);
 
-       desc->data_mr = ib_alloc_fast_reg_mr(pd, ISCSI_ISER_SG_TABLESIZE + 1);
-       if (IS_ERR(desc->data_mr)) {
-               ret = PTR_ERR(desc->data_mr);
-               iser_err("Failed to allocate ib_fast_reg_mr err=%d\n", ret);
-               goto fast_reg_mr_failure;
-       }
-       desc->reg_indicators |= ISER_DATA_KEY_VALID;
+       ret = iser_alloc_reg_res(ib_device, pd, &desc->rsc, size);
+       if (ret)
+               goto reg_res_alloc_failure;
 
        if (pi_enable) {
-               ret = iser_alloc_pi_ctx(ib_device, pd, desc);
+               ret = iser_alloc_pi_ctx(ib_device, pd, desc, size);
                if (ret)
                        goto pi_ctx_alloc_failure;
        }
 
-       return 0;
+       return desc;
+
 pi_ctx_alloc_failure:
-       ib_dereg_mr(desc->data_mr);
-fast_reg_mr_failure:
-       ib_free_fast_reg_page_list(desc->data_frpl);
+       iser_free_reg_res(&desc->rsc);
+reg_res_alloc_failure:
+       kfree(desc);
 
-       return ret;
+       return ERR_PTR(ret);
 }
 
 /**
- * iser_create_fastreg_pool - Creates pool of fast_reg descriptors
+ * iser_alloc_fastreg_pool - Creates pool of fast_reg descriptors
  * for fast registration work requests.
  * returns 0 on success, or errno code on failure
  */
-int iser_create_fastreg_pool(struct ib_conn *ib_conn, unsigned cmds_max)
+int iser_alloc_fastreg_pool(struct ib_conn *ib_conn,
+                           unsigned cmds_max,
+                           unsigned int size)
 {
        struct iser_device *device = ib_conn->device;
-       struct fast_reg_descriptor *desc;
+       struct iser_fr_pool *fr_pool = &ib_conn->fr_pool;
+       struct iser_fr_desc *desc;
        int i, ret;
 
-       INIT_LIST_HEAD(&ib_conn->fastreg.pool);
-       ib_conn->fastreg.pool_size = 0;
+       INIT_LIST_HEAD(&fr_pool->list);
+       spin_lock_init(&fr_pool->lock);
+       fr_pool->size = 0;
        for (i = 0; i < cmds_max; i++) {
-               desc = kzalloc(sizeof(*desc), GFP_KERNEL);
-               if (!desc) {
-                       iser_err("Failed to allocate a new fast_reg descriptor\n");
-                       ret = -ENOMEM;
-                       goto err;
-               }
-
-               ret = iser_create_fastreg_desc(device->ib_device, device->pd,
-                                              ib_conn->pi_support, desc);
-               if (ret) {
-                       iser_err("Failed to create fastreg descriptor err=%d\n",
-                                ret);
-                       kfree(desc);
+               desc = iser_create_fastreg_desc(device->ib_device, device->pd,
+                                               ib_conn->pi_support, size);
+               if (IS_ERR(desc)) {
+                       ret = PTR_ERR(desc);
                        goto err;
                }
 
-               list_add_tail(&desc->list, &ib_conn->fastreg.pool);
-               ib_conn->fastreg.pool_size++;
+               list_add_tail(&desc->list, &fr_pool->list);
+               fr_pool->size++;
        }
 
        return 0;
@@ -422,27 +440,27 @@ err:
  */
 void iser_free_fastreg_pool(struct ib_conn *ib_conn)
 {
-       struct fast_reg_descriptor *desc, *tmp;
+       struct iser_fr_pool *fr_pool = &ib_conn->fr_pool;
+       struct iser_fr_desc *desc, *tmp;
        int i = 0;
 
-       if (list_empty(&ib_conn->fastreg.pool))
+       if (list_empty(&fr_pool->list))
                return;
 
        iser_info("freeing conn %p fr pool\n", ib_conn);
 
-       list_for_each_entry_safe(desc, tmp, &ib_conn->fastreg.pool, list) {
+       list_for_each_entry_safe(desc, tmp, &fr_pool->list, list) {
                list_del(&desc->list);
-               ib_free_fast_reg_page_list(desc->data_frpl);
-               ib_dereg_mr(desc->data_mr);
+               iser_free_reg_res(&desc->rsc);
                if (desc->pi_ctx)
                        iser_free_pi_ctx(desc->pi_ctx);
                kfree(desc);
                ++i;
        }
 
-       if (i < ib_conn->fastreg.pool_size)
+       if (i < fr_pool->size)
                iser_warn("pool still has %d regions registered\n",
-                         ib_conn->fastreg.pool_size - i);
+                         fr_pool->size - i);
 }
 
 /**
@@ -738,6 +756,31 @@ static void iser_connect_error(struct rdma_cm_id *cma_id)
        iser_conn->state = ISER_CONN_TERMINATING;
 }
 
+static void
+iser_calc_scsi_params(struct iser_conn *iser_conn,
+                     unsigned int max_sectors)
+{
+       struct iser_device *device = iser_conn->ib_conn.device;
+       unsigned short sg_tablesize, sup_sg_tablesize;
+
+       sg_tablesize = DIV_ROUND_UP(max_sectors * 512, SIZE_4K);
+       sup_sg_tablesize = min_t(unsigned, ISCSI_ISER_MAX_SG_TABLESIZE,
+                                device->dev_attr.max_fast_reg_page_list_len);
+
+       if (sg_tablesize > sup_sg_tablesize) {
+               sg_tablesize = sup_sg_tablesize;
+               iser_conn->scsi_max_sectors = sg_tablesize * SIZE_4K / 512;
+       } else {
+               iser_conn->scsi_max_sectors = max_sectors;
+       }
+
+       iser_conn->scsi_sg_tablesize = sg_tablesize;
+
+       iser_dbg("iser_conn %p, sg_tablesize %u, max_sectors %u\n",
+                iser_conn, iser_conn->scsi_sg_tablesize,
+                iser_conn->scsi_max_sectors);
+}
+
 /**
  * Called with state mutex held
  **/
@@ -776,6 +819,8 @@ static void iser_addr_handler(struct rdma_cm_id *cma_id)
                }
        }
 
+       iser_calc_scsi_params(iser_conn, iser_max_sectors);
+
        ret = rdma_resolve_route(cma_id, 1000);
        if (ret) {
                iser_err("resolve route failed: %d\n", ret);
@@ -938,7 +983,6 @@ void iser_conn_init(struct iser_conn *iser_conn)
        init_completion(&iser_conn->ib_completion);
        init_completion(&iser_conn->up_completion);
        INIT_LIST_HEAD(&iser_conn->conn_list);
-       spin_lock_init(&iser_conn->ib_conn.lock);
        mutex_init(&iser_conn->state_mutex);
 }
 
@@ -1017,7 +1061,7 @@ int iser_post_recvl(struct iser_conn *iser_conn)
 
        sge.addr   = iser_conn->login_resp_dma;
        sge.length = ISER_RX_LOGIN_SIZE;
-       sge.lkey   = ib_conn->device->mr->lkey;
+       sge.lkey   = ib_conn->device->pd->local_dma_lkey;
 
        rx_wr.wr_id   = (uintptr_t)iser_conn->login_resp_buf;
        rx_wr.sg_list = &sge;
@@ -1072,23 +1116,24 @@ int iser_post_recvm(struct iser_conn *iser_conn, int count)
 int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc,
                   bool signal)
 {
-       int               ib_ret;
-       struct ib_send_wr send_wr, *send_wr_failed;
+       struct ib_send_wr *bad_wr, *wr = iser_tx_next_wr(tx_desc);
+       int ib_ret;
 
        ib_dma_sync_single_for_device(ib_conn->device->ib_device,
                                      tx_desc->dma_addr, ISER_HEADERS_LEN,
                                      DMA_TO_DEVICE);
 
-       send_wr.next       = NULL;
-       send_wr.wr_id      = (uintptr_t)tx_desc;
-       send_wr.sg_list    = tx_desc->tx_sg;
-       send_wr.num_sge    = tx_desc->num_sge;
-       send_wr.opcode     = IB_WR_SEND;
-       send_wr.send_flags = signal ? IB_SEND_SIGNALED : 0;
+       wr->next = NULL;
+       wr->wr_id = (uintptr_t)tx_desc;
+       wr->sg_list = tx_desc->tx_sg;
+       wr->num_sge = tx_desc->num_sge;
+       wr->opcode = IB_WR_SEND;
+       wr->send_flags = signal ? IB_SEND_SIGNALED : 0;
 
-       ib_ret = ib_post_send(ib_conn->qp, &send_wr, &send_wr_failed);
+       ib_ret = ib_post_send(ib_conn->qp, &tx_desc->wrs[0], &bad_wr);
        if (ib_ret)
-               iser_err("ib_post_send failed, ret:%d\n", ib_ret);
+               iser_err("ib_post_send failed, ret:%d opcode:%d\n",
+                        ib_ret, bad_wr->opcode);
 
        return ib_ret;
 }
@@ -1240,13 +1285,13 @@ u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task,
                             enum iser_data_dir cmd_dir, sector_t *sector)
 {
        struct iser_mem_reg *reg = &iser_task->rdma_reg[cmd_dir];
-       struct fast_reg_descriptor *desc = reg->mem_h;
+       struct iser_fr_desc *desc = reg->mem_h;
        unsigned long sector_size = iser_task->sc->device->sector_size;
        struct ib_mr_status mr_status;
        int ret;
 
-       if (desc && desc->reg_indicators & ISER_FASTREG_PROTECTED) {
-               desc->reg_indicators &= ~ISER_FASTREG_PROTECTED;
+       if (desc && desc->pi_ctx->sig_protected) {
+               desc->pi_ctx->sig_protected = 0;
                ret = ib_check_mr_status(desc->pi_ctx->sig_mr,
                                         IB_MR_CHECK_SIG_STATUS, &mr_status);
                if (ret) {
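iser_calc_scsi_params() above derives the SCSI host limits from the requested transfer size. A worked example, assuming max_sectors = 1024 and that both ISCSI_ISER_MAX_SG_TABLESIZE and the device's max_fast_reg_page_list_len are at least 128 (values picked for illustration only):

        sg_tablesize      = DIV_ROUND_UP(1024 * 512, 4096) = 128
        sup_sg_tablesize >= 128, so the request fits:
        scsi_sg_tablesize = 128, scsi_max_sectors = 1024

Had the device supported only 64 pages per fast-reg page list, sg_tablesize would be clamped to 64 and scsi_max_sectors reduced to 64 * 4096 / 512 = 512.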
index d851e1828d6f5152e9c8ca49de9b64a3b953f180..403bd29443b8e7d06ac1a16cdae826088af98efe 100644 (file)
@@ -235,7 +235,7 @@ isert_alloc_rx_descriptors(struct isert_conn *isert_conn)
                rx_sg = &rx_desc->rx_sg;
                rx_sg->addr = rx_desc->dma_addr;
                rx_sg->length = ISER_RX_PAYLOAD_SIZE;
-               rx_sg->lkey = device->mr->lkey;
+               rx_sg->lkey = device->pd->local_dma_lkey;
        }
 
        isert_conn->rx_desc_head = 0;
@@ -385,22 +385,12 @@ isert_create_device_ib_res(struct isert_device *device)
                goto out_cq;
        }
 
-       device->mr = ib_get_dma_mr(device->pd, IB_ACCESS_LOCAL_WRITE);
-       if (IS_ERR(device->mr)) {
-               ret = PTR_ERR(device->mr);
-               isert_err("failed to create dma mr, device %p, ret=%d\n",
-                         device, ret);
-               goto out_mr;
-       }
-
        /* Check signature cap */
        device->pi_capable = dev_attr->device_cap_flags &
                             IB_DEVICE_SIGNATURE_HANDOVER ? true : false;
 
        return 0;
 
-out_mr:
-       ib_dealloc_pd(device->pd);
 out_cq:
        isert_free_comps(device);
        return ret;
@@ -411,7 +401,6 @@ isert_free_device_ib_res(struct isert_device *device)
 {
        isert_info("device %p\n", device);
 
-       ib_dereg_mr(device->mr);
        ib_dealloc_pd(device->pd);
        isert_free_comps(device);
 }
@@ -491,7 +480,7 @@ isert_conn_free_fastreg_pool(struct isert_conn *isert_conn)
                if (fr_desc->pi_ctx) {
                        ib_free_fast_reg_page_list(fr_desc->pi_ctx->prot_frpl);
                        ib_dereg_mr(fr_desc->pi_ctx->prot_mr);
-                       ib_destroy_mr(fr_desc->pi_ctx->sig_mr);
+                       ib_dereg_mr(fr_desc->pi_ctx->sig_mr);
                        kfree(fr_desc->pi_ctx);
                }
                kfree(fr_desc);
@@ -508,7 +497,6 @@ isert_create_pi_ctx(struct fast_reg_descriptor *desc,
                    struct ib_device *device,
                    struct ib_pd *pd)
 {
-       struct ib_mr_init_attr mr_init_attr;
        struct pi_context *pi_ctx;
        int ret;
 
@@ -527,7 +515,8 @@ isert_create_pi_ctx(struct fast_reg_descriptor *desc,
                goto err_pi_ctx;
        }
 
-       pi_ctx->prot_mr = ib_alloc_fast_reg_mr(pd, ISCSI_ISER_SG_TABLESIZE);
+       pi_ctx->prot_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG,
+                                     ISCSI_ISER_SG_TABLESIZE);
        if (IS_ERR(pi_ctx->prot_mr)) {
                isert_err("Failed to allocate prot frmr err=%ld\n",
                          PTR_ERR(pi_ctx->prot_mr));
@@ -536,10 +525,7 @@ isert_create_pi_ctx(struct fast_reg_descriptor *desc,
        }
        desc->ind |= ISERT_PROT_KEY_VALID;
 
-       memset(&mr_init_attr, 0, sizeof(mr_init_attr));
-       mr_init_attr.max_reg_descriptors = 2;
-       mr_init_attr.flags |= IB_MR_SIGNATURE_EN;
-       pi_ctx->sig_mr = ib_create_mr(pd, &mr_init_attr);
+       pi_ctx->sig_mr = ib_alloc_mr(pd, IB_MR_TYPE_SIGNATURE, 2);
        if (IS_ERR(pi_ctx->sig_mr)) {
                isert_err("Failed to allocate signature enabled mr err=%ld\n",
                          PTR_ERR(pi_ctx->sig_mr));
@@ -577,7 +563,8 @@ isert_create_fr_desc(struct ib_device *ib_device, struct ib_pd *pd,
                return PTR_ERR(fr_desc->data_frpl);
        }
 
-       fr_desc->data_mr = ib_alloc_fast_reg_mr(pd, ISCSI_ISER_SG_TABLESIZE);
+       fr_desc->data_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG,
+                                      ISCSI_ISER_SG_TABLESIZE);
        if (IS_ERR(fr_desc->data_mr)) {
                isert_err("Failed to allocate data frmr err=%ld\n",
                          PTR_ERR(fr_desc->data_mr));
@@ -1092,8 +1079,8 @@ isert_create_send_desc(struct isert_conn *isert_conn,
        tx_desc->num_sge = 1;
        tx_desc->isert_cmd = isert_cmd;
 
-       if (tx_desc->tx_sg[0].lkey != device->mr->lkey) {
-               tx_desc->tx_sg[0].lkey = device->mr->lkey;
+       if (tx_desc->tx_sg[0].lkey != device->pd->local_dma_lkey) {
+               tx_desc->tx_sg[0].lkey = device->pd->local_dma_lkey;
                isert_dbg("tx_desc %p lkey mismatch, fixing\n", tx_desc);
        }
 }
@@ -1116,7 +1103,7 @@ isert_init_tx_hdrs(struct isert_conn *isert_conn,
        tx_desc->dma_addr = dma_addr;
        tx_desc->tx_sg[0].addr  = tx_desc->dma_addr;
        tx_desc->tx_sg[0].length = ISER_HEADERS_LEN;
-       tx_desc->tx_sg[0].lkey = device->mr->lkey;
+       tx_desc->tx_sg[0].lkey = device->pd->local_dma_lkey;
 
        isert_dbg("Setup tx_sg[0].addr: 0x%llx length: %u lkey: 0x%x\n",
                  tx_desc->tx_sg[0].addr, tx_desc->tx_sg[0].length,
@@ -1149,7 +1136,7 @@ isert_rdma_post_recvl(struct isert_conn *isert_conn)
        memset(&sge, 0, sizeof(struct ib_sge));
        sge.addr = isert_conn->login_req_dma;
        sge.length = ISER_RX_LOGIN_SIZE;
-       sge.lkey = isert_conn->device->mr->lkey;
+       sge.lkey = isert_conn->device->pd->local_dma_lkey;
 
        isert_dbg("Setup sge: addr: %llx length: %d 0x%08x\n",
                sge.addr, sge.length, sge.lkey);
@@ -1199,7 +1186,7 @@ isert_put_login_tx(struct iscsi_conn *conn, struct iscsi_login *login,
 
                tx_dsg->addr    = isert_conn->login_rsp_dma;
                tx_dsg->length  = length;
-               tx_dsg->lkey    = isert_conn->device->mr->lkey;
+               tx_dsg->lkey    = isert_conn->device->pd->local_dma_lkey;
                tx_desc->num_sge = 2;
        }
        if (!login->login_failed) {
@@ -2216,7 +2203,7 @@ isert_put_response(struct iscsi_conn *conn, struct iscsi_cmd *cmd)
                isert_cmd->pdu_buf_len = pdu_len;
                tx_dsg->addr    = isert_cmd->pdu_buf_dma;
                tx_dsg->length  = pdu_len;
-               tx_dsg->lkey    = device->mr->lkey;
+               tx_dsg->lkey    = device->pd->local_dma_lkey;
                isert_cmd->tx_desc.num_sge = 2;
        }
 
@@ -2344,7 +2331,7 @@ isert_put_reject(struct iscsi_cmd *cmd, struct iscsi_conn *conn)
        isert_cmd->pdu_buf_len = ISCSI_HDR_LEN;
        tx_dsg->addr    = isert_cmd->pdu_buf_dma;
        tx_dsg->length  = ISCSI_HDR_LEN;
-       tx_dsg->lkey    = device->mr->lkey;
+       tx_dsg->lkey    = device->pd->local_dma_lkey;
        isert_cmd->tx_desc.num_sge = 2;
 
        isert_init_send_wr(isert_conn, isert_cmd, send_wr);
@@ -2385,7 +2372,7 @@ isert_put_text_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn)
                isert_cmd->pdu_buf_len = txt_rsp_len;
                tx_dsg->addr    = isert_cmd->pdu_buf_dma;
                tx_dsg->length  = txt_rsp_len;
-               tx_dsg->lkey    = device->mr->lkey;
+               tx_dsg->lkey    = device->pd->local_dma_lkey;
                isert_cmd->tx_desc.num_sge = 2;
        }
        isert_init_send_wr(isert_conn, isert_cmd, send_wr);
@@ -2426,7 +2413,7 @@ isert_build_rdma_wr(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd,
                ib_sge->addr = ib_sg_dma_address(ib_dev, tmp_sg) + page_off;
                ib_sge->length = min_t(u32, data_left,
                                ib_sg_dma_len(ib_dev, tmp_sg) - page_off);
-               ib_sge->lkey = device->mr->lkey;
+               ib_sge->lkey = device->pd->local_dma_lkey;
 
                isert_dbg("RDMA ib_sge: addr: 0x%llx  length: %u lkey: %x\n",
                          ib_sge->addr, ib_sge->length, ib_sge->lkey);
@@ -2600,7 +2587,7 @@ isert_fast_reg_mr(struct isert_conn *isert_conn,
        u32 page_off;
 
        if (mem->dma_nents == 1) {
-               sge->lkey = device->mr->lkey;
+               sge->lkey = device->pd->local_dma_lkey;
                sge->addr = ib_sg_dma_address(ib_dev, &mem->sg[0]);
                sge->length = ib_sg_dma_len(ib_dev, &mem->sg[0]);
                isert_dbg("sge: addr: 0x%llx  length: %u lkey: %x\n",
@@ -3108,7 +3095,7 @@ out:
 
 static int
 isert_setup_np(struct iscsi_np *np,
-              struct __kernel_sockaddr_storage *ksockaddr)
+              struct sockaddr_storage *ksockaddr)
 {
        struct isert_np *isert_np;
        struct rdma_cm_id *isert_lid;
@@ -3130,7 +3117,7 @@ isert_setup_np(struct iscsi_np *np,
         * in iscsi_target_configfs.c code..
         */
        memcpy(&np->np_sockaddr, ksockaddr,
-              sizeof(struct __kernel_sockaddr_storage));
+              sizeof(struct sockaddr_storage));
 
        isert_lid = isert_setup_id(isert_np);
        if (IS_ERR(isert_lid)) {
@@ -3212,32 +3199,11 @@ isert_set_conn_info(struct iscsi_np *np, struct iscsi_conn *conn,
 {
        struct rdma_cm_id *cm_id = isert_conn->cm_id;
        struct rdma_route *cm_route = &cm_id->route;
-       struct sockaddr_in *sock_in;
-       struct sockaddr_in6 *sock_in6;
 
        conn->login_family = np->np_sockaddr.ss_family;
 
-       if (np->np_sockaddr.ss_family == AF_INET6) {
-               sock_in6 = (struct sockaddr_in6 *)&cm_route->addr.dst_addr;
-               snprintf(conn->login_ip, sizeof(conn->login_ip), "%pI6c",
-                        &sock_in6->sin6_addr.in6_u);
-               conn->login_port = ntohs(sock_in6->sin6_port);
-
-               sock_in6 = (struct sockaddr_in6 *)&cm_route->addr.src_addr;
-               snprintf(conn->local_ip, sizeof(conn->local_ip), "%pI6c",
-                        &sock_in6->sin6_addr.in6_u);
-               conn->local_port = ntohs(sock_in6->sin6_port);
-       } else {
-               sock_in = (struct sockaddr_in *)&cm_route->addr.dst_addr;
-               sprintf(conn->login_ip, "%pI4",
-                       &sock_in->sin_addr.s_addr);
-               conn->login_port = ntohs(sock_in->sin_port);
-
-               sock_in = (struct sockaddr_in *)&cm_route->addr.src_addr;
-               sprintf(conn->local_ip, "%pI4",
-                       &sock_in->sin_addr.s_addr);
-               conn->local_port = ntohs(sock_in->sin_port);
-       }
+       conn->login_sockaddr = cm_route->addr.dst_addr;
+       conn->local_sockaddr = cm_route->addr.src_addr;
 }
 
 static int
index 9ec23a786c029a15b9480377e8b46721b67f9c8a..6a04ba3c0f7224563e3432dffa38c4fcd12fad83 100644 (file)
@@ -209,7 +209,6 @@ struct isert_device {
        int                     refcount;
        struct ib_device        *ib_device;
        struct ib_pd            *pd;
-       struct ib_mr            *mr;
        struct isert_comp       *comps;
        int                     comps_used;
        struct list_head        dev_node;
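
With the per-device DMA MR gone from struct isert_device, every local SGE in the hunks above takes its lkey from the protection domain instead. A minimal sketch of the pattern, assuming a PD from ib_alloc_pd() and a buffer already mapped with ib_dma_map_single() (dma_addr and buf_len are illustrative):

        struct ib_sge sge = {
                .addr   = dma_addr,               /* from ib_dma_map_single() */
                .length = buf_len,
                .lkey   = pd->local_dma_lkey,     /* was device->mr->lkey */
        };
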
index 31a20b462266611299aeeae5cd51fd19b69b635e..b481490ad25756f6de36cd718c0983be751c8e5c 100644 (file)
@@ -55,8 +55,8 @@
 
 #define DRV_NAME       "ib_srp"
 #define PFX            DRV_NAME ": "
-#define DRV_VERSION    "1.0"
-#define DRV_RELDATE    "July 1, 2013"
+#define DRV_VERSION    "2.0"
+#define DRV_RELDATE    "July 26, 2015"
 
 MODULE_AUTHOR("Roland Dreier");
 MODULE_DESCRIPTION("InfiniBand SCSI RDMA Protocol initiator");
@@ -68,8 +68,8 @@ static unsigned int srp_sg_tablesize;
 static unsigned int cmd_sg_entries;
 static unsigned int indirect_sg_entries;
 static bool allow_ext_sg;
-static bool prefer_fr;
-static bool register_always;
+static bool prefer_fr = true;
+static bool register_always = true;
 static int topspin_workarounds = 1;
 
 module_param(srp_sg_tablesize, uint, 0444);
@@ -131,7 +131,7 @@ MODULE_PARM_DESC(ch_count,
                 "Number of RDMA channels to use for communication with an SRP target. Using more than one channel improves performance if the HCA supports multiple completion vectors. The default value is the minimum of four times the number of online CPU sockets and the number of completion vectors supported by the HCA.");
 
 static void srp_add_one(struct ib_device *device);
-static void srp_remove_one(struct ib_device *device);
+static void srp_remove_one(struct ib_device *device, void *client_data);
 static void srp_recv_completion(struct ib_cq *cq, void *ch_ptr);
 static void srp_send_completion(struct ib_cq *cq, void *ch_ptr);
 static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event);
@@ -378,7 +378,8 @@ static struct srp_fr_pool *srp_create_fr_pool(struct ib_device *device,
        INIT_LIST_HEAD(&pool->free_list);
 
        for (i = 0, d = &pool->desc[0]; i < pool->size; i++, d++) {
-               mr = ib_alloc_fast_reg_mr(pd, max_page_list_len);
+               mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG,
+                                max_page_list_len);
                if (IS_ERR(mr)) {
                        ret = PTR_ERR(mr);
                        goto destroy_pool;
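
This series converts both the iSER target and the SRP initiator to the consolidated MR allocator: ib_alloc_mr(pd, type, max_num_sg) replaces ib_alloc_fast_reg_mr() for fast-registration MRs and ib_create_mr() for signature MRs, while ib_dereg_mr() frees either kind (hence the ib_destroy_mr() removal in the earlier isert hunk). A condensed sketch of both call sites, with max_sg_entries standing in for the driver-specific limit:

        struct ib_mr *data_mr, *sig_mr;

        data_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, max_sg_entries);
        if (IS_ERR(data_mr))
                return PTR_ERR(data_mr);

        sig_mr = ib_alloc_mr(pd, IB_MR_TYPE_SIGNATURE, 2);
        if (IS_ERR(sig_mr)) {
                ib_dereg_mr(data_mr);             /* one teardown call for both types */
                return PTR_ERR(sig_mr);
        }
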
@@ -545,7 +546,7 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch)
        if (ret)
                goto err_qp;
 
-       if (dev->use_fast_reg && dev->has_fr) {
+       if (dev->use_fast_reg) {
                fr_pool = srp_alloc_fr_pool(target);
                if (IS_ERR(fr_pool)) {
                        ret = PTR_ERR(fr_pool);
@@ -553,10 +554,7 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch)
                                     "FR pool allocation failed (%d)\n", ret);
                        goto err_qp;
                }
-               if (ch->fr_pool)
-                       srp_destroy_fr_pool(ch->fr_pool);
-               ch->fr_pool = fr_pool;
-       } else if (!dev->use_fast_reg && dev->has_fmr) {
+       } else if (dev->use_fmr) {
                fmr_pool = srp_alloc_fmr_pool(target);
                if (IS_ERR(fmr_pool)) {
                        ret = PTR_ERR(fmr_pool);
@@ -564,9 +562,6 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch)
                                     "FMR pool allocation failed (%d)\n", ret);
                        goto err_qp;
                }
-               if (ch->fmr_pool)
-                       ib_destroy_fmr_pool(ch->fmr_pool);
-               ch->fmr_pool = fmr_pool;
        }
 
        if (ch->qp)
@@ -580,6 +575,16 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch)
        ch->recv_cq = recv_cq;
        ch->send_cq = send_cq;
 
+       if (dev->use_fast_reg) {
+               if (ch->fr_pool)
+                       srp_destroy_fr_pool(ch->fr_pool);
+               ch->fr_pool = fr_pool;
+       } else if (dev->use_fmr) {
+               if (ch->fmr_pool)
+                       ib_destroy_fmr_pool(ch->fmr_pool);
+               ch->fmr_pool = fmr_pool;
+       }
+
        kfree(init_attr);
        return 0;
 
@@ -622,7 +627,7 @@ static void srp_free_ch_ib(struct srp_target_port *target,
        if (dev->use_fast_reg) {
                if (ch->fr_pool)
                        srp_destroy_fr_pool(ch->fr_pool);
-       } else {
+       } else if (dev->use_fmr) {
                if (ch->fmr_pool)
                        ib_destroy_fmr_pool(ch->fmr_pool);
        }
@@ -1084,7 +1089,7 @@ static void srp_unmap_data(struct scsi_cmnd *scmnd,
                if (req->nmdesc)
                        srp_fr_pool_put(ch->fr_pool, req->fr_list,
                                        req->nmdesc);
-       } else {
+       } else if (dev->use_fmr) {
                struct ib_pool_fmr **pfmr;
 
                for (i = req->nmdesc, pfmr = req->fmr_list; i > 0; i--, pfmr++)
@@ -1259,6 +1264,8 @@ static void srp_map_desc(struct srp_map_state *state, dma_addr_t dma_addr,
 {
        struct srp_direct_buf *desc = state->desc;
 
+       WARN_ON_ONCE(!dma_len);
+
        desc->va = cpu_to_be64(dma_addr);
        desc->key = cpu_to_be32(rkey);
        desc->len = cpu_to_be32(dma_len);
@@ -1271,18 +1278,24 @@ static void srp_map_desc(struct srp_map_state *state, dma_addr_t dma_addr,
 static int srp_map_finish_fmr(struct srp_map_state *state,
                              struct srp_rdma_ch *ch)
 {
+       struct srp_target_port *target = ch->target;
+       struct srp_device *dev = target->srp_host->srp_dev;
        struct ib_pool_fmr *fmr;
        u64 io_addr = 0;
 
+       if (state->fmr.next >= state->fmr.end)
+               return -ENOMEM;
+
        fmr = ib_fmr_pool_map_phys(ch->fmr_pool, state->pages,
                                   state->npages, io_addr);
        if (IS_ERR(fmr))
                return PTR_ERR(fmr);
 
-       *state->next_fmr++ = fmr;
+       *state->fmr.next++ = fmr;
        state->nmdesc++;
 
-       srp_map_desc(state, 0, state->dma_len, fmr->fmr->rkey);
+       srp_map_desc(state, state->base_dma_addr & ~dev->mr_page_mask,
+                    state->dma_len, fmr->fmr->rkey);
 
        return 0;
 }
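
The FMR path above now hands srp_map_desc() the offset of the first mapped byte within its page instead of a hard-coded 0, so the address/rkey pair in the SRP descriptor points at the actual start of the data. A small illustration of the mask arithmetic, assuming the usual srp_add_one() setup where mr_page_mask = ~(mr_page_size - 1):

        /* Example values: mr_page_size = 4096  ->  mr_page_mask = ~0xfffULL */
        u64 base_dma_addr = 0x12345678ULL;
        u64 page_off      = base_dma_addr & ~mr_page_mask;   /* 0x678 */
        /* srp_map_desc(state, page_off, dma_len, rkey) records that offset */
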
@@ -1297,6 +1310,9 @@ static int srp_map_finish_fr(struct srp_map_state *state,
        struct srp_fr_desc *desc;
        u32 rkey;
 
+       if (state->fr.next >= state->fr.end)
+               return -ENOMEM;
+
        desc = srp_fr_pool_get(ch->fr_pool);
        if (!desc)
                return -ENOMEM;
@@ -1320,7 +1336,7 @@ static int srp_map_finish_fr(struct srp_map_state *state,
                                       IB_ACCESS_REMOTE_WRITE);
        wr.wr.fast_reg.rkey = desc->mr->lkey;
 
-       *state->next_fr++ = desc;
+       *state->fr.next++ = desc;
        state->nmdesc++;
 
        srp_map_desc(state, state->base_dma_addr, state->dma_len,
@@ -1333,17 +1349,19 @@ static int srp_finish_mapping(struct srp_map_state *state,
                              struct srp_rdma_ch *ch)
 {
        struct srp_target_port *target = ch->target;
+       struct srp_device *dev = target->srp_host->srp_dev;
        int ret = 0;
 
+       WARN_ON_ONCE(!dev->use_fast_reg && !dev->use_fmr);
+
        if (state->npages == 0)
                return 0;
 
-       if (state->npages == 1 && !register_always)
+       if (state->npages == 1 && target->global_mr)
                srp_map_desc(state, state->base_dma_addr, state->dma_len,
-                            target->rkey);
+                            target->global_mr->rkey);
        else
-               ret = target->srp_host->srp_dev->use_fast_reg ?
-                       srp_map_finish_fr(state, ch) :
+               ret = dev->use_fast_reg ? srp_map_finish_fr(state, ch) :
                        srp_map_finish_fmr(state, ch);
 
        if (ret == 0) {
@@ -1354,66 +1372,19 @@ static int srp_finish_mapping(struct srp_map_state *state,
        return ret;
 }
 
-static void srp_map_update_start(struct srp_map_state *state,
-                                struct scatterlist *sg, int sg_index,
-                                dma_addr_t dma_addr)
-{
-       state->unmapped_sg = sg;
-       state->unmapped_index = sg_index;
-       state->unmapped_addr = dma_addr;
-}
-
 static int srp_map_sg_entry(struct srp_map_state *state,
                            struct srp_rdma_ch *ch,
-                           struct scatterlist *sg, int sg_index,
-                           bool use_mr)
+                           struct scatterlist *sg, int sg_index)
 {
        struct srp_target_port *target = ch->target;
        struct srp_device *dev = target->srp_host->srp_dev;
        struct ib_device *ibdev = dev->dev;
        dma_addr_t dma_addr = ib_sg_dma_address(ibdev, sg);
        unsigned int dma_len = ib_sg_dma_len(ibdev, sg);
-       unsigned int len;
+       unsigned int len = 0;
        int ret;
 
-       if (!dma_len)
-               return 0;
-
-       if (!use_mr) {
-               /*
-                * Once we're in direct map mode for a request, we don't
-                * go back to FMR or FR mode, so no need to update anything
-                * other than the descriptor.
-                */
-               srp_map_desc(state, dma_addr, dma_len, target->rkey);
-               return 0;
-       }
-
-       /*
-        * Since not all RDMA HW drivers support non-zero page offsets for
-        * FMR, if we start at an offset into a page, don't merge into the
-        * current FMR mapping. Finish it out, and use the kernel's MR for
-        * this sg entry.
-        */
-       if ((!dev->use_fast_reg && dma_addr & ~dev->mr_page_mask) ||
-           dma_len > dev->mr_max_size) {
-               ret = srp_finish_mapping(state, ch);
-               if (ret)
-                       return ret;
-
-               srp_map_desc(state, dma_addr, dma_len, target->rkey);
-               srp_map_update_start(state, NULL, 0, 0);
-               return 0;
-       }
-
-       /*
-        * If this is the first sg that will be mapped via FMR or via FR, save
-        * our position. We need to know the first unmapped entry, its index,
-        * and the first unmapped address within that entry to be able to
-        * restart mapping after an error.
-        */
-       if (!state->unmapped_sg)
-               srp_map_update_start(state, sg, sg_index, dma_addr);
+       WARN_ON_ONCE(!dma_len);
 
        while (dma_len) {
                unsigned offset = dma_addr & ~dev->mr_page_mask;
@@ -1421,8 +1392,6 @@ static int srp_map_sg_entry(struct srp_map_state *state,
                        ret = srp_finish_mapping(state, ch);
                        if (ret)
                                return ret;
-
-                       srp_map_update_start(state, sg, sg_index, dma_addr);
                }
 
                len = min_t(unsigned int, dma_len, dev->mr_page_size - offset);
@@ -1441,11 +1410,8 @@ static int srp_map_sg_entry(struct srp_map_state *state,
          * boundaries.
         */
        ret = 0;
-       if (len != dev->mr_page_size) {
+       if (len != dev->mr_page_size)
                ret = srp_finish_mapping(state, ch);
-               if (!ret)
-                       srp_map_update_start(state, NULL, 0, 0);
-       }
        return ret;
 }
 
@@ -1455,50 +1421,80 @@ static int srp_map_sg(struct srp_map_state *state, struct srp_rdma_ch *ch,
 {
        struct srp_target_port *target = ch->target;
        struct srp_device *dev = target->srp_host->srp_dev;
-       struct ib_device *ibdev = dev->dev;
        struct scatterlist *sg;
-       int i;
-       bool use_mr;
+       int i, ret;
 
        state->desc     = req->indirect_desc;
        state->pages    = req->map_page;
        if (dev->use_fast_reg) {
-               state->next_fr = req->fr_list;
-               use_mr = !!ch->fr_pool;
-       } else {
-               state->next_fmr = req->fmr_list;
-               use_mr = !!ch->fmr_pool;
+               state->fr.next = req->fr_list;
+               state->fr.end = req->fr_list + target->cmd_sg_cnt;
+       } else if (dev->use_fmr) {
+               state->fmr.next = req->fmr_list;
+               state->fmr.end = req->fmr_list + target->cmd_sg_cnt;
        }
 
-       for_each_sg(scat, sg, count, i) {
-               if (srp_map_sg_entry(state, ch, sg, i, use_mr)) {
-                       /*
-                        * Memory registration failed, so backtrack to the
-                        * first unmapped entry and continue on without using
-                        * memory registration.
-                        */
-                       dma_addr_t dma_addr;
-                       unsigned int dma_len;
-
-backtrack:
-                       sg = state->unmapped_sg;
-                       i = state->unmapped_index;
-
-                       dma_addr = ib_sg_dma_address(ibdev, sg);
-                       dma_len = ib_sg_dma_len(ibdev, sg);
-                       dma_len -= (state->unmapped_addr - dma_addr);
-                       dma_addr = state->unmapped_addr;
-                       use_mr = false;
-                       srp_map_desc(state, dma_addr, dma_len, target->rkey);
+       if (dev->use_fast_reg || dev->use_fmr) {
+               for_each_sg(scat, sg, count, i) {
+                       ret = srp_map_sg_entry(state, ch, sg, i);
+                       if (ret)
+                               goto out;
+               }
+               ret = srp_finish_mapping(state, ch);
+               if (ret)
+                       goto out;
+       } else {
+               for_each_sg(scat, sg, count, i) {
+                       srp_map_desc(state, ib_sg_dma_address(dev->dev, sg),
+                                    ib_sg_dma_len(dev->dev, sg),
+                                    target->global_mr->rkey);
                }
        }
 
-       if (use_mr && srp_finish_mapping(state, ch))
-               goto backtrack;
-
        req->nmdesc = state->nmdesc;
+       ret = 0;
 
-       return 0;
+out:
+       return ret;
+}
+
+/*
+ * Register the indirect data buffer descriptor with the HCA.
+ *
+ * Note: since the indirect data buffer descriptor has been allocated with
+ * kmalloc() it is guaranteed that this buffer is a physically contiguous
+ * memory buffer.
+ */
+static int srp_map_idb(struct srp_rdma_ch *ch, struct srp_request *req,
+                      void **next_mr, void **end_mr, u32 idb_len,
+                      __be32 *idb_rkey)
+{
+       struct srp_target_port *target = ch->target;
+       struct srp_device *dev = target->srp_host->srp_dev;
+       struct srp_map_state state;
+       struct srp_direct_buf idb_desc;
+       u64 idb_pages[1];
+       int ret;
+
+       memset(&state, 0, sizeof(state));
+       memset(&idb_desc, 0, sizeof(idb_desc));
+       state.gen.next = next_mr;
+       state.gen.end = end_mr;
+       state.desc = &idb_desc;
+       state.pages = idb_pages;
+       state.pages[0] = (req->indirect_dma_addr &
+                         dev->mr_page_mask);
+       state.npages = 1;
+       state.base_dma_addr = req->indirect_dma_addr;
+       state.dma_len = idb_len;
+       ret = srp_finish_mapping(&state, ch);
+       if (ret < 0)
+               goto out;
+
+       *idb_rkey = idb_desc.key;
+
+out:
+       return ret;
 }
 
 static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch,
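
srp_map_idb() above exists for the register_always case: when the target port has no global rkey, even the indirect descriptor table that the remote target fetches via RDMA has to be registered through an FR/FMR slot, which is what the type-erased gen.next/gen.end cursor accounts for. A rough sketch of the size arithmetic it registers, assuming the standard 16-byte struct srp_direct_buf layout (8-byte va, 4-byte key, 4-byte len) and, say, eight filled-in descriptors:

        u32 ndesc     = 8;
        u32 table_len = ndesc * sizeof(struct srp_direct_buf);        /* 128 bytes */
        u32 idb_len   = sizeof(struct srp_indirect_buf) + table_len;  /* header + table */
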
@@ -1507,12 +1503,13 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch,
        struct srp_target_port *target = ch->target;
        struct scatterlist *scat;
        struct srp_cmd *cmd = req->cmd->buf;
-       int len, nents, count;
+       int len, nents, count, ret;
        struct srp_device *dev;
        struct ib_device *ibdev;
        struct srp_map_state state;
        struct srp_indirect_buf *indirect_hdr;
-       u32 table_len;
+       u32 idb_len, table_len;
+       __be32 idb_rkey;
        u8 fmt;
 
        if (!scsi_sglist(scmnd) || scmnd->sc_data_direction == DMA_NONE)
@@ -1539,7 +1536,7 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch,
        fmt = SRP_DATA_DESC_DIRECT;
        len = sizeof (struct srp_cmd) + sizeof (struct srp_direct_buf);
 
-       if (count == 1 && !register_always) {
+       if (count == 1 && target->global_mr) {
                /*
                 * The midlayer only generated a single gather/scatter
                 * entry, or DMA mapping coalesced everything to a
@@ -1549,7 +1546,7 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch,
                struct srp_direct_buf *buf = (void *) cmd->add_data;
 
                buf->va  = cpu_to_be64(ib_sg_dma_address(ibdev, scat));
-               buf->key = cpu_to_be32(target->rkey);
+               buf->key = cpu_to_be32(target->global_mr->rkey);
                buf->len = cpu_to_be32(ib_sg_dma_len(ibdev, scat));
 
                req->nmdesc = 0;
@@ -1594,6 +1591,7 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch,
 
        count = min(state.ndesc, target->cmd_sg_cnt);
        table_len = state.ndesc * sizeof (struct srp_direct_buf);
+       idb_len = sizeof(struct srp_indirect_buf) + table_len;
 
        fmt = SRP_DATA_DESC_INDIRECT;
        len = sizeof(struct srp_cmd) + sizeof (struct srp_indirect_buf);
@@ -1602,8 +1600,18 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch,
        memcpy(indirect_hdr->desc_list, req->indirect_desc,
               count * sizeof (struct srp_direct_buf));
 
+       if (!target->global_mr) {
+               ret = srp_map_idb(ch, req, state.gen.next, state.gen.end,
+                                 idb_len, &idb_rkey);
+               if (ret < 0)
+                       return ret;
+               req->nmdesc++;
+       } else {
+               idb_rkey = target->global_mr->rkey;
+       }
+
        indirect_hdr->table_desc.va = cpu_to_be64(req->indirect_dma_addr);
-       indirect_hdr->table_desc.key = cpu_to_be32(target->rkey);
+       indirect_hdr->table_desc.key = idb_rkey;
        indirect_hdr->table_desc.len = cpu_to_be32(table_len);
        indirect_hdr->len = cpu_to_be32(state.total_len);
 
@@ -2171,7 +2179,7 @@ static uint32_t srp_compute_rq_tmo(struct ib_qp_attr *qp_attr, int attr_mask)
 }
 
 static void srp_cm_rep_handler(struct ib_cm_id *cm_id,
-                              struct srp_login_rsp *lrsp,
+                              const struct srp_login_rsp *lrsp,
                               struct srp_rdma_ch *ch)
 {
        struct srp_target_port *target = ch->target;
@@ -2757,6 +2765,13 @@ static int srp_sdev_count(struct Scsi_Host *host)
        return c;
 }
 
+/*
+ * Return values:
+ * < 0 upon failure. Caller is responsible for SRP target port cleanup.
+ * 0 and target->state == SRP_TARGET_REMOVED if asynchronous target port
+ *    removal has been scheduled.
+ * 0 and target->state != SRP_TARGET_REMOVED upon success.
+ */
 static int srp_add_target(struct srp_host *host, struct srp_target_port *target)
 {
        struct srp_rport_identifiers ids;
@@ -3146,8 +3161,8 @@ static ssize_t srp_create_target(struct device *dev,
        target->io_class        = SRP_REV16A_IB_IO_CLASS;
        target->scsi_host       = target_host;
        target->srp_host        = host;
-       target->lkey            = host->srp_dev->mr->lkey;
-       target->rkey            = host->srp_dev->mr->rkey;
+       target->lkey            = host->srp_dev->pd->local_dma_lkey;
+       target->global_mr       = host->srp_dev->global_mr;
        target->cmd_sg_cnt      = cmd_sg_entries;
        target->sg_tablesize    = indirect_sg_entries ? : cmd_sg_entries;
        target->allow_ext_sg    = allow_ext_sg;
@@ -3262,7 +3277,7 @@ static ssize_t srp_create_target(struct device *dev,
                                        srp_free_ch_ib(target, ch);
                                        srp_free_req_data(target, ch);
                                        target->ch_count = ch - target->ch;
-                                       break;
+                                       goto connected;
                                }
                        }
 
@@ -3272,6 +3287,7 @@ static ssize_t srp_create_target(struct device *dev,
                node_idx++;
        }
 
+connected:
        target->scsi_host->nr_hw_queues = target->ch_count;
 
        ret = srp_add_target(host, target);
@@ -3294,6 +3310,8 @@ out:
        mutex_unlock(&host->add_target_mutex);
 
        scsi_host_put(target->scsi_host);
+       if (ret < 0)
+               scsi_host_put(target->scsi_host);
 
        return ret;
 
@@ -3401,6 +3419,7 @@ static void srp_add_one(struct ib_device *device)
 
        srp_dev->use_fast_reg = (srp_dev->has_fr &&
                                 (!srp_dev->has_fmr || prefer_fr));
+       srp_dev->use_fmr = !srp_dev->use_fast_reg && srp_dev->has_fmr;
 
        /*
         * Use the smallest page size supported by the HCA, down to a
@@ -3433,12 +3452,16 @@ static void srp_add_one(struct ib_device *device)
        if (IS_ERR(srp_dev->pd))
                goto free_dev;
 
-       srp_dev->mr = ib_get_dma_mr(srp_dev->pd,
-                                   IB_ACCESS_LOCAL_WRITE |
-                                   IB_ACCESS_REMOTE_READ |
-                                   IB_ACCESS_REMOTE_WRITE);
-       if (IS_ERR(srp_dev->mr))
-               goto err_pd;
+       if (!register_always || (!srp_dev->has_fmr && !srp_dev->has_fr)) {
+               srp_dev->global_mr = ib_get_dma_mr(srp_dev->pd,
+                                                  IB_ACCESS_LOCAL_WRITE |
+                                                  IB_ACCESS_REMOTE_READ |
+                                                  IB_ACCESS_REMOTE_WRITE);
+               if (IS_ERR(srp_dev->global_mr))
+                       goto err_pd;
+       } else {
+               srp_dev->global_mr = NULL;
+       }
 
        for (p = rdma_start_port(device); p <= rdma_end_port(device); ++p) {
                host = srp_add_port(srp_dev, p);
@@ -3460,13 +3483,13 @@ free_attr:
        kfree(dev_attr);
 }
 
-static void srp_remove_one(struct ib_device *device)
+static void srp_remove_one(struct ib_device *device, void *client_data)
 {
        struct srp_device *srp_dev;
        struct srp_host *host, *tmp_host;
        struct srp_target_port *target;
 
-       srp_dev = ib_get_client_data(device, &srp_client);
+       srp_dev = client_data;
        if (!srp_dev)
                return;
 
@@ -3495,7 +3518,8 @@ static void srp_remove_one(struct ib_device *device)
                kfree(host);
        }
 
-       ib_dereg_mr(srp_dev->mr);
+       if (srp_dev->global_mr)
+               ib_dereg_mr(srp_dev->global_mr);
        ib_dealloc_pd(srp_dev->pd);
 
        kfree(srp_dev);
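
The srp_remove_one() change above (and the matching srpt_remove_one() change further down) picks up the new ib_client remove signature: the core now passes the per-device client data straight to the callback, so drivers no longer look it up with ib_get_client_data() during teardown. A generic sketch of the updated client shape (the my_* names are hypothetical):

        static void my_remove_one(struct ib_device *device, void *client_data)
        {
                struct my_device *mdev = client_data;

                if (!mdev)
                        return;              /* add() never attached data */
                /* tear down ports, QPs and MRs, then the PD */
        }

        static struct ib_client my_client = {
                .name   = "my_client",
                .add    = my_add_one,
                .remove = my_remove_one,
        };
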
index 17ee3f80ba550aec9c9ae9d326e2e91e2bb7daf0..3608f2e4819c412ee742bca4499cd84de32aa343 100644 (file)
@@ -95,13 +95,14 @@ struct srp_device {
        struct list_head        dev_list;
        struct ib_device       *dev;
        struct ib_pd           *pd;
-       struct ib_mr           *mr;
+       struct ib_mr           *global_mr;
        u64                     mr_page_mask;
        int                     mr_page_size;
        int                     mr_max_size;
        int                     max_pages_per_mr;
        bool                    has_fmr;
        bool                    has_fr;
+       bool                    use_fmr;
        bool                    use_fast_reg;
 };
 
@@ -182,10 +183,10 @@ struct srp_target_port {
        spinlock_t              lock;
 
        /* read only in the hot path */
+       struct ib_mr            *global_mr;
        struct srp_rdma_ch      *ch;
        u32                     ch_count;
        u32                     lkey;
-       u32                     rkey;
        enum srp_target_state   state;
        unsigned int            max_iu_len;
        unsigned int            cmd_sg_cnt;
@@ -276,14 +277,21 @@ struct srp_fr_pool {
  * @npages:        Number of page addresses in the pages[] array.
  * @nmdesc:        Number of FMR or FR memory descriptors used for mapping.
  * @ndesc:         Number of SRP buffer descriptors that have been filled in.
- * @unmapped_sg:    First element of the sg-list that is mapped via FMR or FR.
- * @unmapped_index: Index of the first element mapped via FMR or FR.
- * @unmapped_addr:  DMA address of the first element mapped via FMR or FR.
  */
 struct srp_map_state {
        union {
-               struct ib_pool_fmr **next_fmr;
-               struct srp_fr_desc **next_fr;
+               struct {
+                       struct ib_pool_fmr **next;
+                       struct ib_pool_fmr **end;
+               } fmr;
+               struct {
+                       struct srp_fr_desc **next;
+                       struct srp_fr_desc **end;
+               } fr;
+               struct {
+                       void               **next;
+                       void               **end;
+               } gen;
        };
        struct srp_direct_buf  *desc;
        u64                    *pages;
@@ -293,9 +301,6 @@ struct srp_map_state {
        unsigned int            npages;
        unsigned int            nmdesc;
        unsigned int            ndesc;
-       struct scatterlist     *unmapped_sg;
-       int                     unmapped_index;
-       dma_addr_t              unmapped_addr;
 };
 
 #endif /* IB_SRP_H */
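
The reworked srp_map_state above replaces the single next_fmr/next_fr pointer with {next, end} pairs (plus the type-erased gen view used by srp_map_idb()), so each mapping path can fail cleanly with -ENOMEM instead of running past the per-request descriptor array. A stand-alone sketch of the idiom:

        struct cursor {
                void **next;
                void **end;
        };

        static int cursor_push(struct cursor *c, void *item)
        {
                if (c->next >= c->end)
                        return -ENOMEM;      /* descriptor slots exhausted */
                *c->next++ = item;
                return 0;
        }
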
index 60ff0a2390e5f02f7cffabb9a5154cc0c4b734e7..f6fe0414139beeafa3ddfba0ed33bf1ebc6a7489 100644 (file)
@@ -783,7 +783,7 @@ static int srpt_post_recv(struct srpt_device *sdev,
 
        list.addr = ioctx->ioctx.dma;
        list.length = srp_max_req_size;
-       list.lkey = sdev->mr->lkey;
+       list.lkey = sdev->pd->local_dma_lkey;
 
        wr.next = NULL;
        wr.sg_list = &list;
@@ -818,7 +818,7 @@ static int srpt_post_send(struct srpt_rdma_ch *ch,
 
        list.addr = ioctx->ioctx.dma;
        list.length = len;
-       list.lkey = sdev->mr->lkey;
+       list.lkey = sdev->pd->local_dma_lkey;
 
        wr.next = NULL;
        wr.wr_id = encode_wr_id(SRPT_SEND, ioctx->ioctx.index);
@@ -1206,7 +1206,7 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch,
 
                while (rsize > 0 && tsize > 0) {
                        sge->addr = dma_addr;
-                       sge->lkey = ch->sport->sdev->mr->lkey;
+                       sge->lkey = ch->sport->sdev->pd->local_dma_lkey;
 
                        if (rsize >= dma_len) {
                                sge->length =
@@ -3211,10 +3211,6 @@ static void srpt_add_one(struct ib_device *device)
        if (IS_ERR(sdev->pd))
                goto free_dev;
 
-       sdev->mr = ib_get_dma_mr(sdev->pd, IB_ACCESS_LOCAL_WRITE);
-       if (IS_ERR(sdev->mr))
-               goto err_pd;
-
        sdev->srq_size = min(srpt_srq_size, sdev->dev_attr.max_srq_wr);
 
        srq_attr.event_handler = srpt_srq_event;
@@ -3226,7 +3222,7 @@ static void srpt_add_one(struct ib_device *device)
 
        sdev->srq = ib_create_srq(sdev->pd, &srq_attr);
        if (IS_ERR(sdev->srq))
-               goto err_mr;
+               goto err_pd;
 
        pr_debug("%s: create SRQ #wr= %d max_allow=%d dev= %s\n",
                 __func__, sdev->srq_size, sdev->dev_attr.max_srq_wr,
@@ -3250,7 +3246,7 @@ static void srpt_add_one(struct ib_device *device)
         * in the system as service_id; therefore, the target_id will change
         * if this HCA is gone bad and replaced by different HCA
         */
-       if (ib_cm_listen(sdev->cm_id, cpu_to_be64(srpt_service_guid), 0, NULL))
+       if (ib_cm_listen(sdev->cm_id, cpu_to_be64(srpt_service_guid), 0))
                goto err_cm;
 
        INIT_IB_EVENT_HANDLER(&sdev->event_handler, sdev->device,
@@ -3311,8 +3307,6 @@ err_cm:
        ib_destroy_cm_id(sdev->cm_id);
 err_srq:
        ib_destroy_srq(sdev->srq);
-err_mr:
-       ib_dereg_mr(sdev->mr);
 err_pd:
        ib_dealloc_pd(sdev->pd);
 free_dev:
@@ -3326,12 +3320,11 @@ err:
 /**
  * srpt_remove_one() - InfiniBand device removal callback function.
  */
-static void srpt_remove_one(struct ib_device *device)
+static void srpt_remove_one(struct ib_device *device, void *client_data)
 {
-       struct srpt_device *sdev;
+       struct srpt_device *sdev = client_data;
        int i;
 
-       sdev = ib_get_client_data(device, &srpt_client);
        if (!sdev) {
                pr_info("%s(%s): nothing to do.\n", __func__, device->name);
                return;
@@ -3358,7 +3351,6 @@ static void srpt_remove_one(struct ib_device *device)
        srpt_release_sdev(sdev);
 
        ib_destroy_srq(sdev->srq);
-       ib_dereg_mr(sdev->mr);
        ib_dealloc_pd(sdev->pd);
 
        srpt_free_ioctx_ring((struct srpt_ioctx **)sdev->ioctx_ring, sdev,
index 21f8df67522ab5131c43c5447ed4e3b57002badb..5faad8acd78931a54a78117cac8b4025f78dd460 100644 (file)
@@ -393,7 +393,6 @@ struct srpt_port {
 struct srpt_device {
        struct ib_device        *device;
        struct ib_pd            *pd;
-       struct ib_mr            *mr;
        struct ib_srq           *srq;
        struct ib_cm_id         *cm_id;
        struct ib_device_attr   dev_attr;
index 9d35499faca46bb3067138e795e5544527d956d6..08d496411f7570bf73368cdfc32b01ed98c59da8 100644 (file)
@@ -290,19 +290,14 @@ static int evdev_flush(struct file *file, fl_owner_t id)
 {
        struct evdev_client *client = file->private_data;
        struct evdev *evdev = client->evdev;
-       int retval;
 
-       retval = mutex_lock_interruptible(&evdev->mutex);
-       if (retval)
-               return retval;
+       mutex_lock(&evdev->mutex);
 
-       if (!evdev->exist || client->revoked)
-               retval = -ENODEV;
-       else
-               retval = input_flush_device(&evdev->handle, file);
+       if (evdev->exist && !client->revoked)
+               input_flush_device(&evdev->handle, file);
 
        mutex_unlock(&evdev->mutex);
-       return retval;
+       return 0;
 }
 
 static void evdev_free(struct device *dev)
index d2ea863d6a45fed60d118f9242d434f259504546..2165f3dd328babc2285a0e98e5bcf06fc423b983 100644 (file)
@@ -5,8 +5,6 @@
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
- *
- * <<Power management needs to be implemented>>.
  */
 
 #include <linux/clk.h>
index 1f7e15ca5fbe0e20394a6eaa13e50e1fc314655b..4f5ef5bb535b86dcd0d16a198dca65de731f746f 100644 (file)
@@ -118,6 +118,7 @@ static const struct of_device_id ab8500_ponkey_match[] = {
        { .compatible = "stericsson,ab8500-ponkey", },
        {}
 };
+MODULE_DEVICE_TABLE(of, ab8500_ponkey_match);
 #endif
 
 static struct platform_driver ab8500_ponkey_driver = {
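
This and the following input-driver hunks all apply the same one-line fix: without MODULE_DEVICE_TABLE(of, ...) the OF match table never produces a modalias, so the module cannot be autoloaded from the devicetree compatible string. The general pattern, with placeholder names:

        static const struct of_device_id foo_of_match[] = {
                { .compatible = "vendor,foo", },
                { /* sentinel */ }
        };
        MODULE_DEVICE_TABLE(of, foo_of_match);
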
index e82edf810d1f3e6354b992f852ddbe8f433cd0b3..f2261ab5470126f0935c3ca9d3830b55699dc378 100644 (file)
@@ -173,6 +173,7 @@ static const struct of_device_id pwm_beeper_match[] = {
        { .compatible = "pwm-beeper", },
        { },
 };
+MODULE_DEVICE_TABLE(of, pwm_beeper_match);
 #endif
 
 static struct platform_driver pwm_beeper_driver = {
index 6bf3f1082f71ea6c6802fe7e4b73a4c5378df120..a804705eb04a280226d42eb28c0e2e283b7be697 100644 (file)
@@ -249,6 +249,7 @@ static const struct of_device_id regulator_haptic_dt_match[] = {
        { .compatible = "regulator-haptic" },
        { /* sentinel */ },
 };
+MODULE_DEVICE_TABLE(of, regulator_haptic_dt_match);
 
 static struct platform_driver regulator_haptic_driver = {
        .probe          = regulator_haptic_probe,
index 54116e544c96e06de7978426e266c9bfb9e63737..6f997aa49183ae40daa709361e1b66f19fb862a8 100644 (file)
@@ -253,6 +253,7 @@ static const struct of_device_id bbc_beep_match[] = {
        },
        {},
 };
+MODULE_DEVICE_TABLE(of, bbc_beep_match);
 
 static struct platform_driver bbc_beep_driver = {
        .driver = {
@@ -332,6 +333,7 @@ static const struct of_device_id grover_beep_match[] = {
        },
        {},
 };
+MODULE_DEVICE_TABLE(of, grover_beep_match);
 
 static struct platform_driver grover_beep_driver = {
        .driver = {
index 95599e478e19cec0ab26b8a309d3714b6be1d6f8..23d0549539d43904299c51810730f5d34955d952 100644 (file)
@@ -232,7 +232,7 @@ static int xenkbd_connect_backend(struct xenbus_device *dev,
        struct xenbus_transaction xbt;
 
        ret = gnttab_grant_foreign_access(dev->otherend_id,
-                                         virt_to_mfn(info->page), 0);
+                                         virt_to_gfn(info->page), 0);
        if (ret < 0)
                return ret;
        info->gref = ret;
@@ -255,7 +255,7 @@ static int xenkbd_connect_backend(struct xenbus_device *dev,
                goto error_irqh;
        }
        ret = xenbus_printf(xbt, dev->nodename, "page-ref", "%lu",
-                           virt_to_mfn(info->page));
+                           virt_to_gfn(info->page));
        if (ret)
                goto error_xenbus;
        ret = xenbus_printf(xbt, dev->nodename, "page-gref", "%u", info->gref);
index e2b7420eed97d8ac0390909118b369dc800fd566..fa945304b9a576d4303c778eca929a5f3517092a 100644 (file)
@@ -1170,6 +1170,7 @@ static const struct acpi_device_id elan_acpi_id[] = {
        { "ELAN0000", 0 },
        { "ELAN0100", 0 },
        { "ELAN0600", 0 },
+       { "ELAN1000", 0 },
        { }
 };
 MODULE_DEVICE_TABLE(acpi, elan_acpi_id);
index c9c98f0ab284f8ac990fe62f477327d8fbe74c69..db91de539ee30bfc14612f0056f3979b884ff4aa 100644 (file)
@@ -877,7 +877,7 @@ static int __init i8042_check_aux(void)
 static int i8042_controller_check(void)
 {
        if (i8042_flush()) {
-               pr_err("No controller found\n");
+               pr_info("No controller found\n");
                return -ENODEV;
        }
 
index 059edeb7f04a0d40d7db362bd206c726f4bf5560..600dcceff5426aaf4f6fc7b20ce966960bf0aa92 100644 (file)
@@ -479,6 +479,18 @@ config TOUCHSCREEN_MTOUCH
          To compile this driver as a module, choose M here: the
          module will be called mtouch.
 
+config TOUCHSCREEN_IMX6UL_TSC
+       tristate "Freescale i.MX6UL touchscreen controller"
+       depends on (OF && GPIOLIB) || COMPILE_TEST
+       help
+         Say Y here if you have a Freescale i.MX6UL, and want to
+         use the internal touchscreen controller.
+
+         If unsure, say N.
+
+         To compile this driver as a module, choose M here: the
+         module will be called imx6ul_tsc.
+
 config TOUCHSCREEN_INEXIO
        tristate "iNexio serial touchscreens"
        select SERIO
@@ -1040,4 +1052,16 @@ config TOUCHSCREEN_ZFORCE
          To compile this driver as a module, choose M here: the
          module will be called zforce_ts.
 
+config TOUCHSCREEN_COLIBRI_VF50
+       tristate "Toradex Colibri on board touchscreen driver"
+       depends on GPIOLIB && IIO && VF610_ADC
+       help
+         Say Y here if you have a Colibri VF50 and plan to use
+         the on-board provided 4-wire touchscreen driver.
+
+         If unsure, say N.
+
+         To compile this driver as a module, choose M here: the
+         module will be called colibri_vf50_ts.
+
 endif
index c85aae23e7f84f26a4b60c213774d6ef94d5216a..1b79cc09744af93b02ecabe958c24fc47bd189f3 100644 (file)
@@ -38,6 +38,7 @@ obj-$(CONFIG_TOUCHSCREEN_EGALAX)      += egalax_ts.o
 obj-$(CONFIG_TOUCHSCREEN_FUJITSU)      += fujitsu_ts.o
 obj-$(CONFIG_TOUCHSCREEN_GOODIX)       += goodix.o
 obj-$(CONFIG_TOUCHSCREEN_ILI210X)      += ili210x.o
+obj-$(CONFIG_TOUCHSCREEN_IMX6UL_TSC)   += imx6ul_tsc.o
 obj-$(CONFIG_TOUCHSCREEN_INEXIO)       += inexio.o
 obj-$(CONFIG_TOUCHSCREEN_INTEL_MID)    += intel-mid-touch.o
 obj-$(CONFIG_TOUCHSCREEN_IPROC)                += bcm_iproc_tsc.o
@@ -85,3 +86,4 @@ obj-$(CONFIG_TOUCHSCREEN_W90X900)     += w90p910_ts.o
 obj-$(CONFIG_TOUCHSCREEN_SX8654)       += sx8654.o
 obj-$(CONFIG_TOUCHSCREEN_TPS6507X)     += tps6507x-ts.o
 obj-$(CONFIG_TOUCHSCREEN_ZFORCE)       += zforce_ts.o
+obj-$(CONFIG_TOUCHSCREEN_COLIBRI_VF50) += colibri-vf50-ts.o
diff --git a/drivers/input/touchscreen/colibri-vf50-ts.c b/drivers/input/touchscreen/colibri-vf50-ts.c
new file mode 100644 (file)
index 0000000..5d4903a
--- /dev/null
@@ -0,0 +1,386 @@
+/*
+ * Toradex Colibri VF50 Touchscreen driver
+ *
+ * Copyright 2015 Toradex AG
+ *
+ * Originally authored by Stefan Agner for 3.0 kernel
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/delay.h>
+#include <linux/err.h>
+#include <linux/gpio.h>
+#include <linux/gpio/consumer.h>
+#include <linux/iio/consumer.h>
+#include <linux/iio/types.h>
+#include <linux/input.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/pinctrl/consumer.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+
+#define DRIVER_NAME                    "colibri-vf50-ts"
+#define DRV_VERSION                    "1.0"
+
+#define VF_ADC_MAX                     ((1 << 12) - 1)
+
+#define COLI_TOUCH_MIN_DELAY_US                1000
+#define COLI_TOUCH_MAX_DELAY_US                2000
+#define COLI_PULLUP_MIN_DELAY_US       10000
+#define COLI_PULLUP_MAX_DELAY_US       11000
+#define COLI_TOUCH_NO_OF_AVGS          5
+#define COLI_TOUCH_REQ_ADC_CHAN                4
+
+struct vf50_touch_device {
+       struct platform_device *pdev;
+       struct input_dev *ts_input;
+       struct iio_channel *channels;
+       struct gpio_desc *gpio_xp;
+       struct gpio_desc *gpio_xm;
+       struct gpio_desc *gpio_yp;
+       struct gpio_desc *gpio_ym;
+       int pen_irq;
+       int min_pressure;
+       bool stop_touchscreen;
+};
+
+/*
+ * Enables given plates and measures touch parameters using ADC
+ */
+static int adc_ts_measure(struct iio_channel *channel,
+                         struct gpio_desc *plate_p, struct gpio_desc *plate_m)
+{
+       int i, value = 0, val = 0;
+       int error;
+
+       gpiod_set_value(plate_p, 1);
+       gpiod_set_value(plate_m, 1);
+
+       usleep_range(COLI_TOUCH_MIN_DELAY_US, COLI_TOUCH_MAX_DELAY_US);
+
+       for (i = 0; i < COLI_TOUCH_NO_OF_AVGS; i++) {
+               error = iio_read_channel_raw(channel, &val);
+               if (error < 0) {
+                       value = error;
+                       goto error_iio_read;
+               }
+
+               value += val;
+       }
+
+       value /= COLI_TOUCH_NO_OF_AVGS;
+
+error_iio_read:
+       gpiod_set_value(plate_p, 0);
+       gpiod_set_value(plate_m, 0);
+
+       return value;
+}
+
+/*
+ * Enable touch detection using falling edge detection on XM
+ */
+static void vf50_ts_enable_touch_detection(struct vf50_touch_device *vf50_ts)
+{
+       /* Enable plate YM (needs to be strong GND, high active) */
+       gpiod_set_value(vf50_ts->gpio_ym, 1);
+
+       /*
+        * Let the platform mux to idle state in order to enable
+        * Pull-Up on GPIO
+        */
+       pinctrl_pm_select_idle_state(&vf50_ts->pdev->dev);
+
+       /* Wait for the pull-up to be stable on high */
+       usleep_range(COLI_PULLUP_MIN_DELAY_US, COLI_PULLUP_MAX_DELAY_US);
+}
+
+/*
+ * ADC touch screen sampling bottom half irq handler
+ */
+static irqreturn_t vf50_ts_irq_bh(int irq, void *private)
+{
+       struct vf50_touch_device *vf50_ts = private;
+       struct device *dev = &vf50_ts->pdev->dev;
+       int val_x, val_y, val_z1, val_z2, val_p = 0;
+       bool discard_val_on_start = true;
+
+       /* Disable the touch detection plates */
+       gpiod_set_value(vf50_ts->gpio_ym, 0);
+
+       /* Let the platform mux to default state in order to mux as ADC */
+       pinctrl_pm_select_default_state(dev);
+
+       while (!vf50_ts->stop_touchscreen) {
+               /* X-Direction */
+               val_x = adc_ts_measure(&vf50_ts->channels[0],
+                               vf50_ts->gpio_xp, vf50_ts->gpio_xm);
+               if (val_x < 0)
+                       break;
+
+               /* Y-Direction */
+               val_y = adc_ts_measure(&vf50_ts->channels[1],
+                               vf50_ts->gpio_yp, vf50_ts->gpio_ym);
+               if (val_y < 0)
+                       break;
+
+               /*
+                * Touch pressure
+                * Measure on XP/YM
+                */
+               val_z1 = adc_ts_measure(&vf50_ts->channels[2],
+                               vf50_ts->gpio_yp, vf50_ts->gpio_xm);
+               if (val_z1 < 0)
+                       break;
+               val_z2 = adc_ts_measure(&vf50_ts->channels[3],
+                               vf50_ts->gpio_yp, vf50_ts->gpio_xm);
+               if (val_z2 < 0)
+                       break;
+
+               /* Validate signal (avoid calculation using noise) */
+               if (val_z1 > 64 && val_x > 64) {
+                       /*
+                        * Calculate resistance between the plates
+                        * lower resistance means higher pressure
+                        */
+                       int r_x = (1000 * val_x) / VF_ADC_MAX;
+
+                       val_p = (r_x * val_z2) / val_z1 - r_x;
+
+               } else {
+                       val_p = 2000;
+               }
+
+               val_p = 2000 - val_p;
+               dev_dbg(dev,
+                       "Measured values: x: %d, y: %d, z1: %d, z2: %d, p: %d\n",
+                       val_x, val_y, val_z1, val_z2, val_p);
+
+               /*
+                * If touch pressure is too low, stop measuring and reenable
+                * touch detection
+                */
+               if (val_p < vf50_ts->min_pressure || val_p > 2000)
+                       break;
+
+               /*
+                * The pressure may not be enough for the first x and the
+                * second y measurement, but by the time the driver takes the
+                * third and fourth measurements it is. To take care of this,
+                * the first measurement is always dropped.
+                */
+               if (discard_val_on_start) {
+                       discard_val_on_start = false;
+               } else {
+                       /*
+                        * Report touch position and sleep for
+                        * the next measurement.
+                        */
+                       input_report_abs(vf50_ts->ts_input,
+                                       ABS_X, VF_ADC_MAX - val_x);
+                       input_report_abs(vf50_ts->ts_input,
+                                       ABS_Y, VF_ADC_MAX - val_y);
+                       input_report_abs(vf50_ts->ts_input,
+                                       ABS_PRESSURE, val_p);
+                       input_report_key(vf50_ts->ts_input, BTN_TOUCH, 1);
+                       input_sync(vf50_ts->ts_input);
+               }
+
+               usleep_range(COLI_PULLUP_MIN_DELAY_US,
+                            COLI_PULLUP_MAX_DELAY_US);
+       }
+
+       /* Report no more touch, re-enable touch detection */
+       input_report_abs(vf50_ts->ts_input, ABS_PRESSURE, 0);
+       input_report_key(vf50_ts->ts_input, BTN_TOUCH, 0);
+       input_sync(vf50_ts->ts_input);
+
+       vf50_ts_enable_touch_detection(vf50_ts);
+
+       return IRQ_HANDLED;
+}
+
+static int vf50_ts_open(struct input_dev *dev_input)
+{
+       struct vf50_touch_device *touchdev = input_get_drvdata(dev_input);
+       struct device *dev = &touchdev->pdev->dev;
+
+       dev_dbg(dev, "Input device %s opened, starting touch detection\n",
+               dev_input->name);
+
+       touchdev->stop_touchscreen = false;
+
+       /* Mux detection before request IRQ, wait for pull-up to settle */
+       vf50_ts_enable_touch_detection(touchdev);
+
+       return 0;
+}
+
+static void vf50_ts_close(struct input_dev *dev_input)
+{
+       struct vf50_touch_device *touchdev = input_get_drvdata(dev_input);
+       struct device *dev = &touchdev->pdev->dev;
+
+       touchdev->stop_touchscreen = true;
+
+       /* Make sure IRQ is not running past close */
+       mb();
+       synchronize_irq(touchdev->pen_irq);
+
+       gpiod_set_value(touchdev->gpio_ym, 0);
+       pinctrl_pm_select_default_state(dev);
+
+       dev_dbg(dev, "Input device %s closed, disable touch detection\n",
+               dev_input->name);
+}
+
+static int vf50_ts_get_gpiod(struct device *dev, struct gpio_desc **gpio_d,
+                            const char *con_id, enum gpiod_flags flags)
+{
+       int error;
+
+       *gpio_d = devm_gpiod_get(dev, con_id, flags);
+       if (IS_ERR(*gpio_d)) {
+               error = PTR_ERR(*gpio_d);
+               dev_err(dev, "Could not get gpio_%s %d\n", con_id, error);
+               return error;
+       }
+
+       return 0;
+}
+
+static void vf50_ts_channel_release(void *data)
+{
+       struct iio_channel *channels = data;
+
+       iio_channel_release_all(channels);
+}
+
+static int vf50_ts_probe(struct platform_device *pdev)
+{
+       struct input_dev *input;
+       struct iio_channel *channels;
+       struct device *dev = &pdev->dev;
+       struct vf50_touch_device *touchdev;
+       int num_adc_channels;
+       int error;
+
+       channels = iio_channel_get_all(dev);
+       if (IS_ERR(channels))
+               return PTR_ERR(channels);
+
+       error = devm_add_action(dev, vf50_ts_channel_release, channels);
+       if (error) {
+               iio_channel_release_all(channels);
+               dev_err(dev, "Failed to register iio channel release action");
+               return error;
+       }
+
+       num_adc_channels = 0;
+       while (channels[num_adc_channels].indio_dev)
+               num_adc_channels++;
+
+       if (num_adc_channels != COLI_TOUCH_REQ_ADC_CHAN) {
+               dev_err(dev, "Inadequate ADC channels specified\n");
+               return -EINVAL;
+       }
+
+       touchdev = devm_kzalloc(dev, sizeof(*touchdev), GFP_KERNEL);
+       if (!touchdev)
+               return -ENOMEM;
+
+       touchdev->pdev = pdev;
+       touchdev->channels = channels;
+
+       error = of_property_read_u32(dev->of_node, "vf50-ts-min-pressure",
+                                &touchdev->min_pressure);
+       if (error)
+               return error;
+
+       input = devm_input_allocate_device(dev);
+       if (!input) {
+               dev_err(dev, "Failed to allocate TS input device\n");
+               return -ENOMEM;
+       }
+
+       platform_set_drvdata(pdev, touchdev);
+
+       input->name = DRIVER_NAME;
+       input->id.bustype = BUS_HOST;
+       input->dev.parent = dev;
+       input->open = vf50_ts_open;
+       input->close = vf50_ts_close;
+
+       input_set_capability(input, EV_KEY, BTN_TOUCH);
+       input_set_abs_params(input, ABS_X, 0, VF_ADC_MAX, 0, 0);
+       input_set_abs_params(input, ABS_Y, 0, VF_ADC_MAX, 0, 0);
+       input_set_abs_params(input, ABS_PRESSURE, 0, VF_ADC_MAX, 0, 0);
+
+       touchdev->ts_input = input;
+       input_set_drvdata(input, touchdev);
+
+       error = input_register_device(input);
+       if (error) {
+               dev_err(dev, "Failed to register input device\n");
+               return error;
+       }
+
+       error = vf50_ts_get_gpiod(dev, &touchdev->gpio_xp, "xp", GPIOD_OUT_LOW);
+       if (error)
+               return error;
+
+       error = vf50_ts_get_gpiod(dev, &touchdev->gpio_xm,
+                               "xm", GPIOD_OUT_LOW);
+       if (error)
+               return error;
+
+       error = vf50_ts_get_gpiod(dev, &touchdev->gpio_yp, "yp", GPIOD_OUT_LOW);
+       if (error)
+               return error;
+
+       error = vf50_ts_get_gpiod(dev, &touchdev->gpio_ym, "ym", GPIOD_OUT_LOW);
+       if (error)
+               return error;
+
+       touchdev->pen_irq = platform_get_irq(pdev, 0);
+       if (touchdev->pen_irq < 0)
+               return touchdev->pen_irq;
+
+       error = devm_request_threaded_irq(dev, touchdev->pen_irq,
+                                         NULL, vf50_ts_irq_bh, IRQF_ONESHOT,
+                                         "vf50 touch", touchdev);
+       if (error) {
+               dev_err(dev, "Failed to request IRQ %d: %d\n",
+                       touchdev->pen_irq, error);
+               return error;
+       }
+
+       return 0;
+}
+
+static const struct of_device_id vf50_touch_of_match[] = {
+       { .compatible = "toradex,vf50-touchscreen", },
+       { }
+};
+MODULE_DEVICE_TABLE(of, vf50_touch_of_match);
+
+static struct platform_driver vf50_touch_driver = {
+       .driver = {
+               .name = "toradex,vf50_touchctrl",
+               .of_match_table = vf50_touch_of_match,
+       },
+       .probe = vf50_ts_probe,
+};
+module_platform_driver(vf50_touch_driver);
+
+MODULE_AUTHOR("Sanchayan Maity");
+MODULE_DESCRIPTION("Colibri VF50 Touchscreen driver");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(DRV_VERSION);
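
For the new Colibri VF50 driver above, the pressure heuristic in vf50_ts_irq_bh() is easiest to follow with concrete numbers; the readings below are made up (VF_ADC_MAX = 4095):

        /*
         *   val_x  = 2048  ->  r_x   = (1000 * 2048) / 4095      = 500
         *   val_z1 = 1000, val_z2 = 1500
         *                  ->  val_p = (500 * 1500) / 1000 - 500 = 250
         *   reported pressure       = 2000 - 250                 = 1750
         *
         * A lighter touch raises the plate resistance, which raises the raw
         * val_p and lowers the reported pressure; once it drops below the
         * vf50-ts-min-pressure threshold the sampling loop ends and touch
         * detection is re-armed.
         */
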
index 9a323dd915dea696b5749d77af8c05c5d7b3c093..a9f95c7d3c0066cbd257b80755efdbe683340dbf 100644 (file)
@@ -86,4 +86,3 @@ module_i2c_driver(cyttsp4_i2c_driver);
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("Cypress TrueTouch(R) Standard Product (TTSP) I2C driver");
 MODULE_AUTHOR("Cypress");
-MODULE_ALIAS("i2c:cyttsp4");
index 519e2de2f8dfc6107303412cef6c25bf3a770bf2..eee51b3f2e3f39382e3c421226b78de5e0c76e38 100644 (file)
@@ -86,4 +86,3 @@ module_i2c_driver(cyttsp_i2c_driver);
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("Cypress TrueTouch(R) Standard Product (TTSP) I2C driver");
 MODULE_AUTHOR("Cypress");
-MODULE_ALIAS("i2c:cyttsp");
index ddac134b25b108841e7e7797003fe9a6725e4576..17cc20ef4923bdb2fb2edcf770d7c303be9d6af5 100644 (file)
 #define ELAN_FW_PAGESIZE       132
 
 /* calibration timeout definition */
-#define ELAN_CALI_TIMEOUT_MSEC 10000
+#define ELAN_CALI_TIMEOUT_MSEC 12000
 
 #define ELAN_POWERON_DELAY_USEC        500
 #define ELAN_RESET_DELAY_MSEC  20
diff --git a/drivers/input/touchscreen/imx6ul_tsc.c b/drivers/input/touchscreen/imx6ul_tsc.c
new file mode 100644 (file)
index 0000000..ff0b758
--- /dev/null
@@ -0,0 +1,523 @@
+/*
+ * Freescale i.MX6UL touchscreen controller driver
+ *
+ * Copyright (C) 2015 Freescale Semiconductor, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/gpio/consumer.h>
+#include <linux/input.h>
+#include <linux/slab.h>
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <linux/of.h>
+#include <linux/interrupt.h>
+#include <linux/platform_device.h>
+#include <linux/clk.h>
+#include <linux/io.h>
+
+/* ADC configuration registers field define */
+#define ADC_AIEN               (0x1 << 7)
+#define ADC_CONV_DISABLE       0x1F
+#define ADC_CAL                        (0x1 << 7)
+#define ADC_CALF               0x2
+#define ADC_12BIT_MODE         (0x2 << 2)
+#define ADC_IPG_CLK            0x00
+#define ADC_CLK_DIV_8          (0x03 << 5)
+#define ADC_SHORT_SAMPLE_MODE  (0x0 << 4)
+#define ADC_HARDWARE_TRIGGER   (0x1 << 13)
+#define SELECT_CHANNEL_4       0x04
+#define SELECT_CHANNEL_1       0x01
+#define DISABLE_CONVERSION_INT (0x0 << 7)
+
+/* ADC registers */
+#define REG_ADC_HC0            0x00
+#define REG_ADC_HC1            0x04
+#define REG_ADC_HC2            0x08
+#define REG_ADC_HC3            0x0C
+#define REG_ADC_HC4            0x10
+#define REG_ADC_HS             0x14
+#define REG_ADC_R0             0x18
+#define REG_ADC_CFG            0x2C
+#define REG_ADC_GC             0x30
+#define REG_ADC_GS             0x34
+
+#define ADC_TIMEOUT            msecs_to_jiffies(100)
+
+/* TSC registers */
+#define REG_TSC_BASIC_SETING   0x00
+#define REG_TSC_PRE_CHARGE_TIME        0x10
+#define REG_TSC_FLOW_CONTROL   0x20
+#define REG_TSC_MEASURE_VALUE  0x30
+#define REG_TSC_INT_EN         0x40
+#define REG_TSC_INT_SIG_EN     0x50
+#define REG_TSC_INT_STATUS     0x60
+#define REG_TSC_DEBUG_MODE     0x70
+#define REG_TSC_DEBUG_MODE2    0x80
+
+/* TSC configuration registers field define */
+#define DETECT_4_WIRE_MODE     (0x0 << 4)
+#define AUTO_MEASURE           0x1
+#define MEASURE_SIGNAL         0x1
+#define DETECT_SIGNAL          (0x1 << 4)
+#define VALID_SIGNAL           (0x1 << 8)
+#define MEASURE_INT_EN         0x1
+#define MEASURE_SIG_EN         0x1
+#define VALID_SIG_EN           (0x1 << 8)
+#define DE_GLITCH_2            (0x2 << 29)
+#define START_SENSE            (0x1 << 12)
+#define TSC_DISABLE            (0x1 << 16)
+#define DETECT_MODE            0x2
+
+struct imx6ul_tsc {
+       struct device *dev;
+       struct input_dev *input;
+       void __iomem *tsc_regs;
+       void __iomem *adc_regs;
+       struct clk *tsc_clk;
+       struct clk *adc_clk;
+       struct gpio_desc *xnur_gpio;
+
+       int measure_delay_time;
+       int pre_charge_time;
+
+       struct completion completion;
+};
+
+/*
+ * The TSC module needs the ADC to obtain measurement values, so the
+ * ADC module must be initialized before the TSC is configured.
+ */
+static void imx6ul_adc_init(struct imx6ul_tsc *tsc)
+{
+       int adc_hc = 0;
+       int adc_gc;
+       int adc_gs;
+       int adc_cfg;
+       int timeout;
+
+       reinit_completion(&tsc->completion);
+
+       adc_cfg = readl(tsc->adc_regs + REG_ADC_CFG);
+       adc_cfg |= ADC_12BIT_MODE | ADC_IPG_CLK;
+       adc_cfg |= ADC_CLK_DIV_8 | ADC_SHORT_SAMPLE_MODE;
+       adc_cfg &= ~ADC_HARDWARE_TRIGGER;
+       writel(adc_cfg, tsc->adc_regs + REG_ADC_CFG);
+
+       /* enable calibration interrupt */
+       adc_hc |= ADC_AIEN;
+       adc_hc |= ADC_CONV_DISABLE;
+       writel(adc_hc, tsc->adc_regs + REG_ADC_HC0);
+
+       /* start ADC calibration */
+       adc_gc = readl(tsc->adc_regs + REG_ADC_GC);
+       adc_gc |= ADC_CAL;
+       writel(adc_gc, tsc->adc_regs + REG_ADC_GC);
+
+       timeout = wait_for_completion_timeout
+                       (&tsc->completion, ADC_TIMEOUT);
+       if (timeout == 0)
+               dev_err(tsc->dev, "Timeout for adc calibration\n");
+
+       adc_gs = readl(tsc->adc_regs + REG_ADC_GS);
+       if (adc_gs & ADC_CALF)
+               dev_err(tsc->dev, "ADC calibration failed\n");
+
+       /* the TSC needs the ADC to run in hardware-trigger mode */
+       adc_cfg = readl(tsc->adc_regs + REG_ADC_CFG);
+       adc_cfg |= ADC_HARDWARE_TRIGGER;
+       writel(adc_cfg, tsc->adc_regs + REG_ADC_CFG);
+}
+
+/*
+ * TSC workaround: the TSC currently has two ADC channels cross-wired,
+ * so this function remaps the channel configuration used for hardware
+ * triggering.
+ */
+static void imx6ul_tsc_channel_config(struct imx6ul_tsc *tsc)
+{
+       int adc_hc0, adc_hc1, adc_hc2, adc_hc3, adc_hc4;
+
+       adc_hc0 = DISABLE_CONVERSION_INT;
+       writel(adc_hc0, tsc->adc_regs + REG_ADC_HC0);
+
+       adc_hc1 = DISABLE_CONVERSION_INT | SELECT_CHANNEL_4;
+       writel(adc_hc1, tsc->adc_regs + REG_ADC_HC1);
+
+       adc_hc2 = DISABLE_CONVERSION_INT;
+       writel(adc_hc2, tsc->adc_regs + REG_ADC_HC2);
+
+       adc_hc3 = DISABLE_CONVERSION_INT | SELECT_CHANNEL_1;
+       writel(adc_hc3, tsc->adc_regs + REG_ADC_HC3);
+
+       adc_hc4 = DISABLE_CONVERSION_INT;
+       writel(adc_hc4, tsc->adc_regs + REG_ADC_HC4);
+}
+
+/*
+ * TSC setup: configure the pre-charge time and the measure delay time.
+ * Different touchscreens may need different values for both.
+ */
+static void imx6ul_tsc_set(struct imx6ul_tsc *tsc)
+{
+       int basic_setting = 0;
+       int start;
+
+       basic_setting |= tsc->measure_delay_time << 8;
+       basic_setting |= DETECT_4_WIRE_MODE | AUTO_MEASURE;
+       writel(basic_setting, tsc->tsc_regs + REG_TSC_BASIC_SETING);
+
+       writel(DE_GLITCH_2, tsc->tsc_regs + REG_TSC_DEBUG_MODE2);
+
+       writel(tsc->pre_charge_time, tsc->tsc_regs + REG_TSC_PRE_CHARGE_TIME);
+       writel(MEASURE_INT_EN, tsc->tsc_regs + REG_TSC_INT_EN);
+       writel(MEASURE_SIG_EN | VALID_SIG_EN,
+               tsc->tsc_regs + REG_TSC_INT_SIG_EN);
+
+       /* start sense detection */
+       start = readl(tsc->tsc_regs + REG_TSC_FLOW_CONTROL);
+       start |= START_SENSE;
+       start &= ~TSC_DISABLE;
+       writel(start, tsc->tsc_regs + REG_TSC_FLOW_CONTROL);
+}
+
+static void imx6ul_tsc_init(struct imx6ul_tsc *tsc)
+{
+       imx6ul_adc_init(tsc);
+       imx6ul_tsc_channel_config(tsc);
+       imx6ul_tsc_set(tsc);
+}
+
+static void imx6ul_tsc_disable(struct imx6ul_tsc *tsc)
+{
+       int tsc_flow;
+       int adc_cfg;
+
+       /* put the TSC controller into idle state */
+       tsc_flow = readl(tsc->tsc_regs + REG_TSC_FLOW_CONTROL);
+       tsc_flow |= TSC_DISABLE;
+       writel(tsc_flow, tsc->tsc_regs + REG_TSC_FLOW_CONTROL);
+
+       /* put the ADC controller into stop mode */
+       adc_cfg = readl(tsc->adc_regs + REG_ADC_HC0);
+       adc_cfg |= ADC_CONV_DISABLE;
+       writel(adc_cfg, tsc->adc_regs + REG_ADC_HC0);
+}
+
+/* Wait (at most 2 ms) for the pre-charge to complete. */
+static bool tsc_wait_detect_mode(struct imx6ul_tsc *tsc)
+{
+       unsigned long timeout = jiffies + msecs_to_jiffies(2);
+       int state_machine;
+       int debug_mode2;
+
+       do {
+               if (time_after(jiffies, timeout))
+                       return false;
+
+               usleep_range(200, 400);
+               debug_mode2 = readl(tsc->tsc_regs + REG_TSC_DEBUG_MODE2);
+               state_machine = (debug_mode2 >> 20) & 0x7;
+       } while (state_machine != DETECT_MODE);
+
+       usleep_range(200, 400);
+       return true;
+}
+
+static irqreturn_t tsc_irq_fn(int irq, void *dev_id)
+{
+       struct imx6ul_tsc *tsc = dev_id;
+       int status;
+       int value;
+       int x, y;
+       int start;
+
+       status = readl(tsc->tsc_regs + REG_TSC_INT_STATUS);
+
+       /* write 1 to clear the measure-signal and detect-signal bits */
+       writel(MEASURE_SIGNAL | DETECT_SIGNAL,
+               tsc->tsc_regs + REG_TSC_INT_STATUS);
+
+       /* START_SENSE is a self-clearing HW bit; set it to start sense detection */
+       start = readl(tsc->tsc_regs + REG_TSC_FLOW_CONTROL);
+       start |= START_SENSE;
+       writel(start, tsc->tsc_regs + REG_TSC_FLOW_CONTROL);
+
+       if (status & MEASURE_SIGNAL) {
+               value = readl(tsc->tsc_regs + REG_TSC_MEASURE_VALUE);
+               x = (value >> 16) & 0x0fff;
+               y = value & 0x0fff;
+
+               /*
+                * In detect mode we can read the xnur gpio value;
+                * otherwise assume the contact is still active.
+                */
+               if (!tsc_wait_detect_mode(tsc) ||
+                   gpiod_get_value_cansleep(tsc->xnur_gpio)) {
+                       input_report_key(tsc->input, BTN_TOUCH, 1);
+                       input_report_abs(tsc->input, ABS_X, x);
+                       input_report_abs(tsc->input, ABS_Y, y);
+               } else {
+                       input_report_key(tsc->input, BTN_TOUCH, 0);
+               }
+
+               input_sync(tsc->input);
+       }
+
+       return IRQ_HANDLED;
+}
+
+static irqreturn_t adc_irq_fn(int irq, void *dev_id)
+{
+       struct imx6ul_tsc *tsc = dev_id;
+       int coco;
+       int value;
+
+       coco = readl(tsc->adc_regs + REG_ADC_HS);
+       if (coco & 0x01) {
+               value = readl(tsc->adc_regs + REG_ADC_R0);
+               complete(&tsc->completion);
+       }
+
+       return IRQ_HANDLED;
+}
+
+static int imx6ul_tsc_open(struct input_dev *input_dev)
+{
+       struct imx6ul_tsc *tsc = input_get_drvdata(input_dev);
+       int err;
+
+       err = clk_prepare_enable(tsc->adc_clk);
+       if (err) {
+               dev_err(tsc->dev,
+                       "Could not prepare or enable the adc clock: %d\n",
+                       err);
+               return err;
+       }
+
+       err = clk_prepare_enable(tsc->tsc_clk);
+       if (err) {
+               dev_err(tsc->dev,
+                       "Could not prepare or enable the tsc clock: %d\n",
+                       err);
+               clk_disable_unprepare(tsc->adc_clk);
+               return err;
+       }
+
+       imx6ul_tsc_init(tsc);
+
+       return 0;
+}
+
+static void imx6ul_tsc_close(struct input_dev *input_dev)
+{
+       struct imx6ul_tsc *tsc = input_get_drvdata(input_dev);
+
+       imx6ul_tsc_disable(tsc);
+
+       clk_disable_unprepare(tsc->tsc_clk);
+       clk_disable_unprepare(tsc->adc_clk);
+}
+
+static int imx6ul_tsc_probe(struct platform_device *pdev)
+{
+       struct device_node *np = pdev->dev.of_node;
+       struct imx6ul_tsc *tsc;
+       struct input_dev *input_dev;
+       struct resource *tsc_mem;
+       struct resource *adc_mem;
+       int err;
+       int tsc_irq;
+       int adc_irq;
+
+       tsc = devm_kzalloc(&pdev->dev, sizeof(struct imx6ul_tsc), GFP_KERNEL);
+       if (!tsc)
+               return -ENOMEM;
+
+       input_dev = devm_input_allocate_device(&pdev->dev);
+       if (!input_dev)
+               return -ENOMEM;
+
+       input_dev->name = "iMX6UL TouchScreen Controller";
+       input_dev->id.bustype = BUS_HOST;
+
+       input_dev->open = imx6ul_tsc_open;
+       input_dev->close = imx6ul_tsc_close;
+
+       input_set_capability(input_dev, EV_KEY, BTN_TOUCH);
+       input_set_abs_params(input_dev, ABS_X, 0, 0xFFF, 0, 0);
+       input_set_abs_params(input_dev, ABS_Y, 0, 0xFFF, 0, 0);
+
+       input_set_drvdata(input_dev, tsc);
+
+       tsc->dev = &pdev->dev;
+       tsc->input = input_dev;
+       init_completion(&tsc->completion);
+
+       tsc->xnur_gpio = devm_gpiod_get(&pdev->dev, "xnur", GPIOD_IN);
+       if (IS_ERR(tsc->xnur_gpio)) {
+               err = PTR_ERR(tsc->xnur_gpio);
+               dev_err(&pdev->dev,
+                       "failed to request GPIO tsc_X- (xnur): %d\n", err);
+               return err;
+       }
+
+       tsc_mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       tsc->tsc_regs = devm_ioremap_resource(&pdev->dev, tsc_mem);
+       if (IS_ERR(tsc->tsc_regs)) {
+               err = PTR_ERR(tsc->tsc_regs);
+               dev_err(&pdev->dev, "failed to remap tsc memory: %d\n", err);
+               return err;
+       }
+
+       adc_mem = platform_get_resource(pdev, IORESOURCE_MEM, 1);
+       tsc->adc_regs = devm_ioremap_resource(&pdev->dev, adc_mem);
+       if (IS_ERR(tsc->adc_regs)) {
+               err = PTR_ERR(tsc->adc_regs);
+               dev_err(&pdev->dev, "failed to remap adc memory: %d\n", err);
+               return err;
+       }
+
+       tsc->tsc_clk = devm_clk_get(&pdev->dev, "tsc");
+       if (IS_ERR(tsc->tsc_clk)) {
+               err = PTR_ERR(tsc->tsc_clk);
+               dev_err(&pdev->dev, "failed getting tsc clock: %d\n", err);
+               return err;
+       }
+
+       tsc->adc_clk = devm_clk_get(&pdev->dev, "adc");
+       if (IS_ERR(tsc->adc_clk)) {
+               err = PTR_ERR(tsc->adc_clk);
+               dev_err(&pdev->dev, "failed getting adc clock: %d\n", err);
+               return err;
+       }
+
+       tsc_irq = platform_get_irq(pdev, 0);
+       if (tsc_irq < 0) {
+               dev_err(&pdev->dev, "no tsc irq resource?\n");
+               return tsc_irq;
+       }
+
+       adc_irq = platform_get_irq(pdev, 1);
+       if (adc_irq <= 0) {
+               dev_err(&pdev->dev, "no adc irq resource?\n");
+               return adc_irq;
+       }
+
+       err = devm_request_threaded_irq(tsc->dev, tsc_irq,
+                                       NULL, tsc_irq_fn, IRQF_ONESHOT,
+                                       dev_name(&pdev->dev), tsc);
+       if (err) {
+               dev_err(&pdev->dev,
+                       "failed requesting tsc irq %d: %d\n",
+                       tsc_irq, err);
+               return err;
+       }
+
+       err = devm_request_irq(tsc->dev, adc_irq, adc_irq_fn, 0,
+                               dev_name(&pdev->dev), tsc);
+       if (err) {
+               dev_err(&pdev->dev,
+                       "failed requesting adc irq %d: %d\n",
+                       adc_irq, err);
+               return err;
+       }
+
+       err = of_property_read_u32(np, "measure-delay-time",
+                                  &tsc->measure_delay_time);
+       if (err)
+               tsc->measure_delay_time = 0xffff;
+
+       err = of_property_read_u32(np, "pre-charge-time",
+                                  &tsc->pre_charge_time);
+       if (err)
+               tsc->pre_charge_time = 0xfff;
+
+       err = input_register_device(tsc->input);
+       if (err) {
+               dev_err(&pdev->dev,
+                       "failed to register input device: %d\n", err);
+               return err;
+       }
+
+       platform_set_drvdata(pdev, tsc);
+       return 0;
+}
+
+static int __maybe_unused imx6ul_tsc_suspend(struct device *dev)
+{
+       struct platform_device *pdev = to_platform_device(dev);
+       struct imx6ul_tsc *tsc = platform_get_drvdata(pdev);
+       struct input_dev *input_dev = tsc->input;
+
+       mutex_lock(&input_dev->mutex);
+
+       if (input_dev->users) {
+               imx6ul_tsc_disable(tsc);
+
+               clk_disable_unprepare(tsc->tsc_clk);
+               clk_disable_unprepare(tsc->adc_clk);
+       }
+
+       mutex_unlock(&input_dev->mutex);
+
+       return 0;
+}
+
+static int __maybe_unused imx6ul_tsc_resume(struct device *dev)
+{
+       struct platform_device *pdev = to_platform_device(dev);
+       struct imx6ul_tsc *tsc = platform_get_drvdata(pdev);
+       struct input_dev *input_dev = tsc->input;
+       int retval = 0;
+
+       mutex_lock(&input_dev->mutex);
+
+       if (input_dev->users) {
+               retval = clk_prepare_enable(tsc->adc_clk);
+               if (retval)
+                       goto out;
+
+               retval = clk_prepare_enable(tsc->tsc_clk);
+               if (retval) {
+                       clk_disable_unprepare(tsc->adc_clk);
+                       goto out;
+               }
+
+               imx6ul_tsc_init(tsc);
+       }
+
+out:
+       mutex_unlock(&input_dev->mutex);
+       return retval;
+}
+
+static SIMPLE_DEV_PM_OPS(imx6ul_tsc_pm_ops,
+                        imx6ul_tsc_suspend, imx6ul_tsc_resume);
+
+static const struct of_device_id imx6ul_tsc_match[] = {
+       { .compatible = "fsl,imx6ul-tsc", },
+       { /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(of, imx6ul_tsc_match);
+
+static struct platform_driver imx6ul_tsc_driver = {
+       .driver         = {
+               .name   = "imx6ul-tsc",
+               .of_match_table = imx6ul_tsc_match,
+               .pm     = &imx6ul_tsc_pm_ops,
+       },
+       .probe          = imx6ul_tsc_probe,
+};
+module_platform_driver(imx6ul_tsc_driver);
+
+MODULE_AUTHOR("Haibo Chen <haibo.chen@freescale.com>");
+MODULE_DESCRIPTION("Freescale i.MX6UL Touchscreen controller driver");
+MODULE_LICENSE("GPL v2");
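In tsc_irq_fn() above, the word read from REG_TSC_MEASURE_VALUE carries both 12-bit ADC samples. A minimal sketch of that decoding pulled into a helper; the helper name is hypothetical and only illustrates the bit layout the handler uses:

static inline void imx6ul_tsc_decode_sample(u32 value, int *x, int *y)
{
	*x = (value >> 16) & 0x0fff;	/* X sample, bits 27..16 */
	*y = value & 0x0fff;		/* Y sample, bits 11..0 */
}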
index c0116994067d5cf46503a14d51de62a33a2ea05b..485794376ee5e656d91e89b94998d7a44ffc9992 100644 (file)
@@ -191,7 +191,7 @@ static void sun4i_ts_close(struct input_dev *dev)
        writel(TEMP_IRQ_EN(1), ts->base + TP_INT_FIFOC);
 }
 
-static int sun4i_get_temp(const struct sun4i_ts_data *ts, long *temp)
+static int sun4i_get_temp(const struct sun4i_ts_data *ts, int *temp)
 {
        /* No temp_data until the first irq */
        if (ts->temp_data == -1)
@@ -202,7 +202,7 @@ static int sun4i_get_temp(const struct sun4i_ts_data *ts, long *temp)
        return 0;
 }
 
-static int sun4i_get_tz_temp(void *data, long *temp)
+static int sun4i_get_tz_temp(void *data, int *temp)
 {
        return sun4i_get_temp(data, temp);
 }
@@ -215,14 +215,14 @@ static ssize_t show_temp(struct device *dev, struct device_attribute *devattr,
                         char *buf)
 {
        struct sun4i_ts_data *ts = dev_get_drvdata(dev);
-       long temp;
+       int temp;
        int error;
 
        error = sun4i_get_temp(ts, &temp);
        if (error)
                return error;
 
-       return sprintf(buf, "%ld\n", temp);
+       return sprintf(buf, "%d\n", temp);
 }
 
 static ssize_t show_temp_label(struct device *dev,
index f491aec95160df84290237020bd4f2447af320dd..4664c2a96c67fee361c3476ddc8f9a8e8842d271 100644 (file)
@@ -23,7 +23,8 @@ config IOMMU_IO_PGTABLE
 config IOMMU_IO_PGTABLE_LPAE
        bool "ARMv7/v8 Long Descriptor Format"
        select IOMMU_IO_PGTABLE
-       depends on ARM || ARM64 || COMPILE_TEST
+       # SWIOTLB guarantees a dma_to_phys() implementation
+       depends on ARM || ARM64 || (COMPILE_TEST && SWIOTLB)
        help
          Enable support for the ARM long descriptor pagetable format.
          This allocator supports 4K/2M/1G, 16K/32M and 64K/512M page
index 658ee39e65696898422bcd9c825d8a49fbc37359..f82060e778a23bb7a8901ef2356d42b5363d93a6 100644 (file)
@@ -1835,8 +1835,8 @@ static void free_gcr3_table(struct protection_domain *domain)
                free_gcr3_tbl_level2(domain->gcr3_tbl);
        else if (domain->glx == 1)
                free_gcr3_tbl_level1(domain->gcr3_tbl);
-       else if (domain->glx != 0)
-               BUG();
+       else
+               BUG_ON(domain->glx != 0);
 
        free_page((unsigned long)domain->gcr3_tbl);
 }
@@ -3947,11 +3947,6 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq,
        if (ret < 0)
                return ret;
 
-       ret = -ENOMEM;
-       data = kzalloc(sizeof(*data), GFP_KERNEL);
-       if (!data)
-               goto out_free_parent;
-
        if (info->type == X86_IRQ_ALLOC_TYPE_IOAPIC) {
                if (get_irq_table(devid, true))
                        index = info->ioapic_pin;
@@ -3962,7 +3957,6 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq,
        }
        if (index < 0) {
                pr_warn("Failed to allocate IRTE\n");
-               kfree(data);
                goto out_free_parent;
        }
 
@@ -3974,17 +3968,18 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq,
                        goto out_free_data;
                }
 
-               if (i > 0) {
-                       data = kzalloc(sizeof(*data), GFP_KERNEL);
-                       if (!data)
-                               goto out_free_data;
-               }
+               ret = -ENOMEM;
+               data = kzalloc(sizeof(*data), GFP_KERNEL);
+               if (!data)
+                       goto out_free_data;
+
                irq_data->hwirq = (devid << 16) + i;
                irq_data->chip_data = data;
                irq_data->chip = &amd_ir_chip;
                irq_remapping_prepare_irte(data, cfg, info, devid, index, i);
                irq_set_status_flags(virq + i, IRQ_MOVE_PCNTXT);
        }
+
        return 0;
 
 out_free_data:
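The hunk above moves the chip_data allocation inside the per-interrupt loop, so each descriptor owns its own allocation and a failed kzalloc() unwinds through out_free_data. A minimal sketch of that allocate-per-iteration idiom with explicit unwinding; all names below are hypothetical, not the driver's:

struct demo_chip_data {
	int payload;				/* placeholder per-interrupt state */
};

static int demo_alloc_chip_data(unsigned int virq, unsigned int count)
{
	struct demo_chip_data *data;
	unsigned int i;

	for (i = 0; i < count; i++) {
		data = kzalloc(sizeof(*data), GFP_KERNEL);
		if (!data)
			goto out_free;		/* unwind interrupts 0..i-1 */
		irq_set_chip_data(virq + i, data);
	}
	return 0;

out_free:
	while (i--)
		kfree(irq_get_chip_data(virq + i));
	return -ENOMEM;
}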
index a24495eb4e26c5c596efa79c084b14c19fe5932c..5ef347a13cb5d54789c07869b0527d81cb24365e 100644 (file)
@@ -154,7 +154,7 @@ bool amd_iommu_iotlb_sup __read_mostly = true;
 u32 amd_iommu_max_pasid __read_mostly = ~0;
 
 bool amd_iommu_v2_present __read_mostly;
-bool amd_iommu_pc_present __read_mostly;
+static bool amd_iommu_pc_present __read_mostly;
 
 bool amd_iommu_force_isolation __read_mostly;
 
index f7b875bb70d42138027f49ebde8150d27ce14cd2..1131664b918b0a574c7cc654a6a3cd04107f8e81 100644 (file)
@@ -356,8 +356,8 @@ static void free_pasid_states(struct device_state *dev_state)
                free_pasid_states_level2(dev_state->states);
        else if (dev_state->pasid_levels == 1)
                free_pasid_states_level1(dev_state->states);
-       else if (dev_state->pasid_levels != 0)
-               BUG();
+       else
+               BUG_ON(dev_state->pasid_levels != 0);
 
        free_page((unsigned long)dev_state->states);
 }
index da902baaa7946aac569b7ebe8a316c647dfd8187..dafaf59dc3b82833fb78d55e8f194ff728999d35 100644 (file)
 
 #define ARM_SMMU_IRQ_CTRL              0x50
 #define IRQ_CTRL_EVTQ_IRQEN            (1 << 2)
+#define IRQ_CTRL_PRIQ_IRQEN            (1 << 1)
 #define IRQ_CTRL_GERROR_IRQEN          (1 << 0)
 
 #define ARM_SMMU_IRQ_CTRLACK           0x54
 #define ARM_SMMU_PRIQ_IRQ_CFG2         0xdc
 
 /* Common MSI config fields */
-#define MSI_CFG0_SH_SHIFT              60
-#define MSI_CFG0_SH_NSH                        (0UL << MSI_CFG0_SH_SHIFT)
-#define MSI_CFG0_SH_OSH                        (2UL << MSI_CFG0_SH_SHIFT)
-#define MSI_CFG0_SH_ISH                        (3UL << MSI_CFG0_SH_SHIFT)
-#define MSI_CFG0_MEMATTR_SHIFT         56
-#define MSI_CFG0_MEMATTR_DEVICE_nGnRE  (0x1 << MSI_CFG0_MEMATTR_SHIFT)
 #define MSI_CFG0_ADDR_SHIFT            2
 #define MSI_CFG0_ADDR_MASK             0x3fffffffffffUL
+#define MSI_CFG2_SH_SHIFT              4
+#define MSI_CFG2_SH_NSH                        (0UL << MSI_CFG2_SH_SHIFT)
+#define MSI_CFG2_SH_OSH                        (2UL << MSI_CFG2_SH_SHIFT)
+#define MSI_CFG2_SH_ISH                        (3UL << MSI_CFG2_SH_SHIFT)
+#define MSI_CFG2_MEMATTR_SHIFT         0
+#define MSI_CFG2_MEMATTR_DEVICE_nGnRE  (0x1 << MSI_CFG2_MEMATTR_SHIFT)
 
 #define Q_IDX(q, p)                    ((p) & ((1 << (q)->max_n_shift) - 1))
 #define Q_WRP(q, p)                    ((p) & (1 << (q)->max_n_shift))
@@ -1330,33 +1331,10 @@ static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size,
        arm_smmu_cmdq_issue_cmd(smmu, &cmd);
 }
 
-static void arm_smmu_flush_pgtable(void *addr, size_t size, void *cookie)
-{
-       struct arm_smmu_domain *smmu_domain = cookie;
-       struct arm_smmu_device *smmu = smmu_domain->smmu;
-       unsigned long offset = (unsigned long)addr & ~PAGE_MASK;
-
-       if (smmu->features & ARM_SMMU_FEAT_COHERENCY) {
-               dsb(ishst);
-       } else {
-               dma_addr_t dma_addr;
-               struct device *dev = smmu->dev;
-
-               dma_addr = dma_map_page(dev, virt_to_page(addr), offset, size,
-                                       DMA_TO_DEVICE);
-
-               if (dma_mapping_error(dev, dma_addr))
-                       dev_err(dev, "failed to flush pgtable at %p\n", addr);
-               else
-                       dma_unmap_page(dev, dma_addr, size, DMA_TO_DEVICE);
-       }
-}
-
 static struct iommu_gather_ops arm_smmu_gather_ops = {
        .tlb_flush_all  = arm_smmu_tlb_inv_context,
        .tlb_add_flush  = arm_smmu_tlb_inv_range_nosync,
        .tlb_sync       = arm_smmu_tlb_sync,
-       .flush_pgtable  = arm_smmu_flush_pgtable,
 };
 
 /* IOMMU API */
@@ -1531,6 +1509,7 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain)
                .ias            = ias,
                .oas            = oas,
                .tlb            = &arm_smmu_gather_ops,
+               .iommu_dev      = smmu->dev,
        };
 
        pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain);
@@ -2053,9 +2032,17 @@ static int arm_smmu_init_strtab_2lvl(struct arm_smmu_device *smmu)
        int ret;
        struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg;
 
-       /* Calculate the L1 size, capped to the SIDSIZE */
-       size = STRTAB_L1_SZ_SHIFT - (ilog2(STRTAB_L1_DESC_DWORDS) + 3);
-       size = min(size, smmu->sid_bits - STRTAB_SPLIT);
+       /*
+        * If we can resolve everything with a single L2 table, then we
+        * just need a single L1 descriptor. Otherwise, calculate the L1
+        * size, capped to the SIDSIZE.
+        */
+       if (smmu->sid_bits < STRTAB_SPLIT) {
+               size = 0;
+       } else {
+               size = STRTAB_L1_SZ_SHIFT - (ilog2(STRTAB_L1_DESC_DWORDS) + 3);
+               size = min(size, smmu->sid_bits - STRTAB_SPLIT);
+       }
        cfg->num_l1_ents = 1 << size;
 
        size += STRTAB_SPLIT;
@@ -2198,6 +2185,7 @@ static int arm_smmu_write_reg_sync(struct arm_smmu_device *smmu, u32 val,
 static int arm_smmu_setup_irqs(struct arm_smmu_device *smmu)
 {
        int ret, irq;
+       u32 irqen_flags = IRQ_CTRL_EVTQ_IRQEN | IRQ_CTRL_GERROR_IRQEN;
 
        /* Disable IRQs first */
        ret = arm_smmu_write_reg_sync(smmu, 0, ARM_SMMU_IRQ_CTRL,
@@ -2252,13 +2240,13 @@ static int arm_smmu_setup_irqs(struct arm_smmu_device *smmu)
                        if (IS_ERR_VALUE(ret))
                                dev_warn(smmu->dev,
                                         "failed to enable priq irq\n");
+                       else
+                               irqen_flags |= IRQ_CTRL_PRIQ_IRQEN;
                }
        }
 
        /* Enable interrupt generation on the SMMU */
-       ret = arm_smmu_write_reg_sync(smmu,
-                                     IRQ_CTRL_EVTQ_IRQEN |
-                                     IRQ_CTRL_GERROR_IRQEN,
+       ret = arm_smmu_write_reg_sync(smmu, irqen_flags,
                                      ARM_SMMU_IRQ_CTRL, ARM_SMMU_IRQ_CTRLACK);
        if (ret)
                dev_warn(smmu->dev, "failed to enable irqs\n");
@@ -2540,12 +2528,12 @@ static int arm_smmu_device_probe(struct arm_smmu_device *smmu)
        case IDR5_OAS_44_BIT:
                smmu->oas = 44;
                break;
+       default:
+               dev_info(smmu->dev,
+                       "unknown output address size. Truncating to 48-bit\n");
+               /* Fallthrough */
        case IDR5_OAS_48_BIT:
                smmu->oas = 48;
-               break;
-       default:
-               dev_err(smmu->dev, "unknown output address size!\n");
-               return -ENXIO;
        }
 
        /* Set the DMA mask for our table walker */
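The arm_smmu_init_strtab_2lvl() hunk above special-cases SMMUs whose StreamID space fits in a single level-2 table. A standalone sketch of that sizing decision with illustrative numbers; STRTAB_SPLIT is 8 in this driver, and the level-1 capacity below is an assumed stand-in for the STRTAB_L1_SZ_SHIFT-derived cap:

static unsigned int demo_strtab_l1_ents(unsigned int sid_bits)
{
	const unsigned int split = 8;	/* StreamIDs per L2 table: 1 << 8 */
	const unsigned int l1_cap = 17;	/* assumed cap from the L1 table size */
	unsigned int size;

	if (sid_bits < split)
		size = 0;		/* one L2 table already covers every SID */
	else
		size = sid_bits - split < l1_cap ? sid_bits - split : l1_cap;

	/* e.g. sid_bits = 6 -> 1 L1 descriptor; sid_bits = 16 -> 256 */
	return 1u << size;
}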
index 4cd0c29cb585000c0e5899651948ad1dc2ffbf1f..48a39dfa977795deb8271dc5b34a7b3e0be002d0 100644 (file)
@@ -37,6 +37,7 @@
 #include <linux/iopoll.h>
 #include <linux/module.h>
 #include <linux/of.h>
+#include <linux/of_address.h>
 #include <linux/pci.h>
 #include <linux/platform_device.h>
 #include <linux/slab.h>
@@ -607,34 +608,10 @@ static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size,
        }
 }
 
-static void arm_smmu_flush_pgtable(void *addr, size_t size, void *cookie)
-{
-       struct arm_smmu_domain *smmu_domain = cookie;
-       struct arm_smmu_device *smmu = smmu_domain->smmu;
-       unsigned long offset = (unsigned long)addr & ~PAGE_MASK;
-
-
-       /* Ensure new page tables are visible to the hardware walker */
-       if (smmu->features & ARM_SMMU_FEAT_COHERENT_WALK) {
-               dsb(ishst);
-       } else {
-               /*
-                * If the SMMU can't walk tables in the CPU caches, treat them
-                * like non-coherent DMA since we need to flush the new entries
-                * all the way out to memory. There's no possibility of
-                * recursion here as the SMMU table walker will not be wired
-                * through another SMMU.
-                */
-               dma_map_page(smmu->dev, virt_to_page(addr), offset, size,
-                            DMA_TO_DEVICE);
-       }
-}
-
 static struct iommu_gather_ops arm_smmu_gather_ops = {
        .tlb_flush_all  = arm_smmu_tlb_inv_context,
        .tlb_add_flush  = arm_smmu_tlb_inv_range_nosync,
        .tlb_sync       = arm_smmu_tlb_sync,
-       .flush_pgtable  = arm_smmu_flush_pgtable,
 };
 
 static irqreturn_t arm_smmu_context_fault(int irq, void *dev)
@@ -898,6 +875,7 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain,
                .ias            = ias,
                .oas            = oas,
                .tlb            = &arm_smmu_gather_ops,
+               .iommu_dev      = smmu->dev,
        };
 
        smmu_domain->smmu = smmu;
@@ -1532,6 +1510,7 @@ static int arm_smmu_device_cfg_probe(struct arm_smmu_device *smmu)
        unsigned long size;
        void __iomem *gr0_base = ARM_SMMU_GR0(smmu);
        u32 id;
+       bool cttw_dt, cttw_reg;
 
        dev_notice(smmu->dev, "probing hardware configuration...\n");
        dev_notice(smmu->dev, "SMMUv%d with:\n", smmu->version);
@@ -1571,10 +1550,22 @@ static int arm_smmu_device_cfg_probe(struct arm_smmu_device *smmu)
                dev_notice(smmu->dev, "\taddress translation ops\n");
        }
 
-       if (id & ID0_CTTW) {
+       /*
+        * In order for DMA API calls to work properly, we must defer to what
+        * the DT says about coherency, regardless of what the hardware claims.
+        * Fortunately, this also opens up a workaround for systems where the
+        * ID register value has ended up configured incorrectly.
+        */
+       cttw_dt = of_dma_is_coherent(smmu->dev->of_node);
+       cttw_reg = !!(id & ID0_CTTW);
+       if (cttw_dt)
                smmu->features |= ARM_SMMU_FEAT_COHERENT_WALK;
-               dev_notice(smmu->dev, "\tcoherent table walk\n");
-       }
+       if (cttw_dt || cttw_reg)
+               dev_notice(smmu->dev, "\t%scoherent table walk\n",
+                          cttw_dt ? "" : "non-");
+       if (cttw_dt != cttw_reg)
+               dev_notice(smmu->dev,
+                          "\t(IDR0.CTTW overridden by dma-coherent property)\n");
 
        if (id & ID0_SMS) {
                u32 smr, sid, mask;
index c9db04d4ef39ae36553279859b6ca0f1c7972db1..8757f8dfc4e57afee580fc68d76b88658467dea5 100644 (file)
@@ -1068,7 +1068,7 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd)
        if (intel_iommu_enabled)
                iommu->iommu_dev = iommu_device_create(NULL, iommu,
                                                       intel_iommu_groups,
-                                                      iommu->name);
+                                                      "%s", iommu->name);
 
        return 0;
 
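The one-line change above passes iommu->name as a "%s" argument instead of using it as the format string itself. A minimal sketch of the difference, using dev_set_name() (also printf-style) purely for illustration:

static void demo_set_name_unsafe(struct device *dev, const char *name)
{
	dev_set_name(dev, name);	/* a '%' inside 'name' would be interpreted */
}

static void demo_set_name_safe(struct device *dev, const char *name)
{
	dev_set_name(dev, "%s", name);	/* 'name' is treated as plain data */
}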
index abeedc9a78c27c4e8ee419b2571d9a88fbb4fa74..2570f2a25dc432606e283d1dc7dd450e6fec3bd3 100644 (file)
@@ -41,7 +41,6 @@ struct pamu_isr_data {
 
 static struct paace *ppaact;
 static struct paace *spaact;
-static struct ome *omt __initdata;
 
 /*
  * Table for matching compatible strings, for device tree
@@ -50,7 +49,7 @@ static struct ome *omt __initdata;
  * SOCs. For the older SOCs "fsl,qoriq-device-config-1.0"
  * string would be used.
  */
-static const struct of_device_id guts_device_ids[] __initconst = {
+static const struct of_device_id guts_device_ids[] = {
        { .compatible = "fsl,qoriq-device-config-1.0", },
        { .compatible = "fsl,qoriq-device-config-2.0", },
        {}
@@ -599,7 +598,7 @@ found_cpu_node:
  * Memory accesses to QMAN and BMAN private memory need not be coherent, so
  * clear the PAACE entry coherency attribute for them.
  */
-static void __init setup_qbman_paace(struct paace *ppaace, int  paace_type)
+static void setup_qbman_paace(struct paace *ppaace, int  paace_type)
 {
        switch (paace_type) {
        case QMAN_PAACE:
@@ -629,7 +628,7 @@ static void __init setup_qbman_paace(struct paace *ppaace, int  paace_type)
  * this table to translate device transaction to appropriate corenet
  * transaction.
  */
-static void __init setup_omt(struct ome *omt)
+static void setup_omt(struct ome *omt)
 {
        struct ome *ome;
 
@@ -666,7 +665,7 @@ static void __init setup_omt(struct ome *omt)
  * Get the maximum number of PAACT table entries
  * and subwindows supported by PAMU
  */
-static void __init get_pamu_cap_values(unsigned long pamu_reg_base)
+static void get_pamu_cap_values(unsigned long pamu_reg_base)
 {
        u32 pc_val;
 
@@ -676,9 +675,9 @@ static void __init get_pamu_cap_values(unsigned long pamu_reg_base)
 }
 
 /* Setup PAMU registers pointing to PAACT, SPAACT and OMT */
-static int __init setup_one_pamu(unsigned long pamu_reg_base, unsigned long pamu_reg_size,
-                                phys_addr_t ppaact_phys, phys_addr_t spaact_phys,
-                                phys_addr_t omt_phys)
+static int setup_one_pamu(unsigned long pamu_reg_base, unsigned long pamu_reg_size,
+                         phys_addr_t ppaact_phys, phys_addr_t spaact_phys,
+                         phys_addr_t omt_phys)
 {
        u32 *pc;
        struct pamu_mmap_regs *pamu_regs;
@@ -720,7 +719,7 @@ static int __init setup_one_pamu(unsigned long pamu_reg_base, unsigned long pamu
 }
 
 /* Enable all device LIODNS */
-static void __init setup_liodns(void)
+static void setup_liodns(void)
 {
        int i, len;
        struct paace *ppaace;
@@ -846,7 +845,7 @@ struct ccsr_law {
 /*
  * Create a coherence subdomain for a given memory block.
  */
-static int __init create_csd(phys_addr_t phys, size_t size, u32 csd_port_id)
+static int create_csd(phys_addr_t phys, size_t size, u32 csd_port_id)
 {
        struct device_node *np;
        const __be32 *iprop;
@@ -988,7 +987,7 @@ error:
 static const struct {
        u32 svr;
        u32 port_id;
-} port_id_map[] __initconst = {
+} port_id_map[] = {
        {(SVR_P2040 << 8) | 0x10, 0xFF000000},  /* P2040 1.0 */
        {(SVR_P2040 << 8) | 0x11, 0xFF000000},  /* P2040 1.1 */
        {(SVR_P2041 << 8) | 0x10, 0xFF000000},  /* P2041 1.0 */
@@ -1006,7 +1005,7 @@ static const struct {
 
 #define SVR_SECURITY   0x80000 /* The Security (E) bit */
 
-static int __init fsl_pamu_probe(struct platform_device *pdev)
+static int fsl_pamu_probe(struct platform_device *pdev)
 {
        struct device *dev = &pdev->dev;
        void __iomem *pamu_regs = NULL;
@@ -1022,6 +1021,7 @@ static int __init fsl_pamu_probe(struct platform_device *pdev)
        int irq;
        phys_addr_t ppaact_phys;
        phys_addr_t spaact_phys;
+       struct ome *omt;
        phys_addr_t omt_phys;
        size_t mem_size = 0;
        unsigned int order = 0;
@@ -1200,7 +1200,7 @@ error:
        return ret;
 }
 
-static struct platform_driver fsl_of_pamu_driver __initdata = {
+static struct platform_driver fsl_of_pamu_driver = {
        .driver = {
                .name = "fsl-of-pamu",
        },
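The fsl_pamu hunks above drop the __init/__initdata/__initconst annotations from the probe path and the tables it touches. A sketch of why such annotations are generally unsafe on callbacks reachable after boot; the driver and names below are hypothetical:

static int __init demo_probe(struct platform_device *pdev)
{
	return 0;	/* lives in .init.text, discarded by free_initmem() */
}

static struct platform_driver demo_driver = {
	.driver	= { .name = "demo" },
	/*
	 * The driver stays registered after boot, so a late or deferred
	 * probe would jump into memory that has already been freed.
	 */
	.probe	= demo_probe,
};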
index c82ebee6c7e5c8b2dc968d9108ac7d829eab46b6..2d7349a3ee1496408f051b4da8accebc8dd02ec1 100644 (file)
@@ -364,7 +364,8 @@ static inline int first_pte_in_page(struct dma_pte *pte)
 static struct dmar_domain *si_domain;
 static int hw_pass_through = 1;
 
-/* domain represents a virtual machine, more than one devices
+/*
+ * Domain represents a virtual machine; more than one device across
+ * iommus may be owned by one domain, e.g. a kvm guest.
  */
 #define DOMAIN_FLAG_VIRTUAL_MACHINE    (1 << 0)
@@ -372,11 +373,21 @@ static int hw_pass_through = 1;
 /* si_domain contains multiple devices */
 #define DOMAIN_FLAG_STATIC_IDENTITY    (1 << 1)
 
+#define for_each_domain_iommu(idx, domain)                     \
+       for (idx = 0; idx < g_num_of_iommus; idx++)             \
+               if (domain->iommu_refcnt[idx])
+
 struct dmar_domain {
-       int     id;                     /* domain id */
        int     nid;                    /* node id */
-       DECLARE_BITMAP(iommu_bmp, DMAR_UNITS_SUPPORTED);
-                                       /* bitmap of iommus this domain uses*/
+
+       unsigned        iommu_refcnt[DMAR_UNITS_SUPPORTED];
+                                       /* Refcount of devices per iommu */
+
+
+       u16             iommu_did[DMAR_UNITS_SUPPORTED];
+                                       /* Domain ids per IOMMU. Use u16 since
+                                        * domain ids are 16 bit wide according
+                                        * to VT-d spec, section 9.3 */
 
        struct list_head devices;       /* all devices' list */
        struct iova_domain iovad;       /* iova's that belong to this domain */
@@ -395,7 +406,6 @@ struct dmar_domain {
        int             iommu_superpage;/* Level of superpages supported:
                                           0 == 4KiB (no superpages), 1 == 2MiB,
                                           2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
-       spinlock_t      iommu_lock;     /* protect iommu set in domain */
        u64             max_addr;       /* maximum mapped address */
 
        struct iommu_domain domain;     /* generic domain data structure for
@@ -465,10 +475,11 @@ static long list_size;
 
 static void domain_exit(struct dmar_domain *domain);
 static void domain_remove_dev_info(struct dmar_domain *domain);
-static void domain_remove_one_dev_info(struct dmar_domain *domain,
-                                      struct device *dev);
-static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
-                                          struct device *dev);
+static void dmar_remove_one_dev_info(struct dmar_domain *domain,
+                                    struct device *dev);
+static void __dmar_remove_one_dev_info(struct device_domain_info *info);
+static void domain_context_clear(struct intel_iommu *iommu,
+                                struct device *dev);
 static int domain_detach_iommu(struct dmar_domain *domain,
                               struct intel_iommu *iommu);
 
@@ -568,6 +579,36 @@ __setup("intel_iommu=", intel_iommu_setup);
 static struct kmem_cache *iommu_domain_cache;
 static struct kmem_cache *iommu_devinfo_cache;
 
+static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
+{
+       struct dmar_domain **domains;
+       int idx = did >> 8;
+
+       domains = iommu->domains[idx];
+       if (!domains)
+               return NULL;
+
+       return domains[did & 0xff];
+}
+
+static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
+                            struct dmar_domain *domain)
+{
+       struct dmar_domain **domains;
+       int idx = did >> 8;
+
+       if (!iommu->domains[idx]) {
+               size_t size = 256 * sizeof(struct dmar_domain *);
+               iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
+       }
+
+       domains = iommu->domains[idx];
+       if (WARN_ON(!domains))
+               return;
+       else
+               domains[did & 0xff] = domain;
+}
+
 static inline void *alloc_pgtable_page(int node)
 {
        struct page *page;
@@ -609,6 +650,11 @@ static inline int domain_type_is_vm(struct dmar_domain *domain)
        return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
 }
 
+static inline int domain_type_is_si(struct dmar_domain *domain)
+{
+       return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
+}
+
 static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
 {
        return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
@@ -663,7 +709,9 @@ static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
 
        /* si_domain and vm domain should not get here. */
        BUG_ON(domain_type_is_vm_or_si(domain));
-       iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
+       for_each_domain_iommu(iommu_id, domain)
+               break;
+
        if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
                return NULL;
 
@@ -679,7 +727,7 @@ static void domain_update_iommu_coherency(struct dmar_domain *domain)
 
        domain->iommu_coherency = 1;
 
-       for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
+       for_each_domain_iommu(i, domain) {
                found = true;
                if (!ecap_coherent(g_iommus[i]->ecap)) {
                        domain->iommu_coherency = 0;
@@ -759,6 +807,7 @@ static inline struct context_entry *iommu_context_addr(struct intel_iommu *iommu
        struct context_entry *context;
        u64 *entry;
 
+       entry = &root->lo;
        if (ecs_enabled(iommu)) {
                if (devfn >= 0x80) {
                        devfn -= 0x80;
@@ -766,7 +815,6 @@ static inline struct context_entry *iommu_context_addr(struct intel_iommu *iommu
                }
                devfn *= 2;
        }
-       entry = &root->lo;
        if (*entry & 1)
                context = phys_to_virt(*entry & VTD_PAGE_MASK);
        else {
@@ -1166,9 +1214,9 @@ next:
 /* We can't just free the pages because the IOMMU may still be walking
    the page tables, and may have cached the intermediate levels. The
    pages can only be freed after the IOTLB flush has been done. */
-struct page *domain_unmap(struct dmar_domain *domain,
-                         unsigned long start_pfn,
-                         unsigned long last_pfn)
+static struct page *domain_unmap(struct dmar_domain *domain,
+                                unsigned long start_pfn,
+                                unsigned long last_pfn)
 {
        struct page *freelist = NULL;
 
@@ -1192,7 +1240,7 @@ struct page *domain_unmap(struct dmar_domain *domain,
        return freelist;
 }
 
-void dma_free_pagelist(struct page *freelist)
+static void dma_free_pagelist(struct page *freelist)
 {
        struct page *pg;
 
@@ -1360,24 +1408,23 @@ iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
                         u8 bus, u8 devfn)
 {
        bool found = false;
-       unsigned long flags;
        struct device_domain_info *info;
        struct pci_dev *pdev;
 
+       assert_spin_locked(&device_domain_lock);
+
        if (!ecap_dev_iotlb_support(iommu->ecap))
                return NULL;
 
        if (!iommu->qi)
                return NULL;
 
-       spin_lock_irqsave(&device_domain_lock, flags);
        list_for_each_entry(info, &domain->devices, link)
                if (info->iommu == iommu && info->bus == bus &&
                    info->devfn == devfn) {
                        found = true;
                        break;
                }
-       spin_unlock_irqrestore(&device_domain_lock, flags);
 
        if (!found || !info->dev || !dev_is_pci(info->dev))
                return NULL;
@@ -1436,11 +1483,14 @@ static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
        spin_unlock_irqrestore(&device_domain_lock, flags);
 }
 
-static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
-                                 unsigned long pfn, unsigned int pages, int ih, int map)
+static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
+                                 struct dmar_domain *domain,
+                                 unsigned long pfn, unsigned int pages,
+                                 int ih, int map)
 {
        unsigned int mask = ilog2(__roundup_pow_of_two(pages));
        uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
+       u16 did = domain->iommu_did[iommu->seq_id];
 
        BUG_ON(pages == 0);
 
@@ -1464,7 +1514,8 @@ static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
         * flush. However, device IOTLB doesn't need to be flushed in this case.
         */
        if (!cap_caching_mode(iommu->cap) || !map)
-               iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
+               iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
+                                     addr, mask);
 }
 
 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
@@ -1519,65 +1570,80 @@ static void iommu_disable_translation(struct intel_iommu *iommu)
 
 static int iommu_init_domains(struct intel_iommu *iommu)
 {
-       unsigned long ndomains;
-       unsigned long nlongs;
+       u32 ndomains, nlongs;
+       size_t size;
 
        ndomains = cap_ndoms(iommu->cap);
-       pr_debug("%s: Number of Domains supported <%ld>\n",
+       pr_debug("%s: Number of Domains supported <%d>\n",
                 iommu->name, ndomains);
        nlongs = BITS_TO_LONGS(ndomains);
 
        spin_lock_init(&iommu->lock);
 
-       /* TBD: there might be 64K domains,
-        * consider other allocation for future chip
-        */
        iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
        if (!iommu->domain_ids) {
                pr_err("%s: Allocating domain id array failed\n",
                       iommu->name);
                return -ENOMEM;
        }
-       iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
-                       GFP_KERNEL);
-       if (!iommu->domains) {
+
+       size = ((ndomains >> 8) + 1) * sizeof(struct dmar_domain **);
+       iommu->domains = kzalloc(size, GFP_KERNEL);
+
+       if (iommu->domains) {
+               size = 256 * sizeof(struct dmar_domain *);
+               iommu->domains[0] = kzalloc(size, GFP_KERNEL);
+       }
+
+       if (!iommu->domains || !iommu->domains[0]) {
                pr_err("%s: Allocating domain array failed\n",
                       iommu->name);
                kfree(iommu->domain_ids);
+               kfree(iommu->domains);
                iommu->domain_ids = NULL;
+               iommu->domains    = NULL;
                return -ENOMEM;
        }
 
+
+
        /*
-        * if Caching mode is set, then invalid translations are tagged
-        * with domainid 0. Hence we need to pre-allocate it.
+        * If Caching mode is set, then invalid translations are tagged
+        * with domain-id 0, hence we need to pre-allocate it. We also
+        * use domain-id 0 as a marker for non-allocated domain-id, so
+        * make sure it is not used for a real domain.
         */
-       if (cap_caching_mode(iommu->cap))
-               set_bit(0, iommu->domain_ids);
+       set_bit(0, iommu->domain_ids);
+
        return 0;
 }
 
 static void disable_dmar_iommu(struct intel_iommu *iommu)
 {
-       struct dmar_domain *domain;
-       int i;
+       struct device_domain_info *info, *tmp;
+       unsigned long flags;
 
-       if ((iommu->domains) && (iommu->domain_ids)) {
-               for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
-                       /*
-                        * Domain id 0 is reserved for invalid translation
-                        * if hardware supports caching mode.
-                        */
-                       if (cap_caching_mode(iommu->cap) && i == 0)
-                               continue;
+       if (!iommu->domains || !iommu->domain_ids)
+               return;
 
-                       domain = iommu->domains[i];
-                       clear_bit(i, iommu->domain_ids);
-                       if (domain_detach_iommu(domain, iommu) == 0 &&
-                           !domain_type_is_vm(domain))
-                               domain_exit(domain);
-               }
+       spin_lock_irqsave(&device_domain_lock, flags);
+       list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
+               struct dmar_domain *domain;
+
+               if (info->iommu != iommu)
+                       continue;
+
+               if (!info->dev || !info->domain)
+                       continue;
+
+               domain = info->domain;
+
+               dmar_remove_one_dev_info(domain, info->dev);
+
+               if (!domain_type_is_vm_or_si(domain))
+                       domain_exit(domain);
        }
+       spin_unlock_irqrestore(&device_domain_lock, flags);
 
        if (iommu->gcmd & DMA_GCMD_TE)
                iommu_disable_translation(iommu);
@@ -1586,6 +1652,11 @@ static void disable_dmar_iommu(struct intel_iommu *iommu)
 static void free_dmar_iommu(struct intel_iommu *iommu)
 {
        if ((iommu->domains) && (iommu->domain_ids)) {
+               int elems = (cap_ndoms(iommu->cap) >> 8) + 1;
+               int i;
+
+               for (i = 0; i < elems; i++)
+                       kfree(iommu->domains[i]);
                kfree(iommu->domains);
                kfree(iommu->domain_ids);
                iommu->domains = NULL;
@@ -1600,8 +1671,6 @@ static void free_dmar_iommu(struct intel_iommu *iommu)
 
 static struct dmar_domain *alloc_domain(int flags)
 {
-       /* domain id for virtual machine, it won't be set in context */
-       static atomic_t vm_domid = ATOMIC_INIT(0);
        struct dmar_domain *domain;
 
        domain = alloc_domain_mem();
@@ -1611,111 +1680,64 @@ static struct dmar_domain *alloc_domain(int flags)
        memset(domain, 0, sizeof(*domain));
        domain->nid = -1;
        domain->flags = flags;
-       spin_lock_init(&domain->iommu_lock);
        INIT_LIST_HEAD(&domain->devices);
-       if (flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
-               domain->id = atomic_inc_return(&vm_domid);
 
        return domain;
 }
 
-static int __iommu_attach_domain(struct dmar_domain *domain,
-                                struct intel_iommu *iommu)
-{
-       int num;
-       unsigned long ndomains;
-
-       ndomains = cap_ndoms(iommu->cap);
-       num = find_first_zero_bit(iommu->domain_ids, ndomains);
-       if (num < ndomains) {
-               set_bit(num, iommu->domain_ids);
-               iommu->domains[num] = domain;
-       } else {
-               num = -ENOSPC;
-       }
-
-       return num;
-}
-
-static int iommu_attach_domain(struct dmar_domain *domain,
+/* Must be called with iommu->lock */
+static int domain_attach_iommu(struct dmar_domain *domain,
                               struct intel_iommu *iommu)
 {
-       int num;
-       unsigned long flags;
-
-       spin_lock_irqsave(&iommu->lock, flags);
-       num = __iommu_attach_domain(domain, iommu);
-       spin_unlock_irqrestore(&iommu->lock, flags);
-       if (num < 0)
-               pr_err("%s: No free domain ids\n", iommu->name);
-
-       return num;
-}
-
-static int iommu_attach_vm_domain(struct dmar_domain *domain,
-                                 struct intel_iommu *iommu)
-{
-       int num;
        unsigned long ndomains;
+       int num;
 
-       ndomains = cap_ndoms(iommu->cap);
-       for_each_set_bit(num, iommu->domain_ids, ndomains)
-               if (iommu->domains[num] == domain)
-                       return num;
-
-       return __iommu_attach_domain(domain, iommu);
-}
-
-static void iommu_detach_domain(struct dmar_domain *domain,
-                               struct intel_iommu *iommu)
-{
-       unsigned long flags;
-       int num, ndomains;
+       assert_spin_locked(&device_domain_lock);
+       assert_spin_locked(&iommu->lock);
 
-       spin_lock_irqsave(&iommu->lock, flags);
-       if (domain_type_is_vm_or_si(domain)) {
+       domain->iommu_refcnt[iommu->seq_id] += 1;
+       domain->iommu_count += 1;
+       if (domain->iommu_refcnt[iommu->seq_id] == 1) {
                ndomains = cap_ndoms(iommu->cap);
-               for_each_set_bit(num, iommu->domain_ids, ndomains) {
-                       if (iommu->domains[num] == domain) {
-                               clear_bit(num, iommu->domain_ids);
-                               iommu->domains[num] = NULL;
-                               break;
-                       }
+               num      = find_first_zero_bit(iommu->domain_ids, ndomains);
+
+               if (num >= ndomains) {
+                       pr_err("%s: No free domain ids\n", iommu->name);
+                       domain->iommu_refcnt[iommu->seq_id] -= 1;
+                       domain->iommu_count -= 1;
+                       return -ENOSPC;
                }
-       } else {
-               clear_bit(domain->id, iommu->domain_ids);
-               iommu->domains[domain->id] = NULL;
-       }
-       spin_unlock_irqrestore(&iommu->lock, flags);
-}
 
-static void domain_attach_iommu(struct dmar_domain *domain,
-                              struct intel_iommu *iommu)
-{
-       unsigned long flags;
+               set_bit(num, iommu->domain_ids);
+               set_iommu_domain(iommu, num, domain);
+
+               domain->iommu_did[iommu->seq_id] = num;
+               domain->nid                      = iommu->node;
 
-       spin_lock_irqsave(&domain->iommu_lock, flags);
-       if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
-               domain->iommu_count++;
-               if (domain->iommu_count == 1)
-                       domain->nid = iommu->node;
                domain_update_iommu_cap(domain);
        }
-       spin_unlock_irqrestore(&domain->iommu_lock, flags);
+
+       return 0;
 }
 
 static int domain_detach_iommu(struct dmar_domain *domain,
                               struct intel_iommu *iommu)
 {
-       unsigned long flags;
-       int count = INT_MAX;
+       int num, count = INT_MAX;
+
+       assert_spin_locked(&device_domain_lock);
+       assert_spin_locked(&iommu->lock);
+
+       domain->iommu_refcnt[iommu->seq_id] -= 1;
+       count = --domain->iommu_count;
+       if (domain->iommu_refcnt[iommu->seq_id] == 0) {
+               num = domain->iommu_did[iommu->seq_id];
+               clear_bit(num, iommu->domain_ids);
+               set_iommu_domain(iommu, num, NULL);
 
-       spin_lock_irqsave(&domain->iommu_lock, flags);
-       if (test_and_clear_bit(iommu->seq_id, domain->iommu_bmp)) {
-               count = --domain->iommu_count;
                domain_update_iommu_cap(domain);
+               domain->iommu_did[iommu->seq_id] = 0;
        }
-       spin_unlock_irqrestore(&domain->iommu_lock, flags);
 
        return count;
 }
@@ -1782,9 +1804,9 @@ static inline int guestwidth_to_adjustwidth(int gaw)
        return agaw;
 }
 
-static int domain_init(struct dmar_domain *domain, int guest_width)
+static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
+                      int guest_width)
 {
-       struct intel_iommu *iommu;
        int adjust_width, agaw;
        unsigned long sagaw;
 
@@ -1793,7 +1815,6 @@ static int domain_init(struct dmar_domain *domain, int guest_width)
        domain_reserve_special_ranges(domain);
 
        /* calculate AGAW */
-       iommu = domain_get_iommu(domain);
        if (guest_width > cap_mgaw(iommu->cap))
                guest_width = cap_mgaw(iommu->cap);
        domain->gaw = guest_width;
@@ -1836,8 +1857,6 @@ static int domain_init(struct dmar_domain *domain, int guest_width)
 
 static void domain_exit(struct dmar_domain *domain)
 {
-       struct dmar_drhd_unit *drhd;
-       struct intel_iommu *iommu;
        struct page *freelist = NULL;
 
        /* Domain 0 is reserved, so dont process it */
@@ -1848,22 +1867,16 @@ static void domain_exit(struct dmar_domain *domain)
        if (!intel_iommu_strict)
                flush_unmaps_timeout(0);
 
-       /* remove associated devices */
+       /* Remove associated devices and clear attached or cached domains */
+       rcu_read_lock();
        domain_remove_dev_info(domain);
+       rcu_read_unlock();
 
        /* destroy iovas */
        put_iova_domain(&domain->iovad);
 
        freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
 
-       /* clear attached or cached domains */
-       rcu_read_lock();
-       for_each_active_iommu(iommu, drhd)
-               if (domain_type_is_vm(domain) ||
-                   test_bit(iommu->seq_id, domain->iommu_bmp))
-                       iommu_detach_domain(domain, iommu);
-       rcu_read_unlock();
-
        dma_free_pagelist(freelist);
 
        free_domain_mem(domain);
@@ -1871,79 +1884,68 @@ static void domain_exit(struct dmar_domain *domain)
 
 static int domain_context_mapping_one(struct dmar_domain *domain,
                                      struct intel_iommu *iommu,
-                                     u8 bus, u8 devfn, int translation)
+                                     u8 bus, u8 devfn)
 {
+       u16 did = domain->iommu_did[iommu->seq_id];
+       int translation = CONTEXT_TT_MULTI_LEVEL;
+       struct device_domain_info *info = NULL;
        struct context_entry *context;
        unsigned long flags;
        struct dma_pte *pgd;
-       int id;
-       int agaw;
-       struct device_domain_info *info = NULL;
+       int ret, agaw;
+
+       WARN_ON(did == 0);
+
+       if (hw_pass_through && domain_type_is_si(domain))
+               translation = CONTEXT_TT_PASS_THROUGH;
 
        pr_debug("Set context mapping for %02x:%02x.%d\n",
                bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
 
        BUG_ON(!domain->pgd);
-       BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
-              translation != CONTEXT_TT_MULTI_LEVEL);
 
-       spin_lock_irqsave(&iommu->lock, flags);
+       spin_lock_irqsave(&device_domain_lock, flags);
+       spin_lock(&iommu->lock);
+
+       ret = -ENOMEM;
        context = iommu_context_addr(iommu, bus, devfn, 1);
-       spin_unlock_irqrestore(&iommu->lock, flags);
        if (!context)
-               return -ENOMEM;
-       spin_lock_irqsave(&iommu->lock, flags);
-       if (context_present(context)) {
-               spin_unlock_irqrestore(&iommu->lock, flags);
-               return 0;
-       }
+               goto out_unlock;
 
-       context_clear_entry(context);
+       ret = 0;
+       if (context_present(context))
+               goto out_unlock;
 
-       id = domain->id;
        pgd = domain->pgd;
 
-       if (domain_type_is_vm_or_si(domain)) {
-               if (domain_type_is_vm(domain)) {
-                       id = iommu_attach_vm_domain(domain, iommu);
-                       if (id < 0) {
-                               spin_unlock_irqrestore(&iommu->lock, flags);
-                               pr_err("%s: No free domain ids\n", iommu->name);
-                               return -EFAULT;
-                       }
-               }
+       context_clear_entry(context);
+       context_set_domain_id(context, did);
 
-               /* Skip top levels of page tables for
-                * iommu which has less agaw than default.
-                * Unnecessary for PT mode.
-                */
-               if (translation != CONTEXT_TT_PASS_THROUGH) {
-                       for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
-                               pgd = phys_to_virt(dma_pte_addr(pgd));
-                               if (!dma_pte_present(pgd)) {
-                                       spin_unlock_irqrestore(&iommu->lock, flags);
-                                       return -ENOMEM;
-                               }
-                       }
+       /*
+        * Skip top levels of page tables for iommu which has less agaw
+        * than default.  Unnecessary for PT mode.
+        */
+       if (translation != CONTEXT_TT_PASS_THROUGH) {
+               for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
+                       ret = -ENOMEM;
+                       pgd = phys_to_virt(dma_pte_addr(pgd));
+                       if (!dma_pte_present(pgd))
+                               goto out_unlock;
                }
-       }
-
-       context_set_domain_id(context, id);
 
-       if (translation != CONTEXT_TT_PASS_THROUGH) {
                info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
                translation = info ? CONTEXT_TT_DEV_IOTLB :
                                     CONTEXT_TT_MULTI_LEVEL;
-       }
-       /*
-        * In pass through mode, AW must be programmed to indicate the largest
-        * AGAW value supported by hardware. And ASR is ignored by hardware.
-        */
-       if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
-               context_set_address_width(context, iommu->msagaw);
-       else {
+
                context_set_address_root(context, virt_to_phys(pgd));
                context_set_address_width(context, iommu->agaw);
+       } else {
+               /*
+                * In pass through mode, AW must be programmed to
+                * indicate the largest AGAW value supported by
+                * hardware. And ASR is ignored by hardware.
+                */
+               context_set_address_width(context, iommu->msagaw);
        }
 
        context_set_translation_type(context, translation);
@@ -1962,14 +1964,17 @@ static int domain_context_mapping_one(struct dmar_domain *domain,
                                           (((u16)bus) << 8) | devfn,
                                           DMA_CCMD_MASK_NOBIT,
                                           DMA_CCMD_DEVICE_INVL);
-               iommu->flush.flush_iotlb(iommu, id, 0, 0, DMA_TLB_DSI_FLUSH);
+               iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
        } else {
                iommu_flush_write_buffer(iommu);
        }
        iommu_enable_dev_iotlb(info);
-       spin_unlock_irqrestore(&iommu->lock, flags);
 
-       domain_attach_iommu(domain, iommu);
+       ret = 0;
+
+out_unlock:
+       spin_unlock(&iommu->lock);
+       spin_unlock_irqrestore(&device_domain_lock, flags);
 
        return 0;
 }
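
The rewritten domain_context_mapping_one() also changes its locking: device_domain_lock is taken first with interrupts disabled, then the per-unit iommu->lock, and every failure path leaves through a single out_unlock label so both locks are always released in reverse order. A minimal sketch of that pattern, with hypothetical lock, type and helper names:

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(outer_lock);		/* plays the role of device_domain_lock */

struct unit {
	spinlock_t lock;			/* plays the role of iommu->lock */
};

static int setup_entry(struct unit *u, bool have_entry)
{
	unsigned long flags;
	int ret;

	spin_lock_irqsave(&outer_lock, flags);
	spin_lock(&u->lock);

	ret = -ENOMEM;
	if (!have_entry)			/* stands in for iommu_context_addr() failing */
		goto out_unlock;

	/* ... program the entry while both locks are held ... */
	ret = 0;

out_unlock:
	spin_unlock(&u->lock);
	spin_unlock_irqrestore(&outer_lock, flags);
	return ret;
}
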
@@ -1977,7 +1982,6 @@ static int domain_context_mapping_one(struct dmar_domain *domain,
 struct domain_context_mapping_data {
        struct dmar_domain *domain;
        struct intel_iommu *iommu;
-       int translation;
 };
 
 static int domain_context_mapping_cb(struct pci_dev *pdev,
@@ -1986,13 +1990,11 @@ static int domain_context_mapping_cb(struct pci_dev *pdev,
        struct domain_context_mapping_data *data = opaque;
 
        return domain_context_mapping_one(data->domain, data->iommu,
-                                         PCI_BUS_NUM(alias), alias & 0xff,
-                                         data->translation);
+                                         PCI_BUS_NUM(alias), alias & 0xff);
 }
 
 static int
-domain_context_mapping(struct dmar_domain *domain, struct device *dev,
-                      int translation)
+domain_context_mapping(struct dmar_domain *domain, struct device *dev)
 {
        struct intel_iommu *iommu;
        u8 bus, devfn;
@@ -2003,12 +2005,10 @@ domain_context_mapping(struct dmar_domain *domain, struct device *dev,
                return -ENODEV;
 
        if (!dev_is_pci(dev))
-               return domain_context_mapping_one(domain, iommu, bus, devfn,
-                                                 translation);
+               return domain_context_mapping_one(domain, iommu, bus, devfn);
 
        data.domain = domain;
        data.iommu = iommu;
-       data.translation = translation;
 
        return pci_for_each_dma_alias(to_pci_dev(dev),
                                      &domain_context_mapping_cb, &data);
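
With the translation argument gone, domain_context_mapping() simply hands a small struct to pci_for_each_dma_alias() as the opaque cookie and lets the callback run once per DMA alias. A hypothetical stand-alone use of the same helper, purely for illustration (the counter and names are made up):

#include <linux/pci.h>

static int count_alias(struct pci_dev *pdev, u16 alias, void *opaque)
{
	int *n = opaque;

	(*n)++;
	dev_info(&pdev->dev, "DMA alias %02x:%02x.%d\n", PCI_BUS_NUM(alias),
		 PCI_SLOT(alias & 0xff), PCI_FUNC(alias & 0xff));
	return 0;				/* non-zero would abort the walk */
}

static int count_dma_aliases(struct pci_dev *pdev)
{
	int n = 0;

	pci_for_each_dma_alias(pdev, count_alias, &n);
	return n;
}
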
@@ -2194,7 +2194,7 @@ static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long i
        return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
 }
 
-static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
+static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
 {
        if (!iommu)
                return;
@@ -2220,21 +2220,8 @@ static void domain_remove_dev_info(struct dmar_domain *domain)
        unsigned long flags;
 
        spin_lock_irqsave(&device_domain_lock, flags);
-       list_for_each_entry_safe(info, tmp, &domain->devices, link) {
-               unlink_domain_info(info);
-               spin_unlock_irqrestore(&device_domain_lock, flags);
-
-               iommu_disable_dev_iotlb(info);
-               iommu_detach_dev(info->iommu, info->bus, info->devfn);
-
-               if (domain_type_is_vm(domain)) {
-                       iommu_detach_dependent_devices(info->iommu, info->dev);
-                       domain_detach_iommu(domain, info->iommu);
-               }
-
-               free_devinfo_mem(info);
-               spin_lock_irqsave(&device_domain_lock, flags);
-       }
+       list_for_each_entry_safe(info, tmp, &domain->devices, link)
+               __dmar_remove_one_dev_info(info);
        spin_unlock_irqrestore(&device_domain_lock, flags);
 }
 
@@ -2266,14 +2253,15 @@ dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
        return NULL;
 }
 
-static struct dmar_domain *dmar_insert_dev_info(struct intel_iommu *iommu,
-                                               int bus, int devfn,
-                                               struct device *dev,
-                                               struct dmar_domain *domain)
+static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
+                                                   int bus, int devfn,
+                                                   struct device *dev,
+                                                   struct dmar_domain *domain)
 {
        struct dmar_domain *found = NULL;
        struct device_domain_info *info;
        unsigned long flags;
+       int ret;
 
        info = alloc_devinfo_mem();
        if (!info)
@@ -2290,12 +2278,16 @@ static struct dmar_domain *dmar_insert_dev_info(struct intel_iommu *iommu,
        spin_lock_irqsave(&device_domain_lock, flags);
        if (dev)
                found = find_domain(dev);
-       else {
+
+       if (!found) {
                struct device_domain_info *info2;
                info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
-               if (info2)
-                       found = info2->domain;
+               if (info2) {
+                       found      = info2->domain;
+                       info2->dev = dev;
+               }
        }
+
        if (found) {
                spin_unlock_irqrestore(&device_domain_lock, flags);
                free_devinfo_mem(info);
@@ -2303,12 +2295,27 @@ static struct dmar_domain *dmar_insert_dev_info(struct intel_iommu *iommu,
                return found;
        }
 
+       spin_lock(&iommu->lock);
+       ret = domain_attach_iommu(domain, iommu);
+       spin_unlock(&iommu->lock);
+
+       if (ret) {
+               spin_unlock_irqrestore(&device_domain_lock, flags);
+               return NULL;
+       }
+
        list_add(&info->link, &domain->devices);
        list_add(&info->global, &device_domain_list);
        if (dev)
                dev->archdata.iommu = info;
        spin_unlock_irqrestore(&device_domain_lock, flags);
 
+       if (dev && domain_context_mapping(domain, dev)) {
+               pr_err("Domain context map for %s failed\n", dev_name(dev));
+               dmar_remove_one_dev_info(domain, dev);
+               return NULL;
+       }
+
        return domain;
 }
 
@@ -2321,10 +2328,10 @@ static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
 /* domain is initialized */
 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
 {
+       struct device_domain_info *info = NULL;
        struct dmar_domain *domain, *tmp;
        struct intel_iommu *iommu;
-       struct device_domain_info *info;
-       u16 dma_alias;
+       u16 req_id, dma_alias;
        unsigned long flags;
        u8 bus, devfn;
 
@@ -2336,6 +2343,8 @@ static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
        if (!iommu)
                return NULL;
 
+       req_id = ((u16)bus << 8) | devfn;
+
        if (dev_is_pci(dev)) {
                struct pci_dev *pdev = to_pci_dev(dev);
 
@@ -2360,21 +2369,15 @@ static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
        domain = alloc_domain(0);
        if (!domain)
                return NULL;
-       domain->id = iommu_attach_domain(domain, iommu);
-       if (domain->id < 0) {
-               free_domain_mem(domain);
-               return NULL;
-       }
-       domain_attach_iommu(domain, iommu);
-       if (domain_init(domain, gaw)) {
+       if (domain_init(domain, iommu, gaw)) {
                domain_exit(domain);
                return NULL;
        }
 
        /* register PCI DMA alias device */
-       if (dev_is_pci(dev)) {
-               tmp = dmar_insert_dev_info(iommu, PCI_BUS_NUM(dma_alias),
-                                          dma_alias & 0xff, NULL, domain);
+       if (req_id != dma_alias && dev_is_pci(dev)) {
+               tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
+                                              dma_alias & 0xff, NULL, domain);
 
                if (!tmp || tmp != domain) {
                        domain_exit(domain);
@@ -2386,7 +2389,7 @@ static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
        }
 
 found_domain:
-       tmp = dmar_insert_dev_info(iommu, bus, devfn, dev, domain);
+       tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
 
        if (!tmp || tmp != domain) {
                domain_exit(domain);
@@ -2414,8 +2417,7 @@ static int iommu_domain_identity_map(struct dmar_domain *domain,
                return -ENOMEM;
        }
 
-       pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
-                start, end, domain->id);
+       pr_debug("Mapping reserved region %llx-%llx\n", start, end);
        /*
         * RMRR range might have overlap with physical memory range,
         * clear it first
@@ -2476,11 +2478,6 @@ static int iommu_prepare_identity_map(struct device *dev,
        if (ret)
                goto error;
 
-       /* context entry init */
-       ret = domain_context_mapping(domain, dev, CONTEXT_TT_MULTI_LEVEL);
-       if (ret)
-               goto error;
-
        return 0;
 
  error:
@@ -2526,37 +2523,18 @@ static int md_domain_init(struct dmar_domain *domain, int guest_width);
 
 static int __init si_domain_init(int hw)
 {
-       struct dmar_drhd_unit *drhd;
-       struct intel_iommu *iommu;
        int nid, ret = 0;
-       bool first = true;
 
        si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
        if (!si_domain)
                return -EFAULT;
 
-       for_each_active_iommu(iommu, drhd) {
-               ret = iommu_attach_domain(si_domain, iommu);
-               if (ret < 0) {
-                       domain_exit(si_domain);
-                       return -EFAULT;
-               } else if (first) {
-                       si_domain->id = ret;
-                       first = false;
-               } else if (si_domain->id != ret) {
-                       domain_exit(si_domain);
-                       return -EFAULT;
-               }
-               domain_attach_iommu(si_domain, iommu);
-       }
-
        if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
                domain_exit(si_domain);
                return -EFAULT;
        }
 
-       pr_debug("Identity mapping domain is domain %d\n",
-                si_domain->id);
+       pr_debug("Identity mapping domain allocated\n");
 
        if (hw)
                return 0;
@@ -2590,28 +2568,20 @@ static int identity_mapping(struct device *dev)
        return 0;
 }
 
-static int domain_add_dev_info(struct dmar_domain *domain,
-                              struct device *dev, int translation)
+static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
 {
        struct dmar_domain *ndomain;
        struct intel_iommu *iommu;
        u8 bus, devfn;
-       int ret;
 
        iommu = device_to_iommu(dev, &bus, &devfn);
        if (!iommu)
                return -ENODEV;
 
-       ndomain = dmar_insert_dev_info(iommu, bus, devfn, dev, domain);
+       ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
        if (ndomain != domain)
                return -EBUSY;
 
-       ret = domain_context_mapping(domain, dev, translation);
-       if (ret) {
-               domain_remove_one_dev_info(domain, dev);
-               return ret;
-       }
-
        return 0;
 }
 
@@ -2751,9 +2721,7 @@ static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw
        if (!iommu_should_identity_map(dev, 1))
                return 0;
 
-       ret = domain_add_dev_info(si_domain, dev,
-                                 hw ? CONTEXT_TT_PASS_THROUGH :
-                                      CONTEXT_TT_MULTI_LEVEL);
+       ret = domain_add_dev_info(si_domain, dev);
        if (!ret)
                pr_info("%s identity mapping for device %s\n",
                        hw ? "Hardware" : "Software", dev_name(dev));
@@ -2839,15 +2807,18 @@ static void intel_iommu_init_qi(struct intel_iommu *iommu)
 }
 
 static int copy_context_table(struct intel_iommu *iommu,
-                             struct root_entry *old_re,
+                             struct root_entry __iomem *old_re,
                              struct context_entry **tbl,
                              int bus, bool ext)
 {
-       struct context_entry *old_ce = NULL, *new_ce = NULL, ce;
        int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
+       struct context_entry __iomem *old_ce = NULL;
+       struct context_entry *new_ce = NULL, ce;
+       struct root_entry re;
        phys_addr_t old_ce_phys;
 
        tbl_idx = ext ? bus * 2 : bus;
+       memcpy_fromio(&re, old_re, sizeof(re));
 
        for (devfn = 0; devfn < 256; devfn++) {
                /* First calculate the correct index */
@@ -2867,9 +2838,9 @@ static int copy_context_table(struct intel_iommu *iommu,
 
                        ret = 0;
                        if (devfn < 0x80)
-                               old_ce_phys = root_entry_lctp(old_re);
+                               old_ce_phys = root_entry_lctp(&re);
                        else
-                               old_ce_phys = root_entry_uctp(old_re);
+                               old_ce_phys = root_entry_uctp(&re);
 
                        if (!old_ce_phys) {
                                if (ext && devfn == 0) {
@@ -2894,7 +2865,7 @@ static int copy_context_table(struct intel_iommu *iommu,
                }
 
                /* Now copy the context entry */
-               ce = old_ce[idx];
+               memcpy_fromio(&ce, old_ce + idx, sizeof(ce));
 
                if (!__context_present(&ce))
                        continue;
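
copy_context_table() now treats the previous kernel's tables as __iomem and copies each root/context entry into a local structure with memcpy_fromio() before inspecting it, instead of dereferencing the mapping directly. A simplified, hypothetical illustration of that access pattern:

#include <linux/io.h>
#include <linux/types.h>

struct old_entry {				/* stand-in for a root/context entry */
	u64 lo;
	u64 hi;
};

static u64 read_old_entry_lo(struct old_entry __iomem *old_table, int idx)
{
	struct old_entry e;

	/* Copy the whole entry out of the ioremapped table, then use the copy. */
	memcpy_fromio(&e, old_table + idx, sizeof(e));
	return e.lo;
}
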
@@ -2938,8 +2909,8 @@ out:
 
 static int copy_translation_tables(struct intel_iommu *iommu)
 {
+       struct root_entry __iomem *old_rt;
        struct context_entry **ctxt_tbls;
-       struct root_entry *old_rt;
        phys_addr_t old_rt_phys;
        int ctxt_table_entries;
        unsigned long flags;
@@ -3269,7 +3240,6 @@ static struct iova *intel_alloc_iova(struct device *dev,
 static struct dmar_domain *__get_valid_domain_for_dev(struct device *dev)
 {
        struct dmar_domain *domain;
-       int ret;
 
        domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
        if (!domain) {
@@ -3278,16 +3248,6 @@ static struct dmar_domain *__get_valid_domain_for_dev(struct device *dev)
                return NULL;
        }
 
-       /* make sure context mapping is ok */
-       if (unlikely(!domain_context_mapped(dev))) {
-               ret = domain_context_mapping(domain, dev, CONTEXT_TT_MULTI_LEVEL);
-               if (ret) {
-                       pr_err("Domain context map for %s failed\n",
-                              dev_name(dev));
-                       return NULL;
-               }
-       }
-
        return domain;
 }
 
@@ -3323,7 +3283,7 @@ static int iommu_no_mapping(struct device *dev)
                         * 32 bit DMA is removed from si_domain and fall back
                         * to non-identity mapping.
                         */
-                       domain_remove_one_dev_info(si_domain, dev);
+                       dmar_remove_one_dev_info(si_domain, dev);
                        pr_info("32bit %s uses non-identity mapping\n",
                                dev_name(dev));
                        return 0;
@@ -3335,10 +3295,7 @@ static int iommu_no_mapping(struct device *dev)
                 */
                if (iommu_should_identity_map(dev, 0)) {
                        int ret;
-                       ret = domain_add_dev_info(si_domain, dev,
-                                                 hw_pass_through ?
-                                                 CONTEXT_TT_PASS_THROUGH :
-                                                 CONTEXT_TT_MULTI_LEVEL);
+                       ret = domain_add_dev_info(si_domain, dev);
                        if (!ret) {
                                pr_info("64bit %s uses identity mapping\n",
                                        dev_name(dev));
@@ -3399,7 +3356,9 @@ static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
 
        /* it's a non-present to present mapping. Only flush if caching mode */
        if (cap_caching_mode(iommu->cap))
-               iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 0, 1);
+               iommu_flush_iotlb_psi(iommu, domain,
+                                     mm_to_dma_pfn(iova->pfn_lo),
+                                     size, 0, 1);
        else
                iommu_flush_write_buffer(iommu);
 
@@ -3450,7 +3409,7 @@ static void flush_unmaps(void)
 
                        /* On real hardware multiple invalidations are expensive */
                        if (cap_caching_mode(iommu->cap))
-                               iommu_flush_iotlb_psi(iommu, domain->id,
+                               iommu_flush_iotlb_psi(iommu, domain,
                                        iova->pfn_lo, iova_size(iova),
                                        !deferred_flush[i].freelist[j], 0);
                        else {
@@ -3534,7 +3493,7 @@ static void intel_unmap(struct device *dev, dma_addr_t dev_addr)
        freelist = domain_unmap(domain, start_pfn, last_pfn);
 
        if (intel_iommu_strict) {
-               iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
+               iommu_flush_iotlb_psi(iommu, domain, start_pfn,
                                      last_pfn - start_pfn + 1, !freelist, 0);
                /* free iova */
                __free_iova(&domain->iovad, iova);
@@ -3692,7 +3651,7 @@ static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nele
 
        /* it's a non-present to present mapping. Only flush if caching mode */
        if (cap_caching_mode(iommu->cap))
-               iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 0, 1);
+               iommu_flush_iotlb_psi(iommu, domain, start_vpfn, size, 0, 1);
        else
                iommu_flush_write_buffer(iommu);
 
@@ -4169,13 +4128,6 @@ static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
        iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
        iommu_enable_translation(iommu);
 
-       if (si_domain) {
-               ret = iommu_attach_domain(si_domain, iommu);
-               if (ret < 0 || si_domain->id != ret)
-                       goto disable_iommu;
-               domain_attach_iommu(si_domain, iommu);
-       }
-
        iommu_disable_protect_mem_regions(iommu);
        return 0;
 
@@ -4337,11 +4289,9 @@ static int device_notifier(struct notifier_block *nb,
        if (!domain)
                return 0;
 
-       down_read(&dmar_global_lock);
-       domain_remove_one_dev_info(domain, dev);
+       dmar_remove_one_dev_info(domain, dev);
        if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
                domain_exit(domain);
-       up_read(&dmar_global_lock);
 
        return 0;
 }
@@ -4398,7 +4348,7 @@ static int intel_iommu_memory_notifier(struct notifier_block *nb,
 
                        rcu_read_lock();
                        for_each_active_iommu(iommu, drhd)
-                               iommu_flush_iotlb_psi(iommu, si_domain->id,
+                               iommu_flush_iotlb_psi(iommu, si_domain,
                                        iova->pfn_lo, iova_size(iova),
                                        !freelist, 0);
                        rcu_read_unlock();
@@ -4457,11 +4407,32 @@ static ssize_t intel_iommu_show_ecap(struct device *dev,
 }
 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
 
+static ssize_t intel_iommu_show_ndoms(struct device *dev,
+                                     struct device_attribute *attr,
+                                     char *buf)
+{
+       struct intel_iommu *iommu = dev_get_drvdata(dev);
+       return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
+}
+static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
+
+static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
+                                          struct device_attribute *attr,
+                                          char *buf)
+{
+       struct intel_iommu *iommu = dev_get_drvdata(dev);
+       return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
+                                                 cap_ndoms(iommu->cap)));
+}
+static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
+
 static struct attribute *intel_iommu_attrs[] = {
        &dev_attr_version.attr,
        &dev_attr_address.attr,
        &dev_attr_cap.attr,
        &dev_attr_ecap.attr,
+       &dev_attr_domains_supported.attr,
+       &dev_attr_domains_used.attr,
        NULL,
 };
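
The two new sysfs files follow the usual read-only device-attribute recipe: a show() callback formats one value into buf, DEVICE_ATTR() wraps it, and the attribute is listed in an array. A generic sketch of that recipe with hypothetical names (not the VT-d code):

#include <linux/device.h>
#include <linux/sysfs.h>

struct my_ctrl {				/* hypothetical driver data */
	unsigned int nr_widgets;
};

static ssize_t widgets_show(struct device *dev,
			    struct device_attribute *attr, char *buf)
{
	struct my_ctrl *ctrl = dev_get_drvdata(dev);

	return sprintf(buf, "%u\n", ctrl->nr_widgets);
}
static DEVICE_ATTR(widgets, S_IRUGO, widgets_show, NULL);

static struct attribute *my_attrs[] = {
	&dev_attr_widgets.attr,
	NULL,
};
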
 
@@ -4541,7 +4512,7 @@ int __init intel_iommu_init(void)
        for_each_active_iommu(iommu, drhd)
                iommu->iommu_dev = iommu_device_create(NULL, iommu,
                                                       intel_iommu_groups,
-                                                      iommu->name);
+                                                      "%s", iommu->name);
 
        bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
        bus_register_notifier(&pci_bus_type, &device_nb);
@@ -4561,11 +4532,11 @@ out_free_dmar:
        return ret;
 }
 
-static int iommu_detach_dev_cb(struct pci_dev *pdev, u16 alias, void *opaque)
+static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
 {
        struct intel_iommu *iommu = opaque;
 
-       iommu_detach_dev(iommu, PCI_BUS_NUM(alias), alias & 0xff);
+       domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
        return 0;
 }
 
@@ -4575,63 +4546,50 @@ static int iommu_detach_dev_cb(struct pci_dev *pdev, u16 alias, void *opaque)
  * devices, unbinding the driver from any one of them will possibly leave
  * the others unable to operate.
  */
-static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
-                                          struct device *dev)
+static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
 {
        if (!iommu || !dev || !dev_is_pci(dev))
                return;
 
-       pci_for_each_dma_alias(to_pci_dev(dev), &iommu_detach_dev_cb, iommu);
+       pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
 }
 
-static void domain_remove_one_dev_info(struct dmar_domain *domain,
-                                      struct device *dev)
+static void __dmar_remove_one_dev_info(struct device_domain_info *info)
 {
-       struct device_domain_info *info, *tmp;
        struct intel_iommu *iommu;
        unsigned long flags;
-       bool found = false;
-       u8 bus, devfn;
 
-       iommu = device_to_iommu(dev, &bus, &devfn);
-       if (!iommu)
+       assert_spin_locked(&device_domain_lock);
+
+       if (WARN_ON(!info))
                return;
 
-       spin_lock_irqsave(&device_domain_lock, flags);
-       list_for_each_entry_safe(info, tmp, &domain->devices, link) {
-               if (info->iommu == iommu && info->bus == bus &&
-                   info->devfn == devfn) {
-                       unlink_domain_info(info);
-                       spin_unlock_irqrestore(&device_domain_lock, flags);
+       iommu = info->iommu;
 
-                       iommu_disable_dev_iotlb(info);
-                       iommu_detach_dev(iommu, info->bus, info->devfn);
-                       iommu_detach_dependent_devices(iommu, dev);
-                       free_devinfo_mem(info);
+       if (info->dev) {
+               iommu_disable_dev_iotlb(info);
+               domain_context_clear(iommu, info->dev);
+       }
 
-                       spin_lock_irqsave(&device_domain_lock, flags);
+       unlink_domain_info(info);
 
-                       if (found)
-                               break;
-                       else
-                               continue;
-               }
+       spin_lock_irqsave(&iommu->lock, flags);
+       domain_detach_iommu(info->domain, iommu);
+       spin_unlock_irqrestore(&iommu->lock, flags);
 
-               /* if there is no other devices under the same iommu
-                * owned by this domain, clear this iommu in iommu_bmp
-                * update iommu count and coherency
-                */
-               if (info->iommu == iommu)
-                       found = true;
-       }
+       free_devinfo_mem(info);
+}
 
-       spin_unlock_irqrestore(&device_domain_lock, flags);
+static void dmar_remove_one_dev_info(struct dmar_domain *domain,
+                                    struct device *dev)
+{
+       struct device_domain_info *info;
+       unsigned long flags;
 
-       if (found == 0) {
-               domain_detach_iommu(domain, iommu);
-               if (!domain_type_is_vm_or_si(domain))
-                       iommu_detach_domain(domain, iommu);
-       }
+       spin_lock_irqsave(&device_domain_lock, flags);
+       info = dev->archdata.iommu;
+       __dmar_remove_one_dev_info(info);
+       spin_unlock_irqrestore(&device_domain_lock, flags);
 }
 
 static int md_domain_init(struct dmar_domain *domain, int guest_width)
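
__dmar_remove_one_dev_info() and dmar_remove_one_dev_info() above follow the common kernel convention of a double-underscore helper that expects the lock to be held (and asserts it) plus a thin wrapper that takes the lock around a single call. A generic sketch of that split, with hypothetical names:

#include <linux/list.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(registry_lock);

static void __registry_remove(struct list_head *entry)
{
	assert_spin_locked(&registry_lock);	/* caller must already hold the lock */
	list_del(entry);
}

static void registry_remove(struct list_head *entry)
{
	unsigned long flags;

	spin_lock_irqsave(&registry_lock, flags);
	__registry_remove(entry);
	spin_unlock_irqrestore(&registry_lock, flags);
}
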
@@ -4712,10 +4670,9 @@ static int intel_iommu_attach_device(struct iommu_domain *domain,
 
                old_domain = find_domain(dev);
                if (old_domain) {
-                       if (domain_type_is_vm_or_si(dmar_domain))
-                               domain_remove_one_dev_info(old_domain, dev);
-                       else
-                               domain_remove_dev_info(old_domain);
+                       rcu_read_lock();
+                       dmar_remove_one_dev_info(old_domain, dev);
+                       rcu_read_unlock();
 
                        if (!domain_type_is_vm_or_si(old_domain) &&
                             list_empty(&old_domain->devices))
@@ -4755,13 +4712,13 @@ static int intel_iommu_attach_device(struct iommu_domain *domain,
                dmar_domain->agaw--;
        }
 
-       return domain_add_dev_info(dmar_domain, dev, CONTEXT_TT_MULTI_LEVEL);
+       return domain_add_dev_info(dmar_domain, dev);
 }
 
 static void intel_iommu_detach_device(struct iommu_domain *domain,
                                      struct device *dev)
 {
-       domain_remove_one_dev_info(to_dmar_domain(domain), dev);
+       dmar_remove_one_dev_info(to_dmar_domain(domain), dev);
 }
 
 static int intel_iommu_map(struct iommu_domain *domain,
@@ -4810,12 +4767,11 @@ static size_t intel_iommu_unmap(struct iommu_domain *domain,
        struct intel_iommu *iommu;
        unsigned long start_pfn, last_pfn;
        unsigned int npages;
-       int iommu_id, num, ndomains, level = 0;
+       int iommu_id, level = 0;
 
        /* Cope with horrid API which requires us to unmap more than the
           size argument if it happens to be a large-page mapping. */
-       if (!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level))
-               BUG();
+       BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
 
        if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
                size = VTD_PAGE_SIZE << level_to_offset_bits(level);
@@ -4827,19 +4783,11 @@ static size_t intel_iommu_unmap(struct iommu_domain *domain,
 
        npages = last_pfn - start_pfn + 1;
 
-       for_each_set_bit(iommu_id, dmar_domain->iommu_bmp, g_num_of_iommus) {
-               iommu = g_iommus[iommu_id];
-
-               /*
-                * find bit position of dmar_domain
-                */
-               ndomains = cap_ndoms(iommu->cap);
-               for_each_set_bit(num, iommu->domain_ids, ndomains) {
-                       if (iommu->domains[num] == dmar_domain)
-                               iommu_flush_iotlb_psi(iommu, num, start_pfn,
-                                                    npages, !freelist, 0);
-              }
+       for_each_domain_iommu(iommu_id, dmar_domain) {
+               iommu = g_iommus[iommu_id];
 
+               iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
+                                     start_pfn, npages, !freelist, 0);
        }
 
        dma_free_pagelist(freelist);
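
The flush loop above relies on a for_each_domain_iommu() iterator defined earlier in the patch (not shown here); conceptually it visits only the units that hold a reference on the domain, rather than rescanning every unit's domain-ID space. A purely illustrative version of such an iterator, building on the refcount sketch earlier (not the actual macro):

#include <linux/printk.h>

#define NR_UNITS 8				/* hypothetical number of IOMMU units */

struct toy_dom {
	int iommu_refcnt[NR_UNITS];		/* as in the attach/detach sketch above */
};

#define for_each_attached_unit(idx, dom)			\
	for ((idx) = 0; (idx) < NR_UNITS; (idx)++)		\
		if ((dom)->iommu_refcnt[idx])

static void flush_attached_units(struct toy_dom *dom)
{
	int idx;

	for_each_attached_unit(idx, dom)
		pr_info("flush unit %d for this domain\n", idx);
}
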
index f15692a410c7e7064e844be2b7d83feee20ef5c5..9ec4e0d94ffd5bdd0be3be7fe6c83ae9ac37e85f 100644 (file)
@@ -384,7 +384,7 @@ static int set_msi_sid(struct irte *irte, struct pci_dev *dev)
 
 static int iommu_load_old_irte(struct intel_iommu *iommu)
 {
-       struct irte *old_ir_table;
+       struct irte __iomem *old_ir_table;
        phys_addr_t irt_phys;
        unsigned int i;
        size_t size;
@@ -413,7 +413,7 @@ static int iommu_load_old_irte(struct intel_iommu *iommu)
                return -ENOMEM;
 
        /* Copy data over */
-       memcpy(iommu->ir_table->base, old_ir_table, size);
+       memcpy_fromio(iommu->ir_table->base, old_ir_table, size);
 
        __iommu_flush_cache(iommu, iommu->ir_table->base, size);
 
@@ -426,6 +426,8 @@ static int iommu_load_old_irte(struct intel_iommu *iommu)
                        bitmap_set(iommu->ir_table->bitmap, i, 1);
        }
 
+       iounmap(old_ir_table);
+
        return 0;
 }
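
iommu_load_old_irte() keeps the same shape but now releases the temporary mapping of the old interrupt-remapping table with iounmap() once its contents have been copied out. The general map/copy/unmap sequence looks roughly like this (illustrative only; phys and size would come from the previous kernel's registers):

#include <linux/io.h>
#include <linux/slab.h>

static void *copy_old_table(phys_addr_t phys, size_t size)
{
	void __iomem *old;
	void *copy;

	old = ioremap(phys, size);		/* temporary mapping of the old table */
	if (!old)
		return NULL;

	copy = kmalloc(size, GFP_KERNEL);
	if (copy)
		memcpy_fromio(copy, old, size);

	iounmap(old);				/* do not leak the temporary mapping */
	return copy;
}
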
 
index 4e460216bd1644e5bb8a26ba9a4c3891d7452393..73c07482f48763c5af3f0d43d73a2f04774bb74d 100644 (file)
@@ -26,6 +26,8 @@
 #include <linux/slab.h>
 #include <linux/types.h>
 
+#include <asm/barrier.h>
+
 #include "io-pgtable.h"
 
 #define ARM_LPAE_MAX_ADDR_BITS         48
@@ -200,20 +202,97 @@ typedef u64 arm_lpae_iopte;
 
 static bool selftest_running = false;
 
+static dma_addr_t __arm_lpae_dma_addr(struct device *dev, void *pages)
+{
+       return phys_to_dma(dev, virt_to_phys(pages));
+}
+
+static void *__arm_lpae_alloc_pages(size_t size, gfp_t gfp,
+                                   struct io_pgtable_cfg *cfg)
+{
+       struct device *dev = cfg->iommu_dev;
+       dma_addr_t dma;
+       void *pages = alloc_pages_exact(size, gfp | __GFP_ZERO);
+
+       if (!pages)
+               return NULL;
+
+       if (!selftest_running) {
+               dma = dma_map_single(dev, pages, size, DMA_TO_DEVICE);
+               if (dma_mapping_error(dev, dma))
+                       goto out_free;
+               /*
+                * We depend on the IOMMU being able to work with any physical
+                * address directly, so if the DMA layer suggests it can't by
+                * giving us back some translation, that bodes very badly...
+                */
+               if (dma != __arm_lpae_dma_addr(dev, pages))
+                       goto out_unmap;
+       }
+
+       return pages;
+
+out_unmap:
+       dev_err(dev, "Cannot accommodate DMA translation for IOMMU page tables\n");
+       dma_unmap_single(dev, dma, size, DMA_TO_DEVICE);
+out_free:
+       free_pages_exact(pages, size);
+       return NULL;
+}
+
+static void __arm_lpae_free_pages(void *pages, size_t size,
+                                 struct io_pgtable_cfg *cfg)
+{
+       struct device *dev = cfg->iommu_dev;
+
+       if (!selftest_running)
+               dma_unmap_single(dev, __arm_lpae_dma_addr(dev, pages),
+                                size, DMA_TO_DEVICE);
+       free_pages_exact(pages, size);
+}
+
+static void __arm_lpae_set_pte(arm_lpae_iopte *ptep, arm_lpae_iopte pte,
+                              struct io_pgtable_cfg *cfg)
+{
+       struct device *dev = cfg->iommu_dev;
+
+       *ptep = pte;
+
+       if (!selftest_running)
+               dma_sync_single_for_device(dev, __arm_lpae_dma_addr(dev, ptep),
+                                          sizeof(pte), DMA_TO_DEVICE);
+}
+
+static int __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
+                           unsigned long iova, size_t size, int lvl,
+                           arm_lpae_iopte *ptep);
+
 static int arm_lpae_init_pte(struct arm_lpae_io_pgtable *data,
                             unsigned long iova, phys_addr_t paddr,
                             arm_lpae_iopte prot, int lvl,
                             arm_lpae_iopte *ptep)
 {
        arm_lpae_iopte pte = prot;
+       struct io_pgtable_cfg *cfg = &data->iop.cfg;
 
-       /* We require an unmap first */
        if (iopte_leaf(*ptep, lvl)) {
+               /* We require an unmap first */
                WARN_ON(!selftest_running);
                return -EEXIST;
+       } else if (iopte_type(*ptep, lvl) == ARM_LPAE_PTE_TYPE_TABLE) {
+               /*
+                * We need to unmap and free the old table before
+                * overwriting it with a block entry.
+                */
+               arm_lpae_iopte *tblp;
+               size_t sz = ARM_LPAE_BLOCK_SIZE(lvl, data);
+
+               tblp = ptep - ARM_LPAE_LVL_IDX(iova, lvl, data);
+               if (WARN_ON(__arm_lpae_unmap(data, iova, sz, lvl, tblp) != sz))
+                       return -EINVAL;
        }
 
-       if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_NS)
+       if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_NS)
                pte |= ARM_LPAE_PTE_NS;
 
        if (lvl == ARM_LPAE_MAX_LEVELS - 1)
@@ -224,8 +303,7 @@ static int arm_lpae_init_pte(struct arm_lpae_io_pgtable *data,
        pte |= ARM_LPAE_PTE_AF | ARM_LPAE_PTE_SH_IS;
        pte |= pfn_to_iopte(paddr >> data->pg_shift, data);
 
-       *ptep = pte;
-       data->iop.cfg.tlb->flush_pgtable(ptep, sizeof(*ptep), data->iop.cookie);
+       __arm_lpae_set_pte(ptep, pte, cfg);
        return 0;
 }
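
The io-pgtable-arm hunks above stop relying on the tlb->flush_pgtable() callback and instead treat page-table memory as a streaming DMA buffer owned by cfg->iommu_dev: tables are dma_map_single()'d when allocated, and each PTE store is pushed out with dma_sync_single_for_device() so a non-coherent walker observes it. A stripped-down sketch of that scheme (not the driver code):

#include <linux/dma-mapping.h>
#include <linux/mm.h>

static u64 *table_alloc(struct device *dev, size_t size, dma_addr_t *dma)
{
	u64 *tbl = alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO);

	if (!tbl)
		return NULL;

	*dma = dma_map_single(dev, tbl, size, DMA_TO_DEVICE);
	if (dma_mapping_error(dev, *dma)) {
		free_pages_exact(tbl, size);
		return NULL;
	}
	return tbl;
}

static void table_set_entry(struct device *dev, u64 *tbl, dma_addr_t dma,
			    int idx, u64 val)
{
	tbl[idx] = val;
	/* Make the updated entry visible to a non-coherent table walker. */
	dma_sync_single_for_device(dev, dma + idx * sizeof(u64),
				   sizeof(u64), DMA_TO_DEVICE);
}
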
 
@@ -234,14 +312,14 @@ static int __arm_lpae_map(struct arm_lpae_io_pgtable *data, unsigned long iova,
                          int lvl, arm_lpae_iopte *ptep)
 {
        arm_lpae_iopte *cptep, pte;
-       void *cookie = data->iop.cookie;
        size_t block_size = ARM_LPAE_BLOCK_SIZE(lvl, data);
+       struct io_pgtable_cfg *cfg = &data->iop.cfg;
 
        /* Find our entry at the current level */
        ptep += ARM_LPAE_LVL_IDX(iova, lvl, data);
 
        /* If we can install a leaf entry at this level, then do so */
-       if (size == block_size && (size & data->iop.cfg.pgsize_bitmap))
+       if (size == block_size && (size & cfg->pgsize_bitmap))
                return arm_lpae_init_pte(data, iova, paddr, prot, lvl, ptep);
 
        /* We can't allocate tables at the final level */
@@ -251,18 +329,15 @@ static int __arm_lpae_map(struct arm_lpae_io_pgtable *data, unsigned long iova,
        /* Grab a pointer to the next level */
        pte = *ptep;
        if (!pte) {
-               cptep = alloc_pages_exact(1UL << data->pg_shift,
-                                        GFP_ATOMIC | __GFP_ZERO);
+               cptep = __arm_lpae_alloc_pages(1UL << data->pg_shift,
+                                              GFP_ATOMIC, cfg);
                if (!cptep)
                        return -ENOMEM;
 
-               data->iop.cfg.tlb->flush_pgtable(cptep, 1UL << data->pg_shift,
-                                                cookie);
                pte = __pa(cptep) | ARM_LPAE_PTE_TYPE_TABLE;
-               if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_NS)
+               if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_NS)
                        pte |= ARM_LPAE_PTE_NSTABLE;
-               *ptep = pte;
-               data->iop.cfg.tlb->flush_pgtable(ptep, sizeof(*ptep), cookie);
+               __arm_lpae_set_pte(ptep, pte, cfg);
        } else {
                cptep = iopte_deref(pte, data);
        }
@@ -309,7 +384,7 @@ static int arm_lpae_map(struct io_pgtable_ops *ops, unsigned long iova,
 {
        struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
        arm_lpae_iopte *ptep = data->pgd;
-       int lvl = ARM_LPAE_START_LVL(data);
+       int ret, lvl = ARM_LPAE_START_LVL(data);
        arm_lpae_iopte prot;
 
        /* If no access, then nothing to do */
@@ -317,7 +392,14 @@ static int arm_lpae_map(struct io_pgtable_ops *ops, unsigned long iova,
                return 0;
 
        prot = arm_lpae_prot_to_pte(data, iommu_prot);
-       return __arm_lpae_map(data, iova, paddr, size, prot, lvl, ptep);
+       ret = __arm_lpae_map(data, iova, paddr, size, prot, lvl, ptep);
+       /*
+        * Synchronise all PTE updates for the new mapping before there's
+        * a chance for anything to kick off a table walk for the new iova.
+        */
+       wmb();
+
+       return ret;
 }
 
 static void __arm_lpae_free_pgtable(struct arm_lpae_io_pgtable *data, int lvl,
@@ -347,7 +429,7 @@ static void __arm_lpae_free_pgtable(struct arm_lpae_io_pgtable *data, int lvl,
                __arm_lpae_free_pgtable(data, lvl + 1, iopte_deref(pte, data));
        }
 
-       free_pages_exact(start, table_size);
+       __arm_lpae_free_pages(start, table_size, &data->iop.cfg);
 }
 
 static void arm_lpae_free_pgtable(struct io_pgtable *iop)
@@ -366,8 +448,7 @@ static int arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data,
        unsigned long blk_start, blk_end;
        phys_addr_t blk_paddr;
        arm_lpae_iopte table = 0;
-       void *cookie = data->iop.cookie;
-       const struct iommu_gather_ops *tlb = data->iop.cfg.tlb;
+       struct io_pgtable_cfg *cfg = &data->iop.cfg;
 
        blk_start = iova & ~(blk_size - 1);
        blk_end = blk_start + blk_size;
@@ -393,10 +474,9 @@ static int arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data,
                }
        }
 
-       *ptep = table;
-       tlb->flush_pgtable(ptep, sizeof(*ptep), cookie);
+       __arm_lpae_set_pte(ptep, table, cfg);
        iova &= ~(blk_size - 1);
-       tlb->tlb_add_flush(iova, blk_size, true, cookie);
+       cfg->tlb->tlb_add_flush(iova, blk_size, true, data->iop.cookie);
        return size;
 }
 
@@ -418,13 +498,12 @@ static int __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
 
        /* If the size matches this level, we're in the right place */
        if (size == blk_size) {
-               *ptep = 0;
-               tlb->flush_pgtable(ptep, sizeof(*ptep), cookie);
+               __arm_lpae_set_pte(ptep, 0, &data->iop.cfg);
 
                if (!iopte_leaf(pte, lvl)) {
                        /* Also flush any partial walks */
                        tlb->tlb_add_flush(iova, size, false, cookie);
-                       tlb->tlb_sync(data->iop.cookie);
+                       tlb->tlb_sync(cookie);
                        ptep = iopte_deref(pte, data);
                        __arm_lpae_free_pgtable(data, lvl + 1, ptep);
                } else {
@@ -640,11 +719,12 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie)
        cfg->arm_lpae_s1_cfg.mair[1] = 0;
 
        /* Looking good; allocate a pgd */
-       data->pgd = alloc_pages_exact(data->pgd_size, GFP_KERNEL | __GFP_ZERO);
+       data->pgd = __arm_lpae_alloc_pages(data->pgd_size, GFP_KERNEL, cfg);
        if (!data->pgd)
                goto out_free_data;
 
-       cfg->tlb->flush_pgtable(data->pgd, data->pgd_size, cookie);
+       /* Ensure the empty pgd is visible before any actual TTBR write */
+       wmb();
 
        /* TTBRs */
        cfg->arm_lpae_s1_cfg.ttbr[0] = virt_to_phys(data->pgd);
@@ -728,11 +808,12 @@ arm_64_lpae_alloc_pgtable_s2(struct io_pgtable_cfg *cfg, void *cookie)
        cfg->arm_lpae_s2_cfg.vtcr = reg;
 
        /* Allocate pgd pages */
-       data->pgd = alloc_pages_exact(data->pgd_size, GFP_KERNEL | __GFP_ZERO);
+       data->pgd = __arm_lpae_alloc_pages(data->pgd_size, GFP_KERNEL, cfg);
        if (!data->pgd)
                goto out_free_data;
 
-       cfg->tlb->flush_pgtable(data->pgd, data->pgd_size, cookie);
+       /* Ensure the empty pgd is visible before any actual TTBR write */
+       wmb();
 
        /* VTTBR */
        cfg->arm_lpae_s2_cfg.vttbr = virt_to_phys(data->pgd);
@@ -818,16 +899,10 @@ static void dummy_tlb_sync(void *cookie)
        WARN_ON(cookie != cfg_cookie);
 }
 
-static void dummy_flush_pgtable(void *ptr, size_t size, void *cookie)
-{
-       WARN_ON(cookie != cfg_cookie);
-}
-
 static struct iommu_gather_ops dummy_tlb_ops __initdata = {
        .tlb_flush_all  = dummy_tlb_flush_all,
        .tlb_add_flush  = dummy_tlb_add_flush,
        .tlb_sync       = dummy_tlb_sync,
-       .flush_pgtable  = dummy_flush_pgtable,
 };
 
 static void __init arm_lpae_dump_ops(struct io_pgtable_ops *ops)
index 6436fe24bc2f6fdc0273d6017056a24f964b640e..6f2e319d4f04a58d1174984338c0af9d856e9329 100644 (file)
 
 #include "io-pgtable.h"
 
-extern struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s1_init_fns;
-extern struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s2_init_fns;
-extern struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s1_init_fns;
-extern struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s2_init_fns;
-
 static const struct io_pgtable_init_fns *
 io_pgtable_init_table[IO_PGTABLE_NUM_FMTS] =
 {
index 10e32f69c6681368cb04023173a4bcf0b859839a..ac9e2341a633ed82420d3d06bd24cec23240b86b 100644 (file)
@@ -17,8 +17,9 @@ enum io_pgtable_fmt {
  *
  * @tlb_flush_all: Synchronously invalidate the entire TLB context.
  * @tlb_add_flush: Queue up a TLB invalidation for a virtual address range.
- * @tlb_sync:      Ensure any queue TLB invalidation has taken effect.
- * @flush_pgtable: Ensure page table updates are visible to the IOMMU.
+ * @tlb_sync:      Ensure any queued TLB invalidation has taken effect, and
+ *                 any corresponding page table updates are visible to the
+ *                 IOMMU.
  *
  * Note that these can all be called in atomic context and must therefore
  * not block.
@@ -28,7 +29,6 @@ struct iommu_gather_ops {
        void (*tlb_add_flush)(unsigned long iova, size_t size, bool leaf,
                              void *cookie);
        void (*tlb_sync)(void *cookie);
-       void (*flush_pgtable)(void *ptr, size_t size, void *cookie);
 };
 
 /**
@@ -41,6 +41,8 @@ struct iommu_gather_ops {
  * @ias:           Input address (iova) size, in bits.
  * @oas:           Output address (paddr) size, in bits.
  * @tlb:           TLB management callbacks for this set of tables.
+ * @iommu_dev:     The device representing the DMA configuration for the
+ *                 page table walker.
  */
 struct io_pgtable_cfg {
        #define IO_PGTABLE_QUIRK_ARM_NS (1 << 0)        /* Set NS bit in PTEs */
@@ -49,6 +51,7 @@ struct io_pgtable_cfg {
        unsigned int                    ias;
        unsigned int                    oas;
        const struct iommu_gather_ops   *tlb;
+       struct device                   *iommu_dev;
 
        /* Low-level data specific to the table format */
        union {
@@ -140,4 +143,9 @@ struct io_pgtable_init_fns {
        void (*free)(struct io_pgtable *iop);
 };
 
+extern struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s1_init_fns;
+extern struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s2_init_fns;
+extern struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s1_init_fns;
+extern struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s2_init_fns;
+
 #endif /* __IO_PGTABLE_H */
index 1a67c531a07eb908519e5c130dd22e0ebf377566..8cf605fa9946013642b2a88f500beb285cc55cfc 100644 (file)
@@ -283,24 +283,10 @@ static void ipmmu_tlb_add_flush(unsigned long iova, size_t size, bool leaf,
        /* The hardware doesn't support selective TLB flush. */
 }
 
-static void ipmmu_flush_pgtable(void *ptr, size_t size, void *cookie)
-{
-       unsigned long offset = (unsigned long)ptr & ~PAGE_MASK;
-       struct ipmmu_vmsa_domain *domain = cookie;
-
-       /*
-        * TODO: Add support for coherent walk through CCI with DVM and remove
-        * cache handling.
-        */
-       dma_map_page(domain->mmu->dev, virt_to_page(ptr), offset, size,
-                    DMA_TO_DEVICE);
-}
-
 static struct iommu_gather_ops ipmmu_gather_ops = {
        .tlb_flush_all = ipmmu_tlb_flush_all,
        .tlb_add_flush = ipmmu_tlb_add_flush,
        .tlb_sync = ipmmu_tlb_flush_all,
-       .flush_pgtable = ipmmu_flush_pgtable,
 };
 
 /* -----------------------------------------------------------------------------
@@ -327,6 +313,11 @@ static int ipmmu_domain_init_context(struct ipmmu_vmsa_domain *domain)
        domain->cfg.ias = 32;
        domain->cfg.oas = 40;
        domain->cfg.tlb = &ipmmu_gather_ops;
+       /*
+        * TODO: Add support for coherent walk through CCI with DVM and remove
+        * cache handling. For now, delegate it to the io-pgtable code.
+        */
+       domain->cfg.iommu_dev = domain->mmu->dev;
 
        domain->iop = alloc_io_pgtable_ops(ARM_32_LPAE_S1, &domain->cfg,
                                           domain);
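
With the new cfg->iommu_dev field, a driver describes both the translation geometry and the device that performs the table walks before calling alloc_io_pgtable_ops(). A hedged sketch of such a caller, simplified from the ipmmu hunk above (the gather ops are assumed to exist elsewhere, and a real driver would keep cfg around to read back the TTBR values the allocator fills in):

#include <linux/sizes.h>
#include "io-pgtable.h"

static struct io_pgtable_ops *
example_alloc_pgtable(struct device *walker_dev,
		      const struct iommu_gather_ops *tlb, void *cookie)
{
	struct io_pgtable_cfg cfg = {
		.pgsize_bitmap	= SZ_1G | SZ_2M | SZ_4K,
		.ias		= 32,
		.oas		= 40,
		.tlb		= tlb,
		.iommu_dev	= walker_dev,	/* device doing the table walks */
	};

	return alloc_io_pgtable_ops(ARM_32_LPAE_S1, &cfg, cookie);
}
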
index 2d9993062ded6b2d543c89a9c09c3e6a85dfb17f..913455a5fd40e044e21ec20095296027dad3c1e6 100644 (file)
@@ -84,7 +84,7 @@ void set_irq_remapping_broken(void)
 bool irq_remapping_cap(enum irq_remap_cap cap)
 {
        if (!remap_ops || disable_irq_post)
-               return 0;
+               return false;
 
        return (remap_ops->capability & (1 << cap));
 }
index 15a2063812fa8ddf5aa2aa2eb912c526b8bc4da2..e321fa517a4526d191a6a997a361e0228305932b 100644 (file)
@@ -106,8 +106,8 @@ static int __flush_iotlb(struct iommu_domain *domain)
 #endif
 
        list_for_each_entry(ctx_drvdata, &priv->list_attached, attached_elm) {
-               if (!ctx_drvdata->pdev || !ctx_drvdata->pdev->dev.parent)
-                       BUG();
+
+               BUG_ON(!ctx_drvdata->pdev || !ctx_drvdata->pdev->dev.parent);
 
                iommu_drvdata = dev_get_drvdata(ctx_drvdata->pdev->dev.parent);
                BUG_ON(!iommu_drvdata);
index 43429ab62228a4d0674dadeab740be812af77ff8..60ba238090d92f3eed93de994d5fd7271a68b8ce 100644 (file)
@@ -141,10 +141,12 @@ struct iommu_ops *of_iommu_configure(struct device *dev,
        struct iommu_ops *ops = NULL;
        int idx = 0;
 
-       if (dev_is_pci(dev)) {
-               dev_err(dev, "IOMMU is currently not supported for PCI\n");
+       /*
+        * We can't do much for PCI devices without knowing how
+        * device IDs are wired up from the PCI bus to the IOMMU.
+        */
+       if (dev_is_pci(dev))
                return NULL;
-       }
 
        /*
         * We don't currently walk up the tree looking for a parent IOMMU.
index f3d20a2039d20417093fd70c6192b905e7c3ccf8..9bc20e2119a35412df6584258c4e7a1c2d3b9687 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/io.h>
 #include <linux/slab.h>
 #include <linux/uaccess.h>
+#include <linux/pm_runtime.h>
 #include <linux/debugfs.h>
 #include <linux/platform_data/iommu-omap.h>
 
@@ -29,6 +30,59 @@ static inline bool is_omap_iommu_detached(struct omap_iommu *obj)
        return !obj->domain;
 }
 
+#define pr_reg(name)                                                   \
+       do {                                                            \
+               ssize_t bytes;                                          \
+               const char *str = "%20s: %08x\n";                       \
+               const int maxcol = 32;                                  \
+               bytes = snprintf(p, maxcol, str, __stringify(name),     \
+                                iommu_read_reg(obj, MMU_##name));      \
+               p += bytes;                                             \
+               len -= bytes;                                           \
+               if (len < maxcol)                                       \
+                       goto out;                                       \
+       } while (0)
+
+static ssize_t
+omap2_iommu_dump_ctx(struct omap_iommu *obj, char *buf, ssize_t len)
+{
+       char *p = buf;
+
+       pr_reg(REVISION);
+       pr_reg(IRQSTATUS);
+       pr_reg(IRQENABLE);
+       pr_reg(WALKING_ST);
+       pr_reg(CNTL);
+       pr_reg(FAULT_AD);
+       pr_reg(TTB);
+       pr_reg(LOCK);
+       pr_reg(LD_TLB);
+       pr_reg(CAM);
+       pr_reg(RAM);
+       pr_reg(GFLUSH);
+       pr_reg(FLUSH_ENTRY);
+       pr_reg(READ_CAM);
+       pr_reg(READ_RAM);
+       pr_reg(EMU_FAULT_AD);
+out:
+       return p - buf;
+}
+
+static ssize_t omap_iommu_dump_ctx(struct omap_iommu *obj, char *buf,
+                                  ssize_t bytes)
+{
+       if (!obj || !buf)
+               return -EINVAL;
+
+       pm_runtime_get_sync(obj->dev);
+
+       bytes = omap2_iommu_dump_ctx(obj, buf, bytes);
+
+       pm_runtime_put_sync(obj->dev);
+
+       return bytes;
+}
+
 static ssize_t debug_read_regs(struct file *file, char __user *userbuf,
                               size_t count, loff_t *ppos)
 {
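
The dump helpers moved into omap-iommu-debug.c bracket every register access with a runtime-PM get/put pair so the MMU is powered and clocked while its registers are read. Roughly, the bracket looks like this (illustrative only; names are hypothetical):

#include <linux/io.h>
#include <linux/pm_runtime.h>

static u32 read_hw_reg(struct device *dev, void __iomem *base, unsigned int off)
{
	u32 val;

	pm_runtime_get_sync(dev);	/* ensure the block is powered and clocked */
	val = readl(base + off);
	pm_runtime_put_sync(dev);

	return val;
}
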
@@ -55,34 +109,72 @@ static ssize_t debug_read_regs(struct file *file, char __user *userbuf,
        return bytes;
 }
 
-static ssize_t debug_read_tlb(struct file *file, char __user *userbuf,
-                             size_t count, loff_t *ppos)
+static int
+__dump_tlb_entries(struct omap_iommu *obj, struct cr_regs *crs, int num)
 {
-       struct omap_iommu *obj = file->private_data;
-       char *p, *buf;
-       ssize_t bytes, rest;
+       int i;
+       struct iotlb_lock saved;
+       struct cr_regs tmp;
+       struct cr_regs *p = crs;
+
+       pm_runtime_get_sync(obj->dev);
+       iotlb_lock_get(obj, &saved);
+
+       for_each_iotlb_cr(obj, num, i, tmp) {
+               if (!iotlb_cr_valid(&tmp))
+                       continue;
+               *p++ = tmp;
+       }
+
+       iotlb_lock_set(obj, &saved);
+       pm_runtime_put_sync(obj->dev);
+
+       return  p - crs;
+}
+
+static ssize_t iotlb_dump_cr(struct omap_iommu *obj, struct cr_regs *cr,
+                            struct seq_file *s)
+{
+       seq_printf(s, "%08x %08x %01x\n", cr->cam, cr->ram,
+                         (cr->cam & MMU_CAM_P) ? 1 : 0);
+       return 0;
+}
+
+static size_t omap_dump_tlb_entries(struct omap_iommu *obj, struct seq_file *s)
+{
+       int i, num;
+       struct cr_regs *cr;
+
+       num = obj->nr_tlb_entries;
+
+       cr = kcalloc(num, sizeof(*cr), GFP_KERNEL);
+       if (!cr)
+               return 0;
+
+       num = __dump_tlb_entries(obj, cr, num);
+       for (i = 0; i < num; i++)
+               iotlb_dump_cr(obj, cr + i, s);
+       kfree(cr);
+
+       return 0;
+}
+
+static int debug_read_tlb(struct seq_file *s, void *data)
+{
+       struct omap_iommu *obj = s->private;
 
        if (is_omap_iommu_detached(obj))
                return -EPERM;
 
-       buf = kmalloc(count, GFP_KERNEL);
-       if (!buf)
-               return -ENOMEM;
-       p = buf;
-
        mutex_lock(&iommu_debug_lock);
 
-       p += sprintf(p, "%8s %8s\n", "cam:", "ram:");
-       p += sprintf(p, "-----------------------------------------\n");
-       rest = count - (p - buf);
-       p += omap_dump_tlb_entries(obj, p, rest);
-
-       bytes = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf);
+       seq_printf(s, "%8s %8s\n", "cam:", "ram:");
+       seq_puts(s, "-----------------------------------------\n");
+       omap_dump_tlb_entries(obj, s);
 
        mutex_unlock(&iommu_debug_lock);
-       kfree(buf);
 
-       return bytes;
+       return 0;
 }
 
 static void dump_ioptable(struct seq_file *s)
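
The TLB dump above is converted to the seq_file idiom: the show() callback just emits lines with seq_printf()/seq_puts(), and single_open() wires it to a debugfs file, so no intermediate kmalloc'd buffer or simple_read_from_buffer() is needed. A minimal, self-contained sketch of that pattern with hypothetical names:

#include <linux/debugfs.h>
#include <linux/seq_file.h>

struct my_obj {					/* hypothetical object being dumped */
	int nr_widgets;
	u32 widget[16];
};

static int widgets_show(struct seq_file *s, void *data)
{
	struct my_obj *obj = s->private;
	int i;

	for (i = 0; i < obj->nr_widgets; i++)
		seq_printf(s, "widget %d: %08x\n", i, obj->widget[i]);
	return 0;
}

static int widgets_open(struct inode *inode, struct file *file)
{
	return single_open(file, widgets_show, inode->i_private);
}

static const struct file_operations widgets_fops = {
	.open		= widgets_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

/* registered with: debugfs_create_file("widgets", 0400, parent, obj, &widgets_fops); */
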
@@ -154,10 +246,10 @@ static int debug_read_pagetable(struct seq_file *s, void *data)
                .open = simple_open,                                    \
                .read = debug_read_##name,                              \
                .llseek = generic_file_llseek,                          \
-       };
+       }
 
 DEBUG_FOPS_RO(regs);
-DEBUG_FOPS_RO(tlb);
+DEBUG_SEQ_FOPS_RO(tlb);
 DEBUG_SEQ_FOPS_RO(pagetable);
 
 #define __DEBUG_ADD_FILE(attr, mode)                                   \
index a22c33d6a486c9dea9bfa820feea18a034623eec..36d0033c2ccbfc554b02b8b4183fdbde4ededffe 100644 (file)
@@ -12,7 +12,6 @@
  */
 
 #include <linux/err.h>
-#include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/interrupt.h>
 #include <linux/ioport.h>
 #define to_iommu(dev)                                                  \
        ((struct omap_iommu *)platform_get_drvdata(to_platform_device(dev)))
 
-#define for_each_iotlb_cr(obj, n, __i, cr)                             \
-       for (__i = 0;                                                   \
-            (__i < (n)) && (cr = __iotlb_read_cr((obj), __i), true);   \
-            __i++)
-
 /* bitmap of the page sizes currently supported */
 #define OMAP_IOMMU_PGSIZES     (SZ_4K | SZ_64K | SZ_1M | SZ_16M)
 
@@ -72,11 +66,6 @@ struct omap_iommu_domain {
 #define MMU_LOCK_VICT(x)       \
        ((x & MMU_LOCK_VICT_MASK) >> MMU_LOCK_VICT_SHIFT)
 
-struct iotlb_lock {
-       short base;
-       short vict;
-};
-
 static struct platform_driver omap_iommu_driver;
 static struct kmem_cache *iopte_cachep;
 
@@ -213,14 +202,6 @@ static void iommu_disable(struct omap_iommu *obj)
 /*
  *     TLB operations
  */
-static inline int iotlb_cr_valid(struct cr_regs *cr)
-{
-       if (!cr)
-               return -EINVAL;
-
-       return cr->cam & MMU_CAM_V;
-}
-
 static u32 iotlb_cr_to_virt(struct cr_regs *cr)
 {
        u32 page_size = cr->cam & MMU_CAM_PGSZ_MASK;
@@ -260,7 +241,7 @@ static u32 iommu_report_fault(struct omap_iommu *obj, u32 *da)
        return status;
 }
 
-static void iotlb_lock_get(struct omap_iommu *obj, struct iotlb_lock *l)
+void iotlb_lock_get(struct omap_iommu *obj, struct iotlb_lock *l)
 {
        u32 val;
 
@@ -268,10 +249,9 @@ static void iotlb_lock_get(struct omap_iommu *obj, struct iotlb_lock *l)
 
        l->base = MMU_LOCK_BASE(val);
        l->vict = MMU_LOCK_VICT(val);
-
 }
 
-static void iotlb_lock_set(struct omap_iommu *obj, struct iotlb_lock *l)
+void iotlb_lock_set(struct omap_iommu *obj, struct iotlb_lock *l)
 {
        u32 val;
 
@@ -297,7 +277,7 @@ static void iotlb_load_cr(struct omap_iommu *obj, struct cr_regs *cr)
 }
 
 /* only used in iotlb iteration for-loop */
-static struct cr_regs __iotlb_read_cr(struct omap_iommu *obj, int n)
+struct cr_regs __iotlb_read_cr(struct omap_iommu *obj, int n)
 {
        struct cr_regs cr;
        struct iotlb_lock l;
@@ -468,129 +448,6 @@ static void flush_iotlb_all(struct omap_iommu *obj)
        pm_runtime_put_sync(obj->dev);
 }
 
-#ifdef CONFIG_OMAP_IOMMU_DEBUG
-
-#define pr_reg(name)                                                   \
-       do {                                                            \
-               ssize_t bytes;                                          \
-               const char *str = "%20s: %08x\n";                       \
-               const int maxcol = 32;                                  \
-               bytes = snprintf(p, maxcol, str, __stringify(name),     \
-                                iommu_read_reg(obj, MMU_##name));      \
-               p += bytes;                                             \
-               len -= bytes;                                           \
-               if (len < maxcol)                                       \
-                       goto out;                                       \
-       } while (0)
-
-static ssize_t
-omap2_iommu_dump_ctx(struct omap_iommu *obj, char *buf, ssize_t len)
-{
-       char *p = buf;
-
-       pr_reg(REVISION);
-       pr_reg(IRQSTATUS);
-       pr_reg(IRQENABLE);
-       pr_reg(WALKING_ST);
-       pr_reg(CNTL);
-       pr_reg(FAULT_AD);
-       pr_reg(TTB);
-       pr_reg(LOCK);
-       pr_reg(LD_TLB);
-       pr_reg(CAM);
-       pr_reg(RAM);
-       pr_reg(GFLUSH);
-       pr_reg(FLUSH_ENTRY);
-       pr_reg(READ_CAM);
-       pr_reg(READ_RAM);
-       pr_reg(EMU_FAULT_AD);
-out:
-       return p - buf;
-}
-
-ssize_t omap_iommu_dump_ctx(struct omap_iommu *obj, char *buf, ssize_t bytes)
-{
-       if (!obj || !buf)
-               return -EINVAL;
-
-       pm_runtime_get_sync(obj->dev);
-
-       bytes = omap2_iommu_dump_ctx(obj, buf, bytes);
-
-       pm_runtime_put_sync(obj->dev);
-
-       return bytes;
-}
-
-static int
-__dump_tlb_entries(struct omap_iommu *obj, struct cr_regs *crs, int num)
-{
-       int i;
-       struct iotlb_lock saved;
-       struct cr_regs tmp;
-       struct cr_regs *p = crs;
-
-       pm_runtime_get_sync(obj->dev);
-       iotlb_lock_get(obj, &saved);
-
-       for_each_iotlb_cr(obj, num, i, tmp) {
-               if (!iotlb_cr_valid(&tmp))
-                       continue;
-               *p++ = tmp;
-       }
-
-       iotlb_lock_set(obj, &saved);
-       pm_runtime_put_sync(obj->dev);
-
-       return  p - crs;
-}
-
-/**
- * iotlb_dump_cr - Dump an iommu tlb entry into buf
- * @obj:       target iommu
- * @cr:                contents of cam and ram register
- * @buf:       output buffer
- **/
-static ssize_t iotlb_dump_cr(struct omap_iommu *obj, struct cr_regs *cr,
-                            char *buf)
-{
-       char *p = buf;
-
-       /* FIXME: Need more detail analysis of cam/ram */
-       p += sprintf(p, "%08x %08x %01x\n", cr->cam, cr->ram,
-                                       (cr->cam & MMU_CAM_P) ? 1 : 0);
-
-       return p - buf;
-}
-
-/**
- * omap_dump_tlb_entries - dump cr arrays to given buffer
- * @obj:       target iommu
- * @buf:       output buffer
- **/
-size_t omap_dump_tlb_entries(struct omap_iommu *obj, char *buf, ssize_t bytes)
-{
-       int i, num;
-       struct cr_regs *cr;
-       char *p = buf;
-
-       num = bytes / sizeof(*cr);
-       num = min(obj->nr_tlb_entries, num);
-
-       cr = kcalloc(num, sizeof(*cr), GFP_KERNEL);
-       if (!cr)
-               return 0;
-
-       num = __dump_tlb_entries(obj, cr, num);
-       for (i = 0; i < num; i++)
-               p += iotlb_dump_cr(obj, cr + i, p);
-       kfree(cr);
-
-       return p - buf;
-}
-
-#endif /* CONFIG_OMAP_IOMMU_DEBUG */
-
 /*
  *     H/W pagetable operations
  */
@@ -930,14 +787,14 @@ static irqreturn_t iommu_fault_handler(int irq, void *data)
 
        if (!iopgd_is_table(*iopgd)) {
                dev_err(obj->dev, "%s: errs:0x%08x da:0x%08x pgd:0x%p *pgd:0x%08x\n",
-                               obj->name, errs, da, iopgd, *iopgd);
+                       obj->name, errs, da, iopgd, *iopgd);
                return IRQ_NONE;
        }
 
        iopte = iopte_offset(iopgd, da);
 
        dev_err(obj->dev, "%s: errs:0x%08x da:0x%08x pgd:0x%p *pgd:0x%08x pte:0x%p *pte:0x%08x\n",
-                       obj->name, errs, da, iopgd, *iopgd, iopte, *iopte);
+               obj->name, errs, da, iopgd, *iopgd, iopte, *iopte);
 
        return IRQ_NONE;
 }
@@ -963,9 +820,8 @@ static struct omap_iommu *omap_iommu_attach(const char *name, u32 *iopgd)
        struct device *dev;
        struct omap_iommu *obj;
 
-       dev = driver_find_device(&omap_iommu_driver.driver, NULL,
-                               (void *)name,
-                               device_match_by_alias);
+       dev = driver_find_device(&omap_iommu_driver.driver, NULL, (void *)name,
+                                device_match_by_alias);
        if (!dev)
                return ERR_PTR(-ENODEV);
 
@@ -1089,7 +945,6 @@ static const struct of_device_id omap_iommu_of_match[] = {
        { .compatible = "ti,dra7-iommu" },
        {},
 };
-MODULE_DEVICE_TABLE(of, omap_iommu_of_match);
 
 static struct platform_driver omap_iommu_driver = {
        .probe  = omap_iommu_probe,
@@ -1121,7 +976,7 @@ static u32 iotlb_init_entry(struct iotlb_entry *e, u32 da, u32 pa, int pgsz)
 }
 
 static int omap_iommu_map(struct iommu_domain *domain, unsigned long da,
-                        phys_addr_t pa, size_t bytes, int prot)
+                         phys_addr_t pa, size_t bytes, int prot)
 {
        struct omap_iommu_domain *omap_domain = to_omap_domain(domain);
        struct omap_iommu *oiommu = omap_domain->iommu_dev;
@@ -1148,7 +1003,7 @@ static int omap_iommu_map(struct iommu_domain *domain, unsigned long da,
 }
 
 static size_t omap_iommu_unmap(struct iommu_domain *domain, unsigned long da,
-                           size_t size)
+                              size_t size)
 {
        struct omap_iommu_domain *omap_domain = to_omap_domain(domain);
        struct omap_iommu *oiommu = omap_domain->iommu_dev;
@@ -1199,7 +1054,7 @@ out:
 }
 
 static void _omap_iommu_detach_dev(struct omap_iommu_domain *omap_domain,
-                       struct device *dev)
+                                  struct device *dev)
 {
        struct omap_iommu *oiommu = dev_to_omap_iommu(dev);
        struct omap_iommu_arch_data *arch_data = dev->archdata.iommu;
@@ -1220,7 +1075,7 @@ static void _omap_iommu_detach_dev(struct omap_iommu_domain *omap_domain,
 }
 
 static void omap_iommu_detach_dev(struct iommu_domain *domain,
-                                struct device *dev)
+                                 struct device *dev)
 {
        struct omap_iommu_domain *omap_domain = to_omap_domain(domain);
 
@@ -1237,16 +1092,12 @@ static struct iommu_domain *omap_iommu_domain_alloc(unsigned type)
                return NULL;
 
        omap_domain = kzalloc(sizeof(*omap_domain), GFP_KERNEL);
-       if (!omap_domain) {
-               pr_err("kzalloc failed\n");
+       if (!omap_domain)
                goto out;
-       }
 
        omap_domain->pgtable = kzalloc(IOPGD_TABLE_SIZE, GFP_KERNEL);
-       if (!omap_domain->pgtable) {
-               pr_err("kzalloc failed\n");
+       if (!omap_domain->pgtable)
                goto fail_nomem;
-       }
 
        /*
         * should never fail, but please keep this around to ensure
@@ -1285,7 +1136,7 @@ static void omap_iommu_domain_free(struct iommu_domain *domain)
 }
 
 static phys_addr_t omap_iommu_iova_to_phys(struct iommu_domain *domain,
-                                         dma_addr_t da)
+                                          dma_addr_t da)
 {
        struct omap_iommu_domain *omap_domain = to_omap_domain(domain);
        struct omap_iommu *oiommu = omap_domain->iommu_dev;
@@ -1302,7 +1153,7 @@ static phys_addr_t omap_iommu_iova_to_phys(struct iommu_domain *domain,
                        ret = omap_iommu_translate(*pte, da, IOLARGE_MASK);
                else
                        dev_err(dev, "bogus pte 0x%x, da 0x%llx", *pte,
-                                                       (unsigned long long)da);
+                               (unsigned long long)da);
        } else {
                if (iopgd_is_section(*pgd))
                        ret = omap_iommu_translate(*pgd, da, IOSECTION_MASK);
@@ -1310,7 +1161,7 @@ static phys_addr_t omap_iommu_iova_to_phys(struct iommu_domain *domain,
                        ret = omap_iommu_translate(*pgd, da, IOSUPER_MASK);
                else
                        dev_err(dev, "bogus pgd 0x%x, da 0x%llx", *pgd,
-                                                       (unsigned long long)da);
+                               (unsigned long long)da);
        }
 
        return ret;
@@ -1405,20 +1256,5 @@ static int __init omap_iommu_init(void)
 
        return platform_driver_register(&omap_iommu_driver);
 }
-/* must be ready before omap3isp is probed */
 subsys_initcall(omap_iommu_init);
-
-static void __exit omap_iommu_exit(void)
-{
-       kmem_cache_destroy(iopte_cachep);
-
-       platform_driver_unregister(&omap_iommu_driver);
-
-       omap_iommu_debugfs_exit();
-}
-module_exit(omap_iommu_exit);
-
-MODULE_DESCRIPTION("omap iommu: tlb and pagetable primitives");
-MODULE_ALIAS("platform:omap-iommu");
-MODULE_AUTHOR("Hiroshi DOYU, Paul Mundt and Toshihiro Kobayashi");
-MODULE_LICENSE("GPL v2");
+/* must be ready before omap3isp is probed */
index d736630df3c8a16a1a853915be83dec437225df0..a656df2f9e03d27b2ad06cbd6f202aaecf2ee5dc 100644 (file)
 #ifndef _OMAP_IOMMU_H
 #define _OMAP_IOMMU_H
 
+#include <linux/bitops.h>
+
+#define for_each_iotlb_cr(obj, n, __i, cr)                             \
+       for (__i = 0;                                                   \
+            (__i < (n)) && (cr = __iotlb_read_cr((obj), __i), true);   \
+            __i++)
+
 struct iotlb_entry {
        u32 da;
        u32 pa;
        u32 pgsz, prsvd, valid;
-       union {
-               u16 ap;
-               struct {
-                       u32 endian, elsz, mixed;
-               };
-       };
+       u32 endian, elsz, mixed;
 };
 
 struct omap_iommu {
@@ -49,20 +51,13 @@ struct omap_iommu {
 };
 
 struct cr_regs {
-       union {
-               struct {
-                       u16 cam_l;
-                       u16 cam_h;
-               };
-               u32 cam;
-       };
-       union {
-               struct {
-                       u16 ram_l;
-                       u16 ram_h;
-               };
-               u32 ram;
-       };
+       u32 cam;
+       u32 ram;
+};
+
+struct iotlb_lock {
+       short base;
+       short vict;
 };
 
 /**
@@ -103,11 +98,11 @@ static inline struct omap_iommu *dev_to_omap_iommu(struct device *dev)
  * MMU Register bit definitions
  */
 /* IRQSTATUS & IRQENABLE */
-#define MMU_IRQ_MULTIHITFAULT  (1 << 4)
-#define MMU_IRQ_TABLEWALKFAULT (1 << 3)
-#define MMU_IRQ_EMUMISS                (1 << 2)
-#define MMU_IRQ_TRANSLATIONFAULT       (1 << 1)
-#define MMU_IRQ_TLBMISS                (1 << 0)
+#define MMU_IRQ_MULTIHITFAULT  BIT(4)
+#define MMU_IRQ_TABLEWALKFAULT BIT(3)
+#define MMU_IRQ_EMUMISS                BIT(2)
+#define MMU_IRQ_TRANSLATIONFAULT       BIT(1)
+#define MMU_IRQ_TLBMISS                BIT(0)
 
 #define __MMU_IRQ_FAULT                \
        (MMU_IRQ_MULTIHITFAULT | MMU_IRQ_EMUMISS | MMU_IRQ_TRANSLATIONFAULT)
@@ -119,16 +114,16 @@ static inline struct omap_iommu *dev_to_omap_iommu(struct device *dev)
 /* MMU_CNTL */
 #define MMU_CNTL_SHIFT         1
 #define MMU_CNTL_MASK          (7 << MMU_CNTL_SHIFT)
-#define MMU_CNTL_EML_TLB       (1 << 3)
-#define MMU_CNTL_TWL_EN                (1 << 2)
-#define MMU_CNTL_MMU_EN                (1 << 1)
+#define MMU_CNTL_EML_TLB       BIT(3)
+#define MMU_CNTL_TWL_EN                BIT(2)
+#define MMU_CNTL_MMU_EN                BIT(1)
 
 /* CAM */
 #define MMU_CAM_VATAG_SHIFT    12
 #define MMU_CAM_VATAG_MASK \
        ((~0UL >> MMU_CAM_VATAG_SHIFT) << MMU_CAM_VATAG_SHIFT)
-#define MMU_CAM_P              (1 << 3)
-#define MMU_CAM_V              (1 << 2)
+#define MMU_CAM_P              BIT(3)
+#define MMU_CAM_V              BIT(2)
 #define MMU_CAM_PGSZ_MASK      3
 #define MMU_CAM_PGSZ_1M                (0 << 0)
 #define MMU_CAM_PGSZ_64K       (1 << 0)
@@ -141,9 +136,9 @@ static inline struct omap_iommu *dev_to_omap_iommu(struct device *dev)
        ((~0UL >> MMU_RAM_PADDR_SHIFT) << MMU_RAM_PADDR_SHIFT)
 
 #define MMU_RAM_ENDIAN_SHIFT   9
-#define MMU_RAM_ENDIAN_MASK    (1 << MMU_RAM_ENDIAN_SHIFT)
+#define MMU_RAM_ENDIAN_MASK    BIT(MMU_RAM_ENDIAN_SHIFT)
 #define MMU_RAM_ENDIAN_LITTLE  (0 << MMU_RAM_ENDIAN_SHIFT)
-#define MMU_RAM_ENDIAN_BIG     (1 << MMU_RAM_ENDIAN_SHIFT)
+#define MMU_RAM_ENDIAN_BIG     BIT(MMU_RAM_ENDIAN_SHIFT)
 
 #define MMU_RAM_ELSZ_SHIFT     7
 #define MMU_RAM_ELSZ_MASK      (3 << MMU_RAM_ELSZ_SHIFT)
@@ -152,7 +147,7 @@ static inline struct omap_iommu *dev_to_omap_iommu(struct device *dev)
 #define MMU_RAM_ELSZ_32                (2 << MMU_RAM_ELSZ_SHIFT)
 #define MMU_RAM_ELSZ_NONE      (3 << MMU_RAM_ELSZ_SHIFT)
 #define MMU_RAM_MIXED_SHIFT    6
-#define MMU_RAM_MIXED_MASK     (1 << MMU_RAM_MIXED_SHIFT)
+#define MMU_RAM_MIXED_MASK     BIT(MMU_RAM_MIXED_SHIFT)
 #define MMU_RAM_MIXED          MMU_RAM_MIXED_MASK
 
 #define MMU_GP_REG_BUS_ERR_BACK_EN     0x1
@@ -190,12 +185,12 @@ static inline struct omap_iommu *dev_to_omap_iommu(struct device *dev)
 /*
  * global functions
  */
-#ifdef CONFIG_OMAP_IOMMU_DEBUG
-extern ssize_t
-omap_iommu_dump_ctx(struct omap_iommu *obj, char *buf, ssize_t len);
-extern size_t
-omap_dump_tlb_entries(struct omap_iommu *obj, char *buf, ssize_t len);
 
+struct cr_regs __iotlb_read_cr(struct omap_iommu *obj, int n);
+void iotlb_lock_get(struct omap_iommu *obj, struct iotlb_lock *l);
+void iotlb_lock_set(struct omap_iommu *obj, struct iotlb_lock *l);
+
+#ifdef CONFIG_OMAP_IOMMU_DEBUG
 void omap_iommu_debugfs_init(void);
 void omap_iommu_debugfs_exit(void);
 
@@ -222,4 +217,12 @@ static inline void iommu_write_reg(struct omap_iommu *obj, u32 val, size_t offs)
        __raw_writel(val, obj->regbase + offs);
 }
 
+static inline int iotlb_cr_valid(struct cr_regs *cr)
+{
+       if (!cr)
+               return -EINVAL;
+
+       return cr->cam & MMU_CAM_V;
+}
+
 #endif /* _OMAP_IOMMU_H */
index f891683e3f05af915738151a1a7574268fe84b3b..01a315227bf052d03a0c1f72e6e4e48c6121b201 100644 (file)
  * published by the Free Software Foundation.
  */
 
+#ifndef _OMAP_IOPGTABLE_H
+#define _OMAP_IOPGTABLE_H
+
+#include <linux/bitops.h>
+
 /*
  * "L2 table" address mask and size definitions.
  */
 #define IOPGD_SHIFT            20
-#define IOPGD_SIZE             (1UL << IOPGD_SHIFT)
+#define IOPGD_SIZE             BIT(IOPGD_SHIFT)
 #define IOPGD_MASK             (~(IOPGD_SIZE - 1))
 
 /*
  * "section" address mask and size definitions.
  */
 #define IOSECTION_SHIFT                20
-#define IOSECTION_SIZE         (1UL << IOSECTION_SHIFT)
+#define IOSECTION_SIZE         BIT(IOSECTION_SHIFT)
 #define IOSECTION_MASK         (~(IOSECTION_SIZE - 1))
 
 /*
  * "supersection" address mask and size definitions.
  */
 #define IOSUPER_SHIFT          24
-#define IOSUPER_SIZE           (1UL << IOSUPER_SHIFT)
+#define IOSUPER_SIZE           BIT(IOSUPER_SHIFT)
 #define IOSUPER_MASK           (~(IOSUPER_SIZE - 1))
 
 #define PTRS_PER_IOPGD         (1UL << (32 - IOPGD_SHIFT))
  * "small page" address mask and size definitions.
  */
 #define IOPTE_SHIFT            12
-#define IOPTE_SIZE             (1UL << IOPTE_SHIFT)
+#define IOPTE_SIZE             BIT(IOPTE_SHIFT)
 #define IOPTE_MASK             (~(IOPTE_SIZE - 1))
 
 /*
  * "large page" address mask and size definitions.
  */
 #define IOLARGE_SHIFT          16
-#define IOLARGE_SIZE           (1UL << IOLARGE_SHIFT)
+#define IOLARGE_SIZE           BIT(IOLARGE_SHIFT)
 #define IOLARGE_MASK           (~(IOLARGE_SIZE - 1))
 
 #define PTRS_PER_IOPTE         (1UL << (IOPGD_SHIFT - IOPTE_SHIFT))
@@ -69,16 +74,16 @@ static inline phys_addr_t omap_iommu_translate(u32 d, u32 va, u32 mask)
 /*
  * some descriptor attributes.
  */
-#define IOPGD_TABLE            (1 << 0)
-#define IOPGD_SECTION          (2 << 0)
-#define IOPGD_SUPER            (1 << 18 | 2 << 0)
+#define IOPGD_TABLE            (1)
+#define IOPGD_SECTION          (2)
+#define IOPGD_SUPER            (BIT(18) | IOPGD_SECTION)
 
 #define iopgd_is_table(x)      (((x) & 3) == IOPGD_TABLE)
 #define iopgd_is_section(x)    (((x) & (1 << 18 | 3)) == IOPGD_SECTION)
 #define iopgd_is_super(x)      (((x) & (1 << 18 | 3)) == IOPGD_SUPER)
 
-#define IOPTE_SMALL            (2 << 0)
-#define IOPTE_LARGE            (1 << 0)
+#define IOPTE_SMALL            (2)
+#define IOPTE_LARGE            (1)
 
 #define iopte_is_small(x)      (((x) & 2) == IOPTE_SMALL)
 #define iopte_is_large(x)      (((x) & 3) == IOPTE_LARGE)
@@ -93,3 +98,5 @@ static inline phys_addr_t omap_iommu_translate(u32 d, u32 va, u32 mask)
 /* to find an entry in the second-level page table. */
 #define iopte_index(da)                (((da) >> IOPTE_SHIFT) & (PTRS_PER_IOPTE - 1))
 #define iopte_offset(iopgd, da)        (iopgd_page_vaddr(iopgd) + iopte_index(da))
+
+#endif /* _OMAP_IOPGTABLE_H */
index c1f2e521dc52cdb383b528c27d07f0786e4ffc43..9305964250acaf94cef7da5ce7f5a2fb6e78b5ce 100644 (file)
@@ -27,6 +27,7 @@ struct tegra_smmu {
        const struct tegra_smmu_soc *soc;
 
        unsigned long pfn_mask;
+       unsigned long tlb_mask;
 
        unsigned long *asids;
        struct mutex lock;
@@ -40,8 +41,10 @@ struct tegra_smmu_as {
        struct iommu_domain domain;
        struct tegra_smmu *smmu;
        unsigned int use_count;
-       struct page *count;
+       u32 *count;
+       struct page **pts;
        struct page *pd;
+       dma_addr_t pd_dma;
        unsigned id;
        u32 attr;
 };
@@ -68,7 +71,8 @@ static inline u32 smmu_readl(struct tegra_smmu *smmu, unsigned long offset)
 #define SMMU_TLB_CONFIG 0x14
 #define  SMMU_TLB_CONFIG_HIT_UNDER_MISS (1 << 29)
 #define  SMMU_TLB_CONFIG_ROUND_ROBIN_ARBITRATION (1 << 28)
-#define  SMMU_TLB_CONFIG_ACTIVE_LINES(x) ((x) & 0x3f)
+#define  SMMU_TLB_CONFIG_ACTIVE_LINES(smmu) \
+       ((smmu)->soc->num_tlb_lines & (smmu)->tlb_mask)
 
 #define SMMU_PTC_CONFIG 0x18
 #define  SMMU_PTC_CONFIG_ENABLE (1 << 29)
@@ -79,9 +83,9 @@ static inline u32 smmu_readl(struct tegra_smmu *smmu, unsigned long offset)
 #define  SMMU_PTB_ASID_VALUE(x) ((x) & 0x7f)
 
 #define SMMU_PTB_DATA 0x020
-#define  SMMU_PTB_DATA_VALUE(page, attr) (page_to_phys(page) >> 12 | (attr))
+#define  SMMU_PTB_DATA_VALUE(dma, attr) ((dma) >> 12 | (attr))
 
-#define SMMU_MK_PDE(page, attr) (page_to_phys(page) >> SMMU_PTE_SHIFT | (attr))
+#define SMMU_MK_PDE(dma, attr) ((dma) >> SMMU_PTE_SHIFT | (attr))
 
 #define SMMU_TLB_FLUSH 0x030
 #define  SMMU_TLB_FLUSH_VA_MATCH_ALL     (0 << 0)
@@ -134,29 +138,49 @@ static inline u32 smmu_readl(struct tegra_smmu *smmu, unsigned long offset)
 #define SMMU_PTE_ATTR          (SMMU_PTE_READABLE | SMMU_PTE_WRITABLE | \
                                 SMMU_PTE_NONSECURE)
 
-static inline void smmu_flush_ptc(struct tegra_smmu *smmu, struct page *page,
+static unsigned int iova_pd_index(unsigned long iova)
+{
+       return (iova >> SMMU_PDE_SHIFT) & (SMMU_NUM_PDE - 1);
+}
+
+static unsigned int iova_pt_index(unsigned long iova)
+{
+       return (iova >> SMMU_PTE_SHIFT) & (SMMU_NUM_PTE - 1);
+}
+
+static bool smmu_dma_addr_valid(struct tegra_smmu *smmu, dma_addr_t addr)
+{
+       addr >>= 12;
+       return (addr & smmu->pfn_mask) == addr;
+}
+
+static dma_addr_t smmu_pde_to_dma(u32 pde)
+{
+       return pde << 12;
+}
+
+static void smmu_flush_ptc_all(struct tegra_smmu *smmu)
+{
+       smmu_writel(smmu, SMMU_PTC_FLUSH_TYPE_ALL, SMMU_PTC_FLUSH);
+}
+
+static inline void smmu_flush_ptc(struct tegra_smmu *smmu, dma_addr_t dma,
                                  unsigned long offset)
 {
-       phys_addr_t phys = page ? page_to_phys(page) : 0;
        u32 value;
 
-       if (page) {
-               offset &= ~(smmu->mc->soc->atom_size - 1);
+       offset &= ~(smmu->mc->soc->atom_size - 1);
 
-               if (smmu->mc->soc->num_address_bits > 32) {
-#ifdef CONFIG_PHYS_ADDR_T_64BIT
-                       value = (phys >> 32) & SMMU_PTC_FLUSH_HI_MASK;
+       if (smmu->mc->soc->num_address_bits > 32) {
+#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
+               value = (dma >> 32) & SMMU_PTC_FLUSH_HI_MASK;
 #else
-                       value = 0;
+               value = 0;
 #endif
-                       smmu_writel(smmu, value, SMMU_PTC_FLUSH_HI);
-               }
-
-               value = (phys + offset) | SMMU_PTC_FLUSH_TYPE_ADR;
-       } else {
-               value = SMMU_PTC_FLUSH_TYPE_ALL;
+               smmu_writel(smmu, value, SMMU_PTC_FLUSH_HI);
        }
 
+       value = (dma + offset) | SMMU_PTC_FLUSH_TYPE_ADR;
        smmu_writel(smmu, value, SMMU_PTC_FLUSH);
 }
 
@@ -236,8 +260,6 @@ static bool tegra_smmu_capable(enum iommu_cap cap)
 static struct iommu_domain *tegra_smmu_domain_alloc(unsigned type)
 {
        struct tegra_smmu_as *as;
-       unsigned int i;
-       uint32_t *pd;
 
        if (type != IOMMU_DOMAIN_UNMANAGED)
                return NULL;
@@ -248,32 +270,26 @@ static struct iommu_domain *tegra_smmu_domain_alloc(unsigned type)
 
        as->attr = SMMU_PD_READABLE | SMMU_PD_WRITABLE | SMMU_PD_NONSECURE;
 
-       as->pd = alloc_page(GFP_KERNEL | __GFP_DMA);
+       as->pd = alloc_page(GFP_KERNEL | __GFP_DMA | __GFP_ZERO);
        if (!as->pd) {
                kfree(as);
                return NULL;
        }
 
-       as->count = alloc_page(GFP_KERNEL);
+       as->count = kcalloc(SMMU_NUM_PDE, sizeof(u32), GFP_KERNEL);
        if (!as->count) {
                __free_page(as->pd);
                kfree(as);
                return NULL;
        }
 
-       /* clear PDEs */
-       pd = page_address(as->pd);
-       SetPageReserved(as->pd);
-
-       for (i = 0; i < SMMU_NUM_PDE; i++)
-               pd[i] = 0;
-
-       /* clear PDE usage counters */
-       pd = page_address(as->count);
-       SetPageReserved(as->count);
-
-       for (i = 0; i < SMMU_NUM_PDE; i++)
-               pd[i] = 0;
+       as->pts = kcalloc(SMMU_NUM_PDE, sizeof(*as->pts), GFP_KERNEL);
+       if (!as->pts) {
+               kfree(as->count);
+               __free_page(as->pd);
+               kfree(as);
+               return NULL;
+       }
 
        /* setup aperture */
        as->domain.geometry.aperture_start = 0;
@@ -288,7 +304,6 @@ static void tegra_smmu_domain_free(struct iommu_domain *domain)
        struct tegra_smmu_as *as = to_smmu_as(domain);
 
        /* TODO: free page directory and page tables */
-       ClearPageReserved(as->pd);
 
        kfree(as);
 }
@@ -376,16 +391,26 @@ static int tegra_smmu_as_prepare(struct tegra_smmu *smmu,
                return 0;
        }
 
+       as->pd_dma = dma_map_page(smmu->dev, as->pd, 0, SMMU_SIZE_PD,
+                                 DMA_TO_DEVICE);
+       if (dma_mapping_error(smmu->dev, as->pd_dma))
+               return -ENOMEM;
+
+       /* We can't handle 64-bit DMA addresses */
+       if (!smmu_dma_addr_valid(smmu, as->pd_dma)) {
+               err = -ENOMEM;
+               goto err_unmap;
+       }
+
        err = tegra_smmu_alloc_asid(smmu, &as->id);
        if (err < 0)
-               return err;
+               goto err_unmap;
 
-       smmu->soc->ops->flush_dcache(as->pd, 0, SMMU_SIZE_PD);
-       smmu_flush_ptc(smmu, as->pd, 0);
+       smmu_flush_ptc(smmu, as->pd_dma, 0);
        smmu_flush_tlb_asid(smmu, as->id);
 
        smmu_writel(smmu, as->id & 0x7f, SMMU_PTB_ASID);
-       value = SMMU_PTB_DATA_VALUE(as->pd, as->attr);
+       value = SMMU_PTB_DATA_VALUE(as->pd_dma, as->attr);
        smmu_writel(smmu, value, SMMU_PTB_DATA);
        smmu_flush(smmu);
 
@@ -393,6 +418,10 @@ static int tegra_smmu_as_prepare(struct tegra_smmu *smmu,
        as->use_count++;
 
        return 0;
+
+err_unmap:
+       dma_unmap_page(smmu->dev, as->pd_dma, SMMU_SIZE_PD, DMA_TO_DEVICE);
+       return err;
 }
 
 static void tegra_smmu_as_unprepare(struct tegra_smmu *smmu,
@@ -402,6 +431,9 @@ static void tegra_smmu_as_unprepare(struct tegra_smmu *smmu,
                return;
 
        tegra_smmu_free_asid(smmu, as->id);
+
+       dma_unmap_page(smmu->dev, as->pd_dma, SMMU_SIZE_PD, DMA_TO_DEVICE);
+
        as->smmu = NULL;
 }
 
@@ -465,96 +497,155 @@ static void tegra_smmu_detach_dev(struct iommu_domain *domain, struct device *de
        }
 }
 
+static void tegra_smmu_set_pde(struct tegra_smmu_as *as, unsigned long iova,
+                              u32 value)
+{
+       unsigned int pd_index = iova_pd_index(iova);
+       struct tegra_smmu *smmu = as->smmu;
+       u32 *pd = page_address(as->pd);
+       unsigned long offset = pd_index * sizeof(*pd);
+
+       /* Set the page directory entry first */
+       pd[pd_index] = value;
+
+       /* Then flush the page directory entry from caches */
+       dma_sync_single_range_for_device(smmu->dev, as->pd_dma, offset,
+                                        sizeof(*pd), DMA_TO_DEVICE);
+
+       /* And flush the iommu */
+       smmu_flush_ptc(smmu, as->pd_dma, offset);
+       smmu_flush_tlb_section(smmu, as->id, iova);
+       smmu_flush(smmu);
+}
+
+static u32 *tegra_smmu_pte_offset(struct page *pt_page, unsigned long iova)
+{
+       u32 *pt = page_address(pt_page);
+
+       return pt + iova_pt_index(iova);
+}
+
+static u32 *tegra_smmu_pte_lookup(struct tegra_smmu_as *as, unsigned long iova,
+                                 dma_addr_t *dmap)
+{
+       unsigned int pd_index = iova_pd_index(iova);
+       struct page *pt_page;
+       u32 *pd;
+
+       pt_page = as->pts[pd_index];
+       if (!pt_page)
+               return NULL;
+
+       pd = page_address(as->pd);
+       *dmap = smmu_pde_to_dma(pd[pd_index]);
+
+       return tegra_smmu_pte_offset(pt_page, iova);
+}
+
 static u32 *as_get_pte(struct tegra_smmu_as *as, dma_addr_t iova,
-                      struct page **pagep)
+                      dma_addr_t *dmap)
 {
-       u32 *pd = page_address(as->pd), *pt, *count;
-       u32 pde = (iova >> SMMU_PDE_SHIFT) & 0x3ff;
-       u32 pte = (iova >> SMMU_PTE_SHIFT) & 0x3ff;
+       unsigned int pde = iova_pd_index(iova);
        struct tegra_smmu *smmu = as->smmu;
-       struct page *page;
-       unsigned int i;
 
-       if (pd[pde] == 0) {
-               page = alloc_page(GFP_KERNEL | __GFP_DMA);
+       if (!as->pts[pde]) {
+               struct page *page;
+               dma_addr_t dma;
+
+               page = alloc_page(GFP_KERNEL | __GFP_DMA | __GFP_ZERO);
                if (!page)
                        return NULL;
 
-               pt = page_address(page);
-               SetPageReserved(page);
+               dma = dma_map_page(smmu->dev, page, 0, SMMU_SIZE_PT,
+                                  DMA_TO_DEVICE);
+               if (dma_mapping_error(smmu->dev, dma)) {
+                       __free_page(page);
+                       return NULL;
+               }
 
-               for (i = 0; i < SMMU_NUM_PTE; i++)
-                       pt[i] = 0;
+               if (!smmu_dma_addr_valid(smmu, dma)) {
+                       dma_unmap_page(smmu->dev, dma, SMMU_SIZE_PT,
+                                      DMA_TO_DEVICE);
+                       __free_page(page);
+                       return NULL;
+               }
 
-               smmu->soc->ops->flush_dcache(page, 0, SMMU_SIZE_PT);
+               as->pts[pde] = page;
 
-               pd[pde] = SMMU_MK_PDE(page, SMMU_PDE_ATTR | SMMU_PDE_NEXT);
+               tegra_smmu_set_pde(as, iova, SMMU_MK_PDE(dma, SMMU_PDE_ATTR |
+                                                             SMMU_PDE_NEXT));
 
-               smmu->soc->ops->flush_dcache(as->pd, pde << 2, 4);
-               smmu_flush_ptc(smmu, as->pd, pde << 2);
-               smmu_flush_tlb_section(smmu, as->id, iova);
-               smmu_flush(smmu);
+               *dmap = dma;
        } else {
-               page = pfn_to_page(pd[pde] & smmu->pfn_mask);
-               pt = page_address(page);
+               u32 *pd = page_address(as->pd);
+
+               *dmap = smmu_pde_to_dma(pd[pde]);
        }
 
-       *pagep = page;
+       return tegra_smmu_pte_offset(as->pts[pde], iova);
+}
 
-       /* Keep track of entries in this page table. */
-       count = page_address(as->count);
-       if (pt[pte] == 0)
-               count[pde]++;
+static void tegra_smmu_pte_get_use(struct tegra_smmu_as *as, unsigned long iova)
+{
+       unsigned int pd_index = iova_pd_index(iova);
 
-       return &pt[pte];
+       as->count[pd_index]++;
 }
 
-static void as_put_pte(struct tegra_smmu_as *as, dma_addr_t iova)
+static void tegra_smmu_pte_put_use(struct tegra_smmu_as *as, unsigned long iova)
 {
-       u32 pde = (iova >> SMMU_PDE_SHIFT) & 0x3ff;
-       u32 pte = (iova >> SMMU_PTE_SHIFT) & 0x3ff;
-       u32 *count = page_address(as->count);
-       u32 *pd = page_address(as->pd), *pt;
-       struct page *page;
-
-       page = pfn_to_page(pd[pde] & as->smmu->pfn_mask);
-       pt = page_address(page);
+       unsigned int pde = iova_pd_index(iova);
+       struct page *page = as->pts[pde];
 
        /*
         * When no entries in this page table are used anymore, return the
         * memory page to the system.
         */
-       if (pt[pte] != 0) {
-               if (--count[pde] == 0) {
-                       ClearPageReserved(page);
-                       __free_page(page);
-                       pd[pde] = 0;
-               }
+       if (--as->count[pde] == 0) {
+               struct tegra_smmu *smmu = as->smmu;
+               u32 *pd = page_address(as->pd);
+               dma_addr_t pte_dma = smmu_pde_to_dma(pd[pde]);
+
+               tegra_smmu_set_pde(as, iova, 0);
 
-               pt[pte] = 0;
+               dma_unmap_page(smmu->dev, pte_dma, SMMU_SIZE_PT, DMA_TO_DEVICE);
+               __free_page(page);
+               as->pts[pde] = NULL;
        }
 }
 
+static void tegra_smmu_set_pte(struct tegra_smmu_as *as, unsigned long iova,
+                              u32 *pte, dma_addr_t pte_dma, u32 val)
+{
+       struct tegra_smmu *smmu = as->smmu;
+       unsigned long offset = offset_in_page(pte);
+
+       *pte = val;
+
+       dma_sync_single_range_for_device(smmu->dev, pte_dma, offset,
+                                        4, DMA_TO_DEVICE);
+       smmu_flush_ptc(smmu, pte_dma, offset);
+       smmu_flush_tlb_group(smmu, as->id, iova);
+       smmu_flush(smmu);
+}
+
 static int tegra_smmu_map(struct iommu_domain *domain, unsigned long iova,
                          phys_addr_t paddr, size_t size, int prot)
 {
        struct tegra_smmu_as *as = to_smmu_as(domain);
-       struct tegra_smmu *smmu = as->smmu;
-       unsigned long offset;
-       struct page *page;
+       dma_addr_t pte_dma;
        u32 *pte;
 
-       pte = as_get_pte(as, iova, &page);
+       pte = as_get_pte(as, iova, &pte_dma);
        if (!pte)
                return -ENOMEM;
 
-       *pte = __phys_to_pfn(paddr) | SMMU_PTE_ATTR;
-       offset = offset_in_page(pte);
+       /* If we aren't overwriting a pre-existing entry, increment use */
+       if (*pte == 0)
+               tegra_smmu_pte_get_use(as, iova);
 
-       smmu->soc->ops->flush_dcache(page, offset, 4);
-       smmu_flush_ptc(smmu, page, offset);
-       smmu_flush_tlb_group(smmu, as->id, iova);
-       smmu_flush(smmu);
+       tegra_smmu_set_pte(as, iova, pte, pte_dma,
+                          __phys_to_pfn(paddr) | SMMU_PTE_ATTR);
 
        return 0;
 }
@@ -563,22 +654,15 @@ static size_t tegra_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
                               size_t size)
 {
        struct tegra_smmu_as *as = to_smmu_as(domain);
-       struct tegra_smmu *smmu = as->smmu;
-       unsigned long offset;
-       struct page *page;
+       dma_addr_t pte_dma;
        u32 *pte;
 
-       pte = as_get_pte(as, iova, &page);
-       if (!pte)
+       pte = tegra_smmu_pte_lookup(as, iova, &pte_dma);
+       if (!pte || !*pte)
                return 0;
 
-       offset = offset_in_page(pte);
-       as_put_pte(as, iova);
-
-       smmu->soc->ops->flush_dcache(page, offset, 4);
-       smmu_flush_ptc(smmu, page, offset);
-       smmu_flush_tlb_group(smmu, as->id, iova);
-       smmu_flush(smmu);
+       tegra_smmu_set_pte(as, iova, pte, pte_dma, 0);
+       tegra_smmu_pte_put_use(as, iova);
 
        return size;
 }
@@ -587,11 +671,14 @@ static phys_addr_t tegra_smmu_iova_to_phys(struct iommu_domain *domain,
                                           dma_addr_t iova)
 {
        struct tegra_smmu_as *as = to_smmu_as(domain);
-       struct page *page;
        unsigned long pfn;
+       dma_addr_t pte_dma;
        u32 *pte;
 
-       pte = as_get_pte(as, iova, &page);
+       pte = tegra_smmu_pte_lookup(as, iova, &pte_dma);
+       if (!pte || !*pte)
+               return 0;
+
        pfn = *pte & as->smmu->pfn_mask;
 
        return PFN_PHYS(pfn);
@@ -816,6 +903,9 @@ struct tegra_smmu *tegra_smmu_probe(struct device *dev,
        smmu->pfn_mask = BIT_MASK(mc->soc->num_address_bits - PAGE_SHIFT) - 1;
        dev_dbg(dev, "address bits: %u, PFN mask: %#lx\n",
                mc->soc->num_address_bits, smmu->pfn_mask);
+       smmu->tlb_mask = (smmu->soc->num_tlb_lines << 1) - 1;
+       dev_dbg(dev, "TLB lines: %u, mask: %#lx\n", smmu->soc->num_tlb_lines,
+               smmu->tlb_mask);
 
        value = SMMU_PTC_CONFIG_ENABLE | SMMU_PTC_CONFIG_INDEX_MAP(0x3f);
 
@@ -825,14 +915,14 @@ struct tegra_smmu *tegra_smmu_probe(struct device *dev,
        smmu_writel(smmu, value, SMMU_PTC_CONFIG);
 
        value = SMMU_TLB_CONFIG_HIT_UNDER_MISS |
-               SMMU_TLB_CONFIG_ACTIVE_LINES(0x20);
+               SMMU_TLB_CONFIG_ACTIVE_LINES(smmu);
 
        if (soc->supports_round_robin_arbitration)
                value |= SMMU_TLB_CONFIG_ROUND_ROBIN_ARBITRATION;
 
        smmu_writel(smmu, value, SMMU_TLB_CONFIG);
 
-       smmu_flush_ptc(smmu, NULL, 0);
+       smmu_flush_ptc_all(smmu);
        smmu_flush_tlb(smmu);
        smmu_writel(smmu, SMMU_CONFIG_ENABLE, SMMU_CONFIG);
        smmu_flush(smmu);
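
For context: the tegra-smmu hunks above stop treating page-table pages as plain CPU memory (page_to_phys() plus explicit dcache flushes) and instead map them with the streaming DMA API, syncing only the entry that changed before the hardware walks it. A minimal sketch of that update step follows; it is illustrative only, not taken from the commit, update_entry is a placeholder name, and it assumes the table page was already mapped with dma_map_page(..., DMA_TO_DEVICE).

#include <linux/dma-mapping.h>
#include <linux/mm.h>

static void update_entry(struct device *dev, dma_addr_t table_dma,
			 u32 *entry, u32 val)
{
	unsigned long offset = offset_in_page(entry);

	*entry = val;			/* CPU writes the table entry */

	/* make just that entry visible to the device before it is walked */
	dma_sync_single_range_for_device(dev, table_dma, offset,
					 sizeof(*entry), DMA_TO_DEVICE);

	/* a hardware-specific PTC/TLB flush would follow here */
}

The point of the conversion, as far as these hunks show, is to stop assuming phys == dma and to let the DMA API do whatever cache maintenance the platform needs.
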
index e406bc5f13e4f10083ff2d91aa9d81785a8cc2fe..7deed6ef54c2eaf048538e46ecbcb3ec6d5ff91c 100644 (file)
@@ -31,6 +31,7 @@
 #include <asm/cputype.h>
 #include <asm/exception.h>
 #include <asm/smp_plat.h>
+#include <asm/virt.h>
 
 #include "irq-gic-common.h"
 
@@ -50,6 +51,7 @@ struct gic_chip_data {
 };
 
 static struct gic_chip_data gic_data __read_mostly;
+static struct static_key supports_deactivate = STATIC_KEY_INIT_TRUE;
 
 #define gic_data_rdist()               (this_cpu_ptr(gic_data.rdists.rdist))
 #define gic_data_rdist_rd_base()       (gic_data_rdist()->rd_base)
@@ -68,6 +70,11 @@ static inline int gic_irq_in_rdist(struct irq_data *d)
        return gic_irq(d) < 32;
 }
 
+static inline bool forwarded_irq(struct irq_data *d)
+{
+       return d->handler_data != NULL;
+}
+
 static inline void __iomem *gic_dist_base(struct irq_data *d)
 {
        if (gic_irq_in_rdist(d))        /* SGI+PPI -> SGI_base for this CPU */
@@ -231,6 +238,21 @@ static void gic_mask_irq(struct irq_data *d)
        gic_poke_irq(d, GICD_ICENABLER);
 }
 
+static void gic_eoimode1_mask_irq(struct irq_data *d)
+{
+       gic_mask_irq(d);
+       /*
+        * When masking a forwarded interrupt, make sure it is
+        * deactivated as well.
+        *
+        * This ensures that an interrupt that is getting
+        * disabled/masked will not get "stuck", because there is
+        * no one to deactivate it (guest is being terminated).
+        */
+       if (forwarded_irq(d))
+               gic_poke_irq(d, GICD_ICACTIVER);
+}
+
 static void gic_unmask_irq(struct irq_data *d)
 {
        gic_poke_irq(d, GICD_ISENABLER);
@@ -296,6 +318,17 @@ static void gic_eoi_irq(struct irq_data *d)
        gic_write_eoir(gic_irq(d));
 }
 
+static void gic_eoimode1_eoi_irq(struct irq_data *d)
+{
+       /*
+        * No need to deactivate an LPI, or an interrupt that
+        * is getting forwarded to a vcpu.
+        */
+       if (gic_irq(d) >= 8192 || forwarded_irq(d))
+               return;
+       gic_write_dir(gic_irq(d));
+}
+
 static int gic_set_type(struct irq_data *d, unsigned int type)
 {
        unsigned int irq = gic_irq(d);
@@ -322,6 +355,12 @@ static int gic_set_type(struct irq_data *d, unsigned int type)
        return gic_configure_irq(irq, type, base, rwp_wait);
 }
 
+static int gic_irq_set_vcpu_affinity(struct irq_data *d, void *vcpu)
+{
+       d->handler_data = vcpu;
+       return 0;
+}
+
 static u64 gic_mpidr_to_affinity(u64 mpidr)
 {
        u64 aff;
@@ -343,15 +382,26 @@ static asmlinkage void __exception_irq_entry gic_handle_irq(struct pt_regs *regs
 
                if (likely(irqnr > 15 && irqnr < 1020) || irqnr >= 8192) {
                        int err;
+
+                       if (static_key_true(&supports_deactivate))
+                               gic_write_eoir(irqnr);
+
                        err = handle_domain_irq(gic_data.domain, irqnr, regs);
                        if (err) {
                                WARN_ONCE(true, "Unexpected interrupt received!\n");
-                               gic_write_eoir(irqnr);
+                               if (static_key_true(&supports_deactivate)) {
+                                       if (irqnr < 8192)
+                                               gic_write_dir(irqnr);
+                               } else {
+                                       gic_write_eoir(irqnr);
+                               }
                        }
                        continue;
                }
                if (irqnr < 16) {
                        gic_write_eoir(irqnr);
+                       if (static_key_true(&supports_deactivate))
+                               gic_write_dir(irqnr);
 #ifdef CONFIG_SMP
                        handle_IPI(irqnr, regs);
 #else
@@ -451,8 +501,13 @@ static void gic_cpu_sys_reg_init(void)
        /* Set priority mask register */
        gic_write_pmr(DEFAULT_PMR_VALUE);
 
-       /* EOI deactivates interrupt too (mode 0) */
-       gic_write_ctlr(ICC_CTLR_EL1_EOImode_drop_dir);
+       if (static_key_true(&supports_deactivate)) {
+               /* EOI drops priority only (mode 1) */
+               gic_write_ctlr(ICC_CTLR_EL1_EOImode_drop);
+       } else {
+               /* EOI deactivates interrupt too (mode 0) */
+               gic_write_ctlr(ICC_CTLR_EL1_EOImode_drop_dir);
+       }
 
        /* ... and let's hit the road... */
        gic_write_grpen1(1);
@@ -661,11 +716,29 @@ static struct irq_chip gic_chip = {
        .flags                  = IRQCHIP_SET_TYPE_MASKED,
 };
 
+static struct irq_chip gic_eoimode1_chip = {
+       .name                   = "GICv3",
+       .irq_mask               = gic_eoimode1_mask_irq,
+       .irq_unmask             = gic_unmask_irq,
+       .irq_eoi                = gic_eoimode1_eoi_irq,
+       .irq_set_type           = gic_set_type,
+       .irq_set_affinity       = gic_set_affinity,
+       .irq_get_irqchip_state  = gic_irq_get_irqchip_state,
+       .irq_set_irqchip_state  = gic_irq_set_irqchip_state,
+       .irq_set_vcpu_affinity  = gic_irq_set_vcpu_affinity,
+       .flags                  = IRQCHIP_SET_TYPE_MASKED,
+};
+
 #define GIC_ID_NR              (1U << gic_data.rdists.id_bits)
 
 static int gic_irq_domain_map(struct irq_domain *d, unsigned int irq,
                              irq_hw_number_t hw)
 {
+       struct irq_chip *chip = &gic_chip;
+
+       if (static_key_true(&supports_deactivate))
+               chip = &gic_eoimode1_chip;
+
        /* SGIs are private to the core kernel */
        if (hw < 16)
                return -EPERM;
@@ -679,13 +752,13 @@ static int gic_irq_domain_map(struct irq_domain *d, unsigned int irq,
        /* PPIs */
        if (hw < 32) {
                irq_set_percpu_devid(irq);
-               irq_domain_set_info(d, irq, hw, &gic_chip, d->host_data,
+               irq_domain_set_info(d, irq, hw, chip, d->host_data,
                                    handle_percpu_devid_irq, NULL, NULL);
                set_irq_flags(irq, IRQF_VALID | IRQF_NOAUTOEN);
        }
        /* SPIs */
        if (hw >= 32 && hw < gic_data.irq_nr) {
-               irq_domain_set_info(d, irq, hw, &gic_chip, d->host_data,
+               irq_domain_set_info(d, irq, hw, chip, d->host_data,
                                    handle_fasteoi_irq, NULL, NULL);
                set_irq_flags(irq, IRQF_VALID | IRQF_PROBE);
        }
@@ -693,7 +766,7 @@ static int gic_irq_domain_map(struct irq_domain *d, unsigned int irq,
        if (hw >= 8192 && hw < GIC_ID_NR) {
                if (!gic_dist_supports_lpis())
                        return -EPERM;
-               irq_domain_set_info(d, irq, hw, &gic_chip, d->host_data,
+               irq_domain_set_info(d, irq, hw, chip, d->host_data,
                                    handle_fasteoi_irq, NULL, NULL);
                set_irq_flags(irq, IRQF_VALID);
        }
@@ -820,6 +893,12 @@ static int __init gic_of_init(struct device_node *node, struct device_node *pare
        if (of_property_read_u64(node, "redistributor-stride", &redist_stride))
                redist_stride = 0;
 
+       if (!is_hyp_mode_available())
+               static_key_slow_dec(&supports_deactivate);
+
+       if (static_key_true(&supports_deactivate))
+               pr_info("GIC: Using split EOI/Deactivate mode\n");
+
        gic_data.dist_base = dist_base;
        gic_data.redist_regions = rdist_regs;
        gic_data.nr_redist_regions = nr_redist_regions;
index aa3e7b8a69c4349bcd775c7c796062bda3b392f2..e6b7ed537952949eedb43105bcdd555209b62711 100644 (file)
@@ -47,6 +47,7 @@
 #include <asm/irq.h>
 #include <asm/exception.h>
 #include <asm/smp_plat.h>
+#include <asm/virt.h>
 
 #include "irq-gic-common.h"
 
@@ -82,6 +83,8 @@ static DEFINE_RAW_SPINLOCK(irq_controller_lock);
 #define NR_GIC_CPU_IF 8
 static u8 gic_cpu_map[NR_GIC_CPU_IF] __read_mostly;
 
+static struct static_key supports_deactivate = STATIC_KEY_INIT_TRUE;
+
 #ifndef MAX_GIC_NR
 #define MAX_GIC_NR     1
 #endif
@@ -137,6 +140,36 @@ static inline unsigned int gic_irq(struct irq_data *d)
        return d->hwirq;
 }
 
+static inline bool cascading_gic_irq(struct irq_data *d)
+{
+       void *data = irq_data_get_irq_handler_data(d);
+
+       /*
+        * If handler_data is pointing to one of the secondary GICs, then
+        * this is a cascading interrupt, and it cannot possibly be
+        * forwarded.
+        */
+       if (data >= (void *)(gic_data + 1) &&
+           data <  (void *)(gic_data + MAX_GIC_NR))
+               return true;
+
+       return false;
+}
+
+static inline bool forwarded_irq(struct irq_data *d)
+{
+       /*
+        * A forwarded interrupt:
+        * - is on the primary GIC
+        * - has its handler_data set to a value
+        * - that isn't a secondary GIC
+        */
+       if (d->handler_data && !cascading_gic_irq(d))
+               return true;
+
+       return false;
+}
+
 /*
  * Routines to acknowledge, disable and enable interrupts
  */
@@ -157,6 +190,21 @@ static void gic_mask_irq(struct irq_data *d)
        gic_poke_irq(d, GIC_DIST_ENABLE_CLEAR);
 }
 
+static void gic_eoimode1_mask_irq(struct irq_data *d)
+{
+       gic_mask_irq(d);
+       /*
+        * When masking a forwarded interrupt, make sure it is
+        * deactivated as well.
+        *
+        * This ensures that an interrupt that is getting
+        * disabled/masked will not get "stuck", because there is
+        * no one to deactivate it (guest is being terminated).
+        */
+       if (forwarded_irq(d))
+               gic_poke_irq(d, GIC_DIST_ACTIVE_CLEAR);
+}
+
 static void gic_unmask_irq(struct irq_data *d)
 {
        gic_poke_irq(d, GIC_DIST_ENABLE_SET);
@@ -167,6 +215,15 @@ static void gic_eoi_irq(struct irq_data *d)
        writel_relaxed(gic_irq(d), gic_cpu_base(d) + GIC_CPU_EOI);
 }
 
+static void gic_eoimode1_eoi_irq(struct irq_data *d)
+{
+       /* Do not deactivate an IRQ forwarded to a vcpu. */
+       if (forwarded_irq(d))
+               return;
+
+       writel_relaxed(gic_irq(d), gic_cpu_base(d) + GIC_CPU_DEACTIVATE);
+}
+
 static int gic_irq_set_irqchip_state(struct irq_data *d,
                                     enum irqchip_irq_state which, bool val)
 {
@@ -233,6 +290,16 @@ static int gic_set_type(struct irq_data *d, unsigned int type)
        return gic_configure_irq(gicirq, type, base, NULL);
 }
 
+static int gic_irq_set_vcpu_affinity(struct irq_data *d, void *vcpu)
+{
+       /* Only interrupts on the primary GIC can be forwarded to a vcpu. */
+       if (cascading_gic_irq(d))
+               return -EINVAL;
+
+       d->handler_data = vcpu;
+       return 0;
+}
+
 #ifdef CONFIG_SMP
 static int gic_set_affinity(struct irq_data *d, const struct cpumask *mask_val,
                            bool force)
@@ -272,11 +339,15 @@ static void __exception_irq_entry gic_handle_irq(struct pt_regs *regs)
                irqnr = irqstat & GICC_IAR_INT_ID_MASK;
 
                if (likely(irqnr > 15 && irqnr < 1021)) {
+                       if (static_key_true(&supports_deactivate))
+                               writel_relaxed(irqstat, cpu_base + GIC_CPU_EOI);
                        handle_domain_irq(gic->domain, irqnr, regs);
                        continue;
                }
                if (irqnr < 16) {
                        writel_relaxed(irqstat, cpu_base + GIC_CPU_EOI);
+                       if (static_key_true(&supports_deactivate))
+                               writel_relaxed(irqstat, cpu_base + GIC_CPU_DEACTIVATE);
 #ifdef CONFIG_SMP
                        handle_IPI(irqnr, regs);
 #endif
@@ -329,6 +400,23 @@ static struct irq_chip gic_chip = {
                                  IRQCHIP_MASK_ON_SUSPEND,
 };
 
+static struct irq_chip gic_eoimode1_chip = {
+       .name                   = "GICv2",
+       .irq_mask               = gic_eoimode1_mask_irq,
+       .irq_unmask             = gic_unmask_irq,
+       .irq_eoi                = gic_eoimode1_eoi_irq,
+       .irq_set_type           = gic_set_type,
+#ifdef CONFIG_SMP
+       .irq_set_affinity       = gic_set_affinity,
+#endif
+       .irq_get_irqchip_state  = gic_irq_get_irqchip_state,
+       .irq_set_irqchip_state  = gic_irq_set_irqchip_state,
+       .irq_set_vcpu_affinity  = gic_irq_set_vcpu_affinity,
+       .flags                  = IRQCHIP_SET_TYPE_MASKED |
+                                 IRQCHIP_SKIP_SET_WAKE |
+                                 IRQCHIP_MASK_ON_SUSPEND,
+};
+
 void __init gic_cascade_irq(unsigned int gic_nr, unsigned int irq)
 {
        if (gic_nr >= MAX_GIC_NR)
@@ -360,6 +448,10 @@ static void gic_cpu_if_up(struct gic_chip_data *gic)
 {
        void __iomem *cpu_base = gic_data_cpu_base(gic);
        u32 bypass = 0;
+       u32 mode = 0;
+
+       if (static_key_true(&supports_deactivate))
+               mode = GIC_CPU_CTRL_EOImodeNS;
 
        /*
        * Preserve bypass disable bits to be written back later
@@ -367,7 +459,7 @@ static void gic_cpu_if_up(struct gic_chip_data *gic)
        bypass = readl(cpu_base + GIC_CPU_CTRL);
        bypass &= GICC_DIS_BYPASS_MASK;
 
-       writel_relaxed(bypass | GICC_ENABLE, cpu_base + GIC_CPU_CTRL);
+       writel_relaxed(bypass | mode | GICC_ENABLE, cpu_base + GIC_CPU_CTRL);
 }
 
 
@@ -803,13 +895,20 @@ void __init gic_init_physaddr(struct device_node *node)
 static int gic_irq_domain_map(struct irq_domain *d, unsigned int irq,
                                irq_hw_number_t hw)
 {
+       struct irq_chip *chip = &gic_chip;
+
+       if (static_key_true(&supports_deactivate)) {
+               if (d->host_data == (void *)&gic_data[0])
+                       chip = &gic_eoimode1_chip;
+       }
+
        if (hw < 32) {
                irq_set_percpu_devid(irq);
-               irq_domain_set_info(d, irq, hw, &gic_chip, d->host_data,
+               irq_domain_set_info(d, irq, hw, chip, d->host_data,
                                    handle_percpu_devid_irq, NULL, NULL);
                set_irq_flags(irq, IRQF_VALID | IRQF_NOAUTOEN);
        } else {
-               irq_domain_set_info(d, irq, hw, &gic_chip, d->host_data,
+               irq_domain_set_info(d, irq, hw, chip, d->host_data,
                                    handle_fasteoi_irq, NULL, NULL);
                set_irq_flags(irq, IRQF_VALID | IRQF_PROBE);
        }
@@ -894,7 +993,7 @@ static const struct irq_domain_ops gic_irq_domain_ops = {
        .xlate = gic_irq_domain_xlate,
 };
 
-void __init gic_init_bases(unsigned int gic_nr, int irq_start,
+static void __init __gic_init_bases(unsigned int gic_nr, int irq_start,
                           void __iomem *dist_base, void __iomem *cpu_base,
                           u32 percpu_offset, struct device_node *node)
 {
@@ -995,6 +1094,8 @@ void __init gic_init_bases(unsigned int gic_nr, int irq_start,
                register_cpu_notifier(&gic_cpu_notifier);
 #endif
                set_handle_irq(gic_handle_irq);
+               if (static_key_true(&supports_deactivate))
+                       pr_info("GIC: Using split EOI/Deactivate mode\n");
        }
 
        gic_dist_init(gic);
@@ -1002,6 +1103,19 @@ void __init gic_init_bases(unsigned int gic_nr, int irq_start,
        gic_pm_init(gic);
 }
 
+void __init gic_init_bases(unsigned int gic_nr, int irq_start,
+                          void __iomem *dist_base, void __iomem *cpu_base,
+                          u32 percpu_offset, struct device_node *node)
+{
+       /*
+        * Non-DT/ACPI systems won't run a hypervisor, so let's not
+        * bother with these...
+        */
+       static_key_slow_dec(&supports_deactivate);
+       __gic_init_bases(gic_nr, irq_start, dist_base, cpu_base,
+                        percpu_offset, node);
+}
+
 #ifdef CONFIG_OF
 static int gic_cnt __initdata;
 
@@ -1010,6 +1124,7 @@ gic_of_init(struct device_node *node, struct device_node *parent)
 {
        void __iomem *cpu_base;
        void __iomem *dist_base;
+       struct resource cpu_res;
        u32 percpu_offset;
        int irq;
 
@@ -1022,10 +1137,20 @@ gic_of_init(struct device_node *node, struct device_node *parent)
        cpu_base = of_iomap(node, 1);
        WARN(!cpu_base, "unable to map gic cpu registers\n");
 
+       of_address_to_resource(node, 1, &cpu_res);
+
+       /*
+        * Disable split EOI/Deactivate if either HYP is not available
+        * or the CPU interface is too small.
+        */
+       if (gic_cnt == 0 && (!is_hyp_mode_available() ||
+                            resource_size(&cpu_res) < SZ_8K))
+               static_key_slow_dec(&supports_deactivate);
+
        if (of_property_read_u32(node, "cpu-offset", &percpu_offset))
                percpu_offset = 0;
 
-       gic_init_bases(gic_cnt, -1, dist_base, cpu_base, percpu_offset, node);
+       __gic_init_bases(gic_cnt, -1, dist_base, cpu_base, percpu_offset, node);
        if (!gic_cnt)
                gic_init_physaddr(node);
 
@@ -1140,12 +1265,20 @@ gic_v2_acpi_init(struct acpi_table_header *table)
                return -ENOMEM;
        }
 
+       /*
+        * Disable split EOI/Deactivate if HYP is not available. ACPI
+        * guarantees that we'll always have a GICv2, so the CPU
+        * interface will always be the right size.
+        */
+       if (!is_hyp_mode_available())
+               static_key_slow_dec(&supports_deactivate);
+
        /*
         * Initialize zero GIC instance (no multi-GIC support). Also, set GIC
         * as default IRQ domain to allow for GSI registration and GSI to IRQ
         * number translation (see acpi_register_gsi() and acpi_gsi_to_irq()).
         */
-       gic_init_bases(0, -1, dist_base, cpu_base, 0, NULL);
+       __gic_init_bases(0, -1, dist_base, cpu_base, 0, NULL);
        irq_set_default_host(gic_data[0].domain);
 
        acpi_irq_model = ACPI_IRQ_MODEL_GIC;
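
For context: both GIC drivers above gate the new split EOI/Deactivate handling (EOImode 1) behind a static key that starts out true and is decremented once at init when the mode cannot be used, so the per-interrupt hot path can be a patched branch rather than a runtime test. A minimal sketch of that pattern follows; it is illustrative only, not taken from the commit, and the names are placeholders.

#include <linux/jump_label.h>

static struct static_key have_split_eoi = STATIC_KEY_INIT_TRUE;

static void feature_init(bool usable)
{
	/* one-time, slow-path decision that patches the branch sites */
	if (!usable)
		static_key_slow_dec(&have_split_eoi);
}

static void irq_hot_path(void)
{
	if (static_key_true(&have_split_eoi)) {
		/* EOImode 1: priority drop now, explicit deactivate later */
	} else {
		/* EOImode 0: a single EOI both drops priority and deactivates */
	}
}
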
index b713466997a0d2016b9b4a19e0430ac1c342dbe8..f8f2e76d34bf30d8e398d577ba025384330ac316 100644 (file)
@@ -38,7 +38,7 @@ typedef struct icn_cdef {
 #include <linux/errno.h>
 #include <linux/fs.h>
 #include <linux/major.h>
-#include <asm/io.h>
+#include <linux/io.h>
 #include <linux/kernel.h>
 #include <linux/signal.h>
 #include <linux/slab.h>
index d5415eedba86738dcf23f72d6b6369eab0e0ea9d..3e01e6fb342468269eccf6ef71032a625087913c 100644 (file)
@@ -393,7 +393,7 @@ config DM_MULTIPATH
        # of SCSI_DH if the latter isn't defined but if
        # it is, DM_MULTIPATH must depend on it.  We get a build
        # error if SCSI_DH=m and DM_MULTIPATH=y
-       depends on SCSI_DH || !SCSI_DH
+       depends on !SCSI_DH || SCSI
        ---help---
          Allow volume managers to support multipath hardware.
 
index eff7bdd7731d5e437d3b83ca4803ac8c03bac6b6..5a67671a3973b576a9bdcc8dabc576448336b9ba 100644 (file)
@@ -159,12 +159,9 @@ static struct priority_group *alloc_priority_group(void)
 static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti)
 {
        struct pgpath *pgpath, *tmp;
-       struct multipath *m = ti->private;
 
        list_for_each_entry_safe(pgpath, tmp, pgpaths, list) {
                list_del(&pgpath->list);
-               if (m->hw_handler_name)
-                       scsi_dh_detach(bdev_get_queue(pgpath->path.dev->bdev));
                dm_put_device(ti, pgpath->path.dev);
                free_pgpath(pgpath);
        }
@@ -580,6 +577,7 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps
                q = bdev_get_queue(p->path.dev->bdev);
 
        if (m->retain_attached_hw_handler) {
+retain:
                attached_handler_name = scsi_dh_attached_handler_name(q, GFP_KERNEL);
                if (attached_handler_name) {
                        /*
@@ -599,20 +597,14 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps
        }
 
        if (m->hw_handler_name) {
-               /*
-                * Increments scsi_dh reference, even when using an
-                * already-attached handler.
-                */
                r = scsi_dh_attach(q, m->hw_handler_name);
                if (r == -EBUSY) {
-                       /*
-                        * Already attached to different hw_handler:
-                        * try to reattach with correct one.
-                        */
-                       scsi_dh_detach(q);
-                       r = scsi_dh_attach(q, m->hw_handler_name);
-               }
+                       char b[BDEVNAME_SIZE];
 
+                       printk(KERN_INFO "dm-mpath: retaining handler on device %s\n",
+                               bdevname(p->path.dev->bdev, b));
+                       goto retain;
+               }
                if (r < 0) {
                        ti->error = "error attaching hardware handler";
                        dm_put_device(ti, p->path.dev);
@@ -624,7 +616,6 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps
                        if (r < 0) {
                                ti->error = "unable to set hardware "
                                                        "handler parameters";
-                               scsi_dh_detach(q);
                                dm_put_device(ti, p->path.dev);
                                goto bad;
                        }
@@ -734,12 +725,6 @@ static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m)
                return 0;
 
        m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL);
-       if (!try_then_request_module(scsi_dh_handler_exist(m->hw_handler_name),
-                                    "scsi_dh_%s", m->hw_handler_name)) {
-               ti->error = "unknown hardware handler type";
-               ret = -EINVAL;
-               goto fail;
-       }
 
        if (hw_argc > 1) {
                char *p;
index dc2aaab54aef7bf050b24f5921194b5598c7690c..217d613b0fe7a66c53b0085696d43345728df761 100644 (file)
@@ -10,6 +10,7 @@ config VIDEO_OMAP2_VOUT
        select OMAP2_DSS if HAS_IOMEM && ARCH_OMAP2PLUS
        select OMAP2_VRFB if ARCH_OMAP2 || ARCH_OMAP3
        select VIDEO_OMAP2_VOUT_VRFB if VIDEO_OMAP2_VOUT && OMAP2_VRFB
+       select FRAME_VECTOR
        default n
        ---help---
          V4L2 Display driver support for OMAP2/3 based boards.
index f09c5f17a42f35a37e7b4ae80eb746c98f9a755b..70c28d19ea04c8a7cfa452a6c68d0452094d6fcb 100644 (file)
@@ -195,46 +195,34 @@ static int omap_vout_try_format(struct v4l2_pix_format *pix)
 }
 
 /*
- * omap_vout_uservirt_to_phys: This inline function is used to convert user
- * space virtual address to physical address.
+ * omap_vout_get_userptr: Convert user space virtual address to physical
+ * address.
  */
-static unsigned long omap_vout_uservirt_to_phys(unsigned long virtp)
+static int omap_vout_get_userptr(struct videobuf_buffer *vb, u32 virtp,
+                                u32 *physp)
 {
-       unsigned long physp = 0;
-       struct vm_area_struct *vma;
-       struct mm_struct *mm = current->mm;
+       struct frame_vector *vec;
+       int ret;
 
        /* For kernel direct-mapped memory, take the easy way */
-       if (virtp >= PAGE_OFFSET)
-               return virt_to_phys((void *) virtp);
-
-       down_read(&current->mm->mmap_sem);
-       vma = find_vma(mm, virtp);
-       if (vma && (vma->vm_flags & VM_IO) && vma->vm_pgoff) {
-               /* this will catch, kernel-allocated, mmaped-to-usermode
-                  addresses */
-               physp = (vma->vm_pgoff << PAGE_SHIFT) + (virtp - vma->vm_start);
-               up_read(&current->mm->mmap_sem);
-       } else {
-               /* otherwise, use get_user_pages() for general userland pages */
-               int res, nr_pages = 1;
-               struct page *pages;
+       if (virtp >= PAGE_OFFSET) {
+               *physp = virt_to_phys((void *)virtp);
+               return 0;
+       }
 
-               res = get_user_pages(current, current->mm, virtp, nr_pages, 1,
-                               0, &pages, NULL);
-               up_read(&current->mm->mmap_sem);
+       vec = frame_vector_create(1);
+       if (!vec)
+               return -ENOMEM;
 
-               if (res == nr_pages) {
-                       physp =  __pa(page_address(&pages[0]) +
-                                       (virtp & ~PAGE_MASK));
-               } else {
-                       printk(KERN_WARNING VOUT_NAME
-                                       "get_user_pages failed\n");
-                       return 0;
-               }
+       ret = get_vaddr_frames(virtp, 1, true, false, vec);
+       if (ret != 1) {
+               frame_vector_destroy(vec);
+               return -EINVAL;
        }
+       *physp = __pfn_to_phys(frame_vector_pfns(vec)[0]);
+       vb->priv = vec;
 
-       return physp;
+       return 0;
 }
 
 /*
@@ -784,11 +772,15 @@ static int omap_vout_buffer_prepare(struct videobuf_queue *q,
         * address of the buffer
         */
        if (V4L2_MEMORY_USERPTR == vb->memory) {
+               int ret;
+
                if (0 == vb->baddr)
                        return -EINVAL;
                /* Physical address */
-               vout->queued_buf_addr[vb->i] = (u8 *)
-                       omap_vout_uservirt_to_phys(vb->baddr);
+               ret = omap_vout_get_userptr(vb, vb->baddr,
+                               (u32 *)&vout->queued_buf_addr[vb->i]);
+               if (ret < 0)
+                       return ret;
        } else {
                unsigned long addr, dma_addr;
                unsigned long size;
@@ -834,12 +826,13 @@ static void omap_vout_buffer_queue(struct videobuf_queue *q,
 static void omap_vout_buffer_release(struct videobuf_queue *q,
                            struct videobuf_buffer *vb)
 {
-       struct omap_vout_device *vout = q->priv_data;
-
        vb->state = VIDEOBUF_NEEDS_INIT;
+       if (vb->memory == V4L2_MEMORY_USERPTR && vb->priv) {
+               struct frame_vector *vec = vb->priv;
 
-       if (V4L2_MEMORY_MMAP != vout->memory)
-               return;
+               put_vaddr_frames(vec);
+               frame_vector_destroy(vec);
+       }
 }
 
 /*
@@ -872,7 +865,7 @@ static void omap_vout_vm_close(struct vm_area_struct *vma)
        vout->mmap_count--;
 }
 
-static struct vm_operations_struct omap_vout_vm_ops = {
+static const struct vm_operations_struct omap_vout_vm_ops = {
        .open   = omap_vout_vm_open,
        .close  = omap_vout_vm_close,
 };
index b4b022933e29e463c8075b53a2ec8b0a0d7271b5..82876a67f1449b62f02142f4a677aee8880c295a 100644 (file)
@@ -84,6 +84,7 @@ config VIDEOBUF2_CORE
 
 config VIDEOBUF2_MEMOPS
        tristate
+       select FRAME_VECTOR
 
 config VIDEOBUF2_DMA_CONTIG
        tristate
index f1022d810d2208f92057b3cf25eddabd5973aa90..4f59b7ec05d0fe7261312082d7ee2c2e17453b53 100644 (file)
@@ -1691,9 +1691,7 @@ static int __buf_prepare(struct vb2_buffer *vb, const struct v4l2_buffer *b)
                ret = __qbuf_mmap(vb, b);
                break;
        case V4L2_MEMORY_USERPTR:
-               down_read(&current->mm->mmap_sem);
                ret = __qbuf_userptr(vb, b);
-               up_read(&current->mm->mmap_sem);
                break;
        case V4L2_MEMORY_DMABUF:
                ret = __qbuf_dmabuf(vb, b);
index 94c1e6455d365de7faa1b4b0df14ce279820d5f2..2397ceb1dc6b3743f18fefe4ddc958ecb5c6589c 100644 (file)
@@ -32,15 +32,13 @@ struct vb2_dc_buf {
        dma_addr_t                      dma_addr;
        enum dma_data_direction         dma_dir;
        struct sg_table                 *dma_sgt;
+       struct frame_vector             *vec;
 
        /* MMAP related */
        struct vb2_vmarea_handler       handler;
        atomic_t                        refcount;
        struct sg_table                 *sgt_base;
 
-       /* USERPTR related */
-       struct vm_area_struct           *vma;
-
        /* DMABUF related */
        struct dma_buf_attachment       *db_attach;
 };
@@ -49,24 +47,6 @@ struct vb2_dc_buf {
 /*        scatterlist table functions        */
 /*********************************************/
 
-
-static void vb2_dc_sgt_foreach_page(struct sg_table *sgt,
-       void (*cb)(struct page *pg))
-{
-       struct scatterlist *s;
-       unsigned int i;
-
-       for_each_sg(sgt->sgl, s, sgt->orig_nents, i) {
-               struct page *page = sg_page(s);
-               unsigned int n_pages = PAGE_ALIGN(s->offset + s->length)
-                       >> PAGE_SHIFT;
-               unsigned int j;
-
-               for (j = 0; j < n_pages; ++j, ++page)
-                       cb(page);
-       }
-}
-
 static unsigned long vb2_dc_get_contiguous_size(struct sg_table *sgt)
 {
        struct scatterlist *s;
@@ -429,92 +409,12 @@ static struct dma_buf *vb2_dc_get_dmabuf(void *buf_priv, unsigned long flags)
 /*       callbacks for USERPTR buffers       */
 /*********************************************/
 
-static inline int vma_is_io(struct vm_area_struct *vma)
-{
-       return !!(vma->vm_flags & (VM_IO | VM_PFNMAP));
-}
-
-static int vb2_dc_get_user_pfn(unsigned long start, int n_pages,
-       struct vm_area_struct *vma, unsigned long *res)
-{
-       unsigned long pfn, start_pfn, prev_pfn;
-       unsigned int i;
-       int ret;
-
-       if (!vma_is_io(vma))
-               return -EFAULT;
-
-       ret = follow_pfn(vma, start, &pfn);
-       if (ret)
-               return ret;
-
-       start_pfn = pfn;
-       start += PAGE_SIZE;
-
-       for (i = 1; i < n_pages; ++i, start += PAGE_SIZE) {
-               prev_pfn = pfn;
-               ret = follow_pfn(vma, start, &pfn);
-
-               if (ret) {
-                       pr_err("no page for address %lu\n", start);
-                       return ret;
-               }
-               if (pfn != prev_pfn + 1)
-                       return -EINVAL;
-       }
-
-       *res = start_pfn;
-       return 0;
-}
-
-static int vb2_dc_get_user_pages(unsigned long start, struct page **pages,
-       int n_pages, struct vm_area_struct *vma,
-       enum dma_data_direction dma_dir)
-{
-       if (vma_is_io(vma)) {
-               unsigned int i;
-
-               for (i = 0; i < n_pages; ++i, start += PAGE_SIZE) {
-                       unsigned long pfn;
-                       int ret = follow_pfn(vma, start, &pfn);
-
-                       if (!pfn_valid(pfn))
-                               return -EINVAL;
-
-                       if (ret) {
-                               pr_err("no page for address %lu\n", start);
-                               return ret;
-                       }
-                       pages[i] = pfn_to_page(pfn);
-               }
-       } else {
-               int n;
-
-               n = get_user_pages(current, current->mm, start & PAGE_MASK,
-                       n_pages, dma_dir == DMA_FROM_DEVICE, 1, pages, NULL);
-               /* negative error means that no page was pinned */
-               n = max(n, 0);
-               if (n != n_pages) {
-                       pr_err("got only %d of %d user pages\n", n, n_pages);
-                       while (n)
-                               put_page(pages[--n]);
-                       return -EFAULT;
-               }
-       }
-
-       return 0;
-}
-
-static void vb2_dc_put_dirty_page(struct page *page)
-{
-       set_page_dirty_lock(page);
-       put_page(page);
-}
-
 static void vb2_dc_put_userptr(void *buf_priv)
 {
        struct vb2_dc_buf *buf = buf_priv;
        struct sg_table *sgt = buf->dma_sgt;
+       int i;
+       struct page **pages;
 
        if (sgt) {
                DEFINE_DMA_ATTRS(attrs);
@@ -526,13 +426,15 @@ static void vb2_dc_put_userptr(void *buf_priv)
                 */
                dma_unmap_sg_attrs(buf->dev, sgt->sgl, sgt->orig_nents,
                                   buf->dma_dir, &attrs);
-               if (!vma_is_io(buf->vma))
-                       vb2_dc_sgt_foreach_page(sgt, vb2_dc_put_dirty_page);
-
+               pages = frame_vector_pages(buf->vec);
+               /* sgt should exist only if vector contains pages... */
+               BUG_ON(IS_ERR(pages));
+               for (i = 0; i < frame_vector_count(buf->vec); i++)
+                       set_page_dirty_lock(pages[i]);
                sg_free_table(sgt);
                kfree(sgt);
        }
-       vb2_put_vma(buf->vma);
+       vb2_destroy_framevec(buf->vec);
        kfree(buf);
 }
 
@@ -572,13 +474,10 @@ static void *vb2_dc_get_userptr(void *alloc_ctx, unsigned long vaddr,
 {
        struct vb2_dc_conf *conf = alloc_ctx;
        struct vb2_dc_buf *buf;
-       unsigned long start;
-       unsigned long end;
+       struct frame_vector *vec;
        unsigned long offset;
-       struct page **pages;
-       int n_pages;
+       int n_pages, i;
        int ret = 0;
-       struct vm_area_struct *vma;
        struct sg_table *sgt;
        unsigned long contig_size;
        unsigned long dma_align = dma_get_cache_alignment();
@@ -604,72 +503,43 @@ static void *vb2_dc_get_userptr(void *alloc_ctx, unsigned long vaddr,
        buf->dev = conf->dev;
        buf->dma_dir = dma_dir;
 
-       start = vaddr & PAGE_MASK;
        offset = vaddr & ~PAGE_MASK;
-       end = PAGE_ALIGN(vaddr + size);
-       n_pages = (end - start) >> PAGE_SHIFT;
-
-       pages = kmalloc(n_pages * sizeof(pages[0]), GFP_KERNEL);
-       if (!pages) {
-               ret = -ENOMEM;
-               pr_err("failed to allocate pages table\n");
+       vec = vb2_create_framevec(vaddr, size, dma_dir == DMA_FROM_DEVICE);
+       if (IS_ERR(vec)) {
+               ret = PTR_ERR(vec);
                goto fail_buf;
        }
+       buf->vec = vec;
+       n_pages = frame_vector_count(vec);
+       ret = frame_vector_to_pages(vec);
+       if (ret < 0) {
+               unsigned long *nums = frame_vector_pfns(vec);
 
-       /* current->mm->mmap_sem is taken by videobuf2 core */
-       vma = find_vma(current->mm, vaddr);
-       if (!vma) {
-               pr_err("no vma for address %lu\n", vaddr);
-               ret = -EFAULT;
-               goto fail_pages;
-       }
-
-       if (vma->vm_end < vaddr + size) {
-               pr_err("vma at %lu is too small for %lu bytes\n", vaddr, size);
-               ret = -EFAULT;
-               goto fail_pages;
-       }
-
-       buf->vma = vb2_get_vma(vma);
-       if (!buf->vma) {
-               pr_err("failed to copy vma\n");
-               ret = -ENOMEM;
-               goto fail_pages;
-       }
-
-       /* extract page list from userspace mapping */
-       ret = vb2_dc_get_user_pages(start, pages, n_pages, vma, dma_dir);
-       if (ret) {
-               unsigned long pfn;
-               if (vb2_dc_get_user_pfn(start, n_pages, vma, &pfn) == 0) {
-                       buf->dma_addr = vb2_dc_pfn_to_dma(buf->dev, pfn);
-                       buf->size = size;
-                       kfree(pages);
-                       return buf;
-               }
-
-               pr_err("failed to get user pages\n");
-               goto fail_vma;
+               /*
+                * Failed to convert to pages... Check the memory is physically
+                * contiguous and use direct mapping
+                */
+               for (i = 1; i < n_pages; i++)
+                       if (nums[i-1] + 1 != nums[i])
+                               goto fail_pfnvec;
+               buf->dma_addr = vb2_dc_pfn_to_dma(buf->dev, nums[0]);
+               goto out;
        }
 
        sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
        if (!sgt) {
                pr_err("failed to allocate sg table\n");
                ret = -ENOMEM;
-               goto fail_get_user_pages;
+               goto fail_pfnvec;
        }
 
-       ret = sg_alloc_table_from_pages(sgt, pages, n_pages,
+       ret = sg_alloc_table_from_pages(sgt, frame_vector_pages(vec), n_pages,
                offset, size, GFP_KERNEL);
        if (ret) {
                pr_err("failed to initialize sg table\n");
                goto fail_sgt;
        }
 
-       /* pages are no longer needed */
-       kfree(pages);
-       pages = NULL;
-
        /*
         * No need to sync to the device, this will happen later when the
         * prepare() memop is called.
@@ -691,8 +561,9 @@ static void *vb2_dc_get_userptr(void *alloc_ctx, unsigned long vaddr,
        }
 
        buf->dma_addr = sg_dma_address(sgt->sgl);
-       buf->size = size;
        buf->dma_sgt = sgt;
+out:
+       buf->size = size;
 
        return buf;
 
@@ -701,23 +572,13 @@ fail_map_sg:
                           buf->dma_dir, &attrs);
 
 fail_sgt_init:
-       if (!vma_is_io(buf->vma))
-               vb2_dc_sgt_foreach_page(sgt, put_page);
        sg_free_table(sgt);
 
 fail_sgt:
        kfree(sgt);
 
-fail_get_user_pages:
-       if (pages && !vma_is_io(buf->vma))
-               while (n_pages)
-                       put_page(pages[--n_pages]);
-
-fail_vma:
-       vb2_put_vma(buf->vma);
-
-fail_pages:
-       kfree(pages); /* kfree is NULL-proof */
+fail_pfnvec:
+       vb2_destroy_framevec(vec);
 
 fail_buf:
        kfree(buf);
index 7289b81bd7b72e46acf5f522a2341a31088020e4..be7bd6535c9d87d6bc3839e4bccae160f2251047 100644 (file)
@@ -38,6 +38,7 @@ struct vb2_dma_sg_buf {
        struct device                   *dev;
        void                            *vaddr;
        struct page                     **pages;
+       struct frame_vector             *vec;
        int                             offset;
        enum dma_data_direction         dma_dir;
        struct sg_table                 sg_table;
@@ -51,7 +52,6 @@ struct vb2_dma_sg_buf {
        unsigned int                    num_pages;
        atomic_t                        refcount;
        struct vb2_vmarea_handler       handler;
-       struct vm_area_struct           *vma;
 
        struct dma_buf_attachment       *db_attach;
 };
@@ -225,25 +225,17 @@ static void vb2_dma_sg_finish(void *buf_priv)
        dma_sync_sg_for_cpu(buf->dev, sgt->sgl, sgt->nents, buf->dma_dir);
 }
 
-static inline int vma_is_io(struct vm_area_struct *vma)
-{
-       return !!(vma->vm_flags & (VM_IO | VM_PFNMAP));
-}
-
 static void *vb2_dma_sg_get_userptr(void *alloc_ctx, unsigned long vaddr,
                                    unsigned long size,
                                    enum dma_data_direction dma_dir)
 {
        struct vb2_dma_sg_conf *conf = alloc_ctx;
        struct vb2_dma_sg_buf *buf;
-       unsigned long first, last;
-       int num_pages_from_user;
-       struct vm_area_struct *vma;
        struct sg_table *sgt;
        DEFINE_DMA_ATTRS(attrs);
+       struct frame_vector *vec;
 
        dma_set_attr(DMA_ATTR_SKIP_CPU_SYNC, &attrs);
-
        buf = kzalloc(sizeof *buf, GFP_KERNEL);
        if (!buf)
                return NULL;
@@ -254,61 +246,19 @@ static void *vb2_dma_sg_get_userptr(void *alloc_ctx, unsigned long vaddr,
        buf->offset = vaddr & ~PAGE_MASK;
        buf->size = size;
        buf->dma_sgt = &buf->sg_table;
+       vec = vb2_create_framevec(vaddr, size, buf->dma_dir == DMA_FROM_DEVICE);
+       if (IS_ERR(vec))
+               goto userptr_fail_pfnvec;
+       buf->vec = vec;
 
-       first = (vaddr           & PAGE_MASK) >> PAGE_SHIFT;
-       last  = ((vaddr + size - 1) & PAGE_MASK) >> PAGE_SHIFT;
-       buf->num_pages = last - first + 1;
-
-       buf->pages = kzalloc(buf->num_pages * sizeof(struct page *),
-                            GFP_KERNEL);
-       if (!buf->pages)
-               goto userptr_fail_alloc_pages;
-
-       vma = find_vma(current->mm, vaddr);
-       if (!vma) {
-               dprintk(1, "no vma for address %lu\n", vaddr);
-               goto userptr_fail_find_vma;
-       }
-
-       if (vma->vm_end < vaddr + size) {
-               dprintk(1, "vma at %lu is too small for %lu bytes\n",
-                       vaddr, size);
-               goto userptr_fail_find_vma;
-       }
-
-       buf->vma = vb2_get_vma(vma);
-       if (!buf->vma) {
-               dprintk(1, "failed to copy vma\n");
-               goto userptr_fail_find_vma;
-       }
-
-       if (vma_is_io(buf->vma)) {
-               for (num_pages_from_user = 0;
-                    num_pages_from_user < buf->num_pages;
-                    ++num_pages_from_user, vaddr += PAGE_SIZE) {
-                       unsigned long pfn;
-
-                       if (follow_pfn(vma, vaddr, &pfn)) {
-                               dprintk(1, "no page for address %lu\n", vaddr);
-                               break;
-                       }
-                       buf->pages[num_pages_from_user] = pfn_to_page(pfn);
-               }
-       } else
-               num_pages_from_user = get_user_pages(current, current->mm,
-                                            vaddr & PAGE_MASK,
-                                            buf->num_pages,
-                                            buf->dma_dir == DMA_FROM_DEVICE,
-                                            1, /* force */
-                                            buf->pages,
-                                            NULL);
-
-       if (num_pages_from_user != buf->num_pages)
-               goto userptr_fail_get_user_pages;
+       buf->pages = frame_vector_pages(vec);
+       if (IS_ERR(buf->pages))
+               goto userptr_fail_sgtable;
+       buf->num_pages = frame_vector_count(vec);
 
        if (sg_alloc_table_from_pages(buf->dma_sgt, buf->pages,
                        buf->num_pages, buf->offset, size, 0))
-               goto userptr_fail_alloc_table_from_pages;
+               goto userptr_fail_sgtable;
 
        sgt = &buf->sg_table;
        /*
@@ -324,17 +274,9 @@ static void *vb2_dma_sg_get_userptr(void *alloc_ctx, unsigned long vaddr,
 
 userptr_fail_map:
        sg_free_table(&buf->sg_table);
-userptr_fail_alloc_table_from_pages:
-userptr_fail_get_user_pages:
-       dprintk(1, "get_user_pages requested/got: %d/%d]\n",
-               buf->num_pages, num_pages_from_user);
-       if (!vma_is_io(buf->vma))
-               while (--num_pages_from_user >= 0)
-                       put_page(buf->pages[num_pages_from_user]);
-       vb2_put_vma(buf->vma);
-userptr_fail_find_vma:
-       kfree(buf->pages);
-userptr_fail_alloc_pages:
+userptr_fail_sgtable:
+       vb2_destroy_framevec(vec);
+userptr_fail_pfnvec:
        kfree(buf);
        return NULL;
 }
@@ -362,11 +304,8 @@ static void vb2_dma_sg_put_userptr(void *buf_priv)
        while (--i >= 0) {
                if (buf->dma_dir == DMA_FROM_DEVICE)
                        set_page_dirty_lock(buf->pages[i]);
-               if (!vma_is_io(buf->vma))
-                       put_page(buf->pages[i]);
        }
-       kfree(buf->pages);
-       vb2_put_vma(buf->vma);
+       vb2_destroy_framevec(buf->vec);
        kfree(buf);
 }
 
index 0d49b7951f84a55a0d996ba90412b49ec249762c..48c6a49c4928f0446d53526c6cbda0b52ef3e08f 100644 (file)
 #include <media/videobuf2-memops.h>
 
 /**
- * vb2_get_vma() - acquire and lock the virtual memory area
- * @vma:       given virtual memory area
+ * vb2_create_framevec() - map virtual addresses to pfns
+ * @start:     Virtual user address where we start mapping
+ * @length:    Length of a range to map
+ * @write:     Should we map for writing into the area
  *
- * This function attempts to acquire an area mapped in the userspace for
- * the duration of a hardware operation. The area is "locked" by performing
- * the same set of operation that are done when process calls fork() and
- * memory areas are duplicated.
- *
- * Returns a copy of a virtual memory region on success or NULL.
- */
-struct vm_area_struct *vb2_get_vma(struct vm_area_struct *vma)
-{
-       struct vm_area_struct *vma_copy;
-
-       vma_copy = kmalloc(sizeof(*vma_copy), GFP_KERNEL);
-       if (vma_copy == NULL)
-               return NULL;
-
-       if (vma->vm_ops && vma->vm_ops->open)
-               vma->vm_ops->open(vma);
-
-       if (vma->vm_file)
-               get_file(vma->vm_file);
-
-       memcpy(vma_copy, vma, sizeof(*vma));
-
-       vma_copy->vm_mm = NULL;
-       vma_copy->vm_next = NULL;
-       vma_copy->vm_prev = NULL;
-
-       return vma_copy;
-}
-EXPORT_SYMBOL_GPL(vb2_get_vma);
-
-/**
- * vb2_put_userptr() - release a userspace virtual memory area
- * @vma:       virtual memory region associated with the area to be released
- *
- * This function releases the previously acquired memory area after a hardware
- * operation.
+ * This function allocates and fills in a vector with pfns corresponding to
+ * virtual address range passed in arguments. If pfns have corresponding pages,
+ * page references are also grabbed to pin pages in memory. The function
+ * returns pointer to the vector on success and error pointer in case of
+ * failure. Returned vector needs to be freed via vb2_destroy_framevec().
  */
-void vb2_put_vma(struct vm_area_struct *vma)
+struct frame_vector *vb2_create_framevec(unsigned long start,
+                                        unsigned long length,
+                                        bool write)
 {
-       if (!vma)
-               return;
-
-       if (vma->vm_ops && vma->vm_ops->close)
-               vma->vm_ops->close(vma);
-
-       if (vma->vm_file)
-               fput(vma->vm_file);
-
-       kfree(vma);
+       int ret;
+       unsigned long first, last;
+       unsigned long nr;
+       struct frame_vector *vec;
+
+       first = start >> PAGE_SHIFT;
+       last = (start + length - 1) >> PAGE_SHIFT;
+       nr = last - first + 1;
+       vec = frame_vector_create(nr);
+       if (!vec)
+               return ERR_PTR(-ENOMEM);
+       ret = get_vaddr_frames(start, nr, write, 1, vec);
+       if (ret < 0)
+               goto out_destroy;
+       /* We accept only complete set of PFNs */
+       if (ret != nr) {
+               ret = -EFAULT;
+               goto out_release;
+       }
+       return vec;
+out_release:
+       put_vaddr_frames(vec);
+out_destroy:
+       frame_vector_destroy(vec);
+       return ERR_PTR(ret);
 }
-EXPORT_SYMBOL_GPL(vb2_put_vma);
+EXPORT_SYMBOL(vb2_create_framevec);
 
 /**
- * vb2_get_contig_userptr() - lock physically contiguous userspace mapped memory
- * @vaddr:     starting virtual address of the area to be verified
- * @size:      size of the area
- * @res_paddr: will return physical address for the given vaddr
- * @res_vma:   will return locked copy of struct vm_area for the given area
- *
- * This function will go through memory area of size @size mapped at @vaddr and
- * verify that the underlying physical pages are contiguous. If they are
- * contiguous the virtual memory area is locked and a @res_vma is filled with
- * the copy and @res_pa set to the physical address of the buffer.
+ * vb2_destroy_framevec() - release vector of mapped pfns
+ * @vec:       vector of pfns / pages to release
  *
- * Returns 0 on success.
+ * This releases references to all pages in the vector @vec (if corresponding
+ * pfns are backed by pages) and frees the passed vector.
  */
-int vb2_get_contig_userptr(unsigned long vaddr, unsigned long size,
-                          struct vm_area_struct **res_vma, dma_addr_t *res_pa)
+void vb2_destroy_framevec(struct frame_vector *vec)
 {
-       struct mm_struct *mm = current->mm;
-       struct vm_area_struct *vma;
-       unsigned long offset, start, end;
-       unsigned long this_pfn, prev_pfn;
-       dma_addr_t pa = 0;
-
-       start = vaddr;
-       offset = start & ~PAGE_MASK;
-       end = start + size;
-
-       vma = find_vma(mm, start);
-
-       if (vma == NULL || vma->vm_end < end)
-               return -EFAULT;
-
-       for (prev_pfn = 0; start < end; start += PAGE_SIZE) {
-               int ret = follow_pfn(vma, start, &this_pfn);
-               if (ret)
-                       return ret;
-
-               if (prev_pfn == 0)
-                       pa = this_pfn << PAGE_SHIFT;
-               else if (this_pfn != prev_pfn + 1)
-                       return -EFAULT;
-
-               prev_pfn = this_pfn;
-       }
-
-       /*
-        * Memory is contiguous, lock vma and return to the caller
-        */
-       *res_vma = vb2_get_vma(vma);
-       if (*res_vma == NULL)
-               return -ENOMEM;
-
-       *res_pa = pa + offset;
-       return 0;
+       put_vaddr_frames(vec);
+       frame_vector_destroy(vec);
 }
-EXPORT_SYMBOL_GPL(vb2_get_contig_userptr);
+EXPORT_SYMBOL(vb2_destroy_framevec);
 
 /**
  * vb2_common_vm_open() - increase refcount of the vma
index 2fe4c27f524a85d9732ba4cbc72f5979f954466f..ecb8f0c7f0253b7eef10e97e3e88bd7e25c28296 100644 (file)
 
 struct vb2_vmalloc_buf {
        void                            *vaddr;
-       struct page                     **pages;
-       struct vm_area_struct           *vma;
+       struct frame_vector             *vec;
        enum dma_data_direction         dma_dir;
        unsigned long                   size;
-       unsigned int                    n_pages;
        atomic_t                        refcount;
        struct vb2_vmarea_handler       handler;
        struct dma_buf                  *dbuf;
@@ -76,10 +74,8 @@ static void *vb2_vmalloc_get_userptr(void *alloc_ctx, unsigned long vaddr,
                                     enum dma_data_direction dma_dir)
 {
        struct vb2_vmalloc_buf *buf;
-       unsigned long first, last;
-       int n_pages, offset;
-       struct vm_area_struct *vma;
-       dma_addr_t physp;
+       struct frame_vector *vec;
+       int n_pages, offset, i;
 
        buf = kzalloc(sizeof(*buf), GFP_KERNEL);
        if (!buf)
@@ -88,51 +84,36 @@ static void *vb2_vmalloc_get_userptr(void *alloc_ctx, unsigned long vaddr,
        buf->dma_dir = dma_dir;
        offset = vaddr & ~PAGE_MASK;
        buf->size = size;
-
-
-       vma = find_vma(current->mm, vaddr);
-       if (vma && (vma->vm_flags & VM_PFNMAP) && (vma->vm_pgoff)) {
-               if (vb2_get_contig_userptr(vaddr, size, &vma, &physp))
-                       goto fail_pages_array_alloc;
-               buf->vma = vma;
-               buf->vaddr = (__force void *)ioremap_nocache(physp, size);
-               if (!buf->vaddr)
-                       goto fail_pages_array_alloc;
+       vec = vb2_create_framevec(vaddr, size, dma_dir == DMA_FROM_DEVICE);
+       if (IS_ERR(vec))
+               goto fail_pfnvec_create;
+       buf->vec = vec;
+       n_pages = frame_vector_count(vec);
+       if (frame_vector_to_pages(vec) < 0) {
+               unsigned long *nums = frame_vector_pfns(vec);
+
+               /*
+                * We cannot get page pointers for these pfns. Check memory is
+                * physically contiguous and use direct mapping.
+                */
+               for (i = 1; i < n_pages; i++)
+                       if (nums[i-1] + 1 != nums[i])
+                               goto fail_map;
+               buf->vaddr = (__force void *)
+                               ioremap_nocache(nums[0] << PAGE_SHIFT, size);
        } else {
-               first = vaddr >> PAGE_SHIFT;
-               last  = (vaddr + size - 1) >> PAGE_SHIFT;
-               buf->n_pages = last - first + 1;
-               buf->pages = kzalloc(buf->n_pages * sizeof(struct page *),
-                                    GFP_KERNEL);
-               if (!buf->pages)
-                       goto fail_pages_array_alloc;
-
-               /* current->mm->mmap_sem is taken by videobuf2 core */
-               n_pages = get_user_pages(current, current->mm,
-                                        vaddr & PAGE_MASK, buf->n_pages,
-                                        dma_dir == DMA_FROM_DEVICE,
-                                        1, /* force */
-                                        buf->pages, NULL);
-               if (n_pages != buf->n_pages)
-                       goto fail_get_user_pages;
-
-               buf->vaddr = vm_map_ram(buf->pages, buf->n_pages, -1,
+               buf->vaddr = vm_map_ram(frame_vector_pages(vec), n_pages, -1,
                                        PAGE_KERNEL);
-               if (!buf->vaddr)
-                       goto fail_get_user_pages;
        }
 
+       if (!buf->vaddr)
+               goto fail_map;
        buf->vaddr += offset;
        return buf;
 
-fail_get_user_pages:
-       pr_debug("get_user_pages requested/got: %d/%d]\n", n_pages,
-                buf->n_pages);
-       while (--n_pages >= 0)
-               put_page(buf->pages[n_pages]);
-       kfree(buf->pages);
-
-fail_pages_array_alloc:
+fail_map:
+       vb2_destroy_framevec(vec);
+fail_pfnvec_create:
        kfree(buf);
 
        return NULL;
@@ -143,20 +124,21 @@ static void vb2_vmalloc_put_userptr(void *buf_priv)
        struct vb2_vmalloc_buf *buf = buf_priv;
        unsigned long vaddr = (unsigned long)buf->vaddr & PAGE_MASK;
        unsigned int i;
+       struct page **pages;
+       unsigned int n_pages;
 
-       if (buf->pages) {
+       if (!buf->vec->is_pfns) {
+               n_pages = frame_vector_count(buf->vec);
+               pages = frame_vector_pages(buf->vec);
                if (vaddr)
-                       vm_unmap_ram((void *)vaddr, buf->n_pages);
-               for (i = 0; i < buf->n_pages; ++i) {
-                       if (buf->dma_dir == DMA_FROM_DEVICE)
-                               set_page_dirty_lock(buf->pages[i]);
-                       put_page(buf->pages[i]);
-               }
-               kfree(buf->pages);
+                       vm_unmap_ram((void *)vaddr, n_pages);
+               if (buf->dma_dir == DMA_FROM_DEVICE)
+                       for (i = 0; i < n_pages; i++)
+                               set_page_dirty_lock(pages[i]);
        } else {
-               vb2_put_vma(buf->vma);
                iounmap((__force void __iomem *)buf->vaddr);
        }
+       vb2_destroy_framevec(buf->vec);
        kfree(buf);
 }
 
index c8765db07a62517c2f4cf221d7cf488af95cf273..ba8fff3d66a655d0875a50f4abb7ea863ea0099c 100644 (file)
@@ -9,8 +9,6 @@
 #include <linux/of.h>
 #include <linux/mm.h>
 
-#include <asm/cacheflush.h>
-
 #include <dt-bindings/memory/tegra114-mc.h>
 
 #include "mc.h"
@@ -914,20 +912,6 @@ static const struct tegra_smmu_swgroup tegra114_swgroups[] = {
        { .name = "tsec",      .swgroup = TEGRA_SWGROUP_TSEC,      .reg = 0x294 },
 };
 
-static void tegra114_flush_dcache(struct page *page, unsigned long offset,
-                                 size_t size)
-{
-       phys_addr_t phys = page_to_phys(page) + offset;
-       void *virt = page_address(page) + offset;
-
-       __cpuc_flush_dcache_area(virt, size);
-       outer_flush_range(phys, phys + size);
-}
-
-static const struct tegra_smmu_ops tegra114_smmu_ops = {
-       .flush_dcache = tegra114_flush_dcache,
-};
-
 static const struct tegra_smmu_soc tegra114_smmu_soc = {
        .clients = tegra114_mc_clients,
        .num_clients = ARRAY_SIZE(tegra114_mc_clients),
@@ -935,8 +919,8 @@ static const struct tegra_smmu_soc tegra114_smmu_soc = {
        .num_swgroups = ARRAY_SIZE(tegra114_swgroups),
        .supports_round_robin_arbitration = false,
        .supports_request_limit = false,
+       .num_tlb_lines = 32,
        .num_asids = 4,
-       .ops = &tegra114_smmu_ops,
 };
 
 const struct tegra_mc_soc tegra114_mc_soc = {
index 060fb3d7a23fb2b4ba486f6e069e55f190563898..21e7255e3d96af10a53549be2c86b8e076a99abd 100644 (file)
@@ -9,8 +9,6 @@
 #include <linux/of.h>
 #include <linux/mm.h>
 
-#include <asm/cacheflush.h>
-
 #include <dt-bindings/memory/tegra124-mc.h>
 
 #include "mc.h"
@@ -1002,20 +1000,6 @@ static const struct tegra_smmu_swgroup tegra124_swgroups[] = {
 };
 
 #ifdef CONFIG_ARCH_TEGRA_124_SOC
-static void tegra124_flush_dcache(struct page *page, unsigned long offset,
-                                 size_t size)
-{
-       phys_addr_t phys = page_to_phys(page) + offset;
-       void *virt = page_address(page) + offset;
-
-       __cpuc_flush_dcache_area(virt, size);
-       outer_flush_range(phys, phys + size);
-}
-
-static const struct tegra_smmu_ops tegra124_smmu_ops = {
-       .flush_dcache = tegra124_flush_dcache,
-};
-
 static const struct tegra_smmu_soc tegra124_smmu_soc = {
        .clients = tegra124_mc_clients,
        .num_clients = ARRAY_SIZE(tegra124_mc_clients),
@@ -1024,7 +1008,6 @@ static const struct tegra_smmu_soc tegra124_smmu_soc = {
        .supports_round_robin_arbitration = true,
        .supports_request_limit = true,
        .num_asids = 128,
-       .ops = &tegra124_smmu_ops,
 };
 
 const struct tegra_mc_soc tegra124_mc_soc = {
@@ -1040,18 +1023,6 @@ const struct tegra_mc_soc tegra124_mc_soc = {
 #endif /* CONFIG_ARCH_TEGRA_124_SOC */
 
 #ifdef CONFIG_ARCH_TEGRA_132_SOC
-static void tegra132_flush_dcache(struct page *page, unsigned long offset,
-                                 size_t size)
-{
-       void *virt = page_address(page) + offset;
-
-       __flush_dcache_area(virt, size);
-}
-
-static const struct tegra_smmu_ops tegra132_smmu_ops = {
-       .flush_dcache = tegra132_flush_dcache,
-};
-
 static const struct tegra_smmu_soc tegra132_smmu_soc = {
        .clients = tegra124_mc_clients,
        .num_clients = ARRAY_SIZE(tegra124_mc_clients),
@@ -1059,8 +1030,8 @@ static const struct tegra_smmu_soc tegra132_smmu_soc = {
        .num_swgroups = ARRAY_SIZE(tegra124_swgroups),
        .supports_round_robin_arbitration = true,
        .supports_request_limit = true,
+       .num_tlb_lines = 32,
        .num_asids = 128,
-       .ops = &tegra132_smmu_ops,
 };
 
 const struct tegra_mc_soc tegra132_mc_soc = {
index 52e16c7b34f81f94a851dfc1b839676f8d879a9f..b44737840e70c188344c3d51e5edc08ffb02b256 100644 (file)
@@ -9,8 +9,6 @@
 #include <linux/of.h>
 #include <linux/mm.h>
 
-#include <asm/cacheflush.h>
-
 #include <dt-bindings/memory/tegra30-mc.h>
 
 #include "mc.h"
@@ -936,20 +934,6 @@ static const struct tegra_smmu_swgroup tegra30_swgroups[] = {
        { .name = "isp",  .swgroup = TEGRA_SWGROUP_ISP,  .reg = 0x258 },
 };
 
-static void tegra30_flush_dcache(struct page *page, unsigned long offset,
-                                size_t size)
-{
-       phys_addr_t phys = page_to_phys(page) + offset;
-       void *virt = page_address(page) + offset;
-
-       __cpuc_flush_dcache_area(virt, size);
-       outer_flush_range(phys, phys + size);
-}
-
-static const struct tegra_smmu_ops tegra30_smmu_ops = {
-       .flush_dcache = tegra30_flush_dcache,
-};
-
 static const struct tegra_smmu_soc tegra30_smmu_soc = {
        .clients = tegra30_mc_clients,
        .num_clients = ARRAY_SIZE(tegra30_mc_clients),
@@ -957,8 +941,8 @@ static const struct tegra_smmu_soc tegra30_smmu_soc = {
        .num_swgroups = ARRAY_SIZE(tegra30_swgroups),
        .supports_round_robin_arbitration = false,
        .supports_request_limit = false,
+       .num_tlb_lines = 16,
        .num_asids = 4,
-       .ops = &tegra30_smmu_ops,
 };
 
 const struct tegra_mc_soc tegra30_mc_soc = {
index 2b254f3a1154e641628dc9e92a84572f4a4174f9..c6cb7f8f325e91c530656b4d1eac709c82f8212e 100644 (file)
@@ -186,19 +186,11 @@ static ssize_t at24_eeprom_read(struct at24_data *at24, char *buf,
        if (count > io_limit)
                count = io_limit;
 
-       switch (at24->use_smbus) {
-       case I2C_SMBUS_I2C_BLOCK_DATA:
+       if (at24->use_smbus) {
                /* Smaller eeproms can work given some SMBus extension calls */
                if (count > I2C_SMBUS_BLOCK_MAX)
                        count = I2C_SMBUS_BLOCK_MAX;
-               break;
-       case I2C_SMBUS_WORD_DATA:
-               count = 2;
-               break;
-       case I2C_SMBUS_BYTE_DATA:
-               count = 1;
-               break;
-       default:
+       } else {
                /*
                 * When we have a better choice than SMBus calls, use a
                 * combined I2C message. Write address; then read up to
@@ -229,27 +221,10 @@ static ssize_t at24_eeprom_read(struct at24_data *at24, char *buf,
        timeout = jiffies + msecs_to_jiffies(write_timeout);
        do {
                read_time = jiffies;
-               switch (at24->use_smbus) {
-               case I2C_SMBUS_I2C_BLOCK_DATA:
-                       status = i2c_smbus_read_i2c_block_data(client, offset,
-                                       count, buf);
-                       break;
-               case I2C_SMBUS_WORD_DATA:
-                       status = i2c_smbus_read_word_data(client, offset);
-                       if (status >= 0) {
-                               buf[0] = status & 0xff;
-                               buf[1] = status >> 8;
-                               status = count;
-                       }
-                       break;
-               case I2C_SMBUS_BYTE_DATA:
-                       status = i2c_smbus_read_byte_data(client, offset);
-                       if (status >= 0) {
-                               buf[0] = status;
-                               status = count;
-                       }
-                       break;
-               default:
+               if (at24->use_smbus) {
+                       status = i2c_smbus_read_i2c_block_data_or_emulated(client, offset,
+                                                                          count, buf);
+               } else {
                        status = i2c_transfer(client->adapter, msg, 2);
                        if (status == 2)
                                status = count;
index 9aa4332a6b04d43034a93b54116ea7578d2a00bb..e4dd93b2518c8486a5591c620fa88be0be2d0377 100644 (file)
@@ -191,6 +191,7 @@ static const struct i2c_device_id max6875_id[] = {
        { "max6875", 0 },
        { }
 };
+MODULE_DEVICE_TABLE(i2c, max6875_id);
 
 static struct i2c_driver max6875_driver = {
        .driver = {
index c49d244265eccd6c4017a8bcd73a7f0749fdc3dc..70e62d6a3231fd7ce4f42ea2cc8c2065163f083c 100644 (file)
@@ -418,7 +418,7 @@ static void genwqe_vma_close(struct vm_area_struct *vma)
        kfree(dma_map);
 }
 
-static struct vm_operations_struct genwqe_vma_ops = {
+static const struct vm_operations_struct genwqe_vma_ops = {
        .open   = genwqe_vma_open,
        .close  = genwqe_vma_close,
 };
index 2bc0f5089f829ea76477dbf2648a5c7edf77853f..b346638833b0cd7bfbc97b022c8e0ae25fa88872 100644 (file)
@@ -364,6 +364,7 @@ int mei_watchdog_register(struct mei_device *dev)
 
        int ret;
 
+       amt_wd_dev.parent = dev->dev;
        /* unlock to preserve correct locking order */
        mutex_unlock(&dev->device_lock);
        ret = watchdog_register_device(&amt_wd_dev);
index 95c894482fddf443d4516ffad39c54adda5be754..340b44d9e8cf7c634685fd2afe84fddfc136d14b 100644 (file)
@@ -239,7 +239,7 @@ xpc_create_gru_mq_uv(unsigned int mq_size, int cpu, char *irq_name,
        mq->mmr_blade = uv_cpu_to_blade_id(cpu);
 
        nid = cpu_to_node(cpu);
-       page = alloc_pages_exact_node(nid,
+       page = __alloc_pages_node(nid,
                                      GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
                                      pg_order);
        if (page == NULL) {
index a1b820fcb2a6ff60093011d696761ba4ca15e584..c742cfd7674e0eb699b2e656cff1eba288932b6f 100644 (file)
 #include "queue.h"
 
 MODULE_ALIAS("mmc:block");
 #ifdef MODULE_PARAM_PREFIX
 #undef MODULE_PARAM_PREFIX
 #endif
 #define MODULE_PARAM_PREFIX "mmcblk."
 
 #define INAND_CMD38_ARG_EXT_CSD  113
 #define INAND_CMD38_ARG_ERASE    0x00
@@ -2386,6 +2389,7 @@ force_ro_fail:
 #define CID_MANFID_TOSHIBA     0x11
 #define CID_MANFID_MICRON      0x13
 #define CID_MANFID_SAMSUNG     0x15
+#define CID_MANFID_KINGSTON    0x70
 
 static const struct mmc_fixup blk_fixups[] =
 {
@@ -2408,6 +2412,10 @@ static const struct mmc_fixup blk_fixups[] =
         *
         * N.B. This doesn't affect SD cards.
         */
+       MMC_FIXUP("SDMB-32", CID_MANFID_SANDISK, CID_OEMID_ANY, add_quirk_mmc,
+                 MMC_QUIRK_BLK_NO_CMD23),
+       MMC_FIXUP("SDM032", CID_MANFID_SANDISK, CID_OEMID_ANY, add_quirk_mmc,
+                 MMC_QUIRK_BLK_NO_CMD23),
        MMC_FIXUP("MMC08G", CID_MANFID_TOSHIBA, CID_OEMID_ANY, add_quirk_mmc,
                  MMC_QUIRK_BLK_NO_CMD23),
        MMC_FIXUP("MMC16G", CID_MANFID_TOSHIBA, CID_OEMID_ANY, add_quirk_mmc,
@@ -2444,6 +2452,15 @@ static const struct mmc_fixup blk_fixups[] =
        MMC_FIXUP("VZL00M", CID_MANFID_SAMSUNG, CID_OEMID_ANY, add_quirk_mmc,
                  MMC_QUIRK_SEC_ERASE_TRIM_BROKEN),
 
+       /*
+        *  On Some Kingston eMMCs, performing trim can result in
+        *  unrecoverable data corruption occasionally due to a firmware bug.
+        */
+       MMC_FIXUP("V10008", CID_MANFID_KINGSTON, CID_OEMID_ANY, add_quirk_mmc,
+                 MMC_QUIRK_TRIM_BROKEN),
+       MMC_FIXUP("V10016", CID_MANFID_KINGSTON, CID_OEMID_ANY, add_quirk_mmc,
+                 MMC_QUIRK_TRIM_BROKEN),
+
        END_FIXUP
 };
 
index 9ad73f30f744fd3f1f5261c6a0480ea90460a036..0520064dc33beb164aa9d80642c371e227d599b1 100644 (file)
@@ -358,8 +358,10 @@ EXPORT_SYMBOL(mmc_start_bkops);
  */
 static void mmc_wait_data_done(struct mmc_request *mrq)
 {
-       mrq->host->context_info.is_done_rcv = true;
-       wake_up_interruptible(&mrq->host->context_info.wait);
+       struct mmc_context_info *context_info = &mrq->host->context_info;
+
+       context_info->is_done_rcv = true;
+       wake_up_interruptible(&context_info->wait);
 }
 
 static void mmc_wait_done(struct mmc_request *mrq)
@@ -2168,6 +2170,7 @@ int mmc_erase(struct mmc_card *card, unsigned int from, unsigned int nr,
              unsigned int arg)
 {
        unsigned int rem, to = from + nr;
+       int err;
 
        if (!(card->host->caps & MMC_CAP_ERASE) ||
            !(card->csd.cmdclass & CCC_ERASE))
@@ -2218,6 +2221,22 @@ int mmc_erase(struct mmc_card *card, unsigned int from, unsigned int nr,
        /* 'from' and 'to' are inclusive */
        to -= 1;
 
+       /*
+        * Special case where only one erase-group fits in the timeout budget:
+        * If the region crosses an erase-group boundary on this particular
+        * case, we will be trimming more than one erase-group which, does not
+        * case, we will be trimming more than one erase-group which does not
+        * and call mmc_do_erase() twice if necessary. This special case is
+        * identified by the card->eg_boundary flag.
+        */
+       rem = card->erase_size - (from % card->erase_size);
+       if ((arg & MMC_TRIM_ARGS) && (card->eg_boundary) && (nr > rem)) {
+               err = mmc_do_erase(card, from, from + rem - 1, arg);
+               from += rem;
+               if ((err) || (to <= from))
+                       return err;
+       }
+
        return mmc_do_erase(card, from, to, arg);
 }
 EXPORT_SYMBOL(mmc_erase);
@@ -2233,7 +2252,8 @@ EXPORT_SYMBOL(mmc_can_erase);
 
 int mmc_can_trim(struct mmc_card *card)
 {
-       if (card->ext_csd.sec_feature_support & EXT_CSD_SEC_GB_CL_EN)
+       if ((card->ext_csd.sec_feature_support & EXT_CSD_SEC_GB_CL_EN) &&
+           (!(card->quirks & MMC_QUIRK_TRIM_BROKEN)))
                return 1;
        return 0;
 }
@@ -2313,16 +2333,28 @@ static unsigned int mmc_do_calc_max_discard(struct mmc_card *card,
        if (!qty)
                return 0;
 
+       /*
+        * When specifying a sector range to trim, chances are we might cross
+        * an erase-group boundary even if the amount of sectors is less than
+        * one erase-group.
+        * If we can only fit one erase-group in the controller timeout budget,
+        * we have to care that erase-group boundaries are not crossed by a
+        * single trim operation. We flag that special case with "eg_boundary".
+        * In all other cases we can just decrement qty and pretend that we
+        * always touch (qty + 1) erase-groups as a simple optimization.
+        */
        if (qty == 1)
-               return 1;
+               card->eg_boundary = 1;
+       else
+               qty--;
 
        /* Convert qty to sectors */
        if (card->erase_shift)
-               max_discard = --qty << card->erase_shift;
+               max_discard = qty << card->erase_shift;
        else if (mmc_card_sd(card))
-               max_discard = qty;
+               max_discard = qty + 1;
        else
-               max_discard = --qty * card->erase_size;
+               max_discard = qty * card->erase_size;
 
        return max_discard;
 }
index 99a9c9011c501011db0953319dc4d82b003ead6f..abd933b7029bec26b7adebbea2db8fe3be426eb6 100644 (file)
@@ -398,7 +398,7 @@ int mmc_of_parse(struct mmc_host *host)
 {
        struct device_node *np;
        u32 bus_width;
-       int len, ret;
+       int ret;
        bool cd_cap_invert, cd_gpio_invert = false;
        bool ro_cap_invert, ro_gpio_invert = false;
 
@@ -445,12 +445,12 @@ int mmc_of_parse(struct mmc_host *host)
         */
 
        /* Parse Card Detection */
-       if (of_find_property(np, "non-removable", &len)) {
+       if (of_property_read_bool(np, "non-removable")) {
                host->caps |= MMC_CAP_NONREMOVABLE;
        } else {
                cd_cap_invert = of_property_read_bool(np, "cd-inverted");
 
-               if (of_find_property(np, "broken-cd", &len))
+               if (of_property_read_bool(np, "broken-cd"))
                        host->caps |= MMC_CAP_NEEDS_POLL;
 
                ret = mmc_gpiod_request_cd(host, "cd", 0, true,
@@ -491,41 +491,41 @@ int mmc_of_parse(struct mmc_host *host)
        if (ro_cap_invert ^ ro_gpio_invert)
                host->caps2 |= MMC_CAP2_RO_ACTIVE_HIGH;
 
-       if (of_find_property(np, "cap-sd-highspeed", &len))
+       if (of_property_read_bool(np, "cap-sd-highspeed"))
                host->caps |= MMC_CAP_SD_HIGHSPEED;
-       if (of_find_property(np, "cap-mmc-highspeed", &len))
+       if (of_property_read_bool(np, "cap-mmc-highspeed"))
                host->caps |= MMC_CAP_MMC_HIGHSPEED;
-       if (of_find_property(np, "sd-uhs-sdr12", &len))
+       if (of_property_read_bool(np, "sd-uhs-sdr12"))
                host->caps |= MMC_CAP_UHS_SDR12;
-       if (of_find_property(np, "sd-uhs-sdr25", &len))
+       if (of_property_read_bool(np, "sd-uhs-sdr25"))
                host->caps |= MMC_CAP_UHS_SDR25;
-       if (of_find_property(np, "sd-uhs-sdr50", &len))
+       if (of_property_read_bool(np, "sd-uhs-sdr50"))
                host->caps |= MMC_CAP_UHS_SDR50;
-       if (of_find_property(np, "sd-uhs-sdr104", &len))
+       if (of_property_read_bool(np, "sd-uhs-sdr104"))
                host->caps |= MMC_CAP_UHS_SDR104;
-       if (of_find_property(np, "sd-uhs-ddr50", &len))
+       if (of_property_read_bool(np, "sd-uhs-ddr50"))
                host->caps |= MMC_CAP_UHS_DDR50;
-       if (of_find_property(np, "cap-power-off-card", &len))
+       if (of_property_read_bool(np, "cap-power-off-card"))
                host->caps |= MMC_CAP_POWER_OFF_CARD;
-       if (of_find_property(np, "cap-sdio-irq", &len))
+       if (of_property_read_bool(np, "cap-sdio-irq"))
                host->caps |= MMC_CAP_SDIO_IRQ;
-       if (of_find_property(np, "full-pwr-cycle", &len))
+       if (of_property_read_bool(np, "full-pwr-cycle"))
                host->caps2 |= MMC_CAP2_FULL_PWR_CYCLE;
-       if (of_find_property(np, "keep-power-in-suspend", &len))
+       if (of_property_read_bool(np, "keep-power-in-suspend"))
                host->pm_caps |= MMC_PM_KEEP_POWER;
-       if (of_find_property(np, "enable-sdio-wakeup", &len))
+       if (of_property_read_bool(np, "enable-sdio-wakeup"))
                host->pm_caps |= MMC_PM_WAKE_SDIO_IRQ;
-       if (of_find_property(np, "mmc-ddr-1_8v", &len))
+       if (of_property_read_bool(np, "mmc-ddr-1_8v"))
                host->caps |= MMC_CAP_1_8V_DDR;
-       if (of_find_property(np, "mmc-ddr-1_2v", &len))
+       if (of_property_read_bool(np, "mmc-ddr-1_2v"))
                host->caps |= MMC_CAP_1_2V_DDR;
-       if (of_find_property(np, "mmc-hs200-1_8v", &len))
+       if (of_property_read_bool(np, "mmc-hs200-1_8v"))
                host->caps2 |= MMC_CAP2_HS200_1_8V_SDR;
-       if (of_find_property(np, "mmc-hs200-1_2v", &len))
+       if (of_property_read_bool(np, "mmc-hs200-1_2v"))
                host->caps2 |= MMC_CAP2_HS200_1_2V_SDR;
-       if (of_find_property(np, "mmc-hs400-1_8v", &len))
+       if (of_property_read_bool(np, "mmc-hs400-1_8v"))
                host->caps2 |= MMC_CAP2_HS400_1_8V | MMC_CAP2_HS200_1_8V_SDR;
-       if (of_find_property(np, "mmc-hs400-1_2v", &len))
+       if (of_property_read_bool(np, "mmc-hs400-1_2v"))
                host->caps2 |= MMC_CAP2_HS400_1_2V | MMC_CAP2_HS200_1_2V_SDR;
 
        host->dsr_req = !of_property_read_u32(np, "dsr", &host->dsr);
index 6a0f9c79be2652bdf843c40692e7dd188d76567a..8a1e3498261e9301cffad18889c5392e6751b7a6 100644 (file)
@@ -129,6 +129,14 @@ config MMC_SDHCI_OF_ARASAN
 
          If unsure, say N.
 
+config MMC_SDHCI_OF_AT91
+       tristate "SDHCI OF support for the Atmel SDMMC controller"
+       depends on MMC_SDHCI_PLTFM
+       depends on OF
+       select MMC_SDHCI_IO_ACCESSORS
+       help
+         This selects the Atmel SDMMC driver
+
 config MMC_SDHCI_OF_ESDHC
        tristate "SDHCI OF support for the Freescale eSDHC controller"
        depends on MMC_SDHCI_PLTFM
index e928d61c5f4be3d70bd4b6d37b7852fccfe70b5d..4f3452afa6ca3d0340cb67b74a874900f61bf54d 100644 (file)
@@ -67,6 +67,7 @@ obj-$(CONFIG_MMC_SDHCI_ESDHC_IMX)     += sdhci-esdhc-imx.o
 obj-$(CONFIG_MMC_SDHCI_DOVE)           += sdhci-dove.o
 obj-$(CONFIG_MMC_SDHCI_TEGRA)          += sdhci-tegra.o
 obj-$(CONFIG_MMC_SDHCI_OF_ARASAN)      += sdhci-of-arasan.o
+obj-$(CONFIG_MMC_SDHCI_OF_AT91)                += sdhci-of-at91.o
 obj-$(CONFIG_MMC_SDHCI_OF_ESDHC)       += sdhci-of-esdhc.o
 obj-$(CONFIG_MMC_SDHCI_OF_HLWD)                += sdhci-of-hlwd.o
 obj-$(CONFIG_MMC_SDHCI_BCM_KONA)       += sdhci-bcm-kona.o
index b1eac719a4cca2aa95033ace18e83b0c03a749df..dca5518b01395a31e98f3a91a2df14736e063652 100644 (file)
@@ -118,7 +118,7 @@ struct goldfish_mmc_host {
        struct mmc_host         *mmc;
        struct device           *dev;
        unsigned char           id; /* 16xx chips have 2 MMC blocks */
-       void __iomem            *virt_base;
+       void                    *virt_base;
        unsigned int            phys_base;
        int                     irq;
        unsigned char           bus_mode;
index 9a39e0b7e583625e7fa8a3f24dab0179e3da880a..bf62e429f7fcc1902d2275677d41cebca8ff5cb5 100644 (file)
@@ -29,7 +29,6 @@
 #include <linux/slab.h>
 #include <linux/stat.h>
 #include <linux/types.h>
-#include <linux/platform_data/atmel.h>
 #include <linux/platform_data/mmc-atmel-mci.h>
 
 #include <linux/mmc/host.h>
index de15121bba7dffe07725451890100d57abba8b25..bc76aa22473ea2f6e75f8b98d96e90b60ed99538 100644 (file)
@@ -73,6 +73,9 @@ static int dw_mci_rockchip_init(struct dw_mci *host)
        /* It is slot 8 on Rockchip SoCs */
        host->sdio_id0 = 8;
 
+       /* It needs this quirk on all Rockchip SoCs */
+       host->pdata->quirks |= DW_MCI_QUIRK_BROKEN_DTO;
+
        return 0;
 }
 
index 40e9d8e45f25c64f1cb421807ced3af1c9ad1ca7..fcbf5524fd3136f6242036ae206239f83c90db71 100644 (file)
@@ -99,6 +99,9 @@ struct idmac_desc {
 
        __le32          des3;   /* buffer 2 physical address */
 };
+
+/* Each descriptor can transfer up to 4KB of data in chained mode */
+#define DW_MCI_DESC_DATA_LENGTH        0x1000
 #endif /* CONFIG_MMC_DW_IDMAC */
 
 static bool dw_mci_reset(struct dw_mci *host);
@@ -235,8 +238,8 @@ static u32 dw_mci_prepare_command(struct mmc_host *mmc, struct mmc_command *cmd)
        struct dw_mci *host = slot->host;
        const struct dw_mci_drv_data *drv_data = slot->host->drv_data;
        u32 cmdr;
-       cmd->error = -EINPROGRESS;
 
+       cmd->error = -EINPROGRESS;
        cmdr = cmd->opcode;
 
        if (cmd->opcode == MMC_STOP_TRANSMISSION ||
@@ -371,7 +374,7 @@ static void dw_mci_start_command(struct dw_mci *host,
                 cmd->arg, cmd_flags);
 
        mci_writel(host, CMDARG, cmd->arg);
-       wmb();
+       wmb(); /* drain writebuffer */
        dw_mci_wait_while_busy(host, cmd_flags);
 
        mci_writel(host, CMD, cmd_flags | SDMMC_CMD_START);
@@ -380,6 +383,7 @@ static void dw_mci_start_command(struct dw_mci *host,
 static inline void send_stop_abort(struct dw_mci *host, struct mmc_data *data)
 {
        struct mmc_command *stop = data->stop ? data->stop : &host->stop_abort;
+
        dw_mci_start_command(host, stop, host->stop_cmdr);
 }
 
@@ -462,69 +466,102 @@ static void dw_mci_idmac_complete_dma(struct dw_mci *host)
 static void dw_mci_translate_sglist(struct dw_mci *host, struct mmc_data *data,
                                    unsigned int sg_len)
 {
+       unsigned int desc_len;
        int i;
+
        if (host->dma_64bit_address == 1) {
-               struct idmac_desc_64addr *desc = host->sg_cpu;
+               struct idmac_desc_64addr *desc_first, *desc_last, *desc;
 
-               for (i = 0; i < sg_len; i++, desc++) {
+               desc_first = desc_last = desc = host->sg_cpu;
+
+               for (i = 0; i < sg_len; i++) {
                        unsigned int length = sg_dma_len(&data->sg[i]);
+
                        u64 mem_addr = sg_dma_address(&data->sg[i]);
 
-                       /*
-                        * Set the OWN bit and disable interrupts for this
-                        * descriptor
-                        */
-                       desc->des0 = IDMAC_DES0_OWN | IDMAC_DES0_DIC |
-                                               IDMAC_DES0_CH;
-                       /* Buffer length */
-                       IDMAC_64ADDR_SET_BUFFER1_SIZE(desc, length);
-
-                       /* Physical address to DMA to/from */
-                       desc->des4 = mem_addr & 0xffffffff;
-                       desc->des5 = mem_addr >> 32;
+                       for ( ; length ; desc++) {
+                               desc_len = (length <= DW_MCI_DESC_DATA_LENGTH) ?
+                                          length : DW_MCI_DESC_DATA_LENGTH;
+
+                               length -= desc_len;
+
+                               /*
+                                * Set the OWN bit and disable interrupts
+                                * for this descriptor
+                                */
+                               desc->des0 = IDMAC_DES0_OWN | IDMAC_DES0_DIC |
+                                                       IDMAC_DES0_CH;
+
+                               /* Buffer length */
+                               IDMAC_64ADDR_SET_BUFFER1_SIZE(desc, desc_len);
+
+                               /* Physical address to DMA to/from */
+                               desc->des4 = mem_addr & 0xffffffff;
+                               desc->des5 = mem_addr >> 32;
+
+                               /* Update physical address for the next desc */
+                               mem_addr += desc_len;
+
+                               /* Save pointer to the last descriptor */
+                               desc_last = desc;
+                       }
                }
 
                /* Set first descriptor */
-               desc = host->sg_cpu;
-               desc->des0 |= IDMAC_DES0_FD;
+               desc_first->des0 |= IDMAC_DES0_FD;
 
                /* Set last descriptor */
-               desc = host->sg_cpu + (i - 1) *
-                               sizeof(struct idmac_desc_64addr);
-               desc->des0 &= ~(IDMAC_DES0_CH | IDMAC_DES0_DIC);
-               desc->des0 |= IDMAC_DES0_LD;
+               desc_last->des0 &= ~(IDMAC_DES0_CH | IDMAC_DES0_DIC);
+               desc_last->des0 |= IDMAC_DES0_LD;
 
        } else {
-               struct idmac_desc *desc = host->sg_cpu;
+               struct idmac_desc *desc_first, *desc_last, *desc;
 
-               for (i = 0; i < sg_len; i++, desc++) {
+               desc_first = desc_last = desc = host->sg_cpu;
+
+               for (i = 0; i < sg_len; i++) {
                        unsigned int length = sg_dma_len(&data->sg[i]);
+
                        u32 mem_addr = sg_dma_address(&data->sg[i]);
 
-                       /*
-                        * Set the OWN bit and disable interrupts for this
-                        * descriptor
-                        */
-                       desc->des0 = cpu_to_le32(IDMAC_DES0_OWN |
-                                       IDMAC_DES0_DIC | IDMAC_DES0_CH);
-                       /* Buffer length */
-                       IDMAC_SET_BUFFER1_SIZE(desc, length);
+                       for ( ; length ; desc++) {
+                               desc_len = (length <= DW_MCI_DESC_DATA_LENGTH) ?
+                                          length : DW_MCI_DESC_DATA_LENGTH;
+
+                               length -= desc_len;
+
+                               /*
+                                * Set the OWN bit and disable interrupts
+                                * for this descriptor
+                                */
+                               desc->des0 = cpu_to_le32(IDMAC_DES0_OWN |
+                                                        IDMAC_DES0_DIC |
+                                                        IDMAC_DES0_CH);
 
-                       /* Physical address to DMA to/from */
-                       desc->des2 = cpu_to_le32(mem_addr);
+                               /* Buffer length */
+                               IDMAC_SET_BUFFER1_SIZE(desc, desc_len);
+
+                               /* Physical address to DMA to/from */
+                               desc->des2 = cpu_to_le32(mem_addr);
+
+                               /* Update physical address for the next desc */
+                               mem_addr += desc_len;
+
+                               /* Save pointer to the last descriptor */
+                               desc_last = desc;
+                       }
                }
 
                /* Set first descriptor */
-               desc = host->sg_cpu;
-               desc->des0 |= cpu_to_le32(IDMAC_DES0_FD);
+               desc_first->des0 |= cpu_to_le32(IDMAC_DES0_FD);
 
                /* Set last descriptor */
-               desc = host->sg_cpu + (i - 1) * sizeof(struct idmac_desc);
-               desc->des0 &= cpu_to_le32(~(IDMAC_DES0_CH | IDMAC_DES0_DIC));
-               desc->des0 |= cpu_to_le32(IDMAC_DES0_LD);
+               desc_last->des0 &= cpu_to_le32(~(IDMAC_DES0_CH |
+                                              IDMAC_DES0_DIC));
+               desc_last->des0 |= cpu_to_le32(IDMAC_DES0_LD);
        }
 
-       wmb();
+       wmb(); /* drain writebuffer */
 }
 
 static void dw_mci_idmac_start_dma(struct dw_mci *host, unsigned int sg_len)
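The reworked dw_mci_translate_sglist() above splits each scatterlist entry across as many chained IDMAC descriptors as needed, since one descriptor carries at most DW_MCI_DESC_DATA_LENGTH (4 KiB). A minimal user-space sketch of that splitting arithmetic; the address, length and function names are illustrative, not part of the driver:

#include <stdio.h>

/* Mirrors DW_MCI_DESC_DATA_LENGTH above: one chained IDMAC descriptor
 * carries at most 4 KiB of a scatterlist entry. */
#define DESC_DATA_LENGTH 0x1000

/* Print how one scatterlist entry is split across descriptors. */
static void split_sg_entry(unsigned long long mem_addr, unsigned int length)
{
	while (length) {
		unsigned int desc_len = (length <= DESC_DATA_LENGTH) ?
					length : DESC_DATA_LENGTH;

		printf("desc: addr=0x%llx len=0x%x\n", mem_addr, desc_len);
		mem_addr += desc_len;	/* next descriptor continues here */
		length -= desc_len;
	}
}

int main(void)
{
	/* A 9 KiB entry becomes three descriptors: 4K + 4K + 1K. */
	split_sg_entry(0x80000000ULL, 0x2400);
	return 0;
}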
@@ -542,6 +579,7 @@ static void dw_mci_idmac_start_dma(struct dw_mci *host, unsigned int sg_len)
        temp |= SDMMC_CTRL_USE_IDMAC;
        mci_writel(host, CTRL, temp);
 
+       /* drain writebuffer */
        wmb();
 
        /* Enable the IDMAC */
@@ -589,7 +627,9 @@ static int dw_mci_idmac_init(struct dw_mci *host)
                host->ring_size = PAGE_SIZE / sizeof(struct idmac_desc);
 
                /* Forward link the descriptor list */
-               for (i = 0, p = host->sg_cpu; i < host->ring_size - 1; i++, p++) {
+               for (i = 0, p = host->sg_cpu;
+                    i < host->ring_size - 1;
+                    i++, p++) {
                        p->des3 = cpu_to_le32(host->sg_dma +
                                        (sizeof(struct idmac_desc) * (i + 1)));
                        p->des1 = 0;
@@ -718,7 +758,7 @@ static void dw_mci_adjust_fifoth(struct dw_mci *host, struct mmc_data *data)
        u32 fifo_width = 1 << host->data_shift;
        u32 blksz_depth = blksz / fifo_width, fifoth_val;
        u32 msize = 0, rx_wmark = 1, tx_wmark, tx_wmark_invers;
-       int idx = (sizeof(mszs) / sizeof(mszs[0])) - 1;
+       int idx = ARRAY_SIZE(mszs) - 1;
 
        tx_wmark = (host->fifo_depth) / 2;
        tx_wmark_invers = host->fifo_depth - tx_wmark;
@@ -843,6 +883,7 @@ static int dw_mci_submit_data_dma(struct dw_mci *host, struct mmc_data *data)
 static void dw_mci_submit_data(struct dw_mci *host, struct mmc_data *data)
 {
        unsigned long irqflags;
+       int flags = SG_MITER_ATOMIC;
        u32 temp;
 
        data->error = -EINPROGRESS;
@@ -859,7 +900,6 @@ static void dw_mci_submit_data(struct dw_mci *host, struct mmc_data *data)
        }
 
        if (dw_mci_submit_data_dma(host, data)) {
-               int flags = SG_MITER_ATOMIC;
                if (host->data->flags & MMC_DATA_READ)
                        flags |= SG_MITER_TO_SG;
                else
@@ -906,7 +946,7 @@ static void mci_send_cmd(struct dw_mci_slot *slot, u32 cmd, u32 arg)
        unsigned int cmd_status = 0;
 
        mci_writel(host, CMDARG, arg);
-       wmb();
+       wmb(); /* drain writebuffer */
        dw_mci_wait_while_busy(host, cmd);
        mci_writel(host, CMD, SDMMC_CMD_START | cmd);
 
@@ -1019,7 +1059,7 @@ static void __dw_mci_start_request(struct dw_mci *host,
 
        if (data) {
                dw_mci_submit_data(host, data);
-               wmb();
+               wmb(); /* drain writebuffer */
        }
 
        dw_mci_start_command(host, cmd, cmdflags);
@@ -1384,14 +1424,15 @@ static int dw_mci_execute_tuning(struct mmc_host *mmc, u32 opcode)
        struct dw_mci_slot *slot = mmc_priv(mmc);
        struct dw_mci *host = slot->host;
        const struct dw_mci_drv_data *drv_data = host->drv_data;
-       int err = -ENOSYS;
+       int err = -EINVAL;
 
        if (drv_data && drv_data->execute_tuning)
                err = drv_data->execute_tuning(slot);
        return err;
 }
 
-static int dw_mci_prepare_hs400_tuning(struct mmc_host *mmc, struct mmc_ios *ios)
+static int dw_mci_prepare_hs400_tuning(struct mmc_host *mmc,
+                                      struct mmc_ios *ios)
 {
        struct dw_mci_slot *slot = mmc_priv(mmc);
        struct dw_mci *host = slot->host;
@@ -1533,6 +1574,20 @@ static int dw_mci_data_complete(struct dw_mci *host, struct mmc_data *data)
        return data->error;
 }
 
+static void dw_mci_set_drto(struct dw_mci *host)
+{
+       unsigned int drto_clks;
+       unsigned int drto_ms;
+
+       drto_clks = mci_readl(host, TMOUT) >> 8;
+       drto_ms = DIV_ROUND_UP(drto_clks, host->bus_hz / 1000);
+
+       /* add a bit of spare time */
+       drto_ms += 10;
+
+       mod_timer(&host->dto_timer, jiffies + msecs_to_jiffies(drto_ms));
+}
+
 static void dw_mci_tasklet_func(unsigned long priv)
 {
        struct dw_mci *host = (struct dw_mci *)priv;
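dw_mci_set_drto() above converts the data read timeout from card clock cycles (TMOUT[31:8]) into milliseconds with DIV_ROUND_UP(drto_clks, bus_hz / 1000) and adds 10 ms of slack before arming the timer. A small worked example with illustrative numbers (a fully programmed TMOUT field on a 50 MHz card clock):

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	/* Illustrative numbers: TMOUT[31:8] fully programmed
	 * (0xFFFFFF card clock cycles) on a 50 MHz card clock. */
	unsigned int drto_clks = 0xFFFFFF;
	unsigned int bus_hz = 50000000;
	unsigned int drto_ms = DIV_ROUND_UP(drto_clks, bus_hz / 1000);

	drto_ms += 10;	/* same 10 ms of slack dw_mci_set_drto() adds */

	printf("DRTO timer: %u ms\n", drto_ms);	/* 346 ms */
	return 0;
}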
@@ -1610,8 +1665,16 @@ static void dw_mci_tasklet_func(unsigned long priv)
                        }
 
                        if (!test_and_clear_bit(EVENT_XFER_COMPLETE,
-                                               &host->pending_events))
+                                               &host->pending_events)) {
+                               /*
+                                * Arm the DRTO timer in case the data-related
+                                * interrupts never arrive while reading data.
+                                */
+                               if ((host->quirks & DW_MCI_QUIRK_BROKEN_DTO) &&
+                                   (host->dir_status == DW_MCI_RECV_STATUS))
+                                       dw_mci_set_drto(host);
                                break;
+                       }
 
                        set_bit(EVENT_XFER_COMPLETE, &host->completed_events);
 
@@ -1644,8 +1707,17 @@ static void dw_mci_tasklet_func(unsigned long priv)
 
                case STATE_DATA_BUSY:
                        if (!test_and_clear_bit(EVENT_DATA_COMPLETE,
-                                               &host->pending_events))
+                                               &host->pending_events)) {
+                               /*
+                                * A data error interrupt arrived, but the data
+                                * over interrupt may never follow while reading
+                                * data; arm the DRTO timer to catch that case.
+                                */
+                               if ((host->quirks & DW_MCI_QUIRK_BROKEN_DTO) &&
+                                   (host->dir_status == DW_MCI_RECV_STATUS))
+                                       dw_mci_set_drto(host);
                                break;
+                       }
 
                        host->data = NULL;
                        set_bit(EVENT_DATA_COMPLETE, &host->completed_events);
@@ -1743,7 +1815,7 @@ static int dw_mci_push_part_bytes(struct dw_mci *host, void *buf, int cnt)
 /* pull first bytes from part_buf, only use during pull */
 static int dw_mci_pull_part_bytes(struct dw_mci *host, void *buf, int cnt)
 {
-       cnt = min(cnt, (int)host->part_buf_count);
+       cnt = min_t(int, cnt, host->part_buf_count);
        if (cnt) {
                memcpy(buf, (void *)&host->part_buf + host->part_buf_start,
                       cnt);
@@ -1769,6 +1841,7 @@ static void dw_mci_push_data16(struct dw_mci *host, void *buf, int cnt)
        /* try and push anything in the part_buf */
        if (unlikely(host->part_buf_count)) {
                int len = dw_mci_push_part_bytes(host, buf, cnt);
+
                buf += len;
                cnt -= len;
                if (host->part_buf_count == 2) {
@@ -1795,6 +1868,7 @@ static void dw_mci_push_data16(struct dw_mci *host, void *buf, int cnt)
 #endif
        {
                u16 *pdata = buf;
+
                for (; cnt >= 2; cnt -= 2)
                        mci_fifo_writew(host->fifo_reg, *pdata++);
                buf = pdata;
@@ -1819,6 +1893,7 @@ static void dw_mci_pull_data16(struct dw_mci *host, void *buf, int cnt)
                        int len = min(cnt & -2, (int)sizeof(aligned_buf));
                        int items = len >> 1;
                        int i;
+
                        for (i = 0; i < items; ++i)
                                aligned_buf[i] = mci_fifo_readw(host->fifo_reg);
                        /* memcpy from aligned buffer into output buffer */
@@ -1830,6 +1905,7 @@ static void dw_mci_pull_data16(struct dw_mci *host, void *buf, int cnt)
 #endif
        {
                u16 *pdata = buf;
+
                for (; cnt >= 2; cnt -= 2)
                        *pdata++ = mci_fifo_readw(host->fifo_reg);
                buf = pdata;
@@ -1848,6 +1924,7 @@ static void dw_mci_push_data32(struct dw_mci *host, void *buf, int cnt)
        /* try and push anything in the part_buf */
        if (unlikely(host->part_buf_count)) {
                int len = dw_mci_push_part_bytes(host, buf, cnt);
+
                buf += len;
                cnt -= len;
                if (host->part_buf_count == 4) {
@@ -1874,6 +1951,7 @@ static void dw_mci_push_data32(struct dw_mci *host, void *buf, int cnt)
 #endif
        {
                u32 *pdata = buf;
+
                for (; cnt >= 4; cnt -= 4)
                        mci_fifo_writel(host->fifo_reg, *pdata++);
                buf = pdata;
@@ -1898,6 +1976,7 @@ static void dw_mci_pull_data32(struct dw_mci *host, void *buf, int cnt)
                        int len = min(cnt & -4, (int)sizeof(aligned_buf));
                        int items = len >> 2;
                        int i;
+
                        for (i = 0; i < items; ++i)
                                aligned_buf[i] = mci_fifo_readl(host->fifo_reg);
                        /* memcpy from aligned buffer into output buffer */
@@ -1909,6 +1988,7 @@ static void dw_mci_pull_data32(struct dw_mci *host, void *buf, int cnt)
 #endif
        {
                u32 *pdata = buf;
+
                for (; cnt >= 4; cnt -= 4)
                        *pdata++ = mci_fifo_readl(host->fifo_reg);
                buf = pdata;
@@ -1927,6 +2007,7 @@ static void dw_mci_push_data64(struct dw_mci *host, void *buf, int cnt)
        /* try and push anything in the part_buf */
        if (unlikely(host->part_buf_count)) {
                int len = dw_mci_push_part_bytes(host, buf, cnt);
+
                buf += len;
                cnt -= len;
 
@@ -1954,6 +2035,7 @@ static void dw_mci_push_data64(struct dw_mci *host, void *buf, int cnt)
 #endif
        {
                u64 *pdata = buf;
+
                for (; cnt >= 8; cnt -= 8)
                        mci_fifo_writeq(host->fifo_reg, *pdata++);
                buf = pdata;
@@ -1978,6 +2060,7 @@ static void dw_mci_pull_data64(struct dw_mci *host, void *buf, int cnt)
                        int len = min(cnt & -8, (int)sizeof(aligned_buf));
                        int items = len >> 3;
                        int i;
+
                        for (i = 0; i < items; ++i)
                                aligned_buf[i] = mci_fifo_readq(host->fifo_reg);
 
@@ -1990,6 +2073,7 @@ static void dw_mci_pull_data64(struct dw_mci *host, void *buf, int cnt)
 #endif
        {
                u64 *pdata = buf;
+
                for (; cnt >= 8; cnt -= 8)
                        *pdata++ = mci_fifo_readq(host->fifo_reg);
                buf = pdata;
@@ -2065,7 +2149,7 @@ static void dw_mci_read_data_pio(struct dw_mci *host, bool dto)
 done:
        sg_miter_stop(sg_miter);
        host->sg = NULL;
-       smp_wmb();
+       smp_wmb(); /* drain writebuffer */
        set_bit(EVENT_XFER_COMPLETE, &host->pending_events);
 }
 
@@ -2119,7 +2203,7 @@ static void dw_mci_write_data_pio(struct dw_mci *host)
 done:
        sg_miter_stop(sg_miter);
        host->sg = NULL;
-       smp_wmb();
+       smp_wmb(); /* drain writebuffer */
        set_bit(EVENT_XFER_COMPLETE, &host->pending_events);
 }
 
@@ -2128,7 +2212,7 @@ static void dw_mci_cmd_interrupt(struct dw_mci *host, u32 status)
        if (!host->cmd_status)
                host->cmd_status = status;
 
-       smp_wmb();
+       smp_wmb(); /* drain writebuffer */
 
        set_bit(EVENT_CMD_COMPLETE, &host->pending_events);
        tasklet_schedule(&host->tasklet);
@@ -2192,7 +2276,7 @@ static irqreturn_t dw_mci_interrupt(int irq, void *dev_id)
                if (pending & DW_MCI_CMD_ERROR_FLAGS) {
                        mci_writel(host, RINTSTS, DW_MCI_CMD_ERROR_FLAGS);
                        host->cmd_status = pending;
-                       smp_wmb();
+                       smp_wmb(); /* drain writebuffer */
                        set_bit(EVENT_CMD_COMPLETE, &host->pending_events);
                }
 
@@ -2200,16 +2284,19 @@ static irqreturn_t dw_mci_interrupt(int irq, void *dev_id)
                        /* if there is an error report DATA_ERROR */
                        mci_writel(host, RINTSTS, DW_MCI_DATA_ERROR_FLAGS);
                        host->data_status = pending;
-                       smp_wmb();
+                       smp_wmb(); /* drain writebuffer */
                        set_bit(EVENT_DATA_ERROR, &host->pending_events);
                        tasklet_schedule(&host->tasklet);
                }
 
                if (pending & SDMMC_INT_DATA_OVER) {
+                       if (host->quirks & DW_MCI_QUIRK_BROKEN_DTO)
+                               del_timer(&host->dto_timer);
+
                        mci_writel(host, RINTSTS, SDMMC_INT_DATA_OVER);
                        if (!host->data_status)
                                host->data_status = pending;
-                       smp_wmb();
+                       smp_wmb(); /* drain writebuffer */
                        if (host->dir_status == DW_MCI_RECV_STATUS) {
                                if (host->sg != NULL)
                                        dw_mci_read_data_pio(host, true);
@@ -2383,27 +2470,20 @@ static int dw_mci_init_slot(struct dw_mci *host, unsigned int id)
        if (ret)
                goto err_host_allocated;
 
-       if (host->pdata->blk_settings) {
-               mmc->max_segs = host->pdata->blk_settings->max_segs;
-               mmc->max_blk_size = host->pdata->blk_settings->max_blk_size;
-               mmc->max_blk_count = host->pdata->blk_settings->max_blk_count;
-               mmc->max_req_size = host->pdata->blk_settings->max_req_size;
-               mmc->max_seg_size = host->pdata->blk_settings->max_seg_size;
-       } else {
-               /* Useful defaults if platform data is unset. */
-#ifdef CONFIG_MMC_DW_IDMAC
+       /* Useful defaults if platform data is unset. */
+       if (host->use_dma) {
                mmc->max_segs = host->ring_size;
                mmc->max_blk_size = 65536;
                mmc->max_seg_size = 0x1000;
                mmc->max_req_size = mmc->max_seg_size * host->ring_size;
                mmc->max_blk_count = mmc->max_req_size / 512;
-#else
+       } else {
                mmc->max_segs = 64;
                mmc->max_blk_size = 65536; /* BLKSIZ is 16 bits */
                mmc->max_blk_count = 512;
-               mmc->max_req_size = mmc->max_blk_size * mmc->max_blk_count;
+               mmc->max_req_size = mmc->max_blk_size *
+                                   mmc->max_blk_count;
                mmc->max_seg_size = mmc->max_req_size;
-#endif /* CONFIG_MMC_DW_IDMAC */
        }
 
        if (dw_mci_get_cd(mmc))
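With the #ifdef CONFIG_MMC_DW_IDMAC block replaced by a runtime host->use_dma check above, the DMA limits follow from the descriptor ring. A worked example of those limits, assuming 4 KiB pages and the 16-byte 32-bit idmac_desc (four __le32 fields); the numbers are illustrative:

#include <stdio.h>

int main(void)
{
	/* Illustrative: 4 KiB pages, 16-byte descriptors. */
	unsigned int page_size = 4096;
	unsigned int desc_size = 16;

	unsigned int max_segs      = page_size / desc_size;	/* ring_size = 256 */
	unsigned int max_seg_size  = 0x1000;			/* 4 KiB */
	unsigned int max_req_size  = max_seg_size * max_segs;	/* 1 MiB */
	unsigned int max_blk_count = max_req_size / 512;	/* 2048 */

	printf("segs=%u seg=%u req=%u blks=%u\n",
	       max_segs, max_seg_size, max_req_size, max_blk_count);
	return 0;
}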
@@ -2473,8 +2553,8 @@ static void dw_mci_init_dma(struct dw_mci *host)
        if (host->dma_ops->init && host->dma_ops->start &&
            host->dma_ops->stop && host->dma_ops->cleanup) {
                if (host->dma_ops->init(host)) {
-                       dev_err(host->dev, "%s: Unable to initialize "
-                               "DMA Controller.\n", __func__);
+                       dev_err(host->dev, "%s: Unable to initialize DMA Controller.\n",
+                               __func__);
                        goto no_dma;
                }
        } else {
@@ -2488,7 +2568,6 @@ static void dw_mci_init_dma(struct dw_mci *host)
 no_dma:
        dev_info(host->dev, "Using PIO mode.\n");
        host->use_dma = 0;
-       return;
 }
 
 static bool dw_mci_ctrl_reset(struct dw_mci *host, u32 reset)
@@ -2542,6 +2621,7 @@ static bool dw_mci_reset(struct dw_mci *host)
                if (host->use_dma) {
                        unsigned long timeout = jiffies + msecs_to_jiffies(500);
                        u32 status;
+
                        do {
                                status = mci_readl(host, STATUS);
                                if (!(status & SDMMC_STATUS_DMA_REQ))
@@ -2551,8 +2631,8 @@ static bool dw_mci_reset(struct dw_mci *host)
 
                        if (status & SDMMC_STATUS_DMA_REQ) {
                                dev_err(host->dev,
-                                       "%s: Timeout waiting for dma_req to "
-                                       "clear during reset\n", __func__);
+                                       "%s: Timeout waiting for dma_req to clear during reset\n",
+                                       __func__);
                                goto ciu_out;
                        }
 
@@ -2563,8 +2643,8 @@ static bool dw_mci_reset(struct dw_mci *host)
        } else {
                /* if the controller reset bit did clear, then set clock regs */
                if (!(mci_readl(host, CTRL) & SDMMC_CTRL_RESET)) {
-                       dev_err(host->dev, "%s: fifo/dma reset bits didn't "
-                               "clear but ciu was reset, doing clock update\n",
+                       dev_err(host->dev,
+                               "%s: fifo/dma reset bits didn't clear but ciu was reset, doing clock update\n",
                                __func__);
                        goto ciu_out;
                }
@@ -2598,6 +2678,28 @@ static void dw_mci_cmd11_timer(unsigned long arg)
        tasklet_schedule(&host->tasklet);
 }
 
+static void dw_mci_dto_timer(unsigned long arg)
+{
+       struct dw_mci *host = (struct dw_mci *)arg;
+
+       switch (host->state) {
+       case STATE_SENDING_DATA:
+       case STATE_DATA_BUSY:
+               /*
+                * If the DTO interrupt does NOT arrive in the data-sending
+                * state, notify the driver to terminate the current transfer
+                * and report a data timeout to the core.
+                */
+               host->data_status = SDMMC_INT_DRTO;
+               set_bit(EVENT_DATA_ERROR, &host->pending_events);
+               set_bit(EVENT_DATA_COMPLETE, &host->pending_events);
+               tasklet_schedule(&host->tasklet);
+               break;
+       default:
+               break;
+       }
+}
+
 #ifdef CONFIG_OF
 static struct dw_mci_of_quirks {
        char *quirk;
@@ -2625,8 +2727,8 @@ static struct dw_mci_board *dw_mci_parse_dt(struct dw_mci *host)
        /* find out number of slots supported */
        if (of_property_read_u32(dev->of_node, "num-slots",
                                &pdata->num_slots)) {
-               dev_info(dev, "num-slots property not found, "
-                               "assuming 1 slot is available\n");
+               dev_info(dev,
+                        "num-slots property not found, assuming 1 slot is available\n");
                pdata->num_slots = 1;
        }
 
@@ -2636,8 +2738,8 @@ static struct dw_mci_board *dw_mci_parse_dt(struct dw_mci *host)
                        pdata->quirks |= of_quirks[idx].id;
 
        if (of_property_read_u32(np, "fifo-depth", &pdata->fifo_depth))
-               dev_info(dev, "fifo-depth property not found, using "
-                               "value of FIFOTH register as default\n");
+               dev_info(dev,
+                        "fifo-depth property not found, using value of FIFOTH register as default\n");
 
        of_property_read_u32(np, "card-detect-delay", &pdata->detect_delay_ms);
 
@@ -2650,8 +2752,10 @@ static struct dw_mci_board *dw_mci_parse_dt(struct dw_mci *host)
                        return ERR_PTR(ret);
        }
 
-       if (of_find_property(np, "supports-highspeed", NULL))
+       if (of_find_property(np, "supports-highspeed", NULL)) {
+               dev_info(dev, "supports-highspeed property is deprecated.\n");
                pdata->caps |= MMC_CAP_SD_HIGHSPEED | MMC_CAP_MMC_HIGHSPEED;
+       }
 
        return pdata;
 }
@@ -2706,7 +2810,7 @@ int dw_mci_probe(struct dw_mci *host)
                }
        }
 
-       if (host->pdata->num_slots > 1) {
+       if (host->pdata->num_slots < 1) {
                dev_err(host->dev,
                        "Platform data must supply num_slots.\n");
                return -ENODEV;
@@ -2774,6 +2878,10 @@ int dw_mci_probe(struct dw_mci *host)
 
        host->quirks = host->pdata->quirks;
 
+       if (host->quirks & DW_MCI_QUIRK_BROKEN_DTO)
+               setup_timer(&host->dto_timer,
+                           dw_mci_dto_timer, (unsigned long)host);
+
        spin_lock_init(&host->lock);
        spin_lock_init(&host->irq_lock);
        INIT_LIST_HEAD(&host->queue);
@@ -2874,11 +2982,11 @@ int dw_mci_probe(struct dw_mci *host)
        mci_writel(host, INTMASK, SDMMC_INT_CMD_DONE | SDMMC_INT_DATA_OVER |
                   SDMMC_INT_TXDR | SDMMC_INT_RXDR |
                   DW_MCI_ERROR_FLAGS);
-       mci_writel(host, CTRL, SDMMC_CTRL_INT_ENABLE); /* Enable mci interrupt */
+       /* Enable mci interrupt */
+       mci_writel(host, CTRL, SDMMC_CTRL_INT_ENABLE);
 
-       dev_info(host->dev, "DW MMC controller at irq %d, "
-                "%d bit host data width, "
-                "%u deep fifo\n",
+       dev_info(host->dev,
+                "DW MMC controller at irq %d, %d bit host data width, %u deep fifo\n",
                 host->irq, width, fifo_size);
 
        /* We need at least one slot to succeed */
@@ -2893,8 +3001,9 @@ int dw_mci_probe(struct dw_mci *host)
        if (init_slots) {
                dev_info(host->dev, "%d slots initialized\n", init_slots);
        } else {
-               dev_dbg(host->dev, "attempted to initialize %d slots, "
-                                       "but failed on all\n", host->num_slots);
+               dev_dbg(host->dev,
+                       "attempted to initialize %d slots, but failed on all\n",
+                       host->num_slots);
                goto err_dmaunmap;
        }
 
@@ -2992,6 +3101,7 @@ int dw_mci_resume(struct dw_mci *host)
 
        for (i = 0; i < host->num_slots; i++) {
                struct dw_mci_slot *slot = host->slot[i];
+
                if (!slot)
                        continue;
                if (slot->mmc->pm_flags & MMC_PM_KEEP_POWER) {
index 68dd6c79c378c8a355a36e23300eb996bfcbc720..b763b11ed9e1e72320a2d8e4a8d9e393b229766e 100644 (file)
@@ -948,6 +948,7 @@ mmc_omap_prepare_data(struct mmc_omap_host *host, struct mmc_request *req)
 {
        struct mmc_data *data = req->data;
        int i, use_dma = 1, block_size;
+       struct scatterlist *sg;
        unsigned sg_len;
 
        host->data = data;
@@ -972,8 +973,8 @@ mmc_omap_prepare_data(struct mmc_omap_host *host, struct mmc_request *req)
        sg_len = (data->blocks == 1) ? 1 : data->sg_len;
 
        /* Only do DMA for entire blocks */
-       for (i = 0; i < sg_len; i++) {
-               if ((data->sg[i].length % block_size) != 0) {
+       for_each_sg(data->sg, sg, sg_len, i) {
+               if ((sg->length % block_size) != 0) {
                        use_dma = 0;
                        break;
                }
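The for_each_sg() conversion above keeps the same policy: DMA is used only when every scatterlist segment is a whole number of blocks. A toy user-space check of that rule; the segment lengths and block size are illustrative:

#include <stdio.h>
#include <stdbool.h>

/* DMA is allowed only when every segment is a multiple of the block size. */
static bool can_use_dma(const unsigned int *seg_len, int n, unsigned int blksz)
{
	for (int i = 0; i < n; i++)
		if (seg_len[i] % blksz)
			return false;
	return true;
}

int main(void)
{
	unsigned int whole[]   = { 512, 1024, 512 };
	unsigned int partial[] = { 512, 300 };

	printf("whole blocks: %d, partial block: %d\n",
	       can_use_dma(whole, 3, 512), can_use_dma(partial, 2, 512));
	return 0;
}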
@@ -1419,8 +1420,10 @@ static int mmc_omap_probe(struct platform_device *pdev)
        host->reg_shift = (mmc_omap7xx() ? 1 : 2);
 
        host->mmc_omap_wq = alloc_workqueue("mmc_omap", 0, 0);
-       if (!host->mmc_omap_wq)
+       if (!host->mmc_omap_wq) {
+               ret = -ENOMEM;
                goto err_plat_cleanup;
+       }
 
        for (i = 0; i < pdata->nr_slots; i++) {
                ret = mmc_omap_new_slot(host, i);
index 4d120323689043f21c522b44389757f8391d45fc..781e4db317671ce6146dea121a56f42f90e7c491 100644 (file)
@@ -181,18 +181,9 @@ struct omap_hsmmc_host {
        struct  mmc_data        *data;
        struct  clk             *fclk;
        struct  clk             *dbclk;
-       /*
-        * vcc == configured supply
-        * vcc_aux == optional
-        *   -  MMC1, supply for DAT4..DAT7
-        *   -  MMC2/MMC2, external level shifter voltage supply, for
-        *      chip (SDIO, eMMC, etc) or transceiver (MMC2 only)
-        */
-       struct  regulator       *vcc;
-       struct  regulator       *vcc_aux;
        struct  regulator       *pbias;
-       bool                    pbias_enabled;
        void    __iomem         *base;
+       int                     vqmmc_enabled;
        resource_size_t         mapbase;
        spinlock_t              irq_lock; /* Prevent races with irq handler */
        unsigned int            dma_len;
@@ -213,7 +204,6 @@ struct omap_hsmmc_host {
        int                     context_loss;
        int                     protect_card;
        int                     reqs_blocked;
-       int                     use_reg;
        int                     req_in_progress;
        unsigned long           clk_rate;
        unsigned int            flags;
@@ -254,32 +244,133 @@ static int omap_hsmmc_get_cover_state(struct device *dev)
        return mmc_gpio_get_cd(host->mmc);
 }
 
-#ifdef CONFIG_REGULATOR
+static int omap_hsmmc_enable_supply(struct mmc_host *mmc)
+{
+       int ret;
+       struct omap_hsmmc_host *host = mmc_priv(mmc);
+       struct mmc_ios *ios = &mmc->ios;
+
+       if (mmc->supply.vmmc) {
+               ret = mmc_regulator_set_ocr(mmc, mmc->supply.vmmc, ios->vdd);
+               if (ret)
+                       return ret;
+       }
+
+       /* Enable interface voltage rail, if needed */
+       if (mmc->supply.vqmmc && !host->vqmmc_enabled) {
+               ret = regulator_enable(mmc->supply.vqmmc);
+               if (ret) {
+                       dev_err(mmc_dev(mmc), "vmmc_aux reg enable failed\n");
+                       goto err_vqmmc;
+               }
+               host->vqmmc_enabled = 1;
+       }
+
+       return 0;
+
+err_vqmmc:
+       if (mmc->supply.vmmc)
+               mmc_regulator_set_ocr(mmc, mmc->supply.vmmc, 0);
+
+       return ret;
+}
+
+static int omap_hsmmc_disable_supply(struct mmc_host *mmc)
+{
+       int ret;
+       int status;
+       struct omap_hsmmc_host *host = mmc_priv(mmc);
+
+       if (mmc->supply.vqmmc && host->vqmmc_enabled) {
+               ret = regulator_disable(mmc->supply.vqmmc);
+               if (ret) {
+                       dev_err(mmc_dev(mmc), "vmmc_aux reg disable failed\n");
+                       return ret;
+               }
+               host->vqmmc_enabled = 0;
+       }
+
+       if (mmc->supply.vmmc) {
+               ret = mmc_regulator_set_ocr(mmc, mmc->supply.vmmc, 0);
+               if (ret)
+                       goto err_set_ocr;
+       }
+
+       return 0;
+
+err_set_ocr:
+       if (mmc->supply.vqmmc) {
+               status = regulator_enable(mmc->supply.vqmmc);
+               if (status)
+                       dev_err(mmc_dev(mmc), "vmmc_aux re-enable failed\n");
+       }
+
+       return ret;
+}
+
+static int omap_hsmmc_set_pbias(struct omap_hsmmc_host *host, bool power_on,
+                               int vdd)
+{
+       int ret;
+
+       if (!host->pbias)
+               return 0;
+
+       if (power_on) {
+               if (vdd <= VDD_165_195)
+                       ret = regulator_set_voltage(host->pbias, VDD_1V8,
+                                                   VDD_1V8);
+               else
+                       ret = regulator_set_voltage(host->pbias, VDD_3V0,
+                                                   VDD_3V0);
+               if (ret < 0) {
+                       dev_err(host->dev, "pbias set voltage fail\n");
+                       return ret;
+               }
+
+               if (!regulator_is_enabled(host->pbias)) {
+                       ret = regulator_enable(host->pbias);
+                       if (ret) {
+                               dev_err(host->dev, "pbias reg enable fail\n");
+                               return ret;
+                       }
+               }
+       } else {
+               if (regulator_is_enabled(host->pbias)) {
+                       ret = regulator_disable(host->pbias);
+                       if (ret) {
+                               dev_err(host->dev, "pbias reg disable fail\n");
+                               return ret;
+                       }
+               }
+       }
+
+       return 0;
+}
 
 static int omap_hsmmc_set_power(struct device *dev, int power_on, int vdd)
 {
        struct omap_hsmmc_host *host =
                platform_get_drvdata(to_platform_device(dev));
+       struct mmc_host *mmc = host->mmc;
        int ret = 0;
 
+       if (mmc_pdata(host)->set_power)
+               return mmc_pdata(host)->set_power(dev, power_on, vdd);
+
        /*
         * If we don't see a Vcc regulator, assume it's a fixed
         * voltage always-on regulator.
         */
-       if (!host->vcc)
+       if (!mmc->supply.vmmc)
                return 0;
 
        if (mmc_pdata(host)->before_set_reg)
                mmc_pdata(host)->before_set_reg(dev, power_on, vdd);
 
-       if (host->pbias) {
-               if (host->pbias_enabled == 1) {
-                       ret = regulator_disable(host->pbias);
-                       if (!ret)
-                               host->pbias_enabled = 0;
-               }
-               regulator_set_voltage(host->pbias, VDD_3V0, VDD_3V0);
-       }
+       ret = omap_hsmmc_set_pbias(host, false, 0);
+       if (ret)
+               return ret;
 
        /*
         * Assume Vcc regulator is used only to power the card ... OMAP
@@ -295,129 +386,138 @@ static int omap_hsmmc_set_power(struct device *dev, int power_on, int vdd)
         * chips/cards need an interface voltage rail too.
         */
        if (power_on) {
-               if (host->vcc)
-                       ret = mmc_regulator_set_ocr(host->mmc, host->vcc, vdd);
-               /* Enable interface voltage rail, if needed */
-               if (ret == 0 && host->vcc_aux) {
-                       ret = regulator_enable(host->vcc_aux);
-                       if (ret < 0 && host->vcc)
-                               ret = mmc_regulator_set_ocr(host->mmc,
-                                                       host->vcc, 0);
-               }
-       } else {
-               /* Shut down the rail */
-               if (host->vcc_aux)
-                       ret = regulator_disable(host->vcc_aux);
-               if (host->vcc) {
-                       /* Then proceed to shut down the local regulator */
-                       ret = mmc_regulator_set_ocr(host->mmc,
-                                               host->vcc, 0);
-               }
-       }
-
-       if (host->pbias) {
-               if (vdd <= VDD_165_195)
-                       ret = regulator_set_voltage(host->pbias, VDD_1V8,
-                                                               VDD_1V8);
-               else
-                       ret = regulator_set_voltage(host->pbias, VDD_3V0,
-                                                               VDD_3V0);
-               if (ret < 0)
-                       goto error_set_power;
+               ret = omap_hsmmc_enable_supply(mmc);
+               if (ret)
+                       return ret;
 
-               if (host->pbias_enabled == 0) {
-                       ret = regulator_enable(host->pbias);
-                       if (!ret)
-                               host->pbias_enabled = 1;
-               }
+               ret = omap_hsmmc_set_pbias(host, true, vdd);
+               if (ret)
+                       goto err_set_voltage;
+       } else {
+               ret = omap_hsmmc_disable_supply(mmc);
+               if (ret)
+                       return ret;
        }
 
        if (mmc_pdata(host)->after_set_reg)
                mmc_pdata(host)->after_set_reg(dev, power_on, vdd);
 
-error_set_power:
+       return 0;
+
+err_set_voltage:
+       omap_hsmmc_disable_supply(mmc);
+
        return ret;
 }
 
-static int omap_hsmmc_reg_get(struct omap_hsmmc_host *host)
+static int omap_hsmmc_disable_boot_regulator(struct regulator *reg)
 {
-       struct regulator *reg;
-       int ocr_value = 0;
+       int ret;
 
-       reg = devm_regulator_get(host->dev, "vmmc");
-       if (IS_ERR(reg)) {
-               dev_err(host->dev, "unable to get vmmc regulator %ld\n",
-                       PTR_ERR(reg));
-               return PTR_ERR(reg);
-       } else {
-               host->vcc = reg;
-               ocr_value = mmc_regulator_get_ocrmask(reg);
-               if (!mmc_pdata(host)->ocr_mask) {
-                       mmc_pdata(host)->ocr_mask = ocr_value;
-               } else {
-                       if (!(mmc_pdata(host)->ocr_mask & ocr_value)) {
-                               dev_err(host->dev, "ocrmask %x is not supported\n",
-                                       mmc_pdata(host)->ocr_mask);
-                               mmc_pdata(host)->ocr_mask = 0;
-                               return -EINVAL;
-                       }
-               }
+       if (!reg)
+               return 0;
+
+       if (regulator_is_enabled(reg)) {
+               ret = regulator_enable(reg);
+               if (ret)
+                       return ret;
+
+               ret = regulator_disable(reg);
+               if (ret)
+                       return ret;
        }
-       mmc_pdata(host)->set_power = omap_hsmmc_set_power;
 
-       /* Allow an aux regulator */
-       reg = devm_regulator_get_optional(host->dev, "vmmc_aux");
-       host->vcc_aux = IS_ERR(reg) ? NULL : reg;
+       return 0;
+}
 
-       reg = devm_regulator_get_optional(host->dev, "pbias");
-       host->pbias = IS_ERR(reg) ? NULL : reg;
+static int omap_hsmmc_disable_boot_regulators(struct omap_hsmmc_host *host)
+{
+       struct mmc_host *mmc = host->mmc;
+       int ret;
 
-       /* For eMMC do not power off when not in sleep state */
-       if (mmc_pdata(host)->no_regulator_off_init)
-               return 0;
        /*
-        * To disable boot_on regulator, enable regulator
-        * to increase usecount and then disable it.
+        * Disable regulators enabled during boot and get the usecount
+        * right so that regulators can be enabled/disabled by checking
+        * the return value of regulator_is_enabled().
         */
-       if ((host->vcc && regulator_is_enabled(host->vcc) > 0) ||
-           (host->vcc_aux && regulator_is_enabled(host->vcc_aux))) {
-               int vdd = ffs(mmc_pdata(host)->ocr_mask) - 1;
+       ret = omap_hsmmc_disable_boot_regulator(mmc->supply.vmmc);
+       if (ret) {
+               dev_err(host->dev, "failed to disable boot enabled vmmc reg\n");
+               return ret;
+       }
+
+       ret = omap_hsmmc_disable_boot_regulator(mmc->supply.vqmmc);
+       if (ret) {
+               dev_err(host->dev,
+                       "failed to disable boot enabled vmmc_aux reg\n");
+               return ret;
+       }
 
-               mmc_pdata(host)->set_power(host->dev, 1, vdd);
-               mmc_pdata(host)->set_power(host->dev, 0, 0);
+       ret = omap_hsmmc_disable_boot_regulator(host->pbias);
+       if (ret) {
+               dev_err(host->dev,
+                       "failed to disable boot enabled pbias reg\n");
+               return ret;
        }
 
        return 0;
 }
 
-static void omap_hsmmc_reg_put(struct omap_hsmmc_host *host)
+static int omap_hsmmc_reg_get(struct omap_hsmmc_host *host)
 {
-       mmc_pdata(host)->set_power = NULL;
-}
+       int ocr_value = 0;
+       int ret;
+       struct mmc_host *mmc = host->mmc;
 
-static inline int omap_hsmmc_have_reg(void)
-{
-       return 1;
-}
+       if (mmc_pdata(host)->set_power)
+               return 0;
 
-#else
+       mmc->supply.vmmc = devm_regulator_get_optional(host->dev, "vmmc");
+       if (IS_ERR(mmc->supply.vmmc)) {
+               ret = PTR_ERR(mmc->supply.vmmc);
+               if (ret != -ENODEV)
+                       return ret;
+               dev_dbg(host->dev, "unable to get vmmc regulator %ld\n",
+                       PTR_ERR(mmc->supply.vmmc));
+               mmc->supply.vmmc = NULL;
+       } else {
+               ocr_value = mmc_regulator_get_ocrmask(mmc->supply.vmmc);
+               if (ocr_value > 0)
+                       mmc_pdata(host)->ocr_mask = ocr_value;
+       }
 
-static inline int omap_hsmmc_reg_get(struct omap_hsmmc_host *host)
-{
-       return -EINVAL;
-}
+       /* Allow an aux regulator */
+       mmc->supply.vqmmc = devm_regulator_get_optional(host->dev, "vmmc_aux");
+       if (IS_ERR(mmc->supply.vqmmc)) {
+               ret = PTR_ERR(mmc->supply.vqmmc);
+               if (ret != -ENODEV)
+                       return ret;
+               dev_dbg(host->dev, "unable to get vmmc_aux regulator %ld\n",
+                       PTR_ERR(mmc->supply.vqmmc));
+               mmc->supply.vqmmc = NULL;
+       }
 
-static inline void omap_hsmmc_reg_put(struct omap_hsmmc_host *host)
-{
-}
+       host->pbias = devm_regulator_get_optional(host->dev, "pbias");
+       if (IS_ERR(host->pbias)) {
+               ret = PTR_ERR(host->pbias);
+               if (ret != -ENODEV)
+                       return ret;
+               dev_dbg(host->dev, "unable to get pbias regulator %ld\n",
+                       PTR_ERR(host->pbias));
+               host->pbias = NULL;
+       }
+
+       /* For eMMC do not power off when not in sleep state */
+       if (mmc_pdata(host)->no_regulator_off_init)
+               return 0;
+
+       ret = omap_hsmmc_disable_boot_regulators(host);
+       if (ret)
+               return ret;
 
-static inline int omap_hsmmc_have_reg(void)
-{
        return 0;
 }
 
-#endif
-
 static irqreturn_t omap_hsmmc_cover_irq(int irq, void *dev_id);
 
 static int omap_hsmmc_gpio_init(struct mmc_host *mmc,
@@ -1149,11 +1249,11 @@ static int omap_hsmmc_switch_opcond(struct omap_hsmmc_host *host, int vdd)
                clk_disable_unprepare(host->dbclk);
 
        /* Turn the power off */
-       ret = mmc_pdata(host)->set_power(host->dev, 0, 0);
+       ret = omap_hsmmc_set_power(host->dev, 0, 0);
 
        /* Turn the power ON with given VDD 1.8 or 3.0v */
        if (!ret)
-               ret = mmc_pdata(host)->set_power(host->dev, 1, vdd);
+               ret = omap_hsmmc_set_power(host->dev, 1, vdd);
        pm_runtime_get_sync(host->dev);
        if (host->dbclk)
                clk_prepare_enable(host->dbclk);
@@ -1552,10 +1652,10 @@ static void omap_hsmmc_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
        if (ios->power_mode != host->power_mode) {
                switch (ios->power_mode) {
                case MMC_POWER_OFF:
-                       mmc_pdata(host)->set_power(host->dev, 0, 0);
+                       omap_hsmmc_set_power(host->dev, 0, 0);
                        break;
                case MMC_POWER_UP:
-                       mmc_pdata(host)->set_power(host->dev, 1, ios->vdd);
+                       omap_hsmmc_set_power(host->dev, 1, ios->vdd);
                        break;
                case MMC_POWER_ON:
                        do_send_init_stream = 1;
@@ -1953,7 +2053,7 @@ static int omap_hsmmc_probe(struct platform_device *pdev)
        host->base      = base + pdata->reg_offset;
        host->power_mode = MMC_POWER_OFF;
        host->next_data.cookie = 1;
-       host->pbias_enabled = 0;
+       host->vqmmc_enabled = 0;
 
        ret = omap_hsmmc_gpio_init(mmc, host, pdata);
        if (ret)
@@ -2078,12 +2178,9 @@ static int omap_hsmmc_probe(struct platform_device *pdev)
                goto err_irq;
        }
 
-       if (omap_hsmmc_have_reg() && !mmc_pdata(host)->set_power) {
-               ret = omap_hsmmc_reg_get(host);
-               if (ret)
-                       goto err_irq;
-               host->use_reg = 1;
-       }
+       ret = omap_hsmmc_reg_get(host);
+       if (ret)
+               goto err_irq;
 
        mmc->ocr_avail = mmc_pdata(host)->ocr_mask;
 
@@ -2125,8 +2222,6 @@ static int omap_hsmmc_probe(struct platform_device *pdev)
 
 err_slot_name:
        mmc_remove_host(mmc);
-       if (host->use_reg)
-               omap_hsmmc_reg_put(host);
 err_irq:
        device_init_wakeup(&pdev->dev, false);
        if (host->tx_chan)
@@ -2150,8 +2245,6 @@ static int omap_hsmmc_remove(struct platform_device *pdev)
 
        pm_runtime_get_sync(host->dev);
        mmc_remove_host(host->mmc);
-       if (host->use_reg)
-               omap_hsmmc_reg_put(host);
 
        if (host->tx_chan)
                dma_release_channel(host->tx_chan);
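The omap_hsmmc regulator rework above relies on regulator_is_enabled() to decide whether a rail needs toggling, which only works once the kernel's use count matches the state the bootloader left behind; omap_hsmmc_disable_boot_regulator() therefore does an enable/disable pair on rails that are already on. A toy user-space model of that use-count trick; the struct and function names are illustrative, not the regulator framework:

#include <stdio.h>
#include <stdbool.h>

/* Toy model: a rail left on by the bootloader is physically enabled but
 * has a zero use count, so an enable/disable pair brings the count and
 * the hardware state in sync. */
struct toy_regulator {
	bool hw_enabled;	/* state left by the bootloader */
	int use_count;		/* kernel-side reference count */
};

static void toy_enable(struct toy_regulator *r)
{
	r->use_count++;
	r->hw_enabled = true;
}

static void toy_disable(struct toy_regulator *r)
{
	if (--r->use_count == 0)
		r->hw_enabled = false;
}

int main(void)
{
	struct toy_regulator vmmc = { .hw_enabled = true, .use_count = 0 };

	/* Mirrors omap_hsmmc_disable_boot_regulator(): bump the count,
	 * then drop it so the rail is really switched off. */
	if (vmmc.hw_enabled) {
		toy_enable(&vmmc);
		toy_disable(&vmmc);
	}

	printf("vmmc: hw_enabled=%d use_count=%d\n",
	       vmmc.hw_enabled, vmmc.use_count);
	return 0;
}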
index 1b6d0bfe35f53c2d474d884feda7c7b1271911c7..1420f29628c70d8e8fdedbfa3fe7d77f1ba0ae0b 100644 (file)
@@ -22,7 +22,9 @@
 #include <linux/platform_device.h>
 #include <linux/delay.h>
 #include <linux/interrupt.h>
+#include <linux/dmaengine.h>
 #include <linux/dma-mapping.h>
+#include <linux/dma/pxa-dma.h>
 #include <linux/clk.h>
 #include <linux/err.h>
 #include <linux/mmc/host.h>
@@ -37,7 +39,6 @@
 #include <asm/sizes.h>
 
 #include <mach/hardware.h>
-#include <mach/dma.h>
 #include <linux/platform_data/mmc-pxamci.h>
 
 #include "pxamci.h"
@@ -58,7 +59,6 @@ struct pxamci_host {
        struct clk              *clk;
        unsigned long           clkrate;
        int                     irq;
-       int                     dma;
        unsigned int            clkrt;
        unsigned int            cmdat;
        unsigned int            imask;
@@ -69,8 +69,10 @@ struct pxamci_host {
        struct mmc_command      *cmd;
        struct mmc_data         *data;
 
+       struct dma_chan         *dma_chan_rx;
+       struct dma_chan         *dma_chan_tx;
+       dma_cookie_t            dma_cookie;
        dma_addr_t              sg_dma;
-       struct pxa_dma_desc     *sg_cpu;
        unsigned int            dma_len;
 
        unsigned int            dma_dir;
@@ -173,14 +175,18 @@ static void pxamci_disable_irq(struct pxamci_host *host, unsigned int mask)
        spin_unlock_irqrestore(&host->lock, flags);
 }
 
+static void pxamci_dma_irq(void *param);
+
 static void pxamci_setup_data(struct pxamci_host *host, struct mmc_data *data)
 {
+       struct dma_async_tx_descriptor *tx;
+       enum dma_data_direction direction;
+       struct dma_slave_config config;
+       struct dma_chan *chan;
        unsigned int nob = data->blocks;
        unsigned long long clks;
        unsigned int timeout;
-       bool dalgn = 0;
-       u32 dcmd;
-       int i;
+       int ret;
 
        host->data = data;
 
@@ -195,54 +201,48 @@ static void pxamci_setup_data(struct pxamci_host *host, struct mmc_data *data)
        timeout = (unsigned int)clks + (data->timeout_clks << host->clkrt);
        writel((timeout + 255) / 256, host->base + MMC_RDTO);
 
+       memset(&config, 0, sizeof(config));
+       config.src_addr_width = DMA_SLAVE_BUSWIDTH_1_BYTE;
+       config.dst_addr_width = DMA_SLAVE_BUSWIDTH_1_BYTE;
+       config.src_addr = host->res->start + MMC_RXFIFO;
+       config.dst_addr = host->res->start + MMC_TXFIFO;
+       config.src_maxburst = 32;
+       config.dst_maxburst = 32;
+
        if (data->flags & MMC_DATA_READ) {
                host->dma_dir = DMA_FROM_DEVICE;
-               dcmd = DCMD_INCTRGADDR | DCMD_FLOWSRC;
-               DRCMR(host->dma_drcmrtx) = 0;
-               DRCMR(host->dma_drcmrrx) = host->dma | DRCMR_MAPVLD;
+               direction = DMA_DEV_TO_MEM;
+               chan = host->dma_chan_rx;
        } else {
                host->dma_dir = DMA_TO_DEVICE;
-               dcmd = DCMD_INCSRCADDR | DCMD_FLOWTRG;
-               DRCMR(host->dma_drcmrrx) = 0;
-               DRCMR(host->dma_drcmrtx) = host->dma | DRCMR_MAPVLD;
+               direction = DMA_MEM_TO_DEV;
+               chan = host->dma_chan_tx;
        }
 
-       dcmd |= DCMD_BURST32 | DCMD_WIDTH1;
+       config.direction = direction;
+
+       ret = dmaengine_slave_config(chan, &config);
+       if (ret < 0) {
+               dev_err(mmc_dev(host->mmc), "dma slave config failed\n");
+               return;
+       }
 
-       host->dma_len = dma_map_sg(mmc_dev(host->mmc), data->sg, data->sg_len,
+       host->dma_len = dma_map_sg(chan->device->dev, data->sg, data->sg_len,
                                   host->dma_dir);
 
-       for (i = 0; i < host->dma_len; i++) {
-               unsigned int length = sg_dma_len(&data->sg[i]);
-               host->sg_cpu[i].dcmd = dcmd | length;
-               if (length & 31 && !(data->flags & MMC_DATA_READ))
-                       host->sg_cpu[i].dcmd |= DCMD_ENDIRQEN;
-               /* Not aligned to 8-byte boundary? */
-               if (sg_dma_address(&data->sg[i]) & 0x7)
-                       dalgn = 1;
-               if (data->flags & MMC_DATA_READ) {
-                       host->sg_cpu[i].dsadr = host->res->start + MMC_RXFIFO;
-                       host->sg_cpu[i].dtadr = sg_dma_address(&data->sg[i]);
-               } else {
-                       host->sg_cpu[i].dsadr = sg_dma_address(&data->sg[i]);
-                       host->sg_cpu[i].dtadr = host->res->start + MMC_TXFIFO;
-               }
-               host->sg_cpu[i].ddadr = host->sg_dma + (i + 1) *
-                                       sizeof(struct pxa_dma_desc);
+       tx = dmaengine_prep_slave_sg(chan, data->sg, host->dma_len, direction,
+                                    DMA_PREP_INTERRUPT);
+       if (!tx) {
+               dev_err(mmc_dev(host->mmc), "prep_slave_sg() failed\n");
+               return;
        }
-       host->sg_cpu[host->dma_len - 1].ddadr = DDADR_STOP;
-       wmb();
 
-       /*
-        * The PXA27x DMA controller encounters overhead when working with
-        * unaligned (to 8-byte boundaries) data, so switch on byte alignment
-        * mode only if we have unaligned data.
-        */
-       if (dalgn)
-               DALGN |= (1 << host->dma);
-       else
-               DALGN &= ~(1 << host->dma);
-       DDADR(host->dma) = host->sg_dma;
+       if (!(data->flags & MMC_DATA_READ)) {
+               tx->callback = pxamci_dma_irq;
+               tx->callback_param = host;
+       }
+
+       host->dma_cookie = dmaengine_submit(tx);
 
        /*
         * workaround for erratum #91:
@@ -251,7 +251,7 @@ static void pxamci_setup_data(struct pxamci_host *host, struct mmc_data *data)
         * before starting DMA.
         */
        if (!cpu_is_pxa27x() || data->flags & MMC_DATA_READ)
-               DCSR(host->dma) = DCSR_RUN;
+               dma_async_issue_pending(chan);
 }
 
 static void pxamci_start_cmd(struct pxamci_host *host, struct mmc_command *cmd, unsigned int cmdat)
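The converted pxamci_setup_data() above keeps the erratum #91 workaround: on PXA27x, write transfers only get dma_async_issue_pending() after the command completes, while reads (and other SoCs) start DMA immediately. A toy user-space model of that decision; the function and flag names are illustrative:

#include <stdio.h>
#include <stdbool.h>

/* Erratum #91 workaround: defer DMA start for PXA27x writes only. */
static bool start_dma_immediately(bool is_pxa27x, bool is_read)
{
	return !is_pxa27x || is_read;
}

int main(void)
{
	printf("pxa27x write: %d\n", start_dma_immediately(true, false));	/* 0: deferred */
	printf("pxa27x read : %d\n", start_dma_immediately(true, true));	/* 1 */
	printf("other  write: %d\n", start_dma_immediately(false, false));	/* 1 */
	return 0;
}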
@@ -343,7 +343,7 @@ static int pxamci_cmd_done(struct pxamci_host *host, unsigned int stat)
                 * enable DMA late
                 */
                if (cpu_is_pxa27x() && host->data->flags & MMC_DATA_WRITE)
-                       DCSR(host->dma) = DCSR_RUN;
+                       dma_async_issue_pending(host->dma_chan_tx);
        } else {
                pxamci_finish_request(host, host->mrq);
        }
@@ -354,13 +354,17 @@ static int pxamci_cmd_done(struct pxamci_host *host, unsigned int stat)
 static int pxamci_data_done(struct pxamci_host *host, unsigned int stat)
 {
        struct mmc_data *data = host->data;
+       struct dma_chan *chan;
 
        if (!data)
                return 0;
 
-       DCSR(host->dma) = 0;
-       dma_unmap_sg(mmc_dev(host->mmc), data->sg, data->sg_len,
-                    host->dma_dir);
+       if (data->flags & MMC_DATA_READ)
+               chan = host->dma_chan_rx;
+       else
+               chan = host->dma_chan_tx;
+       dma_unmap_sg(chan->device->dev,
+                    data->sg, data->sg_len, host->dma_dir);
 
        if (stat & STAT_READ_TIME_OUT)
                data->error = -ETIMEDOUT;
@@ -552,20 +556,37 @@ static const struct mmc_host_ops pxamci_ops = {
        .enable_sdio_irq        = pxamci_enable_sdio_irq,
 };
 
-static void pxamci_dma_irq(int dma, void *devid)
+static void pxamci_dma_irq(void *param)
 {
-       struct pxamci_host *host = devid;
-       int dcsr = DCSR(dma);
-       DCSR(dma) = dcsr & ~DCSR_STOPIRQEN;
+       struct pxamci_host *host = param;
+       struct dma_tx_state state;
+       enum dma_status status;
+       struct dma_chan *chan;
+       unsigned long flags;
+
+       spin_lock_irqsave(&host->lock, flags);
+
+       if (!host->data)
+               goto out_unlock;
 
-       if (dcsr & DCSR_ENDINTR) {
+       if (host->data->flags & MMC_DATA_READ)
+               chan = host->dma_chan_rx;
+       else
+               chan = host->dma_chan_tx;
+
+       status = dmaengine_tx_status(chan, host->dma_cookie, &state);
+
+       if (likely(status == DMA_COMPLETE)) {
                writel(BUF_PART_FULL, host->base + MMC_PRTBUF);
        } else {
-               pr_err("%s: DMA error on channel %d (DCSR=%#x)\n",
-                      mmc_hostname(host->mmc), dma, dcsr);
+               pr_err("%s: DMA error on %s channel\n", mmc_hostname(host->mmc),
+                       host->data->flags & MMC_DATA_READ ? "rx" : "tx");
                host->data->error = -EIO;
                pxamci_data_done(host, 0);
        }
+
+out_unlock:
+       spin_unlock_irqrestore(&host->lock, flags);
 }
 
 static irqreturn_t pxamci_detect_irq(int irq, void *devid)
@@ -625,7 +646,9 @@ static int pxamci_probe(struct platform_device *pdev)
        struct mmc_host *mmc;
        struct pxamci_host *host = NULL;
        struct resource *r, *dmarx, *dmatx;
+       struct pxad_param param_rx, param_tx;
        int ret, irq, gpio_cd = -1, gpio_ro = -1, gpio_power = -1;
+       dma_cap_mask_t mask;
 
        ret = pxamci_of_init(pdev);
        if (ret)
@@ -671,7 +694,6 @@ static int pxamci_probe(struct platform_device *pdev)
 
        host = mmc_priv(mmc);
        host->mmc = mmc;
-       host->dma = -1;
        host->pdata = pdev->dev.platform_data;
        host->clkrt = CLKRT_OFF;
 
@@ -702,12 +724,6 @@ static int pxamci_probe(struct platform_device *pdev)
                                     MMC_CAP_SD_HIGHSPEED;
        }
 
-       host->sg_cpu = dma_alloc_coherent(&pdev->dev, PAGE_SIZE, &host->sg_dma, GFP_KERNEL);
-       if (!host->sg_cpu) {
-               ret = -ENOMEM;
-               goto out;
-       }
-
        spin_lock_init(&host->lock);
        host->res = r;
        host->irq = irq;
@@ -728,32 +744,45 @@ static int pxamci_probe(struct platform_device *pdev)
        writel(64, host->base + MMC_RESTO);
        writel(host->imask, host->base + MMC_I_MASK);
 
-       host->dma = pxa_request_dma(DRIVER_NAME, DMA_PRIO_LOW,
-                                   pxamci_dma_irq, host);
-       if (host->dma < 0) {
-               ret = -EBUSY;
-               goto out;
-       }
-
        ret = request_irq(host->irq, pxamci_irq, 0, DRIVER_NAME, host);
        if (ret)
                goto out;
 
        platform_set_drvdata(pdev, mmc);
 
-       dmarx = platform_get_resource(pdev, IORESOURCE_DMA, 0);
-       if (!dmarx) {
-               ret = -ENXIO;
+       if (!pdev->dev.of_node) {
+               dmarx = platform_get_resource(pdev, IORESOURCE_DMA, 0);
+               dmatx = platform_get_resource(pdev, IORESOURCE_DMA, 1);
+               if (!dmarx || !dmatx) {
+                       ret = -ENXIO;
+                       goto out;
+               }
+               param_rx.prio = PXAD_PRIO_LOWEST;
+               param_rx.drcmr = dmarx->start;
+               param_tx.prio = PXAD_PRIO_LOWEST;
+               param_tx.drcmr = dmatx->start;
+       }
+
+       dma_cap_zero(mask);
+       dma_cap_set(DMA_SLAVE, mask);
+
+       host->dma_chan_rx =
+               dma_request_slave_channel_compat(mask, pxad_filter_fn,
+                                                &param_rx, &pdev->dev, "rx");
+       if (host->dma_chan_rx == NULL) {
+               dev_err(&pdev->dev, "unable to request rx dma channel\n");
+               ret = -ENODEV;
                goto out;
        }
-       host->dma_drcmrrx = dmarx->start;
 
-       dmatx = platform_get_resource(pdev, IORESOURCE_DMA, 1);
-       if (!dmatx) {
-               ret = -ENXIO;
+       host->dma_chan_tx =
+               dma_request_slave_channel_compat(mask, pxad_filter_fn,
+                                                &param_tx,  &pdev->dev, "tx");
+       if (host->dma_chan_tx == NULL) {
+               dev_err(&pdev->dev, "unable to request tx dma channel\n");
+               ret = -ENODEV;
                goto out;
        }
-       host->dma_drcmrtx = dmatx->start;
 
        if (host->pdata) {
                gpio_cd = host->pdata->gpio_card_detect;
@@ -814,12 +843,12 @@ err_gpio_ro:
        gpio_free(gpio_power);
  out:
        if (host) {
-               if (host->dma >= 0)
-                       pxa_free_dma(host->dma);
+               if (host->dma_chan_rx)
+                       dma_release_channel(host->dma_chan_rx);
+               if (host->dma_chan_tx)
+                       dma_release_channel(host->dma_chan_tx);
                if (host->base)
                        iounmap(host->base);
-               if (host->sg_cpu)
-                       dma_free_coherent(&pdev->dev, PAGE_SIZE, host->sg_cpu, host->sg_dma);
                if (host->clk)
                        clk_put(host->clk);
        }
@@ -863,13 +892,12 @@ static int pxamci_remove(struct platform_device *pdev)
                       END_CMD_RES|PRG_DONE|DATA_TRAN_DONE,
                       host->base + MMC_I_MASK);
 
-               DRCMR(host->dma_drcmrrx) = 0;
-               DRCMR(host->dma_drcmrtx) = 0;
-
                free_irq(host->irq, host);
-               pxa_free_dma(host->dma);
+               dmaengine_terminate_all(host->dma_chan_rx);
+               dmaengine_terminate_all(host->dma_chan_tx);
+               dma_release_channel(host->dma_chan_rx);
+               dma_release_channel(host->dma_chan_tx);
                iounmap(host->base);
-               dma_free_coherent(&pdev->dev, PAGE_SIZE, host->sg_cpu, host->sg_dma);
 
                clk_put(host->clk);
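The pxamci hunks above replace the legacy pxa_request_dma()/DRCMR programming with the generic dmaengine API. A minimal sketch of that request/release pattern, using an illustrative helper name (foo_request_dma is not from the driver; the "rx"/"tx" channel labels are), might look like:

    #include <linux/dmaengine.h>

    /* Request one slave channel per direction; prefer DT-described
     * channels and fall back to a filter function on legacy platforms. */
    static int foo_request_dma(struct device *dev, struct dma_chan **rx,
                               struct dma_chan **tx)
    {
            dma_cap_mask_t mask;

            dma_cap_zero(mask);
            dma_cap_set(DMA_SLAVE, mask);

            *rx = dma_request_slave_channel_compat(mask, NULL, NULL, dev, "rx");
            if (!*rx)
                    return -ENODEV;

            *tx = dma_request_slave_channel_compat(mask, NULL, NULL, dev, "tx");
            if (!*tx) {
                    dma_release_channel(*rx);
                    return -ENODEV;
            }
            return 0;
    }

On teardown the channels are quiesced with dmaengine_terminate_all() and handed back with dma_release_channel(), as the remove path above now does; the pxad_filter_fn/param_rx/param_tx pieces in the probe hunk are the non-device-tree fallback for boards that still describe DMA through IORESOURCE_DMA resources.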
 
index c6b9f6492e1a2529b7f683686bc4d939229dba63..886d230f41d07357f9fe624b9a94de428789e962 100644 (file)
@@ -32,6 +32,7 @@
 #include "sdhci-esdhc.h"
 
 #define        ESDHC_CTRL_D3CD                 0x08
+#define ESDHC_BURST_LEN_EN_INCR                (1 << 27)
 /* VENDOR SPEC register */
 #define ESDHC_VENDOR_SPEC              0xc0
 #define  ESDHC_VENDOR_SPEC_SDIO_QUIRK  (1 << 1)
@@ -44,6 +45,7 @@
 #define  ESDHC_MIX_CTRL_EXE_TUNE       (1 << 22)
 #define  ESDHC_MIX_CTRL_SMPCLK_SEL     (1 << 23)
 #define  ESDHC_MIX_CTRL_FBCLK_SEL      (1 << 25)
+#define  ESDHC_MIX_CTRL_HS400_EN       (1 << 26)
 /* Bits 3 and 6 are not SDHCI standard definitions */
 #define  ESDHC_MIX_CTRL_SDHCI_MASK     0xb7
 /* Tuning bits */
 #define  ESDHC_TUNE_CTRL_MIN           0
 #define  ESDHC_TUNE_CTRL_MAX           ((1 << 7) - 1)
 
+/* strobe dll register */
+#define ESDHC_STROBE_DLL_CTRL          0x70
+#define ESDHC_STROBE_DLL_CTRL_ENABLE   (1 << 0)
+#define ESDHC_STROBE_DLL_CTRL_RESET    (1 << 1)
+#define ESDHC_STROBE_DLL_CTRL_SLV_DLY_TARGET_SHIFT     3
+
+#define ESDHC_STROBE_DLL_STATUS                0x74
+#define ESDHC_STROBE_DLL_STS_REF_LOCK  (1 << 1)
+#define ESDHC_STROBE_DLL_STS_SLV_LOCK  0x1
+
 #define ESDHC_TUNING_CTRL              0xcc
 #define ESDHC_STD_TUNING_EN            (1 << 24)
 /* NOTE: the minimum valid tuning start tap for mx6sl is 1 */
 #define ESDHC_TUNING_START_TAP         0x1
+#define ESDHC_TUNING_STEP_SHIFT                16
 
 /* pinctrl state */
 #define ESDHC_PINCTRL_STATE_100MHZ     "state_100mhz"
 #define ESDHC_FLAG_ERR004536           BIT(7)
 /* The IP supports HS200 mode */
 #define ESDHC_FLAG_HS200               BIT(8)
+/* The IP supports HS400 mode */
+#define ESDHC_FLAG_HS400               BIT(9)
+
+/* A clock frequency higher than this rate requires strobe dll control */
+#define ESDHC_STROBE_DLL_CLK_FREQ      100000000
 
 struct esdhc_soc_data {
        u32 flags;
@@ -156,6 +174,12 @@ static struct esdhc_soc_data usdhc_imx6sx_data = {
                        | ESDHC_FLAG_HAVE_CAP1 | ESDHC_FLAG_HS200,
 };
 
+static struct esdhc_soc_data usdhc_imx7d_data = {
+       .flags = ESDHC_FLAG_USDHC | ESDHC_FLAG_STD_TUNING
+                       | ESDHC_FLAG_HAVE_CAP1 | ESDHC_FLAG_HS200
+                       | ESDHC_FLAG_HS400,
+};
+
 struct pltfm_imx_data {
        u32 scratchpad;
        struct pinctrl *pinctrl;
@@ -199,6 +223,7 @@ static const struct of_device_id imx_esdhc_dt_ids[] = {
        { .compatible = "fsl,imx6sx-usdhc", .data = &usdhc_imx6sx_data, },
        { .compatible = "fsl,imx6sl-usdhc", .data = &usdhc_imx6sl_data, },
        { .compatible = "fsl,imx6q-usdhc", .data = &usdhc_imx6q_data, },
+       { .compatible = "fsl,imx7d-usdhc", .data = &usdhc_imx7d_data, },
        { /* sentinel */ }
 };
 MODULE_DEVICE_TABLE(of, imx_esdhc_dt_ids);
@@ -274,6 +299,9 @@ static u32 esdhc_readl_le(struct sdhci_host *host, int reg)
                                val = SDHCI_SUPPORT_DDR50 | SDHCI_SUPPORT_SDR104
                                        | SDHCI_SUPPORT_SDR50
                                        | SDHCI_USE_SDR50_TUNING;
+
+                       if (imx_data->socdata->flags & ESDHC_FLAG_HS400)
+                               val |= SDHCI_SUPPORT_HS400;
                }
        }
 
@@ -448,6 +476,7 @@ static void esdhc_writew_le(struct sdhci_host *host, u16 val, int reg)
                } else if (imx_data->socdata->flags & ESDHC_FLAG_STD_TUNING) {
                        u32 v = readl(host->ioaddr + SDHCI_ACMD12_ERR);
                        u32 m = readl(host->ioaddr + ESDHC_MIX_CTRL);
+                       u32 tuning_ctrl;
                        if (val & SDHCI_CTRL_TUNED_CLK) {
                                v |= ESDHC_MIX_CTRL_SMPCLK_SEL;
                        } else {
@@ -458,6 +487,11 @@ static void esdhc_writew_le(struct sdhci_host *host, u16 val, int reg)
                        if (val & SDHCI_CTRL_EXEC_TUNING) {
                                v |= ESDHC_MIX_CTRL_EXE_TUNE;
                                m |= ESDHC_MIX_CTRL_FBCLK_SEL;
+                               tuning_ctrl = readl(host->ioaddr + ESDHC_TUNING_CTRL);
+                               tuning_ctrl |= ESDHC_STD_TUNING_EN | ESDHC_TUNING_START_TAP;
+                               if (imx_data->boarddata.tuning_step)
+                                       tuning_ctrl |= imx_data->boarddata.tuning_step << ESDHC_TUNING_STEP_SHIFT;
+                                       writel(tuning_ctrl, host->ioaddr + ESDHC_TUNING_CTRL);
                        } else {
                                v &= ~ESDHC_MIX_CTRL_EXE_TUNE;
                        }
@@ -774,6 +808,7 @@ static int esdhc_change_pinstate(struct sdhci_host *host,
                break;
        case MMC_TIMING_UHS_SDR104:
        case MMC_TIMING_MMC_HS200:
+       case MMC_TIMING_MMC_HS400:
                pinctrl = imx_data->pins_200mhz;
                break;
        default:
@@ -784,24 +819,68 @@ static int esdhc_change_pinstate(struct sdhci_host *host,
        return pinctrl_select_state(imx_data->pinctrl, pinctrl);
 }
 
+/*
+ * For HS400 eMMC there is a data_strobe line. This signal is generated
+ * by the device and used for data output and CRC status response output
+ * in HS400 mode. Its frequency follows the frequency of CLK generated by
+ * the host, and the host latches data aligned to the edge of the
+ * data_strobe line. Because of the delay between the CLK line and the
+ * data_strobe line, if that delay grows larger than one clock cycle the
+ * two lines become misaligned and read errors show up. So when CLK runs
+ * above 100MHz, where each clock cycle is that much shorter, the host
+ * should configure the delay target.
+ */
+static void esdhc_set_strobe_dll(struct sdhci_host *host)
+{
+       u32 v;
+
+       if (host->mmc->actual_clock > ESDHC_STROBE_DLL_CLK_FREQ) {
+               /* force a reset on strobe dll */
+               writel(ESDHC_STROBE_DLL_CTRL_RESET,
+                       host->ioaddr + ESDHC_STROBE_DLL_CTRL);
+               /*
+                * enable strobe dll ctrl and adjust the delay target
+                * for the uSDHC loopback read clock
+                */
+               v = ESDHC_STROBE_DLL_CTRL_ENABLE |
+                       (7 << ESDHC_STROBE_DLL_CTRL_SLV_DLY_TARGET_SHIFT);
+               writel(v, host->ioaddr + ESDHC_STROBE_DLL_CTRL);
+               /* wait 1us to make sure the strobe dll status register is stable */
+               udelay(1);
+               v = readl(host->ioaddr + ESDHC_STROBE_DLL_STATUS);
+               if (!(v & ESDHC_STROBE_DLL_STS_REF_LOCK))
+                       dev_warn(mmc_dev(host->mmc),
+                               "warning! HS400 strobe DLL status REF not lock!\n");
+               if (!(v & ESDHC_STROBE_DLL_STS_SLV_LOCK))
+                       dev_warn(mmc_dev(host->mmc),
+                               "warning! HS400 strobe DLL status SLV not lock!\n");
+       }
+}
+
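To put rough numbers on the comment above: the cycle time is 1/f, so at 100 MHz one CLK cycle lasts 10 ns, while at the roughly 200 MHz used for HS400 it lasts only 5 ns. A fixed CLK-to-data_strobe skew therefore consumes twice as much of the cycle at HS400 rates, which is why the delay target is only programmed once mmc->actual_clock exceeds ESDHC_STROBE_DLL_CLK_FREQ (100 MHz). The figures here are illustrative; the threshold itself comes from the definition above.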
 static void esdhc_set_uhs_signaling(struct sdhci_host *host, unsigned timing)
 {
+       u32 m;
        struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host);
        struct pltfm_imx_data *imx_data = pltfm_host->priv;
        struct esdhc_platform_data *boarddata = &imx_data->boarddata;
 
+       /* disable ddr mode and disable HS400 mode */
+       m = readl(host->ioaddr + ESDHC_MIX_CTRL);
+       m &= ~(ESDHC_MIX_CTRL_DDREN | ESDHC_MIX_CTRL_HS400_EN);
+       imx_data->is_ddr = 0;
+
        switch (timing) {
        case MMC_TIMING_UHS_SDR12:
        case MMC_TIMING_UHS_SDR25:
        case MMC_TIMING_UHS_SDR50:
        case MMC_TIMING_UHS_SDR104:
        case MMC_TIMING_MMC_HS200:
+               writel(m, host->ioaddr + ESDHC_MIX_CTRL);
                break;
        case MMC_TIMING_UHS_DDR50:
        case MMC_TIMING_MMC_DDR52:
-               writel(readl(host->ioaddr + ESDHC_MIX_CTRL) |
-                               ESDHC_MIX_CTRL_DDREN,
-                               host->ioaddr + ESDHC_MIX_CTRL);
+               m |= ESDHC_MIX_CTRL_DDREN;
+               writel(m, host->ioaddr + ESDHC_MIX_CTRL);
                imx_data->is_ddr = 1;
                if (boarddata->delay_line) {
                        u32 v;
@@ -813,6 +892,12 @@ static void esdhc_set_uhs_signaling(struct sdhci_host *host, unsigned timing)
                        writel(v, host->ioaddr + ESDHC_DLL_CTRL);
                }
                break;
+       case MMC_TIMING_MMC_HS400:
+               m |= ESDHC_MIX_CTRL_DDREN | ESDHC_MIX_CTRL_HS400_EN;
+               writel(m, host->ioaddr + ESDHC_MIX_CTRL);
+               imx_data->is_ddr = 1;
+               esdhc_set_strobe_dll(host);
+               break;
        }
 
        esdhc_change_pinstate(host, timing);
@@ -886,6 +971,8 @@ sdhci_esdhc_imx_probe_dt(struct platform_device *pdev,
        if (gpio_is_valid(boarddata->wp_gpio))
                boarddata->wp_type = ESDHC_WP_GPIO;
 
+       of_property_read_u32(np, "fsl,tuning-step", &boarddata->tuning_step);
+
        if (of_find_property(np, "no-1-8-v", NULL))
                boarddata->support_vsel = false;
        else
@@ -1073,10 +1160,26 @@ static int sdhci_esdhc_imx_probe(struct platform_device *pdev)
         * to something insane.  Change it back here.
         */
        if (esdhc_is_usdhc(imx_data)) {
-               writel(0x08100810, host->ioaddr + ESDHC_WTMK_LVL);
+               writel(0x10401040, host->ioaddr + ESDHC_WTMK_LVL);
+
                host->quirks2 |= SDHCI_QUIRK2_PRESET_VALUE_BROKEN;
                host->mmc->caps |= MMC_CAP_1_8V_DDR;
 
+               /*
+                * The ROM code clears the burst_length_enable bit if this
+                * usdhc is chosen to boot the system. Change it back here,
+                * otherwise it hurts performance a lot. This bit enables the
+                * burst length indicator for the external AHB2AXI bridge; it
+                * is especially useful for INCR transfers because, without a
+                * burst length indicator, the AHB2AXI bridge does not know
+                * the burst length in advance. And without the indicator,
+                * AHB INCR transfers can only be converted to singles on the
+                * AXI side.
+                */
+               writel(readl(host->ioaddr + SDHCI_HOST_CONTROL)
+                       | ESDHC_BURST_LEN_EN_INCR,
+                       host->ioaddr + SDHCI_HOST_CONTROL);
+
                if (!(imx_data->socdata->flags & ESDHC_FLAG_HS200))
                        host->quirks2 |= SDHCI_QUIRK2_BROKEN_HS200;
 
@@ -1100,6 +1203,9 @@ static int sdhci_esdhc_imx_probe(struct platform_device *pdev)
        if (imx_data->socdata->flags & ESDHC_FLAG_ERR004536)
                host->quirks |= SDHCI_QUIRK_BROKEN_ADMA;
 
+       if (imx_data->socdata->flags & ESDHC_FLAG_HS400)
+               host->quirks2 |= SDHCI_QUIRK2_CAPS_BIT63_FOR_HS400;
+
        if (of_id)
                err = sdhci_esdhc_imx_probe_dt(pdev, host, imx_data);
        else
index a870c42731d7a4eea86b39e9e965d94a0603631d..163ac9974d9101dcf28220263a88cef8f43b7750 100644 (file)
@@ -21,7 +21,8 @@
 #define ESDHC_DEFAULT_QUIRKS   (SDHCI_QUIRK_FORCE_BLK_SZ_2048 | \
                                SDHCI_QUIRK_NO_BUSY_IRQ | \
                                SDHCI_QUIRK_DATA_TIMEOUT_USES_SDCLK | \
-                               SDHCI_QUIRK_PIO_NEEDS_DELAY)
+                               SDHCI_QUIRK_PIO_NEEDS_DELAY | \
+                               SDHCI_QUIRK_NO_HISPD_BIT)
 
 #define ESDHC_SYSTEM_CONTROL   0x2c
 #define ESDHC_CLOCK_MASK       0x0000fff0
index 4a09f7608c66affcede7565ea7c75c41bb10f6ba..4bcee033fedaf520d95433f03e652f0d7a786924 100644 (file)
@@ -489,6 +489,11 @@ static int sdhci_msm_probe(struct platform_device *pdev)
                goto pclk_disable;
        }
 
+       /* Vote for maximum clock rate for maximum performance */
+       ret = clk_set_rate(msm_host->clk, INT_MAX);
+       if (ret)
+               dev_warn(&pdev->dev, "core clock boost failed\n");
+
        ret = clk_prepare_enable(msm_host->clk);
        if (ret)
                goto pclk_disable;
index 21c0c08dfe54cf997e7d7031b9db5bac9ff77e64..75379cb0fb354e7aa7749852ced8fca5973d435e 100644 (file)
@@ -63,6 +63,9 @@ static struct sdhci_ops sdhci_arasan_ops = {
 
 static struct sdhci_pltfm_data sdhci_arasan_pdata = {
        .ops = &sdhci_arasan_ops,
+       .quirks = SDHCI_QUIRK_CAP_CLOCK_BASE_BROKEN,
+       .quirks2 = SDHCI_QUIRK2_PRESET_VALUE_BROKEN |
+                       SDHCI_QUIRK2_CLOCK_DIV_ZERO_BROKEN,
 };
 
 #ifdef CONFIG_PM_SLEEP
@@ -214,6 +217,7 @@ static int sdhci_arasan_remove(struct platform_device *pdev)
 
 static const struct of_device_id sdhci_arasan_of_match[] = {
        { .compatible = "arasan,sdhci-8.9a" },
+       { .compatible = "arasan,sdhci-5.1" },
        { .compatible = "arasan,sdhci-4.9a" },
        { }
 };
diff --git a/drivers/mmc/host/sdhci-of-at91.c b/drivers/mmc/host/sdhci-of-at91.c
new file mode 100644 (file)
index 0000000..d155664
--- /dev/null
@@ -0,0 +1,191 @@
+/*
+ * Atmel SDMMC controller driver.
+ *
+ * Copyright (C) 2015 Atmel,
+ *              2015 Ludovic Desroches <ludovic.desroches@atmel.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/clk.h>
+#include <linux/err.h>
+#include <linux/io.h>
+#include <linux/mmc/host.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+
+#include "sdhci-pltfm.h"
+
+#define SDMMC_CACR     0x230
+#define                SDMMC_CACR_CAPWREN      BIT(0)
+#define                SDMMC_CACR_KEY          (0x46 << 8)
+
+struct sdhci_at91_priv {
+       struct clk *hclock;
+       struct clk *gck;
+       struct clk *mainck;
+};
+
+static const struct sdhci_ops sdhci_at91_sama5d2_ops = {
+       .set_clock              = sdhci_set_clock,
+       .set_bus_width          = sdhci_set_bus_width,
+       .reset                  = sdhci_reset,
+       .set_uhs_signaling      = sdhci_set_uhs_signaling,
+};
+
+static const struct sdhci_pltfm_data soc_data_sama5d2 = {
+       .ops = &sdhci_at91_sama5d2_ops,
+};
+
+static const struct of_device_id sdhci_at91_dt_match[] = {
+       { .compatible = "atmel,sama5d2-sdhci", .data = &soc_data_sama5d2 },
+       {}
+};
+
+static int sdhci_at91_probe(struct platform_device *pdev)
+{
+       const struct of_device_id       *match;
+       const struct sdhci_pltfm_data   *soc_data;
+       struct sdhci_host               *host;
+       struct sdhci_pltfm_host         *pltfm_host;
+       struct sdhci_at91_priv          *priv;
+       unsigned int                    caps0, caps1;
+       unsigned int                    clk_base, clk_mul;
+       unsigned int                    gck_rate, real_gck_rate;
+       int                             ret;
+
+       match = of_match_device(sdhci_at91_dt_match, &pdev->dev);
+       if (!match)
+               return -EINVAL;
+       soc_data = match->data;
+
+       priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
+       if (!priv) {
+               dev_err(&pdev->dev, "unable to allocate private data\n");
+               return -ENOMEM;
+       }
+
+       priv->mainck = devm_clk_get(&pdev->dev, "baseclk");
+       if (IS_ERR(priv->mainck)) {
+               dev_err(&pdev->dev, "failed to get baseclk\n");
+               return PTR_ERR(priv->mainck);
+       }
+
+       priv->hclock = devm_clk_get(&pdev->dev, "hclock");
+       if (IS_ERR(priv->hclock)) {
+               dev_err(&pdev->dev, "failed to get hclock\n");
+               return PTR_ERR(priv->hclock);
+       }
+
+       priv->gck = devm_clk_get(&pdev->dev, "multclk");
+       if (IS_ERR(priv->gck)) {
+               dev_err(&pdev->dev, "failed to get multclk\n");
+               return PTR_ERR(priv->gck);
+       }
+
+       host = sdhci_pltfm_init(pdev, soc_data, 0);
+       if (IS_ERR(host))
+               return PTR_ERR(host);
+
+       /*
+        * The mult clock is provided as a generated clock by the PMC
+        * controller. In order to set the rate of gck, we have to get the
+        * base clock rate and the clock mult from capabilities.
+        */
+       clk_prepare_enable(priv->hclock);
+       caps0 = readl(host->ioaddr + SDHCI_CAPABILITIES);
+       caps1 = readl(host->ioaddr + SDHCI_CAPABILITIES_1);
+       clk_base = (caps0 & SDHCI_CLOCK_V3_BASE_MASK) >> SDHCI_CLOCK_BASE_SHIFT;
+       clk_mul = (caps1 & SDHCI_CLOCK_MUL_MASK) >> SDHCI_CLOCK_MUL_SHIFT;
+       gck_rate = clk_base * 1000000 * (clk_mul + 1);
+       ret = clk_set_rate(priv->gck, gck_rate);
+       if (ret < 0) {
+               dev_err(&pdev->dev, "failed to set gck");
+               goto hclock_disable_unprepare;
+               return -EINVAL;
+       }
+       /*
+        * We need to check whether we got the requested rate for gck, because
+        * in some cases that rate is not supported. If so, the actual rate is
+        * the closest one gck can provide, and we have to update the value
+        * of clk mul accordingly.
+        */
+       real_gck_rate = clk_get_rate(priv->gck);
+       if (real_gck_rate != gck_rate) {
+               clk_mul = real_gck_rate / (clk_base * 1000000) - 1;
+               caps1 &= (~SDHCI_CLOCK_MUL_MASK);
+               caps1 |= ((clk_mul << SDHCI_CLOCK_MUL_SHIFT) & SDHCI_CLOCK_MUL_MASK);
+               /* Set capabilities in r/w mode. */
+               writel(SDMMC_CACR_KEY | SDMMC_CACR_CAPWREN, host->ioaddr + SDMMC_CACR);
+               writel(caps1, host->ioaddr + SDHCI_CAPABILITIES_1);
+               /* Set capabilities in ro mode. */
+               writel(0, host->ioaddr + SDMMC_CACR);
+               dev_info(&pdev->dev, "update clk mul to %u as gck rate is %u Hz\n",
+                        clk_mul, real_gck_rate);
+       }
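Worked through with illustrative numbers: if the capability registers report clk_base = 12 (MHz) and clk_mul = 40, the requested gck rate is 12,000,000 * (40 + 1) = 492 MHz. Should the PMC only manage, say, 480 MHz, the block above recomputes clk_mul = 480,000,000 / 12,000,000 - 1 = 39 and writes the corrected multiplier back through the CACR-unlocked capabilities register, so the core's clock calculations keep matching the real gck rate.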
+
+       clk_prepare_enable(priv->mainck);
+       clk_prepare_enable(priv->gck);
+
+       pltfm_host = sdhci_priv(host);
+       pltfm_host->priv = priv;
+
+       ret = mmc_of_parse(host->mmc);
+       if (ret)
+               goto clocks_disable_unprepare;
+
+       sdhci_get_of_property(pdev);
+
+       ret = sdhci_add_host(host);
+       if (ret)
+               goto clocks_disable_unprepare;
+
+       return 0;
+
+clocks_disable_unprepare:
+       clk_disable_unprepare(priv->gck);
+       clk_disable_unprepare(priv->mainck);
+hclock_disable_unprepare:
+       clk_disable_unprepare(priv->hclock);
+       sdhci_pltfm_free(pdev);
+       return ret;
+}
+
+static int sdhci_at91_remove(struct platform_device *pdev)
+{
+       struct sdhci_host       *host = platform_get_drvdata(pdev);
+       struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host);
+       struct sdhci_at91_priv  *priv = pltfm_host->priv;
+
+       sdhci_pltfm_unregister(pdev);
+
+       clk_disable_unprepare(priv->gck);
+       clk_disable_unprepare(priv->hclock);
+       clk_disable_unprepare(priv->mainck);
+
+       return 0;
+}
+
+static struct platform_driver sdhci_at91_driver = {
+       .driver         = {
+               .name   = "sdhci-at91",
+               .of_match_table = sdhci_at91_dt_match,
+               .pm     = SDHCI_PLTFM_PMOPS,
+       },
+       .probe          = sdhci_at91_probe,
+       .remove         = sdhci_at91_remove,
+};
+
+module_platform_driver(sdhci_at91_driver);
+
+MODULE_DESCRIPTION("SDHCI driver for at91");
+MODULE_AUTHOR("Ludovic Desroches <ludovic.desroches@atmel.com>");
+MODULE_LICENSE("GPL v2");
index 797be7549a15c01a0e9acda2b0adaccb0a6c9416..653f335bef1516ca9aba102e1f3ab21d99a754ce 100644 (file)
@@ -208,6 +208,12 @@ static void esdhc_of_set_clock(struct sdhci_host *host, unsigned int clock)
        if (clock == 0)
                return;
 
+       /* Workaround to start pre_div at 2 for VNN < VENDOR_V_23 */
+       temp = esdhc_readw(host, SDHCI_HOST_VERSION);
+       temp = (temp & SDHCI_VENDOR_VER_MASK) >> SDHCI_VENDOR_VER_SHIFT;
+       if (temp < VENDOR_V_23)
+               pre_div = 2;
+
        /* Workaround to reduce the clock frequency for p1010 esdhc */
        if (of_find_compatible_node(NULL, NULL, "fsl,p1010-esdhc")) {
                if (clock > 20000000)
index 94f54d2772e885b891024db94bb152890468f548..b3b0a3e4fca1652e3ceb1759c0bcbd13bcc1d22e 100644 (file)
@@ -618,6 +618,7 @@ static int jmicron_resume(struct sdhci_pci_chip *chip)
 static const struct sdhci_pci_fixes sdhci_o2 = {
        .probe = sdhci_pci_o2_probe,
        .quirks = SDHCI_QUIRK_NO_ENDATTR_IN_NOPDESC,
+       .quirks2 = SDHCI_QUIRK2_CLEAR_TRANSFERMODE_REG_BEFORE_CMD,
        .probe_slot = sdhci_pci_o2_probe_slot,
        .resume = sdhci_pci_o2_resume,
 };
index 0110bae25b7e8e1d6dcfb54a6a8a518c2763f122..884294576356d4c87e3869162186aadbc24b4f6a 100644 (file)
@@ -161,8 +161,8 @@ static struct sdhci_pltfm_data sdhci_sirf_pdata = {
        .quirks = SDHCI_QUIRK_BROKEN_TIMEOUT_VAL |
                SDHCI_QUIRK_DATA_TIMEOUT_USES_SDCLK |
                SDHCI_QUIRK_CAP_CLOCK_BASE_BROKEN |
-               SDHCI_QUIRK_INVERTED_WRITE_PROTECT |
-               SDHCI_QUIRK_DELAY_AFTER_POWER,
+               SDHCI_QUIRK_RESET_CMD_DATA_ON_IOS,
+       .quirks2 = SDHCI_QUIRK2_PRESET_VALUE_BROKEN,
 };
 
 static int sdhci_sirf_probe(struct platform_device *pdev)
index 1dbe932320309fc87f75a40cc2552ba5a784d5bd..64b7fdbd1a9ccab80034e8a38660ef944daf8bae 100644 (file)
@@ -54,8 +54,7 @@ static void sdhci_finish_command(struct sdhci_host *);
 static int sdhci_execute_tuning(struct mmc_host *mmc, u32 opcode);
 static void sdhci_enable_preset_value(struct sdhci_host *host, bool enable);
 static int sdhci_pre_dma_transfer(struct sdhci_host *host,
-                                       struct mmc_data *data,
-                                       struct sdhci_host_next *next);
+                                       struct mmc_data *data);
 static int sdhci_do_get_cd(struct sdhci_host *host);
 
 #ifdef CONFIG_PM
@@ -207,8 +206,7 @@ EXPORT_SYMBOL_GPL(sdhci_reset);
 static void sdhci_do_reset(struct sdhci_host *host, u8 mask)
 {
        if (host->quirks & SDHCI_QUIRK_NO_CARD_NO_RESET) {
-               if (!(sdhci_readl(host, SDHCI_PRESENT_STATE) &
-                       SDHCI_CARD_PRESENT))
+               if (!sdhci_do_get_cd(host))
                        return;
        }
 
@@ -496,7 +494,7 @@ static int sdhci_adma_table_pre(struct sdhci_host *host,
                goto fail;
        BUG_ON(host->align_addr & host->align_mask);
 
-       host->sg_count = sdhci_pre_dma_transfer(host, data, NULL);
+       host->sg_count = sdhci_pre_dma_transfer(host, data);
        if (host->sg_count < 0)
                goto unmap_align;
 
@@ -635,9 +633,11 @@ static void sdhci_adma_table_post(struct sdhci_host *host,
                }
        }
 
-       if (!data->host_cookie)
+       if (data->host_cookie == COOKIE_MAPPED) {
                dma_unmap_sg(mmc_dev(host->mmc), data->sg,
                        data->sg_len, direction);
+               data->host_cookie = COOKIE_UNMAPPED;
+       }
 }
 
 static u8 sdhci_calc_timeout(struct sdhci_host *host, struct mmc_command *cmd)
@@ -833,7 +833,7 @@ static void sdhci_prepare_data(struct sdhci_host *host, struct mmc_command *cmd)
                } else {
                        int sg_cnt;
 
-                       sg_cnt = sdhci_pre_dma_transfer(host, data, NULL);
+                       sg_cnt = sdhci_pre_dma_transfer(host, data);
                        if (sg_cnt <= 0) {
                                /*
                                 * This only happens when someone fed
@@ -949,11 +949,13 @@ static void sdhci_finish_data(struct sdhci_host *host)
                if (host->flags & SDHCI_USE_ADMA)
                        sdhci_adma_table_post(host, data);
                else {
-                       if (!data->host_cookie)
+                       if (data->host_cookie == COOKIE_MAPPED) {
                                dma_unmap_sg(mmc_dev(host->mmc),
                                        data->sg, data->sg_len,
                                        (data->flags & MMC_DATA_READ) ?
                                        DMA_FROM_DEVICE : DMA_TO_DEVICE);
+                               data->host_cookie = COOKIE_UNMAPPED;
+                       }
                }
        }
 
@@ -1132,6 +1134,7 @@ static u16 sdhci_get_preset_value(struct sdhci_host *host)
                preset = sdhci_readw(host, SDHCI_PRESET_FOR_SDR104);
                break;
        case MMC_TIMING_UHS_DDR50:
+       case MMC_TIMING_MMC_DDR52:
                preset = sdhci_readw(host, SDHCI_PRESET_FOR_DDR50);
                break;
        case MMC_TIMING_MMC_HS400:
@@ -1152,6 +1155,7 @@ void sdhci_set_clock(struct sdhci_host *host, unsigned int clock)
        int real_div = div, clk_mul = 1;
        u16 clk = 0;
        unsigned long timeout;
+       bool switch_base_clk = false;
 
        host->mmc->actual_clock = 0;
 
@@ -1189,15 +1193,25 @@ void sdhci_set_clock(struct sdhci_host *host, unsigned int clock)
                                        <= clock)
                                        break;
                        }
-                       /*
-                        * Set Programmable Clock Mode in the Clock
-                        * Control register.
-                        */
-                       clk = SDHCI_PROG_CLOCK_MODE;
-                       real_div = div;
-                       clk_mul = host->clk_mul;
-                       div--;
-               } else {
+                       if ((host->max_clk * host->clk_mul / div) <= clock) {
+                               /*
+                                * Set Programmable Clock Mode in the Clock
+                                * Control register.
+                                */
+                               clk = SDHCI_PROG_CLOCK_MODE;
+                               real_div = div;
+                               clk_mul = host->clk_mul;
+                               div--;
+                       } else {
+                               /*
+                                * Divisor can be too small to reach clock
+                                * speed requirement. Then use the base clock.
+                                */
+                               switch_base_clk = true;
+                       }
+               }
+
+               if (!host->clk_mul || switch_base_clk) {
                        /* Version 3.00 divisors must be a multiple of 2. */
                        if (host->max_clk <= clock)
                                div = 1;
@@ -1210,6 +1224,9 @@ void sdhci_set_clock(struct sdhci_host *host, unsigned int clock)
                        }
                        real_div = div;
                        div >>= 1;
+                       if ((host->quirks2 & SDHCI_QUIRK2_CLOCK_DIV_ZERO_BROKEN)
+                               && !div && host->max_clk <= 25000000)
+                               div = 1;
                }
        } else {
                /* Version 2.00 divisors must be a power of 2. */
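Illustrative numbers for the two sdhci_set_clock() hunks above: the programmable-clock divisor only goes up to 1024, so with, say, max_clk = 200 MHz and clk_mul = 8 the slowest programmable clock is 200 MHz * 8 / 1024 ≈ 1.56 MHz. A 400 kHz request (typical card-initialisation speed) can therefore never be met in programmable mode; switch_base_clk now routes it to the plain base-clock divider, where 200 MHz / 500 = 400 kHz is reachable. The second hunk separately forces div to 1 when the computed divisor would be 0 and the base clock is at or below 25 MHz, on controllers flagged with the new SDHCI_QUIRK2_CLOCK_DIV_ZERO_BROKEN quirk.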
@@ -1559,7 +1576,8 @@ static void sdhci_do_set_ios(struct sdhci_host *host, struct mmc_ios *ios)
                                 (ios->timing == MMC_TIMING_UHS_SDR25) ||
                                 (ios->timing == MMC_TIMING_UHS_SDR50) ||
                                 (ios->timing == MMC_TIMING_UHS_SDR104) ||
-                                (ios->timing == MMC_TIMING_UHS_DDR50))) {
+                                (ios->timing == MMC_TIMING_UHS_DDR50) ||
+                                (ios->timing == MMC_TIMING_MMC_DDR52))) {
                        u16 preset;
 
                        sdhci_enable_preset_value(host, true);
@@ -1601,15 +1619,21 @@ static int sdhci_do_get_cd(struct sdhci_host *host)
        if (host->flags & SDHCI_DEVICE_DEAD)
                return 0;
 
-       /* If polling/nonremovable, assume that the card is always present. */
-       if ((host->quirks & SDHCI_QUIRK_BROKEN_CARD_DETECTION) ||
-           (host->mmc->caps & MMC_CAP_NONREMOVABLE))
+       /* If nonremovable, assume that the card is always present. */
+       if (host->mmc->caps & MMC_CAP_NONREMOVABLE)
                return 1;
 
-       /* Try slot gpio detect */
+       /*
+        * Try slot gpio detect; if defined, it takes precedence
+        * over the built-in controller functionality.
+        */
        if (!IS_ERR_VALUE(gpio_cd))
                return !!gpio_cd;
 
+       /* If polling, assume that the card is always present. */
+       if (host->quirks & SDHCI_QUIRK_BROKEN_CARD_DETECTION)
+               return 1;
+
        /* Host native card detect */
        return !!(sdhci_readl(host, SDHCI_PRESENT_STATE) & SDHCI_CARD_PRESENT);
 }
@@ -2097,49 +2121,36 @@ static void sdhci_post_req(struct mmc_host *mmc, struct mmc_request *mrq,
        struct mmc_data *data = mrq->data;
 
        if (host->flags & SDHCI_REQ_USE_DMA) {
-               if (data->host_cookie)
+               if (data->host_cookie == COOKIE_GIVEN ||
+                               data->host_cookie == COOKIE_MAPPED)
                        dma_unmap_sg(mmc_dev(host->mmc), data->sg, data->sg_len,
                                         data->flags & MMC_DATA_WRITE ?
                                         DMA_TO_DEVICE : DMA_FROM_DEVICE);
-               mrq->data->host_cookie = 0;
+               data->host_cookie = COOKIE_UNMAPPED;
        }
 }
 
 static int sdhci_pre_dma_transfer(struct sdhci_host *host,
-                                      struct mmc_data *data,
-                                      struct sdhci_host_next *next)
+                                      struct mmc_data *data)
 {
        int sg_count;
 
-       if (!next && data->host_cookie &&
-           data->host_cookie != host->next_data.cookie) {
-               pr_debug(DRIVER_NAME "[%s] invalid cookie: %d, next-cookie %d\n",
-                       __func__, data->host_cookie, host->next_data.cookie);
-               data->host_cookie = 0;
+       if (data->host_cookie == COOKIE_MAPPED) {
+               data->host_cookie = COOKIE_GIVEN;
+               return data->sg_count;
        }
 
-       /* Check if next job is already prepared */
-       if (next ||
-           (!next && data->host_cookie != host->next_data.cookie)) {
-               sg_count = dma_map_sg(mmc_dev(host->mmc), data->sg,
-                                    data->sg_len,
-                                    data->flags & MMC_DATA_WRITE ?
-                                    DMA_TO_DEVICE : DMA_FROM_DEVICE);
-
-       } else {
-               sg_count = host->next_data.sg_count;
-               host->next_data.sg_count = 0;
-       }
+       WARN_ON(data->host_cookie == COOKIE_GIVEN);
 
+       sg_count = dma_map_sg(mmc_dev(host->mmc), data->sg, data->sg_len,
+                               data->flags & MMC_DATA_WRITE ?
+                               DMA_TO_DEVICE : DMA_FROM_DEVICE);
 
        if (sg_count == 0)
-               return -EINVAL;
+               return -ENOSPC;
 
-       if (next) {
-               next->sg_count = sg_count;
-               data->host_cookie = ++next->cookie < 0 ? 1 : next->cookie;
-       } else
-               host->sg_count = sg_count;
+       data->sg_count = sg_count;
+       data->host_cookie = COOKIE_MAPPED;
 
        return sg_count;
 }
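Taken with the sdhci.h change below, host_cookie now behaves as a small state machine:

    COOKIE_UNMAPPED -> COOKIE_MAPPED    dma_map_sg() runs, either from sdhci_pre_req() or on
                                        demand in sdhci_pre_dma_transfer()
    COOKIE_MAPPED   -> COOKIE_GIVEN     the request path reuses a mapping prepared by pre_req()
    COOKIE_MAPPED   -> COOKIE_UNMAPPED  sdhci_finish_data()/sdhci_adma_table_post() unmap only
                                        mappings the request path created itself
    COOKIE_GIVEN or COOKIE_MAPPED -> COOKIE_UNMAPPED   sdhci_post_req() unmaps whatever is left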
@@ -2149,16 +2160,10 @@ static void sdhci_pre_req(struct mmc_host *mmc, struct mmc_request *mrq,
 {
        struct sdhci_host *host = mmc_priv(mmc);
 
-       if (mrq->data->host_cookie) {
-               mrq->data->host_cookie = 0;
-               return;
-       }
+       mrq->data->host_cookie = COOKIE_UNMAPPED;
 
        if (host->flags & SDHCI_REQ_USE_DMA)
-               if (sdhci_pre_dma_transfer(host,
-                                       mrq->data,
-                                       &host->next_data) < 0)
-                       mrq->data->host_cookie = 0;
+               sdhci_pre_dma_transfer(host, mrq->data);
 }
 
 static void sdhci_card_event(struct mmc_host *mmc)
@@ -3030,7 +3035,6 @@ int sdhci_add_host(struct sdhci_host *host)
                host->max_clk = host->ops->get_max_clock(host);
        }
 
-       host->next_data.cookie = 1;
        /*
         * In case of Host Controller v3.00, find out whether clock
         * multiplier is supported.
@@ -3126,7 +3130,8 @@ int sdhci_add_host(struct sdhci_host *host)
                mmc->caps |= MMC_CAP_SD_HIGHSPEED | MMC_CAP_MMC_HIGHSPEED;
 
        if ((host->quirks & SDHCI_QUIRK_BROKEN_CARD_DETECTION) &&
-           !(mmc->caps & MMC_CAP_NONREMOVABLE))
+           !(mmc->caps & MMC_CAP_NONREMOVABLE) &&
+           IS_ERR_VALUE(mmc_gpio_get_cd(host->mmc)))
                mmc->caps |= MMC_CAP_NEEDS_POLL;
 
        /* If there are external regulators, get them */
index 5521d29368e466d0d752839e34210a452823d62f..7c02ff46c8ac3ecdaf37e792fd6bcb43c9bd029e 100644 (file)
@@ -309,9 +309,10 @@ struct sdhci_adma2_64_desc {
  */
 #define SDHCI_MAX_SEGS         128
 
-struct sdhci_host_next {
-       unsigned int    sg_count;
-       s32             cookie;
+enum sdhci_cookie {
+       COOKIE_UNMAPPED,
+       COOKIE_MAPPED,
+       COOKIE_GIVEN,
 };
 
 struct sdhci_host {
@@ -409,6 +410,8 @@ struct sdhci_host {
 #define SDHCI_QUIRK2_SUPPORT_SINGLE                    (1<<13)
 /* Controller broken with using ACMD23 */
 #define SDHCI_QUIRK2_ACMD23_BROKEN                     (1<<14)
+/* Broken Clock divider zero in controller */
+#define SDHCI_QUIRK2_CLOCK_DIV_ZERO_BROKEN             (1<<15)
 
        int irq;                /* Device IRQ */
        void __iomem *ioaddr;   /* Mapped address */
@@ -503,7 +506,6 @@ struct sdhci_host {
        unsigned int            tuning_mode;    /* Re-tuning mode supported by host */
 #define SDHCI_TUNING_MODE_1    0
 
-       struct sdhci_host_next  next_data;
        unsigned long private[0] ____cacheline_aligned;
 };
 
index 5a1fdd405b1af14ff1725a8b9d9a3ab83e84b15a..ad9ffea7d659d28e057f13c1b7912ca7243743ac 100644 (file)
@@ -1632,7 +1632,9 @@ static int sh_mmcif_suspend(struct device *dev)
 {
        struct sh_mmcif_host *host = dev_get_drvdata(dev);
 
+       pm_runtime_get_sync(dev);
        sh_mmcif_writel(host->addr, MMCIF_CE_INT_MASK, MASK_ALL);
+       pm_runtime_put(dev);
 
        return 0;
 }
index 4d3e1ffe5508273fc1f9f91b7b5c3218bed1ea07..a7b7a67715986d748d9f880088cc2ae069bd2283 100644 (file)
@@ -595,7 +595,7 @@ static irqreturn_t sunxi_mmc_handle_manual_stop(int irq, void *dev_id)
 
 static int sunxi_mmc_oclk_onoff(struct sunxi_mmc_host *host, u32 oclk_en)
 {
-       unsigned long expire = jiffies + msecs_to_jiffies(250);
+       unsigned long expire = jiffies + msecs_to_jiffies(750);
        u32 rval;
 
        rval = mmc_readl(host, REG_CLKCR);
index e3dcf31a8bd6a04e8abd174c964313db584fe1a1..a10fde40b6c3ddbed98e05a693336ea6b1bb1c65 100644 (file)
@@ -83,6 +83,8 @@ static int tmio_mmc_next_sg(struct tmio_mmc_host *host)
        return --host->sg_len;
 }
 
+#define CMDREQ_TIMEOUT 5000
+
 #ifdef CONFIG_MMC_DEBUG
 
 #define STATUS_TO_TEXT(a, status, i) \
@@ -230,7 +232,7 @@ static void tmio_mmc_reset_work(struct work_struct *work)
         */
        if (IS_ERR_OR_NULL(mrq)
            || time_is_after_jiffies(host->last_req_ts +
-               msecs_to_jiffies(2000))) {
+               msecs_to_jiffies(CMDREQ_TIMEOUT))) {
                spin_unlock_irqrestore(&host->lock, flags);
                return;
        }
@@ -818,7 +820,7 @@ static void tmio_mmc_request(struct mmc_host *mmc, struct mmc_request *mrq)
        ret = tmio_mmc_start_command(host, mrq->cmd);
        if (!ret) {
                schedule_delayed_work(&host->delayed_reset_work,
-                                     msecs_to_jiffies(2000));
+                                     msecs_to_jiffies(CMDREQ_TIMEOUT));
                return;
        }
 
index 54b082b1804a5c05973a16e196d3dbd40f1c1779..4498e92116b808d2a62f6e146f33f3b41838ee33 100644 (file)
@@ -1611,7 +1611,7 @@ static irqreturn_t usdhi6_cd(int irq, void *dev_id)
                return IRQ_NONE;
 
        /* Ack */
-       usdhi6_write(host, USDHI6_SD_INFO1, !status);
+       usdhi6_write(host, USDHI6_SD_INFO1, ~status);
 
        if (!work_pending(&mmc->detect.work) &&
            (((status & USDHI6_SD_INFO1_CARD_INSERT) &&
@@ -1634,6 +1634,7 @@ static void usdhi6_timeout_work(struct work_struct *work)
        struct usdhi6_host *host = container_of(d, struct usdhi6_host, timeout_work);
        struct mmc_request *mrq = host->mrq;
        struct mmc_data *data = mrq ? mrq->data : NULL;
+       struct scatterlist *sg = host->sg ?: data->sg;
 
        dev_warn(mmc_dev(host->mmc),
                 "%s timeout wait %u CMD%d: IRQ 0x%08x:0x%08x, last IRQ 0x%08x\n",
@@ -1669,7 +1670,7 @@ static void usdhi6_timeout_work(struct work_struct *work)
                        "%c: page #%u @ +0x%zx %ux%u in SG%u. Current SG %u bytes @ %u\n",
                        data->flags & MMC_DATA_READ ? 'R' : 'W', host->page_idx,
                        host->offset, data->blocks, data->blksz, data->sg_len,
-                       sg_dma_len(host->sg), host->sg->offset);
+                       sg_dma_len(sg), sg->offset);
                usdhi6_sg_unmap(host, true);
                /*
                 * If USDHI6_WAIT_FOR_DATA_END times out, we have already unmapped
@@ -1715,12 +1716,14 @@ static int usdhi6_probe(struct platform_device *pdev)
        if (!mmc)
                return -ENOMEM;
 
+       ret = mmc_regulator_get_supply(mmc);
+       if (ret == -EPROBE_DEFER)
+               goto e_free_mmc;
+
        ret = mmc_of_parse(mmc);
        if (ret < 0)
                goto e_free_mmc;
 
-       mmc_regulator_get_supply(mmc);
-
        host            = mmc_priv(mmc);
        host->mmc       = mmc;
        host->wait      = USDHI6_WAIT_FOR_REQUEST;
@@ -1734,8 +1737,10 @@ static int usdhi6_probe(struct platform_device *pdev)
        }
 
        host->clk = devm_clk_get(dev, NULL);
-       if (IS_ERR(host->clk))
+       if (IS_ERR(host->clk)) {
+               ret = PTR_ERR(host->clk);
                goto e_free_mmc;
+       }
 
        host->imclk = clk_get_rate(host->clk);
 
index 2fc4957cbe7fec0a2f873cd6cca11cbb88403919..a70eb83e68f12cb82fc6d30f55261883191ce196 100644 (file)
@@ -41,7 +41,7 @@
 #include <linux/fs.h>
 #include <linux/ioctl.h>
 #include <linux/init.h>
-#include <asm/io.h>
+#include <linux/io.h>
 
 #include <linux/mtd/mtd.h>
 
index 1f897ec3c242a977a52128e37f56ef9431fbd548..075a027632b5c88ac73d2d1fd7eed49569f713c3 100644 (file)
@@ -26,7 +26,8 @@ obj-$(CONFIG_MTD_NAND_CS553X)         += cs553x_nand.o
 obj-$(CONFIG_MTD_NAND_NDFC)            += ndfc.o
 obj-$(CONFIG_MTD_NAND_ATMEL)           += atmel_nand.o
 obj-$(CONFIG_MTD_NAND_GPIO)            += gpio.o
-obj-$(CONFIG_MTD_NAND_OMAP2)           += omap2.o
+omap2_nand-objs := omap2.o
+obj-$(CONFIG_MTD_NAND_OMAP2)           += omap2_nand.o
 obj-$(CONFIG_MTD_NAND_OMAP_BCH_BUILD)  += omap_elm.o
 obj-$(CONFIG_MTD_NAND_CM_X270)         += cmx270_nand.o
 obj-$(CONFIG_MTD_NAND_PXA3xx)          += pxa3xx_nand.o
index 7da266a5397990281e0a35625435168b2f11a78f..0802158a3f757b19506947bdbf1d46124ae6fdc8 100644 (file)
@@ -24,7 +24,7 @@
 #include <linux/rslib.h>
 #include <linux/moduleparam.h>
 #include <linux/slab.h>
-#include <asm/io.h>
+#include <linux/io.h>
 
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/nand.h>
index 32a216d31141ba91a48f3f3724d7f7659d29dd0e..ab7bda0bb245ce4e702f87ebf5136f09920fbf1d 100644 (file)
@@ -18,7 +18,7 @@
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/onenand.h>
 #include <linux/mtd/partitions.h>
-#include <asm/io.h>
+#include <linux/io.h>
 
 /*
  * Note: Driver name and platform data format have been updated!
index c27d427fead43456d7afeef20ecc8966ebf6d60a..f59aedfe1462a431018de993543c4abc69131eed 100644 (file)
@@ -586,6 +586,7 @@ static const struct flash_info spi_nor_ids[] = {
        /* Micron */
        { "n25q032",     INFO(0x20ba16, 0, 64 * 1024,   64, SPI_NOR_QUAD_READ) },
        { "n25q064",     INFO(0x20ba17, 0, 64 * 1024,  128, SECT_4K | SPI_NOR_QUAD_READ) },
+       { "n25q064a",    INFO(0x20bb17, 0, 64 * 1024,  128, SECT_4K | SPI_NOR_QUAD_READ) },
        { "n25q128a11",  INFO(0x20bb18, 0, 64 * 1024,  256, SPI_NOR_QUAD_READ) },
        { "n25q128a13",  INFO(0x20ba18, 0, 64 * 1024,  256, SPI_NOR_QUAD_READ) },
        { "n25q256a",    INFO(0x20ba19, 0, 64 * 1024,  512, SECT_4K | SPI_NOR_QUAD_READ) },
@@ -602,7 +603,7 @@ static const struct flash_info spi_nor_ids[] = {
         * for the chips listed here (without boot sectors).
         */
        { "s25sl032p",  INFO(0x010215, 0x4d00,  64 * 1024,  64, SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ) },
-       { "s25sl064p",  INFO(0x010216, 0x4d00,  64 * 1024, 128, 0) },
+       { "s25sl064p",  INFO(0x010216, 0x4d00,  64 * 1024, 128, SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ) },
        { "s25fl256s0", INFO(0x010219, 0x4d00, 256 * 1024, 128, 0) },
        { "s25fl256s1", INFO(0x010219, 0x4d01,  64 * 1024, 512, SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ) },
        { "s25fl512s",  INFO(0x010220, 0x4d00, 256 * 1024, 256, SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ) },
@@ -610,8 +611,8 @@ static const struct flash_info spi_nor_ids[] = {
        { "s25sl12800", INFO(0x012018, 0x0300, 256 * 1024,  64, 0) },
        { "s25sl12801", INFO(0x012018, 0x0301,  64 * 1024, 256, 0) },
        { "s25fl128s",  INFO6(0x012018, 0x4d0180, 64 * 1024, 256, SECT_4K | SPI_NOR_QUAD_READ) },
-       { "s25fl129p0", INFO(0x012018, 0x4d00, 256 * 1024,  64, 0) },
-       { "s25fl129p1", INFO(0x012018, 0x4d01,  64 * 1024, 256, 0) },
+       { "s25fl129p0", INFO(0x012018, 0x4d00, 256 * 1024,  64, SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ) },
+       { "s25fl129p1", INFO(0x012018, 0x4d01,  64 * 1024, 256, SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ) },
        { "s25sl004a",  INFO(0x010212,      0,  64 * 1024,   8, 0) },
        { "s25sl008a",  INFO(0x010213,      0,  64 * 1024,  16, 0) },
        { "s25sl016a",  INFO(0x010214,      0,  64 * 1024,  32, 0) },
@@ -633,6 +634,7 @@ static const struct flash_info spi_nor_ids[] = {
        { "sst25wf512",  INFO(0xbf2501, 0, 64 * 1024,  1, SECT_4K | SST_WRITE) },
        { "sst25wf010",  INFO(0xbf2502, 0, 64 * 1024,  2, SECT_4K | SST_WRITE) },
        { "sst25wf020",  INFO(0xbf2503, 0, 64 * 1024,  4, SECT_4K | SST_WRITE) },
+       { "sst25wf020a", INFO(0x621612, 0, 64 * 1024,  4, SECT_4K) },
        { "sst25wf040",  INFO(0xbf2504, 0, 64 * 1024,  8, SECT_4K | SST_WRITE) },
        { "sst25wf080",  INFO(0xbf2505, 0, 64 * 1024, 16, SECT_4K | SST_WRITE) },
 
@@ -1216,7 +1218,7 @@ static const struct flash_info *spi_nor_match_id(const char *name)
 {
        const struct flash_info *id = spi_nor_ids;
 
-       while (id->name[0]) {
+       while (id->name) {
                if (!strcmp(name, id->name))
                        return id;
                id++;
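The while (id->name) change assumes the spi_nor_ids table now ends in an all-zero sentinel whose name pointer is NULL, so testing id->name[0] would dereference a NULL pointer; checking the pointer itself is the usual pattern for such tables. A self-contained illustration with generic names (not the driver's own types):

    #include <stddef.h>
    #include <string.h>

    struct entry {
            const char *name;
    };

    static const struct entry table[] = {
            { "n25q064" },
            { "s25fl128s" },
            { NULL },       /* sentinel: terminate on the NULL name pointer */
    };

    static const struct entry *find(const char *name)
    {
            const struct entry *e;

            for (e = table; e->name; e++)
                    if (!strcmp(e->name, name))
                            return e;
            return NULL;
    }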
index 6dda57e2e724f575490248cb504120fe7e2ca600..55e93b6b6d2150f2687f36bdeebe5db8c4ab2b01 100644 (file)
@@ -737,19 +737,6 @@ static int bond_option_mode_set(struct bonding *bond,
        return 0;
 }
 
-static struct net_device *__bond_option_active_slave_get(struct bonding *bond,
-                                                        struct slave *slave)
-{
-       return bond_uses_primary(bond) && slave ? slave->dev : NULL;
-}
-
-struct net_device *bond_option_active_slave_get_rcu(struct bonding *bond)
-{
-       struct slave *slave = rcu_dereference(bond->curr_active_slave);
-
-       return __bond_option_active_slave_get(bond, slave);
-}
-
 static int bond_option_active_slave_set(struct bonding *bond,
                                        const struct bond_opt_value *newval)
 {
index 289e20443d83a3507f2700afc91f395828f45149..9d56515f4c4da8ef307ff68ccb438a5e7742ceea 100644 (file)
@@ -418,7 +418,7 @@ static int bcm_sf2_sw_fast_age_port(struct dsa_switch  *ds, int port)
        core_writel(priv, port, CORE_FAST_AGE_PORT);
 
        reg = core_readl(priv, CORE_FAST_AGE_CTRL);
-       reg |= EN_AGE_PORT | FAST_AGE_STR_DONE;
+       reg |= EN_AGE_PORT | EN_AGE_DYNAMIC | FAST_AGE_STR_DONE;
        core_writel(priv, reg, CORE_FAST_AGE_CTRL);
 
        do {
@@ -432,6 +432,8 @@ static int bcm_sf2_sw_fast_age_port(struct dsa_switch  *ds, int port)
        if (!timeout)
                return -ETIMEDOUT;
 
+       core_writel(priv, 0, CORE_FAST_AGE_CTRL);
+
        return 0;
 }
 
@@ -507,7 +509,7 @@ static int bcm_sf2_sw_br_set_stp_state(struct dsa_switch *ds, int port,
        u32 reg;
 
        reg = core_readl(priv, CORE_G_PCTL_PORT(port));
-       cur_hw_state = reg >> G_MISTP_STATE_SHIFT;
+       cur_hw_state = reg & (G_MISTP_STATE_MASK << G_MISTP_STATE_SHIFT);
 
        switch (state) {
        case BR_STATE_DISABLED:
@@ -531,10 +533,12 @@ static int bcm_sf2_sw_br_set_stp_state(struct dsa_switch *ds, int port,
        }
 
        /* Fast-age ARL entries if we are moving a port from Learning or
-        * Forwarding state to Disabled, Blocking or Listening state
+        * Forwarding (cur_hw_state) state to Disabled, Blocking or Listening
+        * state (hw_state)
         */
        if (cur_hw_state != hw_state) {
-               if (cur_hw_state & 4 && !(hw_state & 4)) {
+               if (cur_hw_state >= G_MISTP_LEARN_STATE &&
+                   hw_state <= G_MISTP_LISTEN_STATE) {
                        ret = bcm_sf2_sw_fast_age_port(ds, port);
                        if (ret) {
                                pr_err("%s: fast-ageing failed\n", __func__);
index 22e2ebf313332f4dd004162b31faf288d7f7ab25..789d7b7737da4ada78f06850eb835cd437c1040a 100644 (file)
@@ -112,8 +112,8 @@ static inline u64 name##_readq(struct bcm_sf2_priv *priv, u32 off)  \
        spin_unlock(&priv->indir_lock);                                 \
        return (u64)indir << 32 | dir;                                  \
 }                                                                      \
-static inline void name##_writeq(struct bcm_sf2_priv *priv, u32 off,   \
-                                                       u64 val)        \
+static inline void name##_writeq(struct bcm_sf2_priv *priv, u64 val,   \
+                                                       u32 off)        \
 {                                                                      \
        spin_lock(&priv->indir_lock);                                   \
        reg_writel(priv, upper_32_bits(val), REG_DIR_DATA_WRITE);       \
index d54b7400e8d820b334a45afab9003e2150732242..c2daaf087761c38c2dcbb6423c742950893a7024 100644 (file)
@@ -117,6 +117,11 @@ struct dsa_switch_driver mv88e6171_switch_driver = {
        .port_join_bridge       = mv88e6xxx_join_bridge,
        .port_leave_bridge      = mv88e6xxx_leave_bridge,
        .port_stp_update        = mv88e6xxx_port_stp_update,
+       .port_pvid_get          = mv88e6xxx_port_pvid_get,
+       .port_pvid_set          = mv88e6xxx_port_pvid_set,
+       .port_vlan_add          = mv88e6xxx_port_vlan_add,
+       .port_vlan_del          = mv88e6xxx_port_vlan_del,
+       .vlan_getnext           = mv88e6xxx_vlan_getnext,
        .port_fdb_add           = mv88e6xxx_port_fdb_add,
        .port_fdb_del           = mv88e6xxx_port_fdb_del,
        .port_fdb_getnext       = mv88e6xxx_port_fdb_getnext,
index da48e66377b5ff42dc497a5ac4685ca1e16a1eb9..fe644823ceaf951139a55fdeeb910058ecad6cd1 100644 (file)
@@ -511,8 +511,7 @@ static int tse_poll(struct napi_struct *napi, int budget)
 
        if (rxcomplete < budget) {
 
-               napi_gro_flush(napi, false);
-               __napi_complete(napi);
+               napi_complete(napi);
 
                netdev_dbg(priv->dev,
                           "NAPI Complete, did %d packets with budget %d\n",
@@ -1518,6 +1517,7 @@ static int altera_tse_probe(struct platform_device *pdev)
        spin_lock_init(&priv->tx_lock);
        spin_lock_init(&priv->rxdma_irq_lock);
 
+       netif_carrier_off(ndev);
        ret = register_netdev(ndev);
        if (ret) {
                dev_err(&pdev->dev, "failed to register TSE net device\n");
index 0660deecc2c9a77f88e4eb9a0dbe02a68d0fec86..f683d97d7614e7414ccf837ead2e2a507b07e4d3 100644 (file)
@@ -818,10 +818,9 @@ static int setup_glist(struct lio *lio)
        INIT_LIST_HEAD(&lio->glist);
 
        for (i = 0; i < lio->tx_qsize; i++) {
-               g = kmalloc(sizeof(*g), GFP_KERNEL);
+               g = kzalloc(sizeof(*g), GFP_KERNEL);
                if (!g)
                        break;
-               memset(g, 0, sizeof(struct octnic_gather));
 
                g->sg_size =
                        ((ROUNDUP4(OCTNIC_MAX_SG) >> 2) * OCT_SG_ENTRY_SIZE);
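The kmalloc()+memset() to kzalloc() conversions here, and in the jme_setup_*_resources and multicast-filter hunks further down, are behaviour-preserving: kzalloc(size, flags) is simply a zero-initialising kmalloc. Roughly:

    /* these two forms are equivalent */
    g = kzalloc(sizeof(*g), GFP_KERNEL);

    g = kmalloc(sizeof(*g), GFP_KERNEL);
    if (g)
            memset(g, 0, sizeof(*g));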
index eb22d58743e22a452c3e935694cd45f252a5f26a..f5dcde27e40281d3b7dc8cf05468eec704b98f40 100644 (file)
@@ -4568,28 +4568,23 @@ static void free_some_resources(struct adapter *adapter)
 
 static int get_chip_type(struct pci_dev *pdev, u32 pl_rev)
 {
-       int ver, chip;
        u16 device_id;
 
        /* Retrieve adapter's device ID */
        pci_read_config_word(pdev, PCI_DEVICE_ID, &device_id);
-       ver = device_id >> 12;
-       switch (ver) {
+
+       switch (device_id >> 12) {
        case CHELSIO_T4:
-               chip |= CHELSIO_CHIP_CODE(CHELSIO_T4, pl_rev);
-               break;
+               return CHELSIO_CHIP_CODE(CHELSIO_T4, pl_rev);
        case CHELSIO_T5:
-               chip |= CHELSIO_CHIP_CODE(CHELSIO_T5, pl_rev);
-               break;
+               return CHELSIO_CHIP_CODE(CHELSIO_T5, pl_rev);
        case CHELSIO_T6:
-               chip |= CHELSIO_CHIP_CODE(CHELSIO_T6, pl_rev);
-               break;
+               return CHELSIO_CHIP_CODE(CHELSIO_T6, pl_rev);
        default:
                dev_err(&pdev->dev, "Device %d is not supported\n",
                        device_id);
-               return -EINVAL;
        }
-       return chip;
+       return -EINVAL;
 }
 
 static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
@@ -4724,8 +4719,6 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
                        err = -ENOMEM;
                        goto out_free_adapter;
                }
-               t4_write_reg(adapter, SGE_STAT_CFG_A,
-                            STATSOURCE_T5_V(7) | STATMODE_V(0));
        }
 
        setup_memwin(adapter);
@@ -4737,6 +4730,11 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
        if (err)
                goto out_unmap_bar;
 
+       /* configure SGE_STAT_CFG_A to read WC stats */
+       if (!is_t4(adapter->params.chip))
+               t4_write_reg(adapter, SGE_STAT_CFG_A,
+                            STATSOURCE_T5_V(7) | STATMODE_V(0));
+
        for_each_port(adapter, i) {
                struct net_device *netdev;
 
index 78f446c58422ecd0ec0794f1f41ba403e6292290..9162746d7729559fff5c33315da186f9fe0244d5 100644 (file)
@@ -807,7 +807,7 @@ static inline unsigned int calc_tx_flits(const struct sk_buff *skb)
         * message or, if we're doing a Large Send Offload, an LSO CPL message
         * with an embedded TX Packet Write CPL message.
         */
-       flits = sgl_len(skb_shinfo(skb)->nr_frags + 1) + 4;
+       flits = sgl_len(skb_shinfo(skb)->nr_frags + 1);
        if (skb_shinfo(skb)->gso_size)
                flits += (sizeof(struct fw_eth_tx_pkt_wr) +
                          sizeof(struct cpl_tx_pkt_lso_core) +
index ab4674684acc27fb18f7c9b6fb114eb9dfa2b0d0..a32de30ea663b396833305cc6a51339c7f7c05c8 100644 (file)
@@ -762,8 +762,6 @@ enum fw_ldst_func_mod_index {
 
 struct fw_ldst_cmd {
        __be32 op_to_addrspace;
-#define FW_LDST_CMD_ADDRSPACE_S                0
-#define FW_LDST_CMD_ADDRSPACE_V(x)     ((x) << FW_LDST_CMD_ADDRSPACE_S)
        __be32 cycles_to_len16;
        union fw_ldst {
                struct fw_ldst_addrval {
@@ -788,6 +786,13 @@ struct fw_ldst_cmd {
                        __be16 vctl;
                        __be16 rval;
                } mdio;
+               struct fw_ldst_cim_rq {
+                       u8 req_first64[8];
+                       u8 req_second64[8];
+                       u8 resp_first64[8];
+                       u8 resp_second64[8];
+                       __be32 r3[2];
+               } cim_rq;
                union fw_ldst_mps {
                        struct fw_ldst_mps_rplc {
                                __be16 fid_idx;
@@ -828,9 +833,33 @@ struct fw_ldst_cmd {
                        __be16 nset_pkd;
                        __be32 data[12];
                } pcie;
+               struct fw_ldst_i2c_deprecated {
+                       u8 pid_pkd;
+                       u8 base;
+                       u8 boffset;
+                       u8 data;
+                       __be32 r9;
+               } i2c_deprecated;
+               struct fw_ldst_i2c {
+                       u8 pid;
+                       u8 did;
+                       u8 boffset;
+                       u8 blen;
+                       __be32 r9;
+                       __u8   data[48];
+               } i2c;
+               struct fw_ldst_le {
+                       __be32 index;
+                       __be32 r9;
+                       u8 val[33];
+                       u8 r11[7];
+               } le;
        } u;
 };
 
+#define FW_LDST_CMD_ADDRSPACE_S                0
+#define FW_LDST_CMD_ADDRSPACE_V(x)     ((x) << FW_LDST_CMD_ADDRSPACE_S)
+
 #define FW_LDST_CMD_MSG_S       31
 #define FW_LDST_CMD_MSG_V(x)   ((x) << FW_LDST_CMD_MSG_S)
 
index 92bafa793de6ce489f60fba98bd6c069d3a9f957..c4b262ca7d43623fba1cef20dfc6c03ebeadfd7f 100644 (file)
@@ -36,8 +36,8 @@
 #define __T4FW_VERSION_H__
 
 #define T4FW_VERSION_MAJOR 0x01
-#define T4FW_VERSION_MINOR 0x0D
-#define T4FW_VERSION_MICRO 0x20
+#define T4FW_VERSION_MINOR 0x0E
+#define T4FW_VERSION_MICRO 0x04
 #define T4FW_VERSION_BUILD 0x00
 
 #define T4FW_MIN_VERSION_MAJOR 0x01
@@ -45,8 +45,8 @@
 #define T4FW_MIN_VERSION_MICRO 0x00
 
 #define T5FW_VERSION_MAJOR 0x01
-#define T5FW_VERSION_MINOR 0x0D
-#define T5FW_VERSION_MICRO 0x20
+#define T5FW_VERSION_MINOR 0x0E
+#define T5FW_VERSION_MICRO 0x04
 #define T5FW_VERSION_BUILD 0x00
 
 #define T5FW_MIN_VERSION_MAJOR 0x00
@@ -54,8 +54,8 @@
 #define T5FW_MIN_VERSION_MICRO 0x00
 
 #define T6FW_VERSION_MAJOR 0x01
-#define T6FW_VERSION_MINOR 0x0D
-#define T6FW_VERSION_MICRO 0x2D
+#define T6FW_VERSION_MINOR 0x0E
+#define T6FW_VERSION_MICRO 0x04
 #define T6FW_VERSION_BUILD 0x00
 
 #define T6FW_MIN_VERSION_MAJOR 0x00
index c0a7813603c3d1c7f9df5bd783c395a61d83b54b..cf94b72dbacd942b9c56d2bc669854bea7c2ac18 100644 (file)
@@ -1226,7 +1226,7 @@ static irqreturn_t dm9000_interrupt(int irq, void *dev_id)
        if (int_status & ISR_PRS)
                dm9000_rx(dev);
 
-       /* Trnasmit Interrupt check */
+       /* Transmit Interrupt check */
        if (int_status & ISR_PTS)
                dm9000_tx_done(dev, db);
 
index 3be1fbdcdd0215cbd6589001b3a11c2091d309c2..eb323913cd39fb981a8c0cc02140c0c7205ee4f8 100644 (file)
@@ -1968,7 +1968,7 @@ static int __be_cmd_rx_filter(struct be_adapter *adapter, u32 flags, u32 value)
                        memcpy(req->mcast_mac[i++].byte, ha->addr, ETH_ALEN);
        }
 
-       status = be_mcc_notify(adapter);
+       status = be_mcc_notify_wait(adapter);
 err:
        spin_unlock_bh(&adapter->mcc_lock);
        return status;
index 442410cd2ca4b11baaa40039085bbe937344eec5..a2c96fd883938b6d15aa1335e4169b290f6bf241 100644 (file)
@@ -1132,10 +1132,6 @@ static int ethoc_probe(struct platform_device *pdev)
                memcpy(netdev->dev_addr, pdata->hwaddr, IFHWADDRLEN);
                priv->phy_id = pdata->phy_id;
        } else {
-               priv->phy_id = -1;
-
-#ifdef CONFIG_OF
-               {
                const uint8_t *mac;
 
                mac = of_get_property(pdev->dev.of_node,
@@ -1143,8 +1139,7 @@ static int ethoc_probe(struct platform_device *pdev)
                                      NULL);
                if (mac)
                        memcpy(netdev->dev_addr, mac, IFHWADDRLEN);
-               }
-#endif
+               priv->phy_id = -1;
        }
 
        /* Check that the given MAC address is valid. If it isn't, read the
index 91925e38705eb2c8c6c03ea41587d9801a416479..dd4ca39d5d8f62cf96190576224799d00d17b2ff 100644 (file)
@@ -1816,11 +1816,13 @@ static int fec_enet_mdio_write(struct mii_bus *bus, int mii_id, int regnum,
        struct fec_enet_private *fep = bus->priv;
        struct device *dev = &fep->pdev->dev;
        unsigned long time_left;
-       int ret = 0;
+       int ret;
 
        ret = pm_runtime_get_sync(dev);
        if (ret < 0)
                return ret;
+       else
+               ret = 0;
 
        fep->mii_timeout = 0;
        reinit_completion(&fep->mdio_done);
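pm_runtime_get_sync() can return a positive value (for instance when the device was already resumed), so only negative returns are errors; the added "else ret = 0" keeps a stray positive value from leaking out as the function's return code later. A hedged sketch of the usual runtime-PM pairing, not specific to this driver; the put_noidle() on failure is the commonly recommended balancing step rather than something this hunk adds:

	ret = pm_runtime_get_sync(dev);
	if (ret < 0) {
		pm_runtime_put_noidle(dev);	/* balance the usage count on failure */
		return ret;
	}
	ret = 0;				/* positive just means "already active" */
	/* ... access the hardware ... */
	pm_runtime_mark_last_busy(dev);
	pm_runtime_put_autosuspend(dev);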
@@ -3029,6 +3031,14 @@ fec_set_mac_address(struct net_device *ndev, void *p)
                memcpy(ndev->dev_addr, addr->sa_data, ndev->addr_len);
        }
 
+       /* Check netif status here to avoid a system hang in this case:
+        * ifconfig ethx down; ifconfig ethx hw ether xx:xx:xx:xx:xx:xx;
+        * once ethx is down, all FEC clocks are gated off and any register
+        * access after that hangs the system.
+        */
+       if (!netif_running(ndev))
+               return 0;
+
        writel(ndev->dev_addr[3] | (ndev->dev_addr[2] << 8) |
                (ndev->dev_addr[1] << 16) | (ndev->dev_addr[0] << 24),
                fep->hwp + FEC_ADDR_LOW);
index 6e9a792097d315891af8c36812e38fce15e71640..060dd39229747c4b1f43e4f9f374777bea4d95ec 100644 (file)
@@ -583,7 +583,7 @@ jme_setup_tx_resources(struct jme_adapter *jme)
        atomic_set(&txring->next_to_clean, 0);
        atomic_set(&txring->nr_free, jme->tx_ring_size);
 
-       txring->bufinf          = kmalloc(sizeof(struct jme_buffer_info) *
+       txring->bufinf          = kzalloc(sizeof(struct jme_buffer_info) *
                                        jme->tx_ring_size, GFP_ATOMIC);
        if (unlikely(!(txring->bufinf)))
                goto err_free_txring;
@@ -592,8 +592,6 @@ jme_setup_tx_resources(struct jme_adapter *jme)
         * Initialize Transmit Descriptors
         */
        memset(txring->alloc, 0, TX_RING_ALLOC_SIZE(jme->tx_ring_size));
-       memset(txring->bufinf, 0,
-               sizeof(struct jme_buffer_info) * jme->tx_ring_size);
 
        return 0;
 
@@ -845,7 +843,7 @@ jme_setup_rx_resources(struct jme_adapter *jme)
        rxring->next_to_use     = 0;
        atomic_set(&rxring->next_to_clean, 0);
 
-       rxring->bufinf          = kmalloc(sizeof(struct jme_buffer_info) *
+       rxring->bufinf          = kzalloc(sizeof(struct jme_buffer_info) *
                                        jme->rx_ring_size, GFP_ATOMIC);
        if (unlikely(!(rxring->bufinf)))
                goto err_free_rxring;
@@ -853,8 +851,6 @@ jme_setup_rx_resources(struct jme_adapter *jme)
        /*
         * Initialize Receive Descriptors
         */
-       memset(rxring->bufinf, 0,
-               sizeof(struct jme_buffer_info) * jme->rx_ring_size);
        for (i = 0 ; i < jme->rx_ring_size ; ++i) {
                if (unlikely(jme_make_new_rx_buf(jme, i))) {
                        jme_free_rx_resources(jme);
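Both ring setups get the same treatment: a zeroing allocator replaces kmalloc() followed by an explicit memset(). The transformation shown in isolation, with placeholder names (buf, n); kcalloc() is a further option when the size is a count times an element size and overflow checking is wanted:

	buf = kmalloc(n * sizeof(*buf), GFP_ATOMIC);	/* old: allocate... */
	if (buf)
		memset(buf, 0, n * sizeof(*buf));	/* ...then zero by hand */

	buf = kzalloc(n * sizeof(*buf), GFP_ATOMIC);	/* new: allocate zeroed */
	buf = kcalloc(n, sizeof(*buf), GFP_ATOMIC);	/* alternative: overflow-checked */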
index d52639bc491f7a1aa76f8bd127df8b9e68be0332..960169efe636a659241f7e8fae10706fd0503939 100644 (file)
@@ -1859,14 +1859,11 @@ oom:
                return;
        }
 
-       mc_spec = kmalloc(0x200, GFP_ATOMIC);
+       mc_spec = kzalloc(0x200, GFP_ATOMIC);
        if (mc_spec == NULL)
                goto oom;
        mc_other = mc_spec + (0x100 >> 2);
 
-       memset(mc_spec, 0, 0x100);
-       memset(mc_other, 0, 0x100);
-
        netdev_for_each_mc_addr(ha, dev) {
                u8 *a = ha->addr;
                u32 *table;
index 913b716ed2e141189a978af29ab5d54c1b606387..a946e4bf71d2a18cce11bb497ec9b737653ae652 100644 (file)
@@ -224,6 +224,26 @@ static void mlx4_en_remove(struct mlx4_dev *dev, void *endev_ptr)
        kfree(mdev);
 }
 
+static void mlx4_en_activate(struct mlx4_dev *dev, void *ctx)
+{
+       int i;
+       struct mlx4_en_dev *mdev = ctx;
+
+       /* Create a netdev for each port */
+       mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) {
+               mlx4_info(mdev, "Activating port:%d\n", i);
+               if (mlx4_en_init_netdev(mdev, i, &mdev->profile.prof[i]))
+                       mdev->pndev[i] = NULL;
+       }
+
+       /* register notifier */
+       mdev->nb.notifier_call = mlx4_en_netdev_event;
+       if (register_netdevice_notifier(&mdev->nb)) {
+               mdev->nb.notifier_call = NULL;
+               mlx4_err(mdev, "Failed to create notifier\n");
+       }
+}
+
 static void *mlx4_en_add(struct mlx4_dev *dev)
 {
        struct mlx4_en_dev *mdev;
@@ -297,21 +317,6 @@ static void *mlx4_en_add(struct mlx4_dev *dev)
        mutex_init(&mdev->state_lock);
        mdev->device_up = true;
 
-       /* Setup ports */
-
-       /* Create a netdev for each port */
-       mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) {
-               mlx4_info(mdev, "Activating port:%d\n", i);
-               if (mlx4_en_init_netdev(mdev, i, &mdev->profile.prof[i]))
-                       mdev->pndev[i] = NULL;
-       }
-       /* register notifier */
-       mdev->nb.notifier_call = mlx4_en_netdev_event;
-       if (register_netdevice_notifier(&mdev->nb)) {
-               mdev->nb.notifier_call = NULL;
-               mlx4_err(mdev, "Failed to create notifier\n");
-       }
-
        return mdev;
 
 err_mr:
@@ -335,6 +340,7 @@ static struct mlx4_interface mlx4_en_interface = {
        .event          = mlx4_en_event,
        .get_dev        = mlx4_en_get_netdev,
        .protocol       = MLX4_PROT_ETH,
+       .activate       = mlx4_en_activate,
 };
 
 static void mlx4_en_verify_params(void)
index 0d80aed5904371c2a2358a99618e7a2328b50c09..0472941af82033852106e5ab3dfc2399ca82cdaa 100644 (file)
@@ -63,8 +63,11 @@ static void mlx4_add_device(struct mlx4_interface *intf, struct mlx4_priv *priv)
                spin_lock_irq(&priv->ctx_lock);
                list_add_tail(&dev_ctx->list, &priv->ctx_list);
                spin_unlock_irq(&priv->ctx_lock);
+               if (intf->activate)
+                       intf->activate(&priv->dev, dev_ctx->context);
        } else
                kfree(dev_ctx);
+
 }
 
 static void mlx4_remove_device(struct mlx4_interface *intf, struct mlx4_priv *priv)
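The new hook is deliberately optional: mlx4_add_device() only calls it when the interface fills it in, so existing mlx4 clients that leave .activate NULL are unaffected. An illustrative, abridged shape of the interface descriptor (only the members visible in this diff are shown; the real struct has more):

	struct mlx4_interface {
		void *(*add)(struct mlx4_dev *dev);
		void  (*remove)(struct mlx4_dev *dev, void *context);
		void  (*activate)(struct mlx4_dev *dev, void *context);	/* optional, may be NULL */
		/* ... */
	};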
index 9335e5ae18ccee954b4cc08eff41a01871b41b2e..aa0d5ffe92d8177234c1975d958a76751a9539c6 100644 (file)
@@ -200,3 +200,25 @@ int mlx5_cmd_teardown_hca(struct mlx5_core_dev *dev)
 
        return err;
 }
+
+int mlx5_core_query_special_context(struct mlx5_core_dev *dev, u32 *rsvd_lkey)
+{
+       struct mlx5_cmd_query_special_contexts_mbox_in in;
+       struct mlx5_cmd_query_special_contexts_mbox_out out;
+       int err;
+
+       memset(&in, 0, sizeof(in));
+       memset(&out, 0, sizeof(out));
+       in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS);
+       err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
+       if (err)
+               return err;
+
+       if (out.hdr.status)
+               err = mlx5_cmd_status_to_err(&out.hdr);
+
+       *rsvd_lkey = be32_to_cpu(out.resd_lkey);
+
+       return err;
+}
+EXPORT_SYMBOL(mlx5_core_query_special_context);
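Because the helper is exported, upper-layer users can fetch the reserved lkey without building the mailbox command themselves. A hypothetical caller, with illustrative variable names:

	u32 rsvd_lkey;
	int err;

	err = mlx5_core_query_special_context(mdev, &rsvd_lkey);
	if (err)
		return err;
	/* cache rsvd_lkey for later memory-registration setup */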
index 5ab3adf88166c5d2ef18c64824e078b756b41060..9f0bdd993955cab628d676d079ceb0d6bd37e805 100644 (file)
@@ -918,8 +918,6 @@ int qlcnic_83xx_alloc_mbx_args(struct qlcnic_cmd_args *mbx,
                                mbx->req.arg = NULL;
                                return -ENOMEM;
                        }
-                       memset(mbx->req.arg, 0, sizeof(u32) * mbx->req.num);
-                       memset(mbx->rsp.arg, 0, sizeof(u32) * mbx->rsp.num);
                        temp = adapter->ahw->fw_hal_version << 29;
                        mbx->req.arg[0] = (type | (mbx->req.num << 16) | temp);
                        mbx->cmd_op = type;
index 6e6f18fc5d7698b08be53682ebafbd2ff1ce614c..a5f422f26cb4396a8dc25b65557c83793e956d94 100644 (file)
@@ -73,8 +73,6 @@ int qlcnic_82xx_alloc_mbx_args(struct qlcnic_cmd_args *mbx,
                                mbx->req.arg = NULL;
                                return -ENOMEM;
                        }
-                       memset(mbx->req.arg, 0, sizeof(u32) * mbx->req.num);
-                       memset(mbx->rsp.arg, 0, sizeof(u32) * mbx->rsp.num);
                        mbx->req.arg[0] = type;
                        break;
                }
index 546cd5f1c85aeba1d2d74f01986995858f443206..7327b729ba2eae4119efff54ae2a199a77b11da3 100644 (file)
@@ -729,8 +729,6 @@ static int qlcnic_sriov_alloc_bc_mbx_args(struct qlcnic_cmd_args *mbx, u32 type)
                                mbx->req.arg = NULL;
                                return -ENOMEM;
                        }
-                       memset(mbx->req.arg, 0, sizeof(u32) * mbx->req.num);
-                       memset(mbx->rsp.arg, 0, sizeof(u32) * mbx->rsp.num);
                        mbx->req.arg[0] = (type | (mbx->req.num << 16) |
                                           (3 << 29));
                        mbx->rsp.arg[0] = (type & 0xffff) | mbx->rsp.num << 16;
index 24dcbe62412a10a457bc5fa7ea8ce0ed1f016221..2b32e0c5a0b46bcdb4e50d1931c676f2f65503d0 100644 (file)
@@ -833,7 +833,8 @@ struct rtl8169_private {
        unsigned features;
 
        struct mii_if_info mii;
-       struct rtl8169_counters counters;
+       dma_addr_t counters_phys_addr;
+       struct rtl8169_counters *counters;
        struct rtl8169_tc_offsets tc_offset;
        u32 saved_wolopts;
        u32 opts1_mask;
@@ -2190,53 +2191,37 @@ static int rtl8169_get_sset_count(struct net_device *dev, int sset)
        }
 }
 
-static struct rtl8169_counters *rtl8169_map_counters(struct net_device *dev,
-                                                    dma_addr_t *paddr,
-                                                    u32 counter_cmd)
+DECLARE_RTL_COND(rtl_counters_cond)
 {
-       struct rtl8169_private *tp = netdev_priv(dev);
        void __iomem *ioaddr = tp->mmio_addr;
-       struct device *d = &tp->pci_dev->dev;
-       struct rtl8169_counters *counters;
-       u32 cmd;
 
-       counters = dma_alloc_coherent(d, sizeof(*counters), paddr, GFP_KERNEL);
-       if (counters) {
-               RTL_W32(CounterAddrHigh, (u64)*paddr >> 32);
-               cmd = (u64)*paddr & DMA_BIT_MASK(32);
-               RTL_W32(CounterAddrLow, cmd);
-               RTL_W32(CounterAddrLow, cmd | counter_cmd);
-       }
-       return counters;
+       return RTL_R32(CounterAddrLow) & (CounterReset | CounterDump);
 }
 
-static void rtl8169_unmap_counters (struct net_device *dev,
-                                   dma_addr_t paddr,
-                                   struct rtl8169_counters *counters)
+static bool rtl8169_do_counters(struct net_device *dev, u32 counter_cmd)
 {
        struct rtl8169_private *tp = netdev_priv(dev);
        void __iomem *ioaddr = tp->mmio_addr;
-       struct device *d = &tp->pci_dev->dev;
+       dma_addr_t paddr = tp->counters_phys_addr;
+       u32 cmd;
+       bool ret;
 
-       RTL_W32(CounterAddrLow, 0);
-       RTL_W32(CounterAddrHigh, 0);
+       RTL_W32(CounterAddrHigh, (u64)paddr >> 32);
+       cmd = (u64)paddr & DMA_BIT_MASK(32);
+       RTL_W32(CounterAddrLow, cmd);
+       RTL_W32(CounterAddrLow, cmd | counter_cmd);
 
-       dma_free_coherent(d, sizeof(*counters), counters, paddr);
-}
+       ret = rtl_udelay_loop_wait_low(tp, &rtl_counters_cond, 10, 1000);
 
-DECLARE_RTL_COND(rtl_reset_counters_cond)
-{
-       void __iomem *ioaddr = tp->mmio_addr;
+       RTL_W32(CounterAddrLow, 0);
+       RTL_W32(CounterAddrHigh, 0);
 
-       return RTL_R32(CounterAddrLow) & CounterReset;
+       return ret;
 }
 
 static bool rtl8169_reset_counters(struct net_device *dev)
 {
        struct rtl8169_private *tp = netdev_priv(dev);
-       struct rtl8169_counters *counters;
-       dma_addr_t paddr;
-       bool ret = true;
 
        /*
         * Versions prior to RTL_GIGA_MAC_VER_19 don't support resetting the
@@ -2245,32 +2230,13 @@ static bool rtl8169_reset_counters(struct net_device *dev)
        if (tp->mac_version < RTL_GIGA_MAC_VER_19)
                return true;
 
-       counters = rtl8169_map_counters(dev, &paddr, CounterReset);
-       if (!counters)
-               return false;
-
-       if (!rtl_udelay_loop_wait_low(tp, &rtl_reset_counters_cond, 10, 1000))
-               ret = false;
-
-       rtl8169_unmap_counters(dev, paddr, counters);
-
-       return ret;
-}
-
-DECLARE_RTL_COND(rtl_counters_cond)
-{
-       void __iomem *ioaddr = tp->mmio_addr;
-
-       return RTL_R32(CounterAddrLow) & CounterDump;
+       return rtl8169_do_counters(dev, CounterReset);
 }
 
 static bool rtl8169_update_counters(struct net_device *dev)
 {
        struct rtl8169_private *tp = netdev_priv(dev);
        void __iomem *ioaddr = tp->mmio_addr;
-       struct rtl8169_counters *counters;
-       dma_addr_t paddr;
-       bool ret = true;
 
        /*
         * Some chips are unable to dump tally counters when the receiver
@@ -2279,23 +2245,13 @@ static bool rtl8169_update_counters(struct net_device *dev)
        if ((RTL_R8(ChipCmd) & CmdRxEnb) == 0)
                return true;
 
-       counters = rtl8169_map_counters(dev, &paddr, CounterDump);
-       if (!counters)
-               return false;
-
-       if (rtl_udelay_loop_wait_low(tp, &rtl_counters_cond, 10, 1000))
-               memcpy(&tp->counters, counters, sizeof(*counters));
-       else
-               ret = false;
-
-       rtl8169_unmap_counters(dev, paddr, counters);
-
-       return ret;
+       return rtl8169_do_counters(dev, CounterDump);
 }
 
 static bool rtl8169_init_counter_offsets(struct net_device *dev)
 {
        struct rtl8169_private *tp = netdev_priv(dev);
+       struct rtl8169_counters *counters = tp->counters;
        bool ret = false;
 
        /*
@@ -2323,9 +2279,9 @@ static bool rtl8169_init_counter_offsets(struct net_device *dev)
        if (rtl8169_update_counters(dev))
                ret = true;
 
-       tp->tc_offset.tx_errors = tp->counters.tx_errors;
-       tp->tc_offset.tx_multi_collision = tp->counters.tx_multi_collision;
-       tp->tc_offset.tx_aborted = tp->counters.tx_aborted;
+       tp->tc_offset.tx_errors = counters->tx_errors;
+       tp->tc_offset.tx_multi_collision = counters->tx_multi_collision;
+       tp->tc_offset.tx_aborted = counters->tx_aborted;
        tp->tc_offset.inited = true;
 
        return ret;
@@ -2335,24 +2291,25 @@ static void rtl8169_get_ethtool_stats(struct net_device *dev,
                                      struct ethtool_stats *stats, u64 *data)
 {
        struct rtl8169_private *tp = netdev_priv(dev);
+       struct rtl8169_counters *counters = tp->counters;
 
        ASSERT_RTNL();
 
        rtl8169_update_counters(dev);
 
-       data[0] = le64_to_cpu(tp->counters.tx_packets);
-       data[1] = le64_to_cpu(tp->counters.rx_packets);
-       data[2] = le64_to_cpu(tp->counters.tx_errors);
-       data[3] = le32_to_cpu(tp->counters.rx_errors);
-       data[4] = le16_to_cpu(tp->counters.rx_missed);
-       data[5] = le16_to_cpu(tp->counters.align_errors);
-       data[6] = le32_to_cpu(tp->counters.tx_one_collision);
-       data[7] = le32_to_cpu(tp->counters.tx_multi_collision);
-       data[8] = le64_to_cpu(tp->counters.rx_unicast);
-       data[9] = le64_to_cpu(tp->counters.rx_broadcast);
-       data[10] = le32_to_cpu(tp->counters.rx_multicast);
-       data[11] = le16_to_cpu(tp->counters.tx_aborted);
-       data[12] = le16_to_cpu(tp->counters.tx_underun);
+       data[0] = le64_to_cpu(counters->tx_packets);
+       data[1] = le64_to_cpu(counters->rx_packets);
+       data[2] = le64_to_cpu(counters->tx_errors);
+       data[3] = le32_to_cpu(counters->rx_errors);
+       data[4] = le16_to_cpu(counters->rx_missed);
+       data[5] = le16_to_cpu(counters->align_errors);
+       data[6] = le32_to_cpu(counters->tx_one_collision);
+       data[7] = le32_to_cpu(counters->tx_multi_collision);
+       data[8] = le64_to_cpu(counters->rx_unicast);
+       data[9] = le64_to_cpu(counters->rx_broadcast);
+       data[10] = le32_to_cpu(counters->rx_multicast);
+       data[11] = le16_to_cpu(counters->tx_aborted);
+       data[12] = le16_to_cpu(counters->tx_underun);
 }
 
 static void rtl8169_get_strings(struct net_device *dev, u32 stringset, u8 *data)
@@ -7780,6 +7737,7 @@ rtl8169_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 {
        struct rtl8169_private *tp = netdev_priv(dev);
        void __iomem *ioaddr = tp->mmio_addr;
+       struct rtl8169_counters *counters = tp->counters;
        unsigned int start;
 
        if (netif_running(dev))
@@ -7816,11 +7774,11 @@ rtl8169_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
         * Subtract values fetched during initialization.
         * See rtl8169_init_counter_offsets for a description of why we do that.
         */
-       stats->tx_errors = le64_to_cpu(tp->counters.tx_errors) -
+       stats->tx_errors = le64_to_cpu(counters->tx_errors) -
                le64_to_cpu(tp->tc_offset.tx_errors);
-       stats->collisions = le32_to_cpu(tp->counters.tx_multi_collision) -
+       stats->collisions = le32_to_cpu(counters->tx_multi_collision) -
                le32_to_cpu(tp->tc_offset.tx_multi_collision);
-       stats->tx_aborted_errors = le16_to_cpu(tp->counters.tx_aborted) -
+       stats->tx_aborted_errors = le16_to_cpu(counters->tx_aborted) -
                le16_to_cpu(tp->tc_offset.tx_aborted);
 
        return stats;
@@ -8022,6 +7980,9 @@ static void rtl_remove_one(struct pci_dev *pdev)
 
        unregister_netdev(dev);
 
+       dma_free_coherent(&tp->pci_dev->dev, sizeof(*tp->counters),
+                         tp->counters, tp->counters_phys_addr);
+
        rtl_release_firmware(tp);
 
        if (pci_dev_run_wake(pdev))
@@ -8447,9 +8408,16 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 
        tp->rtl_fw = RTL_FIRMWARE_UNKNOWN;
 
+       tp->counters = dma_alloc_coherent(&pdev->dev, sizeof(*tp->counters),
+                                          &tp->counters_phys_addr, GFP_KERNEL);
+       if (!tp->counters) {
+               rc = -ENOMEM;
+               goto err_out_msi_4;
+       }
+
        rc = register_netdev(dev);
        if (rc < 0)
-               goto err_out_msi_4;
+               goto err_out_cnt_5;
 
        pci_set_drvdata(pdev, dev);
 
@@ -8483,6 +8451,9 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 out:
        return rc;
 
+err_out_cnt_5:
+       dma_free_coherent(&pdev->dev, sizeof(*tp->counters), tp->counters,
+                         tp->counters_phys_addr);
 err_out_msi_4:
        netif_napi_del(&tp->napi);
        rtl_disable_msi(pdev, tp);
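Taken together, these hunks change the tally-counter block from a struct embedded in rtl8169_private (plus a temporary coherent buffer allocated for every dump or reset) into one coherent DMA buffer with probe-to-remove lifetime: allocated in rtl_init_one(), freed on the register_netdev() error path and in rtl_remove_one(), and re-dumped into by the hardware on each counter operation. The allocation/release pair, shown in isolation (the early return is illustrative; the real code jumps to an error label):

	tp->counters = dma_alloc_coherent(&pdev->dev, sizeof(*tp->counters),
					  &tp->counters_phys_addr, GFP_KERNEL);
	if (!tp->counters)
		return -ENOMEM;

	dma_free_coherent(&pdev->dev, sizeof(*tp->counters),
			  tp->counters, tp->counters_phys_addr);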
index 864b476f7fd5a33b81ac2f6ea9b08e0cf99d299d..925f2f8659b8f181fc9a447329180cbc4a12a5e9 100644 (file)
@@ -837,8 +837,11 @@ static int stmmac_init_phy(struct net_device *dev)
                                     interface);
        }
 
-       if (IS_ERR(phydev)) {
+       if (IS_ERR_OR_NULL(phydev)) {
                pr_err("%s: Could not attach to PHY\n", dev->name);
+               if (!phydev)
+                       return -ENODEV;
+
                return PTR_ERR(phydev);
        }
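The PHY attach path can fail either with an ERR_PTR() (phy_connect) or with plain NULL (of_phy_connect), and PTR_ERR(NULL) evaluates to 0, i.e. success. The shape of the fix in isolation:

	if (IS_ERR_OR_NULL(phydev)) {
		if (!phydev)
			return -ENODEV;		/* NULL: no encoded errno to decode */
		return PTR_ERR(phydev);		/* ERR_PTR(): propagate the errno */
	}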
 
index a8f315106742d9b1f5c06fbfaa90520ffa8323ab..8276ee5a7d541b2e27bb1e9a70ee74532e8a155e 100644 (file)
@@ -20,7 +20,7 @@ config SYNOPSYS_DWC_ETH_QOS
        select PHYLIB
        select CRC32
        select MII
-       depends on OF
+       depends on OF && HAS_DMA
        ---help---
          This driver supports the DWC Ethernet QoS from Synopsys
 
index d8757bf9ad755ed6a3114d9d0a9664e59618744a..a9acf7156855546080e850bc07c7a5380ccc1980 100644 (file)
@@ -61,11 +61,21 @@ MODULE_VERSION(NTB_NETDEV_VER);
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_AUTHOR("Intel Corporation");
 
+/* Time in usecs for tx resource reaper */
+static unsigned int tx_time = 1;
+
+/* Number of descriptors to free before resuming tx */
+static unsigned int tx_start = 10;
+
+/* Number of descriptors still available before stopping upper-layer tx */
+static unsigned int tx_stop = 5;
+
 struct ntb_netdev {
        struct list_head list;
        struct pci_dev *pdev;
        struct net_device *ndev;
        struct ntb_transport_qp *qp;
+       struct timer_list tx_timer;
 };
 
 #define        NTB_TX_TIMEOUT_MS       1000
@@ -136,11 +146,42 @@ enqueue_again:
        }
 }
 
+static int __ntb_netdev_maybe_stop_tx(struct net_device *netdev,
+                                     struct ntb_transport_qp *qp, int size)
+{
+       struct ntb_netdev *dev = netdev_priv(netdev);
+
+       netif_stop_queue(netdev);
+       /* Make sure to see the latest value of ntb_transport_tx_free_entry()
+        * since the queue was last started.
+        */
+       smp_mb();
+
+       if (likely(ntb_transport_tx_free_entry(qp) < size)) {
+               mod_timer(&dev->tx_timer, jiffies + usecs_to_jiffies(tx_time));
+               return -EBUSY;
+       }
+
+       netif_start_queue(netdev);
+       return 0;
+}
+
+static int ntb_netdev_maybe_stop_tx(struct net_device *ndev,
+                                   struct ntb_transport_qp *qp, int size)
+{
+       if (netif_queue_stopped(ndev) ||
+           (ntb_transport_tx_free_entry(qp) >= size))
+               return 0;
+
+       return __ntb_netdev_maybe_stop_tx(ndev, qp, size);
+}
+
 static void ntb_netdev_tx_handler(struct ntb_transport_qp *qp, void *qp_data,
                                  void *data, int len)
 {
        struct net_device *ndev = qp_data;
        struct sk_buff *skb;
+       struct ntb_netdev *dev = netdev_priv(ndev);
 
        skb = data;
        if (!skb || !ndev)
@@ -155,6 +196,15 @@ static void ntb_netdev_tx_handler(struct ntb_transport_qp *qp, void *qp_data,
        }
 
        dev_kfree_skb(skb);
+
+       if (ntb_transport_tx_free_entry(dev->qp) >= tx_start) {
+               /* Make sure anybody stopping the queue after this sees the new
+                * value of ntb_transport_tx_free_entry()
+                */
+               smp_mb();
+               if (netif_queue_stopped(ndev))
+                       netif_wake_queue(ndev);
+       }
 }
 
 static netdev_tx_t ntb_netdev_start_xmit(struct sk_buff *skb,
@@ -163,10 +213,15 @@ static netdev_tx_t ntb_netdev_start_xmit(struct sk_buff *skb,
        struct ntb_netdev *dev = netdev_priv(ndev);
        int rc;
 
+       ntb_netdev_maybe_stop_tx(ndev, dev->qp, tx_stop);
+
        rc = ntb_transport_tx_enqueue(dev->qp, skb, skb->data, skb->len);
        if (rc)
                goto err;
 
+       /* check for next submit */
+       ntb_netdev_maybe_stop_tx(ndev, dev->qp, tx_stop);
+
        return NETDEV_TX_OK;
 
 err:
@@ -175,6 +230,23 @@ err:
        return NETDEV_TX_BUSY;
 }
 
+static void ntb_netdev_tx_timer(unsigned long data)
+{
+       struct net_device *ndev = (struct net_device *)data;
+       struct ntb_netdev *dev = netdev_priv(ndev);
+
+       if (ntb_transport_tx_free_entry(dev->qp) < tx_stop) {
+               mod_timer(&dev->tx_timer, jiffies + msecs_to_jiffies(tx_time));
+       } else {
+               /* Make sure anybody stopping the queue after this sees the new
+                * value of ntb_transport_tx_free_entry()
+                */
+               smp_mb();
+               if (netif_queue_stopped(ndev))
+                       netif_wake_queue(ndev);
+       }
+}
+
 static int ntb_netdev_open(struct net_device *ndev)
 {
        struct ntb_netdev *dev = netdev_priv(ndev);
@@ -197,8 +269,11 @@ static int ntb_netdev_open(struct net_device *ndev)
                }
        }
 
+       setup_timer(&dev->tx_timer, ntb_netdev_tx_timer, (unsigned long)ndev);
+
        netif_carrier_off(ndev);
        ntb_transport_link_up(dev->qp);
+       netif_start_queue(ndev);
 
        return 0;
 
@@ -219,6 +294,8 @@ static int ntb_netdev_close(struct net_device *ndev)
        while ((skb = ntb_transport_rx_remove(dev->qp, &len)))
                dev_kfree_skb(skb);
 
+       del_timer_sync(&dev->tx_timer);
+
        return 0;
 }
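The additions implement the standard stop/wake flow-control handshake: the transmit path stops the queue, issues a memory barrier, then re-checks the free-entry count so a completion racing with the stop can restart the queue; the completion and timer paths mirror the barrier before testing netif_queue_stopped(). A condensed sketch of the two halves, where free_entries() is a placeholder for ntb_transport_tx_free_entry(qp):

	/* transmit side */
	netif_stop_queue(ndev);
	smp_mb();				/* pairs with the barrier below */
	if (free_entries() >= tx_stop)
		netif_start_queue(ndev);	/* a completion slipped in: keep going */

	/* completion side */
	if (free_entries() >= tx_start) {
		smp_mb();			/* publish the new count before the test */
		if (netif_queue_stopped(ndev))
			netif_wake_queue(ndev);
	}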
 
index c07030dbe7484b50e1f49c9dff6a35ba6cf0fc95..c5ad98ace5d0abeff5d6ef8f3f028e537b241a8e 100644 (file)
@@ -127,6 +127,11 @@ config DP83867_PHY
        ---help---
          Currently supports the DP83867 PHY.
 
+config MICROCHIP_PHY
+       tristate "Drivers for Microchip PHYs"
+       help
+         Supports the LAN88XX PHYs.
+
 config FIXED_PHY
        tristate "Driver for MDIO Bus/PHY emulation with fixed speed/link PHYs"
        depends on PHYLIB
index 9bb103358c74d2c87054c08f0c35d1d83609a43a..87f079c4b2c7ab16e5577b0b86fdd8509f9b7c8f 100644 (file)
@@ -37,3 +37,4 @@ obj-$(CONFIG_MDIO_BUS_MUX_MMIOREG) += mdio-mux-mmioreg.o
 obj-$(CONFIG_MDIO_SUN4I)       += mdio-sun4i.o
 obj-$(CONFIG_MDIO_MOXART)      += mdio-moxart.o
 obj-$(CONFIG_MDIO_BCM_UNIMAC)  += mdio-bcm-unimac.o
+obj-$(CONFIG_MICROCHIP_PHY)    += microchip.o
index 12c7eb2c604e15021b2f5fb78120004708fb22e8..fb1299c6326ec1f388d5c544e4607645274cbb08 100644 (file)
@@ -325,7 +325,7 @@ struct phy_device *fixed_phy_register(unsigned int irq,
        phy_addr = phy_fixed_addr++;
        spin_unlock(&phy_fixed_addr_lock);
 
-       ret = fixed_phy_add(PHY_POLL, phy_addr, status, link_gpio);
+       ret = fixed_phy_add(irq, phy_addr, status, link_gpio);
        if (ret < 0)
                return ERR_PTR(ret);
 
diff --git a/drivers/net/phy/microchip.c b/drivers/net/phy/microchip.c
new file mode 100644 (file)
index 0000000..c0a20eb
--- /dev/null
@@ -0,0 +1,148 @@
+/*
+ * Copyright (C) 2015 Microchip Technology
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mii.h>
+#include <linux/ethtool.h>
+#include <linux/phy.h>
+#include <linux/microchipphy.h>
+
+#define DRIVER_AUTHOR  "WOOJUNG HUH <woojung.huh@microchip.com>"
+#define DRIVER_DESC    "Microchip LAN88XX PHY driver"
+
+struct lan88xx_priv {
+       int     chip_id;
+       int     chip_rev;
+       __u32   wolopts;
+};
+
+static int lan88xx_phy_config_intr(struct phy_device *phydev)
+{
+       int rc;
+
+       if (phydev->interrupts == PHY_INTERRUPT_ENABLED) {
+               /* unmask all sources and clear them before enabling */
+               rc = phy_write(phydev, LAN88XX_INT_MASK, 0x7FFF);
+               rc = phy_read(phydev, LAN88XX_INT_STS);
+               rc = phy_write(phydev, LAN88XX_INT_MASK,
+                              LAN88XX_INT_MASK_MDINTPIN_EN_ |
+                              LAN88XX_INT_MASK_LINK_CHANGE_);
+       } else {
+               rc = phy_write(phydev, LAN88XX_INT_MASK, 0);
+       }
+
+       return rc < 0 ? rc : 0;
+}
+
+static int lan88xx_phy_ack_interrupt(struct phy_device *phydev)
+{
+       int rc = phy_read(phydev, LAN88XX_INT_STS);
+
+       return rc < 0 ? rc : 0;
+}
+
+int lan88xx_suspend(struct phy_device *phydev)
+{
+       struct lan88xx_priv *priv = phydev->priv;
+
+       /* do not power down PHY when WOL is enabled */
+       if (!priv->wolopts)
+               genphy_suspend(phydev);
+
+       return 0;
+}
+
+static int lan88xx_probe(struct phy_device *phydev)
+{
+       struct device *dev = &phydev->dev;
+       struct lan88xx_priv *priv;
+
+       priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
+       if (!priv)
+               return -ENOMEM;
+
+       priv->wolopts = 0;
+
+       /* these values can be used to identify internal PHY */
+       priv->chip_id = phy_read_mmd_indirect(phydev, LAN88XX_MMD3_CHIP_ID,
+                                             3, phydev->addr);
+       priv->chip_rev = phy_read_mmd_indirect(phydev, LAN88XX_MMD3_CHIP_REV,
+                                              3, phydev->addr);
+
+       phydev->priv = priv;
+
+       return 0;
+}
+
+static void lan88xx_remove(struct phy_device *phydev)
+{
+       struct device *dev = &phydev->dev;
+       struct lan88xx_priv *priv = phydev->priv;
+
+       if (priv)
+               devm_kfree(dev, priv);
+}
+
+static int lan88xx_set_wol(struct phy_device *phydev,
+                          struct ethtool_wolinfo *wol)
+{
+       struct lan88xx_priv *priv = phydev->priv;
+
+       priv->wolopts = wol->wolopts;
+
+       return 0;
+}
+
+static struct phy_driver microchip_phy_driver[] = {
+{
+       .phy_id         = 0x0007c130,
+       .phy_id_mask    = 0xfffffff0,
+       .name           = "Microchip LAN88xx",
+
+       .features       = (PHY_GBIT_FEATURES |
+                          SUPPORTED_Pause | SUPPORTED_Asym_Pause),
+       .flags          = PHY_HAS_INTERRUPT | PHY_HAS_MAGICANEG,
+
+       .probe          = lan88xx_probe,
+       .remove         = lan88xx_remove,
+
+       .config_init    = genphy_config_init,
+       .config_aneg    = genphy_config_aneg,
+       .read_status    = genphy_read_status,
+
+       .ack_interrupt  = lan88xx_phy_ack_interrupt,
+       .config_intr    = lan88xx_phy_config_intr,
+
+       .suspend        = lan88xx_suspend,
+       .resume         = genphy_resume,
+       .set_wol        = lan88xx_set_wol,
+
+       .driver         = { .owner = THIS_MODULE, }
+} };
+
+module_phy_driver(microchip_phy_driver);
+
+static struct mdio_device_id __maybe_unused microchip_tbl[] = {
+       { 0x0007c130, 0xfffffff0 },
+       { }
+};
+
+MODULE_DEVICE_TABLE(mdio, microchip_tbl);
+
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
+MODULE_LICENSE("GPL");
index 39364a45af4043880fe3d195bb381fcfb8858e74..a39518fc93aadf82918fd944dcf88c6de5321f48 100644 (file)
@@ -1049,8 +1049,7 @@ static int lan78xx_link_reset(struct lan78xx_net *dev)
 {
        struct mii_if_info *mii = &dev->mii;
        struct ethtool_cmd ecmd = { .cmd = ETHTOOL_GSET };
-       u16 ladv, radv;
-       int ret;
+       int ladv, radv, ret;
        u32 buf;
 
        /* clear PHY interrupt status */
@@ -1104,12 +1103,12 @@ static int lan78xx_link_reset(struct lan78xx_net *dev)
                }
 
                ladv = lan78xx_mdio_read(dev->net, mii->phy_id, MII_ADVERTISE);
-               if (unlikely(ladv < 0))
-                       return -EIO;
+               if (ladv < 0)
+                       return ladv;
 
                radv = lan78xx_mdio_read(dev->net, mii->phy_id, MII_LPA);
-               if (unlikely(radv < 0))
-                       return -EIO;
+               if (radv < 0)
+                       return radv;
 
                netif_dbg(dev, link, dev->net,
                          "speed: %u duplex: %d anadv: 0x%04x anlpa: 0x%04x",
index fe4ec324aebc0284f3a72849dcfc6cc9187196a8..d9427ca3dba79628f402867b83e735bee78fac3b 100644 (file)
 #include <linux/mdio.h>
 #include <linux/usb/cdc.h>
 
-/* Version Information */
-#define DRIVER_VERSION "v1.08.1 (2015/07/28)"
+/* Information for net-next */
+#define NETNEXT_VERSION                "08"
+
+/* Information for net */
+#define NET_VERSION            "2"
+
+#define DRIVER_VERSION         "v1." NETNEXT_VERSION "." NET_VERSION
 #define DRIVER_AUTHOR "Realtek linux nic maintainers <nic_swsd@realtek.com>"
 #define DRIVER_DESC "Realtek RTL8152/RTL8153 Based USB Ethernet Adapters"
 #define MODULENAME "r8152"
 #define OCP_EEE_ABLE           0xa5c4
 #define OCP_EEE_ADV            0xa5d0
 #define OCP_EEE_LPABLE         0xa5d2
+#define OCP_PHY_STATE          0xa708          /* nway state for 8153 */
 #define OCP_ADC_CFG            0xbc06
 
 /* SRAM Register */
 /* OCP_DOWN_SPEED */
 #define EN_10M_BGOFF           0x0080
 
+/* OCP_PHY_STATE */
+#define TXDIS_STATE            0x01
+#define ABD_STATE              0x02
+
 /* OCP_ADC_CFG */
 #define CKADSEL_L              0x0100
 #define ADC_EN                 0x0080
@@ -604,6 +614,7 @@ struct r8152 {
                void (*unload)(struct r8152 *);
                int (*eee_get)(struct r8152 *, struct ethtool_eee *);
                int (*eee_set)(struct r8152 *, struct ethtool_eee *);
+               bool (*in_nway)(struct r8152 *);
        } rtl_ops;
 
        int intr_interval;
@@ -2941,6 +2952,32 @@ static void rtl8153_down(struct r8152 *tp)
        r8153_enable_aldps(tp);
 }
 
+static bool rtl8152_in_nway(struct r8152 *tp)
+{
+       u16 nway_state;
+
+       ocp_write_word(tp, MCU_TYPE_PLA, PLA_OCP_GPHY_BASE, 0x2000);
+       tp->ocp_base = 0x2000;
+       ocp_write_byte(tp, MCU_TYPE_PLA, 0xb014, 0x4c);         /* phy state */
+       nway_state = ocp_read_word(tp, MCU_TYPE_PLA, 0xb01a);
+
+       /* bit 15: TXDIS_STATE, bit 14: ABD_STATE */
+       if (nway_state & 0xc000)
+               return false;
+       else
+               return true;
+}
+
+static bool rtl8153_in_nway(struct r8152 *tp)
+{
+       u16 phy_state = ocp_reg_read(tp, OCP_PHY_STATE) & 0xff;
+
+       if (phy_state == TXDIS_STATE || phy_state == ABD_STATE)
+               return false;
+       else
+               return true;
+}
+
 static void set_carrier(struct r8152 *tp)
 {
        struct net_device *netdev = tp->netdev;
@@ -3405,6 +3442,27 @@ static int rtl8152_post_reset(struct usb_interface *intf)
        return 0;
 }
 
+static bool delay_autosuspend(struct r8152 *tp)
+{
+       bool sw_linking = !!netif_carrier_ok(tp->netdev);
+       bool hw_linking = !!(rtl8152_get_speed(tp) & LINK_STATUS);
+
+       /* This means a link change has occurred that the driver has not
+        * detected yet. If the driver has disabled tx/rx while the hardware
+        * link is up, the device would not be woken by incoming packets.
+        */
+       if (work_busy(&tp->schedule.work) || sw_linking != hw_linking)
+               return true;
+
+       /* If the link went down because of autonegotiation (nway), the device
+        * may miss the link change event and would not wake when the link
+        * comes back up.
+        */
+       if (!sw_linking && tp->rtl_ops.in_nway(tp))
+               return true;
+       else
+               return false;
+}
+
 static int rtl8152_suspend(struct usb_interface *intf, pm_message_t message)
 {
        struct r8152 *tp = usb_get_intfdata(intf);
@@ -3414,7 +3472,7 @@ static int rtl8152_suspend(struct usb_interface *intf, pm_message_t message)
        mutex_lock(&tp->control);
 
        if (PMSG_IS_AUTO(message)) {
-               if (netif_running(netdev) && work_busy(&tp->schedule.work)) {
+               if (netif_running(netdev) && delay_autosuspend(tp)) {
                        ret = -EBUSY;
                        goto out1;
                }
@@ -4044,6 +4102,7 @@ static int rtl_ops_init(struct r8152 *tp)
                ops->unload             = rtl8152_unload;
                ops->eee_get            = r8152_get_eee;
                ops->eee_set            = r8152_set_eee;
+               ops->in_nway            = rtl8152_in_nway;
                break;
 
        case RTL_VER_03:
@@ -4058,6 +4117,7 @@ static int rtl_ops_init(struct r8152 *tp)
                ops->unload             = rtl8153_unload;
                ops->eee_get            = r8153_get_eee;
                ops->eee_set            = r8153_set_eee;
+               ops->in_nway            = rtl8153_in_nway;
                break;
 
        default:
index e0498571ae267c1d74dc3a65ed6d59c54cb92d1a..b4cf10781348effecf9ca1f6896c0aeab852bba2 100644 (file)
@@ -428,12 +428,18 @@ static enum skb_state defer_bh(struct usbnet *dev, struct sk_buff *skb,
        old_state = entry->state;
        entry->state = state;
        __skb_unlink(skb, list);
-       spin_unlock(&list->lock);
-       spin_lock(&dev->done.lock);
+
+       /* defer_bh() is never called with list == &dev->done.
+        * spin_lock_nested() tells lockdep that it is OK to take
+        * dev->done.lock here with list->lock held.
+        */
+       spin_lock_nested(&dev->done.lock, SINGLE_DEPTH_NESTING);
+
        __skb_queue_tail(&dev->done, skb);
        if (dev->done.qlen == 1)
                tasklet_schedule(&dev->bh);
-       spin_unlock_irqrestore(&dev->done.lock, flags);
+       spin_unlock(&dev->done.lock);
+       spin_unlock_irqrestore(&list->lock, flags);
        return old_state;
 }
 
@@ -749,6 +755,20 @@ EXPORT_SYMBOL_GPL(usbnet_unlink_rx_urbs);
 
 /*-------------------------------------------------------------------------*/
 
+static void wait_skb_queue_empty(struct sk_buff_head *q)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&q->lock, flags);
+       while (!skb_queue_empty(q)) {
+               spin_unlock_irqrestore(&q->lock, flags);
+               schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS));
+               set_current_state(TASK_UNINTERRUPTIBLE);
+               spin_lock_irqsave(&q->lock, flags);
+       }
+       spin_unlock_irqrestore(&q->lock, flags);
+}
+
 // precondition: never called in_interrupt
 static void usbnet_terminate_urbs(struct usbnet *dev)
 {
@@ -762,14 +782,11 @@ static void usbnet_terminate_urbs(struct usbnet *dev)
                unlink_urbs(dev, &dev->rxq);
 
        /* maybe wait for deletions to finish. */
-       while (!skb_queue_empty(&dev->rxq)
-               && !skb_queue_empty(&dev->txq)
-               && !skb_queue_empty(&dev->done)) {
-                       schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS));
-                       set_current_state(TASK_UNINTERRUPTIBLE);
-                       netif_dbg(dev, ifdown, dev->net,
-                                 "waited for %d urb completions\n", temp);
-       }
+       wait_skb_queue_empty(&dev->rxq);
+       wait_skb_queue_empty(&dev->txq);
+       wait_skb_queue_empty(&dev->done);
+       netif_dbg(dev, ifdown, dev->net,
+                 "waited for %d urb completions\n", temp);
        set_current_state(TASK_RUNNING);
        remove_wait_queue(&dev->wait, &wait);
 }
index ce988fd01b3486c7f40d06289f1c1d001d7c2bcf..cf8b7f0473b3985af3c6afc68ecf856e704fbc16 100644 (file)
@@ -1223,7 +1223,6 @@ drop:
 static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 {
        struct metadata_dst *tun_dst = NULL;
-       struct ip_tunnel_info *info;
        struct vxlan_sock *vs;
        struct vxlanhdr *vxh;
        u32 flags, vni;
@@ -1270,8 +1269,7 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
                if (!tun_dst)
                        goto drop;
 
-               info = &tun_dst->u.tun_info;
-               md = ip_tunnel_info_opts(info);
+               md = ip_tunnel_info_opts(&tun_dst->u.tun_info);
        } else {
                memset(md, 0, sizeof(*md));
        }
@@ -1286,7 +1284,7 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
                md->gbp = ntohs(gbp->policy_id);
 
                if (tun_dst)
-                       info->key.tun_flags |= TUNNEL_VXLAN_OPT;
+                       tun_dst->u.tun_info.key.tun_flags |= TUNNEL_VXLAN_OPT;
 
                if (gbp->dont_learn)
                        md->gbp |= VXLAN_GBP_DONT_LEARN;
index 758c4ba1e97c91389ad6e5d1e6cda37f0dae26fa..8fef8d83436ddef9682df26e8ed2af6f2ae13377 100644 (file)
@@ -1358,6 +1358,8 @@ sbni_ioctl( struct net_device  *dev,  struct ifreq  *ifr,  int  cmd )
                if( !slave_dev  ||  !(slave_dev->flags & IFF_UP) ) {
                        netdev_err(dev, "trying to enslave non-active device %s\n",
                                   slave_name);
+                       if (slave_dev)
+                               dev_put(slave_dev);
                        return  -EPERM;
                }
 
index 613ca2b2527be25a0c4a51329acdab30bb0e3ab3..d1a1e160ef31132f35d56ace1e7dd12a13b267e6 100644 (file)
@@ -156,6 +156,12 @@ static const struct file_operations fops_vring = {
        .llseek         = seq_lseek,
 };
 
+static void wil_seq_hexdump(struct seq_file *s, void *p, int len,
+                           const char *prefix)
+{
+       seq_hex_dump(s, prefix, DUMP_PREFIX_NONE, 16, 1, p, len, false);
+}
+
 static void wil_print_ring(struct seq_file *s, const char *prefix,
                           void __iomem *off)
 {
@@ -212,8 +218,6 @@ static void wil_print_ring(struct seq_file *s, const char *prefix,
                                   le16_to_cpu(hdr.seq), len,
                                   le16_to_cpu(hdr.type), hdr.flags);
                        if (len <= MAX_MBOXITEM_SIZE) {
-                               int n = 0;
-                               char printbuf[16 * 3 + 2];
                                unsigned char databuf[MAX_MBOXITEM_SIZE];
                                void __iomem *src = wmi_buffer(wil, d.addr) +
                                        sizeof(struct wil6210_mbox_hdr);
@@ -223,16 +227,7 @@ static void wil_print_ring(struct seq_file *s, const char *prefix,
                                 * reading header
                                 */
                                wil_memcpy_fromio_32(databuf, src, len);
-                               while (n < len) {
-                                       int l = min(len - n, 16);
-
-                                       hex_dump_to_buffer(databuf + n, l,
-                                                          16, 1, printbuf,
-                                                          sizeof(printbuf),
-                                                          false);
-                                       seq_printf(s, "      : %s\n", printbuf);
-                                       n += l;
-                               }
+                               wil_seq_hexdump(s, databuf, len, "      : ");
                        }
                } else {
                        seq_puts(s, "\n");
@@ -867,22 +862,6 @@ static const struct file_operations fops_wmi = {
        .open  = simple_open,
 };
 
-static void wil_seq_hexdump(struct seq_file *s, void *p, int len,
-                           const char *prefix)
-{
-       char printbuf[16 * 3 + 2];
-       int i = 0;
-
-       while (i < len) {
-               int l = min(len - i, 16);
-
-               hex_dump_to_buffer(p + i, l, 16, 1, printbuf,
-                                  sizeof(printbuf), false);
-               seq_printf(s, "%s%s\n", prefix, printbuf);
-               i += l;
-       }
-}
-
 static void wil_seq_print_skb(struct seq_file *s, struct sk_buff *skb)
 {
        int i = 0;
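Both hand-rolled hex_dump_to_buffer() loops collapse into the seq_file helper; rowsize 16 and groupsize 1 reproduce the old formatting. Its declaration, as found in <linux/seq_file.h>, for reference:

	void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type,
			  int rowsize, int groupsize, const void *buf, size_t len,
			  bool ascii);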
index 6dc76c1e807b4d0ad45ae0bbbd1ba1310ed33239..a7bf747271162c450c1da60e12e417bdfe6b366a 100644 (file)
@@ -200,11 +200,6 @@ struct xenvif_queue { /* Per-queue data for xenvif */
        struct xenvif_stats stats;
 };
 
-/* Maximum number of Rx slots a to-guest packet may use, including the
- * slot needed for GSO meta-data.
- */
-#define XEN_NETBK_RX_SLOTS_MAX (MAX_SKB_FRAGS + 1)
-
 enum state_bit_shift {
        /* This bit marks that the vif is connected */
        VIF_STATUS_CONNECTED,
@@ -317,11 +312,6 @@ int xenvif_dealloc_kthread(void *data);
 
 void xenvif_rx_queue_tail(struct xenvif_queue *queue, struct sk_buff *skb);
 
-/* Determine whether the needed number of slots (req) are available,
- * and set req_event if not.
- */
-bool xenvif_rx_ring_slots_available(struct xenvif_queue *queue, int needed);
-
 void xenvif_carrier_on(struct xenvif *vif);
 
 /* Callback from stack when TX packet can be released */
index 42569b994ea84ae03a9ff0d9b88109d3029c4f30..ec98d43916a818152263c8128b4854e9d340c3d2 100644 (file)
@@ -149,9 +149,20 @@ static inline pending_ring_idx_t pending_index(unsigned i)
        return i & (MAX_PENDING_REQS-1);
 }
 
-bool xenvif_rx_ring_slots_available(struct xenvif_queue *queue, int needed)
+static int xenvif_rx_ring_slots_needed(struct xenvif *vif)
+{
+       if (vif->gso_mask)
+               return DIV_ROUND_UP(vif->dev->gso_max_size, PAGE_SIZE) + 1;
+       else
+               return DIV_ROUND_UP(vif->dev->mtu, PAGE_SIZE);
+}
+
+static bool xenvif_rx_ring_slots_available(struct xenvif_queue *queue)
 {
        RING_IDX prod, cons;
+       int needed;
+
+       needed = xenvif_rx_ring_slots_needed(queue->vif);
 
        do {
                prod = queue->rx.sring->req_prod;
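Instead of always demanding the worst case (MAX_SKB_FRAGS + 1 slots), the needed slot count is now derived from the limits actually in force. Worked numbers, assuming a 4 KiB PAGE_SIZE and the default 64 KiB gso_max_size (illustrative values, not from this hunk):

	/* GSO enabled:  DIV_ROUND_UP(65536, 4096) + 1 = 17 slots
	 *               (the +1 covers the extra GSO metadata slot).
	 * GSO disabled, mtu = 1500:  DIV_ROUND_UP(1500, 4096) = 1 slot.
	 */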
@@ -314,7 +325,7 @@ static void xenvif_gop_frag_copy(struct xenvif_queue *queue, struct sk_buff *skb
                } else {
                        copy_gop->source.domid = DOMID_SELF;
                        copy_gop->source.u.gmfn =
-                               virt_to_mfn(page_address(page));
+                               virt_to_gfn(page_address(page));
                }
                copy_gop->source.offset = offset;
 
@@ -513,7 +524,7 @@ static void xenvif_rx_action(struct xenvif_queue *queue)
 
        skb_queue_head_init(&rxq);
 
-       while (xenvif_rx_ring_slots_available(queue, XEN_NETBK_RX_SLOTS_MAX)
+       while (xenvif_rx_ring_slots_available(queue)
               && (skb = xenvif_rx_dequeue(queue)) != NULL) {
                queue->last_rx_time = jiffies;
 
@@ -1395,7 +1406,7 @@ static void xenvif_tx_build_gops(struct xenvif_queue *queue,
                queue->tx_copy_ops[*copy_ops].source.offset = txreq.offset;
 
                queue->tx_copy_ops[*copy_ops].dest.u.gmfn =
-                       virt_to_mfn(skb->data);
+                       virt_to_gfn(skb->data);
                queue->tx_copy_ops[*copy_ops].dest.domid = DOMID_SELF;
                queue->tx_copy_ops[*copy_ops].dest.offset =
                        offset_in_page(skb->data);
@@ -1938,8 +1949,7 @@ static bool xenvif_rx_queue_stalled(struct xenvif_queue *queue)
        prod = queue->rx.sring->req_prod;
        cons = queue->rx.req_cons;
 
-       return !queue->stalled
-               && prod - cons < XEN_NETBK_RX_SLOTS_MAX
+       return !queue->stalled && prod - cons < 1
                && time_after(jiffies,
                              queue->last_rx_time + queue->vif->stall_timeout);
 }
@@ -1951,14 +1961,13 @@ static bool xenvif_rx_queue_ready(struct xenvif_queue *queue)
        prod = queue->rx.sring->req_prod;
        cons = queue->rx.req_cons;
 
-       return queue->stalled
-               && prod - cons >= XEN_NETBK_RX_SLOTS_MAX;
+       return queue->stalled && prod - cons >= 1;
 }
 
 static bool xenvif_have_rx_work(struct xenvif_queue *queue)
 {
        return (!skb_queue_empty(&queue->rx_queue)
-               && xenvif_rx_ring_slots_available(queue, XEN_NETBK_RX_SLOTS_MAX))
+               && xenvif_rx_ring_slots_available(queue))
                || (queue->vif->stall_timeout &&
                    (xenvif_rx_queue_stalled(queue)
                     || xenvif_rx_queue_ready(queue)))
@@ -2105,8 +2114,11 @@ static int __init netback_init(void)
        if (!xen_domain())
                return -ENODEV;
 
-       /* Allow as many queues as there are CPUs, by default */
-       xenvif_max_queues = num_online_cpus();
+       /* Allow as many queues as there are CPUs if user has not
+        * specified a value.
+        */
+       if (xenvif_max_queues == 0)
+               xenvif_max_queues = num_online_cpus();
 
        if (fatal_skb_slots < XEN_NETBK_LEGACY_SLOTS_MAX) {
                pr_info("fatal_skb_slots too small (%d), bump it to XEN_NETBK_LEGACY_SLOTS_MAX (%d)\n",
index e27e6d2ea6d283cc3508579b38e8fb27b8ade035..f821a97d78278feed765d08d886a4665d8795bd5 100644 (file)
@@ -291,7 +291,7 @@ static void xennet_alloc_rx_buffers(struct netfront_queue *queue)
                struct sk_buff *skb;
                unsigned short id;
                grant_ref_t ref;
-               unsigned long pfn;
+               unsigned long gfn;
                struct xen_netif_rx_request *req;
 
                skb = xennet_alloc_one_rx_buffer(queue);
@@ -307,12 +307,12 @@ static void xennet_alloc_rx_buffers(struct netfront_queue *queue)
                BUG_ON((signed short)ref < 0);
                queue->grant_rx_ref[id] = ref;
 
-               pfn = page_to_pfn(skb_frag_page(&skb_shinfo(skb)->frags[0]));
+               gfn = xen_page_to_gfn(skb_frag_page(&skb_shinfo(skb)->frags[0]));
 
                req = RING_GET_REQUEST(&queue->rx, req_prod);
                gnttab_grant_foreign_access_ref(ref,
                                                queue->info->xbdev->otherend_id,
-                                               pfn_to_mfn(pfn),
+                                               gfn,
                                                0);
 
                req->id = id;
@@ -430,8 +430,10 @@ static struct xen_netif_tx_request *xennet_make_one_txreq(
        ref = gnttab_claim_grant_reference(&queue->gref_tx_head);
        BUG_ON((signed short)ref < 0);
 
-       gnttab_grant_foreign_access_ref(ref, queue->info->xbdev->otherend_id,
-                                       page_to_mfn(page), GNTMAP_readonly);
+       gnttab_grant_foreign_access_ref(ref,
+                                       queue->info->xbdev->otherend_id,
+                                       xen_page_to_gfn(page),
+                                       GNTMAP_readonly);
 
        queue->tx_skbs[id].skb = skb;
        queue->grant_tx_page[id] = page;
@@ -2132,8 +2134,11 @@ static int __init netif_init(void)
 
        pr_info("Initialising Xen virtual ethernet driver\n");
 
-       /* Allow as many queues as there are CPUs, by default */
-       xennet_max_queues = num_online_cpus();
+       /* Allow as many queues as there are CPUs if user has not
+        * specified a value.
+        */
+       if (xennet_max_queues == 0)
+               xennet_max_queues = num_online_cpus();
 
        return xenbus_register_frontend(&netfront_driver);
 }
index 87751cfd6f4faa4f1ac69b5255bdeb3aae69cda1..865a3e3cc581670bbd78a49166067ae955545de9 100644 (file)
@@ -190,14 +190,17 @@ static inline int pdev_is_xeon(struct pci_dev *pdev)
        case PCI_DEVICE_ID_INTEL_NTB_SS_SNB:
        case PCI_DEVICE_ID_INTEL_NTB_SS_IVT:
        case PCI_DEVICE_ID_INTEL_NTB_SS_HSX:
+       case PCI_DEVICE_ID_INTEL_NTB_SS_BDX:
        case PCI_DEVICE_ID_INTEL_NTB_PS_JSF:
        case PCI_DEVICE_ID_INTEL_NTB_PS_SNB:
        case PCI_DEVICE_ID_INTEL_NTB_PS_IVT:
        case PCI_DEVICE_ID_INTEL_NTB_PS_HSX:
+       case PCI_DEVICE_ID_INTEL_NTB_PS_BDX:
        case PCI_DEVICE_ID_INTEL_NTB_B2B_JSF:
        case PCI_DEVICE_ID_INTEL_NTB_B2B_SNB:
        case PCI_DEVICE_ID_INTEL_NTB_B2B_IVT:
        case PCI_DEVICE_ID_INTEL_NTB_B2B_HSX:
+       case PCI_DEVICE_ID_INTEL_NTB_B2B_BDX:
                return 1;
        }
        return 0;
@@ -237,7 +240,7 @@ static inline int ndev_ignore_unsafe(struct intel_ntb_dev *ndev,
 
 static int ndev_mw_to_bar(struct intel_ntb_dev *ndev, int idx)
 {
-       if (idx < 0 || idx > ndev->mw_count)
+       if (idx < 0 || idx >= ndev->mw_count)
                return -EINVAL;
        return ndev->reg->mw_bar[idx];
 }
@@ -572,10 +575,13 @@ static ssize_t ndev_debugfs_read(struct file *filp, char __user *ubuf,
                         "Connection Topology -\t%s\n",
                         ntb_topo_string(ndev->ntb.topo));
 
-       off += scnprintf(buf + off, buf_size - off,
-                        "B2B Offset -\t\t%#lx\n", ndev->b2b_off);
-       off += scnprintf(buf + off, buf_size - off,
-                        "B2B MW Idx -\t\t%d\n", ndev->b2b_idx);
+       if (ndev->b2b_idx != UINT_MAX) {
+               off += scnprintf(buf + off, buf_size - off,
+                                "B2B MW Idx -\t\t%u\n", ndev->b2b_idx);
+               off += scnprintf(buf + off, buf_size - off,
+                                "B2B Offset -\t\t%#lx\n", ndev->b2b_off);
+       }
+
        off += scnprintf(buf + off, buf_size - off,
                         "BAR4 Split -\t\t%s\n",
                         ndev->bar4_split ? "yes" : "no");
@@ -1484,7 +1490,7 @@ static int xeon_setup_b2b_mw(struct intel_ntb_dev *ndev,
        pdev = ndev_pdev(ndev);
        mmio = ndev->self_mmio;
 
-       if (ndev->b2b_idx >= ndev->mw_count) {
+       if (ndev->b2b_idx == UINT_MAX) {
                dev_dbg(ndev_dev(ndev), "not using b2b mw\n");
                b2b_bar = 0;
                ndev->b2b_off = 0;
@@ -1776,6 +1782,13 @@ static int xeon_init_ntb(struct intel_ntb_dev *ndev)
                        else
                                ndev->b2b_idx = b2b_mw_idx;
 
+                       if (ndev->b2b_idx >= ndev->mw_count) {
+                               dev_dbg(ndev_dev(ndev),
+                                       "b2b_mw_idx %d invalid for mw_count %u\n",
+                                       b2b_mw_idx, ndev->mw_count);
+                               return -EINVAL;
+                       }
+
                        dev_dbg(ndev_dev(ndev),
                                "setting up b2b mw idx %d means %d\n",
                                b2b_mw_idx, ndev->b2b_idx);
@@ -1843,6 +1856,9 @@ static int xeon_init_dev(struct intel_ntb_dev *ndev)
        case PCI_DEVICE_ID_INTEL_NTB_SS_HSX:
        case PCI_DEVICE_ID_INTEL_NTB_PS_HSX:
        case PCI_DEVICE_ID_INTEL_NTB_B2B_HSX:
+       case PCI_DEVICE_ID_INTEL_NTB_SS_BDX:
+       case PCI_DEVICE_ID_INTEL_NTB_PS_BDX:
+       case PCI_DEVICE_ID_INTEL_NTB_B2B_BDX:
                ndev->hwerr_flags |= NTB_HWERR_SDOORBELL_LOCKUP;
                break;
        }
@@ -1857,6 +1873,9 @@ static int xeon_init_dev(struct intel_ntb_dev *ndev)
        case PCI_DEVICE_ID_INTEL_NTB_SS_HSX:
        case PCI_DEVICE_ID_INTEL_NTB_PS_HSX:
        case PCI_DEVICE_ID_INTEL_NTB_B2B_HSX:
+       case PCI_DEVICE_ID_INTEL_NTB_SS_BDX:
+       case PCI_DEVICE_ID_INTEL_NTB_PS_BDX:
+       case PCI_DEVICE_ID_INTEL_NTB_B2B_BDX:
                ndev->hwerr_flags |= NTB_HWERR_SB01BASE_LOCKUP;
                break;
        }
@@ -1878,6 +1897,9 @@ static int xeon_init_dev(struct intel_ntb_dev *ndev)
        case PCI_DEVICE_ID_INTEL_NTB_SS_HSX:
        case PCI_DEVICE_ID_INTEL_NTB_PS_HSX:
        case PCI_DEVICE_ID_INTEL_NTB_B2B_HSX:
+       case PCI_DEVICE_ID_INTEL_NTB_SS_BDX:
+       case PCI_DEVICE_ID_INTEL_NTB_PS_BDX:
+       case PCI_DEVICE_ID_INTEL_NTB_B2B_BDX:
                ndev->hwerr_flags |= NTB_HWERR_B2BDOORBELL_BIT14;
                break;
        }
@@ -1996,7 +2018,7 @@ static inline void ndev_init_struct(struct intel_ntb_dev *ndev,
        ndev->ntb.ops = &intel_ntb_ops;
 
        ndev->b2b_off = 0;
-       ndev->b2b_idx = INT_MAX;
+       ndev->b2b_idx = UINT_MAX;
 
        ndev->bar4_split = 0;
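b2b_idx is unsigned, so the old INT_MAX sentinel looked like just another (huge but formally valid) index and "not in use" had to be inferred from range checks; with UINT_MAX every consumer compares against one impossible value, and the range check against mw_count moves to the single place where the index is assigned. Sketch of the convention (use_b2b_window() is a placeholder for the setup and reporting paths):

	ndev->b2b_idx = UINT_MAX;		/* sentinel: no b2b memory window in use */

	if (ndev->b2b_idx != UINT_MAX)
		use_b2b_window(ndev);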
 
@@ -2234,14 +2256,17 @@ static const struct pci_device_id intel_ntb_pci_tbl[] = {
        {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_SNB)},
        {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_IVT)},
        {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_HSX)},
+       {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_BDX)},
        {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_PS_JSF)},
        {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_PS_SNB)},
        {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_PS_IVT)},
        {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_PS_HSX)},
+       {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_PS_BDX)},
        {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_JSF)},
        {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_SNB)},
        {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_IVT)},
        {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_HSX)},
+       {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_BDX)},
        {0}
 };
 MODULE_DEVICE_TABLE(pci, intel_ntb_pci_tbl);
index 7ddaf387b679c3c2eb5521fbcc0be30cbf93aa1e..ea0612f797df61e9087b101df592d61c9d33e1ef 100644 (file)
@@ -67,6 +67,9 @@
 #define PCI_DEVICE_ID_INTEL_NTB_PS_HSX 0x2F0E
 #define PCI_DEVICE_ID_INTEL_NTB_SS_HSX 0x2F0F
 #define PCI_DEVICE_ID_INTEL_NTB_B2B_BWD        0x0C4E
+#define PCI_DEVICE_ID_INTEL_NTB_B2B_BDX        0x6F0D
+#define PCI_DEVICE_ID_INTEL_NTB_PS_BDX 0x6F0E
+#define PCI_DEVICE_ID_INTEL_NTB_SS_BDX 0x6F0F
 
 /* Intel Xeon hardware */
 
index 1c6386d5f79c742737e4ee1a8a2b99df686ffaa0..6e3ee907d18613be26c2f23d759637de60282a17 100644 (file)
@@ -119,7 +119,8 @@ struct ntb_transport_qp {
        struct ntb_transport_ctx *transport;
        struct ntb_dev *ndev;
        void *cb_data;
-       struct dma_chan *dma_chan;
+       struct dma_chan *tx_dma_chan;
+       struct dma_chan *rx_dma_chan;
 
        bool client_ready;
        bool link_is_up;
@@ -297,7 +298,7 @@ static LIST_HEAD(ntb_transport_list);
 
 static int ntb_bus_init(struct ntb_transport_ctx *nt)
 {
-       list_add(&nt->entry, &ntb_transport_list);
+       list_add_tail(&nt->entry, &ntb_transport_list);
        return 0;
 }
 
@@ -452,7 +453,7 @@ static ssize_t debugfs_read(struct file *filp, char __user *ubuf, size_t count,
 
        out_offset = 0;
        out_offset += snprintf(buf + out_offset, out_count - out_offset,
-                              "NTB QP stats\n");
+                              "\nNTB QP stats:\n\n");
        out_offset += snprintf(buf + out_offset, out_count - out_offset,
                               "rx_bytes - \t%llu\n", qp->rx_bytes);
        out_offset += snprintf(buf + out_offset, out_count - out_offset,
@@ -470,11 +471,11 @@ static ssize_t debugfs_read(struct file *filp, char __user *ubuf, size_t count,
        out_offset += snprintf(buf + out_offset, out_count - out_offset,
                               "rx_err_ver - \t%llu\n", qp->rx_err_ver);
        out_offset += snprintf(buf + out_offset, out_count - out_offset,
-                              "rx_buff - \t%p\n", qp->rx_buff);
+                              "rx_buff - \t0x%p\n", qp->rx_buff);
        out_offset += snprintf(buf + out_offset, out_count - out_offset,
                               "rx_index - \t%u\n", qp->rx_index);
        out_offset += snprintf(buf + out_offset, out_count - out_offset,
-                              "rx_max_entry - \t%u\n", qp->rx_max_entry);
+                              "rx_max_entry - \t%u\n\n", qp->rx_max_entry);
 
        out_offset += snprintf(buf + out_offset, out_count - out_offset,
                               "tx_bytes - \t%llu\n", qp->tx_bytes);
@@ -489,15 +490,32 @@ static ssize_t debugfs_read(struct file *filp, char __user *ubuf, size_t count,
        out_offset += snprintf(buf + out_offset, out_count - out_offset,
                               "tx_err_no_buf - %llu\n", qp->tx_err_no_buf);
        out_offset += snprintf(buf + out_offset, out_count - out_offset,
-                              "tx_mw - \t%p\n", qp->tx_mw);
+                              "tx_mw - \t0x%p\n", qp->tx_mw);
        out_offset += snprintf(buf + out_offset, out_count - out_offset,
-                              "tx_index - \t%u\n", qp->tx_index);
+                              "tx_index (H) - \t%u\n", qp->tx_index);
+       out_offset += snprintf(buf + out_offset, out_count - out_offset,
+                              "RRI (T) - \t%u\n",
+                              qp->remote_rx_info->entry);
        out_offset += snprintf(buf + out_offset, out_count - out_offset,
                               "tx_max_entry - \t%u\n", qp->tx_max_entry);
+       out_offset += snprintf(buf + out_offset, out_count - out_offset,
+                              "free tx - \t%u\n",
+                              ntb_transport_tx_free_entry(qp));
 
        out_offset += snprintf(buf + out_offset, out_count - out_offset,
-                              "\nQP Link %s\n",
+                              "\n");
+       out_offset += snprintf(buf + out_offset, out_count - out_offset,
+                              "Using TX DMA - \t%s\n",
+                              qp->tx_dma_chan ? "Yes" : "No");
+       out_offset += snprintf(buf + out_offset, out_count - out_offset,
+                              "Using RX DMA - \t%s\n",
+                              qp->rx_dma_chan ? "Yes" : "No");
+       out_offset += snprintf(buf + out_offset, out_count - out_offset,
+                              "QP Link - \t%s\n",
                               qp->link_is_up ? "Up" : "Down");
+       out_offset += snprintf(buf + out_offset, out_count - out_offset,
+                              "\n");
+
        if (out_offset > out_count)
                out_offset = out_count;
 
@@ -535,6 +553,7 @@ static struct ntb_queue_entry *ntb_list_rm(spinlock_t *lock,
        }
        entry = list_first_entry(list, struct ntb_queue_entry, entry);
        list_del(&entry->entry);
+
 out:
        spin_unlock_irqrestore(lock, flags);
 
@@ -1206,7 +1225,7 @@ static void ntb_async_rx(struct ntb_queue_entry *entry, void *offset)
 {
        struct dma_async_tx_descriptor *txd;
        struct ntb_transport_qp *qp = entry->qp;
-       struct dma_chan *chan = qp->dma_chan;
+       struct dma_chan *chan = qp->rx_dma_chan;
        struct dma_device *device;
        size_t pay_off, buff_off, len;
        struct dmaengine_unmap_data *unmap;
@@ -1219,18 +1238,18 @@ static void ntb_async_rx(struct ntb_queue_entry *entry, void *offset)
                goto err;
 
        if (len < copy_bytes)
-               goto err_wait;
+               goto err;
 
        device = chan->device;
        pay_off = (size_t)offset & ~PAGE_MASK;
        buff_off = (size_t)buf & ~PAGE_MASK;
 
        if (!is_dma_copy_aligned(device, pay_off, buff_off, len))
-               goto err_wait;
+               goto err;
 
        unmap = dmaengine_get_unmap_data(device->dev, 2, GFP_NOWAIT);
        if (!unmap)
-               goto err_wait;
+               goto err;
 
        unmap->len = len;
        unmap->addr[0] = dma_map_page(device->dev, virt_to_page(offset),
@@ -1273,12 +1292,6 @@ err_set_unmap:
        dmaengine_unmap_put(unmap);
 err_get_unmap:
        dmaengine_unmap_put(unmap);
-err_wait:
-       /* If the callbacks come out of order, the writing of the index to the
-        * last completed will be out of order.  This may result in the
-        * receive stalling forever.
-        */
-       dma_sync_wait(chan, qp->last_cookie);
 err:
        ntb_memcpy_rx(entry, offset);
        qp->rx_memcpy++;
@@ -1373,8 +1386,8 @@ static void ntb_transport_rxc_db(unsigned long data)
                        break;
        }
 
-       if (i && qp->dma_chan)
-               dma_async_issue_pending(qp->dma_chan);
+       if (i && qp->rx_dma_chan)
+               dma_async_issue_pending(qp->rx_dma_chan);
 
        if (i == qp->rx_max_entry) {
                /* there is more work to do */
@@ -1441,7 +1454,7 @@ static void ntb_async_tx(struct ntb_transport_qp *qp,
 {
        struct ntb_payload_header __iomem *hdr;
        struct dma_async_tx_descriptor *txd;
-       struct dma_chan *chan = qp->dma_chan;
+       struct dma_chan *chan = qp->tx_dma_chan;
        struct dma_device *device;
        size_t dest_off, buff_off;
        struct dmaengine_unmap_data *unmap;
@@ -1634,14 +1647,27 @@ ntb_transport_create_queue(void *data, struct device *client_dev,
        dma_cap_set(DMA_MEMCPY, dma_mask);
 
        if (use_dma) {
-               qp->dma_chan = dma_request_channel(dma_mask, ntb_dma_filter_fn,
-                                                  (void *)(unsigned long)node);
-               if (!qp->dma_chan)
-                       dev_info(&pdev->dev, "Unable to allocate DMA channel\n");
+               qp->tx_dma_chan =
+                       dma_request_channel(dma_mask, ntb_dma_filter_fn,
+                                           (void *)(unsigned long)node);
+               if (!qp->tx_dma_chan)
+                       dev_info(&pdev->dev, "Unable to allocate TX DMA channel\n");
+
+               qp->rx_dma_chan =
+                       dma_request_channel(dma_mask, ntb_dma_filter_fn,
+                                           (void *)(unsigned long)node);
+               if (!qp->rx_dma_chan)
+                       dev_info(&pdev->dev, "Unable to allocate RX DMA channel\n");
        } else {
-               qp->dma_chan = NULL;
+               qp->tx_dma_chan = NULL;
+               qp->rx_dma_chan = NULL;
        }
-       dev_dbg(&pdev->dev, "Using %s memcpy\n", qp->dma_chan ? "DMA" : "CPU");
+
+       dev_dbg(&pdev->dev, "Using %s memcpy for TX\n",
+               qp->tx_dma_chan ? "DMA" : "CPU");
+
+       dev_dbg(&pdev->dev, "Using %s memcpy for RX\n",
+               qp->rx_dma_chan ? "DMA" : "CPU");
 
        for (i = 0; i < NTB_QP_DEF_NUM_ENTRIES; i++) {
                entry = kzalloc_node(sizeof(*entry), GFP_ATOMIC, node);
@@ -1676,8 +1702,10 @@ err2:
 err1:
        while ((entry = ntb_list_rm(&qp->ntb_rx_q_lock, &qp->rx_free_q)))
                kfree(entry);
-       if (qp->dma_chan)
-               dma_release_channel(qp->dma_chan);
+       if (qp->tx_dma_chan)
+               dma_release_channel(qp->tx_dma_chan);
+       if (qp->rx_dma_chan)
+               dma_release_channel(qp->rx_dma_chan);
        nt->qp_bitmap_free |= qp_bit;
 err:
        return NULL;
@@ -1701,12 +1729,27 @@ void ntb_transport_free_queue(struct ntb_transport_qp *qp)
 
        pdev = qp->ndev->pdev;
 
-       if (qp->dma_chan) {
-               struct dma_chan *chan = qp->dma_chan;
+       if (qp->tx_dma_chan) {
+               struct dma_chan *chan = qp->tx_dma_chan;
+               /* Putting the dma_chan to NULL will force any new traffic to be
+                * processed by the CPU instead of the DMA engine
+                */
+               qp->tx_dma_chan = NULL;
+
+               /* Try to be nice and wait for any queued DMA engine
+                * transactions to process before smashing it with a rock
+                */
+               dma_sync_wait(chan, qp->last_cookie);
+               dmaengine_terminate_all(chan);
+               dma_release_channel(chan);
+       }
+
+       if (qp->rx_dma_chan) {
+               struct dma_chan *chan = qp->rx_dma_chan;
                /* Putting the dma_chan to NULL will force any new traffic to be
                 * processed by the CPU instead of the DMA engine
                 */
-               qp->dma_chan = NULL;
+               qp->rx_dma_chan = NULL;
 
                /* Try to be nice and wait for any queued DMA engine
                 * transactions to process before smashing it with a rock
@@ -1843,7 +1886,7 @@ int ntb_transport_tx_enqueue(struct ntb_transport_qp *qp, void *cb, void *data,
        entry = ntb_list_rm(&qp->ntb_tx_free_q_lock, &qp->tx_free_q);
        if (!entry) {
                qp->tx_err_no_buf++;
-               return -ENOMEM;
+               return -EBUSY;
        }
 
        entry->cb_data = cb;
@@ -1954,21 +1997,34 @@ EXPORT_SYMBOL_GPL(ntb_transport_qp_num);
 unsigned int ntb_transport_max_size(struct ntb_transport_qp *qp)
 {
        unsigned int max;
+       unsigned int copy_align;
 
        if (!qp)
                return 0;
 
-       if (!qp->dma_chan)
+       if (!qp->tx_dma_chan && !qp->rx_dma_chan)
                return qp->tx_max_frame - sizeof(struct ntb_payload_header);
 
+       copy_align = max(qp->tx_dma_chan->device->copy_align,
+                        qp->rx_dma_chan->device->copy_align);
+
        /* If DMA engine usage is possible, try to find the max size for that */
        max = qp->tx_max_frame - sizeof(struct ntb_payload_header);
-       max -= max % (1 << qp->dma_chan->device->copy_align);
+       max -= max % (1 << copy_align);
 
        return max;
 }
 EXPORT_SYMBOL_GPL(ntb_transport_max_size);
 
+unsigned int ntb_transport_tx_free_entry(struct ntb_transport_qp *qp)
+{
+       unsigned int head = qp->tx_index;
+       unsigned int tail = qp->remote_rx_info->entry;
+
+       return tail > head ? tail - head : qp->tx_max_entry + tail - head;
+}
+EXPORT_SYMBOL_GPL(ntb_transport_tx_free_entry);
+
 static void ntb_transport_doorbell_callback(void *data, int vector)
 {
        struct ntb_transport_ctx *nt = data;
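For reference, the new ntb_transport_tx_free_entry() helper above treats qp->tx_index as the producer (head) and the remote rx index as the consumer (tail) of a ring with tx_max_entry slots. A standalone sketch of the same arithmetic with hypothetical values (not part of the patch):

        #include <stdio.h>

        /* same ring arithmetic as ntb_transport_tx_free_entry() */
        static unsigned int free_entries(unsigned int head, unsigned int tail,
                                         unsigned int max_entry)
        {
                return tail > head ? tail - head : max_entry + tail - head;
        }

        int main(void)
        {
                /* hypothetical ring of 8 entries: producer at 5, consumer at 2 */
                printf("%u\n", free_entries(5, 2, 8)); /* 8 + 2 - 5 = 5 free slots */
                return 0;
        }

Note that head == tail is reported as tx_max_entry free entries, i.e. the ring is treated as empty rather than full in that case.
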
index 72226acb5c0fb8af7b72b88fd7b8f46329095626..53c11621d5b16fd6eb33764469b4f1d4659fe8a5 100644 (file)
@@ -21,6 +21,7 @@ config BLK_DEV_PMEM
        default LIBNVDIMM
        depends on HAS_IOMEM
        select ND_BTT if BTT
+       select ND_PFN if NVDIMM_PFN
        help
          Memory ranges for PMEM are described by either an NFIT
          (NVDIMM Firmware Interface Table, see CONFIG_NFIT_ACPI), a
@@ -47,12 +48,16 @@ config ND_BLK
          (CONFIG_ACPI_NFIT), or otherwise exposes BLK-mode
          capabilities.
 
+config ND_CLAIM
+       bool
+
 config ND_BTT
        tristate
 
 config BTT
        bool "BTT: Block Translation Table (atomic sector updates)"
        default y if LIBNVDIMM
+       select ND_CLAIM
        help
          The Block Translation Table (BTT) provides atomic sector
          update semantics for persistent memory devices, so that
@@ -65,4 +70,22 @@ config BTT
 
          Select Y if unsure
 
+config ND_PFN
+       tristate
+
+config NVDIMM_PFN
+       bool "PFN: Map persistent (device) memory"
+       default LIBNVDIMM
+       depends on ZONE_DEVICE
+       select ND_CLAIM
+       help
+         Map persistent memory, i.e. advertise it to the memory
+         management sub-system.  By default persistent memory does
+         not support direct I/O, RDMA, or any other usage that
+         requires a 'struct page' to mediate an I/O request.  This
+         driver allocates and initializes the infrastructure needed
+         to support those use cases.
+
+         Select Y if unsure
+
 endif
index 594bb97c867a7b55c1b3812682b6cfa43cc916d1..ea84d3c4e8e5aa5b9a5925445f6c592f0a7f9a2b 100644 (file)
@@ -2,6 +2,7 @@ obj-$(CONFIG_LIBNVDIMM) += libnvdimm.o
 obj-$(CONFIG_BLK_DEV_PMEM) += nd_pmem.o
 obj-$(CONFIG_ND_BTT) += nd_btt.o
 obj-$(CONFIG_ND_BLK) += nd_blk.o
+obj-$(CONFIG_X86_PMEM_LEGACY) += nd_e820.o
 
 nd_pmem-y := pmem.o
 
@@ -9,6 +10,8 @@ nd_btt-y := btt.o
 
 nd_blk-y := blk.o
 
+nd_e820-y := e820.o
+
 libnvdimm-y := core.o
 libnvdimm-y += bus.o
 libnvdimm-y += dimm_devs.o
@@ -17,4 +20,6 @@ libnvdimm-y += region_devs.o
 libnvdimm-y += region.o
 libnvdimm-y += namespace_devs.o
 libnvdimm-y += label.o
+libnvdimm-$(CONFIG_ND_CLAIM) += claim.o
 libnvdimm-$(CONFIG_BTT) += btt_devs.o
+libnvdimm-$(CONFIG_NVDIMM_PFN) += pfn_devs.o
index 341202ed32b404c866f5c8cdb7dbd99b710f68ae..254239746020b5f0334b27fa7550a82ee3d9b97b 100644 (file)
@@ -582,33 +582,6 @@ static void free_arenas(struct btt *btt)
        }
 }
 
-/*
- * This function checks if the metadata layout is valid and error free
- */
-static int arena_is_valid(struct arena_info *arena, struct btt_sb *super,
-                               u8 *uuid, u32 lbasize)
-{
-       u64 checksum;
-
-       if (memcmp(super->uuid, uuid, 16))
-               return 0;
-
-       checksum = le64_to_cpu(super->checksum);
-       super->checksum = 0;
-       if (checksum != nd_btt_sb_checksum(super))
-               return 0;
-       super->checksum = cpu_to_le64(checksum);
-
-       if (lbasize != le32_to_cpu(super->external_lbasize))
-               return 0;
-
-       /* TODO: figure out action for this */
-       if ((le32_to_cpu(super->flags) & IB_FLAG_ERROR_MASK) != 0)
-               dev_info(to_dev(arena), "Found arena with an error flag\n");
-
-       return 1;
-}
-
 /*
  * This function reads an existing valid btt superblock and
  * populates the corresponding arena_info struct
@@ -632,8 +605,9 @@ static void parse_arena_meta(struct arena_info *arena, struct btt_sb *super,
        arena->logoff = arena_off + le64_to_cpu(super->logoff);
        arena->info2off = arena_off + le64_to_cpu(super->info2off);
 
-       arena->size = (super->nextoff > 0) ? (le64_to_cpu(super->nextoff)) :
-                       (arena->info2off - arena->infooff + BTT_PG_SIZE);
+       arena->size = (le64_to_cpu(super->nextoff) > 0)
+               ? (le64_to_cpu(super->nextoff))
+               : (arena->info2off - arena->infooff + BTT_PG_SIZE);
 
        arena->flags = le32_to_cpu(super->flags);
 }
@@ -665,8 +639,7 @@ static int discover_arenas(struct btt *btt)
                if (ret)
                        goto out;
 
-               if (!arena_is_valid(arena, super, btt->nd_btt->uuid,
-                               btt->lbasize)) {
+               if (!nd_btt_arena_is_valid(btt->nd_btt, super)) {
                        if (remaining == btt->rawsize) {
                                btt->init_state = INIT_NOTFOUND;
                                dev_info(to_dev(arena), "No existing arenas\n");
@@ -755,10 +728,13 @@ static int create_arenas(struct btt *btt)
  * It is only called for an uninitialized arena when a write
  * to that arena occurs for the first time.
  */
-static int btt_arena_write_layout(struct arena_info *arena, u8 *uuid)
+static int btt_arena_write_layout(struct arena_info *arena)
 {
        int ret;
+       u64 sum;
        struct btt_sb *super;
+       struct nd_btt *nd_btt = arena->nd_btt;
+       const u8 *parent_uuid = nd_dev_to_uuid(&nd_btt->ndns->dev);
 
        ret = btt_map_init(arena);
        if (ret)
@@ -773,7 +749,8 @@ static int btt_arena_write_layout(struct arena_info *arena, u8 *uuid)
                return -ENOMEM;
 
        strncpy(super->signature, BTT_SIG, BTT_SIG_LEN);
-       memcpy(super->uuid, uuid, 16);
+       memcpy(super->uuid, nd_btt->uuid, 16);
+       memcpy(super->parent_uuid, parent_uuid, 16);
        super->flags = cpu_to_le32(arena->flags);
        super->version_major = cpu_to_le16(arena->version_major);
        super->version_minor = cpu_to_le16(arena->version_minor);
@@ -794,7 +771,8 @@ static int btt_arena_write_layout(struct arena_info *arena, u8 *uuid)
        super->info2off = cpu_to_le64(arena->info2off - arena->infooff);
 
        super->flags = 0;
-       super->checksum = cpu_to_le64(nd_btt_sb_checksum(super));
+       sum = nd_sb_checksum((struct nd_gen_sb *) super);
+       super->checksum = cpu_to_le64(sum);
 
        ret = btt_info_write(arena, super);
 
@@ -813,7 +791,7 @@ static int btt_meta_init(struct btt *btt)
 
        mutex_lock(&btt->init_lock);
        list_for_each_entry(arena, &btt->arena_list, list) {
-               ret = btt_arena_write_layout(arena, btt->nd_btt->uuid);
+               ret = btt_arena_write_layout(arena);
                if (ret)
                        goto unlock;
 
@@ -1447,8 +1425,6 @@ static int __init nd_btt_init(void)
 {
        int rc;
 
-       BUILD_BUG_ON(sizeof(struct btt_sb) != SZ_4K);
-
        btt_major = register_blkdev(0, "btt");
        if (btt_major < 0)
                return btt_major;
index 75b0d80a6bd9de78431e03d13e90aa7e4e773c80..b2f8651e5395f07f8e9fe117262fe8f131eea715 100644 (file)
@@ -182,4 +182,7 @@ struct btt {
        int init_state;
        int num_arenas;
 };
+
+bool nd_btt_arena_is_valid(struct nd_btt *nd_btt, struct btt_sb *super);
+
 #endif
index 6ac8c0fea3ec2fd580aa5095f59176448b4f1a22..59ad54a63d9fa98fcd584a68c7dbb8a4aa863d18 100644 (file)
 #include "btt.h"
 #include "nd.h"
 
-static void __nd_btt_detach_ndns(struct nd_btt *nd_btt)
-{
-       struct nd_namespace_common *ndns = nd_btt->ndns;
-
-       dev_WARN_ONCE(&nd_btt->dev, !mutex_is_locked(&ndns->dev.mutex)
-                       || ndns->claim != &nd_btt->dev,
-                       "%s: invalid claim\n", __func__);
-       ndns->claim = NULL;
-       nd_btt->ndns = NULL;
-       put_device(&ndns->dev);
-}
-
-static void nd_btt_detach_ndns(struct nd_btt *nd_btt)
-{
-       struct nd_namespace_common *ndns = nd_btt->ndns;
-
-       if (!ndns)
-               return;
-       get_device(&ndns->dev);
-       device_lock(&ndns->dev);
-       __nd_btt_detach_ndns(nd_btt);
-       device_unlock(&ndns->dev);
-       put_device(&ndns->dev);
-}
-
-static bool __nd_btt_attach_ndns(struct nd_btt *nd_btt,
-               struct nd_namespace_common *ndns)
-{
-       if (ndns->claim)
-               return false;
-       dev_WARN_ONCE(&nd_btt->dev, !mutex_is_locked(&ndns->dev.mutex)
-                       || nd_btt->ndns,
-                       "%s: invalid claim\n", __func__);
-       ndns->claim = &nd_btt->dev;
-       nd_btt->ndns = ndns;
-       get_device(&ndns->dev);
-       return true;
-}
-
-static bool nd_btt_attach_ndns(struct nd_btt *nd_btt,
-               struct nd_namespace_common *ndns)
-{
-       bool claimed;
-
-       device_lock(&ndns->dev);
-       claimed = __nd_btt_attach_ndns(nd_btt, ndns);
-       device_unlock(&ndns->dev);
-       return claimed;
-}
-
 static void nd_btt_release(struct device *dev)
 {
        struct nd_region *nd_region = to_nd_region(dev->parent);
        struct nd_btt *nd_btt = to_nd_btt(dev);
 
        dev_dbg(dev, "%s\n", __func__);
-       nd_btt_detach_ndns(nd_btt);
+       nd_detach_ndns(&nd_btt->dev, &nd_btt->ndns);
        ida_simple_remove(&nd_region->btt_ida, nd_btt->id);
        kfree(nd_btt->uuid);
        kfree(nd_btt);
@@ -172,104 +122,15 @@ static ssize_t namespace_show(struct device *dev,
        return rc;
 }
 
-static int namespace_match(struct device *dev, void *data)
-{
-       char *name = data;
-
-       return strcmp(name, dev_name(dev)) == 0;
-}
-
-static bool is_nd_btt_idle(struct device *dev)
-{
-       struct nd_region *nd_region = to_nd_region(dev->parent);
-       struct nd_btt *nd_btt = to_nd_btt(dev);
-
-       if (nd_region->btt_seed == dev || nd_btt->ndns || dev->driver)
-               return false;
-       return true;
-}
-
-static ssize_t __namespace_store(struct device *dev,
-               struct device_attribute *attr, const char *buf, size_t len)
-{
-       struct nd_btt *nd_btt = to_nd_btt(dev);
-       struct nd_namespace_common *ndns;
-       struct device *found;
-       char *name;
-
-       if (dev->driver) {
-               dev_dbg(dev, "%s: -EBUSY\n", __func__);
-               return -EBUSY;
-       }
-
-       name = kstrndup(buf, len, GFP_KERNEL);
-       if (!name)
-               return -ENOMEM;
-       strim(name);
-
-       if (strncmp(name, "namespace", 9) == 0 || strcmp(name, "") == 0)
-               /* pass */;
-       else {
-               len = -EINVAL;
-               goto out;
-       }
-
-       ndns = nd_btt->ndns;
-       if (strcmp(name, "") == 0) {
-               /* detach the namespace and destroy / reset the btt device */
-               nd_btt_detach_ndns(nd_btt);
-               if (is_nd_btt_idle(dev))
-                       nd_device_unregister(dev, ND_ASYNC);
-               else {
-                       nd_btt->lbasize = 0;
-                       kfree(nd_btt->uuid);
-                       nd_btt->uuid = NULL;
-               }
-               goto out;
-       } else if (ndns) {
-               dev_dbg(dev, "namespace already set to: %s\n",
-                               dev_name(&ndns->dev));
-               len = -EBUSY;
-               goto out;
-       }
-
-       found = device_find_child(dev->parent, name, namespace_match);
-       if (!found) {
-               dev_dbg(dev, "'%s' not found under %s\n", name,
-                               dev_name(dev->parent));
-               len = -ENODEV;
-               goto out;
-       }
-
-       ndns = to_ndns(found);
-       if (__nvdimm_namespace_capacity(ndns) < SZ_16M) {
-               dev_dbg(dev, "%s too small to host btt\n", name);
-               len = -ENXIO;
-               goto out_attach;
-       }
-
-       WARN_ON_ONCE(!is_nvdimm_bus_locked(&nd_btt->dev));
-       if (!nd_btt_attach_ndns(nd_btt, ndns)) {
-               dev_dbg(dev, "%s already claimed\n",
-                               dev_name(&ndns->dev));
-               len = -EBUSY;
-       }
-
- out_attach:
-       put_device(&ndns->dev); /* from device_find_child */
- out:
-       kfree(name);
-       return len;
-}
-
 static ssize_t namespace_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t len)
 {
+       struct nd_btt *nd_btt = to_nd_btt(dev);
        ssize_t rc;
 
        nvdimm_bus_lock(dev);
        device_lock(dev);
-       rc = __namespace_store(dev, attr, buf, len);
+       rc = nd_namespace_store(dev, &nd_btt->ndns, buf, len);
        dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__,
                        rc, buf, buf[len - 1] == '\n' ? "" : "\n");
        device_unlock(dev);
@@ -324,7 +185,7 @@ static struct device *__nd_btt_create(struct nd_region *nd_region,
        dev->type = &nd_btt_device_type;
        dev->groups = nd_btt_attribute_groups;
        device_initialize(&nd_btt->dev);
-       if (ndns && !__nd_btt_attach_ndns(nd_btt, ndns)) {
+       if (ndns && !__nd_attach_ndns(&nd_btt->dev, ndns, &nd_btt->ndns)) {
                dev_dbg(&ndns->dev, "%s failed, already claimed by %s\n",
                                __func__, dev_name(ndns->claim));
                put_device(dev);
@@ -342,30 +203,54 @@ struct device *nd_btt_create(struct nd_region *nd_region)
        return dev;
 }
 
-/*
- * nd_btt_sb_checksum: compute checksum for btt info block
+static bool uuid_is_null(u8 *uuid)
+{
+       static const u8 null_uuid[16];
+
+       return (memcmp(uuid, null_uuid, 16) == 0);
+}
+
+/**
+ * nd_btt_arena_is_valid - check if the metadata layout is valid
+ * @nd_btt:    device with BTT geometry and backing device info
+ * @super:     pointer to the arena's info block being tested
+ *
+ * Check consistency of the btt info block with itself by validating
+ * the checksum, and with the parent namespace by verifying that its
+ * parent_uuid matches the uuid of the namespace it is attached to.
  *
- * Returns a fletcher64 checksum of everything in the given info block
- * except the last field (since that's where the checksum lives).
+ * Returns:
+ * false for an invalid info block, true for a valid one
  */
-u64 nd_btt_sb_checksum(struct btt_sb *btt_sb)
+bool nd_btt_arena_is_valid(struct nd_btt *nd_btt, struct btt_sb *super)
 {
-       u64 sum;
-       __le64 sum_save;
-
-       sum_save = btt_sb->checksum;
-       btt_sb->checksum = 0;
-       sum = nd_fletcher64(btt_sb, sizeof(*btt_sb), 1);
-       btt_sb->checksum = sum_save;
-       return sum;
+       const u8 *parent_uuid = nd_dev_to_uuid(&nd_btt->ndns->dev);
+       u64 checksum;
+
+       if (memcmp(super->signature, BTT_SIG, BTT_SIG_LEN) != 0)
+               return false;
+
+       if (!uuid_is_null(super->parent_uuid))
+               if (memcmp(super->parent_uuid, parent_uuid, 16) != 0)
+                       return false;
+
+       checksum = le64_to_cpu(super->checksum);
+       super->checksum = 0;
+       if (checksum != nd_sb_checksum((struct nd_gen_sb *) super))
+               return false;
+       super->checksum = cpu_to_le64(checksum);
+
+       /* TODO: figure out action for this */
+       if ((le32_to_cpu(super->flags) & IB_FLAG_ERROR_MASK) != 0)
+               dev_info(&nd_btt->dev, "Found arena with an error flag\n");
+
+       return true;
 }
-EXPORT_SYMBOL(nd_btt_sb_checksum);
+EXPORT_SYMBOL(nd_btt_arena_is_valid);
 
 static int __nd_btt_probe(struct nd_btt *nd_btt,
                struct nd_namespace_common *ndns, struct btt_sb *btt_sb)
 {
-       u64 checksum;
-
        if (!btt_sb || !ndns || !nd_btt)
                return -ENODEV;
 
@@ -375,14 +260,8 @@ static int __nd_btt_probe(struct nd_btt *nd_btt,
        if (nvdimm_namespace_capacity(ndns) < SZ_16M)
                return -ENXIO;
 
-       if (memcmp(btt_sb->signature, BTT_SIG, BTT_SIG_LEN) != 0)
-               return -ENODEV;
-
-       checksum = le64_to_cpu(btt_sb->checksum);
-       btt_sb->checksum = 0;
-       if (checksum != nd_btt_sb_checksum(btt_sb))
+       if (!nd_btt_arena_is_valid(nd_btt, btt_sb))
                return -ENODEV;
-       btt_sb->checksum = cpu_to_le64(checksum);
 
        nd_btt->lbasize = le32_to_cpu(btt_sb->external_lbasize);
        nd_btt->uuid = kmemdup(btt_sb->uuid, 16, GFP_KERNEL);
@@ -416,7 +295,9 @@ int nd_btt_probe(struct nd_namespace_common *ndns, void *drvdata)
        dev_dbg(&ndns->dev, "%s: btt: %s\n", __func__,
                        rc == 0 ? dev_name(dev) : "<none>");
        if (rc < 0) {
-               __nd_btt_detach_ndns(to_nd_btt(dev));
+               struct nd_btt *nd_btt = to_nd_btt(dev);
+
+               __nd_detach_ndns(dev, &nd_btt->ndns);
                put_device(dev);
        }
 
diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c
new file mode 100644 (file)
index 0000000..e8f03b0
--- /dev/null
@@ -0,0 +1,201 @@
+/*
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/device.h>
+#include <linux/sizes.h>
+#include "nd-core.h"
+#include "pfn.h"
+#include "btt.h"
+#include "nd.h"
+
+void __nd_detach_ndns(struct device *dev, struct nd_namespace_common **_ndns)
+{
+       struct nd_namespace_common *ndns = *_ndns;
+
+       dev_WARN_ONCE(dev, !mutex_is_locked(&ndns->dev.mutex)
+                       || ndns->claim != dev,
+                       "%s: invalid claim\n", __func__);
+       ndns->claim = NULL;
+       *_ndns = NULL;
+       put_device(&ndns->dev);
+}
+
+void nd_detach_ndns(struct device *dev,
+               struct nd_namespace_common **_ndns)
+{
+       struct nd_namespace_common *ndns = *_ndns;
+
+       if (!ndns)
+               return;
+       get_device(&ndns->dev);
+       device_lock(&ndns->dev);
+       __nd_detach_ndns(dev, _ndns);
+       device_unlock(&ndns->dev);
+       put_device(&ndns->dev);
+}
+
+bool __nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach,
+               struct nd_namespace_common **_ndns)
+{
+       if (attach->claim)
+               return false;
+       dev_WARN_ONCE(dev, !mutex_is_locked(&attach->dev.mutex)
+                       || *_ndns,
+                       "%s: invalid claim\n", __func__);
+       attach->claim = dev;
+       *_ndns = attach;
+       get_device(&attach->dev);
+       return true;
+}
+
+bool nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach,
+               struct nd_namespace_common **_ndns)
+{
+       bool claimed;
+
+       device_lock(&attach->dev);
+       claimed = __nd_attach_ndns(dev, attach, _ndns);
+       device_unlock(&attach->dev);
+       return claimed;
+}
+
+static int namespace_match(struct device *dev, void *data)
+{
+       char *name = data;
+
+       return strcmp(name, dev_name(dev)) == 0;
+}
+
+static bool is_idle(struct device *dev, struct nd_namespace_common *ndns)
+{
+       struct nd_region *nd_region = to_nd_region(dev->parent);
+       struct device *seed = NULL;
+
+       if (is_nd_btt(dev))
+               seed = nd_region->btt_seed;
+       else if (is_nd_pfn(dev))
+               seed = nd_region->pfn_seed;
+
+       if (seed == dev || ndns || dev->driver)
+               return false;
+       return true;
+}
+
+static void nd_detach_and_reset(struct device *dev,
+               struct nd_namespace_common **_ndns)
+{
+       /* detach the namespace and destroy / reset the device */
+       nd_detach_ndns(dev, _ndns);
+       if (is_idle(dev, *_ndns)) {
+               nd_device_unregister(dev, ND_ASYNC);
+       } else if (is_nd_btt(dev)) {
+               struct nd_btt *nd_btt = to_nd_btt(dev);
+
+               nd_btt->lbasize = 0;
+               kfree(nd_btt->uuid);
+               nd_btt->uuid = NULL;
+       } else if (is_nd_pfn(dev)) {
+               struct nd_pfn *nd_pfn = to_nd_pfn(dev);
+
+               kfree(nd_pfn->uuid);
+               nd_pfn->uuid = NULL;
+               nd_pfn->mode = PFN_MODE_NONE;
+       }
+}
+
+ssize_t nd_namespace_store(struct device *dev,
+               struct nd_namespace_common **_ndns, const char *buf,
+               size_t len)
+{
+       struct nd_namespace_common *ndns;
+       struct device *found;
+       char *name;
+
+       if (dev->driver) {
+               dev_dbg(dev, "%s: -EBUSY\n", __func__);
+               return -EBUSY;
+       }
+
+       name = kstrndup(buf, len, GFP_KERNEL);
+       if (!name)
+               return -ENOMEM;
+       strim(name);
+
+       if (strncmp(name, "namespace", 9) == 0 || strcmp(name, "") == 0)
+               /* pass */;
+       else {
+               len = -EINVAL;
+               goto out;
+       }
+
+       ndns = *_ndns;
+       if (strcmp(name, "") == 0) {
+               nd_detach_and_reset(dev, _ndns);
+               goto out;
+       } else if (ndns) {
+               dev_dbg(dev, "namespace already set to: %s\n",
+                               dev_name(&ndns->dev));
+               len = -EBUSY;
+               goto out;
+       }
+
+       found = device_find_child(dev->parent, name, namespace_match);
+       if (!found) {
+               dev_dbg(dev, "'%s' not found under %s\n", name,
+                               dev_name(dev->parent));
+               len = -ENODEV;
+               goto out;
+       }
+
+       ndns = to_ndns(found);
+       if (__nvdimm_namespace_capacity(ndns) < SZ_16M) {
+               dev_dbg(dev, "%s too small to host\n", name);
+               len = -ENXIO;
+               goto out_attach;
+       }
+
+       WARN_ON_ONCE(!is_nvdimm_bus_locked(dev));
+       if (!nd_attach_ndns(dev, ndns, _ndns)) {
+               dev_dbg(dev, "%s already claimed\n",
+                               dev_name(&ndns->dev));
+               len = -EBUSY;
+       }
+
+ out_attach:
+       put_device(&ndns->dev); /* from device_find_child */
+ out:
+       kfree(name);
+       return len;
+}
+
+/*
+ * nd_sb_checksum: compute checksum for a generic info block
+ *
+ * Returns a fletcher64 checksum of everything in the given info block
+ * except the last field (since that's where the checksum lives).
+ */
+u64 nd_sb_checksum(struct nd_gen_sb *nd_gen_sb)
+{
+       u64 sum;
+       __le64 sum_save;
+
+       BUILD_BUG_ON(sizeof(struct btt_sb) != SZ_4K);
+       BUILD_BUG_ON(sizeof(struct nd_pfn_sb) != SZ_4K);
+       BUILD_BUG_ON(sizeof(struct nd_gen_sb) != SZ_4K);
+
+       sum_save = nd_gen_sb->checksum;
+       nd_gen_sb->checksum = 0;
+       sum = nd_fletcher64(nd_gen_sb, sizeof(*nd_gen_sb), 1);
+       nd_gen_sb->checksum = sum_save;
+       return sum;
+}
+EXPORT_SYMBOL(nd_sb_checksum);
index c05eb807d674dcc0243765a96cb4544c8b8f9a39..651b8d19d324f3814fb4dc2e75bb8670ba07f9f7 100644 (file)
@@ -241,10 +241,7 @@ void nvdimm_drvdata_release(struct kref *kref)
                nvdimm_free_dpa(ndd, res);
        nvdimm_bus_unlock(dev);
 
-       if (ndd->data && is_vmalloc_addr(ndd->data))
-               vfree(ndd->data);
-       else
-               kfree(ndd->data);
+       kvfree(ndd->data);
        kfree(ndd);
        put_device(dev);
 }
diff --git a/drivers/nvdimm/e820.c b/drivers/nvdimm/e820.c
new file mode 100644 (file)
index 0000000..8282db2
--- /dev/null
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2015, Christoph Hellwig.
+ * Copyright (c) 2015, Intel Corporation.
+ */
+#include <linux/platform_device.h>
+#include <linux/libnvdimm.h>
+#include <linux/module.h>
+
+static const struct attribute_group *e820_pmem_attribute_groups[] = {
+       &nvdimm_bus_attribute_group,
+       NULL,
+};
+
+static const struct attribute_group *e820_pmem_region_attribute_groups[] = {
+       &nd_region_attribute_group,
+       &nd_device_attribute_group,
+       NULL,
+};
+
+static int e820_pmem_remove(struct platform_device *pdev)
+{
+       struct nvdimm_bus *nvdimm_bus = platform_get_drvdata(pdev);
+
+       nvdimm_bus_unregister(nvdimm_bus);
+       return 0;
+}
+
+static int e820_pmem_probe(struct platform_device *pdev)
+{
+       static struct nvdimm_bus_descriptor nd_desc;
+       struct device *dev = &pdev->dev;
+       struct nvdimm_bus *nvdimm_bus;
+       struct resource *p;
+
+       nd_desc.attr_groups = e820_pmem_attribute_groups;
+       nd_desc.provider_name = "e820";
+       nvdimm_bus = nvdimm_bus_register(dev, &nd_desc);
+       if (!nvdimm_bus)
+               goto err;
+       platform_set_drvdata(pdev, nvdimm_bus);
+
+       for (p = iomem_resource.child; p ; p = p->sibling) {
+               struct nd_region_desc ndr_desc;
+
+               if (strncmp(p->name, "Persistent Memory (legacy)", 26) != 0)
+                       continue;
+
+               memset(&ndr_desc, 0, sizeof(ndr_desc));
+               ndr_desc.res = p;
+               ndr_desc.attr_groups = e820_pmem_region_attribute_groups;
+               ndr_desc.numa_node = NUMA_NO_NODE;
+               set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags);
+               if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc))
+                       goto err;
+       }
+
+       return 0;
+
+ err:
+       nvdimm_bus_unregister(nvdimm_bus);
+       dev_err(dev, "failed to register legacy persistent memory ranges\n");
+       return -ENXIO;
+}
+
+static struct platform_driver e820_pmem_driver = {
+       .probe = e820_pmem_probe,
+       .remove = e820_pmem_remove,
+       .driver = {
+               .name = "e820_pmem",
+       },
+};
+
+static __init int e820_pmem_init(void)
+{
+       return platform_driver_register(&e820_pmem_driver);
+}
+
+static __exit void e820_pmem_exit(void)
+{
+       platform_driver_unregister(&e820_pmem_driver);
+}
+
+MODULE_ALIAS("platform:e820_pmem*");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Intel Corporation");
+module_init(e820_pmem_init);
+module_exit(e820_pmem_exit);
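
The driver above only binds by name, so a platform device called "e820_pmem" has to be registered elsewhere for e820_pmem_probe() to run; given the CONFIG_X86_PMEM_LEGACY hook in the Makefile change above, that registration is expected to come from x86 arch code that also inserts the "Persistent Memory (legacy)" iomem resources. A minimal sketch of such a registration (function name hypothetical):

        #include <linux/platform_device.h>
        #include <linux/init.h>

        static __init int register_e820_pmem(void)
        {
                struct platform_device *pdev;

                /* the name must match e820_pmem_driver.driver.name above */
                pdev = platform_device_alloc("e820_pmem", -1);
                if (!pdev)
                        return -ENOMEM;
                return platform_device_add(pdev);
        }
        device_initcall(register_e820_pmem);
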
index fef0dd80d4adb18c10a62a89ae0a53a78c9ea091..0955b2cb10fe87ec17cac82ac60eeedac54fb3b5 100644 (file)
@@ -13,6 +13,7 @@
 #include <linux/module.h>
 #include <linux/device.h>
 #include <linux/slab.h>
+#include <linux/pmem.h>
 #include <linux/nd.h>
 #include "nd-core.h"
 #include "nd.h"
@@ -76,22 +77,54 @@ static bool is_namespace_io(struct device *dev)
        return dev ? dev->type == &namespace_io_device_type : false;
 }
 
+bool pmem_should_map_pages(struct device *dev)
+{
+       struct nd_region *nd_region = to_nd_region(dev->parent);
+
+       if (!IS_ENABLED(CONFIG_ZONE_DEVICE))
+               return false;
+
+       if (!test_bit(ND_REGION_PAGEMAP, &nd_region->flags))
+               return false;
+
+       if (is_nd_pfn(dev) || is_nd_btt(dev))
+               return false;
+
+#ifdef ARCH_MEMREMAP_PMEM
+       return ARCH_MEMREMAP_PMEM == MEMREMAP_WB;
+#else
+       return false;
+#endif
+}
+EXPORT_SYMBOL(pmem_should_map_pages);
+
 const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns,
                char *name)
 {
        struct nd_region *nd_region = to_nd_region(ndns->dev.parent);
-       const char *suffix = "";
+       const char *suffix = NULL;
 
-       if (ndns->claim && is_nd_btt(ndns->claim))
-               suffix = "s";
+       if (ndns->claim) {
+               if (is_nd_btt(ndns->claim))
+                       suffix = "s";
+               else if (is_nd_pfn(ndns->claim))
+                       suffix = "m";
+               else
+                       dev_WARN_ONCE(&ndns->dev, 1,
+                                       "unknown claim type by %s\n",
+                                       dev_name(ndns->claim));
+       }
 
-       if (is_namespace_pmem(&ndns->dev) || is_namespace_io(&ndns->dev))
-               sprintf(name, "pmem%d%s", nd_region->id, suffix);
-       else if (is_namespace_blk(&ndns->dev)) {
+       if (is_namespace_pmem(&ndns->dev) || is_namespace_io(&ndns->dev)) {
+               if (!suffix && pmem_should_map_pages(&ndns->dev))
+                       suffix = "m";
+               sprintf(name, "pmem%d%s", nd_region->id, suffix ? suffix : "");
+       } else if (is_namespace_blk(&ndns->dev)) {
                struct nd_namespace_blk *nsblk;
 
                nsblk = to_nd_namespace_blk(&ndns->dev);
-               sprintf(name, "ndblk%d.%d%s", nd_region->id, nsblk->id, suffix);
+               sprintf(name, "ndblk%d.%d%s", nd_region->id, nsblk->id,
+                               suffix ? suffix : "");
        } else {
                return NULL;
        }
@@ -100,6 +133,26 @@ const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns,
 }
 EXPORT_SYMBOL(nvdimm_namespace_disk_name);
 
+const u8 *nd_dev_to_uuid(struct device *dev)
+{
+       static const u8 null_uuid[16];
+
+       if (!dev)
+               return null_uuid;
+
+       if (is_namespace_pmem(dev)) {
+               struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
+
+               return nspm->uuid;
+       } else if (is_namespace_blk(dev)) {
+               struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
+
+               return nsblk->uuid;
+       } else
+               return null_uuid;
+}
+EXPORT_SYMBOL(nd_dev_to_uuid);
+
 static ssize_t nstype_show(struct device *dev,
                struct device_attribute *attr, char *buf)
 {
@@ -1235,12 +1288,22 @@ static const struct attribute_group *nd_namespace_attribute_groups[] = {
 struct nd_namespace_common *nvdimm_namespace_common_probe(struct device *dev)
 {
        struct nd_btt *nd_btt = is_nd_btt(dev) ? to_nd_btt(dev) : NULL;
+       struct nd_pfn *nd_pfn = is_nd_pfn(dev) ? to_nd_pfn(dev) : NULL;
        struct nd_namespace_common *ndns;
        resource_size_t size;
 
-       if (nd_btt) {
-               ndns = nd_btt->ndns;
-               if (!ndns)
+       if (nd_btt || nd_pfn) {
+               struct device *host = NULL;
+
+               if (nd_btt) {
+                       host = &nd_btt->dev;
+                       ndns = nd_btt->ndns;
+               } else if (nd_pfn) {
+                       host = &nd_pfn->dev;
+                       ndns = nd_pfn->ndns;
+               }
+
+               if (!ndns || !host)
                        return ERR_PTR(-ENODEV);
 
                /*
@@ -1251,12 +1314,12 @@ struct nd_namespace_common *nvdimm_namespace_common_probe(struct device *dev)
                device_unlock(&ndns->dev);
                if (ndns->dev.driver) {
                        dev_dbg(&ndns->dev, "is active, can't bind %s\n",
-                                       dev_name(&nd_btt->dev));
+                                       dev_name(host));
                        return ERR_PTR(-EBUSY);
                }
-               if (dev_WARN_ONCE(&ndns->dev, ndns->claim != &nd_btt->dev,
+               if (dev_WARN_ONCE(&ndns->dev, ndns->claim != host,
                                        "host (%s) vs claim (%s) mismatch\n",
-                                       dev_name(&nd_btt->dev),
+                                       dev_name(host),
                                        dev_name(ndns->claim)))
                        return ERR_PTR(-ENXIO);
        } else {
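
Putting the suffix rules from nvdimm_namespace_disk_name() earlier in this file together, the resulting block device names look roughly like this (a sketch for a hypothetical region 0 and blk namespace 0.1):

        /*
         * pmem0     - raw pmem namespace (unclaimed, not page-mapped)
         * pmem0s    - the same namespace claimed by a BTT
         * pmem0m    - the same namespace claimed by a pfn device, or
         *             page-mapped per pmem_should_map_pages()
         * ndblk0.1  - a blk-mode namespace
         */
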
index e1970c71ad1c53b3e74a0e976232c34daa594a9a..159aed53204249ce6e9da13fcaa0b58406a1f2f6 100644 (file)
@@ -80,4 +80,13 @@ struct resource *nsblk_add_resource(struct nd_region *nd_region,
 int nvdimm_num_label_slots(struct nvdimm_drvdata *ndd);
 void get_ndd(struct nvdimm_drvdata *ndd);
 resource_size_t __nvdimm_namespace_capacity(struct nd_namespace_common *ndns);
+void nd_detach_ndns(struct device *dev, struct nd_namespace_common **_ndns);
+void __nd_detach_ndns(struct device *dev, struct nd_namespace_common **_ndns);
+bool nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach,
+               struct nd_namespace_common **_ndns);
+bool __nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach,
+               struct nd_namespace_common **_ndns);
+ssize_t nd_namespace_store(struct device *dev,
+               struct nd_namespace_common **_ndns, const char *buf,
+               size_t len);
 #endif /* __ND_CORE_H__ */
index c41f53e74277cb23252be136e7b94a859b6571f6..417e521d299cb4645f5739f7608cc2d0d84a701c 100644 (file)
@@ -29,6 +29,13 @@ enum {
        ND_MAX_LANES = 256,
        SECTOR_SHIFT = 9,
        INT_LBASIZE_ALIGNMENT = 64,
+#if IS_ENABLED(CONFIG_NVDIMM_PFN)
+       ND_PFN_ALIGN = PAGES_PER_SECTION * PAGE_SIZE,
+       ND_PFN_MASK = ND_PFN_ALIGN - 1,
+#else
+       ND_PFN_ALIGN = 0,
+       ND_PFN_MASK = 0,
+#endif
 };
 
 struct nvdimm_drvdata {
@@ -92,8 +99,11 @@ struct nd_region {
        struct device dev;
        struct ida ns_ida;
        struct ida btt_ida;
+       struct ida pfn_ida;
+       unsigned long flags;
        struct device *ns_seed;
        struct device *btt_seed;
+       struct device *pfn_seed;
        u16 ndr_mappings;
        u64 ndr_size;
        u64 ndr_start;
@@ -133,6 +143,22 @@ struct nd_btt {
        int id;
 };
 
+enum nd_pfn_mode {
+       PFN_MODE_NONE,
+       PFN_MODE_RAM,
+       PFN_MODE_PMEM,
+};
+
+struct nd_pfn {
+       int id;
+       u8 *uuid;
+       struct device dev;
+       unsigned long npfns;
+       enum nd_pfn_mode mode;
+       struct nd_pfn_sb *pfn_sb;
+       struct nd_namespace_common *ndns;
+};
+
 enum nd_async_mode {
        ND_SYNC,
        ND_ASYNC,
@@ -159,14 +185,19 @@ int nvdimm_init_config_data(struct nvdimm_drvdata *ndd);
 int nvdimm_set_config_data(struct nvdimm_drvdata *ndd, size_t offset,
                void *buf, size_t len);
 struct nd_btt *to_nd_btt(struct device *dev);
-struct btt_sb;
-u64 nd_btt_sb_checksum(struct btt_sb *btt_sb);
+
+struct nd_gen_sb {
+       char reserved[SZ_4K - 8];
+       __le64 checksum;
+};
+
+u64 nd_sb_checksum(struct nd_gen_sb *sb);
 #if IS_ENABLED(CONFIG_BTT)
 int nd_btt_probe(struct nd_namespace_common *ndns, void *drvdata);
 bool is_nd_btt(struct device *dev);
 struct device *nd_btt_create(struct nd_region *nd_region);
 #else
-static inline nd_btt_probe(struct nd_namespace_common *ndns, void *drvdata)
+static inline int nd_btt_probe(struct nd_namespace_common *ndns, void *drvdata)
 {
        return -ENODEV;
 }
@@ -180,8 +211,36 @@ static inline struct device *nd_btt_create(struct nd_region *nd_region)
 {
        return NULL;
 }
+#endif
 
+struct nd_pfn *to_nd_pfn(struct device *dev);
+#if IS_ENABLED(CONFIG_NVDIMM_PFN)
+int nd_pfn_probe(struct nd_namespace_common *ndns, void *drvdata);
+bool is_nd_pfn(struct device *dev);
+struct device *nd_pfn_create(struct nd_region *nd_region);
+int nd_pfn_validate(struct nd_pfn *nd_pfn);
+#else
+static inline int nd_pfn_probe(struct nd_namespace_common *ndns, void *drvdata)
+{
+       return -ENODEV;
+}
+
+static inline bool is_nd_pfn(struct device *dev)
+{
+       return false;
+}
+
+static inline struct device *nd_pfn_create(struct nd_region *nd_region)
+{
+       return NULL;
+}
+
+static inline int nd_pfn_validate(struct nd_pfn *nd_pfn)
+{
+       return -ENODEV;
+}
 #endif
+
 struct nd_region *to_nd_region(struct device *dev);
 int nd_region_to_nstype(struct nd_region *nd_region);
 int nd_region_register_namespaces(struct nd_region *nd_region, int *err);
@@ -217,4 +276,6 @@ static inline bool nd_iostat_start(struct bio *bio, unsigned long *start)
 }
 void nd_iostat_end(struct bio *bio, unsigned long start);
 resource_size_t nd_namespace_blk_validate(struct nd_namespace_blk *nsblk);
+const u8 *nd_dev_to_uuid(struct device *dev);
+bool pmem_should_map_pages(struct device *dev);
 #endif /* __ND_H__ */
diff --git a/drivers/nvdimm/pfn.h b/drivers/nvdimm/pfn.h
new file mode 100644 (file)
index 0000000..cc24375
--- /dev/null
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2014-2015, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef __NVDIMM_PFN_H
+#define __NVDIMM_PFN_H
+
+#include <linux/types.h>
+
+#define PFN_SIG_LEN 16
+#define PFN_SIG "NVDIMM_PFN_INFO\0"
+
+struct nd_pfn_sb {
+       u8 signature[PFN_SIG_LEN];
+       u8 uuid[16];
+       u8 parent_uuid[16];
+       __le32 flags;
+       __le16 version_major;
+       __le16 version_minor;
+       __le64 dataoff;
+       __le64 npfns;
+       __le32 mode;
+       u8 padding[4012];
+       __le64 checksum;
+};
+#endif /* __NVDIMM_PFN_H */
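
The 4012 bytes of padding are sized so that the whole info block is exactly 4K with the checksum in the final 8 bytes, matching the nd_gen_sb convention and the BUILD_BUG_ONs added in nd_sb_checksum() above: 16 + 16 + 16 + 4 + 2 + 2 + 8 + 8 + 4 + 4012 + 8 = 4096. A compile-time restatement of that contract (a sketch, assuming the kernel integer types used by pfn.h are in scope):

        #include <stddef.h>
        #include "pfn.h"

        _Static_assert(sizeof(struct nd_pfn_sb) == 4096,
                       "pfn info block must be exactly one 4K block");
        _Static_assert(offsetof(struct nd_pfn_sb, checksum) == 4096 - 8,
                       "checksum must occupy the last 8 bytes, as nd_gen_sb assumes");
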
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
new file mode 100644 (file)
index 0000000..3fd7d0d
--- /dev/null
@@ -0,0 +1,337 @@
+/*
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/blkdev.h>
+#include <linux/device.h>
+#include <linux/genhd.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include "nd-core.h"
+#include "pfn.h"
+#include "nd.h"
+
+static void nd_pfn_release(struct device *dev)
+{
+       struct nd_region *nd_region = to_nd_region(dev->parent);
+       struct nd_pfn *nd_pfn = to_nd_pfn(dev);
+
+       dev_dbg(dev, "%s\n", __func__);
+       nd_detach_ndns(&nd_pfn->dev, &nd_pfn->ndns);
+       ida_simple_remove(&nd_region->pfn_ida, nd_pfn->id);
+       kfree(nd_pfn->uuid);
+       kfree(nd_pfn);
+}
+
+static struct device_type nd_pfn_device_type = {
+       .name = "nd_pfn",
+       .release = nd_pfn_release,
+};
+
+bool is_nd_pfn(struct device *dev)
+{
+       return dev ? dev->type == &nd_pfn_device_type : false;
+}
+EXPORT_SYMBOL(is_nd_pfn);
+
+struct nd_pfn *to_nd_pfn(struct device *dev)
+{
+       struct nd_pfn *nd_pfn = container_of(dev, struct nd_pfn, dev);
+
+       WARN_ON(!is_nd_pfn(dev));
+       return nd_pfn;
+}
+EXPORT_SYMBOL(to_nd_pfn);
+
+static ssize_t mode_show(struct device *dev,
+               struct device_attribute *attr, char *buf)
+{
+       struct nd_pfn *nd_pfn = to_nd_pfn(dev);
+
+       switch (nd_pfn->mode) {
+       case PFN_MODE_RAM:
+               return sprintf(buf, "ram\n");
+       case PFN_MODE_PMEM:
+               return sprintf(buf, "pmem\n");
+       default:
+               return sprintf(buf, "none\n");
+       }
+}
+
+static ssize_t mode_store(struct device *dev,
+               struct device_attribute *attr, const char *buf, size_t len)
+{
+       struct nd_pfn *nd_pfn = to_nd_pfn(dev);
+       ssize_t rc = 0;
+
+       device_lock(dev);
+       nvdimm_bus_lock(dev);
+       if (dev->driver)
+               rc = -EBUSY;
+       else {
+               size_t n = len - 1;
+
+               if (strncmp(buf, "pmem\n", n) == 0
+                               || strncmp(buf, "pmem", n) == 0) {
+                       /* TODO: allocate from PMEM support */
+                       rc = -ENOTTY;
+               } else if (strncmp(buf, "ram\n", n) == 0
+                               || strncmp(buf, "ram", n) == 0)
+                       nd_pfn->mode = PFN_MODE_RAM;
+               else if (strncmp(buf, "none\n", n) == 0
+                               || strncmp(buf, "none", n) == 0)
+                       nd_pfn->mode = PFN_MODE_NONE;
+               else
+                       rc = -EINVAL;
+       }
+       dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__,
+                       rc, buf, buf[len - 1] == '\n' ? "" : "\n");
+       nvdimm_bus_unlock(dev);
+       device_unlock(dev);
+
+       return rc ? rc : len;
+}
+static DEVICE_ATTR_RW(mode);
+
+static ssize_t uuid_show(struct device *dev,
+               struct device_attribute *attr, char *buf)
+{
+       struct nd_pfn *nd_pfn = to_nd_pfn(dev);
+
+       if (nd_pfn->uuid)
+               return sprintf(buf, "%pUb\n", nd_pfn->uuid);
+       return sprintf(buf, "\n");
+}
+
+static ssize_t uuid_store(struct device *dev,
+               struct device_attribute *attr, const char *buf, size_t len)
+{
+       struct nd_pfn *nd_pfn = to_nd_pfn(dev);
+       ssize_t rc;
+
+       device_lock(dev);
+       rc = nd_uuid_store(dev, &nd_pfn->uuid, buf, len);
+       dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__,
+                       rc, buf, buf[len - 1] == '\n' ? "" : "\n");
+       device_unlock(dev);
+
+       return rc ? rc : len;
+}
+static DEVICE_ATTR_RW(uuid);
+
+static ssize_t namespace_show(struct device *dev,
+               struct device_attribute *attr, char *buf)
+{
+       struct nd_pfn *nd_pfn = to_nd_pfn(dev);
+       ssize_t rc;
+
+       nvdimm_bus_lock(dev);
+       rc = sprintf(buf, "%s\n", nd_pfn->ndns
+                       ? dev_name(&nd_pfn->ndns->dev) : "");
+       nvdimm_bus_unlock(dev);
+       return rc;
+}
+
+static ssize_t namespace_store(struct device *dev,
+               struct device_attribute *attr, const char *buf, size_t len)
+{
+       struct nd_pfn *nd_pfn = to_nd_pfn(dev);
+       ssize_t rc;
+
+       nvdimm_bus_lock(dev);
+       device_lock(dev);
+       rc = nd_namespace_store(dev, &nd_pfn->ndns, buf, len);
+       dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__,
+                       rc, buf, buf[len - 1] == '\n' ? "" : "\n");
+       device_unlock(dev);
+       nvdimm_bus_unlock(dev);
+
+       return rc;
+}
+static DEVICE_ATTR_RW(namespace);
+
+static struct attribute *nd_pfn_attributes[] = {
+       &dev_attr_mode.attr,
+       &dev_attr_namespace.attr,
+       &dev_attr_uuid.attr,
+       NULL,
+};
+
+static struct attribute_group nd_pfn_attribute_group = {
+       .attrs = nd_pfn_attributes,
+};
+
+static const struct attribute_group *nd_pfn_attribute_groups[] = {
+       &nd_pfn_attribute_group,
+       &nd_device_attribute_group,
+       &nd_numa_attribute_group,
+       NULL,
+};
+
+static struct device *__nd_pfn_create(struct nd_region *nd_region,
+               u8 *uuid, enum nd_pfn_mode mode,
+               struct nd_namespace_common *ndns)
+{
+       struct nd_pfn *nd_pfn;
+       struct device *dev;
+
+       /* we can only create pages for contiguous ranges of pmem */
+       if (!is_nd_pmem(&nd_region->dev))
+               return NULL;
+
+       nd_pfn = kzalloc(sizeof(*nd_pfn), GFP_KERNEL);
+       if (!nd_pfn)
+               return NULL;
+
+       nd_pfn->id = ida_simple_get(&nd_region->pfn_ida, 0, 0, GFP_KERNEL);
+       if (nd_pfn->id < 0) {
+               kfree(nd_pfn);
+               return NULL;
+       }
+
+       nd_pfn->mode = mode;
+       if (uuid)
+               uuid = kmemdup(uuid, 16, GFP_KERNEL);
+       nd_pfn->uuid = uuid;
+       dev = &nd_pfn->dev;
+       dev_set_name(dev, "pfn%d.%d", nd_region->id, nd_pfn->id);
+       dev->parent = &nd_region->dev;
+       dev->type = &nd_pfn_device_type;
+       dev->groups = nd_pfn_attribute_groups;
+       device_initialize(&nd_pfn->dev);
+       if (ndns && !__nd_attach_ndns(&nd_pfn->dev, ndns, &nd_pfn->ndns)) {
+               dev_dbg(&ndns->dev, "%s failed, already claimed by %s\n",
+                               __func__, dev_name(ndns->claim));
+               put_device(dev);
+               return NULL;
+       }
+       return dev;
+}
+
+struct device *nd_pfn_create(struct nd_region *nd_region)
+{
+       struct device *dev = __nd_pfn_create(nd_region, NULL, PFN_MODE_NONE,
+                       NULL);
+
+       if (dev)
+               __nd_device_register(dev);
+       return dev;
+}
+
+int nd_pfn_validate(struct nd_pfn *nd_pfn)
+{
+       struct nd_namespace_common *ndns = nd_pfn->ndns;
+       struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;
+       struct nd_namespace_io *nsio;
+       u64 checksum, offset;
+
+       if (!pfn_sb || !ndns)
+               return -ENODEV;
+
+       if (!is_nd_pmem(nd_pfn->dev.parent))
+               return -ENODEV;
+
+       /* section alignment for simple hotplug */
+       if (nvdimm_namespace_capacity(ndns) < ND_PFN_ALIGN)
+               return -ENODEV;
+
+       if (nvdimm_read_bytes(ndns, SZ_4K, pfn_sb, sizeof(*pfn_sb)))
+               return -ENXIO;
+
+       if (memcmp(pfn_sb->signature, PFN_SIG, PFN_SIG_LEN) != 0)
+               return -ENODEV;
+
+       checksum = le64_to_cpu(pfn_sb->checksum);
+       pfn_sb->checksum = 0;
+       if (checksum != nd_sb_checksum((struct nd_gen_sb *) pfn_sb))
+               return -ENODEV;
+       pfn_sb->checksum = cpu_to_le64(checksum);
+
+       switch (le32_to_cpu(pfn_sb->mode)) {
+       case PFN_MODE_RAM:
+               break;
+       case PFN_MODE_PMEM:
+               /* TODO: allocate from PMEM support */
+               return -ENOTTY;
+       default:
+               return -ENXIO;
+       }
+
+       if (!nd_pfn->uuid) {
+               /* from probe we allocate */
+               nd_pfn->uuid = kmemdup(pfn_sb->uuid, 16, GFP_KERNEL);
+               if (!nd_pfn->uuid)
+                       return -ENOMEM;
+       } else {
+               /* from init we validate */
+               if (memcmp(nd_pfn->uuid, pfn_sb->uuid, 16) != 0)
+                       return -EINVAL;
+       }
+
+       /*
+        * These warnings are verbose because they can only trigger in
+        * the case where the physical address alignment of the
+        * namespace has changed since the pfn superblock was
+        * established.
+        */
+       offset = le64_to_cpu(pfn_sb->dataoff);
+       nsio = to_nd_namespace_io(&ndns->dev);
+       if (nsio->res.start & ND_PFN_MASK) {
+               dev_err(&nd_pfn->dev,
+                               "init failed: %s not section aligned\n",
+                               dev_name(&ndns->dev));
+               return -EBUSY;
+       } else if (offset >= resource_size(&nsio->res)) {
+               dev_err(&nd_pfn->dev, "pfn array size exceeds capacity of %s\n",
+                               dev_name(&ndns->dev));
+               return -EBUSY;
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL(nd_pfn_validate);
+
+int nd_pfn_probe(struct nd_namespace_common *ndns, void *drvdata)
+{
+       int rc;
+       struct device *dev;
+       struct nd_pfn *nd_pfn;
+       struct nd_pfn_sb *pfn_sb;
+       struct nd_region *nd_region = to_nd_region(ndns->dev.parent);
+
+       if (ndns->force_raw)
+               return -ENODEV;
+
+       nvdimm_bus_lock(&ndns->dev);
+       dev = __nd_pfn_create(nd_region, NULL, PFN_MODE_NONE, ndns);
+       nvdimm_bus_unlock(&ndns->dev);
+       if (!dev)
+               return -ENOMEM;
+       dev_set_drvdata(dev, drvdata);
+       pfn_sb = kzalloc(sizeof(*pfn_sb), GFP_KERNEL);
+       nd_pfn = to_nd_pfn(dev);
+       nd_pfn->pfn_sb = pfn_sb;
+       rc = nd_pfn_validate(nd_pfn);
+       nd_pfn->pfn_sb = NULL;
+       kfree(pfn_sb);
+       dev_dbg(&ndns->dev, "%s: pfn: %s\n", __func__,
+                       rc == 0 ? dev_name(dev) : "<none>");
+       if (rc < 0) {
+               __nd_detach_ndns(dev, &nd_pfn->ndns);
+               put_device(dev);
+       } else
+               __nd_device_register(&nd_pfn->dev);
+
+       return rc;
+}
+EXPORT_SYMBOL(nd_pfn_probe);
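For reference, a minimal stand-alone sketch of the checksum pattern used by nd_pfn_validate() above: save the stored value, zero the checksum field, recompute over the whole superblock, compare, then restore the on-media value. sum64() is only a hypothetical stand-in for nd_sb_checksum(), and the struct is simplified.

#include <stddef.h>
#include <stdint.h>

struct pfn_sb_sketch {
	char signature[16];
	uint64_t dataoff;
	uint64_t npfns;
	uint64_t checksum;	/* must read as zero while the sum is taken */
};

/* hypothetical stand-in for nd_sb_checksum() */
static uint64_t sum64(const void *buf, size_t len)
{
	const unsigned char *p = buf;
	uint64_t sum = 0;

	while (len--)
		sum = sum * 31 + *p++;
	return sum;
}

/* returns 1 when the stored checksum matches, 0 otherwise */
static int pfn_sb_sketch_valid(struct pfn_sb_sketch *sb)
{
	uint64_t stored = sb->checksum;
	int ok;

	sb->checksum = 0;		/* zero the field before summing */
	ok = sum64(sb, sizeof(*sb)) == stored;
	sb->checksum = stored;		/* put the on-media value back */
	return ok;
}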
index 4c079d5cb53974aa18260f05a88214016559aa75..b9525385c0dc35532ceaed6087c3f36ea2697ae0 100644 (file)
 #include <linux/init.h>
 #include <linux/platform_device.h>
 #include <linux/module.h>
+#include <linux/memory_hotplug.h>
 #include <linux/moduleparam.h>
+#include <linux/vmalloc.h>
 #include <linux/slab.h>
 #include <linux/pmem.h>
 #include <linux/nd.h>
+#include "pfn.h"
 #include "nd.h"
 
 struct pmem_device {
        struct request_queue    *pmem_queue;
        struct gendisk          *pmem_disk;
+       struct nd_namespace_common *ndns;
 
        /* One contiguous memory region per device */
        phys_addr_t             phys_addr;
+       /* when non-zero this device is hosting a 'pfn' instance */
+       phys_addr_t             data_offset;
        void __pmem             *virt_addr;
        size_t                  size;
 };
@@ -44,7 +50,7 @@ static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
                        sector_t sector)
 {
        void *mem = kmap_atomic(page);
-       size_t pmem_off = sector << 9;
+       phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
        void __pmem *pmem_addr = pmem->virt_addr + pmem_off;
 
        if (rw == READ) {
@@ -92,19 +98,26 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
 }
 
 static long pmem_direct_access(struct block_device *bdev, sector_t sector,
-                             void **kaddr, unsigned long *pfn, long size)
+                     void __pmem **kaddr, unsigned long *pfn)
 {
        struct pmem_device *pmem = bdev->bd_disk->private_data;
-       size_t offset = sector << 9;
-
-       if (!pmem)
-               return -ENODEV;
+       resource_size_t offset = sector * 512 + pmem->data_offset;
+       resource_size_t size;
+
+       if (pmem->data_offset) {
+               /*
+                * Limit the direct_access() size to what is covered by
+                * the memmap
+                */
+               size = (pmem->size - offset) & ~ND_PFN_MASK;
+       } else
+               size = pmem->size - offset;
 
        /* FIXME convert DAX to comprehend that this mapping has a lifetime */
-       *kaddr = (void __force *) pmem->virt_addr + offset;
+       *kaddr = pmem->virt_addr + offset;
        *pfn = (pmem->phys_addr + offset) >> PAGE_SHIFT;
 
-       return pmem->size - offset;
+       return size;
 }
 
 static const struct block_device_operations pmem_fops = {
@@ -119,27 +132,33 @@ static struct pmem_device *pmem_alloc(struct device *dev,
 {
        struct pmem_device *pmem;
 
-       pmem = kzalloc(sizeof(*pmem), GFP_KERNEL);
+       pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL);
        if (!pmem)
                return ERR_PTR(-ENOMEM);
 
        pmem->phys_addr = res->start;
        pmem->size = resource_size(res);
-       if (!arch_has_pmem_api())
+       if (!arch_has_wmb_pmem())
                dev_warn(dev, "unable to guarantee persistence of writes\n");
 
-       if (!request_mem_region(pmem->phys_addr, pmem->size, dev_name(dev))) {
+       if (!devm_request_mem_region(dev, pmem->phys_addr, pmem->size,
+                       dev_name(dev))) {
                dev_warn(dev, "could not reserve region [0x%pa:0x%zx]\n",
                                &pmem->phys_addr, pmem->size);
-               kfree(pmem);
                return ERR_PTR(-EBUSY);
        }
 
-       pmem->virt_addr = memremap_pmem(pmem->phys_addr, pmem->size);
-       if (!pmem->virt_addr) {
-               release_mem_region(pmem->phys_addr, pmem->size);
-               kfree(pmem);
-               return ERR_PTR(-ENXIO);
+       if (pmem_should_map_pages(dev)) {
+               void *addr = devm_memremap_pages(dev, res);
+
+               if (IS_ERR(addr))
+                       return addr;
+               pmem->virt_addr = (void __pmem *) addr;
+       } else {
+               pmem->virt_addr = memremap_pmem(dev, pmem->phys_addr,
+                               pmem->size);
+               if (!pmem->virt_addr)
+                       return ERR_PTR(-ENXIO);
        }
 
        return pmem;
@@ -147,13 +166,16 @@ static struct pmem_device *pmem_alloc(struct device *dev,
 
 static void pmem_detach_disk(struct pmem_device *pmem)
 {
+       if (!pmem->pmem_disk)
+               return;
+
        del_gendisk(pmem->pmem_disk);
        put_disk(pmem->pmem_disk);
        blk_cleanup_queue(pmem->pmem_queue);
 }
 
-static int pmem_attach_disk(struct nd_namespace_common *ndns,
-               struct pmem_device *pmem)
+static int pmem_attach_disk(struct device *dev,
+               struct nd_namespace_common *ndns, struct pmem_device *pmem)
 {
        struct gendisk *disk;
 
@@ -162,6 +184,7 @@ static int pmem_attach_disk(struct nd_namespace_common *ndns,
                return -ENOMEM;
 
        blk_queue_make_request(pmem->pmem_queue, pmem_make_request);
+       blk_queue_physical_block_size(pmem->pmem_queue, PAGE_SIZE);
        blk_queue_max_hw_sectors(pmem->pmem_queue, UINT_MAX);
        blk_queue_bounce_limit(pmem->pmem_queue, BLK_BOUNCE_ANY);
        queue_flag_set_unlocked(QUEUE_FLAG_NONROT, pmem->pmem_queue);
@@ -179,8 +202,8 @@ static int pmem_attach_disk(struct nd_namespace_common *ndns,
        disk->queue             = pmem->pmem_queue;
        disk->flags             = GENHD_FL_EXT_DEVT;
        nvdimm_namespace_disk_name(ndns, disk->disk_name);
-       disk->driverfs_dev = &ndns->dev;
-       set_capacity(disk, pmem->size >> 9);
+       disk->driverfs_dev = dev;
+       set_capacity(disk, (pmem->size - pmem->data_offset) / 512);
        pmem->pmem_disk = disk;
 
        add_disk(disk);
@@ -209,11 +232,152 @@ static int pmem_rw_bytes(struct nd_namespace_common *ndns,
        return 0;
 }
 
-static void pmem_free(struct pmem_device *pmem)
+static int nd_pfn_init(struct nd_pfn *nd_pfn)
+{
+       struct nd_pfn_sb *pfn_sb = kzalloc(sizeof(*pfn_sb), GFP_KERNEL);
+       struct pmem_device *pmem = dev_get_drvdata(&nd_pfn->dev);
+       struct nd_namespace_common *ndns = nd_pfn->ndns;
+       struct nd_region *nd_region;
+       unsigned long npfns;
+       phys_addr_t offset;
+       u64 checksum;
+       int rc;
+
+       if (!pfn_sb)
+               return -ENOMEM;
+
+       nd_pfn->pfn_sb = pfn_sb;
+       rc = nd_pfn_validate(nd_pfn);
+       if (rc == 0 || rc == -EBUSY)
+               return rc;
+
+       /* section alignment for simple hotplug */
+       if (nvdimm_namespace_capacity(ndns) < ND_PFN_ALIGN
+                       || pmem->phys_addr & ND_PFN_MASK)
+               return -ENODEV;
+
+       nd_region = to_nd_region(nd_pfn->dev.parent);
+       if (nd_region->ro) {
+               dev_info(&nd_pfn->dev,
+                               "%s is read-only, unable to init metadata\n",
+                               dev_name(&nd_region->dev));
+               goto err;
+       }
+
+       memset(pfn_sb, 0, sizeof(*pfn_sb));
+       npfns = (pmem->size - SZ_8K) / SZ_4K;
+       /*
+        * Note: 64 is used here as the standard size of struct page;
+        * debugging options may make it larger, in which case the
+        * implementation limits the pfns advertised through
+        * ->direct_access() to those covered by the memmap.
+        */
+       if (nd_pfn->mode == PFN_MODE_PMEM)
+               offset = ALIGN(SZ_8K + 64 * npfns, PMD_SIZE);
+       else if (nd_pfn->mode == PFN_MODE_RAM)
+               offset = SZ_8K;
+       else
+               goto err;
+
+       npfns = (pmem->size - offset) / SZ_4K;
+       pfn_sb->mode = cpu_to_le32(nd_pfn->mode);
+       pfn_sb->dataoff = cpu_to_le64(offset);
+       pfn_sb->npfns = cpu_to_le64(npfns);
+       memcpy(pfn_sb->signature, PFN_SIG, PFN_SIG_LEN);
+       memcpy(pfn_sb->uuid, nd_pfn->uuid, 16);
+       pfn_sb->version_major = cpu_to_le16(1);
+       checksum = nd_sb_checksum((struct nd_gen_sb *) pfn_sb);
+       pfn_sb->checksum = cpu_to_le64(checksum);
+
+       rc = nvdimm_write_bytes(ndns, SZ_4K, pfn_sb, sizeof(*pfn_sb));
+       if (rc)
+               goto err;
+
+       return 0;
+ err:
+       nd_pfn->pfn_sb = NULL;
+       kfree(pfn_sb);
+       return -ENXIO;
+}
+
+static int nvdimm_namespace_detach_pfn(struct nd_namespace_common *ndns)
+{
+       struct nd_pfn *nd_pfn = to_nd_pfn(ndns->claim);
+       struct pmem_device *pmem;
+
+       /* free pmem disk */
+       pmem = dev_get_drvdata(&nd_pfn->dev);
+       pmem_detach_disk(pmem);
+
+       /* release nd_pfn resources */
+       kfree(nd_pfn->pfn_sb);
+       nd_pfn->pfn_sb = NULL;
+
+       return 0;
+}
+
+static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns)
 {
-       memunmap_pmem(pmem->virt_addr);
-       release_mem_region(pmem->phys_addr, pmem->size);
-       kfree(pmem);
+       struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
+       struct nd_pfn *nd_pfn = to_nd_pfn(ndns->claim);
+       struct device *dev = &nd_pfn->dev;
+       struct vmem_altmap *altmap;
+       struct nd_region *nd_region;
+       struct nd_pfn_sb *pfn_sb;
+       struct pmem_device *pmem;
+       phys_addr_t offset;
+       int rc;
+
+       if (!nd_pfn->uuid || !nd_pfn->ndns)
+               return -ENODEV;
+
+       nd_region = to_nd_region(dev->parent);
+       rc = nd_pfn_init(nd_pfn);
+       if (rc)
+               return rc;
+
+       if (PAGE_SIZE != SZ_4K) {
+               dev_err(dev, "only supported on systems with 4K PAGE_SIZE\n");
+               return -ENXIO;
+       }
+       if (nsio->res.start & ND_PFN_MASK) {
+               dev_err(dev, "%s not memory hotplug section aligned\n",
+                               dev_name(&ndns->dev));
+               return -ENXIO;
+       }
+
+       pfn_sb = nd_pfn->pfn_sb;
+       offset = le64_to_cpu(pfn_sb->dataoff);
+       nd_pfn->mode = le32_to_cpu(nd_pfn->pfn_sb->mode);
+       if (nd_pfn->mode == PFN_MODE_RAM) {
+               if (offset != SZ_8K)
+                       return -EINVAL;
+               nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns);
+               altmap = NULL;
+       } else {
+               rc = -ENXIO;
+               goto err;
+       }
+
+       /* establish pfn range for lookup, and switch to direct map */
+       pmem = dev_get_drvdata(dev);
+       memunmap_pmem(dev, pmem->virt_addr);
+       pmem->virt_addr = (void __pmem *)devm_memremap_pages(dev, &nsio->res);
+       if (IS_ERR(pmem->virt_addr)) {
+               rc = PTR_ERR(pmem->virt_addr);
+               goto err;
+       }
+
+       /* attach pmem disk in "pfn-mode" */
+       pmem->data_offset = offset;
+       rc = pmem_attach_disk(dev, ndns, pmem);
+       if (rc)
+               goto err;
+
+       return rc;
+ err:
+       nvdimm_namespace_detach_pfn(ndns);
+       return rc;
 }
 
 static int nd_pmem_probe(struct device *dev)
@@ -222,7 +386,6 @@ static int nd_pmem_probe(struct device *dev)
        struct nd_namespace_common *ndns;
        struct nd_namespace_io *nsio;
        struct pmem_device *pmem;
-       int rc;
 
        ndns = nvdimm_namespace_common_probe(dev);
        if (IS_ERR(ndns))
@@ -233,18 +396,27 @@ static int nd_pmem_probe(struct device *dev)
        if (IS_ERR(pmem))
                return PTR_ERR(pmem);
 
+       pmem->ndns = ndns;
        dev_set_drvdata(dev, pmem);
        ndns->rw_bytes = pmem_rw_bytes;
+
        if (is_nd_btt(dev))
-               rc = nvdimm_namespace_attach_btt(ndns);
-       else if (nd_btt_probe(ndns, pmem) == 0) {
+               return nvdimm_namespace_attach_btt(ndns);
+
+       if (is_nd_pfn(dev))
+               return nvdimm_namespace_attach_pfn(ndns);
+
+       if (nd_btt_probe(ndns, pmem) == 0) {
                /* we'll come back as btt-pmem */
-               rc = -ENXIO;
-       } else
-               rc = pmem_attach_disk(ndns, pmem);
-       if (rc)
-               pmem_free(pmem);
-       return rc;
+               return -ENXIO;
+       }
+
+       if (nd_pfn_probe(ndns, pmem) == 0) {
+               /* we'll come back as pfn-pmem */
+               return -ENXIO;
+       }
+
+       return pmem_attach_disk(dev, ndns, pmem);
 }
 
 static int nd_pmem_remove(struct device *dev)
@@ -252,10 +424,11 @@ static int nd_pmem_remove(struct device *dev)
        struct pmem_device *pmem = dev_get_drvdata(dev);
 
        if (is_nd_btt(dev))
-               nvdimm_namespace_detach_btt(to_nd_btt(dev)->ndns);
+               nvdimm_namespace_detach_btt(pmem->ndns);
+       else if (is_nd_pfn(dev))
+               nvdimm_namespace_detach_pfn(pmem->ndns);
        else
                pmem_detach_disk(pmem);
-       pmem_free(pmem);
 
        return 0;
 }
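As a rough, stand-alone illustration of the arithmetic in nd_pfn_init() above (assuming 4 KiB pages, a 2 MiB PMD, the 64-byte struct page mentioned in the comment, and a hypothetical 16 GiB namespace), the PFN_MODE_PMEM data offset and the resulting pfn count work out as follows:

#include <stdint.h>
#include <stdio.h>

#define SZ_4K		4096ULL
#define SZ_8K		8192ULL
#define PMD_SIZE	(2ULL << 20)	/* assumed 2 MiB huge-page size */

static uint64_t align_up(uint64_t x, uint64_t a)
{
	return (x + a - 1) & ~(a - 1);
}

int main(void)
{
	uint64_t size = 16ULL << 30;			/* 16 GiB namespace */
	uint64_t npfns = (size - SZ_8K) / SZ_4K;	/* worst-case pfn count */
	uint64_t offset = align_up(SZ_8K + 64 * npfns, PMD_SIZE);

	/* pfns actually covered once the memmap reservation is subtracted */
	npfns = (size - offset) / SZ_4K;

	printf("data_offset = %llu bytes, npfns = %llu\n",
	       (unsigned long long)offset, (unsigned long long)npfns);
	return 0;
}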
index f28f78ccff190ceb92c86684245c0364c3dde6ed..7da63eac78eec1b449457fb123bbac4154deaef3 100644 (file)
@@ -53,6 +53,7 @@ static int nd_region_probe(struct device *dev)
                return -ENODEV;
 
        nd_region->btt_seed = nd_btt_create(nd_region);
+       nd_region->pfn_seed = nd_pfn_create(nd_region);
        if (err == 0)
                return 0;
 
@@ -84,6 +85,7 @@ static int nd_region_remove(struct device *dev)
        nvdimm_bus_lock(dev);
        nd_region->ns_seed = NULL;
        nd_region->btt_seed = NULL;
+       nd_region->pfn_seed = NULL;
        dev_set_drvdata(dev, NULL);
        nvdimm_bus_unlock(dev);
 
index 7384455792bfb629ed6a2b9a5dbe40d1f58f2627..529f3f02e7b2d90ee989e88d400aa1bb4f5f0500 100644 (file)
@@ -345,6 +345,23 @@ static ssize_t btt_seed_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(btt_seed);
 
+static ssize_t pfn_seed_show(struct device *dev,
+               struct device_attribute *attr, char *buf)
+{
+       struct nd_region *nd_region = to_nd_region(dev);
+       ssize_t rc;
+
+       nvdimm_bus_lock(dev);
+       if (nd_region->pfn_seed)
+               rc = sprintf(buf, "%s\n", dev_name(nd_region->pfn_seed));
+       else
+               rc = sprintf(buf, "\n");
+       nvdimm_bus_unlock(dev);
+
+       return rc;
+}
+static DEVICE_ATTR_RO(pfn_seed);
+
 static ssize_t read_only_show(struct device *dev,
                struct device_attribute *attr, char *buf)
 {
@@ -373,6 +390,7 @@ static struct attribute *nd_region_attributes[] = {
        &dev_attr_nstype.attr,
        &dev_attr_mappings.attr,
        &dev_attr_btt_seed.attr,
+       &dev_attr_pfn_seed.attr,
        &dev_attr_read_only.attr,
        &dev_attr_set_cookie.attr,
        &dev_attr_available_size.attr,
@@ -740,10 +758,12 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
        nd_region->provider_data = ndr_desc->provider_data;
        nd_region->nd_set = ndr_desc->nd_set;
        nd_region->num_lanes = ndr_desc->num_lanes;
+       nd_region->flags = ndr_desc->flags;
        nd_region->ro = ro;
        nd_region->numa_node = ndr_desc->numa_node;
        ida_init(&nd_region->ns_ida);
        ida_init(&nd_region->btt_ida);
+       ida_init(&nd_region->pfn_ida);
        dev = &nd_region->dev;
        dev_set_name(dev, "region%d", nd_region->id);
        dev->parent = &nvdimm_bus->dev;
index 2956d725649f25ffd7d115097a36a711867b052a..55317fa9c9dca32557c40b1735ac2c2bec5d4714 100644 (file)
@@ -432,6 +432,7 @@ int of_irq_get_byname(struct device_node *dev, const char *name)
 
        return of_irq_get(dev, index);
 }
+EXPORT_SYMBOL_GPL(of_irq_get_byname);
 
 /**
  * of_irq_count - Count the number of IRQs a node uses
index 02ff84fcfa61289f33d7a53f8f1eb16c6b871082..957b42198328f74809cf944ef33a023cb5745b41 100644 (file)
@@ -1103,16 +1103,9 @@ static int ccio_proc_bitmap_info(struct seq_file *m, void *p)
        struct ioc *ioc = ioc_list;
 
        while (ioc != NULL) {
-               u32 *res_ptr = (u32 *)ioc->res_map;
-               int j;
-
-               for (j = 0; j < (ioc->res_size / sizeof(u32)); j++) {
-                       if ((j & 7) == 0)
-                               seq_puts(m, "\n   ");
-                       seq_printf(m, "%08x", *res_ptr);
-                       res_ptr++;
-               }
-               seq_puts(m, "\n\n");
+               seq_hex_dump(m, "   ", DUMP_PREFIX_NONE, 32, 4, ioc->res_map,
+                            ioc->res_size, false);
+               seq_putc(m, '\n');
                ioc = ioc->next;
                break; /* XXX - remove me */
        }
index 901e1a3fa4e2689e6cfd31a01d01a80520dc3a22..7b9e89ba0465f120b07385643b56900a61054696 100644 (file)
@@ -1555,8 +1555,11 @@ lba_driver_probe(struct parisc_device *dev)
        if (lba_dev->hba.lmmio_space.flags)
                pci_add_resource_offset(&resources, &lba_dev->hba.lmmio_space,
                                        lba_dev->hba.lmmio_space_offset);
-       if (lba_dev->hba.gmmio_space.flags)
-               pci_add_resource(&resources, &lba_dev->hba.gmmio_space);
+       if (lba_dev->hba.gmmio_space.flags) {
+               /* pci_add_resource(&resources, &lba_dev->hba.gmmio_space); */
+               pr_warn("LBA: Not registering GMMIO space %pR\n",
+                       &lba_dev->hba.gmmio_space);
+       }
 
        pci_add_resource(&resources, &lba_dev->hba.bus_num);
 
index f1441e466c06cd12218d01f6f527c5485a78cdac..225049b492e535f7bf30ac8ef00f110d4647c0c2 100644 (file)
@@ -1854,14 +1854,9 @@ sba_proc_bitmap_info(struct seq_file *m, void *p)
 {
        struct sba_device *sba_dev = sba_list;
        struct ioc *ioc = &sba_dev->ioc[0];     /* FIXME: Multi-IOC support! */
-       unsigned int *res_ptr = (unsigned int *)ioc->res_map;
-       int i;
 
-       for (i = 0; i < (ioc->res_size/sizeof(unsigned int)); ++i, ++res_ptr) {
-               if ((i & 7) == 0)
-                       seq_puts(m, "\n   ");
-               seq_printf(m, " %08x", *res_ptr);
-       }
+       seq_hex_dump(m, "   ", DUMP_PREFIX_NONE, 32, 4, ioc->res_map,
+                    ioc->res_size, false);
        seq_putc(m, '\n');
 
        return 0;
index 944f50015ed07b41b2de0da464812910c411cc1a..73de4efcbe6edc85c8f3fb54e0c925a7ab30492b 100644 (file)
@@ -2,7 +2,7 @@
 # PCI configuration
 #
 config PCI_BUS_ADDR_T_64BIT
-       def_bool y if (ARCH_DMA_ADDR_T_64BIT || (64BIT && !PARISC))
+       def_bool y if (ARCH_DMA_ADDR_T_64BIT || 64BIT)
        depends on PCI
 
 config PCI_MSI
index 52a880ca1768362ec41399e8df11c3504a28a107..dd652f2ae03db964ed539c5d369092173ab9ab33 100644 (file)
@@ -467,7 +467,7 @@ static void pci_device_shutdown(struct device *dev)
        pci_msi_shutdown(pci_dev);
        pci_msix_shutdown(pci_dev);
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
        /*
         * If this is a kexec reboot, turn off Bus Master bit on the
         * device to tell it to not continue to do DMA. Don't touch
index 8177f3b0449152a7c01136a1925f4bfcf885cf8a..0b2be174d9818e9ffe86110068c9eb2e3ea19f11 100644 (file)
@@ -326,8 +326,7 @@ static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom)
                struct resource *res = &dev->resource[PCI_ROM_RESOURCE];
                dev->rom_base_reg = rom;
                res->flags = IORESOURCE_MEM | IORESOURCE_PREFETCH |
-                               IORESOURCE_READONLY | IORESOURCE_CACHEABLE |
-                               IORESOURCE_SIZEALIGN;
+                               IORESOURCE_READONLY | IORESOURCE_SIZEALIGN;
                __pci_read_base(dev, pci_bar_mem32, res, rom);
        }
 }
index 69723e07036bafbd29c6d6deb66e2f7bd7f0133e..9638a00c67c2bd9a3735a632c99d01e9c2625e5d 100644 (file)
@@ -349,6 +349,9 @@ static bool pinctrl_ready_for_gpio_range(unsigned gpio)
        struct pinctrl_gpio_range *range = NULL;
        struct gpio_chip *chip = gpio_to_chip(gpio);
 
+       if (WARN(!chip, "no gpio_chip for gpio%i?", gpio))
+               return false;
+
        mutex_lock(&pinctrldev_list_mutex);
 
        /* Loop over the pin controllers */
index 461fffc4c62ae8b3661b930e244c51267ca81406..11f8b835d3b64f61fde81bc77c2b5ecf931fd351 100644 (file)
@@ -337,9 +337,9 @@ static int dc_pinctrl_probe(struct platform_device *pdev)
        pmap->dev = &pdev->dev;
 
        pmap->pctl = pinctrl_register(pctl_desc, &pdev->dev, pmap);
-       if (!pmap->pctl) {
+       if (IS_ERR(pmap->pctl)) {
                dev_err(&pdev->dev, "pinctrl driver registration failed\n");
-               return -EINVAL;
+               return PTR_ERR(pmap->pctl);
        }
 
        ret = dc_gpiochip_add(pmap, pdev->dev.of_node);
index 67e08cb315c47e67329f8c4090ef2a1f503b9873..29984b36926aef871bb8e5ce93fa6308ddec37b5 100644 (file)
@@ -313,8 +313,7 @@ static int pinmux_func_name_to_selector(struct pinctrl_dev *pctldev,
 
        /* See if this pctldev has this function */
        while (selector < nfuncs) {
-               const char *fname = ops->get_function_name(pctldev,
-                                                          selector);
+               const char *fname = ops->get_function_name(pctldev, selector);
 
                if (!strcmp(function, fname))
                        return selector;
index c978b311031b52a0f999f6fda52d0f9c34e0a7b0..e1a3721bc8e5814fbef5a39184170407436cb193 100644 (file)
@@ -723,9 +723,9 @@ static int pm8xxx_gpio_probe(struct platform_device *pdev)
 #endif
 
        pctrl->pctrl = pinctrl_register(&pctrl->desc, &pdev->dev, pctrl);
-       if (!pctrl->pctrl) {
+       if (IS_ERR(pctrl->pctrl)) {
                dev_err(&pdev->dev, "couldn't register pm8xxx gpio driver\n");
-               return -ENODEV;
+               return PTR_ERR(pctrl->pctrl);
        }
 
        pctrl->chip = pm8xxx_gpio_template;
index 2d1b69f171be7c4dcffeba6199536f65075b604e..6652b8d7f707aefc5656edd348f95e0d139c21da 100644 (file)
@@ -814,9 +814,9 @@ static int pm8xxx_mpp_probe(struct platform_device *pdev)
 #endif
 
        pctrl->pctrl = pinctrl_register(&pctrl->desc, &pdev->dev, pctrl);
-       if (!pctrl->pctrl) {
+       if (IS_ERR(pctrl->pctrl)) {
                dev_err(&pdev->dev, "couldn't register pm8xxx mpp driver\n");
-               return -ENODEV;
+               return PTR_ERR(pctrl->pctrl);
        }
 
        pctrl->chip = pm8xxx_mpp_template;
index 019844d479bb5c2b1bd63ebc840f796d5c684965..d168b39dd7fdccf5b134e20230ec00a7320f2423 100644 (file)
@@ -361,7 +361,7 @@ static inline void s3c24xx_demux_eint(struct irq_desc *desc,
                                      u32 offset, u32 range)
 {
        struct s3c24xx_eint_data *data = irq_desc_get_handler_data(desc);
-       struct irq_chip *chip = irq_desc_get_irq_chip(desc);
+       struct irq_chip *chip = irq_desc_get_chip(desc);
        struct samsung_pinctrl_drv_data *d = data->drvdata;
        unsigned int pend, mask;
 
index 6dc13e4de3962ee66f7f2cf32dacbc7296529b64..c69bb703f483569e4f8069affe1da123be5a4365 100644 (file)
@@ -919,4 +919,9 @@ config INTEL_PMC_IPC
        The PMC is an ARC processor which defines IPC commands for communication
        with other entities in the CPU.
 
+config SURFACE_PRO3_BUTTON
+       tristate "Power/home/volume buttons driver for Microsoft Surface Pro 3 tablet"
+       depends on ACPI && INPUT
+       ---help---
+         This driver handles the power/home/volume buttons on the Microsoft Surface Pro 3 tablet.
 endif # X86_PLATFORM_DEVICES
index dda95a98532101c1d8d55b54bb47d4545ece5bc4..ada5128190285c5bad8b5be023811ce9db795339 100644 (file)
@@ -60,3 +60,4 @@ obj-$(CONFIG_INTEL_SMARTCONNECT)      += intel-smartconnect.o
 obj-$(CONFIG_PVPANIC)           += pvpanic.o
 obj-$(CONFIG_ALIENWARE_WMI)    += alienware-wmi.o
 obj-$(CONFIG_INTEL_PMC_IPC)    += intel_pmc_ipc.o
+obj-$(CONFIG_SURFACE_PRO3_BUTTON)      += surfacepro3_button.o
index f6b280dbfb3331b847a3d9927b1d7105502f5bd4..d773b9dc48a0ad889ec9f9142ba0547afbd2bee4 100644 (file)
@@ -807,6 +807,7 @@ static const struct acpi_device_id norfkill_ids[] __initconst = {
        { "IBM0068", 0},
        { "LEN0068", 0},
        { "SNY5001", 0},        /* sony-laptop in charge */
+       { "HPQ6601", 0},
        { "", 0},
 };
 
index 1ef02daddb60b81aeba3e627d0c7da5870d5248b..460fa6708bfccbc6774284fc859a877d750d02b3 100644 (file)
@@ -346,8 +346,7 @@ static void acerhdf_check_param(struct thermal_zone_device *thermal)
  * as late as the polling interval is since we can't do that in the respective
  * accessors of the module parameters.
  */
-static int acerhdf_get_ec_temp(struct thermal_zone_device *thermal,
-                              unsigned long *t)
+static int acerhdf_get_ec_temp(struct thermal_zone_device *thermal, int *t)
 {
        int temp, err = 0;
 
@@ -453,7 +452,7 @@ static int acerhdf_get_trip_type(struct thermal_zone_device *thermal, int trip,
 }
 
 static int acerhdf_get_trip_hyst(struct thermal_zone_device *thermal, int trip,
-                                unsigned long *temp)
+                                int *temp)
 {
        if (trip != 0)
                return -EINVAL;
@@ -464,7 +463,7 @@ static int acerhdf_get_trip_hyst(struct thermal_zone_device *thermal, int trip,
 }
 
 static int acerhdf_get_trip_temp(struct thermal_zone_device *thermal, int trip,
-                                unsigned long *temp)
+                                int *temp)
 {
        if (trip == 0)
                *temp = fanon;
@@ -477,7 +476,7 @@ static int acerhdf_get_trip_temp(struct thermal_zone_device *thermal, int trip,
 }
 
 static int acerhdf_get_crit_temp(struct thermal_zone_device *thermal,
-                                unsigned long *temperature)
+                                int *temperature)
 {
        *temperature = ACERHDF_TEMP_CRIT;
        return 0;
index 58d29c4f2840c4974ab204aa63f15cf3ad961a56..f2b5d0a8adf03a2bb5ac521aa62fa2046c06f438 100644 (file)
@@ -332,6 +332,7 @@ static const struct key_entry asus_keymap[] = {
        {KE_KEY, 0x65, { KEY_SWITCHVIDEOMODE } }, /* SDSP LCD + TV */
        {KE_KEY, 0x66, { KEY_SWITCHVIDEOMODE } }, /* SDSP CRT + TV */
        {KE_KEY, 0x67, { KEY_SWITCHVIDEOMODE } }, /* SDSP LCD + CRT + TV */
+       {KE_KEY, 0x6A, { KEY_TOUCHPAD_TOGGLE } }, /* Lock Touchpad Fn + F9 */
        {KE_KEY, 0x6B, { KEY_TOUCHPAD_TOGGLE } }, /* Lock Touchpad */
        {KE_KEY, 0x6C, { KEY_SLEEP } }, /* Suspend */
        {KE_KEY, 0x6D, { KEY_SLEEP } }, /* Hibernate */
index abdaed34c7285116ffb573102880d55fafdfa8c1..131fee2b093eadde86ba6c0b0fc53be458150e82 100644 (file)
@@ -126,6 +126,24 @@ static const struct dmi_system_id asus_quirks[] = {
                },
                .driver_data = &quirk_asus_wapf4,
        },
+       {
+               .callback = dmi_matched,
+               .ident = "ASUSTeK COMPUTER INC. X456UA",
+               .matches = {
+                       DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "X456UA"),
+               },
+               .driver_data = &quirk_asus_wapf4,
+       },
+       {
+               .callback = dmi_matched,
+               .ident = "ASUSTeK COMPUTER INC. X456UF",
+               .matches = {
+                       DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "X456UF"),
+               },
+               .driver_data = &quirk_asus_wapf4,
+       },
        {
                .callback = dmi_matched,
                .ident = "ASUSTeK COMPUTER INC. X501U",
index 4e4cc8bd7557d75f4c7aa76ffd2ed734ad54699a..988eedbd7c636e97a9708fc3b72d07ec2162e6cb 100644 (file)
@@ -114,14 +114,9 @@ static int __init hpwl_init(void)
 
        pr_info("Initializing HPQ6001 module\n");
        err = acpi_bus_register_driver(&hpwl_driver);
-       if (err) {
+       if (err)
                pr_err("Unable to register HP wireless control driver.\n");
-               goto error_acpi_register;
-       }
-
-       return 0;
 
-error_acpi_register:
        return err;
 }
 
index 06697315a0887f6493c0f55185b864c12f8bc76e..fb4dd7b3ee711f9ba9b42b4384331c1df9e32fc0 100644 (file)
@@ -54,8 +54,9 @@ MODULE_ALIAS("wmi:5FB7F034-2C63-45e9-BE91-3D44E2C707E4");
 #define HPWMI_HARDWARE_QUERY 0x4
 #define HPWMI_WIRELESS_QUERY 0x5
 #define HPWMI_BIOS_QUERY 0x9
+#define HPWMI_FEATURE_QUERY 0xb
 #define HPWMI_HOTKEY_QUERY 0xc
-#define HPWMI_FEATURE_QUERY 0xd
+#define HPWMI_FEATURE2_QUERY 0xd
 #define HPWMI_WIRELESS2_QUERY 0x1b
 #define HPWMI_POSTCODEERROR_QUERY 0x2a
 
@@ -295,25 +296,33 @@ static int hp_wmi_tablet_state(void)
        return (state & 0x4) ? 1 : 0;
 }
 
-static int __init hp_wmi_bios_2009_later(void)
+static int __init hp_wmi_bios_2008_later(void)
 {
        int state = 0;
        int ret = hp_wmi_perform_query(HPWMI_FEATURE_QUERY, 0, &state,
                                       sizeof(state), sizeof(state));
-       if (ret)
-               return ret;
+       if (!ret)
+               return 1;
 
-       return (state & 0x10) ? 1 : 0;
+       return (ret == HPWMI_RET_UNKNOWN_CMDTYPE) ? 0 : -ENXIO;
 }
 
-static int hp_wmi_enable_hotkeys(void)
+static int __init hp_wmi_bios_2009_later(void)
 {
-       int ret;
-       int query = 0x6e;
+       int state = 0;
+       int ret = hp_wmi_perform_query(HPWMI_FEATURE2_QUERY, 0, &state,
+                                      sizeof(state), sizeof(state));
+       if (!ret)
+               return 1;
 
-       ret = hp_wmi_perform_query(HPWMI_BIOS_QUERY, 1, &query, sizeof(query),
-                                  0);
+       return (ret == HPWMI_RET_UNKNOWN_CMDTYPE) ? 0 : -ENXIO;
+}
 
+static int __init hp_wmi_enable_hotkeys(void)
+{
+       int value = 0x6e;
+       int ret = hp_wmi_perform_query(HPWMI_BIOS_QUERY, 1, &value,
+                                      sizeof(value), 0);
        if (ret)
                return -EINVAL;
        return 0;
@@ -663,7 +672,7 @@ static int __init hp_wmi_input_setup(void)
                            hp_wmi_tablet_state());
        input_sync(hp_wmi_input_dev);
 
-       if (hp_wmi_bios_2009_later() == 4)
+       if (!hp_wmi_bios_2009_later() && hp_wmi_bios_2008_later())
                hp_wmi_enable_hotkeys();
 
        status = wmi_install_notify_handler(HPWMI_EVENT_GUID, hp_wmi_notify, NULL);
index 76b57388d01b5b73838bd3ab73741b5c096148d7..fce49f3c6ed688f676aea9be4647cc63d3c56416 100644 (file)
@@ -852,6 +852,20 @@ static const struct dmi_system_id no_hw_rfkill_list[] = {
                        DMI_MATCH(DMI_PRODUCT_VERSION, "Lenovo Yoga 2"),
                },
        },
+       {
+               .ident = "Lenovo Yoga 3 14",
+               .matches = {
+                       DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+                       DMI_MATCH(DMI_PRODUCT_VERSION, "Lenovo Yoga 3 14"),
+               },
+       },
+       {
+               .ident = "Lenovo Yoga 2 11 / 13 / Pro",
+               .matches = {
+                       DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+                       DMI_MATCH(DMI_BOARD_NAME, "Yoga2"),
+               },
+       },
        {
                .ident = "Lenovo Yoga 3 Pro 1370",
                .matches = {
index 0944e834af8d25b848524247f6a5231935d4177b..9f713b832ba3ce8810d6bfcb2c1f7b697d0b26b7 100644 (file)
@@ -132,7 +132,7 @@ static int is_valid_adc(uint16_t adc_val, uint16_t min, uint16_t max)
  * to achieve very close approximate temp value with less than
  * 0.5C error
  */
-static int adc_to_temp(int direct, uint16_t adc_val, unsigned long *tp)
+static int adc_to_temp(int direct, uint16_t adc_val, int *tp)
 {
        int temp;
 
@@ -174,14 +174,13 @@ static int adc_to_temp(int direct, uint16_t adc_val, unsigned long *tp)
  *
  * Can sleep
  */
-static int mid_read_temp(struct thermal_zone_device *tzd, unsigned long *temp)
+static int mid_read_temp(struct thermal_zone_device *tzd, int *temp)
 {
        struct thermal_device_info *td_info = tzd->devdata;
        uint16_t adc_val, addr;
        uint8_t data = 0;
        int ret;
-       unsigned long curr_temp;
-
+       int curr_temp;
 
        addr = td_info->chnl_addr;
 
@@ -453,7 +452,7 @@ static SIMPLE_DEV_PM_OPS(mid_thermal_pm,
  *
  * Can sleep
  */
-static int read_curr_temp(struct thermal_zone_device *tzd, unsigned long *temp)
+static int read_curr_temp(struct thermal_zone_device *tzd, int *temp)
 {
        WARN_ON(tzd == NULL);
        return mid_read_temp(tzd, temp);
diff --git a/drivers/platform/x86/surfacepro3_button.c b/drivers/platform/x86/surfacepro3_button.c
new file mode 100644 (file)
index 0000000..f7dade3
--- /dev/null
@@ -0,0 +1,216 @@
+/*
+ * power/home/volume button support for
+ * Microsoft Surface Pro 3 tablet.
+ *
+ * Copyright (c) 2015 Intel Corporation.
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/input.h>
+#include <linux/acpi.h>
+#include <acpi/button.h>
+
+#define SURFACE_BUTTON_HID             "MSHW0028"
+#define SURFACE_BUTTON_OBJ_NAME                "VGBI"
+#define SURFACE_BUTTON_DEVICE_NAME     "Surface Pro 3 Buttons"
+
+#define SURFACE_BUTTON_NOTIFY_PRESS_POWER      0xc6
+#define SURFACE_BUTTON_NOTIFY_RELEASE_POWER    0xc7
+
+#define SURFACE_BUTTON_NOTIFY_PRESS_HOME       0xc4
+#define SURFACE_BUTTON_NOTIFY_RELEASE_HOME     0xc5
+
+#define SURFACE_BUTTON_NOTIFY_PRESS_VOLUME_UP  0xc0
+#define SURFACE_BUTTON_NOTIFY_RELEASE_VOLUME_UP        0xc1
+
+#define SURFACE_BUTTON_NOTIFY_PRESS_VOLUME_DOWN        0xc2
+#define SURFACE_BUTTON_NOTIFY_RELEASE_VOLUME_DOWN      0xc3
+
+ACPI_MODULE_NAME("surface pro 3 button");
+
+MODULE_AUTHOR("Chen Yu");
+MODULE_DESCRIPTION("Surface Pro3 Button Driver");
+MODULE_LICENSE("GPL v2");
+
+/*
+ * Support for the power, home and volume buttons is supposed to be
+ * covered by drivers/input/misc/soc_button_array.c, which is implemented
+ * according to the "Windows ACPI Design Guide for SoC Platforms".
+ * However, the Surface Pro 3 does not appear to follow that spec;
+ * instead it dispatches these events through the VGBI (MSHW0028) device.
+ * An acpi_driver is used rather than a platform_driver/i2c_driver
+ * because, although VGBI has an I2C resource connected to an I2C
+ * controller, it is not enumerated within any I2C controller's scope,
+ * so neither a platform_device nor an i2c_client would ever be created.
+ */
+static const struct acpi_device_id surface_button_device_ids[] = {
+       {SURFACE_BUTTON_HID,    0},
+       {"", 0},
+};
+MODULE_DEVICE_TABLE(acpi, surface_button_device_ids);
+
+struct surface_button {
+       unsigned int type;
+       struct input_dev *input;
+       char phys[32];                  /* for input device */
+       unsigned long pushed;
+       bool suspended;
+};
+
+static void surface_button_notify(struct acpi_device *device, u32 event)
+{
+       struct surface_button *button = acpi_driver_data(device);
+       struct input_dev *input;
+       int key_code = KEY_RESERVED;
+       bool pressed = false;
+
+       switch (event) {
+       /* Power button press/release handling */
+       case SURFACE_BUTTON_NOTIFY_PRESS_POWER:
+               pressed = true;
+               /*fall through*/
+       case SURFACE_BUTTON_NOTIFY_RELEASE_POWER:
+               key_code = KEY_POWER;
+               break;
+       /* Home button press/release handling */
+       case SURFACE_BUTTON_NOTIFY_PRESS_HOME:
+               pressed = true;
+               /*fall through*/
+       case SURFACE_BUTTON_NOTIFY_RELEASE_HOME:
+               key_code = KEY_LEFTMETA;
+               break;
+       /* Volume up button press/release handling */
+       case SURFACE_BUTTON_NOTIFY_PRESS_VOLUME_UP:
+               pressed = true;
+               /*fall through*/
+       case SURFACE_BUTTON_NOTIFY_RELEASE_VOLUME_UP:
+               key_code = KEY_VOLUMEUP;
+               break;
+       /* Volume down button press/release handling */
+       case SURFACE_BUTTON_NOTIFY_PRESS_VOLUME_DOWN:
+               pressed = true;
+               /*fall through*/
+       case SURFACE_BUTTON_NOTIFY_RELEASE_VOLUME_DOWN:
+               key_code = KEY_VOLUMEDOWN;
+               break;
+       default:
+               dev_info_ratelimited(&device->dev,
+                                 "Unsupported event [0x%x]\n", event);
+               break;
+       }
+       input = button->input;
+       if (KEY_RESERVED == key_code)
+               return;
+       if (pressed)
+               pm_wakeup_event(&device->dev, 0);
+       if (button->suspended)
+               return;
+       input_report_key(input, key_code, pressed?1:0);
+       input_sync(input);
+}
+
+#ifdef CONFIG_PM_SLEEP
+static int surface_button_suspend(struct device *dev)
+{
+       struct acpi_device *device = to_acpi_device(dev);
+       struct surface_button *button = acpi_driver_data(device);
+
+       button->suspended = true;
+       return 0;
+}
+
+static int surface_button_resume(struct device *dev)
+{
+       struct acpi_device *device = to_acpi_device(dev);
+       struct surface_button *button = acpi_driver_data(device);
+
+       button->suspended = false;
+       return 0;
+}
+#endif
+
+static int surface_button_add(struct acpi_device *device)
+{
+       struct surface_button *button;
+       struct input_dev *input;
+       const char *hid = acpi_device_hid(device);
+       char *name;
+       int error;
+
+       if (strncmp(acpi_device_bid(device), SURFACE_BUTTON_OBJ_NAME,
+           strlen(SURFACE_BUTTON_OBJ_NAME)))
+               return -ENODEV;
+
+       button = kzalloc(sizeof(struct surface_button), GFP_KERNEL);
+       if (!button)
+               return -ENOMEM;
+
+       device->driver_data = button;
+       button->input = input = input_allocate_device();
+       if (!input) {
+               error = -ENOMEM;
+               goto err_free_button;
+       }
+
+       name = acpi_device_name(device);
+       strcpy(name, SURFACE_BUTTON_DEVICE_NAME);
+       snprintf(button->phys, sizeof(button->phys), "%s/buttons", hid);
+
+       input->name = name;
+       input->phys = button->phys;
+       input->id.bustype = BUS_HOST;
+       input->dev.parent = &device->dev;
+       input_set_capability(input, EV_KEY, KEY_POWER);
+       input_set_capability(input, EV_KEY, KEY_LEFTMETA);
+       input_set_capability(input, EV_KEY, KEY_VOLUMEUP);
+       input_set_capability(input, EV_KEY, KEY_VOLUMEDOWN);
+
+       error = input_register_device(input);
+       if (error)
+               goto err_free_input;
+       dev_info(&device->dev,
+                       "%s [%s]\n", name, acpi_device_bid(device));
+       return 0;
+
+ err_free_input:
+       input_free_device(input);
+ err_free_button:
+       kfree(button);
+       return error;
+}
+
+static int surface_button_remove(struct acpi_device *device)
+{
+       struct surface_button *button = acpi_driver_data(device);
+
+       input_unregister_device(button->input);
+       kfree(button);
+       return 0;
+}
+
+static SIMPLE_DEV_PM_OPS(surface_button_pm,
+               surface_button_suspend, surface_button_resume);
+
+static struct acpi_driver surface_button_driver = {
+       .name = "surface_pro3_button",
+       .class = "SurfacePro3",
+       .ids = surface_button_device_ids,
+       .ops = {
+               .add = surface_button_add,
+               .remove = surface_button_remove,
+               .notify = surface_button_notify,
+       },
+       .drv.pm = &surface_button_pm,
+};
+
+module_acpi_driver(surface_button_driver);
index 33e488cf5569861391fc40844bcc6ce95b305c3c..131dd74641833e69e144f6a22d1d4021d09ee1c9 100644 (file)
@@ -402,7 +402,7 @@ static const char *str_supported(int is_supported);
 #else
 static inline const char *str_supported(int is_supported) { return ""; }
 #define vdbg_printk(a_dbg_level, format, arg...)       \
-       no_printk(format, ##arg)
+       do { if (0) no_printk(format, ##arg); } while (0)
 #endif
 
 static void tpacpi_log_usertask(const char * const what)
index 3ad7b1fa24ce5459900c4b6c966b631c4b6ddc46..f2372f400ddbb406927efc71c843c05a0222c4c4 100644 (file)
@@ -31,7 +31,7 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
-#define TOSHIBA_ACPI_VERSION   "0.22"
+#define TOSHIBA_ACPI_VERSION   "0.23"
 #define PROC_INTERFACE_VERSION 1
 
 #include <linux/kernel.h>
@@ -50,6 +50,8 @@
 #include <linux/acpi.h>
 #include <linux/dmi.h>
 #include <linux/uaccess.h>
+#include <linux/miscdevice.h>
+#include <linux/toshiba.h>
 #include <acpi/video.h>
 
 MODULE_AUTHOR("John Belmonte");
@@ -91,6 +93,7 @@ MODULE_LICENSE("GPL");
 
 /* Return codes */
 #define TOS_SUCCESS                    0x0000
+#define TOS_SUCCESS2                   0x0001
 #define TOS_OPEN_CLOSE_OK              0x0044
 #define TOS_FAILURE                    0x1000
 #define TOS_NOT_SUPPORTED              0x8000
@@ -111,7 +114,6 @@ MODULE_LICENSE("GPL");
 #define HCI_VIDEO_OUT                  0x001c
 #define HCI_HOTKEY_EVENT               0x001e
 #define HCI_LCD_BRIGHTNESS             0x002a
-#define HCI_WIRELESS                   0x0056
 #define HCI_ACCELEROMETER              0x006d
 #define HCI_KBD_ILLUMINATION           0x0095
 #define HCI_ECO_MODE                   0x0097
@@ -140,10 +142,6 @@ MODULE_LICENSE("GPL");
 #define HCI_VIDEO_OUT_LCD              0x1
 #define HCI_VIDEO_OUT_CRT              0x2
 #define HCI_VIDEO_OUT_TV               0x4
-#define HCI_WIRELESS_KILL_SWITCH       0x01
-#define HCI_WIRELESS_BT_PRESENT                0x0f
-#define HCI_WIRELESS_BT_ATTACH         0x40
-#define HCI_WIRELESS_BT_POWER          0x80
 #define SCI_KBD_MODE_MASK              0x1f
 #define SCI_KBD_MODE_FNZ               0x1
 #define SCI_KBD_MODE_AUTO              0x2
@@ -170,6 +168,7 @@ struct toshiba_acpi_dev {
        struct led_classdev led_dev;
        struct led_classdev kbd_led;
        struct led_classdev eco_led;
+       struct miscdevice miscdev;
 
        int force_fan;
        int last_key_event;
@@ -189,7 +188,6 @@ struct toshiba_acpi_dev {
        unsigned int info_supported:1;
        unsigned int tr_backlight_supported:1;
        unsigned int kbd_illum_supported:1;
-       unsigned int kbd_led_registered:1;
        unsigned int touchpad_supported:1;
        unsigned int eco_supported:1;
        unsigned int accelerometer_supported:1;
@@ -200,6 +198,10 @@ struct toshiba_acpi_dev {
        unsigned int panel_power_on_supported:1;
        unsigned int usb_three_supported:1;
        unsigned int sysfs_created:1;
+
+       bool kbd_led_registered;
+       bool illumination_led_registered;
+       bool eco_led_registered;
 };
 
 static struct toshiba_acpi_dev *toshiba_acpi;
@@ -248,16 +250,16 @@ static const struct key_entry toshiba_acpi_keymap[] = {
 };
 
 static const struct key_entry toshiba_acpi_alt_keymap[] = {
-       { KE_KEY, 0x157, { KEY_MUTE } },
        { KE_KEY, 0x102, { KEY_ZOOMOUT } },
        { KE_KEY, 0x103, { KEY_ZOOMIN } },
        { KE_KEY, 0x12c, { KEY_KBDILLUMTOGGLE } },
        { KE_KEY, 0x139, { KEY_ZOOMRESET } },
-       { KE_KEY, 0x13e, { KEY_SWITCHVIDEOMODE } },
        { KE_KEY, 0x13c, { KEY_BRIGHTNESSDOWN } },
        { KE_KEY, 0x13d, { KEY_BRIGHTNESSUP } },
-       { KE_KEY, 0x158, { KEY_WLAN } },
+       { KE_KEY, 0x13e, { KEY_SWITCHVIDEOMODE } },
        { KE_KEY, 0x13f, { KEY_TOUCHPAD_TOGGLE } },
+       { KE_KEY, 0x157, { KEY_MUTE } },
+       { KE_KEY, 0x158, { KEY_WLAN } },
        { KE_END, 0 },
 };
 
@@ -441,26 +443,24 @@ static u32 sci_write(struct toshiba_acpi_dev *dev, u32 reg, u32 in1)
 }
 
 /* Illumination support */
-static int toshiba_illumination_available(struct toshiba_acpi_dev *dev)
+static void toshiba_illumination_available(struct toshiba_acpi_dev *dev)
 {
        u32 in[TCI_WORDS] = { SCI_GET, SCI_ILLUMINATION, 0, 0, 0, 0 };
        u32 out[TCI_WORDS];
        acpi_status status;
 
+       dev->illumination_supported = 0;
+       dev->illumination_led_registered = false;
+
        if (!sci_open(dev))
-               return 0;
+               return;
 
        status = tci_raw(dev, in, out);
        sci_close(dev);
-       if (ACPI_FAILURE(status)) {
+       if (ACPI_FAILURE(status))
                pr_err("ACPI call to query Illumination support failed\n");
-               return 0;
-       } else if (out[0] == TOS_NOT_SUPPORTED) {
-               pr_info("Illumination device not available\n");
-               return 0;
-       }
-
-       return 1;
+       else if (out[0] == TOS_SUCCESS)
+               dev->illumination_supported = 1;
 }
 
 static void toshiba_illumination_set(struct led_classdev *cdev,
@@ -468,7 +468,8 @@ static void toshiba_illumination_set(struct led_classdev *cdev,
 {
        struct toshiba_acpi_dev *dev = container_of(cdev,
                        struct toshiba_acpi_dev, led_dev);
-       u32 state, result;
+       u32 result;
+       u32 state;
 
        /* First request : initialize communication. */
        if (!sci_open(dev))
@@ -478,13 +479,8 @@ static void toshiba_illumination_set(struct led_classdev *cdev,
        state = brightness ? 1 : 0;
        result = sci_write(dev, SCI_ILLUMINATION, state);
        sci_close(dev);
-       if (result == TOS_FAILURE) {
+       if (result == TOS_FAILURE)
                pr_err("ACPI call for illumination failed\n");
-               return;
-       } else if (result == TOS_NOT_SUPPORTED) {
-               pr_info("Illumination not supported\n");
-               return;
-       }
 }
 
 static enum led_brightness toshiba_illumination_get(struct led_classdev *cdev)
@@ -500,11 +496,10 @@ static enum led_brightness toshiba_illumination_get(struct led_classdev *cdev)
        /* Check the illumination */
        result = sci_read(dev, SCI_ILLUMINATION, &state);
        sci_close(dev);
-       if (result == TOS_FAILURE || result == TOS_INPUT_DATA_ERROR) {
+       if (result == TOS_FAILURE) {
                pr_err("ACPI call for illumination failed\n");
                return LED_OFF;
-       } else if (result == TOS_NOT_SUPPORTED) {
-               pr_info("Illumination not supported\n");
+       } else if (result != TOS_SUCCESS) {
                return LED_OFF;
        }
 
@@ -512,41 +507,40 @@ static enum led_brightness toshiba_illumination_get(struct led_classdev *cdev)
 }
 
 /* KBD Illumination */
-static int toshiba_kbd_illum_available(struct toshiba_acpi_dev *dev)
+static void toshiba_kbd_illum_available(struct toshiba_acpi_dev *dev)
 {
        u32 in[TCI_WORDS] = { SCI_GET, SCI_KBD_ILLUM_STATUS, 0, 0, 0, 0 };
        u32 out[TCI_WORDS];
        acpi_status status;
 
+       dev->kbd_illum_supported = 0;
+       dev->kbd_led_registered = false;
+
        if (!sci_open(dev))
-               return 0;
+               return;
 
        status = tci_raw(dev, in, out);
        sci_close(dev);
-       if (ACPI_FAILURE(status) || out[0] == TOS_INPUT_DATA_ERROR) {
+       if (ACPI_FAILURE(status)) {
                pr_err("ACPI call to query kbd illumination support failed\n");
-               return 0;
-       } else if (out[0] == TOS_NOT_SUPPORTED) {
-               pr_info("Keyboard illumination not available\n");
-               return 0;
+       } else if (out[0] == TOS_SUCCESS) {
+               /*
+                * Check the keyboard backlight timeout max value:
+                * the previous kbd backlight implementation set it to
+                * 0x3c0003, while the new implementation sets it to
+                * 0x3c001a; use this to distinguish between them.
+                */
+               if (out[3] == SCI_KBD_TIME_MAX)
+                       dev->kbd_type = 2;
+               else
+                       dev->kbd_type = 1;
+               /* Get the current keyboard backlight mode */
+               dev->kbd_mode = out[2] & SCI_KBD_MODE_MASK;
+               /* Get the current time (1-60 seconds) */
+               dev->kbd_time = out[2] >> HCI_MISC_SHIFT;
+               /* Flag as supported */
+               dev->kbd_illum_supported = 1;
        }
-
-       /*
-        * Check for keyboard backlight timeout max value,
-        * previous kbd backlight implementation set this to
-        * 0x3c0003, and now the new implementation set this
-        * to 0x3c001a, use this to distinguish between them.
-        */
-       if (out[3] == SCI_KBD_TIME_MAX)
-               dev->kbd_type = 2;
-       else
-               dev->kbd_type = 1;
-       /* Get the current keyboard backlight mode */
-       dev->kbd_mode = out[2] & SCI_KBD_MODE_MASK;
-       /* Get the current time (1-60 seconds) */
-       dev->kbd_time = out[2] >> HCI_MISC_SHIFT;
-
-       return 1;
 }
 
 static int toshiba_kbd_illum_status_set(struct toshiba_acpi_dev *dev, u32 time)
@@ -558,15 +552,12 @@ static int toshiba_kbd_illum_status_set(struct toshiba_acpi_dev *dev, u32 time)
 
        result = sci_write(dev, SCI_KBD_ILLUM_STATUS, time);
        sci_close(dev);
-       if (result == TOS_FAILURE || result == TOS_INPUT_DATA_ERROR) {
+       if (result == TOS_FAILURE)
                pr_err("ACPI call to set KBD backlight status failed\n");
-               return -EIO;
-       } else if (result == TOS_NOT_SUPPORTED) {
-               pr_info("Keyboard backlight status not supported\n");
+       else if (result == TOS_NOT_SUPPORTED)
                return -ENODEV;
-       }
 
-       return 0;
+       return result == TOS_SUCCESS ? 0 : -EIO;
 }
 
 static int toshiba_kbd_illum_status_get(struct toshiba_acpi_dev *dev, u32 *time)
@@ -578,30 +569,27 @@ static int toshiba_kbd_illum_status_get(struct toshiba_acpi_dev *dev, u32 *time)
 
        result = sci_read(dev, SCI_KBD_ILLUM_STATUS, time);
        sci_close(dev);
-       if (result == TOS_FAILURE || result == TOS_INPUT_DATA_ERROR) {
+       if (result == TOS_FAILURE)
                pr_err("ACPI call to get KBD backlight status failed\n");
-               return -EIO;
-       } else if (result == TOS_NOT_SUPPORTED) {
-               pr_info("Keyboard backlight status not supported\n");
+       else if (result == TOS_NOT_SUPPORTED)
                return -ENODEV;
-       }
 
-       return 0;
+       return result == TOS_SUCCESS ? 0 : -EIO;
 }
 
 static enum led_brightness toshiba_kbd_backlight_get(struct led_classdev *cdev)
 {
        struct toshiba_acpi_dev *dev = container_of(cdev,
                        struct toshiba_acpi_dev, kbd_led);
-       u32 state, result;
+       u32 result;
+       u32 state;
 
        /* Check the keyboard backlight state */
        result = hci_read(dev, HCI_KBD_ILLUMINATION, &state);
-       if (result == TOS_FAILURE || result == TOS_INPUT_DATA_ERROR) {
+       if (result == TOS_FAILURE) {
                pr_err("ACPI call to get the keyboard backlight failed\n");
                return LED_OFF;
-       } else if (result == TOS_NOT_SUPPORTED) {
-               pr_info("Keyboard backlight not supported\n");
+       } else if (result != TOS_SUCCESS) {
                return LED_OFF;
        }
 
@@ -613,18 +601,14 @@ static void toshiba_kbd_backlight_set(struct led_classdev *cdev,
 {
        struct toshiba_acpi_dev *dev = container_of(cdev,
                        struct toshiba_acpi_dev, kbd_led);
-       u32 state, result;
+       u32 result;
+       u32 state;
 
        /* Set the keyboard backlight state */
        state = brightness ? 1 : 0;
        result = hci_write(dev, HCI_KBD_ILLUMINATION, state);
-       if (result == TOS_FAILURE || result == TOS_INPUT_DATA_ERROR) {
+       if (result == TOS_FAILURE)
                pr_err("ACPI call to set KBD Illumination mode failed\n");
-               return;
-       } else if (result == TOS_NOT_SUPPORTED) {
-               pr_info("Keyboard backlight not supported\n");
-               return;
-       }
 }
 
 /* TouchPad support */
@@ -637,14 +621,12 @@ static int toshiba_touchpad_set(struct toshiba_acpi_dev *dev, u32 state)
 
        result = sci_write(dev, SCI_TOUCHPAD, state);
        sci_close(dev);
-       if (result == TOS_FAILURE) {
+       if (result == TOS_FAILURE)
                pr_err("ACPI call to set the touchpad failed\n");
-               return -EIO;
-       } else if (result == TOS_NOT_SUPPORTED) {
+       else if (result == TOS_NOT_SUPPORTED)
                return -ENODEV;
-       }
 
-       return 0;
+       return result == TOS_SUCCESS ? 0 : -EIO;
 }
 
 static int toshiba_touchpad_get(struct toshiba_acpi_dev *dev, u32 *state)
@@ -656,28 +638,27 @@ static int toshiba_touchpad_get(struct toshiba_acpi_dev *dev, u32 *state)
 
        result = sci_read(dev, SCI_TOUCHPAD, state);
        sci_close(dev);
-       if (result == TOS_FAILURE) {
+       if (result == TOS_FAILURE)
                pr_err("ACPI call to query the touchpad failed\n");
-               return -EIO;
-       } else if (result == TOS_NOT_SUPPORTED) {
+       else if (result == TOS_NOT_SUPPORTED)
                return -ENODEV;
-       }
 
-       return 0;
+       return result == TOS_SUCCESS ? 0 : -EIO;
 }
 
 /* Eco Mode support */
-static int toshiba_eco_mode_available(struct toshiba_acpi_dev *dev)
+static void toshiba_eco_mode_available(struct toshiba_acpi_dev *dev)
 {
        acpi_status status;
        u32 in[TCI_WORDS] = { HCI_GET, HCI_ECO_MODE, 0, 0, 0, 0 };
        u32 out[TCI_WORDS];
 
+       dev->eco_supported = 0;
+       dev->eco_led_registered = false;
+
        status = tci_raw(dev, in, out);
        if (ACPI_FAILURE(status)) {
                pr_err("ACPI call to get ECO led failed\n");
-       } else if (out[0] == TOS_NOT_INSTALLED) {
-               pr_info("ECO led not installed");
        } else if (out[0] == TOS_INPUT_DATA_ERROR) {
                /*
                 * If we receive 0x8300 (Input Data Error), it means that the
@@ -690,13 +671,11 @@ static int toshiba_eco_mode_available(struct toshiba_acpi_dev *dev)
                 */
                in[3] = 1;
                status = tci_raw(dev, in, out);
-               if (ACPI_FAILURE(status) || out[0] == TOS_FAILURE)
+               if (ACPI_FAILURE(status))
                        pr_err("ACPI call to get ECO led failed\n");
                else if (out[0] == TOS_SUCCESS)
-                       return 1;
+                       dev->eco_supported = 1;
        }
-
-       return 0;
 }
 
 static enum led_brightness
@@ -709,9 +688,11 @@ toshiba_eco_mode_get_status(struct led_classdev *cdev)
        acpi_status status;
 
        status = tci_raw(dev, in, out);
-       if (ACPI_FAILURE(status) || out[0] == TOS_INPUT_DATA_ERROR) {
+       if (ACPI_FAILURE(status)) {
                pr_err("ACPI call to get ECO led failed\n");
                return LED_OFF;
+       } else if (out[0] != TOS_SUCCESS) {
+               return LED_OFF;
        }
 
        return out[2] ? LED_FULL : LED_OFF;
@@ -729,41 +710,32 @@ static void toshiba_eco_mode_set_status(struct led_classdev *cdev,
        /* Switch the Eco Mode led on/off */
        in[2] = (brightness) ? 1 : 0;
        status = tci_raw(dev, in, out);
-       if (ACPI_FAILURE(status) || out[0] == TOS_INPUT_DATA_ERROR) {
+       if (ACPI_FAILURE(status))
                pr_err("ACPI call to set ECO led failed\n");
-               return;
-       }
 }
 
 /* Accelerometer support */
-static int toshiba_accelerometer_supported(struct toshiba_acpi_dev *dev)
+static void toshiba_accelerometer_available(struct toshiba_acpi_dev *dev)
 {
        u32 in[TCI_WORDS] = { HCI_GET, HCI_ACCELEROMETER2, 0, 0, 0, 0 };
        u32 out[TCI_WORDS];
        acpi_status status;
 
+       dev->accelerometer_supported = 0;
+
        /*
         * Check if the accelerometer call exists,
         * this call also serves as initialization
         */
        status = tci_raw(dev, in, out);
-       if (ACPI_FAILURE(status) || out[0] == TOS_INPUT_DATA_ERROR) {
+       if (ACPI_FAILURE(status))
                pr_err("ACPI call to query the accelerometer failed\n");
-               return -EIO;
-       } else if (out[0] == TOS_DATA_NOT_AVAILABLE ||
-                  out[0] == TOS_NOT_INITIALIZED) {
-               pr_err("Accelerometer not initialized\n");
-               return -EIO;
-       } else if (out[0] == TOS_NOT_SUPPORTED) {
-               pr_info("Accelerometer not supported\n");
-               return -ENODEV;
-       }
-
-       return 0;
+       else if (out[0] == TOS_SUCCESS)
+               dev->accelerometer_supported = 1;
 }
 
 static int toshiba_accelerometer_get(struct toshiba_acpi_dev *dev,
-                                     u32 *xy, u32 *z)
+                                    u32 *xy, u32 *z)
 {
        u32 in[TCI_WORDS] = { HCI_GET, HCI_ACCELEROMETER, 0, 1, 0, 0 };
        u32 out[TCI_WORDS];
@@ -771,15 +743,18 @@ static int toshiba_accelerometer_get(struct toshiba_acpi_dev *dev,
 
        /* Check the Accelerometer status */
        status = tci_raw(dev, in, out);
-       if (ACPI_FAILURE(status) || out[0] == TOS_INPUT_DATA_ERROR) {
+       if (ACPI_FAILURE(status)) {
                pr_err("ACPI call to query the accelerometer failed\n");
                return -EIO;
+       } else if (out[0] == TOS_NOT_SUPPORTED) {
+               return -ENODEV;
+       } else if (out[0] == TOS_SUCCESS) {
+               *xy = out[2];
+               *z = out[4];
+               return 0;
        }
 
-       *xy = out[2];
-       *z = out[4];
-
-       return 0;
+       return -EIO;
 }
 
 /* Sleep (Charge and Music) utilities support */
@@ -789,7 +764,6 @@ static void toshiba_usb_sleep_charge_available(struct toshiba_acpi_dev *dev)
        u32 out[TCI_WORDS];
        acpi_status status;
 
-       /* Set the feature to "not supported" in case of error */
        dev->usb_sleep_charge_supported = 0;
 
        if (!sci_open(dev))
@@ -801,7 +775,6 @@ static void toshiba_usb_sleep_charge_available(struct toshiba_acpi_dev *dev)
                sci_close(dev);
                return;
        } else if (out[0] == TOS_NOT_SUPPORTED) {
-               pr_info("USB Sleep and Charge not supported\n");
                sci_close(dev);
                return;
        } else if (out[0] == TOS_SUCCESS) {
@@ -810,25 +783,15 @@ static void toshiba_usb_sleep_charge_available(struct toshiba_acpi_dev *dev)
 
        in[5] = SCI_USB_CHARGE_BAT_LVL;
        status = tci_raw(dev, in, out);
+       sci_close(dev);
        if (ACPI_FAILURE(status)) {
                pr_err("ACPI call to get USB Sleep and Charge mode failed\n");
-               sci_close(dev);
-               return;
-       } else if (out[0] == TOS_NOT_SUPPORTED) {
-               pr_info("USB Sleep and Charge not supported\n");
-               sci_close(dev);
-               return;
        } else if (out[0] == TOS_SUCCESS) {
                dev->usbsc_bat_level = out[2];
-               /*
-                * If we reach this point, it means that the laptop has support
-                * for this feature and all values are initialized.
-                * Set it as supported.
-                */
+               /* Flag as supported */
                dev->usb_sleep_charge_supported = 1;
        }
 
-       sci_close(dev);
 }
 
 static int toshiba_usb_sleep_charge_get(struct toshiba_acpi_dev *dev,
@@ -841,17 +804,12 @@ static int toshiba_usb_sleep_charge_get(struct toshiba_acpi_dev *dev,
 
        result = sci_read(dev, SCI_USB_SLEEP_CHARGE, mode);
        sci_close(dev);
-       if (result == TOS_FAILURE) {
+       if (result == TOS_FAILURE)
                pr_err("ACPI call to set USB S&C mode failed\n");
-               return -EIO;
-       } else if (result == TOS_NOT_SUPPORTED) {
-               pr_info("USB Sleep and Charge not supported\n");
+       else if (result == TOS_NOT_SUPPORTED)
                return -ENODEV;
-       } else if (result == TOS_INPUT_DATA_ERROR) {
-               return -EIO;
-       }
 
-       return 0;
+       return result == TOS_SUCCESS ? 0 : -EIO;
 }
 
 static int toshiba_usb_sleep_charge_set(struct toshiba_acpi_dev *dev,
@@ -864,17 +822,12 @@ static int toshiba_usb_sleep_charge_set(struct toshiba_acpi_dev *dev,
 
        result = sci_write(dev, SCI_USB_SLEEP_CHARGE, mode);
        sci_close(dev);
-       if (result == TOS_FAILURE) {
+       if (result == TOS_FAILURE)
                pr_err("ACPI call to set USB S&C mode failed\n");
-               return -EIO;
-       } else if (result == TOS_NOT_SUPPORTED) {
-               pr_info("USB Sleep and Charge not supported\n");
+       else if (result == TOS_NOT_SUPPORTED)
                return -ENODEV;
-       } else if (result == TOS_INPUT_DATA_ERROR) {
-               return -EIO;
-       }
 
-       return 0;
+       return result == TOS_SUCCESS ? 0 : -EIO;
 }
 
 static int toshiba_sleep_functions_status_get(struct toshiba_acpi_dev *dev,
@@ -892,17 +845,14 @@ static int toshiba_sleep_functions_status_get(struct toshiba_acpi_dev *dev,
        sci_close(dev);
        if (ACPI_FAILURE(status)) {
                pr_err("ACPI call to get USB S&C battery level failed\n");
-               return -EIO;
        } else if (out[0] == TOS_NOT_SUPPORTED) {
-               pr_info("USB Sleep and Charge not supported\n");
                return -ENODEV;
-       } else if (out[0] == TOS_INPUT_DATA_ERROR) {
-               return -EIO;
+       } else if (out[0] == TOS_SUCCESS) {
+               *mode = out[2];
+               return 0;
        }
 
-       *mode = out[2];
-
-       return 0;
+       return -EIO;
 }
 
 static int toshiba_sleep_functions_status_set(struct toshiba_acpi_dev *dev,
@@ -919,17 +869,12 @@ static int toshiba_sleep_functions_status_set(struct toshiba_acpi_dev *dev,
        in[5] = SCI_USB_CHARGE_BAT_LVL;
        status = tci_raw(dev, in, out);
        sci_close(dev);
-       if (ACPI_FAILURE(status)) {
+       if (ACPI_FAILURE(status))
                pr_err("ACPI call to set USB S&C battery level failed\n");
-               return -EIO;
-       } else if (out[0] == TOS_NOT_SUPPORTED) {
-               pr_info("USB Sleep and Charge not supported\n");
+       else if (out[0] == TOS_NOT_SUPPORTED)
                return -ENODEV;
-       } else if (out[0] == TOS_INPUT_DATA_ERROR) {
-               return -EIO;
-       }
 
-       return 0;
+       return out[0] == TOS_SUCCESS ? 0 : -EIO;
 }
 
 static int toshiba_usb_rapid_charge_get(struct toshiba_acpi_dev *dev,
@@ -947,16 +892,14 @@ static int toshiba_usb_rapid_charge_get(struct toshiba_acpi_dev *dev,
        sci_close(dev);
        if (ACPI_FAILURE(status)) {
                pr_err("ACPI call to get USB Rapid Charge failed\n");
-               return -EIO;
-       } else if (out[0] == TOS_NOT_SUPPORTED ||
-                  out[0] == TOS_INPUT_DATA_ERROR) {
-               pr_info("USB Rapid Charge not supported\n");
+       } else if (out[0] == TOS_NOT_SUPPORTED) {
                return -ENODEV;
+       } else if (out[0] == TOS_SUCCESS || out[0] == TOS_SUCCESS2) {
+               *state = out[2];
+               return 0;
        }
 
-       *state = out[2];
-
-       return 0;
+       return -EIO;
 }
 
 static int toshiba_usb_rapid_charge_set(struct toshiba_acpi_dev *dev,
@@ -973,17 +916,12 @@ static int toshiba_usb_rapid_charge_set(struct toshiba_acpi_dev *dev,
        in[5] = SCI_USB_CHARGE_RAPID_DSP;
        status = tci_raw(dev, in, out);
        sci_close(dev);
-       if (ACPI_FAILURE(status)) {
+       if (ACPI_FAILURE(status))
                pr_err("ACPI call to set USB Rapid Charge failed\n");
-               return -EIO;
-       } else if (out[0] == TOS_NOT_SUPPORTED) {
-               pr_info("USB Rapid Charge not supported\n");
+       else if (out[0] == TOS_NOT_SUPPORTED)
                return -ENODEV;
-       } else if (out[0] == TOS_INPUT_DATA_ERROR) {
-               return -EIO;
-       }
 
-       return 0;
+       return (out[0] == TOS_SUCCESS || out[0] == TOS_SUCCESS2) ? 0 : -EIO;
 }
 
 static int toshiba_usb_sleep_music_get(struct toshiba_acpi_dev *dev, u32 *state)
@@ -995,17 +933,12 @@ static int toshiba_usb_sleep_music_get(struct toshiba_acpi_dev *dev, u32 *state)
 
        result = sci_read(dev, SCI_USB_SLEEP_MUSIC, state);
        sci_close(dev);
-       if (result == TOS_FAILURE) {
+       if (result == TOS_FAILURE)
                pr_err("ACPI call to get Sleep and Music failed\n");
-               return -EIO;
-       } else if (result == TOS_NOT_SUPPORTED) {
-               pr_info("Sleep and Music not supported\n");
+       else if (result == TOS_NOT_SUPPORTED)
                return -ENODEV;
-       } else if (result == TOS_INPUT_DATA_ERROR) {
-               return -EIO;
-       }
 
-       return 0;
+       return result == TOS_SUCCESS ? 0 : -EIO;
 }
 
 static int toshiba_usb_sleep_music_set(struct toshiba_acpi_dev *dev, u32 state)
@@ -1017,17 +950,12 @@ static int toshiba_usb_sleep_music_set(struct toshiba_acpi_dev *dev, u32 state)
 
        result = sci_write(dev, SCI_USB_SLEEP_MUSIC, state);
        sci_close(dev);
-       if (result == TOS_FAILURE) {
+       if (result == TOS_FAILURE)
                pr_err("ACPI call to set Sleep and Music failed\n");
-               return -EIO;
-       } else if (result == TOS_NOT_SUPPORTED) {
-               pr_info("Sleep and Music not supported\n");
+       else if (result == TOS_NOT_SUPPORTED)
                return -ENODEV;
-       } else if (result == TOS_INPUT_DATA_ERROR) {
-               return -EIO;
-       }
 
-       return 0;
+       return result == TOS_SUCCESS ? 0 : -EIO;
 }
 
 /* Keyboard function keys */
@@ -1040,15 +968,12 @@ static int toshiba_function_keys_get(struct toshiba_acpi_dev *dev, u32 *mode)
 
        result = sci_read(dev, SCI_KBD_FUNCTION_KEYS, mode);
        sci_close(dev);
-       if (result == TOS_FAILURE || result == TOS_INPUT_DATA_ERROR) {
+       if (result == TOS_FAILURE)
                pr_err("ACPI call to get KBD function keys failed\n");
-               return -EIO;
-       } else if (result == TOS_NOT_SUPPORTED) {
-               pr_info("KBD function keys not supported\n");
+       else if (result == TOS_NOT_SUPPORTED)
                return -ENODEV;
-       }
 
-       return 0;
+       return (result == TOS_SUCCESS || result == TOS_SUCCESS2) ? 0 : -EIO;
 }
 
 static int toshiba_function_keys_set(struct toshiba_acpi_dev *dev, u32 mode)
@@ -1060,15 +985,12 @@ static int toshiba_function_keys_set(struct toshiba_acpi_dev *dev, u32 mode)
 
        result = sci_write(dev, SCI_KBD_FUNCTION_KEYS, mode);
        sci_close(dev);
-       if (result == TOS_FAILURE || result == TOS_INPUT_DATA_ERROR) {
+       if (result == TOS_FAILURE)
                pr_err("ACPI call to set KBD function keys failed\n");
-               return -EIO;
-       } else if (result == TOS_NOT_SUPPORTED) {
-               pr_info("KBD function keys not supported\n");
+       else if (result == TOS_NOT_SUPPORTED)
                return -ENODEV;
-       }
 
-       return 0;
+       return (result == TOS_SUCCESS || result == TOS_SUCCESS2) ? 0 : -EIO;
 }
 
 /* Panel Power ON */
@@ -1081,17 +1003,12 @@ static int toshiba_panel_power_on_get(struct toshiba_acpi_dev *dev, u32 *state)
 
        result = sci_read(dev, SCI_PANEL_POWER_ON, state);
        sci_close(dev);
-       if (result == TOS_FAILURE) {
+       if (result == TOS_FAILURE)
                pr_err("ACPI call to get Panel Power ON failed\n");
-               return -EIO;
-       } else if (result == TOS_NOT_SUPPORTED) {
-               pr_info("Panel Power on not supported\n");
+       else if (result == TOS_NOT_SUPPORTED)
                return -ENODEV;
-       } else if (result == TOS_INPUT_DATA_ERROR) {
-               return -EIO;
-       }
 
-       return 0;
+       return result == TOS_SUCCESS ? 0 : -EIO;
 }
 
 static int toshiba_panel_power_on_set(struct toshiba_acpi_dev *dev, u32 state)
@@ -1103,17 +1020,12 @@ static int toshiba_panel_power_on_set(struct toshiba_acpi_dev *dev, u32 state)
 
        result = sci_write(dev, SCI_PANEL_POWER_ON, state);
        sci_close(dev);
-       if (result == TOS_FAILURE) {
+       if (result == TOS_FAILURE)
                pr_err("ACPI call to set Panel Power ON failed\n");
-               return -EIO;
-       } else if (result == TOS_NOT_SUPPORTED) {
-               pr_info("Panel Power ON not supported\n");
+       else if (result == TOS_NOT_SUPPORTED)
                return -ENODEV;
-       } else if (result == TOS_INPUT_DATA_ERROR) {
-               return -EIO;
-       }
 
-       return 0;
+       return result == TOS_SUCCESS ? 0 : -EIO;
 }
 
 /* USB Three */
@@ -1126,17 +1038,12 @@ static int toshiba_usb_three_get(struct toshiba_acpi_dev *dev, u32 *state)
 
        result = sci_read(dev, SCI_USB_THREE, state);
        sci_close(dev);
-       if (result == TOS_FAILURE) {
+       if (result == TOS_FAILURE)
                pr_err("ACPI call to get USB 3 failed\n");
-               return -EIO;
-       } else if (result == TOS_NOT_SUPPORTED) {
-               pr_info("USB 3 not supported\n");
+       else if (result == TOS_NOT_SUPPORTED)
                return -ENODEV;
-       } else if (result == TOS_INPUT_DATA_ERROR) {
-               return -EIO;
-       }
 
-       return 0;
+       return (result == TOS_SUCCESS || result == TOS_SUCCESS2) ? 0 : -EIO;
 }
 
 static int toshiba_usb_three_set(struct toshiba_acpi_dev *dev, u32 state)
@@ -1148,17 +1055,12 @@ static int toshiba_usb_three_set(struct toshiba_acpi_dev *dev, u32 state)
 
        result = sci_write(dev, SCI_USB_THREE, state);
        sci_close(dev);
-       if (result == TOS_FAILURE) {
+       if (result == TOS_FAILURE)
                pr_err("ACPI call to set USB 3 failed\n");
-               return -EIO;
-       } else if (result == TOS_NOT_SUPPORTED) {
-               pr_info("USB 3 not supported\n");
+       else if (result == TOS_NOT_SUPPORTED)
                return -ENODEV;
-       } else if (result == TOS_INPUT_DATA_ERROR) {
-               return -EIO;
-       }
 
-       return 0;
+       return (result == TOS_SUCCESS || result == TOS_SUCCESS2) ? 0 : -EIO;
 }
 
 /* Hotkey Event type */
@@ -1172,35 +1074,39 @@ static int toshiba_hotkey_event_type_get(struct toshiba_acpi_dev *dev,
        status = tci_raw(dev, in, out);
        if (ACPI_FAILURE(status)) {
                pr_err("ACPI call to get System type failed\n");
-               return -EIO;
        } else if (out[0] == TOS_NOT_SUPPORTED) {
-               pr_info("System type not supported\n");
                return -ENODEV;
+       } else if (out[0] == TOS_SUCCESS) {
+               *type = out[3];
+               return 0;
        }
 
-       *type = out[3];
-
-       return 0;
+       return -EIO;
 }
 
 /* Transflective Backlight */
-static int get_tr_backlight_status(struct toshiba_acpi_dev *dev, bool *enabled)
+static int get_tr_backlight_status(struct toshiba_acpi_dev *dev, u32 *status)
 {
-       u32 hci_result;
-       u32 status;
+       u32 result = hci_read(dev, HCI_TR_BACKLIGHT, status);
+
+       if (result == TOS_FAILURE)
+               pr_err("ACPI call to get Transflective Backlight failed\n");
+       else if (result == TOS_NOT_SUPPORTED)
+               return -ENODEV;
 
-       hci_result = hci_read(dev, HCI_TR_BACKLIGHT, &status);
-       *enabled = !status;
-       return hci_result == TOS_SUCCESS ? 0 : -EIO;
+       return result == TOS_SUCCESS ? 0 : -EIO;
 }
 
-static int set_tr_backlight_status(struct toshiba_acpi_dev *dev, bool enable)
+static int set_tr_backlight_status(struct toshiba_acpi_dev *dev, u32 status)
 {
-       u32 hci_result;
-       u32 value = !enable;
+       u32 result = hci_write(dev, HCI_TR_BACKLIGHT, !status);
+
+       if (result == TOS_FAILURE)
+               pr_err("ACPI call to set Transflective Backlight failed\n");
+       else if (result == TOS_NOT_SUPPORTED)
+               return -ENODEV;
 
-       hci_result = hci_write(dev, HCI_TR_BACKLIGHT, value);
-       return hci_result == TOS_SUCCESS ? 0 : -EIO;
+       return result == TOS_SUCCESS ? 0 : -EIO;
 }
 
 static struct proc_dir_entry *toshiba_proc_dir;
@@ -1208,23 +1114,26 @@ static struct proc_dir_entry *toshiba_proc_dir;
 /* LCD Brightness */
 static int __get_lcd_brightness(struct toshiba_acpi_dev *dev)
 {
-       u32 hci_result;
+       u32 result;
        u32 value;
        int brightness = 0;
 
        if (dev->tr_backlight_supported) {
-               bool enabled;
-               int ret = get_tr_backlight_status(dev, &enabled);
+               int ret = get_tr_backlight_status(dev, &value);
 
                if (ret)
                        return ret;
-               if (enabled)
+               if (value)
                        return 0;
                brightness++;
        }
 
-       hci_result = hci_read(dev, HCI_LCD_BRIGHTNESS, &value);
-       if (hci_result == TOS_SUCCESS)
+       result = hci_read(dev, HCI_LCD_BRIGHTNESS, &value);
+       if (result == TOS_FAILURE)
+               pr_err("ACPI call to get LCD Brightness failed\n");
+       else if (result == TOS_NOT_SUPPORTED)
+               return -ENODEV;
+       if (result == TOS_SUCCESS)
                return brightness + (value >> HCI_LCD_BRIGHTNESS_SHIFT);
 
        return -EIO;
@@ -1240,8 +1149,8 @@ static int get_lcd_brightness(struct backlight_device *bd)
 static int lcd_proc_show(struct seq_file *m, void *v)
 {
        struct toshiba_acpi_dev *dev = m->private;
-       int value;
        int levels;
+       int value;
 
        if (!dev->backlight_dev)
                return -ENODEV;
@@ -1255,6 +1164,7 @@ static int lcd_proc_show(struct seq_file *m, void *v)
        }
 
        pr_err("Error reading LCD brightness\n");
+
        return -EIO;
 }
 
@@ -1265,11 +1175,10 @@ static int lcd_proc_open(struct inode *inode, struct file *file)
 
 static int set_lcd_brightness(struct toshiba_acpi_dev *dev, int value)
 {
-       u32 hci_result;
+       u32 result;
 
        if (dev->tr_backlight_supported) {
-               bool enable = !value;
-               int ret = set_tr_backlight_status(dev, enable);
+               int ret = set_tr_backlight_status(dev, !value);
 
                if (ret)
                        return ret;
@@ -1278,8 +1187,13 @@ static int set_lcd_brightness(struct toshiba_acpi_dev *dev, int value)
        }
 
        value = value << HCI_LCD_BRIGHTNESS_SHIFT;
-       hci_result = hci_write(dev, HCI_LCD_BRIGHTNESS, value);
-       return hci_result == TOS_SUCCESS ? 0 : -EIO;
+       result = hci_write(dev, HCI_LCD_BRIGHTNESS, value);
+       if (result == TOS_FAILURE)
+               pr_err("ACPI call to set LCD Brightness failed\n");
+       else if (result == TOS_NOT_SUPPORTED)
+               return -ENODEV;
+
+       return result == TOS_SUCCESS ? 0 : -EIO;
 }
 
 static int set_lcd_status(struct backlight_device *bd)
@@ -1295,24 +1209,22 @@ static ssize_t lcd_proc_write(struct file *file, const char __user *buf,
        struct toshiba_acpi_dev *dev = PDE_DATA(file_inode(file));
        char cmd[42];
        size_t len;
-       int value;
-       int ret;
        int levels = dev->backlight_dev->props.max_brightness + 1;
+       int value;
 
        len = min(count, sizeof(cmd) - 1);
        if (copy_from_user(cmd, buf, len))
                return -EFAULT;
        cmd[len] = '\0';
 
-       if (sscanf(cmd, " brightness : %i", &value) == 1 &&
-           value >= 0 && value < levels) {
-               ret = set_lcd_brightness(dev, value);
-               if (ret == 0)
-                       ret = count;
-       } else {
-               ret = -EINVAL;
-       }
-       return ret;
+       if (sscanf(cmd, " brightness : %i", &value) != 1 &&
+           value < 0 && value > levels)
+               return -EINVAL;
+
+       if (set_lcd_brightness(dev, value))
+               return -EIO;
+
+       return count;
 }
 
 static const struct file_operations lcd_proc_fops = {
@@ -1324,22 +1236,25 @@ static const struct file_operations lcd_proc_fops = {
        .write          = lcd_proc_write,
 };
 
+/* Video-Out */
 static int get_video_status(struct toshiba_acpi_dev *dev, u32 *status)
 {
-       u32 hci_result;
+       u32 result = hci_read(dev, HCI_VIDEO_OUT, status);
 
-       hci_result = hci_read(dev, HCI_VIDEO_OUT, status);
-       return hci_result == TOS_SUCCESS ? 0 : -EIO;
+       if (result == TOS_FAILURE)
+               pr_err("ACPI call to get Video-Out failed\n");
+       else if (result == TOS_NOT_SUPPORTED)
+               return -ENODEV;
+
+       return result == TOS_SUCCESS ? 0 : -EIO;
 }
 
 static int video_proc_show(struct seq_file *m, void *v)
 {
        struct toshiba_acpi_dev *dev = m->private;
        u32 value;
-       int ret;
 
-       ret = get_video_status(dev, &value);
-       if (!ret) {
+       if (!get_video_status(dev, &value)) {
                int is_lcd = (value & HCI_VIDEO_OUT_LCD) ? 1 : 0;
                int is_crt = (value & HCI_VIDEO_OUT_CRT) ? 1 : 0;
                int is_tv = (value & HCI_VIDEO_OUT_TV) ? 1 : 0;
@@ -1347,9 +1262,10 @@ static int video_proc_show(struct seq_file *m, void *v)
                seq_printf(m, "lcd_out:                 %d\n", is_lcd);
                seq_printf(m, "crt_out:                 %d\n", is_crt);
                seq_printf(m, "tv_out:                  %d\n", is_tv);
+               return 0;
        }
 
-       return ret;
+       return -EIO;
 }
 
 static int video_proc_open(struct inode *inode, struct file *file)
@@ -1361,13 +1277,14 @@ static ssize_t video_proc_write(struct file *file, const char __user *buf,
                                size_t count, loff_t *pos)
 {
        struct toshiba_acpi_dev *dev = PDE_DATA(file_inode(file));
-       char *cmd, *buffer;
-       int ret;
-       int value;
+       char *buffer;
+       char *cmd;
        int remain = count;
        int lcd_out = -1;
        int crt_out = -1;
        int tv_out = -1;
+       int value;
+       int ret;
        u32 video_out;
 
        cmd = kmalloc(count + 1, GFP_KERNEL);
@@ -1419,7 +1336,7 @@ static ssize_t video_proc_write(struct file *file, const char __user *buf,
                        ret = write_acpi_int(METHOD_VIDEO_OUT, new_video_out);
        }
 
-       return ret ? ret : count;
+       return ret ? -EIO : count;
 }
 
 static const struct file_operations video_proc_fops = {
@@ -1431,27 +1348,43 @@ static const struct file_operations video_proc_fops = {
        .write          = video_proc_write,
 };
 
+/* Fan status */
 static int get_fan_status(struct toshiba_acpi_dev *dev, u32 *status)
 {
-       u32 hci_result;
+       u32 result = hci_read(dev, HCI_FAN, status);
+
+       if (result == TOS_FAILURE)
+               pr_err("ACPI call to get Fan status failed\n");
+       else if (result == TOS_NOT_SUPPORTED)
+               return -ENODEV;
+
+       return result == TOS_SUCCESS ? 0 : -EIO;
+}
+
+static int set_fan_status(struct toshiba_acpi_dev *dev, u32 status)
+{
+       u32 result = hci_write(dev, HCI_FAN, status);
+
+       if (result == TOS_FAILURE)
+               pr_err("ACPI call to set Fan status failed\n");
+       else if (result == TOS_NOT_SUPPORTED)
+               return -ENODEV;
 
-       hci_result = hci_read(dev, HCI_FAN, status);
-       return hci_result == TOS_SUCCESS ? 0 : -EIO;
+       return result == TOS_SUCCESS ? 0 : -EIO;
 }
 
 static int fan_proc_show(struct seq_file *m, void *v)
 {
        struct toshiba_acpi_dev *dev = m->private;
-       int ret;
        u32 value;
 
-       ret = get_fan_status(dev, &value);
-       if (!ret) {
-               seq_printf(m, "running:                 %d\n", (value > 0));
-               seq_printf(m, "force_on:                %d\n", dev->force_fan);
-       }
+       if (get_fan_status(dev, &value))
+               return -EIO;
 
-       return ret;
+       seq_printf(m, "running:                 %d\n", (value > 0));
+       seq_printf(m, "force_on:                %d\n", dev->force_fan);
+
+       return 0;
 }
 
 static int fan_proc_open(struct inode *inode, struct file *file)
@@ -1466,23 +1399,20 @@ static ssize_t fan_proc_write(struct file *file, const char __user *buf,
        char cmd[42];
        size_t len;
        int value;
-       u32 hci_result;
 
        len = min(count, sizeof(cmd) - 1);
        if (copy_from_user(cmd, buf, len))
                return -EFAULT;
        cmd[len] = '\0';
 
-       if (sscanf(cmd, " force_on : %i", &value) == 1 &&
-           value >= 0 && value <= 1) {
-               hci_result = hci_write(dev, HCI_FAN, value);
-               if (hci_result == TOS_SUCCESS)
-                       dev->force_fan = value;
-               else
-                       return -EIO;
-       } else {
+       if (sscanf(cmd, " force_on : %i", &value) != 1 &&
+           value != 0 && value != 1)
                return -EINVAL;
-       }
+
+       if (set_fan_status(dev, value))
+               return -EIO;
+
+       dev->force_fan = value;
 
        return count;
 }
@@ -1499,32 +1429,10 @@ static const struct file_operations fan_proc_fops = {
 static int keys_proc_show(struct seq_file *m, void *v)
 {
        struct toshiba_acpi_dev *dev = m->private;
-       u32 hci_result;
-       u32 value;
-
-       if (!dev->key_event_valid && dev->system_event_supported) {
-               hci_result = hci_read(dev, HCI_SYSTEM_EVENT, &value);
-               if (hci_result == TOS_SUCCESS) {
-                       dev->key_event_valid = 1;
-                       dev->last_key_event = value;
-               } else if (hci_result == TOS_FIFO_EMPTY) {
-                       /* Better luck next time */
-               } else if (hci_result == TOS_NOT_SUPPORTED) {
-                       /*
-                        * This is a workaround for an unresolved issue on
-                        * some machines where system events sporadically
-                        * become disabled.
-                        */
-                       hci_result = hci_write(dev, HCI_SYSTEM_EVENT, 1);
-                       pr_notice("Re-enabled hotkeys\n");
-               } else {
-                       pr_err("Error reading hotkey status\n");
-                       return -EIO;
-               }
-       }
 
        seq_printf(m, "hotkey_ready:            %d\n", dev->key_event_valid);
        seq_printf(m, "hotkey:                  0x%04x\n", dev->last_key_event);
+
        return 0;
 }
 
@@ -1641,7 +1549,6 @@ static ssize_t fan_store(struct device *dev,
                         const char *buf, size_t count)
 {
        struct toshiba_acpi_dev *toshiba = dev_get_drvdata(dev);
-       u32 result;
        int state;
        int ret;
 
@@ -1652,11 +1559,9 @@ static ssize_t fan_store(struct device *dev,
        if (state != 0 && state != 1)
                return -EINVAL;
 
-       result = hci_write(toshiba, HCI_FAN, state);
-       if (result == TOS_FAILURE)
-               return -EIO;
-       else if (result == TOS_NOT_SUPPORTED)
-               return -ENODEV;
+       ret = set_fan_status(toshiba, state);
+       if (ret)
+               return ret;
 
        return count;
 }
@@ -1682,7 +1587,6 @@ static ssize_t kbd_backlight_mode_store(struct device *dev,
 {
        struct toshiba_acpi_dev *toshiba = dev_get_drvdata(dev);
        int mode;
-       int time;
        int ret;
 
 
@@ -1713,7 +1617,7 @@ static ssize_t kbd_backlight_mode_store(struct device *dev,
        /* Only make a change if the actual mode has changed */
        if (toshiba->kbd_mode != mode) {
                /* Shift the time to "base time" (0x3c0000 == 60 seconds) */
-               time = toshiba->kbd_time << HCI_MISC_SHIFT;
+               int time = toshiba->kbd_time << HCI_MISC_SHIFT;
 
                /* OR the "base time" to the actual method format */
                if (toshiba->kbd_type == 1) {
@@ -2261,6 +2165,81 @@ static struct attribute_group toshiba_attr_group = {
        .attrs = toshiba_attributes,
 };
 
+/*
+ * Misc device
+ */
+static int toshiba_acpi_smm_bridge(SMMRegisters *regs)
+{
+       u32 in[TCI_WORDS] = { regs->eax, regs->ebx, regs->ecx,
+                             regs->edx, regs->esi, regs->edi };
+       u32 out[TCI_WORDS];
+       acpi_status status;
+
+       status = tci_raw(toshiba_acpi, in, out);
+       if (ACPI_FAILURE(status)) {
+               pr_err("ACPI call to query SMM registers failed\n");
+               return -EIO;
+       }
+
+       /* Fill out the SMM struct with the TCI call results */
+       regs->eax = out[0];
+       regs->ebx = out[1];
+       regs->ecx = out[2];
+       regs->edx = out[3];
+       regs->esi = out[4];
+       regs->edi = out[5];
+
+       return 0;
+}
+
+static long toshiba_acpi_ioctl(struct file *fp, unsigned int cmd,
+                              unsigned long arg)
+{
+       SMMRegisters __user *argp = (SMMRegisters __user *)arg;
+       SMMRegisters regs;
+       int ret;
+
+       if (!argp)
+               return -EINVAL;
+
+       switch (cmd) {
+       case TOSH_SMM:
+               if (copy_from_user(&regs, argp, sizeof(SMMRegisters)))
+                       return -EFAULT;
+               ret = toshiba_acpi_smm_bridge(&regs);
+               if (ret)
+                       return ret;
+               if (copy_to_user(argp, &regs, sizeof(SMMRegisters)))
+                       return -EFAULT;
+               break;
+       case TOSHIBA_ACPI_SCI:
+               if (copy_from_user(&regs, argp, sizeof(SMMRegisters)))
+                       return -EFAULT;
+               /* Ensure we are being called with a SCI_{GET, SET} register */
+               if (regs.eax != SCI_GET && regs.eax != SCI_SET)
+                       return -EINVAL;
+               if (!sci_open(toshiba_acpi))
+                       return -EIO;
+               ret = toshiba_acpi_smm_bridge(&regs);
+               sci_close(toshiba_acpi);
+               if (ret)
+                       return ret;
+               if (copy_to_user(argp, &regs, sizeof(SMMRegisters)))
+                       return -EFAULT;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+static const struct file_operations toshiba_acpi_fops = {
+       .owner          = THIS_MODULE,
+       .unlocked_ioctl = toshiba_acpi_ioctl,
+       .llseek         = noop_llseek,
+};
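The misc device above exposes the TCI transport to userspace as /dev/toshiba_acpi. A minimal userspace sketch, assuming the SMMRegisters layout and TOSH_SMM ioctl from <linux/toshiba.h>; the eax value below is a placeholder, not a documented HCI query:

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <unistd.h>
	#include <linux/toshiba.h>

	int main(void)
	{
		SMMRegisters regs;
		int fd = open("/dev/toshiba_acpi", O_RDWR);

		if (fd < 0) {
			perror("open");
			return 1;
		}

		memset(&regs, 0, sizeof(regs));
		regs.eax = 0xfe00;	/* placeholder "HCI get" word */

		if (ioctl(fd, TOSH_SMM, &regs) < 0)
			perror("TOSH_SMM");
		else
			printf("eax=0x%x ebx=0x%x ecx=0x%x\n",
			       regs.eax, regs.ebx, regs.ecx);

		close(fd);
		return 0;
	}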
+
 /*
  * Hotkeys
  */
@@ -2361,22 +2340,28 @@ static void toshiba_acpi_report_hotkey(struct toshiba_acpi_dev *dev,
 
 static void toshiba_acpi_process_hotkeys(struct toshiba_acpi_dev *dev)
 {
-       u32 hci_result, value;
-       int retries = 3;
-       int scancode;
-
        if (dev->info_supported) {
-               scancode = toshiba_acpi_query_hotkey(dev);
-               if (scancode < 0)
+               int scancode = toshiba_acpi_query_hotkey(dev);
+
+               if (scancode < 0) {
                        pr_err("Failed to query hotkey event\n");
-               else if (scancode != 0)
+               } else if (scancode != 0) {
                        toshiba_acpi_report_hotkey(dev, scancode);
+                       dev->key_event_valid = 1;
+                       dev->last_key_event = scancode;
+               }
        } else if (dev->system_event_supported) {
+               u32 result;
+               u32 value;
+               int retries = 3;
+
                do {
-                       hci_result = hci_read(dev, HCI_SYSTEM_EVENT, &value);
-                       switch (hci_result) {
+                       result = hci_read(dev, HCI_SYSTEM_EVENT, &value);
+                       switch (result) {
                        case TOS_SUCCESS:
                                toshiba_acpi_report_hotkey(dev, (int)value);
+                               dev->key_event_valid = 1;
+                               dev->last_key_event = value;
                                break;
                        case TOS_NOT_SUPPORTED:
                                /*
@@ -2384,15 +2369,15 @@ static void toshiba_acpi_process_hotkeys(struct toshiba_acpi_dev *dev)
                                 * issue on some machines where system events
                                 * sporadically become disabled.
                                 */
-                               hci_result =
-                                       hci_write(dev, HCI_SYSTEM_EVENT, 1);
-                               pr_notice("Re-enabled hotkeys\n");
+                               result = hci_write(dev, HCI_SYSTEM_EVENT, 1);
+                               if (result == TOS_SUCCESS)
+                                       pr_notice("Re-enabled hotkeys\n");
                                /* Fall through */
                        default:
                                retries--;
                                break;
                        }
-               } while (retries && hci_result != TOS_FIFO_EMPTY);
+               } while (retries && result != TOS_FIFO_EMPTY);
        }
 }
 
@@ -2404,15 +2389,18 @@ static int toshiba_acpi_setup_keyboard(struct toshiba_acpi_dev *dev)
        u32 hci_result;
        int error;
 
+       if (wmi_has_guid(TOSHIBA_WMI_EVENT_GUID)) {
+               pr_info("WMI event detected, hotkeys will not be monitored\n");
+               return 0;
+       }
+
        error = toshiba_acpi_enable_hotkeys(dev);
        if (error)
                return error;
 
-       error = toshiba_hotkey_event_type_get(dev, &events_type);
-       if (error) {
-               pr_err("Unable to query Hotkey Event Type\n");
-               return error;
-       }
+       if (toshiba_hotkey_event_type_get(dev, &events_type))
+               pr_notice("Unable to query Hotkey Event Type\n");
+
        dev->hotkey_event_type = events_type;
 
        dev->hotkey_dev = input_allocate_device();
@@ -2496,7 +2484,6 @@ static int toshiba_acpi_setup_backlight(struct toshiba_acpi_dev *dev)
        struct backlight_properties props;
        int brightness;
        int ret;
-       bool enabled;
 
        /*
         * Some machines don't support the backlight methods at all, and
@@ -2513,10 +2500,6 @@ static int toshiba_acpi_setup_backlight(struct toshiba_acpi_dev *dev)
                return 0;
        }
 
-       /* Determine whether or not BIOS supports transflective backlight */
-       ret = get_tr_backlight_status(dev, &enabled);
-       dev->tr_backlight_supported = !ret;
-
        /*
         * Tell acpi-video-detect code to prefer vendor backlight on all
         * systems with transflective backlight and on dmi matched systems.
@@ -2552,10 +2535,52 @@ static int toshiba_acpi_setup_backlight(struct toshiba_acpi_dev *dev)
        return 0;
 }
 
+static void print_supported_features(struct toshiba_acpi_dev *dev)
+{
+       pr_info("Supported laptop features:");
+
+       if (dev->hotkey_dev)
+               pr_cont(" hotkeys");
+       if (dev->backlight_dev)
+               pr_cont(" backlight");
+       if (dev->video_supported)
+               pr_cont(" video-out");
+       if (dev->fan_supported)
+               pr_cont(" fan");
+       if (dev->tr_backlight_supported)
+               pr_cont(" transflective-backlight");
+       if (dev->illumination_supported)
+               pr_cont(" illumination");
+       if (dev->kbd_illum_supported)
+               pr_cont(" keyboard-backlight");
+       if (dev->touchpad_supported)
+               pr_cont(" touchpad");
+       if (dev->eco_supported)
+               pr_cont(" eco-led");
+       if (dev->accelerometer_supported)
+               pr_cont(" accelerometer-axes");
+       if (dev->usb_sleep_charge_supported)
+               pr_cont(" usb-sleep-charge");
+       if (dev->usb_rapid_charge_supported)
+               pr_cont(" usb-rapid-charge");
+       if (dev->usb_sleep_music_supported)
+               pr_cont(" usb-sleep-music");
+       if (dev->kbd_function_keys_supported)
+               pr_cont(" special-function-keys");
+       if (dev->panel_power_on_supported)
+               pr_cont(" panel-power-on");
+       if (dev->usb_three_supported)
+               pr_cont(" usb3");
+
+       pr_cont("\n");
+}
+
 static int toshiba_acpi_remove(struct acpi_device *acpi_dev)
 {
        struct toshiba_acpi_dev *dev = acpi_driver_data(acpi_dev);
 
+       misc_deregister(&dev->miscdev);
+
        remove_toshiba_proc_entries(dev);
 
        if (dev->sysfs_created)
@@ -2574,13 +2599,13 @@ static int toshiba_acpi_remove(struct acpi_device *acpi_dev)
 
        backlight_device_unregister(dev->backlight_dev);
 
-       if (dev->illumination_supported)
+       if (dev->illumination_led_registered)
                led_classdev_unregister(&dev->led_dev);
 
        if (dev->kbd_led_registered)
                led_classdev_unregister(&dev->kbd_led);
 
-       if (dev->eco_supported)
+       if (dev->eco_led_registered)
                led_classdev_unregister(&dev->eco_led);
 
        if (toshiba_acpi)
@@ -2627,6 +2652,17 @@ static int toshiba_acpi_add(struct acpi_device *acpi_dev)
                return -ENOMEM;
        dev->acpi_dev = acpi_dev;
        dev->method_hci = hci_method;
+       dev->miscdev.minor = MISC_DYNAMIC_MINOR;
+       dev->miscdev.name = "toshiba_acpi";
+       dev->miscdev.fops = &toshiba_acpi_fops;
+
+       ret = misc_register(&dev->miscdev);
+       if (ret) {
+               pr_err("Failed to register miscdevice\n");
+               kfree(dev);
+               return ret;
+       }
+
        acpi_dev->driver_data = dev;
        dev_set_drvdata(&acpi_dev->dev, dev);
 
@@ -2643,29 +2679,35 @@ static int toshiba_acpi_add(struct acpi_device *acpi_dev)
        if (toshiba_acpi_setup_keyboard(dev))
                pr_info("Unable to activate hotkeys\n");
 
+       /* Determine whether or not BIOS supports transflective backlight */
+       ret = get_tr_backlight_status(dev, &dummy);
+       dev->tr_backlight_supported = !ret;
+
        ret = toshiba_acpi_setup_backlight(dev);
        if (ret)
                goto error;
 
-       if (toshiba_illumination_available(dev)) {
+       toshiba_illumination_available(dev);
+       if (dev->illumination_supported) {
                dev->led_dev.name = "toshiba::illumination";
                dev->led_dev.max_brightness = 1;
                dev->led_dev.brightness_set = toshiba_illumination_set;
                dev->led_dev.brightness_get = toshiba_illumination_get;
                if (!led_classdev_register(&acpi_dev->dev, &dev->led_dev))
-                       dev->illumination_supported = 1;
+                       dev->illumination_led_registered = true;
        }
 
-       if (toshiba_eco_mode_available(dev)) {
+       toshiba_eco_mode_available(dev);
+       if (dev->eco_supported) {
                dev->eco_led.name = "toshiba::eco_mode";
                dev->eco_led.max_brightness = 1;
                dev->eco_led.brightness_set = toshiba_eco_mode_set_status;
                dev->eco_led.brightness_get = toshiba_eco_mode_get_status;
                if (!led_classdev_register(&dev->acpi_dev->dev, &dev->eco_led))
-                       dev->eco_supported = 1;
+                       dev->eco_led_registered = true;
        }
 
-       dev->kbd_illum_supported = toshiba_kbd_illum_available(dev);
+       toshiba_kbd_illum_available(dev);
        /*
         * Only register the LED if KBD illumination is supported
         * and the keyboard backlight operation mode is set to FN-Z
@@ -2676,14 +2718,13 @@ static int toshiba_acpi_add(struct acpi_device *acpi_dev)
                dev->kbd_led.brightness_set = toshiba_kbd_backlight_set;
                dev->kbd_led.brightness_get = toshiba_kbd_backlight_get;
                if (!led_classdev_register(&dev->acpi_dev->dev, &dev->kbd_led))
-                       dev->kbd_led_registered = 1;
+                       dev->kbd_led_registered = true;
        }
 
        ret = toshiba_touchpad_get(dev, &dummy);
        dev->touchpad_supported = !ret;
 
-       ret = toshiba_accelerometer_supported(dev);
-       dev->accelerometer_supported = !ret;
+       toshiba_accelerometer_available(dev);
 
        toshiba_usb_sleep_charge_available(dev);
 
@@ -2705,6 +2746,8 @@ static int toshiba_acpi_add(struct acpi_device *acpi_dev)
        ret = get_fan_status(dev, &dummy);
        dev->fan_supported = !ret;
 
+       print_supported_features(dev);
+
        /*
         * Enable the "Special Functions" mode only if they are
         * supported and if they are activated.
@@ -2738,6 +2781,14 @@ static void toshiba_acpi_notify(struct acpi_device *acpi_dev, u32 event)
 
        switch (event) {
        case 0x80: /* Hotkeys and some system events */
+               /*
+                * Machines with this WMI GUID aren't supported due to bugs in
+                * their AML.
+                *
+                * Return silently to avoid triggering a netlink event.
+                */
+               if (wmi_has_guid(TOSHIBA_WMI_EVENT_GUID))
+                       return;
                toshiba_acpi_process_hotkeys(dev);
                break;
        case 0x81: /* Dock events */
@@ -2781,10 +2832,14 @@ static void toshiba_acpi_notify(struct acpi_device *acpi_dev, u32 event)
 static int toshiba_acpi_suspend(struct device *device)
 {
        struct toshiba_acpi_dev *dev = acpi_driver_data(to_acpi_device(device));
-       u32 result;
 
-       if (dev->hotkey_dev)
+       if (dev->hotkey_dev) {
+               u32 result;
+
                result = hci_write(dev, HCI_HOTKEY_EVENT, HCI_HOTKEY_DISABLE);
+               if (result != TOS_SUCCESS)
+                       pr_info("Unable to disable hotkeys\n");
+       }
 
        return 0;
 }
@@ -2792,10 +2847,10 @@ static int toshiba_acpi_suspend(struct device *device)
 static int toshiba_acpi_resume(struct device *device)
 {
        struct toshiba_acpi_dev *dev = acpi_driver_data(to_acpi_device(device));
-       int error;
 
        if (dev->hotkey_dev) {
-               error = toshiba_acpi_enable_hotkeys(dev);
+               int error = toshiba_acpi_enable_hotkeys(dev);
+
                if (error)
                        pr_info("Unable to re-enable hotkeys\n");
        }
@@ -2824,14 +2879,6 @@ static int __init toshiba_acpi_init(void)
 {
        int ret;
 
-       /*
-        * Machines with this WMI guid aren't supported due to bugs in
-        * their AML. This check relies on wmi initializing before
-        * toshiba_acpi to guarantee guids have been identified.
-        */
-       if (wmi_has_guid(TOSHIBA_WMI_EVENT_GUID))
-               return -ENODEV;
-
        toshiba_proc_dir = proc_mkdir(PROC_TOSHIBA, acpi_root_dir);
        if (!toshiba_proc_dir) {
                pr_err("Unable to create proc dir " PROC_TOSHIBA "\n");
index aac47573f9ed83e4c42ea84aef9c3940644e4db1..eb391a2818330e9423f2030722a041eddedc9130 100644 (file)
@@ -194,34 +194,6 @@ static bool wmi_parse_guid(const u8 *src, u8 *dest)
        return true;
 }
 
-/*
- * Convert a raw GUID to the ACII string representation
- */
-static int wmi_gtoa(const char *in, char *out)
-{
-       int i;
-
-       for (i = 3; i >= 0; i--)
-               out += sprintf(out, "%02X", in[i] & 0xFF);
-
-       out += sprintf(out, "-");
-       out += sprintf(out, "%02X", in[5] & 0xFF);
-       out += sprintf(out, "%02X", in[4] & 0xFF);
-       out += sprintf(out, "-");
-       out += sprintf(out, "%02X", in[7] & 0xFF);
-       out += sprintf(out, "%02X", in[6] & 0xFF);
-       out += sprintf(out, "-");
-       out += sprintf(out, "%02X", in[8] & 0xFF);
-       out += sprintf(out, "%02X", in[9] & 0xFF);
-       out += sprintf(out, "-");
-
-       for (i = 10; i <= 15; i++)
-               out += sprintf(out, "%02X", in[i] & 0xFF);
-
-       *out = '\0';
-       return 0;
-}
-
 static bool find_guid(const char *guid_string, struct wmi_block **out)
 {
        char tmp[16], guid_input[16];
@@ -457,11 +429,7 @@ EXPORT_SYMBOL_GPL(wmi_set_block);
 
 static void wmi_dump_wdg(const struct guid_block *g)
 {
-       char guid_string[37];
-
-       wmi_gtoa(g->guid, guid_string);
-
-       pr_info("%s:\n", guid_string);
+       pr_info("%pUL:\n", g->guid);
        pr_info("\tobject_id: %c%c\n", g->object_id[0], g->object_id[1]);
        pr_info("\tnotify_id: %02X\n", g->notify_id);
        pr_info("\treserved: %02X\n", g->reserved);
@@ -661,7 +629,6 @@ EXPORT_SYMBOL_GPL(wmi_has_guid);
 static ssize_t modalias_show(struct device *dev, struct device_attribute *attr,
                             char *buf)
 {
-       char guid_string[37];
        struct wmi_block *wblock;
 
        wblock = dev_get_drvdata(dev);
@@ -670,9 +637,7 @@ static ssize_t modalias_show(struct device *dev, struct device_attribute *attr,
                return strlen(buf);
        }
 
-       wmi_gtoa(wblock->gblock.guid, guid_string);
-
-       return sprintf(buf, "wmi:%s\n", guid_string);
+       return sprintf(buf, "wmi:%pUL\n", wblock->gblock.guid);
 }
 static DEVICE_ATTR_RO(modalias);
 
@@ -695,7 +660,7 @@ static int wmi_dev_uevent(struct device *dev, struct kobj_uevent_env *env)
        if (!wblock)
                return -ENOMEM;
 
-       wmi_gtoa(wblock->gblock.guid, guid_string);
+       sprintf(guid_string, "%pUL", wblock->gblock.guid);
 
        strcpy(&env->buf[env->buflen - 1], "wmi:");
        memcpy(&env->buf[env->buflen - 1 + 4], guid_string, 36);
@@ -721,12 +686,9 @@ static struct class wmi_class = {
 static int wmi_create_device(const struct guid_block *gblock,
                             struct wmi_block *wblock, acpi_handle handle)
 {
-       char guid_string[37];
-
        wblock->dev.class = &wmi_class;
 
-       wmi_gtoa(gblock->guid, guid_string);
-       dev_set_name(&wblock->dev, "%s", guid_string);
+       dev_set_name(&wblock->dev, "%pUL", gblock->guid);
 
        dev_set_drvdata(&wblock->dev, wblock);
 
@@ -877,7 +839,6 @@ static void acpi_wmi_notify(struct acpi_device *device, u32 event)
        struct guid_block *block;
        struct wmi_block *wblock;
        struct list_head *p;
-       char guid_string[37];
 
        list_for_each(p, &wmi_block_list) {
                wblock = list_entry(p, struct wmi_block, list);
@@ -888,8 +849,8 @@ static void acpi_wmi_notify(struct acpi_device *device, u32 event)
                        if (wblock->handler)
                                wblock->handler(event, wblock->handler_data);
                        if (debug_event) {
-                               wmi_gtoa(wblock->gblock.guid, guid_string);
-                               pr_info("DEBUG Event GUID: %s\n", guid_string);
+                               pr_info("DEBUG Event GUID: %pUL\n",
+                                       wblock->gblock.guid);
                        }
 
                        acpi_bus_generate_netlink_event(
index 9357aa779048a241e6dafe6bbaa1058b9eabb89f..7ad3295752ef35371953705ad6b44cb5b6e3a6e1 100644 (file)
@@ -97,8 +97,6 @@ static int pnp_assign_mem(struct pnp_dev *dev, struct pnp_mem *rule, int idx)
        /* ??? rule->flags restricted to 8 bits, all tests bogus ??? */
        if (!(rule->flags & IORESOURCE_MEM_WRITEABLE))
                res->flags |= IORESOURCE_READONLY;
-       if (rule->flags & IORESOURCE_MEM_CACHEABLE)
-               res->flags |= IORESOURCE_CACHEABLE;
        if (rule->flags & IORESOURCE_MEM_RANGELENGTH)
                res->flags |= IORESOURCE_RANGELENGTH;
        if (rule->flags & IORESOURCE_MEM_SHADOWABLE)
index 1c202ccbd2a61e741d7dd820a5b22741874dd50e..907293e6f2a4a3fafacd13fa4390955541eed694 100644 (file)
@@ -619,7 +619,7 @@ static int cm_get_battery_temperature(struct charger_manager *cm,
 
 #ifdef CONFIG_THERMAL
        if (cm->tzd_batt) {
-               ret = thermal_zone_get_temp(cm->tzd_batt, (unsigned long *)temp);
+               ret = thermal_zone_get_temp(cm->tzd_batt, temp);
                if (!ret)
                        /* Calibrate temperature unit */
                        *temp /= 100;
index 869284c2e1e85e7d1035b281ffb64aae06f976e6..456987c88baab9f4b8a0d2b7224b10d659b8bddc 100644 (file)
@@ -557,7 +557,7 @@ EXPORT_SYMBOL_GPL(power_supply_unreg_notifier);
 
 #ifdef CONFIG_THERMAL
 static int power_supply_read_temp(struct thermal_zone_device *tzd,
-               unsigned long *temp)
+               int *temp)
 {
        struct power_supply *psy;
        union power_supply_propval val;
index f4f2c1f76c326d9093c9d9ea7587dd189aed5182..74f2d3ff1d7cf4242935974a738f9d1fb2749111 100644 (file)
@@ -91,7 +91,7 @@
 #define TWL4030_MSTATEC_COMPLETE1      0x0b
 #define TWL4030_MSTATEC_COMPLETE4      0x0e
 
-#if IS_ENABLED(CONFIG_TWL4030_MADC)
+#if IS_REACHABLE(CONFIG_TWL4030_MADC)
 /*
  * If AC (Accessory Charger) voltage exceeds 4.5V (MADC 11)
  * then AC is available.
@@ -1057,13 +1057,9 @@ static int twl4030_bci_probe(struct platform_device *pdev)
 
                phynode = of_find_compatible_node(bci->dev->of_node->parent,
                                                  NULL, "ti,twl4030-usb");
-               if (phynode) {
+               if (phynode)
                        bci->transceiver = devm_usb_get_phy_by_node(
                                bci->dev, phynode, &bci->usb_nb);
-                       if (IS_ERR(bci->transceiver) &&
-                           PTR_ERR(bci->transceiver) == -EPROBE_DEFER)
-                               return -EPROBE_DEFER;
-               }
        }
 
        /* Enable interrupts now. */
index 948d9abd27f1159355d60705727683b9e0deb4d1..062630ab742451c8d4d06b61262d0a4753bb4711 100644 (file)
@@ -180,6 +180,18 @@ config PWM_LP3943
          To compile this driver as a module, choose M here: the module
          will be called pwm-lp3943.
 
+config PWM_LPC18XX_SCT
+       tristate "LPC18xx/43xx PWM/SCT support"
+       depends on ARCH_LPC18XX
+       help
+         Generic PWM framework driver for NXP LPC18xx PWM/SCT which
+         supports 16 channels.
+         A maximum of 15 channels can be requested simultaneously and
+         must have the same period.
+
+         To compile this driver as a module, choose M here: the module
+         will be called pwm-lpc18xx-sct.
+
 config PWM_LPC32XX
        tristate "LPC32XX PWM support"
        depends on ARCH_LPC32XX
index d186f35a65388d532709ee1d323f9241f3e97f35..a0e00c09ead3d05e6fb92c68f4744cfcf5177f17 100644 (file)
@@ -15,6 +15,7 @@ obj-$(CONFIG_PWM_IMG)         += pwm-img.o
 obj-$(CONFIG_PWM_IMX)          += pwm-imx.o
 obj-$(CONFIG_PWM_JZ4740)       += pwm-jz4740.o
 obj-$(CONFIG_PWM_LP3943)       += pwm-lp3943.o
+obj-$(CONFIG_PWM_LPC18XX_SCT)  += pwm-lpc18xx-sct.o
 obj-$(CONFIG_PWM_LPC32XX)      += pwm-lpc32xx.o
 obj-$(CONFIG_PWM_LPSS)         += pwm-lpss.o
 obj-$(CONFIG_PWM_LPSS_PCI)     += pwm-lpss-pci.o
index 3a7769fe53dee4f3c4c5f0bdca3e829e0fa8d5b6..3f9df3ea33508da41334178038c121f6a3939f79 100644 (file)
@@ -200,6 +200,8 @@ static void of_pwmchip_remove(struct pwm_chip *chip)
  * pwm_set_chip_data() - set private chip data for a PWM
  * @pwm: PWM device
  * @data: pointer to chip-specific data
+ *
+ * Returns: 0 on success or a negative error code on failure.
  */
 int pwm_set_chip_data(struct pwm_device *pwm, void *data)
 {
@@ -215,6 +217,8 @@ EXPORT_SYMBOL_GPL(pwm_set_chip_data);
 /**
  * pwm_get_chip_data() - get private chip data for a PWM
  * @pwm: PWM device
+ *
+ * Returns: A pointer to the chip-private data for the PWM device.
  */
 void *pwm_get_chip_data(struct pwm_device *pwm)
 {
@@ -230,6 +234,8 @@ EXPORT_SYMBOL_GPL(pwm_get_chip_data);
  * Register a new PWM chip. If chip->base < 0 then a dynamically assigned base
  * will be used. The initial polarity for all channels is specified by the
  * @polarity parameter.
+ *
+ * Returns: 0 on success or a negative error code on failure.
  */
 int pwmchip_add_with_polarity(struct pwm_chip *chip,
                              enum pwm_polarity polarity)
@@ -291,6 +297,8 @@ EXPORT_SYMBOL_GPL(pwmchip_add_with_polarity);
  *
  * Register a new PWM chip. If chip->base < 0 then a dynamically assigned base
  * will be used. The initial polarity for all channels is normal.
+ *
+ * Returns: 0 on success or a negative error code on failure.
  */
 int pwmchip_add(struct pwm_chip *chip)
 {
@@ -304,6 +312,8 @@ EXPORT_SYMBOL_GPL(pwmchip_add);
  *
  * Removes a PWM chip. This function may return busy if the PWM chip provides
  * a PWM device that is still requested.
+ *
+ * Returns: 0 on success or a negative error code on failure.
  */
 int pwmchip_remove(struct pwm_chip *chip)
 {
@@ -338,10 +348,13 @@ EXPORT_SYMBOL_GPL(pwmchip_remove);
 
 /**
  * pwm_request() - request a PWM device
- * @pwm_id: global PWM device index
+ * @pwm: global PWM device index
  * @label: PWM device label
  *
  * This function is deprecated, use pwm_get() instead.
+ *
+ * Returns: A pointer to a PWM device or an ERR_PTR()-encoded error code on
+ * failure.
  */
 struct pwm_device *pwm_request(int pwm, const char *label)
 {
@@ -376,9 +389,9 @@ EXPORT_SYMBOL_GPL(pwm_request);
  * @index: per-chip index of the PWM to request
  * @label: a literal description string of this PWM
  *
- * Returns the PWM at the given index of the given PWM chip. A negative error
- * code is returned if the index is not valid for the specified PWM chip or
- * if the PWM device cannot be requested.
+ * Returns: A pointer to the PWM device at the given index of the given PWM
+ * chip. A negative error code is returned if the index is not valid for the
+ * specified PWM chip or if the PWM device cannot be requested.
  */
 struct pwm_device *pwm_request_from_chip(struct pwm_chip *chip,
                                         unsigned int index,
@@ -419,6 +432,8 @@ EXPORT_SYMBOL_GPL(pwm_free);
  * @pwm: PWM device
  * @duty_ns: "on" time (in nanoseconds)
  * @period_ns: duration (in nanoseconds) of one cycle
+ *
+ * Returns: 0 on success or a negative error code on failure.
  */
 int pwm_config(struct pwm_device *pwm, int duty_ns, int period_ns)
 {
@@ -443,7 +458,10 @@ EXPORT_SYMBOL_GPL(pwm_config);
  * @pwm: PWM device
  * @polarity: new polarity of the PWM signal
  *
- * Note that the polarity cannot be configured while the PWM device is enabled
+ * Note that the polarity cannot be configured while the PWM device is
+ * enabled.
+ *
+ * Returns: 0 on success or a negative error code on failure.
  */
 int pwm_set_polarity(struct pwm_device *pwm, enum pwm_polarity polarity)
 {
@@ -455,7 +473,7 @@ int pwm_set_polarity(struct pwm_device *pwm, enum pwm_polarity polarity)
        if (!pwm->chip->ops->set_polarity)
                return -ENOSYS;
 
-       if (test_bit(PWMF_ENABLED, &pwm->flags))
+       if (pwm_is_enabled(pwm))
                return -EBUSY;
 
        err = pwm->chip->ops->set_polarity(pwm->chip, pwm, polarity);
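These core.c hunks (and the Atmel driver hunks further down) replace open-coded test_bit(PWMF_ENABLED, ...) checks with pwm_is_enabled(). The helper lives in include/linux/pwm.h and is, roughly, the following trivial inline accessor (a sketch; the header hunk itself is outside this excerpt):

	static inline bool pwm_is_enabled(const struct pwm_device *pwm)
	{
		return test_bit(PWMF_ENABLED, &pwm->flags);
	}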
@@ -471,6 +489,8 @@ EXPORT_SYMBOL_GPL(pwm_set_polarity);
 /**
  * pwm_enable() - start a PWM output toggling
  * @pwm: PWM device
+ *
+ * Returns: 0 on success or a negative error code on failure.
  */
 int pwm_enable(struct pwm_device *pwm)
 {
@@ -524,6 +544,9 @@ static struct pwm_chip *of_node_to_pwmchip(struct device_node *np)
  * lookup of the PWM index. This also means that the "pwm-names" property
  * becomes mandatory for devices that look up the PWM device via the con_id
  * parameter.
+ *
+ * Returns: A pointer to the requested PWM device or an ERR_PTR()-encoded
+ * error code on failure.
  */
 struct pwm_device *of_pwm_get(struct device_node *np, const char *con_id)
 {
@@ -630,6 +653,9 @@ void pwm_remove_table(struct pwm_lookup *table, size_t num)
  *
  * Once a PWM chip has been found the specified PWM device will be requested
  * and is ready to be used.
+ *
+ * Returns: A pointer to the requested PWM device or an ERR_PTR()-encoded
+ * error code on failure.
  */
 struct pwm_device *pwm_get(struct device *dev, const char *con_id)
 {
@@ -752,6 +778,9 @@ static void devm_pwm_release(struct device *dev, void *res)
  *
  * This function performs like pwm_get() but the acquired PWM device will
  * automatically be released on driver detach.
+ *
+ * Returns: A pointer to the requested PWM device or an ERR_PTR()-encoded
+ * error code on failure.
  */
 struct pwm_device *devm_pwm_get(struct device *dev, const char *con_id)
 {
@@ -781,6 +810,9 @@ EXPORT_SYMBOL_GPL(devm_pwm_get);
  *
  * This function performs like of_pwm_get() but the acquired PWM device will
  * automatically be released on driver detach.
+ *
+ * Returns: A pointer to the requested PWM device or an ERR_PTR()-encoded
+ * error code on failure.
  */
 struct pwm_device *devm_of_pwm_get(struct device *dev, struct device_node *np,
                                   const char *con_id)
@@ -832,7 +864,7 @@ EXPORT_SYMBOL_GPL(devm_pwm_put);
   * pwm_can_sleep() - report whether PWM access will sleep
   * @pwm: PWM device
   *
-  * It returns true if accessing the PWM can sleep, false otherwise.
+  * Returns: True if accessing the PWM can sleep, false otherwise.
   */
 bool pwm_can_sleep(struct pwm_device *pwm)
 {
@@ -853,7 +885,7 @@ static void pwm_dbg_show(struct pwm_chip *chip, struct seq_file *s)
                if (test_bit(PWMF_REQUESTED, &pwm->flags))
                        seq_puts(s, " requested");
 
-               if (test_bit(PWMF_ENABLED, &pwm->flags))
+               if (pwm_is_enabled(pwm))
                        seq_puts(s, " enabled");
 
                seq_puts(s, "\n");
@@ -924,6 +956,5 @@ static int __init pwm_debugfs_init(void)
 
        return 0;
 }
-
 subsys_initcall(pwm_debugfs_init);
 #endif /* CONFIG_DEBUG_FS */
index fa5feaba25a5d768d47aa100f4a9e55c634aabce..5df1db40fc075add53c4084b55f94993800b0b9e 100644 (file)
@@ -217,6 +217,11 @@ static const struct atmel_hlcdc_pwm_errata atmel_hlcdc_pwm_sama5d3_errata = {
 };
 
 static const struct of_device_id atmel_hlcdc_dt_ids[] = {
+       {
+               .compatible = "atmel,at91sam9n12-hlcdc",
+               /* 9n12 has same errata as 9x5 HLCDC PWM */
+               .data = &atmel_hlcdc_pwm_at91sam9x5_errata,
+       },
        {
                .compatible = "atmel,at91sam9x5-hlcdc",
                .data = &atmel_hlcdc_pwm_at91sam9x5_errata,
index d14e0677c92ddacb6838f8cb1d7f269fc2937b7e..6da01b3bf6f463b606cac8e3b5cb2d834243456a 100644 (file)
@@ -347,7 +347,7 @@ static int atmel_tcb_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
        tcbpwm->duty = duty;
 
        /* If the PWM is enabled, call enable to apply the new conf */
-       if (test_bit(PWMF_ENABLED, &pwm->flags))
+       if (pwm_is_enabled(pwm))
                atmel_tcb_pwm_enable(chip, pwm);
 
        return 0;
index a947c9095d9d6fc99d2b66be702e915eaa26014c..0e4bd4e8e5823727c03b7701ad893b4cae1f7e7e 100644 (file)
@@ -114,7 +114,7 @@ static int atmel_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
        u32 val;
        int ret;
 
-       if (test_bit(PWMF_ENABLED, &pwm->flags) && (period_ns != pwm->period)) {
+       if (pwm_is_enabled(pwm) && (period_ns != pwm_get_period(pwm))) {
                dev_err(chip->dev, "cannot change PWM period while enabled\n");
                return -EBUSY;
        }
@@ -176,7 +176,7 @@ static void atmel_pwm_config_v1(struct pwm_chip *chip, struct pwm_device *pwm,
         * If the PWM channel is enabled, only update CDTY by using the update
         * register, it needs to set bit 10 of CMR to 0
         */
-       if (test_bit(PWMF_ENABLED, &pwm->flags))
+       if (pwm_is_enabled(pwm))
                return;
        /*
         * If the PWM channel is disabled, write value to duty and period
@@ -191,7 +191,7 @@ static void atmel_pwm_config_v2(struct pwm_chip *chip, struct pwm_device *pwm,
 {
        struct atmel_pwm_chip *atmel_pwm = to_atmel_pwm_chip(chip);
 
-       if (test_bit(PWMF_ENABLED, &pwm->flags)) {
+       if (pwm_is_enabled(pwm)) {
                /*
                 * If the PWM channel is enabled, using the duty update register
                 * to update the value.
index 7af8fea2dc5b3e7f5f639fd03c6b1246f7fa4d60..c634183220232194c3a0192121681c5610feb809 100644 (file)
@@ -76,19 +76,36 @@ static inline struct kona_pwmc *to_kona_pwmc(struct pwm_chip *_chip)
        return container_of(_chip, struct kona_pwmc, chip);
 }
 
-static void kona_pwmc_apply_settings(struct kona_pwmc *kp, unsigned int chan)
+/*
+ * Clear trigger bit but set smooth bit to maintain old output.
+ */
+static void kona_pwmc_prepare_for_settings(struct kona_pwmc *kp,
+       unsigned int chan)
 {
        unsigned int value = readl(kp->base + PWM_CONTROL_OFFSET);
 
-       /* Clear trigger bit but set smooth bit to maintain old output */
        value |= 1 << PWM_CONTROL_SMOOTH_SHIFT(chan);
        value &= ~(1 << PWM_CONTROL_TRIGGER_SHIFT(chan));
        writel(value, kp->base + PWM_CONTROL_OFFSET);
 
+       /*
+        * There must be a min 400ns delay between clearing trigger and setting
+        * it. Failing to do this may result in no PWM signal.
+        */
+       ndelay(400);
+}
+
+static void kona_pwmc_apply_settings(struct kona_pwmc *kp, unsigned int chan)
+{
+       unsigned int value = readl(kp->base + PWM_CONTROL_OFFSET);
+
        /* Set trigger bit and clear smooth bit to apply new settings */
        value &= ~(1 << PWM_CONTROL_SMOOTH_SHIFT(chan));
        value |= 1 << PWM_CONTROL_TRIGGER_SHIFT(chan);
        writel(value, kp->base + PWM_CONTROL_OFFSET);
+
+       /* Trigger bit must be held high for at least 400 ns. */
+       ndelay(400);
 }
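
As a reading aid, a sketch of how a register update is now expected to be bracketed by the two helpers above; kona_pwmc_update_sketch() is an illustrative name only, while PERIOD_COUNT_OFFSET() and DUTY_CYCLE_HIGH_OFFSET() are the driver's existing macros:

/* Illustrative only: update one channel while respecting the 400 ns holds. */
static void kona_pwmc_update_sketch(struct kona_pwmc *kp, unsigned int chan,
                                    u32 period_cnt, u32 duty_cnt)
{
        kona_pwmc_prepare_for_settings(kp, chan);  /* smooth on, trigger off */

        writel(period_cnt, kp->base + PERIOD_COUNT_OFFSET(chan));
        writel(duty_cnt, kp->base + DUTY_CYCLE_HIGH_OFFSET(chan));

        kona_pwmc_apply_settings(kp, chan);        /* smooth off, trigger on */
}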
 
 static int kona_pwmc_config(struct pwm_chip *chip, struct pwm_device *pwm,
@@ -133,8 +150,14 @@ static int kona_pwmc_config(struct pwm_chip *chip, struct pwm_device *pwm,
                        return -EINVAL;
        }
 
-       /* If the PWM channel is enabled, write the settings to the HW */
-       if (test_bit(PWMF_ENABLED, &pwm->flags)) {
+       /*
+        * Don't apply settings if disabled. The period and duty cycle are
+        * always calculated above to ensure the new values are
+        * validated immediately instead of on enable.
+        */
+       if (pwm_is_enabled(pwm)) {
+               kona_pwmc_prepare_for_settings(kp, chan);
+
                value = readl(kp->base + PRESCALE_OFFSET);
                value &= ~PRESCALE_MASK(chan);
                value |= prescale << PRESCALE_SHIFT(chan);
@@ -164,6 +187,8 @@ static int kona_pwmc_set_polarity(struct pwm_chip *chip, struct pwm_device *pwm,
                return ret;
        }
 
+       kona_pwmc_prepare_for_settings(kp, chan);
+
        value = readl(kp->base + PWM_CONTROL_OFFSET);
 
        if (polarity == PWM_POLARITY_NORMAL)
@@ -175,9 +200,6 @@ static int kona_pwmc_set_polarity(struct pwm_chip *chip, struct pwm_device *pwm,
 
        kona_pwmc_apply_settings(kp, chan);
 
-       /* Wait for waveform to settle before gating off the clock */
-       ndelay(400);
-
        clk_disable_unprepare(kp->clk);
 
        return 0;
@@ -194,7 +216,8 @@ static int kona_pwmc_enable(struct pwm_chip *chip, struct pwm_device *pwm)
                return ret;
        }
 
-       ret = kona_pwmc_config(chip, pwm, pwm->duty_cycle, pwm->period);
+       ret = kona_pwmc_config(chip, pwm, pwm_get_duty_cycle(pwm),
+                              pwm_get_period(pwm));
        if (ret < 0) {
                clk_disable_unprepare(kp->clk);
                return ret;
@@ -207,13 +230,20 @@ static void kona_pwmc_disable(struct pwm_chip *chip, struct pwm_device *pwm)
 {
        struct kona_pwmc *kp = to_kona_pwmc(chip);
        unsigned int chan = pwm->hwpwm;
+       unsigned int value;
+
+       kona_pwmc_prepare_for_settings(kp, chan);
 
        /* Simulate a disable by configuring for zero duty */
        writel(0, kp->base + DUTY_CYCLE_HIGH_OFFSET(chan));
-       kona_pwmc_apply_settings(kp, chan);
+       writel(0, kp->base + PERIOD_COUNT_OFFSET(chan));
 
-       /* Wait for waveform to settle before gating off the clock */
-       ndelay(400);
+       /* Set prescale to 0 for this channel */
+       value = readl(kp->base + PRESCALE_OFFSET);
+       value &= ~PRESCALE_MASK(chan);
+       writel(value, kp->base + PRESCALE_OFFSET);
+
+       kona_pwmc_apply_settings(kp, chan);
 
        clk_disable_unprepare(kp->clk);
 }
@@ -287,7 +317,7 @@ static int kona_pwmc_remove(struct platform_device *pdev)
        unsigned int chan;
 
        for (chan = 0; chan < kp->chip.npwm; chan++)
-               if (test_bit(PWMF_ENABLED, &kp->chip.pwms[chan].flags))
+               if (pwm_is_enabled(&kp->chip.pwms[chan]))
                        clk_disable_unprepare(kp->clk);
 
        return pwmchip_remove(&kp->chip);
index e593e9c45c51c1118f88ae12cc4a546498c77b40..bbf10ae02f0ecf9a147b4c6ee81e161113095519 100644 (file)
@@ -82,7 +82,7 @@ static int ep93xx_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
         * The clock needs to be enabled to access the PWM registers.
         * Configuration can be changed at any time.
         */
-       if (!test_bit(PWMF_ENABLED, &pwm->flags)) {
+       if (!pwm_is_enabled(pwm)) {
                ret = clk_enable(ep93xx_pwm->clk);
                if (ret)
                        return ret;
@@ -113,7 +113,7 @@ static int ep93xx_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
                ret = -EINVAL;
        }
 
-       if (!test_bit(PWMF_ENABLED, &pwm->flags))
+       if (!pwm_is_enabled(pwm))
                clk_disable(ep93xx_pwm->clk);
 
        return ret;
index 66d6f0c5c421c210a08d66977d3f7d2a93a1aaa8..d600fd5cd4bac9a434a20798328d513d89b5452f 100644 (file)
@@ -114,7 +114,7 @@ static int imx_pwm_config_v2(struct pwm_chip *chip,
        unsigned long long c;
        unsigned long period_cycles, duty_cycles, prescale;
        unsigned int period_ms;
-       bool enable = test_bit(PWMF_ENABLED, &pwm->flags);
+       bool enable = pwm_is_enabled(pwm);
        int wait_count = 0, fifoav;
        u32 cr, sr;
 
@@ -129,7 +129,8 @@ static int imx_pwm_config_v2(struct pwm_chip *chip,
                sr = readl(imx->mmio_base + MX3_PWMSR);
                fifoav = sr & MX3_PWMSR_FIFOAV_MASK;
                if (fifoav == MX3_PWMSR_FIFOAV_4WORDS) {
-                       period_ms = DIV_ROUND_UP(pwm->period, NSEC_PER_MSEC);
+                       period_ms = DIV_ROUND_UP(pwm_get_period(pwm),
+                                                NSEC_PER_MSEC);
                        msleep(period_ms);
 
                        sr = readl(imx->mmio_base + MX3_PWMSR);
diff --git a/drivers/pwm/pwm-lpc18xx-sct.c b/drivers/pwm/pwm-lpc18xx-sct.c
new file mode 100644 (file)
index 0000000..9163085
--- /dev/null
@@ -0,0 +1,465 @@
+/*
+ * NXP LPC18xx State Configurable Timer - Pulse Width Modulator driver
+ *
+ * Copyright (c) 2015 Ariel D'Alessandro <ariel@vanguardiasur.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ *
+ * Notes
+ * =====
+ * NXP LPC18xx provides a State Configurable Timer (SCT) which can be configured
+ * as a Pulse Width Modulator.
+ *
+ * SCT supports 16 outputs, 16 events and 16 registers. Each event will be
+ * triggered when its related register matches the SCT counter value, and it
+ * will set or clear a selected output.
+ *
+ * One of the events is preselected to generate the period, thus the maximum
+ * number of simultaneous channels is limited to 15. Notice that the period is
+ * global to all channels, so the PWM driver will refuse to set a different
+ * value unless only one channel is currently requested.
+ */
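
For intuition, the match-value and period-limit arithmetic that the driver below performs in lpc18xx_pwm_config_period() and at probe time can be checked with a small standalone sketch; the 180 MHz SCT clock and 1 ms period are assumed example values, not taken from this patch:

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC          1000000000ULL
#define LPC18XX_PWM_TIMER_MAX 0xffffffffULL

int main(void)
{
        uint64_t clk_rate = 180000000ULL;  /* assumed SCT clock: 180 MHz */
        uint64_t period_ns = 1000000ULL;   /* requested period: 1 ms */

        /* Match value loaded for the period event (SCT clock ticks - 1). */
        uint64_t match = period_ns * clk_rate / NSEC_PER_SEC - 1;

        /* Period limits, derived the same way as in the probe path. */
        uint64_t max_period_ns = NSEC_PER_SEC * LPC18XX_PWM_TIMER_MAX / clk_rate;
        uint64_t min_period_ns = (NSEC_PER_SEC + clk_rate - 1) / clk_rate;

        printf("match=%llu min=%llu ns max=%llu ns\n",
               (unsigned long long)match,
               (unsigned long long)min_period_ns,
               (unsigned long long)max_period_ns);
        return 0;
}

With these example numbers the period event matches at count 179999, and the usable period range is roughly 6 ns to 23.8 s.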
+
+#include <linux/clk.h>
+#include <linux/err.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/pwm.h>
+
+/* LPC18xx SCT registers */
+#define LPC18XX_PWM_CONFIG             0x000
+#define LPC18XX_PWM_CONFIG_UNIFY       BIT(0)
+#define LPC18XX_PWM_CONFIG_NORELOAD    BIT(7)
+
+#define LPC18XX_PWM_CTRL               0x004
+#define LPC18XX_PWM_CTRL_HALT          BIT(2)
+#define LPC18XX_PWM_BIDIR              BIT(4)
+#define LPC18XX_PWM_PRE_SHIFT          5
+#define LPC18XX_PWM_PRE_MASK           (0xff << LPC18XX_PWM_PRE_SHIFT)
+#define LPC18XX_PWM_PRE(x)             (x << LPC18XX_PWM_PRE_SHIFT)
+
+#define LPC18XX_PWM_LIMIT              0x008
+
+#define LPC18XX_PWM_RES_BASE           0x058
+#define LPC18XX_PWM_RES_SHIFT(_ch)     (_ch * 2)
+#define LPC18XX_PWM_RES(_ch, _action)  (_action << LPC18XX_PWM_RES_SHIFT(_ch))
+#define LPC18XX_PWM_RES_MASK(_ch)      (0x3 << LPC18XX_PWM_RES_SHIFT(_ch))
+
+#define LPC18XX_PWM_MATCH_BASE         0x100
+#define LPC18XX_PWM_MATCH(_ch)         (LPC18XX_PWM_MATCH_BASE + _ch * 4)
+
+#define LPC18XX_PWM_MATCHREL_BASE      0x200
+#define LPC18XX_PWM_MATCHREL(_ch)      (LPC18XX_PWM_MATCHREL_BASE + _ch * 4)
+
+#define LPC18XX_PWM_EVSTATEMSK_BASE    0x300
+#define LPC18XX_PWM_EVSTATEMSK(_ch)    (LPC18XX_PWM_EVSTATEMSK_BASE + _ch * 8)
+#define LPC18XX_PWM_EVSTATEMSK_ALL     0xffffffff
+
+#define LPC18XX_PWM_EVCTRL_BASE                0x304
+#define LPC18XX_PWM_EVCTRL(_ev)                (LPC18XX_PWM_EVCTRL_BASE + _ev * 8)
+
+#define LPC18XX_PWM_EVCTRL_MATCH(_ch)  _ch
+
+#define LPC18XX_PWM_EVCTRL_COMB_SHIFT  12
+#define LPC18XX_PWM_EVCTRL_COMB_MATCH  (0x1 << LPC18XX_PWM_EVCTRL_COMB_SHIFT)
+
+#define LPC18XX_PWM_OUTPUTSET_BASE     0x500
+#define LPC18XX_PWM_OUTPUTSET(_ch)     (LPC18XX_PWM_OUTPUTSET_BASE + _ch * 8)
+
+#define LPC18XX_PWM_OUTPUTCL_BASE      0x504
+#define LPC18XX_PWM_OUTPUTCL(_ch)      (LPC18XX_PWM_OUTPUTCL_BASE + _ch * 8)
+
+/* LPC18xx SCT unified counter */
+#define LPC18XX_PWM_TIMER_MAX          0xffffffff
+
+/* LPC18xx SCT events */
+#define LPC18XX_PWM_EVENT_PERIOD       0
+#define LPC18XX_PWM_EVENT_MAX          16
+
+/* SCT conflict resolution */
+enum lpc18xx_pwm_res_action {
+       LPC18XX_PWM_RES_NONE,
+       LPC18XX_PWM_RES_SET,
+       LPC18XX_PWM_RES_CLEAR,
+       LPC18XX_PWM_RES_TOGGLE,
+};
+
+struct lpc18xx_pwm_data {
+       unsigned int duty_event;
+};
+
+struct lpc18xx_pwm_chip {
+       struct device *dev;
+       struct pwm_chip chip;
+       void __iomem *base;
+       struct clk *pwm_clk;
+       unsigned long clk_rate;
+       unsigned int period_ns;
+       unsigned int min_period_ns;
+       unsigned int max_period_ns;
+       unsigned int period_event;
+       unsigned long event_map;
+       struct mutex res_lock;
+       struct mutex period_lock;
+};
+
+static inline struct lpc18xx_pwm_chip *
+to_lpc18xx_pwm_chip(struct pwm_chip *chip)
+{
+       return container_of(chip, struct lpc18xx_pwm_chip, chip);
+}
+
+static inline void lpc18xx_pwm_writel(struct lpc18xx_pwm_chip *lpc18xx_pwm,
+                                     u32 reg, u32 val)
+{
+       writel(val, lpc18xx_pwm->base + reg);
+}
+
+static inline u32 lpc18xx_pwm_readl(struct lpc18xx_pwm_chip *lpc18xx_pwm,
+                                   u32 reg)
+{
+       return readl(lpc18xx_pwm->base + reg);
+}
+
+static void lpc18xx_pwm_set_conflict_res(struct lpc18xx_pwm_chip *lpc18xx_pwm,
+                                        struct pwm_device *pwm,
+                                        enum lpc18xx_pwm_res_action action)
+{
+       u32 val;
+
+       mutex_lock(&lpc18xx_pwm->res_lock);
+
+       /*
+        * Simultaneous set and clear may happen on an output; that is the case
+        * when duty_ns == period_ns. The LPC18xx SCT allows a conflict
+        * resolution action to be set for such a case.
+        */
+       val = lpc18xx_pwm_readl(lpc18xx_pwm, LPC18XX_PWM_RES_BASE);
+       val &= ~LPC18XX_PWM_RES_MASK(pwm->hwpwm);
+       val |= LPC18XX_PWM_RES(pwm->hwpwm, action);
+       lpc18xx_pwm_writel(lpc18xx_pwm, LPC18XX_PWM_RES_BASE, val);
+
+       mutex_unlock(&lpc18xx_pwm->res_lock);
+}
+
+static void lpc18xx_pwm_config_period(struct pwm_chip *chip, int period_ns)
+{
+       struct lpc18xx_pwm_chip *lpc18xx_pwm = to_lpc18xx_pwm_chip(chip);
+       u64 val;
+
+       val = (u64)period_ns * lpc18xx_pwm->clk_rate;
+       do_div(val, NSEC_PER_SEC);
+
+       lpc18xx_pwm_writel(lpc18xx_pwm,
+                          LPC18XX_PWM_MATCH(lpc18xx_pwm->period_event),
+                          (u32)val - 1);
+
+       lpc18xx_pwm_writel(lpc18xx_pwm,
+                          LPC18XX_PWM_MATCHREL(lpc18xx_pwm->period_event),
+                          (u32)val - 1);
+}
+
+static void lpc18xx_pwm_config_duty(struct pwm_chip *chip,
+                                   struct pwm_device *pwm, int duty_ns)
+{
+       struct lpc18xx_pwm_chip *lpc18xx_pwm = to_lpc18xx_pwm_chip(chip);
+       struct lpc18xx_pwm_data *lpc18xx_data = pwm_get_chip_data(pwm);
+       u64 val;
+
+       val = (u64)duty_ns * lpc18xx_pwm->clk_rate;
+       do_div(val, NSEC_PER_SEC);
+
+       lpc18xx_pwm_writel(lpc18xx_pwm,
+                          LPC18XX_PWM_MATCH(lpc18xx_data->duty_event),
+                          (u32)val);
+
+       lpc18xx_pwm_writel(lpc18xx_pwm,
+                          LPC18XX_PWM_MATCHREL(lpc18xx_data->duty_event),
+                          (u32)val);
+}
+
+static int lpc18xx_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
+                             int duty_ns, int period_ns)
+{
+       struct lpc18xx_pwm_chip *lpc18xx_pwm = to_lpc18xx_pwm_chip(chip);
+       int requested_events, i;
+
+       if (period_ns < lpc18xx_pwm->min_period_ns ||
+           period_ns > lpc18xx_pwm->max_period_ns) {
+               dev_err(chip->dev, "period %d not in range\n", period_ns);
+               return -ERANGE;
+       }
+
+       mutex_lock(&lpc18xx_pwm->period_lock);
+
+       requested_events = bitmap_weight(&lpc18xx_pwm->event_map,
+                                        LPC18XX_PWM_EVENT_MAX);
+
+       /*
+        * The PWM supports only a single period for all PWM channels.
+        * Once the period is set, it can only be changed if no more than one
+        * channel is requested at that moment.
+        */
+       if (requested_events > 2 && lpc18xx_pwm->period_ns != period_ns &&
+           lpc18xx_pwm->period_ns) {
+               dev_err(chip->dev, "conflicting period requested for PWM %u\n",
+                       pwm->hwpwm);
+               mutex_unlock(&lpc18xx_pwm->period_lock);
+               return -EBUSY;
+       }
+
+       if ((requested_events <= 2 && lpc18xx_pwm->period_ns != period_ns) ||
+           !lpc18xx_pwm->period_ns) {
+               lpc18xx_pwm->period_ns = period_ns;
+               for (i = 0; i < chip->npwm; i++)
+                       pwm_set_period(&chip->pwms[i], period_ns);
+               lpc18xx_pwm_config_period(chip, period_ns);
+       }
+
+       mutex_unlock(&lpc18xx_pwm->period_lock);
+
+       lpc18xx_pwm_config_duty(chip, pwm, duty_ns);
+
+       return 0;
+}
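
A behavioural sketch of the shared-period rule implemented above; pwm_a and pwm_b stand for two already-requested channels of this chip, and the duty/period values are arbitrary examples:

/*
 * Illustrative only: once more than one channel is in use, a request for a
 * different period is refused with -EBUSY.
 */
static void lpc18xx_period_rule_sketch(struct pwm_device *pwm_a,
                                       struct pwm_device *pwm_b)
{
        pwm_config(pwm_a, 100000, 1000000);  /* sets the global 1 ms period */
        pwm_config(pwm_b, 250000, 1000000);  /* same period: accepted */
        pwm_config(pwm_b, 250000, 2000000);  /* different period: -EBUSY */
}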
+
+static int lpc18xx_pwm_set_polarity(struct pwm_chip *chip,
+                                   struct pwm_device *pwm,
+                                   enum pwm_polarity polarity)
+{
+       return 0;
+}
+
+static int lpc18xx_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm)
+{
+       struct lpc18xx_pwm_chip *lpc18xx_pwm = to_lpc18xx_pwm_chip(chip);
+       struct lpc18xx_pwm_data *lpc18xx_data = pwm_get_chip_data(pwm);
+       enum lpc18xx_pwm_res_action res_action;
+       unsigned int set_event, clear_event;
+
+       lpc18xx_pwm_writel(lpc18xx_pwm,
+                          LPC18XX_PWM_EVCTRL(lpc18xx_data->duty_event),
+                          LPC18XX_PWM_EVCTRL_MATCH(lpc18xx_data->duty_event) |
+                          LPC18XX_PWM_EVCTRL_COMB_MATCH);
+
+       lpc18xx_pwm_writel(lpc18xx_pwm,
+                          LPC18XX_PWM_EVSTATEMSK(lpc18xx_data->duty_event),
+                          LPC18XX_PWM_EVSTATEMSK_ALL);
+
+       if (pwm->polarity == PWM_POLARITY_NORMAL) {
+               set_event = lpc18xx_pwm->period_event;
+               clear_event = lpc18xx_data->duty_event;
+               res_action = LPC18XX_PWM_RES_SET;
+       } else {
+               set_event = lpc18xx_data->duty_event;
+               clear_event = lpc18xx_pwm->period_event;
+               res_action = LPC18XX_PWM_RES_CLEAR;
+       }
+
+       lpc18xx_pwm_writel(lpc18xx_pwm, LPC18XX_PWM_OUTPUTSET(pwm->hwpwm),
+                          BIT(set_event));
+       lpc18xx_pwm_writel(lpc18xx_pwm, LPC18XX_PWM_OUTPUTCL(pwm->hwpwm),
+                          BIT(clear_event));
+       lpc18xx_pwm_set_conflict_res(lpc18xx_pwm, pwm, res_action);
+
+       return 0;
+}
+
+static void lpc18xx_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm)
+{
+       struct lpc18xx_pwm_chip *lpc18xx_pwm = to_lpc18xx_pwm_chip(chip);
+       struct lpc18xx_pwm_data *lpc18xx_data = pwm_get_chip_data(pwm);
+
+       lpc18xx_pwm_writel(lpc18xx_pwm,
+                          LPC18XX_PWM_EVCTRL(lpc18xx_data->duty_event), 0);
+       lpc18xx_pwm_writel(lpc18xx_pwm, LPC18XX_PWM_OUTPUTSET(pwm->hwpwm), 0);
+       lpc18xx_pwm_writel(lpc18xx_pwm, LPC18XX_PWM_OUTPUTCL(pwm->hwpwm), 0);
+}
+
+static int lpc18xx_pwm_request(struct pwm_chip *chip, struct pwm_device *pwm)
+{
+       struct lpc18xx_pwm_chip *lpc18xx_pwm = to_lpc18xx_pwm_chip(chip);
+       struct lpc18xx_pwm_data *lpc18xx_data = pwm_get_chip_data(pwm);
+       unsigned long event;
+
+       event = find_first_zero_bit(&lpc18xx_pwm->event_map,
+                                   LPC18XX_PWM_EVENT_MAX);
+
+       if (event >= LPC18XX_PWM_EVENT_MAX) {
+               dev_err(lpc18xx_pwm->dev,
+                       "maximum number of simultaneous channels reached\n");
+               return -EBUSY;
+       }
+
+       set_bit(event, &lpc18xx_pwm->event_map);
+       lpc18xx_data->duty_event = event;
+       lpc18xx_pwm_config_duty(chip, pwm, pwm_get_duty_cycle(pwm));
+
+       return 0;
+}
+
+static void lpc18xx_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm)
+{
+       struct lpc18xx_pwm_chip *lpc18xx_pwm = to_lpc18xx_pwm_chip(chip);
+       struct lpc18xx_pwm_data *lpc18xx_data = pwm_get_chip_data(pwm);
+
+       pwm_disable(pwm);
+       pwm_set_duty_cycle(pwm, 0);
+       clear_bit(lpc18xx_data->duty_event, &lpc18xx_pwm->event_map);
+}
+
+static const struct pwm_ops lpc18xx_pwm_ops = {
+       .config = lpc18xx_pwm_config,
+       .set_polarity = lpc18xx_pwm_set_polarity,
+       .enable = lpc18xx_pwm_enable,
+       .disable = lpc18xx_pwm_disable,
+       .request = lpc18xx_pwm_request,
+       .free = lpc18xx_pwm_free,
+       .owner = THIS_MODULE,
+};
+
+static const struct of_device_id lpc18xx_pwm_of_match[] = {
+       { .compatible = "nxp,lpc1850-sct-pwm" },
+       {}
+};
+MODULE_DEVICE_TABLE(of, lpc18xx_pwm_of_match);
+
+static int lpc18xx_pwm_probe(struct platform_device *pdev)
+{
+       struct lpc18xx_pwm_chip *lpc18xx_pwm;
+       struct pwm_device *pwm;
+       struct resource *res;
+       int ret, i;
+       u64 val;
+
+       lpc18xx_pwm = devm_kzalloc(&pdev->dev, sizeof(*lpc18xx_pwm),
+                                  GFP_KERNEL);
+       if (!lpc18xx_pwm)
+               return -ENOMEM;
+
+       lpc18xx_pwm->dev = &pdev->dev;
+
+       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       lpc18xx_pwm->base = devm_ioremap_resource(&pdev->dev, res);
+       if (IS_ERR(lpc18xx_pwm->base))
+               return PTR_ERR(lpc18xx_pwm->base);
+
+       lpc18xx_pwm->pwm_clk = devm_clk_get(&pdev->dev, "pwm");
+       if (IS_ERR(lpc18xx_pwm->pwm_clk)) {
+               dev_err(&pdev->dev, "failed to get pwm clock\n");
+               return PTR_ERR(lpc18xx_pwm->pwm_clk);
+       }
+
+       ret = clk_prepare_enable(lpc18xx_pwm->pwm_clk);
+       if (ret < 0) {
+               dev_err(&pdev->dev, "could not prepare or enable pwm clock\n");
+               return ret;
+       }
+
+       lpc18xx_pwm->clk_rate = clk_get_rate(lpc18xx_pwm->pwm_clk);
+
+       mutex_init(&lpc18xx_pwm->res_lock);
+       mutex_init(&lpc18xx_pwm->period_lock);
+
+       val = (u64)NSEC_PER_SEC * LPC18XX_PWM_TIMER_MAX;
+       do_div(val, lpc18xx_pwm->clk_rate);
+       lpc18xx_pwm->max_period_ns = val;
+
+       lpc18xx_pwm->min_period_ns = DIV_ROUND_UP(NSEC_PER_SEC,
+                                                 lpc18xx_pwm->clk_rate);
+
+       lpc18xx_pwm->chip.dev = &pdev->dev;
+       lpc18xx_pwm->chip.ops = &lpc18xx_pwm_ops;
+       lpc18xx_pwm->chip.base = -1;
+       lpc18xx_pwm->chip.npwm = 16;
+       lpc18xx_pwm->chip.of_xlate = of_pwm_xlate_with_flags;
+       lpc18xx_pwm->chip.of_pwm_n_cells = 3;
+
+       /* SCT counter must be in unify (32 bit) mode */
+       lpc18xx_pwm_writel(lpc18xx_pwm, LPC18XX_PWM_CONFIG,
+                          LPC18XX_PWM_CONFIG_UNIFY);
+
+       /*
+        * Every time the timer counter reaches the period value, the related
+        * event will be triggered and the counter reset to 0.
+        */
+       set_bit(LPC18XX_PWM_EVENT_PERIOD, &lpc18xx_pwm->event_map);
+       lpc18xx_pwm->period_event = LPC18XX_PWM_EVENT_PERIOD;
+
+       lpc18xx_pwm_writel(lpc18xx_pwm,
+                          LPC18XX_PWM_EVSTATEMSK(lpc18xx_pwm->period_event),
+                          LPC18XX_PWM_EVSTATEMSK_ALL);
+
+       val = LPC18XX_PWM_EVCTRL_MATCH(lpc18xx_pwm->period_event) |
+             LPC18XX_PWM_EVCTRL_COMB_MATCH;
+       lpc18xx_pwm_writel(lpc18xx_pwm,
+                          LPC18XX_PWM_EVCTRL(lpc18xx_pwm->period_event), val);
+
+       lpc18xx_pwm_writel(lpc18xx_pwm, LPC18XX_PWM_LIMIT,
+                          BIT(lpc18xx_pwm->period_event));
+
+       ret = pwmchip_add(&lpc18xx_pwm->chip);
+       if (ret < 0) {
+               dev_err(&pdev->dev, "pwmchip_add failed: %d\n", ret);
+               goto disable_pwmclk;
+       }
+
+       for (i = 0; i < lpc18xx_pwm->chip.npwm; i++) {
+               pwm = &lpc18xx_pwm->chip.pwms[i];
+               pwm->chip_data = devm_kzalloc(lpc18xx_pwm->dev,
+                                             sizeof(struct lpc18xx_pwm_data),
+                                             GFP_KERNEL);
+               if (!pwm->chip_data) {
+                       ret = -ENOMEM;
+                       goto remove_pwmchip;
+               }
+       }
+
+       platform_set_drvdata(pdev, lpc18xx_pwm);
+
+       val = lpc18xx_pwm_readl(lpc18xx_pwm, LPC18XX_PWM_CTRL);
+       val &= ~LPC18XX_PWM_BIDIR;
+       val &= ~LPC18XX_PWM_CTRL_HALT;
+       val &= ~LPC18XX_PWM_PRE_MASK;
+       val |= LPC18XX_PWM_PRE(0);
+       lpc18xx_pwm_writel(lpc18xx_pwm, LPC18XX_PWM_CTRL, val);
+
+       return 0;
+
+remove_pwmchip:
+       pwmchip_remove(&lpc18xx_pwm->chip);
+disable_pwmclk:
+       clk_disable_unprepare(lpc18xx_pwm->pwm_clk);
+       return ret;
+}
+
+static int lpc18xx_pwm_remove(struct platform_device *pdev)
+{
+       struct lpc18xx_pwm_chip *lpc18xx_pwm = platform_get_drvdata(pdev);
+       u32 val;
+
+       val = lpc18xx_pwm_readl(lpc18xx_pwm, LPC18XX_PWM_CTRL);
+       lpc18xx_pwm_writel(lpc18xx_pwm, LPC18XX_PWM_CTRL,
+                          val | LPC18XX_PWM_CTRL_HALT);
+
+       clk_disable_unprepare(lpc18xx_pwm->pwm_clk);
+
+       return pwmchip_remove(&lpc18xx_pwm->chip);
+}
+
+static struct platform_driver lpc18xx_pwm_driver = {
+       .driver = {
+               .name = "lpc18xx-sct-pwm",
+               .of_match_table = lpc18xx_pwm_of_match,
+       },
+       .probe = lpc18xx_pwm_probe,
+       .remove = lpc18xx_pwm_remove,
+};
+module_platform_driver(lpc18xx_pwm_driver);
+
+MODULE_AUTHOR("Ariel D'Alessandro <ariel@vanguardiasur.com.ar>");
+MODULE_DESCRIPTION("NXP LPC18xx PWM driver");
+MODULE_LICENSE("GPL v2");
index b430811e14f582a9ad66e4cad3f86fbe4569a073..9a596324ebef8a1bede6324083aefc7315520c44 100644 (file)
@@ -77,7 +77,7 @@ static int mxs_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
         * If the PWM channel is disabled, make sure to turn on the clock
         * before writing the register. Otherwise, keep it enabled.
         */
-       if (!test_bit(PWMF_ENABLED, &pwm->flags)) {
+       if (!pwm_is_enabled(pwm)) {
                ret = clk_prepare_enable(mxs->clk);
                if (ret)
                        return ret;
@@ -92,7 +92,7 @@ static int mxs_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
        /*
         * If the PWM is not enabled, turn the clock off again to save power.
         */
-       if (!test_bit(PWMF_ENABLED, &pwm->flags))
+       if (!pwm_is_enabled(pwm))
                clk_disable_unprepare(mxs->clk);
 
        return 0;
index 34b5c275a92a3ca592c5fee09b8ca45634caea49..70448a6079b02ede68edf1b2242a5e7b049fb207 100644 (file)
@@ -2,6 +2,7 @@
  * Driver for PCA9685 16-channel 12-bit PWM LED controller
  *
  * Copyright (C) 2013 Steffen Trumtrar <s.trumtrar@pengutronix.de>
+ * Copyright (C) 2015 Clemens Gruber <clemens.gruber@pqgruber.com>
  *
  * based on the pwm-twl-led.c driver
  *
 #include <linux/pwm.h>
 #include <linux/regmap.h>
 #include <linux/slab.h>
+#include <linux/delay.h>
+
+/*
+ * Because the PCA9685 has only one prescaler per chip, changing the period of
+ * one channel affects the period of all 16 PWM outputs!
+ * However, the ratio between each configured duty cycle and the chip-wide
+ * period remains constant, because the OFF time is set in proportion to the
+ * counter range.
+ */
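
The arithmetic implied by this note and by the new defines below can be checked with a standalone sketch; the period and duty values are examples only, and DIV_ROUND_CLOSEST()/DIV_ROUND_UP() are open-coded here:

#include <stdint.h>
#include <stdio.h>

#define PCA9685_OSC_CLOCK_MHZ  25
#define PCA9685_COUNTER_RANGE  4096

int main(void)
{
        uint64_t period_ns = 5000000;  /* 200 Hz, the driver's default */
        uint64_t duty_ns = 1250000;    /* example 25% duty cycle */

        /* Chip-wide prescaler, as computed in pca9685_pwm_config(). */
        uint64_t prescale = (PCA9685_OSC_CLOCK_MHZ * period_ns +
                             PCA9685_COUNTER_RANGE * 1000 / 2) /
                            (PCA9685_COUNTER_RANGE * 1000) - 1;

        /* OFF count programmed into the LED_N_OFF registers, rounded up. */
        uint64_t off = (PCA9685_COUNTER_RANGE * duty_ns + period_ns - 1) /
                       period_ns;

        printf("prescale=%llu off_count=%llu\n",
               (unsigned long long)prescale, (unsigned long long)off);
        return 0;
}

For the default 5 ms period this yields prescale 30; the 25% duty example maps to an OFF count of 1024 out of the 4096-step counter range.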
 
 #define PCA9685_MODE1          0x00
 #define PCA9685_MODE2          0x01
 #define PCA9685_ALL_LED_OFF_H  0xFD
 #define PCA9685_PRESCALE       0xFE
 
+#define PCA9685_PRESCALE_MIN   0x03    /* => max. frequency of 1526 Hz */
+#define PCA9685_PRESCALE_MAX   0xFF    /* => min. frequency of 24 Hz */
+
+#define PCA9685_COUNTER_RANGE  4096
+#define PCA9685_DEFAULT_PERIOD 5000000 /* Default period_ns = 1/200 Hz */
+#define PCA9685_OSC_CLOCK_MHZ  25      /* Internal oscillator with 25 MHz */
+
 #define PCA9685_NUMREGS                0xFF
 #define PCA9685_MAXCHAN                0x10
 
 #define LED_FULL               (1 << 4)
+#define MODE1_RESTART          (1 << 7)
 #define MODE1_SLEEP            (1 << 4)
 #define MODE2_INVRT            (1 << 4)
 #define MODE2_OUTDRV           (1 << 2)
@@ -59,6 +77,8 @@ struct pca9685 {
        struct pwm_chip chip;
        struct regmap *regmap;
        int active_cnt;
+       int duty_ns;
+       int period_ns;
 };
 
 static inline struct pca9685 *to_pca(struct pwm_chip *chip)
@@ -72,6 +92,47 @@ static int pca9685_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
        struct pca9685 *pca = to_pca(chip);
        unsigned long long duty;
        unsigned int reg;
+       int prescale;
+
+       if (period_ns != pca->period_ns) {
+               prescale = DIV_ROUND_CLOSEST(PCA9685_OSC_CLOCK_MHZ * period_ns,
+                                            PCA9685_COUNTER_RANGE * 1000) - 1;
+
+               if (prescale >= PCA9685_PRESCALE_MIN &&
+                       prescale <= PCA9685_PRESCALE_MAX) {
+                       /* Put chip into sleep mode */
+                       regmap_update_bits(pca->regmap, PCA9685_MODE1,
+                                          MODE1_SLEEP, MODE1_SLEEP);
+
+                       /* Change the chip-wide output frequency */
+                       regmap_write(pca->regmap, PCA9685_PRESCALE, prescale);
+
+                       /* Wake the chip up */
+                       regmap_update_bits(pca->regmap, PCA9685_MODE1,
+                                          MODE1_SLEEP, 0x0);
+
+                       /* Wait 500us for the oscillator to be back up */
+                       udelay(500);
+
+                       pca->period_ns = period_ns;
+
+                       /*
+                        * If the duty cycle did not change, restart PWM with
+                        * the same duty cycle to period ratio and return.
+                        */
+                       if (duty_ns == pca->duty_ns) {
+                               regmap_update_bits(pca->regmap, PCA9685_MODE1,
+                                                  MODE1_RESTART, 0x1);
+                               return 0;
+                       }
+               } else {
+                       dev_err(chip->dev,
+                               "prescaler not set: period out of bounds!\n");
+                       return -EINVAL;
+               }
+       }
+
+       pca->duty_ns = duty_ns;
 
        if (duty_ns < 1) {
                if (pwm->hwpwm >= PCA9685_MAXCHAN)
@@ -85,6 +146,22 @@ static int pca9685_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
        }
 
        if (duty_ns == period_ns) {
+               /* Clear both OFF registers */
+               if (pwm->hwpwm >= PCA9685_MAXCHAN)
+                       reg = PCA9685_ALL_LED_OFF_L;
+               else
+                       reg = LED_N_OFF_L(pwm->hwpwm);
+
+               regmap_write(pca->regmap, reg, 0x0);
+
+               if (pwm->hwpwm >= PCA9685_MAXCHAN)
+                       reg = PCA9685_ALL_LED_OFF_H;
+               else
+                       reg = LED_N_OFF_H(pwm->hwpwm);
+
+               regmap_write(pca->regmap, reg, 0x0);
+
+               /* Set the full ON bit */
                if (pwm->hwpwm >= PCA9685_MAXCHAN)
                        reg = PCA9685_ALL_LED_ON_H;
                else
@@ -95,7 +172,7 @@ static int pca9685_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
                return 0;
        }
 
-       duty = 4096 * (unsigned long long)duty_ns;
+       duty = PCA9685_COUNTER_RANGE * (unsigned long long)duty_ns;
        duty = DIV_ROUND_UP_ULL(duty, period_ns);
 
        if (pwm->hwpwm >= PCA9685_MAXCHAN)
@@ -112,6 +189,14 @@ static int pca9685_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
 
        regmap_write(pca->regmap, reg, ((int)duty >> 8) & 0xf);
 
+       /* Clear the full ON bit, otherwise the set OFF time has no effect */
+       if (pwm->hwpwm >= PCA9685_MAXCHAN)
+               reg = PCA9685_ALL_LED_ON_H;
+       else
+               reg = LED_N_ON_H(pwm->hwpwm);
+
+       regmap_write(pca->regmap, reg, 0);
+
        return 0;
 }
 
@@ -228,6 +313,8 @@ static int pca9685_pwm_probe(struct i2c_client *client,
                        ret);
                return ret;
        }
+       pca->duty_ns = 0;
+       pca->period_ns = PCA9685_DEFAULT_PERIOD;
 
        i2c_set_clientdata(client, pca);
 
@@ -285,7 +372,6 @@ MODULE_DEVICE_TABLE(of, pca9685_dt_ids);
 static struct i2c_driver pca9685_i2c_driver = {
        .driver = {
                .name = "pca9685-pwm",
-               .owner = THIS_MODULE,
                .of_match_table = pca9685_dt_ids,
        },
        .probe = pca9685_pwm_probe,
index ee63f9e9d0fb752833f5dd596a8c3838d4e93cdb..075c1a764ba293dab3d1e39df05bc4b49f67b1b3 100644 (file)
@@ -301,7 +301,7 @@ static int tpu_pwm_config(struct pwm_chip *chip, struct pwm_device *_pwm,
        pwm->duty = duty;
 
        /* If the channel is disabled we're done. */
-       if (!test_bit(PWMF_ENABLED, &_pwm->flags))
+       if (!pwm_is_enabled(_pwm))
                return 0;
 
        if (duty_only && pwm->timer_on) {
index 9442df244101772259a0ddb6824fe6035c4dd568..7d9cc9049522348dd15951f927ed4649b1cdf364 100644 (file)
@@ -83,7 +83,7 @@ static void rockchip_pwm_set_enable_v2(struct pwm_chip *chip,
                          PWM_CONTINUOUS;
        u32 val;
 
-       if (pwm->polarity == PWM_POLARITY_INVERSED)
+       if (pwm_get_polarity(pwm) == PWM_POLARITY_INVERSED)
                enable_conf |= PWM_DUTY_NEGATIVE | PWM_INACTIVE_POSITIVE;
        else
                enable_conf |= PWM_DUTY_POSITIVE | PWM_INACTIVE_NEGATIVE;
index cabd7d8e05cc0fdd79e42da12046d43d5e7032af..d4de0607b502a87c03498f7362b731dd44f06d50 100644 (file)
@@ -112,7 +112,7 @@ static int tegra_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
         * If the PWM channel is disabled, make sure to turn on the clock
         * before writing the register. Otherwise, keep it enabled.
         */
-       if (!test_bit(PWMF_ENABLED, &pwm->flags)) {
+       if (!pwm_is_enabled(pwm)) {
                err = clk_prepare_enable(pc->clk);
                if (err < 0)
                        return err;
@@ -124,7 +124,7 @@ static int tegra_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
        /*
         * If the PWM is not enabled, turn the clock off again to save power.
         */
-       if (!test_bit(PWMF_ENABLED, &pwm->flags))
+       if (!pwm_is_enabled(pwm))
                clk_disable_unprepare(pc->clk);
 
        return 0;
@@ -214,7 +214,7 @@ static int tegra_pwm_remove(struct platform_device *pdev)
        for (i = 0; i < NUM_PWM; i++) {
                struct pwm_device *pwm = &pc->chip.pwms[i];
 
-               if (!test_bit(PWMF_ENABLED, &pwm->flags))
+               if (!pwm_is_enabled(pwm))
                        if (clk_prepare_enable(pc->clk) < 0)
                                continue;
 
index e557befdf4e65902c0c87a513bdb6dd479be5177..616af764a27682ed0301e897ab731f5ffe8c5a13 100644 (file)
@@ -97,7 +97,7 @@ static int ecap_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
 
        writew(reg_val, pc->mmio_base + ECCTL2);
 
-       if (!test_bit(PWMF_ENABLED, &pwm->flags)) {
+       if (!pwm_is_enabled(pwm)) {
                /* Update active registers if not running */
                writel(duty_cycles, pc->mmio_base + CAP2);
                writel(period_cycles, pc->mmio_base + CAP1);
@@ -111,7 +111,7 @@ static int ecap_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
                writel(period_cycles, pc->mmio_base + CAP3);
        }
 
-       if (!test_bit(PWMF_ENABLED, &pwm->flags)) {
+       if (!pwm_is_enabled(pwm)) {
                reg_val = readw(pc->mmio_base + ECCTL2);
                /* Disable APWM mode to put APWM output Low */
                reg_val &= ~ECCTL2_APWM_MODE;
@@ -179,7 +179,7 @@ static void ecap_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm)
 
 static void ecap_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm)
 {
-       if (test_bit(PWMF_ENABLED, &pwm->flags)) {
+       if (pwm_is_enabled(pwm)) {
                dev_warn(chip->dev, "Removing PWM device without disabling\n");
                pm_runtime_put_sync(chip->dev);
        }
@@ -306,7 +306,7 @@ static int ecap_pwm_suspend(struct device *dev)
        ecap_pwm_save_context(pc);
 
        /* Disable explicitly if PWM is running */
-       if (test_bit(PWMF_ENABLED, &pwm->flags))
+       if (pwm_is_enabled(pwm))
                pm_runtime_put_sync(dev);
 
        return 0;
@@ -318,7 +318,7 @@ static int ecap_pwm_resume(struct device *dev)
        struct pwm_device *pwm = pc->chip.pwms;
 
        /* Enable explicitly if PWM was running */
-       if (test_bit(PWMF_ENABLED, &pwm->flags))
+       if (pwm_is_enabled(pwm))
                pm_runtime_get_sync(dev);
 
        ecap_pwm_restore_context(pc);
index 694b3cf7694be1bc14543d1ff1173a0486d2a4e2..6a41e66015b67fa84865891cbc3e97e12de6539f 100644 (file)
@@ -407,7 +407,7 @@ static void ehrpwm_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm)
 {
        struct ehrpwm_pwm_chip *pc = to_ehrpwm_pwm_chip(chip);
 
-       if (test_bit(PWMF_ENABLED, &pwm->flags)) {
+       if (pwm_is_enabled(pwm)) {
                dev_warn(chip->dev, "Removing PWM device without disabling\n");
                pm_runtime_put_sync(chip->dev);
        }
@@ -565,7 +565,7 @@ static int ehrpwm_pwm_suspend(struct device *dev)
        for (i = 0; i < pc->chip.npwm; i++) {
                struct pwm_device *pwm = &pc->chip.pwms[i];
 
-               if (!test_bit(PWMF_ENABLED, &pwm->flags))
+               if (!pwm_is_enabled(pwm))
                        continue;
 
                /* Disable explicitly if PWM is running */
@@ -582,7 +582,7 @@ static int ehrpwm_pwm_resume(struct device *dev)
        for (i = 0; i < pc->chip.npwm; i++) {
                struct pwm_device *pwm = &pc->chip.pwms[i];
 
-               if (!test_bit(PWMF_ENABLED, &pwm->flags))
+               if (!pwm_is_enabled(pwm))
                        continue;
 
                /* Enable explicitly if PWM was running */
index 4bd0c639e16da9d49598f637c5473ae37e419def..c472772f00a7880611e23e62cfbd1731d04dfa27 100644 (file)
@@ -46,7 +46,7 @@ static ssize_t pwm_period_show(struct device *child,
 {
        const struct pwm_device *pwm = child_to_pwm_device(child);
 
-       return sprintf(buf, "%u\n", pwm->period);
+       return sprintf(buf, "%u\n", pwm_get_period(pwm));
 }
 
 static ssize_t pwm_period_store(struct device *child,
@@ -61,7 +61,7 @@ static ssize_t pwm_period_store(struct device *child,
        if (ret)
                return ret;
 
-       ret = pwm_config(pwm, pwm->duty_cycle, val);
+       ret = pwm_config(pwm, pwm_get_duty_cycle(pwm), val);
 
        return ret ? : size;
 }
@@ -72,7 +72,7 @@ static ssize_t pwm_duty_cycle_show(struct device *child,
 {
        const struct pwm_device *pwm = child_to_pwm_device(child);
 
-       return sprintf(buf, "%u\n", pwm->duty_cycle);
+       return sprintf(buf, "%u\n", pwm_get_duty_cycle(pwm));
 }
 
 static ssize_t pwm_duty_cycle_store(struct device *child,
@@ -87,7 +87,7 @@ static ssize_t pwm_duty_cycle_store(struct device *child,
        if (ret)
                return ret;
 
-       ret = pwm_config(pwm, val, pwm->period);
+       ret = pwm_config(pwm, val, pwm_get_period(pwm));
 
        return ret ? : size;
 }
@@ -97,7 +97,7 @@ static ssize_t pwm_enable_show(struct device *child,
                               char *buf)
 {
        const struct pwm_device *pwm = child_to_pwm_device(child);
-       int enabled = test_bit(PWMF_ENABLED, &pwm->flags);
+       int enabled = pwm_is_enabled(pwm);
 
        return sprintf(buf, "%d\n", enabled);
 }
@@ -133,8 +133,19 @@ static ssize_t pwm_polarity_show(struct device *child,
                                 char *buf)
 {
        const struct pwm_device *pwm = child_to_pwm_device(child);
+       const char *polarity = "unknown";
 
-       return sprintf(buf, "%s\n", pwm->polarity ? "inversed" : "normal");
+       switch (pwm_get_polarity(pwm)) {
+       case PWM_POLARITY_NORMAL:
+               polarity = "normal";
+               break;
+
+       case PWM_POLARITY_INVERSED:
+               polarity = "inversed";
+               break;
+       }
+
+       return sprintf(buf, "%s\n", polarity);
 }
 
 static ssize_t pwm_polarity_store(struct device *child,
@@ -301,9 +312,9 @@ static struct attribute *pwm_chip_attrs[] = {
 ATTRIBUTE_GROUPS(pwm_chip);
 
 static struct class pwm_class = {
-       .name           = "pwm",
-       .owner          = THIS_MODULE,
-       .dev_groups     = pwm_chip_groups,
+       .name = "pwm",
+       .owner = THIS_MODULE,
+       .dev_groups = pwm_chip_groups,
 };
 
 static int pwmchip_sysfs_match(struct device *parent, const void *data)
index de9f272a0faf75d7562665aebba60444338f21c5..7a85ac9e32c5da9168c1e79d3ae82d230d9dd628 100644 (file)
@@ -1262,7 +1262,7 @@ static struct regulator *create_regulator(struct regulator_dev *rdev,
        regulator->debugfs = debugfs_create_dir(regulator->supply_name,
                                                rdev->debugfs);
        if (!regulator->debugfs) {
-               rdev_warn(rdev, "Failed to create debugfs directory\n");
+               rdev_dbg(rdev, "Failed to create debugfs directory\n");
        } else {
                debugfs_create_u32("uA_load", 0444, regulator->debugfs,
                                   &regulator->uA_load);
index d2d290413113b753412193c5f6e1c37fda5c3524..9aaf646ece55beddf718fd8ea0ce4b89eac3f478 100644 (file)
@@ -89,6 +89,7 @@ static int ath79_reset_probe(struct platform_device *pdev)
        if (IS_ERR(ath79_reset->base))
                return PTR_ERR(ath79_reset->base);
 
+       spin_lock_init(&ath79_reset->lock);
        ath79_reset->rcdev.ops = &ath79_reset_ops;
        ath79_reset->rcdev.owner = THIS_MODULE;
        ath79_reset->rcdev.of_node = pdev->dev.of_node;
index 533bfa3b60397a5fc3be70b5061b9f77c0c385b8..9d4290617cee5ab2969999ddb098db46b30f8e9d 100644 (file)
@@ -945,11 +945,11 @@ config RTC_DRV_DA9055
          will be called rtc-da9055
 
 config RTC_DRV_DA9063
-       tristate "Dialog Semiconductor DA9063 RTC"
-       depends on MFD_DA9063
+       tristate "Dialog Semiconductor DA9063/DA9062 RTC"
+       depends on MFD_DA9063 || MFD_DA9062
        help
          If you say yes here you will get support for the RTC subsystem
-         of the Dialog Semiconductor DA9063.
+         for the Dialog Semiconductor PMIC chips DA9063 and DA9062.
 
          This driver can also be built as a module. If so, the module
          will be called "rtc-da9063".
@@ -1116,6 +1116,13 @@ config RTC_DRV_OPAL
          This driver can also be built as a module. If so, the module
          will be called rtc-opal.
 
+config RTC_DRV_ZYNQMP
+       tristate "Xilinx Zynq Ultrascale+ MPSoC RTC"
+       depends on OF
+       help
+         If you say yes here you get support for the RTC controller found on
+         Xilinx Zynq Ultrascale+ MPSoC.
+
 comment "on-CPU RTC drivers"
 
 config RTC_DRV_DAVINCI
@@ -1306,11 +1313,13 @@ config RTC_DRV_GENERIC
          just say Y.
 
 config RTC_DRV_PXA
-       tristate "PXA27x/PXA3xx"
-       depends on ARCH_PXA
-       help
-        If you say Y here you will get access to the real time clock
-        built into your PXA27x or PXA3xx CPU.
+       tristate "PXA27x/PXA3xx"
+       depends on ARCH_PXA
+       select RTC_DRV_SA1100
+       help
+         If you say Y here you will get access to the real time clock
+         built into your PXA27x or PXA3xx CPU. This RTC is actually 2 RTCs
+         consisting of an SA1100 compatible RTC and the extended PXA RTC.
 
         This RTC driver uses PXA RTC registers available since pxa27x
         series (RDxR, RYxR) instead of legacy RCNR, RTAR.
@@ -1456,6 +1465,18 @@ config RTC_DRV_JZ4740
          This driver can also be built as a module. If so, the module
          will be called rtc-jz4740.
 
+config RTC_DRV_LPC24XX
+       tristate "NXP RTC for LPC178x/18xx/408x/43xx"
+       depends on ARCH_LPC18XX || COMPILE_TEST
+       depends on OF && HAS_IOMEM
+       help
+         This enables support for the NXP RTC found on
+         NXP LPC178x/18xx/408x/43xx devices.
+
+         If you have one of the devices above, enable this driver to use
+         the hardware RTC. This driver can also be built as a module. If
+         so, the module will be called rtc-lpc24xx.
+
 config RTC_DRV_LPC32XX
        depends on ARCH_LPC32XX
        tristate "NXP LPC32XX RTC"
index 1b09a62fcf4b8811b99c878ee2f3e630452e0098..e491eb524434220a09b766ba39cd369521b26244 100644 (file)
@@ -74,6 +74,7 @@ obj-$(CONFIG_RTC_DRV_ISL12057)        += rtc-isl12057.o
 obj-$(CONFIG_RTC_DRV_ISL1208)  += rtc-isl1208.o
 obj-$(CONFIG_RTC_DRV_JZ4740)   += rtc-jz4740.o
 obj-$(CONFIG_RTC_DRV_LP8788)   += rtc-lp8788.o
+obj-$(CONFIG_RTC_DRV_LPC24XX)  += rtc-lpc24xx.o
 obj-$(CONFIG_RTC_DRV_LPC32XX)  += rtc-lpc32xx.o
 obj-$(CONFIG_RTC_DRV_LOONGSON1)        += rtc-ls1x.o
 obj-$(CONFIG_RTC_DRV_M41T80)   += rtc-m41t80.o
@@ -158,3 +159,4 @@ obj-$(CONFIG_RTC_DRV_WM831X)        += rtc-wm831x.o
 obj-$(CONFIG_RTC_DRV_WM8350)   += rtc-wm8350.o
 obj-$(CONFIG_RTC_DRV_X1205)    += rtc-x1205.o
 obj-$(CONFIG_RTC_DRV_XGENE)    += rtc-xgene.o
+obj-$(CONFIG_RTC_DRV_ZYNQMP)   += rtc-zynqmp.o
index ea2a315df6b7bb5fb8fcae65cf4d21ce054f94b0..de86578bcd6d79ed4b4f7428a58930832709cbbf 100644 (file)
@@ -202,6 +202,7 @@ struct rtc_device *rtc_device_register(const char *name, struct device *dev,
        rtc->max_user_freq = 64;
        rtc->dev.parent = dev;
        rtc->dev.class = rtc_class;
+       rtc->dev.groups = rtc_get_dev_attribute_groups();
        rtc->dev.release = rtc_device_release;
 
        mutex_init(&rtc->ops_lock);
@@ -234,12 +235,12 @@ struct rtc_device *rtc_device_register(const char *name, struct device *dev,
 
        err = device_register(&rtc->dev);
        if (err) {
+               /* This will free both memory and the ID */
                put_device(&rtc->dev);
-               goto exit_kfree;
+               goto exit;
        }
 
        rtc_dev_add_device(rtc);
-       rtc_sysfs_add_device(rtc);
        rtc_proc_add_device(rtc);
 
        dev_info(dev, "rtc core: registered %s as %s\n",
@@ -247,9 +248,6 @@ struct rtc_device *rtc_device_register(const char *name, struct device *dev,
 
        return rtc;
 
-exit_kfree:
-       kfree(rtc);
-
 exit_ida:
        ida_simple_remove(&rtc_ida, id);
 
@@ -268,19 +266,17 @@ EXPORT_SYMBOL_GPL(rtc_device_register);
  */
 void rtc_device_unregister(struct rtc_device *rtc)
 {
-       if (get_device(&rtc->dev) != NULL) {
-               mutex_lock(&rtc->ops_lock);
-               /* remove innards of this RTC, then disable it, before
-                * letting any rtc_class_open() users access it again
-                */
-               rtc_sysfs_del_device(rtc);
-               rtc_dev_del_device(rtc);
-               rtc_proc_del_device(rtc);
-               device_unregister(&rtc->dev);
-               rtc->ops = NULL;
-               mutex_unlock(&rtc->ops_lock);
-               put_device(&rtc->dev);
-       }
+       mutex_lock(&rtc->ops_lock);
+       /*
+        * Remove innards of this RTC, then disable it, before
+        * letting any rtc_class_open() users access it again
+        */
+       rtc_dev_del_device(rtc);
+       rtc_proc_del_device(rtc);
+       device_del(&rtc->dev);
+       rtc->ops = NULL;
+       mutex_unlock(&rtc->ops_lock);
+       put_device(&rtc->dev);
 }
 EXPORT_SYMBOL_GPL(rtc_device_unregister);
 
@@ -363,7 +359,6 @@ static int __init rtc_init(void)
        }
        rtc_class->pm = RTC_CLASS_DEV_PM_OPS;
        rtc_dev_init();
-       rtc_sysfs_init(rtc_class);
        return 0;
 }
 
index 11b639067312f8485454cba5f039129044d4db4f..5836751b8203eb576a7af9f049c706ba27ee8d93 100644 (file)
@@ -564,7 +564,7 @@ enum hrtimer_restart rtc_pie_update_irq(struct hrtimer *timer)
 void rtc_update_irq(struct rtc_device *rtc,
                unsigned long num, unsigned long events)
 {
-       if (unlikely(IS_ERR_OR_NULL(rtc)))
+       if (IS_ERR_OR_NULL(rtc))
                return;
 
        pm_stay_awake(rtc->dev.parent);
index 7df0579d9852c03735fe5e9dab118ba141dd8122..466bf7f9a285a5c455a26d199083384d93a4e121 100644 (file)
@@ -251,17 +251,26 @@ static SIMPLE_DEV_PM_OPS(pm80x_rtc_pm_ops, pm80x_rtc_suspend, pm80x_rtc_resume);
 static int pm80x_rtc_probe(struct platform_device *pdev)
 {
        struct pm80x_chip *chip = dev_get_drvdata(pdev->dev.parent);
-       struct pm80x_platform_data *pm80x_pdata =
-                               dev_get_platdata(pdev->dev.parent);
-       struct pm80x_rtc_pdata *pdata = NULL;
+       struct pm80x_rtc_pdata *pdata = dev_get_platdata(&pdev->dev);
        struct pm80x_rtc_info *info;
+       struct device_node *node = pdev->dev.of_node;
        struct rtc_time tm;
        unsigned long ticks = 0;
        int ret;
 
-       pdata = dev_get_platdata(&pdev->dev);
-       if (pdata == NULL)
-               dev_warn(&pdev->dev, "No platform data!\n");
+       if (!pdata && !node) {
+               dev_err(&pdev->dev,
+                       "pm80x-rtc requires platform data or of_node\n");
+               return -EINVAL;
+       }
+
+       if (!pdata) {
+               pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL);
+               if (!pdata) {
+                       dev_err(&pdev->dev, "failed to allocate memory\n");
+                       return -ENOMEM;
+               }
+       }
 
        info =
            devm_kzalloc(&pdev->dev, sizeof(struct pm80x_rtc_info), GFP_KERNEL);
@@ -327,11 +336,8 @@ static int pm80x_rtc_probe(struct platform_device *pdev)
        regmap_update_bits(info->map, PM800_RTC_CONTROL, PM800_RTC1_USE_XO,
                           PM800_RTC1_USE_XO);
 
-       if (pm80x_pdata) {
-               pdata = pm80x_pdata->rtc;
-               if (pdata)
-                       info->rtc_dev->dev.platform_data = &pdata->rtc_wakeup;
-       }
+       /* remember whether this power up is caused by PMIC RTC or not */
+       info->rtc_dev->dev.platform_data = &pdata->rtc_wakeup;
 
        device_init_wakeup(&pdev->dev, 1);
 
index b5cbc1bf5a3e5fb50efd2dabc897598cef5365e6..a319bf1e49dea40cd43ee7f857dae16bd6b32af0 100644 (file)
@@ -1009,6 +1009,7 @@ static const struct of_device_id abb5zes3_dt_match[] = {
        { .compatible = "abracon,abb5zes3" },
        { },
 };
+MODULE_DEVICE_TABLE(of, abb5zes3_dt_match);
 #endif
 
 static const struct i2c_device_id abb5zes3_id[] = {
@@ -1020,7 +1021,6 @@ MODULE_DEVICE_TABLE(i2c, abb5zes3_id);
 static struct i2c_driver abb5zes3_driver = {
        .driver = {
                .name = DRV_NAME,
-               .owner = THIS_MODULE,
                .pm = &abb5zes3_rtc_pm_ops,
                .of_match_table = of_match_ptr(abb5zes3_dt_match),
        },
index 133d2e2e1a2590fca85cdfe8daa9c9e5930386e8..51407c4c7bd2be69ccb287575880ada7870bbcd0 100644 (file)
@@ -445,7 +445,9 @@ static const struct rtc_class_ops ab8540_rtc_ops = {
 static const struct platform_device_id ab85xx_rtc_ids[] = {
        { "ab8500-rtc", (kernel_ulong_t)&ab8500_rtc_ops, },
        { "ab8540-rtc", (kernel_ulong_t)&ab8540_rtc_ops, },
+       { /* sentinel */ }
 };
+MODULE_DEVICE_TABLE(platform, ab85xx_rtc_ids);
 
 static int ab8500_rtc_probe(struct platform_device *pdev)
 {
index 4337c3bc6acef06e41daf18032acd43398a16503..afea84c7a155cc962dae5f6b85ebbef106e8c44e 100644 (file)
@@ -28,7 +28,7 @@
 #define ABX8XX_REG_WD          0x07
 
 #define ABX8XX_REG_CTRL1       0x10
-#define ABX8XX_CTRL_WRITE      BIT(1)
+#define ABX8XX_CTRL_WRITE      BIT(0)
 #define ABX8XX_CTRL_12_24      BIT(6)
 
 #define ABX8XX_REG_CFG_KEY     0x1f
index 2b08cac62f07a5c6fed07b0598025234befe2c18..9a3f2a6f512e014b60b86974bdd6875e95e643ad 100644 (file)
@@ -40,13 +40,6 @@ struct armada38x_rtc {
        void __iomem        *regs;
        void __iomem        *regs_soc;
        spinlock_t          lock;
-       /*
-        * While setting the time, the RTC TIME register should not be
-        * accessed. Setting the RTC time involves sleeping during
-        * 100ms, so a mutex instead of a spinlock is used to protect
-        * it
-        */
-       struct mutex        mutex_time;
        int                 irq;
 };
 
@@ -64,9 +57,9 @@ static void rtc_delayed_write(u32 val, struct armada38x_rtc *rtc, int offset)
 static int armada38x_rtc_read_time(struct device *dev, struct rtc_time *tm)
 {
        struct armada38x_rtc *rtc = dev_get_drvdata(dev);
-       unsigned long time, time_check;
+       unsigned long time, time_check, flags;
 
-       mutex_lock(&rtc->mutex_time);
+       spin_lock_irqsave(&rtc->lock, flags);
        time = readl(rtc->regs + RTC_TIME);
        /*
         * WA for failing time set attempts. As stated in HW ERRATA if
@@ -77,7 +70,7 @@ static int armada38x_rtc_read_time(struct device *dev, struct rtc_time *tm)
        if ((time_check - time) > 1)
                time_check = readl(rtc->regs + RTC_TIME);
 
-       mutex_unlock(&rtc->mutex_time);
+       spin_unlock_irqrestore(&rtc->lock, flags);
 
        rtc_time_to_tm(time_check, tm);
 
@@ -88,23 +81,23 @@ static int armada38x_rtc_set_time(struct device *dev, struct rtc_time *tm)
 {
        struct armada38x_rtc *rtc = dev_get_drvdata(dev);
        int ret = 0;
-       unsigned long time;
+       unsigned long time, flags;
 
        ret = rtc_tm_to_time(tm, &time);
 
        if (ret)
                goto out;
        /*
-        * Setting the RTC time not always succeeds. According to the
-        * errata we need to first write on the status register and
-        * then wait for 100ms before writing to the time register to be
-        * sure that the data will be taken into account.
+        * According to errata FE-3124064, a write to the RTC TIME register
+        * may fail. As a workaround, after writing to the RTC TIME
+        * register, issue a dummy write of 0x0 twice to the RTC Status
+        * register.
         */
-       mutex_lock(&rtc->mutex_time);
-       rtc_delayed_write(0, rtc, RTC_STATUS);
-       msleep(100);
+       spin_lock_irqsave(&rtc->lock, flags);
        rtc_delayed_write(time, rtc, RTC_TIME);
-       mutex_unlock(&rtc->mutex_time);
+       rtc_delayed_write(0, rtc, RTC_STATUS);
+       rtc_delayed_write(0, rtc, RTC_STATUS);
+       spin_unlock_irqrestore(&rtc->lock, flags);
 
 out:
        return ret;
@@ -229,7 +222,6 @@ static __init int armada38x_rtc_probe(struct platform_device *pdev)
                return -ENOMEM;
 
        spin_lock_init(&rtc->lock);
-       mutex_init(&rtc->mutex_time);
 
        res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "rtc");
        rtc->regs = devm_ioremap_resource(&pdev->dev, res);
@@ -303,6 +295,7 @@ static const struct of_device_id armada38x_rtc_of_match_table[] = {
        { .compatible = "marvell,armada-380-rtc", },
        {}
 };
+MODULE_DEVICE_TABLE(of, armada38x_rtc_of_match_table);
 #endif
 
 static struct platform_driver armada38x_rtc_driver = {
index 9f38eda69154d4b2b5fdbf270638502d4f86157d..56cc5821118bfbcaf1c71eb5a334595d93fbc18e 100644 (file)
@@ -45,7 +45,7 @@ static void as3722_time_to_reg(u8 *rbuff, struct rtc_time *tm)
        rbuff[1] = bin2bcd(tm->tm_min);
        rbuff[2] = bin2bcd(tm->tm_hour);
        rbuff[3] = bin2bcd(tm->tm_mday);
-       rbuff[4] = bin2bcd(tm->tm_mon);
+       rbuff[4] = bin2bcd(tm->tm_mon + 1);
        rbuff[5] = bin2bcd(tm->tm_year - (AS3722_RTC_START_YEAR - 1900));
 }
 
@@ -55,7 +55,7 @@ static void as3722_reg_to_time(u8 *rbuff, struct rtc_time *tm)
        tm->tm_min = bcd2bin(rbuff[1] & 0x7F);
        tm->tm_hour = bcd2bin(rbuff[2] & 0x3F);
        tm->tm_mday = bcd2bin(rbuff[3] & 0x3F);
-       tm->tm_mon = bcd2bin(rbuff[4] & 0x1F);
+       tm->tm_mon = bcd2bin(rbuff[4] & 0x1F) - 1;
        tm->tm_year = (AS3722_RTC_START_YEAR - 1900) + bcd2bin(rbuff[5] & 0x7F);
        return;
 }
index 35efd3f75b1802a20198df1b6098d4fd0697cd1d..cb62e214b52a0066f0a0402c7d89d8b0de74b4fb 100644 (file)
  *
  */
 
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/platform_device.h>
-#include <linux/time.h>
-#include <linux/rtc.h>
 #include <linux/bcd.h>
+#include <linux/clk.h>
+#include <linux/completion.h>
 #include <linux/interrupt.h>
-#include <linux/spinlock.h>
 #include <linux/ioctl.h>
-#include <linux/completion.h>
 #include <linux/io.h>
-#include <linux/of.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
 #include <linux/of_device.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/rtc.h>
+#include <linux/spinlock.h>
 #include <linux/suspend.h>
+#include <linux/time.h>
 #include <linux/uaccess.h>
 
 #include "rtc-at91rm9200.h"
@@ -59,6 +60,7 @@ static bool suspended;
 static DEFINE_SPINLOCK(suspended_lock);
 static unsigned long cached_events;
 static u32 at91_rtc_imr;
+static struct clk *sclk;
 
 static void at91_rtc_write_ier(u32 mask)
 {
@@ -407,6 +409,16 @@ static int __init at91_rtc_probe(struct platform_device *pdev)
                return -ENOMEM;
        }
 
+       sclk = devm_clk_get(&pdev->dev, NULL);
+       if (IS_ERR(sclk))
+               return PTR_ERR(sclk);
+
+       ret = clk_prepare_enable(sclk);
+       if (ret) {
+               dev_err(&pdev->dev, "Could not enable slow clock\n");
+               return ret;
+       }
+
        at91_rtc_write(AT91_RTC_CR, 0);
        at91_rtc_write(AT91_RTC_MR, 0);         /* 24 hour mode */
 
@@ -420,7 +432,7 @@ static int __init at91_rtc_probe(struct platform_device *pdev)
                               "at91_rtc", pdev);
        if (ret) {
                dev_err(&pdev->dev, "IRQ %d already in use.\n", irq);
-               return ret;
+               goto err_clk;
        }
 
        /* cpu init code should really have flagged this device as
@@ -431,8 +443,10 @@ static int __init at91_rtc_probe(struct platform_device *pdev)
 
        rtc = devm_rtc_device_register(&pdev->dev, pdev->name,
                                &at91_rtc_ops, THIS_MODULE);
-       if (IS_ERR(rtc))
-               return PTR_ERR(rtc);
+       if (IS_ERR(rtc)) {
+               ret = PTR_ERR(rtc);
+               goto err_clk;
+       }
        platform_set_drvdata(pdev, rtc);
 
        /* enable SECEV interrupt in order to initialize at91_rtc_upd_rdy
@@ -442,6 +456,11 @@ static int __init at91_rtc_probe(struct platform_device *pdev)
 
        dev_info(&pdev->dev, "AT91 Real Time Clock driver.\n");
        return 0;
+
+err_clk:
+       clk_disable_unprepare(sclk);
+
+       return ret;
 }
 
 /*
@@ -454,6 +473,8 @@ static int __exit at91_rtc_remove(struct platform_device *pdev)
                                        AT91_RTC_SECEV | AT91_RTC_TIMEV |
                                        AT91_RTC_CALEV);
 
+       clk_disable_unprepare(sclk);
+
        return 0;
 }
 
index 5ccaee32df7223ad1aeb2d8cb74233caa7fae351..7206e2fa43837545a11f61a5b298843678b31636 100644 (file)
  * 2 of the License, or (at your option) any later version.
  */
 
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/platform_device.h>
-#include <linux/time.h>
-#include <linux/rtc.h>
+#include <linux/clk.h>
 #include <linux/interrupt.h>
 #include <linux/ioctl.h>
-#include <linux/slab.h>
-#include <linux/platform_data/atmel.h>
 #include <linux/io.h>
+#include <linux/kernel.h>
 #include <linux/mfd/syscon.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
 #include <linux/regmap.h>
+#include <linux/rtc.h>
+#include <linux/slab.h>
 #include <linux/suspend.h>
-#include <linux/clk.h>
+#include <linux/time.h>
 
 /*
  * This driver uses two configurable hardware resources that live in the
@@ -425,18 +425,19 @@ static int at91_rtc_probe(struct platform_device *pdev)
        if (IS_ERR(rtc->sclk))
                return PTR_ERR(rtc->sclk);
 
-       sclk_rate = clk_get_rate(rtc->sclk);
-       if (!sclk_rate || sclk_rate > AT91_RTT_RTPRES) {
-               dev_err(&pdev->dev, "Invalid slow clock rate\n");
-               return -EINVAL;
-       }
-
        ret = clk_prepare_enable(rtc->sclk);
        if (ret) {
                dev_err(&pdev->dev, "Could not enable slow clock\n");
                return ret;
        }
 
+       sclk_rate = clk_get_rate(rtc->sclk);
+       if (!sclk_rate || sclk_rate > AT91_RTT_RTPRES) {
+               dev_err(&pdev->dev, "Invalid slow clock rate\n");
+               ret = -EINVAL;
+               goto err_clk;
+       }
+
        mr = rtt_readl(rtc, MR);
 
        /* unless RTT is counting at 1 Hz, re-initialize it */
@@ -451,8 +452,10 @@ static int at91_rtc_probe(struct platform_device *pdev)
 
        rtc->rtcdev = devm_rtc_device_register(&pdev->dev, pdev->name,
                                        &at91_rtc_ops, THIS_MODULE);
-       if (IS_ERR(rtc->rtcdev))
-               return PTR_ERR(rtc->rtcdev);
+       if (IS_ERR(rtc->rtcdev)) {
+               ret = PTR_ERR(rtc->rtcdev);
+               goto err_clk;
+       }
 
        /* register irq handler after we know what name we'll use */
        ret = devm_request_irq(&pdev->dev, rtc->irq, at91_rtc_interrupt,
@@ -460,7 +463,7 @@ static int at91_rtc_probe(struct platform_device *pdev)
                               dev_name(&rtc->rtcdev->dev), rtc);
        if (ret) {
                dev_dbg(&pdev->dev, "can't share IRQ %d?\n", rtc->irq);
-               return ret;
+               goto err_clk;
        }
 
        /* NOTE:  sam9260 rev A silicon has a ROM bug which resets the
@@ -474,6 +477,11 @@ static int at91_rtc_probe(struct platform_device *pdev)
                                dev_name(&rtc->rtcdev->dev));
 
        return 0;
+
+err_clk:
+       clk_disable_unprepare(rtc->sclk);
+
+       return ret;
 }
 
 /*
@@ -487,8 +495,7 @@ static int at91_rtc_remove(struct platform_device *pdev)
        /* disable all interrupts */
        rtt_writel(rtc, MR, mr & ~(AT91_RTT_ALMIEN | AT91_RTT_RTTINCIEN));
 
-       if (!IS_ERR(rtc->sclk))
-               clk_disable_unprepare(rtc->sclk);
+       clk_disable_unprepare(rtc->sclk);
 
        return 0;
 }
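Both AT91 RTC drivers get the same reshaping of probe(): the slow clock is enabled first, the clk_get_rate() sanity check (in the at91sam9 variant) now runs after that, and every later failure goes through a single err_clk label so the clock is always disabled again; remove() now calls clk_disable_unprepare() unconditionally as well. A condensed sketch of that probe structure, with foo_register() as a placeholder for the registration and IRQ steps shown in the hunks above:

    #include <linux/clk.h>
    #include <linux/platform_device.h>

    /* Stand-in for the devm_rtc_device_register()/devm_request_irq() steps. */
    static int foo_register(struct platform_device *pdev)
    {
            return 0;
    }

    static int foo_probe_sketch(struct platform_device *pdev, struct clk *sclk)
    {
            unsigned long rate;
            int ret;

            ret = clk_prepare_enable(sclk);         /* enable the clock first */
            if (ret)
                    return ret;

            rate = clk_get_rate(sclk);              /* then validate its rate */
            if (!rate) {
                    ret = -EINVAL;
                    goto err_clk;
            }

            ret = foo_register(pdev);
            if (ret)
                    goto err_clk;                   /* all later failures unwind here */

            return 0;

    err_clk:
            clk_disable_unprepare(sclk);
            return ret;
    }

The point of the single exit label is that the clock never stays enabled after a failed probe, which the at91sam9 code previously got wrong whenever registration or the IRQ request failed after the clock had been enabled.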
index 3d44b11721ea03737578dc71657669cbdb195cd5..535a5f9338d026ec8f433fb4960dc7227728ddf6 100644 (file)
@@ -361,7 +361,7 @@ static int bfin_rtc_probe(struct platform_device *pdev)
        /* Register our RTC with the RTC framework */
        rtc->rtc_dev = devm_rtc_device_register(dev, pdev->name, &bfin_rtc_ops,
                                                THIS_MODULE);
-       if (unlikely(IS_ERR(rtc->rtc_dev)))
+       if (IS_ERR(rtc->rtc_dev))
                return PTR_ERR(rtc->rtc_dev);
 
        /* Grab the IRQ and init the hardware */
index 92679df6d6e222dd12e4f2471f9e1b3109048c50..0299988b4f136812bad0e35c4943826608d5a114 100644 (file)
@@ -212,7 +212,7 @@ static int bq32k_probe(struct i2c_client *client,
        if (error)
                return error;
 
-       if (client && client->dev.of_node)
+       if (client->dev.of_node)
                trickle_charger_of_init(dev, client->dev.of_node);
 
        rtc = devm_rtc_device_register(&client->dev, bq32k_driver.driver.name,
@@ -234,7 +234,6 @@ MODULE_DEVICE_TABLE(i2c, bq32k_id);
 static struct i2c_driver bq32k_driver = {
        .driver = {
                .name   = "bq32k",
-               .owner  = THIS_MODULE,
        },
        .probe          = bq32k_probe,
        .id_table       = bq32k_id,
index a82556a0757a2f18fe1b96201cdf601c3d867e51..8f7034ba7d9e30e5ef929ab0362186df249ed3be 100644 (file)
@@ -41,7 +41,6 @@
 #include <linux/pm.h>
 #include <linux/of.h>
 #include <linux/of_platform.h>
-#include <linux/dmi.h>
 
 /* this is for "generic access to PC-style RTC" using CMOS_READ/CMOS_WRITE */
 #include <asm-generic/rtc.h>
@@ -51,6 +50,7 @@ struct cmos_rtc {
        struct device           *dev;
        int                     irq;
        struct resource         *iomem;
+       time64_t                alarm_expires;
 
        void                    (*wake_on)(struct device *);
        void                    (*wake_off)(struct device *);
@@ -377,53 +377,11 @@ static int cmos_set_alarm(struct device *dev, struct rtc_wkalrm *t)
 
        spin_unlock_irq(&rtc_lock);
 
-       return 0;
-}
-
-/*
- * Do not disable RTC alarm on shutdown - workaround for b0rked BIOSes.
- */
-static bool alarm_disable_quirk;
+       cmos->alarm_expires = rtc_tm_to_time64(&t->time);
 
-static int __init set_alarm_disable_quirk(const struct dmi_system_id *id)
-{
-       alarm_disable_quirk = true;
-       pr_info("BIOS has alarm-disable quirk - RTC alarms disabled\n");
        return 0;
 }
 
-static const struct dmi_system_id rtc_quirks[] __initconst = {
-       /* https://bugzilla.novell.com/show_bug.cgi?id=805740 */
-       {
-               .callback = set_alarm_disable_quirk,
-               .ident    = "IBM Truman",
-               .matches  = {
-                       DMI_MATCH(DMI_SYS_VENDOR, "TOSHIBA"),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "4852570"),
-               },
-       },
-       /* https://bugzilla.novell.com/show_bug.cgi?id=812592 */
-       {
-               .callback = set_alarm_disable_quirk,
-               .ident    = "Gigabyte GA-990XA-UD3",
-               .matches  = {
-                       DMI_MATCH(DMI_SYS_VENDOR,
-                                       "Gigabyte Technology Co., Ltd."),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "GA-990XA-UD3"),
-               },
-       },
-       /* http://permalink.gmane.org/gmane.linux.kernel/1604474 */
-       {
-               .callback = set_alarm_disable_quirk,
-               .ident    = "Toshiba Satellite L300",
-               .matches  = {
-                       DMI_MATCH(DMI_SYS_VENDOR, "TOSHIBA"),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "Satellite L300"),
-               },
-       },
-       {}
-};
-
 static int cmos_alarm_irq_enable(struct device *dev, unsigned int enabled)
 {
        struct cmos_rtc *cmos = dev_get_drvdata(dev);
@@ -432,9 +390,6 @@ static int cmos_alarm_irq_enable(struct device *dev, unsigned int enabled)
        if (!is_valid_irq(cmos->irq))
                return -EINVAL;
 
-       if (alarm_disable_quirk)
-               return 0;
-
        spin_lock_irqsave(&rtc_lock, flags);
 
        if (enabled)
@@ -512,13 +467,6 @@ cmos_nvram_read(struct file *filp, struct kobject *kobj,
 {
        int     retval;
 
-       if (unlikely(off >= attr->size))
-               return 0;
-       if (unlikely(off < 0))
-               return -EINVAL;
-       if ((off + count) > attr->size)
-               count = attr->size - off;
-
        off += NVRAM_OFFSET;
        spin_lock_irq(&rtc_lock);
        for (retval = 0; count; count--, off++, retval++) {
@@ -543,12 +491,6 @@ cmos_nvram_write(struct file *filp, struct kobject *kobj,
        int             retval;
 
        cmos = dev_get_drvdata(container_of(kobj, struct device, kobj));
-       if (unlikely(off >= attr->size))
-               return -EFBIG;
-       if (unlikely(off < 0))
-               return -EINVAL;
-       if ((off + count) > attr->size)
-               count = attr->size - off;
 
        /* NOTE:  on at least PCs and Ataris, the boot firmware uses a
         * checksum on part of the NVRAM data.  That's currently ignored
@@ -860,6 +802,51 @@ static void __exit cmos_do_remove(struct device *dev)
        cmos->dev = NULL;
 }
 
+static int cmos_aie_poweroff(struct device *dev)
+{
+       struct cmos_rtc *cmos = dev_get_drvdata(dev);
+       struct rtc_time now;
+       time64_t t_now;
+       int retval = 0;
+       unsigned char rtc_control;
+
+       if (!cmos->alarm_expires)
+               return -EINVAL;
+
+       spin_lock_irq(&rtc_lock);
+       rtc_control = CMOS_READ(RTC_CONTROL);
+       spin_unlock_irq(&rtc_lock);
+
+       /* We only care about the situation where AIE is disabled. */
+       if (rtc_control & RTC_AIE)
+               return -EBUSY;
+
+       cmos_read_time(dev, &now);
+       t_now = rtc_tm_to_time64(&now);
+
+       /*
+        * When "RTC wake-up" is enabled in BIOS setup, some buggy boxes
+        * reboot automatically right after shutdown.
+        * That automatic reboot does not happen when the alarm time is
+        * more than one second in the future (larger than now+1 seconds).
+        *
+        * If the alarm time is equal to now+1 seconds, the issue can be
+        * prevented by cancelling the alarm.
+        */
+       if (cmos->alarm_expires == t_now + 1) {
+               struct rtc_wkalrm alarm;
+
+               /* Cancel the AIE timer by configuring the past time. */
+               rtc_time64_to_tm(t_now - 1, &alarm.time);
+               alarm.enabled = 0;
+               retval = cmos_set_alarm(dev, &alarm);
+       } else if (cmos->alarm_expires > t_now + 1) {
+               retval = -EBUSY;
+       }
+
+       return retval;
+}
+
 #ifdef CONFIG_PM
 
 static int cmos_suspend(struct device *dev)
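The new cmos_aie_poweroff() helper acts only in a narrow window: with the alarm interrupt (AIE) already disabled, an alarm that would fire exactly one second from now is cancelled by programming a time in the past, while an alarm further in the future makes the helper return -EBUSY. In the shutdown hunks below, cmos_do_shutdown() is then skipped only when cmos_aie_poweroff() returned an error and cmos_poweroff() itself succeeded. A tiny userspace model of the decision, assuming second-granularity timestamps like the time64_t values used above:

    #include <stdio.h>
    #include <time.h>

    /* Models cmos_aie_poweroff(): 0 means "done or nothing to do",
     * a negative value means "leave the alarm alone", mirroring the
     * -EINVAL/-EBUSY returns in the hunk above. */
    static int aie_poweroff_decision(long long alarm_expires, long long now)
    {
            if (!alarm_expires)
                    return -1;              /* no alarm recorded (-EINVAL) */
            if (alarm_expires == now + 1)
                    return 0;               /* the troublesome case: cancel it */
            if (alarm_expires > now + 1)
                    return -2;              /* genuine future alarm (-EBUSY) */
            return 0;                       /* alarm already in the past */
    }

    int main(void)
    {
            long long now = (long long)time(NULL);

            printf("alarm at now+1  -> %d (cancel)\n", aie_poweroff_decision(now + 1, now));
            printf("alarm at now+60 -> %d (keep it)\n", aie_poweroff_decision(now + 60, now));
            printf("no alarm        -> %d\n", aie_poweroff_decision(0, now));
            return 0;
    }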
@@ -1094,8 +1081,12 @@ static void cmos_pnp_shutdown(struct pnp_dev *pnp)
        struct device *dev = &pnp->dev;
        struct cmos_rtc *cmos = dev_get_drvdata(dev);
 
-       if (system_state == SYSTEM_POWER_OFF && !cmos_poweroff(dev))
-               return;
+       if (system_state == SYSTEM_POWER_OFF) {
+               int retval = cmos_poweroff(dev);
+
+               if (cmos_aie_poweroff(dev) < 0 && !retval)
+                       return;
+       }
 
        cmos_do_shutdown(cmos->irq);
 }
@@ -1200,8 +1191,12 @@ static void cmos_platform_shutdown(struct platform_device *pdev)
        struct device *dev = &pdev->dev;
        struct cmos_rtc *cmos = dev_get_drvdata(dev);
 
-       if (system_state == SYSTEM_POWER_OFF && !cmos_poweroff(dev))
-               return;
+       if (system_state == SYSTEM_POWER_OFF) {
+               int retval = cmos_poweroff(dev);
+
+               if (cmos_aie_poweroff(dev) < 0 && !retval)
+                       return;
+       }
 
        cmos_do_shutdown(cmos->irq);
 }
@@ -1243,8 +1238,6 @@ static int __init cmos_init(void)
                        platform_driver_registered = true;
        }
 
-       dmi_check_system(rtc_quirks);
-
        if (retval == 0)
                return 0;
 
index 56343b2fbc685c386ed577bd027933629197cf35..101b7a240e0fa8e482ef140b921c5b31811e829a 100644 (file)
@@ -263,6 +263,7 @@ static const struct of_device_id coh901331_dt_match[] = {
        { .compatible = "stericsson,coh901331" },
        {},
 };
+MODULE_DEVICE_TABLE(of, coh901331_dt_match);
 
 static struct platform_driver coh901331_driver = {
        .driver = {
index 5f9df7430a22728a7a1ce7941ef9f5898c476b70..a098aea197fc72dad6b0e1819c2ab739cb7dd042 100644 (file)
@@ -48,23 +48,10 @@ static inline void rtc_proc_del_device(struct rtc_device *rtc)
 #endif
 
 #ifdef CONFIG_RTC_INTF_SYSFS
-
-extern void __init rtc_sysfs_init(struct class *);
-extern void rtc_sysfs_add_device(struct rtc_device *rtc);
-extern void rtc_sysfs_del_device(struct rtc_device *rtc);
-
+const struct attribute_group **rtc_get_dev_attribute_groups(void);
 #else
-
-static inline void rtc_sysfs_init(struct class *rtc)
-{
-}
-
-static inline void rtc_sysfs_add_device(struct rtc_device *rtc)
+static inline const struct attribute_group **rtc_get_dev_attribute_groups(void)
 {
+       return NULL;
 }
-
-static inline void rtc_sysfs_del_device(struct rtc_device *rtc)
-{
-}
-
 #endif
index 7ffc5707f8b9da03b8d1e6088b4acfbd929cc26e..00a8f7f4f87cbc58426f11bb8cf83dfcf38ab90e 100644 (file)
  * Library General Public License for more details.
  */
 
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/init.h>
+#include <linux/of.h>
 #include <linux/platform_device.h>
-#include <linux/interrupt.h>
+#include <linux/regmap.h>
 #include <linux/rtc.h>
 #include <linux/slab.h>
-#include <linux/delay.h>
-#include <linux/regmap.h>
+
+#include <linux/mfd/da9062/registers.h>
 #include <linux/mfd/da9063/registers.h>
 #include <linux/mfd/da9063/core.h>
 
 #define YEARS_FROM_DA9063(year)                ((year) + 100)
 #define MONTHS_FROM_DA9063(month)      ((month) - 1)
 
-#define RTC_ALARM_DATA_LEN (DA9063_AD_REG_ALARM_Y - DA9063_AD_REG_ALARM_MI + 1)
-
-#define RTC_DATA_LEN   (DA9063_REG_COUNT_Y - DA9063_REG_COUNT_S + 1)
-#define RTC_SEC                0
-#define RTC_MIN                1
-#define RTC_HOUR       2
-#define RTC_DAY                3
-#define RTC_MONTH      4
-#define RTC_YEAR       5
-
-struct da9063_rtc {
-       struct rtc_device       *rtc_dev;
-       struct da9063           *hw;
-       struct rtc_time         alarm_time;
-       bool                    rtc_sync;
-       int                     alarm_year;
-       int                     alarm_start;
-       int                     alarm_len;
-       int                     data_start;
+enum {
+       RTC_SEC = 0,
+       RTC_MIN = 1,
+       RTC_HOUR = 2,
+       RTC_DAY = 3,
+       RTC_MONTH = 4,
+       RTC_YEAR = 5,
+       RTC_DATA_LEN
+};
+
+struct da9063_compatible_rtc_regmap {
+       /* REGS */
+       int rtc_enable_reg;
+       int rtc_enable_32k_crystal_reg;
+       int rtc_alarm_secs_reg;
+       int rtc_alarm_year_reg;
+       int rtc_count_secs_reg;
+       int rtc_count_year_reg;
+       int rtc_event_reg;
+       /* MASKS */
+       int rtc_enable_mask;
+       int rtc_crystal_mask;
+       int rtc_event_alarm_mask;
+       int rtc_alarm_on_mask;
+       int rtc_alarm_status_mask;
+       int rtc_tick_on_mask;
+       int rtc_ready_to_read_mask;
+       int rtc_count_sec_mask;
+       int rtc_count_min_mask;
+       int rtc_count_hour_mask;
+       int rtc_count_day_mask;
+       int rtc_count_month_mask;
+       int rtc_count_year_mask;
+       /* ALARM CONFIG */
+       int rtc_data_start;
+       int rtc_alarm_len;
+};
+
+struct da9063_compatible_rtc {
+       struct rtc_device *rtc_dev;
+       struct rtc_time alarm_time;
+       struct regmap *regmap;
+       const struct da9063_compatible_rtc_regmap *config;
+       bool rtc_sync;
+};
+
+static const struct da9063_compatible_rtc_regmap da9063_ad_regs = {
+       /* REGS */
+       .rtc_enable_reg             = DA9063_REG_CONTROL_E,
+       .rtc_alarm_secs_reg         = DA9063_AD_REG_ALARM_MI,
+       .rtc_alarm_year_reg         = DA9063_AD_REG_ALARM_Y,
+       .rtc_count_secs_reg         = DA9063_REG_COUNT_S,
+       .rtc_count_year_reg         = DA9063_REG_COUNT_Y,
+       .rtc_event_reg              = DA9063_REG_EVENT_A,
+       /* MASKS */
+       .rtc_enable_mask            = DA9063_RTC_EN,
+       .rtc_crystal_mask           = DA9063_CRYSTAL,
+       .rtc_enable_32k_crystal_reg = DA9063_REG_EN_32K,
+       .rtc_event_alarm_mask       = DA9063_E_ALARM,
+       .rtc_alarm_on_mask          = DA9063_ALARM_ON,
+       .rtc_alarm_status_mask      = DA9063_ALARM_STATUS_ALARM |
+                                     DA9063_ALARM_STATUS_TICK,
+       .rtc_tick_on_mask           = DA9063_TICK_ON,
+       .rtc_ready_to_read_mask     = DA9063_RTC_READ,
+       .rtc_count_sec_mask         = DA9063_COUNT_SEC_MASK,
+       .rtc_count_min_mask         = DA9063_COUNT_MIN_MASK,
+       .rtc_count_hour_mask        = DA9063_COUNT_HOUR_MASK,
+       .rtc_count_day_mask         = DA9063_COUNT_DAY_MASK,
+       .rtc_count_month_mask       = DA9063_COUNT_MONTH_MASK,
+       .rtc_count_year_mask        = DA9063_COUNT_YEAR_MASK,
+       /* ALARM CONFIG */
+       .rtc_data_start             = RTC_MIN,
+       .rtc_alarm_len              = RTC_DATA_LEN - 1,
+};
+
+static const struct da9063_compatible_rtc_regmap da9063_bb_regs = {
+       /* REGS */
+       .rtc_enable_reg             = DA9063_REG_CONTROL_E,
+       .rtc_alarm_secs_reg         = DA9063_BB_REG_ALARM_S,
+       .rtc_alarm_year_reg         = DA9063_BB_REG_ALARM_Y,
+       .rtc_count_secs_reg         = DA9063_REG_COUNT_S,
+       .rtc_count_year_reg         = DA9063_REG_COUNT_Y,
+       .rtc_event_reg              = DA9063_REG_EVENT_A,
+       /* MASKS */
+       .rtc_enable_mask            = DA9063_RTC_EN,
+       .rtc_crystal_mask           = DA9063_CRYSTAL,
+       .rtc_enable_32k_crystal_reg = DA9063_REG_EN_32K,
+       .rtc_event_alarm_mask       = DA9063_E_ALARM,
+       .rtc_alarm_on_mask          = DA9063_ALARM_ON,
+       .rtc_alarm_status_mask      = DA9063_ALARM_STATUS_ALARM |
+                                     DA9063_ALARM_STATUS_TICK,
+       .rtc_tick_on_mask           = DA9063_TICK_ON,
+       .rtc_ready_to_read_mask     = DA9063_RTC_READ,
+       .rtc_count_sec_mask         = DA9063_COUNT_SEC_MASK,
+       .rtc_count_min_mask         = DA9063_COUNT_MIN_MASK,
+       .rtc_count_hour_mask        = DA9063_COUNT_HOUR_MASK,
+       .rtc_count_day_mask         = DA9063_COUNT_DAY_MASK,
+       .rtc_count_month_mask       = DA9063_COUNT_MONTH_MASK,
+       .rtc_count_year_mask        = DA9063_COUNT_YEAR_MASK,
+       /* ALARM CONFIG */
+       .rtc_data_start             = RTC_SEC,
+       .rtc_alarm_len              = RTC_DATA_LEN,
+};
+
+static const struct da9063_compatible_rtc_regmap da9062_aa_regs = {
+       /* REGS */
+       .rtc_enable_reg             = DA9062AA_CONTROL_E,
+       .rtc_alarm_secs_reg         = DA9062AA_ALARM_S,
+       .rtc_alarm_year_reg         = DA9062AA_ALARM_Y,
+       .rtc_count_secs_reg         = DA9062AA_COUNT_S,
+       .rtc_count_year_reg         = DA9062AA_COUNT_Y,
+       .rtc_event_reg              = DA9062AA_EVENT_A,
+       /* MASKS */
+       .rtc_enable_mask            = DA9062AA_RTC_EN_MASK,
+       .rtc_crystal_mask           = DA9062AA_CRYSTAL_MASK,
+       .rtc_enable_32k_crystal_reg = DA9062AA_EN_32K,
+       .rtc_event_alarm_mask       = DA9062AA_M_ALARM_MASK,
+       .rtc_alarm_on_mask          = DA9062AA_ALARM_ON_MASK,
+       .rtc_alarm_status_mask      = (0x02 << 6),
+       .rtc_tick_on_mask           = DA9062AA_TICK_ON_MASK,
+       .rtc_ready_to_read_mask     = DA9062AA_RTC_READ_MASK,
+       .rtc_count_sec_mask         = DA9062AA_COUNT_SEC_MASK,
+       .rtc_count_min_mask         = DA9062AA_COUNT_MIN_MASK,
+       .rtc_count_hour_mask        = DA9062AA_COUNT_HOUR_MASK,
+       .rtc_count_day_mask         = DA9062AA_COUNT_DAY_MASK,
+       .rtc_count_month_mask       = DA9062AA_COUNT_MONTH_MASK,
+       .rtc_count_year_mask        = DA9062AA_COUNT_YEAR_MASK,
+       /* ALARM CONFIG */
+       .rtc_data_start             = RTC_SEC,
+       .rtc_alarm_len              = RTC_DATA_LEN,
+};
+
+static const struct of_device_id da9063_compatible_reg_id_table[] = {
+       { .compatible = "dlg,da9063-rtc", .data = &da9063_bb_regs },
+       { .compatible = "dlg,da9062-rtc", .data = &da9062_aa_regs },
+       { },
 };
+MODULE_DEVICE_TABLE(of, da9063_compatible_reg_id_table);
 
-static void da9063_data_to_tm(u8 *data, struct rtc_time *tm)
+static void da9063_data_to_tm(u8 *data, struct rtc_time *tm,
+                             struct da9063_compatible_rtc *rtc)
 {
-       tm->tm_sec  = data[RTC_SEC]  & DA9063_COUNT_SEC_MASK;
-       tm->tm_min  = data[RTC_MIN]  & DA9063_COUNT_MIN_MASK;
-       tm->tm_hour = data[RTC_HOUR] & DA9063_COUNT_HOUR_MASK;
-       tm->tm_mday = data[RTC_DAY]  & DA9063_COUNT_DAY_MASK;
+       const struct da9063_compatible_rtc_regmap *config = rtc->config;
+
+       tm->tm_sec  = data[RTC_SEC]  & config->rtc_count_sec_mask;
+       tm->tm_min  = data[RTC_MIN]  & config->rtc_count_min_mask;
+       tm->tm_hour = data[RTC_HOUR] & config->rtc_count_hour_mask;
+       tm->tm_mday = data[RTC_DAY]  & config->rtc_count_day_mask;
        tm->tm_mon  = MONTHS_FROM_DA9063(data[RTC_MONTH] &
-                                        DA9063_COUNT_MONTH_MASK);
+                                        config->rtc_count_month_mask);
        tm->tm_year = YEARS_FROM_DA9063(data[RTC_YEAR] &
-                                       DA9063_COUNT_YEAR_MASK);
+                                       config->rtc_count_year_mask);
 }
 
-static void da9063_tm_to_data(struct rtc_time *tm, u8 *data)
+static void da9063_tm_to_data(struct rtc_time *tm, u8 *data,
+                             struct da9063_compatible_rtc *rtc)
 {
-       data[RTC_SEC] &= ~DA9063_COUNT_SEC_MASK;
-       data[RTC_SEC] |= tm->tm_sec & DA9063_COUNT_SEC_MASK;
+       const struct da9063_compatible_rtc_regmap *config = rtc->config;
+
+       data[RTC_SEC] &= ~config->rtc_count_sec_mask;
+       data[RTC_SEC] |= tm->tm_sec & config->rtc_count_sec_mask;
 
-       data[RTC_MIN] &= ~DA9063_COUNT_MIN_MASK;
-       data[RTC_MIN] |= tm->tm_min & DA9063_COUNT_MIN_MASK;
+       data[RTC_MIN] &= ~config->rtc_count_min_mask;
+       data[RTC_MIN] |= tm->tm_min & config->rtc_count_min_mask;
 
-       data[RTC_HOUR] &= ~DA9063_COUNT_HOUR_MASK;
-       data[RTC_HOUR] |= tm->tm_hour & DA9063_COUNT_HOUR_MASK;
+       data[RTC_HOUR] &= ~config->rtc_count_hour_mask;
+       data[RTC_HOUR] |= tm->tm_hour & config->rtc_count_hour_mask;
 
-       data[RTC_DAY] &= ~DA9063_COUNT_DAY_MASK;
-       data[RTC_DAY] |= tm->tm_mday & DA9063_COUNT_DAY_MASK;
+       data[RTC_DAY] &= ~config->rtc_count_day_mask;
+       data[RTC_DAY] |= tm->tm_mday & config->rtc_count_day_mask;
 
-       data[RTC_MONTH] &= ~DA9063_COUNT_MONTH_MASK;
+       data[RTC_MONTH] &= ~config->rtc_count_month_mask;
        data[RTC_MONTH] |= MONTHS_TO_DA9063(tm->tm_mon) &
-                               DA9063_COUNT_MONTH_MASK;
+                               config->rtc_count_month_mask;
 
-       data[RTC_YEAR] &= ~DA9063_COUNT_YEAR_MASK;
+       data[RTC_YEAR] &= ~config->rtc_count_year_mask;
        data[RTC_YEAR] |= YEARS_TO_DA9063(tm->tm_year) &
-                               DA9063_COUNT_YEAR_MASK;
+                               config->rtc_count_year_mask;
 }
 
 static int da9063_rtc_stop_alarm(struct device *dev)
 {
-       struct da9063_rtc *rtc = dev_get_drvdata(dev);
+       struct da9063_compatible_rtc *rtc = dev_get_drvdata(dev);
+       const struct da9063_compatible_rtc_regmap *config = rtc->config;
 
-       return regmap_update_bits(rtc->hw->regmap, rtc->alarm_year,
-                                 DA9063_ALARM_ON, 0);
+       return regmap_update_bits(rtc->regmap,
+                                 config->rtc_alarm_year_reg,
+                                 config->rtc_alarm_on_mask,
+                                 0);
 }
 
 static int da9063_rtc_start_alarm(struct device *dev)
 {
-       struct da9063_rtc *rtc = dev_get_drvdata(dev);
+       struct da9063_compatible_rtc *rtc = dev_get_drvdata(dev);
+       const struct da9063_compatible_rtc_regmap *config = rtc->config;
 
-       return regmap_update_bits(rtc->hw->regmap, rtc->alarm_year,
-                                 DA9063_ALARM_ON, DA9063_ALARM_ON);
+       return regmap_update_bits(rtc->regmap,
+                                 config->rtc_alarm_year_reg,
+                                 config->rtc_alarm_on_mask,
+                                 config->rtc_alarm_on_mask);
 }
 
 static int da9063_rtc_read_time(struct device *dev, struct rtc_time *tm)
 {
-       struct da9063_rtc *rtc = dev_get_drvdata(dev);
+       struct da9063_compatible_rtc *rtc = dev_get_drvdata(dev);
+       const struct da9063_compatible_rtc_regmap *config = rtc->config;
        unsigned long tm_secs;
        unsigned long al_secs;
        u8 data[RTC_DATA_LEN];
        int ret;
 
-       ret = regmap_bulk_read(rtc->hw->regmap, DA9063_REG_COUNT_S,
+       ret = regmap_bulk_read(rtc->regmap,
+                              config->rtc_count_secs_reg,
                               data, RTC_DATA_LEN);
        if (ret < 0) {
                dev_err(dev, "Failed to read RTC time data: %d\n", ret);
                return ret;
        }
 
-       if (!(data[RTC_SEC] & DA9063_RTC_READ)) {
+       if (!(data[RTC_SEC] & config->rtc_ready_to_read_mask)) {
                dev_dbg(dev, "RTC not yet ready to be read by the host\n");
                return -EINVAL;
        }
 
-       da9063_data_to_tm(data, tm);
+       da9063_data_to_tm(data, tm, rtc);
 
        rtc_tm_to_time(tm, &tm_secs);
        rtc_tm_to_time(&rtc->alarm_time, &al_secs);
@@ -137,12 +272,14 @@ static int da9063_rtc_read_time(struct device *dev, struct rtc_time *tm)
 
 static int da9063_rtc_set_time(struct device *dev, struct rtc_time *tm)
 {
-       struct da9063_rtc *rtc = dev_get_drvdata(dev);
+       struct da9063_compatible_rtc *rtc = dev_get_drvdata(dev);
+       const struct da9063_compatible_rtc_regmap *config = rtc->config;
        u8 data[RTC_DATA_LEN];
        int ret;
 
-       da9063_tm_to_data(tm, data);
-       ret = regmap_bulk_write(rtc->hw->regmap, DA9063_REG_COUNT_S,
+       da9063_tm_to_data(tm, data, rtc);
+       ret = regmap_bulk_write(rtc->regmap,
+                               config->rtc_count_secs_reg,
                                data, RTC_DATA_LEN);
        if (ret < 0)
                dev_err(dev, "Failed to set RTC time data: %d\n", ret);
@@ -152,26 +289,31 @@ static int da9063_rtc_set_time(struct device *dev, struct rtc_time *tm)
 
 static int da9063_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm)
 {
-       struct da9063_rtc *rtc = dev_get_drvdata(dev);
+       struct da9063_compatible_rtc *rtc = dev_get_drvdata(dev);
+       const struct da9063_compatible_rtc_regmap *config = rtc->config;
        u8 data[RTC_DATA_LEN];
        int ret;
        unsigned int val;
 
        data[RTC_SEC] = 0;
-       ret = regmap_bulk_read(rtc->hw->regmap, rtc->alarm_start,
-                              &data[rtc->data_start], rtc->alarm_len);
+       ret = regmap_bulk_read(rtc->regmap,
+                              config->rtc_alarm_secs_reg,
+                              &data[config->rtc_data_start],
+                              config->rtc_alarm_len);
        if (ret < 0)
                return ret;
 
-       da9063_data_to_tm(data, &alrm->time);
+       da9063_data_to_tm(data, &alrm->time, rtc);
 
-       alrm->enabled = !!(data[RTC_YEAR] & DA9063_ALARM_ON);
+       alrm->enabled = !!(data[RTC_YEAR] & config->rtc_alarm_on_mask);
 
-       ret = regmap_read(rtc->hw->regmap, DA9063_REG_EVENT_A, &val);
+       ret = regmap_read(rtc->regmap,
+                         config->rtc_event_reg,
+                         &val);
        if (ret < 0)
                return ret;
 
-       if (val & (DA9063_E_ALARM))
+       if (val & config->rtc_event_alarm_mask)
                alrm->pending = 1;
        else
                alrm->pending = 0;
@@ -181,11 +323,12 @@ static int da9063_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm)
 
 static int da9063_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
 {
-       struct da9063_rtc *rtc = dev_get_drvdata(dev);
+       struct da9063_compatible_rtc *rtc = dev_get_drvdata(dev);
+       const struct da9063_compatible_rtc_regmap *config = rtc->config;
        u8 data[RTC_DATA_LEN];
        int ret;
 
-       da9063_tm_to_data(&alrm->time, data);
+       da9063_tm_to_data(&alrm->time, data, rtc);
 
        ret = da9063_rtc_stop_alarm(dev);
        if (ret < 0) {
@@ -193,14 +336,16 @@ static int da9063_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
                return ret;
        }
 
-       ret = regmap_bulk_write(rtc->hw->regmap, rtc->alarm_start,
-                              &data[rtc->data_start], rtc->alarm_len);
+       ret = regmap_bulk_write(rtc->regmap,
+                               config->rtc_alarm_secs_reg,
+                               &data[config->rtc_data_start],
+                               config->rtc_alarm_len);
        if (ret < 0) {
                dev_err(dev, "Failed to write alarm: %d\n", ret);
                return ret;
        }
 
-       da9063_data_to_tm(data, &rtc->alarm_time);
+       da9063_data_to_tm(data, &rtc->alarm_time, rtc);
 
        if (alrm->enabled) {
                ret = da9063_rtc_start_alarm(dev);
@@ -213,7 +358,8 @@ static int da9063_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
        return ret;
 }
 
-static int da9063_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled)
+static int da9063_rtc_alarm_irq_enable(struct device *dev,
+                                      unsigned int enabled)
 {
        if (enabled)
                return da9063_rtc_start_alarm(dev);
@@ -223,10 +369,13 @@ static int da9063_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled)
 
 static irqreturn_t da9063_alarm_event(int irq, void *data)
 {
-       struct da9063_rtc *rtc = data;
+       struct da9063_compatible_rtc *rtc = data;
+       const struct da9063_compatible_rtc_regmap *config = rtc->config;
 
-       regmap_update_bits(rtc->hw->regmap, rtc->alarm_year,
-                          DA9063_ALARM_ON, 0);
+       regmap_update_bits(rtc->regmap,
+                          config->rtc_alarm_year_reg,
+                          config->rtc_alarm_on_mask,
+                          0);
 
        rtc->rtc_sync = true;
        rtc_update_irq(rtc->rtc_dev, 1, RTC_IRQF | RTC_AF);
@@ -244,72 +393,92 @@ static const struct rtc_class_ops da9063_rtc_ops = {
 
 static int da9063_rtc_probe(struct platform_device *pdev)
 {
-       struct da9063 *da9063 = dev_get_drvdata(pdev->dev.parent);
-       struct da9063_rtc *rtc;
+       struct da9063_compatible_rtc *rtc;
+       const struct da9063_compatible_rtc_regmap *config;
+       const struct of_device_id *match;
        int irq_alarm;
        u8 data[RTC_DATA_LEN];
        int ret;
 
-       ret = regmap_update_bits(da9063->regmap, DA9063_REG_CONTROL_E,
-                                DA9063_RTC_EN, DA9063_RTC_EN);
-       if (ret < 0) {
-               dev_err(&pdev->dev, "Failed to enable RTC\n");
-               goto err;
-       }
+       if (!pdev->dev.of_node)
+               return -ENXIO;
 
-       ret = regmap_update_bits(da9063->regmap, DA9063_REG_EN_32K,
-                                DA9063_CRYSTAL, DA9063_CRYSTAL);
-       if (ret < 0) {
-               dev_err(&pdev->dev, "Failed to run 32kHz oscillator\n");
-               goto err;
-       }
+       match = of_match_node(da9063_compatible_reg_id_table,
+                             pdev->dev.of_node);
 
        rtc = devm_kzalloc(&pdev->dev, sizeof(*rtc), GFP_KERNEL);
        if (!rtc)
                return -ENOMEM;
 
-       if (da9063->variant_code == PMIC_DA9063_AD) {
-               rtc->alarm_year = DA9063_AD_REG_ALARM_Y;
-               rtc->alarm_start = DA9063_AD_REG_ALARM_MI;
-               rtc->alarm_len = RTC_ALARM_DATA_LEN;
-               rtc->data_start = RTC_MIN;
-       } else {
-               rtc->alarm_year = DA9063_BB_REG_ALARM_Y;
-               rtc->alarm_start = DA9063_BB_REG_ALARM_S;
-               rtc->alarm_len = RTC_DATA_LEN;
-               rtc->data_start = RTC_SEC;
+       rtc->config = match->data;
+       if (of_device_is_compatible(pdev->dev.of_node, "dlg,da9063-rtc")) {
+               struct da9063 *chip = dev_get_drvdata(pdev->dev.parent);
+
+               if (chip->variant_code == PMIC_DA9063_AD)
+                       rtc->config = &da9063_ad_regs;
        }
 
-       ret = regmap_update_bits(da9063->regmap, rtc->alarm_start,
-                       DA9063_ALARM_STATUS_TICK | DA9063_ALARM_STATUS_ALARM,
-                       0);
+       rtc->regmap = dev_get_regmap(pdev->dev.parent, NULL);
+       if (!rtc->regmap) {
+               dev_warn(&pdev->dev, "Parent regmap unavailable.\n");
+               return -ENXIO;
+       }
+
+       config = rtc->config;
+       ret = regmap_update_bits(rtc->regmap,
+                                config->rtc_enable_reg,
+                                config->rtc_enable_mask,
+                                config->rtc_enable_mask);
+       if (ret < 0) {
+               dev_err(&pdev->dev, "Failed to enable RTC\n");
+               return ret;
+       }
+
+       ret = regmap_update_bits(rtc->regmap,
+                                config->rtc_enable_32k_crystal_reg,
+                                config->rtc_crystal_mask,
+                                config->rtc_crystal_mask);
+       if (ret < 0) {
+               dev_err(&pdev->dev, "Failed to run 32kHz oscillator\n");
+               return ret;
+       }
+
+       ret = regmap_update_bits(rtc->regmap,
+                                config->rtc_alarm_secs_reg,
+                                config->rtc_alarm_status_mask,
+                                0);
        if (ret < 0) {
                dev_err(&pdev->dev, "Failed to access RTC alarm register\n");
-               goto err;
+               return ret;
        }
 
-       ret = regmap_update_bits(da9063->regmap, rtc->alarm_start,
+       ret = regmap_update_bits(rtc->regmap,
+                                config->rtc_alarm_secs_reg,
                                 DA9063_ALARM_STATUS_ALARM,
                                 DA9063_ALARM_STATUS_ALARM);
        if (ret < 0) {
                dev_err(&pdev->dev, "Failed to access RTC alarm register\n");
-               goto err;
+               return ret;
        }
 
-       ret = regmap_update_bits(da9063->regmap, rtc->alarm_year,
-                                DA9063_TICK_ON, 0);
+       ret = regmap_update_bits(rtc->regmap,
+                                config->rtc_alarm_year_reg,
+                                config->rtc_tick_on_mask,
+                                0);
        if (ret < 0) {
                dev_err(&pdev->dev, "Failed to disable TICKs\n");
-               goto err;
+               return ret;
        }
 
        data[RTC_SEC] = 0;
-       ret = regmap_bulk_read(da9063->regmap, rtc->alarm_start,
-                              &data[rtc->data_start], rtc->alarm_len);
+       ret = regmap_bulk_read(rtc->regmap,
+                              config->rtc_alarm_secs_reg,
+                              &data[config->rtc_data_start],
+                              config->rtc_alarm_len);
        if (ret < 0) {
                dev_err(&pdev->dev, "Failed to read initial alarm data: %d\n",
                        ret);
-               goto err;
+               return ret;
        }
 
        platform_set_drvdata(pdev, rtc);
@@ -322,18 +491,16 @@ static int da9063_rtc_probe(struct platform_device *pdev)
        if (ret) {
                dev_err(&pdev->dev, "Failed to request ALARM IRQ %d: %d\n",
                        irq_alarm, ret);
-               goto err;
+               return ret;
        }
 
-       rtc->hw = da9063;
        rtc->rtc_dev = devm_rtc_device_register(&pdev->dev, DA9063_DRVNAME_RTC,
                                           &da9063_rtc_ops, THIS_MODULE);
        if (IS_ERR(rtc->rtc_dev))
                return PTR_ERR(rtc->rtc_dev);
 
-       da9063_data_to_tm(data, &rtc->alarm_time);
+       da9063_data_to_tm(data, &rtc->alarm_time, rtc);
        rtc->rtc_sync = false;
-err:
        return ret;
 }
 
@@ -341,6 +508,7 @@ static struct platform_driver da9063_rtc_driver = {
        .probe          = da9063_rtc_probe,
        .driver         = {
                .name   = DA9063_DRVNAME_RTC,
+               .of_match_table = da9063_compatible_reg_id_table,
        },
 };
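The rework above replaces the old variant_code if/else with per-chip register and mask tables selected through the of_device_id .data pointer, which is how one driver now serves both the DA9063 (AD and BB register maps) and the DA9062. The selection pattern, reduced to its core; the foo names and register values below are placeholders, not taken from the patch:

    #include <linux/mod_devicetable.h>
    #include <linux/of.h>
    #include <linux/platform_device.h>
    #include <linux/regmap.h>

    struct foo_config {
            unsigned int enable_reg;
            unsigned int enable_mask;
    };

    static const struct foo_config foo_a_cfg = { .enable_reg = 0x10, .enable_mask = 0x01 };
    static const struct foo_config foo_b_cfg = { .enable_reg = 0x20, .enable_mask = 0x80 };

    static const struct of_device_id foo_of_match[] = {
            { .compatible = "vendor,foo-a", .data = &foo_a_cfg },
            { .compatible = "vendor,foo-b", .data = &foo_b_cfg },
            { }
    };

    static int foo_probe_sketch(struct platform_device *pdev)
    {
            const struct of_device_id *match;
            const struct foo_config *cfg;
            struct regmap *regmap;

            match = of_match_node(foo_of_match, pdev->dev.of_node);
            if (!match)
                    return -ENXIO;
            cfg = match->data;

            regmap = dev_get_regmap(pdev->dev.parent, NULL);
            if (!regmap)
                    return -ENXIO;

            /* From here on every register access goes through cfg, so one
             * probe/ops implementation covers all supported variants. */
            return regmap_update_bits(regmap, cfg->enable_reg,
                                      cfg->enable_mask, cfg->enable_mask);
    }

The da9063 probe keeps one extra twist on top of this: for "dlg,da9063-rtc" it overrides the matched config with da9063_ad_regs when the parent PMIC reports PMIC_DA9063_AD, since the AD silicon lays out the alarm registers differently.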
 
index 799c34bcb26f3b54cfc45100775990918727bb53..a6d9434addf6f79cc1a0b210f0ac75aaa2563637 100644 (file)
@@ -477,6 +477,7 @@ void rtc_dev_prepare(struct rtc_device *rtc)
 
        cdev_init(&rtc->char_dev, &rtc_dev_fops);
        rtc->char_dev.owner = rtc->owner;
+       rtc->char_dev.kobj.parent = &rtc->dev.kobj;
 }
 
 void rtc_dev_add_device(struct rtc_device *rtc)
index 12b07158a3664de6aaaa2e3f42ae2266182473be..baa5d047f9c826bbd8da5111e421d6e564ed89e8 100644 (file)
@@ -538,15 +538,6 @@ ds1305_nvram_read(struct file *filp, struct kobject *kobj,
 
        spi = container_of(kobj, struct spi_device, dev.kobj);
 
-       if (unlikely(off >= DS1305_NVRAM_LEN))
-               return 0;
-       if (count >= DS1305_NVRAM_LEN)
-               count = DS1305_NVRAM_LEN;
-       if ((off + count) > DS1305_NVRAM_LEN)
-               count = DS1305_NVRAM_LEN - off;
-       if (unlikely(!count))
-               return count;
-
        addr = DS1305_NVRAM + off;
        msg_init(&m, x, &addr, count, NULL, buf);
 
@@ -569,15 +560,6 @@ ds1305_nvram_write(struct file *filp, struct kobject *kobj,
 
        spi = container_of(kobj, struct spi_device, dev.kobj);
 
-       if (unlikely(off >= DS1305_NVRAM_LEN))
-               return -EFBIG;
-       if (count >= DS1305_NVRAM_LEN)
-               count = DS1305_NVRAM_LEN;
-       if ((off + count) > DS1305_NVRAM_LEN)
-               count = DS1305_NVRAM_LEN - off;
-       if (unlikely(!count))
-               return count;
-
        addr = (DS1305_WRITE | DS1305_NVRAM) + off;
        msg_init(&m, x, &addr, count, buf, NULL);
 
index 6e76de1856fc14c6ae5168cf0980735ec02d5550..a705e6490808f277e21102bf208a5e77c1d35bb1 100644 (file)
  * published by the Free Software Foundation.
  */
 
-#include <linux/module.h>
+#include <linux/bcd.h>
+#include <linux/i2c.h>
 #include <linux/init.h>
+#include <linux/module.h>
+#include <linux/of_device.h>
+#include <linux/of_irq.h>
+#include <linux/pm_wakeirq.h>
+#include <linux/rtc/ds1307.h>
+#include <linux/rtc.h>
 #include <linux/slab.h>
-#include <linux/i2c.h>
 #include <linux/string.h>
-#include <linux/rtc.h>
-#include <linux/bcd.h>
-#include <linux/rtc/ds1307.h>
 
 /*
  * We can't determine type by probing, but if we expect pre-Linux code
@@ -114,7 +117,7 @@ struct ds1307 {
 #define HAS_ALARM      1               /* bit 1 == irq claimed */
        struct i2c_client       *client;
        struct rtc_device       *rtc;
-       struct work_struct      work;
+       int                     wakeirq;
        s32 (*read_block_data)(const struct i2c_client *client, u8 command,
                               u8 length, u8 *values);
        s32 (*write_block_data)(const struct i2c_client *client, u8 command,
@@ -311,27 +314,17 @@ static s32 ds1307_native_smbus_read_block_data(const struct i2c_client *client,
 /*----------------------------------------------------------------------*/
 
 /*
- * The IRQ logic includes a "real" handler running in IRQ context just
- * long enough to schedule this workqueue entry.   We need a task context
- * to talk to the RTC, since I2C I/O calls require that; and disable the
- * IRQ until we clear its status on the chip, so that this handler can
- * work with any type of triggering (not just falling edge).
- *
  * The ds1337 and ds1339 both have two alarms, but we only use the first
  * one (with a "seconds" field).  For ds1337 we expect nINTA is our alarm
  * signal; ds1339 chips have only one alarm signal.
  */
-static void ds1307_work(struct work_struct *work)
+static irqreturn_t ds1307_irq(int irq, void *dev_id)
 {
-       struct ds1307           *ds1307;
-       struct i2c_client       *client;
-       struct mutex            *lock;
+       struct i2c_client       *client = dev_id;
+       struct ds1307           *ds1307 = i2c_get_clientdata(client);
+       struct mutex            *lock = &ds1307->rtc->ops_lock;
        int                     stat, control;
 
-       ds1307 = container_of(work, struct ds1307, work);
-       client = ds1307->client;
-       lock = &ds1307->rtc->ops_lock;
-
        mutex_lock(lock);
        stat = i2c_smbus_read_byte_data(client, DS1337_REG_STATUS);
        if (stat < 0)
@@ -352,18 +345,8 @@ static void ds1307_work(struct work_struct *work)
        }
 
 out:
-       if (test_bit(HAS_ALARM, &ds1307->flags))
-               enable_irq(client->irq);
        mutex_unlock(lock);
-}
 
-static irqreturn_t ds1307_irq(int irq, void *dev_id)
-{
-       struct i2c_client       *client = dev_id;
-       struct ds1307           *ds1307 = i2c_get_clientdata(client);
-
-       disable_irq_nosync(irq);
-       schedule_work(&ds1307->work);
        return IRQ_HANDLED;
 }
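The conversion above removes the workqueue bounce: instead of a hard-IRQ handler that disabled the line and scheduled ds1307_work(), the status and control register I/O now runs directly in a threaded interrupt handler, which may sleep over I2C. Registering it with IRQF_ONESHOT keeps the line masked until the thread returns, which is what the old disable_irq_nosync()/enable_irq() pair emulated by hand. The essential shape of the pattern (the actual request is in the probe hunk further down); the 0x0f status register here is a made-up example:

    #include <linux/i2c.h>
    #include <linux/interrupt.h>

    static irqreturn_t foo_irq_thread(int irq, void *dev_id)
    {
            struct i2c_client *client = dev_id;

            /* Runs in process context, so sleeping I2C transfers are fine. */
            i2c_smbus_read_byte_data(client, 0x0f /* hypothetical status reg */);
            return IRQ_HANDLED;
    }

    static int foo_request_irq(struct i2c_client *client)
    {
            /* NULL primary handler: with IRQF_ONESHOT the core keeps the
             * line masked until foo_irq_thread() has finished. */
            return devm_request_threaded_irq(&client->dev, client->irq,
                                             NULL, foo_irq_thread,
                                             IRQF_SHARED | IRQF_ONESHOT,
                                             "foo-rtc", client);
    }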
 
@@ -634,13 +617,14 @@ static const struct rtc_class_ops ds13xx_rtc_ops = {
                                         MCP794XX_BIT_ALMX_C1 | \
                                         MCP794XX_BIT_ALMX_C2)
 
-static void mcp794xx_work(struct work_struct *work)
+static irqreturn_t mcp794xx_irq(int irq, void *dev_id)
 {
-       struct ds1307 *ds1307 = container_of(work, struct ds1307, work);
-       struct i2c_client *client = ds1307->client;
+       struct i2c_client       *client = dev_id;
+       struct ds1307           *ds1307 = i2c_get_clientdata(client);
+       struct mutex            *lock = &ds1307->rtc->ops_lock;
        int reg, ret;
 
-       mutex_lock(&ds1307->rtc->ops_lock);
+       mutex_lock(lock);
 
        /* Check and clear alarm 0 interrupt flag. */
        reg = i2c_smbus_read_byte_data(client, MCP794XX_REG_ALARM0_CTRL);
@@ -665,9 +649,9 @@ static void mcp794xx_work(struct work_struct *work)
        rtc_update_irq(ds1307->rtc, 1, RTC_AF | RTC_IRQF);
 
 out:
-       if (test_bit(HAS_ALARM, &ds1307->flags))
-               enable_irq(client->irq);
-       mutex_unlock(&ds1307->rtc->ops_lock);
+       mutex_unlock(lock);
+
+       return IRQ_HANDLED;
 }
 
 static int mcp794xx_read_alarm(struct device *dev, struct rtc_wkalrm *t)
@@ -798,13 +782,6 @@ ds1307_nvram_read(struct file *filp, struct kobject *kobj,
        client = kobj_to_i2c_client(kobj);
        ds1307 = i2c_get_clientdata(client);
 
-       if (unlikely(off >= ds1307->nvram->size))
-               return 0;
-       if ((off + count) > ds1307->nvram->size)
-               count = ds1307->nvram->size - off;
-       if (unlikely(!count))
-               return count;
-
        result = ds1307->read_block_data(client, ds1307->nvram_offset + off,
                                                                count, buf);
        if (result < 0)
@@ -824,13 +801,6 @@ ds1307_nvram_write(struct file *filp, struct kobject *kobj,
        client = kobj_to_i2c_client(kobj);
        ds1307 = i2c_get_clientdata(client);
 
-       if (unlikely(off >= ds1307->nvram->size))
-               return -EFBIG;
-       if ((off + count) > ds1307->nvram->size)
-               count = ds1307->nvram->size - off;
-       if (unlikely(!count))
-               return count;
-
        result = ds1307->write_block_data(client, ds1307->nvram_offset + off,
                                                                count, buf);
        if (result < 0) {
@@ -896,6 +866,8 @@ static int ds1307_probe(struct i2c_client *client,
        bool                    want_irq = false;
        unsigned char           *buf;
        struct ds1307_platform_data *pdata = dev_get_platdata(&client->dev);
+       irq_handler_t   irq_handler = ds1307_irq;
+
        static const int        bbsqi_bitpos[] = {
                [ds_1337] = 0,
                [ds_1339] = DS1339_BIT_BBSQI,
@@ -962,8 +934,6 @@ static int ds1307_probe(struct i2c_client *client,
                 * running on Vbackup (BBSQI/BBSQW)
                 */
                if (ds1307->client->irq > 0 && chip->alarm) {
-                       INIT_WORK(&ds1307->work, ds1307_work);
-
                        ds1307->regs[0] |= DS1337_BIT_INTCN
                                        | bbsqi_bitpos[ds1307->type];
                        ds1307->regs[0] &= ~(DS1337_BIT_A2IE | DS1337_BIT_A1IE);
@@ -1053,7 +1023,7 @@ static int ds1307_probe(struct i2c_client *client,
        case mcp794xx:
                rtc_ops = &mcp794xx_rtc_ops;
                if (ds1307->client->irq > 0 && chip->alarm) {
-                       INIT_WORK(&ds1307->work, mcp794xx_work);
+                       irq_handler = mcp794xx_irq;
                        want_irq = true;
                }
                break;
@@ -1176,18 +1146,43 @@ read_rtc:
        }
 
        if (want_irq) {
-               err = request_irq(client->irq, ds1307_irq, IRQF_SHARED,
-                         ds1307->rtc->name, client);
+               struct device_node *node = client->dev.of_node;
+
+               err = devm_request_threaded_irq(&client->dev,
+                                               client->irq, NULL, irq_handler,
+                                               IRQF_SHARED | IRQF_ONESHOT,
+                                               ds1307->rtc->name, client);
                if (err) {
                        client->irq = 0;
                        dev_err(&client->dev, "unable to request IRQ!\n");
-               } else {
+                       goto no_irq;
+               }
+
+               set_bit(HAS_ALARM, &ds1307->flags);
+               dev_dbg(&client->dev, "got IRQ %d\n", client->irq);
+
+               /* Currently supported by OF code only! */
+               if (!node)
+                       goto no_irq;
+
+               err = of_irq_get(node, 1);
+               if (err <= 0) {
+                       if (err == -EPROBE_DEFER)
+                               goto exit;
+                       goto no_irq;
+               }
+               ds1307->wakeirq = err;
 
-                       set_bit(HAS_ALARM, &ds1307->flags);
-                       dev_dbg(&client->dev, "got IRQ %d\n", client->irq);
+               err = dev_pm_set_dedicated_wake_irq(&client->dev,
+                                                   ds1307->wakeirq);
+               if (err) {
+                       dev_err(&client->dev, "unable to setup wakeIRQ %d!\n",
+                               err);
+                       goto exit;
                }
        }
 
+no_irq:
        if (chip->nvram_size) {
 
                ds1307->nvram = devm_kzalloc(&client->dev,
@@ -1231,10 +1226,8 @@ static int ds1307_remove(struct i2c_client *client)
 {
        struct ds1307 *ds1307 = i2c_get_clientdata(client);
 
-       if (test_and_clear_bit(HAS_ALARM, &ds1307->flags)) {
-               free_irq(client->irq, client);
-               cancel_work_sync(&ds1307->work);
-       }
+       if (ds1307->wakeirq)
+               dev_pm_clear_wake_irq(&client->dev);
 
        if (test_and_clear_bit(HAS_NVRAM, &ds1307->flags))
                sysfs_remove_bin_file(&client->dev.kobj, ds1307->nvram);
@@ -1245,7 +1238,6 @@ static int ds1307_remove(struct i2c_client *client)
 static struct i2c_driver ds1307_driver = {
        .driver = {
                .name   = "rtc-ds1307",
-               .owner  = THIS_MODULE,
        },
        .probe          = ds1307_probe,
        .remove         = ds1307_remove,
index ae9f997223b1f4aad22b6b500855cd5eb1f548b3..79a06dd3c1856be19f6bbf4caa17666511e0e2c8 100644 (file)
@@ -162,12 +162,6 @@ static ssize_t ds1343_nvram_write(struct file *filp, struct kobject *kobj,
        struct device *dev = kobj_to_dev(kobj);
        struct ds1343_priv *priv = dev_get_drvdata(dev);
 
-       if (unlikely(!count))
-               return count;
-
-       if ((count + off) > DS1343_NVRAM_LEN)
-               count = DS1343_NVRAM_LEN - off;
-
        address = DS1343_NVRAM + off;
 
        ret = regmap_bulk_write(priv->map, address, buf, count);
@@ -187,12 +181,6 @@ static ssize_t ds1343_nvram_read(struct file *filp, struct kobject *kobj,
        struct device *dev = kobj_to_dev(kobj);
        struct ds1343_priv *priv = dev_get_drvdata(dev);
 
-       if (unlikely(!count))
-               return count;
-
-       if ((count + off) > DS1343_NVRAM_LEN)
-               count = DS1343_NVRAM_LEN - off;
-
        address = DS1343_NVRAM + off;
 
        ret = regmap_bulk_read(priv->map, address, buf, count);
index 72c9333752339ce322cc3b47af40865e0a1e1275..3b3049c8c9e04ddb8ebfb3c391eba41b320286f3 100644 (file)
@@ -664,8 +664,6 @@ static int ds1374_remove(struct i2c_client *client)
 {
        struct ds1374 *ds1374 = i2c_get_clientdata(client);
 #ifdef CONFIG_RTC_DRV_DS1374_WDT
-       int res;
-
        misc_deregister(&ds1374_miscdev);
        ds1374_miscdev.parent = NULL;
        unregister_reboot_notifier(&ds1374_wdt_notifier);
@@ -688,7 +686,7 @@ static int ds1374_suspend(struct device *dev)
 {
        struct i2c_client *client = to_i2c_client(dev);
 
-       if (client->irq >= 0 && device_may_wakeup(&client->dev))
+       if (client->irq > 0 && device_may_wakeup(&client->dev))
                enable_irq_wake(client->irq);
        return 0;
 }
@@ -697,7 +695,7 @@ static int ds1374_resume(struct device *dev)
 {
        struct i2c_client *client = to_i2c_client(dev);
 
-       if (client->irq >= 0 && device_may_wakeup(&client->dev))
+       if (client->irq > 0 && device_may_wakeup(&client->dev))
                disable_irq_wake(client->irq);
        return 0;
 }
@@ -708,7 +706,6 @@ static SIMPLE_DEV_PM_OPS(ds1374_pm, ds1374_suspend, ds1374_resume);
 static struct i2c_driver ds1374_driver = {
        .driver = {
                .name = "rtc-ds1374",
-               .owner = THIS_MODULE,
                .pm = &ds1374_pm,
        },
        .probe = ds1374_probe,
index 7415c2b4d6e8e5988baf6fcdcec198ba7a25c387..da3d04ce83bd81a2fa0ef9d7aea3ee51fb215303 100644 (file)
@@ -64,7 +64,7 @@ enum ds1511reg {
 #define DS1511_KIE     0x04
 #define DS1511_WDE     0x02
 #define DS1511_WDS     0x01
-#define DS1511_RAM_MAX 0xff
+#define DS1511_RAM_MAX 0x100
 
 #define RTC_CMD                DS1511_CONTROL_B
 #define RTC_CMD1       DS1511_CONTROL_A
@@ -159,7 +159,7 @@ ds1511_wdog_set(unsigned long deciseconds)
        /*
         * set wdog enable and wdog 'steering' bit to issue a reset
         */
-       rtc_write(DS1511_WDE | DS1511_WDS, RTC_CMD);
+       rtc_write(rtc_read(RTC_CMD) | DS1511_WDE | DS1511_WDS, RTC_CMD);
 }
 
 void
@@ -407,26 +407,10 @@ ds1511_nvram_read(struct file *filp, struct kobject *kobj,
 {
        ssize_t count;
 
-       /*
-        * if count is more than one, turn on "burst" mode
-        * turn it off when you're done
-        */
-       if (size > 1)
-               rtc_write((rtc_read(RTC_CMD) | DS1511_BME), RTC_CMD);
-
-       if (pos > DS1511_RAM_MAX)
-               pos = DS1511_RAM_MAX;
-
-       if (size + pos > DS1511_RAM_MAX + 1)
-               size = DS1511_RAM_MAX - pos + 1;
-
        rtc_write(pos, DS1511_RAMADDR_LSB);
-       for (count = 0; size > 0; count++, size--)
+       for (count = 0; count < size; count++)
                *buf++ = rtc_read(DS1511_RAMDATA);
 
-       if (count > 1)
-               rtc_write((rtc_read(RTC_CMD) & ~DS1511_BME), RTC_CMD);
-
        return count;
 }
 
@@ -437,26 +421,10 @@ ds1511_nvram_write(struct file *filp, struct kobject *kobj,
 {
        ssize_t count;
 
-       /*
-        * if count is more than one, turn on "burst" mode
-        * turn it off when you're done
-        */
-       if (size > 1)
-               rtc_write((rtc_read(RTC_CMD) | DS1511_BME), RTC_CMD);
-
-       if (pos > DS1511_RAM_MAX)
-               pos = DS1511_RAM_MAX;
-
-       if (size + pos > DS1511_RAM_MAX + 1)
-               size = DS1511_RAM_MAX - pos + 1;
-
        rtc_write(pos, DS1511_RAMADDR_LSB);
-       for (count = 0; size > 0; count++, size--)
+       for (count = 0; count < size; count++)
                rtc_write(*buf++, DS1511_RAMDATA);
 
-       if (count > 1)
-               rtc_write((rtc_read(RTC_CMD) & ~DS1511_BME), RTC_CMD);
-
        return count;
 }
 
@@ -490,7 +458,7 @@ static int ds1511_rtc_probe(struct platform_device *pdev)
        /*
         * turn on the clock and the crystal, etc.
         */
-       rtc_write(0, RTC_CMD);
+       rtc_write(DS1511_BME, RTC_CMD);
        rtc_write(0, RTC_CMD1);
        /*
         * clear the wdog counter
index a24e091bcb41d85f6443f63910a7f8661e4e03bb..38422ab4ec5a12687882e717b688a00d1a60ae1f 100644 (file)
@@ -245,7 +245,7 @@ static ssize_t ds1553_nvram_read(struct file *filp, struct kobject *kobj,
        void __iomem *ioaddr = pdata->ioaddr;
        ssize_t count;
 
-       for (count = 0; size > 0 && pos < RTC_OFFSET; count++, size--)
+       for (count = 0; count < size; count++)
                *buf++ = readb(ioaddr + pos++);
        return count;
 }
@@ -260,7 +260,7 @@ static ssize_t ds1553_nvram_write(struct file *filp, struct kobject *kobj,
        void __iomem *ioaddr = pdata->ioaddr;
        ssize_t count;
 
-       for (count = 0; size > 0 && pos < RTC_OFFSET; count++, size--)
+       for (count = 0; count < size; count++)
                writeb(*buf++, ioaddr + pos++);
        return count;
 }
index 818a3635a8c8675c2c7f750d19671b561bbd6835..05a51ef52703271dc195ec4fad27a498adcb5578 100644 (file)
@@ -2145,27 +2145,7 @@ static struct platform_driver ds1685_rtc_driver = {
        .probe          = ds1685_rtc_probe,
        .remove         = ds1685_rtc_remove,
 };
-
-/**
- * ds1685_rtc_init - rtc module init.
- */
-static int __init
-ds1685_rtc_init(void)
-{
-       return platform_driver_register(&ds1685_rtc_driver);
-}
-
-/**
- * ds1685_rtc_exit - rtc module exit.
- */
-static void __exit
-ds1685_rtc_exit(void)
-{
-       platform_driver_unregister(&ds1685_rtc_driver);
-}
-
-module_init(ds1685_rtc_init);
-module_exit(ds1685_rtc_exit);
+module_platform_driver(ds1685_rtc_driver);
 /* ----------------------------------------------------------------------- */
 
 
index 0f8d8ace15156fb82cd0c51394eb28a5b6ed0b5c..c5168b3bcf1a6024cb60bc48fa4f7dfca1476dc7 100644 (file)
@@ -134,7 +134,7 @@ static ssize_t ds1742_nvram_read(struct file *filp, struct kobject *kobj,
        void __iomem *ioaddr = pdata->ioaddr_nvram;
        ssize_t count;
 
-       for (count = 0; size > 0 && pos < pdata->size_nvram; count++, size--)
+       for (count = 0; count < size; count++)
                *buf++ = readb(ioaddr + pos++);
        return count;
 }
@@ -149,7 +149,7 @@ static ssize_t ds1742_nvram_write(struct file *filp, struct kobject *kobj,
        void __iomem *ioaddr = pdata->ioaddr_nvram;
        ssize_t count;
 
-       for (count = 0; size > 0 && pos < pdata->size_nvram; count++, size--)
+       for (count = 0; count < size; count++)
                writeb(*buf++, ioaddr + pos++);
        return count;
 }
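A recurring change in this batch (rtc-cmos, ds1305, ds1307, ds1343, ds1511, ds1553 and ds1742) is the removal of open-coded offset and length clamping from the NVRAM read/write callbacks. These callbacks back sysfs binary attributes, and the sysfs layer validates the requested window against the attribute's declared size before calling into the driver, so the per-driver checks appear to be redundant; with them gone, the copy loops can simply run for the requested count, as the hunks above now do. A userspace model of the clamp the core applies, with attr_size standing in for the bin_attribute .size field:

    #include <stdio.h>

    /* Models the window clamping done before a bin_attribute ->read()
     * is invoked: reads past the end return 0, partial reads are trimmed. */
    static long clamp_window(long off, long count, long attr_size)
    {
            if (off >= attr_size)
                    return 0;                       /* nothing left to read */
            if (off + count > attr_size)
                    count = attr_size - off;        /* trim to the attribute size */
            return count;
    }

    int main(void)
    {
            printf("%ld\n", clamp_window(0,   64, 256));    /* 64: fits entirely */
            printf("%ld\n", clamp_window(250, 64, 256));    /*  6: trimmed */
            printf("%ld\n", clamp_window(300, 64, 256));    /*  0: past the end */
            return 0;
    }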
index 7e48e532214fe3735c743f5edc180b2457480a70..4e99ace66f74d10a9beba95a2ec3208d79c50182 100644 (file)
@@ -443,7 +443,7 @@ static int ds3232_remove(struct i2c_client *client)
 {
        struct ds3232 *ds3232 = i2c_get_clientdata(client);
 
-       if (client->irq >= 0) {
+       if (client->irq > 0) {
                mutex_lock(&ds3232->mutex);
                ds3232->exiting = 1;
                mutex_unlock(&ds3232->mutex);
@@ -463,7 +463,10 @@ static int ds3232_suspend(struct device *dev)
 
        if (device_can_wakeup(dev)) {
                ds3232->suspended = true;
-               irq_set_irq_wake(client->irq, 1);
+               if (irq_set_irq_wake(client->irq, 1)) {
+                       dev_warn_once(dev, "Cannot set wakeup source\n");
+                       ds3232->suspended = false;
+               }
        }
 
        return 0;
@@ -500,7 +503,6 @@ MODULE_DEVICE_TABLE(i2c, ds3232_id);
 static struct i2c_driver ds3232_driver = {
        .driver = {
                .name = "rtc-ds3232",
-               .owner = THIS_MODULE,
                .pm     = &ds3232_pm_ops,
        },
        .probe = ds3232_probe,
index 83c3b3029fa774321332961b7abad8796088686b..576eadbba296799eca4bc2ad45d13769799a7cf1 100644 (file)
@@ -523,7 +523,6 @@ exit_free:
 static struct i2c_driver fm3130_driver = {
        .driver = {
                .name   = "rtc-fm3130",
-               .owner  = THIS_MODULE,
        },
        .probe          = fm3130_probe,
        .id_table       = fm3130_id,
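
The .owner = THIS_MODULE removals in the i2c drivers throughout this pull are safe because the registration path already records the owning module; the i2c_add_driver() wrapper in include/linux/i2c.h is simply:

#define i2c_add_driver(driver) \
        i2c_register_driver(THIS_MODULE, driver)

and i2c_register_driver() copies that module pointer into driver->driver.owner for the driver core.
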
index 35f4486738fcef0c496a631858ce3961c6c20793..e84184647d1525fe720c6ed73f86da92f3d634ce 100644 (file)
@@ -148,10 +148,7 @@ static int gemini_rtc_probe(struct platform_device *pdev)
 
        rtc->rtc_dev = rtc_device_register(pdev->name, dev,
                                           &gemini_rtc_ops, THIS_MODULE);
-       if (likely(IS_ERR(rtc->rtc_dev)))
-               return PTR_ERR(rtc->rtc_dev);
-
-       return 0;
+       return PTR_ERR_OR_ZERO(rtc->rtc_dev);
 }
 
 static int gemini_rtc_remove(struct platform_device *pdev)
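
PTR_ERR_OR_ZERO() also gets rid of the misleading likely() hint on what is really the unlikely error path. Its definition in include/linux/err.h is essentially:

static inline int __must_check PTR_ERR_OR_ZERO(__force const void *ptr)
{
        if (IS_ERR(ptr))
                return PTR_ERR(ptr);
        else
                return 0;
}
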
index e9da7959d3fe17983c0654c5a1f7c144bb266031..097325d96db566e08ec8b70aaa9d595f6a044235 100644 (file)
@@ -599,7 +599,6 @@ MODULE_DEVICE_TABLE(of, hym8563_dt_idtable);
 static struct i2c_driver hym8563_driver = {
        .driver         = {
                .name   = "rtc-hym8563",
-               .owner  = THIS_MODULE,
                .pm     = &hym8563_pm_ops,
                .of_match_table = hym8563_dt_idtable,
        },
index f9b082784b9064a234aa313ec6836361d172d0d6..839d1fd63cd78d233d1039c86f28e85ed2fc25a4 100644 (file)
@@ -151,12 +151,7 @@ static int isl12022_get_datetime(struct i2c_client *client, struct rtc_time *tm)
                tm->tm_sec, tm->tm_min, tm->tm_hour,
                tm->tm_mday, tm->tm_mon, tm->tm_year, tm->tm_wday);
 
-       /* The clock can give out invalid datetime, but we cannot return
-        * -EINVAL otherwise hwclock will refuse to set the time on bootup. */
-       if (rtc_valid_tm(tm) < 0)
-               dev_err(&client->dev, "retrieved date and time is invalid.\n");
-
-       return 0;
+       return rtc_valid_tm(tm);
 }
 
 static int isl12022_set_datetime(struct i2c_client *client, struct rtc_time *tm)
@@ -279,6 +274,7 @@ static const struct of_device_id isl12022_dt_match[] = {
        { .compatible = "isil,isl12022" },
        { },
 };
+MODULE_DEVICE_TABLE(of, isl12022_dt_match);
 #endif
 
 static const struct i2c_device_id isl12022_id[] = {
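
Several hunks in this series (isl12022, isl12057, moxart, mpc5121, rtc-mv, mt6397, and, for platform ID tables, max8997 and s5m) only add a MODULE_DEVICE_TABLE() for a match table that already existed. Without it the built-in match still works, but the module is never autoloaded because modpost emits no alias. A minimal sketch of the complete OF pattern, with hypothetical names:

static const struct of_device_id example_rtc_dt_match[] = {
        { .compatible = "vendor,example-rtc" },
        { /* sentinel */ }
};
/* Exports the table so modpost emits roughly
 * "alias of:N*T*Cvendor,example-rtc*", letting udev load the module
 * when the matching device-tree node shows up.
 */
MODULE_DEVICE_TABLE(of, example_rtc_dt_match);
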
index da818d3337cec5d2bd552ac046286763fbf4a1fa..a0462e5430c79c2378fef402b4a5270e18cdefa9 100644 (file)
@@ -648,6 +648,7 @@ static const struct of_device_id isl12057_dt_match[] = {
        { .compatible = "isil,isl12057" },
        { },
 };
+MODULE_DEVICE_TABLE(of, isl12057_dt_match);
 #endif
 
 static const struct i2c_device_id isl12057_id[] = {
@@ -659,7 +660,6 @@ MODULE_DEVICE_TABLE(i2c, isl12057_id);
 static struct i2c_driver isl12057_driver = {
        .driver = {
                .name = DRV_NAME,
-               .owner = THIS_MODULE,
                .pm = &isl12057_rtc_pm_ops,
                .of_match_table = of_match_ptr(isl12057_dt_match),
        },
diff --git a/drivers/rtc/rtc-lpc24xx.c b/drivers/rtc/rtc-lpc24xx.c
new file mode 100644 (file)
index 0000000..59d9959
--- /dev/null
@@ -0,0 +1,310 @@
+/*
+ * RTC driver for NXP LPC178x/18xx/43xx Real-Time Clock (RTC)
+ *
+ * Copyright (C) 2011 NXP Semiconductors
+ * Copyright (C) 2015 Joachim Eastwood <manabian@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <linux/clk.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/platform_device.h>
+#include <linux/rtc.h>
+
+/* LPC24xx RTC register offsets and bits */
+#define LPC24XX_ILR            0x00
+#define  LPC24XX_RTCCIF                BIT(0)
+#define  LPC24XX_RTCALF                BIT(1)
+#define LPC24XX_CTC            0x04
+#define LPC24XX_CCR            0x08
+#define  LPC24XX_CLKEN         BIT(0)
+#define  LPC178X_CCALEN                BIT(4)
+#define LPC24XX_CIIR           0x0c
+#define LPC24XX_AMR            0x10
+#define  LPC24XX_ALARM_DISABLE 0xff
+#define LPC24XX_CTIME0         0x14
+#define LPC24XX_CTIME1         0x18
+#define LPC24XX_CTIME2         0x1c
+#define LPC24XX_SEC            0x20
+#define LPC24XX_MIN            0x24
+#define LPC24XX_HOUR           0x28
+#define LPC24XX_DOM            0x2c
+#define LPC24XX_DOW            0x30
+#define LPC24XX_DOY            0x34
+#define LPC24XX_MONTH          0x38
+#define LPC24XX_YEAR           0x3c
+#define LPC24XX_ALSEC          0x60
+#define LPC24XX_ALMIN          0x64
+#define LPC24XX_ALHOUR         0x68
+#define LPC24XX_ALDOM          0x6c
+#define LPC24XX_ALDOW          0x70
+#define LPC24XX_ALDOY          0x74
+#define LPC24XX_ALMON          0x78
+#define LPC24XX_ALYEAR         0x7c
+
+/* Macros to read fields in consolidated time (CT) registers */
+#define CT0_SECS(x)            (((x) >> 0)  & 0x3f)
+#define CT0_MINS(x)            (((x) >> 8)  & 0x3f)
+#define CT0_HOURS(x)           (((x) >> 16) & 0x1f)
+#define CT0_DOW(x)             (((x) >> 24) & 0x07)
+#define CT1_DOM(x)             (((x) >> 0)  & 0x1f)
+#define CT1_MONTH(x)           (((x) >> 8)  & 0x0f)
+#define CT1_YEAR(x)            (((x) >> 16) & 0xfff)
+#define CT2_DOY(x)             (((x) >> 0)  & 0xfff)
+
+#define rtc_readl(dev, reg)            readl((dev)->rtc_base + (reg))
+#define rtc_writel(dev, reg, val)      writel((val), (dev)->rtc_base + (reg))
+
+struct lpc24xx_rtc {
+       void __iomem *rtc_base;
+       struct rtc_device *rtc;
+       struct clk *clk_rtc;
+       struct clk *clk_reg;
+};
+
+static int lpc24xx_rtc_set_time(struct device *dev, struct rtc_time *tm)
+{
+       struct lpc24xx_rtc *rtc = dev_get_drvdata(dev);
+
+       /* Disable RTC during update */
+       rtc_writel(rtc, LPC24XX_CCR, LPC178X_CCALEN);
+
+       rtc_writel(rtc, LPC24XX_SEC,    tm->tm_sec);
+       rtc_writel(rtc, LPC24XX_MIN,    tm->tm_min);
+       rtc_writel(rtc, LPC24XX_HOUR,   tm->tm_hour);
+       rtc_writel(rtc, LPC24XX_DOW,    tm->tm_wday);
+       rtc_writel(rtc, LPC24XX_DOM,    tm->tm_mday);
+       rtc_writel(rtc, LPC24XX_DOY,    tm->tm_yday);
+       rtc_writel(rtc, LPC24XX_MONTH,  tm->tm_mon);
+       rtc_writel(rtc, LPC24XX_YEAR,   tm->tm_year);
+
+       rtc_writel(rtc, LPC24XX_CCR, LPC24XX_CLKEN | LPC178X_CCALEN);
+
+       return 0;
+}
+
+static int lpc24xx_rtc_read_time(struct device *dev, struct rtc_time *tm)
+{
+       struct lpc24xx_rtc *rtc = dev_get_drvdata(dev);
+       u32 ct0, ct1, ct2;
+
+       ct0 = rtc_readl(rtc, LPC24XX_CTIME0);
+       ct1 = rtc_readl(rtc, LPC24XX_CTIME1);
+       ct2 = rtc_readl(rtc, LPC24XX_CTIME2);
+
+       tm->tm_sec  = CT0_SECS(ct0);
+       tm->tm_min  = CT0_MINS(ct0);
+       tm->tm_hour = CT0_HOURS(ct0);
+       tm->tm_wday = CT0_DOW(ct0);
+       tm->tm_mon  = CT1_MONTH(ct1);
+       tm->tm_mday = CT1_DOM(ct1);
+       tm->tm_year = CT1_YEAR(ct1);
+       tm->tm_yday = CT2_DOY(ct2);
+
+       return rtc_valid_tm(tm);
+}
+
+static int lpc24xx_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *wkalrm)
+{
+       struct lpc24xx_rtc *rtc = dev_get_drvdata(dev);
+       struct rtc_time *tm = &wkalrm->time;
+
+       tm->tm_sec  = rtc_readl(rtc, LPC24XX_ALSEC);
+       tm->tm_min  = rtc_readl(rtc, LPC24XX_ALMIN);
+       tm->tm_hour = rtc_readl(rtc, LPC24XX_ALHOUR);
+       tm->tm_mday = rtc_readl(rtc, LPC24XX_ALDOM);
+       tm->tm_wday = rtc_readl(rtc, LPC24XX_ALDOW);
+       tm->tm_yday = rtc_readl(rtc, LPC24XX_ALDOY);
+       tm->tm_mon  = rtc_readl(rtc, LPC24XX_ALMON);
+       tm->tm_year = rtc_readl(rtc, LPC24XX_ALYEAR);
+
+       wkalrm->enabled = rtc_readl(rtc, LPC24XX_AMR) == 0;
+       wkalrm->pending = !!(rtc_readl(rtc, LPC24XX_ILR) & LPC24XX_RTCCIF);
+
+       return rtc_valid_tm(&wkalrm->time);
+}
+
+static int lpc24xx_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *wkalrm)
+{
+       struct lpc24xx_rtc *rtc = dev_get_drvdata(dev);
+       struct rtc_time *tm = &wkalrm->time;
+
+       /* Disable alarm irq during update */
+       rtc_writel(rtc, LPC24XX_AMR, LPC24XX_ALARM_DISABLE);
+
+       rtc_writel(rtc, LPC24XX_ALSEC,  tm->tm_sec);
+       rtc_writel(rtc, LPC24XX_ALMIN,  tm->tm_min);
+       rtc_writel(rtc, LPC24XX_ALHOUR, tm->tm_hour);
+       rtc_writel(rtc, LPC24XX_ALDOM,  tm->tm_mday);
+       rtc_writel(rtc, LPC24XX_ALDOW,  tm->tm_wday);
+       rtc_writel(rtc, LPC24XX_ALDOY,  tm->tm_yday);
+       rtc_writel(rtc, LPC24XX_ALMON,  tm->tm_mon);
+       rtc_writel(rtc, LPC24XX_ALYEAR, tm->tm_year);
+
+       if (wkalrm->enabled)
+               rtc_writel(rtc, LPC24XX_AMR, 0);
+
+       return 0;
+}
+
+static int lpc24xx_rtc_alarm_irq_enable(struct device *dev, unsigned int enable)
+{
+       struct lpc24xx_rtc *rtc = dev_get_drvdata(dev);
+
+       if (enable)
+               rtc_writel(rtc, LPC24XX_AMR, 0);
+       else
+               rtc_writel(rtc, LPC24XX_AMR, LPC24XX_ALARM_DISABLE);
+
+       return 0;
+}
+
+static irqreturn_t lpc24xx_rtc_interrupt(int irq, void *data)
+{
+       unsigned long events = RTC_IRQF;
+       struct lpc24xx_rtc *rtc = data;
+       u32 rtc_iir;
+
+       /* Check interrupt cause */
+       rtc_iir = rtc_readl(rtc, LPC24XX_ILR);
+       if (rtc_iir & LPC24XX_RTCALF) {
+               events |= RTC_AF;
+               rtc_writel(rtc, LPC24XX_AMR, LPC24XX_ALARM_DISABLE);
+       }
+
+       /* Clear interrupt status and report event */
+       rtc_writel(rtc, LPC24XX_ILR, rtc_iir);
+       rtc_update_irq(rtc->rtc, 1, events);
+
+       return IRQ_HANDLED;
+}
+
+static const struct rtc_class_ops lpc24xx_rtc_ops = {
+       .read_time              = lpc24xx_rtc_read_time,
+       .set_time               = lpc24xx_rtc_set_time,
+       .read_alarm             = lpc24xx_rtc_read_alarm,
+       .set_alarm              = lpc24xx_rtc_set_alarm,
+       .alarm_irq_enable       = lpc24xx_rtc_alarm_irq_enable,
+};
+
+static int lpc24xx_rtc_probe(struct platform_device *pdev)
+{
+       struct lpc24xx_rtc *rtc;
+       struct resource *res;
+       int irq, ret;
+
+       rtc = devm_kzalloc(&pdev->dev, sizeof(*rtc), GFP_KERNEL);
+       if (!rtc)
+               return -ENOMEM;
+
+       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       rtc->rtc_base = devm_ioremap_resource(&pdev->dev, res);
+       if (IS_ERR(rtc->rtc_base))
+               return PTR_ERR(rtc->rtc_base);
+
+       irq = platform_get_irq(pdev, 0);
+       if (irq < 0) {
+               dev_warn(&pdev->dev, "can't get interrupt resource\n");
+               return irq;
+       }
+
+       rtc->clk_rtc = devm_clk_get(&pdev->dev, "rtc");
+       if (IS_ERR(rtc->clk_rtc)) {
+               dev_err(&pdev->dev, "error getting rtc clock\n");
+               return PTR_ERR(rtc->clk_rtc);
+       }
+
+       rtc->clk_reg = devm_clk_get(&pdev->dev, "reg");
+       if (IS_ERR(rtc->clk_reg)) {
+               dev_err(&pdev->dev, "error getting reg clock\n");
+               return PTR_ERR(rtc->clk_reg);
+       }
+
+       ret = clk_prepare_enable(rtc->clk_rtc);
+       if (ret) {
+               dev_err(&pdev->dev, "unable to enable rtc clock\n");
+               return ret;
+       }
+
+       ret = clk_prepare_enable(rtc->clk_reg);
+       if (ret) {
+               dev_err(&pdev->dev, "unable to enable reg clock\n");
+               goto disable_rtc_clk;
+       }
+
+       platform_set_drvdata(pdev, rtc);
+
+       /* Clear any pending interrupts */
+       rtc_writel(rtc, LPC24XX_ILR, LPC24XX_RTCCIF | LPC24XX_RTCALF);
+
+       /* Enable RTC count */
+       rtc_writel(rtc, LPC24XX_CCR, LPC24XX_CLKEN | LPC178X_CCALEN);
+
+       ret = devm_request_irq(&pdev->dev, irq, lpc24xx_rtc_interrupt, 0,
+                              pdev->name, rtc);
+       if (ret < 0) {
+               dev_warn(&pdev->dev, "can't request interrupt\n");
+               goto disable_clks;
+       }
+
+       rtc->rtc = devm_rtc_device_register(&pdev->dev, "lpc24xx-rtc",
+                                           &lpc24xx_rtc_ops, THIS_MODULE);
+       if (IS_ERR(rtc->rtc)) {
+               dev_err(&pdev->dev, "can't register rtc device\n");
+               ret = PTR_ERR(rtc->rtc);
+               goto disable_clks;
+       }
+
+       return 0;
+
+disable_clks:
+       clk_disable_unprepare(rtc->clk_reg);
+disable_rtc_clk:
+       clk_disable_unprepare(rtc->clk_rtc);
+       return ret;
+}
+
+static int lpc24xx_rtc_remove(struct platform_device *pdev)
+{
+       struct lpc24xx_rtc *rtc = platform_get_drvdata(pdev);
+
+       /* Ensure all interrupt sources are masked */
+       rtc_writel(rtc, LPC24XX_AMR, LPC24XX_ALARM_DISABLE);
+       rtc_writel(rtc, LPC24XX_CIIR, 0);
+
+       rtc_writel(rtc, LPC24XX_CCR, LPC178X_CCALEN);
+
+       clk_disable_unprepare(rtc->clk_rtc);
+       clk_disable_unprepare(rtc->clk_reg);
+
+       return 0;
+}
+
+static const struct of_device_id lpc24xx_rtc_match[] = {
+       { .compatible = "nxp,lpc1788-rtc" },
+       { }
+};
+MODULE_DEVICE_TABLE(of, lpc24xx_rtc_match);
+
+static struct platform_driver lpc24xx_rtc_driver = {
+       .probe  = lpc24xx_rtc_probe,
+       .remove = lpc24xx_rtc_remove,
+       .driver = {
+               .name = "lpc24xx-rtc",
+               .of_match_table = lpc24xx_rtc_match,
+       },
+};
+module_platform_driver(lpc24xx_rtc_driver);
+
+MODULE_AUTHOR("Kevin Wells <wellsk40@gmail.com>");
+MODULE_DESCRIPTION("RTC driver for the LPC178x/18xx/408x/43xx SoCs");
+MODULE_LICENSE("GPL");
index 90abb5bd589c8e2998594c7c3312e3f4c314cd47..d99a705bec07ac28103bc1d19ce52b97ffd25385 100644 (file)
@@ -345,11 +345,12 @@ static ssize_t m48t59_nvram_read(struct file *filp, struct kobject *kobj,
        ssize_t cnt = 0;
        unsigned long flags;
 
-       for (; size > 0 && pos < pdata->offset; cnt++, size--) {
-               spin_lock_irqsave(&m48t59->lock, flags);
+       spin_lock_irqsave(&m48t59->lock, flags);
+
+       for (; cnt < size; cnt++)
                *buf++ = M48T59_READ(cnt);
-               spin_unlock_irqrestore(&m48t59->lock, flags);
-       }
+
+       spin_unlock_irqrestore(&m48t59->lock, flags);
 
        return cnt;
 }
@@ -365,11 +366,12 @@ static ssize_t m48t59_nvram_write(struct file *filp, struct kobject *kobj,
        ssize_t cnt = 0;
        unsigned long flags;
 
-       for (; size > 0 && pos < pdata->offset; cnt++, size--) {
-               spin_lock_irqsave(&m48t59->lock, flags);
+       spin_lock_irqsave(&m48t59->lock, flags);
+
+       for (; cnt < size; cnt++)
                M48T59_WRITE(*buf++, cnt);
-               spin_unlock_irqrestore(&m48t59->lock, flags);
-       }
+
+       spin_unlock_irqrestore(&m48t59->lock, flags);
 
        return cnt;
 }
index 9e02bcda0c0915c11ecf6adaf6347d8d0e671033..db984d4bf9526bbc78e501ff8da6a7036801872b 100644 (file)
@@ -521,6 +521,7 @@ static const struct platform_device_id rtc_id[] = {
        { "max8997-rtc", 0 },
        {},
 };
+MODULE_DEVICE_TABLE(platform, rtc_id);
 
 static struct platform_driver max8997_rtc_driver = {
        .driver         = {
index 73759c9a4527aeb1cce098be6d1b7f48f47af55a..07b30a373a929f2e5391cc8f04efb5c3613789d5 100644 (file)
@@ -312,6 +312,7 @@ static const struct of_device_id moxart_rtc_match[] = {
        { .compatible = "moxa,moxart-rtc" },
        { },
 };
+MODULE_DEVICE_TABLE(of, moxart_rtc_match);
 
 static struct platform_driver moxart_rtc_driver = {
        .probe  = moxart_rtc_probe,
index 1767e18d5bd4846ad60fe9c4081105d76d04459e..4ca4daa0b8f32ae3761f759245332d1baccaf8c0 100644 (file)
@@ -406,6 +406,7 @@ static const struct of_device_id mpc5121_rtc_match[] = {
        { .compatible = "fsl,mpc5200-rtc", },
        {},
 };
+MODULE_DEVICE_TABLE(of, mpc5121_rtc_match);
 #endif
 
 static struct platform_driver mpc5121_rtc_driver = {
index eab230be5a54fdfcfd9c41f39e0d9f1cb515a9e4..06a5c52b292f292fd619a4049ba264e84b064f70 100644 (file)
@@ -373,15 +373,42 @@ static int mtk_rtc_remove(struct platform_device *pdev)
        return 0;
 }
 
+#ifdef CONFIG_PM_SLEEP
+static int mt6397_rtc_suspend(struct device *dev)
+{
+       struct mt6397_rtc *rtc = dev_get_drvdata(dev);
+
+       if (device_may_wakeup(dev))
+               enable_irq_wake(rtc->irq);
+
+       return 0;
+}
+
+static int mt6397_rtc_resume(struct device *dev)
+{
+       struct mt6397_rtc *rtc = dev_get_drvdata(dev);
+
+       if (device_may_wakeup(dev))
+               disable_irq_wake(rtc->irq);
+
+       return 0;
+}
+#endif
+
+static SIMPLE_DEV_PM_OPS(mt6397_pm_ops, mt6397_rtc_suspend,
+                       mt6397_rtc_resume);
+
 static const struct of_device_id mt6397_rtc_of_match[] = {
        { .compatible = "mediatek,mt6397-rtc", },
        { }
 };
+MODULE_DEVICE_TABLE(of, mt6397_rtc_of_match);
 
 static struct platform_driver mtk_rtc_driver = {
        .driver = {
                .name = "mt6397-rtc",
                .of_match_table = mt6397_rtc_of_match,
+               .pm = &mt6397_pm_ops,
        },
        .probe  = mtk_rtc_probe,
        .remove = mtk_rtc_remove,
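
SIMPLE_DEV_PM_OPS() wires up only the system-sleep callbacks, and the ops table it builds is empty when CONFIG_PM_SLEEP is off, which is why the new handlers above sit inside #ifdef CONFIG_PM_SLEEP. From include/linux/pm.h it is approximately:

#define SIMPLE_DEV_PM_OPS(name, suspend_fn, resume_fn) \
const struct dev_pm_ops name = { \
        SET_SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \
}
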
index 7f50d2ef7f6ef43590b745d37abd3ded9edb7b22..79bb28617d458ec99f77518472f2f1505960d2b0 100644 (file)
@@ -324,6 +324,7 @@ static const struct of_device_id rtc_mv_of_match_table[] = {
        { .compatible = "marvell,orion-rtc", },
        {}
 };
+MODULE_DEVICE_TABLE(of, rtc_mv_of_match_table);
 #endif
 
 static struct platform_driver mv_rtc_driver = {
index 8b6355ffaff990621354dcb8687230f527faf2b0..ec2e9c5fb993c7023c9262af5c0b691905443bff 100644 (file)
@@ -25,6 +25,7 @@
 #include <linux/of_device.h>
 #include <linux/pm_runtime.h>
 #include <linux/io.h>
+#include <linux/clk.h>
 
 /*
  * The OMAP RTC is a year/month/day/hours/minutes/seconds BCD clock
 
 /* OMAP_RTC_OSC_REG bit fields: */
 #define OMAP_RTC_OSC_32KCLK_EN         BIT(6)
+#define OMAP_RTC_OSC_SEL_32KCLK_SRC    BIT(3)
 
 /* OMAP_RTC_IRQWAKEEN bit fields: */
 #define OMAP_RTC_IRQWAKEEN_ALARM_WAKEEN        BIT(1)
@@ -132,10 +134,12 @@ struct omap_rtc_device_type {
 struct omap_rtc {
        struct rtc_device *rtc;
        void __iomem *base;
+       struct clk *clk;
        int irq_alarm;
        int irq_timer;
        u8 interrupts_reg;
        bool is_pmic_controller;
+       bool has_ext_clk;
        const struct omap_rtc_device_type *type;
 };
 
@@ -553,6 +557,15 @@ static int omap_rtc_probe(struct platform_device *pdev)
        if (rtc->irq_alarm <= 0)
                return -ENOENT;
 
+       rtc->clk = devm_clk_get(&pdev->dev, "ext-clk");
+       if (!IS_ERR(rtc->clk))
+               rtc->has_ext_clk = true;
+       else
+               rtc->clk = devm_clk_get(&pdev->dev, "int-clk");
+
+       if (!IS_ERR(rtc->clk))
+               clk_prepare_enable(rtc->clk);
+
        res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
        rtc->base = devm_ioremap_resource(&pdev->dev, res);
        if (IS_ERR(rtc->base))
@@ -627,6 +640,16 @@ static int omap_rtc_probe(struct platform_device *pdev)
        if (reg != new_ctrl)
                rtc_write(rtc, OMAP_RTC_CTRL_REG, new_ctrl);
 
+       /*
+        * If we have the external clock then switch to it so we can keep
+        * ticking across suspend.
+        */
+       if (rtc->has_ext_clk) {
+               reg = rtc_read(rtc, OMAP_RTC_OSC_REG);
+               rtc_write(rtc, OMAP_RTC_OSC_REG,
+                         reg | OMAP_RTC_OSC_SEL_32KCLK_SRC);
+       }
+
        rtc->type->lock(rtc);
 
        device_init_wakeup(&pdev->dev, true);
@@ -672,6 +695,7 @@ err:
 static int __exit omap_rtc_remove(struct platform_device *pdev)
 {
        struct omap_rtc *rtc = platform_get_drvdata(pdev);
+       u8 reg;
 
        if (pm_power_off == omap_rtc_power_off &&
                        omap_rtc_power_off_rtc == rtc) {
@@ -681,10 +705,19 @@ static int __exit omap_rtc_remove(struct platform_device *pdev)
 
        device_init_wakeup(&pdev->dev, 0);
 
+       if (!IS_ERR(rtc->clk))
+               clk_disable_unprepare(rtc->clk);
+
        rtc->type->unlock(rtc);
        /* leave rtc running, but disable irqs */
        rtc_write(rtc, OMAP_RTC_INTERRUPTS_REG, 0);
 
+       if (rtc->has_ext_clk) {
+               reg = rtc_read(rtc, OMAP_RTC_OSC_REG);
+               reg &= ~OMAP_RTC_OSC_SEL_32KCLK_SRC;
+               rtc_write(rtc, OMAP_RTC_OSC_REG, reg);
+       }
+
        rtc->type->lock(rtc);
 
        /* Disable the clock/module */
index 7061dcae2b09d51ee746715f85eb530948300c0a..6fbf9e617151d1d2376816e725c2c55e8b7e61eb 100644 (file)
@@ -190,11 +190,9 @@ exit:
        return rc;
 }
 
-static const struct rtc_class_ops opal_rtc_ops = {
+static struct rtc_class_ops opal_rtc_ops = {
        .read_time      = opal_get_rtc_time,
        .set_time       = opal_set_rtc_time,
-       .read_alarm     = opal_get_tpo_time,
-       .set_alarm      = opal_set_tpo_time,
 };
 
 static int opal_rtc_probe(struct platform_device *pdev)
@@ -202,8 +200,11 @@ static int opal_rtc_probe(struct platform_device *pdev)
        struct rtc_device *rtc;
 
        if (pdev->dev.of_node && of_get_property(pdev->dev.of_node, "has-tpo",
-                                                NULL))
+                                                NULL)) {
                device_set_wakeup_capable(&pdev->dev, true);
+               opal_rtc_ops.read_alarm = opal_get_tpo_time;
+               opal_rtc_ops.set_alarm = opal_set_tpo_time;
+       }
 
        rtc = devm_rtc_device_register(&pdev->dev, DRVNAME, &opal_rtc_ops,
                                       THIS_MODULE);
@@ -236,7 +237,6 @@ static struct platform_driver opal_rtc_driver = {
        .id_table       = opal_rtc_driver_ids,
        .driver         = {
                .name           = DRVNAME,
-               .owner          = THIS_MODULE,
                .of_match_table = opal_rtc_match,
        },
 };
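
Dropping const from opal_rtc_ops and installing .read_alarm/.set_alarm only when the firmware node has the has-tpo property works because the RTC core treats missing callbacks as "no alarm support". Simplified from drivers/rtc/interface.c:

/* sketch: how rtc_read_alarm() reacts to an ops table without callbacks */
if (!rtc->ops)
        err = -ENODEV;
else if (!rtc->ops->read_alarm)
        err = -EINVAL;
else
        err = rtc->ops->read_alarm(rtc->dev.parent, alarm);
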
index 8a7556cbcb7f016b29764786bf184a7da071687e..1c47650fe624fea3bd9e7b709f6c50e4ff2d8eea 100644 (file)
@@ -165,13 +165,7 @@ static int pcf2123_rtc_read_time(struct device *dev, struct rtc_time *tm)
                        tm->tm_sec, tm->tm_min, tm->tm_hour,
                        tm->tm_mday, tm->tm_mon, tm->tm_year, tm->tm_wday);
 
-       /* the clock can give out invalid datetime, but we cannot return
-        * -EINVAL otherwise hwclock will refuse to set the time on bootup.
-        */
-       if (rtc_valid_tm(tm) < 0)
-               dev_err(dev, "retrieved date/time is not valid.\n");
-
-       return 0;
+       return rtc_valid_tm(tm);
 }
 
 static int pcf2123_rtc_set_time(struct device *dev, struct rtc_time *tm)
index 9bd842e977492d6afc93a651403b0eb9db809cf9..4b11d31f71740b4de9897ec46be1ec0b4cb39835 100644 (file)
 #define PCF2127_REG_MO          (0x08)
 #define PCF2127_REG_YR          (0x09)
 
+#define PCF2127_OSF             BIT(7)  /* Oscillator Fail flag */
+
 static struct i2c_driver pcf2127_driver;
 
 struct pcf2127 {
        struct rtc_device *rtc;
        int voltage_low; /* indicates if a low_voltage was detected */
+       int oscillator_failed; /* OSF was detected and date is unreliable */
 };
 
 /*
@@ -59,7 +62,18 @@ static int pcf2127_get_datetime(struct i2c_client *client, struct rtc_time *tm)
        if (buf[PCF2127_REG_CTRL3] & 0x04) {
                pcf2127->voltage_low = 1;
                dev_info(&client->dev,
-                       "low voltage detected, date/time is not reliable.\n");
+                       "low voltage detected, check/replace RTC battery.\n");
+       }
+
+       if (buf[PCF2127_REG_SC] & PCF2127_OSF) {
+               /*
+                * no need to clear the flag here;
+                * it will be cleared once the new date is saved
+                */
+               pcf2127->oscillator_failed = 1;
+               dev_warn(&client->dev,
+                        "oscillator stop detected, date/time is not reliable\n");
+               return -EINVAL;
        }
 
        dev_dbg(&client->dev,
@@ -88,17 +102,12 @@ static int pcf2127_get_datetime(struct i2c_client *client, struct rtc_time *tm)
                tm->tm_sec, tm->tm_min, tm->tm_hour,
                tm->tm_mday, tm->tm_mon, tm->tm_year, tm->tm_wday);
 
-       /* the clock can give out invalid datetime, but we cannot return
-        * -EINVAL otherwise hwclock will refuse to set the time on bootup.
-        */
-       if (rtc_valid_tm(tm) < 0)
-               dev_err(&client->dev, "retrieved date/time is not valid.\n");
-
-       return 0;
+       return rtc_valid_tm(tm);
 }
 
 static int pcf2127_set_datetime(struct i2c_client *client, struct rtc_time *tm)
 {
+       struct pcf2127 *pcf2127 = i2c_get_clientdata(client);
        unsigned char buf[8];
        int i = 0, err;
 
@@ -112,7 +121,7 @@ static int pcf2127_set_datetime(struct i2c_client *client, struct rtc_time *tm)
        buf[i++] = PCF2127_REG_SC;
 
        /* hours, minutes and seconds */
-       buf[i++] = bin2bcd(tm->tm_sec);
+       buf[i++] = bin2bcd(tm->tm_sec); /* this will also clear OSF flag */
        buf[i++] = bin2bcd(tm->tm_min);
        buf[i++] = bin2bcd(tm->tm_hour);
        buf[i++] = bin2bcd(tm->tm_mday);
@@ -132,6 +141,9 @@ static int pcf2127_set_datetime(struct i2c_client *client, struct rtc_time *tm)
                return -EIO;
        }
 
+       /* clear OSF flag in client data */
+       pcf2127->oscillator_failed = 0;
+
        return 0;
 }
 
@@ -144,7 +156,9 @@ static int pcf2127_rtc_ioctl(struct device *dev,
        switch (cmd) {
        case RTC_VL_READ:
                if (pcf2127->voltage_low)
-                       dev_info(dev, "low voltage detected, date/time is not reliable.\n");
+                       dev_info(dev, "low voltage detected, check/replace battery\n");
+               if (pcf2127->oscillator_failed)
+                       dev_info(dev, "oscillator stop detected, date/time is not reliable\n");
 
                if (copy_to_user((void __user *)arg, &pcf2127->voltage_low,
                                        sizeof(int)))
@@ -217,7 +231,6 @@ MODULE_DEVICE_TABLE(of, pcf2127_of_match);
 static struct i2c_driver pcf2127_driver = {
        .driver         = {
                .name   = "rtc-pcf2127",
-               .owner  = THIS_MODULE,
                .of_match_table = of_match_ptr(pcf2127_of_match),
        },
        .probe          = pcf2127_probe,
index 6a12bf62c504bf736a7f1b371d8546a275b1dd03..b6d73dd881f248bce67976754fb5f932477d5686 100644 (file)
@@ -189,7 +189,6 @@ MODULE_DEVICE_TABLE(of, pcf85063_of_match);
 static struct i2c_driver pcf85063_driver = {
        .driver         = {
                .name   = "rtc-pcf85063",
-               .owner  = THIS_MODULE,
                .of_match_table = of_match_ptr(pcf85063_of_match),
        },
        .probe          = pcf85063_probe,
index 4cdb64be061bd7d175417581010d7f702ed6b006..e7ebcc0b7e59b55eecfc8daf7d2540a4b4282217 100644 (file)
@@ -334,7 +334,6 @@ MODULE_DEVICE_TABLE(of, pcf8523_of_match);
 static struct i2c_driver pcf8523_driver = {
        .driver = {
                .name = DRIVER_NAME,
-               .owner = THIS_MODULE,
                .of_match_table = of_match_ptr(pcf8523_of_match),
        },
        .probe = pcf8523_probe,
index 8bba022be946ebc4380bc692c78637ff487c292a..e569243db57efb4f5aca9f4634047a6bae3072b5 100644 (file)
@@ -483,7 +483,6 @@ MODULE_DEVICE_TABLE(of, pcf8563_of_match);
 static struct i2c_driver pcf8563_driver = {
        .driver         = {
                .name   = "rtc-pcf8563",
-               .owner  = THIS_MODULE,
                .of_match_table = of_match_ptr(pcf8563_of_match),
        },
        .probe          = pcf8563_probe,
index 5911a6dca29199d976fd71b72e818eda1480431c..7ca9e8871d77d5e384b30af8b8c0617b58ab9000 100644 (file)
@@ -309,7 +309,6 @@ MODULE_DEVICE_TABLE(i2c, pcf8583_id);
 static struct i2c_driver pcf8583_driver = {
        .driver = {
                .name   = "pcf8583",
-               .owner  = THIS_MODULE,
        },
        .probe          = pcf8583_probe,
        .id_table       = pcf8583_id,
index 99181fff88fd6f4e3518a1e2b3c166b674b59305..41dcb7ddb906040ce18722e3c93e076f05c0e4b4 100644 (file)
@@ -476,6 +476,6 @@ static struct amba_driver pl031_driver = {
 
 module_amba_driver(pl031_driver);
 
-MODULE_AUTHOR("Deepak Saxena <dsaxena@plexity.net");
+MODULE_AUTHOR("Deepak Saxena <dsaxena@plexity.net>");
 MODULE_DESCRIPTION("ARM AMBA PL031 RTC Driver");
 MODULE_LICENSE("GPL");
index 4561f375327dbefd5f532a961dfa478934e856e6..fe4985b546088731f9dac7b3338111a35d0db07f 100644 (file)
@@ -32,6 +32,8 @@
 
 #include <mach/hardware.h>
 
+#include "rtc-sa1100.h"
+
 #define RTC_DEF_DIVIDER                (32768 - 1)
 #define RTC_DEF_TRIM           0
 #define MAXFREQ_PERIODIC       1000
        __raw_writel((value), (pxa_rtc)->base + (reg))
 
 struct pxa_rtc {
+       struct sa1100_rtc sa1100_rtc;
        struct resource *ress;
        void __iomem            *base;
-       int                     irq_1Hz;
-       int                     irq_Alrm;
        struct rtc_device       *rtc;
        spinlock_t              lock;           /* Protects this structure */
 };
@@ -184,25 +185,25 @@ static int pxa_rtc_open(struct device *dev)
        struct pxa_rtc *pxa_rtc = dev_get_drvdata(dev);
        int ret;
 
-       ret = request_irq(pxa_rtc->irq_1Hz, pxa_rtc_irq, 0,
+       ret = request_irq(pxa_rtc->sa1100_rtc.irq_1hz, pxa_rtc_irq, 0,
                          "rtc 1Hz", dev);
        if (ret < 0) {
-               dev_err(dev, "can't get irq %i, err %d\n", pxa_rtc->irq_1Hz,
-                       ret);
+               dev_err(dev, "can't get irq %i, err %d\n",
+                       pxa_rtc->sa1100_rtc.irq_1hz, ret);
                goto err_irq_1Hz;
        }
-       ret = request_irq(pxa_rtc->irq_Alrm, pxa_rtc_irq, 0,
+       ret = request_irq(pxa_rtc->sa1100_rtc.irq_alarm, pxa_rtc_irq, 0,
                          "rtc Alrm", dev);
        if (ret < 0) {
-               dev_err(dev, "can't get irq %i, err %d\n", pxa_rtc->irq_Alrm,
-                       ret);
+               dev_err(dev, "can't get irq %i, err %d\n",
+                       pxa_rtc->sa1100_rtc.irq_alarm, ret);
                goto err_irq_Alrm;
        }
 
        return 0;
 
 err_irq_Alrm:
-       free_irq(pxa_rtc->irq_1Hz, dev);
+       free_irq(pxa_rtc->sa1100_rtc.irq_1hz, dev);
 err_irq_1Hz:
        return ret;
 }
@@ -215,8 +216,8 @@ static void pxa_rtc_release(struct device *dev)
        rtsr_clear_bits(pxa_rtc, RTSR_PIALE | RTSR_RDALE1 | RTSR_HZE);
        spin_unlock_irq(&pxa_rtc->lock);
 
-       free_irq(pxa_rtc->irq_Alrm, dev);
-       free_irq(pxa_rtc->irq_1Hz, dev);
+       free_irq(pxa_rtc->sa1100_rtc.irq_1hz, dev);
+       free_irq(pxa_rtc->sa1100_rtc.irq_alarm, dev);
 }
 
 static int pxa_alarm_irq_enable(struct device *dev, unsigned int enabled)
@@ -320,12 +321,13 @@ static int __init pxa_rtc_probe(struct platform_device *pdev)
 {
        struct device *dev = &pdev->dev;
        struct pxa_rtc *pxa_rtc;
+       struct sa1100_rtc *sa1100_rtc;
        int ret;
-       u32 rttr;
 
        pxa_rtc = devm_kzalloc(dev, sizeof(*pxa_rtc), GFP_KERNEL);
        if (!pxa_rtc)
                return -ENOMEM;
+       sa1100_rtc = &pxa_rtc->sa1100_rtc;
 
        spin_lock_init(&pxa_rtc->lock);
        platform_set_drvdata(pdev, pxa_rtc);
@@ -336,13 +338,13 @@ static int __init pxa_rtc_probe(struct platform_device *pdev)
                return -ENXIO;
        }
 
-       pxa_rtc->irq_1Hz = platform_get_irq(pdev, 0);
-       if (pxa_rtc->irq_1Hz < 0) {
+       sa1100_rtc->irq_1hz = platform_get_irq(pdev, 0);
+       if (sa1100_rtc->irq_1hz < 0) {
                dev_err(dev, "No 1Hz IRQ resource defined\n");
                return -ENXIO;
        }
-       pxa_rtc->irq_Alrm = platform_get_irq(pdev, 1);
-       if (pxa_rtc->irq_Alrm < 0) {
+       sa1100_rtc->irq_alarm = platform_get_irq(pdev, 1);
+       if (sa1100_rtc->irq_alarm < 0) {
                dev_err(dev, "No alarm IRQ resource defined\n");
                return -ENXIO;
        }
@@ -354,15 +356,14 @@ static int __init pxa_rtc_probe(struct platform_device *pdev)
                return -ENOMEM;
        }
 
-       /*
-        * If the clock divider is uninitialized then reset it to the
-        * default value to get the 1Hz clock.
-        */
-       if (rtc_readl(pxa_rtc, RTTR) == 0) {
-               rttr = RTC_DEF_DIVIDER + (RTC_DEF_TRIM << 16);
-               rtc_writel(pxa_rtc, RTTR, rttr);
-               dev_warn(dev, "warning: initializing default clock"
-                        " divider/trim value\n");
+       sa1100_rtc->rcnr = pxa_rtc->base + 0x0;
+       sa1100_rtc->rtsr = pxa_rtc->base + 0x8;
+       sa1100_rtc->rtar = pxa_rtc->base + 0x4;
+       sa1100_rtc->rttr = pxa_rtc->base + 0xc;
+       ret = sa1100_rtc_init(pdev, sa1100_rtc);
+       if (ret) {
+               dev_err(dev, "Unable to init SA1100 RTC sub-device\n");
+               return ret;
        }
 
        rtsr_clear_bits(pxa_rtc, RTSR_PIALE | RTSR_RDALE1 | RTSR_HZE);
@@ -402,7 +403,7 @@ static int pxa_rtc_suspend(struct device *dev)
        struct pxa_rtc *pxa_rtc = dev_get_drvdata(dev);
 
        if (device_may_wakeup(dev))
-               enable_irq_wake(pxa_rtc->irq_Alrm);
+               enable_irq_wake(pxa_rtc->sa1100_rtc.irq_alarm);
        return 0;
 }
 
@@ -411,7 +412,7 @@ static int pxa_rtc_resume(struct device *dev)
        struct pxa_rtc *pxa_rtc = dev_get_drvdata(dev);
 
        if (device_may_wakeup(dev))
-               disable_irq_wake(pxa_rtc->irq_Alrm);
+               disable_irq_wake(pxa_rtc->sa1100_rtc.irq_alarm);
        return 0;
 }
 #endif
index b548551f385ccd170a8dd507496c052dc8d1e4ef..026035373ae65a446122c6ebf39df9364b58718a 100644 (file)
@@ -170,7 +170,7 @@ static ssize_t rp5c01_nvram_read(struct file *filp, struct kobject *kobj,
 
        spin_lock_irq(&priv->lock);
 
-       for (count = 0; size > 0 && pos < RP5C01_MODE; count++, size--) {
+       for (count = 0; count < size; count++) {
                u8 data;
 
                rp5c01_write(priv,
@@ -200,7 +200,7 @@ static ssize_t rp5c01_nvram_write(struct file *filp, struct kobject *kobj,
 
        spin_lock_irq(&priv->lock);
 
-       for (count = 0; size > 0 && pos < RP5C01_MODE; count++, size--) {
+       for (count = 0; count < size; count++) {
                u8 data = *buf++;
 
                rp5c01_write(priv,
index e6298e02b400e7717c5fb549bc13a75203f59ea5..24c3d69ce1b97887485a2d668716a439c0a7625a 100644 (file)
  * modify it under the terms of the GNU General Public License
  * version 2 as published by the Free Software Foundation.
  */
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/init.h>
 #include <linux/bcd.h>
+#include <linux/bitops.h>
 #include <linux/i2c.h>
-#include <linux/list.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
 #include <linux/rtc.h>
 
 /* Register definitions */
 #define RX8025_BIT_CTRL1_CT    (7 << 0)
 /* 1 Hz periodic level irq */
 #define RX8025_BIT_CTRL1_CT_1HZ        4
-#define RX8025_BIT_CTRL1_TEST  (1 << 3)
-#define RX8025_BIT_CTRL1_1224  (1 << 5)
-#define RX8025_BIT_CTRL1_DALE  (1 << 6)
-#define RX8025_BIT_CTRL1_WALE  (1 << 7)
-
-#define RX8025_BIT_CTRL2_DAFG  (1 << 0)
-#define RX8025_BIT_CTRL2_WAFG  (1 << 1)
-#define RX8025_BIT_CTRL2_CTFG  (1 << 2)
-#define RX8025_BIT_CTRL2_PON   (1 << 4)
-#define RX8025_BIT_CTRL2_XST   (1 << 5)
-#define RX8025_BIT_CTRL2_VDET  (1 << 6)
+#define RX8025_BIT_CTRL1_TEST  BIT(3)
+#define RX8025_BIT_CTRL1_1224  BIT(5)
+#define RX8025_BIT_CTRL1_DALE  BIT(6)
+#define RX8025_BIT_CTRL1_WALE  BIT(7)
+
+#define RX8025_BIT_CTRL2_DAFG  BIT(0)
+#define RX8025_BIT_CTRL2_WAFG  BIT(1)
+#define RX8025_BIT_CTRL2_CTFG  BIT(2)
+#define RX8025_BIT_CTRL2_PON   BIT(4)
+#define RX8025_BIT_CTRL2_XST   BIT(5)
+#define RX8025_BIT_CTRL2_VDET  BIT(6)
 
 /* Clock precision adjustment */
 #define RX8025_ADJ_RESOLUTION  3050 /* in ppb */
@@ -74,84 +72,84 @@ MODULE_DEVICE_TABLE(i2c, rx8025_id);
 struct rx8025_data {
        struct i2c_client *client;
        struct rtc_device *rtc;
-       struct work_struct work;
        u8 ctrl1;
-       unsigned exiting:1;
 };
 
-static int rx8025_read_reg(struct i2c_client *client, int number, u8 *value)
+static s32 rx8025_read_reg(const struct i2c_client *client, u8 number)
 {
-       int ret = i2c_smbus_read_byte_data(client, (number << 4) | 0x08);
-
-       if (ret < 0) {
-               dev_err(&client->dev, "Unable to read register #%d\n", number);
-               return ret;
-       }
-
-       *value = ret;
-       return 0;
+       return i2c_smbus_read_byte_data(client, number << 4);
 }
 
-static int rx8025_read_regs(struct i2c_client *client,
-                           int number, u8 length, u8 *values)
+static int rx8025_read_regs(const struct i2c_client *client,
+                           u8 number, u8 length, u8 *values)
 {
-       int ret = i2c_smbus_read_i2c_block_data(client, (number << 4) | 0x08,
-                                               length, values);
-
-       if (ret != length) {
-               dev_err(&client->dev, "Unable to read registers #%d..#%d\n",
-                       number, number + length - 1);
+       int ret = i2c_smbus_read_i2c_block_data(client, number << 4, length,
+                                               values);
+       if (ret != length)
                return ret < 0 ? ret : -EIO;
-       }
 
        return 0;
 }
 
-static int rx8025_write_reg(struct i2c_client *client, int number, u8 value)
+static s32 rx8025_write_reg(const struct i2c_client *client, u8 number,
+                           u8 value)
 {
-       int ret = i2c_smbus_write_byte_data(client, number << 4, value);
-
-       if (ret)
-               dev_err(&client->dev, "Unable to write register #%d\n",
-                       number);
+       return i2c_smbus_write_byte_data(client, number << 4, value);
+}
 
-       return ret;
+static s32 rx8025_write_regs(const struct i2c_client *client,
+                            u8 number, u8 length, const u8 *values)
+{
+       return i2c_smbus_write_i2c_block_data(client, number << 4,
+                                             length, values);
 }
 
-static int rx8025_write_regs(struct i2c_client *client,
-                            int number, u8 length, u8 *values)
+static int rx8025_check_validity(struct device *dev)
 {
-       int ret = i2c_smbus_write_i2c_block_data(client, (number << 4) | 0x08,
-                                                length, values);
+       struct rx8025_data *rx8025 = dev_get_drvdata(dev);
+       int ctrl2;
+
+       ctrl2 = rx8025_read_reg(rx8025->client, RX8025_REG_CTRL2);
+       if (ctrl2 < 0)
+               return ctrl2;
+
+       if (ctrl2 & RX8025_BIT_CTRL2_VDET)
+               dev_warn(dev, "power voltage drop detected\n");
+
+       if (ctrl2 & RX8025_BIT_CTRL2_PON) {
+               dev_warn(dev, "power-on reset detected, date is invalid\n");
+               return -EINVAL;
+       }
 
-       if (ret)
-               dev_err(&client->dev, "Unable to write registers #%d..#%d\n",
-                       number, number + length - 1);
+       if (!(ctrl2 & RX8025_BIT_CTRL2_XST)) {
+               dev_warn(dev, "crystal stopped, date is invalid\n");
+               return -EINVAL;
+       }
 
-       return ret;
+       return 0;
 }
 
-static irqreturn_t rx8025_irq(int irq, void *dev_id)
+static int rx8025_reset_validity(struct i2c_client *client)
 {
-       struct i2c_client *client = dev_id;
-       struct rx8025_data *rx8025 = i2c_get_clientdata(client);
+       int ctrl2 = rx8025_read_reg(client, RX8025_REG_CTRL2);
 
-       disable_irq_nosync(irq);
-       schedule_work(&rx8025->work);
-       return IRQ_HANDLED;
+       if (ctrl2 < 0)
+               return ctrl2;
+
+       ctrl2 &= ~(RX8025_BIT_CTRL2_PON | RX8025_BIT_CTRL2_VDET);
+
+       return rx8025_write_reg(client, RX8025_REG_CTRL2,
+                               ctrl2 | RX8025_BIT_CTRL2_XST);
 }
 
-static void rx8025_work(struct work_struct *work)
+static irqreturn_t rx8025_handle_irq(int irq, void *dev_id)
 {
-       struct rx8025_data *rx8025 = container_of(work, struct rx8025_data,
-                                                 work);
-       struct i2c_client *client = rx8025->client;
-       struct mutex *lock = &rx8025->rtc->ops_lock;
-       u8 status;
-
-       mutex_lock(lock);
+       struct i2c_client *client = dev_id;
+       struct rx8025_data *rx8025 = i2c_get_clientdata(client);
+       int status;
 
-       if (rx8025_read_reg(client, RX8025_REG_CTRL2, &status))
+       status = rx8025_read_reg(client, RX8025_REG_CTRL2);
+       if (status < 0)
                goto out;
 
        if (!(status & RX8025_BIT_CTRL2_XST))
@@ -161,9 +159,7 @@ static void rx8025_work(struct work_struct *work)
        if (status & RX8025_BIT_CTRL2_CTFG) {
                /* periodic */
                status &= ~RX8025_BIT_CTRL2_CTFG;
-               local_irq_disable();
                rtc_update_irq(rx8025->rtc, 1, RTC_PF | RTC_IRQF);
-               local_irq_enable();
        }
 
        if (status & RX8025_BIT_CTRL2_DAFG) {
@@ -172,20 +168,11 @@ static void rx8025_work(struct work_struct *work)
                if (rx8025_write_reg(client, RX8025_REG_CTRL1,
                                     rx8025->ctrl1 & ~RX8025_BIT_CTRL1_DALE))
                        goto out;
-               local_irq_disable();
                rtc_update_irq(rx8025->rtc, 1, RTC_AF | RTC_IRQF);
-               local_irq_enable();
        }
 
-       /* acknowledge IRQ */
-       rx8025_write_reg(client, RX8025_REG_CTRL2,
-                        status | RX8025_BIT_CTRL2_XST);
-
 out:
-       if (!rx8025->exiting)
-               enable_irq(client->irq);
-
-       mutex_unlock(lock);
+       return IRQ_HANDLED;
 }
 
 static int rx8025_get_time(struct device *dev, struct rtc_time *dt)
@@ -194,6 +181,10 @@ static int rx8025_get_time(struct device *dev, struct rtc_time *dt)
        u8 date[7];
        int err;
 
+       err = rx8025_check_validity(dev);
+       if (err)
+               return err;
+
        err = rx8025_read_regs(rx8025->client, RX8025_REG_SEC, 7, date);
        if (err)
                return err;
@@ -213,10 +204,7 @@ static int rx8025_get_time(struct device *dev, struct rtc_time *dt)
 
        dt->tm_mday = bcd2bin(date[RX8025_REG_MDAY] & 0x3f);
        dt->tm_mon = bcd2bin(date[RX8025_REG_MONTH] & 0x1f) - 1;
-       dt->tm_year = bcd2bin(date[RX8025_REG_YEAR]);
-
-       if (dt->tm_year < 70)
-               dt->tm_year += 100;
+       dt->tm_year = bcd2bin(date[RX8025_REG_YEAR]) + 100;
 
        dev_dbg(dev, "%s: date %ds %dm %dh %dmd %dm %dy\n", __func__,
                dt->tm_sec, dt->tm_min, dt->tm_hour,
@@ -229,12 +217,10 @@ static int rx8025_set_time(struct device *dev, struct rtc_time *dt)
 {
        struct rx8025_data *rx8025 = dev_get_drvdata(dev);
        u8 date[7];
+       int ret;
 
-       /*
-        * BUG: The HW assumes every year that is a multiple of 4 to be a leap
-        * year.  Next time this is wrong is 2100, which will not be a leap
-        * year.
-        */
+       if ((dt->tm_year < 100) || (dt->tm_year > 199))
+               return -EINVAL;
 
        /*
         * Here the read-only bits are written as "0".  I'm not sure if that
@@ -251,17 +237,21 @@ static int rx8025_set_time(struct device *dev, struct rtc_time *dt)
        date[RX8025_REG_WDAY] = bin2bcd(dt->tm_wday);
        date[RX8025_REG_MDAY] = bin2bcd(dt->tm_mday);
        date[RX8025_REG_MONTH] = bin2bcd(dt->tm_mon + 1);
-       date[RX8025_REG_YEAR] = bin2bcd(dt->tm_year % 100);
+       date[RX8025_REG_YEAR] = bin2bcd(dt->tm_year - 100);
 
        dev_dbg(dev,
                "%s: write 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x\n",
                __func__,
                date[0], date[1], date[2], date[3], date[4], date[5], date[6]);
 
-       return rx8025_write_regs(rx8025->client, RX8025_REG_SEC, 7, date);
+       ret = rx8025_write_regs(rx8025->client, RX8025_REG_SEC, 7, date);
+       if (ret < 0)
+               return ret;
+
+       return rx8025_reset_validity(rx8025->client);
 }
 
-static int rx8025_init_client(struct i2c_client *client, int *need_reset)
+static int rx8025_init_client(struct i2c_client *client)
 {
        struct rx8025_data *rx8025 = i2c_get_clientdata(client);
        u8 ctrl[2], ctrl2;
@@ -275,38 +265,18 @@ static int rx8025_init_client(struct i2c_client *client, int *need_reset)
        /* Keep test bit zero ! */
        rx8025->ctrl1 = ctrl[0] & ~RX8025_BIT_CTRL1_TEST;
 
-       if (ctrl[1] & RX8025_BIT_CTRL2_PON) {
-               dev_warn(&client->dev, "power-on reset was detected, "
-                        "you may have to readjust the clock\n");
-               *need_reset = 1;
-       }
-
-       if (ctrl[1] & RX8025_BIT_CTRL2_VDET) {
-               dev_warn(&client->dev, "a power voltage drop was detected, "
-                        "you may have to readjust the clock\n");
-               *need_reset = 1;
-       }
-
-       if (!(ctrl[1] & RX8025_BIT_CTRL2_XST)) {
-               dev_warn(&client->dev, "Oscillation stop was detected,"
-                        "you may have to readjust the clock\n");
-               *need_reset = 1;
-       }
-
        if (ctrl[1] & (RX8025_BIT_CTRL2_DAFG | RX8025_BIT_CTRL2_WAFG)) {
                dev_warn(&client->dev, "Alarm was detected\n");
                need_clear = 1;
        }
 
-       if (!(ctrl[1] & RX8025_BIT_CTRL2_CTFG))
+       if (ctrl[1] & RX8025_BIT_CTRL2_CTFG)
                need_clear = 1;
 
-       if (*need_reset || need_clear) {
-               ctrl2 = ctrl[0];
-               ctrl2 &= ~(RX8025_BIT_CTRL2_PON | RX8025_BIT_CTRL2_VDET |
-                          RX8025_BIT_CTRL2_CTFG | RX8025_BIT_CTRL2_WAFG |
+       if (need_clear) {
+               ctrl2 = ctrl[1];
+               ctrl2 &= ~(RX8025_BIT_CTRL2_CTFG | RX8025_BIT_CTRL2_WAFG |
                           RX8025_BIT_CTRL2_DAFG);
-               ctrl2 |= RX8025_BIT_CTRL2_XST;
 
                err = rx8025_write_reg(client, RX8025_REG_CTRL2, ctrl2);
        }
@@ -319,8 +289,8 @@ static int rx8025_read_alarm(struct device *dev, struct rtc_wkalrm *t)
 {
        struct rx8025_data *rx8025 = dev_get_drvdata(dev);
        struct i2c_client *client = rx8025->client;
-       u8 ctrl2, ald[2];
-       int err;
+       u8 ald[2];
+       int ctrl2, err;
 
        if (client->irq <= 0)
                return -EINVAL;
@@ -329,9 +299,9 @@ static int rx8025_read_alarm(struct device *dev, struct rtc_wkalrm *t)
        if (err)
                return err;
 
-       err = rx8025_read_reg(client, RX8025_REG_CTRL2, &ctrl2);
-       if (err)
-               return err;
+       ctrl2 = rx8025_read_reg(client, RX8025_REG_CTRL2);
+       if (ctrl2 < 0)
+               return ctrl2;
 
        dev_dbg(dev, "%s: read alarm 0x%02x 0x%02x ctrl2 %02x\n",
                __func__, ald[0], ald[1], ctrl2);
@@ -452,12 +422,11 @@ static struct rtc_class_ops rx8025_rtc_ops = {
 static int rx8025_get_clock_adjust(struct device *dev, int *adj)
 {
        struct i2c_client *client = to_i2c_client(dev);
-       u8 digoff;
-       int err;
+       int digoff;
 
-       err = rx8025_read_reg(client, RX8025_REG_DIGOFF, &digoff);
-       if (err)
-               return err;
+       digoff = rx8025_read_reg(client, RX8025_REG_DIGOFF);
+       if (digoff < 0)
+               return digoff;
 
        *adj = digoff >= 64 ? digoff - 128 : digoff;
        if (*adj > 0)
@@ -539,88 +508,53 @@ static int rx8025_probe(struct i2c_client *client,
 {
        struct i2c_adapter *adapter = to_i2c_adapter(client->dev.parent);
        struct rx8025_data *rx8025;
-       int err, need_reset = 0;
+       int err = 0;
 
        if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_BYTE_DATA
                                     | I2C_FUNC_SMBUS_I2C_BLOCK)) {
                dev_err(&adapter->dev,
                        "doesn't support required functionality\n");
-               err = -EIO;
-               goto errout;
+               return -EIO;
        }
 
        rx8025 = devm_kzalloc(&client->dev, sizeof(*rx8025), GFP_KERNEL);
        if (!rx8025) {
-               err = -ENOMEM;
-               goto errout;
+               return -ENOMEM;
        }
 
        rx8025->client = client;
        i2c_set_clientdata(client, rx8025);
-       INIT_WORK(&rx8025->work, rx8025_work);
 
-       err = rx8025_init_client(client, &need_reset);
+       err = rx8025_init_client(client);
        if (err)
-               goto errout;
-
-       if (need_reset) {
-               struct rtc_time tm;
-               dev_info(&client->dev,
-                        "bad conditions detected, resetting date\n");
-               rtc_time_to_tm(0, &tm); /* 1970/1/1 */
-               rx8025_set_time(&client->dev, &tm);
-       }
+               return err;
 
        rx8025->rtc = devm_rtc_device_register(&client->dev, client->name,
                                          &rx8025_rtc_ops, THIS_MODULE);
        if (IS_ERR(rx8025->rtc)) {
-               err = PTR_ERR(rx8025->rtc);
                dev_err(&client->dev, "unable to register the class device\n");
-               goto errout;
+               return PTR_ERR(rx8025->rtc);
        }
 
        if (client->irq > 0) {
                dev_info(&client->dev, "IRQ %d supplied\n", client->irq);
-               err = request_irq(client->irq, rx8025_irq,
-                                 0, "rx8025", client);
+               err = devm_request_threaded_irq(&client->dev, client->irq, NULL,
+                                               rx8025_handle_irq, 0, "rx8025",
+                                               client);
                if (err) {
-                       dev_err(&client->dev, "unable to request IRQ\n");
-                       goto errout;
+                       dev_err(&client->dev, "unable to request IRQ, alarms disabled\n");
+                       client->irq = 0;
                }
        }
 
-       rx8025->rtc->irq_freq = 1;
        rx8025->rtc->max_user_freq = 1;
 
        err = rx8025_sysfs_register(&client->dev);
-       if (err)
-               goto errout_irq;
-
-       return 0;
-
-errout_irq:
-       if (client->irq > 0)
-               free_irq(client->irq, client);
-
-errout:
-       dev_err(&adapter->dev, "probing for rx8025 failed\n");
        return err;
 }
 
 static int rx8025_remove(struct i2c_client *client)
 {
-       struct rx8025_data *rx8025 = i2c_get_clientdata(client);
-       struct mutex *lock = &rx8025->rtc->ops_lock;
-
-       if (client->irq > 0) {
-               mutex_lock(lock);
-               rx8025->exiting = 1;
-               mutex_unlock(lock);
-
-               free_irq(client->irq, client);
-               cancel_work_sync(&rx8025->work);
-       }
-
        rx8025_sysfs_unregister(&client->dev);
        return 0;
 }
@@ -628,7 +562,6 @@ static int rx8025_remove(struct i2c_client *client)
 static struct i2c_driver rx8025_driver = {
        .driver = {
                .name = "rtc-rx8025",
-               .owner = THIS_MODULE,
        },
        .probe          = rx8025_probe,
        .remove         = rx8025_remove,
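
The rx8025 rework replaces the old disable_irq_nosync()/schedule_work() indirection with a threaded interrupt handler, which runs in process context and may therefore issue the sleeping SMBus transfers directly. A minimal sketch of the general pattern, not a copy of the hunk above (request_threaded_irq() normally wants IRQF_ONESHOT when the hard-IRQ handler is NULL):

static irqreturn_t example_rtc_thread_fn(int irq, void *dev_id)
{
        /* runs in a kernel thread, so sleeping i2c_smbus_*() calls are fine */
        return IRQ_HANDLED;
}

/* in probe(): */
ret = devm_request_threaded_irq(&client->dev, client->irq,
                                NULL,                  /* no hard-IRQ handler */
                                example_rtc_thread_fn, /* threaded handler    */
                                IRQF_ONESHOT, "example-rtc", client);
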
index de8d9c4277826ca1e1e830e7dbdbea59a6a6a6ed..161e25d016c39ce86efef904db558926fbf6079a 100644 (file)
@@ -315,7 +315,6 @@ MODULE_DEVICE_TABLE(i2c, rx8581_id);
 static struct i2c_driver rx8581_driver = {
        .driver         = {
                .name   = "rtc-rx8581",
-               .owner  = THIS_MODULE,
        },
        .probe          = rx8581_probe,
        .id_table       = rx8581_id,
index a0f832362199078e439850223d51c185597b7b78..7cc8f73a3fe8f65174c43922529a8523177ded27 100644 (file)
@@ -39,6 +39,7 @@ struct s3c_rtc {
        void __iomem *base;
        struct clk *rtc_clk;
        struct clk *rtc_src_clk;
+       bool clk_disabled;
 
        struct s3c_rtc_data *data;
 
@@ -71,9 +72,12 @@ static void s3c_rtc_enable_clk(struct s3c_rtc *info)
        unsigned long irq_flags;
 
        spin_lock_irqsave(&info->alarm_clk_lock, irq_flags);
-       clk_enable(info->rtc_clk);
-       if (info->data->needs_src_clk)
-               clk_enable(info->rtc_src_clk);
+       if (info->clk_disabled) {
+               clk_enable(info->rtc_clk);
+               if (info->data->needs_src_clk)
+                       clk_enable(info->rtc_src_clk);
+               info->clk_disabled = false;
+       }
        spin_unlock_irqrestore(&info->alarm_clk_lock, irq_flags);
 }
 
@@ -82,9 +86,12 @@ static void s3c_rtc_disable_clk(struct s3c_rtc *info)
        unsigned long irq_flags;
 
        spin_lock_irqsave(&info->alarm_clk_lock, irq_flags);
-       if (info->data->needs_src_clk)
-               clk_disable(info->rtc_src_clk);
-       clk_disable(info->rtc_clk);
+       if (!info->clk_disabled) {
+               if (info->data->needs_src_clk)
+                       clk_disable(info->rtc_src_clk);
+               clk_disable(info->rtc_clk);
+               info->clk_disabled = true;
+       }
        spin_unlock_irqrestore(&info->alarm_clk_lock, irq_flags);
 }
 
@@ -128,6 +135,11 @@ static int s3c_rtc_setaie(struct device *dev, unsigned int enabled)
 
        s3c_rtc_disable_clk(info);
 
+       if (enabled)
+               s3c_rtc_enable_clk(info);
+       else
+               s3c_rtc_disable_clk(info);
+
        return 0;
 }
 
@@ -410,8 +422,9 @@ static int s3c_rtc_remove(struct platform_device *pdev)
 
        s3c_rtc_setaie(info->dev, 0);
 
+       if (info->data->needs_src_clk)
+               clk_unprepare(info->rtc_src_clk);
        clk_unprepare(info->rtc_clk);
-       info->rtc_clk = NULL;
 
        return 0;
 }
@@ -482,6 +495,7 @@ static int s3c_rtc_probe(struct platform_device *pdev)
                if (IS_ERR(info->rtc_src_clk)) {
                        dev_err(&pdev->dev,
                                "failed to find rtc source clock\n");
+                       clk_disable_unprepare(info->rtc_clk);
                        return PTR_ERR(info->rtc_src_clk);
                }
                clk_prepare_enable(info->rtc_src_clk);
index 8c70d785ba739c314221fb98fe2533e160ee3180..f2504b4eef3455297b6b9ac7d38f4e6889834be8 100644 (file)
@@ -635,6 +635,16 @@ static int s5m8767_rtc_init_reg(struct s5m_rtc_info *info)
        case S2MPS13X:
                data[0] = (0 << BCD_EN_SHIFT) | (1 << MODEL24_SHIFT);
                ret = regmap_write(info->regmap, info->regs->ctrl, data[0]);
+               if (ret < 0)
+                       break;
+
+               /*
+                * The WUDR & (RUDR or AUDR) bits should be set high after writing
+                * the RTC_CTRL register, just as when writing the alarm registers.
+                * The datasheet does not describe this, but the vendor code
+                * does it.
+                */
+               ret = s5m8767_rtc_set_alarm_reg(info);
                break;
 
        default:
@@ -797,6 +807,7 @@ static const struct platform_device_id s5m_rtc_id[] = {
        { "s2mps14-rtc",        S2MPS14X },
        { },
 };
+MODULE_DEVICE_TABLE(platform, s5m_rtc_id);
 
 static struct platform_driver s5m_rtc_driver = {
        .driver         = {
index b6e1ca08c2c0ed395c8867e8b6cb947e366caf98..c2187bf6c7e41e09af3a04aa50a2e2f40eb76c24 100644 (file)
 #include <linux/bitops.h>
 #include <linux/io.h>
 
-#include <mach/hardware.h>
-#include <mach/irqs.h>
+#define RTSR_HZE               BIT(3)  /* HZ interrupt enable */
+#define RTSR_ALE               BIT(2)  /* RTC alarm interrupt enable */
+#define RTSR_HZ                        BIT(1)  /* HZ rising-edge detected */
+#define RTSR_AL                        BIT(0)  /* RTC alarm detected */
 
-#if defined(CONFIG_ARCH_PXA) || defined(CONFIG_ARCH_MMP)
-#include <mach/regs-rtc.h>
-#endif
+#include "rtc-sa1100.h"
 
 #define RTC_DEF_DIVIDER                (32768 - 1)
 #define RTC_DEF_TRIM           0
 #define RTC_FREQ               1024
 
-struct sa1100_rtc {
-       spinlock_t              lock;
-       int                     irq_1hz;
-       int                     irq_alarm;
-       struct rtc_device       *rtc;
-       struct clk              *clk;
-};
 
 static irqreturn_t sa1100_rtc_interrupt(int irq, void *dev_id)
 {
@@ -63,16 +56,16 @@ static irqreturn_t sa1100_rtc_interrupt(int irq, void *dev_id)
 
        spin_lock(&info->lock);
 
-       rtsr = RTSR;
+       rtsr = readl_relaxed(info->rtsr);
        /* clear interrupt sources */
-       RTSR = 0;
+       writel_relaxed(0, info->rtsr);
        /* Fix for a nasty initialization problem in the SA11xx RTSR register.
         * See also the comments in sa1100_rtc_probe(). */
        if (rtsr & (RTSR_ALE | RTSR_HZE)) {
                /* This is the original code, before there was the if test
                 * above. This code does not clear interrupts that were not
                 * enabled. */
-               RTSR = (RTSR_AL | RTSR_HZ) & (rtsr >> 2);
+               writel_relaxed((RTSR_AL | RTSR_HZ) & (rtsr >> 2), info->rtsr);
        } else {
                /* For some reason, it is possible to enter this routine
                 * without interruptions enabled, it has been tested with
@@ -81,13 +74,13 @@ static irqreturn_t sa1100_rtc_interrupt(int irq, void *dev_id)
                 * This situation leads to an infinite "loop" of interrupt
                 * routine calling and as a result the processor seems to
                 * lock on its first call to open(). */
-               RTSR = RTSR_AL | RTSR_HZ;
+               writel_relaxed(RTSR_AL | RTSR_HZ, info->rtsr);
        }
 
        /* clear alarm interrupt if it has occurred */
        if (rtsr & RTSR_AL)
                rtsr &= ~RTSR_ALE;
-       RTSR = rtsr & (RTSR_ALE | RTSR_HZE);
+       writel_relaxed(rtsr & (RTSR_ALE | RTSR_HZE), info->rtsr);
 
        /* update irq data & counter */
        if (rtsr & RTSR_AL)
@@ -135,7 +128,7 @@ static void sa1100_rtc_release(struct device *dev)
        struct sa1100_rtc *info = dev_get_drvdata(dev);
 
        spin_lock_irq(&info->lock);
-       RTSR = 0;
+       writel_relaxed(0, info->rtsr);
        spin_unlock_irq(&info->lock);
 
        free_irq(info->irq_alarm, dev);
@@ -144,39 +137,46 @@ static void sa1100_rtc_release(struct device *dev)
 
 static int sa1100_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled)
 {
+       u32 rtsr;
        struct sa1100_rtc *info = dev_get_drvdata(dev);
 
        spin_lock_irq(&info->lock);
+       rtsr = readl_relaxed(info->rtsr);
        if (enabled)
-               RTSR |= RTSR_ALE;
+               rtsr |= RTSR_ALE;
        else
-               RTSR &= ~RTSR_ALE;
+               rtsr &= ~RTSR_ALE;
+       writel_relaxed(rtsr, info->rtsr);
        spin_unlock_irq(&info->lock);
        return 0;
 }
 
 static int sa1100_rtc_read_time(struct device *dev, struct rtc_time *tm)
 {
-       rtc_time_to_tm(RCNR, tm);
+       struct sa1100_rtc *info = dev_get_drvdata(dev);
+
+       rtc_time_to_tm(readl_relaxed(info->rcnr), tm);
        return 0;
 }
 
 static int sa1100_rtc_set_time(struct device *dev, struct rtc_time *tm)
 {
+       struct sa1100_rtc *info = dev_get_drvdata(dev);
        unsigned long time;
        int ret;
 
        ret = rtc_tm_to_time(tm, &time);
        if (ret == 0)
-               RCNR = time;
+               writel_relaxed(time, info->rcnr);
        return ret;
 }
 
 static int sa1100_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm)
 {
        u32     rtsr;
+       struct sa1100_rtc *info = dev_get_drvdata(dev);
 
-       rtsr = RTSR;
+       rtsr = readl_relaxed(info->rtsr);
        alrm->enabled = (rtsr & RTSR_ALE) ? 1 : 0;
        alrm->pending = (rtsr & RTSR_AL) ? 1 : 0;
        return 0;
@@ -192,12 +192,13 @@ static int sa1100_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
        ret = rtc_tm_to_time(&alrm->time, &time);
        if (ret != 0)
                goto out;
-       RTSR = RTSR & (RTSR_HZE|RTSR_ALE|RTSR_AL);
-       RTAR = time;
+       writel_relaxed(readl_relaxed(info->rtsr) &
+               (RTSR_HZE | RTSR_ALE | RTSR_AL), info->rtsr);
+       writel_relaxed(time, info->rtar);
        if (alrm->enabled)
-               RTSR |= RTSR_ALE;
+               writel_relaxed(readl_relaxed(info->rtsr) | RTSR_ALE, info->rtsr);
        else
-               RTSR &= ~RTSR_ALE;
+               writel_relaxed(readl_relaxed(info->rtsr) & ~RTSR_ALE, info->rtsr);
 out:
        spin_unlock_irq(&info->lock);
 
@@ -206,8 +207,10 @@ out:
 
 static int sa1100_rtc_proc(struct device *dev, struct seq_file *seq)
 {
-       seq_printf(seq, "trim/divider\t\t: 0x%08x\n", (u32) RTTR);
-       seq_printf(seq, "RTSR\t\t\t: 0x%08x\n", (u32)RTSR);
+       struct sa1100_rtc *info = dev_get_drvdata(dev);
+
+       seq_printf(seq, "trim/divider\t\t: 0x%08x\n", readl_relaxed(info->rttr));
+       seq_printf(seq, "RTSR\t\t\t: 0x%08x\n", readl_relaxed(info->rtsr));
 
        return 0;
 }
@@ -223,29 +226,18 @@ static const struct rtc_class_ops sa1100_rtc_ops = {
        .alarm_irq_enable = sa1100_rtc_alarm_irq_enable,
 };
 
-static int sa1100_rtc_probe(struct platform_device *pdev)
+int sa1100_rtc_init(struct platform_device *pdev, struct sa1100_rtc *info)
 {
        struct rtc_device *rtc;
-       struct sa1100_rtc *info;
-       int irq_1hz, irq_alarm, ret = 0;
+       int ret;
 
-       irq_1hz = platform_get_irq_byname(pdev, "rtc 1Hz");
-       irq_alarm = platform_get_irq_byname(pdev, "rtc alarm");
-       if (irq_1hz < 0 || irq_alarm < 0)
-               return -ENODEV;
+       spin_lock_init(&info->lock);
 
-       info = devm_kzalloc(&pdev->dev, sizeof(struct sa1100_rtc), GFP_KERNEL);
-       if (!info)
-               return -ENOMEM;
        info->clk = devm_clk_get(&pdev->dev, NULL);
        if (IS_ERR(info->clk)) {
                dev_err(&pdev->dev, "failed to find rtc clock source\n");
                return PTR_ERR(info->clk);
        }
-       info->irq_1hz = irq_1hz;
-       info->irq_alarm = irq_alarm;
-       spin_lock_init(&info->lock);
-       platform_set_drvdata(pdev, info);
 
        ret = clk_prepare_enable(info->clk);
        if (ret)
@@ -257,22 +249,19 @@ static int sa1100_rtc_probe(struct platform_device *pdev)
         * If the clock divider is uninitialized then reset it to the
         * default value to get the 1Hz clock.
         */
-       if (RTTR == 0) {
-               RTTR = RTC_DEF_DIVIDER + (RTC_DEF_TRIM << 16);
+       if (readl_relaxed(info->rttr) == 0) {
+               writel_relaxed(RTC_DEF_DIVIDER + (RTC_DEF_TRIM << 16), info->rttr);
                dev_warn(&pdev->dev, "warning: "
                        "initializing default clock divider/trim value\n");
                /* The current RTC value probably doesn't make sense either */
-               RCNR = 0;
+               writel_relaxed(0, info->rcnr);
        }
 
-       device_init_wakeup(&pdev->dev, 1);
-
        rtc = devm_rtc_device_register(&pdev->dev, pdev->name, &sa1100_rtc_ops,
                                        THIS_MODULE);
-
        if (IS_ERR(rtc)) {
-               ret = PTR_ERR(rtc);
-               goto err_dev;
+               clk_disable_unprepare(info->clk);
+               return PTR_ERR(rtc);
        }
        info->rtc = rtc;
 
@@ -298,12 +287,52 @@ static int sa1100_rtc_probe(struct platform_device *pdev)
         *
         * Notice that clearing bits 1 and 0 is accomplished by writing ONES to
         * the corresponding bits in RTSR. */
-       RTSR = RTSR_AL | RTSR_HZ;
+       writel_relaxed(RTSR_AL | RTSR_HZ, info->rtsr);
 
        return 0;
-err_dev:
-       clk_disable_unprepare(info->clk);
-       return ret;
+}
+EXPORT_SYMBOL_GPL(sa1100_rtc_init);
+
+static int sa1100_rtc_probe(struct platform_device *pdev)
+{
+       struct sa1100_rtc *info;
+       struct resource *iores;
+       void __iomem *base;
+       int irq_1hz, irq_alarm;
+
+       irq_1hz = platform_get_irq_byname(pdev, "rtc 1Hz");
+       irq_alarm = platform_get_irq_byname(pdev, "rtc alarm");
+       if (irq_1hz < 0 || irq_alarm < 0)
+               return -ENODEV;
+
+       info = devm_kzalloc(&pdev->dev, sizeof(struct sa1100_rtc), GFP_KERNEL);
+       if (!info)
+               return -ENOMEM;
+       info->irq_1hz = irq_1hz;
+       info->irq_alarm = irq_alarm;
+
+       iores = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       base = devm_ioremap_resource(&pdev->dev, iores);
+       if (IS_ERR(base))
+               return PTR_ERR(base);
+
+       if (IS_ENABLED(CONFIG_ARCH_SA1100) ||
+           of_device_is_compatible(pdev->dev.of_node, "mrvl,sa1100-rtc")) {
+               info->rcnr = base + 0x04;
+               info->rtsr = base + 0x10;
+               info->rtar = base + 0x00;
+               info->rttr = base + 0x08;
+       } else {
+               info->rcnr = base + 0x0;
+               info->rtsr = base + 0x8;
+               info->rtar = base + 0x4;
+               info->rttr = base + 0xc;
+       }
+
+       platform_set_drvdata(pdev, info);
+       device_init_wakeup(&pdev->dev, 1);
+
+       return sa1100_rtc_init(pdev, info);
 }
 
 static int sa1100_rtc_remove(struct platform_device *pdev)
diff --git a/drivers/rtc/rtc-sa1100.h b/drivers/rtc/rtc-sa1100.h
new file mode 100644 (file)
index 0000000..2c79c0c
--- /dev/null
@@ -0,0 +1,23 @@
+#ifndef __RTC_SA1100_H__
+#define __RTC_SA1100_H__
+
+#include <linux/kernel.h>
+
+struct clk;
+struct platform_device;
+
+struct sa1100_rtc {
+       spinlock_t              lock;
+       void __iomem            *rcnr;
+       void __iomem            *rtar;
+       void __iomem            *rtsr;
+       void __iomem            *rttr;
+       int                     irq_1hz;
+       int                     irq_alarm;
+       struct rtc_device       *rtc;
+       struct clk              *clk;
+};
+
+int sa1100_rtc_init(struct platform_device *pdev, struct sa1100_rtc *info);
+
+#endif
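With struct sa1100_rtc moved into this header and sa1100_rtc_init() exported, a related driver can presumably fill in its own register offsets and IRQs and delegate the rest of the setup. A hedged sketch of such a hypothetical consumer (names and offsets are illustrative, not from this patch):

	static int another_soc_rtc_probe(struct platform_device *pdev)
	{
		struct sa1100_rtc *info;
		struct resource *iores;
		void __iomem *base;

		info = devm_kzalloc(&pdev->dev, sizeof(*info), GFP_KERNEL);
		if (!info)
			return -ENOMEM;

		info->irq_1hz = platform_get_irq_byname(pdev, "rtc 1Hz");
		info->irq_alarm = platform_get_irq_byname(pdev, "rtc alarm");
		if (info->irq_1hz < 0 || info->irq_alarm < 0)
			return -ENODEV;

		iores = platform_get_resource(pdev, IORESOURCE_MEM, 0);
		base = devm_ioremap_resource(&pdev->dev, iores);
		if (IS_ERR(base))
			return PTR_ERR(base);

		/* Register layout of this (made-up) SoC; offsets are illustrative. */
		info->rcnr = base + 0x0;
		info->rtar = base + 0x4;
		info->rtsr = base + 0x8;
		info->rttr = base + 0xc;

		platform_set_drvdata(pdev, info);
		device_init_wakeup(&pdev->dev, 1);

		return sa1100_rtc_init(pdev, info);
	}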
index edc3b43282d4033c531d21b55e14679a6f577aa0..7367f617145cdeb4694d319ae0a1fb543a94f6b8 100644 (file)
@@ -13,6 +13,7 @@
 #include <linux/slab.h>
 #include <linux/io.h>
 #include <linux/of.h>
+#include <linux/regmap.h>
 #include <linux/rtc/sirfsoc_rtciobrg.h>
 
 
@@ -48,12 +49,27 @@ struct sirfsoc_rtc_drv {
        /* Overflow for every 8 years extra time */
        u32                     overflow_rtc;
        spinlock_t              lock;
+       struct regmap *regmap;
 #ifdef CONFIG_PM
        u32             saved_counter;
        u32             saved_overflow_rtc;
 #endif
 };
 
+static u32 sirfsoc_rtc_readl(struct sirfsoc_rtc_drv *rtcdrv, u32 offset)
+{
+       u32 val;
+
+       regmap_read(rtcdrv->regmap, rtcdrv->rtc_base + offset, &val);
+       return val;
+}
+
+static void sirfsoc_rtc_writel(struct sirfsoc_rtc_drv *rtcdrv,
+                              u32 offset, u32 val)
+{
+       regmap_write(rtcdrv->regmap, rtcdrv->rtc_base + offset, val);
+}
+
 static int sirfsoc_rtc_read_alarm(struct device *dev,
                struct rtc_wkalrm *alrm)
 {
@@ -64,9 +80,9 @@ static int sirfsoc_rtc_read_alarm(struct device *dev,
 
        spin_lock_irq(&rtcdrv->lock);
 
-       rtc_count = sirfsoc_rtc_iobrg_readl(rtcdrv->rtc_base + RTC_CN);
+       rtc_count = sirfsoc_rtc_readl(rtcdrv, RTC_CN);
 
-       rtc_alarm = sirfsoc_rtc_iobrg_readl(rtcdrv->rtc_base + RTC_ALARM0);
+       rtc_alarm = sirfsoc_rtc_readl(rtcdrv, RTC_ALARM0);
        memset(alrm, 0, sizeof(struct rtc_wkalrm));
 
        /*
@@ -82,8 +98,7 @@ static int sirfsoc_rtc_read_alarm(struct device *dev,
                rtc_time_to_tm(rtcdrv->overflow_rtc
                                << (BITS_PER_LONG - RTC_SHIFT)
                                | rtc_alarm >> RTC_SHIFT, &(alrm->time));
-       if (sirfsoc_rtc_iobrg_readl(
-                       rtcdrv->rtc_base + RTC_STATUS) & SIRFSOC_RTC_AL0E)
+       if (sirfsoc_rtc_readl(rtcdrv, RTC_STATUS) & SIRFSOC_RTC_AL0E)
                alrm->enabled = 1;
 
        spin_unlock_irq(&rtcdrv->lock);
@@ -103,8 +118,7 @@ static int sirfsoc_rtc_set_alarm(struct device *dev,
 
                spin_lock_irq(&rtcdrv->lock);
 
-               rtc_status_reg = sirfsoc_rtc_iobrg_readl(
-                               rtcdrv->rtc_base + RTC_STATUS);
+               rtc_status_reg = sirfsoc_rtc_readl(rtcdrv, RTC_STATUS);
                if (rtc_status_reg & SIRFSOC_RTC_AL0E) {
                        /*
                         * An ongoing alarm in progress - ignore it and not
@@ -113,8 +127,7 @@ static int sirfsoc_rtc_set_alarm(struct device *dev,
                        dev_info(dev, "An old alarm was set, will be replaced by a new one\n");
                }
 
-               sirfsoc_rtc_iobrg_writel(
-                       rtc_alarm << RTC_SHIFT, rtcdrv->rtc_base + RTC_ALARM0);
+               sirfsoc_rtc_writel(rtcdrv, RTC_ALARM0, rtc_alarm << RTC_SHIFT);
                rtc_status_reg &= ~0x07; /* mask out the lower status bits */
                /*
                 * This bit RTC_AL sets it as a wake-up source for Sleep Mode
@@ -123,8 +136,7 @@ static int sirfsoc_rtc_set_alarm(struct device *dev,
                rtc_status_reg |= SIRFSOC_RTC_AL0;
                /* enable the RTC alarm interrupt */
                rtc_status_reg |= SIRFSOC_RTC_AL0E;
-               sirfsoc_rtc_iobrg_writel(
-                       rtc_status_reg, rtcdrv->rtc_base + RTC_STATUS);
+               sirfsoc_rtc_writel(rtcdrv, RTC_STATUS, rtc_status_reg);
 
                spin_unlock_irq(&rtcdrv->lock);
        } else {
@@ -135,8 +147,7 @@ static int sirfsoc_rtc_set_alarm(struct device *dev,
                 */
                spin_lock_irq(&rtcdrv->lock);
 
-               rtc_status_reg = sirfsoc_rtc_iobrg_readl(
-                               rtcdrv->rtc_base + RTC_STATUS);
+               rtc_status_reg = sirfsoc_rtc_readl(rtcdrv, RTC_STATUS);
                if (rtc_status_reg & SIRFSOC_RTC_AL0E) {
                        /* clear the RTC status register's alarm bit */
                        rtc_status_reg &= ~0x07;
@@ -145,8 +156,8 @@ static int sirfsoc_rtc_set_alarm(struct device *dev,
                        /* Clear the Alarm enable bit */
                        rtc_status_reg &= ~(SIRFSOC_RTC_AL0E);
 
-                       sirfsoc_rtc_iobrg_writel(rtc_status_reg,
-                                       rtcdrv->rtc_base + RTC_STATUS);
+                       sirfsoc_rtc_writel(rtcdrv, RTC_STATUS,
+                                          rtc_status_reg);
                }
 
                spin_unlock_irq(&rtcdrv->lock);
@@ -167,9 +178,9 @@ static int sirfsoc_rtc_read_time(struct device *dev,
         * fail, so read several times to make sure we get a stable value.
         */
        do {
-               tmp_rtc = sirfsoc_rtc_iobrg_readl(rtcdrv->rtc_base + RTC_CN);
+               tmp_rtc = sirfsoc_rtc_readl(rtcdrv, RTC_CN);
                cpu_relax();
-       } while (tmp_rtc != sirfsoc_rtc_iobrg_readl(rtcdrv->rtc_base + RTC_CN));
+       } while (tmp_rtc != sirfsoc_rtc_readl(rtcdrv, RTC_CN));
 
        rtc_time_to_tm(rtcdrv->overflow_rtc << (BITS_PER_LONG - RTC_SHIFT) |
                                        tmp_rtc >> RTC_SHIFT, tm);
@@ -187,10 +198,8 @@ static int sirfsoc_rtc_set_time(struct device *dev,
 
        rtcdrv->overflow_rtc = rtc_time >> (BITS_PER_LONG - RTC_SHIFT);
 
-       sirfsoc_rtc_iobrg_writel(rtcdrv->overflow_rtc,
-                       rtcdrv->rtc_base + RTC_SW_VALUE);
-       sirfsoc_rtc_iobrg_writel(
-                       rtc_time << RTC_SHIFT, rtcdrv->rtc_base + RTC_CN);
+       sirfsoc_rtc_writel(rtcdrv, RTC_SW_VALUE, rtcdrv->overflow_rtc);
+       sirfsoc_rtc_writel(rtcdrv, RTC_CN, rtc_time << RTC_SHIFT);
 
        return 0;
 }
@@ -222,14 +231,13 @@ static int sirfsoc_rtc_alarm_irq_enable(struct device *dev,
 
        spin_lock_irq(&rtcdrv->lock);
 
-       rtc_status_reg = sirfsoc_rtc_iobrg_readl(
-                               rtcdrv->rtc_base + RTC_STATUS);
+       rtc_status_reg = sirfsoc_rtc_readl(rtcdrv, RTC_STATUS);
        if (enabled)
                rtc_status_reg |= SIRFSOC_RTC_AL0E;
        else
                rtc_status_reg &= ~SIRFSOC_RTC_AL0E;
 
-       sirfsoc_rtc_iobrg_writel(rtc_status_reg, rtcdrv->rtc_base + RTC_STATUS);
+       sirfsoc_rtc_writel(rtcdrv, RTC_STATUS, rtc_status_reg);
 
        spin_unlock_irq(&rtcdrv->lock);
 
@@ -254,7 +262,7 @@ static irqreturn_t sirfsoc_rtc_irq_handler(int irq, void *pdata)
 
        spin_lock(&rtcdrv->lock);
 
-       rtc_status_reg = sirfsoc_rtc_iobrg_readl(rtcdrv->rtc_base + RTC_STATUS);
+       rtc_status_reg = sirfsoc_rtc_readl(rtcdrv, RTC_STATUS);
        /* this bit will be set ONLY if an alarm was active
         * and it expired NOW
         * So this is being used as an ASSERT
@@ -270,7 +278,8 @@ static irqreturn_t sirfsoc_rtc_irq_handler(int irq, void *pdata)
                /* Clear the Alarm enable bit */
                rtc_status_reg &= ~(SIRFSOC_RTC_AL0E);
        }
-       sirfsoc_rtc_iobrg_writel(rtc_status_reg, rtcdrv->rtc_base + RTC_STATUS);
+
+       sirfsoc_rtc_writel(rtcdrv, RTC_STATUS, rtc_status_reg);
 
        spin_unlock(&rtcdrv->lock);
 
@@ -287,6 +296,13 @@ static const struct of_device_id sirfsoc_rtc_of_match[] = {
        { .compatible = "sirf,prima2-sysrtc"},
        {},
 };
+
+const struct regmap_config sysrtc_regmap_config = {
+       .reg_bits = 32,
+       .val_bits = 32,
+       .fast_io = true,
+};
+
 MODULE_DEVICE_TABLE(of, sirfsoc_rtc_of_match);
 
 static int sirfsoc_rtc_probe(struct platform_device *pdev)
@@ -314,27 +330,35 @@ static int sirfsoc_rtc_probe(struct platform_device *pdev)
        /* Register rtc alarm as a wakeup source */
        device_init_wakeup(&pdev->dev, 1);
 
+       rtcdrv->regmap = devm_regmap_init_iobg(&pdev->dev,
+                       &sysrtc_regmap_config);
+       if (IS_ERR(rtcdrv->regmap)) {
+               err = PTR_ERR(rtcdrv->regmap);
+               dev_err(&pdev->dev, "Failed to allocate register map: %d\n",
+                       err);
+               return err;
+       }
+
        /*
         * Set SYS_RTC counter in RTC_HZ HZ Units
         * We are using 32K RTC crystal (32768 / RTC_HZ / 2) -1
         * If 16HZ, therefore RTC_DIV = 1023;
         */
        rtc_div = ((32768 / RTC_HZ) / 2) - 1;
-       sirfsoc_rtc_iobrg_writel(rtc_div, rtcdrv->rtc_base + RTC_DIV);
+       sirfsoc_rtc_writel(rtcdrv, RTC_DIV, rtc_div);
 
        /* 0x3 -> RTC_CLK */
-       sirfsoc_rtc_iobrg_writel(SIRFSOC_RTC_CLK,
-                       rtcdrv->rtc_base + RTC_CLOCK_SWITCH);
+       sirfsoc_rtc_writel(rtcdrv, RTC_CLOCK_SWITCH, SIRFSOC_RTC_CLK);
 
        /* reset SYS RTC ALARM0 */
-       sirfsoc_rtc_iobrg_writel(0x0, rtcdrv->rtc_base + RTC_ALARM0);
+       sirfsoc_rtc_writel(rtcdrv, RTC_ALARM0, 0x0);
 
        /* reset SYS RTC ALARM1 */
-       sirfsoc_rtc_iobrg_writel(0x0, rtcdrv->rtc_base + RTC_ALARM1);
+       sirfsoc_rtc_writel(rtcdrv, RTC_ALARM1, 0x0);
 
        /* Restore RTC Overflow From Register After Command Reboot */
        rtcdrv->overflow_rtc =
-               sirfsoc_rtc_iobrg_readl(rtcdrv->rtc_base + RTC_SW_VALUE);
+               sirfsoc_rtc_readl(rtcdrv, RTC_SW_VALUE);
 
        rtcdrv->rtc = devm_rtc_device_register(&pdev->dev, pdev->name,
                        &sirfsoc_rtc_ops, THIS_MODULE);
@@ -372,10 +396,10 @@ static int sirfsoc_rtc_suspend(struct device *dev)
 {
        struct sirfsoc_rtc_drv *rtcdrv = dev_get_drvdata(dev);
        rtcdrv->overflow_rtc =
-               sirfsoc_rtc_iobrg_readl(rtcdrv->rtc_base + RTC_SW_VALUE);
+               sirfsoc_rtc_readl(rtcdrv, RTC_SW_VALUE);
 
        rtcdrv->saved_counter =
-               sirfsoc_rtc_iobrg_readl(rtcdrv->rtc_base + RTC_CN);
+               sirfsoc_rtc_readl(rtcdrv, RTC_CN);
        rtcdrv->saved_overflow_rtc = rtcdrv->overflow_rtc;
        if (device_may_wakeup(dev) && !enable_irq_wake(rtcdrv->irq))
                rtcdrv->irq_wake = 1;
@@ -392,12 +416,10 @@ static int sirfsoc_rtc_resume(struct device *dev)
         * if resuming from a snapshot and the rtc power was lost,
         * restore the rtc settings
         */
-       if (SIRFSOC_RTC_CLK != sirfsoc_rtc_iobrg_readl(
-                       rtcdrv->rtc_base + RTC_CLOCK_SWITCH)) {
+       if (SIRFSOC_RTC_CLK != sirfsoc_rtc_readl(rtcdrv, RTC_CLOCK_SWITCH)) {
                u32 rtc_div;
                /* 0x3 -> RTC_CLK */
-               sirfsoc_rtc_iobrg_writel(SIRFSOC_RTC_CLK,
-                       rtcdrv->rtc_base + RTC_CLOCK_SWITCH);
+               sirfsoc_rtc_writel(rtcdrv, RTC_CLOCK_SWITCH, SIRFSOC_RTC_CLK);
                /*
                 * Set SYS_RTC counter in RTC_HZ HZ Units
                 * We are using 32K RTC crystal (32768 / RTC_HZ / 2) -1
@@ -405,13 +427,13 @@ static int sirfsoc_rtc_resume(struct device *dev)
                 */
                rtc_div = ((32768 / RTC_HZ) / 2) - 1;
 
-               sirfsoc_rtc_iobrg_writel(rtc_div, rtcdrv->rtc_base + RTC_DIV);
+               sirfsoc_rtc_writel(rtcdrv, RTC_DIV, rtc_div);
 
                /* reset SYS RTC ALARM0 */
-               sirfsoc_rtc_iobrg_writel(0x0, rtcdrv->rtc_base + RTC_ALARM0);
+               sirfsoc_rtc_writel(rtcdrv, RTC_ALARM0, 0x0);
 
                /* reset SYS RTC ALARM1 */
-               sirfsoc_rtc_iobrg_writel(0x0, rtcdrv->rtc_base + RTC_ALARM1);
+               sirfsoc_rtc_writel(rtcdrv, RTC_ALARM1, 0x0);
        }
        rtcdrv->overflow_rtc = rtcdrv->saved_overflow_rtc;
 
@@ -419,15 +441,14 @@ static int sirfsoc_rtc_resume(struct device *dev)
         * if current counter is small than previous,
         * it means overflow in sleep
         */
-       tmp = sirfsoc_rtc_iobrg_readl(rtcdrv->rtc_base + RTC_CN);
+       tmp = sirfsoc_rtc_readl(rtcdrv, RTC_CN);
        if (tmp <= rtcdrv->saved_counter)
                rtcdrv->overflow_rtc++;
        /*
         * The PWRC value may be changed during suspend, so restore the
         * overflow value from memory to the register
         */
-       sirfsoc_rtc_iobrg_writel(rtcdrv->overflow_rtc,
-                       rtcdrv->rtc_base + RTC_SW_VALUE);
+       sirfsoc_rtc_writel(rtcdrv, RTC_SW_VALUE, rtcdrv->overflow_rtc);
 
        if (device_may_wakeup(dev) && rtcdrv->irq_wake) {
                disable_irq_wake(rtcdrv->irq);
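As a quick sanity check of the divider formula that appears twice in this file, for the RTC_HZ of 16 assumed by the comments:

	rtc_div = ((32768 / RTC_HZ) / 2) - 1
	        = ((32768 / 16) / 2) - 1
	        = (2048 / 2) - 1
	        = 1023

which matches the "If 16HZ, therefore RTC_DIV = 1023" remark.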
index 0e93b714ee4148e221bf55b878930d8b20d4aad2..ba6a83b5b5c9a0859c8cd4527d625bd2333052ae 100644 (file)
@@ -254,7 +254,7 @@ static ssize_t stk17ta8_nvram_read(struct file *filp, struct kobject *kobj,
        void __iomem *ioaddr = pdata->ioaddr;
        ssize_t count;
 
-       for (count = 0; size > 0 && pos < RTC_OFFSET; count++, size--)
+       for (count = 0; count < size; count++)
                *buf++ = readb(ioaddr + pos++);
        return count;
 }
@@ -269,7 +269,7 @@ static ssize_t stk17ta8_nvram_write(struct file *filp, struct kobject *kobj,
        void __iomem *ioaddr = pdata->ioaddr;
        ssize_t count;
 
-       for (count = 0; size > 0 && pos < RTC_OFFSET; count++, size--)
+       for (count = 0; count < size; count++)
                writeb(*buf++, ioaddr + pos++);
        return count;
 }
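The simplified loops above drop the explicit pos/RTC_OFFSET bound; this relies on the sysfs core clamping the pos and size arguments to the declared size of the binary attribute before calling the handler, so the extra check was redundant. A rough illustration of where that size is declared (field values are assumptions, not part of this hunk):

	/* Illustrative only: with .size set, sysfs clamps reads/writes to it. */
	static struct bin_attribute stk17ta8_nvram_attr = {
		.attr = {
			.name = "nvram",
			.mode = S_IRUGO | S_IWUSR,
		},
		.size = RTC_OFFSET,
		.read = stk17ta8_nvram_read,
		.write = stk17ta8_nvram_write,
	};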
index babd43bf3ddc36d101171a62fa78445ef454db42..7273855ed02eff9eaad5c93663ebe67f030ebc71 100644 (file)
@@ -122,20 +122,8 @@ hctosys_show(struct device *dev, struct device_attribute *attr, char *buf)
 }
 static DEVICE_ATTR_RO(hctosys);
 
-static struct attribute *rtc_attrs[] = {
-       &dev_attr_name.attr,
-       &dev_attr_date.attr,
-       &dev_attr_time.attr,
-       &dev_attr_since_epoch.attr,
-       &dev_attr_max_user_freq.attr,
-       &dev_attr_hctosys.attr,
-       NULL,
-};
-ATTRIBUTE_GROUPS(rtc);
-
 static ssize_t
-rtc_sysfs_show_wakealarm(struct device *dev, struct device_attribute *attr,
-               char *buf)
+wakealarm_show(struct device *dev, struct device_attribute *attr, char *buf)
 {
        ssize_t retval;
        unsigned long alarm;
@@ -159,7 +147,7 @@ rtc_sysfs_show_wakealarm(struct device *dev, struct device_attribute *attr,
 }
 
 static ssize_t
-rtc_sysfs_set_wakealarm(struct device *dev, struct device_attribute *attr,
+wakealarm_store(struct device *dev, struct device_attribute *attr,
                const char *buf, size_t n)
 {
        ssize_t retval;
@@ -221,45 +209,57 @@ rtc_sysfs_set_wakealarm(struct device *dev, struct device_attribute *attr,
        retval = rtc_set_alarm(rtc, &alm);
        return (retval < 0) ? retval : n;
 }
-static DEVICE_ATTR(wakealarm, S_IRUGO | S_IWUSR,
-               rtc_sysfs_show_wakealarm, rtc_sysfs_set_wakealarm);
+static DEVICE_ATTR_RW(wakealarm);
 
+static struct attribute *rtc_attrs[] = {
+       &dev_attr_name.attr,
+       &dev_attr_date.attr,
+       &dev_attr_time.attr,
+       &dev_attr_since_epoch.attr,
+       &dev_attr_max_user_freq.attr,
+       &dev_attr_hctosys.attr,
+       &dev_attr_wakealarm.attr,
+       NULL,
+};
 
 /* The reason to trigger an alarm with no process watching it (via sysfs)
  * is its side effect:  waking from a system state like suspend-to-RAM or
  * suspend-to-disk.  So: no attribute unless that side effect is possible.
  * (Userspace may disable that mechanism later.)
  */
-static inline int rtc_does_wakealarm(struct rtc_device *rtc)
+static bool rtc_does_wakealarm(struct rtc_device *rtc)
 {
        if (!device_can_wakeup(rtc->dev.parent))
-               return 0;
+               return false;
+
        return rtc->ops->set_alarm != NULL;
 }
 
-
-void rtc_sysfs_add_device(struct rtc_device *rtc)
+static umode_t rtc_attr_is_visible(struct kobject *kobj,
+                                  struct attribute *attr, int n)
 {
-       int err;
+       struct device *dev = container_of(kobj, struct device, kobj);
+       struct rtc_device *rtc = to_rtc_device(dev);
+       umode_t mode = attr->mode;
 
-       /* not all RTCs support both alarms and wakeup */
-       if (!rtc_does_wakealarm(rtc))
-               return;
+       if (attr == &dev_attr_wakealarm.attr)
+               if (!rtc_does_wakealarm(rtc))
+                       mode = 0;
 
-       err = device_create_file(&rtc->dev, &dev_attr_wakealarm);
-       if (err)
-               dev_err(rtc->dev.parent,
-                       "failed to create alarm attribute, %d\n", err);
+       return mode;
 }
 
-void rtc_sysfs_del_device(struct rtc_device *rtc)
-{
-       /* REVISIT did we add it successfully? */
-       if (rtc_does_wakealarm(rtc))
-               device_remove_file(&rtc->dev, &dev_attr_wakealarm);
-}
+static struct attribute_group rtc_attr_group = {
+       .is_visible     = rtc_attr_is_visible,
+       .attrs          = rtc_attrs,
+};
+
+static const struct attribute_group *rtc_attr_groups[] = {
+       &rtc_attr_group,
+       NULL
+};
 
-void __init rtc_sysfs_init(struct class *rtc_class)
+const struct attribute_group **rtc_get_dev_attribute_groups(void)
 {
-       rtc_class->dev_groups = rtc_groups;
+       return rtc_attr_groups;
 }
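rtc_get_dev_attribute_groups() is presumably consumed by the RTC core when the device is set up; the usual pattern for attaching const attribute groups so that .is_visible is honoured looks roughly like this (a sketch of the generic mechanism, not the exact rtc core call site):

	/* Illustrative: assign the groups before the device is registered. */
	rtc->dev.groups = rtc_get_dev_attribute_groups();
	err = device_register(&rtc->dev);
	/*
	 * The driver core then walks each group and calls .is_visible for
	 * every attribute; returning 0 suppresses the sysfs file (here:
	 * wakealarm on devices that cannot wake the system).
	 */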
index cb7f94ede5165b96b4adbdb8b382af6f39d792a0..560d9a5e02253fe3ef5cf159c71d94198498b45c 100644 (file)
@@ -199,8 +199,7 @@ static ssize_t tx4939_rtc_nvram_read(struct file *filp, struct kobject *kobj,
        ssize_t count;
 
        spin_lock_irq(&pdata->lock);
-       for (count = 0; size > 0 && pos < TX4939_RTC_REG_RAMSIZE;
-            count++, size--) {
+       for (count = 0; count < size; count++) {
                __raw_writel(pos++, &rtcreg->adr);
                *buf++ = __raw_readl(&rtcreg->dat);
        }
@@ -218,8 +217,7 @@ static ssize_t tx4939_rtc_nvram_write(struct file *filp, struct kobject *kobj,
        ssize_t count;
 
        spin_lock_irq(&pdata->lock);
-       for (count = 0; size > 0 && pos < TX4939_RTC_REG_RAMSIZE;
-            count++, size--) {
+       for (count = 0; count < size; count++) {
                __raw_writel(pos++, &rtcreg->adr);
                __raw_writel(*buf++, &rtcreg->dat);
        }
index a58b6d17e6f074926c2437a44b3533a5fecfab17..27e896995e9b30e966b49a4794257f8b5da098a6 100644 (file)
@@ -271,6 +271,7 @@ static const struct of_device_id wmt_dt_ids[] = {
        { .compatible = "via,vt8500-rtc", },
        {}
 };
+MODULE_DEVICE_TABLE(of, wmt_dt_ids);
 
 static struct platform_driver vt8500_rtc_driver = {
        .probe          = vt8500_rtc_probe,
diff --git a/drivers/rtc/rtc-zynqmp.c b/drivers/rtc/rtc-zynqmp.c
new file mode 100644 (file)
index 0000000..8b28762
--- /dev/null
@@ -0,0 +1,279 @@
+/*
+ * Xilinx Zynq Ultrascale+ MPSoC Real Time Clock Driver
+ *
+ * Copyright (C) 2015 Xilinx, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/rtc.h>
+
+/* RTC Registers */
+#define RTC_SET_TM_WR          0x00
+#define RTC_SET_TM_RD          0x04
+#define RTC_CALIB_WR           0x08
+#define RTC_CALIB_RD           0x0C
+#define RTC_CUR_TM             0x10
+#define RTC_CUR_TICK           0x14
+#define RTC_ALRM               0x18
+#define RTC_INT_STS            0x20
+#define RTC_INT_MASK           0x24
+#define RTC_INT_EN             0x28
+#define RTC_INT_DIS            0x2C
+#define RTC_CTRL               0x40
+
+#define RTC_FR_EN              BIT(20)
+#define RTC_FR_DATSHIFT                16
+#define RTC_TICK_MASK          0xFFFF
+#define RTC_INT_SEC            BIT(0)
+#define RTC_INT_ALRM           BIT(1)
+#define RTC_OSC_EN             BIT(24)
+
+#define RTC_CALIB_DEF          0x198233
+#define RTC_CALIB_MASK         0x1FFFFF
+#define RTC_SEC_MAX_VAL                0xFFFFFFFF
+
+struct xlnx_rtc_dev {
+       struct rtc_device       *rtc;
+       void __iomem            *reg_base;
+       int                     alarm_irq;
+       int                     sec_irq;
+};
+
+static int xlnx_rtc_set_time(struct device *dev, struct rtc_time *tm)
+{
+       struct xlnx_rtc_dev *xrtcdev = dev_get_drvdata(dev);
+       unsigned long new_time;
+
+       new_time = rtc_tm_to_time64(tm);
+
+       if (new_time > RTC_SEC_MAX_VAL)
+               return -EINVAL;
+
+       writel(new_time, xrtcdev->reg_base + RTC_SET_TM_WR);
+
+       return 0;
+}
+
+static int xlnx_rtc_read_time(struct device *dev, struct rtc_time *tm)
+{
+       struct xlnx_rtc_dev *xrtcdev = dev_get_drvdata(dev);
+
+       rtc_time64_to_tm(readl(xrtcdev->reg_base + RTC_CUR_TM), tm);
+
+       return rtc_valid_tm(tm);
+}
+
+static int xlnx_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm)
+{
+       struct xlnx_rtc_dev *xrtcdev = dev_get_drvdata(dev);
+
+       rtc_time64_to_tm(readl(xrtcdev->reg_base + RTC_ALRM), &alrm->time);
+       alrm->enabled = readl(xrtcdev->reg_base + RTC_INT_MASK) & RTC_INT_ALRM;
+
+       return 0;
+}
+
+static int xlnx_rtc_alarm_irq_enable(struct device *dev, u32 enabled)
+{
+       struct xlnx_rtc_dev *xrtcdev = dev_get_drvdata(dev);
+
+       if (enabled)
+               writel(RTC_INT_ALRM, xrtcdev->reg_base + RTC_INT_EN);
+       else
+               writel(RTC_INT_ALRM, xrtcdev->reg_base + RTC_INT_DIS);
+
+       return 0;
+}
+
+static int xlnx_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
+{
+       struct xlnx_rtc_dev *xrtcdev = dev_get_drvdata(dev);
+       unsigned long alarm_time;
+
+       alarm_time = rtc_tm_to_time64(&alrm->time);
+
+       if (alarm_time > RTC_SEC_MAX_VAL)
+               return -EINVAL;
+
+       writel((u32)alarm_time, (xrtcdev->reg_base + RTC_ALRM));
+
+       xlnx_rtc_alarm_irq_enable(dev, alrm->enabled);
+
+       return 0;
+}
+
+static void xlnx_init_rtc(struct xlnx_rtc_dev *xrtcdev, u32 calibval)
+{
+       /*
+        * Based on a crystal frequency of 33.330 kHz, set and enable the
+        * seconds counter, and set the fractions counter to the default
+        * value suggested in the design spec to correct RTC frequency
+        * drift over time.
+        */
+       calibval &= RTC_CALIB_MASK;
+       writel(calibval, (xrtcdev->reg_base + RTC_CALIB_WR));
+}
+
+static const struct rtc_class_ops xlnx_rtc_ops = {
+       .set_time         = xlnx_rtc_set_time,
+       .read_time        = xlnx_rtc_read_time,
+       .read_alarm       = xlnx_rtc_read_alarm,
+       .set_alarm        = xlnx_rtc_set_alarm,
+       .alarm_irq_enable = xlnx_rtc_alarm_irq_enable,
+};
+
+static irqreturn_t xlnx_rtc_interrupt(int irq, void *id)
+{
+       struct xlnx_rtc_dev *xrtcdev = (struct xlnx_rtc_dev *)id;
+       unsigned int status;
+
+       status = readl(xrtcdev->reg_base + RTC_INT_STS);
+       /* Check if interrupt asserted */
+       if (!(status & (RTC_INT_SEC | RTC_INT_ALRM)))
+               return IRQ_NONE;
+
+       /* Clear interrupt */
+       writel(status, xrtcdev->reg_base + RTC_INT_STS);
+
+       if (status & RTC_INT_SEC)
+               rtc_update_irq(xrtcdev->rtc, 1, RTC_IRQF | RTC_UF);
+       if (status & RTC_INT_ALRM)
+               rtc_update_irq(xrtcdev->rtc, 1, RTC_IRQF | RTC_AF);
+
+       return IRQ_HANDLED;
+}
+
+static int xlnx_rtc_probe(struct platform_device *pdev)
+{
+       struct xlnx_rtc_dev *xrtcdev;
+       struct resource *res;
+       int ret;
+       unsigned int calibvalue;
+
+       xrtcdev = devm_kzalloc(&pdev->dev, sizeof(*xrtcdev), GFP_KERNEL);
+       if (!xrtcdev)
+               return -ENOMEM;
+
+       platform_set_drvdata(pdev, xrtcdev);
+
+       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+
+       xrtcdev->reg_base = devm_ioremap_resource(&pdev->dev, res);
+       if (IS_ERR(xrtcdev->reg_base))
+               return PTR_ERR(xrtcdev->reg_base);
+
+       xrtcdev->alarm_irq = platform_get_irq_byname(pdev, "alarm");
+       if (xrtcdev->alarm_irq < 0) {
+               dev_err(&pdev->dev, "no irq resource\n");
+               return xrtcdev->alarm_irq;
+       }
+       ret = devm_request_irq(&pdev->dev, xrtcdev->alarm_irq,
+                              xlnx_rtc_interrupt, 0,
+                              dev_name(&pdev->dev), xrtcdev);
+       if (ret) {
+               dev_err(&pdev->dev, "request irq failed\n");
+               return ret;
+       }
+
+       xrtcdev->sec_irq = platform_get_irq_byname(pdev, "sec");
+       if (xrtcdev->sec_irq < 0) {
+               dev_err(&pdev->dev, "no irq resource\n");
+               return xrtcdev->sec_irq;
+       }
+       ret = devm_request_irq(&pdev->dev, xrtcdev->sec_irq,
+                              xlnx_rtc_interrupt, 0,
+                              dev_name(&pdev->dev), xrtcdev);
+       if (ret) {
+               dev_err(&pdev->dev, "request irq failed\n");
+               return ret;
+       }
+
+       ret = of_property_read_u32(pdev->dev.of_node, "calibration",
+                                  &calibvalue);
+       if (ret)
+               calibvalue = RTC_CALIB_DEF;
+
+       xlnx_init_rtc(xrtcdev, calibvalue);
+
+       device_init_wakeup(&pdev->dev, 1);
+
+       xrtcdev->rtc = devm_rtc_device_register(&pdev->dev, pdev->name,
+                                        &xlnx_rtc_ops, THIS_MODULE);
+       return PTR_ERR_OR_ZERO(xrtcdev->rtc);
+}
+
+static int xlnx_rtc_remove(struct platform_device *pdev)
+{
+       xlnx_rtc_alarm_irq_enable(&pdev->dev, 0);
+       device_init_wakeup(&pdev->dev, 0);
+
+       return 0;
+}
+
+static int __maybe_unused xlnx_rtc_suspend(struct device *dev)
+{
+       struct platform_device *pdev = to_platform_device(dev);
+       struct xlnx_rtc_dev *xrtcdev = platform_get_drvdata(pdev);
+
+       if (device_may_wakeup(&pdev->dev))
+               enable_irq_wake(xrtcdev->alarm_irq);
+       else
+               xlnx_rtc_alarm_irq_enable(dev, 0);
+
+       return 0;
+}
+
+static int __maybe_unused xlnx_rtc_resume(struct device *dev)
+{
+       struct platform_device *pdev = to_platform_device(dev);
+       struct xlnx_rtc_dev *xrtcdev = platform_get_drvdata(pdev);
+
+       if (device_may_wakeup(&pdev->dev))
+               disable_irq_wake(xrtcdev->alarm_irq);
+       else
+               xlnx_rtc_alarm_irq_enable(dev, 1);
+
+       return 0;
+}
+
+static SIMPLE_DEV_PM_OPS(xlnx_rtc_pm_ops, xlnx_rtc_suspend, xlnx_rtc_resume);
+
+static const struct of_device_id xlnx_rtc_of_match[] = {
+       {.compatible = "xlnx,zynqmp-rtc" },
+       { }
+};
+MODULE_DEVICE_TABLE(of, xlnx_rtc_of_match);
+
+static struct platform_driver xlnx_rtc_driver = {
+       .probe          = xlnx_rtc_probe,
+       .remove         = xlnx_rtc_remove,
+       .driver         = {
+               .name   = KBUILD_MODNAME,
+               .pm     = &xlnx_rtc_pm_ops,
+               .of_match_table = xlnx_rtc_of_match,
+       },
+};
+
+module_platform_driver(xlnx_rtc_driver);
+
+MODULE_DESCRIPTION("Xilinx Zynq MPSoC RTC driver");
+MODULE_AUTHOR("Xilinx Inc.");
+MODULE_LICENSE("GPL v2");
index 2b744fbba68e04d699c31552a2ee5ca9c56ba86c..5ed44fe21380645f56c73dc0bef067bffc09c59e 100644 (file)
@@ -29,7 +29,7 @@ static int dcssblk_open(struct block_device *bdev, fmode_t mode);
 static void dcssblk_release(struct gendisk *disk, fmode_t mode);
 static void dcssblk_make_request(struct request_queue *q, struct bio *bio);
 static long dcssblk_direct_access(struct block_device *bdev, sector_t secnum,
-                                void **kaddr, unsigned long *pfn, long size);
+                        void __pmem **kaddr, unsigned long *pfn);
 
 static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0";
 
@@ -881,18 +881,20 @@ fail:
 
 static long
 dcssblk_direct_access (struct block_device *bdev, sector_t secnum,
-                       void **kaddr, unsigned long *pfn, long size)
+                       void __pmem **kaddr, unsigned long *pfn)
 {
        struct dcssblk_dev_info *dev_info;
        unsigned long offset, dev_sz;
+       void *addr;
 
        dev_info = bdev->bd_disk->private_data;
        if (!dev_info)
                return -ENODEV;
        dev_sz = dev_info->end - dev_info->start;
        offset = secnum * 512;
-       *kaddr = (void *) (dev_info->start + offset);
-       *pfn = virt_to_phys(*kaddr) >> PAGE_SHIFT;
+       addr = (void *) (dev_info->start + offset);
+       *pfn = virt_to_phys(addr) >> PAGE_SHIFT;
+       *kaddr = (void __pmem *) addr;
 
        return dev_sz - offset;
 }
index 01bf1f5cf2e95a7f40f722f51e4df87ea257e724..4eb45546a3aaf39421e6434a181890f6145fd604 100644 (file)
@@ -1206,16 +1206,8 @@ static void sprinthx(unsigned char *title, struct seq_file *m,
 static void sprinthx4(unsigned char *title, struct seq_file *m,
                      unsigned int *array, unsigned int len)
 {
-       int r;
-
        seq_printf(m, "\n%s\n", title);
-       for (r = 0; r < len; r++) {
-               if ((r % 8) == 0)
-                       seq_printf(m, "    ");
-               seq_printf(m, "%08X ", array[r]);
-               if ((r % 8) == 7)
-                       seq_putc(m, '\n');
-       }
+       seq_hex_dump(m, "    ", DUMP_PREFIX_NONE, 32, 4, array, len, false);
        seq_putc(m, '\n');
 }
 
index 471d0879176621ce411a66a706a35ef8d63a2171..1a8c9b53fafad5900ef775599a7f34abd34978eb 100644 (file)
@@ -172,6 +172,7 @@ scsi_mod-$(CONFIG_SYSCTL)   += scsi_sysctl.o
 scsi_mod-$(CONFIG_SCSI_PROC_FS)        += scsi_proc.o
 scsi_mod-y                     += scsi_trace.o scsi_logging.o
 scsi_mod-$(CONFIG_PM)          += scsi_pm.o
+scsi_mod-$(CONFIG_SCSI_DH)     += scsi_dh.o
 
 hv_storvsc-y                   := storvsc_drv.o
 
index 31e8576cbaab40ad16998735cee279131533780c..f6c336b05d5bbf6e0354c3f18697fd3350f3d09e 100644 (file)
@@ -100,12 +100,7 @@ static int asd_map_memio(struct asd_ha_struct *asd_ha)
                                   pci_name(asd_ha->pcidev));
                        goto Err;
                }
-               if (io_handle->flags & IORESOURCE_CACHEABLE)
-                       io_handle->addr = ioremap(io_handle->start,
-                                                 io_handle->len);
-               else
-                       io_handle->addr = ioremap_nocache(io_handle->start,
-                                                         io_handle->len);
+               io_handle->addr = ioremap(io_handle->start, io_handle->len);
                if (!io_handle->addr) {
                        asd_printk("couldn't map MBAR%d of %s\n", i==0?0:1,
                                   pci_name(asd_ha->pcidev));
index edb43fda9f36f34d5834f3f516fc9227aea67af6..c831e30411fa12c0e87714a09c9f6c1bf108fd38 100644 (file)
@@ -983,7 +983,7 @@ static int asd_process_ctrl_a_user(struct asd_ha_struct *asd_ha,
 {
        int err, i;
        u32 offs, size;
-       struct asd_ll_el *el;
+       struct asd_ll_el *el = NULL;
        struct asd_ctrla_phy_settings *ps;
        struct asd_ctrla_phy_settings dflt_ps;
 
@@ -1004,6 +1004,7 @@ static int asd_process_ctrl_a_user(struct asd_ha_struct *asd_ha,
 
                size = sizeof(struct asd_ctrla_phy_settings);
                ps = &dflt_ps;
+               goto out_process;
        }
 
        if (size == 0)
@@ -1028,7 +1029,7 @@ static int asd_process_ctrl_a_user(struct asd_ha_struct *asd_ha,
                ASD_DPRINTK("couldn't find ctrla phy settings struct\n");
                goto out2;
        }
-
+out_process:
        err = asd_process_ctrla_phy_settings(asd_ha, ps);
        if (err) {
                ASD_DPRINTK("couldn't process ctrla phy settings\n");
index 6ac74fb4ea9a5b96262da820afef2a367623b287..333db5953607e49b284f780f1b4532c81fe5ea01 100644 (file)
@@ -259,10 +259,7 @@ static bool arcmsr_remap_pciregion(struct AdapterControlBlock *acb)
                addr = (unsigned long)pci_resource_start(pdev, 0);
                range = pci_resource_len(pdev, 0);
                flags = pci_resource_flags(pdev, 0);
-               if (flags & IORESOURCE_CACHEABLE)
-                       mem_base0 = ioremap(addr, range);
-               else
-                       mem_base0 = ioremap_nocache(addr, range);
+               mem_base0 = ioremap(addr, range);
                if (!mem_base0) {
                        pr_notice("arcmsr%d: memory mapping region fail\n",
                                acb->host->host_no);
index 315d6d6dcfc868996e0a4fc83a1065d78cac7d55..98f7e8cca52df25e017887b3d491b2272d7a798e 100644 (file)
@@ -3665,19 +3665,19 @@ bfa_cb_sfp_state_query(struct bfa_sfp_s *sfp)
                if (sfp->state_query_cbfn)
                        sfp->state_query_cbfn(sfp->state_query_cbarg,
                                        sfp->status);
-                       sfp->media = NULL;
-               }
+               sfp->media = NULL;
+       }
 
-               if (sfp->portspeed) {
-                       sfp->status = bfa_sfp_speed_valid(sfp, sfp->portspeed);
-                       if (sfp->state_query_cbfn)
-                               sfp->state_query_cbfn(sfp->state_query_cbarg,
-                                               sfp->status);
-                               sfp->portspeed = BFA_PORT_SPEED_UNKNOWN;
-               }
+       if (sfp->portspeed) {
+               sfp->status = bfa_sfp_speed_valid(sfp, sfp->portspeed);
+               if (sfp->state_query_cbfn)
+                       sfp->state_query_cbfn(sfp->state_query_cbarg,
+                                       sfp->status);
+               sfp->portspeed = BFA_PORT_SPEED_UNKNOWN;
+       }
 
-               sfp->state_query_lock = 0;
-               sfp->state_query_cbfn = NULL;
+       sfp->state_query_lock = 0;
+       sfp->state_query_cbfn = NULL;
 }
 
 /*
@@ -3878,7 +3878,7 @@ bfa_sfp_show_comp(struct bfa_sfp_s *sfp, struct bfi_mbmsg_s *msg)
                bfa_trc(sfp, sfp->data_valid);
                if (sfp->data_valid) {
                        u32     size = sizeof(struct sfp_mem_s);
-                       u8 *des = (u8 *) &(sfp->sfpmem);
+                       u8 *des = (u8 *)(sfp->sfpmem);
                        memcpy(des, sfp->dbuf_kva, size);
                }
                /*
index 69abd0ad48e2d2d8a2c06350f2cbda6cd2d92af7..e5647d59224fca77a44b2ab8e3bf7662c457eb4c 100644 (file)
@@ -3,7 +3,7 @@
 #
 
 menuconfig SCSI_DH
-       tristate "SCSI Device Handlers"
+       bool "SCSI Device Handlers"
        depends on SCSI
        default n
        help
index e1d2ea083e159d0585d783a8c8757acb93ad0300..09866c50fbb4ab75bd7c3fd10138a71195e6aa49 100644 (file)
@@ -1,7 +1,6 @@
 #
 # SCSI Device Handler
 #
-obj-$(CONFIG_SCSI_DH)          += scsi_dh.o
 obj-$(CONFIG_SCSI_DH_RDAC)     += scsi_dh_rdac.o
 obj-$(CONFIG_SCSI_DH_HP_SW)    += scsi_dh_hp_sw.o
 obj-$(CONFIG_SCSI_DH_EMC)      += scsi_dh_emc.o
diff --git a/drivers/scsi/device_handler/scsi_dh.c b/drivers/scsi/device_handler/scsi_dh.c
deleted file mode 100644 (file)
index 1efebc9..0000000
+++ /dev/null
@@ -1,621 +0,0 @@
-/*
- * SCSI device handler infrastruture.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2 of the License, or (at your
- * option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
- * Copyright IBM Corporation, 2007
- *      Authors:
- *               Chandra Seetharaman <sekharan@us.ibm.com>
- *               Mike Anderson <andmike@linux.vnet.ibm.com>
- */
-
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <scsi/scsi_dh.h>
-#include "../scsi_priv.h"
-
-static DEFINE_SPINLOCK(list_lock);
-static LIST_HEAD(scsi_dh_list);
-
-static struct scsi_device_handler *get_device_handler(const char *name)
-{
-       struct scsi_device_handler *tmp, *found = NULL;
-
-       spin_lock(&list_lock);
-       list_for_each_entry(tmp, &scsi_dh_list, list) {
-               if (!strncmp(tmp->name, name, strlen(tmp->name))) {
-                       found = tmp;
-                       break;
-               }
-       }
-       spin_unlock(&list_lock);
-       return found;
-}
-
-/*
- * device_handler_match_function - Match a device handler to a device
- * @sdev - SCSI device to be tested
- *
- * Tests @sdev against the match function of all registered device_handler.
- * Returns the found device handler or NULL if not found.
- */
-static struct scsi_device_handler *
-device_handler_match_function(struct scsi_device *sdev)
-{
-       struct scsi_device_handler *tmp_dh, *found_dh = NULL;
-
-       spin_lock(&list_lock);
-       list_for_each_entry(tmp_dh, &scsi_dh_list, list) {
-               if (tmp_dh->match && tmp_dh->match(sdev)) {
-                       found_dh = tmp_dh;
-                       break;
-               }
-       }
-       spin_unlock(&list_lock);
-       return found_dh;
-}
-
-/*
- * device_handler_match - Attach a device handler to a device
- * @scsi_dh - The device handler to match against or NULL
- * @sdev - SCSI device to be tested against @scsi_dh
- *
- * Tests @sdev against the device handler @scsi_dh or against
- * all registered device_handler if @scsi_dh == NULL.
- * Returns the found device handler or NULL if not found.
- */
-static struct scsi_device_handler *
-device_handler_match(struct scsi_device_handler *scsi_dh,
-                    struct scsi_device *sdev)
-{
-       struct scsi_device_handler *found_dh;
-
-       found_dh = device_handler_match_function(sdev);
-
-       if (scsi_dh && found_dh != scsi_dh)
-               found_dh = NULL;
-
-       return found_dh;
-}
-
-/*
- * scsi_dh_handler_attach - Attach a device handler to a device
- * @sdev - SCSI device the device handler should attach to
- * @scsi_dh - The device handler to attach
- */
-static int scsi_dh_handler_attach(struct scsi_device *sdev,
-                                 struct scsi_device_handler *scsi_dh)
-{
-       struct scsi_dh_data *d;
-
-       if (sdev->scsi_dh_data) {
-               if (sdev->scsi_dh_data->scsi_dh != scsi_dh)
-                       return -EBUSY;
-
-               kref_get(&sdev->scsi_dh_data->kref);
-               return 0;
-       }
-
-       if (!try_module_get(scsi_dh->module))
-               return -EINVAL;
-
-       d = scsi_dh->attach(sdev);
-       if (IS_ERR(d)) {
-               sdev_printk(KERN_ERR, sdev, "%s: Attach failed (%ld)\n",
-                           scsi_dh->name, PTR_ERR(d));
-               module_put(scsi_dh->module);
-               return PTR_ERR(d);
-       }
-
-       d->scsi_dh = scsi_dh;
-       kref_init(&d->kref);
-       d->sdev = sdev;
-
-       spin_lock_irq(sdev->request_queue->queue_lock);
-       sdev->scsi_dh_data = d;
-       spin_unlock_irq(sdev->request_queue->queue_lock);
-       return 0;
-}
-
-static void __detach_handler (struct kref *kref)
-{
-       struct scsi_dh_data *scsi_dh_data =
-               container_of(kref, struct scsi_dh_data, kref);
-       struct scsi_device_handler *scsi_dh = scsi_dh_data->scsi_dh;
-       struct scsi_device *sdev = scsi_dh_data->sdev;
-
-       scsi_dh->detach(sdev);
-
-       spin_lock_irq(sdev->request_queue->queue_lock);
-       sdev->scsi_dh_data = NULL;
-       spin_unlock_irq(sdev->request_queue->queue_lock);
-
-       sdev_printk(KERN_NOTICE, sdev, "%s: Detached\n", scsi_dh->name);
-       module_put(scsi_dh->module);
-}
-
-/*
- * scsi_dh_handler_detach - Detach a device handler from a device
- * @sdev - SCSI device the device handler should be detached from
- * @scsi_dh - Device handler to be detached
- *
- * Detach from a device handler. If a device handler is specified,
- * only detach if the currently attached handler matches @scsi_dh.
- */
-static void scsi_dh_handler_detach(struct scsi_device *sdev,
-                                  struct scsi_device_handler *scsi_dh)
-{
-       if (!sdev->scsi_dh_data)
-               return;
-
-       if (scsi_dh && scsi_dh != sdev->scsi_dh_data->scsi_dh)
-               return;
-
-       if (!scsi_dh)
-               scsi_dh = sdev->scsi_dh_data->scsi_dh;
-
-       if (scsi_dh)
-               kref_put(&sdev->scsi_dh_data->kref, __detach_handler);
-}
-
-/*
- * Functions for sysfs attribute 'dh_state'
- */
-static ssize_t
-store_dh_state(struct device *dev, struct device_attribute *attr,
-              const char *buf, size_t count)
-{
-       struct scsi_device *sdev = to_scsi_device(dev);
-       struct scsi_device_handler *scsi_dh;
-       int err = -EINVAL;
-
-       if (sdev->sdev_state == SDEV_CANCEL ||
-           sdev->sdev_state == SDEV_DEL)
-               return -ENODEV;
-
-       if (!sdev->scsi_dh_data) {
-               /*
-                * Attach to a device handler
-                */
-               if (!(scsi_dh = get_device_handler(buf)))
-                       return err;
-               err = scsi_dh_handler_attach(sdev, scsi_dh);
-       } else {
-               scsi_dh = sdev->scsi_dh_data->scsi_dh;
-               if (!strncmp(buf, "detach", 6)) {
-                       /*
-                        * Detach from a device handler
-                        */
-                       scsi_dh_handler_detach(sdev, scsi_dh);
-                       err = 0;
-               } else if (!strncmp(buf, "activate", 8)) {
-                       /*
-                        * Activate a device handler
-                        */
-                       if (scsi_dh->activate)
-                               err = scsi_dh->activate(sdev, NULL, NULL);
-                       else
-                               err = 0;
-               }
-       }
-
-       return err<0?err:count;
-}
-
-static ssize_t
-show_dh_state(struct device *dev, struct device_attribute *attr, char *buf)
-{
-       struct scsi_device *sdev = to_scsi_device(dev);
-
-       if (!sdev->scsi_dh_data)
-               return snprintf(buf, 20, "detached\n");
-
-       return snprintf(buf, 20, "%s\n", sdev->scsi_dh_data->scsi_dh->name);
-}
-
-static struct device_attribute scsi_dh_state_attr =
-       __ATTR(dh_state, S_IRUGO | S_IWUSR, show_dh_state,
-              store_dh_state);
-
-/*
- * scsi_dh_sysfs_attr_add - Callback for scsi_init_dh
- */
-static int scsi_dh_sysfs_attr_add(struct device *dev, void *data)
-{
-       struct scsi_device *sdev;
-       int err;
-
-       if (!scsi_is_sdev_device(dev))
-               return 0;
-
-       sdev = to_scsi_device(dev);
-
-       err = device_create_file(&sdev->sdev_gendev,
-                                &scsi_dh_state_attr);
-
-       return 0;
-}
-
-/*
- * scsi_dh_sysfs_attr_remove - Callback for scsi_exit_dh
- */
-static int scsi_dh_sysfs_attr_remove(struct device *dev, void *data)
-{
-       struct scsi_device *sdev;
-
-       if (!scsi_is_sdev_device(dev))
-               return 0;
-
-       sdev = to_scsi_device(dev);
-
-       device_remove_file(&sdev->sdev_gendev,
-                          &scsi_dh_state_attr);
-
-       return 0;
-}
-
-/*
- * scsi_dh_notifier - notifier chain callback
- */
-static int scsi_dh_notifier(struct notifier_block *nb,
-                           unsigned long action, void *data)
-{
-       struct device *dev = data;
-       struct scsi_device *sdev;
-       int err = 0;
-       struct scsi_device_handler *devinfo = NULL;
-
-       if (!scsi_is_sdev_device(dev))
-               return 0;
-
-       sdev = to_scsi_device(dev);
-
-       if (action == BUS_NOTIFY_ADD_DEVICE) {
-               err = device_create_file(dev, &scsi_dh_state_attr);
-               /* don't care about err */
-               devinfo = device_handler_match(NULL, sdev);
-               if (devinfo)
-                       err = scsi_dh_handler_attach(sdev, devinfo);
-       } else if (action == BUS_NOTIFY_DEL_DEVICE) {
-               device_remove_file(dev, &scsi_dh_state_attr);
-               scsi_dh_handler_detach(sdev, NULL);
-       }
-       return err;
-}
-
-/*
- * scsi_dh_notifier_add - Callback for scsi_register_device_handler
- */
-static int scsi_dh_notifier_add(struct device *dev, void *data)
-{
-       struct scsi_device_handler *scsi_dh = data;
-       struct scsi_device *sdev;
-
-       if (!scsi_is_sdev_device(dev))
-               return 0;
-
-       if (!get_device(dev))
-               return 0;
-
-       sdev = to_scsi_device(dev);
-
-       if (device_handler_match(scsi_dh, sdev))
-               scsi_dh_handler_attach(sdev, scsi_dh);
-
-       put_device(dev);
-
-       return 0;
-}
-
-/*
- * scsi_dh_notifier_remove - Callback for scsi_unregister_device_handler
- */
-static int scsi_dh_notifier_remove(struct device *dev, void *data)
-{
-       struct scsi_device_handler *scsi_dh = data;
-       struct scsi_device *sdev;
-
-       if (!scsi_is_sdev_device(dev))
-               return 0;
-
-       if (!get_device(dev))
-               return 0;
-
-       sdev = to_scsi_device(dev);
-
-       scsi_dh_handler_detach(sdev, scsi_dh);
-
-       put_device(dev);
-
-       return 0;
-}
-
-/*
- * scsi_register_device_handler - register a device handler personality
- *      module.
- * @scsi_dh - device handler to be registered.
- *
- * Returns 0 on success, -EBUSY if handler already registered.
- */
-int scsi_register_device_handler(struct scsi_device_handler *scsi_dh)
-{
-
-       if (get_device_handler(scsi_dh->name))
-               return -EBUSY;
-
-       if (!scsi_dh->attach || !scsi_dh->detach)
-               return -EINVAL;
-
-       spin_lock(&list_lock);
-       list_add(&scsi_dh->list, &scsi_dh_list);
-       spin_unlock(&list_lock);
-
-       bus_for_each_dev(&scsi_bus_type, NULL, scsi_dh, scsi_dh_notifier_add);
-       printk(KERN_INFO "%s: device handler registered\n", scsi_dh->name);
-
-       return SCSI_DH_OK;
-}
-EXPORT_SYMBOL_GPL(scsi_register_device_handler);
-
-/*
- * scsi_unregister_device_handler - register a device handler personality
- *      module.
- * @scsi_dh - device handler to be unregistered.
- *
- * Returns 0 on success, -ENODEV if handler not registered.
- */
-int scsi_unregister_device_handler(struct scsi_device_handler *scsi_dh)
-{
-
-       if (!get_device_handler(scsi_dh->name))
-               return -ENODEV;
-
-       bus_for_each_dev(&scsi_bus_type, NULL, scsi_dh,
-                        scsi_dh_notifier_remove);
-
-       spin_lock(&list_lock);
-       list_del(&scsi_dh->list);
-       spin_unlock(&list_lock);
-       printk(KERN_INFO "%s: device handler unregistered\n", scsi_dh->name);
-
-       return SCSI_DH_OK;
-}
-EXPORT_SYMBOL_GPL(scsi_unregister_device_handler);
-
-/*
- * scsi_dh_activate - activate the path associated with the scsi_device
- *      corresponding to the given request queue.
- *     Returns immediately without waiting for activation to be completed.
- * @q    - Request queue that is associated with the scsi_device to be
- *         activated.
- * @fn   - Function to be called upon completion of the activation.
- *         Function fn is called with data (below) and the error code.
- *         Function fn may be called from the same calling context. So,
- *         do not hold the lock in the caller which may be needed in fn.
- * @data - data passed to the function fn upon completion.
- *
- */
-int scsi_dh_activate(struct request_queue *q, activate_complete fn, void *data)
-{
-       int err = 0;
-       unsigned long flags;
-       struct scsi_device *sdev;
-       struct scsi_device_handler *scsi_dh = NULL;
-       struct device *dev = NULL;
-
-       spin_lock_irqsave(q->queue_lock, flags);
-       sdev = q->queuedata;
-       if (!sdev) {
-               spin_unlock_irqrestore(q->queue_lock, flags);
-               err = SCSI_DH_NOSYS;
-               if (fn)
-                       fn(data, err);
-               return err;
-       }
-
-       if (sdev->scsi_dh_data)
-               scsi_dh = sdev->scsi_dh_data->scsi_dh;
-       dev = get_device(&sdev->sdev_gendev);
-       if (!scsi_dh || !dev ||
-           sdev->sdev_state == SDEV_CANCEL ||
-           sdev->sdev_state == SDEV_DEL)
-               err = SCSI_DH_NOSYS;
-       if (sdev->sdev_state == SDEV_OFFLINE)
-               err = SCSI_DH_DEV_OFFLINED;
-       spin_unlock_irqrestore(q->queue_lock, flags);
-
-       if (err) {
-               if (fn)
-                       fn(data, err);
-               goto out;
-       }
-
-       if (scsi_dh->activate)
-               err = scsi_dh->activate(sdev, fn, data);
-out:
-       put_device(dev);
-       return err;
-}
-EXPORT_SYMBOL_GPL(scsi_dh_activate);
-
-/*
- * scsi_dh_set_params - set the parameters for the device as per the
- *      string specified in params.
- * @q - Request queue that is associated with the scsi_device for
- *      which the parameters to be set.
- * @params - parameters in the following format
- *      "no_of_params\0param1\0param2\0param3\0...\0"
- *      for example, string for 2 parameters with value 10 and 21
- *      is specified as "2\010\021\0".
- */
-int scsi_dh_set_params(struct request_queue *q, const char *params)
-{
-       int err = -SCSI_DH_NOSYS;
-       unsigned long flags;
-       struct scsi_device *sdev;
-       struct scsi_device_handler *scsi_dh = NULL;
-
-       spin_lock_irqsave(q->queue_lock, flags);
-       sdev = q->queuedata;
-       if (sdev && sdev->scsi_dh_data)
-               scsi_dh = sdev->scsi_dh_data->scsi_dh;
-       if (scsi_dh && scsi_dh->set_params && get_device(&sdev->sdev_gendev))
-               err = 0;
-       spin_unlock_irqrestore(q->queue_lock, flags);
-
-       if (err)
-               return err;
-       err = scsi_dh->set_params(sdev, params);
-       put_device(&sdev->sdev_gendev);
-       return err;
-}
-EXPORT_SYMBOL_GPL(scsi_dh_set_params);
-
-/*
- * scsi_dh_handler_exist - Return TRUE(1) if a device handler exists for
- *     the given name. FALSE(0) otherwise.
- * @name - name of the device handler.
- */
-int scsi_dh_handler_exist(const char *name)
-{
-       return (get_device_handler(name) != NULL);
-}
-EXPORT_SYMBOL_GPL(scsi_dh_handler_exist);
-
-/*
- * scsi_dh_attach - Attach device handler
- * @q - Request queue that is associated with the scsi_device
- *      the handler should be attached to
- * @name - name of the handler to attach
- */
-int scsi_dh_attach(struct request_queue *q, const char *name)
-{
-       unsigned long flags;
-       struct scsi_device *sdev;
-       struct scsi_device_handler *scsi_dh;
-       int err = 0;
-
-       scsi_dh = get_device_handler(name);
-       if (!scsi_dh)
-               return -EINVAL;
-
-       spin_lock_irqsave(q->queue_lock, flags);
-       sdev = q->queuedata;
-       if (!sdev || !get_device(&sdev->sdev_gendev))
-               err = -ENODEV;
-       spin_unlock_irqrestore(q->queue_lock, flags);
-
-       if (!err) {
-               err = scsi_dh_handler_attach(sdev, scsi_dh);
-               put_device(&sdev->sdev_gendev);
-       }
-       return err;
-}
-EXPORT_SYMBOL_GPL(scsi_dh_attach);
-
-/*
- * scsi_dh_detach - Detach device handler
- * @q - Request queue that is associated with the scsi_device
- *      the handler should be detached from
- *
- * This function will detach the device handler only
- * if the sdev is not part of the internal list, ie
- * if it has been attached manually.
- */
-void scsi_dh_detach(struct request_queue *q)
-{
-       unsigned long flags;
-       struct scsi_device *sdev;
-       struct scsi_device_handler *scsi_dh = NULL;
-
-       spin_lock_irqsave(q->queue_lock, flags);
-       sdev = q->queuedata;
-       if (!sdev || !get_device(&sdev->sdev_gendev))
-               sdev = NULL;
-       spin_unlock_irqrestore(q->queue_lock, flags);
-
-       if (!sdev)
-               return;
-
-       if (sdev->scsi_dh_data) {
-               scsi_dh = sdev->scsi_dh_data->scsi_dh;
-               scsi_dh_handler_detach(sdev, scsi_dh);
-       }
-       put_device(&sdev->sdev_gendev);
-}
-EXPORT_SYMBOL_GPL(scsi_dh_detach);
-
-/*
- * scsi_dh_attached_handler_name - Get attached device handler's name
- * @q - Request queue that is associated with the scsi_device
- *      that may have a device handler attached
- * @gfp - the GFP mask used in the kmalloc() call when allocating memory
- *
- * Returns name of attached handler, NULL if no handler is attached.
- * Caller must take care to free the returned string.
- */
-const char *scsi_dh_attached_handler_name(struct request_queue *q, gfp_t gfp)
-{
-       unsigned long flags;
-       struct scsi_device *sdev;
-       const char *handler_name = NULL;
-
-       spin_lock_irqsave(q->queue_lock, flags);
-       sdev = q->queuedata;
-       if (!sdev || !get_device(&sdev->sdev_gendev))
-               sdev = NULL;
-       spin_unlock_irqrestore(q->queue_lock, flags);
-
-       if (!sdev)
-               return NULL;
-
-       if (sdev->scsi_dh_data)
-               handler_name = kstrdup(sdev->scsi_dh_data->scsi_dh->name, gfp);
-
-       put_device(&sdev->sdev_gendev);
-       return handler_name;
-}
-EXPORT_SYMBOL_GPL(scsi_dh_attached_handler_name);
-
-static struct notifier_block scsi_dh_nb = {
-       .notifier_call = scsi_dh_notifier
-};
-
-static int __init scsi_dh_init(void)
-{
-       int r;
-
-       r = bus_register_notifier(&scsi_bus_type, &scsi_dh_nb);
-
-       if (!r)
-               bus_for_each_dev(&scsi_bus_type, NULL, NULL,
-                                scsi_dh_sysfs_attr_add);
-
-       return r;
-}
-
-static void __exit scsi_dh_exit(void)
-{
-       bus_for_each_dev(&scsi_bus_type, NULL, NULL,
-                        scsi_dh_sysfs_attr_remove);
-       bus_unregister_notifier(&scsi_bus_type, &scsi_dh_nb);
-}
-
-module_init(scsi_dh_init);
-module_exit(scsi_dh_exit);
-
-MODULE_DESCRIPTION("SCSI device handler");
-MODULE_AUTHOR("Chandra Seetharaman <sekharan@us.ibm.com>");
-MODULE_LICENSE("GPL");
index 854b568b993157938b7a02d28ab40af072508268..cc2773b5de68f5c8751ddb25d940cc0c2eef9017 100644 (file)
@@ -62,7 +62,6 @@
 #define ALUA_OPTIMIZE_STPG             1
 
 struct alua_dh_data {
-       struct scsi_dh_data     dh_data;
        int                     group_id;
        int                     rel_port;
        int                     tpgs;
@@ -86,11 +85,6 @@ struct alua_dh_data {
 static char print_alua_state(int);
 static int alua_check_sense(struct scsi_device *, struct scsi_sense_hdr *);
 
-static inline struct alua_dh_data *get_alua_data(struct scsi_device *sdev)
-{
-       return container_of(sdev->scsi_dh_data, struct alua_dh_data, dh_data);
-}
-
 static int realloc_buffer(struct alua_dh_data *h, unsigned len)
 {
        if (h->buff && h->buff != h->inq)
@@ -708,7 +702,7 @@ out:
  */
 static int alua_set_params(struct scsi_device *sdev, const char *params)
 {
-       struct alua_dh_data *h = get_alua_data(sdev);
+       struct alua_dh_data *h = sdev->handler_data;
        unsigned int optimize = 0, argc;
        const char *p = params;
        int result = SCSI_DH_OK;
@@ -746,7 +740,7 @@ MODULE_PARM_DESC(optimize_stpg, "Allow use of a non-optimized path, rather than
 static int alua_activate(struct scsi_device *sdev,
                        activate_complete fn, void *data)
 {
-       struct alua_dh_data *h = get_alua_data(sdev);
+       struct alua_dh_data *h = sdev->handler_data;
        int err = SCSI_DH_OK;
        int stpg = 0;
 
@@ -804,7 +798,7 @@ out:
  */
 static int alua_prep_fn(struct scsi_device *sdev, struct request *req)
 {
-       struct alua_dh_data *h = get_alua_data(sdev);
+       struct alua_dh_data *h = sdev->handler_data;
        int ret = BLKPREP_OK;
 
        if (h->state == TPGS_STATE_TRANSITIONING)
@@ -819,23 +813,18 @@ static int alua_prep_fn(struct scsi_device *sdev, struct request *req)
 
 }
 
-static bool alua_match(struct scsi_device *sdev)
-{
-       return (scsi_device_tpgs(sdev) != 0);
-}
-
 /*
  * alua_bus_attach - Attach device handler
  * @sdev: device to be attached to
  */
-static struct scsi_dh_data *alua_bus_attach(struct scsi_device *sdev)
+static int alua_bus_attach(struct scsi_device *sdev)
 {
        struct alua_dh_data *h;
        int err;
 
        h = kzalloc(sizeof(*h) , GFP_KERNEL);
        if (!h)
-               return ERR_PTR(-ENOMEM);
+               return -ENOMEM;
        h->tpgs = TPGS_MODE_UNINITIALIZED;
        h->state = TPGS_STATE_OPTIMIZED;
        h->group_id = -1;
@@ -848,11 +837,11 @@ static struct scsi_dh_data *alua_bus_attach(struct scsi_device *sdev)
        if (err != SCSI_DH_OK && err != SCSI_DH_DEV_OFFLINED)
                goto failed;
 
-       sdev_printk(KERN_NOTICE, sdev, "%s: Attached\n", ALUA_DH_NAME);
-       return &h->dh_data;
+       sdev->handler_data = h;
+       return 0;
 failed:
        kfree(h);
-       return ERR_PTR(-EINVAL);
+       return -EINVAL;
 }
 
 /*
@@ -861,10 +850,11 @@ failed:
  */
 static void alua_bus_detach(struct scsi_device *sdev)
 {
-       struct alua_dh_data *h = get_alua_data(sdev);
+       struct alua_dh_data *h = sdev->handler_data;
 
        if (h->buff && h->inq != h->buff)
                kfree(h->buff);
+       sdev->handler_data = NULL;
        kfree(h);
 }
 
@@ -877,7 +867,6 @@ static struct scsi_device_handler alua_dh = {
        .check_sense = alua_check_sense,
        .activate = alua_activate,
        .set_params = alua_set_params,
-       .match = alua_match,
 };
 
 static int __init alua_init(void)
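The hunks above move alua's per-device state off the old embedded scsi_dh_data and onto sdev->handler_data, with attach now returning 0 or a negative errno instead of an ERR_PTR. A minimal sketch of that attach/detach contract, using a hypothetical demo handler (struct and names are illustrative only, not part of the driver):

#include <linux/slab.h>
#include <scsi/scsi_device.h>

struct demo_dh_data {
        int state;                      /* whatever per-device state the handler keeps */
};

static int demo_dh_attach(struct scsi_device *sdev)
{
        struct demo_dh_data *h = kzalloc(sizeof(*h), GFP_KERNEL);

        if (!h)
                return -ENOMEM;
        sdev->handler_data = h;         /* replaces the old embedded scsi_dh_data */
        return 0;
}

static void demo_dh_detach(struct scsi_device *sdev)
{
        kfree(sdev->handler_data);
        sdev->handler_data = NULL;
}

The clariion, hp_sw and rdac conversions below follow the same shape.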
index 6ed1caadbc6abacf2a26ab82cf745f69876fba11..e6fb97cb12f43a7128267702bf5d568699a80b3e 100644 (file)
@@ -72,7 +72,6 @@ static const char * lun_state[] =
 };
 
 struct clariion_dh_data {
-       struct scsi_dh_data dh_data;
        /*
         * Flags:
         *  CLARIION_SHORT_TRESPASS
@@ -114,13 +113,6 @@ struct clariion_dh_data {
        int current_sp;
 };
 
-static inline struct clariion_dh_data
-                       *get_clariion_data(struct scsi_device *sdev)
-{
-       return container_of(sdev->scsi_dh_data, struct clariion_dh_data,
-                       dh_data);
-}
-
 /*
  * Parse MODE_SELECT cmd reply.
  */
@@ -450,7 +442,7 @@ static int clariion_check_sense(struct scsi_device *sdev,
 
 static int clariion_prep_fn(struct scsi_device *sdev, struct request *req)
 {
-       struct clariion_dh_data *h = get_clariion_data(sdev);
+       struct clariion_dh_data *h = sdev->handler_data;
        int ret = BLKPREP_OK;
 
        if (h->lun_state != CLARIION_LUN_OWNED) {
@@ -533,7 +525,7 @@ retry:
 static int clariion_activate(struct scsi_device *sdev,
                                activate_complete fn, void *data)
 {
-       struct clariion_dh_data *csdev = get_clariion_data(sdev);
+       struct clariion_dh_data *csdev = sdev->handler_data;
        int result;
 
        result = clariion_send_inquiry(sdev, csdev);
@@ -574,7 +566,7 @@ done:
  */
 static int clariion_set_params(struct scsi_device *sdev, const char *params)
 {
-       struct clariion_dh_data *csdev = get_clariion_data(sdev);
+       struct clariion_dh_data *csdev = sdev->handler_data;
        unsigned int hr = 0, st = 0, argc;
        const char *p = params;
        int result = SCSI_DH_OK;
@@ -622,42 +614,14 @@ done:
        return result;
 }
 
-static const struct {
-       char *vendor;
-       char *model;
-} clariion_dev_list[] = {
-       {"DGC", "RAID"},
-       {"DGC", "DISK"},
-       {"DGC", "VRAID"},
-       {NULL, NULL},
-};
-
-static bool clariion_match(struct scsi_device *sdev)
-{
-       int i;
-
-       if (scsi_device_tpgs(sdev))
-               return false;
-
-       for (i = 0; clariion_dev_list[i].vendor; i++) {
-               if (!strncmp(sdev->vendor, clariion_dev_list[i].vendor,
-                       strlen(clariion_dev_list[i].vendor)) &&
-                   !strncmp(sdev->model, clariion_dev_list[i].model,
-                       strlen(clariion_dev_list[i].model))) {
-                       return true;
-               }
-       }
-       return false;
-}
-
-static struct scsi_dh_data *clariion_bus_attach(struct scsi_device *sdev)
+static int clariion_bus_attach(struct scsi_device *sdev)
 {
        struct clariion_dh_data *h;
        int err;
 
        h = kzalloc(sizeof(*h) , GFP_KERNEL);
        if (!h)
-               return ERR_PTR(-ENOMEM);
+               return -ENOMEM;
        h->lun_state = CLARIION_LUN_UNINITIALIZED;
        h->default_sp = CLARIION_UNBOUND_LU;
        h->current_sp = CLARIION_UNBOUND_LU;
@@ -675,18 +639,19 @@ static struct scsi_dh_data *clariion_bus_attach(struct scsi_device *sdev)
                    CLARIION_NAME, h->current_sp + 'A',
                    h->port, lun_state[h->lun_state],
                    h->default_sp + 'A');
-       return &h->dh_data;
+
+       sdev->handler_data = h;
+       return 0;
 
 failed:
        kfree(h);
-       return ERR_PTR(-EINVAL);
+       return -EINVAL;
 }
 
 static void clariion_bus_detach(struct scsi_device *sdev)
 {
-       struct clariion_dh_data *h = get_clariion_data(sdev);
-
-       kfree(h);
+       kfree(sdev->handler_data);
+       sdev->handler_data = NULL;
 }
 
 static struct scsi_device_handler clariion_dh = {
@@ -698,7 +663,6 @@ static struct scsi_device_handler clariion_dh = {
        .activate       = clariion_activate,
        .prep_fn        = clariion_prep_fn,
        .set_params     = clariion_set_params,
-       .match          = clariion_match,
 };
 
 static int __init clariion_init(void)
index 485d99544a1566f099f3c2c3a472134b986a9c5d..9406d5f4a3d3893b6ac8f20308a8ff812c90177f 100644 (file)
@@ -38,7 +38,6 @@
 #define HP_SW_PATH_PASSIVE             1
 
 struct hp_sw_dh_data {
-       struct scsi_dh_data dh_data;
        unsigned char sense[SCSI_SENSE_BUFFERSIZE];
        int path_state;
        int retries;
@@ -50,11 +49,6 @@ struct hp_sw_dh_data {
 
 static int hp_sw_start_stop(struct hp_sw_dh_data *);
 
-static inline struct hp_sw_dh_data *get_hp_sw_data(struct scsi_device *sdev)
-{
-       return container_of(sdev->scsi_dh_data, struct hp_sw_dh_data, dh_data);
-}
-
 /*
  * tur_done - Handle TEST UNIT READY return status
  * @sdev: sdev the command has been sent to
@@ -267,7 +261,7 @@ static int hp_sw_start_stop(struct hp_sw_dh_data *h)
 
 static int hp_sw_prep_fn(struct scsi_device *sdev, struct request *req)
 {
-       struct hp_sw_dh_data *h = get_hp_sw_data(sdev);
+       struct hp_sw_dh_data *h = sdev->handler_data;
        int ret = BLKPREP_OK;
 
        if (h->path_state != HP_SW_PATH_ACTIVE) {
@@ -292,7 +286,7 @@ static int hp_sw_activate(struct scsi_device *sdev,
                                activate_complete fn, void *data)
 {
        int ret = SCSI_DH_OK;
-       struct hp_sw_dh_data *h = get_hp_sw_data(sdev);
+       struct hp_sw_dh_data *h = sdev->handler_data;
 
        ret = hp_sw_tur(sdev, h);
 
@@ -311,43 +305,14 @@ static int hp_sw_activate(struct scsi_device *sdev,
        return 0;
 }
 
-static const struct {
-       char *vendor;
-       char *model;
-} hp_sw_dh_data_list[] = {
-       {"COMPAQ", "MSA1000 VOLUME"},
-       {"COMPAQ", "HSV110"},
-       {"HP", "HSV100"},
-       {"DEC", "HSG80"},
-       {NULL, NULL},
-};
-
-static bool hp_sw_match(struct scsi_device *sdev)
-{
-       int i;
-
-       if (scsi_device_tpgs(sdev))
-               return false;
-
-       for (i = 0; hp_sw_dh_data_list[i].vendor; i++) {
-               if (!strncmp(sdev->vendor, hp_sw_dh_data_list[i].vendor,
-                       strlen(hp_sw_dh_data_list[i].vendor)) &&
-                   !strncmp(sdev->model, hp_sw_dh_data_list[i].model,
-                       strlen(hp_sw_dh_data_list[i].model))) {
-                       return true;
-               }
-       }
-       return false;
-}
-
-static struct scsi_dh_data *hp_sw_bus_attach(struct scsi_device *sdev)
+static int hp_sw_bus_attach(struct scsi_device *sdev)
 {
        struct hp_sw_dh_data *h;
        int ret;
 
        h = kzalloc(sizeof(*h), GFP_KERNEL);
        if (!h)
-               return ERR_PTR(-ENOMEM);
+               return -ENOMEM;
        h->path_state = HP_SW_PATH_UNINITIALIZED;
        h->retries = HP_SW_RETRIES;
        h->sdev = sdev;
@@ -359,17 +324,18 @@ static struct scsi_dh_data *hp_sw_bus_attach(struct scsi_device *sdev)
        sdev_printk(KERN_INFO, sdev, "%s: attached to %s path\n",
                    HP_SW_NAME, h->path_state == HP_SW_PATH_ACTIVE?
                    "active":"passive");
-       return &h->dh_data;
+
+       sdev->handler_data = h;
+       return 0;
 failed:
        kfree(h);
-       return ERR_PTR(-EINVAL);
+       return -EINVAL;
 }
 
 static void hp_sw_bus_detach( struct scsi_device *sdev )
 {
-       struct hp_sw_dh_data *h = get_hp_sw_data(sdev);
-
-       kfree(h);
+       kfree(sdev->handler_data);
+       sdev->handler_data = NULL;
 }
 
 static struct scsi_device_handler hp_sw_dh = {
@@ -379,7 +345,6 @@ static struct scsi_device_handler hp_sw_dh = {
        .detach         = hp_sw_bus_detach,
        .activate       = hp_sw_activate,
        .prep_fn        = hp_sw_prep_fn,
-       .match          = hp_sw_match,
 };
 
 static int __init hp_sw_init(void)
index b46ace3d4bf0cd9c23f62f3422ca6b9eb7026435..3613581343159dc67e8980424b2346d301dec01f 100644 (file)
@@ -181,7 +181,6 @@ struct c2_inquiry {
 };
 
 struct rdac_dh_data {
-       struct scsi_dh_data     dh_data;
        struct rdac_controller  *ctlr;
 #define UNINITIALIZED_LUN      (1 << 8)
        unsigned                lun;
@@ -260,11 +259,6 @@ do { \
                sdev_printk(KERN_INFO, sdev, RDAC_NAME ": " f "\n", ## arg); \
 } while (0);
 
-static inline struct rdac_dh_data *get_rdac_data(struct scsi_device *sdev)
-{
-       return container_of(sdev->scsi_dh_data, struct rdac_dh_data, dh_data);
-}
-
 static struct request *get_rdac_req(struct scsi_device *sdev,
                        void *buffer, unsigned buflen, int rw)
 {
@@ -544,7 +538,7 @@ static int mode_select_handle_sense(struct scsi_device *sdev,
 {
        struct scsi_sense_hdr sense_hdr;
        int err = SCSI_DH_IO, ret;
-       struct rdac_dh_data *h = get_rdac_data(sdev);
+       struct rdac_dh_data *h = sdev->handler_data;
 
        ret = scsi_normalize_sense(sensebuf, SCSI_SENSE_BUFFERSIZE, &sense_hdr);
        if (!ret)
@@ -589,7 +583,7 @@ static void send_mode_select(struct work_struct *work)
                container_of(work, struct rdac_controller, ms_work);
        struct request *rq;
        struct scsi_device *sdev = ctlr->ms_sdev;
-       struct rdac_dh_data *h = get_rdac_data(sdev);
+       struct rdac_dh_data *h = sdev->handler_data;
        struct request_queue *q = sdev->request_queue;
        int err, retry_cnt = RDAC_RETRY_COUNT;
        struct rdac_queue_data *tmp, *qdata;
@@ -648,7 +642,7 @@ static int queue_mode_select(struct scsi_device *sdev,
        if (!qdata)
                return SCSI_DH_RETRY;
 
-       qdata->h = get_rdac_data(sdev);
+       qdata->h = sdev->handler_data;
        qdata->callback_fn = fn;
        qdata->callback_data = data;
 
@@ -667,7 +661,7 @@ static int queue_mode_select(struct scsi_device *sdev,
 static int rdac_activate(struct scsi_device *sdev,
                        activate_complete fn, void *data)
 {
-       struct rdac_dh_data *h = get_rdac_data(sdev);
+       struct rdac_dh_data *h = sdev->handler_data;
        int err = SCSI_DH_OK;
        int act = 0;
 
@@ -702,7 +696,7 @@ done:
 
 static int rdac_prep_fn(struct scsi_device *sdev, struct request *req)
 {
-       struct rdac_dh_data *h = get_rdac_data(sdev);
+       struct rdac_dh_data *h = sdev->handler_data;
        int ret = BLKPREP_OK;
 
        if (h->state != RDAC_STATE_ACTIVE) {
@@ -716,7 +710,7 @@ static int rdac_prep_fn(struct scsi_device *sdev, struct request *req)
 static int rdac_check_sense(struct scsi_device *sdev,
                                struct scsi_sense_hdr *sense_hdr)
 {
-       struct rdac_dh_data *h = get_rdac_data(sdev);
+       struct rdac_dh_data *h = sdev->handler_data;
 
        RDAC_LOG(RDAC_LOG_SENSE, sdev, "array %s, ctlr %d, "
                        "I/O returned with sense %02x/%02x/%02x",
@@ -778,56 +772,7 @@ static int rdac_check_sense(struct scsi_device *sdev,
        return SCSI_RETURN_NOT_HANDLED;
 }
 
-static const struct {
-       char *vendor;
-       char *model;
-} rdac_dev_list[] = {
-       {"IBM", "1722"},
-       {"IBM", "1724"},
-       {"IBM", "1726"},
-       {"IBM", "1742"},
-       {"IBM", "1745"},
-       {"IBM", "1746"},
-       {"IBM", "1813"},
-       {"IBM", "1814"},
-       {"IBM", "1815"},
-       {"IBM", "1818"},
-       {"IBM", "3526"},
-       {"SGI", "TP9"},
-       {"SGI", "IS"},
-       {"STK", "OPENstorage D280"},
-       {"STK", "FLEXLINE 380"},
-       {"SUN", "CSM"},
-       {"SUN", "LCSM100"},
-       {"SUN", "STK6580_6780"},
-       {"SUN", "SUN_6180"},
-       {"SUN", "ArrayStorage"},
-       {"DELL", "MD3"},
-       {"NETAPP", "INF-01-00"},
-       {"LSI", "INF-01-00"},
-       {"ENGENIO", "INF-01-00"},
-       {NULL, NULL},
-};
-
-static bool rdac_match(struct scsi_device *sdev)
-{
-       int i;
-
-       if (scsi_device_tpgs(sdev))
-               return false;
-
-       for (i = 0; rdac_dev_list[i].vendor; i++) {
-               if (!strncmp(sdev->vendor, rdac_dev_list[i].vendor,
-                       strlen(rdac_dev_list[i].vendor)) &&
-                   !strncmp(sdev->model, rdac_dev_list[i].model,
-                       strlen(rdac_dev_list[i].model))) {
-                       return true;
-               }
-       }
-       return false;
-}
-
-static struct scsi_dh_data *rdac_bus_attach(struct scsi_device *sdev)
+static int rdac_bus_attach(struct scsi_device *sdev)
 {
        struct rdac_dh_data *h;
        int err;
@@ -836,7 +781,7 @@ static struct scsi_dh_data *rdac_bus_attach(struct scsi_device *sdev)
 
        h = kzalloc(sizeof(*h) , GFP_KERNEL);
        if (!h)
-               return ERR_PTR(-ENOMEM);
+               return -ENOMEM;
        h->lun = UNINITIALIZED_LUN;
        h->state = RDAC_STATE_ACTIVE;
 
@@ -861,7 +806,8 @@ static struct scsi_dh_data *rdac_bus_attach(struct scsi_device *sdev)
                    RDAC_NAME, h->lun, mode[(int)h->mode],
                    lun_state[(int)h->lun_state]);
 
-       return &h->dh_data;
+       sdev->handler_data = h;
+       return 0;
 
 clean_ctlr:
        spin_lock(&list_lock);
@@ -870,12 +816,12 @@ clean_ctlr:
 
 failed:
        kfree(h);
-       return ERR_PTR(-EINVAL);
+       return -EINVAL;
 }
 
 static void rdac_bus_detach( struct scsi_device *sdev )
 {
-       struct rdac_dh_data *h = get_rdac_data(sdev);
+       struct rdac_dh_data *h = sdev->handler_data;
 
        if (h->ctlr && h->ctlr->ms_queued)
                flush_workqueue(kmpath_rdacd);
@@ -884,6 +830,7 @@ static void rdac_bus_detach( struct scsi_device *sdev )
        if (h->ctlr)
                kref_put(&h->ctlr->kref, release_controller);
        spin_unlock(&list_lock);
+       sdev->handler_data = NULL;
        kfree(h);
 }
 
@@ -895,7 +842,6 @@ static struct scsi_device_handler rdac_dh = {
        .attach = rdac_bus_attach,
        .detach = rdac_bus_detach,
        .activate = rdac_activate,
-       .match = rdac_match,
 };
 
 static int __init rdac_init(void)
index ec193a8357d70cdf3cbc0354bb7a10b92f7e3372..d3eb80c46bbe2224ac9cd82b2e3833f28e7ab297 100644 (file)
@@ -364,7 +364,7 @@ static int fcoe_interface_setup(struct fcoe_interface *fcoe,
         * on the ethertype for the given device
         */
        fcoe->fcoe_packet_type.func = fcoe_rcv;
-       fcoe->fcoe_packet_type.type = __constant_htons(ETH_P_FCOE);
+       fcoe->fcoe_packet_type.type = htons(ETH_P_FCOE);
        fcoe->fcoe_packet_type.dev = netdev;
        dev_add_pack(&fcoe->fcoe_packet_type);
 
index 341191952155d75700173e29a2caa8ad856a24cd..b62836ddbbee55c1fc7d6c65f4926a281fff387b 100644 (file)
@@ -4555,7 +4555,7 @@ static ssize_t ipr_store_raw_mode(struct device *dev,
        spin_lock_irqsave(ioa_cfg->host->host_lock, lock_flags);
        res = (struct ipr_resource_entry *)sdev->hostdata;
        if (res) {
-               if (ioa_cfg->sis64 && ipr_is_af_dasd_device(res)) {
+               if (ipr_is_af_dasd_device(res)) {
                        res->raw_mode = simple_strtoul(buf, NULL, 10);
                        len = strlen(buf);
                        if (res->sdev)
@@ -6383,9 +6383,13 @@ static int ipr_queuecommand(struct Scsi_Host *shost,
            (!ipr_is_gscsi(res) || scsi_cmd->cmnd[0] == IPR_QUERY_RSRC_STATE)) {
                ioarcb->cmd_pkt.request_type = IPR_RQTYPE_IOACMD;
        }
-       if (res->raw_mode && ipr_is_af_dasd_device(res))
+       if (res->raw_mode && ipr_is_af_dasd_device(res)) {
                ioarcb->cmd_pkt.request_type = IPR_RQTYPE_PIPE;
 
+               if (scsi_cmd->underflow == 0)
+                       ioarcb->cmd_pkt.flags_hi |= IPR_FLAGS_HI_NO_ULEN_CHK;
+       }
+
        if (ioa_cfg->sis64)
                rc = ipr_build_ioadl64(ioa_cfg, ipr_cmd);
        else
index 98d9bb6ff725ff46621a408bdf1f208175e21daf..33c74d3436c947a7f11ca22498206f6efa97fcc2 100644 (file)
@@ -853,12 +853,9 @@ static void iscsi_scsi_cmd_rsp(struct iscsi_conn *conn, struct iscsi_hdr *hdr,
                                     SAM_STAT_CHECK_CONDITION;
                        scsi_build_sense_buffer(1, sc->sense_buffer,
                                                ILLEGAL_REQUEST, 0x10, ascq);
-                       sc->sense_buffer[7] = 0xc; /* Additional sense length */
-                       sc->sense_buffer[8] = 0;   /* Information desc type */
-                       sc->sense_buffer[9] = 0xa; /* Additional desc length */
-                       sc->sense_buffer[10] = 0x80; /* Validity bit */
-
-                       put_unaligned_be64(sector, &sc->sense_buffer[12]);
+                       scsi_set_sense_information(sc->sense_buffer,
+                                                  SCSI_SENSE_BUFFERSIZE,
+                                                  sector);
                        goto out;
                }
        }
index eb627724417e14432ca122d2e29feed1014ce04f..4abb93a83e0ffac5dc3d3d1f17174b7205d6b6ec 100644 (file)
@@ -2284,7 +2284,7 @@ lpfc_mbx_cmpl_rdp_page_a2(struct lpfc_hba *phba, LPFC_MBOXQ_t *mbox)
                        (struct lpfc_rdp_context *)(mbox->context2);
 
        if (bf_get(lpfc_mqe_status, &mbox->u.mqe))
-               goto error;
+               goto error_mbuf_free;
 
        lpfc_sli_bemem_bcopy(mp->virt, &rdp_context->page_a2,
                                DMP_SFF_PAGE_A2_SIZE);
@@ -2299,13 +2299,14 @@ lpfc_mbx_cmpl_rdp_page_a2(struct lpfc_hba *phba, LPFC_MBOXQ_t *mbox)
        mbox->mbox_cmpl = lpfc_mbx_cmpl_rdp_link_stat;
        mbox->context2 = (struct lpfc_rdp_context *) rdp_context;
        if (lpfc_sli_issue_mbox(phba, mbox, MBX_NOWAIT) == MBX_NOT_FINISHED)
-               goto error;
+               goto error_cmd_free;
 
        return;
 
-error:
+error_mbuf_free:
        lpfc_mbuf_free(phba, mp->virt, mp->phys);
        kfree(mp);
+error_cmd_free:
        lpfc_sli4_mbox_cmd_free(phba, mbox);
        rdp_context->cmpl(phba, rdp_context, FAILURE);
 }
index 6dec7cff316f46c377e015e5ad5424339089040e..c167911221e96af131039171ecc7a07e4e34a987 100644 (file)
@@ -112,9 +112,12 @@ _scsih_set_fwfault_debug(const char *val, struct kernel_param *kp)
        if (ret)
                return ret;
 
+       /* global ioc spinlock to protect controller list on list operations */
        printk(KERN_INFO "setting fwfault_debug(%d)\n", mpt2sas_fwfault_debug);
+       spin_lock(&gioc_lock);
        list_for_each_entry(ioc, &mpt2sas_ioc_list, list)
                ioc->fwfault_debug = mpt2sas_fwfault_debug;
+       spin_unlock(&gioc_lock);
        return 0;
 }
 
@@ -4437,6 +4440,8 @@ mpt2sas_base_free_resources(struct MPT2SAS_ADAPTER *ioc)
        dexitprintk(ioc, printk(MPT2SAS_INFO_FMT "%s\n", ioc->name,
            __func__));
 
+       /* synchronizing freeing resource with pci_access_mutex lock */
+       mutex_lock(&ioc->pci_access_mutex);
        if (ioc->chip_phys && ioc->chip) {
                _base_mask_interrupts(ioc);
                ioc->shost_recovery = 1;
@@ -4456,6 +4461,7 @@ mpt2sas_base_free_resources(struct MPT2SAS_ADAPTER *ioc)
                pci_disable_pcie_error_reporting(pdev);
                pci_disable_device(pdev);
        }
+       mutex_unlock(&ioc->pci_access_mutex);
        return;
 }
 
index caff8d10cca42dae8d6433e07934f30cbfe61764..97ea360c6920e20f13160b3d96610f84f04ab0b7 100644 (file)
  * @flags: MPT_TARGET_FLAGS_XXX flags
  * @deleted: target flaged for deletion
  * @tm_busy: target is busy with TM request.
+ * @sdev: The sas_device associated with this target
  */
 struct MPT2SAS_TARGET {
        struct scsi_target *starget;
@@ -248,6 +249,7 @@ struct MPT2SAS_TARGET {
        u32     flags;
        u8      deleted;
        u8      tm_busy;
+       struct _sas_device *sdev;
 };
 
 
@@ -376,8 +378,24 @@ struct _sas_device {
        u8      phy;
        u8      responding;
        u8      pfa_led_on;
+       struct kref refcount;
 };
 
+static inline void sas_device_get(struct _sas_device *s)
+{
+       kref_get(&s->refcount);
+}
+
+static inline void sas_device_free(struct kref *r)
+{
+       kfree(container_of(r, struct _sas_device, refcount));
+}
+
+static inline void sas_device_put(struct _sas_device *s)
+{
+       kref_put(&s->refcount, sas_device_free);
+}
+
 /**
  * struct _raid_device - raid volume link list
  * @list: sas device list
@@ -799,6 +817,12 @@ typedef void (*MPT2SAS_FLUSH_RUNNING_CMDS)(struct MPT2SAS_ADAPTER *ioc);
  * @delayed_tr_list: target reset link list
  * @delayed_tr_volume_list: volume target reset link list
  * @@temp_sensors_count: flag to carry the number of temperature sensors
+ * @pci_access_mutex: Mutex to synchronize the ioctl and sysfs show paths with
+ * PCI resource handling. Freeing the PCI resources releases vital
+ * hardware/memory resources that might still be in use by the cli/sysfs
+ * path functions, resulting in a NULL pointer dereference followed by a
+ * kernel crash. To avoid this race condition the mutex synchronizes the
+ * cli/sysfs_show path with PCI resource freeing.
  */
 struct MPT2SAS_ADAPTER {
        struct list_head list;
@@ -1015,6 +1039,7 @@ struct MPT2SAS_ADAPTER {
        u8              mfg_pg10_hide_flag;
        u8              hide_drives;
 
+       struct mutex pci_access_mutex;
 };
 
 typedef u8 (*MPT_CALLBACK)(struct MPT2SAS_ADAPTER *ioc, u16 smid, u8 msix_index,
@@ -1023,6 +1048,17 @@ typedef u8 (*MPT_CALLBACK)(struct MPT2SAS_ADAPTER *ioc, u16 smid, u8 msix_index,
 
 /* base shared API */
 extern struct list_head mpt2sas_ioc_list;
+/* spinlock protecting list operations over the IOCs
+ * Case: when multiple warpdrive cards (IOCs) are in use.
+ * Each IOC is added to the ioc list structure on initialization.
+ * Watchdog threads run at regular intervals to check each IOC for any
+ * fault condition, which will trigger the dead_ioc thread to
+ * deallocate PCI resources and delete the IOC entry from the list;
+ * this deletion needs to be protected by the spinlock to ensure that
+ * IOC removal is synchronized. If it is not synchronized it may lead to
+ * list_del corruption, as the ioc list is also traversed in the cli path.
+ */
+extern spinlock_t gioc_lock;
 void mpt2sas_base_start_watchdog(struct MPT2SAS_ADAPTER *ioc);
 void mpt2sas_base_stop_watchdog(struct MPT2SAS_ADAPTER *ioc);
 
@@ -1095,11 +1131,12 @@ struct _sas_node *mpt2sas_scsih_expander_find_by_handle(struct MPT2SAS_ADAPTER *
     u16 handle);
 struct _sas_node *mpt2sas_scsih_expander_find_by_sas_address(struct MPT2SAS_ADAPTER
     *ioc, u64 sas_address);
-struct _sas_device *mpt2sas_scsih_sas_device_find_by_sas_address(
+struct _sas_device *mpt2sas_get_sdev_by_addr(
+    struct MPT2SAS_ADAPTER *ioc, u64 sas_address);
+struct _sas_device *__mpt2sas_get_sdev_by_addr(
     struct MPT2SAS_ADAPTER *ioc, u64 sas_address);
 
 void mpt2sas_port_enable_complete(struct MPT2SAS_ADAPTER *ioc);
-
 void mpt2sas_scsih_reset_handler(struct MPT2SAS_ADAPTER *ioc, int reset_phase);
 
 /* config shared API */
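The header additions above combine two mechanisms: the global mpt2sas_ioc_list guarded by the gioc_lock spinlock, and kref-based reference counting on struct _sas_device so that lookups hand out counted references. A self-contained sketch of that lookup pattern with hypothetical demo_* names (not the driver's own API):

#include <linux/kref.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>

struct demo_dev {
        struct list_head list;
        struct kref refcount;
        u64 address;
};

static LIST_HEAD(demo_list);
static DEFINE_SPINLOCK(demo_lock);

static void demo_release(struct kref *r)
{
        kfree(container_of(r, struct demo_dev, refcount));
}

/* lookup takes a reference; the caller drops it with kref_put() when done */
static struct demo_dev *demo_get_by_addr(u64 address)
{
        struct demo_dev *d, *found = NULL;
        unsigned long flags;

        spin_lock_irqsave(&demo_lock, flags);
        list_for_each_entry(d, &demo_list, list) {
                if (d->address == address) {
                        kref_get(&d->refcount);
                        found = d;
                        break;
                }
        }
        spin_unlock_irqrestore(&demo_lock, flags);
        return found;
}

Callers pair each successful demo_get_by_addr() with kref_put(&d->refcount, demo_release), mirroring the sas_device_get()/sas_device_put() helpers above.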
index 4e509604b5716483c0c52a734b4776a637bb44b2..3694b63bd9931520bb9320fb121a2cfb353a2f35 100644 (file)
@@ -427,13 +427,16 @@ static int
 _ctl_verify_adapter(int ioc_number, struct MPT2SAS_ADAPTER **iocpp)
 {
        struct MPT2SAS_ADAPTER *ioc;
-
+       /* global ioc lock to protect controller on list operations */
+       spin_lock(&gioc_lock);
        list_for_each_entry(ioc, &mpt2sas_ioc_list, list) {
                if (ioc->id != ioc_number)
                        continue;
+               spin_unlock(&gioc_lock);
                *iocpp = ioc;
                return ioc_number;
        }
+       spin_unlock(&gioc_lock);
        *iocpp = NULL;
        return -1;
 }
@@ -522,10 +525,15 @@ _ctl_poll(struct file *filep, poll_table *wait)
 
        poll_wait(filep, &ctl_poll_wait, wait);
 
+       /* global ioc lock to protect controller on list operations */
+       spin_lock(&gioc_lock);
        list_for_each_entry(ioc, &mpt2sas_ioc_list, list) {
-               if (ioc->aen_event_read_flag)
+               if (ioc->aen_event_read_flag) {
+                       spin_unlock(&gioc_lock);
                        return POLLIN | POLLRDNORM;
+               }
        }
+       spin_unlock(&gioc_lock);
        return 0;
 }
 
@@ -2168,16 +2176,23 @@ _ctl_ioctl_main(struct file *file, unsigned int cmd, void __user *arg,
 
        if (_ctl_verify_adapter(ioctl_header.ioc_number, &ioc) == -1 || !ioc)
                return -ENODEV;
+       /* pci_access_mutex lock acquired by ioctl path */
+       mutex_lock(&ioc->pci_access_mutex);
        if (ioc->shost_recovery || ioc->pci_error_recovery ||
-           ioc->is_driver_loading)
-               return -EAGAIN;
+               ioc->is_driver_loading || ioc->remove_host) {
+               ret = -EAGAIN;
+               goto out_unlock_pciaccess;
+       }
 
        state = (file->f_flags & O_NONBLOCK) ? NON_BLOCKING : BLOCKING;
        if (state == NON_BLOCKING) {
-               if (!mutex_trylock(&ioc->ctl_cmds.mutex))
-                       return -EAGAIN;
+               if (!mutex_trylock(&ioc->ctl_cmds.mutex)) {
+                       ret = -EAGAIN;
+                       goto out_unlock_pciaccess;
+               }
        } else if (mutex_lock_interruptible(&ioc->ctl_cmds.mutex)) {
-               return -ERESTARTSYS;
+               ret = -ERESTARTSYS;
+               goto out_unlock_pciaccess;
        }
 
        switch (cmd) {
@@ -2258,6 +2273,8 @@ _ctl_ioctl_main(struct file *file, unsigned int cmd, void __user *arg,
        }
 
        mutex_unlock(&ioc->ctl_cmds.mutex);
+out_unlock_pciaccess:
+       mutex_unlock(&ioc->pci_access_mutex);
        return ret;
 }
 
@@ -2711,6 +2728,12 @@ _ctl_BRM_status_show(struct device *cdev, struct device_attribute *attr,
                    "warpdrive\n", ioc->name, __func__);
                goto out;
        }
+       /* pci_access_mutex lock acquired by sysfs show path */
+       mutex_lock(&ioc->pci_access_mutex);
+       if (ioc->pci_error_recovery || ioc->remove_host) {
+               mutex_unlock(&ioc->pci_access_mutex);
+               return 0;
+       }
 
        /* allocate upto GPIOVal 36 entries */
        sz = offsetof(Mpi2IOUnitPage3_t, GPIOVal) + (sizeof(u16) * 36);
@@ -2749,6 +2772,7 @@ _ctl_BRM_status_show(struct device *cdev, struct device_attribute *attr,
 
  out:
        kfree(io_unit_pg3);
+       mutex_unlock(&ioc->pci_access_mutex);
        return rc;
 }
 static DEVICE_ATTR(BRM_status, S_IRUGO, _ctl_BRM_status_show, NULL);
index 3f26147bbc646535c643adf961001855a1797f96..0ad09b2bff9c69d86e11b9416f64e96f19e804bc 100644 (file)
@@ -79,7 +79,8 @@ static int _scsih_scan_finished(struct Scsi_Host *shost, unsigned long time);
 
 /* global parameters */
 LIST_HEAD(mpt2sas_ioc_list);
-
+/* global ioc lock for list operations */
+DEFINE_SPINLOCK(gioc_lock);
 /* local parameters */
 static u8 scsi_io_cb_idx = -1;
 static u8 tm_cb_idx = -1;
@@ -176,9 +177,37 @@ struct fw_event_work {
        u8                      VP_ID;
        u8                      ignore;
        u16                     event;
+       struct kref             refcount;
        char                    event_data[0] __aligned(4);
 };
 
+static void fw_event_work_free(struct kref *r)
+{
+       kfree(container_of(r, struct fw_event_work, refcount));
+}
+
+static void fw_event_work_get(struct fw_event_work *fw_work)
+{
+       kref_get(&fw_work->refcount);
+}
+
+static void fw_event_work_put(struct fw_event_work *fw_work)
+{
+       kref_put(&fw_work->refcount, fw_event_work_free);
+}
+
+static struct fw_event_work *alloc_fw_event_work(int len)
+{
+       struct fw_event_work *fw_event;
+
+       fw_event = kzalloc(sizeof(*fw_event) + len, GFP_ATOMIC);
+       if (!fw_event)
+               return NULL;
+
+       kref_init(&fw_event->refcount);
+       return fw_event;
+}
+
 /* raid transport support */
 static struct raid_template *mpt2sas_raid_template;
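alloc_fw_event_work() above folds the reference count and the variable-length event payload into one allocation, so the payload lives exactly as long as the last reference. A sketch of that pattern with hypothetical names (the payload layout here is illustrative, not the driver's):

#include <linux/kref.h>
#include <linux/slab.h>

struct demo_msg {
        struct kref refcount;
        size_t len;
        char payload[];                 /* flexible array member, sized at allocation */
};

static void demo_msg_release(struct kref *r)
{
        kfree(container_of(r, struct demo_msg, refcount));
}

static struct demo_msg *demo_msg_alloc(size_t len, gfp_t gfp)
{
        struct demo_msg *m = kzalloc(sizeof(*m) + len, gfp);

        if (!m)
                return NULL;
        kref_init(&m->refcount);        /* caller owns the initial reference */
        m->len = len;
        return m;
}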
 
@@ -293,8 +322,10 @@ _scsih_set_debug_level(const char *val, struct kernel_param *kp)
                return ret;
 
        printk(KERN_INFO "setting logging_level(0x%08x)\n", logging_level);
+       spin_lock(&gioc_lock);
        list_for_each_entry(ioc, &mpt2sas_ioc_list, list)
                ioc->logging_level = logging_level;
+       spin_unlock(&gioc_lock);
        return 0;
 }
 module_param_call(logging_level, _scsih_set_debug_level, param_get_int,
@@ -526,8 +557,61 @@ _scsih_determine_boot_device(struct MPT2SAS_ADAPTER *ioc,
        }
 }
 
+static struct _sas_device *
+__mpt2sas_get_sdev_from_target(struct MPT2SAS_ADAPTER *ioc,
+               struct MPT2SAS_TARGET *tgt_priv)
+{
+       struct _sas_device *ret;
+
+       assert_spin_locked(&ioc->sas_device_lock);
+
+       ret = tgt_priv->sdev;
+       if (ret)
+               sas_device_get(ret);
+
+       return ret;
+}
+
+static struct _sas_device *
+mpt2sas_get_sdev_from_target(struct MPT2SAS_ADAPTER *ioc,
+               struct MPT2SAS_TARGET *tgt_priv)
+{
+       struct _sas_device *ret;
+       unsigned long flags;
+
+       spin_lock_irqsave(&ioc->sas_device_lock, flags);
+       ret = __mpt2sas_get_sdev_from_target(ioc, tgt_priv);
+       spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
+
+       return ret;
+}
+
+
+struct _sas_device *
+__mpt2sas_get_sdev_by_addr(struct MPT2SAS_ADAPTER *ioc,
+    u64 sas_address)
+{
+       struct _sas_device *sas_device;
+
+       assert_spin_locked(&ioc->sas_device_lock);
+
+       list_for_each_entry(sas_device, &ioc->sas_device_list, list)
+               if (sas_device->sas_address == sas_address)
+                       goto found_device;
+
+       list_for_each_entry(sas_device, &ioc->sas_device_init_list, list)
+               if (sas_device->sas_address == sas_address)
+                       goto found_device;
+
+       return NULL;
+
+found_device:
+       sas_device_get(sas_device);
+       return sas_device;
+}
+
 /**
- * mpt2sas_scsih_sas_device_find_by_sas_address - sas device search
+ * mpt2sas_get_sdev_by_addr - sas device search
  * @ioc: per adapter object
  * @sas_address: sas address
  * Context: Calling function should acquire ioc->sas_device_lock
@@ -536,24 +620,44 @@ _scsih_determine_boot_device(struct MPT2SAS_ADAPTER *ioc,
  * object.
  */
 struct _sas_device *
-mpt2sas_scsih_sas_device_find_by_sas_address(struct MPT2SAS_ADAPTER *ioc,
+mpt2sas_get_sdev_by_addr(struct MPT2SAS_ADAPTER *ioc,
     u64 sas_address)
 {
        struct _sas_device *sas_device;
+       unsigned long flags;
+
+       spin_lock_irqsave(&ioc->sas_device_lock, flags);
+       sas_device = __mpt2sas_get_sdev_by_addr(ioc,
+                       sas_address);
+       spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
+
+       return sas_device;
+}
+
+static struct _sas_device *
+__mpt2sas_get_sdev_by_handle(struct MPT2SAS_ADAPTER *ioc, u16 handle)
+{
+       struct _sas_device *sas_device;
+
+       assert_spin_locked(&ioc->sas_device_lock);
 
        list_for_each_entry(sas_device, &ioc->sas_device_list, list)
-               if (sas_device->sas_address == sas_address)
-                       return sas_device;
+               if (sas_device->handle == handle)
+                       goto found_device;
 
        list_for_each_entry(sas_device, &ioc->sas_device_init_list, list)
-               if (sas_device->sas_address == sas_address)
-                       return sas_device;
+               if (sas_device->handle == handle)
+                       goto found_device;
 
        return NULL;
+
+found_device:
+       sas_device_get(sas_device);
+       return sas_device;
 }
 
 /**
- * _scsih_sas_device_find_by_handle - sas device search
+ * mpt2sas_get_sdev_by_handle - sas device search
  * @ioc: per adapter object
  * @handle: sas device handle (assigned by firmware)
  * Context: Calling function should acquire ioc->sas_device_lock
@@ -562,19 +666,16 @@ mpt2sas_scsih_sas_device_find_by_sas_address(struct MPT2SAS_ADAPTER *ioc,
  * object.
  */
 static struct _sas_device *
-_scsih_sas_device_find_by_handle(struct MPT2SAS_ADAPTER *ioc, u16 handle)
+mpt2sas_get_sdev_by_handle(struct MPT2SAS_ADAPTER *ioc, u16 handle)
 {
        struct _sas_device *sas_device;
+       unsigned long flags;
 
-       list_for_each_entry(sas_device, &ioc->sas_device_list, list)
-               if (sas_device->handle == handle)
-                       return sas_device;
-
-       list_for_each_entry(sas_device, &ioc->sas_device_init_list, list)
-               if (sas_device->handle == handle)
-                       return sas_device;
+       spin_lock_irqsave(&ioc->sas_device_lock, flags);
+       sas_device = __mpt2sas_get_sdev_by_handle(ioc, handle);
+       spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
 
-       return NULL;
+       return sas_device;
 }
 
 /**
@@ -583,7 +684,7 @@ _scsih_sas_device_find_by_handle(struct MPT2SAS_ADAPTER *ioc, u16 handle)
  * @sas_device: the sas_device object
  * Context: This function will acquire ioc->sas_device_lock.
  *
- * Removing object and freeing associated memory from the ioc->sas_device_list.
+ * If sas_device is on the list, remove it and decrement its reference count.
  */
 static void
 _scsih_sas_device_remove(struct MPT2SAS_ADAPTER *ioc,
@@ -594,9 +695,15 @@ _scsih_sas_device_remove(struct MPT2SAS_ADAPTER *ioc,
        if (!sas_device)
                return;
 
+       /*
+        * The lock serializes access to the list, but we still need to verify
+        * that nobody removed the entry while we were waiting on the lock.
+        */
        spin_lock_irqsave(&ioc->sas_device_lock, flags);
-       list_del(&sas_device->list);
-       kfree(sas_device);
+       if (!list_empty(&sas_device->list)) {
+               list_del_init(&sas_device->list);
+               sas_device_put(sas_device);
+       }
        spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
 }
 
@@ -620,6 +727,7 @@ _scsih_sas_device_add(struct MPT2SAS_ADAPTER *ioc,
            sas_device->handle, (unsigned long long)sas_device->sas_address));
 
        spin_lock_irqsave(&ioc->sas_device_lock, flags);
+       sas_device_get(sas_device);
        list_add_tail(&sas_device->list, &ioc->sas_device_list);
        spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
 
@@ -659,6 +767,7 @@ _scsih_sas_device_init_add(struct MPT2SAS_ADAPTER *ioc,
            sas_device->handle, (unsigned long long)sas_device->sas_address));
 
        spin_lock_irqsave(&ioc->sas_device_lock, flags);
+       sas_device_get(sas_device);
        list_add_tail(&sas_device->list, &ioc->sas_device_init_list);
        _scsih_determine_boot_device(ioc, sas_device, 0);
        spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
@@ -1208,12 +1317,15 @@ _scsih_change_queue_depth(struct scsi_device *sdev, int qdepth)
                goto not_sata;
        if ((sas_target_priv_data->flags & MPT_TARGET_FLAGS_VOLUME))
                goto not_sata;
+
        spin_lock_irqsave(&ioc->sas_device_lock, flags);
-       sas_device = mpt2sas_scsih_sas_device_find_by_sas_address(ioc,
-          sas_device_priv_data->sas_target->sas_address);
-       if (sas_device && sas_device->device_info &
-           MPI2_SAS_DEVICE_INFO_SATA_DEVICE)
-               max_depth = MPT2SAS_SATA_QUEUE_DEPTH;
+       sas_device = __mpt2sas_get_sdev_from_target(ioc, sas_target_priv_data);
+       if (sas_device) {
+               if (sas_device->device_info & MPI2_SAS_DEVICE_INFO_SATA_DEVICE)
+                       max_depth = MPT2SAS_SATA_QUEUE_DEPTH;
+
+               sas_device_put(sas_device);
+       }
        spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
 
  not_sata:
@@ -1271,18 +1383,20 @@ _scsih_target_alloc(struct scsi_target *starget)
        /* sas/sata devices */
        spin_lock_irqsave(&ioc->sas_device_lock, flags);
        rphy = dev_to_rphy(starget->dev.parent);
-       sas_device = mpt2sas_scsih_sas_device_find_by_sas_address(ioc,
+       sas_device = __mpt2sas_get_sdev_by_addr(ioc,
           rphy->identify.sas_address);
 
        if (sas_device) {
                sas_target_priv_data->handle = sas_device->handle;
                sas_target_priv_data->sas_address = sas_device->sas_address;
+               sas_target_priv_data->sdev = sas_device;
                sas_device->starget = starget;
                sas_device->id = starget->id;
                sas_device->channel = starget->channel;
                if (test_bit(sas_device->handle, ioc->pd_handles))
                        sas_target_priv_data->flags |=
                            MPT_TARGET_FLAGS_RAID_COMPONENT;
+
        }
        spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
 
@@ -1324,13 +1438,21 @@ _scsih_target_destroy(struct scsi_target *starget)
 
        spin_lock_irqsave(&ioc->sas_device_lock, flags);
        rphy = dev_to_rphy(starget->dev.parent);
-       sas_device = mpt2sas_scsih_sas_device_find_by_sas_address(ioc,
-          rphy->identify.sas_address);
+       sas_device = __mpt2sas_get_sdev_from_target(ioc, sas_target_priv_data);
        if (sas_device && (sas_device->starget == starget) &&
            (sas_device->id == starget->id) &&
            (sas_device->channel == starget->channel))
                sas_device->starget = NULL;
 
+       if (sas_device) {
+               /*
+                * Corresponding get() is in _scsih_target_alloc()
+                */
+               sas_target_priv_data->sdev = NULL;
+               sas_device_put(sas_device);
+
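+               /* also drop the reference taken by __mpt2sas_get_sdev_from_target() above */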
+               sas_device_put(sas_device);
+       }
        spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
 
  out:
@@ -1386,7 +1508,7 @@ _scsih_slave_alloc(struct scsi_device *sdev)
 
        if (!(sas_target_priv_data->flags & MPT_TARGET_FLAGS_VOLUME)) {
                spin_lock_irqsave(&ioc->sas_device_lock, flags);
-               sas_device = mpt2sas_scsih_sas_device_find_by_sas_address(ioc,
+               sas_device = __mpt2sas_get_sdev_by_addr(ioc,
                                sas_target_priv_data->sas_address);
                if (sas_device && (sas_device->starget == NULL)) {
                        sdev_printk(KERN_INFO, sdev,
@@ -1394,6 +1516,10 @@ _scsih_slave_alloc(struct scsi_device *sdev)
                             __func__, __LINE__);
                        sas_device->starget = starget;
                }
+
+               if (sas_device)
+                       sas_device_put(sas_device);
+
                spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
        }
 
@@ -1428,10 +1554,13 @@ _scsih_slave_destroy(struct scsi_device *sdev)
 
        if (!(sas_target_priv_data->flags & MPT_TARGET_FLAGS_VOLUME)) {
                spin_lock_irqsave(&ioc->sas_device_lock, flags);
-               sas_device = mpt2sas_scsih_sas_device_find_by_sas_address(ioc,
-                  sas_target_priv_data->sas_address);
+               sas_device = __mpt2sas_get_sdev_from_target(ioc,
+                               sas_target_priv_data);
                if (sas_device && !sas_target_priv_data->num_luns)
                        sas_device->starget = NULL;
+
+               if (sas_device)
+                       sas_device_put(sas_device);
                spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
        }
 
@@ -2078,7 +2207,7 @@ _scsih_slave_configure(struct scsi_device *sdev)
        }
 
        spin_lock_irqsave(&ioc->sas_device_lock, flags);
-       sas_device = mpt2sas_scsih_sas_device_find_by_sas_address(ioc,
+       sas_device = __mpt2sas_get_sdev_by_addr(ioc,
           sas_device_priv_data->sas_target->sas_address);
        if (!sas_device) {
                spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
@@ -2112,17 +2241,18 @@ _scsih_slave_configure(struct scsi_device *sdev)
            (unsigned long long) sas_device->enclosure_logical_id,
            sas_device->slot);
 
+       sas_device_put(sas_device);
        spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
        if (!ssp_target)
                _scsih_display_sata_capabilities(ioc, handle, sdev);
 
-
        _scsih_change_queue_depth(sdev, qdepth);
 
        if (ssp_target) {
                sas_read_port_mode_page(sdev);
                _scsih_enable_tlr(ioc, sdev);
        }
+
        return 0;
 }
 
@@ -2509,8 +2639,7 @@ _scsih_tm_display_info(struct MPT2SAS_ADAPTER *ioc, struct scsi_cmnd *scmd)
                    device_str, (unsigned long long)priv_target->sas_address);
        } else {
                spin_lock_irqsave(&ioc->sas_device_lock, flags);
-               sas_device = mpt2sas_scsih_sas_device_find_by_sas_address(ioc,
-                   priv_target->sas_address);
+               sas_device = __mpt2sas_get_sdev_from_target(ioc, priv_target);
                if (sas_device) {
                        if (priv_target->flags &
                            MPT_TARGET_FLAGS_RAID_COMPONENT) {
@@ -2529,6 +2658,8 @@ _scsih_tm_display_info(struct MPT2SAS_ADAPTER *ioc, struct scsi_cmnd *scmd)
                            "enclosure_logical_id(0x%016llx), slot(%d)\n",
                           (unsigned long long)sas_device->enclosure_logical_id,
                            sas_device->slot);
+
+                       sas_device_put(sas_device);
                }
                spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
        }
@@ -2604,12 +2735,12 @@ _scsih_dev_reset(struct scsi_cmnd *scmd)
 {
        struct MPT2SAS_ADAPTER *ioc = shost_priv(scmd->device->host);
        struct MPT2SAS_DEVICE *sas_device_priv_data;
-       struct _sas_device *sas_device;
-       unsigned long flags;
+       struct _sas_device *sas_device = NULL;
        u16     handle;
        int r;
 
        struct scsi_target *starget = scmd->device->sdev_target;
+       struct MPT2SAS_TARGET *target_priv_data = starget->hostdata;
 
        starget_printk(KERN_INFO, starget, "attempting device reset! "
            "scmd(%p)\n", scmd);
@@ -2629,12 +2760,10 @@ _scsih_dev_reset(struct scsi_cmnd *scmd)
        handle = 0;
        if (sas_device_priv_data->sas_target->flags &
            MPT_TARGET_FLAGS_RAID_COMPONENT) {
-               spin_lock_irqsave(&ioc->sas_device_lock, flags);
-               sas_device = _scsih_sas_device_find_by_handle(ioc,
-                  sas_device_priv_data->sas_target->handle);
+               sas_device = mpt2sas_get_sdev_from_target(ioc,
+                               target_priv_data);
                if (sas_device)
                        handle = sas_device->volume_handle;
-               spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
        } else
                handle = sas_device_priv_data->sas_target->handle;
 
@@ -2651,6 +2780,10 @@ _scsih_dev_reset(struct scsi_cmnd *scmd)
  out:
        sdev_printk(KERN_INFO, scmd->device, "device reset: %s scmd(%p)\n",
            ((r == SUCCESS) ? "SUCCESS" : "FAILED"), scmd);
+
+       if (sas_device)
+               sas_device_put(sas_device);
+
        return r;
 }
 
@@ -2665,11 +2798,11 @@ _scsih_target_reset(struct scsi_cmnd *scmd)
 {
        struct MPT2SAS_ADAPTER *ioc = shost_priv(scmd->device->host);
        struct MPT2SAS_DEVICE *sas_device_priv_data;
-       struct _sas_device *sas_device;
-       unsigned long flags;
+       struct _sas_device *sas_device = NULL;
        u16     handle;
        int r;
        struct scsi_target *starget = scmd->device->sdev_target;
+       struct MPT2SAS_TARGET *target_priv_data = starget->hostdata;
 
        starget_printk(KERN_INFO, starget, "attempting target reset! "
            "scmd(%p)\n", scmd);
@@ -2689,12 +2822,10 @@ _scsih_target_reset(struct scsi_cmnd *scmd)
        handle = 0;
        if (sas_device_priv_data->sas_target->flags &
            MPT_TARGET_FLAGS_RAID_COMPONENT) {
-               spin_lock_irqsave(&ioc->sas_device_lock, flags);
-               sas_device = _scsih_sas_device_find_by_handle(ioc,
-                  sas_device_priv_data->sas_target->handle);
+               sas_device = mpt2sas_get_sdev_from_target(ioc,
+                               target_priv_data);
                if (sas_device)
                        handle = sas_device->volume_handle;
-               spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
        } else
                handle = sas_device_priv_data->sas_target->handle;
 
@@ -2711,6 +2842,10 @@ _scsih_target_reset(struct scsi_cmnd *scmd)
  out:
        starget_printk(KERN_INFO, starget, "target reset: %s scmd(%p)\n",
            ((r == SUCCESS) ? "SUCCESS" : "FAILED"), scmd);
+
+       if (sas_device)
+               sas_device_put(sas_device);
+
        return r;
 }
 
@@ -2768,36 +2903,39 @@ _scsih_fw_event_add(struct MPT2SAS_ADAPTER *ioc, struct fw_event_work *fw_event)
                return;
 
        spin_lock_irqsave(&ioc->fw_event_lock, flags);
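+       /* take a reference for the entry placed on fw_event_list */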
+       fw_event_work_get(fw_event);
        list_add_tail(&fw_event->list, &ioc->fw_event_list);
        INIT_DELAYED_WORK(&fw_event->delayed_work, _firmware_event_work);
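+       /* take a second reference for the queued delayed work itself */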
+       fw_event_work_get(fw_event);
        queue_delayed_work(ioc->firmware_event_thread,
            &fw_event->delayed_work, 0);
        spin_unlock_irqrestore(&ioc->fw_event_lock, flags);
 }
 
 /**
- * _scsih_fw_event_free - delete fw_event
+ * _scsih_fw_event_del_from_list - delete fw_event from the list
  * @ioc: per adapter object
  * @fw_event: object describing the event
  * Context: This function will acquire ioc->fw_event_lock.
  *
- * This removes firmware event object from link list, frees associated memory.
+ * If the fw_event is on the fw_event_list, remove it and do a put.
  *
  * Return nothing.
  */
 static void
-_scsih_fw_event_free(struct MPT2SAS_ADAPTER *ioc, struct fw_event_work
+_scsih_fw_event_del_from_list(struct MPT2SAS_ADAPTER *ioc, struct fw_event_work
     *fw_event)
 {
        unsigned long flags;
 
        spin_lock_irqsave(&ioc->fw_event_lock, flags);
-       list_del(&fw_event->list);
-       kfree(fw_event);
+       if (!list_empty(&fw_event->list)) {
+               list_del_init(&fw_event->list);
+               fw_event_work_put(fw_event);
+       }
        spin_unlock_irqrestore(&ioc->fw_event_lock, flags);
 }
 
-
 /**
  * _scsih_error_recovery_delete_devices - remove devices not responding
  * @ioc: per adapter object
@@ -2812,13 +2950,14 @@ _scsih_error_recovery_delete_devices(struct MPT2SAS_ADAPTER *ioc)
        if (ioc->is_driver_loading)
                return;
 
-       fw_event = kzalloc(sizeof(struct fw_event_work), GFP_ATOMIC);
+       fw_event = alloc_fw_event_work(0);
        if (!fw_event)
                return;
 
        fw_event->event = MPT2SAS_REMOVE_UNRESPONDING_DEVICES;
        fw_event->ioc = ioc;
        _scsih_fw_event_add(ioc, fw_event);
+       fw_event_work_put(fw_event);
 }
 
 /**
@@ -2832,12 +2971,29 @@ mpt2sas_port_enable_complete(struct MPT2SAS_ADAPTER *ioc)
 {
        struct fw_event_work *fw_event;
 
-       fw_event = kzalloc(sizeof(struct fw_event_work), GFP_ATOMIC);
+       fw_event = alloc_fw_event_work(0);
        if (!fw_event)
                return;
        fw_event->event = MPT2SAS_PORT_ENABLE_COMPLETE;
        fw_event->ioc = ioc;
        _scsih_fw_event_add(ioc, fw_event);
+       fw_event_work_put(fw_event);
+}
+
+static struct fw_event_work *dequeue_next_fw_event(struct MPT2SAS_ADAPTER *ioc)
+{
+       unsigned long flags;
+       struct fw_event_work *fw_event = NULL;
+
+       spin_lock_irqsave(&ioc->fw_event_lock, flags);
+       if (!list_empty(&ioc->fw_event_list)) {
+               fw_event = list_first_entry(&ioc->fw_event_list,
+                               struct fw_event_work, list);
+               list_del_init(&fw_event->list);
+       }
+       spin_unlock_irqrestore(&ioc->fw_event_lock, flags);
+
+       return fw_event;
 }
 
 /**
@@ -2852,17 +3008,25 @@ mpt2sas_port_enable_complete(struct MPT2SAS_ADAPTER *ioc)
 static void
 _scsih_fw_event_cleanup_queue(struct MPT2SAS_ADAPTER *ioc)
 {
-       struct fw_event_work *fw_event, *next;
+       struct fw_event_work *fw_event;
 
        if (list_empty(&ioc->fw_event_list) ||
             !ioc->firmware_event_thread || in_interrupt())
                return;
 
-       list_for_each_entry_safe(fw_event, next, &ioc->fw_event_list, list) {
-               if (cancel_delayed_work_sync(&fw_event->delayed_work)) {
-                       _scsih_fw_event_free(ioc, fw_event);
-                       continue;
-               }
+       while ((fw_event = dequeue_next_fw_event(ioc))) {
+               /*
+                * Wait on the fw_event to complete. If this returns 1, then
+                * the event was never executed, and we need a put for the
+                * reference the delayed_work had on the fw_event.
+                *
+                * If it did execute, we wait for it to finish, and the put will
+                * happen from _firmware_event_work()
+                */
+               if (cancel_delayed_work_sync(&fw_event->delayed_work))
+                       fw_event_work_put(fw_event);
+
+               fw_event_work_put(fw_event);
        }
 }
 
@@ -3002,15 +3166,15 @@ _scsih_block_io_to_children_attached_to_ex(struct MPT2SAS_ADAPTER *ioc,
 
        list_for_each_entry(mpt2sas_port,
           &sas_expander->sas_port_list, port_list) {
-               if (mpt2sas_port->remote_identify.device_type ==
-                   SAS_END_DEVICE) {
+               if (mpt2sas_port->remote_identify.device_type == SAS_END_DEVICE) {
                        spin_lock_irqsave(&ioc->sas_device_lock, flags);
-                       sas_device =
-                           mpt2sas_scsih_sas_device_find_by_sas_address(ioc,
-                          mpt2sas_port->remote_identify.sas_address);
-                       if (sas_device)
+                       sas_device = __mpt2sas_get_sdev_by_addr(ioc,
+                                       mpt2sas_port->remote_identify.sas_address);
+                       if (sas_device) {
                                set_bit(sas_device->handle,
-                                   ioc->blocking_handles);
+                                               ioc->blocking_handles);
+                               sas_device_put(sas_device);
+                       }
                        spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
                }
        }
@@ -3080,7 +3244,7 @@ _scsih_tm_tr_send(struct MPT2SAS_ADAPTER *ioc, u16 handle)
 {
        Mpi2SCSITaskManagementRequest_t *mpi_request;
        u16 smid;
-       struct _sas_device *sas_device;
+       struct _sas_device *sas_device = NULL;
        struct MPT2SAS_TARGET *sas_target_priv_data = NULL;
        u64 sas_address = 0;
        unsigned long flags;
@@ -3110,7 +3274,7 @@ _scsih_tm_tr_send(struct MPT2SAS_ADAPTER *ioc, u16 handle)
                return;
 
        spin_lock_irqsave(&ioc->sas_device_lock, flags);
-       sas_device = _scsih_sas_device_find_by_handle(ioc, handle);
+       sas_device = __mpt2sas_get_sdev_by_handle(ioc, handle);
        if (sas_device && sas_device->starget &&
             sas_device->starget->hostdata) {
                sas_target_priv_data = sas_device->starget->hostdata;
@@ -3131,14 +3295,14 @@ _scsih_tm_tr_send(struct MPT2SAS_ADAPTER *ioc, u16 handle)
        if (!smid) {
                delayed_tr = kzalloc(sizeof(*delayed_tr), GFP_ATOMIC);
                if (!delayed_tr)
-                       return;
+                       goto out;
                INIT_LIST_HEAD(&delayed_tr->list);
                delayed_tr->handle = handle;
                list_add_tail(&delayed_tr->list, &ioc->delayed_tr_list);
                dewtprintk(ioc, printk(MPT2SAS_INFO_FMT
                    "DELAYED:tr:handle(0x%04x), (open)\n",
                    ioc->name, handle));
-               return;
+               goto out;
        }
 
        dewtprintk(ioc, printk(MPT2SAS_INFO_FMT "tr_send:handle(0x%04x), "
@@ -3150,6 +3314,9 @@ _scsih_tm_tr_send(struct MPT2SAS_ADAPTER *ioc, u16 handle)
        mpi_request->DevHandle = cpu_to_le16(handle);
        mpi_request->TaskType = MPI2_SCSITASKMGMT_TASKTYPE_TARGET_RESET;
        mpt2sas_base_put_smid_hi_priority(ioc, smid);
+out:
+       if (sas_device)
+               sas_device_put(sas_device);
 }
 
 
@@ -4068,7 +4235,6 @@ _scsih_scsi_ioc_info(struct MPT2SAS_ADAPTER *ioc, struct scsi_cmnd *scmd,
        char *desc_scsi_state = ioc->tmp_string;
        u32 log_info = le32_to_cpu(mpi_reply->IOCLogInfo);
        struct _sas_device *sas_device = NULL;
-       unsigned long flags;
        struct scsi_target *starget = scmd->device->sdev_target;
        struct MPT2SAS_TARGET *priv_target = starget->hostdata;
        char *device_str = NULL;
@@ -4200,9 +4366,7 @@ _scsih_scsi_ioc_info(struct MPT2SAS_ADAPTER *ioc, struct scsi_cmnd *scmd,
                printk(MPT2SAS_WARN_FMT "\t%s wwid(0x%016llx)\n", ioc->name,
                    device_str, (unsigned long long)priv_target->sas_address);
        } else {
-               spin_lock_irqsave(&ioc->sas_device_lock, flags);
-               sas_device = mpt2sas_scsih_sas_device_find_by_sas_address(ioc,
-                   priv_target->sas_address);
+               sas_device = mpt2sas_get_sdev_from_target(ioc, priv_target);
                if (sas_device) {
                        printk(MPT2SAS_WARN_FMT "\tsas_address(0x%016llx), "
                            "phy(%d)\n", ioc->name, sas_device->sas_address,
@@ -4211,8 +4375,9 @@ _scsih_scsi_ioc_info(struct MPT2SAS_ADAPTER *ioc, struct scsi_cmnd *scmd,
                            "\tenclosure_logical_id(0x%016llx), slot(%d)\n",
                            ioc->name, sas_device->enclosure_logical_id,
                            sas_device->slot);
+
+                       sas_device_put(sas_device);
                }
-               spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
        }
 
        printk(MPT2SAS_WARN_FMT "\thandle(0x%04x), ioc_status(%s)(0x%04x), "
@@ -4259,7 +4424,7 @@ _scsih_turn_on_pfa_led(struct MPT2SAS_ADAPTER *ioc, u16 handle)
        Mpi2SepRequest_t mpi_request;
        struct _sas_device *sas_device;
 
-       sas_device = _scsih_sas_device_find_by_handle(ioc, handle);
+       sas_device = mpt2sas_get_sdev_by_handle(ioc, handle);
        if (!sas_device)
                return;
 
@@ -4274,7 +4439,7 @@ _scsih_turn_on_pfa_led(struct MPT2SAS_ADAPTER *ioc, u16 handle)
            &mpi_request)) != 0) {
                printk(MPT2SAS_ERR_FMT "failure at %s:%d/%s()!\n", ioc->name,
                __FILE__, __LINE__, __func__);
-               return;
+               goto out;
        }
        sas_device->pfa_led_on = 1;
 
@@ -4284,8 +4449,10 @@ _scsih_turn_on_pfa_led(struct MPT2SAS_ADAPTER *ioc, u16 handle)
                 "enclosure_processor: ioc_status (0x%04x), loginfo(0x%08x)\n",
                 ioc->name, le16_to_cpu(mpi_reply.IOCStatus),
                 le32_to_cpu(mpi_reply.IOCLogInfo)));
-               return;
+               goto out;
        }
+out:
+       sas_device_put(sas_device);
 }
 
 /**
@@ -4340,13 +4507,14 @@ _scsih_send_event_to_turn_on_pfa_led(struct MPT2SAS_ADAPTER *ioc, u16 handle)
 {
        struct fw_event_work *fw_event;
 
-       fw_event = kzalloc(sizeof(struct fw_event_work), GFP_ATOMIC);
+       fw_event = alloc_fw_event_work(0);
        if (!fw_event)
                return;
        fw_event->event = MPT2SAS_TURN_ON_PFA_LED;
        fw_event->device_handle = handle;
        fw_event->ioc = ioc;
        _scsih_fw_event_add(ioc, fw_event);
+       fw_event_work_put(fw_event);
 }
 
 /**
@@ -4370,19 +4538,17 @@ _scsih_smart_predicted_fault(struct MPT2SAS_ADAPTER *ioc, u16 handle)
 
        /* only handle non-raid devices */
        spin_lock_irqsave(&ioc->sas_device_lock, flags);
-       sas_device = _scsih_sas_device_find_by_handle(ioc, handle);
+       sas_device = __mpt2sas_get_sdev_by_handle(ioc, handle);
        if (!sas_device) {
-               spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
-               return;
+               goto out_unlock;
        }
        starget = sas_device->starget;
        sas_target_priv_data = starget->hostdata;
 
        if ((sas_target_priv_data->flags & MPT_TARGET_FLAGS_RAID_COMPONENT) ||
-          ((sas_target_priv_data->flags & MPT_TARGET_FLAGS_VOLUME))) {
-               spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
-               return;
-       }
+          ((sas_target_priv_data->flags & MPT_TARGET_FLAGS_VOLUME)))
+               goto out_unlock;
+
        starget_printk(KERN_WARNING, starget, "predicted fault\n");
        spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
 
@@ -4396,7 +4562,7 @@ _scsih_smart_predicted_fault(struct MPT2SAS_ADAPTER *ioc, u16 handle)
        if (!event_reply) {
                printk(MPT2SAS_ERR_FMT "failure at %s:%d/%s()!\n",
                    ioc->name, __FILE__, __LINE__, __func__);
-               return;
+               goto out;
        }
 
        event_reply->Function = MPI2_FUNCTION_EVENT_NOTIFICATION;
@@ -4413,6 +4579,14 @@ _scsih_smart_predicted_fault(struct MPT2SAS_ADAPTER *ioc, u16 handle)
        event_data->SASAddress = cpu_to_le64(sas_target_priv_data->sas_address);
        mpt2sas_ctl_add_to_event_log(ioc, event_reply);
        kfree(event_reply);
+out:
+       if (sas_device)
+               sas_device_put(sas_device);
+       return;
+
+out_unlock:
+       spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
+       goto out;
 }
 
 /**
@@ -5148,14 +5322,13 @@ _scsih_check_device(struct MPT2SAS_ADAPTER *ioc, u16 handle)
 
        spin_lock_irqsave(&ioc->sas_device_lock, flags);
        sas_address = le64_to_cpu(sas_device_pg0.SASAddress);
-       sas_device = mpt2sas_scsih_sas_device_find_by_sas_address(ioc,
+       sas_device = __mpt2sas_get_sdev_by_addr(ioc,
            sas_address);
 
        if (!sas_device) {
                printk(MPT2SAS_ERR_FMT "device is not present "
                    "handle(0x%04x), no sas_device!!!\n", ioc->name, handle);
-               spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
-               return;
+               goto out_unlock;
        }
 
        if (unlikely(sas_device->handle != handle)) {
@@ -5172,19 +5345,24 @@ _scsih_check_device(struct MPT2SAS_ADAPTER *ioc, u16 handle)
            MPI2_SAS_DEVICE0_FLAGS_DEVICE_PRESENT)) {
                printk(MPT2SAS_ERR_FMT "device is not present "
                    "handle(0x%04x), flags!!!\n", ioc->name, handle);
-               spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
-               return;
+               goto out_unlock;
        }
 
        /* check if there were any issues with discovery */
        if (_scsih_check_access_status(ioc, sas_address, handle,
-           sas_device_pg0.AccessStatus)) {
-               spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
-               return;
-       }
+           sas_device_pg0.AccessStatus))
+               goto out_unlock;
+
        spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
        _scsih_ublock_io_device(ioc, sas_address);
+       if (sas_device)
+               sas_device_put(sas_device);
+       return;
 
+out_unlock:
+       spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
+       if (sas_device)
+               sas_device_put(sas_device);
 }
 
 /**
@@ -5208,7 +5386,6 @@ _scsih_add_device(struct MPT2SAS_ADAPTER *ioc, u16 handle, u8 phy_num, u8 is_pd)
        u32 ioc_status;
        __le64 sas_address;
        u32 device_info;
-       unsigned long flags;
 
        if ((mpt2sas_config_get_sas_device_pg0(ioc, &mpi_reply, &sas_device_pg0,
            MPI2_SAS_DEVICE_PGAD_FORM_HANDLE, handle))) {
@@ -5250,14 +5427,13 @@ _scsih_add_device(struct MPT2SAS_ADAPTER *ioc, u16 handle, u8 phy_num, u8 is_pd)
                return -1;
        }
 
-
-       spin_lock_irqsave(&ioc->sas_device_lock, flags);
-       sas_device = mpt2sas_scsih_sas_device_find_by_sas_address(ioc,
+       sas_device = mpt2sas_get_sdev_by_addr(ioc,
            sas_address);
-       spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
 
-       if (sas_device)
+       if (sas_device) {
+               sas_device_put(sas_device);
                return 0;
+       }
 
        sas_device = kzalloc(sizeof(struct _sas_device),
            GFP_KERNEL);
@@ -5267,6 +5443,7 @@ _scsih_add_device(struct MPT2SAS_ADAPTER *ioc, u16 handle, u8 phy_num, u8 is_pd)
                return -1;
        }
 
+       kref_init(&sas_device->refcount);
        sas_device->handle = handle;
        if (_scsih_get_sas_address(ioc, le16_to_cpu
                (sas_device_pg0.ParentDevHandle),
@@ -5296,6 +5473,7 @@ _scsih_add_device(struct MPT2SAS_ADAPTER *ioc, u16 handle, u8 phy_num, u8 is_pd)
        else
                _scsih_sas_device_add(ioc, sas_device);
 
+       sas_device_put(sas_device);
        return 0;
 }
 
@@ -5344,7 +5522,6 @@ _scsih_remove_device(struct MPT2SAS_ADAPTER *ioc,
            "handle(0x%04x), sas_addr(0x%016llx)\n", ioc->name, __func__,
            sas_device->handle, (unsigned long long)
            sas_device->sas_address));
-       kfree(sas_device);
 }
 /**
  * _scsih_device_remove_by_handle - removing device object by handle
@@ -5363,12 +5540,17 @@ _scsih_device_remove_by_handle(struct MPT2SAS_ADAPTER *ioc, u16 handle)
                return;
 
        spin_lock_irqsave(&ioc->sas_device_lock, flags);
-       sas_device = _scsih_sas_device_find_by_handle(ioc, handle);
-       if (sas_device)
-               list_del(&sas_device->list);
+       sas_device = __mpt2sas_get_sdev_by_handle(ioc, handle);
+       if (sas_device) {
+               list_del_init(&sas_device->list);
+               sas_device_put(sas_device);
+       }
        spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
-       if (sas_device)
+
+       if (sas_device) {
                _scsih_remove_device(ioc, sas_device);
+               sas_device_put(sas_device);
+       }
 }
 
 /**
@@ -5389,13 +5571,17 @@ mpt2sas_device_remove_by_sas_address(struct MPT2SAS_ADAPTER *ioc,
                return;
 
        spin_lock_irqsave(&ioc->sas_device_lock, flags);
-       sas_device = mpt2sas_scsih_sas_device_find_by_sas_address(ioc,
-           sas_address);
-       if (sas_device)
-               list_del(&sas_device->list);
+       sas_device = __mpt2sas_get_sdev_by_addr(ioc, sas_address);
+       if (sas_device) {
+               list_del_init(&sas_device->list);
+               sas_device_put(sas_device);
+       }
        spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
-       if (sas_device)
+
+       if (sas_device) {
                _scsih_remove_device(ioc, sas_device);
+               sas_device_put(sas_device);
+       }
 }
 #ifdef CONFIG_SCSI_MPT2SAS_LOGGING
 /**
@@ -5716,26 +5902,28 @@ _scsih_sas_device_status_change_event(struct MPT2SAS_ADAPTER *ioc,
 
        spin_lock_irqsave(&ioc->sas_device_lock, flags);
        sas_address = le64_to_cpu(event_data->SASAddress);
-       sas_device = mpt2sas_scsih_sas_device_find_by_sas_address(ioc,
+       sas_device = __mpt2sas_get_sdev_by_addr(ioc,
            sas_address);
 
-       if (!sas_device || !sas_device->starget) {
-               spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
-               return;
-       }
+       if (!sas_device || !sas_device->starget)
+               goto out;
 
        target_priv_data = sas_device->starget->hostdata;
-       if (!target_priv_data) {
-               spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
-               return;
-       }
+       if (!target_priv_data)
+               goto out;
 
        if (event_data->ReasonCode ==
            MPI2_EVENT_SAS_DEV_STAT_RC_INTERNAL_DEVICE_RESET)
                target_priv_data->tm_busy = 1;
        else
                target_priv_data->tm_busy = 0;
+
+out:
+       if (sas_device)
+               sas_device_put(sas_device);
+
        spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
+
 }
 
 #ifdef CONFIG_SCSI_MPT2SAS_LOGGING
@@ -6123,7 +6311,7 @@ _scsih_sas_pd_expose(struct MPT2SAS_ADAPTER *ioc,
        u16 handle = le16_to_cpu(element->PhysDiskDevHandle);
 
        spin_lock_irqsave(&ioc->sas_device_lock, flags);
-       sas_device = _scsih_sas_device_find_by_handle(ioc, handle);
+       sas_device = __mpt2sas_get_sdev_by_handle(ioc, handle);
        if (sas_device) {
                sas_device->volume_handle = 0;
                sas_device->volume_wwid = 0;
@@ -6142,6 +6330,8 @@ _scsih_sas_pd_expose(struct MPT2SAS_ADAPTER *ioc,
        /* exposing raid component */
        if (starget)
                starget_for_each_device(starget, NULL, _scsih_reprobe_lun);
+
+       sas_device_put(sas_device);
 }
 
 /**
@@ -6170,7 +6360,7 @@ _scsih_sas_pd_hide(struct MPT2SAS_ADAPTER *ioc,
                    &volume_wwid);
 
        spin_lock_irqsave(&ioc->sas_device_lock, flags);
-       sas_device = _scsih_sas_device_find_by_handle(ioc, handle);
+       sas_device = __mpt2sas_get_sdev_by_handle(ioc, handle);
        if (sas_device) {
                set_bit(handle, ioc->pd_handles);
                if (sas_device->starget && sas_device->starget->hostdata) {
@@ -6189,6 +6379,8 @@ _scsih_sas_pd_hide(struct MPT2SAS_ADAPTER *ioc,
        /* hiding raid component */
        if (starget)
                starget_for_each_device(starget, (void *)1, _scsih_reprobe_lun);
+
+       sas_device_put(sas_device);
 }
 
 /**
@@ -6221,7 +6413,6 @@ _scsih_sas_pd_add(struct MPT2SAS_ADAPTER *ioc,
     Mpi2EventIrConfigElement_t *element)
 {
        struct _sas_device *sas_device;
-       unsigned long flags;
        u16 handle = le16_to_cpu(element->PhysDiskDevHandle);
        Mpi2ConfigReply_t mpi_reply;
        Mpi2SasDevicePage0_t sas_device_pg0;
@@ -6231,11 +6422,11 @@ _scsih_sas_pd_add(struct MPT2SAS_ADAPTER *ioc,
 
        set_bit(handle, ioc->pd_handles);
 
-       spin_lock_irqsave(&ioc->sas_device_lock, flags);
-       sas_device = _scsih_sas_device_find_by_handle(ioc, handle);
-       spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
-       if (sas_device)
+       sas_device = mpt2sas_get_sdev_by_handle(ioc, handle);
+       if (sas_device) {
+               sas_device_put(sas_device);
                return;
+       }
 
        if ((mpt2sas_config_get_sas_device_pg0(ioc, &mpi_reply, &sas_device_pg0,
            MPI2_SAS_DEVICE_PGAD_FORM_HANDLE, handle))) {
@@ -6509,7 +6700,6 @@ _scsih_sas_ir_physical_disk_event(struct MPT2SAS_ADAPTER *ioc,
        u16 handle, parent_handle;
        u32 state;
        struct _sas_device *sas_device;
-       unsigned long flags;
        Mpi2ConfigReply_t mpi_reply;
        Mpi2SasDevicePage0_t sas_device_pg0;
        u32 ioc_status;
@@ -6542,12 +6732,11 @@ _scsih_sas_ir_physical_disk_event(struct MPT2SAS_ADAPTER *ioc,
                if (!ioc->is_warpdrive)
                        set_bit(handle, ioc->pd_handles);
 
-               spin_lock_irqsave(&ioc->sas_device_lock, flags);
-               sas_device = _scsih_sas_device_find_by_handle(ioc, handle);
-               spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
-
-               if (sas_device)
+               sas_device = mpt2sas_get_sdev_by_handle(ioc, handle);
+               if (sas_device) {
+                       sas_device_put(sas_device);
                        return;
+               }
 
                if ((mpt2sas_config_get_sas_device_pg0(ioc, &mpi_reply,
                    &sas_device_pg0, MPI2_SAS_DEVICE_PGAD_FORM_HANDLE,
@@ -7015,6 +7204,7 @@ _scsih_remove_unresponding_sas_devices(struct MPT2SAS_ADAPTER *ioc)
        struct _raid_device *raid_device, *raid_device_next;
        struct list_head tmp_list;
        unsigned long flags;
+       LIST_HEAD(head);
 
        printk(MPT2SAS_INFO_FMT "removing unresponding devices: start\n",
            ioc->name);
@@ -7022,14 +7212,29 @@ _scsih_remove_unresponding_sas_devices(struct MPT2SAS_ADAPTER *ioc)
        /* removing unresponding end devices */
        printk(MPT2SAS_INFO_FMT "removing unresponding devices: end-devices\n",
            ioc->name);
+
+       /*
+        * Iterate, pulling off devices marked as non-responding. We become the
+        * owner for the reference the list had on any object we prune.
+        */
+       spin_lock_irqsave(&ioc->sas_device_lock, flags);
        list_for_each_entry_safe(sas_device, sas_device_next,
-           &ioc->sas_device_list, list) {
+                       &ioc->sas_device_list, list) {
                if (!sas_device->responding)
-                       mpt2sas_device_remove_by_sas_address(ioc,
-                               sas_device->sas_address);
+                       list_move_tail(&sas_device->list, &head);
                else
                        sas_device->responding = 0;
        }
+       spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
+
+       /*
+        * Now, uninitialize and remove the unresponding devices we pruned.
+        */
+       list_for_each_entry_safe(sas_device, sas_device_next, &head, list) {
+               _scsih_remove_device(ioc, sas_device);
+               list_del_init(&sas_device->list);
+               sas_device_put(sas_device);
+       }
 
        /* removing unresponding volumes */
        if (ioc->ir_firmware) {
@@ -7179,11 +7384,11 @@ _scsih_scan_for_devices_after_reset(struct MPT2SAS_ADAPTER *ioc)
                }
                phys_disk_num = pd_pg0.PhysDiskNum;
                handle = le16_to_cpu(pd_pg0.DevHandle);
-               spin_lock_irqsave(&ioc->sas_device_lock, flags);
-               sas_device = _scsih_sas_device_find_by_handle(ioc, handle);
-               spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
-               if (sas_device)
+               sas_device = mpt2sas_get_sdev_by_handle(ioc, handle);
+               if (sas_device) {
+                       sas_device_put(sas_device);
                        continue;
+               }
                if (mpt2sas_config_get_sas_device_pg0(ioc, &mpi_reply,
                    &sas_device_pg0, MPI2_SAS_DEVICE_PGAD_FORM_HANDLE,
                    handle) != 0)
@@ -7302,12 +7507,12 @@ _scsih_scan_for_devices_after_reset(struct MPT2SAS_ADAPTER *ioc)
                if (!(_scsih_is_end_device(
                    le32_to_cpu(sas_device_pg0.DeviceInfo))))
                        continue;
-               spin_lock_irqsave(&ioc->sas_device_lock, flags);
-               sas_device = mpt2sas_scsih_sas_device_find_by_sas_address(ioc,
+               sas_device = mpt2sas_get_sdev_by_addr(ioc,
                    le64_to_cpu(sas_device_pg0.SASAddress));
-               spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
-               if (sas_device)
+               if (sas_device) {
+                       sas_device_put(sas_device);
                        continue;
+               }
                parent_handle = le16_to_cpu(sas_device_pg0.ParentDevHandle);
                if (!_scsih_get_sas_address(ioc, parent_handle, &sas_address)) {
                        printk(MPT2SAS_INFO_FMT "\tBEFORE adding end device: "
@@ -7410,17 +7615,27 @@ _firmware_event_work(struct work_struct *work)
            struct fw_event_work, delayed_work.work);
        struct MPT2SAS_ADAPTER *ioc = fw_event->ioc;
 
+       _scsih_fw_event_del_from_list(ioc, fw_event);
+
        /* the queue is being flushed so ignore this event */
-       if (ioc->remove_host ||
-           ioc->pci_error_recovery) {
-               _scsih_fw_event_free(ioc, fw_event);
+       if (ioc->remove_host || ioc->pci_error_recovery) {
+               fw_event_work_put(fw_event);
                return;
        }
 
        switch (fw_event->event) {
        case MPT2SAS_REMOVE_UNRESPONDING_DEVICES:
-               while (scsi_host_in_recovery(ioc->shost) || ioc->shost_recovery)
+               while (scsi_host_in_recovery(ioc->shost) ||
+                               ioc->shost_recovery) {
+                       /*
+                        * If we're unloading, bail. Otherwise, this can become
+                        * an infinite loop.
+                        */
+                       if (ioc->remove_host)
+                               goto out;
+
                        ssleep(1);
+               }
                _scsih_remove_unresponding_sas_devices(ioc);
                _scsih_scan_for_devices_after_reset(ioc);
                break;
@@ -7469,7 +7684,8 @@ _firmware_event_work(struct work_struct *work)
                _scsih_sas_ir_operation_status_event(ioc, fw_event);
                break;
        }
-       _scsih_fw_event_free(ioc, fw_event);
+out:
+       fw_event_work_put(fw_event);
 }
 
 /**
@@ -7607,7 +7823,7 @@ mpt2sas_scsih_event_callback(struct MPT2SAS_ADAPTER *ioc, u8 msix_index,
        }
 
        sz = le16_to_cpu(mpi_reply->EventDataLength) * 4;
-       fw_event = kzalloc(sizeof(*fw_event) + sz, GFP_ATOMIC);
+       fw_event = alloc_fw_event_work(sz);
        if (!fw_event) {
                printk(MPT2SAS_ERR_FMT "failure at %s:%d/%s()!\n",
                    ioc->name, __FILE__, __LINE__, __func__);
@@ -7620,6 +7836,7 @@ mpt2sas_scsih_event_callback(struct MPT2SAS_ADAPTER *ioc, u8 msix_index,
        fw_event->VP_ID = mpi_reply->VP_ID;
        fw_event->event = event;
        _scsih_fw_event_add(ioc, fw_event);
+       fw_event_work_put(fw_event);
        return;
 }
 
@@ -7867,7 +8084,9 @@ _scsih_remove(struct pci_dev *pdev)
        sas_remove_host(shost);
        scsi_remove_host(shost);
        mpt2sas_base_detach(ioc);
+       spin_lock(&gioc_lock);
        list_del(&ioc->list);
+       spin_unlock(&gioc_lock);
        scsi_host_put(shost);
 }
 
@@ -7966,6 +8185,48 @@ _scsih_probe_raid(struct MPT2SAS_ADAPTER *ioc)
        }
 }
 
+static struct _sas_device *get_next_sas_device(struct MPT2SAS_ADAPTER *ioc)
+{
+       struct _sas_device *sas_device = NULL;
+       unsigned long flags;
+
+       spin_lock_irqsave(&ioc->sas_device_lock, flags);
+       if (!list_empty(&ioc->sas_device_init_list)) {
+               sas_device = list_first_entry(&ioc->sas_device_init_list,
+                               struct _sas_device, list);
+               sas_device_get(sas_device);
+       }
+       spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
+
+       return sas_device;
+}
+
+static void sas_device_make_active(struct MPT2SAS_ADAPTER *ioc,
+               struct _sas_device *sas_device)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&ioc->sas_device_lock, flags);
+
+       /*
+        * Since we dropped the lock during the call to port_add(), we need to
+        * be careful here that somebody else didn't move or delete this item
+        * while we were busy with other things.
+        *
+        * If it was on the list, we need a put() for the reference the list
+        * had. Either way, we need a get() for the destination list.
+        */
+       if (!list_empty(&sas_device->list)) {
+               list_del_init(&sas_device->list);
+               sas_device_put(sas_device);
+       }
+
+       sas_device_get(sas_device);
+       list_add_tail(&sas_device->list, &ioc->sas_device_list);
+
+       spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
+}
+
 /**
  * _scsih_probe_sas - reporting sas devices to sas transport
  * @ioc: per adapter object
@@ -7975,34 +8236,30 @@ _scsih_probe_raid(struct MPT2SAS_ADAPTER *ioc)
 static void
 _scsih_probe_sas(struct MPT2SAS_ADAPTER *ioc)
 {
-       struct _sas_device *sas_device, *next;
-       unsigned long flags;
-
-       /* SAS Device List */
-       list_for_each_entry_safe(sas_device, next, &ioc->sas_device_init_list,
-           list) {
+       struct _sas_device *sas_device;
 
-               if (ioc->hide_drives)
-                       continue;
+       if (ioc->hide_drives)
+               return;
 
+       while ((sas_device = get_next_sas_device(ioc))) {
                if (!mpt2sas_transport_port_add(ioc, sas_device->handle,
-                   sas_device->sas_address_parent)) {
-                       list_del(&sas_device->list);
-                       kfree(sas_device);
+                               sas_device->sas_address_parent)) {
+                       _scsih_sas_device_remove(ioc, sas_device);
+                       sas_device_put(sas_device);
                        continue;
                } else if (!sas_device->starget) {
                        if (!ioc->is_driver_loading) {
                                mpt2sas_transport_port_remove(ioc,
-                                       sas_device->sas_address,
-                                       sas_device->sas_address_parent);
-                               list_del(&sas_device->list);
-                               kfree(sas_device);
+                                               sas_device->sas_address,
+                                               sas_device->sas_address_parent);
+                               _scsih_sas_device_remove(ioc, sas_device);
+                               sas_device_put(sas_device);
                                continue;
                        }
                }
-               spin_lock_irqsave(&ioc->sas_device_lock, flags);
-               list_move_tail(&sas_device->list, &ioc->sas_device_list);
-               spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
+
+               sas_device_make_active(ioc, sas_device);
+               sas_device_put(sas_device);
        }
 }
 
@@ -8142,7 +8399,9 @@ _scsih_probe(struct pci_dev *pdev, const struct pci_device_id *id)
        ioc = shost_priv(shost);
        memset(ioc, 0, sizeof(struct MPT2SAS_ADAPTER));
        INIT_LIST_HEAD(&ioc->list);
+       spin_lock(&gioc_lock);
        list_add_tail(&ioc->list, &mpt2sas_ioc_list);
+       spin_unlock(&gioc_lock);
        ioc->shost = shost;
        ioc->id = mpt_ids++;
        sprintf(ioc->name, "%s%d", MPT2SAS_DRIVER_NAME, ioc->id);
@@ -8167,6 +8426,8 @@ _scsih_probe(struct pci_dev *pdev, const struct pci_device_id *id)
        ioc->schedule_dead_ioc_flush_running_cmds = &_scsih_flush_running_cmds;
        /* misc semaphores and spin locks */
        mutex_init(&ioc->reset_in_progress_mutex);
+       /* initializing pci_access_mutex lock */
+       mutex_init(&ioc->pci_access_mutex);
        spin_lock_init(&ioc->ioc_reset_in_progress_lock);
        spin_lock_init(&ioc->scsi_lookup_lock);
        spin_lock_init(&ioc->sas_device_lock);
@@ -8269,7 +8530,9 @@ _scsih_probe(struct pci_dev *pdev, const struct pci_device_id *id)
  out_attach_fail:
        destroy_workqueue(ioc->firmware_event_thread);
  out_thread_fail:
+       spin_lock(&gioc_lock);
        list_del(&ioc->list);
+       spin_unlock(&gioc_lock);
        scsi_host_put(shost);
        return rv;
 }
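
The scsih changes above convert _sas_device handling to reference counting: every lookup now returns an object holding an extra reference, and callers drop it with sas_device_put() when they are done. The helper bodies are defined outside these hunks, so the following is only a minimal sketch of the assumed pattern, built on the kref that _scsih_add_device() now initializes; the names mirror the driver's, but the bodies here are illustrative rather than quoted from the patch.

/*
 * Sketch only: assumes struct _sas_device embeds a struct kref named
 * "refcount" (see the kref_init() added in _scsih_add_device()) and that
 * releasing the last reference simply kfree()s the object.
 */
#include <linux/kref.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

static void sas_device_free(struct kref *r)
{
	kfree(container_of(r, struct _sas_device, refcount));
}

static void sas_device_get(struct _sas_device *s)
{
	kref_get(&s->refcount);
}

static void sas_device_put(struct _sas_device *s)
{
	kref_put(&s->refcount, sas_device_free);
}

/*
 * Assumed lookup convention: the __-prefixed variants require
 * ioc->sas_device_lock to be held and return the device with an extra
 * reference taken; the plain variants take the lock themselves. Either
 * way the caller owns one reference and must sas_device_put() it.
 */
static struct _sas_device *
mpt2sas_get_sdev_by_handle(struct MPT2SAS_ADAPTER *ioc, u16 handle)
{
	struct _sas_device *sas_device;
	unsigned long flags;

	spin_lock_irqsave(&ioc->sas_device_lock, flags);
	sas_device = __mpt2sas_get_sdev_by_handle(ioc, handle);
	spin_unlock_irqrestore(&ioc->sas_device_lock, flags);

	return sas_device;
}
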
index ff2500ab9ba47b084ab7d256620fe446f9525f10..af868009395d290474faaeb554f43e998ce28ddf 100644 (file)
@@ -1323,15 +1323,17 @@ _transport_get_enclosure_identifier(struct sas_rphy *rphy, u64 *identifier)
        int rc;
 
        spin_lock_irqsave(&ioc->sas_device_lock, flags);
-       sas_device = mpt2sas_scsih_sas_device_find_by_sas_address(ioc,
+       sas_device = __mpt2sas_get_sdev_by_addr(ioc,
            rphy->identify.sas_address);
        if (sas_device) {
                *identifier = sas_device->enclosure_logical_id;
                rc = 0;
+               sas_device_put(sas_device);
        } else {
                *identifier = 0;
                rc = -ENXIO;
        }
+
        spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
        return rc;
 }
@@ -1351,12 +1353,14 @@ _transport_get_bay_identifier(struct sas_rphy *rphy)
        int rc;
 
        spin_lock_irqsave(&ioc->sas_device_lock, flags);
-       sas_device = mpt2sas_scsih_sas_device_find_by_sas_address(ioc,
+       sas_device = __mpt2sas_get_sdev_by_addr(ioc,
            rphy->identify.sas_address);
-       if (sas_device)
+       if (sas_device) {
                rc = sas_device->slot;
-       else
+               sas_device_put(sas_device);
+       } else {
                rc = -ENXIO;
+       }
        spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
        return rc;
 }
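
The firmware-event rework in mpt2sas_scsih.c follows the same idea: a fw_event_work stays referenced while it sits on ioc->fw_event_list and while its delayed work is pending, and the last reference is dropped by whoever finishes with it. Below is a hedged walk-through of that lifecycle, modeled on the mpt2sas_port_enable_complete() hunk above; the helpers (alloc_fw_event_work(), fw_event_work_get()/fw_event_work_put()) are assumed to be kref-backed and their bodies are not quoted here.

/* Illustrative lifecycle only; helper internals are assumptions. */
static void fw_event_lifecycle_example(struct MPT2SAS_ADAPTER *ioc)
{
	struct fw_event_work *fw_event;

	fw_event = alloc_fw_event_work(0);	/* refcount == 1, ours */
	if (!fw_event)
		return;

	fw_event->event = MPT2SAS_PORT_ENABLE_COMPLETE;
	fw_event->ioc = ioc;

	/*
	 * _scsih_fw_event_add() takes one reference for ioc->fw_event_list
	 * and one for the queued delayed work, so the event outlives our
	 * local reference.
	 */
	_scsih_fw_event_add(ioc, fw_event);

	fw_event_work_put(fw_event);		/* drop the allocation ref */

	/*
	 * Later, _firmware_event_work() removes the event from the list
	 * (dropping the list reference) and ends with a final put for the
	 * work reference; _scsih_fw_event_cleanup_queue() does the same if
	 * the delayed work is cancelled before it ever runs.
	 */
}
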
index c34c1157907be84479c5c157364afde19de70dae..ec27ad2d186f9b6b8a25d795284822f75f09e667 100644 (file)
@@ -8,7 +8,7 @@
  *                 scatter/gather formats.
  * Creation Date:  June 21, 2006
  *
- * mpi2.h Version:  02.00.31
+ * mpi2.h Version:  02.00.35
  *
  * NOTE: Names (typedefs, defines, etc.) beginning with an MPI25 or Mpi25
  *       prefix are for use only on MPI v2.5 products, and must not be used
  *                     Added MPI25_SUP_REPLY_POST_HOST_INDEX_OFFSET.
  * 04-09-13  02.00.30  Bumped MPI2_HEADER_VERSION_UNIT.
  * 04-17-13  02.00.31  Bumped MPI2_HEADER_VERSION_UNIT.
+ * 08-19-13  02.00.32  Bumped MPI2_HEADER_VERSION_UNIT.
+ * 12-05-13  02.00.33  Bumped MPI2_HEADER_VERSION_UNIT.
+ * 01-08-14  02.00.34  Bumped MPI2_HEADER_VERSION_UNIT
+ * 06-13-14  02.00.35  Bumped MPI2_HEADER_VERSION_UNIT.
  * --------------------------------------------------------------------------
  */
 
 #define MPI2_VERSION_02_05                  (0x0205)
 
 /*Unit and Dev versioning for this MPI header set */
-#define MPI2_HEADER_VERSION_UNIT            (0x1F)
+#define MPI2_HEADER_VERSION_UNIT            (0x23)
 #define MPI2_HEADER_VERSION_DEV             (0x00)
 #define MPI2_HEADER_VERSION_UNIT_MASK       (0xFF00)
 #define MPI2_HEADER_VERSION_UNIT_SHIFT      (8)
index e261a3153bb365bd80e67b69f75b0cfc249deab3..581fdb375db519dfc181dc3eb9bcfee35ffe8361 100644 (file)
@@ -6,7 +6,7 @@
  *         Title:  MPI Configuration messages and pages
  * Creation Date:  November 10, 2006
  *
- *   mpi2_cnfg.h Version:  02.00.26
+ *   mpi2_cnfg.h Version:  02.00.29
  *
  * NOTE: Names (typedefs, defines, etc.) beginning with an MPI25 or Mpi25
  *       prefix are for use only on MPI v2.5 products, and must not be used
  *                     match the specification.
  * 08-19-13  02.00.26  Added reserved words to MPI2_CONFIG_PAGE_IO_UNIT_7 for
  *                     future use.
+ * 12-05-13  02.00.27  Added MPI2_MANPAGE7_FLAG_BASE_ENCLOSURE_LEVEL for
+ *                    MPI2_CONFIG_PAGE_MAN_7.
+ *                    Added EnclosureLevel and ConnectorName fields to
+ *                    MPI2_CONFIG_PAGE_SAS_DEV_0.
+ *                    Added MPI2_SAS_DEVICE0_FLAGS_ENCL_LEVEL_VALID for
+ *                    MPI2_CONFIG_PAGE_SAS_DEV_0.
+ *                    Added EnclosureLevel field to
+ *                    MPI2_CONFIG_PAGE_SAS_ENCLOSURE_0.
+ *                    Added MPI2_SAS_ENCLS0_FLAGS_ENCL_LEVEL_VALID for
+ *                    MPI2_CONFIG_PAGE_SAS_ENCLOSURE_0.
+ * 01-08-14  02.00.28  Added more defines for the BiosOptions field of
+ *                    MPI2_CONFIG_PAGE_BIOS_1.
+ * 06-13-14  02.00.29  Added SSUTimeout field to MPI2_CONFIG_PAGE_BIOS_1, and
+ *                    more defines for the BiosOptions field.
  * --------------------------------------------------------------------------
  */
 
@@ -724,6 +738,7 @@ typedef struct _MPI2_CONFIG_PAGE_MAN_7 {
 #define MPI2_MANUFACTURING7_PAGEVERSION                 (0x01)
 
 /*defines for the Flags field */
+#define MPI2_MANPAGE7_FLAG_BASE_ENCLOSURE_LEVEL         (0x00000008)
 #define MPI2_MANPAGE7_FLAG_EVENTREPLAY_SLOT_ORDER       (0x00000002)
 #define MPI2_MANPAGE7_FLAG_USE_SLOT_INFO                (0x00000001)
 
@@ -1311,7 +1326,9 @@ typedef struct _MPI2_CONFIG_PAGE_BIOS_1 {
        MPI2_CONFIG_PAGE_HEADER Header;                     /*0x00 */
        U32                     BiosOptions;                /*0x04 */
        U32                     IOCSettings;                /*0x08 */
-       U32                     Reserved1;                  /*0x0C */
+       U8                      SSUTimeout;                 /*0x0C */
+       U8                      Reserved1;                  /*0x0D */
+       U16                     Reserved2;                  /*0x0E */
        U32                     DeviceSettings;             /*0x10 */
        U16                     NumberOfDevices;            /*0x14 */
        U16                     UEFIVersion;                /*0x16 */
@@ -1323,9 +1340,24 @@ typedef struct _MPI2_CONFIG_PAGE_BIOS_1 {
        *PTR_MPI2_CONFIG_PAGE_BIOS_1,
        Mpi2BiosPage1_t, *pMpi2BiosPage1_t;
 
-#define MPI2_BIOSPAGE1_PAGEVERSION                      (0x05)
+#define MPI2_BIOSPAGE1_PAGEVERSION                      (0x07)
 
 /*values for BIOS Page 1 BiosOptions field */
+#define MPI2_BIOSPAGE1_OPTIONS_PNS_MASK                         (0x00003800)
+#define MPI2_BIOSPAGE1_OPTIONS_PNS_PBDHL                        (0x00000000)
+#define MPI2_BIOSPAGE1_OPTIONS_PNS_ENCSLOSURE                   (0x00000800)
+#define MPI2_BIOSPAGE1_OPTIONS_PNS_LWWID                        (0x00001000)
+#define MPI2_BIOSPAGE1_OPTIONS_PNS_PSENS                        (0x00001800)
+#define MPI2_BIOSPAGE1_OPTIONS_PNS_ESPHY                        (0x00002000)
+
+#define MPI2_BIOSPAGE1_OPTIONS_X86_DISABLE_BIOS                (0x00000400)
+
+#define MPI2_BIOSPAGE1_OPTIONS_MASK_REGISTRATION_UEFI_BSD      (0x00000300)
+#define MPI2_BIOSPAGE1_OPTIONS_USE_BIT0_REGISTRATION_UEFI_BSD  (0x00000000)
+#define MPI2_BIOSPAGE1_OPTIONS_FULL_REGISTRATION_UEFI_BSD      (0x00000100)
+#define MPI2_BIOSPAGE1_OPTIONS_ADAPTER_REGISTRATION_UEFI_BSD   (0x00000200)
+#define MPI2_BIOSPAGE1_OPTIONS_DISABLE_REGISTRATION_UEFI_BSD   (0x00000300)
+
 #define MPI2_BIOSPAGE1_OPTIONS_MASK_OEM_ID                  (0x000000F0)
 #define MPI2_BIOSPAGE1_OPTIONS_LSI_OEM_ID                   (0x00000000)
 
@@ -2633,9 +2665,9 @@ typedef struct _MPI2_CONFIG_PAGE_SAS_DEV_0 {
        U8
                ControlGroup;           /*0x2E */
        U8
-               Reserved1;              /*0x2F */
+               EnclosureLevel;         /*0x2F */
        U32
-               Reserved2;              /*0x30 */
+               ConnectorName[4];       /*0x30 */
        U32
                Reserved3;              /*0x34 */
 } MPI2_CONFIG_PAGE_SAS_DEV_0,
@@ -2643,7 +2675,7 @@ typedef struct _MPI2_CONFIG_PAGE_SAS_DEV_0 {
        Mpi2SasDevicePage0_t,
        *pMpi2SasDevicePage0_t;
 
-#define MPI2_SASDEVICE0_PAGEVERSION         (0x08)
+#define MPI2_SASDEVICE0_PAGEVERSION         (0x09)
 
 /*values for SAS Device Page 0 AccessStatus field */
 #define MPI2_SAS_DEVICE0_ASTATUS_NO_ERRORS                  (0x00)
@@ -2683,6 +2715,7 @@ typedef struct _MPI2_CONFIG_PAGE_SAS_DEV_0 {
 #define MPI2_SAS_DEVICE0_FLAGS_SATA_NCQ_SUPPORTED           (0x0020)
 #define MPI2_SAS_DEVICE0_FLAGS_SATA_FUA_SUPPORTED           (0x0010)
 #define MPI2_SAS_DEVICE0_FLAGS_PORT_SELECTOR_ATTACH         (0x0008)
+#define MPI2_SAS_DEVICE0_FLAGS_ENCL_LEVEL_VALID             (0x0002)
 #define MPI2_SAS_DEVICE0_FLAGS_DEVICE_PRESENT               (0x0001)
 
 
@@ -3019,8 +3052,10 @@ typedef struct _MPI2_CONFIG_PAGE_SAS_ENCLOSURE_0 {
                NumSlots;                   /*0x18 */
        U16
                StartSlot;                  /*0x1A */
-       U16
+       U8
                Reserved2;                  /*0x1C */
+       U8
+               EnclosureLevel;             /*0x1D */
        U16
                SEPDevHandle;               /*0x1E */
        U32
@@ -3031,9 +3066,10 @@ typedef struct _MPI2_CONFIG_PAGE_SAS_ENCLOSURE_0 {
        *PTR_MPI2_CONFIG_PAGE_SAS_ENCLOSURE_0,
        Mpi2SasEnclosurePage0_t, *pMpi2SasEnclosurePage0_t;
 
-#define MPI2_SASENCLOSURE0_PAGEVERSION      (0x03)
+#define MPI2_SASENCLOSURE0_PAGEVERSION      (0x04)
 
 /*values for SAS Enclosure Page 0 Flags field */
+#define MPI2_SAS_ENCLS0_FLAGS_ENCL_LEVEL_VALID      (0x0010)
 #define MPI2_SAS_ENCLS0_FLAGS_MNG_MASK              (0x000F)
 #define MPI2_SAS_ENCLS0_FLAGS_MNG_UNKNOWN           (0x0000)
 #define MPI2_SAS_ENCLS0_FLAGS_MNG_IOC_SES           (0x0001)
index 4908309578061deeef869a92d66e9ff5f4009426..d7598cc4bb8ed7bbb109aaf45be15d1037be6917 100644 (file)
@@ -6,7 +6,7 @@
  *         Title:  MPI IOC, Port, Event, FW Download, and FW Upload messages
  * Creation Date:  October 11, 2006
  *
- * mpi2_ioc.h Version:  02.00.23
+ * mpi2_ioc.h Version:  02.00.24
  *
  * NOTE: Names (typedefs, defines, etc.) beginning with an MPI25 or Mpi25
  *       prefix are for use only on MPI v2.5 products, and must not be used
  *                     Added MPI2_IOCFACTS_CAPABILITY_RDPQ_ARRAY_CAPABLE.
  *                     Added MPI2_FW_DOWNLOAD_ITYPE_PUBLIC_KEY.
  *                     Added Encrypted Hash Extended Image.
+ * 12-05-13  02.00.24  Added MPI25_HASH_IMAGE_TYPE_BIOS.
  * --------------------------------------------------------------------------
  */
 
@@ -1598,6 +1599,7 @@ Mpi25EncryptedHashEntry_t, *pMpi25EncryptedHashEntry_t;
 /* values for HashImageType */
 #define MPI25_HASH_IMAGE_TYPE_UNUSED           (0x00)
 #define MPI25_HASH_IMAGE_TYPE_FIRMWARE         (0x01)
+#define MPI25_HASH_IMAGE_TYPE_BIOS              (0x02)
 
 /* values for HashAlgorithm */
 #define MPI25_HASH_ALGORITHM_UNUSED            (0x00)
index 904910d8a7374c10e2b2fd2165c37afe34d68058..1629e5bce7e12b9fea43e8d4642dfdff48c537a3 100644 (file)
@@ -6,7 +6,7 @@
  *         Title:  MPI diagnostic tool structures and definitions
  * Creation Date:  March 26, 2007
  *
- *   mpi2_tool.h Version:  02.00.11
+ *   mpi2_tool.h Version:  02.00.12
  *
  * Version History
  * ---------------
@@ -33,6 +33,7 @@
  * 07-26-12  02.00.10  Modified MPI2_TOOLBOX_DIAGNOSTIC_CLI_REQUEST so that
  *                     it uses MPI Chain SGE as well as MPI Simple SGE.
  * 08-19-13  02.00.11  Added MPI2_TOOLBOX_TEXT_DISPLAY_TOOL and related info.
+ * 01-08-14  02.00.12  Added MPI2_TOOLBOX_CLEAN_BIT26_PRODUCT_SPECIFIC.
  * --------------------------------------------------------------------------
  */
 
@@ -100,6 +101,7 @@ typedef struct _MPI2_TOOLBOX_CLEAN_REQUEST {
 #define MPI2_TOOLBOX_CLEAN_OTHER_PERSIST_PAGES      (0x20000000)
 #define MPI2_TOOLBOX_CLEAN_FW_CURRENT               (0x10000000)
 #define MPI2_TOOLBOX_CLEAN_FW_BACKUP                (0x08000000)
+#define MPI2_TOOLBOX_CLEAN_BIT26_PRODUCT_SPECIFIC   (0x04000000)
 #define MPI2_TOOLBOX_CLEAN_MEGARAID                 (0x02000000)
 #define MPI2_TOOLBOX_CLEAN_INITIALIZATION           (0x01000000)
 #define MPI2_TOOLBOX_CLEAN_FLASH                    (0x00000004)
index 43f87e904b9886a89caab3e0a1560f485f2ee12e..d4f1dcdb8361937c8b0cb75491ec45e1b4981188 100644 (file)
@@ -83,10 +83,10 @@ static int msix_disable = -1;
 module_param(msix_disable, int, 0);
 MODULE_PARM_DESC(msix_disable, " disable msix routed interrupts (default=0)");
 
-static int max_msix_vectors = 8;
+static int max_msix_vectors = -1;
 module_param(max_msix_vectors, int, 0);
 MODULE_PARM_DESC(max_msix_vectors,
-       " max msix vectors - (default=8)");
+       " max msix vectors");
 
 static int mpt3sas_fwfault_debug;
 MODULE_PARM_DESC(mpt3sas_fwfault_debug,
@@ -1009,8 +1009,30 @@ _base_interrupt(int irq, void *bus_id)
        }
 
        wmb();
-       writel(reply_q->reply_post_host_index | (msix_index <<
-           MPI2_RPHI_MSIX_INDEX_SHIFT), &ioc->chip->ReplyPostHostIndex);
+
+       /* Update the Reply Post Host Index.
+        * For HBAs that support the combined reply queue feature:
+        * 1. Select the correct Supplemental Reply Post Host Index Register,
+        *    i.e. the (msix_index / 8)th entry of the Supplemental Reply Post
+        *    Host Index Register address bank, replyPostRegisterIndex[].
+        * 2. Update that register with the new reply host index value in the
+        *    ReplyPostIndex field, and set the MSIxIndex field to msix_index
+        *    reduced to a value between 0 and 7 via a modulo 8 operation,
+        *    since each Supplemental Reply Post Host Index Register serves
+        *    8 MSI-X vectors.
+        *
+        * For other HBAs, just update the Reply Post Host Index register with
+        * the new reply host index value in the ReplyPostIndex field and
+        * msix_index in the MSIxIndex field.
+        */
+       if (ioc->msix96_vector)
+               writel(reply_q->reply_post_host_index | ((msix_index  & 7) <<
+                       MPI2_RPHI_MSIX_INDEX_SHIFT),
+                       ioc->replyPostRegisterIndex[msix_index/8]);
+       else
+               writel(reply_q->reply_post_host_index | (msix_index <<
+                       MPI2_RPHI_MSIX_INDEX_SHIFT),
+                       &ioc->chip->ReplyPostHostIndex);
        atomic_dec(&reply_q->busy);
        return IRQ_HANDLED;
 }
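
As a concrete instance of the index arithmetic described in the comment above (an illustration, not part of the patch): with 8 MSI-X vectors per Supplemental Reply Post Host Index Register, msix_index 21 selects replyPostRegisterIndex[21 / 8] = replyPostRegisterIndex[2], and the value written carries 21 & 7 = 5 in its MSIxIndex field. A tiny helper expressing just that computation:

/* Sketch only: mirrors the combined-reply-queue branch above. */
static u32 reply_post_value(u8 msix_index, u32 reply_post_host_index,
			    u32 *reg_slot)
{
	*reg_slot = msix_index / 8;	/* which supplemental register */
	return reply_post_host_index |
		((msix_index & 7) << MPI2_RPHI_MSIX_INDEX_SHIFT);
}
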
@@ -1338,7 +1360,7 @@ _base_build_sg_scmd_ieee(struct MPT3SAS_ADAPTER *ioc,
 
        sg_scmd = scsi_sglist(scmd);
        sges_left = scsi_dma_map(scmd);
-       if (!sges_left) {
+       if (sges_left < 0) {
                sdev_printk(KERN_ERR, scmd->device,
                        "pci_map_sg failed: request for %d bytes!\n",
                        scsi_bufflen(scmd));
@@ -1407,7 +1429,7 @@ _base_build_sg_scmd_ieee(struct MPT3SAS_ADAPTER *ioc,
  fill_in_last_segment:
 
        /* fill the last segment */
-       while (sges_left) {
+       while (sges_left > 0) {
                if (sges_left == 1)
                        _base_add_sg_single_ieee(sg_local,
                            simple_sgl_flags_last, 0, sg_dma_len(sg_scmd),
@@ -1560,8 +1582,6 @@ _base_check_enable_msix(struct MPT3SAS_ADAPTER *ioc)
 
        pci_read_config_word(ioc->pdev, base + 2, &message_control);
        ioc->msix_vector_count = (message_control & 0x3FF) + 1;
-       if (ioc->msix_vector_count > 8)
-               ioc->msix_vector_count = 8;
        dinitprintk(ioc, pr_info(MPT3SAS_FMT
                "msix is supported, vector_count(%d)\n",
                ioc->name, ioc->msix_vector_count));
@@ -1792,6 +1812,36 @@ _base_enable_msix(struct MPT3SAS_ADAPTER *ioc)
        return r;
 }
 
+/**
+ * mpt3sas_base_unmap_resources - free controller resources
+ * @ioc: per adapter object
+ */
+void
+mpt3sas_base_unmap_resources(struct MPT3SAS_ADAPTER *ioc)
+{
+       struct pci_dev *pdev = ioc->pdev;
+
+       dexitprintk(ioc, printk(MPT3SAS_FMT "%s\n",
+               ioc->name, __func__));
+
+       _base_free_irq(ioc);
+       _base_disable_msix(ioc);
+
+       if (ioc->msix96_vector)
+               kfree(ioc->replyPostRegisterIndex);
+
+       if (ioc->chip_phys) {
+               iounmap(ioc->chip);
+               ioc->chip_phys = 0;
+       }
+
+       if (pci_is_enabled(pdev)) {
+               pci_release_selected_regions(ioc->pdev, ioc->bars);
+               pci_disable_pcie_error_reporting(pdev);
+               pci_disable_device(pdev);
+       }
+}
+
 /**
  * mpt3sas_base_map_resources - map in controller resources (io/irq/memap)
  * @ioc: per adapter object
@@ -1882,6 +1932,36 @@ mpt3sas_base_map_resources(struct MPT3SAS_ADAPTER *ioc)
        if (r)
                goto out_fail;
 
+       /* Use the Combined reply queue feature only for SAS3 C0 & higher
+        * revision HBAs and also only when reply queue count is greater than 8
+        */
+       if (ioc->msix96_vector && ioc->reply_queue_count > 8) {
+               /* Determine the Supplemental Reply Post Host Index Register
+                * addresses. These registers start at offset
+                * MPI25_SUP_REPLY_POST_HOST_INDEX_OFFSET, and each subsequent
+                * register is MPT3_SUP_REPLY_POST_HOST_INDEX_REG_OFFSET bytes
+                * after the previous one.
+                */
+               ioc->replyPostRegisterIndex = kcalloc(
+                    MPT3_SUP_REPLY_POST_HOST_INDEX_REG_COUNT,
+                    sizeof(resource_size_t *), GFP_KERNEL);
+               if (!ioc->replyPostRegisterIndex) {
+                       dfailprintk(ioc, printk(MPT3SAS_FMT
+                       "allocation for reply Post Register Index failed!!!\n",
+                                                                  ioc->name));
+                       r = -ENOMEM;
+                       goto out_fail;
+               }
+
+               for (i = 0; i < MPT3_SUP_REPLY_POST_HOST_INDEX_REG_COUNT; i++) {
+                       ioc->replyPostRegisterIndex[i] = (resource_size_t *)
+                            ((u8 *)&ioc->chip->Doorbell +
+                            MPI25_SUP_REPLY_POST_HOST_INDEX_OFFSET +
+                            (i * MPT3_SUP_REPLY_POST_HOST_INDEX_REG_OFFSET));
+               }
+       } else
+               ioc->msix96_vector = 0;
+
        list_for_each_entry(reply_q, &ioc->reply_queue_list, list)
                pr_info(MPT3SAS_FMT "%s: IRQ %d\n",
                    reply_q->name,  ((ioc->msix_enable) ? "PCI-MSI-X enabled" :
@@ -1897,12 +1977,7 @@ mpt3sas_base_map_resources(struct MPT3SAS_ADAPTER *ioc)
        return 0;
 
  out_fail:
-       if (ioc->chip_phys)
-               iounmap(ioc->chip);
-       ioc->chip_phys = 0;
-       pci_release_selected_regions(ioc->pdev, ioc->bars);
-       pci_disable_pcie_error_reporting(pdev);
-       pci_disable_device(pdev);
+       mpt3sas_base_unmap_resources(ioc);
        return r;
 }
 
@@ -2291,6 +2366,99 @@ _base_display_intel_branding(struct MPT3SAS_ADAPTER *ioc)
 
 
 
+/**
+ * _base_display_dell_branding - Display branding string
+ * @ioc: per adapter object
+ *
+ * Return nothing.
+ */
+static void
+_base_display_dell_branding(struct MPT3SAS_ADAPTER *ioc)
+{
+       if (ioc->pdev->subsystem_vendor != PCI_VENDOR_ID_DELL)
+               return;
+
+       switch (ioc->pdev->device) {
+       case MPI25_MFGPAGE_DEVID_SAS3008:
+               switch (ioc->pdev->subsystem_device) {
+               case MPT3SAS_DELL_12G_HBA_SSDID:
+                       pr_info(MPT3SAS_FMT "%s\n", ioc->name,
+                               MPT3SAS_DELL_12G_HBA_BRANDING);
+                       break;
+               default:
+                       pr_info(MPT3SAS_FMT
+                          "Dell 12Gbps HBA: Subsystem ID: 0x%X\n", ioc->name,
+                          ioc->pdev->subsystem_device);
+                       break;
+               }
+               break;
+       default:
+               pr_info(MPT3SAS_FMT
+                       "Dell 12Gbps HBA: Subsystem ID: 0x%X\n", ioc->name,
+                       ioc->pdev->subsystem_device);
+               break;
+       }
+}
+
+/**
+ * _base_display_cisco_branding - Display branding string
+ * @ioc: per adapter object
+ *
+ * Return nothing.
+ */
+static void
+_base_display_cisco_branding(struct MPT3SAS_ADAPTER *ioc)
+{
+       if (ioc->pdev->subsystem_vendor != PCI_VENDOR_ID_CISCO)
+               return;
+
+       switch (ioc->pdev->device) {
+       case MPI25_MFGPAGE_DEVID_SAS3008:
+               switch (ioc->pdev->subsystem_device) {
+               case MPT3SAS_CISCO_12G_8E_HBA_SSDID:
+                       pr_info(MPT3SAS_FMT "%s\n", ioc->name,
+                               MPT3SAS_CISCO_12G_8E_HBA_BRANDING);
+                       break;
+               case MPT3SAS_CISCO_12G_8I_HBA_SSDID:
+                       pr_info(MPT3SAS_FMT "%s\n", ioc->name,
+                               MPT3SAS_CISCO_12G_8I_HBA_BRANDING);
+                       break;
+               case MPT3SAS_CISCO_12G_AVILA_HBA_SSDID:
+                       pr_info(MPT3SAS_FMT "%s\n", ioc->name,
+                               MPT3SAS_CISCO_12G_AVILA_HBA_BRANDING);
+                       break;
+               default:
+                       pr_info(MPT3SAS_FMT
+                         "Cisco 12Gbps SAS HBA: Subsystem ID: 0x%X\n",
+                         ioc->name, ioc->pdev->subsystem_device);
+                       break;
+               }
+               break;
+       case MPI25_MFGPAGE_DEVID_SAS3108_1:
+               switch (ioc->pdev->subsystem_device) {
+               case MPT3SAS_CISCO_12G_AVILA_HBA_SSDID:
+                       pr_info(MPT3SAS_FMT "%s\n", ioc->name,
+                       MPT3SAS_CISCO_12G_AVILA_HBA_BRANDING);
+                       break;
+               case MPT3SAS_CISCO_12G_COLUSA_MEZZANINE_HBA_SSDID:
+                       pr_info(MPT3SAS_FMT "%s\n", ioc->name,
+                       MPT3SAS_CISCO_12G_COLUSA_MEZZANINE_HBA_BRANDING);
+                       break;
+               default:
+                       pr_info(MPT3SAS_FMT
+                        "Cisco 12Gbps SAS HBA: Subsystem ID: 0x%X\n",
+                        ioc->name, ioc->pdev->subsystem_device);
+                       break;
+               }
+               break;
+       default:
+                pr_info(MPT3SAS_FMT
+                       "Cisco 12Gbps SAS HBA: Subsystem ID: 0x%X\n",
+                       ioc->name, ioc->pdev->subsystem_device);
+               break;
+       }
+}
+
 /**
  * _base_display_ioc_capabilities - Disply IOC's capabilities.
  * @ioc: per adapter object
@@ -2321,6 +2489,8 @@ _base_display_ioc_capabilities(struct MPT3SAS_ADAPTER *ioc)
            bios_version & 0x000000FF);
 
        _base_display_intel_branding(ioc);
+       _base_display_dell_branding(ioc);
+       _base_display_cisco_branding(ioc);
 
        pr_info(MPT3SAS_FMT "Protocol=(", ioc->name);
 
@@ -3138,6 +3308,9 @@ _base_wait_on_iocstate(struct MPT3SAS_ADAPTER *ioc, u32 ioc_state, int timeout,
  *
  * Notes: MPI2_HIS_IOC2SYS_DB_STATUS - set to one when IOC writes to doorbell.
  */
+static int
+_base_diag_reset(struct MPT3SAS_ADAPTER *ioc, int sleep_flag);
+
 static int
 _base_wait_for_doorbell_int(struct MPT3SAS_ADAPTER *ioc, int timeout,
        int sleep_flag)
@@ -3680,6 +3853,64 @@ _base_get_port_facts(struct MPT3SAS_ADAPTER *ioc, int port, int sleep_flag)
        return 0;
 }
 
+/**
+ * _base_wait_for_iocstate - Wait until the card is in READY or OPERATIONAL
+ * @ioc: per adapter object
+ * @timeout:
+ * @sleep_flag: CAN_SLEEP or NO_SLEEP
+ *
+ * Returns 0 for success, non-zero for failure.
+ */
+static int
+_base_wait_for_iocstate(struct MPT3SAS_ADAPTER *ioc, int timeout,
+       int sleep_flag)
+{
+       u32 ioc_state;
+       int rc;
+
+       dinitprintk(ioc, printk(MPT3SAS_FMT "%s\n", ioc->name,
+           __func__));
+
+       if (ioc->pci_error_recovery) {
+               dfailprintk(ioc, printk(MPT3SAS_FMT
+                   "%s: host in pci error recovery\n", ioc->name, __func__));
+               return -EFAULT;
+       }
+
+       ioc_state = mpt3sas_base_get_iocstate(ioc, 0);
+       dhsprintk(ioc, printk(MPT3SAS_FMT "%s: ioc_state(0x%08x)\n",
+           ioc->name, __func__, ioc_state));
+
+       if (((ioc_state & MPI2_IOC_STATE_MASK) == MPI2_IOC_STATE_READY) ||
+           (ioc_state & MPI2_IOC_STATE_MASK) == MPI2_IOC_STATE_OPERATIONAL)
+               return 0;
+
+       if (ioc_state & MPI2_DOORBELL_USED) {
+               dhsprintk(ioc, printk(MPT3SAS_FMT
+                   "unexpected doorbell active!\n", ioc->name));
+               goto issue_diag_reset;
+       }
+
+       if ((ioc_state & MPI2_IOC_STATE_MASK) == MPI2_IOC_STATE_FAULT) {
+               mpt3sas_base_fault_info(ioc, ioc_state &
+                   MPI2_DOORBELL_DATA_MASK);
+               goto issue_diag_reset;
+       }
+
+       ioc_state = _base_wait_on_iocstate(ioc, MPI2_IOC_STATE_READY,
+           timeout, sleep_flag);
+       if (ioc_state) {
+               dfailprintk(ioc, printk(MPT3SAS_FMT
+                   "%s: failed going to ready state (ioc_state=0x%x)\n",
+                   ioc->name, __func__, ioc_state));
+               return -EFAULT;
+       }
+
+ issue_diag_reset:
+       rc = _base_diag_reset(ioc, sleep_flag);
+       return rc;
+}
+
 /**
  * _base_get_ioc_facts - obtain ioc facts reply and save in ioc
  * @ioc: per adapter object
@@ -3698,6 +3929,13 @@ _base_get_ioc_facts(struct MPT3SAS_ADAPTER *ioc, int sleep_flag)
        dinitprintk(ioc, pr_info(MPT3SAS_FMT "%s\n", ioc->name,
            __func__));
 
+       r = _base_wait_for_iocstate(ioc, 10, sleep_flag);
+       if (r) {
+               dfailprintk(ioc, printk(MPT3SAS_FMT
+                   "%s: failed getting to correct state\n",
+                   ioc->name, __func__));
+               return r;
+       }
        mpi_reply_sz = sizeof(Mpi2IOCFactsReply_t);
        mpi_request_sz = sizeof(Mpi2IOCFactsRequest_t);
        memset(&mpi_request, 0, mpi_request_sz);
@@ -3783,7 +4021,7 @@ _base_send_ioc_init(struct MPT3SAS_ADAPTER *ioc, int sleep_flag)
        mpi_request.WhoInit = MPI2_WHOINIT_HOST_DRIVER;
        mpi_request.VF_ID = 0; /* TODO */
        mpi_request.VP_ID = 0;
-       mpi_request.MsgVersion = cpu_to_le16(MPI2_VERSION);
+       mpi_request.MsgVersion = cpu_to_le16(MPI25_VERSION);
        mpi_request.HeaderVersion = cpu_to_le16(MPI2_HEADER_VERSION);
 
        if (_base_is_controller_msix_enabled(ioc))
@@ -4524,8 +4762,15 @@ _base_make_ioc_operational(struct MPT3SAS_ADAPTER *ioc, int sleep_flag)
 
        /* initialize reply post host index */
        list_for_each_entry(reply_q, &ioc->reply_queue_list, list) {
-               writel(reply_q->msix_index << MPI2_RPHI_MSIX_INDEX_SHIFT,
-                   &ioc->chip->ReplyPostHostIndex);
+               if (ioc->msix96_vector)
+                       writel((reply_q->msix_index & 7)<<
+                          MPI2_RPHI_MSIX_INDEX_SHIFT,
+                          ioc->replyPostRegisterIndex[reply_q->msix_index/8]);
+               else
+                       writel(reply_q->msix_index <<
+                               MPI2_RPHI_MSIX_INDEX_SHIFT,
+                               &ioc->chip->ReplyPostHostIndex);
+
                if (!_base_is_controller_msix_enabled(ioc))
                        goto skip_init_reply_post_host_index;
        }
@@ -4564,8 +4809,6 @@ _base_make_ioc_operational(struct MPT3SAS_ADAPTER *ioc, int sleep_flag)
 void
 mpt3sas_base_free_resources(struct MPT3SAS_ADAPTER *ioc)
 {
-       struct pci_dev *pdev = ioc->pdev;
-
        dexitprintk(ioc, pr_info(MPT3SAS_FMT "%s\n", ioc->name,
            __func__));
 
@@ -4576,18 +4819,7 @@ mpt3sas_base_free_resources(struct MPT3SAS_ADAPTER *ioc)
                ioc->shost_recovery = 0;
        }
 
-       _base_free_irq(ioc);
-       _base_disable_msix(ioc);
-
-       if (ioc->chip_phys && ioc->chip)
-               iounmap(ioc->chip);
-       ioc->chip_phys = 0;
-
-       if (pci_is_enabled(pdev)) {
-               pci_release_selected_regions(ioc->pdev, ioc->bars);
-               pci_disable_pcie_error_reporting(pdev);
-               pci_disable_device(pdev);
-       }
+       mpt3sas_base_unmap_resources(ioc);
        return;
 }
 
@@ -4602,6 +4834,7 @@ mpt3sas_base_attach(struct MPT3SAS_ADAPTER *ioc)
 {
        int r, i;
        int cpu_id, last_cpu_id = 0;
+       u8 revision;
 
        dinitprintk(ioc, pr_info(MPT3SAS_FMT "%s\n", ioc->name,
            __func__));
@@ -4621,6 +4854,20 @@ mpt3sas_base_attach(struct MPT3SAS_ADAPTER *ioc)
                goto out_free_resources;
        }
 
+       /* Check whether the controller revision is C0 or above.
+        * Only C0 and above revision controllers support 96 MSI-X vectors.
+        */
+       revision = ioc->pdev->revision;
+
+       if ((ioc->pdev->device == MPI25_MFGPAGE_DEVID_SAS3004 ||
+            ioc->pdev->device == MPI25_MFGPAGE_DEVID_SAS3008 ||
+            ioc->pdev->device == MPI25_MFGPAGE_DEVID_SAS3108_1 ||
+            ioc->pdev->device == MPI25_MFGPAGE_DEVID_SAS3108_2 ||
+            ioc->pdev->device == MPI25_MFGPAGE_DEVID_SAS3108_5 ||
+            ioc->pdev->device == MPI25_MFGPAGE_DEVID_SAS3108_6) &&
+            (revision >= 0x02))
+               ioc->msix96_vector = 1;
+
        ioc->rdpq_array_enable_assigned = 0;
        ioc->dma_mask = 0;
        r = mpt3sas_base_map_resources(ioc);
@@ -4643,7 +4890,6 @@ mpt3sas_base_attach(struct MPT3SAS_ADAPTER *ioc)
        ioc->build_sg_scmd = &_base_build_sg_scmd_ieee;
        ioc->build_sg = &_base_build_sg_ieee;
        ioc->build_zero_len_sge = &_base_build_zero_len_sge_ieee;
-       ioc->mpi25 = 1;
        ioc->sge_size_ieee = sizeof(Mpi2IeeeSgeSimple64_t);
 
        /*
index afa881682bef4d489a5f5e6a1f67a85dd5688b88..f0e462b0880d21a645327bf60c221cc0f112c76a 100644 (file)
@@ -71,8 +71,8 @@
 #define MPT3SAS_DRIVER_NAME            "mpt3sas"
 #define MPT3SAS_AUTHOR "Avago Technologies <MPT-FusionLinux.pdl@avagotech.com>"
 #define MPT3SAS_DESCRIPTION    "LSI MPT Fusion SAS 3.0 Device Driver"
-#define MPT3SAS_DRIVER_VERSION         "04.100.00.00"
-#define MPT3SAS_MAJOR_VERSION          4
+#define MPT3SAS_DRIVER_VERSION         "09.100.00.00"
+#define MPT3SAS_MAJOR_VERSION          9
 #define MPT3SAS_MINOR_VERSION          100
 #define MPT3SAS_BUILD_VERSION          0
 #define MPT3SAS_RELEASE_VERSION        00
 #define MPT3SAS_INTEL_RS3FC044_SSDID   0x3523
 #define MPT3SAS_INTEL_RS3UC080_SSDID    0x3524
 
+/*
+ * Dell HBA branding
+ */
+#define MPT3SAS_DELL_12G_HBA_BRANDING       \
+       "Dell 12Gbps HBA"
+
+/*
+ * Dell HBA SSDIDs
+ */
+#define MPT3SAS_DELL_12G_HBA_SSDID     0x1F46
+
+/*
+ * Cisco HBA branding
+ */
+#define MPT3SAS_CISCO_12G_8E_HBA_BRANDING              \
+               "Cisco 9300-8E 12G SAS HBA"
+#define MPT3SAS_CISCO_12G_8I_HBA_BRANDING              \
+               "Cisco 9300-8i 12G SAS HBA"
+#define MPT3SAS_CISCO_12G_AVILA_HBA_BRANDING   \
+               "Cisco 12G Modular SAS Pass through Controller"
+#define MPT3SAS_CISCO_12G_COLUSA_MEZZANINE_HBA_BRANDING                \
+               "UCS C3X60 12G SAS Pass through Controller"
+/*
+ * Cisco HBA SSDIDs
+ */
+#define MPT3SAS_CISCO_12G_8E_HBA_SSDID  0x14C
+#define MPT3SAS_CISCO_12G_8I_HBA_SSDID  0x154
+#define MPT3SAS_CISCO_12G_AVILA_HBA_SSDID  0x155
+#define MPT3SAS_CISCO_12G_COLUSA_MEZZANINE_HBA_SSDID  0x156
+
 /*
  * status bits for ioc->diag_buffer_status
  */
 #define MPT3_DIAG_BUFFER_IS_RELEASED   (0x02)
 #define MPT3_DIAG_BUFFER_IS_DIAG_RESET (0x04)
 
+/*
+ * Combined Reply Queue constants,
+ * There are twelve Supplemental Reply Post Host Index Registers
+ * and each register is at offset 0x10 bytes from the previous one.
+ */
+#define MPT3_SUP_REPLY_POST_HOST_INDEX_REG_COUNT 12
+#define MPT3_SUP_REPLY_POST_HOST_INDEX_REG_OFFSET (0x10)
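With 96 MSI-X vectors enabled, each of the twelve supplemental registers serves eight reply queues: the register is selected by msix_index / 8 and the value written into it uses msix_index & 7, as in the writel() hunk earlier in this section. Below is a minimal sketch of how the register addresses could be derived from the two constants above; SUP_REPLY_POST_HOST_INDEX_BASE is a hypothetical placeholder for the offset of the first supplemental register, since that part of the commit is not shown in this excerpt.

	/* Sketch only: populate ioc->replyPostRegisterIndex[] from the
	 * constants above.  The cast mirrors the field's declared type
	 * (resource_size_t **). */
	static int
	example_map_reply_post_registers(struct MPT3SAS_ADAPTER *ioc)
	{
		int i;

		ioc->replyPostRegisterIndex = kcalloc(
		    MPT3_SUP_REPLY_POST_HOST_INDEX_REG_COUNT,
		    sizeof(resource_size_t *), GFP_KERNEL);
		if (!ioc->replyPostRegisterIndex)
			return -ENOMEM;

		for (i = 0; i < MPT3_SUP_REPLY_POST_HOST_INDEX_REG_COUNT; i++)
			ioc->replyPostRegisterIndex[i] = (resource_size_t *)
			    ((u8 __iomem *)ioc->chip +
			     SUP_REPLY_POST_HOST_INDEX_BASE +
			     i * MPT3_SUP_REPLY_POST_HOST_INDEX_REG_OFFSET);
		return 0;
	}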
 
 /* OEM Identifiers */
 #define MFG10_OEM_ID_INVALID                   (0x00000000)
 #define MFG10_GF0_SSD_DATA_SCRUB_DISABLE       (0x00000008)
 #define MFG10_GF0_SINGLE_DRIVE_R0              (0x00000010)
 
+#define VIRTUAL_IO_FAILED_RETRY                        (0x32010081)
+
 /* OEM Specific Flags will come from OEM specific header files */
 struct Mpi2ManufacturingPage10_t {
        MPI2_CONFIG_PAGE_HEADER Header;         /* 00h */
@@ -294,7 +333,8 @@ struct _internal_cmd {
  * @responding: used in _scsih_sas_device_mark_responding
  * @fast_path: fast path feature enable bit
  * @pfa_led_on: flag for PFA LED status
- *
+ * @pend_sas_rphy_add: flag to check if device is in sas_rphy_add()
+ *     addition routine.
  */
 struct _sas_device {
        struct list_head list;
@@ -315,6 +355,9 @@ struct _sas_device {
        u8      responding;
        u8      fast_path;
        u8      pfa_led_on;
+       u8      pend_sas_rphy_add;
+       u8      enclosure_level;
+       u8      connector_name[4];
 };
 
 /**
@@ -728,7 +771,8 @@ typedef void (*MPT3SAS_FLUSH_RUNNING_CMDS)(struct MPT3SAS_ADAPTER *ioc);
  *                             is assigned only once
  * @reply_queue_count: number of reply queues
  * @reply_queue_list: linked list containing the reply queue info
- * @reply_post_host_index: head index in the pool where FW completes IO
+ * @msix96_vector: 96 MSI-X vector support
+ * @replyPostRegisterIndex: index of next position in Reply Desc Post Queue
  * @delayed_tr_list: target reset link list
  * @delayed_tr_volume_list: volume target reset link list
  * @temp_sensors_count: flag to carry the number of temperature sensors
@@ -814,7 +858,6 @@ struct MPT3SAS_ADAPTER {
        MPT_BUILD_SG_SCMD build_sg_scmd;
        MPT_BUILD_SG    build_sg;
        MPT_BUILD_ZERO_LEN_SGE build_zero_len_sge;
-       u8              mpi25;
        u16             sge_size_ieee;
 
        /* function ptr for MPI sg elements only */
@@ -937,6 +980,10 @@ struct MPT3SAS_ADAPTER {
        u8              reply_queue_count;
        struct list_head reply_queue_list;
 
+       u8              msix96_vector;
+       /* reply post register index */
+       resource_size_t **replyPostRegisterIndex;
+
        struct list_head delayed_tr_list;
        struct list_head delayed_tr_volume_list;
        u8              temp_sensors_count;
index 5a97e3286719d8150a6dbf24bcd062f2effa4373..8ccef38523fa46c823672878978e1ab8d022018a 100644 (file)
@@ -585,6 +585,22 @@ _scsih_sas_device_remove(struct MPT3SAS_ADAPTER *ioc,
 
        if (!sas_device)
                return;
+       pr_info(MPT3SAS_FMT
+           "removing handle(0x%04x), sas_addr(0x%016llx)\n",
+           ioc->name, sas_device->handle,
+           (unsigned long long) sas_device->sas_address);
+
+       if (sas_device->enclosure_handle != 0)
+               pr_info(MPT3SAS_FMT
+                  "removing enclosure logical id(0x%016llx), slot(%d)\n",
+                  ioc->name, (unsigned long long)
+                  sas_device->enclosure_logical_id, sas_device->slot);
+
+       if (sas_device->connector_name[0] != '\0')
+               pr_info(MPT3SAS_FMT
+                  "removing enclosure level(0x%04x), connector name( %s)\n",
+                  ioc->name, sas_device->enclosure_level,
+                  sas_device->connector_name);
 
        spin_lock_irqsave(&ioc->sas_device_lock, flags);
        list_del(&sas_device->list);
@@ -663,6 +679,18 @@ _scsih_sas_device_add(struct MPT3SAS_ADAPTER *ioc,
                ioc->name, __func__, sas_device->handle,
                (unsigned long long)sas_device->sas_address));
 
+       if (sas_device->enclosure_handle != 0)
+               dewtprintk(ioc, pr_info(MPT3SAS_FMT
+                   "%s: enclosure logical id(0x%016llx), slot( %d)\n",
+                   ioc->name, __func__, (unsigned long long)
+                   sas_device->enclosure_logical_id, sas_device->slot));
+
+       if (sas_device->connector_name[0] != '\0')
+               dewtprintk(ioc, pr_info(MPT3SAS_FMT
+                   "%s: enclosure level(0x%04x), connector name( %s)\n",
+                   ioc->name, __func__,
+                   sas_device->enclosure_level, sas_device->connector_name));
+
        spin_lock_irqsave(&ioc->sas_device_lock, flags);
        list_add_tail(&sas_device->list, &ioc->sas_device_list);
        spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
@@ -704,6 +732,18 @@ _scsih_sas_device_init_add(struct MPT3SAS_ADAPTER *ioc,
                __func__, sas_device->handle,
                (unsigned long long)sas_device->sas_address));
 
+       if (sas_device->enclosure_handle != 0)
+               dewtprintk(ioc, pr_info(MPT3SAS_FMT
+                   "%s: enclosure logical id(0x%016llx), slot( %d)\n",
+                   ioc->name, __func__, (unsigned long long)
+                   sas_device->enclosure_logical_id, sas_device->slot));
+
+       if (sas_device->connector_name[0] != '\0')
+               dewtprintk(ioc, pr_info(MPT3SAS_FMT
+                   "%s: enclosure level(0x%04x), connector name( %s)\n",
+                   ioc->name, __func__, sas_device->enclosure_level,
+                   sas_device->connector_name));
+
        spin_lock_irqsave(&ioc->sas_device_lock, flags);
        list_add_tail(&sas_device->list, &ioc->sas_device_init_list);
        _scsih_determine_boot_device(ioc, sas_device, 0);
@@ -1772,10 +1812,16 @@ _scsih_slave_configure(struct scsi_device *sdev)
            "sas_addr(0x%016llx), phy(%d), device_name(0x%016llx)\n",
            ds, handle, (unsigned long long)sas_device->sas_address,
            sas_device->phy, (unsigned long long)sas_device->device_name);
-       sdev_printk(KERN_INFO, sdev,
-               "%s: enclosure_logical_id(0x%016llx), slot(%d)\n",
-               ds, (unsigned long long)
-           sas_device->enclosure_logical_id, sas_device->slot);
+       if (sas_device->enclosure_handle != 0)
+               sdev_printk(KERN_INFO, sdev,
+                    "%s: enclosure_logical_id(0x%016llx), slot(%d)\n",
+                    ds, (unsigned long long)
+                    sas_device->enclosure_logical_id, sas_device->slot);
+       if (sas_device->connector_name[0] != '\0')
+               sdev_printk(KERN_INFO, sdev,
+                    "%s: enclosure level(0x%04x), connector name( %s)\n",
+                    ds, sas_device->enclosure_level,
+                    sas_device->connector_name);
 
        spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
 
@@ -2189,10 +2235,17 @@ _scsih_tm_display_info(struct MPT3SAS_ADAPTER *ioc, struct scsi_cmnd *scmd)
                            sas_device->handle,
                            (unsigned long long)sas_device->sas_address,
                            sas_device->phy);
-                       starget_printk(KERN_INFO, starget,
-                           "enclosure_logical_id(0x%016llx), slot(%d)\n",
-                          (unsigned long long)sas_device->enclosure_logical_id,
-                           sas_device->slot);
+                       if (sas_device->enclosure_handle != 0)
+                               starget_printk(KERN_INFO, starget,
+                                "enclosure_logical_id(0x%016llx), slot(%d)\n",
+                                (unsigned long long)
+                                sas_device->enclosure_logical_id,
+                                sas_device->slot);
+                       if (sas_device->connector_name[0] != '\0')
+                               starget_printk(KERN_INFO, starget,
+                               "enclosure level(0x%04x),connector name(%s)\n",
+                                sas_device->enclosure_level,
+                                sas_device->connector_name);
                }
                spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
        }
@@ -2551,6 +2604,75 @@ _scsih_fw_event_cleanup_queue(struct MPT3SAS_ADAPTER *ioc)
        }
 }
 
+/**
+ * _scsih_internal_device_block - block the sdev device
+ * @sdev: per device object
+ * @sas_device_priv_data : per device driver private data
+ *
+ * make sure device is blocked without error, if not
+ * print an error
+ */
+static void
+_scsih_internal_device_block(struct scsi_device *sdev,
+                       struct MPT3SAS_DEVICE *sas_device_priv_data)
+{
+       int r = 0;
+
+       sdev_printk(KERN_INFO, sdev, "device_block, handle(0x%04x)\n",
+           sas_device_priv_data->sas_target->handle);
+       sas_device_priv_data->block = 1;
+
+       r = scsi_internal_device_block(sdev);
+       if (r == -EINVAL)
+               sdev_printk(KERN_WARNING, sdev,
+                   "device_block failed with return(%d) for handle(0x%04x)\n",
+                   sas_device_priv_data->sas_target->handle, r);
+}
+
+/**
+ * _scsih_internal_device_unblock - unblock the sdev device
+ * @sdev: per device object
+ * @sas_device_priv_data : per device driver private data
+ * Make sure the device is unblocked without error; if not, retry
+ * by blocking and then unblocking.
+ */
+
+static void
+_scsih_internal_device_unblock(struct scsi_device *sdev,
+                       struct MPT3SAS_DEVICE *sas_device_priv_data)
+{
+       int r = 0;
+
+       sdev_printk(KERN_WARNING, sdev, "device_unblock and setting to running, "
+           "handle(0x%04x)\n", sas_device_priv_data->sas_target->handle);
+       sas_device_priv_data->block = 0;
+       r = scsi_internal_device_unblock(sdev, SDEV_RUNNING);
+       if (r == -EINVAL) {
+               /* The device has been set to SDEV_RUNNING by SD layer during
+                * device addition but the request queue is still stopped by
+                * our earlier block call. We need to perform a block again
+                * to get the device to SDEV_BLOCK and then to SDEV_RUNNING */
+
+               sdev_printk(KERN_WARNING, sdev,
+                   "device_unblock failed with return(%d) for handle(0x%04x) "
+                   "performing a block followed by an unblock\n",
+                   sas_device_priv_data->sas_target->handle, r);
+               sas_device_priv_data->block = 1;
+               r = scsi_internal_device_block(sdev);
+               if (r)
+                       sdev_printk(KERN_WARNING, sdev, "retried device_block "
+                           "failed with return(%d) for handle(0x%04x)\n",
+                           sas_device_priv_data->sas_target->handle, r);
+
+               sas_device_priv_data->block = 0;
+               r = scsi_internal_device_unblock(sdev, SDEV_RUNNING);
+               if (r)
+                       sdev_printk(KERN_WARNING, sdev, "retried device_unblock"
+                           " failed with return(%d) for handle(0x%04x)\n",
+                           sas_device_priv_data->sas_target->handle, r);
+       }
+}
+
 /**
  * _scsih_ublock_io_all_device - unblock every device
  * @ioc: per adapter object
@@ -2570,11 +2692,10 @@ _scsih_ublock_io_all_device(struct MPT3SAS_ADAPTER *ioc)
                if (!sas_device_priv_data->block)
                        continue;
 
-               sas_device_priv_data->block = 0;
                dewtprintk(ioc, sdev_printk(KERN_INFO, sdev,
                        "device_running, handle(0x%04x)\n",
                    sas_device_priv_data->sas_target->handle));
-               scsi_internal_device_unblock(sdev, SDEV_RUNNING);
+               _scsih_internal_device_unblock(sdev, sas_device_priv_data);
        }
 }
 
@@ -2599,10 +2720,9 @@ _scsih_ublock_io_device(struct MPT3SAS_ADAPTER *ioc, u64 sas_address)
                if (sas_device_priv_data->sas_target->sas_address
                    != sas_address)
                        continue;
-               if (sas_device_priv_data->block) {
-                       sas_device_priv_data->block = 0;
-                       scsi_internal_device_unblock(sdev, SDEV_RUNNING);
-               }
+               if (sas_device_priv_data->block)
+                       _scsih_internal_device_unblock(sdev,
+                               sas_device_priv_data);
        }
 }
 
@@ -2625,10 +2745,7 @@ _scsih_block_io_all_device(struct MPT3SAS_ADAPTER *ioc)
                        continue;
                if (sas_device_priv_data->block)
                        continue;
-               sas_device_priv_data->block = 1;
-               scsi_internal_device_block(sdev);
-               sdev_printk(KERN_INFO, sdev, "device_blocked, handle(0x%04x)\n",
-                   sas_device_priv_data->sas_target->handle);
+               _scsih_internal_device_block(sdev, sas_device_priv_data);
        }
 }
 
@@ -2644,6 +2761,11 @@ _scsih_block_io_device(struct MPT3SAS_ADAPTER *ioc, u16 handle)
 {
        struct MPT3SAS_DEVICE *sas_device_priv_data;
        struct scsi_device *sdev;
+       struct _sas_device *sas_device;
+
+       sas_device = _scsih_sas_device_find_by_handle(ioc, handle);
+       if (!sas_device)
+               return;
 
        shost_for_each_device(sdev, ioc->shost) {
                sas_device_priv_data = sdev->hostdata;
@@ -2653,10 +2775,9 @@ _scsih_block_io_device(struct MPT3SAS_ADAPTER *ioc, u16 handle)
                        continue;
                if (sas_device_priv_data->block)
                        continue;
-               sas_device_priv_data->block = 1;
-               scsi_internal_device_block(sdev);
-               sdev_printk(KERN_INFO, sdev,
-                       "device_blocked, handle(0x%04x)\n", handle);
+               if (sas_device->pend_sas_rphy_add)
+                       continue;
+               _scsih_internal_device_block(sdev, sas_device_priv_data);
        }
 }
 
@@ -2806,6 +2927,18 @@ _scsih_tm_tr_send(struct MPT3SAS_ADAPTER *ioc, u16 handle)
                        "setting delete flag: handle(0x%04x), sas_addr(0x%016llx)\n",
                        ioc->name, handle,
                    (unsigned long long)sas_address));
+               if (sas_device->enclosure_handle != 0)
+                       dewtprintk(ioc, pr_info(MPT3SAS_FMT
+                        "setting delete flag:enclosure logical id(0x%016llx),"
+                        " slot(%d)\n", ioc->name, (unsigned long long)
+                         sas_device->enclosure_logical_id,
+                         sas_device->slot));
+               if (sas_device->connector_name[0] != '\0')
+                       dewtprintk(ioc, pr_info(MPT3SAS_FMT
+                        "setting delete flag: enclosure level(0x%04x),"
+                        " connector name( %s)\n", ioc->name,
+                         sas_device->enclosure_level,
+                         sas_device->connector_name));
                _scsih_ublock_io_device(ioc, sas_address);
                sas_target_priv_data->handle = MPT3SAS_INVALID_DEVICE_HANDLE;
        }
@@ -3821,10 +3954,19 @@ _scsih_scsi_ioc_info(struct MPT3SAS_ADAPTER *ioc, struct scsi_cmnd *scmd,
                                "\tsas_address(0x%016llx), phy(%d)\n",
                                ioc->name, (unsigned long long)
                            sas_device->sas_address, sas_device->phy);
-                       pr_warn(MPT3SAS_FMT
-                           "\tenclosure_logical_id(0x%016llx), slot(%d)\n",
-                           ioc->name, (unsigned long long)
-                           sas_device->enclosure_logical_id, sas_device->slot);
+                       if (sas_device->enclosure_handle != 0)
+                               pr_warn(MPT3SAS_FMT
+                                 "\tenclosure_logical_id(0x%016llx),"
+                                 "slot(%d)\n", ioc->name,
+                                 (unsigned long long)
+                                 sas_device->enclosure_logical_id,
+                                 sas_device->slot);
+                       if (sas_device->connector_name[0])
+                               pr_warn(MPT3SAS_FMT
+                                 "\tenclosure level(0x%04x),"
+                                 " connector name( %s)\n", ioc->name,
+                                 sas_device->enclosure_level,
+                                 sas_device->connector_name);
                }
                spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
        }
@@ -3999,7 +4141,16 @@ _scsih_smart_predicted_fault(struct MPT3SAS_ADAPTER *ioc, u16 handle)
                spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
                return;
        }
-       starget_printk(KERN_WARNING, starget, "predicted fault\n");
+       if (sas_device->enclosure_handle != 0)
+               starget_printk(KERN_INFO, starget, "predicted fault, "
+                       "enclosure logical id(0x%016llx), slot(%d)\n",
+                       (unsigned long long)sas_device->enclosure_logical_id,
+                       sas_device->slot);
+       if (sas_device->connector_name[0] != '\0')
+               starget_printk(KERN_WARNING, starget, "predicted fault, "
+                       "enclosure level(0x%04x), connector name( %s)\n",
+                       sas_device->enclosure_level,
+                       sas_device->connector_name);
        spin_unlock_irqrestore(&ioc->sas_device_lock, flags);
 
        if (ioc->pdev->subsystem_vendor == PCI_VENDOR_ID_IBM)
@@ -4119,8 +4270,15 @@ _scsih_io_done(struct MPT3SAS_ADAPTER *ioc, u16 smid, u8 msix_index, u32 reply)
                        _scsih_smart_predicted_fault(ioc,
                            le16_to_cpu(mpi_reply->DevHandle));
                mpt3sas_trigger_scsi(ioc, data.skey, data.asc, data.ascq);
-       }
 
+#ifdef CONFIG_SCSI_MPT3SAS_LOGGING
+               if (!(ioc->logging_level & MPT_DEBUG_REPLY) &&
+                    ((scmd->sense_buffer[2] == UNIT_ATTENTION) ||
+                    (scmd->sense_buffer[2] == MEDIUM_ERROR) ||
+                    (scmd->sense_buffer[2] == HARDWARE_ERROR)))
+                       _scsih_scsi_ioc_info(ioc, scmd, mpi_reply, smid);
+#endif
+       }
        switch (ioc_status) {
        case MPI2_IOCSTATUS_BUSY:
        case MPI2_IOCSTATUS_INSUFFICIENT_RESOURCES:
@@ -4146,6 +4304,9 @@ _scsih_io_done(struct MPT3SAS_ADAPTER *ioc, u16 smid, u8 msix_index, u32 reply)
                                scmd->device->expecting_cc_ua = 1;
                        }
                        break;
+               } else if (log_info == VIRTUAL_IO_FAILED_RETRY) {
+                       scmd->result = DID_RESET << 16;
+                       break;
                }
                scmd->result = DID_SOFT_ERROR << 16;
                break;
@@ -4788,6 +4949,16 @@ _scsih_check_device(struct MPT3SAS_ADAPTER *ioc,
                        sas_device->handle, handle);
                sas_target_priv_data->handle = handle;
                sas_device->handle = handle;
+               if (sas_device_pg0.Flags &
+                    MPI2_SAS_DEVICE0_FLAGS_ENCL_LEVEL_VALID) {
+                       sas_device->enclosure_level =
+                               le16_to_cpu(sas_device_pg0.EnclosureLevel);
+                       memcpy(&sas_device->connector_name[0],
+                               &sas_device_pg0.ConnectorName[0], 4);
+               } else {
+                       sas_device->enclosure_level = 0;
+                       sas_device->connector_name[0] = '\0';
+               }
        }
 
        /* check if device is present */
@@ -4894,14 +5065,24 @@ _scsih_add_device(struct MPT3SAS_ADAPTER *ioc, u16 handle, u8 phy_num,
                    ioc->name, __FILE__, __LINE__, __func__);
        sas_device->enclosure_handle =
            le16_to_cpu(sas_device_pg0.EnclosureHandle);
-       sas_device->slot =
-           le16_to_cpu(sas_device_pg0.Slot);
+       if (sas_device->enclosure_handle != 0)
+               sas_device->slot =
+                   le16_to_cpu(sas_device_pg0.Slot);
        sas_device->device_info = device_info;
        sas_device->sas_address = sas_address;
        sas_device->phy = sas_device_pg0.PhyNum;
        sas_device->fast_path = (le16_to_cpu(sas_device_pg0.Flags) &
            MPI25_SAS_DEVICE0_FLAGS_FAST_PATH_CAPABLE) ? 1 : 0;
 
+       if (sas_device_pg0.Flags & MPI2_SAS_DEVICE0_FLAGS_ENCL_LEVEL_VALID) {
+               sas_device->enclosure_level =
+                       le16_to_cpu(sas_device_pg0.EnclosureLevel);
+               memcpy(&sas_device->connector_name[0],
+                       &sas_device_pg0.ConnectorName[0], 4);
+       } else {
+               sas_device->enclosure_level = 0;
+               sas_device->connector_name[0] = '\0';
+       }
        /* get enclosure_logical_id */
        if (sas_device->enclosure_handle && !(mpt3sas_config_get_enclosure_pg0(
           ioc, &mpi_reply, &enclosure_pg0, MPI2_SAS_ENCLOS_PGAD_FORM_HANDLE,
@@ -4943,6 +5124,18 @@ _scsih_remove_device(struct MPT3SAS_ADAPTER *ioc,
                ioc->name, __func__,
            sas_device->handle, (unsigned long long)
            sas_device->sas_address));
+       if (sas_device->enclosure_handle != 0)
+               dewtprintk(ioc, pr_info(MPT3SAS_FMT
+                   "%s: enter: enclosure logical id(0x%016llx), slot(%d)\n",
+                   ioc->name, __func__,
+                   (unsigned long long)sas_device->enclosure_logical_id,
+                   sas_device->slot));
+       if (sas_device->connector_name[0] != '\0')
+               dewtprintk(ioc, pr_info(MPT3SAS_FMT
+                 "%s: enter: enclosure level(0x%04x), connector name( %s)\n",
+                 ioc->name, __func__,
+                 sas_device->enclosure_level,
+                 sas_device->connector_name));
 
        if (sas_device->starget && sas_device->starget->hostdata) {
                sas_target_priv_data = sas_device->starget->hostdata;
@@ -4959,12 +5152,34 @@ _scsih_remove_device(struct MPT3SAS_ADAPTER *ioc,
                "removing handle(0x%04x), sas_addr(0x%016llx)\n",
                ioc->name, sas_device->handle,
            (unsigned long long) sas_device->sas_address);
+       if (sas_device->enclosure_handle != 0)
+               pr_info(MPT3SAS_FMT
+                 "removing : enclosure logical id(0x%016llx), slot(%d)\n",
+                 ioc->name,
+                 (unsigned long long)sas_device->enclosure_logical_id,
+                 sas_device->slot);
+       if (sas_device->connector_name[0] != '\0')
+               pr_info(MPT3SAS_FMT
+                 "removing enclosure level(0x%04x), connector name( %s)\n",
+                 ioc->name, sas_device->enclosure_level,
+                 sas_device->connector_name);
 
        dewtprintk(ioc, pr_info(MPT3SAS_FMT
                "%s: exit: handle(0x%04x), sas_addr(0x%016llx)\n",
                ioc->name, __func__,
-           sas_device->handle, (unsigned long long)
-           sas_device->sas_address));
+               sas_device->handle, (unsigned long long)
+               sas_device->sas_address));
+       if (sas_device->enclosure_handle != 0)
+               dewtprintk(ioc, pr_info(MPT3SAS_FMT
+                   "%s: exit: enclosure logical id(0x%016llx), slot(%d)\n",
+                   ioc->name, __func__,
+                   (unsigned long long)sas_device->enclosure_logical_id,
+                   sas_device->slot));
+       if (sas_device->connector_name[0] != '\0')
+               dewtprintk(ioc, pr_info(MPT3SAS_FMT
+                   "%s: exit: enclosure level(0x%04x), connector name(%s)\n",
+                   ioc->name, __func__, sas_device->enclosure_level,
+                   sas_device->connector_name));
 
        kfree(sas_device);
 }
@@ -6357,9 +6572,7 @@ _scsih_prep_device_scan(struct MPT3SAS_ADAPTER *ioc)
 /**
  * _scsih_mark_responding_sas_device - mark a sas_devices as responding
  * @ioc: per adapter object
- * @sas_address: sas address
- * @slot: enclosure slot id
- * @handle: device handle
+ * @sas_device_pg0: SAS Device page 0
  *
  * After host reset, find out whether devices are still responding.
  * Used in _scsih_remove_unresponsive_sas_devices.
@@ -6367,8 +6580,8 @@ _scsih_prep_device_scan(struct MPT3SAS_ADAPTER *ioc)
  * Return nothing.
  */
 static void
-_scsih_mark_responding_sas_device(struct MPT3SAS_ADAPTER *ioc, u64 sas_address,
-       u16 slot, u16 handle)
+_scsih_mark_responding_sas_device(struct MPT3SAS_ADAPTER *ioc,
+Mpi2SasDevicePage0_t *sas_device_pg0)
 {
        struct MPT3SAS_TARGET *sas_target_priv_data = NULL;
        struct scsi_target *starget;
@@ -6377,8 +6590,8 @@ _scsih_mark_responding_sas_device(struct MPT3SAS_ADAPTER *ioc, u64 sas_address,
 
        spin_lock_irqsave(&ioc->sas_device_lock, flags);
        list_for_each_entry(sas_device, &ioc->sas_device_list, list) {
-               if (sas_device->sas_address == sas_address &&
-                   sas_device->slot == slot) {
+               if ((sas_device->sas_address == sas_device_pg0->SASAddress) &&
+                       (sas_device->slot == sas_device_pg0->Slot)) {
                        sas_device->responding = 1;
                        starget = sas_device->starget;
                        if (starget && starget->hostdata) {
@@ -6387,22 +6600,40 @@ _scsih_mark_responding_sas_device(struct MPT3SAS_ADAPTER *ioc, u64 sas_address,
                                sas_target_priv_data->deleted = 0;
                        } else
                                sas_target_priv_data = NULL;
-                       if (starget)
+                       if (starget) {
                                starget_printk(KERN_INFO, starget,
-                                   "handle(0x%04x), sas_addr(0x%016llx), "
-                                   "enclosure logical id(0x%016llx), "
-                                   "slot(%d)\n", handle,
-                                   (unsigned long long)sas_device->sas_address,
+                                   "handle(0x%04x), sas_addr(0x%016llx)\n",
+                                   sas_device_pg0->DevHandle,
                                    (unsigned long long)
-                                   sas_device->enclosure_logical_id,
-                                   sas_device->slot);
-                       if (sas_device->handle == handle)
+                                   sas_device->sas_address);
+
+                               if (sas_device->enclosure_handle != 0)
+                                       starget_printk(KERN_INFO, starget,
+                                        "enclosure logical id(0x%016llx),"
+                                        " slot(%d)\n",
+                                        (unsigned long long)
+                                        sas_device->enclosure_logical_id,
+                                        sas_device->slot);
+                       }
+                       if (sas_device_pg0->Flags &
+                             MPI2_SAS_DEVICE0_FLAGS_ENCL_LEVEL_VALID) {
+                               sas_device->enclosure_level =
+                                  le16_to_cpu(sas_device_pg0->EnclosureLevel);
+                               memcpy(&sas_device->connector_name[0],
+                                       &sas_device_pg0->ConnectorName[0], 4);
+                       } else {
+                               sas_device->enclosure_level = 0;
+                               sas_device->connector_name[0] = '\0';
+                       }
+
+                       if (sas_device->handle == sas_device_pg0->DevHandle)
                                goto out;
                        pr_info("\thandle changed from(0x%04x)!!!\n",
                            sas_device->handle);
-                       sas_device->handle = handle;
+                       sas_device->handle = sas_device_pg0->DevHandle;
                        if (sas_target_priv_data)
-                               sas_target_priv_data->handle = handle;
+                               sas_target_priv_data->handle =
+                                       sas_device_pg0->DevHandle;
                        goto out;
                }
        }
@@ -6441,13 +6672,15 @@ _scsih_search_responding_sas_devices(struct MPT3SAS_ADAPTER *ioc)
                    MPI2_IOCSTATUS_MASK;
                if (ioc_status != MPI2_IOCSTATUS_SUCCESS)
                        break;
-               handle = le16_to_cpu(sas_device_pg0.DevHandle);
+               handle = sas_device_pg0.DevHandle =
+                               le16_to_cpu(sas_device_pg0.DevHandle);
                device_info = le32_to_cpu(sas_device_pg0.DeviceInfo);
                if (!(_scsih_is_end_device(device_info)))
                        continue;
-               _scsih_mark_responding_sas_device(ioc,
-                   le64_to_cpu(sas_device_pg0.SASAddress),
-                   le16_to_cpu(sas_device_pg0.Slot), handle);
+               sas_device_pg0.SASAddress =
+                               le64_to_cpu(sas_device_pg0.SASAddress);
+               sas_device_pg0.Slot = le16_to_cpu(sas_device_pg0.Slot);
+               _scsih_mark_responding_sas_device(ioc, &sas_device_pg0);
        }
 
  out:
@@ -7854,8 +8087,8 @@ _scsih_probe(struct pci_dev *pdev, const struct pci_device_id *id)
        /* event thread */
        snprintf(ioc->firmware_event_name, sizeof(ioc->firmware_event_name),
            "fw_event%d", ioc->id);
-       ioc->firmware_event_thread = create_singlethread_workqueue(
-           ioc->firmware_event_name);
+       ioc->firmware_event_thread = alloc_ordered_workqueue(
+           ioc->firmware_event_name, WQ_MEM_RECLAIM);
        if (!ioc->firmware_event_thread) {
                pr_err(MPT3SAS_FMT "failure at %s:%d/%s()!\n",
                    ioc->name, __FILE__, __LINE__, __func__);
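The hunk above replaces create_singlethread_workqueue() with alloc_ordered_workqueue() plus WQ_MEM_RECLAIM for the firmware event thread: ordering is preserved (events are still handled one at a time), and the rescuer thread that WQ_MEM_RECLAIM provides guarantees forward progress if events must be processed during memory reclaim. A minimal, generic sketch of the pattern (example_work is assumed to be a work_struct initialized elsewhere):

	/* Sketch only: an ordered, reclaim-safe workqueue. */
	static int
	example_setup(struct workqueue_struct **wqp, struct work_struct *example_work)
	{
		*wqp = alloc_ordered_workqueue("example_events", WQ_MEM_RECLAIM);
		if (!*wqp)
			return -ENOMEM;
		/* items queued on *wqp run strictly one at a time, in order */
		queue_work(*wqp, example_work);
		return 0;
	}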
index efb98afc46e08208e33a07fdf7eabb90e3b52a17..70fd019e7ee585c14deea0e4ec1ffcff66709f8e 100644 (file)
@@ -649,6 +649,7 @@ mpt3sas_transport_port_add(struct MPT3SAS_ADAPTER *ioc, u16 handle,
        unsigned long flags;
        struct _sas_node *sas_node;
        struct sas_rphy *rphy;
+       struct _sas_device *sas_device = NULL;
        int i;
        struct sas_port *port;
 
@@ -731,10 +732,27 @@ mpt3sas_transport_port_add(struct MPT3SAS_ADAPTER *ioc, u16 handle,
                    mpt3sas_port->remote_identify.device_type);
 
        rphy->identify = mpt3sas_port->remote_identify;
+
+       if (mpt3sas_port->remote_identify.device_type == SAS_END_DEVICE) {
+               sas_device = mpt3sas_scsih_sas_device_find_by_sas_address(ioc,
+                                   mpt3sas_port->remote_identify.sas_address);
+               if (!sas_device) {
+                       dfailprintk(ioc, printk(MPT3SAS_FMT
+                               "failure at %s:%d/%s()!\n",
+                               ioc->name, __FILE__, __LINE__, __func__));
+                       goto out_fail;
+               }
+               sas_device->pend_sas_rphy_add = 1;
+       }
+
        if ((sas_rphy_add(rphy))) {
                pr_err(MPT3SAS_FMT "failure at %s:%d/%s()!\n",
                    ioc->name, __FILE__, __LINE__, __func__);
        }
+
+       if (mpt3sas_port->remote_identify.device_type == SAS_END_DEVICE)
+               sas_device->pend_sas_rphy_add = 0;
+
        if ((ioc->logging_level & MPT_DEBUG_TRANSPORT))
                dev_printk(KERN_INFO, &rphy->dev,
                        "add: handle(0x%04x), sas_addr(0x%016llx)\n",
@@ -1946,7 +1964,7 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
        } else {
                dma_addr_out = pci_map_single(ioc->pdev, bio_data(req->bio),
                    blk_rq_bytes(req), PCI_DMA_BIDIRECTIONAL);
-               if (!dma_addr_out) {
+               if (pci_dma_mapping_error(ioc->pdev, dma_addr_out)) {
                        pr_info(MPT3SAS_FMT "%s(): DMA Addr out = NULL\n",
                            ioc->name, __func__);
                        rc = -ENOMEM;
@@ -1968,7 +1986,7 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
        } else {
                dma_addr_in =  pci_map_single(ioc->pdev, bio_data(rsp->bio),
                    blk_rq_bytes(rsp), PCI_DMA_BIDIRECTIONAL);
-               if (!dma_addr_in) {
+               if (pci_dma_mapping_error(ioc->pdev, dma_addr_in)) {
                        pr_info(MPT3SAS_FMT "%s(): DMA Addr in = NULL\n",
                            ioc->name, __func__);
                        rc = -ENOMEM;
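Both hunks above fix the error check after pci_map_single(): a failed mapping is not guaranteed to be 0/NULL on every platform (and 0 can even be a valid DMA address), so the result must be tested with pci_dma_mapping_error(). A minimal sketch of the corrected pattern:

	/* Sketch only: portable detection of a streaming DMA mapping failure. */
	static int
	example_map(struct pci_dev *pdev, void *buf, size_t len, dma_addr_t *out)
	{
		*out = pci_map_single(pdev, buf, len, PCI_DMA_BIDIRECTIONAL);
		if (pci_dma_mapping_error(pdev, *out))
			return -ENOMEM;	/* never compare the handle against 0 */
		return 0;
	}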
index f466a6aa8830c9d2341a21c92e3d1904a7521bd5..e2d555c1bffc16f94a87ed6b009d868cc3784a3e 100644 (file)
@@ -324,13 +324,9 @@ int mvs_ioremap(struct mvs_info *mvi, int bar, int bar_ex)
                        goto err_out;
 
                res_flag_ex = pci_resource_flags(pdev, bar_ex);
-               if (res_flag_ex & IORESOURCE_MEM) {
-                       if (res_flag_ex & IORESOURCE_CACHEABLE)
-                               mvi->regs_ex = ioremap(res_start, res_len);
-                       else
-                               mvi->regs_ex = ioremap_nocache(res_start,
-                                               res_len);
-               } else
+               if (res_flag_ex & IORESOURCE_MEM)
+                       mvi->regs_ex = ioremap(res_start, res_len);
+               else
                        mvi->regs_ex = (void *)res_start;
                if (!mvi->regs_ex)
                        goto err_out;
@@ -345,10 +341,7 @@ int mvs_ioremap(struct mvs_info *mvi, int bar, int bar_ex)
        }
 
        res_flag = pci_resource_flags(pdev, bar);
-       if (res_flag & IORESOURCE_CACHEABLE)
-               mvi->regs = ioremap(res_start, res_len);
-       else
-               mvi->regs = ioremap_nocache(res_start, res_len);
+       mvi->regs = ioremap(res_start, res_len);
 
        if (!mvi->regs) {
                if (mvi->regs_ex && (res_flag_ex & IORESOURCE_MEM))
index 39306b1e704c5202764d95a510bab4431e0f97dc..04e67a190652ec8f614a99492bc13bdf08a0e01c 100644 (file)
@@ -2642,6 +2642,7 @@ mpi_sata_completion(struct pm8001_hba_info *pm8001_ha, void *piomb)
                ts->resp = SAS_TASK_COMPLETE;
                ts->stat = SAS_OPEN_REJECT;
                ts->open_rej_reason = SAS_OREJ_RSVD_RETRY;
+               break;
        default:
                PM8001_IO_DBG(pm8001_ha,
                        pm8001_printk("Unknown status 0x%x\n", status));
index 0e1628f2018e572a8b8300fe4ef0753bb95fc7dd..9a389f1508de8518cf08b0b62dee1c42251a2907 100644 (file)
@@ -2337,6 +2337,7 @@ mpi_sata_completion(struct pm8001_hba_info *pm8001_ha, void *piomb)
                ts->resp = SAS_TASK_COMPLETE;
                ts->stat = SAS_OPEN_REJECT;
                ts->open_rej_reason = SAS_OREJ_RSVD_RETRY;
+               break;
        default:
                PM8001_IO_DBG(pm8001_ha,
                        pm8001_printk("Unknown status 0x%x\n", status));
index 33f60c92e20e91ade58dbcb3537a1276a9d03ff4..a0f732b138e4b5868867a4c94e191ffd02ec43a2 100644 (file)
@@ -32,10 +32,10 @@ config SCSI_QLA_FC
        They are also included in the linux-firmware tree as well.
 
 config TCM_QLA2XXX
-       tristate "TCM_QLA2XXX fabric module for Qlogic 2xxx series target mode HBAs"
+       tristate "TCM_QLA2XXX fabric module for QLogic 24xx+ series target mode HBAs"
        depends on SCSI_QLA_FC && TARGET_CORE
        depends on LIBFC
        select BTREE
        default n
        ---help---
-       Say Y here to enable the TCM_QLA2XXX fabric module for Qlogic 2xxx series target mode HBAs
+       Say Y here to enable the TCM_QLA2XXX fabric module for QLogic 24xx+ series target mode HBAs
index 7ed7bae6172b21ab43453ebfc3af7f2d72aec200..ac65cb7b48861b81dc1daa0ea531e506cb5752fa 100644 (file)
@@ -1359,9 +1359,7 @@ static void tcm_qla2xxx_free_session(struct qla_tgt_sess *sess)
        struct qla_hw_data *ha = tgt->ha;
        scsi_qla_host_t *vha = pci_get_drvdata(ha->pdev);
        struct se_session *se_sess;
-       struct se_node_acl *se_nacl;
        struct tcm_qla2xxx_lport *lport;
-       struct tcm_qla2xxx_nacl *nacl;
 
        BUG_ON(in_interrupt());
 
@@ -1371,8 +1369,6 @@ static void tcm_qla2xxx_free_session(struct qla_tgt_sess *sess)
                dump_stack();
                return;
        }
-       se_nacl = se_sess->se_node_acl;
-       nacl = container_of(se_nacl, struct tcm_qla2xxx_nacl, se_node_acl);
 
        lport = vha->vha_tgt.target_lport_ptr;
        if (!lport) {
@@ -1680,7 +1676,6 @@ static int tcm_qla2xxx_lport_register_npiv_cb(struct scsi_qla_host *base_vha,
                        (struct tcm_qla2xxx_lport *)target_lport_ptr;
        struct tcm_qla2xxx_lport *base_lport =
                        (struct tcm_qla2xxx_lport *)base_vha->vha_tgt.target_lport_ptr;
-       struct tcm_qla2xxx_tpg *base_tpg;
        struct fc_vport_identifiers vport_id;
 
        if (!qla_tgt_mode_enabled(base_vha)) {
@@ -1693,7 +1688,6 @@ static int tcm_qla2xxx_lport_register_npiv_cb(struct scsi_qla_host *base_vha,
                pr_err("qla2xxx base_lport or tpg_1 not available\n");
                return -EPERM;
        }
-       base_tpg = base_lport->tpg_1;
 
        memset(&vport_id, 0, sizeof(vport_id));
        vport_id.port_name = npiv_wwpn;
@@ -1810,6 +1804,11 @@ static const struct target_core_fabric_ops tcm_qla2xxx_ops = {
        .module                         = THIS_MODULE,
        .name                           = "qla2xxx",
        .node_acl_size                  = sizeof(struct tcm_qla2xxx_nacl),
+       /*
+        * XXX: Limit assumes single page per scatter-gather-list entry.
+        * Current maximum is ~4.9 MB per se_cmd->t_data_sg with PAGE_SIZE=4096
+        */
+       .max_data_sg_nents              = 1200,
        .get_fabric_name                = tcm_qla2xxx_get_fabric_name,
        .tpg_get_wwn                    = tcm_qla2xxx_get_fabric_wwn,
        .tpg_get_tag                    = tcm_qla2xxx_get_tag,
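The "~4.9 MB" figure in the new comment follows directly from the limit: with one 4096-byte page per scatter-gather entry, 1200 entries cover 1200 × 4096 = 4,915,200 bytes ≈ 4.9 MB per se_cmd->t_data_sg.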
@@ -1958,7 +1957,7 @@ static void __exit tcm_qla2xxx_exit(void)
        tcm_qla2xxx_deregister_configfs();
 }
 
-MODULE_DESCRIPTION("TCM QLA2XXX series NPIV enabled fabric driver");
+MODULE_DESCRIPTION("TCM QLA24XX+ series NPIV enabled fabric driver");
 MODULE_LICENSE("GPL");
 module_init(tcm_qla2xxx_init);
 module_exit(tcm_qla2xxx_exit);
index 2ff092252b7624b0f62938b8c59d78dce1d5b616..c126966130ab792b5dac44ab4a96f4c272debe66 100644 (file)
@@ -5,6 +5,8 @@
 #include <linux/bug.h>
 #include <linux/kernel.h>
 #include <linux/string.h>
+#include <linux/errno.h>
+#include <asm/unaligned.h>
 #include <scsi/scsi_common.h>
 
 /* NB: These are exposed through /proc/scsi/scsi and form part of the ABI.
@@ -176,3 +178,110 @@ bool scsi_normalize_sense(const u8 *sense_buffer, int sb_len,
        return true;
 }
 EXPORT_SYMBOL(scsi_normalize_sense);
+
+/**
+ * scsi_sense_desc_find - search for a given descriptor type in descriptor sense data format.
+ * @sense_buffer:      byte array of descriptor format sense data
+ * @sb_len:            number of valid bytes in sense_buffer
+ * @desc_type:         value of descriptor type to find
+ *                     (e.g. 0 -> information)
+ *
+ * Notes:
+ *     only valid when sense data is in descriptor format
+ *
+ * Return value:
+ *     pointer to start of (first) descriptor if found else NULL
+ */
+const u8 * scsi_sense_desc_find(const u8 * sense_buffer, int sb_len,
+                               int desc_type)
+{
+       int add_sen_len, add_len, desc_len, k;
+       const u8 * descp;
+
+       if ((sb_len < 8) || (0 == (add_sen_len = sense_buffer[7])))
+               return NULL;
+       if ((sense_buffer[0] < 0x72) || (sense_buffer[0] > 0x73))
+               return NULL;
+       add_sen_len = (add_sen_len < (sb_len - 8)) ?
+                       add_sen_len : (sb_len - 8);
+       descp = &sense_buffer[8];
+       for (desc_len = 0, k = 0; k < add_sen_len; k += desc_len) {
+               descp += desc_len;
+               add_len = (k < (add_sen_len - 1)) ? descp[1]: -1;
+               desc_len = add_len + 2;
+               if (descp[0] == desc_type)
+                       return descp;
+               if (add_len < 0) // short descriptor ??
+                       break;
+       }
+       return NULL;
+}
+EXPORT_SYMBOL(scsi_sense_desc_find);
+
+/**
+ * scsi_build_sense_buffer - build sense data in a buffer
+ * @desc:      Sense format (non zero == descriptor format,
+ *              0 == fixed format)
+ * @buf:       Where to build sense data
+ * @key:       Sense key
+ * @asc:       Additional sense code
+ * @ascq:      Additional sense code qualifier
+ *
+ **/
+void scsi_build_sense_buffer(int desc, u8 *buf, u8 key, u8 asc, u8 ascq)
+{
+       if (desc) {
+               buf[0] = 0x72;  /* descriptor, current */
+               buf[1] = key;
+               buf[2] = asc;
+               buf[3] = ascq;
+               buf[7] = 0;
+       } else {
+               buf[0] = 0x70;  /* fixed, current */
+               buf[2] = key;
+               buf[7] = 0xa;
+               buf[12] = asc;
+               buf[13] = ascq;
+       }
+}
+EXPORT_SYMBOL(scsi_build_sense_buffer);
+
+/**
+ * scsi_set_sense_information - set the information field in a
+ *             formatted sense data buffer
+ * @buf:       Where to build sense data
+ * @buf_len:    buffer length
+ * @info:      64-bit information value to be set
+ *
+ * Return value:
+ *     0 on success or EINVAL for invalid sense buffer length
+ **/
+int scsi_set_sense_information(u8 *buf, int buf_len, u64 info)
+{
+       if ((buf[0] & 0x7f) == 0x72) {
+               u8 *ucp, len;
+
+               len = buf[7];
+               ucp = (char *)scsi_sense_desc_find(buf, len + 8, 0);
+               if (!ucp) {
+                       buf[7] = len + 0xc;
+                       ucp = buf + 8 + len;
+               }
+
+               if (buf_len < len + 0xc)
+                       /* Not enough room for info */
+                       return -EINVAL;
+
+               ucp[0] = 0;
+               ucp[1] = 0xa;
+               ucp[2] = 0x80; /* Valid bit */
+               ucp[3] = 0;
+               put_unaligned_be64(info, &ucp[4]);
+       } else if ((buf[0] & 0x7f) == 0x70) {
+               buf[0] |= 0x80;
+               put_unaligned_be64(info, &buf[3]);
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL(scsi_set_sense_information);
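Taken together, the helpers above let a driver build a fixed- or descriptor-format sense buffer and then attach a 64-bit information field to it. A short usage sketch, with the sense key/ASC/ASCQ and the LBA chosen purely for illustration:

	/* Sketch only: descriptor-format MEDIUM ERROR (unrecovered read error)
	 * carrying the failing LBA in the information descriptor. */
	static int
	example_fill_sense(u8 *sense, int sense_len)
	{
		scsi_build_sense_buffer(1 /* descriptor format */, sense,
		    MEDIUM_ERROR, 0x11, 0x00);
		return scsi_set_sense_information(sense, sense_len, 0x12345678ULL);
	}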
index 30268bb2ddb6a3e78dc114606ed8c0b4b615d54a..dfcc45bb03b1f30e808e611a567c2f76cc734d3c 100644 (file)
@@ -25,6 +25,9 @@
  *        module options to "modprobe scsi_debug num_tgts=2" [20021221]
  */
 
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
+
 #include <linux/module.h>
 
 #include <linux/kernel.h>
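With pr_fmt() defined as above, every pr_err()/pr_warn()/pr_info() in this file is automatically prefixed with the module name and the calling function, which is why the later hunks in this file drop their explicit "%s", __func__ arguments. An illustration of the expansion:

	/* pr_err("wild qa_indx=%d\n", qa_indx);
	 * expands, via pr_fmt(), to:
	 *   printk(KERN_ERR KBUILD_MODNAME ":%s: " "wild qa_indx=%d\n",
	 *          __func__, qa_indx);
	 * and prints e.g. "scsi_debug:sdebug_q_cmd_complete: wild qa_indx=-1".
	 */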
@@ -201,7 +204,6 @@ static const char *scsi_debug_version_date = "20141022";
 /* If REPORT LUNS has luns >= 256 it can choose "flat space" (value 1)
  * or "peripheral device" addressing (value 0) */
 #define SAM2_LUN_ADDRESS_METHOD 0
-#define SAM2_WLUN_REPORT_LUNS 0xc101
 
 /* SCSI_DEBUG_CANQUEUE is the maximum number of commands that can be queued
  * (for response) at one time. Can be reduced by max_queue option. Command
@@ -698,7 +700,7 @@ static void sdebug_max_tgts_luns(void)
                else
                        hpnt->max_id = scsi_debug_num_tgts;
                /* scsi_debug_max_luns; */
-               hpnt->max_lun = SAM2_WLUN_REPORT_LUNS;
+               hpnt->max_lun = SCSI_W_LUN_REPORT_LUNS + 1;
        }
        spin_unlock(&sdebug_host_list_lock);
 }
@@ -1288,7 +1290,7 @@ static int resp_inquiry(struct scsi_cmnd *scp, struct sdebug_dev_info *devip)
        arr = kzalloc(SDEBUG_MAX_INQ_ARR_SZ, GFP_ATOMIC);
        if (! arr)
                return DID_REQUEUE << 16;
-       have_wlun = (scp->device->lun == SAM2_WLUN_REPORT_LUNS);
+       have_wlun = (scp->device->lun == SCSI_W_LUN_REPORT_LUNS);
        if (have_wlun)
                pq_pdt = 0x1e;  /* present, wlun */
        else if (scsi_debug_no_lun_0 && (0 == devip->lun))
@@ -1427,12 +1429,11 @@ static int resp_requests(struct scsi_cmnd * scp,
        unsigned char * sbuff;
        unsigned char *cmd = scp->cmnd;
        unsigned char arr[SCSI_SENSE_BUFFERSIZE];
-       bool dsense, want_dsense;
+       bool dsense;
        int len = 18;
 
        memset(arr, 0, sizeof(arr));
        dsense = !!(cmd[1] & 1);
-       want_dsense = dsense || scsi_debug_dsense;
        sbuff = scp->sense_buffer;
        if ((iec_m_pg[2] & 0x4) && (6 == (iec_m_pg[3] & 0xf))) {
                if (dsense) {
@@ -2446,8 +2447,7 @@ static int dif_verify(struct sd_dif_tuple *sdt, const void *data,
        __be16 csum = dif_compute_csum(data, scsi_debug_sector_size);
 
        if (sdt->guard_tag != csum) {
-               pr_err("%s: GUARD check failed on sector %lu rcvd 0x%04x, data 0x%04x\n",
-                       __func__,
+               pr_err("GUARD check failed on sector %lu rcvd 0x%04x, data 0x%04x\n",
                        (unsigned long)sector,
                        be16_to_cpu(sdt->guard_tag),
                        be16_to_cpu(csum));
@@ -2455,14 +2455,14 @@ static int dif_verify(struct sd_dif_tuple *sdt, const void *data,
        }
        if (scsi_debug_dif == SD_DIF_TYPE1_PROTECTION &&
            be32_to_cpu(sdt->ref_tag) != (sector & 0xffffffff)) {
-               pr_err("%s: REF check failed on sector %lu\n",
-                       __func__, (unsigned long)sector);
+               pr_err("REF check failed on sector %lu\n",
+                       (unsigned long)sector);
                return 0x03;
        }
        if (scsi_debug_dif == SD_DIF_TYPE2_PROTECTION &&
            be32_to_cpu(sdt->ref_tag) != ei_lba) {
-               pr_err("%s: REF check failed on sector %lu\n",
-                       __func__, (unsigned long)sector);
+               pr_err("REF check failed on sector %lu\n",
+                       (unsigned long)sector);
                return 0x03;
        }
        return 0;
@@ -2680,7 +2680,7 @@ resp_read_dt0(struct scsi_cmnd *scp, struct sdebug_dev_info *devip)
        return 0;
 }
 
-void dump_sector(unsigned char *buf, int len)
+static void dump_sector(unsigned char *buf, int len)
 {
        int i, j, n;
 
@@ -3365,8 +3365,8 @@ static int resp_report_luns(struct scsi_cmnd * scp,
                one_lun[i].scsi_lun[1] = lun & 0xff;
        }
        if (want_wlun) {
-               one_lun[i].scsi_lun[0] = (SAM2_WLUN_REPORT_LUNS >> 8) & 0xff;
-               one_lun[i].scsi_lun[1] = SAM2_WLUN_REPORT_LUNS & 0xff;
+               one_lun[i].scsi_lun[0] = (SCSI_W_LUN_REPORT_LUNS >> 8) & 0xff;
+               one_lun[i].scsi_lun[1] = SCSI_W_LUN_REPORT_LUNS & 0xff;
                i++;
        }
        alloc_len = (unsigned char *)(one_lun + i) - arr;
@@ -3449,7 +3449,7 @@ static void sdebug_q_cmd_complete(unsigned long indx)
        atomic_inc(&sdebug_completions);
        qa_indx = indx;
        if ((qa_indx < 0) || (qa_indx >= SCSI_DEBUG_CANQUEUE)) {
-               pr_err("%s: wild qa_indx=%d\n", __func__, qa_indx);
+               pr_err("wild qa_indx=%d\n", qa_indx);
                return;
        }
        spin_lock_irqsave(&queued_arr_lock, iflags);
@@ -3457,21 +3457,21 @@ static void sdebug_q_cmd_complete(unsigned long indx)
        scp = sqcp->a_cmnd;
        if (NULL == scp) {
                spin_unlock_irqrestore(&queued_arr_lock, iflags);
-               pr_err("%s: scp is NULL\n", __func__);
+               pr_err("scp is NULL\n");
                return;
        }
        devip = (struct sdebug_dev_info *)scp->device->hostdata;
        if (devip)
                atomic_dec(&devip->num_in_q);
        else
-               pr_err("%s: devip=NULL\n", __func__);
+               pr_err("devip=NULL\n");
        if (atomic_read(&retired_max_queue) > 0)
                retiring = 1;
 
        sqcp->a_cmnd = NULL;
        if (!test_and_clear_bit(qa_indx, queued_in_use_bm)) {
                spin_unlock_irqrestore(&queued_arr_lock, iflags);
-               pr_err("%s: Unexpected completion\n", __func__);
+               pr_err("Unexpected completion\n");
                return;
        }
 
@@ -3481,7 +3481,7 @@ static void sdebug_q_cmd_complete(unsigned long indx)
                retval = atomic_read(&retired_max_queue);
                if (qa_indx >= retval) {
                        spin_unlock_irqrestore(&queued_arr_lock, iflags);
-                       pr_err("%s: index %d too large\n", __func__, retval);
+                       pr_err("index %d too large\n", retval);
                        return;
                }
                k = find_last_bit(queued_in_use_bm, retval);
@@ -3509,7 +3509,7 @@ sdebug_q_cmd_hrt_complete(struct hrtimer *timer)
        atomic_inc(&sdebug_completions);
        qa_indx = sd_hrtp->qa_indx;
        if ((qa_indx < 0) || (qa_indx >= SCSI_DEBUG_CANQUEUE)) {
-               pr_err("%s: wild qa_indx=%d\n", __func__, qa_indx);
+               pr_err("wild qa_indx=%d\n", qa_indx);
                goto the_end;
        }
        spin_lock_irqsave(&queued_arr_lock, iflags);
@@ -3517,21 +3517,21 @@ sdebug_q_cmd_hrt_complete(struct hrtimer *timer)
        scp = sqcp->a_cmnd;
        if (NULL == scp) {
                spin_unlock_irqrestore(&queued_arr_lock, iflags);
-               pr_err("%s: scp is NULL\n", __func__);
+               pr_err("scp is NULL\n");
                goto the_end;
        }
        devip = (struct sdebug_dev_info *)scp->device->hostdata;
        if (devip)
                atomic_dec(&devip->num_in_q);
        else
-               pr_err("%s: devip=NULL\n", __func__);
+               pr_err("devip=NULL\n");
        if (atomic_read(&retired_max_queue) > 0)
                retiring = 1;
 
        sqcp->a_cmnd = NULL;
        if (!test_and_clear_bit(qa_indx, queued_in_use_bm)) {
                spin_unlock_irqrestore(&queued_arr_lock, iflags);
-               pr_err("%s: Unexpected completion\n", __func__);
+               pr_err("Unexpected completion\n");
                goto the_end;
        }
 
@@ -3541,7 +3541,7 @@ sdebug_q_cmd_hrt_complete(struct hrtimer *timer)
                retval = atomic_read(&retired_max_queue);
                if (qa_indx >= retval) {
                        spin_unlock_irqrestore(&queued_arr_lock, iflags);
-                       pr_err("%s: index %d too large\n", __func__, retval);
+                       pr_err("index %d too large\n", retval);
                        goto the_end;
                }
                k = find_last_bit(queued_in_use_bm, retval);
@@ -3580,7 +3580,7 @@ static struct sdebug_dev_info * devInfoReg(struct scsi_device * sdev)
                return devip;
        sdbg_host = *(struct sdebug_host_info **)shost_priv(sdev->host);
        if (!sdbg_host) {
-               pr_err("%s: Host info NULL\n", __func__);
+               pr_err("Host info NULL\n");
                return NULL;
         }
        list_for_each_entry(devip, &sdbg_host->dev_info_list, dev_list) {
@@ -3596,8 +3596,7 @@ static struct sdebug_dev_info * devInfoReg(struct scsi_device * sdev)
        if (!open_devip) { /* try and make a new one */
                open_devip = sdebug_device_create(sdbg_host, GFP_ATOMIC);
                if (!open_devip) {
-                       printk(KERN_ERR "%s: out of memory at line %d\n",
-                               __func__, __LINE__);
+                       pr_err("out of memory at line %d\n", __LINE__);
                        return NULL;
                }
        }
@@ -3615,7 +3614,7 @@ static struct sdebug_dev_info * devInfoReg(struct scsi_device * sdev)
 static int scsi_debug_slave_alloc(struct scsi_device *sdp)
 {
        if (SCSI_DEBUG_OPT_NOISE & scsi_debug_opts)
-               printk(KERN_INFO "scsi_debug: slave_alloc <%u %u %u %llu>\n",
+               pr_info("slave_alloc <%u %u %u %llu>\n",
                       sdp->host->host_no, sdp->channel, sdp->id, sdp->lun);
        queue_flag_set_unlocked(QUEUE_FLAG_BIDI, sdp->request_queue);
        return 0;
@@ -3626,7 +3625,7 @@ static int scsi_debug_slave_configure(struct scsi_device *sdp)
        struct sdebug_dev_info *devip;
 
        if (SCSI_DEBUG_OPT_NOISE & scsi_debug_opts)
-               printk(KERN_INFO "scsi_debug: slave_configure <%u %u %u %llu>\n",
+               pr_info("slave_configure <%u %u %u %llu>\n",
                       sdp->host->host_no, sdp->channel, sdp->id, sdp->lun);
        if (sdp->host->max_cmd_len != SCSI_DEBUG_MAX_CMD_LEN)
                sdp->host->max_cmd_len = SCSI_DEBUG_MAX_CMD_LEN;
@@ -3646,7 +3645,7 @@ static void scsi_debug_slave_destroy(struct scsi_device *sdp)
                (struct sdebug_dev_info *)sdp->hostdata;
 
        if (SCSI_DEBUG_OPT_NOISE & scsi_debug_opts)
-               printk(KERN_INFO "scsi_debug: slave_destroy <%u %u %u %llu>\n",
+               pr_info("slave_destroy <%u %u %u %llu>\n",
                       sdp->host->host_no, sdp->channel, sdp->id, sdp->lun);
        if (devip) {
                /* make this slot available for re-use */
@@ -3897,8 +3896,7 @@ static void __init sdebug_build_parts(unsigned char *ramp,
                return;
        if (scsi_debug_num_parts > SDEBUG_MAX_PARTS) {
                scsi_debug_num_parts = SDEBUG_MAX_PARTS;
-               pr_warn("%s: reducing partitions to %d\n", __func__,
-                       SDEBUG_MAX_PARTS);
+               pr_warn("reducing partitions to %d\n", SDEBUG_MAX_PARTS);
        }
        num_sectors = (int)sdebug_store_sectors;
        sectors_per_part = (num_sectors - sdebug_sectors_per)
@@ -3942,14 +3940,20 @@ schedule_resp(struct scsi_cmnd *cmnd, struct sdebug_dev_info *devip,
        unsigned long iflags;
        int k, num_in_q, qdepth, inject;
        struct sdebug_queued_cmd *sqcp = NULL;
-       struct scsi_device *sdp = cmnd->device;
+       struct scsi_device *sdp;
+
+       /* this should never happen */
+       if (WARN_ON(!cmnd))
+               return SCSI_MLQUEUE_HOST_BUSY;
 
-       if (NULL == cmnd || NULL == devip) {
-               pr_warn("%s: called with NULL cmnd or devip pointer\n",
-                       __func__);
+       if (NULL == devip) {
+               pr_warn("called devip == NULL\n");
                /* no particularly good error to report back */
                return SCSI_MLQUEUE_HOST_BUSY;
        }
+
+       sdp = cmnd->device;
+
        if ((scsi_result) && (SCSI_DEBUG_OPT_NOISE & scsi_debug_opts))
                sdev_printk(KERN_INFO, sdp, "%s: non-zero result=0x%x\n",
                            __func__, scsi_result);
@@ -4383,8 +4387,7 @@ static ssize_t fake_rw_store(struct device_driver *ddp, const char *buf,
 
                                fake_storep = vmalloc(sz);
                                if (NULL == fake_storep) {
-                                       pr_err("%s: out of memory, 9\n",
-                                              __func__);
+                                       pr_err("out of memory, 9\n");
                                        return -ENOMEM;
                                }
                                memset(fake_storep, 0, sz);
@@ -4784,8 +4787,7 @@ static int __init scsi_debug_init(void)
        atomic_set(&retired_max_queue, 0);
 
        if (scsi_debug_ndelay >= 1000000000) {
-               pr_warn("%s: ndelay must be less than 1 second, ignored\n",
-                       __func__);
+               pr_warn("ndelay must be less than 1 second, ignored\n");
                scsi_debug_ndelay = 0;
        } else if (scsi_debug_ndelay > 0)
                scsi_debug_delay = DELAY_OVERRIDDEN;
@@ -4797,8 +4799,7 @@ static int __init scsi_debug_init(void)
        case 4096:
                break;
        default:
-               pr_err("%s: invalid sector_size %d\n", __func__,
-                      scsi_debug_sector_size);
+               pr_err("invalid sector_size %d\n", scsi_debug_sector_size);
                return -EINVAL;
        }
 
@@ -4811,29 +4812,28 @@ static int __init scsi_debug_init(void)
                break;
 
        default:
-               pr_err("%s: dif must be 0, 1, 2 or 3\n", __func__);
+               pr_err("dif must be 0, 1, 2 or 3\n");
                return -EINVAL;
        }
 
        if (scsi_debug_guard > 1) {
-               pr_err("%s: guard must be 0 or 1\n", __func__);
+               pr_err("guard must be 0 or 1\n");
                return -EINVAL;
        }
 
        if (scsi_debug_ato > 1) {
-               pr_err("%s: ato must be 0 or 1\n", __func__);
+               pr_err("ato must be 0 or 1\n");
                return -EINVAL;
        }
 
        if (scsi_debug_physblk_exp > 15) {
-               pr_err("%s: invalid physblk_exp %u\n", __func__,
-                      scsi_debug_physblk_exp);
+               pr_err("invalid physblk_exp %u\n", scsi_debug_physblk_exp);
                return -EINVAL;
        }
 
        if (scsi_debug_lowest_aligned > 0x3fff) {
-               pr_err("%s: lowest_aligned too big: %u\n", __func__,
-                      scsi_debug_lowest_aligned);
+               pr_err("lowest_aligned too big: %u\n",
+                       scsi_debug_lowest_aligned);
                return -EINVAL;
        }
 
@@ -4863,7 +4863,7 @@ static int __init scsi_debug_init(void)
        if (0 == scsi_debug_fake_rw) {
                fake_storep = vmalloc(sz);
                if (NULL == fake_storep) {
-                       pr_err("%s: out of memory, 1\n", __func__);
+                       pr_err("out of memory, 1\n");
                        return -ENOMEM;
                }
                memset(fake_storep, 0, sz);
@@ -4877,11 +4877,10 @@ static int __init scsi_debug_init(void)
                dif_size = sdebug_store_sectors * sizeof(struct sd_dif_tuple);
                dif_storep = vmalloc(dif_size);
 
-               pr_err("%s: dif_storep %u bytes @ %p\n", __func__, dif_size,
-                       dif_storep);
+               pr_err("dif_storep %u bytes @ %p\n", dif_size, dif_storep);
 
                if (dif_storep == NULL) {
-                       pr_err("%s: out of mem. (DIX)\n", __func__);
+                       pr_err("out of mem. (DIX)\n");
                        ret = -ENOMEM;
                        goto free_vm;
                }
@@ -4903,18 +4902,17 @@ static int __init scsi_debug_init(void)
                if (scsi_debug_unmap_alignment &&
                    scsi_debug_unmap_granularity <=
                    scsi_debug_unmap_alignment) {
-                       pr_err("%s: ERR: unmap_granularity <= unmap_alignment\n",
-                              __func__);
+                       pr_err("ERR: unmap_granularity <= unmap_alignment\n");
                        return -EINVAL;
                }
 
                map_size = lba_to_map_index(sdebug_store_sectors - 1) + 1;
                map_storep = vmalloc(BITS_TO_LONGS(map_size) * sizeof(long));
 
-               pr_info("%s: %lu provisioning blocks\n", __func__, map_size);
+               pr_info("%lu provisioning blocks\n", map_size);
 
                if (map_storep == NULL) {
-                       pr_err("%s: out of mem. (MAP)\n", __func__);
+                       pr_err("out of mem. (MAP)\n");
                        ret = -ENOMEM;
                        goto free_vm;
                }
@@ -4928,18 +4926,18 @@ static int __init scsi_debug_init(void)
 
        pseudo_primary = root_device_register("pseudo_0");
        if (IS_ERR(pseudo_primary)) {
-               pr_warn("%s: root_device_register() error\n", __func__);
+               pr_warn("root_device_register() error\n");
                ret = PTR_ERR(pseudo_primary);
                goto free_vm;
        }
        ret = bus_register(&pseudo_lld_bus);
        if (ret < 0) {
-               pr_warn("%s: bus_register error: %d\n", __func__, ret);
+               pr_warn("bus_register error: %d\n", ret);
                goto dev_unreg;
        }
        ret = driver_register(&sdebug_driverfs_driver);
        if (ret < 0) {
-               pr_warn("%s: driver_register error: %d\n", __func__, ret);
+               pr_warn("driver_register error: %d\n", ret);
                goto bus_unreg;
        }
 
@@ -4948,16 +4946,14 @@ static int __init scsi_debug_init(void)
 
         for (k = 0; k < host_to_add; k++) {
                 if (sdebug_add_adapter()) {
-                       pr_err("%s: sdebug_add_adapter failed k=%d\n",
-                               __func__, k);
+                       pr_err("sdebug_add_adapter failed k=%d\n", k);
                         break;
                 }
         }
 
-       if (SCSI_DEBUG_OPT_NOISE & scsi_debug_opts) {
-               pr_info("%s: built %d host(s)\n", __func__,
-                       scsi_debug_add_host);
-       }
+       if (SCSI_DEBUG_OPT_NOISE & scsi_debug_opts)
+               pr_info("built %d host(s)\n", scsi_debug_add_host);
+
        return 0;
 
 bus_unreg:
@@ -4965,10 +4961,8 @@ bus_unreg:
 dev_unreg:
        root_device_unregister(pseudo_primary);
 free_vm:
-       if (map_storep)
-               vfree(map_storep);
-       if (dif_storep)
-               vfree(dif_storep);
+       vfree(map_storep);
+       vfree(dif_storep);
        vfree(fake_storep);
 
        return ret;
@@ -4986,9 +4980,7 @@ static void __exit scsi_debug_exit(void)
        bus_unregister(&pseudo_lld_bus);
        root_device_unregister(pseudo_primary);
 
-       if (dif_storep)
-               vfree(dif_storep);
-
+       vfree(dif_storep);
        vfree(fake_storep);
 }
 
@@ -5012,8 +5004,7 @@ static int sdebug_add_adapter(void)
 
         sdbg_host = kzalloc(sizeof(*sdbg_host),GFP_KERNEL);
         if (NULL == sdbg_host) {
-                printk(KERN_ERR "%s: out of memory at line %d\n",
-                       __func__, __LINE__);
+               pr_err("out of memory at line %d\n", __LINE__);
                 return -ENOMEM;
         }
 
@@ -5023,8 +5014,7 @@ static int sdebug_add_adapter(void)
         for (k = 0; k < devs_per_host; k++) {
                sdbg_devinfo = sdebug_device_create(sdbg_host, GFP_KERNEL);
                if (!sdbg_devinfo) {
-                        printk(KERN_ERR "%s: out of memory at line %d\n",
-                               __func__, __LINE__);
+                       pr_err("out of memory at line %d\n", __LINE__);
                         error = -ENOMEM;
                        goto clean;
                 }
@@ -5178,7 +5168,7 @@ scsi_debug_queuecommand(struct scsi_cmnd *scp)
                }
                sdev_printk(KERN_INFO, sdp, "%s: cmd %s\n", my_name, b);
        }
-       has_wlun_rl = (sdp->lun == SAM2_WLUN_REPORT_LUNS);
+       has_wlun_rl = (sdp->lun == SCSI_W_LUN_REPORT_LUNS);
        if ((sdp->lun >= scsi_debug_max_luns) && !has_wlun_rl)
                return schedule_resp(scp, NULL, errsts_no_connect, 0);
 
@@ -5338,7 +5328,7 @@ static int sdebug_driver_probe(struct device * dev)
                sdebug_driver_template.use_clustering = ENABLE_CLUSTERING;
        hpnt = scsi_host_alloc(&sdebug_driver_template, sizeof(sdbg_host));
        if (NULL == hpnt) {
-               pr_err("%s: scsi_host_alloc failed\n", __func__);
+               pr_err("scsi_host_alloc failed\n");
                error = -ENODEV;
                return error;
        }
@@ -5349,7 +5339,8 @@ static int sdebug_driver_probe(struct device * dev)
                hpnt->max_id = scsi_debug_num_tgts + 1;
        else
                hpnt->max_id = scsi_debug_num_tgts;
-       hpnt->max_lun = SAM2_WLUN_REPORT_LUNS;  /* = scsi_debug_max_luns; */
+       /* = scsi_debug_max_luns; */
+       hpnt->max_lun = SCSI_W_LUN_REPORT_LUNS + 1;
 
        host_prot = 0;
 
@@ -5381,7 +5372,7 @@ static int sdebug_driver_probe(struct device * dev)
 
        scsi_host_set_prot(hpnt, host_prot);
 
-       printk(KERN_INFO "scsi_debug: host protection%s%s%s%s%s%s%s\n",
+       pr_info("host protection%s%s%s%s%s%s%s\n",
               (host_prot & SHOST_DIF_TYPE1_PROTECTION) ? " DIF1" : "",
               (host_prot & SHOST_DIF_TYPE2_PROTECTION) ? " DIF2" : "",
               (host_prot & SHOST_DIF_TYPE3_PROTECTION) ? " DIF3" : "",
@@ -5409,7 +5400,7 @@ static int sdebug_driver_probe(struct device * dev)
 
         error = scsi_add_host(hpnt, &sdbg_host->dev);
         if (error) {
-                printk(KERN_ERR "%s: scsi_add_host failed\n", __func__);
+               pr_err("scsi_add_host failed\n");
                 error = -ENODEV;
                scsi_host_put(hpnt);
         } else
@@ -5426,8 +5417,7 @@ static int sdebug_driver_remove(struct device * dev)
        sdbg_host = to_sdebug_host(dev);
 
        if (!sdbg_host) {
-               printk(KERN_ERR "%s: Unable to locate host info\n",
-                      __func__);
+               pr_err("Unable to locate host info\n");
                return -ENODEV;
        }
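
The scsi_debug.c hunks above strip the hand-written "%s: ", __func__ and "scsi_debug: " prefixes from the pr_err()/pr_info()/pr_warn() calls. The messages stay attributable only because the driver defines pr_fmt() near the top of the file, above this excerpt; the following is an assumed sketch of that convention, not the patch's literal text:

#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__

#include <linux/printk.h>

/* With the macro above in effect,
 *         pr_err("wild qa_indx=%d\n", qa_indx);
 * expands to roughly
 *         printk(KERN_ERR "scsi_debug:%s: wild qa_indx=%d\n", __func__, qa_indx);
 * so every call site can drop its explicit prefix without losing context. */
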
 
diff --git a/drivers/scsi/scsi_dh.c b/drivers/scsi/scsi_dh.c
new file mode 100644 (file)
index 0000000..edb044a
--- /dev/null
@@ -0,0 +1,437 @@
+/*
+ * SCSI device handler infrastructure.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2007
+ *      Authors:
+ *               Chandra Seetharaman <sekharan@us.ibm.com>
+ *               Mike Anderson <andmike@linux.vnet.ibm.com>
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <scsi/scsi_dh.h>
+#include "scsi_priv.h"
+
+static DEFINE_SPINLOCK(list_lock);
+static LIST_HEAD(scsi_dh_list);
+
+struct scsi_dh_blist {
+       const char *vendor;
+       const char *model;
+       const char *driver;
+};
+
+static const struct scsi_dh_blist scsi_dh_blist[] = {
+       {"DGC", "RAID",                 "clariion" },
+       {"DGC", "DISK",                 "clariion" },
+       {"DGC", "VRAID",                "clariion" },
+
+       {"COMPAQ", "MSA1000 VOLUME",    "hp_sw" },
+       {"COMPAQ", "HSV110",            "hp_sw" },
+       {"HP", "HSV100",                "hp_sw"},
+       {"DEC", "HSG80",                "hp_sw"},
+
+       {"IBM", "1722",                 "rdac", },
+       {"IBM", "1724",                 "rdac", },
+       {"IBM", "1726",                 "rdac", },
+       {"IBM", "1742",                 "rdac", },
+       {"IBM", "1745",                 "rdac", },
+       {"IBM", "1746",                 "rdac", },
+       {"IBM", "1813",                 "rdac", },
+       {"IBM", "1814",                 "rdac", },
+       {"IBM", "1815",                 "rdac", },
+       {"IBM", "1818",                 "rdac", },
+       {"IBM", "3526",                 "rdac", },
+       {"SGI", "TP9",                  "rdac", },
+       {"SGI", "IS",                   "rdac", },
+       {"STK", "OPENstorage D280",     "rdac", },
+       {"STK", "FLEXLINE 380",         "rdac", },
+       {"SUN", "CSM",                  "rdac", },
+       {"SUN", "LCSM100",              "rdac", },
+       {"SUN", "STK6580_6780",         "rdac", },
+       {"SUN", "SUN_6180",             "rdac", },
+       {"SUN", "ArrayStorage",         "rdac", },
+       {"DELL", "MD3",                 "rdac", },
+       {"NETAPP", "INF-01-00",         "rdac", },
+       {"LSI", "INF-01-00",            "rdac", },
+       {"ENGENIO", "INF-01-00",        "rdac", },
+       {NULL, NULL,                    NULL },
+};
+
+static const char *
+scsi_dh_find_driver(struct scsi_device *sdev)
+{
+       const struct scsi_dh_blist *b;
+
+       if (scsi_device_tpgs(sdev))
+               return "alua";
+
+       for (b = scsi_dh_blist; b->vendor; b++) {
+               if (!strncmp(sdev->vendor, b->vendor, strlen(b->vendor)) &&
+                   !strncmp(sdev->model, b->model, strlen(b->model))) {
+                       return b->driver;
+               }
+       }
+       return NULL;
+}
+
+
+static struct scsi_device_handler *__scsi_dh_lookup(const char *name)
+{
+       struct scsi_device_handler *tmp, *found = NULL;
+
+       spin_lock(&list_lock);
+       list_for_each_entry(tmp, &scsi_dh_list, list) {
+               if (!strncmp(tmp->name, name, strlen(tmp->name))) {
+                       found = tmp;
+                       break;
+               }
+       }
+       spin_unlock(&list_lock);
+       return found;
+}
+
+static struct scsi_device_handler *scsi_dh_lookup(const char *name)
+{
+       struct scsi_device_handler *dh;
+
+       dh = __scsi_dh_lookup(name);
+       if (!dh) {
+               request_module(name);
+               dh = __scsi_dh_lookup(name);
+       }
+
+       return dh;
+}
+
+/*
+ * scsi_dh_handler_attach - Attach a device handler to a device
+ * @sdev - SCSI device the device handler should attach to
+ * @scsi_dh - The device handler to attach
+ */
+static int scsi_dh_handler_attach(struct scsi_device *sdev,
+                                 struct scsi_device_handler *scsi_dh)
+{
+       int error;
+
+       if (!try_module_get(scsi_dh->module))
+               return -EINVAL;
+
+       error = scsi_dh->attach(sdev);
+       if (error) {
+               sdev_printk(KERN_ERR, sdev, "%s: Attach failed (%d)\n",
+                           scsi_dh->name, error);
+               module_put(scsi_dh->module);
+       } else
+               sdev->handler = scsi_dh;
+
+       return error;
+}
+
+/*
+ * scsi_dh_handler_detach - Detach a device handler from a device
+ * @sdev - SCSI device the device handler should be detached from
+ */
+static void scsi_dh_handler_detach(struct scsi_device *sdev)
+{
+       sdev->handler->detach(sdev);
+       sdev_printk(KERN_NOTICE, sdev, "%s: Detached\n", sdev->handler->name);
+       module_put(sdev->handler->module);
+}
+
+/*
+ * Functions for sysfs attribute 'dh_state'
+ */
+static ssize_t
+store_dh_state(struct device *dev, struct device_attribute *attr,
+              const char *buf, size_t count)
+{
+       struct scsi_device *sdev = to_scsi_device(dev);
+       struct scsi_device_handler *scsi_dh;
+       int err = -EINVAL;
+
+       if (sdev->sdev_state == SDEV_CANCEL ||
+           sdev->sdev_state == SDEV_DEL)
+               return -ENODEV;
+
+       if (!sdev->handler) {
+               /*
+                * Attach to a device handler
+                */
+               scsi_dh = scsi_dh_lookup(buf);
+               if (!scsi_dh)
+                       return err;
+               err = scsi_dh_handler_attach(sdev, scsi_dh);
+       } else {
+               if (!strncmp(buf, "detach", 6)) {
+                       /*
+                        * Detach from a device handler
+                        */
+                       sdev_printk(KERN_WARNING, sdev,
+                                   "can't detach handler %s.\n",
+                                   sdev->handler->name);
+                       err = -EINVAL;
+               } else if (!strncmp(buf, "activate", 8)) {
+                       /*
+                        * Activate a device handler
+                        */
+                       if (sdev->handler->activate)
+                               err = sdev->handler->activate(sdev, NULL, NULL);
+                       else
+                               err = 0;
+               }
+       }
+
+       return err < 0 ? err : count;
+}
+
+static ssize_t
+show_dh_state(struct device *dev, struct device_attribute *attr, char *buf)
+{
+       struct scsi_device *sdev = to_scsi_device(dev);
+
+       if (!sdev->handler)
+               return snprintf(buf, 20, "detached\n");
+
+       return snprintf(buf, 20, "%s\n", sdev->handler->name);
+}
+
+static struct device_attribute scsi_dh_state_attr =
+       __ATTR(dh_state, S_IRUGO | S_IWUSR, show_dh_state,
+              store_dh_state);
+
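
The dh_state attribute defined above is the user-visible control point: writing a handler name attaches that handler, writing "activate" re-runs path activation, and "detach" is refused once a handler is bound. A hedged userspace sketch (the 2:0:0:0 SCSI address is purely illustrative):

#include <stdio.h>

int main(void)
{
	/* Hypothetical address; pick a real one from /sys/bus/scsi/devices. */
	const char *path = "/sys/bus/scsi/devices/2:0:0:0/dh_state";
	char state[32] = "";
	FILE *f;

	f = fopen(path, "w");
	if (f) {
		fputs("alua", f);	/* attach the ALUA handler */
		fclose(f);
	}

	f = fopen(path, "r");
	if (f) {
		if (fgets(state, sizeof(state), f))
			printf("dh_state: %s", state);
		fclose(f);
	}
	return 0;
}
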
+int scsi_dh_add_device(struct scsi_device *sdev)
+{
+       struct scsi_device_handler *devinfo = NULL;
+       const char *drv;
+       int err;
+
+       err = device_create_file(&sdev->sdev_gendev, &scsi_dh_state_attr);
+       if (err)
+               return err;
+
+       drv = scsi_dh_find_driver(sdev);
+       if (drv)
+               devinfo = scsi_dh_lookup(drv);
+       if (devinfo)
+               err = scsi_dh_handler_attach(sdev, devinfo);
+       return err;
+}
+
+void scsi_dh_remove_device(struct scsi_device *sdev)
+{
+       if (sdev->handler)
+               scsi_dh_handler_detach(sdev);
+       device_remove_file(&sdev->sdev_gendev, &scsi_dh_state_attr);
+}
+
+/*
+ * scsi_register_device_handler - register a device handler personality
+ *      module.
+ * @scsi_dh - device handler to be registered.
+ *
+ * Returns 0 on success, -EBUSY if handler already registered.
+ */
+int scsi_register_device_handler(struct scsi_device_handler *scsi_dh)
+{
+       if (__scsi_dh_lookup(scsi_dh->name))
+               return -EBUSY;
+
+       if (!scsi_dh->attach || !scsi_dh->detach)
+               return -EINVAL;
+
+       spin_lock(&list_lock);
+       list_add(&scsi_dh->list, &scsi_dh_list);
+       spin_unlock(&list_lock);
+
+       printk(KERN_INFO "%s: device handler registered\n", scsi_dh->name);
+
+       return SCSI_DH_OK;
+}
+EXPORT_SYMBOL_GPL(scsi_register_device_handler);
+
+/*
+ * scsi_unregister_device_handler - unregister a device handler personality
+ *      module.
+ * @scsi_dh - device handler to be unregistered.
+ *
+ * Returns 0 on success, -ENODEV if handler not registered.
+ */
+int scsi_unregister_device_handler(struct scsi_device_handler *scsi_dh)
+{
+       if (!__scsi_dh_lookup(scsi_dh->name))
+               return -ENODEV;
+
+       spin_lock(&list_lock);
+       list_del(&scsi_dh->list);
+       spin_unlock(&list_lock);
+       printk(KERN_INFO "%s: device handler unregistered\n", scsi_dh->name);
+
+       return SCSI_DH_OK;
+}
+EXPORT_SYMBOL_GPL(scsi_unregister_device_handler);
+
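
scsi_register_device_handler() and scsi_unregister_device_handler() are the whole lifecycle a handler module drives against this infrastructure. A minimal sketch, assuming only the struct scsi_device_handler fields used elsewhere in this file (name, module, attach, detach); the handler name and messages are invented:

#include <linux/module.h>
#include <scsi/scsi_device.h>
#include <scsi/scsi_dh.h>

static int demo_dh_attach(struct scsi_device *sdev)
{
	sdev_printk(KERN_INFO, sdev, "demo_dh: attached\n");
	return 0;
}

static void demo_dh_detach(struct scsi_device *sdev)
{
	sdev_printk(KERN_NOTICE, sdev, "demo_dh: detaching\n");
}

static struct scsi_device_handler demo_dh = {
	.name   = "demo_dh",
	.module = THIS_MODULE,
	.attach = demo_dh_attach,
	.detach = demo_dh_detach,
};

static int __init demo_dh_init(void)
{
	return scsi_register_device_handler(&demo_dh);	/* SCSI_DH_OK == 0 */
}

static void __exit demo_dh_exit(void)
{
	scsi_unregister_device_handler(&demo_dh);
}

module_init(demo_dh_init);
module_exit(demo_dh_exit);
MODULE_LICENSE("GPL");
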
+static struct scsi_device *get_sdev_from_queue(struct request_queue *q)
+{
+       struct scsi_device *sdev;
+       unsigned long flags;
+
+       spin_lock_irqsave(q->queue_lock, flags);
+       sdev = q->queuedata;
+       if (!sdev || !get_device(&sdev->sdev_gendev))
+               sdev = NULL;
+       spin_unlock_irqrestore(q->queue_lock, flags);
+
+       return sdev;
+}
+
+/*
+ * scsi_dh_activate - activate the path associated with the scsi_device
+ *      corresponding to the given request queue.
+ *     Returns immediately without waiting for activation to be completed.
+ * @q    - Request queue that is associated with the scsi_device to be
+ *         activated.
+ * @fn   - Function to be called upon completion of the activation.
+ *         Function fn is called with data (below) and the error code.
+ *         Function fn may be called from the same calling context, so the
+ *         caller must not hold any lock that fn may need.
+ * @data - data passed to the function fn upon completion.
+ *
+ */
+int scsi_dh_activate(struct request_queue *q, activate_complete fn, void *data)
+{
+       struct scsi_device *sdev;
+       int err = SCSI_DH_NOSYS;
+
+       sdev = get_sdev_from_queue(q);
+       if (!sdev) {
+               if (fn)
+                       fn(data, err);
+               return err;
+       }
+
+       if (!sdev->handler)
+               goto out_fn;
+       err = SCSI_DH_NOTCONN;
+       if (sdev->sdev_state == SDEV_CANCEL ||
+           sdev->sdev_state == SDEV_DEL)
+               goto out_fn;
+
+       err = SCSI_DH_DEV_OFFLINED;
+       if (sdev->sdev_state == SDEV_OFFLINE)
+               goto out_fn;
+
+       if (sdev->handler->activate)
+               err = sdev->handler->activate(sdev, fn, data);
+
+out_put_device:
+       put_device(&sdev->sdev_gendev);
+       return err;
+
+out_fn:
+       if (fn)
+               fn(data, err);
+       goto out_put_device;
+}
+EXPORT_SYMBOL_GPL(scsi_dh_activate);
+
+/*
+ * scsi_dh_set_params - set the parameters for the device as per the
+ *      string specified in params.
+ * @q - Request queue that is associated with the scsi_device for
+ *      which the parameters are to be set.
+ * @params - parameters in the following format
+ *      "no_of_params\0param1\0param2\0param3\0...\0"
+ *      for example, string for 2 parameters with value 10 and 21
+ *      is specified as "2\010\021\0".
+ */
+int scsi_dh_set_params(struct request_queue *q, const char *params)
+{
+       struct scsi_device *sdev;
+       int err = -SCSI_DH_NOSYS;
+
+       sdev = get_sdev_from_queue(q);
+       if (!sdev)
+               return err;
+
+       if (sdev->handler && sdev->handler->set_params)
+               err = sdev->handler->set_params(sdev, params);
+       put_device(&sdev->sdev_gendev);
+       return err;
+}
+EXPORT_SYMBOL_GPL(scsi_dh_set_params);
+
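
The "no_of_params\0param1\0..." encoding documented above is easiest to read off a concrete buffer. A small sketch of the two-parameter example from the comment (values 10 and 21); the request queue is assumed to come from the caller:

#include <scsi/scsi_dh.h>

/* String-literal concatenation lays out {'2',0,'1','0',0,'2','1',0} in
 * memory, i.e. "two parameters, 10 and 21", matching the comment above. */
static const char demo_params[] = "2\0" "10\0" "21";

static int demo_set_params(struct request_queue *q)
{
	return scsi_dh_set_params(q, demo_params);
}
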
+/*
+ * scsi_dh_attach - Attach device handler
+ * @q - Request queue that is associated with the scsi_device
+ *      the handler should be attached to
+ * @name - name of the handler to attach
+ */
+int scsi_dh_attach(struct request_queue *q, const char *name)
+{
+       struct scsi_device *sdev;
+       struct scsi_device_handler *scsi_dh;
+       int err = 0;
+
+       sdev = get_sdev_from_queue(q);
+       if (!sdev)
+               return -ENODEV;
+
+       scsi_dh = scsi_dh_lookup(name);
+       if (!scsi_dh) {
+               err = -EINVAL;
+               goto out_put_device;
+       }
+
+       if (sdev->handler) {
+               if (sdev->handler != scsi_dh)
+                       err = -EBUSY;
+               goto out_put_device;
+       }
+
+       err = scsi_dh_handler_attach(sdev, scsi_dh);
+
+out_put_device:
+       put_device(&sdev->sdev_gendev);
+       return err;
+}
+EXPORT_SYMBOL_GPL(scsi_dh_attach);
+
+/*
+ * scsi_dh_attached_handler_name - Get attached device handler's name
+ * @q - Request queue that is associated with the scsi_device
+ *      that may have a device handler attached
+ * @gfp - the GFP mask used in the kmalloc() call when allocating memory
+ *
+ * Returns name of attached handler, NULL if no handler is attached.
+ * Caller must take care to free the returned string.
+ */
+const char *scsi_dh_attached_handler_name(struct request_queue *q, gfp_t gfp)
+{
+       struct scsi_device *sdev;
+       const char *handler_name = NULL;
+
+       sdev = get_sdev_from_queue(q);
+       if (!sdev)
+               return NULL;
+
+       if (sdev->handler)
+               handler_name = kstrdup(sdev->handler->name, gfp);
+       put_device(&sdev->sdev_gendev);
+       return handler_name;
+}
+EXPORT_SYMBOL_GPL(scsi_dh_attached_handler_name);
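
A short sketch of consuming scsi_dh_attached_handler_name() from another kernel component; as the comment notes, the returned string is a kstrdup() copy and must be freed by the caller:

#include <linux/slab.h>
#include <scsi/scsi_dh.h>

static void demo_report_handler(struct request_queue *q)
{
	const char *name = scsi_dh_attached_handler_name(q, GFP_KERNEL);

	if (name) {
		pr_info("attached device handler: %s\n", name);
		kfree(name);
	} else {
		pr_info("no device handler attached\n");
	}
}
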
index afd34a608fe7eb8758457cc05d5aa05825a34476..66a96cd98b975dcdbd5429cf02f069b52771fa84 100644 (file)
 #include <scsi/scsi_device.h>
 #include <scsi/scsi_driver.h>
 #include <scsi/scsi_eh.h>
+#include <scsi/scsi_common.h>
 #include <scsi/scsi_transport.h>
 #include <scsi/scsi_host.h>
 #include <scsi/scsi_ioctl.h>
+#include <scsi/scsi_dh.h>
 #include <scsi/sg.h>
 
 #include "scsi_priv.h"
@@ -463,11 +465,10 @@ static int scsi_check_sense(struct scsi_cmnd *scmd)
        if (scsi_sense_is_deferred(&sshdr))
                return NEEDS_RETRY;
 
-       if (sdev->scsi_dh_data && sdev->scsi_dh_data->scsi_dh &&
-                       sdev->scsi_dh_data->scsi_dh->check_sense) {
+       if (sdev->handler && sdev->handler->check_sense) {
                int rc;
 
-               rc = sdev->scsi_dh_data->scsi_dh->check_sense(sdev, &sshdr);
+               rc = sdev->handler->check_sense(sdev, &sshdr);
                if (rc != SCSI_RETURN_NOT_HANDLED)
                        return rc;
                /* handler does not care. Drop down to default handling */
@@ -2178,8 +2179,17 @@ int scsi_error_handler(void *data)
         * We never actually get interrupted because kthread_run
         * disables signal delivery for the created thread.
         */
-       while (!kthread_should_stop()) {
+       while (true) {
+               /*
+                * The sequence in kthread_stop() sets the stop flag first
+                * then wakes the process.  To avoid missed wakeups, the task
+                * should always be in a non running state before the stop
+                * flag is checked
+                */
                set_current_state(TASK_INTERRUPTIBLE);
+               if (kthread_should_stop())
+                       break;
+
                if ((shost->host_failed == 0 && shost->host_eh_scheduled == 0) ||
                    shost->host_failed != atomic_read(&shost->host_busy)) {
                        SCSI_LOG_ERROR_RECOVERY(1,
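
The hunk above closes a shutdown race: testing kthread_should_stop() while the task is still TASK_RUNNING lets the wake-up issued by kthread_stop() land between the test and the later sleep, where it is lost. A generic sketch of the ordering the error handler now follows (the work predicate and body are hypothetical placeholders):

while (true) {
	/* Become sleepable first, then test the stop flag: a wake-up that
	 * arrives after this point simply makes schedule() return early. */
	set_current_state(TASK_INTERRUPTIBLE);
	if (kthread_should_stop())
		break;

	if (!have_pending_work())	/* hypothetical predicate */
		schedule();
	__set_current_state(TASK_RUNNING);

	handle_pending_work();		/* hypothetical work body */
}
__set_current_state(TASK_RUNNING);
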
@@ -2415,45 +2425,6 @@ bool scsi_command_normalize_sense(const struct scsi_cmnd *cmd,
 }
 EXPORT_SYMBOL(scsi_command_normalize_sense);
 
-/**
- * scsi_sense_desc_find - search for a given descriptor type in        descriptor sense data format.
- * @sense_buffer:      byte array of descriptor format sense data
- * @sb_len:            number of valid bytes in sense_buffer
- * @desc_type:         value of descriptor type to find
- *                     (e.g. 0 -> information)
- *
- * Notes:
- *     only valid when sense data is in descriptor format
- *
- * Return value:
- *     pointer to start of (first) descriptor if found else NULL
- */
-const u8 * scsi_sense_desc_find(const u8 * sense_buffer, int sb_len,
-                               int desc_type)
-{
-       int add_sen_len, add_len, desc_len, k;
-       const u8 * descp;
-
-       if ((sb_len < 8) || (0 == (add_sen_len = sense_buffer[7])))
-               return NULL;
-       if ((sense_buffer[0] < 0x72) || (sense_buffer[0] > 0x73))
-               return NULL;
-       add_sen_len = (add_sen_len < (sb_len - 8)) ?
-                       add_sen_len : (sb_len - 8);
-       descp = &sense_buffer[8];
-       for (desc_len = 0, k = 0; k < add_sen_len; k += desc_len) {
-               descp += desc_len;
-               add_len = (k < (add_sen_len - 1)) ? descp[1]: -1;
-               desc_len = add_len + 2;
-               if (descp[0] == desc_type)
-                       return descp;
-               if (add_len < 0) // short descriptor ??
-                       break;
-       }
-       return NULL;
-}
-EXPORT_SYMBOL(scsi_sense_desc_find);
-
 /**
  * scsi_get_sense_info_fld - get information field from sense data (either fixed or descriptor format)
  * @sense_buffer:      byte array of sense data
@@ -2503,31 +2474,3 @@ int scsi_get_sense_info_fld(const u8 * sense_buffer, int sb_len,
        }
 }
 EXPORT_SYMBOL(scsi_get_sense_info_fld);
-
-/**
- * scsi_build_sense_buffer - build sense data in a buffer
- * @desc:      Sense format (non zero == descriptor format,
- *             0 == fixed format)
- * @buf:       Where to build sense data
- * @key:       Sense key
- * @asc:       Additional sense code
- * @ascq:      Additional sense code qualifier
- *
- **/
-void scsi_build_sense_buffer(int desc, u8 *buf, u8 key, u8 asc, u8 ascq)
-{
-       if (desc) {
-               buf[0] = 0x72;  /* descriptor, current */
-               buf[1] = key;
-               buf[2] = asc;
-               buf[3] = ascq;
-               buf[7] = 0;
-       } else {
-               buf[0] = 0x70;  /* fixed, current */
-               buf[2] = key;
-               buf[7] = 0xa;
-               buf[12] = asc;
-               buf[13] = ascq;
-       }
-}
-EXPORT_SYMBOL(scsi_build_sense_buffer);
index 882864f5cbae8b8d775eecfdd851aee44fa2e0fe..cbfc5990052b6b2733ae1c8a81467d3a0e9e70f4 100644 (file)
@@ -31,6 +31,7 @@
 #include <scsi/scsi_driver.h>
 #include <scsi/scsi_eh.h>
 #include <scsi/scsi_host.h>
+#include <scsi/scsi_dh.h>
 
 #include <trace/events/scsi.h>
 
@@ -1248,9 +1249,8 @@ static int scsi_setup_fs_cmnd(struct scsi_device *sdev, struct request *req)
 {
        struct scsi_cmnd *cmd = req->special;
 
-       if (unlikely(sdev->scsi_dh_data && sdev->scsi_dh_data->scsi_dh
-                        && sdev->scsi_dh_data->scsi_dh->prep_fn)) {
-               int ret = sdev->scsi_dh_data->scsi_dh->prep_fn(sdev, req);
+       if (unlikely(sdev->handler && sdev->handler->prep_fn)) {
+               int ret = sdev->handler->prep_fn(sdev, req);
                if (ret != BLKPREP_OK)
                        return ret;
        }
index e3902fc66278d147b2e8c37c58211b336449b9f8..644bb7339b55bd89068e2a1c34a096dab682cb62 100644 (file)
@@ -170,6 +170,15 @@ static inline void scsi_autopm_put_host(struct Scsi_Host *h) {}
 extern struct async_domain scsi_sd_pm_domain;
 extern struct async_domain scsi_sd_probe_domain;
 
+/* scsi_dh.c */
+#ifdef CONFIG_SCSI_DH
+int scsi_dh_add_device(struct scsi_device *sdev);
+void scsi_dh_remove_device(struct scsi_device *sdev);
+#else
+static inline int scsi_dh_add_device(struct scsi_device *sdev) { return 0; }
+static inline void scsi_dh_remove_device(struct scsi_device *sdev) { }
+#endif
+
 /* 
  * internal scsi timeout functions: for use by mid-layer and transport
  * classes.
index 9ad41168d26df1897814121766cd2e8fbd3a0243..b333389f248ffec291958014a39829156a188bd0 100644 (file)
@@ -1030,11 +1030,20 @@ int scsi_sysfs_add_sdev(struct scsi_device *sdev)
                                "failed to add device: %d\n", error);
                return error;
        }
+
+       error = scsi_dh_add_device(sdev);
+       if (error) {
+               sdev_printk(KERN_INFO, sdev,
+                               "failed to add device handler: %d\n", error);
+               return error;
+       }
+
        device_enable_async_suspend(&sdev->sdev_dev);
        error = device_add(&sdev->sdev_dev);
        if (error) {
                sdev_printk(KERN_INFO, sdev,
                                "failed to add class device: %d\n", error);
+               scsi_dh_remove_device(sdev);
                device_del(&sdev->sdev_gendev);
                return error;
        }
@@ -1074,6 +1083,7 @@ void __scsi_remove_device(struct scsi_device *sdev)
                bsg_unregister_queue(sdev->request_queue);
                device_unregister(&sdev->sdev_dev);
                transport_remove_device(dev);
+               scsi_dh_remove_device(sdev);
                device_del(dev);
        } else
                put_device(&sdev->sdev_dev);
index 9a058194b9bdb2ae37079663af21a9f4dad33502..30d26e345dcc797430667ebe2ea0fa1f288fc372 100644 (file)
@@ -1222,13 +1222,6 @@ show_sas_rphy_enclosure_identifier(struct device *dev,
        u64 identifier;
        int error;
 
-       /*
-        * Only devices behind an expander are supported, because the
-        * enclosure identifier is a SMP feature.
-        */
-       if (scsi_is_sas_phy_local(phy))
-               return -EINVAL;
-
        error = i->f->get_enclosure_identifier(rphy, &identifier);
        if (error)
                return error;
@@ -1248,9 +1241,6 @@ show_sas_rphy_bay_identifier(struct device *dev,
        struct sas_internal *i = to_sas_internal(shost->transportt);
        int val;
 
-       if (scsi_is_sas_phy_local(phy))
-               return -EINVAL;
-
        val = i->f->get_bay_identifier(rphy);
        if (val < 0)
                return val;
index e26e81de7c45ab31e4e3dd7e40db7854d756190b..d50c5ed8f428c69380101ce7ccab16fcc4e805d4 100644 (file)
@@ -12,9 +12,9 @@
 #include <linux/platform_device.h>
 #include <linux/dma-mapping.h>
 #include <linux/interrupt.h>
+#include <linux/io.h>
 
 #include <asm/sun3x.h>
-#include <asm/io.h>
 #include <asm/dma.h>
 #include <asm/dvma.h>
 
index fad22caf0efffd7551e6203abaeabd99f3ecf3df..9dc8687bf0480e53b2d89431ece47f8504fd5779 100644 (file)
@@ -377,7 +377,6 @@ static int map_data_for_request(struct vscsifrnt_info *info,
        unsigned int data_len = scsi_bufflen(sc);
        unsigned int data_grants = 0, seg_grants = 0;
        struct scatterlist *sg;
-       unsigned long mfn;
        struct scsiif_request_segment *seg;
 
        ring_req->nr_segments = 0;
@@ -420,9 +419,9 @@ static int map_data_for_request(struct vscsifrnt_info *info,
                        ref = gnttab_claim_grant_reference(&gref_head);
                        BUG_ON(ref == -ENOSPC);
 
-                       mfn = pfn_to_mfn(page_to_pfn(page));
                        gnttab_grant_foreign_access_ref(ref,
-                               info->dev->otherend_id, mfn, 1);
+                               info->dev->otherend_id,
+                               xen_page_to_gfn(page), 1);
                        shadow->gref[ref_cnt] = ref;
                        ring_req->seg[ref_cnt].gref   = ref;
                        ring_req->seg[ref_cnt].offset = (uint16_t)off;
@@ -454,9 +453,10 @@ static int map_data_for_request(struct vscsifrnt_info *info,
                        ref = gnttab_claim_grant_reference(&gref_head);
                        BUG_ON(ref == -ENOSPC);
 
-                       mfn = pfn_to_mfn(page_to_pfn(page));
                        gnttab_grant_foreign_access_ref(ref,
-                               info->dev->otherend_id, mfn, grant_ro);
+                               info->dev->otherend_id,
+                               xen_page_to_gfn(page),
+                               grant_ro);
 
                        shadow->gref[ref_cnt] = ref;
                        seg->gref   = ref;
index 327adcf117c18efbd9e454c001a6578d30c2088e..a6155c917d52d03a088a2ccbbd5a25220f392c60 100644 (file)
@@ -96,6 +96,7 @@ static const struct {
  * @smd:               handle to qcom_smd
  * @of_node:           of_node handle for information related to this edge
  * @edge_id:           identifier of this edge
+ * @remote_pid:                identifier of remote processor
  * @irq:               interrupt for signals on this edge
  * @ipc_regmap:                regmap handle holding the outgoing ipc register
  * @ipc_offset:                offset within @ipc_regmap of the register for ipc
@@ -111,6 +112,7 @@ struct qcom_smd_edge {
        struct qcom_smd *smd;
        struct device_node *of_node;
        unsigned edge_id;
+       unsigned remote_pid;
 
        int irq;
 
@@ -310,7 +312,7 @@ static void qcom_smd_channel_reset(struct qcom_smd_channel *channel)
        SET_TX_CHANNEL_INFO(channel, fHEAD, 0);
        SET_TX_CHANNEL_INFO(channel, fTAIL, 0);
        SET_TX_CHANNEL_INFO(channel, fSTATE, 1);
-       SET_TX_CHANNEL_INFO(channel, fBLOCKREADINTR, 0);
+       SET_TX_CHANNEL_INFO(channel, fBLOCKREADINTR, 1);
        SET_TX_CHANNEL_INFO(channel, head, 0);
        SET_TX_CHANNEL_INFO(channel, tail, 0);
 
@@ -572,7 +574,7 @@ static irqreturn_t qcom_smd_edge_intr(int irq, void *data)
         * have to scan if the amount of available space in smem have changed
         * since last scan.
         */
-       available = qcom_smem_get_free_space(edge->edge_id);
+       available = qcom_smem_get_free_space(edge->remote_pid);
        if (available != edge->smem_available) {
                edge->smem_available = available;
                edge->need_rescan = true;
@@ -681,7 +683,7 @@ int qcom_smd_send(struct qcom_smd_channel *channel, const void *data, int len)
                        goto out;
                }
 
-               SET_TX_CHANNEL_INFO(channel, fBLOCKREADINTR, 1);
+               SET_TX_CHANNEL_INFO(channel, fBLOCKREADINTR, 0);
 
                ret = wait_event_interruptible(channel->fblockread_event,
                                       qcom_smd_get_tx_avail(channel) >= tlen ||
@@ -689,7 +691,7 @@ int qcom_smd_send(struct qcom_smd_channel *channel, const void *data, int len)
                if (ret)
                        goto out;
 
-               SET_TX_CHANNEL_INFO(channel, fBLOCKREADINTR, 0);
+               SET_TX_CHANNEL_INFO(channel, fBLOCKREADINTR, 1);
        }
 
        SET_TX_CHANNEL_INFO(channel, fTAIL, 0);
@@ -976,7 +978,8 @@ static struct qcom_smd_channel *qcom_smd_create_channel(struct qcom_smd_edge *ed
        spin_lock_init(&channel->recv_lock);
        init_waitqueue_head(&channel->fblockread_event);
 
-       ret = qcom_smem_get(edge->edge_id, smem_info_item, (void **)&info, &info_size);
+       ret = qcom_smem_get(edge->remote_pid, smem_info_item, (void **)&info,
+                           &info_size);
        if (ret)
                goto free_name_and_channel;
 
@@ -997,7 +1000,8 @@ static struct qcom_smd_channel *qcom_smd_create_channel(struct qcom_smd_edge *ed
                goto free_name_and_channel;
        }
 
-       ret = qcom_smem_get(edge->edge_id, smem_fifo_item, &fifo_base, &fifo_size);
+       ret = qcom_smem_get(edge->remote_pid, smem_fifo_item, &fifo_base,
+                           &fifo_size);
        if (ret)
                goto free_name_and_channel;
 
@@ -1041,7 +1045,7 @@ static void qcom_discover_channels(struct qcom_smd_edge *edge)
        int i;
 
        for (tbl = 0; tbl < SMD_ALLOC_TBL_COUNT; tbl++) {
-               ret = qcom_smem_get(edge->edge_id,
+               ret = qcom_smem_get(edge->remote_pid,
                                    smem_items[tbl].alloc_tbl_id,
                                    (void **)&alloc_tbl,
                                    NULL);
@@ -1184,6 +1188,10 @@ static int qcom_smd_parse_edge(struct device *dev,
                return -EINVAL;
        }
 
+       edge->remote_pid = QCOM_SMEM_HOST_ANY;
+       key = "qcom,remote-pid";
+       of_property_read_u32(node, key, &edge->remote_pid);
+
        syscon_np = of_parse_phandle(node, "qcom,ipc", 0);
        if (!syscon_np) {
                dev_err(dev, "no qcom,ipc node\n");
index 7c2c324c4b10cee9a1430c7c68b2c079ea7e7ce7..52365188a1c20288a754dc7e1a530e4b25570ac3 100644 (file)
@@ -258,10 +258,6 @@ static int qcom_smem_alloc_private(struct qcom_smem *smem,
        size_t alloc_size;
        void *p;
 
-       /* We're not going to find it if there's no matching partition */
-       if (host >= SMEM_HOST_COUNT || !smem->partitions[host])
-               return -ENOENT;
-
        phdr = smem->partitions[host];
 
        p = (void *)phdr + sizeof(*phdr);
@@ -371,8 +367,9 @@ int qcom_smem_alloc(unsigned host, unsigned item, size_t size)
        if (ret)
                return ret;
 
-       ret = qcom_smem_alloc_private(__smem, host, item, size);
-       if (ret == -ENOENT)
+       if (host < SMEM_HOST_COUNT && __smem->partitions[host])
+               ret = qcom_smem_alloc_private(__smem, host, item, size);
+       else
                ret = qcom_smem_alloc_global(__smem, item, size);
 
        hwspin_unlock_irqrestore(__smem->hwlock, &flags);
@@ -428,10 +425,6 @@ static int qcom_smem_get_private(struct qcom_smem *smem,
        struct smem_private_entry *hdr;
        void *p;
 
-       /* We're not going to find it if there's no matching partition */
-       if (host >= SMEM_HOST_COUNT || !smem->partitions[host])
-               return -ENOENT;
-
        phdr = smem->partitions[host];
 
        p = (void *)phdr + sizeof(*phdr);
@@ -484,8 +477,9 @@ int qcom_smem_get(unsigned host, unsigned item, void **ptr, size_t *size)
        if (ret)
                return ret;
 
-       ret = qcom_smem_get_private(__smem, host, item, ptr, size);
-       if (ret == -ENOENT)
+       if (host < SMEM_HOST_COUNT && __smem->partitions[host])
+               ret = qcom_smem_get_private(__smem, host, item, ptr, size);
+       else
                ret = qcom_smem_get_global(__smem, item, ptr, size);
 
        hwspin_unlock_irqrestore(__smem->hwlock, &flags);
index e29293c0c71e315e9bee9610dad6ee440960087f..39d950584c9ffe585bfe40e20ee94a806c8d5be8 100644 (file)
@@ -72,6 +72,8 @@ source "drivers/staging/nvec/Kconfig"
 
 source "drivers/staging/media/Kconfig"
 
+source "drivers/staging/rdma/Kconfig"
+
 source "drivers/staging/android/Kconfig"
 
 source "drivers/staging/board/Kconfig"
index 50824dde2c09af863f2921ecd2512d7361d9ef48..e4f33d91872b54920879bb5d3d586dfeb743665e 100644 (file)
@@ -29,6 +29,7 @@ obj-$(CONFIG_FT1000)          += ft1000/
 obj-$(CONFIG_SPEAKUP)          += speakup/
 obj-$(CONFIG_TOUCHSCREEN_SYNAPTICS_I2C_RMI4)   += ste_rmi4/
 obj-$(CONFIG_MFD_NVEC)         += nvec/
+obj-$(CONFIG_STAGING_RDMA)     += rdma/
 obj-$(CONFIG_ANDROID)          += android/
 obj-$(CONFIG_STAGING_BOARD)    += board/
 obj-$(CONFIG_WIMAX_GDM72XX)    += gdm72xx/
index eec878e183f5d81e51649efad85a470805fb0080..217aa537c4eb9770a0ca0abf34626a8146f5049b 100644 (file)
@@ -997,7 +997,7 @@ static void ion_vm_close(struct vm_area_struct *vma)
        mutex_unlock(&buffer->lock);
 }
 
-static struct vm_operations_struct ion_vma_ops = {
+static const struct vm_operations_struct ion_vma_ops = {
        .open = ion_vm_open,
        .close = ion_vm_close,
        .fault = ion_vm_fault,
index 81df77bd55cc98687840b6a86664e89fa080c44d..9c41652ee908b5e8254b29f4e3dbf4cf584bed9d 100644 (file)
@@ -91,7 +91,7 @@ static const struct board_staging_dev armadillo800eva_devices[] __initconst = {
                .pdev           = &lcdc0_device,
                .clocks         = lcdc0_clocks,
                .nclocks        = ARRAY_SIZE(lcdc0_clocks),
-               .domain         = "a4lc",
+               .domain         = "/system-controller@e6180000/pm-domains/c5/a4lc@1"
        },
 };
 
index 29d456e29f38feac15d8728cfe3c8085cc49d405..3eb5eb8f069c236da870eb4b13a677eb560a1a1f 100644 (file)
@@ -135,6 +135,40 @@ int __init board_staging_register_clock(const struct board_staging_clk *bsc)
        return error;
 }
 
+#ifdef CONFIG_PM_GENERIC_DOMAINS_OF
+static int board_staging_add_dev_domain(struct platform_device *pdev,
+                                       const char *domain)
+{
+       struct of_phandle_args pd_args;
+       struct generic_pm_domain *pd;
+       struct device_node *np;
+
+       np = of_find_node_by_path(domain);
+       if (!np) {
+               pr_err("Cannot find domain node %s\n", domain);
+               return -ENOENT;
+       }
+
+       pd_args.np = np;
+       pd_args.args_count = 0;
+       pd = of_genpd_get_from_provider(&pd_args);
+       if (IS_ERR(pd)) {
+               pr_err("Cannot find genpd %s (%ld)\n", domain, PTR_ERR(pd));
+               return PTR_ERR(pd);
+
+       }
+       pr_debug("Found genpd %s for device %s\n", pd->name, pdev->name);
+
+       return pm_genpd_add_device(pd, &pdev->dev);
+}
+#else
+static inline int board_staging_add_dev_domain(struct platform_device *pdev,
+                                              const char *domain)
+{
+       return 0;
+}
+#endif
+
 int __init board_staging_register_device(const struct board_staging_dev *dev)
 {
        struct platform_device *pdev = dev->pdev;
@@ -161,7 +195,7 @@ int __init board_staging_register_device(const struct board_staging_dev *dev)
        }
 
        if (dev->domain)
-               __pm_genpd_name_add_device(dev->domain, &pdev->dev, NULL);
+               board_staging_add_dev_domain(pdev, dev->domain);
 
        return error;
 }
index fd54d098ab02248eabff1b250afc123b27433260..0e8a45102933ea0c0a103581959ad54c01918abc 100644 (file)
@@ -2156,7 +2156,7 @@ static void comedi_vm_close(struct vm_area_struct *area)
        comedi_buf_map_put(bm);
 }
 
-static struct vm_operations_struct comedi_vm_ops = {
+static const struct vm_operations_struct comedi_vm_ops = {
        .open = comedi_vm_open,
        .close = comedi_vm_close,
 };
index 0768bc42a5db5a7756e5132baf8f741b2cc0e2e7..14ef1f67dd420bb66d8a2ff90851a3544a277cf8 100644 (file)
@@ -28,6 +28,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/io.h>
 #include "../comedidev.h"
 
 /*
diff --git a/drivers/staging/rdma/Kconfig b/drivers/staging/rdma/Kconfig
new file mode 100644 (file)
index 0000000..d7f6235
--- /dev/null
@@ -0,0 +1,33 @@
+menuconfig STAGING_RDMA
+        bool "RDMA staging drivers"
+       depends on INFINIBAND
+       depends on PCI || BROKEN
+       depends on HAS_IOMEM
+       depends on NET
+       depends on INET
+        default n
+        ---help---
+          This option allows you to select a number of RDMA drivers that
+         fall into one of two categories: deprecated drivers being held
+         here before finally being removed or new drivers that still need
+         some work before being moved to the normal RDMA driver area.
+
+          If you wish to work on these drivers, to help improve them, or
+          to report problems you have with them, please use the
+         linux-rdma@vger.kernel.org mailing list.
+
+          If in doubt, say N here.
+
+
+# Please keep entries in alphabetic order
+if STAGING_RDMA
+
+source "drivers/staging/rdma/amso1100/Kconfig"
+
+source "drivers/staging/rdma/ehca/Kconfig"
+
+source "drivers/staging/rdma/hfi1/Kconfig"
+
+source "drivers/staging/rdma/ipath/Kconfig"
+
+endif
diff --git a/drivers/staging/rdma/Makefile b/drivers/staging/rdma/Makefile
new file mode 100644 (file)
index 0000000..139d78e
--- /dev/null
@@ -0,0 +1,5 @@
+# Entries for RDMA_STAGING tree
+obj-$(CONFIG_INFINIBAND_AMSO1100)      += amso1100/
+obj-$(CONFIG_INFINIBAND_EHCA)  += ehca/
+obj-$(CONFIG_INFINIBAND_HFI1)  += hfi1/
+obj-$(CONFIG_INFINIBAND_IPATH) += ipath/
diff --git a/drivers/staging/rdma/amso1100/Kbuild b/drivers/staging/rdma/amso1100/Kbuild
new file mode 100644 (file)
index 0000000..950dfab
--- /dev/null
@@ -0,0 +1,6 @@
+ccflags-$(CONFIG_INFINIBAND_AMSO1100_DEBUG) := -DDEBUG
+
+obj-$(CONFIG_INFINIBAND_AMSO1100) += iw_c2.o
+
+iw_c2-y := c2.o c2_provider.o c2_rnic.o c2_alloc.o c2_mq.o c2_ae.o c2_vq.o \
+       c2_intr.o c2_cq.o c2_qp.o c2_cm.o c2_mm.o c2_pd.o
diff --git a/drivers/staging/rdma/amso1100/Kconfig b/drivers/staging/rdma/amso1100/Kconfig
new file mode 100644 (file)
index 0000000..e6ce5f2
--- /dev/null
@@ -0,0 +1,15 @@
+config INFINIBAND_AMSO1100
+       tristate "Ammasso 1100 HCA support"
+       depends on PCI && INET
+       ---help---
+         This is a low-level driver for the Ammasso 1100 host
+         channel adapter (HCA).
+
+config INFINIBAND_AMSO1100_DEBUG
+       bool "Verbose debugging output"
+       depends on INFINIBAND_AMSO1100
+       default n
+       ---help---
+         This option causes the amso1100 driver to produce a bunch of
+         debug messages.  Select this if you are developing the driver
+         or trying to diagnose a problem.
diff --git a/drivers/staging/rdma/amso1100/TODO b/drivers/staging/rdma/amso1100/TODO
new file mode 100644 (file)
index 0000000..18b00a5
--- /dev/null
@@ -0,0 +1,4 @@
+7/2015
+
+The amso1100 driver has been deprecated and moved to drivers/staging.
+It will be removed in the 4.6 merge window.
diff --git a/drivers/staging/rdma/amso1100/c2.c b/drivers/staging/rdma/amso1100/c2.c
new file mode 100644 (file)
index 0000000..766a71c
--- /dev/null
@@ -0,0 +1,1241 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/pci.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/ethtool.h>
+#include <linux/mii.h>
+#include <linux/if_vlan.h>
+#include <linux/crc32.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/init.h>
+#include <linux/dma-mapping.h>
+#include <linux/slab.h>
+#include <linux/prefetch.h>
+
+#include <asm/io.h>
+#include <asm/irq.h>
+#include <asm/byteorder.h>
+
+#include <rdma/ib_smi.h>
+#include "c2.h"
+#include "c2_provider.h"
+
+MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>");
+MODULE_DESCRIPTION("Ammasso AMSO1100 Low-level iWARP Driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRV_VERSION);
+
+static const u32 default_msg = NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK
+    | NETIF_MSG_IFUP | NETIF_MSG_IFDOWN;
+
+static int debug = -1;         /* defaults above */
+module_param(debug, int, 0);
+MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)");
+
+static int c2_up(struct net_device *netdev);
+static int c2_down(struct net_device *netdev);
+static int c2_xmit_frame(struct sk_buff *skb, struct net_device *netdev);
+static void c2_tx_interrupt(struct net_device *netdev);
+static void c2_rx_interrupt(struct net_device *netdev);
+static irqreturn_t c2_interrupt(int irq, void *dev_id);
+static void c2_tx_timeout(struct net_device *netdev);
+static int c2_change_mtu(struct net_device *netdev, int new_mtu);
+static void c2_reset(struct c2_port *c2_port);
+
+static struct pci_device_id c2_pci_table[] = {
+       { PCI_DEVICE(0x18b8, 0xb001) },
+       { 0 }
+};
+
+MODULE_DEVICE_TABLE(pci, c2_pci_table);
+
+static void c2_print_macaddr(struct net_device *netdev)
+{
+       pr_debug("%s: MAC %pM, IRQ %u\n", netdev->name, netdev->dev_addr, netdev->irq);
+}
+
+static void c2_set_rxbufsize(struct c2_port *c2_port)
+{
+       struct net_device *netdev = c2_port->netdev;
+
+       if (netdev->mtu > RX_BUF_SIZE)
+               c2_port->rx_buf_size =
+                   netdev->mtu + ETH_HLEN + sizeof(struct c2_rxp_hdr) +
+                   NET_IP_ALIGN;
+       else
+               c2_port->rx_buf_size = sizeof(struct c2_rxp_hdr) + RX_BUF_SIZE;
+}
+
+/*
+ * Allocate TX ring elements and chain them together.
+ * One-to-one association of adapter descriptors with ring elements.
+ */
+static int c2_tx_ring_alloc(struct c2_ring *tx_ring, void *vaddr,
+                           dma_addr_t base, void __iomem * mmio_txp_ring)
+{
+       struct c2_tx_desc *tx_desc;
+       struct c2_txp_desc __iomem *txp_desc;
+       struct c2_element *elem;
+       int i;
+
+       tx_ring->start = kmalloc(sizeof(*elem) * tx_ring->count, GFP_KERNEL);
+       if (!tx_ring->start)
+               return -ENOMEM;
+
+       elem = tx_ring->start;
+       tx_desc = vaddr;
+       txp_desc = mmio_txp_ring;
+       for (i = 0; i < tx_ring->count; i++, elem++, tx_desc++, txp_desc++) {
+               tx_desc->len = 0;
+               tx_desc->status = 0;
+
+               /* Set TXP_HTXD_UNINIT */
+               __raw_writeq((__force u64) cpu_to_be64(0x1122334455667788ULL),
+                            (void __iomem *) txp_desc + C2_TXP_ADDR);
+               __raw_writew(0, (void __iomem *) txp_desc + C2_TXP_LEN);
+               __raw_writew((__force u16) cpu_to_be16(TXP_HTXD_UNINIT),
+                            (void __iomem *) txp_desc + C2_TXP_FLAGS);
+
+               elem->skb = NULL;
+               elem->ht_desc = tx_desc;
+               elem->hw_desc = txp_desc;
+
+               if (i == tx_ring->count - 1) {
+                       elem->next = tx_ring->start;
+                       tx_desc->next_offset = base;
+               } else {
+                       elem->next = elem + 1;
+                       tx_desc->next_offset =
+                           base + (i + 1) * sizeof(*tx_desc);
+               }
+       }
+
+       tx_ring->to_use = tx_ring->to_clean = tx_ring->start;
+
+       return 0;
+}
+
+/*
+ * Allocate RX ring elements and chain them together.
+ * One-to-one association of adapter descriptors with ring elements.
+ */
+static int c2_rx_ring_alloc(struct c2_ring *rx_ring, void *vaddr,
+                           dma_addr_t base, void __iomem * mmio_rxp_ring)
+{
+       struct c2_rx_desc *rx_desc;
+       struct c2_rxp_desc __iomem *rxp_desc;
+       struct c2_element *elem;
+       int i;
+
+       rx_ring->start = kmalloc(sizeof(*elem) * rx_ring->count, GFP_KERNEL);
+       if (!rx_ring->start)
+               return -ENOMEM;
+
+       elem = rx_ring->start;
+       rx_desc = vaddr;
+       rxp_desc = mmio_rxp_ring;
+       for (i = 0; i < rx_ring->count; i++, elem++, rx_desc++, rxp_desc++) {
+               rx_desc->len = 0;
+               rx_desc->status = 0;
+
+               /* Set RXP_HRXD_UNINIT */
+               __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_OK),
+                      (void __iomem *) rxp_desc + C2_RXP_STATUS);
+               __raw_writew(0, (void __iomem *) rxp_desc + C2_RXP_COUNT);
+               __raw_writew(0, (void __iomem *) rxp_desc + C2_RXP_LEN);
+               __raw_writeq((__force u64) cpu_to_be64(0x99aabbccddeeffULL),
+                            (void __iomem *) rxp_desc + C2_RXP_ADDR);
+               __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_UNINIT),
+                            (void __iomem *) rxp_desc + C2_RXP_FLAGS);
+
+               elem->skb = NULL;
+               elem->ht_desc = rx_desc;
+               elem->hw_desc = rxp_desc;
+
+               if (i == rx_ring->count - 1) {
+                       elem->next = rx_ring->start;
+                       rx_desc->next_offset = base;
+               } else {
+                       elem->next = elem + 1;
+                       rx_desc->next_offset =
+                           base + (i + 1) * sizeof(*rx_desc);
+               }
+       }
+
+       rx_ring->to_use = rx_ring->to_clean = rx_ring->start;
+
+       return 0;
+}
+
+/* Setup buffer for receiving */
+static inline int c2_rx_alloc(struct c2_port *c2_port, struct c2_element *elem)
+{
+       struct c2_dev *c2dev = c2_port->c2dev;
+       struct c2_rx_desc *rx_desc = elem->ht_desc;
+       struct sk_buff *skb;
+       dma_addr_t mapaddr;
+       u32 maplen;
+       struct c2_rxp_hdr *rxp_hdr;
+
+       skb = dev_alloc_skb(c2_port->rx_buf_size);
+       if (unlikely(!skb)) {
+               pr_debug("%s: out of memory for receive\n",
+                       c2_port->netdev->name);
+               return -ENOMEM;
+       }
+
+       /* Zero out the rxp hdr in the sk_buff */
+       memset(skb->data, 0, sizeof(*rxp_hdr));
+
+       skb->dev = c2_port->netdev;
+
+       maplen = c2_port->rx_buf_size;
+       mapaddr =
+           pci_map_single(c2dev->pcidev, skb->data, maplen,
+                          PCI_DMA_FROMDEVICE);
+
+       /* Set the sk_buff RXP_header to RXP_HRXD_READY */
+       rxp_hdr = (struct c2_rxp_hdr *) skb->data;
+       rxp_hdr->flags = RXP_HRXD_READY;
+
+       __raw_writew(0, elem->hw_desc + C2_RXP_STATUS);
+       __raw_writew((__force u16) cpu_to_be16((u16) maplen - sizeof(*rxp_hdr)),
+                    elem->hw_desc + C2_RXP_LEN);
+       __raw_writeq((__force u64) cpu_to_be64(mapaddr), elem->hw_desc + C2_RXP_ADDR);
+       __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_READY),
+                    elem->hw_desc + C2_RXP_FLAGS);
+
+       elem->skb = skb;
+       elem->mapaddr = mapaddr;
+       elem->maplen = maplen;
+       rx_desc->len = maplen;
+
+       return 0;
+}
+
+/*
+ * Allocate buffers for the Rx ring
+ * For receive:  rx_ring.to_clean is next received frame
+ */
+static int c2_rx_fill(struct c2_port *c2_port)
+{
+       struct c2_ring *rx_ring = &c2_port->rx_ring;
+       struct c2_element *elem;
+       int ret = 0;
+
+       elem = rx_ring->start;
+       do {
+               if (c2_rx_alloc(c2_port, elem)) {
+                       ret = 1;
+                       break;
+               }
+       } while ((elem = elem->next) != rx_ring->start);
+
+       rx_ring->to_clean = rx_ring->start;
+       return ret;
+}
+
+/* Free all buffers in RX ring, assumes receiver stopped */
+static void c2_rx_clean(struct c2_port *c2_port)
+{
+       struct c2_dev *c2dev = c2_port->c2dev;
+       struct c2_ring *rx_ring = &c2_port->rx_ring;
+       struct c2_element *elem;
+       struct c2_rx_desc *rx_desc;
+
+       elem = rx_ring->start;
+       do {
+               rx_desc = elem->ht_desc;
+               rx_desc->len = 0;
+
+               __raw_writew(0, elem->hw_desc + C2_RXP_STATUS);
+               __raw_writew(0, elem->hw_desc + C2_RXP_COUNT);
+               __raw_writew(0, elem->hw_desc + C2_RXP_LEN);
+               __raw_writeq((__force u64) cpu_to_be64(0x99aabbccddeeffULL),
+                            elem->hw_desc + C2_RXP_ADDR);
+               __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_UNINIT),
+                            elem->hw_desc + C2_RXP_FLAGS);
+
+               if (elem->skb) {
+                       pci_unmap_single(c2dev->pcidev, elem->mapaddr,
+                                        elem->maplen, PCI_DMA_FROMDEVICE);
+                       dev_kfree_skb(elem->skb);
+                       elem->skb = NULL;
+               }
+       } while ((elem = elem->next) != rx_ring->start);
+}
+
+static inline int c2_tx_free(struct c2_dev *c2dev, struct c2_element *elem)
+{
+       struct c2_tx_desc *tx_desc = elem->ht_desc;
+
+       tx_desc->len = 0;
+
+       pci_unmap_single(c2dev->pcidev, elem->mapaddr, elem->maplen,
+                        PCI_DMA_TODEVICE);
+
+       if (elem->skb) {
+               dev_kfree_skb_any(elem->skb);
+               elem->skb = NULL;
+       }
+
+       return 0;
+}
+
+/* Free all buffers in TX ring, assumes transmitter stopped */
+static void c2_tx_clean(struct c2_port *c2_port)
+{
+       struct c2_ring *tx_ring = &c2_port->tx_ring;
+       struct c2_element *elem;
+       struct c2_txp_desc txp_htxd;
+       int retry;
+       unsigned long flags;
+
+       spin_lock_irqsave(&c2_port->tx_lock, flags);
+
+       elem = tx_ring->start;
+
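+       /*
+        * Walk the entire ring, reclaiming every element.  A slot still
+        * marked READY was queued but never completed; mark it DONE,
+        * count it as dropped, and rescan until no READY slots remain.
+        */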
+       do {
+               retry = 0;
+               do {
+                       txp_htxd.flags =
+                           readw(elem->hw_desc + C2_TXP_FLAGS);
+
+                       if (txp_htxd.flags == TXP_HTXD_READY) {
+                               retry = 1;
+                               __raw_writew(0,
+                                            elem->hw_desc + C2_TXP_LEN);
+                               __raw_writeq(0,
+                                            elem->hw_desc + C2_TXP_ADDR);
+                               __raw_writew((__force u16) cpu_to_be16(TXP_HTXD_DONE),
+                                            elem->hw_desc + C2_TXP_FLAGS);
+                               c2_port->netdev->stats.tx_dropped++;
+                               break;
+                       } else {
+                               __raw_writew(0,
+                                            elem->hw_desc + C2_TXP_LEN);
+                               __raw_writeq((__force u64) cpu_to_be64(0x1122334455667788ULL),
+                                            elem->hw_desc + C2_TXP_ADDR);
+                               __raw_writew((__force u16) cpu_to_be16(TXP_HTXD_UNINIT),
+                                            elem->hw_desc + C2_TXP_FLAGS);
+                       }
+
+                       c2_tx_free(c2_port->c2dev, elem);
+
+               } while ((elem = elem->next) != tx_ring->start);
+       } while (retry);
+
+       c2_port->tx_avail = c2_port->tx_ring.count - 1;
+       c2_port->c2dev->cur_tx = tx_ring->to_use - tx_ring->start;
+
+       if (c2_port->tx_avail > MAX_SKB_FRAGS + 1)
+               netif_wake_queue(c2_port->netdev);
+
+       spin_unlock_irqrestore(&c2_port->tx_lock, flags);
+}
+
+/*
+ * Process transmit descriptors marked 'DONE' by the firmware,
+ * freeing up their unneeded sk_buffs.
+ */
+static void c2_tx_interrupt(struct net_device *netdev)
+{
+       struct c2_port *c2_port = netdev_priv(netdev);
+       struct c2_dev *c2dev = c2_port->c2dev;
+       struct c2_ring *tx_ring = &c2_port->tx_ring;
+       struct c2_element *elem;
+       struct c2_txp_desc txp_htxd;
+
+       spin_lock(&c2_port->tx_lock);
+
+       for (elem = tx_ring->to_clean; elem != tx_ring->to_use;
+            elem = elem->next) {
+               txp_htxd.flags =
+                   be16_to_cpu((__force __be16) readw(elem->hw_desc + C2_TXP_FLAGS));
+
+               if (txp_htxd.flags != TXP_HTXD_DONE)
+                       break;
+
+               if (netif_msg_tx_done(c2_port)) {
+                       /* PCI reads are expensive in fast path */
+                       txp_htxd.len =
+                           be16_to_cpu((__force __be16) readw(elem->hw_desc + C2_TXP_LEN));
+                       pr_debug("%s: tx done slot %3Zu status 0x%x len "
+                               "%5u bytes\n",
+                               netdev->name, elem - tx_ring->start,
+                               txp_htxd.flags, txp_htxd.len);
+               }
+
+               c2_tx_free(c2dev, elem);
+               ++(c2_port->tx_avail);
+       }
+
+       tx_ring->to_clean = elem;
+
+       if (netif_queue_stopped(netdev)
+           && c2_port->tx_avail > MAX_SKB_FRAGS + 1)
+               netif_wake_queue(netdev);
+
+       spin_unlock(&c2_port->tx_lock);
+}
+
+static void c2_rx_error(struct c2_port *c2_port, struct c2_element *elem)
+{
+       struct c2_rx_desc *rx_desc = elem->ht_desc;
+       struct c2_rxp_hdr *rxp_hdr = (struct c2_rxp_hdr *) elem->skb->data;
+
+       if (rxp_hdr->status != RXP_HRXD_OK ||
+           rxp_hdr->len > (rx_desc->len - sizeof(*rxp_hdr))) {
+               pr_debug("BAD RXP_HRXD\n");
+               pr_debug("  rx_desc : %p\n", rx_desc);
+               pr_debug("    index : %Zu\n",
+                       elem - c2_port->rx_ring.start);
+               pr_debug("    len   : %u\n", rx_desc->len);
+               pr_debug("  rxp_hdr : %p [PA %p]\n", rxp_hdr,
+                       (void *) __pa((unsigned long) rxp_hdr));
+               pr_debug("    flags : 0x%x\n", rxp_hdr->flags);
+               pr_debug("    status: 0x%x\n", rxp_hdr->status);
+               pr_debug("    len   : %u\n", rxp_hdr->len);
+               pr_debug("    rsvd  : 0x%x\n", rxp_hdr->rsvd);
+       }
+
+       /* Setup the skb for reuse since we're dropping this pkt */
+       elem->skb->data = elem->skb->head;
+       skb_reset_tail_pointer(elem->skb);
+
+       /* Zero out the rxp hdr in the sk_buff */
+       memset(elem->skb->data, 0, sizeof(*rxp_hdr));
+
+       /* Write the descriptor to the adapter's rx ring */
+       __raw_writew(0, elem->hw_desc + C2_RXP_STATUS);
+       __raw_writew(0, elem->hw_desc + C2_RXP_COUNT);
+       __raw_writew((__force u16) cpu_to_be16((u16) elem->maplen - sizeof(*rxp_hdr)),
+                    elem->hw_desc + C2_RXP_LEN);
+       __raw_writeq((__force u64) cpu_to_be64(elem->mapaddr),
+                    elem->hw_desc + C2_RXP_ADDR);
+       __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_READY),
+                    elem->hw_desc + C2_RXP_FLAGS);
+
+       pr_debug("packet dropped\n");
+       c2_port->netdev->stats.rx_dropped++;
+}
+
+static void c2_rx_interrupt(struct net_device *netdev)
+{
+       struct c2_port *c2_port = netdev_priv(netdev);
+       struct c2_dev *c2dev = c2_port->c2dev;
+       struct c2_ring *rx_ring = &c2_port->rx_ring;
+       struct c2_element *elem;
+       struct c2_rx_desc *rx_desc;
+       struct c2_rxp_hdr *rxp_hdr;
+       struct sk_buff *skb;
+       dma_addr_t mapaddr;
+       u32 maplen, buflen;
+       unsigned long flags;
+
+       spin_lock_irqsave(&c2dev->lock, flags);
+
+       /* Begin where we left off */
+       rx_ring->to_clean = rx_ring->start + c2dev->cur_rx;
+
+       for (elem = rx_ring->to_clean; elem->next != rx_ring->to_clean;
+            elem = elem->next) {
+               rx_desc = elem->ht_desc;
+               mapaddr = elem->mapaddr;
+               maplen = elem->maplen;
+               skb = elem->skb;
+               rxp_hdr = (struct c2_rxp_hdr *) skb->data;
+
+               if (rxp_hdr->flags != RXP_HRXD_DONE)
+                       break;
+               buflen = rxp_hdr->len;
+
+               /* Sanity check the RXP header */
+               if (rxp_hdr->status != RXP_HRXD_OK ||
+                   buflen > (rx_desc->len - sizeof(*rxp_hdr))) {
+                       c2_rx_error(c2_port, elem);
+                       continue;
+               }
+
+               /*
+                * Allocate and map a new skb for replenishing the host
+                * RX desc
+                */
+               if (c2_rx_alloc(c2_port, elem)) {
+                       c2_rx_error(c2_port, elem);
+                       continue;
+               }
+
+               /* Unmap the old skb */
+               pci_unmap_single(c2dev->pcidev, mapaddr, maplen,
+                                PCI_DMA_FROMDEVICE);
+
+               prefetch(skb->data);
+
+               /*
+                * Skip past the leading 8 bytes of the
+                * "struct c2_rxp_hdr" that the adapter prepends
+                * to the usual Ethernet header ("struct ethhdr"),
+                * so skb->data points at the start of the raw
+                * Ethernet packet.
+                *
+                * Fix up the various fields in the sk_buff before
+                * passing it up to netif_rx(). The transfer size
+                * (in bytes) reported in the adapter's len field of
+                * "struct c2_rxp_hdr" does NOT include
+                * sizeof(struct c2_rxp_hdr).
+                */
+               skb->data += sizeof(*rxp_hdr);
+               skb_set_tail_pointer(skb, buflen);
+               skb->len = buflen;
+               skb->protocol = eth_type_trans(skb, netdev);
+
+               netif_rx(skb);
+
+               netdev->stats.rx_packets++;
+               netdev->stats.rx_bytes += buflen;
+       }
+
+       /* Save where we left off */
+       rx_ring->to_clean = elem;
+       c2dev->cur_rx = elem - rx_ring->start;
+       C2_SET_CUR_RX(c2dev, c2dev->cur_rx);
+
+       spin_unlock_irqrestore(&c2dev->lock, flags);
+}
+
+/*
+ * Handle netisr0 TX & RX interrupts.
+ */
+static irqreturn_t c2_interrupt(int irq, void *dev_id)
+{
+       unsigned int netisr0, dmaisr;
+       int handled = 0;
+       struct c2_dev *c2dev = (struct c2_dev *) dev_id;
+
+       /* Process CCILNET interrupts */
+       netisr0 = readl(c2dev->regs + C2_NISR0);
+       if (netisr0) {
+
+               /*
+                * There is an issue with the firmware that always
+                * provides the status of RX for both TX & RX
+                * interrupts.  So process both queues here.
+                */
+               c2_rx_interrupt(c2dev->netdev);
+               c2_tx_interrupt(c2dev->netdev);
+
+               /* Clear the interrupt */
+               writel(netisr0, c2dev->regs + C2_NISR0);
+               handled++;
+       }
+
+       /* Process RNIC interrupts */
+       dmaisr = readl(c2dev->regs + C2_DISR);
+       if (dmaisr) {
+               writel(dmaisr, c2dev->regs + C2_DISR);
+               c2_rnic_interrupt(c2dev);
+               handled++;
+       }
+
+       if (handled) {
+               return IRQ_HANDLED;
+       } else {
+               return IRQ_NONE;
+       }
+}
+
+static int c2_up(struct net_device *netdev)
+{
+       struct c2_port *c2_port = netdev_priv(netdev);
+       struct c2_dev *c2dev = c2_port->c2dev;
+       struct c2_element *elem;
+       struct c2_rxp_hdr *rxp_hdr;
+       struct in_device *in_dev;
+       size_t rx_size, tx_size;
+       int ret, i;
+       unsigned int netimr0;
+
+       if (netif_msg_ifup(c2_port))
+               pr_debug("%s: enabling interface\n", netdev->name);
+
+       /* Set the Rx buffer size based on MTU */
+       c2_set_rxbufsize(c2_port);
+
+       /* Allocate DMA'able memory for Tx/Rx host descriptor rings */
+       rx_size = c2_port->rx_ring.count * sizeof(struct c2_rx_desc);
+       tx_size = c2_port->tx_ring.count * sizeof(struct c2_tx_desc);
+
+       c2_port->mem_size = tx_size + rx_size;
+       c2_port->mem = pci_zalloc_consistent(c2dev->pcidev, c2_port->mem_size,
+                                            &c2_port->dma);
+       if (c2_port->mem == NULL) {
+               pr_debug("Unable to allocate memory for "
+                       "host descriptor rings\n");
+               return -ENOMEM;
+       }
+
+       /* Create the Rx host descriptor ring */
+       if ((ret =
+            c2_rx_ring_alloc(&c2_port->rx_ring, c2_port->mem, c2_port->dma,
+                             c2dev->mmio_rxp_ring))) {
+               pr_debug("Unable to create RX ring\n");
+               goto bail0;
+       }
+
+       /* Allocate Rx buffers for the host descriptor ring */
+       if (c2_rx_fill(c2_port)) {
+               pr_debug("Unable to fill RX ring\n");
+               goto bail1;
+       }
+
+       /* Create the Tx host descriptor ring */
+       if ((ret = c2_tx_ring_alloc(&c2_port->tx_ring, c2_port->mem + rx_size,
+                                   c2_port->dma + rx_size,
+                                   c2dev->mmio_txp_ring))) {
+               pr_debug("Unable to create TX ring\n");
+               goto bail1;
+       }
+
+       /* Set the TX pointer to where we left off */
+       c2_port->tx_avail = c2_port->tx_ring.count - 1;
+       c2_port->tx_ring.to_use = c2_port->tx_ring.to_clean =
+           c2_port->tx_ring.start + c2dev->cur_tx;
+
+       /* missing: Initialize MAC */
+
+       BUG_ON(c2_port->tx_ring.to_use != c2_port->tx_ring.to_clean);
+
+       /* Reset the adapter, ensures the driver is in sync with the RXP */
+       c2_reset(c2_port);
+
+       /* Reset the READY bit in the sk_buff RXP headers & adapter HRXDQ */
+       for (i = 0, elem = c2_port->rx_ring.start; i < c2_port->rx_ring.count;
+            i++, elem++) {
+               rxp_hdr = (struct c2_rxp_hdr *) elem->skb->data;
+               rxp_hdr->flags = 0;
+               __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_READY),
+                            elem->hw_desc + C2_RXP_FLAGS);
+       }
+
+       /* Enable network packets */
+       netif_start_queue(netdev);
+
+       /* Enable IRQ */
+       writel(0, c2dev->regs + C2_IDIS);
+       netimr0 = readl(c2dev->regs + C2_NIMR0);
+       netimr0 &= ~(C2_PCI_HTX_INT | C2_PCI_HRX_INT);
+       writel(netimr0, c2dev->regs + C2_NIMR0);
+
+       /* Tell the stack to ignore arp requests for ipaddrs bound to
+        * other interfaces.  This is needed to prevent the host stack
+        * from responding to arp requests to the ipaddr bound on the
+        * rdma interface.
+        */
+       in_dev = in_dev_get(netdev);
+       IN_DEV_CONF_SET(in_dev, ARP_IGNORE, 1);
+       in_dev_put(in_dev);
+
+       return 0;
+
+      bail1:
+       c2_rx_clean(c2_port);
+       kfree(c2_port->rx_ring.start);
+
+      bail0:
+       pci_free_consistent(c2dev->pcidev, c2_port->mem_size, c2_port->mem,
+                           c2_port->dma);
+
+       return ret;
+}
+
+static int c2_down(struct net_device *netdev)
+{
+       struct c2_port *c2_port = netdev_priv(netdev);
+       struct c2_dev *c2dev = c2_port->c2dev;
+
+       if (netif_msg_ifdown(c2_port))
+               pr_debug("%s: disabling interface\n",
+                       netdev->name);
+
+       /* Wait for all the queued packets to get sent */
+       c2_tx_interrupt(netdev);
+
+       /* Disable network packets */
+       netif_stop_queue(netdev);
+
+       /* Disable IRQs by clearing the interrupt mask */
+       writel(1, c2dev->regs + C2_IDIS);
+       writel(0, c2dev->regs + C2_NIMR0);
+
+       /* missing: Stop transmitter */
+
+       /* missing: Stop receiver */
+
+       /* Reset the adapter, ensures the driver is in sync with the RXP */
+       c2_reset(c2_port);
+
+       /* missing: Turn off LEDs here */
+
+       /* Free all buffers in the host descriptor rings */
+       c2_tx_clean(c2_port);
+       c2_rx_clean(c2_port);
+
+       /* Free the host descriptor rings */
+       kfree(c2_port->rx_ring.start);
+       kfree(c2_port->tx_ring.start);
+       pci_free_consistent(c2dev->pcidev, c2_port->mem_size, c2_port->mem,
+                           c2_port->dma);
+
+       return 0;
+}
+
+static void c2_reset(struct c2_port *c2_port)
+{
+       struct c2_dev *c2dev = c2_port->c2dev;
+       unsigned int cur_rx = c2dev->cur_rx;
+
+       /* Tell the hardware to quiesce */
+       C2_SET_CUR_RX(c2dev, cur_rx | C2_PCI_HRX_QUI);
+
+       /*
+        * The hardware will reset the C2_PCI_HRX_QUI bit once
+        * the RXP is quiesced.  Wait 2 seconds for this.
+        */
+       ssleep(2);
+
+       cur_rx = C2_GET_CUR_RX(c2dev);
+
+       if (cur_rx & C2_PCI_HRX_QUI)
+               pr_debug("c2_reset: failed to quiesce the hardware!\n");
+
+       cur_rx &= ~C2_PCI_HRX_QUI;
+
+       c2dev->cur_rx = cur_rx;
+
+       pr_debug("Current RX: %u\n", c2dev->cur_rx);
+}
+
+static int c2_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
+{
+       struct c2_port *c2_port = netdev_priv(netdev);
+       struct c2_dev *c2dev = c2_port->c2dev;
+       struct c2_ring *tx_ring = &c2_port->tx_ring;
+       struct c2_element *elem;
+       dma_addr_t mapaddr;
+       u32 maplen;
+       unsigned long flags;
+       unsigned int i;
+
+       spin_lock_irqsave(&c2_port->tx_lock, flags);
+
+       if (unlikely(c2_port->tx_avail < (skb_shinfo(skb)->nr_frags + 1))) {
+               netif_stop_queue(netdev);
+               spin_unlock_irqrestore(&c2_port->tx_lock, flags);
+
+               pr_debug("%s: Tx ring full when queue awake!\n",
+                       netdev->name);
+               return NETDEV_TX_BUSY;
+       }
+
+       maplen = skb_headlen(skb);
+       mapaddr =
+           pci_map_single(c2dev->pcidev, skb->data, maplen, PCI_DMA_TODEVICE);
+
+       elem = tx_ring->to_use;
+       elem->skb = skb;
+       elem->mapaddr = mapaddr;
+       elem->maplen = maplen;
+
+       /* Tell HW to xmit */
+       __raw_writeq((__force u64) cpu_to_be64(mapaddr),
+                    elem->hw_desc + C2_TXP_ADDR);
+       __raw_writew((__force u16) cpu_to_be16(maplen),
+                    elem->hw_desc + C2_TXP_LEN);
+       __raw_writew((__force u16) cpu_to_be16(TXP_HTXD_READY),
+                    elem->hw_desc + C2_TXP_FLAGS);
+
+       netdev->stats.tx_packets++;
+       netdev->stats.tx_bytes += maplen;
+
+       /* Loop thru additional data fragments and queue them */
+       if (skb_shinfo(skb)->nr_frags) {
+               for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+                       const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+                       maplen = skb_frag_size(frag);
+                       mapaddr = skb_frag_dma_map(&c2dev->pcidev->dev, frag,
+                                                  0, maplen, DMA_TO_DEVICE);
+                       elem = elem->next;
+                       elem->skb = NULL;
+                       elem->mapaddr = mapaddr;
+                       elem->maplen = maplen;
+
+                       /* Tell HW to xmit */
+                       __raw_writeq((__force u64) cpu_to_be64(mapaddr),
+                                    elem->hw_desc + C2_TXP_ADDR);
+                       __raw_writew((__force u16) cpu_to_be16(maplen),
+                                    elem->hw_desc + C2_TXP_LEN);
+                       __raw_writew((__force u16) cpu_to_be16(TXP_HTXD_READY),
+                                    elem->hw_desc + C2_TXP_FLAGS);
+
+                       netdev->stats.tx_packets++;
+                       netdev->stats.tx_bytes += maplen;
+               }
+       }
+
+       tx_ring->to_use = elem->next;
+       c2_port->tx_avail -= (skb_shinfo(skb)->nr_frags + 1);
+
+       if (c2_port->tx_avail <= MAX_SKB_FRAGS + 1) {
+               netif_stop_queue(netdev);
+               if (netif_msg_tx_queued(c2_port))
+                       pr_debug("%s: transmit queue full\n",
+                               netdev->name);
+       }
+
+       spin_unlock_irqrestore(&c2_port->tx_lock, flags);
+
+       netdev->trans_start = jiffies;
+
+       return NETDEV_TX_OK;
+}
+
+static void c2_tx_timeout(struct net_device *netdev)
+{
+       struct c2_port *c2_port = netdev_priv(netdev);
+
+       if (netif_msg_timer(c2_port))
+               pr_debug("%s: tx timeout\n", netdev->name);
+
+       c2_tx_clean(c2_port);
+}
+
+static int c2_change_mtu(struct net_device *netdev, int new_mtu)
+{
+       int ret = 0;
+
+       if (new_mtu < ETH_ZLEN || new_mtu > ETH_JUMBO_MTU)
+               return -EINVAL;
+
+       netdev->mtu = new_mtu;
+
+       if (netif_running(netdev)) {
+               c2_down(netdev);
+
+               c2_up(netdev);
+       }
+
+       return ret;
+}
+
+static const struct net_device_ops c2_netdev = {
+       .ndo_open               = c2_up,
+       .ndo_stop               = c2_down,
+       .ndo_start_xmit         = c2_xmit_frame,
+       .ndo_tx_timeout         = c2_tx_timeout,
+       .ndo_change_mtu         = c2_change_mtu,
+       .ndo_set_mac_address    = eth_mac_addr,
+       .ndo_validate_addr      = eth_validate_addr,
+};
+
+/* Initialize network device */
+static struct net_device *c2_devinit(struct c2_dev *c2dev,
+                                    void __iomem * mmio_addr)
+{
+       struct c2_port *c2_port = NULL;
+       struct net_device *netdev = alloc_etherdev(sizeof(*c2_port));
+
+       if (!netdev) {
+               pr_debug("c2_port etherdev alloc failed\n");
+               return NULL;
+       }
+
+       SET_NETDEV_DEV(netdev, &c2dev->pcidev->dev);
+
+       netdev->netdev_ops = &c2_netdev;
+       netdev->watchdog_timeo = C2_TX_TIMEOUT;
+       netdev->irq = c2dev->pcidev->irq;
+
+       c2_port = netdev_priv(netdev);
+       c2_port->netdev = netdev;
+       c2_port->c2dev = c2dev;
+       c2_port->msg_enable = netif_msg_init(debug, default_msg);
+       c2_port->tx_ring.count = C2_NUM_TX_DESC;
+       c2_port->rx_ring.count = C2_NUM_RX_DESC;
+
+       spin_lock_init(&c2_port->tx_lock);
+
+       /* Copy our 48-bit ethernet hardware address */
+       memcpy_fromio(netdev->dev_addr, mmio_addr + C2_REGS_ENADDR, 6);
+
+       /* Validate the MAC address */
+       if (!is_valid_ether_addr(netdev->dev_addr)) {
+               pr_debug("Invalid MAC Address\n");
+               c2_print_macaddr(netdev);
+               free_netdev(netdev);
+               return NULL;
+       }
+
+       c2dev->netdev = netdev;
+
+       return netdev;
+}
+
+static int c2_probe(struct pci_dev *pcidev, const struct pci_device_id *ent)
+{
+       int ret = 0, i;
+       unsigned long reg0_start, reg0_flags, reg0_len;
+       unsigned long reg2_start, reg2_flags, reg2_len;
+       unsigned long reg4_start, reg4_flags, reg4_len;
+       unsigned kva_map_size;
+       struct net_device *netdev = NULL;
+       struct c2_dev *c2dev = NULL;
+       void __iomem *mmio_regs = NULL;
+
+       printk(KERN_INFO PFX "AMSO1100 Gigabit Ethernet driver v%s loaded\n",
+               DRV_VERSION);
+
+       /* Enable PCI device */
+       ret = pci_enable_device(pcidev);
+       if (ret) {
+               printk(KERN_ERR PFX "%s: Unable to enable PCI device\n",
+                       pci_name(pcidev));
+               goto bail0;
+       }
+
+       reg0_start = pci_resource_start(pcidev, BAR_0);
+       reg0_len = pci_resource_len(pcidev, BAR_0);
+       reg0_flags = pci_resource_flags(pcidev, BAR_0);
+
+       reg2_start = pci_resource_start(pcidev, BAR_2);
+       reg2_len = pci_resource_len(pcidev, BAR_2);
+       reg2_flags = pci_resource_flags(pcidev, BAR_2);
+
+       reg4_start = pci_resource_start(pcidev, BAR_4);
+       reg4_len = pci_resource_len(pcidev, BAR_4);
+       reg4_flags = pci_resource_flags(pcidev, BAR_4);
+
+       pr_debug("BAR0 size = 0x%lX bytes\n", reg0_len);
+       pr_debug("BAR2 size = 0x%lX bytes\n", reg2_len);
+       pr_debug("BAR4 size = 0x%lX bytes\n", reg4_len);
+
+       /* Make sure the PCI base addresses are MMIO resources */
+       if (!(reg0_flags & IORESOURCE_MEM) ||
+           !(reg2_flags & IORESOURCE_MEM) || !(reg4_flags & IORESOURCE_MEM)) {
+               printk(KERN_ERR PFX "PCI regions not an MMIO resource\n");
+               ret = -ENODEV;
+               goto bail1;
+       }
+
+       /* Check for weird/broken PCI region reporting */
+       if ((reg0_len < C2_REG0_SIZE) ||
+           (reg2_len < C2_REG2_SIZE) || (reg4_len < C2_REG4_SIZE)) {
+               printk(KERN_ERR PFX "Invalid PCI region sizes\n");
+               ret = -ENODEV;
+               goto bail1;
+       }
+
+       /* Reserve PCI I/O and memory resources */
+       ret = pci_request_regions(pcidev, DRV_NAME);
+       if (ret) {
+               printk(KERN_ERR PFX "%s: Unable to request regions\n",
+                       pci_name(pcidev));
+               goto bail1;
+       }
+
+       if ((sizeof(dma_addr_t) > 4)) {
+               ret = pci_set_dma_mask(pcidev, DMA_BIT_MASK(64));
+               if (ret < 0) {
+                       printk(KERN_ERR PFX "64b DMA configuration failed\n");
+                       goto bail2;
+               }
+       } else {
+               ret = pci_set_dma_mask(pcidev, DMA_BIT_MASK(32));
+               if (ret < 0) {
+                       printk(KERN_ERR PFX "32b DMA configuration failed\n");
+                       goto bail2;
+               }
+       }
+
+       /* Enables bus-mastering on the device */
+       pci_set_master(pcidev);
+
+       /* Remap the adapter PCI registers in BAR4 */
+       mmio_regs = ioremap_nocache(reg4_start + C2_PCI_REGS_OFFSET,
+                                   sizeof(struct c2_adapter_pci_regs));
+       if (!mmio_regs) {
+               printk(KERN_ERR PFX
+                       "Unable to remap adapter PCI registers in BAR4\n");
+               ret = -EIO;
+               goto bail2;
+       }
+
+       /* Validate PCI regs magic */
+       for (i = 0; i < sizeof(c2_magic); i++) {
+               if (c2_magic[i] != readb(mmio_regs + C2_REGS_MAGIC + i)) {
+                       printk(KERN_ERR PFX "Downlevel Firmware boot loader "
+                               "[%d/%Zd: got 0x%x, exp 0x%x]. Use the cc_flash "
+                              "utility to update your boot loader\n",
+                               i + 1, sizeof(c2_magic),
+                               readb(mmio_regs + C2_REGS_MAGIC + i),
+                               c2_magic[i]);
+                       printk(KERN_ERR PFX "Adapter not claimed\n");
+                       iounmap(mmio_regs);
+                       ret = -EIO;
+                       goto bail2;
+               }
+       }
+
+       /* Validate the adapter version */
+       if (be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_VERS)) != C2_VERSION) {
+               printk(KERN_ERR PFX "Version mismatch "
+                       "[fw=%u, c2=%u], Adapter not claimed\n",
+                       be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_VERS)),
+                       C2_VERSION);
+               ret = -EINVAL;
+               iounmap(mmio_regs);
+               goto bail2;
+       }
+
+       /* Validate the adapter IVN */
+       if (be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_IVN)) != C2_IVN) {
+               printk(KERN_ERR PFX "Downlevel firmware. You should be using "
+                      "the OpenIB device support kit. "
+                      "[fw=0x%x, c2=0x%x], Adapter not claimed\n",
+                      be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_IVN)),
+                      C2_IVN);
+               ret = -EINVAL;
+               iounmap(mmio_regs);
+               goto bail2;
+       }
+
+       /* Allocate hardware structure */
+       c2dev = (struct c2_dev *) ib_alloc_device(sizeof(*c2dev));
+       if (!c2dev) {
+               printk(KERN_ERR PFX "%s: Unable to alloc hardware struct\n",
+                       pci_name(pcidev));
+               ret = -ENOMEM;
+               iounmap(mmio_regs);
+               goto bail2;
+       }
+
+       memset(c2dev, 0, sizeof(*c2dev));
+       spin_lock_init(&c2dev->lock);
+       c2dev->pcidev = pcidev;
+       c2dev->cur_tx = 0;
+
+       /* Get the last RX index */
+       c2dev->cur_rx =
+           (be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_HRX_CUR)) -
+            0xffffc000) / sizeof(struct c2_rxp_desc);
+
+       /* Request an interrupt line for the driver */
+       ret = request_irq(pcidev->irq, c2_interrupt, IRQF_SHARED, DRV_NAME, c2dev);
+       if (ret) {
+               printk(KERN_ERR PFX "%s: requested IRQ %u is busy\n",
+                       pci_name(pcidev), pcidev->irq);
+               iounmap(mmio_regs);
+               goto bail3;
+       }
+
+       /* Set driver specific data */
+       pci_set_drvdata(pcidev, c2dev);
+
+       /* Initialize network device */
+       if ((netdev = c2_devinit(c2dev, mmio_regs)) == NULL) {
+               ret = -ENOMEM;
+               iounmap(mmio_regs);
+               goto bail4;
+       }
+
+       /* Save off the actual size prior to unmapping mmio_regs */
+       kva_map_size = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_PCI_WINSIZE));
+
+       /* Unmap the adapter PCI registers in BAR4 */
+       iounmap(mmio_regs);
+
+       /* Register network device */
+       ret = register_netdev(netdev);
+       if (ret) {
+               printk(KERN_ERR PFX "Unable to register netdev, ret = %d\n",
+                       ret);
+               goto bail5;
+       }
+
+       /* Disable network packets */
+       netif_stop_queue(netdev);
+
+       /* Remap the adapter HRXDQ PA space to kernel VA space */
+       c2dev->mmio_rxp_ring = ioremap_nocache(reg4_start + C2_RXP_HRXDQ_OFFSET,
+                                              C2_RXP_HRXDQ_SIZE);
+       if (!c2dev->mmio_rxp_ring) {
+               printk(KERN_ERR PFX "Unable to remap MMIO HRXDQ region\n");
+               ret = -EIO;
+               goto bail6;
+       }
+
+       /* Remap the adapter HTXDQ PA space to kernel VA space */
+       c2dev->mmio_txp_ring = ioremap_nocache(reg4_start + C2_TXP_HTXDQ_OFFSET,
+                                              C2_TXP_HTXDQ_SIZE);
+       if (!c2dev->mmio_txp_ring) {
+               printk(KERN_ERR PFX "Unable to remap MMIO HTXDQ region\n");
+               ret = -EIO;
+               goto bail7;
+       }
+
+       /* Save off the current RX index in the last 4 bytes of the TXP Ring */
+       C2_SET_CUR_RX(c2dev, c2dev->cur_rx);
+
+       /* Remap the PCI registers in adapter BAR0 to kernel VA space */
+       c2dev->regs = ioremap_nocache(reg0_start, reg0_len);
+       if (!c2dev->regs) {
+               printk(KERN_ERR PFX "Unable to remap BAR0\n");
+               ret = -EIO;
+               goto bail8;
+       }
+
+       /* Remap the PCI registers in adapter BAR4 to kernel VA space */
+       c2dev->pa = reg4_start + C2_PCI_REGS_OFFSET;
+       c2dev->kva = ioremap_nocache(reg4_start + C2_PCI_REGS_OFFSET,
+                                    kva_map_size);
+       if (!c2dev->kva) {
+               printk(KERN_ERR PFX "Unable to remap BAR4\n");
+               ret = -EIO;
+               goto bail9;
+       }
+
+       /* Print out the MAC address */
+       c2_print_macaddr(netdev);
+
+       ret = c2_rnic_init(c2dev);
+       if (ret) {
+               printk(KERN_ERR PFX "c2_rnic_init failed: %d\n", ret);
+               goto bail10;
+       }
+
+       ret = c2_register_device(c2dev);
+       if (ret)
+               goto bail10;
+
+       return 0;
+
+ bail10:
+       iounmap(c2dev->kva);
+
+ bail9:
+       iounmap(c2dev->regs);
+
+ bail8:
+       iounmap(c2dev->mmio_txp_ring);
+
+ bail7:
+       iounmap(c2dev->mmio_rxp_ring);
+
+ bail6:
+       unregister_netdev(netdev);
+
+ bail5:
+       free_netdev(netdev);
+
+ bail4:
+       free_irq(pcidev->irq, c2dev);
+
+ bail3:
+       ib_dealloc_device(&c2dev->ibdev);
+
+ bail2:
+       pci_release_regions(pcidev);
+
+ bail1:
+       pci_disable_device(pcidev);
+
+ bail0:
+       return ret;
+}
+
+static void c2_remove(struct pci_dev *pcidev)
+{
+       struct c2_dev *c2dev = pci_get_drvdata(pcidev);
+       struct net_device *netdev = c2dev->netdev;
+
+       /* Unregister with OpenIB */
+       c2_unregister_device(c2dev);
+
+       /* Clean up the RNIC resources */
+       c2_rnic_term(c2dev);
+
+       /* Remove network device from the kernel */
+       unregister_netdev(netdev);
+
+       /* Free network device */
+       free_netdev(netdev);
+
+       /* Free the interrupt line */
+       free_irq(pcidev->irq, c2dev);
+
+       /* missing: Turn LEDs off here */
+
+       /* Unmap adapter PA space */
+       iounmap(c2dev->kva);
+       iounmap(c2dev->regs);
+       iounmap(c2dev->mmio_txp_ring);
+       iounmap(c2dev->mmio_rxp_ring);
+
+       /* Free the hardware structure */
+       ib_dealloc_device(&c2dev->ibdev);
+
+       /* Release reserved PCI I/O and memory resources */
+       pci_release_regions(pcidev);
+
+       /* Disable PCI device */
+       pci_disable_device(pcidev);
+
+       /* Clear driver specific data */
+       pci_set_drvdata(pcidev, NULL);
+}
+
+static struct pci_driver c2_pci_driver = {
+       .name = DRV_NAME,
+       .id_table = c2_pci_table,
+       .probe = c2_probe,
+       .remove = c2_remove,
+};
+
+module_pci_driver(c2_pci_driver);
diff --git a/drivers/staging/rdma/amso1100/c2.h b/drivers/staging/rdma/amso1100/c2.h
new file mode 100644 (file)
index 0000000..d619d73
--- /dev/null
@@ -0,0 +1,547 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __C2_H
+#define __C2_H
+
+#include <linux/netdevice.h>
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/idr.h>
+
+#include "c2_provider.h"
+#include "c2_mq.h"
+#include "c2_status.h"
+
+#define DRV_NAME     "c2"
+#define DRV_VERSION  "1.1"
+#define PFX          DRV_NAME ": "
+
+#define BAR_0                0
+#define BAR_2                2
+#define BAR_4                4
+
+#define RX_BUF_SIZE         (1536 + 8)
+#define ETH_JUMBO_MTU        9000
+#define C2_MAGIC            "CEPHEUS"
+#define C2_VERSION           4
+#define C2_IVN              (18 & 0x7fffffff)
+
+#define C2_REG0_SIZE        (16 * 1024)
+#define C2_REG2_SIZE        (2 * 1024 * 1024)
+#define C2_REG4_SIZE        (256 * 1024 * 1024)
+#define C2_NUM_TX_DESC       341
+#define C2_NUM_RX_DESC       256
+#define C2_PCI_REGS_OFFSET  (0x10000)
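+/* The adapter's host RX and TX descriptor queues are mapped starting at the midpoint of BAR4, one 4 KB window each, back to back. */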
+#define C2_RXP_HRXDQ_OFFSET (((C2_REG4_SIZE)/2))
+#define C2_RXP_HRXDQ_SIZE   (4096)
+#define C2_TXP_HTXDQ_OFFSET (((C2_REG4_SIZE)/2) + C2_RXP_HRXDQ_SIZE)
+#define C2_TXP_HTXDQ_SIZE   (4096)
+#define C2_TX_TIMEOUT      (6*HZ)
+
+/* CEPHEUS */
+static const u8 c2_magic[] = {
+       0x43, 0x45, 0x50, 0x48, 0x45, 0x55, 0x53
+};
+
+enum adapter_pci_regs {
+       C2_REGS_MAGIC = 0x0000,
+       C2_REGS_VERS = 0x0008,
+       C2_REGS_IVN = 0x000C,
+       C2_REGS_PCI_WINSIZE = 0x0010,
+       C2_REGS_Q0_QSIZE = 0x0014,
+       C2_REGS_Q0_MSGSIZE = 0x0018,
+       C2_REGS_Q0_POOLSTART = 0x001C,
+       C2_REGS_Q0_SHARED = 0x0020,
+       C2_REGS_Q1_QSIZE = 0x0024,
+       C2_REGS_Q1_MSGSIZE = 0x0028,
+       C2_REGS_Q1_SHARED = 0x0030,
+       C2_REGS_Q2_QSIZE = 0x0034,
+       C2_REGS_Q2_MSGSIZE = 0x0038,
+       C2_REGS_Q2_SHARED = 0x0040,
+       C2_REGS_ENADDR = 0x004C,
+       C2_REGS_RDMA_ENADDR = 0x0054,
+       C2_REGS_HRX_CUR = 0x006C,
+};
+
+struct c2_adapter_pci_regs {
+       char reg_magic[8];
+       u32 version;
+       u32 ivn;
+       u32 pci_window_size;
+       u32 q0_q_size;
+       u32 q0_msg_size;
+       u32 q0_pool_start;
+       u32 q0_shared;
+       u32 q1_q_size;
+       u32 q1_msg_size;
+       u32 q1_pool_start;
+       u32 q1_shared;
+       u32 q2_q_size;
+       u32 q2_msg_size;
+       u32 q2_pool_start;
+       u32 q2_shared;
+       u32 log_start;
+       u32 log_size;
+       u8 host_enaddr[8];
+       u8 rdma_enaddr[8];
+       u32 crash_entry;
+       u32 crash_ready[2];
+       u32 fw_txd_cur;
+       u32 fw_hrxd_cur;
+       u32 fw_rxd_cur;
+};
+
+enum pci_regs {
+       C2_HISR = 0x0000,
+       C2_DISR = 0x0004,
+       C2_HIMR = 0x0008,
+       C2_DIMR = 0x000C,
+       C2_NISR0 = 0x0010,
+       C2_NISR1 = 0x0014,
+       C2_NIMR0 = 0x0018,
+       C2_NIMR1 = 0x001C,
+       C2_IDIS = 0x0020,
+};
+
+enum {
+       C2_PCI_HRX_INT = 1 << 8,
+       C2_PCI_HTX_INT = 1 << 17,
+       C2_PCI_HRX_QUI = 1 << 31,
+};
+
+/*
+ * Cepheus registers in BAR0.
+ */
+struct c2_pci_regs {
+       u32 hostisr;
+       u32 dmaisr;
+       u32 hostimr;
+       u32 dmaimr;
+       u32 netisr0;
+       u32 netisr1;
+       u32 netimr0;
+       u32 netimr1;
+       u32 int_disable;
+};
+
+/* TXP flags */
+enum c2_txp_flags {
+       TXP_HTXD_DONE = 0,
+       TXP_HTXD_READY = 1 << 0,
+       TXP_HTXD_UNINIT = 1 << 1,
+};
+
+/* RXP flags */
+enum c2_rxp_flags {
+       RXP_HRXD_UNINIT = 0,
+       RXP_HRXD_READY = 1 << 0,
+       RXP_HRXD_DONE = 1 << 1,
+};
+
+/* RXP status */
+enum c2_rxp_status {
+       RXP_HRXD_ZERO = 0,
+       RXP_HRXD_OK = 1 << 0,
+       RXP_HRXD_BUF_OV = 1 << 1,
+};
+
+/* TXP descriptor fields */
+enum txp_desc {
+       C2_TXP_FLAGS = 0x0000,
+       C2_TXP_LEN = 0x0002,
+       C2_TXP_ADDR = 0x0004,
+};
+
+/* RXP descriptor fields */
+enum rxp_desc {
+       C2_RXP_FLAGS = 0x0000,
+       C2_RXP_STATUS = 0x0002,
+       C2_RXP_COUNT = 0x0004,
+       C2_RXP_LEN = 0x0006,
+       C2_RXP_ADDR = 0x0008,
+};
+
+struct c2_txp_desc {
+       u16 flags;
+       u16 len;
+       u64 addr;
+} __attribute__ ((packed));
+
+struct c2_rxp_desc {
+       u16 flags;
+       u16 status;
+       u16 count;
+       u16 len;
+       u64 addr;
+} __attribute__ ((packed));
+
+struct c2_rxp_hdr {
+       u16 flags;
+       u16 status;
+       u16 len;
+       u16 rsvd;
+} __attribute__ ((packed));
+
+struct c2_tx_desc {
+       u32 len;
+       u32 status;
+       dma_addr_t next_offset;
+};
+
+struct c2_rx_desc {
+       u32 len;
+       u32 status;
+       dma_addr_t next_offset;
+};
+
+struct c2_alloc {
+       u32 last;
+       u32 max;
+       spinlock_t lock;
+       unsigned long *table;
+};
+
+struct c2_array {
+       struct {
+               void **page;
+               int used;
+       } *page_list;
+};
+
+/*
+ * The MQ shared pointer pool is organized as a linked list of
+ * chunks. Each chunk contains a linked list of free shared pointers
+ * that can be allocated to a given user mode client.
+ *
+ */
+struct sp_chunk {
+       struct sp_chunk *next;
+       dma_addr_t dma_addr;
+       DEFINE_DMA_UNMAP_ADDR(mapping);
+       u16 head;
+       u16 shared_ptr[0];
+};
+
+struct c2_pd_table {
+       u32 last;
+       u32 max;
+       spinlock_t lock;
+       unsigned long *table;
+};
+
+struct c2_qp_table {
+       struct idr idr;
+       spinlock_t lock;
+};
+
+struct c2_element {
+       struct c2_element *next;
+       void *ht_desc;          /* host     descriptor */
+       void __iomem *hw_desc;  /* hardware descriptor */
+       struct sk_buff *skb;
+       dma_addr_t mapaddr;
+       u32 maplen;
+};
+
+struct c2_ring {
+       struct c2_element *to_clean;
+       struct c2_element *to_use;
+       struct c2_element *start;
+       unsigned long count;
+};
+
+struct c2_dev {
+       struct ib_device ibdev;
+       void __iomem *regs;
+       void __iomem *mmio_txp_ring; /* remapped adapter memory for hw rings */
+       void __iomem *mmio_rxp_ring;
+       spinlock_t lock;
+       struct pci_dev *pcidev;
+       struct net_device *netdev;
+       struct net_device *pseudo_netdev;
+       unsigned int cur_tx;
+       unsigned int cur_rx;
+       u32 adapter_handle;
+       int device_cap_flags;
+       void __iomem *kva;      /* KVA device memory */
+       unsigned long pa;       /* PA device memory */
+       void **qptr_array;
+
+       struct kmem_cache *host_msg_cache;
+
+       struct list_head cca_link;              /* adapter list */
+       struct list_head eh_wakeup_list;        /* event wakeup list */
+       wait_queue_head_t req_vq_wo;
+
+       /* Cached RNIC properties */
+       struct ib_device_attr props;
+
+       struct c2_pd_table pd_table;
+       struct c2_qp_table qp_table;
+       int ports;              /* num of GigE ports */
+       int devnum;
+       spinlock_t vqlock;      /* sync vbs req MQ */
+
+       /* Verbs Queues */
+       struct c2_mq req_vq;    /* Verbs Request MQ */
+       struct c2_mq rep_vq;    /* Verbs Reply MQ */
+       struct c2_mq aeq;       /* Async Events MQ */
+
+       /* Kernel client MQs */
+       struct sp_chunk *kern_mqsp_pool;
+
+       /* Device updates these values when posting messages to a host
+        * target queue */
+       u16 req_vq_shared;
+       u16 rep_vq_shared;
+       u16 aeq_shared;
+       u16 irq_claimed;
+
+       /*
+        * Shared host target pages for user-accessible MQs.
+        */
+       int hthead;             /* index of first free entry */
+       void *htpages;          /* kernel vaddr */
+       int htlen;              /* length of htpages memory */
+       void *htuva;            /* user mapped vaddr */
+       spinlock_t htlock;      /* serialize allocation */
+
+       u64 adapter_hint_uva;   /* access to the activity FIFO */
+
+       //      spinlock_t aeq_lock;
+       //      spinlock_t rnic_lock;
+
+       __be16 *hint_count;
+       dma_addr_t hint_count_dma;
+       u16 hints_read;
+
+       int init;               /* TRUE if it's ready */
+       char ae_cache_name[16];
+       char vq_cache_name[16];
+};
+
+struct c2_port {
+       u32 msg_enable;
+       struct c2_dev *c2dev;
+       struct net_device *netdev;
+
+       spinlock_t tx_lock;
+       u32 tx_avail;
+       struct c2_ring tx_ring;
+       struct c2_ring rx_ring;
+
+       void *mem;              /* PCI memory for host rings */
+       dma_addr_t dma;
+       unsigned long mem_size;
+
+       u32 rx_buf_size;
+};
+
+/*
+ * Activity FIFO registers in BAR0.
+ */
+#define PCI_BAR0_HOST_HINT     0x100
+#define PCI_BAR0_ADAPTER_HINT  0x2000
+
+/*
+ * Completion queue notification flags.
+ */
+#define CQ_ARMED       0x01
+#define CQ_WAIT_FOR_DMA        0x80
+
+/*
+ * The format of a hint is as follows:
+ * The lower 16 bits are the count of hints for the queue.
+ * The next 15 bits are the qp_index.
+ * The uppermost bit depends on who reads it:
+ *    If read by the producer, it means Full (1) or Not-Full (0).
+ *    If read by the consumer, it means Empty (1) or Not-Empty (0).
+ */
+#define C2_HINT_MAKE(q_index, hint_count) (((q_index) << 16) | hint_count)
+#define C2_HINT_GET_INDEX(hint) (((hint) & 0x7FFF0000) >> 16)
+#define C2_HINT_GET_COUNT(hint) ((hint) & 0x0000FFFF)
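+/* Example: C2_HINT_MAKE(3, 5) == 0x00030005; C2_HINT_GET_INDEX(0x00030005) == 3 and C2_HINT_GET_COUNT(0x00030005) == 5. */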
+
+
+/*
+ * The following defines the offset in SDRAM for the c2_adapter_pci_regs_t
+ * struct.
+ */
+#define C2_ADAPTER_PCI_REGS_OFFSET 0x10000
+
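+/*
+ * Fallback 64-bit MMIO accessors for platforms that do not provide
+ * native readq()/writeq(): each is composed of two 32-bit accesses,
+ * low word at addr and high word at addr + 4.
+ */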
+#ifndef readq
+static inline u64 readq(const void __iomem * addr)
+{
+       u64 ret = readl(addr + 4);
+       ret <<= 32;
+       ret |= readl(addr);
+
+       return ret;
+}
+#endif
+
+#ifndef writeq
+static inline void __raw_writeq(u64 val, void __iomem * addr)
+{
+       __raw_writel((u32) (val), addr);
+       __raw_writel((u32) (val >> 32), (addr + 4));
+}
+#endif
+
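+/* The current RX index is kept in the last 4 bytes (offset 4092) of the 4 KB TXP ring window. */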
+#define C2_SET_CUR_RX(c2dev, cur_rx) \
+       __raw_writel((__force u32) cpu_to_be32(cur_rx), c2dev->mmio_txp_ring + 4092)
+
+#define C2_GET_CUR_RX(c2dev) \
+       be32_to_cpu((__force __be32) readl(c2dev->mmio_txp_ring + 4092))
+
+static inline struct c2_dev *to_c2dev(struct ib_device *ibdev)
+{
+       return container_of(ibdev, struct c2_dev, ibdev);
+}
+
+static inline int c2_errno(void *reply)
+{
+       switch (c2_wr_get_result(reply)) {
+       case C2_OK:
+               return 0;
+       case CCERR_NO_BUFS:
+       case CCERR_INSUFFICIENT_RESOURCES:
+       case CCERR_ZERO_RDMA_READ_RESOURCES:
+               return -ENOMEM;
+       case CCERR_MR_IN_USE:
+       case CCERR_QP_IN_USE:
+               return -EBUSY;
+       case CCERR_ADDR_IN_USE:
+               return -EADDRINUSE;
+       case CCERR_ADDR_NOT_AVAIL:
+               return -EADDRNOTAVAIL;
+       case CCERR_CONN_RESET:
+               return -ECONNRESET;
+       case CCERR_NOT_IMPLEMENTED:
+       case CCERR_INVALID_WQE:
+               return -ENOSYS;
+       case CCERR_QP_NOT_PRIVILEGED:
+               return -EPERM;
+       case CCERR_STACK_ERROR:
+               return -EPROTO;
+       case CCERR_ACCESS_VIOLATION:
+       case CCERR_BASE_AND_BOUNDS_VIOLATION:
+               return -EFAULT;
+       case CCERR_STAG_STATE_NOT_INVALID:
+       case CCERR_INVALID_ADDRESS:
+       case CCERR_INVALID_CQ:
+       case CCERR_INVALID_EP:
+       case CCERR_INVALID_MODIFIER:
+       case CCERR_INVALID_MTU:
+       case CCERR_INVALID_PD_ID:
+       case CCERR_INVALID_QP:
+       case CCERR_INVALID_RNIC:
+       case CCERR_INVALID_STAG:
+               return -EINVAL;
+       default:
+               return -EAGAIN;
+       }
+}
+
+/* Device */
+extern int c2_register_device(struct c2_dev *c2dev);
+extern void c2_unregister_device(struct c2_dev *c2dev);
+extern int c2_rnic_init(struct c2_dev *c2dev);
+extern void c2_rnic_term(struct c2_dev *c2dev);
+extern void c2_rnic_interrupt(struct c2_dev *c2dev);
+extern int c2_del_addr(struct c2_dev *c2dev, __be32 inaddr, __be32 inmask);
+extern int c2_add_addr(struct c2_dev *c2dev, __be32 inaddr, __be32 inmask);
+
+/* QPs */
+extern int c2_alloc_qp(struct c2_dev *c2dev, struct c2_pd *pd,
+                      struct ib_qp_init_attr *qp_attrs, struct c2_qp *qp);
+extern void c2_free_qp(struct c2_dev *c2dev, struct c2_qp *qp);
+extern struct ib_qp *c2_get_qp(struct ib_device *device, int qpn);
+extern int c2_qp_modify(struct c2_dev *c2dev, struct c2_qp *qp,
+                       struct ib_qp_attr *attr, int attr_mask);
+extern int c2_qp_set_read_limits(struct c2_dev *c2dev, struct c2_qp *qp,
+                                int ord, int ird);
+extern int c2_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr,
+                       struct ib_send_wr **bad_wr);
+extern int c2_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *ib_wr,
+                          struct ib_recv_wr **bad_wr);
+extern void c2_init_qp_table(struct c2_dev *c2dev);
+extern void c2_cleanup_qp_table(struct c2_dev *c2dev);
+extern void c2_set_qp_state(struct c2_qp *, int);
+extern struct c2_qp *c2_find_qpn(struct c2_dev *c2dev, int qpn);
+
+/* PDs */
+extern int c2_pd_alloc(struct c2_dev *c2dev, int privileged, struct c2_pd *pd);
+extern void c2_pd_free(struct c2_dev *c2dev, struct c2_pd *pd);
+extern int c2_init_pd_table(struct c2_dev *c2dev);
+extern void c2_cleanup_pd_table(struct c2_dev *c2dev);
+
+/* CQs */
+extern int c2_init_cq(struct c2_dev *c2dev, int entries,
+                     struct c2_ucontext *ctx, struct c2_cq *cq);
+extern void c2_free_cq(struct c2_dev *c2dev, struct c2_cq *cq);
+extern void c2_cq_event(struct c2_dev *c2dev, u32 mq_index);
+extern void c2_cq_clean(struct c2_dev *c2dev, struct c2_qp *qp, u32 mq_index);
+extern int c2_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry);
+extern int c2_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags);
+
+/* CM */
+extern int c2_llp_connect(struct iw_cm_id *cm_id,
+                         struct iw_cm_conn_param *iw_param);
+extern int c2_llp_accept(struct iw_cm_id *cm_id,
+                        struct iw_cm_conn_param *iw_param);
+extern int c2_llp_reject(struct iw_cm_id *cm_id, const void *pdata,
+                        u8 pdata_len);
+extern int c2_llp_service_create(struct iw_cm_id *cm_id, int backlog);
+extern int c2_llp_service_destroy(struct iw_cm_id *cm_id);
+
+/* MM */
+extern int c2_nsmr_register_phys_kern(struct c2_dev *c2dev, u64 *addr_list,
+                                     int page_size, int pbl_depth, u32 length,
+                                     u32 off, u64 *va, enum c2_acf acf,
+                                     struct c2_mr *mr);
+extern int c2_stag_dealloc(struct c2_dev *c2dev, u32 stag_index);
+
+/* AE */
+extern void c2_ae_event(struct c2_dev *c2dev, u32 mq_index);
+
+/* MQSP Allocator */
+extern int c2_init_mqsp_pool(struct c2_dev *c2dev, gfp_t gfp_mask,
+                            struct sp_chunk **root);
+extern void c2_free_mqsp_pool(struct c2_dev *c2dev, struct sp_chunk *root);
+extern __be16 *c2_alloc_mqsp(struct c2_dev *c2dev, struct sp_chunk *head,
+                            dma_addr_t *dma_addr, gfp_t gfp_mask);
+extern void c2_free_mqsp(__be16* mqsp);
+#endif
diff --git a/drivers/staging/rdma/amso1100/c2_ae.c b/drivers/staging/rdma/amso1100/c2_ae.c
new file mode 100644 (file)
index 0000000..cedda25
--- /dev/null
@@ -0,0 +1,327 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "c2.h"
+#include <rdma/iw_cm.h>
+#include "c2_status.h"
+#include "c2_ae.h"
+
+static int c2_convert_cm_status(u32 c2_status)
+{
+       switch (c2_status) {
+       case C2_CONN_STATUS_SUCCESS:
+               return 0;
+       case C2_CONN_STATUS_REJECTED:
+               return -ENETRESET;
+       case C2_CONN_STATUS_REFUSED:
+               return -ECONNREFUSED;
+       case C2_CONN_STATUS_TIMEDOUT:
+               return -ETIMEDOUT;
+       case C2_CONN_STATUS_NETUNREACH:
+               return -ENETUNREACH;
+       case C2_CONN_STATUS_HOSTUNREACH:
+               return -EHOSTUNREACH;
+       case C2_CONN_STATUS_INVALID_RNIC:
+               return -EINVAL;
+       case C2_CONN_STATUS_INVALID_QP:
+               return -EINVAL;
+       case C2_CONN_STATUS_INVALID_QP_STATE:
+               return -EINVAL;
+       case C2_CONN_STATUS_ADDR_NOT_AVAIL:
+               return -EADDRNOTAVAIL;
+       default:
+               printk(KERN_ERR PFX
+                      "%s - Unable to convert CM status: %d\n",
+                      __func__, c2_status);
+               return -EIO;
+       }
+}
+
+static const char* to_event_str(int event)
+{
+       static const char* event_str[] = {
+               "CCAE_REMOTE_SHUTDOWN",
+               "CCAE_ACTIVE_CONNECT_RESULTS",
+               "CCAE_CONNECTION_REQUEST",
+               "CCAE_LLP_CLOSE_COMPLETE",
+               "CCAE_TERMINATE_MESSAGE_RECEIVED",
+               "CCAE_LLP_CONNECTION_RESET",
+               "CCAE_LLP_CONNECTION_LOST",
+               "CCAE_LLP_SEGMENT_SIZE_INVALID",
+               "CCAE_LLP_INVALID_CRC",
+               "CCAE_LLP_BAD_FPDU",
+               "CCAE_INVALID_DDP_VERSION",
+               "CCAE_INVALID_RDMA_VERSION",
+               "CCAE_UNEXPECTED_OPCODE",
+               "CCAE_INVALID_DDP_QUEUE_NUMBER",
+               "CCAE_RDMA_READ_NOT_ENABLED",
+               "CCAE_RDMA_WRITE_NOT_ENABLED",
+               "CCAE_RDMA_READ_TOO_SMALL",
+               "CCAE_NO_L_BIT",
+               "CCAE_TAGGED_INVALID_STAG",
+               "CCAE_TAGGED_BASE_BOUNDS_VIOLATION",
+               "CCAE_TAGGED_ACCESS_RIGHTS_VIOLATION",
+               "CCAE_TAGGED_INVALID_PD",
+               "CCAE_WRAP_ERROR",
+               "CCAE_BAD_CLOSE",
+               "CCAE_BAD_LLP_CLOSE",
+               "CCAE_INVALID_MSN_RANGE",
+               "CCAE_INVALID_MSN_GAP",
+               "CCAE_IRRQ_OVERFLOW",
+               "CCAE_IRRQ_MSN_GAP",
+               "CCAE_IRRQ_MSN_RANGE",
+               "CCAE_IRRQ_INVALID_STAG",
+               "CCAE_IRRQ_BASE_BOUNDS_VIOLATION",
+               "CCAE_IRRQ_ACCESS_RIGHTS_VIOLATION",
+               "CCAE_IRRQ_INVALID_PD",
+               "CCAE_IRRQ_WRAP_ERROR",
+               "CCAE_CQ_SQ_COMPLETION_OVERFLOW",
+               "CCAE_CQ_RQ_COMPLETION_ERROR",
+               "CCAE_QP_SRQ_WQE_ERROR",
+               "CCAE_QP_LOCAL_CATASTROPHIC_ERROR",
+               "CCAE_CQ_OVERFLOW",
+               "CCAE_CQ_OPERATION_ERROR",
+               "CCAE_SRQ_LIMIT_REACHED",
+               "CCAE_QP_RQ_LIMIT_REACHED",
+               "CCAE_SRQ_CATASTROPHIC_ERROR",
+               "CCAE_RNIC_CATASTROPHIC_ERROR"
+       };
+
+       if (event < CCAE_REMOTE_SHUTDOWN ||
+           event > CCAE_RNIC_CATASTROPHIC_ERROR)
+               return "<invalid event>";
+
+       event -= CCAE_REMOTE_SHUTDOWN;
+       return event_str[event];
+}
+
+static const char *to_qp_state_str(int state)
+{
+       switch (state) {
+       case C2_QP_STATE_IDLE:
+               return "C2_QP_STATE_IDLE";
+       case C2_QP_STATE_CONNECTING:
+               return "C2_QP_STATE_CONNECTING";
+       case C2_QP_STATE_RTS:
+               return "C2_QP_STATE_RTS";
+       case C2_QP_STATE_CLOSING:
+               return "C2_QP_STATE_CLOSING";
+       case C2_QP_STATE_TERMINATE:
+               return "C2_QP_STATE_TERMINATE";
+       case C2_QP_STATE_ERROR:
+               return "C2_QP_STATE_ERROR";
+       default:
+               return "<invalid QP state>";
+       }
+}
+
+void c2_ae_event(struct c2_dev *c2dev, u32 mq_index)
+{
+       struct c2_mq *mq = c2dev->qptr_array[mq_index];
+       union c2wr *wr;
+       void *resource_user_context;
+       struct iw_cm_event cm_event;
+       struct ib_event ib_event;
+       enum c2_resource_indicator resource_indicator;
+       enum c2_event_id event_id;
+       unsigned long flags;
+       int status;
+       struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_event.local_addr;
+       struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_event.remote_addr;
+
+       /*
+        * retrieve the message
+        */
+       wr = c2_mq_consume(mq);
+       if (!wr)
+               return;
+
+       memset(&ib_event, 0, sizeof(ib_event));
+       memset(&cm_event, 0, sizeof(cm_event));
+
+       event_id = c2_wr_get_id(wr);
+       resource_indicator = be32_to_cpu(wr->ae.ae_generic.resource_type);
+       resource_user_context =
+           (void *) (unsigned long) wr->ae.ae_generic.user_context;
+
+       status = cm_event.status = c2_convert_cm_status(c2_wr_get_result(wr));
+
+       pr_debug("event received c2_dev=%p, event_id=%d, "
+               "resource_indicator=%d, user_context=%p, status = %d\n",
+               c2dev, event_id, resource_indicator, resource_user_context,
+               status);
+
+       switch (resource_indicator) {
+       case C2_RES_IND_QP:{
+
+               struct c2_qp *qp = (struct c2_qp *)resource_user_context;
+               struct iw_cm_id *cm_id = qp->cm_id;
+               struct c2wr_ae_active_connect_results *res;
+
+               if (!cm_id) {
+                       pr_debug("event received, but cm_id is <nul>, qp=%p!\n",
+                               qp);
+                       goto ignore_it;
+               }
+               pr_debug("%s: event = %s, user_context=%llx, "
+                       "resource_type=%x, "
+                       "resource=%x, qp_state=%s\n",
+                       __func__,
+                       to_event_str(event_id),
+                       (unsigned long long) wr->ae.ae_generic.user_context,
+                       be32_to_cpu(wr->ae.ae_generic.resource_type),
+                       be32_to_cpu(wr->ae.ae_generic.resource),
+                       to_qp_state_str(be32_to_cpu(wr->ae.ae_generic.qp_state)));
+
+               c2_set_qp_state(qp, be32_to_cpu(wr->ae.ae_generic.qp_state));
+
+               switch (event_id) {
+               case CCAE_ACTIVE_CONNECT_RESULTS:
+                       res = &wr->ae.ae_active_connect_results;
+                       cm_event.event = IW_CM_EVENT_CONNECT_REPLY;
+                       laddr->sin_addr.s_addr = res->laddr;
+                       raddr->sin_addr.s_addr = res->raddr;
+                       laddr->sin_port = res->lport;
+                       raddr->sin_port = res->rport;
+                       if (status == 0) {
+                               cm_event.private_data_len =
+                                       be32_to_cpu(res->private_data_length);
+                               cm_event.private_data = res->private_data;
+                       } else {
+                               spin_lock_irqsave(&qp->lock, flags);
+                               if (qp->cm_id) {
+                                       qp->cm_id->rem_ref(qp->cm_id);
+                                       qp->cm_id = NULL;
+                               }
+                               spin_unlock_irqrestore(&qp->lock, flags);
+                               cm_event.private_data_len = 0;
+                               cm_event.private_data = NULL;
+                       }
+                       if (cm_id->event_handler)
+                               cm_id->event_handler(cm_id, &cm_event);
+                       break;
+               case CCAE_TERMINATE_MESSAGE_RECEIVED:
+               case CCAE_CQ_SQ_COMPLETION_OVERFLOW:
+                       ib_event.device = &c2dev->ibdev;
+                       ib_event.element.qp = &qp->ibqp;
+                       ib_event.event = IB_EVENT_QP_REQ_ERR;
+
+                       if (qp->ibqp.event_handler)
+                               qp->ibqp.event_handler(&ib_event,
+                                                      qp->ibqp.
+                                                      qp_context);
+                       break;
+               case CCAE_BAD_CLOSE:
+               case CCAE_LLP_CLOSE_COMPLETE:
+               case CCAE_LLP_CONNECTION_RESET:
+               case CCAE_LLP_CONNECTION_LOST:
+                       BUG_ON(cm_id->event_handler==(void*)0x6b6b6b6b);
+
+                       spin_lock_irqsave(&qp->lock, flags);
+                       if (qp->cm_id) {
+                               qp->cm_id->rem_ref(qp->cm_id);
+                               qp->cm_id = NULL;
+                       }
+                       spin_unlock_irqrestore(&qp->lock, flags);
+                       cm_event.event = IW_CM_EVENT_CLOSE;
+                       cm_event.status = 0;
+                       if (cm_id->event_handler)
+                               cm_id->event_handler(cm_id, &cm_event);
+                       break;
+               default:
+                       BUG_ON(1);
+                       pr_debug("%s:%d Unexpected event_id=%d on QP=%p, "
+                               "CM_ID=%p\n",
+                               __func__, __LINE__,
+                               event_id, qp, cm_id);
+                       break;
+               }
+               break;
+       }
+
+       case C2_RES_IND_EP:{
+
+               struct c2wr_ae_connection_request *req =
+                       &wr->ae.ae_connection_request;
+               struct iw_cm_id *cm_id =
+                       (struct iw_cm_id *)resource_user_context;
+
+               pr_debug("C2_RES_IND_EP event_id=%d\n", event_id);
+               if (event_id != CCAE_CONNECTION_REQUEST) {
+                       pr_debug("%s: Invalid event_id: %d\n",
+                               __func__, event_id);
+                       break;
+               }
+               cm_event.event = IW_CM_EVENT_CONNECT_REQUEST;
+               cm_event.provider_data = (void*)(unsigned long)req->cr_handle;
+               laddr->sin_addr.s_addr = req->laddr;
+               raddr->sin_addr.s_addr = req->raddr;
+               laddr->sin_port = req->lport;
+               raddr->sin_port = req->rport;
+               cm_event.private_data_len =
+                       be32_to_cpu(req->private_data_length);
+               cm_event.private_data = req->private_data;
+               /*
+                * Until ird/ord negotiation via MPAv2 support is added, send
+                * max supported values
+                */
+               cm_event.ird = cm_event.ord = 128;
+
+               if (cm_id->event_handler)
+                       cm_id->event_handler(cm_id, &cm_event);
+               break;
+       }
+
+       case C2_RES_IND_CQ:{
+               struct c2_cq *cq =
+                   (struct c2_cq *) resource_user_context;
+
+               pr_debug("IB_EVENT_CQ_ERR\n");
+               ib_event.device = &c2dev->ibdev;
+               ib_event.element.cq = &cq->ibcq;
+               ib_event.event = IB_EVENT_CQ_ERR;
+
+               if (cq->ibcq.event_handler)
+                       cq->ibcq.event_handler(&ib_event,
+                                              cq->ibcq.cq_context);
+               break;
+       }
+
+       default:
+               printk("Bad resource indicator = %d\n",
+                      resource_indicator);
+               break;
+       }
+
+ ignore_it:
+       c2_mq_free(mq);
+}
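
to_event_str() above rebases the event id before indexing its string table, since the ids start at 0x80 rather than 0. The stand-alone user-space sketch below shows the same bounds-check-then-rebase lookup with a made-up enum; the names and values are illustrative only, not the driver's.

#include <stdio.h>

enum demo_event { DEMO_FIRST = 0x80, DEMO_SECOND, DEMO_LAST };

static const char *demo_event_str(int event)
{
	static const char *names[] = { "DEMO_FIRST", "DEMO_SECOND", "DEMO_LAST" };

	/* reject anything outside the id range, then rebase to index the table */
	if (event < DEMO_FIRST || event > DEMO_LAST)
		return "<invalid event>";
	return names[event - DEMO_FIRST];
}

int main(void)
{
	printf("%s\n", demo_event_str(0x81));	/* DEMO_SECOND */
	printf("%s\n", demo_event_str(0x10));	/* <invalid event> */
	return 0;
}
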
diff --git a/drivers/staging/rdma/amso1100/c2_ae.h b/drivers/staging/rdma/amso1100/c2_ae.h
new file mode 100644 (file)
index 0000000..3a065c3
--- /dev/null
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _C2_AE_H_
+#define _C2_AE_H_
+
+/*
+ * WARNING: If you change this file, also bump C2_IVN_BASE
+ * in common/include/clustercore/c2_ivn.h.
+ */
+
+/*
+ * Asynchronous Event Identifiers
+ *
+ * These start at 0x80 only so it's obvious from inspection that
+ * they are not work-request statuses.  This isn't critical.
+ *
+ * NOTE: these event id's must fit in eight bits.
+ */
+enum c2_event_id {
+       CCAE_REMOTE_SHUTDOWN = 0x80,
+       CCAE_ACTIVE_CONNECT_RESULTS,
+       CCAE_CONNECTION_REQUEST,
+       CCAE_LLP_CLOSE_COMPLETE,
+       CCAE_TERMINATE_MESSAGE_RECEIVED,
+       CCAE_LLP_CONNECTION_RESET,
+       CCAE_LLP_CONNECTION_LOST,
+       CCAE_LLP_SEGMENT_SIZE_INVALID,
+       CCAE_LLP_INVALID_CRC,
+       CCAE_LLP_BAD_FPDU,
+       CCAE_INVALID_DDP_VERSION,
+       CCAE_INVALID_RDMA_VERSION,
+       CCAE_UNEXPECTED_OPCODE,
+       CCAE_INVALID_DDP_QUEUE_NUMBER,
+       CCAE_RDMA_READ_NOT_ENABLED,
+       CCAE_RDMA_WRITE_NOT_ENABLED,
+       CCAE_RDMA_READ_TOO_SMALL,
+       CCAE_NO_L_BIT,
+       CCAE_TAGGED_INVALID_STAG,
+       CCAE_TAGGED_BASE_BOUNDS_VIOLATION,
+       CCAE_TAGGED_ACCESS_RIGHTS_VIOLATION,
+       CCAE_TAGGED_INVALID_PD,
+       CCAE_WRAP_ERROR,
+       CCAE_BAD_CLOSE,
+       CCAE_BAD_LLP_CLOSE,
+       CCAE_INVALID_MSN_RANGE,
+       CCAE_INVALID_MSN_GAP,
+       CCAE_IRRQ_OVERFLOW,
+       CCAE_IRRQ_MSN_GAP,
+       CCAE_IRRQ_MSN_RANGE,
+       CCAE_IRRQ_INVALID_STAG,
+       CCAE_IRRQ_BASE_BOUNDS_VIOLATION,
+       CCAE_IRRQ_ACCESS_RIGHTS_VIOLATION,
+       CCAE_IRRQ_INVALID_PD,
+       CCAE_IRRQ_WRAP_ERROR,
+       CCAE_CQ_SQ_COMPLETION_OVERFLOW,
+       CCAE_CQ_RQ_COMPLETION_ERROR,
+       CCAE_QP_SRQ_WQE_ERROR,
+       CCAE_QP_LOCAL_CATASTROPHIC_ERROR,
+       CCAE_CQ_OVERFLOW,
+       CCAE_CQ_OPERATION_ERROR,
+       CCAE_SRQ_LIMIT_REACHED,
+       CCAE_QP_RQ_LIMIT_REACHED,
+       CCAE_SRQ_CATASTROPHIC_ERROR,
+       CCAE_RNIC_CATASTROPHIC_ERROR
+/* WARNING If you add more id's, make sure their values fit in eight bits. */
+};
+
+/*
+ * Resource Indicators and Identifiers
+ */
+enum c2_resource_indicator {
+       C2_RES_IND_QP = 1,
+       C2_RES_IND_EP,
+       C2_RES_IND_CQ,
+       C2_RES_IND_SRQ,
+};
+
+#endif /* _C2_AE_H_ */
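
The header warns twice that event ids must fit in eight bits, but nothing checks that at build time. A compile-time assertion along the lines of the stand-alone C11 sketch below (shown with a made-up enum; in-kernel code would more likely use BUILD_BUG_ON) would catch a violation as soon as a new id is appended.

#include <assert.h>

/* stand-in enum, not the driver's */
enum demo_id { DEMO_BASE = 0x80, DEMO_A, DEMO_B, DEMO_LAST };

/* fails to compile if the last id no longer fits in eight bits */
static_assert(DEMO_LAST <= 0xff, "event ids must fit in eight bits");
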
diff --git a/drivers/staging/rdma/amso1100/c2_alloc.c b/drivers/staging/rdma/amso1100/c2_alloc.c
new file mode 100644 (file)
index 0000000..78d247e
--- /dev/null
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/bitmap.h>
+
+#include "c2.h"
+
+static int c2_alloc_mqsp_chunk(struct c2_dev *c2dev, gfp_t gfp_mask,
+                              struct sp_chunk **head)
+{
+       int i;
+       struct sp_chunk *new_head;
+       dma_addr_t dma_addr;
+
+       new_head = dma_alloc_coherent(&c2dev->pcidev->dev, PAGE_SIZE,
+                                     &dma_addr, gfp_mask);
+       if (new_head == NULL)
+               return -ENOMEM;
+
+       new_head->dma_addr = dma_addr;
+       dma_unmap_addr_set(new_head, mapping, new_head->dma_addr);
+
+       new_head->next = NULL;
+       new_head->head = 0;
+
+       /* build list where each index is the next free slot */
+       for (i = 0;
+            i < (PAGE_SIZE - sizeof(struct sp_chunk) -
+                 sizeof(u16)) / sizeof(u16) - 1;
+            i++) {
+               new_head->shared_ptr[i] = i + 1;
+       }
+       /* terminate list */
+       new_head->shared_ptr[i] = 0xFFFF;
+
+       *head = new_head;
+       return 0;
+}
+
+int c2_init_mqsp_pool(struct c2_dev *c2dev, gfp_t gfp_mask,
+                     struct sp_chunk **root)
+{
+       return c2_alloc_mqsp_chunk(c2dev, gfp_mask, root);
+}
+
+void c2_free_mqsp_pool(struct c2_dev *c2dev, struct sp_chunk *root)
+{
+       struct sp_chunk *next;
+
+       while (root) {
+               next = root->next;
+               dma_free_coherent(&c2dev->pcidev->dev, PAGE_SIZE, root,
+                                 dma_unmap_addr(root, mapping));
+               root = next;
+       }
+}
+
+__be16 *c2_alloc_mqsp(struct c2_dev *c2dev, struct sp_chunk *head,
+                     dma_addr_t *dma_addr, gfp_t gfp_mask)
+{
+       u16 mqsp;
+
+       while (head) {
+               mqsp = head->head;
+               if (mqsp != 0xFFFF) {
+                       head->head = head->shared_ptr[mqsp];
+                       break;
+               } else if (head->next == NULL) {
+                       if (c2_alloc_mqsp_chunk(c2dev, gfp_mask, &head->next) ==
+                           0) {
+                               head = head->next;
+                               mqsp = head->head;
+                               head->head = head->shared_ptr[mqsp];
+                               break;
+                       } else
+                               return NULL;
+               } else
+                       head = head->next;
+       }
+       if (head) {
+               *dma_addr = head->dma_addr +
+                           ((unsigned long) &(head->shared_ptr[mqsp]) -
+                            (unsigned long) head);
+               pr_debug("%s addr %p dma_addr %llx\n", __func__,
+                        &(head->shared_ptr[mqsp]), (unsigned long long) *dma_addr);
+               return (__force __be16 *) &(head->shared_ptr[mqsp]);
+       }
+       return NULL;
+}
+
+void c2_free_mqsp(__be16 *mqsp)
+{
+       struct sp_chunk *head;
+       u16 idx;
+
+       /* The chunk containing this ptr begins at the page boundary */
+       head = (struct sp_chunk *) ((unsigned long) mqsp & PAGE_MASK);
+
+       /* Link head to new mqsp */
+       *mqsp = (__force __be16) head->head;
+
+       /* Compute the shared_ptr index */
+       idx = ((unsigned long) mqsp & ~PAGE_MASK) >> 1;
+       idx -= (unsigned long) &(((struct sp_chunk *) 0)->shared_ptr[0]) >> 1;
+
+       /* Point this index at the head */
+       head->shared_ptr[idx] = head->head;
+
+       /* Point head at this index */
+       head->head = idx;
+}
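
c2_alloc_mqsp_chunk() above turns one DMA-coherent page into a pool by chaining free slots through their own indices: slot i holds the index of the next free slot, head names the first free slot, and 0xFFFF terminates the list. The user-space sketch below reproduces just that index-chained free list on a plain array; it is illustrative only and uses none of the driver's types.

#include <stdio.h>
#include <stdint.h>

#define NSLOTS 8
#define END_OF_LIST 0xFFFF

static uint16_t slots[NSLOTS];
static uint16_t head;

static void pool_init(void)
{
	int i;

	for (i = 0; i < NSLOTS - 1; i++)
		slots[i] = i + 1;		/* each slot points at the next */
	slots[NSLOTS - 1] = END_OF_LIST;	/* terminate the list */
	head = 0;
}

static int pool_alloc(void)
{
	uint16_t idx = head;

	if (idx == END_OF_LIST)
		return -1;			/* chunk exhausted */
	head = slots[idx];			/* unlink from the free list */
	return idx;
}

static void pool_free(uint16_t idx)
{
	slots[idx] = head;			/* push back on the free list */
	head = idx;
}

int main(void)
{
	int a, b;

	pool_init();
	a = pool_alloc();
	b = pool_alloc();
	printf("allocated %d and %d\n", a, b);		/* 0 and 1 */
	pool_free(a);
	printf("next alloc: %d\n", pool_alloc());	/* 0 again */
	return 0;
}
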
diff --git a/drivers/staging/rdma/amso1100/c2_cm.c b/drivers/staging/rdma/amso1100/c2_cm.c
new file mode 100644 (file)
index 0000000..23bfa94
--- /dev/null
@@ -0,0 +1,461 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc.  All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/slab.h>
+
+#include "c2.h"
+#include "c2_wr.h"
+#include "c2_vq.h"
+#include <rdma/iw_cm.h>
+
+int c2_llp_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
+{
+       struct c2_dev *c2dev = to_c2dev(cm_id->device);
+       struct ib_qp *ibqp;
+       struct c2_qp *qp;
+       struct c2wr_qp_connect_req *wr; /* variable size needs a malloc. */
+       struct c2_vq_req *vq_req;
+       int err;
+       struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr;
+
+       if (cm_id->remote_addr.ss_family != AF_INET)
+               return -ENOSYS;
+
+       ibqp = c2_get_qp(cm_id->device, iw_param->qpn);
+       if (!ibqp)
+               return -EINVAL;
+       qp = to_c2qp(ibqp);
+
+       /* Associate QP <--> CM_ID */
+       cm_id->provider_data = qp;
+       cm_id->add_ref(cm_id);
+       qp->cm_id = cm_id;
+
+       /*
+        * only support the max private_data length
+        */
+       if (iw_param->private_data_len > C2_MAX_PRIVATE_DATA_SIZE) {
+               err = -EINVAL;
+               goto bail0;
+       }
+       /*
+        * Set the rdma read limits
+        */
+       err = c2_qp_set_read_limits(c2dev, qp, iw_param->ord, iw_param->ird);
+       if (err)
+               goto bail0;
+
+       /*
+        * Create and send a WR_QP_CONNECT...
+        */
+       wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
+       if (!wr) {
+               err = -ENOMEM;
+               goto bail0;
+       }
+
+       vq_req = vq_req_alloc(c2dev);
+       if (!vq_req) {
+               err = -ENOMEM;
+               goto bail1;
+       }
+
+       c2_wr_set_id(wr, CCWR_QP_CONNECT);
+       wr->hdr.context = 0;
+       wr->rnic_handle = c2dev->adapter_handle;
+       wr->qp_handle = qp->adapter_handle;
+
+       wr->remote_addr = raddr->sin_addr.s_addr;
+       wr->remote_port = raddr->sin_port;
+
+       /*
+        * Move any private data from the caller's buf into
+        * the WR.
+        */
+       if (iw_param->private_data) {
+               wr->private_data_length =
+                       cpu_to_be32(iw_param->private_data_len);
+               memcpy(&wr->private_data[0], iw_param->private_data,
+                      iw_param->private_data_len);
+       } else
+               wr->private_data_length = 0;
+
+       /*
+        * Send WR to adapter.  NOTE: There is no synch reply from
+        * the adapter.
+        */
+       err = vq_send_wr(c2dev, (union c2wr *) wr);
+       vq_req_free(c2dev, vq_req);
+
+ bail1:
+       kfree(wr);
+ bail0:
+       if (err) {
+               /*
+                * If we fail, release reference on QP and
+                * disassociate QP from CM_ID
+                */
+               cm_id->provider_data = NULL;
+               qp->cm_id = NULL;
+               cm_id->rem_ref(cm_id);
+       }
+       return err;
+}
+
+int c2_llp_service_create(struct iw_cm_id *cm_id, int backlog)
+{
+       struct c2_dev *c2dev;
+       struct c2wr_ep_listen_create_req wr;
+       struct c2wr_ep_listen_create_rep *reply;
+       struct c2_vq_req *vq_req;
+       int err;
+       struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr;
+
+       if (cm_id->local_addr.ss_family != AF_INET)
+               return -ENOSYS;
+
+       c2dev = to_c2dev(cm_id->device);
+       if (c2dev == NULL)
+               return -EINVAL;
+
+       /*
+        * Allocate verbs request.
+        */
+       vq_req = vq_req_alloc(c2dev);
+       if (!vq_req)
+               return -ENOMEM;
+
+       /*
+        * Build the WR
+        */
+       c2_wr_set_id(&wr, CCWR_EP_LISTEN_CREATE);
+       wr.hdr.context = (u64) (unsigned long) vq_req;
+       wr.rnic_handle = c2dev->adapter_handle;
+       wr.local_addr = laddr->sin_addr.s_addr;
+       wr.local_port = laddr->sin_port;
+       wr.backlog = cpu_to_be32(backlog);
+       wr.user_context = (u64) (unsigned long) cm_id;
+
+       /*
+        * Reference the request struct.  Dereferenced in the int handler.
+        */
+       vq_req_get(c2dev, vq_req);
+
+       /*
+        * Send WR to adapter
+        */
+       err = vq_send_wr(c2dev, (union c2wr *) & wr);
+       if (err) {
+               vq_req_put(c2dev, vq_req);
+               goto bail0;
+       }
+
+       /*
+        * Wait for reply from adapter
+        */
+       err = vq_wait_for_reply(c2dev, vq_req);
+       if (err)
+               goto bail0;
+
+       /*
+        * Process reply
+        */
+       reply =
+           (struct c2wr_ep_listen_create_rep *) (unsigned long) vq_req->reply_msg;
+       if (!reply) {
+               err = -ENOMEM;
+               goto bail1;
+       }
+
+       if ((err = c2_errno(reply)) != 0)
+               goto bail1;
+
+       /*
+        * Keep the adapter handle. Used in subsequent destroy
+        */
+       cm_id->provider_data = (void*)(unsigned long) reply->ep_handle;
+
+       /*
+        * free vq stuff
+        */
+       vq_repbuf_free(c2dev, reply);
+       vq_req_free(c2dev, vq_req);
+
+       return 0;
+
+ bail1:
+       vq_repbuf_free(c2dev, reply);
+ bail0:
+       vq_req_free(c2dev, vq_req);
+       return err;
+}
+
+
+int c2_llp_service_destroy(struct iw_cm_id *cm_id)
+{
+
+       struct c2_dev *c2dev;
+       struct c2wr_ep_listen_destroy_req wr;
+       struct c2wr_ep_listen_destroy_rep *reply;
+       struct c2_vq_req *vq_req;
+       int err;
+
+       c2dev = to_c2dev(cm_id->device);
+       if (c2dev == NULL)
+               return -EINVAL;
+
+       /*
+        * Allocate verbs request.
+        */
+       vq_req = vq_req_alloc(c2dev);
+       if (!vq_req)
+               return -ENOMEM;
+
+       /*
+        * Build the WR
+        */
+       c2_wr_set_id(&wr, CCWR_EP_LISTEN_DESTROY);
+       wr.hdr.context = (unsigned long) vq_req;
+       wr.rnic_handle = c2dev->adapter_handle;
+       wr.ep_handle = (u32)(unsigned long)cm_id->provider_data;
+
+       /*
+        * reference the request struct.  dereferenced in the int handler.
+        */
+       vq_req_get(c2dev, vq_req);
+
+       /*
+        * Send WR to adapter
+        */
+       err = vq_send_wr(c2dev, (union c2wr *) & wr);
+       if (err) {
+               vq_req_put(c2dev, vq_req);
+               goto bail0;
+       }
+
+       /*
+        * Wait for reply from adapter
+        */
+       err = vq_wait_for_reply(c2dev, vq_req);
+       if (err)
+               goto bail0;
+
+       /*
+        * Process reply
+        */
+       reply=(struct c2wr_ep_listen_destroy_rep *)(unsigned long)vq_req->reply_msg;
+       if (!reply) {
+               err = -ENOMEM;
+               goto bail0;
+       }
+       if ((err = c2_errno(reply)) != 0)
+               goto bail1;
+
+ bail1:
+       vq_repbuf_free(c2dev, reply);
+ bail0:
+       vq_req_free(c2dev, vq_req);
+       return err;
+}
+
+int c2_llp_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
+{
+       struct c2_dev *c2dev = to_c2dev(cm_id->device);
+       struct c2_qp *qp;
+       struct ib_qp *ibqp;
+       struct c2wr_cr_accept_req *wr;  /* variable length WR */
+       struct c2_vq_req *vq_req;
+       struct c2wr_cr_accept_rep *reply;       /* VQ Reply msg ptr. */
+       int err;
+
+       ibqp = c2_get_qp(cm_id->device, iw_param->qpn);
+       if (!ibqp)
+               return -EINVAL;
+       qp = to_c2qp(ibqp);
+
+       /* Set the RDMA read limits */
+       err = c2_qp_set_read_limits(c2dev, qp, iw_param->ord, iw_param->ird);
+       if (err)
+               goto bail0;
+
+       /* Allocate verbs request. */
+       vq_req = vq_req_alloc(c2dev);
+       if (!vq_req) {
+               err = -ENOMEM;
+               goto bail0;
+       }
+       vq_req->qp = qp;
+       vq_req->cm_id = cm_id;
+       vq_req->event = IW_CM_EVENT_ESTABLISHED;
+
+       wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
+       if (!wr) {
+               err = -ENOMEM;
+               goto bail1;
+       }
+
+       /* Build the WR */
+       c2_wr_set_id(wr, CCWR_CR_ACCEPT);
+       wr->hdr.context = (unsigned long) vq_req;
+       wr->rnic_handle = c2dev->adapter_handle;
+       wr->ep_handle = (u32) (unsigned long) cm_id->provider_data;
+       wr->qp_handle = qp->adapter_handle;
+
+       /* Replace the cr_handle with the QP after accept */
+       cm_id->provider_data = qp;
+       cm_id->add_ref(cm_id);
+       qp->cm_id = cm_id;
+
+       cm_id->provider_data = qp;
+
+       /* Validate private_data length */
+       if (iw_param->private_data_len > C2_MAX_PRIVATE_DATA_SIZE) {
+               err = -EINVAL;
+               goto bail1;
+       }
+
+       if (iw_param->private_data) {
+               wr->private_data_length = cpu_to_be32(iw_param->private_data_len);
+               memcpy(&wr->private_data[0],
+                      iw_param->private_data, iw_param->private_data_len);
+       } else
+               wr->private_data_length = 0;
+
+       /* Reference the request struct.  Dereferenced in the int handler. */
+       vq_req_get(c2dev, vq_req);
+
+       /* Send WR to adapter */
+       err = vq_send_wr(c2dev, (union c2wr *) wr);
+       if (err) {
+               vq_req_put(c2dev, vq_req);
+               goto bail1;
+       }
+
+       /* Wait for reply from adapter */
+       err = vq_wait_for_reply(c2dev, vq_req);
+       if (err)
+               goto bail1;
+
+       /* Check that reply is present */
+       reply = (struct c2wr_cr_accept_rep *) (unsigned long) vq_req->reply_msg;
+       if (!reply) {
+               err = -ENOMEM;
+               goto bail1;
+       }
+
+       err = c2_errno(reply);
+       vq_repbuf_free(c2dev, reply);
+
+       if (!err)
+               c2_set_qp_state(qp, C2_QP_STATE_RTS);
+ bail1:
+       kfree(wr);
+       vq_req_free(c2dev, vq_req);
+ bail0:
+       if (err) {
+               /*
+                * If we fail, release reference on QP and
+                * disassociate QP from CM_ID
+                */
+               cm_id->provider_data = NULL;
+               qp->cm_id = NULL;
+               cm_id->rem_ref(cm_id);
+       }
+       return err;
+}
+
+int c2_llp_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len)
+{
+       struct c2_dev *c2dev;
+       struct c2wr_cr_reject_req wr;
+       struct c2_vq_req *vq_req;
+       struct c2wr_cr_reject_rep *reply;
+       int err;
+
+       c2dev = to_c2dev(cm_id->device);
+
+       /*
+        * Allocate verbs request.
+        */
+       vq_req = vq_req_alloc(c2dev);
+       if (!vq_req)
+               return -ENOMEM;
+
+       /*
+        * Build the WR
+        */
+       c2_wr_set_id(&wr, CCWR_CR_REJECT);
+       wr.hdr.context = (unsigned long) vq_req;
+       wr.rnic_handle = c2dev->adapter_handle;
+       wr.ep_handle = (u32) (unsigned long) cm_id->provider_data;
+
+       /*
+        * reference the request struct.  dereferenced in the int handler.
+        */
+       vq_req_get(c2dev, vq_req);
+
+       /*
+        * Send WR to adapter
+        */
+       err = vq_send_wr(c2dev, (union c2wr *) & wr);
+       if (err) {
+               vq_req_put(c2dev, vq_req);
+               goto bail0;
+       }
+
+       /*
+        * Wait for reply from adapter
+        */
+       err = vq_wait_for_reply(c2dev, vq_req);
+       if (err)
+               goto bail0;
+
+       /*
+        * Process reply
+        */
+       reply = (struct c2wr_cr_reject_rep *) (unsigned long)
+               vq_req->reply_msg;
+       if (!reply) {
+               err = -ENOMEM;
+               goto bail0;
+       }
+       err = c2_errno(reply);
+       /*
+        * free vq stuff
+        */
+       vq_repbuf_free(c2dev, reply);
+
+ bail0:
+       vq_req_free(c2dev, vq_req);
+       return err;
+}
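
Both c2_llp_connect() and c2_llp_accept() above follow the same pattern for MPA private data: reject anything over C2_MAX_PRIVATE_DATA_SIZE, then either copy the data into the work request with a big-endian length or record a zero length. The stand-alone sketch below shows that pattern with made-up types and a 64-byte limit; htonl() stands in for cpu_to_be32().

#include <string.h>
#include <stdint.h>
#include <arpa/inet.h>		/* htonl() as a stand-in for cpu_to_be32() */

#define DEMO_MAX_PRIVATE_DATA 64	/* stand-in for C2_MAX_PRIVATE_DATA_SIZE */

struct demo_connect_wr {
	uint32_t private_data_length;	/* big-endian on the wire */
	uint8_t  private_data[DEMO_MAX_PRIVATE_DATA];
};

static int demo_fill_private_data(struct demo_connect_wr *wr,
				  const void *data, size_t len)
{
	if (len > DEMO_MAX_PRIVATE_DATA)
		return -1;		/* reject, as the driver does with -EINVAL */

	if (data) {
		wr->private_data_length = htonl((uint32_t)len);
		memcpy(wr->private_data, data, len);
	} else {
		wr->private_data_length = 0;
	}
	return 0;
}

int main(void)
{
	struct demo_connect_wr wr;

	return demo_fill_private_data(&wr, "hello", 5);
}
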
diff --git a/drivers/staging/rdma/amso1100/c2_cq.c b/drivers/staging/rdma/amso1100/c2_cq.c
new file mode 100644 (file)
index 0000000..1b63185
--- /dev/null
@@ -0,0 +1,440 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/gfp.h>
+
+#include "c2.h"
+#include "c2_vq.h"
+#include "c2_status.h"
+
+#define C2_CQ_MSG_SIZE ((sizeof(struct c2wr_ce) + 32-1) & ~(32-1))
+
+static struct c2_cq *c2_cq_get(struct c2_dev *c2dev, int cqn)
+{
+       struct c2_cq *cq;
+       unsigned long flags;
+
+       spin_lock_irqsave(&c2dev->lock, flags);
+       cq = c2dev->qptr_array[cqn];
+       if (!cq) {
+               spin_unlock_irqrestore(&c2dev->lock, flags);
+               return NULL;
+       }
+       atomic_inc(&cq->refcount);
+       spin_unlock_irqrestore(&c2dev->lock, flags);
+       return cq;
+}
+
+static void c2_cq_put(struct c2_cq *cq)
+{
+       if (atomic_dec_and_test(&cq->refcount))
+               wake_up(&cq->wait);
+}
+
+void c2_cq_event(struct c2_dev *c2dev, u32 mq_index)
+{
+       struct c2_cq *cq;
+
+       cq = c2_cq_get(c2dev, mq_index);
+       if (!cq) {
+               printk("discarding events on destroyed CQN=%d\n", mq_index);
+               return;
+       }
+
+       (*cq->ibcq.comp_handler) (&cq->ibcq, cq->ibcq.cq_context);
+       c2_cq_put(cq);
+}
+
+void c2_cq_clean(struct c2_dev *c2dev, struct c2_qp *qp, u32 mq_index)
+{
+       struct c2_cq *cq;
+       struct c2_mq *q;
+
+       cq = c2_cq_get(c2dev, mq_index);
+       if (!cq)
+               return;
+
+       spin_lock_irq(&cq->lock);
+       q = &cq->mq;
+       if (q && !c2_mq_empty(q)) {
+               u16 priv = q->priv;
+               struct c2wr_ce *msg;
+
+               while (priv != be16_to_cpu(*q->shared)) {
+                       msg = (struct c2wr_ce *)
+                               (q->msg_pool.host + priv * q->msg_size);
+                       if (msg->qp_user_context == (u64) (unsigned long) qp) {
+                               msg->qp_user_context = (u64) 0;
+                       }
+                       priv = (priv + 1) % q->q_size;
+               }
+       }
+       spin_unlock_irq(&cq->lock);
+       c2_cq_put(cq);
+}
+
+static inline enum ib_wc_status c2_cqe_status_to_openib(u8 status)
+{
+       switch (status) {
+       case C2_OK:
+               return IB_WC_SUCCESS;
+       case CCERR_FLUSHED:
+               return IB_WC_WR_FLUSH_ERR;
+       case CCERR_BASE_AND_BOUNDS_VIOLATION:
+               return IB_WC_LOC_PROT_ERR;
+       case CCERR_ACCESS_VIOLATION:
+               return IB_WC_LOC_ACCESS_ERR;
+       case CCERR_TOTAL_LENGTH_TOO_BIG:
+               return IB_WC_LOC_LEN_ERR;
+       case CCERR_INVALID_WINDOW:
+               return IB_WC_MW_BIND_ERR;
+       default:
+               return IB_WC_GENERAL_ERR;
+       }
+}
+
+
+static inline int c2_poll_one(struct c2_dev *c2dev,
+                             struct c2_cq *cq, struct ib_wc *entry)
+{
+       struct c2wr_ce *ce;
+       struct c2_qp *qp;
+       int is_recv = 0;
+
+       ce = c2_mq_consume(&cq->mq);
+       if (!ce) {
+               return -EAGAIN;
+       }
+
+       /*
+        * if the qp returned is null then this qp has already
+        * been freed and we are unable to process the completion.
+        * try pulling the next message
+        */
+       while ((qp =
+               (struct c2_qp *) (unsigned long) ce->qp_user_context) == NULL) {
+               c2_mq_free(&cq->mq);
+               ce = c2_mq_consume(&cq->mq);
+               if (!ce)
+                       return -EAGAIN;
+       }
+
+       entry->status = c2_cqe_status_to_openib(c2_wr_get_result(ce));
+       entry->wr_id = ce->hdr.context;
+       entry->qp = &qp->ibqp;
+       entry->wc_flags = 0;
+       entry->slid = 0;
+       entry->sl = 0;
+       entry->src_qp = 0;
+       entry->dlid_path_bits = 0;
+       entry->pkey_index = 0;
+
+       switch (c2_wr_get_id(ce)) {
+       case C2_WR_TYPE_SEND:
+               entry->opcode = IB_WC_SEND;
+               break;
+       case C2_WR_TYPE_RDMA_WRITE:
+               entry->opcode = IB_WC_RDMA_WRITE;
+               break;
+       case C2_WR_TYPE_RDMA_READ:
+               entry->opcode = IB_WC_RDMA_READ;
+               break;
+       case C2_WR_TYPE_BIND_MW:
+               entry->opcode = IB_WC_BIND_MW;
+               break;
+       case C2_WR_TYPE_RECV:
+               entry->byte_len = be32_to_cpu(ce->bytes_rcvd);
+               entry->opcode = IB_WC_RECV;
+               is_recv = 1;
+               break;
+       default:
+               break;
+       }
+
+       /* consume the WQEs */
+       if (is_recv)
+               c2_mq_lconsume(&qp->rq_mq, 1);
+       else
+               c2_mq_lconsume(&qp->sq_mq,
+                              be32_to_cpu(c2_wr_get_wqe_count(ce)) + 1);
+
+       /* free the message */
+       c2_mq_free(&cq->mq);
+
+       return 0;
+}
+
+int c2_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
+{
+       struct c2_dev *c2dev = to_c2dev(ibcq->device);
+       struct c2_cq *cq = to_c2cq(ibcq);
+       unsigned long flags;
+       int npolled, err;
+
+       spin_lock_irqsave(&cq->lock, flags);
+
+       for (npolled = 0; npolled < num_entries; ++npolled) {
+
+               err = c2_poll_one(c2dev, cq, entry + npolled);
+               if (err)
+                       break;
+       }
+
+       spin_unlock_irqrestore(&cq->lock, flags);
+
+       return npolled;
+}
+
+int c2_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags)
+{
+       struct c2_mq_shared __iomem *shared;
+       struct c2_cq *cq;
+       unsigned long flags;
+       int ret = 0;
+
+       cq = to_c2cq(ibcq);
+       shared = cq->mq.peer;
+
+       if ((notify_flags & IB_CQ_SOLICITED_MASK) == IB_CQ_NEXT_COMP)
+               writeb(C2_CQ_NOTIFICATION_TYPE_NEXT, &shared->notification_type);
+       else if ((notify_flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED)
+               writeb(C2_CQ_NOTIFICATION_TYPE_NEXT_SE, &shared->notification_type);
+       else
+               return -EINVAL;
+
+       writeb(CQ_WAIT_FOR_DMA | CQ_ARMED, &shared->armed);
+
+       /*
+        * Now read back shared->armed to make the PCI
+        * write synchronous.  This is necessary for
+        * correct cq notification semantics.
+        */
+       readb(&shared->armed);
+
+       if (notify_flags & IB_CQ_REPORT_MISSED_EVENTS) {
+               spin_lock_irqsave(&cq->lock, flags);
+               ret = !c2_mq_empty(&cq->mq);
+               spin_unlock_irqrestore(&cq->lock, flags);
+       }
+
+       return ret;
+}
+
+static void c2_free_cq_buf(struct c2_dev *c2dev, struct c2_mq *mq)
+{
+       dma_free_coherent(&c2dev->pcidev->dev, mq->q_size * mq->msg_size,
+                         mq->msg_pool.host, dma_unmap_addr(mq, mapping));
+}
+
+static int c2_alloc_cq_buf(struct c2_dev *c2dev, struct c2_mq *mq,
+                          size_t q_size, size_t msg_size)
+{
+       u8 *pool_start;
+
+       if (q_size > SIZE_MAX / msg_size)
+               return -EINVAL;
+
+       pool_start = dma_alloc_coherent(&c2dev->pcidev->dev, q_size * msg_size,
+                                       &mq->host_dma, GFP_KERNEL);
+       if (!pool_start)
+               return -ENOMEM;
+
+       c2_mq_rep_init(mq,
+                      0,               /* index (currently unknown) */
+                      q_size,
+                      msg_size,
+                      pool_start,
+                      NULL,    /* peer (currently unknown) */
+                      C2_MQ_HOST_TARGET);
+
+       dma_unmap_addr_set(mq, mapping, mq->host_dma);
+
+       return 0;
+}
+
+int c2_init_cq(struct c2_dev *c2dev, int entries,
+              struct c2_ucontext *ctx, struct c2_cq *cq)
+{
+       struct c2wr_cq_create_req wr;
+       struct c2wr_cq_create_rep *reply;
+       unsigned long peer_pa;
+       struct c2_vq_req *vq_req;
+       int err;
+
+       might_sleep();
+
+       cq->ibcq.cqe = entries - 1;
+       cq->is_kernel = !ctx;
+
+       /* Allocate a shared pointer */
+       cq->mq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
+                                     &cq->mq.shared_dma, GFP_KERNEL);
+       if (!cq->mq.shared)
+               return -ENOMEM;
+
+       /* Allocate pages for the message pool */
+       err = c2_alloc_cq_buf(c2dev, &cq->mq, entries + 1, C2_CQ_MSG_SIZE);
+       if (err)
+               goto bail0;
+
+       vq_req = vq_req_alloc(c2dev);
+       if (!vq_req) {
+               err = -ENOMEM;
+               goto bail1;
+       }
+
+       memset(&wr, 0, sizeof(wr));
+       c2_wr_set_id(&wr, CCWR_CQ_CREATE);
+       wr.hdr.context = (unsigned long) vq_req;
+       wr.rnic_handle = c2dev->adapter_handle;
+       wr.msg_size = cpu_to_be32(cq->mq.msg_size);
+       wr.depth = cpu_to_be32(cq->mq.q_size);
+       wr.shared_ht = cpu_to_be64(cq->mq.shared_dma);
+       wr.msg_pool = cpu_to_be64(cq->mq.host_dma);
+       wr.user_context = (u64) (unsigned long) (cq);
+
+       vq_req_get(c2dev, vq_req);
+
+       err = vq_send_wr(c2dev, (union c2wr *) & wr);
+       if (err) {
+               vq_req_put(c2dev, vq_req);
+               goto bail2;
+       }
+
+       err = vq_wait_for_reply(c2dev, vq_req);
+       if (err)
+               goto bail2;
+
+       reply = (struct c2wr_cq_create_rep *) (unsigned long) (vq_req->reply_msg);
+       if (!reply) {
+               err = -ENOMEM;
+               goto bail2;
+       }
+
+       if ((err = c2_errno(reply)) != 0)
+               goto bail3;
+
+       cq->adapter_handle = reply->cq_handle;
+       cq->mq.index = be32_to_cpu(reply->mq_index);
+
+       peer_pa = c2dev->pa + be32_to_cpu(reply->adapter_shared);
+       cq->mq.peer = ioremap_nocache(peer_pa, PAGE_SIZE);
+       if (!cq->mq.peer) {
+               err = -ENOMEM;
+               goto bail3;
+       }
+
+       vq_repbuf_free(c2dev, reply);
+       vq_req_free(c2dev, vq_req);
+
+       spin_lock_init(&cq->lock);
+       atomic_set(&cq->refcount, 1);
+       init_waitqueue_head(&cq->wait);
+
+       /*
+        * Use the MQ index allocated by the adapter to
+        * store the CQ in the qptr_array
+        */
+       cq->cqn = cq->mq.index;
+       c2dev->qptr_array[cq->cqn] = cq;
+
+       return 0;
+
+      bail3:
+       vq_repbuf_free(c2dev, reply);
+      bail2:
+       vq_req_free(c2dev, vq_req);
+      bail1:
+       c2_free_cq_buf(c2dev, &cq->mq);
+      bail0:
+       c2_free_mqsp(cq->mq.shared);
+
+       return err;
+}
+
+void c2_free_cq(struct c2_dev *c2dev, struct c2_cq *cq)
+{
+       int err;
+       struct c2_vq_req *vq_req;
+       struct c2wr_cq_destroy_req wr;
+       struct c2wr_cq_destroy_rep *reply;
+
+       might_sleep();
+
+       /* Clear CQ from the qptr array */
+       spin_lock_irq(&c2dev->lock);
+       c2dev->qptr_array[cq->mq.index] = NULL;
+       atomic_dec(&cq->refcount);
+       spin_unlock_irq(&c2dev->lock);
+
+       wait_event(cq->wait, !atomic_read(&cq->refcount));
+
+       vq_req = vq_req_alloc(c2dev);
+       if (!vq_req) {
+               goto bail0;
+       }
+
+       memset(&wr, 0, sizeof(wr));
+       c2_wr_set_id(&wr, CCWR_CQ_DESTROY);
+       wr.hdr.context = (unsigned long) vq_req;
+       wr.rnic_handle = c2dev->adapter_handle;
+       wr.cq_handle = cq->adapter_handle;
+
+       vq_req_get(c2dev, vq_req);
+
+       err = vq_send_wr(c2dev, (union c2wr *) & wr);
+       if (err) {
+               vq_req_put(c2dev, vq_req);
+               goto bail1;
+       }
+
+       err = vq_wait_for_reply(c2dev, vq_req);
+       if (err)
+               goto bail1;
+
+       reply = (struct c2wr_cq_destroy_rep *) (unsigned long) (vq_req->reply_msg);
+       if (reply)
+               vq_repbuf_free(c2dev, reply);
+      bail1:
+       vq_req_free(c2dev, vq_req);
+      bail0:
+       if (cq->is_kernel) {
+               c2_free_cq_buf(c2dev, &cq->mq);
+       }
+
+       return;
+}
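
C2_CQ_MSG_SIZE above rounds the completion-message size up to a 32-byte multiple with the usual (x + align - 1) & ~(align - 1) trick, which works for any power-of-two alignment. The small user-space program below only demonstrates that rounding; the 32-byte figure matches the macro, everything else is illustrative.

#include <stdio.h>
#include <stddef.h>

/* round x up to the next multiple of a power-of-two alignment */
#define ROUND_UP_POW2(x, align)  (((x) + (align) - 1) & ~((size_t)(align) - 1))

int main(void)
{
	printf("%zu\n", ROUND_UP_POW2(1, 32));		/* 32 */
	printf("%zu\n", ROUND_UP_POW2(32, 32));		/* 32 */
	printf("%zu\n", ROUND_UP_POW2(33, 32));		/* 64 */
	return 0;
}
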
diff --git a/drivers/staging/rdma/amso1100/c2_intr.c b/drivers/staging/rdma/amso1100/c2_intr.c
new file mode 100644 (file)
index 0000000..3a17d9b
--- /dev/null
@@ -0,0 +1,219 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "c2.h"
+#include <rdma/iw_cm.h>
+#include "c2_vq.h"
+
+static void handle_mq(struct c2_dev *c2dev, u32 index);
+static void handle_vq(struct c2_dev *c2dev, u32 mq_index);
+
+/*
+ * Handle RNIC interrupts
+ */
+void c2_rnic_interrupt(struct c2_dev *c2dev)
+{
+       unsigned int mq_index;
+
+       while (c2dev->hints_read != be16_to_cpu(*c2dev->hint_count)) {
+               mq_index = readl(c2dev->regs + PCI_BAR0_HOST_HINT);
+               if (mq_index & 0x80000000) {
+                       break;
+               }
+
+               c2dev->hints_read++;
+               handle_mq(c2dev, mq_index);
+       }
+
+}
+
+/*
+ * Top level MQ handler
+ */
+static void handle_mq(struct c2_dev *c2dev, u32 mq_index)
+{
+       if (c2dev->qptr_array[mq_index] == NULL) {
+               pr_debug("handle_mq: stray activity for mq_index=%d\n",
+                        mq_index);
+               return;
+       }
+
+       switch (mq_index) {
+       case (0):
+               /*
+                * An index of 0 in the activity queue
+                * indicates the req vq now has messages
+                * available...
+                *
+                * Wake up any waiters waiting on req VQ
+                * message availability.
+                */
+               wake_up(&c2dev->req_vq_wo);
+               break;
+       case (1):
+               handle_vq(c2dev, mq_index);
+               break;
+       case (2):
+               /* We have to purge the VQ in case there are pending
+                * accept reply requests that would result in the
+                * generation of an ESTABLISHED event. If we don't
+                * generate these first, a CLOSE event could end up
+                * being delivered before the ESTABLISHED event.
+                */
+               handle_vq(c2dev, 1);
+
+               c2_ae_event(c2dev, mq_index);
+               break;
+       default:
+               /* There is no event synchronization between CQ events
+                * and AE or CM events. In fact, CQE could be
+                * delivered for all of the I/O up to and including the
+                * FLUSH for a peer disconnect prior to the ESTABLISHED
+                * event being delivered to the app. The reason for this
+                * is that CM events are delivered on a thread, while AE
+                * and CQ events are delivered in interrupt context.
+                */
+               c2_cq_event(c2dev, mq_index);
+               break;
+       }
+
+       return;
+}
+
+/*
+ * Handles verbs WR replies.
+ */
+static void handle_vq(struct c2_dev *c2dev, u32 mq_index)
+{
+       void *adapter_msg, *reply_msg;
+       struct c2wr_hdr *host_msg;
+       struct c2wr_hdr tmp;
+       struct c2_mq *reply_vq;
+       struct c2_vq_req *req;
+       struct iw_cm_event cm_event;
+       int err;
+
+       reply_vq = (struct c2_mq *) c2dev->qptr_array[mq_index];
+
+       /*
+        * get next msg from mq_index into adapter_msg.
+        * don't free it yet.
+        */
+       adapter_msg = c2_mq_consume(reply_vq);
+       if (adapter_msg == NULL) {
+               return;
+       }
+
+       host_msg = vq_repbuf_alloc(c2dev);
+
+       /*
+        * If we can't get a host buffer, then we'll still
+        * wake up the waiter; we just won't give it the msg.
+        * It is assumed the waiter will deal with this...
+        */
+       if (!host_msg) {
+               pr_debug("handle_vq: no repbufs!\n");
+
+               /*
+                * just copy the WR header into a local variable.
+                * this allows us to still demux on the context
+                */
+               host_msg = &tmp;
+               memcpy(host_msg, adapter_msg, sizeof(tmp));
+               reply_msg = NULL;
+       } else {
+               memcpy(host_msg, adapter_msg, reply_vq->msg_size);
+               reply_msg = host_msg;
+       }
+
+       /*
+        * consume the msg from the MQ
+        */
+       c2_mq_free(reply_vq);
+
+       /*
+        * wakeup the waiter.
+        */
+       req = (struct c2_vq_req *) (unsigned long) host_msg->context;
+       if (req == NULL) {
+               /*
+                * We should never get here, as the adapter should
+                * never send us a reply that we're not expecting.
+                */
+               if (reply_msg != NULL)
+                       vq_repbuf_free(c2dev, host_msg);
+               pr_debug("handle_vq: UNEXPECTEDLY got NULL req\n");
+               return;
+       }
+
+       if (reply_msg)
+               err = c2_errno(reply_msg);
+       else
+               err = -ENOMEM;
+
+       if (!err) switch (req->event) {
+       case IW_CM_EVENT_ESTABLISHED:
+               c2_set_qp_state(req->qp,
+                               C2_QP_STATE_RTS);
+               /*
+                * Until ird/ord negotiation via MPAv2 support is added, send
+                * max supported values
+                */
+               cm_event.ird = cm_event.ord = 128;
+       case IW_CM_EVENT_CLOSE:
+
+               /*
+                * Move the QP to RTS if this is
+                * the established event
+                */
+               cm_event.event = req->event;
+               cm_event.status = 0;
+               cm_event.local_addr = req->cm_id->local_addr;
+               cm_event.remote_addr = req->cm_id->remote_addr;
+               cm_event.private_data = NULL;
+               cm_event.private_data_len = 0;
+               req->cm_id->event_handler(req->cm_id, &cm_event);
+               break;
+       default:
+               break;
+       }
+
+       req->reply_msg = (u64) (unsigned long) (reply_msg);
+       atomic_set(&req->reply_ready, 1);
+       wake_up(&req->wait_object);
+
+       /*
+        * If the request was cancelled, then this put will
+        * free the vq_req memory...and reply_msg!!!
+        */
+       vq_req_put(c2dev, req);
+}
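
When handle_vq() above cannot allocate a reply buffer it still has to wake the waiter, so it copies only the fixed-size header into a stack temporary (enough to find the waiter by its context) and hands that waiter a NULL reply. The user-space sketch below shows that header-only fallback with made-up types; it is not driver code.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

struct demo_hdr { uint64_t context; };
struct demo_msg { struct demo_hdr hdr; char payload[56]; };

static void demo_deliver(const struct demo_msg *adapter_msg)
{
	struct demo_hdr tmp;
	struct demo_msg *copy = malloc(sizeof(*copy));
	const struct demo_hdr *hdr;
	const struct demo_msg *reply;

	if (!copy) {
		/* no buffer: keep just the header so the waiter can be matched */
		memcpy(&tmp, &adapter_msg->hdr, sizeof(tmp));
		hdr = &tmp;
		reply = NULL;
	} else {
		memcpy(copy, adapter_msg, sizeof(*copy));
		hdr = &copy->hdr;
		reply = copy;
	}

	printf("waking waiter for context %llu, reply %s\n",
	       (unsigned long long)hdr->context, reply ? "copied" : "dropped");
	free(copy);
}

int main(void)
{
	struct demo_msg m = { .hdr = { .context = 42 } };

	demo_deliver(&m);
	return 0;
}
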
diff --git a/drivers/staging/rdma/amso1100/c2_mm.c b/drivers/staging/rdma/amso1100/c2_mm.c
new file mode 100644 (file)
index 0000000..119c4f3
--- /dev/null
@@ -0,0 +1,377 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/slab.h>
+
+#include "c2.h"
+#include "c2_vq.h"
+
+#define PBL_VIRT 1
+#define PBL_PHYS 2
+
+/*
+ * Send all the PBL messages to convey the remainder of the PBL
+ * Wait for the adapter's reply on the last one.
+ * This is indicated by setting the MEM_PBL_COMPLETE bit in the flags.
+ *
+ * NOTE:  vq_req is _not_ freed by this function.  The VQ Host
+ *       Reply buffer _is_ freed by this function.
+ */
+static int
+send_pbl_messages(struct c2_dev *c2dev, __be32 stag_index,
+                 unsigned long va, u32 pbl_depth,
+                 struct c2_vq_req *vq_req, int pbl_type)
+{
+       u32 pbe_count;          /* number of PBEs that fit in a PBL msg */
+       u32 count;              /* number of PBEs in this PBL msg */
+       struct c2wr_nsmr_pbl_req *wr;   /* PBL WR ptr */
+       struct c2wr_nsmr_pbl_rep *reply;        /* reply ptr */
+       int err, pbl_virt, pbl_index, i;
+
+       switch (pbl_type) {
+       case PBL_VIRT:
+               pbl_virt = 1;
+               break;
+       case PBL_PHYS:
+               pbl_virt = 0;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       pbe_count = (c2dev->req_vq.msg_size -
+                    sizeof(struct c2wr_nsmr_pbl_req)) / sizeof(u64);
+       wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
+       if (!wr) {
+               return -ENOMEM;
+       }
+       c2_wr_set_id(wr, CCWR_NSMR_PBL);
+
+       /*
+        * Only the last PBL message will generate a reply from the verbs,
+        * so we set the context to 0 indicating there is no kernel verbs
+        * handler blocked awaiting this reply.
+        */
+       wr->hdr.context = 0;
+       wr->rnic_handle = c2dev->adapter_handle;
+       wr->stag_index = stag_index;    /* already swapped */
+       wr->flags = 0;
+       pbl_index = 0;
+       while (pbl_depth) {
+               count = min(pbe_count, pbl_depth);
+               wr->addrs_length = cpu_to_be32(count);
+
+               /*
+                *  If this is the last message, then reference the
+                *  vq request struct because we are going to wait for a reply.
+                *  Also mark this PBL msg as the last one.
+                */
+               if (count == pbl_depth) {
+                       /*
+                        * reference the request struct.  dereferenced in the
+                        * int handler.
+                        */
+                       vq_req_get(c2dev, vq_req);
+                       wr->flags = cpu_to_be32(MEM_PBL_COMPLETE);
+
+                       /*
+                        * This is the last PBL message.
+                        * Set the context to our VQ Request Object so we can
+                        * wait for the reply.
+                        */
+                       wr->hdr.context = (unsigned long) vq_req;
+               }
+
+               /*
+                * If pbl_virt is set then va is a virtual address
+                * that describes a virtually contiguous memory
+                * allocation. The wr needs the start of each virtual page
+                * to be converted to the corresponding physical address
+                * of the page. If pbl_virt is not set then va is an array
+                * of physical addresses and there is no conversion to do.
+                * Just fill in the wr with what is in the array.
+                */
+               for (i = 0; i < count; i++) {
+                       if (pbl_virt) {
+                               va += PAGE_SIZE;
+                       } else {
+                               wr->paddrs[i] =
+                                   cpu_to_be64(((u64 *)va)[pbl_index + i]);
+                       }
+               }
+
+               /*
+                * Send WR to adapter
+                */
+               err = vq_send_wr(c2dev, (union c2wr *) wr);
+               if (err) {
+                       if (count <= pbe_count) {
+                               vq_req_put(c2dev, vq_req);
+                       }
+                       goto bail0;
+               }
+               pbl_depth -= count;
+               pbl_index += count;
+       }
+
+       /*
+        *  Now wait for the reply...
+        */
+       err = vq_wait_for_reply(c2dev, vq_req);
+       if (err) {
+               goto bail0;
+       }
+
+       /*
+        * Process reply
+        */
+       reply = (struct c2wr_nsmr_pbl_rep *) (unsigned long) vq_req->reply_msg;
+       if (!reply) {
+               err = -ENOMEM;
+               goto bail0;
+       }
+
+       err = c2_errno(reply);
+
+       vq_repbuf_free(c2dev, reply);
+      bail0:
+       kfree(wr);
+       return err;
+}
+
+#define C2_PBL_MAX_DEPTH 131072
+int
+c2_nsmr_register_phys_kern(struct c2_dev *c2dev, u64 *addr_list,
+                          int page_size, int pbl_depth, u32 length,
+                          u32 offset, u64 *va, enum c2_acf acf,
+                          struct c2_mr *mr)
+{
+       struct c2_vq_req *vq_req;
+       struct c2wr_nsmr_register_req *wr;
+       struct c2wr_nsmr_register_rep *reply;
+       u16 flags;
+       int i, pbe_count, count;
+       int err;
+
+       if (!va || !length || !addr_list || !pbl_depth)
+               return -EINTR;
+
+       /*
+        * Verify PBL depth is within rnic max
+        */
+       if (pbl_depth > C2_PBL_MAX_DEPTH) {
+               return -EINTR;
+       }
+
+       /*
+        * allocate verbs request object
+        */
+       vq_req = vq_req_alloc(c2dev);
+       if (!vq_req)
+               return -ENOMEM;
+
+       wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
+       if (!wr) {
+               err = -ENOMEM;
+               goto bail0;
+       }
+
+       /*
+        * build the WR
+        */
+       c2_wr_set_id(wr, CCWR_NSMR_REGISTER);
+       wr->hdr.context = (unsigned long) vq_req;
+       wr->rnic_handle = c2dev->adapter_handle;
+
+       flags = (acf | MEM_VA_BASED | MEM_REMOTE);
+
+       /*
+        * compute how many pbes can fit in the message
+        */
+       pbe_count = (c2dev->req_vq.msg_size -
+                    sizeof(struct c2wr_nsmr_register_req)) / sizeof(u64);
+
+       if (pbl_depth <= pbe_count) {
+               flags |= MEM_PBL_COMPLETE;
+       }
+       wr->flags = cpu_to_be16(flags);
+       wr->stag_key = 0;
+       wr->va = cpu_to_be64(*va);
+       wr->pd_id = mr->pd->pd_id;
+       wr->pbe_size = cpu_to_be32(page_size);
+       wr->length = cpu_to_be32(length);
+       wr->pbl_depth = cpu_to_be32(pbl_depth);
+       wr->fbo = cpu_to_be32(offset);
+       count = min(pbl_depth, pbe_count);
+       wr->addrs_length = cpu_to_be32(count);
+
+       /*
+        * fill out the PBL for this message
+        */
+       for (i = 0; i < count; i++) {
+               wr->paddrs[i] = cpu_to_be64(addr_list[i]);
+       }
+
+       /*
+        * reference the request struct
+        */
+       vq_req_get(c2dev, vq_req);
+
+       /*
+        * send the WR to the adapter
+        */
+       err = vq_send_wr(c2dev, (union c2wr *) wr);
+       if (err) {
+               vq_req_put(c2dev, vq_req);
+               goto bail1;
+       }
+
+       /*
+        * wait for reply from adapter
+        */
+       err = vq_wait_for_reply(c2dev, vq_req);
+       if (err) {
+               goto bail1;
+       }
+
+       /*
+        * process reply
+        */
+       reply =
+           (struct c2wr_nsmr_register_rep *) (unsigned long) (vq_req->reply_msg);
+       if (!reply) {
+               err = -ENOMEM;
+               goto bail1;
+       }
+       if ((err = c2_errno(reply))) {
+               goto bail2;
+       }
+       //*p_pb_entries = be32_to_cpu(reply->pbl_depth);
+       mr->ibmr.lkey = mr->ibmr.rkey = be32_to_cpu(reply->stag_index);
+       vq_repbuf_free(c2dev, reply);
+
+       /*
+        * If there are still more PBEs we need to send them to
+        * the adapter and wait for a reply on the final one.
+        * Reuse vq_req for this purpose.
+        */
+       pbl_depth -= count;
+       if (pbl_depth) {
+
+               vq_req->reply_msg = (unsigned long) NULL;
+               atomic_set(&vq_req->reply_ready, 0);
+               err = send_pbl_messages(c2dev,
+                                       cpu_to_be32(mr->ibmr.lkey),
+                                       (unsigned long) &addr_list[i],
+                                       pbl_depth, vq_req, PBL_PHYS);
+               if (err) {
+                       goto bail1;
+               }
+       }
+
+       vq_req_free(c2dev, vq_req);
+       kfree(wr);
+
+       return err;
+
+      bail2:
+       vq_repbuf_free(c2dev, reply);
+      bail1:
+       kfree(wr);
+      bail0:
+       vq_req_free(c2dev, vq_req);
+       return err;
+}
+
+int c2_stag_dealloc(struct c2_dev *c2dev, u32 stag_index)
+{
+       struct c2_vq_req *vq_req;       /* verbs request object */
+       struct c2wr_stag_dealloc_req wr;        /* work request */
+       struct c2wr_stag_dealloc_rep *reply;    /* WR reply  */
+       int err;
+
+
+       /*
+        * allocate verbs request object
+        */
+       vq_req = vq_req_alloc(c2dev);
+       if (!vq_req) {
+               return -ENOMEM;
+       }
+
+       /*
+        * Build the WR
+        */
+       c2_wr_set_id(&wr, CCWR_STAG_DEALLOC);
+       wr.hdr.context = (u64) (unsigned long) vq_req;
+       wr.rnic_handle = c2dev->adapter_handle;
+       wr.stag_index = cpu_to_be32(stag_index);
+
+       /*
+        * reference the request struct.  dereferenced in the int handler.
+        */
+       vq_req_get(c2dev, vq_req);
+
+       /*
+        * Send WR to adapter
+        */
+       err = vq_send_wr(c2dev, (union c2wr *) & wr);
+       if (err) {
+               vq_req_put(c2dev, vq_req);
+               goto bail0;
+       }
+
+       /*
+        * Wait for reply from adapter
+        */
+       err = vq_wait_for_reply(c2dev, vq_req);
+       if (err) {
+               goto bail0;
+       }
+
+       /*
+        * Process reply
+        */
+       reply = (struct c2wr_stag_dealloc_rep *) (unsigned long) vq_req->reply_msg;
+       if (!reply) {
+               err = -ENOMEM;
+               goto bail0;
+       }
+
+       err = c2_errno(reply);
+
+       vq_repbuf_free(c2dev, reply);
+      bail0:
+       vq_req_free(c2dev, vq_req);
+       return err;
+}
diff --git a/drivers/staging/rdma/amso1100/c2_mq.c b/drivers/staging/rdma/amso1100/c2_mq.c
new file mode 100644 (file)
index 0000000..0cddc49
--- /dev/null
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "c2.h"
+#include "c2_mq.h"
+
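+/* Return a pointer to the next free message slot, or NULL if the MQ is full. */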
+void *c2_mq_alloc(struct c2_mq *q)
+{
+       BUG_ON(q->magic != C2_MQ_MAGIC);
+       BUG_ON(q->type != C2_MQ_ADAPTER_TARGET);
+
+       if (c2_mq_full(q)) {
+               return NULL;
+       } else {
+#ifdef DEBUG
+               struct c2wr_hdr *m =
+                   (struct c2wr_hdr *) (q->msg_pool.host + q->priv * q->msg_size);
+#ifdef CCMSGMAGIC
+               BUG_ON(m->magic != be32_to_cpu(~CCWR_MAGIC));
+               m->magic = cpu_to_be32(CCWR_MAGIC);
+#endif
+               return m;
+#else
+               return q->msg_pool.host + q->priv * q->msg_size;
+#endif
+       }
+}
+
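+/* Advance the private producer index and publish it to the peer's shared offset. */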
+void c2_mq_produce(struct c2_mq *q)
+{
+       BUG_ON(q->magic != C2_MQ_MAGIC);
+       BUG_ON(q->type != C2_MQ_ADAPTER_TARGET);
+
+       if (!c2_mq_full(q)) {
+               q->priv = (q->priv + 1) % q->q_size;
+               q->hint_count++;
+               /* Update peer's offset. */
+               __raw_writew((__force u16) cpu_to_be16(q->priv), &q->peer->shared);
+       }
+}
+
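+/* Return the next message on a host-target MQ, or NULL if the queue is empty. */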
+void *c2_mq_consume(struct c2_mq *q)
+{
+       BUG_ON(q->magic != C2_MQ_MAGIC);
+       BUG_ON(q->type != C2_MQ_HOST_TARGET);
+
+       if (c2_mq_empty(q)) {
+               return NULL;
+       } else {
+#ifdef DEBUG
+               struct c2wr_hdr *m = (struct c2wr_hdr *)
+                   (q->msg_pool.host + q->priv * q->msg_size);
+#ifdef CCMSGMAGIC
+               BUG_ON(m->magic != be32_to_cpu(CCWR_MAGIC));
+#endif
+               return m;
+#else
+               return q->msg_pool.host + q->priv * q->msg_size;
+#endif
+       }
+}
+
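+/* Release the current slot on a host-target MQ and advance the index shared with the peer. */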
+void c2_mq_free(struct c2_mq *q)
+{
+       BUG_ON(q->magic != C2_MQ_MAGIC);
+       BUG_ON(q->type != C2_MQ_HOST_TARGET);
+
+       if (!c2_mq_empty(q)) {
+
+#ifdef CCMSGMAGIC
+               {
+                       struct c2wr_hdr __iomem *m = (struct c2wr_hdr __iomem *)
+                           (q->msg_pool.adapter + q->priv * q->msg_size);
+                       __raw_writel(cpu_to_be32(~CCWR_MAGIC), &m->magic);
+               }
+#endif
+               q->priv = (q->priv + 1) % q->q_size;
+               /* Update peer's offset. */
+               __raw_writew((__force u16) cpu_to_be16(q->priv), &q->peer->shared);
+       }
+}
+
+
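+/* Consume wqe_count entries on an adapter-target MQ by advancing the shared index. */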
+void c2_mq_lconsume(struct c2_mq *q, u32 wqe_count)
+{
+       BUG_ON(q->magic != C2_MQ_MAGIC);
+       BUG_ON(q->type != C2_MQ_ADAPTER_TARGET);
+
+       while (wqe_count--) {
+               BUG_ON(c2_mq_empty(q));
+               *q->shared = cpu_to_be16((be16_to_cpu(*q->shared)+1) % q->q_size);
+       }
+}
+
+#if 0
+u32 c2_mq_count(struct c2_mq *q)
+{
+       s32 count;
+
+       if (q->type == C2_MQ_HOST_TARGET)
+               count = be16_to_cpu(*q->shared) - q->priv;
+       else
+               count = q->priv - be16_to_cpu(*q->shared);
+
+       if (count < 0)
+               count += q->q_size;
+
+       return (u32) count;
+}
+#endif  /*  0  */
+
+void c2_mq_req_init(struct c2_mq *q, u32 index, u32 q_size, u32 msg_size,
+                   u8 __iomem *pool_start, u16 __iomem *peer, u32 type)
+{
+       BUG_ON(!q->shared);
+
+       /* This code assumes the byte swapping has already been done! */
+       q->index = index;
+       q->q_size = q_size;
+       q->msg_size = msg_size;
+       q->msg_pool.adapter = pool_start;
+       q->peer = (struct c2_mq_shared __iomem *) peer;
+       q->magic = C2_MQ_MAGIC;
+       q->type = type;
+       q->priv = 0;
+       q->hint_count = 0;
+       return;
+}
+void c2_mq_rep_init(struct c2_mq *q, u32 index, u32 q_size, u32 msg_size,
+                   u8 *pool_start, u16 __iomem *peer, u32 type)
+{
+       BUG_ON(!q->shared);
+
+       /* This code assumes the byte swapping has already been done! */
+       q->index = index;
+       q->q_size = q_size;
+       q->msg_size = msg_size;
+       q->msg_pool.host = pool_start;
+       q->peer = (struct c2_mq_shared __iomem *) peer;
+       q->magic = C2_MQ_MAGIC;
+       q->type = type;
+       q->priv = 0;
+       q->hint_count = 0;
+       return;
+}
diff --git a/drivers/staging/rdma/amso1100/c2_mq.h b/drivers/staging/rdma/amso1100/c2_mq.h
new file mode 100644 (file)
index 0000000..fc1b9a7
--- /dev/null
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _C2_MQ_H_
+#define _C2_MQ_H_
+#include <linux/kernel.h>
+#include <linux/dma-mapping.h>
+#include "c2_wr.h"
+
+enum c2_shared_regs {
+
+       C2_SHARED_ARMED = 0x10,
+       C2_SHARED_NOTIFY = 0x18,
+       C2_SHARED_SHARED = 0x40,
+};
+
+struct c2_mq_shared {
+       u16 unused1;
+       u8 armed;
+       u8 notification_type;
+       u32 unused2;
+       u16 shared;
+       /* Pad to 64 bytes. */
+       u8 pad[64 - sizeof(u16) - 2 * sizeof(u8) - sizeof(u32) - sizeof(u16)];
+};
+
+enum c2_mq_type {
+       C2_MQ_HOST_TARGET = 1,
+       C2_MQ_ADAPTER_TARGET = 2,
+};
+
+/*
+ * c2_mq_t is for kernel-mode MQs like the VQs and the AEQ.
+ * c2_user_mq_t (which is the same format) is for user-mode MQs...
+ */
+#define C2_MQ_MAGIC 0x4d512020 /* 'MQ  ' */
+struct c2_mq {
+       u32 magic;
+       union {
+               u8 *host;
+               u8 __iomem *adapter;
+       } msg_pool;
+       dma_addr_t host_dma;
+       DEFINE_DMA_UNMAP_ADDR(mapping);
+       u16 hint_count;
+       u16 priv;
+       struct c2_mq_shared __iomem *peer;
+       __be16 *shared;
+       dma_addr_t shared_dma;
+       u32 q_size;
+       u32 msg_size;
+       u32 index;
+       enum c2_mq_type type;
+};
+
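+/* The MQ is empty when the private index has caught up with the shared index. */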
+static __inline__ int c2_mq_empty(struct c2_mq *q)
+{
+       return q->priv == be16_to_cpu(*q->shared);
+}
+
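+/* One slot is always left unused so that a full MQ can be distinguished from an empty one. */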
+static __inline__ int c2_mq_full(struct c2_mq *q)
+{
+       return q->priv == (be16_to_cpu(*q->shared) + q->q_size - 1) % q->q_size;
+}
+
+extern void c2_mq_lconsume(struct c2_mq *q, u32 wqe_count);
+extern void *c2_mq_alloc(struct c2_mq *q);
+extern void c2_mq_produce(struct c2_mq *q);
+extern void *c2_mq_consume(struct c2_mq *q);
+extern void c2_mq_free(struct c2_mq *q);
+extern void c2_mq_req_init(struct c2_mq *q, u32 index, u32 q_size, u32 msg_size,
+                      u8 __iomem *pool_start, u16 __iomem *peer, u32 type);
+extern void c2_mq_rep_init(struct c2_mq *q, u32 index, u32 q_size, u32 msg_size,
+                          u8 *pool_start, u16 __iomem *peer, u32 type);
+
+#endif                         /* _C2_MQ_H_ */
diff --git a/drivers/staging/rdma/amso1100/c2_pd.c b/drivers/staging/rdma/amso1100/c2_pd.c
new file mode 100644 (file)
index 0000000..f3e81dc
--- /dev/null
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+
+#include "c2.h"
+#include "c2_provider.h"
+
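+/* Allocate a PD id from the device's bitmap, scanning round-robin from the last allocation. */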
+int c2_pd_alloc(struct c2_dev *c2dev, int privileged, struct c2_pd *pd)
+{
+       u32 obj;
+       int ret = 0;
+
+       spin_lock(&c2dev->pd_table.lock);
+       obj = find_next_zero_bit(c2dev->pd_table.table, c2dev->pd_table.max,
+                                c2dev->pd_table.last);
+       if (obj >= c2dev->pd_table.max)
+               obj = find_first_zero_bit(c2dev->pd_table.table,
+                                         c2dev->pd_table.max);
+       if (obj < c2dev->pd_table.max) {
+               pd->pd_id = obj;
+               __set_bit(obj, c2dev->pd_table.table);
+               c2dev->pd_table.last = obj+1;
+               if (c2dev->pd_table.last >= c2dev->pd_table.max)
+                       c2dev->pd_table.last = 0;
+       } else
+               ret = -ENOMEM;
+       spin_unlock(&c2dev->pd_table.lock);
+       return ret;
+}
+
+void c2_pd_free(struct c2_dev *c2dev, struct c2_pd *pd)
+{
+       spin_lock(&c2dev->pd_table.lock);
+       __clear_bit(pd->pd_id, c2dev->pd_table.table);
+       spin_unlock(&c2dev->pd_table.lock);
+}
+
+int c2_init_pd_table(struct c2_dev *c2dev)
+{
+
+       c2dev->pd_table.last = 0;
+       c2dev->pd_table.max = c2dev->props.max_pd;
+       spin_lock_init(&c2dev->pd_table.lock);
+       c2dev->pd_table.table = kmalloc(BITS_TO_LONGS(c2dev->props.max_pd) *
+                                       sizeof(long), GFP_KERNEL);
+       if (!c2dev->pd_table.table)
+               return -ENOMEM;
+       bitmap_zero(c2dev->pd_table.table, c2dev->props.max_pd);
+       return 0;
+}
+
+void c2_cleanup_pd_table(struct c2_dev *c2dev)
+{
+       kfree(c2dev->pd_table.table);
+}
diff --git a/drivers/staging/rdma/amso1100/c2_provider.c b/drivers/staging/rdma/amso1100/c2_provider.c
new file mode 100644 (file)
index 0000000..25c3f00
--- /dev/null
@@ -0,0 +1,912 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/pci.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/delay.h>
+#include <linux/ethtool.h>
+#include <linux/mii.h>
+#include <linux/if_vlan.h>
+#include <linux/crc32.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/init.h>
+#include <linux/dma-mapping.h>
+#include <linux/if_arp.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+
+#include <asm/io.h>
+#include <asm/irq.h>
+#include <asm/byteorder.h>
+
+#include <rdma/ib_smi.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_user_verbs.h>
+#include "c2.h"
+#include "c2_provider.h"
+#include "c2_user.h"
+
+static int c2_query_device(struct ib_device *ibdev, struct ib_device_attr *props,
+                          struct ib_udata *uhw)
+{
+       struct c2_dev *c2dev = to_c2dev(ibdev);
+
+       pr_debug("%s:%u\n", __func__, __LINE__);
+
+       if (uhw->inlen || uhw->outlen)
+               return -EINVAL;
+
+       *props = c2dev->props;
+       return 0;
+}
+
+static int c2_query_port(struct ib_device *ibdev,
+                        u8 port, struct ib_port_attr *props)
+{
+       pr_debug("%s:%u\n", __func__, __LINE__);
+
+       props->max_mtu = IB_MTU_4096;
+       props->lid = 0;
+       props->lmc = 0;
+       props->sm_lid = 0;
+       props->sm_sl = 0;
+       props->state = IB_PORT_ACTIVE;
+       props->phys_state = 0;
+       props->port_cap_flags =
+           IB_PORT_CM_SUP |
+           IB_PORT_REINIT_SUP |
+           IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP;
+       props->gid_tbl_len = 1;
+       props->pkey_tbl_len = 1;
+       props->qkey_viol_cntr = 0;
+       props->active_width = 1;
+       props->active_speed = IB_SPEED_SDR;
+
+       return 0;
+}
+
+static int c2_query_pkey(struct ib_device *ibdev,
+                        u8 port, u16 index, u16 * pkey)
+{
+       pr_debug("%s:%u\n", __func__, __LINE__);
+       *pkey = 0;
+       return 0;
+}
+
+static int c2_query_gid(struct ib_device *ibdev, u8 port,
+                       int index, union ib_gid *gid)
+{
+       struct c2_dev *c2dev = to_c2dev(ibdev);
+
+       pr_debug("%s:%u\n", __func__, __LINE__);
+       memset(&(gid->raw[0]), 0, sizeof(gid->raw));
+       memcpy(&(gid->raw[0]), c2dev->pseudo_netdev->dev_addr, 6);
+
+       return 0;
+}
+
+/* Allocate the user context data structure. This keeps track
+ * of all objects associated with a particular user-mode client.
+ */
+static struct ib_ucontext *c2_alloc_ucontext(struct ib_device *ibdev,
+                                            struct ib_udata *udata)
+{
+       struct c2_ucontext *context;
+
+       pr_debug("%s:%u\n", __func__, __LINE__);
+       context = kmalloc(sizeof(*context), GFP_KERNEL);
+       if (!context)
+               return ERR_PTR(-ENOMEM);
+
+       return &context->ibucontext;
+}
+
+static int c2_dealloc_ucontext(struct ib_ucontext *context)
+{
+       pr_debug("%s:%u\n", __func__, __LINE__);
+       kfree(context);
+       return 0;
+}
+
+static int c2_mmap_uar(struct ib_ucontext *context, struct vm_area_struct *vma)
+{
+       pr_debug("%s:%u\n", __func__, __LINE__);
+       return -ENOSYS;
+}
+
+static struct ib_pd *c2_alloc_pd(struct ib_device *ibdev,
+                                struct ib_ucontext *context,
+                                struct ib_udata *udata)
+{
+       struct c2_pd *pd;
+       int err;
+
+       pr_debug("%s:%u\n", __func__, __LINE__);
+
+       pd = kmalloc(sizeof(*pd), GFP_KERNEL);
+       if (!pd)
+               return ERR_PTR(-ENOMEM);
+
+       err = c2_pd_alloc(to_c2dev(ibdev), !context, pd);
+       if (err) {
+               kfree(pd);
+               return ERR_PTR(err);
+       }
+
+       if (context) {
+               if (ib_copy_to_udata(udata, &pd->pd_id, sizeof(__u32))) {
+                       c2_pd_free(to_c2dev(ibdev), pd);
+                       kfree(pd);
+                       return ERR_PTR(-EFAULT);
+               }
+       }
+
+       return &pd->ibpd;
+}
+
+static int c2_dealloc_pd(struct ib_pd *pd)
+{
+       pr_debug("%s:%u\n", __func__, __LINE__);
+       c2_pd_free(to_c2dev(pd->device), to_c2pd(pd));
+       kfree(pd);
+
+       return 0;
+}
+
+static struct ib_ah *c2_ah_create(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
+{
+       pr_debug("%s:%u\n", __func__, __LINE__);
+       return ERR_PTR(-ENOSYS);
+}
+
+static int c2_ah_destroy(struct ib_ah *ah)
+{
+       pr_debug("%s:%u\n", __func__, __LINE__);
+       return -ENOSYS;
+}
+
+static void c2_add_ref(struct ib_qp *ibqp)
+{
+       struct c2_qp *qp;
+       BUG_ON(!ibqp);
+       qp = to_c2qp(ibqp);
+       atomic_inc(&qp->refcount);
+}
+
+static void c2_rem_ref(struct ib_qp *ibqp)
+{
+       struct c2_qp *qp;
+       BUG_ON(!ibqp);
+       qp = to_c2qp(ibqp);
+       if (atomic_dec_and_test(&qp->refcount))
+               wake_up(&qp->wait);
+}
+
+struct ib_qp *c2_get_qp(struct ib_device *device, int qpn)
+{
+       struct c2_dev* c2dev = to_c2dev(device);
+       struct c2_qp *qp;
+
+       qp = c2_find_qpn(c2dev, qpn);
+       pr_debug("%s Returning QP=%p for QPN=%d, device=%p, refcount=%d\n",
+               __func__, qp, qpn, device,
+               (qp?atomic_read(&qp->refcount):0));
+
+       return (qp?&qp->ibqp:NULL);
+}
+
+static struct ib_qp *c2_create_qp(struct ib_pd *pd,
+                                 struct ib_qp_init_attr *init_attr,
+                                 struct ib_udata *udata)
+{
+       struct c2_qp *qp;
+       int err;
+
+       pr_debug("%s:%u\n", __func__, __LINE__);
+
+       if (init_attr->create_flags)
+               return ERR_PTR(-EINVAL);
+
+       switch (init_attr->qp_type) {
+       case IB_QPT_RC:
+               qp = kzalloc(sizeof(*qp), GFP_KERNEL);
+               if (!qp) {
+                       pr_debug("%s: Unable to allocate QP\n", __func__);
+                       return ERR_PTR(-ENOMEM);
+               }
+               spin_lock_init(&qp->lock);
+               if (pd->uobject) {
+                       /* userspace specific */
+               }
+
+               err = c2_alloc_qp(to_c2dev(pd->device),
+                                 to_c2pd(pd), init_attr, qp);
+
+               if (err && pd->uobject) {
+                       /* userspace specific */
+               }
+
+               break;
+       default:
+               pr_debug("%s: Invalid QP type: %d\n", __func__,
+                       init_attr->qp_type);
+               return ERR_PTR(-EINVAL);
+       }
+
+       if (err) {
+               kfree(qp);
+               return ERR_PTR(err);
+       }
+
+       return &qp->ibqp;
+}
+
+static int c2_destroy_qp(struct ib_qp *ib_qp)
+{
+       struct c2_qp *qp = to_c2qp(ib_qp);
+
+       pr_debug("%s:%u qp=%p,qp->state=%d\n",
+               __func__, __LINE__, ib_qp, qp->state);
+       c2_free_qp(to_c2dev(ib_qp->device), qp);
+       kfree(qp);
+       return 0;
+}
+
+static struct ib_cq *c2_create_cq(struct ib_device *ibdev,
+                                 const struct ib_cq_init_attr *attr,
+                                 struct ib_ucontext *context,
+                                 struct ib_udata *udata)
+{
+       int entries = attr->cqe;
+       struct c2_cq *cq;
+       int err;
+
+       if (attr->flags)
+               return ERR_PTR(-EINVAL);
+
+       cq = kmalloc(sizeof(*cq), GFP_KERNEL);
+       if (!cq) {
+               pr_debug("%s: Unable to allocate CQ\n", __func__);
+               return ERR_PTR(-ENOMEM);
+       }
+
+       err = c2_init_cq(to_c2dev(ibdev), entries, NULL, cq);
+       if (err) {
+               pr_debug("%s: error initializing CQ\n", __func__);
+               kfree(cq);
+               return ERR_PTR(err);
+       }
+
+       return &cq->ibcq;
+}
+
+static int c2_destroy_cq(struct ib_cq *ib_cq)
+{
+       struct c2_cq *cq = to_c2cq(ib_cq);
+
+       pr_debug("%s:%u\n", __func__, __LINE__);
+
+       c2_free_cq(to_c2dev(ib_cq->device), cq);
+       kfree(cq);
+
+       return 0;
+}
+
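+/* Translate IB access flags into the adapter's C2_ACF_* bits; local read and window bind are always enabled. */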
+static inline u32 c2_convert_access(int acc)
+{
+       return (acc & IB_ACCESS_REMOTE_WRITE ? C2_ACF_REMOTE_WRITE : 0) |
+           (acc & IB_ACCESS_REMOTE_READ ? C2_ACF_REMOTE_READ : 0) |
+           (acc & IB_ACCESS_LOCAL_WRITE ? C2_ACF_LOCAL_WRITE : 0) |
+           C2_ACF_LOCAL_READ | C2_ACF_WINDOW_BIND;
+}
+
+static struct ib_mr *c2_reg_phys_mr(struct ib_pd *ib_pd,
+                                   struct ib_phys_buf *buffer_list,
+                                   int num_phys_buf, int acc, u64 * iova_start)
+{
+       struct c2_mr *mr;
+       u64 *page_list;
+       u32 total_len;
+       int err, i, j, k, page_shift, pbl_depth;
+
+       pbl_depth = 0;
+       total_len = 0;
+
+       page_shift = PAGE_SHIFT;
+       /*
+        * If there is only 1 buffer we assume this could be a map of
+        * all physical memory, so use a larger page size
+        * (PAGE_SHIFT + 3, i.e. 32KB with 4KB base pages).
+        */
+       if (num_phys_buf == 1)
+               page_shift += 3;
+
+       for (i = 0; i < num_phys_buf; i++) {
+
+               if (buffer_list[i].addr & ~PAGE_MASK) {
+                       pr_debug("Unaligned Memory Buffer: 0x%x\n",
+                               (unsigned int) buffer_list[i].addr);
+                       return ERR_PTR(-EINVAL);
+               }
+
+               if (!buffer_list[i].size) {
+                       pr_debug("Invalid Buffer Size\n");
+                       return ERR_PTR(-EINVAL);
+               }
+
+               total_len += buffer_list[i].size;
+               pbl_depth += ALIGN(buffer_list[i].size,
+                                  (1 << page_shift)) >> page_shift;
+       }
+
+       page_list = vmalloc(sizeof(u64) * pbl_depth);
+       if (!page_list) {
+               pr_debug("couldn't vmalloc page_list of size %zd\n",
+                       (sizeof(u64) * pbl_depth));
+               return ERR_PTR(-ENOMEM);
+       }
+
+       for (i = 0, j = 0; i < num_phys_buf; i++) {
+
+               int naddrs;
+
+               naddrs = ALIGN(buffer_list[i].size,
+                              (1 << page_shift)) >> page_shift;
+               for (k = 0; k < naddrs; k++)
+                       page_list[j++] = (buffer_list[i].addr +
+                                                    (k << page_shift));
+       }
+
+       mr = kmalloc(sizeof(*mr), GFP_KERNEL);
+       if (!mr) {
+               vfree(page_list);
+               return ERR_PTR(-ENOMEM);
+       }
+
+       mr->pd = to_c2pd(ib_pd);
+       mr->umem = NULL;
+       pr_debug("%s - page shift %d, pbl_depth %d, total_len %u, "
+               "*iova_start %llx, first pa %llx, last pa %llx\n",
+               __func__, page_shift, pbl_depth, total_len,
+               (unsigned long long) *iova_start,
+               (unsigned long long) page_list[0],
+               (unsigned long long) page_list[pbl_depth-1]);
+       err = c2_nsmr_register_phys_kern(to_c2dev(ib_pd->device), page_list,
+                                        (1 << page_shift), pbl_depth,
+                                        total_len, 0, iova_start,
+                                        c2_convert_access(acc), mr);
+       vfree(page_list);
+       if (err) {
+               kfree(mr);
+               return ERR_PTR(err);
+       }
+
+       return &mr->ibmr;
+}
+
+static struct ib_mr *c2_get_dma_mr(struct ib_pd *pd, int acc)
+{
+       struct ib_phys_buf bl;
+       u64 kva = 0;
+
+       pr_debug("%s:%u\n", __func__, __LINE__);
+
+       /* AMSO1100 limit */
+       bl.size = 0xffffffff;
+       bl.addr = 0;
+       return c2_reg_phys_mr(pd, &bl, 1, acc, &kva);
+}
+
+static struct ib_mr *c2_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+                                   u64 virt, int acc, struct ib_udata *udata)
+{
+       u64 *pages;
+       u64 kva = 0;
+       int shift, n, len;
+       int i, k, entry;
+       int err = 0;
+       struct scatterlist *sg;
+       struct c2_pd *c2pd = to_c2pd(pd);
+       struct c2_mr *c2mr;
+
+       pr_debug("%s:%u\n", __func__, __LINE__);
+
+       c2mr = kmalloc(sizeof(*c2mr), GFP_KERNEL);
+       if (!c2mr)
+               return ERR_PTR(-ENOMEM);
+       c2mr->pd = c2pd;
+
+       c2mr->umem = ib_umem_get(pd->uobject->context, start, length, acc, 0);
+       if (IS_ERR(c2mr->umem)) {
+               err = PTR_ERR(c2mr->umem);
+               kfree(c2mr);
+               return ERR_PTR(err);
+       }
+
+       shift = ffs(c2mr->umem->page_size) - 1;
+       n = c2mr->umem->nmap;
+
+       pages = kmalloc(n * sizeof(u64), GFP_KERNEL);
+       if (!pages) {
+               err = -ENOMEM;
+               goto err;
+       }
+
+       i = 0;
+       for_each_sg(c2mr->umem->sg_head.sgl, sg, c2mr->umem->nmap, entry) {
+               len = sg_dma_len(sg) >> shift;
+               for (k = 0; k < len; ++k) {
+                       pages[i++] =
+                               sg_dma_address(sg) +
+                               (c2mr->umem->page_size * k);
+               }
+       }
+
+       kva = virt;
+       err = c2_nsmr_register_phys_kern(to_c2dev(pd->device),
+                                        pages,
+                                        c2mr->umem->page_size,
+                                        i,
+                                        length,
+                                        ib_umem_offset(c2mr->umem),
+                                        &kva,
+                                        c2_convert_access(acc),
+                                        c2mr);
+       kfree(pages);
+       if (err)
+               goto err;
+       return &c2mr->ibmr;
+
+err:
+       ib_umem_release(c2mr->umem);
+       kfree(c2mr);
+       return ERR_PTR(err);
+}
+
+static int c2_dereg_mr(struct ib_mr *ib_mr)
+{
+       struct c2_mr *mr = to_c2mr(ib_mr);
+       int err;
+
+       pr_debug("%s:%u\n", __func__, __LINE__);
+
+       err = c2_stag_dealloc(to_c2dev(ib_mr->device), ib_mr->lkey);
+       if (err)
+               pr_debug("c2_stag_dealloc failed: %d\n", err);
+       else {
+               if (mr->umem)
+                       ib_umem_release(mr->umem);
+               kfree(mr);
+       }
+
+       return err;
+}
+
+static ssize_t show_rev(struct device *dev, struct device_attribute *attr,
+                       char *buf)
+{
+       struct c2_dev *c2dev = container_of(dev, struct c2_dev, ibdev.dev);
+       pr_debug("%s:%u\n", __func__, __LINE__);
+       return sprintf(buf, "%x\n", c2dev->props.hw_ver);
+}
+
+static ssize_t show_fw_ver(struct device *dev, struct device_attribute *attr,
+                          char *buf)
+{
+       struct c2_dev *c2dev = container_of(dev, struct c2_dev, ibdev.dev);
+       pr_debug("%s:%u\n", __func__, __LINE__);
+       return sprintf(buf, "%x.%x.%x\n",
+                      (int) (c2dev->props.fw_ver >> 32),
+                      (int) (c2dev->props.fw_ver >> 16) & 0xffff,
+                      (int) (c2dev->props.fw_ver & 0xffff));
+}
+
+static ssize_t show_hca(struct device *dev, struct device_attribute *attr,
+                       char *buf)
+{
+       pr_debug("%s:%u\n", __func__, __LINE__);
+       return sprintf(buf, "AMSO1100\n");
+}
+
+static ssize_t show_board(struct device *dev, struct device_attribute *attr,
+                         char *buf)
+{
+       pr_debug("%s:%u\n", __func__, __LINE__);
+       return sprintf(buf, "%.*s\n", 32, "AMSO1100 Board ID");
+}
+
+static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
+static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
+static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
+static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL);
+
+static struct device_attribute *c2_dev_attributes[] = {
+       &dev_attr_hw_rev,
+       &dev_attr_fw_ver,
+       &dev_attr_hca_type,
+       &dev_attr_board_id
+};
+
+static int c2_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+                       int attr_mask, struct ib_udata *udata)
+{
+       int err;
+
+       err =
+           c2_qp_modify(to_c2dev(ibqp->device), to_c2qp(ibqp), attr,
+                        attr_mask);
+
+       return err;
+}
+
+static int c2_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+       pr_debug("%s:%u\n", __func__, __LINE__);
+       return -ENOSYS;
+}
+
+static int c2_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+       pr_debug("%s:%u\n", __func__, __LINE__);
+       return -ENOSYS;
+}
+
+static int c2_process_mad(struct ib_device *ibdev,
+                         int mad_flags,
+                         u8 port_num,
+                         const struct ib_wc *in_wc,
+                         const struct ib_grh *in_grh,
+                         const struct ib_mad_hdr *in_mad,
+                         size_t in_mad_size,
+                         struct ib_mad_hdr *out_mad,
+                         size_t *out_mad_size,
+                         u16 *out_mad_pkey_index)
+{
+       pr_debug("%s:%u\n", __func__, __LINE__);
+       return -ENOSYS;
+}
+
+static int c2_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
+{
+       pr_debug("%s:%u\n", __func__, __LINE__);
+
+       /* Request a connection */
+       return c2_llp_connect(cm_id, iw_param);
+}
+
+static int c2_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
+{
+       pr_debug("%s:%u\n", __func__, __LINE__);
+
+       /* Accept the new connection */
+       return c2_llp_accept(cm_id, iw_param);
+}
+
+static int c2_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len)
+{
+       int err;
+
+       pr_debug("%s:%u\n", __func__, __LINE__);
+
+       err = c2_llp_reject(cm_id, pdata, pdata_len);
+       return err;
+}
+
+static int c2_service_create(struct iw_cm_id *cm_id, int backlog)
+{
+       int err;
+
+       pr_debug("%s:%u\n", __func__, __LINE__);
+       err = c2_llp_service_create(cm_id, backlog);
+       pr_debug("%s:%u err=%d\n",
+               __func__, __LINE__,
+               err);
+       return err;
+}
+
+static int c2_service_destroy(struct iw_cm_id *cm_id)
+{
+       int err;
+       pr_debug("%s:%u\n", __func__, __LINE__);
+
+       err = c2_llp_service_destroy(cm_id);
+
+       return err;
+}
+
+static int c2_pseudo_up(struct net_device *netdev)
+{
+       struct in_device *ind;
+       struct c2_dev *c2dev = netdev->ml_priv;
+
+       ind = in_dev_get(netdev);
+       if (!ind)
+               return 0;
+
+       pr_debug("adding...\n");
+       for_ifa(ind) {
+#ifdef DEBUG
+               u8 *ip = (u8 *) & ifa->ifa_address;
+
+               pr_debug("%s: %d.%d.%d.%d\n",
+                      ifa->ifa_label, ip[0], ip[1], ip[2], ip[3]);
+#endif
+               c2_add_addr(c2dev, ifa->ifa_address, ifa->ifa_mask);
+       }
+       endfor_ifa(ind);
+       in_dev_put(ind);
+
+       return 0;
+}
+
+static int c2_pseudo_down(struct net_device *netdev)
+{
+       struct in_device *ind;
+       struct c2_dev *c2dev = netdev->ml_priv;
+
+       ind = in_dev_get(netdev);
+       if (!ind)
+               return 0;
+
+       pr_debug("deleting...\n");
+       for_ifa(ind) {
+#ifdef DEBUG
+               u8 *ip = (u8 *) & ifa->ifa_address;
+
+               pr_debug("%s: %d.%d.%d.%d\n",
+                      ifa->ifa_label, ip[0], ip[1], ip[2], ip[3]);
+#endif
+               c2_del_addr(c2dev, ifa->ifa_address, ifa->ifa_mask);
+       }
+       endfor_ifa(ind);
+       in_dev_put(ind);
+
+       return 0;
+}
+
+static int c2_pseudo_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
+{
+       kfree_skb(skb);
+       return NETDEV_TX_OK;
+}
+
+static int c2_pseudo_change_mtu(struct net_device *netdev, int new_mtu)
+{
+       if (new_mtu < ETH_ZLEN || new_mtu > ETH_JUMBO_MTU)
+               return -EINVAL;
+
+       netdev->mtu = new_mtu;
+
+       /* TODO: Tell rnic about new rdma interface mtu */
+       return 0;
+}
+
+static const struct net_device_ops c2_pseudo_netdev_ops = {
+       .ndo_open               = c2_pseudo_up,
+       .ndo_stop               = c2_pseudo_down,
+       .ndo_start_xmit         = c2_pseudo_xmit_frame,
+       .ndo_change_mtu         = c2_pseudo_change_mtu,
+       .ndo_validate_addr      = eth_validate_addr,
+};
+
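+/* Basic setup for the iwXX pseudo interface: Ethernet-like, no ARP, no TX queue. */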
+static void setup(struct net_device *netdev)
+{
+       netdev->netdev_ops = &c2_pseudo_netdev_ops;
+
+       netdev->watchdog_timeo = 0;
+       netdev->type = ARPHRD_ETHER;
+       netdev->mtu = 1500;
+       netdev->hard_header_len = ETH_HLEN;
+       netdev->addr_len = ETH_ALEN;
+       netdev->tx_queue_len = 0;
+       netdev->flags |= IFF_NOARP;
+}
+
+static struct net_device *c2_pseudo_netdev_init(struct c2_dev *c2dev)
+{
+       char name[IFNAMSIZ];
+       struct net_device *netdev;
+
+       /* change ethxxx to iwxxx */
+       strcpy(name, "iw");
+       strcat(name, &c2dev->netdev->name[3]);
+       netdev = alloc_netdev(0, name, NET_NAME_UNKNOWN, setup);
+       if (!netdev) {
+               printk(KERN_ERR PFX "%s - etherdev alloc failed\n",
+                       __func__);
+               return NULL;
+       }
+
+       netdev->ml_priv = c2dev;
+
+       SET_NETDEV_DEV(netdev, &c2dev->pcidev->dev);
+
+       memcpy_fromio(netdev->dev_addr, c2dev->kva + C2_REGS_RDMA_ENADDR, 6);
+
+       /* Print out the MAC address */
+       pr_debug("%s: MAC %pM\n", netdev->name, netdev->dev_addr);
+
+#if 0
+       /* Disable network packets */
+       netif_stop_queue(netdev);
+#endif
+       return netdev;
+}
+
+static int c2_port_immutable(struct ib_device *ibdev, u8 port_num,
+                            struct ib_port_immutable *immutable)
+{
+       struct ib_port_attr attr;
+       int err;
+
+       err = c2_query_port(ibdev, port_num, &attr);
+       if (err)
+               return err;
+
+       immutable->pkey_tbl_len = attr.pkey_tbl_len;
+       immutable->gid_tbl_len = attr.gid_tbl_len;
+       immutable->core_cap_flags = RDMA_CORE_PORT_IWARP;
+
+       return 0;
+}
+
+int c2_register_device(struct c2_dev *dev)
+{
+       int ret = -ENOMEM;
+       int i;
+
+       /* Register pseudo network device */
+       dev->pseudo_netdev = c2_pseudo_netdev_init(dev);
+       if (!dev->pseudo_netdev)
+               goto out;
+
+       ret = register_netdev(dev->pseudo_netdev);
+       if (ret)
+               goto out_free_netdev;
+
+       pr_debug("%s:%u\n", __func__, __LINE__);
+       strlcpy(dev->ibdev.name, "amso%d", IB_DEVICE_NAME_MAX);
+       dev->ibdev.owner = THIS_MODULE;
+       dev->ibdev.uverbs_cmd_mask =
+           (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
+           (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
+           (1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
+           (1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
+           (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
+           (1ull << IB_USER_VERBS_CMD_REG_MR) |
+           (1ull << IB_USER_VERBS_CMD_DEREG_MR) |
+           (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+           (1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
+           (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
+           (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) |
+           (1ull << IB_USER_VERBS_CMD_CREATE_QP) |
+           (1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
+           (1ull << IB_USER_VERBS_CMD_POLL_CQ) |
+           (1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
+           (1ull << IB_USER_VERBS_CMD_POST_SEND) |
+           (1ull << IB_USER_VERBS_CMD_POST_RECV);
+
+       dev->ibdev.node_type = RDMA_NODE_RNIC;
+       memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid));
+       memcpy(&dev->ibdev.node_guid, dev->pseudo_netdev->dev_addr, 6);
+       dev->ibdev.phys_port_cnt = 1;
+       dev->ibdev.num_comp_vectors = 1;
+       dev->ibdev.dma_device = &dev->pcidev->dev;
+       dev->ibdev.query_device = c2_query_device;
+       dev->ibdev.query_port = c2_query_port;
+       dev->ibdev.query_pkey = c2_query_pkey;
+       dev->ibdev.query_gid = c2_query_gid;
+       dev->ibdev.alloc_ucontext = c2_alloc_ucontext;
+       dev->ibdev.dealloc_ucontext = c2_dealloc_ucontext;
+       dev->ibdev.mmap = c2_mmap_uar;
+       dev->ibdev.alloc_pd = c2_alloc_pd;
+       dev->ibdev.dealloc_pd = c2_dealloc_pd;
+       dev->ibdev.create_ah = c2_ah_create;
+       dev->ibdev.destroy_ah = c2_ah_destroy;
+       dev->ibdev.create_qp = c2_create_qp;
+       dev->ibdev.modify_qp = c2_modify_qp;
+       dev->ibdev.destroy_qp = c2_destroy_qp;
+       dev->ibdev.create_cq = c2_create_cq;
+       dev->ibdev.destroy_cq = c2_destroy_cq;
+       dev->ibdev.poll_cq = c2_poll_cq;
+       dev->ibdev.get_dma_mr = c2_get_dma_mr;
+       dev->ibdev.reg_phys_mr = c2_reg_phys_mr;
+       dev->ibdev.reg_user_mr = c2_reg_user_mr;
+       dev->ibdev.dereg_mr = c2_dereg_mr;
+       dev->ibdev.get_port_immutable = c2_port_immutable;
+
+       dev->ibdev.alloc_fmr = NULL;
+       dev->ibdev.unmap_fmr = NULL;
+       dev->ibdev.dealloc_fmr = NULL;
+       dev->ibdev.map_phys_fmr = NULL;
+
+       dev->ibdev.attach_mcast = c2_multicast_attach;
+       dev->ibdev.detach_mcast = c2_multicast_detach;
+       dev->ibdev.process_mad = c2_process_mad;
+
+       dev->ibdev.req_notify_cq = c2_arm_cq;
+       dev->ibdev.post_send = c2_post_send;
+       dev->ibdev.post_recv = c2_post_receive;
+
+       dev->ibdev.iwcm = kmalloc(sizeof(*dev->ibdev.iwcm), GFP_KERNEL);
+       if (dev->ibdev.iwcm == NULL) {
+               ret = -ENOMEM;
+               goto out_unregister_netdev;
+       }
+       dev->ibdev.iwcm->add_ref = c2_add_ref;
+       dev->ibdev.iwcm->rem_ref = c2_rem_ref;
+       dev->ibdev.iwcm->get_qp = c2_get_qp;
+       dev->ibdev.iwcm->connect = c2_connect;
+       dev->ibdev.iwcm->accept = c2_accept;
+       dev->ibdev.iwcm->reject = c2_reject;
+       dev->ibdev.iwcm->create_listen = c2_service_create;
+       dev->ibdev.iwcm->destroy_listen = c2_service_destroy;
+
+       ret = ib_register_device(&dev->ibdev, NULL);
+       if (ret)
+               goto out_free_iwcm;
+
+       for (i = 0; i < ARRAY_SIZE(c2_dev_attributes); ++i) {
+               ret = device_create_file(&dev->ibdev.dev,
+                                              c2_dev_attributes[i]);
+               if (ret)
+                       goto out_unregister_ibdev;
+       }
+       goto out;
+
+out_unregister_ibdev:
+       ib_unregister_device(&dev->ibdev);
+out_free_iwcm:
+       kfree(dev->ibdev.iwcm);
+out_unregister_netdev:
+       unregister_netdev(dev->pseudo_netdev);
+out_free_netdev:
+       free_netdev(dev->pseudo_netdev);
+out:
+       pr_debug("%s:%u ret=%d\n", __func__, __LINE__, ret);
+       return ret;
+}
+
+void c2_unregister_device(struct c2_dev *dev)
+{
+       pr_debug("%s:%u\n", __func__, __LINE__);
+       unregister_netdev(dev->pseudo_netdev);
+       free_netdev(dev->pseudo_netdev);
+       ib_unregister_device(&dev->ibdev);
+}
diff --git a/drivers/staging/rdma/amso1100/c2_provider.h b/drivers/staging/rdma/amso1100/c2_provider.h
new file mode 100644 (file)
index 0000000..bf18998
--- /dev/null
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef C2_PROVIDER_H
+#define C2_PROVIDER_H
+#include <linux/inetdevice.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_pack.h>
+
+#include "c2_mq.h"
+#include <rdma/iw_cm.h>
+
+#define C2_MPT_FLAG_ATOMIC        (1 << 14)
+#define C2_MPT_FLAG_REMOTE_WRITE  (1 << 13)
+#define C2_MPT_FLAG_REMOTE_READ   (1 << 12)
+#define C2_MPT_FLAG_LOCAL_WRITE   (1 << 11)
+#define C2_MPT_FLAG_LOCAL_READ    (1 << 10)
+
+struct c2_buf_list {
+       void *buf;
+       DEFINE_DMA_UNMAP_ADDR(mapping);
+};
+
+
+/* The user context keeps track of objects allocated for a
+ * particular user-mode client. */
+struct c2_ucontext {
+       struct ib_ucontext ibucontext;
+};
+
+struct c2_mtt;
+
+/* All objects associated with a PD are kept in the
+ * associated user context if present.
+ */
+struct c2_pd {
+       struct ib_pd ibpd;
+       u32 pd_id;
+};
+
+struct c2_mr {
+       struct ib_mr ibmr;
+       struct c2_pd *pd;
+       struct ib_umem *umem;
+};
+
+struct c2_av;
+
+enum c2_ah_type {
+       C2_AH_ON_HCA,
+       C2_AH_PCI_POOL,
+       C2_AH_KMALLOC
+};
+
+struct c2_ah {
+       struct ib_ah ibah;
+};
+
+struct c2_cq {
+       struct ib_cq ibcq;
+       spinlock_t lock;
+       atomic_t refcount;
+       int cqn;
+       int is_kernel;
+       wait_queue_head_t wait;
+
+       u32 adapter_handle;
+       struct c2_mq mq;
+};
+
+struct c2_wq {
+       spinlock_t lock;
+};
+struct iw_cm_id;
+struct c2_qp {
+       struct ib_qp ibqp;
+       struct iw_cm_id *cm_id;
+       spinlock_t lock;
+       atomic_t refcount;
+       wait_queue_head_t wait;
+       int qpn;
+
+       u32 adapter_handle;
+       u32 send_sgl_depth;
+       u32 recv_sgl_depth;
+       u32 rdma_write_sgl_depth;
+       u8 state;
+
+       struct c2_mq sq_mq;
+       struct c2_mq rq_mq;
+};
+
+struct c2_cr_query_attrs {
+       u32 local_addr;
+       u32 remote_addr;
+       u16 local_port;
+       u16 remote_port;
+};
+
+static inline struct c2_pd *to_c2pd(struct ib_pd *ibpd)
+{
+       return container_of(ibpd, struct c2_pd, ibpd);
+}
+
+static inline struct c2_ucontext *to_c2ucontext(struct ib_ucontext *ibucontext)
+{
+       return container_of(ibucontext, struct c2_ucontext, ibucontext);
+}
+
+static inline struct c2_mr *to_c2mr(struct ib_mr *ibmr)
+{
+       return container_of(ibmr, struct c2_mr, ibmr);
+}
+
+
+static inline struct c2_ah *to_c2ah(struct ib_ah *ibah)
+{
+       return container_of(ibah, struct c2_ah, ibah);
+}
+
+static inline struct c2_cq *to_c2cq(struct ib_cq *ibcq)
+{
+       return container_of(ibcq, struct c2_cq, ibcq);
+}
+
+static inline struct c2_qp *to_c2qp(struct ib_qp *ibqp)
+{
+       return container_of(ibqp, struct c2_qp, ibqp);
+}
+
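+/* Return nonzero if addr matches an IPv4 address configured on netdev. */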
+static inline int is_rnic_addr(struct net_device *netdev, u32 addr)
+{
+       struct in_device *ind;
+       int ret = 0;
+
+       ind = in_dev_get(netdev);
+       if (!ind)
+               return 0;
+
+       for_ifa(ind) {
+               if (ifa->ifa_address == addr) {
+                       ret = 1;
+                       break;
+               }
+       }
+       endfor_ifa(ind);
+       in_dev_put(ind);
+       return ret;
+}
+#endif                         /* C2_PROVIDER_H */
diff --git a/drivers/staging/rdma/amso1100/c2_qp.c b/drivers/staging/rdma/amso1100/c2_qp.c
new file mode 100644 (file)
index 0000000..86708de
--- /dev/null
@@ -0,0 +1,1024 @@
+/*
+ * Copyright (c) 2004 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/delay.h>
+#include <linux/gfp.h>
+
+#include "c2.h"
+#include "c2_vq.h"
+#include "c2_status.h"
+
+#define C2_MAX_ORD_PER_QP 128
+#define C2_MAX_IRD_PER_QP 128
+
+#define C2_HINT_MAKE(q_index, hint_count) (((q_index) << 16) | hint_count)
+#define C2_HINT_GET_INDEX(hint) (((hint) & 0x7FFF0000) >> 16)
+#define C2_HINT_GET_COUNT(hint) ((hint) & 0x0000FFFF)
+
+#define NO_SUPPORT -1
+static const u8 c2_opcode[] = {
+       [IB_WR_SEND] = C2_WR_TYPE_SEND,
+       [IB_WR_SEND_WITH_IMM] = NO_SUPPORT,
+       [IB_WR_RDMA_WRITE] = C2_WR_TYPE_RDMA_WRITE,
+       [IB_WR_RDMA_WRITE_WITH_IMM] = NO_SUPPORT,
+       [IB_WR_RDMA_READ] = C2_WR_TYPE_RDMA_READ,
+       [IB_WR_ATOMIC_CMP_AND_SWP] = NO_SUPPORT,
+       [IB_WR_ATOMIC_FETCH_AND_ADD] = NO_SUPPORT,
+};
+
+static int to_c2_state(enum ib_qp_state ib_state)
+{
+       switch (ib_state) {
+       case IB_QPS_RESET:
+               return C2_QP_STATE_IDLE;
+       case IB_QPS_RTS:
+               return C2_QP_STATE_RTS;
+       case IB_QPS_SQD:
+               return C2_QP_STATE_CLOSING;
+       case IB_QPS_SQE:
+               return C2_QP_STATE_CLOSING;
+       case IB_QPS_ERR:
+               return C2_QP_STATE_ERROR;
+       default:
+               return -1;
+       }
+}
+
+static int to_ib_state(enum c2_qp_state c2_state)
+{
+       switch (c2_state) {
+       case C2_QP_STATE_IDLE:
+               return IB_QPS_RESET;
+       case C2_QP_STATE_CONNECTING:
+               return IB_QPS_RTR;
+       case C2_QP_STATE_RTS:
+               return IB_QPS_RTS;
+       case C2_QP_STATE_CLOSING:
+               return IB_QPS_SQD;
+       case C2_QP_STATE_ERROR:
+               return IB_QPS_ERR;
+       case C2_QP_STATE_TERMINATE:
+               return IB_QPS_SQE;
+       default:
+               return -1;
+       }
+}
+
+static const char *to_ib_state_str(int ib_state)
+{
+       static const char *state_str[] = {
+               "IB_QPS_RESET",
+               "IB_QPS_INIT",
+               "IB_QPS_RTR",
+               "IB_QPS_RTS",
+               "IB_QPS_SQD",
+               "IB_QPS_SQE",
+               "IB_QPS_ERR"
+       };
+       if (ib_state < IB_QPS_RESET ||
+           ib_state > IB_QPS_ERR)
+               return "<invalid IB QP state>";
+
+       ib_state -= IB_QPS_RESET;
+       return state_str[ib_state];
+}
+
+void c2_set_qp_state(struct c2_qp *qp, int c2_state)
+{
+       int new_state = to_ib_state(c2_state);
+
+       pr_debug("%s: qp[%p] state modify %s --> %s\n",
+              __func__,
+               qp,
+               to_ib_state_str(qp->state),
+               to_ib_state_str(new_state));
+       qp->state = new_state;
+}
+
+#define C2_QP_NO_ATTR_CHANGE 0xFFFFFFFF
+
+int c2_qp_modify(struct c2_dev *c2dev, struct c2_qp *qp,
+                struct ib_qp_attr *attr, int attr_mask)
+{
+       struct c2wr_qp_modify_req wr;
+       struct c2wr_qp_modify_rep *reply;
+       struct c2_vq_req *vq_req;
+       unsigned long flags;
+       u8 next_state;
+       int err;
+
+       pr_debug("%s:%d qp=%p, %s --> %s\n",
+               __func__, __LINE__,
+               qp,
+               to_ib_state_str(qp->state),
+               to_ib_state_str(attr->qp_state));
+
+       vq_req = vq_req_alloc(c2dev);
+       if (!vq_req)
+               return -ENOMEM;
+
+       c2_wr_set_id(&wr, CCWR_QP_MODIFY);
+       wr.hdr.context = (unsigned long) vq_req;
+       wr.rnic_handle = c2dev->adapter_handle;
+       wr.qp_handle = qp->adapter_handle;
+       wr.ord = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
+       wr.ird = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
+       wr.sq_depth = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
+       wr.rq_depth = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
+
+       if (attr_mask & IB_QP_STATE) {
+               /* Ensure the state is valid */
+               if (attr->qp_state < 0 || attr->qp_state > IB_QPS_ERR) {
+                       err = -EINVAL;
+                       goto bail0;
+               }
+
+               wr.next_qp_state = cpu_to_be32(to_c2_state(attr->qp_state));
+
+               if (attr->qp_state == IB_QPS_ERR) {
+                       spin_lock_irqsave(&qp->lock, flags);
+                       if (qp->cm_id && qp->state == IB_QPS_RTS) {
+                               pr_debug("Generating CLOSE event for QP-->ERR, "
+                                       "qp=%p, cm_id=%p\n",qp,qp->cm_id);
+                               /* Generate a CLOSE event */
+                               vq_req->cm_id = qp->cm_id;
+                               vq_req->event = IW_CM_EVENT_CLOSE;
+                       }
+                       spin_unlock_irqrestore(&qp->lock, flags);
+               }
+               next_state =  attr->qp_state;
+
+       } else if (attr_mask & IB_QP_CUR_STATE) {
+
+               if (attr->cur_qp_state != IB_QPS_RTR &&
+                   attr->cur_qp_state != IB_QPS_RTS &&
+                   attr->cur_qp_state != IB_QPS_SQD &&
+                   attr->cur_qp_state != IB_QPS_SQE) {
+                       err = -EINVAL;
+                       goto bail0;
+               } else
+                       wr.next_qp_state =
+                           cpu_to_be32(to_c2_state(attr->cur_qp_state));
+
+               next_state = attr->cur_qp_state;
+
+       } else {
+               err = 0;
+               goto bail0;
+       }
+
+       /* reference the request struct */
+       vq_req_get(c2dev, vq_req);
+
+       err = vq_send_wr(c2dev, (union c2wr *) & wr);
+       if (err) {
+               vq_req_put(c2dev, vq_req);
+               goto bail0;
+       }
+
+       err = vq_wait_for_reply(c2dev, vq_req);
+       if (err)
+               goto bail0;
+
+       reply = (struct c2wr_qp_modify_rep *) (unsigned long) vq_req->reply_msg;
+       if (!reply) {
+               err = -ENOMEM;
+               goto bail0;
+       }
+
+       err = c2_errno(reply);
+       if (!err)
+               qp->state = next_state;
+#ifdef DEBUG
+       else
+               pr_debug("%s: c2_errno=%d\n", __func__, err);
+#endif
+       /*
+        * If we're transitioning the QP to error and generating the CLOSE
+        * event here, then we need to drop the cm_id reference because the
+        * adapter will not generate a close event of its own.
+        */
+       spin_lock_irqsave(&qp->lock, flags);
+       if (vq_req->event==IW_CM_EVENT_CLOSE && qp->cm_id) {
+               qp->cm_id->rem_ref(qp->cm_id);
+               qp->cm_id = NULL;
+       }
+       spin_unlock_irqrestore(&qp->lock, flags);
+
+       vq_repbuf_free(c2dev, reply);
+      bail0:
+       vq_req_free(c2dev, vq_req);
+
+       pr_debug("%s:%d qp=%p, cur_state=%s\n",
+               __func__, __LINE__,
+               qp,
+               to_ib_state_str(qp->state));
+       return err;
+}
+
+int c2_qp_set_read_limits(struct c2_dev *c2dev, struct c2_qp *qp,
+                         int ord, int ird)
+{
+       struct c2wr_qp_modify_req wr;
+       struct c2wr_qp_modify_rep *reply;
+       struct c2_vq_req *vq_req;
+       int err;
+
+       vq_req = vq_req_alloc(c2dev);
+       if (!vq_req)
+               return -ENOMEM;
+
+       c2_wr_set_id(&wr, CCWR_QP_MODIFY);
+       wr.hdr.context = (unsigned long) vq_req;
+       wr.rnic_handle = c2dev->adapter_handle;
+       wr.qp_handle = qp->adapter_handle;
+       wr.ord = cpu_to_be32(ord);
+       wr.ird = cpu_to_be32(ird);
+       wr.sq_depth = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
+       wr.rq_depth = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
+       wr.next_qp_state = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
+
+       /* reference the request struct */
+       vq_req_get(c2dev, vq_req);
+
+       err = vq_send_wr(c2dev, (union c2wr *) & wr);
+       if (err) {
+               vq_req_put(c2dev, vq_req);
+               goto bail0;
+       }
+
+       err = vq_wait_for_reply(c2dev, vq_req);
+       if (err)
+               goto bail0;
+
+       reply = (struct c2wr_qp_modify_rep *) (unsigned long)
+               vq_req->reply_msg;
+       if (!reply) {
+               err = -ENOMEM;
+               goto bail0;
+       }
+
+       err = c2_errno(reply);
+       vq_repbuf_free(c2dev, reply);
+      bail0:
+       vq_req_free(c2dev, vq_req);
+       return err;
+}
+
+static int destroy_qp(struct c2_dev *c2dev, struct c2_qp *qp)
+{
+       struct c2_vq_req *vq_req;
+       struct c2wr_qp_destroy_req wr;
+       struct c2wr_qp_destroy_rep *reply;
+       unsigned long flags;
+       int err;
+
+       /*
+        * Allocate a verb request message
+        */
+       vq_req = vq_req_alloc(c2dev);
+       if (!vq_req) {
+               return -ENOMEM;
+       }
+
+       /*
+        * Initialize the WR
+        */
+       c2_wr_set_id(&wr, CCWR_QP_DESTROY);
+       wr.hdr.context = (unsigned long) vq_req;
+       wr.rnic_handle = c2dev->adapter_handle;
+       wr.qp_handle = qp->adapter_handle;
+
+       /*
+        * Reference the request struct.  It is dereferenced in the interrupt handler.
+        */
+       vq_req_get(c2dev, vq_req);
+
+       spin_lock_irqsave(&qp->lock, flags);
+       if (qp->cm_id && qp->state == IB_QPS_RTS) {
+               pr_debug("destroy_qp: generating CLOSE event for QP-->ERR, "
+                       "qp=%p, cm_id=%p\n",qp,qp->cm_id);
+                       /* Generate a CLOSE event */
+               vq_req->qp = qp;
+               vq_req->cm_id = qp->cm_id;
+               vq_req->event = IW_CM_EVENT_CLOSE;
+       }
+       spin_unlock_irqrestore(&qp->lock, flags);
+
+       /*
+        * Send WR to adapter
+        */
+       err = vq_send_wr(c2dev, (union c2wr *) & wr);
+       if (err) {
+               vq_req_put(c2dev, vq_req);
+               goto bail0;
+       }
+
+       /*
+        * Wait for reply from adapter
+        */
+       err = vq_wait_for_reply(c2dev, vq_req);
+       if (err) {
+               goto bail0;
+       }
+
+       /*
+        * Process reply
+        */
+       reply = (struct c2wr_qp_destroy_rep *) (unsigned long) (vq_req->reply_msg);
+       if (!reply) {
+               err = -ENOMEM;
+               goto bail0;
+       }
+
+       spin_lock_irqsave(&qp->lock, flags);
+       if (qp->cm_id) {
+               qp->cm_id->rem_ref(qp->cm_id);
+               qp->cm_id = NULL;
+       }
+       spin_unlock_irqrestore(&qp->lock, flags);
+
+       vq_repbuf_free(c2dev, reply);
+      bail0:
+       vq_req_free(c2dev, vq_req);
+       return err;
+}
+
+static int c2_alloc_qpn(struct c2_dev *c2dev, struct c2_qp *qp)
+{
+       int ret;
+
+       idr_preload(GFP_KERNEL);
+       spin_lock_irq(&c2dev->qp_table.lock);
+
+       ret = idr_alloc_cyclic(&c2dev->qp_table.idr, qp, 0, 0, GFP_NOWAIT);
+       if (ret >= 0)
+               qp->qpn = ret;
+
+       spin_unlock_irq(&c2dev->qp_table.lock);
+       idr_preload_end();
+       return ret < 0 ? ret : 0;
+}
+
+static void c2_free_qpn(struct c2_dev *c2dev, int qpn)
+{
+       spin_lock_irq(&c2dev->qp_table.lock);
+       idr_remove(&c2dev->qp_table.idr, qpn);
+       spin_unlock_irq(&c2dev->qp_table.lock);
+}
+
+struct c2_qp *c2_find_qpn(struct c2_dev *c2dev, int qpn)
+{
+       unsigned long flags;
+       struct c2_qp *qp;
+
+       spin_lock_irqsave(&c2dev->qp_table.lock, flags);
+       qp = idr_find(&c2dev->qp_table.idr, qpn);
+       spin_unlock_irqrestore(&c2dev->qp_table.lock, flags);
+       return qp;
+}
+
+int c2_alloc_qp(struct c2_dev *c2dev,
+               struct c2_pd *pd,
+               struct ib_qp_init_attr *qp_attrs, struct c2_qp *qp)
+{
+       struct c2wr_qp_create_req wr;
+       struct c2wr_qp_create_rep *reply;
+       struct c2_vq_req *vq_req;
+       struct c2_cq *send_cq = to_c2cq(qp_attrs->send_cq);
+       struct c2_cq *recv_cq = to_c2cq(qp_attrs->recv_cq);
+       unsigned long peer_pa;
+       u32 q_size, msg_size, mmap_size;
+       void __iomem *mmap;
+       int err;
+
+       err = c2_alloc_qpn(c2dev, qp);
+       if (err)
+               return err;
+       qp->ibqp.qp_num = qp->qpn;
+       qp->ibqp.qp_type = IB_QPT_RC;
+
+       /* Allocate the SQ and RQ shared pointers */
+       qp->sq_mq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
+                                        &qp->sq_mq.shared_dma, GFP_KERNEL);
+       if (!qp->sq_mq.shared) {
+               err = -ENOMEM;
+               goto bail0;
+       }
+
+       qp->rq_mq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
+                                        &qp->rq_mq.shared_dma, GFP_KERNEL);
+       if (!qp->rq_mq.shared) {
+               err = -ENOMEM;
+               goto bail1;
+       }
+
+       /* Allocate the verbs request */
+       vq_req = vq_req_alloc(c2dev);
+       if (vq_req == NULL) {
+               err = -ENOMEM;
+               goto bail2;
+       }
+
+       /* Initialize the work request */
+       memset(&wr, 0, sizeof(wr));
+       c2_wr_set_id(&wr, CCWR_QP_CREATE);
+       wr.hdr.context = (unsigned long) vq_req;
+       wr.rnic_handle = c2dev->adapter_handle;
+       wr.sq_cq_handle = send_cq->adapter_handle;
+       wr.rq_cq_handle = recv_cq->adapter_handle;
+       wr.sq_depth = cpu_to_be32(qp_attrs->cap.max_send_wr + 1);
+       wr.rq_depth = cpu_to_be32(qp_attrs->cap.max_recv_wr + 1);
+       wr.srq_handle = 0;
+       wr.flags = cpu_to_be32(QP_RDMA_READ | QP_RDMA_WRITE | QP_MW_BIND |
+                              QP_ZERO_STAG | QP_RDMA_READ_RESPONSE);
+       wr.send_sgl_depth = cpu_to_be32(qp_attrs->cap.max_send_sge);
+       wr.recv_sgl_depth = cpu_to_be32(qp_attrs->cap.max_recv_sge);
+       wr.rdma_write_sgl_depth = cpu_to_be32(qp_attrs->cap.max_send_sge);
+       wr.shared_sq_ht = cpu_to_be64(qp->sq_mq.shared_dma);
+       wr.shared_rq_ht = cpu_to_be64(qp->rq_mq.shared_dma);
+       wr.ord = cpu_to_be32(C2_MAX_ORD_PER_QP);
+       wr.ird = cpu_to_be32(C2_MAX_IRD_PER_QP);
+       wr.pd_id = pd->pd_id;
+       wr.user_context = (unsigned long) qp;
+
+       vq_req_get(c2dev, vq_req);
+
+       /* Send the WR to the adapter */
+       err = vq_send_wr(c2dev, (union c2wr *) & wr);
+       if (err) {
+               vq_req_put(c2dev, vq_req);
+               goto bail3;
+       }
+
+       /* Wait for the verb reply  */
+       err = vq_wait_for_reply(c2dev, vq_req);
+       if (err) {
+               goto bail3;
+       }
+
+       /* Process the reply */
+       reply = (struct c2wr_qp_create_rep *) (unsigned long) (vq_req->reply_msg);
+       if (!reply) {
+               err = -ENOMEM;
+               goto bail3;
+       }
+
+       if ((err = c2_wr_get_result(reply)) != 0) {
+               goto bail4;
+       }
+
+       /* Fill in the kernel QP struct */
+       atomic_set(&qp->refcount, 1);
+       qp->adapter_handle = reply->qp_handle;
+       qp->state = IB_QPS_RESET;
+       qp->send_sgl_depth = qp_attrs->cap.max_send_sge;
+       qp->rdma_write_sgl_depth = qp_attrs->cap.max_send_sge;
+       qp->recv_sgl_depth = qp_attrs->cap.max_recv_sge;
+       init_waitqueue_head(&qp->wait);
+
+       /* Initialize the SQ MQ */
+       q_size = be32_to_cpu(reply->sq_depth);
+       msg_size = be32_to_cpu(reply->sq_msg_size);
+       peer_pa = c2dev->pa + be32_to_cpu(reply->sq_mq_start);
+       mmap_size = PAGE_ALIGN(sizeof(struct c2_mq_shared) + msg_size * q_size);
+       mmap = ioremap_nocache(peer_pa, mmap_size);
+       if (!mmap) {
+               err = -ENOMEM;
+               goto bail5;
+       }
+
+       c2_mq_req_init(&qp->sq_mq,
+                      be32_to_cpu(reply->sq_mq_index),
+                      q_size,
+                      msg_size,
+                      mmap + sizeof(struct c2_mq_shared),      /* pool start */
+                      mmap,                            /* peer */
+                      C2_MQ_ADAPTER_TARGET);
+
+       /* Initialize the RQ mq */
+       q_size = be32_to_cpu(reply->rq_depth);
+       msg_size = be32_to_cpu(reply->rq_msg_size);
+       peer_pa = c2dev->pa + be32_to_cpu(reply->rq_mq_start);
+       mmap_size = PAGE_ALIGN(sizeof(struct c2_mq_shared) + msg_size * q_size);
+       mmap = ioremap_nocache(peer_pa, mmap_size);
+       if (!mmap) {
+               err = -ENOMEM;
+               goto bail6;
+       }
+
+       c2_mq_req_init(&qp->rq_mq,
+                      be32_to_cpu(reply->rq_mq_index),
+                      q_size,
+                      msg_size,
+                      mmap + sizeof(struct c2_mq_shared),      /* pool start */
+                      mmap,                            /* peer */
+                      C2_MQ_ADAPTER_TARGET);
+
+       vq_repbuf_free(c2dev, reply);
+       vq_req_free(c2dev, vq_req);
+
+       return 0;
+
+      bail6:
+       iounmap(qp->sq_mq.peer);
+      bail5:
+       destroy_qp(c2dev, qp);
+      bail4:
+       vq_repbuf_free(c2dev, reply);
+      bail3:
+       vq_req_free(c2dev, vq_req);
+      bail2:
+       c2_free_mqsp(qp->rq_mq.shared);
+      bail1:
+       c2_free_mqsp(qp->sq_mq.shared);
+      bail0:
+       c2_free_qpn(c2dev, qp->qpn);
+       return err;
+}
+
+static inline void c2_lock_cqs(struct c2_cq *send_cq, struct c2_cq *recv_cq)
+{
+       if (send_cq == recv_cq)
+               spin_lock_irq(&send_cq->lock);
+       else if (send_cq > recv_cq) {
+               spin_lock_irq(&send_cq->lock);
+               spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING);
+       } else {
+               spin_lock_irq(&recv_cq->lock);
+               spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING);
+       }
+}
+
+static inline void c2_unlock_cqs(struct c2_cq *send_cq, struct c2_cq *recv_cq)
+{
+       if (send_cq == recv_cq)
+               spin_unlock_irq(&send_cq->lock);
+       else if (send_cq > recv_cq) {
+               spin_unlock(&recv_cq->lock);
+               spin_unlock_irq(&send_cq->lock);
+       } else {
+               spin_unlock(&send_cq->lock);
+               spin_unlock_irq(&recv_cq->lock);
+       }
+}
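
c2_lock_cqs()/c2_unlock_cqs() above take the two CQ spinlocks in an order fixed by pointer comparison, so two paths that receive the same pair of CQs in opposite argument order can never deadlock against each other (and the send_cq == recv_cq case is locked only once). Below is a small user-space analogy of that discipline, using pthread mutexes rather than spinlocks; it is only a sketch of the ordering rule, not driver code.

#include <pthread.h>
#include <stdio.h>

struct cq { pthread_mutex_t lock; };

/* Always acquire the higher-addressed lock first, mirroring the
 * pointer comparison in c2_lock_cqs(); any consistent order works. */
static void lock_pair(struct cq *a, struct cq *b)
{
        if (a == b) {
                pthread_mutex_lock(&a->lock);
        } else if (a > b) {
                pthread_mutex_lock(&a->lock);
                pthread_mutex_lock(&b->lock);
        } else {
                pthread_mutex_lock(&b->lock);
                pthread_mutex_lock(&a->lock);
        }
}

static void unlock_pair(struct cq *a, struct cq *b)
{
        pthread_mutex_unlock(&a->lock);
        if (a != b)
                pthread_mutex_unlock(&b->lock);
}

int main(void)
{
        struct cq x = { PTHREAD_MUTEX_INITIALIZER };
        struct cq y = { PTHREAD_MUTEX_INITIALIZER };

        lock_pair(&x, &y);              /* the same locks end up held...   */
        unlock_pair(&x, &y);
        lock_pair(&y, &x);              /* ...regardless of argument order */
        unlock_pair(&y, &x);
        printf("no deadlock\n");
        return 0;
}

(Build with -pthread. Comparing unrelated pointers with > is formally unspecified in C, but both the kernel code and this sketch rely on it in the usual flat-address-model sense.)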
+
+void c2_free_qp(struct c2_dev *c2dev, struct c2_qp *qp)
+{
+       struct c2_cq *send_cq;
+       struct c2_cq *recv_cq;
+
+       send_cq = to_c2cq(qp->ibqp.send_cq);
+       recv_cq = to_c2cq(qp->ibqp.recv_cq);
+
+       /*
+        * Lock CQs here, so that CQ polling code can do QP lookup
+        * without taking a lock.
+        */
+       c2_lock_cqs(send_cq, recv_cq);
+       c2_free_qpn(c2dev, qp->qpn);
+       c2_unlock_cqs(send_cq, recv_cq);
+
+       /*
+        * Destroy qp in the rnic...
+        */
+       destroy_qp(c2dev, qp);
+
+       /*
+        * Mark any unreaped CQEs as null and void.
+        */
+       c2_cq_clean(c2dev, qp, send_cq->cqn);
+       if (send_cq != recv_cq)
+               c2_cq_clean(c2dev, qp, recv_cq->cqn);
+       /*
+        * Unmap the MQs and return the shared pointers
+        * to the message pool.
+        */
+       iounmap(qp->sq_mq.peer);
+       iounmap(qp->rq_mq.peer);
+       c2_free_mqsp(qp->sq_mq.shared);
+       c2_free_mqsp(qp->rq_mq.shared);
+
+       atomic_dec(&qp->refcount);
+       wait_event(qp->wait, !atomic_read(&qp->refcount));
+}
+
+/*
+ * Function: move_sgl
+ *
+ * Description:
+ * Move an SGL from the user's work request struct into a CCIL Work Request
+ * message, swapping to WR byte order and ensuring the total length doesn't
+ * overflow.
+ *
+ * IN:
+ * dst         - ptr to CCIL Work Request message SGL memory.
+ * src         - ptr to the consumer's SGL memory.
+ *
+ * OUT: none
+ *
+ * Return:
+ * CCIL status codes.
+ */
+static int
+move_sgl(struct c2_data_addr * dst, struct ib_sge *src, int count, u32 * p_len,
+        u8 * actual_count)
+{
+       u32 tot = 0;            /* running total */
+       u8 acount = 0;          /* running count of non-zero-length SGEs */
+
+       while (count > 0) {
+               /*
+                * If the addition of this SGE causes the
+                * total SGL length to exceed 2^32-1, then
+                * fail-n-bail.
+                *
+                * If the current total plus the next element length
+                * wraps, then it will go negative and be less than the
+                * current total...
+                */
+               if ((tot + src->length) < tot) {
+                       return -EINVAL;
+               }
+               /*
+                * Bug: 1456 (as well as 1498 & 1643)
+                * Skip over any sge's supplied with len=0
+                */
+               if (src->length) {
+                       tot += src->length;
+                       dst->stag = cpu_to_be32(src->lkey);
+                       dst->to = cpu_to_be64(src->addr);
+                       dst->length = cpu_to_be32(src->length);
+                       dst++;
+                       acount++;
+               }
+               src++;
+               count--;
+       }
+
+       if (acount == 0) {
+               /*
+                * Bug: 1476 (as well as 1498, 1456 and 1643)
+                * Setup the SGL in the WR to make it easier for the RNIC.
+                * This way, the FW doesn't have to deal with special cases.
+                * Setting length=0 should be sufficient.
+                */
+               dst->stag = 0;
+               dst->to = 0;
+               dst->length = 0;
+       }
+
+       *p_len = tot;
+       *actual_count = acount;
+       return 0;
+}
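
The (tot + src->length) < tot test in move_sgl() is the standard unsigned wrap-around check: in 32-bit unsigned arithmetic the sum is smaller than either operand exactly when it overflowed. A stand-alone demonstration of the same check follows; the lengths used are arbitrary.

#include <stdint.h>
#include <stdio.h>

/* Mirror of the overflow guard in move_sgl(): returns -1 instead of
 * letting the 32-bit running total wrap. */
static int add_len_checked(uint32_t *tot, uint32_t len)
{
        if (*tot + len < *tot)          /* unsigned wrap == overflow */
                return -1;
        *tot += len;
        return 0;
}

int main(void)
{
        uint32_t tot = 0;

        printf("%d\n", add_len_checked(&tot, 0xFFFFFFF0u));    /* 0: fits   */
        printf("%d\n", add_len_checked(&tot, 0x100u));         /* -1: wraps */
        printf("tot=0x%08x\n", tot);            /* unchanged by the failed add */
        return 0;
}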
+
+/*
+ * Function: c2_activity (private function)
+ *
+ * Description:
+ * Post an mq index to the host->adapter activity fifo.
+ *
+ * IN:
+ * c2dev       - ptr to c2dev structure
+ * mq_index    - mq index to post
+ * shared      - value most recently written to shared
+ *
+ * OUT:
+ *
+ * Return:
+ * none
+ */
+static inline void c2_activity(struct c2_dev *c2dev, u32 mq_index, u16 shared)
+{
+       /*
+        * First read the register to see if the FIFO is full, and if so,
+        * spin until it's not.  This isn't perfect -- there is no
+        * synchronization among the clients of the register, but in
+        * practice it prevents multiple CPUs from hammering the bus
+        * with PCI RETRY. Note that when this does happen, the card
+        * cannot get on the bus and the card and system hang in a
+        * deadlock -- thus the need for this code. [TOT]
+        */
+       while (readl(c2dev->regs + PCI_BAR0_ADAPTER_HINT) & 0x80000000)
+               udelay(10);
+
+       __raw_writel(C2_HINT_MAKE(mq_index, shared),
+                    c2dev->regs + PCI_BAR0_ADAPTER_HINT);
+}
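
The doorbell value written to PCI_BAR0_ADAPTER_HINT packs the MQ index and the shared hint count into one 32-bit word with C2_HINT_MAKE(); C2_HINT_GET_INDEX()/C2_HINT_GET_COUNT() near the top of this file are the matching unpack macros. A quick stand-alone check of the packing, with the macros copied verbatim and example values chosen arbitrarily.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define C2_HINT_MAKE(q_index, hint_count) (((q_index) << 16) | hint_count)
#define C2_HINT_GET_INDEX(hint) (((hint) & 0x7FFF0000) >> 16)
#define C2_HINT_GET_COUNT(hint) ((hint) & 0x0000FFFF)

int main(void)
{
        uint32_t hint = C2_HINT_MAKE(5u, 3u);   /* mq index 5, hint count 3 */

        assert(C2_HINT_GET_INDEX(hint) == 5);
        assert(C2_HINT_GET_COUNT(hint) == 3);
        printf("hint=0x%08x index=%u count=%u\n",
               (unsigned)hint,
               (unsigned)C2_HINT_GET_INDEX(hint),
               (unsigned)C2_HINT_GET_COUNT(hint));
        return 0;
}

Note that the index field is effectively 15 bits wide on the read-back side (the 0x7FFF0000 mask), which is plenty for the handful of MQs this driver creates.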
+
+/*
+ * Function: qp_wr_post
+ *
+ * Description:
+ * This inline function allocates an MQ msg, then moves the host-copy of
+ * the completed WR into msg.  Then it posts the message.
+ *
+ * IN:
+ * q           - ptr to user MQ.
+ * wr          - ptr to host-copy of the WR.
+ * qp          - ptr to user qp
+ * size                - Number of bytes to post.  Assumed to be divisible by 4.
+ *
+ * OUT: none
+ *
+ * Return:
+ * CCIL status codes.
+ */
+static int qp_wr_post(struct c2_mq *q, union c2wr * wr, struct c2_qp *qp, u32 size)
+{
+       union c2wr *msg;
+
+       msg = c2_mq_alloc(q);
+       if (msg == NULL) {
+               return -EINVAL;
+       }
+#ifdef CCMSGMAGIC
+       ((c2wr_hdr_t *) wr)->magic = cpu_to_be32(CCWR_MAGIC);
+#endif
+
+       /*
+        * Since all header fields in the WR are the same as the
+        * CQE, set the following so the adapter need not.
+        */
+       c2_wr_set_result(wr, CCERR_PENDING);
+
+       /*
+        * Copy the wr down to the adapter
+        */
+       memcpy((void *) msg, (void *) wr, size);
+
+       c2_mq_produce(q);
+       return 0;
+}
+
+int c2_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr,
+                struct ib_send_wr **bad_wr)
+{
+       struct c2_dev *c2dev = to_c2dev(ibqp->device);
+       struct c2_qp *qp = to_c2qp(ibqp);
+       union c2wr wr;
+       unsigned long lock_flags;
+       int err = 0;
+
+       u32 flags;
+       u32 tot_len;
+       u8 actual_sge_count;
+       u32 msg_size;
+
+       if (qp->state > IB_QPS_RTS) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       while (ib_wr) {
+
+               flags = 0;
+               wr.sqwr.sq_hdr.user_hdr.hdr.context = ib_wr->wr_id;
+               if (ib_wr->send_flags & IB_SEND_SIGNALED) {
+                       flags |= SQ_SIGNALED;
+               }
+
+               switch (ib_wr->opcode) {
+               case IB_WR_SEND:
+               case IB_WR_SEND_WITH_INV:
+                       if (ib_wr->opcode == IB_WR_SEND) {
+                               if (ib_wr->send_flags & IB_SEND_SOLICITED)
+                                       c2_wr_set_id(&wr, C2_WR_TYPE_SEND_SE);
+                               else
+                                       c2_wr_set_id(&wr, C2_WR_TYPE_SEND);
+                               wr.sqwr.send.remote_stag = 0;
+                       } else {
+                               if (ib_wr->send_flags & IB_SEND_SOLICITED)
+                                       c2_wr_set_id(&wr, C2_WR_TYPE_SEND_SE_INV);
+                               else
+                                       c2_wr_set_id(&wr, C2_WR_TYPE_SEND_INV);
+                               wr.sqwr.send.remote_stag =
+                                       cpu_to_be32(ib_wr->ex.invalidate_rkey);
+                       }
+
+                       msg_size = sizeof(struct c2wr_send_req) +
+                               sizeof(struct c2_data_addr) * ib_wr->num_sge;
+                       if (ib_wr->num_sge > qp->send_sgl_depth) {
+                               err = -EINVAL;
+                               break;
+                       }
+                       if (ib_wr->send_flags & IB_SEND_FENCE) {
+                               flags |= SQ_READ_FENCE;
+                       }
+                       err = move_sgl((struct c2_data_addr *) & (wr.sqwr.send.data),
+                                      ib_wr->sg_list,
+                                      ib_wr->num_sge,
+                                      &tot_len, &actual_sge_count);
+                       wr.sqwr.send.sge_len = cpu_to_be32(tot_len);
+                       c2_wr_set_sge_count(&wr, actual_sge_count);
+                       break;
+               case IB_WR_RDMA_WRITE:
+                       c2_wr_set_id(&wr, C2_WR_TYPE_RDMA_WRITE);
+                       msg_size = sizeof(struct c2wr_rdma_write_req) +
+                           (sizeof(struct c2_data_addr) * ib_wr->num_sge);
+                       if (ib_wr->num_sge > qp->rdma_write_sgl_depth) {
+                               err = -EINVAL;
+                               break;
+                       }
+                       if (ib_wr->send_flags & IB_SEND_FENCE) {
+                               flags |= SQ_READ_FENCE;
+                       }
+                       wr.sqwr.rdma_write.remote_stag =
+                           cpu_to_be32(ib_wr->wr.rdma.rkey);
+                       wr.sqwr.rdma_write.remote_to =
+                           cpu_to_be64(ib_wr->wr.rdma.remote_addr);
+                       err = move_sgl((struct c2_data_addr *)
+                                      & (wr.sqwr.rdma_write.data),
+                                      ib_wr->sg_list,
+                                      ib_wr->num_sge,
+                                      &tot_len, &actual_sge_count);
+                       wr.sqwr.rdma_write.sge_len = cpu_to_be32(tot_len);
+                       c2_wr_set_sge_count(&wr, actual_sge_count);
+                       break;
+               case IB_WR_RDMA_READ:
+                       c2_wr_set_id(&wr, C2_WR_TYPE_RDMA_READ);
+                       msg_size = sizeof(struct c2wr_rdma_read_req);
+
+                       /* iWARP only supports 1 SGE for RDMA reads */
+                       if (ib_wr->num_sge > 1) {
+                               err = -EINVAL;
+                               break;
+                       }
+
+                       /*
+                        * Move the local and remote stag/to/len into the WR.
+                        */
+                       wr.sqwr.rdma_read.local_stag =
+                           cpu_to_be32(ib_wr->sg_list->lkey);
+                       wr.sqwr.rdma_read.local_to =
+                           cpu_to_be64(ib_wr->sg_list->addr);
+                       wr.sqwr.rdma_read.remote_stag =
+                           cpu_to_be32(ib_wr->wr.rdma.rkey);
+                       wr.sqwr.rdma_read.remote_to =
+                           cpu_to_be64(ib_wr->wr.rdma.remote_addr);
+                       wr.sqwr.rdma_read.length =
+                           cpu_to_be32(ib_wr->sg_list->length);
+                       break;
+               default:
+                       /* error */
+                       msg_size = 0;
+                       err = -EINVAL;
+                       break;
+               }
+
+               /*
+                * If we had an error on the last wr build, then
+                * break out.  Possible errors include bogus WR
+                * type, and a bogus SGL length...
+                */
+               if (err) {
+                       break;
+               }
+
+               /*
+                * Store flags
+                */
+               c2_wr_set_flags(&wr, flags);
+
+               /*
+                * Post the puppy!
+                */
+               spin_lock_irqsave(&qp->lock, lock_flags);
+               err = qp_wr_post(&qp->sq_mq, &wr, qp, msg_size);
+               if (err) {
+                       spin_unlock_irqrestore(&qp->lock, lock_flags);
+                       break;
+               }
+
+               /*
+                * Enqueue mq index to activity FIFO.
+                */
+               c2_activity(c2dev, qp->sq_mq.index, qp->sq_mq.hint_count);
+               spin_unlock_irqrestore(&qp->lock, lock_flags);
+
+               ib_wr = ib_wr->next;
+       }
+
+out:
+       if (err)
+               *bad_wr = ib_wr;
+       return err;
+}
+
+int c2_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *ib_wr,
+                   struct ib_recv_wr **bad_wr)
+{
+       struct c2_dev *c2dev = to_c2dev(ibqp->device);
+       struct c2_qp *qp = to_c2qp(ibqp);
+       union c2wr wr;
+       unsigned long lock_flags;
+       int err = 0;
+
+       if (qp->state > IB_QPS_RTS) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       /*
+        * Try and post each work request
+        */
+       while (ib_wr) {
+               u32 tot_len;
+               u8 actual_sge_count;
+
+               if (ib_wr->num_sge > qp->recv_sgl_depth) {
+                       err = -EINVAL;
+                       break;
+               }
+
+               /*
+                * Create local host-copy of the WR
+                */
+               wr.rqwr.rq_hdr.user_hdr.hdr.context = ib_wr->wr_id;
+               c2_wr_set_id(&wr, CCWR_RECV);
+               c2_wr_set_flags(&wr, 0);
+
+               /* sge_count is limited to eight bits. */
+               BUG_ON(ib_wr->num_sge >= 256);
+               err = move_sgl((struct c2_data_addr *) & (wr.rqwr.data),
+                              ib_wr->sg_list,
+                              ib_wr->num_sge, &tot_len, &actual_sge_count);
+               c2_wr_set_sge_count(&wr, actual_sge_count);
+
+               /*
+                * If we had an error on the last wr build, then
+                * break out.  Possible errors include bogus WR
+                * type, and a bogus SGL length...
+                */
+               if (err) {
+                       break;
+               }
+
+               spin_lock_irqsave(&qp->lock, lock_flags);
+               err = qp_wr_post(&qp->rq_mq, &wr, qp, qp->rq_mq.msg_size);
+               if (err) {
+                       spin_unlock_irqrestore(&qp->lock, lock_flags);
+                       break;
+               }
+
+               /*
+                * Enqueue mq index to activity FIFO
+                */
+               c2_activity(c2dev, qp->rq_mq.index, qp->rq_mq.hint_count);
+               spin_unlock_irqrestore(&qp->lock, lock_flags);
+
+               ib_wr = ib_wr->next;
+       }
+
+out:
+       if (err)
+               *bad_wr = ib_wr;
+       return err;
+}
+
+void c2_init_qp_table(struct c2_dev *c2dev)
+{
+       spin_lock_init(&c2dev->qp_table.lock);
+       idr_init(&c2dev->qp_table.idr);
+}
+
+void c2_cleanup_qp_table(struct c2_dev *c2dev)
+{
+       idr_destroy(&c2dev->qp_table.idr);
+}
diff --git a/drivers/staging/rdma/amso1100/c2_rnic.c b/drivers/staging/rdma/amso1100/c2_rnic.c
new file mode 100644 (file)
index 0000000..d2a6d96
--- /dev/null
@@ -0,0 +1,655 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/pci.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/delay.h>
+#include <linux/ethtool.h>
+#include <linux/mii.h>
+#include <linux/if_vlan.h>
+#include <linux/crc32.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/init.h>
+#include <linux/dma-mapping.h>
+#include <linux/mm.h>
+#include <linux/inet.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+
+#include <linux/route.h>
+
+#include <asm/io.h>
+#include <asm/irq.h>
+#include <asm/byteorder.h>
+#include <rdma/ib_smi.h>
+#include "c2.h"
+#include "c2_vq.h"
+
+/* Device capabilities */
+#define C2_MIN_PAGESIZE  1024
+
+#define C2_MAX_MRS       32768
+#define C2_MAX_QPS       16000
+#define C2_MAX_WQE_SZ    256
+#define C2_MAX_QP_WR     ((128*1024)/C2_MAX_WQE_SZ)
+#define C2_MAX_SGES      4
+#define C2_MAX_SGE_RD    1
+#define C2_MAX_CQS       32768
+#define C2_MAX_CQES      4096
+#define C2_MAX_PDS       16384
+
+/*
+ * Send the adapter INIT message to the amso1100
+ */
+static int c2_adapter_init(struct c2_dev *c2dev)
+{
+       struct c2wr_init_req wr;
+       int err;
+
+       memset(&wr, 0, sizeof(wr));
+       c2_wr_set_id(&wr, CCWR_INIT);
+       wr.hdr.context = 0;
+       wr.hint_count = cpu_to_be64(c2dev->hint_count_dma);
+       wr.q0_host_shared = cpu_to_be64(c2dev->req_vq.shared_dma);
+       wr.q1_host_shared = cpu_to_be64(c2dev->rep_vq.shared_dma);
+       wr.q1_host_msg_pool = cpu_to_be64(c2dev->rep_vq.host_dma);
+       wr.q2_host_shared = cpu_to_be64(c2dev->aeq.shared_dma);
+       wr.q2_host_msg_pool = cpu_to_be64(c2dev->aeq.host_dma);
+
+       /* Post the init message */
+       err = vq_send_wr(c2dev, (union c2wr *) & wr);
+
+       return err;
+}
+
+/*
+ * Send the adapter TERM message to the amso1100
+ */
+static void c2_adapter_term(struct c2_dev *c2dev)
+{
+       struct c2wr_init_req wr;
+
+       memset(&wr, 0, sizeof(wr));
+       c2_wr_set_id(&wr, CCWR_TERM);
+       wr.hdr.context = 0;
+
+       /* Post the term message */
+       vq_send_wr(c2dev, (union c2wr *) & wr);
+       c2dev->init = 0;
+
+       return;
+}
+
+/*
+ * Query the adapter
+ */
+static int c2_rnic_query(struct c2_dev *c2dev, struct ib_device_attr *props)
+{
+       struct c2_vq_req *vq_req;
+       struct c2wr_rnic_query_req wr;
+       struct c2wr_rnic_query_rep *reply;
+       int err;
+
+       vq_req = vq_req_alloc(c2dev);
+       if (!vq_req)
+               return -ENOMEM;
+
+       c2_wr_set_id(&wr, CCWR_RNIC_QUERY);
+       wr.hdr.context = (unsigned long) vq_req;
+       wr.rnic_handle = c2dev->adapter_handle;
+
+       vq_req_get(c2dev, vq_req);
+
+       err = vq_send_wr(c2dev, (union c2wr *) &wr);
+       if (err) {
+               vq_req_put(c2dev, vq_req);
+               goto bail1;
+       }
+
+       err = vq_wait_for_reply(c2dev, vq_req);
+       if (err)
+               goto bail1;
+
+       reply =
+           (struct c2wr_rnic_query_rep *) (unsigned long) (vq_req->reply_msg);
+       if (!reply)
+               err = -ENOMEM;
+       else
+               err = c2_errno(reply);
+       if (err)
+               goto bail2;
+
+       props->fw_ver =
+               ((u64)be32_to_cpu(reply->fw_ver_major) << 32) |
+               ((be32_to_cpu(reply->fw_ver_minor) & 0xFFFF) << 16) |
+               (be32_to_cpu(reply->fw_ver_patch) & 0xFFFF);
+       memcpy(&props->sys_image_guid, c2dev->netdev->dev_addr, 6);
+       props->max_mr_size         = 0xFFFFFFFF;
+       props->page_size_cap       = ~(C2_MIN_PAGESIZE-1);
+       props->vendor_id           = be32_to_cpu(reply->vendor_id);
+       props->vendor_part_id      = be32_to_cpu(reply->part_number);
+       props->hw_ver              = be32_to_cpu(reply->hw_version);
+       props->max_qp              = be32_to_cpu(reply->max_qps);
+       props->max_qp_wr           = be32_to_cpu(reply->max_qp_depth);
+       props->device_cap_flags    = c2dev->device_cap_flags;
+       props->max_sge             = C2_MAX_SGES;
+       props->max_sge_rd          = C2_MAX_SGE_RD;
+       props->max_cq              = be32_to_cpu(reply->max_cqs);
+       props->max_cqe             = be32_to_cpu(reply->max_cq_depth);
+       props->max_mr              = be32_to_cpu(reply->max_mrs);
+       props->max_pd              = be32_to_cpu(reply->max_pds);
+       props->max_qp_rd_atom      = be32_to_cpu(reply->max_qp_ird);
+       props->max_ee_rd_atom      = 0;
+       props->max_res_rd_atom     = be32_to_cpu(reply->max_global_ird);
+       props->max_qp_init_rd_atom = be32_to_cpu(reply->max_qp_ord);
+       props->max_ee_init_rd_atom = 0;
+       props->atomic_cap          = IB_ATOMIC_NONE;
+       props->max_ee              = 0;
+       props->max_rdd             = 0;
+       props->max_mw              = be32_to_cpu(reply->max_mws);
+       props->max_raw_ipv6_qp     = 0;
+       props->max_raw_ethy_qp     = 0;
+       props->max_mcast_grp       = 0;
+       props->max_mcast_qp_attach = 0;
+       props->max_total_mcast_qp_attach = 0;
+       props->max_ah              = 0;
+       props->max_fmr             = 0;
+       props->max_map_per_fmr     = 0;
+       props->max_srq             = 0;
+       props->max_srq_wr          = 0;
+       props->max_srq_sge         = 0;
+       props->max_pkeys           = 0;
+       props->local_ca_ack_delay  = 0;
+
+ bail2:
+       vq_repbuf_free(c2dev, reply);
+
+ bail1:
+       vq_req_free(c2dev, vq_req);
+       return err;
+}
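
c2_rnic_query() above folds the adapter's firmware version into the single 64-bit props->fw_ver field: major in bits 63:32, minor in bits 31:16, patch in bits 15:0. A stand-alone round trip of that encoding; the version numbers are made up.

#include <stdint.h>
#include <stdio.h>

/* Same layout as props->fw_ver in c2_rnic_query(). */
static uint64_t pack_fw_ver(uint32_t major, uint32_t minor, uint32_t patch)
{
        return ((uint64_t)major << 32) |
               ((uint64_t)(minor & 0xFFFF) << 16) |
               (patch & 0xFFFF);
}

int main(void)
{
        uint64_t v = pack_fw_ver(3, 1, 53);

        printf("fw_ver=0x%016llx major=%u minor=%u patch=%u\n",
               (unsigned long long)v,
               (unsigned)(v >> 32),
               (unsigned)((v >> 16) & 0xFFFF),
               (unsigned)(v & 0xFFFF));
        return 0;
}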
+
+/*
+ * Add an IP address to the RNIC interface
+ */
+int c2_add_addr(struct c2_dev *c2dev, __be32 inaddr, __be32 inmask)
+{
+       struct c2_vq_req *vq_req;
+       struct c2wr_rnic_setconfig_req *wr;
+       struct c2wr_rnic_setconfig_rep *reply;
+       struct c2_netaddr netaddr;
+       int err, len;
+
+       vq_req = vq_req_alloc(c2dev);
+       if (!vq_req)
+               return -ENOMEM;
+
+       len = sizeof(struct c2_netaddr);
+       wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
+       if (!wr) {
+               err = -ENOMEM;
+               goto bail0;
+       }
+
+       c2_wr_set_id(wr, CCWR_RNIC_SETCONFIG);
+       wr->hdr.context = (unsigned long) vq_req;
+       wr->rnic_handle = c2dev->adapter_handle;
+       wr->option = cpu_to_be32(C2_CFG_ADD_ADDR);
+
+       netaddr.ip_addr = inaddr;
+       netaddr.netmask = inmask;
+       netaddr.mtu = 0;
+
+       memcpy(wr->data, &netaddr, len);
+
+       vq_req_get(c2dev, vq_req);
+
+       err = vq_send_wr(c2dev, (union c2wr *) wr);
+       if (err) {
+               vq_req_put(c2dev, vq_req);
+               goto bail1;
+       }
+
+       err = vq_wait_for_reply(c2dev, vq_req);
+       if (err)
+               goto bail1;
+
+       reply =
+           (struct c2wr_rnic_setconfig_rep *) (unsigned long) (vq_req->reply_msg);
+       if (!reply) {
+               err = -ENOMEM;
+               goto bail1;
+       }
+
+       err = c2_errno(reply);
+       vq_repbuf_free(c2dev, reply);
+
+      bail1:
+       kfree(wr);
+      bail0:
+       vq_req_free(c2dev, vq_req);
+       return err;
+}
+
+/*
+ * Delete an IP address from the RNIC interface
+ */
+int c2_del_addr(struct c2_dev *c2dev, __be32 inaddr, __be32 inmask)
+{
+       struct c2_vq_req *vq_req;
+       struct c2wr_rnic_setconfig_req *wr;
+       struct c2wr_rnic_setconfig_rep *reply;
+       struct c2_netaddr netaddr;
+       int err, len;
+
+       vq_req = vq_req_alloc(c2dev);
+       if (!vq_req)
+               return -ENOMEM;
+
+       len = sizeof(struct c2_netaddr);
+       wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
+       if (!wr) {
+               err = -ENOMEM;
+               goto bail0;
+       }
+
+       c2_wr_set_id(wr, CCWR_RNIC_SETCONFIG);
+       wr->hdr.context = (unsigned long) vq_req;
+       wr->rnic_handle = c2dev->adapter_handle;
+       wr->option = cpu_to_be32(C2_CFG_DEL_ADDR);
+
+       netaddr.ip_addr = inaddr;
+       netaddr.netmask = inmask;
+       netaddr.mtu = 0;
+
+       memcpy(wr->data, &netaddr, len);
+
+       vq_req_get(c2dev, vq_req);
+
+       err = vq_send_wr(c2dev, (union c2wr *) wr);
+       if (err) {
+               vq_req_put(c2dev, vq_req);
+               goto bail1;
+       }
+
+       err = vq_wait_for_reply(c2dev, vq_req);
+       if (err)
+               goto bail1;
+
+       reply =
+           (struct c2wr_rnic_setconfig_rep *) (unsigned long) (vq_req->reply_msg);
+       if (!reply) {
+               err = -ENOMEM;
+               goto bail1;
+       }
+
+       err = c2_errno(reply);
+       vq_repbuf_free(c2dev, reply);
+
+      bail1:
+       kfree(wr);
+      bail0:
+       vq_req_free(c2dev, vq_req);
+       return err;
+}
+
+/*
+ * Open a single RNIC instance to use with all
+ * low-level OpenIB calls
+ */
+static int c2_rnic_open(struct c2_dev *c2dev)
+{
+       struct c2_vq_req *vq_req;
+       union c2wr wr;
+       struct c2wr_rnic_open_rep *reply;
+       int err;
+
+       vq_req = vq_req_alloc(c2dev);
+       if (vq_req == NULL) {
+               return -ENOMEM;
+       }
+
+       memset(&wr, 0, sizeof(wr));
+       c2_wr_set_id(&wr, CCWR_RNIC_OPEN);
+       wr.rnic_open.req.hdr.context = (unsigned long) (vq_req);
+       wr.rnic_open.req.flags = cpu_to_be16(RNIC_PRIV_MODE);
+       wr.rnic_open.req.port_num = cpu_to_be16(0);
+       wr.rnic_open.req.user_context = (unsigned long) c2dev;
+
+       vq_req_get(c2dev, vq_req);
+
+       err = vq_send_wr(c2dev, &wr);
+       if (err) {
+               vq_req_put(c2dev, vq_req);
+               goto bail0;
+       }
+
+       err = vq_wait_for_reply(c2dev, vq_req);
+       if (err) {
+               goto bail0;
+       }
+
+       reply = (struct c2wr_rnic_open_rep *) (unsigned long) (vq_req->reply_msg);
+       if (!reply) {
+               err = -ENOMEM;
+               goto bail0;
+       }
+
+       if ((err = c2_errno(reply)) != 0) {
+               goto bail1;
+       }
+
+       c2dev->adapter_handle = reply->rnic_handle;
+
+      bail1:
+       vq_repbuf_free(c2dev, reply);
+      bail0:
+       vq_req_free(c2dev, vq_req);
+       return err;
+}
+
+/*
+ * Close the RNIC instance
+ */
+static int c2_rnic_close(struct c2_dev *c2dev)
+{
+       struct c2_vq_req *vq_req;
+       union c2wr wr;
+       struct c2wr_rnic_close_rep *reply;
+       int err;
+
+       vq_req = vq_req_alloc(c2dev);
+       if (vq_req == NULL) {
+               return -ENOMEM;
+       }
+
+       memset(&wr, 0, sizeof(wr));
+       c2_wr_set_id(&wr, CCWR_RNIC_CLOSE);
+       wr.rnic_close.req.hdr.context = (unsigned long) vq_req;
+       wr.rnic_close.req.rnic_handle = c2dev->adapter_handle;
+
+       vq_req_get(c2dev, vq_req);
+
+       err = vq_send_wr(c2dev, &wr);
+       if (err) {
+               vq_req_put(c2dev, vq_req);
+               goto bail0;
+       }
+
+       err = vq_wait_for_reply(c2dev, vq_req);
+       if (err) {
+               goto bail0;
+       }
+
+       reply = (struct c2wr_rnic_close_rep *) (unsigned long) (vq_req->reply_msg);
+       if (!reply) {
+               err = -ENOMEM;
+               goto bail0;
+       }
+
+       if ((err = c2_errno(reply)) != 0) {
+               goto bail1;
+       }
+
+       c2dev->adapter_handle = 0;
+
+      bail1:
+       vq_repbuf_free(c2dev, reply);
+      bail0:
+       vq_req_free(c2dev, vq_req);
+       return err;
+}
+
+/*
+ * Called by c2_probe to initialize the RNIC. This principally
+ * involves initializing the various limits and resource pools that
+ * comprise the RNIC instance.
+ */
+int c2_rnic_init(struct c2_dev *c2dev)
+{
+       int err;
+       u32 qsize, msgsize;
+       void *q1_pages;
+       void *q2_pages;
+       void __iomem *mmio_regs;
+
+       /* Device capabilities */
+       c2dev->device_cap_flags =
+           (IB_DEVICE_RESIZE_MAX_WR |
+            IB_DEVICE_CURR_QP_STATE_MOD |
+            IB_DEVICE_SYS_IMAGE_GUID |
+            IB_DEVICE_LOCAL_DMA_LKEY |
+            IB_DEVICE_MEM_WINDOW);
+
+       /* Allocate the qptr_array */
+       c2dev->qptr_array = vzalloc(C2_MAX_CQS * sizeof(void *));
+       if (!c2dev->qptr_array) {
+               return -ENOMEM;
+       }
+
+       /* Initialize the qptr_array */
+       c2dev->qptr_array[0] = (void *) &c2dev->req_vq;
+       c2dev->qptr_array[1] = (void *) &c2dev->rep_vq;
+       c2dev->qptr_array[2] = (void *) &c2dev->aeq;
+
+       /* Initialize data structures */
+       init_waitqueue_head(&c2dev->req_vq_wo);
+       spin_lock_init(&c2dev->vqlock);
+       spin_lock_init(&c2dev->lock);
+
+       /* Allocate MQ shared pointer pool for kernel clients. User
+        * mode client pools are hung off the user context
+        */
+       err = c2_init_mqsp_pool(c2dev, GFP_KERNEL, &c2dev->kern_mqsp_pool);
+       if (err) {
+               goto bail0;
+       }
+
+       /* Allocate shared pointers for Q0, Q1, and Q2 from
+        * the shared pointer pool.
+        */
+
+       c2dev->hint_count = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
+                                            &c2dev->hint_count_dma,
+                                            GFP_KERNEL);
+       c2dev->req_vq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
+                                            &c2dev->req_vq.shared_dma,
+                                            GFP_KERNEL);
+       c2dev->rep_vq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
+                                            &c2dev->rep_vq.shared_dma,
+                                            GFP_KERNEL);
+       c2dev->aeq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
+                                         &c2dev->aeq.shared_dma, GFP_KERNEL);
+       if (!c2dev->hint_count || !c2dev->req_vq.shared ||
+           !c2dev->rep_vq.shared || !c2dev->aeq.shared) {
+               err = -ENOMEM;
+               goto bail1;
+       }
+
+       mmio_regs = c2dev->kva;
+       /* Initialize the Verbs Request Queue */
+       c2_mq_req_init(&c2dev->req_vq, 0,
+                      be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q0_QSIZE)),
+                      be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q0_MSGSIZE)),
+                      mmio_regs +
+                      be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q0_POOLSTART)),
+                      mmio_regs +
+                      be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q0_SHARED)),
+                      C2_MQ_ADAPTER_TARGET);
+
+       /* Initialize the Verbs Reply Queue */
+       qsize = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q1_QSIZE));
+       msgsize = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q1_MSGSIZE));
+       q1_pages = dma_alloc_coherent(&c2dev->pcidev->dev, qsize * msgsize,
+                                     &c2dev->rep_vq.host_dma, GFP_KERNEL);
+       if (!q1_pages) {
+               err = -ENOMEM;
+               goto bail1;
+       }
+       dma_unmap_addr_set(&c2dev->rep_vq, mapping, c2dev->rep_vq.host_dma);
+       pr_debug("%s rep_vq va %p dma %llx\n", __func__, q1_pages,
+                (unsigned long long) c2dev->rep_vq.host_dma);
+       c2_mq_rep_init(&c2dev->rep_vq,
+                  1,
+                  qsize,
+                  msgsize,
+                  q1_pages,
+                  mmio_regs +
+                  be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q1_SHARED)),
+                  C2_MQ_HOST_TARGET);
+
+       /* Initialize the Asynchronous Event Queue */
+       qsize = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q2_QSIZE));
+       msgsize = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q2_MSGSIZE));
+       q2_pages = dma_alloc_coherent(&c2dev->pcidev->dev, qsize * msgsize,
+                                     &c2dev->aeq.host_dma, GFP_KERNEL);
+       if (!q2_pages) {
+               err = -ENOMEM;
+               goto bail2;
+       }
+       dma_unmap_addr_set(&c2dev->aeq, mapping, c2dev->aeq.host_dma);
+       pr_debug("%s aeq va %p dma %llx\n", __func__, q2_pages,
+                (unsigned long long) c2dev->aeq.host_dma);
+       c2_mq_rep_init(&c2dev->aeq,
+                      2,
+                      qsize,
+                      msgsize,
+                      q2_pages,
+                      mmio_regs +
+                      be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q2_SHARED)),
+                      C2_MQ_HOST_TARGET);
+
+       /* Initialize the verbs request allocator */
+       err = vq_init(c2dev);
+       if (err)
+               goto bail3;
+
+       /* Enable interrupts on the adapter */
+       writel(0, c2dev->regs + C2_IDIS);
+
+       /* create the WR init message */
+       err = c2_adapter_init(c2dev);
+       if (err)
+               goto bail4;
+       c2dev->init++;
+
+       /* open an adapter instance */
+       err = c2_rnic_open(c2dev);
+       if (err)
+               goto bail4;
+
+       /* Initialize the cached adapter limits */
+       err = c2_rnic_query(c2dev, &c2dev->props);
+       if (err)
+               goto bail5;
+
+       /* Initialize the PD pool */
+       err = c2_init_pd_table(c2dev);
+       if (err)
+               goto bail5;
+
+       /* Initialize the QP pool */
+       c2_init_qp_table(c2dev);
+       return 0;
+
+      bail5:
+       c2_rnic_close(c2dev);
+      bail4:
+       vq_term(c2dev);
+      bail3:
+       dma_free_coherent(&c2dev->pcidev->dev,
+                         c2dev->aeq.q_size * c2dev->aeq.msg_size,
+                         q2_pages, dma_unmap_addr(&c2dev->aeq, mapping));
+      bail2:
+       dma_free_coherent(&c2dev->pcidev->dev,
+                         c2dev->rep_vq.q_size * c2dev->rep_vq.msg_size,
+                         q1_pages, dma_unmap_addr(&c2dev->rep_vq, mapping));
+      bail1:
+       c2_free_mqsp_pool(c2dev, c2dev->kern_mqsp_pool);
+      bail0:
+       vfree(c2dev->qptr_array);
+
+       return err;
+}
+
+/*
+ * Called by c2_remove to cleanup the RNIC resources.
+ */
+void c2_rnic_term(struct c2_dev *c2dev)
+{
+
+       /* Close the open adapter instance */
+       c2_rnic_close(c2dev);
+
+       /* Send the TERM message to the adapter */
+       c2_adapter_term(c2dev);
+
+       /* Disable interrupts on the adapter */
+       writel(1, c2dev->regs + C2_IDIS);
+
+       /* Free the QP pool */
+       c2_cleanup_qp_table(c2dev);
+
+       /* Free the PD pool */
+       c2_cleanup_pd_table(c2dev);
+
+       /* Free the verbs request allocator */
+       vq_term(c2dev);
+
+       /* Free the asynchronous event queue */
+       dma_free_coherent(&c2dev->pcidev->dev,
+                         c2dev->aeq.q_size * c2dev->aeq.msg_size,
+                         c2dev->aeq.msg_pool.host,
+                         dma_unmap_addr(&c2dev->aeq, mapping));
+
+       /* Free the verbs reply queue */
+       dma_free_coherent(&c2dev->pcidev->dev,
+                         c2dev->rep_vq.q_size * c2dev->rep_vq.msg_size,
+                         c2dev->rep_vq.msg_pool.host,
+                         dma_unmap_addr(&c2dev->rep_vq, mapping));
+
+       /* Free the MQ shared pointer pool */
+       c2_free_mqsp_pool(c2dev, c2dev->kern_mqsp_pool);
+
+       /* Free the qptr_array */
+       vfree(c2dev->qptr_array);
+
+       return;
+}
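
c2_rnic_init() above, like most functions in this driver, unwinds partial setup through a ladder of bail labels, releasing resources in the reverse order they were acquired; c2_rnic_term() then repeats the full teardown for the success path. A minimal stand-alone illustration of that goto-unwind idiom; the three malloc() calls merely stand in for the MQ pool, reply queue, and event queue allocations.

#include <stdio.h>
#include <stdlib.h>

/* Acquire three resources; on any failure release only what was
 * already acquired, in reverse order -- the bail0..bailN idiom. */
static int setup(void)
{
        void *a, *b, *c;
        int err = -1;

        a = malloc(16);
        if (!a)
                goto bail0;
        b = malloc(16);
        if (!b)
                goto bail1;
        c = malloc(16);
        if (!c)
                goto bail2;

        printf("all resources acquired\n");
        free(c);
        free(b);
        free(a);
        return 0;

 bail2:
        free(b);
 bail1:
        free(a);
 bail0:
        return err;
}

int main(void)
{
        return setup() ? EXIT_FAILURE : EXIT_SUCCESS;
}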
diff --git a/drivers/staging/rdma/amso1100/c2_status.h b/drivers/staging/rdma/amso1100/c2_status.h
new file mode 100644 (file)
index 0000000..6ee4aa9
--- /dev/null
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef        _C2_STATUS_H_
+#define _C2_STATUS_H_
+
+/*
+ * Verbs Status Codes
+ */
+enum c2_status {
+       C2_OK = 0,              /* This must be zero */
+       CCERR_INSUFFICIENT_RESOURCES = 1,
+       CCERR_INVALID_MODIFIER = 2,
+       CCERR_INVALID_MODE = 3,
+       CCERR_IN_USE = 4,
+       CCERR_INVALID_RNIC = 5,
+       CCERR_INTERRUPTED_OPERATION = 6,
+       CCERR_INVALID_EH = 7,
+       CCERR_INVALID_CQ = 8,
+       CCERR_CQ_EMPTY = 9,
+       CCERR_NOT_IMPLEMENTED = 10,
+       CCERR_CQ_DEPTH_TOO_SMALL = 11,
+       CCERR_PD_IN_USE = 12,
+       CCERR_INVALID_PD = 13,
+       CCERR_INVALID_SRQ = 14,
+       CCERR_INVALID_ADDRESS = 15,
+       CCERR_INVALID_NETMASK = 16,
+       CCERR_INVALID_QP = 17,
+       CCERR_INVALID_QP_STATE = 18,
+       CCERR_TOO_MANY_WRS_POSTED = 19,
+       CCERR_INVALID_WR_TYPE = 20,
+       CCERR_INVALID_SGL_LENGTH = 21,
+       CCERR_INVALID_SQ_DEPTH = 22,
+       CCERR_INVALID_RQ_DEPTH = 23,
+       CCERR_INVALID_ORD = 24,
+       CCERR_INVALID_IRD = 25,
+       CCERR_QP_ATTR_CANNOT_CHANGE = 26,
+       CCERR_INVALID_STAG = 27,
+       CCERR_QP_IN_USE = 28,
+       CCERR_OUTSTANDING_WRS = 29,
+       CCERR_STAG_IN_USE = 30,
+       CCERR_INVALID_STAG_INDEX = 31,
+       CCERR_INVALID_SGL_FORMAT = 32,
+       CCERR_ADAPTER_TIMEOUT = 33,
+       CCERR_INVALID_CQ_DEPTH = 34,
+       CCERR_INVALID_PRIVATE_DATA_LENGTH = 35,
+       CCERR_INVALID_EP = 36,
+       CCERR_MR_IN_USE = CCERR_STAG_IN_USE,
+       CCERR_FLUSHED = 38,
+       CCERR_INVALID_WQE = 39,
+       CCERR_LOCAL_QP_CATASTROPHIC_ERROR = 40,
+       CCERR_REMOTE_TERMINATION_ERROR = 41,
+       CCERR_BASE_AND_BOUNDS_VIOLATION = 42,
+       CCERR_ACCESS_VIOLATION = 43,
+       CCERR_INVALID_PD_ID = 44,
+       CCERR_WRAP_ERROR = 45,
+       CCERR_INV_STAG_ACCESS_ERROR = 46,
+       CCERR_ZERO_RDMA_READ_RESOURCES = 47,
+       CCERR_QP_NOT_PRIVILEGED = 48,
+       CCERR_STAG_STATE_NOT_INVALID = 49,
+       CCERR_INVALID_PAGE_SIZE = 50,
+       CCERR_INVALID_BUFFER_SIZE = 51,
+       CCERR_INVALID_PBE = 52,
+       CCERR_INVALID_FBO = 53,
+       CCERR_INVALID_LENGTH = 54,
+       CCERR_INVALID_ACCESS_RIGHTS = 55,
+       CCERR_PBL_TOO_BIG = 56,
+       CCERR_INVALID_VA = 57,
+       CCERR_INVALID_REGION = 58,
+       CCERR_INVALID_WINDOW = 59,
+       CCERR_TOTAL_LENGTH_TOO_BIG = 60,
+       CCERR_INVALID_QP_ID = 61,
+       CCERR_ADDR_IN_USE = 62,
+       CCERR_ADDR_NOT_AVAIL = 63,
+       CCERR_NET_DOWN = 64,
+       CCERR_NET_UNREACHABLE = 65,
+       CCERR_CONN_ABORTED = 66,
+       CCERR_CONN_RESET = 67,
+       CCERR_NO_BUFS = 68,
+       CCERR_CONN_TIMEDOUT = 69,
+       CCERR_CONN_REFUSED = 70,
+       CCERR_HOST_UNREACHABLE = 71,
+       CCERR_INVALID_SEND_SGL_DEPTH = 72,
+       CCERR_INVALID_RECV_SGL_DEPTH = 73,
+       CCERR_INVALID_RDMA_WRITE_SGL_DEPTH = 74,
+       CCERR_INSUFFICIENT_PRIVILEGES = 75,
+       CCERR_STACK_ERROR = 76,
+       CCERR_INVALID_VERSION = 77,
+       CCERR_INVALID_MTU = 78,
+       CCERR_INVALID_IMAGE = 79,
+       CCERR_PENDING = 98,     /* not an error; used internally by adapter */
+       CCERR_DEFER = 99,       /* not an error; used internally by adapter */
+       CCERR_FAILED_WRITE = 100,
+       CCERR_FAILED_ERASE = 101,
+       CCERR_FAILED_VERIFICATION = 102,
+       CCERR_NOT_FOUND = 103,
+
+};
+
+/*
+ * CCAE_ACTIVE_CONNECT_RESULTS status result codes.
+ */
+enum c2_connect_status {
+       C2_CONN_STATUS_SUCCESS = C2_OK,
+       C2_CONN_STATUS_NO_MEM = CCERR_INSUFFICIENT_RESOURCES,
+       C2_CONN_STATUS_TIMEDOUT = CCERR_CONN_TIMEDOUT,
+       C2_CONN_STATUS_REFUSED = CCERR_CONN_REFUSED,
+       C2_CONN_STATUS_NETUNREACH = CCERR_NET_UNREACHABLE,
+       C2_CONN_STATUS_HOSTUNREACH = CCERR_HOST_UNREACHABLE,
+       C2_CONN_STATUS_INVALID_RNIC = CCERR_INVALID_RNIC,
+       C2_CONN_STATUS_INVALID_QP = CCERR_INVALID_QP,
+       C2_CONN_STATUS_INVALID_QP_STATE = CCERR_INVALID_QP_STATE,
+       C2_CONN_STATUS_REJECTED = CCERR_CONN_RESET,
+       C2_CONN_STATUS_ADDR_NOT_AVAIL = CCERR_ADDR_NOT_AVAIL,
+};
+
+/*
+ * Flash programming status codes.
+ */
+enum c2_flash_status {
+       C2_FLASH_STATUS_SUCCESS = 0x0000,
+       C2_FLASH_STATUS_VERIFY_ERR = 0x0002,
+       C2_FLASH_STATUS_IMAGE_ERR = 0x0004,
+       C2_FLASH_STATUS_ECLBS = 0x0400,
+       C2_FLASH_STATUS_PSLBS = 0x0800,
+       C2_FLASH_STATUS_VPENS = 0x1000,
+};
+
+#endif                         /* _C2_STATUS_H_ */
diff --git a/drivers/staging/rdma/amso1100/c2_user.h b/drivers/staging/rdma/amso1100/c2_user.h
new file mode 100644 (file)
index 0000000..7e9e7ad
--- /dev/null
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef C2_USER_H
+#define C2_USER_H
+
+#include <linux/types.h>
+
+/*
+ * Make sure that all structs defined in this file remain laid out so
+ * that they pack the same way on 32-bit and 64-bit architectures (to
+ * avoid incompatibility between 32-bit userspace and 64-bit kernels).
+ * In particular do not use pointer types -- pass pointers in __u64
+ * instead.
+ */
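
A minimal sketch of the convention described above, seen from the userspace side. The helper name and the idea of mmap()ed doorbell pages are assumptions for illustration; only the struct c2_create_cq fields come from this header.

    /* Hypothetical helper: pointers travel as __u64 so the layout is the
     * same for 32-bit and 64-bit callers. */
    static void example_fill_create_cq(struct c2_create_cq *cmd,
                                       void *arm_db_page, void *set_db_page)
    {
            cmd->arm_db_page = (__u64) (unsigned long) arm_db_page;
            cmd->set_db_page = (__u64) (unsigned long) set_db_page;
    }
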
+
+struct c2_alloc_ucontext_resp {
+       __u32 qp_tab_size;
+       __u32 uarc_size;
+};
+
+struct c2_alloc_pd_resp {
+       __u32 pdn;
+       __u32 reserved;
+};
+
+struct c2_create_cq {
+       __u32 lkey;
+       __u32 pdn;
+       __u64 arm_db_page;
+       __u64 set_db_page;
+       __u32 arm_db_index;
+       __u32 set_db_index;
+};
+
+struct c2_create_cq_resp {
+       __u32 cqn;
+       __u32 reserved;
+};
+
+struct c2_create_qp {
+       __u32 lkey;
+       __u32 reserved;
+       __u64 sq_db_page;
+       __u64 rq_db_page;
+       __u32 sq_db_index;
+       __u32 rq_db_index;
+};
+
+#endif                         /* C2_USER_H */
diff --git a/drivers/staging/rdma/amso1100/c2_vq.c b/drivers/staging/rdma/amso1100/c2_vq.c
new file mode 100644 (file)
index 0000000..2ec716f
--- /dev/null
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+
+#include "c2_vq.h"
+#include "c2_provider.h"
+
+/*
+ * Verbs Request Objects:
+ *
+ * VQ Request Objects are allocated by the kernel verbs handlers.
+ * They contain a wait object, a refcnt, an atomic bool indicating that the
+ * adapter has replied, and a copy of the verb reply work request.
+ * A pointer to the VQ Request Object is passed down in the context
+ * field of the work request message, and reflected back by the adapter
+ * in the verbs reply message.  The function handle_vq() in the interrupt
+ * path will use this pointer to:
+ *     1) append a copy of the verbs reply message
+ *     2) mark that the reply is ready
+ *     3) wake up the kernel verbs handler blocked awaiting the reply.
+ *
+ *
+ * The kernel verbs handlers do a "get" to put a 2nd reference on the
+ * VQ Request object.  If the kernel verbs handler exits before the adapter
+ * can respond, this extra reference will keep the VQ Request object around
+ * until the adapter's reply can be processed.  The reason we need this is
+ * because a pointer to this object is stuffed into the context field of
+ * the verbs work request message, and reflected back in the reply message.
+ * It is used in the interrupt handler (handle_vq()) to wake up the appropriate
+ * kernel verb handler that is blocked awaiting the verb reply.
+ * So handle_vq() will do a "put" on the object when it's done accessing it.
+ * NOTE:  If we guarantee that the kernel verb handler will never bail before
+ *        getting the reply, then we don't need these refcnts.
+ *
+ *
+ * VQ Request objects are freed by the kernel verbs handlers only
+ * after the verb has been processed, or when the adapter fails and
+ * does not reply.
+ *
+ *
+ * Verbs Reply Buffers:
+ *
+ * VQ Reply bufs are local host memory copies of an outstanding
+ * Verb Request reply message.  They are always allocated by the
+ * kernel verbs handlers, and _may_ be
+ * freed by either the kernel verbs handler -or- the interrupt handler.  The
+ * kernel verbs handler _must_ free the repbuf, then free the vq request object
+ * in that order.
+ */
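
To make the lifecycle above concrete, here is a hedged sketch of the alloc / get / send / wait / put pattern a kernel verbs handler would follow. The function name is invented and the work request is assumed to be filled in by the caller; the vq_* calls and the reply_msg/context fields are the ones declared in c2_vq.h and c2_wr.h, and the real callers elsewhere in the driver may differ in detail.

    /* Illustrative only -- not part of the driver. */
    static int example_vq_transaction(struct c2_dev *c2dev, union c2wr *wr)
    {
            struct c2_vq_req *vq_req;
            void *reply;
            int err;

            vq_req = vq_req_alloc(c2dev);
            if (!vq_req)
                    return -ENOMEM;

            /* The adapter reflects this pointer back in the reply message. */
            wr->hdr.context = (unsigned long) vq_req;

            /* 2nd reference keeps the object alive if we bail early. */
            vq_req_get(c2dev, vq_req);

            err = vq_send_wr(c2dev, wr);
            if (err) {
                    vq_req_put(c2dev, vq_req);      /* drop the extra ref */
                    goto out;
            }

            err = vq_wait_for_reply(c2dev, vq_req);
            if (err)
                    goto out;

            /* handle_vq() stashed the reply buffer pointer in reply_msg. */
            reply = (void *) (unsigned long) vq_req->reply_msg;
            /* ... examine the reply here ... */
            vq_repbuf_free(c2dev, reply);
    out:
            vq_req_free(c2dev, vq_req);
            return err;
    }
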
+
+int vq_init(struct c2_dev *c2dev)
+{
+       sprintf(c2dev->vq_cache_name, "c2-vq:dev%c",
+               (char) ('0' + c2dev->devnum));
+       c2dev->host_msg_cache =
+           kmem_cache_create(c2dev->vq_cache_name, c2dev->rep_vq.msg_size, 0,
+                             SLAB_HWCACHE_ALIGN, NULL);
+       if (c2dev->host_msg_cache == NULL) {
+               return -ENOMEM;
+       }
+       return 0;
+}
+
+void vq_term(struct c2_dev *c2dev)
+{
+       kmem_cache_destroy(c2dev->host_msg_cache);
+}
+
+/* vq_req_alloc - allocate a VQ Request Object and initialize it.
+ * The refcnt is set to 1.
+ */
+struct c2_vq_req *vq_req_alloc(struct c2_dev *c2dev)
+{
+       struct c2_vq_req *r;
+
+       r = kmalloc(sizeof(struct c2_vq_req), GFP_KERNEL);
+       if (r) {
+               init_waitqueue_head(&r->wait_object);
+               r->reply_msg = 0;
+               r->event = 0;
+               r->cm_id = NULL;
+               r->qp = NULL;
+               atomic_set(&r->refcnt, 1);
+               atomic_set(&r->reply_ready, 0);
+       }
+       return r;
+}
+
+
+/* vq_req_free - free the VQ Request Object.  It is assumed the verbs handler
+ * has already freed the VQ Reply Buffer if it existed.
+ */
+void vq_req_free(struct c2_dev *c2dev, struct c2_vq_req *r)
+{
+       r->reply_msg = 0;
+       if (atomic_dec_and_test(&r->refcnt)) {
+               kfree(r);
+       }
+}
+
+/* vq_req_get - reference a VQ Request Object.  Done
+ * only in the kernel verbs handlers.
+ */
+void vq_req_get(struct c2_dev *c2dev, struct c2_vq_req *r)
+{
+       atomic_inc(&r->refcnt);
+}
+
+
+/* vq_req_put - dereference and potentially free a VQ Request Object.
+ *
+ * This is only called by handle_vq() on the
+ * interrupt when it is done processing
+ * a verb reply message.  If the associated
+ * kernel verbs handler has already bailed,
+ * then this put will actually free the VQ
+ * Request object _and_ the VQ Reply Buffer
+ * if it exists.
+ */
+void vq_req_put(struct c2_dev *c2dev, struct c2_vq_req *r)
+{
+       if (atomic_dec_and_test(&r->refcnt)) {
+               if (r->reply_msg != 0)
+                       vq_repbuf_free(c2dev,
+                                      (void *) (unsigned long) r->reply_msg);
+               kfree(r);
+       }
+}
+
+
+/*
+ * vq_repbuf_alloc - allocate a VQ Reply Buffer.
+ */
+void *vq_repbuf_alloc(struct c2_dev *c2dev)
+{
+       return kmem_cache_alloc(c2dev->host_msg_cache, GFP_ATOMIC);
+}
+
+/*
+ * vq_send_wr - post a verbs request message to the Verbs Request Queue.
+ * If a message is not available in the MQ, then block until one is available.
+ * NOTE: handle_mq() on the interrupt context will wake up threads blocked here.
+ * When the adapter drains the Verbs Request Queue,
+ * it inserts MQ index 0 into the
+ * adapter->host activity fifo and interrupts the host.
+ */
+int vq_send_wr(struct c2_dev *c2dev, union c2wr *wr)
+{
+       void *msg;
+       wait_queue_t __wait;
+
+       /*
+        * grab adapter vq lock
+        */
+       spin_lock(&c2dev->vqlock);
+
+       /*
+        * allocate msg
+        */
+       msg = c2_mq_alloc(&c2dev->req_vq);
+
+       /*
+        * If we cannot get a msg, then we'll wait.
+        * When a message becomes available, the interrupt handler will wake_up()
+        * any waiters.
+        */
+       while (msg == NULL) {
+               pr_debug("%s:%d no available msg in VQ, waiting...\n",
+                      __func__, __LINE__);
+               init_waitqueue_entry(&__wait, current);
+               add_wait_queue(&c2dev->req_vq_wo, &__wait);
+               spin_unlock(&c2dev->vqlock);
+               for (;;) {
+                       set_current_state(TASK_INTERRUPTIBLE);
+                       if (!c2_mq_full(&c2dev->req_vq)) {
+                               break;
+                       }
+                       if (!signal_pending(current)) {
+                               schedule_timeout(1 * HZ);       /* 1 second... */
+                               continue;
+                       }
+                       set_current_state(TASK_RUNNING);
+                       remove_wait_queue(&c2dev->req_vq_wo, &__wait);
+                       return -EINTR;
+               }
+               set_current_state(TASK_RUNNING);
+               remove_wait_queue(&c2dev->req_vq_wo, &__wait);
+               spin_lock(&c2dev->vqlock);
+               msg = c2_mq_alloc(&c2dev->req_vq);
+       }
+
+       /*
+        * copy wr into adapter msg
+        */
+       memcpy(msg, wr, c2dev->req_vq.msg_size);
+
+       /*
+        * post msg
+        */
+       c2_mq_produce(&c2dev->req_vq);
+
+       /*
+        * release adapter vq lock
+        */
+       spin_unlock(&c2dev->vqlock);
+       return 0;
+}
+
+
+/*
+ * vq_wait_for_reply - block until the adapter posts a Verb Reply Message.
+ */
+int vq_wait_for_reply(struct c2_dev *c2dev, struct c2_vq_req *req)
+{
+       if (!wait_event_timeout(req->wait_object,
+                               atomic_read(&req->reply_ready),
+                               60*HZ))
+               return -ETIMEDOUT;
+
+       return 0;
+}
+
+/*
+ * vq_repbuf_free - Free a Verbs Reply Buffer.
+ */
+void vq_repbuf_free(struct c2_dev *c2dev, void *reply)
+{
+       kmem_cache_free(c2dev->host_msg_cache, reply);
+}
diff --git a/drivers/staging/rdma/amso1100/c2_vq.h b/drivers/staging/rdma/amso1100/c2_vq.h
new file mode 100644 (file)
index 0000000..3380562
--- /dev/null
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _C2_VQ_H_
+#define _C2_VQ_H_
+#include <linux/sched.h>
+#include "c2.h"
+#include "c2_wr.h"
+#include "c2_provider.h"
+
+struct c2_vq_req {
+       u64 reply_msg;          /* ptr to reply msg */
+       wait_queue_head_t wait_object;  /* wait object for vq reqs */
+       atomic_t reply_ready;   /* set when reply is ready */
+       atomic_t refcnt;        /* used to cancel WRs... */
+       int event;
+       struct iw_cm_id *cm_id;
+       struct c2_qp *qp;
+};
+
+extern int vq_init(struct c2_dev *c2dev);
+extern void vq_term(struct c2_dev *c2dev);
+
+extern struct c2_vq_req *vq_req_alloc(struct c2_dev *c2dev);
+extern void vq_req_free(struct c2_dev *c2dev, struct c2_vq_req *req);
+extern void vq_req_get(struct c2_dev *c2dev, struct c2_vq_req *req);
+extern void vq_req_put(struct c2_dev *c2dev, struct c2_vq_req *req);
+extern int vq_send_wr(struct c2_dev *c2dev, union c2wr * wr);
+
+extern void *vq_repbuf_alloc(struct c2_dev *c2dev);
+extern void vq_repbuf_free(struct c2_dev *c2dev, void *reply);
+
+extern int vq_wait_for_reply(struct c2_dev *c2dev, struct c2_vq_req *req);
+#endif                         /* _C2_VQ_H_ */
diff --git a/drivers/staging/rdma/amso1100/c2_wr.h b/drivers/staging/rdma/amso1100/c2_wr.h
new file mode 100644 (file)
index 0000000..8d4b4ca
--- /dev/null
@@ -0,0 +1,1520 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _C2_WR_H_
+#define _C2_WR_H_
+
+#ifdef CCDEBUG
+#define CCWR_MAGIC             0xb07700b0
+#endif
+
+#define C2_QP_NO_ATTR_CHANGE 0xFFFFFFFF
+
+/* Maximum allowed size in bytes of private_data exchange
+ * on connect.
+ */
+#define C2_MAX_PRIVATE_DATA_SIZE 200
+
+/*
+ * These types are shared among the adapter, host, and CCIL consumer.
+ */
+enum c2_cq_notification_type {
+       C2_CQ_NOTIFICATION_TYPE_NONE = 1,
+       C2_CQ_NOTIFICATION_TYPE_NEXT,
+       C2_CQ_NOTIFICATION_TYPE_NEXT_SE
+};
+
+enum c2_setconfig_cmd {
+       C2_CFG_ADD_ADDR = 1,
+       C2_CFG_DEL_ADDR = 2,
+       C2_CFG_ADD_ROUTE = 3,
+       C2_CFG_DEL_ROUTE = 4
+};
+
+enum c2_getconfig_cmd {
+       C2_GETCONFIG_ROUTES = 1,
+       C2_GETCONFIG_ADDRS
+};
+
+/*
+ *  CCIL Work Request Identifiers
+ */
+enum c2wr_ids {
+       CCWR_RNIC_OPEN = 1,
+       CCWR_RNIC_QUERY,
+       CCWR_RNIC_SETCONFIG,
+       CCWR_RNIC_GETCONFIG,
+       CCWR_RNIC_CLOSE,
+       CCWR_CQ_CREATE,
+       CCWR_CQ_QUERY,
+       CCWR_CQ_MODIFY,
+       CCWR_CQ_DESTROY,
+       CCWR_QP_CONNECT,
+       CCWR_PD_ALLOC,
+       CCWR_PD_DEALLOC,
+       CCWR_SRQ_CREATE,
+       CCWR_SRQ_QUERY,
+       CCWR_SRQ_MODIFY,
+       CCWR_SRQ_DESTROY,
+       CCWR_QP_CREATE,
+       CCWR_QP_QUERY,
+       CCWR_QP_MODIFY,
+       CCWR_QP_DESTROY,
+       CCWR_NSMR_STAG_ALLOC,
+       CCWR_NSMR_REGISTER,
+       CCWR_NSMR_PBL,
+       CCWR_STAG_DEALLOC,
+       CCWR_NSMR_REREGISTER,
+       CCWR_SMR_REGISTER,
+       CCWR_MR_QUERY,
+       CCWR_MW_ALLOC,
+       CCWR_MW_QUERY,
+       CCWR_EP_CREATE,
+       CCWR_EP_GETOPT,
+       CCWR_EP_SETOPT,
+       CCWR_EP_DESTROY,
+       CCWR_EP_BIND,
+       CCWR_EP_CONNECT,
+       CCWR_EP_LISTEN,
+       CCWR_EP_SHUTDOWN,
+       CCWR_EP_LISTEN_CREATE,
+       CCWR_EP_LISTEN_DESTROY,
+       CCWR_EP_QUERY,
+       CCWR_CR_ACCEPT,
+       CCWR_CR_REJECT,
+       CCWR_CONSOLE,
+       CCWR_TERM,
+       CCWR_FLASH_INIT,
+       CCWR_FLASH,
+       CCWR_BUF_ALLOC,
+       CCWR_BUF_FREE,
+       CCWR_FLASH_WRITE,
+       CCWR_INIT,              /* WARNING: Don't move this ever again! */
+
+
+
+       /* Add new IDs here */
+
+
+
+       /*
+        * WARNING: CCWR_LAST must always be the last verbs id defined!
+        *          All the preceding IDs are fixed, and must not change.
+        *          You can add new IDs, but must not remove or reorder
+        *          any IDs. If you do, YOU will ruin any hope of
+        *          compatibility between versions.
+        */
+       CCWR_LAST,
+
+       /*
+        * Start over at 1 so that arrays indexed by user wr id's
+        * begin at 1.  This is OK since the verbs and user wr id's
+        * are always used on disjoint sets of queues.
+        */
+       /*
+        * The order of the CCWR_SEND_XX verbs must
+        * match the order of the RDMA_OPs
+        */
+       CCWR_SEND = 1,
+       CCWR_SEND_INV,
+       CCWR_SEND_SE,
+       CCWR_SEND_SE_INV,
+       CCWR_RDMA_WRITE,
+       CCWR_RDMA_READ,
+       CCWR_RDMA_READ_INV,
+       CCWR_MW_BIND,
+       CCWR_NSMR_FASTREG,
+       CCWR_STAG_INVALIDATE,
+       CCWR_RECV,
+       CCWR_NOP,
+       CCWR_UNIMPL,
+/* WARNING: This must always be the last user wr id defined! */
+};
+#define RDMA_SEND_OPCODE_FROM_WR_ID(x)   (x+2)
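A small worked example of the macro above: with CCWR_SEND equal to 1, the expression evaluates to 3, and the _SE/_INV variants follow in the same order. That 3 is the matching RDMA_OP send opcode is implied by the ordering comment in the enum but cannot be verified from this header alone.

    /* Illustration only: expands to CCWR_SEND + 2 == 3. */
    int send_opcode = RDMA_SEND_OPCODE_FROM_WR_ID(CCWR_SEND);
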
+
+/*
+ * SQ/RQ Work Request Types
+ */
+enum c2_wr_type {
+       C2_WR_TYPE_SEND = CCWR_SEND,
+       C2_WR_TYPE_SEND_SE = CCWR_SEND_SE,
+       C2_WR_TYPE_SEND_INV = CCWR_SEND_INV,
+       C2_WR_TYPE_SEND_SE_INV = CCWR_SEND_SE_INV,
+       C2_WR_TYPE_RDMA_WRITE = CCWR_RDMA_WRITE,
+       C2_WR_TYPE_RDMA_READ = CCWR_RDMA_READ,
+       C2_WR_TYPE_RDMA_READ_INV_STAG = CCWR_RDMA_READ_INV,
+       C2_WR_TYPE_BIND_MW = CCWR_MW_BIND,
+       C2_WR_TYPE_FASTREG_NSMR = CCWR_NSMR_FASTREG,
+       C2_WR_TYPE_INV_STAG = CCWR_STAG_INVALIDATE,
+       C2_WR_TYPE_RECV = CCWR_RECV,
+       C2_WR_TYPE_NOP = CCWR_NOP,
+};
+
+struct c2_netaddr {
+       __be32 ip_addr;
+       __be32 netmask;
+       u32 mtu;
+};
+
+struct c2_route {
+       u32 ip_addr;            /* 0 indicates the default route */
+       u32 netmask;            /* netmask associated with dst */
+       u32 flags;
+       union {
+               u32 ipaddr;     /* address of the nexthop interface */
+               u8 enaddr[6];
+       } nexthop;
+};
+
+/*
+ * A Scatter Gather Entry.
+ */
+struct c2_data_addr {
+       __be32 stag;
+       __be32 length;
+       __be64 to;
+};
+
+/*
+ * MR and MW flags used by the consumer, RI, and RNIC.
+ */
+enum c2_mm_flags {
+       MEM_REMOTE = 0x0001,    /* allow mw binds with remote access. */
+       MEM_VA_BASED = 0x0002,  /* Not Zero-based */
+       MEM_PBL_COMPLETE = 0x0004,      /* PBL array is complete in this msg */
+       MEM_LOCAL_READ = 0x0008,        /* allow local reads */
+       MEM_LOCAL_WRITE = 0x0010,       /* allow local writes */
+       MEM_REMOTE_READ = 0x0020,       /* allow remote reads */
+       MEM_REMOTE_WRITE = 0x0040,      /* allow remote writes */
+       MEM_WINDOW_BIND = 0x0080,       /* binds allowed */
+       MEM_SHARED = 0x0100,    /* set if MR is shared */
+       MEM_STAG_VALID = 0x0200 /* set if STAG is in valid state */
+};
+
+/*
+ * CCIL API ACF flags defined in terms of the low level mem flags.
+ * This minimizes translation needed in the user API
+ */
+enum c2_acf {
+       C2_ACF_LOCAL_READ = MEM_LOCAL_READ,
+       C2_ACF_LOCAL_WRITE = MEM_LOCAL_WRITE,
+       C2_ACF_REMOTE_READ = MEM_REMOTE_READ,
+       C2_ACF_REMOTE_WRITE = MEM_REMOTE_WRITE,
+       C2_ACF_WINDOW_BIND = MEM_WINDOW_BIND
+};
+
+/*
+ * Image types of objects written to flash
+ */
+#define C2_FLASH_IMG_BITFILE 1
+#define C2_FLASH_IMG_OPTION_ROM 2
+#define C2_FLASH_IMG_VPD 3
+
+/*
+ *  To fix bug 1815 we define the maximum allowable size of the
+ *  terminate message (per the IETF spec; refer to the IETF
+ *  protocol specification, section 12.1.6, page 64).
+ *  The message is prefixed by 20 bytes of DDP info.
+ *
+ *  Then the message has 6 bytes for the terminate control
+ *  and DDP segment length info plus a DDP header (either
+ *  14 or 18 bytes) plus 28 bytes for the RDMA header.
+ *  Thus the max size is:
+ */
+#define C2_MAX_TERMINATE_MESSAGE_SIZE (72)
+
+/*
+ * Build String Length.  It must be the same as C2_BUILD_STR_LEN in ccil_api.h
+ */
+#define WR_BUILD_STR_LEN 64
+
+/*
+ * WARNING:  All of these structs need to align any 64bit types on
+ * 64 bit boundaries!  64bit types include u64 and __be64.
+ */
+
+/*
+ * Clustercore Work Request Header.  Be sensitive to field layout
+ * and alignment.
+ */
+struct c2wr_hdr {
+       /* wqe_count is part of the cqe.  It is put here so the
+        * adapter can write to it while the wr is pending without
+        * clobbering part of the wr.  This word need not be dma'd
+        * from the host to adapter by libccil, but we copy it anyway
+        * to make the memcpy to the adapter better aligned.
+        */
+       __be32 wqe_count;
+
+       /* Put these fields next so that later 32- and 64-bit
+        * quantities are naturally aligned.
+        */
+       u8 id;
+       u8 result;              /* adapter -> host */
+       u8 sge_count;           /* host -> adapter */
+       u8 flags;               /* host -> adapter */
+
+       u64 context;
+#ifdef CCMSGMAGIC
+       u32 magic;
+       u32 pad;
+#endif
+} __attribute__((packed));
+
+/*
+ *------------------------ RNIC ------------------------
+ */
+
+/*
+ * WR_RNIC_OPEN
+ */
+
+/*
+ * Flags for the RNIC WRs
+ */
+enum c2_rnic_flags {
+       RNIC_IRD_STATIC = 0x0001,
+       RNIC_ORD_STATIC = 0x0002,
+       RNIC_QP_STATIC = 0x0004,
+       RNIC_SRQ_SUPPORTED = 0x0008,
+       RNIC_PBL_BLOCK_MODE = 0x0010,
+       RNIC_SRQ_MODEL_ARRIVAL = 0x0020,
+       RNIC_CQ_OVF_DETECTED = 0x0040,
+       RNIC_PRIV_MODE = 0x0080
+};
+
+struct c2wr_rnic_open_req {
+       struct c2wr_hdr hdr;
+       u64 user_context;
+       __be16 flags;           /* See enum c2_rnic_flags */
+       __be16 port_num;
+} __attribute__((packed));
+
+struct c2wr_rnic_open_rep {
+       struct c2wr_hdr hdr;
+       u32 rnic_handle;
+} __attribute__((packed));
+
+union c2wr_rnic_open {
+       struct c2wr_rnic_open_req req;
+       struct c2wr_rnic_open_rep rep;
+} __attribute__((packed));
+
+struct c2wr_rnic_query_req {
+       struct c2wr_hdr hdr;
+       u32 rnic_handle;
+} __attribute__((packed));
+
+/*
+ * WR_RNIC_QUERY
+ */
+struct c2wr_rnic_query_rep {
+       struct c2wr_hdr hdr;
+       u64 user_context;
+       __be32 vendor_id;
+       __be32 part_number;
+       __be32 hw_version;
+       __be32 fw_ver_major;
+       __be32 fw_ver_minor;
+       __be32 fw_ver_patch;
+       char fw_ver_build_str[WR_BUILD_STR_LEN];
+       __be32 max_qps;
+       __be32 max_qp_depth;
+       u32 max_srq_depth;
+       u32 max_send_sgl_depth;
+       u32 max_rdma_sgl_depth;
+       __be32 max_cqs;
+       __be32 max_cq_depth;
+       u32 max_cq_event_handlers;
+       __be32 max_mrs;
+       u32 max_pbl_depth;
+       __be32 max_pds;
+       __be32 max_global_ird;
+       u32 max_global_ord;
+       __be32 max_qp_ird;
+       __be32 max_qp_ord;
+       u32 flags;
+       __be32 max_mws;
+       u32 pbe_range_low;
+       u32 pbe_range_high;
+       u32 max_srqs;
+       u32 page_size;
+} __attribute__((packed));
+
+union c2wr_rnic_query {
+       struct c2wr_rnic_query_req req;
+       struct c2wr_rnic_query_rep rep;
+} __attribute__((packed));
+
+/*
+ * WR_RNIC_GETCONFIG
+ */
+
+struct c2wr_rnic_getconfig_req {
+       struct c2wr_hdr hdr;
+       u32 rnic_handle;
+       u32 option;             /* see c2_getconfig_cmd_t */
+       u64 reply_buf;
+       u32 reply_buf_len;
+} __attribute__((packed)) ;
+
+struct c2wr_rnic_getconfig_rep {
+       struct c2wr_hdr hdr;
+       u32 option;             /* see c2_getconfig_cmd_t */
+       u32 count_len;          /* length of the number of addresses configured */
+} __attribute__((packed)) ;
+
+union c2wr_rnic_getconfig {
+       struct c2wr_rnic_getconfig_req req;
+       struct c2wr_rnic_getconfig_rep rep;
+} __attribute__((packed)) ;
+
+/*
+ * WR_RNIC_SETCONFIG
+ */
+struct c2wr_rnic_setconfig_req {
+       struct c2wr_hdr hdr;
+       u32 rnic_handle;
+       __be32 option;          /* See c2_setconfig_cmd_t */
+       /* variable data and pad. See c2_netaddr and c2_route */
+       u8 data[0];
+} __attribute__((packed)) ;
+
+struct c2wr_rnic_setconfig_rep {
+       struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_rnic_setconfig {
+       struct c2wr_rnic_setconfig_req req;
+       struct c2wr_rnic_setconfig_rep rep;
+} __attribute__((packed)) ;
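
The data[] tail of the setconfig request carries a struct c2_netaddr or a struct c2_route, depending on option. Below is a hedged sketch of assembling an add-address request; the function name, the zero mtu placeholder, and the assumption that the addresses are already big-endian are illustrative, and posting the request via vq_send_wr() is left out.

    /* Hypothetical builder for a CCWR_RNIC_SETCONFIG add-address request. */
    static struct c2wr_rnic_setconfig_req *
    example_build_add_addr(__be32 ip_addr, __be32 netmask)
    {
            struct c2wr_rnic_setconfig_req *wr;
            struct c2_netaddr netaddr;

            wr = kzalloc(sizeof(*wr) + sizeof(netaddr), GFP_KERNEL);
            if (!wr)
                    return NULL;

            c2_wr_set_id(wr, CCWR_RNIC_SETCONFIG);
            wr->option = cpu_to_be32(C2_CFG_ADD_ADDR);
            /* rnic_handle would come from the rnic_open reply (assumed). */

            netaddr.ip_addr = ip_addr;
            netaddr.netmask = netmask;
            netaddr.mtu = 0;                /* placeholder */
            memcpy(wr->data, &netaddr, sizeof(netaddr));
            return wr;
    }
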
+
+/*
+ * WR_RNIC_CLOSE
+ */
+struct c2wr_rnic_close_req {
+       struct c2wr_hdr hdr;
+       u32 rnic_handle;
+} __attribute__((packed)) ;
+
+struct c2wr_rnic_close_rep {
+       struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_rnic_close {
+       struct c2wr_rnic_close_req req;
+       struct c2wr_rnic_close_rep rep;
+} __attribute__((packed)) ;
+
+/*
+ *------------------------ CQ ------------------------
+ */
+struct c2wr_cq_create_req {
+       struct c2wr_hdr hdr;
+       __be64 shared_ht;
+       u64 user_context;
+       __be64 msg_pool;
+       u32 rnic_handle;
+       __be32 msg_size;
+       __be32 depth;
+} __attribute__((packed)) ;
+
+struct c2wr_cq_create_rep {
+       struct c2wr_hdr hdr;
+       __be32 mq_index;
+       __be32 adapter_shared;
+       u32 cq_handle;
+} __attribute__((packed)) ;
+
+union c2wr_cq_create {
+       struct c2wr_cq_create_req req;
+       struct c2wr_cq_create_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_cq_modify_req {
+       struct c2wr_hdr hdr;
+       u32 rnic_handle;
+       u32 cq_handle;
+       u32 new_depth;
+       u64 new_msg_pool;
+} __attribute__((packed)) ;
+
+struct c2wr_cq_modify_rep {
+       struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_cq_modify {
+       struct c2wr_cq_modify_req req;
+       struct c2wr_cq_modify_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_cq_destroy_req {
+       struct c2wr_hdr hdr;
+       u32 rnic_handle;
+       u32 cq_handle;
+} __attribute__((packed)) ;
+
+struct c2wr_cq_destroy_rep {
+       struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_cq_destroy {
+       struct c2wr_cq_destroy_req req;
+       struct c2wr_cq_destroy_rep rep;
+} __attribute__((packed)) ;
+
+/*
+ *------------------------ PD ------------------------
+ */
+struct c2wr_pd_alloc_req {
+       struct c2wr_hdr hdr;
+       u32 rnic_handle;
+       u32 pd_id;
+} __attribute__((packed)) ;
+
+struct c2wr_pd_alloc_rep {
+       struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_pd_alloc {
+       struct c2wr_pd_alloc_req req;
+       struct c2wr_pd_alloc_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_pd_dealloc_req {
+       struct c2wr_hdr hdr;
+       u32 rnic_handle;
+       u32 pd_id;
+} __attribute__((packed)) ;
+
+struct c2wr_pd_dealloc_rep {
+       struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_pd_dealloc {
+       struct c2wr_pd_dealloc_req req;
+       struct c2wr_pd_dealloc_rep rep;
+} __attribute__((packed)) ;
+
+/*
+ *------------------------ SRQ ------------------------
+ */
+struct c2wr_srq_create_req {
+       struct c2wr_hdr hdr;
+       u64 shared_ht;
+       u64 user_context;
+       u32 rnic_handle;
+       u32 srq_depth;
+       u32 srq_limit;
+       u32 sgl_depth;
+       u32 pd_id;
+} __attribute__((packed)) ;
+
+struct c2wr_srq_create_rep {
+       struct c2wr_hdr hdr;
+       u32 srq_depth;
+       u32 sgl_depth;
+       u32 msg_size;
+       u32 mq_index;
+       u32 mq_start;
+       u32 srq_handle;
+} __attribute__((packed)) ;
+
+union c2wr_srq_create {
+       struct c2wr_srq_create_req req;
+       struct c2wr_srq_create_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_srq_destroy_req {
+       struct c2wr_hdr hdr;
+       u32 rnic_handle;
+       u32 srq_handle;
+} __attribute__((packed)) ;
+
+struct c2wr_srq_destroy_rep {
+       struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_srq_destroy {
+       struct c2wr_srq_destroy_req req;
+       struct c2wr_srq_destroy_rep rep;
+} __attribute__((packed)) ;
+
+/*
+ *------------------------ QP ------------------------
+ */
+enum c2wr_qp_flags {
+       QP_RDMA_READ = 0x00000001,      /* RDMA read enabled? */
+       QP_RDMA_WRITE = 0x00000002,     /* RDMA write enabled? */
+       QP_MW_BIND = 0x00000004,        /* MWs enabled */
+       QP_ZERO_STAG = 0x00000008,      /* enabled? */
+       QP_REMOTE_TERMINATION = 0x00000010,     /* remote end terminated */
+       QP_RDMA_READ_RESPONSE = 0x00000020      /* Remote RDMA read  */
+           /* enabled? */
+};
+
+struct c2wr_qp_create_req {
+       struct c2wr_hdr hdr;
+       __be64 shared_sq_ht;
+       __be64 shared_rq_ht;
+       u64 user_context;
+       u32 rnic_handle;
+       u32 sq_cq_handle;
+       u32 rq_cq_handle;
+       __be32 sq_depth;
+       __be32 rq_depth;
+       u32 srq_handle;
+       u32 srq_limit;
+       __be32 flags;           /* see enum c2wr_qp_flags */
+       __be32 send_sgl_depth;
+       __be32 recv_sgl_depth;
+       __be32 rdma_write_sgl_depth;
+       __be32 ord;
+       __be32 ird;
+       u32 pd_id;
+} __attribute__((packed)) ;
+
+struct c2wr_qp_create_rep {
+       struct c2wr_hdr hdr;
+       __be32 sq_depth;
+       __be32 rq_depth;
+       u32 send_sgl_depth;
+       u32 recv_sgl_depth;
+       u32 rdma_write_sgl_depth;
+       u32 ord;
+       u32 ird;
+       __be32 sq_msg_size;
+       __be32 sq_mq_index;
+       __be32 sq_mq_start;
+       __be32 rq_msg_size;
+       __be32 rq_mq_index;
+       __be32 rq_mq_start;
+       u32 qp_handle;
+} __attribute__((packed)) ;
+
+union c2wr_qp_create {
+       struct c2wr_qp_create_req req;
+       struct c2wr_qp_create_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_qp_query_req {
+       struct c2wr_hdr hdr;
+       u32 rnic_handle;
+       u32 qp_handle;
+} __attribute__((packed)) ;
+
+struct c2wr_qp_query_rep {
+       struct c2wr_hdr hdr;
+       u64 user_context;
+       u32 rnic_handle;
+       u32 sq_depth;
+       u32 rq_depth;
+       u32 send_sgl_depth;
+       u32 rdma_write_sgl_depth;
+       u32 recv_sgl_depth;
+       u32 ord;
+       u32 ird;
+       u16 qp_state;
+       u16 flags;              /* see c2wr_qp_flags_t */
+       u32 qp_id;
+       u32 local_addr;
+       u32 remote_addr;
+       u16 local_port;
+       u16 remote_port;
+       u32 terminate_msg_length;       /* 0 if not present */
+       u8 data[0];
+       /* Terminate Message in-line here. */
+} __attribute__((packed)) ;
+
+union c2wr_qp_query {
+       struct c2wr_qp_query_req req;
+       struct c2wr_qp_query_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_qp_modify_req {
+       struct c2wr_hdr hdr;
+       u64 stream_msg;
+       u32 stream_msg_length;
+       u32 rnic_handle;
+       u32 qp_handle;
+       __be32 next_qp_state;
+       __be32 ord;
+       __be32 ird;
+       __be32 sq_depth;
+       __be32 rq_depth;
+       u32 llp_ep_handle;
+} __attribute__((packed)) ;
+
+struct c2wr_qp_modify_rep {
+       struct c2wr_hdr hdr;
+       u32 ord;
+       u32 ird;
+       u32 sq_depth;
+       u32 rq_depth;
+       u32 sq_msg_size;
+       u32 sq_mq_index;
+       u32 sq_mq_start;
+       u32 rq_msg_size;
+       u32 rq_mq_index;
+       u32 rq_mq_start;
+} __attribute__((packed)) ;
+
+union c2wr_qp_modify {
+       struct c2wr_qp_modify_req req;
+       struct c2wr_qp_modify_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_qp_destroy_req {
+       struct c2wr_hdr hdr;
+       u32 rnic_handle;
+       u32 qp_handle;
+} __attribute__((packed)) ;
+
+struct c2wr_qp_destroy_rep {
+       struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_qp_destroy {
+       struct c2wr_qp_destroy_req req;
+       struct c2wr_qp_destroy_rep rep;
+} __attribute__((packed)) ;
+
+/*
+ * The CCWR_QP_CONNECT msg is posted on the verbs request queue.  It can
+ * only be posted when a QP is in IDLE state.  After the connect request is
+ * submitted to the LLP, the adapter moves the QP to CONNECT_PENDING state.
+ * No synchronous reply from adapter to this WR.  The results of
+ * connection are passed back in an async event CCAE_ACTIVE_CONNECT_RESULTS
+ * See c2wr_ae_active_connect_results_t
+ */
+struct c2wr_qp_connect_req {
+       struct c2wr_hdr hdr;
+       u32 rnic_handle;
+       u32 qp_handle;
+       __be32 remote_addr;
+       __be16 remote_port;
+       u16 pad;
+       __be32 private_data_length;
+       u8 private_data[0];     /* Private data in-line. */
+} __attribute__((packed)) ;
+
+struct c2wr_qp_connect {
+       struct c2wr_qp_connect_req req;
+       /* no synchronous reply.         */
+} __attribute__((packed)) ;
+
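
Because the connect WR has no synchronous reply, posting it is fire-and-forget from the verbs path; the outcome arrives later as a CCAE_ACTIVE_CONNECT_RESULTS async event. A hedged sketch of building and posting such a request follows; the function name is invented, the handles are assumed to come from earlier open/create replies, and the real driver code may differ.

    /* Illustrative only -- builds a CCWR_QP_CONNECT request and posts it. */
    static int example_qp_connect(struct c2_dev *c2dev, u32 rnic_handle,
                                  u32 qp_handle, __be32 remote_addr,
                                  __be16 remote_port,
                                  const void *pdata, u32 pdata_len)
    {
            struct c2wr_qp_connect_req *wr;
            int err;

            if (pdata_len > C2_MAX_PRIVATE_DATA_SIZE)
                    return -EINVAL;

            /* Allocate a full-size VQ message so the copy in vq_send_wr()
             * stays within bounds (assumes req_vq is already set up). */
            wr = kzalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
            if (!wr)
                    return -ENOMEM;

            c2_wr_set_id(wr, CCWR_QP_CONNECT);
            wr->rnic_handle = rnic_handle;
            wr->qp_handle = qp_handle;
            wr->remote_addr = remote_addr;          /* already __be32 */
            wr->remote_port = remote_port;          /* already __be16 */
            wr->private_data_length = cpu_to_be32(pdata_len);
            memcpy(wr->private_data, pdata, pdata_len);

            err = vq_send_wr(c2dev, (union c2wr *) wr);
            kfree(wr);
            return err;
    }
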
+
+/*
+ *------------------------ MM ------------------------
+ */
+
+struct c2wr_nsmr_stag_alloc_req {
+       struct c2wr_hdr hdr;
+       u32 rnic_handle;
+       u32 pbl_depth;
+       u32 pd_id;
+       u32 flags;
+} __attribute__((packed)) ;
+
+struct c2wr_nsmr_stag_alloc_rep {
+       struct c2wr_hdr hdr;
+       u32 pbl_depth;
+       u32 stag_index;
+} __attribute__((packed)) ;
+
+union c2wr_nsmr_stag_alloc {
+       struct c2wr_nsmr_stag_alloc_req req;
+       struct c2wr_nsmr_stag_alloc_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_nsmr_register_req {
+       struct c2wr_hdr hdr;
+       __be64 va;
+       u32 rnic_handle;
+       __be16 flags;
+       u8 stag_key;
+       u8 pad;
+       u32 pd_id;
+       __be32 pbl_depth;
+       __be32 pbe_size;
+       __be32 fbo;
+       __be32 length;
+       __be32 addrs_length;
+       /* array of paddrs (must be aligned on a 64bit boundary) */
+       __be64 paddrs[0];
+} __attribute__((packed)) ;
+
+struct c2wr_nsmr_register_rep {
+       struct c2wr_hdr hdr;
+       u32 pbl_depth;
+       __be32 stag_index;
+} __attribute__((packed)) ;
+
+union c2wr_nsmr_register {
+       struct c2wr_nsmr_register_req req;
+       struct c2wr_nsmr_register_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_nsmr_pbl_req {
+       struct c2wr_hdr hdr;
+       u32 rnic_handle;
+       __be32 flags;
+       __be32 stag_index;
+       __be32 addrs_length;
+       /* array of paddrs (must be aligned on a 64bit boundary) */
+       __be64 paddrs[0];
+} __attribute__((packed)) ;
+
+struct c2wr_nsmr_pbl_rep {
+       struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_nsmr_pbl {
+       struct c2wr_nsmr_pbl_req req;
+       struct c2wr_nsmr_pbl_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_mr_query_req {
+       struct c2wr_hdr hdr;
+       u32 rnic_handle;
+       u32 stag_index;
+} __attribute__((packed)) ;
+
+struct c2wr_mr_query_rep {
+       struct c2wr_hdr hdr;
+       u8 stag_key;
+       u8 pad[3];
+       u32 pd_id;
+       u32 flags;
+       u32 pbl_depth;
+} __attribute__((packed)) ;
+
+union c2wr_mr_query {
+       struct c2wr_mr_query_req req;
+       struct c2wr_mr_query_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_mw_query_req {
+       struct c2wr_hdr hdr;
+       u32 rnic_handle;
+       u32 stag_index;
+} __attribute__((packed)) ;
+
+struct c2wr_mw_query_rep {
+       struct c2wr_hdr hdr;
+       u8 stag_key;
+       u8 pad[3];
+       u32 pd_id;
+       u32 flags;
+} __attribute__((packed)) ;
+
+union c2wr_mw_query {
+       struct c2wr_mw_query_req req;
+       struct c2wr_mw_query_rep rep;
+} __attribute__((packed)) ;
+
+
+struct c2wr_stag_dealloc_req {
+       struct c2wr_hdr hdr;
+       u32 rnic_handle;
+       __be32 stag_index;
+} __attribute__((packed)) ;
+
+struct c2wr_stag_dealloc_rep {
+       struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_stag_dealloc {
+       struct c2wr_stag_dealloc_req req;
+       struct c2wr_stag_dealloc_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_nsmr_reregister_req {
+       struct c2wr_hdr hdr;
+       u64 va;
+       u32 rnic_handle;
+       u16 flags;
+       u8 stag_key;
+       u8 pad;
+       u32 stag_index;
+       u32 pd_id;
+       u32 pbl_depth;
+       u32 pbe_size;
+       u32 fbo;
+       u32 length;
+       u32 addrs_length;
+       u32 pad1;
+       /* array of paddrs (must be aligned on a 64bit boundary) */
+       u64 paddrs[0];
+} __attribute__((packed)) ;
+
+struct c2wr_nsmr_reregister_rep {
+       struct c2wr_hdr hdr;
+       u32 pbl_depth;
+       u32 stag_index;
+} __attribute__((packed)) ;
+
+union c2wr_nsmr_reregister {
+       struct c2wr_nsmr_reregister_req req;
+       struct c2wr_nsmr_reregister_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_smr_register_req {
+       struct c2wr_hdr hdr;
+       u64 va;
+       u32 rnic_handle;
+       u16 flags;
+       u8 stag_key;
+       u8 pad;
+       u32 stag_index;
+       u32 pd_id;
+} __attribute__((packed)) ;
+
+struct c2wr_smr_register_rep {
+       struct c2wr_hdr hdr;
+       u32 stag_index;
+} __attribute__((packed)) ;
+
+union c2wr_smr_register {
+       struct c2wr_smr_register_req req;
+       struct c2wr_smr_register_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_mw_alloc_req {
+       struct c2wr_hdr hdr;
+       u32 rnic_handle;
+       u32 pd_id;
+} __attribute__((packed)) ;
+
+struct c2wr_mw_alloc_rep {
+       struct c2wr_hdr hdr;
+       u32 stag_index;
+} __attribute__((packed)) ;
+
+union c2wr_mw_alloc {
+       struct c2wr_mw_alloc_req req;
+       struct c2wr_mw_alloc_rep rep;
+} __attribute__((packed)) ;
+
+/*
+ *------------------------ WRs -----------------------
+ */
+
+struct c2wr_user_hdr {
+       struct c2wr_hdr hdr;            /* Has status and WR Type */
+} __attribute__((packed)) ;
+
+enum c2_qp_state {
+       C2_QP_STATE_IDLE = 0x01,
+       C2_QP_STATE_CONNECTING = 0x02,
+       C2_QP_STATE_RTS = 0x04,
+       C2_QP_STATE_CLOSING = 0x08,
+       C2_QP_STATE_TERMINATE = 0x10,
+       C2_QP_STATE_ERROR = 0x20,
+};
+
+/* Completion queue entry. */
+struct c2wr_ce {
+       struct c2wr_hdr hdr;            /* Has status and WR Type */
+       u64 qp_user_context;    /* c2_user_qp_t * */
+       u32 qp_state;           /* Current QP State */
+       u32 handle;             /* QPID or EP Handle */
+       __be32 bytes_rcvd;              /* valid for RECV WCs */
+       u32 stag;
+} __attribute__((packed)) ;
+
+
+/*
+ * Flags used for all post-sq WRs.  These must fit in the flags
+ * field of the struct c2wr_hdr (eight bits).
+ */
+enum {
+       SQ_SIGNALED = 0x01,
+       SQ_READ_FENCE = 0x02,
+       SQ_FENCE = 0x04,
+};
+
+/*
+ * Common fields for all post-sq WRs.  Namely the standard header and a
+ * secondary header with fields common to all post-sq WRs.
+ */
+struct c2_sq_hdr {
+       struct c2wr_user_hdr user_hdr;
+} __attribute__((packed));
+
+/*
+ * Same as above but for post-rq WRs.
+ */
+struct c2_rq_hdr {
+       struct c2wr_user_hdr user_hdr;
+} __attribute__((packed));
+
+/*
+ * use the same struct for all sends.
+ */
+struct c2wr_send_req {
+       struct c2_sq_hdr sq_hdr;
+       __be32 sge_len;
+       __be32 remote_stag;
+       u8 data[0];             /* SGE array */
+} __attribute__((packed));
+
+union c2wr_send {
+       struct c2wr_send_req req;
+       struct c2wr_ce rep;
+} __attribute__((packed));
+
+struct c2wr_rdma_write_req {
+       struct c2_sq_hdr sq_hdr;
+       __be64 remote_to;
+       __be32 remote_stag;
+       __be32 sge_len;
+       u8 data[0];             /* SGE array */
+} __attribute__((packed));
+
+union c2wr_rdma_write {
+       struct c2wr_rdma_write_req req;
+       struct c2wr_ce rep;
+} __attribute__((packed));
+
+struct c2wr_rdma_read_req {
+       struct c2_sq_hdr sq_hdr;
+       __be64 local_to;
+       __be64 remote_to;
+       __be32 local_stag;
+       __be32 remote_stag;
+       __be32 length;
+} __attribute__((packed));
+
+union c2wr_rdma_read {
+       struct c2wr_rdma_read_req req;
+       struct c2wr_ce rep;
+} __attribute__((packed));
+
+struct c2wr_mw_bind_req {
+       struct c2_sq_hdr sq_hdr;
+       u64 va;
+       u8 stag_key;
+       u8 pad[3];
+       u32 mw_stag_index;
+       u32 mr_stag_index;
+       u32 length;
+       u32 flags;
+} __attribute__((packed));
+
+union c2wr_mw_bind {
+       struct c2wr_mw_bind_req req;
+       struct c2wr_ce rep;
+} __attribute__((packed));
+
+struct c2wr_nsmr_fastreg_req {
+       struct c2_sq_hdr sq_hdr;
+       u64 va;
+       u8 stag_key;
+       u8 pad[3];
+       u32 stag_index;
+       u32 pbe_size;
+       u32 fbo;
+       u32 length;
+       u32 addrs_length;
+       /* array of paddrs (must be aligned on a 64bit boundary) */
+       u64 paddrs[0];
+} __attribute__((packed));
+
+union c2wr_nsmr_fastreg {
+       struct c2wr_nsmr_fastreg_req req;
+       struct c2wr_ce rep;
+} __attribute__((packed));
+
+struct c2wr_stag_invalidate_req {
+       struct c2_sq_hdr sq_hdr;
+       u8 stag_key;
+       u8 pad[3];
+       u32 stag_index;
+} __attribute__((packed));
+
+union c2wr_stag_invalidate {
+       struct c2wr_stag_invalidate_req req;
+       struct c2wr_ce rep;
+} __attribute__((packed));
+
+union c2wr_sqwr {
+       struct c2_sq_hdr sq_hdr;
+       struct c2wr_send_req send;
+       struct c2wr_send_req send_se;
+       struct c2wr_send_req send_inv;
+       struct c2wr_send_req send_se_inv;
+       struct c2wr_rdma_write_req rdma_write;
+       struct c2wr_rdma_read_req rdma_read;
+       struct c2wr_mw_bind_req mw_bind;
+       struct c2wr_nsmr_fastreg_req nsmr_fastreg;
+       struct c2wr_stag_invalidate_req stag_inv;
+} __attribute__((packed));
+
+
+/*
+ * RQ WRs
+ */
+struct c2wr_rqwr {
+       struct c2_rq_hdr rq_hdr;
+       u8 data[0];             /* array of SGEs */
+} __attribute__((packed));
+
+union c2wr_recv {
+       struct c2wr_rqwr req;
+       struct c2wr_ce rep;
+} __attribute__((packed));
+
+/*
+ * All AEs start with this header.  Most AEs only need to convey the
+ * information in the header.  Some, like LLP connection events, need
+ * more info.  The union typedef c2wr_ae_t has all the possible AEs.
+ *
+ * hdr.context is the user_context from the rnic_open WR.  NULL if this
+ * is not affiliated with an rnic
+ *
+ * hdr.id is the AE identifier (eg;  CCAE_REMOTE_SHUTDOWN,
+ * CCAE_LLP_CLOSE_COMPLETE)
+ *
+ * resource_type is one of:  C2_RES_IND_QP, C2_RES_IND_CQ, C2_RES_IND_SRQ
+ *
+ * user_context is the context passed down when the host created the resource.
+ */
+struct c2wr_ae_hdr {
+       struct c2wr_hdr hdr;
+       u64 user_context;       /* user context for this res. */
+       __be32 resource_type;   /* see enum c2_resource_indicator */
+       __be32 resource;        /* handle for resource */
+       __be32 qp_state;        /* current QP State */
+} __attribute__((packed));
+
+/*
+ * After submitting the CCAE_ACTIVE_CONNECT_RESULTS message on the AEQ,
+ * the adapter moves the QP into RTS state
+ */
+struct c2wr_ae_active_connect_results {
+       struct c2wr_ae_hdr ae_hdr;
+       __be32 laddr;
+       __be32 raddr;
+       __be16 lport;
+       __be16 rport;
+       __be32 private_data_length;
+       u8 private_data[0];     /* data is in-line in the msg. */
+} __attribute__((packed));
+
+/*
+ * When connections are established by the stack (and the private data
+ * MPA frame is received), the adapter will generate an event to the host.
+ * The details of the connection, any private data, and the new connection
+ * request handle is passed up via the CCAE_CONNECTION_REQUEST msg on the
+ * AE queue:
+ */
+struct c2wr_ae_connection_request {
+       struct c2wr_ae_hdr ae_hdr;
+       u32 cr_handle;          /* connreq handle (sock ptr) */
+       __be32 laddr;
+       __be32 raddr;
+       __be16 lport;
+       __be16 rport;
+       __be32 private_data_length;
+       u8 private_data[0];     /* data is in-line in the msg. */
+} __attribute__((packed));
+
+union c2wr_ae {
+       struct c2wr_ae_hdr ae_generic;
+       struct c2wr_ae_active_connect_results ae_active_connect_results;
+       struct c2wr_ae_connection_request ae_connection_request;
+} __attribute__((packed));
+
+struct c2wr_init_req {
+       struct c2wr_hdr hdr;
+       __be64 hint_count;
+       __be64 q0_host_shared;
+       __be64 q1_host_shared;
+       __be64 q1_host_msg_pool;
+       __be64 q2_host_shared;
+       __be64 q2_host_msg_pool;
+} __attribute__((packed));
+
+struct c2wr_init_rep {
+       struct c2wr_hdr hdr;
+} __attribute__((packed));
+
+union c2wr_init {
+       struct c2wr_init_req req;
+       struct c2wr_init_rep rep;
+} __attribute__((packed));
+
+/*
+ * For upgrading flash.
+ */
+
+struct c2wr_flash_init_req {
+       struct c2wr_hdr hdr;
+       u32 rnic_handle;
+} __attribute__((packed));
+
+struct c2wr_flash_init_rep {
+       struct c2wr_hdr hdr;
+       u32 adapter_flash_buf_offset;
+       u32 adapter_flash_len;
+} __attribute__((packed));
+
+union c2wr_flash_init {
+       struct c2wr_flash_init_req req;
+       struct c2wr_flash_init_rep rep;
+} __attribute__((packed));
+
+struct c2wr_flash_req {
+       struct c2wr_hdr hdr;
+       u32 rnic_handle;
+       u32 len;
+} __attribute__((packed));
+
+struct c2wr_flash_rep {
+       struct c2wr_hdr hdr;
+       u32 status;
+} __attribute__((packed));
+
+union c2wr_flash {
+       struct c2wr_flash_req req;
+       struct c2wr_flash_rep rep;
+} __attribute__((packed));
+
+struct c2wr_buf_alloc_req {
+       struct c2wr_hdr hdr;
+       u32 rnic_handle;
+       u32 size;
+} __attribute__((packed));
+
+struct c2wr_buf_alloc_rep {
+       struct c2wr_hdr hdr;
+       u32 offset;             /* 0 if mem not available */
+       u32 size;               /* 0 if mem not available */
+} __attribute__((packed));
+
+union c2wr_buf_alloc {
+       struct c2wr_buf_alloc_req req;
+       struct c2wr_buf_alloc_rep rep;
+} __attribute__((packed));
+
+struct c2wr_buf_free_req {
+       struct c2wr_hdr hdr;
+       u32 rnic_handle;
+       u32 offset;             /* Must match value from alloc */
+       u32 size;               /* Must match value from alloc */
+} __attribute__((packed));
+
+struct c2wr_buf_free_rep {
+       struct c2wr_hdr hdr;
+} __attribute__((packed));
+
+union c2wr_buf_free {
+       struct c2wr_buf_free_req req;
+       struct c2wr_ce rep;
+} __attribute__((packed));
+
+struct c2wr_flash_write_req {
+       struct c2wr_hdr hdr;
+       u32 rnic_handle;
+       u32 offset;
+       u32 size;
+       u32 type;
+       u32 flags;
+} __attribute__((packed));
+
+struct c2wr_flash_write_rep {
+       struct c2wr_hdr hdr;
+       u32 status;
+} __attribute__((packed));
+
+union c2wr_flash_write {
+       struct c2wr_flash_write_req req;
+       struct c2wr_flash_write_rep rep;
+} __attribute__((packed));
+
+/*
+ * Messages for LLP connection setup.
+ */
+
+/*
+ * Listen Request.  This allocates a listening endpoint to allow passive
+ * connection setup.  Newly established LLP connections are passed up
+ * via an AE.  See c2wr_ae_connection_request_t
+ */
+struct c2wr_ep_listen_create_req {
+       struct c2wr_hdr hdr;
+       u64 user_context;       /* returned in AEs. */
+       u32 rnic_handle;
+       __be32 local_addr;              /* local addr, or 0  */
+       __be16 local_port;              /* 0 means "pick one" */
+       u16 pad;
+       __be32 backlog;         /* traditional tcp listen backlog */
+} __attribute__((packed));
+
+struct c2wr_ep_listen_create_rep {
+       struct c2wr_hdr hdr;
+       u32 ep_handle;          /* handle to new listening ep */
+       u16 local_port;         /* resulting port... */
+       u16 pad;
+} __attribute__((packed));
+
+union c2wr_ep_listen_create {
+       struct c2wr_ep_listen_create_req req;
+       struct c2wr_ep_listen_create_rep rep;
+} __attribute__((packed));
+
+struct c2wr_ep_listen_destroy_req {
+       struct c2wr_hdr hdr;
+       u32 rnic_handle;
+       u32 ep_handle;
+} __attribute__((packed));
+
+struct c2wr_ep_listen_destroy_rep {
+       struct c2wr_hdr hdr;
+} __attribute__((packed));
+
+union c2wr_ep_listen_destroy {
+       struct c2wr_ep_listen_destroy_req req;
+       struct c2wr_ep_listen_destroy_rep rep;
+} __attribute__((packed));
+
+struct c2wr_ep_query_req {
+       struct c2wr_hdr hdr;
+       u32 rnic_handle;
+       u32 ep_handle;
+} __attribute__((packed));
+
+struct c2wr_ep_query_rep {
+       struct c2wr_hdr hdr;
+       u32 rnic_handle;
+       u32 local_addr;
+       u32 remote_addr;
+       u16 local_port;
+       u16 remote_port;
+} __attribute__((packed));
+
+union c2wr_ep_query {
+       struct c2wr_ep_query_req req;
+       struct c2wr_ep_query_rep rep;
+} __attribute__((packed));
+
+
+/*
+ * The host passes this down to indicate acceptance of a pending iWARP
+ * connection.  The cr_handle was obtained from the CONNECTION_REQUEST
+ * AE passed up by the adapter.  See c2wr_ae_connection_request_t.
+ */
+struct c2wr_cr_accept_req {
+       struct c2wr_hdr hdr;
+       u32 rnic_handle;
+       u32 qp_handle;          /* QP to bind to this LLP conn */
+       u32 ep_handle;          /* LLP  handle to accept */
+       __be32 private_data_length;
+       u8 private_data[0];     /* data in-line in msg. */
+} __attribute__((packed));
+
+/*
+ * adapter sends reply when private data is successfully submitted to
+ * the LLP.
+ */
+struct c2wr_cr_accept_rep {
+       struct c2wr_hdr hdr;
+} __attribute__((packed));
+
+union c2wr_cr_accept {
+       struct c2wr_cr_accept_req req;
+       struct c2wr_cr_accept_rep rep;
+} __attribute__((packed));
+
+/*
+ * The host sends this down if a given iWARP connection request was
+ * rejected by the consumer.  The cr_handle was obtained from a
+ * previous c2wr_ae_connection_request_t AE sent by the adapter.
+ */
+struct  c2wr_cr_reject_req {
+       struct c2wr_hdr hdr;
+       u32 rnic_handle;
+       u32 ep_handle;          /* LLP handle to reject */
+} __attribute__((packed));
+
+/*
+ * Dunno if this is needed, but we'll add it for now.  The adapter will
+ * send the reject_reply after the LLP endpoint has been destroyed.
+ */
+struct  c2wr_cr_reject_rep {
+       struct c2wr_hdr hdr;
+} __attribute__((packed));
+
+union c2wr_cr_reject {
+       struct c2wr_cr_reject_req req;
+       struct c2wr_cr_reject_rep rep;
+} __attribute__((packed));
+
+/*
+ * console command.  Used to implement a debug console over the verbs
+ * request and reply queues.
+ */
+
+/*
+ * Console request message.  It contains:
+ *     - message hdr with id = CCWR_CONSOLE
+ *     - the physaddr/len of host memory to be used for the reply.
+ *     - the command string.  eg:  "netstat -s" or "zoneinfo"
+ */
+struct c2wr_console_req {
+       struct c2wr_hdr hdr;            /* id = CCWR_CONSOLE */
+       u64 reply_buf;          /* pinned host buf for reply */
+       u32 reply_buf_len;      /* length of reply buffer */
+       u8 command[0];          /* NUL terminated ascii string */
+       /* containing the command req */
+} __attribute__((packed));
+
+/*
+ * flags used in the console reply.
+ */
+enum c2_console_flags {
+       CONS_REPLY_TRUNCATED = 0x00000001       /* reply was truncated */
+} __attribute__((packed));
+
+/*
+ * Console reply message.
+ * hdr.result contains the c2_status_t error if the reply was _not_ generated,
+ * or C2_OK if the reply was generated.
+ */
+struct c2wr_console_rep {
+       struct c2wr_hdr hdr;            /* id = CCWR_CONSOLE */
+       u32 flags;
+} __attribute__((packed));
+
+union c2wr_console {
+       struct c2wr_console_req req;
+       struct c2wr_console_rep rep;
+} __attribute__((packed));
+
+
+/*
+ * Giant union with all WRs.  Makes life easier...
+ */
+union c2wr {
+       struct c2wr_hdr hdr;
+       struct c2wr_user_hdr user_hdr;
+       union c2wr_rnic_open rnic_open;
+       union c2wr_rnic_query rnic_query;
+       union c2wr_rnic_getconfig rnic_getconfig;
+       union c2wr_rnic_setconfig rnic_setconfig;
+       union c2wr_rnic_close rnic_close;
+       union c2wr_cq_create cq_create;
+       union c2wr_cq_modify cq_modify;
+       union c2wr_cq_destroy cq_destroy;
+       union c2wr_pd_alloc pd_alloc;
+       union c2wr_pd_dealloc pd_dealloc;
+       union c2wr_srq_create srq_create;
+       union c2wr_srq_destroy srq_destroy;
+       union c2wr_qp_create qp_create;
+       union c2wr_qp_query qp_query;
+       union c2wr_qp_modify qp_modify;
+       union c2wr_qp_destroy qp_destroy;
+       struct c2wr_qp_connect qp_connect;
+       union c2wr_nsmr_stag_alloc nsmr_stag_alloc;
+       union c2wr_nsmr_register nsmr_register;
+       union c2wr_nsmr_pbl nsmr_pbl;
+       union c2wr_mr_query mr_query;
+       union c2wr_mw_query mw_query;
+       union c2wr_stag_dealloc stag_dealloc;
+       union c2wr_sqwr sqwr;
+       struct c2wr_rqwr rqwr;
+       struct c2wr_ce ce;
+       union c2wr_ae ae;
+       union c2wr_init init;
+       union c2wr_ep_listen_create ep_listen_create;
+       union c2wr_ep_listen_destroy ep_listen_destroy;
+       union c2wr_cr_accept cr_accept;
+       union c2wr_cr_reject cr_reject;
+       union c2wr_console console;
+       union c2wr_flash_init flash_init;
+       union c2wr_flash flash;
+       union c2wr_buf_alloc buf_alloc;
+       union c2wr_buf_free buf_free;
+       union c2wr_flash_write flash_write;
+} __attribute__((packed));
+
+
+/*
+ * Accessors for the wr fields that are packed together tightly to
+ * reduce the wr message size.  The wr arguments are void* so that
+ * either a union c2wr *, a struct c2wr_hdr *, or a pointer to any of the
+ * types in the union c2wr can be passed in.
+ */
+static __inline__ u8 c2_wr_get_id(void *wr)
+{
+       return ((struct c2wr_hdr *) wr)->id;
+}
+static __inline__ void c2_wr_set_id(void *wr, u8 id)
+{
+       ((struct c2wr_hdr *) wr)->id = id;
+}
+static __inline__ u8 c2_wr_get_result(void *wr)
+{
+       return ((struct c2wr_hdr *) wr)->result;
+}
+static __inline__ void c2_wr_set_result(void *wr, u8 result)
+{
+       ((struct c2wr_hdr *) wr)->result = result;
+}
+static __inline__ u8 c2_wr_get_flags(void *wr)
+{
+       return ((struct c2wr_hdr *) wr)->flags;
+}
+static __inline__ void c2_wr_set_flags(void *wr, u8 flags)
+{
+       ((struct c2wr_hdr *) wr)->flags = flags;
+}
+static __inline__ u8 c2_wr_get_sge_count(void *wr)
+{
+       return ((struct c2wr_hdr *) wr)->sge_count;
+}
+static __inline__ void c2_wr_set_sge_count(void *wr, u8 sge_count)
+{
+       ((struct c2wr_hdr *) wr)->sge_count = sge_count;
+}
+static __inline__ __be32 c2_wr_get_wqe_count(void *wr)
+{
+       return ((struct c2wr_hdr *) wr)->wqe_count;
+}
+static __inline__ void c2_wr_set_wqe_count(void *wr, u32 wqe_count)
+{
+       ((struct c2wr_hdr *) wr)->wqe_count = wqe_count;
+}
+
+#endif                         /* _C2_WR_H_ */
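
Note on the accessors above: every work request begins with a struct c2wr_hdr, so a
void * pointing at any WR type can be cast to the header type to read or write the
shared id/result/flags fields. The standalone sketch below illustrates the same
cast-through-a-common-header pattern; it uses stdint types and hypothetical names
(struct hdr, struct listen_req, wr_set_id) rather than the driver's own, purely for
illustration.

#include <stdint.h>
#include <stdio.h>

/* stand-in for struct c2wr_hdr: only the fields used below */
struct hdr {
        uint8_t id;
        uint8_t result;
} __attribute__((packed));

/* stand-in for one concrete WR type that starts with the common header */
struct listen_req {
        struct hdr hdr;
        uint32_t rnic_handle;
} __attribute__((packed));

/* same shape as c2_wr_set_id()/c2_wr_get_id() above */
static inline void wr_set_id(void *wr, uint8_t id)
{
        ((struct hdr *) wr)->id = id;
}

static inline uint8_t wr_get_id(void *wr)
{
        return ((struct hdr *) wr)->id;
}

int main(void)
{
        struct listen_req req = { { 0, 0 }, 0 };

        wr_set_id(&req, 42);            /* works on any WR type */
        printf("id=%d\n", wr_get_id(&req));
        return 0;
}
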
diff --git a/drivers/staging/rdma/ehca/Kconfig b/drivers/staging/rdma/ehca/Kconfig
new file mode 100644 (file)
index 0000000..3fadd2a
--- /dev/null
@@ -0,0 +1,10 @@
+config INFINIBAND_EHCA
+       tristate "eHCA support"
+       depends on IBMEBUS
+       ---help---
+       This driver supports the deprecated IBM pSeries eHCA InfiniBand
+       adapter.
+
+       To compile the driver as a module, choose M here. The module
+       will be called ib_ehca.
+
diff --git a/drivers/staging/rdma/ehca/Makefile b/drivers/staging/rdma/ehca/Makefile
new file mode 100644 (file)
index 0000000..74d284e
--- /dev/null
@@ -0,0 +1,16 @@
+#  Authors: Heiko J Schick <schickhj@de.ibm.com>
+#           Christoph Raisch <raisch@de.ibm.com>
+#           Joachim Fenkes <fenkes@de.ibm.com>
+#
+#  Copyright (c) 2005 IBM Corporation
+#
+#  All rights reserved.
+#
+#  This source code is distributed under a dual license of GPL v2.0 and OpenIB BSD.
+
+obj-$(CONFIG_INFINIBAND_EHCA) += ib_ehca.o
+
+ib_ehca-objs  = ehca_main.o ehca_hca.o ehca_mcast.o ehca_pd.o ehca_av.o ehca_eq.o \
+               ehca_cq.o ehca_qp.o ehca_sqp.o ehca_mrmw.o ehca_reqs.o ehca_irq.o \
+               ehca_uverbs.o ipz_pt_fn.o hcp_if.o hcp_phyp.o
+
diff --git a/drivers/staging/rdma/ehca/TODO b/drivers/staging/rdma/ehca/TODO
new file mode 100644 (file)
index 0000000..199a4a6
--- /dev/null
@@ -0,0 +1,4 @@
+9/2015
+
+The ehca driver has been deprecated and moved to drivers/staging/rdma.
+It will be removed in the 4.6 merge window.
diff --git a/drivers/staging/rdma/ehca/ehca_av.c b/drivers/staging/rdma/ehca/ehca_av.c
new file mode 100644 (file)
index 0000000..4659263
--- /dev/null
@@ -0,0 +1,277 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  address vector functions
+ *
+ *  Authors: Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *           Khadija Souissi <souissik@de.ibm.com>
+ *           Reinhard Ernst <rernst@de.ibm.com>
+ *           Christoph Raisch <raisch@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/slab.h>
+
+#include "ehca_tools.h"
+#include "ehca_iverbs.h"
+#include "hcp_if.h"
+
+static struct kmem_cache *av_cache;
+
+int ehca_calc_ipd(struct ehca_shca *shca, int port,
+                 enum ib_rate path_rate, u32 *ipd)
+{
+       int path = ib_rate_to_mult(path_rate);
+       int link, ret;
+       struct ib_port_attr pa;
+
+       if (path_rate == IB_RATE_PORT_CURRENT) {
+               *ipd = 0;
+               return 0;
+       }
+
+       if (unlikely(path < 0)) {
+               ehca_err(&shca->ib_device, "Invalid static rate! path_rate=%x",
+                        path_rate);
+               return -EINVAL;
+       }
+
+       ret = ehca_query_port(&shca->ib_device, port, &pa);
+       if (unlikely(ret < 0)) {
+               ehca_err(&shca->ib_device, "Failed to query port  ret=%i", ret);
+               return ret;
+       }
+
+       link = ib_width_enum_to_int(pa.active_width) * pa.active_speed;
+
+       if (path >= link)
+               /* no need to throttle if path faster than link */
+               *ipd = 0;
+       else
+               /* IPD = round((link / path) - 1) */
+               *ipd = ((link + (path >> 1)) / path) - 1;
+
+       return 0;
+}
+
+struct ib_ah *ehca_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
+{
+       int ret;
+       struct ehca_av *av;
+       struct ehca_shca *shca = container_of(pd->device, struct ehca_shca,
+                                             ib_device);
+
+       av = kmem_cache_alloc(av_cache, GFP_KERNEL);
+       if (!av) {
+               ehca_err(pd->device, "Out of memory pd=%p ah_attr=%p",
+                        pd, ah_attr);
+               return ERR_PTR(-ENOMEM);
+       }
+
+       av->av.sl = ah_attr->sl;
+       av->av.dlid = ah_attr->dlid;
+       av->av.slid_path_bits = ah_attr->src_path_bits;
+
+       if (ehca_static_rate < 0) {
+               u32 ipd;
+               if (ehca_calc_ipd(shca, ah_attr->port_num,
+                                 ah_attr->static_rate, &ipd)) {
+                       ret = -EINVAL;
+                       goto create_ah_exit1;
+               }
+               av->av.ipd = ipd;
+       } else
+               av->av.ipd = ehca_static_rate;
+
+       av->av.lnh = ah_attr->ah_flags;
+       av->av.grh.word_0 = EHCA_BMASK_SET(GRH_IPVERSION_MASK, 6);
+       av->av.grh.word_0 |= EHCA_BMASK_SET(GRH_TCLASS_MASK,
+                                           ah_attr->grh.traffic_class);
+       av->av.grh.word_0 |= EHCA_BMASK_SET(GRH_FLOWLABEL_MASK,
+                                           ah_attr->grh.flow_label);
+       av->av.grh.word_0 |= EHCA_BMASK_SET(GRH_HOPLIMIT_MASK,
+                                           ah_attr->grh.hop_limit);
+       av->av.grh.word_0 |= EHCA_BMASK_SET(GRH_NEXTHEADER_MASK, 0x1B);
+       /* set sgid in grh.word_1 */
+       if (ah_attr->ah_flags & IB_AH_GRH) {
+               int rc;
+               struct ib_port_attr port_attr;
+               union ib_gid gid;
+               memset(&port_attr, 0, sizeof(port_attr));
+               rc = ehca_query_port(pd->device, ah_attr->port_num,
+                                    &port_attr);
+               if (rc) { /* invalid port number */
+                       ret = -EINVAL;
+                       ehca_err(pd->device, "Invalid port number "
+                                "ehca_query_port() returned %x "
+                                "pd=%p ah_attr=%p", rc, pd, ah_attr);
+                       goto create_ah_exit1;
+               }
+               memset(&gid, 0, sizeof(gid));
+               rc = ehca_query_gid(pd->device,
+                                   ah_attr->port_num,
+                                   ah_attr->grh.sgid_index, &gid);
+               if (rc) {
+                       ret = -EINVAL;
+                       ehca_err(pd->device, "Failed to retrieve sgid "
+                                "ehca_query_gid() returned %x "
+                                "pd=%p ah_attr=%p", rc, pd, ah_attr);
+                       goto create_ah_exit1;
+               }
+               memcpy(&av->av.grh.word_1, &gid, sizeof(gid));
+       }
+       av->av.pmtu = shca->max_mtu;
+
+       /* dgid comes in grh.word_3 */
+       memcpy(&av->av.grh.word_3, &ah_attr->grh.dgid,
+              sizeof(ah_attr->grh.dgid));
+
+       return &av->ib_ah;
+
+create_ah_exit1:
+       kmem_cache_free(av_cache, av);
+
+       return ERR_PTR(ret);
+}
+
+int ehca_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr)
+{
+       struct ehca_av *av;
+       struct ehca_ud_av new_ehca_av;
+       struct ehca_shca *shca = container_of(ah->pd->device, struct ehca_shca,
+                                             ib_device);
+
+       memset(&new_ehca_av, 0, sizeof(new_ehca_av));
+       new_ehca_av.sl = ah_attr->sl;
+       new_ehca_av.dlid = ah_attr->dlid;
+       new_ehca_av.slid_path_bits = ah_attr->src_path_bits;
+       new_ehca_av.ipd = ah_attr->static_rate;
+       new_ehca_av.lnh = EHCA_BMASK_SET(GRH_FLAG_MASK,
+                                        (ah_attr->ah_flags & IB_AH_GRH) > 0);
+       new_ehca_av.grh.word_0 = EHCA_BMASK_SET(GRH_TCLASS_MASK,
+                                               ah_attr->grh.traffic_class);
+       new_ehca_av.grh.word_0 |= EHCA_BMASK_SET(GRH_FLOWLABEL_MASK,
+                                                ah_attr->grh.flow_label);
+       new_ehca_av.grh.word_0 |= EHCA_BMASK_SET(GRH_HOPLIMIT_MASK,
+                                                ah_attr->grh.hop_limit);
+       new_ehca_av.grh.word_0 |= EHCA_BMASK_SET(GRH_NEXTHEADER_MASK, 0x1b);
+
+       /* set sgid in grh.word_1 */
+       if (ah_attr->ah_flags & IB_AH_GRH) {
+               int rc;
+               struct ib_port_attr port_attr;
+               union ib_gid gid;
+               memset(&port_attr, 0, sizeof(port_attr));
+               rc = ehca_query_port(ah->device, ah_attr->port_num,
+                                    &port_attr);
+               if (rc) { /* invalid port number */
+                       ehca_err(ah->device, "Invalid port number "
+                                "ehca_query_port() returned %x "
+                                "ah=%p ah_attr=%p port_num=%x",
+                                rc, ah, ah_attr, ah_attr->port_num);
+                       return -EINVAL;
+               }
+               memset(&gid, 0, sizeof(gid));
+               rc = ehca_query_gid(ah->device,
+                                   ah_attr->port_num,
+                                   ah_attr->grh.sgid_index, &gid);
+               if (rc) {
+                       ehca_err(ah->device, "Failed to retrieve sgid "
+                                "ehca_query_gid() returned %x "
+                                "ah=%p ah_attr=%p port_num=%x "
+                                "sgid_index=%x",
+                                rc, ah, ah_attr, ah_attr->port_num,
+                                ah_attr->grh.sgid_index);
+                       return -EINVAL;
+               }
+               memcpy(&new_ehca_av.grh.word_1, &gid, sizeof(gid));
+       }
+
+       new_ehca_av.pmtu = shca->max_mtu;
+
+       memcpy(&new_ehca_av.grh.word_3, &ah_attr->grh.dgid,
+              sizeof(ah_attr->grh.dgid));
+
+       av = container_of(ah, struct ehca_av, ib_ah);
+       av->av = new_ehca_av;
+
+       return 0;
+}
+
+int ehca_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr)
+{
+       struct ehca_av *av = container_of(ah, struct ehca_av, ib_ah);
+
+       memcpy(&ah_attr->grh.dgid, &av->av.grh.word_3,
+              sizeof(ah_attr->grh.dgid));
+       ah_attr->sl = av->av.sl;
+
+       ah_attr->dlid = av->av.dlid;
+
+       ah_attr->src_path_bits = av->av.slid_path_bits;
+       ah_attr->static_rate = av->av.ipd;
+       ah_attr->ah_flags = EHCA_BMASK_GET(GRH_FLAG_MASK, av->av.lnh);
+       ah_attr->grh.traffic_class = EHCA_BMASK_GET(GRH_TCLASS_MASK,
+                                                   av->av.grh.word_0);
+       ah_attr->grh.hop_limit = EHCA_BMASK_GET(GRH_HOPLIMIT_MASK,
+                                               av->av.grh.word_0);
+       ah_attr->grh.flow_label = EHCA_BMASK_GET(GRH_FLOWLABEL_MASK,
+                                                av->av.grh.word_0);
+
+       return 0;
+}
+
+int ehca_destroy_ah(struct ib_ah *ah)
+{
+       kmem_cache_free(av_cache, container_of(ah, struct ehca_av, ib_ah));
+
+       return 0;
+}
+
+int ehca_init_av_cache(void)
+{
+       av_cache = kmem_cache_create("ehca_cache_av",
+                                  sizeof(struct ehca_av), 0,
+                                  SLAB_HWCACHE_ALIGN,
+                                  NULL);
+       if (!av_cache)
+               return -ENOMEM;
+       return 0;
+}
+
+void ehca_cleanup_av_cache(void)
+{
+       if (av_cache)
+               kmem_cache_destroy(av_cache);
+}
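
A note on ehca_calc_ipd() above: the inter-packet delay is derived from the ratio of
link capacity to the requested static path rate, and the line
*ipd = ((link + (path >> 1)) / path) - 1 is integer arithmetic for
round(link / path) - 1. The standalone sketch below (with made-up rate multiples,
not real adapter values) shows the rounding behaviour.

#include <stdio.h>

/* same rounding as ehca_calc_ipd(): round(link / path) - 1 in integers */
static int calc_ipd(int link, int path)
{
        if (path >= link)
                return 0;       /* path faster than link: no throttling */
        return ((link + (path >> 1)) / path) - 1;
}

int main(void)
{
        /* illustrative (link, path) multiples, not real adapter values */
        printf("%d\n", calc_ipd(12, 4)); /* 12/4 = 3.0 -> rounds to 3 -> ipd 2 */
        printf("%d\n", calc_ipd(10, 4)); /* 10/4 = 2.5 -> rounds to 3 -> ipd 2 */
        printf("%d\n", calc_ipd(4, 4));  /* path == link -> ipd 0 */
        return 0;
}
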
diff --git a/drivers/staging/rdma/ehca/ehca_classes.h b/drivers/staging/rdma/ehca/ehca_classes.h
new file mode 100644 (file)
index 0000000..bd45e0f
--- /dev/null
@@ -0,0 +1,482 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  Struct definition for eHCA internal structures
+ *
+ *  Authors: Heiko J Schick <schickhj@de.ibm.com>
+ *           Christoph Raisch <raisch@de.ibm.com>
+ *           Joachim Fenkes <fenkes@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __EHCA_CLASSES_H__
+#define __EHCA_CLASSES_H__
+
+struct ehca_module;
+struct ehca_qp;
+struct ehca_cq;
+struct ehca_eq;
+struct ehca_mr;
+struct ehca_mw;
+struct ehca_pd;
+struct ehca_av;
+
+#include <linux/wait.h>
+#include <linux/mutex.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_user_verbs.h>
+
+#ifdef CONFIG_PPC64
+#include "ehca_classes_pSeries.h"
+#endif
+#include "ipz_pt_fn.h"
+#include "ehca_qes.h"
+#include "ehca_irq.h"
+
+#define EHCA_EQE_CACHE_SIZE 20
+#define EHCA_MAX_NUM_QUEUES 0xffff
+
+struct ehca_eqe_cache_entry {
+       struct ehca_eqe *eqe;
+       struct ehca_cq *cq;
+};
+
+struct ehca_eq {
+       u32 length;
+       struct ipz_queue ipz_queue;
+       struct ipz_eq_handle ipz_eq_handle;
+       struct work_struct work;
+       struct h_galpas galpas;
+       int is_initialized;
+       struct ehca_pfeq pf;
+       spinlock_t spinlock;
+       struct tasklet_struct interrupt_task;
+       u32 ist;
+       spinlock_t irq_spinlock;
+       struct ehca_eqe_cache_entry eqe_cache[EHCA_EQE_CACHE_SIZE];
+};
+
+struct ehca_sma_attr {
+       u16 lid, lmc, sm_sl, sm_lid;
+       u16 pkey_tbl_len, pkeys[16];
+};
+
+struct ehca_sport {
+       struct ib_cq *ibcq_aqp1;
+       struct ib_qp *ibqp_sqp[2];
+       /* lock to serialize modify_qp() calls for sqp in normal
+        * and irq path (when event PORT_ACTIVE is received the first time)
+        */
+       spinlock_t mod_sqp_lock;
+       enum ib_port_state port_state;
+       struct ehca_sma_attr saved_attr;
+       u32 pma_qp_nr;
+};
+
+#define HCA_CAP_MR_PGSIZE_4K  0x80000000
+#define HCA_CAP_MR_PGSIZE_64K 0x40000000
+#define HCA_CAP_MR_PGSIZE_1M  0x20000000
+#define HCA_CAP_MR_PGSIZE_16M 0x10000000
+
+struct ehca_shca {
+       struct ib_device ib_device;
+       struct platform_device *ofdev;
+       u8 num_ports;
+       int hw_level;
+       struct list_head shca_list;
+       struct ipz_adapter_handle ipz_hca_handle;
+       struct ehca_sport sport[2];
+       struct ehca_eq eq;
+       struct ehca_eq neq;
+       struct ehca_mr *maxmr;
+       struct ehca_pd *pd;
+       struct h_galpas galpas;
+       struct mutex modify_mutex;
+       u64 hca_cap;
+       /* MR pgsize: bit 0-3 means 4K, 64K, 1M, 16M respectively */
+       u32 hca_cap_mr_pgsize;
+       int max_mtu;
+       int max_num_qps;
+       int max_num_cqs;
+       atomic_t num_cqs;
+       atomic_t num_qps;
+};
+
+struct ehca_pd {
+       struct ib_pd ib_pd;
+       struct ipz_pd fw_pd;
+       /* small queue mgmt */
+       struct mutex lock;
+       struct list_head free[2];
+       struct list_head full[2];
+};
+
+enum ehca_ext_qp_type {
+       EQPT_NORMAL    = 0,
+       EQPT_LLQP      = 1,
+       EQPT_SRQBASE   = 2,
+       EQPT_SRQ       = 3,
+};
+
+/* struct to cache modify_qp()'s parms for GSI/SMI qp */
+struct ehca_mod_qp_parm {
+       int mask;
+       struct ib_qp_attr attr;
+};
+
+#define EHCA_MOD_QP_PARM_MAX 4
+
+#define QMAP_IDX_MASK 0xFFFFULL
+
+/* struct for tracking if cqes have been reported to the application */
+struct ehca_qmap_entry {
+       u16 app_wr_id;
+       u8 reported;
+       u8 cqe_req;
+};
+
+struct ehca_queue_map {
+       struct ehca_qmap_entry *map;
+       unsigned int entries;
+       unsigned int tail;
+       unsigned int left_to_poll;
+       unsigned int next_wqe_idx;   /* Idx to first wqe to be flushed */
+};
+
+/* function to calculate the next index for the qmap */
+static inline unsigned int next_index(unsigned int cur_index, unsigned int limit)
+{
+       unsigned int temp = cur_index + 1;
+       return (temp == limit) ? 0 : temp;
+}
+
+struct ehca_qp {
+       union {
+               struct ib_qp ib_qp;
+               struct ib_srq ib_srq;
+       };
+       u32 qp_type;
+       enum ehca_ext_qp_type ext_type;
+       enum ib_qp_state state;
+       struct ipz_queue ipz_squeue;
+       struct ehca_queue_map sq_map;
+       struct ipz_queue ipz_rqueue;
+       struct ehca_queue_map rq_map;
+       struct h_galpas galpas;
+       u32 qkey;
+       u32 real_qp_num;
+       u32 token;
+       spinlock_t spinlock_s;
+       spinlock_t spinlock_r;
+       u32 sq_max_inline_data_size;
+       struct ipz_qp_handle ipz_qp_handle;
+       struct ehca_pfqp pf;
+       struct ib_qp_init_attr init_attr;
+       struct ehca_cq *send_cq;
+       struct ehca_cq *recv_cq;
+       unsigned int sqerr_purgeflag;
+       struct hlist_node list_entries;
+       /* array to cache modify_qp()'s parms for GSI/SMI qp */
+       struct ehca_mod_qp_parm *mod_qp_parm;
+       int mod_qp_parm_idx;
+       /* mmap counter for resources mapped into user space */
+       u32 mm_count_squeue;
+       u32 mm_count_rqueue;
+       u32 mm_count_galpa;
+       /* unsolicited ack circumvention */
+       int unsol_ack_circ;
+       int mtu_shift;
+       u32 message_count;
+       u32 packet_count;
+       atomic_t nr_events; /* events seen */
+       wait_queue_head_t wait_completion;
+       int mig_armed;
+       struct list_head sq_err_node;
+       struct list_head rq_err_node;
+};
+
+#define IS_SRQ(qp) (qp->ext_type == EQPT_SRQ)
+#define HAS_SQ(qp) (qp->ext_type != EQPT_SRQ)
+#define HAS_RQ(qp) (qp->ext_type != EQPT_SRQBASE)
+
+/* must be power of 2 */
+#define QP_HASHTAB_LEN 8
+
+struct ehca_cq {
+       struct ib_cq ib_cq;
+       struct ipz_queue ipz_queue;
+       struct h_galpas galpas;
+       spinlock_t spinlock;
+       u32 cq_number;
+       u32 token;
+       u32 nr_of_entries;
+       struct ipz_cq_handle ipz_cq_handle;
+       struct ehca_pfcq pf;
+       spinlock_t cb_lock;
+       struct hlist_head qp_hashtab[QP_HASHTAB_LEN];
+       struct list_head entry;
+       u32 nr_callbacks;   /* #events assigned to cpu by scaling code */
+       atomic_t nr_events; /* #events seen */
+       wait_queue_head_t wait_completion;
+       spinlock_t task_lock;
+       /* mmap counter for resources mapped into user space */
+       u32 mm_count_queue;
+       u32 mm_count_galpa;
+       struct list_head sqp_err_list;
+       struct list_head rqp_err_list;
+};
+
+enum ehca_mr_flag {
+       EHCA_MR_FLAG_FMR = 0x80000000,   /* FMR, created with ehca_alloc_fmr */
+       EHCA_MR_FLAG_MAXMR = 0x40000000, /* max-MR                           */
+};
+
+struct ehca_mr {
+       union {
+               struct ib_mr ib_mr;     /* must always be first in ehca_mr */
+               struct ib_fmr ib_fmr;   /* must always be first in ehca_mr */
+       } ib;
+       struct ib_umem *umem;
+       spinlock_t mrlock;
+
+       enum ehca_mr_flag flags;
+       u32 num_kpages;         /* number of kernel pages */
+       u32 num_hwpages;        /* number of hw pages to form MR */
+       u64 hwpage_size;        /* hw page size used for this MR */
+       int acl;                /* ACL (stored here for usage in reregister) */
+       u64 *start;             /* virtual start address (stored here for */
+                               /* usage in reregister) */
+       u64 size;               /* size (stored here for usage in reregister) */
+       u32 fmr_page_size;      /* page size for FMR */
+       u32 fmr_max_pages;      /* max pages for FMR */
+       u32 fmr_max_maps;       /* max outstanding maps for FMR */
+       u32 fmr_map_cnt;        /* map counter for FMR */
+       /* fw specific data */
+       struct ipz_mrmw_handle ipz_mr_handle;   /* MR handle for h-calls */
+       struct h_galpas galpas;
+};
+
+struct ehca_mw {
+       struct ib_mw ib_mw;     /* gen2 mw, must always be first in ehca_mw */
+       spinlock_t mwlock;
+
+       u8 never_bound;         /* indication MW was never bound */
+       struct ipz_mrmw_handle ipz_mw_handle;   /* MW handle for h-calls */
+       struct h_galpas galpas;
+};
+
+enum ehca_mr_pgi_type {
+       EHCA_MR_PGI_PHYS   = 1,  /* type of ehca_reg_phys_mr,
+                                 * ehca_rereg_phys_mr,
+                                 * ehca_reg_internal_maxmr */
+       EHCA_MR_PGI_USER   = 2,  /* type of ehca_reg_user_mr */
+       EHCA_MR_PGI_FMR    = 3   /* type of ehca_map_phys_fmr */
+};
+
+struct ehca_mr_pginfo {
+       enum ehca_mr_pgi_type type;
+       u64 num_kpages;
+       u64 kpage_cnt;
+       u64 hwpage_size;     /* hw page size used for this MR */
+       u64 num_hwpages;     /* number of hw pages */
+       u64 hwpage_cnt;      /* counter for hw pages */
+       u64 next_hwpage;     /* next hw page in buffer/chunk/listelem */
+
+       union {
+               struct { /* type EHCA_MR_PGI_PHYS section */
+                       int num_phys_buf;
+                       struct ib_phys_buf *phys_buf_array;
+                       u64 next_buf;
+               } phy;
+               struct { /* type EHCA_MR_PGI_USER section */
+                       struct ib_umem *region;
+                       struct scatterlist *next_sg;
+                       u64 next_nmap;
+               } usr;
+               struct { /* type EHCA_MR_PGI_FMR section */
+                       u64 fmr_pgsize;
+                       u64 *page_list;
+                       u64 next_listelem;
+               } fmr;
+       } u;
+};
+
+/* output parameters for MR/FMR hipz calls */
+struct ehca_mr_hipzout_parms {
+       struct ipz_mrmw_handle handle;
+       u32 lkey;
+       u32 rkey;
+       u64 len;
+       u64 vaddr;
+       u32 acl;
+};
+
+/* output parameters for MW hipz calls */
+struct ehca_mw_hipzout_parms {
+       struct ipz_mrmw_handle handle;
+       u32 rkey;
+};
+
+struct ehca_av {
+       struct ib_ah ib_ah;
+       struct ehca_ud_av av;
+};
+
+struct ehca_ucontext {
+       struct ib_ucontext ib_ucontext;
+};
+
+int ehca_init_pd_cache(void);
+void ehca_cleanup_pd_cache(void);
+int ehca_init_cq_cache(void);
+void ehca_cleanup_cq_cache(void);
+int ehca_init_qp_cache(void);
+void ehca_cleanup_qp_cache(void);
+int ehca_init_av_cache(void);
+void ehca_cleanup_av_cache(void);
+int ehca_init_mrmw_cache(void);
+void ehca_cleanup_mrmw_cache(void);
+int ehca_init_small_qp_cache(void);
+void ehca_cleanup_small_qp_cache(void);
+
+extern rwlock_t ehca_qp_idr_lock;
+extern rwlock_t ehca_cq_idr_lock;
+extern struct idr ehca_qp_idr;
+extern struct idr ehca_cq_idr;
+extern spinlock_t shca_list_lock;
+
+extern int ehca_static_rate;
+extern int ehca_port_act_time;
+extern bool ehca_use_hp_mr;
+extern bool ehca_scaling_code;
+extern int ehca_lock_hcalls;
+extern int ehca_nr_ports;
+extern int ehca_max_cq;
+extern int ehca_max_qp;
+
+struct ipzu_queue_resp {
+       u32 qe_size;      /* queue entry size */
+       u32 act_nr_of_sg;
+       u32 queue_length; /* queue length allocated in bytes */
+       u32 pagesize;
+       u32 toggle_state;
+       u32 offset; /* save offset within a page for small_qp */
+};
+
+struct ehca_create_cq_resp {
+       u32 cq_number;
+       u32 token;
+       struct ipzu_queue_resp ipz_queue;
+       u32 fw_handle_ofs;
+       u32 dummy;
+};
+
+struct ehca_create_qp_resp {
+       u32 qp_num;
+       u32 token;
+       u32 qp_type;
+       u32 ext_type;
+       u32 qkey;
+       /* qp_num assigned by ehca: sqp0/1 may have been assigned different numbers */
+       u32 real_qp_num;
+       u32 fw_handle_ofs;
+       u32 dummy;
+       struct ipzu_queue_resp ipz_squeue;
+       struct ipzu_queue_resp ipz_rqueue;
+};
+
+struct ehca_alloc_cq_parms {
+       u32 nr_cqe;
+       u32 act_nr_of_entries;
+       u32 act_pages;
+       struct ipz_eq_handle eq_handle;
+};
+
+enum ehca_service_type {
+       ST_RC  = 0,
+       ST_UC  = 1,
+       ST_RD  = 2,
+       ST_UD  = 3,
+};
+
+enum ehca_ll_comp_flags {
+       LLQP_SEND_COMP = 0x20,
+       LLQP_RECV_COMP = 0x40,
+       LLQP_COMP_MASK = 0x60,
+};
+
+struct ehca_alloc_queue_parms {
+       /* input parameters */
+       int max_wr;
+       int max_sge;
+       int page_size;
+       int is_small;
+
+       /* output parameters */
+       u16 act_nr_wqes;
+       u8  act_nr_sges;
+       u32 queue_size; /* bytes for small queues, pages otherwise */
+};
+
+struct ehca_alloc_qp_parms {
+       struct ehca_alloc_queue_parms squeue;
+       struct ehca_alloc_queue_parms rqueue;
+
+       /* input parameters */
+       enum ehca_service_type servicetype;
+       int qp_storage;
+       int sigtype;
+       enum ehca_ext_qp_type ext_type;
+       enum ehca_ll_comp_flags ll_comp_flags;
+       int ud_av_l_key_ctl;
+
+       u32 token;
+       struct ipz_eq_handle eq_handle;
+       struct ipz_pd pd;
+       struct ipz_cq_handle send_cq_handle, recv_cq_handle;
+
+       u32 srq_qpn, srq_token, srq_limit;
+
+       /* output parameters */
+       u32 real_qp_num;
+       struct ipz_qp_handle qp_handle;
+       struct h_galpas galpas;
+};
+
+int ehca_cq_assign_qp(struct ehca_cq *cq, struct ehca_qp *qp);
+int ehca_cq_unassign_qp(struct ehca_cq *cq, unsigned int qp_num);
+struct ehca_qp *ehca_cq_get_qp(struct ehca_cq *cq, int qp_num);
+
+#endif
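
Two details in ehca_classes.h are easy to miss: next_index() wraps a queue-map index
back to zero when it reaches the limit, and QP_HASHTAB_LEN must be a power of two
(as the comment above it says) so a bucket key can be computed with a mask instead
of a modulo. A standalone sketch of both, using illustrative values only:

#include <assert.h>
#include <stdio.h>

#define QP_HASHTAB_LEN 8        /* must be a power of 2, as in ehca_classes.h */

/* same wrap-around as next_index() in ehca_classes.h */
static unsigned int next_index(unsigned int cur, unsigned int limit)
{
        unsigned int next = cur + 1;
        return (next == limit) ? 0 : next;
}

int main(void)
{
        unsigned int qp_num;

        /* masking with LEN-1 matches modulo only because LEN is a power of 2 */
        for (qp_num = 0; qp_num < 64; qp_num++)
                assert((qp_num & (QP_HASHTAB_LEN - 1)) == qp_num % QP_HASHTAB_LEN);

        /* index 7 wraps back to 0 in an 8-entry map */
        printf("%u\n", next_index(7, 8));
        return 0;
}
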
diff --git a/drivers/staging/rdma/ehca/ehca_classes_pSeries.h b/drivers/staging/rdma/ehca/ehca_classes_pSeries.h
new file mode 100644 (file)
index 0000000..689c357
--- /dev/null
@@ -0,0 +1,208 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  pSeries interface definitions
+ *
+ *  Authors: Waleri Fomin <fomin@de.ibm.com>
+ *           Christoph Raisch <raisch@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __EHCA_CLASSES_PSERIES_H__
+#define __EHCA_CLASSES_PSERIES_H__
+
+#include "hcp_phyp.h"
+#include "ipz_pt_fn.h"
+
+
+struct ehca_pfqp {
+       struct ipz_qpt sqpt;
+       struct ipz_qpt rqpt;
+};
+
+struct ehca_pfcq {
+       struct ipz_qpt qpt;
+       u32 cqnr;
+};
+
+struct ehca_pfeq {
+       struct ipz_qpt qpt;
+       struct h_galpa galpa;
+       u32 eqnr;
+};
+
+struct ipz_adapter_handle {
+       u64 handle;
+};
+
+struct ipz_cq_handle {
+       u64 handle;
+};
+
+struct ipz_eq_handle {
+       u64 handle;
+};
+
+struct ipz_qp_handle {
+       u64 handle;
+};
+struct ipz_mrmw_handle {
+       u64 handle;
+};
+
+struct ipz_pd {
+       u32 value;
+};
+
+struct hcp_modify_qp_control_block {
+       u32 qkey;                      /* 00 */
+       u32 rdd;                       /* reliable datagram domain */
+       u32 send_psn;                  /* 02 */
+       u32 receive_psn;               /* 03 */
+       u32 prim_phys_port;            /* 04 */
+       u32 alt_phys_port;             /* 05 */
+       u32 prim_p_key_idx;            /* 06 */
+       u32 alt_p_key_idx;             /* 07 */
+       u32 rdma_atomic_ctrl;          /* 08 */
+       u32 qp_state;                  /* 09 */
+       u32 reserved_10;               /* 10 */
+       u32 rdma_nr_atomic_resp_res;   /* 11 */
+       u32 path_migration_state;      /* 12 */
+       u32 rdma_atomic_outst_dest_qp; /* 13 */
+       u32 dest_qp_nr;                /* 14 */
+       u32 min_rnr_nak_timer_field;   /* 15 */
+       u32 service_level;             /* 16 */
+       u32 send_grh_flag;             /* 17 */
+       u32 retry_count;               /* 18 */
+       u32 timeout;                   /* 19 */
+       u32 path_mtu;                  /* 20 */
+       u32 max_static_rate;           /* 21 */
+       u32 dlid;                      /* 22 */
+       u32 rnr_retry_count;           /* 23 */
+       u32 source_path_bits;          /* 24 */
+       u32 traffic_class;             /* 25 */
+       u32 hop_limit;                 /* 26 */
+       u32 source_gid_idx;            /* 27 */
+       u32 flow_label;                /* 28 */
+       u32 reserved_29;               /* 29 */
+       union {                        /* 30 */
+               u64 dw[2];
+               u8 byte[16];
+       } dest_gid;
+       u32 service_level_al;          /* 34 */
+       u32 send_grh_flag_al;          /* 35 */
+       u32 retry_count_al;            /* 36 */
+       u32 timeout_al;                /* 37 */
+       u32 max_static_rate_al;        /* 38 */
+       u32 dlid_al;                   /* 39 */
+       u32 rnr_retry_count_al;        /* 40 */
+       u32 source_path_bits_al;       /* 41 */
+       u32 traffic_class_al;          /* 42 */
+       u32 hop_limit_al;              /* 43 */
+       u32 source_gid_idx_al;         /* 44 */
+       u32 flow_label_al;             /* 45 */
+       u32 reserved_46;               /* 46 */
+       u32 reserved_47;               /* 47 */
+       union {                        /* 48 */
+               u64 dw[2];
+               u8 byte[16];
+       } dest_gid_al;
+       u32 max_nr_outst_send_wr;      /* 52 */
+       u32 max_nr_outst_recv_wr;      /* 53 */
+       u32 disable_ete_credit_check;  /* 54 */
+       u32 qp_number;                 /* 55 */
+       u64 send_queue_handle;         /* 56 */
+       u64 recv_queue_handle;         /* 58 */
+       u32 actual_nr_sges_in_sq_wqe;  /* 60 */
+       u32 actual_nr_sges_in_rq_wqe;  /* 61 */
+       u32 qp_enable;                 /* 62 */
+       u32 curr_srq_limit;            /* 63 */
+       u64 qp_aff_asyn_ev_log_reg;    /* 64 */
+       u64 shared_rq_hndl;            /* 66 */
+       u64 trigg_doorbell_qp_hndl;    /* 68 */
+       u32 reserved_70_127[58];       /* 70 */
+};
+
+#define MQPCB_MASK_QKEY                         EHCA_BMASK_IBM( 0,  0)
+#define MQPCB_MASK_SEND_PSN                     EHCA_BMASK_IBM( 2,  2)
+#define MQPCB_MASK_RECEIVE_PSN                  EHCA_BMASK_IBM( 3,  3)
+#define MQPCB_MASK_PRIM_PHYS_PORT               EHCA_BMASK_IBM( 4,  4)
+#define MQPCB_PRIM_PHYS_PORT                    EHCA_BMASK_IBM(24, 31)
+#define MQPCB_MASK_ALT_PHYS_PORT                EHCA_BMASK_IBM( 5,  5)
+#define MQPCB_MASK_PRIM_P_KEY_IDX               EHCA_BMASK_IBM( 6,  6)
+#define MQPCB_PRIM_P_KEY_IDX                    EHCA_BMASK_IBM(24, 31)
+#define MQPCB_MASK_ALT_P_KEY_IDX                EHCA_BMASK_IBM( 7,  7)
+#define MQPCB_MASK_RDMA_ATOMIC_CTRL             EHCA_BMASK_IBM( 8,  8)
+#define MQPCB_MASK_QP_STATE                     EHCA_BMASK_IBM( 9,  9)
+#define MQPCB_MASK_RDMA_NR_ATOMIC_RESP_RES      EHCA_BMASK_IBM(11, 11)
+#define MQPCB_MASK_PATH_MIGRATION_STATE         EHCA_BMASK_IBM(12, 12)
+#define MQPCB_MASK_RDMA_ATOMIC_OUTST_DEST_QP    EHCA_BMASK_IBM(13, 13)
+#define MQPCB_MASK_DEST_QP_NR                   EHCA_BMASK_IBM(14, 14)
+#define MQPCB_MASK_MIN_RNR_NAK_TIMER_FIELD      EHCA_BMASK_IBM(15, 15)
+#define MQPCB_MASK_SERVICE_LEVEL                EHCA_BMASK_IBM(16, 16)
+#define MQPCB_MASK_SEND_GRH_FLAG                EHCA_BMASK_IBM(17, 17)
+#define MQPCB_MASK_RETRY_COUNT                  EHCA_BMASK_IBM(18, 18)
+#define MQPCB_MASK_TIMEOUT                      EHCA_BMASK_IBM(19, 19)
+#define MQPCB_MASK_PATH_MTU                     EHCA_BMASK_IBM(20, 20)
+#define MQPCB_MASK_MAX_STATIC_RATE              EHCA_BMASK_IBM(21, 21)
+#define MQPCB_MASK_DLID                         EHCA_BMASK_IBM(22, 22)
+#define MQPCB_MASK_RNR_RETRY_COUNT              EHCA_BMASK_IBM(23, 23)
+#define MQPCB_MASK_SOURCE_PATH_BITS             EHCA_BMASK_IBM(24, 24)
+#define MQPCB_MASK_TRAFFIC_CLASS                EHCA_BMASK_IBM(25, 25)
+#define MQPCB_MASK_HOP_LIMIT                    EHCA_BMASK_IBM(26, 26)
+#define MQPCB_MASK_SOURCE_GID_IDX               EHCA_BMASK_IBM(27, 27)
+#define MQPCB_MASK_FLOW_LABEL                   EHCA_BMASK_IBM(28, 28)
+#define MQPCB_MASK_DEST_GID                     EHCA_BMASK_IBM(30, 30)
+#define MQPCB_MASK_SERVICE_LEVEL_AL             EHCA_BMASK_IBM(31, 31)
+#define MQPCB_MASK_SEND_GRH_FLAG_AL             EHCA_BMASK_IBM(32, 32)
+#define MQPCB_MASK_RETRY_COUNT_AL               EHCA_BMASK_IBM(33, 33)
+#define MQPCB_MASK_TIMEOUT_AL                   EHCA_BMASK_IBM(34, 34)
+#define MQPCB_MASK_MAX_STATIC_RATE_AL           EHCA_BMASK_IBM(35, 35)
+#define MQPCB_MASK_DLID_AL                      EHCA_BMASK_IBM(36, 36)
+#define MQPCB_MASK_RNR_RETRY_COUNT_AL           EHCA_BMASK_IBM(37, 37)
+#define MQPCB_MASK_SOURCE_PATH_BITS_AL          EHCA_BMASK_IBM(38, 38)
+#define MQPCB_MASK_TRAFFIC_CLASS_AL             EHCA_BMASK_IBM(39, 39)
+#define MQPCB_MASK_HOP_LIMIT_AL                 EHCA_BMASK_IBM(40, 40)
+#define MQPCB_MASK_SOURCE_GID_IDX_AL            EHCA_BMASK_IBM(41, 41)
+#define MQPCB_MASK_FLOW_LABEL_AL                EHCA_BMASK_IBM(42, 42)
+#define MQPCB_MASK_DEST_GID_AL                  EHCA_BMASK_IBM(44, 44)
+#define MQPCB_MASK_MAX_NR_OUTST_SEND_WR         EHCA_BMASK_IBM(45, 45)
+#define MQPCB_MASK_MAX_NR_OUTST_RECV_WR         EHCA_BMASK_IBM(46, 46)
+#define MQPCB_MASK_DISABLE_ETE_CREDIT_CHECK     EHCA_BMASK_IBM(47, 47)
+#define MQPCB_MASK_QP_ENABLE                    EHCA_BMASK_IBM(48, 48)
+#define MQPCB_MASK_CURR_SRQ_LIMIT               EHCA_BMASK_IBM(49, 49)
+#define MQPCB_MASK_QP_AFF_ASYN_EV_LOG_REG       EHCA_BMASK_IBM(50, 50)
+#define MQPCB_MASK_SHARED_RQ_HNDL               EHCA_BMASK_IBM(51, 51)
+
+#endif /* __EHCA_CLASSES_PSERIES_H__ */
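
The MQPCB_MASK_* constants above are built with EHCA_BMASK_IBM(), which is defined
in ehca_tools.h and is not part of this diff. Assuming the usual IBM/POWER
bit-numbering convention (bit 0 is the most significant bit of the word), a
hypothetical set/get pair for a (from, to) bit range could look like the sketch
below; it only illustrates the convention and is not the driver's actual macro
implementation.

#include <stdint.h>
#include <stdio.h>

/*
 * Hypothetical helpers for IBM bit numbering on a 64-bit word:
 * bit 0 is the most significant bit, bit 63 the least significant.
 * The driver's real EHCA_BMASK_* macros (ehca_tools.h) are not shown
 * in this diff and may be implemented differently.
 */
static uint64_t bmask_set(int from, int to, uint64_t value)
{
        int width = to - from + 1;
        int shift = 63 - to;
        uint64_t mask = ((width == 64) ? ~0ULL : ((1ULL << width) - 1)) << shift;

        return (value << shift) & mask;
}

static uint64_t bmask_get(int from, int to, uint64_t word)
{
        int width = to - from + 1;
        int shift = 63 - to;
        uint64_t mask = (width == 64) ? ~0ULL : ((1ULL << width) - 1);

        return (word >> shift) & mask;
}

int main(void)
{
        /* e.g. a single-bit flag at IBM bit 4, like MQPCB_MASK_PRIM_PHYS_PORT */
        uint64_t word = bmask_set(4, 4, 1);

        printf("0x%016llx -> %llu\n",
               (unsigned long long)word,
               (unsigned long long)bmask_get(4, 4, word));
        return 0;
}
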
diff --git a/drivers/staging/rdma/ehca/ehca_cq.c b/drivers/staging/rdma/ehca/ehca_cq.c
new file mode 100644 (file)
index 0000000..9b68b17
--- /dev/null
@@ -0,0 +1,397 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  Completion queue handling
+ *
+ *  Authors: Waleri Fomin <fomin@de.ibm.com>
+ *           Khadija Souissi <souissi@de.ibm.com>
+ *           Reinhard Ernst <rernst@de.ibm.com>
+ *           Heiko J Schick <schickhj@de.ibm.com>
+ *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/slab.h>
+
+#include "ehca_iverbs.h"
+#include "ehca_classes.h"
+#include "ehca_irq.h"
+#include "hcp_if.h"
+
+static struct kmem_cache *cq_cache;
+
+int ehca_cq_assign_qp(struct ehca_cq *cq, struct ehca_qp *qp)
+{
+       unsigned int qp_num = qp->real_qp_num;
+       unsigned int key = qp_num & (QP_HASHTAB_LEN-1);
+       unsigned long flags;
+
+       spin_lock_irqsave(&cq->spinlock, flags);
+       hlist_add_head(&qp->list_entries, &cq->qp_hashtab[key]);
+       spin_unlock_irqrestore(&cq->spinlock, flags);
+
+       ehca_dbg(cq->ib_cq.device, "cq_num=%x real_qp_num=%x",
+                cq->cq_number, qp_num);
+
+       return 0;
+}
+
+int ehca_cq_unassign_qp(struct ehca_cq *cq, unsigned int real_qp_num)
+{
+       int ret = -EINVAL;
+       unsigned int key = real_qp_num & (QP_HASHTAB_LEN-1);
+       struct hlist_node *iter;
+       struct ehca_qp *qp;
+       unsigned long flags;
+
+       spin_lock_irqsave(&cq->spinlock, flags);
+       hlist_for_each(iter, &cq->qp_hashtab[key]) {
+               qp = hlist_entry(iter, struct ehca_qp, list_entries);
+               if (qp->real_qp_num == real_qp_num) {
+                       hlist_del(iter);
+                       ehca_dbg(cq->ib_cq.device,
+                                "removed qp from cq .cq_num=%x real_qp_num=%x",
+                                cq->cq_number, real_qp_num);
+                       ret = 0;
+                       break;
+               }
+       }
+       spin_unlock_irqrestore(&cq->spinlock, flags);
+       if (ret)
+               ehca_err(cq->ib_cq.device,
+                        "qp not found cq_num=%x real_qp_num=%x",
+                        cq->cq_number, real_qp_num);
+
+       return ret;
+}
+
+struct ehca_qp *ehca_cq_get_qp(struct ehca_cq *cq, int real_qp_num)
+{
+       struct ehca_qp *ret = NULL;
+       unsigned int key = real_qp_num & (QP_HASHTAB_LEN-1);
+       struct hlist_node *iter;
+       struct ehca_qp *qp;
+       hlist_for_each(iter, &cq->qp_hashtab[key]) {
+               qp = hlist_entry(iter, struct ehca_qp, list_entries);
+               if (qp->real_qp_num == real_qp_num) {
+                       ret = qp;
+                       break;
+               }
+       }
+       return ret;
+}
+
+struct ib_cq *ehca_create_cq(struct ib_device *device,
+                            const struct ib_cq_init_attr *attr,
+                            struct ib_ucontext *context,
+                            struct ib_udata *udata)
+{
+       int cqe = attr->cqe;
+       static const u32 additional_cqe = 20;
+       struct ib_cq *cq;
+       struct ehca_cq *my_cq;
+       struct ehca_shca *shca =
+               container_of(device, struct ehca_shca, ib_device);
+       struct ipz_adapter_handle adapter_handle;
+       struct ehca_alloc_cq_parms param; /* h_call's out parameters */
+       struct h_galpa gal;
+       void *vpage;
+       u32 counter;
+       u64 rpage, cqx_fec, h_ret;
+       int ipz_rc, i;
+       unsigned long flags;
+
+       if (attr->flags)
+               return ERR_PTR(-EINVAL);
+
+       if (cqe >= 0xFFFFFFFF - 64 - additional_cqe)
+               return ERR_PTR(-EINVAL);
+
+       if (!atomic_add_unless(&shca->num_cqs, 1, shca->max_num_cqs)) {
+               ehca_err(device, "Unable to create CQ, max number of %i "
+                       "CQs reached.", shca->max_num_cqs);
+               ehca_err(device, "To increase the maximum number of CQs "
+                       "use the number_of_cqs module parameter.\n");
+               return ERR_PTR(-ENOSPC);
+       }
+
+       my_cq = kmem_cache_zalloc(cq_cache, GFP_KERNEL);
+       if (!my_cq) {
+               ehca_err(device, "Out of memory for ehca_cq struct device=%p",
+                        device);
+               atomic_dec(&shca->num_cqs);
+               return ERR_PTR(-ENOMEM);
+       }
+
+       memset(&param, 0, sizeof(struct ehca_alloc_cq_parms));
+
+       spin_lock_init(&my_cq->spinlock);
+       spin_lock_init(&my_cq->cb_lock);
+       spin_lock_init(&my_cq->task_lock);
+       atomic_set(&my_cq->nr_events, 0);
+       init_waitqueue_head(&my_cq->wait_completion);
+
+       cq = &my_cq->ib_cq;
+
+       adapter_handle = shca->ipz_hca_handle;
+       param.eq_handle = shca->eq.ipz_eq_handle;
+
+       idr_preload(GFP_KERNEL);
+       write_lock_irqsave(&ehca_cq_idr_lock, flags);
+       my_cq->token = idr_alloc(&ehca_cq_idr, my_cq, 0, 0x2000000, GFP_NOWAIT);
+       write_unlock_irqrestore(&ehca_cq_idr_lock, flags);
+       idr_preload_end();
+
+       if (my_cq->token < 0) {
+               cq = ERR_PTR(-ENOMEM);
+               ehca_err(device, "Can't allocate new idr entry. device=%p",
+                        device);
+               goto create_cq_exit1;
+       }
+
+       /*
+        * The maximum CQ depth is 4GB-64, but we need an additional 20 entries
+        * as a buffer for receiving error CQEs.
+        */
+       param.nr_cqe = cqe + additional_cqe;
+       h_ret = hipz_h_alloc_resource_cq(adapter_handle, my_cq, &param);
+
+       if (h_ret != H_SUCCESS) {
+               ehca_err(device, "hipz_h_alloc_resource_cq() failed "
+                        "h_ret=%lli device=%p", h_ret, device);
+               cq = ERR_PTR(ehca2ib_return_code(h_ret));
+               goto create_cq_exit2;
+       }
+
+       ipz_rc = ipz_queue_ctor(NULL, &my_cq->ipz_queue, param.act_pages,
+                               EHCA_PAGESIZE, sizeof(struct ehca_cqe), 0, 0);
+       if (!ipz_rc) {
+               ehca_err(device, "ipz_queue_ctor() failed ipz_rc=%i device=%p",
+                        ipz_rc, device);
+               cq = ERR_PTR(-EINVAL);
+               goto create_cq_exit3;
+       }
+
+       for (counter = 0; counter < param.act_pages; counter++) {
+               vpage = ipz_qpageit_get_inc(&my_cq->ipz_queue);
+               if (!vpage) {
+                       ehca_err(device, "ipz_qpageit_get_inc() "
+                                "returns NULL device=%p", device);
+                       cq = ERR_PTR(-EAGAIN);
+                       goto create_cq_exit4;
+               }
+               rpage = __pa(vpage);
+
+               h_ret = hipz_h_register_rpage_cq(adapter_handle,
+                                                my_cq->ipz_cq_handle,
+                                                &my_cq->pf,
+                                                0,
+                                                0,
+                                                rpage,
+                                                1,
+                                                my_cq->galpas.
+                                                kernel);
+
+               if (h_ret < H_SUCCESS) {
+                       ehca_err(device, "hipz_h_register_rpage_cq() failed "
+                                "ehca_cq=%p cq_num=%x h_ret=%lli counter=%i "
+                                "act_pages=%i", my_cq, my_cq->cq_number,
+                                h_ret, counter, param.act_pages);
+                       cq = ERR_PTR(-EINVAL);
+                       goto create_cq_exit4;
+               }
+
+               if (counter == (param.act_pages - 1)) {
+                       vpage = ipz_qpageit_get_inc(&my_cq->ipz_queue);
+                       if ((h_ret != H_SUCCESS) || vpage) {
+                               ehca_err(device, "Registration of pages not "
+                                        "complete ehca_cq=%p cq_num=%x "
+                                        "h_ret=%lli", my_cq, my_cq->cq_number,
+                                        h_ret);
+                               cq = ERR_PTR(-EAGAIN);
+                               goto create_cq_exit4;
+                       }
+               } else {
+                       if (h_ret != H_PAGE_REGISTERED) {
+                               ehca_err(device, "Registration of page failed "
+                                        "ehca_cq=%p cq_num=%x h_ret=%lli "
+                                        "counter=%i act_pages=%i",
+                                        my_cq, my_cq->cq_number,
+                                        h_ret, counter, param.act_pages);
+                               cq = ERR_PTR(-ENOMEM);
+                               goto create_cq_exit4;
+                       }
+               }
+       }
+
+       ipz_qeit_reset(&my_cq->ipz_queue);
+
+       gal = my_cq->galpas.kernel;
+       cqx_fec = hipz_galpa_load(gal, CQTEMM_OFFSET(cqx_fec));
+       ehca_dbg(device, "ehca_cq=%p cq_num=%x CQX_FEC=%llx",
+                my_cq, my_cq->cq_number, cqx_fec);
+
+       my_cq->ib_cq.cqe = my_cq->nr_of_entries =
+               param.act_nr_of_entries - additional_cqe;
+       my_cq->cq_number = (my_cq->ipz_cq_handle.handle) & 0xffff;
+
+       for (i = 0; i < QP_HASHTAB_LEN; i++)
+               INIT_HLIST_HEAD(&my_cq->qp_hashtab[i]);
+
+       INIT_LIST_HEAD(&my_cq->sqp_err_list);
+       INIT_LIST_HEAD(&my_cq->rqp_err_list);
+
+       if (context) {
+               struct ipz_queue *ipz_queue = &my_cq->ipz_queue;
+               struct ehca_create_cq_resp resp;
+               memset(&resp, 0, sizeof(resp));
+               resp.cq_number = my_cq->cq_number;
+               resp.token = my_cq->token;
+               resp.ipz_queue.qe_size = ipz_queue->qe_size;
+               resp.ipz_queue.act_nr_of_sg = ipz_queue->act_nr_of_sg;
+               resp.ipz_queue.queue_length = ipz_queue->queue_length;
+               resp.ipz_queue.pagesize = ipz_queue->pagesize;
+               resp.ipz_queue.toggle_state = ipz_queue->toggle_state;
+               resp.fw_handle_ofs = (u32)
+                       (my_cq->galpas.user.fw_handle & (PAGE_SIZE - 1));
+               if (ib_copy_to_udata(udata, &resp, sizeof(resp))) {
+                       ehca_err(device, "Copy to udata failed.");
+                       cq = ERR_PTR(-EFAULT);
+                       goto create_cq_exit4;
+               }
+       }
+
+       return cq;
+
+create_cq_exit4:
+       ipz_queue_dtor(NULL, &my_cq->ipz_queue);
+
+create_cq_exit3:
+       h_ret = hipz_h_destroy_cq(adapter_handle, my_cq, 1);
+       if (h_ret != H_SUCCESS)
+               ehca_err(device, "hipz_h_destroy_cq() failed ehca_cq=%p "
+                        "cq_num=%x h_ret=%lli", my_cq, my_cq->cq_number, h_ret);
+
+create_cq_exit2:
+       write_lock_irqsave(&ehca_cq_idr_lock, flags);
+       idr_remove(&ehca_cq_idr, my_cq->token);
+       write_unlock_irqrestore(&ehca_cq_idr_lock, flags);
+
+create_cq_exit1:
+       kmem_cache_free(cq_cache, my_cq);
+
+       atomic_dec(&shca->num_cqs);
+       return cq;
+}
+
+int ehca_destroy_cq(struct ib_cq *cq)
+{
+       u64 h_ret;
+       struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq);
+       int cq_num = my_cq->cq_number;
+       struct ib_device *device = cq->device;
+       struct ehca_shca *shca = container_of(device, struct ehca_shca,
+                                             ib_device);
+       struct ipz_adapter_handle adapter_handle = shca->ipz_hca_handle;
+       unsigned long flags;
+
+       if (cq->uobject) {
+               if (my_cq->mm_count_galpa || my_cq->mm_count_queue) {
+                       ehca_err(device, "Resources still referenced in "
+                                "user space cq_num=%x", my_cq->cq_number);
+                       return -EINVAL;
+               }
+       }
+
+       /*
+        * remove the CQ from the idr first to make sure
+        * no more interrupt tasklets will touch this CQ
+        */
+       write_lock_irqsave(&ehca_cq_idr_lock, flags);
+       idr_remove(&ehca_cq_idr, my_cq->token);
+       write_unlock_irqrestore(&ehca_cq_idr_lock, flags);
+
+       /* now wait until all pending events have completed */
+       wait_event(my_cq->wait_completion, !atomic_read(&my_cq->nr_events));
+
+       /* nobody's using our CQ any longer -- we can destroy it */
+       h_ret = hipz_h_destroy_cq(adapter_handle, my_cq, 0);
+       if (h_ret == H_R_STATE) {
+               /* cq in err: read err data and destroy it forcibly */
+               ehca_dbg(device, "ehca_cq=%p cq_num=%x resource=%llx in err "
+                        "state. Try to delete it forcibly.",
+                        my_cq, cq_num, my_cq->ipz_cq_handle.handle);
+               ehca_error_data(shca, my_cq, my_cq->ipz_cq_handle.handle);
+               h_ret = hipz_h_destroy_cq(adapter_handle, my_cq, 1);
+               if (h_ret == H_SUCCESS)
+                       ehca_dbg(device, "cq_num=%x deleted successfully.",
+                                cq_num);
+       }
+       if (h_ret != H_SUCCESS) {
+               ehca_err(device, "hipz_h_destroy_cq() failed h_ret=%lli "
+                        "ehca_cq=%p cq_num=%x", h_ret, my_cq, cq_num);
+               return ehca2ib_return_code(h_ret);
+       }
+       ipz_queue_dtor(NULL, &my_cq->ipz_queue);
+       kmem_cache_free(cq_cache, my_cq);
+
+       atomic_dec(&shca->num_cqs);
+       return 0;
+}
+
+int ehca_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata)
+{
+       /* TODO: proper resize needs to be done */
+       ehca_err(cq->device, "not implemented yet");
+
+       return -EFAULT;
+}
+
+int ehca_init_cq_cache(void)
+{
+       cq_cache = kmem_cache_create("ehca_cache_cq",
+                                    sizeof(struct ehca_cq), 0,
+                                    SLAB_HWCACHE_ALIGN,
+                                    NULL);
+       if (!cq_cache)
+               return -ENOMEM;
+       return 0;
+}
+
+void ehca_cleanup_cq_cache(void)
+{
+       if (cq_cache)
+               kmem_cache_destroy(cq_cache);
+}
diff --git a/drivers/staging/rdma/ehca/ehca_eq.c b/drivers/staging/rdma/ehca/ehca_eq.c
new file mode 100644 (file)
index 0000000..90da674
--- /dev/null
@@ -0,0 +1,189 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  Event queue handling
+ *
+ *  Authors: Waleri Fomin <fomin@de.ibm.com>
+ *           Khadija Souissi <souissi@de.ibm.com>
+ *           Reinhard Ernst <rernst@de.ibm.com>
+ *           Heiko J Schick <schickhj@de.ibm.com>
+ *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "ehca_classes.h"
+#include "ehca_irq.h"
+#include "ehca_iverbs.h"
+#include "ehca_qes.h"
+#include "hcp_if.h"
+#include "ipz_pt_fn.h"
+
+int ehca_create_eq(struct ehca_shca *shca,
+                  struct ehca_eq *eq,
+                  const enum ehca_eq_type type, const u32 length)
+{
+       int ret;
+       u64 h_ret;
+       u32 nr_pages;
+       u32 i;
+       void *vpage;
+       struct ib_device *ib_dev = &shca->ib_device;
+
+       spin_lock_init(&eq->spinlock);
+       spin_lock_init(&eq->irq_spinlock);
+       eq->is_initialized = 0;
+
+       if (type != EHCA_EQ && type != EHCA_NEQ) {
+               ehca_err(ib_dev, "Invalid EQ type %x. eq=%p", type, eq);
+               return -EINVAL;
+       }
+       if (!length) {
+               ehca_err(ib_dev, "EQ length must not be zero. eq=%p", eq);
+               return -EINVAL;
+       }
+
+       h_ret = hipz_h_alloc_resource_eq(shca->ipz_hca_handle,
+                                        &eq->pf,
+                                        type,
+                                        length,
+                                        &eq->ipz_eq_handle,
+                                        &eq->length,
+                                        &nr_pages, &eq->ist);
+
+       if (h_ret != H_SUCCESS) {
+               ehca_err(ib_dev, "Can't allocate EQ/NEQ. eq=%p", eq);
+               return -EINVAL;
+       }
+
+       ret = ipz_queue_ctor(NULL, &eq->ipz_queue, nr_pages,
+                            EHCA_PAGESIZE, sizeof(struct ehca_eqe), 0, 0);
+       if (!ret) {
+               ehca_err(ib_dev, "Can't allocate EQ pages eq=%p", eq);
+               goto create_eq_exit1;
+       }
+
+       for (i = 0; i < nr_pages; i++) {
+               u64 rpage;
+
+               vpage = ipz_qpageit_get_inc(&eq->ipz_queue);
+               if (!vpage)
+                       goto create_eq_exit2;
+
+               rpage = __pa(vpage);
+               h_ret = hipz_h_register_rpage_eq(shca->ipz_hca_handle,
+                                                eq->ipz_eq_handle,
+                                                &eq->pf,
+                                                0, 0, rpage, 1);
+
+               if (i == (nr_pages - 1)) {
+                       /* last page */
+                       vpage = ipz_qpageit_get_inc(&eq->ipz_queue);
+                       if (h_ret != H_SUCCESS || vpage)
+                               goto create_eq_exit2;
+               } else {
+                       if (h_ret != H_PAGE_REGISTERED)
+                               goto create_eq_exit2;
+               }
+       }
+
+       ipz_qeit_reset(&eq->ipz_queue);
+
+       /* register interrupt handlers and initialize work queues */
+       if (type == EHCA_EQ) {
+               tasklet_init(&eq->interrupt_task, ehca_tasklet_eq, (long)shca);
+
+               ret = ibmebus_request_irq(eq->ist, ehca_interrupt_eq,
+                                         0, "ehca_eq",
+                                         (void *)shca);
+               if (ret < 0)
+                       ehca_err(ib_dev, "Can't map interrupt handler.");
+       } else if (type == EHCA_NEQ) {
+               tasklet_init(&eq->interrupt_task, ehca_tasklet_neq, (long)shca);
+
+               ret = ibmebus_request_irq(eq->ist, ehca_interrupt_neq,
+                                         0, "ehca_neq",
+                                         (void *)shca);
+               if (ret < 0)
+                       ehca_err(ib_dev, "Can't map interrupt handler.");
+       }
+
+       eq->is_initialized = 1;
+
+       return 0;
+
+create_eq_exit2:
+       ipz_queue_dtor(NULL, &eq->ipz_queue);
+
+create_eq_exit1:
+       hipz_h_destroy_eq(shca->ipz_hca_handle, eq);
+
+       return -EINVAL;
+}
+
+void *ehca_poll_eq(struct ehca_shca *shca, struct ehca_eq *eq)
+{
+       unsigned long flags;
+       void *eqe;
+
+       spin_lock_irqsave(&eq->spinlock, flags);
+       eqe = ipz_eqit_eq_get_inc_valid(&eq->ipz_queue);
+       spin_unlock_irqrestore(&eq->spinlock, flags);
+
+       return eqe;
+}
+
+int ehca_destroy_eq(struct ehca_shca *shca, struct ehca_eq *eq)
+{
+       unsigned long flags;
+       u64 h_ret;
+
+       ibmebus_free_irq(eq->ist, (void *)shca);
+
+       spin_lock_irqsave(&shca_list_lock, flags);
+       eq->is_initialized = 0;
+       spin_unlock_irqrestore(&shca_list_lock, flags);
+
+       tasklet_kill(&eq->interrupt_task);
+
+       h_ret = hipz_h_destroy_eq(shca->ipz_hca_handle, eq);
+
+       if (h_ret != H_SUCCESS) {
+               ehca_err(&shca->ib_device, "Can't free EQ resources.");
+               return -EINVAL;
+       }
+       ipz_queue_dtor(NULL, &eq->ipz_queue);
+
+       return 0;
+}
diff --git a/drivers/staging/rdma/ehca/ehca_hca.c b/drivers/staging/rdma/ehca/ehca_hca.c
new file mode 100644 (file)
index 0000000..e8b1bb6
--- /dev/null
@@ -0,0 +1,414 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  HCA query functions
+ *
+ *  Authors: Heiko J Schick <schickhj@de.ibm.com>
+ *           Christoph Raisch <raisch@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/gfp.h>
+
+#include "ehca_tools.h"
+#include "ehca_iverbs.h"
+#include "hcp_if.h"
+
+static unsigned int limit_uint(unsigned int value)
+{
+       return min_t(unsigned int, value, INT_MAX);
+}
+
+int ehca_query_device(struct ib_device *ibdev, struct ib_device_attr *props,
+                     struct ib_udata *uhw)
+{
+       int i, ret = 0;
+       struct ehca_shca *shca = container_of(ibdev, struct ehca_shca,
+                                             ib_device);
+       struct hipz_query_hca *rblock;
+
+       static const u32 cap_mapping[] = {
+               IB_DEVICE_RESIZE_MAX_WR,      HCA_CAP_WQE_RESIZE,
+               IB_DEVICE_BAD_PKEY_CNTR,      HCA_CAP_BAD_P_KEY_CTR,
+               IB_DEVICE_BAD_QKEY_CNTR,      HCA_CAP_Q_KEY_VIOL_CTR,
+               IB_DEVICE_RAW_MULTI,          HCA_CAP_RAW_PACKET_MCAST,
+               IB_DEVICE_AUTO_PATH_MIG,      HCA_CAP_AUTO_PATH_MIG,
+               IB_DEVICE_CHANGE_PHY_PORT,    HCA_CAP_SQD_RTS_PORT_CHANGE,
+               IB_DEVICE_UD_AV_PORT_ENFORCE, HCA_CAP_AH_PORT_NR_CHECK,
+               IB_DEVICE_CURR_QP_STATE_MOD,  HCA_CAP_CUR_QP_STATE_MOD,
+               IB_DEVICE_SHUTDOWN_PORT,      HCA_CAP_SHUTDOWN_PORT,
+               IB_DEVICE_INIT_TYPE,          HCA_CAP_INIT_TYPE,
+               IB_DEVICE_PORT_ACTIVE_EVENT,  HCA_CAP_PORT_ACTIVE_EVENT,
+       };
+
+       if (uhw->inlen || uhw->outlen)
+               return -EINVAL;
+
+       rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+       if (!rblock) {
+               ehca_err(&shca->ib_device, "Can't allocate rblock memory.");
+               return -ENOMEM;
+       }
+
+       if (hipz_h_query_hca(shca->ipz_hca_handle, rblock) != H_SUCCESS) {
+               ehca_err(&shca->ib_device, "Can't query device properties");
+               ret = -EINVAL;
+               goto query_device1;
+       }
+
+       memset(props, 0, sizeof(struct ib_device_attr));
+       props->page_size_cap   = shca->hca_cap_mr_pgsize;
+       props->fw_ver          = rblock->hw_ver;
+       props->max_mr_size     = rblock->max_mr_size;
+       props->vendor_id       = rblock->vendor_id >> 8;
+       props->vendor_part_id  = rblock->vendor_part_id >> 16;
+       props->hw_ver          = rblock->hw_ver;
+       props->max_qp          = limit_uint(rblock->max_qp);
+       props->max_qp_wr       = limit_uint(rblock->max_wqes_wq);
+       props->max_sge         = limit_uint(rblock->max_sge);
+       props->max_sge_rd      = limit_uint(rblock->max_sge_rd);
+       props->max_cq          = limit_uint(rblock->max_cq);
+       props->max_cqe         = limit_uint(rblock->max_cqe);
+       props->max_mr          = limit_uint(rblock->max_mr);
+       props->max_mw          = limit_uint(rblock->max_mw);
+       props->max_pd          = limit_uint(rblock->max_pd);
+       props->max_ah          = limit_uint(rblock->max_ah);
+       props->max_ee          = limit_uint(rblock->max_rd_ee_context);
+       props->max_rdd         = limit_uint(rblock->max_rd_domain);
+       props->max_fmr         = limit_uint(rblock->max_mr);
+       props->max_qp_rd_atom  = limit_uint(rblock->max_rr_qp);
+       props->max_ee_rd_atom  = limit_uint(rblock->max_rr_ee_context);
+       props->max_res_rd_atom = limit_uint(rblock->max_rr_hca);
+       props->max_qp_init_rd_atom = limit_uint(rblock->max_act_wqs_qp);
+       props->max_ee_init_rd_atom = limit_uint(rblock->max_act_wqs_ee_context);
+
+       if (EHCA_BMASK_GET(HCA_CAP_SRQ, shca->hca_cap)) {
+               props->max_srq         = limit_uint(props->max_qp);
+               props->max_srq_wr      = limit_uint(props->max_qp_wr);
+               props->max_srq_sge     = 3;
+       }
+
+       props->max_pkeys           = 16;
+       /* Some FW versions say 0 here; insert sensible value in that case */
+       props->local_ca_ack_delay  = rblock->local_ca_ack_delay ?
+               min_t(u8, rblock->local_ca_ack_delay, 255) : 12;
+       props->max_raw_ipv6_qp     = limit_uint(rblock->max_raw_ipv6_qp);
+       props->max_raw_ethy_qp     = limit_uint(rblock->max_raw_ethy_qp);
+       props->max_mcast_grp       = limit_uint(rblock->max_mcast_grp);
+       props->max_mcast_qp_attach = limit_uint(rblock->max_mcast_qp_attach);
+       props->max_total_mcast_qp_attach
+               = limit_uint(rblock->max_total_mcast_qp_attach);
+
+       /* translate device capabilities */
+       props->device_cap_flags = IB_DEVICE_SYS_IMAGE_GUID |
+               IB_DEVICE_RC_RNR_NAK_GEN | IB_DEVICE_N_NOTIFY_CQ;
+       for (i = 0; i < ARRAY_SIZE(cap_mapping); i += 2)
+               if (rblock->hca_cap_indicators & cap_mapping[i + 1])
+                       props->device_cap_flags |= cap_mapping[i];
+
+query_device1:
+       ehca_free_fw_ctrlblock(rblock);
+
+       return ret;
+}
+
+static enum ib_mtu map_mtu(struct ehca_shca *shca, u32 fw_mtu)
+{
+       switch (fw_mtu) {
+       case 0x1:
+               return IB_MTU_256;
+       case 0x2:
+               return IB_MTU_512;
+       case 0x3:
+               return IB_MTU_1024;
+       case 0x4:
+               return IB_MTU_2048;
+       case 0x5:
+               return IB_MTU_4096;
+       default:
+               ehca_err(&shca->ib_device, "Unknown MTU size: %x.",
+                        fw_mtu);
+               return 0;
+       }
+}
+
+static u8 map_number_of_vls(struct ehca_shca *shca, u32 vl_cap)
+{
+       switch (vl_cap) {
+       case 0x1:
+               return 1;
+       case 0x2:
+               return 2;
+       case 0x3:
+               return 4;
+       case 0x4:
+               return 8;
+       case 0x5:
+               return 15;
+       default:
+               ehca_err(&shca->ib_device, "Invalid VL capability: %x.",
+                        vl_cap);
+               return 0;
+       }
+}
+
+int ehca_query_port(struct ib_device *ibdev,
+                   u8 port, struct ib_port_attr *props)
+{
+       int ret = 0;
+       u64 h_ret;
+       struct ehca_shca *shca = container_of(ibdev, struct ehca_shca,
+                                             ib_device);
+       struct hipz_query_port *rblock;
+
+       rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+       if (!rblock) {
+               ehca_err(&shca->ib_device, "Can't allocate rblock memory.");
+               return -ENOMEM;
+       }
+
+       h_ret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock);
+       if (h_ret != H_SUCCESS) {
+               ehca_err(&shca->ib_device, "Can't query port properties");
+               ret = -EINVAL;
+               goto query_port1;
+       }
+
+       memset(props, 0, sizeof(struct ib_port_attr));
+
+       props->active_mtu = props->max_mtu = map_mtu(shca, rblock->max_mtu);
+       props->port_cap_flags  = rblock->capability_mask;
+       props->gid_tbl_len     = rblock->gid_tbl_len;
+       if (rblock->max_msg_sz)
+               props->max_msg_sz      = rblock->max_msg_sz;
+       else
+               props->max_msg_sz      = 0x1 << 31;
+       props->bad_pkey_cntr   = rblock->bad_pkey_cntr;
+       props->qkey_viol_cntr  = rblock->qkey_viol_cntr;
+       props->pkey_tbl_len    = rblock->pkey_tbl_len;
+       props->lid             = rblock->lid;
+       props->sm_lid          = rblock->sm_lid;
+       props->lmc             = rblock->lmc;
+       props->sm_sl           = rblock->sm_sl;
+       props->subnet_timeout  = rblock->subnet_timeout;
+       props->init_type_reply = rblock->init_type_reply;
+       props->max_vl_num      = map_number_of_vls(shca, rblock->vl_cap);
+
+       if (rblock->state && rblock->phys_width) {
+               props->phys_state      = rblock->phys_pstate;
+               props->state           = rblock->phys_state;
+               props->active_width    = rblock->phys_width;
+               props->active_speed    = rblock->phys_speed;
+       } else {
+               /* old firmware releases don't report physical
+                * port info, so use default values
+                */
+               props->phys_state      = 5;
+               props->state           = rblock->state;
+               props->active_width    = IB_WIDTH_12X;
+               props->active_speed    = IB_SPEED_SDR;
+       }
+
+query_port1:
+       ehca_free_fw_ctrlblock(rblock);
+
+       return ret;
+}
+
+int ehca_query_sma_attr(struct ehca_shca *shca,
+                       u8 port, struct ehca_sma_attr *attr)
+{
+       int ret = 0;
+       u64 h_ret;
+       struct hipz_query_port *rblock;
+
+       rblock = ehca_alloc_fw_ctrlblock(GFP_ATOMIC);
+       if (!rblock) {
+               ehca_err(&shca->ib_device, "Can't allocate rblock memory.");
+               return -ENOMEM;
+       }
+
+       h_ret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock);
+       if (h_ret != H_SUCCESS) {
+               ehca_err(&shca->ib_device, "Can't query port properties");
+               ret = -EINVAL;
+               goto query_sma_attr1;
+       }
+
+       memset(attr, 0, sizeof(struct ehca_sma_attr));
+
+       attr->lid    = rblock->lid;
+       attr->lmc    = rblock->lmc;
+       attr->sm_sl  = rblock->sm_sl;
+       attr->sm_lid = rblock->sm_lid;
+
+       attr->pkey_tbl_len = rblock->pkey_tbl_len;
+       memcpy(attr->pkeys, rblock->pkey_entries, sizeof(attr->pkeys));
+
+query_sma_attr1:
+       ehca_free_fw_ctrlblock(rblock);
+
+       return ret;
+}
+
+int ehca_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey)
+{
+       int ret = 0;
+       u64 h_ret;
+       struct ehca_shca *shca;
+       struct hipz_query_port *rblock;
+
+       shca = container_of(ibdev, struct ehca_shca, ib_device);
+       if (index > 16) {
+               ehca_err(&shca->ib_device, "Invalid index: %x.", index);
+               return -EINVAL;
+       }
+
+       rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+       if (!rblock) {
+               ehca_err(&shca->ib_device,  "Can't allocate rblock memory.");
+               return -ENOMEM;
+       }
+
+       h_ret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock);
+       if (h_ret != H_SUCCESS) {
+               ehca_err(&shca->ib_device, "Can't query port properties");
+               ret = -EINVAL;
+               goto query_pkey1;
+       }
+
+       memcpy(pkey, &rblock->pkey_entries + index, sizeof(u16));
+
+query_pkey1:
+       ehca_free_fw_ctrlblock(rblock);
+
+       return ret;
+}
+
+int ehca_query_gid(struct ib_device *ibdev, u8 port,
+                  int index, union ib_gid *gid)
+{
+       int ret = 0;
+       u64 h_ret;
+       struct ehca_shca *shca = container_of(ibdev, struct ehca_shca,
+                                             ib_device);
+       struct hipz_query_port *rblock;
+
+       if (index < 0 || index > 255) {
+               ehca_err(&shca->ib_device, "Invalid index: %x.", index);
+               return -EINVAL;
+       }
+
+       rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+       if (!rblock) {
+               ehca_err(&shca->ib_device, "Can't allocate rblock memory.");
+               return -ENOMEM;
+       }
+
+       h_ret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock);
+       if (h_ret != H_SUCCESS) {
+               ehca_err(&shca->ib_device, "Can't query port properties");
+               ret = -EINVAL;
+               goto query_gid1;
+       }
+
+       memcpy(&gid->raw[0], &rblock->gid_prefix, sizeof(u64));
+       memcpy(&gid->raw[8], &rblock->guid_entries[index], sizeof(u64));
+
+query_gid1:
+       ehca_free_fw_ctrlblock(rblock);
+
+       return ret;
+}
+
+static const u32 allowed_port_caps = (
+       IB_PORT_SM | IB_PORT_LED_INFO_SUP | IB_PORT_CM_SUP |
+       IB_PORT_SNMP_TUNNEL_SUP | IB_PORT_DEVICE_MGMT_SUP |
+       IB_PORT_VENDOR_CLASS_SUP);
+
+int ehca_modify_port(struct ib_device *ibdev,
+                    u8 port, int port_modify_mask,
+                    struct ib_port_modify *props)
+{
+       int ret = 0;
+       struct ehca_shca *shca;
+       struct hipz_query_port *rblock;
+       u32 cap;
+       u64 hret;
+
+       shca = container_of(ibdev, struct ehca_shca, ib_device);
+       if ((props->set_port_cap_mask | props->clr_port_cap_mask)
+           & ~allowed_port_caps) {
+               ehca_err(&shca->ib_device, "Non-changeable bits set in masks  "
+                        "set=%x  clr=%x  allowed=%x", props->set_port_cap_mask,
+                        props->clr_port_cap_mask, allowed_port_caps);
+               return -EINVAL;
+       }
+
+       if (mutex_lock_interruptible(&shca->modify_mutex))
+               return -ERESTARTSYS;
+
+       rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+       if (!rblock) {
+               ehca_err(&shca->ib_device,  "Can't allocate rblock memory.");
+               ret = -ENOMEM;
+               goto modify_port1;
+       }
+
+       hret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock);
+       if (hret != H_SUCCESS) {
+               ehca_err(&shca->ib_device, "Can't query port properties");
+               ret = -EINVAL;
+               goto modify_port2;
+       }
+
+       cap = (rblock->capability_mask | props->set_port_cap_mask)
+               & ~props->clr_port_cap_mask;
+
+       hret = hipz_h_modify_port(shca->ipz_hca_handle, port,
+                                 cap, props->init_type, port_modify_mask);
+       if (hret != H_SUCCESS) {
+               ehca_err(&shca->ib_device, "Modify port failed  h_ret=%lli",
+                        hret);
+               ret = -EINVAL;
+       }
+
+modify_port2:
+       ehca_free_fw_ctrlblock(rblock);
+
+modify_port1:
+       mutex_unlock(&shca->modify_mutex);
+
+       return ret;
+}
diff --git a/drivers/staging/rdma/ehca/ehca_irq.c b/drivers/staging/rdma/ehca/ehca_irq.c
new file mode 100644 (file)
index 0000000..8615d7c
--- /dev/null
@@ -0,0 +1,870 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  Functions for EQs, NEQs and interrupts
+ *
+ *  Authors: Heiko J Schick <schickhj@de.ibm.com>
+ *           Khadija Souissi <souissi@de.ibm.com>
+ *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *           Joachim Fenkes <fenkes@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/slab.h>
+#include <linux/smpboot.h>
+
+#include "ehca_classes.h"
+#include "ehca_irq.h"
+#include "ehca_iverbs.h"
+#include "ehca_tools.h"
+#include "hcp_if.h"
+#include "hipz_fns.h"
+#include "ipz_pt_fn.h"
+
+#define EQE_COMPLETION_EVENT   EHCA_BMASK_IBM( 1,  1)
+#define EQE_CQ_QP_NUMBER       EHCA_BMASK_IBM( 8, 31)
+#define EQE_EE_IDENTIFIER      EHCA_BMASK_IBM( 2,  7)
+#define EQE_CQ_NUMBER          EHCA_BMASK_IBM( 8, 31)
+#define EQE_QP_NUMBER          EHCA_BMASK_IBM( 8, 31)
+#define EQE_QP_TOKEN           EHCA_BMASK_IBM(32, 63)
+#define EQE_CQ_TOKEN           EHCA_BMASK_IBM(32, 63)
+
+#define NEQE_COMPLETION_EVENT  EHCA_BMASK_IBM( 1,  1)
+#define NEQE_EVENT_CODE        EHCA_BMASK_IBM( 2,  7)
+#define NEQE_PORT_NUMBER       EHCA_BMASK_IBM( 8, 15)
+#define NEQE_PORT_AVAILABILITY EHCA_BMASK_IBM(16, 16)
+#define NEQE_DISRUPTIVE        EHCA_BMASK_IBM(16, 16)
+#define NEQE_SPECIFIC_EVENT    EHCA_BMASK_IBM(16, 23)
+
+#define ERROR_DATA_LENGTH      EHCA_BMASK_IBM(52, 63)
+#define ERROR_DATA_TYPE        EHCA_BMASK_IBM( 0,  7)
+
+static void queue_comp_task(struct ehca_cq *__cq);
+
+static struct ehca_comp_pool *pool;
+
+static inline void comp_event_callback(struct ehca_cq *cq)
+{
+       if (!cq->ib_cq.comp_handler)
+               return;
+
+       spin_lock(&cq->cb_lock);
+       cq->ib_cq.comp_handler(&cq->ib_cq, cq->ib_cq.cq_context);
+       spin_unlock(&cq->cb_lock);
+
+       return;
+}
+
+static void print_error_data(struct ehca_shca *shca, void *data,
+                            u64 *rblock, int length)
+{
+       u64 type = EHCA_BMASK_GET(ERROR_DATA_TYPE, rblock[2]);
+       u64 resource = rblock[1];
+
+       switch (type) {
+       case 0x1: /* Queue Pair */
+       {
+               struct ehca_qp *qp = (struct ehca_qp *)data;
+
+               /* only print error data if AER is set */
+               if (rblock[6] == 0)
+                       return;
+
+               ehca_err(&shca->ib_device,
+                        "QP 0x%x (resource=%llx) has errors.",
+                        qp->ib_qp.qp_num, resource);
+               break;
+       }
+       case 0x4: /* Completion Queue */
+       {
+               struct ehca_cq *cq = (struct ehca_cq *)data;
+
+               ehca_err(&shca->ib_device,
+                        "CQ 0x%x (resource=%llx) has errors.",
+                        cq->cq_number, resource);
+               break;
+       }
+       default:
+               ehca_err(&shca->ib_device,
+                        "Unknown error type: %llx on %s.",
+                        type, shca->ib_device.name);
+               break;
+       }
+
+       ehca_err(&shca->ib_device, "Error data is available: %llx.", resource);
+       ehca_err(&shca->ib_device, "EHCA ----- error data begin "
+                "---------------------------------------------------");
+       ehca_dmp(rblock, length, "resource=%llx", resource);
+       ehca_err(&shca->ib_device, "EHCA ----- error data end "
+                "----------------------------------------------------");
+
+       return;
+}
+
+int ehca_error_data(struct ehca_shca *shca, void *data,
+                   u64 resource)
+{
+
+       unsigned long ret;
+       u64 *rblock;
+       unsigned long block_count;
+
+       rblock = ehca_alloc_fw_ctrlblock(GFP_ATOMIC);
+       if (!rblock) {
+               ehca_err(&shca->ib_device, "Cannot allocate rblock memory.");
+               ret = -ENOMEM;
+               goto error_data1;
+       }
+
+       /* rblock must be 4K aligned and should be 4K large */
+       ret = hipz_h_error_data(shca->ipz_hca_handle,
+                               resource,
+                               rblock,
+                               &block_count);
+
+       if (ret == H_R_STATE)
+               ehca_err(&shca->ib_device,
+                        "No error data is available: %llx.", resource);
+       else if (ret == H_SUCCESS) {
+               int length;
+
+               length = EHCA_BMASK_GET(ERROR_DATA_LENGTH, rblock[0]);
+
+               if (length > EHCA_PAGESIZE)
+                       length = EHCA_PAGESIZE;
+
+               print_error_data(shca, data, rblock, length);
+       } else
+               ehca_err(&shca->ib_device,
+                        "Error data could not be fetched: %llx", resource);
+
+       ehca_free_fw_ctrlblock(rblock);
+
+error_data1:
+       return ret;
+
+}
+
+static void dispatch_qp_event(struct ehca_shca *shca, struct ehca_qp *qp,
+                             enum ib_event_type event_type)
+{
+       struct ib_event event;
+
+       /* PATH_MIG without the QP ever having been armed is false alarm */
+       if (event_type == IB_EVENT_PATH_MIG && !qp->mig_armed)
+               return;
+
+       event.device = &shca->ib_device;
+       event.event = event_type;
+
+       if (qp->ext_type == EQPT_SRQ) {
+               if (!qp->ib_srq.event_handler)
+                       return;
+
+               event.element.srq = &qp->ib_srq;
+               qp->ib_srq.event_handler(&event, qp->ib_srq.srq_context);
+       } else {
+               if (!qp->ib_qp.event_handler)
+                       return;
+
+               event.element.qp = &qp->ib_qp;
+               qp->ib_qp.event_handler(&event, qp->ib_qp.qp_context);
+       }
+}
+
+static void qp_event_callback(struct ehca_shca *shca, u64 eqe,
+                             enum ib_event_type event_type, int fatal)
+{
+       struct ehca_qp *qp;
+       u32 token = EHCA_BMASK_GET(EQE_QP_TOKEN, eqe);
+
+       read_lock(&ehca_qp_idr_lock);
+       qp = idr_find(&ehca_qp_idr, token);
+       if (qp)
+               atomic_inc(&qp->nr_events);
+       read_unlock(&ehca_qp_idr_lock);
+
+       if (!qp)
+               return;
+
+       if (fatal)
+               ehca_error_data(shca, qp, qp->ipz_qp_handle.handle);
+
+       dispatch_qp_event(shca, qp, fatal && qp->ext_type == EQPT_SRQ ?
+                         IB_EVENT_SRQ_ERR : event_type);
+
+       /*
+        * eHCA only processes one WQE at a time for SRQ base QPs,
+        * so the last WQE has been processed as soon as the QP enters
+        * error state.
+        */
+       if (fatal && qp->ext_type == EQPT_SRQBASE)
+               dispatch_qp_event(shca, qp, IB_EVENT_QP_LAST_WQE_REACHED);
+
+       if (atomic_dec_and_test(&qp->nr_events))
+               wake_up(&qp->wait_completion);
+       return;
+}
+
+static void cq_event_callback(struct ehca_shca *shca,
+                             u64 eqe)
+{
+       struct ehca_cq *cq;
+       u32 token = EHCA_BMASK_GET(EQE_CQ_TOKEN, eqe);
+
+       read_lock(&ehca_cq_idr_lock);
+       cq = idr_find(&ehca_cq_idr, token);
+       if (cq)
+               atomic_inc(&cq->nr_events);
+       read_unlock(&ehca_cq_idr_lock);
+
+       if (!cq)
+               return;
+
+       ehca_error_data(shca, cq, cq->ipz_cq_handle.handle);
+
+       if (atomic_dec_and_test(&cq->nr_events))
+               wake_up(&cq->wait_completion);
+
+       return;
+}
+
+static void parse_identifier(struct ehca_shca *shca, u64 eqe)
+{
+       u8 identifier = EHCA_BMASK_GET(EQE_EE_IDENTIFIER, eqe);
+
+       switch (identifier) {
+       case 0x02: /* path migrated */
+               qp_event_callback(shca, eqe, IB_EVENT_PATH_MIG, 0);
+               break;
+       case 0x03: /* communication established */
+               qp_event_callback(shca, eqe, IB_EVENT_COMM_EST, 0);
+               break;
+       case 0x04: /* send queue drained */
+               qp_event_callback(shca, eqe, IB_EVENT_SQ_DRAINED, 0);
+               break;
+       case 0x05: /* QP error */
+       case 0x06: /* QP error */
+               qp_event_callback(shca, eqe, IB_EVENT_QP_FATAL, 1);
+               break;
+       case 0x07: /* CQ error */
+       case 0x08: /* CQ error */
+               cq_event_callback(shca, eqe);
+               break;
+       case 0x09: /* MRMWPTE error */
+               ehca_err(&shca->ib_device, "MRMWPTE error.");
+               break;
+       case 0x0A: /* port event */
+               ehca_err(&shca->ib_device, "Port event.");
+               break;
+       case 0x0B: /* MR access error */
+               ehca_err(&shca->ib_device, "MR access error.");
+               break;
+       case 0x0C: /* EQ error */
+               ehca_err(&shca->ib_device, "EQ error.");
+               break;
+       case 0x0D: /* P/Q_Key mismatch */
+               ehca_err(&shca->ib_device, "P/Q_Key mismatch.");
+               break;
+       case 0x10: /* sampling complete */
+               ehca_err(&shca->ib_device, "Sampling complete.");
+               break;
+       case 0x11: /* unaffiliated access error */
+               ehca_err(&shca->ib_device, "Unaffiliated access error.");
+               break;
+       case 0x12: /* path migrating */
+               ehca_err(&shca->ib_device, "Path migrating.");
+               break;
+       case 0x13: /* interface trace stopped */
+               ehca_err(&shca->ib_device, "Interface trace stopped.");
+               break;
+       case 0x14: /* first error capture info available */
+               ehca_info(&shca->ib_device, "First error capture available");
+               break;
+       case 0x15: /* SRQ limit reached */
+               qp_event_callback(shca, eqe, IB_EVENT_SRQ_LIMIT_REACHED, 0);
+               break;
+       default:
+               ehca_err(&shca->ib_device, "Unknown identifier: %x on %s.",
+                        identifier, shca->ib_device.name);
+               break;
+       }
+
+       return;
+}
+
+static void dispatch_port_event(struct ehca_shca *shca, int port_num,
+                               enum ib_event_type type, const char *msg)
+{
+       struct ib_event event;
+
+       ehca_info(&shca->ib_device, "port %d %s.", port_num, msg);
+       event.device = &shca->ib_device;
+       event.event = type;
+       event.element.port_num = port_num;
+       ib_dispatch_event(&event);
+}
+
+static void notify_port_conf_change(struct ehca_shca *shca, int port_num)
+{
+       struct ehca_sma_attr  new_attr;
+       struct ehca_sma_attr *old_attr = &shca->sport[port_num - 1].saved_attr;
+
+       ehca_query_sma_attr(shca, port_num, &new_attr);
+
+       if (new_attr.sm_sl  != old_attr->sm_sl ||
+           new_attr.sm_lid != old_attr->sm_lid)
+               dispatch_port_event(shca, port_num, IB_EVENT_SM_CHANGE,
+                                   "SM changed");
+
+       if (new_attr.lid != old_attr->lid ||
+           new_attr.lmc != old_attr->lmc)
+               dispatch_port_event(shca, port_num, IB_EVENT_LID_CHANGE,
+                                   "LID changed");
+
+       if (new_attr.pkey_tbl_len != old_attr->pkey_tbl_len ||
+           memcmp(new_attr.pkeys, old_attr->pkeys,
+                  sizeof(u16) * new_attr.pkey_tbl_len))
+               dispatch_port_event(shca, port_num, IB_EVENT_PKEY_CHANGE,
+                                   "P_Key changed");
+
+       *old_attr = new_attr;
+}
+
+/* replay modify_qp for sqps -- return 0 if all is well, 1 if AQP1 destroyed */
+static int replay_modify_qp(struct ehca_sport *sport)
+{
+       int aqp1_destroyed;
+       unsigned long flags;
+
+       spin_lock_irqsave(&sport->mod_sqp_lock, flags);
+
+       aqp1_destroyed = !sport->ibqp_sqp[IB_QPT_GSI];
+
+       if (sport->ibqp_sqp[IB_QPT_SMI])
+               ehca_recover_sqp(sport->ibqp_sqp[IB_QPT_SMI]);
+       if (!aqp1_destroyed)
+               ehca_recover_sqp(sport->ibqp_sqp[IB_QPT_GSI]);
+
+       spin_unlock_irqrestore(&sport->mod_sqp_lock, flags);
+
+       return aqp1_destroyed;
+}
+
+static void parse_ec(struct ehca_shca *shca, u64 eqe)
+{
+       u8 ec   = EHCA_BMASK_GET(NEQE_EVENT_CODE, eqe);
+       u8 port = EHCA_BMASK_GET(NEQE_PORT_NUMBER, eqe);
+       u8 spec_event;
+       struct ehca_sport *sport = &shca->sport[port - 1];
+
+       switch (ec) {
+       case 0x30: /* port availability change */
+               if (EHCA_BMASK_GET(NEQE_PORT_AVAILABILITY, eqe)) {
+                       /* only replay modify_qp calls in autodetect mode;
+                        * if AQP1 was destroyed, the port is already down
+                        * again and we can drop the event.
+                        */
+                       if (ehca_nr_ports < 0)
+                               if (replay_modify_qp(sport))
+                                       break;
+
+                       sport->port_state = IB_PORT_ACTIVE;
+                       dispatch_port_event(shca, port, IB_EVENT_PORT_ACTIVE,
+                                           "is active");
+                       ehca_query_sma_attr(shca, port, &sport->saved_attr);
+               } else {
+                       sport->port_state = IB_PORT_DOWN;
+                       dispatch_port_event(shca, port, IB_EVENT_PORT_ERR,
+                                           "is inactive");
+               }
+               break;
+       case 0x31:
+               /* port configuration change
+                * disruptive change is caused by
+                * LID, PKEY or SM change
+                */
+               if (EHCA_BMASK_GET(NEQE_DISRUPTIVE, eqe)) {
+                       ehca_warn(&shca->ib_device, "disruptive port "
+                                 "%d configuration change", port);
+
+                       sport->port_state = IB_PORT_DOWN;
+                       dispatch_port_event(shca, port, IB_EVENT_PORT_ERR,
+                                           "is inactive");
+
+                       sport->port_state = IB_PORT_ACTIVE;
+                       dispatch_port_event(shca, port, IB_EVENT_PORT_ACTIVE,
+                                           "is active");
+                       ehca_query_sma_attr(shca, port,
+                                           &sport->saved_attr);
+               } else
+                       notify_port_conf_change(shca, port);
+               break;
+       case 0x32: /* adapter malfunction */
+               ehca_err(&shca->ib_device, "Adapter malfunction.");
+               break;
+       case 0x33:  /* trace stopped */
+               ehca_err(&shca->ib_device, "Trace stopped.");
+               break;
+       case 0x34: /* util async event */
+               spec_event = EHCA_BMASK_GET(NEQE_SPECIFIC_EVENT, eqe);
+               if (spec_event == 0x80) /* client reregister required */
+                       dispatch_port_event(shca, port,
+                                           IB_EVENT_CLIENT_REREGISTER,
+                                           "client reregister req.");
+               else
+                       ehca_warn(&shca->ib_device, "Unknown util async "
+                                 "event %x on port %x", spec_event, port);
+               break;
+       default:
+               ehca_err(&shca->ib_device, "Unknown event code: %x on %s.",
+                        ec, shca->ib_device.name);
+               break;
+       }
+
+       return;
+}
+
+static inline void reset_eq_pending(struct ehca_cq *cq)
+{
+       u64 CQx_EP;
+       struct h_galpa gal = cq->galpas.kernel;
+
+       hipz_galpa_store_cq(gal, cqx_ep, 0x0);
+       CQx_EP = hipz_galpa_load(gal, CQTEMM_OFFSET(cqx_ep));
+
+       return;
+}
+
+irqreturn_t ehca_interrupt_neq(int irq, void *dev_id)
+{
+       struct ehca_shca *shca = (struct ehca_shca*)dev_id;
+
+       tasklet_hi_schedule(&shca->neq.interrupt_task);
+
+       return IRQ_HANDLED;
+}
+
+void ehca_tasklet_neq(unsigned long data)
+{
+       struct ehca_shca *shca = (struct ehca_shca*)data;
+       struct ehca_eqe *eqe;
+       u64 ret;
+
+       eqe = ehca_poll_eq(shca, &shca->neq);
+
+       while (eqe) {
+               if (!EHCA_BMASK_GET(NEQE_COMPLETION_EVENT, eqe->entry))
+                       parse_ec(shca, eqe->entry);
+
+               eqe = ehca_poll_eq(shca, &shca->neq);
+       }
+
+       ret = hipz_h_reset_event(shca->ipz_hca_handle,
+                                shca->neq.ipz_eq_handle, 0xFFFFFFFFFFFFFFFFL);
+
+       if (ret != H_SUCCESS)
+               ehca_err(&shca->ib_device, "Can't clear notification events.");
+
+       return;
+}
+
+irqreturn_t ehca_interrupt_eq(int irq, void *dev_id)
+{
+       struct ehca_shca *shca = (struct ehca_shca*)dev_id;
+
+       tasklet_hi_schedule(&shca->eq.interrupt_task);
+
+       return IRQ_HANDLED;
+}
+
+
+static inline void process_eqe(struct ehca_shca *shca, struct ehca_eqe *eqe)
+{
+       u64 eqe_value;
+       u32 token;
+       struct ehca_cq *cq;
+
+       eqe_value = eqe->entry;
+       ehca_dbg(&shca->ib_device, "eqe_value=%llx", eqe_value);
+       if (EHCA_BMASK_GET(EQE_COMPLETION_EVENT, eqe_value)) {
+               ehca_dbg(&shca->ib_device, "Got completion event");
+               token = EHCA_BMASK_GET(EQE_CQ_TOKEN, eqe_value);
+               read_lock(&ehca_cq_idr_lock);
+               cq = idr_find(&ehca_cq_idr, token);
+               if (cq)
+                       atomic_inc(&cq->nr_events);
+               read_unlock(&ehca_cq_idr_lock);
+               if (cq == NULL) {
+                       ehca_err(&shca->ib_device,
+                                "Invalid eqe for non-existing cq token=%x",
+                                token);
+                       return;
+               }
+               reset_eq_pending(cq);
+               if (ehca_scaling_code)
+                       queue_comp_task(cq);
+               else {
+                       comp_event_callback(cq);
+                       if (atomic_dec_and_test(&cq->nr_events))
+                               wake_up(&cq->wait_completion);
+               }
+       } else {
+               ehca_dbg(&shca->ib_device, "Got non completion event");
+               parse_identifier(shca, eqe_value);
+       }
+}
+
+void ehca_process_eq(struct ehca_shca *shca, int is_irq)
+{
+       struct ehca_eq *eq = &shca->eq;
+       struct ehca_eqe_cache_entry *eqe_cache = eq->eqe_cache;
+       u64 eqe_value, ret;
+       int eqe_cnt, i;
+       int eq_empty = 0;
+
+       spin_lock(&eq->irq_spinlock);
+       if (is_irq) {
+               const int max_query_cnt = 100;
+               int query_cnt = 0;
+               int int_state = 1;
+               do {
+                       int_state = hipz_h_query_int_state(
+                               shca->ipz_hca_handle, eq->ist);
+                       query_cnt++;
+                       iosync();
+               } while (int_state && query_cnt < max_query_cnt);
+               if (unlikely((query_cnt == max_query_cnt)))
+                       ehca_dbg(&shca->ib_device, "int_state=%x query_cnt=%x",
+                                int_state, query_cnt);
+       }
+
+       /* read out all eqes */
+       eqe_cnt = 0;
+       do {
+               u32 token;
+               eqe_cache[eqe_cnt].eqe = ehca_poll_eq(shca, eq);
+               if (!eqe_cache[eqe_cnt].eqe)
+                       break;
+               eqe_value = eqe_cache[eqe_cnt].eqe->entry;
+               if (EHCA_BMASK_GET(EQE_COMPLETION_EVENT, eqe_value)) {
+                       token = EHCA_BMASK_GET(EQE_CQ_TOKEN, eqe_value);
+                       read_lock(&ehca_cq_idr_lock);
+                       eqe_cache[eqe_cnt].cq = idr_find(&ehca_cq_idr, token);
+                       if (eqe_cache[eqe_cnt].cq)
+                               atomic_inc(&eqe_cache[eqe_cnt].cq->nr_events);
+                       read_unlock(&ehca_cq_idr_lock);
+                       if (!eqe_cache[eqe_cnt].cq) {
+                               ehca_err(&shca->ib_device,
+                                        "Invalid eqe for non-existing cq "
+                                        "token=%x", token);
+                               continue;
+                       }
+               } else
+                       eqe_cache[eqe_cnt].cq = NULL;
+               eqe_cnt++;
+       } while (eqe_cnt < EHCA_EQE_CACHE_SIZE);
+       if (!eqe_cnt) {
+               if (is_irq)
+                       ehca_dbg(&shca->ib_device,
+                                "No eqe found for irq event");
+               goto unlock_irq_spinlock;
+       } else if (!is_irq) {
+               ret = hipz_h_eoi(eq->ist);
+               if (ret != H_SUCCESS)
+                       ehca_err(&shca->ib_device,
+                                "bad return code from EOI: rc = %lld\n", ret);
+               ehca_dbg(&shca->ib_device, "deadman found %x eqe", eqe_cnt);
+       }
+       if (unlikely(eqe_cnt == EHCA_EQE_CACHE_SIZE))
+               ehca_dbg(&shca->ib_device, "too many eqes for one irq event");
+       /* enable irq for new packets */
+       for (i = 0; i < eqe_cnt; i++) {
+               if (eq->eqe_cache[i].cq)
+                       reset_eq_pending(eq->eqe_cache[i].cq);
+       }
+       /* check eq */
+       spin_lock(&eq->spinlock);
+       eq_empty = (!ipz_eqit_eq_peek_valid(&shca->eq.ipz_queue));
+       spin_unlock(&eq->spinlock);
+       /* call completion handler for cached eqes */
+       for (i = 0; i < eqe_cnt; i++)
+               if (eq->eqe_cache[i].cq) {
+                       if (ehca_scaling_code)
+                               queue_comp_task(eq->eqe_cache[i].cq);
+                       else {
+                               struct ehca_cq *cq = eq->eqe_cache[i].cq;
+                               comp_event_callback(cq);
+                               if (atomic_dec_and_test(&cq->nr_events))
+                                       wake_up(&cq->wait_completion);
+                       }
+               } else {
+                       ehca_dbg(&shca->ib_device, "Got non completion event");
+                       parse_identifier(shca, eq->eqe_cache[i].eqe->entry);
+               }
+       /* poll eq if not empty */
+       if (eq_empty)
+               goto unlock_irq_spinlock;
+       do {
+               struct ehca_eqe *eqe;
+               eqe = ehca_poll_eq(shca, &shca->eq);
+               if (!eqe)
+                       break;
+               process_eqe(shca, eqe);
+       } while (1);
+
+unlock_irq_spinlock:
+       spin_unlock(&eq->irq_spinlock);
+}
+
+void ehca_tasklet_eq(unsigned long data)
+{
+       ehca_process_eq((struct ehca_shca*)data, 1);
+}
+
+static int find_next_online_cpu(struct ehca_comp_pool *pool)
+{
+       int cpu;
+       unsigned long flags;
+
+       WARN_ON_ONCE(!in_interrupt());
+       if (ehca_debug_level >= 3)
+               ehca_dmp(cpu_online_mask, cpumask_size(), "");
+
+       spin_lock_irqsave(&pool->last_cpu_lock, flags);
+       do {
+               cpu = cpumask_next(pool->last_cpu, cpu_online_mask);
+               if (cpu >= nr_cpu_ids)
+                       cpu = cpumask_first(cpu_online_mask);
+               pool->last_cpu = cpu;
+       } while (!per_cpu_ptr(pool->cpu_comp_tasks, cpu)->active);
+       spin_unlock_irqrestore(&pool->last_cpu_lock, flags);
+
+       return cpu;
+}
+
+static void __queue_comp_task(struct ehca_cq *__cq,
+                             struct ehca_cpu_comp_task *cct,
+                             struct task_struct *thread)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&cct->task_lock, flags);
+       spin_lock(&__cq->task_lock);
+
+       if (__cq->nr_callbacks == 0) {
+               __cq->nr_callbacks++;
+               list_add_tail(&__cq->entry, &cct->cq_list);
+               cct->cq_jobs++;
+               wake_up_process(thread);
+       } else
+               __cq->nr_callbacks++;
+
+       spin_unlock(&__cq->task_lock);
+       spin_unlock_irqrestore(&cct->task_lock, flags);
+}
+
+static void queue_comp_task(struct ehca_cq *__cq)
+{
+       int cpu_id;
+       struct ehca_cpu_comp_task *cct;
+       struct task_struct *thread;
+       int cq_jobs;
+       unsigned long flags;
+
+       cpu_id = find_next_online_cpu(pool);
+       BUG_ON(!cpu_online(cpu_id));
+
+       cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu_id);
+       thread = *per_cpu_ptr(pool->cpu_comp_threads, cpu_id);
+       BUG_ON(!cct || !thread);
+
+       spin_lock_irqsave(&cct->task_lock, flags);
+       cq_jobs = cct->cq_jobs;
+       spin_unlock_irqrestore(&cct->task_lock, flags);
+       if (cq_jobs > 0) {
+               cpu_id = find_next_online_cpu(pool);
+               cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu_id);
+               thread = *per_cpu_ptr(pool->cpu_comp_threads, cpu_id);
+               BUG_ON(!cct || !thread);
+       }
+       __queue_comp_task(__cq, cct, thread);
+}
+
+static void run_comp_task(struct ehca_cpu_comp_task *cct)
+{
+       struct ehca_cq *cq;
+
+       while (!list_empty(&cct->cq_list)) {
+               cq = list_entry(cct->cq_list.next, struct ehca_cq, entry);
+               spin_unlock_irq(&cct->task_lock);
+
+               comp_event_callback(cq);
+               if (atomic_dec_and_test(&cq->nr_events))
+                       wake_up(&cq->wait_completion);
+
+               spin_lock_irq(&cct->task_lock);
+               spin_lock(&cq->task_lock);
+               cq->nr_callbacks--;
+               if (!cq->nr_callbacks) {
+                       list_del_init(cct->cq_list.next);
+                       cct->cq_jobs--;
+               }
+               spin_unlock(&cq->task_lock);
+       }
+}
+
+static void comp_task_park(unsigned int cpu)
+{
+       struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
+       struct ehca_cpu_comp_task *target;
+       struct task_struct *thread;
+       struct ehca_cq *cq, *tmp;
+       LIST_HEAD(list);
+
+       spin_lock_irq(&cct->task_lock);
+       cct->cq_jobs = 0;
+       cct->active = 0;
+       list_splice_init(&cct->cq_list, &list);
+       spin_unlock_irq(&cct->task_lock);
+
+       cpu = find_next_online_cpu(pool);
+       target = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
+       thread = *per_cpu_ptr(pool->cpu_comp_threads, cpu);
+       spin_lock_irq(&target->task_lock);
+       list_for_each_entry_safe(cq, tmp, &list, entry) {
+               list_del(&cq->entry);
+               __queue_comp_task(cq, target, thread);
+       }
+       spin_unlock_irq(&target->task_lock);
+}
+
+static void comp_task_stop(unsigned int cpu, bool online)
+{
+       struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
+
+       spin_lock_irq(&cct->task_lock);
+       cct->cq_jobs = 0;
+       cct->active = 0;
+       WARN_ON(!list_empty(&cct->cq_list));
+       spin_unlock_irq(&cct->task_lock);
+}
+
+static int comp_task_should_run(unsigned int cpu)
+{
+       struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
+
+       return cct->cq_jobs;
+}
+
+static void comp_task(unsigned int cpu)
+{
+       struct ehca_cpu_comp_task *cct = this_cpu_ptr(pool->cpu_comp_tasks);
+       int cql_empty;
+
+       spin_lock_irq(&cct->task_lock);
+       cql_empty = list_empty(&cct->cq_list);
+       if (!cql_empty) {
+               __set_current_state(TASK_RUNNING);
+               run_comp_task(cct);
+       }
+       spin_unlock_irq(&cct->task_lock);
+}
+
+static struct smp_hotplug_thread comp_pool_threads = {
+       .thread_should_run      = comp_task_should_run,
+       .thread_fn              = comp_task,
+       .thread_comm            = "ehca_comp/%u",
+       .cleanup                = comp_task_stop,
+       .park                   = comp_task_park,
+};
+
+int ehca_create_comp_pool(void)
+{
+       int cpu, ret = -ENOMEM;
+
+       if (!ehca_scaling_code)
+               return 0;
+
+       pool = kzalloc(sizeof(struct ehca_comp_pool), GFP_KERNEL);
+       if (pool == NULL)
+               return -ENOMEM;
+
+       spin_lock_init(&pool->last_cpu_lock);
+       pool->last_cpu = cpumask_any(cpu_online_mask);
+
+       pool->cpu_comp_tasks = alloc_percpu(struct ehca_cpu_comp_task);
+       if (!pool->cpu_comp_tasks)
+               goto out_pool;
+
+       pool->cpu_comp_threads = alloc_percpu(struct task_struct *);
+       if (!pool->cpu_comp_threads)
+               goto out_tasks;
+
+       for_each_present_cpu(cpu) {
+               struct ehca_cpu_comp_task *cct;
+
+               cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
+               spin_lock_init(&cct->task_lock);
+               INIT_LIST_HEAD(&cct->cq_list);
+       }
+
+       comp_pool_threads.store = pool->cpu_comp_threads;
+       ret = smpboot_register_percpu_thread(&comp_pool_threads);
+       if (ret)
+               goto out_threads;
+
+       pr_info("eHCA scaling code enabled\n");
+       return ret;
+
+out_threads:
+       free_percpu(pool->cpu_comp_threads);
+out_tasks:
+       free_percpu(pool->cpu_comp_tasks);
+out_pool:
+       kfree(pool);
+       return ret;
+}
+
+void ehca_destroy_comp_pool(void)
+{
+       if (!ehca_scaling_code)
+               return;
+
+       smpboot_unregister_percpu_thread(&comp_pool_threads);
+
+       free_percpu(pool->cpu_comp_threads);
+       free_percpu(pool->cpu_comp_tasks);
+       kfree(pool);
+}
diff --git a/drivers/staging/rdma/ehca/ehca_irq.h b/drivers/staging/rdma/ehca/ehca_irq.h
new file mode 100644 (file)
index 0000000..5370199
--- /dev/null
@@ -0,0 +1,77 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  Function definitions and structs for EQs, NEQs and interrupts
+ *
+ *  Authors: Heiko J Schick <schickhj@de.ibm.com>
+ *           Khadija Souissi <souissi@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __EHCA_IRQ_H
+#define __EHCA_IRQ_H
+
+
+struct ehca_shca;
+
+#include <linux/interrupt.h>
+#include <linux/types.h>
+
+int ehca_error_data(struct ehca_shca *shca, void *data, u64 resource);
+
+irqreturn_t ehca_interrupt_neq(int irq, void *dev_id);
+void ehca_tasklet_neq(unsigned long data);
+
+irqreturn_t ehca_interrupt_eq(int irq, void *dev_id);
+void ehca_tasklet_eq(unsigned long data);
+void ehca_process_eq(struct ehca_shca *shca, int is_irq);
+
+struct ehca_cpu_comp_task {
+       struct list_head cq_list;
+       spinlock_t task_lock;
+       int cq_jobs;
+       int active;
+};
+
+struct ehca_comp_pool {
+       struct ehca_cpu_comp_task __percpu *cpu_comp_tasks;
+       struct task_struct * __percpu *cpu_comp_threads;
+       int last_cpu;
+       spinlock_t last_cpu_lock;
+};
+
+int ehca_create_comp_pool(void);
+void ehca_destroy_comp_pool(void);
+
+#endif
diff --git a/drivers/staging/rdma/ehca/ehca_iverbs.h b/drivers/staging/rdma/ehca/ehca_iverbs.h
new file mode 100644 (file)
index 0000000..80e6a3d
--- /dev/null
@@ -0,0 +1,218 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  Function definitions for internal functions
+ *
+ *  Authors: Heiko J Schick <schickhj@de.ibm.com>
+ *           Dietmar Decker <ddecker@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __EHCA_IVERBS_H__
+#define __EHCA_IVERBS_H__
+
+#include "ehca_classes.h"
+
+int ehca_query_device(struct ib_device *ibdev, struct ib_device_attr *props,
+                     struct ib_udata *uhw);
+
+int ehca_query_port(struct ib_device *ibdev, u8 port,
+                   struct ib_port_attr *props);
+
+enum rdma_protocol_type
+ehca_query_protocol(struct ib_device *device, u8 port_num);
+
+int ehca_query_sma_attr(struct ehca_shca *shca, u8 port,
+                       struct ehca_sma_attr *attr);
+
+int ehca_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 * pkey);
+
+int ehca_query_gid(struct ib_device *ibdev, u8 port, int index,
+                  union ib_gid *gid);
+
+int ehca_modify_port(struct ib_device *ibdev, u8 port, int port_modify_mask,
+                    struct ib_port_modify *props);
+
+struct ib_pd *ehca_alloc_pd(struct ib_device *device,
+                           struct ib_ucontext *context,
+                           struct ib_udata *udata);
+
+int ehca_dealloc_pd(struct ib_pd *pd);
+
+struct ib_ah *ehca_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr);
+
+int ehca_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr);
+
+int ehca_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr);
+
+int ehca_destroy_ah(struct ib_ah *ah);
+
+struct ib_mr *ehca_get_dma_mr(struct ib_pd *pd, int mr_access_flags);
+
+struct ib_mr *ehca_reg_phys_mr(struct ib_pd *pd,
+                              struct ib_phys_buf *phys_buf_array,
+                              int num_phys_buf,
+                              int mr_access_flags, u64 *iova_start);
+
+struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+                              u64 virt, int mr_access_flags,
+                              struct ib_udata *udata);
+
+int ehca_rereg_phys_mr(struct ib_mr *mr,
+                      int mr_rereg_mask,
+                      struct ib_pd *pd,
+                      struct ib_phys_buf *phys_buf_array,
+                      int num_phys_buf, int mr_access_flags, u64 *iova_start);
+
+int ehca_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr);
+
+int ehca_dereg_mr(struct ib_mr *mr);
+
+struct ib_mw *ehca_alloc_mw(struct ib_pd *pd, enum ib_mw_type type);
+
+int ehca_bind_mw(struct ib_qp *qp, struct ib_mw *mw,
+                struct ib_mw_bind *mw_bind);
+
+int ehca_dealloc_mw(struct ib_mw *mw);
+
+struct ib_fmr *ehca_alloc_fmr(struct ib_pd *pd,
+                             int mr_access_flags,
+                             struct ib_fmr_attr *fmr_attr);
+
+int ehca_map_phys_fmr(struct ib_fmr *fmr,
+                     u64 *page_list, int list_len, u64 iova);
+
+int ehca_unmap_fmr(struct list_head *fmr_list);
+
+int ehca_dealloc_fmr(struct ib_fmr *fmr);
+
+enum ehca_eq_type {
+       EHCA_EQ = 0, /* Event Queue              */
+       EHCA_NEQ     /* Notification Event Queue */
+};
+
+int ehca_create_eq(struct ehca_shca *shca, struct ehca_eq *eq,
+                  enum ehca_eq_type type, const u32 length);
+
+int ehca_destroy_eq(struct ehca_shca *shca, struct ehca_eq *eq);
+
+void *ehca_poll_eq(struct ehca_shca *shca, struct ehca_eq *eq);
+
+
+struct ib_cq *ehca_create_cq(struct ib_device *device,
+                            const struct ib_cq_init_attr *attr,
+                            struct ib_ucontext *context,
+                            struct ib_udata *udata);
+
+int ehca_destroy_cq(struct ib_cq *cq);
+
+int ehca_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata);
+
+int ehca_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc);
+
+int ehca_peek_cq(struct ib_cq *cq, int wc_cnt);
+
+int ehca_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags notify_flags);
+
+struct ib_qp *ehca_create_qp(struct ib_pd *pd,
+                            struct ib_qp_init_attr *init_attr,
+                            struct ib_udata *udata);
+
+int ehca_destroy_qp(struct ib_qp *qp);
+
+int ehca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
+                  struct ib_udata *udata);
+
+int ehca_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr,
+                 int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr);
+
+int ehca_post_send(struct ib_qp *qp, struct ib_send_wr *send_wr,
+                  struct ib_send_wr **bad_send_wr);
+
+int ehca_post_recv(struct ib_qp *qp, struct ib_recv_wr *recv_wr,
+                  struct ib_recv_wr **bad_recv_wr);
+
+int ehca_post_srq_recv(struct ib_srq *srq,
+                      struct ib_recv_wr *recv_wr,
+                      struct ib_recv_wr **bad_recv_wr);
+
+struct ib_srq *ehca_create_srq(struct ib_pd *pd,
+                              struct ib_srq_init_attr *init_attr,
+                              struct ib_udata *udata);
+
+int ehca_modify_srq(struct ib_srq *srq, struct ib_srq_attr *attr,
+                   enum ib_srq_attr_mask attr_mask, struct ib_udata *udata);
+
+int ehca_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr);
+
+int ehca_destroy_srq(struct ib_srq *srq);
+
+u64 ehca_define_sqp(struct ehca_shca *shca, struct ehca_qp *ibqp,
+                   struct ib_qp_init_attr *qp_init_attr);
+
+int ehca_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid);
+
+int ehca_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid);
+
+struct ib_ucontext *ehca_alloc_ucontext(struct ib_device *device,
+                                       struct ib_udata *udata);
+
+int ehca_dealloc_ucontext(struct ib_ucontext *context);
+
+int ehca_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
+
+int ehca_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+                    const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+                    const struct ib_mad_hdr *in, size_t in_mad_size,
+                    struct ib_mad_hdr *out, size_t *out_mad_size,
+                    u16 *out_mad_pkey_index);
+
+void ehca_poll_eqs(unsigned long data);
+
+int ehca_calc_ipd(struct ehca_shca *shca, int port,
+                 enum ib_rate path_rate, u32 *ipd);
+
+void ehca_add_to_err_list(struct ehca_qp *qp, int on_sq);
+
+#ifdef CONFIG_PPC_64K_PAGES
+void *ehca_alloc_fw_ctrlblock(gfp_t flags);
+void ehca_free_fw_ctrlblock(void *ptr);
+#else
+#define ehca_alloc_fw_ctrlblock(flags) ((void *)get_zeroed_page(flags))
+#define ehca_free_fw_ctrlblock(ptr) free_page((unsigned long)(ptr))
+#endif
+
+void ehca_recover_sqp(struct ib_qp *sqp);
+
+#endif
diff --git a/drivers/staging/rdma/ehca/ehca_main.c b/drivers/staging/rdma/ehca/ehca_main.c
new file mode 100644 (file)
index 0000000..8246418
--- /dev/null
@@ -0,0 +1,1123 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  module start/stop, HCA detection
+ *
+ *  Authors: Heiko J Schick <schickhj@de.ibm.com>
+ *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *           Joachim Fenkes <fenkes@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef CONFIG_PPC_64K_PAGES
+#include <linux/slab.h>
+#endif
+
+#include <linux/notifier.h>
+#include <linux/memory.h>
+#include <rdma/ib_mad.h>
+#include "ehca_classes.h"
+#include "ehca_iverbs.h"
+#include "ehca_mrmw.h"
+#include "ehca_tools.h"
+#include "hcp_if.h"
+
+#define HCAD_VERSION "0029"
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Christoph Raisch <raisch@de.ibm.com>");
+MODULE_DESCRIPTION("IBM eServer HCA InfiniBand Device Driver");
+MODULE_VERSION(HCAD_VERSION);
+
+static bool ehca_open_aqp1    = 0;
+static int ehca_hw_level      = 0;
+static bool ehca_poll_all_eqs = 1;
+
+int ehca_debug_level   = 0;
+int ehca_nr_ports      = -1;
+bool ehca_use_hp_mr    = 0;
+int ehca_port_act_time = 30;
+int ehca_static_rate   = -1;
+bool ehca_scaling_code = 0;
+int ehca_lock_hcalls   = -1;
+int ehca_max_cq        = -1;
+int ehca_max_qp        = -1;
+
+module_param_named(open_aqp1,     ehca_open_aqp1,     bool, S_IRUGO);
+module_param_named(debug_level,   ehca_debug_level,   int,  S_IRUGO);
+module_param_named(hw_level,      ehca_hw_level,      int,  S_IRUGO);
+module_param_named(nr_ports,      ehca_nr_ports,      int,  S_IRUGO);
+module_param_named(use_hp_mr,     ehca_use_hp_mr,     bool, S_IRUGO);
+module_param_named(port_act_time, ehca_port_act_time, int,  S_IRUGO);
+module_param_named(poll_all_eqs,  ehca_poll_all_eqs,  bool, S_IRUGO);
+module_param_named(static_rate,   ehca_static_rate,   int,  S_IRUGO);
+module_param_named(scaling_code,  ehca_scaling_code,  bool, S_IRUGO);
+module_param_named(lock_hcalls,   ehca_lock_hcalls,   bint, S_IRUGO);
+module_param_named(number_of_cqs, ehca_max_cq,        int,  S_IRUGO);
+module_param_named(number_of_qps, ehca_max_qp,        int,  S_IRUGO);
+
+MODULE_PARM_DESC(open_aqp1,
+                "Open AQP1 on startup (default: no)");
+MODULE_PARM_DESC(debug_level,
+                "Amount of debug output (0: none (default), 1: traces, "
+                "2: some dumps, 3: lots)");
+MODULE_PARM_DESC(hw_level,
+                "Hardware level (0: autosensing (default), "
+                "0x10..0x14: eHCA, 0x20..0x23: eHCA2)");
+MODULE_PARM_DESC(nr_ports,
+                "number of connected ports (-1: autodetect (default), "
+                "1: port one only, 2: two ports)");
+MODULE_PARM_DESC(use_hp_mr,
+                "Use high performance MRs (default: no)");
+MODULE_PARM_DESC(port_act_time,
+                "Time to wait for port activation (default: 30 sec)");
+MODULE_PARM_DESC(poll_all_eqs,
+                "Poll all event queues periodically (default: yes)");
+MODULE_PARM_DESC(static_rate,
+                "Set permanent static rate (default: no static rate)");
+MODULE_PARM_DESC(scaling_code,
+                "Enable scaling code (default: no)");
+MODULE_PARM_DESC(lock_hcalls,
+                "Serialize all hCalls made by the driver "
+                "(default: autodetect)");
+MODULE_PARM_DESC(number_of_cqs,
+               "Max number of CQs which can be allocated "
+               "(default: autodetect)");
+MODULE_PARM_DESC(number_of_qps,
+               "Max number of QPs which can be allocated "
+               "(default: autodetect)");
+
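+/*
+ * Usage sketch (hypothetical values; assumes the driver is built as the
+ * ib_ehca module): all of the above are read-only module parameters, so a
+ * load such as
+ *   modprobe ib_ehca debug_level=1 nr_ports=2
+ * would enable trace output and force two-port operation for this boot.
+ */
+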
+DEFINE_RWLOCK(ehca_qp_idr_lock);
+DEFINE_RWLOCK(ehca_cq_idr_lock);
+DEFINE_IDR(ehca_qp_idr);
+DEFINE_IDR(ehca_cq_idr);
+
+static LIST_HEAD(shca_list); /* list of all registered ehcas */
+DEFINE_SPINLOCK(shca_list_lock);
+
+static struct timer_list poll_eqs_timer;
+
+#ifdef CONFIG_PPC_64K_PAGES
+static struct kmem_cache *ctblk_cache;
+
+void *ehca_alloc_fw_ctrlblock(gfp_t flags)
+{
+       void *ret = kmem_cache_zalloc(ctblk_cache, flags);
+       if (!ret)
+               ehca_gen_err("Out of memory for ctblk");
+       return ret;
+}
+
+void ehca_free_fw_ctrlblock(void *ptr)
+{
+       if (ptr)
+               kmem_cache_free(ctblk_cache, ptr);
+
+}
+#endif
+
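+/*
+ * Translate a hypervisor hCall return code into the value expected by the
+ * InfiniBand core: H_SUCCESS maps to 0, busy/in-use conditions to -EBUSY,
+ * resource and memory shortages to -ENOMEM, anything else to -EINVAL.
+ */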
+int ehca2ib_return_code(u64 ehca_rc)
+{
+       switch (ehca_rc) {
+       case H_SUCCESS:
+               return 0;
+       case H_RESOURCE:             /* Resource in use */
+       case H_BUSY:
+               return -EBUSY;
+       case H_NOT_ENOUGH_RESOURCES: /* insufficient resources */
+       case H_CONSTRAINED:          /* resource constraint */
+       case H_NO_MEM:
+               return -ENOMEM;
+       default:
+               return -EINVAL;
+       }
+}
+
+static int ehca_create_slab_caches(void)
+{
+       int ret;
+
+       ret = ehca_init_pd_cache();
+       if (ret) {
+               ehca_gen_err("Cannot create PD SLAB cache.");
+               return ret;
+       }
+
+       ret = ehca_init_cq_cache();
+       if (ret) {
+               ehca_gen_err("Cannot create CQ SLAB cache.");
+               goto create_slab_caches2;
+       }
+
+       ret = ehca_init_qp_cache();
+       if (ret) {
+               ehca_gen_err("Cannot create QP SLAB cache.");
+               goto create_slab_caches3;
+       }
+
+       ret = ehca_init_av_cache();
+       if (ret) {
+               ehca_gen_err("Cannot create AV SLAB cache.");
+               goto create_slab_caches4;
+       }
+
+       ret = ehca_init_mrmw_cache();
+       if (ret) {
+               ehca_gen_err("Cannot create MR&MW SLAB cache.");
+               goto create_slab_caches5;
+       }
+
+       ret = ehca_init_small_qp_cache();
+       if (ret) {
+               ehca_gen_err("Cannot create small queue SLAB cache.");
+               goto create_slab_caches6;
+       }
+
+#ifdef CONFIG_PPC_64K_PAGES
+       ctblk_cache = kmem_cache_create("ehca_cache_ctblk",
+                                       EHCA_PAGESIZE, H_CB_ALIGNMENT,
+                                       SLAB_HWCACHE_ALIGN,
+                                       NULL);
+       if (!ctblk_cache) {
+               ehca_gen_err("Cannot create ctblk SLAB cache.");
+               ehca_cleanup_small_qp_cache();
+               ret = -ENOMEM;
+               goto create_slab_caches6;
+       }
+#endif
+       return 0;
+
+create_slab_caches6:
+       ehca_cleanup_mrmw_cache();
+
+create_slab_caches5:
+       ehca_cleanup_av_cache();
+
+create_slab_caches4:
+       ehca_cleanup_qp_cache();
+
+create_slab_caches3:
+       ehca_cleanup_cq_cache();
+
+create_slab_caches2:
+       ehca_cleanup_pd_cache();
+
+       return ret;
+}
+
+static void ehca_destroy_slab_caches(void)
+{
+       ehca_cleanup_small_qp_cache();
+       ehca_cleanup_mrmw_cache();
+       ehca_cleanup_av_cache();
+       ehca_cleanup_qp_cache();
+       ehca_cleanup_cq_cache();
+       ehca_cleanup_pd_cache();
+#ifdef CONFIG_PPC_64K_PAGES
+       if (ctblk_cache)
+               kmem_cache_destroy(ctblk_cache);
+#endif
+}
+
+#define EHCA_HCAAVER  EHCA_BMASK_IBM(32, 39)
+#define EHCA_REVID    EHCA_BMASK_IBM(40, 63)
+
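+/* Human-readable names for the HCA capability bits reported by firmware,
+ * used for the debug dump in ehca_sense_attributes() below.
+ */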
+static struct cap_descr {
+       u64 mask;
+       char *descr;
+} hca_cap_descr[] = {
+       { HCA_CAP_AH_PORT_NR_CHECK, "HCA_CAP_AH_PORT_NR_CHECK" },
+       { HCA_CAP_ATOMIC, "HCA_CAP_ATOMIC" },
+       { HCA_CAP_AUTO_PATH_MIG, "HCA_CAP_AUTO_PATH_MIG" },
+       { HCA_CAP_BAD_P_KEY_CTR, "HCA_CAP_BAD_P_KEY_CTR" },
+       { HCA_CAP_SQD_RTS_PORT_CHANGE, "HCA_CAP_SQD_RTS_PORT_CHANGE" },
+       { HCA_CAP_CUR_QP_STATE_MOD, "HCA_CAP_CUR_QP_STATE_MOD" },
+       { HCA_CAP_INIT_TYPE, "HCA_CAP_INIT_TYPE" },
+       { HCA_CAP_PORT_ACTIVE_EVENT, "HCA_CAP_PORT_ACTIVE_EVENT" },
+       { HCA_CAP_Q_KEY_VIOL_CTR, "HCA_CAP_Q_KEY_VIOL_CTR" },
+       { HCA_CAP_WQE_RESIZE, "HCA_CAP_WQE_RESIZE" },
+       { HCA_CAP_RAW_PACKET_MCAST, "HCA_CAP_RAW_PACKET_MCAST" },
+       { HCA_CAP_SHUTDOWN_PORT, "HCA_CAP_SHUTDOWN_PORT" },
+       { HCA_CAP_RC_LL_QP, "HCA_CAP_RC_LL_QP" },
+       { HCA_CAP_SRQ, "HCA_CAP_SRQ" },
+       { HCA_CAP_UD_LL_QP, "HCA_CAP_UD_LL_QP" },
+       { HCA_CAP_RESIZE_MR, "HCA_CAP_RESIZE_MR" },
+       { HCA_CAP_MINI_QP, "HCA_CAP_MINI_QP" },
+       { HCA_CAP_H_ALLOC_RES_SYNC, "HCA_CAP_H_ALLOC_RES_SYNC" },
+};
+
+static int ehca_sense_attributes(struct ehca_shca *shca)
+{
+       int i, ret = 0;
+       u64 h_ret;
+       struct hipz_query_hca *rblock;
+       struct hipz_query_port *port;
+       const char *loc_code;
+
+       static const u32 pgsize_map[] = {
+               HCA_CAP_MR_PGSIZE_4K,  0x1000,
+               HCA_CAP_MR_PGSIZE_64K, 0x10000,
+               HCA_CAP_MR_PGSIZE_1M,  0x100000,
+               HCA_CAP_MR_PGSIZE_16M, 0x1000000,
+       };
+
+       ehca_gen_dbg("Probing adapter %s...",
+                    shca->ofdev->dev.of_node->full_name);
+       loc_code = of_get_property(shca->ofdev->dev.of_node, "ibm,loc-code",
+                                  NULL);
+       if (loc_code)
+               ehca_gen_dbg(" ... location code=%s", loc_code);
+
+       rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+       if (!rblock) {
+               ehca_gen_err("Cannot allocate rblock memory.");
+               return -ENOMEM;
+       }
+
+       h_ret = hipz_h_query_hca(shca->ipz_hca_handle, rblock);
+       if (h_ret != H_SUCCESS) {
+               ehca_gen_err("Cannot query device properties. h_ret=%lli",
+                            h_ret);
+               ret = -EPERM;
+               goto sense_attributes1;
+       }
+
+       if (ehca_nr_ports == 1)
+               shca->num_ports = 1;
+       else
+               shca->num_ports = (u8)rblock->num_ports;
+
+       ehca_gen_dbg(" ... found %x ports", rblock->num_ports);
+
+       if (ehca_hw_level == 0) {
+               u32 hcaaver;
+               u32 revid;
+
+               hcaaver = EHCA_BMASK_GET(EHCA_HCAAVER, rblock->hw_ver);
+               revid   = EHCA_BMASK_GET(EHCA_REVID, rblock->hw_ver);
+
+               ehca_gen_dbg(" ... hardware version=%x:%x", hcaaver, revid);
+
+               if (hcaaver == 1) {
+                       if (revid <= 3)
+                               shca->hw_level = 0x10 | (revid + 1);
+                       else
+                               shca->hw_level = 0x14;
+               } else if (hcaaver == 2) {
+                       if (revid == 0)
+                               shca->hw_level = 0x21;
+                       else if (revid == 0x10)
+                               shca->hw_level = 0x22;
+                       else if (revid == 0x20 || revid == 0x21)
+                               shca->hw_level = 0x23;
+               }
+
+               if (!shca->hw_level) {
+                       ehca_gen_warn("unknown hardware version"
+                                     " - assuming default level");
+                       shca->hw_level = 0x22;
+               }
+       } else
+               shca->hw_level = ehca_hw_level;
+       ehca_gen_dbg(" ... hardware level=%x", shca->hw_level);
+
+       shca->hca_cap = rblock->hca_cap_indicators;
+       ehca_gen_dbg(" ... HCA capabilities:");
+       for (i = 0; i < ARRAY_SIZE(hca_cap_descr); i++)
+               if (EHCA_BMASK_GET(hca_cap_descr[i].mask, shca->hca_cap))
+                       ehca_gen_dbg("   %s", hca_cap_descr[i].descr);
+
+       /* Autodetect hCall locking -- the "H_ALLOC_RESOURCE synced" flag is
+        * a firmware property, so it's valid across all adapters
+        */
+       if (ehca_lock_hcalls == -1)
+               ehca_lock_hcalls = !EHCA_BMASK_GET(HCA_CAP_H_ALLOC_RES_SYNC,
+                                       shca->hca_cap);
+
+       /* translate supported MR page sizes; always support 4K */
+       shca->hca_cap_mr_pgsize = EHCA_PAGESIZE;
+       for (i = 0; i < ARRAY_SIZE(pgsize_map); i += 2)
+               if (rblock->memory_page_size_supported & pgsize_map[i])
+                       shca->hca_cap_mr_pgsize |= pgsize_map[i + 1];
+
+       /* Set maximum number of CQs and QPs to calculate EQ size */
+       if (shca->max_num_qps == -1)
+               shca->max_num_qps = min_t(int, rblock->max_qp,
+                                         EHCA_MAX_NUM_QUEUES);
+       else if (shca->max_num_qps < 1 || shca->max_num_qps > rblock->max_qp) {
+               ehca_gen_warn("The requested number of QPs is out of range "
+                             "(1 - %i) specified by HW. Value is set to %i",
+                             rblock->max_qp, rblock->max_qp);
+               shca->max_num_qps = rblock->max_qp;
+       }
+
+       if (shca->max_num_cqs == -1)
+               shca->max_num_cqs = min_t(int, rblock->max_cq,
+                                         EHCA_MAX_NUM_QUEUES);
+       else if (shca->max_num_cqs < 1 || shca->max_num_cqs > rblock->max_cq) {
+               ehca_gen_warn("The requested number of CQs is out of range "
+                             "(1 - %i) specified by HW. Value is set to %i",
+                             rblock->max_cq, rblock->max_cq);
+               shca->max_num_cqs = rblock->max_cq;
+       }
+
+       /* query max MTU from first port -- it's the same for all ports */
+       port = (struct hipz_query_port *)rblock;
+       h_ret = hipz_h_query_port(shca->ipz_hca_handle, 1, port);
+       if (h_ret != H_SUCCESS) {
+               ehca_gen_err("Cannot query port properties. h_ret=%lli",
+                            h_ret);
+               ret = -EPERM;
+               goto sense_attributes1;
+       }
+
+       shca->max_mtu = port->max_mtu;
+
+sense_attributes1:
+       ehca_free_fw_ctrlblock(rblock);
+       return ret;
+}
+
+static int init_node_guid(struct ehca_shca *shca)
+{
+       int ret = 0;
+       struct hipz_query_hca *rblock;
+
+       rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+       if (!rblock) {
+               ehca_err(&shca->ib_device, "Can't allocate rblock memory.");
+               return -ENOMEM;
+       }
+
+       if (hipz_h_query_hca(shca->ipz_hca_handle, rblock) != H_SUCCESS) {
+               ehca_err(&shca->ib_device, "Can't query device properties");
+               ret = -EINVAL;
+               goto init_node_guid1;
+       }
+
+       memcpy(&shca->ib_device.node_guid, &rblock->node_guid, sizeof(u64));
+
+init_node_guid1:
+       ehca_free_fw_ctrlblock(rblock);
+       return ret;
+}
+
+static int ehca_port_immutable(struct ib_device *ibdev, u8 port_num,
+                              struct ib_port_immutable *immutable)
+{
+       struct ib_port_attr attr;
+       int err;
+
+       err = ehca_query_port(ibdev, port_num, &attr);
+       if (err)
+               return err;
+
+       immutable->pkey_tbl_len = attr.pkey_tbl_len;
+       immutable->gid_tbl_len = attr.gid_tbl_len;
+       immutable->core_cap_flags = RDMA_CORE_PORT_IBA_IB;
+       immutable->max_mad_size = IB_MGMT_MAD_SIZE;
+
+       return 0;
+}
+
+static int ehca_init_device(struct ehca_shca *shca)
+{
+       int ret;
+
+       ret = init_node_guid(shca);
+       if (ret)
+               return ret;
+
+       strlcpy(shca->ib_device.name, "ehca%d", IB_DEVICE_NAME_MAX);
+       shca->ib_device.owner               = THIS_MODULE;
+
+       shca->ib_device.uverbs_abi_ver      = 8;
+       shca->ib_device.uverbs_cmd_mask     =
+               (1ull << IB_USER_VERBS_CMD_GET_CONTEXT)         |
+               (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)        |
+               (1ull << IB_USER_VERBS_CMD_QUERY_PORT)          |
+               (1ull << IB_USER_VERBS_CMD_ALLOC_PD)            |
+               (1ull << IB_USER_VERBS_CMD_DEALLOC_PD)          |
+               (1ull << IB_USER_VERBS_CMD_REG_MR)              |
+               (1ull << IB_USER_VERBS_CMD_DEREG_MR)            |
+               (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+               (1ull << IB_USER_VERBS_CMD_CREATE_CQ)           |
+               (1ull << IB_USER_VERBS_CMD_DESTROY_CQ)          |
+               (1ull << IB_USER_VERBS_CMD_CREATE_QP)           |
+               (1ull << IB_USER_VERBS_CMD_MODIFY_QP)           |
+               (1ull << IB_USER_VERBS_CMD_QUERY_QP)            |
+               (1ull << IB_USER_VERBS_CMD_DESTROY_QP)          |
+               (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST)        |
+               (1ull << IB_USER_VERBS_CMD_DETACH_MCAST);
+
+       shca->ib_device.node_type           = RDMA_NODE_IB_CA;
+       shca->ib_device.phys_port_cnt       = shca->num_ports;
+       shca->ib_device.num_comp_vectors    = 1;
+       shca->ib_device.dma_device          = &shca->ofdev->dev;
+       shca->ib_device.query_device        = ehca_query_device;
+       shca->ib_device.query_port          = ehca_query_port;
+       shca->ib_device.query_gid           = ehca_query_gid;
+       shca->ib_device.query_pkey          = ehca_query_pkey;
+       /* shca->ib_device.modify_device    = ehca_modify_device    */
+       shca->ib_device.modify_port         = ehca_modify_port;
+       shca->ib_device.alloc_ucontext      = ehca_alloc_ucontext;
+       shca->ib_device.dealloc_ucontext    = ehca_dealloc_ucontext;
+       shca->ib_device.alloc_pd            = ehca_alloc_pd;
+       shca->ib_device.dealloc_pd          = ehca_dealloc_pd;
+       shca->ib_device.create_ah           = ehca_create_ah;
+       /* shca->ib_device.modify_ah        = ehca_modify_ah;       */
+       shca->ib_device.query_ah            = ehca_query_ah;
+       shca->ib_device.destroy_ah          = ehca_destroy_ah;
+       shca->ib_device.create_qp           = ehca_create_qp;
+       shca->ib_device.modify_qp           = ehca_modify_qp;
+       shca->ib_device.query_qp            = ehca_query_qp;
+       shca->ib_device.destroy_qp          = ehca_destroy_qp;
+       shca->ib_device.post_send           = ehca_post_send;
+       shca->ib_device.post_recv           = ehca_post_recv;
+       shca->ib_device.create_cq           = ehca_create_cq;
+       shca->ib_device.destroy_cq          = ehca_destroy_cq;
+       shca->ib_device.resize_cq           = ehca_resize_cq;
+       shca->ib_device.poll_cq             = ehca_poll_cq;
+       /* shca->ib_device.peek_cq          = ehca_peek_cq;         */
+       shca->ib_device.req_notify_cq       = ehca_req_notify_cq;
+       /* shca->ib_device.req_ncomp_notif  = ehca_req_ncomp_notif; */
+       shca->ib_device.get_dma_mr          = ehca_get_dma_mr;
+       shca->ib_device.reg_phys_mr         = ehca_reg_phys_mr;
+       shca->ib_device.reg_user_mr         = ehca_reg_user_mr;
+       shca->ib_device.query_mr            = ehca_query_mr;
+       shca->ib_device.dereg_mr            = ehca_dereg_mr;
+       shca->ib_device.rereg_phys_mr       = ehca_rereg_phys_mr;
+       shca->ib_device.alloc_mw            = ehca_alloc_mw;
+       shca->ib_device.bind_mw             = ehca_bind_mw;
+       shca->ib_device.dealloc_mw          = ehca_dealloc_mw;
+       shca->ib_device.alloc_fmr           = ehca_alloc_fmr;
+       shca->ib_device.map_phys_fmr        = ehca_map_phys_fmr;
+       shca->ib_device.unmap_fmr           = ehca_unmap_fmr;
+       shca->ib_device.dealloc_fmr         = ehca_dealloc_fmr;
+       shca->ib_device.attach_mcast        = ehca_attach_mcast;
+       shca->ib_device.detach_mcast        = ehca_detach_mcast;
+       shca->ib_device.process_mad         = ehca_process_mad;
+       shca->ib_device.mmap                = ehca_mmap;
+       shca->ib_device.dma_ops             = &ehca_dma_mapping_ops;
+       shca->ib_device.get_port_immutable  = ehca_port_immutable;
+
+       if (EHCA_BMASK_GET(HCA_CAP_SRQ, shca->hca_cap)) {
+               shca->ib_device.uverbs_cmd_mask |=
+                       (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) |
+                       (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) |
+                       (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) |
+                       (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ);
+
+               shca->ib_device.create_srq          = ehca_create_srq;
+               shca->ib_device.modify_srq          = ehca_modify_srq;
+               shca->ib_device.query_srq           = ehca_query_srq;
+               shca->ib_device.destroy_srq         = ehca_destroy_srq;
+               shca->ib_device.post_srq_recv       = ehca_post_srq_recv;
+       }
+
+       return ret;
+}
+
+static int ehca_create_aqp1(struct ehca_shca *shca, u32 port)
+{
+       struct ehca_sport *sport = &shca->sport[port - 1];
+       struct ib_cq *ibcq;
+       struct ib_qp *ibqp;
+       struct ib_qp_init_attr qp_init_attr;
+       struct ib_cq_init_attr cq_attr = {};
+       int ret;
+
+       if (sport->ibcq_aqp1) {
+               ehca_err(&shca->ib_device, "AQP1 CQ is already created.");
+               return -EPERM;
+       }
+
+       cq_attr.cqe = 10;
+       ibcq = ib_create_cq(&shca->ib_device, NULL, NULL, (void *)(-1),
+                           &cq_attr);
+       if (IS_ERR(ibcq)) {
+               ehca_err(&shca->ib_device, "Cannot create AQP1 CQ.");
+               return PTR_ERR(ibcq);
+       }
+       sport->ibcq_aqp1 = ibcq;
+
+       if (sport->ibqp_sqp[IB_QPT_GSI]) {
+               ehca_err(&shca->ib_device, "AQP1 QP is already created.");
+               ret = -EPERM;
+               goto create_aqp1;
+       }
+
+       memset(&qp_init_attr, 0, sizeof(struct ib_qp_init_attr));
+       qp_init_attr.send_cq          = ibcq;
+       qp_init_attr.recv_cq          = ibcq;
+       qp_init_attr.sq_sig_type      = IB_SIGNAL_ALL_WR;
+       qp_init_attr.cap.max_send_wr  = 100;
+       qp_init_attr.cap.max_recv_wr  = 100;
+       qp_init_attr.cap.max_send_sge = 2;
+       qp_init_attr.cap.max_recv_sge = 1;
+       qp_init_attr.qp_type          = IB_QPT_GSI;
+       qp_init_attr.port_num         = port;
+       qp_init_attr.qp_context       = NULL;
+       qp_init_attr.event_handler    = NULL;
+       qp_init_attr.srq              = NULL;
+
+       ibqp = ib_create_qp(&shca->pd->ib_pd, &qp_init_attr);
+       if (IS_ERR(ibqp)) {
+               ehca_err(&shca->ib_device, "Cannot create AQP1 QP.");
+               ret = PTR_ERR(ibqp);
+               goto create_aqp1;
+       }
+       sport->ibqp_sqp[IB_QPT_GSI] = ibqp;
+
+       return 0;
+
+create_aqp1:
+       ib_destroy_cq(sport->ibcq_aqp1);
+       return ret;
+}
+
+static int ehca_destroy_aqp1(struct ehca_sport *sport)
+{
+       int ret;
+
+       ret = ib_destroy_qp(sport->ibqp_sqp[IB_QPT_GSI]);
+       if (ret) {
+               ehca_gen_err("Cannot destroy AQP1 QP. ret=%i", ret);
+               return ret;
+       }
+
+       ret = ib_destroy_cq(sport->ibcq_aqp1);
+       if (ret)
+               ehca_gen_err("Cannot destroy AQP1 CQ. ret=%i", ret);
+
+       return ret;
+}
+
+static ssize_t ehca_show_debug_level(struct device_driver *ddp, char *buf)
+{
+       return snprintf(buf, PAGE_SIZE, "%d\n", ehca_debug_level);
+}
+
+static ssize_t ehca_store_debug_level(struct device_driver *ddp,
+                                     const char *buf, size_t count)
+{
+       int value = (*buf) - '0';
+       if (value >= 0 && value <= 9)
+               ehca_debug_level = value;
+       return 1;
+}
+
+static DRIVER_ATTR(debug_level, S_IRUSR | S_IWUSR,
+                  ehca_show_debug_level, ehca_store_debug_level);
+
+static struct attribute *ehca_drv_attrs[] = {
+       &driver_attr_debug_level.attr,
+       NULL
+};
+
+static struct attribute_group ehca_drv_attr_grp = {
+       .attrs = ehca_drv_attrs
+};
+
+static const struct attribute_group *ehca_drv_attr_groups[] = {
+       &ehca_drv_attr_grp,
+       NULL,
+};
+
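+/*
+ * EHCA_RESOURCE_ATTR(name) generates a read-only sysfs attribute whose show
+ * function queries a fresh firmware control block and prints the field of
+ * the same name from struct hipz_query_hca; num_ports is special-cased when
+ * the nr_ports module parameter forces single-port operation.
+ */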
+#define EHCA_RESOURCE_ATTR(name)                                           \
+static ssize_t  ehca_show_##name(struct device *dev,                       \
+                                struct device_attribute *attr,            \
+                                char *buf)                                \
+{                                                                         \
+       struct ehca_shca *shca;                                            \
+       struct hipz_query_hca *rblock;                                     \
+       int data;                                                          \
+                                                                          \
+       shca = dev_get_drvdata(dev);                                       \
+                                                                          \
+       rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);                      \
+       if (!rblock) {                                                     \
+               dev_err(dev, "Can't allocate rblock memory.\n");           \
+               return 0;                                                  \
+       }                                                                  \
+                                                                          \
+       if (hipz_h_query_hca(shca->ipz_hca_handle, rblock) != H_SUCCESS) { \
+               dev_err(dev, "Can't query device properties\n");           \
+               ehca_free_fw_ctrlblock(rblock);                            \
+               return 0;                                                  \
+       }                                                                  \
+                                                                          \
+       data = rblock->name;                                               \
+       ehca_free_fw_ctrlblock(rblock);                                    \
+                                                                          \
+       if ((strcmp(#name, "num_ports") == 0) && (ehca_nr_ports == 1))     \
+               return snprintf(buf, 256, "1\n");                          \
+       else                                                               \
+               return snprintf(buf, 256, "%d\n", data);                   \
+                                                                          \
+}                                                                         \
+static DEVICE_ATTR(name, S_IRUGO, ehca_show_##name, NULL);
+
+EHCA_RESOURCE_ATTR(num_ports);
+EHCA_RESOURCE_ATTR(hw_ver);
+EHCA_RESOURCE_ATTR(max_eq);
+EHCA_RESOURCE_ATTR(cur_eq);
+EHCA_RESOURCE_ATTR(max_cq);
+EHCA_RESOURCE_ATTR(cur_cq);
+EHCA_RESOURCE_ATTR(max_qp);
+EHCA_RESOURCE_ATTR(cur_qp);
+EHCA_RESOURCE_ATTR(max_mr);
+EHCA_RESOURCE_ATTR(cur_mr);
+EHCA_RESOURCE_ATTR(max_mw);
+EHCA_RESOURCE_ATTR(cur_mw);
+EHCA_RESOURCE_ATTR(max_pd);
+EHCA_RESOURCE_ATTR(max_ah);
+
+static ssize_t ehca_show_adapter_handle(struct device *dev,
+                                       struct device_attribute *attr,
+                                       char *buf)
+{
+       struct ehca_shca *shca = dev_get_drvdata(dev);
+
+       return sprintf(buf, "%llx\n", shca->ipz_hca_handle.handle);
+
+}
+static DEVICE_ATTR(adapter_handle, S_IRUGO, ehca_show_adapter_handle, NULL);
+
+static struct attribute *ehca_dev_attrs[] = {
+       &dev_attr_adapter_handle.attr,
+       &dev_attr_num_ports.attr,
+       &dev_attr_hw_ver.attr,
+       &dev_attr_max_eq.attr,
+       &dev_attr_cur_eq.attr,
+       &dev_attr_max_cq.attr,
+       &dev_attr_cur_cq.attr,
+       &dev_attr_max_qp.attr,
+       &dev_attr_cur_qp.attr,
+       &dev_attr_max_mr.attr,
+       &dev_attr_cur_mr.attr,
+       &dev_attr_max_mw.attr,
+       &dev_attr_cur_mw.attr,
+       &dev_attr_max_pd.attr,
+       &dev_attr_max_ah.attr,
+       NULL
+};
+
+static struct attribute_group ehca_dev_attr_grp = {
+       .attrs = ehca_dev_attrs
+};
+
+static int ehca_probe(struct platform_device *dev)
+{
+       struct ehca_shca *shca;
+       const u64 *handle;
+       struct ib_pd *ibpd;
+       int ret, i, eq_size;
+       unsigned long flags;
+
+       handle = of_get_property(dev->dev.of_node, "ibm,hca-handle", NULL);
+       if (!handle) {
+               ehca_gen_err("Cannot get eHCA handle for adapter: %s.",
+                            dev->dev.of_node->full_name);
+               return -ENODEV;
+       }
+
+       if (!(*handle)) {
+               ehca_gen_err("Wrong eHCA handle for adapter: %s.",
+                            dev->dev.of_node->full_name);
+               return -ENODEV;
+       }
+
+       shca = (struct ehca_shca *)ib_alloc_device(sizeof(*shca));
+       if (!shca) {
+               ehca_gen_err("Cannot allocate shca memory.");
+               return -ENOMEM;
+       }
+
+       mutex_init(&shca->modify_mutex);
+       atomic_set(&shca->num_cqs, 0);
+       atomic_set(&shca->num_qps, 0);
+       shca->max_num_qps = ehca_max_qp;
+       shca->max_num_cqs = ehca_max_cq;
+
+       for (i = 0; i < ARRAY_SIZE(shca->sport); i++)
+               spin_lock_init(&shca->sport[i].mod_sqp_lock);
+
+       shca->ofdev = dev;
+       shca->ipz_hca_handle.handle = *handle;
+       dev_set_drvdata(&dev->dev, shca);
+
+       ret = ehca_sense_attributes(shca);
+       if (ret < 0) {
+               ehca_gen_err("Cannot sense eHCA attributes.");
+               goto probe1;
+       }
+
+       ret = ehca_init_device(shca);
+       if (ret) {
+               ehca_gen_err("Cannot init ehca device struct");
+               goto probe1;
+       }
+
+       eq_size = 2 * shca->max_num_cqs + 4 * shca->max_num_qps;
+       /* create event queues */
+       ret = ehca_create_eq(shca, &shca->eq, EHCA_EQ, eq_size);
+       if (ret) {
+               ehca_err(&shca->ib_device, "Cannot create EQ.");
+               goto probe1;
+       }
+
+       ret = ehca_create_eq(shca, &shca->neq, EHCA_NEQ, 513);
+       if (ret) {
+               ehca_err(&shca->ib_device, "Cannot create NEQ.");
+               goto probe3;
+       }
+
+       /* create internal protection domain */
+       ibpd = ehca_alloc_pd(&shca->ib_device, (void *)(-1), NULL);
+       if (IS_ERR(ibpd)) {
+               ehca_err(&shca->ib_device, "Cannot create internal PD.");
+               ret = PTR_ERR(ibpd);
+               goto probe4;
+       }
+
+       shca->pd = container_of(ibpd, struct ehca_pd, ib_pd);
+       shca->pd->ib_pd.device = &shca->ib_device;
+
+       /* create internal max MR */
+       ret = ehca_reg_internal_maxmr(shca, shca->pd, &shca->maxmr);
+
+       if (ret) {
+               ehca_err(&shca->ib_device, "Cannot create internal MR ret=%i",
+                        ret);
+               goto probe5;
+       }
+
+       ret = ib_register_device(&shca->ib_device, NULL);
+       if (ret) {
+               ehca_err(&shca->ib_device,
+                        "ib_register_device() failed ret=%i", ret);
+               goto probe6;
+       }
+
+       /* create AQP1 for port 1 */
+       if (ehca_open_aqp1 == 1) {
+               shca->sport[0].port_state = IB_PORT_DOWN;
+               ret = ehca_create_aqp1(shca, 1);
+               if (ret) {
+                       ehca_err(&shca->ib_device,
+                                "Cannot create AQP1 for port 1.");
+                       goto probe7;
+               }
+       }
+
+       /* create AQP1 for port 2 */
+       if ((ehca_open_aqp1 == 1) && (shca->num_ports == 2)) {
+               shca->sport[1].port_state = IB_PORT_DOWN;
+               ret = ehca_create_aqp1(shca, 2);
+               if (ret) {
+                       ehca_err(&shca->ib_device,
+                                "Cannot create AQP1 for port 2.");
+                       goto probe8;
+               }
+       }
+
+       ret = sysfs_create_group(&dev->dev.kobj, &ehca_dev_attr_grp);
+       if (ret) /* only complain; we can live without attributes */
+               ehca_err(&shca->ib_device,
+                        "Cannot create device attributes ret=%d", ret);
+
+       spin_lock_irqsave(&shca_list_lock, flags);
+       list_add(&shca->shca_list, &shca_list);
+       spin_unlock_irqrestore(&shca_list_lock, flags);
+
+       return 0;
+
+probe8:
+       ret = ehca_destroy_aqp1(&shca->sport[0]);
+       if (ret)
+               ehca_err(&shca->ib_device,
+                        "Cannot destroy AQP1 for port 1. ret=%i", ret);
+
+probe7:
+       ib_unregister_device(&shca->ib_device);
+
+probe6:
+       ret = ehca_dereg_internal_maxmr(shca);
+       if (ret)
+               ehca_err(&shca->ib_device,
+                        "Cannot destroy internal MR. ret=%x", ret);
+
+probe5:
+       ret = ehca_dealloc_pd(&shca->pd->ib_pd);
+       if (ret)
+               ehca_err(&shca->ib_device,
+                        "Cannot destroy internal PD. ret=%x", ret);
+
+probe4:
+       ret = ehca_destroy_eq(shca, &shca->neq);
+       if (ret)
+               ehca_err(&shca->ib_device,
+                        "Cannot destroy NEQ. ret=%x", ret);
+
+probe3:
+       ret = ehca_destroy_eq(shca, &shca->eq);
+       if (ret)
+               ehca_err(&shca->ib_device,
+                        "Cannot destroy EQ. ret=%x", ret);
+
+probe1:
+       ib_dealloc_device(&shca->ib_device);
+
+       return -EINVAL;
+}
+
+static int ehca_remove(struct platform_device *dev)
+{
+       struct ehca_shca *shca = dev_get_drvdata(&dev->dev);
+       unsigned long flags;
+       int ret;
+
+       sysfs_remove_group(&dev->dev.kobj, &ehca_dev_attr_grp);
+
+       if (ehca_open_aqp1 == 1) {
+               int i;
+               for (i = 0; i < shca->num_ports; i++) {
+                       ret = ehca_destroy_aqp1(&shca->sport[i]);
+                       if (ret)
+                               ehca_err(&shca->ib_device,
+                                        "Cannot destroy AQP1 for port %x "
+                                        "ret=%i", i, ret);
+               }
+       }
+
+       ib_unregister_device(&shca->ib_device);
+
+       ret = ehca_dereg_internal_maxmr(shca);
+       if (ret)
+               ehca_err(&shca->ib_device,
+                        "Cannot destroy internal MR. ret=%i", ret);
+
+       ret = ehca_dealloc_pd(&shca->pd->ib_pd);
+       if (ret)
+               ehca_err(&shca->ib_device,
+                        "Cannot destroy internal PD. ret=%i", ret);
+
+       ret = ehca_destroy_eq(shca, &shca->eq);
+       if (ret)
+               ehca_err(&shca->ib_device, "Cannot destroy EQ. ret=%i", ret);
+
+       ret = ehca_destroy_eq(shca, &shca->neq);
+       if (ret)
+               ehca_err(&shca->ib_device, "Cannot destroy NEQ. ret=%i", ret);
+
+       ib_dealloc_device(&shca->ib_device);
+
+       spin_lock_irqsave(&shca_list_lock, flags);
+       list_del(&shca->shca_list);
+       spin_unlock_irqrestore(&shca_list_lock, flags);
+
+       return ret;
+}
+
+static struct of_device_id ehca_device_table[] =
+{
+       {
+               .name       = "lhca",
+               .compatible = "IBM,lhca",
+       },
+       {},
+};
+MODULE_DEVICE_TABLE(of, ehca_device_table);
+
+static struct platform_driver ehca_driver = {
+       .probe       = ehca_probe,
+       .remove      = ehca_remove,
+       .driver = {
+               .name = "ehca",
+               .owner = THIS_MODULE,
+               .groups = ehca_drv_attr_groups,
+               .of_match_table = ehca_device_table,
+       },
+};
+
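+/*
+ * Deadman timer callback: re-armed roughly once per second from
+ * ehca_module_init(). For every registered adapter whose EQ is initialized,
+ * the queue offset is sampled a few times; if it does not move, the EQ is
+ * processed in polling mode so events are not lost to a missed interrupt.
+ */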
+void ehca_poll_eqs(unsigned long data)
+{
+       struct ehca_shca *shca;
+
+       spin_lock(&shca_list_lock);
+       list_for_each_entry(shca, &shca_list, shca_list) {
+               if (shca->eq.is_initialized) {
+                       /* call deadman proc only if eq ptr does not change */
+                       struct ehca_eq *eq = &shca->eq;
+                       int max = 3;
+                       volatile u64 q_ofs, q_ofs2;
+                       unsigned long flags;
+                       spin_lock_irqsave(&eq->spinlock, flags);
+                       q_ofs = eq->ipz_queue.current_q_offset;
+                       spin_unlock_irqrestore(&eq->spinlock, flags);
+                       do {
+                               spin_lock_irqsave(&eq->spinlock, flags);
+                               q_ofs2 = eq->ipz_queue.current_q_offset;
+                               spin_unlock_irqrestore(&eq->spinlock, flags);
+                               max--;
+                       } while (q_ofs == q_ofs2 && max > 0);
+                       if (q_ofs == q_ofs2)
+                               ehca_process_eq(shca, 0);
+               }
+       }
+       mod_timer(&poll_eqs_timer, round_jiffies(jiffies + HZ));
+       spin_unlock(&shca_list_lock);
+}
+
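+/*
+ * Memory hotplug notifier: the driver registers kernel memory with the HCA
+ * (internal max MR), so dynamic memory add/remove is vetoed with NOTIFY_BAD
+ * while at least one adapter is attached to the partition.
+ */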
+static int ehca_mem_notifier(struct notifier_block *nb,
+                            unsigned long action, void *data)
+{
+       static unsigned long ehca_dmem_warn_time;
+       unsigned long flags;
+
+       switch (action) {
+       case MEM_CANCEL_OFFLINE:
+       case MEM_CANCEL_ONLINE:
+       case MEM_ONLINE:
+       case MEM_OFFLINE:
+               return NOTIFY_OK;
+       case MEM_GOING_ONLINE:
+       case MEM_GOING_OFFLINE:
+               /* only ok if no hca is attached to the lpar */
+               spin_lock_irqsave(&shca_list_lock, flags);
+               if (list_empty(&shca_list)) {
+                       spin_unlock_irqrestore(&shca_list_lock, flags);
+                       return NOTIFY_OK;
+               } else {
+                       spin_unlock_irqrestore(&shca_list_lock, flags);
+                       if (printk_timed_ratelimit(&ehca_dmem_warn_time,
+                                                  30 * 1000))
+                               ehca_gen_err("DMEM operations are not allowed"
+                                            " in conjunction with eHCA");
+                       return NOTIFY_BAD;
+               }
+       }
+       return NOTIFY_OK;
+}
+
+static struct notifier_block ehca_mem_nb = {
+       .notifier_call = ehca_mem_notifier,
+};
+
+static int __init ehca_module_init(void)
+{
+       int ret;
+
+       printk(KERN_INFO "eHCA Infiniband Device Driver "
+              "(Version " HCAD_VERSION ")\n");
+
+       ret = ehca_create_comp_pool();
+       if (ret) {
+               ehca_gen_err("Cannot create comp pool.");
+               return ret;
+       }
+
+       ret = ehca_create_slab_caches();
+       if (ret) {
+               ehca_gen_err("Cannot create SLAB caches");
+               ret = -ENOMEM;
+               goto module_init1;
+       }
+
+       ret = ehca_create_busmap();
+       if (ret) {
+               ehca_gen_err("Cannot create busmap.");
+               goto module_init2;
+       }
+
+       ret = ibmebus_register_driver(&ehca_driver);
+       if (ret) {
+               ehca_gen_err("Cannot register eHCA device driver");
+               ret = -EINVAL;
+               goto module_init3;
+       }
+
+       ret = register_memory_notifier(&ehca_mem_nb);
+       if (ret) {
+               ehca_gen_err("Failed registering memory add/remove notifier");
+               goto module_init4;
+       }
+
+       if (ehca_poll_all_eqs != 1) {
+               ehca_gen_err("WARNING!!!");
+               ehca_gen_err("It is possible to lose interrupts.");
+       } else {
+               init_timer(&poll_eqs_timer);
+               poll_eqs_timer.function = ehca_poll_eqs;
+               poll_eqs_timer.expires = jiffies + HZ;
+               add_timer(&poll_eqs_timer);
+       }
+
+       return 0;
+
+module_init4:
+       ibmebus_unregister_driver(&ehca_driver);
+
+module_init3:
+       ehca_destroy_busmap();
+
+module_init2:
+       ehca_destroy_slab_caches();
+
+module_init1:
+       ehca_destroy_comp_pool();
+       return ret;
+}
+
+static void __exit ehca_module_exit(void)
+{
+       if (ehca_poll_all_eqs == 1)
+               del_timer_sync(&poll_eqs_timer);
+
+       ibmebus_unregister_driver(&ehca_driver);
+
+       unregister_memory_notifier(&ehca_mem_nb);
+
+       ehca_destroy_busmap();
+
+       ehca_destroy_slab_caches();
+
+       ehca_destroy_comp_pool();
+
+       idr_destroy(&ehca_cq_idr);
+       idr_destroy(&ehca_qp_idr);
+}
+
+module_init(ehca_module_init);
+module_exit(ehca_module_exit);
diff --git a/drivers/staging/rdma/ehca/ehca_mcast.c b/drivers/staging/rdma/ehca/ehca_mcast.c
new file mode 100644 (file)
index 0000000..cec1815
--- /dev/null
@@ -0,0 +1,131 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  mcast  functions
+ *
+ *  Authors: Khadija Souissi <souissik@de.ibm.com>
+ *           Waleri Fomin <fomin@de.ibm.com>
+ *           Reinhard Ernst <rernst@de.ibm.com>
+ *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *           Heiko J Schick <schickhj@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/module.h>
+#include <linux/err.h>
+#include "ehca_classes.h"
+#include "ehca_tools.h"
+#include "ehca_qes.h"
+#include "ehca_iverbs.h"
+#include "hcp_if.h"
+
+#define MAX_MC_LID 0xFFFE
+#define MIN_MC_LID 0xC000      /* Multicast limits */
+#define EHCA_VALID_MULTICAST_GID(gid)  ((gid)[0] == 0xFF)
+#define EHCA_VALID_MULTICAST_LID(lid) \
+       (((lid) >= MIN_MC_LID) && ((lid) <= MAX_MC_LID))
+
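+/* An IB multicast GID starts with 0xFF and a multicast LID lies in the
+ * 0xC000..0xFFFE range; both are checked before issuing the attach/detach
+ * hCalls below.
+ */
+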
+int ehca_attach_mcast(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+       struct ehca_qp *my_qp = container_of(ibqp, struct ehca_qp, ib_qp);
+       struct ehca_shca *shca = container_of(ibqp->device, struct ehca_shca,
+                                             ib_device);
+       union ib_gid my_gid;
+       u64 subnet_prefix, interface_id, h_ret;
+
+       if (ibqp->qp_type != IB_QPT_UD) {
+               ehca_err(ibqp->device, "invalid qp_type=%x", ibqp->qp_type);
+               return -EINVAL;
+       }
+
+       if (!(EHCA_VALID_MULTICAST_GID(gid->raw))) {
+               ehca_err(ibqp->device, "invalid multicast gid");
+               return -EINVAL;
+       } else if ((lid < MIN_MC_LID) || (lid > MAX_MC_LID)) {
+               ehca_err(ibqp->device, "invalid multicast lid=%x", lid);
+               return -EINVAL;
+       }
+
+       memcpy(&my_gid, gid->raw, sizeof(union ib_gid));
+
+       subnet_prefix = be64_to_cpu(my_gid.global.subnet_prefix);
+       interface_id = be64_to_cpu(my_gid.global.interface_id);
+       h_ret = hipz_h_attach_mcqp(shca->ipz_hca_handle,
+                                  my_qp->ipz_qp_handle,
+                                  my_qp->galpas.kernel,
+                                  lid, subnet_prefix, interface_id);
+       if (h_ret != H_SUCCESS)
+               ehca_err(ibqp->device,
+                        "ehca_qp=%p qp_num=%x hipz_h_attach_mcqp() failed "
+                        "h_ret=%lli", my_qp, ibqp->qp_num, h_ret);
+
+       return ehca2ib_return_code(h_ret);
+}
+
+int ehca_detach_mcast(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+       struct ehca_qp *my_qp = container_of(ibqp, struct ehca_qp, ib_qp);
+       struct ehca_shca *shca = container_of(ibqp->pd->device,
+                                             struct ehca_shca, ib_device);
+       union ib_gid my_gid;
+       u64 subnet_prefix, interface_id, h_ret;
+
+       if (ibqp->qp_type != IB_QPT_UD) {
+               ehca_err(ibqp->device, "invalid qp_type %x", ibqp->qp_type);
+               return -EINVAL;
+       }
+
+       if (!(EHCA_VALID_MULTICAST_GID(gid->raw))) {
+               ehca_err(ibqp->device, "invalid multicast gid");
+               return -EINVAL;
+       } else if ((lid < MIN_MC_LID) || (lid > MAX_MC_LID)) {
+               ehca_err(ibqp->device, "invalid multicast lid=%x", lid);
+               return -EINVAL;
+       }
+
+       memcpy(&my_gid, gid->raw, sizeof(union ib_gid));
+
+       subnet_prefix = be64_to_cpu(my_gid.global.subnet_prefix);
+       interface_id = be64_to_cpu(my_gid.global.interface_id);
+       h_ret = hipz_h_detach_mcqp(shca->ipz_hca_handle,
+                                  my_qp->ipz_qp_handle,
+                                  my_qp->galpas.kernel,
+                                  lid, subnet_prefix, interface_id);
+       if (h_ret != H_SUCCESS)
+               ehca_err(ibqp->device,
+                        "ehca_qp=%p qp_num=%x hipz_h_detach_mcqp() failed "
+                        "h_ret=%lli", my_qp, ibqp->qp_num, h_ret);
+
+       return ehca2ib_return_code(h_ret);
+}
diff --git a/drivers/staging/rdma/ehca/ehca_mrmw.c b/drivers/staging/rdma/ehca/ehca_mrmw.c
new file mode 100644 (file)
index 0000000..f914b30
--- /dev/null
@@ -0,0 +1,2593 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  MR/MW functions
+ *
+ *  Authors: Dietmar Decker <ddecker@de.ibm.com>
+ *           Christoph Raisch <raisch@de.ibm.com>
+ *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/slab.h>
+#include <rdma/ib_umem.h>
+
+#include "ehca_iverbs.h"
+#include "ehca_mrmw.h"
+#include "hcp_if.h"
+#include "hipz_hw.h"
+
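+/* NUM_CHUNKS(length, chunk_size): number of chunk_size-byte chunks needed
+ * to cover length bytes, rounding up.
+ */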
+#define NUM_CHUNKS(length, chunk_size) \
+       (((length) + (chunk_size - 1)) / (chunk_size))
+
+/* max number of rpages (per hcall register_rpages) */
+#define MAX_RPAGES 512
+
+/* DMEM toleration management */
+#define EHCA_SECTSHIFT        SECTION_SIZE_BITS
+#define EHCA_SECTSIZE          (1UL << EHCA_SECTSHIFT)
+#define EHCA_HUGEPAGESHIFT     34
+#define EHCA_HUGEPAGE_SIZE     (1UL << EHCA_HUGEPAGESHIFT)
+#define EHCA_HUGEPAGE_PFN_MASK ((EHCA_HUGEPAGE_SIZE - 1) >> PAGE_SHIFT)
+#define EHCA_INVAL_ADDR        0xFFFFFFFFFFFFFFFFULL
+#define EHCA_DIR_INDEX_SHIFT 13                   /* 8k Entries in 64k block */
+#define EHCA_TOP_INDEX_SHIFT (EHCA_DIR_INDEX_SHIFT * 2)
+#define EHCA_MAP_ENTRIES (1 << EHCA_DIR_INDEX_SHIFT)
+#define EHCA_TOP_MAP_SIZE (0x10000)               /* currently fixed map size */
+#define EHCA_DIR_MAP_SIZE (0x10000)
+#define EHCA_ENT_MAP_SIZE (0x10000)
+#define EHCA_INDEX_MASK (EHCA_MAP_ENTRIES - 1)
+
+static unsigned long ehca_mr_len;
+
+/*
+ * Memory map data structures: a three-level map (top -> dir -> ent, with
+ * EHCA_MAP_ENTRIES slots per level) that maps kernel memory sections into
+ * the contiguous address space registered with the HCA as the internal
+ * max MR.
+ */
+struct ehca_dir_bmap {
+       u64 ent[EHCA_MAP_ENTRIES];
+};
+struct ehca_top_bmap {
+       struct ehca_dir_bmap *dir[EHCA_MAP_ENTRIES];
+};
+struct ehca_bmap {
+       struct ehca_top_bmap *top[EHCA_MAP_ENTRIES];
+};
+
+static struct ehca_bmap *ehca_bmap;
+
+static struct kmem_cache *mr_cache;
+static struct kmem_cache *mw_cache;
+
+enum ehca_mr_pgsize {
+       EHCA_MR_PGSIZE4K  = 0x1000L,
+       EHCA_MR_PGSIZE64K = 0x10000L,
+       EHCA_MR_PGSIZE1M  = 0x100000L,
+       EHCA_MR_PGSIZE16M = 0x1000000L
+};
+
+#define EHCA_MR_PGSHIFT4K  12
+#define EHCA_MR_PGSHIFT64K 16
+#define EHCA_MR_PGSHIFT1M  20
+#define EHCA_MR_PGSHIFT16M 24
+
+static u64 ehca_map_vaddr(void *caddr);
+
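+/* encode a hw page size (4K/64K/1M/16M) as the value 0..3 that is passed
+ * to hipz_h_register_rpage_mr() */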
+static u32 ehca_encode_hwpage_size(u32 pgsize)
+{
+       int log = ilog2(pgsize);
+       WARN_ON(log < 12 || log > 24 || log & 3);
+       return (log - 12) / 4;
+}
+
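+/* hca_cap_mr_pgsize is a bitmask of supported page sizes; rounding it down
+ * to a power of two yields the largest supported size */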
+static u64 ehca_get_max_hwpage_size(struct ehca_shca *shca)
+{
+       return rounddown_pow_of_two(shca->hca_cap_mr_pgsize);
+}
+
+static struct ehca_mr *ehca_mr_new(void)
+{
+       struct ehca_mr *me;
+
+       me = kmem_cache_zalloc(mr_cache, GFP_KERNEL);
+       if (me)
+               spin_lock_init(&me->mrlock);
+       else
+               ehca_gen_err("alloc failed");
+
+       return me;
+}
+
+static void ehca_mr_delete(struct ehca_mr *me)
+{
+       kmem_cache_free(mr_cache, me);
+}
+
+static struct ehca_mw *ehca_mw_new(void)
+{
+       struct ehca_mw *me;
+
+       me = kmem_cache_zalloc(mw_cache, GFP_KERNEL);
+       if (me)
+               spin_lock_init(&me->mwlock);
+       else
+               ehca_gen_err("alloc failed");
+
+       return me;
+}
+
+static void ehca_mw_delete(struct ehca_mw *me)
+{
+       kmem_cache_free(mw_cache, me);
+}
+
+/*----------------------------------------------------------------------*/
+
+struct ib_mr *ehca_get_dma_mr(struct ib_pd *pd, int mr_access_flags)
+{
+       struct ib_mr *ib_mr;
+       int ret;
+       struct ehca_mr *e_maxmr;
+       struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd);
+       struct ehca_shca *shca =
+               container_of(pd->device, struct ehca_shca, ib_device);
+
+       if (shca->maxmr) {
+               e_maxmr = ehca_mr_new();
+               if (!e_maxmr) {
+                       ehca_err(&shca->ib_device, "out of memory");
+                       ib_mr = ERR_PTR(-ENOMEM);
+                       goto get_dma_mr_exit0;
+               }
+
+               ret = ehca_reg_maxmr(shca, e_maxmr,
+                                    (void *)ehca_map_vaddr((void *)(KERNELBASE + PHYSICAL_START)),
+                                    mr_access_flags, e_pd,
+                                    &e_maxmr->ib.ib_mr.lkey,
+                                    &e_maxmr->ib.ib_mr.rkey);
+               if (ret) {
+                       ehca_mr_delete(e_maxmr);
+                       ib_mr = ERR_PTR(ret);
+                       goto get_dma_mr_exit0;
+               }
+               ib_mr = &e_maxmr->ib.ib_mr;
+       } else {
+               ehca_err(&shca->ib_device, "no internal max-MR exists!");
+               ib_mr = ERR_PTR(-EINVAL);
+               goto get_dma_mr_exit0;
+       }
+
+get_dma_mr_exit0:
+       if (IS_ERR(ib_mr))
+               ehca_err(&shca->ib_device, "h_ret=%li pd=%p mr_access_flags=%x",
+                        PTR_ERR(ib_mr), pd, mr_access_flags);
+       return ib_mr;
+} /* end ehca_get_dma_mr() */
+
+/*----------------------------------------------------------------------*/
+
+struct ib_mr *ehca_reg_phys_mr(struct ib_pd *pd,
+                              struct ib_phys_buf *phys_buf_array,
+                              int num_phys_buf,
+                              int mr_access_flags,
+                              u64 *iova_start)
+{
+       struct ib_mr *ib_mr;
+       int ret;
+       struct ehca_mr *e_mr;
+       struct ehca_shca *shca =
+               container_of(pd->device, struct ehca_shca, ib_device);
+       struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd);
+
+       u64 size;
+
+       if ((num_phys_buf <= 0) || !phys_buf_array) {
+               ehca_err(pd->device, "bad input values: num_phys_buf=%x "
+                        "phys_buf_array=%p", num_phys_buf, phys_buf_array);
+               ib_mr = ERR_PTR(-EINVAL);
+               goto reg_phys_mr_exit0;
+       }
+       if (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) &&
+            !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) ||
+           ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
+            !(mr_access_flags & IB_ACCESS_LOCAL_WRITE))) {
+               /*
+                * Remote Write Access requires Local Write Access
+                * Remote Atomic Access requires Local Write Access
+                */
+               ehca_err(pd->device, "bad input values: mr_access_flags=%x",
+                        mr_access_flags);
+               ib_mr = ERR_PTR(-EINVAL);
+               goto reg_phys_mr_exit0;
+       }
+
+       /* check physical buffer list and calculate size */
+       ret = ehca_mr_chk_buf_and_calc_size(phys_buf_array, num_phys_buf,
+                                           iova_start, &size);
+       if (ret) {
+               ib_mr = ERR_PTR(ret);
+               goto reg_phys_mr_exit0;
+       }
+       if ((size == 0) ||
+           (((u64)iova_start + size) < (u64)iova_start)) {
+               ehca_err(pd->device, "bad input values: size=%llx iova_start=%p",
+                        size, iova_start);
+               ib_mr = ERR_PTR(-EINVAL);
+               goto reg_phys_mr_exit0;
+       }
+
+       e_mr = ehca_mr_new();
+       if (!e_mr) {
+               ehca_err(pd->device, "out of memory");
+               ib_mr = ERR_PTR(-ENOMEM);
+               goto reg_phys_mr_exit0;
+       }
+
+       /* register MR on HCA */
+       if (ehca_mr_is_maxmr(size, iova_start)) {
+               e_mr->flags |= EHCA_MR_FLAG_MAXMR;
+               ret = ehca_reg_maxmr(shca, e_mr, iova_start, mr_access_flags,
+                                    e_pd, &e_mr->ib.ib_mr.lkey,
+                                    &e_mr->ib.ib_mr.rkey);
+               if (ret) {
+                       ib_mr = ERR_PTR(ret);
+                       goto reg_phys_mr_exit1;
+               }
+       } else {
+               struct ehca_mr_pginfo pginfo;
+               u32 num_kpages;
+               u32 num_hwpages;
+               u64 hw_pgsize;
+
+               num_kpages = NUM_CHUNKS(((u64)iova_start % PAGE_SIZE) + size,
+                                       PAGE_SIZE);
+               /* for kernel space we try most possible pgsize */
+               hw_pgsize = ehca_get_max_hwpage_size(shca);
+               num_hwpages = NUM_CHUNKS(((u64)iova_start % hw_pgsize) + size,
+                                        hw_pgsize);
+               memset(&pginfo, 0, sizeof(pginfo));
+               pginfo.type = EHCA_MR_PGI_PHYS;
+               pginfo.num_kpages = num_kpages;
+               pginfo.hwpage_size = hw_pgsize;
+               pginfo.num_hwpages = num_hwpages;
+               pginfo.u.phy.num_phys_buf = num_phys_buf;
+               pginfo.u.phy.phys_buf_array = phys_buf_array;
+               pginfo.next_hwpage =
+                       ((u64)iova_start & ~PAGE_MASK) / hw_pgsize;
+
+               ret = ehca_reg_mr(shca, e_mr, iova_start, size, mr_access_flags,
+                                 e_pd, &pginfo, &e_mr->ib.ib_mr.lkey,
+                                 &e_mr->ib.ib_mr.rkey, EHCA_REG_MR);
+               if (ret) {
+                       ib_mr = ERR_PTR(ret);
+                       goto reg_phys_mr_exit1;
+               }
+       }
+
+       /* successful registration of all pages */
+       return &e_mr->ib.ib_mr;
+
+reg_phys_mr_exit1:
+       ehca_mr_delete(e_mr);
+reg_phys_mr_exit0:
+       if (IS_ERR(ib_mr))
+               ehca_err(pd->device, "h_ret=%li pd=%p phys_buf_array=%p "
+                        "num_phys_buf=%x mr_access_flags=%x iova_start=%p",
+                        PTR_ERR(ib_mr), pd, phys_buf_array,
+                        num_phys_buf, mr_access_flags, iova_start);
+       return ib_mr;
+} /* end ehca_reg_phys_mr() */
+
+/*----------------------------------------------------------------------*/
+
+struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+                              u64 virt, int mr_access_flags,
+                              struct ib_udata *udata)
+{
+       struct ib_mr *ib_mr;
+       struct ehca_mr *e_mr;
+       struct ehca_shca *shca =
+               container_of(pd->device, struct ehca_shca, ib_device);
+       struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd);
+       struct ehca_mr_pginfo pginfo;
+       int ret, page_shift;
+       u32 num_kpages;
+       u32 num_hwpages;
+       u64 hwpage_size;
+
+       if (!pd) {
+               ehca_gen_err("bad pd=%p", pd);
+               return ERR_PTR(-EFAULT);
+       }
+
+       if (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) &&
+            !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) ||
+           ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
+            !(mr_access_flags & IB_ACCESS_LOCAL_WRITE))) {
+               /*
+                * Remote Write Access requires Local Write Access
+                * Remote Atomic Access requires Local Write Access
+                */
+               ehca_err(pd->device, "bad input values: mr_access_flags=%x",
+                        mr_access_flags);
+               ib_mr = ERR_PTR(-EINVAL);
+               goto reg_user_mr_exit0;
+       }
+
+       if (length == 0 || virt + length < virt) {
+               ehca_err(pd->device, "bad input values: length=%llx "
+                        "virt_base=%llx", length, virt);
+               ib_mr = ERR_PTR(-EINVAL);
+               goto reg_user_mr_exit0;
+       }
+
+       e_mr = ehca_mr_new();
+       if (!e_mr) {
+               ehca_err(pd->device, "out of memory");
+               ib_mr = ERR_PTR(-ENOMEM);
+               goto reg_user_mr_exit0;
+       }
+
+       e_mr->umem = ib_umem_get(pd->uobject->context, start, length,
+                                mr_access_flags, 0);
+       if (IS_ERR(e_mr->umem)) {
+               ib_mr = (void *)e_mr->umem;
+               goto reg_user_mr_exit1;
+       }
+
+       if (e_mr->umem->page_size != PAGE_SIZE) {
+               ehca_err(pd->device, "page size not supported, "
+                        "e_mr->umem->page_size=%x", e_mr->umem->page_size);
+               ib_mr = ERR_PTR(-EINVAL);
+               goto reg_user_mr_exit2;
+       }
+
+       /* determine number of MR pages */
+       num_kpages = NUM_CHUNKS((virt % PAGE_SIZE) + length, PAGE_SIZE);
+       /* select proper hw_pgsize */
+       page_shift = PAGE_SHIFT;
+       if (e_mr->umem->hugetlb) {
+               /* determine page_shift, clamp between 4K and 16M */
+               page_shift = (fls64(length - 1) + 3) & ~3;
+               page_shift = min(max(page_shift, EHCA_MR_PGSHIFT4K),
+                                EHCA_MR_PGSHIFT16M);
+       }
+       hwpage_size = 1UL << page_shift;
+
+       /* now that we have the desired page size, shift until it's
+        * supported, too. 4K is always supported, so this terminates.
+        */
+       while (!(hwpage_size & shca->hca_cap_mr_pgsize))
+               hwpage_size >>= 4;
+
+reg_user_mr_fallback:
+       num_hwpages = NUM_CHUNKS((virt % hwpage_size) + length, hwpage_size);
+       /* register MR on HCA */
+       memset(&pginfo, 0, sizeof(pginfo));
+       pginfo.type = EHCA_MR_PGI_USER;
+       pginfo.hwpage_size = hwpage_size;
+       pginfo.num_kpages = num_kpages;
+       pginfo.num_hwpages = num_hwpages;
+       pginfo.u.usr.region = e_mr->umem;
+       pginfo.next_hwpage = ib_umem_offset(e_mr->umem) / hwpage_size;
+       pginfo.u.usr.next_sg = pginfo.u.usr.region->sg_head.sgl;
+       ret = ehca_reg_mr(shca, e_mr, (u64 *)virt, length, mr_access_flags,
+                         e_pd, &pginfo, &e_mr->ib.ib_mr.lkey,
+                         &e_mr->ib.ib_mr.rkey, EHCA_REG_MR);
+       if (ret == -EINVAL && pginfo.hwpage_size > PAGE_SIZE) {
+               ehca_warn(pd->device, "failed to register mr "
+                         "with hwpage_size=%llx", hwpage_size);
+               ehca_info(pd->device, "try to register mr with "
+                         "kpage_size=%lx", PAGE_SIZE);
+               /*
+                * this means kpages are not contiguous for a hw page
+                * try kernel page size as fallback solution
+                */
+               hwpage_size = PAGE_SIZE;
+               goto reg_user_mr_fallback;
+       }
+       if (ret) {
+               ib_mr = ERR_PTR(ret);
+               goto reg_user_mr_exit2;
+       }
+
+       /* successful registration of all pages */
+       return &e_mr->ib.ib_mr;
+
+reg_user_mr_exit2:
+       ib_umem_release(e_mr->umem);
+reg_user_mr_exit1:
+       ehca_mr_delete(e_mr);
+reg_user_mr_exit0:
+       if (IS_ERR(ib_mr))
+               ehca_err(pd->device, "rc=%li pd=%p mr_access_flags=%x udata=%p",
+                        PTR_ERR(ib_mr), pd, mr_access_flags, udata);
+       return ib_mr;
+} /* end ehca_reg_user_mr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_rereg_phys_mr(struct ib_mr *mr,
+                      int mr_rereg_mask,
+                      struct ib_pd *pd,
+                      struct ib_phys_buf *phys_buf_array,
+                      int num_phys_buf,
+                      int mr_access_flags,
+                      u64 *iova_start)
+{
+       int ret;
+
+       struct ehca_shca *shca =
+               container_of(mr->device, struct ehca_shca, ib_device);
+       struct ehca_mr *e_mr = container_of(mr, struct ehca_mr, ib.ib_mr);
+       u64 new_size;
+       u64 *new_start;
+       u32 new_acl;
+       struct ehca_pd *new_pd;
+       u32 tmp_lkey, tmp_rkey;
+       unsigned long sl_flags;
+       u32 num_kpages = 0;
+       u32 num_hwpages = 0;
+       struct ehca_mr_pginfo pginfo;
+
+       if (!(mr_rereg_mask & IB_MR_REREG_TRANS)) {
+               /* TODO not supported, because PHYP rereg hCall needs pages */
+               ehca_err(mr->device, "rereg without IB_MR_REREG_TRANS not "
+                        "supported yet, mr_rereg_mask=%x", mr_rereg_mask);
+               ret = -EINVAL;
+               goto rereg_phys_mr_exit0;
+       }
+
+       if (mr_rereg_mask & IB_MR_REREG_PD) {
+               if (!pd) {
+                       ehca_err(mr->device, "rereg with bad pd, pd=%p "
+                                "mr_rereg_mask=%x", pd, mr_rereg_mask);
+                       ret = -EINVAL;
+                       goto rereg_phys_mr_exit0;
+               }
+       }
+
+       if ((mr_rereg_mask &
+            ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS)) ||
+           (mr_rereg_mask == 0)) {
+               ret = -EINVAL;
+               goto rereg_phys_mr_exit0;
+       }
+
+       /* check other parameters */
+       if (e_mr == shca->maxmr) {
+               /* should be impossible, however reject to be sure */
+               ehca_err(mr->device, "rereg internal max-MR impossible, mr=%p "
+                        "shca->maxmr=%p mr->lkey=%x",
+                        mr, shca->maxmr, mr->lkey);
+               ret = -EINVAL;
+               goto rereg_phys_mr_exit0;
+       }
+       if (mr_rereg_mask & IB_MR_REREG_TRANS) { /* transl., i.e. addr/size */
+               if (e_mr->flags & EHCA_MR_FLAG_FMR) {
+                       ehca_err(mr->device, "not supported for FMR, mr=%p "
+                                "flags=%x", mr, e_mr->flags);
+                       ret = -EINVAL;
+                       goto rereg_phys_mr_exit0;
+               }
+               if (!phys_buf_array || num_phys_buf <= 0) {
+                       ehca_err(mr->device, "bad input values mr_rereg_mask=%x"
+                                " phys_buf_array=%p num_phys_buf=%x",
+                                mr_rereg_mask, phys_buf_array, num_phys_buf);
+                       ret = -EINVAL;
+                       goto rereg_phys_mr_exit0;
+               }
+       }
+       if ((mr_rereg_mask & IB_MR_REREG_ACCESS) &&     /* change ACL */
+           (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) &&
+             !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) ||
+            ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
+             !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)))) {
+               /*
+                * Remote Write Access requires Local Write Access
+                * Remote Atomic Access requires Local Write Access
+                */
+               ehca_err(mr->device, "bad input values: mr_rereg_mask=%x "
+                        "mr_access_flags=%x", mr_rereg_mask, mr_access_flags);
+               ret = -EINVAL;
+               goto rereg_phys_mr_exit0;
+       }
+
+       /* set requested values dependent on rereg request */
+       spin_lock_irqsave(&e_mr->mrlock, sl_flags);
+       new_start = e_mr->start;
+       new_size = e_mr->size;
+       new_acl = e_mr->acl;
+       new_pd = container_of(mr->pd, struct ehca_pd, ib_pd);
+
+       if (mr_rereg_mask & IB_MR_REREG_TRANS) {
+               u64 hw_pgsize = ehca_get_max_hwpage_size(shca);
+
+               new_start = iova_start; /* change address */
+               /* check physical buffer list and calculate size */
+               ret = ehca_mr_chk_buf_and_calc_size(phys_buf_array,
+                                                   num_phys_buf, iova_start,
+                                                   &new_size);
+               if (ret)
+                       goto rereg_phys_mr_exit1;
+               if ((new_size == 0) ||
+                   (((u64)iova_start + new_size) < (u64)iova_start)) {
+                       ehca_err(mr->device, "bad input values: new_size=%llx "
+                                "iova_start=%p", new_size, iova_start);
+                       ret = -EINVAL;
+                       goto rereg_phys_mr_exit1;
+               }
+               num_kpages = NUM_CHUNKS(((u64)new_start % PAGE_SIZE) +
+                                       new_size, PAGE_SIZE);
+               num_hwpages = NUM_CHUNKS(((u64)new_start % hw_pgsize) +
+                                        new_size, hw_pgsize);
+               memset(&pginfo, 0, sizeof(pginfo));
+               pginfo.type = EHCA_MR_PGI_PHYS;
+               pginfo.num_kpages = num_kpages;
+               pginfo.hwpage_size = hw_pgsize;
+               pginfo.num_hwpages = num_hwpages;
+               pginfo.u.phy.num_phys_buf = num_phys_buf;
+               pginfo.u.phy.phys_buf_array = phys_buf_array;
+               pginfo.next_hwpage =
+                       ((u64)iova_start & ~PAGE_MASK) / hw_pgsize;
+       }
+       if (mr_rereg_mask & IB_MR_REREG_ACCESS)
+               new_acl = mr_access_flags;
+       if (mr_rereg_mask & IB_MR_REREG_PD)
+               new_pd = container_of(pd, struct ehca_pd, ib_pd);
+
+       ret = ehca_rereg_mr(shca, e_mr, new_start, new_size, new_acl,
+                           new_pd, &pginfo, &tmp_lkey, &tmp_rkey);
+       if (ret)
+               goto rereg_phys_mr_exit1;
+
+       /* successful reregistration */
+       if (mr_rereg_mask & IB_MR_REREG_PD)
+               mr->pd = pd;
+       mr->lkey = tmp_lkey;
+       mr->rkey = tmp_rkey;
+
+rereg_phys_mr_exit1:
+       spin_unlock_irqrestore(&e_mr->mrlock, sl_flags);
+rereg_phys_mr_exit0:
+       if (ret)
+               ehca_err(mr->device, "ret=%i mr=%p mr_rereg_mask=%x pd=%p "
+                        "phys_buf_array=%p num_phys_buf=%x mr_access_flags=%x "
+                        "iova_start=%p",
+                        ret, mr, mr_rereg_mask, pd, phys_buf_array,
+                        num_phys_buf, mr_access_flags, iova_start);
+       return ret;
+} /* end ehca_rereg_phys_mr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr)
+{
+       int ret = 0;
+       u64 h_ret;
+       struct ehca_shca *shca =
+               container_of(mr->device, struct ehca_shca, ib_device);
+       struct ehca_mr *e_mr = container_of(mr, struct ehca_mr, ib.ib_mr);
+       unsigned long sl_flags;
+       struct ehca_mr_hipzout_parms hipzout;
+
+       if ((e_mr->flags & EHCA_MR_FLAG_FMR)) {
+               ehca_err(mr->device, "not supported for FMR, mr=%p e_mr=%p "
+                        "e_mr->flags=%x", mr, e_mr, e_mr->flags);
+               ret = -EINVAL;
+               goto query_mr_exit0;
+       }
+
+       memset(mr_attr, 0, sizeof(struct ib_mr_attr));
+       spin_lock_irqsave(&e_mr->mrlock, sl_flags);
+
+       h_ret = hipz_h_query_mr(shca->ipz_hca_handle, e_mr, &hipzout);
+       if (h_ret != H_SUCCESS) {
+               ehca_err(mr->device, "hipz_mr_query failed, h_ret=%lli mr=%p "
+                        "hca_hndl=%llx mr_hndl=%llx lkey=%x",
+                        h_ret, mr, shca->ipz_hca_handle.handle,
+                        e_mr->ipz_mr_handle.handle, mr->lkey);
+               ret = ehca2ib_return_code(h_ret);
+               goto query_mr_exit1;
+       }
+       mr_attr->pd = mr->pd;
+       mr_attr->device_virt_addr = hipzout.vaddr;
+       mr_attr->size = hipzout.len;
+       mr_attr->lkey = hipzout.lkey;
+       mr_attr->rkey = hipzout.rkey;
+       ehca_mrmw_reverse_map_acl(&hipzout.acl, &mr_attr->mr_access_flags);
+
+query_mr_exit1:
+       spin_unlock_irqrestore(&e_mr->mrlock, sl_flags);
+query_mr_exit0:
+       if (ret)
+               ehca_err(mr->device, "ret=%i mr=%p mr_attr=%p",
+                        ret, mr, mr_attr);
+       return ret;
+} /* end ehca_query_mr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_dereg_mr(struct ib_mr *mr)
+{
+       int ret = 0;
+       u64 h_ret;
+       struct ehca_shca *shca =
+               container_of(mr->device, struct ehca_shca, ib_device);
+       struct ehca_mr *e_mr = container_of(mr, struct ehca_mr, ib.ib_mr);
+
+       if ((e_mr->flags & EHCA_MR_FLAG_FMR)) {
+               ehca_err(mr->device, "not supported for FMR, mr=%p e_mr=%p "
+                        "e_mr->flags=%x", mr, e_mr, e_mr->flags);
+               ret = -EINVAL;
+               goto dereg_mr_exit0;
+       } else if (e_mr == shca->maxmr) {
+               /* should be impossible, however reject to be sure */
+               ehca_err(mr->device, "dereg internal max-MR impossible, mr=%p "
+                        "shca->maxmr=%p mr->lkey=%x",
+                        mr, shca->maxmr, mr->lkey);
+               ret = -EINVAL;
+               goto dereg_mr_exit0;
+       }
+
+       /* TODO: BUSY: MR still has bound window(s) */
+       h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_mr);
+       if (h_ret != H_SUCCESS) {
+               ehca_err(mr->device, "hipz_free_mr failed, h_ret=%lli shca=%p "
+                        "e_mr=%p hca_hndl=%llx mr_hndl=%llx mr->lkey=%x",
+                        h_ret, shca, e_mr, shca->ipz_hca_handle.handle,
+                        e_mr->ipz_mr_handle.handle, mr->lkey);
+               ret = ehca2ib_return_code(h_ret);
+               goto dereg_mr_exit0;
+       }
+
+       if (e_mr->umem)
+               ib_umem_release(e_mr->umem);
+
+       /* successful deregistration */
+       ehca_mr_delete(e_mr);
+
+dereg_mr_exit0:
+       if (ret)
+               ehca_err(mr->device, "ret=%i mr=%p", ret, mr);
+       return ret;
+} /* end ehca_dereg_mr() */
+
+/*----------------------------------------------------------------------*/
+
+struct ib_mw *ehca_alloc_mw(struct ib_pd *pd, enum ib_mw_type type)
+{
+       struct ib_mw *ib_mw;
+       u64 h_ret;
+       struct ehca_mw *e_mw;
+       struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd);
+       struct ehca_shca *shca =
+               container_of(pd->device, struct ehca_shca, ib_device);
+       struct ehca_mw_hipzout_parms hipzout;
+
+       if (type != IB_MW_TYPE_1)
+               return ERR_PTR(-EINVAL);
+
+       e_mw = ehca_mw_new();
+       if (!e_mw) {
+               ib_mw = ERR_PTR(-ENOMEM);
+               goto alloc_mw_exit0;
+       }
+
+       h_ret = hipz_h_alloc_resource_mw(shca->ipz_hca_handle, e_mw,
+                                        e_pd->fw_pd, &hipzout);
+       if (h_ret != H_SUCCESS) {
+               ehca_err(pd->device, "hipz_mw_allocate failed, h_ret=%lli "
+                        "shca=%p hca_hndl=%llx mw=%p",
+                        h_ret, shca, shca->ipz_hca_handle.handle, e_mw);
+               ib_mw = ERR_PTR(ehca2ib_return_code(h_ret));
+               goto alloc_mw_exit1;
+       }
+       /* successful MW allocation */
+       e_mw->ipz_mw_handle = hipzout.handle;
+       e_mw->ib_mw.rkey    = hipzout.rkey;
+       return &e_mw->ib_mw;
+
+alloc_mw_exit1:
+       ehca_mw_delete(e_mw);
+alloc_mw_exit0:
+       if (IS_ERR(ib_mw))
+               ehca_err(pd->device, "h_ret=%li pd=%p", PTR_ERR(ib_mw), pd);
+       return ib_mw;
+} /* end ehca_alloc_mw() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_bind_mw(struct ib_qp *qp,
+                struct ib_mw *mw,
+                struct ib_mw_bind *mw_bind)
+{
+       /* TODO: not supported up to now */
+       ehca_gen_err("bind MW currently not supported by HCAD");
+
+       return -EPERM;
+} /* end ehca_bind_mw() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_dealloc_mw(struct ib_mw *mw)
+{
+       u64 h_ret;
+       struct ehca_shca *shca =
+               container_of(mw->device, struct ehca_shca, ib_device);
+       struct ehca_mw *e_mw = container_of(mw, struct ehca_mw, ib_mw);
+
+       h_ret = hipz_h_free_resource_mw(shca->ipz_hca_handle, e_mw);
+       if (h_ret != H_SUCCESS) {
+               ehca_err(mw->device, "hipz_free_mw failed, h_ret=%lli shca=%p "
+                        "mw=%p rkey=%x hca_hndl=%llx mw_hndl=%llx",
+                        h_ret, shca, mw, mw->rkey, shca->ipz_hca_handle.handle,
+                        e_mw->ipz_mw_handle.handle);
+               return ehca2ib_return_code(h_ret);
+       }
+       /* successful deallocation */
+       ehca_mw_delete(e_mw);
+       return 0;
+} /* end ehca_dealloc_mw() */
+
+/*----------------------------------------------------------------------*/
+
+struct ib_fmr *ehca_alloc_fmr(struct ib_pd *pd,
+                             int mr_access_flags,
+                             struct ib_fmr_attr *fmr_attr)
+{
+       struct ib_fmr *ib_fmr;
+       struct ehca_shca *shca =
+               container_of(pd->device, struct ehca_shca, ib_device);
+       struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd);
+       struct ehca_mr *e_fmr;
+       int ret;
+       u32 tmp_lkey, tmp_rkey;
+       struct ehca_mr_pginfo pginfo;
+       u64 hw_pgsize;
+
+       /* check other parameters */
+       if (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) &&
+            !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) ||
+           ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
+            !(mr_access_flags & IB_ACCESS_LOCAL_WRITE))) {
+               /*
+                * Remote Write Access requires Local Write Access
+                * Remote Atomic Access requires Local Write Access
+                */
+               ehca_err(pd->device, "bad input values: mr_access_flags=%x",
+                        mr_access_flags);
+               ib_fmr = ERR_PTR(-EINVAL);
+               goto alloc_fmr_exit0;
+       }
+       if (mr_access_flags & IB_ACCESS_MW_BIND) {
+               ehca_err(pd->device, "bad input values: mr_access_flags=%x",
+                        mr_access_flags);
+               ib_fmr = ERR_PTR(-EINVAL);
+               goto alloc_fmr_exit0;
+       }
+       if ((fmr_attr->max_pages == 0) || (fmr_attr->max_maps == 0)) {
+               ehca_err(pd->device, "bad input values: fmr_attr->max_pages=%x "
+                        "fmr_attr->max_maps=%x fmr_attr->page_shift=%x",
+                        fmr_attr->max_pages, fmr_attr->max_maps,
+                        fmr_attr->page_shift);
+               ib_fmr = ERR_PTR(-EINVAL);
+               goto alloc_fmr_exit0;
+       }
+
+       hw_pgsize = 1 << fmr_attr->page_shift;
+       if (!(hw_pgsize & shca->hca_cap_mr_pgsize)) {
+               ehca_err(pd->device, "unsupported fmr_attr->page_shift=%x",
+                        fmr_attr->page_shift);
+               ib_fmr = ERR_PTR(-EINVAL);
+               goto alloc_fmr_exit0;
+       }
+
+       e_fmr = ehca_mr_new();
+       if (!e_fmr) {
+               ib_fmr = ERR_PTR(-ENOMEM);
+               goto alloc_fmr_exit0;
+       }
+       e_fmr->flags |= EHCA_MR_FLAG_FMR;
+
+       /* register MR on HCA */
+       memset(&pginfo, 0, sizeof(pginfo));
+       pginfo.hwpage_size = hw_pgsize;
+       /*
+        * pginfo.num_hwpages==0, ie register_rpages() will not be called
+        * but deferred to map_phys_fmr()
+        */
+       ret = ehca_reg_mr(shca, e_fmr, NULL,
+                         fmr_attr->max_pages * (1 << fmr_attr->page_shift),
+                         mr_access_flags, e_pd, &pginfo,
+                         &tmp_lkey, &tmp_rkey, EHCA_REG_MR);
+       if (ret) {
+               ib_fmr = ERR_PTR(ret);
+               goto alloc_fmr_exit1;
+       }
+
+       /* successful */
+       e_fmr->hwpage_size = hw_pgsize;
+       e_fmr->fmr_page_size = 1 << fmr_attr->page_shift;
+       e_fmr->fmr_max_pages = fmr_attr->max_pages;
+       e_fmr->fmr_max_maps = fmr_attr->max_maps;
+       e_fmr->fmr_map_cnt = 0;
+       return &e_fmr->ib.ib_fmr;
+
+alloc_fmr_exit1:
+       ehca_mr_delete(e_fmr);
+alloc_fmr_exit0:
+       return ib_fmr;
+} /* end ehca_alloc_fmr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_map_phys_fmr(struct ib_fmr *fmr,
+                     u64 *page_list,
+                     int list_len,
+                     u64 iova)
+{
+       int ret;
+       struct ehca_shca *shca =
+               container_of(fmr->device, struct ehca_shca, ib_device);
+       struct ehca_mr *e_fmr = container_of(fmr, struct ehca_mr, ib.ib_fmr);
+       struct ehca_pd *e_pd = container_of(fmr->pd, struct ehca_pd, ib_pd);
+       struct ehca_mr_pginfo pginfo;
+       u32 tmp_lkey, tmp_rkey;
+
+       if (!(e_fmr->flags & EHCA_MR_FLAG_FMR)) {
+               ehca_err(fmr->device, "not a FMR, e_fmr=%p e_fmr->flags=%x",
+                        e_fmr, e_fmr->flags);
+               ret = -EINVAL;
+               goto map_phys_fmr_exit0;
+       }
+       ret = ehca_fmr_check_page_list(e_fmr, page_list, list_len);
+       if (ret)
+               goto map_phys_fmr_exit0;
+       if (iova % e_fmr->fmr_page_size) {
+               /* only whole-numbered pages */
+               ehca_err(fmr->device, "bad iova, iova=%llx fmr_page_size=%x",
+                        iova, e_fmr->fmr_page_size);
+               ret = -EINVAL;
+               goto map_phys_fmr_exit0;
+       }
+       if (e_fmr->fmr_map_cnt >= e_fmr->fmr_max_maps) {
+               /* HCAD does not limit the maps, however trace this anyway */
+               ehca_info(fmr->device, "map limit exceeded, fmr=%p "
+                         "e_fmr->fmr_map_cnt=%x e_fmr->fmr_max_maps=%x",
+                         fmr, e_fmr->fmr_map_cnt, e_fmr->fmr_max_maps);
+       }
+
+       memset(&pginfo, 0, sizeof(pginfo));
+       pginfo.type = EHCA_MR_PGI_FMR;
+       pginfo.num_kpages = list_len;
+       pginfo.hwpage_size = e_fmr->hwpage_size;
+       pginfo.num_hwpages =
+               list_len * e_fmr->fmr_page_size / pginfo.hwpage_size;
+       pginfo.u.fmr.page_list = page_list;
+       pginfo.next_hwpage =
+               (iova & (e_fmr->fmr_page_size-1)) / pginfo.hwpage_size;
+       pginfo.u.fmr.fmr_pgsize = e_fmr->fmr_page_size;
+
+       ret = ehca_rereg_mr(shca, e_fmr, (u64 *)iova,
+                           list_len * e_fmr->fmr_page_size,
+                           e_fmr->acl, e_pd, &pginfo, &tmp_lkey, &tmp_rkey);
+       if (ret)
+               goto map_phys_fmr_exit0;
+
+       /* successful reregistration */
+       e_fmr->fmr_map_cnt++;
+       e_fmr->ib.ib_fmr.lkey = tmp_lkey;
+       e_fmr->ib.ib_fmr.rkey = tmp_rkey;
+       return 0;
+
+map_phys_fmr_exit0:
+       if (ret)
+               ehca_err(fmr->device, "ret=%i fmr=%p page_list=%p list_len=%x "
+                        "iova=%llx", ret, fmr, page_list, list_len, iova);
+       return ret;
+} /* end ehca_map_phys_fmr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_unmap_fmr(struct list_head *fmr_list)
+{
+       int ret = 0;
+       struct ib_fmr *ib_fmr;
+       struct ehca_shca *shca = NULL;
+       struct ehca_shca *prev_shca;
+       struct ehca_mr *e_fmr;
+       u32 num_fmr = 0;
+       u32 unmap_fmr_cnt = 0;
+
+       /* check all FMR belong to same SHCA, and check internal flag */
+       list_for_each_entry(ib_fmr, fmr_list, list) {
+               prev_shca = shca;
+               shca = container_of(ib_fmr->device, struct ehca_shca,
+                                   ib_device);
+               e_fmr = container_of(ib_fmr, struct ehca_mr, ib.ib_fmr);
+               if ((shca != prev_shca) && prev_shca) {
+                       ehca_err(&shca->ib_device, "SHCA mismatch, shca=%p "
+                                "prev_shca=%p e_fmr=%p",
+                                shca, prev_shca, e_fmr);
+                       ret = -EINVAL;
+                       goto unmap_fmr_exit0;
+               }
+               if (!(e_fmr->flags & EHCA_MR_FLAG_FMR)) {
+                       ehca_err(&shca->ib_device, "not a FMR, e_fmr=%p "
+                                "e_fmr->flags=%x", e_fmr, e_fmr->flags);
+                       ret = -EINVAL;
+                       goto unmap_fmr_exit0;
+               }
+               num_fmr++;
+       }
+
+       /* loop over all FMRs to unmap */
+       list_for_each_entry(ib_fmr, fmr_list, list) {
+               unmap_fmr_cnt++;
+               e_fmr = container_of(ib_fmr, struct ehca_mr, ib.ib_fmr);
+               shca = container_of(ib_fmr->device, struct ehca_shca,
+                                   ib_device);
+               ret = ehca_unmap_one_fmr(shca, e_fmr);
+               if (ret) {
+                       /* unmap failed, stop unmapping of rest of FMRs */
+                       ehca_err(&shca->ib_device, "unmap of one FMR failed, "
+                                "stop rest, e_fmr=%p num_fmr=%x "
+                                "unmap_fmr_cnt=%x lkey=%x", e_fmr, num_fmr,
+                                unmap_fmr_cnt, e_fmr->ib.ib_fmr.lkey);
+                       goto unmap_fmr_exit0;
+               }
+       }
+
+unmap_fmr_exit0:
+       if (ret)
+               ehca_gen_err("ret=%i fmr_list=%p num_fmr=%x unmap_fmr_cnt=%x",
+                            ret, fmr_list, num_fmr, unmap_fmr_cnt);
+       return ret;
+} /* end ehca_unmap_fmr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_dealloc_fmr(struct ib_fmr *fmr)
+{
+       int ret;
+       u64 h_ret;
+       struct ehca_shca *shca =
+               container_of(fmr->device, struct ehca_shca, ib_device);
+       struct ehca_mr *e_fmr = container_of(fmr, struct ehca_mr, ib.ib_fmr);
+
+       if (!(e_fmr->flags & EHCA_MR_FLAG_FMR)) {
+               ehca_err(fmr->device, "not a FMR, e_fmr=%p e_fmr->flags=%x",
+                        e_fmr, e_fmr->flags);
+               ret = -EINVAL;
+               goto free_fmr_exit0;
+       }
+
+       h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_fmr);
+       if (h_ret != H_SUCCESS) {
+               ehca_err(fmr->device, "hipz_free_mr failed, h_ret=%lli e_fmr=%p "
+                        "hca_hndl=%llx fmr_hndl=%llx fmr->lkey=%x",
+                        h_ret, e_fmr, shca->ipz_hca_handle.handle,
+                        e_fmr->ipz_mr_handle.handle, fmr->lkey);
+               ret = ehca2ib_return_code(h_ret);
+               goto free_fmr_exit0;
+       }
+       /* successful deregistration */
+       ehca_mr_delete(e_fmr);
+       return 0;
+
+free_fmr_exit0:
+       if (ret)
+               ehca_err(&shca->ib_device, "ret=%i fmr=%p", ret, fmr);
+       return ret;
+} /* end ehca_dealloc_fmr() */
+
+/*----------------------------------------------------------------------*/
+
+static int ehca_reg_bmap_mr_rpages(struct ehca_shca *shca,
+                                  struct ehca_mr *e_mr,
+                                  struct ehca_mr_pginfo *pginfo);
+
+int ehca_reg_mr(struct ehca_shca *shca,
+               struct ehca_mr *e_mr,
+               u64 *iova_start,
+               u64 size,
+               int acl,
+               struct ehca_pd *e_pd,
+               struct ehca_mr_pginfo *pginfo,
+               u32 *lkey, /*OUT*/
+               u32 *rkey, /*OUT*/
+               enum ehca_reg_type reg_type)
+{
+       int ret;
+       u64 h_ret;
+       u32 hipz_acl;
+       struct ehca_mr_hipzout_parms hipzout;
+
+       ehca_mrmw_map_acl(acl, &hipz_acl);
+       ehca_mrmw_set_pgsize_hipz_acl(pginfo->hwpage_size, &hipz_acl);
+       if (ehca_use_hp_mr == 1)
+               hipz_acl |= 0x00000001;
+
+       h_ret = hipz_h_alloc_resource_mr(shca->ipz_hca_handle, e_mr,
+                                        (u64)iova_start, size, hipz_acl,
+                                        e_pd->fw_pd, &hipzout);
+       if (h_ret != H_SUCCESS) {
+               ehca_err(&shca->ib_device, "hipz_alloc_mr failed, h_ret=%lli "
+                        "hca_hndl=%llx", h_ret, shca->ipz_hca_handle.handle);
+               ret = ehca2ib_return_code(h_ret);
+               goto ehca_reg_mr_exit0;
+       }
+
+       e_mr->ipz_mr_handle = hipzout.handle;
+
+       if (reg_type == EHCA_REG_BUSMAP_MR)
+               ret = ehca_reg_bmap_mr_rpages(shca, e_mr, pginfo);
+       else if (reg_type == EHCA_REG_MR)
+               ret = ehca_reg_mr_rpages(shca, e_mr, pginfo);
+       else
+               ret = -EINVAL;
+
+       if (ret)
+               goto ehca_reg_mr_exit1;
+
+       /* successful registration */
+       e_mr->num_kpages = pginfo->num_kpages;
+       e_mr->num_hwpages = pginfo->num_hwpages;
+       e_mr->hwpage_size = pginfo->hwpage_size;
+       e_mr->start = iova_start;
+       e_mr->size = size;
+       e_mr->acl = acl;
+       *lkey = hipzout.lkey;
+       *rkey = hipzout.rkey;
+       return 0;
+
+ehca_reg_mr_exit1:
+       h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_mr);
+       if (h_ret != H_SUCCESS) {
+               ehca_err(&shca->ib_device, "h_ret=%lli shca=%p e_mr=%p "
+                        "iova_start=%p size=%llx acl=%x e_pd=%p lkey=%x "
+                        "pginfo=%p num_kpages=%llx num_hwpages=%llx ret=%i",
+                        h_ret, shca, e_mr, iova_start, size, acl, e_pd,
+                        hipzout.lkey, pginfo, pginfo->num_kpages,
+                        pginfo->num_hwpages, ret);
+               ehca_err(&shca->ib_device, "internal error in ehca_reg_mr, "
+                        "not recoverable");
+       }
+ehca_reg_mr_exit0:
+       if (ret)
+               ehca_err(&shca->ib_device, "ret=%i shca=%p e_mr=%p "
+                        "iova_start=%p size=%llx acl=%x e_pd=%p pginfo=%p "
+                        "num_kpages=%llx num_hwpages=%llx",
+                        ret, shca, e_mr, iova_start, size, acl, e_pd, pginfo,
+                        pginfo->num_kpages, pginfo->num_hwpages);
+       return ret;
+} /* end ehca_reg_mr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_reg_mr_rpages(struct ehca_shca *shca,
+                      struct ehca_mr *e_mr,
+                      struct ehca_mr_pginfo *pginfo)
+{
+       int ret = 0;
+       u64 h_ret;
+       u32 rnum;
+       u64 rpage;
+       u32 i;
+       u64 *kpage;
+
+       if (!pginfo->num_hwpages) /* in case of fmr */
+               return 0;
+
+       kpage = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+       if (!kpage) {
+               ehca_err(&shca->ib_device, "kpage alloc failed");
+               ret = -ENOMEM;
+               goto ehca_reg_mr_rpages_exit0;
+       }
+
+       /* max MAX_RPAGES ehca mr pages per register call */
+       for (i = 0; i < NUM_CHUNKS(pginfo->num_hwpages, MAX_RPAGES); i++) {
+
+               if (i == NUM_CHUNKS(pginfo->num_hwpages, MAX_RPAGES) - 1) {
+                       rnum = pginfo->num_hwpages % MAX_RPAGES; /* last shot */
+                       if (rnum == 0)
+                               rnum = MAX_RPAGES;      /* last shot is full */
+               } else
+                       rnum = MAX_RPAGES;
+
+               ret = ehca_set_pagebuf(pginfo, rnum, kpage);
+               if (ret) {
+                       ehca_err(&shca->ib_device, "ehca_set_pagebuf "
+                                "bad rc, ret=%i rnum=%x kpage=%p",
+                                ret, rnum, kpage);
+                       goto ehca_reg_mr_rpages_exit1;
+               }
+
+               if (rnum > 1) {
+                       rpage = __pa(kpage);
+                       if (!rpage) {
+                               ehca_err(&shca->ib_device, "kpage=%p i=%x",
+                                        kpage, i);
+                               ret = -EFAULT;
+                               goto ehca_reg_mr_rpages_exit1;
+                       }
+               } else
+                       rpage = *kpage;
+
+               h_ret = hipz_h_register_rpage_mr(
+                       shca->ipz_hca_handle, e_mr,
+                       ehca_encode_hwpage_size(pginfo->hwpage_size),
+                       0, rpage, rnum);
+
+               if (i == NUM_CHUNKS(pginfo->num_hwpages, MAX_RPAGES) - 1) {
+                       /*
+                        * check for 'registration complete'==H_SUCCESS
+                        * and for 'page registered'==H_PAGE_REGISTERED
+                        */
+                       if (h_ret != H_SUCCESS) {
+                               ehca_err(&shca->ib_device, "last "
+                                        "hipz_reg_rpage_mr failed, h_ret=%lli "
+                                        "e_mr=%p i=%x hca_hndl=%llx mr_hndl=%llx"
+                                        " lkey=%x", h_ret, e_mr, i,
+                                        shca->ipz_hca_handle.handle,
+                                        e_mr->ipz_mr_handle.handle,
+                                        e_mr->ib.ib_mr.lkey);
+                               ret = ehca2ib_return_code(h_ret);
+                               break;
+                       } else
+                               ret = 0;
+               } else if (h_ret != H_PAGE_REGISTERED) {
+                       ehca_err(&shca->ib_device, "hipz_reg_rpage_mr failed, "
+                                "h_ret=%lli e_mr=%p i=%x lkey=%x hca_hndl=%llx "
+                                "mr_hndl=%llx", h_ret, e_mr, i,
+                                e_mr->ib.ib_mr.lkey,
+                                shca->ipz_hca_handle.handle,
+                                e_mr->ipz_mr_handle.handle);
+                       ret = ehca2ib_return_code(h_ret);
+                       break;
+               } else
+                       ret = 0;
+       } /* end for(i) */
+
+
+ehca_reg_mr_rpages_exit1:
+       ehca_free_fw_ctrlblock(kpage);
+ehca_reg_mr_rpages_exit0:
+       if (ret)
+               ehca_err(&shca->ib_device, "ret=%i shca=%p e_mr=%p pginfo=%p "
+                        "num_kpages=%llx num_hwpages=%llx", ret, shca, e_mr,
+                        pginfo, pginfo->num_kpages, pginfo->num_hwpages);
+       return ret;
+} /* end ehca_reg_mr_rpages() */
+
+/*----------------------------------------------------------------------*/
+
+inline int ehca_rereg_mr_rereg1(struct ehca_shca *shca,
+                               struct ehca_mr *e_mr,
+                               u64 *iova_start,
+                               u64 size,
+                               u32 acl,
+                               struct ehca_pd *e_pd,
+                               struct ehca_mr_pginfo *pginfo,
+                               u32 *lkey, /*OUT*/
+                               u32 *rkey) /*OUT*/
+{
+       int ret;
+       u64 h_ret;
+       u32 hipz_acl;
+       u64 *kpage;
+       u64 rpage;
+       struct ehca_mr_pginfo pginfo_save;
+       struct ehca_mr_hipzout_parms hipzout;
+
+       ehca_mrmw_map_acl(acl, &hipz_acl);
+       ehca_mrmw_set_pgsize_hipz_acl(pginfo->hwpage_size, &hipz_acl);
+
+       kpage = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+       if (!kpage) {
+               ehca_err(&shca->ib_device, "kpage alloc failed");
+               ret = -ENOMEM;
+               goto ehca_rereg_mr_rereg1_exit0;
+       }
+
+       pginfo_save = *pginfo;
+       ret = ehca_set_pagebuf(pginfo, pginfo->num_hwpages, kpage);
+       if (ret) {
+               ehca_err(&shca->ib_device, "set pagebuf failed, e_mr=%p "
+                        "pginfo=%p type=%x num_kpages=%llx num_hwpages=%llx "
+                        "kpage=%p", e_mr, pginfo, pginfo->type,
+                        pginfo->num_kpages, pginfo->num_hwpages, kpage);
+               goto ehca_rereg_mr_rereg1_exit1;
+       }
+       rpage = __pa(kpage);
+       if (!rpage) {
+               ehca_err(&shca->ib_device, "kpage=%p", kpage);
+               ret = -EFAULT;
+               goto ehca_rereg_mr_rereg1_exit1;
+       }
+       h_ret = hipz_h_reregister_pmr(shca->ipz_hca_handle, e_mr,
+                                     (u64)iova_start, size, hipz_acl,
+                                     e_pd->fw_pd, rpage, &hipzout);
+       if (h_ret != H_SUCCESS) {
+               /*
+                * reregistration unsuccessful, try it again with the 3 hCalls,
+                * e.g. this is required in case of H_MR_CONDITION
+                * (MW bound or MR is shared)
+                */
+               ehca_warn(&shca->ib_device, "hipz_h_reregister_pmr failed "
+                         "(Rereg1), h_ret=%lli e_mr=%p", h_ret, e_mr);
+               *pginfo = pginfo_save;
+               ret = -EAGAIN;
+       } else if ((u64 *)hipzout.vaddr != iova_start) {
+               ehca_err(&shca->ib_device, "PHYP changed iova_start in "
+                        "rereg_pmr, iova_start=%p iova_start_out=%llx e_mr=%p "
+                        "mr_handle=%llx lkey=%x lkey_out=%x", iova_start,
+                        hipzout.vaddr, e_mr, e_mr->ipz_mr_handle.handle,
+                        e_mr->ib.ib_mr.lkey, hipzout.lkey);
+               ret = -EFAULT;
+       } else {
+               /*
+                * successful reregistration
+                * note: start and start_out are identical for eServer HCAs
+                */
+               e_mr->num_kpages = pginfo->num_kpages;
+               e_mr->num_hwpages = pginfo->num_hwpages;
+               e_mr->hwpage_size = pginfo->hwpage_size;
+               e_mr->start = iova_start;
+               e_mr->size = size;
+               e_mr->acl = acl;
+               *lkey = hipzout.lkey;
+               *rkey = hipzout.rkey;
+       }
+
+ehca_rereg_mr_rereg1_exit1:
+       ehca_free_fw_ctrlblock(kpage);
+ehca_rereg_mr_rereg1_exit0:
+       if (ret && (ret != -EAGAIN))
+               ehca_err(&shca->ib_device, "ret=%i lkey=%x rkey=%x "
+                        "pginfo=%p num_kpages=%llx num_hwpages=%llx",
+                        ret, *lkey, *rkey, pginfo, pginfo->num_kpages,
+                        pginfo->num_hwpages);
+       return ret;
+} /* end ehca_rereg_mr_rereg1() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_rereg_mr(struct ehca_shca *shca,
+                 struct ehca_mr *e_mr,
+                 u64 *iova_start,
+                 u64 size,
+                 int acl,
+                 struct ehca_pd *e_pd,
+                 struct ehca_mr_pginfo *pginfo,
+                 u32 *lkey,
+                 u32 *rkey)
+{
+       int ret = 0;
+       u64 h_ret;
+       int rereg_1_hcall = 1; /* 1: use hipz_h_reregister_pmr directly */
+       int rereg_3_hcall = 0; /* 1: use 3 hipz calls for reregistration */
+
+       /* first determine reregistration hCall(s) */
+       if ((pginfo->num_hwpages > MAX_RPAGES) ||
+           (e_mr->num_hwpages > MAX_RPAGES) ||
+           (pginfo->num_hwpages > e_mr->num_hwpages)) {
+               ehca_dbg(&shca->ib_device, "Rereg3 case, "
+                        "pginfo->num_hwpages=%llx e_mr->num_hwpages=%x",
+                        pginfo->num_hwpages, e_mr->num_hwpages);
+               rereg_1_hcall = 0;
+               rereg_3_hcall = 1;
+       }
+
+       if (e_mr->flags & EHCA_MR_FLAG_MAXMR) { /* check for max-MR */
+               rereg_1_hcall = 0;
+               rereg_3_hcall = 1;
+               e_mr->flags &= ~EHCA_MR_FLAG_MAXMR;
+               ehca_err(&shca->ib_device, "Rereg MR for max-MR! e_mr=%p",
+                        e_mr);
+       }
+
+       if (rereg_1_hcall) {
+               ret = ehca_rereg_mr_rereg1(shca, e_mr, iova_start, size,
+                                          acl, e_pd, pginfo, lkey, rkey);
+               if (ret) {
+                       if (ret == -EAGAIN)
+                               rereg_3_hcall = 1;
+                       else
+                               goto ehca_rereg_mr_exit0;
+               }
+       }
+
+       if (rereg_3_hcall) {
+               struct ehca_mr save_mr;
+
+               /* first deregister old MR */
+               h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_mr);
+               if (h_ret != H_SUCCESS) {
+                       ehca_err(&shca->ib_device, "hipz_free_mr failed, "
+                                "h_ret=%lli e_mr=%p hca_hndl=%llx mr_hndl=%llx "
+                                "mr->lkey=%x",
+                                h_ret, e_mr, shca->ipz_hca_handle.handle,
+                                e_mr->ipz_mr_handle.handle,
+                                e_mr->ib.ib_mr.lkey);
+                       ret = ehca2ib_return_code(h_ret);
+                       goto ehca_rereg_mr_exit0;
+               }
+               /* clean ehca_mr_t, without changing struct ib_mr and lock */
+               save_mr = *e_mr;
+               ehca_mr_deletenew(e_mr);
+
+               /* set some MR values */
+               e_mr->flags = save_mr.flags;
+               e_mr->hwpage_size = save_mr.hwpage_size;
+               e_mr->fmr_page_size = save_mr.fmr_page_size;
+               e_mr->fmr_max_pages = save_mr.fmr_max_pages;
+               e_mr->fmr_max_maps = save_mr.fmr_max_maps;
+               e_mr->fmr_map_cnt = save_mr.fmr_map_cnt;
+
+               ret = ehca_reg_mr(shca, e_mr, iova_start, size, acl,
+                                 e_pd, pginfo, lkey, rkey, EHCA_REG_MR);
+               if (ret) {
+                       u32 offset = (u64)(&e_mr->flags) - (u64)e_mr;
+                       memcpy(&e_mr->flags, &(save_mr.flags),
+                              sizeof(struct ehca_mr) - offset);
+                       goto ehca_rereg_mr_exit0;
+               }
+       }
+
+ehca_rereg_mr_exit0:
+       if (ret)
+               ehca_err(&shca->ib_device, "ret=%i shca=%p e_mr=%p "
+                        "iova_start=%p size=%llx acl=%x e_pd=%p pginfo=%p "
+                        "num_kpages=%llx lkey=%x rkey=%x rereg_1_hcall=%x "
+                        "rereg_3_hcall=%x", ret, shca, e_mr, iova_start, size,
+                        acl, e_pd, pginfo, pginfo->num_kpages, *lkey, *rkey,
+                        rereg_1_hcall, rereg_3_hcall);
+       return ret;
+} /* end ehca_rereg_mr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_unmap_one_fmr(struct ehca_shca *shca,
+                      struct ehca_mr *e_fmr)
+{
+       int ret = 0;
+       u64 h_ret;
+       struct ehca_pd *e_pd =
+               container_of(e_fmr->ib.ib_fmr.pd, struct ehca_pd, ib_pd);
+       struct ehca_mr save_fmr;
+       u32 tmp_lkey, tmp_rkey;
+       struct ehca_mr_pginfo pginfo;
+       struct ehca_mr_hipzout_parms hipzout;
+
+       if (e_fmr->fmr_max_pages <= MAX_RPAGES) {
+               /*
+                * note: after using rereg hcall with len=0,
+                * rereg hcall must be used again for registering pages
+                */
+               h_ret = hipz_h_reregister_pmr(shca->ipz_hca_handle, e_fmr, 0,
+                                             0, 0, e_pd->fw_pd, 0, &hipzout);
+               if (h_ret == H_SUCCESS) {
+                       /* successful reregistration */
+                       e_fmr->start = NULL;
+                       e_fmr->size = 0;
+                       tmp_lkey = hipzout.lkey;
+                       tmp_rkey = hipzout.rkey;
+                       return 0;
+               }
+               /*
+                * should not happen, because length checked above,
+                * FMRs are not shared and no MW bound to FMRs
+                */
+               ehca_err(&shca->ib_device, "hipz_reregister_pmr failed "
+                        "(Rereg1), h_ret=%lli e_fmr=%p hca_hndl=%llx "
+                        "mr_hndl=%llx lkey=%x lkey_out=%x",
+                        h_ret, e_fmr, shca->ipz_hca_handle.handle,
+                        e_fmr->ipz_mr_handle.handle,
+                        e_fmr->ib.ib_fmr.lkey, hipzout.lkey);
+               /* try free and rereg */
+       }
+
+       /* first free old FMR */
+       h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_fmr);
+       if (h_ret != H_SUCCESS) {
+               ehca_err(&shca->ib_device, "hipz_free_mr failed, "
+                        "h_ret=%lli e_fmr=%p hca_hndl=%llx mr_hndl=%llx "
+                        "lkey=%x",
+                        h_ret, e_fmr, shca->ipz_hca_handle.handle,
+                        e_fmr->ipz_mr_handle.handle,
+                        e_fmr->ib.ib_fmr.lkey);
+               ret = ehca2ib_return_code(h_ret);
+               goto ehca_unmap_one_fmr_exit0;
+       }
+       /* clean ehca_mr_t, without changing lock */
+       save_fmr = *e_fmr;
+       ehca_mr_deletenew(e_fmr);
+
+       /* set some MR values */
+       e_fmr->flags = save_fmr.flags;
+       e_fmr->hwpage_size = save_fmr.hwpage_size;
+       e_fmr->fmr_page_size = save_fmr.fmr_page_size;
+       e_fmr->fmr_max_pages = save_fmr.fmr_max_pages;
+       e_fmr->fmr_max_maps = save_fmr.fmr_max_maps;
+       e_fmr->fmr_map_cnt = save_fmr.fmr_map_cnt;
+       e_fmr->acl = save_fmr.acl;
+
+       memset(&pginfo, 0, sizeof(pginfo));
+       pginfo.type = EHCA_MR_PGI_FMR;
+       ret = ehca_reg_mr(shca, e_fmr, NULL,
+                         (e_fmr->fmr_max_pages * e_fmr->fmr_page_size),
+                         e_fmr->acl, e_pd, &pginfo, &tmp_lkey,
+                         &tmp_rkey, EHCA_REG_MR);
+       if (ret) {
+               u32 offset = (u64)(&e_fmr->flags) - (u64)e_fmr;
+               memcpy(&e_fmr->flags, &(save_fmr.flags),
+                      sizeof(struct ehca_mr) - offset);
+       }
+
+ehca_unmap_one_fmr_exit0:
+       if (ret)
+               ehca_err(&shca->ib_device, "ret=%i tmp_lkey=%x tmp_rkey=%x "
+                        "fmr_max_pages=%x",
+                        ret, tmp_lkey, tmp_rkey, e_fmr->fmr_max_pages);
+       return ret;
+} /* end ehca_unmap_one_fmr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_reg_smr(struct ehca_shca *shca,
+                struct ehca_mr *e_origmr,
+                struct ehca_mr *e_newmr,
+                u64 *iova_start,
+                int acl,
+                struct ehca_pd *e_pd,
+                u32 *lkey, /*OUT*/
+                u32 *rkey) /*OUT*/
+{
+       int ret = 0;
+       u64 h_ret;
+       u32 hipz_acl;
+       struct ehca_mr_hipzout_parms hipzout;
+
+       ehca_mrmw_map_acl(acl, &hipz_acl);
+       ehca_mrmw_set_pgsize_hipz_acl(e_origmr->hwpage_size, &hipz_acl);
+
+       h_ret = hipz_h_register_smr(shca->ipz_hca_handle, e_newmr, e_origmr,
+                                   (u64)iova_start, hipz_acl, e_pd->fw_pd,
+                                   &hipzout);
+       if (h_ret != H_SUCCESS) {
+               ehca_err(&shca->ib_device, "hipz_reg_smr failed, h_ret=%lli "
+                        "shca=%p e_origmr=%p e_newmr=%p iova_start=%p acl=%x "
+                        "e_pd=%p hca_hndl=%llx mr_hndl=%llx lkey=%x",
+                        h_ret, shca, e_origmr, e_newmr, iova_start, acl, e_pd,
+                        shca->ipz_hca_handle.handle,
+                        e_origmr->ipz_mr_handle.handle,
+                        e_origmr->ib.ib_mr.lkey);
+               ret = ehca2ib_return_code(h_ret);
+               goto ehca_reg_smr_exit0;
+       }
+       /* successful registration */
+       e_newmr->num_kpages = e_origmr->num_kpages;
+       e_newmr->num_hwpages = e_origmr->num_hwpages;
+       e_newmr->hwpage_size   = e_origmr->hwpage_size;
+       e_newmr->start = iova_start;
+       e_newmr->size = e_origmr->size;
+       e_newmr->acl = acl;
+       e_newmr->ipz_mr_handle = hipzout.handle;
+       *lkey = hipzout.lkey;
+       *rkey = hipzout.rkey;
+       return 0;
+
+ehca_reg_smr_exit0:
+       if (ret)
+               ehca_err(&shca->ib_device, "ret=%i shca=%p e_origmr=%p "
+                        "e_newmr=%p iova_start=%p acl=%x e_pd=%p",
+                        ret, shca, e_origmr, e_newmr, iova_start, acl, e_pd);
+       return ret;
+} /* end ehca_reg_smr() */
+
+/*----------------------------------------------------------------------*/
+static inline void *ehca_calc_sectbase(int top, int dir, int idx)
+{
+       unsigned long ret = idx;
+       ret |= dir << EHCA_DIR_INDEX_SHIFT;
+       ret |= top << EHCA_TOP_INDEX_SHIFT;
+       return __va(ret << SECTION_SIZE_BITS);
+}
+
+#define ehca_bmap_valid(entry) \
+       ((u64)entry != (u64)EHCA_INVAL_ADDR)
+
+static u64 ehca_reg_mr_section(int top, int dir, int idx, u64 *kpage,
+                              struct ehca_shca *shca, struct ehca_mr *mr,
+                              struct ehca_mr_pginfo *pginfo)
+{
+       u64 h_ret = 0;
+       unsigned long page = 0;
+       u64 rpage = __pa(kpage);
+       int page_count;
+
+       void *sectbase = ehca_calc_sectbase(top, dir, idx);
+       if ((unsigned long)sectbase & (pginfo->hwpage_size - 1)) {
+               ehca_err(&shca->ib_device, "reg_mr_section will probably fail: "
+                                          "hwpage_size does not fit the "
+                                          "section start address");
+       }
+       page_count = EHCA_SECTSIZE / pginfo->hwpage_size;
+
+       while (page < page_count) {
+               u64 rnum;
+               for (rnum = 0; (rnum < MAX_RPAGES) && (page < page_count);
+                    rnum++) {
+                       void *pg = sectbase + ((page++) * pginfo->hwpage_size);
+                       kpage[rnum] = __pa(pg);
+               }
+
+               h_ret = hipz_h_register_rpage_mr(shca->ipz_hca_handle, mr,
+                       ehca_encode_hwpage_size(pginfo->hwpage_size),
+                       0, rpage, rnum);
+
+               if ((h_ret != H_SUCCESS) && (h_ret != H_PAGE_REGISTERED)) {
+                       ehca_err(&shca->ib_device, "register_rpage_mr failed");
+                       return h_ret;
+               }
+       }
+       return h_ret;
+}
+
+static u64 ehca_reg_mr_sections(int top, int dir, u64 *kpage,
+                               struct ehca_shca *shca, struct ehca_mr *mr,
+                               struct ehca_mr_pginfo *pginfo)
+{
+       u64 hret = H_SUCCESS;
+       int idx;
+
+       for (idx = 0; idx < EHCA_MAP_ENTRIES; idx++) {
+               if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]->ent[idx]))
+                       continue;
+
+               hret = ehca_reg_mr_section(top, dir, idx, kpage, shca, mr,
+                                          pginfo);
+               if ((hret != H_SUCCESS) && (hret != H_PAGE_REGISTERED))
+                       return hret;
+       }
+       return hret;
+}
+
+static u64 ehca_reg_mr_dir_sections(int top, u64 *kpage, struct ehca_shca *shca,
+                                   struct ehca_mr *mr,
+                                   struct ehca_mr_pginfo *pginfo)
+{
+       u64 hret = H_SUCCESS;
+       int dir;
+
+       for (dir = 0; dir < EHCA_MAP_ENTRIES; dir++) {
+               if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]))
+                       continue;
+
+               hret = ehca_reg_mr_sections(top, dir, kpage, shca, mr, pginfo);
+               if ((hret != H_SUCCESS) && (hret != H_PAGE_REGISTERED))
+                       return hret;
+       }
+       return hret;
+}
+
+/* register internal max-MR to internal SHCA */
+int ehca_reg_internal_maxmr(
+       struct ehca_shca *shca,
+       struct ehca_pd *e_pd,
+       struct ehca_mr **e_maxmr)  /*OUT*/
+{
+       int ret;
+       struct ehca_mr *e_mr;
+       u64 *iova_start;
+       u64 size_maxmr;
+       struct ehca_mr_pginfo pginfo;
+       struct ib_phys_buf ib_pbuf;
+       u32 num_kpages;
+       u32 num_hwpages;
+       u64 hw_pgsize;
+
+       if (!ehca_bmap) {
+               ret = -EFAULT;
+               goto ehca_reg_internal_maxmr_exit0;
+       }
+
+       e_mr = ehca_mr_new();
+       if (!e_mr) {
+               ehca_err(&shca->ib_device, "out of memory");
+               ret = -ENOMEM;
+               goto ehca_reg_internal_maxmr_exit0;
+       }
+       e_mr->flags |= EHCA_MR_FLAG_MAXMR;
+
+       /* register internal max-MR on HCA */
+       size_maxmr = ehca_mr_len;
+       iova_start = (u64 *)ehca_map_vaddr((void *)(KERNELBASE + PHYSICAL_START));
+       ib_pbuf.addr = 0;
+       ib_pbuf.size = size_maxmr;
+       num_kpages = NUM_CHUNKS(((u64)iova_start % PAGE_SIZE) + size_maxmr,
+                               PAGE_SIZE);
+       hw_pgsize = ehca_get_max_hwpage_size(shca);
+       num_hwpages = NUM_CHUNKS(((u64)iova_start % hw_pgsize) + size_maxmr,
+                                hw_pgsize);
+
+       memset(&pginfo, 0, sizeof(pginfo));
+       pginfo.type = EHCA_MR_PGI_PHYS;
+       pginfo.num_kpages = num_kpages;
+       pginfo.num_hwpages = num_hwpages;
+       pginfo.hwpage_size = hw_pgsize;
+       pginfo.u.phy.num_phys_buf = 1;
+       pginfo.u.phy.phys_buf_array = &ib_pbuf;
+
+       ret = ehca_reg_mr(shca, e_mr, iova_start, size_maxmr, 0, e_pd,
+                         &pginfo, &e_mr->ib.ib_mr.lkey,
+                         &e_mr->ib.ib_mr.rkey, EHCA_REG_BUSMAP_MR);
+       if (ret) {
+               ehca_err(&shca->ib_device, "reg of internal max MR failed, "
+                        "e_mr=%p iova_start=%p size_maxmr=%llx num_kpages=%x "
+                        "num_hwpages=%x", e_mr, iova_start, size_maxmr,
+                        num_kpages, num_hwpages);
+               goto ehca_reg_internal_maxmr_exit1;
+       }
+
+       /* successful registration of all pages */
+       e_mr->ib.ib_mr.device = e_pd->ib_pd.device;
+       e_mr->ib.ib_mr.pd = &e_pd->ib_pd;
+       e_mr->ib.ib_mr.uobject = NULL;
+       atomic_inc(&(e_pd->ib_pd.usecnt));
+       atomic_set(&(e_mr->ib.ib_mr.usecnt), 0);
+       *e_maxmr = e_mr;
+       return 0;
+
+ehca_reg_internal_maxmr_exit1:
+       ehca_mr_delete(e_mr);
+ehca_reg_internal_maxmr_exit0:
+       if (ret)
+               ehca_err(&shca->ib_device, "ret=%i shca=%p e_pd=%p e_maxmr=%p",
+                        ret, shca, e_pd, e_maxmr);
+       return ret;
+} /* end ehca_reg_internal_maxmr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_reg_maxmr(struct ehca_shca *shca,
+                  struct ehca_mr *e_newmr,
+                  u64 *iova_start,
+                  int acl,
+                  struct ehca_pd *e_pd,
+                  u32 *lkey,
+                  u32 *rkey)
+{
+       u64 h_ret;
+       struct ehca_mr *e_origmr = shca->maxmr;
+       u32 hipz_acl;
+       struct ehca_mr_hipzout_parms hipzout;
+
+       ehca_mrmw_map_acl(acl, &hipz_acl);
+       ehca_mrmw_set_pgsize_hipz_acl(e_origmr->hwpage_size, &hipz_acl);
+
+       h_ret = hipz_h_register_smr(shca->ipz_hca_handle, e_newmr, e_origmr,
+                                   (u64)iova_start, hipz_acl, e_pd->fw_pd,
+                                   &hipzout);
+       if (h_ret != H_SUCCESS) {
+               ehca_err(&shca->ib_device, "hipz_reg_smr failed, h_ret=%lli "
+                        "e_origmr=%p hca_hndl=%llx mr_hndl=%llx lkey=%x",
+                        h_ret, e_origmr, shca->ipz_hca_handle.handle,
+                        e_origmr->ipz_mr_handle.handle,
+                        e_origmr->ib.ib_mr.lkey);
+               return ehca2ib_return_code(h_ret);
+       }
+       /* successful registration */
+       e_newmr->num_kpages = e_origmr->num_kpages;
+       e_newmr->num_hwpages = e_origmr->num_hwpages;
+       e_newmr->hwpage_size = e_origmr->hwpage_size;
+       e_newmr->start = iova_start;
+       e_newmr->size = e_origmr->size;
+       e_newmr->acl = acl;
+       e_newmr->ipz_mr_handle = hipzout.handle;
+       *lkey = hipzout.lkey;
+       *rkey = hipzout.rkey;
+       return 0;
+} /* end ehca_reg_maxmr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_dereg_internal_maxmr(struct ehca_shca *shca)
+{
+       int ret;
+       struct ehca_mr *e_maxmr;
+       struct ib_pd *ib_pd;
+
+       if (!shca->maxmr) {
+               ehca_err(&shca->ib_device, "bad call, shca=%p", shca);
+               ret = -EINVAL;
+               goto ehca_dereg_internal_maxmr_exit0;
+       }
+
+       e_maxmr = shca->maxmr;
+       ib_pd = e_maxmr->ib.ib_mr.pd;
+       shca->maxmr = NULL; /* remove internal max-MR indication from SHCA */
+
+       ret = ehca_dereg_mr(&e_maxmr->ib.ib_mr);
+       if (ret) {
+               ehca_err(&shca->ib_device, "dereg internal max-MR failed, "
+                        "ret=%i e_maxmr=%p shca=%p lkey=%x",
+                        ret, e_maxmr, shca, e_maxmr->ib.ib_mr.lkey);
+               shca->maxmr = e_maxmr;
+               goto ehca_dereg_internal_maxmr_exit0;
+       }
+
+       atomic_dec(&ib_pd->usecnt);
+
+ehca_dereg_internal_maxmr_exit0:
+       if (ret)
+               ehca_err(&shca->ib_device, "ret=%i shca=%p shca->maxmr=%p",
+                        ret, shca, shca->maxmr);
+       return ret;
+} /* end ehca_dereg_internal_maxmr() */
+
+/*----------------------------------------------------------------------*/
+
+/*
+ * check the physical buffer array of MR verbs for validity and
+ * calculate the MR size
+ */
+int ehca_mr_chk_buf_and_calc_size(struct ib_phys_buf *phys_buf_array,
+                                 int num_phys_buf,
+                                 u64 *iova_start,
+                                 u64 *size)
+{
+       struct ib_phys_buf *pbuf = phys_buf_array;
+       u64 size_count = 0;
+       u32 i;
+
+       if (num_phys_buf == 0) {
+               ehca_gen_err("bad phys buf array len, num_phys_buf=0");
+               return -EINVAL;
+       }
+       /* check first buffer */
+       if (((u64)iova_start & ~PAGE_MASK) != (pbuf->addr & ~PAGE_MASK)) {
+               ehca_gen_err("iova_start/addr mismatch, iova_start=%p "
+                            "pbuf->addr=%llx pbuf->size=%llx",
+                            iova_start, pbuf->addr, pbuf->size);
+               return -EINVAL;
+       }
+       if (((pbuf->addr + pbuf->size) % PAGE_SIZE) &&
+           (num_phys_buf > 1)) {
+               ehca_gen_err("addr/size mismatch in 1st buf, pbuf->addr=%llx "
+                            "pbuf->size=%llx", pbuf->addr, pbuf->size);
+               return -EINVAL;
+       }
+
+       for (i = 0; i < num_phys_buf; i++) {
+               if ((i > 0) && (pbuf->addr % PAGE_SIZE)) {
+                       ehca_gen_err("bad address, i=%x pbuf->addr=%llx "
+                                    "pbuf->size=%llx",
+                                    i, pbuf->addr, pbuf->size);
+                       return -EINVAL;
+               }
+               if (((i > 0) && /* not 1st */
+                    (i < (num_phys_buf - 1)) &&        /* not last */
+                    (pbuf->size % PAGE_SIZE)) || (pbuf->size == 0)) {
+                       ehca_gen_err("bad size, i=%x pbuf->size=%llx",
+                                    i, pbuf->size);
+                       return -EINVAL;
+               }
+               size_count += pbuf->size;
+               pbuf++;
+       }
+
+       *size = size_count;
+       return 0;
+} /* end ehca_mr_chk_buf_and_calc_size() */
+
+/*----------------------------------------------------------------------*/
+
+/* check the page list of the map FMR verb for validity */
+int ehca_fmr_check_page_list(struct ehca_mr *e_fmr,
+                            u64 *page_list,
+                            int list_len)
+{
+       u32 i;
+       u64 *page;
+
+       if ((list_len == 0) || (list_len > e_fmr->fmr_max_pages)) {
+               ehca_gen_err("bad list_len, list_len=%x "
+                            "e_fmr->fmr_max_pages=%x fmr=%p",
+                            list_len, e_fmr->fmr_max_pages, e_fmr);
+               return -EINVAL;
+       }
+
+       /* each page must be aligned */
+       page = page_list;
+       for (i = 0; i < list_len; i++) {
+               if (*page % e_fmr->fmr_page_size) {
+                       ehca_gen_err("bad page, i=%x *page=%llx page=%p fmr=%p "
+                                    "fmr_page_size=%x", i, *page, page, e_fmr,
+                                    e_fmr->fmr_page_size);
+                       return -EINVAL;
+               }
+               page++;
+       }
+
+       return 0;
+} /* end ehca_fmr_check_page_list() */
+
+/*----------------------------------------------------------------------*/
+
+/* PAGE_SIZE >= pginfo->hwpage_size */
+static int ehca_set_pagebuf_user1(struct ehca_mr_pginfo *pginfo,
+                                 u32 number,
+                                 u64 *kpage)
+{
+       int ret = 0;
+       u64 pgaddr;
+       u32 j = 0;
+       int hwpages_per_kpage = PAGE_SIZE / pginfo->hwpage_size;
+       struct scatterlist **sg = &pginfo->u.usr.next_sg;
+
+       while (*sg != NULL) {
+               pgaddr = page_to_pfn(sg_page(*sg))
+                       << PAGE_SHIFT;
+               *kpage = pgaddr + (pginfo->next_hwpage *
+                                  pginfo->hwpage_size);
+               if (!(*kpage)) {
+                       ehca_gen_err("pgaddr=%llx "
+                                    "sg_dma_address=%llx "
+                                    "entry=%llx next_hwpage=%llx",
+                                    pgaddr, (u64)sg_dma_address(*sg),
+                                    pginfo->u.usr.next_nmap,
+                                    pginfo->next_hwpage);
+                       return -EFAULT;
+               }
+               (pginfo->hwpage_cnt)++;
+               (pginfo->next_hwpage)++;
+               kpage++;
+               if (pginfo->next_hwpage % hwpages_per_kpage == 0) {
+                       (pginfo->kpage_cnt)++;
+                       (pginfo->u.usr.next_nmap)++;
+                       pginfo->next_hwpage = 0;
+                       *sg = sg_next(*sg);
+               }
+               j++;
+               if (j >= number)
+                       break;
+       }
+
+       return ret;
+}
+
+/*
+ * check the given pages for a contiguous layout;
+ * the last page addr is returned in prev_pgaddr for further checks
+ */
+static int ehca_check_kpages_per_ate(struct scatterlist **sg,
+                                    int num_pages,
+                                    u64 *prev_pgaddr)
+{
+       for (; *sg && num_pages > 0; *sg = sg_next(*sg), num_pages--) {
+               u64 pgaddr = page_to_pfn(sg_page(*sg)) << PAGE_SHIFT;
+               if (ehca_debug_level >= 3)
+                       ehca_gen_dbg("chunk_page=%llx value=%016llx", pgaddr,
+                                    *(u64 *)__va(pgaddr));
+               if (pgaddr - PAGE_SIZE != *prev_pgaddr) {
+                       ehca_gen_err("uncontiguous page found pgaddr=%llx "
+                                    "prev_pgaddr=%llx entries_left_in_hwpage=%x",
+                                    pgaddr, *prev_pgaddr, num_pages);
+                       return -EINVAL;
+               }
+               *prev_pgaddr = pgaddr;
+       }
+       return 0;
+}
+
+/* PAGE_SIZE < pginfo->hwpage_size */
+static int ehca_set_pagebuf_user2(struct ehca_mr_pginfo *pginfo,
+                                 u32 number,
+                                 u64 *kpage)
+{
+       int ret = 0;
+       u64 pgaddr, prev_pgaddr;
+       u32 j = 0;
+       int kpages_per_hwpage = pginfo->hwpage_size / PAGE_SIZE;
+       int nr_kpages = kpages_per_hwpage;
+       struct scatterlist **sg = &pginfo->u.usr.next_sg;
+
+       while (*sg != NULL) {
+
+               if (nr_kpages == kpages_per_hwpage) {
+                       pgaddr = (page_to_pfn(sg_page(*sg))
+                                  << PAGE_SHIFT);
+                       *kpage = pgaddr;
+                       if (!(*kpage)) {
+                               ehca_gen_err("pgaddr=%llx entry=%llx",
+                                            pgaddr, pginfo->u.usr.next_nmap);
+                               ret = -EFAULT;
+                               return ret;
+                       }
+                       /*
+                        * The first page in a hwpage must be aligned;
+                        * the first MR page is exempt from this rule.
+                        */
+                       if (pgaddr & (pginfo->hwpage_size - 1)) {
+                               if (pginfo->hwpage_cnt) {
+                                       ehca_gen_err(
+                                               "invalid alignment "
+                                               "pgaddr=%llx entry=%llx "
+                                               "mr_pgsize=%llx",
+                                               pgaddr, pginfo->u.usr.next_nmap,
+                                               pginfo->hwpage_size);
+                                       ret = -EFAULT;
+                                       return ret;
+                               }
+                               /* first MR page */
+                               pginfo->kpage_cnt =
+                                       (pgaddr &
+                                        (pginfo->hwpage_size - 1)) >>
+                                       PAGE_SHIFT;
+                               nr_kpages -= pginfo->kpage_cnt;
+                               *kpage = pgaddr &
+                                        ~(pginfo->hwpage_size - 1);
+                       }
+                       if (ehca_debug_level >= 3) {
+                               u64 val = *(u64 *)__va(pgaddr);
+                               ehca_gen_dbg("kpage=%llx page=%llx "
+                                            "value=%016llx",
+                                            *kpage, pgaddr, val);
+                       }
+                       prev_pgaddr = pgaddr;
+                       *sg = sg_next(*sg);
+                       pginfo->kpage_cnt++;
+                       pginfo->u.usr.next_nmap++;
+                       nr_kpages--;
+                       if (!nr_kpages)
+                               goto next_kpage;
+                       continue;
+               }
+
+               ret = ehca_check_kpages_per_ate(sg, nr_kpages,
+                                               &prev_pgaddr);
+               if (ret)
+                       return ret;
+               pginfo->kpage_cnt += nr_kpages;
+               pginfo->u.usr.next_nmap += nr_kpages;
+
+next_kpage:
+               nr_kpages = kpages_per_hwpage;
+               (pginfo->hwpage_cnt)++;
+               kpage++;
+               j++;
+               if (j >= number)
+                       break;
+       }
+
+       return ret;
+}
+
+static int ehca_set_pagebuf_phys(struct ehca_mr_pginfo *pginfo,
+                                u32 number, u64 *kpage)
+{
+       int ret = 0;
+       struct ib_phys_buf *pbuf;
+       u64 num_hw, offs_hw;
+       u32 i = 0;
+
+       /* loop over desired phys_buf_array entries */
+       while (i < number) {
+               pbuf   = pginfo->u.phy.phys_buf_array + pginfo->u.phy.next_buf;
+               num_hw  = NUM_CHUNKS((pbuf->addr % pginfo->hwpage_size) +
+                                    pbuf->size, pginfo->hwpage_size);
+               offs_hw = (pbuf->addr & ~(pginfo->hwpage_size - 1)) /
+                       pginfo->hwpage_size;
+               while (pginfo->next_hwpage < offs_hw + num_hw) {
+                       /* sanity check */
+                       if ((pginfo->kpage_cnt >= pginfo->num_kpages) ||
+                           (pginfo->hwpage_cnt >= pginfo->num_hwpages)) {
+                               ehca_gen_err("kpage_cnt >= num_kpages, "
+                                            "kpage_cnt=%llx num_kpages=%llx "
+                                            "hwpage_cnt=%llx "
+                                            "num_hwpages=%llx i=%x",
+                                            pginfo->kpage_cnt,
+                                            pginfo->num_kpages,
+                                            pginfo->hwpage_cnt,
+                                            pginfo->num_hwpages, i);
+                               return -EFAULT;
+                       }
+                       *kpage = (pbuf->addr & ~(pginfo->hwpage_size - 1)) +
+                                (pginfo->next_hwpage * pginfo->hwpage_size);
+                       if ( !(*kpage) && pbuf->addr ) {
+                               ehca_gen_err("pbuf->addr=%llx pbuf->size=%llx "
+                                            "next_hwpage=%llx", pbuf->addr,
+                                            pbuf->size, pginfo->next_hwpage);
+                               return -EFAULT;
+                       }
+                       (pginfo->hwpage_cnt)++;
+                       (pginfo->next_hwpage)++;
+                       if (PAGE_SIZE >= pginfo->hwpage_size) {
+                               if (pginfo->next_hwpage %
+                                   (PAGE_SIZE / pginfo->hwpage_size) == 0)
+                                       (pginfo->kpage_cnt)++;
+                       } else
+                               pginfo->kpage_cnt += pginfo->hwpage_size /
+                                       PAGE_SIZE;
+                       kpage++;
+                       i++;
+                       if (i >= number) break;
+               }
+               if (pginfo->next_hwpage >= offs_hw + num_hw) {
+                       (pginfo->u.phy.next_buf)++;
+                       pginfo->next_hwpage = 0;
+               }
+       }
+       return ret;
+}
+
+static int ehca_set_pagebuf_fmr(struct ehca_mr_pginfo *pginfo,
+                               u32 number, u64 *kpage)
+{
+       int ret = 0;
+       u64 *fmrlist;
+       u32 i;
+
+       /* loop over desired page_list entries */
+       fmrlist = pginfo->u.fmr.page_list + pginfo->u.fmr.next_listelem;
+       for (i = 0; i < number; i++) {
+               *kpage = (*fmrlist & ~(pginfo->hwpage_size - 1)) +
+                          pginfo->next_hwpage * pginfo->hwpage_size;
+               if ( !(*kpage) ) {
+                       ehca_gen_err("*fmrlist=%llx fmrlist=%p "
+                                    "next_listelem=%llx next_hwpage=%llx",
+                                    *fmrlist, fmrlist,
+                                    pginfo->u.fmr.next_listelem,
+                                    pginfo->next_hwpage);
+                       return -EFAULT;
+               }
+               (pginfo->hwpage_cnt)++;
+               if (pginfo->u.fmr.fmr_pgsize >= pginfo->hwpage_size) {
+                       if (pginfo->next_hwpage %
+                           (pginfo->u.fmr.fmr_pgsize /
+                            pginfo->hwpage_size) == 0) {
+                               (pginfo->kpage_cnt)++;
+                               (pginfo->u.fmr.next_listelem)++;
+                               fmrlist++;
+                               pginfo->next_hwpage = 0;
+                       } else
+                               (pginfo->next_hwpage)++;
+               } else {
+                       unsigned int cnt_per_hwpage = pginfo->hwpage_size /
+                               pginfo->u.fmr.fmr_pgsize;
+                       unsigned int j;
+                       u64 prev = *kpage;
+                       /* check if addresses are contiguous */
+                       for (j = 1; j < cnt_per_hwpage; j++) {
+                               u64 p = fmrlist[j] & ~(pginfo->hwpage_size - 1);
+                               if (prev + pginfo->u.fmr.fmr_pgsize != p) {
+                                       ehca_gen_err("uncontiguous fmr pages "
+                                                    "found prev=%llx p=%llx "
+                                                    "idx=%x", prev, p, i + j);
+                                       return -EINVAL;
+                               }
+                               prev = p;
+                       }
+                       pginfo->kpage_cnt += cnt_per_hwpage;
+                       pginfo->u.fmr.next_listelem += cnt_per_hwpage;
+                       fmrlist += cnt_per_hwpage;
+               }
+               kpage++;
+       }
+       return ret;
+}
+
+/* setup page buffer from page info */
+int ehca_set_pagebuf(struct ehca_mr_pginfo *pginfo,
+                    u32 number,
+                    u64 *kpage)
+{
+       int ret;
+
+       switch (pginfo->type) {
+       case EHCA_MR_PGI_PHYS:
+               ret = ehca_set_pagebuf_phys(pginfo, number, kpage);
+               break;
+       case EHCA_MR_PGI_USER:
+               ret = PAGE_SIZE >= pginfo->hwpage_size ?
+                       ehca_set_pagebuf_user1(pginfo, number, kpage) :
+                       ehca_set_pagebuf_user2(pginfo, number, kpage);
+               break;
+       case EHCA_MR_PGI_FMR:
+               ret = ehca_set_pagebuf_fmr(pginfo, number, kpage);
+               break;
+       default:
+               ehca_gen_err("bad pginfo->type=%x", pginfo->type);
+               ret = -EFAULT;
+               break;
+       }
+       return ret;
+} /* end ehca_set_pagebuf() */
+
+/*----------------------------------------------------------------------*/
+
+/*
+ * check whether an MR is a max-MR, i.e. whether it covers all of memory;
+ * returns 1 if it is a max-MR, else 0
+ */
+int ehca_mr_is_maxmr(u64 size,
+                    u64 *iova_start)
+{
+       /* an MR is treated as a max-MR only if it satisfies the following: */
+       if ((size == ehca_mr_len) &&
+           (iova_start == (void *)ehca_map_vaddr((void *)(KERNELBASE + PHYSICAL_START)))) {
+               ehca_gen_dbg("this is a max-MR");
+               return 1;
+       } else
+               return 0;
+} /* end ehca_mr_is_maxmr() */
+
+/*----------------------------------------------------------------------*/
+
+/* map access control for MR/MW. This routine is used for MR and MW. */
+void ehca_mrmw_map_acl(int ib_acl,
+                      u32 *hipz_acl)
+{
+       *hipz_acl = 0;
+       if (ib_acl & IB_ACCESS_REMOTE_READ)
+               *hipz_acl |= HIPZ_ACCESSCTRL_R_READ;
+       if (ib_acl & IB_ACCESS_REMOTE_WRITE)
+               *hipz_acl |= HIPZ_ACCESSCTRL_R_WRITE;
+       if (ib_acl & IB_ACCESS_REMOTE_ATOMIC)
+               *hipz_acl |= HIPZ_ACCESSCTRL_R_ATOMIC;
+       if (ib_acl & IB_ACCESS_LOCAL_WRITE)
+               *hipz_acl |= HIPZ_ACCESSCTRL_L_WRITE;
+       if (ib_acl & IB_ACCESS_MW_BIND)
+               *hipz_acl |= HIPZ_ACCESSCTRL_MW_BIND;
+} /* end ehca_mrmw_map_acl() */
+
+/*----------------------------------------------------------------------*/
+
+/* sets page size in hipz access control for MR/MW. */
+void ehca_mrmw_set_pgsize_hipz_acl(u32 pgsize, u32 *hipz_acl) /*INOUT*/
+{
+       *hipz_acl |= (ehca_encode_hwpage_size(pgsize) << 24);
+} /* end ehca_mrmw_set_pgsize_hipz_acl() */
+
+/*----------------------------------------------------------------------*/
+
+/*
+ * reverse map access control for MR/MW.
+ * This routine is used for MR and MW.
+ */
+void ehca_mrmw_reverse_map_acl(const u32 *hipz_acl,
+                              int *ib_acl) /*OUT*/
+{
+       *ib_acl = 0;
+       if (*hipz_acl & HIPZ_ACCESSCTRL_R_READ)
+               *ib_acl |= IB_ACCESS_REMOTE_READ;
+       if (*hipz_acl & HIPZ_ACCESSCTRL_R_WRITE)
+               *ib_acl |= IB_ACCESS_REMOTE_WRITE;
+       if (*hipz_acl & HIPZ_ACCESSCTRL_R_ATOMIC)
+               *ib_acl |= IB_ACCESS_REMOTE_ATOMIC;
+       if (*hipz_acl & HIPZ_ACCESSCTRL_L_WRITE)
+               *ib_acl |= IB_ACCESS_LOCAL_WRITE;
+       if (*hipz_acl & HIPZ_ACCESSCTRL_MW_BIND)
+               *ib_acl |= IB_ACCESS_MW_BIND;
+} /* end ehca_mrmw_reverse_map_acl() */
+
+
+/*----------------------------------------------------------------------*/
+
+/*
+ * MR destructor and constructor
+ * used in Reregister MR verb, sets all fields in ehca_mr_t to 0,
+ * except struct ib_mr and spinlock
+ */
+void ehca_mr_deletenew(struct ehca_mr *mr)
+{
+       mr->flags = 0;
+       mr->num_kpages = 0;
+       mr->num_hwpages = 0;
+       mr->acl = 0;
+       mr->start = NULL;
+       mr->fmr_page_size = 0;
+       mr->fmr_max_pages = 0;
+       mr->fmr_max_maps = 0;
+       mr->fmr_map_cnt = 0;
+       memset(&mr->ipz_mr_handle, 0, sizeof(mr->ipz_mr_handle));
+       memset(&mr->galpas, 0, sizeof(mr->galpas));
+} /* end ehca_mr_deletenew() */
+
+int ehca_init_mrmw_cache(void)
+{
+       mr_cache = kmem_cache_create("ehca_cache_mr",
+                                    sizeof(struct ehca_mr), 0,
+                                    SLAB_HWCACHE_ALIGN,
+                                    NULL);
+       if (!mr_cache)
+               return -ENOMEM;
+       mw_cache = kmem_cache_create("ehca_cache_mw",
+                                    sizeof(struct ehca_mw), 0,
+                                    SLAB_HWCACHE_ALIGN,
+                                    NULL);
+       if (!mw_cache) {
+               kmem_cache_destroy(mr_cache);
+               mr_cache = NULL;
+               return -ENOMEM;
+       }
+       return 0;
+}
+
+void ehca_cleanup_mrmw_cache(void)
+{
+       if (mr_cache)
+               kmem_cache_destroy(mr_cache);
+       if (mw_cache)
+               kmem_cache_destroy(mw_cache);
+}
+
+static inline int ehca_init_top_bmap(struct ehca_top_bmap *ehca_top_bmap,
+                                    int dir)
+{
+       if (!ehca_bmap_valid(ehca_top_bmap->dir[dir])) {
+               ehca_top_bmap->dir[dir] =
+                       kmalloc(sizeof(struct ehca_dir_bmap), GFP_KERNEL);
+               if (!ehca_top_bmap->dir[dir])
+                       return -ENOMEM;
+               /* Set map block to 0xFF according to EHCA_INVAL_ADDR */
+               memset(ehca_top_bmap->dir[dir], 0xFF, EHCA_ENT_MAP_SIZE);
+       }
+       return 0;
+}
+
+static inline int ehca_init_bmap(struct ehca_bmap *ehca_bmap, int top, int dir)
+{
+       if (!ehca_bmap_valid(ehca_bmap->top[top])) {
+               ehca_bmap->top[top] =
+                       kmalloc(sizeof(struct ehca_top_bmap), GFP_KERNEL);
+               if (!ehca_bmap->top[top])
+                       return -ENOMEM;
+               /* Set map block to 0xFF according to EHCA_INVAL_ADDR */
+               memset(ehca_bmap->top[top], 0xFF, EHCA_DIR_MAP_SIZE);
+       }
+       return ehca_init_top_bmap(ehca_bmap->top[top], dir);
+}
+
+static inline int ehca_calc_index(unsigned long i, unsigned long s)
+{
+       return (i >> s) & EHCA_INDEX_MASK;
+}
+
+void ehca_destroy_busmap(void)
+{
+       int top, dir;
+
+       if (!ehca_bmap)
+               return;
+
+       for (top = 0; top < EHCA_MAP_ENTRIES; top++) {
+               if (!ehca_bmap_valid(ehca_bmap->top[top]))
+                       continue;
+               for (dir = 0; dir < EHCA_MAP_ENTRIES; dir++) {
+                       if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]))
+                               continue;
+
+                       kfree(ehca_bmap->top[top]->dir[dir]);
+               }
+
+               kfree(ehca_bmap->top[top]);
+       }
+
+       kfree(ehca_bmap);
+       ehca_bmap = NULL;
+}
+
+static int ehca_update_busmap(unsigned long pfn, unsigned long nr_pages)
+{
+       unsigned long i, start_section, end_section;
+       int top, dir, idx;
+
+       if (!nr_pages)
+               return 0;
+
+       if (!ehca_bmap) {
+               ehca_bmap = kmalloc(sizeof(struct ehca_bmap), GFP_KERNEL);
+               if (!ehca_bmap)
+                       return -ENOMEM;
+               /* Set map block to 0xFF according to EHCA_INVAL_ADDR */
+               memset(ehca_bmap, 0xFF, EHCA_TOP_MAP_SIZE);
+       }
+
+       start_section = (pfn * PAGE_SIZE) / EHCA_SECTSIZE;
+       end_section = ((pfn + nr_pages) * PAGE_SIZE) / EHCA_SECTSIZE;
+       for (i = start_section; i < end_section; i++) {
+               int ret;
+               top = ehca_calc_index(i, EHCA_TOP_INDEX_SHIFT);
+               dir = ehca_calc_index(i, EHCA_DIR_INDEX_SHIFT);
+               idx = i & EHCA_INDEX_MASK;
+
+               ret = ehca_init_bmap(ehca_bmap, top, dir);
+               if (ret) {
+                       ehca_destroy_busmap();
+                       return ret;
+               }
+               ehca_bmap->top[top]->dir[dir]->ent[idx] = ehca_mr_len;
+               ehca_mr_len += EHCA_SECTSIZE;
+       }
+       return 0;
+}
+
+static int ehca_is_hugepage(unsigned long pfn)
+{
+       int page_order;
+
+       if (pfn & EHCA_HUGEPAGE_PFN_MASK)
+               return 0;
+
+       page_order = compound_order(pfn_to_page(pfn));
+       if (page_order + PAGE_SHIFT != EHCA_HUGEPAGESHIFT)
+               return 0;
+
+       return 1;
+}
+
+static int ehca_create_busmap_callback(unsigned long initial_pfn,
+                                      unsigned long total_nr_pages, void *arg)
+{
+       int ret;
+       unsigned long pfn, start_pfn, end_pfn, nr_pages;
+
+       if ((total_nr_pages * PAGE_SIZE) < EHCA_HUGEPAGE_SIZE)
+               return ehca_update_busmap(initial_pfn, total_nr_pages);
+
+       /* Given chunk is >= 16GB -> check for hugepages */
+       start_pfn = initial_pfn;
+       end_pfn = initial_pfn + total_nr_pages;
+       pfn = start_pfn;
+
+       while (pfn < end_pfn) {
+               if (ehca_is_hugepage(pfn)) {
+                       /* Add mem found in front of the hugepage */
+                       nr_pages = pfn - start_pfn;
+                       ret = ehca_update_busmap(start_pfn, nr_pages);
+                       if (ret)
+                               return ret;
+                       /* Skip the hugepage */
+                       pfn += (EHCA_HUGEPAGE_SIZE / PAGE_SIZE);
+                       start_pfn = pfn;
+               } else
+                       pfn += (EHCA_SECTSIZE / PAGE_SIZE);
+       }
+
+       /* Add mem found after the hugepage(s) */
+       nr_pages = pfn - start_pfn;
+       return ehca_update_busmap(start_pfn, nr_pages);
+}
+
+int ehca_create_busmap(void)
+{
+       int ret;
+
+       ehca_mr_len = 0;
+       ret = walk_system_ram_range(0, 1ULL << MAX_PHYSMEM_BITS, NULL,
+                                  ehca_create_busmap_callback);
+       return ret;
+}
+
+static int ehca_reg_bmap_mr_rpages(struct ehca_shca *shca,
+                                  struct ehca_mr *e_mr,
+                                  struct ehca_mr_pginfo *pginfo)
+{
+       int top;
+       u64 hret, *kpage;
+
+       kpage = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+       if (!kpage) {
+               ehca_err(&shca->ib_device, "kpage alloc failed");
+               return -ENOMEM;
+       }
+       for (top = 0; top < EHCA_MAP_ENTRIES; top++) {
+               if (!ehca_bmap_valid(ehca_bmap->top[top]))
+                       continue;
+               hret = ehca_reg_mr_dir_sections(top, kpage, shca, e_mr, pginfo);
+               if ((hret != H_PAGE_REGISTERED) && (hret != H_SUCCESS))
+                       break;
+       }
+
+       ehca_free_fw_ctrlblock(kpage);
+
+       if (hret == H_SUCCESS)
+               return 0; /* Everything is fine */
+       else {
+               ehca_err(&shca->ib_device, "ehca_reg_bmap_mr_rpages failed, "
+                                "h_ret=%lli e_mr=%p top=%x lkey=%x "
+                                "hca_hndl=%llx mr_hndl=%llx", hret, e_mr, top,
+                                e_mr->ib.ib_mr.lkey,
+                                shca->ipz_hca_handle.handle,
+                                e_mr->ipz_mr_handle.handle);
+               return ehca2ib_return_code(hret);
+       }
+}
+
+static u64 ehca_map_vaddr(void *caddr)
+{
+       int top, dir, idx;
+       unsigned long abs_addr, offset;
+       u64 entry;
+
+       if (!ehca_bmap)
+               return EHCA_INVAL_ADDR;
+
+       abs_addr = __pa(caddr);
+       top = ehca_calc_index(abs_addr, EHCA_TOP_INDEX_SHIFT + EHCA_SECTSHIFT);
+       if (!ehca_bmap_valid(ehca_bmap->top[top]))
+               return EHCA_INVAL_ADDR;
+
+       dir = ehca_calc_index(abs_addr, EHCA_DIR_INDEX_SHIFT + EHCA_SECTSHIFT);
+       if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]))
+               return EHCA_INVAL_ADDR;
+
+       idx = ehca_calc_index(abs_addr, EHCA_SECTSHIFT);
+
+       entry = ehca_bmap->top[top]->dir[dir]->ent[idx];
+       if (ehca_bmap_valid(entry)) {
+               offset = (unsigned long)caddr & (EHCA_SECTSIZE - 1);
+               return entry | offset;
+       } else
+               return EHCA_INVAL_ADDR;
+}
+
+static int ehca_dma_mapping_error(struct ib_device *dev, u64 dma_addr)
+{
+       return dma_addr == EHCA_INVAL_ADDR;
+}
+
+static u64 ehca_dma_map_single(struct ib_device *dev, void *cpu_addr,
+                              size_t size, enum dma_data_direction direction)
+{
+       if (cpu_addr)
+               return ehca_map_vaddr(cpu_addr);
+       else
+               return EHCA_INVAL_ADDR;
+}
+
+static void ehca_dma_unmap_single(struct ib_device *dev, u64 addr, size_t size,
+                                 enum dma_data_direction direction)
+{
+       /* This is only a stub; nothing to be done here */
+}
+
+static u64 ehca_dma_map_page(struct ib_device *dev, struct page *page,
+                            unsigned long offset, size_t size,
+                            enum dma_data_direction direction)
+{
+       u64 addr;
+
+       if (offset + size > PAGE_SIZE)
+               return EHCA_INVAL_ADDR;
+
+       addr = ehca_map_vaddr(page_address(page));
+       if (!ehca_dma_mapping_error(dev, addr))
+               addr += offset;
+
+       return addr;
+}
+
+static void ehca_dma_unmap_page(struct ib_device *dev, u64 addr, size_t size,
+                               enum dma_data_direction direction)
+{
+       /* This is only a stub; nothing to be done here */
+}
+
+static int ehca_dma_map_sg(struct ib_device *dev, struct scatterlist *sgl,
+                          int nents, enum dma_data_direction direction)
+{
+       struct scatterlist *sg;
+       int i;
+
+       for_each_sg(sgl, sg, nents, i) {
+               u64 addr;
+               addr = ehca_map_vaddr(sg_virt(sg));
+               if (ehca_dma_mapping_error(dev, addr))
+                       return 0;
+
+               sg->dma_address = addr;
+               sg->dma_length = sg->length;
+       }
+       return nents;
+}
+
+static void ehca_dma_unmap_sg(struct ib_device *dev, struct scatterlist *sg,
+                             int nents, enum dma_data_direction direction)
+{
+       /* This is only a stub; nothing to be done here */
+}
+
+static void ehca_dma_sync_single_for_cpu(struct ib_device *dev, u64 addr,
+                                        size_t size,
+                                        enum dma_data_direction dir)
+{
+       dma_sync_single_for_cpu(dev->dma_device, addr, size, dir);
+}
+
+static void ehca_dma_sync_single_for_device(struct ib_device *dev, u64 addr,
+                                           size_t size,
+                                           enum dma_data_direction dir)
+{
+       dma_sync_single_for_device(dev->dma_device, addr, size, dir);
+}
+
+static void *ehca_dma_alloc_coherent(struct ib_device *dev, size_t size,
+                                    u64 *dma_handle, gfp_t flag)
+{
+       struct page *p;
+       void *addr = NULL;
+       u64 dma_addr;
+
+       p = alloc_pages(flag, get_order(size));
+       if (p) {
+               addr = page_address(p);
+               dma_addr = ehca_map_vaddr(addr);
+               if (ehca_dma_mapping_error(dev, dma_addr)) {
+                       free_pages((unsigned long)addr, get_order(size));
+                       return NULL;
+               }
+               if (dma_handle)
+                       *dma_handle = dma_addr;
+               return addr;
+       }
+       return NULL;
+}
+
+static void ehca_dma_free_coherent(struct ib_device *dev, size_t size,
+                                  void *cpu_addr, u64 dma_handle)
+{
+       if (cpu_addr && size)
+               free_pages((unsigned long)cpu_addr, get_order(size));
+}
+
+
+struct ib_dma_mapping_ops ehca_dma_mapping_ops = {
+       .mapping_error          = ehca_dma_mapping_error,
+       .map_single             = ehca_dma_map_single,
+       .unmap_single           = ehca_dma_unmap_single,
+       .map_page               = ehca_dma_map_page,
+       .unmap_page             = ehca_dma_unmap_page,
+       .map_sg                 = ehca_dma_map_sg,
+       .unmap_sg               = ehca_dma_unmap_sg,
+       .sync_single_for_cpu    = ehca_dma_sync_single_for_cpu,
+       .sync_single_for_device = ehca_dma_sync_single_for_device,
+       .alloc_coherent         = ehca_dma_alloc_coherent,
+       .free_coherent          = ehca_dma_free_coherent,
+};
diff --git a/drivers/staging/rdma/ehca/ehca_mrmw.h b/drivers/staging/rdma/ehca/ehca_mrmw.h
new file mode 100644 (file)
index 0000000..50d8b51
--- /dev/null
@@ -0,0 +1,132 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  MR/MW declarations and inline functions
+ *
+ *  Authors: Dietmar Decker <ddecker@de.ibm.com>
+ *           Christoph Raisch <raisch@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _EHCA_MRMW_H_
+#define _EHCA_MRMW_H_
+
+enum ehca_reg_type {
+       EHCA_REG_MR,
+       EHCA_REG_BUSMAP_MR
+};
+
+int ehca_reg_mr(struct ehca_shca *shca,
+               struct ehca_mr *e_mr,
+               u64 *iova_start,
+               u64 size,
+               int acl,
+               struct ehca_pd *e_pd,
+               struct ehca_mr_pginfo *pginfo,
+               u32 *lkey,
+               u32 *rkey,
+               enum ehca_reg_type reg_type);
+
+int ehca_reg_mr_rpages(struct ehca_shca *shca,
+                      struct ehca_mr *e_mr,
+                      struct ehca_mr_pginfo *pginfo);
+
+int ehca_rereg_mr(struct ehca_shca *shca,
+                 struct ehca_mr *e_mr,
+                 u64 *iova_start,
+                 u64 size,
+                 int mr_access_flags,
+                 struct ehca_pd *e_pd,
+                 struct ehca_mr_pginfo *pginfo,
+                 u32 *lkey,
+                 u32 *rkey);
+
+int ehca_unmap_one_fmr(struct ehca_shca *shca,
+                      struct ehca_mr *e_fmr);
+
+int ehca_reg_smr(struct ehca_shca *shca,
+                struct ehca_mr *e_origmr,
+                struct ehca_mr *e_newmr,
+                u64 *iova_start,
+                int acl,
+                struct ehca_pd *e_pd,
+                u32 *lkey,
+                u32 *rkey);
+
+int ehca_reg_internal_maxmr(struct ehca_shca *shca,
+                           struct ehca_pd *e_pd,
+                           struct ehca_mr **maxmr);
+
+int ehca_reg_maxmr(struct ehca_shca *shca,
+                  struct ehca_mr *e_newmr,
+                  u64 *iova_start,
+                  int acl,
+                  struct ehca_pd *e_pd,
+                  u32 *lkey,
+                  u32 *rkey);
+
+int ehca_dereg_internal_maxmr(struct ehca_shca *shca);
+
+int ehca_mr_chk_buf_and_calc_size(struct ib_phys_buf *phys_buf_array,
+                                 int num_phys_buf,
+                                 u64 *iova_start,
+                                 u64 *size);
+
+int ehca_fmr_check_page_list(struct ehca_mr *e_fmr,
+                            u64 *page_list,
+                            int list_len);
+
+int ehca_set_pagebuf(struct ehca_mr_pginfo *pginfo,
+                    u32 number,
+                    u64 *kpage);
+
+int ehca_mr_is_maxmr(u64 size,
+                    u64 *iova_start);
+
+void ehca_mrmw_map_acl(int ib_acl,
+                      u32 *hipz_acl);
+
+void ehca_mrmw_set_pgsize_hipz_acl(u32 pgsize, u32 *hipz_acl);
+
+void ehca_mrmw_reverse_map_acl(const u32 *hipz_acl,
+                              int *ib_acl);
+
+void ehca_mr_deletenew(struct ehca_mr *mr);
+
+int ehca_create_busmap(void);
+
+void ehca_destroy_busmap(void);
+
+extern struct ib_dma_mapping_ops ehca_dma_mapping_ops;
+#endif  /*_EHCA_MRMW_H_*/
diff --git a/drivers/staging/rdma/ehca/ehca_pd.c b/drivers/staging/rdma/ehca/ehca_pd.c
new file mode 100644 (file)
index 0000000..351577a
--- /dev/null
@@ -0,0 +1,124 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  PD functions
+ *
+ *  Authors: Christoph Raisch <raisch@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/slab.h>
+
+#include "ehca_tools.h"
+#include "ehca_iverbs.h"
+
+static struct kmem_cache *pd_cache;
+
+struct ib_pd *ehca_alloc_pd(struct ib_device *device,
+                           struct ib_ucontext *context, struct ib_udata *udata)
+{
+       struct ehca_pd *pd;
+       int i;
+
+       pd = kmem_cache_zalloc(pd_cache, GFP_KERNEL);
+       if (!pd) {
+               ehca_err(device, "device=%p context=%p out of memory",
+                        device, context);
+               return ERR_PTR(-ENOMEM);
+       }
+
+       for (i = 0; i < 2; i++) {
+               INIT_LIST_HEAD(&pd->free[i]);
+               INIT_LIST_HEAD(&pd->full[i]);
+       }
+       mutex_init(&pd->lock);
+
+       /*
+        * Kernel PD: when context == NULL
+        * User   PD: when context != NULL
+        */
+       if (!context) {
+               /*
+                * Kernel PDs created after init always reuse
+                * the one created in ehca_shca_reopen()
+                */
+               struct ehca_shca *shca = container_of(device, struct ehca_shca,
+                                                     ib_device);
+               pd->fw_pd.value = shca->pd->fw_pd.value;
+       } else
+               pd->fw_pd.value = (u64)pd;
+
+       return &pd->ib_pd;
+}
+
+int ehca_dealloc_pd(struct ib_pd *pd)
+{
+       struct ehca_pd *my_pd = container_of(pd, struct ehca_pd, ib_pd);
+       int i, leftovers = 0;
+       struct ipz_small_queue_page *page, *tmp;
+
+       for (i = 0; i < 2; i++) {
+               list_splice(&my_pd->full[i], &my_pd->free[i]);
+               list_for_each_entry_safe(page, tmp, &my_pd->free[i], list) {
+                       leftovers = 1;
+                       free_page(page->page);
+                       kmem_cache_free(small_qp_cache, page);
+               }
+       }
+
+       if (leftovers)
+               ehca_warn(pd->device,
+                         "Some small queue pages were not freed");
+
+       kmem_cache_free(pd_cache, my_pd);
+
+       return 0;
+}
+
+int ehca_init_pd_cache(void)
+{
+       pd_cache = kmem_cache_create("ehca_cache_pd",
+                                    sizeof(struct ehca_pd), 0,
+                                    SLAB_HWCACHE_ALIGN,
+                                    NULL);
+       if (!pd_cache)
+               return -ENOMEM;
+       return 0;
+}
+
+void ehca_cleanup_pd_cache(void)
+{
+       if (pd_cache)
+               kmem_cache_destroy(pd_cache);
+}
diff --git a/drivers/staging/rdma/ehca/ehca_qes.h b/drivers/staging/rdma/ehca/ehca_qes.h
new file mode 100644 (file)
index 0000000..90c4efa
--- /dev/null
@@ -0,0 +1,260 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  Hardware request structures
+ *
+ *  Authors: Waleri Fomin <fomin@de.ibm.com>
+ *           Reinhard Ernst <rernst@de.ibm.com>
+ *           Christoph Raisch <raisch@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef _EHCA_QES_H_
+#define _EHCA_QES_H_
+
+#include "ehca_tools.h"
+
+/* virtual scatter gather entry to specify remote addresses with length */
+struct ehca_vsgentry {
+       u64 vaddr;
+       u32 lkey;
+       u32 length;
+};
+
+#define GRH_FLAG_MASK        EHCA_BMASK_IBM( 7,  7)
+#define GRH_IPVERSION_MASK   EHCA_BMASK_IBM( 0,  3)
+#define GRH_TCLASS_MASK      EHCA_BMASK_IBM( 4, 12)
+#define GRH_FLOWLABEL_MASK   EHCA_BMASK_IBM(13, 31)
+#define GRH_PAYLEN_MASK      EHCA_BMASK_IBM(32, 47)
+#define GRH_NEXTHEADER_MASK  EHCA_BMASK_IBM(48, 55)
+#define GRH_HOPLIMIT_MASK    EHCA_BMASK_IBM(56, 63)
+
+/*
+ * Unreliable Datagram Address Vector Format
+ * see IBTA Vol1 chapter 8.3 Global Routing Header
+ */
+struct ehca_ud_av {
+       u8 sl;
+       u8 lnh;
+       u16 dlid;
+       u8 reserved1;
+       u8 reserved2;
+       u8 reserved3;
+       u8 slid_path_bits;
+       u8 reserved4;
+       u8 ipd;
+       u8 reserved5;
+       u8 pmtu;
+       u32 reserved6;
+       u64 reserved7;
+       union {
+               struct {
+                       u64 word_0; /* always set to 6  */
+                       /* should be 0x1B for IB transport */
+                       u64 word_1;
+                       u64 word_2;
+                       u64 word_3;
+                       u64 word_4;
+               } grh;
+               struct {
+                       u32 wd_0;
+                       u32 wd_1;
+                       /* DWord_1 --> SGID */
+
+                       u32 sgid_wd3;
+                       u32 sgid_wd2;
+
+                       u32 sgid_wd1;
+                       u32 sgid_wd0;
+                       /* DWord_3 --> DGID */
+
+                       u32 dgid_wd3;
+                       u32 dgid_wd2;
+
+                       u32 dgid_wd1;
+                       u32 dgid_wd0;
+               } grh_l;
+       };
+};
+
+/* maximum number of sg entries allowed in a WQE */
+#define MAX_WQE_SG_ENTRIES 252
+
+#define WQE_OPTYPE_SEND             0x80
+#define WQE_OPTYPE_RDMAREAD         0x40
+#define WQE_OPTYPE_RDMAWRITE        0x20
+#define WQE_OPTYPE_CMPSWAP          0x10
+#define WQE_OPTYPE_FETCHADD         0x08
+#define WQE_OPTYPE_BIND             0x04
+
+#define WQE_WRFLAG_REQ_SIGNAL_COM   0x80
+#define WQE_WRFLAG_FENCE            0x40
+#define WQE_WRFLAG_IMM_DATA_PRESENT 0x20
+#define WQE_WRFLAG_SOLIC_EVENT      0x10
+
+#define WQEF_CACHE_HINT             0x80
+#define WQEF_CACHE_HINT_RD_WR       0x40
+#define WQEF_TIMED_WQE              0x20
+#define WQEF_PURGE                  0x08
+#define WQEF_HIGH_NIBBLE            0xF0
+
+#define MW_BIND_ACCESSCTRL_R_WRITE   0x40
+#define MW_BIND_ACCESSCTRL_R_READ    0x20
+#define MW_BIND_ACCESSCTRL_R_ATOMIC  0x10
+
+struct ehca_wqe {
+       u64 work_request_id;
+       u8 optype;
+       u8 wr_flag;
+       u16 pkeyi;
+       u8 wqef;
+       u8 nr_of_data_seg;
+       u16 wqe_provided_slid;
+       u32 destination_qp_number;
+       u32 resync_psn_sqp;
+       u32 local_ee_context_qkey;
+       u32 immediate_data;
+       union {
+               struct {
+                       u64 remote_virtual_address;
+                       u32 rkey;
+                       u32 reserved;
+                       u64 atomic_1st_op_dma_len;
+                       u64 atomic_2nd_op;
+                       struct ehca_vsgentry sg_list[MAX_WQE_SG_ENTRIES];
+
+               } nud;
+               struct {
+                       u64 ehca_ud_av_ptr;
+                       u64 reserved1;
+                       u64 reserved2;
+                       u64 reserved3;
+                       struct ehca_vsgentry sg_list[MAX_WQE_SG_ENTRIES];
+               } ud_avp;
+               struct {
+                       struct ehca_ud_av ud_av;
+                       struct ehca_vsgentry sg_list[MAX_WQE_SG_ENTRIES -
+                                                    2];
+               } ud_av;
+               struct {
+                       u64 reserved0;
+                       u64 reserved1;
+                       u64 reserved2;
+                       u64 reserved3;
+                       struct ehca_vsgentry sg_list[MAX_WQE_SG_ENTRIES];
+               } all_rcv;
+
+               struct {
+                       u64 reserved;
+                       u32 rkey;
+                       u32 old_rkey;
+                       u64 reserved1;
+                       u64 reserved2;
+                       u64 virtual_address;
+                       u32 reserved3;
+                       u32 length;
+                       u32 reserved4;
+                       u16 reserved5;
+                       u8 reserved6;
+                       u8 lr_ctl;
+                       u32 lkey;
+                       u32 reserved7;
+                       u64 reserved8;
+                       u64 reserved9;
+                       u64 reserved10;
+                       u64 reserved11;
+               } bind;
+               struct {
+                       u64 reserved12;
+                       u64 reserved13;
+                       u32 size;
+                       u32 start;
+               } inline_data;
+       } u;
+
+};
+
+#define WC_SEND_RECEIVE EHCA_BMASK_IBM(0, 0)
+#define WC_IMM_DATA     EHCA_BMASK_IBM(1, 1)
+#define WC_GRH_PRESENT  EHCA_BMASK_IBM(2, 2)
+#define WC_SE_BIT       EHCA_BMASK_IBM(3, 3)
+#define WC_STATUS_ERROR_BIT 0x80000000
+#define WC_STATUS_REMOTE_ERROR_FLAGS 0x0000F800
+#define WC_STATUS_PURGE_BIT 0x10
+#define WC_SEND_RECEIVE_BIT 0x80
+
+struct ehca_cqe {
+       u64 work_request_id;
+       u8 optype;
+       u8 w_completion_flags;
+       u16 reserved1;
+       u32 nr_bytes_transferred;
+       u32 immediate_data;
+       u32 local_qp_number;
+       u8 freed_resource_count;
+       u8 service_level;
+       u16 wqe_count;
+       u32 qp_token;
+       u32 qkey_ee_token;
+       u32 remote_qp_number;
+       u16 dlid;
+       u16 rlid;
+       u16 reserved2;
+       u16 pkey_index;
+       u32 cqe_timestamp;
+       u32 wqe_timestamp;
+       u8 wqe_timestamp_valid;
+       u8 reserved3;
+       u8 reserved4;
+       u8 cqe_flags;
+       u32 status;
+};
+
+struct ehca_eqe {
+       u64 entry;
+};
+
+struct ehca_mrte {
+       u64 starting_va;
+       u64 length; /* length of memory region in bytes */
+       u32 pd;
+       u8 key_instance;
+       u8 pagesize;
+       u8 mr_control;
+       u8 local_remote_access_ctrl;
+       u8 reserved[0x20 - 0x18];
+       u64 at_pointer[4];
+};
+#endif /*_EHCA_QES_H_*/
diff --git a/drivers/staging/rdma/ehca/ehca_qp.c b/drivers/staging/rdma/ehca/ehca_qp.c
new file mode 100644 (file)
index 0000000..2e89356
--- /dev/null
@@ -0,0 +1,2257 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  QP functions
+ *
+ *  Authors: Joachim Fenkes <fenkes@de.ibm.com>
+ *           Stefan Roscher <stefan.roscher@de.ibm.com>
+ *           Waleri Fomin <fomin@de.ibm.com>
+ *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *           Reinhard Ernst <rernst@de.ibm.com>
+ *           Heiko J Schick <schickhj@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/slab.h>
+
+#include "ehca_classes.h"
+#include "ehca_tools.h"
+#include "ehca_qes.h"
+#include "ehca_iverbs.h"
+#include "hcp_if.h"
+#include "hipz_fns.h"
+
+static struct kmem_cache *qp_cache;
+
+/*
+ * attributes not supported by query qp
+ */
+#define QP_ATTR_QUERY_NOT_SUPPORTED (IB_QP_ACCESS_FLAGS       | \
+                                    IB_QP_EN_SQD_ASYNC_NOTIFY)
+
+/*
+ * ehca (internal) qp state values
+ */
+enum ehca_qp_state {
+       EHCA_QPS_RESET = 1,
+       EHCA_QPS_INIT = 2,
+       EHCA_QPS_RTR = 3,
+       EHCA_QPS_RTS = 5,
+       EHCA_QPS_SQD = 6,
+       EHCA_QPS_SQE = 8,
+       EHCA_QPS_ERR = 128
+};
+
+/*
+ * qp state transitions as defined by IB Arch Rel 1.1 page 431
+ */
+enum ib_qp_statetrans {
+       IB_QPST_ANY2RESET,
+       IB_QPST_ANY2ERR,
+       IB_QPST_RESET2INIT,
+       IB_QPST_INIT2RTR,
+       IB_QPST_INIT2INIT,
+       IB_QPST_RTR2RTS,
+       IB_QPST_RTS2SQD,
+       IB_QPST_RTS2RTS,
+       IB_QPST_SQD2RTS,
+       IB_QPST_SQE2RTS,
+       IB_QPST_SQD2SQD,
+       IB_QPST_MAX     /* nr of transitions, this must be last!!! */
+};
+
+/*
+ * ib2ehca_qp_state maps IB to ehca qp_state
+ * returns ehca qp state corresponding to given ib qp state
+ */
+static inline enum ehca_qp_state ib2ehca_qp_state(enum ib_qp_state ib_qp_state)
+{
+       switch (ib_qp_state) {
+       case IB_QPS_RESET:
+               return EHCA_QPS_RESET;
+       case IB_QPS_INIT:
+               return EHCA_QPS_INIT;
+       case IB_QPS_RTR:
+               return EHCA_QPS_RTR;
+       case IB_QPS_RTS:
+               return EHCA_QPS_RTS;
+       case IB_QPS_SQD:
+               return EHCA_QPS_SQD;
+       case IB_QPS_SQE:
+               return EHCA_QPS_SQE;
+       case IB_QPS_ERR:
+               return EHCA_QPS_ERR;
+       default:
+               ehca_gen_err("invalid ib_qp_state=%x", ib_qp_state);
+               return -EINVAL;
+       }
+}
+
+/*
+ * ehca2ib_qp_state maps ehca to IB qp_state
+ * returns ib qp state corresponding to given ehca qp state
+ */
+static inline enum ib_qp_state ehca2ib_qp_state(enum ehca_qp_state
+                                               ehca_qp_state)
+{
+       switch (ehca_qp_state) {
+       case EHCA_QPS_RESET:
+               return IB_QPS_RESET;
+       case EHCA_QPS_INIT:
+               return IB_QPS_INIT;
+       case EHCA_QPS_RTR:
+               return IB_QPS_RTR;
+       case EHCA_QPS_RTS:
+               return IB_QPS_RTS;
+       case EHCA_QPS_SQD:
+               return IB_QPS_SQD;
+       case EHCA_QPS_SQE:
+               return IB_QPS_SQE;
+       case EHCA_QPS_ERR:
+               return IB_QPS_ERR;
+       default:
+               ehca_gen_err("invalid ehca_qp_state=%x", ehca_qp_state);
+               return -EINVAL;
+       }
+}
+
+/*
+ * ehca_qp_type used as index for req_attr and opt_attr of
+ * struct ehca_modqp_statetrans
+ */
+enum ehca_qp_type {
+       QPT_RC = 0,
+       QPT_UC = 1,
+       QPT_UD = 2,
+       QPT_SQP = 3,
+       QPT_MAX
+};
+
+/*
+ * ib2ehcaqptype maps IB to ehca qp_type
+ * returns ehca qp type corresponding to ib qp type
+ */
+static inline enum ehca_qp_type ib2ehcaqptype(enum ib_qp_type ibqptype)
+{
+       switch (ibqptype) {
+       case IB_QPT_SMI:
+       case IB_QPT_GSI:
+               return QPT_SQP;
+       case IB_QPT_RC:
+               return QPT_RC;
+       case IB_QPT_UC:
+               return QPT_UC;
+       case IB_QPT_UD:
+               return QPT_UD;
+       default:
+               ehca_gen_err("Invalid ibqptype=%x", ibqptype);
+               return -EINVAL;
+       }
+}
+
+static inline enum ib_qp_statetrans get_modqp_statetrans(int ib_fromstate,
+                                                        int ib_tostate)
+{
+       int index = -EINVAL;
+       switch (ib_tostate) {
+       case IB_QPS_RESET:
+               index = IB_QPST_ANY2RESET;
+               break;
+       case IB_QPS_INIT:
+               switch (ib_fromstate) {
+               case IB_QPS_RESET:
+                       index = IB_QPST_RESET2INIT;
+                       break;
+               case IB_QPS_INIT:
+                       index = IB_QPST_INIT2INIT;
+                       break;
+               }
+               break;
+       case IB_QPS_RTR:
+               if (ib_fromstate == IB_QPS_INIT)
+                       index = IB_QPST_INIT2RTR;
+               break;
+       case IB_QPS_RTS:
+               switch (ib_fromstate) {
+               case IB_QPS_RTR:
+                       index = IB_QPST_RTR2RTS;
+                       break;
+               case IB_QPS_RTS:
+                       index = IB_QPST_RTS2RTS;
+                       break;
+               case IB_QPS_SQD:
+                       index = IB_QPST_SQD2RTS;
+                       break;
+               case IB_QPS_SQE:
+                       index = IB_QPST_SQE2RTS;
+                       break;
+               }
+               break;
+       case IB_QPS_SQD:
+               if (ib_fromstate == IB_QPS_RTS)
+                       index = IB_QPST_RTS2SQD;
+               break;
+       case IB_QPS_SQE:
+               break;
+       case IB_QPS_ERR:
+               index = IB_QPST_ANY2ERR;
+               break;
+       default:
+               break;
+       }
+       return index;
+}
+
+/*
+ * ibqptype2servicetype returns hcp service type corresponding to given
+ * ib qp type used by create_qp()
+ */
+static inline int ibqptype2servicetype(enum ib_qp_type ibqptype)
+{
+       switch (ibqptype) {
+       case IB_QPT_SMI:
+       case IB_QPT_GSI:
+               return ST_UD;
+       case IB_QPT_RC:
+               return ST_RC;
+       case IB_QPT_UC:
+               return ST_UC;
+       case IB_QPT_UD:
+               return ST_UD;
+       case IB_QPT_RAW_IPV6:
+               return -EINVAL;
+       case IB_QPT_RAW_ETHERTYPE:
+               return -EINVAL;
+       default:
+               ehca_gen_err("Invalid ibqptype=%x", ibqptype);
+               return -EINVAL;
+       }
+}
+
+/*
+ * init userspace queue info from ipz_queue data
+ */
+static inline void queue2resp(struct ipzu_queue_resp *resp,
+                             struct ipz_queue *queue)
+{
+       resp->qe_size = queue->qe_size;
+       resp->act_nr_of_sg = queue->act_nr_of_sg;
+       resp->queue_length = queue->queue_length;
+       resp->pagesize = queue->pagesize;
+       resp->toggle_state = queue->toggle_state;
+       resp->offset = queue->offset;
+}
+
+/*
+ * init_qp_queue initializes/constructs r/squeue and registers queue pages.
+ */
+static inline int init_qp_queue(struct ehca_shca *shca,
+                               struct ehca_pd *pd,
+                               struct ehca_qp *my_qp,
+                               struct ipz_queue *queue,
+                               int q_type,
+                               u64 expected_hret,
+                               struct ehca_alloc_queue_parms *parms,
+                               int wqe_size)
+{
+       int ret, cnt, ipz_rc, nr_q_pages;
+       void *vpage;
+       u64 rpage, h_ret;
+       struct ib_device *ib_dev = &shca->ib_device;
+       struct ipz_adapter_handle ipz_hca_handle = shca->ipz_hca_handle;
+
+       if (!parms->queue_size)
+               return 0;
+
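+       /*
+        * small queues fit into a single queue page of 128 << page_size
+        * bytes, regular queues use queue_size pages of EHCA_PAGESIZE each
+        */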
+       if (parms->is_small) {
+               nr_q_pages = 1;
+               ipz_rc = ipz_queue_ctor(pd, queue, nr_q_pages,
+                                       128 << parms->page_size,
+                                       wqe_size, parms->act_nr_sges, 1);
+       } else {
+               nr_q_pages = parms->queue_size;
+               ipz_rc = ipz_queue_ctor(pd, queue, nr_q_pages,
+                                       EHCA_PAGESIZE, wqe_size,
+                                       parms->act_nr_sges, 0);
+       }
+
+       if (!ipz_rc) {
+               ehca_err(ib_dev, "Cannot allocate page for queue. ipz_rc=%i",
+                        ipz_rc);
+               return -EBUSY;
+       }
+
+       /* register queue pages */
+       for (cnt = 0; cnt < nr_q_pages; cnt++) {
+               vpage = ipz_qpageit_get_inc(queue);
+               if (!vpage) {
+                       ehca_err(ib_dev, "ipz_qpageit_get_inc() "
+                                "failed p_vpage= %p", vpage);
+                       ret = -EINVAL;
+                       goto init_qp_queue1;
+               }
+               rpage = __pa(vpage);
+
+               h_ret = hipz_h_register_rpage_qp(ipz_hca_handle,
+                                                my_qp->ipz_qp_handle,
+                                                NULL, 0, q_type,
+                                                rpage, parms->is_small ? 0 : 1,
+                                                my_qp->galpas.kernel);
+               if (cnt == (nr_q_pages - 1)) {  /* last page! */
+                       if (h_ret != expected_hret) {
+                               ehca_err(ib_dev, "hipz_qp_register_rpage() "
+                                        "h_ret=%lli", h_ret);
+                               ret = ehca2ib_return_code(h_ret);
+                               goto init_qp_queue1;
+                       }
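+                       /*
+                        * the queue page iterator must be exhausted now;
+                        * ipz_qpageit_get_inc() returning another page here
+                        * is an error
+                        */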
+                       vpage = ipz_qpageit_get_inc(&my_qp->ipz_rqueue);
+                       if (vpage) {
+                               ehca_err(ib_dev, "ipz_qpageit_get_inc() "
+                                        "should not succeed vpage=%p", vpage);
+                               ret = -EINVAL;
+                               goto init_qp_queue1;
+                       }
+               } else {
+                       if (h_ret != H_PAGE_REGISTERED) {
+                               ehca_err(ib_dev, "hipz_qp_register_rpage() "
+                                        "h_ret=%lli", h_ret);
+                               ret = ehca2ib_return_code(h_ret);
+                               goto init_qp_queue1;
+                       }
+               }
+       }
+
+       ipz_qeit_reset(queue);
+
+       return 0;
+
+init_qp_queue1:
+       ipz_queue_dtor(pd, queue);
+       return ret;
+}
+
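+/*
+ * ehca_calc_wqe_size: low latency QPs use fixed power-of-two WQE sizes
+ * (128 << act_nr_sge), regular QPs need the WQE header plus act_nr_sge
+ * scatter/gather entries
+ */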
+static inline int ehca_calc_wqe_size(int act_nr_sge, int is_llqp)
+{
+       if (is_llqp)
+               return 128 << act_nr_sge;
+       else
+               return offsetof(struct ehca_wqe,
+                               u.nud.sg_list[act_nr_sge]);
+}
+
+static void ehca_determine_small_queue(struct ehca_alloc_queue_parms *queue,
+                                      int req_nr_sge, int is_llqp)
+{
+       u32 wqe_size, q_size;
+       int act_nr_sge = req_nr_sge;
+
+       if (!is_llqp)
+               /* round up #SGEs so WQE size is a power of 2 */
+               for (act_nr_sge = 4; act_nr_sge <= 252;
+                    act_nr_sge = 4 + 2 * act_nr_sge)
+                       if (act_nr_sge >= req_nr_sge)
+                               break;
+
+       wqe_size = ehca_calc_wqe_size(act_nr_sge, is_llqp);
+       q_size = wqe_size * (queue->max_wr + 1);
+
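+       /*
+        * small-queue page codes: init_qp_queue() allocates 128 << page_size
+        * bytes per queue page, so 2 selects a 512 byte and 3 a 1 KB page;
+        * 0 means a regular full-sized queue
+        */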
+       if (q_size <= 512)
+               queue->page_size = 2;
+       else if (q_size <= 1024)
+               queue->page_size = 3;
+       else
+               queue->page_size = 0;
+
+       queue->is_small = (queue->page_size != 0);
+}
+
+/* needs to be called with cq->spinlock held */
+void ehca_add_to_err_list(struct ehca_qp *qp, int on_sq)
+{
+       struct list_head *list, *node;
+
+       /* TODO: support low latency QPs */
+       if (qp->ext_type == EQPT_LLQP)
+               return;
+
+       if (on_sq) {
+               list = &qp->send_cq->sqp_err_list;
+               node = &qp->sq_err_node;
+       } else {
+               list = &qp->recv_cq->rqp_err_list;
+               node = &qp->rq_err_node;
+       }
+
+       if (list_empty(node))
+               list_add_tail(node, list);
+
+       return;
+}
+
+static void del_from_err_list(struct ehca_cq *cq, struct list_head *node)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&cq->spinlock, flags);
+
+       if (!list_empty(node))
+               list_del_init(node);
+
+       spin_unlock_irqrestore(&cq->spinlock, flags);
+}
+
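+/*
+ * reset_queue_map resets the queue map and marks every entry as already
+ * reported so that no bogus flush CQEs are generated for WQEs that were
+ * never posted
+ */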
+static void reset_queue_map(struct ehca_queue_map *qmap)
+{
+       int i;
+
+       qmap->tail = qmap->entries - 1;
+       qmap->left_to_poll = 0;
+       qmap->next_wqe_idx = 0;
+       for (i = 0; i < qmap->entries; i++) {
+               qmap->map[i].reported = 1;
+               qmap->map[i].cqe_req = 0;
+       }
+}
+
+/*
+ * Create an ib_qp struct that is either a QP or an SRQ, depending on
+ * the value of the is_srq parameter. If init_attr and srq_init_attr share
+ * fields, the value from init_attr is used.
+ */
+static struct ehca_qp *internal_create_qp(
+       struct ib_pd *pd,
+       struct ib_qp_init_attr *init_attr,
+       struct ib_srq_init_attr *srq_init_attr,
+       struct ib_udata *udata, int is_srq)
+{
+       struct ehca_qp *my_qp, *my_srq = NULL;
+       struct ehca_pd *my_pd = container_of(pd, struct ehca_pd, ib_pd);
+       struct ehca_shca *shca = container_of(pd->device, struct ehca_shca,
+                                             ib_device);
+       struct ib_ucontext *context = NULL;
+       u64 h_ret;
+       int is_llqp = 0, has_srq = 0, is_user = 0;
+       int qp_type, max_send_sge, max_recv_sge, ret;
+
+       /* h_call's out parameters */
+       struct ehca_alloc_qp_parms parms;
+       u32 swqe_size = 0, rwqe_size = 0, ib_qp_num;
+       unsigned long flags;
+
+       if (!atomic_add_unless(&shca->num_qps, 1, shca->max_num_qps)) {
+               ehca_err(pd->device, "Unable to create QP, max number of %i "
+                        "QPs reached.", shca->max_num_qps);
+               ehca_err(pd->device, "To increase the maximum number of QPs "
+                        "use the number_of_qps module parameter.\n");
+               return ERR_PTR(-ENOSPC);
+       }
+
+       if (init_attr->create_flags) {
+               atomic_dec(&shca->num_qps);
+               return ERR_PTR(-EINVAL);
+       }
+
+       memset(&parms, 0, sizeof(parms));
+       qp_type = init_attr->qp_type;
+
+       if (init_attr->sq_sig_type != IB_SIGNAL_REQ_WR &&
+               init_attr->sq_sig_type != IB_SIGNAL_ALL_WR) {
+               ehca_err(pd->device, "init_attr->sq_sig_type=%x not allowed",
+                        init_attr->sq_sig_type);
+               atomic_dec(&shca->num_qps);
+               return ERR_PTR(-EINVAL);
+       }
+
+       /*
+        * save LLQP info: bit 0x80 of qp_type requests a low latency QP,
+        * the low five bits carry the actual IB QP type
+        */
+       if (qp_type & 0x80) {
+               is_llqp = 1;
+               parms.ext_type = EQPT_LLQP;
+               parms.ll_comp_flags = qp_type & LLQP_COMP_MASK;
+       }
+       qp_type &= 0x1F;
+       init_attr->qp_type &= 0x1F;
+
+       /* handle SRQ base QPs */
+       if (init_attr->srq) {
+               my_srq = container_of(init_attr->srq, struct ehca_qp, ib_srq);
+
+               if (qp_type == IB_QPT_UC) {
+                       ehca_err(pd->device, "UC with SRQ not supported");
+                       atomic_dec(&shca->num_qps);
+                       return ERR_PTR(-EINVAL);
+               }
+
+               has_srq = 1;
+               parms.ext_type = EQPT_SRQBASE;
+               parms.srq_qpn = my_srq->real_qp_num;
+       }
+
+       if (is_llqp && has_srq) {
+               ehca_err(pd->device, "LLQPs can't have an SRQ");
+               atomic_dec(&shca->num_qps);
+               return ERR_PTR(-EINVAL);
+       }
+
+       /* handle SRQs */
+       if (is_srq) {
+               parms.ext_type = EQPT_SRQ;
+               parms.srq_limit = srq_init_attr->attr.srq_limit;
+               if (init_attr->cap.max_recv_sge > 3) {
+                       ehca_err(pd->device, "no more than three SGEs "
+                                "supported for SRQ  pd=%p  max_sge=%x",
+                                pd, init_attr->cap.max_recv_sge);
+                       atomic_dec(&shca->num_qps);
+                       return ERR_PTR(-EINVAL);
+               }
+       }
+
+       /* check QP type */
+       if (qp_type != IB_QPT_UD &&
+           qp_type != IB_QPT_UC &&
+           qp_type != IB_QPT_RC &&
+           qp_type != IB_QPT_SMI &&
+           qp_type != IB_QPT_GSI) {
+               ehca_err(pd->device, "wrong QP Type=%x", qp_type);
+               atomic_dec(&shca->num_qps);
+               return ERR_PTR(-EINVAL);
+       }
+
+       if (is_llqp) {
+               switch (qp_type) {
+               case IB_QPT_RC:
+                       if ((init_attr->cap.max_send_wr > 255) ||
+                           (init_attr->cap.max_recv_wr > 255)) {
+                               ehca_err(pd->device,
+                                        "Invalid Number of max_sq_wr=%x "
+                                        "or max_rq_wr=%x for RC LLQP",
+                                        init_attr->cap.max_send_wr,
+                                        init_attr->cap.max_recv_wr);
+                               atomic_dec(&shca->num_qps);
+                               return ERR_PTR(-EINVAL);
+                       }
+                       break;
+               case IB_QPT_UD:
+                       if (!EHCA_BMASK_GET(HCA_CAP_UD_LL_QP, shca->hca_cap)) {
+                               ehca_err(pd->device, "UD LLQP not supported "
+                                        "by this adapter");
+                               atomic_dec(&shca->num_qps);
+                               return ERR_PTR(-ENOSYS);
+                       }
+                       if (!(init_attr->cap.max_send_sge <= 5
+                           && init_attr->cap.max_send_sge >= 1
+                           && init_attr->cap.max_recv_sge <= 5
+                           && init_attr->cap.max_recv_sge >= 1)) {
+                               ehca_err(pd->device,
+                                        "Invalid Number of max_send_sge=%x "
+                                        "or max_recv_sge=%x for UD LLQP",
+                                        init_attr->cap.max_send_sge,
+                                        init_attr->cap.max_recv_sge);
+                               atomic_dec(&shca->num_qps);
+                               return ERR_PTR(-EINVAL);
+                       } else if (init_attr->cap.max_send_wr > 255) {
+                               ehca_err(pd->device,
+                                        "Invalid Number of "
+                                        "max_send_wr=%x for UD QP_TYPE=%x",
+                                        init_attr->cap.max_send_wr, qp_type);
+                               atomic_dec(&shca->num_qps);
+                               return ERR_PTR(-EINVAL);
+                       }
+                       break;
+               default:
+                       ehca_err(pd->device, "unsupported LL QP Type=%x",
+                                qp_type);
+                       atomic_dec(&shca->num_qps);
+                       return ERR_PTR(-EINVAL);
+               }
+       } else {
+               int max_sge = (qp_type == IB_QPT_UD || qp_type == IB_QPT_SMI
+                              || qp_type == IB_QPT_GSI) ? 250 : 252;
+
+               if (init_attr->cap.max_send_sge > max_sge
+                   || init_attr->cap.max_recv_sge > max_sge) {
+                       ehca_err(pd->device, "Invalid number of SGEs requested "
+                                "send_sge=%x recv_sge=%x max_sge=%x",
+                                init_attr->cap.max_send_sge,
+                                init_attr->cap.max_recv_sge, max_sge);
+                       atomic_dec(&shca->num_qps);
+                       return ERR_PTR(-EINVAL);
+               }
+       }
+
+       my_qp = kmem_cache_zalloc(qp_cache, GFP_KERNEL);
+       if (!my_qp) {
+               ehca_err(pd->device, "pd=%p not enough memory to alloc qp", pd);
+               atomic_dec(&shca->num_qps);
+               return ERR_PTR(-ENOMEM);
+       }
+
+       if (pd->uobject && udata) {
+               is_user = 1;
+               context = pd->uobject->context;
+       }
+
+       atomic_set(&my_qp->nr_events, 0);
+       init_waitqueue_head(&my_qp->wait_completion);
+       spin_lock_init(&my_qp->spinlock_s);
+       spin_lock_init(&my_qp->spinlock_r);
+       my_qp->qp_type = qp_type;
+       my_qp->ext_type = parms.ext_type;
+       my_qp->state = IB_QPS_RESET;
+
+       if (init_attr->recv_cq)
+               my_qp->recv_cq =
+                       container_of(init_attr->recv_cq, struct ehca_cq, ib_cq);
+       if (init_attr->send_cq)
+               my_qp->send_cq =
+                       container_of(init_attr->send_cq, struct ehca_cq, ib_cq);
+
+       idr_preload(GFP_KERNEL);
+       write_lock_irqsave(&ehca_qp_idr_lock, flags);
+
+       ret = idr_alloc(&ehca_qp_idr, my_qp, 0, 0x2000000, GFP_NOWAIT);
+       if (ret >= 0)
+               my_qp->token = ret;
+
+       write_unlock_irqrestore(&ehca_qp_idr_lock, flags);
+       idr_preload_end();
+       if (ret < 0) {
+               if (ret == -ENOSPC) {
+                       ret = -EINVAL;
+                       ehca_err(pd->device, "Invalid number of qp");
+               } else {
+                       ret = -ENOMEM;
+                       ehca_err(pd->device, "Can't allocate new idr entry.");
+               }
+               goto create_qp_exit0;
+       }
+
+       if (has_srq)
+               parms.srq_token = my_qp->token;
+
+       parms.servicetype = ibqptype2servicetype(qp_type);
+       if (parms.servicetype < 0) {
+               ret = -EINVAL;
+               ehca_err(pd->device, "Invalid qp_type=%x", qp_type);
+               goto create_qp_exit1;
+       }
+
+       /* Always signal by WQE so we can hide circ. WQEs */
+       parms.sigtype = HCALL_SIGT_BY_WQE;
+
+       /*
+        * UD_AV circumvention: reserve two extra SGEs per UD WQE for the
+        * embedded address vector (see struct ehca_wqe.u.ud_av); they are
+        * hidden from the consumer again in the IB_QPT_UD case below
+        */
+       max_send_sge = init_attr->cap.max_send_sge;
+       max_recv_sge = init_attr->cap.max_recv_sge;
+       if (parms.servicetype == ST_UD && !is_llqp) {
+               max_send_sge += 2;
+               max_recv_sge += 2;
+       }
+
+       parms.token = my_qp->token;
+       parms.eq_handle = shca->eq.ipz_eq_handle;
+       parms.pd = my_pd->fw_pd;
+       if (my_qp->send_cq)
+               parms.send_cq_handle = my_qp->send_cq->ipz_cq_handle;
+       if (my_qp->recv_cq)
+               parms.recv_cq_handle = my_qp->recv_cq->ipz_cq_handle;
+
+       parms.squeue.max_wr = init_attr->cap.max_send_wr;
+       parms.rqueue.max_wr = init_attr->cap.max_recv_wr;
+       parms.squeue.max_sge = max_send_sge;
+       parms.rqueue.max_sge = max_recv_sge;
+
+       /* RC QPs need one more SWQE for unsolicited ack circumvention */
+       if (qp_type == IB_QPT_RC)
+               parms.squeue.max_wr++;
+
+       if (EHCA_BMASK_GET(HCA_CAP_MINI_QP, shca->hca_cap)) {
+               if (HAS_SQ(my_qp))
+                       ehca_determine_small_queue(
+                               &parms.squeue, max_send_sge, is_llqp);
+               if (HAS_RQ(my_qp))
+                       ehca_determine_small_queue(
+                               &parms.rqueue, max_recv_sge, is_llqp);
+               parms.qp_storage =
+                       (parms.squeue.is_small || parms.rqueue.is_small);
+       }
+
+       h_ret = hipz_h_alloc_resource_qp(shca->ipz_hca_handle, &parms, is_user);
+       if (h_ret != H_SUCCESS) {
+               ehca_err(pd->device, "h_alloc_resource_qp() failed h_ret=%lli",
+                        h_ret);
+               ret = ehca2ib_return_code(h_ret);
+               goto create_qp_exit1;
+       }
+
+       ib_qp_num = my_qp->real_qp_num = parms.real_qp_num;
+       my_qp->ipz_qp_handle = parms.qp_handle;
+       my_qp->galpas = parms.galpas;
+
+       swqe_size = ehca_calc_wqe_size(parms.squeue.act_nr_sges, is_llqp);
+       rwqe_size = ehca_calc_wqe_size(parms.rqueue.act_nr_sges, is_llqp);
+
+       switch (qp_type) {
+       case IB_QPT_RC:
+               if (is_llqp) {
+                       parms.squeue.act_nr_sges = 1;
+                       parms.rqueue.act_nr_sges = 1;
+               }
+               /* hide the extra WQE */
+               parms.squeue.act_nr_wqes--;
+               break;
+       case IB_QPT_UD:
+       case IB_QPT_GSI:
+       case IB_QPT_SMI:
+               /* UD circumvention */
+               if (is_llqp) {
+                       parms.squeue.act_nr_sges = 1;
+                       parms.rqueue.act_nr_sges = 1;
+               } else {
+                       parms.squeue.act_nr_sges -= 2;
+                       parms.rqueue.act_nr_sges -= 2;
+               }
+
+               if (IB_QPT_GSI == qp_type || IB_QPT_SMI == qp_type) {
+                       parms.squeue.act_nr_wqes = init_attr->cap.max_send_wr;
+                       parms.rqueue.act_nr_wqes = init_attr->cap.max_recv_wr;
+                       parms.squeue.act_nr_sges = init_attr->cap.max_send_sge;
+                       parms.rqueue.act_nr_sges = init_attr->cap.max_recv_sge;
+                       ib_qp_num = (qp_type == IB_QPT_SMI) ? 0 : 1;
+               }
+
+               break;
+
+       default:
+               break;
+       }
+
+       /* initialize r/squeue and register queue pages */
+       if (HAS_SQ(my_qp)) {
+               ret = init_qp_queue(
+                       shca, my_pd, my_qp, &my_qp->ipz_squeue, 0,
+                       HAS_RQ(my_qp) ? H_PAGE_REGISTERED : H_SUCCESS,
+                       &parms.squeue, swqe_size);
+               if (ret) {
+                       ehca_err(pd->device, "Couldn't initialize squeue "
+                                "and pages ret=%i", ret);
+                       goto create_qp_exit2;
+               }
+
+               if (!is_user) {
+                       my_qp->sq_map.entries = my_qp->ipz_squeue.queue_length /
+                               my_qp->ipz_squeue.qe_size;
+                       my_qp->sq_map.map = vmalloc(my_qp->sq_map.entries *
+                                                   sizeof(struct ehca_qmap_entry));
+                       if (!my_qp->sq_map.map) {
+                               ret = -ENOMEM;
+                               ehca_err(pd->device, "Couldn't allocate squeue "
+                                        "map ret=%i", ret);
+                               goto create_qp_exit3;
+                       }
+                       INIT_LIST_HEAD(&my_qp->sq_err_node);
+                       /* to avoid the generation of bogus flush CQEs */
+                       reset_queue_map(&my_qp->sq_map);
+               }
+       }
+
+       if (HAS_RQ(my_qp)) {
+               ret = init_qp_queue(
+                       shca, my_pd, my_qp, &my_qp->ipz_rqueue, 1,
+                       H_SUCCESS, &parms.rqueue, rwqe_size);
+               if (ret) {
+                       ehca_err(pd->device, "Couldn't initialize rqueue "
+                                "and pages ret=%i", ret);
+                       goto create_qp_exit4;
+               }
+               if (!is_user) {
+                       my_qp->rq_map.entries = my_qp->ipz_rqueue.queue_length /
+                               my_qp->ipz_rqueue.qe_size;
+                       my_qp->rq_map.map = vmalloc(my_qp->rq_map.entries *
+                                                   sizeof(struct ehca_qmap_entry));
+                       if (!my_qp->rq_map.map) {
+                               ret = -ENOMEM;
+                               ehca_err(pd->device, "Couldn't allocate rqueue "
+                                        "map ret=%i", ret);
+                               goto create_qp_exit5;
+                       }
+                       INIT_LIST_HEAD(&my_qp->rq_err_node);
+                       /* to avoid the generation of bogus flush CQEs */
+                       reset_queue_map(&my_qp->rq_map);
+               }
+       } else if (init_attr->srq && !is_user) {
+               /* this is a base QP, use the queue map of the SRQ */
+               my_qp->rq_map = my_srq->rq_map;
+               INIT_LIST_HEAD(&my_qp->rq_err_node);
+
+               my_qp->ipz_rqueue = my_srq->ipz_rqueue;
+       }
+
+       if (is_srq) {
+               my_qp->ib_srq.pd = &my_pd->ib_pd;
+               my_qp->ib_srq.device = my_pd->ib_pd.device;
+
+               my_qp->ib_srq.srq_context = init_attr->qp_context;
+               my_qp->ib_srq.event_handler = init_attr->event_handler;
+       } else {
+               my_qp->ib_qp.qp_num = ib_qp_num;
+               my_qp->ib_qp.pd = &my_pd->ib_pd;
+               my_qp->ib_qp.device = my_pd->ib_pd.device;
+
+               my_qp->ib_qp.recv_cq = init_attr->recv_cq;
+               my_qp->ib_qp.send_cq = init_attr->send_cq;
+
+               my_qp->ib_qp.qp_type = qp_type;
+               my_qp->ib_qp.srq = init_attr->srq;
+
+               my_qp->ib_qp.qp_context = init_attr->qp_context;
+               my_qp->ib_qp.event_handler = init_attr->event_handler;
+       }
+
+       init_attr->cap.max_inline_data = 0; /* not supported yet */
+       init_attr->cap.max_recv_sge = parms.rqueue.act_nr_sges;
+       init_attr->cap.max_recv_wr = parms.rqueue.act_nr_wqes;
+       init_attr->cap.max_send_sge = parms.squeue.act_nr_sges;
+       init_attr->cap.max_send_wr = parms.squeue.act_nr_wqes;
+       my_qp->init_attr = *init_attr;
+
+       if (qp_type == IB_QPT_SMI || qp_type == IB_QPT_GSI) {
+               shca->sport[init_attr->port_num - 1].ibqp_sqp[qp_type] =
+                       &my_qp->ib_qp;
+               if (ehca_nr_ports < 0) {
+                       /* alloc array to cache subsequent modify qp parms
+                        * for autodetect mode
+                        */
+                       my_qp->mod_qp_parm =
+                               kzalloc(EHCA_MOD_QP_PARM_MAX *
+                                       sizeof(*my_qp->mod_qp_parm),
+                                       GFP_KERNEL);
+                       if (!my_qp->mod_qp_parm) {
+                               ret = -ENOMEM;
+                               ehca_err(pd->device,
+                                        "Could not alloc mod_qp_parm");
+                               goto create_qp_exit5;
+                       }
+               }
+       }
+
+       /* NOTE: define_aqp0() not supported yet */
+       if (qp_type == IB_QPT_GSI) {
+               h_ret = ehca_define_sqp(shca, my_qp, init_attr);
+               if (h_ret != H_SUCCESS) {
+                       kfree(my_qp->mod_qp_parm);
+                       my_qp->mod_qp_parm = NULL;
+                       /* the QP pointer is no longer valid */
+                       shca->sport[init_attr->port_num - 1].ibqp_sqp[qp_type] =
+                               NULL;
+                       ret = ehca2ib_return_code(h_ret);
+                       goto create_qp_exit6;
+               }
+       }
+
+       if (my_qp->send_cq) {
+               ret = ehca_cq_assign_qp(my_qp->send_cq, my_qp);
+               if (ret) {
+                       ehca_err(pd->device,
+                                "Couldn't assign qp to send_cq ret=%i", ret);
+                       goto create_qp_exit7;
+               }
+       }
+
+       /* copy queues, galpa data to user space */
+       if (context && udata) {
+               struct ehca_create_qp_resp resp;
+               memset(&resp, 0, sizeof(resp));
+
+               resp.qp_num = my_qp->real_qp_num;
+               resp.token = my_qp->token;
+               resp.qp_type = my_qp->qp_type;
+               resp.ext_type = my_qp->ext_type;
+               resp.qkey = my_qp->qkey;
+               resp.real_qp_num = my_qp->real_qp_num;
+
+               if (HAS_SQ(my_qp))
+                       queue2resp(&resp.ipz_squeue, &my_qp->ipz_squeue);
+               if (HAS_RQ(my_qp))
+                       queue2resp(&resp.ipz_rqueue, &my_qp->ipz_rqueue);
+               resp.fw_handle_ofs = (u32)
+                       (my_qp->galpas.user.fw_handle & (PAGE_SIZE - 1));
+
+               if (ib_copy_to_udata(udata, &resp, sizeof resp)) {
+                       ehca_err(pd->device, "Copy to udata failed");
+                       ret = -EINVAL;
+                       goto create_qp_exit8;
+               }
+       }
+
+       return my_qp;
+
+create_qp_exit8:
+       ehca_cq_unassign_qp(my_qp->send_cq, my_qp->real_qp_num);
+
+create_qp_exit7:
+       kfree(my_qp->mod_qp_parm);
+
+create_qp_exit6:
+       if (HAS_RQ(my_qp) && !is_user)
+               vfree(my_qp->rq_map.map);
+
+create_qp_exit5:
+       if (HAS_RQ(my_qp))
+               ipz_queue_dtor(my_pd, &my_qp->ipz_rqueue);
+
+create_qp_exit4:
+       if (HAS_SQ(my_qp) && !is_user)
+               vfree(my_qp->sq_map.map);
+
+create_qp_exit3:
+       if (HAS_SQ(my_qp))
+               ipz_queue_dtor(my_pd, &my_qp->ipz_squeue);
+
+create_qp_exit2:
+       hipz_h_destroy_qp(shca->ipz_hca_handle, my_qp);
+
+create_qp_exit1:
+       write_lock_irqsave(&ehca_qp_idr_lock, flags);
+       idr_remove(&ehca_qp_idr, my_qp->token);
+       write_unlock_irqrestore(&ehca_qp_idr_lock, flags);
+
+create_qp_exit0:
+       kmem_cache_free(qp_cache, my_qp);
+       atomic_dec(&shca->num_qps);
+       return ERR_PTR(ret);
+}
+
+struct ib_qp *ehca_create_qp(struct ib_pd *pd,
+                            struct ib_qp_init_attr *qp_init_attr,
+                            struct ib_udata *udata)
+{
+       struct ehca_qp *ret;
+
+       ret = internal_create_qp(pd, qp_init_attr, NULL, udata, 0);
+       return IS_ERR(ret) ? (struct ib_qp *)ret : &ret->ib_qp;
+}
+
+static int internal_destroy_qp(struct ib_device *dev, struct ehca_qp *my_qp,
+                              struct ib_uobject *uobject);
+
+struct ib_srq *ehca_create_srq(struct ib_pd *pd,
+                              struct ib_srq_init_attr *srq_init_attr,
+                              struct ib_udata *udata)
+{
+       struct ib_qp_init_attr qp_init_attr;
+       struct ehca_qp *my_qp;
+       struct ib_srq *ret;
+       struct ehca_shca *shca = container_of(pd->device, struct ehca_shca,
+                                             ib_device);
+       struct hcp_modify_qp_control_block *mqpcb;
+       u64 hret, update_mask;
+
+       if (srq_init_attr->srq_type != IB_SRQT_BASIC)
+               return ERR_PTR(-ENOSYS);
+
+       /* For common attributes, internal_create_qp() takes its info
+        * out of qp_init_attr, so copy all common attrs there.
+        */
+       memset(&qp_init_attr, 0, sizeof(qp_init_attr));
+       qp_init_attr.event_handler = srq_init_attr->event_handler;
+       qp_init_attr.qp_context = srq_init_attr->srq_context;
+       qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR;
+       qp_init_attr.qp_type = IB_QPT_RC;
+       qp_init_attr.cap.max_recv_wr = srq_init_attr->attr.max_wr;
+       qp_init_attr.cap.max_recv_sge = srq_init_attr->attr.max_sge;
+
+       my_qp = internal_create_qp(pd, &qp_init_attr, srq_init_attr, udata, 1);
+       if (IS_ERR(my_qp))
+               return (struct ib_srq *)my_qp;
+
+       /* copy back return values */
+       srq_init_attr->attr.max_wr = qp_init_attr.cap.max_recv_wr;
+       srq_init_attr->attr.max_sge = 3;
+
+       /* drive SRQ into RTR state */
+       mqpcb = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+       if (!mqpcb) {
+               ehca_err(pd->device, "Could not get zeroed page for mqpcb "
+                        "ehca_qp=%p qp_num=%x ", my_qp, my_qp->real_qp_num);
+               ret = ERR_PTR(-ENOMEM);
+               goto create_srq1;
+       }
+
+       mqpcb->qp_state = EHCA_QPS_INIT;
+       mqpcb->prim_phys_port = 1;
+       update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_STATE, 1);
+       hret = hipz_h_modify_qp(shca->ipz_hca_handle,
+                               my_qp->ipz_qp_handle,
+                               &my_qp->pf,
+                               update_mask,
+                               mqpcb, my_qp->galpas.kernel);
+       if (hret != H_SUCCESS) {
+               ehca_err(pd->device, "Could not modify SRQ to INIT "
+                        "ehca_qp=%p qp_num=%x h_ret=%lli",
+                        my_qp, my_qp->real_qp_num, hret);
+               goto create_srq2;
+       }
+
+       mqpcb->qp_enable = 1;
+       update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_ENABLE, 1);
+       hret = hipz_h_modify_qp(shca->ipz_hca_handle,
+                               my_qp->ipz_qp_handle,
+                               &my_qp->pf,
+                               update_mask,
+                               mqpcb, my_qp->galpas.kernel);
+       if (hret != H_SUCCESS) {
+               ehca_err(pd->device, "Could not enable SRQ "
+                        "ehca_qp=%p qp_num=%x h_ret=%lli",
+                        my_qp, my_qp->real_qp_num, hret);
+               goto create_srq2;
+       }
+
+       mqpcb->qp_state  = EHCA_QPS_RTR;
+       update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_STATE, 1);
+       hret = hipz_h_modify_qp(shca->ipz_hca_handle,
+                               my_qp->ipz_qp_handle,
+                               &my_qp->pf,
+                               update_mask,
+                               mqpcb, my_qp->galpas.kernel);
+       if (hret != H_SUCCESS) {
+               ehca_err(pd->device, "Could not modify SRQ to RTR "
+                        "ehca_qp=%p qp_num=%x h_ret=%lli",
+                        my_qp, my_qp->real_qp_num, hret);
+               goto create_srq2;
+       }
+
+       ehca_free_fw_ctrlblock(mqpcb);
+
+       return &my_qp->ib_srq;
+
+create_srq2:
+       ret = ERR_PTR(ehca2ib_return_code(hret));
+       ehca_free_fw_ctrlblock(mqpcb);
+
+create_srq1:
+       internal_destroy_qp(pd->device, my_qp, my_qp->ib_srq.uobject);
+
+       return ret;
+}
+
+/*
+ * prepare_sqe_rts is called by internal_modify_qp() at the sqe -> rts
+ * transition; it sets the purge bit of the bad wqe and all subsequent wqes
+ * to avoid re-entering the sqe state and returns the total number of bad
+ * wqes in bad_wqe_cnt
+ */
+static int prepare_sqe_rts(struct ehca_qp *my_qp, struct ehca_shca *shca,
+                          int *bad_wqe_cnt)
+{
+       u64 h_ret;
+       struct ipz_queue *squeue;
+       void *bad_send_wqe_p, *bad_send_wqe_v;
+       u64 q_ofs;
+       struct ehca_wqe *wqe;
+       int qp_num = my_qp->ib_qp.qp_num;
+
+       /* get send wqe pointer */
+       h_ret = hipz_h_disable_and_get_wqe(shca->ipz_hca_handle,
+                                          my_qp->ipz_qp_handle, &my_qp->pf,
+                                          &bad_send_wqe_p, NULL, 2);
+       if (h_ret != H_SUCCESS) {
+               ehca_err(&shca->ib_device, "hipz_h_disable_and_get_wqe() failed"
+                        " ehca_qp=%p qp_num=%x h_ret=%lli",
+                        my_qp, qp_num, h_ret);
+               return ehca2ib_return_code(h_ret);
+       }
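+       /*
+        * clear bit 63 to convert the real address returned by the firmware
+        * into an absolute address (same conversion as in calc_left_cqes())
+        */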
+       bad_send_wqe_p = (void *)((u64)bad_send_wqe_p & (~(1L << 63)));
+       ehca_dbg(&shca->ib_device, "qp_num=%x bad_send_wqe_p=%p",
+                qp_num, bad_send_wqe_p);
+       /* convert wqe pointer to vadr */
+       bad_send_wqe_v = __va((u64)bad_send_wqe_p);
+       if (ehca_debug_level >= 2)
+               ehca_dmp(bad_send_wqe_v, 32, "qp_num=%x bad_wqe", qp_num);
+       squeue = &my_qp->ipz_squeue;
+       if (ipz_queue_abs_to_offset(squeue, (u64)bad_send_wqe_p, &q_ofs)) {
+               ehca_err(&shca->ib_device, "failed to get wqe offset qp_num=%x"
+                        " bad_send_wqe_p=%p", qp_num, bad_send_wqe_p);
+               return -EFAULT;
+       }
+
+       /* loop sets wqe's purge bit */
+       wqe = (struct ehca_wqe *)ipz_qeit_calc(squeue, q_ofs);
+       *bad_wqe_cnt = 0;
+       while (wqe->optype != 0xff && wqe->wqef != 0xff) {
+               if (ehca_debug_level >= 2)
+                       ehca_dmp(wqe, 32, "qp_num=%x wqe", qp_num);
+               wqe->nr_of_data_seg = 0; /* suppress data access */
+               wqe->wqef = WQEF_PURGE; /* WQE to be purged */
+               q_ofs = ipz_queue_advance_offset(squeue, q_ofs);
+               wqe = (struct ehca_wqe *)ipz_qeit_calc(squeue, q_ofs);
+               *bad_wqe_cnt = (*bad_wqe_cnt)+1;
+       }
+       /*
+        * bad wqe will be reprocessed and ignored when poll_cq() is called,
+        *  i.e. nr of wqes with flush error status is one less
+        */
+       ehca_dbg(&shca->ib_device, "qp_num=%x flusherr_wqe_cnt=%x",
+                qp_num, (*bad_wqe_cnt)-1);
+       wqe->wqef = 0;
+
+       return 0;
+}
+
+static int calc_left_cqes(u64 wqe_p, struct ipz_queue *ipz_queue,
+                         struct ehca_queue_map *qmap)
+{
+       void *wqe_v;
+       u64 q_ofs;
+       u32 wqe_idx;
+       unsigned int tail_idx;
+
+       /* convert real to abs address */
+       wqe_p = wqe_p & (~(1UL << 63));
+
+       wqe_v = __va(wqe_p);
+
+       if (ipz_queue_abs_to_offset(ipz_queue, wqe_p, &q_ofs)) {
+               ehca_gen_err("Invalid offset for calculating left cqes "
+                               "wqe_p=%#llx wqe_v=%p\n", wqe_p, wqe_v);
+               return -EFAULT;
+       }
+
+       tail_idx = next_index(qmap->tail, qmap->entries);
+       wqe_idx = q_ofs / ipz_queue->qe_size;
+
+       /* check all processed wqes, whether a cqe is requested or not */
+       while (tail_idx != wqe_idx) {
+               if (qmap->map[tail_idx].cqe_req)
+                       qmap->left_to_poll++;
+               tail_idx = next_index(tail_idx, qmap->entries);
+       }
+       /* save index in queue, where we have to start flushing */
+       qmap->next_wqe_idx = wqe_idx;
+       return 0;
+}
+
+static int check_for_left_cqes(struct ehca_qp *my_qp, struct ehca_shca *shca)
+{
+       u64 h_ret;
+       void *send_wqe_p, *recv_wqe_p;
+       int ret;
+       unsigned long flags;
+       int qp_num = my_qp->ib_qp.qp_num;
+
+       /* this hcall is not supported on base QPs */
+       if (my_qp->ext_type != EQPT_SRQBASE) {
+               /* get send and receive wqe pointer */
+               h_ret = hipz_h_disable_and_get_wqe(shca->ipz_hca_handle,
+                               my_qp->ipz_qp_handle, &my_qp->pf,
+                               &send_wqe_p, &recv_wqe_p, 4);
+               if (h_ret != H_SUCCESS) {
+                       ehca_err(&shca->ib_device, "disable_and_get_wqe() "
+                                "failed ehca_qp=%p qp_num=%x h_ret=%lli",
+                                my_qp, qp_num, h_ret);
+                       return ehca2ib_return_code(h_ret);
+               }
+
+               /*
+                * acquire lock to ensure that nobody is polling the cq which
+                * could mean that the qmap->tail pointer is in an
+                * inconsistent state.
+                */
+               spin_lock_irqsave(&my_qp->send_cq->spinlock, flags);
+               ret = calc_left_cqes((u64)send_wqe_p, &my_qp->ipz_squeue,
+                               &my_qp->sq_map);
+               spin_unlock_irqrestore(&my_qp->send_cq->spinlock, flags);
+               if (ret)
+                       return ret;
+
+
+               spin_lock_irqsave(&my_qp->recv_cq->spinlock, flags);
+               ret = calc_left_cqes((u64)recv_wqe_p, &my_qp->ipz_rqueue,
+                               &my_qp->rq_map);
+               spin_unlock_irqrestore(&my_qp->recv_cq->spinlock, flags);
+               if (ret)
+                       return ret;
+       } else {
+               spin_lock_irqsave(&my_qp->send_cq->spinlock, flags);
+               my_qp->sq_map.left_to_poll = 0;
+               my_qp->sq_map.next_wqe_idx = next_index(my_qp->sq_map.tail,
+                                                       my_qp->sq_map.entries);
+               spin_unlock_irqrestore(&my_qp->send_cq->spinlock, flags);
+
+               spin_lock_irqsave(&my_qp->recv_cq->spinlock, flags);
+               my_qp->rq_map.left_to_poll = 0;
+               my_qp->rq_map.next_wqe_idx = next_index(my_qp->rq_map.tail,
+                                                       my_qp->rq_map.entries);
+               spin_unlock_irqrestore(&my_qp->recv_cq->spinlock, flags);
+       }
+
+       /* this ensures that flush CQEs are generated only for pending WQEs */
+       if ((my_qp->sq_map.left_to_poll == 0) &&
+                               (my_qp->rq_map.left_to_poll == 0)) {
+               spin_lock_irqsave(&my_qp->send_cq->spinlock, flags);
+               ehca_add_to_err_list(my_qp, 1);
+               spin_unlock_irqrestore(&my_qp->send_cq->spinlock, flags);
+
+               if (HAS_RQ(my_qp)) {
+                       spin_lock_irqsave(&my_qp->recv_cq->spinlock, flags);
+                       ehca_add_to_err_list(my_qp, 0);
+                       spin_unlock_irqrestore(&my_qp->recv_cq->spinlock,
+                                       flags);
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * internal_modify_qp with circumvention to handle aqp0 properly
+ * smi_reset2init indicates if this is an internal reset-to-init-call for
+ * smi. This flag must always be zero if called from ehca_modify_qp()!
+ * This internal function was introduced to avoid recursion of ehca_modify_qp()!
+ */
+static int internal_modify_qp(struct ib_qp *ibqp,
+                             struct ib_qp_attr *attr,
+                             int attr_mask, int smi_reset2init)
+{
+       enum ib_qp_state qp_cur_state, qp_new_state;
+       int cnt, qp_attr_idx, ret = 0;
+       enum ib_qp_statetrans statetrans;
+       struct hcp_modify_qp_control_block *mqpcb;
+       struct ehca_qp *my_qp = container_of(ibqp, struct ehca_qp, ib_qp);
+       struct ehca_shca *shca =
+               container_of(ibqp->pd->device, struct ehca_shca, ib_device);
+       u64 update_mask;
+       u64 h_ret;
+       int bad_wqe_cnt = 0;
+       int is_user = 0;
+       int squeue_locked = 0;
+       unsigned long flags = 0;
+
+       /* do query_qp to obtain current attr values */
+       mqpcb = ehca_alloc_fw_ctrlblock(GFP_ATOMIC);
+       if (!mqpcb) {
+               ehca_err(ibqp->device, "Could not get zeroed page for mqpcb "
+                        "ehca_qp=%p qp_num=%x ", my_qp, ibqp->qp_num);
+               return -ENOMEM;
+       }
+
+       h_ret = hipz_h_query_qp(shca->ipz_hca_handle,
+                               my_qp->ipz_qp_handle,
+                               &my_qp->pf,
+                               mqpcb, my_qp->galpas.kernel);
+       if (h_ret != H_SUCCESS) {
+               ehca_err(ibqp->device, "hipz_h_query_qp() failed "
+                        "ehca_qp=%p qp_num=%x h_ret=%lli",
+                        my_qp, ibqp->qp_num, h_ret);
+               ret = ehca2ib_return_code(h_ret);
+               goto modify_qp_exit1;
+       }
+       if (ibqp->uobject)
+               is_user = 1;
+
+       qp_cur_state = ehca2ib_qp_state(mqpcb->qp_state);
+
+       if (qp_cur_state == -EINVAL) {  /* invalid qp state */
+               ret = -EINVAL;
+               ehca_err(ibqp->device, "Invalid current ehca_qp_state=%x "
+                        "ehca_qp=%p qp_num=%x",
+                        mqpcb->qp_state, my_qp, ibqp->qp_num);
+               goto modify_qp_exit1;
+       }
+       /*
+        * circumvention to set aqp0 initial state to init
+        * as expected by IB spec
+        */
+       if (smi_reset2init == 0 &&
+           ibqp->qp_type == IB_QPT_SMI &&
+           qp_cur_state == IB_QPS_RESET &&
+           (attr_mask & IB_QP_STATE) &&
+           attr->qp_state == IB_QPS_INIT) { /* RESET -> INIT */
+               struct ib_qp_attr smiqp_attr = {
+                       .qp_state = IB_QPS_INIT,
+                       .port_num = my_qp->init_attr.port_num,
+                       .pkey_index = 0,
+                       .qkey = 0
+               };
+               int smiqp_attr_mask = IB_QP_STATE | IB_QP_PORT |
+                       IB_QP_PKEY_INDEX | IB_QP_QKEY;
+               int smirc = internal_modify_qp(
+                       ibqp, &smiqp_attr, smiqp_attr_mask, 1);
+               if (smirc) {
+                       ehca_err(ibqp->device, "SMI RESET -> INIT failed. "
+                                "ehca_modify_qp() rc=%i", smirc);
+                       ret = H_PARAMETER;
+                       goto modify_qp_exit1;
+               }
+               qp_cur_state = IB_QPS_INIT;
+               ehca_dbg(ibqp->device, "SMI RESET -> INIT succeeded");
+       }
+       /* is the transmitted current state equal to the "real" current state? */
+       if ((attr_mask & IB_QP_CUR_STATE) &&
+           qp_cur_state != attr->cur_qp_state) {
+               ret = -EINVAL;
+               ehca_err(ibqp->device,
+                        "Invalid IB_QP_CUR_STATE attr->curr_qp_state=%x <>"
+                        " actual cur_qp_state=%x. ehca_qp=%p qp_num=%x",
+                        attr->cur_qp_state, qp_cur_state, my_qp, ibqp->qp_num);
+               goto modify_qp_exit1;
+       }
+
+       ehca_dbg(ibqp->device, "ehca_qp=%p qp_num=%x current qp_state=%x "
+                "new qp_state=%x attribute_mask=%x",
+                my_qp, ibqp->qp_num, qp_cur_state, attr->qp_state, attr_mask);
+
+       qp_new_state = attr_mask & IB_QP_STATE ? attr->qp_state : qp_cur_state;
+       if (!smi_reset2init &&
+           !ib_modify_qp_is_ok(qp_cur_state, qp_new_state, ibqp->qp_type,
+                               attr_mask, IB_LINK_LAYER_UNSPECIFIED)) {
+               ret = -EINVAL;
+               ehca_err(ibqp->device,
+                        "Invalid qp transition new_state=%x cur_state=%x "
+                        "ehca_qp=%p qp_num=%x attr_mask=%x", qp_new_state,
+                        qp_cur_state, my_qp, ibqp->qp_num, attr_mask);
+               goto modify_qp_exit1;
+       }
+
+       mqpcb->qp_state = ib2ehca_qp_state(qp_new_state);
+       if (mqpcb->qp_state)
+               update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_STATE, 1);
+       else {
+               ret = -EINVAL;
+               ehca_err(ibqp->device, "Invalid new qp state=%x "
+                        "ehca_qp=%p qp_num=%x",
+                        qp_new_state, my_qp, ibqp->qp_num);
+               goto modify_qp_exit1;
+       }
+
+       /* retrieve state transition struct to get req and opt attrs */
+       statetrans = get_modqp_statetrans(qp_cur_state, qp_new_state);
+       if (statetrans < 0) {
+               ret = -EINVAL;
+               ehca_err(ibqp->device, "<INVALID STATE CHANGE> qp_cur_state=%x "
+                        "new_qp_state=%x State_xsition=%x ehca_qp=%p "
+                        "qp_num=%x", qp_cur_state, qp_new_state,
+                        statetrans, my_qp, ibqp->qp_num);
+               goto modify_qp_exit1;
+       }
+
+       qp_attr_idx = ib2ehcaqptype(ibqp->qp_type);
+
+       if (qp_attr_idx < 0) {
+               ret = qp_attr_idx;
+               ehca_err(ibqp->device,
+                        "Invalid QP type=%x ehca_qp=%p qp_num=%x",
+                        ibqp->qp_type, my_qp, ibqp->qp_num);
+               goto modify_qp_exit1;
+       }
+
+       ehca_dbg(ibqp->device,
+                "ehca_qp=%p qp_num=%x <VALID STATE CHANGE> qp_state_xsit=%x",
+                my_qp, ibqp->qp_num, statetrans);
+
+       /* eHCA2 rev2 and higher require the SEND_GRH_FLAG to be set
+        * in non-LL UD QPs.
+        */
+       if ((my_qp->qp_type == IB_QPT_UD) &&
+           (my_qp->ext_type != EQPT_LLQP) &&
+           (statetrans == IB_QPST_INIT2RTR) &&
+           (shca->hw_level >= 0x22)) {
+               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_GRH_FLAG, 1);
+               mqpcb->send_grh_flag = 1;
+       }
+
+       /* sqe -> rts: set purge bit of bad wqe before actual trans */
+       if ((my_qp->qp_type == IB_QPT_UD ||
+            my_qp->qp_type == IB_QPT_GSI ||
+            my_qp->qp_type == IB_QPT_SMI) &&
+           statetrans == IB_QPST_SQE2RTS) {
+               /* mark next free wqe if kernel */
+               if (!ibqp->uobject) {
+                       struct ehca_wqe *wqe;
+                       /* lock send queue */
+                       spin_lock_irqsave(&my_qp->spinlock_s, flags);
+                       squeue_locked = 1;
+                       /* mark next free wqe */
+                       wqe = (struct ehca_wqe *)
+                               ipz_qeit_get(&my_qp->ipz_squeue);
+                       wqe->optype = wqe->wqef = 0xff;
+                       ehca_dbg(ibqp->device, "qp_num=%x next_free_wqe=%p",
+                                ibqp->qp_num, wqe);
+               }
+               ret = prepare_sqe_rts(my_qp, shca, &bad_wqe_cnt);
+               if (ret) {
+                       ehca_err(ibqp->device, "prepare_sqe_rts() failed "
+                                "ehca_qp=%p qp_num=%x ret=%i",
+                                my_qp, ibqp->qp_num, ret);
+                       goto modify_qp_exit2;
+               }
+       }
+
+       /*
+        * enable RDMA_Atomic_Control if reset->init and reliable connection;
+        * this is necessary since gen2 does not provide that flag,
+        * but pHyp requires it
+        */
+       if (statetrans == IB_QPST_RESET2INIT &&
+           (ibqp->qp_type == IB_QPT_RC || ibqp->qp_type == IB_QPT_UC)) {
+               mqpcb->rdma_atomic_ctrl = 3;
+               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_RDMA_ATOMIC_CTRL, 1);
+       }
+       /* circumvention: pHyp requires #RDMA/Atomic Resp Res for UC INIT -> RTR */
+       if (statetrans == IB_QPST_INIT2RTR &&
+           (ibqp->qp_type == IB_QPT_UC) &&
+           !(attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)) {
+               mqpcb->rdma_nr_atomic_resp_res = 1; /* default to 1 */
+               update_mask |=
+                       EHCA_BMASK_SET(MQPCB_MASK_RDMA_NR_ATOMIC_RESP_RES, 1);
+       }
+
+       if (attr_mask & IB_QP_PKEY_INDEX) {
+               if (attr->pkey_index >= 16) {
+                       ret = -EINVAL;
+                       ehca_err(ibqp->device, "Invalid pkey_index=%x. "
+                                "ehca_qp=%p qp_num=%x max_pkey_index=f",
+                                attr->pkey_index, my_qp, ibqp->qp_num);
+                       goto modify_qp_exit2;
+               }
+               mqpcb->prim_p_key_idx = attr->pkey_index;
+               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_PRIM_P_KEY_IDX, 1);
+       }
+       if (attr_mask & IB_QP_PORT) {
+               struct ehca_sport *sport;
+               struct ehca_qp *aqp1;
+               if (attr->port_num < 1 || attr->port_num > shca->num_ports) {
+                       ret = -EINVAL;
+                       ehca_err(ibqp->device, "Invalid port=%x. "
+                                "ehca_qp=%p qp_num=%x num_ports=%x",
+                                attr->port_num, my_qp, ibqp->qp_num,
+                                shca->num_ports);
+                       goto modify_qp_exit2;
+               }
+               sport = &shca->sport[attr->port_num - 1];
+               if (!sport->ibqp_sqp[IB_QPT_GSI]) {
+                       /* should not occur */
+                       ret = -EFAULT;
+                       ehca_err(ibqp->device, "AQP1 was not created for "
+                                "port=%x", attr->port_num);
+                       goto modify_qp_exit2;
+               }
+               aqp1 = container_of(sport->ibqp_sqp[IB_QPT_GSI],
+                                   struct ehca_qp, ib_qp);
+               if (ibqp->qp_type != IB_QPT_GSI &&
+                   ibqp->qp_type != IB_QPT_SMI &&
+                   aqp1->mod_qp_parm) {
+                       /*
+                        * firmware will reject this modify_qp() because
+                        * the port is not yet fully activated/initialized
+                        */
+                       ret = -EFAULT;
+                       ehca_warn(ibqp->device, "Couldn't modify qp port=%x: "
+                                 "either port is being activated (try again) "
+                                 "or cabling issue", attr->port_num);
+                       goto modify_qp_exit2;
+               }
+               mqpcb->prim_phys_port = attr->port_num;
+               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_PRIM_PHYS_PORT, 1);
+       }
+       if (attr_mask & IB_QP_QKEY) {
+               mqpcb->qkey = attr->qkey;
+               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_QKEY, 1);
+       }
+       if (attr_mask & IB_QP_AV) {
+               mqpcb->dlid = attr->ah_attr.dlid;
+               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_DLID, 1);
+               mqpcb->source_path_bits = attr->ah_attr.src_path_bits;
+               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SOURCE_PATH_BITS, 1);
+               mqpcb->service_level = attr->ah_attr.sl;
+               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SERVICE_LEVEL, 1);
+
+               if (ehca_calc_ipd(shca, mqpcb->prim_phys_port,
+                                 attr->ah_attr.static_rate,
+                                 &mqpcb->max_static_rate)) {
+                       ret = -EINVAL;
+                       goto modify_qp_exit2;
+               }
+               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_MAX_STATIC_RATE, 1);
+
+               /*
+                * Always supply the GRH flag, even if it's zero, to give the
+                * hypervisor a clear "yes" or "no" instead of a "perhaps"
+                */
+               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_GRH_FLAG, 1);
+
+               /*
+                * Only if GRH is TRUE may we set SOURCE_GID_IDX and DEST_GID;
+                * otherwise pHyp will return H_ATTR_PARM!!!
+                */
+               if (attr->ah_attr.ah_flags == IB_AH_GRH) {
+                       mqpcb->send_grh_flag = 1;
+
+                       mqpcb->source_gid_idx = attr->ah_attr.grh.sgid_index;
+                       update_mask |=
+                               EHCA_BMASK_SET(MQPCB_MASK_SOURCE_GID_IDX, 1);
+
+                       for (cnt = 0; cnt < 16; cnt++)
+                               mqpcb->dest_gid.byte[cnt] =
+                                       attr->ah_attr.grh.dgid.raw[cnt];
+
+                       update_mask |= EHCA_BMASK_SET(MQPCB_MASK_DEST_GID, 1);
+                       mqpcb->flow_label = attr->ah_attr.grh.flow_label;
+                       update_mask |= EHCA_BMASK_SET(MQPCB_MASK_FLOW_LABEL, 1);
+                       mqpcb->hop_limit = attr->ah_attr.grh.hop_limit;
+                       update_mask |= EHCA_BMASK_SET(MQPCB_MASK_HOP_LIMIT, 1);
+                       mqpcb->traffic_class = attr->ah_attr.grh.traffic_class;
+                       update_mask |=
+                               EHCA_BMASK_SET(MQPCB_MASK_TRAFFIC_CLASS, 1);
+               }
+       }
+
+       if (attr_mask & IB_QP_PATH_MTU) {
+               /* store log2(MTU): ib_mtu enum 1..5 maps to 2^(mtu+7) bytes */
+               my_qp->mtu_shift = attr->path_mtu + 7;
+               mqpcb->path_mtu = attr->path_mtu;
+               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_PATH_MTU, 1);
+       }
+       if (attr_mask & IB_QP_TIMEOUT) {
+               mqpcb->timeout = attr->timeout;
+               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_TIMEOUT, 1);
+       }
+       if (attr_mask & IB_QP_RETRY_CNT) {
+               mqpcb->retry_count = attr->retry_cnt;
+               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_RETRY_COUNT, 1);
+       }
+       if (attr_mask & IB_QP_RNR_RETRY) {
+               mqpcb->rnr_retry_count = attr->rnr_retry;
+               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_RNR_RETRY_COUNT, 1);
+       }
+       if (attr_mask & IB_QP_RQ_PSN) {
+               mqpcb->receive_psn = attr->rq_psn;
+               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_RECEIVE_PSN, 1);
+       }
+       if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) {
+               mqpcb->rdma_nr_atomic_resp_res = attr->max_dest_rd_atomic < 3 ?
+                       attr->max_dest_rd_atomic : 2;
+               update_mask |=
+                       EHCA_BMASK_SET(MQPCB_MASK_RDMA_NR_ATOMIC_RESP_RES, 1);
+       }
+       if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) {
+               mqpcb->rdma_atomic_outst_dest_qp = attr->max_rd_atomic < 3 ?
+                       attr->max_rd_atomic : 2;
+               update_mask |=
+                       EHCA_BMASK_SET
+                       (MQPCB_MASK_RDMA_ATOMIC_OUTST_DEST_QP, 1);
+       }
+       if (attr_mask & IB_QP_ALT_PATH) {
+               if (attr->alt_port_num < 1
+                   || attr->alt_port_num > shca->num_ports) {
+                       ret = -EINVAL;
+                       ehca_err(ibqp->device, "Invalid alt_port=%x. "
+                                "ehca_qp=%p qp_num=%x num_ports=%x",
+                                attr->alt_port_num, my_qp, ibqp->qp_num,
+                                shca->num_ports);
+                       goto modify_qp_exit2;
+               }
+               mqpcb->alt_phys_port = attr->alt_port_num;
+
+               if (attr->alt_pkey_index >= 16) {
+                       ret = -EINVAL;
+                       ehca_err(ibqp->device, "Invalid alt_pkey_index=%x. "
+                                "ehca_qp=%p qp_num=%x max_pkey_index=f",
+                                attr->alt_pkey_index, my_qp, ibqp->qp_num);
+                       goto modify_qp_exit2;
+               }
+               mqpcb->alt_p_key_idx = attr->alt_pkey_index;
+
+               mqpcb->timeout_al = attr->alt_timeout;
+               mqpcb->dlid_al = attr->alt_ah_attr.dlid;
+               mqpcb->source_path_bits_al = attr->alt_ah_attr.src_path_bits;
+               mqpcb->service_level_al = attr->alt_ah_attr.sl;
+
+               if (ehca_calc_ipd(shca, mqpcb->alt_phys_port,
+                                 attr->alt_ah_attr.static_rate,
+                                 &mqpcb->max_static_rate_al)) {
+                       ret = -EINVAL;
+                       goto modify_qp_exit2;
+               }
+
+               /* OpenIB doesn't support alternate retry counts - copy them */
+               mqpcb->retry_count_al = mqpcb->retry_count;
+               mqpcb->rnr_retry_count_al = mqpcb->rnr_retry_count;
+
+               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_ALT_PHYS_PORT, 1)
+                       | EHCA_BMASK_SET(MQPCB_MASK_ALT_P_KEY_IDX, 1)
+                       | EHCA_BMASK_SET(MQPCB_MASK_TIMEOUT_AL, 1)
+                       | EHCA_BMASK_SET(MQPCB_MASK_DLID_AL, 1)
+                       | EHCA_BMASK_SET(MQPCB_MASK_SOURCE_PATH_BITS_AL, 1)
+                       | EHCA_BMASK_SET(MQPCB_MASK_SERVICE_LEVEL_AL, 1)
+                       | EHCA_BMASK_SET(MQPCB_MASK_MAX_STATIC_RATE_AL, 1)
+                       | EHCA_BMASK_SET(MQPCB_MASK_RETRY_COUNT_AL, 1)
+                       | EHCA_BMASK_SET(MQPCB_MASK_RNR_RETRY_COUNT_AL, 1);
+
+               /*
+                * Always supply the GRH flag, even if it's zero, to give the
+                * hypervisor a clear "yes" or "no" instead of a "perhaps"
+                */
+               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_GRH_FLAG_AL, 1);
+
+               /*
+                * Only if GRH is TRUE may we set SOURCE_GID_IDX and DEST_GID;
+                * otherwise pHyp will return H_ATTR_PARM!!!
+                */
+               if (attr->alt_ah_attr.ah_flags == IB_AH_GRH) {
+                       mqpcb->send_grh_flag_al = 1;
+
+                       for (cnt = 0; cnt < 16; cnt++)
+                               mqpcb->dest_gid_al.byte[cnt] =
+                                       attr->alt_ah_attr.grh.dgid.raw[cnt];
+                       mqpcb->source_gid_idx_al =
+                               attr->alt_ah_attr.grh.sgid_index;
+                       mqpcb->flow_label_al = attr->alt_ah_attr.grh.flow_label;
+                       mqpcb->hop_limit_al = attr->alt_ah_attr.grh.hop_limit;
+                       mqpcb->traffic_class_al =
+                               attr->alt_ah_attr.grh.traffic_class;
+
+                       update_mask |=
+                               EHCA_BMASK_SET(MQPCB_MASK_SOURCE_GID_IDX_AL, 1)
+                               | EHCA_BMASK_SET(MQPCB_MASK_DEST_GID_AL, 1)
+                               | EHCA_BMASK_SET(MQPCB_MASK_FLOW_LABEL_AL, 1)
+                               | EHCA_BMASK_SET(MQPCB_MASK_HOP_LIMIT_AL, 1) |
+                               EHCA_BMASK_SET(MQPCB_MASK_TRAFFIC_CLASS_AL, 1);
+               }
+       }
+
+       if (attr_mask & IB_QP_MIN_RNR_TIMER) {
+               mqpcb->min_rnr_nak_timer_field = attr->min_rnr_timer;
+               update_mask |=
+                       EHCA_BMASK_SET(MQPCB_MASK_MIN_RNR_NAK_TIMER_FIELD, 1);
+       }
+
+       if (attr_mask & IB_QP_SQ_PSN) {
+               mqpcb->send_psn = attr->sq_psn;
+               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_PSN, 1);
+       }
+
+       if (attr_mask & IB_QP_DEST_QPN) {
+               mqpcb->dest_qp_nr = attr->dest_qp_num;
+               update_mask |= EHCA_BMASK_SET(MQPCB_MASK_DEST_QP_NR, 1);
+       }
+
+       if (attr_mask & IB_QP_PATH_MIG_STATE) {
+               if (attr->path_mig_state != IB_MIG_REARM
+                   && attr->path_mig_state != IB_MIG_MIGRATED) {
+                       ret = -EINVAL;
+                       ehca_err(ibqp->device, "Invalid mig_state=%x",
+                                attr->path_mig_state);
+                       goto modify_qp_exit2;
+               }
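+               /*
+                * eHCA stores the migration state offset by one from the
+                * ib_mig_state values; ehca_query_qp() subtracts 1 again.
+                */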
+               mqpcb->path_migration_state = attr->path_mig_state + 1;
+               if (attr->path_mig_state == IB_MIG_REARM)
+                       my_qp->mig_armed = 1;
+               update_mask |=
+                       EHCA_BMASK_SET(MQPCB_MASK_PATH_MIGRATION_STATE, 1);
+       }
+
+       if (attr_mask & IB_QP_CAP) {
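+               /*
+                * WR limits are stored with an extra slot (+1);
+                * ehca_query_qp() subtracts 1 when reporting them.
+                */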
+               mqpcb->max_nr_outst_send_wr = attr->cap.max_send_wr+1;
+               update_mask |=
+                       EHCA_BMASK_SET(MQPCB_MASK_MAX_NR_OUTST_SEND_WR, 1);
+               mqpcb->max_nr_outst_recv_wr = attr->cap.max_recv_wr+1;
+               update_mask |=
+                       EHCA_BMASK_SET(MQPCB_MASK_MAX_NR_OUTST_RECV_WR, 1);
+               /* no support for max_send/recv_sge yet */
+       }
+
+       if (ehca_debug_level >= 2)
+               ehca_dmp(mqpcb, 4*70, "qp_num=%x", ibqp->qp_num);
+
+       h_ret = hipz_h_modify_qp(shca->ipz_hca_handle,
+                                my_qp->ipz_qp_handle,
+                                &my_qp->pf,
+                                update_mask,
+                                mqpcb, my_qp->galpas.kernel);
+
+       if (h_ret != H_SUCCESS) {
+               ret = ehca2ib_return_code(h_ret);
+               ehca_err(ibqp->device, "hipz_h_modify_qp() failed h_ret=%lli "
+                        "ehca_qp=%p qp_num=%x", h_ret, my_qp, ibqp->qp_num);
+               goto modify_qp_exit2;
+       }
+
+       if ((my_qp->qp_type == IB_QPT_UD ||
+            my_qp->qp_type == IB_QPT_GSI ||
+            my_qp->qp_type == IB_QPT_SMI) &&
+           statetrans == IB_QPST_SQE2RTS) {
+               /* ring doorbell to reprocess WQEs */
+               iosync(); /* serialize GAL register access */
+               hipz_update_sqa(my_qp, bad_wqe_cnt-1);
+               ehca_gen_dbg("doorbell for %x wqes", bad_wqe_cnt);
+       }
+
+       if (statetrans == IB_QPST_RESET2INIT ||
+           statetrans == IB_QPST_INIT2INIT) {
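+               /*
+                * A second hipz_h_modify_qp() call, with only the ENABLE bit
+                * set in update_mask, is needed to actually enable the QP.
+                */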
+               mqpcb->qp_enable = 1;
+               mqpcb->qp_state = EHCA_QPS_INIT;
+               update_mask = 0;
+               update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_ENABLE, 1);
+
+               h_ret = hipz_h_modify_qp(shca->ipz_hca_handle,
+                                        my_qp->ipz_qp_handle,
+                                        &my_qp->pf,
+                                        update_mask,
+                                        mqpcb,
+                                        my_qp->galpas.kernel);
+
+               if (h_ret != H_SUCCESS) {
+                       ret = ehca2ib_return_code(h_ret);
+                       ehca_err(ibqp->device, "ENABLE in context of "
+                                "RESET_2_INIT failed! Maybe you didn't get "
+                                "a LID h_ret=%lli ehca_qp=%p qp_num=%x",
+                                h_ret, my_qp, ibqp->qp_num);
+                       goto modify_qp_exit2;
+               }
+       }
+       if ((qp_new_state == IB_QPS_ERR) && (qp_cur_state != IB_QPS_ERR)
+           && !is_user) {
+               ret = check_for_left_cqes(my_qp, shca);
+               if (ret)
+                       goto modify_qp_exit2;
+       }
+
+       if (statetrans == IB_QPST_ANY2RESET) {
+               ipz_qeit_reset(&my_qp->ipz_rqueue);
+               ipz_qeit_reset(&my_qp->ipz_squeue);
+
+               if (qp_cur_state == IB_QPS_ERR && !is_user) {
+                       del_from_err_list(my_qp->send_cq, &my_qp->sq_err_node);
+
+                       if (HAS_RQ(my_qp))
+                               del_from_err_list(my_qp->recv_cq,
+                                                 &my_qp->rq_err_node);
+               }
+               if (!is_user)
+                       reset_queue_map(&my_qp->sq_map);
+
+               if (HAS_RQ(my_qp) && !is_user)
+                       reset_queue_map(&my_qp->rq_map);
+       }
+
+       if (attr_mask & IB_QP_QKEY)
+               my_qp->qkey = attr->qkey;
+
+modify_qp_exit2:
+       if (squeue_locked) { /* this means: sqe -> rts */
+               spin_unlock_irqrestore(&my_qp->spinlock_s, flags);
+               my_qp->sqerr_purgeflag = 1;
+       }
+
+modify_qp_exit1:
+       ehca_free_fw_ctrlblock(mqpcb);
+
+       return ret;
+}
+
+int ehca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
+                  struct ib_udata *udata)
+{
+       int ret = 0;
+
+       struct ehca_shca *shca = container_of(ibqp->device, struct ehca_shca,
+                                             ib_device);
+       struct ehca_qp *my_qp = container_of(ibqp, struct ehca_qp, ib_qp);
+
+       /* The if-block below caches qp_attr to be modified for GSI and SMI
+        * qps during initialization by ib_mad. When the respective port is
+        * activated, i.e. we receive a PORT_ACTIVE event, we replay the
+        * cached sequence of modify calls, see ehca_recover_sqp() below.
+        * Why this is required:
+        * 1) If only one port is connected, older code required that port
+        *    to be port one and the module option nr_ports=1 to be given
+        *    by the user, which is very inconvenient for the end user.
+        * 2) Firmware accepts modify_qp() only once the respective port has
+        *    become active. Older code had a 30 sec wait loop in create_qp()/
+        *    define_aqp1(), which is not appropriate in practice. This
+        *    code removes that wait loop, see define_aqp1(), and always
+        *    reports all ports to ib_mad and thus to its users. Only
+        *    activated ports will then be usable for those users.
+        */
+       if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI) {
+               int port = my_qp->init_attr.port_num;
+               struct ehca_sport *sport = &shca->sport[port - 1];
+               unsigned long flags;
+               spin_lock_irqsave(&sport->mod_sqp_lock, flags);
+               /* cache qp_attr only during init */
+               if (my_qp->mod_qp_parm) {
+                       struct ehca_mod_qp_parm *p;
+                       if (my_qp->mod_qp_parm_idx >= EHCA_MOD_QP_PARM_MAX) {
+                               ehca_err(&shca->ib_device,
+                                        "mod_qp_parm overflow state=%x port=%x"
+                                        " type=%x", attr->qp_state,
+                                        my_qp->init_attr.port_num,
+                                        ibqp->qp_type);
+                               spin_unlock_irqrestore(&sport->mod_sqp_lock,
+                                                      flags);
+                               return -EINVAL;
+                       }
+                       p = &my_qp->mod_qp_parm[my_qp->mod_qp_parm_idx];
+                       p->mask = attr_mask;
+                       p->attr = *attr;
+                       my_qp->mod_qp_parm_idx++;
+                       ehca_dbg(&shca->ib_device,
+                                "Saved qp_attr for state=%x port=%x type=%x",
+                                attr->qp_state, my_qp->init_attr.port_num,
+                                ibqp->qp_type);
+                       spin_unlock_irqrestore(&sport->mod_sqp_lock, flags);
+                       goto out;
+               }
+               spin_unlock_irqrestore(&sport->mod_sqp_lock, flags);
+       }
+
+       ret = internal_modify_qp(ibqp, attr, attr_mask, 0);
+
+out:
+       if ((ret == 0) && (attr_mask & IB_QP_STATE))
+               my_qp->state = attr->qp_state;
+
+       return ret;
+}
+
+void ehca_recover_sqp(struct ib_qp *sqp)
+{
+       struct ehca_qp *my_sqp = container_of(sqp, struct ehca_qp, ib_qp);
+       int port = my_sqp->init_attr.port_num;
+       struct ib_qp_attr attr;
+       struct ehca_mod_qp_parm *qp_parm;
+       int i, qp_parm_idx, ret;
+       unsigned long flags, wr_cnt;
+
+       if (!my_sqp->mod_qp_parm)
+               return;
+       ehca_dbg(sqp->device, "SQP port=%x qp_num=%x", port, sqp->qp_num);
+
+       qp_parm = my_sqp->mod_qp_parm;
+       qp_parm_idx = my_sqp->mod_qp_parm_idx;
+       for (i = 0; i < qp_parm_idx; i++) {
+               attr = qp_parm[i].attr;
+               ret = internal_modify_qp(sqp, &attr, qp_parm[i].mask, 0);
+               if (ret) {
+                       ehca_err(sqp->device, "Could not modify SQP port=%x "
+                                "qp_num=%x ret=%x", port, sqp->qp_num, ret);
+                       goto free_qp_parm;
+               }
+               ehca_dbg(sqp->device, "SQP port=%x qp_num=%x in state=%x",
+                        port, sqp->qp_num, attr.qp_state);
+       }
+
+       /* re-trigger posted recv wrs */
+       wr_cnt =  my_sqp->ipz_rqueue.current_q_offset /
+               my_sqp->ipz_rqueue.qe_size;
+       if (wr_cnt) {
+               spin_lock_irqsave(&my_sqp->spinlock_r, flags);
+               hipz_update_rqa(my_sqp, wr_cnt);
+               spin_unlock_irqrestore(&my_sqp->spinlock_r, flags);
+               ehca_dbg(sqp->device, "doorbell port=%x qp_num=%x wr_cnt=%lx",
+                        port, sqp->qp_num, wr_cnt);
+       }
+
+free_qp_parm:
+       kfree(qp_parm);
+       /* this prevents subsequent calls to modify_qp() from caching qp_attr */
+       my_sqp->mod_qp_parm = NULL;
+}
+
+int ehca_query_qp(struct ib_qp *qp,
+                 struct ib_qp_attr *qp_attr,
+                 int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr)
+{
+       struct ehca_qp *my_qp = container_of(qp, struct ehca_qp, ib_qp);
+       struct ehca_shca *shca = container_of(qp->device, struct ehca_shca,
+                                             ib_device);
+       struct ipz_adapter_handle adapter_handle = shca->ipz_hca_handle;
+       struct hcp_modify_qp_control_block *qpcb;
+       int cnt, ret = 0;
+       u64 h_ret;
+
+       if (qp_attr_mask & QP_ATTR_QUERY_NOT_SUPPORTED) {
+               ehca_err(qp->device, "Invalid attribute mask "
+                        "ehca_qp=%p qp_num=%x qp_attr_mask=%x ",
+                        my_qp, qp->qp_num, qp_attr_mask);
+               return -EINVAL;
+       }
+
+       qpcb = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+       if (!qpcb) {
+               ehca_err(qp->device, "Out of memory for qpcb "
+                        "ehca_qp=%p qp_num=%x", my_qp, qp->qp_num);
+               return -ENOMEM;
+       }
+
+       h_ret = hipz_h_query_qp(adapter_handle,
+                               my_qp->ipz_qp_handle,
+                               &my_qp->pf,
+                               qpcb, my_qp->galpas.kernel);
+
+       if (h_ret != H_SUCCESS) {
+               ret = ehca2ib_return_code(h_ret);
+               ehca_err(qp->device, "hipz_h_query_qp() failed "
+                        "ehca_qp=%p qp_num=%x h_ret=%lli",
+                        my_qp, qp->qp_num, h_ret);
+               goto query_qp_exit1;
+       }
+
+       qp_attr->cur_qp_state = ehca2ib_qp_state(qpcb->qp_state);
+       qp_attr->qp_state = qp_attr->cur_qp_state;
+
+       if (qp_attr->cur_qp_state == -EINVAL) {
+               ret = -EINVAL;
+               ehca_err(qp->device, "Got invalid ehca_qp_state=%x "
+                        "ehca_qp=%p qp_num=%x",
+                        qpcb->qp_state, my_qp, qp->qp_num);
+               goto query_qp_exit1;
+       }
+
+       if (qp_attr->qp_state == IB_QPS_SQD)
+               qp_attr->sq_draining = 1;
+
+       qp_attr->qkey = qpcb->qkey;
+       qp_attr->path_mtu = qpcb->path_mtu;
+       qp_attr->path_mig_state = qpcb->path_migration_state - 1;
+       qp_attr->rq_psn = qpcb->receive_psn;
+       qp_attr->sq_psn = qpcb->send_psn;
+       qp_attr->min_rnr_timer = qpcb->min_rnr_nak_timer_field;
+       qp_attr->cap.max_send_wr = qpcb->max_nr_outst_send_wr-1;
+       qp_attr->cap.max_recv_wr = qpcb->max_nr_outst_recv_wr-1;
+       /* UD_AV CIRCUMVENTION */
+       if (my_qp->qp_type == IB_QPT_UD) {
+               qp_attr->cap.max_send_sge =
+                       qpcb->actual_nr_sges_in_sq_wqe - 2;
+               qp_attr->cap.max_recv_sge =
+                       qpcb->actual_nr_sges_in_rq_wqe - 2;
+       } else {
+               qp_attr->cap.max_send_sge =
+                       qpcb->actual_nr_sges_in_sq_wqe;
+               qp_attr->cap.max_recv_sge =
+                       qpcb->actual_nr_sges_in_rq_wqe;
+       }
+
+       qp_attr->cap.max_inline_data = my_qp->sq_max_inline_data_size;
+       qp_attr->dest_qp_num = qpcb->dest_qp_nr;
+
+       qp_attr->pkey_index = qpcb->prim_p_key_idx;
+       qp_attr->port_num = qpcb->prim_phys_port;
+       qp_attr->timeout = qpcb->timeout;
+       qp_attr->retry_cnt = qpcb->retry_count;
+       qp_attr->rnr_retry = qpcb->rnr_retry_count;
+
+       qp_attr->alt_pkey_index = qpcb->alt_p_key_idx;
+       qp_attr->alt_port_num = qpcb->alt_phys_port;
+       qp_attr->alt_timeout = qpcb->timeout_al;
+
+       qp_attr->max_dest_rd_atomic = qpcb->rdma_nr_atomic_resp_res;
+       qp_attr->max_rd_atomic = qpcb->rdma_atomic_outst_dest_qp;
+
+       /* primary av */
+       qp_attr->ah_attr.sl = qpcb->service_level;
+
+       if (qpcb->send_grh_flag) {
+               qp_attr->ah_attr.ah_flags = IB_AH_GRH;
+       }
+
+       qp_attr->ah_attr.static_rate = qpcb->max_static_rate;
+       qp_attr->ah_attr.dlid = qpcb->dlid;
+       qp_attr->ah_attr.src_path_bits = qpcb->source_path_bits;
+       qp_attr->ah_attr.port_num = qp_attr->port_num;
+
+       /* primary GRH */
+       qp_attr->ah_attr.grh.traffic_class = qpcb->traffic_class;
+       qp_attr->ah_attr.grh.hop_limit = qpcb->hop_limit;
+       qp_attr->ah_attr.grh.sgid_index = qpcb->source_gid_idx;
+       qp_attr->ah_attr.grh.flow_label = qpcb->flow_label;
+
+       for (cnt = 0; cnt < 16; cnt++)
+               qp_attr->ah_attr.grh.dgid.raw[cnt] =
+                       qpcb->dest_gid.byte[cnt];
+
+       /* alternate AV */
+       qp_attr->alt_ah_attr.sl = qpcb->service_level_al;
+       if (qpcb->send_grh_flag_al) {
+               qp_attr->alt_ah_attr.ah_flags = IB_AH_GRH;
+       }
+
+       qp_attr->alt_ah_attr.static_rate = qpcb->max_static_rate_al;
+       qp_attr->alt_ah_attr.dlid = qpcb->dlid_al;
+       qp_attr->alt_ah_attr.src_path_bits = qpcb->source_path_bits_al;
+
+       /* alternate GRH */
+       qp_attr->alt_ah_attr.grh.traffic_class = qpcb->traffic_class_al;
+       qp_attr->alt_ah_attr.grh.hop_limit = qpcb->hop_limit_al;
+       qp_attr->alt_ah_attr.grh.sgid_index = qpcb->source_gid_idx_al;
+       qp_attr->alt_ah_attr.grh.flow_label = qpcb->flow_label_al;
+
+       for (cnt = 0; cnt < 16; cnt++)
+               qp_attr->alt_ah_attr.grh.dgid.raw[cnt] =
+                       qpcb->dest_gid_al.byte[cnt];
+
+       /* return init attributes given in ehca_create_qp */
+       if (qp_init_attr)
+               *qp_init_attr = my_qp->init_attr;
+
+       if (ehca_debug_level >= 2)
+               ehca_dmp(qpcb, 4*70, "qp_num=%x", qp->qp_num);
+
+query_qp_exit1:
+       ehca_free_fw_ctrlblock(qpcb);
+
+       return ret;
+}
+
+int ehca_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+                   enum ib_srq_attr_mask attr_mask, struct ib_udata *udata)
+{
+       struct ehca_qp *my_qp =
+               container_of(ibsrq, struct ehca_qp, ib_srq);
+       struct ehca_shca *shca =
+               container_of(ibsrq->pd->device, struct ehca_shca, ib_device);
+       struct hcp_modify_qp_control_block *mqpcb;
+       u64 update_mask;
+       u64 h_ret;
+       int ret = 0;
+
+       mqpcb = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+       if (!mqpcb) {
+               ehca_err(ibsrq->device, "Could not get zeroed page for mqpcb "
+                        "ehca_qp=%p qp_num=%x ", my_qp, my_qp->real_qp_num);
+               return -ENOMEM;
+       }
+
+       update_mask = 0;
+       if (attr_mask & IB_SRQ_LIMIT) {
+               attr_mask &= ~IB_SRQ_LIMIT;
+               update_mask |=
+                       EHCA_BMASK_SET(MQPCB_MASK_CURR_SRQ_LIMIT, 1)
+                       | EHCA_BMASK_SET(MQPCB_MASK_QP_AFF_ASYN_EV_LOG_REG, 1);
+               mqpcb->curr_srq_limit = attr->srq_limit;
+               mqpcb->qp_aff_asyn_ev_log_reg =
+                       EHCA_BMASK_SET(QPX_AAELOG_RESET_SRQ_LIMIT, 1);
+       }
+
+       /* by now, all bits in attr_mask should have been cleared */
+       if (attr_mask) {
+               ehca_err(ibsrq->device, "invalid attribute mask bits set  "
+                        "attr_mask=%x", attr_mask);
+               ret = -EINVAL;
+               goto modify_srq_exit0;
+       }
+
+       if (ehca_debug_level >= 2)
+               ehca_dmp(mqpcb, 4*70, "qp_num=%x", my_qp->real_qp_num);
+
+       h_ret = hipz_h_modify_qp(shca->ipz_hca_handle, my_qp->ipz_qp_handle,
+                                NULL, update_mask, mqpcb,
+                                my_qp->galpas.kernel);
+
+       if (h_ret != H_SUCCESS) {
+               ret = ehca2ib_return_code(h_ret);
+               ehca_err(ibsrq->device, "hipz_h_modify_qp() failed h_ret=%lli "
+                        "ehca_qp=%p qp_num=%x",
+                        h_ret, my_qp, my_qp->real_qp_num);
+       }
+
+modify_srq_exit0:
+       ehca_free_fw_ctrlblock(mqpcb);
+
+       return ret;
+}
+
+int ehca_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr)
+{
+       struct ehca_qp *my_qp = container_of(srq, struct ehca_qp, ib_srq);
+       struct ehca_shca *shca = container_of(srq->device, struct ehca_shca,
+                                             ib_device);
+       struct ipz_adapter_handle adapter_handle = shca->ipz_hca_handle;
+       struct hcp_modify_qp_control_block *qpcb;
+       int ret = 0;
+       u64 h_ret;
+
+       qpcb = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+       if (!qpcb) {
+               ehca_err(srq->device, "Out of memory for qpcb "
+                        "ehca_qp=%p qp_num=%x", my_qp, my_qp->real_qp_num);
+               return -ENOMEM;
+       }
+
+       h_ret = hipz_h_query_qp(adapter_handle, my_qp->ipz_qp_handle,
+                               NULL, qpcb, my_qp->galpas.kernel);
+
+       if (h_ret != H_SUCCESS) {
+               ret = ehca2ib_return_code(h_ret);
+               ehca_err(srq->device, "hipz_h_query_qp() failed "
+                        "ehca_qp=%p qp_num=%x h_ret=%lli",
+                        my_qp, my_qp->real_qp_num, h_ret);
+               goto query_srq_exit1;
+       }
+
+       srq_attr->max_wr = qpcb->max_nr_outst_recv_wr - 1;
+       srq_attr->max_sge = 3;
+       srq_attr->srq_limit = qpcb->curr_srq_limit;
+
+       if (ehca_debug_level >= 2)
+               ehca_dmp(qpcb, 4*70, "qp_num=%x", my_qp->real_qp_num);
+
+query_srq_exit1:
+       ehca_free_fw_ctrlblock(qpcb);
+
+       return ret;
+}
+
+static int internal_destroy_qp(struct ib_device *dev, struct ehca_qp *my_qp,
+                              struct ib_uobject *uobject)
+{
+       struct ehca_shca *shca = container_of(dev, struct ehca_shca, ib_device);
+       struct ehca_pd *my_pd = container_of(my_qp->ib_qp.pd, struct ehca_pd,
+                                            ib_pd);
+       struct ehca_sport *sport = &shca->sport[my_qp->init_attr.port_num - 1];
+       u32 qp_num = my_qp->real_qp_num;
+       int ret;
+       u64 h_ret;
+       u8 port_num;
+       int is_user = 0;
+       enum ib_qp_type qp_type;
+       unsigned long flags;
+
+       if (uobject) {
+               is_user = 1;
+               if (my_qp->mm_count_galpa ||
+                   my_qp->mm_count_rqueue || my_qp->mm_count_squeue) {
+                       ehca_err(dev, "Resources still referenced in "
+                                "user space qp_num=%x", qp_num);
+                       return -EINVAL;
+               }
+       }
+
+       if (my_qp->send_cq) {
+               ret = ehca_cq_unassign_qp(my_qp->send_cq, qp_num);
+               if (ret) {
+                       ehca_err(dev, "Couldn't unassign qp from "
+                                "send_cq ret=%i qp_num=%x cq_num=%x", ret,
+                                qp_num, my_qp->send_cq->cq_number);
+                       return ret;
+               }
+       }
+
+       write_lock_irqsave(&ehca_qp_idr_lock, flags);
+       idr_remove(&ehca_qp_idr, my_qp->token);
+       write_unlock_irqrestore(&ehca_qp_idr_lock, flags);
+
+       /*
+        * SRQs will never get into an error list and do not have a recv_cq,
+        * so we need to skip them here.
+        */
+       if (HAS_RQ(my_qp) && !IS_SRQ(my_qp) && !is_user)
+               del_from_err_list(my_qp->recv_cq, &my_qp->rq_err_node);
+
+       if (HAS_SQ(my_qp) && !is_user)
+               del_from_err_list(my_qp->send_cq, &my_qp->sq_err_node);
+
+       /* now wait until all pending events have completed */
+       wait_event(my_qp->wait_completion, !atomic_read(&my_qp->nr_events));
+
+       h_ret = hipz_h_destroy_qp(shca->ipz_hca_handle, my_qp);
+       if (h_ret != H_SUCCESS) {
+               ehca_err(dev, "hipz_h_destroy_qp() failed h_ret=%lli "
+                        "ehca_qp=%p qp_num=%x", h_ret, my_qp, qp_num);
+               return ehca2ib_return_code(h_ret);
+       }
+
+       port_num = my_qp->init_attr.port_num;
+       qp_type  = my_qp->init_attr.qp_type;
+
+       if (qp_type == IB_QPT_SMI || qp_type == IB_QPT_GSI) {
+               spin_lock_irqsave(&sport->mod_sqp_lock, flags);
+               kfree(my_qp->mod_qp_parm);
+               my_qp->mod_qp_parm = NULL;
+               shca->sport[port_num - 1].ibqp_sqp[qp_type] = NULL;
+               spin_unlock_irqrestore(&sport->mod_sqp_lock, flags);
+       }
+
+       /* no support for IB_QPT_SMI yet */
+       if (qp_type == IB_QPT_GSI) {
+               struct ib_event event;
+               ehca_info(dev, "device %s: port %x is inactive.",
+                               shca->ib_device.name, port_num);
+               event.device = &shca->ib_device;
+               event.event = IB_EVENT_PORT_ERR;
+               event.element.port_num = port_num;
+               shca->sport[port_num - 1].port_state = IB_PORT_DOWN;
+               ib_dispatch_event(&event);
+       }
+
+       if (HAS_RQ(my_qp)) {
+               ipz_queue_dtor(my_pd, &my_qp->ipz_rqueue);
+               if (!is_user)
+                       vfree(my_qp->rq_map.map);
+       }
+       if (HAS_SQ(my_qp)) {
+               ipz_queue_dtor(my_pd, &my_qp->ipz_squeue);
+               if (!is_user)
+                       vfree(my_qp->sq_map.map);
+       }
+       kmem_cache_free(qp_cache, my_qp);
+       atomic_dec(&shca->num_qps);
+       return 0;
+}
+
+int ehca_destroy_qp(struct ib_qp *qp)
+{
+       return internal_destroy_qp(qp->device,
+                                  container_of(qp, struct ehca_qp, ib_qp),
+                                  qp->uobject);
+}
+
+int ehca_destroy_srq(struct ib_srq *srq)
+{
+       return internal_destroy_qp(srq->device,
+                                  container_of(srq, struct ehca_qp, ib_srq),
+                                  srq->uobject);
+}
+
+int ehca_init_qp_cache(void)
+{
+       qp_cache = kmem_cache_create("ehca_cache_qp",
+                                    sizeof(struct ehca_qp), 0,
+                                    SLAB_HWCACHE_ALIGN,
+                                    NULL);
+       if (!qp_cache)
+               return -ENOMEM;
+       return 0;
+}
+
+void ehca_cleanup_qp_cache(void)
+{
+       if (qp_cache)
+               kmem_cache_destroy(qp_cache);
+}
diff --git a/drivers/staging/rdma/ehca/ehca_reqs.c b/drivers/staging/rdma/ehca/ehca_reqs.c
new file mode 100644 (file)
index 0000000..47f9498
--- /dev/null
@@ -0,0 +1,953 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  post_send/recv, poll_cq, req_notify
+ *
+ *  Authors: Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *           Waleri Fomin <fomin@de.ibm.com>
+ *           Joachim Fenkes <fenkes@de.ibm.com>
+ *           Reinhard Ernst <rernst@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include "ehca_classes.h"
+#include "ehca_tools.h"
+#include "ehca_qes.h"
+#include "ehca_iverbs.h"
+#include "hcp_if.h"
+#include "hipz_fns.h"
+
+/* in RC traffic, insert an empty RDMA READ after this many packets */
+#define ACK_CIRC_THRESHOLD 2000000
+
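+/*
+ * Replace the low QMAP_IDX_MASK bits of a work request id with the queue
+ * map index; the application's original low bits are saved in the queue
+ * map entry via get_app_wr_id() below.
+ */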
+static u64 replace_wr_id(u64 wr_id, u16 idx)
+{
+       u64 ret;
+
+       ret = wr_id & ~QMAP_IDX_MASK;
+       ret |= idx & QMAP_IDX_MASK;
+
+       return ret;
+}
+
+static u16 get_app_wr_id(u64 wr_id)
+{
+       return wr_id & QMAP_IDX_MASK;
+}
+
+static inline int ehca_write_rwqe(struct ipz_queue *ipz_rqueue,
+                                 struct ehca_wqe *wqe_p,
+                                 struct ib_recv_wr *recv_wr,
+                                 u32 rq_map_idx)
+{
+       u8 cnt_ds;
+       if (unlikely((recv_wr->num_sge < 0) ||
+                    (recv_wr->num_sge > ipz_rqueue->act_nr_of_sg))) {
+               ehca_gen_err("Invalid number of WQE SGE. "
+                        "num_sge=%x max_nr_of_sg=%x",
+                        recv_wr->num_sge, ipz_rqueue->act_nr_of_sg);
+               return -EINVAL; /* invalid SG list length */
+       }
+
+       /* clear wqe header until sglist */
+       memset(wqe_p, 0, offsetof(struct ehca_wqe, u.ud_av.sg_list));
+
+       wqe_p->work_request_id = replace_wr_id(recv_wr->wr_id, rq_map_idx);
+       wqe_p->nr_of_data_seg = recv_wr->num_sge;
+
+       for (cnt_ds = 0; cnt_ds < recv_wr->num_sge; cnt_ds++) {
+               wqe_p->u.all_rcv.sg_list[cnt_ds].vaddr =
+                       recv_wr->sg_list[cnt_ds].addr;
+               wqe_p->u.all_rcv.sg_list[cnt_ds].lkey =
+                       recv_wr->sg_list[cnt_ds].lkey;
+               wqe_p->u.all_rcv.sg_list[cnt_ds].length =
+                       recv_wr->sg_list[cnt_ds].length;
+       }
+
+       if (ehca_debug_level >= 3) {
+               ehca_gen_dbg("RECEIVE WQE written into ipz_rqueue=%p",
+                            ipz_rqueue);
+               ehca_dmp(wqe_p, 16*(6 + wqe_p->nr_of_data_seg), "recv wqe");
+       }
+
+       return 0;
+}
+
+#if defined(DEBUG_GSI_SEND_WR)
+
+/* need ib_mad struct */
+#include <rdma/ib_mad.h>
+
+static void trace_send_wr_ud(const struct ib_send_wr *send_wr)
+{
+       int idx = 0;
+       int j;
+       while (send_wr) {
+               struct ib_mad_hdr *mad_hdr = send_wr->wr.ud.mad_hdr;
+               struct ib_sge *sge = send_wr->sg_list;
+               ehca_gen_dbg("send_wr#%x wr_id=%lx num_sge=%x "
+                            "send_flags=%x opcode=%x", idx, send_wr->wr_id,
+                            send_wr->num_sge, send_wr->send_flags,
+                            send_wr->opcode);
+               if (mad_hdr) {
+                       ehca_gen_dbg("send_wr#%x mad_hdr base_version=%x "
+                                    "mgmt_class=%x class_version=%x method=%x "
+                                    "status=%x class_specific=%x tid=%lx "
+                                    "attr_id=%x resv=%x attr_mod=%x",
+                                    idx, mad_hdr->base_version,
+                                    mad_hdr->mgmt_class,
+                                    mad_hdr->class_version, mad_hdr->method,
+                                    mad_hdr->status, mad_hdr->class_specific,
+                                    mad_hdr->tid, mad_hdr->attr_id,
+                                    mad_hdr->resv,
+                                    mad_hdr->attr_mod);
+               }
+               for (j = 0; j < send_wr->num_sge; j++) {
+                       u8 *data = __va(sge->addr);
+                       ehca_gen_dbg("send_wr#%x sge#%x addr=%p length=%x "
+                                    "lkey=%x",
+                                    idx, j, data, sge->length, sge->lkey);
+                       /* assume length is n*16 */
+                       ehca_dmp(data, sge->length, "send_wr#%x sge#%x",
+                                idx, j);
+                       sge++;
+               } /* eof for j */
+               idx++;
+               send_wr = send_wr->next;
+       } /* eof while send_wr */
+}
+
+#endif /* DEBUG_GSI_SEND_WR */
+
+static inline int ehca_write_swqe(struct ehca_qp *qp,
+                                 struct ehca_wqe *wqe_p,
+                                 const struct ib_send_wr *send_wr,
+                                 u32 sq_map_idx,
+                                 int hidden)
+{
+       u32 idx;
+       u64 dma_length;
+       struct ehca_av *my_av;
+       u32 remote_qkey = send_wr->wr.ud.remote_qkey;
+       struct ehca_qmap_entry *qmap_entry = &qp->sq_map.map[sq_map_idx];
+
+       if (unlikely((send_wr->num_sge < 0) ||
+                    (send_wr->num_sge > qp->ipz_squeue.act_nr_of_sg))) {
+               ehca_gen_err("Invalid number of WQE SGE. "
+                        "num_sge=%x max_nr_of_sg=%x",
+                        send_wr->num_sge, qp->ipz_squeue.act_nr_of_sg);
+               return -EINVAL; /* invalid SG list length */
+       }
+
+       /* clear wqe header until sglist */
+       memset(wqe_p, 0, offsetof(struct ehca_wqe, u.ud_av.sg_list));
+
+       wqe_p->work_request_id = replace_wr_id(send_wr->wr_id, sq_map_idx);
+
+       qmap_entry->app_wr_id = get_app_wr_id(send_wr->wr_id);
+       qmap_entry->reported = 0;
+       qmap_entry->cqe_req = 0;
+
+       switch (send_wr->opcode) {
+       case IB_WR_SEND:
+       case IB_WR_SEND_WITH_IMM:
+               wqe_p->optype = WQE_OPTYPE_SEND;
+               break;
+       case IB_WR_RDMA_WRITE:
+       case IB_WR_RDMA_WRITE_WITH_IMM:
+               wqe_p->optype = WQE_OPTYPE_RDMAWRITE;
+               break;
+       case IB_WR_RDMA_READ:
+               wqe_p->optype = WQE_OPTYPE_RDMAREAD;
+               break;
+       default:
+               ehca_gen_err("Invalid opcode=%x", send_wr->opcode);
+               return -EINVAL; /* invalid opcode */
+       }
+
+       wqe_p->wqef = (send_wr->opcode) & WQEF_HIGH_NIBBLE;
+
+       wqe_p->wr_flag = 0;
+
+       if ((send_wr->send_flags & IB_SEND_SIGNALED ||
+           qp->init_attr.sq_sig_type == IB_SIGNAL_ALL_WR)
+           && !hidden) {
+               wqe_p->wr_flag |= WQE_WRFLAG_REQ_SIGNAL_COM;
+               qmap_entry->cqe_req = 1;
+       }
+
+       if (send_wr->opcode == IB_WR_SEND_WITH_IMM ||
+           send_wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) {
+               /* this might not work as long as HW does not support it */
+               wqe_p->immediate_data = be32_to_cpu(send_wr->ex.imm_data);
+               wqe_p->wr_flag |= WQE_WRFLAG_IMM_DATA_PRESENT;
+       }
+
+       wqe_p->nr_of_data_seg = send_wr->num_sge;
+
+       switch (qp->qp_type) {
+       case IB_QPT_SMI:
+       case IB_QPT_GSI:
+               /* no break is intentional here */
+       case IB_QPT_UD:
+               /*
+                * IB 1.2 spec C10-15: if the high bit of the remote Q_Key is
+                * set, use the QP's own qkey instead
+                */
+               if (send_wr->wr.ud.remote_qkey & 0x80000000)
+                       remote_qkey = qp->qkey;
+
+               wqe_p->destination_qp_number = send_wr->wr.ud.remote_qpn << 8;
+               wqe_p->local_ee_context_qkey = remote_qkey;
+               if (unlikely(!send_wr->wr.ud.ah)) {
+                       ehca_gen_err("wr.ud.ah is NULL. qp=%p", qp);
+                       return -EINVAL;
+               }
+               if (unlikely(send_wr->wr.ud.remote_qpn == 0)) {
+                       ehca_gen_err("dest QP# is 0. qp=%x", qp->real_qp_num);
+                       return -EINVAL;
+               }
+               my_av = container_of(send_wr->wr.ud.ah, struct ehca_av, ib_ah);
+               wqe_p->u.ud_av.ud_av = my_av->av;
+
+               /*
+                * omitted check of IB_SEND_INLINE
+                * since HW does not support it
+                */
+               for (idx = 0; idx < send_wr->num_sge; idx++) {
+                       wqe_p->u.ud_av.sg_list[idx].vaddr =
+                               send_wr->sg_list[idx].addr;
+                       wqe_p->u.ud_av.sg_list[idx].lkey =
+                               send_wr->sg_list[idx].lkey;
+                       wqe_p->u.ud_av.sg_list[idx].length =
+                               send_wr->sg_list[idx].length;
+               } /* eof for idx */
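+               /* management QPs (SMI/GSI) always use a 256 byte path MTU */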
+               if (qp->qp_type == IB_QPT_SMI ||
+                   qp->qp_type == IB_QPT_GSI)
+                       wqe_p->u.ud_av.ud_av.pmtu = 1;
+               if (qp->qp_type == IB_QPT_GSI) {
+                       wqe_p->pkeyi = send_wr->wr.ud.pkey_index;
+#ifdef DEBUG_GSI_SEND_WR
+                       trace_send_wr_ud(send_wr);
+#endif /* DEBUG_GSI_SEND_WR */
+               }
+               break;
+
+       case IB_QPT_UC:
+               if (send_wr->send_flags & IB_SEND_FENCE)
+                       wqe_p->wr_flag |= WQE_WRFLAG_FENCE;
+               /* no break is intentional here */
+       case IB_QPT_RC:
+               /* TODO: atomic not implemented */
+               wqe_p->u.nud.remote_virtual_address =
+                       send_wr->wr.rdma.remote_addr;
+               wqe_p->u.nud.rkey = send_wr->wr.rdma.rkey;
+
+               /*
+                * omitted checking of IB_SEND_INLINE
+                * since HW does not support it
+                */
+               dma_length = 0;
+               for (idx = 0; idx < send_wr->num_sge; idx++) {
+                       wqe_p->u.nud.sg_list[idx].vaddr =
+                               send_wr->sg_list[idx].addr;
+                       wqe_p->u.nud.sg_list[idx].lkey =
+                               send_wr->sg_list[idx].lkey;
+                       wqe_p->u.nud.sg_list[idx].length =
+                               send_wr->sg_list[idx].length;
+                       dma_length += send_wr->sg_list[idx].length;
+               } /* eof idx */
+               wqe_p->u.nud.atomic_1st_op_dma_len = dma_length;
+
+               /* unsolicited ack circumvention */
+               if (send_wr->opcode == IB_WR_RDMA_READ) {
+                       /* on RDMA read, switch on and reset counters */
+                       qp->message_count = qp->packet_count = 0;
+                       qp->unsol_ack_circ = 1;
+               } else
+                       /* else estimate #packets */
+                       qp->packet_count += (dma_length >> qp->mtu_shift) + 1;
+
+               break;
+
+       default:
+               ehca_gen_err("Invalid qptype=%x", qp->qp_type);
+               return -EINVAL;
+       }
+
+       if (ehca_debug_level >= 3) {
+               ehca_gen_dbg("SEND WQE written into queue qp=%p ", qp);
+               ehca_dmp(wqe_p, 16*(6 + wqe_p->nr_of_data_seg), "send wqe");
+       }
+       return 0;
+}
+
+/* map_ib_wc_status converts raw cqe_status to ib_wc_status */
+static inline void map_ib_wc_status(u32 cqe_status,
+                                   enum ib_wc_status *wc_status)
+{
+       if (unlikely(cqe_status & WC_STATUS_ERROR_BIT)) {
+               switch (cqe_status & 0x3F) {
+               case 0x01:
+               case 0x21:
+                       *wc_status = IB_WC_LOC_LEN_ERR;
+                       break;
+               case 0x02:
+               case 0x22:
+                       *wc_status = IB_WC_LOC_QP_OP_ERR;
+                       break;
+               case 0x03:
+               case 0x23:
+                       *wc_status = IB_WC_LOC_EEC_OP_ERR;
+                       break;
+               case 0x04:
+               case 0x24:
+                       *wc_status = IB_WC_LOC_PROT_ERR;
+                       break;
+               case 0x05:
+               case 0x25:
+                       *wc_status = IB_WC_WR_FLUSH_ERR;
+                       break;
+               case 0x06:
+                       *wc_status = IB_WC_MW_BIND_ERR;
+                       break;
+               case 0x07: /* remote error - look into bits 20:24 */
+                       switch ((cqe_status
+                                & WC_STATUS_REMOTE_ERROR_FLAGS) >> 11) {
+                       case 0x0:
+                               /*
+                                * PSN Sequence Error!
+                                * couldn't find a matching status!
+                                */
+                               *wc_status = IB_WC_GENERAL_ERR;
+                               break;
+                       case 0x1:
+                               *wc_status = IB_WC_REM_INV_REQ_ERR;
+                               break;
+                       case 0x2:
+                               *wc_status = IB_WC_REM_ACCESS_ERR;
+                               break;
+                       case 0x3:
+                               *wc_status = IB_WC_REM_OP_ERR;
+                               break;
+                       case 0x4:
+                               *wc_status = IB_WC_REM_INV_RD_REQ_ERR;
+                               break;
+                       }
+                       break;
+               case 0x08:
+                       *wc_status = IB_WC_RETRY_EXC_ERR;
+                       break;
+               case 0x09:
+                       *wc_status = IB_WC_RNR_RETRY_EXC_ERR;
+                       break;
+               case 0x0A:
+               case 0x2D:
+                       *wc_status = IB_WC_REM_ABORT_ERR;
+                       break;
+               case 0x0B:
+               case 0x2E:
+                       *wc_status = IB_WC_INV_EECN_ERR;
+                       break;
+               case 0x0C:
+               case 0x2F:
+                       *wc_status = IB_WC_INV_EEC_STATE_ERR;
+                       break;
+               case 0x0D:
+                       *wc_status = IB_WC_BAD_RESP_ERR;
+                       break;
+               case 0x10:
+                       /* WQE purged */
+                       *wc_status = IB_WC_WR_FLUSH_ERR;
+                       break;
+               default:
+                       *wc_status = IB_WC_FATAL_ERR;
+
+               }
+       } else
+               *wc_status = IB_WC_SUCCESS;
+}
+
+static inline int post_one_send(struct ehca_qp *my_qp,
+                        struct ib_send_wr *cur_send_wr,
+                        int hidden)
+{
+       struct ehca_wqe *wqe_p;
+       int ret;
+       u32 sq_map_idx;
+       u64 start_offset = my_qp->ipz_squeue.current_q_offset;
+
+       /* get pointer next to free WQE */
+       wqe_p = ipz_qeit_get_inc(&my_qp->ipz_squeue);
+       if (unlikely(!wqe_p)) {
+               /* too many posted work requests: queue overflow */
+               ehca_err(my_qp->ib_qp.device, "Too many posted WQEs "
+                        "qp_num=%x", my_qp->ib_qp.qp_num);
+               return -ENOMEM;
+       }
+
+       /*
+        * Get the index of the WQE in the send queue. The same index is used
+        * for writing into the sq_map.
+        */
+       sq_map_idx = start_offset / my_qp->ipz_squeue.qe_size;
+
+       /* write a SEND WQE into the QUEUE */
+       ret = ehca_write_swqe(my_qp, wqe_p, cur_send_wr, sq_map_idx, hidden);
+       /*
+        * if something failed,
+        * reset the free entry pointer to the start value
+        */
+       if (unlikely(ret)) {
+               my_qp->ipz_squeue.current_q_offset = start_offset;
+               ehca_err(my_qp->ib_qp.device, "Could not write WQE "
+                        "qp_num=%x", my_qp->ib_qp.qp_num);
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+int ehca_post_send(struct ib_qp *qp,
+                  struct ib_send_wr *send_wr,
+                  struct ib_send_wr **bad_send_wr)
+{
+       struct ehca_qp *my_qp = container_of(qp, struct ehca_qp, ib_qp);
+       int wqe_cnt = 0;
+       int ret = 0;
+       unsigned long flags;
+
+       /* Reject WR if QP is in RESET, INIT or RTR state */
+       if (unlikely(my_qp->state < IB_QPS_RTS)) {
+               ehca_err(qp->device, "Invalid QP state  qp_state=%d qpn=%x",
+                        my_qp->state, qp->qp_num);
+               ret = -EINVAL;
+               goto out;
+       }
+
+       /* LOCK the QUEUE */
+       spin_lock_irqsave(&my_qp->spinlock_s, flags);
+
+       /* Send an empty extra RDMA read if:
+        *  1) there has been an RDMA read on this connection before
+        *  2) no RDMA read occurred for ACK_CIRC_THRESHOLD link packets
+        *  3) we can be sure that any previous extra RDMA read has been
+        *     processed so we don't overflow the SQ
+        */
+       if (unlikely(my_qp->unsol_ack_circ &&
+                    my_qp->packet_count > ACK_CIRC_THRESHOLD &&
+                    my_qp->message_count > my_qp->init_attr.cap.max_send_wr)) {
+               /* insert an empty RDMA READ to fix up the remote QP state */
+               struct ib_send_wr circ_wr;
+               memset(&circ_wr, 0, sizeof(circ_wr));
+               circ_wr.opcode = IB_WR_RDMA_READ;
+               post_one_send(my_qp, &circ_wr, 1); /* ignore retcode */
+               wqe_cnt++;
+               ehca_dbg(qp->device, "posted circ wr  qp_num=%x", qp->qp_num);
+               my_qp->message_count = my_qp->packet_count = 0;
+       }
+
+       /* loop processes list of send reqs */
+       while (send_wr) {
+               ret = post_one_send(my_qp, send_wr, 0);
+               if (unlikely(ret)) {
+                       goto post_send_exit0;
+               }
+               wqe_cnt++;
+               send_wr = send_wr->next;
+       }
+
+post_send_exit0:
+       iosync(); /* serialize GAL register access */
+       hipz_update_sqa(my_qp, wqe_cnt);
+       if (unlikely(ret || ehca_debug_level >= 2))
+               ehca_dbg(qp->device, "ehca_qp=%p qp_num=%x wqe_cnt=%d ret=%i",
+                        my_qp, qp->qp_num, wqe_cnt, ret);
+       my_qp->message_count += wqe_cnt;
+       spin_unlock_irqrestore(&my_qp->spinlock_s, flags);
+
+out:
+       if (ret)
+               *bad_send_wr = send_wr;
+       return ret;
+}
+
+static int internal_post_recv(struct ehca_qp *my_qp,
+                             struct ib_device *dev,
+                             struct ib_recv_wr *recv_wr,
+                             struct ib_recv_wr **bad_recv_wr)
+{
+       struct ehca_wqe *wqe_p;
+       int wqe_cnt = 0;
+       int ret = 0;
+       u32 rq_map_idx;
+       unsigned long flags;
+       struct ehca_qmap_entry *qmap_entry;
+
+       if (unlikely(!HAS_RQ(my_qp))) {
+               ehca_err(dev, "QP has no RQ  ehca_qp=%p qp_num=%x ext_type=%d",
+                        my_qp, my_qp->real_qp_num, my_qp->ext_type);
+               ret = -ENODEV;
+               goto out;
+       }
+
+       /* LOCK the QUEUE */
+       spin_lock_irqsave(&my_qp->spinlock_r, flags);
+
+       /* loop processes list of recv reqs */
+       while (recv_wr) {
+               u64 start_offset = my_qp->ipz_rqueue.current_q_offset;
+               /* get pointer next to free WQE */
+               wqe_p = ipz_qeit_get_inc(&my_qp->ipz_rqueue);
+               if (unlikely(!wqe_p)) {
+                       /* too many posted work requests: queue overflow */
+                       ret = -ENOMEM;
+                       ehca_err(dev, "Too many posted WQEs "
+                               "qp_num=%x", my_qp->real_qp_num);
+                       goto post_recv_exit0;
+               }
+               /*
+                * Get the index of the WQE in the recv queue. The same index
+                * is used for writing into the rq_map.
+                */
+               rq_map_idx = start_offset / my_qp->ipz_rqueue.qe_size;
+
+               /* write a RECV WQE into the QUEUE */
+               ret = ehca_write_rwqe(&my_qp->ipz_rqueue, wqe_p, recv_wr,
+                               rq_map_idx);
+               /*
+                * if something failed,
+                * reset the free entry pointer to the start value
+                */
+               if (unlikely(ret)) {
+                       my_qp->ipz_rqueue.current_q_offset = start_offset;
+                       ret = -EINVAL;
+                       ehca_err(dev, "Could not write WQE "
+                               "qp_num=%x", my_qp->real_qp_num);
+                       goto post_recv_exit0;
+               }
+
+               qmap_entry = &my_qp->rq_map.map[rq_map_idx];
+               qmap_entry->app_wr_id = get_app_wr_id(recv_wr->wr_id);
+               qmap_entry->reported = 0;
+               qmap_entry->cqe_req = 1;
+
+               wqe_cnt++;
+               recv_wr = recv_wr->next;
+       } /* eof for recv_wr */
+
+post_recv_exit0:
+       iosync(); /* serialize GAL register access */
+       hipz_update_rqa(my_qp, wqe_cnt);
+       if (unlikely(ret || ehca_debug_level >= 2))
+           ehca_dbg(dev, "ehca_qp=%p qp_num=%x wqe_cnt=%d ret=%i",
+                    my_qp, my_qp->real_qp_num, wqe_cnt, ret);
+       spin_unlock_irqrestore(&my_qp->spinlock_r, flags);
+
+out:
+       if (ret)
+               *bad_recv_wr = recv_wr;
+
+       return ret;
+}
+
+int ehca_post_recv(struct ib_qp *qp,
+                  struct ib_recv_wr *recv_wr,
+                  struct ib_recv_wr **bad_recv_wr)
+{
+       struct ehca_qp *my_qp = container_of(qp, struct ehca_qp, ib_qp);
+
+       /* Reject WR if QP is in RESET state */
+       if (unlikely(my_qp->state == IB_QPS_RESET)) {
+               ehca_err(qp->device, "Invalid QP state  qp_state=%d qpn=%x",
+                        my_qp->state, qp->qp_num);
+               *bad_recv_wr = recv_wr;
+               return -EINVAL;
+       }
+
+       return internal_post_recv(my_qp, qp->device, recv_wr, bad_recv_wr);
+}
+
+int ehca_post_srq_recv(struct ib_srq *srq,
+                      struct ib_recv_wr *recv_wr,
+                      struct ib_recv_wr **bad_recv_wr)
+{
+       return internal_post_recv(container_of(srq, struct ehca_qp, ib_srq),
+                                 srq->device, recv_wr, bad_recv_wr);
+}
+
+/*
+ * ib_wc_opcode table converts ehca wc opcode to ib
+ * Since we use zero to indicate invalid opcode, the actual ib opcode must
+ * be decremented!!!
+ */
+static const u8 ib_wc_opcode[255] = {
+       [0x01] = IB_WC_RECV+1,
+       [0x02] = IB_WC_RECV_RDMA_WITH_IMM+1,
+       [0x04] = IB_WC_BIND_MW+1,
+       [0x08] = IB_WC_FETCH_ADD+1,
+       [0x10] = IB_WC_COMP_SWAP+1,
+       [0x20] = IB_WC_RDMA_WRITE+1,
+       [0x40] = IB_WC_RDMA_READ+1,
+       [0x80] = IB_WC_SEND+1
+};
+
+/* internal function to poll one entry of cq */
+static inline int ehca_poll_cq_one(struct ib_cq *cq, struct ib_wc *wc)
+{
+       int ret = 0, qmap_tail_idx;
+       struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq);
+       struct ehca_cqe *cqe;
+       struct ehca_qp *my_qp;
+       struct ehca_qmap_entry *qmap_entry;
+       struct ehca_queue_map *qmap;
+       int cqe_count = 0, is_error;
+
+repoll:
+       cqe = (struct ehca_cqe *)
+               ipz_qeit_get_inc_valid(&my_cq->ipz_queue);
+       if (!cqe) {
+               ret = -EAGAIN;
+               if (ehca_debug_level >= 3)
+                       ehca_dbg(cq->device, "Completion queue is empty  "
+                                "my_cq=%p cq_num=%x", my_cq, my_cq->cq_number);
+               goto poll_cq_one_exit0;
+       }
+
+       /* prevents loads being reordered across this point */
+       rmb();
+
+       cqe_count++;
+       if (unlikely(cqe->status & WC_STATUS_PURGE_BIT)) {
+               struct ehca_qp *qp;
+               int purgeflag;
+               unsigned long flags;
+
+               qp = ehca_cq_get_qp(my_cq, cqe->local_qp_number);
+               if (!qp) {
+                       ehca_err(cq->device, "cq_num=%x qp_num=%x "
+                                "could not find qp -> ignore cqe",
+                                my_cq->cq_number, cqe->local_qp_number);
+                       ehca_dmp(cqe, 64, "cq_num=%x qp_num=%x",
+                                my_cq->cq_number, cqe->local_qp_number);
+                       /* ignore this purged cqe */
+                       goto repoll;
+               }
+               spin_lock_irqsave(&qp->spinlock_s, flags);
+               purgeflag = qp->sqerr_purgeflag;
+               spin_unlock_irqrestore(&qp->spinlock_s, flags);
+
+               if (purgeflag) {
+                       ehca_dbg(cq->device,
+                                "Got CQE with purged bit qp_num=%x src_qp=%x",
+                                cqe->local_qp_number, cqe->remote_qp_number);
+                       if (ehca_debug_level >= 2)
+                               ehca_dmp(cqe, 64, "qp_num=%x src_qp=%x",
+                                        cqe->local_qp_number,
+                                        cqe->remote_qp_number);
+                       /*
+                        * ignore this CQE to avoid duplicate CQEs for the bad
+                        * WQE that caused the SQ error, and turn off the purge flag
+                        */
+                       qp->sqerr_purgeflag = 0;
+                       goto repoll;
+               }
+       }
+
+       is_error = cqe->status & WC_STATUS_ERROR_BIT;
+
+       /* trace error CQEs if debug_level >= 1, trace all CQEs if >= 3 */
+       if (unlikely(ehca_debug_level >= 3 || (ehca_debug_level && is_error))) {
+               ehca_dbg(cq->device,
+                        "Received %sCOMPLETION ehca_cq=%p cq_num=%x -----",
+                        is_error ? "ERROR " : "", my_cq, my_cq->cq_number);
+               ehca_dmp(cqe, 64, "ehca_cq=%p cq_num=%x",
+                        my_cq, my_cq->cq_number);
+               ehca_dbg(cq->device,
+                        "ehca_cq=%p cq_num=%x -------------------------",
+                        my_cq, my_cq->cq_number);
+       }
+
+       read_lock(&ehca_qp_idr_lock);
+       my_qp = idr_find(&ehca_qp_idr, cqe->qp_token);
+       read_unlock(&ehca_qp_idr_lock);
+       if (!my_qp)
+               goto repoll;
+       wc->qp = &my_qp->ib_qp;
+
+       qmap_tail_idx = get_app_wr_id(cqe->work_request_id);
+       if (!(cqe->w_completion_flags & WC_SEND_RECEIVE_BIT))
+               /* We got a send completion. */
+               qmap = &my_qp->sq_map;
+       else
+               /* We got a receive completion. */
+               qmap = &my_qp->rq_map;
+
+       /* advance the tail pointer */
+       qmap->tail = qmap_tail_idx;
+
+       if (is_error) {
+               /*
+                * set left_to_poll to 0 because in error state, we will not
+                * get any additional CQEs
+                */
+               my_qp->sq_map.next_wqe_idx = next_index(my_qp->sq_map.tail,
+                                                       my_qp->sq_map.entries);
+               my_qp->sq_map.left_to_poll = 0;
+               ehca_add_to_err_list(my_qp, 1);
+
+               my_qp->rq_map.next_wqe_idx = next_index(my_qp->rq_map.tail,
+                                                       my_qp->rq_map.entries);
+               my_qp->rq_map.left_to_poll = 0;
+               if (HAS_RQ(my_qp))
+                       ehca_add_to_err_list(my_qp, 0);
+       }
+
+       qmap_entry = &qmap->map[qmap_tail_idx];
+       if (qmap_entry->reported) {
+               ehca_warn(cq->device, "Double cqe on qp_num=%#x",
+                               my_qp->real_qp_num);
+               /* found a double cqe, discard it and read next one */
+               goto repoll;
+       }
+
+       wc->wr_id = replace_wr_id(cqe->work_request_id, qmap_entry->app_wr_id);
+       qmap_entry->reported = 1;
+
+       /* if left_to_poll is decremented to 0, add the QP to the error list */
+       if (qmap->left_to_poll > 0) {
+               qmap->left_to_poll--;
+               if ((my_qp->sq_map.left_to_poll == 0) &&
+                               (my_qp->rq_map.left_to_poll == 0)) {
+                       ehca_add_to_err_list(my_qp, 1);
+                       if (HAS_RQ(my_qp))
+                               ehca_add_to_err_list(my_qp, 0);
+               }
+       }
+
+       /* eval ib_wc_opcode */
+       wc->opcode = ib_wc_opcode[cqe->optype]-1;
+       if (unlikely(wc->opcode == -1)) {
+               ehca_err(cq->device, "Invalid cqe->OPType=%x cqe->status=%x "
+                        "ehca_cq=%p cq_num=%x",
+                        cqe->optype, cqe->status, my_cq, my_cq->cq_number);
+               /* dump cqe for other infos */
+               ehca_dmp(cqe, 64, "ehca_cq=%p cq_num=%x",
+                        my_cq, my_cq->cq_number);
+               /* also update the queue pointer to throw away this entry */
+               goto repoll;
+       }
+
+       /* eval ib_wc_status */
+       if (unlikely(is_error)) {
+               /* complete with errors */
+               map_ib_wc_status(cqe->status, &wc->status);
+               wc->vendor_err = wc->status;
+       } else
+               wc->status = IB_WC_SUCCESS;
+
+       wc->byte_len = cqe->nr_bytes_transferred;
+       wc->pkey_index = cqe->pkey_index;
+       wc->slid = cqe->rlid;
+       wc->dlid_path_bits = cqe->dlid;
+       wc->src_qp = cqe->remote_qp_number;
+       /*
+        * HW has "Immed data present" and "GRH present" in bits 6 and 5.
+        * SW defines those in bits 1 and 0, so we can just shift and mask.
+        */
+       wc->wc_flags = (cqe->w_completion_flags >> 5) & 3;
+       wc->ex.imm_data = cpu_to_be32(cqe->immediate_data);
+       wc->sl = cqe->service_level;
+
+poll_cq_one_exit0:
+       if (cqe_count > 0)
+               hipz_update_feca(my_cq, cqe_count);
+
+       return ret;
+}
+
+static int generate_flush_cqes(struct ehca_qp *my_qp, struct ib_cq *cq,
+                              struct ib_wc *wc, int num_entries,
+                              struct ipz_queue *ipz_queue, int on_sq)
+{
+       int nr = 0;
+       struct ehca_wqe *wqe;
+       u64 offset;
+       struct ehca_queue_map *qmap;
+       struct ehca_qmap_entry *qmap_entry;
+
+       if (on_sq)
+               qmap = &my_qp->sq_map;
+       else
+               qmap = &my_qp->rq_map;
+
+       qmap_entry = &qmap->map[qmap->next_wqe_idx];
+
+       while ((nr < num_entries) && (qmap_entry->reported == 0)) {
+               /* generate flush CQE */
+
+               memset(wc, 0, sizeof(*wc));
+
+               offset = qmap->next_wqe_idx * ipz_queue->qe_size;
+               wqe = (struct ehca_wqe *)ipz_qeit_calc(ipz_queue, offset);
+               if (!wqe) {
+                       ehca_err(cq->device, "Invalid wqe offset=%#llx on "
+                                "qp_num=%#x", offset, my_qp->real_qp_num);
+                       return nr;
+               }
+
+               wc->wr_id = replace_wr_id(wqe->work_request_id,
+                                         qmap_entry->app_wr_id);
+
+               if (on_sq) {
+                       switch (wqe->optype) {
+                       case WQE_OPTYPE_SEND:
+                               wc->opcode = IB_WC_SEND;
+                               break;
+                       case WQE_OPTYPE_RDMAWRITE:
+                               wc->opcode = IB_WC_RDMA_WRITE;
+                               break;
+                       case WQE_OPTYPE_RDMAREAD:
+                               wc->opcode = IB_WC_RDMA_READ;
+                               break;
+                       default:
+                               ehca_err(cq->device, "Invalid optype=%x",
+                                               wqe->optype);
+                               return nr;
+                       }
+               } else
+                       wc->opcode = IB_WC_RECV;
+
+               if (wqe->wr_flag & WQE_WRFLAG_IMM_DATA_PRESENT) {
+                       wc->ex.imm_data = wqe->immediate_data;
+                       wc->wc_flags |= IB_WC_WITH_IMM;
+               }
+
+               wc->status = IB_WC_WR_FLUSH_ERR;
+
+               wc->qp = &my_qp->ib_qp;
+
+               /* mark as reported and advance next_wqe pointer */
+               qmap_entry->reported = 1;
+               qmap->next_wqe_idx = next_index(qmap->next_wqe_idx,
+                                               qmap->entries);
+               qmap_entry = &qmap->map[qmap->next_wqe_idx];
+
+               wc++; nr++;
+       }
+
+       return nr;
+
+}
+
+int ehca_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc)
+{
+       struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq);
+       int nr;
+       struct ehca_qp *err_qp;
+       struct ib_wc *current_wc = wc;
+       int ret = 0;
+       unsigned long flags;
+       int entries_left = num_entries;
+
+       if (num_entries < 1) {
+               ehca_err(cq->device, "Invalid num_entries=%d ehca_cq=%p "
+                        "cq_num=%x", num_entries, my_cq, my_cq->cq_number);
+               ret = -EINVAL;
+               goto poll_cq_exit0;
+       }
+
+       spin_lock_irqsave(&my_cq->spinlock, flags);
+
+       /* generate flush cqes for send queues */
+       list_for_each_entry(err_qp, &my_cq->sqp_err_list, sq_err_node) {
+               nr = generate_flush_cqes(err_qp, cq, current_wc, entries_left,
+                               &err_qp->ipz_squeue, 1);
+               entries_left -= nr;
+               current_wc += nr;
+
+               if (entries_left == 0)
+                       break;
+       }
+
+       /* generate flush cqes for receive queues */
+       list_for_each_entry(err_qp, &my_cq->rqp_err_list, rq_err_node) {
+               nr = generate_flush_cqes(err_qp, cq, current_wc, entries_left,
+                               &err_qp->ipz_rqueue, 0);
+               entries_left -= nr;
+               current_wc += nr;
+
+               if (entries_left == 0)
+                       break;
+       }
+
+       for (nr = 0; nr < entries_left; nr++) {
+               ret = ehca_poll_cq_one(cq, current_wc);
+               if (ret)
+                       break;
+               current_wc++;
+       } /* eof for nr */
+       entries_left -= nr;
+
+       spin_unlock_irqrestore(&my_cq->spinlock, flags);
+       if (ret == -EAGAIN  || !ret)
+               ret = num_entries - entries_left;
+
+poll_cq_exit0:
+       return ret;
+}
+
+int ehca_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags notify_flags)
+{
+       struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq);
+       int ret = 0;
+
+       switch (notify_flags & IB_CQ_SOLICITED_MASK) {
+       case IB_CQ_SOLICITED:
+               hipz_set_cqx_n0(my_cq, 1);
+               break;
+       case IB_CQ_NEXT_COMP:
+               hipz_set_cqx_n1(my_cq, 1);
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       if (notify_flags & IB_CQ_REPORT_MISSED_EVENTS) {
+               unsigned long spl_flags;
+               spin_lock_irqsave(&my_cq->spinlock, spl_flags);
+               ret = ipz_qeit_is_valid(&my_cq->ipz_queue);
+               spin_unlock_irqrestore(&my_cq->spinlock, spl_flags);
+       }
+
+       return ret;
+}
diff --git a/drivers/staging/rdma/ehca/ehca_sqp.c b/drivers/staging/rdma/ehca/ehca_sqp.c
new file mode 100644 (file)
index 0000000..376b031
--- /dev/null
+++ b/drivers/staging/rdma/ehca/ehca_sqp.c
@@ -0,0 +1,245 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  SQP functions
+ *
+ *  Authors: Khadija Souissi <souissi@de.ibm.com>
+ *           Heiko J Schick <schickhj@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <rdma/ib_mad.h>
+
+#include "ehca_classes.h"
+#include "ehca_tools.h"
+#include "ehca_iverbs.h"
+#include "hcp_if.h"
+
+#define IB_MAD_STATUS_REDIRECT         cpu_to_be16(0x0002)
+#define IB_MAD_STATUS_UNSUP_VERSION    cpu_to_be16(0x0004)
+#define IB_MAD_STATUS_UNSUP_METHOD     cpu_to_be16(0x0008)
+
+#define IB_PMA_CLASS_PORT_INFO         cpu_to_be16(0x0001)
+
+/**
+ * ehca_define_sqp - Defines special queue pair 1 (GSI QP). Once the special
+ * queue pair is created successfully, the corresponding port becomes active.
+ *
+ * Defining special queue pair 0 (SMI QP) is not yet supported.
+ *
+ * @qp_init_attr: Queue pair init attributes with port and queue pair type
+ */
+
+u64 ehca_define_sqp(struct ehca_shca *shca,
+                   struct ehca_qp *ehca_qp,
+                   struct ib_qp_init_attr *qp_init_attr)
+{
+       u32 pma_qp_nr, bma_qp_nr;
+       u64 ret;
+       u8 port = qp_init_attr->port_num;
+       int counter;
+
+       shca->sport[port - 1].port_state = IB_PORT_DOWN;
+
+       switch (qp_init_attr->qp_type) {
+       case IB_QPT_SMI:
+               /* function not supported yet */
+               break;
+       case IB_QPT_GSI:
+               ret = hipz_h_define_aqp1(shca->ipz_hca_handle,
+                                        ehca_qp->ipz_qp_handle,
+                                        ehca_qp->galpas.kernel,
+                                        (u32) qp_init_attr->port_num,
+                                        &pma_qp_nr, &bma_qp_nr);
+
+               if (ret != H_SUCCESS) {
+                       ehca_err(&shca->ib_device,
+                                "Can't define AQP1 for port %x. h_ret=%lli",
+                                port, ret);
+                       return ret;
+               }
+               shca->sport[port - 1].pma_qp_nr = pma_qp_nr;
+               ehca_dbg(&shca->ib_device, "port=%x pma_qp_nr=%x",
+                        port, pma_qp_nr);
+               break;
+       default:
+               ehca_err(&shca->ib_device, "invalid qp_type=%x",
+                        qp_init_attr->qp_type);
+               return H_PARAMETER;
+       }
+
+       if (ehca_nr_ports < 0) /* autodetect mode */
+               return H_SUCCESS;
+
+       for (counter = 0;
+            shca->sport[port - 1].port_state != IB_PORT_ACTIVE &&
+                    counter < ehca_port_act_time;
+            counter++) {
+               ehca_dbg(&shca->ib_device, "... wait until port %x is active",
+                        port);
+               msleep_interruptible(1000);
+       }
+
+       if (counter == ehca_port_act_time) {
+               ehca_err(&shca->ib_device, "Port %x is not active.", port);
+               return H_HARDWARE;
+       }
+
+       return H_SUCCESS;
+}
+
+struct ib_perf {
+       struct ib_mad_hdr mad_hdr;
+       u8 reserved[40];
+       u8 data[192];
+} __attribute__ ((packed));
+
+/* TC/SL/FL packed into 32 bits, as in ClassPortInfo */
+struct tcslfl {
+       u32 tc:8;
+       u32 sl:4;
+       u32 fl:20;
+} __attribute__ ((packed));
+
+/* IP Version/TC/FL packed into 32 bits, as in GRH */
+struct vertcfl {
+       u32 ver:4;
+       u32 tc:8;
+       u32 fl:20;
+} __attribute__ ((packed));
+
+static int ehca_process_perf(struct ib_device *ibdev, u8 port_num,
+                            const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+                            const struct ib_mad *in_mad, struct ib_mad *out_mad)
+{
+       const struct ib_perf *in_perf = (const struct ib_perf *)in_mad;
+       struct ib_perf *out_perf = (struct ib_perf *)out_mad;
+       struct ib_class_port_info *poi =
+               (struct ib_class_port_info *)out_perf->data;
+       struct tcslfl *tcslfl =
+               (struct tcslfl *)&poi->redirect_tcslfl;
+       struct ehca_shca *shca =
+               container_of(ibdev, struct ehca_shca, ib_device);
+       struct ehca_sport *sport = &shca->sport[port_num - 1];
+
+       ehca_dbg(ibdev, "method=%x", in_perf->mad_hdr.method);
+
+       *out_mad = *in_mad;
+
+       if (in_perf->mad_hdr.class_version != 1) {
+               ehca_warn(ibdev, "Unsupported class_version=%x",
+                         in_perf->mad_hdr.class_version);
+               out_perf->mad_hdr.status = IB_MAD_STATUS_UNSUP_VERSION;
+               goto perf_reply;
+       }
+
+       switch (in_perf->mad_hdr.method) {
+       case IB_MGMT_METHOD_GET:
+       case IB_MGMT_METHOD_SET:
+               /* set class port info for redirection */
+               out_perf->mad_hdr.attr_id = IB_PMA_CLASS_PORT_INFO;
+               out_perf->mad_hdr.status = IB_MAD_STATUS_REDIRECT;
+               memset(poi, 0, sizeof(*poi));
+               poi->base_version = 1;
+               poi->class_version = 1;
+               poi->resp_time_value = 18;
+
+               /* copy local routing information from WC where applicable */
+               tcslfl->sl         = in_wc->sl;
+               poi->redirect_lid  =
+                       sport->saved_attr.lid | in_wc->dlid_path_bits;
+               poi->redirect_qp   = sport->pma_qp_nr;
+               poi->redirect_qkey = IB_QP1_QKEY;
+
+               ehca_query_pkey(ibdev, port_num, in_wc->pkey_index,
+                               &poi->redirect_pkey);
+
+               /* if request was globally routed, copy route info */
+               if (in_grh) {
+                       const struct vertcfl *vertcfl =
+                               (const struct vertcfl *)&in_grh->version_tclass_flow;
+                       memcpy(poi->redirect_gid, in_grh->dgid.raw,
+                              sizeof(poi->redirect_gid));
+                       tcslfl->tc        = vertcfl->tc;
+                       tcslfl->fl        = vertcfl->fl;
+               } else
+                       /* else only fill in default GID */
+                       ehca_query_gid(ibdev, port_num, 0,
+                                      (union ib_gid *)&poi->redirect_gid);
+
+               ehca_dbg(ibdev, "ehca_pma_lid=%x ehca_pma_qp=%x",
+                        sport->saved_attr.lid, sport->pma_qp_nr);
+               break;
+
+       case IB_MGMT_METHOD_GET_RESP:
+               return IB_MAD_RESULT_FAILURE;
+
+       default:
+               out_perf->mad_hdr.status = IB_MAD_STATUS_UNSUP_METHOD;
+               break;
+       }
+
+perf_reply:
+       out_perf->mad_hdr.method = IB_MGMT_METHOD_GET_RESP;
+
+       return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+}
+
+int ehca_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+                    const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+                    const struct ib_mad_hdr *in, size_t in_mad_size,
+                    struct ib_mad_hdr *out, size_t *out_mad_size,
+                    u16 *out_mad_pkey_index)
+{
+       int ret;
+       const struct ib_mad *in_mad = (const struct ib_mad *)in;
+       struct ib_mad *out_mad = (struct ib_mad *)out;
+
+       if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) ||
+                        *out_mad_size != sizeof(*out_mad)))
+               return IB_MAD_RESULT_FAILURE;
+
+       if (!port_num || port_num > ibdev->phys_port_cnt || !in_wc)
+               return IB_MAD_RESULT_FAILURE;
+
+       /* accept only pma request */
+       if (in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_PERF_MGMT)
+               return IB_MAD_RESULT_SUCCESS;
+
+       ehca_dbg(ibdev, "port_num=%x src_qp=%x", port_num, in_wc->src_qp);
+       ret = ehca_process_perf(ibdev, port_num, in_wc, in_grh,
+                               in_mad, out_mad);
+
+       return ret;
+}
diff --git a/drivers/staging/rdma/ehca/ehca_tools.h b/drivers/staging/rdma/ehca/ehca_tools.h
new file mode 100644 (file)
index 0000000..d280b12
--- /dev/null
+++ b/drivers/staging/rdma/ehca/ehca_tools.h
@@ -0,0 +1,155 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  auxiliary functions
+ *
+ *  Authors: Christoph Raisch <raisch@de.ibm.com>
+ *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *           Khadija Souissi <souissik@de.ibm.com>
+ *           Waleri Fomin <fomin@de.ibm.com>
+ *           Heiko J Schick <schickhj@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef EHCA_TOOLS_H
+#define EHCA_TOOLS_H
+
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+#include <linux/idr.h>
+#include <linux/kthread.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/vmalloc.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/device.h>
+
+#include <linux/atomic.h>
+#include <asm/ibmebus.h>
+#include <asm/io.h>
+#include <asm/pgtable.h>
+#include <asm/hvcall.h>
+
+extern int ehca_debug_level;
+
+#define ehca_dbg(ib_dev, format, arg...) \
+       do { \
+               if (unlikely(ehca_debug_level)) \
+                       dev_printk(KERN_DEBUG, (ib_dev)->dma_device, \
+                                  "PU%04x EHCA_DBG:%s " format "\n", \
+                                  raw_smp_processor_id(), __func__, \
+                                  ## arg); \
+       } while (0)
+
+#define ehca_info(ib_dev, format, arg...) \
+       dev_info((ib_dev)->dma_device, "PU%04x EHCA_INFO:%s " format "\n", \
+                raw_smp_processor_id(), __func__, ## arg)
+
+#define ehca_warn(ib_dev, format, arg...) \
+       dev_warn((ib_dev)->dma_device, "PU%04x EHCA_WARN:%s " format "\n", \
+                raw_smp_processor_id(), __func__, ## arg)
+
+#define ehca_err(ib_dev, format, arg...) \
+       dev_err((ib_dev)->dma_device, "PU%04x EHCA_ERR:%s " format "\n", \
+               raw_smp_processor_id(), __func__, ## arg)
+
+/* use this one only if no ib_dev available */
+#define ehca_gen_dbg(format, arg...) \
+       do { \
+               if (unlikely(ehca_debug_level)) \
+                       printk(KERN_DEBUG "PU%04x EHCA_DBG:%s " format "\n", \
+                              raw_smp_processor_id(), __func__, ## arg); \
+       } while (0)
+
+#define ehca_gen_warn(format, arg...) \
+       printk(KERN_INFO "PU%04x EHCA_WARN:%s " format "\n", \
+              raw_smp_processor_id(), __func__, ## arg)
+
+#define ehca_gen_err(format, arg...) \
+       printk(KERN_ERR "PU%04x EHCA_ERR:%s " format "\n", \
+              raw_smp_processor_id(), __func__, ## arg)
+
+/**
+ * ehca_dmp - printk a memory block, whose length is n*8 bytes.
+ * Each line has the following layout:
+ * <format string> adr=X ofs=Y <8 bytes hex> <8 bytes hex>
+ */
+#define ehca_dmp(adr, len, format, args...) \
+       do { \
+               unsigned int x; \
+               unsigned int l = (unsigned int)(len); \
+               unsigned char *deb = (unsigned char *)(adr); \
+               for (x = 0; x < l; x += 16) { \
+                       printk(KERN_INFO "EHCA_DMP:%s " format \
+                              " adr=%p ofs=%04x %016llx %016llx\n", \
+                              __func__, ##args, deb, x, \
+                              *((u64 *)&deb[0]), *((u64 *)&deb[8])); \
+                       deb += 16; \
+               } \
+       } while (0)
+
+/* define a bitmask, little endian version */
+#define EHCA_BMASK(pos, length) (((pos) << 16) + (length))
+
+/* define a bitmask, the ibm way... */
+#define EHCA_BMASK_IBM(from, to) (((63 - to) << 16) + ((to) - (from) + 1))
+
+/* internal function, don't use */
+#define EHCA_BMASK_SHIFTPOS(mask) (((mask) >> 16) & 0xffff)
+
+/* internal function, don't use */
+#define EHCA_BMASK_MASK(mask) (~0ULL >> ((64 - (mask)) & 0xffff))
+
+/**
+ * EHCA_BMASK_SET - return value shifted and masked by mask
+ * variable|=EHCA_BMASK_SET(MY_MASK,0x4711) ORs the bits in variable
+ * variable&=~EHCA_BMASK_SET(MY_MASK,-1) clears the bits from the mask
+ * in variable
+ */
+#define EHCA_BMASK_SET(mask, value) \
+       ((EHCA_BMASK_MASK(mask) & ((u64)(value))) << EHCA_BMASK_SHIFTPOS(mask))
+
+/**
+ * EHCA_BMASK_GET - extract a parameter from value by mask
+ */
+#define EHCA_BMASK_GET(mask, value) \
+       (EHCA_BMASK_MASK(mask) & (((u64)(value)) >> EHCA_BMASK_SHIFTPOS(mask)))
+
+/* Converts ehca to ib return code */
+int ehca2ib_return_code(u64 ehca_rc);
+
+#endif /* EHCA_TOOLS_H */
diff --git a/drivers/staging/rdma/ehca/ehca_uverbs.c b/drivers/staging/rdma/ehca/ehca_uverbs.c
new file mode 100644 (file)
index 0000000..1a1d5d9
--- /dev/null
+++ b/drivers/staging/rdma/ehca/ehca_uverbs.c
@@ -0,0 +1,309 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  userspace support verbs
+ *
+ *  Authors: Christoph Raisch <raisch@de.ibm.com>
+ *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *           Heiko J Schick <schickhj@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/slab.h>
+
+#include "ehca_classes.h"
+#include "ehca_iverbs.h"
+#include "ehca_mrmw.h"
+#include "ehca_tools.h"
+#include "hcp_if.h"
+
+struct ib_ucontext *ehca_alloc_ucontext(struct ib_device *device,
+                                       struct ib_udata *udata)
+{
+       struct ehca_ucontext *my_context;
+
+       my_context = kzalloc(sizeof *my_context, GFP_KERNEL);
+       if (!my_context) {
+               ehca_err(device, "Out of memory device=%p", device);
+               return ERR_PTR(-ENOMEM);
+       }
+
+       return &my_context->ib_ucontext;
+}
+
+int ehca_dealloc_ucontext(struct ib_ucontext *context)
+{
+       kfree(container_of(context, struct ehca_ucontext, ib_ucontext));
+       return 0;
+}
+
+static void ehca_mm_open(struct vm_area_struct *vma)
+{
+       u32 *count = (u32 *)vma->vm_private_data;
+       if (!count) {
+               ehca_gen_err("Invalid vma struct vm_start=%lx vm_end=%lx",
+                            vma->vm_start, vma->vm_end);
+               return;
+       }
+       (*count)++;
+       if (!(*count))
+               ehca_gen_err("Use count overflow vm_start=%lx vm_end=%lx",
+                            vma->vm_start, vma->vm_end);
+       ehca_gen_dbg("vm_start=%lx vm_end=%lx count=%x",
+                    vma->vm_start, vma->vm_end, *count);
+}
+
+static void ehca_mm_close(struct vm_area_struct *vma)
+{
+       u32 *count = (u32 *)vma->vm_private_data;
+       if (!count) {
+               ehca_gen_err("Invalid vma struct vm_start=%lx vm_end=%lx",
+                            vma->vm_start, vma->vm_end);
+               return;
+       }
+       (*count)--;
+       ehca_gen_dbg("vm_start=%lx vm_end=%lx count=%x",
+                    vma->vm_start, vma->vm_end, *count);
+}
+
+static const struct vm_operations_struct vm_ops = {
+       .open = ehca_mm_open,
+       .close = ehca_mm_close,
+};
+
+static int ehca_mmap_fw(struct vm_area_struct *vma, struct h_galpas *galpas,
+                       u32 *mm_count)
+{
+       int ret;
+       u64 vsize, physical;
+
+       vsize = vma->vm_end - vma->vm_start;
+       if (vsize < EHCA_PAGESIZE) {
+               ehca_gen_err("invalid vsize=%lx", vma->vm_end - vma->vm_start);
+               return -EINVAL;
+       }
+
+       physical = galpas->user.fw_handle;
+       vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+       ehca_gen_dbg("vsize=%llx physical=%llx", vsize, physical);
+       /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */
+       ret = remap_4k_pfn(vma, vma->vm_start, physical >> EHCA_PAGESHIFT,
+                          vma->vm_page_prot);
+       if (unlikely(ret)) {
+               ehca_gen_err("remap_pfn_range() failed ret=%i", ret);
+               return -ENOMEM;
+       }
+
+       vma->vm_private_data = mm_count;
+       (*mm_count)++;
+       vma->vm_ops = &vm_ops;
+
+       return 0;
+}
+
+static int ehca_mmap_queue(struct vm_area_struct *vma, struct ipz_queue *queue,
+                          u32 *mm_count)
+{
+       int ret;
+       u64 start, ofs;
+       struct page *page;
+
+       vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
+       start = vma->vm_start;
+       for (ofs = 0; ofs < queue->queue_length; ofs += PAGE_SIZE) {
+               u64 virt_addr = (u64)ipz_qeit_calc(queue, ofs);
+               page = virt_to_page(virt_addr);
+               ret = vm_insert_page(vma, start, page);
+               if (unlikely(ret)) {
+                       ehca_gen_err("vm_insert_page() failed rc=%i", ret);
+                       return ret;
+               }
+               start += PAGE_SIZE;
+       }
+       vma->vm_private_data = mm_count;
+       (*mm_count)++;
+       vma->vm_ops = &vm_ops;
+
+       return 0;
+}
+
+static int ehca_mmap_cq(struct vm_area_struct *vma, struct ehca_cq *cq,
+                       u32 rsrc_type)
+{
+       int ret;
+
+       switch (rsrc_type) {
+       case 0: /* galpa fw handle */
+               ehca_dbg(cq->ib_cq.device, "cq_num=%x fw", cq->cq_number);
+               ret = ehca_mmap_fw(vma, &cq->galpas, &cq->mm_count_galpa);
+               if (unlikely(ret)) {
+                       ehca_err(cq->ib_cq.device,
+                                "ehca_mmap_fw() failed rc=%i cq_num=%x",
+                                ret, cq->cq_number);
+                       return ret;
+               }
+               break;
+
+       case 1: /* cq queue_addr */
+               ehca_dbg(cq->ib_cq.device, "cq_num=%x queue", cq->cq_number);
+               ret = ehca_mmap_queue(vma, &cq->ipz_queue, &cq->mm_count_queue);
+               if (unlikely(ret)) {
+                       ehca_err(cq->ib_cq.device,
+                                "ehca_mmap_queue() failed rc=%i cq_num=%x",
+                                ret, cq->cq_number);
+                       return ret;
+               }
+               break;
+
+       default:
+               ehca_err(cq->ib_cq.device, "bad resource type=%x cq_num=%x",
+                        rsrc_type, cq->cq_number);
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+static int ehca_mmap_qp(struct vm_area_struct *vma, struct ehca_qp *qp,
+                       u32 rsrc_type)
+{
+       int ret;
+
+       switch (rsrc_type) {
+       case 0: /* galpa fw handle */
+               ehca_dbg(qp->ib_qp.device, "qp_num=%x fw", qp->ib_qp.qp_num);
+               ret = ehca_mmap_fw(vma, &qp->galpas, &qp->mm_count_galpa);
+               if (unlikely(ret)) {
+                       ehca_err(qp->ib_qp.device,
+                                "remap_pfn_range() failed ret=%i qp_num=%x",
+                                ret, qp->ib_qp.qp_num);
+                       return -ENOMEM;
+               }
+               break;
+
+       case 1: /* qp rqueue_addr */
+               ehca_dbg(qp->ib_qp.device, "qp_num=%x rq", qp->ib_qp.qp_num);
+               ret = ehca_mmap_queue(vma, &qp->ipz_rqueue,
+                                     &qp->mm_count_rqueue);
+               if (unlikely(ret)) {
+                       ehca_err(qp->ib_qp.device,
+                                "ehca_mmap_queue(rq) failed rc=%i qp_num=%x",
+                                ret, qp->ib_qp.qp_num);
+                       return ret;
+               }
+               break;
+
+       case 2: /* qp squeue_addr */
+               ehca_dbg(qp->ib_qp.device, "qp_num=%x sq", qp->ib_qp.qp_num);
+               ret = ehca_mmap_queue(vma, &qp->ipz_squeue,
+                                     &qp->mm_count_squeue);
+               if (unlikely(ret)) {
+                       ehca_err(qp->ib_qp.device,
+                                "ehca_mmap_queue(sq) failed rc=%i qp_num=%x",
+                                ret, qp->ib_qp.qp_num);
+                       return ret;
+               }
+               break;
+
+       default:
+               ehca_err(qp->ib_qp.device, "bad resource type=%x qp_num=%x",
+                        rsrc_type, qp->ib_qp.qp_num);
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+int ehca_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
+{
+       u64 fileoffset = vma->vm_pgoff;
+       u32 idr_handle = fileoffset & 0x1FFFFFF;
+       u32 q_type = (fileoffset >> 27) & 0x1;    /* CQ, QP,...        */
+       u32 rsrc_type = (fileoffset >> 25) & 0x3; /* sq,rq,cmnd_window */
+       u32 ret;
+       struct ehca_cq *cq;
+       struct ehca_qp *qp;
+       struct ib_uobject *uobject;
+
+       switch (q_type) {
+       case  0: /* CQ */
+               read_lock(&ehca_cq_idr_lock);
+               cq = idr_find(&ehca_cq_idr, idr_handle);
+               read_unlock(&ehca_cq_idr_lock);
+
+               /* make sure this mmap really belongs to the authorized user */
+               if (!cq)
+                       return -EINVAL;
+
+               if (!cq->ib_cq.uobject || cq->ib_cq.uobject->context != context)
+                       return -EINVAL;
+
+               ret = ehca_mmap_cq(vma, cq, rsrc_type);
+               if (unlikely(ret)) {
+                       ehca_err(cq->ib_cq.device,
+                                "ehca_mmap_cq() failed rc=%i cq_num=%x",
+                                ret, cq->cq_number);
+                       return ret;
+               }
+               break;
+
+       case 1: /* QP */
+               read_lock(&ehca_qp_idr_lock);
+               qp = idr_find(&ehca_qp_idr, idr_handle);
+               read_unlock(&ehca_qp_idr_lock);
+
+               /* make sure this mmap really belongs to the authorized user */
+               if (!qp)
+                       return -EINVAL;
+
+               uobject = IS_SRQ(qp) ? qp->ib_srq.uobject : qp->ib_qp.uobject;
+               if (!uobject || uobject->context != context)
+                       return -EINVAL;
+
+               ret = ehca_mmap_qp(vma, qp, rsrc_type);
+               if (unlikely(ret)) {
+                       ehca_err(qp->ib_qp.device,
+                                "ehca_mmap_qp() failed rc=%i qp_num=%x",
+                                ret, qp->ib_qp.qp_num);
+                       return ret;
+               }
+               break;
+
+       default:
+               ehca_gen_err("bad queue type %x", q_type);
+               return -EINVAL;
+       }
+
+       return 0;
+}
diff --git a/drivers/staging/rdma/ehca/hcp_if.c b/drivers/staging/rdma/ehca/hcp_if.c
new file mode 100644 (file)
index 0000000..89517ff
--- /dev/null
+++ b/drivers/staging/rdma/ehca/hcp_if.c
@@ -0,0 +1,949 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  Firmware Infiniband Interface code for POWER
+ *
+ *  Authors: Christoph Raisch <raisch@de.ibm.com>
+ *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *           Joachim Fenkes <fenkes@de.ibm.com>
+ *           Gerd Bayer <gerd.bayer@de.ibm.com>
+ *           Waleri Fomin <fomin@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <asm/hvcall.h>
+#include "ehca_tools.h"
+#include "hcp_if.h"
+#include "hcp_phyp.h"
+#include "hipz_fns.h"
+#include "ipz_pt_fn.h"
+
+#define H_ALL_RES_QP_ENHANCED_OPS       EHCA_BMASK_IBM(9, 11)
+#define H_ALL_RES_QP_PTE_PIN            EHCA_BMASK_IBM(12, 12)
+#define H_ALL_RES_QP_SERVICE_TYPE       EHCA_BMASK_IBM(13, 15)
+#define H_ALL_RES_QP_STORAGE            EHCA_BMASK_IBM(16, 17)
+#define H_ALL_RES_QP_LL_RQ_CQE_POSTING  EHCA_BMASK_IBM(18, 18)
+#define H_ALL_RES_QP_LL_SQ_CQE_POSTING  EHCA_BMASK_IBM(19, 21)
+#define H_ALL_RES_QP_SIGNALING_TYPE     EHCA_BMASK_IBM(22, 23)
+#define H_ALL_RES_QP_UD_AV_LKEY_CTRL    EHCA_BMASK_IBM(31, 31)
+#define H_ALL_RES_QP_SMALL_SQ_PAGE_SIZE EHCA_BMASK_IBM(32, 35)
+#define H_ALL_RES_QP_SMALL_RQ_PAGE_SIZE EHCA_BMASK_IBM(36, 39)
+#define H_ALL_RES_QP_RESOURCE_TYPE      EHCA_BMASK_IBM(56, 63)
+
+#define H_ALL_RES_QP_MAX_OUTST_SEND_WR  EHCA_BMASK_IBM(0, 15)
+#define H_ALL_RES_QP_MAX_OUTST_RECV_WR  EHCA_BMASK_IBM(16, 31)
+#define H_ALL_RES_QP_MAX_SEND_SGE       EHCA_BMASK_IBM(32, 39)
+#define H_ALL_RES_QP_MAX_RECV_SGE       EHCA_BMASK_IBM(40, 47)
+
+#define H_ALL_RES_QP_UD_AV_LKEY         EHCA_BMASK_IBM(32, 63)
+#define H_ALL_RES_QP_SRQ_QP_TOKEN       EHCA_BMASK_IBM(0, 31)
+#define H_ALL_RES_QP_SRQ_QP_HANDLE      EHCA_BMASK_IBM(0, 64)
+#define H_ALL_RES_QP_SRQ_LIMIT          EHCA_BMASK_IBM(48, 63)
+#define H_ALL_RES_QP_SRQ_QPN            EHCA_BMASK_IBM(40, 63)
+
+#define H_ALL_RES_QP_ACT_OUTST_SEND_WR  EHCA_BMASK_IBM(16, 31)
+#define H_ALL_RES_QP_ACT_OUTST_RECV_WR  EHCA_BMASK_IBM(48, 63)
+#define H_ALL_RES_QP_ACT_SEND_SGE       EHCA_BMASK_IBM(8, 15)
+#define H_ALL_RES_QP_ACT_RECV_SGE       EHCA_BMASK_IBM(24, 31)
+
+#define H_ALL_RES_QP_SQUEUE_SIZE_PAGES  EHCA_BMASK_IBM(0, 31)
+#define H_ALL_RES_QP_RQUEUE_SIZE_PAGES  EHCA_BMASK_IBM(32, 63)
+
+#define H_MP_INIT_TYPE                  EHCA_BMASK_IBM(44, 47)
+#define H_MP_SHUTDOWN                   EHCA_BMASK_IBM(48, 48)
+#define H_MP_RESET_QKEY_CTR             EHCA_BMASK_IBM(49, 49)
+
+#define HCALL4_REGS_FORMAT "r4=%lx r5=%lx r6=%lx r7=%lx"
+#define HCALL7_REGS_FORMAT HCALL4_REGS_FORMAT " r8=%lx r9=%lx r10=%lx"
+#define HCALL9_REGS_FORMAT HCALL7_REGS_FORMAT " r11=%lx r12=%lx"
+
+static DEFINE_SPINLOCK(hcall_lock);
+
+static long ehca_plpar_hcall_norets(unsigned long opcode,
+                                   unsigned long arg1,
+                                   unsigned long arg2,
+                                   unsigned long arg3,
+                                   unsigned long arg4,
+                                   unsigned long arg5,
+                                   unsigned long arg6,
+                                   unsigned long arg7)
+{
+       long ret;
+       int i, sleep_msecs;
+       unsigned long flags = 0;
+
+       if (unlikely(ehca_debug_level >= 2))
+               ehca_gen_dbg("opcode=%lx " HCALL7_REGS_FORMAT,
+                            opcode, arg1, arg2, arg3, arg4, arg5, arg6, arg7);
+
+       for (i = 0; i < 5; i++) {
+               /* serialize hCalls to work around firmware issue */
+               if (ehca_lock_hcalls)
+                       spin_lock_irqsave(&hcall_lock, flags);
+
+               ret = plpar_hcall_norets(opcode, arg1, arg2, arg3, arg4,
+                                        arg5, arg6, arg7);
+
+               if (ehca_lock_hcalls)
+                       spin_unlock_irqrestore(&hcall_lock, flags);
+
+               if (H_IS_LONG_BUSY(ret)) {
+                       sleep_msecs = get_longbusy_msecs(ret);
+                       msleep_interruptible(sleep_msecs);
+                       continue;
+               }
+
+               if (ret < H_SUCCESS)
+                       ehca_gen_err("opcode=%lx ret=%li " HCALL7_REGS_FORMAT,
+                                    opcode, ret, arg1, arg2, arg3,
+                                    arg4, arg5, arg6, arg7);
+               else
+                       if (unlikely(ehca_debug_level >= 2))
+                               ehca_gen_dbg("opcode=%lx ret=%li", opcode, ret);
+
+               return ret;
+       }
+
+       return H_BUSY;
+}
+
+static long ehca_plpar_hcall9(unsigned long opcode,
+                             unsigned long *outs, /* array of 9 outputs */
+                             unsigned long arg1,
+                             unsigned long arg2,
+                             unsigned long arg3,
+                             unsigned long arg4,
+                             unsigned long arg5,
+                             unsigned long arg6,
+                             unsigned long arg7,
+                             unsigned long arg8,
+                             unsigned long arg9)
+{
+       long ret;
+       int i, sleep_msecs;
+       unsigned long flags = 0;
+
+       if (unlikely(ehca_debug_level >= 2))
+               ehca_gen_dbg("INPUT -- opcode=%lx " HCALL9_REGS_FORMAT, opcode,
+                            arg1, arg2, arg3, arg4, arg5,
+                            arg6, arg7, arg8, arg9);
+
+       for (i = 0; i < 5; i++) {
+               /* serialize hCalls to work around firmware issue */
+               if (ehca_lock_hcalls)
+                       spin_lock_irqsave(&hcall_lock, flags);
+
+               ret = plpar_hcall9(opcode, outs,
+                                  arg1, arg2, arg3, arg4, arg5,
+                                  arg6, arg7, arg8, arg9);
+
+               if (ehca_lock_hcalls)
+                       spin_unlock_irqrestore(&hcall_lock, flags);
+
+               if (H_IS_LONG_BUSY(ret)) {
+                       sleep_msecs = get_longbusy_msecs(ret);
+                       msleep_interruptible(sleep_msecs);
+                       continue;
+               }
+
+               if (ret < H_SUCCESS) {
+                       ehca_gen_err("INPUT -- opcode=%lx " HCALL9_REGS_FORMAT,
+                                    opcode, arg1, arg2, arg3, arg4, arg5,
+                                    arg6, arg7, arg8, arg9);
+                       ehca_gen_err("OUTPUT -- ret=%li " HCALL9_REGS_FORMAT,
+                                    ret, outs[0], outs[1], outs[2], outs[3],
+                                    outs[4], outs[5], outs[6], outs[7],
+                                    outs[8]);
+               } else if (unlikely(ehca_debug_level >= 2))
+                       ehca_gen_dbg("OUTPUT -- ret=%li " HCALL9_REGS_FORMAT,
+                                    ret, outs[0], outs[1], outs[2], outs[3],
+                                    outs[4], outs[5], outs[6], outs[7],
+                                    outs[8]);
+               return ret;
+       }
+
+       return H_BUSY;
+}
+
+u64 hipz_h_alloc_resource_eq(const struct ipz_adapter_handle adapter_handle,
+                            struct ehca_pfeq *pfeq,
+                            const u32 neq_control,
+                            const u32 number_of_entries,
+                            struct ipz_eq_handle *eq_handle,
+                            u32 *act_nr_of_entries,
+                            u32 *act_pages,
+                            u32 *eq_ist)
+{
+       u64 ret;
+       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+       u64 allocate_controls;
+
+       /* resource type */
+       allocate_controls = 3ULL;
+
+       /* ISN is associated */
+       if (neq_control != 1)
+               allocate_controls = (1ULL << (63 - 7)) | allocate_controls;
+       else /* notification event queue */
+               allocate_controls = (1ULL << 63) | allocate_controls;
+
+       ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs,
+                               adapter_handle.handle,  /* r4 */
+                               allocate_controls,      /* r5 */
+                               number_of_entries,      /* r6 */
+                               0, 0, 0, 0, 0, 0);
+       eq_handle->handle = outs[0];
+       *act_nr_of_entries = (u32)outs[3];
+       *act_pages = (u32)outs[4];
+       *eq_ist = (u32)outs[5];
+
+       if (ret == H_NOT_ENOUGH_RESOURCES)
+               ehca_gen_err("Not enough resources. ret=%lli", ret);
+
+       return ret;
+}
+
+u64 hipz_h_reset_event(const struct ipz_adapter_handle adapter_handle,
+                      struct ipz_eq_handle eq_handle,
+                      const u64 event_mask)
+{
+       return ehca_plpar_hcall_norets(H_RESET_EVENTS,
+                                      adapter_handle.handle, /* r4 */
+                                      eq_handle.handle,      /* r5 */
+                                      event_mask,            /* r6 */
+                                      0, 0, 0, 0);
+}
+
+u64 hipz_h_alloc_resource_cq(const struct ipz_adapter_handle adapter_handle,
+                            struct ehca_cq *cq,
+                            struct ehca_alloc_cq_parms *param)
+{
+       int rc;
+       u64 ret;
+       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+       ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs,
+                               adapter_handle.handle,   /* r4  */
+                               2,                       /* r5  */
+                               param->eq_handle.handle, /* r6  */
+                               cq->token,               /* r7  */
+                               param->nr_cqe,           /* r8  */
+                               0, 0, 0, 0);
+       cq->ipz_cq_handle.handle = outs[0];
+       param->act_nr_of_entries = (u32)outs[3];
+       param->act_pages = (u32)outs[4];
+
+       if (ret == H_SUCCESS) {
+               rc = hcp_galpas_ctor(&cq->galpas, 0, outs[5], outs[6]);
+               if (rc) {
+                       ehca_gen_err("Could not establish HW access. rc=%d paddr=%#lx",
+                                    rc, outs[5]);
+
+                       ehca_plpar_hcall_norets(H_FREE_RESOURCE,
+                                               adapter_handle.handle,     /* r4 */
+                                               cq->ipz_cq_handle.handle,  /* r5 */
+                                               0, 0, 0, 0, 0);
+                       ret = H_NO_MEM;
+               }
+       }
+
+       if (ret == H_NOT_ENOUGH_RESOURCES)
+               ehca_gen_err("Not enough resources. ret=%lli", ret);
+
+       return ret;
+}
+
+u64 hipz_h_alloc_resource_qp(const struct ipz_adapter_handle adapter_handle,
+                            struct ehca_alloc_qp_parms *parms, int is_user)
+{
+       int rc;
+       u64 ret;
+       u64 allocate_controls, max_r10_reg, r11, r12;
+       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+       allocate_controls =
+               EHCA_BMASK_SET(H_ALL_RES_QP_ENHANCED_OPS, parms->ext_type)
+               | EHCA_BMASK_SET(H_ALL_RES_QP_PTE_PIN, 0)
+               | EHCA_BMASK_SET(H_ALL_RES_QP_SERVICE_TYPE, parms->servicetype)
+               | EHCA_BMASK_SET(H_ALL_RES_QP_SIGNALING_TYPE, parms->sigtype)
+               | EHCA_BMASK_SET(H_ALL_RES_QP_STORAGE, parms->qp_storage)
+               | EHCA_BMASK_SET(H_ALL_RES_QP_SMALL_SQ_PAGE_SIZE,
+                                parms->squeue.page_size)
+               | EHCA_BMASK_SET(H_ALL_RES_QP_SMALL_RQ_PAGE_SIZE,
+                                parms->rqueue.page_size)
+               | EHCA_BMASK_SET(H_ALL_RES_QP_LL_RQ_CQE_POSTING,
+                                !!(parms->ll_comp_flags & LLQP_RECV_COMP))
+               | EHCA_BMASK_SET(H_ALL_RES_QP_LL_SQ_CQE_POSTING,
+                                !!(parms->ll_comp_flags & LLQP_SEND_COMP))
+               | EHCA_BMASK_SET(H_ALL_RES_QP_UD_AV_LKEY_CTRL,
+                                parms->ud_av_l_key_ctl)
+               | EHCA_BMASK_SET(H_ALL_RES_QP_RESOURCE_TYPE, 1);
+
+       max_r10_reg =
+               EHCA_BMASK_SET(H_ALL_RES_QP_MAX_OUTST_SEND_WR,
+                              parms->squeue.max_wr + 1)
+               | EHCA_BMASK_SET(H_ALL_RES_QP_MAX_OUTST_RECV_WR,
+                                parms->rqueue.max_wr + 1)
+               | EHCA_BMASK_SET(H_ALL_RES_QP_MAX_SEND_SGE,
+                                parms->squeue.max_sge)
+               | EHCA_BMASK_SET(H_ALL_RES_QP_MAX_RECV_SGE,
+                                parms->rqueue.max_sge);
+
+       r11 = EHCA_BMASK_SET(H_ALL_RES_QP_SRQ_QP_TOKEN, parms->srq_token);
+
+       if (parms->ext_type == EQPT_SRQ)
+               r12 = EHCA_BMASK_SET(H_ALL_RES_QP_SRQ_LIMIT, parms->srq_limit);
+       else
+               r12 = EHCA_BMASK_SET(H_ALL_RES_QP_SRQ_QPN, parms->srq_qpn);
+
+       ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs,
+                               adapter_handle.handle,             /* r4  */
+                               allocate_controls,                 /* r5  */
+                               parms->send_cq_handle.handle,
+                               parms->recv_cq_handle.handle,
+                               parms->eq_handle.handle,
+                               ((u64)parms->token << 32) | parms->pd.value,
+                               max_r10_reg, r11, r12);
+
+       parms->qp_handle.handle = outs[0];
+       parms->real_qp_num = (u32)outs[1];
+       parms->squeue.act_nr_wqes =
+               (u16)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_OUTST_SEND_WR, outs[2]);
+       parms->rqueue.act_nr_wqes =
+               (u16)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_OUTST_RECV_WR, outs[2]);
+       parms->squeue.act_nr_sges =
+               (u8)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_SEND_SGE, outs[3]);
+       parms->rqueue.act_nr_sges =
+               (u8)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_RECV_SGE, outs[3]);
+       parms->squeue.queue_size =
+               (u32)EHCA_BMASK_GET(H_ALL_RES_QP_SQUEUE_SIZE_PAGES, outs[4]);
+       parms->rqueue.queue_size =
+               (u32)EHCA_BMASK_GET(H_ALL_RES_QP_RQUEUE_SIZE_PAGES, outs[4]);
+
+       if (ret == H_SUCCESS) {
+               rc = hcp_galpas_ctor(&parms->galpas, is_user, outs[6], outs[6]);
+               if (rc) {
+                       ehca_gen_err("Could not establish HW access. rc=%d paddr=%#lx",
+                                    rc, outs[6]);
+
+                       ehca_plpar_hcall_norets(H_FREE_RESOURCE,
+                                               adapter_handle.handle,     /* r4 */
+                                               parms->qp_handle.handle,  /* r5 */
+                                               0, 0, 0, 0, 0);
+                       ret = H_NO_MEM;
+               }
+       }
+
+       if (ret == H_NOT_ENOUGH_RESOURCES)
+               ehca_gen_err("Not enough resources. ret=%lli", ret);
+
+       return ret;
+}
+
+u64 hipz_h_query_port(const struct ipz_adapter_handle adapter_handle,
+                     const u8 port_id,
+                     struct hipz_query_port *query_port_response_block)
+{
+       u64 ret;
+       u64 r_cb = __pa(query_port_response_block);
+
+       if (r_cb & (EHCA_PAGESIZE-1)) {
+               ehca_gen_err("response block not page aligned");
+               return H_PARAMETER;
+       }
+
+       ret = ehca_plpar_hcall_norets(H_QUERY_PORT,
+                                     adapter_handle.handle, /* r4 */
+                                     port_id,               /* r5 */
+                                     r_cb,                  /* r6 */
+                                     0, 0, 0, 0);
+
+       if (ehca_debug_level >= 2)
+               ehca_dmp(query_port_response_block, 64, "response_block");
+
+       return ret;
+}
+
+u64 hipz_h_modify_port(const struct ipz_adapter_handle adapter_handle,
+                      const u8 port_id, const u32 port_cap,
+                      const u8 init_type, const int modify_mask)
+{
+       u64 port_attributes = port_cap;
+
+       if (modify_mask & IB_PORT_SHUTDOWN)
+               port_attributes |= EHCA_BMASK_SET(H_MP_SHUTDOWN, 1);
+       if (modify_mask & IB_PORT_INIT_TYPE)
+               port_attributes |= EHCA_BMASK_SET(H_MP_INIT_TYPE, init_type);
+       if (modify_mask & IB_PORT_RESET_QKEY_CNTR)
+               port_attributes |= EHCA_BMASK_SET(H_MP_RESET_QKEY_CTR, 1);
+
+       return ehca_plpar_hcall_norets(H_MODIFY_PORT,
+                                      adapter_handle.handle, /* r4 */
+                                      port_id,               /* r5 */
+                                      port_attributes,       /* r6 */
+                                      0, 0, 0, 0);
+}
+
+u64 hipz_h_query_hca(const struct ipz_adapter_handle adapter_handle,
+                    struct hipz_query_hca *query_hca_rblock)
+{
+       u64 r_cb = __pa(query_hca_rblock);
+
+       if (r_cb & (EHCA_PAGESIZE-1)) {
+               ehca_gen_err("response_block=%p not page aligned",
+                            query_hca_rblock);
+               return H_PARAMETER;
+       }
+
+       return ehca_plpar_hcall_norets(H_QUERY_HCA,
+                                      adapter_handle.handle, /* r4 */
+                                      r_cb,                  /* r5 */
+                                      0, 0, 0, 0, 0);
+}
+
+u64 hipz_h_register_rpage(const struct ipz_adapter_handle adapter_handle,
+                         const u8 pagesize,
+                         const u8 queue_type,
+                         const u64 resource_handle,
+                         const u64 logical_address_of_page,
+                         u64 count)
+{
+       return ehca_plpar_hcall_norets(H_REGISTER_RPAGES,
+                                      adapter_handle.handle,      /* r4  */
+                                      (u64)queue_type | ((u64)pagesize) << 8,
+                                      /* r5  */
+                                      resource_handle,            /* r6  */
+                                      logical_address_of_page,    /* r7  */
+                                      count,                      /* r8  */
+                                      0, 0);
+}
+
+u64 hipz_h_register_rpage_eq(const struct ipz_adapter_handle adapter_handle,
+                            const struct ipz_eq_handle eq_handle,
+                            struct ehca_pfeq *pfeq,
+                            const u8 pagesize,
+                            const u8 queue_type,
+                            const u64 logical_address_of_page,
+                            const u64 count)
+{
+       if (count != 1) {
+               ehca_gen_err("Page counter=%llx", count);
+               return H_PARAMETER;
+       }
+       return hipz_h_register_rpage(adapter_handle,
+                                    pagesize,
+                                    queue_type,
+                                    eq_handle.handle,
+                                    logical_address_of_page, count);
+}
+
+u64 hipz_h_query_int_state(const struct ipz_adapter_handle adapter_handle,
+                          u32 ist)
+{
+       u64 ret;
+       ret = ehca_plpar_hcall_norets(H_QUERY_INT_STATE,
+                                     adapter_handle.handle, /* r4 */
+                                     ist,                   /* r5 */
+                                     0, 0, 0, 0, 0);
+
+       if (ret != H_SUCCESS && ret != H_BUSY)
+               ehca_gen_err("Could not query interrupt state.");
+
+       return ret;
+}
+
+u64 hipz_h_register_rpage_cq(const struct ipz_adapter_handle adapter_handle,
+                            const struct ipz_cq_handle cq_handle,
+                            struct ehca_pfcq *pfcq,
+                            const u8 pagesize,
+                            const u8 queue_type,
+                            const u64 logical_address_of_page,
+                            const u64 count,
+                            const struct h_galpa gal)
+{
+       if (count != 1) {
+               ehca_gen_err("Page counter=%llx", count);
+               return H_PARAMETER;
+       }
+
+       return hipz_h_register_rpage(adapter_handle, pagesize, queue_type,
+                                    cq_handle.handle, logical_address_of_page,
+                                    count);
+}
+
+u64 hipz_h_register_rpage_qp(const struct ipz_adapter_handle adapter_handle,
+                            const struct ipz_qp_handle qp_handle,
+                            struct ehca_pfqp *pfqp,
+                            const u8 pagesize,
+                            const u8 queue_type,
+                            const u64 logical_address_of_page,
+                            const u64 count,
+                            const struct h_galpa galpa)
+{
+       if (count > 1) {
+               ehca_gen_err("Page counter=%llx", count);
+               return H_PARAMETER;
+       }
+
+       return hipz_h_register_rpage(adapter_handle, pagesize, queue_type,
+                                    qp_handle.handle, logical_address_of_page,
+                                    count);
+}
+
+u64 hipz_h_disable_and_get_wqe(const struct ipz_adapter_handle adapter_handle,
+                              const struct ipz_qp_handle qp_handle,
+                              struct ehca_pfqp *pfqp,
+                              void **log_addr_next_sq_wqe2processed,
+                              void **log_addr_next_rq_wqe2processed,
+                              int dis_and_get_function_code)
+{
+       u64 ret;
+       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+       ret = ehca_plpar_hcall9(H_DISABLE_AND_GETC, outs,
+                               adapter_handle.handle,     /* r4 */
+                               dis_and_get_function_code, /* r5 */
+                               qp_handle.handle,          /* r6 */
+                               0, 0, 0, 0, 0, 0);
+       if (log_addr_next_sq_wqe2processed)
+               *log_addr_next_sq_wqe2processed = (void *)outs[0];
+       if (log_addr_next_rq_wqe2processed)
+               *log_addr_next_rq_wqe2processed = (void *)outs[1];
+
+       return ret;
+}
+
+u64 hipz_h_modify_qp(const struct ipz_adapter_handle adapter_handle,
+                    const struct ipz_qp_handle qp_handle,
+                    struct ehca_pfqp *pfqp,
+                    const u64 update_mask,
+                    struct hcp_modify_qp_control_block *mqpcb,
+                    struct h_galpa gal)
+{
+       u64 ret;
+       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+       ret = ehca_plpar_hcall9(H_MODIFY_QP, outs,
+                               adapter_handle.handle, /* r4 */
+                               qp_handle.handle,      /* r5 */
+                               update_mask,           /* r6 */
+                               __pa(mqpcb),           /* r7 */
+                               0, 0, 0, 0, 0);
+
+       if (ret == H_NOT_ENOUGH_RESOURCES)
+               ehca_gen_err("Insufficient resources ret=%lli", ret);
+
+       return ret;
+}
+
+u64 hipz_h_query_qp(const struct ipz_adapter_handle adapter_handle,
+                   const struct ipz_qp_handle qp_handle,
+                   struct ehca_pfqp *pfqp,
+                   struct hcp_modify_qp_control_block *qqpcb,
+                   struct h_galpa gal)
+{
+       return ehca_plpar_hcall_norets(H_QUERY_QP,
+                                      adapter_handle.handle, /* r4 */
+                                      qp_handle.handle,      /* r5 */
+                                      __pa(qqpcb),           /* r6 */
+                                      0, 0, 0, 0);
+}
+
+u64 hipz_h_destroy_qp(const struct ipz_adapter_handle adapter_handle,
+                     struct ehca_qp *qp)
+{
+       u64 ret;
+       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+       ret = hcp_galpas_dtor(&qp->galpas);
+       if (ret) {
+               ehca_gen_err("Could not destruct qp->galpas");
+               return H_RESOURCE;
+       }
+       ret = ehca_plpar_hcall9(H_DISABLE_AND_GETC, outs,
+                               adapter_handle.handle,     /* r4 */
+                               /* function code */
+                               1,                         /* r5 */
+                               qp->ipz_qp_handle.handle,  /* r6 */
+                               0, 0, 0, 0, 0, 0);
+       if (ret == H_HARDWARE)
+               ehca_gen_err("HCA not operational. ret=%lli", ret);
+
+       ret = ehca_plpar_hcall_norets(H_FREE_RESOURCE,
+                                     adapter_handle.handle,     /* r4 */
+                                     qp->ipz_qp_handle.handle,  /* r5 */
+                                     0, 0, 0, 0, 0);
+
+       if (ret == H_RESOURCE)
+               ehca_gen_err("Resource still in use. ret=%lli", ret);
+
+       return ret;
+}
+
+u64 hipz_h_define_aqp0(const struct ipz_adapter_handle adapter_handle,
+                      const struct ipz_qp_handle qp_handle,
+                      struct h_galpa gal,
+                      u32 port)
+{
+       return ehca_plpar_hcall_norets(H_DEFINE_AQP0,
+                                      adapter_handle.handle, /* r4 */
+                                      qp_handle.handle,      /* r5 */
+                                      port,                  /* r6 */
+                                      0, 0, 0, 0);
+}
+
+u64 hipz_h_define_aqp1(const struct ipz_adapter_handle adapter_handle,
+                      const struct ipz_qp_handle qp_handle,
+                      struct h_galpa gal,
+                      u32 port, u32 *pma_qp_nr,
+                      u32 *bma_qp_nr)
+{
+       u64 ret;
+       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+       ret = ehca_plpar_hcall9(H_DEFINE_AQP1, outs,
+                               adapter_handle.handle, /* r4 */
+                               qp_handle.handle,      /* r5 */
+                               port,                  /* r6 */
+                               0, 0, 0, 0, 0, 0);
+       *pma_qp_nr = (u32)outs[0];
+       *bma_qp_nr = (u32)outs[1];
+
+       if (ret == H_ALIAS_EXIST)
+               ehca_gen_err("AQP1 already exists. ret=%lli", ret);
+
+       return ret;
+}
+
+u64 hipz_h_attach_mcqp(const struct ipz_adapter_handle adapter_handle,
+                      const struct ipz_qp_handle qp_handle,
+                      struct h_galpa gal,
+                      u16 mcg_dlid,
+                      u64 subnet_prefix, u64 interface_id)
+{
+       u64 ret;
+
+       ret = ehca_plpar_hcall_norets(H_ATTACH_MCQP,
+                                     adapter_handle.handle,  /* r4 */
+                                     qp_handle.handle,       /* r5 */
+                                     mcg_dlid,               /* r6 */
+                                     interface_id,           /* r7 */
+                                     subnet_prefix,          /* r8 */
+                                     0, 0);
+
+       if (ret == H_NOT_ENOUGH_RESOURCES)
+               ehca_gen_err("Not enough resources. ret=%lli", ret);
+
+       return ret;
+}
+
+u64 hipz_h_detach_mcqp(const struct ipz_adapter_handle adapter_handle,
+                      const struct ipz_qp_handle qp_handle,
+                      struct h_galpa gal,
+                      u16 mcg_dlid,
+                      u64 subnet_prefix, u64 interface_id)
+{
+       return ehca_plpar_hcall_norets(H_DETACH_MCQP,
+                                      adapter_handle.handle, /* r4 */
+                                      qp_handle.handle,      /* r5 */
+                                      mcg_dlid,              /* r6 */
+                                      interface_id,          /* r7 */
+                                      subnet_prefix,         /* r8 */
+                                      0, 0);
+}
+
+u64 hipz_h_destroy_cq(const struct ipz_adapter_handle adapter_handle,
+                     struct ehca_cq *cq,
+                     u8 force_flag)
+{
+       u64 ret;
+
+       ret = hcp_galpas_dtor(&cq->galpas);
+       if (ret) {
+               ehca_gen_err("Could not destruct cq->galpas");
+               return H_RESOURCE;
+       }
+
+       ret = ehca_plpar_hcall_norets(H_FREE_RESOURCE,
+                                     adapter_handle.handle,     /* r4 */
+                                     cq->ipz_cq_handle.handle,  /* r5 */
+                                     force_flag != 0 ? 1L : 0L, /* r6 */
+                                     0, 0, 0, 0);
+
+       if (ret == H_RESOURCE)
+               ehca_gen_err("H_FREE_RESOURCE failed ret=%lli ", ret);
+
+       return ret;
+}
+
+u64 hipz_h_destroy_eq(const struct ipz_adapter_handle adapter_handle,
+                     struct ehca_eq *eq)
+{
+       u64 ret;
+
+       ret = hcp_galpas_dtor(&eq->galpas);
+       if (ret) {
+               ehca_gen_err("Could not destruct eq->galpas");
+               return H_RESOURCE;
+       }
+
+       ret = ehca_plpar_hcall_norets(H_FREE_RESOURCE,
+                                     adapter_handle.handle,     /* r4 */
+                                     eq->ipz_eq_handle.handle,  /* r5 */
+                                     0, 0, 0, 0, 0);
+
+       if (ret == H_RESOURCE)
+               ehca_gen_err("Resource in use. ret=%lli ", ret);
+
+       return ret;
+}
+
+u64 hipz_h_alloc_resource_mr(const struct ipz_adapter_handle adapter_handle,
+                            const struct ehca_mr *mr,
+                            const u64 vaddr,
+                            const u64 length,
+                            const u32 access_ctrl,
+                            const struct ipz_pd pd,
+                            struct ehca_mr_hipzout_parms *outparms)
+{
+       u64 ret;
+       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
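+       /* resource type 5 selects an MR for H_ALLOC_RESOURCE; the MW
+        * variant further below passes 6 */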
+       ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs,
+                               adapter_handle.handle,            /* r4 */
+                               5,                                /* r5 */
+                               vaddr,                            /* r6 */
+                               length,                           /* r7 */
+                               (((u64)access_ctrl) << 32ULL),    /* r8 */
+                               pd.value,                         /* r9 */
+                               0, 0, 0);
+       outparms->handle.handle = outs[0];
+       outparms->lkey = (u32)outs[2];
+       outparms->rkey = (u32)outs[3];
+
+       return ret;
+}
+
+u64 hipz_h_register_rpage_mr(const struct ipz_adapter_handle adapter_handle,
+                            const struct ehca_mr *mr,
+                            const u8 pagesize,
+                            const u8 queue_type,
+                            const u64 logical_address_of_page,
+                            const u64 count)
+{
+       u64 ret;
+
+       if (unlikely(ehca_debug_level >= 3)) {
+               if (count > 1) {
+                       u64 *kpage;
+                       int i;
+                       kpage = __va(logical_address_of_page);
+                       for (i = 0; i < count; i++)
+                               ehca_gen_dbg("kpage[%d]=%p",
+                                            i, (void *)kpage[i]);
+               } else
+                       ehca_gen_dbg("kpage=%p",
+                                    (void *)logical_address_of_page);
+       }
+
+       if ((count > 1) && (logical_address_of_page & (EHCA_PAGESIZE-1))) {
+               ehca_gen_err("logical_address_of_page not on a 4k boundary "
+                            "adapter_handle=%llx mr=%p mr_handle=%llx "
+                            "pagesize=%x queue_type=%x "
+                            "logical_address_of_page=%llx count=%llx",
+                            adapter_handle.handle, mr,
+                            mr->ipz_mr_handle.handle, pagesize, queue_type,
+                            logical_address_of_page, count);
+               ret = H_PARAMETER;
+       } else
+               ret = hipz_h_register_rpage(adapter_handle, pagesize,
+                                           queue_type,
+                                           mr->ipz_mr_handle.handle,
+                                           logical_address_of_page, count);
+       return ret;
+}
+
+u64 hipz_h_query_mr(const struct ipz_adapter_handle adapter_handle,
+                   const struct ehca_mr *mr,
+                   struct ehca_mr_hipzout_parms *outparms)
+{
+       u64 ret;
+       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+       ret = ehca_plpar_hcall9(H_QUERY_MR, outs,
+                               adapter_handle.handle,     /* r4 */
+                               mr->ipz_mr_handle.handle,  /* r5 */
+                               0, 0, 0, 0, 0, 0, 0);
+       outparms->len = outs[0];
+       outparms->vaddr = outs[1];
+       outparms->acl  = outs[4] >> 32;
+       outparms->lkey = (u32)(outs[5] >> 32);
+       outparms->rkey = (u32)(outs[5] & (0xffffffff));
+
+       return ret;
+}
+
+u64 hipz_h_free_resource_mr(const struct ipz_adapter_handle adapter_handle,
+                           const struct ehca_mr *mr)
+{
+       return ehca_plpar_hcall_norets(H_FREE_RESOURCE,
+                                      adapter_handle.handle,    /* r4 */
+                                      mr->ipz_mr_handle.handle, /* r5 */
+                                      0, 0, 0, 0, 0);
+}
+
+u64 hipz_h_reregister_pmr(const struct ipz_adapter_handle adapter_handle,
+                         const struct ehca_mr *mr,
+                         const u64 vaddr_in,
+                         const u64 length,
+                         const u32 access_ctrl,
+                         const struct ipz_pd pd,
+                         const u64 mr_addr_cb,
+                         struct ehca_mr_hipzout_parms *outparms)
+{
+       u64 ret;
+       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+       ret = ehca_plpar_hcall9(H_REREGISTER_PMR, outs,
+                               adapter_handle.handle,    /* r4 */
+                               mr->ipz_mr_handle.handle, /* r5 */
+                               vaddr_in,                 /* r6 */
+                               length,                   /* r7 */
+                               /* r8 */
+                               ((((u64)access_ctrl) << 32ULL) | pd.value),
+                               mr_addr_cb,               /* r9 */
+                               0, 0, 0);
+       outparms->vaddr = outs[1];
+       outparms->lkey = (u32)outs[2];
+       outparms->rkey = (u32)outs[3];
+
+       return ret;
+}
+
+u64 hipz_h_register_smr(const struct ipz_adapter_handle adapter_handle,
+                       const struct ehca_mr *mr,
+                       const struct ehca_mr *orig_mr,
+                       const u64 vaddr_in,
+                       const u32 access_ctrl,
+                       const struct ipz_pd pd,
+                       struct ehca_mr_hipzout_parms *outparms)
+{
+       u64 ret;
+       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+       ret = ehca_plpar_hcall9(H_REGISTER_SMR, outs,
+                               adapter_handle.handle,            /* r4 */
+                               orig_mr->ipz_mr_handle.handle,    /* r5 */
+                               vaddr_in,                         /* r6 */
+                               (((u64)access_ctrl) << 32ULL),    /* r7 */
+                               pd.value,                         /* r8 */
+                               0, 0, 0, 0);
+       outparms->handle.handle = outs[0];
+       outparms->lkey = (u32)outs[2];
+       outparms->rkey = (u32)outs[3];
+
+       return ret;
+}
+
+u64 hipz_h_alloc_resource_mw(const struct ipz_adapter_handle adapter_handle,
+                            const struct ehca_mw *mw,
+                            const struct ipz_pd pd,
+                            struct ehca_mw_hipzout_parms *outparms)
+{
+       u64 ret;
+       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+       ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs,
+                               adapter_handle.handle,      /* r4 */
+                               6,                          /* r5 */
+                               pd.value,                   /* r6 */
+                               0, 0, 0, 0, 0, 0);
+       outparms->handle.handle = outs[0];
+       outparms->rkey = (u32)outs[3];
+
+       return ret;
+}
+
+u64 hipz_h_query_mw(const struct ipz_adapter_handle adapter_handle,
+                   const struct ehca_mw *mw,
+                   struct ehca_mw_hipzout_parms *outparms)
+{
+       u64 ret;
+       unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+       ret = ehca_plpar_hcall9(H_QUERY_MW, outs,
+                               adapter_handle.handle,    /* r4 */
+                               mw->ipz_mw_handle.handle, /* r5 */
+                               0, 0, 0, 0, 0, 0, 0);
+       outparms->rkey = (u32)outs[3];
+
+       return ret;
+}
+
+u64 hipz_h_free_resource_mw(const struct ipz_adapter_handle adapter_handle,
+                           const struct ehca_mw *mw)
+{
+       return ehca_plpar_hcall_norets(H_FREE_RESOURCE,
+                                      adapter_handle.handle,    /* r4 */
+                                      mw->ipz_mw_handle.handle, /* r5 */
+                                      0, 0, 0, 0, 0);
+}
+
+u64 hipz_h_error_data(const struct ipz_adapter_handle adapter_handle,
+                     const u64 ressource_handle,
+                     void *rblock,
+                     unsigned long *byte_count)
+{
+       u64 r_cb = __pa(rblock);
+
+       if (r_cb & (EHCA_PAGESIZE-1)) {
+               ehca_gen_err("rblock not page aligned.");
+               return H_PARAMETER;
+       }
+
+       return ehca_plpar_hcall_norets(H_ERROR_DATA,
+                                      adapter_handle.handle,
+                                      ressource_handle,
+                                      r_cb,
+                                      0, 0, 0, 0);
+}
+
+u64 hipz_h_eoi(int irq)
+{
+       unsigned long xirr;
+
+       iosync();
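+       /* XICS XIRR layout: CPPR (0xff) in the top byte, interrupt
+        * source number in the low 24 bits */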
+       xirr = (0xffULL << 24) | irq;
+
+       return plpar_hcall_norets(H_EOI, xirr);
+}
diff --git a/drivers/staging/rdma/ehca/hcp_if.h b/drivers/staging/rdma/ehca/hcp_if.h
new file mode 100644 (file)
index 0000000..a46e514
--- /dev/null
@@ -0,0 +1,265 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  Firmware Infiniband Interface code for POWER
+ *
+ *  Authors: Christoph Raisch <raisch@de.ibm.com>
+ *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *           Gerd Bayer <gerd.bayer@de.ibm.com>
+ *           Waleri Fomin <fomin@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __HCP_IF_H__
+#define __HCP_IF_H__
+
+#include "ehca_classes.h"
+#include "ehca_tools.h"
+#include "hipz_hw.h"
+
+/*
+ * hipz_h_alloc_resource_eq allocates EQ resources in HW and FW, initializes
+ * the resources, and creates the empty EQPT (ring).
+ */
+u64 hipz_h_alloc_resource_eq(const struct ipz_adapter_handle adapter_handle,
+                            struct ehca_pfeq *pfeq,
+                            const u32 neq_control,
+                            const u32 number_of_entries,
+                            struct ipz_eq_handle *eq_handle,
+                            u32 *act_nr_of_entries,
+                            u32 *act_pages,
+                            u32 *eq_ist);
+
+u64 hipz_h_reset_event(const struct ipz_adapter_handle adapter_handle,
+                      struct ipz_eq_handle eq_handle,
+                      const u64 event_mask);
+/*
+ * hipz_h_alloc_resource_cq allocates CQ resources in HW and FW, initializes
+ * the resources, and creates the empty CQPT (ring).
+ */
+u64 hipz_h_alloc_resource_cq(const struct ipz_adapter_handle adapter_handle,
+                            struct ehca_cq *cq,
+                            struct ehca_alloc_cq_parms *param);
+
+
+/*
+ * hipz_h_alloc_resource_qp allocates QP resources in HW and FW,
+ * initializes the resources, and creates the empty QPPTs (2 rings).
+ */
+u64 hipz_h_alloc_resource_qp(const struct ipz_adapter_handle adapter_handle,
+                            struct ehca_alloc_qp_parms *parms, int is_user);
+
+u64 hipz_h_query_port(const struct ipz_adapter_handle adapter_handle,
+                     const u8 port_id,
+                     struct hipz_query_port *query_port_response_block);
+
+u64 hipz_h_modify_port(const struct ipz_adapter_handle adapter_handle,
+                      const u8 port_id, const u32 port_cap,
+                      const u8 init_type, const int modify_mask);
+
+u64 hipz_h_query_hca(const struct ipz_adapter_handle adapter_handle,
+                    struct hipz_query_hca *query_hca_rblock);
+
+/*
+ * hipz_h_register_rpage is the internal helper behind all
+ * H_REGISTER_RPAGES calls.
+ */
+u64 hipz_h_register_rpage(const struct ipz_adapter_handle adapter_handle,
+                         const u8 pagesize,
+                         const u8 queue_type,
+                         const u64 resource_handle,
+                         const u64 logical_address_of_page,
+                         u64 count);
+
+u64 hipz_h_register_rpage_eq(const struct ipz_adapter_handle adapter_handle,
+                            const struct ipz_eq_handle eq_handle,
+                            struct ehca_pfeq *pfeq,
+                            const u8 pagesize,
+                            const u8 queue_type,
+                            const u64 logical_address_of_page,
+                            const u64 count);
+
+u64 hipz_h_query_int_state(const struct ipz_adapter_handle
+                          hcp_adapter_handle,
+                          u32 ist);
+
+u64 hipz_h_register_rpage_cq(const struct ipz_adapter_handle adapter_handle,
+                            const struct ipz_cq_handle cq_handle,
+                            struct ehca_pfcq *pfcq,
+                            const u8 pagesize,
+                            const u8 queue_type,
+                            const u64 logical_address_of_page,
+                            const u64 count,
+                            const struct h_galpa gal);
+
+u64 hipz_h_register_rpage_qp(const struct ipz_adapter_handle adapter_handle,
+                            const struct ipz_qp_handle qp_handle,
+                            struct ehca_pfqp *pfqp,
+                            const u8 pagesize,
+                            const u8 queue_type,
+                            const u64 logical_address_of_page,
+                            const u64 count,
+                            const struct h_galpa galpa);
+
+u64 hipz_h_disable_and_get_wqe(const struct ipz_adapter_handle adapter_handle,
+                              const struct ipz_qp_handle qp_handle,
+                              struct ehca_pfqp *pfqp,
+                              void **log_addr_next_sq_wqe_tb_processed,
+                              void **log_addr_next_rq_wqe_tb_processed,
+                              int dis_and_get_function_code);
+enum hcall_sigt {
+       HCALL_SIGT_NO_CQE = 0,
+       HCALL_SIGT_BY_WQE = 1,
+       HCALL_SIGT_EVERY = 2
+};
+
+u64 hipz_h_modify_qp(const struct ipz_adapter_handle adapter_handle,
+                    const struct ipz_qp_handle qp_handle,
+                    struct ehca_pfqp *pfqp,
+                    const u64 update_mask,
+                    struct hcp_modify_qp_control_block *mqpcb,
+                    struct h_galpa gal);
+
+u64 hipz_h_query_qp(const struct ipz_adapter_handle adapter_handle,
+                   const struct ipz_qp_handle qp_handle,
+                   struct ehca_pfqp *pfqp,
+                   struct hcp_modify_qp_control_block *qqpcb,
+                   struct h_galpa gal);
+
+u64 hipz_h_destroy_qp(const struct ipz_adapter_handle adapter_handle,
+                     struct ehca_qp *qp);
+
+u64 hipz_h_define_aqp0(const struct ipz_adapter_handle adapter_handle,
+                      const struct ipz_qp_handle qp_handle,
+                      struct h_galpa gal,
+                      u32 port);
+
+u64 hipz_h_define_aqp1(const struct ipz_adapter_handle adapter_handle,
+                      const struct ipz_qp_handle qp_handle,
+                      struct h_galpa gal,
+                      u32 port, u32 *pma_qp_nr,
+                      u32 *bma_qp_nr);
+
+u64 hipz_h_attach_mcqp(const struct ipz_adapter_handle adapter_handle,
+                      const struct ipz_qp_handle qp_handle,
+                      struct h_galpa gal,
+                      u16 mcg_dlid,
+                      u64 subnet_prefix, u64 interface_id);
+
+u64 hipz_h_detach_mcqp(const struct ipz_adapter_handle adapter_handle,
+                      const struct ipz_qp_handle qp_handle,
+                      struct h_galpa gal,
+                      u16 mcg_dlid,
+                      u64 subnet_prefix, u64 interface_id);
+
+u64 hipz_h_destroy_cq(const struct ipz_adapter_handle adapter_handle,
+                     struct ehca_cq *cq,
+                     u8 force_flag);
+
+u64 hipz_h_destroy_eq(const struct ipz_adapter_handle adapter_handle,
+                     struct ehca_eq *eq);
+
+/*
+ * hipz_h_alloc_resource_mr allocates MR resources in HW and FW and
+ * initializes them.
+ */
+u64 hipz_h_alloc_resource_mr(const struct ipz_adapter_handle adapter_handle,
+                            const struct ehca_mr *mr,
+                            const u64 vaddr,
+                            const u64 length,
+                            const u32 access_ctrl,
+                            const struct ipz_pd pd,
+                            struct ehca_mr_hipzout_parms *outparms);
+
+/* hipz_h_register_rpage_mr registers MR resource pages in HW and FW */
+u64 hipz_h_register_rpage_mr(const struct ipz_adapter_handle adapter_handle,
+                            const struct ehca_mr *mr,
+                            const u8 pagesize,
+                            const u8 queue_type,
+                            const u64 logical_address_of_page,
+                            const u64 count);
+
+/* hipz_h_query_mr queries MR in HW and FW */
+u64 hipz_h_query_mr(const struct ipz_adapter_handle adapter_handle,
+                   const struct ehca_mr *mr,
+                   struct ehca_mr_hipzout_parms *outparms);
+
+/* hipz_h_free_resource_mr frees MR resources in HW and FW */
+u64 hipz_h_free_resource_mr(const struct ipz_adapter_handle adapter_handle,
+                           const struct ehca_mr *mr);
+
+/* hipz_h_reregister_pmr reregisters MR in HW and FW */
+u64 hipz_h_reregister_pmr(const struct ipz_adapter_handle adapter_handle,
+                         const struct ehca_mr *mr,
+                         const u64 vaddr_in,
+                         const u64 length,
+                         const u32 access_ctrl,
+                         const struct ipz_pd pd,
+                         const u64 mr_addr_cb,
+                         struct ehca_mr_hipzout_parms *outparms);
+
+/* hipz_h_register_smr registers a shared MR in HW and FW */
+u64 hipz_h_register_smr(const struct ipz_adapter_handle adapter_handle,
+                       const struct ehca_mr *mr,
+                       const struct ehca_mr *orig_mr,
+                       const u64 vaddr_in,
+                       const u32 access_ctrl,
+                       const struct ipz_pd pd,
+                       struct ehca_mr_hipzout_parms *outparms);
+
+/*
+ * hipz_h_alloc_resource_mw allocates MW resources in HW and FW and
+ * initializes them.
+ */
+u64 hipz_h_alloc_resource_mw(const struct ipz_adapter_handle adapter_handle,
+                            const struct ehca_mw *mw,
+                            const struct ipz_pd pd,
+                            struct ehca_mw_hipzout_parms *outparms);
+
+/* hipz_h_query_mw queries MW in HW and FW */
+u64 hipz_h_query_mw(const struct ipz_adapter_handle adapter_handle,
+                   const struct ehca_mw *mw,
+                   struct ehca_mw_hipzout_parms *outparms);
+
+/* hipz_h_free_resource_mw frees MW resources in HW and FW */
+u64 hipz_h_free_resource_mw(const struct ipz_adapter_handle adapter_handle,
+                           const struct ehca_mw *mw);
+
+u64 hipz_h_error_data(const struct ipz_adapter_handle adapter_handle,
+                     const u64 ressource_handle,
+                     void *rblock,
+                     unsigned long *byte_count);
+u64 hipz_h_eoi(int irq);
+
+#endif /* __HCP_IF_H__ */
diff --git a/drivers/staging/rdma/ehca/hcp_phyp.c b/drivers/staging/rdma/ehca/hcp_phyp.c
new file mode 100644 (file)
index 0000000..077376f
--- /dev/null
@@ -0,0 +1,82 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *   load store abstraction for ehca register access with tracing
+ *
+ *  Authors: Christoph Raisch <raisch@de.ibm.com>
+ *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "ehca_classes.h"
+#include "hipz_hw.h"
+
+u64 hcall_map_page(u64 physaddr)
+{
+       return (u64)ioremap(physaddr, EHCA_PAGESIZE);
+}
+
+int hcall_unmap_page(u64 mapaddr)
+{
+       iounmap((volatile void __iomem *) mapaddr);
+       return 0;
+}
+
+int hcp_galpas_ctor(struct h_galpas *galpas, int is_user,
+                   u64 paddr_kernel, u64 paddr_user)
+{
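+       /* kernel access needs an ioremap of the register page; for user
+        * space only the physical address is recorded here and the
+        * actual mapping into the process is set up separately */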
+       if (!is_user) {
+               galpas->kernel.fw_handle = hcall_map_page(paddr_kernel);
+               if (!galpas->kernel.fw_handle)
+                       return -ENOMEM;
+       } else
+               galpas->kernel.fw_handle = 0;
+
+       galpas->user.fw_handle = paddr_user;
+
+       return 0;
+}
+
+int hcp_galpas_dtor(struct h_galpas *galpas)
+{
+       if (galpas->kernel.fw_handle) {
+               int ret = hcall_unmap_page(galpas->kernel.fw_handle);
+               if (ret)
+                       return ret;
+       }
+
+       galpas->user.fw_handle = galpas->kernel.fw_handle = 0;
+
+       return 0;
+}
diff --git a/drivers/staging/rdma/ehca/hcp_phyp.h b/drivers/staging/rdma/ehca/hcp_phyp.h
new file mode 100644 (file)
index 0000000..d1b0299
--- /dev/null
@@ -0,0 +1,90 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  Firmware calls
+ *
+ *  Authors: Christoph Raisch <raisch@de.ibm.com>
+ *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *           Waleri Fomin <fomin@de.ibm.com>
+ *           Gerd Bayer <gerd.bayer@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __HCP_PHYP_H__
+#define __HCP_PHYP_H__
+
+
+/*
+ * eHCA page (mapped into memory)
+ * resource to access eHCA register pages in CPU address space
+ */
+struct h_galpa {
+       u64 fw_handle;
+       /* for pSeries this is a 64-bit kernel-virtual address where
+          the I/O memory is mapped into the CPU address space */
+};
+
+/*
+ * resource to access eHCA address space registers, all types
+ */
+struct h_galpas {
+       u32 pid;                /*PID of userspace galpa checking */
+       struct h_galpa user;    /* user space accessible resource,
+                                  set to 0 if unused */
+       struct h_galpa kernel;  /* kernel space accessible resource,
+                                  set to 0 if unused */
+};
+
+static inline u64 hipz_galpa_load(struct h_galpa galpa, u32 offset)
+{
+       u64 addr = galpa.fw_handle + offset;
+       return *(volatile u64 __force *)addr;
+}
+
+static inline void hipz_galpa_store(struct h_galpa galpa, u32 offset, u64 value)
+{
+       u64 addr = galpa.fw_handle + offset;
+       *(volatile u64 __force *)addr = value;
+}
+
+int hcp_galpas_ctor(struct h_galpas *galpas, int is_user,
+                   u64 paddr_kernel, u64 paddr_user);
+
+int hcp_galpas_dtor(struct h_galpas *galpas);
+
+u64 hcall_map_page(u64 physaddr);
+
+int hcall_unmap_page(u64 mapaddr);
+
+#endif
diff --git a/drivers/staging/rdma/ehca/hipz_fns.h b/drivers/staging/rdma/ehca/hipz_fns.h
new file mode 100644 (file)
index 0000000..9dac93d
--- /dev/null
@@ -0,0 +1,68 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  HW abstraction register functions
+ *
+ *  Authors: Christoph Raisch <raisch@de.ibm.com>
+ *           Reinhard Ernst <rernst@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __HIPZ_FNS_H__
+#define __HIPZ_FNS_H__
+
+#include "ehca_classes.h"
+#include "hipz_hw.h"
+
+#include "hipz_fns_core.h"
+
+#define hipz_galpa_store_eq(gal, offset, value) \
+       hipz_galpa_store(gal, EQTEMM_OFFSET(offset), value)
+
+#define hipz_galpa_load_eq(gal, offset) \
+       hipz_galpa_load(gal, EQTEMM_OFFSET(offset))
+
+#define hipz_galpa_store_qped(gal, offset, value) \
+       hipz_galpa_store(gal, QPEDMM_OFFSET(offset), value)
+
+#define hipz_galpa_load_qped(gal, offset) \
+       hipz_galpa_load(gal, QPEDMM_OFFSET(offset))
+
+#define hipz_galpa_store_mrmw(gal, offset, value) \
+       hipz_galpa_store(gal, MRMWMM_OFFSET(offset), value)
+
+#define hipz_galpa_load_mrmw(gal, offset) \
+       hipz_galpa_load(gal, MRMWMM_OFFSET(offset))
+
+#endif
diff --git a/drivers/staging/rdma/ehca/hipz_fns_core.h b/drivers/staging/rdma/ehca/hipz_fns_core.h
new file mode 100644 (file)
index 0000000..868735f
--- /dev/null
@@ -0,0 +1,100 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  HW abstraction register functions
+ *
+ *  Authors: Christoph Raisch <raisch@de.ibm.com>
+ *           Heiko J Schick <schickhj@de.ibm.com>
+ *           Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *           Reinhard Ernst <rernst@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __HIPZ_FNS_CORE_H__
+#define __HIPZ_FNS_CORE_H__
+
+#include "hcp_phyp.h"
+#include "hipz_hw.h"
+
+#define hipz_galpa_store_cq(gal, offset, value) \
+       hipz_galpa_store(gal, CQTEMM_OFFSET(offset), value)
+
+#define hipz_galpa_load_cq(gal, offset) \
+       hipz_galpa_load(gal, CQTEMM_OFFSET(offset))
+
+#define hipz_galpa_store_qp(gal, offset, value) \
+       hipz_galpa_store(gal, QPTEMM_OFFSET(offset), value)
+#define hipz_galpa_load_qp(gal, offset) \
+       hipz_galpa_load(gal, QPTEMM_OFFSET(offset))
+
+static inline void hipz_update_sqa(struct ehca_qp *qp, u16 nr_wqes)
+{
+       /*  ringing doorbell :-) */
+       hipz_galpa_store_qp(qp->galpas.kernel, qpx_sqa,
+                           EHCA_BMASK_SET(QPX_SQADDER, nr_wqes));
+}
+
+static inline void hipz_update_rqa(struct ehca_qp *qp, u16 nr_wqes)
+{
+       /*  ringing doorbell :-) */
+       hipz_galpa_store_qp(qp->galpas.kernel, qpx_rqa,
+                           EHCA_BMASK_SET(QPX_RQADDER, nr_wqes));
+}
+
+static inline void hipz_update_feca(struct ehca_cq *cq, u32 nr_cqes)
+{
+       hipz_galpa_store_cq(cq->galpas.kernel, cqx_feca,
+                           EHCA_BMASK_SET(CQX_FECADDER, nr_cqes));
+}
+
+static inline void hipz_set_cqx_n0(struct ehca_cq *cq, u32 value)
+{
+       u64 cqx_n0_reg;
+
+       hipz_galpa_store_cq(cq->galpas.kernel, cqx_n0,
+                           EHCA_BMASK_SET(CQX_N0_GENERATE_SOLICITED_COMP_EVENT,
+                                          value));
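+       /* the read-back forces the preceding MMIO store out to the
+        * adapter; the value itself is unused */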
+       cqx_n0_reg = hipz_galpa_load_cq(cq->galpas.kernel, cqx_n0);
+}
+
+static inline void hipz_set_cqx_n1(struct ehca_cq *cq, u32 value)
+{
+       u64 cqx_n1_reg;
+
+       hipz_galpa_store_cq(cq->galpas.kernel, cqx_n1,
+                           EHCA_BMASK_SET(CQX_N1_GENERATE_COMP_EVENT, value));
+       cqx_n1_reg = hipz_galpa_load_cq(cq->galpas.kernel, cqx_n1);
+}
+
+#endif /* __HIPZ_FNC_CORE_H__ */
diff --git a/drivers/staging/rdma/ehca/hipz_hw.h b/drivers/staging/rdma/ehca/hipz_hw.h
new file mode 100644 (file)
index 0000000..bf996c7
--- /dev/null
@@ -0,0 +1,414 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  eHCA register definitions
+ *
+ *  Authors: Waleri Fomin <fomin@de.ibm.com>
+ *           Christoph Raisch <raisch@de.ibm.com>
+ *           Reinhard Ernst <rernst@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __HIPZ_HW_H__
+#define __HIPZ_HW_H__
+
+#include "ehca_tools.h"
+
+#define EHCA_MAX_MTU 4
+
+/* QP Table Entry Memory Map */
+struct hipz_qptemm {
+       u64 qpx_hcr;
+       u64 qpx_c;
+       u64 qpx_herr;
+       u64 qpx_aer;
+/* 0x20*/
+       u64 qpx_sqa;
+       u64 qpx_sqc;
+       u64 qpx_rqa;
+       u64 qpx_rqc;
+/* 0x40*/
+       u64 qpx_st;
+       u64 qpx_pmstate;
+       u64 qpx_pmfa;
+       u64 qpx_pkey;
+/* 0x60*/
+       u64 qpx_pkeya;
+       u64 qpx_pkeyb;
+       u64 qpx_pkeyc;
+       u64 qpx_pkeyd;
+/* 0x80*/
+       u64 qpx_qkey;
+       u64 qpx_dqp;
+       u64 qpx_dlidp;
+       u64 qpx_portp;
+/* 0xa0*/
+       u64 qpx_slidp;
+       u64 qpx_slidpp;
+       u64 qpx_dlida;
+       u64 qpx_porta;
+/* 0xc0*/
+       u64 qpx_slida;
+       u64 qpx_slidpa;
+       u64 qpx_slvl;
+       u64 qpx_ipd;
+/* 0xe0*/
+       u64 qpx_mtu;
+       u64 qpx_lato;
+       u64 qpx_rlimit;
+       u64 qpx_rnrlimit;
+/* 0x100*/
+       u64 qpx_t;
+       u64 qpx_sqhp;
+       u64 qpx_sqptp;
+       u64 qpx_nspsn;
+/* 0x120*/
+       u64 qpx_nspsnhwm;
+       u64 reserved1;
+       u64 qpx_sdsi;
+       u64 qpx_sdsbc;
+/* 0x140*/
+       u64 qpx_sqwsize;
+       u64 qpx_sqwts;
+       u64 qpx_lsn;
+       u64 qpx_nssn;
+/* 0x160 */
+       u64 qpx_mor;
+       u64 qpx_cor;
+       u64 qpx_sqsize;
+       u64 qpx_erc;
+/* 0x180*/
+       u64 qpx_rnrrc;
+       u64 qpx_ernrwt;
+       u64 qpx_rnrresp;
+       u64 qpx_lmsna;
+/* 0x1a0 */
+       u64 qpx_sqhpc;
+       u64 qpx_sqcptp;
+       u64 qpx_sigt;
+       u64 qpx_wqecnt;
+/* 0x1c0*/
+       u64 qpx_rqhp;
+       u64 qpx_rqptp;
+       u64 qpx_rqsize;
+       u64 qpx_nrr;
+/* 0x1e0*/
+       u64 qpx_rdmac;
+       u64 qpx_nrpsn;
+       u64 qpx_lapsn;
+       u64 qpx_lcr;
+/* 0x200*/
+       u64 qpx_rwc;
+       u64 qpx_rwva;
+       u64 qpx_rdsi;
+       u64 qpx_rdsbc;
+/* 0x220*/
+       u64 qpx_rqwsize;
+       u64 qpx_crmsn;
+       u64 qpx_rdd;
+       u64 qpx_larpsn;
+/* 0x240*/
+       u64 qpx_pd;
+       u64 qpx_scqn;
+       u64 qpx_rcqn;
+       u64 qpx_aeqn;
+/* 0x260*/
+       u64 qpx_aaelog;
+       u64 qpx_ram;
+       u64 qpx_rdmaqe0;
+       u64 qpx_rdmaqe1;
+/* 0x280*/
+       u64 qpx_rdmaqe2;
+       u64 qpx_rdmaqe3;
+       u64 qpx_nrpsnhwm;
+/* 0x298*/
+       u64 reserved[(0x400 - 0x298) / 8];
+/* 0x400 extended data */
+       u64 reserved_ext[(0x500 - 0x400) / 8];
+/* 0x500 */
+       u64 reserved2[(0x1000 - 0x500) / 8];
+/* 0x1000      */
+};
+
+#define QPX_SQADDER EHCA_BMASK_IBM(48, 63)
+#define QPX_RQADDER EHCA_BMASK_IBM(48, 63)
+#define QPX_AAELOG_RESET_SRQ_LIMIT EHCA_BMASK_IBM(3, 3)
+
+#define QPTEMM_OFFSET(x) offsetof(struct hipz_qptemm, x)
+
+/* MRMWPT Entry Memory Map */
+struct hipz_mrmwmm {
+       /* 0x00 */
+       u64 mrx_hcr;
+
+       u64 mrx_c;
+       u64 mrx_herr;
+       u64 mrx_aer;
+       /* 0x20 */
+       u64 mrx_pp;
+       u64 reserved1;
+       u64 reserved2;
+       u64 reserved3;
+       /* 0x40 */
+       u64 reserved4[(0x200 - 0x40) / 8];
+       /* 0x200 */
+       u64 mrx_ctl[64];
+
+};
+
+#define MRMWMM_OFFSET(x) offsetof(struct hipz_mrmwmm, x)
+
+struct hipz_qpedmm {
+       /* 0x00 */
+       u64 reserved0[(0x400) / 8];
+       /* 0x400 */
+       u64 qpedx_phh;
+       u64 qpedx_ppsgp;
+       /* 0x410 */
+       u64 qpedx_ppsgu;
+       u64 qpedx_ppdgp;
+       /* 0x420 */
+       u64 qpedx_ppdgu;
+       u64 qpedx_aph;
+       /* 0x430 */
+       u64 qpedx_apsgp;
+       u64 qpedx_apsgu;
+       /* 0x440 */
+       u64 qpedx_apdgp;
+       u64 qpedx_apdgu;
+       /* 0x450 */
+       u64 qpedx_apav;
+       u64 qpedx_apsav;
+       /* 0x460  */
+       u64 qpedx_hcr;
+       u64 reserved1[4];
+       /* 0x488 */
+       u64 qpedx_rrl0;
+       /* 0x490 */
+       u64 qpedx_rrrkey0;
+       u64 qpedx_rrva0;
+       /* 0x4a0 */
+       u64 reserved2;
+       u64 qpedx_rrl1;
+       /* 0x4b0 */
+       u64 qpedx_rrrkey1;
+       u64 qpedx_rrva1;
+       /* 0x4c0 */
+       u64 reserved3;
+       u64 qpedx_rrl2;
+       /* 0x4d0 */
+       u64 qpedx_rrrkey2;
+       u64 qpedx_rrva2;
+       /* 0x4e0 */
+       u64 reserved4;
+       u64 qpedx_rrl3;
+       /* 0x4f0 */
+       u64 qpedx_rrrkey3;
+       u64 qpedx_rrva3;
+};
+
+#define QPEDMM_OFFSET(x) offsetof(struct hipz_qpedmm, x)
+
+/* CQ Table Entry Memory Map */
+struct hipz_cqtemm {
+       u64 cqx_hcr;
+       u64 cqx_c;
+       u64 cqx_herr;
+       u64 cqx_aer;
+/* 0x20  */
+       u64 cqx_ptp;
+       u64 cqx_tp;
+       u64 cqx_fec;
+       u64 cqx_feca;
+/* 0x40  */
+       u64 cqx_ep;
+       u64 cqx_eq;
+/* 0x50  */
+       u64 reserved1;
+       u64 cqx_n0;
+/* 0x60  */
+       u64 cqx_n1;
+       u64 reserved2[(0x1000 - 0x60) / 8];
+/* 0x1000 */
+};
+
+#define CQX_FEC_CQE_CNT           EHCA_BMASK_IBM(32, 63)
+#define CQX_FECADDER              EHCA_BMASK_IBM(32, 63)
+#define CQX_N0_GENERATE_SOLICITED_COMP_EVENT EHCA_BMASK_IBM(0, 0)
+#define CQX_N1_GENERATE_COMP_EVENT EHCA_BMASK_IBM(0, 0)
+
+#define CQTEMM_OFFSET(x) offsetof(struct hipz_cqtemm, x)
+
+/* EQ Table Entry Memory Map */
+struct hipz_eqtemm {
+       u64 eqx_hcr;
+       u64 eqx_c;
+
+       u64 eqx_herr;
+       u64 eqx_aer;
+/* 0x20 */
+       u64 eqx_ptp;
+       u64 eqx_tp;
+       u64 eqx_ssba;
+       u64 eqx_psba;
+
+/* 0x40 */
+       u64 eqx_cec;
+       u64 eqx_meql;
+       u64 eqx_xisbi;
+       u64 eqx_xisc;
+/* 0x60 */
+       u64 eqx_it;
+
+};
+
+#define EQTEMM_OFFSET(x) offsetof(struct hipz_eqtemm, x)
+
+/* access control defines for MR/MW */
+#define HIPZ_ACCESSCTRL_L_WRITE  0x00800000
+#define HIPZ_ACCESSCTRL_R_WRITE  0x00400000
+#define HIPZ_ACCESSCTRL_R_READ   0x00200000
+#define HIPZ_ACCESSCTRL_R_ATOMIC 0x00100000
+#define HIPZ_ACCESSCTRL_MW_BIND  0x00080000
+
+/* query hca response block */
+struct hipz_query_hca {
+       u32 cur_reliable_dg;
+       u32 cur_qp;
+       u32 cur_cq;
+       u32 cur_eq;
+       u32 cur_mr;
+       u32 cur_mw;
+       u32 cur_ee_context;
+       u32 cur_mcast_grp;
+       u32 cur_qp_attached_mcast_grp;
+       u32 reserved1;
+       u32 cur_ipv6_qp;
+       u32 cur_eth_qp;
+       u32 cur_hp_mr;
+       u32 reserved2[3];
+       u32 max_rd_domain;
+       u32 max_qp;
+       u32 max_cq;
+       u32 max_eq;
+       u32 max_mr;
+       u32 max_hp_mr;
+       u32 max_mw;
+       u32 max_mrwpte;
+       u32 max_special_mrwpte;
+       u32 max_rd_ee_context;
+       u32 max_mcast_grp;
+       u32 max_total_mcast_qp_attach;
+       u32 max_mcast_qp_attach;
+       u32 max_raw_ipv6_qp;
+       u32 max_raw_ethy_qp;
+       u32 internal_clock_frequency;
+       u32 max_pd;
+       u32 max_ah;
+       u32 max_cqe;
+       u32 max_wqes_wq;
+       u32 max_partitions;
+       u32 max_rr_ee_context;
+       u32 max_rr_qp;
+       u32 max_rr_hca;
+       u32 max_act_wqs_ee_context;
+       u32 max_act_wqs_qp;
+       u32 max_sge;
+       u32 max_sge_rd;
+       u32 memory_page_size_supported;
+       u64 max_mr_size;
+       u32 local_ca_ack_delay;
+       u32 num_ports;
+       u32 vendor_id;
+       u32 vendor_part_id;
+       u32 hw_ver;
+       u64 node_guid;
+       u64 hca_cap_indicators;
+       u32 data_counter_register_size;
+       u32 max_shared_rq;
+       u32 max_isns_eq;
+       u32 max_neq;
+} __attribute__ ((packed));
+
+#define HCA_CAP_AH_PORT_NR_CHECK      EHCA_BMASK_IBM( 0,  0)
+#define HCA_CAP_ATOMIC                EHCA_BMASK_IBM( 1,  1)
+#define HCA_CAP_AUTO_PATH_MIG         EHCA_BMASK_IBM( 2,  2)
+#define HCA_CAP_BAD_P_KEY_CTR         EHCA_BMASK_IBM( 3,  3)
+#define HCA_CAP_SQD_RTS_PORT_CHANGE   EHCA_BMASK_IBM( 4,  4)
+#define HCA_CAP_CUR_QP_STATE_MOD      EHCA_BMASK_IBM( 5,  5)
+#define HCA_CAP_INIT_TYPE             EHCA_BMASK_IBM( 6,  6)
+#define HCA_CAP_PORT_ACTIVE_EVENT     EHCA_BMASK_IBM( 7,  7)
+#define HCA_CAP_Q_KEY_VIOL_CTR        EHCA_BMASK_IBM( 8,  8)
+#define HCA_CAP_WQE_RESIZE            EHCA_BMASK_IBM( 9,  9)
+#define HCA_CAP_RAW_PACKET_MCAST      EHCA_BMASK_IBM(10, 10)
+#define HCA_CAP_SHUTDOWN_PORT         EHCA_BMASK_IBM(11, 11)
+#define HCA_CAP_RC_LL_QP              EHCA_BMASK_IBM(12, 12)
+#define HCA_CAP_SRQ                   EHCA_BMASK_IBM(13, 13)
+#define HCA_CAP_UD_LL_QP              EHCA_BMASK_IBM(16, 16)
+#define HCA_CAP_RESIZE_MR             EHCA_BMASK_IBM(17, 17)
+#define HCA_CAP_MINI_QP               EHCA_BMASK_IBM(18, 18)
+#define HCA_CAP_H_ALLOC_RES_SYNC      EHCA_BMASK_IBM(19, 19)
+
+/* query port response block */
+struct hipz_query_port {
+       u32 state;
+       u32 bad_pkey_cntr;
+       u32 lmc;
+       u32 lid;
+       u32 subnet_timeout;
+       u32 qkey_viol_cntr;
+       u32 sm_sl;
+       u32 sm_lid;
+       u32 capability_mask;
+       u32 init_type_reply;
+       u32 pkey_tbl_len;
+       u32 gid_tbl_len;
+       u64 gid_prefix;
+       u32 port_nr;
+       u16 pkey_entries[16];
+       u8  reserved1[32];
+       u32 trent_size;
+       u32 trbuf_size;
+       u64 max_msg_sz;
+       u32 max_mtu;
+       u32 vl_cap;
+       u32 phys_pstate;
+       u32 phys_state;
+       u32 phys_speed;
+       u32 phys_width;
+       u8  reserved2[1884];
+       u64 guid_entries[255];
+} __attribute__ ((packed));
+
+#endif
diff --git a/drivers/staging/rdma/ehca/ipz_pt_fn.c b/drivers/staging/rdma/ehca/ipz_pt_fn.c
new file mode 100644 (file)
index 0000000..7ffc748
--- /dev/null
@@ -0,0 +1,289 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  internal queue handling
+ *
+ *  Authors: Waleri Fomin <fomin@de.ibm.com>
+ *           Reinhard Ernst <rernst@de.ibm.com>
+ *           Christoph Raisch <raisch@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/slab.h>
+
+#include "ehca_tools.h"
+#include "ipz_pt_fn.h"
+#include "ehca_classes.h"
+
+#define PAGES_PER_KPAGE (PAGE_SIZE >> EHCA_PAGESHIFT)
+
+struct kmem_cache *small_qp_cache;
+
+void *ipz_qpageit_get_inc(struct ipz_queue *queue)
+{
+       void *ret = ipz_qeit_get(queue);
+       queue->current_q_offset += queue->pagesize;
+       if (queue->current_q_offset > queue->queue_length) {
+               queue->current_q_offset -= queue->pagesize;
+               ret = NULL;
+       }
+       if (((u64)ret) % queue->pagesize) {
+               ehca_gen_err("ERROR!! not at PAGE-Boundary");
+               return NULL;
+       }
+       return ret;
+}
+
+void *ipz_qeit_eq_get_inc(struct ipz_queue *queue)
+{
+       void *ret = ipz_qeit_get(queue);
+       u64 last_entry_in_q = queue->queue_length - queue->qe_size;
+
+       queue->current_q_offset += queue->qe_size;
+       if (queue->current_q_offset > last_entry_in_q) {
+               queue->current_q_offset = 0;
+               queue->toggle_state = (~queue->toggle_state) & 1;
+       }
+
+       return ret;
+}
+
+int ipz_queue_abs_to_offset(struct ipz_queue *queue, u64 addr, u64 *q_offset)
+{
+       int i;
+       for (i = 0; i < queue->queue_length / queue->pagesize; i++) {
+               u64 page = __pa(queue->queue_pages[i]);
+               if (addr >= page && addr < page + queue->pagesize) {
+                       *q_offset = addr - page + i * queue->pagesize;
+                       return 0;
+               }
+       }
+       return -EINVAL;
+}
+
+#if PAGE_SHIFT < EHCA_PAGESHIFT
+#error Kernel pages must be at least as large as eHCA pages (4K)!
+#endif
+
+/*
+ * allocate pages for queue:
+ * outer loop allocates whole kernel pages (page aligned) and
+ * inner loop divides a kernel page into smaller hca queue pages
+ */
+static int alloc_queue_pages(struct ipz_queue *queue, const u32 nr_of_pages)
+{
+       int k, f = 0;
+       u8 *kpage;
+
+       while (f < nr_of_pages) {
+               kpage = (u8 *)get_zeroed_page(GFP_KERNEL);
+               if (!kpage)
+                       goto out;
+
+               for (k = 0; k < PAGES_PER_KPAGE && f < nr_of_pages; k++) {
+                       queue->queue_pages[f] = (struct ipz_page *)kpage;
+                       kpage += EHCA_PAGESIZE;
+                       f++;
+               }
+       }
+       return 1;
+
+out:
+       for (f = 0; f < nr_of_pages && queue->queue_pages[f];
+            f += PAGES_PER_KPAGE)
+               free_page((unsigned long)(queue->queue_pages)[f]);
+       return 0;
+}
+
+static int alloc_small_queue_page(struct ipz_queue *queue, struct ehca_pd *pd)
+{
+       int order = ilog2(queue->pagesize) - 9;
+       struct ipz_small_queue_page *page;
+       unsigned long bit;
+
+       mutex_lock(&pd->lock);
+
+       if (!list_empty(&pd->free[order]))
+               page = list_entry(pd->free[order].next,
+                                 struct ipz_small_queue_page, list);
+       else {
+               page = kmem_cache_zalloc(small_qp_cache, GFP_KERNEL);
+               if (!page)
+                       goto out;
+
+               page->page = get_zeroed_page(GFP_KERNEL);
+               if (!page->page) {
+                       kmem_cache_free(small_qp_cache, page);
+                       goto out;
+               }
+
+               list_add(&page->list, &pd->free[order]);
+       }
+
+       bit = find_first_zero_bit(page->bitmap, IPZ_SPAGE_PER_KPAGE >> order);
+       __set_bit(bit, page->bitmap);
+       page->fill++;
+
+       if (page->fill == IPZ_SPAGE_PER_KPAGE >> order)
+               list_move(&page->list, &pd->full[order]);
+
+       mutex_unlock(&pd->lock);
+
+       queue->queue_pages[0] = (void *)(page->page | (bit << (order + 9)));
+       queue->small_page = page;
+       queue->offset = bit << (order + 9);
+       return 1;
+
+out:
+       ehca_err(pd->ib_pd.device, "failed to allocate small queue page");
+       mutex_unlock(&pd->lock);
+       return 0;
+}
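
Editorial aside (not part of the diff): the sub-page bookkeeping above is easier to follow with a small worked example. The sketch below is hypothetical; it only restates the arithmetic visible in this hunk (order = ilog2(pagesize) - 9, one 4 KiB kernel page split into IPZ_SPAGE_PER_KPAGE sub-pages) and assumes the usual kernel headers for ilog2().

/*
 * Hedged sketch: for a 512-byte queue page (order 0) one 4 KiB kernel page
 * holds 8 sub-pages, so bitmap bit 3 maps to byte offset 3 << 9 = 1536;
 * for a 1 KiB queue page (order 1) only 4 sub-pages fit and bit 3 would map
 * to 3 << 10 = 3072.  This mirrors the "bit << (order + 9)" computation
 * used by alloc_small_queue_page() above.
 */
static inline unsigned long small_page_byte_offset(unsigned long bit, u32 pagesize)
{
	int order = ilog2(pagesize) - 9;	/* 512 B => 0, 1 KiB => 1, ... */

	return bit << (order + 9);		/* offset inside the kernel page */
}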
+
+static void free_small_queue_page(struct ipz_queue *queue, struct ehca_pd *pd)
+{
+       int order = ilog2(queue->pagesize) - 9;
+       struct ipz_small_queue_page *page = queue->small_page;
+       unsigned long bit;
+       int free_page = 0;
+
+       bit = ((unsigned long)queue->queue_pages[0] & ~PAGE_MASK)
+               >> (order + 9);
+
+       mutex_lock(&pd->lock);
+
+       __clear_bit(bit, page->bitmap);
+       page->fill--;
+
+       if (page->fill == 0) {
+               list_del(&page->list);
+               free_page = 1;
+       }
+
+       if (page->fill == (IPZ_SPAGE_PER_KPAGE >> order) - 1)
+               /* the page was full until we freed the chunk */
+               list_move_tail(&page->list, &pd->free[order]);
+
+       mutex_unlock(&pd->lock);
+
+       if (free_page) {
+               free_page(page->page);
+               kmem_cache_free(small_qp_cache, page);
+       }
+}
+
+int ipz_queue_ctor(struct ehca_pd *pd, struct ipz_queue *queue,
+                  const u32 nr_of_pages, const u32 pagesize,
+                  const u32 qe_size, const u32 nr_of_sg,
+                  int is_small)
+{
+       if (pagesize > PAGE_SIZE) {
+               ehca_gen_err("FATAL ERROR: pagesize=%x "
+                            "is greater than kernel page size", pagesize);
+               return 0;
+       }
+
+       /* init queue fields */
+       queue->queue_length = nr_of_pages * pagesize;
+       queue->pagesize = pagesize;
+       queue->qe_size = qe_size;
+       queue->act_nr_of_sg = nr_of_sg;
+       queue->current_q_offset = 0;
+       queue->toggle_state = 1;
+       queue->small_page = NULL;
+
+       /* allocate queue page pointers */
+       queue->queue_pages = kzalloc(nr_of_pages * sizeof(void *),
+                                    GFP_KERNEL | __GFP_NOWARN);
+       if (!queue->queue_pages) {
+               queue->queue_pages = vzalloc(nr_of_pages * sizeof(void *));
+               if (!queue->queue_pages) {
+                       ehca_gen_err("Couldn't allocate queue page list");
+                       return 0;
+               }
+       }
+
+       /* allocate actual queue pages */
+       if (is_small) {
+               if (!alloc_small_queue_page(queue, pd))
+                       goto ipz_queue_ctor_exit0;
+       } else
+               if (!alloc_queue_pages(queue, nr_of_pages))
+                       goto ipz_queue_ctor_exit0;
+
+       return 1;
+
+ipz_queue_ctor_exit0:
+       ehca_gen_err("Couldn't alloc pages queue=%p "
+                "nr_of_pages=%x",  queue, nr_of_pages);
+       kvfree(queue->queue_pages);
+
+       return 0;
+}
+
+int ipz_queue_dtor(struct ehca_pd *pd, struct ipz_queue *queue)
+{
+       int i, nr_pages;
+
+       if (!queue || !queue->queue_pages) {
+               ehca_gen_dbg("queue or queue_pages is NULL");
+               return 0;
+       }
+
+       if (queue->small_page)
+               free_small_queue_page(queue, pd);
+       else {
+               nr_pages = queue->queue_length / queue->pagesize;
+               for (i = 0; i < nr_pages; i += PAGES_PER_KPAGE)
+                       free_page((unsigned long)queue->queue_pages[i]);
+       }
+
+       kvfree(queue->queue_pages);
+
+       return 1;
+}
+
+int ehca_init_small_qp_cache(void)
+{
+       small_qp_cache = kmem_cache_create("ehca_cache_small_qp",
+                                          sizeof(struct ipz_small_queue_page),
+                                          0, SLAB_HWCACHE_ALIGN, NULL);
+       if (!small_qp_cache)
+               return -ENOMEM;
+
+       return 0;
+}
+
+void ehca_cleanup_small_qp_cache(void)
+{
+       kmem_cache_destroy(small_qp_cache);
+}
diff --git a/drivers/staging/rdma/ehca/ipz_pt_fn.h b/drivers/staging/rdma/ehca/ipz_pt_fn.h
new file mode 100644 (file)
index 0000000..a801274
--- /dev/null
@@ -0,0 +1,289 @@
+/*
+ *  IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ *  internal queue handling
+ *
+ *  Authors: Waleri Fomin <fomin@de.ibm.com>
+ *           Reinhard Ernst <rernst@de.ibm.com>
+ *           Christoph Raisch <raisch@de.ibm.com>
+ *
+ *  Copyright (c) 2005 IBM Corporation
+ *
+ *  All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __IPZ_PT_FN_H__
+#define __IPZ_PT_FN_H__
+
+#define EHCA_PAGESHIFT   12
+#define EHCA_PAGESIZE   4096UL
+#define EHCA_PAGEMASK   (~(EHCA_PAGESIZE-1))
+#define EHCA_PT_ENTRIES 512UL
+
+#include "ehca_tools.h"
+#include "ehca_qes.h"
+
+struct ehca_pd;
+struct ipz_small_queue_page;
+
+extern struct kmem_cache *small_qp_cache;
+
+/* struct generic ehca page */
+struct ipz_page {
+       u8 entries[EHCA_PAGESIZE];
+};
+
+#define IPZ_SPAGE_PER_KPAGE (PAGE_SIZE / 512)
+
+struct ipz_small_queue_page {
+       unsigned long page;
+       unsigned long bitmap[IPZ_SPAGE_PER_KPAGE / BITS_PER_LONG];
+       int fill;
+       void *mapped_addr;
+       u32 mmap_count;
+       struct list_head list;
+};
+
+/* struct generic queue in linux kernel virtual memory (kv) */
+struct ipz_queue {
+       u64 current_q_offset;   /* current queue entry */
+
+       struct ipz_page **queue_pages;  /* array of pages belonging to queue */
+       u32 qe_size;            /* queue entry size */
+       u32 act_nr_of_sg;
+       u32 queue_length;       /* queue length allocated in bytes */
+       u32 pagesize;
+       u32 toggle_state;       /* toggle flag - per page */
+       u32 offset; /* save offset within page for small_qp */
+       struct ipz_small_queue_page *small_page;
+};
+
+/*
+ * return current Queue Entry for a certain q_offset
+ * returns address (kv) of Queue Entry
+ */
+static inline void *ipz_qeit_calc(struct ipz_queue *queue, u64 q_offset)
+{
+       struct ipz_page *current_page;
+       if (q_offset >= queue->queue_length)
+               return NULL;
+       current_page = (queue->queue_pages)[q_offset >> EHCA_PAGESHIFT];
+       return &current_page->entries[q_offset & (EHCA_PAGESIZE - 1)];
+}
+
+/*
+ * return current Queue Entry
+ * returns address (kv) of Queue Entry
+ */
+static inline void *ipz_qeit_get(struct ipz_queue *queue)
+{
+       return ipz_qeit_calc(queue, queue->current_q_offset);
+}
+
+/*
+ * return current Queue Page; increments the Queue Page iterator from
+ * page to page in struct ipz_queue; the last increment returns 0 and does
+ * NOT wrap
+ * returns address (kv) of Queue Page
+ * warning don't use in parallel with ipz_QE_get_inc()
+ */
+void *ipz_qpageit_get_inc(struct ipz_queue *queue);
+
+/*
+ * return current Queue Entry, increment Queue Entry iterator by one
+ * step in struct ipz_queue, will wrap in ringbuffer
+ * returns address (kv) of Queue Entry BEFORE increment
+ * warning don't use in parallel with ipz_qpageit_get_inc()
+ */
+static inline void *ipz_qeit_get_inc(struct ipz_queue *queue)
+{
+       void *ret = ipz_qeit_get(queue);
+       queue->current_q_offset += queue->qe_size;
+       if (queue->current_q_offset >= queue->queue_length) {
+               queue->current_q_offset = 0;
+               /* toggle the valid flag */
+               queue->toggle_state = (~queue->toggle_state) & 1;
+       }
+
+       return ret;
+}
+
+/*
+ * return a bool indicating whether current Queue Entry is valid
+ */
+static inline int ipz_qeit_is_valid(struct ipz_queue *queue)
+{
+       struct ehca_cqe *cqe = ipz_qeit_get(queue);
+       return ((cqe->cqe_flags >> 7) == (queue->toggle_state & 1));
+}
+
+/*
+ * return current Queue Entry, increment Queue Entry iterator by one
+ * step in struct ipz_queue, will wrap in ringbuffer
+ * returns address (kv) of Queue Entry BEFORE increment
+ * returns 0 and does not increment, if wrong valid state
+ * warning don't use in parallel with ipz_qpageit_get_inc()
+ */
+static inline void *ipz_qeit_get_inc_valid(struct ipz_queue *queue)
+{
+       return ipz_qeit_is_valid(queue) ? ipz_qeit_get_inc(queue) : NULL;
+}
+
+/*
+ * resets the Queue Entry iterator
+ * returns address (kv) of first Queue Entry
+ */
+static inline void *ipz_qeit_reset(struct ipz_queue *queue)
+{
+       queue->current_q_offset = 0;
+       return ipz_qeit_get(queue);
+}
+
+/*
+ * return the q_offset corresponding to an absolute address
+ */
+int ipz_queue_abs_to_offset(struct ipz_queue *queue, u64 addr, u64 *q_offset);
+
+/*
+ * return the next queue offset. don't modify the queue.
+ */
+static inline u64 ipz_queue_advance_offset(struct ipz_queue *queue, u64 offset)
+{
+       offset += queue->qe_size;
+       if (offset >= queue->queue_length) offset = 0;
+       return offset;
+}
+
+/* struct generic page table */
+struct ipz_pt {
+       u64 entries[EHCA_PT_ENTRIES];
+};
+
+/* struct page table for a queue, only to be used in pf */
+struct ipz_qpt {
+       /* queue page tables (kv), use u64 because we know the element length */
+       u64 *qpts;
+       u32 n_qpts;
+       u32 n_ptes;       /*  number of page table entries */
+       u64 *current_pte_addr;
+};
+
+/*
+ * constructor for an ipz_queue_t, placement new for ipz_queue_t,
+ * new for all dependent data structures
+ * all QP Tables are the same
+ * flow:
+ *    allocate+pin queue
+ * see ipz_qpt_ctor()
+ * returns true if ok, false if out of memory
+ */
+int ipz_queue_ctor(struct ehca_pd *pd, struct ipz_queue *queue,
+                  const u32 nr_of_pages, const u32 pagesize,
+                  const u32 qe_size, const u32 nr_of_sg,
+                  int is_small);
+
+/*
+ * destructor for an ipz_queue_t
+ *  -# free queue
+ *  see ipz_queue_ctor()
+ *  returns true if ok, false if queue was NULL-ptr or free failed
+ */
+int ipz_queue_dtor(struct ehca_pd *pd, struct ipz_queue *queue);
+
+/*
+ * constructor for an ipz_qpt_t,
+ * placement new for struct ipz_queue, new for all dependent data structures
+ * all QP Tables are the same,
+ * flow:
+ * -# allocate+pin queue
+ * -# initialise ptcb
+ * -# allocate+pin PTs
+ * -# link PTs to a ring, according to HCA Arch, set bit62 if needed
+ * -# the ring must have room for exactly nr_of_PTEs
+ * see ipz_qpt_ctor()
+ */
+void ipz_qpt_ctor(struct ipz_qpt *qpt,
+                 const u32 nr_of_qes,
+                 const u32 pagesize,
+                 const u32 qe_size,
+                 const u8 lowbyte, const u8 toggle,
+                 u32 * act_nr_of_QEs, u32 * act_nr_of_pages);
+
+/*
+ * return current Queue Entry, increment Queue Entry iterator by one
+ * step in struct ipz_queue, will wrap in ringbuffer
+ * returns address (kv) of Queue Entry BEFORE increment
+ * warning don't use in parallel with ipz_qpageit_get_inc()
+ * warning unpredictable results may occur if steps>act_nr_of_queue_entries
+ * fix EQ page problems
+ */
+void *ipz_qeit_eq_get_inc(struct ipz_queue *queue);
+
+/*
+ * return current Event Queue Entry, increment Queue Entry iterator
+ * by one step in struct ipz_queue if valid, will wrap in ringbuffer
+ * returns address (kv) of Queue Entry BEFORE increment
+ * returns 0 and does not increment, if wrong valid state
+ * warning don't use in parallel with ipz_queue_QPageit_get_inc()
+ * warning unpredictable results may occur if steps>act_nr_of_queue_entries
+ */
+static inline void *ipz_eqit_eq_get_inc_valid(struct ipz_queue *queue)
+{
+       void *ret = ipz_qeit_get(queue);
+       u32 qe = *(u8 *)ret;
+       if ((qe >> 7) != (queue->toggle_state & 1))
+               return NULL;
+       ipz_qeit_eq_get_inc(queue); /* this is a good one */
+       return ret;
+}
+
+static inline void *ipz_eqit_eq_peek_valid(struct ipz_queue *queue)
+{
+       void *ret = ipz_qeit_get(queue);
+       u32 qe = *(u8 *)ret;
+       if ((qe >> 7) != (queue->toggle_state & 1))
+               return NULL;
+       return ret;
+}
+
+/* returns address (GX) of first queue entry */
+static inline u64 ipz_qpt_get_firstpage(struct ipz_qpt *qpt)
+{
+       return be64_to_cpu(qpt->qpts[0]);
+}
+
+/* returns address (kv) of first page of queue page table */
+static inline void *ipz_qpt_get_qpt(struct ipz_qpt *qpt)
+{
+       return qpt->qpts;
+}
+
+#endif                         /* __IPZ_PT_FN_H__ */
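
Editorial aside, not part of the diff: to show how the constructor, the entry iterator, and the destructor declared above fit together, here is a minimal hedged sketch of a caller. It is illustrative only: the page count, entry size, and the idea of a consumer are assumptions, and it presumes the usual kernel headers plus a valid struct ehca_pd from the surrounding driver.

/* Hedged usage sketch -- sizes and the consumer are illustrative, not from the driver. */
static int demo_walk_queue(struct ehca_pd *pd)
{
	struct ipz_queue q;
	u32 i, nr_entries;

	/* 4 regular (not small) pages of EHCA_PAGESIZE, 64-byte entries, no SG */
	if (!ipz_queue_ctor(pd, &q, 4, EHCA_PAGESIZE, 64, 0, 0))
		return -ENOMEM;

	nr_entries = q.queue_length / q.qe_size;
	for (i = 0; i < nr_entries; i++) {
		void *qe = ipz_qeit_get_inc(&q); /* wraps and flips toggle_state at the end */

		(void)qe;	/* a real caller would hand qe to its consumer here */
	}

	ipz_queue_dtor(pd, &q);
	return 0;
}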
diff --git a/drivers/staging/rdma/hfi1/Kconfig b/drivers/staging/rdma/hfi1/Kconfig
new file mode 100644 (file)
index 0000000..fd25078
--- /dev/null
@@ -0,0 +1,37 @@
+config INFINIBAND_HFI1
+       tristate "Intel OPA Gen1 support"
+       depends on X86_64
+       default m
+       ---help---
+       This is a low-level driver for the Intel OPA Gen1 adapter.
+config HFI1_DEBUG_SDMA_ORDER
+       bool "HFI1 SDMA Order debug"
+       depends on INFINIBAND_HFI1
+       default n
+       ---help---
+       This is a debug flag to test for out-of-order
+       SDMA completions for unit testing.
+config HFI1_VERBS_31BIT_PSN
+       bool "HFI1 enable 31 bit PSN"
+       depends on INFINIBAND_HFI1
+       default y
+       ---help---
+       Setting this enables 31-bit PSNs
+       for verbs RC/UC.
+config SDMA_VERBOSITY
+       bool "Config SDMA Verbosity"
+       depends on INFINIBAND_HFI1
+       default n
+       ---help---
+       This is a configuration flag to enable verbose
+       SDMA debug
+config PRESCAN_RXQ
+       bool "Enable prescanning of the RX queue for ECNs"
+       depends on INFINIBAND_HFI1
+       default n
+       ---help---
+       This option toggles the prescanning of the receive queue for
+       Explicit Congestion Notifications. If an ECN is detected, it
+       is processed as quickly as possible and the ECN is toggled off.
+       After the prescanning step, the receive queue is processed as
+       usual.
diff --git a/drivers/staging/rdma/hfi1/Makefile b/drivers/staging/rdma/hfi1/Makefile
new file mode 100644 (file)
index 0000000..2e5daa6
--- /dev/null
@@ -0,0 +1,19 @@
+#
+# HFI driver
+#
+#
+#
+# Called from the kernel module build system.
+#
+obj-$(CONFIG_INFINIBAND_HFI1) += hfi1.o
+
+hfi1-y := chip.o cq.o device.o diag.o dma.o driver.o eprom.o file_ops.o firmware.o \
+       init.o intr.o keys.o mad.o mmap.o mr.o pcie.o pio.o pio_copy.o \
+       qp.o qsfp.o rc.o ruc.o sdma.o srq.o sysfs.o trace.o twsi.o \
+       uc.o ud.o user_pages.o user_sdma.o verbs_mcast.o verbs.o
+hfi1-$(CONFIG_DEBUG_FS) += debugfs.o
+
+CFLAGS_trace.o = -I$(src)
+ifdef MVERSION
+CFLAGS_driver.o = -DHFI_DRIVER_VERSION_BASE=\"$(MVERSION)\"
+endif
diff --git a/drivers/staging/rdma/hfi1/TODO b/drivers/staging/rdma/hfi1/TODO
new file mode 100644 (file)
index 0000000..05de0da
--- /dev/null
@@ -0,0 +1,6 @@
+July, 2015
+
+- Remove unneeded file entries in sysfs
+- Remove software processing of IB protocol and place in library for use
+  by qib, ipath (if still present), hfi1, and eventually soft-roce
+
diff --git a/drivers/staging/rdma/hfi1/chip.c b/drivers/staging/rdma/hfi1/chip.c
new file mode 100644 (file)
index 0000000..654eafe
--- /dev/null
@@ -0,0 +1,10798 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * This file contains all of the code that is specific to the HFI chip
+ */
+
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+
+#include "hfi.h"
+#include "trace.h"
+#include "mad.h"
+#include "pio.h"
+#include "sdma.h"
+#include "eprom.h"
+
+#define NUM_IB_PORTS 1
+
+uint kdeth_qp;
+module_param_named(kdeth_qp, kdeth_qp, uint, S_IRUGO);
+MODULE_PARM_DESC(kdeth_qp, "Set the KDETH queue pair prefix");
+
+uint num_vls = HFI1_MAX_VLS_SUPPORTED;
+module_param(num_vls, uint, S_IRUGO);
+MODULE_PARM_DESC(num_vls, "Set number of Virtual Lanes to use (1-8)");
+
+/*
+ * Default time to aggregate two 10K packets from the idle state
+ * (timer not running). The timer starts at the end of the first packet,
+ * so only the time for one 10K packet and header plus a bit extra is needed.
+ * 10 * 1024 + 64 header bytes = 10304 bytes
+ * 10304 bytes / 12.5 GB/s = 824.32 ns
+ */
+uint rcv_intr_timeout = (824 + 16); /* 16 is for coalescing interrupt */
+module_param(rcv_intr_timeout, uint, S_IRUGO);
+MODULE_PARM_DESC(rcv_intr_timeout, "Receive interrupt mitigation timeout in ns");
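
Editorial aside (not part of the diff): the 824 in the default above follows directly from the numbers stated in the comment; the arithmetic, spelled out:

/*
 * Worked check of the comment above:
 *   10 * 1024 + 64           = 10304 bytes
 *   10304 bytes / 12.5 GB/s  = 10304 / 12.5 ns = 824.32 ns
 * rounded down to 824 ns, plus 16 ns of slack for the coalescing
 * interrupt, giving the (824 + 16) default.
 */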
+
+uint rcv_intr_count = 16; /* same as qib */
+module_param(rcv_intr_count, uint, S_IRUGO);
+MODULE_PARM_DESC(rcv_intr_count, "Receive interrupt mitigation count");
+
+ushort link_crc_mask = SUPPORTED_CRCS;
+module_param(link_crc_mask, ushort, S_IRUGO);
+MODULE_PARM_DESC(link_crc_mask, "CRCs to use on the link");
+
+uint loopback;
+module_param_named(loopback, loopback, uint, S_IRUGO);
+MODULE_PARM_DESC(loopback, "Put into loopback mode (1 = serdes, 3 = external cable)");
+
+/* Other driver tunables */
+uint rcv_intr_dynamic = 1; /* enable dynamic mode for rcv int mitigation */
+static ushort crc_14b_sideband = 1;
+static uint use_flr = 1;
+uint quick_linkup; /* skip LNI */
+
+struct flag_table {
+       u64 flag;       /* the flag */
+       char *str;      /* description string */
+       u16 extra;      /* extra information */
+       u16 unused0;
+       u32 unused1;
+};
+
+/* str must be a string constant */
+#define FLAG_ENTRY(str, extra, flag) {flag, str, extra}
+#define FLAG_ENTRY0(str, flag) {flag, str, 0}
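
Editorial aside, not part of the diff: each of the flag tables below pairs a hardware status bit (the SMASK) with a human-readable name. One plausible, purely illustrative way such a table is consumed is to walk it against a latched status value and log every entry whose bit is set; the helper name below is hypothetical and is not the driver's actual reporting path.

/* Hedged sketch: report every table entry whose flag bit is set in "status". */
static void demo_report_flags(u64 status, const struct flag_table *table,
			      size_t num_entries)
{
	size_t i;

	for (i = 0; i < num_entries; i++)
		if (status & table[i].flag)
			pr_err("error flag set: %s\n", table[i].str);
}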
+
+/* Send Error Consequences */
+#define SEC_WRITE_DROPPED      0x1
+#define SEC_PACKET_DROPPED     0x2
+#define SEC_SC_HALTED          0x4     /* per-context only */
+#define SEC_SPC_FREEZE         0x8     /* per-HFI only */
+
+#define VL15CTXT                  1
+#define MIN_KERNEL_KCTXTS         2
+#define NUM_MAP_REGS             32
+
+/* Bit offset into the GUID which carries HFI id information */
+#define GUID_HFI_INDEX_SHIFT     39
+
+/* extract the emulation revision */
+#define emulator_rev(dd) ((dd)->irev >> 8)
+/* parallel and serial emulation versions are 3 and 4 respectively */
+#define is_emulator_p(dd) ((((dd)->irev) & 0xf) == 3)
+#define is_emulator_s(dd) ((((dd)->irev) & 0xf) == 4)
+
+/* RSM fields */
+
+/* packet type */
+#define IB_PACKET_TYPE         2ull
+#define QW_SHIFT               6ull
+/* QPN[7..1] */
+#define QPN_WIDTH              7ull
+
+/* LRH.BTH: QW 0, OFFSET 48 - for match */
+#define LRH_BTH_QW             0ull
+#define LRH_BTH_BIT_OFFSET     48ull
+#define LRH_BTH_OFFSET(off)    ((LRH_BTH_QW << QW_SHIFT) | (off))
+#define LRH_BTH_MATCH_OFFSET   LRH_BTH_OFFSET(LRH_BTH_BIT_OFFSET)
+#define LRH_BTH_SELECT
+#define LRH_BTH_MASK           3ull
+#define LRH_BTH_VALUE          2ull
+
+/* LRH.SC[3..0] QW 0, OFFSET 56 - for match */
+#define LRH_SC_QW              0ull
+#define LRH_SC_BIT_OFFSET      56ull
+#define LRH_SC_OFFSET(off)     ((LRH_SC_QW << QW_SHIFT) | (off))
+#define LRH_SC_MATCH_OFFSET    LRH_SC_OFFSET(LRH_SC_BIT_OFFSET)
+#define LRH_SC_MASK            128ull
+#define LRH_SC_VALUE           0ull
+
+/* SC[n..0] QW 0, OFFSET 60 - for select */
+#define LRH_SC_SELECT_OFFSET  ((LRH_SC_QW << QW_SHIFT) | (60ull))
+
+/* QPN[m+n:1] QW 1, OFFSET 1 */
+#define QPN_SELECT_OFFSET      ((1ull << QW_SHIFT) | (1ull))
+
+/* defines to build power on SC2VL table */
+#define SC2VL_VAL( \
+       num, \
+       sc0, sc0val, \
+       sc1, sc1val, \
+       sc2, sc2val, \
+       sc3, sc3val, \
+       sc4, sc4val, \
+       sc5, sc5val, \
+       sc6, sc6val, \
+       sc7, sc7val) \
+( \
+       ((u64)(sc0val) << SEND_SC2VLT##num##_SC##sc0##_SHIFT) | \
+       ((u64)(sc1val) << SEND_SC2VLT##num##_SC##sc1##_SHIFT) | \
+       ((u64)(sc2val) << SEND_SC2VLT##num##_SC##sc2##_SHIFT) | \
+       ((u64)(sc3val) << SEND_SC2VLT##num##_SC##sc3##_SHIFT) | \
+       ((u64)(sc4val) << SEND_SC2VLT##num##_SC##sc4##_SHIFT) | \
+       ((u64)(sc5val) << SEND_SC2VLT##num##_SC##sc5##_SHIFT) | \
+       ((u64)(sc6val) << SEND_SC2VLT##num##_SC##sc6##_SHIFT) | \
+       ((u64)(sc7val) << SEND_SC2VLT##num##_SC##sc7##_SHIFT)   \
+)
+
+#define DC_SC_VL_VAL( \
+       range, \
+       e0, e0val, \
+       e1, e1val, \
+       e2, e2val, \
+       e3, e3val, \
+       e4, e4val, \
+       e5, e5val, \
+       e6, e6val, \
+       e7, e7val, \
+       e8, e8val, \
+       e9, e9val, \
+       e10, e10val, \
+       e11, e11val, \
+       e12, e12val, \
+       e13, e13val, \
+       e14, e14val, \
+       e15, e15val) \
+( \
+       ((u64)(e0val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e0##_SHIFT) | \
+       ((u64)(e1val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e1##_SHIFT) | \
+       ((u64)(e2val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e2##_SHIFT) | \
+       ((u64)(e3val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e3##_SHIFT) | \
+       ((u64)(e4val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e4##_SHIFT) | \
+       ((u64)(e5val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e5##_SHIFT) | \
+       ((u64)(e6val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e6##_SHIFT) | \
+       ((u64)(e7val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e7##_SHIFT) | \
+       ((u64)(e8val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e8##_SHIFT) | \
+       ((u64)(e9val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e9##_SHIFT) | \
+       ((u64)(e10val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e10##_SHIFT) | \
+       ((u64)(e11val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e11##_SHIFT) | \
+       ((u64)(e12val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e12##_SHIFT) | \
+       ((u64)(e13val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e13##_SHIFT) | \
+       ((u64)(e14val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e14##_SHIFT) | \
+       ((u64)(e15val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e15##_SHIFT) \
+)
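
Editorial aside (not part of the diff): both macros above build a single 64-bit CSR value by token-pasting the caller's indices into per-field shift names. A purely illustrative expansion, with made-up SC-to-VL values:

/*
 * Illustrative expansion only (mapping values are made up):
 *   SC2VL_VAL(0,  0, 0,  1, 0,  2, 0,  3, 0,  4, 1,  5, 1,  6, 1,  7, 1)
 * expands to
 *   ((u64)(0) << SEND_SC2VLT0_SC0_SHIFT) | ... | ((u64)(1) << SEND_SC2VLT0_SC7_SHIFT)
 * DC_SC_VL_VAL() follows the same pattern with sixteen
 * DCC_CFG_SC_VL_TABLE_<range>_ENTRY<n>_SHIFT terms per invocation.
 */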
+
+/* all CceStatus sub-block freeze bits */
+#define ALL_FROZE (CCE_STATUS_SDMA_FROZE_SMASK \
+                       | CCE_STATUS_RXE_FROZE_SMASK \
+                       | CCE_STATUS_TXE_FROZE_SMASK \
+                       | CCE_STATUS_TXE_PIO_FROZE_SMASK)
+/* all CceStatus sub-block TXE pause bits */
+#define ALL_TXE_PAUSE (CCE_STATUS_TXE_PIO_PAUSED_SMASK \
+                       | CCE_STATUS_TXE_PAUSED_SMASK \
+                       | CCE_STATUS_SDMA_PAUSED_SMASK)
+/* all CceStatus sub-block RXE pause bits */
+#define ALL_RXE_PAUSE CCE_STATUS_RXE_PAUSED_SMASK
+
+/*
+ * CCE Error flags.
+ */
+static struct flag_table cce_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY0("CceCsrParityErr",
+               CCE_ERR_STATUS_CCE_CSR_PARITY_ERR_SMASK),
+/* 1*/ FLAG_ENTRY0("CceCsrReadBadAddrErr",
+               CCE_ERR_STATUS_CCE_CSR_READ_BAD_ADDR_ERR_SMASK),
+/* 2*/ FLAG_ENTRY0("CceCsrWriteBadAddrErr",
+               CCE_ERR_STATUS_CCE_CSR_WRITE_BAD_ADDR_ERR_SMASK),
+/* 3*/ FLAG_ENTRY0("CceTrgtAsyncFifoParityErr",
+               CCE_ERR_STATUS_CCE_TRGT_ASYNC_FIFO_PARITY_ERR_SMASK),
+/* 4*/ FLAG_ENTRY0("CceTrgtAccessErr",
+               CCE_ERR_STATUS_CCE_TRGT_ACCESS_ERR_SMASK),
+/* 5*/ FLAG_ENTRY0("CceRspdDataParityErr",
+               CCE_ERR_STATUS_CCE_RSPD_DATA_PARITY_ERR_SMASK),
+/* 6*/ FLAG_ENTRY0("CceCli0AsyncFifoParityErr",
+               CCE_ERR_STATUS_CCE_CLI0_ASYNC_FIFO_PARITY_ERR_SMASK),
+/* 7*/ FLAG_ENTRY0("CceCsrCfgBusParityErr",
+               CCE_ERR_STATUS_CCE_CSR_CFG_BUS_PARITY_ERR_SMASK),
+/* 8*/ FLAG_ENTRY0("CceCli2AsyncFifoParityErr",
+               CCE_ERR_STATUS_CCE_CLI2_ASYNC_FIFO_PARITY_ERR_SMASK),
+/* 9*/ FLAG_ENTRY0("CceCli1AsyncFifoPioCrdtParityErr",
+           CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_PIO_CRDT_PARITY_ERR_SMASK),
+/*10*/ FLAG_ENTRY0("CceCli1AsyncFifoPioCrdtParityErr",
+           CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_SDMA_HD_PARITY_ERR_SMASK),
+/*11*/ FLAG_ENTRY0("CceCli1AsyncFifoRxdmaParityError",
+           CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_RXDMA_PARITY_ERROR_SMASK),
+/*12*/ FLAG_ENTRY0("CceCli1AsyncFifoDbgParityError",
+               CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_DBG_PARITY_ERROR_SMASK),
+/*13*/ FLAG_ENTRY0("PcicRetryMemCorErr",
+               CCE_ERR_STATUS_PCIC_RETRY_MEM_COR_ERR_SMASK),
+/*14*/ FLAG_ENTRY0("PcicRetryMemCorErr",
+               CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_COR_ERR_SMASK),
+/*15*/ FLAG_ENTRY0("PcicPostHdQCorErr",
+               CCE_ERR_STATUS_PCIC_POST_HD_QCOR_ERR_SMASK),
+/*16*/ FLAG_ENTRY0("PcicPostHdQCorErr",
+               CCE_ERR_STATUS_PCIC_POST_DAT_QCOR_ERR_SMASK),
+/*17*/ FLAG_ENTRY0("PcicPostHdQCorErr",
+               CCE_ERR_STATUS_PCIC_CPL_HD_QCOR_ERR_SMASK),
+/*18*/ FLAG_ENTRY0("PcicCplDatQCorErr",
+               CCE_ERR_STATUS_PCIC_CPL_DAT_QCOR_ERR_SMASK),
+/*19*/ FLAG_ENTRY0("PcicNPostHQParityErr",
+               CCE_ERR_STATUS_PCIC_NPOST_HQ_PARITY_ERR_SMASK),
+/*20*/ FLAG_ENTRY0("PcicNPostDatQParityErr",
+               CCE_ERR_STATUS_PCIC_NPOST_DAT_QPARITY_ERR_SMASK),
+/*21*/ FLAG_ENTRY0("PcicRetryMemUncErr",
+               CCE_ERR_STATUS_PCIC_RETRY_MEM_UNC_ERR_SMASK),
+/*22*/ FLAG_ENTRY0("PcicRetrySotMemUncErr",
+               CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_UNC_ERR_SMASK),
+/*23*/ FLAG_ENTRY0("PcicPostHdQUncErr",
+               CCE_ERR_STATUS_PCIC_POST_HD_QUNC_ERR_SMASK),
+/*24*/ FLAG_ENTRY0("PcicPostDatQUncErr",
+               CCE_ERR_STATUS_PCIC_POST_DAT_QUNC_ERR_SMASK),
+/*25*/ FLAG_ENTRY0("PcicCplHdQUncErr",
+               CCE_ERR_STATUS_PCIC_CPL_HD_QUNC_ERR_SMASK),
+/*26*/ FLAG_ENTRY0("PcicCplDatQUncErr",
+               CCE_ERR_STATUS_PCIC_CPL_DAT_QUNC_ERR_SMASK),
+/*27*/ FLAG_ENTRY0("PcicTransmitFrontParityErr",
+               CCE_ERR_STATUS_PCIC_TRANSMIT_FRONT_PARITY_ERR_SMASK),
+/*28*/ FLAG_ENTRY0("PcicTransmitBackParityErr",
+               CCE_ERR_STATUS_PCIC_TRANSMIT_BACK_PARITY_ERR_SMASK),
+/*29*/ FLAG_ENTRY0("PcicReceiveParityErr",
+               CCE_ERR_STATUS_PCIC_RECEIVE_PARITY_ERR_SMASK),
+/*30*/ FLAG_ENTRY0("CceTrgtCplTimeoutErr",
+               CCE_ERR_STATUS_CCE_TRGT_CPL_TIMEOUT_ERR_SMASK),
+/*31*/ FLAG_ENTRY0("LATriggered",
+               CCE_ERR_STATUS_LA_TRIGGERED_SMASK),
+/*32*/ FLAG_ENTRY0("CceSegReadBadAddrErr",
+               CCE_ERR_STATUS_CCE_SEG_READ_BAD_ADDR_ERR_SMASK),
+/*33*/ FLAG_ENTRY0("CceSegWriteBadAddrErr",
+               CCE_ERR_STATUS_CCE_SEG_WRITE_BAD_ADDR_ERR_SMASK),
+/*34*/ FLAG_ENTRY0("CceRcplAsyncFifoParityErr",
+               CCE_ERR_STATUS_CCE_RCPL_ASYNC_FIFO_PARITY_ERR_SMASK),
+/*35*/ FLAG_ENTRY0("CceRxdmaConvFifoParityErr",
+               CCE_ERR_STATUS_CCE_RXDMA_CONV_FIFO_PARITY_ERR_SMASK),
+/*36*/ FLAG_ENTRY0("CceMsixTableCorErr",
+               CCE_ERR_STATUS_CCE_MSIX_TABLE_COR_ERR_SMASK),
+/*37*/ FLAG_ENTRY0("CceMsixTableUncErr",
+               CCE_ERR_STATUS_CCE_MSIX_TABLE_UNC_ERR_SMASK),
+/*38*/ FLAG_ENTRY0("CceIntMapCorErr",
+               CCE_ERR_STATUS_CCE_INT_MAP_COR_ERR_SMASK),
+/*39*/ FLAG_ENTRY0("CceIntMapUncErr",
+               CCE_ERR_STATUS_CCE_INT_MAP_UNC_ERR_SMASK),
+/*40*/ FLAG_ENTRY0("CceMsixCsrParityErr",
+               CCE_ERR_STATUS_CCE_MSIX_CSR_PARITY_ERR_SMASK),
+/*41-63 reserved*/
+};
+
+/*
+ * Misc Error flags
+ */
+#define MES(text) MISC_ERR_STATUS_MISC_##text##_ERR_SMASK
+static struct flag_table misc_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY0("CSR_PARITY", MES(CSR_PARITY)),
+/* 1*/ FLAG_ENTRY0("CSR_READ_BAD_ADDR", MES(CSR_READ_BAD_ADDR)),
+/* 2*/ FLAG_ENTRY0("CSR_WRITE_BAD_ADDR", MES(CSR_WRITE_BAD_ADDR)),
+/* 3*/ FLAG_ENTRY0("SBUS_WRITE_FAILED", MES(SBUS_WRITE_FAILED)),
+/* 4*/ FLAG_ENTRY0("KEY_MISMATCH", MES(KEY_MISMATCH)),
+/* 5*/ FLAG_ENTRY0("FW_AUTH_FAILED", MES(FW_AUTH_FAILED)),
+/* 6*/ FLAG_ENTRY0("EFUSE_CSR_PARITY", MES(EFUSE_CSR_PARITY)),
+/* 7*/ FLAG_ENTRY0("EFUSE_READ_BAD_ADDR", MES(EFUSE_READ_BAD_ADDR)),
+/* 8*/ FLAG_ENTRY0("EFUSE_WRITE", MES(EFUSE_WRITE)),
+/* 9*/ FLAG_ENTRY0("EFUSE_DONE_PARITY", MES(EFUSE_DONE_PARITY)),
+/*10*/ FLAG_ENTRY0("INVALID_EEP_CMD", MES(INVALID_EEP_CMD)),
+/*11*/ FLAG_ENTRY0("MBIST_FAIL", MES(MBIST_FAIL)),
+/*12*/ FLAG_ENTRY0("PLL_LOCK_FAIL", MES(PLL_LOCK_FAIL))
+};
+
+/*
+ * TXE PIO Error flags and consequences
+ */
+static struct flag_table pio_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY("PioWriteBadCtxt",
+       SEC_WRITE_DROPPED,
+       SEND_PIO_ERR_STATUS_PIO_WRITE_BAD_CTXT_ERR_SMASK),
+/* 1*/ FLAG_ENTRY("PioWriteAddrParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_WRITE_ADDR_PARITY_ERR_SMASK),
+/* 2*/ FLAG_ENTRY("PioCsrParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_CSR_PARITY_ERR_SMASK),
+/* 3*/ FLAG_ENTRY("PioSbMemFifo0",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO0_ERR_SMASK),
+/* 4*/ FLAG_ENTRY("PioSbMemFifo1",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO1_ERR_SMASK),
+/* 5*/ FLAG_ENTRY("PioPccFifoParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_PCC_FIFO_PARITY_ERR_SMASK),
+/* 6*/ FLAG_ENTRY("PioPecFifoParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_PEC_FIFO_PARITY_ERR_SMASK),
+/* 7*/ FLAG_ENTRY("PioSbrdctlCrrelParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_SBRDCTL_CRREL_PARITY_ERR_SMASK),
+/* 8*/ FLAG_ENTRY("PioSbrdctrlCrrelFifoParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR_SMASK),
+/* 9*/ FLAG_ENTRY("PioPktEvictFifoParityErr",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_FIFO_PARITY_ERR_SMASK),
+/*10*/ FLAG_ENTRY("PioSmPktResetParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_SM_PKT_RESET_PARITY_ERR_SMASK),
+/*11*/ FLAG_ENTRY("PioVlLenMemBank0Unc",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_UNC_ERR_SMASK),
+/*12*/ FLAG_ENTRY("PioVlLenMemBank1Unc",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_UNC_ERR_SMASK),
+/*13*/ FLAG_ENTRY("PioVlLenMemBank0Cor",
+       0,
+       SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_COR_ERR_SMASK),
+/*14*/ FLAG_ENTRY("PioVlLenMemBank1Cor",
+       0,
+       SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_COR_ERR_SMASK),
+/*15*/ FLAG_ENTRY("PioCreditRetFifoParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_CREDIT_RET_FIFO_PARITY_ERR_SMASK),
+/*16*/ FLAG_ENTRY("PioPpmcPblFifo",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_PPMC_PBL_FIFO_ERR_SMASK),
+/*17*/ FLAG_ENTRY("PioInitSmIn",
+       0,
+       SEND_PIO_ERR_STATUS_PIO_INIT_SM_IN_ERR_SMASK),
+/*18*/ FLAG_ENTRY("PioPktEvictSmOrArbSm",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_SM_OR_ARB_SM_ERR_SMASK),
+/*19*/ FLAG_ENTRY("PioHostAddrMemUnc",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_UNC_ERR_SMASK),
+/*20*/ FLAG_ENTRY("PioHostAddrMemCor",
+       0,
+       SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_COR_ERR_SMASK),
+/*21*/ FLAG_ENTRY("PioWriteDataParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_WRITE_DATA_PARITY_ERR_SMASK),
+/*22*/ FLAG_ENTRY("PioStateMachine",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_STATE_MACHINE_ERR_SMASK),
+/*23*/ FLAG_ENTRY("PioWriteQwValidParity",
+       SEC_WRITE_DROPPED|SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_WRITE_QW_VALID_PARITY_ERR_SMASK),
+/*24*/ FLAG_ENTRY("PioBlockQwCountParity",
+       SEC_WRITE_DROPPED|SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_BLOCK_QW_COUNT_PARITY_ERR_SMASK),
+/*25*/ FLAG_ENTRY("PioVlfVlLenParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_VLF_VL_LEN_PARITY_ERR_SMASK),
+/*26*/ FLAG_ENTRY("PioVlfSopParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_VLF_SOP_PARITY_ERR_SMASK),
+/*27*/ FLAG_ENTRY("PioVlFifoParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_VL_FIFO_PARITY_ERR_SMASK),
+/*28*/ FLAG_ENTRY("PioPpmcBqcMemParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_PPMC_BQC_MEM_PARITY_ERR_SMASK),
+/*29*/ FLAG_ENTRY("PioPpmcSopLen",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_PPMC_SOP_LEN_ERR_SMASK),
+/*30-31 reserved*/
+/*32*/ FLAG_ENTRY("PioCurrentFreeCntParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_CURRENT_FREE_CNT_PARITY_ERR_SMASK),
+/*33*/ FLAG_ENTRY("PioLastReturnedCntParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_LAST_RETURNED_CNT_PARITY_ERR_SMASK),
+/*34*/ FLAG_ENTRY("PioPccSopHeadParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_PCC_SOP_HEAD_PARITY_ERR_SMASK),
+/*35*/ FLAG_ENTRY("PioPecSopHeadParityErr",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_PEC_SOP_HEAD_PARITY_ERR_SMASK),
+/*36-63 reserved*/
+};
+
+/* TXE PIO errors that cause an SPC freeze */
+#define ALL_PIO_FREEZE_ERR \
+       (SEND_PIO_ERR_STATUS_PIO_WRITE_ADDR_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_CSR_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO0_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO1_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_PCC_FIFO_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_PEC_FIFO_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_SBRDCTL_CRREL_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_FIFO_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_SM_PKT_RESET_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_UNC_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_UNC_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_CREDIT_RET_FIFO_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_PPMC_PBL_FIFO_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_SM_OR_ARB_SM_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_UNC_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_WRITE_DATA_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_STATE_MACHINE_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_WRITE_QW_VALID_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_BLOCK_QW_COUNT_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_VLF_VL_LEN_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_VLF_SOP_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_VL_FIFO_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_PPMC_BQC_MEM_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_PPMC_SOP_LEN_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_CURRENT_FREE_CNT_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_LAST_RETURNED_CNT_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_PCC_SOP_HEAD_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_PEC_SOP_HEAD_PARITY_ERR_SMASK)
+
+/*
+ * TXE SDMA Error flags
+ */
+static struct flag_table sdma_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY0("SDmaRpyTagErr",
+               SEND_DMA_ERR_STATUS_SDMA_RPY_TAG_ERR_SMASK),
+/* 1*/ FLAG_ENTRY0("SDmaCsrParityErr",
+               SEND_DMA_ERR_STATUS_SDMA_CSR_PARITY_ERR_SMASK),
+/* 2*/ FLAG_ENTRY0("SDmaPcieReqTrackingUncErr",
+               SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_UNC_ERR_SMASK),
+/* 3*/ FLAG_ENTRY0("SDmaPcieReqTrackingCorErr",
+               SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_COR_ERR_SMASK),
+/*04-63 reserved*/
+};
+
+/* TXE SDMA errors that cause an SPC freeze */
+#define ALL_SDMA_FREEZE_ERR  \
+               (SEND_DMA_ERR_STATUS_SDMA_RPY_TAG_ERR_SMASK \
+               | SEND_DMA_ERR_STATUS_SDMA_CSR_PARITY_ERR_SMASK \
+               | SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_UNC_ERR_SMASK)
+
+/*
+ * TXE Egress Error flags
+ */
+#define SEES(text) SEND_EGRESS_ERR_STATUS_##text##_ERR_SMASK
+static struct flag_table egress_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY0("TxPktIntegrityMemCorErr", SEES(TX_PKT_INTEGRITY_MEM_COR)),
+/* 1*/ FLAG_ENTRY0("TxPktIntegrityMemUncErr", SEES(TX_PKT_INTEGRITY_MEM_UNC)),
+/* 2 reserved */
+/* 3*/ FLAG_ENTRY0("TxEgressFifoUnderrunOrParityErr",
+               SEES(TX_EGRESS_FIFO_UNDERRUN_OR_PARITY)),
+/* 4*/ FLAG_ENTRY0("TxLinkdownErr", SEES(TX_LINKDOWN)),
+/* 5*/ FLAG_ENTRY0("TxIncorrectLinkStateErr", SEES(TX_INCORRECT_LINK_STATE)),
+/* 6 reserved */
+/* 7*/ FLAG_ENTRY0("TxPioLaunchIntfParityErr",
+               SEES(TX_PIO_LAUNCH_INTF_PARITY)),
+/* 8*/ FLAG_ENTRY0("TxSdmaLaunchIntfParityErr",
+               SEES(TX_SDMA_LAUNCH_INTF_PARITY)),
+/* 9-10 reserved */
+/*11*/ FLAG_ENTRY0("TxSbrdCtlStateMachineParityErr",
+               SEES(TX_SBRD_CTL_STATE_MACHINE_PARITY)),
+/*12*/ FLAG_ENTRY0("TxIllegalVLErr", SEES(TX_ILLEGAL_VL)),
+/*13*/ FLAG_ENTRY0("TxLaunchCsrParityErr", SEES(TX_LAUNCH_CSR_PARITY)),
+/*14*/ FLAG_ENTRY0("TxSbrdCtlCsrParityErr", SEES(TX_SBRD_CTL_CSR_PARITY)),
+/*15*/ FLAG_ENTRY0("TxConfigParityErr", SEES(TX_CONFIG_PARITY)),
+/*16*/ FLAG_ENTRY0("TxSdma0DisallowedPacketErr",
+               SEES(TX_SDMA0_DISALLOWED_PACKET)),
+/*17*/ FLAG_ENTRY0("TxSdma1DisallowedPacketErr",
+               SEES(TX_SDMA1_DISALLOWED_PACKET)),
+/*18*/ FLAG_ENTRY0("TxSdma2DisallowedPacketErr",
+               SEES(TX_SDMA2_DISALLOWED_PACKET)),
+/*19*/ FLAG_ENTRY0("TxSdma3DisallowedPacketErr",
+               SEES(TX_SDMA3_DISALLOWED_PACKET)),
+/*20*/ FLAG_ENTRY0("TxSdma4DisallowedPacketErr",
+               SEES(TX_SDMA4_DISALLOWED_PACKET)),
+/*21*/ FLAG_ENTRY0("TxSdma5DisallowedPacketErr",
+               SEES(TX_SDMA5_DISALLOWED_PACKET)),
+/*22*/ FLAG_ENTRY0("TxSdma6DisallowedPacketErr",
+               SEES(TX_SDMA6_DISALLOWED_PACKET)),
+/*23*/ FLAG_ENTRY0("TxSdma7DisallowedPacketErr",
+               SEES(TX_SDMA7_DISALLOWED_PACKET)),
+/*24*/ FLAG_ENTRY0("TxSdma8DisallowedPacketErr",
+               SEES(TX_SDMA8_DISALLOWED_PACKET)),
+/*25*/ FLAG_ENTRY0("TxSdma9DisallowedPacketErr",
+               SEES(TX_SDMA9_DISALLOWED_PACKET)),
+/*26*/ FLAG_ENTRY0("TxSdma10DisallowedPacketErr",
+               SEES(TX_SDMA10_DISALLOWED_PACKET)),
+/*27*/ FLAG_ENTRY0("TxSdma11DisallowedPacketErr",
+               SEES(TX_SDMA11_DISALLOWED_PACKET)),
+/*28*/ FLAG_ENTRY0("TxSdma12DisallowedPacketErr",
+               SEES(TX_SDMA12_DISALLOWED_PACKET)),
+/*29*/ FLAG_ENTRY0("TxSdma13DisallowedPacketErr",
+               SEES(TX_SDMA13_DISALLOWED_PACKET)),
+/*30*/ FLAG_ENTRY0("TxSdma14DisallowedPacketErr",
+               SEES(TX_SDMA14_DISALLOWED_PACKET)),
+/*31*/ FLAG_ENTRY0("TxSdma15DisallowedPacketErr",
+               SEES(TX_SDMA15_DISALLOWED_PACKET)),
+/*32*/ FLAG_ENTRY0("TxLaunchFifo0UncOrParityErr",
+               SEES(TX_LAUNCH_FIFO0_UNC_OR_PARITY)),
+/*33*/ FLAG_ENTRY0("TxLaunchFifo1UncOrParityErr",
+               SEES(TX_LAUNCH_FIFO1_UNC_OR_PARITY)),
+/*34*/ FLAG_ENTRY0("TxLaunchFifo2UncOrParityErr",
+               SEES(TX_LAUNCH_FIFO2_UNC_OR_PARITY)),
+/*35*/ FLAG_ENTRY0("TxLaunchFifo3UncOrParityErr",
+               SEES(TX_LAUNCH_FIFO3_UNC_OR_PARITY)),
+/*36*/ FLAG_ENTRY0("TxLaunchFifo4UncOrParityErr",
+               SEES(TX_LAUNCH_FIFO4_UNC_OR_PARITY)),
+/*37*/ FLAG_ENTRY0("TxLaunchFifo5UncOrParityErr",
+               SEES(TX_LAUNCH_FIFO5_UNC_OR_PARITY)),
+/*38*/ FLAG_ENTRY0("TxLaunchFifo6UncOrParityErr",
+               SEES(TX_LAUNCH_FIFO6_UNC_OR_PARITY)),
+/*39*/ FLAG_ENTRY0("TxLaunchFifo7UncOrParityErr",
+               SEES(TX_LAUNCH_FIFO7_UNC_OR_PARITY)),
+/*40*/ FLAG_ENTRY0("TxLaunchFifo8UncOrParityErr",
+               SEES(TX_LAUNCH_FIFO8_UNC_OR_PARITY)),
+/*41*/ FLAG_ENTRY0("TxCreditReturnParityErr", SEES(TX_CREDIT_RETURN_PARITY)),
+/*42*/ FLAG_ENTRY0("TxSbHdrUncErr", SEES(TX_SB_HDR_UNC)),
+/*43*/ FLAG_ENTRY0("TxReadSdmaMemoryUncErr", SEES(TX_READ_SDMA_MEMORY_UNC)),
+/*44*/ FLAG_ENTRY0("TxReadPioMemoryUncErr", SEES(TX_READ_PIO_MEMORY_UNC)),
+/*45*/ FLAG_ENTRY0("TxEgressFifoUncErr", SEES(TX_EGRESS_FIFO_UNC)),
+/*46*/ FLAG_ENTRY0("TxHcrcInsertionErr", SEES(TX_HCRC_INSERTION)),
+/*47*/ FLAG_ENTRY0("TxCreditReturnVLErr", SEES(TX_CREDIT_RETURN_VL)),
+/*48*/ FLAG_ENTRY0("TxLaunchFifo0CorErr", SEES(TX_LAUNCH_FIFO0_COR)),
+/*49*/ FLAG_ENTRY0("TxLaunchFifo1CorErr", SEES(TX_LAUNCH_FIFO1_COR)),
+/*50*/ FLAG_ENTRY0("TxLaunchFifo2CorErr", SEES(TX_LAUNCH_FIFO2_COR)),
+/*51*/ FLAG_ENTRY0("TxLaunchFifo3CorErr", SEES(TX_LAUNCH_FIFO3_COR)),
+/*52*/ FLAG_ENTRY0("TxLaunchFifo4CorErr", SEES(TX_LAUNCH_FIFO4_COR)),
+/*53*/ FLAG_ENTRY0("TxLaunchFifo5CorErr", SEES(TX_LAUNCH_FIFO5_COR)),
+/*54*/ FLAG_ENTRY0("TxLaunchFifo6CorErr", SEES(TX_LAUNCH_FIFO6_COR)),
+/*55*/ FLAG_ENTRY0("TxLaunchFifo7CorErr", SEES(TX_LAUNCH_FIFO7_COR)),
+/*56*/ FLAG_ENTRY0("TxLaunchFifo8CorErr", SEES(TX_LAUNCH_FIFO8_COR)),
+/*57*/ FLAG_ENTRY0("TxCreditOverrunErr", SEES(TX_CREDIT_OVERRUN)),
+/*58*/ FLAG_ENTRY0("TxSbHdrCorErr", SEES(TX_SB_HDR_COR)),
+/*59*/ FLAG_ENTRY0("TxReadSdmaMemoryCorErr", SEES(TX_READ_SDMA_MEMORY_COR)),
+/*60*/ FLAG_ENTRY0("TxReadPioMemoryCorErr", SEES(TX_READ_PIO_MEMORY_COR)),
+/*61*/ FLAG_ENTRY0("TxEgressFifoCorErr", SEES(TX_EGRESS_FIFO_COR)),
+/*62*/ FLAG_ENTRY0("TxReadSdmaMemoryCsrUncErr",
+               SEES(TX_READ_SDMA_MEMORY_CSR_UNC)),
+/*63*/ FLAG_ENTRY0("TxReadPioMemoryCsrUncErr",
+               SEES(TX_READ_PIO_MEMORY_CSR_UNC)),
+};
+
+/*
+ * TXE Egress Error Info flags
+ */
+#define SEEI(text) SEND_EGRESS_ERR_INFO_##text##_ERR_SMASK
+static struct flag_table egress_err_info_flags[] = {
+/* 0*/ FLAG_ENTRY0("Reserved", 0ull),
+/* 1*/ FLAG_ENTRY0("VLErr", SEEI(VL)),
+/* 2*/ FLAG_ENTRY0("JobKeyErr", SEEI(JOB_KEY)),
+/* 3*/ FLAG_ENTRY0("JobKeyErr", SEEI(JOB_KEY)),
+/* 4*/ FLAG_ENTRY0("PartitionKeyErr", SEEI(PARTITION_KEY)),
+/* 5*/ FLAG_ENTRY0("SLIDErr", SEEI(SLID)),
+/* 6*/ FLAG_ENTRY0("OpcodeErr", SEEI(OPCODE)),
+/* 7*/ FLAG_ENTRY0("VLMappingErr", SEEI(VL_MAPPING)),
+/* 8*/ FLAG_ENTRY0("RawErr", SEEI(RAW)),
+/* 9*/ FLAG_ENTRY0("RawIPv6Err", SEEI(RAW_IPV6)),
+/*10*/ FLAG_ENTRY0("GRHErr", SEEI(GRH)),
+/*11*/ FLAG_ENTRY0("BypassErr", SEEI(BYPASS)),
+/*12*/ FLAG_ENTRY0("KDETHPacketsErr", SEEI(KDETH_PACKETS)),
+/*13*/ FLAG_ENTRY0("NonKDETHPacketsErr", SEEI(NON_KDETH_PACKETS)),
+/*14*/ FLAG_ENTRY0("TooSmallIBPacketsErr", SEEI(TOO_SMALL_IB_PACKETS)),
+/*15*/ FLAG_ENTRY0("TooSmallBypassPacketsErr", SEEI(TOO_SMALL_BYPASS_PACKETS)),
+/*16*/ FLAG_ENTRY0("PbcTestErr", SEEI(PBC_TEST)),
+/*17*/ FLAG_ENTRY0("BadPktLenErr", SEEI(BAD_PKT_LEN)),
+/*18*/ FLAG_ENTRY0("TooLongIBPacketErr", SEEI(TOO_LONG_IB_PACKET)),
+/*19*/ FLAG_ENTRY0("TooLongBypassPacketsErr", SEEI(TOO_LONG_BYPASS_PACKETS)),
+/*20*/ FLAG_ENTRY0("PbcStaticRateControlErr", SEEI(PBC_STATIC_RATE_CONTROL)),
+/*21*/ FLAG_ENTRY0("BypassBadPktLenErr", SEEI(BAD_PKT_LEN)),
+};
+
+/* TXE Egress errors that cause an SPC freeze */
+#define ALL_TXE_EGRESS_FREEZE_ERR \
+       (SEES(TX_EGRESS_FIFO_UNDERRUN_OR_PARITY) \
+       | SEES(TX_PIO_LAUNCH_INTF_PARITY) \
+       | SEES(TX_SDMA_LAUNCH_INTF_PARITY) \
+       | SEES(TX_SBRD_CTL_STATE_MACHINE_PARITY) \
+       | SEES(TX_LAUNCH_CSR_PARITY) \
+       | SEES(TX_SBRD_CTL_CSR_PARITY) \
+       | SEES(TX_CONFIG_PARITY) \
+       | SEES(TX_LAUNCH_FIFO0_UNC_OR_PARITY) \
+       | SEES(TX_LAUNCH_FIFO1_UNC_OR_PARITY) \
+       | SEES(TX_LAUNCH_FIFO2_UNC_OR_PARITY) \
+       | SEES(TX_LAUNCH_FIFO3_UNC_OR_PARITY) \
+       | SEES(TX_LAUNCH_FIFO4_UNC_OR_PARITY) \
+       | SEES(TX_LAUNCH_FIFO5_UNC_OR_PARITY) \
+       | SEES(TX_LAUNCH_FIFO6_UNC_OR_PARITY) \
+       | SEES(TX_LAUNCH_FIFO7_UNC_OR_PARITY) \
+       | SEES(TX_LAUNCH_FIFO8_UNC_OR_PARITY) \
+       | SEES(TX_CREDIT_RETURN_PARITY))
+
+/*
+ * TXE Send error flags
+ */
+#define SES(name) SEND_ERR_STATUS_SEND_##name##_ERR_SMASK
+static struct flag_table send_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY0("SDmaRpyTagErr", SES(CSR_PARITY)),
+/* 1*/ FLAG_ENTRY0("SendCsrReadBadAddrErr", SES(CSR_READ_BAD_ADDR)),
+/* 2*/ FLAG_ENTRY0("SendCsrWriteBadAddrErr", SES(CSR_WRITE_BAD_ADDR))
+};
+
+/*
+ * TXE Send Context Error flags and consequences
+ */
+static struct flag_table sc_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY("InconsistentSop",
+               SEC_PACKET_DROPPED | SEC_SC_HALTED,
+               SEND_CTXT_ERR_STATUS_PIO_INCONSISTENT_SOP_ERR_SMASK),
+/* 1*/ FLAG_ENTRY("DisallowedPacket",
+               SEC_PACKET_DROPPED | SEC_SC_HALTED,
+               SEND_CTXT_ERR_STATUS_PIO_DISALLOWED_PACKET_ERR_SMASK),
+/* 2*/ FLAG_ENTRY("WriteCrossesBoundary",
+               SEC_WRITE_DROPPED | SEC_SC_HALTED,
+               SEND_CTXT_ERR_STATUS_PIO_WRITE_CROSSES_BOUNDARY_ERR_SMASK),
+/* 3*/ FLAG_ENTRY("WriteOverflow",
+               SEC_WRITE_DROPPED | SEC_SC_HALTED,
+               SEND_CTXT_ERR_STATUS_PIO_WRITE_OVERFLOW_ERR_SMASK),
+/* 4*/ FLAG_ENTRY("WriteOutOfBounds",
+               SEC_WRITE_DROPPED | SEC_SC_HALTED,
+               SEND_CTXT_ERR_STATUS_PIO_WRITE_OUT_OF_BOUNDS_ERR_SMASK),
+/* 5-63 reserved*/
+};
+
+/*
+ * RXE Receive Error flags
+ */
+#define RXES(name) RCV_ERR_STATUS_RX_##name##_ERR_SMASK
+static struct flag_table rxe_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY0("RxDmaCsrCorErr", RXES(DMA_CSR_COR)),
+/* 1*/ FLAG_ENTRY0("RxDcIntfParityErr", RXES(DC_INTF_PARITY)),
+/* 2*/ FLAG_ENTRY0("RxRcvHdrUncErr", RXES(RCV_HDR_UNC)),
+/* 3*/ FLAG_ENTRY0("RxRcvHdrCorErr", RXES(RCV_HDR_COR)),
+/* 4*/ FLAG_ENTRY0("RxRcvDataUncErr", RXES(RCV_DATA_UNC)),
+/* 5*/ FLAG_ENTRY0("RxRcvDataCorErr", RXES(RCV_DATA_COR)),
+/* 6*/ FLAG_ENTRY0("RxRcvQpMapTableUncErr", RXES(RCV_QP_MAP_TABLE_UNC)),
+/* 7*/ FLAG_ENTRY0("RxRcvQpMapTableCorErr", RXES(RCV_QP_MAP_TABLE_COR)),
+/* 8*/ FLAG_ENTRY0("RxRcvCsrParityErr", RXES(RCV_CSR_PARITY)),
+/* 9*/ FLAG_ENTRY0("RxDcSopEopParityErr", RXES(DC_SOP_EOP_PARITY)),
+/*10*/ FLAG_ENTRY0("RxDmaFlagUncErr", RXES(DMA_FLAG_UNC)),
+/*11*/ FLAG_ENTRY0("RxDmaFlagCorErr", RXES(DMA_FLAG_COR)),
+/*12*/ FLAG_ENTRY0("RxRcvFsmEncodingErr", RXES(RCV_FSM_ENCODING)),
+/*13*/ FLAG_ENTRY0("RxRbufFreeListUncErr", RXES(RBUF_FREE_LIST_UNC)),
+/*14*/ FLAG_ENTRY0("RxRbufFreeListCorErr", RXES(RBUF_FREE_LIST_COR)),
+/*15*/ FLAG_ENTRY0("RxRbufLookupDesRegUncErr", RXES(RBUF_LOOKUP_DES_REG_UNC)),
+/*16*/ FLAG_ENTRY0("RxRbufLookupDesRegUncCorErr",
+               RXES(RBUF_LOOKUP_DES_REG_UNC_COR)),
+/*17*/ FLAG_ENTRY0("RxRbufLookupDesUncErr", RXES(RBUF_LOOKUP_DES_UNC)),
+/*18*/ FLAG_ENTRY0("RxRbufLookupDesCorErr", RXES(RBUF_LOOKUP_DES_COR)),
+/*19*/ FLAG_ENTRY0("RxRbufBlockListReadUncErr",
+               RXES(RBUF_BLOCK_LIST_READ_UNC)),
+/*20*/ FLAG_ENTRY0("RxRbufBlockListReadCorErr",
+               RXES(RBUF_BLOCK_LIST_READ_COR)),
+/*21*/ FLAG_ENTRY0("RxRbufCsrQHeadBufNumParityErr",
+               RXES(RBUF_CSR_QHEAD_BUF_NUM_PARITY)),
+/*22*/ FLAG_ENTRY0("RxRbufCsrQEntCntParityErr",
+               RXES(RBUF_CSR_QENT_CNT_PARITY)),
+/*23*/ FLAG_ENTRY0("RxRbufCsrQNextBufParityErr",
+               RXES(RBUF_CSR_QNEXT_BUF_PARITY)),
+/*24*/ FLAG_ENTRY0("RxRbufCsrQVldBitParityErr",
+               RXES(RBUF_CSR_QVLD_BIT_PARITY)),
+/*25*/ FLAG_ENTRY0("RxRbufCsrQHdPtrParityErr", RXES(RBUF_CSR_QHD_PTR_PARITY)),
+/*26*/ FLAG_ENTRY0("RxRbufCsrQTlPtrParityErr", RXES(RBUF_CSR_QTL_PTR_PARITY)),
+/*27*/ FLAG_ENTRY0("RxRbufCsrQNumOfPktParityErr",
+               RXES(RBUF_CSR_QNUM_OF_PKT_PARITY)),
+/*28*/ FLAG_ENTRY0("RxRbufCsrQEOPDWParityErr", RXES(RBUF_CSR_QEOPDW_PARITY)),
+/*29*/ FLAG_ENTRY0("RxRbufCtxIdParityErr", RXES(RBUF_CTX_ID_PARITY)),
+/*30*/ FLAG_ENTRY0("RxRBufBadLookupErr", RXES(RBUF_BAD_LOOKUP)),
+/*31*/ FLAG_ENTRY0("RxRbufFullErr", RXES(RBUF_FULL)),
+/*32*/ FLAG_ENTRY0("RxRbufEmptyErr", RXES(RBUF_EMPTY)),
+/*33*/ FLAG_ENTRY0("RxRbufFlRdAddrParityErr", RXES(RBUF_FL_RD_ADDR_PARITY)),
+/*34*/ FLAG_ENTRY0("RxRbufFlWrAddrParityErr", RXES(RBUF_FL_WR_ADDR_PARITY)),
+/*35*/ FLAG_ENTRY0("RxRbufFlInitdoneParityErr",
+               RXES(RBUF_FL_INITDONE_PARITY)),
+/*36*/ FLAG_ENTRY0("RxRbufFlInitWrAddrParityErr",
+               RXES(RBUF_FL_INIT_WR_ADDR_PARITY)),
+/*37*/ FLAG_ENTRY0("RxRbufNextFreeBufUncErr", RXES(RBUF_NEXT_FREE_BUF_UNC)),
+/*38*/ FLAG_ENTRY0("RxRbufNextFreeBufCorErr", RXES(RBUF_NEXT_FREE_BUF_COR)),
+/*39*/ FLAG_ENTRY0("RxLookupDesPart1UncErr", RXES(LOOKUP_DES_PART1_UNC)),
+/*40*/ FLAG_ENTRY0("RxLookupDesPart1UncCorErr",
+               RXES(LOOKUP_DES_PART1_UNC_COR)),
+/*41*/ FLAG_ENTRY0("RxLookupDesPart2ParityErr",
+               RXES(LOOKUP_DES_PART2_PARITY)),
+/*42*/ FLAG_ENTRY0("RxLookupRcvArrayUncErr", RXES(LOOKUP_RCV_ARRAY_UNC)),
+/*43*/ FLAG_ENTRY0("RxLookupRcvArrayCorErr", RXES(LOOKUP_RCV_ARRAY_COR)),
+/*44*/ FLAG_ENTRY0("RxLookupCsrParityErr", RXES(LOOKUP_CSR_PARITY)),
+/*45*/ FLAG_ENTRY0("RxHqIntrCsrParityErr", RXES(HQ_INTR_CSR_PARITY)),
+/*46*/ FLAG_ENTRY0("RxHqIntrFsmErr", RXES(HQ_INTR_FSM)),
+/*47*/ FLAG_ENTRY0("RxRbufDescPart1UncErr", RXES(RBUF_DESC_PART1_UNC)),
+/*48*/ FLAG_ENTRY0("RxRbufDescPart1CorErr", RXES(RBUF_DESC_PART1_COR)),
+/*49*/ FLAG_ENTRY0("RxRbufDescPart2UncErr", RXES(RBUF_DESC_PART2_UNC)),
+/*50*/ FLAG_ENTRY0("RxRbufDescPart2CorErr", RXES(RBUF_DESC_PART2_COR)),
+/*51*/ FLAG_ENTRY0("RxDmaHdrFifoRdUncErr", RXES(DMA_HDR_FIFO_RD_UNC)),
+/*52*/ FLAG_ENTRY0("RxDmaHdrFifoRdCorErr", RXES(DMA_HDR_FIFO_RD_COR)),
+/*53*/ FLAG_ENTRY0("RxDmaDataFifoRdUncErr", RXES(DMA_DATA_FIFO_RD_UNC)),
+/*54*/ FLAG_ENTRY0("RxDmaDataFifoRdCorErr", RXES(DMA_DATA_FIFO_RD_COR)),
+/*55*/ FLAG_ENTRY0("RxRbufDataUncErr", RXES(RBUF_DATA_UNC)),
+/*56*/ FLAG_ENTRY0("RxRbufDataCorErr", RXES(RBUF_DATA_COR)),
+/*57*/ FLAG_ENTRY0("RxDmaCsrParityErr", RXES(DMA_CSR_PARITY)),
+/*58*/ FLAG_ENTRY0("RxDmaEqFsmEncodingErr", RXES(DMA_EQ_FSM_ENCODING)),
+/*59*/ FLAG_ENTRY0("RxDmaDqFsmEncodingErr", RXES(DMA_DQ_FSM_ENCODING)),
+/*60*/ FLAG_ENTRY0("RxDmaCsrUncErr", RXES(DMA_CSR_UNC)),
+/*61*/ FLAG_ENTRY0("RxCsrReadBadAddrErr", RXES(CSR_READ_BAD_ADDR)),
+/*62*/ FLAG_ENTRY0("RxCsrWriteBadAddrErr", RXES(CSR_WRITE_BAD_ADDR)),
+/*63*/ FLAG_ENTRY0("RxCsrParityErr", RXES(CSR_PARITY))
+};
+
+/* RXE errors that will trigger an SPC freeze */
+#define ALL_RXE_FREEZE_ERR  \
+       (RCV_ERR_STATUS_RX_RCV_QP_MAP_TABLE_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RCV_CSR_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_DMA_FLAG_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RCV_FSM_ENCODING_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_FREE_LIST_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_COR_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_BLOCK_LIST_READ_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_CSR_QHEAD_BUF_NUM_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_CSR_QENT_CNT_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_CSR_QNEXT_BUF_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_CSR_QVLD_BIT_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_CSR_QHD_PTR_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_CSR_QTL_PTR_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_CSR_QNUM_OF_PKT_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_CSR_QEOPDW_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_CTX_ID_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_BAD_LOOKUP_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_FULL_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_EMPTY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_FL_RD_ADDR_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_FL_WR_ADDR_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_FL_INITDONE_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_FL_INIT_WR_ADDR_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_NEXT_FREE_BUF_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_COR_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_LOOKUP_DES_PART2_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_LOOKUP_RCV_ARRAY_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_LOOKUP_CSR_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_HQ_INTR_CSR_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_HQ_INTR_FSM_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_DESC_PART1_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_DESC_PART1_COR_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_DESC_PART2_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_DATA_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_DMA_CSR_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_DMA_EQ_FSM_ENCODING_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_DMA_DQ_FSM_ENCODING_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_DMA_CSR_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_CSR_PARITY_ERR_SMASK)
+
+#define RXE_FREEZE_ABORT_MASK \
+       (RCV_ERR_STATUS_RX_DMA_CSR_UNC_ERR_SMASK | \
+       RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_UNC_ERR_SMASK | \
+       RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_UNC_ERR_SMASK)
+
+/*
+ * DCC Error Flags
+ */
+#define DCCE(name) DCC_ERR_FLG_##name##_SMASK
+static struct flag_table dcc_err_flags[] = {
+       FLAG_ENTRY0("bad_l2_err", DCCE(BAD_L2_ERR)),
+       FLAG_ENTRY0("bad_sc_err", DCCE(BAD_SC_ERR)),
+       FLAG_ENTRY0("bad_mid_tail_err", DCCE(BAD_MID_TAIL_ERR)),
+       FLAG_ENTRY0("bad_preemption_err", DCCE(BAD_PREEMPTION_ERR)),
+       FLAG_ENTRY0("preemption_err", DCCE(PREEMPTION_ERR)),
+       FLAG_ENTRY0("preemptionvl15_err", DCCE(PREEMPTIONVL15_ERR)),
+       FLAG_ENTRY0("bad_vl_marker_err", DCCE(BAD_VL_MARKER_ERR)),
+       FLAG_ENTRY0("bad_dlid_target_err", DCCE(BAD_DLID_TARGET_ERR)),
+       FLAG_ENTRY0("bad_lver_err", DCCE(BAD_LVER_ERR)),
+       FLAG_ENTRY0("uncorrectable_err", DCCE(UNCORRECTABLE_ERR)),
+       FLAG_ENTRY0("bad_crdt_ack_err", DCCE(BAD_CRDT_ACK_ERR)),
+       FLAG_ENTRY0("unsup_pkt_type", DCCE(UNSUP_PKT_TYPE)),
+       FLAG_ENTRY0("bad_ctrl_flit_err", DCCE(BAD_CTRL_FLIT_ERR)),
+       FLAG_ENTRY0("event_cntr_parity_err", DCCE(EVENT_CNTR_PARITY_ERR)),
+       FLAG_ENTRY0("event_cntr_rollover_err", DCCE(EVENT_CNTR_ROLLOVER_ERR)),
+       FLAG_ENTRY0("link_err", DCCE(LINK_ERR)),
+       FLAG_ENTRY0("misc_cntr_rollover_err", DCCE(MISC_CNTR_ROLLOVER_ERR)),
+       FLAG_ENTRY0("bad_ctrl_dist_err", DCCE(BAD_CTRL_DIST_ERR)),
+       FLAG_ENTRY0("bad_tail_dist_err", DCCE(BAD_TAIL_DIST_ERR)),
+       FLAG_ENTRY0("bad_head_dist_err", DCCE(BAD_HEAD_DIST_ERR)),
+       FLAG_ENTRY0("nonvl15_state_err", DCCE(NONVL15_STATE_ERR)),
+       FLAG_ENTRY0("vl15_multi_err", DCCE(VL15_MULTI_ERR)),
+       FLAG_ENTRY0("bad_pkt_length_err", DCCE(BAD_PKT_LENGTH_ERR)),
+       FLAG_ENTRY0("unsup_vl_err", DCCE(UNSUP_VL_ERR)),
+       FLAG_ENTRY0("perm_nvl15_err", DCCE(PERM_NVL15_ERR)),
+       FLAG_ENTRY0("slid_zero_err", DCCE(SLID_ZERO_ERR)),
+       FLAG_ENTRY0("dlid_zero_err", DCCE(DLID_ZERO_ERR)),
+       FLAG_ENTRY0("length_mtu_err", DCCE(LENGTH_MTU_ERR)),
+       FLAG_ENTRY0("rx_early_drop_err", DCCE(RX_EARLY_DROP_ERR)),
+       FLAG_ENTRY0("late_short_err", DCCE(LATE_SHORT_ERR)),
+       FLAG_ENTRY0("late_long_err", DCCE(LATE_LONG_ERR)),
+       FLAG_ENTRY0("late_ebp_err", DCCE(LATE_EBP_ERR)),
+       FLAG_ENTRY0("fpe_tx_fifo_ovflw_err", DCCE(FPE_TX_FIFO_OVFLW_ERR)),
+       FLAG_ENTRY0("fpe_tx_fifo_unflw_err", DCCE(FPE_TX_FIFO_UNFLW_ERR)),
+       FLAG_ENTRY0("csr_access_blocked_host", DCCE(CSR_ACCESS_BLOCKED_HOST)),
+       FLAG_ENTRY0("csr_access_blocked_uc", DCCE(CSR_ACCESS_BLOCKED_UC)),
+       FLAG_ENTRY0("tx_ctrl_parity_err", DCCE(TX_CTRL_PARITY_ERR)),
+       FLAG_ENTRY0("tx_ctrl_parity_mbe_err", DCCE(TX_CTRL_PARITY_MBE_ERR)),
+       FLAG_ENTRY0("tx_sc_parity_err", DCCE(TX_SC_PARITY_ERR)),
+       FLAG_ENTRY0("rx_ctrl_parity_mbe_err", DCCE(RX_CTRL_PARITY_MBE_ERR)),
+       FLAG_ENTRY0("csr_parity_err", DCCE(CSR_PARITY_ERR)),
+       FLAG_ENTRY0("csr_inval_addr", DCCE(CSR_INVAL_ADDR)),
+       FLAG_ENTRY0("tx_byte_shft_parity_err", DCCE(TX_BYTE_SHFT_PARITY_ERR)),
+       FLAG_ENTRY0("rx_byte_shft_parity_err", DCCE(RX_BYTE_SHFT_PARITY_ERR)),
+       FLAG_ENTRY0("fmconfig_err", DCCE(FMCONFIG_ERR)),
+       FLAG_ENTRY0("rcvport_err", DCCE(RCVPORT_ERR)),
+};
+
+/*
+ * LCB error flags
+ */
+#define LCBE(name) DC_LCB_ERR_FLG_##name##_SMASK
+static struct flag_table lcb_err_flags[] = {
+/* 0*/ FLAG_ENTRY0("CSR_PARITY_ERR", LCBE(CSR_PARITY_ERR)),
+/* 1*/ FLAG_ENTRY0("INVALID_CSR_ADDR", LCBE(INVALID_CSR_ADDR)),
+/* 2*/ FLAG_ENTRY0("RST_FOR_FAILED_DESKEW", LCBE(RST_FOR_FAILED_DESKEW)),
+/* 3*/ FLAG_ENTRY0("ALL_LNS_FAILED_REINIT_TEST",
+               LCBE(ALL_LNS_FAILED_REINIT_TEST)),
+/* 4*/ FLAG_ENTRY0("LOST_REINIT_STALL_OR_TOS", LCBE(LOST_REINIT_STALL_OR_TOS)),
+/* 5*/ FLAG_ENTRY0("TX_LESS_THAN_FOUR_LNS", LCBE(TX_LESS_THAN_FOUR_LNS)),
+/* 6*/ FLAG_ENTRY0("RX_LESS_THAN_FOUR_LNS", LCBE(RX_LESS_THAN_FOUR_LNS)),
+/* 7*/ FLAG_ENTRY0("SEQ_CRC_ERR", LCBE(SEQ_CRC_ERR)),
+/* 8*/ FLAG_ENTRY0("REINIT_FROM_PEER", LCBE(REINIT_FROM_PEER)),
+/* 9*/ FLAG_ENTRY0("REINIT_FOR_LN_DEGRADE", LCBE(REINIT_FOR_LN_DEGRADE)),
+/*10*/ FLAG_ENTRY0("CRC_ERR_CNT_HIT_LIMIT", LCBE(CRC_ERR_CNT_HIT_LIMIT)),
+/*11*/ FLAG_ENTRY0("RCLK_STOPPED", LCBE(RCLK_STOPPED)),
+/*12*/ FLAG_ENTRY0("UNEXPECTED_REPLAY_MARKER", LCBE(UNEXPECTED_REPLAY_MARKER)),
+/*13*/ FLAG_ENTRY0("UNEXPECTED_ROUND_TRIP_MARKER",
+               LCBE(UNEXPECTED_ROUND_TRIP_MARKER)),
+/*14*/ FLAG_ENTRY0("ILLEGAL_NULL_LTP", LCBE(ILLEGAL_NULL_LTP)),
+/*15*/ FLAG_ENTRY0("ILLEGAL_FLIT_ENCODING", LCBE(ILLEGAL_FLIT_ENCODING)),
+/*16*/ FLAG_ENTRY0("FLIT_INPUT_BUF_OFLW", LCBE(FLIT_INPUT_BUF_OFLW)),
+/*17*/ FLAG_ENTRY0("VL_ACK_INPUT_BUF_OFLW", LCBE(VL_ACK_INPUT_BUF_OFLW)),
+/*18*/ FLAG_ENTRY0("VL_ACK_INPUT_PARITY_ERR", LCBE(VL_ACK_INPUT_PARITY_ERR)),
+/*19*/ FLAG_ENTRY0("VL_ACK_INPUT_WRONG_CRC_MODE",
+               LCBE(VL_ACK_INPUT_WRONG_CRC_MODE)),
+/*20*/ FLAG_ENTRY0("FLIT_INPUT_BUF_MBE", LCBE(FLIT_INPUT_BUF_MBE)),
+/*21*/ FLAG_ENTRY0("FLIT_INPUT_BUF_SBE", LCBE(FLIT_INPUT_BUF_SBE)),
+/*22*/ FLAG_ENTRY0("REPLAY_BUF_MBE", LCBE(REPLAY_BUF_MBE)),
+/*23*/ FLAG_ENTRY0("REPLAY_BUF_SBE", LCBE(REPLAY_BUF_SBE)),
+/*24*/ FLAG_ENTRY0("CREDIT_RETURN_FLIT_MBE", LCBE(CREDIT_RETURN_FLIT_MBE)),
+/*25*/ FLAG_ENTRY0("RST_FOR_LINK_TIMEOUT", LCBE(RST_FOR_LINK_TIMEOUT)),
+/*26*/ FLAG_ENTRY0("RST_FOR_INCOMPLT_RND_TRIP",
+               LCBE(RST_FOR_INCOMPLT_RND_TRIP)),
+/*27*/ FLAG_ENTRY0("HOLD_REINIT", LCBE(HOLD_REINIT)),
+/*28*/ FLAG_ENTRY0("NEG_EDGE_LINK_TRANSFER_ACTIVE",
+               LCBE(NEG_EDGE_LINK_TRANSFER_ACTIVE)),
+/*29*/ FLAG_ENTRY0("REDUNDANT_FLIT_PARITY_ERR",
+               LCBE(REDUNDANT_FLIT_PARITY_ERR))
+};
+
+/*
+ * DC8051 Error Flags
+ */
+#define D8E(name) DC_DC8051_ERR_FLG_##name##_SMASK
+static struct flag_table dc8051_err_flags[] = {
+       FLAG_ENTRY0("SET_BY_8051", D8E(SET_BY_8051)),
+       FLAG_ENTRY0("LOST_8051_HEART_BEAT", D8E(LOST_8051_HEART_BEAT)),
+       FLAG_ENTRY0("CRAM_MBE", D8E(CRAM_MBE)),
+       FLAG_ENTRY0("CRAM_SBE", D8E(CRAM_SBE)),
+       FLAG_ENTRY0("DRAM_MBE", D8E(DRAM_MBE)),
+       FLAG_ENTRY0("DRAM_SBE", D8E(DRAM_SBE)),
+       FLAG_ENTRY0("IRAM_MBE", D8E(IRAM_MBE)),
+       FLAG_ENTRY0("IRAM_SBE", D8E(IRAM_SBE)),
+       FLAG_ENTRY0("UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES",
+               D8E(UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES)),
+       FLAG_ENTRY0("INVALID_CSR_ADDR", D8E(INVALID_CSR_ADDR)),
+};
+
+/*
+ * DC8051 Information Error flags
+ *
+ * Flags in DC8051_DBG_ERR_INFO_SET_BY_8051.ERROR field.
+ */
+static struct flag_table dc8051_info_err_flags[] = {
+       FLAG_ENTRY0("Spico ROM check failed",  SPICO_ROM_FAILED),
+       FLAG_ENTRY0("Unknown frame received",  UNKNOWN_FRAME),
+       FLAG_ENTRY0("Target BER not met",      TARGET_BER_NOT_MET),
+       FLAG_ENTRY0("Serdes internal loopback failure",
+                                       FAILED_SERDES_INTERNAL_LOOPBACK),
+       FLAG_ENTRY0("Failed SerDes init",      FAILED_SERDES_INIT),
+       FLAG_ENTRY0("Failed LNI(Polling)",     FAILED_LNI_POLLING),
+       FLAG_ENTRY0("Failed LNI(Debounce)",    FAILED_LNI_DEBOUNCE),
+       FLAG_ENTRY0("Failed LNI(EstbComm)",    FAILED_LNI_ESTBCOMM),
+       FLAG_ENTRY0("Failed LNI(OptEq)",       FAILED_LNI_OPTEQ),
+       FLAG_ENTRY0("Failed LNI(VerifyCap_1)", FAILED_LNI_VERIFY_CAP1),
+       FLAG_ENTRY0("Failed LNI(VerifyCap_2)", FAILED_LNI_VERIFY_CAP2),
+       FLAG_ENTRY0("Failed LNI(ConfigLT)",    FAILED_LNI_CONFIGLT)
+};
+
+/*
+ * DC8051 Information Host Information flags
+ *
+ * Flags in DC8051_DBG_ERR_INFO_SET_BY_8051.HOST_MSG field.
+ */
+static struct flag_table dc8051_info_host_msg_flags[] = {
+       FLAG_ENTRY0("Host request done", 0x0001),
+       FLAG_ENTRY0("BC SMA message", 0x0002),
+       FLAG_ENTRY0("BC PWR_MGM message", 0x0004),
+       FLAG_ENTRY0("BC Unknown message (BCC)", 0x0008),
+       FLAG_ENTRY0("BC Unknown message (LCB)", 0x0010),
+       FLAG_ENTRY0("External device config request", 0x0020),
+       FLAG_ENTRY0("VerifyCap all frames received", 0x0040),
+       FLAG_ENTRY0("LinkUp achieved", 0x0080),
+       FLAG_ENTRY0("Link going down", 0x0100),
+};
+
+static u32 encoded_size(u32 size);
+static u32 chip_to_opa_lstate(struct hfi1_devdata *dd, u32 chip_lstate);
+static int set_physical_link_state(struct hfi1_devdata *dd, u64 state);
+static void read_vc_remote_phy(struct hfi1_devdata *dd, u8 *power_management,
+                              u8 *continuous);
+static void read_vc_remote_fabric(struct hfi1_devdata *dd, u8 *vau, u8 *z,
+                                 u8 *vcu, u16 *vl15buf, u8 *crc_sizes);
+static void read_vc_remote_link_width(struct hfi1_devdata *dd,
+                                     u8 *remote_tx_rate, u16 *link_widths);
+static void read_vc_local_link_width(struct hfi1_devdata *dd, u8 *misc_bits,
+                                    u8 *flag_bits, u16 *link_widths);
+static void read_remote_device_id(struct hfi1_devdata *dd, u16 *device_id,
+                                 u8 *device_rev);
+static void read_mgmt_allowed(struct hfi1_devdata *dd, u8 *mgmt_allowed);
+static void read_local_lni(struct hfi1_devdata *dd, u8 *enable_lane_rx);
+static int read_tx_settings(struct hfi1_devdata *dd, u8 *enable_lane_tx,
+                           u8 *tx_polarity_inversion,
+                           u8 *rx_polarity_inversion, u8 *max_rate);
+static void handle_sdma_eng_err(struct hfi1_devdata *dd,
+                               unsigned int context, u64 err_status);
+static void handle_qsfp_int(struct hfi1_devdata *dd, u32 source, u64 reg);
+static void handle_dcc_err(struct hfi1_devdata *dd,
+                          unsigned int context, u64 err_status);
+static void handle_lcb_err(struct hfi1_devdata *dd,
+                          unsigned int context, u64 err_status);
+static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_cce_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_rxe_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_misc_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_pio_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_sdma_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_egress_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_txe_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void set_partition_keys(struct hfi1_pportdata *);
+static const char *link_state_name(u32 state);
+static const char *link_state_reason_name(struct hfi1_pportdata *ppd,
+                                         u32 state);
+static int do_8051_command(struct hfi1_devdata *dd, u32 type, u64 in_data,
+                          u64 *out_data);
+static int read_idle_sma(struct hfi1_devdata *dd, u64 *data);
+static int thermal_init(struct hfi1_devdata *dd);
+
+static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state,
+                                 int msecs);
+static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc);
+static void handle_temp_err(struct hfi1_devdata *);
+static void dc_shutdown(struct hfi1_devdata *);
+static void dc_start(struct hfi1_devdata *);
+
+/*
+ * Error interrupt table entry.  This is used as input to the interrupt
+ * "clear down" routine used for all second tier error interrupt register.
+ * Second tier interrupt registers have a single bit representing them
+ * in the top-level CceIntStatus.
+ */
+struct err_reg_info {
+       u32 status;             /* status CSR offset */
+       u32 clear;              /* clear CSR offset */
+       u32 mask;               /* mask CSR offset */
+       void (*handler)(struct hfi1_devdata *dd, u32 source, u64 reg);
+       const char *desc;
+};
+
+#define NUM_MISC_ERRS (IS_GENERAL_ERR_END - IS_GENERAL_ERR_START)
+#define NUM_DC_ERRS (IS_DC_END - IS_DC_START)
+#define NUM_VARIOUS (IS_VARIOUS_END - IS_VARIOUS_START)
+
+/*
+ * Helpers for building HFI and DC error interrupt table entries.  Different
+ * helpers are needed because of inconsistent register names.
+ */
+#define EE(reg, handler, desc) \
+       { reg##_STATUS, reg##_CLEAR, reg##_MASK, \
+               handler, desc }
+#define DC_EE1(reg, handler, desc) \
+       { reg##_FLG, reg##_FLG_CLR, reg##_FLG_EN, handler, desc }
+#define DC_EE2(reg, handler, desc) \
+       { reg##_FLG, reg##_CLR, reg##_EN, handler, desc }
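+
+/*
+ * For example, EE(CCE_ERR, handle_cce_err, "CceErr") expands to
+ *   { CCE_ERR_STATUS, CCE_ERR_CLEAR, CCE_ERR_MASK, handle_cce_err, "CceErr" },
+ * matching struct err_reg_info above.  The DC variants only differ in the
+ * *_FLG/*_FLG_CLR/*_FLG_EN vs *_FLG/*_CLR/*_EN register naming schemes.
+ */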
+
+/*
+ * Table of the "misc" grouping of error interrupts.  Each entry refers to
+ * another register containing more information.
+ */
+static const struct err_reg_info misc_errs[NUM_MISC_ERRS] = {
+/* 0*/ EE(CCE_ERR,             handle_cce_err,    "CceErr"),
+/* 1*/ EE(RCV_ERR,             handle_rxe_err,    "RxeErr"),
+/* 2*/ EE(MISC_ERR,    handle_misc_err,   "MiscErr"),
+/* 3*/ { 0, 0, 0, NULL }, /* reserved */
+/* 4*/ EE(SEND_PIO_ERR,    handle_pio_err,    "PioErr"),
+/* 5*/ EE(SEND_DMA_ERR,    handle_sdma_err,   "SDmaErr"),
+/* 6*/ EE(SEND_EGRESS_ERR, handle_egress_err, "EgressErr"),
+/* 7*/ EE(SEND_ERR,    handle_txe_err,    "TxeErr")
+       /* the rest are reserved */
+};
+
+/*
+ * Index into the Various section of the interrupt sources
+ * corresponding to the Critical Temperature interrupt.
+ */
+#define TCRIT_INT_SOURCE 4
+
+/*
+ * SDMA error interrupt entry - refers to another register containing more
+ * information.
+ */
+static const struct err_reg_info sdma_eng_err =
+       EE(SEND_DMA_ENG_ERR, handle_sdma_eng_err, "SDmaEngErr");
+
+static const struct err_reg_info various_err[NUM_VARIOUS] = {
+/* 0*/ { 0, 0, 0, NULL }, /* PbcInt */
+/* 1*/ { 0, 0, 0, NULL }, /* GpioAssertInt */
+/* 2*/ EE(ASIC_QSFP1,  handle_qsfp_int,        "QSFP1"),
+/* 3*/ EE(ASIC_QSFP2,  handle_qsfp_int,        "QSFP2"),
+/* 4*/ { 0, 0, 0, NULL }, /* TCritInt */
+       /* rest are reserved */
+};
+
+/*
+ * The DC encoding of mtu_cap for 10K MTU in the DCC_CFG_PORT_CONFIG
+ * register can not be derived from the MTU value because 10K is not
+ * a power of 2. Therefore, we need a constant. Everything else can
+ * be calculated.
+ */
+#define DCC_CFG_PORT_MTU_CAP_10240 7
+
+/*
+ * Table of the DC grouping of error interrupts.  Each entry refers to
+ * another register containing more information.
+ */
+static const struct err_reg_info dc_errs[NUM_DC_ERRS] = {
+/* 0*/ DC_EE1(DCC_ERR,         handle_dcc_err,        "DCC Err"),
+/* 1*/ DC_EE2(DC_LCB_ERR,      handle_lcb_err,        "LCB Err"),
+/* 2*/ DC_EE2(DC_DC8051_ERR,   handle_8051_interrupt, "DC8051 Interrupt"),
+/* 3*/ /* dc_lbm_int - special, see is_dc_int() */
+       /* the rest are reserved */
+};
+
+struct cntr_entry {
+       /*
+        * counter name
+        */
+       char *name;
+
+       /*
+        * csr to read for name (if applicable)
+        */
+       u64 csr;
+
+       /*
+        * offset into dd or ppd to store the counter's value
+        */
+       int offset;
+
+       /*
+        * flags
+        */
+       u8 flags;
+
+       /*
+        * accessor for stat element, context either dd or ppd
+        */
+       u64 (*rw_cntr)(const struct cntr_entry *,
+                              void *context,
+                              int vl,
+                              int mode,
+                              u64 data);
+};
+
+#define C_RCV_HDR_OVF_FIRST C_RCV_HDR_OVF_0
+#define C_RCV_HDR_OVF_LAST C_RCV_HDR_OVF_159
+
+#define CNTR_ELEM(name, csr, offset, flags, accessor) \
+{ \
+       name, \
+       csr, \
+       offset, \
+       flags, \
+       accessor \
+}
+
+/* 32bit RXE */
+#define RXE32_PORT_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+         (counter * 8 + RCV_COUNTER_ARRAY32), \
+         0, flags | CNTR_32BIT, \
+         port_access_u32_csr)
+
+#define RXE32_DEV_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+         (counter * 8 + RCV_COUNTER_ARRAY32), \
+         0, flags | CNTR_32BIT, \
+         dev_access_u32_csr)
+
+/* 64bit RXE */
+#define RXE64_PORT_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+         (counter * 8 + RCV_COUNTER_ARRAY64), \
+         0, flags, \
+         port_access_u64_csr)
+
+#define RXE64_DEV_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+         (counter * 8 + RCV_COUNTER_ARRAY64), \
+         0, flags, \
+         dev_access_u64_csr)
+
+#define OVR_LBL(ctx) C_RCV_HDR_OVF_ ## ctx
+#define OVR_ELM(ctx) \
+CNTR_ELEM("RcvHdrOvr" #ctx, \
+         (RCV_HDR_OVFL_CNT + ctx*0x100), \
+         0, CNTR_NORMAL, port_access_u64_csr)
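+
+/*
+ * For example, OVR_LBL(3) is the index C_RCV_HDR_OVF_3, and OVR_ELM(3) builds
+ * the entry { "RcvHdrOvr3", RCV_HDR_OVFL_CNT + 3*0x100, 0, CNTR_NORMAL,
+ * port_access_u64_csr }: the per-context overflow counters are 0x100 bytes
+ * apart, hence the ctx*0x100 stride.
+ */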
+
+/* 32bit TXE */
+#define TXE32_PORT_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+         (counter * 8 + SEND_COUNTER_ARRAY32), \
+         0, flags | CNTR_32BIT, \
+         port_access_u32_csr)
+
+/* 64bit TXE */
+#define TXE64_PORT_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+         (counter * 8 + SEND_COUNTER_ARRAY64), \
+         0, flags, \
+         port_access_u64_csr)
+
+#define TX64_DEV_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name,\
+         counter * 8 + SEND_COUNTER_ARRAY64, \
+         0, \
+         flags, \
+         dev_access_u64_csr)
+
+/* CCE */
+#define CCE_PERF_DEV_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+         (counter * 8 + CCE_COUNTER_ARRAY32), \
+         0, flags | CNTR_32BIT, \
+         dev_access_u32_csr)
+
+#define CCE_INT_DEV_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+         (counter * 8 + CCE_INT_COUNTER_ARRAY32), \
+         0, flags | CNTR_32BIT, \
+         dev_access_u32_csr)
+
+/* DC */
+#define DC_PERF_CNTR(name, counter, flags) \
+CNTR_ELEM(#name, \
+         counter, \
+         0, \
+         flags, \
+         dev_access_u64_csr)
+
+#define DC_PERF_CNTR_LCB(name, counter, flags) \
+CNTR_ELEM(#name, \
+         counter, \
+         0, \
+         flags, \
+         dc_access_lcb_cntr)
+
+/* ibp counters */
+#define SW_IBP_CNTR(name, cntr) \
+CNTR_ELEM(#name, \
+         0, \
+         0, \
+         CNTR_SYNTH, \
+         access_ibp_##cntr)
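+
+/*
+ * All of the helpers above are thin wrappers around CNTR_ELEM().  As an
+ * illustration, RXE32_DEV_CNTR_ELEM(RcvOverflow, RCV_BUF_OVFL_CNT, CNTR_SYNTH)
+ * becomes
+ *   { "RcvOverflow", RCV_BUF_OVFL_CNT * 8 + RCV_COUNTER_ARRAY32, 0,
+ *     CNTR_SYNTH | CNTR_32BIT, dev_access_u32_csr }
+ * i.e. counter array entries are 8 bytes apart and the chosen accessor encodes
+ * both the CSR width and whether the context is the device or a port.
+ */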
+
+u64 read_csr(const struct hfi1_devdata *dd, u32 offset)
+{
+       u64 val;
+
+       if (dd->flags & HFI1_PRESENT) {
+               val = readq((void __iomem *)dd->kregbase + offset);
+               return val;
+       }
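+       /* chip not present: return all ones, i.e. -1 cast to u64 */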
+       return -1;
+}
+
+void write_csr(const struct hfi1_devdata *dd, u32 offset, u64 value)
+{
+       if (dd->flags & HFI1_PRESENT)
+               writeq(value, (void __iomem *)dd->kregbase + offset);
+}
+
+void __iomem *get_csr_addr(
+       struct hfi1_devdata *dd,
+       u32 offset)
+{
+       return (void __iomem *)dd->kregbase + offset;
+}
+
+static inline u64 read_write_csr(const struct hfi1_devdata *dd, u32 csr,
+                                int mode, u64 value)
+{
+       u64 ret;
+
+       if (mode == CNTR_MODE_R) {
+               ret = read_csr(dd, csr);
+       } else if (mode == CNTR_MODE_W) {
+               write_csr(dd, csr, value);
+               ret = value;
+       } else {
+               dd_dev_err(dd, "Invalid cntr register access mode");
+               return 0;
+       }
+
+       hfi1_cdbg(CNTR, "csr 0x%x val 0x%llx mode %d", csr, ret, mode);
+       return ret;
+}
+
+/* Dev Access */
+static u64 dev_access_u32_csr(const struct cntr_entry *entry,
+                           void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       if (vl != CNTR_INVALID_VL)
+               return 0;
+       return read_write_csr(dd, entry->csr, mode, data);
+}
+
+static u64 dev_access_u64_csr(const struct cntr_entry *entry, void *context,
+                           int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       u64 val = 0;
+       u64 csr = entry->csr;
+
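+       /* per-VL counters are consecutive 64-bit CSRs, hence the 8 * vl step */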
+       if (entry->flags & CNTR_VL) {
+               if (vl == CNTR_INVALID_VL)
+                       return 0;
+               csr += 8 * vl;
+       } else {
+               if (vl != CNTR_INVALID_VL)
+                       return 0;
+       }
+
+       val = read_write_csr(dd, csr, mode, data);
+       return val;
+}
+
+static u64 dc_access_lcb_cntr(const struct cntr_entry *entry, void *context,
+                           int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+       u32 csr = entry->csr;
+       int ret = 0;
+
+       if (vl != CNTR_INVALID_VL)
+               return 0;
+       if (mode == CNTR_MODE_R)
+               ret = read_lcb_csr(dd, csr, &data);
+       else if (mode == CNTR_MODE_W)
+               ret = write_lcb_csr(dd, csr, data);
+
+       if (ret) {
+               dd_dev_err(dd, "Could not acquire LCB for counter 0x%x", csr);
+               return 0;
+       }
+
+       hfi1_cdbg(CNTR, "csr 0x%x val 0x%llx mode %d", csr, data, mode);
+       return data;
+}
+
+/* Port Access */
+static u64 port_access_u32_csr(const struct cntr_entry *entry, void *context,
+                            int vl, int mode, u64 data)
+{
+       struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;
+
+       if (vl != CNTR_INVALID_VL)
+               return 0;
+       return read_write_csr(ppd->dd, entry->csr, mode, data);
+}
+
+static u64 port_access_u64_csr(const struct cntr_entry *entry,
+                            void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;
+       u64 val;
+       u64 csr = entry->csr;
+
+       if (entry->flags & CNTR_VL) {
+               if (vl == CNTR_INVALID_VL)
+                       return 0;
+               csr += 8 * vl;
+       } else {
+               if (vl != CNTR_INVALID_VL)
+                       return 0;
+       }
+       val = read_write_csr(ppd->dd, csr, mode, data);
+       return val;
+}
+
+/* Software defined */
+static inline u64 read_write_sw(struct hfi1_devdata *dd, u64 *cntr, int mode,
+                               u64 data)
+{
+       u64 ret;
+
+       if (mode == CNTR_MODE_R) {
+               ret = *cntr;
+       } else if (mode == CNTR_MODE_W) {
+               *cntr = data;
+               ret = data;
+       } else {
+               dd_dev_err(dd, "Invalid cntr sw access mode");
+               return 0;
+       }
+
+       hfi1_cdbg(CNTR, "val 0x%llx mode %d", ret, mode);
+
+       return ret;
+}
+
+static u64 access_sw_link_dn_cnt(const struct cntr_entry *entry, void *context,
+                              int vl, int mode, u64 data)
+{
+       struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;
+
+       if (vl != CNTR_INVALID_VL)
+               return 0;
+       return read_write_sw(ppd->dd, &ppd->link_downed, mode, data);
+}
+
+static u64 access_sw_link_up_cnt(const struct cntr_entry *entry, void *context,
+                              int vl, int mode, u64 data)
+{
+       struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;
+
+       if (vl != CNTR_INVALID_VL)
+               return 0;
+       return read_write_sw(ppd->dd, &ppd->link_up, mode, data);
+}
+
+static u64 access_sw_xmit_discards(const struct cntr_entry *entry,
+                                   void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;
+
+       if (vl != CNTR_INVALID_VL)
+               return 0;
+
+       return read_write_sw(ppd->dd, &ppd->port_xmit_discards, mode, data);
+}
+
+static u64 access_xmit_constraint_errs(const struct cntr_entry *entry,
+                                    void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;
+
+       if (vl != CNTR_INVALID_VL)
+               return 0;
+
+       return read_write_sw(ppd->dd, &ppd->port_xmit_constraint_errors,
+                            mode, data);
+}
+
+static u64 access_rcv_constraint_errs(const struct cntr_entry *entry,
+                                    void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;
+
+       if (vl != CNTR_INVALID_VL)
+               return 0;
+
+       return read_write_sw(ppd->dd, &ppd->port_rcv_constraint_errors,
+                            mode, data);
+}
+
+u64 get_all_cpu_total(u64 __percpu *cntr)
+{
+       int cpu;
+       u64 counter = 0;
+
+       for_each_possible_cpu(cpu)
+               counter += *per_cpu_ptr(cntr, cpu);
+       return counter;
+}
+
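+/*
+ * Per-CPU counters are summed over all possible CPUs on read and are never
+ * cleared directly.  Instead, a "zero" baseline is kept in *z_val: a read
+ * returns the current total minus the baseline, and writing 0 just advances
+ * the baseline to the current total.
+ */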
+static u64 read_write_cpu(struct hfi1_devdata *dd, u64 *z_val,
+                         u64 __percpu *cntr,
+                         int vl, int mode, u64 data)
+{
+       u64 ret = 0;
+
+       if (vl != CNTR_INVALID_VL)
+               return 0;
+
+       if (mode == CNTR_MODE_R) {
+               ret = get_all_cpu_total(cntr) - *z_val;
+       } else if (mode == CNTR_MODE_W) {
+               /* A write can only zero the counter */
+               if (data == 0)
+                       *z_val = get_all_cpu_total(cntr);
+               else
+                       dd_dev_err(dd, "Per CPU cntrs can only be zeroed");
+       } else {
+               dd_dev_err(dd, "Invalid cntr sw cpu access mode");
+               return 0;
+       }
+
+       return ret;
+}
+
+static u64 access_sw_cpu_intr(const struct cntr_entry *entry,
+                             void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return read_write_cpu(dd, &dd->z_int_counter, dd->int_counter, vl,
+                             mode, data);
+}
+
+static u64 access_sw_cpu_rcv_limit(const struct cntr_entry *entry,
+                             void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return read_write_cpu(dd, &dd->z_rcv_limit, dd->rcv_limit, vl,
+                             mode, data);
+}
+
+static u64 access_sw_pio_wait(const struct cntr_entry *entry,
+                             void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->verbs_dev.n_piowait;
+}
+
+static u64 access_sw_vtx_wait(const struct cntr_entry *entry,
+                             void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->verbs_dev.n_txwait;
+}
+
+static u64 access_sw_kmem_wait(const struct cntr_entry *entry,
+                              void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->verbs_dev.n_kmem_wait;
+}
+
+#define def_access_sw_cpu(cntr) \
+static u64 access_sw_cpu_##cntr(const struct cntr_entry *entry,                      \
+                             void *context, int vl, int mode, u64 data)      \
+{                                                                            \
+       struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;        \
+       return read_write_cpu(ppd->dd, &ppd->ibport_data.z_ ##cntr,           \
+                             ppd->ibport_data.cntr, vl,                      \
+                             mode, data);                                    \
+}
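+
+/*
+ * For example, def_access_sw_cpu(rc_acks) below generates
+ * access_sw_cpu_rc_acks(), which hands the ppd->ibport_data.z_rc_acks baseline
+ * and the per-CPU ppd->ibport_data.rc_acks counter to read_write_cpu().
+ */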
+
+def_access_sw_cpu(rc_acks);
+def_access_sw_cpu(rc_qacks);
+def_access_sw_cpu(rc_delayed_comp);
+
+#define def_access_ibp_counter(cntr) \
+static u64 access_ibp_##cntr(const struct cntr_entry *entry,                 \
+                               void *context, int vl, int mode, u64 data)    \
+{                                                                            \
+       struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;        \
+                                                                             \
+       if (vl != CNTR_INVALID_VL)                                            \
+               return 0;                                                     \
+                                                                             \
+       return read_write_sw(ppd->dd, &ppd->ibport_data.n_ ##cntr,            \
+                            mode, data);                                     \
+}
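+
+/*
+ * Likewise, def_access_ibp_counter(loop_pkts) below generates
+ * access_ibp_loop_pkts() operating on ppd->ibport_data.n_loop_pkts; the
+ * SW_IBP_CNTR() macro above wires these accessors into the counter tables
+ * by name.
+ */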
+
+def_access_ibp_counter(loop_pkts);
+def_access_ibp_counter(rc_resends);
+def_access_ibp_counter(rnr_naks);
+def_access_ibp_counter(other_naks);
+def_access_ibp_counter(rc_timeouts);
+def_access_ibp_counter(pkt_drops);
+def_access_ibp_counter(dmawait);
+def_access_ibp_counter(rc_seqnak);
+def_access_ibp_counter(rc_dupreq);
+def_access_ibp_counter(rdma_seq);
+def_access_ibp_counter(unaligned);
+def_access_ibp_counter(seq_naks);
+
+static struct cntr_entry dev_cntrs[DEV_CNTR_LAST] = {
+[C_RCV_OVF] = RXE32_DEV_CNTR_ELEM(RcvOverflow, RCV_BUF_OVFL_CNT, CNTR_SYNTH),
+[C_RX_TID_FULL] = RXE32_DEV_CNTR_ELEM(RxTIDFullEr, RCV_TID_FULL_ERR_CNT,
+                       CNTR_NORMAL),
+[C_RX_TID_INVALID] = RXE32_DEV_CNTR_ELEM(RxTIDInvalid, RCV_TID_VALID_ERR_CNT,
+                       CNTR_NORMAL),
+[C_RX_TID_FLGMS] = RXE32_DEV_CNTR_ELEM(RxTidFLGMs,
+                       RCV_TID_FLOW_GEN_MISMATCH_CNT,
+                       CNTR_NORMAL),
+[C_RX_CTX_RHQS] = RXE32_DEV_CNTR_ELEM(RxCtxRHQS, RCV_CONTEXT_RHQ_STALL,
+                       CNTR_NORMAL),
+[C_RX_CTX_EGRS] = RXE32_DEV_CNTR_ELEM(RxCtxEgrS, RCV_CONTEXT_EGR_STALL,
+                       CNTR_NORMAL),
+[C_RCV_TID_FLSMS] = RXE32_DEV_CNTR_ELEM(RxTidFLSMs,
+                       RCV_TID_FLOW_SEQ_MISMATCH_CNT, CNTR_NORMAL),
+[C_CCE_PCI_CR_ST] = CCE_PERF_DEV_CNTR_ELEM(CcePciCrSt,
+                       CCE_PCIE_POSTED_CRDT_STALL_CNT, CNTR_NORMAL),
+[C_CCE_PCI_TR_ST] = CCE_PERF_DEV_CNTR_ELEM(CcePciTrSt, CCE_PCIE_TRGT_STALL_CNT,
+                       CNTR_NORMAL),
+[C_CCE_PIO_WR_ST] = CCE_PERF_DEV_CNTR_ELEM(CcePioWrSt, CCE_PIO_WR_STALL_CNT,
+                       CNTR_NORMAL),
+[C_CCE_ERR_INT] = CCE_INT_DEV_CNTR_ELEM(CceErrInt, CCE_ERR_INT_CNT,
+                       CNTR_NORMAL),
+[C_CCE_SDMA_INT] = CCE_INT_DEV_CNTR_ELEM(CceSdmaInt, CCE_SDMA_INT_CNT,
+                       CNTR_NORMAL),
+[C_CCE_MISC_INT] = CCE_INT_DEV_CNTR_ELEM(CceMiscInt, CCE_MISC_INT_CNT,
+                       CNTR_NORMAL),
+[C_CCE_RCV_AV_INT] = CCE_INT_DEV_CNTR_ELEM(CceRcvAvInt, CCE_RCV_AVAIL_INT_CNT,
+                       CNTR_NORMAL),
+[C_CCE_RCV_URG_INT] = CCE_INT_DEV_CNTR_ELEM(CceRcvUrgInt,
+                       CCE_RCV_URGENT_INT_CNT, CNTR_NORMAL),
+[C_CCE_SEND_CR_INT] = CCE_INT_DEV_CNTR_ELEM(CceSndCrInt,
+                       CCE_SEND_CREDIT_INT_CNT, CNTR_NORMAL),
+[C_DC_UNC_ERR] = DC_PERF_CNTR(DcUnctblErr, DCC_ERR_UNCORRECTABLE_CNT,
+                             CNTR_SYNTH),
+[C_DC_RCV_ERR] = DC_PERF_CNTR(DcRecvErr, DCC_ERR_PORTRCV_ERR_CNT, CNTR_SYNTH),
+[C_DC_FM_CFG_ERR] = DC_PERF_CNTR(DcFmCfgErr, DCC_ERR_FMCONFIG_ERR_CNT,
+                                CNTR_SYNTH),
+[C_DC_RMT_PHY_ERR] = DC_PERF_CNTR(DcRmtPhyErr, DCC_ERR_RCVREMOTE_PHY_ERR_CNT,
+                                 CNTR_SYNTH),
+[C_DC_DROPPED_PKT] = DC_PERF_CNTR(DcDroppedPkt, DCC_ERR_DROPPED_PKT_CNT,
+                                 CNTR_SYNTH),
+[C_DC_MC_XMIT_PKTS] = DC_PERF_CNTR(DcMcXmitPkts,
+                                  DCC_PRF_PORT_XMIT_MULTICAST_CNT, CNTR_SYNTH),
+[C_DC_MC_RCV_PKTS] = DC_PERF_CNTR(DcMcRcvPkts,
+                                 DCC_PRF_PORT_RCV_MULTICAST_PKT_CNT,
+                                 CNTR_SYNTH),
+[C_DC_XMIT_CERR] = DC_PERF_CNTR(DcXmitCorr,
+                               DCC_PRF_PORT_XMIT_CORRECTABLE_CNT, CNTR_SYNTH),
+[C_DC_RCV_CERR] = DC_PERF_CNTR(DcRcvCorrCnt, DCC_PRF_PORT_RCV_CORRECTABLE_CNT,
+                              CNTR_SYNTH),
+[C_DC_RCV_FCC] = DC_PERF_CNTR(DcRxFCntl, DCC_PRF_RX_FLOW_CRTL_CNT,
+                             CNTR_SYNTH),
+[C_DC_XMIT_FCC] = DC_PERF_CNTR(DcXmitFCntl, DCC_PRF_TX_FLOW_CRTL_CNT,
+                              CNTR_SYNTH),
+[C_DC_XMIT_FLITS] = DC_PERF_CNTR(DcXmitFlits, DCC_PRF_PORT_XMIT_DATA_CNT,
+                                CNTR_SYNTH),
+[C_DC_RCV_FLITS] = DC_PERF_CNTR(DcRcvFlits, DCC_PRF_PORT_RCV_DATA_CNT,
+                               CNTR_SYNTH),
+[C_DC_XMIT_PKTS] = DC_PERF_CNTR(DcXmitPkts, DCC_PRF_PORT_XMIT_PKTS_CNT,
+                               CNTR_SYNTH),
+[C_DC_RCV_PKTS] = DC_PERF_CNTR(DcRcvPkts, DCC_PRF_PORT_RCV_PKTS_CNT,
+                              CNTR_SYNTH),
+[C_DC_RX_FLIT_VL] = DC_PERF_CNTR(DcRxFlitVl, DCC_PRF_PORT_VL_RCV_DATA_CNT,
+                                CNTR_SYNTH | CNTR_VL),
+[C_DC_RX_PKT_VL] = DC_PERF_CNTR(DcRxPktVl, DCC_PRF_PORT_VL_RCV_PKTS_CNT,
+                               CNTR_SYNTH | CNTR_VL),
+[C_DC_RCV_FCN] = DC_PERF_CNTR(DcRcvFcn, DCC_PRF_PORT_RCV_FECN_CNT, CNTR_SYNTH),
+[C_DC_RCV_FCN_VL] = DC_PERF_CNTR(DcRcvFcnVl, DCC_PRF_PORT_VL_RCV_FECN_CNT,
+                                CNTR_SYNTH | CNTR_VL),
+[C_DC_RCV_BCN] = DC_PERF_CNTR(DcRcvBcn, DCC_PRF_PORT_RCV_BECN_CNT, CNTR_SYNTH),
+[C_DC_RCV_BCN_VL] = DC_PERF_CNTR(DcRcvBcnVl, DCC_PRF_PORT_VL_RCV_BECN_CNT,
+                                CNTR_SYNTH | CNTR_VL),
+[C_DC_RCV_BBL] = DC_PERF_CNTR(DcRcvBbl, DCC_PRF_PORT_RCV_BUBBLE_CNT,
+                             CNTR_SYNTH),
+[C_DC_RCV_BBL_VL] = DC_PERF_CNTR(DcRcvBblVl, DCC_PRF_PORT_VL_RCV_BUBBLE_CNT,
+                                CNTR_SYNTH | CNTR_VL),
+[C_DC_MARK_FECN] = DC_PERF_CNTR(DcMarkFcn, DCC_PRF_PORT_MARK_FECN_CNT,
+                               CNTR_SYNTH),
+[C_DC_MARK_FECN_VL] = DC_PERF_CNTR(DcMarkFcnVl, DCC_PRF_PORT_VL_MARK_FECN_CNT,
+                                  CNTR_SYNTH | CNTR_VL),
+[C_DC_TOTAL_CRC] =
+       DC_PERF_CNTR_LCB(DcTotCrc, DC_LCB_ERR_INFO_TOTAL_CRC_ERR,
+                        CNTR_SYNTH),
+[C_DC_CRC_LN0] = DC_PERF_CNTR_LCB(DcCrcLn0, DC_LCB_ERR_INFO_CRC_ERR_LN0,
+                                 CNTR_SYNTH),
+[C_DC_CRC_LN1] = DC_PERF_CNTR_LCB(DcCrcLn1, DC_LCB_ERR_INFO_CRC_ERR_LN1,
+                                 CNTR_SYNTH),
+[C_DC_CRC_LN2] = DC_PERF_CNTR_LCB(DcCrcLn2, DC_LCB_ERR_INFO_CRC_ERR_LN2,
+                                 CNTR_SYNTH),
+[C_DC_CRC_LN3] = DC_PERF_CNTR_LCB(DcCrcLn3, DC_LCB_ERR_INFO_CRC_ERR_LN3,
+                                 CNTR_SYNTH),
+[C_DC_CRC_MULT_LN] =
+       DC_PERF_CNTR_LCB(DcMultLn, DC_LCB_ERR_INFO_CRC_ERR_MULTI_LN,
+                        CNTR_SYNTH),
+[C_DC_TX_REPLAY] = DC_PERF_CNTR_LCB(DcTxReplay, DC_LCB_ERR_INFO_TX_REPLAY_CNT,
+                                   CNTR_SYNTH),
+[C_DC_RX_REPLAY] = DC_PERF_CNTR_LCB(DcRxReplay, DC_LCB_ERR_INFO_RX_REPLAY_CNT,
+                                   CNTR_SYNTH),
+[C_DC_SEQ_CRC_CNT] =
+       DC_PERF_CNTR_LCB(DcLinkSeqCrc, DC_LCB_ERR_INFO_SEQ_CRC_CNT,
+                        CNTR_SYNTH),
+[C_DC_ESC0_ONLY_CNT] =
+       DC_PERF_CNTR_LCB(DcEsc0, DC_LCB_ERR_INFO_ESCAPE_0_ONLY_CNT,
+                        CNTR_SYNTH),
+[C_DC_ESC0_PLUS1_CNT] =
+       DC_PERF_CNTR_LCB(DcEsc1, DC_LCB_ERR_INFO_ESCAPE_0_PLUS1_CNT,
+                        CNTR_SYNTH),
+[C_DC_ESC0_PLUS2_CNT] =
+       DC_PERF_CNTR_LCB(DcEsc0Plus2, DC_LCB_ERR_INFO_ESCAPE_0_PLUS2_CNT,
+                        CNTR_SYNTH),
+[C_DC_REINIT_FROM_PEER_CNT] =
+       DC_PERF_CNTR_LCB(DcReinitPeer, DC_LCB_ERR_INFO_REINIT_FROM_PEER_CNT,
+                        CNTR_SYNTH),
+[C_DC_SBE_CNT] = DC_PERF_CNTR_LCB(DcSbe, DC_LCB_ERR_INFO_SBE_CNT,
+                                 CNTR_SYNTH),
+[C_DC_MISC_FLG_CNT] =
+       DC_PERF_CNTR_LCB(DcMiscFlg, DC_LCB_ERR_INFO_MISC_FLG_CNT,
+                        CNTR_SYNTH),
+[C_DC_PRF_GOOD_LTP_CNT] =
+       DC_PERF_CNTR_LCB(DcGoodLTP, DC_LCB_PRF_GOOD_LTP_CNT, CNTR_SYNTH),
+[C_DC_PRF_ACCEPTED_LTP_CNT] =
+       DC_PERF_CNTR_LCB(DcAccLTP, DC_LCB_PRF_ACCEPTED_LTP_CNT,
+                        CNTR_SYNTH),
+[C_DC_PRF_RX_FLIT_CNT] =
+       DC_PERF_CNTR_LCB(DcPrfRxFlit, DC_LCB_PRF_RX_FLIT_CNT, CNTR_SYNTH),
+[C_DC_PRF_TX_FLIT_CNT] =
+       DC_PERF_CNTR_LCB(DcPrfTxFlit, DC_LCB_PRF_TX_FLIT_CNT, CNTR_SYNTH),
+[C_DC_PRF_CLK_CNTR] =
+       DC_PERF_CNTR_LCB(DcPrfClk, DC_LCB_PRF_CLK_CNTR, CNTR_SYNTH),
+[C_DC_PG_DBG_FLIT_CRDTS_CNT] =
+       DC_PERF_CNTR_LCB(DcFltCrdts, DC_LCB_PG_DBG_FLIT_CRDTS_CNT, CNTR_SYNTH),
+[C_DC_PG_STS_PAUSE_COMPLETE_CNT] =
+       DC_PERF_CNTR_LCB(DcPauseComp, DC_LCB_PG_STS_PAUSE_COMPLETE_CNT,
+                        CNTR_SYNTH),
+[C_DC_PG_STS_TX_SBE_CNT] =
+       DC_PERF_CNTR_LCB(DcStsTxSbe, DC_LCB_PG_STS_TX_SBE_CNT, CNTR_SYNTH),
+[C_DC_PG_STS_TX_MBE_CNT] =
+       DC_PERF_CNTR_LCB(DcStsTxMbe, DC_LCB_PG_STS_TX_MBE_CNT,
+                        CNTR_SYNTH),
+[C_SW_CPU_INTR] = CNTR_ELEM("Intr", 0, 0, CNTR_NORMAL,
+                           access_sw_cpu_intr),
+[C_SW_CPU_RCV_LIM] = CNTR_ELEM("RcvLimit", 0, 0, CNTR_NORMAL,
+                           access_sw_cpu_rcv_limit),
+[C_SW_VTX_WAIT] = CNTR_ELEM("vTxWait", 0, 0, CNTR_NORMAL,
+                           access_sw_vtx_wait),
+[C_SW_PIO_WAIT] = CNTR_ELEM("PioWait", 0, 0, CNTR_NORMAL,
+                           access_sw_pio_wait),
+[C_SW_KMEM_WAIT] = CNTR_ELEM("KmemWait", 0, 0, CNTR_NORMAL,
+                           access_sw_kmem_wait),
+};
+
+static struct cntr_entry port_cntrs[PORT_CNTR_LAST] = {
+[C_TX_UNSUP_VL] = TXE32_PORT_CNTR_ELEM(TxUnVLErr, SEND_UNSUP_VL_ERR_CNT,
+                       CNTR_NORMAL),
+[C_TX_INVAL_LEN] = TXE32_PORT_CNTR_ELEM(TxInvalLen, SEND_LEN_ERR_CNT,
+                       CNTR_NORMAL),
+[C_TX_MM_LEN_ERR] = TXE32_PORT_CNTR_ELEM(TxMMLenErr, SEND_MAX_MIN_LEN_ERR_CNT,
+                       CNTR_NORMAL),
+[C_TX_UNDERRUN] = TXE32_PORT_CNTR_ELEM(TxUnderrun, SEND_UNDERRUN_CNT,
+                       CNTR_NORMAL),
+[C_TX_FLOW_STALL] = TXE32_PORT_CNTR_ELEM(TxFlowStall, SEND_FLOW_STALL_CNT,
+                       CNTR_NORMAL),
+[C_TX_DROPPED] = TXE32_PORT_CNTR_ELEM(TxDropped, SEND_DROPPED_PKT_CNT,
+                       CNTR_NORMAL),
+[C_TX_HDR_ERR] = TXE32_PORT_CNTR_ELEM(TxHdrErr, SEND_HEADERS_ERR_CNT,
+                       CNTR_NORMAL),
+[C_TX_PKT] = TXE64_PORT_CNTR_ELEM(TxPkt, SEND_DATA_PKT_CNT, CNTR_NORMAL),
+[C_TX_WORDS] = TXE64_PORT_CNTR_ELEM(TxWords, SEND_DWORD_CNT, CNTR_NORMAL),
+[C_TX_WAIT] = TXE64_PORT_CNTR_ELEM(TxWait, SEND_WAIT_CNT, CNTR_SYNTH),
+[C_TX_FLIT_VL] = TXE64_PORT_CNTR_ELEM(TxFlitVL, SEND_DATA_VL0_CNT,
+                       CNTR_SYNTH | CNTR_VL),
+[C_TX_PKT_VL] = TXE64_PORT_CNTR_ELEM(TxPktVL, SEND_DATA_PKT_VL0_CNT,
+                       CNTR_SYNTH | CNTR_VL),
+[C_TX_WAIT_VL] = TXE64_PORT_CNTR_ELEM(TxWaitVL, SEND_WAIT_VL0_CNT,
+                       CNTR_SYNTH | CNTR_VL),
+[C_RX_PKT] = RXE64_PORT_CNTR_ELEM(RxPkt, RCV_DATA_PKT_CNT, CNTR_NORMAL),
+[C_RX_WORDS] = RXE64_PORT_CNTR_ELEM(RxWords, RCV_DWORD_CNT, CNTR_NORMAL),
+[C_SW_LINK_DOWN] = CNTR_ELEM("SwLinkDown", 0, 0, CNTR_SYNTH | CNTR_32BIT,
+                       access_sw_link_dn_cnt),
+[C_SW_LINK_UP] = CNTR_ELEM("SwLinkUp", 0, 0, CNTR_SYNTH | CNTR_32BIT,
+                       access_sw_link_up_cnt),
+[C_SW_XMIT_DSCD] = CNTR_ELEM("XmitDscd", 0, 0, CNTR_SYNTH | CNTR_32BIT,
+                       access_sw_xmit_discards),
+[C_SW_XMIT_DSCD_VL] = CNTR_ELEM("XmitDscdVl", 0, 0,
+                       CNTR_SYNTH | CNTR_32BIT | CNTR_VL,
+                       access_sw_xmit_discards),
+[C_SW_XMIT_CSTR_ERR] = CNTR_ELEM("XmitCstrErr", 0, 0, CNTR_SYNTH,
+                       access_xmit_constraint_errs),
+[C_SW_RCV_CSTR_ERR] = CNTR_ELEM("RcvCstrErr", 0, 0, CNTR_SYNTH,
+                       access_rcv_constraint_errs),
+[C_SW_IBP_LOOP_PKTS] = SW_IBP_CNTR(LoopPkts, loop_pkts),
+[C_SW_IBP_RC_RESENDS] = SW_IBP_CNTR(RcResend, rc_resends),
+[C_SW_IBP_RNR_NAKS] = SW_IBP_CNTR(RnrNak, rnr_naks),
+[C_SW_IBP_OTHER_NAKS] = SW_IBP_CNTR(OtherNak, other_naks),
+[C_SW_IBP_RC_TIMEOUTS] = SW_IBP_CNTR(RcTimeOut, rc_timeouts),
+[C_SW_IBP_PKT_DROPS] = SW_IBP_CNTR(PktDrop, pkt_drops),
+[C_SW_IBP_DMA_WAIT] = SW_IBP_CNTR(DmaWait, dmawait),
+[C_SW_IBP_RC_SEQNAK] = SW_IBP_CNTR(RcSeqNak, rc_seqnak),
+[C_SW_IBP_RC_DUPREQ] = SW_IBP_CNTR(RcDupReq, rc_dupreq),
+[C_SW_IBP_RDMA_SEQ] = SW_IBP_CNTR(RdmaSeq, rdma_seq),
+[C_SW_IBP_UNALIGNED] = SW_IBP_CNTR(Unaligned, unaligned),
+[C_SW_IBP_SEQ_NAK] = SW_IBP_CNTR(SeqNak, seq_naks),
+[C_SW_CPU_RC_ACKS] = CNTR_ELEM("RcAcks", 0, 0, CNTR_NORMAL,
+                              access_sw_cpu_rc_acks),
+[C_SW_CPU_RC_QACKS] = CNTR_ELEM("RcQacks", 0, 0, CNTR_NORMAL,
+                              access_sw_cpu_rc_qacks),
+[C_SW_CPU_RC_DELAYED_COMP] = CNTR_ELEM("RcDelayComp", 0, 0, CNTR_NORMAL,
+                              access_sw_cpu_rc_delayed_comp),
+[OVR_LBL(0)] = OVR_ELM(0), [OVR_LBL(1)] = OVR_ELM(1),
+[OVR_LBL(2)] = OVR_ELM(2), [OVR_LBL(3)] = OVR_ELM(3),
+[OVR_LBL(4)] = OVR_ELM(4), [OVR_LBL(5)] = OVR_ELM(5),
+[OVR_LBL(6)] = OVR_ELM(6), [OVR_LBL(7)] = OVR_ELM(7),
+[OVR_LBL(8)] = OVR_ELM(8), [OVR_LBL(9)] = OVR_ELM(9),
+[OVR_LBL(10)] = OVR_ELM(10), [OVR_LBL(11)] = OVR_ELM(11),
+[OVR_LBL(12)] = OVR_ELM(12), [OVR_LBL(13)] = OVR_ELM(13),
+[OVR_LBL(14)] = OVR_ELM(14), [OVR_LBL(15)] = OVR_ELM(15),
+[OVR_LBL(16)] = OVR_ELM(16), [OVR_LBL(17)] = OVR_ELM(17),
+[OVR_LBL(18)] = OVR_ELM(18), [OVR_LBL(19)] = OVR_ELM(19),
+[OVR_LBL(20)] = OVR_ELM(20), [OVR_LBL(21)] = OVR_ELM(21),
+[OVR_LBL(22)] = OVR_ELM(22), [OVR_LBL(23)] = OVR_ELM(23),
+[OVR_LBL(24)] = OVR_ELM(24), [OVR_LBL(25)] = OVR_ELM(25),
+[OVR_LBL(26)] = OVR_ELM(26), [OVR_LBL(27)] = OVR_ELM(27),
+[OVR_LBL(28)] = OVR_ELM(28), [OVR_LBL(29)] = OVR_ELM(29),
+[OVR_LBL(30)] = OVR_ELM(30), [OVR_LBL(31)] = OVR_ELM(31),
+[OVR_LBL(32)] = OVR_ELM(32), [OVR_LBL(33)] = OVR_ELM(33),
+[OVR_LBL(34)] = OVR_ELM(34), [OVR_LBL(35)] = OVR_ELM(35),
+[OVR_LBL(36)] = OVR_ELM(36), [OVR_LBL(37)] = OVR_ELM(37),
+[OVR_LBL(38)] = OVR_ELM(38), [OVR_LBL(39)] = OVR_ELM(39),
+[OVR_LBL(40)] = OVR_ELM(40), [OVR_LBL(41)] = OVR_ELM(41),
+[OVR_LBL(42)] = OVR_ELM(42), [OVR_LBL(43)] = OVR_ELM(43),
+[OVR_LBL(44)] = OVR_ELM(44), [OVR_LBL(45)] = OVR_ELM(45),
+[OVR_LBL(46)] = OVR_ELM(46), [OVR_LBL(47)] = OVR_ELM(47),
+[OVR_LBL(48)] = OVR_ELM(48), [OVR_LBL(49)] = OVR_ELM(49),
+[OVR_LBL(50)] = OVR_ELM(50), [OVR_LBL(51)] = OVR_ELM(51),
+[OVR_LBL(52)] = OVR_ELM(52), [OVR_LBL(53)] = OVR_ELM(53),
+[OVR_LBL(54)] = OVR_ELM(54), [OVR_LBL(55)] = OVR_ELM(55),
+[OVR_LBL(56)] = OVR_ELM(56), [OVR_LBL(57)] = OVR_ELM(57),
+[OVR_LBL(58)] = OVR_ELM(58), [OVR_LBL(59)] = OVR_ELM(59),
+[OVR_LBL(60)] = OVR_ELM(60), [OVR_LBL(61)] = OVR_ELM(61),
+[OVR_LBL(62)] = OVR_ELM(62), [OVR_LBL(63)] = OVR_ELM(63),
+[OVR_LBL(64)] = OVR_ELM(64), [OVR_LBL(65)] = OVR_ELM(65),
+[OVR_LBL(66)] = OVR_ELM(66), [OVR_LBL(67)] = OVR_ELM(67),
+[OVR_LBL(68)] = OVR_ELM(68), [OVR_LBL(69)] = OVR_ELM(69),
+[OVR_LBL(70)] = OVR_ELM(70), [OVR_LBL(71)] = OVR_ELM(71),
+[OVR_LBL(72)] = OVR_ELM(72), [OVR_LBL(73)] = OVR_ELM(73),
+[OVR_LBL(74)] = OVR_ELM(74), [OVR_LBL(75)] = OVR_ELM(75),
+[OVR_LBL(76)] = OVR_ELM(76), [OVR_LBL(77)] = OVR_ELM(77),
+[OVR_LBL(78)] = OVR_ELM(78), [OVR_LBL(79)] = OVR_ELM(79),
+[OVR_LBL(80)] = OVR_ELM(80), [OVR_LBL(81)] = OVR_ELM(81),
+[OVR_LBL(82)] = OVR_ELM(82), [OVR_LBL(83)] = OVR_ELM(83),
+[OVR_LBL(84)] = OVR_ELM(84), [OVR_LBL(85)] = OVR_ELM(85),
+[OVR_LBL(86)] = OVR_ELM(86), [OVR_LBL(87)] = OVR_ELM(87),
+[OVR_LBL(88)] = OVR_ELM(88), [OVR_LBL(89)] = OVR_ELM(89),
+[OVR_LBL(90)] = OVR_ELM(90), [OVR_LBL(91)] = OVR_ELM(91),
+[OVR_LBL(92)] = OVR_ELM(92), [OVR_LBL(93)] = OVR_ELM(93),
+[OVR_LBL(94)] = OVR_ELM(94), [OVR_LBL(95)] = OVR_ELM(95),
+[OVR_LBL(96)] = OVR_ELM(96), [OVR_LBL(97)] = OVR_ELM(97),
+[OVR_LBL(98)] = OVR_ELM(98), [OVR_LBL(99)] = OVR_ELM(99),
+[OVR_LBL(100)] = OVR_ELM(100), [OVR_LBL(101)] = OVR_ELM(101),
+[OVR_LBL(102)] = OVR_ELM(102), [OVR_LBL(103)] = OVR_ELM(103),
+[OVR_LBL(104)] = OVR_ELM(104), [OVR_LBL(105)] = OVR_ELM(105),
+[OVR_LBL(106)] = OVR_ELM(106), [OVR_LBL(107)] = OVR_ELM(107),
+[OVR_LBL(108)] = OVR_ELM(108), [OVR_LBL(109)] = OVR_ELM(109),
+[OVR_LBL(110)] = OVR_ELM(110), [OVR_LBL(111)] = OVR_ELM(111),
+[OVR_LBL(112)] = OVR_ELM(112), [OVR_LBL(113)] = OVR_ELM(113),
+[OVR_LBL(114)] = OVR_ELM(114), [OVR_LBL(115)] = OVR_ELM(115),
+[OVR_LBL(116)] = OVR_ELM(116), [OVR_LBL(117)] = OVR_ELM(117),
+[OVR_LBL(118)] = OVR_ELM(118), [OVR_LBL(119)] = OVR_ELM(119),
+[OVR_LBL(120)] = OVR_ELM(120), [OVR_LBL(121)] = OVR_ELM(121),
+[OVR_LBL(122)] = OVR_ELM(122), [OVR_LBL(123)] = OVR_ELM(123),
+[OVR_LBL(124)] = OVR_ELM(124), [OVR_LBL(125)] = OVR_ELM(125),
+[OVR_LBL(126)] = OVR_ELM(126), [OVR_LBL(127)] = OVR_ELM(127),
+[OVR_LBL(128)] = OVR_ELM(128), [OVR_LBL(129)] = OVR_ELM(129),
+[OVR_LBL(130)] = OVR_ELM(130), [OVR_LBL(131)] = OVR_ELM(131),
+[OVR_LBL(132)] = OVR_ELM(132), [OVR_LBL(133)] = OVR_ELM(133),
+[OVR_LBL(134)] = OVR_ELM(134), [OVR_LBL(135)] = OVR_ELM(135),
+[OVR_LBL(136)] = OVR_ELM(136), [OVR_LBL(137)] = OVR_ELM(137),
+[OVR_LBL(138)] = OVR_ELM(138), [OVR_LBL(139)] = OVR_ELM(139),
+[OVR_LBL(140)] = OVR_ELM(140), [OVR_LBL(141)] = OVR_ELM(141),
+[OVR_LBL(142)] = OVR_ELM(142), [OVR_LBL(143)] = OVR_ELM(143),
+[OVR_LBL(144)] = OVR_ELM(144), [OVR_LBL(145)] = OVR_ELM(145),
+[OVR_LBL(146)] = OVR_ELM(146), [OVR_LBL(147)] = OVR_ELM(147),
+[OVR_LBL(148)] = OVR_ELM(148), [OVR_LBL(149)] = OVR_ELM(149),
+[OVR_LBL(150)] = OVR_ELM(150), [OVR_LBL(151)] = OVR_ELM(151),
+[OVR_LBL(152)] = OVR_ELM(152), [OVR_LBL(153)] = OVR_ELM(153),
+[OVR_LBL(154)] = OVR_ELM(154), [OVR_LBL(155)] = OVR_ELM(155),
+[OVR_LBL(156)] = OVR_ELM(156), [OVR_LBL(157)] = OVR_ELM(157),
+[OVR_LBL(158)] = OVR_ELM(158), [OVR_LBL(159)] = OVR_ELM(159),
+};
+
+/* ======================================================================== */
+
+/* return true if this is chip revision a0 */
+int is_a0(struct hfi1_devdata *dd)
+{
+       return ((dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT)
+                       & CCE_REVISION_CHIP_REV_MINOR_MASK) == 0;
+}
+
+/* return true if this is chip revision a */
+int is_ax(struct hfi1_devdata *dd)
+{
+       u8 chip_rev_minor =
+               dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT
+                       & CCE_REVISION_CHIP_REV_MINOR_MASK;
+       return (chip_rev_minor & 0xf0) == 0;
+}
+
+/* return true if this is chip revision b */
+int is_bx(struct hfi1_devdata *dd)
+{
+       u8 chip_rev_minor =
+               dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT
+                       & CCE_REVISION_CHIP_REV_MINOR_MASK;
+       return !!(chip_rev_minor & 0x10);
+}
+
+/*
+ * Append string s to buffer buf.  Arguments curp and lenp point to the
+ * current position and remaining length, respectively; both are updated.
+ *
+ * return 0 on success, 1 on out of room
+ */
+static int append_str(char *buf, char **curp, int *lenp, const char *s)
+{
+       char *p = *curp;
+       int len = *lenp;
+       int result = 0; /* success */
+       char c;
+
+       /* add a comma if this is not the first entry in the buffer */
+       if (p != buf) {
+               if (len == 0) {
+                       result = 1; /* out of room */
+                       goto done;
+               }
+               *p++ = ',';
+               len--;
+       }
+
+       /* copy the string */
+       while ((c = *s++) != 0) {
+               if (len == 0) {
+                       result = 1; /* out of room */
+                       goto done;
+               }
+               *p++ = c;
+               len--;
+       }
+
+done:
+       /* write return values */
+       *curp = p;
+       *lenp = len;
+
+       return result;
+}
+
+/*
+ * Using the given flag table, print a comma separated string into
+ * the buffer.  End in '*' if the buffer is too short.
+ */
+static char *flag_string(char *buf, int buf_len, u64 flags,
+                               struct flag_table *table, int table_size)
+{
+       char extra[32];
+       char *p = buf;
+       int len = buf_len;
+       int no_room = 0;
+       int i;
+
+       /* need at least 2 bytes so we can form "*" plus the terminating nul */
+       if (len < 2)
+               return "";
+
+       len--;  /* leave room for a nul */
+       for (i = 0; i < table_size; i++) {
+               if (flags & table[i].flag) {
+                       no_room = append_str(buf, &p, &len, table[i].str);
+                       if (no_room)
+                               break;
+                       flags &= ~table[i].flag;
+               }
+       }
+
+       /* any undocumented bits left? */
+       if (!no_room && flags) {
+               snprintf(extra, sizeof(extra), "bits 0x%llx", flags);
+               no_room = append_str(buf, &p, &len, extra);
+       }
+
+       /* add * if ran out of room */
+       if (no_room) {
+               /* may need to back up to add space for a '*' */
+               if (len == 0)
+                       --p;
+               *p++ = '*';
+       }
+
+       /* add final nul - space already allocated above */
+       *p = 0;
+       return buf;
+}
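+
+/*
+ * Example output: with the dcc_err_flags table above, a value with the
+ * bad_l2_err and bad_sc_err bits set prints as "bad_l2_err,bad_sc_err";
+ * bits not found in the table are appended as "bits 0x<value>", and a
+ * too-small buffer ends in '*'.
+ */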
+
+/* first 8 CCE error interrupt source names */
+static const char * const cce_misc_names[] = {
+       "CceErrInt",            /* 0 */
+       "RxeErrInt",            /* 1 */
+       "MiscErrInt",           /* 2 */
+       "Reserved3",            /* 3 */
+       "PioErrInt",            /* 4 */
+       "SDmaErrInt",           /* 5 */
+       "EgressErrInt",         /* 6 */
+       "TxeErrInt"             /* 7 */
+};
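+
+/* Note: these names line up, index for index, with the misc_errs[] table. */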
+
+/*
+ * Return the miscellaneous error interrupt name.
+ */
+static char *is_misc_err_name(char *buf, size_t bsize, unsigned int source)
+{
+       if (source < ARRAY_SIZE(cce_misc_names))
+               strncpy(buf, cce_misc_names[source], bsize);
+       else
+               snprintf(buf,
+                       bsize,
+                       "Reserved%u",
+                       source + IS_GENERAL_ERR_START);
+
+       return buf;
+}
+
+/*
+ * Return the SDMA engine error interrupt name.
+ */
+static char *is_sdma_eng_err_name(char *buf, size_t bsize, unsigned int source)
+{
+       snprintf(buf, bsize, "SDmaEngErrInt%u", source);
+       return buf;
+}
+
+/*
+ * Return the send context error interrupt name.
+ */
+static char *is_sendctxt_err_name(char *buf, size_t bsize, unsigned int source)
+{
+       snprintf(buf, bsize, "SendCtxtErrInt%u", source);
+       return buf;
+}
+
+static const char * const various_names[] = {
+       "PbcInt",
+       "GpioAssertInt",
+       "Qsfp1Int",
+       "Qsfp2Int",
+       "TCritInt"
+};
+
+/*
+ * Return the various interrupt name.
+ */
+static char *is_various_name(char *buf, size_t bsize, unsigned int source)
+{
+       if (source < ARRAY_SIZE(various_names))
+               strncpy(buf, various_names[source], bsize);
+       else
+               snprintf(buf, bsize, "Reserved%u", source+IS_VARIOUS_START);
+       return buf;
+}
+
+/*
+ * Return the DC interrupt name.
+ */
+static char *is_dc_name(char *buf, size_t bsize, unsigned int source)
+{
+       static const char * const dc_int_names[] = {
+               "common",
+               "lcb",
+               "8051",
+               "lbm"   /* local block merge */
+       };
+
+       if (source < ARRAY_SIZE(dc_int_names))
+               snprintf(buf, bsize, "dc_%s_int", dc_int_names[source]);
+       else
+               snprintf(buf, bsize, "DCInt%u", source);
+       return buf;
+}
+
+static const char * const sdma_int_names[] = {
+       "SDmaInt",
+       "SdmaIdleInt",
+       "SdmaProgressInt",
+};
+
+/*
+ * Return the SDMA engine interrupt name.
+ */
+static char *is_sdma_eng_name(char *buf, size_t bsize, unsigned int source)
+{
+       /* what interrupt */
+       unsigned int what  = source / TXE_NUM_SDMA_ENGINES;
+       /* which engine */
+       unsigned int which = source % TXE_NUM_SDMA_ENGINES;
+
+       if (likely(what < 3))
+               snprintf(buf, bsize, "%s%u", sdma_int_names[what], which);
+       else
+               snprintf(buf, bsize, "Invalid SDMA interrupt %u", source);
+       return buf;
+}
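+
+/*
+ * For example, assuming TXE_NUM_SDMA_ENGINES == 16: source 17 decodes
+ * to what == 1 and which == 1, giving the name "SdmaIdleInt1".
+ */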
+
+/*
+ * Return the receive available interrupt name.
+ */
+static char *is_rcv_avail_name(char *buf, size_t bsize, unsigned int source)
+{
+       snprintf(buf, bsize, "RcvAvailInt%u", source);
+       return buf;
+}
+
+/*
+ * Return the receive urgent interrupt name.
+ */
+static char *is_rcv_urgent_name(char *buf, size_t bsize, unsigned int source)
+{
+       snprintf(buf, bsize, "RcvUrgentInt%u", source);
+       return buf;
+}
+
+/*
+ * Return the send credit interrupt name.
+ */
+static char *is_send_credit_name(char *buf, size_t bsize, unsigned int source)
+{
+       snprintf(buf, bsize, "SendCreditInt%u", source);
+       return buf;
+}
+
+/*
+ * Return the reserved interrupt name.
+ */
+static char *is_reserved_name(char *buf, size_t bsize, unsigned int source)
+{
+       snprintf(buf, bsize, "Reserved%u", source + IS_RESERVED_START);
+       return buf;
+}
+
+static char *cce_err_status_string(char *buf, int buf_len, u64 flags)
+{
+       return flag_string(buf, buf_len, flags,
+                       cce_err_status_flags, ARRAY_SIZE(cce_err_status_flags));
+}
+
+static char *rxe_err_status_string(char *buf, int buf_len, u64 flags)
+{
+       return flag_string(buf, buf_len, flags,
+                       rxe_err_status_flags, ARRAY_SIZE(rxe_err_status_flags));
+}
+
+static char *misc_err_status_string(char *buf, int buf_len, u64 flags)
+{
+       return flag_string(buf, buf_len, flags, misc_err_status_flags,
+                       ARRAY_SIZE(misc_err_status_flags));
+}
+
+static char *pio_err_status_string(char *buf, int buf_len, u64 flags)
+{
+       return flag_string(buf, buf_len, flags,
+                       pio_err_status_flags, ARRAY_SIZE(pio_err_status_flags));
+}
+
+static char *sdma_err_status_string(char *buf, int buf_len, u64 flags)
+{
+       return flag_string(buf, buf_len, flags,
+                       sdma_err_status_flags,
+                       ARRAY_SIZE(sdma_err_status_flags));
+}
+
+static char *egress_err_status_string(char *buf, int buf_len, u64 flags)
+{
+       return flag_string(buf, buf_len, flags,
+               egress_err_status_flags, ARRAY_SIZE(egress_err_status_flags));
+}
+
+static char *egress_err_info_string(char *buf, int buf_len, u64 flags)
+{
+       return flag_string(buf, buf_len, flags,
+               egress_err_info_flags, ARRAY_SIZE(egress_err_info_flags));
+}
+
+static char *send_err_status_string(char *buf, int buf_len, u64 flags)
+{
+       return flag_string(buf, buf_len, flags,
+                       send_err_status_flags,
+                       ARRAY_SIZE(send_err_status_flags));
+}
+
+static void handle_cce_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+       char buf[96];
+
+       /*
+        * For most of these errors, there is nothing that can be done except
+        * report or record it.
+        */
+       dd_dev_info(dd, "CCE Error: %s\n",
+               cce_err_status_string(buf, sizeof(buf), reg));
+
+       if ((reg & CCE_ERR_STATUS_CCE_CLI2_ASYNC_FIFO_PARITY_ERR_SMASK)
+                       && is_a0(dd)
+                       && (dd->icode != ICODE_FUNCTIONAL_SIMULATOR)) {
+               /* this error requires a manual drop into SPC freeze
+                * mode, then a fix up */
+               start_freeze_handling(dd->pport, FREEZE_SELF);
+       }
+}
+
+/*
+ * Check counters for receive errors that do not have an interrupt
+ * associated with them.
+ */
+#define RCVERR_CHECK_TIME 10
+static void update_rcverr_timer(unsigned long opaque)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)opaque;
+       struct hfi1_pportdata *ppd = dd->pport;
+       u32 cur_ovfl_cnt = read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL);
+
+       if (dd->rcv_ovfl_cnt < cur_ovfl_cnt &&
+               ppd->port_error_action & OPA_PI_MASK_EX_BUFFER_OVERRUN) {
+               dd_dev_info(dd, "%s: PortErrorAction bounce\n", __func__);
+               set_link_down_reason(ppd,
+                 OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN, 0,
+                       OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN);
+               queue_work(ppd->hfi1_wq, &ppd->link_bounce_work);
+       }
+       dd->rcv_ovfl_cnt = (u32) cur_ovfl_cnt;
+
+       mod_timer(&dd->rcverr_timer, jiffies + HZ * RCVERR_CHECK_TIME);
+}
+
+static int init_rcverr(struct hfi1_devdata *dd)
+{
+       init_timer(&dd->rcverr_timer);
+       dd->rcverr_timer.function = update_rcverr_timer;
+       dd->rcverr_timer.data = (unsigned long) dd;
+       /* Assume the hardware counter has been reset */
+       dd->rcv_ovfl_cnt = 0;
+       return mod_timer(&dd->rcverr_timer, jiffies + HZ * RCVERR_CHECK_TIME);
+}
+
+static void free_rcverr(struct hfi1_devdata *dd)
+{
+       if (dd->rcverr_timer.data)
+               del_timer_sync(&dd->rcverr_timer);
+       dd->rcverr_timer.data = 0;
+}
+
+static void handle_rxe_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+       char buf[96];
+
+       dd_dev_info(dd, "Receive Error: %s\n",
+               rxe_err_status_string(buf, sizeof(buf), reg));
+
+       if (reg & ALL_RXE_FREEZE_ERR) {
+               int flags = 0;
+
+               /*
+                * Freeze mode recovery is disabled for the errors
+                * in RXE_FREEZE_ABORT_MASK
+                */
+               if (is_a0(dd) && (reg & RXE_FREEZE_ABORT_MASK))
+                       flags = FREEZE_ABORT;
+
+               start_freeze_handling(dd->pport, flags);
+       }
+}
+
+static void handle_misc_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+       char buf[96];
+
+       dd_dev_info(dd, "Misc Error: %s",
+               misc_err_status_string(buf, sizeof(buf), reg));
+}
+
+static void handle_pio_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+       char buf[96];
+
+       dd_dev_info(dd, "PIO Error: %s\n",
+               pio_err_status_string(buf, sizeof(buf), reg));
+
+       if (reg & ALL_PIO_FREEZE_ERR)
+               start_freeze_handling(dd->pport, 0);
+}
+
+static void handle_sdma_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+       char buf[96];
+
+       dd_dev_info(dd, "SDMA Error: %s\n",
+               sdma_err_status_string(buf, sizeof(buf), reg));
+
+       if (reg & ALL_SDMA_FREEZE_ERR)
+               start_freeze_handling(dd->pport, 0);
+}
+
+static void count_port_inactive(struct hfi1_devdata *dd)
+{
+       struct hfi1_pportdata *ppd = dd->pport;
+
+       if (ppd->port_xmit_discards < ~(u64)0)
+               ppd->port_xmit_discards++;
+}
+
+/*
+ * We have had a "disallowed packet" error during egress. Determine the
+ * integrity check which failed, and update relevant error counter, etc.
+ *
+ * Note that the SEND_EGRESS_ERR_INFO register has only a single
+ * bit of state per integrity check: once that bit has been read and
+ * cleared, additional packets failing the same integrity check are
+ * not individually reported, so the specific reason can be missed.
+ */
+static void handle_send_egress_err_info(struct hfi1_devdata *dd)
+{
+       struct hfi1_pportdata *ppd = dd->pport;
+       u64 src = read_csr(dd, SEND_EGRESS_ERR_SOURCE); /* read first */
+       u64 info = read_csr(dd, SEND_EGRESS_ERR_INFO);
+       char buf[96];
+
+       /* clear down all observed info as quickly as possible after read */
+       write_csr(dd, SEND_EGRESS_ERR_INFO, info);
+
+       dd_dev_info(dd,
+               "Egress Error Info: 0x%llx, %s Egress Error Src 0x%llx\n",
+               info, egress_err_info_string(buf, sizeof(buf), info), src);
+
+       /* Eventually add other counters for each bit */
+
+       if (info & SEND_EGRESS_ERR_INFO_TOO_LONG_IB_PACKET_ERR_SMASK) {
+               if (ppd->port_xmit_discards < ~(u64)0)
+                       ppd->port_xmit_discards++;
+       }
+}
+
+/*
+ * Input value is a bit position within the SEND_EGRESS_ERR_STATUS
+ * register. Does it represent a 'port inactive' error?
+ */
+static inline int port_inactive_err(u64 posn)
+{
+       return (posn >= SEES(TX_LINKDOWN) &&
+               posn <= SEES(TX_INCORRECT_LINK_STATE));
+}
+
+/*
+ * Input value is a bit position within the SEND_EGRESS_ERR_STATUS
+ * register. Does it represent a 'disallowed packet' error?
+ */
+static inline int disallowed_pkt_err(u64 posn)
+{
+       return (posn >= SEES(TX_SDMA0_DISALLOWED_PACKET) &&
+               posn <= SEES(TX_SDMA15_DISALLOWED_PACKET));
+}
+
+static void handle_egress_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+       u64 reg_copy = reg, handled = 0;
+       char buf[96];
+
+       if (reg & ALL_TXE_EGRESS_FREEZE_ERR)
+               start_freeze_handling(dd->pport, 0);
+       if (is_a0(dd) && (reg &
+                   SEND_EGRESS_ERR_STATUS_TX_CREDIT_RETURN_VL_ERR_SMASK)
+                   && (dd->icode != ICODE_FUNCTIONAL_SIMULATOR))
+               start_freeze_handling(dd->pport, 0);
+
+       while (reg_copy) {
+               int posn = fls64(reg_copy);
+               /*
+                * fls64() returns a 1-based offset, but we generally
+                * want 0-based offsets.
+                */
+               int shift = posn - 1;
+
+               if (port_inactive_err(shift)) {
+                       count_port_inactive(dd);
+                       handled |= (1ULL << shift);
+               } else if (disallowed_pkt_err(shift)) {
+                       handle_send_egress_err_info(dd);
+                       handled |= (1ULL << shift);
+               }
+               clear_bit(shift, (unsigned long *)&reg_copy);
+       }
+
+       reg &= ~handled;
+
+       if (reg)
+               dd_dev_info(dd, "Egress Error: %s\n",
+                       egress_err_status_string(buf, sizeof(buf), reg));
+}
+
+static void handle_txe_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+       char buf[96];
+
+       dd_dev_info(dd, "Send Error: %s\n",
+               send_err_status_string(buf, sizeof(buf), reg));
+}
+
+/*
+ * The maximum number of times the error clear down will loop before
+ * blocking a repeating error.  This value is arbitrary.
+ */
+#define MAX_CLEAR_COUNT 20
+
+/*
+ * Clear and handle an error register.  All error interrupts are funneled
+ * through here to have a central location to correctly handle single-
+ * or multi-shot errors.
+ *
+ * For non per-context registers, call this routine with a context value
+ * of 0 so the per-context offset is zero.
+ *
+ * If the handler loops too many times, assume that something is wrong
+ * and can't be fixed, so mask the error bits.
+ */
+static void interrupt_clear_down(struct hfi1_devdata *dd,
+                                u32 context,
+                                const struct err_reg_info *eri)
+{
+       u64 reg;
+       u32 count;
+
+       /* read in a loop until no more errors are seen */
+       count = 0;
+       while (1) {
+               reg = read_kctxt_csr(dd, context, eri->status);
+               if (reg == 0)
+                       break;
+               write_kctxt_csr(dd, context, eri->clear, reg);
+               if (likely(eri->handler))
+                       eri->handler(dd, context, reg);
+               count++;
+               if (count > MAX_CLEAR_COUNT) {
+                       u64 mask;
+
+                       dd_dev_err(dd, "Repeating %s bits 0x%llx - masking\n",
+                               eri->desc, reg);
+                       /*
+                        * Read-modify-write so any other masked bits
+                        * remain masked.
+                        */
+                       mask = read_kctxt_csr(dd, context, eri->mask);
+                       mask &= ~reg;
+                       write_kctxt_csr(dd, context, eri->mask, mask);
+                       break;
+               }
+       }
+}
+
+/*
+ * CCE block "misc" interrupt.  Source is < 16.
+ */
+static void is_misc_err_int(struct hfi1_devdata *dd, unsigned int source)
+{
+       const struct err_reg_info *eri = &misc_errs[source];
+
+       if (eri->handler) {
+               interrupt_clear_down(dd, 0, eri);
+       } else {
+               dd_dev_err(dd, "Unexpected misc interrupt (%u) - reserved\n",
+                       source);
+       }
+}
+
+static char *send_context_err_status_string(char *buf, int buf_len, u64 flags)
+{
+       return flag_string(buf, buf_len, flags,
+                       sc_err_status_flags, ARRAY_SIZE(sc_err_status_flags));
+}
+
+/*
+ * Send context error interrupt.  Source (hw_context) is < 160.
+ *
+ * All send context errors cause the send context to halt.  The normal
+ * clear-down mechanism cannot be used because we cannot clear the
+ * error bits until several other long-running items are done first.
+ * This is OK because with the context halted, nothing else is going
+ * to happen on it anyway.
+ */
+static void is_sendctxt_err_int(struct hfi1_devdata *dd,
+                               unsigned int hw_context)
+{
+       struct send_context_info *sci;
+       struct send_context *sc;
+       char flags[96];
+       u64 status;
+       u32 sw_index;
+
+       sw_index = dd->hw_to_sw[hw_context];
+       if (sw_index >= dd->num_send_contexts) {
+               dd_dev_err(dd,
+                       "out of range sw index %u for send context %u\n",
+                       sw_index, hw_context);
+               return;
+       }
+       sci = &dd->send_contexts[sw_index];
+       sc = sci->sc;
+       if (!sc) {
+               dd_dev_err(dd, "%s: context %u(%u): no sc?\n", __func__,
+                       sw_index, hw_context);
+               return;
+       }
+
+       /* tell the software that a halt has begun */
+       sc_stop(sc, SCF_HALTED);
+
+       status = read_kctxt_csr(dd, hw_context, SEND_CTXT_ERR_STATUS);
+
+       dd_dev_info(dd, "Send Context %u(%u) Error: %s\n", sw_index, hw_context,
+               send_context_err_status_string(flags, sizeof(flags), status));
+
+       if (status & SEND_CTXT_ERR_STATUS_PIO_DISALLOWED_PACKET_ERR_SMASK)
+               handle_send_egress_err_info(dd);
+
+       /*
+        * Automatically restart halted kernel contexts out of interrupt
+        * context.  User contexts must ask the driver to restart the context.
+        */
+       if (sc->type != SC_USER)
+               queue_work(dd->pport->hfi1_wq, &sc->halt_work);
+}
+
+static void handle_sdma_eng_err(struct hfi1_devdata *dd,
+                               unsigned int source, u64 status)
+{
+       struct sdma_engine *sde;
+
+       sde = &dd->per_sdma[source];
+#ifdef CONFIG_SDMA_VERBOSITY
+       dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
+                  slashstrip(__FILE__), __LINE__, __func__);
+       dd_dev_err(sde->dd, "CONFIG SDMA(%u) source: %u status 0x%llx\n",
+                  sde->this_idx, source, (unsigned long long)status);
+#endif
+       sdma_engine_error(sde, status);
+}
+
+/*
+ * CCE block SDMA error interrupt.  Source is < 16.
+ */
+static void is_sdma_eng_err_int(struct hfi1_devdata *dd, unsigned int source)
+{
+#ifdef CONFIG_SDMA_VERBOSITY
+       struct sdma_engine *sde = &dd->per_sdma[source];
+
+       dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
+                  slashstrip(__FILE__), __LINE__, __func__);
+       dd_dev_err(dd, "CONFIG SDMA(%u) source: %u\n", sde->this_idx,
+                  source);
+       sdma_dumpstate(sde);
+#endif
+       interrupt_clear_down(dd, source, &sdma_eng_err);
+}
+
+/*
+ * CCE block "various" interrupt.  Source is < 8.
+ */
+static void is_various_int(struct hfi1_devdata *dd, unsigned int source)
+{
+       const struct err_reg_info *eri = &various_err[source];
+
+       /*
+        * TCritInt cannot go through interrupt_clear_down()
+        * because it is not a second tier interrupt. The handler
+        * should be called directly.
+        */
+       if (source == TCRIT_INT_SOURCE)
+               handle_temp_err(dd);
+       else if (eri->handler)
+               interrupt_clear_down(dd, 0, eri);
+       else
+               dd_dev_info(dd,
+                       "%s: Unimplemented/reserved interrupt %d\n",
+                       __func__, source);
+}
+
+static void handle_qsfp_int(struct hfi1_devdata *dd, u32 src_ctx, u64 reg)
+{
+       /* source is always zero */
+       struct hfi1_pportdata *ppd = dd->pport;
+       unsigned long flags;
+       u64 qsfp_int_mgmt = (u64)(QSFP_HFI0_INT_N | QSFP_HFI0_MODPRST_N);
+
+       if (reg & QSFP_HFI0_MODPRST_N) {
+
+               dd_dev_info(dd, "%s: ModPresent triggered QSFP interrupt\n",
+                               __func__);
+
+               if (!qsfp_mod_present(ppd)) {
+                       ppd->driver_link_ready = 0;
+                       /*
+                        * Cable removed, reset all our information about the
+                        * cache and cable capabilities
+                        */
+
+                       spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
+                       /*
+                        * We don't set cache_refresh_required here as we expect
+                        * an interrupt when a cable is inserted
+                        */
+                       ppd->qsfp_info.cache_valid = 0;
+                       ppd->qsfp_info.qsfp_interrupt_functional = 0;
+                       spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock,
+                                               flags);
+                       write_csr(dd,
+                                       dd->hfi1_id ?
+                                               ASIC_QSFP2_INVERT :
+                                               ASIC_QSFP1_INVERT,
+                               qsfp_int_mgmt);
+                       if (ppd->host_link_state == HLS_DN_POLL) {
+                               /*
+                                * The link is still in POLL. This means
+                                * that the normal link down processing
+                                * will not happen. We have to do it here
+                                * before turning the DC off.
+                                */
+                               queue_work(ppd->hfi1_wq, &ppd->link_down_work);
+                       }
+               } else {
+                       spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
+                       ppd->qsfp_info.cache_valid = 0;
+                       ppd->qsfp_info.cache_refresh_required = 1;
+                       spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock,
+                                               flags);
+
+                       qsfp_int_mgmt &= ~(u64)QSFP_HFI0_MODPRST_N;
+                       write_csr(dd,
+                                       dd->hfi1_id ?
+                                               ASIC_QSFP2_INVERT :
+                                               ASIC_QSFP1_INVERT,
+                               qsfp_int_mgmt);
+               }
+       }
+
+       if (reg & QSFP_HFI0_INT_N) {
+
+               dd_dev_info(dd, "%s: IntN triggered QSFP interrupt\n",
+                               __func__);
+               spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
+               ppd->qsfp_info.check_interrupt_flags = 1;
+               ppd->qsfp_info.qsfp_interrupt_functional = 1;
+               spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, flags);
+       }
+
+       /* Schedule the QSFP work only if there is a cable attached. */
+       if (qsfp_mod_present(ppd))
+               queue_work(ppd->hfi1_wq, &ppd->qsfp_info.qsfp_work);
+}
+
+static int request_host_lcb_access(struct hfi1_devdata *dd)
+{
+       int ret;
+
+       ret = do_8051_command(dd, HCMD_MISC,
+               (u64)HCMD_MISC_REQUEST_LCB_ACCESS << LOAD_DATA_FIELD_ID_SHIFT,
+               NULL);
+       if (ret != HCMD_SUCCESS) {
+               dd_dev_err(dd, "%s: command failed with error %d\n",
+                       __func__, ret);
+       }
+       return ret == HCMD_SUCCESS ? 0 : -EBUSY;
+}
+
+static int request_8051_lcb_access(struct hfi1_devdata *dd)
+{
+       int ret;
+
+       ret = do_8051_command(dd, HCMD_MISC,
+               (u64)HCMD_MISC_GRANT_LCB_ACCESS << LOAD_DATA_FIELD_ID_SHIFT,
+               NULL);
+       if (ret != HCMD_SUCCESS) {
+               dd_dev_err(dd, "%s: command failed with error %d\n",
+                       __func__, ret);
+       }
+       return ret == HCMD_SUCCESS ? 0 : -EBUSY;
+}
+
+/*
+ * Set the LCB selector - allow host access.  The DCC selector always
+ * points to the host.
+ */
+static inline void set_host_lcb_access(struct hfi1_devdata *dd)
+{
+       write_csr(dd, DC_DC8051_CFG_CSR_ACCESS_SEL,
+                               DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK
+                               | DC_DC8051_CFG_CSR_ACCESS_SEL_LCB_SMASK);
+}
+
+/*
+ * Clear the LCB selector - allow 8051 access.  The DCC selector always
+ * points to the host.
+ */
+static inline void set_8051_lcb_access(struct hfi1_devdata *dd)
+{
+       write_csr(dd, DC_DC8051_CFG_CSR_ACCESS_SEL,
+                               DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK);
+}
+
+/*
+ * Acquire LCB access from the 8051.  If the host already has access,
+ * just increment a counter.  Otherwise, inform the 8051 that the
+ * host is taking access.
+ *
+ * Returns:
+ *     0 on success
+ *     -EBUSY if the 8051 has control and cannot be disturbed
+ *     -errno if unable to acquire access from the 8051
+ */
+int acquire_lcb_access(struct hfi1_devdata *dd, int sleep_ok)
+{
+       struct hfi1_pportdata *ppd = dd->pport;
+       int ret = 0;
+
+       /*
+        * Use the host link state lock so the operation of this routine
+        * { link state check, selector change, count increment } can occur
+        * as a unit against a link state change.  Otherwise there is a
+        * race between the state change and the count increment.
+        */
+       if (sleep_ok) {
+               mutex_lock(&ppd->hls_lock);
+       } else {
+               while (!mutex_trylock(&ppd->hls_lock))
+                       udelay(1);
+       }
+
+       /* this access is valid only when the link is up */
+       if ((ppd->host_link_state & HLS_UP) == 0) {
+               dd_dev_info(dd, "%s: link state %s not up\n",
+                       __func__, link_state_name(ppd->host_link_state));
+               ret = -EBUSY;
+               goto done;
+       }
+
+       if (dd->lcb_access_count == 0) {
+               ret = request_host_lcb_access(dd);
+               if (ret) {
+                       dd_dev_err(dd,
+                               "%s: unable to acquire LCB access, err %d\n",
+                               __func__, ret);
+                       goto done;
+               }
+               set_host_lcb_access(dd);
+       }
+       dd->lcb_access_count++;
+done:
+       mutex_unlock(&ppd->hls_lock);
+       return ret;
+}
+
+/*
+ * Release LCB access by decrementing the use count.  If the count is moving
+ * from 1 to 0, inform 8051 that it has control back.
+ *
+ * Returns:
+ *     0 on success
+ *     -errno if unable to release access to the 8051
+ */
+int release_lcb_access(struct hfi1_devdata *dd, int sleep_ok)
+{
+       int ret = 0;
+
+       /*
+        * Use the host link state lock because the acquire needed it.
+        * Here, we only need to keep { selector change, count decrement }
+        * as a unit.
+        */
+       if (sleep_ok) {
+               mutex_lock(&dd->pport->hls_lock);
+       } else {
+               while (!mutex_trylock(&dd->pport->hls_lock))
+                       udelay(1);
+       }
+
+       if (dd->lcb_access_count == 0) {
+               dd_dev_err(dd, "%s: LCB access count is zero.  Skipping.\n",
+                       __func__);
+               goto done;
+       }
+
+       if (dd->lcb_access_count == 1) {
+               set_8051_lcb_access(dd);
+               ret = request_8051_lcb_access(dd);
+               if (ret) {
+                       dd_dev_err(dd,
+                               "%s: unable to release LCB access, err %d\n",
+                               __func__, ret);
+                       /* restore host access if the grant didn't work */
+                       set_host_lcb_access(dd);
+                       goto done;
+               }
+       }
+       dd->lcb_access_count--;
+done:
+       mutex_unlock(&dd->pport->hls_lock);
+       return ret;
+}
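+
+/*
+ * Typical usage sketch (illustrative only, not a driver entry point):
+ * callers bracket direct LCB CSR access with the acquire/release pair
+ * above, e.g.
+ *
+ *     if (acquire_lcb_access(dd, 1) == 0) {
+ *             reg = read_csr(dd, DC_LCB_STS_ROUND_TRIP_LTP_CNT);
+ *             release_lcb_access(dd, 1);
+ *     }
+ */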
+
+/*
+ * Initialize LCB access variables and state.  Called during driver load,
+ * after most of the initialization is finished.
+ *
+ * The DC default is LCB access on for the host.  The driver defaults to
+ * leaving access to the 8051.  Assign access now - this constrains the call
+ * to this routine to be after all LCB set-up is done.  In particular, after
+ * hfi1_init_dd() -> set_up_interrupts() -> clear_all_interrupts()
+ */
+static void init_lcb_access(struct hfi1_devdata *dd)
+{
+       dd->lcb_access_count = 0;
+}
+
+/*
+ * Write a response back to an 8051 request.
+ */
+static void hreq_response(struct hfi1_devdata *dd, u8 return_code, u16 rsp_data)
+{
+       write_csr(dd, DC_DC8051_CFG_EXT_DEV_0,
+               DC_DC8051_CFG_EXT_DEV_0_COMPLETED_SMASK
+               | (u64)return_code << DC_DC8051_CFG_EXT_DEV_0_RETURN_CODE_SHIFT
+               | (u64)rsp_data << DC_DC8051_CFG_EXT_DEV_0_RSP_DATA_SHIFT);
+}
+
+/*
+ * Handle requests from the 8051.
+ */
+static void handle_8051_request(struct hfi1_devdata *dd)
+{
+       u64 reg;
+       u16 data;
+       u8 type;
+
+       reg = read_csr(dd, DC_DC8051_CFG_EXT_DEV_1);
+       if ((reg & DC_DC8051_CFG_EXT_DEV_1_REQ_NEW_SMASK) == 0)
+               return; /* no request */
+
+       /* zero out COMPLETED so the response is seen */
+       write_csr(dd, DC_DC8051_CFG_EXT_DEV_0, 0);
+
+       /* extract request details */
+       type = (reg >> DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_SHIFT)
+                       & DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_MASK;
+       data = (reg >> DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SHIFT)
+                       & DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_MASK;
+
+       switch (type) {
+       case HREQ_LOAD_CONFIG:
+       case HREQ_SAVE_CONFIG:
+       case HREQ_READ_CONFIG:
+       case HREQ_SET_TX_EQ_ABS:
+       case HREQ_SET_TX_EQ_REL:
+       case HREQ_ENABLE:
+               dd_dev_info(dd, "8051 request: request 0x%x not supported\n",
+                       type);
+               hreq_response(dd, HREQ_NOT_SUPPORTED, 0);
+               break;
+
+       case HREQ_CONFIG_DONE:
+               hreq_response(dd, HREQ_SUCCESS, 0);
+               break;
+
+       case HREQ_INTERFACE_TEST:
+               hreq_response(dd, HREQ_SUCCESS, data);
+               break;
+
+       default:
+               dd_dev_err(dd, "8051 request: unknown request 0x%x\n", type);
+               hreq_response(dd, HREQ_NOT_SUPPORTED, 0);
+               break;
+       }
+}
+
+static void write_global_credit(struct hfi1_devdata *dd,
+                               u8 vau, u16 total, u16 shared)
+{
+       write_csr(dd, SEND_CM_GLOBAL_CREDIT,
+               ((u64)total
+                       << SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT)
+               | ((u64)shared
+                       << SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT)
+               | ((u64)vau << SEND_CM_GLOBAL_CREDIT_AU_SHIFT));
+}
+
+/*
+ * Set up initial VL15 credits of the remote.  Assumes the rest of
+ * the CM credit registers are zero from a previous global or credit reset.
+ */
+void set_up_vl15(struct hfi1_devdata *dd, u8 vau, u16 vl15buf)
+{
+       /* leave shared count at zero for both global and VL15 */
+       write_global_credit(dd, vau, vl15buf, 0);
+
+       /* We may need some credits for another VL when sending packets
+        * with the snoop interface. Dividing it down the middle for VL15
+        * and VL0 should suffice.
+        */
+       if (unlikely(dd->hfi1_snoop.mode_flag == HFI1_PORT_SNOOP_MODE)) {
+               write_csr(dd, SEND_CM_CREDIT_VL15, (u64)(vl15buf >> 1)
+                   << SEND_CM_CREDIT_VL15_DEDICATED_LIMIT_VL_SHIFT);
+               write_csr(dd, SEND_CM_CREDIT_VL, (u64)(vl15buf >> 1)
+                   << SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT);
+       } else {
+               write_csr(dd, SEND_CM_CREDIT_VL15, (u64)vl15buf
+                       << SEND_CM_CREDIT_VL15_DEDICATED_LIMIT_VL_SHIFT);
+       }
+}
+
+/*
+ * Zero all credit details from the previous connection and
+ * reset the CM manager's internal counters.
+ */
+void reset_link_credits(struct hfi1_devdata *dd)
+{
+       int i;
+
+       /* remove all previous VL credit limits */
+       for (i = 0; i < TXE_NUM_DATA_VL; i++)
+               write_csr(dd, SEND_CM_CREDIT_VL + (8*i), 0);
+       write_csr(dd, SEND_CM_CREDIT_VL15, 0);
+       write_global_credit(dd, 0, 0, 0);
+       /* reset the CM block */
+       pio_send_control(dd, PSC_CM_RESET);
+}
+
+/* convert a vCU to a CU */
+static u32 vcu_to_cu(u8 vcu)
+{
+       return 1 << vcu;
+}
+
+/* convert a CU to a vCU */
+static u8 cu_to_vcu(u32 cu)
+{
+       return ilog2(cu);
+}
+
+/* convert a vAU to an AU */
+static u32 vau_to_au(u8 vau)
+{
+       return 8 * (1 << vau);
+}
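+
+/*
+ * For illustration: vcu_to_cu(3) == 8 credit units, cu_to_vcu(8) == 3,
+ * and vau_to_au(2) == 32 bytes per allocation unit.
+ */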
+
+static void set_linkup_defaults(struct hfi1_pportdata *ppd)
+{
+       ppd->sm_trap_qp = 0x0;
+       ppd->sa_qp = 0x1;
+}
+
+/*
+ * Graceful LCB shutdown.  This leaves the LCB FIFOs in reset.
+ */
+static void lcb_shutdown(struct hfi1_devdata *dd, int abort)
+{
+       u64 reg;
+
+       /* clear lcb run: LCB_CFG_RUN.EN = 0 */
+       write_csr(dd, DC_LCB_CFG_RUN, 0);
+       /* set tx fifo reset: LCB_CFG_TX_FIFOS_RESET.VAL = 1 */
+       write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET,
+               1ull << DC_LCB_CFG_TX_FIFOS_RESET_VAL_SHIFT);
+       /* set dcc reset csr: DCC_CFG_RESET.{reset_lcb,reset_rx_fpe} = 1 */
+       dd->lcb_err_en = read_csr(dd, DC_LCB_ERR_EN);
+       reg = read_csr(dd, DCC_CFG_RESET);
+       write_csr(dd, DCC_CFG_RESET,
+               reg
+               | (1ull << DCC_CFG_RESET_RESET_LCB_SHIFT)
+               | (1ull << DCC_CFG_RESET_RESET_RX_FPE_SHIFT));
+       (void) read_csr(dd, DCC_CFG_RESET); /* make sure the write completed */
+       if (!abort) {
+               udelay(1);    /* must hold for the longer of 16cclks or 20ns */
+               write_csr(dd, DCC_CFG_RESET, reg);
+               write_csr(dd, DC_LCB_ERR_EN, dd->lcb_err_en);
+       }
+}
+
+/*
+ * This routine should be called after the link has been transitioned to
+ * OFFLINE (OFFLINE state has the side effect of putting the SerDes into
+ * reset).
+ *
+ * The expectation is that the caller of this routine would have taken
+ * care of properly transitioning the link into the correct state.
+ */
+static void dc_shutdown(struct hfi1_devdata *dd)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&dd->dc8051_lock, flags);
+       if (dd->dc_shutdown) {
+               spin_unlock_irqrestore(&dd->dc8051_lock, flags);
+               return;
+       }
+       dd->dc_shutdown = 1;
+       spin_unlock_irqrestore(&dd->dc8051_lock, flags);
+       /* Shutdown the LCB */
+       lcb_shutdown(dd, 1);
+       /* Going to OFFLINE will have caused the 8051 to put the
+        * SerDes into reset already. We just need to shut down the
+        * 8051 itself. */
+       write_csr(dd, DC_DC8051_CFG_RST, 0x1);
+}
+
+/* Calling this after the DC has been brought out of reset should not
+ * do any damage. */
+static void dc_start(struct hfi1_devdata *dd)
+{
+       unsigned long flags;
+       int ret;
+
+       spin_lock_irqsave(&dd->dc8051_lock, flags);
+       if (!dd->dc_shutdown)
+               goto done;
+       spin_unlock_irqrestore(&dd->dc8051_lock, flags);
+       /* Take the 8051 out of reset */
+       write_csr(dd, DC_DC8051_CFG_RST, 0ull);
+       /* Wait until 8051 is ready */
+       ret = wait_fm_ready(dd, TIMEOUT_8051_START);
+       if (ret) {
+               dd_dev_err(dd, "%s: timeout starting 8051 firmware\n",
+                       __func__);
+       }
+       /* Take away reset for LCB and RX FPE (set in lcb_shutdown). */
+       write_csr(dd, DCC_CFG_RESET, 0x10);
+       /* lcb_shutdown() with abort=1 does not restore these */
+       write_csr(dd, DC_LCB_ERR_EN, dd->lcb_err_en);
+       spin_lock_irqsave(&dd->dc8051_lock, flags);
+       dd->dc_shutdown = 0;
+done:
+       spin_unlock_irqrestore(&dd->dc8051_lock, flags);
+}
+
+/*
+ * These LCB adjustments are for the Aurora SerDes core in the FPGA.
+ */
+static void adjust_lcb_for_fpga_serdes(struct hfi1_devdata *dd)
+{
+       u64 rx_radr, tx_radr;
+       u32 version;
+
+       if (dd->icode != ICODE_FPGA_EMULATION)
+               return;
+
+       /*
+        * These LCB defaults on emulator _s are good, nothing to do here:
+        *      LCB_CFG_TX_FIFOS_RADR
+        *      LCB_CFG_RX_FIFOS_RADR
+        *      LCB_CFG_LN_DCLK
+        *      LCB_CFG_IGNORE_LOST_RCLK
+        */
+       if (is_emulator_s(dd))
+               return;
+       /* else this is _p */
+
+       version = emulator_rev(dd);
+       if (!is_a0(dd))
+               version = 0x2d; /* all B0 use 0x2d or higher settings */
+
+       if (version <= 0x12) {
+               /* release 0x12 and below */
+
+               /*
+                * LCB_CFG_RX_FIFOS_RADR.RST_VAL = 0x9
+                * LCB_CFG_RX_FIFOS_RADR.OK_TO_JUMP_VAL = 0x9
+                * LCB_CFG_RX_FIFOS_RADR.DO_NOT_JUMP_VAL = 0xa
+                */
+               rx_radr =
+                     0xaull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
+                   | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
+                   | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
+               /*
+                * LCB_CFG_TX_FIFOS_RADR.ON_REINIT = 0 (default)
+                * LCB_CFG_TX_FIFOS_RADR.RST_VAL = 6
+                */
+               tx_radr = 6ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
+       } else if (version <= 0x18) {
+               /* release 0x13 up to 0x18 */
+               /* LCB_CFG_RX_FIFOS_RADR = 0x988 */
+               rx_radr =
+                     0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
+                   | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
+                   | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
+               tx_radr = 7ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
+       } else if (version == 0x19) {
+               /* release 0x19 */
+               /* LCB_CFG_RX_FIFOS_RADR = 0xa99 */
+               rx_radr =
+                     0xAull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
+                   | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
+                   | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
+               tx_radr = 3ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
+       } else if (version == 0x1a) {
+               /* release 0x1a */
+               /* LCB_CFG_RX_FIFOS_RADR = 0x988 */
+               rx_radr =
+                     0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
+                   | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
+                   | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
+               tx_radr = 7ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
+               write_csr(dd, DC_LCB_CFG_LN_DCLK, 1ull);
+       } else {
+               /* release 0x1b and higher */
+               /* LCB_CFG_RX_FIFOS_RADR = 0x877 */
+               rx_radr =
+                     0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
+                   | 0x7ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
+                   | 0x7ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
+               tx_radr = 3ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
+       }
+
+       write_csr(dd, DC_LCB_CFG_RX_FIFOS_RADR, rx_radr);
+       /* LCB_CFG_IGNORE_LOST_RCLK.EN = 1 */
+       write_csr(dd, DC_LCB_CFG_IGNORE_LOST_RCLK,
+               DC_LCB_CFG_IGNORE_LOST_RCLK_EN_SMASK);
+       write_csr(dd, DC_LCB_CFG_TX_FIFOS_RADR, tx_radr);
+}
+
+/*
+ * Handle a SMA idle message
+ *
+ * This is a work-queue function outside of the interrupt.
+ */
+void handle_sma_message(struct work_struct *work)
+{
+       struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+                                                       sma_message_work);
+       struct hfi1_devdata *dd = ppd->dd;
+       u64 msg;
+       int ret;
+
+       /* msg is bytes 1-4 of the 40-bit idle message - the command code
+          is stripped off */
+       ret = read_idle_sma(dd, &msg);
+       if (ret)
+               return;
+       dd_dev_info(dd, "%s: SMA message 0x%llx\n", __func__, msg);
+       /*
+        * React to the SMA message.  Byte[1] (0 for us) is the command.
+        */
+       switch (msg & 0xff) {
+       case SMA_IDLE_ARM:
+               /*
+                * See OPAv1 table 9-14 - HFI and External Switch Ports Key
+                * State Transitions
+                *
+                * Only expected in INIT or ARMED, discard otherwise.
+                */
+               if (ppd->host_link_state & (HLS_UP_INIT | HLS_UP_ARMED))
+                       ppd->neighbor_normal = 1;
+               break;
+       case SMA_IDLE_ACTIVE:
+               /*
+                * See OPAv1 table 9-14 - HFI and External Switch Ports Key
+                * State Transitions
+                *
+                * Can activate the node.  Discard otherwise.
+                */
+               if (ppd->host_link_state == HLS_UP_ARMED
+                                       && ppd->is_active_optimize_enabled) {
+                       ppd->neighbor_normal = 1;
+                       ret = set_link_state(ppd, HLS_UP_ACTIVE);
+                       if (ret)
+                               dd_dev_err(
+                                       dd,
+                                       "%s: received Active SMA idle message, couldn't set link to Active\n",
+                                       __func__);
+               }
+               break;
+       default:
+               dd_dev_err(dd,
+                       "%s: received unexpected SMA idle message 0x%llx\n",
+                       __func__, msg);
+               break;
+       }
+}
+
+static void adjust_rcvctrl(struct hfi1_devdata *dd, u64 add, u64 clear)
+{
+       u64 rcvctrl;
+       unsigned long flags;
+
+       spin_lock_irqsave(&dd->rcvctrl_lock, flags);
+       rcvctrl = read_csr(dd, RCV_CTRL);
+       rcvctrl |= add;
+       rcvctrl &= ~clear;
+       write_csr(dd, RCV_CTRL, rcvctrl);
+       spin_unlock_irqrestore(&dd->rcvctrl_lock, flags);
+}
+
+static inline void add_rcvctrl(struct hfi1_devdata *dd, u64 add)
+{
+       adjust_rcvctrl(dd, add, 0);
+}
+
+static inline void clear_rcvctrl(struct hfi1_devdata *dd, u64 clear)
+{
+       adjust_rcvctrl(dd, 0, clear);
+}
+
+/*
+ * Called from all interrupt handlers to start handling an SPC freeze.
+ */
+void start_freeze_handling(struct hfi1_pportdata *ppd, int flags)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       struct send_context *sc;
+       int i;
+
+       if (flags & FREEZE_SELF)
+               write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_FREEZE_SMASK);
+
+       /* enter frozen mode */
+       dd->flags |= HFI1_FROZEN;
+
+       /* notify all SDMA engines that they are going into a freeze */
+       sdma_freeze_notify(dd, !!(flags & FREEZE_LINK_DOWN));
+
+       /* do halt pre-handling on all enabled send contexts */
+       for (i = 0; i < dd->num_send_contexts; i++) {
+               sc = dd->send_contexts[i].sc;
+               if (sc && (sc->flags & SCF_ENABLED))
+                       sc_stop(sc, SCF_FROZEN | SCF_HALTED);
+       }
+
+       /* Send contexts are frozen. Notify user space */
+       hfi1_set_uevent_bits(ppd, _HFI1_EVENT_FROZEN_BIT);
+
+       if (flags & FREEZE_ABORT) {
+               dd_dev_err(dd,
+                          "Aborted freeze recovery. Please REBOOT system\n");
+               return;
+       }
+       /* queue non-interrupt handler */
+       queue_work(ppd->hfi1_wq, &ppd->freeze_work);
+}
+
+/*
+ * Wait until all 4 sub-blocks indicate that they have frozen or unfrozen,
+ * depending on the "freeze" parameter.
+ *
+ * No need to return an error if it times out, our only option
+ * is to proceed anyway.
+ */
+static void wait_for_freeze_status(struct hfi1_devdata *dd, int freeze)
+{
+       unsigned long timeout;
+       u64 reg;
+
+       timeout = jiffies + msecs_to_jiffies(FREEZE_STATUS_TIMEOUT);
+       while (1) {
+               reg = read_csr(dd, CCE_STATUS);
+               if (freeze) {
+                       /* waiting until all indicators are set */
+                       if ((reg & ALL_FROZE) == ALL_FROZE)
+                               return; /* all done */
+               } else {
+                       /* waiting until all indicators are clear */
+                       if ((reg & ALL_FROZE) == 0)
+                               return; /* all done */
+               }
+
+               if (time_after(jiffies, timeout)) {
+                       dd_dev_err(dd,
+                               "Time out waiting for SPC %sfreeze, bits 0x%llx, expecting 0x%llx, continuing",
+                               freeze ? "" : "un",
+                               reg & ALL_FROZE,
+                               freeze ? ALL_FROZE : 0ull);
+                       return;
+               }
+               usleep_range(80, 120);
+       }
+}
+
+/*
+ * Do all freeze handling for the RXE block.
+ */
+static void rxe_freeze(struct hfi1_devdata *dd)
+{
+       int i;
+
+       /* disable port */
+       clear_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
+
+       /* disable all receive contexts */
+       for (i = 0; i < dd->num_rcv_contexts; i++)
+               hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS, i);
+}
+
+/*
+ * Unfreeze handling for the RXE block - kernel contexts only.
+ * This will also enable the port.  User contexts will do unfreeze
+ * handling on a per-context basis as they call into the driver.
+ */
+static void rxe_kernel_unfreeze(struct hfi1_devdata *dd)
+{
+       int i;
+
+       /* enable all kernel contexts */
+       for (i = 0; i < dd->n_krcv_queues; i++)
+               hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_ENB, i);
+
+       /* enable port */
+       add_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
+}
+
+/*
+ * Non-interrupt SPC freeze handling.
+ *
+ * This is a work-queue function outside of the triggering interrupt.
+ */
+void handle_freeze(struct work_struct *work)
+{
+       struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+                                                               freeze_work);
+       struct hfi1_devdata *dd = ppd->dd;
+
+       /* wait for freeze indicators on all affected blocks */
+       dd_dev_info(dd, "Entering SPC freeze\n");
+       wait_for_freeze_status(dd, 1);
+
+       /* SPC is now frozen */
+
+       /* do send PIO freeze steps */
+       pio_freeze(dd);
+
+       /* do send DMA freeze steps */
+       sdma_freeze(dd);
+
+       /* do send egress freeze steps - nothing to do */
+
+       /* do receive freeze steps */
+       rxe_freeze(dd);
+
+       /*
+        * Unfreeze the hardware - clear the freeze, wait for each
+        * block's frozen bit to clear, then clear the frozen flag.
+        */
+       write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_UNFREEZE_SMASK);
+       wait_for_freeze_status(dd, 0);
+
+       if (is_a0(dd)) {
+               write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_FREEZE_SMASK);
+               wait_for_freeze_status(dd, 1);
+               write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_UNFREEZE_SMASK);
+               wait_for_freeze_status(dd, 0);
+       }
+
+       /* do send PIO unfreeze steps for kernel contexts */
+       pio_kernel_unfreeze(dd);
+
+       /* do send DMA unfreeze steps */
+       sdma_unfreeze(dd);
+
+       /* do send egress unfreeze steps - nothing to do */
+
+       /* do receive unfreeze steps for kernel contexts */
+       rxe_kernel_unfreeze(dd);
+
+       /*
+        * The unfreeze procedure touches global device registers when
+        * it disables and re-enables RXE. Mark the device unfrozen
+        * after all that is done so other parts of the driver waiting
+        * for the device to unfreeze don't do things out of order.
+        *
+        * The above implies that the meaning of HFI1_FROZEN flag is
+        * "Device has gone into freeze mode and freeze mode handling
+        * is still in progress."
+        *
+        * The flag will be removed when freeze mode processing has
+        * completed.
+        */
+       dd->flags &= ~HFI1_FROZEN;
+       wake_up(&dd->event_queue);
+
+       /* no longer frozen */
+       dd_dev_err(dd, "Exiting SPC freeze\n");
+}
+
+/*
+ * Handle a link up interrupt from the 8051.
+ *
+ * This is a work-queue function outside of the interrupt.
+ */
+void handle_link_up(struct work_struct *work)
+{
+       struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+                                                               link_up_work);
+       set_link_state(ppd, HLS_UP_INIT);
+
+       /* cache the read of DC_LCB_STS_ROUND_TRIP_LTP_CNT */
+       read_ltp_rtt(ppd->dd);
+       /*
+        * OPA specifies that certain counters are cleared on a transition
+        * to link up, so do that.
+        */
+       clear_linkup_counters(ppd->dd);
+       /*
+        * And (re)set link up default values.
+        */
+       set_linkup_defaults(ppd);
+
+       /* enforce link speed enabled */
+       if ((ppd->link_speed_active & ppd->link_speed_enabled) == 0) {
+               /* oops - current speed is not enabled, bounce */
+               dd_dev_err(ppd->dd,
+                       "Link speed active 0x%x is outside enabled 0x%x, downing link\n",
+                       ppd->link_speed_active, ppd->link_speed_enabled);
+               set_link_down_reason(ppd, OPA_LINKDOWN_REASON_SPEED_POLICY, 0,
+                       OPA_LINKDOWN_REASON_SPEED_POLICY);
+               set_link_state(ppd, HLS_DN_OFFLINE);
+               start_link(ppd);
+       }
+}
+
+/* Several pieces of LNI information were cached for SMA in ppd.
+ * Reset these on link down */
+static void reset_neighbor_info(struct hfi1_pportdata *ppd)
+{
+       ppd->neighbor_guid = 0;
+       ppd->neighbor_port_number = 0;
+       ppd->neighbor_type = 0;
+       ppd->neighbor_fm_security = 0;
+}
+
+/*
+ * Handle a link down interrupt from the 8051.
+ *
+ * This is a work-queue function outside of the interrupt.
+ */
+void handle_link_down(struct work_struct *work)
+{
+       u8 lcl_reason, neigh_reason = 0;
+       struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+                                                               link_down_work);
+
+       /* go offline first, then deal with reasons */
+       set_link_state(ppd, HLS_DN_OFFLINE);
+
+       lcl_reason = 0;
+       read_planned_down_reason_code(ppd->dd, &neigh_reason);
+
+       /*
+        * If no reason, assume peer-initiated but missed
+        * LinkGoingDown idle flits.
+        */
+       if (neigh_reason == 0)
+               lcl_reason = OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN;
+
+       set_link_down_reason(ppd, lcl_reason, neigh_reason, 0);
+
+       reset_neighbor_info(ppd);
+
+       /* disable the port */
+       clear_rcvctrl(ppd->dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
+
+       /* If there is no cable attached, turn the DC off. Otherwise,
+        * start the link bring up. */
+       if (!qsfp_mod_present(ppd))
+               dc_shutdown(ppd->dd);
+       else
+               start_link(ppd);
+}
+
+void handle_link_bounce(struct work_struct *work)
+{
+       struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+                                                       link_bounce_work);
+
+       /*
+        * Only do something if the link is currently up.
+        */
+       if (ppd->host_link_state & HLS_UP) {
+               set_link_state(ppd, HLS_DN_OFFLINE);
+               start_link(ppd);
+       } else {
+               dd_dev_info(ppd->dd, "%s: link not up (%s), nothing to do\n",
+                       __func__, link_state_name(ppd->host_link_state));
+       }
+}
+
+/*
+ * Mask conversion: Capability exchange to Port LTP.  The capability
+ * exchange has an implicit 16b CRC that is mandatory.
+ */
+static int cap_to_port_ltp(int cap)
+{
+       int port_ltp = PORT_LTP_CRC_MODE_16; /* this mode is mandatory */
+
+       if (cap & CAP_CRC_14B)
+               port_ltp |= PORT_LTP_CRC_MODE_14;
+       if (cap & CAP_CRC_48B)
+               port_ltp |= PORT_LTP_CRC_MODE_48;
+       if (cap & CAP_CRC_12B_16B_PER_LANE)
+               port_ltp |= PORT_LTP_CRC_MODE_PER_LANE;
+
+       return port_ltp;
+}
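+
+/*
+ * For example, cap_to_port_ltp(CAP_CRC_14B | CAP_CRC_48B) returns
+ * PORT_LTP_CRC_MODE_16 | PORT_LTP_CRC_MODE_14 | PORT_LTP_CRC_MODE_48,
+ * since the mandatory 16b CRC mode is always included.
+ */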
+
+/*
+ * Convert an OPA Port LTP mask to capability mask
+ */
+int port_ltp_to_cap(int port_ltp)
+{
+       int cap_mask = 0;
+
+       if (port_ltp & PORT_LTP_CRC_MODE_14)
+               cap_mask |= CAP_CRC_14B;
+       if (port_ltp & PORT_LTP_CRC_MODE_48)
+               cap_mask |= CAP_CRC_48B;
+       if (port_ltp & PORT_LTP_CRC_MODE_PER_LANE)
+               cap_mask |= CAP_CRC_12B_16B_PER_LANE;
+
+       return cap_mask;
+}
+
+/*
+ * Convert a single DC LCB CRC mode to an OPA Port LTP mask.
+ */
+static int lcb_to_port_ltp(int lcb_crc)
+{
+       int port_ltp = 0;
+
+       if (lcb_crc == LCB_CRC_12B_16B_PER_LANE)
+               port_ltp = PORT_LTP_CRC_MODE_PER_LANE;
+       else if (lcb_crc == LCB_CRC_48B)
+               port_ltp = PORT_LTP_CRC_MODE_48;
+       else if (lcb_crc == LCB_CRC_14B)
+               port_ltp = PORT_LTP_CRC_MODE_14;
+       else
+               port_ltp = PORT_LTP_CRC_MODE_16;
+
+       return port_ltp;
+}
+
+/*
+ * Our neighbor has indicated that we are allowed to act as a fabric
+ * manager, so place the full management partition key in the second
+ * (0-based) pkey array position (see OPAv1, section 20.2.2.6.8). Note
+ * that we should already have the limited management partition key in
+ * array element 1, and also that the port is not yet up when
+ * add_full_mgmt_pkey() is invoked.
+ */
+static void add_full_mgmt_pkey(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+
+       /* Sanity check - ppd->pkeys[2] should be 0 */
+       if (ppd->pkeys[2] != 0)
+               dd_dev_err(dd, "%s pkey[2] already set to 0x%x, resetting it to 0x%x\n",
+                          __func__, ppd->pkeys[2], FULL_MGMT_P_KEY);
+       ppd->pkeys[2] = FULL_MGMT_P_KEY;
+       (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_PKEYS, 0);
+}
+
+/*
+ * Convert the given link width to the OPA link width bitmask.
+ */
+static u16 link_width_to_bits(struct hfi1_devdata *dd, u16 width)
+{
+       switch (width) {
+       case 0:
+               /*
+                * Simulator and quick linkup do not set the width.
+                * Just set it to 4x without complaint.
+                */
+               if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR || quick_linkup)
+                       return OPA_LINK_WIDTH_4X;
+               return 0; /* no lanes up */
+       case 1: return OPA_LINK_WIDTH_1X;
+       case 2: return OPA_LINK_WIDTH_2X;
+       case 3: return OPA_LINK_WIDTH_3X;
+       default:
+               dd_dev_info(dd, "%s: invalid width %d, using 4\n",
+                       __func__, width);
+               /* fall through */
+       case 4: return OPA_LINK_WIDTH_4X;
+       }
+}
+
+/*
+ * Do a population count on the bottom nibble.
+ */
+static const u8 bit_counts[16] = {
+       0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4
+};
+static inline u8 nibble_to_count(u8 nibble)
+{
+       return bit_counts[nibble & 0xf];
+}
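+
+/* e.g. nibble_to_count(0xb) == 3: lane bits 0, 1 and 3 are set */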
+
+/*
+ * Read the active lane information from the 8051 registers and return
+ * their widths.
+ *
+ * Active lane information is found in these 8051 registers:
+ *     enable_lane_tx
+ *     enable_lane_rx
+ */
+static void get_link_widths(struct hfi1_devdata *dd, u16 *tx_width,
+                           u16 *rx_width)
+{
+       u16 tx, rx;
+       u8 enable_lane_rx;
+       u8 enable_lane_tx;
+       u8 tx_polarity_inversion;
+       u8 rx_polarity_inversion;
+       u8 max_rate;
+
+       /* read the active lanes */
+       read_tx_settings(dd, &enable_lane_tx, &tx_polarity_inversion,
+                               &rx_polarity_inversion, &max_rate);
+       read_local_lni(dd, &enable_lane_rx);
+
+       /* convert to counts */
+       tx = nibble_to_count(enable_lane_tx);
+       rx = nibble_to_count(enable_lane_rx);
+
+       /*
+        * Set link_speed_active here, overriding what was set in
+        * handle_verify_cap().  The ASIC 8051 firmware does not correctly
+        * set the max_rate field in handle_verify_cap until v0.19.
+        */
+       if ((dd->icode == ICODE_RTL_SILICON)
+                               && (dd->dc8051_ver < dc8051_ver(0, 19))) {
+               /* max_rate: 0 = 12.5G, 1 = 25G */
+               switch (max_rate) {
+               case 0:
+                       dd->pport[0].link_speed_active = OPA_LINK_SPEED_12_5G;
+                       break;
+               default:
+                       dd_dev_err(dd,
+                               "%s: unexpected max rate %d, using 25Gb\n",
+                               __func__, (int)max_rate);
+                       /* fall through */
+               case 1:
+                       dd->pport[0].link_speed_active = OPA_LINK_SPEED_25G;
+                       break;
+               }
+       }
+
+       dd_dev_info(dd,
+               "Fabric active lanes (width): tx 0x%x (%d), rx 0x%x (%d)\n",
+               enable_lane_tx, tx, enable_lane_rx, rx);
+       *tx_width = link_width_to_bits(dd, tx);
+       *rx_width = link_width_to_bits(dd, rx);
+}
+
+/*
+ * Read verify_cap_local_fm_link_width[1] to obtain the link widths.
+ * Valid after the end of VerifyCap and during LinkUp.  Does not change
+ * after link up.  I.e. look elsewhere for downgrade information.
+ *
+ * Bits are:
+ *     + bits [7:4] contain the number of active transmitters
+ *     + bits [3:0] contain the number of active receivers
+ * These are numbers 1 through 4 and can be different values if the
+ * link is asymmetric.
+ *
+ * verify_cap_local_fm_link_width[0] retains its original value.
+ */
+static void get_linkup_widths(struct hfi1_devdata *dd, u16 *tx_width,
+                             u16 *rx_width)
+{
+       u16 widths, tx, rx;
+       u8 misc_bits, local_flags;
+       u16 active_tx, active_rx;
+
+       read_vc_local_link_width(dd, &misc_bits, &local_flags, &widths);
+       tx = widths >> 12;
+       rx = (widths >> 8) & 0xf;
+
+       *tx_width = link_width_to_bits(dd, tx);
+       *rx_width = link_width_to_bits(dd, rx);
+
+       /* print the active widths */
+       get_link_widths(dd, &active_tx, &active_rx);
+}
+
+/*
+ * Set ppd->link_width_active and ppd->link_width_downgrade_active using
+ * hardware information when the link first comes up.
+ *
+ * The link width is not available until after VerifyCap.AllFramesReceived
+ * (the trigger for handle_verify_cap), so this is outside that routine
+ * and should be called when the 8051 signals linkup.
+ */
+void get_linkup_link_widths(struct hfi1_pportdata *ppd)
+{
+       u16 tx_width, rx_width;
+
+       /* get end-of-LNI link widths */
+       get_linkup_widths(ppd->dd, &tx_width, &rx_width);
+
+       /* use tx_width as the link is supposed to be symmetric on link up */
+       ppd->link_width_active = tx_width;
+       /* link width downgrade active (LWD.A) starts out matching LW.A */
+       ppd->link_width_downgrade_tx_active = ppd->link_width_active;
+       ppd->link_width_downgrade_rx_active = ppd->link_width_active;
+       /* per OPA spec, on link up LWD.E resets to LWD.S */
+       ppd->link_width_downgrade_enabled = ppd->link_width_downgrade_supported;
+       /* cache the active egress rate (units [10^6 bits/sec]) */
+       ppd->current_egress_rate = active_egress_rate(ppd);
+}
+
+/*
+ * Handle a verify capabilities interrupt from the 8051.
+ *
+ * This is a work-queue function outside of the interrupt.
+ */
+void handle_verify_cap(struct work_struct *work)
+{
+       struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+                                                               link_vc_work);
+       struct hfi1_devdata *dd = ppd->dd;
+       u64 reg;
+       u8 power_management;
+       u8 continuous;
+       u8 vcu;
+       u8 vau;
+       u8 z;
+       u16 vl15buf;
+       u16 link_widths;
+       u16 crc_mask;
+       u16 crc_val;
+       u16 device_id;
+       u16 active_tx, active_rx;
+       u8 partner_supported_crc;
+       u8 remote_tx_rate;
+       u8 device_rev;
+
+       set_link_state(ppd, HLS_VERIFY_CAP);
+
+       lcb_shutdown(dd, 0);
+       adjust_lcb_for_fpga_serdes(dd);
+
+       /*
+        * These are now valid:
+        *      remote VerifyCap fields in the general LNI config
+        *      CSR DC8051_STS_REMOTE_GUID
+        *      CSR DC8051_STS_REMOTE_NODE_TYPE
+        *      CSR DC8051_STS_REMOTE_FM_SECURITY
+        *      CSR DC8051_STS_REMOTE_PORT_NO
+        */
+
+       read_vc_remote_phy(dd, &power_management, &continuous);
+       read_vc_remote_fabric(
+               dd,
+               &vau,
+               &z,
+               &vcu,
+               &vl15buf,
+               &partner_supported_crc);
+       read_vc_remote_link_width(dd, &remote_tx_rate, &link_widths);
+       read_remote_device_id(dd, &device_id, &device_rev);
+       /*
+        * And the 'MgmtAllowed' information, which is exchanged during
+        * LNI, is also available at this point.
+        */
+       read_mgmt_allowed(dd, &ppd->mgmt_allowed);
+       /* print the active widths */
+       get_link_widths(dd, &active_tx, &active_rx);
+       dd_dev_info(dd,
+               "Peer PHY: power management 0x%x, continuous updates 0x%x\n",
+               (int)power_management, (int)continuous);
+       dd_dev_info(dd,
+               "Peer Fabric: vAU %d, Z %d, vCU %d, vl15 credits 0x%x, CRC sizes 0x%x\n",
+               (int)vau,
+               (int)z,
+               (int)vcu,
+               (int)vl15buf,
+               (int)partner_supported_crc);
+       dd_dev_info(dd, "Peer Link Width: tx rate 0x%x, widths 0x%x\n",
+               (u32)remote_tx_rate, (u32)link_widths);
+       dd_dev_info(dd, "Peer Device ID: 0x%04x, Revision 0x%02x\n",
+               (u32)device_id, (u32)device_rev);
+       /*
+        * The peer vAU value just read is the peer receiver value.  HFI does
+        * not support a transmit vAU of 0 (AU == 8).  We advertised that
+        * with Z=1 in the fabric capabilities sent to the peer.  The peer
+        * will see our Z=1, and, if it advertised a vAU of 0, will move its
+        * receive to vAU of 1 (AU == 16).  Do the same here.  We do not care
+        * about the peer Z value - our sent vAU is 3 (hardwired) and is not
+        * subject to the Z value exception.
+        */
+       if (vau == 0)
+               vau = 1;
+       set_up_vl15(dd, vau, vl15buf);
+
+       /* set up the LCB CRC mode */
+       crc_mask = ppd->port_crc_mode_enabled & partner_supported_crc;
+
+       /* order is important: use the lowest bit in common */
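+       /* e.g. a common mask with both 14B and 48B set selects LCB_CRC_14B */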
+       if (crc_mask & CAP_CRC_14B)
+               crc_val = LCB_CRC_14B;
+       else if (crc_mask & CAP_CRC_48B)
+               crc_val = LCB_CRC_48B;
+       else if (crc_mask & CAP_CRC_12B_16B_PER_LANE)
+               crc_val = LCB_CRC_12B_16B_PER_LANE;
+       else
+               crc_val = LCB_CRC_16B;
+
+       dd_dev_info(dd, "Final LCB CRC mode: %d\n", (int)crc_val);
+       write_csr(dd, DC_LCB_CFG_CRC_MODE,
+                 (u64)crc_val << DC_LCB_CFG_CRC_MODE_TX_VAL_SHIFT);
+
+       /* set (14b only) or clear sideband credit */
+       reg = read_csr(dd, SEND_CM_CTRL);
+       if (crc_val == LCB_CRC_14B && crc_14b_sideband) {
+               write_csr(dd, SEND_CM_CTRL,
+                       reg | SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK);
+       } else {
+               write_csr(dd, SEND_CM_CTRL,
+                       reg & ~SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK);
+       }
+
+       ppd->link_speed_active = 0;     /* invalid value */
+       if (dd->dc8051_ver < dc8051_ver(0, 20)) {
+               /* remote_tx_rate: 0 = 12.5G, 1 = 25G */
+               switch (remote_tx_rate) {
+               case 0:
+                       ppd->link_speed_active = OPA_LINK_SPEED_12_5G;
+                       break;
+               case 1:
+                       ppd->link_speed_active = OPA_LINK_SPEED_25G;
+                       break;
+               }
+       } else {
+               /* actual rate is highest bit of the ANDed rates */
+               u8 rate = remote_tx_rate & ppd->local_tx_rate;
+
+               if (rate & 2)
+                       ppd->link_speed_active = OPA_LINK_SPEED_25G;
+               else if (rate & 1)
+                       ppd->link_speed_active = OPA_LINK_SPEED_12_5G;
+       }
+       if (ppd->link_speed_active == 0) {
+               dd_dev_err(dd, "%s: unexpected remote tx rate %d, using 25Gb\n",
+                       __func__, (int)remote_tx_rate);
+               ppd->link_speed_active = OPA_LINK_SPEED_25G;
+       }
+
+       /*
+        * Cache the values of the supported, enabled, and active
+        * LTP CRC modes to return in 'portinfo' queries. But the bit
+        * flags that are returned in the portinfo query differ from
+        * what's in the link_crc_mask, crc_sizes, and crc_val
+        * variables. Convert these here.
+        */
+       ppd->port_ltp_crc_mode = cap_to_port_ltp(link_crc_mask) << 8;
+               /* supported crc modes */
+       ppd->port_ltp_crc_mode |=
+               cap_to_port_ltp(ppd->port_crc_mode_enabled) << 4;
+               /* enabled crc modes */
+       ppd->port_ltp_crc_mode |= lcb_to_port_ltp(crc_val);
+               /* active crc mode */
+
+       /* set up the remote credit return table */
+       assign_remote_cm_au_table(dd, vcu);
+
+       /*
+        * The LCB is reset on entry to handle_verify_cap(), so this must
+        * be applied on every link up.
+        *
+        * Adjust LCB error kill enable to kill the link if
+        * these RBUF errors are seen:
+        *      REPLAY_BUF_MBE_SMASK
+        *      FLIT_INPUT_BUF_MBE_SMASK
+        */
+       if (is_a0(dd)) {                        /* fixed in B0 */
+               reg = read_csr(dd, DC_LCB_CFG_LINK_KILL_EN);
+               reg |= DC_LCB_CFG_LINK_KILL_EN_REPLAY_BUF_MBE_SMASK
+                       | DC_LCB_CFG_LINK_KILL_EN_FLIT_INPUT_BUF_MBE_SMASK;
+               write_csr(dd, DC_LCB_CFG_LINK_KILL_EN, reg);
+       }
+
+       /* pull LCB fifos out of reset - all fifo clocks must be stable */
+       write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0);
+
+       /* give 8051 access to the LCB CSRs */
+       write_csr(dd, DC_LCB_ERR_EN, 0); /* mask LCB errors */
+       set_8051_lcb_access(dd);
+
+       ppd->neighbor_guid =
+               read_csr(dd, DC_DC8051_STS_REMOTE_GUID);
+       ppd->neighbor_port_number = read_csr(dd, DC_DC8051_STS_REMOTE_PORT_NO) &
+                                       DC_DC8051_STS_REMOTE_PORT_NO_VAL_SMASK;
+       ppd->neighbor_type =
+               read_csr(dd, DC_DC8051_STS_REMOTE_NODE_TYPE) &
+               DC_DC8051_STS_REMOTE_NODE_TYPE_VAL_MASK;
+       ppd->neighbor_fm_security =
+               read_csr(dd, DC_DC8051_STS_REMOTE_FM_SECURITY) &
+               DC_DC8051_STS_LOCAL_FM_SECURITY_DISABLED_MASK;
+       dd_dev_info(dd,
+               "Neighbor Guid: %llx Neighbor type %d MgmtAllowed %d FM security bypass %d\n",
+               ppd->neighbor_guid, ppd->neighbor_type,
+               ppd->mgmt_allowed, ppd->neighbor_fm_security);
+       if (ppd->mgmt_allowed)
+               add_full_mgmt_pkey(ppd);
+
+       /* tell the 8051 to go to LinkUp */
+       set_link_state(ppd, HLS_GOING_UP);
+}
+
+/*
+ * Apply the link width downgrade enabled policy against the current active
+ * link widths.
+ *
+ * Called when the enabled policy changes or the active link widths change.
+ */
+void apply_link_downgrade_policy(struct hfi1_pportdata *ppd, int refresh_widths)
+{
+       int skip = 1;
+       int do_bounce = 0;
+       u16 lwde = ppd->link_width_downgrade_enabled;
+       u16 tx, rx;
+
+       mutex_lock(&ppd->hls_lock);
+       /* only apply if the link is up */
+       if (ppd->host_link_state & HLS_UP)
+               skip = 0;
+       mutex_unlock(&ppd->hls_lock);
+       if (skip)
+               return;
+
+       if (refresh_widths) {
+               get_link_widths(ppd->dd, &tx, &rx);
+               ppd->link_width_downgrade_tx_active = tx;
+               ppd->link_width_downgrade_rx_active = rx;
+       }
+
+       if (lwde == 0) {
+               /* downgrade is disabled */
+
+               /* bounce if not at starting active width */
+               if ((ppd->link_width_active !=
+                                       ppd->link_width_downgrade_tx_active)
+                               || (ppd->link_width_active !=
+                                       ppd->link_width_downgrade_rx_active)) {
+                       dd_dev_err(ppd->dd,
+                               "Link downgrade is disabled and link has downgraded, downing link\n");
+                       dd_dev_err(ppd->dd,
+                               "  original 0x%x, tx active 0x%x, rx active 0x%x\n",
+                               ppd->link_width_active,
+                               ppd->link_width_downgrade_tx_active,
+                               ppd->link_width_downgrade_rx_active);
+                       do_bounce = 1;
+               }
+       } else if ((lwde & ppd->link_width_downgrade_tx_active) == 0
+               || (lwde & ppd->link_width_downgrade_rx_active) == 0) {
+               /* Tx or Rx is outside the enabled policy */
+               dd_dev_err(ppd->dd,
+                       "Link is outside of downgrade allowed, downing link\n");
+               dd_dev_err(ppd->dd,
+                       "  enabled 0x%x, tx active 0x%x, rx active 0x%x\n",
+                       lwde,
+                       ppd->link_width_downgrade_tx_active,
+                       ppd->link_width_downgrade_rx_active);
+               do_bounce = 1;
+       }
+
+       if (do_bounce) {
+               set_link_down_reason(ppd, OPA_LINKDOWN_REASON_WIDTH_POLICY, 0,
+                 OPA_LINKDOWN_REASON_WIDTH_POLICY);
+               set_link_state(ppd, HLS_DN_OFFLINE);
+               start_link(ppd);
+       }
+}
+
+/*
+ * Handle a link downgrade interrupt from the 8051.
+ *
+ * This is a work-queue function outside of the interrupt.
+ */
+void handle_link_downgrade(struct work_struct *work)
+{
+       struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+                                                       link_downgrade_work);
+
+       dd_dev_info(ppd->dd, "8051: Link width downgrade\n");
+       apply_link_downgrade_policy(ppd, 1);
+}
+
+static char *dcc_err_string(char *buf, int buf_len, u64 flags)
+{
+       return flag_string(buf, buf_len, flags, dcc_err_flags,
+               ARRAY_SIZE(dcc_err_flags));
+}
+
+static char *lcb_err_string(char *buf, int buf_len, u64 flags)
+{
+       return flag_string(buf, buf_len, flags, lcb_err_flags,
+               ARRAY_SIZE(lcb_err_flags));
+}
+
+static char *dc8051_err_string(char *buf, int buf_len, u64 flags)
+{
+       return flag_string(buf, buf_len, flags, dc8051_err_flags,
+               ARRAY_SIZE(dc8051_err_flags));
+}
+
+static char *dc8051_info_err_string(char *buf, int buf_len, u64 flags)
+{
+       return flag_string(buf, buf_len, flags, dc8051_info_err_flags,
+               ARRAY_SIZE(dc8051_info_err_flags));
+}
+
+static char *dc8051_info_host_msg_string(char *buf, int buf_len, u64 flags)
+{
+       return flag_string(buf, buf_len, flags, dc8051_info_host_msg_flags,
+               ARRAY_SIZE(dc8051_info_host_msg_flags));
+}
+
+static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+       struct hfi1_pportdata *ppd = dd->pport;
+       u64 info, err, host_msg;
+       int queue_link_down = 0;
+       char buf[96];
+
+       /* look at the flags */
+       if (reg & DC_DC8051_ERR_FLG_SET_BY_8051_SMASK) {
+               /* 8051 information set by firmware */
+               /* read DC8051_DBG_ERR_INFO_SET_BY_8051 for details */
+               info = read_csr(dd, DC_DC8051_DBG_ERR_INFO_SET_BY_8051);
+               err = (info >> DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_SHIFT)
+                       & DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_MASK;
+               host_msg = (info >>
+                       DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_SHIFT)
+                       & DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_MASK;
+
+               /*
+                * Handle error flags.
+                */
+               if (err & FAILED_LNI) {
+                       /*
+                        * LNI error indications are cleared by the 8051
+                        * only when starting polling.  Only pay attention
+                        * to them when in the states that occur during
+                        * LNI.
+                        */
+                       if (ppd->host_link_state
+                           & (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) {
+                               queue_link_down = 1;
+                               dd_dev_info(dd, "Link error: %s\n",
+                                       dc8051_info_err_string(buf,
+                                               sizeof(buf),
+                                               err & FAILED_LNI));
+                       }
+                       err &= ~(u64)FAILED_LNI;
+               }
+               if (err) {
+                       /* report remaining errors, but do not do anything */
+                       dd_dev_err(dd, "8051 info error: %s\n",
+                               dc8051_info_err_string(buf, sizeof(buf), err));
+               }
+
+               /*
+                * Handle host message flags.
+                */
+               if (host_msg & HOST_REQ_DONE) {
+                       /*
+                        * Presently, the driver does a busy wait for
+                        * host requests to complete.  This is only an
+                        * informational message.
+                        * NOTE: The 8051 clears the host message
+                        * information *on the next 8051 command*.
+                        * Therefore, when linkup is achieved,
+                        * this flag will still be set.
+                        */
+                       host_msg &= ~(u64)HOST_REQ_DONE;
+               }
+               if (host_msg & BC_SMA_MSG) {
+                       queue_work(ppd->hfi1_wq, &ppd->sma_message_work);
+                       host_msg &= ~(u64)BC_SMA_MSG;
+               }
+               if (host_msg & LINKUP_ACHIEVED) {
+                       dd_dev_info(dd, "8051: Link up\n");
+                       queue_work(ppd->hfi1_wq, &ppd->link_up_work);
+                       host_msg &= ~(u64)LINKUP_ACHIEVED;
+               }
+               if (host_msg & EXT_DEVICE_CFG_REQ) {
+                       handle_8051_request(dd);
+                       host_msg &= ~(u64)EXT_DEVICE_CFG_REQ;
+               }
+               if (host_msg & VERIFY_CAP_FRAME) {
+                       queue_work(ppd->hfi1_wq, &ppd->link_vc_work);
+                       host_msg &= ~(u64)VERIFY_CAP_FRAME;
+               }
+               if (host_msg & LINK_GOING_DOWN) {
+                       const char *extra = "";
+                       /* no downgrade action needed if going down */
+                       if (host_msg & LINK_WIDTH_DOWNGRADED) {
+                               host_msg &= ~(u64)LINK_WIDTH_DOWNGRADED;
+                               extra = " (ignoring downgrade)";
+                       }
+                       dd_dev_info(dd, "8051: Link down%s\n", extra);
+                       queue_link_down = 1;
+                       host_msg &= ~(u64)LINK_GOING_DOWN;
+               }
+               if (host_msg & LINK_WIDTH_DOWNGRADED) {
+                       queue_work(ppd->hfi1_wq, &ppd->link_downgrade_work);
+                       host_msg &= ~(u64)LINK_WIDTH_DOWNGRADED;
+               }
+               if (host_msg) {
+                       /* report remaining messages, but do not do anything */
+                       dd_dev_info(dd, "8051 info host message: %s\n",
+                               dc8051_info_host_msg_string(buf, sizeof(buf),
+                                       host_msg));
+               }
+
+               reg &= ~DC_DC8051_ERR_FLG_SET_BY_8051_SMASK;
+       }
+       if (reg & DC_DC8051_ERR_FLG_LOST_8051_HEART_BEAT_SMASK) {
+               /*
+                * Lost the 8051 heartbeat.  If this happens, we
+                * receive constant interrupts about it.  Disable
+                * the interrupt after the first.
+                */
+               dd_dev_err(dd, "Lost 8051 heartbeat\n");
+               write_csr(dd, DC_DC8051_ERR_EN,
+                       read_csr(dd, DC_DC8051_ERR_EN)
+                         & ~DC_DC8051_ERR_EN_LOST_8051_HEART_BEAT_SMASK);
+
+               reg &= ~DC_DC8051_ERR_FLG_LOST_8051_HEART_BEAT_SMASK;
+       }
+       if (reg) {
+               /* report the error, but do not do anything */
+               dd_dev_err(dd, "8051 error: %s\n",
+                       dc8051_err_string(buf, sizeof(buf), reg));
+       }
+
+       if (queue_link_down) {
+               /* if the link is already going down or disabled, do not
+                * queue another */
+               if ((ppd->host_link_state
+                                   & (HLS_GOING_OFFLINE|HLS_LINK_COOLDOWN))
+                               || ppd->link_enabled == 0) {
+                       dd_dev_info(dd, "%s: not queuing link down\n",
+                               __func__);
+               } else {
+                       queue_work(ppd->hfi1_wq, &ppd->link_down_work);
+               }
+       }
+}
+
+static const char * const fm_config_txt[] = {
+[0] =
+       "BadHeadDist: Distance violation between two head flits",
+[1] =
+       "BadTailDist: Distance violation between two tail flits",
+[2] =
+       "BadCtrlDist: Distance violation between two credit control flits",
+[3] =
+       "BadCrdAck: Credits return for unsupported VL",
+[4] =
+       "UnsupportedVLMarker: Received VL Marker",
+[5] =
+       "BadPreempt: Exceeded the preemption nesting level",
+[6] =
+       "BadControlFlit: Received unsupported control flit",
+/* no 7 */
+[8] =
+       "UnsupportedVLMarker: Received VL Marker for unconfigured or disabled VL",
+};
+
+static const char * const port_rcv_txt[] = {
+[1] =
+       "BadPktLen: Illegal PktLen",
+[2] =
+       "PktLenTooLong: Packet longer than PktLen",
+[3] =
+       "PktLenTooShort: Packet shorter than PktLen",
+[4] =
+       "BadSLID: Illegal SLID (0, using multicast as SLID, does not include security validation of SLID)",
+[5] =
+       "BadDLID: Illegal DLID (0, doesn't match HFI)",
+[6] =
+       "BadL2: Illegal L2 opcode",
+[7] =
+       "BadSC: Unsupported SC",
+[9] =
+       "BadRC: Illegal RC",
+[11] =
+       "PreemptError: Preempting with same VL",
+[12] =
+       "PreemptVL15: Preempting a VL15 packet",
+};
+
+#define OPA_LDR_FMCONFIG_OFFSET 16
+#define OPA_LDR_PORTRCV_OFFSET 0
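+/* each error code above selects one port_error_action bit at these offsets */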
+static void handle_dcc_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+       u64 info, hdr0, hdr1;
+       const char *extra;
+       char buf[96];
+       struct hfi1_pportdata *ppd = dd->pport;
+       u8 lcl_reason = 0;
+       int do_bounce = 0;
+
+       if (reg & DCC_ERR_FLG_UNCORRECTABLE_ERR_SMASK) {
+               if (!(dd->err_info_uncorrectable & OPA_EI_STATUS_SMASK)) {
+                       info = read_csr(dd, DCC_ERR_INFO_UNCORRECTABLE);
+                       dd->err_info_uncorrectable = info & OPA_EI_CODE_SMASK;
+                       /* set status bit */
+                       dd->err_info_uncorrectable |= OPA_EI_STATUS_SMASK;
+               }
+               reg &= ~DCC_ERR_FLG_UNCORRECTABLE_ERR_SMASK;
+       }
+
+       if (reg & DCC_ERR_FLG_LINK_ERR_SMASK) {
+               struct hfi1_pportdata *ppd = dd->pport;
+               /* this counter saturates at (2^32) - 1 */
+               if (ppd->link_downed < (u32)UINT_MAX)
+                       ppd->link_downed++;
+               reg &= ~DCC_ERR_FLG_LINK_ERR_SMASK;
+       }
+
+       if (reg & DCC_ERR_FLG_FMCONFIG_ERR_SMASK) {
+               u8 reason_valid = 1;
+
+               info = read_csr(dd, DCC_ERR_INFO_FMCONFIG);
+               if (!(dd->err_info_fmconfig & OPA_EI_STATUS_SMASK)) {
+                       dd->err_info_fmconfig = info & OPA_EI_CODE_SMASK;
+                       /* set status bit */
+                       dd->err_info_fmconfig |= OPA_EI_STATUS_SMASK;
+               }
+               switch (info) {
+               case 0:
+               case 1:
+               case 2:
+               case 3:
+               case 4:
+               case 5:
+               case 6:
+                       extra = fm_config_txt[info];
+                       break;
+               case 8:
+                       extra = fm_config_txt[info];
+                       if (ppd->port_error_action &
+                           OPA_PI_MASK_FM_CFG_UNSUPPORTED_VL_MARKER) {
+                               do_bounce = 1;
+                               /*
+                                * lcl_reason cannot be derived from info
+                                * for this error
+                                */
+                               lcl_reason =
+                                 OPA_LINKDOWN_REASON_UNSUPPORTED_VL_MARKER;
+                       }
+                       break;
+               default:
+                       reason_valid = 0;
+                       snprintf(buf, sizeof(buf), "reserved%lld", info);
+                       extra = buf;
+                       break;
+               }
+
+               if (reason_valid && !do_bounce) {
+                       do_bounce = ppd->port_error_action &
+                                       (1 << (OPA_LDR_FMCONFIG_OFFSET + info));
+                       lcl_reason = info + OPA_LINKDOWN_REASON_BAD_HEAD_DIST;
+               }
+
+               /* just report this */
+               dd_dev_info(dd, "DCC Error: fmconfig error: %s\n", extra);
+               reg &= ~DCC_ERR_FLG_FMCONFIG_ERR_SMASK;
+       }
+
+       if (reg & DCC_ERR_FLG_RCVPORT_ERR_SMASK) {
+               u8 reason_valid = 1;
+
+               info = read_csr(dd, DCC_ERR_INFO_PORTRCV);
+               hdr0 = read_csr(dd, DCC_ERR_INFO_PORTRCV_HDR0);
+               hdr1 = read_csr(dd, DCC_ERR_INFO_PORTRCV_HDR1);
+               if (!(dd->err_info_rcvport.status_and_code &
+                     OPA_EI_STATUS_SMASK)) {
+                       dd->err_info_rcvport.status_and_code =
+                               info & OPA_EI_CODE_SMASK;
+                       /* set status bit */
+                       dd->err_info_rcvport.status_and_code |=
+                               OPA_EI_STATUS_SMASK;
+                       /* save first 2 flits in the packet that caused
+                        * the error */
+                       dd->err_info_rcvport.packet_flit1 = hdr0;
+                       dd->err_info_rcvport.packet_flit2 = hdr1;
+               }
+               switch (info) {
+               case 1:
+               case 2:
+               case 3:
+               case 4:
+               case 5:
+               case 6:
+               case 7:
+               case 9:
+               case 11:
+               case 12:
+                       extra = port_rcv_txt[info];
+                       break;
+               default:
+                       reason_valid = 0;
+                       snprintf(buf, sizeof(buf), "reserved%lld", info);
+                       extra = buf;
+                       break;
+               }
+
+               if (reason_valid && !do_bounce) {
+                       do_bounce = ppd->port_error_action &
+                                       (1 << (OPA_LDR_PORTRCV_OFFSET + info));
+                       lcl_reason = info + OPA_LINKDOWN_REASON_RCV_ERROR_0;
+               }
+
+               /* just report this */
+               dd_dev_info(dd, "DCC Error: PortRcv error: %s\n", extra);
+               dd_dev_info(dd, "           hdr0 0x%llx, hdr1 0x%llx\n",
+                       hdr0, hdr1);
+
+               reg &= ~DCC_ERR_FLG_RCVPORT_ERR_SMASK;
+       }
+
+       if (reg & DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_UC_SMASK) {
+               /* informative only */
+               dd_dev_info(dd, "8051 access to LCB blocked\n");
+               reg &= ~DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_UC_SMASK;
+       }
+       if (reg & DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_HOST_SMASK) {
+               /* informative only */
+               dd_dev_info(dd, "host access to LCB blocked\n");
+               reg &= ~DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_HOST_SMASK;
+       }
+
+       /* report any remaining errors */
+       if (reg)
+               dd_dev_info(dd, "DCC Error: %s\n",
+                       dcc_err_string(buf, sizeof(buf), reg));
+
+       if (lcl_reason == 0)
+               lcl_reason = OPA_LINKDOWN_REASON_UNKNOWN;
+
+       if (do_bounce) {
+               dd_dev_info(dd, "%s: PortErrorAction bounce\n", __func__);
+               set_link_down_reason(ppd, lcl_reason, 0, lcl_reason);
+               queue_work(ppd->hfi1_wq, &ppd->link_bounce_work);
+       }
+}
+
+static void handle_lcb_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+       char buf[96];
+
+       dd_dev_info(dd, "LCB Error: %s\n",
+               lcb_err_string(buf, sizeof(buf), reg));
+}
+
+/*
+ * CCE block DC interrupt.  Source is < 8.
+ */
+static void is_dc_int(struct hfi1_devdata *dd, unsigned int source)
+{
+       const struct err_reg_info *eri = &dc_errs[source];
+
+       if (eri->handler) {
+               interrupt_clear_down(dd, 0, eri);
+       } else if (source == 3 /* dc_lbm_int */) {
+               /*
+                * This indicates that a parity error has occurred on the
+                * address/control lines presented to the LBM.  The error
+                * is a single pulse, there is no associated error flag,
+                * and it is non-maskable.  This is because if a parity
+                * error occurs on the request the request is dropped.
+                * This should never occur, but it is nice to know if it
+                * ever does.
+                */
+               dd_dev_err(dd, "Parity error in DC LBM block\n");
+       } else {
+               dd_dev_err(dd, "Invalid DC interrupt %u\n", source);
+       }
+}
+
+/*
+ * TX block send credit interrupt.  Source is < 160.
+ */
+static void is_send_credit_int(struct hfi1_devdata *dd, unsigned int source)
+{
+       sc_group_release_update(dd, source);
+}
+
+/*
+ * TX block SDMA interrupt.  Source is < 48.
+ *
+ * SDMA interrupts are grouped by type:
+ *
+ *      0 -  N-1 = SDma
+ *      N - 2N-1 = SDmaProgress
+ *     2N - 3N-1 = SDmaIdle
+ */
+static void is_sdma_eng_int(struct hfi1_devdata *dd, unsigned int source)
+{
+       /* what interrupt */
+       unsigned int what  = source / TXE_NUM_SDMA_ENGINES;
+       /* which engine */
+       unsigned int which = source % TXE_NUM_SDMA_ENGINES;
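+       /*
+        * Illustrative: source == TXE_NUM_SDMA_ENGINES + 3 decodes to
+        * what == 1 (SDmaProgress) for engine 3.
+        */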
+
+#ifdef CONFIG_SDMA_VERBOSITY
+       dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n", which,
+                  slashstrip(__FILE__), __LINE__, __func__);
+       sdma_dumpstate(&dd->per_sdma[which]);
+#endif
+
+       if (likely(what < 3 && which < dd->num_sdma)) {
+               sdma_engine_interrupt(&dd->per_sdma[which], 1ull << source);
+       } else {
+               /* should not happen */
+               dd_dev_err(dd, "Invalid SDMA interrupt 0x%x\n", source);
+       }
+}
+
+/*
+ * RX block receive available interrupt.  Source is < 160.
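+ *
+ * The source is the receive context index: kernel contexts run their own
+ * handler, user contexts are signalled via handle_user_interrupt().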
+ */
+static void is_rcv_avail_int(struct hfi1_devdata *dd, unsigned int source)
+{
+       struct hfi1_ctxtdata *rcd;
+       char *err_detail;
+
+       if (likely(source < dd->num_rcv_contexts)) {
+               rcd = dd->rcd[source];
+               if (rcd) {
+                       if (source < dd->first_user_ctxt)
+                               rcd->do_interrupt(rcd);
+                       else
+                               handle_user_interrupt(rcd);
+                       return; /* OK */
+               }
+               /* received an interrupt, but no rcd */
+               err_detail = "dataless";
+       } else {
+               /* received an interrupt, but are not using that context */
+               err_detail = "out of range";
+       }
+       dd_dev_err(dd, "unexpected %s receive available context interrupt %u\n",
+               err_detail, source);
+}
+
+/*
+ * RX block receive urgent interrupt.  Source is < 160.
+ */
+static void is_rcv_urgent_int(struct hfi1_devdata *dd, unsigned int source)
+{
+       struct hfi1_ctxtdata *rcd;
+       char *err_detail;
+
+       if (likely(source < dd->num_rcv_contexts)) {
+               rcd = dd->rcd[source];
+               if (rcd) {
+                       /* only pay attention to user urgent interrupts */
+                       if (source >= dd->first_user_ctxt)
+                               handle_user_interrupt(rcd);
+                       return; /* OK */
+               }
+               /* received an interrupt, but no rcd */
+               err_detail = "dataless";
+       } else {
+               /* received an interrupt, but are not using that context */
+               err_detail = "out of range";
+       }
+       dd_dev_err(dd, "unexpected %s receive urgent context interrupt %u\n",
+               err_detail, source);
+}
+
+/*
+ * Reserved range interrupt.  Should not be called in normal operation.
+ */
+static void is_reserved_int(struct hfi1_devdata *dd, unsigned int source)
+{
+       char name[64];
+
+       dd_dev_err(dd, "unexpected %s interrupt\n",
+                               is_reserved_name(name, sizeof(name), source));
+}
+
+static const struct is_table is_table[] = {
+/* start                    end
+                               name func               interrupt func */
+{ IS_GENERAL_ERR_START,  IS_GENERAL_ERR_END,
+                               is_misc_err_name,       is_misc_err_int },
+{ IS_SDMAENG_ERR_START,  IS_SDMAENG_ERR_END,
+                               is_sdma_eng_err_name,   is_sdma_eng_err_int },
+{ IS_SENDCTXT_ERR_START, IS_SENDCTXT_ERR_END,
+                               is_sendctxt_err_name,   is_sendctxt_err_int },
+{ IS_SDMA_START,            IS_SDMA_END,
+                               is_sdma_eng_name,       is_sdma_eng_int },
+{ IS_VARIOUS_START,         IS_VARIOUS_END,
+                               is_various_name,        is_various_int },
+{ IS_DC_START,      IS_DC_END,
+                               is_dc_name,             is_dc_int },
+{ IS_RCVAVAIL_START,     IS_RCVAVAIL_END,
+                               is_rcv_avail_name,      is_rcv_avail_int },
+{ IS_RCVURGENT_START,    IS_RCVURGENT_END,
+                               is_rcv_urgent_name,     is_rcv_urgent_int },
+{ IS_SENDCREDIT_START,   IS_SENDCREDIT_END,
+                               is_send_credit_name,    is_send_credit_int},
+{ IS_RESERVED_START,     IS_RESERVED_END,
+                               is_reserved_name,       is_reserved_int},
+};
+
+/*
+ * Interrupt source interrupt - called when the given source has an interrupt.
+ * Source is a bit index into an array of 64-bit integers.
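+ * (For example, source 70 is bit 6 of the second 64-bit word.)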
+ */
+static void is_interrupt(struct hfi1_devdata *dd, unsigned int source)
+{
+       const struct is_table *entry;
+
+       /* avoids a double compare by walking the table in-order */
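+       /* ranges ascend, so the first entry whose end exceeds source matches */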
+       for (entry = &is_table[0]; entry->is_name; entry++) {
+               if (source < entry->end) {
+                       trace_hfi1_interrupt(dd, entry, source);
+                       entry->is_int(dd, source - entry->start);
+                       return;
+               }
+       }
+       /* fell off the end */
+       dd_dev_err(dd, "invalid interrupt source %u\n", source);
+}
+
+/*
+ * General interrupt handler.  This is able to correctly handle
+ * all interrupts in case INTx is used.
+ */
+static irqreturn_t general_interrupt(int irq, void *data)
+{
+       struct hfi1_devdata *dd = data;
+       u64 regs[CCE_NUM_INT_CSRS];
+       u32 bit;
+       int i;
+
+       this_cpu_inc(*dd->int_counter);
+
+       /* phase 1: scan and clear all handled interrupts */
+       for (i = 0; i < CCE_NUM_INT_CSRS; i++) {
+               if (dd->gi_mask[i] == 0) {
+                       regs[i] = 0;    /* used later */
+                       continue;
+               }
+               regs[i] = read_csr(dd, CCE_INT_STATUS + (8 * i)) &
+                               dd->gi_mask[i];
+               /* only clear if anything is set */
+               if (regs[i])
+                       write_csr(dd, CCE_INT_CLEAR + (8 * i), regs[i]);
+       }
+
+       /* phase 2: call the appropriate handler */
+       for_each_set_bit(bit, (unsigned long *)&regs[0],
+                                               CCE_NUM_INT_CSRS*64) {
+               is_interrupt(dd, bit);
+       }
+
+       return IRQ_HANDLED;
+}
+
+static irqreturn_t sdma_interrupt(int irq, void *data)
+{
+       struct sdma_engine *sde = data;
+       struct hfi1_devdata *dd = sde->dd;
+       u64 status;
+
+#ifdef CONFIG_SDMA_VERBOSITY
+       dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
+                  slashstrip(__FILE__), __LINE__, __func__);
+       sdma_dumpstate(sde);
+#endif
+
+       this_cpu_inc(*dd->int_counter);
+
+       /* This read_csr is really bad in the hot path */
+       status = read_csr(dd,
+                       CCE_INT_STATUS + (8*(IS_SDMA_START/64)))
+                       & sde->imask;
+       if (likely(status)) {
+               /* clear the interrupt(s) */
+               write_csr(dd,
+                       CCE_INT_CLEAR + (8*(IS_SDMA_START/64)),
+                       status);
+
+               /* handle the interrupt(s) */
+               sdma_engine_interrupt(sde, status);
+       } else
+               dd_dev_err(dd, "SDMA engine %u interrupt, but no status bits set\n",
+                       sde->this_idx);
+
+       return IRQ_HANDLED;
+}
+
+/*
+ * NOTE: this routine expects to be on its own MSI-X interrupt.  If
+ * multiple receive contexts share the same MSI-X interrupt, then this
+ * routine must check for who received it.
+ */
+static irqreturn_t receive_context_interrupt(int irq, void *data)
+{
+       struct hfi1_ctxtdata *rcd = data;
+       struct hfi1_devdata *dd = rcd->dd;
+
+       trace_hfi1_receive_interrupt(dd, rcd->ctxt);
+       this_cpu_inc(*dd->int_counter);
+
+       /* clear the interrupt */
+       write_csr(rcd->dd, CCE_INT_CLEAR + (8*rcd->ireg), rcd->imask);
+
+       /* handle the interrupt */
+       rcd->do_interrupt(rcd);
+
+       return IRQ_HANDLED;
+}
+
+/* ========================================================================= */
+
+u32 read_physical_state(struct hfi1_devdata *dd)
+{
+       u64 reg;
+
+       reg = read_csr(dd, DC_DC8051_STS_CUR_STATE);
+       return (reg >> DC_DC8051_STS_CUR_STATE_PORT_SHIFT)
+                               & DC_DC8051_STS_CUR_STATE_PORT_MASK;
+}
+
+static u32 read_logical_state(struct hfi1_devdata *dd)
+{
+       u64 reg;
+
+       reg = read_csr(dd, DCC_CFG_PORT_CONFIG);
+       return (reg >> DCC_CFG_PORT_CONFIG_LINK_STATE_SHIFT)
+                               & DCC_CFG_PORT_CONFIG_LINK_STATE_MASK;
+}
+
+static void set_logical_state(struct hfi1_devdata *dd, u32 chip_lstate)
+{
+       u64 reg;
+
+       reg = read_csr(dd, DCC_CFG_PORT_CONFIG);
+       /* clear current state, set new state */
+       reg &= ~DCC_CFG_PORT_CONFIG_LINK_STATE_SMASK;
+       reg |= (u64)chip_lstate << DCC_CFG_PORT_CONFIG_LINK_STATE_SHIFT;
+       write_csr(dd, DCC_CFG_PORT_CONFIG, reg);
+}
+
+/*
+ * Use the 8051 to read a LCB CSR.
+ */
+static int read_lcb_via_8051(struct hfi1_devdata *dd, u32 addr, u64 *data)
+{
+       u32 regno;
+       int ret;
+
+       if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR) {
+               if (acquire_lcb_access(dd, 0) == 0) {
+                       *data = read_csr(dd, addr);
+                       release_lcb_access(dd, 0);
+                       return 0;
+               }
+               return -EBUSY;
+       }
+
+       /* register is an index of LCB registers: (offset - base) / 8 */
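+       /* e.g. addr == DC_LCB_CFG_RUN (the base) maps to regno 0 */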
+       regno = (addr - DC_LCB_CFG_RUN) >> 3;
+       ret = do_8051_command(dd, HCMD_READ_LCB_CSR, regno, data);
+       if (ret != HCMD_SUCCESS)
+               return -EBUSY;
+       return 0;
+}
+
+/*
+ * Read an LCB CSR.  Access may not be in host control, so check.
+ * Return 0 on success, -EBUSY on failure.
+ */
+int read_lcb_csr(struct hfi1_devdata *dd, u32 addr, u64 *data)
+{
+       struct hfi1_pportdata *ppd = dd->pport;
+
+       /* if up, go through the 8051 for the value */
+       if (ppd->host_link_state & HLS_UP)
+               return read_lcb_via_8051(dd, addr, data);
+       /* if going up or down, no access */
+       if (ppd->host_link_state & (HLS_GOING_UP | HLS_GOING_OFFLINE))
+               return -EBUSY;
+       /* otherwise, host has access */
+       *data = read_csr(dd, addr);
+       return 0;
+}
+
+/*
+ * Use the 8051 to write a LCB CSR.
+ */
+static int write_lcb_via_8051(struct hfi1_devdata *dd, u32 addr, u64 data)
+{
+       if (acquire_lcb_access(dd, 0) == 0) {
+               write_csr(dd, addr, data);
+               release_lcb_access(dd, 0);
+               return 0;
+       }
+       return -EBUSY;
+}
+
+/*
+ * Write an LCB CSR.  Access may not be in host control, so check.
+ * Return 0 on success, -EBUSY on failure.
+ */
+int write_lcb_csr(struct hfi1_devdata *dd, u32 addr, u64 data)
+{
+       struct hfi1_pportdata *ppd = dd->pport;
+
+       /* if up, go through the 8051 for the value */
+       if (ppd->host_link_state & HLS_UP)
+               return write_lcb_via_8051(dd, addr, data);
+       /* if going up or down, no access */
+       if (ppd->host_link_state & (HLS_GOING_UP | HLS_GOING_OFFLINE))
+               return -EBUSY;
+       /* otherwise, host has access */
+       write_csr(dd, addr, data);
+       return 0;
+}
+
+/*
+ * Returns:
+ *     < 0 = Linux error, not able to get access
+ *     > 0 = 8051 command RETURN_CODE
+ */
+static int do_8051_command(
+       struct hfi1_devdata *dd,
+       u32 type,
+       u64 in_data,
+       u64 *out_data)
+{
+       u64 reg, completed;
+       int return_code;
+       unsigned long flags;
+       unsigned long timeout;
+
+       hfi1_cdbg(DC8051, "type %d, data 0x%012llx", type, in_data);
+
+       /*
+        * Alternative to holding the lock for a long time:
+        * - keep busy wait - have other users bounce off
+        */
+       spin_lock_irqsave(&dd->dc8051_lock, flags);
+
+       /* We can't send any commands to the 8051 if it's in reset */
+       if (dd->dc_shutdown) {
+               return_code = -ENODEV;
+               goto fail;
+       }
+
+       /*
+        * If an 8051 host command timed out previously, then the 8051 is
+        * stuck.
+        *
+        * On first timeout, attempt to reset and restart the entire DC
+        * block (including 8051). (Is this too big of a hammer?)
+        *
+        * If the 8051 times out a second time, the reset did not bring it
+        * back to healthy life. In that case, fail any subsequent commands.
+        */
+       if (dd->dc8051_timed_out) {
+               if (dd->dc8051_timed_out > 1) {
+                       dd_dev_err(dd,
+                                  "Previous 8051 host command timed out, skipping command %u\n",
+                                  type);
+                       return_code = -ENXIO;
+                       goto fail;
+               }
+               spin_unlock_irqrestore(&dd->dc8051_lock, flags);
+               dc_shutdown(dd);
+               dc_start(dd);
+               spin_lock_irqsave(&dd->dc8051_lock, flags);
+       }
+
+       /*
+        * If there is no timeout, then the 8051 command interface is
+        * waiting for a command.
+        */
+
+       /*
+        * Do two writes: the first to stabilize the type and req_data, the
+        * second to activate.
+        */
+       reg = ((u64)type & DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_MASK)
+                       << DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_SHIFT
+               | (in_data & DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_MASK)
+                       << DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_SHIFT;
+       write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, reg);
+       reg |= DC_DC8051_CFG_HOST_CMD_0_REQ_NEW_SMASK;
+       write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, reg);
+
+       /* wait for completion, alternate: interrupt */
+       timeout = jiffies + msecs_to_jiffies(DC8051_COMMAND_TIMEOUT);
+       while (1) {
+               reg = read_csr(dd, DC_DC8051_CFG_HOST_CMD_1);
+               completed = reg & DC_DC8051_CFG_HOST_CMD_1_COMPLETED_SMASK;
+               if (completed)
+                       break;
+               if (time_after(jiffies, timeout)) {
+                       dd->dc8051_timed_out++;
+                       dd_dev_err(dd, "8051 host command %u timeout\n", type);
+                       if (out_data)
+                               *out_data = 0;
+                       return_code = -ETIMEDOUT;
+                       goto fail;
+               }
+               udelay(2);
+       }
+
+       if (out_data) {
+               *out_data = (reg >> DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_SHIFT)
+                               & DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_MASK;
+               if (type == HCMD_READ_LCB_CSR) {
+                       /* top 16 bits are in a different register */
+                       *out_data |= (read_csr(dd, DC_DC8051_CFG_EXT_DEV_1)
+                               & DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SMASK)
+                               << (48
+                                   - DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SHIFT);
+               }
+       }
+       return_code = (reg >> DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_SHIFT)
+                               & DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_MASK;
+       dd->dc8051_timed_out = 0;
+       /*
+        * Clear command for next user.
+        */
+       write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, 0);
+
+fail:
+       spin_unlock_irqrestore(&dd->dc8051_lock, flags);
+
+       return return_code;
+}
+
+static int set_physical_link_state(struct hfi1_devdata *dd, u64 state)
+{
+       return do_8051_command(dd, HCMD_CHANGE_PHY_STATE, state, NULL);
+}
+
+static int load_8051_config(struct hfi1_devdata *dd, u8 field_id,
+                           u8 lane_id, u32 config_data)
+{
+       u64 data;
+       int ret;
+
+       data = (u64)field_id << LOAD_DATA_FIELD_ID_SHIFT
+               | (u64)lane_id << LOAD_DATA_LANE_ID_SHIFT
+               | (u64)config_data << LOAD_DATA_DATA_SHIFT;
+       ret = do_8051_command(dd, HCMD_LOAD_CONFIG_DATA, data, NULL);
+       if (ret != HCMD_SUCCESS) {
+               dd_dev_err(dd,
+                       "load 8051 config: field id %d, lane %d, err %d\n",
+                       (int)field_id, (int)lane_id, ret);
+       }
+       return ret;
+}
+
+/*
+ * Read the 8051 firmware "registers".  Use the RAM directly.  Always
+ * set the result, even on error.
+ * Return 0 on success, -errno on failure
+ */
+static int read_8051_config(struct hfi1_devdata *dd, u8 field_id, u8 lane_id,
+                           u32 *result)
+{
+       u64 big_data;
+       u32 addr;
+       int ret;
+
+       /* address start depends on the lane_id */
+       if (lane_id < 4)
+               addr = (4 * NUM_GENERAL_FIELDS)
+                       + (lane_id * 4 * NUM_LANE_FIELDS);
+       else
+               addr = 0;
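+               /* lane_id >= 4 (e.g. GENERAL_CONFIG) selects the general fields */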
+       addr += field_id * 4;
+
+       /* read is in 8-byte chunks, hardware will truncate the address down */
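+       /* the chunk holds two 32-bit fields; addr bit 2 picks the half below */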
+       ret = read_8051_data(dd, addr, 8, &big_data);
+
+       if (ret == 0) {
+               /* extract the 4 bytes we want */
+               if (addr & 0x4)
+                       *result = (u32)(big_data >> 32);
+               else
+                       *result = (u32)big_data;
+       } else {
+               *result = 0;
+               dd_dev_err(dd, "%s: direct read failed, lane %d, field %d!\n",
+                       __func__, lane_id, field_id);
+       }
+
+       return ret;
+}
+
+static int write_vc_local_phy(struct hfi1_devdata *dd, u8 power_management,
+                             u8 continuous)
+{
+       u32 frame;
+
+       frame = continuous << CONTINIOUS_REMOTE_UPDATE_SUPPORT_SHIFT
+               | power_management << POWER_MANAGEMENT_SHIFT;
+       return load_8051_config(dd, VERIFY_CAP_LOCAL_PHY,
+                               GENERAL_CONFIG, frame);
+}
+
+static int write_vc_local_fabric(struct hfi1_devdata *dd, u8 vau, u8 z, u8 vcu,
+                                u16 vl15buf, u8 crc_sizes)
+{
+       u32 frame;
+
+       frame = (u32)vau << VAU_SHIFT
+               | (u32)z << Z_SHIFT
+               | (u32)vcu << VCU_SHIFT
+               | (u32)vl15buf << VL15BUF_SHIFT
+               | (u32)crc_sizes << CRC_SIZES_SHIFT;
+       return load_8051_config(dd, VERIFY_CAP_LOCAL_FABRIC,
+                               GENERAL_CONFIG, frame);
+}
+
+static void read_vc_local_link_width(struct hfi1_devdata *dd, u8 *misc_bits,
+                                    u8 *flag_bits, u16 *link_widths)
+{
+       u32 frame;
+
+       read_8051_config(dd, VERIFY_CAP_LOCAL_LINK_WIDTH, GENERAL_CONFIG,
+                               &frame);
+       *misc_bits = (frame >> MISC_CONFIG_BITS_SHIFT) & MISC_CONFIG_BITS_MASK;
+       *flag_bits = (frame >> LOCAL_FLAG_BITS_SHIFT) & LOCAL_FLAG_BITS_MASK;
+       *link_widths = (frame >> LINK_WIDTH_SHIFT) & LINK_WIDTH_MASK;
+}
+
+static int write_vc_local_link_width(struct hfi1_devdata *dd,
+                                    u8 misc_bits,
+                                    u8 flag_bits,
+                                    u16 link_widths)
+{
+       u32 frame;
+
+       frame = (u32)misc_bits << MISC_CONFIG_BITS_SHIFT
+               | (u32)flag_bits << LOCAL_FLAG_BITS_SHIFT
+               | (u32)link_widths << LINK_WIDTH_SHIFT;
+       return load_8051_config(dd, VERIFY_CAP_LOCAL_LINK_WIDTH, GENERAL_CONFIG,
+                    frame);
+}
+
+static int write_local_device_id(struct hfi1_devdata *dd, u16 device_id,
+                                u8 device_rev)
+{
+       u32 frame;
+
+       frame = ((u32)device_id << LOCAL_DEVICE_ID_SHIFT)
+               | ((u32)device_rev << LOCAL_DEVICE_REV_SHIFT);
+       return load_8051_config(dd, LOCAL_DEVICE_ID, GENERAL_CONFIG, frame);
+}
+
+static void read_remote_device_id(struct hfi1_devdata *dd, u16 *device_id,
+                                 u8 *device_rev)
+{
+       u32 frame;
+
+       read_8051_config(dd, REMOTE_DEVICE_ID, GENERAL_CONFIG, &frame);
+       *device_id = (frame >> REMOTE_DEVICE_ID_SHIFT) & REMOTE_DEVICE_ID_MASK;
+       *device_rev = (frame >> REMOTE_DEVICE_REV_SHIFT)
+                       & REMOTE_DEVICE_REV_MASK;
+}
+
+void read_misc_status(struct hfi1_devdata *dd, u8 *ver_a, u8 *ver_b)
+{
+       u32 frame;
+
+       read_8051_config(dd, MISC_STATUS, GENERAL_CONFIG, &frame);
+       *ver_a = (frame >> STS_FM_VERSION_A_SHIFT) & STS_FM_VERSION_A_MASK;
+       *ver_b = (frame >> STS_FM_VERSION_B_SHIFT) & STS_FM_VERSION_B_MASK;
+}
+
+static void read_vc_remote_phy(struct hfi1_devdata *dd, u8 *power_management,
+                              u8 *continuous)
+{
+       u32 frame;
+
+       read_8051_config(dd, VERIFY_CAP_REMOTE_PHY, GENERAL_CONFIG, &frame);
+       *power_management = (frame >> POWER_MANAGEMENT_SHIFT)
+                                       & POWER_MANAGEMENT_MASK;
+       *continuous = (frame >> CONTINIOUS_REMOTE_UPDATE_SUPPORT_SHIFT)
+                                       & CONTINIOUS_REMOTE_UPDATE_SUPPORT_MASK;
+}
+
+static void read_vc_remote_fabric(struct hfi1_devdata *dd, u8 *vau, u8 *z,
+                                 u8 *vcu, u16 *vl15buf, u8 *crc_sizes)
+{
+       u32 frame;
+
+       read_8051_config(dd, VERIFY_CAP_REMOTE_FABRIC, GENERAL_CONFIG, &frame);
+       *vau = (frame >> VAU_SHIFT) & VAU_MASK;
+       *z = (frame >> Z_SHIFT) & Z_MASK;
+       *vcu = (frame >> VCU_SHIFT) & VCU_MASK;
+       *vl15buf = (frame >> VL15BUF_SHIFT) & VL15BUF_MASK;
+       *crc_sizes = (frame >> CRC_SIZES_SHIFT) & CRC_SIZES_MASK;
+}
+
+static void read_vc_remote_link_width(struct hfi1_devdata *dd,
+                                     u8 *remote_tx_rate,
+                                     u16 *link_widths)
+{
+       u32 frame;
+
+       read_8051_config(dd, VERIFY_CAP_REMOTE_LINK_WIDTH, GENERAL_CONFIG,
+                               &frame);
+       *remote_tx_rate = (frame >> REMOTE_TX_RATE_SHIFT)
+                               & REMOTE_TX_RATE_MASK;
+       *link_widths = (frame >> LINK_WIDTH_SHIFT) & LINK_WIDTH_MASK;
+}
+
+static void read_local_lni(struct hfi1_devdata *dd, u8 *enable_lane_rx)
+{
+       u32 frame;
+
+       read_8051_config(dd, LOCAL_LNI_INFO, GENERAL_CONFIG, &frame);
+       *enable_lane_rx = (frame >> ENABLE_LANE_RX_SHIFT) & ENABLE_LANE_RX_MASK;
+}
+
+static void read_mgmt_allowed(struct hfi1_devdata *dd, u8 *mgmt_allowed)
+{
+       u32 frame;
+
+       read_8051_config(dd, REMOTE_LNI_INFO, GENERAL_CONFIG, &frame);
+       *mgmt_allowed = (frame >> MGMT_ALLOWED_SHIFT) & MGMT_ALLOWED_MASK;
+}
+
+static void read_last_local_state(struct hfi1_devdata *dd, u32 *lls)
+{
+       read_8051_config(dd, LAST_LOCAL_STATE_COMPLETE, GENERAL_CONFIG, lls);
+}
+
+static void read_last_remote_state(struct hfi1_devdata *dd, u32 *lrs)
+{
+       read_8051_config(dd, LAST_REMOTE_STATE_COMPLETE, GENERAL_CONFIG, lrs);
+}
+
+void hfi1_read_link_quality(struct hfi1_devdata *dd, u8 *link_quality)
+{
+       u32 frame;
+       int ret;
+
+       *link_quality = 0;
+       if (dd->pport->host_link_state & HLS_UP) {
+               ret = read_8051_config(dd, LINK_QUALITY_INFO, GENERAL_CONFIG,
+                                       &frame);
+               if (ret == 0)
+                       *link_quality = (frame >> LINK_QUALITY_SHIFT)
+                                               & LINK_QUALITY_MASK;
+       }
+}
+
+static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc)
+{
+       u32 frame;
+
+       read_8051_config(dd, LINK_QUALITY_INFO, GENERAL_CONFIG, &frame);
+       *pdrrc = (frame >> DOWN_REMOTE_REASON_SHIFT) & DOWN_REMOTE_REASON_MASK;
+}
+
+static int read_tx_settings(struct hfi1_devdata *dd,
+                           u8 *enable_lane_tx,
+                           u8 *tx_polarity_inversion,
+                           u8 *rx_polarity_inversion,
+                           u8 *max_rate)
+{
+       u32 frame;
+       int ret;
+
+       ret = read_8051_config(dd, TX_SETTINGS, GENERAL_CONFIG, &frame);
+       *enable_lane_tx = (frame >> ENABLE_LANE_TX_SHIFT)
+                               & ENABLE_LANE_TX_MASK;
+       *tx_polarity_inversion = (frame >> TX_POLARITY_INVERSION_SHIFT)
+                               & TX_POLARITY_INVERSION_MASK;
+       *rx_polarity_inversion = (frame >> RX_POLARITY_INVERSION_SHIFT)
+                               & RX_POLARITY_INVERSION_MASK;
+       *max_rate = (frame >> MAX_RATE_SHIFT) & MAX_RATE_MASK;
+       return ret;
+}
+
+static int write_tx_settings(struct hfi1_devdata *dd,
+                            u8 enable_lane_tx,
+                            u8 tx_polarity_inversion,
+                            u8 rx_polarity_inversion,
+                            u8 max_rate)
+{
+       u32 frame;
+
+       /* no need to mask, all variable sizes match field widths */
+       frame = enable_lane_tx << ENABLE_LANE_TX_SHIFT
+               | tx_polarity_inversion << TX_POLARITY_INVERSION_SHIFT
+               | rx_polarity_inversion << RX_POLARITY_INVERSION_SHIFT
+               | max_rate << MAX_RATE_SHIFT;
+       return load_8051_config(dd, TX_SETTINGS, GENERAL_CONFIG, frame);
+}
+
+static void check_fabric_firmware_versions(struct hfi1_devdata *dd)
+{
+       u32 frame, version, prod_id;
+       int ret, lane;
+
+       /* 4 lanes */
+       for (lane = 0; lane < 4; lane++) {
+               ret = read_8051_config(dd, SPICO_FW_VERSION, lane, &frame);
+               if (ret) {
+                       dd_dev_err(
+                               dd,
+                               "Unable to read lane %d firmware details\n",
+                               lane);
+                       continue;
+               }
+               version = (frame >> SPICO_ROM_VERSION_SHIFT)
+                                       & SPICO_ROM_VERSION_MASK;
+               prod_id = (frame >> SPICO_ROM_PROD_ID_SHIFT)
+                                       & SPICO_ROM_PROD_ID_MASK;
+               dd_dev_info(dd,
+                       "Lane %d firmware: version 0x%04x, prod_id 0x%04x\n",
+                       lane, version, prod_id);
+       }
+}
+
+/*
+ * Read an idle LCB message.
+ *
+ * Returns 0 on success, -EINVAL on error
+ */
+static int read_idle_message(struct hfi1_devdata *dd, u64 type, u64 *data_out)
+{
+       int ret;
+
+       ret = do_8051_command(dd, HCMD_READ_LCB_IDLE_MSG,
+               type, data_out);
+       if (ret != HCMD_SUCCESS) {
+               dd_dev_err(dd, "read idle message: type %d, err %d\n",
+                       (u32)type, ret);
+               return -EINVAL;
+       }
+       dd_dev_info(dd, "%s: read idle message 0x%llx\n", __func__, *data_out);
+       /* return only the payload as we already know the type */
+       *data_out >>= IDLE_PAYLOAD_SHIFT;
+       return 0;
+}
+
+/*
+ * Read an idle SMA message.  To be done in response to a notification from
+ * the 8051.
+ *
+ * Returns 0 on success, -EINVAL on error
+ */
+static int read_idle_sma(struct hfi1_devdata *dd, u64 *data)
+{
+       return read_idle_message(dd,
+                       (u64)IDLE_SMA << IDLE_MSG_TYPE_SHIFT, data);
+}
+
+/*
+ * Send an idle LCB message.
+ *
+ * Returns 0 on success, -EINVAL on error
+ */
+static int send_idle_message(struct hfi1_devdata *dd, u64 data)
+{
+       int ret;
+
+       dd_dev_info(dd, "%s: sending idle message 0x%llx\n", __func__, data);
+       ret = do_8051_command(dd, HCMD_SEND_LCB_IDLE_MSG, data, NULL);
+       if (ret != HCMD_SUCCESS) {
+               dd_dev_err(dd, "send idle message: data 0x%llx, err %d\n",
+                       data, ret);
+               return -EINVAL;
+       }
+       return 0;
+}
+
+/*
+ * Send an idle SMA message.
+ *
+ * Returns 0 on success, -EINVAL on error
+ */
+int send_idle_sma(struct hfi1_devdata *dd, u64 message)
+{
+       u64 data;
+
+       data = ((message & IDLE_PAYLOAD_MASK) << IDLE_PAYLOAD_SHIFT)
+               | ((u64)IDLE_SMA << IDLE_MSG_TYPE_SHIFT);
+       return send_idle_message(dd, data);
+}
+
+/*
+ * Initialize the LCB then do a quick link up.  This may or may not be
+ * in loopback.
+ *
+ * return 0 on success, -errno on error
+ */
+static int do_quick_linkup(struct hfi1_devdata *dd)
+{
+       u64 reg;
+       unsigned long timeout;
+       int ret;
+
+       lcb_shutdown(dd, 0);
+
+       if (loopback) {
+               /* LCB_CFG_LOOPBACK.VAL = 2 */
+               /* LCB_CFG_LANE_WIDTH.VAL = 0 */
+               write_csr(dd, DC_LCB_CFG_LOOPBACK,
+                       IB_PACKET_TYPE << DC_LCB_CFG_LOOPBACK_VAL_SHIFT);
+               write_csr(dd, DC_LCB_CFG_LANE_WIDTH, 0);
+       }
+
+       /* start the LCBs */
+       /* LCB_CFG_TX_FIFOS_RESET.VAL = 0 */
+       write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0);
+
+       /* simulator only loopback steps */
+       if (loopback && dd->icode == ICODE_FUNCTIONAL_SIMULATOR) {
+               /* LCB_CFG_RUN.EN = 1 */
+               write_csr(dd, DC_LCB_CFG_RUN,
+                       1ull << DC_LCB_CFG_RUN_EN_SHIFT);
+
+               /* watch LCB_STS_LINK_TRANSFER_ACTIVE */
+               timeout = jiffies + msecs_to_jiffies(10);
+               while (1) {
+                       reg = read_csr(dd,
+                               DC_LCB_STS_LINK_TRANSFER_ACTIVE);
+                       if (reg)
+                               break;
+                       if (time_after(jiffies, timeout)) {
+                               dd_dev_err(dd,
+                                       "timeout waiting for LINK_TRANSFER_ACTIVE\n");
+                               return -ETIMEDOUT;
+                       }
+                       udelay(2);
+               }
+
+               write_csr(dd, DC_LCB_CFG_ALLOW_LINK_UP,
+                       1ull << DC_LCB_CFG_ALLOW_LINK_UP_VAL_SHIFT);
+       }
+
+       if (!loopback) {
+               /*
+                * When doing quick linkup and not in loopback, both
+                * sides must be done with LCB set-up before either
+                * starts the quick linkup.  Put a delay here so that
+                * both sides can be started and have a chance to be
+                * done with LCB set up before resuming.
+                */
+               dd_dev_err(dd,
+                       "Pausing for peer to be finished with LCB set up\n");
+               msleep(5000);
+               dd_dev_err(dd,
+                       "Continuing with quick linkup\n");
+       }
+
+       write_csr(dd, DC_LCB_ERR_EN, 0); /* mask LCB errors */
+       set_8051_lcb_access(dd);
+
+       /*
+        * State "quick" LinkUp request sets the physical link state to
+        * LinkUp without a verify capability sequence.
+        * This state is in simulator v37 and later.
+        */
+       ret = set_physical_link_state(dd, PLS_QUICK_LINKUP);
+       if (ret != HCMD_SUCCESS) {
+               dd_dev_err(dd,
+                       "%s: set physical link state to quick LinkUp failed with return %d\n",
+                       __func__, ret);
+
+               set_host_lcb_access(dd);
+               write_csr(dd, DC_LCB_ERR_EN, ~0ull); /* watch LCB errors */
+
+               if (ret >= 0)
+                       ret = -EINVAL;
+               return ret;
+       }
+
+       return 0; /* success */
+}
+
+/*
+ * Set the SerDes to internal loopback mode.
+ * Returns 0 on success, -errno on error.
+ */
+static int set_serdes_loopback_mode(struct hfi1_devdata *dd)
+{
+       int ret;
+
+       ret = set_physical_link_state(dd, PLS_INTERNAL_SERDES_LOOPBACK);
+       if (ret == HCMD_SUCCESS)
+               return 0;
+       dd_dev_err(dd,
+               "Set physical link state to SerDes Loopback failed with return %d\n",
+               ret);
+       if (ret >= 0)
+               ret = -EINVAL;
+       return ret;
+}
+
+/*
+ * Do all special steps to set up loopback.
+ */
+static int init_loopback(struct hfi1_devdata *dd)
+{
+       dd_dev_info(dd, "Entering loopback mode\n");
+
+       /* all loopbacks should disable self GUID check */
+       write_csr(dd, DC_DC8051_CFG_MODE,
+               (read_csr(dd, DC_DC8051_CFG_MODE) | DISABLE_SELF_GUID_CHECK));
+
+       /*
+        * The simulator has only one loopback option - LCB.  Switch
+        * to that option, which includes quick link up.
+        *
+        * Accept all valid loopback values.
+        */
+       if ((dd->icode == ICODE_FUNCTIONAL_SIMULATOR)
+               && (loopback == LOOPBACK_SERDES
+                       || loopback == LOOPBACK_LCB
+                       || loopback == LOOPBACK_CABLE)) {
+               loopback = LOOPBACK_LCB;
+               quick_linkup = 1;
+               return 0;
+       }
+
+       /* handle serdes loopback */
+       if (loopback == LOOPBACK_SERDES) {
+               /* internal serdes loopback needs quick linkup on RTL */
+               if (dd->icode == ICODE_RTL_SILICON)
+                       quick_linkup = 1;
+               return set_serdes_loopback_mode(dd);
+       }
+
+       /* LCB loopback - handled at poll time */
+       if (loopback == LOOPBACK_LCB) {
+               quick_linkup = 1; /* LCB is always quick linkup */
+
+               /* not supported in emulation due to emulation RTL changes */
+               if (dd->icode == ICODE_FPGA_EMULATION) {
+                       dd_dev_err(dd,
+                               "LCB loopback not supported in emulation\n");
+                       return -EINVAL;
+               }
+               return 0;
+       }
+
+       /* external cable loopback requires no extra steps */
+       if (loopback == LOOPBACK_CABLE)
+               return 0;
+
+       dd_dev_err(dd, "Invalid loopback mode %d\n", loopback);
+       return -EINVAL;
+}
+
+/*
+ * Translate from the OPA_LINK_WIDTH handed to us by the FM to bits
+ * used in the Verify Capability link width attribute.
+ */
+static u16 opa_to_vc_link_widths(u16 opa_widths)
+{
+       int i;
+       u16 result = 0;
+
+       static const struct link_bits {
+               u16 from;
+               u16 to;
+       } opa_link_xlate[] = {
+               { OPA_LINK_WIDTH_1X, 1 << (1-1)  },
+               { OPA_LINK_WIDTH_2X, 1 << (2-1)  },
+               { OPA_LINK_WIDTH_3X, 1 << (3-1)  },
+               { OPA_LINK_WIDTH_4X, 1 << (4-1)  },
+       };
+
+       for (i = 0; i < ARRAY_SIZE(opa_link_xlate); i++) {
+               if (opa_widths & opa_link_xlate[i].from)
+                       result |= opa_link_xlate[i].to;
+       }
+       return result;
+}
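
A quick self-contained illustration of the translation above, under the assumption that the OPA_LINK_WIDTH_* enums are one-hot bits (the real enum values live in headers not shown in this diff): requesting 1X and 4X yields Verify Capability bits 0 and 3.

#include <stdint.h>
#include <stdio.h>

/* Assumed one-hot OPA width bits, for illustration only. */
#define EX_OPA_LINK_WIDTH_1X 0x1
#define EX_OPA_LINK_WIDTH_2X 0x2
#define EX_OPA_LINK_WIDTH_3X 0x4
#define EX_OPA_LINK_WIDTH_4X 0x8

static uint16_t ex_opa_to_vc(uint16_t opa_widths)
{
        static const struct { uint16_t from, to; } xlate[] = {
                { EX_OPA_LINK_WIDTH_1X, 1 << 0 },
                { EX_OPA_LINK_WIDTH_2X, 1 << 1 },
                { EX_OPA_LINK_WIDTH_3X, 1 << 2 },
                { EX_OPA_LINK_WIDTH_4X, 1 << 3 },
        };
        uint16_t result = 0;
        size_t i;

        for (i = 0; i < sizeof(xlate) / sizeof(xlate[0]); i++)
                if (opa_widths & xlate[i].from)
                        result |= xlate[i].to;
        return result;
}

int main(void)
{
        /* 1X and 4X enabled -> VC bits 0 and 3 -> 0x9 */
        printf("0x%x\n",
               ex_opa_to_vc(EX_OPA_LINK_WIDTH_1X | EX_OPA_LINK_WIDTH_4X));
        return 0;
}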
+
+/*
+ * Set link attributes before moving to polling.
+ */
+static int set_local_link_attributes(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u8 enable_lane_tx;
+       u8 tx_polarity_inversion;
+       u8 rx_polarity_inversion;
+       int ret;
+
+       /* reset our fabric serdes to clear any lingering problems */
+       fabric_serdes_reset(dd);
+
+       /* set the local tx rate - need to read-modify-write */
+       ret = read_tx_settings(dd, &enable_lane_tx, &tx_polarity_inversion,
+               &rx_polarity_inversion, &ppd->local_tx_rate);
+       if (ret)
+               goto set_local_link_attributes_fail;
+
+       if (dd->dc8051_ver < dc8051_ver(0, 20)) {
+               /* set the tx rate to the fastest enabled */
+               if (ppd->link_speed_enabled & OPA_LINK_SPEED_25G)
+                       ppd->local_tx_rate = 1;
+               else
+                       ppd->local_tx_rate = 0;
+       } else {
+               /* set the tx rate to all enabled */
+               ppd->local_tx_rate = 0;
+               if (ppd->link_speed_enabled & OPA_LINK_SPEED_25G)
+                       ppd->local_tx_rate |= 2;
+               if (ppd->link_speed_enabled & OPA_LINK_SPEED_12_5G)
+                       ppd->local_tx_rate |= 1;
+       }
+       ret = write_tx_settings(dd, enable_lane_tx, tx_polarity_inversion,
+                    rx_polarity_inversion, ppd->local_tx_rate);
+       if (ret != HCMD_SUCCESS)
+               goto set_local_link_attributes_fail;
+
+       /*
+        * DC supports continuous updates.
+        */
+       ret = write_vc_local_phy(dd, 0 /* no power management */,
+                                    1 /* continuous updates */);
+       if (ret != HCMD_SUCCESS)
+               goto set_local_link_attributes_fail;
+
+       /* z=1 in the next call: AU of 0 is not supported by the hardware */
+       ret = write_vc_local_fabric(dd, dd->vau, 1, dd->vcu, dd->vl15_init,
+                                   ppd->port_crc_mode_enabled);
+       if (ret != HCMD_SUCCESS)
+               goto set_local_link_attributes_fail;
+
+       ret = write_vc_local_link_width(dd, 0, 0,
+                    opa_to_vc_link_widths(ppd->link_width_enabled));
+       if (ret != HCMD_SUCCESS)
+               goto set_local_link_attributes_fail;
+
+       /* let peer know who we are */
+       ret = write_local_device_id(dd, dd->pcidev->device, dd->minrev);
+       if (ret == HCMD_SUCCESS)
+               return 0;
+
+set_local_link_attributes_fail:
+       dd_dev_err(dd,
+               "Failed to set local link attributes, return 0x%x\n",
+               ret);
+       return ret;
+}
+
+/*
+ * Call this to start the link.  Schedule a retry if the cable is not
+ * present or if unable to start polling.  Do not do anything if the
+ * link is disabled.  Returns 0 if the link is disabled or moved to polling.
+ */
+int start_link(struct hfi1_pportdata *ppd)
+{
+       if (!ppd->link_enabled) {
+               dd_dev_info(ppd->dd,
+                       "%s: stopping link start because link is disabled\n",
+                       __func__);
+               return 0;
+       }
+       if (!ppd->driver_link_ready) {
+               dd_dev_info(ppd->dd,
+                       "%s: stopping link start because driver is not ready\n",
+                       __func__);
+               return 0;
+       }
+
+       if (qsfp_mod_present(ppd) || loopback == LOOPBACK_SERDES ||
+                       loopback == LOOPBACK_LCB ||
+                       ppd->dd->icode == ICODE_FUNCTIONAL_SIMULATOR)
+               return set_link_state(ppd, HLS_DN_POLL);
+
+       dd_dev_info(ppd->dd,
+               "%s: stopping link start because no cable is present\n",
+               __func__);
+       return -EAGAIN;
+}
+
+static void reset_qsfp(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u64 mask, qsfp_mask;
+
+       mask = (u64)QSFP_HFI0_RESET_N;
+       qsfp_mask = read_csr(dd,
+               dd->hfi1_id ? ASIC_QSFP2_OE : ASIC_QSFP1_OE);
+       qsfp_mask |= mask;
+       write_csr(dd,
+               dd->hfi1_id ? ASIC_QSFP2_OE : ASIC_QSFP1_OE,
+               qsfp_mask);
+
+       qsfp_mask = read_csr(dd,
+               dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT);
+       qsfp_mask &= ~mask;
+       write_csr(dd,
+               dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT,
+               qsfp_mask);
+
+       udelay(10);
+
+       qsfp_mask |= mask;
+       write_csr(dd,
+               dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT,
+               qsfp_mask);
+}
+
+static int handle_qsfp_error_conditions(struct hfi1_pportdata *ppd,
+                                       u8 *qsfp_interrupt_status)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+
+       if ((qsfp_interrupt_status[0] & QSFP_HIGH_TEMP_ALARM) ||
+               (qsfp_interrupt_status[0] & QSFP_HIGH_TEMP_WARNING))
+               dd_dev_info(dd,
+                       "%s: QSFP cable temperature too high\n",
+                       __func__);
+
+       if ((qsfp_interrupt_status[0] & QSFP_LOW_TEMP_ALARM) ||
+               (qsfp_interrupt_status[0] & QSFP_LOW_TEMP_WARNING))
+               dd_dev_info(dd,
+                       "%s: QSFP cable temperature too low\n",
+                       __func__);
+
+       if ((qsfp_interrupt_status[1] & QSFP_HIGH_VCC_ALARM) ||
+               (qsfp_interrupt_status[1] & QSFP_HIGH_VCC_WARNING))
+               dd_dev_info(dd,
+                       "%s: QSFP supply voltage too high\n",
+                       __func__);
+
+       if ((qsfp_interrupt_status[1] & QSFP_LOW_VCC_ALARM) ||
+               (qsfp_interrupt_status[1] & QSFP_LOW_VCC_WARNING))
+               dd_dev_info(dd,
+                       "%s: QSFP supply voltage too low\n",
+                       __func__);
+
+       /* Byte 2 is vendor specific */
+
+       if ((qsfp_interrupt_status[3] & QSFP_HIGH_POWER_ALARM) ||
+               (qsfp_interrupt_status[3] & QSFP_HIGH_POWER_WARNING))
+               dd_dev_info(dd,
+                       "%s: Cable RX channel 1/2 power too high\n",
+                       __func__);
+
+       if ((qsfp_interrupt_status[3] & QSFP_LOW_POWER_ALARM) ||
+               (qsfp_interrupt_status[3] & QSFP_LOW_POWER_WARNING))
+               dd_dev_info(dd,
+                       "%s: Cable RX channel 1/2 power too low\n",
+                       __func__);
+
+       if ((qsfp_interrupt_status[4] & QSFP_HIGH_POWER_ALARM) ||
+               (qsfp_interrupt_status[4] & QSFP_HIGH_POWER_WARNING))
+               dd_dev_info(dd,
+                       "%s: Cable RX channel 3/4 power too high\n",
+                       __func__);
+
+       if ((qsfp_interrupt_status[4] & QSFP_LOW_POWER_ALARM) ||
+               (qsfp_interrupt_status[4] & QSFP_LOW_POWER_WARNING))
+               dd_dev_info(dd,
+                       "%s: Cable RX channel 3/4 power too low\n",
+                       __func__);
+
+       if ((qsfp_interrupt_status[5] & QSFP_HIGH_BIAS_ALARM) ||
+               (qsfp_interrupt_status[5] & QSFP_HIGH_BIAS_WARNING))
+               dd_dev_info(dd,
+                       "%s: Cable TX channel 1/2 bias too high\n",
+                       __func__);
+
+       if ((qsfp_interrupt_status[5] & QSFP_LOW_BIAS_ALARM) ||
+               (qsfp_interrupt_status[5] & QSFP_LOW_BIAS_WARNING))
+               dd_dev_info(dd,
+                       "%s: Cable TX channel 1/2 bias too low\n",
+                       __func__);
+
+       if ((qsfp_interrupt_status[6] & QSFP_HIGH_BIAS_ALARM) ||
+               (qsfp_interrupt_status[6] & QSFP_HIGH_BIAS_WARNING))
+               dd_dev_info(dd,
+                       "%s: Cable TX channel 3/4 bias too high\n",
+                       __func__);
+
+       if ((qsfp_interrupt_status[6] & QSFP_LOW_BIAS_ALARM) ||
+               (qsfp_interrupt_status[6] & QSFP_LOW_BIAS_WARNING))
+               dd_dev_info(dd,
+                       "%s: Cable TX channel 3/4 bias too low\n",
+                       __func__);
+
+       if ((qsfp_interrupt_status[7] & QSFP_HIGH_POWER_ALARM) ||
+               (qsfp_interrupt_status[7] & QSFP_HIGH_POWER_WARNING))
+               dd_dev_info(dd,
+                       "%s: Cable TX channel 1/2 power too high\n",
+                       __func__);
+
+       if ((qsfp_interrupt_status[7] & QSFP_LOW_POWER_ALARM) ||
+               (qsfp_interrupt_status[7] & QSFP_LOW_POWER_WARNING))
+               dd_dev_info(dd,
+                       "%s: Cable TX channel 1/2 power too low\n",
+                       __func__);
+
+       if ((qsfp_interrupt_status[8] & QSFP_HIGH_POWER_ALARM) ||
+               (qsfp_interrupt_status[8] & QSFP_HIGH_POWER_WARNING))
+               dd_dev_info(dd,
+                       "%s: Cable TX channel 3/4 power too high\n",
+                       __func__);
+
+       if ((qsfp_interrupt_status[8] & QSFP_LOW_POWER_ALARM) ||
+               (qsfp_interrupt_status[8] & QSFP_LOW_POWER_WARNING))
+               dd_dev_info(dd,
+                       "%s: Cable TX channel 3/4 power too low\n",
+                       __func__);
+
+       /* Bytes 9-10 and 11-12 are reserved */
+       /* Bytes 13-15 are vendor specific */
+
+       return 0;
+}
+
+static int do_pre_lni_host_behaviors(struct hfi1_pportdata *ppd)
+{
+       refresh_qsfp_cache(ppd, &ppd->qsfp_info);
+
+       return 0;
+}
+
+static int do_qsfp_intr_fallback(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u8 qsfp_interrupt_status = 0;
+
+       if (qsfp_read(ppd, dd->hfi1_id, 2, &qsfp_interrupt_status, 1)
+               != 1) {
+               dd_dev_info(dd,
+                       "%s: Failed to read status of QSFP module\n",
+                       __func__);
+               return -EIO;
+       }
+
+       /* We don't care about alarms & warnings with a non-functional INT_N */
+       if (!(qsfp_interrupt_status & QSFP_DATA_NOT_READY))
+               do_pre_lni_host_behaviors(ppd);
+
+       return 0;
+}
+
+/* This routine will only be scheduled if the QSFP module is present */
+static void qsfp_event(struct work_struct *work)
+{
+       struct qsfp_data *qd;
+       struct hfi1_pportdata *ppd;
+       struct hfi1_devdata *dd;
+
+       qd = container_of(work, struct qsfp_data, qsfp_work);
+       ppd = qd->ppd;
+       dd = ppd->dd;
+
+       /* Sanity check */
+       if (!qsfp_mod_present(ppd))
+               return;
+
+       /*
+        * Turn DC back on after the cable has been
+        * re-inserted. Up until now, the DC has been in
+        * reset to save power.
+        */
+       dc_start(dd);
+
+       if (qd->cache_refresh_required) {
+               msleep(3000);
+               reset_qsfp(ppd);
+
+               /* Check for QSFP interrupt after t_init (SFF 8679)
+                * + extra
+                */
+               msleep(3000);
+               if (!qd->qsfp_interrupt_functional) {
+                       if (do_qsfp_intr_fallback(ppd) < 0)
+                               dd_dev_info(dd, "%s: QSFP fallback failed\n",
+                                       __func__);
+                       ppd->driver_link_ready = 1;
+                       start_link(ppd);
+               }
+       }
+
+       if (qd->check_interrupt_flags) {
+               u8 qsfp_interrupt_status[16] = {0,};
+
+               if (qsfp_read(ppd, dd->hfi1_id, 6,
+                             &qsfp_interrupt_status[0], 16) != 16) {
+                       dd_dev_info(dd,
+                               "%s: Failed to read status of QSFP module\n",
+                               __func__);
+               } else {
+                       unsigned long flags;
+                       u8 data_status;
+
+                       spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
+                       ppd->qsfp_info.check_interrupt_flags = 0;
+                       spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock,
+                                                               flags);
+
+                       if (qsfp_read(ppd, dd->hfi1_id, 2, &data_status, 1)
+                                != 1) {
+                               dd_dev_info(dd,
+                               "%s: Failed to read status of QSFP module\n",
+                                       __func__);
+                       }
+                       if (!(data_status & QSFP_DATA_NOT_READY)) {
+                               do_pre_lni_host_behaviors(ppd);
+                               start_link(ppd);
+                       } else
+                               handle_qsfp_error_conditions(ppd,
+                                               qsfp_interrupt_status);
+               }
+       }
+}
+
+void init_qsfp(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u64 qsfp_mask;
+
+       if (loopback == LOOPBACK_SERDES || loopback == LOOPBACK_LCB ||
+                       ppd->dd->icode == ICODE_FUNCTIONAL_SIMULATOR ||
+                       !HFI1_CAP_IS_KSET(QSFP_ENABLED)) {
+               ppd->driver_link_ready = 1;
+               return;
+       }
+
+       ppd->qsfp_info.ppd = ppd;
+       INIT_WORK(&ppd->qsfp_info.qsfp_work, qsfp_event);
+
+       qsfp_mask = (u64)(QSFP_HFI0_INT_N | QSFP_HFI0_MODPRST_N);
+       /* Clear current status to avoid spurious interrupts */
+       write_csr(dd,
+                       dd->hfi1_id ?
+                               ASIC_QSFP2_CLEAR :
+                               ASIC_QSFP1_CLEAR,
+               qsfp_mask);
+
+       /* Handle active low nature of INT_N and MODPRST_N pins */
+       if (qsfp_mod_present(ppd))
+               qsfp_mask &= ~(u64)QSFP_HFI0_MODPRST_N;
+       write_csr(dd,
+                 dd->hfi1_id ? ASIC_QSFP2_INVERT : ASIC_QSFP1_INVERT,
+                 qsfp_mask);
+
+       /* Allow only INT_N and MODPRST_N to trigger QSFP interrupts */
+       qsfp_mask |= (u64)QSFP_HFI0_MODPRST_N;
+       write_csr(dd,
+               dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK,
+               qsfp_mask);
+
+       if (qsfp_mod_present(ppd)) {
+               msleep(3000);
+               reset_qsfp(ppd);
+
+               /* Check for QSFP interrupt after t_init (SFF 8679)
+                * + extra
+                */
+               msleep(3000);
+               if (!ppd->qsfp_info.qsfp_interrupt_functional) {
+                       if (do_qsfp_intr_fallback(ppd) < 0)
+                               dd_dev_info(dd,
+                                       "%s: QSFP fallback failed\n",
+                                       __func__);
+                       ppd->driver_link_ready = 1;
+               }
+       }
+}
+
+int bringup_serdes(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u64 guid;
+       int ret;
+
+       if (HFI1_CAP_IS_KSET(EXTENDED_PSN))
+               add_rcvctrl(dd, RCV_CTRL_RCV_EXTENDED_PSN_ENABLE_SMASK);
+
+       guid = ppd->guid;
+       if (!guid) {
+               if (dd->base_guid)
+                       guid = dd->base_guid + ppd->port - 1;
+               ppd->guid = guid;
+       }
+
+       /* the link defaults to enabled */
+       ppd->link_enabled = 1;
+       /* Set linkinit_reason on power up per OPA spec */
+       ppd->linkinit_reason = OPA_LINKINIT_REASON_LINKUP;
+
+       if (loopback) {
+               ret = init_loopback(dd);
+               if (ret < 0)
+                       return ret;
+       }
+
+       return start_link(ppd);
+}
+
+void hfi1_quiet_serdes(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+
+       /*
+        * Shut down the link and keep it down.  First clear the flag that
+        * says the driver wants to allow the link to be up (driver_link_ready).
+        * Then make sure the link is not automatically restarted
+        * (link_enabled).  Cancel any pending restart.  And finally
+        * go offline.
+        */
+       ppd->driver_link_ready = 0;
+       ppd->link_enabled = 0;
+
+       set_link_down_reason(ppd, OPA_LINKDOWN_REASON_SMA_DISABLED, 0,
+         OPA_LINKDOWN_REASON_SMA_DISABLED);
+       set_link_state(ppd, HLS_DN_OFFLINE);
+
+       /* disable the port */
+       clear_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
+}
+
+static inline int init_cpu_counters(struct hfi1_devdata *dd)
+{
+       struct hfi1_pportdata *ppd;
+       int i;
+
+       ppd = (struct hfi1_pportdata *)(dd + 1);
+       for (i = 0; i < dd->num_pports; i++, ppd++) {
+               ppd->ibport_data.rc_acks = NULL;
+               ppd->ibport_data.rc_qacks = NULL;
+               ppd->ibport_data.rc_acks = alloc_percpu(u64);
+               ppd->ibport_data.rc_qacks = alloc_percpu(u64);
+               ppd->ibport_data.rc_delayed_comp = alloc_percpu(u64);
+               if ((ppd->ibport_data.rc_acks == NULL) ||
+                   (ppd->ibport_data.rc_delayed_comp == NULL) ||
+                   (ppd->ibport_data.rc_qacks == NULL))
+                       return -ENOMEM;
+       }
+
+       return 0;
+}
+
+static const char * const pt_names[] = {
+       "expected",
+       "eager",
+       "invalid"
+};
+
+static const char *pt_name(u32 type)
+{
+       return type >= ARRAY_SIZE(pt_names) ? "unknown" : pt_names[type];
+}
+
+/*
+ * index is the index into the receive array
+ */
+void hfi1_put_tid(struct hfi1_devdata *dd, u32 index,
+                 u32 type, unsigned long pa, u16 order)
+{
+       u64 reg;
+       void __iomem *base = (dd->rcvarray_wc ? dd->rcvarray_wc :
+                             (dd->kregbase + RCV_ARRAY));
+
+       if (!(dd->flags & HFI1_PRESENT))
+               goto done;
+
+       if (type == PT_INVALID) {
+               pa = 0;
+       } else if (type > PT_INVALID) {
+               dd_dev_err(dd,
+                       "unexpected receive array type %u for index %u, not handled\n",
+                       type, index);
+               goto done;
+       }
+
+       hfi1_cdbg(TID, "type %s, index 0x%x, pa 0x%lx, bsize 0x%lx",
+                 pt_name(type), index, pa, (unsigned long)order);
+
+#define RT_ADDR_SHIFT 12       /* 4KB kernel address boundary */
+       reg = RCV_ARRAY_RT_WRITE_ENABLE_SMASK
+               | (u64)order << RCV_ARRAY_RT_BUF_SIZE_SHIFT
+               | ((pa >> RT_ADDR_SHIFT) & RCV_ARRAY_RT_ADDR_MASK)
+                                       << RCV_ARRAY_RT_ADDR_SHIFT;
+       writeq(reg, base + (index * 8));
+
+       if (type == PT_EAGER)
+               /*
+                * Eager entries are written one-by-one so we have to push them
+                * after we write the entry.
+                */
+               flush_wc();
+done:
+       return;
+}
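
For readers following the receive-array write above, this small standalone sketch packs an entry from a physical address and buffer-size order in the same spirit: write-enable bit, buffer size, and the 4KB-aligned address field. The EX_* field positions are assumptions chosen for the example, not the chip's actual register layout.

#include <stdint.h>
#include <stdio.h>

/* Assumed field positions, for illustration only; the real values come
 * from the chip register definitions not shown in this diff. */
#define EX_RT_WRITE_ENABLE      (1ull << 0)
#define EX_RT_BUF_SIZE_SHIFT    1
#define EX_RT_ADDR_MASK         0xffffffffffull
#define EX_RT_ADDR_SHIFT        8
#define EX_RT_PAGE_SHIFT        12   /* 4KB boundary, as in RT_ADDR_SHIFT */

static uint64_t ex_pack_rcv_array_entry(unsigned long pa, uint16_t order)
{
        return EX_RT_WRITE_ENABLE |
               ((uint64_t)order << EX_RT_BUF_SIZE_SHIFT) |
               (((uint64_t)(pa >> EX_RT_PAGE_SHIFT) & EX_RT_ADDR_MASK)
                        << EX_RT_ADDR_SHIFT);
}

int main(void)
{
        /* a 4KB-aligned buffer at 0x12345000 with order 2 */
        printf("entry 0x%llx\n",
               (unsigned long long)ex_pack_rcv_array_entry(0x12345000ul, 2));
        return 0;
}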
+
+void hfi1_clear_tids(struct hfi1_ctxtdata *rcd)
+{
+       struct hfi1_devdata *dd = rcd->dd;
+       u32 i;
+
+       /* this could be optimized */
+       for (i = rcd->eager_base; i < rcd->eager_base +
+                    rcd->egrbufs.alloced; i++)
+               hfi1_put_tid(dd, i, PT_INVALID, 0, 0);
+
+       for (i = rcd->expected_base;
+                       i < rcd->expected_base + rcd->expected_count; i++)
+               hfi1_put_tid(dd, i, PT_INVALID, 0, 0);
+}
+
+int hfi1_get_base_kinfo(struct hfi1_ctxtdata *rcd,
+                       struct hfi1_ctxt_info *kinfo)
+{
+       kinfo->runtime_flags = (HFI1_MISC_GET() << HFI1_CAP_USER_SHIFT) |
+               HFI1_CAP_UGET(MASK) | HFI1_CAP_KGET(K2U);
+       return 0;
+}
+
+struct hfi1_message_header *hfi1_get_msgheader(
+                               struct hfi1_devdata *dd, __le32 *rhf_addr)
+{
+       u32 offset = rhf_hdrq_offset(rhf_to_cpu(rhf_addr));
+
+       return (struct hfi1_message_header *)
+               (rhf_addr - dd->rhf_offset + offset);
+}
+
+static const char * const ib_cfg_name_strings[] = {
+       "HFI1_IB_CFG_LIDLMC",
+       "HFI1_IB_CFG_LWID_DG_ENB",
+       "HFI1_IB_CFG_LWID_ENB",
+       "HFI1_IB_CFG_LWID",
+       "HFI1_IB_CFG_SPD_ENB",
+       "HFI1_IB_CFG_SPD",
+       "HFI1_IB_CFG_RXPOL_ENB",
+       "HFI1_IB_CFG_LREV_ENB",
+       "HFI1_IB_CFG_LINKLATENCY",
+       "HFI1_IB_CFG_HRTBT",
+       "HFI1_IB_CFG_OP_VLS",
+       "HFI1_IB_CFG_VL_HIGH_CAP",
+       "HFI1_IB_CFG_VL_LOW_CAP",
+       "HFI1_IB_CFG_OVERRUN_THRESH",
+       "HFI1_IB_CFG_PHYERR_THRESH",
+       "HFI1_IB_CFG_LINKDEFAULT",
+       "HFI1_IB_CFG_PKEYS",
+       "HFI1_IB_CFG_MTU",
+       "HFI1_IB_CFG_LSTATE",
+       "HFI1_IB_CFG_VL_HIGH_LIMIT",
+       "HFI1_IB_CFG_PMA_TICKS",
+       "HFI1_IB_CFG_PORT"
+};
+
+static const char *ib_cfg_name(int which)
+{
+       if (which < 0 || which >= ARRAY_SIZE(ib_cfg_name_strings))
+               return "invalid";
+       return ib_cfg_name_strings[which];
+}
+
+int hfi1_get_ib_cfg(struct hfi1_pportdata *ppd, int which)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       int val = 0;
+
+       switch (which) {
+       case HFI1_IB_CFG_LWID_ENB: /* allowed Link-width */
+               val = ppd->link_width_enabled;
+               break;
+       case HFI1_IB_CFG_LWID: /* currently active Link-width */
+               val = ppd->link_width_active;
+               break;
+       case HFI1_IB_CFG_SPD_ENB: /* allowed Link speeds */
+               val = ppd->link_speed_enabled;
+               break;
+       case HFI1_IB_CFG_SPD: /* current Link speed */
+               val = ppd->link_speed_active;
+               break;
+
+       case HFI1_IB_CFG_RXPOL_ENB: /* Auto-RX-polarity enable */
+       case HFI1_IB_CFG_LREV_ENB: /* Auto-Lane-reversal enable */
+       case HFI1_IB_CFG_LINKLATENCY:
+               goto unimplemented;
+
+       case HFI1_IB_CFG_OP_VLS:
+               val = ppd->vls_operational;
+               break;
+       case HFI1_IB_CFG_VL_HIGH_CAP: /* VL arb high priority table size */
+               val = VL_ARB_HIGH_PRIO_TABLE_SIZE;
+               break;
+       case HFI1_IB_CFG_VL_LOW_CAP: /* VL arb low priority table size */
+               val = VL_ARB_LOW_PRIO_TABLE_SIZE;
+               break;
+       case HFI1_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */
+               val = ppd->overrun_threshold;
+               break;
+       case HFI1_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */
+               val = ppd->phy_error_threshold;
+               break;
+       case HFI1_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */
+               val = dd->link_default;
+               break;
+
+       case HFI1_IB_CFG_HRTBT: /* Heartbeat off/enable/auto */
+       case HFI1_IB_CFG_PMA_TICKS:
+       default:
+unimplemented:
+               if (HFI1_CAP_IS_KSET(PRINT_UNIMPL))
+                       dd_dev_info(
+                               dd,
+                               "%s: which %s: not implemented\n",
+                               __func__,
+                               ib_cfg_name(which));
+               break;
+       }
+
+       return val;
+}
+
+/*
+ * The largest MAD packet size.
+ */
+#define MAX_MAD_PACKET 2048
+
+/*
+ * Return the maximum header bytes that can go on the _wire_
+ * for this device. This count includes the ICRC which is
+ * not part of the packet held in memory but it is appended
+ * by the HW.
+ * This is dependent on the device's receive header entry size.
+ * HFI allows this to be set per-receive context, but the
+ * driver presently enforces a global value.
+ */
+u32 lrh_max_header_bytes(struct hfi1_devdata *dd)
+{
+       /*
+        * The maximum non-payload (MTU) bytes in LRH.PktLen are
+        * the Receive Header Entry Size minus the PBC (or RHF) size
+        * plus one DW for the ICRC appended by HW.
+        *
+        * dd->rcd[0].rcvhdrqentsize is in DW.
+        * We use rcd[0] as all contexts will have the same value.  Also,
+        * the first kernel context would have been allocated by now so
+        * we are guaranteed a valid value.
+        */
+       return (dd->rcd[0]->rcvhdrqentsize - 2/*PBC/RHF*/ + 1/*ICRC*/) << 2;
+}
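
A worked example of the formula in the comment above, with an illustrative rcvhdrqentsize of 32 DWs (an assumed value, not read from hardware): (32 - 2 + 1) * 4 = 124 bytes on the wire.

#include <stdio.h>

int main(void)
{
        unsigned int rcvhdrqentsize = 32;                     /* DWs */
        unsigned int max_hb = (rcvhdrqentsize - 2 + 1) << 2;  /* bytes */

        printf("max header bytes on the wire: %u\n", max_hb); /* 124 */
        return 0;
}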
+
+/*
+ * Set Send Length
+ * @ppd - per port data
+ *
+ * Set the MTU by limiting how many DWs may be sent.  The SendLenCheck*
+ * registers compare against LRH.PktLen, so use the max bytes included
+ * in the LRH.
+ *
+ * This routine changes all VL values except VL15, which it maintains at
+ * the same value.
+ */
+static void set_send_length(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u32 max_hb = lrh_max_header_bytes(dd), maxvlmtu = 0, dcmtu;
+       u64 len1 = 0, len2 = (((dd->vld[15].mtu + max_hb) >> 2)
+                             & SEND_LEN_CHECK1_LEN_VL15_MASK) <<
+               SEND_LEN_CHECK1_LEN_VL15_SHIFT;
+       int i;
+
+       for (i = 0; i < ppd->vls_supported; i++) {
+               if (dd->vld[i].mtu > maxvlmtu)
+                       maxvlmtu = dd->vld[i].mtu;
+               if (i <= 3)
+                       len1 |= (((dd->vld[i].mtu + max_hb) >> 2)
+                                & SEND_LEN_CHECK0_LEN_VL0_MASK) <<
+                               ((i % 4) * SEND_LEN_CHECK0_LEN_VL1_SHIFT);
+               else
+                       len2 |= (((dd->vld[i].mtu + max_hb) >> 2)
+                                & SEND_LEN_CHECK1_LEN_VL4_MASK) <<
+                               ((i % 4) * SEND_LEN_CHECK1_LEN_VL5_SHIFT);
+       }
+       write_csr(dd, SEND_LEN_CHECK0, len1);
+       write_csr(dd, SEND_LEN_CHECK1, len2);
+       /* adjust kernel credit return thresholds based on new MTUs */
+       /* all kernel receive contexts have the same hdrqentsize */
+       for (i = 0; i < ppd->vls_supported; i++) {
+               sc_set_cr_threshold(dd->vld[i].sc,
+                       sc_mtu_to_threshold(dd->vld[i].sc, dd->vld[i].mtu,
+                               dd->rcd[0]->rcvhdrqentsize));
+       }
+       sc_set_cr_threshold(dd->vld[15].sc,
+               sc_mtu_to_threshold(dd->vld[15].sc, dd->vld[15].mtu,
+                       dd->rcd[0]->rcvhdrqentsize));
+
+       /* Adjust maximum MTU for the port in DC */
+       dcmtu = maxvlmtu == 10240 ? DCC_CFG_PORT_MTU_CAP_10240 :
+               (ilog2(maxvlmtu >> 8) + 1);
+       len1 = read_csr(ppd->dd, DCC_CFG_PORT_CONFIG);
+       len1 &= ~DCC_CFG_PORT_CONFIG_MTU_CAP_SMASK;
+       len1 |= ((u64)dcmtu & DCC_CFG_PORT_CONFIG_MTU_CAP_MASK) <<
+               DCC_CFG_PORT_CONFIG_MTU_CAP_SHIFT;
+       write_csr(ppd->dd, DCC_CFG_PORT_CONFIG, len1);
+}
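
A worked example of the DC MTU-cap encoding at the end of set_send_length(): 10240 uses a dedicated register value, and every other MTU is encoded as ilog2(mtu / 256) + 1. EX_CAP_10240 below is a placeholder for the real DCC_CFG_PORT_MTU_CAP_10240 value, which is defined elsewhere.

#include <stdio.h>

#define EX_CAP_10240 7   /* assumed placeholder value */

static unsigned int ex_ilog2(unsigned int v)
{
        unsigned int r = 0;

        while (v >>= 1)
                r++;
        return r;
}

static unsigned int ex_dc_mtu_cap(unsigned int maxvlmtu)
{
        return maxvlmtu == 10240 ? EX_CAP_10240 : ex_ilog2(maxvlmtu >> 8) + 1;
}

int main(void)
{
        printf("2048 -> %u, 4096 -> %u, 8192 -> %u\n",
               ex_dc_mtu_cap(2048), ex_dc_mtu_cap(4096), ex_dc_mtu_cap(8192));
        /* prints 2048 -> 4, 4096 -> 5, 8192 -> 6 */
        return 0;
}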
+
+static void set_lidlmc(struct hfi1_pportdata *ppd)
+{
+       int i;
+       u64 sreg = 0;
+       struct hfi1_devdata *dd = ppd->dd;
+       u32 mask = ~((1U << ppd->lmc) - 1);
+       u64 c1 = read_csr(ppd->dd, DCC_CFG_PORT_CONFIG1);
+
+       if (dd->hfi1_snoop.mode_flag)
+               dd_dev_info(dd, "Set lid/lmc while snooping");
+
+       c1 &= ~(DCC_CFG_PORT_CONFIG1_TARGET_DLID_SMASK
+               | DCC_CFG_PORT_CONFIG1_DLID_MASK_SMASK);
+       c1 |= ((ppd->lid & DCC_CFG_PORT_CONFIG1_TARGET_DLID_MASK)
+                       << DCC_CFG_PORT_CONFIG1_TARGET_DLID_SHIFT)|
+             ((mask & DCC_CFG_PORT_CONFIG1_DLID_MASK_MASK)
+                       << DCC_CFG_PORT_CONFIG1_DLID_MASK_SHIFT);
+       write_csr(ppd->dd, DCC_CFG_PORT_CONFIG1, c1);
+
+       /*
+        * Iterate over all the send contexts and set their SLID check
+        */
+       sreg = ((mask & SEND_CTXT_CHECK_SLID_MASK_MASK) <<
+                       SEND_CTXT_CHECK_SLID_MASK_SHIFT) |
+              (((ppd->lid & mask) & SEND_CTXT_CHECK_SLID_VALUE_MASK) <<
+                       SEND_CTXT_CHECK_SLID_VALUE_SHIFT);
+
+       for (i = 0; i < dd->chip_send_contexts; i++) {
+               hfi1_cdbg(LINKVERB, "SendContext[%d].SLID_CHECK = 0x%x",
+                         i, (u32)sreg);
+               write_kctxt_csr(dd, i, SEND_CTXT_CHECK_SLID, sreg);
+       }
+
+       /* Now we have to do the same thing for the sdma engines */
+       sdma_update_lmc(dd, mask, ppd->lid);
+}
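
A worked example of how the LMC mask in set_lidlmc() widens the SLID check: with lmc = 2 the low two LID bits are don't-cares, so the four LIDs base..base+3 all pass, and anything outside that window is rejected. The LID value 0x20 is chosen only for illustration.

#include <stdio.h>

int main(void)
{
        unsigned int lmc = 2, lid = 0x20;
        unsigned int mask = ~((1U << lmc) - 1);
        unsigned int candidate;

        for (candidate = 0x1e; candidate <= 0x24; candidate++)
                printf("lid 0x%x %s\n", candidate,
                       ((candidate & mask) == (lid & mask)) ?
                                "matches" : "rejected");
        return 0;
}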
+
+static int wait_phy_linkstate(struct hfi1_devdata *dd, u32 state, u32 msecs)
+{
+       unsigned long timeout;
+       u32 curr_state;
+
+       timeout = jiffies + msecs_to_jiffies(msecs);
+       while (1) {
+               curr_state = read_physical_state(dd);
+               if (curr_state == state)
+                       break;
+               if (time_after(jiffies, timeout)) {
+                       dd_dev_err(dd,
+                               "timeout waiting for phy link state 0x%x, current state is 0x%x\n",
+                               state, curr_state);
+                       return -ETIMEDOUT;
+               }
+               usleep_range(1950, 2050); /* sleep 2ms-ish */
+       }
+
+       return 0;
+}
+
+/*
+ * Helper for set_link_state().  Do not call except from that routine.
+ * Expects ppd->hls_mutex to be held.
+ *
+ * @rem_reason value to be sent to the neighbor
+ *
+ * LinkDownReasons only set if transition succeeds.
+ */
+static int goto_offline(struct hfi1_pportdata *ppd, u8 rem_reason)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u32 pstate, previous_state;
+       u32 last_local_state;
+       u32 last_remote_state;
+       int ret;
+       int do_transition;
+       int do_wait;
+
+       previous_state = ppd->host_link_state;
+       ppd->host_link_state = HLS_GOING_OFFLINE;
+       pstate = read_physical_state(dd);
+       if (pstate == PLS_OFFLINE) {
+               do_transition = 0;      /* in right state */
+               do_wait = 0;            /* ...no need to wait */
+       } else if ((pstate & 0xff) == PLS_OFFLINE) {
+               do_transition = 0;      /* in an offline transient state */
+               do_wait = 1;            /* ...wait for it to settle */
+       } else {
+               do_transition = 1;      /* need to move to offline */
+               do_wait = 1;            /* ...will need to wait */
+       }
+
+       if (do_transition) {
+               ret = set_physical_link_state(dd,
+                       PLS_OFFLINE | (rem_reason << 8));
+
+               if (ret != HCMD_SUCCESS) {
+                       dd_dev_err(dd,
+                               "Failed to transition to Offline link state, return %d\n",
+                               ret);
+                       return -EINVAL;
+               }
+               if (ppd->offline_disabled_reason == OPA_LINKDOWN_REASON_NONE)
+                       ppd->offline_disabled_reason =
+                       OPA_LINKDOWN_REASON_TRANSIENT;
+       }
+
+       if (do_wait) {
+               /* it can take a while for the link to go down */
+               ret = wait_phy_linkstate(dd, PLS_OFFLINE, 5000);
+               if (ret < 0)
+                       return ret;
+       }
+
+       /* make sure the logical state is also down */
+       wait_logical_linkstate(ppd, IB_PORT_DOWN, 1000);
+
+       /*
+        * Now in charge of LCB - must be after the physical state is
+        * offline.quiet and before host_link_state is changed.
+        */
+       set_host_lcb_access(dd);
+       write_csr(dd, DC_LCB_ERR_EN, ~0ull); /* watch LCB errors */
+       ppd->host_link_state = HLS_LINK_COOLDOWN; /* LCB access allowed */
+
+       /*
+        * The LNI has a mandatory wait time after the physical state
+        * moves to Offline.Quiet.  The wait time may be different
+        * depending on how the link went down.  The 8051 firmware
+        * will observe the needed wait time and only move to ready
+        * when that is completed.  The largest of the quiet timeouts
+        * is 2.5s, so wait that long and then a bit more.
+        */
+       ret = wait_fm_ready(dd, 3000);
+       if (ret) {
+               dd_dev_err(dd,
+                       "After going offline, timed out waiting for the 8051 to become ready to accept host requests\n");
+               /* state is really offline, so make it so */
+               ppd->host_link_state = HLS_DN_OFFLINE;
+               return ret;
+       }
+
+       /*
+        * The state is now offline and the 8051 is ready to accept host
+        * requests.
+        *      - change our state
+        *      - notify others if we were previously in a linkup state
+        */
+       ppd->host_link_state = HLS_DN_OFFLINE;
+       if (previous_state & HLS_UP) {
+               /* went down while link was up */
+               handle_linkup_change(dd, 0);
+       } else if (previous_state
+                       & (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) {
+               /* went down while attempting link up */
+               /* byte 1 of last_*_state is the failure reason */
+               read_last_local_state(dd, &last_local_state);
+               read_last_remote_state(dd, &last_remote_state);
+               dd_dev_err(dd,
+                       "LNI failure last states: local 0x%08x, remote 0x%08x\n",
+                       last_local_state, last_remote_state);
+       }
+
+       /* the active link width (downgrade) is 0 on link down */
+       ppd->link_width_active = 0;
+       ppd->link_width_downgrade_tx_active = 0;
+       ppd->link_width_downgrade_rx_active = 0;
+       ppd->current_egress_rate = 0;
+       return 0;
+}
+
+/* return the link state name */
+static const char *link_state_name(u32 state)
+{
+       const char *name;
+       int n = ilog2(state);
+       static const char * const names[] = {
+               [__HLS_UP_INIT_BP]       = "INIT",
+               [__HLS_UP_ARMED_BP]      = "ARMED",
+               [__HLS_UP_ACTIVE_BP]     = "ACTIVE",
+               [__HLS_DN_DOWNDEF_BP]    = "DOWNDEF",
+               [__HLS_DN_POLL_BP]       = "POLL",
+               [__HLS_DN_DISABLE_BP]    = "DISABLE",
+               [__HLS_DN_OFFLINE_BP]    = "OFFLINE",
+               [__HLS_VERIFY_CAP_BP]    = "VERIFY_CAP",
+               [__HLS_GOING_UP_BP]      = "GOING_UP",
+               [__HLS_GOING_OFFLINE_BP] = "GOING_OFFLINE",
+               [__HLS_LINK_COOLDOWN_BP] = "LINK_COOLDOWN"
+       };
+
+       name = n < ARRAY_SIZE(names) ? names[n] : NULL;
+       return name ? name : "unknown";
+}
+
+/* return the link state reason name */
+static const char *link_state_reason_name(struct hfi1_pportdata *ppd, u32 state)
+{
+       if (state == HLS_UP_INIT) {
+               switch (ppd->linkinit_reason) {
+               case OPA_LINKINIT_REASON_LINKUP:
+                       return "(LINKUP)";
+               case OPA_LINKINIT_REASON_FLAPPING:
+                       return "(FLAPPING)";
+               case OPA_LINKINIT_OUTSIDE_POLICY:
+                       return "(OUTSIDE_POLICY)";
+               case OPA_LINKINIT_QUARANTINED:
+                       return "(QUARANTINED)";
+               case OPA_LINKINIT_INSUFIC_CAPABILITY:
+                       return "(INSUFIC_CAPABILITY)";
+               default:
+                       break;
+               }
+       }
+       return "";
+}
+
+/*
+ * driver_physical_state - convert the driver's notion of a port's
+ * state (an HLS_*) into a physical state (a {IB,OPA}_PORTPHYSSTATE_*).
+ * Return -1 (converted to a u32) to indicate error.
+ */
+u32 driver_physical_state(struct hfi1_pportdata *ppd)
+{
+       switch (ppd->host_link_state) {
+       case HLS_UP_INIT:
+       case HLS_UP_ARMED:
+       case HLS_UP_ACTIVE:
+               return IB_PORTPHYSSTATE_LINKUP;
+       case HLS_DN_POLL:
+               return IB_PORTPHYSSTATE_POLLING;
+       case HLS_DN_DISABLE:
+               return IB_PORTPHYSSTATE_DISABLED;
+       case HLS_DN_OFFLINE:
+               return OPA_PORTPHYSSTATE_OFFLINE;
+       case HLS_VERIFY_CAP:
+               return IB_PORTPHYSSTATE_POLLING;
+       case HLS_GOING_UP:
+               return IB_PORTPHYSSTATE_POLLING;
+       case HLS_GOING_OFFLINE:
+               return OPA_PORTPHYSSTATE_OFFLINE;
+       case HLS_LINK_COOLDOWN:
+               return OPA_PORTPHYSSTATE_OFFLINE;
+       case HLS_DN_DOWNDEF:
+       default:
+               dd_dev_err(ppd->dd, "invalid host_link_state 0x%x\n",
+                          ppd->host_link_state);
+               return  -1;
+       }
+}
+
+/*
+ * driver_logical_state - convert the driver's notion of a port's
+ * state (an HLS_*) into a logical state (a IB_PORT_*). Return -1
+ * (converted to a u32) to indicate error.
+ */
+u32 driver_logical_state(struct hfi1_pportdata *ppd)
+{
+       if (ppd->host_link_state && !(ppd->host_link_state & HLS_UP))
+               return IB_PORT_DOWN;
+
+       switch (ppd->host_link_state & HLS_UP) {
+       case HLS_UP_INIT:
+               return IB_PORT_INIT;
+       case HLS_UP_ARMED:
+               return IB_PORT_ARMED;
+       case HLS_UP_ACTIVE:
+               return IB_PORT_ACTIVE;
+       default:
+               dd_dev_err(ppd->dd, "invalid host_link_state 0x%x\n",
+                          ppd->host_link_state);
+               return -1;
+       }
+}
+
+void set_link_down_reason(struct hfi1_pportdata *ppd, u8 lcl_reason,
+                         u8 neigh_reason, u8 rem_reason)
+{
+       if (ppd->local_link_down_reason.latest == 0 &&
+           ppd->neigh_link_down_reason.latest == 0) {
+               ppd->local_link_down_reason.latest = lcl_reason;
+               ppd->neigh_link_down_reason.latest = neigh_reason;
+               ppd->remote_link_down_reason = rem_reason;
+       }
+}
+
+/*
+ * Change the physical and/or logical link state.
+ *
+ * Do not call this routine while inside an interrupt.  It contains
+ * calls to routines that can take multiple seconds to finish.
+ *
+ * Returns 0 on success, -errno on failure.
+ */
+int set_link_state(struct hfi1_pportdata *ppd, u32 state)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       struct ib_event event = {.device = NULL};
+       int ret1, ret = 0;
+       int was_up, is_down;
+       int orig_new_state, poll_bounce;
+
+       mutex_lock(&ppd->hls_lock);
+
+       orig_new_state = state;
+       if (state == HLS_DN_DOWNDEF)
+               state = dd->link_default;
+
+       /* interpret poll -> poll as a link bounce */
+       poll_bounce = ppd->host_link_state == HLS_DN_POLL
+                               && state == HLS_DN_POLL;
+
+       dd_dev_info(dd, "%s: current %s, new %s %s%s\n", __func__,
+               link_state_name(ppd->host_link_state),
+               link_state_name(orig_new_state),
+               poll_bounce ? "(bounce) " : "",
+               link_state_reason_name(ppd, state));
+
+       was_up = !!(ppd->host_link_state & HLS_UP);
+
+       /*
+        * If we're going to a (HLS_*) link state that implies the logical
+        * link state is neither of (IB_PORT_ARMED, IB_PORT_ACTIVE), then
+        * reset is_sm_config_started to 0.
+        */
+       if (!(state & (HLS_UP_ARMED | HLS_UP_ACTIVE)))
+               ppd->is_sm_config_started = 0;
+
+       /*
+        * Do nothing if the states match.  Let a poll to poll link bounce
+        * go through.
+        */
+       if (ppd->host_link_state == state && !poll_bounce)
+               goto done;
+
+       switch (state) {
+       case HLS_UP_INIT:
+               if (ppd->host_link_state == HLS_DN_POLL && (quick_linkup
+                           || dd->icode == ICODE_FUNCTIONAL_SIMULATOR)) {
+                       /*
+                        * Quick link up jumps from polling to here.
+                        *
+                        * Whether in normal or loopback mode, the
+                        * simulator jumps from polling to link up.
+                        * Accept that here.
+                        */
+                       /* OK */;
+               } else if (ppd->host_link_state != HLS_GOING_UP) {
+                       goto unexpected;
+               }
+
+               ppd->host_link_state = HLS_UP_INIT;
+               ret = wait_logical_linkstate(ppd, IB_PORT_INIT, 1000);
+               if (ret) {
+                       /* logical state didn't change, stay at going_up */
+                       ppd->host_link_state = HLS_GOING_UP;
+                       dd_dev_err(dd,
+                               "%s: logical state did not change to INIT\n",
+                               __func__);
+               } else {
+                       /* clear old transient LINKINIT_REASON code */
+                       if (ppd->linkinit_reason >= OPA_LINKINIT_REASON_CLEAR)
+                               ppd->linkinit_reason =
+                                       OPA_LINKINIT_REASON_LINKUP;
+
+                       /* enable the port */
+                       add_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
+
+                       handle_linkup_change(dd, 1);
+               }
+               break;
+       case HLS_UP_ARMED:
+               if (ppd->host_link_state != HLS_UP_INIT)
+                       goto unexpected;
+
+               ppd->host_link_state = HLS_UP_ARMED;
+               set_logical_state(dd, LSTATE_ARMED);
+               ret = wait_logical_linkstate(ppd, IB_PORT_ARMED, 1000);
+               if (ret) {
+                       /* logical state didn't change, stay at init */
+                       ppd->host_link_state = HLS_UP_INIT;
+                       dd_dev_err(dd,
+                               "%s: logical state did not change to ARMED\n",
+                               __func__);
+               }
+               /*
+                * The simulator does not currently implement SMA messages,
+                * so neighbor_normal is not set.  Set it here when we first
+                * move to Armed.
+                */
+               if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR)
+                       ppd->neighbor_normal = 1;
+               break;
+       case HLS_UP_ACTIVE:
+               if (ppd->host_link_state != HLS_UP_ARMED)
+                       goto unexpected;
+
+               ppd->host_link_state = HLS_UP_ACTIVE;
+               set_logical_state(dd, LSTATE_ACTIVE);
+               ret = wait_logical_linkstate(ppd, IB_PORT_ACTIVE, 1000);
+               if (ret) {
+                       /* logical state didn't change, stay at armed */
+                       ppd->host_link_state = HLS_UP_ARMED;
+                       dd_dev_err(dd,
+                               "%s: logical state did not change to ACTIVE\n",
+                               __func__);
+               } else {
+
+                       /* tell all engines to go running */
+                       sdma_all_running(dd);
+
+                       /* Signal the IB layer that the port has gone active */
+                       event.device = &dd->verbs_dev.ibdev;
+                       event.element.port_num = ppd->port;
+                       event.event = IB_EVENT_PORT_ACTIVE;
+               }
+               break;
+       case HLS_DN_POLL:
+               if ((ppd->host_link_state == HLS_DN_DISABLE ||
+                    ppd->host_link_state == HLS_DN_OFFLINE) &&
+                   dd->dc_shutdown)
+                       dc_start(dd);
+               /* Hand LED control to the DC */
+               write_csr(dd, DCC_CFG_LED_CNTRL, 0);
+
+               if (ppd->host_link_state != HLS_DN_OFFLINE) {
+                       u8 tmp = ppd->link_enabled;
+
+                       ret = goto_offline(ppd, ppd->remote_link_down_reason);
+                       if (ret) {
+                               ppd->link_enabled = tmp;
+                               break;
+                       }
+                       ppd->remote_link_down_reason = 0;
+
+                       if (ppd->driver_link_ready)
+                               ppd->link_enabled = 1;
+               }
+
+               ret = set_local_link_attributes(ppd);
+               if (ret)
+                       break;
+
+               ppd->port_error_action = 0;
+               ppd->host_link_state = HLS_DN_POLL;
+
+               if (quick_linkup) {
+                       /* quick linkup does not go into polling */
+                       ret = do_quick_linkup(dd);
+               } else {
+                       ret1 = set_physical_link_state(dd, PLS_POLLING);
+                       if (ret1 != HCMD_SUCCESS) {
+                               dd_dev_err(dd,
+                                       "Failed to transition to Polling link state, return 0x%x\n",
+                                       ret1);
+                               ret = -EINVAL;
+                       }
+               }
+               ppd->offline_disabled_reason = OPA_LINKDOWN_REASON_NONE;
+               /*
+                * If an error occurred above, go back to offline.  The
+                * caller may reschedule another attempt.
+                */
+               if (ret)
+                       goto_offline(ppd, 0);
+               break;
+       case HLS_DN_DISABLE:
+               /* link is disabled */
+               ppd->link_enabled = 0;
+
+               /* allow any state to transition to disabled */
+
+               /* must transition to offline first */
+               if (ppd->host_link_state != HLS_DN_OFFLINE) {
+                       ret = goto_offline(ppd, ppd->remote_link_down_reason);
+                       if (ret)
+                               break;
+                       ppd->remote_link_down_reason = 0;
+               }
+
+               ret1 = set_physical_link_state(dd, PLS_DISABLED);
+               if (ret1 != HCMD_SUCCESS) {
+                       dd_dev_err(dd,
+                               "Failed to transition to Disabled link state, return 0x%x\n",
+                               ret1);
+                       ret = -EINVAL;
+                       break;
+               }
+               ppd->host_link_state = HLS_DN_DISABLE;
+               dc_shutdown(dd);
+               break;
+       case HLS_DN_OFFLINE:
+               if (ppd->host_link_state == HLS_DN_DISABLE)
+                       dc_start(dd);
+
+               /* allow any state to transition to offline */
+               ret = goto_offline(ppd, ppd->remote_link_down_reason);
+               if (!ret)
+                       ppd->remote_link_down_reason = 0;
+               break;
+       case HLS_VERIFY_CAP:
+               if (ppd->host_link_state != HLS_DN_POLL)
+                       goto unexpected;
+               ppd->host_link_state = HLS_VERIFY_CAP;
+               break;
+       case HLS_GOING_UP:
+               if (ppd->host_link_state != HLS_VERIFY_CAP)
+                       goto unexpected;
+
+               ret1 = set_physical_link_state(dd, PLS_LINKUP);
+               if (ret1 != HCMD_SUCCESS) {
+                       dd_dev_err(dd,
+                               "Failed to transition to link up state, return 0x%x\n",
+                               ret1);
+                       ret = -EINVAL;
+                       break;
+               }
+               ppd->host_link_state = HLS_GOING_UP;
+               break;
+
+       case HLS_GOING_OFFLINE:         /* transient within goto_offline() */
+       case HLS_LINK_COOLDOWN:         /* transient within goto_offline() */
+       default:
+               dd_dev_info(dd, "%s: state 0x%x: not supported\n",
+                       __func__, state);
+               ret = -EINVAL;
+               break;
+       }
+
+       is_down = !!(ppd->host_link_state & (HLS_DN_POLL |
+                       HLS_DN_DISABLE | HLS_DN_OFFLINE));
+
+       if (was_up && is_down && ppd->local_link_down_reason.sma == 0 &&
+           ppd->neigh_link_down_reason.sma == 0) {
+               ppd->local_link_down_reason.sma =
+                 ppd->local_link_down_reason.latest;
+               ppd->neigh_link_down_reason.sma =
+                 ppd->neigh_link_down_reason.latest;
+       }
+
+       goto done;
+
+unexpected:
+       dd_dev_err(dd, "%s: unexpected state transition from %s to %s\n",
+               __func__, link_state_name(ppd->host_link_state),
+               link_state_name(state));
+       ret = -EINVAL;
+
+done:
+       mutex_unlock(&ppd->hls_lock);
+
+       if (event.device)
+               ib_dispatch_event(&event);
+
+       return ret;
+}
+
+int hfi1_set_ib_cfg(struct hfi1_pportdata *ppd, int which, u32 val)
+{
+       u64 reg;
+       int ret = 0;
+
+       switch (which) {
+       case HFI1_IB_CFG_LIDLMC:
+               set_lidlmc(ppd);
+               break;
+       case HFI1_IB_CFG_VL_HIGH_LIMIT:
+               /*
+                * The VL Arbitrator high limit is sent in units of 4k
+                * bytes, while HFI stores it in units of 64 bytes.
+                */
+               val *= 4096/64;
+               reg = ((u64)val & SEND_HIGH_PRIORITY_LIMIT_LIMIT_MASK)
+                       << SEND_HIGH_PRIORITY_LIMIT_LIMIT_SHIFT;
+               write_csr(ppd->dd, SEND_HIGH_PRIORITY_LIMIT, reg);
+               break;
+       case HFI1_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */
+               /* HFI only supports POLL as the default link down state */
+               if (val != HLS_DN_POLL)
+                       ret = -EINVAL;
+               break;
+       case HFI1_IB_CFG_OP_VLS:
+               if (ppd->vls_operational != val) {
+                       ppd->vls_operational = val;
+                       if (!ppd->port)
+                               ret = -EINVAL;
+                       else
+                               ret = sdma_map_init(
+                                       ppd->dd,
+                                       ppd->port - 1,
+                                       val,
+                                       NULL);
+               }
+               break;
+       /*
+        * For link width, link width downgrade, and speed enable, always AND
+        * the setting with what is actually supported.  This has two benefits.
+        * First, enabled can't have unsupported values, no matter what the
+        * SM or FM might want.  Second, the ALL_SUPPORTED wildcards that mean
+        * "fill in with your supported value" have all the bits in the
+        * field set, so simply ANDing with supported has the desired result.
+        */
+       case HFI1_IB_CFG_LWID_ENB: /* set allowed Link-width */
+               ppd->link_width_enabled = val & ppd->link_width_supported;
+               break;
+       case HFI1_IB_CFG_LWID_DG_ENB: /* set allowed link width downgrade */
+               ppd->link_width_downgrade_enabled =
+                               val & ppd->link_width_downgrade_supported;
+               break;
+       case HFI1_IB_CFG_SPD_ENB: /* allowed Link speeds */
+               ppd->link_speed_enabled = val & ppd->link_speed_supported;
+               break;
+       case HFI1_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */
+               /*
+                * HFI does not follow IB specs, save this value
+                * so we can report it, if asked.
+                */
+               ppd->overrun_threshold = val;
+               break;
+       case HFI1_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */
+               /*
+                * HFI does not follow IB specs; save this value
+                * so we can report it if asked.
+                */
+               ppd->phy_error_threshold = val;
+               break;
+
+       case HFI1_IB_CFG_MTU:
+               set_send_length(ppd);
+               break;
+
+       case HFI1_IB_CFG_PKEYS:
+               if (HFI1_CAP_IS_KSET(PKEY_CHECK))
+                       set_partition_keys(ppd);
+               break;
+
+       default:
+               if (HFI1_CAP_IS_KSET(PRINT_UNIMPL))
+                       dd_dev_info(ppd->dd,
+                         "%s: which %s, val 0x%x: not implemented\n",
+                         __func__, ib_cfg_name(which), val);
+               break;
+       }
+       return ret;
+}
+
+/* begin functions related to vl arbitration table caching */
+static void init_vl_arb_caches(struct hfi1_pportdata *ppd)
+{
+       int i;
+
+       BUILD_BUG_ON(VL_ARB_TABLE_SIZE !=
+                       VL_ARB_LOW_PRIO_TABLE_SIZE);
+       BUILD_BUG_ON(VL_ARB_TABLE_SIZE !=
+                       VL_ARB_HIGH_PRIO_TABLE_SIZE);
+
+       /*
+        * Note that we always return values directly from the
+        * 'vl_arb_cache' (and do no CSR reads) in response to a
+        * 'Get(VLArbTable)'. This is obviously correct after a
+        * 'Set(VLArbTable)', since the cache will then be up to
+        * date. But it's also correct prior to any 'Set(VLArbTable)'
+        * since then both the cache and the relevant h/w registers
+        * will be zeroed.
+        */
+
+       for (i = 0; i < MAX_PRIO_TABLE; i++)
+               spin_lock_init(&ppd->vl_arb_cache[i].lock);
+}
+
+/*
+ * vl_arb_lock_cache
+ *
+ * All other vl_arb_* functions should be called only after locking
+ * the cache.
+ */
+static inline struct vl_arb_cache *
+vl_arb_lock_cache(struct hfi1_pportdata *ppd, int idx)
+{
+       if (idx != LO_PRIO_TABLE && idx != HI_PRIO_TABLE)
+               return NULL;
+       spin_lock(&ppd->vl_arb_cache[idx].lock);
+       return &ppd->vl_arb_cache[idx];
+}
+
+static inline void vl_arb_unlock_cache(struct hfi1_pportdata *ppd, int idx)
+{
+       spin_unlock(&ppd->vl_arb_cache[idx].lock);
+}
+
+static void vl_arb_get_cache(struct vl_arb_cache *cache,
+                            struct ib_vl_weight_elem *vl)
+{
+       memcpy(vl, cache->table, VL_ARB_TABLE_SIZE * sizeof(*vl));
+}
+
+static void vl_arb_set_cache(struct vl_arb_cache *cache,
+                            struct ib_vl_weight_elem *vl)
+{
+       memcpy(cache->table, vl, VL_ARB_TABLE_SIZE * sizeof(*vl));
+}
+
+static int vl_arb_match_cache(struct vl_arb_cache *cache,
+                             struct ib_vl_weight_elem *vl)
+{
+       return !memcmp(cache->table, vl, VL_ARB_TABLE_SIZE * sizeof(*vl));
+}
+/* end functions related to vl arbitration table caching */
+
+static int set_vl_weights(struct hfi1_pportdata *ppd, u32 target,
+                         u32 size, struct ib_vl_weight_elem *vl)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u64 reg;
+       unsigned int i, is_up = 0;
+       int drain, ret = 0;
+
+       mutex_lock(&ppd->hls_lock);
+
+       if (ppd->host_link_state & HLS_UP)
+               is_up = 1;
+
+       drain = !is_ax(dd) && is_up;
+
+       if (drain)
+               /*
+                * Before adjusting VL arbitration weights, empty per-VL
+                * FIFOs, otherwise a packet whose VL weight is being
+                * set to 0 could get stuck in a FIFO with no chance to
+                * egress.
+                */
+               ret = stop_drain_data_vls(dd);
+
+       if (ret) {
+               dd_dev_err(
+                       dd,
+                       "%s: cannot stop/drain VLs - refusing to change VL arbitration weights\n",
+                       __func__);
+               goto err;
+       }
+
+       for (i = 0; i < size; i++, vl++) {
+               /*
+                * NOTE: The low priority shift and mask are used here, but
+                * they are the same for both the low and high registers.
+                */
+               reg = (((u64)vl->vl & SEND_LOW_PRIORITY_LIST_VL_MASK)
+                               << SEND_LOW_PRIORITY_LIST_VL_SHIFT)
+                     | (((u64)vl->weight
+                               & SEND_LOW_PRIORITY_LIST_WEIGHT_MASK)
+                               << SEND_LOW_PRIORITY_LIST_WEIGHT_SHIFT);
+               write_csr(dd, target + (i * 8), reg);
+       }
+       pio_send_control(dd, PSC_GLOBAL_VLARB_ENABLE);
+
+       if (drain)
+               open_fill_data_vls(dd); /* reopen all VLs */
+
+err:
+       mutex_unlock(&ppd->hls_lock);
+
+       return ret;
+}
+
+/*
+ * Read one credit merge VL register.
+ */
+static void read_one_cm_vl(struct hfi1_devdata *dd, u32 csr,
+                          struct vl_limit *vll)
+{
+       u64 reg = read_csr(dd, csr);
+
+       vll->dedicated = cpu_to_be16(
+               (reg >> SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT)
+               & SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_MASK);
+       vll->shared = cpu_to_be16(
+               (reg >> SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SHIFT)
+               & SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_MASK);
+}
+
+/*
+ * Read the current credit merge limits.
+ */
+static int get_buffer_control(struct hfi1_devdata *dd,
+                             struct buffer_control *bc, u16 *overall_limit)
+{
+       u64 reg;
+       int i;
+
+       /* not all entries are filled in */
+       memset(bc, 0, sizeof(*bc));
+
+       /* OPA and HFI have a 1-1 mapping */
+       for (i = 0; i < TXE_NUM_DATA_VL; i++)
+               read_one_cm_vl(dd, SEND_CM_CREDIT_VL + (8*i), &bc->vl[i]);
+
+       /* NOTE: assumes that VL* and VL15 CSRs are bit-wise identical */
+       read_one_cm_vl(dd, SEND_CM_CREDIT_VL15, &bc->vl[15]);
+
+       reg = read_csr(dd, SEND_CM_GLOBAL_CREDIT);
+       bc->overall_shared_limit = cpu_to_be16(
+               (reg >> SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT)
+               & SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_MASK);
+       if (overall_limit)
+               *overall_limit = (reg
+                       >> SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT)
+                       & SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_MASK;
+       return sizeof(struct buffer_control);
+}
+
+static int get_sc2vlnt(struct hfi1_devdata *dd, struct sc2vlnt *dp)
+{
+       u64 reg;
+       int i;
+
+       /* each register contains 16 SC->VLnt mappings, 4 bits each */
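+       /* e.g. for i = 0, a byte of 0x31 maps SC0 to VL 1 and SC1 to VL 3 */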
+       reg = read_csr(dd, DCC_CFG_SC_VL_TABLE_15_0);
+       for (i = 0; i < sizeof(u64); i++) {
+               u8 byte = *(((u8 *)&reg) + i);
+
+               dp->vlnt[2 * i] = byte & 0xf;
+               dp->vlnt[(2 * i) + 1] = (byte & 0xf0) >> 4;
+       }
+
+       reg = read_csr(dd, DCC_CFG_SC_VL_TABLE_31_16);
+       for (i = 0; i < sizeof(u64); i++) {
+               u8 byte = *(((u8 *)&reg) + i);
+
+               dp->vlnt[16 + (2 * i)] = byte & 0xf;
+               dp->vlnt[16 + (2 * i) + 1] = (byte & 0xf0) >> 4;
+       }
+       return sizeof(struct sc2vlnt);
+}
+
+static void get_vlarb_preempt(struct hfi1_devdata *dd, u32 nelems,
+                             struct ib_vl_weight_elem *vl)
+{
+       unsigned int i;
+
+       for (i = 0; i < nelems; i++, vl++) {
+               vl->vl = 0xf;
+               vl->weight = 0;
+       }
+}
+
+static void set_sc2vlnt(struct hfi1_devdata *dd, struct sc2vlnt *dp)
+{
+       write_csr(dd, DCC_CFG_SC_VL_TABLE_15_0,
+               DC_SC_VL_VAL(15_0,
+               0, dp->vlnt[0] & 0xf,
+               1, dp->vlnt[1] & 0xf,
+               2, dp->vlnt[2] & 0xf,
+               3, dp->vlnt[3] & 0xf,
+               4, dp->vlnt[4] & 0xf,
+               5, dp->vlnt[5] & 0xf,
+               6, dp->vlnt[6] & 0xf,
+               7, dp->vlnt[7] & 0xf,
+               8, dp->vlnt[8] & 0xf,
+               9, dp->vlnt[9] & 0xf,
+               10, dp->vlnt[10] & 0xf,
+               11, dp->vlnt[11] & 0xf,
+               12, dp->vlnt[12] & 0xf,
+               13, dp->vlnt[13] & 0xf,
+               14, dp->vlnt[14] & 0xf,
+               15, dp->vlnt[15] & 0xf));
+       write_csr(dd, DCC_CFG_SC_VL_TABLE_31_16,
+               DC_SC_VL_VAL(31_16,
+               16, dp->vlnt[16] & 0xf,
+               17, dp->vlnt[17] & 0xf,
+               18, dp->vlnt[18] & 0xf,
+               19, dp->vlnt[19] & 0xf,
+               20, dp->vlnt[20] & 0xf,
+               21, dp->vlnt[21] & 0xf,
+               22, dp->vlnt[22] & 0xf,
+               23, dp->vlnt[23] & 0xf,
+               24, dp->vlnt[24] & 0xf,
+               25, dp->vlnt[25] & 0xf,
+               26, dp->vlnt[26] & 0xf,
+               27, dp->vlnt[27] & 0xf,
+               28, dp->vlnt[28] & 0xf,
+               29, dp->vlnt[29] & 0xf,
+               30, dp->vlnt[30] & 0xf,
+               31, dp->vlnt[31] & 0xf));
+}
+
+static void nonzero_msg(struct hfi1_devdata *dd, int idx, const char *what,
+                       u16 limit)
+{
+       if (limit != 0)
+               dd_dev_info(dd, "Invalid %s limit %d on VL %d, ignoring\n",
+                       what, (int)limit, idx);
+}
+
+/* change only the shared limit portion of SendCmGlobalCredit */
+static void set_global_shared(struct hfi1_devdata *dd, u16 limit)
+{
+       u64 reg;
+
+       reg = read_csr(dd, SEND_CM_GLOBAL_CREDIT);
+       reg &= ~SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SMASK;
+       reg |= (u64)limit << SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT;
+       write_csr(dd, SEND_CM_GLOBAL_CREDIT, reg);
+}
+
+/* change only the total credit limit portion of SendCmGlobalCredit */
+static void set_global_limit(struct hfi1_devdata *dd, u16 limit)
+{
+       u64 reg;
+
+       reg = read_csr(dd, SEND_CM_GLOBAL_CREDIT);
+       reg &= ~SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SMASK;
+       reg |= (u64)limit << SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT;
+       write_csr(dd, SEND_CM_GLOBAL_CREDIT, reg);
+}
+
+/* set the given per-VL shared limit */
+static void set_vl_shared(struct hfi1_devdata *dd, int vl, u16 limit)
+{
+       u64 reg;
+       u32 addr;
+
+       if (vl < TXE_NUM_DATA_VL)
+               addr = SEND_CM_CREDIT_VL + (8 * vl);
+       else
+               addr = SEND_CM_CREDIT_VL15;
+
+       reg = read_csr(dd, addr);
+       reg &= ~SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SMASK;
+       reg |= (u64)limit << SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SHIFT;
+       write_csr(dd, addr, reg);
+}
+
+/* set the given per-VL dedicated limit */
+static void set_vl_dedicated(struct hfi1_devdata *dd, int vl, u16 limit)
+{
+       u64 reg;
+       u32 addr;
+
+       if (vl < TXE_NUM_DATA_VL)
+               addr = SEND_CM_CREDIT_VL + (8 * vl);
+       else
+               addr = SEND_CM_CREDIT_VL15;
+
+       reg = read_csr(dd, addr);
+       reg &= ~SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SMASK;
+       reg |= (u64)limit << SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT;
+       write_csr(dd, addr, reg);
+}
+
+/* spin until the given per-VL status mask bits clear */
+static void wait_for_vl_status_clear(struct hfi1_devdata *dd, u64 mask,
+                                    const char *which)
+{
+       unsigned long timeout;
+       u64 reg;
+
+       timeout = jiffies + msecs_to_jiffies(VL_STATUS_CLEAR_TIMEOUT);
+       while (1) {
+               reg = read_csr(dd, SEND_CM_CREDIT_USED_STATUS) & mask;
+
+               if (reg == 0)
+                       return; /* success */
+               if (time_after(jiffies, timeout))
+                       break;          /* timed out */
+               udelay(1);
+       }
+
+       dd_dev_err(dd,
+               "%s credit change status not clearing after %dms, mask 0x%llx, not clear 0x%llx\n",
+               which, VL_STATUS_CLEAR_TIMEOUT, mask, reg);
+       /*
+        * If this occurs, it is likely there was a credit loss on the link.
+        * The only recovery from that is a link bounce.
+        */
+       dd_dev_err(dd,
+               "Continuing anyway.  A credit loss may occur.  Suggest a link bounce\n");
+}
+
+/*
+ * The number of credits on the VLs may be changed while everything
+ * is "live", but the following algorithm must be followed due to
+ * how the hardware is actually implemented.  In particular,
+ * Return_Credit_Status[] is the only correct status check.
+ *
+ * if (reducing Global_Shared_Credit_Limit or any shared limit changing)
+ *     set Global_Shared_Credit_Limit = 0
+ *     use_all_vl = 1
+ * mask0 = all VLs that are changing either dedicated or shared limits
+ * set Shared_Limit[mask0] = 0
+ * spin until Return_Credit_Status[use_all_vl ? all VL : mask0] == 0
+ * if (changing any dedicated limit)
+ *     mask1 = all VLs that are lowering dedicated limits
+ *     lower Dedicated_Limit[mask1]
+ *     spin until Return_Credit_Status[mask1] == 0
+ *     raise Dedicated_Limits
+ * raise Shared_Limits
+ * raise Global_Shared_Credit_Limit
+ *
+ * lower = if the new limit is lower, set the limit to the new value
+ * raise = if the new limit is higher than the current value (may be changed
+ *     earlier in the algorithm), set the new limit to the new value
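+ *
+ * e.g. a dedicated limit going from 100 to 40 is applied in the "lower"
+ * step, while a change from 40 to 100 is deferred to the "raise" step.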
+ */
+static int set_buffer_control(struct hfi1_devdata *dd,
+                             struct buffer_control *new_bc)
+{
+       u64 changing_mask, ld_mask, stat_mask;
+       int change_count;
+       int i, use_all_mask;
+       int this_shared_changing;
+       /*
+        * A0: the variable any_shared_limit_changing below is an A0-only
+        * addition to the algorithm above.  If A0 support is removed, it
+        * can be removed as well.
+        */
+       int any_shared_limit_changing;
+       struct buffer_control cur_bc;
+       u8 changing[OPA_MAX_VLS];
+       u8 lowering_dedicated[OPA_MAX_VLS];
+       u16 cur_total;
+       u32 new_total = 0;
+       const u64 all_mask =
+       SEND_CM_CREDIT_USED_STATUS_VL0_RETURN_CREDIT_STATUS_SMASK
+        | SEND_CM_CREDIT_USED_STATUS_VL1_RETURN_CREDIT_STATUS_SMASK
+        | SEND_CM_CREDIT_USED_STATUS_VL2_RETURN_CREDIT_STATUS_SMASK
+        | SEND_CM_CREDIT_USED_STATUS_VL3_RETURN_CREDIT_STATUS_SMASK
+        | SEND_CM_CREDIT_USED_STATUS_VL4_RETURN_CREDIT_STATUS_SMASK
+        | SEND_CM_CREDIT_USED_STATUS_VL5_RETURN_CREDIT_STATUS_SMASK
+        | SEND_CM_CREDIT_USED_STATUS_VL6_RETURN_CREDIT_STATUS_SMASK
+        | SEND_CM_CREDIT_USED_STATUS_VL7_RETURN_CREDIT_STATUS_SMASK
+        | SEND_CM_CREDIT_USED_STATUS_VL15_RETURN_CREDIT_STATUS_SMASK;
+
+#define valid_vl(idx) ((idx) < TXE_NUM_DATA_VL || (idx) == 15)
+#define NUM_USABLE_VLS 16      /* look at VL15 and less */
+
+       /* find the new total credits, do sanity check on unused VLs */
+       for (i = 0; i < OPA_MAX_VLS; i++) {
+               if (valid_vl(i)) {
+                       new_total += be16_to_cpu(new_bc->vl[i].dedicated);
+                       continue;
+               }
+               nonzero_msg(dd, i, "dedicated",
+                       be16_to_cpu(new_bc->vl[i].dedicated));
+               nonzero_msg(dd, i, "shared",
+                       be16_to_cpu(new_bc->vl[i].shared));
+               new_bc->vl[i].dedicated = 0;
+               new_bc->vl[i].shared = 0;
+       }
+       new_total += be16_to_cpu(new_bc->overall_shared_limit);
+       if (new_total > (u32)dd->link_credits)
+               return -EINVAL;
+       /* fetch the current values */
+       get_buffer_control(dd, &cur_bc, &cur_total);
+
+       /*
+        * Create the masks we will use.
+        */
+       memset(changing, 0, sizeof(changing));
+       memset(lowering_dedicated, 0, sizeof(lowering_dedicated));
+       /*
+        * NOTE: Assumes that the individual VL bits are adjacent and in
+        * increasing order.
+        */
+       stat_mask =
+               SEND_CM_CREDIT_USED_STATUS_VL0_RETURN_CREDIT_STATUS_SMASK;
+       changing_mask = 0;
+       ld_mask = 0;
+       change_count = 0;
+       any_shared_limit_changing = 0;
+       for (i = 0; i < NUM_USABLE_VLS; i++, stat_mask <<= 1) {
+               if (!valid_vl(i))
+                       continue;
+               this_shared_changing = new_bc->vl[i].shared
+                                               != cur_bc.vl[i].shared;
+               if (this_shared_changing)
+                       any_shared_limit_changing = 1;
+               if (new_bc->vl[i].dedicated != cur_bc.vl[i].dedicated
+                               || this_shared_changing) {
+                       changing[i] = 1;
+                       changing_mask |= stat_mask;
+                       change_count++;
+               }
+               if (be16_to_cpu(new_bc->vl[i].dedicated) <
+                                       be16_to_cpu(cur_bc.vl[i].dedicated)) {
+                       lowering_dedicated[i] = 1;
+                       ld_mask |= stat_mask;
+               }
+       }
+
+       /* bracket the credit change with a total adjustment */
+       if (new_total > cur_total)
+               set_global_limit(dd, new_total);
+
+       /*
+        * Start the credit change algorithm.
+        */
+       use_all_mask = 0;
+       if ((be16_to_cpu(new_bc->overall_shared_limit) <
+                               be16_to_cpu(cur_bc.overall_shared_limit))
+                       || (is_a0(dd) && any_shared_limit_changing)) {
+               set_global_shared(dd, 0);
+               cur_bc.overall_shared_limit = 0;
+               use_all_mask = 1;
+       }
+
+       for (i = 0; i < NUM_USABLE_VLS; i++) {
+               if (!valid_vl(i))
+                       continue;
+
+               if (changing[i]) {
+                       set_vl_shared(dd, i, 0);
+                       cur_bc.vl[i].shared = 0;
+               }
+       }
+
+       wait_for_vl_status_clear(dd, use_all_mask ? all_mask : changing_mask,
+               "shared");
+
+       if (change_count > 0) {
+               for (i = 0; i < NUM_USABLE_VLS; i++) {
+                       if (!valid_vl(i))
+                               continue;
+
+                       if (lowering_dedicated[i]) {
+                               set_vl_dedicated(dd, i,
+                                       be16_to_cpu(new_bc->vl[i].dedicated));
+                               cur_bc.vl[i].dedicated =
+                                               new_bc->vl[i].dedicated;
+                       }
+               }
+
+               wait_for_vl_status_clear(dd, ld_mask, "dedicated");
+
+               /* now raise all dedicated that are going up */
+               for (i = 0; i < NUM_USABLE_VLS; i++) {
+                       if (!valid_vl(i))
+                               continue;
+
+                       if (be16_to_cpu(new_bc->vl[i].dedicated) >
+                                       be16_to_cpu(cur_bc.vl[i].dedicated))
+                               set_vl_dedicated(dd, i,
+                                       be16_to_cpu(new_bc->vl[i].dedicated));
+               }
+       }
+
+       /* next raise all shared that are going up */
+       for (i = 0; i < NUM_USABLE_VLS; i++) {
+               if (!valid_vl(i))
+                       continue;
+
+               if (be16_to_cpu(new_bc->vl[i].shared) >
+                               be16_to_cpu(cur_bc.vl[i].shared))
+                       set_vl_shared(dd, i, be16_to_cpu(new_bc->vl[i].shared));
+       }
+
+       /* finally raise the global shared */
+       if (be16_to_cpu(new_bc->overall_shared_limit) >
+                       be16_to_cpu(cur_bc.overall_shared_limit))
+               set_global_shared(dd,
+                       be16_to_cpu(new_bc->overall_shared_limit));
+
+       /* bracket the credit change with a total adjustment */
+       if (new_total < cur_total)
+               set_global_limit(dd, new_total);
+       return 0;
+}
+
+/*
+ * Read the given fabric manager table. Return the size of the
+ * table (in bytes) on success, and a negative error code on
+ * failure.
+ */
+int fm_get_table(struct hfi1_pportdata *ppd, int which, void *t)
+{
+       int size;
+       struct vl_arb_cache *vlc;
+
+       switch (which) {
+       case FM_TBL_VL_HIGH_ARB:
+               size = 256;
+               /*
+                * OPA specifies 128 elements (of 2 bytes each), though
+                * HFI supports only 16 elements in h/w.
+                */
+               vlc = vl_arb_lock_cache(ppd, HI_PRIO_TABLE);
+               vl_arb_get_cache(vlc, t);
+               vl_arb_unlock_cache(ppd, HI_PRIO_TABLE);
+               break;
+       case FM_TBL_VL_LOW_ARB:
+               size = 256;
+               /*
+                * OPA specifies 128 elements (of 2 bytes each), though
+                * HFI supports only 16 elements in h/w.
+                */
+               vlc = vl_arb_lock_cache(ppd, LO_PRIO_TABLE);
+               vl_arb_get_cache(vlc, t);
+               vl_arb_unlock_cache(ppd, LO_PRIO_TABLE);
+               break;
+       case FM_TBL_BUFFER_CONTROL:
+               size = get_buffer_control(ppd->dd, t, NULL);
+               break;
+       case FM_TBL_SC2VLNT:
+               size = get_sc2vlnt(ppd->dd, t);
+               break;
+       case FM_TBL_VL_PREEMPT_ELEMS:
+               size = 256;
+               /* OPA specifies 128 elements, of 2 bytes each */
+               get_vlarb_preempt(ppd->dd, OPA_MAX_VLS, t);
+               break;
+       case FM_TBL_VL_PREEMPT_MATRIX:
+               size = 256;
+               /*
+                * OPA specifies that this is the same size as the VL
+                * arbitration tables (i.e., 256 bytes).
+                */
+               break;
+       default:
+               return -EINVAL;
+       }
+       return size;
+}
+
+/*
+ * Write the given fabric manager table.
+ */
+int fm_set_table(struct hfi1_pportdata *ppd, int which, void *t)
+{
+       int ret = 0;
+       struct vl_arb_cache *vlc;
+
+       switch (which) {
+       case FM_TBL_VL_HIGH_ARB:
+               vlc = vl_arb_lock_cache(ppd, HI_PRIO_TABLE);
+               if (vl_arb_match_cache(vlc, t)) {
+                       vl_arb_unlock_cache(ppd, HI_PRIO_TABLE);
+                       break;
+               }
+               vl_arb_set_cache(vlc, t);
+               vl_arb_unlock_cache(ppd, HI_PRIO_TABLE);
+               ret = set_vl_weights(ppd, SEND_HIGH_PRIORITY_LIST,
+                                    VL_ARB_HIGH_PRIO_TABLE_SIZE, t);
+               break;
+       case FM_TBL_VL_LOW_ARB:
+               vlc = vl_arb_lock_cache(ppd, LO_PRIO_TABLE);
+               if (vl_arb_match_cache(vlc, t)) {
+                       vl_arb_unlock_cache(ppd, LO_PRIO_TABLE);
+                       break;
+               }
+               vl_arb_set_cache(vlc, t);
+               vl_arb_unlock_cache(ppd, LO_PRIO_TABLE);
+               ret = set_vl_weights(ppd, SEND_LOW_PRIORITY_LIST,
+                                    VL_ARB_LOW_PRIO_TABLE_SIZE, t);
+               break;
+       case FM_TBL_BUFFER_CONTROL:
+               ret = set_buffer_control(ppd->dd, t);
+               break;
+       case FM_TBL_SC2VLNT:
+               set_sc2vlnt(ppd->dd, t);
+               break;
+       default:
+               ret = -EINVAL;
+       }
+       return ret;
+}
+
+/*
+ * Disable all data VLs.
+ *
+ * Return 0 if disabled, non-zero if the VLs cannot be disabled.
+ */
+static int disable_data_vls(struct hfi1_devdata *dd)
+{
+       if (is_a0(dd))
+               return 1;
+
+       pio_send_control(dd, PSC_DATA_VL_DISABLE);
+
+       return 0;
+}
+
+/*
+ * open_fill_data_vls() - the counterpart to stop_drain_data_vls().
+ * Just re-enables all data VLs (the "fill" part happens
+ * automatically - the name was chosen for symmetry with
+ * stop_drain_data_vls()).
+ *
+ * Return 0 if successful, non-zero if the VLs cannot be enabled.
+ */
+int open_fill_data_vls(struct hfi1_devdata *dd)
+{
+       if (is_a0(dd))
+               return 1;
+
+       pio_send_control(dd, PSC_DATA_VL_ENABLE);
+
+       return 0;
+}
+
+/*
+ * drain_data_vls() - assumes that disable_data_vls() has been called;
+ * waits for the occupancy (of per-VL FIFOs) of all contexts and SDMA
+ * engines to drop to 0.
+ */
+static void drain_data_vls(struct hfi1_devdata *dd)
+{
+       sc_wait(dd);
+       sdma_wait(dd);
+       pause_for_credit_return(dd);
+}
+
+/*
+ * stop_drain_data_vls() - disable, then drain all per-VL fifos.
+ *
+ * Use open_fill_data_vls() to resume using data VLs.  This pair is
+ * meant to be used like this:
+ *
+ * stop_drain_data_vls(dd);
+ * // do things with per-VL resources
+ * open_fill_data_vls(dd);
+ */
+int stop_drain_data_vls(struct hfi1_devdata *dd)
+{
+       int ret;
+
+       ret = disable_data_vls(dd);
+       if (ret == 0)
+               drain_data_vls(dd);
+
+       return ret;
+}
+
+/*
+ * Convert a nanosecond time to a cclock count.  No matter how slow
+ * the cclock, a non-zero ns will always have a non-zero result.
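+ *
+ * e.g. with a hypothetical cclock period of 1000 ps, 5 ns converts to
+ * (5 * 1000) / 1000 = 5 cclocks.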
+ */
+u32 ns_to_cclock(struct hfi1_devdata *dd, u32 ns)
+{
+       u32 cclocks;
+
+       if (dd->icode == ICODE_FPGA_EMULATION)
+               cclocks = (ns * 1000) / FPGA_CCLOCK_PS;
+       else  /* simulation pretends to be ASIC */
+               cclocks = (ns * 1000) / ASIC_CCLOCK_PS;
+       if (ns && !cclocks)     /* if ns nonzero, must be at least 1 */
+               cclocks = 1;
+       return cclocks;
+}
+
+/*
+ * Convert a cclock count to nanoseconds.  No matter how slow
+ * the cclock, a non-zero cclock count will always have a non-zero result.
+ */
+u32 cclock_to_ns(struct hfi1_devdata *dd, u32 cclocks)
+{
+       u32 ns;
+
+       if (dd->icode == ICODE_FPGA_EMULATION)
+               ns = (cclocks * FPGA_CCLOCK_PS) / 1000;
+       else  /* simulation pretends to be ASIC */
+               ns = (cclocks * ASIC_CCLOCK_PS) / 1000;
+       if (cclocks && !ns)
+               ns = 1;
+       return ns;
+}
+
+/*
+ * Dynamically adjust the receive interrupt timeout for a context based on
+ * incoming packet rate.
+ *
+ * NOTE: Dynamic adjustment does not allow rcv_intr_count to be zero.
+ */
+static void adjust_rcv_timeout(struct hfi1_ctxtdata *rcd, u32 npkts)
+{
+       struct hfi1_devdata *dd = rcd->dd;
+       u32 timeout = rcd->rcvavail_timeout;
+
+       /*
+        * This algorithm doubles or halves the timeout depending on whether
+        * the number of packets received in this interrupt was less than or
+        * greater than or equal to the interrupt count.
+        *
+        * The calculations below do not allow a steady state to be achieved.
+        * Only at the endpoints is it possible to have an unchanging
+        * timeout.
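+        *
+        * e.g. if rcv_intr_count were 16, then 4 packets in this interrupt
+        * would halve the timeout, while 32 packets would double it (capped
+        * at rcv_intr_timeout_csr).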
+        */
+       if (npkts < rcv_intr_count) {
+               /*
+                * Not enough packets arrived before the timeout, adjust
+                * timeout downward.
+                */
+               if (timeout < 2) /* already at minimum? */
+                       return;
+               timeout >>= 1;
+       } else {
+               /*
+                * More than enough packets arrived before the timeout, adjust
+                * timeout upward.
+                */
+               if (timeout >= dd->rcv_intr_timeout_csr) /* already at max? */
+                       return;
+               timeout = min(timeout << 1, dd->rcv_intr_timeout_csr);
+       }
+
+       rcd->rcvavail_timeout = timeout;
+       /*
+        * timeout cannot be larger than rcv_intr_timeout_csr, which has
+        * already been verified to be in range.
+        */
+       write_kctxt_csr(dd, rcd->ctxt, RCV_AVAIL_TIME_OUT,
+               (u64)timeout << RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT);
+}
+
+void update_usrhead(struct hfi1_ctxtdata *rcd, u32 hd, u32 updegr, u32 egrhd,
+                   u32 intr_adjust, u32 npkts)
+{
+       struct hfi1_devdata *dd = rcd->dd;
+       u64 reg;
+       u32 ctxt = rcd->ctxt;
+
+       /*
+        * Need to write timeout register before updating RcvHdrHead to ensure
+        * that a new value is used when the HW decides to restart counting.
+        */
+       if (intr_adjust)
+               adjust_rcv_timeout(rcd, npkts);
+       if (updegr) {
+               reg = (egrhd & RCV_EGR_INDEX_HEAD_HEAD_MASK)
+                       << RCV_EGR_INDEX_HEAD_HEAD_SHIFT;
+               write_uctxt_csr(dd, ctxt, RCV_EGR_INDEX_HEAD, reg);
+       }
+       mmiowb();
+       reg = ((u64)rcv_intr_count << RCV_HDR_HEAD_COUNTER_SHIFT) |
+               (((u64)hd & RCV_HDR_HEAD_HEAD_MASK)
+                       << RCV_HDR_HEAD_HEAD_SHIFT);
+       write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, reg);
+       mmiowb();
+}
+
+u32 hdrqempty(struct hfi1_ctxtdata *rcd)
+{
+       u32 head, tail;
+
+       head = (read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_HEAD)
+               & RCV_HDR_HEAD_HEAD_SMASK) >> RCV_HDR_HEAD_HEAD_SHIFT;
+
+       if (rcd->rcvhdrtail_kvaddr)
+               tail = get_rcvhdrtail(rcd);
+       else
+               tail = read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_TAIL);
+
+       return head == tail;
+}
+
+/*
+ * Context Control and Receive Array encoding for buffer size:
+ *     0x0 invalid
+ *     0x1   4 KB
+ *     0x2   8 KB
+ *     0x3  16 KB
+ *     0x4  32 KB
+ *     0x5  64 KB
+ *     0x6 128 KB
+ *     0x7 256 KB
+ *     0x8 512 KB (Receive Array only)
+ *     0x9   1 MB (Receive Array only)
+ *     0xa   2 MB (Receive Array only)
+ *
+ *     0xb-0xf - reserved (Receive Array only)
+ *
+ * This routine assumes that the value has already been sanity checked.
+ */
+static u32 encoded_size(u32 size)
+{
+       switch (size) {
+       case   4*1024: return 0x1;
+       case   8*1024: return 0x2;
+       case  16*1024: return 0x3;
+       case  32*1024: return 0x4;
+       case  64*1024: return 0x5;
+       case 128*1024: return 0x6;
+       case 256*1024: return 0x7;
+       case 512*1024: return 0x8;
+       case   1*1024*1024: return 0x9;
+       case   2*1024*1024: return 0xa;
+       }
+       return 0x1;     /* if invalid, go with the minimum size */
+}
+
+void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, int ctxt)
+{
+       struct hfi1_ctxtdata *rcd;
+       u64 rcvctrl, reg;
+       int did_enable = 0;
+
+       rcd = dd->rcd[ctxt];
+       if (!rcd)
+               return;
+
+       hfi1_cdbg(RCVCTRL, "ctxt %d op 0x%x", ctxt, op);
+
+       rcvctrl = read_kctxt_csr(dd, ctxt, RCV_CTXT_CTRL);
+       /* if the context is already enabled, don't do the extra steps */
+       if ((op & HFI1_RCVCTRL_CTXT_ENB)
+                       && !(rcvctrl & RCV_CTXT_CTRL_ENABLE_SMASK)) {
+               /* reset the tail and hdr addresses, and sequence count */
+               write_kctxt_csr(dd, ctxt, RCV_HDR_ADDR,
+                               rcd->rcvhdrq_phys);
+               if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL))
+                       write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR,
+                                       rcd->rcvhdrqtailaddr_phys);
+               rcd->seq_cnt = 1;
+
+               /* reset the cached receive header queue head value */
+               rcd->head = 0;
+
+               /*
+                * Zero the receive header queue so we don't get false
+                * positives when checking the sequence number.  The
+                * sequence numbers could land exactly on the same spot.
+                * e.g. an rcd restart before the receive header queue wrapped.
+                */
+               memset(rcd->rcvhdrq, 0, rcd->rcvhdrq_size);
+
+               /* starting timeout */
+               rcd->rcvavail_timeout = dd->rcv_intr_timeout_csr;
+
+               /* enable the context */
+               rcvctrl |= RCV_CTXT_CTRL_ENABLE_SMASK;
+
+               /* clean the egr buffer size first */
+               rcvctrl &= ~RCV_CTXT_CTRL_EGR_BUF_SIZE_SMASK;
+               rcvctrl |= ((u64)encoded_size(rcd->egrbufs.rcvtid_size)
+                               & RCV_CTXT_CTRL_EGR_BUF_SIZE_MASK)
+                                       << RCV_CTXT_CTRL_EGR_BUF_SIZE_SHIFT;
+
+               /* zero RcvHdrHead - set RcvHdrHead.Counter after enable */
+               write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0);
+               did_enable = 1;
+
+               /* zero RcvEgrIndexHead */
+               write_uctxt_csr(dd, ctxt, RCV_EGR_INDEX_HEAD, 0);
+
+               /* set eager count and base index */
+               reg = (((u64)(rcd->egrbufs.alloced >> RCV_SHIFT)
+                       & RCV_EGR_CTRL_EGR_CNT_MASK)
+                      << RCV_EGR_CTRL_EGR_CNT_SHIFT) |
+                       (((rcd->eager_base >> RCV_SHIFT)
+                         & RCV_EGR_CTRL_EGR_BASE_INDEX_MASK)
+                        << RCV_EGR_CTRL_EGR_BASE_INDEX_SHIFT);
+               write_kctxt_csr(dd, ctxt, RCV_EGR_CTRL, reg);
+
+               /*
+                * Set TID (expected) count and base index.
+                * rcd->expected_count is set to individual RcvArray entries,
+                * not pairs, and the CSR takes a pair-count in groups of
+                * four, so divide by 8.
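+                * e.g. 2048 RcvArray entries are 1024 pairs and are written
+                * to the CSR as 2048 / 8 = 256.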
+                */
+               reg = (((rcd->expected_count >> RCV_SHIFT)
+                                       & RCV_TID_CTRL_TID_PAIR_CNT_MASK)
+                               << RCV_TID_CTRL_TID_PAIR_CNT_SHIFT) |
+                     (((rcd->expected_base >> RCV_SHIFT)
+                                       & RCV_TID_CTRL_TID_BASE_INDEX_MASK)
+                               << RCV_TID_CTRL_TID_BASE_INDEX_SHIFT);
+               write_kctxt_csr(dd, ctxt, RCV_TID_CTRL, reg);
+               if (ctxt == VL15CTXT)
+                       write_csr(dd, RCV_VL15, VL15CTXT);
+       }
+       if (op & HFI1_RCVCTRL_CTXT_DIS) {
+               write_csr(dd, RCV_VL15, 0);
+               rcvctrl &= ~RCV_CTXT_CTRL_ENABLE_SMASK;
+       }
+       if (op & HFI1_RCVCTRL_INTRAVAIL_ENB)
+               rcvctrl |= RCV_CTXT_CTRL_INTR_AVAIL_SMASK;
+       if (op & HFI1_RCVCTRL_INTRAVAIL_DIS)
+               rcvctrl &= ~RCV_CTXT_CTRL_INTR_AVAIL_SMASK;
+       if (op & HFI1_RCVCTRL_TAILUPD_ENB && rcd->rcvhdrqtailaddr_phys)
+               rcvctrl |= RCV_CTXT_CTRL_TAIL_UPD_SMASK;
+       if (op & HFI1_RCVCTRL_TAILUPD_DIS)
+               rcvctrl &= ~RCV_CTXT_CTRL_TAIL_UPD_SMASK;
+       if (op & HFI1_RCVCTRL_TIDFLOW_ENB)
+               rcvctrl |= RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK;
+       if (op & HFI1_RCVCTRL_TIDFLOW_DIS)
+               rcvctrl &= ~RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK;
+       if (op & HFI1_RCVCTRL_ONE_PKT_EGR_ENB) {
+               /*
+                * In one-packet-per-eager mode, the size comes from
+                * the RcvArray entry.
+                */
+               rcvctrl &= ~RCV_CTXT_CTRL_EGR_BUF_SIZE_SMASK;
+               rcvctrl |= RCV_CTXT_CTRL_ONE_PACKET_PER_EGR_BUFFER_SMASK;
+       }
+       if (op & HFI1_RCVCTRL_ONE_PKT_EGR_DIS)
+               rcvctrl &= ~RCV_CTXT_CTRL_ONE_PACKET_PER_EGR_BUFFER_SMASK;
+       if (op & HFI1_RCVCTRL_NO_RHQ_DROP_ENB)
+               rcvctrl |= RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK;
+       if (op & HFI1_RCVCTRL_NO_RHQ_DROP_DIS)
+               rcvctrl &= ~RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK;
+       if (op & HFI1_RCVCTRL_NO_EGR_DROP_ENB)
+               rcvctrl |= RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK;
+       if (op & HFI1_RCVCTRL_NO_EGR_DROP_DIS)
+               rcvctrl &= ~RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK;
+       rcd->rcvctrl = rcvctrl;
+       hfi1_cdbg(RCVCTRL, "ctxt %d rcvctrl 0x%llx\n", ctxt, rcvctrl);
+       write_kctxt_csr(dd, ctxt, RCV_CTXT_CTRL, rcd->rcvctrl);
+
+       /* work around sticky RcvCtxtStatus.BlockedRHQFull */
+       if (did_enable
+           && (rcvctrl & RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK)) {
+               reg = read_kctxt_csr(dd, ctxt, RCV_CTXT_STATUS);
+               if (reg != 0) {
+                       dd_dev_info(dd, "ctxt %d status %lld (blocked)\n",
+                               ctxt, reg);
+                       read_uctxt_csr(dd, ctxt, RCV_HDR_HEAD);
+                       write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0x10);
+                       write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0x00);
+                       read_uctxt_csr(dd, ctxt, RCV_HDR_HEAD);
+                       reg = read_kctxt_csr(dd, ctxt, RCV_CTXT_STATUS);
+                       dd_dev_info(dd, "ctxt %d status %lld (%s blocked)\n",
+                               ctxt, reg, reg == 0 ? "not" : "still");
+               }
+       }
+
+       if (did_enable) {
+               /*
+                * The interrupt timeout and count must be set after
+                * the context is enabled to take effect.
+                */
+               /* set interrupt timeout */
+               write_kctxt_csr(dd, ctxt, RCV_AVAIL_TIME_OUT,
+                       (u64)rcd->rcvavail_timeout <<
+                               RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT);
+
+               /* set RcvHdrHead.Counter, zero RcvHdrHead.Head (again) */
+               reg = (u64)rcv_intr_count << RCV_HDR_HEAD_COUNTER_SHIFT;
+               write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, reg);
+       }
+
+       if (op & (HFI1_RCVCTRL_TAILUPD_DIS | HFI1_RCVCTRL_CTXT_DIS))
+               /*
+                * If the context has been disabled and the Tail Update has
+                * been cleared, clear the RCV_HDR_TAIL_ADDR CSR so
+                * it doesn't contain an address that is invalid.
+                */
+               write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR, 0);
+}
+
+u32 hfi1_read_cntrs(struct hfi1_devdata *dd, loff_t pos, char **namep,
+                   u64 **cntrp)
+{
+       int ret;
+       u64 val = 0;
+
+       if (namep) {
+               ret = dd->cntrnameslen;
+               if (pos != 0) {
+                       dd_dev_err(dd, "read_cntrs does not support indexing");
+                       return 0;
+               }
+               *namep = dd->cntrnames;
+       } else {
+               const struct cntr_entry *entry;
+               int i, j;
+
+               ret = (dd->ndevcntrs) * sizeof(u64);
+               if (pos != 0) {
+                       dd_dev_err(dd, "read_cntrs does not support indexing");
+                       return 0;
+               }
+
+               /* Get the start of the block of counters */
+               *cntrp = dd->cntrs;
+
+               /*
+                * Now go and fill in each counter in the block.
+                */
+               for (i = 0; i < DEV_CNTR_LAST; i++) {
+                       entry = &dev_cntrs[i];
+                       hfi1_cdbg(CNTR, "reading %s", entry->name);
+                       if (entry->flags & CNTR_DISABLED) {
+                               /* Nothing */
+                               hfi1_cdbg(CNTR, "\tDisabled\n");
+                       } else {
+                               if (entry->flags & CNTR_VL) {
+                                       hfi1_cdbg(CNTR, "\tPer VL\n");
+                                       for (j = 0; j < C_VL_COUNT; j++) {
+                                               val = entry->rw_cntr(entry,
+                                                                 dd, j,
+                                                                 CNTR_MODE_R,
+                                                                 0);
+                                               hfi1_cdbg(
+                                                  CNTR,
+                                                  "\t\tRead 0x%llx for %d\n",
+                                                  val, j);
+                                               dd->cntrs[entry->offset + j] =
+                                                                           val;
+                                       }
+                               } else {
+                                       val = entry->rw_cntr(entry, dd,
+                                                       CNTR_INVALID_VL,
+                                                       CNTR_MODE_R, 0);
+                                       dd->cntrs[entry->offset] = val;
+                                       hfi1_cdbg(CNTR, "\tRead 0x%llx", val);
+                               }
+                       }
+               }
+       }
+       return ret;
+}
+
+/*
+ * Used by sysfs to create files for hfi stats to read
+ */
+u32 hfi1_read_portcntrs(struct hfi1_devdata *dd, loff_t pos, u32 port,
+                       char **namep, u64 **cntrp)
+{
+       int ret;
+       u64 val = 0;
+
+       if (namep) {
+               ret = dd->portcntrnameslen;
+               if (pos != 0) {
+                       dd_dev_err(dd, "index not supported");
+                       return 0;
+               }
+               *namep = dd->portcntrnames;
+       } else {
+               const struct cntr_entry *entry;
+               struct hfi1_pportdata *ppd;
+               int i, j;
+
+               ret = (dd->nportcntrs) * sizeof(u64);
+               if (pos != 0) {
+                       dd_dev_err(dd, "indexing not supported");
+                       return 0;
+               }
+               ppd = (struct hfi1_pportdata *)(dd + 1 + port);
+               *cntrp = ppd->cntrs;
+
+               for (i = 0; i < PORT_CNTR_LAST; i++) {
+                       entry = &port_cntrs[i];
+                       hfi1_cdbg(CNTR, "reading %s", entry->name);
+                       if (entry->flags & CNTR_DISABLED) {
+                               /* Nothing */
+                               hfi1_cdbg(CNTR, "\tDisabled\n");
+                               continue;
+                       }
+
+                       if (entry->flags & CNTR_VL) {
+                               hfi1_cdbg(CNTR, "\tPer VL");
+                               for (j = 0; j < C_VL_COUNT; j++) {
+                                       val = entry->rw_cntr(entry, ppd, j,
+                                                              CNTR_MODE_R,
+                                                              0);
+                                       hfi1_cdbg(
+                                          CNTR,
+                                          "\t\tRead 0x%llx for %d",
+                                          val, j);
+                                       ppd->cntrs[entry->offset + j] = val;
+                               }
+                       } else {
+                               val = entry->rw_cntr(entry, ppd,
+                                                      CNTR_INVALID_VL,
+                                                      CNTR_MODE_R,
+                                                      0);
+                               ppd->cntrs[entry->offset] = val;
+                               hfi1_cdbg(CNTR, "\tRead 0x%llx", val);
+                       }
+               }
+       }
+       return ret;
+}
+
+static void free_cntrs(struct hfi1_devdata *dd)
+{
+       struct hfi1_pportdata *ppd;
+       int i;
+
+       if (dd->synth_stats_timer.data)
+               del_timer_sync(&dd->synth_stats_timer);
+       dd->synth_stats_timer.data = 0;
+       ppd = (struct hfi1_pportdata *)(dd + 1);
+       for (i = 0; i < dd->num_pports; i++, ppd++) {
+               kfree(ppd->cntrs);
+               kfree(ppd->scntrs);
+               free_percpu(ppd->ibport_data.rc_acks);
+               free_percpu(ppd->ibport_data.rc_qacks);
+               free_percpu(ppd->ibport_data.rc_delayed_comp);
+               ppd->cntrs = NULL;
+               ppd->scntrs = NULL;
+               ppd->ibport_data.rc_acks = NULL;
+               ppd->ibport_data.rc_qacks = NULL;
+               ppd->ibport_data.rc_delayed_comp = NULL;
+       }
+       kfree(dd->portcntrnames);
+       dd->portcntrnames = NULL;
+       kfree(dd->cntrs);
+       dd->cntrs = NULL;
+       kfree(dd->scntrs);
+       dd->scntrs = NULL;
+       kfree(dd->cntrnames);
+       dd->cntrnames = NULL;
+}
+
+#define CNTR_MAX 0xFFFFFFFFFFFFFFFFULL
+#define CNTR_32BIT_MAX 0x00000000FFFFFFFF
+
+static u64 read_dev_port_cntr(struct hfi1_devdata *dd, struct cntr_entry *entry,
+                             u64 *psval, void *context, int vl)
+{
+       u64 val;
+       u64 sval = *psval;
+
+       if (entry->flags & CNTR_DISABLED) {
+               dd_dev_err(dd, "Counter %s not enabled", entry->name);
+               return 0;
+       }
+
+       hfi1_cdbg(CNTR, "cntr: %s vl %d psval 0x%llx", entry->name, vl, *psval);
+
+       val = entry->rw_cntr(entry, context, vl, CNTR_MODE_R, 0);
+
+       /* If it's a synthetic counter, there is more work we need to do */
+       if (entry->flags & CNTR_SYNTH) {
+               if (sval == CNTR_MAX) {
+                       /* No need to read already saturated */
+                       return CNTR_MAX;
+               }
+
+               if (entry->flags & CNTR_32BIT) {
+                       /* 32bit counters can wrap multiple times */
+                       u64 upper = sval >> 32;
+                       u64 lower = (sval << 32) >> 32;
+
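+                       /*
+                        * e.g. sval = 0x100000010 (upper 1, lower 0x10) and a
+                        * hw read of 0x5 means the hw wrapped again, so the
+                        * result becomes 0x200000005.
+                        */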
+                       if (lower > val) { /* hw wrapped */
+                               if (upper == CNTR_32BIT_MAX)
+                                       val = CNTR_MAX;
+                               else
+                                       upper++;
+                       }
+
+                       if (val != CNTR_MAX)
+                               val = (upper << 32) | val;
+
+               } else {
+                       /* If we rolled we are saturated */
+                       if ((val < sval) || (val > CNTR_MAX))
+                               val = CNTR_MAX;
+               }
+       }
+
+       *psval = val;
+
+       hfi1_cdbg(CNTR, "\tNew val=0x%llx", val);
+
+       return val;
+}
+
+static u64 write_dev_port_cntr(struct hfi1_devdata *dd,
+                              struct cntr_entry *entry,
+                              u64 *psval, void *context, int vl, u64 data)
+{
+       u64 val;
+
+       if (entry->flags & CNTR_DISABLED) {
+               dd_dev_err(dd, "Counter %s not enabled", entry->name);
+               return 0;
+       }
+
+       hfi1_cdbg(CNTR, "cntr: %s vl %d psval 0x%llx", entry->name, vl, *psval);
+
+       if (entry->flags & CNTR_SYNTH) {
+               *psval = data;
+               if (entry->flags & CNTR_32BIT) {
+                       val = entry->rw_cntr(entry, context, vl, CNTR_MODE_W,
+                                            (data << 32) >> 32);
+                       val = data; /* return the full 64bit value */
+               } else {
+                       val = entry->rw_cntr(entry, context, vl, CNTR_MODE_W,
+                                            data);
+               }
+       } else {
+               val = entry->rw_cntr(entry, context, vl, CNTR_MODE_W, data);
+       }
+
+       *psval = val;
+
+       hfi1_cdbg(CNTR, "\tNew val=0x%llx", val);
+
+       return val;
+}
+
+u64 read_dev_cntr(struct hfi1_devdata *dd, int index, int vl)
+{
+       struct cntr_entry *entry;
+       u64 *sval;
+
+       entry = &dev_cntrs[index];
+       sval = dd->scntrs + entry->offset;
+
+       if (vl != CNTR_INVALID_VL)
+               sval += vl;
+
+       return read_dev_port_cntr(dd, entry, sval, dd, vl);
+}
+
+u64 write_dev_cntr(struct hfi1_devdata *dd, int index, int vl, u64 data)
+{
+       struct cntr_entry *entry;
+       u64 *sval;
+
+       entry = &dev_cntrs[index];
+       sval = dd->scntrs + entry->offset;
+
+       if (vl != CNTR_INVALID_VL)
+               sval += vl;
+
+       return write_dev_port_cntr(dd, entry, sval, dd, vl, data);
+}
+
+u64 read_port_cntr(struct hfi1_pportdata *ppd, int index, int vl)
+{
+       struct cntr_entry *entry;
+       u64 *sval;
+
+       entry = &port_cntrs[index];
+       sval = ppd->scntrs + entry->offset;
+
+       if (vl != CNTR_INVALID_VL)
+               sval += vl;
+
+       if ((index >= C_RCV_HDR_OVF_FIRST + ppd->dd->num_rcv_contexts) &&
+           (index <= C_RCV_HDR_OVF_LAST)) {
+               /* We do not want to bother for disabled contexts */
+               return 0;
+       }
+
+       return read_dev_port_cntr(ppd->dd, entry, sval, ppd, vl);
+}
+
+u64 write_port_cntr(struct hfi1_pportdata *ppd, int index, int vl, u64 data)
+{
+       struct cntr_entry *entry;
+       u64 *sval;
+
+       entry = &port_cntrs[index];
+       sval = ppd->scntrs + entry->offset;
+
+       if (vl != CNTR_INVALID_VL)
+               sval += vl;
+
+       if ((index >= C_RCV_HDR_OVF_FIRST + ppd->dd->num_rcv_contexts) &&
+           (index <= C_RCV_HDR_OVF_LAST)) {
+               /* We do not want to bother for disabled contexts */
+               return 0;
+       }
+
+       return write_dev_port_cntr(ppd->dd, entry, sval, ppd, vl, data);
+}
+
+static void update_synth_timer(unsigned long opaque)
+{
+       u64 cur_tx;
+       u64 cur_rx;
+       u64 total_flits;
+       u8 update = 0;
+       int i, j, vl;
+       struct hfi1_pportdata *ppd;
+       struct cntr_entry *entry;
+
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)opaque;
+
+       /*
+        * Rather than keep beating on the CSRs, pick a minimal set that we can
+        * check to watch for potential rollover.  We do this by looking at
+        * the number of flits sent/received.  If the total flits exceed 32 bits
+        * then we have to iterate over all the counters and update them.
+        */
+       entry = &dev_cntrs[C_DC_RCV_FLITS];
+       cur_rx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL, CNTR_MODE_R, 0);
+
+       entry = &dev_cntrs[C_DC_XMIT_FLITS];
+       cur_tx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL, CNTR_MODE_R, 0);
+
+       hfi1_cdbg(
+           CNTR,
+           "[%d] curr tx=0x%llx rx=0x%llx :: last tx=0x%llx rx=0x%llx\n",
+           dd->unit, cur_tx, cur_rx, dd->last_tx, dd->last_rx);
+
+       if ((cur_tx < dd->last_tx) || (cur_rx < dd->last_rx)) {
+               /*
+                * It may not be strictly necessary to update, but it won't hurt
+                * and it simplifies the logic here.
+                */
+               update = 1;
+               hfi1_cdbg(CNTR, "[%d] Tripwire counter rolled, updating",
+                         dd->unit);
+       } else {
+               total_flits = (cur_tx - dd->last_tx) + (cur_rx - dd->last_rx);
+               hfi1_cdbg(CNTR,
+                         "[%d] total flits 0x%llx limit 0x%llx\n", dd->unit,
+                         total_flits, (u64)CNTR_32BIT_MAX);
+               if (total_flits >= CNTR_32BIT_MAX) {
+                       hfi1_cdbg(CNTR, "[%d] 32bit limit hit, updating",
+                                 dd->unit);
+                       update = 1;
+               }
+       }
+
+       if (update) {
+               hfi1_cdbg(CNTR, "[%d] Updating dd and ppd counters", dd->unit);
+               for (i = 0; i < DEV_CNTR_LAST; i++) {
+                       entry = &dev_cntrs[i];
+                       if (entry->flags & CNTR_VL) {
+                               for (vl = 0; vl < C_VL_COUNT; vl++)
+                                       read_dev_cntr(dd, i, vl);
+                       } else {
+                               read_dev_cntr(dd, i, CNTR_INVALID_VL);
+                       }
+               }
+               ppd = (struct hfi1_pportdata *)(dd + 1);
+               for (i = 0; i < dd->num_pports; i++, ppd++) {
+                       for (j = 0; j < PORT_CNTR_LAST; j++) {
+                               entry = &port_cntrs[j];
+                               if (entry->flags & CNTR_VL) {
+                                       for (vl = 0; vl < C_VL_COUNT; vl++)
+                                               read_port_cntr(ppd, j, vl);
+                               } else {
+                                       read_port_cntr(ppd, j, CNTR_INVALID_VL);
+                               }
+                       }
+               }
+
+               /*
+                * We want the value in the register.  The goal is to keep track
+                * of the number of "ticks", not the counter value.  In other
+                * words, if the register rolls we want to notice it and go
+                * ahead and force an update.
+                */
+               entry = &dev_cntrs[C_DC_XMIT_FLITS];
+               dd->last_tx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL,
+                                               CNTR_MODE_R, 0);
+
+               entry = &dev_cntrs[C_DC_RCV_FLITS];
+               dd->last_rx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL,
+                                               CNTR_MODE_R, 0);
+
+               hfi1_cdbg(CNTR, "[%d] setting last tx/rx to 0x%llx 0x%llx",
+                         dd->unit, dd->last_tx, dd->last_rx);
+
+       } else {
+               hfi1_cdbg(CNTR, "[%d] No update necessary", dd->unit);
+       }
+
+       mod_timer(&dd->synth_stats_timer, jiffies + HZ * SYNTH_CNT_TIME);
+}
+
+#define C_MAX_NAME 13 /* 12 chars + one for \0 */
+static int init_cntrs(struct hfi1_devdata *dd)
+{
+       int i, rcv_ctxts, index, j;
+       size_t sz;
+       char *p;
+       char name[C_MAX_NAME];
+       struct hfi1_pportdata *ppd;
+
+       /* set up the stats timer; the add_timer is done at the end */
+       init_timer(&dd->synth_stats_timer);
+       dd->synth_stats_timer.function = update_synth_timer;
+       dd->synth_stats_timer.data = (unsigned long) dd;
+
+       /***********************/
+       /* per device counters */
+       /***********************/
+
+       /* size names and determine how many we have */
+       dd->ndevcntrs = 0;
+       sz = 0;
+       index = 0;
+
+       for (i = 0; i < DEV_CNTR_LAST; i++) {
+               hfi1_dbg_early("Init cntr %s\n", dev_cntrs[i].name);
+               if (dev_cntrs[i].flags & CNTR_DISABLED) {
+                       hfi1_dbg_early("\tSkipping %s\n", dev_cntrs[i].name);
+                       continue;
+               }
+
+               if (dev_cntrs[i].flags & CNTR_VL) {
+                       hfi1_dbg_early("\tProcessing VL cntr\n");
+                       dev_cntrs[i].offset = index;
+                       for (j = 0; j < C_VL_COUNT; j++) {
+                               memset(name, '\0', C_MAX_NAME);
+                               snprintf(name, C_MAX_NAME, "%s%d",
+                                       dev_cntrs[i].name,
+                                       vl_from_idx(j));
+                               sz += strlen(name);
+                               sz++;
+                               hfi1_dbg_early("\t\t%s\n", name);
+                               dd->ndevcntrs++;
+                               index++;
+                       }
+               } else {
+                       /* +1 for newline  */
+                       sz += strlen(dev_cntrs[i].name) + 1;
+                       dd->ndevcntrs++;
+                       dev_cntrs[i].offset = index;
+                       index++;
+                       hfi1_dbg_early("\tAdding %s\n", dev_cntrs[i].name);
+               }
+       }
+
+       /* allocate space for the counter values */
+       dd->cntrs = kcalloc(index, sizeof(u64), GFP_KERNEL);
+       if (!dd->cntrs)
+               goto bail;
+
+       dd->scntrs = kcalloc(index, sizeof(u64), GFP_KERNEL);
+       if (!dd->scntrs)
+               goto bail;
+
+       /* allocate space for the counter names */
+       dd->cntrnameslen = sz;
+       dd->cntrnames = kmalloc(sz, GFP_KERNEL);
+       if (!dd->cntrnames)
+               goto bail;
+
+       /* fill in the names */
+       for (p = dd->cntrnames, i = 0, index = 0; i < DEV_CNTR_LAST; i++) {
+               if (dev_cntrs[i].flags & CNTR_DISABLED) {
+                       /* Nothing */
+               } else {
+                       if (dev_cntrs[i].flags & CNTR_VL) {
+                               for (j = 0; j < C_VL_COUNT; j++) {
+                                       memset(name, '\0', C_MAX_NAME);
+                                       snprintf(name, C_MAX_NAME, "%s%d",
+                                               dev_cntrs[i].name,
+                                               vl_from_idx(j));
+                                       memcpy(p, name, strlen(name));
+                                       p += strlen(name);
+                                       *p++ = '\n';
+                               }
+                       } else {
+                               memcpy(p, dev_cntrs[i].name,
+                                      strlen(dev_cntrs[i].name));
+                               p += strlen(dev_cntrs[i].name);
+                               *p++ = '\n';
+                       }
+                       index++;
+               }
+       }
+
+       /*********************/
+       /* per port counters */
+       /*********************/
+
+       /*
+        * Go through the counters for the overflows and disable the ones we
+        * don't need. This varies based on platform so we need to do it
+        * dynamically here.
+        */
+       rcv_ctxts = dd->num_rcv_contexts;
+       for (i = C_RCV_HDR_OVF_FIRST + rcv_ctxts;
+            i <= C_RCV_HDR_OVF_LAST; i++) {
+               port_cntrs[i].flags |= CNTR_DISABLED;
+       }
+
+       /* size port counter names and determine how many we have */
+       sz = 0;
+       dd->nportcntrs = 0;
+       for (i = 0; i < PORT_CNTR_LAST; i++) {
+               hfi1_dbg_early("Init pcntr %s\n", port_cntrs[i].name);
+               if (port_cntrs[i].flags & CNTR_DISABLED) {
+                       hfi1_dbg_early("\tSkipping %s\n", port_cntrs[i].name);
+                       continue;
+               }
+
+               if (port_cntrs[i].flags & CNTR_VL) {
+                       hfi1_dbg_early("\tProcessing VL cntr\n");
+                       port_cntrs[i].offset = dd->nportcntrs;
+                       for (j = 0; j < C_VL_COUNT; j++) {
+                               memset(name, '\0', C_MAX_NAME);
+                               snprintf(name, C_MAX_NAME, "%s%d",
+                                       port_cntrs[i].name,
+                                       vl_from_idx(j));
+                               sz += strlen(name);
+                               sz++;
+                               hfi1_dbg_early("\t\t%s\n", name);
+                               dd->nportcntrs++;
+                       }
+               } else {
+                       /* +1 for newline  */
+                       sz += strlen(port_cntrs[i].name) + 1;
+                       port_cntrs[i].offset = dd->nportcntrs;
+                       dd->nportcntrs++;
+                       hfi1_dbg_early("\tAdding %s\n", port_cntrs[i].name);
+               }
+       }
+
+       /* allocate space for the counter names */
+       dd->portcntrnameslen = sz;
+       dd->portcntrnames = kmalloc(sz, GFP_KERNEL);
+       if (!dd->portcntrnames)
+               goto bail;
+
+       /* fill in port cntr names */
+       for (p = dd->portcntrnames, i = 0; i < PORT_CNTR_LAST; i++) {
+               if (port_cntrs[i].flags & CNTR_DISABLED)
+                       continue;
+
+               if (port_cntrs[i].flags & CNTR_VL) {
+                       for (j = 0; j < C_VL_COUNT; j++) {
+                               memset(name, '\0', C_MAX_NAME);
+                               snprintf(name, C_MAX_NAME, "%s%d",
+                                       port_cntrs[i].name,
+                                       vl_from_idx(j));
+                               memcpy(p, name, strlen(name));
+                               p += strlen(name);
+                               *p++ = '\n';
+                       }
+               } else {
+                       memcpy(p, port_cntrs[i].name,
+                              strlen(port_cntrs[i].name));
+                       p += strlen(port_cntrs[i].name);
+                       *p++ = '\n';
+               }
+       }
+
+       /* allocate per port storage for counter values */
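+       /* pportdata structures immediately follow the devdata allocation */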
+       ppd = (struct hfi1_pportdata *)(dd + 1);
+       for (i = 0; i < dd->num_pports; i++, ppd++) {
+               ppd->cntrs = kcalloc(dd->nportcntrs, sizeof(u64), GFP_KERNEL);
+               if (!ppd->cntrs)
+                       goto bail;
+
+               ppd->scntrs = kcalloc(dd->nportcntrs, sizeof(u64), GFP_KERNEL);
+               if (!ppd->scntrs)
+                       goto bail;
+       }
+
+       /* CPU counters need to be allocated and zeroed */
+       if (init_cpu_counters(dd))
+               goto bail;
+
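+       /* start the synthetic stats timer */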
+       mod_timer(&dd->synth_stats_timer, jiffies + HZ * SYNTH_CNT_TIME);
+       return 0;
+bail:
+       free_cntrs(dd);
+       return -ENOMEM;
+}
+
+
+static u32 chip_to_opa_lstate(struct hfi1_devdata *dd, u32 chip_lstate)
+{
+       switch (chip_lstate) {
+       default:
+               dd_dev_err(dd,
+                        "Unknown logical state 0x%x, reporting IB_PORT_DOWN\n",
+                        chip_lstate);
+               /* fall through */
+       case LSTATE_DOWN:
+               return IB_PORT_DOWN;
+       case LSTATE_INIT:
+               return IB_PORT_INIT;
+       case LSTATE_ARMED:
+               return IB_PORT_ARMED;
+       case LSTATE_ACTIVE:
+               return IB_PORT_ACTIVE;
+       }
+}
+
+u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate)
+{
+       /* look at the HFI meta-states only */
+       switch (chip_pstate & 0xf0) {
+       default:
+               dd_dev_err(dd, "Unexpected chip physical state of 0x%x\n",
+                       chip_pstate);
+               /* fall through */
+       case PLS_DISABLED:
+               return IB_PORTPHYSSTATE_DISABLED;
+       case PLS_OFFLINE:
+               return OPA_PORTPHYSSTATE_OFFLINE;
+       case PLS_POLLING:
+               return IB_PORTPHYSSTATE_POLLING;
+       case PLS_CONFIGPHY:
+               return IB_PORTPHYSSTATE_TRAINING;
+       case PLS_LINKUP:
+               return IB_PORTPHYSSTATE_LINKUP;
+       case PLS_PHYTEST:
+               return IB_PORTPHYSSTATE_PHY_TEST;
+       }
+}
+
+/* return the OPA port logical state name */
+const char *opa_lstate_name(u32 lstate)
+{
+       static const char * const port_logical_names[] = {
+               "PORT_NOP",
+               "PORT_DOWN",
+               "PORT_INIT",
+               "PORT_ARMED",
+               "PORT_ACTIVE",
+               "PORT_ACTIVE_DEFER",
+       };
+       if (lstate < ARRAY_SIZE(port_logical_names))
+               return port_logical_names[lstate];
+       return "unknown";
+}
+
+/* return the OPA port physical state name */
+const char *opa_pstate_name(u32 pstate)
+{
+       static const char * const port_physical_names[] = {
+               "PHYS_NOP",
+               "reserved1",
+               "PHYS_POLL",
+               "PHYS_DISABLED",
+               "PHYS_TRAINING",
+               "PHYS_LINKUP",
+               "PHYS_LINK_ERR_RECOVER",
+               "PHYS_PHY_TEST",
+               "reserved8",
+               "PHYS_OFFLINE",
+               "PHYS_GANGED",
+               "PHYS_TEST",
+       };
+       if (pstate < ARRAY_SIZE(port_physical_names))
+               return port_physical_names[pstate];
+       return "unknown";
+}
+
+/*
+ * Read the hardware link state and set the driver's cached value of it.
+ * Return the (new) current value.
+ */
+u32 get_logical_state(struct hfi1_pportdata *ppd)
+{
+       u32 new_state;
+
+       new_state = chip_to_opa_lstate(ppd->dd, read_logical_state(ppd->dd));
+       if (new_state != ppd->lstate) {
+               dd_dev_info(ppd->dd, "logical state changed to %s (0x%x)\n",
+                       opa_lstate_name(new_state), new_state);
+               ppd->lstate = new_state;
+       }
+       /*
+        * Set port status flags in the page mapped into userspace
+        * memory. Do it here to ensure a reliable state - this is
+        * the only function called by all state handling code.
+        * Always set the flags due to the fact that the cache value
+        * might have been changed explicitly outside of this
+        * function.
+        */
+       if (ppd->statusp) {
+               switch (ppd->lstate) {
+               case IB_PORT_DOWN:
+               case IB_PORT_INIT:
+                       *ppd->statusp &= ~(HFI1_STATUS_IB_CONF |
+                                          HFI1_STATUS_IB_READY);
+                       break;
+               case IB_PORT_ARMED:
+                       *ppd->statusp |= HFI1_STATUS_IB_CONF;
+                       break;
+               case IB_PORT_ACTIVE:
+                       *ppd->statusp |= HFI1_STATUS_IB_READY;
+                       break;
+               }
+       }
+       return ppd->lstate;
+}
+
+/**
+ * wait_logical_linkstate - wait for an IB link state change to occur
+ * @ppd: port device
+ * @state: the state to wait for
+ * @msecs: the number of milliseconds to wait
+ *
+ * Wait up to msecs milliseconds for IB link state change to occur.
+ * For now, take the easy polling route.
+ * Returns 0 if state reached, otherwise -ETIMEDOUT.
+ */
+static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state,
+                                 int msecs)
+{
+       unsigned long timeout;
+
+       timeout = jiffies + msecs_to_jiffies(msecs);
+       while (1) {
+               if (get_logical_state(ppd) == state)
+                       return 0;
+               if (time_after(jiffies, timeout))
+                       break;
+               msleep(20);
+       }
+       dd_dev_err(ppd->dd, "timeout waiting for link state 0x%x\n", state);
+
+       return -ETIMEDOUT;
+}
+
+u8 hfi1_ibphys_portstate(struct hfi1_pportdata *ppd)
+{
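+       /* track the last reported state to avoid repeating the message */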
+       static u32 remembered_state = 0xff;
+       u32 pstate;
+       u32 ib_pstate;
+
+       pstate = read_physical_state(ppd->dd);
+       ib_pstate = chip_to_opa_pstate(ppd->dd, pstate);
+       if (remembered_state != ib_pstate) {
+               dd_dev_info(ppd->dd,
+                       "%s: physical state changed to %s (0x%x), phy 0x%x\n",
+                       __func__, opa_pstate_name(ib_pstate), ib_pstate,
+                       pstate);
+               remembered_state = ib_pstate;
+       }
+       return ib_pstate;
+}
+
+/*
+ * Read/modify/write ASIC_QSFP register bits as selected by mask
+ * data: 0 or 1 in the positions depending on what needs to be written
+ * dir: 0 for read, 1 for write
+ * mask: select by setting
+ *      I2CCLK  (bit 0)
+ *      I2CDATA (bit 1)
+ */
+u64 hfi1_gpio_mod(struct hfi1_devdata *dd, u32 target, u32 data, u32 dir,
+                 u32 mask)
+{
+       u64 qsfp_oe, target_oe;
+
+       target_oe = target ? ASIC_QSFP2_OE : ASIC_QSFP1_OE;
+       if (mask) {
+               /* We are writing register bits, so lock access */
+               dir &= mask;
+               data &= mask;
+
+               qsfp_oe = read_csr(dd, target_oe);
+               qsfp_oe = (qsfp_oe & ~(u64)mask) | (u64)dir;
+               write_csr(dd, target_oe, qsfp_oe);
+       }
+       /*
+        * We are exclusively reading bits here, but it is unlikely
+        * we'll get valid data when we set the direction of the pin
+        * in the same call, so the caller should invoke this function
+        * again to get valid data.
+        */
+       return read_csr(dd, target ? ASIC_QSFP2_IN : ASIC_QSFP1_IN);
+}
+
+#define CLEAR_STATIC_RATE_CONTROL_SMASK(r) \
+(r &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)
+
+#define SET_STATIC_RATE_CONTROL_SMASK(r) \
+(r |= SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)
+
+int hfi1_init_ctxt(struct send_context *sc)
+{
+       if (sc != NULL) {
+               struct hfi1_devdata *dd = sc->dd;
+               u64 reg;
+               u8 set = (sc->type == SC_USER ?
+                         HFI1_CAP_IS_USET(STATIC_RATE_CTRL) :
+                         HFI1_CAP_IS_KSET(STATIC_RATE_CTRL));
+               reg = read_kctxt_csr(dd, sc->hw_context,
+                                    SEND_CTXT_CHECK_ENABLE);
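+               /*
+                * The CSR bit disallows PBC static rate control, so clear
+                * it when the capability is enabled and set it otherwise.
+                */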
+               if (set)
+                       CLEAR_STATIC_RATE_CONTROL_SMASK(reg);
+               else
+                       SET_STATIC_RATE_CONTROL_SMASK(reg);
+               write_kctxt_csr(dd, sc->hw_context,
+                               SEND_CTXT_CHECK_ENABLE, reg);
+       }
+       return 0;
+}
+
+int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp)
+{
+       int ret = 0;
+       u64 reg;
+
+       if (dd->icode != ICODE_RTL_SILICON) {
+               if (HFI1_CAP_IS_KSET(PRINT_UNIMPL))
+                       dd_dev_info(dd, "%s: tempsense not supported by HW\n",
+                                   __func__);
+               return -EINVAL;
+       }
+       reg = read_csr(dd, ASIC_STS_THERM);
+       temp->curr = ((reg >> ASIC_STS_THERM_CURR_TEMP_SHIFT) &
+                     ASIC_STS_THERM_CURR_TEMP_MASK);
+       temp->lo_lim = ((reg >> ASIC_STS_THERM_LO_TEMP_SHIFT) &
+                       ASIC_STS_THERM_LO_TEMP_MASK);
+       temp->hi_lim = ((reg >> ASIC_STS_THERM_HI_TEMP_SHIFT) &
+                       ASIC_STS_THERM_HI_TEMP_MASK);
+       temp->crit_lim = ((reg >> ASIC_STS_THERM_CRIT_TEMP_SHIFT) &
+                         ASIC_STS_THERM_CRIT_TEMP_MASK);
+       /* triggers is a 3-bit value - 1 bit per trigger. */
+       temp->triggers = (u8)((reg >> ASIC_STS_THERM_LOW_SHIFT) & 0x7);
+
+       return ret;
+}
+
+/* ========================================================================= */
+
+/*
+ * Enable/disable chip from delivering interrupts.
+ */
+void set_intr_state(struct hfi1_devdata *dd, u32 enable)
+{
+       int i;
+
+       /*
+        * In HFI, the mask needs to be 1 to allow interrupts.
+        */
+       if (enable) {
+               u64 cce_int_mask;
+               const int qsfp1_int_smask = QSFP1_INT % 64;
+               const int qsfp2_int_smask = QSFP2_INT % 64;
+
+               /* enable all interrupts */
+               for (i = 0; i < CCE_NUM_INT_CSRS; i++)
+                       write_csr(dd, CCE_INT_MASK + (8*i), ~(u64)0);
+
+               /*
+                * disable QSFP1 interrupts for HFI1, QSFP2 interrupts for HFI0
+                * Qsfp1Int and Qsfp2Int are adjacent bits in the same CSR,
+                * therefore just one of QSFP1_INT/QSFP2_INT can be used to find
+                * the index of the appropriate CSR in the CCEIntMask CSR array
+                */
+               cce_int_mask = read_csr(dd, CCE_INT_MASK +
+                                               (8*(QSFP1_INT/64)));
+               if (dd->hfi1_id) {
+                       cce_int_mask &= ~((u64)1 << qsfp1_int_smask);
+                       write_csr(dd, CCE_INT_MASK + (8*(QSFP1_INT/64)),
+                                       cce_int_mask);
+               } else {
+                       cce_int_mask &= ~((u64)1 << qsfp2_int_smask);
+                       write_csr(dd, CCE_INT_MASK + (8*(QSFP2_INT/64)),
+                                       cce_int_mask);
+               }
+       } else {
+               for (i = 0; i < CCE_NUM_INT_CSRS; i++)
+                       write_csr(dd, CCE_INT_MASK + (8*i), 0ull);
+       }
+}
+
+/*
+ * Clear all interrupt sources on the chip.
+ */
+static void clear_all_interrupts(struct hfi1_devdata *dd)
+{
+       int i;
+
+       for (i = 0; i < CCE_NUM_INT_CSRS; i++)
+               write_csr(dd, CCE_INT_CLEAR + (8*i), ~(u64)0);
+
+       write_csr(dd, CCE_ERR_CLEAR, ~(u64)0);
+       write_csr(dd, MISC_ERR_CLEAR, ~(u64)0);
+       write_csr(dd, RCV_ERR_CLEAR, ~(u64)0);
+       write_csr(dd, SEND_ERR_CLEAR, ~(u64)0);
+       write_csr(dd, SEND_PIO_ERR_CLEAR, ~(u64)0);
+       write_csr(dd, SEND_DMA_ERR_CLEAR, ~(u64)0);
+       write_csr(dd, SEND_EGRESS_ERR_CLEAR, ~(u64)0);
+       for (i = 0; i < dd->chip_send_contexts; i++)
+               write_kctxt_csr(dd, i, SEND_CTXT_ERR_CLEAR, ~(u64)0);
+       for (i = 0; i < dd->chip_sdma_engines; i++)
+               write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_CLEAR, ~(u64)0);
+
+       write_csr(dd, DCC_ERR_FLG_CLR, ~(u64)0);
+       write_csr(dd, DC_LCB_ERR_CLR, ~(u64)0);
+       write_csr(dd, DC_DC8051_ERR_CLR, ~(u64)0);
+}
+
+/* Move to pcie.c? */
+static void disable_intx(struct pci_dev *pdev)
+{
+       pci_intx(pdev, 0);
+}
+
+static void clean_up_interrupts(struct hfi1_devdata *dd)
+{
+       int i;
+
+       /* remove irqs - must happen before disabling/turning off */
+       if (dd->num_msix_entries) {
+               /* MSI-X */
+               struct hfi1_msix_entry *me = dd->msix_entries;
+
+               for (i = 0; i < dd->num_msix_entries; i++, me++) {
+                       if (me->arg == NULL) /* => no irq, no affinity */
+                               break;
+                       irq_set_affinity_hint(dd->msix_entries[i].msix.vector,
+                                       NULL);
+                       free_irq(me->msix.vector, me->arg);
+               }
+       } else {
+               /* INTx */
+               if (dd->requested_intx_irq) {
+                       free_irq(dd->pcidev->irq, dd);
+                       dd->requested_intx_irq = 0;
+               }
+       }
+
+       /* turn off interrupts */
+       if (dd->num_msix_entries) {
+               /* MSI-X */
+               hfi1_nomsix(dd);
+       } else {
+               /* INTx */
+               disable_intx(dd->pcidev);
+       }
+
+       /* clean structures */
+       for (i = 0; i < dd->num_msix_entries; i++)
+               free_cpumask_var(dd->msix_entries[i].mask);
+       kfree(dd->msix_entries);
+       dd->msix_entries = NULL;
+       dd->num_msix_entries = 0;
+}
+
+/*
+ * Remap the interrupt source from the general handler to the given MSI-X
+ * interrupt.
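+ * Each 64-bit CCE_INT_MAP CSR holds eight 8-bit MSI-X vector numbers,
+ * one per chip interrupt source.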
+ */
+static void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr)
+{
+       u64 reg;
+       int m, n;
+
+       /* clear from the handled mask of the general interrupt */
+       m = isrc / 64;
+       n = isrc % 64;
+       dd->gi_mask[m] &= ~((u64)1 << n);
+
+       /* direct the chip source to the given MSI-X interrupt */
+       m = isrc / 8;
+       n = isrc % 8;
+       reg = read_csr(dd, CCE_INT_MAP + (8*m));
+       reg &= ~((u64)0xff << (8*n));
+       reg |= ((u64)msix_intr & 0xff) << (8*n);
+       write_csr(dd, CCE_INT_MAP + (8*m), reg);
+}
+
+static void remap_sdma_interrupts(struct hfi1_devdata *dd,
+                                 int engine, int msix_intr)
+{
+       /*
+        * SDMA engine interrupt sources grouped by type, rather than
+        * engine.  Per-engine interrupts are as follows:
+        *      SDMA
+        *      SDMAProgress
+        *      SDMAIdle
+        */
+       remap_intr(dd, IS_SDMA_START + 0*TXE_NUM_SDMA_ENGINES + engine,
+               msix_intr);
+       remap_intr(dd, IS_SDMA_START + 1*TXE_NUM_SDMA_ENGINES + engine,
+               msix_intr);
+       remap_intr(dd, IS_SDMA_START + 2*TXE_NUM_SDMA_ENGINES + engine,
+               msix_intr);
+}
+
+static void remap_receive_available_interrupt(struct hfi1_devdata *dd,
+                                             int rx, int msix_intr)
+{
+       remap_intr(dd, IS_RCVAVAIL_START + rx, msix_intr);
+}
+
+static int request_intx_irq(struct hfi1_devdata *dd)
+{
+       int ret;
+
+       snprintf(dd->intx_name, sizeof(dd->intx_name), DRIVER_NAME"_%d",
+               dd->unit);
+       ret = request_irq(dd->pcidev->irq, general_interrupt,
+                                 IRQF_SHARED, dd->intx_name, dd);
+       if (ret)
+               dd_dev_err(dd, "unable to request INTx interrupt, err %d\n",
+                               ret);
+       else
+               dd->requested_intx_irq = 1;
+       return ret;
+}
+
+static int request_msix_irqs(struct hfi1_devdata *dd)
+{
+       const struct cpumask *local_mask;
+       cpumask_var_t def, rcv;
+       bool def_ret, rcv_ret;
+       int first_general, last_general;
+       int first_sdma, last_sdma;
+       int first_rx, last_rx;
+       int first_cpu, restart_cpu, curr_cpu;
+       int rcv_cpu, sdma_cpu;
+       int i, ret = 0, possible;
+       int ht;
+
+       /* calculate the ranges we are going to use */
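+       /*
+        * MSI-X vector layout: 0 is the general interrupt, followed by
+        * one per SDMA engine, then one per kernel receive context.
+        */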
+       first_general = 0;
+       first_sdma = last_general = first_general + 1;
+       first_rx = last_sdma = first_sdma + dd->num_sdma;
+       last_rx = first_rx + dd->n_krcv_queues;
+
+       /*
+        * Interrupt affinity.
+        *
+        * non-rcv avail gets a default mask that
+        * starts as possible cpus with threads reset
+        * and each rcv avail reset.
+        *
+        * rcv avail gets node relative 1 wrapping back
+        * to the node relative 1 as necessary.
+        *
+        */
+       local_mask = cpumask_of_pcibus(dd->pcidev->bus);
+       /* if first cpu is invalid, use NUMA 0 */
+       if (cpumask_first(local_mask) >= nr_cpu_ids)
+               local_mask = topology_core_cpumask(0);
+
+       def_ret = zalloc_cpumask_var(&def, GFP_KERNEL);
+       rcv_ret = zalloc_cpumask_var(&rcv, GFP_KERNEL);
+       if (!def_ret || !rcv_ret)
+               goto bail;
+       /* use local mask as default */
+       cpumask_copy(def, local_mask);
+       possible = cpumask_weight(def);
+       /* disarm threads from default */
+       ht = cpumask_weight(
+                       topology_sibling_cpumask(cpumask_first(local_mask)));
+       for (i = possible/ht; i < possible; i++)
+               cpumask_clear_cpu(i, def);
+       /* reset possible */
+       possible = cpumask_weight(def);
+       /* def now has full cores on chosen node */
+       first_cpu = cpumask_first(def);
+       if (nr_cpu_ids >= first_cpu)
+               first_cpu++;
+       restart_cpu = first_cpu;
+       curr_cpu = restart_cpu;
+
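+       /*
+        * Reserve one CPU per kernel receive queue: move it from the
+        * default mask into the receive mask, wrapping as needed.
+        */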
+       for (i = first_cpu; i < dd->n_krcv_queues + first_cpu; i++) {
+               cpumask_clear_cpu(curr_cpu, def);
+               cpumask_set_cpu(curr_cpu, rcv);
+               if (curr_cpu >= possible)
+                       curr_cpu = restart_cpu;
+               else
+                       curr_cpu++;
+       }
+       /* def mask has non-rcv, rcv has recv mask */
+       rcv_cpu = cpumask_first(rcv);
+       sdma_cpu = cpumask_first(def);
+
+       /*
+        * Sanity check - the code expects all SDMA chip source
+        * interrupts to be in the same CSR, starting at bit 0.  Verify
+        * that this is true by checking the bit location of the start.
+        */
+       BUILD_BUG_ON(IS_SDMA_START % 64);
+
+       for (i = 0; i < dd->num_msix_entries; i++) {
+               struct hfi1_msix_entry *me = &dd->msix_entries[i];
+               const char *err_info;
+               irq_handler_t handler;
+               void *arg;
+               int idx;
+               struct hfi1_ctxtdata *rcd = NULL;
+               struct sdma_engine *sde = NULL;
+
+               /* obtain the arguments to request_irq */
+               if (first_general <= i && i < last_general) {
+                       idx = i - first_general;
+                       handler = general_interrupt;
+                       arg = dd;
+                       snprintf(me->name, sizeof(me->name),
+                               DRIVER_NAME"_%d", dd->unit);
+                       err_info = "general";
+               } else if (first_sdma <= i && i < last_sdma) {
+                       idx = i - first_sdma;
+                       sde = &dd->per_sdma[idx];
+                       handler = sdma_interrupt;
+                       arg = sde;
+                       snprintf(me->name, sizeof(me->name),
+                               DRIVER_NAME"_%d sdma%d", dd->unit, idx);
+                       err_info = "sdma";
+                       remap_sdma_interrupts(dd, idx, i);
+               } else if (first_rx <= i && i < last_rx) {
+                       idx = i - first_rx;
+                       rcd = dd->rcd[idx];
+                       /* no interrupt if no rcd */
+                       if (!rcd)
+                               continue;
+                       /*
+                        * Set the interrupt register and mask for this
+                        * context's interrupt.
+                        */
+                       rcd->ireg = (IS_RCVAVAIL_START+idx) / 64;
+                       rcd->imask = ((u64)1) <<
+                                       ((IS_RCVAVAIL_START+idx) % 64);
+                       handler = receive_context_interrupt;
+                       arg = rcd;
+                       snprintf(me->name, sizeof(me->name),
+                               DRIVER_NAME"_%d kctxt%d", dd->unit, idx);
+                       err_info = "receive context";
+                       remap_receive_available_interrupt(dd, idx, i);
+               } else {
+                       /*
+                        * Not in our expected range - complain, then
+                        * ignore it.
+                        */
+                       dd_dev_err(dd,
+                               "Unexpected extra MSI-X interrupt %d\n", i);
+                       continue;
+               }
+               /* no argument, no interrupt */
+               if (arg == NULL)
+                       continue;
+               /* make sure the name is terminated */
+               me->name[sizeof(me->name)-1] = 0;
+
+               ret = request_irq(me->msix.vector, handler, 0, me->name, arg);
+               if (ret) {
+                       dd_dev_err(dd,
+                               "unable to allocate %s interrupt, vector %d, index %d, err %d\n",
+                                err_info, me->msix.vector, idx, ret);
+                       return ret;
+               }
+               /*
+                * assign arg after request_irq call, so it will be
+                * cleaned up
+                */
+               me->arg = arg;
+
+               if (!zalloc_cpumask_var(
+                       &dd->msix_entries[i].mask,
+                       GFP_KERNEL))
+                       goto bail;
+               if (handler == sdma_interrupt) {
+                       dd_dev_info(dd, "sdma engine %d cpu %d\n",
+                               sde->this_idx, sdma_cpu);
+                       cpumask_set_cpu(sdma_cpu, dd->msix_entries[i].mask);
+                       sdma_cpu = cpumask_next(sdma_cpu, def);
+                       if (sdma_cpu >= nr_cpu_ids)
+                               sdma_cpu = cpumask_first(def);
+               } else if (handler == receive_context_interrupt) {
+                       dd_dev_info(dd, "rcv ctxt %d cpu %d\n",
+                               rcd->ctxt, rcv_cpu);
+                       cpumask_set_cpu(rcv_cpu, dd->msix_entries[i].mask);
+                       rcv_cpu = cpumask_next(rcv_cpu, rcv);
+                       if (rcv_cpu >= nr_cpu_ids)
+                               rcv_cpu = cpumask_first(rcv);
+               } else {
+                       /* otherwise first def */
+                       dd_dev_info(dd, "%s cpu %d\n",
+                               err_info, cpumask_first(def));
+                       cpumask_set_cpu(
+                               cpumask_first(def), dd->msix_entries[i].mask);
+               }
+               irq_set_affinity_hint(
+                       dd->msix_entries[i].msix.vector,
+                       dd->msix_entries[i].mask);
+       }
+
+out:
+       free_cpumask_var(def);
+       free_cpumask_var(rcv);
+       return ret;
+bail:
+       ret = -ENOMEM;
+       goto  out;
+}
+
+/*
+ * Set the general handler to accept all interrupts, remap all
+ * chip interrupts back to MSI-X 0.
+ */
+static void reset_interrupts(struct hfi1_devdata *dd)
+{
+       int i;
+
+       /* all interrupts handled by the general handler */
+       for (i = 0; i < CCE_NUM_INT_CSRS; i++)
+               dd->gi_mask[i] = ~(u64)0;
+
+       /* all chip interrupts map to MSI-X 0 */
+       for (i = 0; i < CCE_NUM_INT_MAP_CSRS; i++)
+               write_csr(dd, CCE_INT_MAP + (8*i), 0);
+}
+
+static int set_up_interrupts(struct hfi1_devdata *dd)
+{
+       struct hfi1_msix_entry *entries;
+       u32 total, request;
+       int i, ret;
+       int single_interrupt = 0; /* we expect to have all the interrupts */
+
+       /*
+        * Interrupt count:
+        *      1 general, "slow path" interrupt (includes the SDMA engines
+        *              slow source, SDMACleanupDone)
+        *      N interrupts - one per used SDMA engine
+        *      M interrupts - one per kernel receive context
+        */
+       total = 1 + dd->num_sdma + dd->n_krcv_queues;
+
+       entries = kcalloc(total, sizeof(*entries), GFP_KERNEL);
+       if (!entries) {
+               dd_dev_err(dd, "cannot allocate msix table\n");
+               ret = -ENOMEM;
+               goto fail;
+       }
+       /* 1-1 MSI-X entry assignment */
+       for (i = 0; i < total; i++)
+               entries[i].msix.entry = i;
+
+       /* ask for MSI-X interrupts */
+       request = total;
+       request_msix(dd, &request, entries);
+
+       if (request == 0) {
+               /* using INTx */
+               /* dd->num_msix_entries already zero */
+               kfree(entries);
+               single_interrupt = 1;
+               dd_dev_err(dd, "MSI-X failed, using INTx interrupts\n");
+       } else {
+               /* using MSI-X */
+               dd->num_msix_entries = request;
+               dd->msix_entries = entries;
+
+               if (request != total) {
+                       /* using MSI-X, with reduced interrupts */
+                       dd_dev_err(
+                               dd,
+                               "cannot handle reduced interrupt case, want %u, got %u\n",
+                               total, request);
+                       ret = -EINVAL;
+                       goto fail;
+               }
+               dd_dev_info(dd, "%u MSI-X interrupts allocated\n", total);
+       }
+
+       /* mask all interrupts */
+       set_intr_state(dd, 0);
+       /* clear all pending interrupts */
+       clear_all_interrupts(dd);
+
+       /* reset general handler mask, chip MSI-X mappings */
+       reset_interrupts(dd);
+
+       if (single_interrupt)
+               ret = request_intx_irq(dd);
+       else
+               ret = request_msix_irqs(dd);
+       if (ret)
+               goto fail;
+
+       return 0;
+
+fail:
+       clean_up_interrupts(dd);
+       return ret;
+}
+
+/*
+ * Set up context values in dd.  Sets:
+ *
+ *     num_rcv_contexts - number of contexts being used
+ *     n_krcv_queues - number of kernel contexts
+ *     first_user_ctxt - first non-kernel context in array of contexts
+ *     freectxts  - number of free user contexts
+ *     num_send_contexts - number of PIO send contexts being used
+ */
+static int set_up_context_variables(struct hfi1_devdata *dd)
+{
+       int num_kernel_contexts;
+       int num_user_contexts;
+       int total_contexts;
+       int ret;
+       unsigned ngroups;
+
+       /*
+        * Kernel contexts: (to be fixed later):
+        * - min of 2 or 1 context/numa
+        * - Context 0 - default/errors
+        * - Context 1 - VL15
+        */
+       if (n_krcvqs)
+               num_kernel_contexts = n_krcvqs + MIN_KERNEL_KCTXTS;
+       else
+               num_kernel_contexts = num_online_nodes();
+       num_kernel_contexts =
+               max_t(int, MIN_KERNEL_KCTXTS, num_kernel_contexts);
+       /*
+        * Every kernel receive context needs an ACK send context.
+        * one send context is allocated for each VL{0-7} and VL15
+        */
+       if (num_kernel_contexts > (dd->chip_send_contexts - num_vls - 1)) {
+               dd_dev_err(dd,
+                          "Reducing # kernel rcv contexts to: %d, from %d\n",
+                          (int)(dd->chip_send_contexts - num_vls - 1),
+                          (int)num_kernel_contexts);
+               num_kernel_contexts = dd->chip_send_contexts - num_vls - 1;
+       }
+       /*
+        * User contexts: (to be fixed later)
+        *      - set to num_rcv_contexts if non-zero
+        *      - default to 1 user context per CPU
+        */
+       if (num_rcv_contexts)
+               num_user_contexts = num_rcv_contexts;
+       else
+               num_user_contexts = num_online_cpus();
+
+       total_contexts = num_kernel_contexts + num_user_contexts;
+
+       /*
+        * Adjust the counts given a global max.
+        */
+       if (total_contexts > dd->chip_rcv_contexts) {
+               dd_dev_err(dd,
+                          "Reducing # user receive contexts to: %d, from %d\n",
+                          (int)(dd->chip_rcv_contexts - num_kernel_contexts),
+                          (int)num_user_contexts);
+               num_user_contexts = dd->chip_rcv_contexts - num_kernel_contexts;
+               /* recalculate */
+               total_contexts = num_kernel_contexts + num_user_contexts;
+       }
+
+       /* the first N are kernel contexts, the rest are user contexts */
+       dd->num_rcv_contexts = total_contexts;
+       dd->n_krcv_queues = num_kernel_contexts;
+       dd->first_user_ctxt = num_kernel_contexts;
+       dd->freectxts = num_user_contexts;
+       dd_dev_info(dd,
+               "rcv contexts: chip %d, used %d (kernel %d, user %d)\n",
+               (int)dd->chip_rcv_contexts,
+               (int)dd->num_rcv_contexts,
+               (int)dd->n_krcv_queues,
+               (int)dd->num_rcv_contexts - dd->n_krcv_queues);
+
+       /*
+        * Receive array allocation:
+        *   All RcvArray entries are divided into groups of 8. This
+        *   is required by the hardware and will speed up writes to
+        *   consecutive entries by using write-combining of the entire
+        *   cacheline.
+        *
+        *   The number of groups is evenly divided among all contexts;
+        *   any left-over groups are given to the first N user
+        *   contexts.
+        */
+       dd->rcv_entries.group_size = RCV_INCREMENT;
+       ngroups = dd->chip_rcv_array_count / dd->rcv_entries.group_size;
+       dd->rcv_entries.ngroups = ngroups / dd->num_rcv_contexts;
+       dd->rcv_entries.nctxt_extra = ngroups -
+               (dd->num_rcv_contexts * dd->rcv_entries.ngroups);
+       dd_dev_info(dd, "RcvArray groups %u, ctxts extra %u\n",
+                   dd->rcv_entries.ngroups,
+                   dd->rcv_entries.nctxt_extra);
+       if (dd->rcv_entries.ngroups * dd->rcv_entries.group_size >
+           MAX_EAGER_ENTRIES * 2) {
+               dd->rcv_entries.ngroups = (MAX_EAGER_ENTRIES * 2) /
+                       dd->rcv_entries.group_size;
+               dd_dev_info(dd,
+                  "RcvArray group count too high, change to %u\n",
+                  dd->rcv_entries.ngroups);
+               dd->rcv_entries.nctxt_extra = 0;
+       }
+       /*
+        * PIO send contexts
+        */
+       ret = init_sc_pools_and_sizes(dd);
+       if (ret >= 0) { /* success */
+               dd->num_send_contexts = ret;
+               dd_dev_info(
+                       dd,
+                       "send contexts: chip %d, used %d (kernel %d, ack %d, user %d)\n",
+                       dd->chip_send_contexts,
+                       dd->num_send_contexts,
+                       dd->sc_sizes[SC_KERNEL].count,
+                       dd->sc_sizes[SC_ACK].count,
+                       dd->sc_sizes[SC_USER].count);
+               ret = 0;        /* success */
+       }
+
+       return ret;
+}
+
+/*
+ * Set the device/port partition key table. The MAD code
+ * will ensure that, at least, the partial management
+ * partition key is present in the table.
+ */
+static void set_partition_keys(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u64 reg = 0;
+       int i;
+
+       dd_dev_info(dd, "Setting partition keys\n");
+       for (i = 0; i < hfi1_get_npkeys(dd); i++) {
+               reg |= (ppd->pkeys[i] &
+                       RCV_PARTITION_KEY_PARTITION_KEY_A_MASK) <<
+                       ((i % 4) *
+                        RCV_PARTITION_KEY_PARTITION_KEY_B_SHIFT);
+               /* Each register holds 4 PKey values. */
+               if ((i % 4) == 3) {
+                       write_csr(dd, RCV_PARTITION_KEY +
+                                 ((i - 3) * 2), reg);
+                       reg = 0;
+               }
+       }
+
+       /* Always enable HW pkeys check when pkeys table is set */
+       add_rcvctrl(dd, RCV_CTRL_RCV_PARTITION_KEY_ENABLE_SMASK);
+}
+
+/*
+ * These CSRs and memories are uninitialized on reset and must be
+ * written before reading to set the ECC/parity bits.
+ *
+ * NOTE: All user context CSRs that are not mmaped write-only
+ * (e.g. the TID flows) must be initialized even if the driver never
+ * reads them.
+ */
+static void write_uninitialized_csrs_and_memories(struct hfi1_devdata *dd)
+{
+       int i, j;
+
+       /* CceIntMap */
+       for (i = 0; i < CCE_NUM_INT_MAP_CSRS; i++)
+               write_csr(dd, CCE_INT_MAP+(8*i), 0);
+
+       /* SendCtxtCreditReturnAddr */
+       for (i = 0; i < dd->chip_send_contexts; i++)
+               write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_RETURN_ADDR, 0);
+
+       /* PIO Send buffers */
+       /* SDMA Send buffers */
+       /*
+        * These are not normally read, and (presently) have no method
+        * to be read, so are not pre-initialized.
+        */
+
+       /* RcvHdrAddr */
+       /* RcvHdrTailAddr */
+       /* RcvTidFlowTable */
+       for (i = 0; i < dd->chip_rcv_contexts; i++) {
+               write_kctxt_csr(dd, i, RCV_HDR_ADDR, 0);
+               write_kctxt_csr(dd, i, RCV_HDR_TAIL_ADDR, 0);
+               for (j = 0; j < RXE_NUM_TID_FLOWS; j++)
+                       write_uctxt_csr(dd, i, RCV_TID_FLOW_TABLE+(8*j), 0);
+       }
+
+       /* RcvArray */
+       for (i = 0; i < dd->chip_rcv_array_count; i++)
+               write_csr(dd, RCV_ARRAY + (8*i),
+                                       RCV_ARRAY_RT_WRITE_ENABLE_SMASK);
+
+       /* RcvQPMapTable */
+       for (i = 0; i < 32; i++)
+               write_csr(dd, RCV_QP_MAP_TABLE + (8 * i), 0);
+}
+
+/*
+ * Use the ctrl_bits in CceCtrl to clear the status_bits in CceStatus.
+ */
+static void clear_cce_status(struct hfi1_devdata *dd, u64 status_bits,
+                            u64 ctrl_bits)
+{
+       unsigned long timeout;
+       u64 reg;
+
+       /* is the condition present? */
+       reg = read_csr(dd, CCE_STATUS);
+       if ((reg & status_bits) == 0)
+               return;
+
+       /* clear the condition */
+       write_csr(dd, CCE_CTRL, ctrl_bits);
+
+       /* wait for the condition to clear */
+       timeout = jiffies + msecs_to_jiffies(CCE_STATUS_TIMEOUT);
+       while (1) {
+               reg = read_csr(dd, CCE_STATUS);
+               if ((reg & status_bits) == 0)
+                       return;
+               if (time_after(jiffies, timeout)) {
+                       dd_dev_err(dd,
+                               "Timeout waiting for CceStatus to clear bits 0x%llx, remaining 0x%llx\n",
+                               status_bits, reg & status_bits);
+                       return;
+               }
+               udelay(1);
+       }
+}
+
+/* set CCE CSRs to chip reset defaults */
+static void reset_cce_csrs(struct hfi1_devdata *dd)
+{
+       int i;
+
+       /* CCE_REVISION read-only */
+       /* CCE_REVISION2 read-only */
+       /* CCE_CTRL - bits clear automatically */
+       /* CCE_STATUS read-only, use CceCtrl to clear */
+       clear_cce_status(dd, ALL_FROZE, CCE_CTRL_SPC_UNFREEZE_SMASK);
+       clear_cce_status(dd, ALL_TXE_PAUSE, CCE_CTRL_TXE_RESUME_SMASK);
+       clear_cce_status(dd, ALL_RXE_PAUSE, CCE_CTRL_RXE_RESUME_SMASK);
+       for (i = 0; i < CCE_NUM_SCRATCH; i++)
+               write_csr(dd, CCE_SCRATCH + (8 * i), 0);
+       /* CCE_ERR_STATUS read-only */
+       write_csr(dd, CCE_ERR_MASK, 0);
+       write_csr(dd, CCE_ERR_CLEAR, ~0ull);
+       /* CCE_ERR_FORCE leave alone */
+       for (i = 0; i < CCE_NUM_32_BIT_COUNTERS; i++)
+               write_csr(dd, CCE_COUNTER_ARRAY32 + (8 * i), 0);
+       write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_RESETCSR);
+       /* CCE_PCIE_CTRL leave alone */
+       for (i = 0; i < CCE_NUM_MSIX_VECTORS; i++) {
+               write_csr(dd, CCE_MSIX_TABLE_LOWER + (8 * i), 0);
+               write_csr(dd, CCE_MSIX_TABLE_UPPER + (8 * i),
+                                       CCE_MSIX_TABLE_UPPER_RESETCSR);
+       }
+       for (i = 0; i < CCE_NUM_MSIX_PBAS; i++) {
+               /* CCE_MSIX_PBA read-only */
+               write_csr(dd, CCE_MSIX_INT_GRANTED, ~0ull);
+               write_csr(dd, CCE_MSIX_VEC_CLR_WITHOUT_INT, ~0ull);
+       }
+       for (i = 0; i < CCE_NUM_INT_MAP_CSRS; i++)
+               write_csr(dd, CCE_INT_MAP, 0);
+       for (i = 0; i < CCE_NUM_INT_CSRS; i++) {
+               /* CCE_INT_STATUS read-only */
+               write_csr(dd, CCE_INT_MASK + (8 * i), 0);
+               write_csr(dd, CCE_INT_CLEAR + (8 * i), ~0ull);
+               /* CCE_INT_FORCE leave alone */
+               /* CCE_INT_BLOCKED read-only */
+       }
+       for (i = 0; i < CCE_NUM_32_BIT_INT_COUNTERS; i++)
+               write_csr(dd, CCE_INT_COUNTER_ARRAY32 + (8 * i), 0);
+}
+
+/* set ASIC CSRs to chip reset defaults */
+static void reset_asic_csrs(struct hfi1_devdata *dd)
+{
+       static DEFINE_MUTEX(asic_mutex);
+       static int called;
+       int i;
+
+       /*
+        * If the HFIs are shared between separate nodes or VMs,
+        * then more will need to be done here.  One idea is a module
+        * parameter that returns early, letting the first power-on or
+        * a known first load do the reset and blocking all others.
+        */
+
+       /*
+        * These CSRs should only be reset once - the first one here will
+        * do the work.  Use a mutex so that a non-first caller waits until
+        * the first is finished before it can proceed.
+        */
+       mutex_lock(&asic_mutex);
+       if (called)
+               goto done;
+       called = 1;
+
+       if (dd->icode != ICODE_FPGA_EMULATION) {
+               /* emulation does not have an SBus - leave these alone */
+               /*
+                * All writes to ASIC_CFG_SBUS_REQUEST do something.
+                * Notes:
+                * o The reset is not zero if aimed at the core.  See the
+                *   SBus documentation for details.
+                * o If the SBus firmware has been updated (e.g. by the BIOS),
+                *   will the reset revert that?
+                */
+               /* ASIC_CFG_SBUS_REQUEST leave alone */
+               write_csr(dd, ASIC_CFG_SBUS_EXECUTE, 0);
+       }
+       /* ASIC_SBUS_RESULT read-only */
+       write_csr(dd, ASIC_STS_SBUS_COUNTERS, 0);
+       for (i = 0; i < ASIC_NUM_SCRATCH; i++)
+               write_csr(dd, ASIC_CFG_SCRATCH + (8 * i), 0);
+       write_csr(dd, ASIC_CFG_MUTEX, 0);       /* this will clear it */
+       write_csr(dd, ASIC_CFG_DRV_STR, 0);
+       write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0);
+       /* ASIC_STS_THERM read-only */
+       /* ASIC_CFG_RESET leave alone */
+
+       write_csr(dd, ASIC_PCIE_SD_HOST_CMD, 0);
+       /* ASIC_PCIE_SD_HOST_STATUS read-only */
+       write_csr(dd, ASIC_PCIE_SD_INTRPT_DATA_CODE, 0);
+       write_csr(dd, ASIC_PCIE_SD_INTRPT_ENABLE, 0);
+       /* ASIC_PCIE_SD_INTRPT_PROGRESS read-only */
+       write_csr(dd, ASIC_PCIE_SD_INTRPT_STATUS, ~0ull); /* clear */
+       /* ASIC_HFI0_PCIE_SD_INTRPT_RSPD_DATA read-only */
+       /* ASIC_HFI1_PCIE_SD_INTRPT_RSPD_DATA read-only */
+       for (i = 0; i < 16; i++)
+               write_csr(dd, ASIC_PCIE_SD_INTRPT_LIST + (8 * i), 0);
+
+       /* ASIC_GPIO_IN read-only */
+       write_csr(dd, ASIC_GPIO_OE, 0);
+       write_csr(dd, ASIC_GPIO_INVERT, 0);
+       write_csr(dd, ASIC_GPIO_OUT, 0);
+       write_csr(dd, ASIC_GPIO_MASK, 0);
+       /* ASIC_GPIO_STATUS read-only */
+       write_csr(dd, ASIC_GPIO_CLEAR, ~0ull);
+       /* ASIC_GPIO_FORCE leave alone */
+
+       /* ASIC_QSFP1_IN read-only */
+       write_csr(dd, ASIC_QSFP1_OE, 0);
+       write_csr(dd, ASIC_QSFP1_INVERT, 0);
+       write_csr(dd, ASIC_QSFP1_OUT, 0);
+       write_csr(dd, ASIC_QSFP1_MASK, 0);
+       /* ASIC_QSFP1_STATUS read-only */
+       write_csr(dd, ASIC_QSFP1_CLEAR, ~0ull);
+       /* ASIC_QSFP1_FORCE leave alone */
+
+       /* ASIC_QSFP2_IN read-only */
+       write_csr(dd, ASIC_QSFP2_OE, 0);
+       write_csr(dd, ASIC_QSFP2_INVERT, 0);
+       write_csr(dd, ASIC_QSFP2_OUT, 0);
+       write_csr(dd, ASIC_QSFP2_MASK, 0);
+       /* ASIC_QSFP2_STATUS read-only */
+       write_csr(dd, ASIC_QSFP2_CLEAR, ~0ull);
+       /* ASIC_QSFP2_FORCE leave alone */
+
+       write_csr(dd, ASIC_EEP_CTL_STAT, ASIC_EEP_CTL_STAT_RESETCSR);
+       /* this also writes a NOP command, clearing paging mode */
+       write_csr(dd, ASIC_EEP_ADDR_CMD, 0);
+       write_csr(dd, ASIC_EEP_DATA, 0);
+
+done:
+       mutex_unlock(&asic_mutex);
+}
+
+/* set MISC CSRs to chip reset defaults */
+static void reset_misc_csrs(struct hfi1_devdata *dd)
+{
+       int i;
+
+       for (i = 0; i < 32; i++) {
+               write_csr(dd, MISC_CFG_RSA_R2 + (8 * i), 0);
+               write_csr(dd, MISC_CFG_RSA_SIGNATURE + (8 * i), 0);
+               write_csr(dd, MISC_CFG_RSA_MODULUS + (8 * i), 0);
+       }
+       /* MISC_CFG_SHA_PRELOAD leave alone - always reads 0 and can
+          only be written in 128-byte chunks */
+       /* init RSA engine to clear lingering errors */
+       write_csr(dd, MISC_CFG_RSA_CMD, 1);
+       write_csr(dd, MISC_CFG_RSA_MU, 0);
+       write_csr(dd, MISC_CFG_FW_CTRL, 0);
+       /* MISC_STS_8051_DIGEST read-only */
+       /* MISC_STS_SBM_DIGEST read-only */
+       /* MISC_STS_PCIE_DIGEST read-only */
+       /* MISC_STS_FAB_DIGEST read-only */
+       /* MISC_ERR_STATUS read-only */
+       write_csr(dd, MISC_ERR_MASK, 0);
+       write_csr(dd, MISC_ERR_CLEAR, ~0ull);
+       /* MISC_ERR_FORCE leave alone */
+}
+
+/* set TXE CSRs to chip reset defaults */
+static void reset_txe_csrs(struct hfi1_devdata *dd)
+{
+       int i;
+
+       /*
+        * TXE Kernel CSRs
+        */
+       write_csr(dd, SEND_CTRL, 0);
+       __cm_reset(dd, 0);      /* reset CM internal state */
+       /* SEND_CONTEXTS read-only */
+       /* SEND_DMA_ENGINES read-only */
+       /* SEND_PIO_MEM_SIZE read-only */
+       /* SEND_DMA_MEM_SIZE read-only */
+       write_csr(dd, SEND_HIGH_PRIORITY_LIMIT, 0);
+       pio_reset_all(dd);      /* SEND_PIO_INIT_CTXT */
+       /* SEND_PIO_ERR_STATUS read-only */
+       write_csr(dd, SEND_PIO_ERR_MASK, 0);
+       write_csr(dd, SEND_PIO_ERR_CLEAR, ~0ull);
+       /* SEND_PIO_ERR_FORCE leave alone */
+       /* SEND_DMA_ERR_STATUS read-only */
+       write_csr(dd, SEND_DMA_ERR_MASK, 0);
+       write_csr(dd, SEND_DMA_ERR_CLEAR, ~0ull);
+       /* SEND_DMA_ERR_FORCE leave alone */
+       /* SEND_EGRESS_ERR_STATUS read-only */
+       write_csr(dd, SEND_EGRESS_ERR_MASK, 0);
+       write_csr(dd, SEND_EGRESS_ERR_CLEAR, ~0ull);
+       /* SEND_EGRESS_ERR_FORCE leave alone */
+       write_csr(dd, SEND_BTH_QP, 0);
+       write_csr(dd, SEND_STATIC_RATE_CONTROL, 0);
+       write_csr(dd, SEND_SC2VLT0, 0);
+       write_csr(dd, SEND_SC2VLT1, 0);
+       write_csr(dd, SEND_SC2VLT2, 0);
+       write_csr(dd, SEND_SC2VLT3, 0);
+       write_csr(dd, SEND_LEN_CHECK0, 0);
+       write_csr(dd, SEND_LEN_CHECK1, 0);
+       /* SEND_ERR_STATUS read-only */
+       write_csr(dd, SEND_ERR_MASK, 0);
+       write_csr(dd, SEND_ERR_CLEAR, ~0ull);
+       /* SEND_ERR_FORCE read-only */
+       for (i = 0; i < VL_ARB_LOW_PRIO_TABLE_SIZE; i++)
+               write_csr(dd, SEND_LOW_PRIORITY_LIST + (8*i), 0);
+       for (i = 0; i < VL_ARB_HIGH_PRIO_TABLE_SIZE; i++)
+               write_csr(dd, SEND_HIGH_PRIORITY_LIST + (8*i), 0);
+       for (i = 0; i < dd->chip_send_contexts/NUM_CONTEXTS_PER_SET; i++)
+               write_csr(dd, SEND_CONTEXT_SET_CTRL + (8*i), 0);
+       for (i = 0; i < TXE_NUM_32_BIT_COUNTER; i++)
+               write_csr(dd, SEND_COUNTER_ARRAY32 + (8*i), 0);
+       for (i = 0; i < TXE_NUM_64_BIT_COUNTER; i++)
+               write_csr(dd, SEND_COUNTER_ARRAY64 + (8*i), 0);
+       write_csr(dd, SEND_CM_CTRL, SEND_CM_CTRL_RESETCSR);
+       write_csr(dd, SEND_CM_GLOBAL_CREDIT,
+                                       SEND_CM_GLOBAL_CREDIT_RESETCSR);
+       /* SEND_CM_CREDIT_USED_STATUS read-only */
+       write_csr(dd, SEND_CM_TIMER_CTRL, 0);
+       write_csr(dd, SEND_CM_LOCAL_AU_TABLE0_TO3, 0);
+       write_csr(dd, SEND_CM_LOCAL_AU_TABLE4_TO7, 0);
+       write_csr(dd, SEND_CM_REMOTE_AU_TABLE0_TO3, 0);
+       write_csr(dd, SEND_CM_REMOTE_AU_TABLE4_TO7, 0);
+       for (i = 0; i < TXE_NUM_DATA_VL; i++)
+               write_csr(dd, SEND_CM_CREDIT_VL + (8*i), 0);
+       write_csr(dd, SEND_CM_CREDIT_VL15, 0);
+       /* SEND_CM_CREDIT_USED_VL read-only */
+       /* SEND_CM_CREDIT_USED_VL15 read-only */
+       /* SEND_EGRESS_CTXT_STATUS read-only */
+       /* SEND_EGRESS_SEND_DMA_STATUS read-only */
+       write_csr(dd, SEND_EGRESS_ERR_INFO, ~0ull);
+       /* SEND_EGRESS_ERR_INFO read-only */
+       /* SEND_EGRESS_ERR_SOURCE read-only */
+
+       /*
+        * TXE Per-Context CSRs
+        */
+       for (i = 0; i < dd->chip_send_contexts; i++) {
+               write_kctxt_csr(dd, i, SEND_CTXT_CTRL, 0);
+               write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_CTRL, 0);
+               write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_RETURN_ADDR, 0);
+               write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_FORCE, 0);
+               write_kctxt_csr(dd, i, SEND_CTXT_ERR_MASK, 0);
+               write_kctxt_csr(dd, i, SEND_CTXT_ERR_CLEAR, ~0ull);
+               write_kctxt_csr(dd, i, SEND_CTXT_CHECK_ENABLE, 0);
+               write_kctxt_csr(dd, i, SEND_CTXT_CHECK_VL, 0);
+               write_kctxt_csr(dd, i, SEND_CTXT_CHECK_JOB_KEY, 0);
+               write_kctxt_csr(dd, i, SEND_CTXT_CHECK_PARTITION_KEY, 0);
+               write_kctxt_csr(dd, i, SEND_CTXT_CHECK_SLID, 0);
+               write_kctxt_csr(dd, i, SEND_CTXT_CHECK_OPCODE, 0);
+       }
+
+       /*
+        * TXE Per-SDMA CSRs
+        */
+       for (i = 0; i < dd->chip_sdma_engines; i++) {
+               write_kctxt_csr(dd, i, SEND_DMA_CTRL, 0);
+               /* SEND_DMA_STATUS read-only */
+               write_kctxt_csr(dd, i, SEND_DMA_BASE_ADDR, 0);
+               write_kctxt_csr(dd, i, SEND_DMA_LEN_GEN, 0);
+               write_kctxt_csr(dd, i, SEND_DMA_TAIL, 0);
+               /* SEND_DMA_HEAD read-only */
+               write_kctxt_csr(dd, i, SEND_DMA_HEAD_ADDR, 0);
+               write_kctxt_csr(dd, i, SEND_DMA_PRIORITY_THLD, 0);
+               /* SEND_DMA_IDLE_CNT read-only */
+               write_kctxt_csr(dd, i, SEND_DMA_RELOAD_CNT, 0);
+               write_kctxt_csr(dd, i, SEND_DMA_DESC_CNT, 0);
+               /* SEND_DMA_DESC_FETCHED_CNT read-only */
+               /* SEND_DMA_ENG_ERR_STATUS read-only */
+               write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_MASK, 0);
+               write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_CLEAR, ~0ull);
+               /* SEND_DMA_ENG_ERR_FORCE leave alone */
+               write_kctxt_csr(dd, i, SEND_DMA_CHECK_ENABLE, 0);
+               write_kctxt_csr(dd, i, SEND_DMA_CHECK_VL, 0);
+               write_kctxt_csr(dd, i, SEND_DMA_CHECK_JOB_KEY, 0);
+               write_kctxt_csr(dd, i, SEND_DMA_CHECK_PARTITION_KEY, 0);
+               write_kctxt_csr(dd, i, SEND_DMA_CHECK_SLID, 0);
+               write_kctxt_csr(dd, i, SEND_DMA_CHECK_OPCODE, 0);
+               write_kctxt_csr(dd, i, SEND_DMA_MEMORY, 0);
+       }
+}
+
+/*
+ * Expect on entry:
+ * o Packet ingress is disabled, i.e. RcvCtrl.RcvPortEnable == 0
+ */
+static void init_rbufs(struct hfi1_devdata *dd)
+{
+       u64 reg;
+       int count;
+
+       /*
+        * Wait for DMA to stop: RxRbufPktPending and RxPktInProgress are
+        * clear.
+        */
+       count = 0;
+       while (1) {
+               reg = read_csr(dd, RCV_STATUS);
+               if ((reg & (RCV_STATUS_RX_RBUF_PKT_PENDING_SMASK
+                           | RCV_STATUS_RX_PKT_IN_PROGRESS_SMASK)) == 0)
+                       break;
+               /*
+                * Give up after 1ms - maximum wait time.
+                *
+                * RBuf size is 148KiB.  Slowest possible is PCIe Gen1 x1 at
+                * 250MB/s bandwidth.  Lower rate to 66% for overhead to get:
+                *      148 KB / (66% * 250MB/s) = 920us
+                */
+               if (count++ > 500) {
+                       dd_dev_err(dd,
+                               "%s: in-progress DMA not clearing: RcvStatus 0x%llx, continuing\n",
+                               __func__, reg);
+                       break;
+               }
+               udelay(2); /* do not busy-wait the CSR */
+       }
+
+       /* start the init - expect RcvCtrl to be 0 */
+       write_csr(dd, RCV_CTRL, RCV_CTRL_RX_RBUF_INIT_SMASK);
+
+       /*
+        * Read to force the write of Rcvtrl.RxRbufInit.  There is a brief
+        * period after the write before RcvStatus.RxRbufInitDone is valid.
+        * The delay in the first run through the loop below is sufficient and
+        * required before the first read of RcvStatus.RxRbufInitDone.
+        */
+       read_csr(dd, RCV_CTRL);
+
+       /* wait for the init to finish */
+       count = 0;
+       while (1) {
+               /* delay is required first time through - see above */
+               udelay(2); /* do not busy-wait the CSR */
+               reg = read_csr(dd, RCV_STATUS);
+               if (reg & (RCV_STATUS_RX_RBUF_INIT_DONE_SMASK))
+                       break;
+
+               /* give up after 100us - slowest possible at 33MHz is 73us */
+               if (count++ > 50) {
+                       dd_dev_err(dd,
+                               "%s: RcvStatus.RxRbufInit not set, continuing\n",
+                               __func__);
+                       break;
+               }
+       }
+}
+
+/* set RXE CSRs to chip reset defaults */
+static void reset_rxe_csrs(struct hfi1_devdata *dd)
+{
+       int i, j;
+
+       /*
+        * RXE Kernel CSRs
+        */
+       write_csr(dd, RCV_CTRL, 0);
+       init_rbufs(dd);
+       /* RCV_STATUS read-only */
+       /* RCV_CONTEXTS read-only */
+       /* RCV_ARRAY_CNT read-only */
+       /* RCV_BUF_SIZE read-only */
+       write_csr(dd, RCV_BTH_QP, 0);
+       write_csr(dd, RCV_MULTICAST, 0);
+       write_csr(dd, RCV_BYPASS, 0);
+       write_csr(dd, RCV_VL15, 0);
+       /* this is a clear-down */
+       write_csr(dd, RCV_ERR_INFO,
+                       RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK);
+       /* RCV_ERR_STATUS read-only */
+       write_csr(dd, RCV_ERR_MASK, 0);
+       write_csr(dd, RCV_ERR_CLEAR, ~0ull);
+       /* RCV_ERR_FORCE leave alone */
+       for (i = 0; i < 32; i++)
+               write_csr(dd, RCV_QP_MAP_TABLE + (8 * i), 0);
+       for (i = 0; i < 4; i++)
+               write_csr(dd, RCV_PARTITION_KEY + (8 * i), 0);
+       for (i = 0; i < RXE_NUM_32_BIT_COUNTERS; i++)
+               write_csr(dd, RCV_COUNTER_ARRAY32 + (8 * i), 0);
+       for (i = 0; i < RXE_NUM_64_BIT_COUNTERS; i++)
+               write_csr(dd, RCV_COUNTER_ARRAY64 + (8 * i), 0);
+       for (i = 0; i < RXE_NUM_RSM_INSTANCES; i++) {
+               write_csr(dd, RCV_RSM_CFG + (8 * i), 0);
+               write_csr(dd, RCV_RSM_SELECT + (8 * i), 0);
+               write_csr(dd, RCV_RSM_MATCH + (8 * i), 0);
+       }
+       for (i = 0; i < 32; i++)
+               write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), 0);
+
+       /*
+        * RXE Kernel and User Per-Context CSRs
+        */
+       for (i = 0; i < dd->chip_rcv_contexts; i++) {
+               /* kernel */
+               write_kctxt_csr(dd, i, RCV_CTXT_CTRL, 0);
+               /* RCV_CTXT_STATUS read-only */
+               write_kctxt_csr(dd, i, RCV_EGR_CTRL, 0);
+               write_kctxt_csr(dd, i, RCV_TID_CTRL, 0);
+               write_kctxt_csr(dd, i, RCV_KEY_CTRL, 0);
+               write_kctxt_csr(dd, i, RCV_HDR_ADDR, 0);
+               write_kctxt_csr(dd, i, RCV_HDR_CNT, 0);
+               write_kctxt_csr(dd, i, RCV_HDR_ENT_SIZE, 0);
+               write_kctxt_csr(dd, i, RCV_HDR_SIZE, 0);
+               write_kctxt_csr(dd, i, RCV_HDR_TAIL_ADDR, 0);
+               write_kctxt_csr(dd, i, RCV_AVAIL_TIME_OUT, 0);
+               write_kctxt_csr(dd, i, RCV_HDR_OVFL_CNT, 0);
+
+               /* user */
+               /* RCV_HDR_TAIL read-only */
+               write_uctxt_csr(dd, i, RCV_HDR_HEAD, 0);
+               /* RCV_EGR_INDEX_TAIL read-only */
+               write_uctxt_csr(dd, i, RCV_EGR_INDEX_HEAD, 0);
+               /* RCV_EGR_OFFSET_TAIL read-only */
+               for (j = 0; j < RXE_NUM_TID_FLOWS; j++) {
+                       write_uctxt_csr(dd, i, RCV_TID_FLOW_TABLE + (8 * j),
+                               0);
+               }
+       }
+}
+
+/*
+ * Set sc2vl tables.
+ *
+ * They power on to zeros, so to avoid send context errors
+ * they need to be set:
+ *
+ * SC 0-7 -> VL 0-7 (respectively)
+ * SC 15  -> VL 15
+ * otherwise
+ *        -> VL 0
+ */
+static void init_sc2vl_tables(struct hfi1_devdata *dd)
+{
+       int i;
+       /* init per architecture spec, constrained by hardware capability */
+
+       /* HFI maps sent packets */
+       write_csr(dd, SEND_SC2VLT0, SC2VL_VAL(
+               0,
+               0, 0, 1, 1,
+               2, 2, 3, 3,
+               4, 4, 5, 5,
+               6, 6, 7, 7));
+       write_csr(dd, SEND_SC2VLT1, SC2VL_VAL(
+               1,
+               8, 0, 9, 0,
+               10, 0, 11, 0,
+               12, 0, 13, 0,
+               14, 0, 15, 15));
+       write_csr(dd, SEND_SC2VLT2, SC2VL_VAL(
+               2,
+               16, 0, 17, 0,
+               18, 0, 19, 0,
+               20, 0, 21, 0,
+               22, 0, 23, 0));
+       write_csr(dd, SEND_SC2VLT3, SC2VL_VAL(
+               3,
+               24, 0, 25, 0,
+               26, 0, 27, 0,
+               28, 0, 29, 0,
+               30, 0, 31, 0));
+
+       /* DC maps received packets */
+       write_csr(dd, DCC_CFG_SC_VL_TABLE_15_0, DC_SC_VL_VAL(
+               15_0,
+               0, 0, 1, 1,  2, 2,  3, 3,  4, 4,  5, 5,  6, 6,  7,  7,
+               8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15, 15));
+       write_csr(dd, DCC_CFG_SC_VL_TABLE_31_16, DC_SC_VL_VAL(
+               31_16,
+               16, 0, 17, 0, 18, 0, 19, 0, 20, 0, 21, 0, 22, 0, 23, 0,
+               24, 0, 25, 0, 26, 0, 27, 0, 28, 0, 29, 0, 30, 0, 31, 0));
+
+       /* initialize the cached sc2vl values consistently with h/w */
+       for (i = 0; i < 32; i++) {
+               if (i < 8 || i == 15)
+                       *((u8 *)(dd->sc2vl) + i) = (u8)i;
+               else
+                       *((u8 *)(dd->sc2vl) + i) = 0;
+       }
+}
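For reference, the default SC-to-VL policy that the table writes above encode can be expressed as a tiny standalone helper. This is an illustrative sketch only, not driver code; it simply restates the mapping described in the comment (SC 0-7 to VL 0-7, SC 15 to VL 15, everything else to VL 0).

#include <stdint.h>

/* Illustrative only: the power-on SC->VL mapping programmed above. */
static uint8_t default_sc_to_vl(uint8_t sc)
{
	if (sc < 8)
		return sc;	/* SC 0-7 map 1:1 to VL 0-7 */
	if (sc == 15)
		return 15;	/* SC 15 stays on VL 15 */
	return 0;		/* everything else falls back to VL 0 */
}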
+
+/*
+ * Read chip sizes and then reset parts to sane, disabled, values.  We cannot
+ * depend on the chip going through a power-on reset - a driver may be loaded
+ * and unloaded many times.
+ *
+ * Do not write any CSR values to the chip in this routine - there may be
+ * a reset following the (possible) FLR in this routine.
+ *
+ */
+static void init_chip(struct hfi1_devdata *dd)
+{
+       int i;
+
+       /*
+        * Put the HFI CSRs in a known state.
+        * Combine this with a DC reset.
+        *
+        * Stop the device from doing anything while we do a
+        * reset.  We know there are no other active users of
+        * the device since we are now in charge.  Turn off
+        * all outbound and inbound traffic and make sure
+        * the device does not generate any interrupts.
+        */
+
+       /* disable send contexts and SDMA engines */
+       write_csr(dd, SEND_CTRL, 0);
+       for (i = 0; i < dd->chip_send_contexts; i++)
+               write_kctxt_csr(dd, i, SEND_CTXT_CTRL, 0);
+       for (i = 0; i < dd->chip_sdma_engines; i++)
+               write_kctxt_csr(dd, i, SEND_DMA_CTRL, 0);
+       /* disable port (turn off RXE inbound traffic) and contexts */
+       write_csr(dd, RCV_CTRL, 0);
+       for (i = 0; i < dd->chip_rcv_contexts; i++)
+               write_csr(dd, RCV_CTXT_CTRL, 0);
+       /* mask all interrupt sources */
+       for (i = 0; i < CCE_NUM_INT_CSRS; i++)
+               write_csr(dd, CCE_INT_MASK + (8*i), 0ull);
+
+       /*
+        * DC Reset: do a full DC reset before the register clear.
+        * A recommended length of time to hold is one CSR read,
+        * so reread the CceDcCtrl.  Then, hold the DC in reset
+        * across the clear.
+        */
+       write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_DC_RESET_SMASK);
+       (void) read_csr(dd, CCE_DC_CTRL);
+
+       if (use_flr) {
+               /*
+                * A FLR will reset the SPC core and part of the PCIe.
+                * The parts that need to be restored have already been
+                * saved.
+                */
+               dd_dev_info(dd, "Resetting CSRs with FLR\n");
+
+               /* do the FLR, the DC reset will remain */
+               hfi1_pcie_flr(dd);
+
+               /* restore command and BARs */
+               restore_pci_variables(dd);
+
+               if (is_a0(dd)) {
+                       dd_dev_info(dd, "Resetting CSRs with FLR\n");
+                       hfi1_pcie_flr(dd);
+                       restore_pci_variables(dd);
+               }
+
+       } else {
+               dd_dev_info(dd, "Resetting CSRs with writes\n");
+               reset_cce_csrs(dd);
+               reset_txe_csrs(dd);
+               reset_rxe_csrs(dd);
+               reset_asic_csrs(dd);
+               reset_misc_csrs(dd);
+       }
+       /* clear the DC reset */
+       write_csr(dd, CCE_DC_CTRL, 0);
+       /* Set the LED off */
+       if (is_a0(dd))
+               setextled(dd, 0);
+       /*
+        * Clear the QSFP reset.
+        * A0 leaves the out lines floating on power on, then on an FLR
+        * enforces a 0 on all out pins.  The driver does not touch
+        * ASIC_QSFPn_OUT otherwise.  This leaves RESET_N low, holding
+        * anything plugged in constantly in reset, if it pays attention
+        * to RESET_N.
+        * A prime example of this is SiPh. For now, set all pins high.
+        * I2CCLK and I2CDAT will change per direction, and INT_N and
+        * MODPRS_N are input only and their value is ignored.
+        */
+       if (is_a0(dd)) {
+               write_csr(dd, ASIC_QSFP1_OUT, 0x1f);
+               write_csr(dd, ASIC_QSFP2_OUT, 0x1f);
+       }
+}
+
+static void init_early_variables(struct hfi1_devdata *dd)
+{
+       int i;
+
+       /* assign link credit variables */
+       dd->vau = CM_VAU;
+       dd->link_credits = CM_GLOBAL_CREDITS;
+       if (is_a0(dd))
+               dd->link_credits--;
+       dd->vcu = cu_to_vcu(hfi1_cu);
+       /* enough room for 8 MAD packets plus header - 17K */
+       dd->vl15_init = (8 * (2048 + 128)) / vau_to_au(dd->vau);
+       if (dd->vl15_init > dd->link_credits)
+               dd->vl15_init = dd->link_credits;
+
+       write_uninitialized_csrs_and_memories(dd);
+
+       if (HFI1_CAP_IS_KSET(PKEY_CHECK))
+               for (i = 0; i < dd->num_pports; i++) {
+                       struct hfi1_pportdata *ppd = &dd->pport[i];
+
+                       set_partition_keys(ppd);
+               }
+       init_sc2vl_tables(dd);
+}
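As a worked example of the VL15 credit arithmetic above: assuming vau_to_au() follows the AU definition given later in chip.h (AU = 8*2^vAU bytes, i.e. 64 bytes for CM_VAU = 3), the hypothetical helper below reproduces the calculation in plain C.

#include <stdio.h>

/* Assumption: an AU is 8 * 2^vAU bytes, 64 bytes when vau == 3. */
static unsigned int vau_to_au_bytes(unsigned int vau)
{
	return 8u << vau;
}

int main(void)
{
	unsigned int vau = 3;			/* CM_VAU */
	unsigned int bytes = 8 * (2048 + 128);	/* 8 MAD packets plus headers */

	/* 17408 / 64 = 272 VL15 credits before the link_credits clamp */
	printf("vl15_init = %u\n", bytes / vau_to_au_bytes(vau));
	return 0;
}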
+
+static void init_kdeth_qp(struct hfi1_devdata *dd)
+{
+       /* user changed the KDETH_QP */
+       if (kdeth_qp != 0 && kdeth_qp >= 0xff) {
+               /* out of range or illegal value */
+               dd_dev_err(dd, "Invalid KDETH queue pair prefix, ignoring");
+               kdeth_qp = 0;
+       }
+       if (kdeth_qp == 0)      /* not set, or failed range check */
+               kdeth_qp = DEFAULT_KDETH_QP;
+
+       write_csr(dd, SEND_BTH_QP,
+                       (kdeth_qp & SEND_BTH_QP_KDETH_QP_MASK)
+                               << SEND_BTH_QP_KDETH_QP_SHIFT);
+
+       write_csr(dd, RCV_BTH_QP,
+                       (kdeth_qp & RCV_BTH_QP_KDETH_QP_MASK)
+                               << RCV_BTH_QP_KDETH_QP_SHIFT);
+}
+
+/**
+ * init_qpmap_table
+ * @dd - device data
+ * @first_ctxt - first context
+ * @last_ctxt - last context
+ *
+ * This routine sets the qpn mapping table that
+ * is indexed by qpn[8:1].
+ *
+ * The routine will round robin the 256 settings
+ * from first_ctxt to last_ctxt.
+ *
+ * The first/last looks ahead to having specialized
+ * receive contexts for mgmt and bypass.  Normal
+ * verbs traffic is assumed to be on a range
+ * of receive contexts.
+ */
+static void init_qpmap_table(struct hfi1_devdata *dd,
+                            u32 first_ctxt,
+                            u32 last_ctxt)
+{
+       u64 reg = 0;
+       u64 regno = RCV_QP_MAP_TABLE;
+       int i;
+       u64 ctxt = first_ctxt;
+
+       for (i = 0; i < 256;) {
+               if (ctxt == VL15CTXT) {
+                       ctxt++;
+                       if (ctxt > last_ctxt)
+                               ctxt = first_ctxt;
+                       continue;
+               }
+               reg |= ctxt << (8 * (i % 8));
+               i++;
+               ctxt++;
+               if (ctxt > last_ctxt)
+                       ctxt = first_ctxt;
+               if (i % 8 == 0) {
+                       write_csr(dd, regno, reg);
+                       reg = 0;
+                       regno += 8;
+               }
+       }
+       if (i % 8)
+               write_csr(dd, regno, reg);
+
+       add_rcvctrl(dd, RCV_CTRL_RCV_QP_MAP_ENABLE_SMASK
+                       | RCV_CTRL_RCV_BYPASS_ENABLE_SMASK);
+}
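To make the table layout concrete, here is an illustrative lookup that mirrors how the writes above pack the map: 256 one-byte entries, eight per 64-bit RCV_QP_MAP_TABLE CSR, indexed by qpn[8:1]. This is a sketch for explanation only, not part of the driver.

#include <stdint.h>

/* Illustrative only: read back the context for a QPN from a local copy
 * of the 32 RCV_QP_MAP_TABLE registers filled as above. */
static uint8_t qpmap_lookup(const uint64_t map[32], uint32_t qpn)
{
	uint32_t idx = (qpn >> 1) & 0xff;	/* qpn[8:1] selects the entry */

	return (map[idx / 8] >> (8 * (idx % 8))) & 0xff;
}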
+
+/**
+ * init_qos - init RX qos
+ * @dd - device data
+ * @first_ctxt - first context to use
+ *
+ * This routine initializes Rule 0 and the
+ * RSM map table to implement qos.
+ *
+ * If all of the limit tests succeed,
+ * qos is applied based on the array
+ * interpretation of krcvqs where
+ * entry 0 is VL0.
+ *
+ * The number of vl bits (n) and the number of qpn
+ * bits (m) are computed to feed both the RSM map table
+ * and the single rule.
+ *
+ */
+static void init_qos(struct hfi1_devdata *dd, u32 first_ctxt)
+{
+       u8 max_by_vl = 0;
+       unsigned qpns_per_vl, ctxt, i, qpn, n = 1, m;
+       u64 *rsmmap;
+       u64 reg;
+       u8  rxcontext = is_a0(dd) ? 0 : 0xff;  /* 0 is default if a0 ver. */
+
+       /* validate */
+       if (dd->n_krcv_queues <= MIN_KERNEL_KCTXTS ||
+           num_vls == 1 ||
+           krcvqsset <= 1)
+               goto bail;
+       for (i = 0; i < min_t(unsigned, num_vls, krcvqsset); i++)
+               if (krcvqs[i] > max_by_vl)
+                       max_by_vl = krcvqs[i];
+       if (max_by_vl > 32)
+               goto bail;
+       qpns_per_vl = __roundup_pow_of_two(max_by_vl);
+       /* determine bits vl */
+       n = ilog2(num_vls);
+       /* determine bits for qpn */
+       m = ilog2(qpns_per_vl);
+       if ((m + n) > 7)
+               goto bail;
+       if (num_vls * qpns_per_vl > dd->chip_rcv_contexts)
+               goto bail;
+       rsmmap = kmalloc_array(NUM_MAP_REGS, sizeof(u64), GFP_KERNEL);
+       memset(rsmmap, rxcontext, NUM_MAP_REGS * sizeof(u64));
+       /* init the local copy of the table */
+       for (i = 0, ctxt = first_ctxt; i < num_vls; i++) {
+               unsigned tctxt;
+
+               for (qpn = 0, tctxt = ctxt;
+                    krcvqs[i] && qpn < qpns_per_vl; qpn++) {
+                       unsigned idx, regoff, regidx;
+
+                       /* generate index <= 128 */
+                       idx = (qpn << n) ^ i;
+                       regoff = (idx % 8) * 8;
+                       regidx = idx / 8;
+                       reg = rsmmap[regidx];
+                       /* replace 0xff with context number */
+                       reg &= ~(RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK
+                               << regoff);
+                       reg |= (u64)(tctxt++) << regoff;
+                       rsmmap[regidx] = reg;
+                       if (tctxt == ctxt + krcvqs[i])
+                               tctxt = ctxt;
+               }
+               ctxt += krcvqs[i];
+       }
+       /* flush cached copies to chip */
+       for (i = 0; i < NUM_MAP_REGS; i++)
+               write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), rsmmap[i]);
+       /* add rule0 */
+       write_csr(dd, RCV_RSM_CFG /* + (8 * 0) */,
+               RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_MASK
+                       << RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_SHIFT |
+               2ull << RCV_RSM_CFG_PACKET_TYPE_SHIFT);
+       write_csr(dd, RCV_RSM_SELECT /* + (8 * 0) */,
+               LRH_BTH_MATCH_OFFSET
+                       << RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT |
+               LRH_SC_MATCH_OFFSET << RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT |
+               LRH_SC_SELECT_OFFSET << RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT |
+               ((u64)n) << RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT |
+               QPN_SELECT_OFFSET << RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT |
+               ((u64)m + (u64)n) << RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT);
+       write_csr(dd, RCV_RSM_MATCH /* + (8 * 0) */,
+               LRH_BTH_MASK << RCV_RSM_MATCH_MASK1_SHIFT |
+               LRH_BTH_VALUE << RCV_RSM_MATCH_VALUE1_SHIFT |
+               LRH_SC_MASK << RCV_RSM_MATCH_MASK2_SHIFT |
+               LRH_SC_VALUE << RCV_RSM_MATCH_VALUE2_SHIFT);
+       /* Enable RSM */
+       add_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK);
+       kfree(rsmmap);
+       /* map everything else (non-VL15) to context 0 */
+       init_qpmap_table(
+               dd,
+               0,
+               0);
+       dd->qos_shift = n + 1;
+       return;
+bail:
+       dd->qos_shift = 1;
+       init_qpmap_table(
+               dd,
+               dd->n_krcv_queues > MIN_KERNEL_KCTXTS ? MIN_KERNEL_KCTXTS : 0,
+               dd->n_krcv_queues - 1);
+}
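The RSM map indexing used in the loop above can be summarized in a small sketch: with n VL bits and m QPN bits (m + n <= 7), entry (qpn << n) ^ vl selects one byte of the map, eight bytes per RCV_RSM_MAP_TABLE CSR. Illustrative only; the helper name is hypothetical.

#include <stdint.h>

/* Illustrative only: locate the map byte for a given VL/QPN pair. */
static void rsm_entry_pos(unsigned int vl, unsigned int qpn, unsigned int n,
			  unsigned int *regidx, unsigned int *regoff)
{
	unsigned int idx = (qpn << n) ^ vl;	/* index into the 256-entry map */

	*regidx = idx / 8;		/* which RCV_RSM_MAP_TABLE CSR */
	*regoff = (idx % 8) * 8;	/* bit offset of the byte within it */
}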
+
+static void init_rxe(struct hfi1_devdata *dd)
+{
+       /* enable all receive errors */
+       write_csr(dd, RCV_ERR_MASK, ~0ull);
+       /* setup QPN map table - start where VL15 context leaves off */
+       init_qos(
+               dd,
+               dd->n_krcv_queues > MIN_KERNEL_KCTXTS ? MIN_KERNEL_KCTXTS : 0);
+       /*
+        * make sure RcvCtrl.RcvWcb <= PCIe Device Control
+        * Register Max_Payload_Size (PCI_EXP_DEVCTL in Linux PCIe config
+        * space, PciCfgCap2.MaxPayloadSize in HFI).  There is only one
+        * invalid configuration: RcvCtrl.RcvWcb set to its max of 256 and
+        * Max_PayLoad_Size set to its minimum of 128.
+        *
+        * Presently, RcvCtrl.RcvWcb is not modified from its default of 0
+        * (64 bytes).  Max_Payload_Size is possibly modified upward in
+        * tune_pcie_caps() which is called after this routine.
+        */
+}
+
+static void init_other(struct hfi1_devdata *dd)
+{
+       /* enable all CCE errors */
+       write_csr(dd, CCE_ERR_MASK, ~0ull);
+       /* enable *some* Misc errors */
+       write_csr(dd, MISC_ERR_MASK, DRIVER_MISC_MASK);
+       /* enable all DC errors, except LCB */
+       write_csr(dd, DCC_ERR_FLG_EN, ~0ull);
+       write_csr(dd, DC_DC8051_ERR_EN, ~0ull);
+}
+
+/*
+ * Fill out the given AU table using the given CU.  A CU is defined in terms
+ * of AUs.  The table is an encoding: given the index, how many AUs does that
+ * represent?
+ *
+ * NOTE: Assumes that the register layout is the same for the
+ * local and remote tables.
+ */
+static void assign_cm_au_table(struct hfi1_devdata *dd, u32 cu,
+                              u32 csr0to3, u32 csr4to7)
+{
+       write_csr(dd, csr0to3,
+                  0ull <<
+                       SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE0_SHIFT
+               |  1ull <<
+                       SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE1_SHIFT
+               |  2ull * cu <<
+                       SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE2_SHIFT
+               |  4ull * cu <<
+                       SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE3_SHIFT);
+       write_csr(dd, csr4to7,
+                  8ull * cu <<
+                       SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE4_SHIFT
+               | 16ull * cu <<
+                       SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE5_SHIFT
+               | 32ull * cu <<
+                       SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE6_SHIFT
+               | 64ull * cu <<
+                       SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE7_SHIFT);
+
+}
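The table values written above follow a simple pattern: entry 0 encodes 0 AUs, entry 1 encodes 1 AU, and entries 2-7 encode 2, 4, 8, 16, 32 and 64 CUs worth of AUs. The helper below is an illustrative restatement only.

#include <stdint.h>

/* Illustrative only: the AU-table encoding used above. */
static void fill_au_table(uint64_t table[8], uint32_t cu)
{
	int i;

	table[0] = 0;
	table[1] = 1;
	for (i = 2; i < 8; i++)
		table[i] = (1ull << (i - 1)) * cu;	/* 2*cu, 4*cu, ... 64*cu */
}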
+
+static void assign_local_cm_au_table(struct hfi1_devdata *dd, u8 vcu)
+{
+       assign_cm_au_table(dd, vcu_to_cu(vcu), SEND_CM_LOCAL_AU_TABLE0_TO3,
+                                       SEND_CM_LOCAL_AU_TABLE4_TO7);
+}
+
+void assign_remote_cm_au_table(struct hfi1_devdata *dd, u8 vcu)
+{
+       assign_cm_au_table(dd, vcu_to_cu(vcu), SEND_CM_REMOTE_AU_TABLE0_TO3,
+                                       SEND_CM_REMOTE_AU_TABLE4_TO7);
+}
+
+static void init_txe(struct hfi1_devdata *dd)
+{
+       int i;
+
+       /* enable all PIO, SDMA, general, and Egress errors */
+       write_csr(dd, SEND_PIO_ERR_MASK, ~0ull);
+       write_csr(dd, SEND_DMA_ERR_MASK, ~0ull);
+       write_csr(dd, SEND_ERR_MASK, ~0ull);
+       write_csr(dd, SEND_EGRESS_ERR_MASK, ~0ull);
+
+       /* enable all per-context and per-SDMA engine errors */
+       for (i = 0; i < dd->chip_send_contexts; i++)
+               write_kctxt_csr(dd, i, SEND_CTXT_ERR_MASK, ~0ull);
+       for (i = 0; i < dd->chip_sdma_engines; i++)
+               write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_MASK, ~0ull);
+
+       /* set the local CU to AU mapping */
+       assign_local_cm_au_table(dd, dd->vcu);
+
+       /*
+        * Set reasonable default for Credit Return Timer
+        * Don't set on Simulator - causes it to choke.
+        */
+       if (dd->icode != ICODE_FUNCTIONAL_SIMULATOR)
+               write_csr(dd, SEND_CM_TIMER_CTRL, HFI1_CREDIT_RETURN_RATE);
+}
+
+int hfi1_set_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt, u16 jkey)
+{
+       struct hfi1_ctxtdata *rcd = dd->rcd[ctxt];
+       unsigned sctxt;
+       int ret = 0;
+       u64 reg;
+
+       if (!rcd || !rcd->sc) {
+               ret = -EINVAL;
+               goto done;
+       }
+       sctxt = rcd->sc->hw_context;
+       reg = SEND_CTXT_CHECK_JOB_KEY_MASK_SMASK | /* mask is always 1's */
+               ((jkey & SEND_CTXT_CHECK_JOB_KEY_VALUE_MASK) <<
+                SEND_CTXT_CHECK_JOB_KEY_VALUE_SHIFT);
+       /* JOB_KEY_ALLOW_PERMISSIVE is not allowed by default */
+       if (HFI1_CAP_KGET_MASK(rcd->flags, ALLOW_PERM_JKEY))
+               reg |= SEND_CTXT_CHECK_JOB_KEY_ALLOW_PERMISSIVE_SMASK;
+       write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_JOB_KEY, reg);
+       /*
+        * Enable send-side J_KEY integrity check, unless this is A0 h/w
+        * (due to A0 erratum).
+        */
+       if (!is_a0(dd)) {
+               reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE);
+               reg |= SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK;
+               write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg);
+       }
+
+       /* Enable J_KEY check on receive context. */
+       reg = RCV_KEY_CTRL_JOB_KEY_ENABLE_SMASK |
+               ((jkey & RCV_KEY_CTRL_JOB_KEY_VALUE_MASK) <<
+                RCV_KEY_CTRL_JOB_KEY_VALUE_SHIFT);
+       write_kctxt_csr(dd, ctxt, RCV_KEY_CTRL, reg);
+done:
+       return ret;
+}
+
+int hfi1_clear_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt)
+{
+       struct hfi1_ctxtdata *rcd = dd->rcd[ctxt];
+       unsigned sctxt;
+       int ret = 0;
+       u64 reg;
+
+       if (!rcd || !rcd->sc) {
+               ret = -EINVAL;
+               goto done;
+       }
+       sctxt = rcd->sc->hw_context;
+       write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_JOB_KEY, 0);
+       /*
+        * Disable send-side J_KEY integrity check, unless this is A0 h/w.
+        * This check would not have been enabled for A0 h/w, see
+        * set_ctxt_jkey().
+        */
+       if (!is_a0(dd)) {
+               reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE);
+               reg &= ~SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK;
+               write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg);
+       }
+       /* Turn off the J_KEY on the receive side */
+       write_kctxt_csr(dd, ctxt, RCV_KEY_CTRL, 0);
+done:
+       return ret;
+}
+
+int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt, u16 pkey)
+{
+       struct hfi1_ctxtdata *rcd;
+       unsigned sctxt;
+       int ret = 0;
+       u64 reg;
+
+       if (ctxt < dd->num_rcv_contexts)
+               rcd = dd->rcd[ctxt];
+       else {
+               ret = -EINVAL;
+               goto done;
+       }
+       if (!rcd || !rcd->sc) {
+               ret = -EINVAL;
+               goto done;
+       }
+       sctxt = rcd->sc->hw_context;
+       reg = ((u64)pkey & SEND_CTXT_CHECK_PARTITION_KEY_VALUE_MASK) <<
+               SEND_CTXT_CHECK_PARTITION_KEY_VALUE_SHIFT;
+       write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_PARTITION_KEY, reg);
+       reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE);
+       reg |= SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK;
+       write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg);
+done:
+       return ret;
+}
+
+int hfi1_clear_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt)
+{
+       struct hfi1_ctxtdata *rcd;
+       unsigned sctxt;
+       int ret = 0;
+       u64 reg;
+
+       if (ctxt < dd->num_rcv_contexts)
+               rcd = dd->rcd[ctxt];
+       else {
+               ret = -EINVAL;
+               goto done;
+       }
+       if (!rcd || !rcd->sc) {
+               ret = -EINVAL;
+               goto done;
+       }
+       sctxt = rcd->sc->hw_context;
+       reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE);
+       reg &= ~SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK;
+       write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg);
+       write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_PARTITION_KEY, 0);
+done:
+       return ret;
+}
+
+/*
+ * Start doing the clean up of the chip. Our clean up happens in multiple
+ * stages and this is just the first.
+ */
+void hfi1_start_cleanup(struct hfi1_devdata *dd)
+{
+       free_cntrs(dd);
+       free_rcverr(dd);
+       clean_up_interrupts(dd);
+}
+
+#define HFI_BASE_GUID(dev) \
+       ((dev)->base_guid & ~(1ULL << GUID_HFI_INDEX_SHIFT))
+
+/*
+ * Certain chip functions need to be initialized only once per asic
+ * instead of per-device. This function finds the peer device and
+ * checks whether that chip initialization needs to be done by this
+ * device.
+ */
+static void asic_should_init(struct hfi1_devdata *dd)
+{
+       unsigned long flags;
+       struct hfi1_devdata *tmp, *peer = NULL;
+
+       spin_lock_irqsave(&hfi1_devs_lock, flags);
+       /* Find our peer device */
+       list_for_each_entry(tmp, &hfi1_dev_list, list) {
+               if ((HFI_BASE_GUID(dd) == HFI_BASE_GUID(tmp)) &&
+                   dd->unit != tmp->unit) {
+                       peer = tmp;
+                       break;
+               }
+       }
+
+       /*
+        * "Claim" the ASIC for initialization if it hasn't been
+        " "claimed" yet.
+        */
+       if (!peer || !(peer->flags & HFI1_DO_INIT_ASIC))
+               dd->flags |= HFI1_DO_INIT_ASIC;
+       spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+}
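The peer test above relies on HFI_BASE_GUID masking off the per-HFI index bit, so the two HFIs on one ASIC compare equal. A minimal sketch, with guid_hfi_index_bit standing in for 1ULL << GUID_HFI_INDEX_SHIFT:

#include <stdbool.h>
#include <stdint.h>

/* Illustrative only: two GUIDs belong to the same ASIC if they differ
 * at most in the per-HFI index bit. */
static bool same_asic(uint64_t guid_a, uint64_t guid_b,
		      uint64_t guid_hfi_index_bit)
{
	return (guid_a & ~guid_hfi_index_bit) ==
	       (guid_b & ~guid_hfi_index_bit);
}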
+
+/**
+ * Allocate and initialize the device structure for the hfi.
+ * @pdev: the pci_dev for hfi1_ib device
+ * @ent: pci_device_id struct for this dev
+ *
+ * Also allocates, initializes, and returns the devdata struct for this
+ * device instance
+ *
+ * This is global, and is called directly at init to set up the
+ * chip-specific function pointers for later use.
+ */
+struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
+                                 const struct pci_device_id *ent)
+{
+       struct hfi1_devdata *dd;
+       struct hfi1_pportdata *ppd;
+       u64 reg;
+       int i, ret;
+       static const char * const inames[] = { /* implementation names */
+               "RTL silicon",
+               "RTL VCS simulation",
+               "RTL FPGA emulation",
+               "Functional simulator"
+       };
+
+       dd = hfi1_alloc_devdata(pdev,
+               NUM_IB_PORTS * sizeof(struct hfi1_pportdata));
+       if (IS_ERR(dd))
+               goto bail;
+       ppd = dd->pport;
+       for (i = 0; i < dd->num_pports; i++, ppd++) {
+               int vl;
+               /* init common fields */
+               hfi1_init_pportdata(pdev, ppd, dd, 0, 1);
+               /* DC supports 4 link widths */
+               ppd->link_width_supported =
+                       OPA_LINK_WIDTH_1X | OPA_LINK_WIDTH_2X |
+                       OPA_LINK_WIDTH_3X | OPA_LINK_WIDTH_4X;
+               ppd->link_width_downgrade_supported =
+                       ppd->link_width_supported;
+               /* start out enabling only 4X */
+               ppd->link_width_enabled = OPA_LINK_WIDTH_4X;
+               ppd->link_width_downgrade_enabled =
+                                       ppd->link_width_downgrade_supported;
+               /* link width active is 0 when link is down */
+               /* link width downgrade active is 0 when link is down */
+
+               if (num_vls < HFI1_MIN_VLS_SUPPORTED
+                       || num_vls > HFI1_MAX_VLS_SUPPORTED) {
+                       hfi1_early_err(&pdev->dev,
+                                      "Invalid num_vls %u, using %u VLs\n",
+                                   num_vls, HFI1_MAX_VLS_SUPPORTED);
+                       num_vls = HFI1_MAX_VLS_SUPPORTED;
+               }
+               ppd->vls_supported = num_vls;
+               ppd->vls_operational = ppd->vls_supported;
+               /* Set the default MTU. */
+               for (vl = 0; vl < num_vls; vl++)
+                       dd->vld[vl].mtu = hfi1_max_mtu;
+               dd->vld[15].mtu = MAX_MAD_PACKET;
+               /*
+                * Set the initial values to reasonable default, will be set
+                * for real when link is up.
+                */
+               ppd->lstate = IB_PORT_DOWN;
+               ppd->overrun_threshold = 0x4;
+               ppd->phy_error_threshold = 0xf;
+               ppd->port_crc_mode_enabled = link_crc_mask;
+               /* initialize supported LTP CRC mode */
+               ppd->port_ltp_crc_mode = cap_to_port_ltp(link_crc_mask) << 8;
+               /* initialize enabled LTP CRC mode */
+               ppd->port_ltp_crc_mode |= cap_to_port_ltp(link_crc_mask) << 4;
+               /* start in offline */
+               ppd->host_link_state = HLS_DN_OFFLINE;
+               init_vl_arb_caches(ppd);
+       }
+
+       dd->link_default = HLS_DN_POLL;
+
+       /*
+        * Do remaining PCIe setup and save PCIe values in dd.
+        * Any error printing is already done by the init code.
+        * On return, we have the chip mapped.
+        */
+       ret = hfi1_pcie_ddinit(dd, pdev, ent);
+       if (ret < 0)
+               goto bail_free;
+
+       /* verify that reads actually work, save revision for reset check */
+       dd->revision = read_csr(dd, CCE_REVISION);
+       if (dd->revision == ~(u64)0) {
+               dd_dev_err(dd, "cannot read chip CSRs\n");
+               ret = -EINVAL;
+               goto bail_cleanup;
+       }
+       dd->majrev = (dd->revision >> CCE_REVISION_CHIP_REV_MAJOR_SHIFT)
+                       & CCE_REVISION_CHIP_REV_MAJOR_MASK;
+       dd->minrev = (dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT)
+                       & CCE_REVISION_CHIP_REV_MINOR_MASK;
+
+       /* obtain the hardware ID - NOT related to unit, which is a
+          software enumeration */
+       reg = read_csr(dd, CCE_REVISION2);
+       dd->hfi1_id = (reg >> CCE_REVISION2_HFI_ID_SHIFT)
+                                       & CCE_REVISION2_HFI_ID_MASK;
+       /* the variable size will remove unwanted bits */
+       dd->icode = reg >> CCE_REVISION2_IMPL_CODE_SHIFT;
+       dd->irev = reg >> CCE_REVISION2_IMPL_REVISION_SHIFT;
+       dd_dev_info(dd, "Implementation: %s, revision 0x%x\n",
+               dd->icode < ARRAY_SIZE(inames) ? inames[dd->icode] : "unknown",
+               (int)dd->irev);
+
+       /* speeds the hardware can support */
+       dd->pport->link_speed_supported = OPA_LINK_SPEED_25G;
+       /* speeds allowed to run at */
+       dd->pport->link_speed_enabled = dd->pport->link_speed_supported;
+       /* give a reasonable active value, will be set on link up */
+       dd->pport->link_speed_active = OPA_LINK_SPEED_25G;
+
+       dd->chip_rcv_contexts = read_csr(dd, RCV_CONTEXTS);
+       dd->chip_send_contexts = read_csr(dd, SEND_CONTEXTS);
+       dd->chip_sdma_engines = read_csr(dd, SEND_DMA_ENGINES);
+       dd->chip_pio_mem_size = read_csr(dd, SEND_PIO_MEM_SIZE);
+       dd->chip_sdma_mem_size = read_csr(dd, SEND_DMA_MEM_SIZE);
+       /* fix up link widths for emulation _p */
+       ppd = dd->pport;
+       if (dd->icode == ICODE_FPGA_EMULATION && is_emulator_p(dd)) {
+               ppd->link_width_supported =
+                       ppd->link_width_enabled =
+                       ppd->link_width_downgrade_supported =
+                       ppd->link_width_downgrade_enabled =
+                               OPA_LINK_WIDTH_1X;
+       }
+       /* ensure num_vls isn't larger than the number of sdma engines */
+       if (HFI1_CAP_IS_KSET(SDMA) && num_vls > dd->chip_sdma_engines) {
+               dd_dev_err(dd, "num_vls %u too large, using %u VLs\n",
+                               num_vls, HFI1_MAX_VLS_SUPPORTED);
+               ppd->vls_supported = num_vls = HFI1_MAX_VLS_SUPPORTED;
+               ppd->vls_operational = ppd->vls_supported;
+       }
+
+       /*
+        * Convert the ns parameter to the 64 * cclocks used in the CSR.
+        * Limit the max if larger than the field holds.  If timeout is
+        * non-zero, then the calculated field will be at least 1.
+        *
+        * Must be after icode is set up - the cclock rate depends
+        * on knowing the hardware being used.
+        */
+       dd->rcv_intr_timeout_csr = ns_to_cclock(dd, rcv_intr_timeout) / 64;
+       if (dd->rcv_intr_timeout_csr >
+                       RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_MASK)
+               dd->rcv_intr_timeout_csr =
+                       RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_MASK;
+       else if (dd->rcv_intr_timeout_csr == 0 && rcv_intr_timeout)
+               dd->rcv_intr_timeout_csr = 1;
+
+       /* obtain chip sizes, reset chip CSRs */
+       init_chip(dd);
+
+       /* read in the PCIe link speed information */
+       ret = pcie_speeds(dd);
+       if (ret)
+               goto bail_cleanup;
+
+       /* needs to be done before we look for the peer device */
+       read_guid(dd);
+
+       asic_should_init(dd);
+
+       /* read in firmware */
+       ret = hfi1_firmware_init(dd);
+       if (ret)
+               goto bail_cleanup;
+
+       /*
+        * In general, the PCIe Gen3 transition must occur after the
+        * chip has been idled (so it won't initiate any PCIe transactions
+        * e.g. an interrupt) and before the driver changes any registers
+        * (the transition will reset the registers).
+        *
+        * In particular, place this call after:
+        * - init_chip()     - the chip will not initiate any PCIe transactions
+        * - pcie_speeds()   - reads the current link speed
+        * - hfi1_firmware_init() - the needed firmware is ready to be
+        *                          downloaded
+        */
+       ret = do_pcie_gen3_transition(dd);
+       if (ret)
+               goto bail_cleanup;
+
+       /* start setting dd values and adjusting CSRs */
+       init_early_variables(dd);
+
+       parse_platform_config(dd);
+
+       /* add board names as they are defined */
+       dd->boardname = kmalloc(64, GFP_KERNEL);
+       if (!dd->boardname)
+               goto bail_cleanup;
+       snprintf(dd->boardname, 64, "Board ID 0x%llx",
+                dd->revision >> CCE_REVISION_BOARD_ID_LOWER_NIBBLE_SHIFT
+                   & CCE_REVISION_BOARD_ID_LOWER_NIBBLE_MASK);
+
+       snprintf(dd->boardversion, BOARD_VERS_MAX,
+                "ChipABI %u.%u, %s, ChipRev %u.%u, SW Compat %llu\n",
+                HFI1_CHIP_VERS_MAJ, HFI1_CHIP_VERS_MIN,
+                dd->boardname,
+                (u32)dd->majrev,
+                (u32)dd->minrev,
+                (dd->revision >> CCE_REVISION_SW_SHIFT)
+                   & CCE_REVISION_SW_MASK);
+
+       ret = set_up_context_variables(dd);
+       if (ret)
+               goto bail_cleanup;
+
+       /* set initial RXE CSRs */
+       init_rxe(dd);
+       /* set initial TXE CSRs */
+       init_txe(dd);
+       /* set initial non-RXE, non-TXE CSRs */
+       init_other(dd);
+       /* set up KDETH QP prefix in both RX and TX CSRs */
+       init_kdeth_qp(dd);
+
+       /* send contexts must be set up before receive contexts */
+       ret = init_send_contexts(dd);
+       if (ret)
+               goto bail_cleanup;
+
+       ret = hfi1_create_ctxts(dd);
+       if (ret)
+               goto bail_cleanup;
+
+       dd->rcvhdrsize = DEFAULT_RCVHDRSIZE;
+       /*
+        * rcd[0] is guaranteed to be valid by this point. Also, all
+        * contexts are using the same value, as per the module parameter.
+        */
+       dd->rhf_offset = dd->rcd[0]->rcvhdrqentsize - sizeof(u64) / sizeof(u32);
+
+       ret = init_pervl_scs(dd);
+       if (ret)
+               goto bail_cleanup;
+
+       /* sdma init */
+       for (i = 0; i < dd->num_pports; ++i) {
+               ret = sdma_init(dd, i);
+               if (ret)
+                       goto bail_cleanup;
+       }
+
+       /* use contexts created by hfi1_create_ctxts */
+       ret = set_up_interrupts(dd);
+       if (ret)
+               goto bail_cleanup;
+
+       /* set up LCB access - must be after set_up_interrupts() */
+       init_lcb_access(dd);
+
+       snprintf(dd->serial, SERIAL_MAX, "0x%08llx\n",
+                dd->base_guid & 0xFFFFFF);
+
+       dd->oui1 = dd->base_guid >> 56 & 0xFF;
+       dd->oui2 = dd->base_guid >> 48 & 0xFF;
+       dd->oui3 = dd->base_guid >> 40 & 0xFF;
+
+       ret = load_firmware(dd); /* asymmetric with dispose_firmware() */
+       if (ret)
+               goto bail_clear_intr;
+       check_fabric_firmware_versions(dd);
+
+       thermal_init(dd);
+
+       ret = init_cntrs(dd);
+       if (ret)
+               goto bail_clear_intr;
+
+       ret = init_rcverr(dd);
+       if (ret)
+               goto bail_free_cntrs;
+
+       ret = eprom_init(dd);
+       if (ret)
+               goto bail_free_rcverr;
+
+       goto bail;
+
+bail_free_rcverr:
+       free_rcverr(dd);
+bail_free_cntrs:
+       free_cntrs(dd);
+bail_clear_intr:
+       clean_up_interrupts(dd);
+bail_cleanup:
+       hfi1_pcie_ddcleanup(dd);
+bail_free:
+       hfi1_free_devdata(dd);
+       dd = ERR_PTR(ret);
+bail:
+       return dd;
+}
+
+static u16 delay_cycles(struct hfi1_pportdata *ppd, u32 desired_egress_rate,
+                       u32 dw_len)
+{
+       u32 delta_cycles;
+       u32 current_egress_rate = ppd->current_egress_rate;
+       /* rates here are in units of 10^6 bits/sec */
+
+       if (desired_egress_rate == -1)
+               return 0; /* shouldn't happen */
+
+       if (desired_egress_rate >= current_egress_rate)
+               return 0; /* we can't help go faster, only slower */
+
+       delta_cycles = egress_cycles(dw_len * 4, desired_egress_rate) -
+                       egress_cycles(dw_len * 4, current_egress_rate);
+
+       return (u16)delta_cycles;
+}
+
+/**
+ * create_pbc - build a pbc for transmission
+ * @flags: special case flags or-ed in built pbc
+ * @srate: static rate
+ * @vl: vl
+ * @dwlen: dword length (header words + data words + pbc words)
+ *
+ * Create a PBC with the given flags, rate, VL, and length.
+ *
+ * NOTE: The PBC created will not insert any HCRC - all callers but one are
+ * for verbs, which does not use this PSM feature.  The lone other caller
+ * is for the diagnostic interface which calls this if the user does not
+ * supply their own PBC.
+ */
+u64 create_pbc(struct hfi1_pportdata *ppd, u64 flags, int srate_mbs, u32 vl,
+              u32 dw_len)
+{
+       u64 pbc, delay = 0;
+
+       if (unlikely(srate_mbs))
+               delay = delay_cycles(ppd, srate_mbs, dw_len);
+
+       pbc = flags
+               | (delay << PBC_STATIC_RATE_CONTROL_COUNT_SHIFT)
+               | ((u64)PBC_IHCRC_NONE << PBC_INSERT_HCRC_SHIFT)
+               | (vl & PBC_VL_MASK) << PBC_VL_SHIFT
+               | (dw_len & PBC_LENGTH_DWS_MASK)
+                       << PBC_LENGTH_DWS_SHIFT;
+
+       return pbc;
+}
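A hedged usage sketch for create_pbc(): a verbs-style caller holding a valid ppd might build the PBC for a 42-dword packet on VL 0 with no static rate throttling as below. The numbers are invented purely for illustration.

	/* Hypothetical call: no flags, no static rate, VL 0, 42 dwords */
	u64 pbc = create_pbc(ppd, 0, 0, 0, 42);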
+
+#define SBUS_THERMAL    0x4f
+#define SBUS_THERM_MONITOR_MODE 0x1
+
+#define THERM_FAILURE(dev, ret, reason) \
+       dd_dev_err((dd),                                                \
+                  "Thermal sensor initialization failed: %s (%d)\n",   \
+                  (reason), (ret))
+
+/*
+ * Initialize the Avago Thermal sensor.
+ *
+ * After initialization, enable polling of the thermal sensor through
+ * the SBus interface. For this to work, the SBus Master
+ * firmware must be loaded because the HW polling
+ * logic uses SBus interrupts, which are not supported with the
+ * default firmware. Otherwise, no data will be returned through
+ * the ASIC_STS_THERM CSR.
+ */
+static int thermal_init(struct hfi1_devdata *dd)
+{
+       int ret = 0;
+
+       if (dd->icode != ICODE_RTL_SILICON ||
+           !(dd->flags & HFI1_DO_INIT_ASIC))
+               return ret;
+
+       acquire_hw_mutex(dd);
+       dd_dev_info(dd, "Initializing thermal sensor\n");
+       /* Thermal Sensor Initialization */
+       /*    Step 1: Reset the Thermal SBus Receiver */
+       ret = sbus_request_slow(dd, SBUS_THERMAL, 0x0,
+                               RESET_SBUS_RECEIVER, 0);
+       if (ret) {
+               THERM_FAILURE(dd, ret, "Bus Reset");
+               goto done;
+       }
+       /*    Step 2: Set Reset bit in Thermal block */
+       ret = sbus_request_slow(dd, SBUS_THERMAL, 0x0,
+                               WRITE_SBUS_RECEIVER, 0x1);
+       if (ret) {
+               THERM_FAILURE(dd, ret, "Therm Block Reset");
+               goto done;
+       }
+       /*    Step 3: Write clock divider value (100MHz -> 2MHz) */
+       ret = sbus_request_slow(dd, SBUS_THERMAL, 0x1,
+                               WRITE_SBUS_RECEIVER, 0x32);
+       if (ret) {
+               THERM_FAILURE(dd, ret, "Write Clock Div");
+               goto done;
+       }
+       /*    Step 4: Select temperature mode */
+       ret = sbus_request_slow(dd, SBUS_THERMAL, 0x3,
+                               WRITE_SBUS_RECEIVER,
+                               SBUS_THERM_MONITOR_MODE);
+       if (ret) {
+               THERM_FAILURE(dd, ret, "Write Mode Sel");
+               goto done;
+       }
+       /*    Step 5: De-assert block reset and start conversion */
+       ret = sbus_request_slow(dd, SBUS_THERMAL, 0x0,
+                               WRITE_SBUS_RECEIVER, 0x2);
+       if (ret) {
+               THERM_FAILURE(dd, ret, "Write Reset Deassert");
+               goto done;
+       }
+       /*    Step 5.1: Wait for first conversion (21.5ms per spec) */
+       msleep(22);
+
+       /* Enable polling of thermal readings */
+       write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x1);
+done:
+       release_hw_mutex(dd);
+       return ret;
+}
+
+static void handle_temp_err(struct hfi1_devdata *dd)
+{
+       struct hfi1_pportdata *ppd = &dd->pport[0];
+       /*
+        * Thermal Critical Interrupt
+        * Put the device into forced freeze mode, take link down to
+        * offline, and put DC into reset.
+        */
+       dd_dev_emerg(dd,
+                    "Critical temperature reached! Forcing device into freeze mode!\n");
+       dd->flags |= HFI1_FORCED_FREEZE;
+       start_freeze_handling(ppd, FREEZE_SELF|FREEZE_ABORT);
+       /*
+        * Shut DC down as much and as quickly as possible.
+        *
+        * Step 1: Take the link down to OFFLINE. This will cause the
+        *         8051 to put the Serdes in reset. However, we don't want to
+        *         go through the entire link state machine since we want to
+        *         shutdown ASAP. Furthermore, this is not a graceful shutdown
+        *         but rather an attempt to save the chip.
+        *         Code below is almost the same as quiet_serdes() but avoids
+        *         all the extra work and the sleeps.
+        */
+       ppd->driver_link_ready = 0;
+       ppd->link_enabled = 0;
+       set_physical_link_state(dd, PLS_OFFLINE |
+                               (OPA_LINKDOWN_REASON_SMA_DISABLED << 8));
+       /*
+        * Step 2: Shutdown LCB and 8051
+        *         After shutdown, do not restore DC_CFG_RESET value.
+        */
+       dc_shutdown(dd);
+}
diff --git a/drivers/staging/rdma/hfi1/chip.h b/drivers/staging/rdma/hfi1/chip.h
new file mode 100644 (file)
index 0000000..f89a432
--- /dev/null
@@ -0,0 +1,1035 @@
+#ifndef _CHIP_H
+#define _CHIP_H
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * This file contains all of the defines that are specific to the HFI chip
+ */
+
+/* sizes */
+#define CCE_NUM_MSIX_VECTORS 256
+#define CCE_NUM_INT_CSRS 12
+#define CCE_NUM_INT_MAP_CSRS 96
+#define NUM_INTERRUPT_SOURCES 768
+#define RXE_NUM_CONTEXTS 160
+#define RXE_PER_CONTEXT_SIZE 0x1000    /* 4k */
+#define RXE_NUM_TID_FLOWS 32
+#define RXE_NUM_DATA_VL 8
+#define TXE_NUM_CONTEXTS 160
+#define TXE_NUM_SDMA_ENGINES 16
+#define NUM_CONTEXTS_PER_SET 8
+#define VL_ARB_HIGH_PRIO_TABLE_SIZE 16
+#define VL_ARB_LOW_PRIO_TABLE_SIZE 16
+#define VL_ARB_TABLE_SIZE 16
+#define TXE_NUM_32_BIT_COUNTER 7
+#define TXE_NUM_64_BIT_COUNTER 30
+#define TXE_NUM_DATA_VL 8
+#define TXE_PIO_SIZE (32 * 0x100000)   /* 32 MB */
+#define PIO_BLOCK_SIZE 64                      /* bytes */
+#define SDMA_BLOCK_SIZE 64                     /* bytes */
+#define RCV_BUF_BLOCK_SIZE 64               /* bytes */
+#define PIO_CMASK 0x7ff        /* counter mask for free and fill counters */
+#define MAX_EAGER_ENTRIES    2048      /* max receive eager entries */
+#define MAX_TID_PAIR_ENTRIES 1024      /* max receive expected pairs */
+/* Virtual Allocation Unit, defined as AU = 8*2^vAU; AU is fixed
+   at 64 bytes for all generation one devices */
+#define CM_VAU 3
+/* HFI link credit count, AKA receive buffer depth (RBUF_DEPTH) */
+#define CM_GLOBAL_CREDITS 0x940
+/* Number of PKey entries in the HW */
+#define MAX_PKEY_VALUES 16
+
+#include "chip_registers.h"
+
+#define RXE_PER_CONTEXT_USER   (RXE + RXE_PER_CONTEXT_OFFSET)
+#define TXE_PIO_SEND (TXE + TXE_PIO_SEND_OFFSET)
+
+/* PBC flags */
+#define PBC_INTR               (1ull << 31)
+#define PBC_DC_INFO_SHIFT      (30)
+#define PBC_DC_INFO            (1ull << PBC_DC_INFO_SHIFT)
+#define PBC_TEST_EBP   (1ull << 29)
+#define PBC_PACKET_BYPASS      (1ull << 28)
+#define PBC_CREDIT_RETURN      (1ull << 25)
+#define PBC_INSERT_BYPASS_ICRC (1ull << 24)
+#define PBC_TEST_BAD_ICRC      (1ull << 23)
+#define PBC_FECN               (1ull << 22)
+
+/* PbcInsertHcrc field settings */
+#define PBC_IHCRC_LKDETH 0x0   /* insert @ local KDETH offset */
+#define PBC_IHCRC_GKDETH 0x1   /* insert @ global KDETH offset */
+#define PBC_IHCRC_NONE   0x2   /* no HCRC inserted */
+
+/* PBC fields */
+#define PBC_STATIC_RATE_CONTROL_COUNT_SHIFT 32
+#define PBC_STATIC_RATE_CONTROL_COUNT_MASK 0xffffull
+#define PBC_STATIC_RATE_CONTROL_COUNT_SMASK \
+       (PBC_STATIC_RATE_CONTROL_COUNT_MASK << \
+       PBC_STATIC_RATE_CONTROL_COUNT_SHIFT)
+
+#define PBC_INSERT_HCRC_SHIFT 26
+#define PBC_INSERT_HCRC_MASK 0x3ull
+#define PBC_INSERT_HCRC_SMASK \
+       (PBC_INSERT_HCRC_MASK << PBC_INSERT_HCRC_SHIFT)
+
+#define PBC_VL_SHIFT 12
+#define PBC_VL_MASK 0xfull
+#define PBC_VL_SMASK (PBC_VL_MASK << PBC_VL_SHIFT)
+
+#define PBC_LENGTH_DWS_SHIFT 0
+#define PBC_LENGTH_DWS_MASK 0xfffull
+#define PBC_LENGTH_DWS_SMASK \
+       (PBC_LENGTH_DWS_MASK << PBC_LENGTH_DWS_SHIFT)
+
+/* Credit Return Fields */
+#define CR_COUNTER_SHIFT 0
+#define CR_COUNTER_MASK 0x7ffull
+#define CR_COUNTER_SMASK (CR_COUNTER_MASK << CR_COUNTER_SHIFT)
+
+#define CR_STATUS_SHIFT 11
+#define CR_STATUS_MASK 0x1ull
+#define CR_STATUS_SMASK (CR_STATUS_MASK << CR_STATUS_SHIFT)
+
+#define CR_CREDIT_RETURN_DUE_TO_PBC_SHIFT 12
+#define CR_CREDIT_RETURN_DUE_TO_PBC_MASK 0x1ull
+#define CR_CREDIT_RETURN_DUE_TO_PBC_SMASK \
+       (CR_CREDIT_RETURN_DUE_TO_PBC_MASK << \
+       CR_CREDIT_RETURN_DUE_TO_PBC_SHIFT)
+
+#define CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SHIFT 13
+#define CR_CREDIT_RETURN_DUE_TO_THRESHOLD_MASK 0x1ull
+#define CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SMASK \
+       (CR_CREDIT_RETURN_DUE_TO_THRESHOLD_MASK << \
+       CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SHIFT)
+
+#define CR_CREDIT_RETURN_DUE_TO_ERR_SHIFT 14
+#define CR_CREDIT_RETURN_DUE_TO_ERR_MASK 0x1ull
+#define CR_CREDIT_RETURN_DUE_TO_ERR_SMASK \
+       (CR_CREDIT_RETURN_DUE_TO_ERR_MASK << \
+       CR_CREDIT_RETURN_DUE_TO_ERR_SHIFT)
+
+#define CR_CREDIT_RETURN_DUE_TO_FORCE_SHIFT 15
+#define CR_CREDIT_RETURN_DUE_TO_FORCE_MASK 0x1ull
+#define CR_CREDIT_RETURN_DUE_TO_FORCE_SMASK \
+       (CR_CREDIT_RETURN_DUE_TO_FORCE_MASK << \
+       CR_CREDIT_RETURN_DUE_TO_FORCE_SHIFT)
+
+/* interrupt source numbers */
+#define IS_GENERAL_ERR_START     0
+#define IS_SDMAENG_ERR_START    16
+#define IS_SENDCTXT_ERR_START   32
+#define IS_SDMA_START          192 /* includes SDmaProgress,SDmaIdle */
+#define IS_VARIOUS_START               240
+#define IS_DC_START                    248
+#define IS_RCVAVAIL_START              256
+#define IS_RCVURGENT_START             416
+#define IS_SENDCREDIT_START            576
+#define IS_RESERVED_START              736
+#define IS_MAX_SOURCES         768
+
+/* derived interrupt source values */
+#define IS_GENERAL_ERR_END             IS_SDMAENG_ERR_START
+#define IS_SDMAENG_ERR_END             IS_SENDCTXT_ERR_START
+#define IS_SENDCTXT_ERR_END            IS_SDMA_START
+#define IS_SDMA_END                    IS_VARIOUS_START
+#define IS_VARIOUS_END         IS_DC_START
+#define IS_DC_END                      IS_RCVAVAIL_START
+#define IS_RCVAVAIL_END                IS_RCVURGENT_START
+#define IS_RCVURGENT_END               IS_SENDCREDIT_START
+#define IS_SENDCREDIT_END              IS_RESERVED_START
+#define IS_RESERVED_END                IS_MAX_SOURCES
+
+/* absolute interrupt numbers for QSFP1Int and QSFP2Int */
+#define QSFP1_INT              242
+#define QSFP2_INT              243
+
+/* DCC_CFG_PORT_CONFIG logical link states */
+#define LSTATE_DOWN    0x1
+#define LSTATE_INIT    0x2
+#define LSTATE_ARMED   0x3
+#define LSTATE_ACTIVE  0x4
+
+/* DC8051_STS_CUR_STATE port values (physical link states) */
+#define PLS_DISABLED                      0x30
+#define PLS_OFFLINE                               0x90
+#define PLS_OFFLINE_QUIET                         0x90
+#define PLS_OFFLINE_PLANNED_DOWN_INFORM           0x91
+#define PLS_OFFLINE_READY_TO_QUIET_LT     0x92
+#define PLS_OFFLINE_REPORT_FAILURE                0x93
+#define PLS_OFFLINE_READY_TO_QUIET_BCC    0x94
+#define PLS_POLLING                               0x20
+#define PLS_POLLING_QUIET                         0x20
+#define PLS_POLLING_ACTIVE                        0x21
+#define PLS_CONFIGPHY                     0x40
+#define PLS_CONFIGPHY_DEBOUCE             0x40
+#define PLS_CONFIGPHY_ESTCOMM             0x41
+#define PLS_CONFIGPHY_ESTCOMM_TXRX_HUNT           0x42
+#define PLS_CONFIGPHY_ESTcOMM_LOCAL_COMPLETE   0x43
+#define PLS_CONFIGPHY_OPTEQ                       0x44
+#define PLS_CONFIGPHY_OPTEQ_OPTIMIZING    0x44
+#define PLS_CONFIGPHY_OPTEQ_LOCAL_COMPLETE        0x45
+#define PLS_CONFIGPHY_VERIFYCAP                   0x46
+#define PLS_CONFIGPHY_VERIFYCAP_EXCHANGE          0x46
+#define PLS_CONFIGPHY_VERIFYCAP_LOCAL_COMPLETE 0x47
+#define PLS_CONFIGLT                      0x48
+#define PLS_CONFIGLT_CONFIGURE            0x48
+#define PLS_CONFIGLT_LINK_TRANSFER_ACTIVE         0x49
+#define PLS_LINKUP                                0x50
+#define PLS_PHYTEST                               0xB0
+#define PLS_INTERNAL_SERDES_LOOPBACK      0xe1
+#define PLS_QUICK_LINKUP                          0xe2
+
+/* DC_DC8051_CFG_HOST_CMD_0.REQ_TYPE - 8051 host commands */
+#define HCMD_LOAD_CONFIG_DATA  0x01
+#define HCMD_READ_CONFIG_DATA  0x02
+#define HCMD_CHANGE_PHY_STATE  0x03
+#define HCMD_SEND_LCB_IDLE_MSG 0x04
+#define HCMD_MISC                 0x05
+#define HCMD_READ_LCB_IDLE_MSG 0x06
+#define HCMD_READ_LCB_CSR      0x07
+#define HCMD_INTERFACE_TEST       0xff
+
+/* DC_DC8051_CFG_HOST_CMD_1.RETURN_CODE - 8051 host command return */
+#define HCMD_SUCCESS 2
+
+/* DC_DC8051_DBG_ERR_INFO_SET_BY_8051.ERROR - error flags */
+#define SPICO_ROM_FAILED                   (1 <<  0)
+#define UNKNOWN_FRAME              (1 <<  1)
+#define TARGET_BER_NOT_MET                 (1 <<  2)
+#define FAILED_SERDES_INTERNAL_LOOPBACK (1 <<  3)
+#define FAILED_SERDES_INIT                 (1 <<  4)
+#define FAILED_LNI_POLLING                 (1 <<  5)
+#define FAILED_LNI_DEBOUNCE                (1 <<  6)
+#define FAILED_LNI_ESTBCOMM                (1 <<  7)
+#define FAILED_LNI_OPTEQ                   (1 <<  8)
+#define FAILED_LNI_VERIFY_CAP1     (1 <<  9)
+#define FAILED_LNI_VERIFY_CAP2     (1 << 10)
+#define FAILED_LNI_CONFIGLT                (1 << 11)
+
+#define FAILED_LNI (FAILED_LNI_POLLING | FAILED_LNI_DEBOUNCE \
+                       | FAILED_LNI_ESTBCOMM | FAILED_LNI_OPTEQ \
+                       | FAILED_LNI_VERIFY_CAP1 \
+                       | FAILED_LNI_VERIFY_CAP2 \
+                       | FAILED_LNI_CONFIGLT)
+
+/* DC_DC8051_DBG_ERR_INFO_SET_BY_8051.HOST_MSG - host message flags */
+#define HOST_REQ_DONE     (1 << 0)
+#define BC_PWR_MGM_MSG    (1 << 1)
+#define BC_SMA_MSG                (1 << 2)
+#define BC_BCC_UNKOWN_MSG         (1 << 3)
+#define BC_IDLE_UNKNOWN_MSG       (1 << 4)
+#define EXT_DEVICE_CFG_REQ        (1 << 5)
+#define VERIFY_CAP_FRAME          (1 << 6)
+#define LINKUP_ACHIEVED           (1 << 7)
+#define LINK_GOING_DOWN           (1 << 8)
+#define LINK_WIDTH_DOWNGRADED  (1 << 9)
+
+/* DC_DC8051_CFG_EXT_DEV_1.REQ_TYPE - 8051 host requests */
+#define HREQ_LOAD_CONFIG       0x01
+#define HREQ_SAVE_CONFIG       0x02
+#define HREQ_READ_CONFIG       0x03
+#define HREQ_SET_TX_EQ_ABS     0x04
+#define HREQ_SET_TX_EQ_REL     0x05
+#define HREQ_ENABLE            0x06
+#define HREQ_CONFIG_DONE       0xfe
+#define HREQ_INTERFACE_TEST    0xff
+
+/* DC_DC8051_CFG_EXT_DEV_0.RETURN_CODE - 8051 host request return codes */
+#define HREQ_INVALID           0x01
+#define HREQ_SUCCESS           0x02
+#define HREQ_NOT_SUPPORTED             0x03
+#define HREQ_FEATURE_NOT_SUPPORTED     0x04 /* request specific feature */
+#define HREQ_REQUEST_REJECTED  0xfe
+#define HREQ_EXECUTION_ONGOING 0xff
+
+/* MISC host command functions */
+#define HCMD_MISC_REQUEST_LCB_ACCESS 0x1
+#define HCMD_MISC_GRANT_LCB_ACCESS   0x2
+
+/* idle flit message types */
+#define IDLE_PHYSICAL_LINK_MGMT 0x1
+#define IDLE_CRU                   0x2
+#define IDLE_SMA                   0x3
+#define IDLE_POWER_MGMT            0x4
+
+/* idle flit message send fields (both send and read) */
+#define IDLE_PAYLOAD_MASK 0xffffffffffull /* 40 bits */
+#define IDLE_PAYLOAD_SHIFT 8
+#define IDLE_MSG_TYPE_MASK 0xf
+#define IDLE_MSG_TYPE_SHIFT 0
+
+/* idle flit message read fields */
+#define READ_IDLE_MSG_TYPE_MASK 0xf
+#define READ_IDLE_MSG_TYPE_SHIFT 0
+
+/* SMA idle flit payload commands */
+#define SMA_IDLE_ARM   1
+#define SMA_IDLE_ACTIVE 2
+
+/* DC_DC8051_CFG_MODE.GENERAL bits */
+#define DISABLE_SELF_GUID_CHECK 0x2
+
+/*
+ * Eager buffer minimum and maximum sizes supported by the hardware.
+ * All power-of-two sizes in between are supported as well.
+ * MAX_EAGER_BUFFER_TOTAL is the maximum size of memory
+ * allocatable for Eager buffers by a single context. All others
+ * are limits for the RcvArray entries.
+ */
+#define MIN_EAGER_BUFFER       (4 * 1024)
+#define MAX_EAGER_BUFFER       (256 * 1024)
+#define MAX_EAGER_BUFFER_TOTAL (64 * (1 << 20)) /* max per ctxt 64MB */
+#define MAX_EXPECTED_BUFFER    (2048 * 1024)
+
+/*
+ * Receive expected base and count and eager base and count increment -
+ * the CSR fields hold multiples of this value.
+ */
+#define RCV_SHIFT 3
+#define RCV_INCREMENT (1 << RCV_SHIFT)
+
+/*
+ * Receive header queue entry increment - the CSR holds multiples of
+ * this value.
+ */
+#define HDRQ_SIZE_SHIFT 5
+#define HDRQ_INCREMENT (1 << HDRQ_SIZE_SHIFT)
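A minimal sketch of what these increments imply, assuming the CSR fields simply hold the count divided by the increment (as the comments above say they hold multiples of it); the helper names are hypothetical.

/* Illustrative only: encode entry counts in CSR units. */
static inline unsigned int encode_eager_count(unsigned int entries)
{
	return entries >> RCV_SHIFT;		/* units of 8 entries */
}

static inline unsigned int encode_hdrq_count(unsigned int entries)
{
	return entries >> HDRQ_SIZE_SHIFT;	/* units of 32 entries */
}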
+
+/*
+ * Freeze handling flags
+ */
+#define FREEZE_ABORT     0x01  /* do not do recovery */
+#define FREEZE_SELF         0x02       /* initiate the freeze */
+#define FREEZE_LINK_DOWN 0x04  /* link is down */
+
+/*
+ * Chip implementation codes.
+ */
+#define ICODE_RTL_SILICON              0x00
+#define ICODE_RTL_VCS_SIMULATION       0x01
+#define ICODE_FPGA_EMULATION   0x02
+#define ICODE_FUNCTIONAL_SIMULATOR     0x03
+
+/*
+ * 8051 data memory size.
+ */
+#define DC8051_DATA_MEM_SIZE 0x1000
+
+/*
+ * 8051 firmware registers
+ */
+#define NUM_GENERAL_FIELDS 0x17
+#define NUM_LANE_FIELDS    0x8
+
+/* 8051 general register Field IDs */
+#define TX_SETTINGS                 0x06
+#define VERIFY_CAP_LOCAL_PHY        0x07
+#define VERIFY_CAP_LOCAL_FABRIC             0x08
+#define VERIFY_CAP_LOCAL_LINK_WIDTH  0x09
+#define LOCAL_DEVICE_ID                     0x0a
+#define LOCAL_LNI_INFO              0x0c
+#define REMOTE_LNI_INFO              0x0d
+#define MISC_STATUS                 0x0e
+#define VERIFY_CAP_REMOTE_PHY       0x0f
+#define VERIFY_CAP_REMOTE_FABRIC     0x10
+#define VERIFY_CAP_REMOTE_LINK_WIDTH 0x11
+#define LAST_LOCAL_STATE_COMPLETE    0x12
+#define LAST_REMOTE_STATE_COMPLETE   0x13
+#define LINK_QUALITY_INFO            0x14
+#define REMOTE_DEVICE_ID            0x15
+
+/* Lane ID for general configuration registers */
+#define GENERAL_CONFIG 4
+
+/* LOAD_DATA 8051 command shifts and fields */
+#define LOAD_DATA_FIELD_ID_SHIFT 40
+#define LOAD_DATA_FIELD_ID_MASK 0xfull
+#define LOAD_DATA_LANE_ID_SHIFT 32
+#define LOAD_DATA_LANE_ID_MASK 0xfull
+#define LOAD_DATA_DATA_SHIFT   0x0
+#define LOAD_DATA_DATA_MASK   0xffffffffull
+
+/* READ_DATA 8051 command shifts and fields */
+#define READ_DATA_FIELD_ID_SHIFT 40
+#define READ_DATA_FIELD_ID_MASK 0xffull
+#define READ_DATA_LANE_ID_SHIFT 32
+#define READ_DATA_LANE_ID_MASK 0xffull
+#define READ_DATA_DATA_SHIFT   0x0
+#define READ_DATA_DATA_MASK   0xffffffffull
+
+/* TX settings fields */
+#define ENABLE_LANE_TX_SHIFT           0
+#define ENABLE_LANE_TX_MASK            0xff
+#define TX_POLARITY_INVERSION_SHIFT    8
+#define TX_POLARITY_INVERSION_MASK     0xff
+#define RX_POLARITY_INVERSION_SHIFT    16
+#define RX_POLARITY_INVERSION_MASK     0xff
+#define MAX_RATE_SHIFT                 24
+#define MAX_RATE_MASK                  0xff
+
+/* verify capability PHY fields */
+#define CONTINIOUS_REMOTE_UPDATE_SUPPORT_SHIFT 0x4
+#define CONTINIOUS_REMOTE_UPDATE_SUPPORT_MASK  0x1
+#define POWER_MANAGEMENT_SHIFT                 0x0
+#define POWER_MANAGEMENT_MASK                  0xf
+
+/* 8051 lane register Field IDs */
+#define SPICO_FW_VERSION 0x7   /* SPICO firmware version */
+
+/* SPICO firmware version fields */
+#define SPICO_ROM_VERSION_SHIFT 0
+#define SPICO_ROM_VERSION_MASK 0xffff
+#define SPICO_ROM_PROD_ID_SHIFT 16
+#define SPICO_ROM_PROD_ID_MASK 0xffff
+
+/* verify capability fabric fields */
+#define VAU_SHIFT      0
+#define VAU_MASK       0x0007
+#define Z_SHIFT                3
+#define Z_MASK         0x0001
+#define VCU_SHIFT      4
+#define VCU_MASK       0x0007
+#define VL15BUF_SHIFT  8
+#define VL15BUF_MASK   0x0fff
+#define CRC_SIZES_SHIFT 20
+#define CRC_SIZES_MASK 0x7
+
+/* verify capability local link width fields */
+#define LINK_WIDTH_SHIFT 0             /* also for remote link width */
+#define LINK_WIDTH_MASK 0xffff         /* also for remote link width */
+#define LOCAL_FLAG_BITS_SHIFT 16
+#define LOCAL_FLAG_BITS_MASK 0xff
+#define MISC_CONFIG_BITS_SHIFT 24
+#define MISC_CONFIG_BITS_MASK 0xff
+
+/* verify capability remote link width fields */
+#define REMOTE_TX_RATE_SHIFT 16
+#define REMOTE_TX_RATE_MASK 0xff
+
+/* LOCAL_DEVICE_ID fields */
+#define LOCAL_DEVICE_REV_SHIFT 0
+#define LOCAL_DEVICE_REV_MASK 0xff
+#define LOCAL_DEVICE_ID_SHIFT 8
+#define LOCAL_DEVICE_ID_MASK 0xffff
+
+/* REMOTE_DEVICE_ID fields */
+#define REMOTE_DEVICE_REV_SHIFT 0
+#define REMOTE_DEVICE_REV_MASK 0xff
+#define REMOTE_DEVICE_ID_SHIFT 8
+#define REMOTE_DEVICE_ID_MASK 0xffff
+
+/* local LNI link width fields */
+#define ENABLE_LANE_RX_SHIFT 16
+#define ENABLE_LANE_RX_MASK  0xff
+
+/* mask, shift for reading 'mgmt_enabled' value from REMOTE_LNI_INFO field */
+#define MGMT_ALLOWED_SHIFT 23
+#define MGMT_ALLOWED_MASK 0x1
+
+/* mask, shift for 'link_quality' within LINK_QUALITY_INFO field */
+#define LINK_QUALITY_SHIFT 24
+#define LINK_QUALITY_MASK  0x7
+
+/*
+ * mask, shift for reading 'planned_down_remote_reason_code'
+ * from LINK_QUALITY_INFO field
+ */
+#define DOWN_REMOTE_REASON_SHIFT 16
+#define DOWN_REMOTE_REASON_MASK  0xff
+
+/* verify capability PHY power management bits */
+#define PWRM_BER_CONTROL       0x1
+#define PWRM_BANDWIDTH_CONTROL 0x2
+
+/* verify capability fabric CRC size bits */
+enum {
+       CAP_CRC_14B = (1 << 0), /* 14b CRC */
+       CAP_CRC_48B = (1 << 1), /* 48b CRC */
+       CAP_CRC_12B_16B_PER_LANE = (1 << 2) /* 12b-16b per lane CRC */
+};
+
+#define SUPPORTED_CRCS (CAP_CRC_14B | CAP_CRC_48B)
+
+/* misc status version fields */
+#define STS_FM_VERSION_A_SHIFT 16
+#define STS_FM_VERSION_A_MASK  0xff
+#define STS_FM_VERSION_B_SHIFT 24
+#define STS_FM_VERSION_B_MASK  0xff
+
+/* LCB_CFG_CRC_MODE TX_VAL and RX_VAL CRC mode values */
+#define LCB_CRC_16B                    0x0     /* 16b CRC */
+#define LCB_CRC_14B                    0x1     /* 14b CRC */
+#define LCB_CRC_48B                    0x2     /* 48b CRC */
+#define LCB_CRC_12B_16B_PER_LANE       0x3     /* 12b-16b per lane CRC */
+
+/* the following enum is (almost) a copy/paste of the definition
+ * in the OPA spec, section 20.2.2.6.8 (PortInfo) */
+enum {
+       PORT_LTP_CRC_MODE_NONE = 0,
+       PORT_LTP_CRC_MODE_14 = 1, /* 14-bit LTP CRC mode (optional) */
+       PORT_LTP_CRC_MODE_16 = 2, /* 16-bit LTP CRC mode */
+       PORT_LTP_CRC_MODE_48 = 4,
+               /* 48-bit overlapping LTP CRC mode (optional) */
+       PORT_LTP_CRC_MODE_PER_LANE = 8
+               /* 12 to 16 bit per lane LTP CRC mode (optional) */
+};
+
+/* timeouts */
+#define LINK_RESTART_DELAY 1000        /* link restart delay, in ms */
+#define TIMEOUT_8051_START 5000        /* 8051 start timeout, in ms */
+#define DC8051_COMMAND_TIMEOUT 20000   /* DC8051 command timeout, in ms */
+#define FREEZE_STATUS_TIMEOUT 20       /* wait for freeze indicators, in ms */
+#define VL_STATUS_CLEAR_TIMEOUT 5000   /* per-VL status clear, in ms */
+#define CCE_STATUS_TIMEOUT 10          /* time to clear CCE Status, in ms */
+
+/* cclock tick time, in picoseconds per tick: 1/speed * 10^12  */
+#define ASIC_CCLOCK_PS  1242   /* 805 MHz */
+#define FPGA_CCLOCK_PS 30300   /*  33 MHz */
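+
+/*
+ * Worked example of the formula above: 10^12 / 805,000,000 =~ 1242 ps per
+ * tick for the ASIC, and 10^12 / 33,000,000 =~ 30303 ps per tick (rounded
+ * to 30300) for the FPGA.  As an illustrative sketch, a duration in
+ * nanoseconds then converts to ticks as roughly (ns * 1000) / cclock_ps;
+ * see ns_to_cclock()/cclock_to_ns() declared later in this header for the
+ * driver's conversions.
+ */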
+
+/*
+ * Mask of enabled MISC errors.  Do not enable the two RSA engine errors -
+ * see firmware.c:run_rsa() for details.
+ */
+#define DRIVER_MISC_MASK \
+       (~(MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK \
+               | MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK))
+
+/* valid values for the loopback module parameter */
+#define LOOPBACK_NONE  0       /* no loopback - default */
+#define LOOPBACK_SERDES 1
+#define LOOPBACK_LCB   2
+#define LOOPBACK_CABLE 3       /* external cable */
+
+/* read and write hardware registers */
+u64 read_csr(const struct hfi1_devdata *dd, u32 offset);
+void write_csr(const struct hfi1_devdata *dd, u32 offset, u64 value);
+
+/*
+ * The *_kctxt_* flavor of the CSR read/write functions is for
+ * per-context or per-SDMA CSRs that are not mappable to user space.
+ * Their spacing is not a PAGE_SIZE multiple.
+ */
+static inline u64 read_kctxt_csr(const struct hfi1_devdata *dd, int ctxt,
+                                u32 offset0)
+{
+       /* kernel per-context CSRs are separated by 0x100 */
+       return read_csr(dd, offset0 + (0x100 * ctxt));
+}
+
+static inline void write_kctxt_csr(struct hfi1_devdata *dd, int ctxt,
+                                  u32 offset0, u64 value)
+{
+       /* kernel per-context CSRs are separated by 0x100 */
+       write_csr(dd, offset0 + (0x100 * ctxt), value);
+}
+
+int read_lcb_csr(struct hfi1_devdata *dd, u32 offset, u64 *data);
+int write_lcb_csr(struct hfi1_devdata *dd, u32 offset, u64 data);
+
+void __iomem *get_csr_addr(
+       struct hfi1_devdata *dd,
+       u32 offset);
+
+static inline void __iomem *get_kctxt_csr_addr(
+       struct hfi1_devdata *dd,
+       int ctxt,
+       u32 offset0)
+{
+       return get_csr_addr(dd, offset0 + (0x100 * ctxt));
+}
+
+/*
+ * The *_uctxt_* flavor of the CSR read/write functions is for
+ * per-context CSRs that are mappable to user space. All these CSRs
+ * are spaced by a PAGE_SIZE multiple in order to be mappable to
+ * different processes without exposing other contexts' CSRs.
+ */
+static inline u64 read_uctxt_csr(const struct hfi1_devdata *dd, int ctxt,
+                                u32 offset0)
+{
+       /* user per-context CSRs are separated by 0x1000 */
+       return read_csr(dd, offset0 + (0x1000 * ctxt));
+}
+
+static inline void write_uctxt_csr(struct hfi1_devdata *dd, int ctxt,
+                                  u32 offset0, u64 value)
+{
+       /* user per-context CSRs are separated by 0x1000 */
+       write_csr(dd, offset0 + (0x1000 * ctxt), value);
+}
+
+u64 create_pbc(struct hfi1_pportdata *ppd, u64, int, u32, u32);
+
+/* firmware.c */
+#define NUM_PCIE_SERDES 16     /* number of PCIe serdes on the SBus */
+extern const u8 pcie_serdes_broadcast[];
+extern const u8 pcie_pcs_addrs[2][NUM_PCIE_SERDES];
+/* SBus commands */
+#define RESET_SBUS_RECEIVER 0x20
+#define WRITE_SBUS_RECEIVER 0x21
+void sbus_request(struct hfi1_devdata *dd,
+                 u8 receiver_addr, u8 data_addr, u8 command, u32 data_in);
+int sbus_request_slow(struct hfi1_devdata *dd,
+                     u8 receiver_addr, u8 data_addr, u8 command, u32 data_in);
+void set_sbus_fast_mode(struct hfi1_devdata *dd);
+void clear_sbus_fast_mode(struct hfi1_devdata *dd);
+int hfi1_firmware_init(struct hfi1_devdata *dd);
+int load_pcie_firmware(struct hfi1_devdata *dd);
+int load_firmware(struct hfi1_devdata *dd);
+void dispose_firmware(void);
+int acquire_hw_mutex(struct hfi1_devdata *dd);
+void release_hw_mutex(struct hfi1_devdata *dd);
+void fabric_serdes_reset(struct hfi1_devdata *dd);
+int read_8051_data(struct hfi1_devdata *dd, u32 addr, u32 len, u64 *result);
+
+/* chip.c */
+void read_misc_status(struct hfi1_devdata *dd, u8 *ver_a, u8 *ver_b);
+void read_guid(struct hfi1_devdata *dd);
+int wait_fm_ready(struct hfi1_devdata *dd, u32 mstimeout);
+void set_link_down_reason(struct hfi1_pportdata *ppd, u8 lcl_reason,
+                         u8 neigh_reason, u8 rem_reason);
+int set_link_state(struct hfi1_pportdata *, u32 state);
+int port_ltp_to_cap(int port_ltp);
+void handle_verify_cap(struct work_struct *work);
+void handle_freeze(struct work_struct *work);
+void handle_link_up(struct work_struct *work);
+void handle_link_down(struct work_struct *work);
+void handle_link_downgrade(struct work_struct *work);
+void handle_link_bounce(struct work_struct *work);
+void handle_sma_message(struct work_struct *work);
+void start_freeze_handling(struct hfi1_pportdata *ppd, int flags);
+int send_idle_sma(struct hfi1_devdata *dd, u64 message);
+int start_link(struct hfi1_pportdata *ppd);
+void init_qsfp(struct hfi1_pportdata *ppd);
+int bringup_serdes(struct hfi1_pportdata *ppd);
+void set_intr_state(struct hfi1_devdata *dd, u32 enable);
+void apply_link_downgrade_policy(struct hfi1_pportdata *ppd,
+                                int refresh_widths);
+void update_usrhead(struct hfi1_ctxtdata *, u32, u32, u32, u32, u32);
+int stop_drain_data_vls(struct hfi1_devdata *dd);
+int open_fill_data_vls(struct hfi1_devdata *dd);
+u32 ns_to_cclock(struct hfi1_devdata *dd, u32 ns);
+u32 cclock_to_ns(struct hfi1_devdata *dd, u32 cclock);
+void get_linkup_link_widths(struct hfi1_pportdata *ppd);
+void read_ltp_rtt(struct hfi1_devdata *dd);
+void clear_linkup_counters(struct hfi1_devdata *dd);
+u32 hdrqempty(struct hfi1_ctxtdata *rcd);
+int is_a0(struct hfi1_devdata *dd);
+int is_ax(struct hfi1_devdata *dd);
+int is_bx(struct hfi1_devdata *dd);
+u32 read_physical_state(struct hfi1_devdata *dd);
+u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate);
+u32 get_logical_state(struct hfi1_pportdata *ppd);
+const char *opa_lstate_name(u32 lstate);
+const char *opa_pstate_name(u32 pstate);
+u32 driver_physical_state(struct hfi1_pportdata *ppd);
+u32 driver_logical_state(struct hfi1_pportdata *ppd);
+
+int acquire_lcb_access(struct hfi1_devdata *dd, int sleep_ok);
+int release_lcb_access(struct hfi1_devdata *dd, int sleep_ok);
+#define LCB_START DC_LCB_CSRS
+#define LCB_END   DC_8051_CSRS /* next block is 8051 */
+static inline int is_lcb_offset(u32 offset)
+{
+       return (offset >= LCB_START && offset < LCB_END);
+}
+
+extern uint num_vls;
+
+extern uint disable_integrity;
+u64 read_dev_cntr(struct hfi1_devdata *dd, int index, int vl);
+u64 write_dev_cntr(struct hfi1_devdata *dd, int index, int vl, u64 data);
+u64 read_port_cntr(struct hfi1_pportdata *ppd, int index, int vl);
+u64 write_port_cntr(struct hfi1_pportdata *ppd, int index, int vl, u64 data);
+
+/* Per VL indexes */
+enum {
+       C_VL_0 = 0,
+       C_VL_1,
+       C_VL_2,
+       C_VL_3,
+       C_VL_4,
+       C_VL_5,
+       C_VL_6,
+       C_VL_7,
+       C_VL_15,
+       C_VL_COUNT
+};
+
+static inline int vl_from_idx(int idx)
+{
+       return (idx == C_VL_15 ? 15 : idx);
+}
+
+static inline int idx_from_vl(int vl)
+{
+       return (vl == 15 ? C_VL_15 : vl);
+}
+
+/* Per device counter indexes */
+enum {
+       C_RCV_OVF = 0,
+       C_RX_TID_FULL,
+       C_RX_TID_INVALID,
+       C_RX_TID_FLGMS,
+       C_RX_CTX_RHQS,
+       C_RX_CTX_EGRS,
+       C_RCV_TID_FLSMS,
+       C_CCE_PCI_CR_ST,
+       C_CCE_PCI_TR_ST,
+       C_CCE_PIO_WR_ST,
+       C_CCE_ERR_INT,
+       C_CCE_SDMA_INT,
+       C_CCE_MISC_INT,
+       C_CCE_RCV_AV_INT,
+       C_CCE_RCV_URG_INT,
+       C_CCE_SEND_CR_INT,
+       C_DC_UNC_ERR,
+       C_DC_RCV_ERR,
+       C_DC_FM_CFG_ERR,
+       C_DC_RMT_PHY_ERR,
+       C_DC_DROPPED_PKT,
+       C_DC_MC_XMIT_PKTS,
+       C_DC_MC_RCV_PKTS,
+       C_DC_XMIT_CERR,
+       C_DC_RCV_CERR,
+       C_DC_RCV_FCC,
+       C_DC_XMIT_FCC,
+       C_DC_XMIT_FLITS,
+       C_DC_RCV_FLITS,
+       C_DC_XMIT_PKTS,
+       C_DC_RCV_PKTS,
+       C_DC_RX_FLIT_VL,
+       C_DC_RX_PKT_VL,
+       C_DC_RCV_FCN,
+       C_DC_RCV_FCN_VL,
+       C_DC_RCV_BCN,
+       C_DC_RCV_BCN_VL,
+       C_DC_RCV_BBL,
+       C_DC_RCV_BBL_VL,
+       C_DC_MARK_FECN,
+       C_DC_MARK_FECN_VL,
+       C_DC_TOTAL_CRC,
+       C_DC_CRC_LN0,
+       C_DC_CRC_LN1,
+       C_DC_CRC_LN2,
+       C_DC_CRC_LN3,
+       C_DC_CRC_MULT_LN,
+       C_DC_TX_REPLAY,
+       C_DC_RX_REPLAY,
+       C_DC_SEQ_CRC_CNT,
+       C_DC_ESC0_ONLY_CNT,
+       C_DC_ESC0_PLUS1_CNT,
+       C_DC_ESC0_PLUS2_CNT,
+       C_DC_REINIT_FROM_PEER_CNT,
+       C_DC_SBE_CNT,
+       C_DC_MISC_FLG_CNT,
+       C_DC_PRF_GOOD_LTP_CNT,
+       C_DC_PRF_ACCEPTED_LTP_CNT,
+       C_DC_PRF_RX_FLIT_CNT,
+       C_DC_PRF_TX_FLIT_CNT,
+       C_DC_PRF_CLK_CNTR,
+       C_DC_PG_DBG_FLIT_CRDTS_CNT,
+       C_DC_PG_STS_PAUSE_COMPLETE_CNT,
+       C_DC_PG_STS_TX_SBE_CNT,
+       C_DC_PG_STS_TX_MBE_CNT,
+       C_SW_CPU_INTR,
+       C_SW_CPU_RCV_LIM,
+       C_SW_VTX_WAIT,
+       C_SW_PIO_WAIT,
+       C_SW_KMEM_WAIT,
+       DEV_CNTR_LAST  /* Must be kept last */
+};
+
+/* Per port counter indexes */
+enum {
+       C_TX_UNSUP_VL = 0,
+       C_TX_INVAL_LEN,
+       C_TX_MM_LEN_ERR,
+       C_TX_UNDERRUN,
+       C_TX_FLOW_STALL,
+       C_TX_DROPPED,
+       C_TX_HDR_ERR,
+       C_TX_PKT,
+       C_TX_WORDS,
+       C_TX_WAIT,
+       C_TX_FLIT_VL,
+       C_TX_PKT_VL,
+       C_TX_WAIT_VL,
+       C_RX_PKT,
+       C_RX_WORDS,
+       C_SW_LINK_DOWN,
+       C_SW_LINK_UP,
+       C_SW_XMIT_DSCD,
+       C_SW_XMIT_DSCD_VL,
+       C_SW_XMIT_CSTR_ERR,
+       C_SW_RCV_CSTR_ERR,
+       C_SW_IBP_LOOP_PKTS,
+       C_SW_IBP_RC_RESENDS,
+       C_SW_IBP_RNR_NAKS,
+       C_SW_IBP_OTHER_NAKS,
+       C_SW_IBP_RC_TIMEOUTS,
+       C_SW_IBP_PKT_DROPS,
+       C_SW_IBP_DMA_WAIT,
+       C_SW_IBP_RC_SEQNAK,
+       C_SW_IBP_RC_DUPREQ,
+       C_SW_IBP_RDMA_SEQ,
+       C_SW_IBP_UNALIGNED,
+       C_SW_IBP_SEQ_NAK,
+       C_SW_CPU_RC_ACKS,
+       C_SW_CPU_RC_QACKS,
+       C_SW_CPU_RC_DELAYED_COMP,
+       C_RCV_HDR_OVF_0,
+       C_RCV_HDR_OVF_1,
+       C_RCV_HDR_OVF_2,
+       C_RCV_HDR_OVF_3,
+       C_RCV_HDR_OVF_4,
+       C_RCV_HDR_OVF_5,
+       C_RCV_HDR_OVF_6,
+       C_RCV_HDR_OVF_7,
+       C_RCV_HDR_OVF_8,
+       C_RCV_HDR_OVF_9,
+       C_RCV_HDR_OVF_10,
+       C_RCV_HDR_OVF_11,
+       C_RCV_HDR_OVF_12,
+       C_RCV_HDR_OVF_13,
+       C_RCV_HDR_OVF_14,
+       C_RCV_HDR_OVF_15,
+       C_RCV_HDR_OVF_16,
+       C_RCV_HDR_OVF_17,
+       C_RCV_HDR_OVF_18,
+       C_RCV_HDR_OVF_19,
+       C_RCV_HDR_OVF_20,
+       C_RCV_HDR_OVF_21,
+       C_RCV_HDR_OVF_22,
+       C_RCV_HDR_OVF_23,
+       C_RCV_HDR_OVF_24,
+       C_RCV_HDR_OVF_25,
+       C_RCV_HDR_OVF_26,
+       C_RCV_HDR_OVF_27,
+       C_RCV_HDR_OVF_28,
+       C_RCV_HDR_OVF_29,
+       C_RCV_HDR_OVF_30,
+       C_RCV_HDR_OVF_31,
+       C_RCV_HDR_OVF_32,
+       C_RCV_HDR_OVF_33,
+       C_RCV_HDR_OVF_34,
+       C_RCV_HDR_OVF_35,
+       C_RCV_HDR_OVF_36,
+       C_RCV_HDR_OVF_37,
+       C_RCV_HDR_OVF_38,
+       C_RCV_HDR_OVF_39,
+       C_RCV_HDR_OVF_40,
+       C_RCV_HDR_OVF_41,
+       C_RCV_HDR_OVF_42,
+       C_RCV_HDR_OVF_43,
+       C_RCV_HDR_OVF_44,
+       C_RCV_HDR_OVF_45,
+       C_RCV_HDR_OVF_46,
+       C_RCV_HDR_OVF_47,
+       C_RCV_HDR_OVF_48,
+       C_RCV_HDR_OVF_49,
+       C_RCV_HDR_OVF_50,
+       C_RCV_HDR_OVF_51,
+       C_RCV_HDR_OVF_52,
+       C_RCV_HDR_OVF_53,
+       C_RCV_HDR_OVF_54,
+       C_RCV_HDR_OVF_55,
+       C_RCV_HDR_OVF_56,
+       C_RCV_HDR_OVF_57,
+       C_RCV_HDR_OVF_58,
+       C_RCV_HDR_OVF_59,
+       C_RCV_HDR_OVF_60,
+       C_RCV_HDR_OVF_61,
+       C_RCV_HDR_OVF_62,
+       C_RCV_HDR_OVF_63,
+       C_RCV_HDR_OVF_64,
+       C_RCV_HDR_OVF_65,
+       C_RCV_HDR_OVF_66,
+       C_RCV_HDR_OVF_67,
+       C_RCV_HDR_OVF_68,
+       C_RCV_HDR_OVF_69,
+       C_RCV_HDR_OVF_70,
+       C_RCV_HDR_OVF_71,
+       C_RCV_HDR_OVF_72,
+       C_RCV_HDR_OVF_73,
+       C_RCV_HDR_OVF_74,
+       C_RCV_HDR_OVF_75,
+       C_RCV_HDR_OVF_76,
+       C_RCV_HDR_OVF_77,
+       C_RCV_HDR_OVF_78,
+       C_RCV_HDR_OVF_79,
+       C_RCV_HDR_OVF_80,
+       C_RCV_HDR_OVF_81,
+       C_RCV_HDR_OVF_82,
+       C_RCV_HDR_OVF_83,
+       C_RCV_HDR_OVF_84,
+       C_RCV_HDR_OVF_85,
+       C_RCV_HDR_OVF_86,
+       C_RCV_HDR_OVF_87,
+       C_RCV_HDR_OVF_88,
+       C_RCV_HDR_OVF_89,
+       C_RCV_HDR_OVF_90,
+       C_RCV_HDR_OVF_91,
+       C_RCV_HDR_OVF_92,
+       C_RCV_HDR_OVF_93,
+       C_RCV_HDR_OVF_94,
+       C_RCV_HDR_OVF_95,
+       C_RCV_HDR_OVF_96,
+       C_RCV_HDR_OVF_97,
+       C_RCV_HDR_OVF_98,
+       C_RCV_HDR_OVF_99,
+       C_RCV_HDR_OVF_100,
+       C_RCV_HDR_OVF_101,
+       C_RCV_HDR_OVF_102,
+       C_RCV_HDR_OVF_103,
+       C_RCV_HDR_OVF_104,
+       C_RCV_HDR_OVF_105,
+       C_RCV_HDR_OVF_106,
+       C_RCV_HDR_OVF_107,
+       C_RCV_HDR_OVF_108,
+       C_RCV_HDR_OVF_109,
+       C_RCV_HDR_OVF_110,
+       C_RCV_HDR_OVF_111,
+       C_RCV_HDR_OVF_112,
+       C_RCV_HDR_OVF_113,
+       C_RCV_HDR_OVF_114,
+       C_RCV_HDR_OVF_115,
+       C_RCV_HDR_OVF_116,
+       C_RCV_HDR_OVF_117,
+       C_RCV_HDR_OVF_118,
+       C_RCV_HDR_OVF_119,
+       C_RCV_HDR_OVF_120,
+       C_RCV_HDR_OVF_121,
+       C_RCV_HDR_OVF_122,
+       C_RCV_HDR_OVF_123,
+       C_RCV_HDR_OVF_124,
+       C_RCV_HDR_OVF_125,
+       C_RCV_HDR_OVF_126,
+       C_RCV_HDR_OVF_127,
+       C_RCV_HDR_OVF_128,
+       C_RCV_HDR_OVF_129,
+       C_RCV_HDR_OVF_130,
+       C_RCV_HDR_OVF_131,
+       C_RCV_HDR_OVF_132,
+       C_RCV_HDR_OVF_133,
+       C_RCV_HDR_OVF_134,
+       C_RCV_HDR_OVF_135,
+       C_RCV_HDR_OVF_136,
+       C_RCV_HDR_OVF_137,
+       C_RCV_HDR_OVF_138,
+       C_RCV_HDR_OVF_139,
+       C_RCV_HDR_OVF_140,
+       C_RCV_HDR_OVF_141,
+       C_RCV_HDR_OVF_142,
+       C_RCV_HDR_OVF_143,
+       C_RCV_HDR_OVF_144,
+       C_RCV_HDR_OVF_145,
+       C_RCV_HDR_OVF_146,
+       C_RCV_HDR_OVF_147,
+       C_RCV_HDR_OVF_148,
+       C_RCV_HDR_OVF_149,
+       C_RCV_HDR_OVF_150,
+       C_RCV_HDR_OVF_151,
+       C_RCV_HDR_OVF_152,
+       C_RCV_HDR_OVF_153,
+       C_RCV_HDR_OVF_154,
+       C_RCV_HDR_OVF_155,
+       C_RCV_HDR_OVF_156,
+       C_RCV_HDR_OVF_157,
+       C_RCV_HDR_OVF_158,
+       C_RCV_HDR_OVF_159,
+       PORT_CNTR_LAST /* Must be kept last */
+};
+
+u64 get_all_cpu_total(u64 __percpu *cntr);
+void hfi1_start_cleanup(struct hfi1_devdata *dd);
+void hfi1_clear_tids(struct hfi1_ctxtdata *rcd);
+struct hfi1_message_header *hfi1_get_msgheader(
+                               struct hfi1_devdata *dd, __le32 *rhf_addr);
+int hfi1_get_base_kinfo(struct hfi1_ctxtdata *rcd,
+                       struct hfi1_ctxt_info *kinfo);
+u64 hfi1_gpio_mod(struct hfi1_devdata *dd, u32 target, u32 data, u32 dir,
+                 u32 mask);
+int hfi1_init_ctxt(struct send_context *sc);
+void hfi1_put_tid(struct hfi1_devdata *dd, u32 index,
+                 u32 type, unsigned long pa, u16 order);
+void hfi1_quiet_serdes(struct hfi1_pportdata *ppd);
+void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, int ctxt);
+u32 hfi1_read_cntrs(struct hfi1_devdata *dd, loff_t pos, char **namep,
+                   u64 **cntrp);
+u32 hfi1_read_portcntrs(struct hfi1_devdata *dd, loff_t pos, u32 port,
+                       char **namep, u64 **cntrp);
+u8 hfi1_ibphys_portstate(struct hfi1_pportdata *ppd);
+int hfi1_get_ib_cfg(struct hfi1_pportdata *ppd, int which);
+int hfi1_set_ib_cfg(struct hfi1_pportdata *ppd, int which, u32 val);
+int hfi1_set_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt, u16 jkey);
+int hfi1_clear_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt);
+int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt, u16 pkey);
+int hfi1_clear_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt);
+void hfi1_read_link_quality(struct hfi1_devdata *dd, u8 *link_quality);
+
+/*
+ * Interrupt source table.
+ *
+ * Each entry describes one interrupt source "type".  The table is
+ * ordered by increasing source number.
+ */
+struct is_table {
+       int start;       /* interrupt source type start */
+       int end;         /* interrupt source type end */
+       /* routine that returns the name of the interrupt source */
+       char *(*is_name)(char *name, size_t size, unsigned int source);
+       /* routine to call when receiving an interrupt */
+       void (*is_int)(struct hfi1_devdata *dd, unsigned int source);
+};
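+
+/*
+ * Illustrative sketch only (all names below are hypothetical placeholders,
+ * not part of this header): an is_table entry pairs a range of source
+ * numbers with a name callback and a handler callback, e.g.
+ *
+ *     { .start = FIRST_SRC, .end = LAST_SRC,
+ *       .is_name = example_int_name, .is_int = example_int_handler },
+ */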
+
+#endif /* _CHIP_H */
diff --git a/drivers/staging/rdma/hfi1/chip_registers.h b/drivers/staging/rdma/hfi1/chip_registers.h
new file mode 100644 (file)
index 0000000..bf45de2
--- /dev/null
@@ -0,0 +1,1292 @@
+#ifndef DEF_CHIP_REG
+#define DEF_CHIP_REG
+
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#define CORE         0x000000000000
+#define CCE          (CORE + 0x000000000000)
+#define ASIC         (CORE + 0x000000400000)
+#define MISC         (CORE + 0x000000500000)
+#define DC_TOP_CSRS  (CORE + 0x000000600000)
+#define CHIP_DEBUG   (CORE + 0x000000700000)
+#define RXE          (CORE + 0x000001000000)
+#define TXE          (CORE + 0x000001800000)
+#define DCC_CSRS     (DC_TOP_CSRS + 0x000000000000)
+#define DC_LCB_CSRS  (DC_TOP_CSRS + 0x000000001000)
+#define DC_8051_CSRS (DC_TOP_CSRS + 0x000000002000)
+#define PCIE         0
+
+#define ASIC_NUM_SCRATCH 4
+#define CCE_ERR_INT_CNT 0
+#define CCE_MISC_INT_CNT 2
+#define CCE_NUM_32_BIT_COUNTERS 3
+#define CCE_NUM_32_BIT_INT_COUNTERS 6
+#define CCE_NUM_INT_CSRS 12
+#define CCE_NUM_INT_MAP_CSRS 96
+#define CCE_NUM_MSIX_PBAS 4
+#define CCE_NUM_MSIX_VECTORS 256
+#define CCE_NUM_SCRATCH 4
+#define CCE_PCIE_POSTED_CRDT_STALL_CNT 2
+#define CCE_PCIE_TRGT_STALL_CNT 0
+#define CCE_PIO_WR_STALL_CNT 1
+#define CCE_RCV_AVAIL_INT_CNT 3
+#define CCE_RCV_URGENT_INT_CNT 4
+#define CCE_SDMA_INT_CNT 1
+#define CCE_SEND_CREDIT_INT_CNT 5
+#define DCC_CFG_LED_CNTRL (DCC_CSRS + 0x000000000040)
+#define DCC_CFG_LED_CNTRL_LED_CNTRL_SMASK 0x10ull
+#define DCC_CFG_LED_CNTRL_LED_SW_BLINK_RATE_SHIFT 0
+#define DCC_CFG_LED_CNTRL_LED_SW_BLINK_RATE_SMASK 0xFull
+#define DCC_CFG_PORT_CONFIG (DCC_CSRS + 0x000000000008)
+#define DCC_CFG_PORT_CONFIG1 (DCC_CSRS + 0x000000000010)
+#define DCC_CFG_PORT_CONFIG1_DLID_MASK_MASK 0xFFFFull
+#define DCC_CFG_PORT_CONFIG1_DLID_MASK_SHIFT 16
+#define DCC_CFG_PORT_CONFIG1_DLID_MASK_SMASK 0xFFFF0000ull
+#define DCC_CFG_PORT_CONFIG1_TARGET_DLID_MASK 0xFFFFull
+#define DCC_CFG_PORT_CONFIG1_TARGET_DLID_SHIFT 0
+#define DCC_CFG_PORT_CONFIG1_TARGET_DLID_SMASK 0xFFFFull
+#define DCC_CFG_PORT_CONFIG_LINK_STATE_MASK 0x7ull
+#define DCC_CFG_PORT_CONFIG_LINK_STATE_SHIFT 48
+#define DCC_CFG_PORT_CONFIG_LINK_STATE_SMASK 0x7000000000000ull
+#define DCC_CFG_PORT_CONFIG_MTU_CAP_MASK 0x7ull
+#define DCC_CFG_PORT_CONFIG_MTU_CAP_SHIFT 32
+#define DCC_CFG_PORT_CONFIG_MTU_CAP_SMASK 0x700000000ull
+#define DCC_CFG_RESET (DCC_CSRS + 0x000000000000)
+#define DCC_CFG_RESET_RESET_LCB_SHIFT 0
+#define DCC_CFG_RESET_RESET_RX_FPE_SHIFT 2
+#define DCC_CFG_SC_VL_TABLE_15_0 (DCC_CSRS + 0x000000000028)
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY0_SHIFT 0
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY10_SHIFT 40
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY11_SHIFT 44
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY12_SHIFT 48
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY13_SHIFT 52
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY14_SHIFT 56
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY15_SHIFT 60
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY1_SHIFT 4
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY2_SHIFT 8
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY3_SHIFT 12
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY4_SHIFT 16
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY5_SHIFT 20
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY6_SHIFT 24
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY7_SHIFT 28
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY8_SHIFT 32
+#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY9_SHIFT 36
+#define DCC_CFG_SC_VL_TABLE_31_16 (DCC_CSRS + 0x000000000030)
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY16_SHIFT 0
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY17_SHIFT 4
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY18_SHIFT 8
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY19_SHIFT 12
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY20_SHIFT 16
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY21_SHIFT 20
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY22_SHIFT 24
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY23_SHIFT 28
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY24_SHIFT 32
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY25_SHIFT 36
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY26_SHIFT 40
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY27_SHIFT 44
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY28_SHIFT 48
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY29_SHIFT 52
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY30_SHIFT 56
+#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY31_SHIFT 60
+#define DCC_ERR_DROPPED_PKT_CNT (DCC_CSRS + 0x000000000120)
+#define DCC_ERR_FLG (DCC_CSRS + 0x000000000050)
+#define DCC_ERR_FLG_BAD_CRDT_ACK_ERR_SMASK 0x4000ull
+#define DCC_ERR_FLG_BAD_CTRL_DIST_ERR_SMASK 0x200000ull
+#define DCC_ERR_FLG_BAD_CTRL_FLIT_ERR_SMASK 0x10000ull
+#define DCC_ERR_FLG_BAD_DLID_TARGET_ERR_SMASK 0x200ull
+#define DCC_ERR_FLG_BAD_HEAD_DIST_ERR_SMASK 0x800000ull
+#define DCC_ERR_FLG_BAD_L2_ERR_SMASK 0x2ull
+#define DCC_ERR_FLG_BAD_LVER_ERR_SMASK 0x400ull
+#define DCC_ERR_FLG_BAD_MID_TAIL_ERR_SMASK 0x8ull
+#define DCC_ERR_FLG_BAD_PKT_LENGTH_ERR_SMASK 0x4000000ull
+#define DCC_ERR_FLG_BAD_PREEMPTION_ERR_SMASK 0x10ull
+#define DCC_ERR_FLG_BAD_SC_ERR_SMASK 0x4ull
+#define DCC_ERR_FLG_BAD_TAIL_DIST_ERR_SMASK 0x400000ull
+#define DCC_ERR_FLG_BAD_VL_MARKER_ERR_SMASK 0x80ull
+#define DCC_ERR_FLG_CLR (DCC_CSRS + 0x000000000060)
+#define DCC_ERR_FLG_CSR_ACCESS_BLOCKED_HOST_SMASK 0x8000000000ull
+#define DCC_ERR_FLG_CSR_ACCESS_BLOCKED_UC_SMASK 0x10000000000ull
+#define DCC_ERR_FLG_CSR_INVAL_ADDR_SMASK 0x400000000000ull
+#define DCC_ERR_FLG_CSR_PARITY_ERR_SMASK 0x200000000000ull
+#define DCC_ERR_FLG_DLID_ZERO_ERR_SMASK 0x40000000ull
+#define DCC_ERR_FLG_EN (DCC_CSRS + 0x000000000058)
+#define DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_HOST_SMASK 0x8000000000ull
+#define DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_UC_SMASK 0x10000000000ull
+#define DCC_ERR_FLG_EVENT_CNTR_PARITY_ERR_SMASK 0x20000ull
+#define DCC_ERR_FLG_EVENT_CNTR_ROLLOVER_ERR_SMASK 0x40000ull
+#define DCC_ERR_FLG_FMCONFIG_ERR_SMASK 0x40000000000000ull
+#define DCC_ERR_FLG_FPE_TX_FIFO_OVFLW_ERR_SMASK 0x2000000000ull
+#define DCC_ERR_FLG_FPE_TX_FIFO_UNFLW_ERR_SMASK 0x4000000000ull
+#define DCC_ERR_FLG_LATE_EBP_ERR_SMASK 0x1000000000ull
+#define DCC_ERR_FLG_LATE_LONG_ERR_SMASK 0x800000000ull
+#define DCC_ERR_FLG_LATE_SHORT_ERR_SMASK 0x400000000ull
+#define DCC_ERR_FLG_LENGTH_MTU_ERR_SMASK 0x80000000ull
+#define DCC_ERR_FLG_LINK_ERR_SMASK 0x80000ull
+#define DCC_ERR_FLG_MISC_CNTR_ROLLOVER_ERR_SMASK 0x100000ull
+#define DCC_ERR_FLG_NONVL15_STATE_ERR_SMASK 0x1000000ull
+#define DCC_ERR_FLG_PERM_NVL15_ERR_SMASK 0x10000000ull
+#define DCC_ERR_FLG_PREEMPTION_ERR_SMASK 0x20ull
+#define DCC_ERR_FLG_PREEMPTIONVL15_ERR_SMASK 0x40ull
+#define DCC_ERR_FLG_RCVPORT_ERR_SMASK 0x80000000000000ull
+#define DCC_ERR_FLG_RX_BYTE_SHFT_PARITY_ERR_SMASK 0x1000000000000ull
+#define DCC_ERR_FLG_RX_CTRL_PARITY_MBE_ERR_SMASK 0x100000000000ull
+#define DCC_ERR_FLG_RX_EARLY_DROP_ERR_SMASK 0x200000000ull
+#define DCC_ERR_FLG_SLID_ZERO_ERR_SMASK 0x20000000ull
+#define DCC_ERR_FLG_TX_BYTE_SHFT_PARITY_ERR_SMASK 0x800000000000ull
+#define DCC_ERR_FLG_TX_CTRL_PARITY_ERR_SMASK 0x20000000000ull
+#define DCC_ERR_FLG_TX_CTRL_PARITY_MBE_ERR_SMASK 0x40000000000ull
+#define DCC_ERR_FLG_TX_SC_PARITY_ERR_SMASK 0x80000000000ull
+#define DCC_ERR_FLG_UNCORRECTABLE_ERR_SMASK 0x2000ull
+#define DCC_ERR_FLG_UNSUP_PKT_TYPE_SMASK 0x8000ull
+#define DCC_ERR_FLG_UNSUP_VL_ERR_SMASK 0x8000000ull
+#define DCC_ERR_FLG_VL15_MULTI_ERR_SMASK 0x2000000ull
+#define DCC_ERR_FMCONFIG_ERR_CNT (DCC_CSRS + 0x000000000110)
+#define DCC_ERR_INFO_FMCONFIG (DCC_CSRS + 0x000000000090)
+#define DCC_ERR_INFO_PORTRCV (DCC_CSRS + 0x000000000078)
+#define DCC_ERR_INFO_PORTRCV_HDR0 (DCC_CSRS + 0x000000000080)
+#define DCC_ERR_INFO_PORTRCV_HDR1 (DCC_CSRS + 0x000000000088)
+#define DCC_ERR_INFO_UNCORRECTABLE (DCC_CSRS + 0x000000000098)
+#define DCC_ERR_PORTRCV_ERR_CNT (DCC_CSRS + 0x000000000108)
+#define DCC_ERR_RCVREMOTE_PHY_ERR_CNT (DCC_CSRS + 0x000000000118)
+#define DCC_ERR_UNCORRECTABLE_CNT (DCC_CSRS + 0x000000000100)
+#define DCC_PRF_PORT_MARK_FECN_CNT (DCC_CSRS + 0x000000000330)
+#define DCC_PRF_PORT_RCV_BECN_CNT (DCC_CSRS + 0x000000000290)
+#define DCC_PRF_PORT_RCV_BUBBLE_CNT (DCC_CSRS + 0x0000000002E0)
+#define DCC_PRF_PORT_RCV_CORRECTABLE_CNT (DCC_CSRS + 0x000000000140)
+#define DCC_PRF_PORT_RCV_DATA_CNT (DCC_CSRS + 0x000000000198)
+#define DCC_PRF_PORT_RCV_FECN_CNT (DCC_CSRS + 0x000000000240)
+#define DCC_PRF_PORT_RCV_MULTICAST_PKT_CNT (DCC_CSRS + 0x000000000130)
+#define DCC_PRF_PORT_RCV_PKTS_CNT (DCC_CSRS + 0x0000000001A8)
+#define DCC_PRF_PORT_VL_MARK_FECN_CNT (DCC_CSRS + 0x000000000338)
+#define DCC_PRF_PORT_VL_RCV_BECN_CNT (DCC_CSRS + 0x000000000298)
+#define DCC_PRF_PORT_VL_RCV_BUBBLE_CNT (DCC_CSRS + 0x0000000002E8)
+#define DCC_PRF_PORT_VL_RCV_DATA_CNT (DCC_CSRS + 0x0000000001B0)
+#define DCC_PRF_PORT_VL_RCV_FECN_CNT (DCC_CSRS + 0x000000000248)
+#define DCC_PRF_PORT_VL_RCV_PKTS_CNT (DCC_CSRS + 0x0000000001F8)
+#define DCC_PRF_PORT_XMIT_CORRECTABLE_CNT (DCC_CSRS + 0x000000000138)
+#define DCC_PRF_PORT_XMIT_DATA_CNT (DCC_CSRS + 0x000000000190)
+#define DCC_PRF_PORT_XMIT_MULTICAST_CNT (DCC_CSRS + 0x000000000128)
+#define DCC_PRF_PORT_XMIT_PKTS_CNT (DCC_CSRS + 0x0000000001A0)
+#define DCC_PRF_RX_FLOW_CRTL_CNT (DCC_CSRS + 0x000000000180)
+#define DCC_PRF_TX_FLOW_CRTL_CNT (DCC_CSRS + 0x000000000188)
+#define DC_DC8051_CFG_CSR_ACCESS_SEL (DC_8051_CSRS + 0x000000000110)
+#define DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK 0x2ull
+#define DC_DC8051_CFG_CSR_ACCESS_SEL_LCB_SMASK 0x1ull
+#define DC_DC8051_CFG_EXT_DEV_0 (DC_8051_CSRS + 0x000000000118)
+#define DC_DC8051_CFG_EXT_DEV_0_COMPLETED_SMASK 0x1ull
+#define DC_DC8051_CFG_EXT_DEV_0_RETURN_CODE_SHIFT 8
+#define DC_DC8051_CFG_EXT_DEV_0_RSP_DATA_SHIFT 16
+#define DC_DC8051_CFG_EXT_DEV_1 (DC_8051_CSRS + 0x000000000120)
+#define DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_MASK 0xFFFFull
+#define DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SHIFT 16
+#define DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SMASK 0xFFFF0000ull
+#define DC_DC8051_CFG_EXT_DEV_1_REQ_NEW_SMASK 0x1ull
+#define DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_MASK 0xFFull
+#define DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_SHIFT 8
+#define DC_DC8051_CFG_HOST_CMD_0 (DC_8051_CSRS + 0x000000000028)
+#define DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_MASK 0xFFFFFFFFFFFFull
+#define DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_SHIFT 16
+#define DC_DC8051_CFG_HOST_CMD_0_REQ_NEW_SMASK 0x1ull
+#define DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_MASK 0xFFull
+#define DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_SHIFT 8
+#define DC_DC8051_CFG_HOST_CMD_1 (DC_8051_CSRS + 0x000000000030)
+#define DC_DC8051_CFG_HOST_CMD_1_COMPLETED_SMASK 0x1ull
+#define DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_MASK 0xFFull
+#define DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_SHIFT 8
+#define DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_MASK 0xFFFFFFFFFFFFull
+#define DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_SHIFT 16
+#define DC_DC8051_CFG_LOCAL_GUID (DC_8051_CSRS + 0x000000000038)
+#define DC_DC8051_CFG_MODE (DC_8051_CSRS + 0x000000000070)
+#define DC_DC8051_CFG_RAM_ACCESS_CTRL (DC_8051_CSRS + 0x000000000008)
+#define DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_MASK 0x7FFFull
+#define DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_SHIFT 0
+#define DC_DC8051_CFG_RAM_ACCESS_CTRL_WRITE_ENA_SMASK 0x1000000ull
+#define DC_DC8051_CFG_RAM_ACCESS_CTRL_READ_ENA_SMASK 0x10000ull
+#define DC_DC8051_CFG_RAM_ACCESS_SETUP (DC_8051_CSRS + 0x000000000000)
+#define DC_DC8051_CFG_RAM_ACCESS_SETUP_AUTO_INCR_ADDR_SMASK 0x100ull
+#define DC_DC8051_CFG_RAM_ACCESS_SETUP_RAM_SEL_SMASK 0x1ull
+#define DC_DC8051_CFG_RAM_ACCESS_STATUS (DC_8051_CSRS + 0x000000000018)
+#define DC_DC8051_CFG_RAM_ACCESS_STATUS_ACCESS_COMPLETED_SMASK 0x10000ull
+#define DC_DC8051_CFG_RAM_ACCESS_WR_DATA (DC_8051_CSRS + 0x000000000010)
+#define DC_DC8051_CFG_RAM_ACCESS_RD_DATA (DC_8051_CSRS + 0x000000000020)
+#define DC_DC8051_CFG_RST (DC_8051_CSRS + 0x000000000068)
+#define DC_DC8051_CFG_RST_CRAM_SMASK 0x2ull
+#define DC_DC8051_CFG_RST_DRAM_SMASK 0x4ull
+#define DC_DC8051_CFG_RST_IRAM_SMASK 0x8ull
+#define DC_DC8051_CFG_RST_M8051W_SMASK 0x1ull
+#define DC_DC8051_CFG_RST_SFR_SMASK 0x10ull
+#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051 (DC_8051_CSRS + 0x0000000000D8)
+#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_MASK 0xFFFFFFFFull
+#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_SHIFT 16
+#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_MASK 0xFFFFull
+#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_SHIFT 0
+#define DC_DC8051_ERR_CLR (DC_8051_CSRS + 0x0000000000E8)
+#define DC_DC8051_ERR_EN (DC_8051_CSRS + 0x0000000000F0)
+#define DC_DC8051_ERR_EN_LOST_8051_HEART_BEAT_SMASK 0x2ull
+#define DC_DC8051_ERR_FLG (DC_8051_CSRS + 0x0000000000E0)
+#define DC_DC8051_ERR_FLG_CRAM_MBE_SMASK 0x4ull
+#define DC_DC8051_ERR_FLG_CRAM_SBE_SMASK 0x8ull
+#define DC_DC8051_ERR_FLG_DRAM_MBE_SMASK 0x10ull
+#define DC_DC8051_ERR_FLG_DRAM_SBE_SMASK 0x20ull
+#define DC_DC8051_ERR_FLG_INVALID_CSR_ADDR_SMASK 0x400ull
+#define DC_DC8051_ERR_FLG_IRAM_MBE_SMASK 0x40ull
+#define DC_DC8051_ERR_FLG_IRAM_SBE_SMASK 0x80ull
+#define DC_DC8051_ERR_FLG_LOST_8051_HEART_BEAT_SMASK 0x2ull
+#define DC_DC8051_ERR_FLG_SET_BY_8051_SMASK 0x1ull
+#define DC_DC8051_ERR_FLG_UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES_SMASK 0x100ull
+#define DC_DC8051_STS_CUR_STATE (DC_8051_CSRS + 0x000000000060)
+#define DC_DC8051_STS_CUR_STATE_FIRMWARE_MASK 0xFFull
+#define DC_DC8051_STS_CUR_STATE_FIRMWARE_SHIFT 16
+#define DC_DC8051_STS_CUR_STATE_PORT_MASK 0xFFull
+#define DC_DC8051_STS_CUR_STATE_PORT_SHIFT 0
+#define DC_DC8051_STS_LOCAL_FM_SECURITY (DC_8051_CSRS + 0x000000000050)
+#define DC_DC8051_STS_LOCAL_FM_SECURITY_DISABLED_MASK 0x1ull
+#define DC_DC8051_STS_REMOTE_FM_SECURITY (DC_8051_CSRS + 0x000000000058)
+#define DC_DC8051_STS_REMOTE_GUID (DC_8051_CSRS + 0x000000000040)
+#define DC_DC8051_STS_REMOTE_NODE_TYPE (DC_8051_CSRS + 0x000000000048)
+#define DC_DC8051_STS_REMOTE_NODE_TYPE_VAL_MASK 0x3ull
+#define DC_DC8051_STS_REMOTE_PORT_NO (DC_8051_CSRS + 0x000000000130)
+#define DC_DC8051_STS_REMOTE_PORT_NO_VAL_SMASK 0xFFull
+#define DC_LCB_CFG_ALLOW_LINK_UP (DC_LCB_CSRS + 0x000000000128)
+#define DC_LCB_CFG_ALLOW_LINK_UP_VAL_SHIFT 0
+#define DC_LCB_CFG_CRC_MODE (DC_LCB_CSRS + 0x000000000058)
+#define DC_LCB_CFG_CRC_MODE_TX_VAL_SHIFT 0
+#define DC_LCB_CFG_IGNORE_LOST_RCLK (DC_LCB_CSRS + 0x000000000020)
+#define DC_LCB_CFG_IGNORE_LOST_RCLK_EN_SMASK 0x1ull
+#define DC_LCB_CFG_LANE_WIDTH (DC_LCB_CSRS + 0x000000000100)
+#define DC_LCB_CFG_LINK_KILL_EN (DC_LCB_CSRS + 0x000000000120)
+#define DC_LCB_CFG_LINK_KILL_EN_FLIT_INPUT_BUF_MBE_SMASK 0x100000ull
+#define DC_LCB_CFG_LINK_KILL_EN_REPLAY_BUF_MBE_SMASK 0x400000ull
+#define DC_LCB_CFG_LN_DCLK (DC_LCB_CSRS + 0x000000000060)
+#define DC_LCB_CFG_LOOPBACK (DC_LCB_CSRS + 0x0000000000F8)
+#define DC_LCB_CFG_LOOPBACK_VAL_SHIFT 0
+#define DC_LCB_CFG_RUN (DC_LCB_CSRS + 0x000000000000)
+#define DC_LCB_CFG_RUN_EN_SHIFT 0
+#define DC_LCB_CFG_RX_FIFOS_RADR (DC_LCB_CSRS + 0x000000000018)
+#define DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT 8
+#define DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT 4
+#define DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT 0
+#define DC_LCB_CFG_TX_FIFOS_RADR (DC_LCB_CSRS + 0x000000000010)
+#define DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT 0
+#define DC_LCB_CFG_TX_FIFOS_RESET (DC_LCB_CSRS + 0x000000000008)
+#define DC_LCB_CFG_TX_FIFOS_RESET_VAL_SHIFT 0
+#define DC_LCB_ERR_CLR (DC_LCB_CSRS + 0x000000000308)
+#define DC_LCB_ERR_EN (DC_LCB_CSRS + 0x000000000310)
+#define DC_LCB_ERR_FLG (DC_LCB_CSRS + 0x000000000300)
+#define DC_LCB_ERR_FLG_REDUNDANT_FLIT_PARITY_ERR_SMASK 0x20000000ull
+#define DC_LCB_ERR_FLG_NEG_EDGE_LINK_TRANSFER_ACTIVE_SMASK 0x10000000ull
+#define DC_LCB_ERR_FLG_HOLD_REINIT_SMASK 0x8000000ull
+#define DC_LCB_ERR_FLG_RST_FOR_INCOMPLT_RND_TRIP_SMASK 0x4000000ull
+#define DC_LCB_ERR_FLG_RST_FOR_LINK_TIMEOUT_SMASK 0x2000000ull
+#define DC_LCB_ERR_FLG_CREDIT_RETURN_FLIT_MBE_SMASK 0x1000000ull
+#define DC_LCB_ERR_FLG_REPLAY_BUF_SBE_SMASK 0x800000ull
+#define DC_LCB_ERR_FLG_REPLAY_BUF_MBE_SMASK 0x400000ull
+#define DC_LCB_ERR_FLG_FLIT_INPUT_BUF_SBE_SMASK 0x200000ull
+#define DC_LCB_ERR_FLG_FLIT_INPUT_BUF_MBE_SMASK 0x100000ull
+#define DC_LCB_ERR_FLG_VL_ACK_INPUT_WRONG_CRC_MODE_SMASK 0x80000ull
+#define DC_LCB_ERR_FLG_VL_ACK_INPUT_PARITY_ERR_SMASK 0x40000ull
+#define DC_LCB_ERR_FLG_VL_ACK_INPUT_BUF_OFLW_SMASK 0x20000ull
+#define DC_LCB_ERR_FLG_FLIT_INPUT_BUF_OFLW_SMASK 0x10000ull
+#define DC_LCB_ERR_FLG_ILLEGAL_FLIT_ENCODING_SMASK 0x8000ull
+#define DC_LCB_ERR_FLG_ILLEGAL_NULL_LTP_SMASK 0x4000ull
+#define DC_LCB_ERR_FLG_UNEXPECTED_ROUND_TRIP_MARKER_SMASK 0x2000ull
+#define DC_LCB_ERR_FLG_UNEXPECTED_REPLAY_MARKER_SMASK 0x1000ull
+#define DC_LCB_ERR_FLG_RCLK_STOPPED_SMASK 0x800ull
+#define DC_LCB_ERR_FLG_CRC_ERR_CNT_HIT_LIMIT_SMASK 0x400ull
+#define DC_LCB_ERR_FLG_REINIT_FOR_LN_DEGRADE_SMASK 0x200ull
+#define DC_LCB_ERR_FLG_REINIT_FROM_PEER_SMASK 0x100ull
+#define DC_LCB_ERR_FLG_SEQ_CRC_ERR_SMASK 0x80ull
+#define DC_LCB_ERR_FLG_RX_LESS_THAN_FOUR_LNS_SMASK 0x40ull
+#define DC_LCB_ERR_FLG_TX_LESS_THAN_FOUR_LNS_SMASK 0x20ull
+#define DC_LCB_ERR_FLG_LOST_REINIT_STALL_OR_TOS_SMASK 0x10ull
+#define DC_LCB_ERR_FLG_ALL_LNS_FAILED_REINIT_TEST_SMASK 0x8ull
+#define DC_LCB_ERR_FLG_RST_FOR_FAILED_DESKEW_SMASK 0x4ull
+#define DC_LCB_ERR_FLG_INVALID_CSR_ADDR_SMASK 0x2ull
+#define DC_LCB_ERR_FLG_CSR_PARITY_ERR_SMASK 0x1ull
+#define DC_LCB_ERR_INFO_CRC_ERR_LN0 (DC_LCB_CSRS + 0x000000000328)
+#define DC_LCB_ERR_INFO_CRC_ERR_LN1 (DC_LCB_CSRS + 0x000000000330)
+#define DC_LCB_ERR_INFO_CRC_ERR_LN2 (DC_LCB_CSRS + 0x000000000338)
+#define DC_LCB_ERR_INFO_CRC_ERR_LN3 (DC_LCB_CSRS + 0x000000000340)
+#define DC_LCB_ERR_INFO_CRC_ERR_MULTI_LN (DC_LCB_CSRS + 0x000000000348)
+#define DC_LCB_ERR_INFO_ESCAPE_0_ONLY_CNT (DC_LCB_CSRS + 0x000000000368)
+#define DC_LCB_ERR_INFO_ESCAPE_0_PLUS1_CNT (DC_LCB_CSRS + 0x000000000370)
+#define DC_LCB_ERR_INFO_ESCAPE_0_PLUS2_CNT (DC_LCB_CSRS + 0x000000000378)
+#define DC_LCB_ERR_INFO_MISC_FLG_CNT (DC_LCB_CSRS + 0x000000000390)
+#define DC_LCB_ERR_INFO_REINIT_FROM_PEER_CNT (DC_LCB_CSRS + 0x000000000380)
+#define DC_LCB_ERR_INFO_RX_REPLAY_CNT (DC_LCB_CSRS + 0x000000000358)
+#define DC_LCB_ERR_INFO_SBE_CNT (DC_LCB_CSRS + 0x000000000388)
+#define DC_LCB_ERR_INFO_SEQ_CRC_CNT (DC_LCB_CSRS + 0x000000000360)
+#define DC_LCB_ERR_INFO_TOTAL_CRC_ERR (DC_LCB_CSRS + 0x000000000320)
+#define DC_LCB_ERR_INFO_TX_REPLAY_CNT (DC_LCB_CSRS + 0x000000000350)
+#define DC_LCB_PG_DBG_FLIT_CRDTS_CNT (DC_LCB_CSRS + 0x000000000580)
+#define DC_LCB_PG_STS_PAUSE_COMPLETE_CNT (DC_LCB_CSRS + 0x0000000005F8)
+#define DC_LCB_PG_STS_TX_MBE_CNT (DC_LCB_CSRS + 0x000000000608)
+#define DC_LCB_PG_STS_TX_SBE_CNT (DC_LCB_CSRS + 0x000000000600)
+#define DC_LCB_PRF_ACCEPTED_LTP_CNT (DC_LCB_CSRS + 0x000000000408)
+#define DC_LCB_PRF_CLK_CNTR (DC_LCB_CSRS + 0x000000000420)
+#define DC_LCB_PRF_GOOD_LTP_CNT (DC_LCB_CSRS + 0x000000000400)
+#define DC_LCB_PRF_RX_FLIT_CNT (DC_LCB_CSRS + 0x000000000410)
+#define DC_LCB_PRF_TX_FLIT_CNT (DC_LCB_CSRS + 0x000000000418)
+#define DC_LCB_STS_LINK_TRANSFER_ACTIVE (DC_LCB_CSRS + 0x000000000468)
+#define DC_LCB_STS_ROUND_TRIP_LTP_CNT (DC_LCB_CSRS + 0x0000000004B0)
+#define RCV_BUF_OVFL_CNT 10
+#define RCV_CONTEXT_EGR_STALL 22
+#define RCV_CONTEXT_RHQ_STALL 21
+#define RCV_DATA_PKT_CNT 0
+#define RCV_DWORD_CNT 1
+#define RCV_TID_FLOW_GEN_MISMATCH_CNT 20
+#define RCV_TID_FLOW_SEQ_MISMATCH_CNT 23
+#define RCV_TID_FULL_ERR_CNT 18
+#define RCV_TID_VALID_ERR_CNT 19
+#define RXE_NUM_32_BIT_COUNTERS 24
+#define RXE_NUM_64_BIT_COUNTERS 2
+#define RXE_NUM_RSM_INSTANCES 4
+#define RXE_NUM_TID_FLOWS 32
+#define RXE_PER_CONTEXT_OFFSET 0x0300000
+#define SEND_DATA_PKT_CNT 0
+#define SEND_DATA_PKT_VL0_CNT 12
+#define SEND_DATA_VL0_CNT 3
+#define SEND_DROPPED_PKT_CNT 5
+#define SEND_DWORD_CNT 1
+#define SEND_FLOW_STALL_CNT 4
+#define SEND_HEADERS_ERR_CNT 6
+#define SEND_LEN_ERR_CNT 1
+#define SEND_MAX_MIN_LEN_ERR_CNT 2
+#define SEND_UNDERRUN_CNT 3
+#define SEND_UNSUP_VL_ERR_CNT 0
+#define SEND_WAIT_CNT 2
+#define SEND_WAIT_VL0_CNT 21
+#define TXE_PIO_SEND_OFFSET 0x0800000
+#define ASIC_CFG_DRV_STR (ASIC + 0x000000000048)
+#define ASIC_CFG_MUTEX (ASIC + 0x000000000040)
+#define ASIC_CFG_SBUS_EXECUTE (ASIC + 0x000000000008)
+#define ASIC_CFG_SBUS_EXECUTE_EXECUTE_SMASK 0x1ull
+#define ASIC_CFG_SBUS_EXECUTE_FAST_MODE_SMASK 0x2ull
+#define ASIC_CFG_SBUS_REQUEST (ASIC + 0x000000000000)
+#define ASIC_CFG_SBUS_REQUEST_COMMAND_SHIFT 16
+#define ASIC_CFG_SBUS_REQUEST_DATA_ADDR_SHIFT 8
+#define ASIC_CFG_SBUS_REQUEST_DATA_IN_SHIFT 32
+#define ASIC_CFG_SBUS_REQUEST_RECEIVER_ADDR_SHIFT 0
+#define ASIC_CFG_SCRATCH (ASIC + 0x000000000020)
+#define ASIC_CFG_THERM_POLL_EN (ASIC + 0x000000000050)
+#define ASIC_EEP_ADDR_CMD (ASIC + 0x000000000308)
+#define ASIC_EEP_ADDR_CMD_EP_ADDR_MASK 0xFFFFFFull
+#define ASIC_EEP_CTL_STAT (ASIC + 0x000000000300)
+#define ASIC_EEP_CTL_STAT_EP_RESET_SMASK 0x4ull
+#define ASIC_EEP_CTL_STAT_RATE_SPI_SHIFT 8
+#define ASIC_EEP_CTL_STAT_RESETCSR 0x0000000083818000ull
+#define ASIC_EEP_DATA (ASIC + 0x000000000310)
+#define ASIC_GPIO_CLEAR (ASIC + 0x000000000230)
+#define ASIC_GPIO_FORCE (ASIC + 0x000000000238)
+#define ASIC_GPIO_IN (ASIC + 0x000000000200)
+#define ASIC_GPIO_INVERT (ASIC + 0x000000000210)
+#define ASIC_GPIO_MASK (ASIC + 0x000000000220)
+#define ASIC_GPIO_OE (ASIC + 0x000000000208)
+#define ASIC_GPIO_OUT (ASIC + 0x000000000218)
+#define ASIC_PCIE_SD_HOST_CMD (ASIC + 0x000000000100)
+#define ASIC_PCIE_SD_HOST_CMD_INTRPT_CMD_SHIFT 0
+#define ASIC_PCIE_SD_HOST_CMD_SBR_MODE_SMASK 0x400ull
+#define ASIC_PCIE_SD_HOST_CMD_SBUS_RCVR_ADDR_SHIFT 2
+#define ASIC_PCIE_SD_HOST_CMD_TIMER_MASK 0xFFFFFull
+#define ASIC_PCIE_SD_HOST_CMD_TIMER_SHIFT 12
+#define ASIC_PCIE_SD_HOST_STATUS (ASIC + 0x000000000108)
+#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_MASK 0x7ull
+#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_SHIFT 2
+#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_MASK 0x3ull
+#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_SHIFT 0
+#define ASIC_PCIE_SD_INTRPT_DATA_CODE (ASIC + 0x000000000110)
+#define ASIC_PCIE_SD_INTRPT_ENABLE (ASIC + 0x000000000118)
+#define ASIC_PCIE_SD_INTRPT_LIST (ASIC + 0x000000000180)
+#define ASIC_PCIE_SD_INTRPT_LIST_INTRPT_CODE_SHIFT 16
+#define ASIC_PCIE_SD_INTRPT_LIST_INTRPT_DATA_SHIFT 0
+#define ASIC_PCIE_SD_INTRPT_STATUS (ASIC + 0x000000000128)
+#define ASIC_QSFP1_CLEAR (ASIC + 0x000000000270)
+#define ASIC_QSFP1_FORCE (ASIC + 0x000000000278)
+#define ASIC_QSFP1_IN (ASIC + 0x000000000240)
+#define ASIC_QSFP1_INVERT (ASIC + 0x000000000250)
+#define ASIC_QSFP1_MASK (ASIC + 0x000000000260)
+#define ASIC_QSFP1_OE (ASIC + 0x000000000248)
+#define ASIC_QSFP1_OUT (ASIC + 0x000000000258)
+#define ASIC_QSFP1_STATUS (ASIC + 0x000000000268)
+#define ASIC_QSFP2_CLEAR (ASIC + 0x0000000002B0)
+#define ASIC_QSFP2_FORCE (ASIC + 0x0000000002B8)
+#define ASIC_QSFP2_IN (ASIC + 0x000000000280)
+#define ASIC_QSFP2_INVERT (ASIC + 0x000000000290)
+#define ASIC_QSFP2_MASK (ASIC + 0x0000000002A0)
+#define ASIC_QSFP2_OE (ASIC + 0x000000000288)
+#define ASIC_QSFP2_OUT (ASIC + 0x000000000298)
+#define ASIC_QSFP2_STATUS (ASIC + 0x0000000002A8)
+#define ASIC_STS_SBUS_COUNTERS (ASIC + 0x000000000018)
+#define ASIC_STS_SBUS_COUNTERS_EXECUTE_CNT_MASK 0xFFFFull
+#define ASIC_STS_SBUS_COUNTERS_EXECUTE_CNT_SHIFT 0
+#define ASIC_STS_SBUS_COUNTERS_RCV_DATA_VALID_CNT_MASK 0xFFFFull
+#define ASIC_STS_SBUS_COUNTERS_RCV_DATA_VALID_CNT_SHIFT 16
+#define ASIC_STS_SBUS_RESULT (ASIC + 0x000000000010)
+#define ASIC_STS_SBUS_RESULT_DONE_SMASK 0x1ull
+#define ASIC_STS_SBUS_RESULT_RCV_DATA_VALID_SMASK 0x2ull
+#define ASIC_STS_THERM (ASIC + 0x000000000058)
+#define ASIC_STS_THERM_CRIT_TEMP_MASK 0x7FFull
+#define ASIC_STS_THERM_CRIT_TEMP_SHIFT 18
+#define ASIC_STS_THERM_CURR_TEMP_MASK 0x7FFull
+#define ASIC_STS_THERM_CURR_TEMP_SHIFT 2
+#define ASIC_STS_THERM_HI_TEMP_MASK 0x7FFull
+#define ASIC_STS_THERM_HI_TEMP_SHIFT 50
+#define ASIC_STS_THERM_LO_TEMP_MASK 0x7FFull
+#define ASIC_STS_THERM_LO_TEMP_SHIFT 34
+#define ASIC_STS_THERM_LOW_SHIFT 13
+#define CCE_COUNTER_ARRAY32 (CCE + 0x000000000060)
+#define CCE_CTRL (CCE + 0x000000000010)
+#define CCE_CTRL_RXE_RESUME_SMASK 0x800ull
+#define CCE_CTRL_SPC_FREEZE_SMASK 0x100ull
+#define CCE_CTRL_SPC_UNFREEZE_SMASK 0x200ull
+#define CCE_CTRL_TXE_RESUME_SMASK 0x2000ull
+#define CCE_DC_CTRL (CCE + 0x0000000000B8)
+#define CCE_DC_CTRL_DC_RESET_SMASK 0x1ull
+#define CCE_DC_CTRL_RESETCSR 0x0000000000000001ull
+#define CCE_ERR_CLEAR (CCE + 0x000000000050)
+#define CCE_ERR_MASK (CCE + 0x000000000048)
+#define CCE_ERR_STATUS (CCE + 0x000000000040)
+#define CCE_ERR_STATUS_CCE_CLI0_ASYNC_FIFO_PARITY_ERR_SMASK 0x40ull
+#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_DBG_PARITY_ERROR_SMASK 0x1000ull
+#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_PIO_CRDT_PARITY_ERR_SMASK \
+               0x200ull
+#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_RXDMA_PARITY_ERROR_SMASK \
+               0x800ull
+#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_SDMA_HD_PARITY_ERR_SMASK \
+               0x400ull
+#define CCE_ERR_STATUS_CCE_CLI2_ASYNC_FIFO_PARITY_ERR_SMASK 0x100ull
+#define CCE_ERR_STATUS_CCE_CSR_CFG_BUS_PARITY_ERR_SMASK 0x80ull
+#define CCE_ERR_STATUS_CCE_CSR_PARITY_ERR_SMASK 0x1ull
+#define CCE_ERR_STATUS_CCE_CSR_READ_BAD_ADDR_ERR_SMASK 0x2ull
+#define CCE_ERR_STATUS_CCE_CSR_WRITE_BAD_ADDR_ERR_SMASK 0x4ull
+#define CCE_ERR_STATUS_CCE_INT_MAP_COR_ERR_SMASK 0x4000000000ull
+#define CCE_ERR_STATUS_CCE_INT_MAP_UNC_ERR_SMASK 0x8000000000ull
+#define CCE_ERR_STATUS_CCE_MSIX_CSR_PARITY_ERR_SMASK 0x10000000000ull
+#define CCE_ERR_STATUS_CCE_MSIX_TABLE_COR_ERR_SMASK 0x1000000000ull
+#define CCE_ERR_STATUS_CCE_MSIX_TABLE_UNC_ERR_SMASK 0x2000000000ull
+#define CCE_ERR_STATUS_CCE_RCPL_ASYNC_FIFO_PARITY_ERR_SMASK 0x400000000ull
+#define CCE_ERR_STATUS_CCE_RSPD_DATA_PARITY_ERR_SMASK 0x20ull
+#define CCE_ERR_STATUS_CCE_RXDMA_CONV_FIFO_PARITY_ERR_SMASK 0x800000000ull
+#define CCE_ERR_STATUS_CCE_SEG_READ_BAD_ADDR_ERR_SMASK 0x100000000ull
+#define CCE_ERR_STATUS_CCE_SEG_WRITE_BAD_ADDR_ERR_SMASK 0x200000000ull
+#define CCE_ERR_STATUS_CCE_TRGT_ACCESS_ERR_SMASK 0x10ull
+#define CCE_ERR_STATUS_CCE_TRGT_ASYNC_FIFO_PARITY_ERR_SMASK 0x8ull
+#define CCE_ERR_STATUS_CCE_TRGT_CPL_TIMEOUT_ERR_SMASK 0x40000000ull
+#define CCE_ERR_STATUS_LA_TRIGGERED_SMASK 0x80000000ull
+#define CCE_ERR_STATUS_PCIC_CPL_DAT_QCOR_ERR_SMASK 0x40000ull
+#define CCE_ERR_STATUS_PCIC_CPL_DAT_QUNC_ERR_SMASK 0x4000000ull
+#define CCE_ERR_STATUS_PCIC_CPL_HD_QCOR_ERR_SMASK 0x20000ull
+#define CCE_ERR_STATUS_PCIC_CPL_HD_QUNC_ERR_SMASK 0x2000000ull
+#define CCE_ERR_STATUS_PCIC_NPOST_DAT_QPARITY_ERR_SMASK 0x100000ull
+#define CCE_ERR_STATUS_PCIC_NPOST_HQ_PARITY_ERR_SMASK 0x80000ull
+#define CCE_ERR_STATUS_PCIC_POST_DAT_QCOR_ERR_SMASK 0x10000ull
+#define CCE_ERR_STATUS_PCIC_POST_DAT_QUNC_ERR_SMASK 0x1000000ull
+#define CCE_ERR_STATUS_PCIC_POST_HD_QCOR_ERR_SMASK 0x8000ull
+#define CCE_ERR_STATUS_PCIC_POST_HD_QUNC_ERR_SMASK 0x800000ull
+#define CCE_ERR_STATUS_PCIC_RECEIVE_PARITY_ERR_SMASK 0x20000000ull
+#define CCE_ERR_STATUS_PCIC_RETRY_MEM_COR_ERR_SMASK 0x2000ull
+#define CCE_ERR_STATUS_PCIC_RETRY_MEM_UNC_ERR_SMASK 0x200000ull
+#define CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_COR_ERR_SMASK 0x4000ull
+#define CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_UNC_ERR_SMASK 0x400000ull
+#define CCE_ERR_STATUS_PCIC_TRANSMIT_BACK_PARITY_ERR_SMASK 0x10000000ull
+#define CCE_ERR_STATUS_PCIC_TRANSMIT_FRONT_PARITY_ERR_SMASK 0x8000000ull
+#define CCE_INT_CLEAR (CCE + 0x000000110A00)
+#define CCE_INT_COUNTER_ARRAY32 (CCE + 0x000000110D00)
+#define CCE_INT_FORCE (CCE + 0x000000110B00)
+#define CCE_INT_MAP (CCE + 0x000000110500)
+#define CCE_INT_MASK (CCE + 0x000000110900)
+#define CCE_INT_STATUS (CCE + 0x000000110800)
+#define CCE_MSIX_INT_GRANTED (CCE + 0x000000110200)
+#define CCE_MSIX_TABLE_LOWER (CCE + 0x000000100000)
+#define CCE_MSIX_TABLE_UPPER (CCE + 0x000000100008)
+#define CCE_MSIX_TABLE_UPPER_RESETCSR 0x0000000100000000ull
+#define CCE_MSIX_VEC_CLR_WITHOUT_INT (CCE + 0x000000110400)
+#define CCE_REVISION (CCE + 0x000000000000)
+#define CCE_REVISION2 (CCE + 0x000000000008)
+#define CCE_REVISION2_HFI_ID_MASK 0x1ull
+#define CCE_REVISION2_HFI_ID_SHIFT 0
+#define CCE_REVISION2_IMPL_CODE_SHIFT 8
+#define CCE_REVISION2_IMPL_REVISION_SHIFT 16
+#define CCE_REVISION_BOARD_ID_LOWER_NIBBLE_MASK 0xFull
+#define CCE_REVISION_BOARD_ID_LOWER_NIBBLE_SHIFT 32
+#define CCE_REVISION_CHIP_REV_MAJOR_MASK 0xFFull
+#define CCE_REVISION_CHIP_REV_MAJOR_SHIFT 8
+#define CCE_REVISION_CHIP_REV_MINOR_MASK 0xFFull
+#define CCE_REVISION_CHIP_REV_MINOR_SHIFT 0
+#define CCE_REVISION_SW_MASK 0xFFull
+#define CCE_REVISION_SW_SHIFT 24
+#define CCE_SCRATCH (CCE + 0x000000000020)
+#define CCE_STATUS (CCE + 0x000000000018)
+#define CCE_STATUS_RXE_FROZE_SMASK 0x2ull
+#define CCE_STATUS_RXE_PAUSED_SMASK 0x20ull
+#define CCE_STATUS_SDMA_FROZE_SMASK 0x1ull
+#define CCE_STATUS_SDMA_PAUSED_SMASK 0x10ull
+#define CCE_STATUS_TXE_FROZE_SMASK 0x4ull
+#define CCE_STATUS_TXE_PAUSED_SMASK 0x40ull
+#define CCE_STATUS_TXE_PIO_FROZE_SMASK 0x8ull
+#define CCE_STATUS_TXE_PIO_PAUSED_SMASK 0x80ull
+#define MISC_CFG_FW_CTRL (MISC + 0x000000001000)
+#define MISC_CFG_FW_CTRL_FW_8051_LOADED_SMASK 0x2ull
+#define MISC_CFG_FW_CTRL_RSA_STATUS_SHIFT 2
+#define MISC_CFG_FW_CTRL_RSA_STATUS_SMASK 0xCull
+#define MISC_CFG_RSA_CMD (MISC + 0x000000000A08)
+#define MISC_CFG_RSA_MODULUS (MISC + 0x000000000400)
+#define MISC_CFG_RSA_MU (MISC + 0x000000000A10)
+#define MISC_CFG_RSA_R2 (MISC + 0x000000000000)
+#define MISC_CFG_RSA_SIGNATURE (MISC + 0x000000000200)
+#define MISC_CFG_SHA_PRELOAD (MISC + 0x000000000A00)
+#define MISC_ERR_CLEAR (MISC + 0x000000002010)
+#define MISC_ERR_MASK (MISC + 0x000000002008)
+#define MISC_ERR_STATUS (MISC + 0x000000002000)
+#define MISC_ERR_STATUS_MISC_PLL_LOCK_FAIL_ERR_SMASK 0x1000ull
+#define MISC_ERR_STATUS_MISC_MBIST_FAIL_ERR_SMASK 0x800ull
+#define MISC_ERR_STATUS_MISC_INVALID_EEP_CMD_ERR_SMASK 0x400ull
+#define MISC_ERR_STATUS_MISC_EFUSE_DONE_PARITY_ERR_SMASK 0x200ull
+#define MISC_ERR_STATUS_MISC_EFUSE_WRITE_ERR_SMASK 0x100ull
+#define MISC_ERR_STATUS_MISC_EFUSE_READ_BAD_ADDR_ERR_SMASK 0x80ull
+#define MISC_ERR_STATUS_MISC_EFUSE_CSR_PARITY_ERR_SMASK 0x40ull
+#define MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK 0x20ull
+#define MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK 0x10ull
+#define MISC_ERR_STATUS_MISC_SBUS_WRITE_FAILED_ERR_SMASK 0x8ull
+#define MISC_ERR_STATUS_MISC_CSR_WRITE_BAD_ADDR_ERR_SMASK 0x4ull
+#define MISC_ERR_STATUS_MISC_CSR_READ_BAD_ADDR_ERR_SMASK 0x2ull
+#define MISC_ERR_STATUS_MISC_CSR_PARITY_ERR_SMASK 0x1ull
+#define PCI_CFG_MSIX0 (PCIE + 0x0000000000B0)
+#define PCI_CFG_REG1 (PCIE + 0x000000000004)
+#define PCI_CFG_REG11 (PCIE + 0x00000000002C)
+#define PCIE_CFG_SPCIE1 (PCIE + 0x00000000014C)
+#define PCIE_CFG_SPCIE2 (PCIE + 0x000000000150)
+#define PCIE_CFG_TPH2 (PCIE + 0x000000000180)
+#define RCV_ARRAY (RXE + 0x000000200000)
+#define RCV_ARRAY_CNT (RXE + 0x000000000018)
+#define RCV_ARRAY_RT_ADDR_MASK 0xFFFFFFFFFull
+#define RCV_ARRAY_RT_ADDR_SHIFT 0
+#define RCV_ARRAY_RT_BUF_SIZE_SHIFT 36
+#define RCV_ARRAY_RT_WRITE_ENABLE_SMASK 0x8000000000000000ull
+#define RCV_AVAIL_TIME_OUT (RXE + 0x000000100050)
+#define RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_MASK 0xFFull
+#define RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT 0
+#define RCV_BTH_QP (RXE + 0x000000000028)
+#define RCV_BTH_QP_KDETH_QP_MASK 0xFFull
+#define RCV_BTH_QP_KDETH_QP_SHIFT 16
+#define RCV_BYPASS (RXE + 0x000000000038)
+#define RCV_CONTEXTS (RXE + 0x000000000010)
+#define RCV_COUNTER_ARRAY32 (RXE + 0x000000000400)
+#define RCV_COUNTER_ARRAY64 (RXE + 0x000000000500)
+#define RCV_CTRL (RXE + 0x000000000000)
+#define RCV_CTRL_RCV_BYPASS_ENABLE_SMASK 0x10ull
+#define RCV_CTRL_RCV_EXTENDED_PSN_ENABLE_SMASK 0x40ull
+#define RCV_CTRL_RCV_PARTITION_KEY_ENABLE_SMASK 0x4ull
+#define RCV_CTRL_RCV_PORT_ENABLE_SMASK 0x1ull
+#define RCV_CTRL_RCV_QP_MAP_ENABLE_SMASK 0x2ull
+#define RCV_CTRL_RCV_RSM_ENABLE_SMASK 0x20ull
+#define RCV_CTRL_RX_RBUF_INIT_SMASK 0x200ull
+#define RCV_CTXT_CTRL (RXE + 0x000000100000)
+#define RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK 0x4ull
+#define RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK 0x8ull
+#define RCV_CTXT_CTRL_EGR_BUF_SIZE_MASK 0x7ull
+#define RCV_CTXT_CTRL_EGR_BUF_SIZE_SHIFT 8
+#define RCV_CTXT_CTRL_EGR_BUF_SIZE_SMASK 0x700ull
+#define RCV_CTXT_CTRL_ENABLE_SMASK 0x1ull
+#define RCV_CTXT_CTRL_INTR_AVAIL_SMASK 0x20ull
+#define RCV_CTXT_CTRL_ONE_PACKET_PER_EGR_BUFFER_SMASK 0x2ull
+#define RCV_CTXT_CTRL_TAIL_UPD_SMASK 0x40ull
+#define RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK 0x10ull
+#define RCV_CTXT_STATUS (RXE + 0x000000100008)
+#define RCV_EGR_CTRL (RXE + 0x000000100010)
+#define RCV_EGR_CTRL_EGR_BASE_INDEX_MASK 0x1FFFull
+#define RCV_EGR_CTRL_EGR_BASE_INDEX_SHIFT 0
+#define RCV_EGR_CTRL_EGR_CNT_MASK 0x1FFull
+#define RCV_EGR_CTRL_EGR_CNT_SHIFT 32
+#define RCV_EGR_INDEX_HEAD (RXE + 0x000000300018)
+#define RCV_EGR_INDEX_HEAD_HEAD_MASK 0x7FFull
+#define RCV_EGR_INDEX_HEAD_HEAD_SHIFT 0
+#define RCV_ERR_CLEAR (RXE + 0x000000000070)
+#define RCV_ERR_INFO (RXE + 0x000000000050)
+#define RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SC_SMASK 0x1Full
+#define RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK 0x20ull
+#define RCV_ERR_MASK (RXE + 0x000000000068)
+#define RCV_ERR_STATUS (RXE + 0x000000000060)
+#define RCV_ERR_STATUS_RX_CSR_PARITY_ERR_SMASK 0x8000000000000000ull
+#define RCV_ERR_STATUS_RX_CSR_READ_BAD_ADDR_ERR_SMASK 0x2000000000000000ull
+#define RCV_ERR_STATUS_RX_CSR_WRITE_BAD_ADDR_ERR_SMASK \
+               0x4000000000000000ull
+#define RCV_ERR_STATUS_RX_DC_INTF_PARITY_ERR_SMASK 0x2ull
+#define RCV_ERR_STATUS_RX_DC_SOP_EOP_PARITY_ERR_SMASK 0x200ull
+#define RCV_ERR_STATUS_RX_DMA_CSR_COR_ERR_SMASK 0x1ull
+#define RCV_ERR_STATUS_RX_DMA_CSR_PARITY_ERR_SMASK 0x200000000000000ull
+#define RCV_ERR_STATUS_RX_DMA_CSR_UNC_ERR_SMASK 0x1000000000000000ull
+#define RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_COR_ERR_SMASK \
+               0x40000000000000ull
+#define RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_UNC_ERR_SMASK \
+               0x20000000000000ull
+#define RCV_ERR_STATUS_RX_DMA_DQ_FSM_ENCODING_ERR_SMASK \
+               0x800000000000000ull
+#define RCV_ERR_STATUS_RX_DMA_EQ_FSM_ENCODING_ERR_SMASK \
+               0x400000000000000ull
+#define RCV_ERR_STATUS_RX_DMA_FLAG_COR_ERR_SMASK 0x800ull
+#define RCV_ERR_STATUS_RX_DMA_FLAG_UNC_ERR_SMASK 0x400ull
+#define RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_COR_ERR_SMASK 0x10000000000000ull
+#define RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_UNC_ERR_SMASK 0x8000000000000ull
+#define RCV_ERR_STATUS_RX_HQ_INTR_CSR_PARITY_ERR_SMASK 0x200000000000ull
+#define RCV_ERR_STATUS_RX_HQ_INTR_FSM_ERR_SMASK 0x400000000000ull
+#define RCV_ERR_STATUS_RX_LOOKUP_CSR_PARITY_ERR_SMASK 0x100000000000ull
+#define RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_COR_ERR_SMASK \
+               0x10000000000ull
+#define RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_ERR_SMASK 0x8000000000ull
+#define RCV_ERR_STATUS_RX_LOOKUP_DES_PART2_PARITY_ERR_SMASK \
+               0x20000000000ull
+#define RCV_ERR_STATUS_RX_LOOKUP_RCV_ARRAY_COR_ERR_SMASK 0x80000000000ull
+#define RCV_ERR_STATUS_RX_LOOKUP_RCV_ARRAY_UNC_ERR_SMASK 0x40000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_BAD_LOOKUP_ERR_SMASK 0x40000000ull
+#define RCV_ERR_STATUS_RX_RBUF_BLOCK_LIST_READ_COR_ERR_SMASK 0x100000ull
+#define RCV_ERR_STATUS_RX_RBUF_BLOCK_LIST_READ_UNC_ERR_SMASK 0x80000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QENT_CNT_PARITY_ERR_SMASK 0x400000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QEOPDW_PARITY_ERR_SMASK 0x10000000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QHD_PTR_PARITY_ERR_SMASK 0x2000000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QHEAD_BUF_NUM_PARITY_ERR_SMASK \
+               0x200000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QNEXT_BUF_PARITY_ERR_SMASK 0x800000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QNUM_OF_PKT_PARITY_ERR_SMASK \
+               0x8000000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QTL_PTR_PARITY_ERR_SMASK 0x4000000ull
+#define RCV_ERR_STATUS_RX_RBUF_CSR_QVLD_BIT_PARITY_ERR_SMASK 0x1000000ull
+#define RCV_ERR_STATUS_RX_RBUF_CTX_ID_PARITY_ERR_SMASK 0x20000000ull
+#define RCV_ERR_STATUS_RX_RBUF_DATA_COR_ERR_SMASK 0x100000000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_DATA_UNC_ERR_SMASK 0x80000000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_DESC_PART1_COR_ERR_SMASK 0x1000000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_DESC_PART1_UNC_ERR_SMASK 0x800000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_DESC_PART2_COR_ERR_SMASK 0x4000000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_DESC_PART2_UNC_ERR_SMASK 0x2000000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_EMPTY_ERR_SMASK 0x100000000ull
+#define RCV_ERR_STATUS_RX_RBUF_FL_INITDONE_PARITY_ERR_SMASK 0x800000000ull
+#define RCV_ERR_STATUS_RX_RBUF_FL_INIT_WR_ADDR_PARITY_ERR_SMASK \
+               0x1000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_FL_RD_ADDR_PARITY_ERR_SMASK 0x200000000ull
+#define RCV_ERR_STATUS_RX_RBUF_FL_WR_ADDR_PARITY_ERR_SMASK 0x400000000ull
+#define RCV_ERR_STATUS_RX_RBUF_FREE_LIST_COR_ERR_SMASK 0x4000ull
+#define RCV_ERR_STATUS_RX_RBUF_FREE_LIST_UNC_ERR_SMASK 0x2000ull
+#define RCV_ERR_STATUS_RX_RBUF_FULL_ERR_SMASK 0x80000000ull
+#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_COR_ERR_SMASK 0x40000ull
+#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_COR_ERR_SMASK 0x10000ull
+#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_ERR_SMASK 0x8000ull
+#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_UNC_ERR_SMASK 0x20000ull
+#define RCV_ERR_STATUS_RX_RBUF_NEXT_FREE_BUF_COR_ERR_SMASK 0x4000000000ull
+#define RCV_ERR_STATUS_RX_RBUF_NEXT_FREE_BUF_UNC_ERR_SMASK 0x2000000000ull
+#define RCV_ERR_STATUS_RX_RCV_CSR_PARITY_ERR_SMASK 0x100ull
+#define RCV_ERR_STATUS_RX_RCV_DATA_COR_ERR_SMASK 0x20ull
+#define RCV_ERR_STATUS_RX_RCV_DATA_UNC_ERR_SMASK 0x10ull
+#define RCV_ERR_STATUS_RX_RCV_FSM_ENCODING_ERR_SMASK 0x1000ull
+#define RCV_ERR_STATUS_RX_RCV_HDR_COR_ERR_SMASK 0x8ull
+#define RCV_ERR_STATUS_RX_RCV_HDR_UNC_ERR_SMASK 0x4ull
+#define RCV_ERR_STATUS_RX_RCV_QP_MAP_TABLE_COR_ERR_SMASK 0x80ull
+#define RCV_ERR_STATUS_RX_RCV_QP_MAP_TABLE_UNC_ERR_SMASK 0x40ull
+#define RCV_HDR_ADDR (RXE + 0x000000100028)
+#define RCV_HDR_CNT (RXE + 0x000000100030)
+#define RCV_HDR_CNT_CNT_MASK 0x1FFull
+#define RCV_HDR_CNT_CNT_SHIFT 0
+#define RCV_HDR_ENT_SIZE (RXE + 0x000000100038)
+#define RCV_HDR_ENT_SIZE_ENT_SIZE_MASK 0x7ull
+#define RCV_HDR_ENT_SIZE_ENT_SIZE_SHIFT 0
+#define RCV_HDR_HEAD (RXE + 0x000000300008)
+#define RCV_HDR_HEAD_COUNTER_MASK 0xFFull
+#define RCV_HDR_HEAD_COUNTER_SHIFT 32
+#define RCV_HDR_HEAD_HEAD_MASK 0x7FFFFull
+#define RCV_HDR_HEAD_HEAD_SHIFT 0
+#define RCV_HDR_HEAD_HEAD_SMASK 0x7FFFFull
+#define RCV_HDR_OVFL_CNT (RXE + 0x000000100058)
+#define RCV_HDR_SIZE (RXE + 0x000000100040)
+#define RCV_HDR_SIZE_HDR_SIZE_MASK 0x1Full
+#define RCV_HDR_SIZE_HDR_SIZE_SHIFT 0
+#define RCV_HDR_TAIL (RXE + 0x000000300000)
+#define RCV_HDR_TAIL_ADDR (RXE + 0x000000100048)
+#define RCV_KEY_CTRL (RXE + 0x000000100020)
+#define RCV_KEY_CTRL_JOB_KEY_ENABLE_SMASK 0x200000000ull
+#define RCV_KEY_CTRL_JOB_KEY_VALUE_MASK 0xFFFFull
+#define RCV_KEY_CTRL_JOB_KEY_VALUE_SHIFT 0
+#define RCV_MULTICAST (RXE + 0x000000000030)
+#define RCV_PARTITION_KEY (RXE + 0x000000000200)
+#define RCV_PARTITION_KEY_PARTITION_KEY_A_MASK 0xFFFFull
+#define RCV_PARTITION_KEY_PARTITION_KEY_B_SHIFT 16
+#define RCV_QP_MAP_TABLE (RXE + 0x000000000100)
+#define RCV_RSM_CFG (RXE + 0x000000000600)
+#define RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_MASK 0x1ull
+#define RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_SHIFT 0
+#define RCV_RSM_CFG_PACKET_TYPE_SHIFT 60
+#define RCV_RSM_MAP_TABLE (RXE + 0x000000000900)
+#define RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK 0xFFull
+#define RCV_RSM_MATCH (RXE + 0x000000000800)
+#define RCV_RSM_MATCH_MASK1_SHIFT 0
+#define RCV_RSM_MATCH_MASK2_SHIFT 16
+#define RCV_RSM_MATCH_VALUE1_SHIFT 8
+#define RCV_RSM_MATCH_VALUE2_SHIFT 24
+#define RCV_RSM_SELECT (RXE + 0x000000000700)
+#define RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT 0
+#define RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT 16
+#define RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT 32
+#define RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT 44
+#define RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT 48
+#define RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT 60
+#define RCV_STATUS (RXE + 0x000000000008)
+#define RCV_STATUS_RX_PKT_IN_PROGRESS_SMASK 0x1ull
+#define RCV_STATUS_RX_RBUF_INIT_DONE_SMASK 0x200ull
+#define RCV_STATUS_RX_RBUF_PKT_PENDING_SMASK 0x40ull
+#define RCV_TID_CTRL (RXE + 0x000000100018)
+#define RCV_TID_CTRL_TID_BASE_INDEX_MASK 0x1FFFull
+#define RCV_TID_CTRL_TID_BASE_INDEX_SHIFT 0
+#define RCV_TID_CTRL_TID_PAIR_CNT_MASK 0x1FFull
+#define RCV_TID_CTRL_TID_PAIR_CNT_SHIFT 32
+#define RCV_TID_FLOW_TABLE (RXE + 0x000000300800)
+#define RCV_VL15 (RXE + 0x000000000048)
+#define SEND_BTH_QP (TXE + 0x0000000000A0)
+#define SEND_BTH_QP_KDETH_QP_MASK 0xFFull
+#define SEND_BTH_QP_KDETH_QP_SHIFT 16
+#define SEND_CM_CREDIT_USED_STATUS (TXE + 0x000000000510)
+#define SEND_CM_CREDIT_USED_STATUS_VL0_RETURN_CREDIT_STATUS_SMASK \
+               0x1000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL15_RETURN_CREDIT_STATUS_SMASK \
+               0x8000000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL1_RETURN_CREDIT_STATUS_SMASK \
+               0x2000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL2_RETURN_CREDIT_STATUS_SMASK \
+               0x4000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL3_RETURN_CREDIT_STATUS_SMASK \
+               0x8000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL4_RETURN_CREDIT_STATUS_SMASK \
+               0x10000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL5_RETURN_CREDIT_STATUS_SMASK \
+               0x20000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL6_RETURN_CREDIT_STATUS_SMASK \
+               0x40000000000000ull
+#define SEND_CM_CREDIT_USED_STATUS_VL7_RETURN_CREDIT_STATUS_SMASK \
+               0x80000000000000ull
+#define SEND_CM_CREDIT_VL (TXE + 0x000000000600)
+#define SEND_CM_CREDIT_VL15 (TXE + 0x000000000678)
+#define SEND_CM_CREDIT_VL15_DEDICATED_LIMIT_VL_SHIFT 0
+#define SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_MASK 0xFFFFull
+#define SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT 0
+#define SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SMASK 0xFFFFull
+#define SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_MASK 0xFFFFull
+#define SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SHIFT 16
+#define SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SMASK 0xFFFF0000ull
+#define SEND_CM_CTRL (TXE + 0x000000000500)
+#define SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK 0x8ull
+#define SEND_CM_CTRL_RESETCSR 0x0000000000000020ull
+#define SEND_CM_GLOBAL_CREDIT (TXE + 0x000000000508)
+#define SEND_CM_GLOBAL_CREDIT_AU_SHIFT 16
+#define SEND_CM_GLOBAL_CREDIT_RESETCSR 0x0000094000030000ull
+#define SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_MASK 0xFFFFull
+#define SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT 0
+#define SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SMASK 0xFFFFull
+#define SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_MASK 0xFFFFull
+#define SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT 32
+#define SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SMASK 0xFFFF00000000ull
+#define SEND_CM_LOCAL_AU_TABLE0_TO3 (TXE + 0x000000000520)
+#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE0_SHIFT 0
+#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE1_SHIFT 16
+#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE2_SHIFT 32
+#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE3_SHIFT 48
+#define SEND_CM_LOCAL_AU_TABLE4_TO7 (TXE + 0x000000000528)
+#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE4_SHIFT 0
+#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE5_SHIFT 16
+#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE6_SHIFT 32
+#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE7_SHIFT 48
+#define SEND_CM_REMOTE_AU_TABLE0_TO3 (TXE + 0x000000000530)
+#define SEND_CM_REMOTE_AU_TABLE4_TO7 (TXE + 0x000000000538)
+#define SEND_CM_TIMER_CTRL (TXE + 0x000000000518)
+#define SEND_CONTEXTS (TXE + 0x000000000010)
+#define SEND_CONTEXT_SET_CTRL (TXE + 0x000000000200)
+#define SEND_COUNTER_ARRAY32 (TXE + 0x000000000300)
+#define SEND_COUNTER_ARRAY64 (TXE + 0x000000000400)
+#define SEND_CTRL (TXE + 0x000000000000)
+#define SEND_CTRL_CM_RESET_SMASK 0x4ull
+#define SEND_CTRL_SEND_ENABLE_SMASK 0x1ull
+#define SEND_CTRL_VL_ARBITER_ENABLE_SMASK 0x2ull
+#define SEND_CTXT_CHECK_ENABLE (TXE + 0x000000100080)
+#define SEND_CTXT_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK 0x80ull
+#define SEND_CTXT_CHECK_ENABLE_CHECK_ENABLE_SMASK 0x1ull
+#define SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK 0x4ull
+#define SEND_CTXT_CHECK_ENABLE_CHECK_OPCODE_SMASK 0x20ull
+#define SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK 0x8ull
+#define SEND_CTXT_CHECK_ENABLE_CHECK_SLID_SMASK 0x10ull
+#define SEND_CTXT_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK 0x40ull
+#define SEND_CTXT_CHECK_ENABLE_CHECK_VL_SMASK 0x2ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK 0x20000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK \
+               0x200000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_SMASK 0x800ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_GRH_SMASK 0x400ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK 0x1000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_NON_KDETH_PACKETS_SMASK 0x2000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK \
+               0x100000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_TEST_SMASK 0x10000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK 0x200ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_SMASK 0x100ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK \
+               0x80000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK \
+               0x40000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK \
+               0x8000ull
+#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK \
+               0x4000ull
+#define SEND_CTXT_CHECK_JOB_KEY (TXE + 0x000000100090)
+#define SEND_CTXT_CHECK_JOB_KEY_ALLOW_PERMISSIVE_SMASK 0x100000000ull
+#define SEND_CTXT_CHECK_JOB_KEY_MASK_SMASK 0xFFFF0000ull
+#define SEND_CTXT_CHECK_JOB_KEY_VALUE_MASK 0xFFFFull
+#define SEND_CTXT_CHECK_JOB_KEY_VALUE_SHIFT 0
+#define SEND_CTXT_CHECK_OPCODE (TXE + 0x0000001000A8)
+#define SEND_CTXT_CHECK_OPCODE_MASK_SHIFT 8
+#define SEND_CTXT_CHECK_OPCODE_VALUE_SHIFT 0
+#define SEND_CTXT_CHECK_PARTITION_KEY (TXE + 0x000000100098)
+#define SEND_CTXT_CHECK_PARTITION_KEY_VALUE_MASK 0xFFFFull
+#define SEND_CTXT_CHECK_PARTITION_KEY_VALUE_SHIFT 0
+#define SEND_CTXT_CHECK_SLID (TXE + 0x0000001000A0)
+#define SEND_CTXT_CHECK_SLID_MASK_MASK 0xFFFFull
+#define SEND_CTXT_CHECK_SLID_MASK_SHIFT 16
+#define SEND_CTXT_CHECK_SLID_VALUE_MASK 0xFFFFull
+#define SEND_CTXT_CHECK_SLID_VALUE_SHIFT 0
+#define SEND_CTXT_CHECK_VL (TXE + 0x000000100088)
+#define SEND_CTXT_CREDIT_CTRL (TXE + 0x000000100010)
+#define SEND_CTXT_CREDIT_CTRL_CREDIT_INTR_SMASK 0x20000ull
+#define SEND_CTXT_CREDIT_CTRL_EARLY_RETURN_SMASK 0x10000ull
+#define SEND_CTXT_CREDIT_CTRL_THRESHOLD_MASK 0x7FFull
+#define SEND_CTXT_CREDIT_CTRL_THRESHOLD_SHIFT 0
+#define SEND_CTXT_CREDIT_CTRL_THRESHOLD_SMASK 0x7FFull
+#define SEND_CTXT_CREDIT_FORCE (TXE + 0x000000100028)
+#define SEND_CTXT_CREDIT_FORCE_FORCE_RETURN_SMASK 0x1ull
+#define SEND_CTXT_CREDIT_RETURN_ADDR (TXE + 0x000000100020)
+#define SEND_CTXT_CREDIT_RETURN_ADDR_ADDRESS_SMASK 0xFFFFFFFFFFC0ull
+#define SEND_CTXT_CTRL (TXE + 0x000000100000)
+#define SEND_CTXT_CTRL_CTXT_BASE_MASK 0x3FFFull
+#define SEND_CTXT_CTRL_CTXT_BASE_SHIFT 32
+#define SEND_CTXT_CTRL_CTXT_DEPTH_MASK 0x7FFull
+#define SEND_CTXT_CTRL_CTXT_DEPTH_SHIFT 48
+#define SEND_CTXT_CTRL_CTXT_ENABLE_SMASK 0x1ull
+#define SEND_CTXT_ERR_CLEAR (TXE + 0x000000100050)
+#define SEND_CTXT_ERR_MASK (TXE + 0x000000100048)
+#define SEND_CTXT_ERR_STATUS (TXE + 0x000000100040)
+#define SEND_CTXT_ERR_STATUS_PIO_DISALLOWED_PACKET_ERR_SMASK 0x2ull
+#define SEND_CTXT_ERR_STATUS_PIO_INCONSISTENT_SOP_ERR_SMASK 0x1ull
+#define SEND_CTXT_ERR_STATUS_PIO_WRITE_CROSSES_BOUNDARY_ERR_SMASK 0x4ull
+#define SEND_CTXT_ERR_STATUS_PIO_WRITE_OUT_OF_BOUNDS_ERR_SMASK 0x10ull
+#define SEND_CTXT_ERR_STATUS_PIO_WRITE_OVERFLOW_ERR_SMASK 0x8ull
+#define SEND_CTXT_STATUS (TXE + 0x000000100008)
+#define SEND_CTXT_STATUS_CTXT_HALTED_SMASK 0x1ull
+#define SEND_DMA_BASE_ADDR (TXE + 0x000000200010)
+#define SEND_DMA_CHECK_ENABLE (TXE + 0x000000200080)
+#define SEND_DMA_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK 0x80ull
+#define SEND_DMA_CHECK_ENABLE_CHECK_ENABLE_SMASK 0x1ull
+#define SEND_DMA_CHECK_ENABLE_CHECK_JOB_KEY_SMASK 0x4ull
+#define SEND_DMA_CHECK_ENABLE_CHECK_OPCODE_SMASK 0x20ull
+#define SEND_DMA_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK 0x8ull
+#define SEND_DMA_CHECK_ENABLE_CHECK_SLID_SMASK 0x10ull
+#define SEND_DMA_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK 0x40ull
+#define SEND_DMA_CHECK_ENABLE_CHECK_VL_SMASK 0x2ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK 0x20000ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK 0x200000ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK \
+               0x100000ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK 0x200ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_SMASK 0x100ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK \
+               0x80000ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK 0x40000ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK \
+               0x8000ull
+#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK 0x4000ull
+#define SEND_DMA_CHECK_JOB_KEY (TXE + 0x000000200090)
+#define SEND_DMA_CHECK_OPCODE (TXE + 0x0000002000A8)
+#define SEND_DMA_CHECK_PARTITION_KEY (TXE + 0x000000200098)
+#define SEND_DMA_CHECK_SLID (TXE + 0x0000002000A0)
+#define SEND_DMA_CHECK_SLID_MASK_MASK 0xFFFFull
+#define SEND_DMA_CHECK_SLID_MASK_SHIFT 16
+#define SEND_DMA_CHECK_SLID_VALUE_MASK 0xFFFFull
+#define SEND_DMA_CHECK_SLID_VALUE_SHIFT 0
+#define SEND_DMA_CHECK_VL (TXE + 0x000000200088)
+#define SEND_DMA_CTRL (TXE + 0x000000200000)
+#define SEND_DMA_CTRL_SDMA_CLEANUP_SMASK 0x4ull
+#define SEND_DMA_CTRL_SDMA_ENABLE_SMASK 0x1ull
+#define SEND_DMA_CTRL_SDMA_HALT_SMASK 0x2ull
+#define SEND_DMA_CTRL_SDMA_INT_ENABLE_SMASK 0x8ull
+#define SEND_DMA_DESC_CNT (TXE + 0x000000200050)
+#define SEND_DMA_DESC_CNT_CNT_MASK 0xFFFFull
+#define SEND_DMA_DESC_CNT_CNT_SHIFT 0
+#define SEND_DMA_ENG_ERR_CLEAR (TXE + 0x000000200070)
+#define SEND_DMA_ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_MASK 0x1ull
+#define SEND_DMA_ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SHIFT 18
+#define SEND_DMA_ENG_ERR_MASK (TXE + 0x000000200068)
+#define SEND_DMA_ENG_ERR_STATUS (TXE + 0x000000200060)
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_ASSEMBLY_UNC_ERR_SMASK 0x8000ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_DESC_TABLE_UNC_ERR_SMASK 0x4000ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_FIRST_DESC_ERR_SMASK 0x10ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_GEN_MISMATCH_ERR_SMASK 0x2ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_HALT_ERR_SMASK 0x40ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_ADDRESS_ERR_SMASK 0x800ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_LENGTH_ERR_SMASK 0x1000ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SMASK \
+               0x40000ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_SELECT_ERR_SMASK 0x400ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_STORAGE_UNC_ERR_SMASK \
+               0x20000ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_LENGTH_MISMATCH_ERR_SMASK 0x80ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_MEM_READ_ERR_SMASK 0x20ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_PACKET_DESC_OVERFLOW_ERR_SMASK \
+               0x100ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_PACKET_TRACKING_UNC_ERR_SMASK \
+               0x10000ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_TAIL_OUT_OF_BOUNDS_ERR_SMASK 0x8ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_TIMEOUT_ERR_SMASK 0x2000ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_TOO_LONG_ERR_SMASK 0x4ull
+#define SEND_DMA_ENG_ERR_STATUS_SDMA_WRONG_DW_ERR_SMASK 0x1ull
+#define SEND_DMA_ENGINES (TXE + 0x000000000018)
+#define SEND_DMA_ERR_CLEAR (TXE + 0x000000000070)
+#define SEND_DMA_ERR_MASK (TXE + 0x000000000068)
+#define SEND_DMA_ERR_STATUS (TXE + 0x000000000060)
+#define SEND_DMA_ERR_STATUS_SDMA_CSR_PARITY_ERR_SMASK 0x2ull
+#define SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_COR_ERR_SMASK 0x8ull
+#define SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_UNC_ERR_SMASK 0x4ull
+#define SEND_DMA_ERR_STATUS_SDMA_RPY_TAG_ERR_SMASK 0x1ull
+#define SEND_DMA_HEAD (TXE + 0x000000200028)
+#define SEND_DMA_HEAD_ADDR (TXE + 0x000000200030)
+#define SEND_DMA_LEN_GEN (TXE + 0x000000200018)
+#define SEND_DMA_LEN_GEN_GENERATION_SHIFT 16
+#define SEND_DMA_LEN_GEN_LENGTH_SHIFT 6
+#define SEND_DMA_MEMORY (TXE + 0x0000002000B0)
+#define SEND_DMA_MEMORY_SDMA_MEMORY_CNT_SHIFT 16
+#define SEND_DMA_MEMORY_SDMA_MEMORY_INDEX_SHIFT 0
+#define SEND_DMA_MEM_SIZE (TXE + 0x000000000028)
+#define SEND_DMA_PRIORITY_THLD (TXE + 0x000000200038)
+#define SEND_DMA_RELOAD_CNT (TXE + 0x000000200048)
+#define SEND_DMA_STATUS (TXE + 0x000000200008)
+#define SEND_DMA_STATUS_ENG_CLEANED_UP_SMASK 0x200000000000000ull
+#define SEND_DMA_STATUS_ENG_HALTED_SMASK 0x100000000000000ull
+#define SEND_DMA_TAIL (TXE + 0x000000200020)
+#define SEND_EGRESS_CTXT_STATUS (TXE + 0x000000000800)
+#define SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_HALT_STATUS_SMASK 0x10000ull
+#define SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SHIFT 0
+#define SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SMASK \
+               0x3FFFull
+#define SEND_EGRESS_ERR_CLEAR (TXE + 0x000000000090)
+#define SEND_EGRESS_ERR_INFO (TXE + 0x000000000F00)
+#define SEND_EGRESS_ERR_INFO_BAD_PKT_LEN_ERR_SMASK 0x20000ull
+#define SEND_EGRESS_ERR_INFO_BYPASS_ERR_SMASK 0x800ull
+#define SEND_EGRESS_ERR_INFO_GRH_ERR_SMASK 0x400ull
+#define SEND_EGRESS_ERR_INFO_JOB_KEY_ERR_SMASK 0x4ull
+#define SEND_EGRESS_ERR_INFO_KDETH_PACKETS_ERR_SMASK 0x1000ull
+#define SEND_EGRESS_ERR_INFO_NON_KDETH_PACKETS_ERR_SMASK 0x2000ull
+#define SEND_EGRESS_ERR_INFO_OPCODE_ERR_SMASK 0x20ull
+#define SEND_EGRESS_ERR_INFO_PARTITION_KEY_ERR_SMASK 0x8ull
+#define SEND_EGRESS_ERR_INFO_PBC_STATIC_RATE_CONTROL_ERR_SMASK 0x100000ull
+#define SEND_EGRESS_ERR_INFO_PBC_TEST_ERR_SMASK 0x10000ull
+#define SEND_EGRESS_ERR_INFO_RAW_ERR_SMASK 0x100ull
+#define SEND_EGRESS_ERR_INFO_RAW_IPV6_ERR_SMASK 0x200ull
+#define SEND_EGRESS_ERR_INFO_SLID_ERR_SMASK 0x10ull
+#define SEND_EGRESS_ERR_INFO_TOO_LONG_BYPASS_PACKETS_ERR_SMASK 0x80000ull
+#define SEND_EGRESS_ERR_INFO_TOO_LONG_IB_PACKET_ERR_SMASK 0x40000ull
+#define SEND_EGRESS_ERR_INFO_TOO_SMALL_BYPASS_PACKETS_ERR_SMASK 0x8000ull
+#define SEND_EGRESS_ERR_INFO_TOO_SMALL_IB_PACKETS_ERR_SMASK 0x4000ull
+#define SEND_EGRESS_ERR_INFO_VL_ERR_SMASK 0x2ull
+#define SEND_EGRESS_ERR_INFO_VL_MAPPING_ERR_SMASK 0x40ull
+#define SEND_EGRESS_ERR_MASK (TXE + 0x000000000088)
+#define SEND_EGRESS_ERR_SOURCE (TXE + 0x000000000F08)
+#define SEND_EGRESS_ERR_STATUS (TXE + 0x000000000080)
+#define SEND_EGRESS_ERR_STATUS_TX_CONFIG_PARITY_ERR_SMASK 0x8000ull
+#define SEND_EGRESS_ERR_STATUS_TX_CREDIT_OVERRUN_ERR_SMASK \
+               0x200000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_CREDIT_RETURN_PARITY_ERR_SMASK \
+               0x20000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_CREDIT_RETURN_VL_ERR_SMASK \
+               0x800000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_EGRESS_FIFO_COR_ERR_SMASK \
+               0x2000000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_EGRESS_FIFO_UNC_ERR_SMASK \
+               0x200000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_EGRESS_FIFO_UNDERRUN_OR_PARITY_ERR_SMASK \
+               0x8ull
+#define SEND_EGRESS_ERR_STATUS_TX_HCRC_INSERTION_ERR_SMASK \
+               0x400000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_ILLEGAL_VL_ERR_SMASK 0x1000ull
+#define SEND_EGRESS_ERR_STATUS_TX_INCORRECT_LINK_STATE_ERR_SMASK 0x20ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_CSR_PARITY_ERR_SMASK 0x2000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO0_COR_ERR_SMASK \
+               0x1000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO0_UNC_OR_PARITY_ERR_SMASK \
+               0x100000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO1_COR_ERR_SMASK \
+               0x2000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO1_UNC_OR_PARITY_ERR_SMASK \
+               0x200000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO2_COR_ERR_SMASK \
+               0x4000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO2_UNC_OR_PARITY_ERR_SMASK \
+               0x400000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO3_COR_ERR_SMASK \
+               0x8000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO3_UNC_OR_PARITY_ERR_SMASK \
+               0x800000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO4_COR_ERR_SMASK \
+               0x10000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO4_UNC_OR_PARITY_ERR_SMASK \
+               0x1000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO5_COR_ERR_SMASK \
+               0x20000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO5_UNC_OR_PARITY_ERR_SMASK \
+               0x2000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO6_COR_ERR_SMASK \
+               0x40000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO6_UNC_OR_PARITY_ERR_SMASK \
+               0x4000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO7_COR_ERR_SMASK \
+               0x80000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO7_UNC_OR_PARITY_ERR_SMASK \
+               0x8000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO8_COR_ERR_SMASK \
+               0x100000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO8_UNC_OR_PARITY_ERR_SMASK \
+               0x10000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_LINKDOWN_ERR_SMASK 0x10ull
+#define SEND_EGRESS_ERR_STATUS_TX_PIO_LAUNCH_INTF_PARITY_ERR_SMASK 0x80ull
+#define SEND_EGRESS_ERR_STATUS_TX_PKT_INTEGRITY_MEM_COR_ERR_SMASK 0x1ull
+#define SEND_EGRESS_ERR_STATUS_TX_PKT_INTEGRITY_MEM_UNC_ERR_SMASK 0x2ull
+#define SEND_EGRESS_ERR_STATUS_TX_READ_PIO_MEMORY_COR_ERR_SMASK \
+               0x1000000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_READ_PIO_MEMORY_CSR_UNC_ERR_SMASK \
+               0x8000000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_READ_PIO_MEMORY_UNC_ERR_SMASK \
+               0x100000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_READ_SDMA_MEMORY_COR_ERR_SMASK \
+               0x800000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_READ_SDMA_MEMORY_CSR_UNC_ERR_SMASK \
+               0x4000000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_READ_SDMA_MEMORY_UNC_ERR_SMASK \
+               0x80000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SB_HDR_COR_ERR_SMASK 0x400000000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SB_HDR_UNC_ERR_SMASK 0x40000000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SBRD_CTL_CSR_PARITY_ERR_SMASK 0x4000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SBRD_CTL_STATE_MACHINE_PARITY_ERR_SMASK \
+               0x800ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA0_DISALLOWED_PACKET_ERR_SMASK \
+               0x10000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA10_DISALLOWED_PACKET_ERR_SMASK \
+               0x4000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA11_DISALLOWED_PACKET_ERR_SMASK \
+               0x8000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA12_DISALLOWED_PACKET_ERR_SMASK \
+               0x10000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA13_DISALLOWED_PACKET_ERR_SMASK \
+               0x20000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA14_DISALLOWED_PACKET_ERR_SMASK \
+               0x40000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA15_DISALLOWED_PACKET_ERR_SMASK \
+               0x80000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA1_DISALLOWED_PACKET_ERR_SMASK \
+               0x20000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA2_DISALLOWED_PACKET_ERR_SMASK \
+               0x40000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA3_DISALLOWED_PACKET_ERR_SMASK \
+               0x80000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA4_DISALLOWED_PACKET_ERR_SMASK \
+               0x100000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA5_DISALLOWED_PACKET_ERR_SMASK \
+               0x200000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA6_DISALLOWED_PACKET_ERR_SMASK \
+               0x400000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA7_DISALLOWED_PACKET_ERR_SMASK \
+               0x800000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA8_DISALLOWED_PACKET_ERR_SMASK \
+               0x1000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA9_DISALLOWED_PACKET_ERR_SMASK \
+               0x2000000ull
+#define SEND_EGRESS_ERR_STATUS_TX_SDMA_LAUNCH_INTF_PARITY_ERR_SMASK \
+               0x100ull
+#define SEND_EGRESS_SEND_DMA_STATUS (TXE + 0x000000000E00)
+#define SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT 0
+#define SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SMASK \
+               0x3FFFull
+#define SEND_ERR_CLEAR (TXE + 0x0000000000F0)
+#define SEND_ERR_MASK (TXE + 0x0000000000E8)
+#define SEND_ERR_STATUS (TXE + 0x0000000000E0)
+#define SEND_ERR_STATUS_SEND_CSR_PARITY_ERR_SMASK 0x1ull
+#define SEND_ERR_STATUS_SEND_CSR_READ_BAD_ADDR_ERR_SMASK 0x2ull
+#define SEND_ERR_STATUS_SEND_CSR_WRITE_BAD_ADDR_ERR_SMASK 0x4ull
+#define SEND_HIGH_PRIORITY_LIMIT (TXE + 0x000000000030)
+#define SEND_HIGH_PRIORITY_LIMIT_LIMIT_MASK 0x3FFFull
+#define SEND_HIGH_PRIORITY_LIMIT_LIMIT_SHIFT 0
+#define SEND_HIGH_PRIORITY_LIST (TXE + 0x000000000180)
+#define SEND_LEN_CHECK0 (TXE + 0x0000000000D0)
+#define SEND_LEN_CHECK0_LEN_VL0_MASK 0xFFFull
+#define SEND_LEN_CHECK0_LEN_VL1_SHIFT 12
+#define SEND_LEN_CHECK1 (TXE + 0x0000000000D8)
+#define SEND_LEN_CHECK1_LEN_VL15_MASK 0xFFFull
+#define SEND_LEN_CHECK1_LEN_VL15_SHIFT 48
+#define SEND_LEN_CHECK1_LEN_VL4_MASK 0xFFFull
+#define SEND_LEN_CHECK1_LEN_VL5_SHIFT 12
+#define SEND_LOW_PRIORITY_LIST (TXE + 0x000000000100)
+#define SEND_LOW_PRIORITY_LIST_VL_MASK 0x7ull
+#define SEND_LOW_PRIORITY_LIST_VL_SHIFT 16
+#define SEND_LOW_PRIORITY_LIST_WEIGHT_MASK 0xFFull
+#define SEND_LOW_PRIORITY_LIST_WEIGHT_SHIFT 0
+#define SEND_PIO_ERR_CLEAR (TXE + 0x000000000050)
+#define SEND_PIO_ERR_CLEAR_PIO_INIT_SM_IN_ERR_SMASK 0x20000ull
+#define SEND_PIO_ERR_MASK (TXE + 0x000000000048)
+#define SEND_PIO_ERR_STATUS (TXE + 0x000000000040)
+#define SEND_PIO_ERR_STATUS_PIO_BLOCK_QW_COUNT_PARITY_ERR_SMASK \
+               0x1000000ull
+#define SEND_PIO_ERR_STATUS_PIO_CREDIT_RET_FIFO_PARITY_ERR_SMASK 0x8000ull
+#define SEND_PIO_ERR_STATUS_PIO_CSR_PARITY_ERR_SMASK 0x4ull
+#define SEND_PIO_ERR_STATUS_PIO_CURRENT_FREE_CNT_PARITY_ERR_SMASK \
+               0x100000000ull
+#define SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_COR_ERR_SMASK 0x100000ull
+#define SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_UNC_ERR_SMASK 0x80000ull
+#define SEND_PIO_ERR_STATUS_PIO_INIT_SM_IN_ERR_SMASK 0x20000ull
+#define SEND_PIO_ERR_STATUS_PIO_LAST_RETURNED_CNT_PARITY_ERR_SMASK \
+               0x200000000ull
+#define SEND_PIO_ERR_STATUS_PIO_PCC_FIFO_PARITY_ERR_SMASK 0x20ull
+#define SEND_PIO_ERR_STATUS_PIO_PCC_SOP_HEAD_PARITY_ERR_SMASK \
+               0x400000000ull
+#define SEND_PIO_ERR_STATUS_PIO_PEC_FIFO_PARITY_ERR_SMASK 0x40ull
+#define SEND_PIO_ERR_STATUS_PIO_PEC_SOP_HEAD_PARITY_ERR_SMASK \
+               0x800000000ull
+#define SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_FIFO_PARITY_ERR_SMASK 0x200ull
+#define SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_SM_OR_ARB_SM_ERR_SMASK 0x40000ull
+#define SEND_PIO_ERR_STATUS_PIO_PPMC_BQC_MEM_PARITY_ERR_SMASK 0x10000000ull
+#define SEND_PIO_ERR_STATUS_PIO_PPMC_PBL_FIFO_ERR_SMASK 0x10000ull
+#define SEND_PIO_ERR_STATUS_PIO_PPMC_SOP_LEN_ERR_SMASK 0x20000000ull
+#define SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO0_ERR_SMASK 0x8ull
+#define SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO1_ERR_SMASK 0x10ull
+#define SEND_PIO_ERR_STATUS_PIO_SBRDCTL_CRREL_PARITY_ERR_SMASK 0x80ull
+#define SEND_PIO_ERR_STATUS_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR_SMASK \
+               0x100ull
+#define SEND_PIO_ERR_STATUS_PIO_SM_PKT_RESET_PARITY_ERR_SMASK 0x400ull
+#define SEND_PIO_ERR_STATUS_PIO_STATE_MACHINE_ERR_SMASK 0x400000ull
+#define SEND_PIO_ERR_STATUS_PIO_VL_FIFO_PARITY_ERR_SMASK 0x8000000ull
+#define SEND_PIO_ERR_STATUS_PIO_VLF_SOP_PARITY_ERR_SMASK 0x4000000ull
+#define SEND_PIO_ERR_STATUS_PIO_VLF_VL_LEN_PARITY_ERR_SMASK 0x2000000ull
+#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_COR_ERR_SMASK 0x2000ull
+#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_UNC_ERR_SMASK 0x800ull
+#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_COR_ERR_SMASK 0x4000ull
+#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_UNC_ERR_SMASK 0x1000ull
+#define SEND_PIO_ERR_STATUS_PIO_WRITE_ADDR_PARITY_ERR_SMASK 0x2ull
+#define SEND_PIO_ERR_STATUS_PIO_WRITE_BAD_CTXT_ERR_SMASK 0x1ull
+#define SEND_PIO_ERR_STATUS_PIO_WRITE_DATA_PARITY_ERR_SMASK 0x200000ull
+#define SEND_PIO_ERR_STATUS_PIO_WRITE_QW_VALID_PARITY_ERR_SMASK 0x800000ull
+#define SEND_PIO_INIT_CTXT (TXE + 0x000000000038)
+#define SEND_PIO_INIT_CTXT_PIO_ALL_CTXT_INIT_SMASK 0x1ull
+#define SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_MASK 0xFFull
+#define SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_SHIFT 8
+#define SEND_PIO_INIT_CTXT_PIO_INIT_ERR_SMASK 0x8ull
+#define SEND_PIO_INIT_CTXT_PIO_INIT_IN_PROGRESS_SMASK 0x4ull
+#define SEND_PIO_INIT_CTXT_PIO_SINGLE_CTXT_INIT_SMASK 0x2ull
+#define SEND_PIO_MEM_SIZE (TXE + 0x000000000020)
+#define SEND_SC2VLT0 (TXE + 0x0000000000B0)
+#define SEND_SC2VLT0_SC0_SHIFT 0
+#define SEND_SC2VLT0_SC1_SHIFT 8
+#define SEND_SC2VLT0_SC2_SHIFT 16
+#define SEND_SC2VLT0_SC3_SHIFT 24
+#define SEND_SC2VLT0_SC4_SHIFT 32
+#define SEND_SC2VLT0_SC5_SHIFT 40
+#define SEND_SC2VLT0_SC6_SHIFT 48
+#define SEND_SC2VLT0_SC7_SHIFT 56
+#define SEND_SC2VLT1 (TXE + 0x0000000000B8)
+#define SEND_SC2VLT1_SC10_SHIFT 16
+#define SEND_SC2VLT1_SC11_SHIFT 24
+#define SEND_SC2VLT1_SC12_SHIFT 32
+#define SEND_SC2VLT1_SC13_SHIFT 40
+#define SEND_SC2VLT1_SC14_SHIFT 48
+#define SEND_SC2VLT1_SC15_SHIFT 56
+#define SEND_SC2VLT1_SC8_SHIFT 0
+#define SEND_SC2VLT1_SC9_SHIFT 8
+#define SEND_SC2VLT2 (TXE + 0x0000000000C0)
+#define SEND_SC2VLT2_SC16_SHIFT 0
+#define SEND_SC2VLT2_SC17_SHIFT 8
+#define SEND_SC2VLT2_SC18_SHIFT 16
+#define SEND_SC2VLT2_SC19_SHIFT 24
+#define SEND_SC2VLT2_SC20_SHIFT 32
+#define SEND_SC2VLT2_SC21_SHIFT 40
+#define SEND_SC2VLT2_SC22_SHIFT 48
+#define SEND_SC2VLT2_SC23_SHIFT 56
+#define SEND_SC2VLT3 (TXE + 0x0000000000C8)
+#define SEND_SC2VLT3_SC24_SHIFT 0
+#define SEND_SC2VLT3_SC25_SHIFT 8
+#define SEND_SC2VLT3_SC26_SHIFT 16
+#define SEND_SC2VLT3_SC27_SHIFT 24
+#define SEND_SC2VLT3_SC28_SHIFT 32
+#define SEND_SC2VLT3_SC29_SHIFT 40
+#define SEND_SC2VLT3_SC30_SHIFT 48
+#define SEND_SC2VLT3_SC31_SHIFT 56
+#define SEND_STATIC_RATE_CONTROL (TXE + 0x0000000000A8)
+#define SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SHIFT 0
+#define SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SMASK 0xFFFFull
+#define PCIE_CFG_REG_PL2 (PCIE + 0x000000000708)
+#define PCIE_CFG_REG_PL102 (PCIE + 0x000000000898)
+#define PCIE_CFG_REG_PL102_GEN3_EQ_POST_CURSOR_PSET_SHIFT 12
+#define PCIE_CFG_REG_PL102_GEN3_EQ_CURSOR_PSET_SHIFT 6
+#define PCIE_CFG_REG_PL102_GEN3_EQ_PRE_CURSOR_PSET_SHIFT 0
+#define PCIE_CFG_REG_PL103 (PCIE + 0x00000000089C)
+#define PCIE_CFG_REG_PL105 (PCIE + 0x0000000008A4)
+#define PCIE_CFG_REG_PL105_GEN3_EQ_VIOLATE_COEF_RULES_SMASK 0x1ull
+#define PCIE_CFG_REG_PL2_LOW_PWR_ENT_CNT_SHIFT 24
+#define PCIE_CFG_REG_PL100 (PCIE + 0x000000000890)
+#define PCIE_CFG_REG_PL100_EQ_EIEOS_CNT_SMASK 0x400ull
+#define PCIE_CFG_REG_PL101 (PCIE + 0x000000000894)
+#define PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_FS_SHIFT 6
+#define PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_LF_SHIFT 0
+#define PCIE_CFG_REG_PL106 (PCIE + 0x0000000008A8)
+#define PCIE_CFG_REG_PL106_GEN3_EQ_PSET_REQ_VEC_SHIFT 8
+#define PCIE_CFG_REG_PL106_GEN3_EQ_EVAL2MS_DISABLE_SMASK 0x20ull
+#define PCIE_CFG_REG_PL106_GEN3_EQ_PHASE23_EXIT_MODE_SMASK 0x10ull
+#define CCE_INT_BLOCKED (CCE + 0x000000110C00)
+#define SEND_DMA_IDLE_CNT (TXE + 0x000000200040)
+#define SEND_DMA_DESC_FETCHED_CNT (TXE + 0x000000200058)
+
+#endif          /* DEF_CHIP_REG */
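
The register definitions above follow a single convention: each field of a 64-bit CSR gets a *_SHIFT, an unshifted *_MASK, and usually a pre-shifted *_SMASK. As a minimal, hypothetical sketch (not part of this patch), this is how such triplets are typically combined to read and update a field; field_get() and field_set() are made-up helper names used only for illustration.

#include <stdint.h>

/* Illustrative only: generic accessors for the MASK/SHIFT convention above. */
static inline uint64_t field_get(uint64_t csr, uint64_t mask, int shift)
{
        return (csr >> shift) & mask;            /* isolate one field */
}

static inline uint64_t field_set(uint64_t csr, uint64_t mask, int shift,
                                 uint64_t val)
{
        csr &= ~(mask << shift);                 /* clear the old field */
        return csr | ((val & mask) << shift);    /* merge in the new value */
}

/* e.g. the header count of a cached RCV_HDR_CNT value:
 *   cnt = field_get(csr, RCV_HDR_CNT_CNT_MASK, RCV_HDR_CNT_CNT_SHIFT);
 */
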
diff --git a/drivers/staging/rdma/hfi1/common.h b/drivers/staging/rdma/hfi1/common.h
new file mode 100644 (file)
index 0000000..5f22937
--- /dev/null
@@ -0,0 +1,415 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef _COMMON_H
+#define _COMMON_H
+
+#include <rdma/hfi/hfi1_user.h>
+
+/*
+ * This file contains defines, structures, etc. that are used
+ * to communicate between kernel and user code.
+ */
+
+/* version of protocol header (known to chip also). In the long run,
+ * we should be able to generate and accept a range of version numbers;
+ * for now we only accept one, and it's compiled in.
+ */
+#define IPS_PROTO_VERSION 2
+
+/*
+ * These are compile time constants that you may want to enable or disable
+ * if you are trying to debug problems with code or performance.
+ * HFI1_VERBOSE_TRACING - define as 1 if you want additional tracing in
+ *   fast path code
+ * HFI1_TRACE_REGWRITES - define as 1 if you want register writes to be
+ *   traced in fast path code
+ * _HFI1_TRACING - define as 0 if you want to remove all tracing in a
+ *   compilation unit
+ */
+
+/*
+ * If a packet's QP[23:16] bits match this value, then it is
+ * a PSM packet and the hardware will expect a KDETH header
+ * following the BTH.
+ */
+#define DEFAULT_KDETH_QP 0x80
+
+/* driver/hw feature set bitmask */
+#define HFI1_CAP_USER_SHIFT      24
+#define HFI1_CAP_MASK            ((1UL << HFI1_CAP_USER_SHIFT) - 1)
+/* locked flag - if set, only HFI1_CAP_WRITABLE_MASK bits can be set */
+#define HFI1_CAP_LOCKED_SHIFT    63
+#define HFI1_CAP_LOCKED_MASK     0x1ULL
+#define HFI1_CAP_LOCKED_SMASK    (HFI1_CAP_LOCKED_MASK << HFI1_CAP_LOCKED_SHIFT)
+/* extra bits used between kernel and user processes */
+#define HFI1_CAP_MISC_SHIFT      (HFI1_CAP_USER_SHIFT * 2)
+#define HFI1_CAP_MISC_MASK       ((1ULL << (HFI1_CAP_LOCKED_SHIFT - \
+                                          HFI1_CAP_MISC_SHIFT)) - 1)
+
+#define HFI1_CAP_KSET(cap) ({ hfi1_cap_mask |= HFI1_CAP_##cap; hfi1_cap_mask; })
+#define HFI1_CAP_KCLEAR(cap)                                           \
+       ({                                                              \
+               hfi1_cap_mask &= ~HFI1_CAP_##cap;                       \
+               hfi1_cap_mask;                                          \
+       })
+#define HFI1_CAP_USET(cap)                                             \
+       ({                                                              \
+               hfi1_cap_mask |= (HFI1_CAP_##cap << HFI1_CAP_USER_SHIFT); \
+               hfi1_cap_mask;                                          \
+               })
+#define HFI1_CAP_UCLEAR(cap)                                           \
+       ({                                                              \
+               hfi1_cap_mask &= ~(HFI1_CAP_##cap << HFI1_CAP_USER_SHIFT); \
+               hfi1_cap_mask;                                          \
+       })
+#define HFI1_CAP_SET(cap)                                              \
+       ({                                                              \
+               hfi1_cap_mask |= (HFI1_CAP_##cap | (HFI1_CAP_##cap <<   \
+                                                 HFI1_CAP_USER_SHIFT)); \
+               hfi1_cap_mask;                                          \
+       })
+#define HFI1_CAP_CLEAR(cap)                                            \
+       ({                                                              \
+               hfi1_cap_mask &= ~(HFI1_CAP_##cap |                     \
+                                 (HFI1_CAP_##cap << HFI1_CAP_USER_SHIFT)); \
+               hfi1_cap_mask;                                          \
+       })
+#define HFI1_CAP_LOCK()                                                        \
+       ({ hfi1_cap_mask |= HFI1_CAP_LOCKED_SMASK; hfi1_cap_mask; })
+#define HFI1_CAP_LOCKED() (!!(hfi1_cap_mask & HFI1_CAP_LOCKED_SMASK))
+/*
+ * The set of capability bits that can be changed after initial load
+ * This set is the same for kernel and user contexts. However, for
+ * user contexts, the set can be further filtered by using the
+ * HFI1_CAP_RESERVED_MASK bits.
+ */
+#define HFI1_CAP_WRITABLE_MASK   (HFI1_CAP_SDMA_AHG |                  \
+                                HFI1_CAP_HDRSUPP |                     \
+                                HFI1_CAP_MULTI_PKT_EGR |               \
+                                HFI1_CAP_NODROP_RHQ_FULL |             \
+                                HFI1_CAP_NODROP_EGR_FULL |             \
+                                HFI1_CAP_ALLOW_PERM_JKEY |             \
+                                HFI1_CAP_STATIC_RATE_CTRL |            \
+                                HFI1_CAP_PRINT_UNIMPL)
+/*
+ * A set of capability bits that are "global" and are not allowed to be
+ * set in the user bitmask.
+ */
+#define HFI1_CAP_RESERVED_MASK   ((HFI1_CAP_SDMA |                     \
+                                 HFI1_CAP_USE_SDMA_HEAD |              \
+                                 HFI1_CAP_EXTENDED_PSN |               \
+                                 HFI1_CAP_PRINT_UNIMPL |               \
+                                 HFI1_CAP_QSFP_ENABLED |               \
+                                 HFI1_CAP_NO_INTEGRITY |               \
+                                 HFI1_CAP_PKEY_CHECK) <<               \
+                                HFI1_CAP_USER_SHIFT)
+/*
+ * Set of capabilities that need to be enabled for kernel context in
+ * order to be allowed for user contexts, as well.
+ */
+#define HFI1_CAP_MUST_HAVE_KERN (HFI1_CAP_STATIC_RATE_CTRL)
+/* Default enabled capabilities (both kernel and user) */
+#define HFI1_CAP_MASK_DEFAULT    (HFI1_CAP_HDRSUPP |                   \
+                                HFI1_CAP_NODROP_RHQ_FULL |             \
+                                HFI1_CAP_NODROP_EGR_FULL |             \
+                                HFI1_CAP_SDMA |                        \
+                                HFI1_CAP_PRINT_UNIMPL |                \
+                                HFI1_CAP_STATIC_RATE_CTRL |            \
+                                HFI1_CAP_QSFP_ENABLED |                \
+                                HFI1_CAP_PKEY_CHECK |                  \
+                                HFI1_CAP_MULTI_PKT_EGR |               \
+                                HFI1_CAP_EXTENDED_PSN |                \
+                                ((HFI1_CAP_HDRSUPP |                   \
+                                  HFI1_CAP_MULTI_PKT_EGR |             \
+                                  HFI1_CAP_STATIC_RATE_CTRL |          \
+                                  HFI1_CAP_PKEY_CHECK |                \
+                                  HFI1_CAP_EARLY_CREDIT_RETURN) <<     \
+                                 HFI1_CAP_USER_SHIFT))
+/*
+ * A bitmask of kernel/global capabilities that should be communicated
+ * to user level processes.
+ */
+#define HFI1_CAP_K2U (HFI1_CAP_SDMA |                  \
+                    HFI1_CAP_EXTENDED_PSN |            \
+                    HFI1_CAP_PKEY_CHECK |              \
+                    HFI1_CAP_NO_INTEGRITY)
+
+#define HFI1_USER_SWVERSION ((HFI1_USER_SWMAJOR << 16) | HFI1_USER_SWMINOR)
+
+#ifndef HFI1_KERN_TYPE
+#define HFI1_KERN_TYPE 0
+#endif
+
+/*
+ * Similarly, this is the kernel version going back to the user.  It's
+ * slightly different, in that we want to tell if the driver was built as
+ * part of an Intel release, or from openfabrics.org, kernel.org, or a
+ * standard distribution, for support reasons.
+ * The high bit is 0 for non-Intel and 1 for Intel-built/supplied.
+ *
+ * It's returned by the driver to the user code during initialization in the
+ * spi_sw_version field of hfi1_base_info, so the user code can in turn
+ * check for compatibility with the kernel.
+ */
+#define HFI1_KERN_SWVERSION ((HFI1_KERN_TYPE << 31) | HFI1_USER_SWVERSION)
+
+/*
+ * Define the driver version number.  This is something that refers only
+ * to the driver itself, not the software interfaces it supports.
+ */
+#ifndef HFI1_DRIVER_VERSION_BASE
+#define HFI1_DRIVER_VERSION_BASE "0.9-248"
+#endif
+
+/* create the final driver version string */
+#ifdef HFI1_IDSTR
+#define HFI1_DRIVER_VERSION HFI1_DRIVER_VERSION_BASE " " HFI1_IDSTR
+#else
+#define HFI1_DRIVER_VERSION HFI1_DRIVER_VERSION_BASE
+#endif
+
+/*
+ * Diagnostics can send a packet by writing the following
+ * struct to the diag packet special file.
+ *
+ * This allows a custom PBC qword, so that special modes and deliberate
+ * changes to CRCs can be used.
+ */
+#define _DIAG_PKT_VERS 1
+struct diag_pkt {
+       __u16 version;          /* structure version */
+       __u16 unit;             /* which device */
+       __u16 sw_index;         /* send sw index to use */
+       __u16 len;              /* data length, in bytes */
+       __u16 port;             /* port number */
+       __u16 unused;
+       __u32 flags;            /* call flags */
+       __u64 data;             /* user data pointer */
+       __u64 pbc;              /* PBC for the packet */
+};
+
+/* diag_pkt flags */
+#define F_DIAGPKT_WAIT 0x1     /* wait until packet is sent */
+
+/*
+ * The next set of defines are for packet headers, and chip register
+ * and memory bits that are visible to and/or used by user-mode software.
+ */
+
+/*
+ * Receive Header Flags
+ */
+#define RHF_PKT_LEN_SHIFT      0
+#define RHF_PKT_LEN_MASK       0xfffull
+#define RHF_PKT_LEN_SMASK (RHF_PKT_LEN_MASK << RHF_PKT_LEN_SHIFT)
+
+#define RHF_RCV_TYPE_SHIFT     12
+#define RHF_RCV_TYPE_MASK      0x7ull
+#define RHF_RCV_TYPE_SMASK (RHF_RCV_TYPE_MASK << RHF_RCV_TYPE_SHIFT)
+
+#define RHF_USE_EGR_BFR_SHIFT  15
+#define RHF_USE_EGR_BFR_MASK   0x1ull
+#define RHF_USE_EGR_BFR_SMASK (RHF_USE_EGR_BFR_MASK << RHF_USE_EGR_BFR_SHIFT)
+
+#define RHF_EGR_INDEX_SHIFT    16
+#define RHF_EGR_INDEX_MASK     0x7ffull
+#define RHF_EGR_INDEX_SMASK (RHF_EGR_INDEX_MASK << RHF_EGR_INDEX_SHIFT)
+
+#define RHF_DC_INFO_SHIFT      27
+#define RHF_DC_INFO_MASK       0x1ull
+#define RHF_DC_INFO_SMASK (RHF_DC_INFO_MASK << RHF_DC_INFO_SHIFT)
+
+#define RHF_RCV_SEQ_SHIFT      28
+#define RHF_RCV_SEQ_MASK       0xfull
+#define RHF_RCV_SEQ_SMASK (RHF_RCV_SEQ_MASK << RHF_RCV_SEQ_SHIFT)
+
+#define RHF_EGR_OFFSET_SHIFT   32
+#define RHF_EGR_OFFSET_MASK    0xfffull
+#define RHF_EGR_OFFSET_SMASK (RHF_EGR_OFFSET_MASK << RHF_EGR_OFFSET_SHIFT)
+#define RHF_HDRQ_OFFSET_SHIFT  44
+#define RHF_HDRQ_OFFSET_MASK   0x1ffull
+#define RHF_HDRQ_OFFSET_SMASK (RHF_HDRQ_OFFSET_MASK << RHF_HDRQ_OFFSET_SHIFT)
+#define RHF_K_HDR_LEN_ERR      (0x1ull << 53)
+#define RHF_DC_UNC_ERR         (0x1ull << 54)
+#define RHF_DC_ERR             (0x1ull << 55)
+#define RHF_RCV_TYPE_ERR_SHIFT 56
+#define RHF_RCV_TYPE_ERR_MASK  0x7ull
+#define RHF_RCV_TYPE_ERR_SMASK (RHF_RCV_TYPE_ERR_MASK << RHF_RCV_TYPE_ERR_SHIFT)
+#define RHF_TID_ERR            (0x1ull << 59)
+#define RHF_LEN_ERR            (0x1ull << 60)
+#define RHF_ECC_ERR            (0x1ull << 61)
+#define RHF_VCRC_ERR           (0x1ull << 62)
+#define RHF_ICRC_ERR           (0x1ull << 63)
+
+#define RHF_ERROR_SMASK 0xffe0000000000000ull          /* bits 63:53 */
+
+/* RHF receive types */
+#define RHF_RCV_TYPE_EXPECTED 0
+#define RHF_RCV_TYPE_EAGER    1
+#define RHF_RCV_TYPE_IB       2 /* normal IB, IB Raw, or IPv6 */
+#define RHF_RCV_TYPE_ERROR    3
+#define RHF_RCV_TYPE_BYPASS   4
+#define RHF_RCV_TYPE_INVALID5 5
+#define RHF_RCV_TYPE_INVALID6 6
+#define RHF_RCV_TYPE_INVALID7 7
+
+/* RHF receive type error - expected packet errors */
+#define RHF_RTE_EXPECTED_FLOW_SEQ_ERR  0x2
+#define RHF_RTE_EXPECTED_FLOW_GEN_ERR  0x4
+
+/* RHF receive type error - eager packet errors */
+#define RHF_RTE_EAGER_NO_ERR           0x0
+
+/* RHF receive type error - IB packet errors */
+#define RHF_RTE_IB_NO_ERR              0x0
+
+/* RHF receive type error - error packet errors */
+#define RHF_RTE_ERROR_NO_ERR           0x0
+#define RHF_RTE_ERROR_OP_CODE_ERR      0x1
+#define RHF_RTE_ERROR_KHDR_MIN_LEN_ERR 0x2
+#define RHF_RTE_ERROR_KHDR_HCRC_ERR    0x3
+#define RHF_RTE_ERROR_KHDR_KVER_ERR    0x4
+#define RHF_RTE_ERROR_CONTEXT_ERR      0x5
+#define RHF_RTE_ERROR_KHDR_TID_ERR     0x6
+
+/* RHF receive type error - bypass packet errors */
+#define RHF_RTE_BYPASS_NO_ERR          0x0
+
+/*
+ * This structure contains the first field common to all protocols
+ * that employ this chip.
+ */
+struct hfi1_message_header {
+       __be16 lrh[4];
+};
+
+/* IB - LRH header constants */
+#define HFI1_LRH_GRH 0x0003      /* 1st word of IB LRH - next header: GRH */
+#define HFI1_LRH_BTH 0x0002      /* 1st word of IB LRH - next header: BTH */
+
+/* misc. */
+#define SIZE_OF_CRC 1
+
+#define LIM_MGMT_P_KEY       0x7FFF
+#define FULL_MGMT_P_KEY      0xFFFF
+
+#define DEFAULT_P_KEY LIM_MGMT_P_KEY
+#define HFI1_PERMISSIVE_LID 0xFFFF
+#define HFI1_AETH_CREDIT_SHIFT 24
+#define HFI1_AETH_CREDIT_MASK 0x1F
+#define HFI1_AETH_CREDIT_INVAL 0x1F
+#define HFI1_MSN_MASK 0xFFFFFF
+#define HFI1_QPN_MASK 0xFFFFFF
+#define HFI1_FECN_SHIFT 31
+#define HFI1_FECN_MASK 1
+#define HFI1_FECN_SMASK (1 << HFI1_FECN_SHIFT)
+#define HFI1_BECN_SHIFT 30
+#define HFI1_BECN_MASK 1
+#define HFI1_BECN_SMASK (1 << HFI1_BECN_SHIFT)
+#define HFI1_MULTICAST_LID_BASE 0xC000
+
+static inline __u64 rhf_to_cpu(const __le32 *rbuf)
+{
+       return __le64_to_cpu(*((__le64 *)rbuf));
+}
+
+static inline u64 rhf_err_flags(u64 rhf)
+{
+       return rhf & RHF_ERROR_SMASK;
+}
+
+static inline u32 rhf_rcv_type(u64 rhf)
+{
+       return (rhf >> RHF_RCV_TYPE_SHIFT) & RHF_RCV_TYPE_MASK;
+}
+
+static inline u32 rhf_rcv_type_err(u64 rhf)
+{
+       return (rhf >> RHF_RCV_TYPE_ERR_SHIFT) & RHF_RCV_TYPE_ERR_MASK;
+}
+
+/* return size is in bytes, not DWORDs */
+static inline u32 rhf_pkt_len(u64 rhf)
+{
+       return ((rhf & RHF_PKT_LEN_SMASK) >> RHF_PKT_LEN_SHIFT) << 2;
+}
+
+static inline u32 rhf_egr_index(u64 rhf)
+{
+       return (rhf >> RHF_EGR_INDEX_SHIFT) & RHF_EGR_INDEX_MASK;
+}
+
+static inline u32 rhf_rcv_seq(u64 rhf)
+{
+       return (rhf >> RHF_RCV_SEQ_SHIFT) & RHF_RCV_SEQ_MASK;
+}
+
+/* returned offset is in DWORDS */
+static inline u32 rhf_hdrq_offset(u64 rhf)
+{
+       return (rhf >> RHF_HDRQ_OFFSET_SHIFT) & RHF_HDRQ_OFFSET_MASK;
+}
+
+static inline u64 rhf_use_egr_bfr(u64 rhf)
+{
+       return rhf & RHF_USE_EGR_BFR_SMASK;
+}
+
+static inline u64 rhf_dc_info(u64 rhf)
+{
+       return rhf & RHF_DC_INFO_SMASK;
+}
+
+static inline u32 rhf_egr_buf_offset(u64 rhf)
+{
+       return (rhf >> RHF_EGR_OFFSET_SHIFT) & RHF_EGR_OFFSET_MASK;
+}
+#endif /* _COMMON_H */
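
The capability macros earlier in this header pack two copies of the driver/hw feature bits into one 64-bit word: the kernel copy in bits 23:0, the user copy in bits 47:24, miscellaneous bits in 62:48, and a lock flag in bit 63. Below is a standalone sketch of that layout; it is not part of the patch, and CAP_EXAMPLE is an invented bit used purely for illustration.

#include <stdint.h>
#include <stdio.h>

#define CAP_USER_SHIFT   24
#define CAP_MASK         ((1ULL << CAP_USER_SHIFT) - 1)   /* bits 23:0 */
#define CAP_LOCKED_SMASK (1ULL << 63)                     /* lock flag */

#define CAP_EXAMPLE      (1ULL << 3)     /* hypothetical capability bit */

int main(void)
{
        uint64_t cap_mask = 0;

        cap_mask |= CAP_EXAMPLE;                       /* kernel copy (like KSET) */
        cap_mask |= CAP_EXAMPLE << CAP_USER_SHIFT;     /* user copy   (like USET) */
        cap_mask |= CAP_LOCKED_SMASK;                  /* freeze the mask         */

        printf("kernel caps: 0x%06llx\n",
               (unsigned long long)(cap_mask & CAP_MASK));
        printf("user caps  : 0x%06llx\n",
               (unsigned long long)((cap_mask >> CAP_USER_SHIFT) & CAP_MASK));
        printf("locked     : %d\n", !!(cap_mask & CAP_LOCKED_SMASK));
        return 0;
}
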
diff --git a/drivers/staging/rdma/hfi1/cq.c b/drivers/staging/rdma/hfi1/cq.c
new file mode 100644 (file)
index 0000000..4f046ff
--- /dev/null
@@ -0,0 +1,558 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/kthread.h>
+
+#include "verbs.h"
+#include "hfi.h"
+
+/**
+ * hfi1_cq_enter - add a new entry to the completion queue
+ * @cq: completion queue
+ * @entry: work completion entry to add
+ * @solicited: true if @entry is a solicited entry
+ *
+ * This may be called with qp->s_lock held.
+ */
+void hfi1_cq_enter(struct hfi1_cq *cq, struct ib_wc *entry, int solicited)
+{
+       struct hfi1_cq_wc *wc;
+       unsigned long flags;
+       u32 head;
+       u32 next;
+
+       spin_lock_irqsave(&cq->lock, flags);
+
+       /*
+        * Note that the head pointer might be writable by user processes.
+        * Take care to verify it is a sane value.
+        */
+       wc = cq->queue;
+       head = wc->head;
+       if (head >= (unsigned) cq->ibcq.cqe) {
+               head = cq->ibcq.cqe;
+               next = 0;
+       } else
+               next = head + 1;
+       if (unlikely(next == wc->tail)) {
+               spin_unlock_irqrestore(&cq->lock, flags);
+               if (cq->ibcq.event_handler) {
+                       struct ib_event ev;
+
+                       ev.device = cq->ibcq.device;
+                       ev.element.cq = &cq->ibcq;
+                       ev.event = IB_EVENT_CQ_ERR;
+                       cq->ibcq.event_handler(&ev, cq->ibcq.cq_context);
+               }
+               return;
+       }
+       if (cq->ip) {
+               wc->uqueue[head].wr_id = entry->wr_id;
+               wc->uqueue[head].status = entry->status;
+               wc->uqueue[head].opcode = entry->opcode;
+               wc->uqueue[head].vendor_err = entry->vendor_err;
+               wc->uqueue[head].byte_len = entry->byte_len;
+               wc->uqueue[head].ex.imm_data =
+                       (__u32 __force)entry->ex.imm_data;
+               wc->uqueue[head].qp_num = entry->qp->qp_num;
+               wc->uqueue[head].src_qp = entry->src_qp;
+               wc->uqueue[head].wc_flags = entry->wc_flags;
+               wc->uqueue[head].pkey_index = entry->pkey_index;
+               wc->uqueue[head].slid = entry->slid;
+               wc->uqueue[head].sl = entry->sl;
+               wc->uqueue[head].dlid_path_bits = entry->dlid_path_bits;
+               wc->uqueue[head].port_num = entry->port_num;
+               /* Make sure entry is written before the head index. */
+               smp_wmb();
+       } else
+               wc->kqueue[head] = *entry;
+       wc->head = next;
+
+       if (cq->notify == IB_CQ_NEXT_COMP ||
+           (cq->notify == IB_CQ_SOLICITED &&
+            (solicited || entry->status != IB_WC_SUCCESS))) {
+               struct kthread_worker *worker;
+               /*
+                * This will cause send_complete() to be called in
+                * another thread.
+                */
+               smp_read_barrier_depends(); /* see hfi1_cq_exit */
+               worker = cq->dd->worker;
+               if (likely(worker)) {
+                       cq->notify = IB_CQ_NONE;
+                       cq->triggered++;
+                       queue_kthread_work(worker, &cq->comptask);
+               }
+       }
+
+       spin_unlock_irqrestore(&cq->lock, flags);
+}
+
+/**
+ * hfi1_poll_cq - poll for work completion entries
+ * @ibcq: the completion queue to poll
+ * @num_entries: the maximum number of entries to return
+ * @entry: pointer to array where work completions are placed
+ *
+ * Returns the number of completion entries polled.
+ *
+ * This may be called from interrupt context.  Also called by ib_poll_cq()
+ * in the generic verbs code.
+ */
+int hfi1_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
+{
+       struct hfi1_cq *cq = to_icq(ibcq);
+       struct hfi1_cq_wc *wc;
+       unsigned long flags;
+       int npolled;
+       u32 tail;
+
+       /* The kernel can only poll a kernel completion queue */
+       if (cq->ip) {
+               npolled = -EINVAL;
+               goto bail;
+       }
+
+       spin_lock_irqsave(&cq->lock, flags);
+
+       wc = cq->queue;
+       tail = wc->tail;
+       if (tail > (u32) cq->ibcq.cqe)
+               tail = (u32) cq->ibcq.cqe;
+       for (npolled = 0; npolled < num_entries; ++npolled, ++entry) {
+               if (tail == wc->head)
+                       break;
+               /* The kernel doesn't need a RMB since it has the lock. */
+               *entry = wc->kqueue[tail];
+               if (tail >= cq->ibcq.cqe)
+                       tail = 0;
+               else
+                       tail++;
+       }
+       wc->tail = tail;
+
+       spin_unlock_irqrestore(&cq->lock, flags);
+
+bail:
+       return npolled;
+}
+
+static void send_complete(struct kthread_work *work)
+{
+       struct hfi1_cq *cq = container_of(work, struct hfi1_cq, comptask);
+
+       /*
+        * The completion handler will most likely rearm the notification
+        * is added while we are in this routine, queue_kthread_work()
+        * won't call us again until we return, so we check triggered to
+        * won't call us again until we return so we check triggered to
+        * see if we need to call the handler again.
+        */
+       for (;;) {
+               u8 triggered = cq->triggered;
+
+               /*
+                * IPoIB connected mode assumes the callback is from a
+                * soft IRQ. We simulate this by blocking "bottom halves".
+                * See the implementation for ipoib_cm_handle_tx_wc(),
+                * netif_tx_lock_bh() and netif_tx_lock().
+                */
+               local_bh_disable();
+               cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
+               local_bh_enable();
+
+               if (cq->triggered == triggered)
+                       return;
+       }
+}
+
+/**
+ * hfi1_create_cq - create a completion queue
+ * @ibdev: the device this completion queue is attached to
+ * @attr: creation attributes
+ * @context: unused by the driver
+ * @udata: user data for libibverbs.so
+ *
+ * Returns a pointer to the completion queue or negative errno values
+ * for failure.
+ *
+ * Called by ib_create_cq() in the generic verbs code.
+ */
+struct ib_cq *hfi1_create_cq(
+       struct ib_device *ibdev,
+       const struct ib_cq_init_attr *attr,
+       struct ib_ucontext *context,
+       struct ib_udata *udata)
+{
+       struct hfi1_ibdev *dev = to_idev(ibdev);
+       struct hfi1_cq *cq;
+       struct hfi1_cq_wc *wc;
+       struct ib_cq *ret;
+       u32 sz;
+       unsigned int entries = attr->cqe;
+
+       if (attr->flags)
+               return ERR_PTR(-EINVAL);
+
+       if (entries < 1 || entries > hfi1_max_cqes)
+               return ERR_PTR(-EINVAL);
+
+       /* Allocate the completion queue structure. */
+       cq = kmalloc(sizeof(*cq), GFP_KERNEL);
+       if (!cq)
+               return ERR_PTR(-ENOMEM);
+
+       /*
+        * Allocate the completion queue entries and head/tail pointers.
+        * This is allocated separately so that it can be resized and
+        * also mapped into user space.
+        * We need to use vmalloc() in order to support mmap and large
+        * numbers of entries.
+        */
+       sz = sizeof(*wc);
+       if (udata && udata->outlen >= sizeof(__u64))
+               sz += sizeof(struct ib_uverbs_wc) * (entries + 1);
+       else
+               sz += sizeof(struct ib_wc) * (entries + 1);
+       wc = vmalloc_user(sz);
+       if (!wc) {
+               ret = ERR_PTR(-ENOMEM);
+               goto bail_cq;
+       }
+
+       /*
+        * Return the address of the WC as the offset to mmap.
+        * See hfi1_mmap() for details.
+        */
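+       /*
+        * Illustration (assumed user-space usage, not defined in this
+        * file): the consumer typically passes the returned offset
+        * straight to mmap() on the verbs device fd, e.g.
+        *
+        *      cq_buf = mmap(NULL, sz, PROT_READ | PROT_WRITE,
+        *                    MAP_SHARED, verbs_fd, offset);
+        *
+        * where "verbs_fd", "sz" and "cq_buf" are placeholder names.
+        */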
+       if (udata && udata->outlen >= sizeof(__u64)) {
+               int err;
+
+               cq->ip = hfi1_create_mmap_info(dev, sz, context, wc);
+               if (!cq->ip) {
+                       ret = ERR_PTR(-ENOMEM);
+                       goto bail_wc;
+               }
+
+               err = ib_copy_to_udata(udata, &cq->ip->offset,
+                                      sizeof(cq->ip->offset));
+               if (err) {
+                       ret = ERR_PTR(err);
+                       goto bail_ip;
+               }
+       } else
+               cq->ip = NULL;
+
+       spin_lock(&dev->n_cqs_lock);
+       if (dev->n_cqs_allocated == hfi1_max_cqs) {
+               spin_unlock(&dev->n_cqs_lock);
+               ret = ERR_PTR(-ENOMEM);
+               goto bail_ip;
+       }
+
+       dev->n_cqs_allocated++;
+       spin_unlock(&dev->n_cqs_lock);
+
+       if (cq->ip) {
+               spin_lock_irq(&dev->pending_lock);
+               list_add(&cq->ip->pending_mmaps, &dev->pending_mmaps);
+               spin_unlock_irq(&dev->pending_lock);
+       }
+
+       /*
+        * ib_create_cq() will initialize cq->ibcq except for cq->ibcq.cqe.
+        * The number of entries must be >= the number requested, or an
+        * error is returned.
+        */
+       cq->dd = dd_from_dev(dev);
+       cq->ibcq.cqe = entries;
+       cq->notify = IB_CQ_NONE;
+       cq->triggered = 0;
+       spin_lock_init(&cq->lock);
+       init_kthread_work(&cq->comptask, send_complete);
+       wc->head = 0;
+       wc->tail = 0;
+       cq->queue = wc;
+
+       ret = &cq->ibcq;
+
+       goto done;
+
+bail_ip:
+       kfree(cq->ip);
+bail_wc:
+       vfree(wc);
+bail_cq:
+       kfree(cq);
+done:
+       return ret;
+}
+
+/**
+ * hfi1_destroy_cq - destroy a completion queue
+ * @ibcq: the completion queue to destroy.
+ *
+ * Returns 0 for success.
+ *
+ * Called by ib_destroy_cq() in the generic verbs code.
+ */
+int hfi1_destroy_cq(struct ib_cq *ibcq)
+{
+       struct hfi1_ibdev *dev = to_idev(ibcq->device);
+       struct hfi1_cq *cq = to_icq(ibcq);
+
+       flush_kthread_work(&cq->comptask);
+       spin_lock(&dev->n_cqs_lock);
+       dev->n_cqs_allocated--;
+       spin_unlock(&dev->n_cqs_lock);
+       if (cq->ip)
+               kref_put(&cq->ip->ref, hfi1_release_mmap_info);
+       else
+               vfree(cq->queue);
+       kfree(cq);
+
+       return 0;
+}
+
+/**
+ * hfi1_req_notify_cq - change the notification type for a completion queue
+ * @ibcq: the completion queue
+ * @notify_flags: the type of notification to request
+ *
+ * Returns 0 for success.
+ *
+ * This may be called from interrupt context.  Also called by
+ * ib_req_notify_cq() in the generic verbs code.
+ */
+int hfi1_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags)
+{
+       struct hfi1_cq *cq = to_icq(ibcq);
+       unsigned long flags;
+       int ret = 0;
+
+       spin_lock_irqsave(&cq->lock, flags);
+       /*
+        * Don't change IB_CQ_NEXT_COMP to IB_CQ_SOLICITED but allow
+        * any other transitions (see C11-31 and C11-32 in ch. 11.4.2.2).
+        */
+       if (cq->notify != IB_CQ_NEXT_COMP)
+               cq->notify = notify_flags & IB_CQ_SOLICITED_MASK;
+
+       if ((notify_flags & IB_CQ_REPORT_MISSED_EVENTS) &&
+           cq->queue->head != cq->queue->tail)
+               ret = 1;
+
+       spin_unlock_irqrestore(&cq->lock, flags);
+
+       return ret;
+}
+
+/**
+ * hfi1_resize_cq - change the size of the CQ
+ * @ibcq: the completion queue
+ * @cqe: the new number of completion queue entries
+ * @udata: user data for libibverbs.so
+ *
+ * Returns 0 for success.
+ */
+int hfi1_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
+{
+       struct hfi1_cq *cq = to_icq(ibcq);
+       struct hfi1_cq_wc *old_wc;
+       struct hfi1_cq_wc *wc;
+       u32 head, tail, n;
+       int ret;
+       u32 sz;
+
+       if (cqe < 1 || cqe > hfi1_max_cqes) {
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       /*
+        * Need to use vmalloc() if we want to support large #s of entries.
+        */
+       sz = sizeof(*wc);
+       if (udata && udata->outlen >= sizeof(__u64))
+               sz += sizeof(struct ib_uverbs_wc) * (cqe + 1);
+       else
+               sz += sizeof(struct ib_wc) * (cqe + 1);
+       wc = vmalloc_user(sz);
+       if (!wc) {
+               ret = -ENOMEM;
+               goto bail;
+       }
+
+       /* Check that we can write the offset to mmap. */
+       if (udata && udata->outlen >= sizeof(__u64)) {
+               __u64 offset = 0;
+
+               ret = ib_copy_to_udata(udata, &offset, sizeof(offset));
+               if (ret)
+                       goto bail_free;
+       }
+
+       spin_lock_irq(&cq->lock);
+       /*
+        * Make sure head and tail are sane since they
+        * might be user writable.
+        */
+       old_wc = cq->queue;
+       head = old_wc->head;
+       if (head > (u32) cq->ibcq.cqe)
+               head = (u32) cq->ibcq.cqe;
+       tail = old_wc->tail;
+       if (tail > (u32) cq->ibcq.cqe)
+               tail = (u32) cq->ibcq.cqe;
+       if (head < tail)
+               n = cq->ibcq.cqe + 1 + head - tail;
+       else
+               n = head - tail;
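+       /*
+        * Worked example: with cq->ibcq.cqe == 7 the ring has 8 slots, so
+        * head == 2, tail == 5 (wrapped) gives n = 8 + 2 - 5 = 5 pending
+        * entries (slots 5, 6, 7, 0, 1), while head == 5, tail == 2 gives
+        * n = 3.
+        */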
+       if (unlikely((u32)cqe < n)) {
+               ret = -EINVAL;
+               goto bail_unlock;
+       }
+       for (n = 0; tail != head; n++) {
+               if (cq->ip)
+                       wc->uqueue[n] = old_wc->uqueue[tail];
+               else
+                       wc->kqueue[n] = old_wc->kqueue[tail];
+               if (tail == (u32) cq->ibcq.cqe)
+                       tail = 0;
+               else
+                       tail++;
+       }
+       cq->ibcq.cqe = cqe;
+       wc->head = n;
+       wc->tail = 0;
+       cq->queue = wc;
+       spin_unlock_irq(&cq->lock);
+
+       vfree(old_wc);
+
+       if (cq->ip) {
+               struct hfi1_ibdev *dev = to_idev(ibcq->device);
+               struct hfi1_mmap_info *ip = cq->ip;
+
+               hfi1_update_mmap_info(dev, ip, sz, wc);
+
+               /*
+                * Return the offset to mmap.
+                * See hfi1_mmap() for details.
+                */
+               if (udata && udata->outlen >= sizeof(__u64)) {
+                       ret = ib_copy_to_udata(udata, &ip->offset,
+                                              sizeof(ip->offset));
+                       if (ret)
+                               goto bail;
+               }
+
+               spin_lock_irq(&dev->pending_lock);
+               if (list_empty(&ip->pending_mmaps))
+                       list_add(&ip->pending_mmaps, &dev->pending_mmaps);
+               spin_unlock_irq(&dev->pending_lock);
+       }
+
+       ret = 0;
+       goto bail;
+
+bail_unlock:
+       spin_unlock_irq(&cq->lock);
+bail_free:
+       vfree(wc);
+bail:
+       return ret;
+}
+
+int hfi1_cq_init(struct hfi1_devdata *dd)
+{
+       int ret = 0;
+       int cpu;
+       struct task_struct *task;
+
+       if (dd->worker)
+               return 0;
+       dd->worker = kzalloc(sizeof(*dd->worker), GFP_KERNEL);
+       if (!dd->worker)
+               return -ENOMEM;
+       init_kthread_worker(dd->worker);
+       task = kthread_create_on_node(
+               kthread_worker_fn,
+               dd->worker,
+               dd->assigned_node_id,
+               "hfi1_cq%d", dd->unit);
+       if (IS_ERR(task))
+               goto task_fail;
+       cpu = cpumask_first(cpumask_of_node(dd->assigned_node_id));
+       kthread_bind(task, cpu);
+       wake_up_process(task);
+out:
+       return ret;
+task_fail:
+       ret = PTR_ERR(task);
+       kfree(dd->worker);
+       dd->worker = NULL;
+       goto out;
+}
+
+void hfi1_cq_exit(struct hfi1_devdata *dd)
+{
+       struct kthread_worker *worker;
+
+       worker = dd->worker;
+       if (!worker)
+               return;
+       /* blocks future queuing from send_complete() */
+       dd->worker = NULL;
+       smp_wmb(); /* See hfi1_cq_enter */
+       flush_kthread_worker(worker);
+       kthread_stop(worker->task);
+       kfree(worker);
+}
diff --git a/drivers/staging/rdma/hfi1/debugfs.c b/drivers/staging/rdma/hfi1/debugfs.c
new file mode 100644 (file)
index 0000000..acd2269
--- /dev/null
@@ -0,0 +1,899 @@
+#ifdef CONFIG_DEBUG_FS
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+
+#include "hfi.h"
+#include "debugfs.h"
+#include "device.h"
+#include "qp.h"
+#include "sdma.h"
+
+static struct dentry *hfi1_dbg_root;
+
+#define private2dd(file) (file_inode(file)->i_private)
+#define private2ppd(file) (file_inode(file)->i_private)
+
+#define DEBUGFS_SEQ_FILE_OPS(name) \
+static const struct seq_operations _##name##_seq_ops = { \
+       .start = _##name##_seq_start, \
+       .next  = _##name##_seq_next, \
+       .stop  = _##name##_seq_stop, \
+       .show  = _##name##_seq_show \
+}
+#define DEBUGFS_SEQ_FILE_OPEN(name) \
+static int _##name##_open(struct inode *inode, struct file *s) \
+{ \
+       struct seq_file *seq; \
+       int ret; \
+       ret =  seq_open(s, &_##name##_seq_ops); \
+       if (ret) \
+               return ret; \
+       seq = s->private_data; \
+       seq->private = inode->i_private; \
+       return 0; \
+}
+
+#define DEBUGFS_FILE_OPS(name) \
+static const struct file_operations _##name##_file_ops = { \
+       .owner   = THIS_MODULE, \
+       .open    = _##name##_open, \
+       .read    = seq_read, \
+       .llseek  = seq_lseek, \
+       .release = seq_release \
+}
+
+#define DEBUGFS_FILE_CREATE(name, parent, data, ops, mode)     \
+do { \
+       struct dentry *ent; \
+       ent = debugfs_create_file(name, mode, parent, \
+               data, ops); \
+       if (!ent) \
+               pr_warn("create of %s failed\n", name); \
+} while (0)
+
+
+#define DEBUGFS_SEQ_FILE_CREATE(name, parent, data) \
+       DEBUGFS_FILE_CREATE(#name, parent, data, &_##name##_file_ops, S_IRUGO)
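+
+/*
+ * Usage sketch for the macros above, for a hypothetical seq_file "foo":
+ * after defining _foo_seq_start/_foo_seq_next/_foo_seq_stop/_foo_seq_show,
+ *
+ *     DEBUGFS_SEQ_FILE_OPS(foo);
+ *     DEBUGFS_SEQ_FILE_OPEN(foo)
+ *     DEBUGFS_FILE_OPS(foo);
+ *
+ * generates _foo_seq_ops, _foo_open() and _foo_file_ops, and
+ * DEBUGFS_SEQ_FILE_CREATE(foo, parent, data) then creates a read-only
+ * "foo" debugfs file wired to those operations.
+ */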
+
+static void *_opcode_stats_seq_start(struct seq_file *s, loff_t *pos)
+__acquires(RCU)
+{
+       struct hfi1_opcode_stats_perctx *opstats;
+
+       rcu_read_lock();
+       if (*pos >= ARRAY_SIZE(opstats->stats))
+               return NULL;
+       return pos;
+}
+
+static void *_opcode_stats_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+       struct hfi1_opcode_stats_perctx *opstats;
+
+       ++*pos;
+       if (*pos >= ARRAY_SIZE(opstats->stats))
+               return NULL;
+       return pos;
+}
+
+
+static void _opcode_stats_seq_stop(struct seq_file *s, void *v)
+__releases(RCU)
+{
+       rcu_read_unlock();
+}
+
+static int _opcode_stats_seq_show(struct seq_file *s, void *v)
+{
+       loff_t *spos = v;
+       loff_t i = *spos, j;
+       u64 n_packets = 0, n_bytes = 0;
+       struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
+       struct hfi1_devdata *dd = dd_from_dev(ibd);
+
+       for (j = 0; j < dd->first_user_ctxt; j++) {
+               if (!dd->rcd[j])
+                       continue;
+               n_packets += dd->rcd[j]->opstats->stats[i].n_packets;
+               n_bytes += dd->rcd[j]->opstats->stats[i].n_bytes;
+       }
+       if (!n_packets && !n_bytes)
+               return SEQ_SKIP;
+       seq_printf(s, "%02llx %llu/%llu\n", i,
+               (unsigned long long) n_packets,
+               (unsigned long long) n_bytes);
+
+       return 0;
+}
+
+DEBUGFS_SEQ_FILE_OPS(opcode_stats);
+DEBUGFS_SEQ_FILE_OPEN(opcode_stats)
+DEBUGFS_FILE_OPS(opcode_stats);
+
+static void *_ctx_stats_seq_start(struct seq_file *s, loff_t *pos)
+{
+       struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
+       struct hfi1_devdata *dd = dd_from_dev(ibd);
+
+       if (!*pos)
+               return SEQ_START_TOKEN;
+       if (*pos >= dd->first_user_ctxt)
+               return NULL;
+       return pos;
+}
+
+static void *_ctx_stats_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+       struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
+       struct hfi1_devdata *dd = dd_from_dev(ibd);
+
+       if (v == SEQ_START_TOKEN)
+               return pos;
+
+       ++*pos;
+       if (*pos >= dd->first_user_ctxt)
+               return NULL;
+       return pos;
+}
+
+static void _ctx_stats_seq_stop(struct seq_file *s, void *v)
+{
+       /* nothing allocated */
+}
+
+static int _ctx_stats_seq_show(struct seq_file *s, void *v)
+{
+       loff_t *spos;
+       loff_t i, j;
+       u64 n_packets = 0;
+       struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
+       struct hfi1_devdata *dd = dd_from_dev(ibd);
+
+       if (v == SEQ_START_TOKEN) {
+               seq_puts(s, "Ctx:npkts\n");
+               return 0;
+       }
+
+       spos = v;
+       i = *spos;
+
+       if (!dd->rcd[i])
+               return SEQ_SKIP;
+
+       for (j = 0; j < ARRAY_SIZE(dd->rcd[i]->opstats->stats); j++)
+               n_packets += dd->rcd[i]->opstats->stats[j].n_packets;
+
+       if (!n_packets)
+               return SEQ_SKIP;
+
+       seq_printf(s, "  %llu:%llu\n", i, n_packets);
+       return 0;
+}
+
+DEBUGFS_SEQ_FILE_OPS(ctx_stats);
+DEBUGFS_SEQ_FILE_OPEN(ctx_stats)
+DEBUGFS_FILE_OPS(ctx_stats);
+
+static void *_qp_stats_seq_start(struct seq_file *s, loff_t *pos)
+__acquires(RCU)
+{
+       struct qp_iter *iter;
+       loff_t n = *pos;
+
+       rcu_read_lock();
+       iter = qp_iter_init(s->private);
+       if (!iter)
+               return NULL;
+
+       while (n--) {
+               if (qp_iter_next(iter)) {
+                       kfree(iter);
+                       return NULL;
+               }
+       }
+
+       return iter;
+}
+
+static void *_qp_stats_seq_next(struct seq_file *s, void *iter_ptr,
+                                  loff_t *pos)
+{
+       struct qp_iter *iter = iter_ptr;
+
+       (*pos)++;
+
+       if (qp_iter_next(iter)) {
+               kfree(iter);
+               return NULL;
+       }
+
+       return iter;
+}
+
+static void _qp_stats_seq_stop(struct seq_file *s, void *iter_ptr)
+__releases(RCU)
+{
+       rcu_read_unlock();
+}
+
+static int _qp_stats_seq_show(struct seq_file *s, void *iter_ptr)
+{
+       struct qp_iter *iter = iter_ptr;
+
+       if (!iter)
+               return 0;
+
+       qp_iter_print(s, iter);
+
+       return 0;
+}
+
+DEBUGFS_SEQ_FILE_OPS(qp_stats);
+DEBUGFS_SEQ_FILE_OPEN(qp_stats)
+DEBUGFS_FILE_OPS(qp_stats);
+
+static void *_sdes_seq_start(struct seq_file *s, loff_t *pos)
+__acquires(RCU)
+{
+       struct hfi1_ibdev *ibd;
+       struct hfi1_devdata *dd;
+
+       rcu_read_lock();
+       ibd = (struct hfi1_ibdev *)s->private;
+       dd = dd_from_dev(ibd);
+       if (!dd->per_sdma || *pos >= dd->num_sdma)
+               return NULL;
+       return pos;
+}
+
+static void *_sdes_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+       struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
+       struct hfi1_devdata *dd = dd_from_dev(ibd);
+
+       ++*pos;
+       if (!dd->per_sdma || *pos >= dd->num_sdma)
+               return NULL;
+       return pos;
+}
+
+
+static void _sdes_seq_stop(struct seq_file *s, void *v)
+__releases(RCU)
+{
+       rcu_read_unlock();
+}
+
+static int _sdes_seq_show(struct seq_file *s, void *v)
+{
+       struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
+       struct hfi1_devdata *dd = dd_from_dev(ibd);
+       loff_t *spos = v;
+       loff_t i = *spos;
+
+       sdma_seqfile_dump_sde(s, &dd->per_sdma[i]);
+       return 0;
+}
+
+DEBUGFS_SEQ_FILE_OPS(sdes);
+DEBUGFS_SEQ_FILE_OPEN(sdes)
+DEBUGFS_FILE_OPS(sdes);
+
+/* read the per-device counters */
+static ssize_t dev_counters_read(struct file *file, char __user *buf,
+                                size_t count, loff_t *ppos)
+{
+       u64 *counters;
+       size_t avail;
+       struct hfi1_devdata *dd;
+       ssize_t rval;
+
+       rcu_read_lock();
+       dd = private2dd(file);
+       avail = hfi1_read_cntrs(dd, *ppos, NULL, &counters);
+       rval =  simple_read_from_buffer(buf, count, ppos, counters, avail);
+       rcu_read_unlock();
+       return rval;
+}
+
+/* read the per-device counter names */
+static ssize_t dev_names_read(struct file *file, char __user *buf,
+                             size_t count, loff_t *ppos)
+{
+       char *names;
+       size_t avail;
+       struct hfi1_devdata *dd;
+       ssize_t rval;
+
+       rcu_read_lock();
+       dd = private2dd(file);
+       avail = hfi1_read_cntrs(dd, *ppos, &names, NULL);
+       rval =  simple_read_from_buffer(buf, count, ppos, names, avail);
+       rcu_read_unlock();
+       return rval;
+}
+
+struct counter_info {
+       char *name;
+       const struct file_operations ops;
+};
+
+/*
+ * Could use file_inode(file)->i_ino to figure out which file,
+ * instead of a separate routine for each, but for now, this works...
+ */
+
+/* read the per-port names (same for each port) */
+static ssize_t portnames_read(struct file *file, char __user *buf,
+                             size_t count, loff_t *ppos)
+{
+       char *names;
+       size_t avail;
+       struct hfi1_devdata *dd;
+       ssize_t rval;
+
+       rcu_read_lock();
+       dd = private2dd(file);
+       /* port number n/a here since names are constant */
+       avail = hfi1_read_portcntrs(dd, *ppos, 0, &names, NULL);
+       rval = simple_read_from_buffer(buf, count, ppos, names, avail);
+       rcu_read_unlock();
+       return rval;
+}
+
+/* read the per-port counters */
+static ssize_t portcntrs_debugfs_read(struct file *file, char __user *buf,
+                               size_t count, loff_t *ppos)
+{
+       u64 *counters;
+       size_t avail;
+       struct hfi1_devdata *dd;
+       struct hfi1_pportdata *ppd;
+       ssize_t rval;
+
+       rcu_read_lock();
+       ppd = private2ppd(file);
+       dd = ppd->dd;
+       avail = hfi1_read_portcntrs(dd, *ppos, ppd->port - 1, NULL, &counters);
+       rval = simple_read_from_buffer(buf, count, ppos, counters, avail);
+       rcu_read_unlock();
+       return rval;
+}
+
+/*
+ * read the per-port QSFP data for ppd
+ */
+static ssize_t qsfp_debugfs_dump(struct file *file, char __user *buf,
+                          size_t count, loff_t *ppos)
+{
+       struct hfi1_pportdata *ppd;
+       char *tmp;
+       int ret;
+
+       rcu_read_lock();
+       ppd = private2ppd(file);
+       tmp = kmalloc(PAGE_SIZE, GFP_KERNEL);
+       if (!tmp) {
+               rcu_read_unlock();
+               return -ENOMEM;
+       }
+
+       ret = qsfp_dump(ppd, tmp, PAGE_SIZE);
+       if (ret > 0)
+               ret = simple_read_from_buffer(buf, count, ppos, tmp, ret);
+       rcu_read_unlock();
+       kfree(tmp);
+       return ret;
+}
+
+/* Do an i2c write operation on the chain for the given HFI. */
+static ssize_t __i2c_debugfs_write(struct file *file, const char __user *buf,
+                          size_t count, loff_t *ppos, u32 target)
+{
+       struct hfi1_pportdata *ppd;
+       char *buff;
+       int ret;
+       int i2c_addr;
+       int offset;
+       int total_written;
+
+       rcu_read_lock();
+       ppd = private2ppd(file);
+
+       buff = kmalloc(count, GFP_KERNEL);
+       if (!buff) {
+               ret = -ENOMEM;
+               goto _return;
+       }
+
+       ret = copy_from_user(buff, buf, count);
+       if (ret > 0) {
+               ret = -EFAULT;
+               goto _free;
+       }
+
+       i2c_addr = (*ppos >> 16) & 0xff;
+       offset = *ppos & 0xffff;
+
+       total_written = i2c_write(ppd, target, i2c_addr, offset, buff, count);
+       if (total_written < 0) {
+               ret = total_written;
+               goto _free;
+       }
+
+       *ppos += total_written;
+
+       ret = total_written;
+
+ _free:
+       kfree(buff);
+ _return:
+       rcu_read_unlock();
+       return ret;
+}
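+
+/*
+ * Illustration of the *ppos encoding used above (and in the i2c read
+ * path): bits 16-23 select the i2c slave address and bits 0-15 the
+ * offset within that device.  A hypothetical user of the i2c1/i2c2
+ * debugfs files writing to slave 0x50 at offset 0x100 would do roughly
+ *
+ *     lseek(fd, (0x50 << 16) | 0x100, SEEK_SET);
+ *     write(fd, buf, len);
+ *
+ * where "fd", "buf" and "len" are placeholder names.
+ */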
+
+/* Do an i2c write operation on chain for HFI 0. */
+static ssize_t i2c1_debugfs_write(struct file *file, const char __user *buf,
+                          size_t count, loff_t *ppos)
+{
+       return __i2c_debugfs_write(file, buf, count, ppos, 0);
+}
+
+/* Do an i2c write operation on chain for HFI 1. */
+static ssize_t i2c2_debugfs_write(struct file *file, const char __user *buf,
+                          size_t count, loff_t *ppos)
+{
+       return __i2c_debugfs_write(file, buf, count, ppos, 1);
+}
+
+/* Do an i2c read operation on the chain for the given HFI. */
+static ssize_t __i2c_debugfs_read(struct file *file, char __user *buf,
+                       size_t count, loff_t *ppos, u32 target)
+{
+       struct hfi1_pportdata *ppd;
+       char *buff;
+       int ret;
+       int i2c_addr;
+       int offset;
+       int total_read;
+
+       rcu_read_lock();
+       ppd = private2ppd(file);
+
+       buff = kmalloc(count, GFP_KERNEL);
+       if (!buff) {
+               ret = -ENOMEM;
+               goto _return;
+       }
+
+       i2c_addr = (*ppos >> 16) & 0xff;
+       offset = *ppos & 0xffff;
+
+       total_read = i2c_read(ppd, target, i2c_addr, offset, buff, count);
+       if (total_read < 0) {
+               ret = total_read;
+               goto _free;
+       }
+
+       *ppos += total_read;
+
+       ret = copy_to_user(buf, buff, total_read);
+       if (ret > 0) {
+               ret = -EFAULT;
+               goto _free;
+       }
+
+       ret = total_read;
+
+ _free:
+       kfree(buff);
+ _return:
+       rcu_read_unlock();
+       return ret;
+}
+
+/* Do an i2c read operation on chain for HFI 0. */
+static ssize_t i2c1_debugfs_read(struct file *file, char __user *buf,
+                       size_t count, loff_t *ppos)
+{
+       return __i2c_debugfs_read(file, buf, count, ppos, 0);
+}
+
+/* Do an i2c read operation on chain for HFI 1. */
+static ssize_t i2c2_debugfs_read(struct file *file, char __user *buf,
+                       size_t count, loff_t *ppos)
+{
+       return __i2c_debugfs_read(file, buf, count, ppos, 1);
+}
+
+/* Do a QSFP write operation on the i2c chain for the given HFI. */
+static ssize_t __qsfp_debugfs_write(struct file *file, const char __user *buf,
+                          size_t count, loff_t *ppos, u32 target)
+{
+       struct hfi1_pportdata *ppd;
+       char *buff;
+       int ret;
+       int total_written;
+
+       rcu_read_lock();
+       if (*ppos + count > QSFP_PAGESIZE * 4) { /* base page + page00-page03 */
+               ret = -EINVAL;
+               goto _return;
+       }
+
+       ppd = private2ppd(file);
+
+       buff = kmalloc(count, GFP_KERNEL);
+       if (!buff) {
+               ret = -ENOMEM;
+               goto _return;
+       }
+
+       ret = copy_from_user(buff, buf, count);
+       if (ret > 0) {
+               ret = -EFAULT;
+               goto _free;
+       }
+
+       total_written = qsfp_write(ppd, target, *ppos, buff, count);
+       if (total_written < 0) {
+               ret = total_written;
+               goto _free;
+       }
+
+       *ppos += total_written;
+
+       ret = total_written;
+
+ _free:
+       kfree(buff);
+ _return:
+       rcu_read_unlock();
+       return ret;
+}
+
+/* Do a QSFP write operation on i2c chain for HFI 0. */
+static ssize_t qsfp1_debugfs_write(struct file *file, const char __user *buf,
+                          size_t count, loff_t *ppos)
+{
+       return __qsfp_debugfs_write(file, buf, count, ppos, 0);
+}
+
+/* Do a QSFP write operation on i2c chain for HFI 1. */
+static ssize_t qsfp2_debugfs_write(struct file *file, const char __user *buf,
+                          size_t count, loff_t *ppos)
+{
+       return __qsfp_debugfs_write(file, buf, count, ppos, 1);
+}
+
+/* Do a QSFP read operation on the i2c chain for the given HFI. */
+static ssize_t __qsfp_debugfs_read(struct file *file, char __user *buf,
+                       size_t count, loff_t *ppos, u32 target)
+{
+       struct hfi1_pportdata *ppd;
+       char *buff;
+       int ret;
+       int total_read;
+
+       rcu_read_lock();
+       if (*ppos + count > QSFP_PAGESIZE * 4) { /* base page + page00-page03 */
+               ret = -EINVAL;
+               goto _return;
+       }
+
+       ppd = private2ppd(file);
+
+       buff = kmalloc(count, GFP_KERNEL);
+       if (!buff) {
+               ret = -ENOMEM;
+               goto _return;
+       }
+
+       total_read = qsfp_read(ppd, target, *ppos, buff, count);
+       if (total_read < 0) {
+               ret = total_read;
+               goto _free;
+       }
+
+       *ppos += total_read;
+
+       ret = copy_to_user(buf, buff, total_read);
+       if (ret > 0) {
+               ret = -EFAULT;
+               goto _free;
+       }
+
+       ret = total_read;
+
+ _free:
+       kfree(buff);
+ _return:
+       rcu_read_unlock();
+       return ret;
+}
+
+/* Do a QSFP read operation on i2c chain for HFI 0. */
+static ssize_t qsfp1_debugfs_read(struct file *file, char __user *buf,
+                       size_t count, loff_t *ppos)
+{
+       return __qsfp_debugfs_read(file, buf, count, ppos, 0);
+}
+
+/* Do a QSFP read operation on i2c chain for HFI 1. */
+static ssize_t qsfp2_debugfs_read(struct file *file, char __user *buf,
+                       size_t count, loff_t *ppos)
+{
+       return __qsfp_debugfs_read(file, buf, count, ppos, 1);
+}
+
+#define DEBUGFS_OPS(nm, readroutine, writeroutine)     \
+{ \
+       .name = nm, \
+       .ops = { \
+               .read = readroutine, \
+               .write = writeroutine, \
+               .llseek = generic_file_llseek, \
+       }, \
+}
+
+static const struct counter_info cntr_ops[] = {
+       DEBUGFS_OPS("counter_names", dev_names_read, NULL),
+       DEBUGFS_OPS("counters", dev_counters_read, NULL),
+       DEBUGFS_OPS("portcounter_names", portnames_read, NULL),
+};
+
+static const struct counter_info port_cntr_ops[] = {
+       DEBUGFS_OPS("port%dcounters", portcntrs_debugfs_read, NULL),
+       DEBUGFS_OPS("i2c1", i2c1_debugfs_read, i2c1_debugfs_write),
+       DEBUGFS_OPS("i2c2", i2c2_debugfs_read, i2c2_debugfs_write),
+       DEBUGFS_OPS("qsfp_dump%d", qsfp_debugfs_dump, NULL),
+       DEBUGFS_OPS("qsfp1", qsfp1_debugfs_read, qsfp1_debugfs_write),
+       DEBUGFS_OPS("qsfp2", qsfp2_debugfs_read, qsfp2_debugfs_write),
+};
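+
+/*
+ * Note: names containing a printf-style "%d" are per-port templates;
+ * hfi1_dbg_ibdev_init() below expands them with the port number, e.g.
+ * "port%dcounters" becomes "port1counters" and "qsfp_dump%d" becomes
+ * "qsfp_dump1" for the first port, while fixed names such as "i2c1" and
+ * "qsfp1" are created as-is.
+ */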
+
+void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd)
+{
+       char name[sizeof("port0counters") + 1];
+       char link[10];
+       struct hfi1_devdata *dd = dd_from_dev(ibd);
+       struct hfi1_pportdata *ppd;
+       int unit = dd->unit;
+       int i, j;
+
+       if (!hfi1_dbg_root)
+               return;
+       snprintf(name, sizeof(name), "%s_%d", class_name(), unit);
+       snprintf(link, sizeof(link), "%d", unit);
+       ibd->hfi1_ibdev_dbg = debugfs_create_dir(name, hfi1_dbg_root);
+       if (!ibd->hfi1_ibdev_dbg) {
+               pr_warn("create of %s failed\n", name);
+               return;
+       }
+       ibd->hfi1_ibdev_link =
+               debugfs_create_symlink(link, hfi1_dbg_root, name);
+       if (!ibd->hfi1_ibdev_link) {
+               pr_warn("create of %s symlink failed\n", name);
+               return;
+       }
+       DEBUGFS_SEQ_FILE_CREATE(opcode_stats, ibd->hfi1_ibdev_dbg, ibd);
+       DEBUGFS_SEQ_FILE_CREATE(ctx_stats, ibd->hfi1_ibdev_dbg, ibd);
+       DEBUGFS_SEQ_FILE_CREATE(qp_stats, ibd->hfi1_ibdev_dbg, ibd);
+       DEBUGFS_SEQ_FILE_CREATE(sdes, ibd->hfi1_ibdev_dbg, ibd);
+       /* dev counter files */
+       for (i = 0; i < ARRAY_SIZE(cntr_ops); i++)
+               DEBUGFS_FILE_CREATE(cntr_ops[i].name,
+                                   ibd->hfi1_ibdev_dbg,
+                                   dd,
+                                   &cntr_ops[i].ops, S_IRUGO);
+       /* per port files */
+       for (ppd = dd->pport, j = 0; j < dd->num_pports; j++, ppd++)
+               for (i = 0; i < ARRAY_SIZE(port_cntr_ops); i++) {
+                       snprintf(name,
+                                sizeof(name),
+                                port_cntr_ops[i].name,
+                                j + 1);
+                       DEBUGFS_FILE_CREATE(name,
+                                           ibd->hfi1_ibdev_dbg,
+                                           ppd,
+                                           &port_cntr_ops[i].ops,
+                                           port_cntr_ops[i].ops.write == NULL ?
+                                           S_IRUGO : S_IRUGO|S_IWUSR);
+               }
+}
+
+void hfi1_dbg_ibdev_exit(struct hfi1_ibdev *ibd)
+{
+       if (!hfi1_dbg_root)
+               goto out;
+       debugfs_remove(ibd->hfi1_ibdev_link);
+       debugfs_remove_recursive(ibd->hfi1_ibdev_dbg);
+out:
+       ibd->hfi1_ibdev_dbg = NULL;
+       synchronize_rcu();
+}
+
+/*
+ * driver stats field names, one line per stat, single string.  Used by
+ * programs like hfistats to print the stats in a way which works for
+ * different versions of drivers, without changing program source.
+ * If hfi1_stats changes, this needs to change.  Names need to be
+ * 12 chars or less (w/o newline), for proper display by the hfistats utility.
+ */
+static const char * const hfi1_statnames[] = {
+       /* must be element 0*/
+       "KernIntr",
+       "ErrorIntr",
+       "Tx_Errs",
+       "Rcv_Errs",
+       "H/W_Errs",
+       "NoPIOBufs",
+       "CtxtsOpen",
+       "RcvLen_Errs",
+       "EgrBufFull",
+       "EgrHdrFull"
+};
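+
+/*
+ * The driver_stats file below returns the matching values as raw u64s,
+ * one per name and in the same order, so a consumer (e.g. hfistats) can
+ * pair the i-th name with the i-th u64 it reads; element 0 ("KernIntr")
+ * is taken from hfi1_sps_ints() rather than from hfi1_stats.
+ */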
+
+static void *_driver_stats_names_seq_start(struct seq_file *s, loff_t *pos)
+__acquires(RCU)
+{
+       rcu_read_lock();
+       if (*pos >= ARRAY_SIZE(hfi1_statnames))
+               return NULL;
+       return pos;
+}
+
+static void *_driver_stats_names_seq_next(
+       struct seq_file *s,
+       void *v,
+       loff_t *pos)
+{
+       ++*pos;
+       if (*pos >= ARRAY_SIZE(hfi1_statnames))
+               return NULL;
+       return pos;
+}
+
+static void _driver_stats_names_seq_stop(struct seq_file *s, void *v)
+__releases(RCU)
+{
+       rcu_read_unlock();
+}
+
+static int _driver_stats_names_seq_show(struct seq_file *s, void *v)
+{
+       loff_t *spos = v;
+
+       seq_printf(s, "%s\n", hfi1_statnames[*spos]);
+       return 0;
+}
+
+DEBUGFS_SEQ_FILE_OPS(driver_stats_names);
+DEBUGFS_SEQ_FILE_OPEN(driver_stats_names)
+DEBUGFS_FILE_OPS(driver_stats_names);
+
+static void *_driver_stats_seq_start(struct seq_file *s, loff_t *pos)
+__acquires(RCU)
+{
+       rcu_read_lock();
+       if (*pos >= ARRAY_SIZE(hfi1_statnames))
+               return NULL;
+       return pos;
+}
+
+static void *_driver_stats_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+       ++*pos;
+       if (*pos >= ARRAY_SIZE(hfi1_statnames))
+               return NULL;
+       return pos;
+}
+
+static void _driver_stats_seq_stop(struct seq_file *s, void *v)
+__releases(RCU)
+{
+       rcu_read_unlock();
+}
+
+static u64 hfi1_sps_ints(void)
+{
+       unsigned long flags;
+       struct hfi1_devdata *dd;
+       u64 sps_ints = 0;
+
+       spin_lock_irqsave(&hfi1_devs_lock, flags);
+       list_for_each_entry(dd, &hfi1_dev_list, list) {
+               sps_ints += get_all_cpu_total(dd->int_counter);
+       }
+       spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+       return sps_ints;
+}
+
+static int _driver_stats_seq_show(struct seq_file *s, void *v)
+{
+       loff_t *spos = v;
+       char *buffer;
+       u64 *stats = (u64 *)&hfi1_stats;
+       size_t sz = seq_get_buf(s, &buffer);
+
+       if (sz < sizeof(u64))
+               return SEQ_SKIP;
+       /* special case for interrupts */
+       if (*spos == 0)
+               *(u64 *)buffer = hfi1_sps_ints();
+       else
+               *(u64 *)buffer = stats[*spos];
+       seq_commit(s,  sizeof(u64));
+       return 0;
+}
+
+DEBUGFS_SEQ_FILE_OPS(driver_stats);
+DEBUGFS_SEQ_FILE_OPEN(driver_stats)
+DEBUGFS_FILE_OPS(driver_stats);
+
+void hfi1_dbg_init(void)
+{
+       hfi1_dbg_root  = debugfs_create_dir(DRIVER_NAME, NULL);
+       if (!hfi1_dbg_root)
+               pr_warn("init of debugfs failed\n");
+       DEBUGFS_SEQ_FILE_CREATE(driver_stats_names, hfi1_dbg_root, NULL);
+       DEBUGFS_SEQ_FILE_CREATE(driver_stats, hfi1_dbg_root, NULL);
+}
+
+void hfi1_dbg_exit(void)
+{
+       debugfs_remove_recursive(hfi1_dbg_root);
+       hfi1_dbg_root = NULL;
+}
+
+#endif
diff --git a/drivers/staging/rdma/hfi1/debugfs.h b/drivers/staging/rdma/hfi1/debugfs.h
new file mode 100644 (file)
index 0000000..92d6fe1
--- /dev/null
@@ -0,0 +1,78 @@
+#ifndef _HFI1_DEBUGFS_H
+#define _HFI1_DEBUGFS_H
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+struct hfi1_ibdev;
+#ifdef CONFIG_DEBUG_FS
+void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd);
+void hfi1_dbg_ibdev_exit(struct hfi1_ibdev *ibd);
+void hfi1_dbg_init(void);
+void hfi1_dbg_exit(void);
+#else
+static inline void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd)
+{
+}
+
+static inline void hfi1_dbg_ibdev_exit(struct hfi1_ibdev *ibd)
+{
+}
+
+static inline void hfi1_dbg_init(void)
+{
+}
+
+static inline void hfi1_dbg_exit(void)
+{
+}
+
+#endif
+
+#endif                          /* _HFI1_DEBUGFS_H */
diff --git a/drivers/staging/rdma/hfi1/device.c b/drivers/staging/rdma/hfi1/device.c
new file mode 100644 (file)
index 0000000..07c87a8
--- /dev/null
@@ -0,0 +1,142 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/cdev.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+
+#include "hfi.h"
+#include "device.h"
+
+static struct class *class;
+static dev_t hfi1_dev;
+
+int hfi1_cdev_init(int minor, const char *name,
+                  const struct file_operations *fops,
+                  struct cdev *cdev, struct device **devp)
+{
+       const dev_t dev = MKDEV(MAJOR(hfi1_dev), minor);
+       struct device *device = NULL;
+       int ret;
+
+       cdev_init(cdev, fops);
+       cdev->owner = THIS_MODULE;
+       kobject_set_name(&cdev->kobj, name);
+
+       ret = cdev_add(cdev, dev, 1);
+       if (ret < 0) {
+               pr_err("Could not add cdev for minor %d, %s (err %d)\n",
+                      minor, name, -ret);
+               goto done;
+       }
+
+       device = device_create(class, NULL, dev, NULL, "%s", name);
+       if (!IS_ERR(device))
+               goto done;
+       ret = PTR_ERR(device);
+       device = NULL;
+       pr_err("Could not create device for minor %d, %s (err %d)\n",
+              minor, name, -ret);
+       cdev_del(cdev);
+done:
+       *devp = device;
+       return ret;
+}
+
+void hfi1_cdev_cleanup(struct cdev *cdev, struct device **devp)
+{
+       struct device *device = *devp;
+
+       if (device) {
+               device_unregister(device);
+               *devp = NULL;
+
+               cdev_del(cdev);
+       }
+}
+
+static const char *hfi1_class_name = "hfi1";
+
+const char *class_name(void)
+{
+       return hfi1_class_name;
+}
+
+int __init dev_init(void)
+{
+       int ret;
+
+       ret = alloc_chrdev_region(&hfi1_dev, 0, HFI1_NMINORS, DRIVER_NAME);
+       if (ret < 0) {
+               pr_err("Could not allocate chrdev region (err %d)\n", -ret);
+               goto done;
+       }
+
+       class = class_create(THIS_MODULE, class_name());
+       if (IS_ERR(class)) {
+               ret = PTR_ERR(class);
+               pr_err("Could not create device class (err %d)\n", -ret);
+               unregister_chrdev_region(hfi1_dev, HFI1_NMINORS);
+       }
+
+done:
+       return ret;
+}
+
+void dev_cleanup(void)
+{
+       if (class) {
+               class_destroy(class);
+               class = NULL;
+       }
+
+       unregister_chrdev_region(hfi1_dev, HFI1_NMINORS);
+}
diff --git a/drivers/staging/rdma/hfi1/device.h b/drivers/staging/rdma/hfi1/device.h
new file mode 100644 (file)
index 0000000..98caecd
--- /dev/null
@@ -0,0 +1,61 @@
+#ifndef _HFI1_DEVICE_H
+#define _HFI1_DEVICE_H
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+int hfi1_cdev_init(int minor, const char *name,
+                  const struct file_operations *fops,
+                  struct cdev *cdev, struct device **devp);
+void hfi1_cdev_cleanup(struct cdev *cdev, struct device **devp);
+const char *class_name(void);
+int __init dev_init(void);
+void dev_cleanup(void);
+
+#endif                          /* _HFI1_DEVICE_H */
diff --git a/drivers/staging/rdma/hfi1/diag.c b/drivers/staging/rdma/hfi1/diag.c
new file mode 100644 (file)
index 0000000..6777d6b
--- /dev/null
@@ -0,0 +1,1873 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * This file contains support for diagnostic functions.  It is accessed by
+ * opening the hfi1_diag device, normally minor number 129.  Diagnostic use
+ * of the chip may render the chip or board unusable until the driver
+ * is unloaded, or in some cases, until the system is rebooted.
+ *
+ * Accesses to the chip through this interface are not similar to going
+ * through the /sys/bus/pci resource mmap interface.
+ */
+
+#include <linux/io.h>
+#include <linux/pci.h>
+#include <linux/poll.h>
+#include <linux/vmalloc.h>
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <rdma/ib_smi.h>
+#include "hfi.h"
+#include "device.h"
+#include "common.h"
+#include "trace.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) DRIVER_NAME ": " fmt
+#define snoop_dbg(fmt, ...) \
+       hfi1_cdbg(SNOOP, fmt, ##__VA_ARGS__)
+
+/* Snoop option mask */
+#define SNOOP_DROP_SEND        (1 << 0)
+#define SNOOP_USE_METADATA     (1 << 1)
+
+static u8 snoop_flags;
+
+/*
+ * Extract packet length from LRH header.
+ * Why & 0x7FF? Because len is only 11 bits; in case the upper bits weren't
+ * zeroed, we throw the bogus bits away. The length is in dwords, so multiply
+ * by 4 to get the size in bytes.
+ */
+#define HFI1_GET_PKT_LEN(x)      (((be16_to_cpu((x)->lrh[2]) & 0x7FF)) << 2)
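+
+/*
+ * Worked example: an lrh[2] of 0x0012 on the wire is 18 dwords after the
+ * 0x7FF mask, so HFI1_GET_PKT_LEN() returns 18 << 2 = 72 bytes.
+ */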
+
+enum hfi1_filter_status {
+       HFI1_FILTER_HIT,
+       HFI1_FILTER_ERR,
+       HFI1_FILTER_MISS
+};
+
+/* snoop processing functions */
+rhf_rcv_function_ptr snoop_rhf_rcv_functions[8] = {
+       [RHF_RCV_TYPE_EXPECTED] = snoop_recv_handler,
+       [RHF_RCV_TYPE_EAGER]    = snoop_recv_handler,
+       [RHF_RCV_TYPE_IB]       = snoop_recv_handler,
+       [RHF_RCV_TYPE_ERROR]    = snoop_recv_handler,
+       [RHF_RCV_TYPE_BYPASS]   = snoop_recv_handler,
+       [RHF_RCV_TYPE_INVALID5] = process_receive_invalid,
+       [RHF_RCV_TYPE_INVALID6] = process_receive_invalid,
+       [RHF_RCV_TYPE_INVALID7] = process_receive_invalid
+};
+
+/* Snoop packet structure */
+struct snoop_packet {
+       struct list_head list;
+       u32 total_len;
+       u8 data[];
+};
+
+/* Do not make these an enum or it will blow up the capture_md */
+#define PKT_DIR_EGRESS 0x0
+#define PKT_DIR_INGRESS 0x1
+
+/* Packet capture metadata returned to the user with the packet. */
+struct capture_md {
+       u8 port;
+       u8 dir;
+       u8 reserved[6];
+       union {
+               u64 pbc;
+               u64 rhf;
+       } u;
+};
+
+static atomic_t diagpkt_count = ATOMIC_INIT(0);
+static struct cdev diagpkt_cdev;
+static struct device *diagpkt_device;
+
+static ssize_t diagpkt_write(struct file *fp, const char __user *data,
+                                size_t count, loff_t *off);
+
+static const struct file_operations diagpkt_file_ops = {
+       .owner = THIS_MODULE,
+       .write = diagpkt_write,
+       .llseek = noop_llseek,
+};
+
+/*
+ * This is used for communication with user space for snoop extended IOCTLs
+ */
+struct hfi1_link_info {
+       __be64 node_guid;
+       u8 port_mode;
+       u8 port_state;
+       u16 link_speed_active;
+       u16 link_width_active;
+       u16 vl15_init;
+       u8 port_number;
+       /*
+        * Add padding to make this a full IB SMP payload. Note: changing the
+        * size of this structure will make the IOCTLs created with _IOWR
+        * change.
+        * Be sure to run tests on all IOCTLs when making changes to this
+        * structure.
+        */
+       u8 res[47];
+};
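+
+/*
+ * Size check (assuming the compiler adds no padding here): the explicit
+ * fields total 8 + 1 + 1 + 2 + 2 + 2 + 1 = 17 bytes, and res[47] brings
+ * the structure to the 64-byte SMP payload the comment above refers to.
+ */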
+
+/*
+ * This starts our ioctl sequence numbers *way* off from the ones
+ * defined in ib_core.
+ */
+#define SNOOP_CAPTURE_VERSION 0x1
+
+#define IB_IOCTL_MAGIC          0x1b /* See Documentation/ioctl-number.txt */
+#define HFI1_SNOOP_IOC_MAGIC IB_IOCTL_MAGIC
+#define HFI1_SNOOP_IOC_BASE_SEQ 0x80
+
+#define HFI1_SNOOP_IOCGETLINKSTATE \
+       _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ)
+#define HFI1_SNOOP_IOCSETLINKSTATE \
+       _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ+1)
+#define HFI1_SNOOP_IOCCLEARQUEUE \
+       _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ+2)
+#define HFI1_SNOOP_IOCCLEARFILTER \
+       _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ+3)
+#define HFI1_SNOOP_IOCSETFILTER \
+       _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ+4)
+#define HFI1_SNOOP_IOCGETVERSION \
+       _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ+5)
+#define HFI1_SNOOP_IOCSET_OPTS \
+       _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ+6)
+
+/*
+ * These offsets +6/+7 could change, but these are already known and used
+ * IOCTL numbers so don't change them without a good reason.
+ */
+#define HFI1_SNOOP_IOCGETLINKSTATE_EXTRA \
+       _IOWR(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ+6, \
+               struct hfi1_link_info)
+#define HFI1_SNOOP_IOCSETLINKSTATE_EXTRA \
+       _IOWR(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ+7, \
+               struct hfi1_link_info)
+
+static int hfi1_snoop_open(struct inode *in, struct file *fp);
+static ssize_t hfi1_snoop_read(struct file *fp, char __user *data,
+                               size_t pkt_len, loff_t *off);
+static ssize_t hfi1_snoop_write(struct file *fp, const char __user *data,
+                                size_t count, loff_t *off);
+static long hfi1_ioctl(struct file *fp, unsigned int cmd, unsigned long arg);
+static unsigned int hfi1_snoop_poll(struct file *fp,
+                                       struct poll_table_struct *wait);
+static int hfi1_snoop_release(struct inode *in, struct file *fp);
+
+struct hfi1_packet_filter_command {
+       int opcode;
+       int length;
+       void *value_ptr;
+};
+
+/* Can't re-use PKT_DIR_*GRESS here because 0 means no packets for this */
+#define HFI1_SNOOP_INGRESS 0x1
+#define HFI1_SNOOP_EGRESS  0x2
+
+enum hfi1_packet_filter_opcodes {
+       FILTER_BY_LID,
+       FILTER_BY_DLID,
+       FILTER_BY_MAD_MGMT_CLASS,
+       FILTER_BY_QP_NUMBER,
+       FILTER_BY_PKT_TYPE,
+       FILTER_BY_SERVICE_LEVEL,
+       FILTER_BY_PKEY,
+       FILTER_BY_DIRECTION,
+};
+
+static const struct file_operations snoop_file_ops = {
+       .owner = THIS_MODULE,
+       .open = hfi1_snoop_open,
+       .read = hfi1_snoop_read,
+       .unlocked_ioctl = hfi1_ioctl,
+       .poll = hfi1_snoop_poll,
+       .write = hfi1_snoop_write,
+       .release = hfi1_snoop_release
+};
+
+struct hfi1_filter_array {
+       int (*filter)(void *, void *, void *);
+};
+
+static int hfi1_filter_lid(void *ibhdr, void *packet_data, void *value);
+static int hfi1_filter_dlid(void *ibhdr, void *packet_data, void *value);
+static int hfi1_filter_mad_mgmt_class(void *ibhdr, void *packet_data,
+                                     void *value);
+static int hfi1_filter_qp_number(void *ibhdr, void *packet_data, void *value);
+static int hfi1_filter_ibpacket_type(void *ibhdr, void *packet_data,
+                                    void *value);
+static int hfi1_filter_ib_service_level(void *ibhdr, void *packet_data,
+                                       void *value);
+static int hfi1_filter_ib_pkey(void *ibhdr, void *packet_data, void *value);
+static int hfi1_filter_direction(void *ibhdr, void *packet_data, void *value);
+
+static struct hfi1_filter_array hfi1_filters[] = {
+       { hfi1_filter_lid },
+       { hfi1_filter_dlid },
+       { hfi1_filter_mad_mgmt_class },
+       { hfi1_filter_qp_number },
+       { hfi1_filter_ibpacket_type },
+       { hfi1_filter_ib_service_level },
+       { hfi1_filter_ib_pkey },
+       { hfi1_filter_direction },
+};
+
+#define HFI1_MAX_FILTERS       ARRAY_SIZE(hfi1_filters)
+#define HFI1_DIAG_MINOR_BASE   129
+
+static int hfi1_snoop_add(struct hfi1_devdata *dd, const char *name);
+
+int hfi1_diag_add(struct hfi1_devdata *dd)
+{
+       char name[16];
+       int ret = 0;
+
+       snprintf(name, sizeof(name), "%s_diagpkt%d", class_name(),
+                dd->unit);
+       /*
+        * Do this for each device as opposed to the normal diagpkt
+        * interface which is one per host
+        */
+       ret = hfi1_snoop_add(dd, name);
+       if (ret)
+               dd_dev_err(dd, "Unable to init snoop/capture device");
+
+       snprintf(name, sizeof(name), "%s_diagpkt", class_name());
+       if (atomic_inc_return(&diagpkt_count) == 1) {
+               ret = hfi1_cdev_init(HFI1_DIAGPKT_MINOR, name,
+                                    &diagpkt_file_ops, &diagpkt_cdev,
+                                    &diagpkt_device);
+       }
+
+       return ret;
+}
+
+/* this must be called with dd->hfi1_snoop.snoop_lock held */
+static void drain_snoop_list(struct list_head *queue)
+{
+       struct list_head *pos, *q;
+       struct snoop_packet *packet;
+
+       list_for_each_safe(pos, q, queue) {
+               packet = list_entry(pos, struct snoop_packet, list);
+               list_del(pos);
+               kfree(packet);
+       }
+}
+
+static void hfi1_snoop_remove(struct hfi1_devdata *dd)
+{
+       unsigned long flags = 0;
+
+       spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
+       drain_snoop_list(&dd->hfi1_snoop.queue);
+       hfi1_cdev_cleanup(&dd->hfi1_snoop.cdev, &dd->hfi1_snoop.class_dev);
+       spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
+}
+
+void hfi1_diag_remove(struct hfi1_devdata *dd)
+{
+
+       hfi1_snoop_remove(dd);
+       if (atomic_dec_and_test(&diagpkt_count))
+               hfi1_cdev_cleanup(&diagpkt_cdev, &diagpkt_device);
+       hfi1_cdev_cleanup(&dd->diag_cdev, &dd->diag_device);
+}
+
+/*
+ * Allocated structure shared between the credit return mechanism and
+ * diagpkt_send().
+ */
+struct diagpkt_wait {
+       struct completion credits_returned;
+       int code;
+       atomic_t count;
+};
+
+/*
+ * When each side is finished with the structure, they call this.
+ * The last user frees the structure.
+ */
+static void put_diagpkt_wait(struct diagpkt_wait *wait)
+{
+       if (atomic_dec_and_test(&wait->count))
+               kfree(wait);
+}
+
+/*
+ * Callback from the credit return code.  Set the completion, which
+ * will let diagpkt_send() continue.
+ */
+static void diagpkt_complete(void *arg, int code)
+{
+       struct diagpkt_wait *wait = (struct diagpkt_wait *)arg;
+
+       wait->code = code;
+       complete(&wait->credits_returned);
+       put_diagpkt_wait(wait); /* finished with the structure */
+}
+
+/**
+ * diagpkt_send - send a packet
+ * @dp: diag packet descriptor
+ */
+static ssize_t diagpkt_send(struct diag_pkt *dp)
+{
+       struct hfi1_devdata *dd;
+       struct send_context *sc;
+       struct pio_buf *pbuf;
+       u32 *tmpbuf = NULL;
+       ssize_t ret = 0;
+       u32 pkt_len, total_len;
+       pio_release_cb credit_cb = NULL;
+       void *credit_arg = NULL;
+       struct diagpkt_wait *wait = NULL;
+
+       dd = hfi1_lookup(dp->unit);
+       if (!dd || !(dd->flags & HFI1_PRESENT) || !dd->kregbase) {
+               ret = -ENODEV;
+               goto bail;
+       }
+       if (!(dd->flags & HFI1_INITTED)) {
+               /* no hardware, freeze, etc. */
+               ret = -ENODEV;
+               goto bail;
+       }
+
+       if (dp->version != _DIAG_PKT_VERS) {
+               dd_dev_err(dd, "Invalid version %u for diagpkt_write\n",
+                           dp->version);
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       /* send count must be an exact number of dwords */
+       if (dp->len & 3) {
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       /* there is only port 1 */
+       if (dp->port != 1) {
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       /* need a valid context */
+       if (dp->sw_index >= dd->num_send_contexts) {
+               ret = -EINVAL;
+               goto bail;
+       }
+       /* can only use kernel contexts */
+       if (dd->send_contexts[dp->sw_index].type != SC_KERNEL) {
+               ret = -EINVAL;
+               goto bail;
+       }
+       /* must be allocated */
+       sc = dd->send_contexts[dp->sw_index].sc;
+       if (!sc) {
+               ret = -EINVAL;
+               goto bail;
+       }
+       /* must be enabled */
+       if (!(sc->flags & SCF_ENABLED)) {
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       /* allocate a buffer and copy the data in */
+       tmpbuf = vmalloc(dp->len);
+       if (!tmpbuf) {
+               ret = -ENOMEM;
+               goto bail;
+       }
+
+       if (copy_from_user(tmpbuf,
+                          (const void __user *) (unsigned long) dp->data,
+                          dp->len)) {
+               ret = -EFAULT;
+               goto bail;
+       }
+
+       /*
+        * pkt_len is how much data we have to write, in dwords; it includes
+        * the header and data.  total_len is the length of the packet in
+        * dwords plus the PBC, and should not include the CRC.
+        */
+       pkt_len = dp->len >> 2;
+       total_len = pkt_len + 2; /* PBC + packet */
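+       /*
+        * e.g. a 64-byte dp->len gives pkt_len = 16 dwords and
+        * total_len = 18 dwords, since the PBC occupies 2 dwords
+        */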
+
+       /* if 0, fill in a default */
+       if (dp->pbc == 0) {
+               struct hfi1_pportdata *ppd = dd->pport;
+
+               hfi1_cdbg(PKT, "Generating PBC");
+               dp->pbc = create_pbc(ppd, 0, 0, 0, total_len);
+       } else {
+               hfi1_cdbg(PKT, "Using passed in PBC");
+       }
+
+       hfi1_cdbg(PKT, "Egress PBC content is 0x%llx", dp->pbc);
+
+       /*
+        * The caller wants to wait until the packet is sent and to
+        * check for errors.  The best we can do is wait until
+        * the buffer credits are returned and check if any packet
+        * error has occurred.  If there are any late errors, this
+        * could miss them.  If there are other senders who generate
+        * an error, this may find it.  However, in general, it
+        * should catch most.
+        */
+       if (dp->flags & F_DIAGPKT_WAIT) {
+               /* always force a credit return */
+               dp->pbc |= PBC_CREDIT_RETURN;
+               /* turn on credit return interrupts */
+               sc_add_credit_return_intr(sc);
+               wait = kmalloc(sizeof(*wait), GFP_KERNEL);
+               if (!wait) {
+                       ret = -ENOMEM;
+                       goto bail;
+               }
+               init_completion(&wait->credits_returned);
+               atomic_set(&wait->count, 2);
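+               /* two references: one for this sender, one for the
+                * credit return callback
+                */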
+               wait->code = PRC_OK;
+
+               credit_cb = diagpkt_complete;
+               credit_arg = wait;
+       }
+
+       pbuf = sc_buffer_alloc(sc, total_len, credit_cb, credit_arg);
+       if (!pbuf) {
+               /*
+                * No send buffer means no credit callback.  Undo
+                * the wait set-up that was done above.  We free wait
+                * because the callback will never be called.
+                */
+               if (dp->flags & F_DIAGPKT_WAIT) {
+                       sc_del_credit_return_intr(sc);
+                       kfree(wait);
+                       wait = NULL;
+               }
+               ret = -ENOSPC;
+               goto bail;
+       }
+
+       pio_copy(dd, pbuf, dp->pbc, tmpbuf, pkt_len);
+       /* no flush needed as the HW knows the packet size */
+
+       ret = sizeof(*dp);
+
+       if (dp->flags & F_DIAGPKT_WAIT) {
+               /* wait for credit return */
+               ret = wait_for_completion_interruptible(
+                                               &wait->credits_returned);
+               /*
+                * If the wait returns an error, the wait was interrupted,
+                * e.g. with a ^C in the user program.  The callback is
+                * still pending.  This is OK as the wait structure is
+                * kmalloc'ed and the structure will free itself when
+                * all users are done with it.
+                *
+                * A context disable occurs on a send context restart, so
+                * include that in the list of errors below to check for.
+                * NOTE: PRC_FILL_ERR is at best informational and cannot
+                * be depended on.
+                */
+               if (!ret && (((wait->code & PRC_STATUS_ERR)
+                               || (wait->code & PRC_FILL_ERR)
+                               || (wait->code & PRC_SC_DISABLE))))
+                       ret = -EIO;
+
+               put_diagpkt_wait(wait); /* finished with the structure */
+               sc_del_credit_return_intr(sc);
+       }
+
+bail:
+       vfree(tmpbuf);
+       return ret;
+}
+
+static ssize_t diagpkt_write(struct file *fp, const char __user *data,
+                                size_t count, loff_t *off)
+{
+       struct hfi1_devdata *dd;
+       struct send_context *sc;
+       u8 vl;
+
+       struct diag_pkt dp;
+
+       if (count != sizeof(dp))
+               return -EINVAL;
+
+       if (copy_from_user(&dp, data, sizeof(dp)))
+               return -EFAULT;
+
+       /*
+        * The Send Context is derived from the PbcVL value
+        * if the PBC is populated.
+        */
+       if (dp.pbc) {
+               dd = hfi1_lookup(dp.unit);
+               if (dd == NULL)
+                       return -ENODEV;
+               vl = (dp.pbc >> PBC_VL_SHIFT) & PBC_VL_MASK;
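+               /* the PbcVL field selects the per-VL send context used below */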
+               sc = dd->vld[vl].sc;
+               if (sc) {
+                       dp.sw_index = sc->sw_index;
+                       hfi1_cdbg(
+                              PKT,
+                              "Packet sent over VL %d via Send Context %u(%u)",
+                              vl, sc->sw_index, sc->hw_context);
+               }
+       }
+
+       return diagpkt_send(&dp);
+}
+
+static int hfi1_snoop_add(struct hfi1_devdata *dd, const char *name)
+{
+       int ret = 0;
+
+       dd->hfi1_snoop.mode_flag = 0;
+       spin_lock_init(&dd->hfi1_snoop.snoop_lock);
+       INIT_LIST_HEAD(&dd->hfi1_snoop.queue);
+       init_waitqueue_head(&dd->hfi1_snoop.waitq);
+
+       ret = hfi1_cdev_init(HFI1_SNOOP_CAPTURE_BASE + dd->unit, name,
+                            &snoop_file_ops,
+                            &dd->hfi1_snoop.cdev, &dd->hfi1_snoop.class_dev);
+
+       if (ret) {
+               dd_dev_err(dd, "Couldn't create %s device: %d", name, ret);
+               hfi1_cdev_cleanup(&dd->hfi1_snoop.cdev,
+                                &dd->hfi1_snoop.class_dev);
+       }
+
+       return ret;
+}
+
+static struct hfi1_devdata *hfi1_dd_from_sc_inode(struct inode *in)
+{
+       int unit = iminor(in) - HFI1_SNOOP_CAPTURE_BASE;
+       struct hfi1_devdata *dd = NULL;
+
+       dd = hfi1_lookup(unit);
+       return dd;
+}
+
+/* clear or restore send context integrity checks */
+static void adjust_integrity_checks(struct hfi1_devdata *dd)
+{
+       struct send_context *sc;
+       unsigned long sc_flags;
+       int i;
+
+       spin_lock_irqsave(&dd->sc_lock, sc_flags);
+       for (i = 0; i < dd->num_send_contexts; i++) {
+               int enable;
+
+               sc = dd->send_contexts[i].sc;
+
+               if (!sc)
+                       continue;       /* not allocated */
+
+               enable = likely(!HFI1_CAP_IS_KSET(NO_INTEGRITY)) &&
+                        dd->hfi1_snoop.mode_flag != HFI1_PORT_SNOOP_MODE;
+
+               set_pio_integrity(sc);
+
+               if (enable) /* take HFI_CAP_* flags into account */
+                       hfi1_init_ctxt(sc);
+       }
+       spin_unlock_irqrestore(&dd->sc_lock, sc_flags);
+}
+
+static int hfi1_snoop_open(struct inode *in, struct file *fp)
+{
+       int ret;
+       int mode_flag = 0;
+       unsigned long flags = 0;
+       struct hfi1_devdata *dd;
+       struct list_head *queue;
+
+       mutex_lock(&hfi1_mutex);
+
+       dd = hfi1_dd_from_sc_inode(in);
+       if (dd == NULL) {
+               ret = -ENODEV;
+               goto bail;
+       }
+
+       /*
+        * File mode determines snoop or capture. Some existing user
+        * applications expect the capture device to be able to be opened RDWR
+        * because they expect a dedicated capture device. For this reason we
+        * support a module param to force capture mode even if the file open
+        * mode matches snoop.
+        */
+       if ((fp->f_flags & O_ACCMODE) == O_RDONLY) {
+               snoop_dbg("Capture Enabled");
+               mode_flag = HFI1_PORT_CAPTURE_MODE;
+       } else if ((fp->f_flags & O_ACCMODE) == O_RDWR) {
+               snoop_dbg("Snoop Enabled");
+               mode_flag = HFI1_PORT_SNOOP_MODE;
+       } else {
+               snoop_dbg("Invalid");
+               ret =  -EINVAL;
+               goto bail;
+       }
+       queue = &dd->hfi1_snoop.queue;
+
+       /*
+        * We do not support snoop and capture at the same time.
+        */
+       spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
+       if (dd->hfi1_snoop.mode_flag) {
+               ret = -EBUSY;
+               spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
+               goto bail;
+       }
+
+       dd->hfi1_snoop.mode_flag = mode_flag;
+       drain_snoop_list(queue);
+
+       dd->hfi1_snoop.filter_callback = NULL;
+       dd->hfi1_snoop.filter_value = NULL;
+
+       /*
+        * Send side packet integrity checks are not helpful when snooping, so
+        * disable them here and re-enable them when we stop snooping.
+        */
+       if (mode_flag == HFI1_PORT_SNOOP_MODE) {
+               /* clear after snoop mode is on */
+               adjust_integrity_checks(dd); /* clear */
+
+               /*
+                * We also do not want to be doing the DLID LMC check for
+                * ingressed packets.
+                */
+               dd->hfi1_snoop.dcc_cfg = read_csr(dd, DCC_CFG_PORT_CONFIG1);
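+               /* zeroing the low 32 bits of PORT_CONFIG1 turns off the
+                * DLID/LMC check mentioned above; the saved value is
+                * restored on release
+                */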
+               write_csr(dd, DCC_CFG_PORT_CONFIG1,
+                         (dd->hfi1_snoop.dcc_cfg >> 32) << 32);
+       }
+
+       /*
+        * As soon as we set these function pointers the recv and send handlers
+        * are active. This is racy, so we must make sure the queue is drained
+        * and the filter values are initialized above. Technically we should
+        * add locking here, but at worst a received packet will be allocated
+        * and block on snoop_lock before being added to the queue. The same
+        * goes for send.
+        */
+       dd->rhf_rcv_function_map = snoop_rhf_rcv_functions;
+       dd->process_pio_send = snoop_send_pio_handler;
+       dd->process_dma_send = snoop_send_pio_handler;
+       dd->pio_inline_send = snoop_inline_pio_send;
+
+       spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
+       ret = 0;
+
+bail:
+       mutex_unlock(&hfi1_mutex);
+
+       return ret;
+}
+
+static int hfi1_snoop_release(struct inode *in, struct file *fp)
+{
+       unsigned long flags = 0;
+       struct hfi1_devdata *dd;
+       int mode_flag;
+
+       dd = hfi1_dd_from_sc_inode(in);
+       if (dd == NULL)
+               return -ENODEV;
+
+       spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
+
+       /* clear the snoop mode before re-adjusting send context CSRs */
+       mode_flag = dd->hfi1_snoop.mode_flag;
+       dd->hfi1_snoop.mode_flag = 0;
+
+       /*
+        * Drain the queue and clear the filters; we are done with them. Don't
+        * forget to restore the packet integrity checks.
+        */
+       drain_snoop_list(&dd->hfi1_snoop.queue);
+       if (mode_flag == HFI1_PORT_SNOOP_MODE) {
+               /* restore after snoop mode is clear */
+               adjust_integrity_checks(dd); /* restore */
+
+               /*
+                * Also restore DCC_CFG_PORT_CONFIG1 so that DLID checking on
+                * incoming packets is re-enabled, using the value saved when
+                * the snoop device was opened.
+                */
+               write_csr(dd, DCC_CFG_PORT_CONFIG1, dd->hfi1_snoop.dcc_cfg);
+       }
+
+       dd->hfi1_snoop.filter_callback = NULL;
+       kfree(dd->hfi1_snoop.filter_value);
+       dd->hfi1_snoop.filter_value = NULL;
+
+       /*
+        * User is done snooping and capturing, return control to the normal
+        * handler. Re-enable SDMA handling.
+        */
+       dd->rhf_rcv_function_map = dd->normal_rhf_rcv_functions;
+       dd->process_pio_send = hfi1_verbs_send_pio;
+       dd->process_dma_send = hfi1_verbs_send_dma;
+       dd->pio_inline_send = pio_copy;
+
+       spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
+
+       snoop_dbg("snoop/capture device released");
+
+       return 0;
+}
+
+static unsigned int hfi1_snoop_poll(struct file *fp,
+                                   struct poll_table_struct *wait)
+{
+       int ret = 0;
+       unsigned long flags = 0;
+
+       struct hfi1_devdata *dd;
+
+       dd = hfi1_dd_from_sc_inode(fp->f_inode);
+       if (dd == NULL)
+               return -ENODEV;
+
+       spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
+
+       poll_wait(fp, &dd->hfi1_snoop.waitq, wait);
+       if (!list_empty(&dd->hfi1_snoop.queue))
+               ret |= POLLIN | POLLRDNORM;
+
+       spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
+       return ret;
+}
+
+static ssize_t hfi1_snoop_write(struct file *fp, const char __user *data,
+                               size_t count, loff_t *off)
+{
+       struct diag_pkt dpkt;
+       struct hfi1_devdata *dd;
+       size_t ret;
+       u8 byte_two, sl, sc5, sc4, vl, byte_one;
+       struct send_context *sc;
+       u32 len;
+       u64 pbc;
+       struct hfi1_ibport *ibp;
+       struct hfi1_pportdata *ppd;
+
+       dd = hfi1_dd_from_sc_inode(fp->f_inode);
+       if (dd == NULL)
+               return -ENODEV;
+
+       ppd = dd->pport;
+       snoop_dbg("received %lu bytes from user", count);
+
+       memset(&dpkt, 0, sizeof(struct diag_pkt));
+       dpkt.version = _DIAG_PKT_VERS;
+       dpkt.unit = dd->unit;
+       dpkt.port = 1;
+
+       if (likely(!(snoop_flags & SNOOP_USE_METADATA))) {
+               /*
+                * We need to generate the PBC and not let diagpkt_send do it;
+                * to do this we need the VL and the length in dwords.
+                * The VL can be determined by using the SL and looking up the
+                * SC, then converting the SC into a VL. The exception to this
+                * is packets which come from an SMI queue pair. Since we
+                * can't detect anything about the QP here we have to rely on
+                * the SC. If it's 0xF then we assume it's SMI and do not look
+                * at the SL.
+                */
+               if (copy_from_user(&byte_one, data, 1))
+                       return -EINVAL;
+
+               if (copy_from_user(&byte_two, data+1, 1))
+                       return -EINVAL;
+
+               sc4 = (byte_one >> 4) & 0xf;
+               if (sc4 == 0xF) {
+                       snoop_dbg("Detected VL15 packet ignoring SL in packet");
+                       vl = sc4;
+               } else {
+                       sl = (byte_two >> 4) & 0xf;
+                       ibp = to_iport(&dd->verbs_dev.ibdev, 1);
+                       sc5 = ibp->sl_to_sc[sl];
+                       vl = sc_to_vlt(dd, sc5);
+                       if (vl != sc4) {
+                               snoop_dbg("VL %d does not match SC %d of packet",
+                                         vl, sc4);
+                               return -EINVAL;
+                       }
+               }
+
+               sc = dd->vld[vl].sc; /* Look up the context based on VL */
+               if (sc) {
+                       dpkt.sw_index = sc->sw_index;
+                       snoop_dbg("Sending on context %u(%u)", sc->sw_index,
+                                 sc->hw_context);
+               } else {
+                       snoop_dbg("Could not find context for vl %d", vl);
+                       return -EINVAL;
+               }
+
+               len = (count >> 2) + 2; /* Add in PBC */
+               pbc = create_pbc(ppd, 0, 0, vl, len);
+       } else {
+               if (copy_from_user(&pbc, data, sizeof(pbc)))
+                       return -EINVAL;
+               vl = (pbc >> PBC_VL_SHIFT) & PBC_VL_MASK;
+               sc = dd->vld[vl].sc; /* Look up the context based on VL */
+               if (sc) {
+                       dpkt.sw_index = sc->sw_index;
+               } else {
+                       snoop_dbg("Could not find context for vl %d", vl);
+                       return -EINVAL;
+               }
+               data += sizeof(pbc);
+               count -= sizeof(pbc);
+       }
+       dpkt.len = count;
+       dpkt.data = (unsigned long)data;
+
+       snoop_dbg("PBC: vl=0x%llx Length=0x%llx",
+                 (pbc >> 12) & 0xf,
+                 (pbc & 0xfff));
+
+       dpkt.pbc = pbc;
+       ret = diagpkt_send(&dpkt);
+       /*
+        * diagpkt_send only returns number of bytes in the diagpkt so patch
+        * that up here before returning.
+        */
+       if (ret == sizeof(dpkt))
+               return count;
+
+       return ret;
+}
+
+static ssize_t hfi1_snoop_read(struct file *fp, char __user *data,
+                              size_t pkt_len, loff_t *off)
+{
+       ssize_t ret = 0;
+       unsigned long flags = 0;
+       struct snoop_packet *packet = NULL;
+       struct hfi1_devdata *dd;
+
+       dd = hfi1_dd_from_sc_inode(fp->f_inode);
+       if (dd == NULL)
+               return -ENODEV;
+
+       spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
+
+       while (list_empty(&dd->hfi1_snoop.queue)) {
+               spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
+
+               if (fp->f_flags & O_NONBLOCK)
+                       return -EAGAIN;
+
+               if (wait_event_interruptible(
+                               dd->hfi1_snoop.waitq,
+                               !list_empty(&dd->hfi1_snoop.queue)))
+                       return -EINTR;
+
+               spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
+       }
+
+       if (!list_empty(&dd->hfi1_snoop.queue)) {
+               packet = list_entry(dd->hfi1_snoop.queue.next,
+                                   struct snoop_packet, list);
+               list_del(&packet->list);
+               spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
+               if (pkt_len >= packet->total_len) {
+                       if (copy_to_user(data, packet->data,
+                               packet->total_len))
+                               ret = -EFAULT;
+                       else
+                               ret = packet->total_len;
+               } else
+                       ret = -EINVAL;
+
+               kfree(packet);
+       } else
+               spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
+
+       return ret;
+}
+
+static long hfi1_ioctl(struct file *fp, unsigned int cmd, unsigned long arg)
+{
+       struct hfi1_devdata *dd;
+       void *filter_value = NULL;
+       long ret = 0;
+       int value = 0;
+       u8 physState = 0;
+       u8 linkState = 0;
+       u16 devState = 0;
+       unsigned long flags = 0;
+       unsigned long *argp = NULL;
+       struct hfi1_packet_filter_command filter_cmd = {0};
+       int mode_flag = 0;
+       struct hfi1_pportdata *ppd = NULL;
+       unsigned int index;
+       struct hfi1_link_info link_info;
+
+       dd = hfi1_dd_from_sc_inode(fp->f_inode);
+       if (dd == NULL)
+               return -ENODEV;
+
+       spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
+
+       mode_flag = dd->hfi1_snoop.mode_flag;
+
+       if (((_IOC_DIR(cmd) & _IOC_READ)
+           && !access_ok(VERIFY_WRITE, (void __user *)arg, _IOC_SIZE(cmd)))
+           || ((_IOC_DIR(cmd) & _IOC_WRITE)
+           && !access_ok(VERIFY_READ, (void __user *)arg, _IOC_SIZE(cmd)))) {
+               ret = -EFAULT;
+       } else if (!capable(CAP_SYS_ADMIN)) {
+               ret = -EPERM;
+       } else if ((mode_flag & HFI1_PORT_CAPTURE_MODE) &&
+                  (cmd != HFI1_SNOOP_IOCCLEARQUEUE) &&
+                  (cmd != HFI1_SNOOP_IOCCLEARFILTER) &&
+                  (cmd != HFI1_SNOOP_IOCSETFILTER)) {
+               /* Capture devices are allowed only 3 operations:
+                * 1. Clear capture queue
+                * 2. Clear capture filter
+                * 3. Set capture filter
+                * Anything else is invalid.
+                */
+               ret = -EINVAL;
+       } else {
+               switch (cmd) {
+               case HFI1_SNOOP_IOCSETLINKSTATE:
+                       snoop_dbg("HFI1_SNOOP_IOCSETLINKSTATE is not valid");
+                       ret = -EINVAL;
+                       break;
+
+               case HFI1_SNOOP_IOCSETLINKSTATE_EXTRA:
+                       memset(&link_info, 0, sizeof(link_info));
+
+                       ret = copy_from_user(&link_info,
+                               (struct hfi1_link_info __user *)arg,
+                               sizeof(link_info));
+                       if (ret)
+                               break;
+
+                       value = link_info.port_state;
+                       index = link_info.port_number;
+                       if (index > dd->num_pports - 1) {
+                               ret = -EINVAL;
+                               break;
+                       }
+
+                       ppd = &dd->pport[index];
+                       if (!ppd) {
+                               ret = -EINVAL;
+                               break;
+                       }
+
+                       /* What we want to transition to */
+                       physState = (value >> 4) & 0xF;
+                       linkState = value & 0xF;
+                       snoop_dbg("Setting link state 0x%x", value);
+
+                       switch (linkState) {
+                       case IB_PORT_NOP:
+                               if (physState == 0)
+                                       break;
+                                       /* fall through */
+                       case IB_PORT_DOWN:
+                               switch (physState) {
+                               case 0:
+                                       devState = HLS_DN_DOWNDEF;
+                                       break;
+                               case 2:
+                                       devState = HLS_DN_POLL;
+                                       break;
+                               case 3:
+                                       devState = HLS_DN_DISABLE;
+                                       break;
+                               default:
+                                       ret = -EINVAL;
+                                       goto done;
+                               }
+                               ret = set_link_state(ppd, devState);
+                               break;
+                       case IB_PORT_ARMED:
+                               ret = set_link_state(ppd, HLS_UP_ARMED);
+                               if (!ret)
+                                       send_idle_sma(dd, SMA_IDLE_ARM);
+                               break;
+                       case IB_PORT_ACTIVE:
+                               ret = set_link_state(ppd, HLS_UP_ACTIVE);
+                               if (!ret)
+                                       send_idle_sma(dd, SMA_IDLE_ACTIVE);
+                               break;
+                       default:
+                               ret = -EINVAL;
+                               break;
+                       }
+
+                       if (ret)
+                               break;
+                       /* fall through */
+               case HFI1_SNOOP_IOCGETLINKSTATE:
+               case HFI1_SNOOP_IOCGETLINKSTATE_EXTRA:
+                       if (cmd == HFI1_SNOOP_IOCGETLINKSTATE_EXTRA) {
+                               memset(&link_info, 0, sizeof(link_info));
+                               ret = copy_from_user(&link_info,
+                                       (struct hfi1_link_info __user *)arg,
+                                       sizeof(link_info));
+                               index = link_info.port_number;
+                       } else {
+                               ret = __get_user(index, (int __user *) arg);
+                               if (ret !=  0)
+                                       break;
+                       }
+
+                       if (index > dd->num_pports - 1) {
+                               ret = -EINVAL;
+                               break;
+                       }
+
+                       ppd = &dd->pport[index];
+                       if (!ppd) {
+                               ret = -EINVAL;
+                               break;
+                       }
+                       value = hfi1_ibphys_portstate(ppd);
+                       value <<= 4;
+                       value |= driver_lstate(ppd);
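+                       /* physical state in the high nibble, logical link
+                        * state in the low nibble, mirroring the decode in
+                        * the set path above
+                        */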
+
+                       snoop_dbg("Link port | Link State: %d", value);
+
+                       if ((cmd == HFI1_SNOOP_IOCGETLINKSTATE_EXTRA) ||
+                           (cmd == HFI1_SNOOP_IOCSETLINKSTATE_EXTRA)) {
+                               link_info.port_state = value;
+                               link_info.node_guid = cpu_to_be64(ppd->guid);
+                               link_info.link_speed_active =
+                                                       ppd->link_speed_active;
+                               link_info.link_width_active =
+                                                       ppd->link_width_active;
+                               ret = copy_to_user(
+                                       (struct hfi1_link_info __user *)arg,
+                                       &link_info, sizeof(link_info));
+                       } else {
+                               ret = __put_user(value, (int __user *)arg);
+                       }
+                       break;
+
+               case HFI1_SNOOP_IOCCLEARQUEUE:
+                       snoop_dbg("Clearing snoop queue");
+                       drain_snoop_list(&dd->hfi1_snoop.queue);
+                       break;
+
+               case HFI1_SNOOP_IOCCLEARFILTER:
+                       snoop_dbg("Clearing filter");
+                       if (dd->hfi1_snoop.filter_callback) {
+                               /* Drain packets first */
+                               drain_snoop_list(&dd->hfi1_snoop.queue);
+                               dd->hfi1_snoop.filter_callback = NULL;
+                       }
+                       kfree(dd->hfi1_snoop.filter_value);
+                       dd->hfi1_snoop.filter_value = NULL;
+                       break;
+
+               case HFI1_SNOOP_IOCSETFILTER:
+                       snoop_dbg("Setting filter");
+                       /* just copy command structure */
+                       argp = (unsigned long *)arg;
+                       ret = copy_from_user(&filter_cmd, (void __user *)argp,
+                                            sizeof(filter_cmd));
+                       if (ret) {
+                               ret = -EFAULT;
+                               pr_alert("Error copying filter command\n");
+                               break;
+                       }
+                       if (filter_cmd.opcode >= HFI1_MAX_FILTERS) {
+                               pr_alert("Invalid opcode in request\n");
+                               ret = -EINVAL;
+                               break;
+                       }
+
+                       snoop_dbg("Opcode %d Len %d Ptr %p",
+                                  filter_cmd.opcode, filter_cmd.length,
+                                  filter_cmd.value_ptr);
+
+                       filter_value = kzalloc(
+                                               filter_cmd.length * sizeof(u8),
+                                               GFP_KERNEL);
+                       if (!filter_value) {
+                               pr_alert("Not enough memory\n");
+                               ret = -ENOMEM;
+                               break;
+                       }
+                       /* copy remaining data from userspace */
+                       ret = copy_from_user((u8 *)filter_value,
+                                       (void __user *)filter_cmd.value_ptr,
+                                       filter_cmd.length);
+                       if (ret) {
+                               ret = -EFAULT;
+                               kfree(filter_value);
+                               pr_alert("Error copying filter data\n");
+                               break;
+                       }
+                       /* Drain packets first */
+                       drain_snoop_list(&dd->hfi1_snoop.queue);
+                       dd->hfi1_snoop.filter_callback =
+                               hfi1_filters[filter_cmd.opcode].filter;
+                       /* just in case we see back to back sets */
+                       kfree(dd->hfi1_snoop.filter_value);
+                       dd->hfi1_snoop.filter_value = filter_value;
+
+                       break;
+               case HFI1_SNOOP_IOCGETVERSION:
+                       value = SNOOP_CAPTURE_VERSION;
+                       snoop_dbg("Getting version: %d", value);
+                       ret = __put_user(value, (int __user *)arg);
+                       break;
+               case HFI1_SNOOP_IOCSET_OPTS:
+                       snoop_flags = 0;
+                       ret = __get_user(value, (int __user *) arg);
+                       if (ret != 0)
+                               break;
+
+                       snoop_dbg("Setting snoop option %d", value);
+                       if (value & SNOOP_DROP_SEND)
+                               snoop_flags |= SNOOP_DROP_SEND;
+                       if (value & SNOOP_USE_METADATA)
+                               snoop_flags |= SNOOP_USE_METADATA;
+                       break;
+               default:
+                       ret = -ENOTTY;
+                       break;
+               }
+       }
+done:
+       spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
+       return ret;
+}
+
+static void snoop_list_add_tail(struct snoop_packet *packet,
+                               struct hfi1_devdata *dd)
+{
+       unsigned long flags = 0;
+
+       spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags);
+       if (likely((dd->hfi1_snoop.mode_flag & HFI1_PORT_SNOOP_MODE) ||
+                  (dd->hfi1_snoop.mode_flag & HFI1_PORT_CAPTURE_MODE))) {
+               list_add_tail(&packet->list, &dd->hfi1_snoop.queue);
+               snoop_dbg("Added packet to list");
+       }
+
+       /*
+        * Technically the snoop device could have been closed while we were
+        * waiting on the above lock and be gone by now. The snoop mode_flag
+        * will prevent us from adding the packet to the queue though.
+        */
+
+       spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags);
+       wake_up_interruptible(&dd->hfi1_snoop.waitq);
+}
+
+static inline int hfi1_filter_check(void *val, const char *msg)
+{
+       if (!val) {
+               snoop_dbg("Error invalid %s value for filter", msg);
+               return HFI1_FILTER_ERR;
+       }
+       return 0;
+}
+
+static int hfi1_filter_lid(void *ibhdr, void *packet_data, void *value)
+{
+       struct hfi1_ib_header *hdr;
+       int ret;
+
+       ret = hfi1_filter_check(ibhdr, "header");
+       if (ret)
+               return ret;
+       ret = hfi1_filter_check(value, "user");
+       if (ret)
+               return ret;
+       hdr = (struct hfi1_ib_header *)ibhdr;
+
+       if (*((u16 *)value) == be16_to_cpu(hdr->lrh[3])) /* matches slid */
+               return HFI1_FILTER_HIT; /* matched */
+
+       return HFI1_FILTER_MISS; /* Not matched */
+}
+
+static int hfi1_filter_dlid(void *ibhdr, void *packet_data, void *value)
+{
+       struct hfi1_ib_header *hdr;
+       int ret;
+
+       ret = hfi1_filter_check(ibhdr, "header");
+       if (ret)
+               return ret;
+       ret = hfi1_filter_check(value, "user");
+       if (ret)
+               return ret;
+
+       hdr = (struct hfi1_ib_header *)ibhdr;
+
+       if (*((u16 *)value) == be16_to_cpu(hdr->lrh[1]))
+               return HFI1_FILTER_HIT;
+
+       return HFI1_FILTER_MISS;
+}
+
+/* Not valid for outgoing packets; the send handler passes NULL for packet_data */
+static int hfi1_filter_mad_mgmt_class(void *ibhdr, void *packet_data,
+                                     void *value)
+{
+       struct hfi1_ib_header *hdr;
+       struct hfi1_other_headers *ohdr = NULL;
+       struct ib_smp *smp = NULL;
+       u32 qpn = 0;
+       int ret;
+
+       ret = hfi1_filter_check(ibhdr, "header");
+       if (ret)
+               return ret;
+       ret = hfi1_filter_check(packet_data, "packet_data");
+       if (ret)
+               return ret;
+       ret = hfi1_filter_check(value, "user");
+       if (ret)
+               return ret;
+
+       hdr = (struct hfi1_ib_header *)ibhdr;
+
+       /* Check for GRH */
+       if ((be16_to_cpu(hdr->lrh[0]) & 3) == HFI1_LRH_BTH)
+               ohdr = &hdr->u.oth; /* LRH + BTH + DETH */
+       else
+               ohdr = &hdr->u.l.oth; /* LRH + GRH + BTH + DETH */
+
+       qpn = be32_to_cpu(ohdr->bth[1]) & 0x00FFFFFF;
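+       /* only QP0 (SMI) and QP1 (GSI) carry MAD traffic */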
+       if (qpn <= 1) {
+               smp = (struct ib_smp *)packet_data;
+               if (*((u8 *)value) == smp->mgmt_class)
+                       return HFI1_FILTER_HIT;
+               else
+                       return HFI1_FILTER_MISS;
+       }
+       return HFI1_FILTER_ERR;
+}
+
+static int hfi1_filter_qp_number(void *ibhdr, void *packet_data, void *value)
+{
+
+       struct hfi1_ib_header *hdr;
+       struct hfi1_other_headers *ohdr = NULL;
+       int ret;
+
+       ret = hfi1_filter_check(ibhdr, "header");
+       if (ret)
+               return ret;
+       ret = hfi1_filter_check(value, "user");
+       if (ret)
+               return ret;
+
+       hdr = (struct hfi1_ib_header *)ibhdr;
+
+       /* Check for GRH */
+       if ((be16_to_cpu(hdr->lrh[0]) & 3) == HFI1_LRH_BTH)
+               ohdr = &hdr->u.oth; /* LRH + BTH + DETH */
+       else
+               ohdr = &hdr->u.l.oth; /* LRH + GRH + BTH + DETH */
+       if (*((u32 *)value) == (be32_to_cpu(ohdr->bth[1]) & 0x00FFFFFF))
+               return HFI1_FILTER_HIT;
+
+       return HFI1_FILTER_MISS;
+}
+
+static int hfi1_filter_ibpacket_type(void *ibhdr, void *packet_data,
+                                    void *value)
+{
+       u32 lnh = 0;
+       u8 opcode = 0;
+       struct hfi1_ib_header *hdr;
+       struct hfi1_other_headers *ohdr = NULL;
+       int ret;
+
+       ret = hfi1_filter_check(ibhdr, "header");
+       if (ret)
+               return ret;
+       ret = hfi1_filter_check(value, "user");
+       if (ret)
+               return ret;
+
+       hdr = (struct hfi1_ib_header *)ibhdr;
+
+       lnh = (be16_to_cpu(hdr->lrh[0]) & 3);
+
+       if (lnh == HFI1_LRH_BTH)
+               ohdr = &hdr->u.oth;
+       else if (lnh == HFI1_LRH_GRH)
+               ohdr = &hdr->u.l.oth;
+       else
+               return HFI1_FILTER_ERR;
+
+       opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
+
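+       /* the top 3 bits of the BTH opcode encode the packet/transport type */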
+       if (*((u8 *)value) == ((opcode >> 5) & 0x7))
+               return HFI1_FILTER_HIT;
+
+       return HFI1_FILTER_MISS;
+}
+
+static int hfi1_filter_ib_service_level(void *ibhdr, void *packet_data,
+                                       void *value)
+{
+       struct hfi1_ib_header *hdr;
+       int ret;
+
+       ret = hfi1_filter_check(ibhdr, "header");
+       if (ret)
+               return ret;
+       ret = hfi1_filter_check(value, "user");
+       if (ret)
+               return ret;
+
+       hdr = (struct hfi1_ib_header *)ibhdr;
+
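+       /* the SL occupies bits 7:4 of the first LRH word */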
+       if ((*((u8 *)value)) == ((be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF))
+               return HFI1_FILTER_HIT;
+
+       return HFI1_FILTER_MISS;
+}
+
+static int hfi1_filter_ib_pkey(void *ibhdr, void *packet_data, void *value)
+{
+
+       u32 lnh = 0;
+       struct hfi1_ib_header *hdr;
+       struct hfi1_other_headers *ohdr = NULL;
+       int ret;
+
+       ret = hfi1_filter_check(ibhdr, "header");
+       if (ret)
+               return ret;
+       ret = hfi1_filter_check(value, "user");
+       if (ret)
+               return ret;
+
+       hdr = (struct hfi1_ib_header *)ibhdr;
+
+       lnh = (be16_to_cpu(hdr->lrh[0]) & 3);
+       if (lnh == HFI1_LRH_BTH)
+               ohdr = &hdr->u.oth;
+       else if (lnh == HFI1_LRH_GRH)
+               ohdr = &hdr->u.l.oth;
+       else
+               return HFI1_FILTER_ERR;
+
+       /* The P_Key is a 16-bit entity, but the topmost bit indicates the
+        * type of membership: 0 for limited and 1 for full.
+        * Limited members cannot accept information from other
+        * limited members, but communication is allowed between
+        * every other combination of membership.
+        * Hence we omit the topmost bit when comparing while filtering.
+        */
+
+       if ((*(u16 *)value & 0x7FFF) ==
+               ((be32_to_cpu(ohdr->bth[0])) & 0x7FFF))
+               return HFI1_FILTER_HIT;
+
+       return HFI1_FILTER_MISS;
+}
+
+/*
+ * If packet_data is NULL then this is coming from one of the send functions.
+ * Thus we know whether it is an ingress or egress packet.
+ */
+static int hfi1_filter_direction(void *ibhdr, void *packet_data, void *value)
+{
+       u8 user_dir;
+       int ret;
+
+       ret = hfi1_filter_check(value, "user");
+       if (ret)
+               return ret;
+
+       /* only dereference value once the check above has passed */
+       user_dir = *(u8 *)value;
+
+       if (packet_data) {
+               /* Incoming packet */
+               if (user_dir & HFI1_SNOOP_INGRESS)
+                       return HFI1_FILTER_HIT;
+       } else {
+               /* Outgoing packet */
+               if (user_dir & HFI1_SNOOP_EGRESS)
+                       return HFI1_FILTER_HIT;
+       }
+
+       return HFI1_FILTER_MISS;
+}
+
+/*
+ * Allocate a snoop packet.  This is the structure that is stored in the ring
+ * buffer, not to be confused with an hfi1 packet type.
+ */
+static struct snoop_packet *allocate_snoop_packet(u32 hdr_len,
+                                                 u32 data_len,
+                                                 u32 md_len)
+{
+
+       struct snoop_packet *packet = NULL;
+
+       packet = kzalloc(sizeof(struct snoop_packet) + hdr_len + data_len
+                        + md_len,
+                        GFP_ATOMIC | __GFP_NOWARN);
+       if (likely(packet))
+               INIT_LIST_HEAD(&packet->list);
+
+       return packet;
+}
+
+/*
+ * Instead of having snoop and capture code intermixed with the recv functions
+ * (both the interrupt handler and hfi1_ib_rcv()), we hijack the call and land
+ * in here for snoop/capture; if neither is enabled the call goes through as
+ * before. This gives us a single point to constrain all of the snoop recv
+ * logic. There is nothing special that needs to happen for bypass packets.
+ * This routine should not try to look into the packet; it just copies it.
+ * There is no guarantee for filters when it comes to bypass packets as there
+ * is no specific support. Bottom line: this routine does not even know what
+ * a bypass packet is.
+ */
+int snoop_recv_handler(struct hfi1_packet *packet)
+{
+       struct hfi1_pportdata *ppd = packet->rcd->ppd;
+       struct hfi1_ib_header *hdr = packet->hdr;
+       int header_size = packet->hlen;
+       void *data = packet->ebuf;
+       u32 tlen = packet->tlen;
+       struct snoop_packet *s_packet = NULL;
+       int ret;
+       int snoop_mode = 0;
+       u32 md_len = 0;
+       struct capture_md md;
+
+       snoop_dbg("PACKET IN: hdr size %d tlen %d data %p", header_size, tlen,
+                 data);
+
+       trace_snoop_capture(ppd->dd, header_size, hdr, tlen - header_size,
+                           data);
+
+       if (!ppd->dd->hfi1_snoop.filter_callback) {
+               snoop_dbg("filter not set");
+               ret = HFI1_FILTER_HIT;
+       } else {
+               ret = ppd->dd->hfi1_snoop.filter_callback(hdr, data,
+                                       ppd->dd->hfi1_snoop.filter_value);
+       }
+
+       switch (ret) {
+       case HFI1_FILTER_ERR:
+               snoop_dbg("Error in filter call");
+               break;
+       case HFI1_FILTER_MISS:
+               snoop_dbg("Filter Miss");
+               break;
+       case HFI1_FILTER_HIT:
+
+               if (ppd->dd->hfi1_snoop.mode_flag & HFI1_PORT_SNOOP_MODE)
+                       snoop_mode = 1;
+               if ((snoop_mode == 0) ||
+                   unlikely(snoop_flags & SNOOP_USE_METADATA))
+                       md_len = sizeof(struct capture_md);
+
+               s_packet = allocate_snoop_packet(header_size,
+                                                tlen - header_size,
+                                                md_len);
+
+               if (unlikely(s_packet == NULL)) {
+                       dd_dev_warn_ratelimited(ppd->dd, "Unable to allocate snoop/capture packet\n");
+                       break;
+               }
+
+               if (md_len > 0) {
+                       memset(&md, 0, sizeof(struct capture_md));
+                       md.port = 1;
+                       md.dir = PKT_DIR_INGRESS;
+                       md.u.rhf = packet->rhf;
+                       memcpy(s_packet->data, &md, md_len);
+               }
+
+               /* We should always have a header */
+               if (hdr) {
+                       memcpy(s_packet->data + md_len, hdr, header_size);
+               } else {
+                       dd_dev_err(ppd->dd, "Unable to copy header to snoop/capture packet\n");
+                       kfree(s_packet);
+                       break;
+               }
+
+               /*
+                * Packets with no data are possible. If there is no data we
+                * need to take care of the last 4 bytes, which are normally
+                * included with data buffers and are included in tlen.  Since
+                * we kzalloc the buffer we do not need to set any values, but
+                * if we decide not to use kzalloc we should zero them.
+                */
+               if (data)
+                       memcpy(s_packet->data + header_size + md_len, data,
+                              tlen - header_size);
+
+               s_packet->total_len = tlen + md_len;
+               snoop_list_add_tail(s_packet, ppd->dd);
+
+               /*
+                * If we are snooping the packet (not capturing), throw it away
+                * after adding it to the list.
+                */
+               snoop_dbg("Capturing packet");
+               if (ppd->dd->hfi1_snoop.mode_flag & HFI1_PORT_SNOOP_MODE) {
+                       snoop_dbg("Throwing packet away");
+                       /*
+                        * If we are dropping the packet we still may need to
+                        * handle the case where error flags are set, this is
+                        * normally done by the type specific handler but that
+                        * won't be called in this case.
+                        */
+                       if (unlikely(rhf_err_flags(packet->rhf)))
+                               handle_eflags(packet);
+
+                       /* throw the packet on the floor */
+                       return RHF_RCV_CONTINUE;
+               }
+               break;
+       default:
+               break;
+       }
+
+       /*
+        * We do not care what type of packet came in here - just pass it off
+        * to the normal handler.
+        */
+       return ppd->dd->normal_rhf_rcv_functions[rhf_rcv_type(packet->rhf)]
+                       (packet);
+}
+
+/*
+ * Handle snooping and capturing packets when sdma is being used.
+ */
+int snoop_send_dma_handler(struct hfi1_qp *qp, struct ahg_ib_header *ibhdr,
+                          u32 hdrwords, struct hfi1_sge_state *ss, u32 len,
+                          u32 plen, u32 dwords, u64 pbc)
+{
+       pr_alert("Snooping/Capture of  Send DMA Packets Is Not Supported!\n");
+       snoop_dbg("Unsupported Operation");
+       return hfi1_verbs_send_dma(qp, ibhdr, hdrwords, ss, len, plen, dwords,
+                                 0);
+}
+
+/*
+ * Handle snooping and capturing packets when pio is being used. Does not
+ * handle bypass packets. The only way to send a bypass packet currently is to
+ * use the diagpkt interface; when that interface is enabled, snoop/capture is
+ * not.
+ */
+int snoop_send_pio_handler(struct hfi1_qp *qp, struct ahg_ib_header *ahdr,
+                          u32 hdrwords, struct hfi1_sge_state *ss, u32 len,
+                          u32 plen, u32 dwords, u64 pbc)
+{
+       struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       struct snoop_packet *s_packet = NULL;
+       u32 *hdr = (u32 *)&ahdr->ibh;
+       u32 length = 0;
+       struct hfi1_sge_state temp_ss;
+       void *data = NULL;
+       void *data_start = NULL;
+       int ret;
+       int snoop_mode = 0;
+       int md_len = 0;
+       struct capture_md md;
+       u32 vl;
+       u32 hdr_len = hdrwords << 2;
+       u32 tlen = HFI1_GET_PKT_LEN(&ahdr->ibh);
+
+       md.u.pbc = 0;
+
+       snoop_dbg("PACKET OUT: hdrword %u len %u plen %u dwords %u tlen %u",
+                 hdrwords, len, plen, dwords, tlen);
+       if (ppd->dd->hfi1_snoop.mode_flag & HFI1_PORT_SNOOP_MODE)
+               snoop_mode = 1;
+       if ((snoop_mode == 0) ||
+           unlikely(snoop_flags & SNOOP_USE_METADATA))
+               md_len = sizeof(struct capture_md);
+
+       /* not using ss->total_len as arg 2 b/c that does not count CRC */
+       s_packet = allocate_snoop_packet(hdr_len, tlen - hdr_len, md_len);
+
+       if (unlikely(s_packet == NULL)) {
+               dd_dev_warn_ratelimited(ppd->dd, "Unable to allocate snoop/capture packet\n");
+               goto out;
+       }
+
+       s_packet->total_len = tlen + md_len;
+
+       if (md_len > 0) {
+               memset(&md, 0, sizeof(struct capture_md));
+               md.port = 1;
+               md.dir = PKT_DIR_EGRESS;
+               if (likely(pbc == 0)) {
+                       vl = be16_to_cpu(ahdr->ibh.lrh[0]) >> 12;
+                       md.u.pbc = create_pbc(ppd, 0, qp->s_srate, vl, plen);
+               } else {
+                       md.u.pbc = 0;
+               }
+               memcpy(s_packet->data, &md, md_len);
+       } else {
+               md.u.pbc = pbc;
+       }
+
+       /* Copy header */
+       if (likely(hdr)) {
+               memcpy(s_packet->data + md_len, hdr, hdr_len);
+       } else {
+               dd_dev_err(ppd->dd,
+                          "Unable to copy header to snoop/capture packet\n");
+               kfree(s_packet);
+               goto out;
+       }
+
+       if (ss) {
+               data = s_packet->data + hdr_len + md_len;
+               data_start = data;
+
+               /*
+                * Copy SGE State
+                * The update_sge() function below will not modify the
+                * individual SGEs in the array. It will make a copy each time
+                * and operate on that. So we only need to copy this instance
+                * and it won't impact PIO.
+                */
+               temp_ss = *ss;
+               length = len;
+
+               snoop_dbg("Need to copy %d bytes", length);
+               while (length) {
+                       void *addr = temp_ss.sge.vaddr;
+                       u32 slen = temp_ss.sge.length;
+
+                       if (slen > length) {
+                               slen = length;
+                               snoop_dbg("slen %d > len %d", slen, length);
+                       }
+                       snoop_dbg("copy %d to %p", slen, addr);
+                       memcpy(data, addr, slen);
+                       update_sge(&temp_ss, slen);
+                       length -= slen;
+                       data += slen;
+                       snoop_dbg("data is now %p bytes left %d", data, length);
+               }
+               snoop_dbg("Completed SGE copy");
+       }
+
+       /*
+        * Why do the filter check down here? Because the event tracing has its
+        * own filtering and we need to have walked the SGE list.
+        */
+       if (!ppd->dd->hfi1_snoop.filter_callback) {
+               snoop_dbg("filter not set\n");
+               ret = HFI1_FILTER_HIT;
+       } else {
+               ret = ppd->dd->hfi1_snoop.filter_callback(
+                                       &ahdr->ibh,
+                                       NULL,
+                                       ppd->dd->hfi1_snoop.filter_value);
+       }
+
+       switch (ret) {
+       case HFI1_FILTER_ERR:
+               snoop_dbg("Error in filter call");
+               /* fall through */
+       case HFI1_FILTER_MISS:
+               snoop_dbg("Filter Miss");
+               kfree(s_packet);
+               break;
+       case HFI1_FILTER_HIT:
+               snoop_dbg("Capturing packet");
+               snoop_list_add_tail(s_packet, ppd->dd);
+
+               if (unlikely((snoop_flags & SNOOP_DROP_SEND) &&
+                            (ppd->dd->hfi1_snoop.mode_flag &
+                             HFI1_PORT_SNOOP_MODE))) {
+                       unsigned long flags;
+
+                       snoop_dbg("Dropping packet");
+                       if (qp->s_wqe) {
+                               spin_lock_irqsave(&qp->s_lock, flags);
+                               hfi1_send_complete(
+                                       qp,
+                                       qp->s_wqe,
+                                       IB_WC_SUCCESS);
+                               spin_unlock_irqrestore(&qp->s_lock, flags);
+                       } else if (qp->ibqp.qp_type == IB_QPT_RC) {
+                               spin_lock_irqsave(&qp->s_lock, flags);
+                               hfi1_rc_send_complete(qp, &ahdr->ibh);
+                               spin_unlock_irqrestore(&qp->s_lock, flags);
+                       }
+                       return 0;
+               }
+               break;
+       default:
+               kfree(s_packet);
+               break;
+       }
+out:
+       return hfi1_verbs_send_pio(qp, ahdr, hdrwords, ss, len, plen, dwords,
+                                 md.u.pbc);
+}
+
+/*
+ * Callers of this must pass an hfi1_ib_header type for the from ptr. Currently
+ * this can be used anywhere, but the intention is for inline ACKs for RC and
+ * CCA packets. We don't restrict this usage though.
+ */
+void snoop_inline_pio_send(struct hfi1_devdata *dd, struct pio_buf *pbuf,
+                          u64 pbc, const void *from, size_t count)
+{
+       int snoop_mode = 0;
+       int md_len = 0;
+       struct capture_md md;
+       struct snoop_packet *s_packet = NULL;
+
+       /*
+        * count is in dwords so we need to convert to bytes.
+        * We also need to account for CRC which would be tacked on by hardware.
+        */
+       int packet_len = (count << 2) + 4;
+       int ret;
+
+       snoop_dbg("ACK OUT: len %d", packet_len);
+
+       if (!dd->hfi1_snoop.filter_callback) {
+               snoop_dbg("filter not set");
+               ret = HFI1_FILTER_HIT;
+       } else {
+               ret = dd->hfi1_snoop.filter_callback(
+                               (struct hfi1_ib_header *)from,
+                               NULL,
+                               dd->hfi1_snoop.filter_value);
+       }
+
+       switch (ret) {
+       case HFI1_FILTER_ERR:
+               snoop_dbg("Error in filter call");
+               /* fall through */
+       case HFI1_FILTER_MISS:
+               snoop_dbg("Filter Miss");
+               break;
+       case HFI1_FILTER_HIT:
+               snoop_dbg("Capturing packet");
+               if (dd->hfi1_snoop.mode_flag & HFI1_PORT_SNOOP_MODE)
+                       snoop_mode = 1;
+               if ((snoop_mode == 0) ||
+                   unlikely(snoop_flags & SNOOP_USE_METADATA))
+                       md_len = sizeof(struct capture_md);
+
+               s_packet = allocate_snoop_packet(packet_len, 0, md_len);
+
+               if (unlikely(s_packet == NULL)) {
+                       dd_dev_warn_ratelimited(dd, "Unable to allocate snoop/capture packet\n");
+                       goto inline_pio_out;
+               }
+
+               s_packet->total_len = packet_len + md_len;
+
+               /* Fill in the metadata for the packet */
+               if (md_len > 0) {
+                       memset(&md, 0, sizeof(struct capture_md));
+                       md.port = 1;
+                       md.dir = PKT_DIR_EGRESS;
+                       md.u.pbc = pbc;
+                       memcpy(s_packet->data, &md, md_len);
+               }
+
+               /* Add the packet data which is a single buffer */
+               memcpy(s_packet->data + md_len, from, packet_len);
+
+               snoop_list_add_tail(s_packet, dd);
+
+               if (unlikely((snoop_flags & SNOOP_DROP_SEND) && snoop_mode)) {
+                       snoop_dbg("Dropping packet");
+                       return;
+               }
+               break;
+       default:
+               break;
+       }
+
+inline_pio_out:
+       pio_copy(dd, pbuf, pbc, from, count);
+
+}
diff --git a/drivers/staging/rdma/hfi1/dma.c b/drivers/staging/rdma/hfi1/dma.c
new file mode 100644 (file)
index 0000000..e03bd73
--- /dev/null
@@ -0,0 +1,186 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/types.h>
+#include <linux/scatterlist.h>
+
+#include "verbs.h"
+
+#define BAD_DMA_ADDRESS ((u64) 0)
+
+/*
+ * The following functions implement driver specific replacements
+ * for the ib_dma_*() functions.
+ *
+ * These functions return kernel virtual addresses instead of
+ * device bus addresses since the driver uses the CPU to copy
+ * data instead of using hardware DMA.
+ */
+
+static int hfi1_mapping_error(struct ib_device *dev, u64 dma_addr)
+{
+       return dma_addr == BAD_DMA_ADDRESS;
+}
+
+static u64 hfi1_dma_map_single(struct ib_device *dev, void *cpu_addr,
+                              size_t size, enum dma_data_direction direction)
+{
+       if (WARN_ON(!valid_dma_direction(direction)))
+               return BAD_DMA_ADDRESS;
+
+       return (u64) cpu_addr;
+}
+
+static void hfi1_dma_unmap_single(struct ib_device *dev, u64 addr, size_t size,
+                                 enum dma_data_direction direction)
+{
+       /* This is a stub, nothing to be done here */
+}
+
+static u64 hfi1_dma_map_page(struct ib_device *dev, struct page *page,
+                            unsigned long offset, size_t size,
+                           enum dma_data_direction direction)
+{
+       u64 addr;
+
+       if (WARN_ON(!valid_dma_direction(direction)))
+               return BAD_DMA_ADDRESS;
+
+       if (offset + size > PAGE_SIZE)
+               return BAD_DMA_ADDRESS;
+
+       addr = (u64) page_address(page);
+       if (addr)
+               addr += offset;
+
+       return addr;
+}
+
+static void hfi1_dma_unmap_page(struct ib_device *dev, u64 addr, size_t size,
+                               enum dma_data_direction direction)
+{
+       /* This is a stub, nothing to be done here */
+}
+
+static int hfi1_map_sg(struct ib_device *dev, struct scatterlist *sgl,
+                      int nents, enum dma_data_direction direction)
+{
+       struct scatterlist *sg;
+       u64 addr;
+       int i;
+       int ret = nents;
+
+       if (WARN_ON(!valid_dma_direction(direction)))
+               return BAD_DMA_ADDRESS;
+
+       for_each_sg(sgl, sg, nents, i) {
+               addr = (u64) page_address(sg_page(sg));
+               if (!addr) {
+                       ret = 0;
+                       break;
+               }
+               sg->dma_address = addr + sg->offset;
+#ifdef CONFIG_NEED_SG_DMA_LENGTH
+               sg->dma_length = sg->length;
+#endif
+       }
+       return ret;
+}
+
+static void hfi1_unmap_sg(struct ib_device *dev,
+                         struct scatterlist *sg, int nents,
+                        enum dma_data_direction direction)
+{
+       /* This is a stub, nothing to be done here */
+}
+
+static void hfi1_sync_single_for_cpu(struct ib_device *dev, u64 addr,
+                                    size_t size, enum dma_data_direction dir)
+{
+}
+
+static void hfi1_sync_single_for_device(struct ib_device *dev, u64 addr,
+                                       size_t size,
+                                       enum dma_data_direction dir)
+{
+}
+
+static void *hfi1_dma_alloc_coherent(struct ib_device *dev, size_t size,
+                                    u64 *dma_handle, gfp_t flag)
+{
+       struct page *p;
+       void *addr = NULL;
+
+       p = alloc_pages(flag, get_order(size));
+       if (p)
+               addr = page_address(p);
+       if (dma_handle)
+               *dma_handle = (u64) addr;
+       return addr;
+}
+
+static void hfi1_dma_free_coherent(struct ib_device *dev, size_t size,
+                                  void *cpu_addr, u64 dma_handle)
+{
+       free_pages((unsigned long) cpu_addr, get_order(size));
+}
+
+struct ib_dma_mapping_ops hfi1_dma_mapping_ops = {
+       .mapping_error = hfi1_mapping_error,
+       .map_single = hfi1_dma_map_single,
+       .unmap_single = hfi1_dma_unmap_single,
+       .map_page = hfi1_dma_map_page,
+       .unmap_page = hfi1_dma_unmap_page,
+       .map_sg = hfi1_map_sg,
+       .unmap_sg = hfi1_unmap_sg,
+       .sync_single_for_cpu = hfi1_sync_single_for_cpu,
+       .sync_single_for_device = hfi1_sync_single_for_device,
+       .alloc_coherent = hfi1_dma_alloc_coherent,
+       .free_coherent = hfi1_dma_free_coherent
+};
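+
+/*
+ * Note: the verbs core dispatches ib_dma_map_single() and friends through
+ * ib_device->dma_ops, so this table is presumably attached when the hfi1
+ * ib_device is registered (e.g. ibdev->dma_ops = &hfi1_dma_mapping_ops;),
+ * after which the ib_dma_*() helpers simply hand back kernel virtual
+ * addresses as described above.
+ */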
diff --git a/drivers/staging/rdma/hfi1/driver.c b/drivers/staging/rdma/hfi1/driver.c
new file mode 100644 (file)
index 0000000..c0a5900
--- /dev/null
@@ -0,0 +1,1241 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <linux/io.h>
+#include <linux/delay.h>
+#include <linux/netdevice.h>
+#include <linux/vmalloc.h>
+#include <linux/module.h>
+#include <linux/prefetch.h>
+
+#include "hfi.h"
+#include "trace.h"
+#include "qp.h"
+#include "sdma.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) DRIVER_NAME ": " fmt
+
+/*
+ * The size has to be longer than this string, so we can append
+ * board/chip information to it in the initialization code.
+ */
+const char ib_hfi1_version[] = HFI1_DRIVER_VERSION "\n";
+
+DEFINE_SPINLOCK(hfi1_devs_lock);
+LIST_HEAD(hfi1_dev_list);
+DEFINE_MUTEX(hfi1_mutex);      /* general driver use */
+
+unsigned int hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU;
+module_param_named(max_mtu, hfi1_max_mtu, uint, S_IRUGO);
+MODULE_PARM_DESC(max_mtu, "Set max MTU bytes, default is 8192");
+
+unsigned int hfi1_cu = 1;
+module_param_named(cu, hfi1_cu, uint, S_IRUGO);
+MODULE_PARM_DESC(cu, "Credit return units");
+
+unsigned long hfi1_cap_mask = HFI1_CAP_MASK_DEFAULT;
+static int hfi1_caps_set(const char *, const struct kernel_param *);
+static int hfi1_caps_get(char *, const struct kernel_param *);
+static const struct kernel_param_ops cap_ops = {
+       .set = hfi1_caps_set,
+       .get = hfi1_caps_get
+};
+module_param_cb(cap_mask, &cap_ops, &hfi1_cap_mask, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(cap_mask, "Bit mask of enabled/disabled HW features");
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("Intel Omni-Path Architecture driver");
+MODULE_VERSION(HFI1_DRIVER_VERSION);
+
+/*
+ * MAX_PKT_RECV is the max # of packets processed per receive interrupt.
+ */
+#define MAX_PKT_RECV 64
+#define EGR_HEAD_UPDATE_THRESHOLD 16
+
+struct hfi1_ib_stats hfi1_stats;
+
+static int hfi1_caps_set(const char *val, const struct kernel_param *kp)
+{
+       int ret = 0;
+       unsigned long *cap_mask_ptr = (unsigned long *)kp->arg,
+               cap_mask = *cap_mask_ptr, value, diff,
+               write_mask = ((HFI1_CAP_WRITABLE_MASK << HFI1_CAP_USER_SHIFT) |
+                             HFI1_CAP_WRITABLE_MASK);
+
+       ret = kstrtoul(val, 0, &value);
+       if (ret) {
+               pr_warn("Invalid module parameter value for 'cap_mask'\n");
+               goto done;
+       }
+       /* Get the changed bits (except the locked bit) */
+       diff = value ^ (cap_mask & ~HFI1_CAP_LOCKED_SMASK);
+
+       /* Remove any bits that are not allowed to change after driver load */
+       if (HFI1_CAP_LOCKED() && (diff & ~write_mask)) {
+               pr_warn("Ignoring non-writable capability bits %#lx\n",
+                       diff & ~write_mask);
+               diff &= write_mask;
+       }
+
+       /* Mask off any reserved bits */
+       diff &= ~HFI1_CAP_RESERVED_MASK;
+       /* Clear any previously set and changing bits */
+       cap_mask &= ~diff;
+       /* Update the bits with the new capability */
+       cap_mask |= (value & diff);
+       /* Check for any kernel/user restrictions */
+       diff = (cap_mask & (HFI1_CAP_MUST_HAVE_KERN << HFI1_CAP_USER_SHIFT)) ^
+               ((cap_mask & HFI1_CAP_MUST_HAVE_KERN) << HFI1_CAP_USER_SHIFT);
+       cap_mask &= ~diff;
+       /* Set the bitmask to the final set */
+       *cap_mask_ptr = cap_mask;
+done:
+       return ret;
+}
+
+static int hfi1_caps_get(char *buffer, const struct kernel_param *kp)
+{
+       unsigned long cap_mask = *(unsigned long *)kp->arg;
+
+       cap_mask &= ~HFI1_CAP_LOCKED_SMASK;
+       cap_mask |= ((cap_mask & HFI1_CAP_K2U) << HFI1_CAP_USER_SHIFT);
+
+       return scnprintf(buffer, PAGE_SIZE, "0x%lx", cap_mask);
+}
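+
+/*
+ * For illustration: module_param_cb() above registers cap_mask with
+ * S_IWUSR | S_IRUGO, so the mask can presumably be set at load time
+ * ("modprobe hfi1 cap_mask=<value>") or rewritten later through
+ * /sys/module/hfi1/parameters/cap_mask, with hfi1_caps_set() validating
+ * each write and hfi1_caps_get() reporting the effective mask.
+ */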
+
+const char *get_unit_name(int unit)
+{
+       static char iname[16];
+
+       snprintf(iname, sizeof(iname), DRIVER_NAME"_%u", unit);
+       return iname;
+}
+
+/*
+ * Return count of units with at least one port ACTIVE.
+ */
+int hfi1_count_active_units(void)
+{
+       struct hfi1_devdata *dd;
+       struct hfi1_pportdata *ppd;
+       unsigned long flags;
+       int pidx, nunits_active = 0;
+
+       spin_lock_irqsave(&hfi1_devs_lock, flags);
+       list_for_each_entry(dd, &hfi1_dev_list, list) {
+               if (!(dd->flags & HFI1_PRESENT) || !dd->kregbase)
+                       continue;
+               for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+                       ppd = dd->pport + pidx;
+                       if (ppd->lid && ppd->linkup) {
+                               nunits_active++;
+                               break;
+                       }
+               }
+       }
+       spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+       return nunits_active;
+}
+
+/*
+ * Return count of all units, optionally return in arguments
+ * the number of usable (present) units, and the number of
+ * ports that are up.
+ */
+int hfi1_count_units(int *npresentp, int *nupp)
+{
+       int nunits = 0, npresent = 0, nup = 0;
+       struct hfi1_devdata *dd;
+       unsigned long flags;
+       int pidx;
+       struct hfi1_pportdata *ppd;
+
+       spin_lock_irqsave(&hfi1_devs_lock, flags);
+
+       list_for_each_entry(dd, &hfi1_dev_list, list) {
+               nunits++;
+               if ((dd->flags & HFI1_PRESENT) && dd->kregbase)
+                       npresent++;
+               for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+                       ppd = dd->pport + pidx;
+                       if (ppd->lid && ppd->linkup)
+                               nup++;
+               }
+       }
+
+       spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+
+       if (npresentp)
+               *npresentp = npresent;
+       if (nupp)
+               *nupp = nup;
+
+       return nunits;
+}
+
+/*
+ * Get address of eager buffer from its index (allocated in chunks, not
+ * contiguous).
+ */
+static inline void *get_egrbuf(const struct hfi1_ctxtdata *rcd, u64 rhf,
+                              u8 *update)
+{
+       u32 idx = rhf_egr_index(rhf), offset = rhf_egr_buf_offset(rhf);
+
+       *update |= !(idx & (rcd->egrbufs.threshold - 1)) && !offset;
+       return (void *)(((u64)(rcd->egrbufs.rcvtids[idx].addr)) +
+                       (offset * RCV_BUF_BLOCK_SIZE));
+}
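+
+/*
+ * Note: the returned address is rcvtids[idx].addr plus the RHF byte offset
+ * (offset * RCV_BUF_BLOCK_SIZE); *update is flagged whenever idx crosses a
+ * multiple of egrbufs.threshold (assumed to be a power of two) with a zero
+ * offset, prompting an eager-head update.
+ */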
+
+/*
+ * Validate and encode a given RcvArray buffer size.
+ * The function will check whether the given size falls within
+ * allowed size ranges for the respective type and, optionally,
+ * return the proper encoding.
+ */
+inline int hfi1_rcvbuf_validate(u32 size, u8 type, u16 *encoded)
+{
+       if (unlikely(!IS_ALIGNED(size, PAGE_SIZE)))
+               return 0;
+       if (unlikely(size < MIN_EAGER_BUFFER))
+               return 0;
+       if (size >
+           (type == PT_EAGER ? MAX_EAGER_BUFFER : MAX_EXPECTED_BUFFER))
+               return 0;
+       if (encoded)
+               *encoded = ilog2(size / PAGE_SIZE) + 1;
+       return 1;
+}
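+
+/*
+ * For example, assuming 4 KiB pages, the encoding above is 1 for a 4 KiB
+ * buffer, 2 for 8 KiB and 5 for 64 KiB (i.e. ilog2(size / PAGE_SIZE) + 1).
+ */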
+
+static void rcv_hdrerr(struct hfi1_ctxtdata *rcd, struct hfi1_pportdata *ppd,
+                      struct hfi1_packet *packet)
+{
+       struct hfi1_message_header *rhdr = packet->hdr;
+       u32 rte = rhf_rcv_type_err(packet->rhf);
+       int lnh = be16_to_cpu(rhdr->lrh[0]) & 3;
+       struct hfi1_ibport *ibp = &ppd->ibport_data;
+
+       if (packet->rhf & (RHF_VCRC_ERR | RHF_ICRC_ERR))
+               return;
+
+       if (packet->rhf & RHF_TID_ERR) {
+               /* For TIDERR and RC QPs preemptively schedule a NAK */
+               struct hfi1_ib_header *hdr = (struct hfi1_ib_header *)rhdr;
+               struct hfi1_other_headers *ohdr = NULL;
+               u32 tlen = rhf_pkt_len(packet->rhf); /* in bytes */
+               u16 lid  = be16_to_cpu(hdr->lrh[1]);
+               u32 qp_num;
+               u32 rcv_flags = 0;
+
+               /* Sanity check packet */
+               if (tlen < 24)
+                       goto drop;
+
+               /* Check for GRH */
+               if (lnh == HFI1_LRH_BTH)
+                       ohdr = &hdr->u.oth;
+               else if (lnh == HFI1_LRH_GRH) {
+                       u32 vtf;
+
+                       ohdr = &hdr->u.l.oth;
+                       if (hdr->u.l.grh.next_hdr != IB_GRH_NEXT_HDR)
+                               goto drop;
+                       vtf = be32_to_cpu(hdr->u.l.grh.version_tclass_flow);
+                       if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION)
+                               goto drop;
+                       rcv_flags |= HFI1_HAS_GRH;
+               } else
+                       goto drop;
+
+               /* Get the destination QP number. */
+               qp_num = be32_to_cpu(ohdr->bth[1]) & HFI1_QPN_MASK;
+               if (lid < HFI1_MULTICAST_LID_BASE) {
+                       struct hfi1_qp *qp;
+
+                       rcu_read_lock();
+                       qp = hfi1_lookup_qpn(ibp, qp_num);
+                       if (!qp) {
+                               rcu_read_unlock();
+                               goto drop;
+                       }
+
+                       /*
+                        * Handle only RC QPs - for other QP types drop error
+                        * packet.
+                        */
+                       spin_lock(&qp->r_lock);
+
+                       /* Check for valid receive state. */
+                       if (!(ib_hfi1_state_ops[qp->state] &
+                             HFI1_PROCESS_RECV_OK)) {
+                               ibp->n_pkt_drops++;
+                       }
+
+                       switch (qp->ibqp.qp_type) {
+                       case IB_QPT_RC:
+                               hfi1_rc_hdrerr(
+                                       rcd,
+                                       hdr,
+                                       rcv_flags,
+                                       qp);
+                               break;
+                       default:
+                               /* For now don't handle any other QP types */
+                               break;
+                       }
+
+                       spin_unlock(&qp->r_lock);
+                       rcu_read_unlock();
+               } /* Unicast QP */
+       } /* Valid packet with TIDErr */
+
+       /* handle "RcvTypeErr" flags */
+       switch (rte) {
+       case RHF_RTE_ERROR_OP_CODE_ERR:
+       {
+               u32 opcode;
+               void *ebuf = NULL;
+               __be32 *bth = NULL;
+
+               if (rhf_use_egr_bfr(packet->rhf))
+                       ebuf = packet->ebuf;
+
+               if (ebuf == NULL)
+                       goto drop; /* this should never happen */
+
+               if (lnh == HFI1_LRH_BTH)
+                       bth = (__be32 *)ebuf;
+               else if (lnh == HFI1_LRH_GRH)
+                       bth = (__be32 *)((char *)ebuf + sizeof(struct ib_grh));
+               else
+                       goto drop;
+
+               opcode = be32_to_cpu(bth[0]) >> 24;
+               opcode &= 0xff;
+
+               if (opcode == IB_OPCODE_CNP) {
+                       /*
+                        * Only in pre-B0 h/w is the CNP_OPCODE handled
+                        * via this code path (errata 291394).
+                        */
+                       struct hfi1_qp *qp = NULL;
+                       u32 lqpn, rqpn;
+                       u16 rlid;
+                       u8 svc_type, sl, sc5;
+
+                       sc5  = (be16_to_cpu(rhdr->lrh[0]) >> 12) & 0xf;
+                       if (rhf_dc_info(packet->rhf))
+                               sc5 |= 0x10;
+                       sl = ibp->sc_to_sl[sc5];
+
+                       lqpn = be32_to_cpu(bth[1]) & HFI1_QPN_MASK;
+                       rcu_read_lock();
+                       qp = hfi1_lookup_qpn(ibp, lqpn);
+                       if (qp == NULL) {
+                               rcu_read_unlock();
+                               goto drop;
+                       }
+
+                       switch (qp->ibqp.qp_type) {
+                       case IB_QPT_UD:
+                               rlid = 0;
+                               rqpn = 0;
+                               svc_type = IB_CC_SVCTYPE_UD;
+                               break;
+                       case IB_QPT_UC:
+                               rlid = be16_to_cpu(rhdr->lrh[3]);
+                               rqpn = qp->remote_qpn;
+                               svc_type = IB_CC_SVCTYPE_UC;
+                               break;
+                       default:
+                               goto drop;
+                       }
+
+                       process_becn(ppd, sl, rlid, lqpn, rqpn, svc_type);
+                       rcu_read_unlock();
+               }
+
+               packet->rhf &= ~RHF_RCV_TYPE_ERR_SMASK;
+               break;
+       }
+       default:
+               break;
+       }
+
+drop:
+       return;
+}
+
+static inline void init_packet(struct hfi1_ctxtdata *rcd,
+                             struct hfi1_packet *packet)
+{
+
+       packet->rsize = rcd->rcvhdrqentsize; /* words */
+       packet->maxcnt = rcd->rcvhdrq_cnt * packet->rsize; /* words */
+       packet->rcd = rcd;
+       packet->updegr = 0;
+       packet->etail = -1;
+       packet->rhf_addr = (__le32 *) rcd->rcvhdrq + rcd->head +
+                          rcd->dd->rhf_offset;
+       packet->rhf = rhf_to_cpu(packet->rhf_addr);
+       packet->rhqoff = rcd->head;
+       packet->numpkt = 0;
+       packet->rcv_flags = 0;
+}
+
+#ifndef CONFIG_PRESCAN_RXQ
+static void prescan_rxq(struct hfi1_packet *packet) {}
+#else /* CONFIG_PRESCAN_RXQ */
+static int prescan_receive_queue;
+
+static void process_ecn(struct hfi1_qp *qp, struct hfi1_ib_header *hdr,
+                       struct hfi1_other_headers *ohdr,
+                       u64 rhf, struct ib_grh *grh)
+{
+       struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+       u32 bth1;
+       u8 sc5, svc_type;
+       int is_fecn, is_becn;
+
+       switch (qp->ibqp.qp_type) {
+       case IB_QPT_UD:
+               svc_type = IB_CC_SVCTYPE_UD;
+               break;
+       case IB_QPT_UC: /* LATER */
+       case IB_QPT_RC: /* LATER */
+       default:
+               return;
+       }
+
+       is_fecn = (be32_to_cpu(ohdr->bth[1]) >> HFI1_FECN_SHIFT) &
+                       HFI1_FECN_MASK;
+       is_becn = (be32_to_cpu(ohdr->bth[1]) >> HFI1_BECN_SHIFT) &
+                       HFI1_BECN_MASK;
+
+       sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
+       if (rhf_dc_info(rhf))
+               sc5 |= 0x10;
+
+       if (is_fecn) {
+               u32 src_qpn = be32_to_cpu(ohdr->u.ud.deth[1]) & HFI1_QPN_MASK;
+               u16 pkey = (u16)be32_to_cpu(ohdr->bth[0]);
+               u16 dlid = be16_to_cpu(hdr->lrh[1]);
+               u16 slid = be16_to_cpu(hdr->lrh[3]);
+
+               return_cnp(ibp, qp, src_qpn, pkey, dlid, slid, sc5, grh);
+       }
+
+       if (is_becn) {
+               struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+               u32 lqpn =  be32_to_cpu(ohdr->bth[1]) & HFI1_QPN_MASK;
+               u8 sl = ibp->sc_to_sl[sc5];
+
+               process_becn(ppd, sl, 0, lqpn, 0, svc_type);
+       }
+
+       /* turn off the BECN and FECN bits */
+       bth1 = be32_to_cpu(ohdr->bth[1]);
+       bth1 &= ~(HFI1_FECN_MASK << HFI1_FECN_SHIFT);
+       bth1 &= ~(HFI1_BECN_MASK << HFI1_BECN_SHIFT);
+       ohdr->bth[1] = cpu_to_be32(bth1);
+}
+
+struct ps_mdata {
+       struct hfi1_ctxtdata *rcd;
+       u32 rsize;
+       u32 maxcnt;
+       u32 ps_head;
+       u32 ps_tail;
+       u32 ps_seq;
+};
+
+static inline void init_ps_mdata(struct ps_mdata *mdata,
+                                struct hfi1_packet *packet)
+{
+       struct hfi1_ctxtdata *rcd = packet->rcd;
+
+       mdata->rcd = rcd;
+       mdata->rsize = packet->rsize;
+       mdata->maxcnt = packet->maxcnt;
+
+       if (rcd->ps_state.initialized == 0) {
+               mdata->ps_head = packet->rhqoff;
+               rcd->ps_state.initialized++;
+       } else
+               mdata->ps_head = rcd->ps_state.ps_head;
+
+       if (HFI1_CAP_IS_KSET(DMA_RTAIL)) {
+               mdata->ps_tail = packet->hdrqtail;
+               mdata->ps_seq = 0; /* not used with DMA_RTAIL */
+       } else {
+               mdata->ps_tail = 0; /* used only with DMA_RTAIL*/
+               mdata->ps_seq = rcd->seq_cnt;
+       }
+}
+
+static inline int ps_done(struct ps_mdata *mdata, u64 rhf)
+{
+       if (HFI1_CAP_IS_KSET(DMA_RTAIL))
+               return mdata->ps_head == mdata->ps_tail;
+       return mdata->ps_seq != rhf_rcv_seq(rhf);
+}
+
+static inline void update_ps_mdata(struct ps_mdata *mdata)
+{
+       struct hfi1_ctxtdata *rcd = mdata->rcd;
+
+       mdata->ps_head += mdata->rsize;
+       if (mdata->ps_head > mdata->maxcnt)
+               mdata->ps_head = 0;
+       rcd->ps_state.ps_head = mdata->ps_head;
+       if (!HFI1_CAP_IS_KSET(DMA_RTAIL)) {
+               if (++mdata->ps_seq > 13)
+                       mdata->ps_seq = 1;
+       }
+}
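+
+/*
+ * Note: without DMA_RTAIL the prescan walk relies on the RHF sequence
+ * number, which this driver expects to cycle through the values 1..13
+ * (see the wrap above and in handle_receive_interrupt()); ps_done() uses a
+ * sequence mismatch to detect the end of valid entries, while the DMA_RTAIL
+ * case compares the walked head against the DMA'ed tail instead.
+ */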
+
+/*
+ * prescan_rxq - search through the receive queue looking for packets
+ * containing Explicit Congestion Notifications (FECNs or BECNs).
+ * When an ECN is found, process the Congestion Notification, and toggle
+ * it off.
+ */
+static void prescan_rxq(struct hfi1_packet *packet)
+{
+       struct hfi1_ctxtdata *rcd = packet->rcd;
+       struct ps_mdata mdata;
+
+       if (!prescan_receive_queue)
+               return;
+
+       init_ps_mdata(&mdata, packet);
+
+       while (1) {
+               struct hfi1_devdata *dd = rcd->dd;
+               struct hfi1_ibport *ibp = &rcd->ppd->ibport_data;
+               __le32 *rhf_addr = (__le32 *) rcd->rcvhdrq + mdata.ps_head +
+                                        dd->rhf_offset;
+               struct hfi1_qp *qp;
+               struct hfi1_ib_header *hdr;
+               struct hfi1_other_headers *ohdr;
+               struct ib_grh *grh = NULL;
+               u64 rhf = rhf_to_cpu(rhf_addr);
+               u32 etype = rhf_rcv_type(rhf), qpn;
+               int is_ecn = 0;
+               u8 lnh;
+
+               if (ps_done(&mdata, rhf))
+                       break;
+
+               if (etype != RHF_RCV_TYPE_IB)
+                       goto next;
+
+               hdr = (struct hfi1_ib_header *)
+                       hfi1_get_msgheader(dd, rhf_addr);
+               lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+
+               if (lnh == HFI1_LRH_BTH)
+                       ohdr = &hdr->u.oth;
+               else if (lnh == HFI1_LRH_GRH) {
+                       ohdr = &hdr->u.l.oth;
+                       grh = &hdr->u.l.grh;
+               } else
+                       goto next; /* just in case */
+
+               is_ecn |= be32_to_cpu(ohdr->bth[1]) &
+                       (HFI1_FECN_MASK << HFI1_FECN_SHIFT);
+               is_ecn |= be32_to_cpu(ohdr->bth[1]) &
+                       (HFI1_BECN_MASK << HFI1_BECN_SHIFT);
+
+               if (!is_ecn)
+                       goto next;
+
+               qpn = be32_to_cpu(ohdr->bth[1]) & HFI1_QPN_MASK;
+               rcu_read_lock();
+               qp = hfi1_lookup_qpn(ibp, qpn);
+
+               if (qp == NULL) {
+                       rcu_read_unlock();
+                       goto next;
+               }
+
+               process_ecn(qp, hdr, ohdr, rhf, grh);
+               rcu_read_unlock();
+next:
+               update_ps_mdata(&mdata);
+       }
+}
+#endif /* CONFIG_PRESCAN_RXQ */
+
+#define RCV_PKT_OK 0x0
+#define RCV_PKT_MAX 0x1
+
+static inline int process_rcv_packet(struct hfi1_packet *packet)
+{
+       int ret = RCV_PKT_OK;
+
+       packet->hdr = hfi1_get_msgheader(packet->rcd->dd,
+                                        packet->rhf_addr);
+       packet->hlen = (u8 *)packet->rhf_addr - (u8 *)packet->hdr;
+       packet->etype = rhf_rcv_type(packet->rhf);
+       /* total length */
+       packet->tlen = rhf_pkt_len(packet->rhf); /* in bytes */
+       /* retrieve eager buffer details */
+       packet->ebuf = NULL;
+       if (rhf_use_egr_bfr(packet->rhf)) {
+               packet->etail = rhf_egr_index(packet->rhf);
+               packet->ebuf = get_egrbuf(packet->rcd, packet->rhf,
+                                &packet->updegr);
+               /*
+                * Prefetch the contents of the eager buffer.  It is
+                * OK to send a negative length to prefetch_range().
+                * The +2 is the size of the RHF.
+                */
+               prefetch_range(packet->ebuf,
+                       packet->tlen - ((packet->rcd->rcvhdrqentsize -
+                                 (rhf_hdrq_offset(packet->rhf)+2)) * 4));
+       }
+
+       /*
+        * Call a type specific handler for the packet. We
+        * should be able to trust that etype won't be beyond
+        * the range of valid indexes. If it is, something is really
+        * wrong and we can probably just let things come
+        * crashing down. There is no need to eat another
+        * comparison in this performance critical code.
+        */
+       packet->rcd->dd->rhf_rcv_function_map[packet->etype](packet);
+       packet->numpkt++;
+
+       /* Set up for the next packet */
+       packet->rhqoff += packet->rsize;
+       if (packet->rhqoff >= packet->maxcnt)
+               packet->rhqoff = 0;
+
+       if (packet->numpkt == MAX_PKT_RECV) {
+               ret = RCV_PKT_MAX;
+               this_cpu_inc(*packet->rcd->dd->rcv_limit);
+       }
+
+       packet->rhf_addr = (__le32 *) packet->rcd->rcvhdrq + packet->rhqoff +
+                                     packet->rcd->dd->rhf_offset;
+       packet->rhf = rhf_to_cpu(packet->rhf_addr);
+
+       return ret;
+}
+
+static inline void process_rcv_update(int last, struct hfi1_packet *packet)
+{
+       /*
+        * Update head regs etc. every 16 packets (if not the last packet)
+        * to help prevent rcvhdrq overflows when many packets are
+        * processed and the queue is nearly full.
+        * Don't request an interrupt for intermediate updates.
+        */
+       if (!last && !(packet->numpkt & 0xf)) {
+               update_usrhead(packet->rcd, packet->rhqoff, packet->updegr,
+                              packet->etail, 0, 0);
+               packet->updegr = 0;
+       }
+       packet->rcv_flags = 0;
+}
+
+static inline void finish_packet(struct hfi1_packet *packet)
+{
+
+       /*
+        * Nothing we need to free for the packet.
+        *
+        * The only thing we need to do is a final update and call for an
+        * interrupt
+        */
+       update_usrhead(packet->rcd, packet->rcd->head, packet->updegr,
+                      packet->etail, rcv_intr_dynamic, packet->numpkt);
+
+}
+
+static inline void process_rcv_qp_work(struct hfi1_packet *packet)
+{
+
+       struct hfi1_ctxtdata *rcd;
+       struct hfi1_qp *qp, *nqp;
+
+       rcd = packet->rcd;
+       rcd->head = packet->rhqoff;
+
+       /*
+        * Iterate over all QPs waiting to respond.
+        * The list won't change since the IRQ is only run on one CPU.
+        */
+       list_for_each_entry_safe(qp, nqp, &rcd->qp_wait_list, rspwait) {
+               list_del_init(&qp->rspwait);
+               if (qp->r_flags & HFI1_R_RSP_NAK) {
+                       qp->r_flags &= ~HFI1_R_RSP_NAK;
+                       hfi1_send_rc_ack(rcd, qp, 0);
+               }
+               if (qp->r_flags & HFI1_R_RSP_SEND) {
+                       unsigned long flags;
+
+                       qp->r_flags &= ~HFI1_R_RSP_SEND;
+                       spin_lock_irqsave(&qp->s_lock, flags);
+                       if (ib_hfi1_state_ops[qp->state] &
+                                       HFI1_PROCESS_OR_FLUSH_SEND)
+                               hfi1_schedule_send(qp);
+                       spin_unlock_irqrestore(&qp->s_lock, flags);
+               }
+               if (atomic_dec_and_test(&qp->refcount))
+                       wake_up(&qp->wait);
+       }
+}
+
+/*
+ * Handle receive interrupts when using the no dma rtail option.
+ */
+void handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *rcd)
+{
+       u32 seq;
+       int last = 0;
+       struct hfi1_packet packet;
+
+       init_packet(rcd, &packet);
+       seq = rhf_rcv_seq(packet.rhf);
+       if (seq != rcd->seq_cnt)
+               goto bail;
+
+       prescan_rxq(&packet);
+
+       while (!last) {
+               last = process_rcv_packet(&packet);
+               seq = rhf_rcv_seq(packet.rhf);
+               if (++rcd->seq_cnt > 13)
+                       rcd->seq_cnt = 1;
+               if (seq != rcd->seq_cnt)
+                       last = 1;
+               process_rcv_update(last, &packet);
+       }
+       process_rcv_qp_work(&packet);
+bail:
+       finish_packet(&packet);
+}
+
+void handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *rcd)
+{
+       u32 hdrqtail;
+       int last = 0;
+       struct hfi1_packet packet;
+
+       init_packet(rcd, &packet);
+       hdrqtail = get_rcvhdrtail(rcd);
+       if (packet.rhqoff == hdrqtail)
+               goto bail;
+       smp_rmb();  /* prevent speculative reads of dma'ed hdrq */
+
+       prescan_rxq(&packet);
+
+       while (!last) {
+               last = process_rcv_packet(&packet);
+               if (packet.rhqoff == hdrqtail)
+                       last = 1;
+               process_rcv_update(last, &packet);
+       }
+       process_rcv_qp_work(&packet);
+bail:
+       finish_packet(&packet);
+
+}
+
+static inline void set_all_nodma_rtail(struct hfi1_devdata *dd)
+{
+       int i;
+
+       for (i = 0; i < dd->first_user_ctxt; i++)
+               dd->rcd[i]->do_interrupt =
+                       &handle_receive_interrupt_nodma_rtail;
+}
+
+static inline void set_all_dma_rtail(struct hfi1_devdata *dd)
+{
+       int i;
+
+       for (i = 0; i < dd->first_user_ctxt; i++)
+               dd->rcd[i]->do_interrupt =
+                       &handle_receive_interrupt_dma_rtail;
+}
+
+/*
+ * handle_receive_interrupt - receive a packet
+ * @rcd: the context
+ *
+ * Called from interrupt handler for errors or receive interrupt.
+ * This is the slow path interrupt handler.
+ */
+void handle_receive_interrupt(struct hfi1_ctxtdata *rcd)
+{
+
+       struct hfi1_devdata *dd = rcd->dd;
+       u32 hdrqtail;
+       int last = 0, needset = 1;
+       struct hfi1_packet packet;
+
+       init_packet(rcd, &packet);
+
+       if (!HFI1_CAP_IS_KSET(DMA_RTAIL)) {
+               u32 seq = rhf_rcv_seq(packet.rhf);
+
+               if (seq != rcd->seq_cnt)
+                       goto bail;
+               hdrqtail = 0;
+       } else {
+               hdrqtail = get_rcvhdrtail(rcd);
+               if (packet.rhqoff == hdrqtail)
+                       goto bail;
+               smp_rmb();  /* prevent speculative reads of dma'ed hdrq */
+       }
+
+       prescan_rxq(&packet);
+
+       while (!last) {
+
+               if (unlikely(dd->do_drop && atomic_xchg(&dd->drop_packet,
+                       DROP_PACKET_OFF) == DROP_PACKET_ON)) {
+                       dd->do_drop = 0;
+
+                       /* On to the next packet */
+                       packet.rhqoff += packet.rsize;
+                       packet.rhf_addr = (__le32 *) rcd->rcvhdrq +
+                                         packet.rhqoff +
+                                         dd->rhf_offset;
+                       packet.rhf = rhf_to_cpu(packet.rhf_addr);
+
+               } else {
+                       last = process_rcv_packet(&packet);
+               }
+
+               if (!HFI1_CAP_IS_KSET(DMA_RTAIL)) {
+                       u32 seq = rhf_rcv_seq(packet.rhf);
+
+                       if (++rcd->seq_cnt > 13)
+                               rcd->seq_cnt = 1;
+                       if (seq != rcd->seq_cnt)
+                               last = 1;
+                       if (needset) {
+                               dd_dev_info(dd,
+                                       "Switching to NO_DMA_RTAIL\n");
+                               set_all_nodma_rtail(dd);
+                               needset = 0;
+                       }
+               } else {
+                       if (packet.rhqoff == hdrqtail)
+                               last = 1;
+                       if (needset) {
+                               dd_dev_info(dd,
+                                           "Switching to DMA_RTAIL\n");
+                               set_all_dma_rtail(dd);
+                               needset = 0;
+                       }
+               }
+
+               process_rcv_update(last, &packet);
+       }
+
+       process_rcv_qp_work(&packet);
+
+bail:
+       /*
+        * Always write head at end, and setup rcv interrupt, even
+        * if no packets were processed.
+        */
+       finish_packet(&packet);
+}
+
+/*
+ * Convert a given MTU size to the on-wire MAD packet enumeration.
+ * Return 'default_if_bad' if the size is invalid.
+ */
+int mtu_to_enum(u32 mtu, int default_if_bad)
+{
+       switch (mtu) {
+       case     0: return OPA_MTU_0;
+       case   256: return OPA_MTU_256;
+       case   512: return OPA_MTU_512;
+       case  1024: return OPA_MTU_1024;
+       case  2048: return OPA_MTU_2048;
+       case  4096: return OPA_MTU_4096;
+       case  8192: return OPA_MTU_8192;
+       case 10240: return OPA_MTU_10240;
+       }
+       return default_if_bad;
+}
+
+u16 enum_to_mtu(int mtu)
+{
+       switch (mtu) {
+       case OPA_MTU_0:     return 0;
+       case OPA_MTU_256:   return 256;
+       case OPA_MTU_512:   return 512;
+       case OPA_MTU_1024:  return 1024;
+       case OPA_MTU_2048:  return 2048;
+       case OPA_MTU_4096:  return 4096;
+       case OPA_MTU_8192:  return 8192;
+       case OPA_MTU_10240: return 10240;
+       default: return 0xffff;
+       }
+}
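+
+/*
+ * For example, mtu_to_enum(4096, OPA_MTU_2048) returns OPA_MTU_4096, an
+ * unsupported size such as 3000 falls back to the supplied default, and
+ * enum_to_mtu(OPA_MTU_4096) maps back to 4096.
+ */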
+
+/*
+ * set_mtu - set the MTU
+ * @ppd: the per port data
+ *
+ * We can handle "any" incoming size; the issue here is whether we
+ * need to restrict our outgoing size.  We do not deal with what happens
+ * to programs that are already running when the size changes.
+ */
+int set_mtu(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       int i, drain, ret = 0, is_up = 0;
+
+       ppd->ibmtu = 0;
+       for (i = 0; i < ppd->vls_supported; i++)
+               if (ppd->ibmtu < dd->vld[i].mtu)
+                       ppd->ibmtu = dd->vld[i].mtu;
+       ppd->ibmaxlen = ppd->ibmtu + lrh_max_header_bytes(ppd->dd);
+
+       mutex_lock(&ppd->hls_lock);
+       if (ppd->host_link_state == HLS_UP_INIT
+                       || ppd->host_link_state == HLS_UP_ARMED
+                       || ppd->host_link_state == HLS_UP_ACTIVE)
+               is_up = 1;
+
+       drain = !is_ax(dd) && is_up;
+
+       if (drain)
+               /*
+                * MTU is specified per-VL. To ensure that no packet gets
+                * stuck (due, e.g., to the MTU for the packet's VL being
+                * reduced), empty the per-VL FIFOs before adjusting MTU.
+                */
+               ret = stop_drain_data_vls(dd);
+
+       if (ret) {
+               dd_dev_err(dd, "%s: cannot stop/drain VLs - refusing to change per-VL MTUs\n",
+                          __func__);
+               goto err;
+       }
+
+       hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_MTU, 0);
+
+       if (drain)
+               open_fill_data_vls(dd); /* reopen all VLs */
+
+err:
+       mutex_unlock(&ppd->hls_lock);
+
+       return ret;
+}
+
+int hfi1_set_lid(struct hfi1_pportdata *ppd, u32 lid, u8 lmc)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+
+       ppd->lid = lid;
+       ppd->lmc = lmc;
+       hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_LIDLMC, 0);
+
+       dd_dev_info(dd, "IB%u:%u got a lid: 0x%x\n", dd->unit, ppd->port, lid);
+
+       return 0;
+}
+
+/*
+ * The following functions deal with the "obviously simple" task of overriding
+ * the state of the LEDs, which normally indicate link physical and logical
+ * status.  The complications arise in dealing with different hardware mappings
+ * and the board-dependent routine being called from interrupts.
+ * And then there's the requirement to _flash_ them.
+ */
+#define LED_OVER_FREQ_SHIFT 8
+#define LED_OVER_FREQ_MASK (0xFF<<LED_OVER_FREQ_SHIFT)
+/* Below is "non-zero" to force override, but both actual LEDs are off */
+#define LED_OVER_BOTH_OFF (8)
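+
+/*
+ * Illustration of the override encoding consumed by hfi1_set_led_override()
+ * below: val = 0x0234 asks for frequency 2 with phase values 4 and 3 (one
+ * nybble per blink phase), giving (HZ << 4) / 2 = 8 * HZ jiffies per phase;
+ * val = 0 leaves both phases off, so run_led_override() stops re-arming
+ * the timer.
+ */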
+
+static void run_led_override(unsigned long opaque)
+{
+       struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)opaque;
+       struct hfi1_devdata *dd = ppd->dd;
+       int timeoff;
+       int ph_idx;
+
+       if (!(dd->flags & HFI1_INITTED))
+               return;
+
+       ph_idx = ppd->led_override_phase++ & 1;
+       ppd->led_override = ppd->led_override_vals[ph_idx];
+       timeoff = ppd->led_override_timeoff;
+
+       /*
+        * don't re-fire the timer if user asked for it to be off; we let
+        * it fire one more time after they turn it off to simplify
+        */
+       if (ppd->led_override_vals[0] || ppd->led_override_vals[1])
+               mod_timer(&ppd->led_override_timer, jiffies + timeoff);
+}
+
+void hfi1_set_led_override(struct hfi1_pportdata *ppd, unsigned int val)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       int timeoff, freq;
+
+       if (!(dd->flags & HFI1_INITTED))
+               return;
+
+       /* First check if we are blinking. If not, use 1HZ polling */
+       timeoff = HZ;
+       freq = (val & LED_OVER_FREQ_MASK) >> LED_OVER_FREQ_SHIFT;
+
+       if (freq) {
+               /* For blink, set each phase from one nybble of val */
+               ppd->led_override_vals[0] = val & 0xF;
+               ppd->led_override_vals[1] = (val >> 4) & 0xF;
+               timeoff = (HZ << 4)/freq;
+       } else {
+               /* Non-blink set both phases the same. */
+               ppd->led_override_vals[0] = val & 0xF;
+               ppd->led_override_vals[1] = val & 0xF;
+       }
+       ppd->led_override_timeoff = timeoff;
+
+       /*
+        * If the timer has not already been started, do so. Use a "quick"
+        * timeout so the function will be called soon, to look at our request.
+        */
+       if (atomic_inc_return(&ppd->led_override_timer_active) == 1) {
+               /* Need to start timer */
+               init_timer(&ppd->led_override_timer);
+               ppd->led_override_timer.function = run_led_override;
+               ppd->led_override_timer.data = (unsigned long) ppd;
+               ppd->led_override_timer.expires = jiffies + 1;
+               add_timer(&ppd->led_override_timer);
+       } else {
+               if (ppd->led_override_vals[0] || ppd->led_override_vals[1])
+                       mod_timer(&ppd->led_override_timer, jiffies + 1);
+               atomic_dec(&ppd->led_override_timer_active);
+       }
+}
+
+/**
+ * hfi1_reset_device - reset the chip if possible
+ * @unit: the device to reset
+ *
+ * Whether or not reset is successful, we attempt to re-initialize the chip
+ * (that is, much like a driver unload/reload).  We clear the INITTED flag
+ * so that the various entry points will fail until we reinitialize.  For
+ * now, we only allow this if no user contexts are open that use chip resources.
+ */
+int hfi1_reset_device(int unit)
+{
+       int ret, i;
+       struct hfi1_devdata *dd = hfi1_lookup(unit);
+       struct hfi1_pportdata *ppd;
+       unsigned long flags;
+       int pidx;
+
+       if (!dd) {
+               ret = -ENODEV;
+               goto bail;
+       }
+
+       dd_dev_info(dd, "Reset on unit %u requested\n", unit);
+
+       if (!dd->kregbase || !(dd->flags & HFI1_PRESENT)) {
+               dd_dev_info(dd,
+                       "Invalid unit number %u or not initialized or not present\n",
+                       unit);
+               ret = -ENXIO;
+               goto bail;
+       }
+
+       spin_lock_irqsave(&dd->uctxt_lock, flags);
+       if (dd->rcd)
+               for (i = dd->first_user_ctxt; i < dd->num_rcv_contexts; i++) {
+                       if (!dd->rcd[i] || !dd->rcd[i]->cnt)
+                               continue;
+                       spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+                       ret = -EBUSY;
+                       goto bail;
+               }
+       spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+
+       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+               ppd = dd->pport + pidx;
+               if (atomic_read(&ppd->led_override_timer_active)) {
+                       /* Need to stop LED timer, _then_ shut off LEDs */
+                       del_timer_sync(&ppd->led_override_timer);
+                       atomic_set(&ppd->led_override_timer_active, 0);
+               }
+
+               /* Shut off LEDs after we are sure timer is not running */
+               ppd->led_override = LED_OVER_BOTH_OFF;
+       }
+       if (dd->flags & HFI1_HAS_SEND_DMA)
+               sdma_exit(dd);
+
+       hfi1_reset_cpu_counters(dd);
+
+       ret = hfi1_init(dd, 1);
+
+       if (ret)
+               dd_dev_err(dd,
+                       "Reinitialize unit %u after reset failed with %d\n",
+                       unit, ret);
+       else
+               dd_dev_info(dd, "Reinitialized unit %u after resetting\n",
+                       unit);
+
+bail:
+       return ret;
+}
+
+void handle_eflags(struct hfi1_packet *packet)
+{
+       struct hfi1_ctxtdata *rcd = packet->rcd;
+       u32 rte = rhf_rcv_type_err(packet->rhf);
+
+       dd_dev_err(rcd->dd,
+               "receive context %d: rhf 0x%016llx, errs [ %s%s%s%s%s%s%s%s] rte 0x%x\n",
+               rcd->ctxt, packet->rhf,
+               packet->rhf & RHF_K_HDR_LEN_ERR ? "k_hdr_len " : "",
+               packet->rhf & RHF_DC_UNC_ERR ? "dc_unc " : "",
+               packet->rhf & RHF_DC_ERR ? "dc " : "",
+               packet->rhf & RHF_TID_ERR ? "tid " : "",
+               packet->rhf & RHF_LEN_ERR ? "len " : "",
+               packet->rhf & RHF_ECC_ERR ? "ecc " : "",
+               packet->rhf & RHF_VCRC_ERR ? "vcrc " : "",
+               packet->rhf & RHF_ICRC_ERR ? "icrc " : "",
+               rte);
+
+       rcv_hdrerr(rcd, rcd->ppd, packet);
+}
+
+/*
+ * The following functions are called by the interrupt handler. They are type
+ * specific handlers for each packet type.
+ */
+int process_receive_ib(struct hfi1_packet *packet)
+{
+       trace_hfi1_rcvhdr(packet->rcd->ppd->dd,
+                         packet->rcd->ctxt,
+                         rhf_err_flags(packet->rhf),
+                         RHF_RCV_TYPE_IB,
+                         packet->hlen,
+                         packet->tlen,
+                         packet->updegr,
+                         rhf_egr_index(packet->rhf));
+
+       if (unlikely(rhf_err_flags(packet->rhf))) {
+               handle_eflags(packet);
+               return RHF_RCV_CONTINUE;
+       }
+
+       hfi1_ib_rcv(packet);
+       return RHF_RCV_CONTINUE;
+}
+
+int process_receive_bypass(struct hfi1_packet *packet)
+{
+       if (unlikely(rhf_err_flags(packet->rhf)))
+               handle_eflags(packet);
+
+       dd_dev_err(packet->rcd->dd,
+          "Bypass packets are not supported in normal operation. Dropping\n");
+       return RHF_RCV_CONTINUE;
+}
+
+int process_receive_error(struct hfi1_packet *packet)
+{
+       handle_eflags(packet);
+
+       if (unlikely(rhf_err_flags(packet->rhf)))
+               dd_dev_err(packet->rcd->dd,
+                          "Unhandled error packet received. Dropping.\n");
+
+       return RHF_RCV_CONTINUE;
+}
+
+int kdeth_process_expected(struct hfi1_packet *packet)
+{
+       if (unlikely(rhf_err_flags(packet->rhf)))
+               handle_eflags(packet);
+
+       dd_dev_err(packet->rcd->dd,
+                  "Unhandled expected packet received. Dropping.\n");
+       return RHF_RCV_CONTINUE;
+}
+
+int kdeth_process_eager(struct hfi1_packet *packet)
+{
+       if (unlikely(rhf_err_flags(packet->rhf)))
+               handle_eflags(packet);
+
+       dd_dev_err(packet->rcd->dd,
+                  "Unhandled eager packet received. Dropping.\n");
+       return RHF_RCV_CONTINUE;
+}
+
+int process_receive_invalid(struct hfi1_packet *packet)
+{
+       dd_dev_err(packet->rcd->dd, "Invalid packet type %d. Dropping\n",
+               rhf_rcv_type(packet->rhf));
+       return RHF_RCV_CONTINUE;
+}
diff --git a/drivers/staging/rdma/hfi1/eprom.c b/drivers/staging/rdma/hfi1/eprom.c
new file mode 100644 (file)
index 0000000..b61d3ae
--- /dev/null
@@ -0,0 +1,475 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/delay.h>
+#include "hfi.h"
+#include "common.h"
+#include "eprom.h"
+
+/*
+ * The EPROM is logically divided into two partitions:
+ *     partition 0: the first 128K, visible from PCI ROM BAR
+ *     partition 1: the rest
+ */
+#define P0_SIZE (128 * 1024)
+#define P1_START P0_SIZE
+
+/* largest erase size supported by the controller */
+#define SIZE_32KB (32 * 1024)
+#define MASK_32KB (SIZE_32KB - 1)
+
+/* controller page size, in bytes */
+#define EP_PAGE_SIZE 256
+#define EEP_PAGE_MASK (EP_PAGE_SIZE - 1)
+
+/* controller commands */
+#define CMD_SHIFT 24
+#define CMD_NOP                            (0)
+#define CMD_PAGE_PROGRAM(addr)     ((0x02 << CMD_SHIFT) | addr)
+#define CMD_READ_DATA(addr)        ((0x03 << CMD_SHIFT) | addr)
+#define CMD_READ_SR1               ((0x05 << CMD_SHIFT))
+#define CMD_WRITE_ENABLE           ((0x06 << CMD_SHIFT))
+#define CMD_SECTOR_ERASE_32KB(addr) ((0x52 << CMD_SHIFT) | addr)
+#define CMD_CHIP_ERASE             ((0x60 << CMD_SHIFT))
+#define CMD_READ_MANUF_DEV_ID      ((0x90 << CMD_SHIFT))
+#define CMD_RELEASE_POWERDOWN_NOID  ((0xab << CMD_SHIFT))
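+
+/*
+ * Each command word carries the opcode in bits 31:24 and, where applicable,
+ * a 24-bit flash address in bits 23:0; e.g. CMD_READ_DATA(0x20000) expands
+ * to 0x03020000.
+ */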
+
+/* controller interface speeds */
+#define EP_SPEED_FULL 0x2      /* full speed */
+
+/* controller status register 1 bits */
+#define SR1_BUSY 0x1ull                /* the BUSY bit in SR1 */
+
+/* sleep length while waiting for controller */
+#define WAIT_SLEEP_US 100      /* must be larger than 5 (see usage) */
+#define COUNT_DELAY_SEC(n) ((n) * (1000000/WAIT_SLEEP_US))
+
+/* GPIO pins */
+#define EPROM_WP_N (1ull << 14)        /* EPROM write line */
+
+/*
+ * Use the EP mutex to guard against other callers from within the driver.
+ * Also covers usage of eprom_available.
+ */
+static DEFINE_MUTEX(eprom_mutex);
+static int eprom_available;    /* default: not available */
+
+/*
+ * Turn on external enable line that allows writing on the flash.
+ */
+static void write_enable(struct hfi1_devdata *dd)
+{
+       /* raise signal */
+       write_csr(dd, ASIC_GPIO_OUT,
+               read_csr(dd, ASIC_GPIO_OUT) | EPROM_WP_N);
+       /* raise enable */
+       write_csr(dd, ASIC_GPIO_OE,
+               read_csr(dd, ASIC_GPIO_OE) | EPROM_WP_N);
+}
+
+/*
+ * Turn off external enable line that allows writing on the flash.
+ */
+static void write_disable(struct hfi1_devdata *dd)
+{
+       /* lower signal */
+       write_csr(dd, ASIC_GPIO_OUT,
+               read_csr(dd, ASIC_GPIO_OUT) & ~EPROM_WP_N);
+       /* lower enable */
+       write_csr(dd, ASIC_GPIO_OE,
+               read_csr(dd, ASIC_GPIO_OE) & ~EPROM_WP_N);
+}
+
+/*
+ * Wait for the device to become not busy.  Must be called after all
+ * write or erase operations.
+ */
+static int wait_for_not_busy(struct hfi1_devdata *dd)
+{
+       unsigned long count = 0;
+       u64 reg;
+       int ret = 0;
+
+       /* starts page mode */
+       write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_READ_SR1);
+       while (1) {
+               usleep_range(WAIT_SLEEP_US - 5, WAIT_SLEEP_US + 5);
+               count++;
+               reg = read_csr(dd, ASIC_EEP_DATA);
+               if ((reg & SR1_BUSY) == 0)
+                       break;
+               /* 200s is the largest time for a 128Mb device */
+               if (count > COUNT_DELAY_SEC(200)) {
+                       dd_dev_err(dd, "waited too long for SPI FLASH busy to clear - failing\n");
+                       ret = -ETIMEDOUT;
+                       break; /* break, not goto - must stop page mode */
+               }
+       }
+
+       /* stop page mode with a NOP */
+       write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_NOP);
+
+       return ret;
+}
+
+/*
+ * Read the device ID from the SPI controller.
+ */
+static u32 read_device_id(struct hfi1_devdata *dd)
+{
+       /* read the Manufacturer/Device ID */
+       write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_READ_MANUF_DEV_ID);
+       return (u32)read_csr(dd, ASIC_EEP_DATA);
+}
+
+/*
+ * Erase the whole flash.
+ */
+static int erase_chip(struct hfi1_devdata *dd)
+{
+       int ret;
+
+       write_enable(dd);
+
+       write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_WRITE_ENABLE);
+       write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_CHIP_ERASE);
+       ret = wait_for_not_busy(dd);
+
+       write_disable(dd);
+
+       return ret;
+}
+
+/*
+ * Erase a range using the 32KB erase command.
+ */
+static int erase_32kb_range(struct hfi1_devdata *dd, u32 start, u32 end)
+{
+       int ret = 0;
+
+       if (end < start)
+               return -EINVAL;
+
+       if ((start & MASK_32KB) || (end & MASK_32KB)) {
+               dd_dev_err(dd,
+                       "%s: non-aligned range (0x%x,0x%x) for a 32KB erase\n",
+                       __func__, start, end);
+               return -EINVAL;
+       }
+
+       write_enable(dd);
+
+       for (; start < end; start += SIZE_32KB) {
+               write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_WRITE_ENABLE);
+               write_csr(dd, ASIC_EEP_ADDR_CMD,
+                                               CMD_SECTOR_ERASE_32KB(start));
+               ret = wait_for_not_busy(dd);
+               if (ret)
+                       goto done;
+       }
+
+done:
+       write_disable(dd);
+
+       return ret;
+}
+
+/*
+ * Read a 256 byte (64 dword) EPROM page.
+ * All callers have verified the offset is at a page boundary.
+ */
+static void read_page(struct hfi1_devdata *dd, u32 offset, u32 *result)
+{
+       int i;
+
+       write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_READ_DATA(offset));
+       for (i = 0; i < EP_PAGE_SIZE/sizeof(u32); i++)
+               result[i] = (u32)read_csr(dd, ASIC_EEP_DATA);
+       write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_NOP); /* close open page */
+}
+
+/*
+ * Read length bytes starting at offset.  Copy to user address addr.
+ */
+static int read_length(struct hfi1_devdata *dd, u32 start, u32 len, u64 addr)
+{
+       u32 offset;
+       u32 buffer[EP_PAGE_SIZE/sizeof(u32)];
+       int ret = 0;
+
+       /* reject anything not on an EPROM page boundary */
+       if ((start & EEP_PAGE_MASK) || (len & EEP_PAGE_MASK))
+               return -EINVAL;
+
+       for (offset = 0; offset < len; offset += EP_PAGE_SIZE) {
+               read_page(dd, start + offset, buffer);
+               if (copy_to_user((void __user *)(addr + offset),
+                                               buffer, EP_PAGE_SIZE)) {
+                       ret = -EFAULT;
+                       goto done;
+               }
+       }
+
+done:
+       return ret;
+}
+
+/*
+ * Write a 256 byte (64 dword) EPROM page.
+ * All callers have verified the offset is at a page boundary.
+ */
+static int write_page(struct hfi1_devdata *dd, u32 offset, u32 *data)
+{
+       int i;
+
+       write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_WRITE_ENABLE);
+       write_csr(dd, ASIC_EEP_DATA, data[0]);
+       write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_PAGE_PROGRAM(offset));
+       for (i = 1; i < EP_PAGE_SIZE/sizeof(u32); i++)
+               write_csr(dd, ASIC_EEP_DATA, data[i]);
+       /* will close the open page */
+       return wait_for_not_busy(dd);
+}
+
+/*
+ * Write length bytes starting at offset.  Read from user address addr.
+ */
+static int write_length(struct hfi1_devdata *dd, u32 start, u32 len, u64 addr)
+{
+       u32 offset;
+       u32 buffer[EP_PAGE_SIZE/sizeof(u32)];
+       int ret = 0;
+
+       /* reject anything not on an EPROM page boundary */
+       if ((start & EEP_PAGE_MASK) || (len & EEP_PAGE_MASK))
+               return -EINVAL;
+
+       write_enable(dd);
+
+       for (offset = 0; offset < len; offset += EP_PAGE_SIZE) {
+               if (copy_from_user(buffer, (void __user *)(addr + offset),
+                                               EP_PAGE_SIZE)) {
+                       ret = -EFAULT;
+                       goto done;
+               }
+               ret = write_page(dd, start + offset, buffer);
+               if (ret)
+                       goto done;
+       }
+
+done:
+       write_disable(dd);
+       return ret;
+}
+
+/*
+ * Perform the given operation on the EPROM.  Called from user space.  The
+ * user credentials have already been checked.
+ *
+ * Return 0 on success, -ERRNO on error
+ */
+int handle_eprom_command(const struct hfi1_cmd *cmd)
+{
+       struct hfi1_devdata *dd;
+       u32 dev_id;
+       int ret = 0;
+
+       /*
+        * The EPROM is per-device, so use unit 0 as that will always
+        * exist.
+        */
+       dd = hfi1_lookup(0);
+       if (!dd) {
+               pr_err("%s: cannot find unit 0!\n", __func__);
+               return -EINVAL;
+       }
+
+       /* lock against other callers touching the ASIC block */
+       mutex_lock(&eprom_mutex);
+
+       /* some platforms do not have an EPROM */
+       if (!eprom_available) {
+               ret = -ENOSYS;
+               goto done_asic;
+       }
+
+       /* lock against the other HFI on another OS */
+       ret = acquire_hw_mutex(dd);
+       if (ret) {
+               dd_dev_err(dd,
+                       "%s: unable to acquire hw mutex, no EPROM support\n",
+                       __func__);
+               goto done_asic;
+       }
+
+       dd_dev_info(dd, "%s: cmd: type %d, len 0x%x, addr 0x%016llx\n",
+               __func__, cmd->type, cmd->len, cmd->addr);
+
+       switch (cmd->type) {
+       case HFI1_CMD_EP_INFO:
+               if (cmd->len != sizeof(u32)) {
+                       ret = -ERANGE;
+                       break;
+               }
+               dev_id = read_device_id(dd);
+               /* addr points to a u32 user buffer */
+               if (copy_to_user((void __user *)cmd->addr, &dev_id,
+                                                               sizeof(u32)))
+                       ret = -EFAULT;
+               break;
+       case HFI1_CMD_EP_ERASE_CHIP:
+               ret = erase_chip(dd);
+               break;
+       case HFI1_CMD_EP_ERASE_P0:
+               if (cmd->len != P0_SIZE) {
+                       ret = -ERANGE;
+                       break;
+               }
+               ret = erase_32kb_range(dd, 0, cmd->len);
+               break;
+       case HFI1_CMD_EP_ERASE_P1:
+               /* check for overflow */
+               if (P1_START + cmd->len > ASIC_EEP_ADDR_CMD_EP_ADDR_MASK) {
+                       ret = -ERANGE;
+                       break;
+               }
+               ret = erase_32kb_range(dd, P1_START, P1_START + cmd->len);
+               break;
+       case HFI1_CMD_EP_READ_P0:
+               if (cmd->len != P0_SIZE) {
+                       ret = -ERANGE;
+                       break;
+               }
+               ret = read_length(dd, 0, cmd->len, cmd->addr);
+               break;
+       case HFI1_CMD_EP_READ_P1:
+               /* check for overflow */
+               if (P1_START + cmd->len > ASIC_EEP_ADDR_CMD_EP_ADDR_MASK) {
+                       ret = -ERANGE;
+                       break;
+               }
+               ret = read_length(dd, P1_START, cmd->len, cmd->addr);
+               break;
+       case HFI1_CMD_EP_WRITE_P0:
+               if (cmd->len > P0_SIZE) {
+                       ret = -ERANGE;
+                       break;
+               }
+               ret = write_length(dd, 0, cmd->len, cmd->addr);
+               break;
+       case HFI1_CMD_EP_WRITE_P1:
+               /* check for overflow */
+               if (P1_START + cmd->len > ASIC_EEP_ADDR_CMD_EP_ADDR_MASK) {
+                       ret = -ERANGE;
+                       break;
+               }
+               ret = write_length(dd, P1_START, cmd->len, cmd->addr);
+               break;
+       default:
+               dd_dev_err(dd, "%s: unexpected command %d\n",
+                       __func__, cmd->type);
+               ret = -EINVAL;
+               break;
+       }
+
+       release_hw_mutex(dd);
+done_asic:
+       mutex_unlock(&eprom_mutex);
+       return ret;
+}
+
+/*
+ * Initialize the EPROM handler.
+ */
+int eprom_init(struct hfi1_devdata *dd)
+{
+       int ret = 0;
+
+       /* only the discrete chip has an EPROM, nothing to do */
+       if (dd->pcidev->device != PCI_DEVICE_ID_INTEL0)
+               return 0;
+
+       /* lock against other callers */
+       mutex_lock(&eprom_mutex);
+       if (eprom_available)    /* already initialized */
+               goto done_asic;
+
+       /*
+        * Lock against the other HFI on another OS - the mutex above
+        * would have caught anything in this driver.  It is OK if
+        * both OSes reset the EPROM - as long as they don't do it at
+        * the same time.
+        */
+       ret = acquire_hw_mutex(dd);
+       if (ret) {
+               dd_dev_err(dd,
+                       "%s: unable to acquire hw mutex, no EPROM support\n",
+                       __func__);
+               goto done_asic;
+       }
+
+       /* reset EPROM to be sure it is in a good state */
+
+       /* set reset */
+       write_csr(dd, ASIC_EEP_CTL_STAT,
+                                       ASIC_EEP_CTL_STAT_EP_RESET_SMASK);
+       /* clear reset, set speed */
+       write_csr(dd, ASIC_EEP_CTL_STAT,
+                       EP_SPEED_FULL << ASIC_EEP_CTL_STAT_RATE_SPI_SHIFT);
+
+       /* wake the device with command "release powerdown NoID" */
+       write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_RELEASE_POWERDOWN_NOID);
+
+       eprom_available = 1;
+       release_hw_mutex(dd);
+done_asic:
+       mutex_unlock(&eprom_mutex);
+       return ret;
+}
diff --git a/drivers/staging/rdma/hfi1/eprom.h b/drivers/staging/rdma/hfi1/eprom.h
new file mode 100644 (file)
index 0000000..64a6427
--- /dev/null
@@ -0,0 +1,55 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+struct hfi1_cmd;
+struct hfi1_devdata;
+
+int eprom_init(struct hfi1_devdata *dd);
+int handle_eprom_command(const struct hfi1_cmd *cmd);
diff --git a/drivers/staging/rdma/hfi1/file_ops.c b/drivers/staging/rdma/hfi1/file_ops.c
new file mode 100644 (file)
index 0000000..4698617
--- /dev/null
@@ -0,0 +1,2140 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/pci.h>
+#include <linux/poll.h>
+#include <linux/cdev.h>
+#include <linux/swap.h>
+#include <linux/vmalloc.h>
+#include <linux/highmem.h>
+#include <linux/io.h>
+#include <linux/jiffies.h>
+#include <asm/pgtable.h>
+#include <linux/delay.h>
+#include <linux/export.h>
+#include <linux/module.h>
+#include <linux/cred.h>
+#include <linux/uio.h>
+
+#include "hfi.h"
+#include "pio.h"
+#include "device.h"
+#include "common.h"
+#include "trace.h"
+#include "user_sdma.h"
+#include "eprom.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) DRIVER_NAME ": " fmt
+
+#define SEND_CTXT_HALT_TIMEOUT 1000 /* msecs */
+
+/*
+ * File operation functions
+ */
+static int hfi1_file_open(struct inode *, struct file *);
+static int hfi1_file_close(struct inode *, struct file *);
+static ssize_t hfi1_file_write(struct file *, const char __user *,
+                              size_t, loff_t *);
+static ssize_t hfi1_write_iter(struct kiocb *, struct iov_iter *);
+static unsigned int hfi1_poll(struct file *, struct poll_table_struct *);
+static int hfi1_file_mmap(struct file *, struct vm_area_struct *);
+
+static u64 kvirt_to_phys(void *);
+static int assign_ctxt(struct file *, struct hfi1_user_info *);
+static int init_subctxts(struct hfi1_ctxtdata *, const struct hfi1_user_info *);
+static int user_init(struct file *);
+static int get_ctxt_info(struct file *, void __user *, __u32);
+static int get_base_info(struct file *, void __user *, __u32);
+static int setup_ctxt(struct file *);
+static int setup_subctxt(struct hfi1_ctxtdata *);
+static int get_user_context(struct file *, struct hfi1_user_info *,
+                           int, unsigned);
+static int find_shared_ctxt(struct file *, const struct hfi1_user_info *);
+static int allocate_ctxt(struct file *, struct hfi1_devdata *,
+                        struct hfi1_user_info *);
+static unsigned int poll_urgent(struct file *, struct poll_table_struct *);
+static unsigned int poll_next(struct file *, struct poll_table_struct *);
+static int user_event_ack(struct hfi1_ctxtdata *, int, unsigned long);
+static int set_ctxt_pkey(struct hfi1_ctxtdata *, unsigned, u16);
+static int manage_rcvq(struct hfi1_ctxtdata *, unsigned, int);
+static int vma_fault(struct vm_area_struct *, struct vm_fault *);
+static int exp_tid_setup(struct file *, struct hfi1_tid_info *);
+static int exp_tid_free(struct file *, struct hfi1_tid_info *);
+static void unlock_exp_tids(struct hfi1_ctxtdata *);
+
+static const struct file_operations hfi1_file_ops = {
+       .owner = THIS_MODULE,
+       .write = hfi1_file_write,
+       .write_iter = hfi1_write_iter,
+       .open = hfi1_file_open,
+       .release = hfi1_file_close,
+       .poll = hfi1_poll,
+       .mmap = hfi1_file_mmap,
+       .llseek = noop_llseek,
+};
+
+static struct vm_operations_struct vm_ops = {
+       .fault = vma_fault,
+};
+
+/*
+ * Types of memory mapped into user processes' address space
+ */
+enum mmap_types {
+       PIO_BUFS = 1,
+       PIO_BUFS_SOP,
+       PIO_CRED,
+       RCV_HDRQ,
+       RCV_EGRBUF,
+       UREGS,
+       EVENTS,
+       STATUS,
+       RTAIL,
+       SUBCTXT_UREGS,
+       SUBCTXT_RCV_HDRQ,
+       SUBCTXT_EGRBUF,
+       SDMA_COMP
+};
+
+/*
+ * Masks and offsets defining the mmap tokens
+ */
+#define HFI1_MMAP_OFFSET_MASK   0xfffULL
+#define HFI1_MMAP_OFFSET_SHIFT  0
+#define HFI1_MMAP_SUBCTXT_MASK  0xfULL
+#define HFI1_MMAP_SUBCTXT_SHIFT 12
+#define HFI1_MMAP_CTXT_MASK     0xffULL
+#define HFI1_MMAP_CTXT_SHIFT    16
+#define HFI1_MMAP_TYPE_MASK     0xfULL
+#define HFI1_MMAP_TYPE_SHIFT    24
+#define HFI1_MMAP_MAGIC_MASK    0xffffffffULL
+#define HFI1_MMAP_MAGIC_SHIFT   32
+
+#define HFI1_MMAP_MAGIC         0xdabbad00
+
+#define HFI1_MMAP_TOKEN_SET(field, val)        \
+       (((val) & HFI1_MMAP_##field##_MASK) << HFI1_MMAP_##field##_SHIFT)
+#define HFI1_MMAP_TOKEN_GET(field, token) \
+       (((token) >> HFI1_MMAP_##field##_SHIFT) & HFI1_MMAP_##field##_MASK)
+#define HFI1_MMAP_TOKEN(type, ctxt, subctxt, addr)   \
+       (HFI1_MMAP_TOKEN_SET(MAGIC, HFI1_MMAP_MAGIC) | \
+       HFI1_MMAP_TOKEN_SET(TYPE, type) | \
+       HFI1_MMAP_TOKEN_SET(CTXT, ctxt) | \
+       HFI1_MMAP_TOKEN_SET(SUBCTXT, subctxt) | \
+       HFI1_MMAP_TOKEN_SET(OFFSET, ((unsigned long)addr & ~PAGE_MASK)))
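A minimal sketch (not taken from the driver) of how the token macros above pack and unpack an mmap token; the context/subcontext numbers are made up for illustration:

    static int example_token_roundtrip(void)
    {
            /* hypothetical values: the UREGS page of context 3, subcontext 1, no sub-page offset */
            u64 token = HFI1_MMAP_TOKEN(UREGS, 3, 1, 0);

            /* the MAGIC field is how the driver later recognizes a well-formed token */
            if (HFI1_MMAP_TOKEN_GET(MAGIC, token) != HFI1_MMAP_MAGIC)
                    return -EINVAL;

            return HFI1_MMAP_TOKEN_GET(TYPE, token) == UREGS &&
                   HFI1_MMAP_TOKEN_GET(CTXT, token) == 3 &&
                   HFI1_MMAP_TOKEN_GET(SUBCTXT, token) == 1;
    }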
+
+#define EXP_TID_SET(field, value)                      \
+       (((value) & EXP_TID_TID##field##_MASK) <<       \
+        EXP_TID_TID##field##_SHIFT)
+#define EXP_TID_CLEAR(tid, field) {                                    \
+               (tid) &= ~(EXP_TID_TID##field##_MASK <<                 \
+                          EXP_TID_TID##field##_SHIFT);                 \
+                       }
+#define EXP_TID_RESET(tid, field, value) do {                          \
+               EXP_TID_CLEAR(tid, field);                              \
+               (tid) |= EXP_TID_SET(field, value);                     \
+       } while (0)
+
+#define dbg(fmt, ...)                          \
+       pr_info(fmt, ##__VA_ARGS__)
+
+
+static inline int is_valid_mmap(u64 token)
+{
+       return (HFI1_MMAP_TOKEN_GET(MAGIC, token) == HFI1_MMAP_MAGIC);
+}
+
+static int hfi1_file_open(struct inode *inode, struct file *fp)
+{
+       /* The real work is performed later in assign_ctxt() */
+       fp->private_data = kzalloc(sizeof(struct hfi1_filedata), GFP_KERNEL);
+       if (fp->private_data) /* no cpu affinity by default */
+               ((struct hfi1_filedata *)fp->private_data)->rec_cpu_num = -1;
+       return fp->private_data ? 0 : -ENOMEM;
+}
+
+static ssize_t hfi1_file_write(struct file *fp, const char __user *data,
+                              size_t count, loff_t *offset)
+{
+       const struct hfi1_cmd __user *ucmd;
+       struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
+       struct hfi1_cmd cmd;
+       struct hfi1_user_info uinfo;
+       struct hfi1_tid_info tinfo;
+       ssize_t consumed = 0, copy = 0, ret = 0;
+       void *dest = NULL;
+       __u64 user_val = 0;
+       int uctxt_required = 1;
+       int must_be_root = 0;
+
+       if (count < sizeof(cmd)) {
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       ucmd = (const struct hfi1_cmd __user *)data;
+       if (copy_from_user(&cmd, ucmd, sizeof(cmd))) {
+               ret = -EFAULT;
+               goto bail;
+       }
+
+       consumed = sizeof(cmd);
+
+       switch (cmd.type) {
+       case HFI1_CMD_ASSIGN_CTXT:
+               uctxt_required = 0;     /* assigned user context not required */
+               copy = sizeof(uinfo);
+               dest = &uinfo;
+               break;
+       case HFI1_CMD_SDMA_STATUS_UPD:
+       case HFI1_CMD_CREDIT_UPD:
+               copy = 0;
+               break;
+       case HFI1_CMD_TID_UPDATE:
+       case HFI1_CMD_TID_FREE:
+               copy = sizeof(tinfo);
+               dest = &tinfo;
+               break;
+       case HFI1_CMD_USER_INFO:
+       case HFI1_CMD_RECV_CTRL:
+       case HFI1_CMD_POLL_TYPE:
+       case HFI1_CMD_ACK_EVENT:
+       case HFI1_CMD_CTXT_INFO:
+       case HFI1_CMD_SET_PKEY:
+       case HFI1_CMD_CTXT_RESET:
+               copy = 0;
+               user_val = cmd.addr;
+               break;
+       case HFI1_CMD_EP_INFO:
+       case HFI1_CMD_EP_ERASE_CHIP:
+       case HFI1_CMD_EP_ERASE_P0:
+       case HFI1_CMD_EP_ERASE_P1:
+       case HFI1_CMD_EP_READ_P0:
+       case HFI1_CMD_EP_READ_P1:
+       case HFI1_CMD_EP_WRITE_P0:
+       case HFI1_CMD_EP_WRITE_P1:
+               uctxt_required = 0;     /* assigned user context not required */
+               must_be_root = 1;       /* validate user */
+               copy = 0;
+               break;
+       default:
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       /* If the command comes with user data, copy it. */
+       if (copy) {
+               if (copy_from_user(dest, (void __user *)cmd.addr, copy)) {
+                       ret = -EFAULT;
+                       goto bail;
+               }
+               consumed += copy;
+       }
+
+       /*
+        * Make sure there is a uctxt when needed.
+        */
+       if (uctxt_required && !uctxt) {
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       /* only root can do these operations */
+       if (must_be_root && !capable(CAP_SYS_ADMIN)) {
+               ret = -EPERM;
+               goto bail;
+       }
+
+       switch (cmd.type) {
+       case HFI1_CMD_ASSIGN_CTXT:
+               ret = assign_ctxt(fp, &uinfo);
+               if (ret < 0)
+                       goto bail;
+               ret = setup_ctxt(fp);
+               if (ret)
+                       goto bail;
+               ret = user_init(fp);
+               break;
+       case HFI1_CMD_CTXT_INFO:
+               ret = get_ctxt_info(fp, (void __user *)(unsigned long)
+                                   user_val, cmd.len);
+               break;
+       case HFI1_CMD_USER_INFO:
+               ret = get_base_info(fp, (void __user *)(unsigned long)
+                                   user_val, cmd.len);
+               break;
+       case HFI1_CMD_SDMA_STATUS_UPD:
+               break;
+       case HFI1_CMD_CREDIT_UPD:
+               if (uctxt && uctxt->sc)
+                       sc_return_credits(uctxt->sc);
+               break;
+       case HFI1_CMD_TID_UPDATE:
+               ret = exp_tid_setup(fp, &tinfo);
+               if (!ret) {
+                       unsigned long addr;
+                       /*
+                        * Copy the number of tidlist entries we used
+                        * and the length of the buffer we registered.
+                        * These fields are adjacent in the structure so
+                        * we can copy them at the same time.
+                        */
+                       addr = (unsigned long)cmd.addr +
+                               offsetof(struct hfi1_tid_info, tidcnt);
+                       if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
+                                        sizeof(tinfo.tidcnt) +
+                                        sizeof(tinfo.length)))
+                               ret = -EFAULT;
+               }
+               break;
+       case HFI1_CMD_TID_FREE:
+               ret = exp_tid_free(fp, &tinfo);
+               break;
+       case HFI1_CMD_RECV_CTRL:
+               ret = manage_rcvq(uctxt, subctxt_fp(fp), (int)user_val);
+               break;
+       case HFI1_CMD_POLL_TYPE:
+               uctxt->poll_type = (typeof(uctxt->poll_type))user_val;
+               break;
+       case HFI1_CMD_ACK_EVENT:
+               ret = user_event_ack(uctxt, subctxt_fp(fp), user_val);
+               break;
+       case HFI1_CMD_SET_PKEY:
+               if (HFI1_CAP_IS_USET(PKEY_CHECK))
+                       ret = set_ctxt_pkey(uctxt, subctxt_fp(fp), user_val);
+               else
+                       ret = -EPERM;
+               break;
+       case HFI1_CMD_CTXT_RESET: {
+               struct send_context *sc;
+               struct hfi1_devdata *dd;
+
+               if (!uctxt || !uctxt->dd || !uctxt->sc) {
+                       ret = -EINVAL;
+                       break;
+               }
+               /*
+                * There is no protection here. User level has to
+                * guarantee that no one will be writing to the send
+                * context while it is being re-initialized.
+                * If user level breaks that guarantee, it will break
+                * its own context and no one else's.
+                */
+               dd = uctxt->dd;
+               sc = uctxt->sc;
+               /*
+                * Wait until the interrupt handler has marked the
+                * context as halted or frozen. Report error if we time
+                * out.
+                */
+               wait_event_interruptible_timeout(
+                       sc->halt_wait, (sc->flags & SCF_HALTED),
+                       msecs_to_jiffies(SEND_CTXT_HALT_TIMEOUT));
+               if (!(sc->flags & SCF_HALTED)) {
+                       ret = -ENOLCK;
+                       break;
+               }
+               /*
+                * If the send context was halted due to a Freeze,
+                * wait until the device has been "unfrozen" before
+                * resetting the context.
+                */
+               if (sc->flags & SCF_FROZEN) {
+                       wait_event_interruptible_timeout(
+                               dd->event_queue,
+                               !(ACCESS_ONCE(dd->flags) & HFI1_FROZEN),
+                               msecs_to_jiffies(SEND_CTXT_HALT_TIMEOUT));
+                       if (dd->flags & HFI1_FROZEN) {
+                               ret = -ENOLCK;
+                               break;
+                       }
+                       if (dd->flags & HFI1_FORCED_FREEZE) {
+                               /* Don't allow context reset if we are in a
+                                * forced freeze */
+                               ret = -ENODEV;
+                               break;
+                       }
+                       sc_disable(sc);
+                       ret = sc_enable(sc);
+                       hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_ENB,
+                                    uctxt->ctxt);
+               } else
+                       ret = sc_restart(sc);
+               if (!ret)
+                       sc_return_credits(sc);
+               break;
+       }
+       case HFI1_CMD_EP_INFO:
+       case HFI1_CMD_EP_ERASE_CHIP:
+       case HFI1_CMD_EP_ERASE_P0:
+       case HFI1_CMD_EP_ERASE_P1:
+       case HFI1_CMD_EP_READ_P0:
+       case HFI1_CMD_EP_READ_P1:
+       case HFI1_CMD_EP_WRITE_P0:
+       case HFI1_CMD_EP_WRITE_P1:
+               ret = handle_eprom_command(&cmd);
+               break;
+       }
+
+       if (ret >= 0)
+               ret = consumed;
+bail:
+       return ret;
+}
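A minimal user-space sketch (not taken from the driver) of the command framing this write() handler expects, using the EPROM info command handled above; it assumes fd is an open hfi1 user device descriptor, the caller has CAP_SYS_ADMIN, and uses only the struct hfi1_cmd fields referenced in this function (type, len, addr):

    __u32 dev_id = 0;
    struct hfi1_cmd cmd = {
            .type = HFI1_CMD_EP_INFO,
            .len  = sizeof(dev_id),                /* EP_INFO requires exactly sizeof(u32) */
            .addr = (__u64)(uintptr_t)&dev_id,     /* driver copies the ID back here */
    };

    /* on success the driver returns the number of command bytes it consumed */
    if (write(fd, &cmd, sizeof(cmd)) == sizeof(cmd))
            printf("EPROM manufacturer/device id: 0x%x\n", dev_id);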
+
+static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from)
+{
+       struct hfi1_user_sdma_pkt_q *pq;
+       struct hfi1_user_sdma_comp_q *cq;
+       int ret = 0, done = 0, reqs = 0;
+       unsigned long dim = from->nr_segs;
+
+       if (!user_sdma_comp_fp(kiocb->ki_filp) ||
+           !user_sdma_pkt_fp(kiocb->ki_filp)) {
+               ret = -EIO;
+               goto done;
+       }
+
+       if (!iter_is_iovec(from) || !dim) {
+               ret = -EINVAL;
+               goto done;
+       }
+
+       hfi1_cdbg(SDMA, "SDMA request from %u:%u (%lu)",
+                 ctxt_fp(kiocb->ki_filp)->ctxt, subctxt_fp(kiocb->ki_filp),
+                 dim);
+       pq = user_sdma_pkt_fp(kiocb->ki_filp);
+       cq = user_sdma_comp_fp(kiocb->ki_filp);
+
+       if (atomic_read(&pq->n_reqs) == pq->n_max_reqs) {
+               ret = -ENOSPC;
+               goto done;
+       }
+
+       while (dim) {
+               unsigned long count = 0;
+
+               ret = hfi1_user_sdma_process_request(
+                       kiocb->ki_filp, (struct iovec *)(from->iov + done),
+                       dim, &count);
+               if (ret)
+                       goto done;
+               dim -= count;
+               done += count;
+               reqs++;
+       }
+done:
+       return ret ? ret : reqs;
+}
+
+static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma)
+{
+       struct hfi1_ctxtdata *uctxt;
+       struct hfi1_devdata *dd;
+       unsigned long flags, pfn;
+       u64 token = vma->vm_pgoff << PAGE_SHIFT,
+               memaddr = 0;
+       u8 subctxt, mapio = 0, vmf = 0, type;
+       ssize_t memlen = 0;
+       int ret = 0;
+       u16 ctxt;
+
+       uctxt = ctxt_fp(fp);
+       if (!is_valid_mmap(token) || !uctxt ||
+           !(vma->vm_flags & VM_SHARED)) {
+               ret = -EINVAL;
+               goto done;
+       }
+       dd = uctxt->dd;
+       ctxt = HFI1_MMAP_TOKEN_GET(CTXT, token);
+       subctxt = HFI1_MMAP_TOKEN_GET(SUBCTXT, token);
+       type = HFI1_MMAP_TOKEN_GET(TYPE, token);
+       if (ctxt != uctxt->ctxt || subctxt != subctxt_fp(fp)) {
+               ret = -EINVAL;
+               goto done;
+       }
+
+       flags = vma->vm_flags;
+
+       switch (type) {
+       case PIO_BUFS:
+       case PIO_BUFS_SOP:
+               memaddr = ((dd->physaddr + TXE_PIO_SEND) +
+                               /* chip pio base */
+                          (uctxt->sc->hw_context * (1 << 16))) +
+                               /* 64K PIO space / ctxt */
+                       (type == PIO_BUFS_SOP ?
+                               (TXE_PIO_SIZE / 2) : 0); /* sop? */
+               /*
+                * Map only the amount allocated to the context, not the
+                * context's entire available PIO space.
+                */
+               memlen = ALIGN(uctxt->sc->credits * PIO_BLOCK_SIZE,
+                              PAGE_SIZE);
+               flags &= ~VM_MAYREAD;
+               flags |= VM_DONTCOPY | VM_DONTEXPAND;
+               vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
+               mapio = 1;
+               break;
+       case PIO_CRED:
+               if (flags & VM_WRITE) {
+                       ret = -EPERM;
+                       goto done;
+               }
+               /*
+                * The credit return location for this context could be on the
+                * second or third page allocated for credit returns (if number
+                * of enabled contexts > 64 and 128 respectively).
+                */
+               memaddr = dd->cr_base[uctxt->numa_id].pa +
+                       (((u64)uctxt->sc->hw_free -
+                         (u64)dd->cr_base[uctxt->numa_id].va) & PAGE_MASK);
+               memlen = PAGE_SIZE;
+               flags &= ~VM_MAYWRITE;
+               flags |= VM_DONTCOPY | VM_DONTEXPAND;
+               /*
+                * The driver has already allocated memory for credit
+                * returns and programmed it into the chip. Has that
+                * memory been flagged as non-cached?
+                */
+               /* vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); */
+               mapio = 1;
+               break;
+       case RCV_HDRQ:
+               memaddr = uctxt->rcvhdrq_phys;
+               memlen = uctxt->rcvhdrq_size;
+               break;
+       case RCV_EGRBUF: {
+               unsigned long addr;
+               int i;
+               /*
+                * The RcvEgr buffers need to be handled differently
+                * as multiple non-contiguous pages need to be mapped
+                * into the user process.
+                */
+               memlen = uctxt->egrbufs.size;
+               if ((vma->vm_end - vma->vm_start) != memlen) {
+                       dd_dev_err(dd, "Eager buffer map size invalid (%lu != %lu)\n",
+                                  (vma->vm_end - vma->vm_start), memlen);
+                       ret = -EINVAL;
+                       goto done;
+               }
+               if (vma->vm_flags & VM_WRITE) {
+                       ret = -EPERM;
+                       goto done;
+               }
+               vma->vm_flags &= ~VM_MAYWRITE;
+               addr = vma->vm_start;
+               for (i = 0 ; i < uctxt->egrbufs.numbufs; i++) {
+                       ret = remap_pfn_range(
+                               vma, addr,
+                               uctxt->egrbufs.buffers[i].phys >> PAGE_SHIFT,
+                               uctxt->egrbufs.buffers[i].len,
+                               vma->vm_page_prot);
+                       if (ret < 0)
+                               goto done;
+                       addr += uctxt->egrbufs.buffers[i].len;
+               }
+               ret = 0;
+               goto done;
+       }
+       case UREGS:
+               /*
+                * Map only the page that contains this context's user
+                * registers.
+                */
+               memaddr = (unsigned long)
+                       (dd->physaddr + RXE_PER_CONTEXT_USER)
+                       + (uctxt->ctxt * RXE_PER_CONTEXT_SIZE);
+               /*
+                * TidFlow table is on the same page as the rest of the
+                * user registers.
+                */
+               memlen = PAGE_SIZE;
+               flags |= VM_DONTCOPY | VM_DONTEXPAND;
+               vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+               mapio = 1;
+               break;
+       case EVENTS:
+               /*
+                * Use the page where this context's flags are. User level
+                * knows where its own bitmap is within the page.
+                */
+               memaddr = ((unsigned long)dd->events +
+                          ((uctxt->ctxt - dd->first_user_ctxt) *
+                           HFI1_MAX_SHARED_CTXTS)) & PAGE_MASK;
+               memlen = PAGE_SIZE;
+               /*
+                * v3.7 removes VM_RESERVED but the effect is kept by
+                * using VM_IO.
+                */
+               flags |= VM_IO | VM_DONTEXPAND;
+               vmf = 1;
+               break;
+       case STATUS:
+               memaddr = kvirt_to_phys((void *)dd->status);
+               memlen = PAGE_SIZE;
+               flags |= VM_IO | VM_DONTEXPAND;
+               break;
+       case RTAIL:
+               if (!HFI1_CAP_IS_USET(DMA_RTAIL)) {
+                       /*
+                        * If the memory allocation failed, the context alloc
+                        * also would have failed, so we would never get here
+                        */
+                       ret = -EINVAL;
+                       goto done;
+               }
+               if (flags & VM_WRITE) {
+                       ret = -EPERM;
+                       goto done;
+               }
+               memaddr = uctxt->rcvhdrqtailaddr_phys;
+               memlen = PAGE_SIZE;
+               flags &= ~VM_MAYWRITE;
+               break;
+       case SUBCTXT_UREGS:
+               memaddr = (u64)uctxt->subctxt_uregbase;
+               memlen = PAGE_SIZE;
+               flags |= VM_IO | VM_DONTEXPAND;
+               vmf = 1;
+               break;
+       case SUBCTXT_RCV_HDRQ:
+               memaddr = (u64)uctxt->subctxt_rcvhdr_base;
+               memlen = uctxt->rcvhdrq_size * uctxt->subctxt_cnt;
+               flags |= VM_IO | VM_DONTEXPAND;
+               vmf = 1;
+               break;
+       case SUBCTXT_EGRBUF:
+               memaddr = (u64)uctxt->subctxt_rcvegrbuf;
+               memlen = uctxt->egrbufs.size * uctxt->subctxt_cnt;
+               flags |= VM_IO | VM_DONTEXPAND;
+               flags &= ~VM_MAYWRITE;
+               vmf = 1;
+               break;
+       case SDMA_COMP: {
+               struct hfi1_user_sdma_comp_q *cq;
+
+               if (!user_sdma_comp_fp(fp)) {
+                       ret = -EFAULT;
+                       goto done;
+               }
+               cq = user_sdma_comp_fp(fp);
+               memaddr = (u64)cq->comps;
+               memlen = ALIGN(sizeof(*cq->comps) * cq->nentries, PAGE_SIZE);
+               flags |= VM_IO | VM_DONTEXPAND;
+               vmf = 1;
+               break;
+       }
+       default:
+               ret = -EINVAL;
+               break;
+       }
+
+       if ((vma->vm_end - vma->vm_start) != memlen) {
+               hfi1_cdbg(PROC, "%u:%u Memory size mismatch %lu:%lu",
+                         uctxt->ctxt, subctxt_fp(fp),
+                         (vma->vm_end - vma->vm_start), memlen);
+               ret = -EINVAL;
+               goto done;
+       }
+
+       vma->vm_flags = flags;
+       dd_dev_info(dd,
+                   "%s: %u:%u type:%u io/vf:%d/%d, addr:0x%llx, len:%lu(%lu), flags:0x%lx\n",
+                   __func__, ctxt, subctxt, type, mapio, vmf, memaddr, memlen,
+                   vma->vm_end - vma->vm_start, vma->vm_flags);
+       pfn = (unsigned long)(memaddr >> PAGE_SHIFT);
+       if (vmf) {
+               vma->vm_pgoff = pfn;
+               vma->vm_ops = &vm_ops;
+               ret = 0;
+       } else if (mapio) {
+               ret = io_remap_pfn_range(vma, vma->vm_start, pfn, memlen,
+                                        vma->vm_page_prot);
+       } else {
+               ret = remap_pfn_range(vma, vma->vm_start, pfn, memlen,
+                                     vma->vm_page_prot);
+       }
+done:
+       return ret;
+}
+
+/*
+ * Local (non-chip) user memory is not mapped right away; it is mapped
+ * on demand as it is accessed by the user-level code.
+ */
+static int vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+       struct page *page;
+
+       page = vmalloc_to_page((void *)(vmf->pgoff << PAGE_SHIFT));
+       if (!page)
+               return VM_FAULT_SIGBUS;
+
+       get_page(page);
+       vmf->page = page;
+
+       return 0;
+}
+
+static unsigned int hfi1_poll(struct file *fp, struct poll_table_struct *pt)
+{
+       struct hfi1_ctxtdata *uctxt;
+       unsigned pollflag;
+
+       uctxt = ctxt_fp(fp);
+       if (!uctxt)
+               pollflag = POLLERR;
+       else if (uctxt->poll_type == HFI1_POLL_TYPE_URGENT)
+               pollflag = poll_urgent(fp, pt);
+       else  if (uctxt->poll_type == HFI1_POLL_TYPE_ANYRCV)
+               pollflag = poll_next(fp, pt);
+       else /* invalid */
+               pollflag = POLLERR;
+
+       return pollflag;
+}
+
+static int hfi1_file_close(struct inode *inode, struct file *fp)
+{
+       struct hfi1_filedata *fdata = fp->private_data;
+       struct hfi1_ctxtdata *uctxt = fdata->uctxt;
+       struct hfi1_devdata *dd;
+       unsigned long flags, *ev;
+
+       fp->private_data = NULL;
+
+       if (!uctxt)
+               goto done;
+
+       hfi1_cdbg(PROC, "freeing ctxt %u:%u", uctxt->ctxt, fdata->subctxt);
+       dd = uctxt->dd;
+       mutex_lock(&hfi1_mutex);
+
+       flush_wc();
+       /* drain user sdma queue */
+       if (fdata->pq)
+               hfi1_user_sdma_free_queues(fdata);
+
+       /*
+        * Clear any left over, unhandled events so the next process that
+        * gets this context doesn't get confused.
+        */
+       ev = dd->events + ((uctxt->ctxt - dd->first_user_ctxt) *
+                          HFI1_MAX_SHARED_CTXTS) + fdata->subctxt;
+       *ev = 0;
+
+       if (--uctxt->cnt) {
+               uctxt->active_slaves &= ~(1 << fdata->subctxt);
+               uctxt->subpid[fdata->subctxt] = 0;
+               mutex_unlock(&hfi1_mutex);
+               goto done;
+       }
+
+       spin_lock_irqsave(&dd->uctxt_lock, flags);
+       /*
+        * Disable receive context and interrupt available, reset all
+        * RcvCtxtCtrl bits to default values.
+        */
+       hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS |
+                    HFI1_RCVCTRL_TIDFLOW_DIS |
+                    HFI1_RCVCTRL_INTRAVAIL_DIS |
+                    HFI1_RCVCTRL_ONE_PKT_EGR_DIS |
+                    HFI1_RCVCTRL_NO_RHQ_DROP_DIS |
+                    HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt->ctxt);
+       /* Clear the context's J_KEY */
+       hfi1_clear_ctxt_jkey(dd, uctxt->ctxt);
+       /*
+        * Reset context integrity checks to default.
+        * (writes to CSRs probably belong in chip.c)
+        */
+       write_kctxt_csr(dd, uctxt->sc->hw_context, SEND_CTXT_CHECK_ENABLE,
+                       hfi1_pkt_default_send_ctxt_mask(dd, uctxt->sc->type));
+       sc_disable(uctxt->sc);
+       uctxt->pid = 0;
+       spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+
+       dd->rcd[uctxt->ctxt] = NULL;
+       uctxt->rcvwait_to = 0;
+       uctxt->piowait_to = 0;
+       uctxt->rcvnowait = 0;
+       uctxt->pionowait = 0;
+       uctxt->event_flags = 0;
+
+       hfi1_clear_tids(uctxt);
+       hfi1_clear_ctxt_pkey(dd, uctxt->ctxt);
+
+       if (uctxt->tid_pg_list)
+               unlock_exp_tids(uctxt);
+
+       hfi1_stats.sps_ctxts--;
+       dd->freectxts++;
+       mutex_unlock(&hfi1_mutex);
+       hfi1_free_ctxtdata(dd, uctxt);
+done:
+       kfree(fdata);
+       return 0;
+}
+
+/*
+ * Convert kernel *virtual* addresses to physical addresses.
+ * This is used for vmalloc'ed addresses.
+ */
+static u64 kvirt_to_phys(void *addr)
+{
+       struct page *page;
+       u64 paddr = 0;
+
+       page = vmalloc_to_page(addr);
+       if (page)
+               paddr = page_to_pfn(page) << PAGE_SHIFT;
+
+       return paddr;
+}
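For the STATUS mapping above, this helper plus the pfn conversion at the end of hfi1_file_mmap() boil down to the following (a sketch, not taken from the driver; dd->status is presumably vmalloc'ed, which is why kvirt_to_phys() is used for it):

    u64 paddr = kvirt_to_phys((void *)dd->status);
    unsigned long pfn = (unsigned long)(paddr >> PAGE_SHIFT);

    /* STATUS sets neither vmf nor mapio, so the plain remap path is taken */
    ret = remap_pfn_range(vma, vma->vm_start, pfn, PAGE_SIZE, vma->vm_page_prot);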
+
+static int assign_ctxt(struct file *fp, struct hfi1_user_info *uinfo)
+{
+       int i_minor, ret = 0;
+       unsigned swmajor, swminor, alg = HFI1_ALG_ACROSS;
+
+       swmajor = uinfo->userversion >> 16;
+       if (swmajor != HFI1_USER_SWMAJOR) {
+               ret = -ENODEV;
+               goto done;
+       }
+
+       swminor = uinfo->userversion & 0xffff;
+
+       if (uinfo->hfi1_alg < HFI1_ALG_COUNT)
+               alg = uinfo->hfi1_alg;
+
+       mutex_lock(&hfi1_mutex);
+       /* First, let's check whether we need to set up a shared context */
+       if (uinfo->subctxt_cnt)
+               ret = find_shared_ctxt(fp, uinfo);
+
+       /*
+        * We execute the following block if we couldn't find a
+        * shared context or if context sharing is not required.
+        */
+       if (!ret) {
+               i_minor = iminor(file_inode(fp)) - HFI1_USER_MINOR_BASE;
+               ret = get_user_context(fp, uinfo, i_minor - 1, alg);
+       }
+       mutex_unlock(&hfi1_mutex);
+done:
+       return ret;
+}
+
+static int get_user_context(struct file *fp, struct hfi1_user_info *uinfo,
+                           int devno, unsigned alg)
+{
+       struct hfi1_devdata *dd = NULL;
+       int ret = 0, devmax, npresent, nup, dev;
+
+       devmax = hfi1_count_units(&npresent, &nup);
+       if (!npresent) {
+               ret = -ENXIO;
+               goto done;
+       }
+       if (!nup) {
+               ret = -ENETDOWN;
+               goto done;
+       }
+       if (devno >= 0) {
+               dd = hfi1_lookup(devno);
+               if (!dd)
+                       ret = -ENODEV;
+               else if (!dd->freectxts)
+                       ret = -EBUSY;
+       } else {
+               struct hfi1_devdata *pdd;
+
+               if (alg == HFI1_ALG_ACROSS) {
+                       unsigned free = 0U;
+
+                       for (dev = 0; dev < devmax; dev++) {
+                               pdd = hfi1_lookup(dev);
+                               if (pdd && pdd->freectxts &&
+                                   pdd->freectxts > free) {
+                                       dd = pdd;
+                                       free = pdd->freectxts;
+                               }
+                       }
+               } else {
+                       for (dev = 0; dev < devmax; dev++) {
+                               pdd = hfi1_lookup(dev);
+                               if (pdd && pdd->freectxts) {
+                                       dd = pdd;
+                                       break;
+                               }
+                       }
+               }
+               if (!dd)
+                       ret = -EBUSY;
+       }
+done:
+       return ret ? ret : allocate_ctxt(fp, dd, uinfo);
+}
+
+static int find_shared_ctxt(struct file *fp,
+                           const struct hfi1_user_info *uinfo)
+{
+       int devmax, ndev, i;
+       int ret = 0;
+
+       devmax = hfi1_count_units(NULL, NULL);
+
+       for (ndev = 0; ndev < devmax; ndev++) {
+               struct hfi1_devdata *dd = hfi1_lookup(ndev);
+
+               /* device portion of usable() */
+               if (!(dd && (dd->flags & HFI1_PRESENT) && dd->kregbase))
+                       continue;
+               for (i = dd->first_user_ctxt; i < dd->num_rcv_contexts; i++) {
+                       struct hfi1_ctxtdata *uctxt = dd->rcd[i];
+
+                       /* Skip ctxts which are not yet open */
+                       if (!uctxt || !uctxt->cnt)
+                               continue;
+                       /* Skip ctxt if it doesn't match the requested one */
+                       if (memcmp(uctxt->uuid, uinfo->uuid,
+                                  sizeof(uctxt->uuid)) ||
+                           uctxt->subctxt_id != uinfo->subctxt_id ||
+                           uctxt->subctxt_cnt != uinfo->subctxt_cnt)
+                               continue;
+
+                       /* Verify the sharing process matches the master */
+                       if (uctxt->userversion != uinfo->userversion ||
+                           uctxt->cnt >= uctxt->subctxt_cnt) {
+                               ret = -EINVAL;
+                               goto done;
+                       }
+                       ctxt_fp(fp) = uctxt;
+                       subctxt_fp(fp) = uctxt->cnt++;
+                       uctxt->subpid[subctxt_fp(fp)] = current->pid;
+                       uctxt->active_slaves |= 1 << subctxt_fp(fp);
+                       ret = 1;
+                       goto done;
+               }
+       }
+
+done:
+       return ret;
+}
+
+static int allocate_ctxt(struct file *fp, struct hfi1_devdata *dd,
+                        struct hfi1_user_info *uinfo)
+{
+       struct hfi1_ctxtdata *uctxt;
+       unsigned ctxt;
+       int ret;
+
+       if (dd->flags & HFI1_FROZEN) {
+               /*
+                * Pick an error that is unique from all other errors
+                * that are returned so the user process knows that
+                * it tried to allocate while the SPC was frozen.  It
+                * should be able to retry with success in a short
+                * while.
+                */
+               return -EIO;
+       }
+
+       for (ctxt = dd->first_user_ctxt; ctxt < dd->num_rcv_contexts; ctxt++)
+               if (!dd->rcd[ctxt])
+                       break;
+
+       if (ctxt == dd->num_rcv_contexts)
+               return -EBUSY;
+
+       uctxt = hfi1_create_ctxtdata(dd->pport, ctxt);
+       if (!uctxt) {
+               dd_dev_err(dd,
+                          "Unable to allocate ctxtdata memory, failing open\n");
+               return -ENOMEM;
+       }
+       /*
+        * Allocate and enable a PIO send context.
+        */
+       uctxt->sc = sc_alloc(dd, SC_USER, uctxt->rcvhdrqentsize,
+                            uctxt->numa_id);
+       if (!uctxt->sc)
+               return -ENOMEM;
+
+       dbg("allocated send context %u(%u)\n", uctxt->sc->sw_index,
+               uctxt->sc->hw_context);
+       ret = sc_enable(uctxt->sc);
+       if (ret)
+               return ret;
+       /*
+        * Setup shared context resources if the user-level has requested
+        * shared contexts and this is the 'master' process.
+        * This has to be done here so the rest of the sub-contexts find the
+        * proper master.
+        */
+       if (uinfo->subctxt_cnt && !subctxt_fp(fp)) {
+               ret = init_subctxts(uctxt, uinfo);
+               /*
+                * On error, we don't need to disable and de-allocate the
+                * send context because it will be done during file close
+                */
+               if (ret)
+                       return ret;
+       }
+       uctxt->userversion = uinfo->userversion;
+       uctxt->pid = current->pid;
+       uctxt->flags = HFI1_CAP_UGET(MASK);
+       init_waitqueue_head(&uctxt->wait);
+       strlcpy(uctxt->comm, current->comm, sizeof(uctxt->comm));
+       memcpy(uctxt->uuid, uinfo->uuid, sizeof(uctxt->uuid));
+       uctxt->jkey = generate_jkey(current_uid());
+       INIT_LIST_HEAD(&uctxt->sdma_queues);
+       spin_lock_init(&uctxt->sdma_qlock);
+       hfi1_stats.sps_ctxts++;
+       dd->freectxts--;
+       ctxt_fp(fp) = uctxt;
+
+       return 0;
+}
+
+static int init_subctxts(struct hfi1_ctxtdata *uctxt,
+                        const struct hfi1_user_info *uinfo)
+{
+       int ret = 0;
+       unsigned num_subctxts;
+
+       num_subctxts = uinfo->subctxt_cnt;
+       if (num_subctxts > HFI1_MAX_SHARED_CTXTS) {
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       uctxt->subctxt_cnt = uinfo->subctxt_cnt;
+       uctxt->subctxt_id = uinfo->subctxt_id;
+       uctxt->active_slaves = 1;
+       uctxt->redirect_seq_cnt = 1;
+       set_bit(HFI1_CTXT_MASTER_UNINIT, &uctxt->event_flags);
+bail:
+       return ret;
+}
+
+static int setup_subctxt(struct hfi1_ctxtdata *uctxt)
+{
+       int ret = 0;
+       unsigned num_subctxts = uctxt->subctxt_cnt;
+
+       uctxt->subctxt_uregbase = vmalloc_user(PAGE_SIZE);
+       if (!uctxt->subctxt_uregbase) {
+               ret = -ENOMEM;
+               goto bail;
+       }
+       /* We can take the size of the RcvHdr Queue from the master */
+       uctxt->subctxt_rcvhdr_base = vmalloc_user(uctxt->rcvhdrq_size *
+                                                 num_subctxts);
+       if (!uctxt->subctxt_rcvhdr_base) {
+               ret = -ENOMEM;
+               goto bail_ureg;
+       }
+
+       uctxt->subctxt_rcvegrbuf = vmalloc_user(uctxt->egrbufs.size *
+                                               num_subctxts);
+       if (!uctxt->subctxt_rcvegrbuf) {
+               ret = -ENOMEM;
+               goto bail_rhdr;
+       }
+       goto bail;
+bail_rhdr:
+       vfree(uctxt->subctxt_rcvhdr_base);
+bail_ureg:
+       vfree(uctxt->subctxt_uregbase);
+       uctxt->subctxt_uregbase = NULL;
+bail:
+       return ret;
+}
+
+static int user_init(struct file *fp)
+{
+       int ret;
+       unsigned int rcvctrl_ops = 0;
+       struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
+
+       /* make sure that the context has already been setup */
+       if (!test_bit(HFI1_CTXT_SETUP_DONE, &uctxt->event_flags)) {
+               ret = -EFAULT;
+               goto done;
+       }
+
+       /*
+        * Subctxts don't need to initialize anything since master
+        * has done it.
+        */
+       if (subctxt_fp(fp)) {
+               ret = wait_event_interruptible(uctxt->wait,
+                       !test_bit(HFI1_CTXT_MASTER_UNINIT,
+                       &uctxt->event_flags));
+               goto done;
+       }
+
+       /* initialize poll variables... */
+       uctxt->urgent = 0;
+       uctxt->urgent_poll = 0;
+
+       /*
+        * Now enable the ctxt for receive.
+        * For chips that are set to DMA the tail register to memory
+        * when they change (and when the update bit transitions from
+        * 0 to 1).  So for those chips, we turn it off and then back on.
+        * This will (very briefly) affect any other open ctxts, but the
+        * duration is very short, and therefore isn't an issue.  We
+        * explicitly set the in-memory tail copy to 0 beforehand, so we
+        * don't have to wait to be sure the DMA update has happened
+        * (chip resets head/tail to 0 on transition to enable).
+        */
+       if (uctxt->rcvhdrtail_kvaddr)
+               clear_rcvhdrtail(uctxt);
+
+       /* Setup J_KEY before enabling the context */
+       hfi1_set_ctxt_jkey(uctxt->dd, uctxt->ctxt, uctxt->jkey);
+
+       rcvctrl_ops = HFI1_RCVCTRL_CTXT_ENB;
+       if (HFI1_CAP_KGET_MASK(uctxt->flags, HDRSUPP))
+               rcvctrl_ops |= HFI1_RCVCTRL_TIDFLOW_ENB;
+       /*
+        * Ignore the bit in the flags for now until proper
+        * support for multiple packets per rcv array entry is
+        * added.
+        */
+       if (!HFI1_CAP_KGET_MASK(uctxt->flags, MULTI_PKT_EGR))
+               rcvctrl_ops |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB;
+       if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_EGR_FULL))
+               rcvctrl_ops |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
+       if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_RHQ_FULL))
+               rcvctrl_ops |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
+       if (HFI1_CAP_KGET_MASK(uctxt->flags, DMA_RTAIL))
+               rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_ENB;
+       hfi1_rcvctrl(uctxt->dd, rcvctrl_ops, uctxt->ctxt);
+
+       /* Notify any waiting slaves */
+       if (uctxt->subctxt_cnt) {
+               clear_bit(HFI1_CTXT_MASTER_UNINIT, &uctxt->event_flags);
+               wake_up(&uctxt->wait);
+       }
+       ret = 0;
+
+done:
+       return ret;
+}
+
+static int get_ctxt_info(struct file *fp, void __user *ubase, __u32 len)
+{
+       struct hfi1_ctxt_info cinfo;
+       struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
+       struct hfi1_filedata *fd = fp->private_data;
+       int ret = 0;
+
+       ret = hfi1_get_base_kinfo(uctxt, &cinfo);
+       if (ret < 0)
+               goto done;
+       cinfo.num_active = hfi1_count_active_units();
+       cinfo.unit = uctxt->dd->unit;
+       cinfo.ctxt = uctxt->ctxt;
+       cinfo.subctxt = subctxt_fp(fp);
+       cinfo.rcvtids = roundup(uctxt->egrbufs.alloced,
+                               uctxt->dd->rcv_entries.group_size) +
+               uctxt->expected_count;
+       cinfo.credits = uctxt->sc->credits;
+       cinfo.numa_node = uctxt->numa_id;
+       cinfo.rec_cpu = fd->rec_cpu_num;
+       cinfo.send_ctxt = uctxt->sc->hw_context;
+
+       cinfo.egrtids = uctxt->egrbufs.alloced;
+       cinfo.rcvhdrq_cnt = uctxt->rcvhdrq_cnt;
+       cinfo.rcvhdrq_entsize = uctxt->rcvhdrqentsize << 2;
+       cinfo.sdma_ring_size = user_sdma_comp_fp(fp)->nentries;
+       cinfo.rcvegr_size = uctxt->egrbufs.rcvtid_size;
+
+       trace_hfi1_ctxt_info(uctxt->dd, uctxt->ctxt, subctxt_fp(fp), cinfo);
+       if (copy_to_user(ubase, &cinfo, sizeof(cinfo)))
+               ret = -EFAULT;
+done:
+       return ret;
+}
+
+static int setup_ctxt(struct file *fp)
+{
+       struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
+       struct hfi1_devdata *dd = uctxt->dd;
+       int ret = 0;
+
+       /*
+        * Context should be set up only once (including allocation and
+        * programming of eager buffers). This is done if context sharing
+        * is not requested or by the master process.
+        */
+       if (!uctxt->subctxt_cnt || !subctxt_fp(fp)) {
+               ret = hfi1_init_ctxt(uctxt->sc);
+               if (ret)
+                       goto done;
+
+               /* Now allocate the RcvHdr queue and eager buffers. */
+               ret = hfi1_create_rcvhdrq(dd, uctxt);
+               if (ret)
+                       goto done;
+               ret = hfi1_setup_eagerbufs(uctxt);
+               if (ret)
+                       goto done;
+               if (uctxt->subctxt_cnt && !subctxt_fp(fp)) {
+                       ret = setup_subctxt(uctxt);
+                       if (ret)
+                               goto done;
+               }
+               /* Setup Expected Rcv memories */
+               uctxt->tid_pg_list = vzalloc(uctxt->expected_count *
+                                            sizeof(struct page **));
+               if (!uctxt->tid_pg_list) {
+                       ret = -ENOMEM;
+                       goto done;
+               }
+               uctxt->physshadow = vzalloc(uctxt->expected_count *
+                                           sizeof(*uctxt->physshadow));
+               if (!uctxt->physshadow) {
+                       ret = -ENOMEM;
+                       goto done;
+               }
+               /* allocate expected TID map and initialize the cursor */
+               atomic_set(&uctxt->tidcursor, 0);
+               uctxt->numtidgroups = uctxt->expected_count /
+                       dd->rcv_entries.group_size;
+               uctxt->tidmapcnt = uctxt->numtidgroups / BITS_PER_LONG +
+                       !!(uctxt->numtidgroups % BITS_PER_LONG);
+               uctxt->tidusemap = kzalloc_node(uctxt->tidmapcnt *
+                                               sizeof(*uctxt->tidusemap),
+                                               GFP_KERNEL, uctxt->numa_id);
+               if (!uctxt->tidusemap) {
+                       ret = -ENOMEM;
+                       goto done;
+               }
+               /*
+                * If the number of groups is not a multiple of 64 (the
+                * number of groups in a tidusemap element), mark the
+                * extra bits as used. This effectively makes them
+                * permanently used so they will never be assigned. Otherwise,
+                * the code which checks how many free groups we have will
+                * get completely confused about the state of the bits.
+                */
+               if (uctxt->numtidgroups % BITS_PER_LONG)
+                       uctxt->tidusemap[uctxt->tidmapcnt - 1] =
+                               ~((1ULL << (uctxt->numtidgroups %
+                                           BITS_PER_LONG)) - 1);
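+               /*
+                * For example, 100 groups gives tidmapcnt = 2 and leaves
+                * bits 0-35 of the last map usable, with bits 36-63 marked
+                * permanently used.
+                */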
+               trace_hfi1_exp_tid_map(uctxt->ctxt, subctxt_fp(fp), 0,
+                                      uctxt->tidusemap, uctxt->tidmapcnt);
+       }
+       ret = hfi1_user_sdma_alloc_queues(uctxt, fp);
+       if (ret)
+               goto done;
+
+       set_bit(HFI1_CTXT_SETUP_DONE, &uctxt->event_flags);
+done:
+       return ret;
+}
+
+static int get_base_info(struct file *fp, void __user *ubase, __u32 len)
+{
+       struct hfi1_base_info binfo;
+       struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
+       struct hfi1_devdata *dd = uctxt->dd;
+       ssize_t sz;
+       unsigned offset;
+       int ret = 0;
+
+       trace_hfi1_uctxtdata(uctxt->dd, uctxt);
+
+       memset(&binfo, 0, sizeof(binfo));
+       binfo.hw_version = dd->revision;
+       binfo.sw_version = HFI1_KERN_SWVERSION;
+       binfo.bthqp = kdeth_qp;
+       binfo.jkey = uctxt->jkey;
+       /*
+        * If more than 64 contexts are enabled the allocated credit
+        * return will span two or three contiguous pages. Since we only
+        * map the page containing the context's credit return address,
+        * we need to calculate the offset in the proper page.
+        */
+       offset = ((u64)uctxt->sc->hw_free -
+                 (u64)dd->cr_base[uctxt->numa_id].va) % PAGE_SIZE;
+       binfo.sc_credits_addr = HFI1_MMAP_TOKEN(PIO_CRED, uctxt->ctxt,
+                                              subctxt_fp(fp), offset);
+       binfo.pio_bufbase = HFI1_MMAP_TOKEN(PIO_BUFS, uctxt->ctxt,
+                                           subctxt_fp(fp),
+                                           uctxt->sc->base_addr);
+       binfo.pio_bufbase_sop = HFI1_MMAP_TOKEN(PIO_BUFS_SOP,
+                                               uctxt->ctxt,
+                                               subctxt_fp(fp),
+                                               uctxt->sc->base_addr);
+       binfo.rcvhdr_bufbase = HFI1_MMAP_TOKEN(RCV_HDRQ, uctxt->ctxt,
+                                              subctxt_fp(fp),
+                                              uctxt->rcvhdrq);
+       binfo.rcvegr_bufbase = HFI1_MMAP_TOKEN(RCV_EGRBUF, uctxt->ctxt,
+                                              subctxt_fp(fp),
+                                              uctxt->egrbufs.rcvtids[0].phys);
+       binfo.sdma_comp_bufbase = HFI1_MMAP_TOKEN(SDMA_COMP, uctxt->ctxt,
+                                                subctxt_fp(fp), 0);
+       /*
+        * user regs are at
+        * (RXE_PER_CONTEXT_USER + (ctxt * RXE_PER_CONTEXT_SIZE))
+        */
+       binfo.user_regbase = HFI1_MMAP_TOKEN(UREGS, uctxt->ctxt,
+                                           subctxt_fp(fp), 0);
+       offset = ((((uctxt->ctxt - dd->first_user_ctxt) *
+                   HFI1_MAX_SHARED_CTXTS) + subctxt_fp(fp)) *
+                 sizeof(*dd->events)) & ~PAGE_MASK;
+       binfo.events_bufbase = HFI1_MMAP_TOKEN(EVENTS, uctxt->ctxt,
+                                             subctxt_fp(fp),
+                                             offset);
+       binfo.status_bufbase = HFI1_MMAP_TOKEN(STATUS, uctxt->ctxt,
+                                             subctxt_fp(fp),
+                                             dd->status);
+       if (HFI1_CAP_IS_USET(DMA_RTAIL))
+               binfo.rcvhdrtail_base = HFI1_MMAP_TOKEN(RTAIL, uctxt->ctxt,
+                                                      subctxt_fp(fp), 0);
+       if (uctxt->subctxt_cnt) {
+               binfo.subctxt_uregbase = HFI1_MMAP_TOKEN(SUBCTXT_UREGS,
+                                                       uctxt->ctxt,
+                                                       subctxt_fp(fp), 0);
+               binfo.subctxt_rcvhdrbuf = HFI1_MMAP_TOKEN(SUBCTXT_RCV_HDRQ,
+                                                        uctxt->ctxt,
+                                                        subctxt_fp(fp), 0);
+               binfo.subctxt_rcvegrbuf = HFI1_MMAP_TOKEN(SUBCTXT_EGRBUF,
+                                                        uctxt->ctxt,
+                                                        subctxt_fp(fp), 0);
+       }
+       sz = (len < sizeof(binfo)) ? len : sizeof(binfo);
+       if (copy_to_user(ubase, &binfo, sz))
+               ret = -EFAULT;
+       return ret;
+}
+
+static unsigned int poll_urgent(struct file *fp,
+                               struct poll_table_struct *pt)
+{
+       struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
+       struct hfi1_devdata *dd = uctxt->dd;
+       unsigned pollflag;
+
+       poll_wait(fp, &uctxt->wait, pt);
+
+       spin_lock_irq(&dd->uctxt_lock);
+       if (uctxt->urgent != uctxt->urgent_poll) {
+               pollflag = POLLIN | POLLRDNORM;
+               uctxt->urgent_poll = uctxt->urgent;
+       } else {
+               pollflag = 0;
+               set_bit(HFI1_CTXT_WAITING_URG, &uctxt->event_flags);
+       }
+       spin_unlock_irq(&dd->uctxt_lock);
+
+       return pollflag;
+}
+
+static unsigned int poll_next(struct file *fp,
+                             struct poll_table_struct *pt)
+{
+       struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
+       struct hfi1_devdata *dd = uctxt->dd;
+       unsigned pollflag;
+
+       poll_wait(fp, &uctxt->wait, pt);
+
+       spin_lock_irq(&dd->uctxt_lock);
+       if (hdrqempty(uctxt)) {
+               set_bit(HFI1_CTXT_WAITING_RCV, &uctxt->event_flags);
+               hfi1_rcvctrl(dd, HFI1_RCVCTRL_INTRAVAIL_ENB, uctxt->ctxt);
+               pollflag = 0;
+       } else
+               pollflag = POLLIN | POLLRDNORM;
+       spin_unlock_irq(&dd->uctxt_lock);
+
+       return pollflag;
+}
+
+/*
+ * Find all user contexts in use, and set the specified bit in their
+ * event mask.
+ * See also find_ctxt() for a similar use that is specific to send buffers.
+ */
+int hfi1_set_uevent_bits(struct hfi1_pportdata *ppd, const int evtbit)
+{
+       struct hfi1_ctxtdata *uctxt;
+       struct hfi1_devdata *dd = ppd->dd;
+       unsigned ctxt;
+       int ret = 0;
+       unsigned long flags;
+
+       if (!dd->events) {
+               ret = -EINVAL;
+               goto done;
+       }
+
+       spin_lock_irqsave(&dd->uctxt_lock, flags);
+       for (ctxt = dd->first_user_ctxt; ctxt < dd->num_rcv_contexts;
+            ctxt++) {
+               uctxt = dd->rcd[ctxt];
+               if (uctxt) {
+                       unsigned long *evs = dd->events +
+                               (uctxt->ctxt - dd->first_user_ctxt) *
+                               HFI1_MAX_SHARED_CTXTS;
+                       int i;
+                       /*
+                        * subctxt_cnt is 0 if the context is not shared, so
+                        * set the base entry first, then the remaining
+                        * subcontext entries, if any
+                        */
+                       set_bit(evtbit, evs);
+                       for (i = 1; i < uctxt->subctxt_cnt; i++)
+                               set_bit(evtbit, evs + i);
+               }
+       }
+       spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+done:
+       return ret;
+}
+
+/**
+ * manage_rcvq - manage a context's receive queue
+ * @uctxt: the context
+ * @subctxt: the sub-context
+ * @start_stop: action to carry out
+ *
+ * start_stop == 0 disables receive on the context, for use in queue
+ * overflow conditions.  start_stop == 1 re-enables, and is used to
+ * re-init the software copy of the head register.
+ */
+static int manage_rcvq(struct hfi1_ctxtdata *uctxt, unsigned subctxt,
+                      int start_stop)
+{
+       struct hfi1_devdata *dd = uctxt->dd;
+       unsigned int rcvctrl_op;
+
+       if (subctxt)
+               goto bail;
+       /* atomically enable or disable receive on the context */
+       if (start_stop) {
+               /*
+                * On enable, force in-memory copy of the tail register to
+                * 0, so that protocol code doesn't have to worry about
+                * whether or not the chip has yet updated the in-memory
+                * copy on return from the system call. The chip
+                * always resets its tail register back to 0 on a
+                * transition from disabled to enabled.
+                */
+               if (uctxt->rcvhdrtail_kvaddr)
+                       clear_rcvhdrtail(uctxt);
+               rcvctrl_op = HFI1_RCVCTRL_CTXT_ENB;
+       } else
+               rcvctrl_op = HFI1_RCVCTRL_CTXT_DIS;
+       hfi1_rcvctrl(dd, rcvctrl_op, uctxt->ctxt);
+       /* always; new head should be equal to new tail; see above */
+bail:
+       return 0;
+}
+
+/*
+ * clear the event notifier events for this context.
+ * The user process then performs actions appropriate to the bits that
+ * were set, if desired, and checks again in the future.
+ */
+static int user_event_ack(struct hfi1_ctxtdata *uctxt, int subctxt,
+                         unsigned long events)
+{
+       int i;
+       struct hfi1_devdata *dd = uctxt->dd;
+       unsigned long *evs;
+
+       if (!dd->events)
+               return 0;
+
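+       /*
+        * Each user context owns a block of HFI1_MAX_SHARED_CTXTS slots in
+        * dd->events; index into that block by subcontext.
+        */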
+       evs = dd->events + ((uctxt->ctxt - dd->first_user_ctxt) *
+                           HFI1_MAX_SHARED_CTXTS) + subctxt;
+
+       for (i = 0; i <= _HFI1_MAX_EVENT_BIT; i++) {
+               if (!test_bit(i, &events))
+                       continue;
+               clear_bit(i, evs);
+       }
+       return 0;
+}
+
+#define num_user_pages(vaddr, len)                                     \
+       (1 + (((((unsigned long)(vaddr) +                               \
+                (unsigned long)(len) - 1) & PAGE_MASK) -               \
+              ((unsigned long)vaddr & PAGE_MASK)) >> PAGE_SHIFT))
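+/*
+ * For example, assuming 4 KiB pages, vaddr = 0x1234 with len = 0x2000 touches
+ * the pages at 0x1000, 0x2000 and 0x3000, so num_user_pages() returns 3.
+ */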
+
+/**
+ * tzcnt - count the number of trailing zeros in a 64-bit value
+ * @value: the value to be examined
+ *
+ * Returns the number of trailing least significant zeros in the
+ * input value. If the value is zero, return the number of
+ * bits of the value.
+ */
+static inline u8 tzcnt(u64 value)
+{
+       return value ? __builtin_ctzl(value) : sizeof(value) * 8;
+}
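+/* e.g. tzcnt(0x8) == 3, tzcnt(1) == 0 and tzcnt(0) == 64 */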
+
+static inline unsigned num_free_groups(unsigned long map, u16 *start)
+{
+       unsigned free;
+       u16 bitidx = *start;
+
+       if (bitidx >= BITS_PER_LONG)
+               return 0;
+       /* "Turn off" any bits set before our bit index */
+       map &= ~((1ULL << bitidx) - 1);
+       free = tzcnt(map) - bitidx;
+       while (!free && bitidx < BITS_PER_LONG) {
+               /* Zero out the last set bit so we look at the rest */
+               map &= ~(1ULL << bitidx);
+               /*
+                * Account for the previously checked bits and advance
+                * the bit index. We don't have to check for bitidx
+                * getting bigger than BITS_PER_LONG here as it would
+                * mean extra instructions that we don't need. If it
+                * did happen, it would push free to a negative value
+                * which will break the loop.
+                */
+               free = tzcnt(map) - ++bitidx;
+       }
+       *start = bitidx;
+       return free;
+}
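+/*
+ * For example, with map = 0x7 and *start = 0 the three used groups at the
+ * bottom are skipped, *start becomes 3 and 61 free groups (bits 3-63) are
+ * returned.
+ */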
+
+static int exp_tid_setup(struct file *fp, struct hfi1_tid_info *tinfo)
+{
+       int ret = 0;
+       struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
+       struct hfi1_devdata *dd = uctxt->dd;
+       unsigned tid, mapped = 0, npages, ngroups, exp_groups,
+               tidpairs = uctxt->expected_count / 2;
+       struct page **pages;
+       unsigned long vaddr, tidmap[uctxt->tidmapcnt];
+       dma_addr_t *phys;
+       u32 tidlist[tidpairs], pairidx = 0, tidcursor;
+       u16 useidx, idx, bitidx, tidcnt = 0;
+
+       vaddr = tinfo->vaddr;
+
+       if (vaddr & ~PAGE_MASK) {
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       npages = num_user_pages(vaddr, tinfo->length);
+       if (!npages) {
+               ret = -EINVAL;
+               goto bail;
+       }
+       if (!access_ok(VERIFY_WRITE, (void __user *)vaddr,
+                      npages * PAGE_SIZE)) {
+               dd_dev_err(dd, "Fail vaddr %p, %u pages, !access_ok\n",
+                          (void *)vaddr, npages);
+               ret = -EFAULT;
+               goto bail;
+       }
+
+       memset(tidmap, 0, sizeof(tidmap[0]) * uctxt->tidmapcnt);
+       memset(tidlist, 0, sizeof(tidlist[0]) * tidpairs);
+
+       exp_groups = uctxt->expected_count / dd->rcv_entries.group_size;
+       /* which group set do we look at first? */
+       tidcursor = atomic_read(&uctxt->tidcursor);
+       useidx = (tidcursor >> 16) & 0xffff;
+       bitidx = tidcursor & 0xffff;
+
+       /*
+        * Keep going until we've mapped all pages or we've exhausted all
+        * RcvArray entries.
+        * This iterates over the number of tidmaps + 1
+        * (idx <= uctxt->tidmapcnt) so that the bitmap we started from
+        * is checked one more time for any free bits before the
+        * starting point bit.
+        */
+       for (mapped = 0, idx = 0;
+            mapped < npages && idx <= uctxt->tidmapcnt;) {
+               u64 i, offset = 0;
+               unsigned free, pinned, pmapped = 0, bits_used;
+               u16 grp;
+
+               /*
+                * "Reserve" the needed group bits under lock so other
+                * processes can't step in the middle of it. Once
+                * reserved, we don't need the lock anymore since we
+                * are guaranteed the groups.
+                */
+               spin_lock(&uctxt->exp_lock);
+               if (uctxt->tidusemap[useidx] == -1ULL ||
+                   bitidx >= BITS_PER_LONG) {
+                       /* no free groups in the set, use the next */
+                       useidx = (useidx + 1) % uctxt->tidmapcnt;
+                       idx++;
+                       bitidx = 0;
+                       spin_unlock(&uctxt->exp_lock);
+                       continue;
+               }
+               ngroups = ((npages - mapped) / dd->rcv_entries.group_size) +
+                       !!((npages - mapped) % dd->rcv_entries.group_size);
+
+               /*
+                * If we've gotten here, the current set of groups does have
+                * one or more free groups.
+                */
+               free = num_free_groups(uctxt->tidusemap[useidx], &bitidx);
+               if (!free) {
+                       /*
+                        * Despite the check above, free could still come back
+                        * as 0 because we don't check the entire bitmap;
+                        * we start from bitidx.
+                        */
+                       spin_unlock(&uctxt->exp_lock);
+                       continue;
+               }
+               bits_used = min(free, ngroups);
+               tidmap[useidx] |= ((1ULL << bits_used) - 1) << bitidx;
+               uctxt->tidusemap[useidx] |= tidmap[useidx];
+               spin_unlock(&uctxt->exp_lock);
+
+               /*
+                * At this point, we know where in the map we have free bits,
+                * so properly offset into the various "shadow" arrays and
+                * compute the RcvArray entry index.
+                */
+               offset = ((useidx * BITS_PER_LONG) + bitidx) *
+                       dd->rcv_entries.group_size;
+               pages = uctxt->tid_pg_list + offset;
+               phys = uctxt->physshadow + offset;
+               tid = uctxt->expected_base + offset;
+
+               /* Calculate how many pages we can pin based on free bits */
+               pinned = min((bits_used * dd->rcv_entries.group_size),
+                            (npages - mapped));
+               /*
+                * Now that we know how many free RcvArray entries we have,
+                * we can pin that many user pages.
+                */
+               ret = hfi1_get_user_pages(vaddr + (mapped * PAGE_SIZE),
+                                         pinned, pages);
+               if (ret) {
+                       /*
+                        * We can't continue because the pages array won't be
+                        * initialized. This should never happen,
+                        * unless perhaps the user has mpin'ed the pages
+                        * themselves.
+                        */
+                       dd_dev_info(dd,
+                                   "Failed to lock addr %p, %u pages: errno %d\n",
+                                   (void *) vaddr, pinned, -ret);
+                       /*
+                        * Let go of the bits that we reserved since we are not
+                        * going to use them.
+                        */
+                       spin_lock(&uctxt->exp_lock);
+                       uctxt->tidusemap[useidx] &=
+                               ~(((1ULL << bits_used) - 1) << bitidx);
+                       spin_unlock(&uctxt->exp_lock);
+                       goto done;
+               }
+               /*
+                * How many groups do we need based on how many pages we have
+                * pinned?
+                */
+               ngroups = (pinned / dd->rcv_entries.group_size) +
+                       !!(pinned % dd->rcv_entries.group_size);
+               /*
+                * Keep programming RcvArray entries for all the <ngroups> free
+                * groups.
+                */
+               for (i = 0, grp = 0; grp < ngroups; i++, grp++) {
+                       unsigned j;
+                       u32 pair_size = 0, tidsize;
+                       /*
+                        * This inner loop will program an entire group or the
+                        * array of pinned pages (whichever limit is hit
+                        * first).
+                        */
+                       for (j = 0; j < dd->rcv_entries.group_size &&
+                                    pmapped < pinned; j++, pmapped++, tid++) {
+                               tidsize = PAGE_SIZE;
+                               phys[pmapped] = hfi1_map_page(dd->pcidev,
+                                                  pages[pmapped], 0,
+                                                  tidsize, PCI_DMA_FROMDEVICE);
+                               trace_hfi1_exp_rcv_set(uctxt->ctxt,
+                                                      subctxt_fp(fp),
+                                                      tid, vaddr,
+                                                      phys[pmapped],
+                                                      pages[pmapped]);
+                               /*
+                                * Each RcvArray entry is programmed with one
+                                * page's worth of memory. This will handle
+                                * the 8K MTU as well as anything smaller
+                                * due to the fact that both entries in the
+                                * RcvTidPair are programmed with a page.
+                                * PSM currently does not handle anything
+                                * bigger than 8K MTU, so should we even worry
+                                * about 10K here?
+                                */
+                               hfi1_put_tid(dd, tid, PT_EXPECTED,
+                                            phys[pmapped],
+                                            ilog2(tidsize >> PAGE_SHIFT) + 1);
+                               pair_size += tidsize >> PAGE_SHIFT;
+                               EXP_TID_RESET(tidlist[pairidx], LEN, pair_size);
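+                               /*
+                                * An even TID starts a new pair: record the
+                                * pair index and CTRL 1. An odd TID completes
+                                * the pair with CTRL 2 and moves to the next
+                                * tidlist entry.
+                                */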
+                               if (!(tid % 2)) {
+                                       tidlist[pairidx] |=
+                                          EXP_TID_SET(IDX,
+                                               (tid - uctxt->expected_base)
+                                                      / 2);
+                                       tidlist[pairidx] |=
+                                               EXP_TID_SET(CTRL, 1);
+                                       tidcnt++;
+                               } else {
+                                       tidlist[pairidx] |=
+                                               EXP_TID_SET(CTRL, 2);
+                                       pair_size = 0;
+                                       pairidx++;
+                               }
+                       }
+                       /*
+                        * We've programmed the entire group (or as much of the
+                        * group as we'll use). Now, it's time to push it out...
+                        */
+                       flush_wc();
+               }
+               mapped += pinned;
+               atomic_set(&uctxt->tidcursor,
+                          (((useidx & 0xffffff) << 16) |
+                           ((bitidx + bits_used) & 0xffffff)));
+       }
+       trace_hfi1_exp_tid_map(uctxt->ctxt, subctxt_fp(fp), 0, uctxt->tidusemap,
+                              uctxt->tidmapcnt);
+
+done:
+       /* If we've mapped anything, copy relevant info to user */
+       if (mapped) {
+               if (copy_to_user((void __user *)(unsigned long)tinfo->tidlist,
+                                tidlist, sizeof(tidlist[0]) * tidcnt)) {
+                       ret = -EFAULT;
+                       goto done;
+               }
+               /* copy TID info to user */
+               if (copy_to_user((void __user *)(unsigned long)tinfo->tidmap,
+                                tidmap, sizeof(tidmap[0]) * uctxt->tidmapcnt))
+                       ret = -EFAULT;
+       }
+bail:
+       /*
+        * Calculate mapped length. New Exp TID protocol does not "unwind" and
+        * report an error if it can't map the entire buffer. It just reports
+        * the length that was mapped.
+        */
+       tinfo->length = mapped * PAGE_SIZE;
+       tinfo->tidcnt = tidcnt;
+       return ret;
+}
+
+static int exp_tid_free(struct file *fp, struct hfi1_tid_info *tinfo)
+{
+       struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
+       struct hfi1_devdata *dd = uctxt->dd;
+       unsigned long tidmap[uctxt->tidmapcnt];
+       struct page **pages;
+       dma_addr_t *phys;
+       u16 idx, bitidx, tid;
+       int ret = 0;
+
+       if (copy_from_user(&tidmap, (void __user *)(unsigned long)
+                          tinfo->tidmap,
+                          sizeof(tidmap[0]) * uctxt->tidmapcnt)) {
+               ret = -EFAULT;
+               goto done;
+       }
+       for (idx = 0; idx < uctxt->tidmapcnt; idx++) {
+               unsigned long map;
+
+               bitidx = 0;
+               if (!tidmap[idx])
+                       continue;
+               map = tidmap[idx];
+               while ((bitidx = tzcnt(map)) < BITS_PER_LONG) {
+                       int i, pcount = 0;
+                       struct page *pshadow[dd->rcv_entries.group_size];
+                       unsigned offset = ((idx * BITS_PER_LONG) + bitidx) *
+                               dd->rcv_entries.group_size;
+
+                       pages = uctxt->tid_pg_list + offset;
+                       phys = uctxt->physshadow + offset;
+                       tid = uctxt->expected_base + offset;
+                       for (i = 0; i < dd->rcv_entries.group_size;
+                            i++, tid++) {
+                               if (pages[i]) {
+                                       hfi1_put_tid(dd, tid, PT_INVALID,
+                                                     0, 0);
+                                       trace_hfi1_exp_rcv_free(uctxt->ctxt,
+                                                               subctxt_fp(fp),
+                                                               tid, phys[i],
+                                                               pages[i]);
+                                       pci_unmap_page(dd->pcidev, phys[i],
+                                             PAGE_SIZE, PCI_DMA_FROMDEVICE);
+                                       pshadow[pcount] = pages[i];
+                                       pages[i] = NULL;
+                                       pcount++;
+                                       phys[i] = 0;
+                               }
+                       }
+                       flush_wc();
+                       hfi1_release_user_pages(pshadow, pcount);
+                       clear_bit(bitidx, &uctxt->tidusemap[idx]);
+                       map &= ~(1ULL<<bitidx);
+               }
+       }
+       trace_hfi1_exp_tid_map(uctxt->ctxt, subctxt_fp(fp), 1, uctxt->tidusemap,
+                              uctxt->tidmapcnt);
+done:
+       return ret;
+}
+
+static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt)
+{
+       struct hfi1_devdata *dd = uctxt->dd;
+       unsigned tid;
+
+       dd_dev_info(dd, "ctxt %u unlocking any locked expTID pages\n",
+                   uctxt->ctxt);
+       for (tid = 0; tid < uctxt->expected_count; tid++) {
+               struct page *p = uctxt->tid_pg_list[tid];
+               dma_addr_t phys;
+
+               if (!p)
+                       continue;
+
+               phys = uctxt->physshadow[tid];
+               uctxt->physshadow[tid] = 0;
+               uctxt->tid_pg_list[tid] = NULL;
+               pci_unmap_page(dd->pcidev, phys, PAGE_SIZE, PCI_DMA_FROMDEVICE);
+               hfi1_release_user_pages(&p, 1);
+       }
+}
+
+static int set_ctxt_pkey(struct hfi1_ctxtdata *uctxt, unsigned subctxt,
+                        u16 pkey)
+{
+       int ret = -ENOENT, i, intable = 0;
+       struct hfi1_pportdata *ppd = uctxt->ppd;
+       struct hfi1_devdata *dd = uctxt->dd;
+
+       if (pkey == LIM_MGMT_P_KEY || pkey == FULL_MGMT_P_KEY) {
+               ret = -EINVAL;
+               goto done;
+       }
+
+       for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++)
+               if (pkey == ppd->pkeys[i]) {
+                       intable = 1;
+                       break;
+               }
+
+       if (intable)
+               ret = hfi1_set_ctxt_pkey(dd, uctxt->ctxt, pkey);
+done:
+       return ret;
+}
+
+static int ui_open(struct inode *inode, struct file *filp)
+{
+       struct hfi1_devdata *dd;
+
+       dd = container_of(inode->i_cdev, struct hfi1_devdata, ui_cdev);
+       filp->private_data = dd; /* for other methods */
+       return 0;
+}
+
+static int ui_release(struct inode *inode, struct file *filp)
+{
+       /* nothing to do */
+       return 0;
+}
+
+static loff_t ui_lseek(struct file *filp, loff_t offset, int whence)
+{
+       struct hfi1_devdata *dd = filp->private_data;
+
+       switch (whence) {
+       case SEEK_SET:
+               break;
+       case SEEK_CUR:
+               offset += filp->f_pos;
+               break;
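+       /*
+        * The device "file" covers the register BAR plus the 8051 data
+        * memory; SEEK_END offsets are measured back from that combined end.
+        */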
+       case SEEK_END:
+               offset = ((dd->kregend - dd->kregbase) + DC8051_DATA_MEM_SIZE) -
+                       offset;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       if (offset < 0)
+               return -EINVAL;
+
+       if (offset >= (dd->kregend - dd->kregbase) + DC8051_DATA_MEM_SIZE)
+               return -EINVAL;
+
+       filp->f_pos = offset;
+
+       return filp->f_pos;
+}
+
+/* NOTE: assumes unsigned long is 8 bytes */
+static ssize_t ui_read(struct file *filp, char __user *buf, size_t count,
+                       loff_t *f_pos)
+{
+       struct hfi1_devdata *dd = filp->private_data;
+       void __iomem *base = dd->kregbase;
+       unsigned long total, csr_off,
+               barlen = (dd->kregend - dd->kregbase);
+       u64 data;
+
+       /* only read 8 byte quantities */
+       if ((count % 8) != 0)
+               return -EINVAL;
+       /* offset must be 8-byte aligned */
+       if ((*f_pos % 8) != 0)
+               return -EINVAL;
+       /* destination buffer must be 8-byte aligned */
+       if ((unsigned long)buf % 8 != 0)
+               return -EINVAL;
+       /* must be in range */
+       if (*f_pos + count > (barlen + DC8051_DATA_MEM_SIZE))
+               return -EINVAL;
+       /* only set the base if we are not starting past the BAR */
+       if (*f_pos < barlen)
+               base += *f_pos;
+       csr_off = *f_pos;
+       for (total = 0; total < count; total += 8, csr_off += 8) {
+               /* accessing LCB CSRs requires more checks */
+               if (is_lcb_offset(csr_off)) {
+                       if (read_lcb_csr(dd, csr_off, (u64 *)&data))
+                               break; /* failed */
+               }
+               /*
+                * Cannot read ASIC GPIO/QSFP* clear and force CSRs without a
+                * false parity error.  Avoid the whole issue by not reading
+                * them.  These registers are defined as having a read value
+                * of 0.
+                */
+               else if (csr_off == ASIC_GPIO_CLEAR
+                               || csr_off == ASIC_GPIO_FORCE
+                               || csr_off == ASIC_QSFP1_CLEAR
+                               || csr_off == ASIC_QSFP1_FORCE
+                               || csr_off == ASIC_QSFP2_CLEAR
+                               || csr_off == ASIC_QSFP2_FORCE)
+                       data = 0;
+               else if (csr_off >= barlen) {
+                       /*
+                        * read_8051_data can read more than just 8 bytes at
+                        * a time. However, folding this into the loop and
+                        * handling the reads in 8 byte increments allows us
+                        * to smoothly transition from chip memory to 8051
+                        * memory.
+                        */
+                       if (read_8051_data(dd,
+                                          (u32)(csr_off - barlen),
+                                          sizeof(data), &data))
+                               break; /* failed */
+               } else
+                       data = readq(base + total);
+               if (put_user(data, (unsigned long __user *)(buf + total)))
+                       break;
+       }
+       *f_pos += total;
+       return total;
+}
+
+/* NOTE: assumes unsigned long is 8 bytes */
+static ssize_t ui_write(struct file *filp, const char __user *buf,
+                       size_t count, loff_t *f_pos)
+{
+       struct hfi1_devdata *dd = filp->private_data;
+       void __iomem *base;
+       unsigned long total, data, csr_off;
+       int in_lcb;
+
+       /* only write 8 byte quantities */
+       if ((count % 8) != 0)
+               return -EINVAL;
+       /* offset must be 8-byte aligned */
+       if ((*f_pos % 8) != 0)
+               return -EINVAL;
+       /* source buffer must be 8-byte aligned */
+       if ((unsigned long)buf % 8 != 0)
+               return -EINVAL;
+       /* must be in range */
+       if (*f_pos + count > dd->kregend - dd->kregbase)
+               return -EINVAL;
+
+       base = (void __iomem *)dd->kregbase + *f_pos;
+       csr_off = *f_pos;
+       in_lcb = 0;
+       for (total = 0; total < count; total += 8, csr_off += 8) {
+               if (get_user(data, (unsigned long __user *)(buf + total)))
+                       break;
+               /* accessing LCB CSRs requires a special procedure */
+               if (is_lcb_offset(csr_off)) {
+                       if (!in_lcb) {
+                               int ret = acquire_lcb_access(dd, 1);
+
+                               if (ret)
+                                       break;
+                               in_lcb = 1;
+                       }
+               } else {
+                       if (in_lcb) {
+                               release_lcb_access(dd, 1);
+                               in_lcb = 0;
+                       }
+               }
+               writeq(data, base + total);
+       }
+       if (in_lcb)
+               release_lcb_access(dd, 1);
+       *f_pos += total;
+       return total;
+}
+
+static const struct file_operations ui_file_ops = {
+       .owner = THIS_MODULE,
+       .llseek = ui_lseek,
+       .read = ui_read,
+       .write = ui_write,
+       .open = ui_open,
+       .release = ui_release,
+};
+#define UI_OFFSET 192  /* device minor offset for UI devices */
+static int create_ui = 1;
+
+static struct cdev wildcard_cdev;
+static struct device *wildcard_device;
+
+static atomic_t user_count = ATOMIC_INIT(0);
+
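+/*
+ * The wildcard cdev is shared by all units: the first user_add() creates it
+ * and the last user_remove() tears it down, tracked via user_count.
+ */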
+static void user_remove(struct hfi1_devdata *dd)
+{
+       if (atomic_dec_return(&user_count) == 0)
+               hfi1_cdev_cleanup(&wildcard_cdev, &wildcard_device);
+
+       hfi1_cdev_cleanup(&dd->user_cdev, &dd->user_device);
+       hfi1_cdev_cleanup(&dd->ui_cdev, &dd->ui_device);
+}
+
+static int user_add(struct hfi1_devdata *dd)
+{
+       char name[10];
+       int ret;
+
+       if (atomic_inc_return(&user_count) == 1) {
+               ret = hfi1_cdev_init(0, class_name(), &hfi1_file_ops,
+                                    &wildcard_cdev, &wildcard_device);
+               if (ret)
+                       goto done;
+       }
+
+       snprintf(name, sizeof(name), "%s_%d", class_name(), dd->unit);
+       ret = hfi1_cdev_init(dd->unit + 1, name, &hfi1_file_ops,
+                            &dd->user_cdev, &dd->user_device);
+       if (ret)
+               goto done;
+
+       if (create_ui) {
+               snprintf(name, sizeof(name),
+                        "%s_ui%d", class_name(), dd->unit);
+               ret = hfi1_cdev_init(dd->unit + UI_OFFSET, name, &ui_file_ops,
+                                    &dd->ui_cdev, &dd->ui_device);
+               if (ret)
+                       goto done;
+       }
+
+       return 0;
+done:
+       user_remove(dd);
+       return ret;
+}
+
+/*
+ * Create per-unit files in /dev
+ */
+int hfi1_device_create(struct hfi1_devdata *dd)
+{
+       int r, ret;
+
+       r = user_add(dd);
+       ret = hfi1_diag_add(dd);
+       if (r && !ret)
+               ret = r;
+       return ret;
+}
+
+/*
+ * Remove per-unit files in /dev
+ * void, core kernel returns no errors for this stuff
+ */
+void hfi1_device_remove(struct hfi1_devdata *dd)
+{
+       user_remove(dd);
+       hfi1_diag_remove(dd);
+}
diff --git a/drivers/staging/rdma/hfi1/firmware.c b/drivers/staging/rdma/hfi1/firmware.c
new file mode 100644 (file)
index 0000000..5c2f2ed
--- /dev/null
@@ -0,0 +1,1620 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/firmware.h>
+#include <linux/mutex.h>
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <linux/crc32.h>
+
+#include "hfi.h"
+#include "trace.h"
+
+/*
+ * Make it easy to toggle the firmware file names and whether they get
+ * loaded by editing the following. This may be something we do while in
+ * development but not necessarily something a user would ever need to use.
+ */
+#define DEFAULT_FW_8051_NAME_FPGA "hfi_dc8051.bin"
+#define DEFAULT_FW_8051_NAME_ASIC "hfi1_dc8051.fw"
+#define DEFAULT_FW_FABRIC_NAME "hfi1_fabric.fw"
+#define DEFAULT_FW_SBUS_NAME "hfi1_sbus.fw"
+#define DEFAULT_FW_PCIE_NAME "hfi1_pcie.fw"
+#define DEFAULT_PLATFORM_CONFIG_NAME "hfi1_platform.dat"
+
+static uint fw_8051_load = 1;
+static uint fw_fabric_serdes_load = 1;
+static uint fw_pcie_serdes_load = 1;
+static uint fw_sbus_load = 1;
+static uint platform_config_load = 1;
+
+/* Firmware file names get set in hfi1_firmware_init() based on the above */
+static char *fw_8051_name;
+static char *fw_fabric_serdes_name;
+static char *fw_sbus_name;
+static char *fw_pcie_serdes_name;
+static char *platform_config_name;
+
+#define SBUS_MAX_POLL_COUNT 100
+#define SBUS_COUNTER(reg, name) \
+       (((reg) >> ASIC_STS_SBUS_COUNTERS_##name##_CNT_SHIFT) & \
+        ASIC_STS_SBUS_COUNTERS_##name##_CNT_MASK)
+
+/*
+ * Firmware security header.
+ */
+struct css_header {
+       u32 module_type;
+       u32 header_len;
+       u32 header_version;
+       u32 module_id;
+       u32 module_vendor;
+       u32 date;               /* BCD yyyymmdd */
+       u32 size;               /* in DWORDs */
+       u32 key_size;           /* in DWORDs */
+       u32 modulus_size;       /* in DWORDs */
+       u32 exponent_size;      /* in DWORDs */
+       u32 reserved[22];
+};
+/* expected field values */
+#define CSS_MODULE_TYPE           0x00000006
+#define CSS_HEADER_LEN    0x000000a1
+#define CSS_HEADER_VERSION 0x00010000
+#define CSS_MODULE_VENDOR  0x00008086
+
+#define KEY_SIZE      256
+#define MU_SIZE                8
+#define EXPONENT_SIZE  4
+
+/* the file itself */
+struct firmware_file {
+       struct css_header css_header;
+       u8 modulus[KEY_SIZE];
+       u8 exponent[EXPONENT_SIZE];
+       u8 signature[KEY_SIZE];
+       u8 firmware[];
+};
+
+struct augmented_firmware_file {
+       struct css_header css_header;
+       u8 modulus[KEY_SIZE];
+       u8 exponent[EXPONENT_SIZE];
+       u8 signature[KEY_SIZE];
+       u8 r2[KEY_SIZE];
+       u8 mu[MU_SIZE];
+       u8 firmware[];
+};
+
+/* augmented file size difference */
+#define AUGMENT_SIZE (sizeof(struct augmented_firmware_file) - \
+                                               sizeof(struct firmware_file))
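+/* i.e. the added r2 and mu fields: KEY_SIZE + MU_SIZE = 264 bytes */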
+
+struct firmware_details {
+       /* Linux core piece */
+       const struct firmware *fw;
+
+       struct css_header *css_header;
+       u8 *firmware_ptr;               /* pointer to binary data */
+       u32 firmware_len;               /* length in bytes */
+       u8 *modulus;                    /* pointer to the modulus */
+       u8 *exponent;                   /* pointer to the exponent */
+       u8 *signature;                  /* pointer to the signature */
+       u8 *r2;                         /* pointer to r2 */
+       u8 *mu;                         /* pointer to mu */
+       struct augmented_firmware_file dummy_header;
+};
+
+/*
+ * The mutex protects fw_state, fw_err, and all of the firmware_details
+ * variables.
+ */
+static DEFINE_MUTEX(fw_mutex);
+enum fw_state {
+       FW_EMPTY,
+       FW_ACQUIRED,
+       FW_ERR
+};
+static enum fw_state fw_state = FW_EMPTY;
+static int fw_err;
+static struct firmware_details fw_8051;
+static struct firmware_details fw_fabric;
+static struct firmware_details fw_pcie;
+static struct firmware_details fw_sbus;
+static const struct firmware *platform_config;
+
+/* flags for turn_off_spicos() */
+#define SPICO_SBUS   0x1
+#define SPICO_FABRIC 0x2
+#define ENABLE_SPICO_SMASK 0x1
+
+/* security block commands */
+#define RSA_CMD_INIT  0x1
+#define RSA_CMD_START 0x2
+
+/* security block status */
+#define RSA_STATUS_IDLE   0x0
+#define RSA_STATUS_ACTIVE 0x1
+#define RSA_STATUS_DONE   0x2
+#define RSA_STATUS_FAILED 0x3
+
+/* RSA engine timeout, in ms */
+#define RSA_ENGINE_TIMEOUT 100 /* ms */
+
+/* hardware mutex timeout, in ms */
+#define HM_TIMEOUT 4000 /* 4 s */
+
+/* 8051 memory access timeout, in us */
+#define DC8051_ACCESS_TIMEOUT 100 /* us */
+
+/* the number of fabric SerDes on the SBus */
+#define NUM_FABRIC_SERDES 4
+
+/* SBus fabric SerDes addresses, one set per HFI */
+static const u8 fabric_serdes_addrs[2][NUM_FABRIC_SERDES] = {
+       { 0x01, 0x02, 0x03, 0x04 },
+       { 0x28, 0x29, 0x2a, 0x2b }
+};
+
+/* SBus PCIe SerDes addresses, one set per HFI */
+static const u8 pcie_serdes_addrs[2][NUM_PCIE_SERDES] = {
+       { 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16,
+         0x18, 0x1a, 0x1c, 0x1e, 0x20, 0x22, 0x24, 0x26 },
+       { 0x2f, 0x31, 0x33, 0x35, 0x37, 0x39, 0x3b, 0x3d,
+         0x3f, 0x41, 0x43, 0x45, 0x47, 0x49, 0x4b, 0x4d }
+};
+
+/* SBus PCIe PCS addresses, one set per HFI */
+const u8 pcie_pcs_addrs[2][NUM_PCIE_SERDES] = {
+       { 0x09, 0x0b, 0x0d, 0x0f, 0x11, 0x13, 0x15, 0x17,
+         0x19, 0x1b, 0x1d, 0x1f, 0x21, 0x23, 0x25, 0x27 },
+       { 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e,
+         0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e }
+};
+
+/* SBus fabric SerDes broadcast addresses, one per HFI */
+static const u8 fabric_serdes_broadcast[2] = { 0xe4, 0xe5 };
+static const u8 all_fabric_serdes_broadcast = 0xe1;
+
+/* SBus PCIe SerDes broadcast addresses, one per HFI */
+const u8 pcie_serdes_broadcast[2] = { 0xe2, 0xe3 };
+static const u8 all_pcie_serdes_broadcast = 0xe0;
+
+/* forwards */
+static void dispose_one_firmware(struct firmware_details *fdet);
+
+/*
+ * Read a single 64-bit value from 8051 data memory.
+ *
+ * Expects:
+ * o caller to have already set up data read, no auto increment
+ * o caller to turn off read enable when finished
+ *
+ * The address argument is a byte offset.  Bits 0:2 in the address are
+ * ignored - i.e. the hardware will always do aligned 8-byte reads as if
+ * the lower bits are zero.
+ *
+ * Return 0 on success, -ENXIO on a read error (timeout).
+ */
+static int __read_8051_data(struct hfi1_devdata *dd, u32 addr, u64 *result)
+{
+       u64 reg;
+       int count;
+
+       /* start the read at the given address */
+       reg = ((addr & DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_MASK)
+                       << DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_SHIFT)
+               | DC_DC8051_CFG_RAM_ACCESS_CTRL_READ_ENA_SMASK;
+       write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, reg);
+
+       /* wait until ACCESS_COMPLETED is set */
+       count = 0;
+       while ((read_csr(dd, DC_DC8051_CFG_RAM_ACCESS_STATUS)
+                   & DC_DC8051_CFG_RAM_ACCESS_STATUS_ACCESS_COMPLETED_SMASK)
+                   == 0) {
+               count++;
+               if (count > DC8051_ACCESS_TIMEOUT) {
+                       dd_dev_err(dd, "timeout reading 8051 data\n");
+                       return -ENXIO;
+               }
+               ndelay(10);
+       }
+
+       /* gather the data */
+       *result = read_csr(dd, DC_DC8051_CFG_RAM_ACCESS_RD_DATA);
+
+       return 0;
+}
+
+/*
+ * Read 8051 data starting at addr, for len bytes.  Will read in 8-byte chunks.
+ * Return 0 on success, -errno on error.
+ */
+int read_8051_data(struct hfi1_devdata *dd, u32 addr, u32 len, u64 *result)
+{
+       unsigned long flags;
+       u32 done;
+       int ret = 0;
+
+       spin_lock_irqsave(&dd->dc8051_memlock, flags);
+
+       /* data read set-up, no auto-increment */
+       write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_SETUP, 0);
+
+       for (done = 0; done < len; addr += 8, done += 8, result++) {
+               ret = __read_8051_data(dd, addr, result);
+               if (ret)
+                       break;
+       }
+
+       /* turn off read enable */
+       write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, 0);
+
+       spin_unlock_irqrestore(&dd->dc8051_memlock, flags);
+
+       return ret;
+}
+
+/*
+ * Write data or code to the 8051 code or data RAM.
+ */
+static int write_8051(struct hfi1_devdata *dd, int code, u32 start,
+                     const u8 *data, u32 len)
+{
+       u64 reg;
+       u32 offset;
+       int aligned, count;
+
+       /* check alignment */
+       aligned = ((unsigned long)data & 0x7) == 0;
+
+       /* write set-up */
+       reg = (code ? DC_DC8051_CFG_RAM_ACCESS_SETUP_RAM_SEL_SMASK : 0ull)
+               | DC_DC8051_CFG_RAM_ACCESS_SETUP_AUTO_INCR_ADDR_SMASK;
+       write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_SETUP, reg);
+
+       reg = ((start & DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_MASK)
+                       << DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_SHIFT)
+               | DC_DC8051_CFG_RAM_ACCESS_CTRL_WRITE_ENA_SMASK;
+       write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, reg);
+
+       /* write */
+       for (offset = 0; offset < len; offset += 8) {
+               int bytes = len - offset;
+
+               if (bytes < 8) {
+                       reg = 0;
+                       memcpy(&reg, &data[offset], bytes);
+               } else if (aligned) {
+                       reg = *(u64 *)&data[offset];
+               } else {
+                       memcpy(&reg, &data[offset], 8);
+               }
+               write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_WR_DATA, reg);
+
+               /* wait until ACCESS_COMPLETED is set */
+               count = 0;
+               while ((read_csr(dd, DC_DC8051_CFG_RAM_ACCESS_STATUS)
+                   & DC_DC8051_CFG_RAM_ACCESS_STATUS_ACCESS_COMPLETED_SMASK)
+                   == 0) {
+                       count++;
+                       if (count > DC8051_ACCESS_TIMEOUT) {
+                               dd_dev_err(dd, "timeout writing 8051 data\n");
+                               return -ENXIO;
+                       }
+                       udelay(1);
+               }
+       }
+
+       /* turn off write access, auto increment (also sets to data access) */
+       write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, 0);
+       write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_SETUP, 0);
+
+       return 0;
+}
+
+/* return 0 if values match, non-zero and complain otherwise */
+static int invalid_header(struct hfi1_devdata *dd, const char *what,
+                         u32 actual, u32 expected)
+{
+       if (actual == expected)
+               return 0;
+
+       dd_dev_err(dd,
+               "invalid firmware header field %s: expected 0x%x, actual 0x%x\n",
+               what, expected, actual);
+       return 1;
+}
+
+/*
+ * Verify that the static fields in the CSS header match.
+ */
+static int verify_css_header(struct hfi1_devdata *dd, struct css_header *css)
+{
+       /* verify CSS header fields (most sizes are in DW, so add /4) */
+       if (invalid_header(dd, "module_type", css->module_type, CSS_MODULE_TYPE)
+                       || invalid_header(dd, "header_len", css->header_len,
+                                       (sizeof(struct firmware_file)/4))
+                       || invalid_header(dd, "header_version",
+                                       css->header_version, CSS_HEADER_VERSION)
+                       || invalid_header(dd, "module_vendor",
+                                       css->module_vendor, CSS_MODULE_VENDOR)
+                       || invalid_header(dd, "key_size",
+                                       css->key_size, KEY_SIZE/4)
+                       || invalid_header(dd, "modulus_size",
+                                       css->modulus_size, KEY_SIZE/4)
+                       || invalid_header(dd, "exponent_size",
+                                       css->exponent_size, EXPONENT_SIZE/4)) {
+               return -EINVAL;
+       }
+       return 0;
+}
+
+/*
+ * Make sure there are at least some bytes after the prefix.
+ */
+static int payload_check(struct hfi1_devdata *dd, const char *name,
+                        long file_size, long prefix_size)
+{
+       /* make sure we have some payload */
+       if (prefix_size >= file_size) {
+               dd_dev_err(dd,
+                       "firmware \"%s\", size %ld, must be larger than %ld bytes\n",
+                       name, file_size, prefix_size);
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+/*
+ * Request the firmware from the system.  Extract the pieces and fill in
+ * fdet.  If successful, the caller will need to call dispose_one_firmware().
+ * Returns 0 on success, -ERRNO on error.
+ */
+static int obtain_one_firmware(struct hfi1_devdata *dd, const char *name,
+                              struct firmware_details *fdet)
+{
+       struct css_header *css;
+       int ret;
+
+       memset(fdet, 0, sizeof(*fdet));
+
+       ret = request_firmware(&fdet->fw, name, &dd->pcidev->dev);
+       if (ret) {
+               dd_dev_err(dd, "cannot load firmware \"%s\", err %d\n",
+                       name, ret);
+               return ret;
+       }
+
+       /* verify the firmware */
+       if (fdet->fw->size < sizeof(struct css_header)) {
+               dd_dev_err(dd, "firmware \"%s\" is too small\n", name);
+               ret = -EINVAL;
+               goto done;
+       }
+       css = (struct css_header *)fdet->fw->data;
+
+       hfi1_cdbg(FIRMWARE, "Firmware %s details:", name);
+       hfi1_cdbg(FIRMWARE, "file size: 0x%lx bytes", fdet->fw->size);
+       hfi1_cdbg(FIRMWARE, "CSS structure:");
+       hfi1_cdbg(FIRMWARE, "  module_type    0x%x", css->module_type);
+       hfi1_cdbg(FIRMWARE, "  header_len     0x%03x (0x%03x bytes)",
+                 css->header_len, 4 * css->header_len);
+       hfi1_cdbg(FIRMWARE, "  header_version 0x%x", css->header_version);
+       hfi1_cdbg(FIRMWARE, "  module_id      0x%x", css->module_id);
+       hfi1_cdbg(FIRMWARE, "  module_vendor  0x%x", css->module_vendor);
+       hfi1_cdbg(FIRMWARE, "  date           0x%x", css->date);
+       hfi1_cdbg(FIRMWARE, "  size           0x%03x (0x%03x bytes)",
+                 css->size, 4 * css->size);
+       hfi1_cdbg(FIRMWARE, "  key_size       0x%03x (0x%03x bytes)",
+                 css->key_size, 4 * css->key_size);
+       hfi1_cdbg(FIRMWARE, "  modulus_size   0x%03x (0x%03x bytes)",
+                 css->modulus_size, 4 * css->modulus_size);
+       hfi1_cdbg(FIRMWARE, "  exponent_size  0x%03x (0x%03x bytes)",
+                 css->exponent_size, 4 * css->exponent_size);
+       hfi1_cdbg(FIRMWARE, "firmware size: 0x%lx bytes",
+                 fdet->fw->size - sizeof(struct firmware_file));
+
+       /*
+        * If the file does not have a valid CSS header, fail.
+        * Otherwise, check the CSS size field for an expected size.
+        * The augmented file has r2 and mu inserted after the header
+        * was generated, so there will be a known difference between
+        * the CSS header size and the actual file size.  Use this
+        * difference to identify an augmented file.
+        *
+        * Note: css->size is in DWORDs, multiply by 4 to get bytes.
+        */
+       ret = verify_css_header(dd, css);
+       if (ret) {
+               dd_dev_info(dd, "Invalid CSS header for \"%s\"\n", name);
+       } else if ((css->size*4) == fdet->fw->size) {
+               /* non-augmented firmware file */
+               struct firmware_file *ff = (struct firmware_file *)
+                                                       fdet->fw->data;
+
+               /* make sure there are bytes in the payload */
+               ret = payload_check(dd, name, fdet->fw->size,
+                                               sizeof(struct firmware_file));
+               if (ret == 0) {
+                       fdet->css_header = css;
+                       fdet->modulus = ff->modulus;
+                       fdet->exponent = ff->exponent;
+                       fdet->signature = ff->signature;
+                       fdet->r2 = fdet->dummy_header.r2; /* use dummy space */
+                       fdet->mu = fdet->dummy_header.mu; /* use dummy space */
+                       fdet->firmware_ptr = ff->firmware;
+                       fdet->firmware_len = fdet->fw->size -
+                                               sizeof(struct firmware_file);
+                       /*
+                        * Header does not include r2 and mu - generate here.
+                        * For now, fail.
+                        */
+                       dd_dev_err(dd, "driver is unable to validate firmware without r2 and mu (not in firmware file)\n");
+                       ret = -EINVAL;
+               }
+       } else if ((css->size*4) + AUGMENT_SIZE == fdet->fw->size) {
+               /* augmented firmware file */
+               struct augmented_firmware_file *aff =
+                       (struct augmented_firmware_file *)fdet->fw->data;
+
+               /* make sure there are bytes in the payload */
+               ret = payload_check(dd, name, fdet->fw->size,
+                                       sizeof(struct augmented_firmware_file));
+               if (ret == 0) {
+                       fdet->css_header = css;
+                       fdet->modulus = aff->modulus;
+                       fdet->exponent = aff->exponent;
+                       fdet->signature = aff->signature;
+                       fdet->r2 = aff->r2;
+                       fdet->mu = aff->mu;
+                       fdet->firmware_ptr = aff->firmware;
+                       fdet->firmware_len = fdet->fw->size -
+                                       sizeof(struct augmented_firmware_file);
+               }
+       } else {
+               /* css->size check failed */
+               dd_dev_err(dd,
+                       "invalid firmware header field size: expected 0x%lx or 0x%lx, actual 0x%x\n",
+                       fdet->fw->size/4, (fdet->fw->size - AUGMENT_SIZE)/4,
+                       css->size);
+
+               ret = -EINVAL;
+       }
+
+done:
+       /* if returning an error, clean up after ourselves */
+       if (ret)
+               dispose_one_firmware(fdet);
+       return ret;
+}
+
+static void dispose_one_firmware(struct firmware_details *fdet)
+{
+       release_firmware(fdet->fw);
+       fdet->fw = NULL;
+}
+
+/*
+ * Called by all HFIs when loading their firmware - i.e. device probe time.
+ * The first one will do the actual firmware load.  Use a mutex to resolve
+ * any possible race condition.
+ *
+ * The call to this routine cannot be moved to driver load because the kernel
+ * call request_firmware() requires a device which is only available after
+ * the first device probe.
+ */
+static int obtain_firmware(struct hfi1_devdata *dd)
+{
+       int err = 0;
+
+       mutex_lock(&fw_mutex);
+       if (fw_state == FW_ACQUIRED) {
+               goto done;      /* already acquired */
+       } else if (fw_state == FW_ERR) {
+               err = fw_err;
+               goto done;      /* already tried and failed */
+       }
+
+       if (fw_8051_load) {
+               err = obtain_one_firmware(dd, fw_8051_name, &fw_8051);
+               if (err)
+                       goto done;
+       }
+
+       if (fw_fabric_serdes_load) {
+               err = obtain_one_firmware(dd, fw_fabric_serdes_name,
+                       &fw_fabric);
+               if (err)
+                       goto done;
+       }
+
+       if (fw_sbus_load) {
+               err = obtain_one_firmware(dd, fw_sbus_name, &fw_sbus);
+               if (err)
+                       goto done;
+       }
+
+       if (fw_pcie_serdes_load) {
+               err = obtain_one_firmware(dd, fw_pcie_serdes_name, &fw_pcie);
+               if (err)
+                       goto done;
+       }
+
+       if (platform_config_load) {
+               platform_config = NULL;
+               err = request_firmware(&platform_config, platform_config_name,
+                                               &dd->pcidev->dev);
+               if (err) {
+                       err = 0;
+                       platform_config = NULL;
+               }
+       }
+
+       /* success */
+       fw_state = FW_ACQUIRED;
+
+done:
+       if (err) {
+               fw_err = err;
+               fw_state = FW_ERR;
+       }
+       mutex_unlock(&fw_mutex);
+
+       return err;
+}
+
+/*
+ * Called when the driver unloads.  The timing is asymmetric with its
+ * counterpart, obtain_firmware().  If called at device remove time,
+ * then it is conceivable that another device could probe while the
+ * firmware is being disposed.  The mutexes can be moved to do that
+ * safely, but then the firmware would be requested from the OS multiple
+ * times.
+ *
+ * No mutex is needed as the driver is unloading and there cannot be any
+ * other callers.
+ */
+void dispose_firmware(void)
+{
+       dispose_one_firmware(&fw_8051);
+       dispose_one_firmware(&fw_fabric);
+       dispose_one_firmware(&fw_pcie);
+       dispose_one_firmware(&fw_sbus);
+
+       release_firmware(platform_config);
+       platform_config = NULL;
+
+       /* retain the error state, otherwise revert to empty */
+       if (fw_state != FW_ERR)
+               fw_state = FW_EMPTY;
+}
+
+/*
+ * Write a block of data to a given array CSR.  All calls will be in
+ * multiples of 8 bytes.
+ */
+static void write_rsa_data(struct hfi1_devdata *dd, int what,
+                          const u8 *data, int nbytes)
+{
+       int qw_size = nbytes/8;
+       int i;
+
+       if (((unsigned long)data & 0x7) == 0) {
+               /* aligned */
+               u64 *ptr = (u64 *)data;
+
+               for (i = 0; i < qw_size; i++, ptr++)
+                       write_csr(dd, what + (8*i), *ptr);
+       } else {
+               /* not aligned */
+               for (i = 0; i < qw_size; i++, data += 8) {
+                       u64 value;
+
+                       memcpy(&value, data, 8);
+                       write_csr(dd, what + (8*i), value);
+               }
+       }
+}
+
+/*
+ * Write a block of data to a given CSR as a stream of writes.  All calls will
+ * be in multiples of 8 bytes.
+ */
+static void write_streamed_rsa_data(struct hfi1_devdata *dd, int what,
+                                   const u8 *data, int nbytes)
+{
+       u64 *ptr = (u64 *)data;
+       int qw_size = nbytes/8;
+
+       for (; qw_size > 0; qw_size--, ptr++)
+               write_csr(dd, what, *ptr);
+}
+
+/*
+ * Download the signature and start the RSA mechanism.  Wait for
+ * RSA_ENGINE_TIMEOUT before giving up.
+ */
+static int run_rsa(struct hfi1_devdata *dd, const char *who,
+                  const u8 *signature)
+{
+       unsigned long timeout;
+       u64 reg;
+       u32 status;
+       int ret = 0;
+
+       /* write the signature */
+       write_rsa_data(dd, MISC_CFG_RSA_SIGNATURE, signature, KEY_SIZE);
+
+       /* initialize RSA */
+       write_csr(dd, MISC_CFG_RSA_CMD, RSA_CMD_INIT);
+
+       /*
+        * Make sure the engine is idle and insert a delay between the two
+        * writes to MISC_CFG_RSA_CMD.
+        */
+       status = (read_csr(dd, MISC_CFG_FW_CTRL)
+                          & MISC_CFG_FW_CTRL_RSA_STATUS_SMASK)
+                            >> MISC_CFG_FW_CTRL_RSA_STATUS_SHIFT;
+       if (status != RSA_STATUS_IDLE) {
+               dd_dev_err(dd, "%s security engine not idle - giving up\n",
+                       who);
+               return -EBUSY;
+       }
+
+       /* start RSA */
+       write_csr(dd, MISC_CFG_RSA_CMD, RSA_CMD_START);
+
+       /*
+        * Look for the result.
+        *
+        * The RSA engine is hooked up to two MISC errors.  The driver
+        * masks these errors as they do not respond to the standard
+        * error "clear down" mechanism.  Look for these errors here and
+        * clear them when possible.  This routine will exit with the
+        * errors of the current run still set.
+        *
+        * MISC_FW_AUTH_FAILED_ERR
+        *      Firmware authorization failed.  This can be cleared by
+        *      re-initializing the RSA engine, then clearing the status bit.
+        *      Do not re-init the RSA engine immediately after a successful
+        *      run - this will reset the current authorization.
+        *
+        * MISC_KEY_MISMATCH_ERR
+        *      Key does not match.  The only way to clear this is to load
+        *      a matching key then clear the status bit.  If this error
+        *      is raised, it will persist outside of this routine until a
+        *      matching key is loaded.
+        */
+       timeout = msecs_to_jiffies(RSA_ENGINE_TIMEOUT) + jiffies;
+       while (1) {
+               status = (read_csr(dd, MISC_CFG_FW_CTRL)
+                          & MISC_CFG_FW_CTRL_RSA_STATUS_SMASK)
+                            >> MISC_CFG_FW_CTRL_RSA_STATUS_SHIFT;
+
+               if (status == RSA_STATUS_IDLE) {
+                       /* should not happen */
+                       dd_dev_err(dd, "%s firmware security bad idle state\n",
+                               who);
+                       ret = -EINVAL;
+                       break;
+               } else if (status == RSA_STATUS_DONE) {
+                       /* finished successfully */
+                       break;
+               } else if (status == RSA_STATUS_FAILED) {
+                       /* finished unsuccessfully */
+                       ret = -EINVAL;
+                       break;
+               }
+               /* else still active */
+
+               if (time_after(jiffies, timeout)) {
+                       /*
+                        * Timed out while active.  We can't reset the engine
+                        * if it is stuck active, but run through the
+                        * error code to see what error bits are set.
+                        */
+                       dd_dev_err(dd, "%s firmware security time out\n", who);
+                       ret = -ETIMEDOUT;
+                       break;
+               }
+
+               msleep(20);
+       }
+
+       /*
+        * Arrive here on success or failure.  Clear all RSA engine
+        * errors.  All current errors will stick - the RSA logic is keeping
+        * error high.  All previous errors will clear - the RSA logic
+        * is not keeping the error high.
+        */
+       write_csr(dd, MISC_ERR_CLEAR,
+                       MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK
+                       | MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK);
+       /*
+        * All that is left are the current errors.  Print failure details,
+        * if any.
+        */
+       reg = read_csr(dd, MISC_ERR_STATUS);
+       if (ret) {
+               if (reg & MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK)
+                       dd_dev_err(dd, "%s firmware authorization failed\n",
+                               who);
+               if (reg & MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK)
+                       dd_dev_err(dd, "%s firmware key mismatch\n", who);
+       }
+
+       return ret;
+}
+
+static void load_security_variables(struct hfi1_devdata *dd,
+                                   struct firmware_details *fdet)
+{
+       /* Security variables a.  Write the modulus */
+       write_rsa_data(dd, MISC_CFG_RSA_MODULUS, fdet->modulus, KEY_SIZE);
+       /* Security variables b.  Write the r2 */
+       write_rsa_data(dd, MISC_CFG_RSA_R2, fdet->r2, KEY_SIZE);
+       /* Security variables c.  Write the mu */
+       write_rsa_data(dd, MISC_CFG_RSA_MU, fdet->mu, MU_SIZE);
+       /* Security variables d.  Write the header */
+       write_streamed_rsa_data(dd, MISC_CFG_SHA_PRELOAD,
+                       (u8 *)fdet->css_header, sizeof(struct css_header));
+}
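+
+/*
+ * Sketch of the common authenticate-and-load pattern used by the loaders
+ * below (see load_8051_firmware(), load_sbus_firmware(), etc. for the real
+ * target-specific steps):
+ *
+ *     load_security_variables(dd, fdet);      // modulus, r2, mu, CSS header
+ *     ...download fdet->firmware_ptr to the target...
+ *     ret = run_rsa(dd, "target", fdet->signature);
+ *     if (ret)
+ *             return ret;                     // authentication failed
+ */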
+
+/* return the 8051 firmware state */
+static inline u32 get_firmware_state(struct hfi1_devdata *dd)
+{
+       u64 reg = read_csr(dd, DC_DC8051_STS_CUR_STATE);
+
+       return (reg >> DC_DC8051_STS_CUR_STATE_FIRMWARE_SHIFT)
+                               & DC_DC8051_STS_CUR_STATE_FIRMWARE_MASK;
+}
+
+/*
+ * Wait until the firmware is up and ready to take host requests.
+ * Return 0 on success, -ETIMEDOUT on timeout.
+ */
+int wait_fm_ready(struct hfi1_devdata *dd, u32 mstimeout)
+{
+       unsigned long timeout;
+
+       /* in the simulator, the fake 8051 is always ready */
+       if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR)
+               return 0;
+
+       timeout = msecs_to_jiffies(mstimeout) + jiffies;
+       while (1) {
+               if (get_firmware_state(dd) == 0xa0)     /* ready */
+                       return 0;
+               if (time_after(jiffies, timeout))       /* timed out */
+                       return -ETIMEDOUT;
+               usleep_range(1950, 2050); /* sleep 2ms-ish */
+       }
+}
+
+/*
+ * Load the 8051 firmware.
+ */
+static int load_8051_firmware(struct hfi1_devdata *dd,
+                             struct firmware_details *fdet)
+{
+       u64 reg;
+       int ret;
+       u8 ver_a, ver_b;
+
+       /*
+        * DC Reset sequence
+        * Load DC 8051 firmware
+        */
+       /*
+        * DC reset step 1: Reset DC8051
+        */
+       reg = DC_DC8051_CFG_RST_M8051W_SMASK
+               | DC_DC8051_CFG_RST_CRAM_SMASK
+               | DC_DC8051_CFG_RST_DRAM_SMASK
+               | DC_DC8051_CFG_RST_IRAM_SMASK
+               | DC_DC8051_CFG_RST_SFR_SMASK;
+       write_csr(dd, DC_DC8051_CFG_RST, reg);
+
+       /*
+        * DC reset step 2 (optional): Load 8051 data memory with link
+        * configuration
+        */
+
+       /*
+        * DC reset step 3: Load DC8051 firmware
+        */
+       /* release all but the core reset */
+       reg = DC_DC8051_CFG_RST_M8051W_SMASK;
+       write_csr(dd, DC_DC8051_CFG_RST, reg);
+
+       /* Firmware load step 1 */
+       load_security_variables(dd, fdet);
+
+       /*
+        * Firmware load step 2.  Clear MISC_CFG_FW_CTRL.FW_8051_LOADED
+        */
+       write_csr(dd, MISC_CFG_FW_CTRL, 0);
+
+       /* Firmware load steps 3-5 */
+       ret = write_8051(dd, 1/*code*/, 0, fdet->firmware_ptr,
+                                                       fdet->firmware_len);
+       if (ret)
+               return ret;
+
+       /*
+        * DC reset step 4. Host starts the DC8051 firmware
+        */
+       /*
+        * Firmware load step 6.  Set MISC_CFG_FW_CTRL.FW_8051_LOADED
+        */
+       write_csr(dd, MISC_CFG_FW_CTRL, MISC_CFG_FW_CTRL_FW_8051_LOADED_SMASK);
+
+       /* Firmware load steps 7-10 */
+       ret = run_rsa(dd, "8051", fdet->signature);
+       if (ret)
+               return ret;
+
+       /* clear all reset bits, releasing the 8051 */
+       write_csr(dd, DC_DC8051_CFG_RST, 0ull);
+
+       /*
+        * DC reset step 5. Wait for firmware to be ready to accept host
+        * requests.
+        */
+       ret = wait_fm_ready(dd, TIMEOUT_8051_START);
+       if (ret) { /* timed out */
+               dd_dev_err(dd, "8051 start timeout, current state 0x%x\n",
+                       get_firmware_state(dd));
+               return -ETIMEDOUT;
+       }
+
+       read_misc_status(dd, &ver_a, &ver_b);
+       dd_dev_info(dd, "8051 firmware version %d.%d\n",
+               (int)ver_b, (int)ver_a);
+       dd->dc8051_ver = dc8051_ver(ver_b, ver_a);
+
+       return 0;
+}
+
+/* SBus Master broadcast address */
+#define SBUS_MASTER_BROADCAST 0xfd
+
+/*
+ * Write the SBus request register
+ *
+ * No need for masking - the arguments are sized exactly.
+ */
+void sbus_request(struct hfi1_devdata *dd,
+                 u8 receiver_addr, u8 data_addr, u8 command, u32 data_in)
+{
+       write_csr(dd, ASIC_CFG_SBUS_REQUEST,
+               ((u64)data_in << ASIC_CFG_SBUS_REQUEST_DATA_IN_SHIFT)
+               | ((u64)command << ASIC_CFG_SBUS_REQUEST_COMMAND_SHIFT)
+               | ((u64)data_addr << ASIC_CFG_SBUS_REQUEST_DATA_ADDR_SHIFT)
+               | ((u64)receiver_addr
+                       << ASIC_CFG_SBUS_REQUEST_RECEIVER_ADDR_SHIFT));
+}
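+
+/*
+ * Example (taken from fabric_serdes_reset() below): placing the fabric
+ * serdes in reset with SPICO disabled is a single request to receiver
+ * register 0x07:
+ *
+ *     sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000011);
+ *
+ * All four arguments are packed into one 64-bit write of
+ * ASIC_CFG_SBUS_REQUEST.
+ */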
+
+/*
+ * Turn off the SBus and fabric serdes spicos.
+ *
+ * + Must be called with SBus fast mode turned on.
+ * + Must be called after fabric serdes broadcast is set up.
+ * + Must be called before the 8051 is loaded - assumes 8051 is not loaded
+ *   when using MISC_CFG_FW_CTRL.
+ */
+static void turn_off_spicos(struct hfi1_devdata *dd, int flags)
+{
+       /* only needed on A0 */
+       if (!is_a0(dd))
+               return;
+
+       dd_dev_info(dd, "Turning off spicos:%s%s\n",
+               flags & SPICO_SBUS ? " SBus" : "",
+               flags & SPICO_FABRIC ? " fabric" : "");
+
+       write_csr(dd, MISC_CFG_FW_CTRL, ENABLE_SPICO_SMASK);
+       /* disable SBus spico */
+       if (flags & SPICO_SBUS)
+               sbus_request(dd, SBUS_MASTER_BROADCAST, 0x01,
+                       WRITE_SBUS_RECEIVER, 0x00000040);
+
+       /* disable the fabric serdes spicos */
+       if (flags & SPICO_FABRIC)
+               sbus_request(dd, fabric_serdes_broadcast[dd->hfi1_id],
+                            0x07, WRITE_SBUS_RECEIVER, 0x00000000);
+       write_csr(dd, MISC_CFG_FW_CTRL, 0);
+}
+
+/*
+ *  Reset all of the fabric serdes for our HFI.
+ */
+void fabric_serdes_reset(struct hfi1_devdata *dd)
+{
+       u8 ra;
+
+       if (dd->icode != ICODE_RTL_SILICON) /* only for RTL */
+               return;
+
+       ra = fabric_serdes_broadcast[dd->hfi1_id];
+
+       acquire_hw_mutex(dd);
+       set_sbus_fast_mode(dd);
+       /* place SerDes in reset and disable SPICO */
+       sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000011);
+       /* wait 100 refclk cycles @ 156.25MHz => 640ns */
+       udelay(1);
+       /* remove SerDes reset */
+       sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000010);
+       /* turn SPICO enable on */
+       sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000002);
+       clear_sbus_fast_mode(dd);
+       release_hw_mutex(dd);
+}
+
+/* Access to the SBus in this routine should probably be serialized */
+int sbus_request_slow(struct hfi1_devdata *dd,
+                     u8 receiver_addr, u8 data_addr, u8 command, u32 data_in)
+{
+       u64 reg, count = 0;
+
+       sbus_request(dd, receiver_addr, data_addr, command, data_in);
+       write_csr(dd, ASIC_CFG_SBUS_EXECUTE,
+                 ASIC_CFG_SBUS_EXECUTE_EXECUTE_SMASK);
+       /* Wait for both DONE and RCV_DATA_VALID to go high */
+       reg = read_csr(dd, ASIC_STS_SBUS_RESULT);
+       while (!((reg & ASIC_STS_SBUS_RESULT_DONE_SMASK) &&
+                (reg & ASIC_STS_SBUS_RESULT_RCV_DATA_VALID_SMASK))) {
+               if (count++ >= SBUS_MAX_POLL_COUNT) {
+                       u64 counts = read_csr(dd, ASIC_STS_SBUS_COUNTERS);
+                       /*
+                        * If the loop has timed out, we are OK if DONE bit
+                        * is set and RCV_DATA_VALID and EXECUTE counters
+                        * are the same. If not, we cannot proceed.
+                        */
+                       if ((reg & ASIC_STS_SBUS_RESULT_DONE_SMASK) &&
+                           (SBUS_COUNTER(counts, RCV_DATA_VALID) ==
+                            SBUS_COUNTER(counts, EXECUTE)))
+                               break;
+                       return -ETIMEDOUT;
+               }
+               udelay(1);
+               reg = read_csr(dd, ASIC_STS_SBUS_RESULT);
+       }
+       count = 0;
+       write_csr(dd, ASIC_CFG_SBUS_EXECUTE, 0);
+       /* Wait for DONE to clear after EXECUTE is cleared */
+       reg = read_csr(dd, ASIC_STS_SBUS_RESULT);
+       while (reg & ASIC_STS_SBUS_RESULT_DONE_SMASK) {
+               if (count++ >= SBUS_MAX_POLL_COUNT)
+                       return -ETIME;
+               udelay(1);
+               reg = read_csr(dd, ASIC_STS_SBUS_RESULT);
+       }
+       return 0;
+}
+
+static int load_fabric_serdes_firmware(struct hfi1_devdata *dd,
+                                      struct firmware_details *fdet)
+{
+       int i, err;
+       const u8 ra = fabric_serdes_broadcast[dd->hfi1_id]; /* receiver addr */
+
+       dd_dev_info(dd, "Downloading fabric firmware\n");
+
+       /* step 1: load security variables */
+       load_security_variables(dd, fdet);
+       /* step 2: place SerDes in reset and disable SPICO */
+       sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000011);
+       /* wait 100 refclk cycles @ 156.25MHz => 640ns */
+       udelay(1);
+       /* step 3:  remove SerDes reset */
+       sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000010);
+       /* step 4: assert IMEM override */
+       sbus_request(dd, ra, 0x00, WRITE_SBUS_RECEIVER, 0x40000000);
+       /* step 5: download SerDes machine code */
+       for (i = 0; i < fdet->firmware_len; i += 4) {
+               sbus_request(dd, ra, 0x0a, WRITE_SBUS_RECEIVER,
+                                       *(u32 *)&fdet->firmware_ptr[i]);
+       }
+       /* step 6: IMEM override off */
+       sbus_request(dd, ra, 0x00, WRITE_SBUS_RECEIVER, 0x00000000);
+       /* step 7: turn ECC on */
+       sbus_request(dd, ra, 0x0b, WRITE_SBUS_RECEIVER, 0x000c0000);
+
+       /* steps 8-11: run the RSA engine */
+       err = run_rsa(dd, "fabric serdes", fdet->signature);
+       if (err)
+               return err;
+
+       /* step 12: turn SPICO enable on */
+       sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000002);
+       /* step 13: enable core hardware interrupts */
+       sbus_request(dd, ra, 0x08, WRITE_SBUS_RECEIVER, 0x00000000);
+
+       return 0;
+}
+
+static int load_sbus_firmware(struct hfi1_devdata *dd,
+                             struct firmware_details *fdet)
+{
+       int i, err;
+       const u8 ra = SBUS_MASTER_BROADCAST; /* receiver address */
+
+       dd_dev_info(dd, "Downloading SBus firmware\n");
+
+       /* step 1: load security variables */
+       load_security_variables(dd, fdet);
+       /* step 2: place SPICO into reset and enable off */
+       sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x000000c0);
+       /* step 3: remove reset, enable off, IMEM_CNTRL_EN on */
+       sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000240);
+       /* step 4: set starting IMEM address for burst download */
+       sbus_request(dd, ra, 0x03, WRITE_SBUS_RECEIVER, 0x80000000);
+       /* step 5: download the SBus Master machine code */
+       for (i = 0; i < fdet->firmware_len; i += 4) {
+               sbus_request(dd, ra, 0x14, WRITE_SBUS_RECEIVER,
+                                       *(u32 *)&fdet->firmware_ptr[i]);
+       }
+       /* step 6: set IMEM_CNTL_EN off */
+       sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000040);
+       /* step 7: turn ECC on */
+       sbus_request(dd, ra, 0x16, WRITE_SBUS_RECEIVER, 0x000c0000);
+
+       /* steps 8-11: run the RSA engine */
+       err = run_rsa(dd, "SBus", fdet->signature);
+       if (err)
+               return err;
+
+       /* step 12: set SPICO_ENABLE on */
+       sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000140);
+
+       return 0;
+}
+
+static int load_pcie_serdes_firmware(struct hfi1_devdata *dd,
+                                    struct firmware_details *fdet)
+{
+       int i;
+       const u8 ra = SBUS_MASTER_BROADCAST; /* receiver address */
+
+       dd_dev_info(dd, "Downloading PCIe firmware\n");
+
+       /* step 1: load security variables */
+       load_security_variables(dd, fdet);
+       /* step 2: assert single step (halts the SBus Master spico) */
+       sbus_request(dd, ra, 0x05, WRITE_SBUS_RECEIVER, 0x00000001);
+       /* step 3: enable XDMEM access */
+       sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000d40);
+       /* step 4: load firmware into SBus Master XDMEM */
+       /*
+        * NOTE: the dmem address, write_en, and wdata are all pre-packed,
+        * we only need to pick up the bytes and write them
+        */
+       for (i = 0; i < fdet->firmware_len; i += 4) {
+               sbus_request(dd, ra, 0x04, WRITE_SBUS_RECEIVER,
+                                       *(u32 *)&fdet->firmware_ptr[i]);
+       }
+       /* step 5: disable XDMEM access */
+       sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000140);
+       /* step 6: allow SBus Spico to run */
+       sbus_request(dd, ra, 0x05, WRITE_SBUS_RECEIVER, 0x00000000);
+
+       /*
+        * steps 7-11: run RSA; if it succeeds, the firmware is available
+        * to be swapped
+        */
+       return run_rsa(dd, "PCIe serdes", fdet->signature);
+}
+
+/*
+ * Set the given broadcast values on the given list of devices.
+ */
+static void set_serdes_broadcast(struct hfi1_devdata *dd, u8 bg1, u8 bg2,
+                                const u8 *addrs, int count)
+{
+       while (--count >= 0) {
+               /*
+                * Set BROADCAST_GROUP_1 and BROADCAST_GROUP_2, leave
+                * defaults for everything else.  Do not read-modify-write,
+                * per instruction from the manufacturer.
+                *
+                * Register 0xfd:
+                *      bits    what
+                *      -----   ---------------------------------
+                *        0     IGNORE_BROADCAST  (default 0)
+                *      11:4    BROADCAST_GROUP_1 (default 0xff)
+                *      23:16   BROADCAST_GROUP_2 (default 0xff)
+                */
+               sbus_request(dd, addrs[count], 0xfd, WRITE_SBUS_RECEIVER,
+                               (u32)bg1 << 4 | (u32)bg2 << 16);
+       }
+}
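+
+/*
+ * Worked example with hypothetical group numbers: for bg1 = 0x11 and
+ * bg2 = 0x22, the value written to receiver register 0xfd above is
+ *
+ *     ((u32)0x11 << 4) | ((u32)0x22 << 16) = 0x00220110
+ *
+ * placing BROADCAST_GROUP_1 in bits 11:4 and BROADCAST_GROUP_2 in bits
+ * 23:16, with IGNORE_BROADCAST (bit 0) left at its default of 0.
+ */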
+
+int acquire_hw_mutex(struct hfi1_devdata *dd)
+{
+       unsigned long timeout;
+       int try = 0;
+       u8 mask = 1 << dd->hfi1_id;
+       u8 user;
+
+retry:
+       timeout = msecs_to_jiffies(HM_TIMEOUT) + jiffies;
+       while (1) {
+               write_csr(dd, ASIC_CFG_MUTEX, mask);
+               user = (u8)read_csr(dd, ASIC_CFG_MUTEX);
+               if (user == mask)
+                       return 0; /* success */
+               if (time_after(jiffies, timeout))
+                       break; /* timed out */
+               msleep(20);
+       }
+
+       /* timed out */
+       dd_dev_err(dd,
+               "Unable to acquire hardware mutex, mutex mask %u, my mask %u (%s)\n",
+               (u32)user, (u32)mask, (try == 0) ? "retrying" : "giving up");
+
+       if (try == 0) {
+               /* break mutex and retry */
+               write_csr(dd, ASIC_CFG_MUTEX, 0);
+               try++;
+               goto retry;
+       }
+
+       return -EBUSY;
+}
+
+void release_hw_mutex(struct hfi1_devdata *dd)
+{
+       write_csr(dd, ASIC_CFG_MUTEX, 0);
+}
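+
+/*
+ * Typical use of the hardware mutex around SBus work, as in
+ * fabric_serdes_reset() and load_firmware():
+ *
+ *     ret = acquire_hw_mutex(dd);
+ *     if (ret)
+ *             return ret;
+ *     set_sbus_fast_mode(dd);
+ *     ...issue sbus_request() calls...
+ *     clear_sbus_fast_mode(dd);
+ *     release_hw_mutex(dd);
+ */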
+
+void set_sbus_fast_mode(struct hfi1_devdata *dd)
+{
+       write_csr(dd, ASIC_CFG_SBUS_EXECUTE,
+                               ASIC_CFG_SBUS_EXECUTE_FAST_MODE_SMASK);
+}
+
+void clear_sbus_fast_mode(struct hfi1_devdata *dd)
+{
+       u64 reg, count = 0;
+
+       reg = read_csr(dd, ASIC_STS_SBUS_COUNTERS);
+       while (SBUS_COUNTER(reg, EXECUTE) !=
+              SBUS_COUNTER(reg, RCV_DATA_VALID)) {
+               if (count++ >= SBUS_MAX_POLL_COUNT)
+                       break;
+               udelay(1);
+               reg = read_csr(dd, ASIC_STS_SBUS_COUNTERS);
+       }
+       write_csr(dd, ASIC_CFG_SBUS_EXECUTE, 0);
+}
+
+int load_firmware(struct hfi1_devdata *dd)
+{
+       int ret;
+
+       if (fw_sbus_load || fw_fabric_serdes_load) {
+               ret = acquire_hw_mutex(dd);
+               if (ret)
+                       return ret;
+
+               set_sbus_fast_mode(dd);
+
+               /*
+                * The SBus contains part of the fabric firmware and so must
+                * also be downloaded.
+                */
+               if (fw_sbus_load) {
+                       turn_off_spicos(dd, SPICO_SBUS);
+                       ret = load_sbus_firmware(dd, &fw_sbus);
+                       if (ret)
+                               goto clear;
+               }
+
+               if (fw_fabric_serdes_load) {
+                       set_serdes_broadcast(dd, all_fabric_serdes_broadcast,
+                                       fabric_serdes_broadcast[dd->hfi1_id],
+                                       fabric_serdes_addrs[dd->hfi1_id],
+                                       NUM_FABRIC_SERDES);
+                       turn_off_spicos(dd, SPICO_FABRIC);
+                       ret = load_fabric_serdes_firmware(dd, &fw_fabric);
+               }
+
+clear:
+               clear_sbus_fast_mode(dd);
+               release_hw_mutex(dd);
+               if (ret)
+                       return ret;
+       }
+
+       if (fw_8051_load) {
+               ret = load_8051_firmware(dd, &fw_8051);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+
+int hfi1_firmware_init(struct hfi1_devdata *dd)
+{
+       /* only RTL can use these */
+       if (dd->icode != ICODE_RTL_SILICON) {
+               fw_fabric_serdes_load = 0;
+               fw_pcie_serdes_load = 0;
+               fw_sbus_load = 0;
+       }
+
+       /* no 8051 or QSFP on simulator */
+       if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR) {
+               fw_8051_load = 0;
+               platform_config_load = 0;
+       }
+
+       if (!fw_8051_name) {
+               if (dd->icode == ICODE_RTL_SILICON)
+                       fw_8051_name = DEFAULT_FW_8051_NAME_ASIC;
+               else
+                       fw_8051_name = DEFAULT_FW_8051_NAME_FPGA;
+       }
+       if (!fw_fabric_serdes_name)
+               fw_fabric_serdes_name = DEFAULT_FW_FABRIC_NAME;
+       if (!fw_sbus_name)
+               fw_sbus_name = DEFAULT_FW_SBUS_NAME;
+       if (!fw_pcie_serdes_name)
+               fw_pcie_serdes_name = DEFAULT_FW_PCIE_NAME;
+       if (!platform_config_name)
+               platform_config_name = DEFAULT_PLATFORM_CONFIG_NAME;
+
+       return obtain_firmware(dd);
+}
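+
+/*
+ * Assumed overall flow (a sketch; the callers live outside this file):
+ * hfi1_firmware_init() runs at device probe time and requests and validates
+ * the firmware images via obtain_firmware(); load_firmware() and
+ * load_pcie_firmware() later push the images to the hardware; and
+ * dispose_firmware() releases everything at driver unload.
+ */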
+
+int parse_platform_config(struct hfi1_devdata *dd)
+{
+       struct platform_config_cache *pcfgcache = &dd->pcfg_cache;
+       u32 *ptr = NULL;
+       u32 header1 = 0, header2 = 0, magic_num = 0, crc = 0;
+       u32 record_idx = 0, table_type = 0, table_length_dwords = 0;
+
+       if (platform_config == NULL) {
+               dd_dev_info(dd, "%s: Missing config file\n", __func__);
+               goto bail;
+       }
+       ptr = (u32 *)platform_config->data;
+
+       magic_num = *ptr;
+       ptr++;
+       if (magic_num != PLATFORM_CONFIG_MAGIC_NUM) {
+               dd_dev_info(dd, "%s: Bad config file\n", __func__);
+               goto bail;
+       }
+
+       while (ptr < (u32 *)(platform_config->data + platform_config->size)) {
+               header1 = *ptr;
+               header2 = *(ptr + 1);
+               if (header1 != ~header2) {
+                       dd_dev_info(dd, "%s: Failed validation at offset %ld\n",
+                               __func__, (ptr - (u32 *)platform_config->data));
+                       goto bail;
+               }
+
+               record_idx = *ptr &
+                       ((1 << PLATFORM_CONFIG_HEADER_RECORD_IDX_LEN_BITS) - 1);
+
+               table_length_dwords = (*ptr >>
+                               PLATFORM_CONFIG_HEADER_TABLE_LENGTH_SHIFT) &
+                     ((1 << PLATFORM_CONFIG_HEADER_TABLE_LENGTH_LEN_BITS) - 1);
+
+               table_type = (*ptr >> PLATFORM_CONFIG_HEADER_TABLE_TYPE_SHIFT) &
+                       ((1 << PLATFORM_CONFIG_HEADER_TABLE_TYPE_LEN_BITS) - 1);
+
+               /* Done with this set of headers */
+               ptr += 2;
+
+               if (record_idx) {
+                       /* data table */
+                       switch (table_type) {
+                       case PLATFORM_CONFIG_SYSTEM_TABLE:
+                               pcfgcache->config_tables[table_type].num_table =
+                                                                       1;
+                               break;
+                       case PLATFORM_CONFIG_PORT_TABLE:
+                               pcfgcache->config_tables[table_type].num_table =
+                                                                       2;
+                               break;
+                       case PLATFORM_CONFIG_RX_PRESET_TABLE:
+                               /* fall through */
+                       case PLATFORM_CONFIG_TX_PRESET_TABLE:
+                               /* fall through */
+                       case PLATFORM_CONFIG_QSFP_ATTEN_TABLE:
+                               /* fall through */
+                       case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE:
+                               pcfgcache->config_tables[table_type].num_table =
+                                                       table_length_dwords;
+                               break;
+                       default:
+                               dd_dev_info(dd,
+                                     "%s: Unknown data table %d, offset %ld\n",
+                                       __func__, table_type,
+                                      (ptr - (u32 *)platform_config->data));
+                               goto bail; /* We don't trust this file now */
+                       }
+                       pcfgcache->config_tables[table_type].table = ptr;
+               } else {
+                       /* metadata table */
+                       switch (table_type) {
+                       case PLATFORM_CONFIG_SYSTEM_TABLE:
+                               /* fall through */
+                       case PLATFORM_CONFIG_PORT_TABLE:
+                               /* fall through */
+                       case PLATFORM_CONFIG_RX_PRESET_TABLE:
+                               /* fall through */
+                       case PLATFORM_CONFIG_TX_PRESET_TABLE:
+                               /* fall through */
+                       case PLATFORM_CONFIG_QSFP_ATTEN_TABLE:
+                               /* fall through */
+                       case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE:
+                               break;
+                       default:
+                               dd_dev_info(dd,
+                                 "%s: Unknown metadata table %d, offset %ld\n",
+                                 __func__, table_type,
+                                 (ptr - (u32 *)platform_config->data));
+                               goto bail; /* We don't trust this file now */
+                       }
+                       pcfgcache->config_tables[table_type].table_metadata =
+                                                                       ptr;
+               }
+
+               /* Calculate and check table crc */
+               crc = crc32_le(~(u32)0, (unsigned char const *)ptr,
+                               (table_length_dwords * 4));
+               crc ^= ~(u32)0;
+
+               /* Jump the table */
+               ptr += table_length_dwords;
+               if (crc != *ptr) {
+                       dd_dev_info(dd, "%s: Failed CRC check at offset %ld\n",
+                               __func__, (ptr - (u32 *)platform_config->data));
+                       goto bail;
+               }
+               /* Jump the CRC DWORD */
+               ptr++;
+       }
+
+       pcfgcache->cache_valid = 1;
+       return 0;
+bail:
+       memset(pcfgcache, 0, sizeof(struct platform_config_cache));
+       return -EINVAL;
+}
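+
+/*
+ * File layout walked by parse_platform_config(), sketched from the checks
+ * above (all quantities are 32-bit dwords):
+ *
+ *     dword 0:  PLATFORM_CONFIG_MAGIC_NUM
+ *     then, for each table:
+ *       header1:  record_idx, table_type and table_length_dwords bit
+ *                 fields (see the *_SHIFT/*_LEN_BITS constants)
+ *       header2:  must equal ~header1 (integrity check)
+ *       table:    table_length_dwords dwords of data or metadata
+ *       crc:      crc32_le() over the table bytes, seeded with ~0 and
+ *                 inverted at the end
+ *
+ * A record_idx of 0 marks a metadata table; non-zero marks a data table.
+ */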
+
+static int get_platform_fw_field_metadata(struct hfi1_devdata *dd, int table,
+               int field, u32 *field_len_bits, u32 *field_start_bits)
+{
+       struct platform_config_cache *pcfgcache = &dd->pcfg_cache;
+       u32 *src_ptr = NULL;
+
+       if (!pcfgcache->cache_valid)
+               return -EINVAL;
+
+       switch (table) {
+       case PLATFORM_CONFIG_SYSTEM_TABLE:
+               /* fall through */
+       case PLATFORM_CONFIG_PORT_TABLE:
+               /* fall through */
+       case PLATFORM_CONFIG_RX_PRESET_TABLE:
+               /* fall through */
+       case PLATFORM_CONFIG_TX_PRESET_TABLE:
+               /* fall through */
+       case PLATFORM_CONFIG_QSFP_ATTEN_TABLE:
+               /* fall through */
+       case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE:
+               if (field && field < platform_config_table_limits[table])
+                       src_ptr =
+                       pcfgcache->config_tables[table].table_metadata + field;
+               break;
+       default:
+               dd_dev_info(dd, "%s: Unknown table\n", __func__);
+               break;
+       }
+
+       if (!src_ptr)
+               return -EINVAL;
+
+       if (field_start_bits)
+               *field_start_bits = *src_ptr &
+                     ((1 << METADATA_TABLE_FIELD_START_LEN_BITS) - 1);
+
+       if (field_len_bits)
+               *field_len_bits = (*src_ptr >> METADATA_TABLE_FIELD_LEN_SHIFT)
+                      & ((1 << METADATA_TABLE_FIELD_LEN_LEN_BITS) - 1);
+
+       return 0;
+}
+
+/* This is the central interface to getting data out of the platform config
+ * file. It depends on parse_platform_config() having populated the
+ * platform_config_cache in hfi1_devdata, and checks the cache_valid member to
+ * validate the sanity of the cache.
+ *
+ * The non-obvious parameters:
+ * @table_index: Acts as a look up key into which instance of the tables the
+ * relevant field is fetched from.
+ *
+ * This applies to the data tables that have multiple instances. The port table
+ * is an exception to this rule as each HFI only has one port and thus the
+ * relevant table can be distinguished by hfi_id.
+ *
+ * @data: pointer to memory that will be populated with the field requested.
+ * @len: length of memory pointed by @data in bytes.
+ */
+int get_platform_config_field(struct hfi1_devdata *dd,
+                       enum platform_config_table_type_encoding table_type,
+                       int table_index, int field_index, u32 *data, u32 len)
+{
+       int ret = 0, wlen = 0, seek = 0;
+       u32 field_len_bits = 0, field_start_bits = 0, *src_ptr = NULL;
+       struct platform_config_cache *pcfgcache = &dd->pcfg_cache;
+
+       if (data)
+               memset(data, 0, len);
+       else
+               return -EINVAL;
+
+       ret = get_platform_fw_field_metadata(dd, table_type, field_index,
+                                       &field_len_bits, &field_start_bits);
+       if (ret)
+               return -EINVAL;
+
+       /* Convert length to bits */
+       len *= 8;
+
+       /* Our metadata function checked cache_valid and field_index for us */
+       switch (table_type) {
+       case PLATFORM_CONFIG_SYSTEM_TABLE:
+               src_ptr = pcfgcache->config_tables[table_type].table;
+
+               if (field_index != SYSTEM_TABLE_QSFP_POWER_CLASS_MAX) {
+                       if (len < field_len_bits)
+                               return -EINVAL;
+
+                       seek = field_start_bits/8;
+                       wlen = field_len_bits/8;
+
+                       src_ptr = (u32 *)((u8 *)src_ptr + seek);
+
+                       /*
+                        * If we get here, the field is expected to be byte
+                        * aligned and a whole number of bytes long.
+                        */
+                       memcpy(data, src_ptr, wlen);
+                       return 0;
+               }
+               break;
+       case PLATFORM_CONFIG_PORT_TABLE:
+               /* Port table is 4 DWORDS in META_VERSION 0 */
+               src_ptr = dd->hfi1_id ?
+                       pcfgcache->config_tables[table_type].table + 4 :
+                       pcfgcache->config_tables[table_type].table;
+               break;
+       case PLATFORM_CONFIG_RX_PRESET_TABLE:
+               /* fall through */
+       case PLATFORM_CONFIG_TX_PRESET_TABLE:
+               /* fall through */
+       case PLATFORM_CONFIG_QSFP_ATTEN_TABLE:
+               /* fall through */
+       case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE:
+               src_ptr = pcfgcache->config_tables[table_type].table;
+
+               if (table_index <
+                       pcfgcache->config_tables[table_type].num_table)
+                       src_ptr += table_index;
+               else
+                       src_ptr = NULL;
+               break;
+       default:
+               dd_dev_info(dd, "%s: Unknown table\n", __func__);
+               break;
+       }
+
+       if (!src_ptr || len < field_len_bits)
+               return -EINVAL;
+
+       src_ptr += (field_start_bits/32);
+       *data = (*src_ptr >> (field_start_bits % 32)) &
+                       ((1 << field_len_bits) - 1);
+
+       return 0;
+}
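+
+/*
+ * Illustrative call (a sketch; the table/field enums live outside this
+ * file): fetching the SYSTEM_TABLE_QSFP_POWER_CLASS_MAX field of the
+ * system table into a u32:
+ *
+ *     u32 power_class;
+ *     int ret;
+ *
+ *     ret = get_platform_config_field(dd, PLATFORM_CONFIG_SYSTEM_TABLE, 0,
+ *                             SYSTEM_TABLE_QSFP_POWER_CLASS_MAX,
+ *                             &power_class, sizeof(power_class));
+ *
+ * @data is zeroed on entry, so a field shorter than @len leaves the upper
+ * bits clear.
+ */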
+
+/*
+ * Download the firmware needed for the Gen3 PCIe SerDes.  An update
+ * to the SBus firmware is needed before updating the PCIe firmware.
+ *
+ * Note: caller must be holding the HW mutex.
+ */
+int load_pcie_firmware(struct hfi1_devdata *dd)
+{
+       int ret = 0;
+
+       /* both firmware loads below use the SBus */
+       set_sbus_fast_mode(dd);
+
+       if (fw_sbus_load) {
+               turn_off_spicos(dd, SPICO_SBUS);
+               ret = load_sbus_firmware(dd, &fw_sbus);
+               if (ret)
+                       goto done;
+       }
+
+       if (fw_pcie_serdes_load) {
+               dd_dev_info(dd, "Setting PCIe SerDes broadcast\n");
+               set_serdes_broadcast(dd, all_pcie_serdes_broadcast,
+                                       pcie_serdes_broadcast[dd->hfi1_id],
+                                       pcie_serdes_addrs[dd->hfi1_id],
+                                       NUM_PCIE_SERDES);
+               ret = load_pcie_serdes_firmware(dd, &fw_pcie);
+               if (ret)
+                       goto done;
+       }
+
+done:
+       clear_sbus_fast_mode(dd);
+
+       return ret;
+}
+
+/*
+ * Read the GUID from the hardware, store it in dd.
+ */
+void read_guid(struct hfi1_devdata *dd)
+{
+       dd->base_guid = read_csr(dd, DC_DC8051_CFG_LOCAL_GUID);
+       dd_dev_info(dd, "GUID %llx\n",
+               (unsigned long long)dd->base_guid);
+}
diff --git a/drivers/staging/rdma/hfi1/hfi.h b/drivers/staging/rdma/hfi1/hfi.h
new file mode 100644 (file)
index 0000000..8ca171b
--- /dev/null
@@ -0,0 +1,1821 @@
+#ifndef _HFI1_KERNEL_H
+#define _HFI1_KERNEL_H
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/scatterlist.h>
+#include <linux/slab.h>
+#include <linux/io.h>
+#include <linux/fs.h>
+#include <linux/completion.h>
+#include <linux/kref.h>
+#include <linux/sched.h>
+#include <linux/cdev.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+
+#include "chip_registers.h"
+#include "common.h"
+#include "verbs.h"
+#include "pio.h"
+#include "chip.h"
+#include "mad.h"
+#include "qsfp.h"
+#include "platform_config.h"
+
+/* bumped 1 from s/w major version of TrueScale */
+#define HFI1_CHIP_VERS_MAJ 3U
+
+/* don't care about this except printing */
+#define HFI1_CHIP_VERS_MIN 0U
+
+/* The Organization Unique Identifier (Mfg code), and its position in GUID */
+#define HFI1_OUI 0x001175
+#define HFI1_OUI_LSB 40
+
+#define DROP_PACKET_OFF                0
+#define DROP_PACKET_ON         1
+
+extern unsigned long hfi1_cap_mask;
+#define HFI1_CAP_KGET_MASK(mask, cap) ((mask) & HFI1_CAP_##cap)
+#define HFI1_CAP_UGET_MASK(mask, cap) \
+       (((mask) >> HFI1_CAP_USER_SHIFT) & HFI1_CAP_##cap)
+#define HFI1_CAP_KGET(cap) (HFI1_CAP_KGET_MASK(hfi1_cap_mask, cap))
+#define HFI1_CAP_UGET(cap) (HFI1_CAP_UGET_MASK(hfi1_cap_mask, cap))
+#define HFI1_CAP_IS_KSET(cap) (!!HFI1_CAP_KGET(cap))
+#define HFI1_CAP_IS_USET(cap) (!!HFI1_CAP_UGET(cap))
+#define HFI1_MISC_GET() ((hfi1_cap_mask >> HFI1_CAP_MISC_SHIFT) & \
+                       HFI1_CAP_MISC_MASK)
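+
+/*
+ * Example (the capability bit names are defined elsewhere and are only
+ * illustrative here): a kernel-side check of a driver capability looks like
+ *
+ *     if (HFI1_CAP_IS_KSET(SDMA))
+ *             ...
+ *
+ * while HFI1_CAP_IS_USET() performs the same test against the user half of
+ * hfi1_cap_mask.
+ */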
+
+/*
+ * Per-driver stats, either not device- or port-specific, or
+ * summed over all of the devices and ports.
+ * They are described by name via the ipathfs filesystem, so layout
+ * and number of elements can change without breaking compatibility.
+ * If members are added or deleted, hfi1_statnames[] in debugfs.c must
+ * change to match.
+ */
+struct hfi1_ib_stats {
+       __u64 sps_ints; /* number of interrupts handled */
+       __u64 sps_errints; /* number of error interrupts */
+       __u64 sps_txerrs; /* tx-related packet errors */
+       __u64 sps_rcverrs; /* non-crc rcv packet errors */
+       __u64 sps_hwerrs; /* hardware errors reported (parity, etc.) */
+       __u64 sps_nopiobufs; /* no pio bufs avail from kernel */
+       __u64 sps_ctxts; /* number of contexts currently open */
+       __u64 sps_lenerrs; /* number of kernel packets where RHF != LRH len */
+       __u64 sps_buffull;
+       __u64 sps_hdrfull;
+};
+
+extern struct hfi1_ib_stats hfi1_stats;
+extern const struct pci_error_handlers hfi1_pci_err_handler;
+
+/*
+ * First-cut criterion for "device is active" is
+ * two thousand dwords combined Tx, Rx traffic per
+ * 5-second interval. SMA packets are 64 dwords,
+ * and occur "a few per second", presumably each way.
+ */
+#define HFI1_TRAFFIC_ACTIVE_THRESHOLD (2000)
+
+/*
+ * Below is all data related to a single context (formerly called a port).
+ */
+
+#ifdef CONFIG_DEBUG_FS
+struct hfi1_opcode_stats_perctx;
+#endif
+
+/*
+ * struct ps_state keeps state associated with RX queue "prescanning"
+ * (prescanning for FECNs and BECNs), if prescanning is in use.
+ */
+struct ps_state {
+       u32 ps_head;
+       int initialized;
+};
+
+struct ctxt_eager_bufs {
+       ssize_t size;            /* total size of eager buffers */
+       u32 count;               /* size of buffers array */
+       u32 numbufs;             /* number of buffers allocated */
+       u32 alloced;             /* number of rcvarray entries used */
+       u32 rcvtid_size;         /* size of each eager rcv tid */
+       u32 threshold;           /* head update threshold */
+       struct eager_buffer {
+               void *addr;
+               dma_addr_t phys;
+               ssize_t len;
+       } *buffers;
+       struct {
+               void *addr;
+               dma_addr_t phys;
+       } *rcvtids;
+};
+
+struct hfi1_ctxtdata {
+       /* shadow the ctxt's RcvCtrl register */
+       u64 rcvctrl;
+       /* rcvhdrq base, needs mmap before useful */
+       void *rcvhdrq;
+       /* kernel virtual address where hdrqtail is updated */
+       volatile __le64 *rcvhdrtail_kvaddr;
+       /*
+        * Shared page for kernel to signal user processes that send buffers
+        * need disarming.  The process should call HFI1_CMD_DISARM_BUFS
+        * or HFI1_CMD_ACK_EVENT with IPATH_EVENT_DISARM_BUFS set.
+        */
+       unsigned long *user_event_mask;
+       /* when waiting for rcv or pioavail */
+       wait_queue_head_t wait;
+       /* rcvhdrq size (for freeing) */
+       size_t rcvhdrq_size;
+       /* number of rcvhdrq entries */
+       u16 rcvhdrq_cnt;
+       /* size of each of the rcvhdrq entries */
+       u16 rcvhdrqentsize;
+       /* mmap of hdrq, must fit in 44 bits */
+       dma_addr_t rcvhdrq_phys;
+       dma_addr_t rcvhdrqtailaddr_phys;
+       struct ctxt_eager_bufs egrbufs;
+       /* this receive context's assigned PIO ACK send context */
+       struct send_context *sc;
+
+       /* dynamic receive available interrupt timeout */
+       u32 rcvavail_timeout;
+       /*
+        * number of opens (including slave sub-contexts) on this instance
+        * (ignoring forks, dup, etc. for now)
+        */
+       int cnt;
+       /*
+        * how much space to leave at start of eager TID entries for
+        * protocol use, on each TID
+        */
+       /* instead of calculating it */
+       unsigned ctxt;
+       /* non-zero if ctxt is being shared. */
+       u16 subctxt_cnt;
+       /* non-zero if ctxt is being shared. */
+       u16 subctxt_id;
+       u8 uuid[16];
+       /* job key */
+       u16 jkey;
+       /* number of RcvArray groups for this context. */
+       u32 rcv_array_groups;
+       /* index of first eager TID entry. */
+       u32 eager_base;
+       /* number of expected TID entries */
+       u32 expected_count;
+       /* index of first expected TID entry. */
+       u32 expected_base;
+       /* cursor into the exp group sets */
+       atomic_t tidcursor;
+       /* number of exp TID groups assigned to the ctxt */
+       u16 numtidgroups;
+       /* size of exp TID group fields in tidusemap */
+       u16 tidmapcnt;
+       /* exp TID group usage bitfield array */
+       unsigned long *tidusemap;
+       /* pinned pages for exp sends, allocated at open */
+       struct page **tid_pg_list;
+       /* dma handles for exp tid pages */
+       dma_addr_t *physshadow;
+       /* lock protecting all Expected TID data */
+       spinlock_t exp_lock;
+       /* number of pio bufs for this ctxt (all procs, if shared) */
+       u32 piocnt;
+       /* first pio buffer for this ctxt */
+       u32 pio_base;
+       /* chip offset of PIO buffers for this ctxt */
+       u32 piobufs;
+       /* per-context configuration flags */
+       u16 flags;
+       /* per-context event flags for fileops/intr communication */
+       unsigned long event_flags;
+       /* WAIT_RCV that timed out, no interrupt */
+       u32 rcvwait_to;
+       /* WAIT_PIO that timed out, no interrupt */
+       u32 piowait_to;
+       /* WAIT_RCV already happened, no wait */
+       u32 rcvnowait;
+       /* WAIT_PIO already happened, no wait */
+       u32 pionowait;
+       /* total number of polled urgent packets */
+       u32 urgent;
+       /* saved total number of polled urgent packets for poll edge trigger */
+       u32 urgent_poll;
+       /* pid of process using this ctxt */
+       pid_t pid;
+       pid_t subpid[HFI1_MAX_SHARED_CTXTS];
+       /* same size as task_struct .comm[], command that opened context */
+       char comm[16];
+       /* so file ops can get at unit */
+       struct hfi1_devdata *dd;
+       /* so functions that need physical port can get it easily */
+       struct hfi1_pportdata *ppd;
+       /* A page of memory for rcvhdrhead, rcvegrhead, rcvegrtail * N */
+       void *subctxt_uregbase;
+       /* An array of pages for the eager receive buffers * N */
+       void *subctxt_rcvegrbuf;
+       /* An array of pages for the eager header queue entries * N */
+       void *subctxt_rcvhdr_base;
+       /* The version of the library which opened this ctxt */
+       u32 userversion;
+       /* Bitmask of active slaves */
+       u32 active_slaves;
+       /* Type of packets or conditions we want to poll for */
+       u16 poll_type;
+       /* receive packet sequence counter */
+       u8 seq_cnt;
+       u8 redirect_seq_cnt;
+       /* ctxt rcvhdrq head offset */
+       u32 head;
+       u32 pkt_count;
+       /* QPs waiting for context processing */
+       struct list_head qp_wait_list;
+       /* interrupt handling */
+       u64 imask;      /* clear interrupt mask */
+       int ireg;       /* clear interrupt register */
+       unsigned numa_id; /* numa node of this context */
+       /* verbs stats per CTX */
+       struct hfi1_opcode_stats_perctx *opstats;
+       /*
+        * This is the kernel thread that will keep making
+        * progress on the user sdma requests behind the scenes.
+        * There is one per context (shared contexts use the master's).
+        */
+       struct task_struct *progress;
+       struct list_head sdma_queues;
+       spinlock_t sdma_qlock;
+
+#ifdef CONFIG_PRESCAN_RXQ
+       struct ps_state ps_state;
+#endif /* CONFIG_PRESCAN_RXQ */
+
+       /*
+        * The interrupt handler for a particular receive context can vary
+        * throughout its lifetime. This is not a lock-protected data member, so
+        * it must be updated atomically and the previous and new values must
+        * always be valid. Worst case: we process an extra interrupt and up to
+        * 64 packets with the wrong interrupt handler.
+        */
+       void (*do_interrupt)(struct hfi1_ctxtdata *rcd);
+};
+
+/*
+ * Represents a single packet at a high level. Put commonly computed things in
+ * here so we do not have to keep doing them over and over. The rule of thumb is
+ * if something is used one time to derive some value, store that something in
+ * here. If it is used multiple times, then store the result of that derivation
+ * in here.
+ */
+struct hfi1_packet {
+       void *ebuf;
+       void *hdr;
+       struct hfi1_ctxtdata *rcd;
+       __le32 *rhf_addr;
+       struct hfi1_qp *qp;
+       struct hfi1_other_headers *ohdr;
+       u64 rhf;
+       u32 maxcnt;
+       u32 rhqoff;
+       u32 hdrqtail;
+       int numpkt;
+       u16 tlen;
+       u16 hlen;
+       s16 etail;
+       u16 rsize;
+       u8 updegr;
+       u8 rcv_flags;
+       u8 etype;
+};
+
+static inline bool has_sc4_bit(struct hfi1_packet *p)
+{
+       return !!rhf_dc_info(p->rhf);
+}
+
+/*
+ * Private data for snoop/capture support.
+ */
+struct hfi1_snoop_data {
+       int mode_flag;
+       struct cdev cdev;
+       struct device *class_dev;
+       spinlock_t snoop_lock;
+       struct list_head queue;
+       wait_queue_head_t waitq;
+       void *filter_value;
+       int (*filter_callback)(void *hdr, void *data, void *value);
+       u64 dcc_cfg; /* saved value of DCC Cfg register */
+};
+
+/* snoop mode_flag values */
+#define HFI1_PORT_SNOOP_MODE     1U
+#define HFI1_PORT_CAPTURE_MODE   2U
+
+struct hfi1_sge_state;
+
+/*
+ * Get/Set IB link-level config parameters for f_get/set_ib_cfg()
+ * Mostly for MADs that set or query link parameters, also ipath
+ * config interfaces
+ */
+#define HFI1_IB_CFG_LIDLMC 0 /* LID (LS16b) and Mask (MS16b) */
+#define HFI1_IB_CFG_LWID_DG_ENB 1 /* allowed Link-width downgrade */
+#define HFI1_IB_CFG_LWID_ENB 2 /* allowed Link-width */
+#define HFI1_IB_CFG_LWID 3 /* currently active Link-width */
+#define HFI1_IB_CFG_SPD_ENB 4 /* allowed Link speeds */
+#define HFI1_IB_CFG_SPD 5 /* current Link spd */
+#define HFI1_IB_CFG_RXPOL_ENB 6 /* Auto-RX-polarity enable */
+#define HFI1_IB_CFG_LREV_ENB 7 /* Auto-Lane-reversal enable */
+#define HFI1_IB_CFG_LINKLATENCY 8 /* Link Latency (IB1.2 only) */
+#define HFI1_IB_CFG_HRTBT 9 /* IB heartbeat off/enable/auto; DDR/QDR only */
+#define HFI1_IB_CFG_OP_VLS 10 /* operational VLs */
+#define HFI1_IB_CFG_VL_HIGH_CAP 11 /* num of VL high priority weights */
+#define HFI1_IB_CFG_VL_LOW_CAP 12 /* num of VL low priority weights */
+#define HFI1_IB_CFG_OVERRUN_THRESH 13 /* IB overrun threshold */
+#define HFI1_IB_CFG_PHYERR_THRESH 14 /* IB PHY error threshold */
+#define HFI1_IB_CFG_LINKDEFAULT 15 /* IB link default (sleep/poll) */
+#define HFI1_IB_CFG_PKEYS 16 /* update partition keys */
+#define HFI1_IB_CFG_MTU 17 /* update MTU in IBC */
+#define HFI1_IB_CFG_VL_HIGH_LIMIT 19
+#define HFI1_IB_CFG_PMA_TICKS 20 /* PMA sample tick resolution */
+#define HFI1_IB_CFG_PORT 21 /* switch port we are connected to */
+
+/*
+ * HFI or Host Link States
+ *
+ * These describe what the driver thinks the logical and physical link
+ * states are.  Used as an argument to set_link_state().  Implemented
+ * as bits for easy multi-state checking.  The actual state can only be
+ * one of them at a time.
+ */
+#define __HLS_UP_INIT_BP       0
+#define __HLS_UP_ARMED_BP      1
+#define __HLS_UP_ACTIVE_BP     2
+#define __HLS_DN_DOWNDEF_BP    3       /* link down default */
+#define __HLS_DN_POLL_BP       4
+#define __HLS_DN_DISABLE_BP    5
+#define __HLS_DN_OFFLINE_BP    6
+#define __HLS_VERIFY_CAP_BP    7
+#define __HLS_GOING_UP_BP      8
+#define __HLS_GOING_OFFLINE_BP  9
+#define __HLS_LINK_COOLDOWN_BP 10
+
+#define HLS_UP_INIT      (1 << __HLS_UP_INIT_BP)
+#define HLS_UP_ARMED     (1 << __HLS_UP_ARMED_BP)
+#define HLS_UP_ACTIVE    (1 << __HLS_UP_ACTIVE_BP)
+#define HLS_DN_DOWNDEF   (1 << __HLS_DN_DOWNDEF_BP) /* link down default */
+#define HLS_DN_POLL      (1 << __HLS_DN_POLL_BP)
+#define HLS_DN_DISABLE   (1 << __HLS_DN_DISABLE_BP)
+#define HLS_DN_OFFLINE   (1 << __HLS_DN_OFFLINE_BP)
+#define HLS_VERIFY_CAP   (1 << __HLS_VERIFY_CAP_BP)
+#define HLS_GOING_UP     (1 << __HLS_GOING_UP_BP)
+#define HLS_GOING_OFFLINE (1 << __HLS_GOING_OFFLINE_BP)
+#define HLS_LINK_COOLDOWN (1 << __HLS_LINK_COOLDOWN_BP)
+
+#define HLS_UP (HLS_UP_INIT | HLS_UP_ARMED | HLS_UP_ACTIVE)
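+
+/*
+ * Example of the multi-state checking the bit encoding above allows (the
+ * state variable name is illustrative):
+ *
+ *     if (link_state & HLS_UP)
+ *             ...     // any of INIT, ARMED or ACTIVE
+ *     if (link_state & (HLS_DN_POLL | HLS_DN_DISABLE))
+ *             ...     // down, in Poll or Disable
+ */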
+
+/* use this MTU size if none other is given */
+#define HFI1_DEFAULT_ACTIVE_MTU 8192
+/* use this MTU size as the default maximum */
+#define HFI1_DEFAULT_MAX_MTU 8192
+/* default partition key */
+#define DEFAULT_PKEY 0xffff
+
+/*
+ * Possible fabric manager config parameters for fm_{get,set}_table()
+ */
+#define FM_TBL_VL_HIGH_ARB             1 /* Get/set VL high prio weights */
+#define FM_TBL_VL_LOW_ARB              2 /* Get/set VL low prio weights */
+#define FM_TBL_BUFFER_CONTROL          3 /* Get/set Buffer Control */
+#define FM_TBL_SC2VLNT                 4 /* Get/set SC->VLnt */
+#define FM_TBL_VL_PREEMPT_ELEMS                5 /* Get (no set) VL preempt elems */
+#define FM_TBL_VL_PREEMPT_MATRIX       6 /* Get (no set) VL preempt matrix */
+
+/*
+ * Possible "operations" for f_rcvctrl(ppd, op, ctxt)
+ * these are bits so they can be combined, e.g.
+ * HFI1_RCVCTRL_INTRAVAIL_ENB | HFI1_RCVCTRL_CTXT_ENB
+ */
+#define HFI1_RCVCTRL_TAILUPD_ENB 0x01
+#define HFI1_RCVCTRL_TAILUPD_DIS 0x02
+#define HFI1_RCVCTRL_CTXT_ENB 0x04
+#define HFI1_RCVCTRL_CTXT_DIS 0x08
+#define HFI1_RCVCTRL_INTRAVAIL_ENB 0x10
+#define HFI1_RCVCTRL_INTRAVAIL_DIS 0x20
+#define HFI1_RCVCTRL_PKEY_ENB 0x40  /* Note, default is enabled */
+#define HFI1_RCVCTRL_PKEY_DIS 0x80
+#define HFI1_RCVCTRL_TIDFLOW_ENB 0x0400
+#define HFI1_RCVCTRL_TIDFLOW_DIS 0x0800
+#define HFI1_RCVCTRL_ONE_PKT_EGR_ENB 0x1000
+#define HFI1_RCVCTRL_ONE_PKT_EGR_DIS 0x2000
+#define HFI1_RCVCTRL_NO_RHQ_DROP_ENB 0x4000
+#define HFI1_RCVCTRL_NO_RHQ_DROP_DIS 0x8000
+#define HFI1_RCVCTRL_NO_EGR_DROP_ENB 0x10000
+#define HFI1_RCVCTRL_NO_EGR_DROP_DIS 0x20000
+
+/* partition enforcement flags */
+#define HFI1_PART_ENFORCE_IN   0x1
+#define HFI1_PART_ENFORCE_OUT  0x2
+
+/* how often we check for synthetic counter wrap around */
+#define SYNTH_CNT_TIME 2
+
+/* Counter flags */
+#define CNTR_NORMAL            0x0 /* Normal counters, just read register */
+#define CNTR_SYNTH             0x1 /* Synthetic counters, saturate at all 1s */
+#define CNTR_DISABLED          0x2 /* Disable this counter */
+#define CNTR_32BIT             0x4 /* Simulate 64 bits for this counter */
+#define CNTR_VL                        0x8 /* Per VL counter */
+#define CNTR_INVALID_VL                -1  /* Specifies invalid VL */
+#define CNTR_MODE_W            0x0
+#define CNTR_MODE_R            0x1
+
+/* VLs Supported/Operational */
+#define HFI1_MIN_VLS_SUPPORTED 1
+#define HFI1_MAX_VLS_SUPPORTED 8
+
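+/*
+ * Saturating counter increments: synthetic counters stick at all 1s
+ * rather than wrapping back to zero.
+ */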
+static inline void incr_cntr64(u64 *cntr)
+{
+       if (*cntr < (u64)-1LL)
+               (*cntr)++;
+}
+
+static inline void incr_cntr32(u32 *cntr)
+{
+       if (*cntr < (u32)-1LL)
+               (*cntr)++;
+}
+
+#define MAX_NAME_SIZE 64
+struct hfi1_msix_entry {
+       struct msix_entry msix;
+       void *arg;
+       char name[MAX_NAME_SIZE];
+       cpumask_var_t mask;
+};
+
+/* per-SL CCA information */
+struct cca_timer {
+       struct hrtimer hrtimer;
+       struct hfi1_pportdata *ppd; /* read-only */
+       int sl; /* read-only */
+       u16 ccti; /* read/write - current value of CCTI */
+};
+
+struct link_down_reason {
+       /*
+        * SMA-facing value.  Should be set from .latest when
+        * HLS_UP_* -> HLS_DN_* transition actually occurs.
+        */
+       u8 sma;
+       u8 latest;
+};
+
+enum {
+       LO_PRIO_TABLE,
+       HI_PRIO_TABLE,
+       MAX_PRIO_TABLE
+};
+
+struct vl_arb_cache {
+       spinlock_t lock;
+       struct ib_vl_weight_elem table[VL_ARB_TABLE_SIZE];
+};
+
+/*
+ * The structure below encapsulates data relevant to a physical IB Port.
+ * Current chips support only one such port, but the separation
+ * clarifies things a bit. Note that to conform to IB conventions,
+ * port-numbers are one-based. The first or only port is port1.
+ */
+struct hfi1_pportdata {
+       struct hfi1_ibport ibport_data;
+
+       struct hfi1_devdata *dd;
+       struct kobject pport_cc_kobj;
+       struct kobject sc2vl_kobj;
+       struct kobject sl2sc_kobj;
+       struct kobject vl2mtu_kobj;
+
+       /* QSFP support */
+       struct qsfp_data qsfp_info;
+
+       /* GUID for this interface, in host order */
+       u64 guid;
+       /* GUID for peer interface, in host order */
+       u64 neighbor_guid;
+
+       /* up or down physical link state */
+       u32 linkup;
+
+       /*
+        * this address is mapped read-only into user processes so they can
+        * get status cheaply, whenever they want.  One qword of status per port
+        */
+       u64 *statusp;
+
+       /* SendDMA related entries */
+
+       struct workqueue_struct *hfi1_wq;
+
+       /* move out of interrupt context */
+       struct work_struct link_vc_work;
+       struct work_struct link_up_work;
+       struct work_struct link_down_work;
+       struct work_struct sma_message_work;
+       struct work_struct freeze_work;
+       struct work_struct link_downgrade_work;
+       struct work_struct link_bounce_work;
+       /* host link state variables */
+       struct mutex hls_lock;
+       u32 host_link_state;
+
+       spinlock_t            sdma_alllock ____cacheline_aligned_in_smp;
+
+       u32 lstate;     /* logical link state */
+
+       /* these are the "32 bit" regs */
+
+       u32 ibmtu; /* The MTU programmed for this unit */
+       /*
+        * Current max size IB packet (in bytes) including IB headers, that
+        * we can send. Changes when ibmtu changes.
+        */
+       u32 ibmaxlen;
+       u32 current_egress_rate; /* units [10^6 bits/sec] */
+       /* LID programmed for this instance */
+       u16 lid;
+       /* list of pkeys programmed; 0 if not set */
+       u16 pkeys[MAX_PKEY_VALUES];
+       u16 link_width_supported;
+       u16 link_width_downgrade_supported;
+       u16 link_speed_supported;
+       u16 link_width_enabled;
+       u16 link_width_downgrade_enabled;
+       u16 link_speed_enabled;
+       u16 link_width_active;
+       u16 link_width_downgrade_tx_active;
+       u16 link_width_downgrade_rx_active;
+       u16 link_speed_active;
+       u8 vls_supported;
+       u8 vls_operational;
+       /* LID mask control */
+       u8 lmc;
+       /* Rx Polarity inversion (compensate for ~tx on partner) */
+       u8 rx_pol_inv;
+
+       u8 hw_pidx;     /* physical port index */
+       u8 port;        /* IB port number and index into dd->pports - 1 */
+       /* type of neighbor node */
+       u8 neighbor_type;
+       u8 neighbor_normal;
+       u8 neighbor_fm_security; /* 1 if firmware checking is disabled */
+       u8 neighbor_port_number;
+       u8 is_sm_config_started;
+       u8 offline_disabled_reason;
+       u8 is_active_optimize_enabled;
+       u8 driver_link_ready;   /* driver ready for active link */
+       u8 link_enabled;        /* link enabled? */
+       u8 linkinit_reason;
+       u8 local_tx_rate;       /* rate given to 8051 firmware */
+
+       /* placeholders for IB MAD packet settings */
+       u8 overrun_threshold;
+       u8 phy_error_threshold;
+
+       /* used to override LED behavior */
+       u8 led_override;  /* Substituted for normal value, if non-zero */
+       u16 led_override_timeoff; /* delta to next timer event */
+       u8 led_override_vals[2]; /* Alternates per blink-frame */
+       u8 led_override_phase; /* Just counts, LSB picks from vals[] */
+       atomic_t led_override_timer_active;
+       /* Used to flash LEDs in override mode */
+       struct timer_list led_override_timer;
+       u32 sm_trap_qp;
+       u32 sa_qp;
+
+       /*
+        * cca_timer_lock protects access to the per-SL cca_timer
+        * structures (specifically the ccti member).
+        */
+       spinlock_t cca_timer_lock ____cacheline_aligned_in_smp;
+       struct cca_timer cca_timer[OPA_MAX_SLS];
+
+       /* List of congestion control table entries */
+       struct ib_cc_table_entry_shadow ccti_entries[CC_TABLE_SHADOW_MAX];
+
+       /* congestion entries, each entry corresponding to a SL */
+       struct opa_congestion_setting_entry_shadow
+               congestion_entries[OPA_MAX_SLS];
+
+       /*
+        * cc_state_lock protects (write) access to the per-port
+        * struct cc_state.
+        */
+       spinlock_t cc_state_lock ____cacheline_aligned_in_smp;
+
+       struct cc_state __rcu *cc_state;
+
+       /* Total number of congestion control table entries */
+       u16 total_cct_entry;
+
+       /* Bit map identifying service level */
+       u32 cc_sl_control_map;
+
+       /* CA's max number of 64 entry units in the congestion control table */
+       u8 cc_max_table_entries;
+
+       /* begin congestion log related entries
+        * cc_log_lock protects all congestion log related data */
+       spinlock_t cc_log_lock ____cacheline_aligned_in_smp;
+       u8 threshold_cong_event_map[OPA_MAX_SLS/8];
+       u16 threshold_event_counter;
+       struct opa_hfi1_cong_log_event_internal cc_events[OPA_CONG_LOG_ELEMS];
+       int cc_log_idx; /* index for logging events */
+       int cc_mad_idx; /* index for reporting events */
+       /* end congestion log related entries */
+
+       struct vl_arb_cache vl_arb_cache[MAX_PRIO_TABLE];
+
+       /* port relative counter buffer */
+       u64 *cntrs;
+       /* port relative synthetic counter buffer */
+       u64 *scntrs;
+       /* we synthesize port_xmit_discards from several egress errors */
+       u64 port_xmit_discards;
+       u64 port_xmit_constraint_errors;
+       u64 port_rcv_constraint_errors;
+       /* count of 'link_err' interrupts from DC */
+       u64 link_downed;
+       /* number of times link retrained successfully */
+       u64 link_up;
+       /* port_ltp_crc_mode is returned in 'portinfo' MADs */
+       u16 port_ltp_crc_mode;
+       /* port_crc_mode_enabled is the crc we support */
+       u8 port_crc_mode_enabled;
+       /* mgmt_allowed is also returned in 'portinfo' MADs */
+       u8 mgmt_allowed;
+       u8 part_enforce; /* partition enforcement flags */
+       struct link_down_reason local_link_down_reason;
+       struct link_down_reason neigh_link_down_reason;
+       /* Value to be sent to link peer on LinkDown. */
+       u8 remote_link_down_reason;
+       /* Error events that will cause a port bounce. */
+       u32 port_error_action;
+};
+
+typedef int (*rhf_rcv_function_ptr)(struct hfi1_packet *packet);
+
+typedef void (*opcode_handler)(struct hfi1_packet *packet);
+
+/* return values for the RHF receive functions */
+#define RHF_RCV_CONTINUE  0    /* keep going */
+#define RHF_RCV_DONE     1     /* stop, this packet processed */
+#define RHF_RCV_REPROCESS 2    /* stop. retain this packet */
+
+struct rcv_array_data {
+       u8 group_size;
+       u16 ngroups;
+       u16 nctxt_extra;
+};
+
+struct per_vl_data {
+       u16 mtu;
+       struct send_context *sc;
+};
+
+/* 16 to directly index */
+#define PER_VL_SEND_CONTEXTS 16
+
+struct err_info_rcvport {
+       u8 status_and_code;
+       u64 packet_flit1;
+       u64 packet_flit2;
+};
+
+struct err_info_constraint {
+       u8 status;
+       u16 pkey;
+       u32 slid;
+};
+
+struct hfi1_temp {
+       unsigned int curr;       /* current temperature */
+       unsigned int lo_lim;     /* low temperature limit */
+       unsigned int hi_lim;     /* high temperature limit */
+       unsigned int crit_lim;   /* critical temperature limit */
+       u8 triggers;      /* temperature triggers */
+};
+
+/* device data struct now contains only "general per-device" info.
+ * fields related to a physical IB port are in a hfi1_pportdata struct.
+ */
+struct sdma_engine;
+struct sdma_vl_map;
+
+#define BOARD_VERS_MAX 96 /* how long the version string can be */
+#define SERIAL_MAX 16 /* length of the serial number */
+
+struct hfi1_devdata {
+       struct hfi1_ibdev verbs_dev;     /* must be first */
+       struct list_head list;
+       /* pointers to related structs for this device */
+       /* pci access data structure */
+       struct pci_dev *pcidev;
+       struct cdev user_cdev;
+       struct cdev diag_cdev;
+       struct cdev ui_cdev;
+       struct device *user_device;
+       struct device *diag_device;
+       struct device *ui_device;
+
+       /* mem-mapped pointer to base of chip regs */
+       u8 __iomem *kregbase;
+       /* end of mem-mapped chip space excluding sendbuf and user regs */
+       u8 __iomem *kregend;
+       /* physical address of chip for io_remap, etc. */
+       resource_size_t physaddr;
+       /* receive context data */
+       struct hfi1_ctxtdata **rcd;
+       /* send context data */
+       struct send_context_info *send_contexts;
+       /* map hardware send contexts to software index */
+       u8 *hw_to_sw;
+       /* spinlock for allocating and releasing send context resources */
+       spinlock_t sc_lock;
+       /* Per VL data. Enough for all VLs but not all elements are set/used. */
+       struct per_vl_data vld[PER_VL_SEND_CONTEXTS];
+       /* seqlock for sc2vl */
+       seqlock_t sc2vl_lock;
+       u64 sc2vl[4];
+       /* Send Context initialization lock. */
+       spinlock_t sc_init_lock;
+
+       /* fields common to all SDMA engines */
+
+       /* default flags to last descriptor */
+       u64 default_desc1;
+       volatile __le64                    *sdma_heads_dma; /* DMA'ed by chip */
+       dma_addr_t                          sdma_heads_phys;
+       void                               *sdma_pad_dma; /* DMA'ed by chip */
+       dma_addr_t                          sdma_pad_phys;
+       /* for deallocation */
+       size_t                              sdma_heads_size;
+       /* number from the chip */
+       u32                                 chip_sdma_engines;
+       /* num used */
+       u32                                 num_sdma;
+       /* lock for sdma_map */
+       spinlock_t                          sde_map_lock;
+       /* array of engines sized by num_sdma */
+       struct sdma_engine                 *per_sdma;
+       /* array of vl maps */
+       struct sdma_vl_map __rcu           *sdma_map;
+       /* SPC freeze waitqueue and variable */
+       wait_queue_head_t                 sdma_unfreeze_wq;
+       atomic_t                          sdma_unfreeze_count;
+
+       /* hfi1_pportdata, points to array of (physical) port-specific
+        * data structs, indexed by pidx (0..n-1)
+        */
+       struct hfi1_pportdata *pport;
+
+       /* mem-mapped pointer to base of PIO buffers */
+       void __iomem *piobase;
+       /*
+        * write-combining mem-mapped pointer to base of RcvArray
+        * memory.
+        */
+       void __iomem *rcvarray_wc;
+       /*
+        * credit return base - a per-NUMA range of DMA address that
+        * the chip will use to update the per-context free counter
+        */
+       struct credit_return_base *cr_base;
+
+       /* send context numbers and sizes for each type */
+       struct sc_config_sizes sc_sizes[SC_MAX];
+
+       u32 lcb_access_count;           /* count of LCB users */
+
+       char *boardname; /* human readable board info */
+
+       /* device (not port) flags, basically device capabilities */
+       u32 flags;
+
+       /* reset value */
+       u64 z_int_counter;
+       u64 z_rcv_limit;
+       /* percpu int_counter */
+       u64 __percpu *int_counter;
+       u64 __percpu *rcv_limit;
+
+       /* number of receive contexts in use by the driver */
+       u32 num_rcv_contexts;
+       /* number of pio send contexts in use by the driver */
+       u32 num_send_contexts;
+       /*
+        * number of ctxts available for PSM open
+        */
+       u32 freectxts;
+       /* base receive interrupt timeout, in CSR units */
+       u32 rcv_intr_timeout_csr;
+
+       u64 __iomem *egrtidbase;
+       spinlock_t sendctrl_lock; /* protect changes to SendCtrl */
+       spinlock_t rcvctrl_lock; /* protect changes to RcvCtrl */
+       /* around rcd and (user ctxts) ctxt_cnt use (intr vs free) */
+       spinlock_t uctxt_lock; /* rcd and user context changes */
+       /* exclusive access to 8051 */
+       spinlock_t dc8051_lock;
+       /* exclusive access to 8051 memory */
+       spinlock_t dc8051_memlock;
+       int dc8051_timed_out;   /* remember if the 8051 timed out */
+       /*
+        * A page that will hold event notification bitmaps for all
+        * contexts. This page will be mapped into all processes.
+        */
+       unsigned long *events;
+       /*
+        * per unit status, see also portdata statusp
+        * mapped read-only into user processes so they can get unit and
+        * IB link status cheaply
+        */
+       struct hfi1_status *status;
+       u32 freezelen; /* max length of freezemsg */
+
+       /* revision register shadow */
+       u64 revision;
+       /* Base GUID for device (network order) */
+       u64 base_guid;
+
+       /* these are the "32 bit" regs */
+
+       /* value we put in kr_rcvhdrsize */
+       u32 rcvhdrsize;
+       /* number of receive contexts the chip supports */
+       u32 chip_rcv_contexts;
+       /* number of receive array entries */
+       u32 chip_rcv_array_count;
+       /* number of PIO send contexts the chip supports */
+       u32 chip_send_contexts;
+       /* number of bytes in the PIO memory buffer */
+       u32 chip_pio_mem_size;
+       /* number of bytes in the SDMA memory buffer */
+       u32 chip_sdma_mem_size;
+
+       /* size of each rcvegrbuffer */
+       u32 rcvegrbufsize;
+       /* log2 of above */
+       u16 rcvegrbufsize_shift;
+       /* both sides of the PCIe link are gen3 capable */
+       u8 link_gen3_capable;
+       /* localbus width (1, 2, 4, 8, 16, 32) from config space */
+       u32 lbus_width;
+       /* localbus speed in MHz */
+       u32 lbus_speed;
+       int unit; /* unit # of this chip */
+       int node; /* home node of this chip */
+
+       /* save these PCI fields to restore after a reset */
+       u32 pcibar0;
+       u32 pcibar1;
+       u32 pci_rom;
+       u16 pci_command;
+       u16 pcie_devctl;
+       u16 pcie_lnkctl;
+       u16 pcie_devctl2;
+       u32 pci_msix0;
+       u32 pci_lnkctl3;
+       u32 pci_tph2;
+
+       /*
+        * ASCII serial number, from flash, large enough for original
+        * all digit strings, and longer serial number format
+        */
+       u8 serial[SERIAL_MAX];
+       /* human readable board version */
+       u8 boardversion[BOARD_VERS_MAX];
+       u8 lbus_info[32]; /* human readable localbus info */
+       /* chip major rev, from CceRevision */
+       u8 majrev;
+       /* chip minor rev, from CceRevision */
+       u8 minrev;
+       /* hardware ID */
+       u8 hfi1_id;
+       /* implementation code */
+       u8 icode;
+       /* default link down value (poll/sleep) */
+       u8 link_default;
+       /* vAU of this device */
+       u8 vau;
+       /* vCU of this device */
+       u8 vcu;
+       /* link credits of this device */
+       u16 link_credits;
+       /* initial vl15 credits to use */
+       u16 vl15_init;
+
+       /* Misc small ints */
+       /* Number of physical ports available */
+       u8 num_pports;
+       /* Lowest context number which can be used by user processes */
+       u8 first_user_ctxt;
+       u8 n_krcv_queues;
+       u8 qos_shift;
+       u8 qpn_mask;
+
+       u16 rhf_offset; /* offset of RHF within receive header entry */
+       u16 irev;       /* implementation revision */
+       u16 dc8051_ver; /* 8051 firmware version */
+
+       struct platform_config_cache pcfg_cache;
+       /* control high-level access to qsfp */
+       struct mutex qsfp_i2c_mutex;
+
+       struct diag_client *diag_client;
+       spinlock_t hfi1_diag_trans_lock; /* protect diag observer ops */
+
+       u8 psxmitwait_supported;
+       /* cycle length of PS* counters in HW (in picoseconds) */
+       u16 psxmitwait_check_rate;
+       /* high volume overflow errors deferred to tasklet */
+       struct tasklet_struct error_tasklet;
+       /* per device cq worker */
+       struct kthread_worker *worker;
+
+       /* MSI-X information */
+       struct hfi1_msix_entry *msix_entries;
+       u32 num_msix_entries;
+
+       /* INTx information */
+       u32 requested_intx_irq;         /* did we request one? */
+       char intx_name[MAX_NAME_SIZE];  /* INTx name */
+
+       /* general interrupt: mask of handled interrupts */
+       u64 gi_mask[CCE_NUM_INT_CSRS];
+
+       struct rcv_array_data rcv_entries;
+
+       /*
+        * 64 bit synthetic counters
+        */
+       struct timer_list synth_stats_timer;
+
+       /*
+        * device counters
+        */
+       char *cntrnames;
+       size_t cntrnameslen;
+       size_t ndevcntrs;
+       u64 *cntrs;
+       u64 *scntrs;
+
+       /*
+        * remembered values for synthetic counters
+        */
+       u64 last_tx;
+       u64 last_rx;
+
+       /*
+        * per-port counters
+        */
+       size_t nportcntrs;
+       char *portcntrnames;
+       size_t portcntrnameslen;
+
+       struct hfi1_snoop_data hfi1_snoop;
+
+       struct err_info_rcvport err_info_rcvport;
+       struct err_info_constraint err_info_rcv_constraint;
+       struct err_info_constraint err_info_xmit_constraint;
+       u8 err_info_uncorrectable;
+       u8 err_info_fmconfig;
+
+       atomic_t drop_packet;
+       u8 do_drop;
+
+       /* receive interrupt functions */
+       rhf_rcv_function_ptr *rhf_rcv_function_map;
+       rhf_rcv_function_ptr normal_rhf_rcv_functions[8];
+
+       /*
+        * Handlers for outgoing data so that snoop/capture does not
+        * have to have its hooks in the send path
+        */
+       int (*process_pio_send)(struct hfi1_qp *qp, struct ahg_ib_header *ibhdr,
+                               u32 hdrwords, struct hfi1_sge_state *ss,
+                               u32 len, u32 plen, u32 dwords, u64 pbc);
+       int (*process_dma_send)(struct hfi1_qp *qp, struct ahg_ib_header *ibhdr,
+                               u32 hdrwords, struct hfi1_sge_state *ss,
+                               u32 len, u32 plen, u32 dwords, u64 pbc);
+       void (*pio_inline_send)(struct hfi1_devdata *dd, struct pio_buf *pbuf,
+                               u64 pbc, const void *from, size_t count);
+
+       /* OUI comes from the HW. Used everywhere as 3 separate bytes. */
+       u8 oui1;
+       u8 oui2;
+       u8 oui3;
+       /* Timer and counter used to detect RcvBufOvflCnt changes */
+       struct timer_list rcverr_timer;
+       u32 rcv_ovfl_cnt;
+
+       int assigned_node_id;
+       wait_queue_head_t event_queue;
+
+       /* Save the enabled LCB error bits */
+       u64 lcb_err_en;
+       u8 dc_shutdown;
+};
+
+/* 8051 firmware version helper */
+#define dc8051_ver(a, b) ((a) << 8 | (b))
+
+/* f_put_tid types */
+#define PT_EXPECTED 0
+#define PT_EAGER    1
+#define PT_INVALID  2
+
+/* Private data for file operations */
+struct hfi1_filedata {
+       struct hfi1_ctxtdata *uctxt;
+       unsigned subctxt;
+       struct hfi1_user_sdma_comp_q *cq;
+       struct hfi1_user_sdma_pkt_q *pq;
+       /* for cpu affinity; -1 if none */
+       int rec_cpu_num;
+};
+
+extern struct list_head hfi1_dev_list;
+extern spinlock_t hfi1_devs_lock;
+struct hfi1_devdata *hfi1_lookup(int unit);
+extern u32 hfi1_cpulist_count;
+extern unsigned long *hfi1_cpulist;
+
+extern unsigned int snoop_drop_send;
+extern unsigned int snoop_force_capture;
+int hfi1_init(struct hfi1_devdata *, int);
+int hfi1_count_units(int *npresentp, int *nupp);
+int hfi1_count_active_units(void);
+
+int hfi1_diag_add(struct hfi1_devdata *);
+void hfi1_diag_remove(struct hfi1_devdata *);
+void handle_linkup_change(struct hfi1_devdata *dd, u32 linkup);
+
+void handle_user_interrupt(struct hfi1_ctxtdata *rcd);
+
+int hfi1_create_rcvhdrq(struct hfi1_devdata *, struct hfi1_ctxtdata *);
+int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *);
+int hfi1_create_ctxts(struct hfi1_devdata *dd);
+struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *, u32);
+void hfi1_init_pportdata(struct pci_dev *, struct hfi1_pportdata *,
+                        struct hfi1_devdata *, u8, u8);
+void hfi1_free_ctxtdata(struct hfi1_devdata *, struct hfi1_ctxtdata *);
+
+void handle_receive_interrupt(struct hfi1_ctxtdata *);
+void handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *rcd);
+void handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *rcd);
+int hfi1_reset_device(int);
+
+/* return the driver's idea of the logical OPA port state */
+static inline u32 driver_lstate(struct hfi1_pportdata *ppd)
+{
+       return ppd->lstate; /* use the cached value */
+}
+
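+/* The job key is the low 16 bits of the caller's uid */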
+static inline u16 generate_jkey(kuid_t uid)
+{
+       return from_kuid(current_user_ns(), uid) & 0xffff;
+}
+
+/*
+ * active_egress_rate
+ *
+ * returns the active egress rate in units of [10^6 bits/sec]
+ */
+static inline u32 active_egress_rate(struct hfi1_pportdata *ppd)
+{
+       u16 link_speed = ppd->link_speed_active;
+       u16 link_width = ppd->link_width_active;
+       u32 egress_rate;
+
+       if (link_speed == OPA_LINK_SPEED_25G)
+               egress_rate = 25000;
+       else /* assume OPA_LINK_SPEED_12_5G */
+               egress_rate = 12500;
+
+       switch (link_width) {
+       case OPA_LINK_WIDTH_4X:
+               egress_rate *= 4;
+               break;
+       case OPA_LINK_WIDTH_3X:
+               egress_rate *= 3;
+               break;
+       case OPA_LINK_WIDTH_2X:
+               egress_rate *= 2;
+               break;
+       default:
+               /* assume IB_WIDTH_1X */
+               break;
+       }
+
+       return egress_rate;
+}
+
+/*
+ * egress_cycles
+ *
+ * Returns the number of 'fabric clock cycles' to egress a packet
+ * of length 'len' bytes, at 'rate' Mbit/s. Since the fabric clock
+ * rate is (approximately) 805 MHz, the units of the returned value
+ * are (1/805 MHz).
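+ *
+ * For example, a 1500 byte packet at 100000 Mbit/s (4X at 25G) costs
+ * roughly 1500 * 8 * 805 / 100000 ~= 96 cycles.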
+ */
+static inline u32 egress_cycles(u32 len, u32 rate)
+{
+       u32 cycles;
+
+       /*
+        * cycles is:
+        *
+        *          (length) [bits] / (rate) [bits/sec]
+        *  ---------------------------------------------------
+        *  fabric_clock_period == 1 /(805 * 10^6) [cycles/sec]
+        */
+
+       cycles = len * 8; /* bits */
+       cycles *= 805;
+       cycles /= rate;
+
+       return cycles;
+}
+
+void set_link_ipg(struct hfi1_pportdata *ppd);
+void process_becn(struct hfi1_pportdata *ppd, u8 sl,  u16 rlid, u32 lqpn,
+                 u32 rqpn, u8 svc_type);
+void return_cnp(struct hfi1_ibport *ibp, struct hfi1_qp *qp, u32 remote_qpn,
+               u32 pkey, u32 slid, u32 dlid, u8 sc5,
+               const struct ib_grh *old_grh);
+
+#define PACKET_EGRESS_TIMEOUT 350
+static inline void pause_for_credit_return(struct hfi1_devdata *dd)
+{
+       /* Pause at least 1us, to ensure chip returns all credits */
+       u32 usec = cclock_to_ns(dd, PACKET_EGRESS_TIMEOUT) / 1000;
+
+       udelay(usec ? usec : 1);
+}
+
+/**
+ * sc_to_vlt() - reverse lookup sc to vl
+ * @dd: devdata
+ * @sc5: 5 bit sc
+ */
+static inline u8 sc_to_vlt(struct hfi1_devdata *dd, u8 sc5)
+{
+       unsigned seq;
+       u8 rval;
+
+       if (sc5 >= OPA_MAX_SCS)
+               return (u8)(0xff);
+
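+       /* seqlock read side: retry if sc2vl changed while we were reading */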
+       do {
+               seq = read_seqbegin(&dd->sc2vl_lock);
+               rval = *(((u8 *)dd->sc2vl) + sc5);
+       } while (read_seqretry(&dd->sc2vl_lock, seq));
+
+       return rval;
+}
+
+#define PKEY_MEMBER_MASK 0x8000
+#define PKEY_LOW_15_MASK 0x7fff
+
+/*
+ * ingress_pkey_matches_entry - return 1 if the pkey matches ent (ent
+ * being an entry from the ingress partition key table), return 0
+ * otherwise. Use the matching criteria for ingress partition keys
+ * specified in the OPAv1 spec., section 9.10.14.
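+ * For example, limited-member pkey 0x0001 matches full-member table
+ * entry 0x8001 but not limited-member entry 0x0001, while full-member
+ * pkey 0x8001 matches either.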
+ */
+static inline int ingress_pkey_matches_entry(u16 pkey, u16 ent)
+{
+       u16 mkey = pkey & PKEY_LOW_15_MASK;
+       u16 ment = ent & PKEY_LOW_15_MASK;
+
+       if (mkey == ment) {
+               /*
+                * If pkey[15] is clear (limited partition member),
+                * the match is valid only if bit 15 in the
+                * corresponding table element is set (full member).
+                */
+               if (!(pkey & PKEY_MEMBER_MASK))
+                       return !!(ent & PKEY_MEMBER_MASK);
+               return 1;
+       }
+       return 0;
+}
+
+/*
+ * ingress_pkey_table_search - search the entire pkey table for
+ * an entry which matches 'pkey'. return 0 if a match is found,
+ * and 1 otherwise.
+ */
+static int ingress_pkey_table_search(struct hfi1_pportdata *ppd, u16 pkey)
+{
+       int i;
+
+       for (i = 0; i < MAX_PKEY_VALUES; i++) {
+               if (ingress_pkey_matches_entry(pkey, ppd->pkeys[i]))
+                       return 0;
+       }
+       return 1;
+}
+
+/*
+ * ingress_pkey_table_fail - record a failure of ingress pkey validation,
+ * i.e., increment port_rcv_constraint_errors for the port, and record
+ * the 'error info' for this failure.
+ */
+static void ingress_pkey_table_fail(struct hfi1_pportdata *ppd, u16 pkey,
+                                   u16 slid)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+
+       incr_cntr64(&ppd->port_rcv_constraint_errors);
+       if (!(dd->err_info_rcv_constraint.status & OPA_EI_STATUS_SMASK)) {
+               dd->err_info_rcv_constraint.status |= OPA_EI_STATUS_SMASK;
+               dd->err_info_rcv_constraint.slid = slid;
+               dd->err_info_rcv_constraint.pkey = pkey;
+       }
+}
+
+/*
+ * ingress_pkey_check - Return 0 if the ingress pkey is valid, return 1
+ * otherwise. Use the criteria in the OPAv1 spec, section 9.10.14. idx
+ * is a hint as to the best place in the partition key table to begin
+ * searching. This function should not be called on the data path, for
+ * performance reasons; there the pkey check is expected to be done by
+ * hardware, and rcv_pkey_check() should be called instead.
+ */
+static inline int ingress_pkey_check(struct hfi1_pportdata *ppd, u16 pkey,
+                                    u8 sc5, u8 idx, u16 slid)
+{
+       if (!(ppd->part_enforce & HFI1_PART_ENFORCE_IN))
+               return 0;
+
+       /* If SC15, pkey[0:14] must be 0x7fff */
+       if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK))
+               goto bad;
+
+       /* Is the pkey = 0x0, or 0x8000? */
+       if ((pkey & PKEY_LOW_15_MASK) == 0)
+               goto bad;
+
+       /* The most likely matching pkey has index 'idx' */
+       if (ingress_pkey_matches_entry(pkey, ppd->pkeys[idx]))
+               return 0;
+
+       /* no match - try the whole table */
+       if (!ingress_pkey_table_search(ppd, pkey))
+               return 0;
+
+bad:
+       ingress_pkey_table_fail(ppd, pkey, slid);
+       return 1;
+}
+
+/*
+ * rcv_pkey_check - Return 0 if the ingress pkey is valid, return 1
+ * otherwise. It only ensures the pkey is valid for QP0. This function
+ * should be called on the data path instead of ingress_pkey_check
+ * as on data path, pkey check is done by HW (except for QP0).
+ */
+static inline int rcv_pkey_check(struct hfi1_pportdata *ppd, u16 pkey,
+                                u8 sc5, u16 slid)
+{
+       if (!(ppd->part_enforce & HFI1_PART_ENFORCE_IN))
+               return 0;
+
+       /* If SC15, pkey[0:14] must be 0x7fff */
+       if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK))
+               goto bad;
+
+       return 0;
+bad:
+       ingress_pkey_table_fail(ppd, pkey, slid);
+       return 1;
+}
+
+/* MTU handling */
+
+/* MTU enumeration, 256-4k match IB */
+#define OPA_MTU_0     0
+#define OPA_MTU_256   1
+#define OPA_MTU_512   2
+#define OPA_MTU_1024  3
+#define OPA_MTU_2048  4
+#define OPA_MTU_4096  5
+
+u32 lrh_max_header_bytes(struct hfi1_devdata *dd);
+int mtu_to_enum(u32 mtu, int default_if_bad);
+u16 enum_to_mtu(int);
+static inline int valid_ib_mtu(unsigned int mtu)
+{
+       return mtu == 256 || mtu == 512 ||
+               mtu == 1024 || mtu == 2048 ||
+               mtu == 4096;
+}
+static inline int valid_opa_max_mtu(unsigned int mtu)
+{
+       return mtu >= 2048 &&
+               (valid_ib_mtu(mtu) || mtu == 8192 || mtu == 10240);
+}
+
+int set_mtu(struct hfi1_pportdata *);
+
+int hfi1_set_lid(struct hfi1_pportdata *, u32, u8);
+void hfi1_disable_after_error(struct hfi1_devdata *);
+int hfi1_set_uevent_bits(struct hfi1_pportdata *, const int);
+int hfi1_rcvbuf_validate(u32, u8, u16 *);
+
+int fm_get_table(struct hfi1_pportdata *, int, void *);
+int fm_set_table(struct hfi1_pportdata *, int, void *);
+
+void set_up_vl15(struct hfi1_devdata *dd, u8 vau, u16 vl15buf);
+void reset_link_credits(struct hfi1_devdata *dd);
+void assign_remote_cm_au_table(struct hfi1_devdata *dd, u8 vcu);
+
+int snoop_recv_handler(struct hfi1_packet *packet);
+int snoop_send_dma_handler(struct hfi1_qp *qp, struct ahg_ib_header *ibhdr,
+                          u32 hdrwords, struct hfi1_sge_state *ss, u32 len,
+                          u32 plen, u32 dwords, u64 pbc);
+int snoop_send_pio_handler(struct hfi1_qp *qp, struct ahg_ib_header *ibhdr,
+                          u32 hdrwords, struct hfi1_sge_state *ss, u32 len,
+                          u32 plen, u32 dwords, u64 pbc);
+void snoop_inline_pio_send(struct hfi1_devdata *dd, struct pio_buf *pbuf,
+                          u64 pbc, const void *from, size_t count);
+
+/* for use in system calls, where we want to know device type, etc. */
+#define ctxt_fp(fp) \
+       (((struct hfi1_filedata *)(fp)->private_data)->uctxt)
+#define subctxt_fp(fp) \
+       (((struct hfi1_filedata *)(fp)->private_data)->subctxt)
+#define tidcursor_fp(fp) \
+       (((struct hfi1_filedata *)(fp)->private_data)->tidcursor)
+#define user_sdma_pkt_fp(fp) \
+       (((struct hfi1_filedata *)(fp)->private_data)->pq)
+#define user_sdma_comp_fp(fp) \
+       (((struct hfi1_filedata *)(fp)->private_data)->cq)
+
+static inline struct hfi1_devdata *dd_from_ppd(struct hfi1_pportdata *ppd)
+{
+       return ppd->dd;
+}
+
+static inline struct hfi1_devdata *dd_from_dev(struct hfi1_ibdev *dev)
+{
+       return container_of(dev, struct hfi1_devdata, verbs_dev);
+}
+
+static inline struct hfi1_devdata *dd_from_ibdev(struct ib_device *ibdev)
+{
+       return dd_from_dev(to_idev(ibdev));
+}
+
+static inline struct hfi1_pportdata *ppd_from_ibp(struct hfi1_ibport *ibp)
+{
+       return container_of(ibp, struct hfi1_pportdata, ibport_data);
+}
+
+static inline struct hfi1_ibport *to_iport(struct ib_device *ibdev, u8 port)
+{
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       unsigned pidx = port - 1; /* IB numbers ports from 1, hdw from 0 */
+
+       WARN_ON(pidx >= dd->num_pports);
+       return &dd->pport[pidx].ibport_data;
+}
+
+/*
+ * Return the indexed PKEY from the port PKEY table.
+ */
+static inline u16 hfi1_get_pkey(struct hfi1_ibport *ibp, unsigned index)
+{
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       u16 ret;
+
+       if (index >= ARRAY_SIZE(ppd->pkeys))
+               ret = 0;
+       else
+               ret = ppd->pkeys[index];
+
+       return ret;
+}
+
+/*
+ * Readers of cc_state must call get_cc_state() under rcu_read_lock().
+ * Writers of cc_state must call get_cc_state() under cc_state_lock.
+ */
+static inline struct cc_state *get_cc_state(struct hfi1_pportdata *ppd)
+{
+       return rcu_dereference(ppd->cc_state);
+}
+
+/*
+ * values for dd->flags (_device_ related flags)
+ */
+#define HFI1_INITTED           0x1    /* chip and driver up and initted */
+#define HFI1_PRESENT           0x2    /* chip accesses can be done */
+#define HFI1_FROZEN            0x4    /* chip in SPC freeze */
+#define HFI1_HAS_SDMA_TIMEOUT  0x8
+#define HFI1_HAS_SEND_DMA      0x10   /* Supports Send DMA */
+#define HFI1_FORCED_FREEZE     0x80   /* driver forced freeze mode */
+#define HFI1_DO_INIT_ASIC      0x100  /* This device will init the ASIC */
+
+/* IB dword length mask in PBC (lower 11 bits); same for all chips */
+#define HFI1_PBC_LENGTH_MASK                     ((1 << 11) - 1)
+
+/* ctxt_flag bit offsets */
+               /* context has been setup */
+#define HFI1_CTXT_SETUP_DONE 1
+               /* waiting for a packet to arrive */
+#define HFI1_CTXT_WAITING_RCV   2
+               /* master has not finished initializing */
+#define HFI1_CTXT_MASTER_UNINIT 4
+               /* waiting for an urgent packet to arrive */
+#define HFI1_CTXT_WAITING_URG 5
+
+/* free up any allocated data at close */
+struct hfi1_devdata *hfi1_init_dd(struct pci_dev *,
+                                 const struct pci_device_id *);
+void hfi1_free_devdata(struct hfi1_devdata *);
+void cc_state_reclaim(struct rcu_head *rcu);
+struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra);
+
+/*
+ * Set LED override, only the two LSBs have "public" meaning, but
+ * any non-zero value substitutes them for the Link and LinkTrain
+ * LED states.
+ */
+#define HFI1_LED_PHYS 1 /* Physical (linktraining) GREEN LED */
+#define HFI1_LED_LOG 2  /* Logical (link) YELLOW LED */
+void hfi1_set_led_override(struct hfi1_pportdata *ppd, unsigned int val);
+
+#define HFI1_CREDIT_RETURN_RATE (100)
+
+/*
+ * The number of words for the KDETH protocol field.  If this is
+ * larger than the actual field used, then part of the payload
+ * will be in the header.
+ *
+ * Optimally, we want this sized so that a typical case will
+ * use full cache lines.  The typical local KDETH header would
+ * be:
+ *
+ *     Bytes   Field
+ *       8     LRH
+ *      12     BTH
+ *      ??     KDETH
+ *       8     RHF
+ *     ---
+ *      28 + KDETH
+ *
+ * For a 64-byte cache line, KDETH would need to be 36 bytes or 9 DWORDS
+ */
+#define DEFAULT_RCVHDRSIZE 9
+
+/*
+ * Maximal header byte count:
+ *
+ *     Bytes   Field
+ *       8     LRH
+ *      40     GRH (optional)
+ *      12     BTH
+ *      ??     KDETH
+ *       8     RHF
+ *     ---
+ *      68 + KDETH
+ *
+ * We also want to maintain a cache line alignment to assist DMA'ing
+ * of the header bytes.  Round up to a good size.
+ */
+#define DEFAULT_RCVHDR_ENTSIZE 32
+
+int hfi1_get_user_pages(unsigned long, size_t, struct page **);
+void hfi1_release_user_pages(struct page **, size_t);
+
+static inline void clear_rcvhdrtail(const struct hfi1_ctxtdata *rcd)
+{
+       *((u64 *) rcd->rcvhdrtail_kvaddr) = 0ULL;
+}
+
+static inline u32 get_rcvhdrtail(const struct hfi1_ctxtdata *rcd)
+{
+       /*
+        * volatile because it's a DMA target from the chip, the routine is
+        * inlined, and we don't want register caching or reordering.
+        */
+       return (u32) le64_to_cpu(*rcd->rcvhdrtail_kvaddr);
+}
+
+/*
+ * sysfs interface.
+ */
+
+extern const char ib_hfi1_version[];
+
+int hfi1_device_create(struct hfi1_devdata *);
+void hfi1_device_remove(struct hfi1_devdata *);
+
+int hfi1_create_port_files(struct ib_device *ibdev, u8 port_num,
+                          struct kobject *kobj);
+int hfi1_verbs_register_sysfs(struct hfi1_devdata *);
+void hfi1_verbs_unregister_sysfs(struct hfi1_devdata *);
+/* Hook for sysfs read of QSFP */
+int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len);
+
+int hfi1_pcie_init(struct pci_dev *, const struct pci_device_id *);
+void hfi1_pcie_cleanup(struct pci_dev *);
+int hfi1_pcie_ddinit(struct hfi1_devdata *, struct pci_dev *,
+                    const struct pci_device_id *);
+void hfi1_pcie_ddcleanup(struct hfi1_devdata *);
+void hfi1_pcie_flr(struct hfi1_devdata *);
+int pcie_speeds(struct hfi1_devdata *);
+void request_msix(struct hfi1_devdata *, u32 *, struct hfi1_msix_entry *);
+void hfi1_enable_intx(struct pci_dev *);
+void hfi1_nomsix(struct hfi1_devdata *);
+void restore_pci_variables(struct hfi1_devdata *dd);
+int do_pcie_gen3_transition(struct hfi1_devdata *dd);
+int parse_platform_config(struct hfi1_devdata *dd);
+int get_platform_config_field(struct hfi1_devdata *dd,
+                       enum platform_config_table_type_encoding table_type,
+                       int table_index, int field_index, u32 *data, u32 len);
+
+dma_addr_t hfi1_map_page(struct pci_dev *, struct page *, unsigned long,
+                        size_t, int);
+const char *get_unit_name(int unit);
+
+/*
+ * Flush write combining store buffers (if present) and perform a write
+ * barrier.
+ */
+static inline void flush_wc(void)
+{
+       asm volatile("sfence" : : : "memory");
+}
+
+void handle_eflags(struct hfi1_packet *packet);
+int process_receive_ib(struct hfi1_packet *packet);
+int process_receive_bypass(struct hfi1_packet *packet);
+int process_receive_error(struct hfi1_packet *packet);
+int kdeth_process_expected(struct hfi1_packet *packet);
+int kdeth_process_eager(struct hfi1_packet *packet);
+int process_receive_invalid(struct hfi1_packet *packet);
+
+extern rhf_rcv_function_ptr snoop_rhf_rcv_functions[8];
+
+void update_sge(struct hfi1_sge_state *ss, u32 length);
+
+/* global module parameter variables */
+extern unsigned int hfi1_max_mtu;
+extern unsigned int hfi1_cu;
+extern unsigned int user_credit_return_threshold;
+extern uint num_rcv_contexts;
+extern unsigned n_krcvqs;
+extern u8 krcvqs[];
+extern int krcvqsset;
+extern uint kdeth_qp;
+extern uint loopback;
+extern uint quick_linkup;
+extern uint rcv_intr_timeout;
+extern uint rcv_intr_count;
+extern uint rcv_intr_dynamic;
+extern ushort link_crc_mask;
+
+extern struct mutex hfi1_mutex;
+
+/* Number of seconds before our card status check...  */
+#define STATUS_TIMEOUT 60
+
+#define DRIVER_NAME            "hfi1"
+#define HFI1_USER_MINOR_BASE     0
+#define HFI1_TRACE_MINOR         127
+#define HFI1_DIAGPKT_MINOR       128
+#define HFI1_DIAG_MINOR_BASE     129
+#define HFI1_SNOOP_CAPTURE_BASE  200
+#define HFI1_NMINORS             255
+
+#define PCI_VENDOR_ID_INTEL 0x8086
+#define PCI_DEVICE_ID_INTEL0 0x24f0
+#define PCI_DEVICE_ID_INTEL1 0x24f1
+
+#define HFI1_PKT_USER_SC_INTEGRITY                                         \
+       (SEND_CTXT_CHECK_ENABLE_DISALLOW_NON_KDETH_PACKETS_SMASK            \
+       | SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_SMASK              \
+       | SEND_CTXT_CHECK_ENABLE_DISALLOW_GRH_SMASK)
+
+#define HFI1_PKT_KERNEL_SC_INTEGRITY                                       \
+       (SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK)
+
+static inline u64 hfi1_pkt_default_send_ctxt_mask(struct hfi1_devdata *dd,
+                                                 u16 ctxt_type)
+{
+       u64 base_sc_integrity =
+       SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK
+       | SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK
+       | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK
+       | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK
+       | SEND_CTXT_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK
+       | SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_TEST_SMASK
+       | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK
+       | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK
+       | SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK
+       | SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_SMASK
+       | SEND_CTXT_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK
+       | SEND_CTXT_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK
+       | SEND_CTXT_CHECK_ENABLE_CHECK_OPCODE_SMASK
+       | SEND_CTXT_CHECK_ENABLE_CHECK_SLID_SMASK
+       | SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK
+       | SEND_CTXT_CHECK_ENABLE_CHECK_VL_SMASK
+       | SEND_CTXT_CHECK_ENABLE_CHECK_ENABLE_SMASK;
+
+       if (ctxt_type == SC_USER)
+               base_sc_integrity |= HFI1_PKT_USER_SC_INTEGRITY;
+       else
+               base_sc_integrity |= HFI1_PKT_KERNEL_SC_INTEGRITY;
+
+       if (is_a0(dd))
+               /* turn off send-side job key checks - A0 erratum */
+               return base_sc_integrity &
+                      ~SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK;
+       return base_sc_integrity;
+}
+
+static inline u64 hfi1_pkt_base_sdma_integrity(struct hfi1_devdata *dd)
+{
+       u64 base_sdma_integrity =
+       SEND_DMA_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK
+       | SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK
+       | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK
+       | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK
+       | SEND_DMA_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK
+       | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK
+       | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK
+       | SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK
+       | SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_SMASK
+       | SEND_DMA_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK
+       | SEND_DMA_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK
+       | SEND_DMA_CHECK_ENABLE_CHECK_OPCODE_SMASK
+       | SEND_DMA_CHECK_ENABLE_CHECK_SLID_SMASK
+       | SEND_DMA_CHECK_ENABLE_CHECK_JOB_KEY_SMASK
+       | SEND_DMA_CHECK_ENABLE_CHECK_VL_SMASK
+       | SEND_DMA_CHECK_ENABLE_CHECK_ENABLE_SMASK;
+
+       if (is_a0(dd))
+               /* turn off send-side job key checks - A0 erratum */
+               return base_sdma_integrity &
+                      ~SEND_DMA_CHECK_ENABLE_CHECK_JOB_KEY_SMASK;
+       return base_sdma_integrity;
+}
+
+/*
+ * hfi1_early_err is used (only!) to print early errors before devdata is
+ * allocated, or when dd->pcidev may not be valid, and at the tail end of
+ * cleanup when devdata may have been freed, etc.  hfi1_dev_porterr is
+ * the same as dd_dev_err, but is used when the message really needs
+ * the IB port# to be definitive as to what's happening.
+ */
+#define hfi1_early_err(dev, fmt, ...) \
+       dev_err(dev, fmt, ##__VA_ARGS__)
+
+#define hfi1_early_info(dev, fmt, ...) \
+       dev_info(dev, fmt, ##__VA_ARGS__)
+
+#define dd_dev_emerg(dd, fmt, ...) \
+       dev_emerg(&(dd)->pcidev->dev, "%s: " fmt, \
+                 get_unit_name((dd)->unit), ##__VA_ARGS__)
+#define dd_dev_err(dd, fmt, ...) \
+       dev_err(&(dd)->pcidev->dev, "%s: " fmt, \
+                       get_unit_name((dd)->unit), ##__VA_ARGS__)
+#define dd_dev_warn(dd, fmt, ...) \
+       dev_warn(&(dd)->pcidev->dev, "%s: " fmt, \
+                       get_unit_name((dd)->unit), ##__VA_ARGS__)
+
+#define dd_dev_warn_ratelimited(dd, fmt, ...) \
+       dev_warn_ratelimited(&(dd)->pcidev->dev, "%s: " fmt, \
+                       get_unit_name((dd)->unit), ##__VA_ARGS__)
+
+#define dd_dev_info(dd, fmt, ...) \
+       dev_info(&(dd)->pcidev->dev, "%s: " fmt, \
+                       get_unit_name((dd)->unit), ##__VA_ARGS__)
+
+#define hfi1_dev_porterr(dd, port, fmt, ...) \
+       dev_err(&(dd)->pcidev->dev, "%s: IB%u:%u " fmt, \
+                       get_unit_name((dd)->unit), (dd)->unit, (port), \
+                       ##__VA_ARGS__)
+
+/*
+ * this is used for formatting hw error messages...
+ */
+struct hfi1_hwerror_msgs {
+       u64 mask;
+       const char *msg;
+       size_t sz;
+};
+
+/* in intr.c... */
+void hfi1_format_hwerrors(u64 hwerrs,
+                         const struct hfi1_hwerror_msgs *hwerrmsgs,
+                         size_t nhwerrmsgs, char *msg, size_t lmsg);
+
+#define USER_OPCODE_CHECK_VAL 0xC0
+#define USER_OPCODE_CHECK_MASK 0xC0
+#define OPCODE_CHECK_VAL_DISABLED 0x0
+#define OPCODE_CHECK_MASK_DISABLED 0x0
+
+static inline void hfi1_reset_cpu_counters(struct hfi1_devdata *dd)
+{
+       struct hfi1_pportdata *ppd;
+       int i;
+
+       dd->z_int_counter = get_all_cpu_total(dd->int_counter);
+       dd->z_rcv_limit = get_all_cpu_total(dd->rcv_limit);
+
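+       /* the per-port structs are allocated immediately after the devdata */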
+       ppd = (struct hfi1_pportdata *)(dd + 1);
+       for (i = 0; i < dd->num_pports; i++, ppd++) {
+               ppd->ibport_data.z_rc_acks =
+                       get_all_cpu_total(ppd->ibport_data.rc_acks);
+               ppd->ibport_data.z_rc_qacks =
+                       get_all_cpu_total(ppd->ibport_data.rc_qacks);
+       }
+}
+
+/* Control LED state */
+static inline void setextled(struct hfi1_devdata *dd, u32 on)
+{
+       if (on)
+               write_csr(dd, DCC_CFG_LED_CNTRL, 0x1F);
+       else
+               write_csr(dd, DCC_CFG_LED_CNTRL, 0x10);
+}
+
+int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp);
+
+#endif                          /* _HFI1_KERNEL_H */
diff --git a/drivers/staging/rdma/hfi1/init.c b/drivers/staging/rdma/hfi1/init.c
new file mode 100644 (file)
index 0000000..a877eda
--- /dev/null
@@ -0,0 +1,1722 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/pci.h>
+#include <linux/netdevice.h>
+#include <linux/vmalloc.h>
+#include <linux/delay.h>
+#include <linux/idr.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/hrtimer.h>
+
+#include "hfi.h"
+#include "device.h"
+#include "common.h"
+#include "mad.h"
+#include "sdma.h"
+#include "debugfs.h"
+#include "verbs.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) DRIVER_NAME ": " fmt
+
+/*
+ * min buffers we want to have per context, after driver
+ */
+#define HFI1_MIN_USER_CTXT_BUFCNT 7
+
+#define HFI1_MIN_HDRQ_EGRBUF_CNT 2
+#define HFI1_MIN_EAGER_BUFFER_SIZE (4 * 1024) /* 4KB */
+#define HFI1_MAX_EAGER_BUFFER_SIZE (256 * 1024) /* 256KB */
+
+/*
+ * Number of user receive contexts we are configured to use (to allow for more
+ * pio buffers per ctxt, etc.).  Zero means use one user context per CPU.
+ */
+uint num_rcv_contexts;
+module_param_named(num_rcv_contexts, num_rcv_contexts, uint, S_IRUGO);
+MODULE_PARM_DESC(
+       num_rcv_contexts, "Set max number of user receive contexts to use");
+
+u8 krcvqs[RXE_NUM_DATA_VL];
+int krcvqsset;
+module_param_array(krcvqs, byte, &krcvqsset, S_IRUGO);
+MODULE_PARM_DESC(krcvqs, "Array of the number of kernel receive queues by VL");
+
+/* computed based on above array */
+unsigned n_krcvqs;
+
+static unsigned hfi1_rcvarr_split = 25;
+module_param_named(rcvarr_split, hfi1_rcvarr_split, uint, S_IRUGO);
+MODULE_PARM_DESC(rcvarr_split, "Percent of context's RcvArray entries used for Eager buffers");
+
+static uint eager_buffer_size = (2 << 20); /* 2MB */
+module_param(eager_buffer_size, uint, S_IRUGO);
+MODULE_PARM_DESC(eager_buffer_size, "Size of the eager buffers, default: 2MB");
+
+static uint rcvhdrcnt = 2048; /* 2x the max eager buffer count */
+module_param_named(rcvhdrcnt, rcvhdrcnt, uint, S_IRUGO);
+MODULE_PARM_DESC(rcvhdrcnt, "Receive header queue count (default 2048)");
+
+static uint hfi1_hdrq_entsize = 32;
+module_param_named(hdrq_entsize, hfi1_hdrq_entsize, uint, S_IRUGO);
+MODULE_PARM_DESC(hdrq_entsize, "Size of header queue entries: 2 - 8B, 16 - 64B, 32 - 128B (default)");
+
+unsigned int user_credit_return_threshold = 33;        /* default is 33% */
+module_param(user_credit_return_threshold, uint, S_IRUGO);
+MODULE_PARM_DESC(user_credit_return_threshold, "Credit return threshold for user send contexts, return when unreturned credits pass this many blocks (in percent of allocated blocks, 0 is off)");
+
+static inline u64 encode_rcv_header_entry_size(u16);
+
+static struct idr hfi1_unit_table;
+u32 hfi1_cpulist_count;
+unsigned long *hfi1_cpulist;
+
+/*
+ * Common code for creating the receive context array.
+ */
+int hfi1_create_ctxts(struct hfi1_devdata *dd)
+{
+       unsigned i;
+       int ret;
+       int local_node_id = pcibus_to_node(dd->pcidev->bus);
+
+       if (local_node_id < 0)
+               local_node_id = numa_node_id();
+       dd->assigned_node_id = local_node_id;
+
+       dd->rcd = kcalloc(dd->num_rcv_contexts, sizeof(*dd->rcd), GFP_KERNEL);
+       if (!dd->rcd) {
+               dd_dev_err(dd,
+                       "Unable to allocate receive context array, failing\n");
+               goto nomem;
+       }
+
+       /* create one or more kernel contexts */
+       for (i = 0; i < dd->first_user_ctxt; ++i) {
+               struct hfi1_pportdata *ppd;
+               struct hfi1_ctxtdata *rcd;
+
+               ppd = dd->pport + (i % dd->num_pports);
+               rcd = hfi1_create_ctxtdata(ppd, i);
+               if (!rcd) {
+                       dd_dev_err(dd,
+                               "Unable to allocate kernel receive context, failing\n");
+                       goto nomem;
+               }
+               /*
+                * Set up the kernel context flags here and now because they
+                * use default values for all receive side memories.  User
+                * contexts will be handled as they are created.
+                */
+               rcd->flags = HFI1_CAP_KGET(MULTI_PKT_EGR) |
+                       HFI1_CAP_KGET(NODROP_RHQ_FULL) |
+                       HFI1_CAP_KGET(NODROP_EGR_FULL) |
+                       HFI1_CAP_KGET(DMA_RTAIL);
+               rcd->seq_cnt = 1;
+
+               rcd->sc = sc_alloc(dd, SC_ACK, rcd->rcvhdrqentsize, dd->node);
+               if (!rcd->sc) {
+                       dd_dev_err(dd,
+                               "Unable to allocate kernel send context, failing\n");
+                       dd->rcd[rcd->ctxt] = NULL;
+                       hfi1_free_ctxtdata(dd, rcd);
+                       goto nomem;
+               }
+
+               ret = hfi1_init_ctxt(rcd->sc);
+               if (ret < 0) {
+                       dd_dev_err(dd,
+                                  "Failed to setup kernel receive context, failing\n");
+                       sc_free(rcd->sc);
+                       dd->rcd[rcd->ctxt] = NULL;
+                       hfi1_free_ctxtdata(dd, rcd);
+                       ret = -EFAULT;
+                       goto bail;
+               }
+       }
+
+       return 0;
+nomem:
+       ret = -ENOMEM;
+bail:
+       kfree(dd->rcd);
+       dd->rcd = NULL;
+       return ret;
+}
+
+/*
+ * Common code for user and kernel context setup.
+ */
+struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, u32 ctxt)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       struct hfi1_ctxtdata *rcd;
+       unsigned kctxt_ngroups = 0;
+       u32 base;
+
+       if (dd->rcv_entries.nctxt_extra >
+           dd->num_rcv_contexts - dd->first_user_ctxt)
+               kctxt_ngroups = (dd->rcv_entries.nctxt_extra -
+                                (dd->num_rcv_contexts - dd->first_user_ctxt));
+       rcd = kzalloc(sizeof(*rcd), GFP_KERNEL);
+       if (rcd) {
+               u32 rcvtids, max_entries;
+
+               dd_dev_info(dd, "%s: setting up context %u\n", __func__, ctxt);
+
+               INIT_LIST_HEAD(&rcd->qp_wait_list);
+               rcd->ppd = ppd;
+               rcd->dd = dd;
+               rcd->cnt = 1;
+               rcd->ctxt = ctxt;
+               dd->rcd[ctxt] = rcd;
+               rcd->numa_id = numa_node_id();
+               rcd->rcv_array_groups = dd->rcv_entries.ngroups;
+
+               spin_lock_init(&rcd->exp_lock);
+
+               /*
+                * Calculate the context's RcvArray entry starting point.
+                * We do this here because we have to take into account all
+                * the RcvArray entries that previous contexts would have
+                * taken and we have to account for any extra groups
+                * assigned to the kernel or user contexts.
+                */
+               if (ctxt < dd->first_user_ctxt) {
+                       if (ctxt < kctxt_ngroups) {
+                               base = ctxt * (dd->rcv_entries.ngroups + 1);
+                               rcd->rcv_array_groups++;
+                       } else
+                               base = kctxt_ngroups +
+                                       (ctxt * dd->rcv_entries.ngroups);
+               } else {
+                       u16 ct = ctxt - dd->first_user_ctxt;
+
+                       base = ((dd->n_krcv_queues * dd->rcv_entries.ngroups) +
+                               kctxt_ngroups);
+                       if (ct < dd->rcv_entries.nctxt_extra) {
+                               base += ct * (dd->rcv_entries.ngroups + 1);
+                               rcd->rcv_array_groups++;
+                       } else
+                               base += dd->rcv_entries.nctxt_extra +
+                                       (ct * dd->rcv_entries.ngroups);
+               }
+               rcd->eager_base = base * dd->rcv_entries.group_size;
+
+               /* Validate and initialize Rcv Hdr Q variables */
+               if (rcvhdrcnt % HDRQ_INCREMENT) {
+                       dd_dev_err(dd,
+                                  "ctxt%u: header queue count %d must be divisible by %d\n",
+                                  rcd->ctxt, rcvhdrcnt, HDRQ_INCREMENT);
+                       goto bail;
+               }
+               rcd->rcvhdrq_cnt = rcvhdrcnt;
+               rcd->rcvhdrqentsize = hfi1_hdrq_entsize;
+               /*
+                * Simple Eager buffer allocation: we have already pre-allocated
+                * the number of RcvArray entry groups. Each ctxtdata structure
+                * holds the number of groups for that context.
+                *
+                * To follow CSR requirements and maintain cacheline alignment,
+                * make sure all sizes and bases are multiples of group_size.
+                *
+                * The expected (TID) entry count is whatever is left after
+                * assigning the eager buffers.
+                */
+               max_entries = rcd->rcv_array_groups *
+                       dd->rcv_entries.group_size;
+               rcvtids = ((max_entries * hfi1_rcvarr_split) / 100);
+               rcd->egrbufs.count = round_down(rcvtids,
+                                               dd->rcv_entries.group_size);
+               if (rcd->egrbufs.count > MAX_EAGER_ENTRIES) {
+                       dd_dev_err(dd, "ctxt%u: requested too many RcvArray entries.\n",
+                                  rcd->ctxt);
+                       rcd->egrbufs.count = MAX_EAGER_ENTRIES;
+               }
+               dd_dev_info(dd, "ctxt%u: max Eager buffer RcvArray entries: %u\n",
+                           rcd->ctxt, rcd->egrbufs.count);
+
+               /*
+                * Allocate array that will hold the eager buffer accounting
+                * data.
+                * This will allocate the maximum possible buffer count based
+                * on the value of the RcvArray split parameter.
+                * The resulting value will be rounded down to the closest
+                * multiple of dd->rcv_entries.group_size.
+                */
+               rcd->egrbufs.buffers = kzalloc(sizeof(*rcd->egrbufs.buffers) *
+                                              rcd->egrbufs.count, GFP_KERNEL);
+               if (!rcd->egrbufs.buffers)
+                       goto bail;
+               rcd->egrbufs.rcvtids = kzalloc(sizeof(*rcd->egrbufs.rcvtids) *
+                                              rcd->egrbufs.count, GFP_KERNEL);
+               if (!rcd->egrbufs.rcvtids)
+                       goto bail;
+               rcd->egrbufs.size = eager_buffer_size;
+               /*
+                * The size of the buffers programmed into the RcvArray
+                * entries needs to be big enough to handle the highest
+                * MTU supported.
+                */
+               if (rcd->egrbufs.size < hfi1_max_mtu) {
+                       rcd->egrbufs.size = __roundup_pow_of_two(hfi1_max_mtu);
+                       dd_dev_info(dd,
+                                   "ctxt%u: eager bufs size too small. Adjusting to %zu\n",
+                                   rcd->ctxt, rcd->egrbufs.size);
+               }
+               rcd->egrbufs.rcvtid_size = HFI1_MAX_EAGER_BUFFER_SIZE;
+
+               if (ctxt < dd->first_user_ctxt) { /* N/A for PSM contexts */
+                       rcd->opstats = kzalloc(sizeof(*rcd->opstats),
+                               GFP_KERNEL);
+                       if (!rcd->opstats) {
+                               dd_dev_err(dd,
+                                          "ctxt%u: Unable to allocate per ctxt stats buffer\n",
+                                          rcd->ctxt);
+                               goto bail;
+                       }
+               }
+       }
+       return rcd;
+bail:
+       kfree(rcd->opstats);
+       kfree(rcd->egrbufs.rcvtids);
+       kfree(rcd->egrbufs.buffers);
+       kfree(rcd);
+       return NULL;
+}
+
+/*
+ * Convert a receive header entry size to the encoding used in the CSR.
+ *
+ * Return zero if the given size is invalid.
+ */
+static inline u64 encode_rcv_header_entry_size(u16 size)
+{
+       /* there are only 3 valid receive header entry sizes */
+       if (size == 2)
+               return 1;
+       if (size == 16)
+               return 2;
+       if (size == 32)
+               return 4;
+       return 0; /* invalid */
+}
+
+/*
+ * Select the largest ccti value over all SLs to determine the inter-
+ * packet gap (IPG) for the link.
+ *
+ * called with cca_timer_lock held (to protect access to cca_timer
+ * array), and rcu_read_lock() (to protect access to cc_state).
+ */
+void set_link_ipg(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       struct cc_state *cc_state;
+       int i;
+       u16 cce, ccti_limit, max_ccti = 0;
+       u16 shift, mult;
+       u64 src;
+       u32 current_egress_rate; /* Mbits /sec */
+       u32 max_pkt_time;
+       /*
+        * max_pkt_time is the maximum packet egress time in units
+        * of the fabric clock period 1/(805 MHz).
+        */
+
+       cc_state = get_cc_state(ppd);
+
+       if (cc_state == NULL)
+               /*
+                * This should _never_ happen - rcu_read_lock() is held,
+                * and set_link_ipg() should not be called if cc_state
+                * is NULL.
+                */
+               return;
+
+       for (i = 0; i < OPA_MAX_SLS; i++) {
+               u16 ccti = ppd->cca_timer[i].ccti;
+
+               if (ccti > max_ccti)
+                       max_ccti = ccti;
+       }
+
+       ccti_limit = cc_state->cct.ccti_limit;
+       if (max_ccti > ccti_limit)
+               max_ccti = ccti_limit;
+
+       cce = cc_state->cct.entries[max_ccti].entry;
+       shift = (cce & 0xc000) >> 14;
+       mult = (cce & 0x3fff);
+
+       current_egress_rate = active_egress_rate(ppd);
+
+       max_pkt_time = egress_cycles(ppd->ibmaxlen, current_egress_rate);
+
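+       /*
+        * The CCT entry encodes a 14-bit multiplier and a 2-bit shift;
+        * scale the worst-case packet egress time by them to get the IPG
+        * value written to the static rate control CSR.
+        */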
+       src = (max_pkt_time >> shift) * mult;
+
+       src &= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SMASK;
+       src <<= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SHIFT;
+
+       write_csr(dd, SEND_STATIC_RATE_CONTROL, src);
+}
+
+static enum hrtimer_restart cca_timer_fn(struct hrtimer *t)
+{
+       struct cca_timer *cca_timer;
+       struct hfi1_pportdata *ppd;
+       int sl;
+       u16 ccti, ccti_timer, ccti_min;
+       struct cc_state *cc_state;
+
+       cca_timer = container_of(t, struct cca_timer, hrtimer);
+       ppd = cca_timer->ppd;
+       sl = cca_timer->sl;
+
+       rcu_read_lock();
+
+       cc_state = get_cc_state(ppd);
+
+       if (cc_state == NULL) {
+               rcu_read_unlock();
+               return HRTIMER_NORESTART;
+       }
+
+       /*
+        * 1) decrement ccti for SL
+        * 2) calculate IPG for link (set_link_ipg())
+        * 3) restart timer, unless ccti is at min value
+        */
+
+       ccti_min = cc_state->cong_setting.entries[sl].ccti_min;
+       ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer;
+
+       spin_lock(&ppd->cca_timer_lock);
+
+       ccti = cca_timer->ccti;
+
+       if (ccti > ccti_min) {
+               cca_timer->ccti--;
+               set_link_ipg(ppd);
+       }
+
+       spin_unlock(&ppd->cca_timer_lock);
+
+       rcu_read_unlock();
+
+       if (ccti > ccti_min) {
+               unsigned long nsec = 1024 * ccti_timer;
+               /* ccti_timer is in units of 1.024 usec */
+               hrtimer_forward_now(t, ns_to_ktime(nsec));
+               return HRTIMER_RESTART;
+       }
+       return HRTIMER_NORESTART;
+}
+
+/*
+ * Common code for initializing the physical port structure.
+ */
+void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd,
+                        struct hfi1_devdata *dd, u8 hw_pidx, u8 port)
+{
+       int i, size;
+       uint default_pkey_idx;
+
+       ppd->dd = dd;
+       ppd->hw_pidx = hw_pidx;
+       ppd->port = port; /* IB port number, not index */
+
+       default_pkey_idx = 1;
+
+       ppd->pkeys[default_pkey_idx] = DEFAULT_P_KEY;
+       if (loopback) {
+               hfi1_early_err(&pdev->dev,
+                              "Faking data partition 0x8001 in idx %u\n",
+                              !default_pkey_idx);
+               ppd->pkeys[!default_pkey_idx] = 0x8001;
+       }
+
+       INIT_WORK(&ppd->link_vc_work, handle_verify_cap);
+       INIT_WORK(&ppd->link_up_work, handle_link_up);
+       INIT_WORK(&ppd->link_down_work, handle_link_down);
+       INIT_WORK(&ppd->freeze_work, handle_freeze);
+       INIT_WORK(&ppd->link_downgrade_work, handle_link_downgrade);
+       INIT_WORK(&ppd->sma_message_work, handle_sma_message);
+       INIT_WORK(&ppd->link_bounce_work, handle_link_bounce);
+       mutex_init(&ppd->hls_lock);
+       spin_lock_init(&ppd->sdma_alllock);
+       spin_lock_init(&ppd->qsfp_info.qsfp_lock);
+
+       ppd->sm_trap_qp = 0x0;
+       ppd->sa_qp = 0x1;
+
+       ppd->hfi1_wq = NULL;
+
+       spin_lock_init(&ppd->cca_timer_lock);
+
+       for (i = 0; i < OPA_MAX_SLS; i++) {
+               hrtimer_init(&ppd->cca_timer[i].hrtimer, CLOCK_MONOTONIC,
+                            HRTIMER_MODE_REL);
+               ppd->cca_timer[i].ppd = ppd;
+               ppd->cca_timer[i].sl = i;
+               ppd->cca_timer[i].ccti = 0;
+               ppd->cca_timer[i].hrtimer.function = cca_timer_fn;
+       }
+
+       ppd->cc_max_table_entries = IB_CC_TABLE_CAP_DEFAULT;
+
+       spin_lock_init(&ppd->cc_state_lock);
+       spin_lock_init(&ppd->cc_log_lock);
+       size = sizeof(struct cc_state);
+       RCU_INIT_POINTER(ppd->cc_state, kzalloc(size, GFP_KERNEL));
+       if (!rcu_dereference(ppd->cc_state))
+               goto bail;
+       return;
+
+bail:
+
+       hfi1_early_err(&pdev->dev,
+                      "Congestion Control Agent disabled for port %d\n", port);
+}
+
+/*
+ * Do initialization for device that is only needed on
+ * first detect, not on resets.
+ */
+static int loadtime_init(struct hfi1_devdata *dd)
+{
+       return 0;
+}
+
+/**
+ * init_after_reset - re-initialize after a reset
+ * @dd: the hfi1_ib device
+ *
+ * Sanity check at least some of the values after reset, and
+ * ensure that no receive or transmit activity is left enabled
+ * (explicitly, in case the reset failed).
+ */
+static int init_after_reset(struct hfi1_devdata *dd)
+{
+       int i;
+
+       /*
+        * Ensure chip does no sends or receives, tail updates, or
+        * pioavail updates while we re-initialize.  This is mostly
+        * for the driver data structures, not chip registers.
+        */
+       for (i = 0; i < dd->num_rcv_contexts; i++)
+               hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS |
+                                 HFI1_RCVCTRL_INTRAVAIL_DIS |
+                                 HFI1_RCVCTRL_TAILUPD_DIS, i);
+       pio_send_control(dd, PSC_GLOBAL_DISABLE);
+       for (i = 0; i < dd->num_send_contexts; i++)
+               sc_disable(dd->send_contexts[i].sc);
+
+       return 0;
+}
+
+static void enable_chip(struct hfi1_devdata *dd)
+{
+       u32 rcvmask;
+       u32 i;
+
+       /* enable PIO send */
+       pio_send_control(dd, PSC_GLOBAL_ENABLE);
+
+       /*
+        * Enable kernel ctxts' receive and receive interrupt.
+        * Other ctxts done as user opens and initializes them.
+        */
+       rcvmask = HFI1_RCVCTRL_CTXT_ENB | HFI1_RCVCTRL_INTRAVAIL_ENB;
+       for (i = 0; i < dd->first_user_ctxt; ++i) {
+               rcvmask |= HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, DMA_RTAIL) ?
+                       HFI1_RCVCTRL_TAILUPD_ENB : HFI1_RCVCTRL_TAILUPD_DIS;
+               if (!HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, MULTI_PKT_EGR))
+                       rcvmask |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB;
+               if (HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, NODROP_RHQ_FULL))
+                       rcvmask |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
+               if (HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, NODROP_EGR_FULL))
+                       rcvmask |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
+               hfi1_rcvctrl(dd, rcvmask, i);
+               sc_enable(dd->rcd[i]->sc);
+       }
+}
+
+/**
+ * create_workqueues - create per port workqueues
+ * @dd: the hfi1_ib device
+ */
+static int create_workqueues(struct hfi1_devdata *dd)
+{
+       int pidx;
+       struct hfi1_pportdata *ppd;
+
+       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+               ppd = dd->pport + pidx;
+               if (!ppd->hfi1_wq) {
+                       char wq_name[8]; /* 3 + 2 + 1 + 1 + 1 */
+
+                       snprintf(wq_name, sizeof(wq_name), "hfi%d_%d",
+                                dd->unit, pidx);
+                       ppd->hfi1_wq =
+                               create_singlethread_workqueue(wq_name);
+                       if (!ppd->hfi1_wq)
+                               goto wq_error;
+               }
+       }
+       return 0;
+wq_error:
+       pr_err("create_singlethread_workqueue failed for port %d\n",
+               pidx + 1);
+       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+               ppd = dd->pport + pidx;
+               if (ppd->hfi1_wq) {
+                       destroy_workqueue(ppd->hfi1_wq);
+                       ppd->hfi1_wq = NULL;
+               }
+       }
+       return -ENOMEM;
+}
+
+/**
+ * hfi1_init - do the actual initialization sequence on the chip
+ * @dd: the hfi1_ib device
+ * @reinit: re-initializing, so don't allocate new memory
+ *
+ * Do the actual initialization sequence on the chip.  This is done
+ * both from the init routine called from the PCI infrastructure, and
+ * when we reset the chip, or detect that it was reset internally,
+ * or it's administratively re-enabled.
+ *
+ * Memory allocation here and in called routines is only done in
+ * the first case (reinit == 0).  We have to be careful, because even
+ * without memory allocation, we need to re-write all the chip registers
+ * TIDs, etc. after the reset or enable has completed.
+ */
+int hfi1_init(struct hfi1_devdata *dd, int reinit)
+{
+       int ret = 0, pidx, lastfail = 0;
+       unsigned i, len;
+       struct hfi1_ctxtdata *rcd;
+       struct hfi1_pportdata *ppd;
+
+       /* Set up recv low level handlers */
+       dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_EXPECTED] =
+                                               kdeth_process_expected;
+       dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_EAGER] =
+                                               kdeth_process_eager;
+       dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_IB] = process_receive_ib;
+       dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_ERROR] =
+                                               process_receive_error;
+       dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_BYPASS] =
+                                               process_receive_bypass;
+       dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID5] =
+                                               process_receive_invalid;
+       dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID6] =
+                                               process_receive_invalid;
+       dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID7] =
+                                               process_receive_invalid;
+       dd->rhf_rcv_function_map = dd->normal_rhf_rcv_functions;
+
+       /* Set up send low level handlers */
+       dd->process_pio_send = hfi1_verbs_send_pio;
+       dd->process_dma_send = hfi1_verbs_send_dma;
+       dd->pio_inline_send = pio_copy;
+
+       if (is_a0(dd)) {
+               atomic_set(&dd->drop_packet, DROP_PACKET_ON);
+               dd->do_drop = 1;
+       } else {
+               atomic_set(&dd->drop_packet, DROP_PACKET_OFF);
+               dd->do_drop = 0;
+       }
+
+       /* make sure the link is not "up" */
+       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+               ppd = dd->pport + pidx;
+               ppd->linkup = 0;
+       }
+
+       if (reinit)
+               ret = init_after_reset(dd);
+       else
+               ret = loadtime_init(dd);
+       if (ret)
+               goto done;
+
+       /* dd->rcd can be NULL if early initialization failed */
+       for (i = 0; dd->rcd && i < dd->first_user_ctxt; ++i) {
+               /*
+                * Set up the (kernel) rcvhdr queue and egr TIDs.  If doing
+                * re-init, the simplest way to handle this is to free
+                * existing, and re-allocate.
+                * Need to re-create rest of ctxt 0 ctxtdata as well.
+                */
+               rcd = dd->rcd[i];
+               if (!rcd)
+                       continue;
+
+               rcd->do_interrupt = &handle_receive_interrupt;
+
+               lastfail = hfi1_create_rcvhdrq(dd, rcd);
+               if (!lastfail)
+                       lastfail = hfi1_setup_eagerbufs(rcd);
+               if (lastfail)
+                       dd_dev_err(dd,
+                               "failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n");
+       }
+       if (lastfail)
+               ret = lastfail;
+
+       /* Allocate enough memory for user event notification. */
+       len = ALIGN(dd->chip_rcv_contexts * HFI1_MAX_SHARED_CTXTS *
+                   sizeof(*dd->events), PAGE_SIZE);
+       dd->events = vmalloc_user(len);
+       if (!dd->events)
+               dd_dev_err(dd, "Failed to allocate user events page\n");
+       /*
+        * Allocate a page for device and port status.
+        * Page will be shared amongst all user processes.
+        */
+       dd->status = vmalloc_user(PAGE_SIZE);
+       if (!dd->status)
+               dd_dev_err(dd, "Failed to allocate dev status page\n");
+       else
+               dd->freezelen = PAGE_SIZE - (sizeof(*dd->status) -
+                                            sizeof(dd->status->freezemsg));
+       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+               ppd = dd->pport + pidx;
+               if (dd->status)
+                       /* Currently, we only have one port */
+                       ppd->statusp = &dd->status->port;
+
+               set_mtu(ppd);
+       }
+
+       /* enable chip even if we have an error, so we can debug cause */
+       enable_chip(dd);
+
+       ret = hfi1_cq_init(dd);
+done:
+       /*
+        * Set status even if port serdes is not initialized
+        * so that diags will work.
+        */
+       if (dd->status)
+               dd->status->dev |= HFI1_STATUS_CHIP_PRESENT |
+                       HFI1_STATUS_INITTED;
+       if (!ret) {
+               /* enable all interrupts from the chip */
+               set_intr_state(dd, 1);
+
+               /* chip is OK for user apps; mark it as initialized */
+               for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+                       ppd = dd->pport + pidx;
+
+                       /*
+                        * Initialize the QSFP if it exists.  This requires
+                        * interrupts to be enabled so we are notified when
+                        * the QSFP completes reset, and it has to be done
+                        * before bringing up the SERDES.
+                        */
+                       init_qsfp(ppd);
+
+                       /*
+                        * Start the SERDES - must be after interrupts are
+                        * enabled so we are notified when the link goes up.
+                        */
+                       lastfail = bringup_serdes(ppd);
+                       if (lastfail)
+                               dd_dev_info(dd,
+                                       "Failed to bring up port %u\n",
+                                       ppd->port);
+
+                       /*
+                        * Set status even if port serdes is not initialized
+                        * so that diags will work.
+                        */
+                       if (ppd->statusp)
+                               *ppd->statusp |= HFI1_STATUS_CHIP_PRESENT |
+                                                       HFI1_STATUS_INITTED;
+                       if (!ppd->link_speed_enabled)
+                               continue;
+               }
+       }
+
+       /* if ret is non-zero, we probably should do some cleanup here... */
+       return ret;
+}
+
+static inline struct hfi1_devdata *__hfi1_lookup(int unit)
+{
+       return idr_find(&hfi1_unit_table, unit);
+}
+
+struct hfi1_devdata *hfi1_lookup(int unit)
+{
+       struct hfi1_devdata *dd;
+       unsigned long flags;
+
+       spin_lock_irqsave(&hfi1_devs_lock, flags);
+       dd = __hfi1_lookup(unit);
+       spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+
+       return dd;
+}
+
+/*
+ * Stop the timers during unit shutdown, or after an error late
+ * in initialization.
+ */
+static void stop_timers(struct hfi1_devdata *dd)
+{
+       struct hfi1_pportdata *ppd;
+       int pidx;
+
+       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+               ppd = dd->pport + pidx;
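+               /* a non-zero .data means the LED override timer was set up */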
+               if (ppd->led_override_timer.data) {
+                       del_timer_sync(&ppd->led_override_timer);
+                       atomic_set(&ppd->led_override_timer_active, 0);
+               }
+       }
+}
+
+/**
+ * shutdown_device - shut down a device
+ * @dd: the hfi1_ib device
+ *
+ * This is called to make the device quiet when we are about to
+ * unload the driver, and also when the device is administratively
+ * disabled.   It does not free any data structures.
+ * Everything it does has to be setup again by hfi1_init(dd, 1)
+ */
+static void shutdown_device(struct hfi1_devdata *dd)
+{
+       struct hfi1_pportdata *ppd;
+       unsigned pidx;
+       int i;
+
+       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+               ppd = dd->pport + pidx;
+
+               ppd->linkup = 0;
+               if (ppd->statusp)
+                       *ppd->statusp &= ~(HFI1_STATUS_IB_CONF |
+                                          HFI1_STATUS_IB_READY);
+       }
+       dd->flags &= ~HFI1_INITTED;
+
+       /* mask interrupts, but not errors */
+       set_intr_state(dd, 0);
+
+       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+               ppd = dd->pport + pidx;
+               for (i = 0; i < dd->num_rcv_contexts; i++)
+                       hfi1_rcvctrl(dd, HFI1_RCVCTRL_TAILUPD_DIS |
+                                         HFI1_RCVCTRL_CTXT_DIS |
+                                         HFI1_RCVCTRL_INTRAVAIL_DIS |
+                                         HFI1_RCVCTRL_PKEY_DIS |
+                                         HFI1_RCVCTRL_ONE_PKT_EGR_DIS, i);
+               /*
+                * Gracefully stop all sends allowing any in progress to
+                * trickle out first.
+                */
+               for (i = 0; i < dd->num_send_contexts; i++)
+                       sc_flush(dd->send_contexts[i].sc);
+       }
+
+       /*
+        * Wait long enough for anything that is going to trickle out to
+        * have actually done so.
+        */
+       udelay(20);
+
+       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+               ppd = dd->pport + pidx;
+
+               /* disable all contexts */
+               for (i = 0; i < dd->num_send_contexts; i++)
+                       sc_disable(dd->send_contexts[i].sc);
+               /* disable the send device */
+               pio_send_control(dd, PSC_GLOBAL_DISABLE);
+
+               /*
+                * Clear SerdesEnable.
+                * We can't count on interrupts since we are stopping.
+                */
+               hfi1_quiet_serdes(ppd);
+
+               if (ppd->hfi1_wq) {
+                       destroy_workqueue(ppd->hfi1_wq);
+                       ppd->hfi1_wq = NULL;
+               }
+       }
+       sdma_exit(dd);
+}
+
+/**
+ * hfi1_free_ctxtdata - free a context's allocated data
+ * @dd: the hfi1_ib device
+ * @rcd: the ctxtdata structure
+ *
+ * free up any allocated data for a context
+ * This should not touch anything that would affect a simultaneous
+ * re-allocation of context data, because it is called after hfi1_mutex
+ * is released (and can be called from reinit as well).
+ * It should never change any chip state, or global driver state.
+ */
+void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
+{
+       unsigned e;
+
+       if (!rcd)
+               return;
+
+       if (rcd->rcvhdrq) {
+               dma_free_coherent(&dd->pcidev->dev, rcd->rcvhdrq_size,
+                                 rcd->rcvhdrq, rcd->rcvhdrq_phys);
+               rcd->rcvhdrq = NULL;
+               if (rcd->rcvhdrtail_kvaddr) {
+                       dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
+                                         (void *)rcd->rcvhdrtail_kvaddr,
+                                         rcd->rcvhdrqtailaddr_phys);
+                       rcd->rcvhdrtail_kvaddr = NULL;
+               }
+       }
+
+       /* all the RcvArray entries should have been cleared by now */
+       kfree(rcd->egrbufs.rcvtids);
+
+       for (e = 0; e < rcd->egrbufs.alloced; e++) {
+               if (rcd->egrbufs.buffers[e].phys)
+                       dma_free_coherent(&dd->pcidev->dev,
+                                         rcd->egrbufs.buffers[e].len,
+                                         rcd->egrbufs.buffers[e].addr,
+                                         rcd->egrbufs.buffers[e].phys);
+       }
+       kfree(rcd->egrbufs.buffers);
+
+       sc_free(rcd->sc);
+       vfree(rcd->physshadow);
+       vfree(rcd->tid_pg_list);
+       vfree(rcd->user_event_mask);
+       vfree(rcd->subctxt_uregbase);
+       vfree(rcd->subctxt_rcvegrbuf);
+       vfree(rcd->subctxt_rcvhdr_base);
+       kfree(rcd->tidusemap);
+       kfree(rcd->opstats);
+       kfree(rcd);
+}
+
+void hfi1_free_devdata(struct hfi1_devdata *dd)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&hfi1_devs_lock, flags);
+       idr_remove(&hfi1_unit_table, dd->unit);
+       list_del(&dd->list);
+       spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+       hfi1_dbg_ibdev_exit(&dd->verbs_dev);
+       rcu_barrier(); /* wait for rcu callbacks to complete */
+       free_percpu(dd->int_counter);
+       free_percpu(dd->rcv_limit);
+       ib_dealloc_device(&dd->verbs_dev.ibdev);
+}
+
+/*
+ * Allocate our primary per-unit data structure.  Must be done via verbs
+ * allocator, because the verbs cleanup process both does cleanup and
+ * free of the data structure.
+ * "extra" is for chip-specific data.
+ *
+ * Use the idr mechanism to get a unit number for this unit.
+ */
+struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra)
+{
+       unsigned long flags;
+       struct hfi1_devdata *dd;
+       int ret;
+
+       dd = (struct hfi1_devdata *)ib_alloc_device(sizeof(*dd) + extra);
+       if (!dd)
+               return ERR_PTR(-ENOMEM);
+       /* extra is * number of ports */
+       dd->num_pports = extra / sizeof(struct hfi1_pportdata);
+       dd->pport = (struct hfi1_pportdata *)(dd + 1);
+
+       INIT_LIST_HEAD(&dd->list);
+       dd->node = dev_to_node(&pdev->dev);
+       if (dd->node < 0)
+               dd->node = 0;
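+
+       /*
+        * idr_preload() pre-allocates idr memory with GFP_KERNEL so that
+        * idr_alloc() below can safely run under the spinlock with
+        * GFP_NOWAIT.
+        */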
+       idr_preload(GFP_KERNEL);
+       spin_lock_irqsave(&hfi1_devs_lock, flags);
+
+       ret = idr_alloc(&hfi1_unit_table, dd, 0, 0, GFP_NOWAIT);
+       if (ret >= 0) {
+               dd->unit = ret;
+               list_add(&dd->list, &hfi1_dev_list);
+       }
+
+       spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+       idr_preload_end();
+
+       if (ret < 0) {
+               hfi1_early_err(&pdev->dev,
+                              "Could not allocate unit ID: error %d\n", -ret);
+               goto bail;
+       }
+       /*
+        * Initialize all locks for the device. This needs to be as early as
+        * possible so locks are usable.
+        */
+       spin_lock_init(&dd->sc_lock);
+       spin_lock_init(&dd->sendctrl_lock);
+       spin_lock_init(&dd->rcvctrl_lock);
+       spin_lock_init(&dd->uctxt_lock);
+       spin_lock_init(&dd->hfi1_diag_trans_lock);
+       spin_lock_init(&dd->sc_init_lock);
+       spin_lock_init(&dd->dc8051_lock);
+       spin_lock_init(&dd->dc8051_memlock);
+       mutex_init(&dd->qsfp_i2c_mutex);
+       seqlock_init(&dd->sc2vl_lock);
+       spin_lock_init(&dd->sde_map_lock);
+       init_waitqueue_head(&dd->event_queue);
+
+       dd->int_counter = alloc_percpu(u64);
+       if (!dd->int_counter) {
+               ret = -ENOMEM;
+               hfi1_early_err(&pdev->dev,
+                              "Could not allocate per-cpu int_counter\n");
+               goto bail;
+       }
+
+       dd->rcv_limit = alloc_percpu(u64);
+       if (!dd->rcv_limit) {
+               ret = -ENOMEM;
+               hfi1_early_err(&pdev->dev,
+                              "Could not allocate per-cpu rcv_limit\n");
+               goto bail;
+       }
+
+       if (!hfi1_cpulist_count) {
+               u32 count = num_online_cpus();
+
+               hfi1_cpulist = kzalloc(BITS_TO_LONGS(count) *
+                                     sizeof(long), GFP_KERNEL);
+               if (hfi1_cpulist)
+                       hfi1_cpulist_count = count;
+               else
+                       hfi1_early_err(
+                       &pdev->dev,
+                       "Could not alloc cpulist info, cpu affinity might be wrong\n");
+       }
+       hfi1_dbg_ibdev_init(&dd->verbs_dev);
+       return dd;
+
+bail:
+       if (!list_empty(&dd->list))
+               list_del_init(&dd->list);
+       ib_dealloc_device(&dd->verbs_dev.ibdev);
+       return ERR_PTR(ret);
+}
+
+/*
+ * Called from freeze mode handlers, and from PCI error
+ * reporting code.  Should be paranoid about state of
+ * system and data structures.
+ */
+void hfi1_disable_after_error(struct hfi1_devdata *dd)
+{
+       if (dd->flags & HFI1_INITTED) {
+               u32 pidx;
+
+               dd->flags &= ~HFI1_INITTED;
+               if (dd->pport)
+                       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+                               struct hfi1_pportdata *ppd;
+
+                               ppd = dd->pport + pidx;
+                               if (dd->flags & HFI1_PRESENT)
+                                       set_link_state(ppd, HLS_DN_DISABLE);
+
+                               if (ppd->statusp)
+                                       *ppd->statusp &= ~HFI1_STATUS_IB_READY;
+                       }
+       }
+
+       /*
+        * Mark as having had an error for driver, and also
+        * for /sys and status word mapped to user programs.
+        * This marks unit as not usable, until reset.
+        */
+       if (dd->status)
+               dd->status->dev |= HFI1_STATUS_HWERROR;
+}
+
+static void remove_one(struct pci_dev *);
+static int init_one(struct pci_dev *, const struct pci_device_id *);
+
+#define DRIVER_LOAD_MSG "Intel " DRIVER_NAME " loaded: "
+#define PFX DRIVER_NAME ": "
+
+static const struct pci_device_id hfi1_pci_tbl[] = {
+       { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL0) },
+       { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL1) },
+       { 0, }
+};
+
+MODULE_DEVICE_TABLE(pci, hfi1_pci_tbl);
+
+static struct pci_driver hfi1_pci_driver = {
+       .name = DRIVER_NAME,
+       .probe = init_one,
+       .remove = remove_one,
+       .id_table = hfi1_pci_tbl,
+       .err_handler = &hfi1_pci_err_handler,
+};
+
+static void __init compute_krcvqs(void)
+{
+       int i;
+
+       for (i = 0; i < krcvqsset; i++)
+               n_krcvqs += krcvqs[i];
+}
+
+/*
+ * Do all the generic driver unit- and chip-independent memory
+ * allocation and initialization.
+ */
+static int __init hfi1_mod_init(void)
+{
+       int ret;
+
+       ret = dev_init();
+       if (ret)
+               goto bail;
+
+       /* validate max MTU before any devices start */
+       if (!valid_opa_max_mtu(hfi1_max_mtu)) {
+               pr_err("Invalid max_mtu 0x%x, using 0x%x instead\n",
+                      hfi1_max_mtu, HFI1_DEFAULT_MAX_MTU);
+               hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU;
+       }
+       /* valid CUs run from 1-128 in powers of 2 */
+       if (hfi1_cu > 128 || !is_power_of_2(hfi1_cu))
+               hfi1_cu = 1;
+       /* valid credit return threshold is 0-100, variable is unsigned */
+       if (user_credit_return_threshold > 100)
+               user_credit_return_threshold = 100;
+
+       compute_krcvqs();
+       /*
+        * Sanitize the receive interrupt count.  The receive interrupt
+        * timeout cannot be sanitized here; that must wait until after
+        * the hardware type is known.
+        */
+       if (rcv_intr_count > RCV_HDR_HEAD_COUNTER_MASK)
+               rcv_intr_count = RCV_HDR_HEAD_COUNTER_MASK;
+       /* reject invalid combinations */
+       if (rcv_intr_count == 0 && rcv_intr_timeout == 0) {
+               pr_err("Invalid mode: both receive interrupt count and available timeout are zero - setting interrupt count to 1\n");
+               rcv_intr_count = 1;
+       }
+       if (rcv_intr_count > 1 && rcv_intr_timeout == 0) {
+               /*
+                * Avoid indefinite packet delivery by requiring a timeout
+                * if count is > 1.
+                */
+               pr_err("Invalid mode: receive interrupt count greater than 1 and available timeout is zero - setting available timeout to 1\n");
+               rcv_intr_timeout = 1;
+       }
+       if (rcv_intr_dynamic && !(rcv_intr_count > 1 && rcv_intr_timeout > 0)) {
+               /*
+                * The dynamic algorithm expects a non-zero timeout
+                * and a count > 1.
+                */
+               pr_err("Invalid mode: dynamic receive interrupt mitigation with invalid count and timeout - turning dynamic off\n");
+               rcv_intr_dynamic = 0;
+       }
+
+       /* sanitize link CRC options */
+       link_crc_mask &= SUPPORTED_CRCS;
+
+       /*
+        * These must be called before the driver is registered with
+        * the PCI subsystem.
+        */
+       idr_init(&hfi1_unit_table);
+
+       hfi1_dbg_init();
+       ret = pci_register_driver(&hfi1_pci_driver);
+       if (ret < 0) {
+               pr_err("Unable to register driver: error %d\n", -ret);
+               goto bail_dev;
+       }
+       goto bail; /* all OK */
+
+bail_dev:
+       hfi1_dbg_exit();
+       idr_destroy(&hfi1_unit_table);
+       dev_cleanup();
+bail:
+       return ret;
+}
+
+module_init(hfi1_mod_init);
+
+/*
+ * Do the non-unit driver cleanup, memory free, etc. at unload.
+ */
+static void __exit hfi1_mod_cleanup(void)
+{
+       pci_unregister_driver(&hfi1_pci_driver);
+       hfi1_dbg_exit();
+       hfi1_cpulist_count = 0;
+       kfree(hfi1_cpulist);
+
+       idr_destroy(&hfi1_unit_table);
+       dispose_firmware();     /* asymmetric with obtain_firmware() */
+       dev_cleanup();
+}
+
+module_exit(hfi1_mod_cleanup);
+
+/* this can only be called after a successful initialization */
+static void cleanup_device_data(struct hfi1_devdata *dd)
+{
+       int ctxt;
+       int pidx;
+       struct hfi1_ctxtdata **tmp;
+       unsigned long flags;
+
+       /* users can't do anything more with chip */
+       for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+               struct hfi1_pportdata *ppd = &dd->pport[pidx];
+               struct cc_state *cc_state;
+               int i;
+
+               if (ppd->statusp)
+                       *ppd->statusp &= ~HFI1_STATUS_CHIP_PRESENT;
+
+               for (i = 0; i < OPA_MAX_SLS; i++)
+                       hrtimer_cancel(&ppd->cca_timer[i].hrtimer);
+
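+               /*
+                * Clear the cc_state pointer under the lock, then free the
+                * old state after an RCU grace period so readers still under
+                * rcu_read_lock() never see freed memory.
+                */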
+               spin_lock(&ppd->cc_state_lock);
+               cc_state = get_cc_state(ppd);
+               rcu_assign_pointer(ppd->cc_state, NULL);
+               spin_unlock(&ppd->cc_state_lock);
+
+               if (cc_state)
+                       call_rcu(&cc_state->rcu, cc_state_reclaim);
+       }
+
+       free_credit_return(dd);
+
+       /*
+        * Free any resources still in use (usually just kernel contexts)
+        * at unload; we loop over the full context count, because that is
+        * what we allocate.
+        * We acquire lock to be really paranoid that rcd isn't being
+        * accessed from some interrupt-related code (that should not happen,
+        * but best to be sure).
+        */
+       spin_lock_irqsave(&dd->uctxt_lock, flags);
+       tmp = dd->rcd;
+       dd->rcd = NULL;
+       spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+       for (ctxt = 0; tmp && ctxt < dd->num_rcv_contexts; ctxt++) {
+               struct hfi1_ctxtdata *rcd = tmp[ctxt];
+
+               tmp[ctxt] = NULL; /* debugging paranoia */
+               if (rcd) {
+                       hfi1_clear_tids(rcd);
+                       hfi1_free_ctxtdata(dd, rcd);
+               }
+       }
+       kfree(tmp);
+       /* must follow rcv context free - need to remove rcv's hooks */
+       for (ctxt = 0; ctxt < dd->num_send_contexts; ctxt++)
+               sc_free(dd->send_contexts[ctxt].sc);
+       dd->num_send_contexts = 0;
+       kfree(dd->send_contexts);
+       dd->send_contexts = NULL;
+       kfree(dd->boardname);
+       vfree(dd->events);
+       vfree(dd->status);
+       hfi1_cq_exit(dd);
+}
+
+/*
+ * Clean up on unit shutdown, or error during unit load after
+ * successful initialization.
+ */
+static void postinit_cleanup(struct hfi1_devdata *dd)
+{
+       hfi1_start_cleanup(dd);
+
+       hfi1_pcie_ddcleanup(dd);
+       hfi1_pcie_cleanup(dd->pcidev);
+
+       cleanup_device_data(dd);
+
+       hfi1_free_devdata(dd);
+}
+
+static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+       int ret = 0, j, pidx, initfail;
+       struct hfi1_devdata *dd = NULL;
+
+       /* First, lock the non-writable module parameters */
+       HFI1_CAP_LOCK();
+
+       /* Validate some global module parameters */
+       if (rcvhdrcnt <= HFI1_MIN_HDRQ_EGRBUF_CNT) {
+               hfi1_early_err(&pdev->dev, "Header queue count too small\n");
+               ret = -EINVAL;
+               goto bail;
+       }
+       /* use the encoding function as a sanitization check */
+       if (!encode_rcv_header_entry_size(hfi1_hdrq_entsize)) {
+               hfi1_early_err(&pdev->dev, "Invalid HdrQ Entry size %u\n",
+                              hfi1_hdrq_entsize);
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       /* The receive eager buffer size must be set before the receive
+        * contexts are created.
+        *
+        * Set the eager buffer size.  Validate that it falls in a range
+        * allowed by the hardware - all powers of 2 between the min and
+        * max.  The maximum valid MTU is within the eager buffer range
+        * so we do not need to cap the max_mtu by an eager buffer size
+        * setting.
+        */
+       if (eager_buffer_size) {
+               if (!is_power_of_2(eager_buffer_size))
+                       eager_buffer_size =
+                               roundup_pow_of_two(eager_buffer_size);
+               eager_buffer_size =
+                       clamp_val(eager_buffer_size,
+                                 MIN_EAGER_BUFFER * 8,
+                                 MAX_EAGER_BUFFER_TOTAL);
+               hfi1_early_info(&pdev->dev, "Eager buffer size %u\n",
+                               eager_buffer_size);
+       } else {
+               hfi1_early_err(&pdev->dev, "Invalid Eager buffer size of 0\n");
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       /* restrict value of hfi1_rcvarr_split */
+       hfi1_rcvarr_split = clamp_val(hfi1_rcvarr_split, 0, 100);
+
+       ret = hfi1_pcie_init(pdev, ent);
+       if (ret)
+               goto bail;
+
+       /*
+        * Do device-specific initialization, function table setup, dd
+        * allocation, etc.
+        */
+       switch (ent->device) {
+       case PCI_DEVICE_ID_INTEL0:
+       case PCI_DEVICE_ID_INTEL1:
+               dd = hfi1_init_dd(pdev, ent);
+               break;
+       default:
+               hfi1_early_err(&pdev->dev,
+                              "Failing on unknown Intel deviceid 0x%x\n",
+                              ent->device);
+               ret = -ENODEV;
+       }
+
+       if (IS_ERR(dd))
+               ret = PTR_ERR(dd);
+       if (ret)
+               goto clean_bail; /* error already printed */
+
+       ret = create_workqueues(dd);
+       if (ret)
+               goto clean_bail;
+
+       /* do the generic initialization */
+       initfail = hfi1_init(dd, 0);
+
+       ret = hfi1_register_ib_device(dd);
+
+       /*
+        * Now ready for use.  This should be cleared whenever we
+        * detect a reset, or initiate one.  If there was an earlier
+        * failure, we still create the devices, so diags, etc. can be
+        * used to determine the cause of the problem.
+        */
+       if (!initfail && !ret)
+               dd->flags |= HFI1_INITTED;
+
+       j = hfi1_device_create(dd);
+       if (j)
+               dd_dev_err(dd, "Failed to create /dev devices: %d\n", -j);
+
+       if (initfail || ret) {
+               stop_timers(dd);
+               flush_workqueue(ib_wq);
+               for (pidx = 0; pidx < dd->num_pports; ++pidx)
+                       hfi1_quiet_serdes(dd->pport + pidx);
+               if (!j)
+                       hfi1_device_remove(dd);
+               if (!ret)
+                       hfi1_unregister_ib_device(dd);
+               postinit_cleanup(dd);
+               if (initfail)
+                       ret = initfail;
+               goto bail;      /* everything already cleaned */
+       }
+
+       sdma_start(dd);
+
+       return 0;
+
+clean_bail:
+       hfi1_pcie_cleanup(pdev);
+bail:
+       return ret;
+}
+
+static void remove_one(struct pci_dev *pdev)
+{
+       struct hfi1_devdata *dd = pci_get_drvdata(pdev);
+
+       /* unregister from IB core */
+       hfi1_unregister_ib_device(dd);
+
+       /*
+        * Disable the IB link, disable interrupts on the device,
+        * clear dma engines, etc.
+        */
+       shutdown_device(dd);
+
+       stop_timers(dd);
+
+       /* wait until all of our (qsfp) queue_work() calls complete */
+       flush_workqueue(ib_wq);
+
+       hfi1_device_remove(dd);
+
+       postinit_cleanup(dd);
+}
+
+/**
+ * hfi1_create_rcvhdrq - create a receive header queue
+ * @dd: the hfi1_ib device
+ * @rcd: the context data
+ *
+ * This must be contiguous memory (from an i/o perspective), and must be
+ * DMA'able (which means for some systems, it will go through an IOMMU,
+ * or be forced into a low address range).
+ */
+int hfi1_create_rcvhdrq(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
+{
+       unsigned amt;
+       u64 reg;
+
+       if (!rcd->rcvhdrq) {
+               dma_addr_t phys_hdrqtail;
+               gfp_t gfp_flags;
+
+               /*
+                * rcvhdrqentsize is in DWs, so we have to convert to bytes
+                * (* sizeof(u32)).
+                */
+               amt = ALIGN(rcd->rcvhdrq_cnt * rcd->rcvhdrqentsize *
+                           sizeof(u32), PAGE_SIZE);
+
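+               /*
+                * User contexts get GFP_USER so the allocation honors
+                * cpuset limits; kernel contexts use GFP_KERNEL.
+                */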
+               gfp_flags = (rcd->ctxt >= dd->first_user_ctxt) ?
+                       GFP_USER : GFP_KERNEL;
+               rcd->rcvhdrq = dma_zalloc_coherent(
+                       &dd->pcidev->dev, amt, &rcd->rcvhdrq_phys,
+                       gfp_flags | __GFP_COMP);
+
+               if (!rcd->rcvhdrq) {
+                       dd_dev_err(dd,
+                               "attempt to allocate %d bytes for ctxt %u rcvhdrq failed\n",
+                               amt, rcd->ctxt);
+                       goto bail;
+               }
+
+               /*
+                * The user event mask is per device now and lives in
+                * hfi1_devdata, so it is no longer allocated per context
+                * here.
+                */
+
+               if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) {
+                       rcd->rcvhdrtail_kvaddr = dma_zalloc_coherent(
+                               &dd->pcidev->dev, PAGE_SIZE, &phys_hdrqtail,
+                               gfp_flags);
+                       if (!rcd->rcvhdrtail_kvaddr)
+                               goto bail_free;
+                       rcd->rcvhdrqtailaddr_phys = phys_hdrqtail;
+               }
+
+               rcd->rcvhdrq_size = amt;
+       }
+       /*
+        * These values are per-context:
+        *      RcvHdrCnt
+        *      RcvHdrEntSize
+        *      RcvHdrSize
+        */
+       reg = ((u64)(rcd->rcvhdrq_cnt >> HDRQ_SIZE_SHIFT)
+                       & RCV_HDR_CNT_CNT_MASK)
+               << RCV_HDR_CNT_CNT_SHIFT;
+       write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_CNT, reg);
+       reg = (encode_rcv_header_entry_size(rcd->rcvhdrqentsize)
+                       & RCV_HDR_ENT_SIZE_ENT_SIZE_MASK)
+               << RCV_HDR_ENT_SIZE_ENT_SIZE_SHIFT;
+       write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_ENT_SIZE, reg);
+       reg = (dd->rcvhdrsize & RCV_HDR_SIZE_HDR_SIZE_MASK)
+               << RCV_HDR_SIZE_HDR_SIZE_SHIFT;
+       write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_SIZE, reg);
+       return 0;
+
+bail_free:
+       dd_dev_err(dd,
+               "attempt to allocate 1 page for ctxt %u rcvhdrqtailaddr failed\n",
+               rcd->ctxt);
+       vfree(rcd->user_event_mask);
+       rcd->user_event_mask = NULL;
+       dma_free_coherent(&dd->pcidev->dev, amt, rcd->rcvhdrq,
+                         rcd->rcvhdrq_phys);
+       rcd->rcvhdrq = NULL;
+bail:
+       return -ENOMEM;
+}
+
+/**
+ * hfi1_setup_eagerbufs - allocate eager buffers, both kernel and user contexts
+ * @rcd: the context we are setting up.
+ *
+ * Allocate the eager TID buffers and program them into the chip.
+ * They are no longer completely contiguous; we do multiple allocation
+ * calls.  Otherwise we would get the OOM code involved by asking for
+ * too much per call, with disastrous results on some kernels.
+ */
+int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd)
+{
+       struct hfi1_devdata *dd = rcd->dd;
+       u32 max_entries, egrtop, alloced_bytes = 0, idx = 0;
+       gfp_t gfp_flags;
+       u16 order;
+       int ret = 0;
+       u16 round_mtu = roundup_pow_of_two(hfi1_max_mtu);
+
+       /*
+        * GFP_USER, but without GFP_FS, so buffer cache can be
+        * coalesced (we hope); otherwise, even at order 4,
+        * heavy filesystem activity makes these fail, and we can
+        * use compound pages.
+        */
+       gfp_flags = __GFP_WAIT | __GFP_IO | __GFP_COMP;
+
+       /*
+        * The minimum size of the eager buffers is one group of MTU-sized
+        * buffers.
+        * The global eager_buffer_size parameter is checked against the
+        * theoretical lower limit of the value. Here, we check against the
+        * MTU.
+        */
+       if (rcd->egrbufs.size < (round_mtu * dd->rcv_entries.group_size))
+               rcd->egrbufs.size = round_mtu * dd->rcv_entries.group_size;
+       /*
+        * If using one-pkt-per-egr-buffer, lower the eager buffer
+        * size to the max MTU (page-aligned).
+        */
+       if (!HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR))
+               rcd->egrbufs.rcvtid_size = round_mtu;
+
+       /*
+        * Eager buffers sizes of 1MB or less require smaller TID sizes
+        * to satisfy the "multiple of 8 RcvArray entries" requirement.
+        */
+       if (rcd->egrbufs.size <= (1 << 20))
+               rcd->egrbufs.rcvtid_size = max((unsigned long)round_mtu,
+                       rounddown_pow_of_two(rcd->egrbufs.size / 8));
+
+       while (alloced_bytes < rcd->egrbufs.size &&
+              rcd->egrbufs.alloced < rcd->egrbufs.count) {
+               rcd->egrbufs.buffers[idx].addr =
+                       dma_zalloc_coherent(&dd->pcidev->dev,
+                                           rcd->egrbufs.rcvtid_size,
+                                           &rcd->egrbufs.buffers[idx].phys,
+                                           gfp_flags);
+               if (rcd->egrbufs.buffers[idx].addr) {
+                       rcd->egrbufs.buffers[idx].len =
+                               rcd->egrbufs.rcvtid_size;
+                       rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].addr =
+                               rcd->egrbufs.buffers[idx].addr;
+                       rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].phys =
+                               rcd->egrbufs.buffers[idx].phys;
+                       rcd->egrbufs.alloced++;
+                       alloced_bytes += rcd->egrbufs.rcvtid_size;
+                       idx++;
+               } else {
+                       u32 new_size, i, j;
+                       u64 offset = 0;
+
+                       /*
+                        * Fail the eager buffer allocation if:
+                        *   - we are already using the lowest acceptable size
+                        *   - we are using one-pkt-per-egr-buffer (this implies
+                        *     that we are accepting only one size)
+                        */
+                       if (rcd->egrbufs.rcvtid_size == round_mtu ||
+                           !HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR)) {
+                               dd_dev_err(dd, "ctxt%u: Failed to allocate eager buffers\n",
+                                       rcd->ctxt);
+                               goto bail_rcvegrbuf_phys;
+                       }
+
+                       new_size = rcd->egrbufs.rcvtid_size / 2;
+
+                       /*
+                        * If the first attempt to allocate memory failed, don't
+                        * fail everything but continue with the next lower
+                        * size.
+                        */
+                       if (idx == 0) {
+                               rcd->egrbufs.rcvtid_size = new_size;
+                               continue;
+                       }
+
+                       /*
+                        * Re-partition already allocated buffers to a smaller
+                        * size.
+                        */
+                       rcd->egrbufs.alloced = 0;
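+                       /*
+                        * Walk the buffers already allocated at the old size
+                        * and re-describe each of them as a series of
+                        * new_size chunks in the rcvtids[] array.
+                        */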
+                       for (i = 0, j = 0, offset = 0; j < idx; i++) {
+                               if (i >= rcd->egrbufs.count)
+                                       break;
+                               rcd->egrbufs.rcvtids[i].phys =
+                                       rcd->egrbufs.buffers[j].phys + offset;
+                               rcd->egrbufs.rcvtids[i].addr =
+                                       rcd->egrbufs.buffers[j].addr + offset;
+                               rcd->egrbufs.alloced++;
+                               if ((rcd->egrbufs.buffers[j].phys + offset +
+                                    new_size) ==
+                                   (rcd->egrbufs.buffers[j].phys +
+                                    rcd->egrbufs.buffers[j].len)) {
+                                       j++;
+                                       offset = 0;
+                               } else
+                                       offset += new_size;
+                       }
+                       rcd->egrbufs.rcvtid_size = new_size;
+               }
+       }
+       rcd->egrbufs.numbufs = idx;
+       rcd->egrbufs.size = alloced_bytes;
+
+       dd_dev_info(dd, "ctxt%u: Alloced %u rcv tid entries @ %uKB, total %zuKB\n",
+               rcd->ctxt, rcd->egrbufs.alloced,
+               rcd->egrbufs.rcvtid_size / 1024, rcd->egrbufs.size / 1024);
+
+       /*
+        * Set the context's rcv array head update threshold to the closest
+        * power of 2 (so we can use a mask instead of modulo) below half
+        * the allocated entries.
+        */
+       rcd->egrbufs.threshold =
+               rounddown_pow_of_two(rcd->egrbufs.alloced / 2);
+       /*
+        * Compute the expected RcvArray entry base. This is done after
+        * allocating the eager buffers in order to maximize the
+        * expected RcvArray entries for the context.
+        */
+       max_entries = rcd->rcv_array_groups * dd->rcv_entries.group_size;
+       egrtop = roundup(rcd->egrbufs.alloced, dd->rcv_entries.group_size);
+       rcd->expected_count = max_entries - egrtop;
+       if (rcd->expected_count > MAX_TID_PAIR_ENTRIES * 2)
+               rcd->expected_count = MAX_TID_PAIR_ENTRIES * 2;
+
+       rcd->expected_base = rcd->eager_base + egrtop;
+       dd_dev_info(dd, "ctxt%u: eager:%u, exp:%u, egrbase:%u, expbase:%u\n",
+                   rcd->ctxt, rcd->egrbufs.alloced, rcd->expected_count,
+                   rcd->eager_base, rcd->expected_base);
+
+       if (!hfi1_rcvbuf_validate(rcd->egrbufs.rcvtid_size, PT_EAGER, &order)) {
+               dd_dev_err(dd, "ctxt%u: current Eager buffer size is invalid %u\n",
+                          rcd->ctxt, rcd->egrbufs.rcvtid_size);
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       for (idx = 0; idx < rcd->egrbufs.alloced; idx++) {
+               hfi1_put_tid(dd, rcd->eager_base + idx, PT_EAGER,
+                             rcd->egrbufs.rcvtids[idx].phys, order);
+               cond_resched();
+       }
+       goto bail;
+
+bail_rcvegrbuf_phys:
+       for (idx = 0; idx < rcd->egrbufs.alloced &&
+                    rcd->egrbufs.buffers[idx].addr;
+            idx++) {
+               dma_free_coherent(&dd->pcidev->dev,
+                                 rcd->egrbufs.buffers[idx].len,
+                                 rcd->egrbufs.buffers[idx].addr,
+                                 rcd->egrbufs.buffers[idx].phys);
+               rcd->egrbufs.buffers[idx].addr = NULL;
+               rcd->egrbufs.buffers[idx].phys = 0;
+               rcd->egrbufs.buffers[idx].len = 0;
+       }
+bail:
+       return ret;
+}
diff --git a/drivers/staging/rdma/hfi1/intr.c b/drivers/staging/rdma/hfi1/intr.c
new file mode 100644 (file)
index 0000000..426582b
--- /dev/null
@@ -0,0 +1,207 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/pci.h>
+#include <linux/delay.h>
+
+#include "hfi.h"
+#include "common.h"
+#include "sdma.h"
+
+/**
+ * format_hwmsg - format a single hwerror message
+ * @msg: message buffer
+ * @msgl: length of message buffer
+ * @hwmsg: message to add to message buffer
+ */
+static void format_hwmsg(char *msg, size_t msgl, const char *hwmsg)
+{
+       strlcat(msg, "[", msgl);
+       strlcat(msg, hwmsg, msgl);
+       strlcat(msg, "]", msgl);
+}
+
+/**
+ * hfi1_format_hwerrors - format hardware error messages for display
+ * @hwerrs: hardware errors bit vector
+ * @hwerrmsgs: hardware error descriptions
+ * @nhwerrmsgs: number of hwerrmsgs
+ * @msg: message buffer
+ * @msgl: message buffer length
+ */
+void hfi1_format_hwerrors(u64 hwerrs, const struct hfi1_hwerror_msgs *hwerrmsgs,
+                         size_t nhwerrmsgs, char *msg, size_t msgl)
+{
+       int i;
+
+       for (i = 0; i < nhwerrmsgs; i++)
+               if (hwerrs & hwerrmsgs[i].mask)
+                       format_hwmsg(msg, msgl, hwerrmsgs[i].msg);
+}
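+
+/*
+ * Illustration (hypothetical table entries): given hwerrmsgs entries
+ * { .mask = BIT(0), .msg = "TXE" } and { .mask = BIT(1), .msg = "RXE" },
+ * hwerrs == 0x3 makes the loop above append "[TXE][RXE]" to msg, one
+ * bracketed string per error bit with a matching mask.
+ */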
+
+static void signal_ib_event(struct hfi1_pportdata *ppd, enum ib_event_type ev)
+{
+       struct ib_event event;
+       struct hfi1_devdata *dd = ppd->dd;
+
+       /*
+        * Only call ib_dispatch_event() if the IB device has been
+        * registered.  HFI1_INITTED is set iff the driver has successfully
+        * registered with the IB core.
+        */
+       if (!(dd->flags & HFI1_INITTED))
+               return;
+       event.device = &dd->verbs_dev.ibdev;
+       event.element.port_num = ppd->port;
+       event.event = ev;
+       ib_dispatch_event(&event);
+}
+
+/*
+ * Handle a linkup or link down notification.
+ * This is called outside an interrupt.
+ */
+void handle_linkup_change(struct hfi1_devdata *dd, u32 linkup)
+{
+       struct hfi1_pportdata *ppd = &dd->pport[0];
+       enum ib_event_type ev;
+
+       if (!(ppd->linkup ^ !!linkup))
+               return; /* no change, nothing to do */
+
+       if (linkup) {
+               /*
+                * Quick linkup and all link up on the simulator does not
+                * trigger or implement:
+                *      - VerifyCap interrupt
+                *      - VerifyCap frames
+                * But rather moves directly to LinkUp.
+                *
+                * Do the work of the VerifyCap interrupt handler,
+                * handle_verify_cap(), but do not try moving the state to
+                * LinkUp as we are already there.
+                *
+                * NOTE: This uses this device's vAU, vCU, and vl15_init for
+                * the remote values.  Both sides must be using the same values.
+                */
+               if (quick_linkup ||
+                   dd->icode == ICODE_FUNCTIONAL_SIMULATOR) {
+                       set_up_vl15(dd, dd->vau, dd->vl15_init);
+                       assign_remote_cm_au_table(dd, dd->vcu);
+                       ppd->neighbor_guid =
+                               read_csr(dd,
+                                       DC_DC8051_STS_REMOTE_GUID);
+                       ppd->neighbor_type =
+                               read_csr(dd, DC_DC8051_STS_REMOTE_NODE_TYPE) &
+                                       DC_DC8051_STS_REMOTE_NODE_TYPE_VAL_MASK;
+                       ppd->neighbor_port_number =
+                               read_csr(dd, DC_DC8051_STS_REMOTE_PORT_NO) &
+                                       DC_DC8051_STS_REMOTE_PORT_NO_VAL_SMASK;
+                       dd_dev_info(dd,
+                               "Neighbor GUID: %llx Neighbor type %d\n",
+                               ppd->neighbor_guid,
+                               ppd->neighbor_type);
+               }
+
+               /* physical link went up */
+               ppd->linkup = 1;
+               ppd->offline_disabled_reason = OPA_LINKDOWN_REASON_NONE;
+
+               /* link widths are not available until the link is fully up */
+               get_linkup_link_widths(ppd);
+
+       } else {
+               /* physical link went down */
+               ppd->linkup = 0;
+
+               /* clear HW details of the previous connection */
+               reset_link_credits(dd);
+
+               /* freeze after a link down to guarantee a clean egress */
+               start_freeze_handling(ppd, FREEZE_SELF|FREEZE_LINK_DOWN);
+
+               ev = IB_EVENT_PORT_ERR;
+
+               hfi1_set_uevent_bits(ppd, _HFI1_EVENT_LINKDOWN_BIT);
+
+               /* if we are down, the neighbor is down */
+               ppd->neighbor_normal = 0;
+
+               /* notify IB of the link change */
+               signal_ib_event(ppd, ev);
+       }
+}
+
+/*
+ * Handle receive or urgent interrupts for user contexts.  This means a user
+ * process was waiting for a packet to arrive, and didn't want to poll.
+ */
+void handle_user_interrupt(struct hfi1_ctxtdata *rcd)
+{
+       struct hfi1_devdata *dd = rcd->dd;
+       unsigned long flags;
+
+       spin_lock_irqsave(&dd->uctxt_lock, flags);
+       if (!rcd->cnt)
+               goto done;
+
+       if (test_and_clear_bit(HFI1_CTXT_WAITING_RCV, &rcd->event_flags)) {
+               wake_up_interruptible(&rcd->wait);
+               hfi1_rcvctrl(dd, HFI1_RCVCTRL_INTRAVAIL_DIS, rcd->ctxt);
+       } else if (test_and_clear_bit(HFI1_CTXT_WAITING_URG,
+                                                       &rcd->event_flags)) {
+               rcd->urgent++;
+               wake_up_interruptible(&rcd->wait);
+       }
+done:
+       spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+}
diff --git a/drivers/staging/rdma/hfi1/iowait.h b/drivers/staging/rdma/hfi1/iowait.h
new file mode 100644 (file)
index 0000000..fa361b4
--- /dev/null
@@ -0,0 +1,186 @@
+#ifndef _HFI1_IOWAIT_H
+#define _HFI1_IOWAIT_H
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/list.h>
+#include <linux/workqueue.h>
+#include <linux/sched.h>
+
+/*
+ * typedef (*restart_t)() - restart callback
+ * @work: pointer to work structure
+ */
+typedef void (*restart_t)(struct work_struct *work);
+
+struct sdma_txreq;
+struct sdma_engine;
+/**
+ * struct iowait - linkage for delayed progress/waiting
+ * @list: used to add/insert into QP/PQ wait lists
+ * @tx_head: overflow list of sdma_txreq's
+ * @sleep: no space callback
+ * @wakeup: space callback
+ * @iowork: workqueue overhead
+ * @wait_dma: wait for sdma_busy == 0
+ * @sdma_busy: # of packets in flight
+ * @count: total number of descriptors in tx_head'ed list
+ * @tx_limit: limit for overflow queuing
+ * @tx_count: number of tx entries in the tx_head'ed list
+ *
+ * This is to be embedded in the user's state structure
+ * (QP or PQ).
+ *
+ * The sleep and wakeup members are a bit misnamed.  They do not
+ * strictly speaking sleep or wake up; they are callbacks the ULP
+ * implements to queue/dequeue the embedded iowait (and its containing
+ * structure) when a resource shortage such as SDMA ring space is seen.
+ *
+ * Both are potentially called with locks held, so sleeping in them is
+ * not allowed.
+ *
+ * The wait_dma member, together with the sdma_busy count, lets a
+ * caller wait for all outstanding SDMA work to drain (see
+ * iowait_sdma_drain() and iowait_drain_wakeup() below).
+ */
+
+struct iowait {
+       struct list_head list;
+       struct list_head tx_head;
+       int (*sleep)(
+               struct sdma_engine *sde,
+               struct iowait *wait,
+               struct sdma_txreq *tx,
+               unsigned seq);
+       void (*wakeup)(struct iowait *wait, int reason);
+       struct work_struct iowork;
+       wait_queue_head_t wait_dma;
+       atomic_t sdma_busy;
+       u32 count;
+       u32 tx_limit;
+       u32 tx_count;
+};
+
+#define SDMA_AVAIL_REASON 0
+
+/**
+ * iowait_init() - initialize wait structure
+ * @wait: wait struct to initialize
+ * @tx_limit: limit for overflow queuing
+ * @func: restart function for workqueue
+ * @sleep: sleep function for no space
+ * @wakeup: wakeup function called when space becomes available
+ *
+ * This function initializes the iowait
+ * structure embedded in the QP or PQ.
+ *
+ */
+
+static inline void iowait_init(
+       struct iowait *wait,
+       u32 tx_limit,
+       void (*func)(struct work_struct *work),
+       int (*sleep)(
+               struct sdma_engine *sde,
+               struct iowait *wait,
+               struct sdma_txreq *tx,
+               unsigned seq),
+       void (*wakeup)(struct iowait *wait, int reason))
+{
+       wait->count = 0;
+       INIT_LIST_HEAD(&wait->list);
+       INIT_LIST_HEAD(&wait->tx_head);
+       INIT_WORK(&wait->iowork, func);
+       init_waitqueue_head(&wait->wait_dma);
+       atomic_set(&wait->sdma_busy, 0);
+       wait->tx_limit = tx_limit;
+       wait->sleep = sleep;
+       wait->wakeup = wakeup;
+}
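+
+/*
+ * Typical use (illustrative; the qp_* names are hypothetical):
+ *
+ *     iowait_init(&qp->s_iowait, 0, qp_restart_work,
+ *                 qp_sleep_cb, qp_wakeup_cb);
+ *
+ * sleep() is called on a resource shortage (e.g. no SDMA ring space);
+ * wakeup() is called with SDMA_AVAIL_REASON once space is available
+ * again.
+ */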
+
+/**
+ * iowait_schedule() - queue the wait structure's work item
+ * @wait: wait struct to schedule
+ * @wq: workqueue for schedule
+ */
+static inline void iowait_schedule(
+       struct iowait *wait,
+       struct workqueue_struct *wq)
+{
+       queue_work(wq, &wait->iowork);
+}
+
+/**
+ * iowait_sdma_drain() - wait for DMAs to drain
+ *
+ * @wait: iowait structure
+ *
+ * This will delay until the iowait sdmas have
+ * completed.
+ */
+static inline void iowait_sdma_drain(struct iowait *wait)
+{
+       wait_event(wait->wait_dma, !atomic_read(&wait->sdma_busy));
+}
+
+/**
+ * iowait_drain_wakeup() - trigger an iowait_sdma_drain() waiter
+ *
+ * @wait: iowait structure
+ *
+ * This will trigger any waiters.
+ */
+static inline void iowait_drain_wakeup(struct iowait *wait)
+{
+       wake_up(&wait->wait_dma);
+}
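+
+/*
+ * The two helpers above are intended to pair up (illustrative): one
+ * side blocks in iowait_sdma_drain() until sdma_busy reaches zero,
+ * while the side retiring SDMA work calls iowait_drain_wakeup() so the
+ * waiter re-checks the count.
+ */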
+
+#endif
diff --git a/drivers/staging/rdma/hfi1/keys.c b/drivers/staging/rdma/hfi1/keys.c
new file mode 100644 (file)
index 0000000..f6eff17
--- /dev/null
@@ -0,0 +1,411 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "hfi.h"
+
+/**
+ * hfi1_alloc_lkey - allocate an lkey
+ * @mr: memory region that this lkey protects
+ * @dma_region: 0->normal key, 1->restricted DMA key
+ *
+ * Returns 0 if successful, otherwise returns -errno.
+ *
+ * Increments mr reference count as required.
+ *
+ * Sets the lkey field of mr for non-DMA regions.
+ *
+ */
+
+int hfi1_alloc_lkey(struct hfi1_mregion *mr, int dma_region)
+{
+       unsigned long flags;
+       u32 r;
+       u32 n;
+       int ret = 0;
+       struct hfi1_ibdev *dev = to_idev(mr->pd->device);
+       struct hfi1_lkey_table *rkt = &dev->lk_table;
+
+       hfi1_get_mr(mr);
+       spin_lock_irqsave(&rkt->lock, flags);
+
+       /* special case for dma_mr lkey == 0 */
+       if (dma_region) {
+               struct hfi1_mregion *tmr;
+
+               tmr = rcu_access_pointer(dev->dma_mr);
+               if (!tmr) {
+                       rcu_assign_pointer(dev->dma_mr, mr);
+                       mr->lkey_published = 1;
+               } else {
+                       hfi1_put_mr(mr);
+               }
+               goto success;
+       }
+
+       /* Find the next available LKEY */
+       r = rkt->next;
+       n = r;
+       for (;;) {
+               if (!rcu_access_pointer(rkt->table[r]))
+                       break;
+               r = (r + 1) & (rkt->max - 1);
+               if (r == n)
+                       goto bail;
+       }
+       rkt->next = (r + 1) & (rkt->max - 1);
+       /*
+        * Make sure lkey is never zero which is reserved to indicate an
+        * unrestricted LKEY.
+        */
+       rkt->gen++;
+       /*
+        * bits are capped in verbs.c to ensure enough bits for
+        * generation number
+        */
+       mr->lkey = (r << (32 - hfi1_lkey_table_size)) |
+               ((((1 << (24 - hfi1_lkey_table_size)) - 1) & rkt->gen)
+                << 8);
+       if (mr->lkey == 0) {
+               mr->lkey |= 1 << 8;
+               rkt->gen++;
+       }
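+       /*
+        * Resulting LKEY layout (illustrative; assumes the default
+        * hfi1_lkey_table_size of 16):
+        *   bits 31..16  table index r
+        *   bits 15..8   low byte of the generation counter
+        *   bits  7..0   zero
+        */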
+       rcu_assign_pointer(rkt->table[r], mr);
+       mr->lkey_published = 1;
+success:
+       spin_unlock_irqrestore(&rkt->lock, flags);
+out:
+       return ret;
+bail:
+       hfi1_put_mr(mr);
+       spin_unlock_irqrestore(&rkt->lock, flags);
+       ret = -ENOMEM;
+       goto out;
+}
+
+/**
+ * hfi1_free_lkey - free an lkey
+ * @mr: mr to free from tables
+ */
+void hfi1_free_lkey(struct hfi1_mregion *mr)
+{
+       unsigned long flags;
+       u32 lkey = mr->lkey;
+       u32 r;
+       struct hfi1_ibdev *dev = to_idev(mr->pd->device);
+       struct hfi1_lkey_table *rkt = &dev->lk_table;
+       int freed = 0;
+
+       spin_lock_irqsave(&rkt->lock, flags);
+       if (!mr->lkey_published)
+               goto out;
+       if (lkey == 0)
+               RCU_INIT_POINTER(dev->dma_mr, NULL);
+       else {
+               r = lkey >> (32 - hfi1_lkey_table_size);
+               RCU_INIT_POINTER(rkt->table[r], NULL);
+       }
+       mr->lkey_published = 0;
+       freed++;
+out:
+       spin_unlock_irqrestore(&rkt->lock, flags);
+       if (freed) {
+               synchronize_rcu();
+               hfi1_put_mr(mr);
+       }
+}
+
+/**
+ * hfi1_lkey_ok - check IB SGE for validity and initialize
+ * @rkt: table containing lkey to check SGE against
+ * @pd: protection domain
+ * @isge: outgoing internal SGE
+ * @sge: SGE to check
+ * @acc: access flags
+ *
+ * Return 1 if valid and successful, otherwise returns 0.
+ *
+ * increments the reference count upon success
+ *
+ * Check the IB SGE for validity and initialize our internal version
+ * of it.
+ */
+int hfi1_lkey_ok(struct hfi1_lkey_table *rkt, struct hfi1_pd *pd,
+                struct hfi1_sge *isge, struct ib_sge *sge, int acc)
+{
+       struct hfi1_mregion *mr;
+       unsigned n, m;
+       size_t off;
+
+       /*
+        * We use LKEY == zero for kernel virtual addresses
+        * (see hfi1_get_dma_mr and dma.c).
+        */
+       rcu_read_lock();
+       if (sge->lkey == 0) {
+               struct hfi1_ibdev *dev = to_idev(pd->ibpd.device);
+
+               if (pd->user)
+                       goto bail;
+               mr = rcu_dereference(dev->dma_mr);
+               if (!mr)
+                       goto bail;
+               atomic_inc(&mr->refcount);
+               rcu_read_unlock();
+
+               isge->mr = mr;
+               isge->vaddr = (void *) sge->addr;
+               isge->length = sge->length;
+               isge->sge_length = sge->length;
+               isge->m = 0;
+               isge->n = 0;
+               goto ok;
+       }
+       mr = rcu_dereference(
+               rkt->table[(sge->lkey >> (32 - hfi1_lkey_table_size))]);
+       if (unlikely(!mr || mr->lkey != sge->lkey || mr->pd != &pd->ibpd))
+               goto bail;
+
+       off = sge->addr - mr->user_base;
+       if (unlikely(sge->addr < mr->user_base ||
+                    off + sge->length > mr->length ||
+                    (mr->access_flags & acc) != acc))
+               goto bail;
+       atomic_inc(&mr->refcount);
+       rcu_read_unlock();
+
+       off += mr->offset;
+       if (mr->page_shift) {
+               /*
+                * Page sizes are a uniform power of 2, so no loop is
+                * necessary.  entries_spanned_by_off is the number of
+                * times the loop below would have executed.
+                */
+               size_t entries_spanned_by_off;
+
+               entries_spanned_by_off = off >> mr->page_shift;
+               off -= (entries_spanned_by_off << mr->page_shift);
+               m = entries_spanned_by_off / HFI1_SEGSZ;
+               n = entries_spanned_by_off % HFI1_SEGSZ;
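+               /*
+                * Worked example (illustrative): page_shift == 12 (4 KiB
+                * pages) and off == 0x5010 give entries_spanned_by_off == 5;
+                * off becomes 0x10 and m/n select segment index 5, split
+                * across HFI1_SEGSZ-entry maps.
+                */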
+       } else {
+               m = 0;
+               n = 0;
+               while (off >= mr->map[m]->segs[n].length) {
+                       off -= mr->map[m]->segs[n].length;
+                       n++;
+                       if (n >= HFI1_SEGSZ) {
+                               m++;
+                               n = 0;
+                       }
+               }
+       }
+       isge->mr = mr;
+       isge->vaddr = mr->map[m]->segs[n].vaddr + off;
+       isge->length = mr->map[m]->segs[n].length - off;
+       isge->sge_length = sge->length;
+       isge->m = m;
+       isge->n = n;
+ok:
+       return 1;
+bail:
+       rcu_read_unlock();
+       return 0;
+}
+
+/**
+ * hfi1_rkey_ok - check the IB virtual address, length, and RKEY
+ * @qp: qp for validation
+ * @sge: SGE state
+ * @len: length of data
+ * @vaddr: virtual address to place data
+ * @rkey: rkey to check
+ * @acc: access flags
+ *
+ * Return 1 if successful, otherwise 0.
+ *
+ * increments the reference count upon success
+ */
+int hfi1_rkey_ok(struct hfi1_qp *qp, struct hfi1_sge *sge,
+                u32 len, u64 vaddr, u32 rkey, int acc)
+{
+       struct hfi1_lkey_table *rkt = &to_idev(qp->ibqp.device)->lk_table;
+       struct hfi1_mregion *mr;
+       unsigned n, m;
+       size_t off;
+
+       /*
+        * We use RKEY == zero for kernel virtual addresses
+        * (see hfi1_get_dma_mr and dma.c).
+        */
+       rcu_read_lock();
+       if (rkey == 0) {
+               struct hfi1_pd *pd = to_ipd(qp->ibqp.pd);
+               struct hfi1_ibdev *dev = to_idev(pd->ibpd.device);
+
+               if (pd->user)
+                       goto bail;
+               mr = rcu_dereference(dev->dma_mr);
+               if (!mr)
+                       goto bail;
+               atomic_inc(&mr->refcount);
+               rcu_read_unlock();
+
+               sge->mr = mr;
+               sge->vaddr = (void *) vaddr;
+               sge->length = len;
+               sge->sge_length = len;
+               sge->m = 0;
+               sge->n = 0;
+               goto ok;
+       }
+
+       mr = rcu_dereference(
+               rkt->table[(rkey >> (32 - hfi1_lkey_table_size))]);
+       if (unlikely(!mr || mr->lkey != rkey || qp->ibqp.pd != mr->pd))
+               goto bail;
+
+       off = vaddr - mr->iova;
+       if (unlikely(vaddr < mr->iova || off + len > mr->length ||
+                    (mr->access_flags & acc) == 0))
+               goto bail;
+       atomic_inc(&mr->refcount);
+       rcu_read_unlock();
+
+       off += mr->offset;
+       if (mr->page_shift) {
+               /*
+                * Page sizes are a uniform power of 2, so no loop is
+                * necessary.  entries_spanned_by_off is the number of
+                * times the loop below would have executed.
+                */
+               size_t entries_spanned_by_off;
+
+               entries_spanned_by_off = off >> mr->page_shift;
+               off -= (entries_spanned_by_off << mr->page_shift);
+               m = entries_spanned_by_off / HFI1_SEGSZ;
+               n = entries_spanned_by_off % HFI1_SEGSZ;
+       } else {
+               m = 0;
+               n = 0;
+               while (off >= mr->map[m]->segs[n].length) {
+                       off -= mr->map[m]->segs[n].length;
+                       n++;
+                       if (n >= HFI1_SEGSZ) {
+                               m++;
+                               n = 0;
+                       }
+               }
+       }
+       sge->mr = mr;
+       sge->vaddr = mr->map[m]->segs[n].vaddr + off;
+       sge->length = mr->map[m]->segs[n].length - off;
+       sge->sge_length = len;
+       sge->m = m;
+       sge->n = n;
+ok:
+       return 1;
+bail:
+       rcu_read_unlock();
+       return 0;
+}
+
+/*
+ * Initialize the memory region specified by the work request.
+ */
+int hfi1_fast_reg_mr(struct hfi1_qp *qp, struct ib_send_wr *wr)
+{
+       struct hfi1_lkey_table *rkt = &to_idev(qp->ibqp.device)->lk_table;
+       struct hfi1_pd *pd = to_ipd(qp->ibqp.pd);
+       struct hfi1_mregion *mr;
+       u32 rkey = wr->wr.fast_reg.rkey;
+       unsigned i, n, m;
+       int ret = -EINVAL;
+       unsigned long flags;
+       u64 *page_list;
+       size_t ps;
+
+       spin_lock_irqsave(&rkt->lock, flags);
+       if (pd->user || rkey == 0)
+               goto bail;
+
+       mr = rcu_dereference_protected(
+               rkt->table[(rkey >> (32 - hfi1_lkey_table_size))],
+               lockdep_is_held(&rkt->lock));
+       if (unlikely(mr == NULL || qp->ibqp.pd != mr->pd))
+               goto bail;
+
+       if (wr->wr.fast_reg.page_list_len > mr->max_segs)
+               goto bail;
+
+       ps = 1UL << wr->wr.fast_reg.page_shift;
+       if (wr->wr.fast_reg.length > ps * wr->wr.fast_reg.page_list_len)
+               goto bail;
+
+       mr->user_base = wr->wr.fast_reg.iova_start;
+       mr->iova = wr->wr.fast_reg.iova_start;
+       mr->lkey = rkey;
+       mr->length = wr->wr.fast_reg.length;
+       mr->access_flags = wr->wr.fast_reg.access_flags;
+       page_list = wr->wr.fast_reg.page_list->page_list;
+       m = 0;
+       n = 0;
+       for (i = 0; i < wr->wr.fast_reg.page_list_len; i++) {
+               mr->map[m]->segs[n].vaddr = (void *) page_list[i];
+               mr->map[m]->segs[n].length = ps;
+               if (++n == HFI1_SEGSZ) {
+                       m++;
+                       n = 0;
+               }
+       }
+
+       ret = 0;
+bail:
+       spin_unlock_irqrestore(&rkt->lock, flags);
+       return ret;
+}
diff --git a/drivers/staging/rdma/hfi1/mad.c b/drivers/staging/rdma/hfi1/mad.c
new file mode 100644 (file)
index 0000000..37269eb
--- /dev/null
@@ -0,0 +1,4257 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/net.h>
+
+#include "hfi.h"
+#include "mad.h"
+#include "trace.h"
+
+#define OPA_NUM_PKEY_BLOCKS_PER_SMP (OPA_SMP_DR_DATA_SIZE \
+                       / (OPA_PARTITION_TABLE_BLK_SIZE * sizeof(u16)))
+
+/* the reset value from the FM is supposed to be 0xffff, handle both */
+#define OPA_LINK_WIDTH_RESET_OLD 0x0fff
+#define OPA_LINK_WIDTH_RESET 0xffff
+
+static int reply(struct ib_mad_hdr *smp)
+{
+       /*
+        * The verbs framework will handle the directed/LID route
+        * packet changes.
+        */
+       smp->method = IB_MGMT_METHOD_GET_RESP;
+       if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+               smp->status |= IB_SMP_DIRECTION;
+       return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+}
+
+static inline void clear_opa_smp_data(struct opa_smp *smp)
+{
+       void *data = opa_get_smp_data(smp);
+       size_t size = opa_get_smp_data_size(smp);
+
+       memset(data, 0, size);
+}
+
+static void send_trap(struct hfi1_ibport *ibp, void *data, unsigned len)
+{
+       struct ib_mad_send_buf *send_buf;
+       struct ib_mad_agent *agent;
+       struct ib_smp *smp;
+       int ret;
+       unsigned long flags;
+       unsigned long timeout;
+       int pkey_idx;
+       u32 qpn = ppd_from_ibp(ibp)->sm_trap_qp;
+
+       agent = ibp->send_agent;
+       if (!agent)
+               return;
+
+       /* o14-3.2.1 */
+       if (ppd_from_ibp(ibp)->lstate != IB_PORT_ACTIVE)
+               return;
+
+       /* o14-2 */
+       if (ibp->trap_timeout && time_before(jiffies, ibp->trap_timeout))
+               return;
+
+       pkey_idx = hfi1_lookup_pkey_idx(ibp, LIM_MGMT_P_KEY);
+       if (pkey_idx < 0) {
+               pr_warn("%s: failed to find limited mgmt pkey, defaulting 0x%x\n",
+                       __func__, hfi1_get_pkey(ibp, 1));
+               pkey_idx = 1;
+       }
+
+       send_buf = ib_create_send_mad(agent, qpn, pkey_idx, 0,
+                                     IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
+                                     GFP_ATOMIC, IB_MGMT_BASE_VERSION);
+       if (IS_ERR(send_buf))
+               return;
+
+       smp = send_buf->mad;
+       smp->base_version = IB_MGMT_BASE_VERSION;
+       smp->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED;
+       smp->class_version = 1;
+       smp->method = IB_MGMT_METHOD_TRAP;
+       ibp->tid++;
+       smp->tid = cpu_to_be64(ibp->tid);
+       smp->attr_id = IB_SMP_ATTR_NOTICE;
+       /* o14-1: smp->mkey = 0; */
+       memcpy(smp->data, data, len);
+
+       spin_lock_irqsave(&ibp->lock, flags);
+       if (!ibp->sm_ah) {
+               if (ibp->sm_lid != be16_to_cpu(IB_LID_PERMISSIVE)) {
+                       struct ib_ah *ah;
+
+                       ah = hfi1_create_qp0_ah(ibp, ibp->sm_lid);
+                       if (IS_ERR(ah))
+                               ret = PTR_ERR(ah);
+                       else {
+                               send_buf->ah = ah;
+                               ibp->sm_ah = to_iah(ah);
+                               ret = 0;
+                       }
+               } else
+                       ret = -EINVAL;
+       } else {
+               send_buf->ah = &ibp->sm_ah->ibah;
+               ret = 0;
+       }
+       spin_unlock_irqrestore(&ibp->lock, flags);
+
+       if (!ret)
+               ret = ib_post_send_mad(send_buf, NULL);
+       if (!ret) {
+               /* timeout is 4.096 usec * 2^subnet_timeout */
+               timeout = (4096 * (1UL << ibp->subnet_timeout)) / 1000;
+               ibp->trap_timeout = jiffies + usecs_to_jiffies(timeout);
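+               /*
+                * Worked example (illustrative): subnet_timeout == 18
+                * gives 4.096 usec * 2^18 ~= 1.07 s before the o14-2
+                * check above allows another trap to be sent.
+                */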
+       } else {
+               ib_free_send_mad(send_buf);
+               ibp->trap_timeout = 0;
+       }
+}
+
+/*
+ * Send a bad [PQ]_Key trap (ch. 14.3.8).
+ */
+void hfi1_bad_pqkey(struct hfi1_ibport *ibp, __be16 trap_num, u32 key, u32 sl,
+                   u32 qp1, u32 qp2, __be16 lid1, __be16 lid2)
+{
+       struct ib_mad_notice_attr data;
+
+       if (trap_num == IB_NOTICE_TRAP_BAD_PKEY)
+               ibp->pkey_violations++;
+       else
+               ibp->qkey_violations++;
+       ibp->n_pkt_drops++;
+
+       /* Send violation trap */
+       data.generic_type = IB_NOTICE_TYPE_SECURITY;
+       data.prod_type_msb = 0;
+       data.prod_type_lsb = IB_NOTICE_PROD_CA;
+       data.trap_num = trap_num;
+       data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid);
+       data.toggle_count = 0;
+       memset(&data.details, 0, sizeof(data.details));
+       data.details.ntc_257_258.lid1 = lid1;
+       data.details.ntc_257_258.lid2 = lid2;
+       data.details.ntc_257_258.key = cpu_to_be32(key);
+       data.details.ntc_257_258.sl_qp1 = cpu_to_be32((sl << 28) | qp1);
+       data.details.ntc_257_258.qp2 = cpu_to_be32(qp2);
+
+       send_trap(ibp, &data, sizeof(data));
+}
+
+/*
+ * Send a bad M_Key trap (ch. 14.3.9).
+ */
+static void bad_mkey(struct hfi1_ibport *ibp, struct ib_mad_hdr *mad,
+                    __be64 mkey, __be32 dr_slid, u8 return_path[], u8 hop_cnt)
+{
+       struct ib_mad_notice_attr data;
+
+       /* Send violation trap */
+       data.generic_type = IB_NOTICE_TYPE_SECURITY;
+       data.prod_type_msb = 0;
+       data.prod_type_lsb = IB_NOTICE_PROD_CA;
+       data.trap_num = IB_NOTICE_TRAP_BAD_MKEY;
+       data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid);
+       data.toggle_count = 0;
+       memset(&data.details, 0, sizeof(data.details));
+       data.details.ntc_256.lid = data.issuer_lid;
+       data.details.ntc_256.method = mad->method;
+       data.details.ntc_256.attr_id = mad->attr_id;
+       data.details.ntc_256.attr_mod = mad->attr_mod;
+       data.details.ntc_256.mkey = mkey;
+       if (mad->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+
+               data.details.ntc_256.dr_slid = (__force __be16)dr_slid;
+               data.details.ntc_256.dr_trunc_hop = IB_NOTICE_TRAP_DR_NOTICE;
+               if (hop_cnt > ARRAY_SIZE(data.details.ntc_256.dr_rtn_path)) {
+                       data.details.ntc_256.dr_trunc_hop |=
+                               IB_NOTICE_TRAP_DR_TRUNC;
+                       hop_cnt = ARRAY_SIZE(data.details.ntc_256.dr_rtn_path);
+               }
+               data.details.ntc_256.dr_trunc_hop |= hop_cnt;
+               memcpy(data.details.ntc_256.dr_rtn_path, return_path,
+                      hop_cnt);
+       }
+
+       send_trap(ibp, &data, sizeof(data));
+}
+
+/*
+ * Send a Port Capability Mask Changed trap (ch. 14.3.11).
+ */
+void hfi1_cap_mask_chg(struct hfi1_ibport *ibp)
+{
+       struct ib_mad_notice_attr data;
+
+       data.generic_type = IB_NOTICE_TYPE_INFO;
+       data.prod_type_msb = 0;
+       data.prod_type_lsb = IB_NOTICE_PROD_CA;
+       data.trap_num = IB_NOTICE_TRAP_CAP_MASK_CHG;
+       data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid);
+       data.toggle_count = 0;
+       memset(&data.details, 0, sizeof(data.details));
+       data.details.ntc_144.lid = data.issuer_lid;
+       data.details.ntc_144.new_cap_mask = cpu_to_be32(ibp->port_cap_flags);
+
+       send_trap(ibp, &data, sizeof(data));
+}
+
+/*
+ * Send a System Image GUID Changed trap (ch. 14.3.12).
+ */
+void hfi1_sys_guid_chg(struct hfi1_ibport *ibp)
+{
+       struct ib_mad_notice_attr data;
+
+       data.generic_type = IB_NOTICE_TYPE_INFO;
+       data.prod_type_msb = 0;
+       data.prod_type_lsb = IB_NOTICE_PROD_CA;
+       data.trap_num = IB_NOTICE_TRAP_SYS_GUID_CHG;
+       data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid);
+       data.toggle_count = 0;
+       memset(&data.details, 0, sizeof(data.details));
+       data.details.ntc_145.lid = data.issuer_lid;
+       data.details.ntc_145.new_sys_guid = ib_hfi1_sys_image_guid;
+
+       send_trap(ibp, &data, sizeof(data));
+}
+
+/*
+ * Send a Node Description Changed trap (ch. 14.3.13).
+ */
+void hfi1_node_desc_chg(struct hfi1_ibport *ibp)
+{
+       struct ib_mad_notice_attr data;
+
+       data.generic_type = IB_NOTICE_TYPE_INFO;
+       data.prod_type_msb = 0;
+       data.prod_type_lsb = IB_NOTICE_PROD_CA;
+       data.trap_num = IB_NOTICE_TRAP_CAP_MASK_CHG;
+       data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid);
+       data.toggle_count = 0;
+       memset(&data.details, 0, sizeof(data.details));
+       data.details.ntc_144.lid = data.issuer_lid;
+       data.details.ntc_144.local_changes = 1;
+       data.details.ntc_144.change_flags = IB_NOTICE_TRAP_NODE_DESC_CHG;
+
+       send_trap(ibp, &data, sizeof(data));
+}
+
+static int __subn_get_opa_nodedesc(struct opa_smp *smp, u32 am,
+                                  u8 *data, struct ib_device *ibdev,
+                                  u8 port, u32 *resp_len)
+{
+       struct opa_node_description *nd;
+
+       if (am) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       nd = (struct opa_node_description *)data;
+
+       memcpy(nd->data, ibdev->node_desc, sizeof(nd->data));
+
+       if (resp_len)
+               *resp_len += sizeof(*nd);
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_get_opa_nodeinfo(struct opa_smp *smp, u32 am, u8 *data,
+                                  struct ib_device *ibdev, u8 port,
+                                  u32 *resp_len)
+{
+       struct opa_node_info *ni;
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       unsigned pidx = port - 1; /* IB numbers ports from 1, hw from 0 */
+
+       ni = (struct opa_node_info *)data;
+
+       /* GUID 0 is illegal */
+       if (am || pidx >= dd->num_pports || dd->pport[pidx].guid == 0) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       ni->port_guid = cpu_to_be64(dd->pport[pidx].guid);
+       ni->base_version = OPA_MGMT_BASE_VERSION;
+       ni->class_version = OPA_SMI_CLASS_VERSION;
+       ni->node_type = 1;     /* channel adapter */
+       ni->num_ports = ibdev->phys_port_cnt;
+       /* This is already in network order */
+       ni->system_image_guid = ib_hfi1_sys_image_guid;
+       /* Use first-port GUID as node */
+       ni->node_guid = cpu_to_be64(dd->pport->guid);
+       ni->partition_cap = cpu_to_be16(hfi1_get_npkeys(dd));
+       ni->device_id = cpu_to_be16(dd->pcidev->device);
+       ni->revision = cpu_to_be32(dd->minrev);
+       ni->local_port_num = port;
+       ni->vendor_id[0] = dd->oui1;
+       ni->vendor_id[1] = dd->oui2;
+       ni->vendor_id[2] = dd->oui3;
+
+       if (resp_len)
+               *resp_len += sizeof(*ni);
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+static int subn_get_nodeinfo(struct ib_smp *smp, struct ib_device *ibdev,
+                            u8 port)
+{
+       struct ib_node_info *nip = (struct ib_node_info *)&smp->data;
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       unsigned pidx = port - 1; /* IB numbers ports from 1, hw from 0 */
+
+       /* GUID 0 is illegal */
+       if (smp->attr_mod || pidx >= dd->num_pports ||
+           dd->pport[pidx].guid == 0)
+               smp->status |= IB_SMP_INVALID_FIELD;
+       else
+               nip->port_guid = cpu_to_be64(dd->pport[pidx].guid);
+
+       nip->base_version = OPA_MGMT_BASE_VERSION;
+       nip->class_version = OPA_SMI_CLASS_VERSION;
+       nip->node_type = 1;     /* channel adapter */
+       nip->num_ports = ibdev->phys_port_cnt;
+       /* This is already in network order */
+       nip->sys_guid = ib_hfi1_sys_image_guid;
+       /* Use first-port GUID as node */
+       nip->node_guid = cpu_to_be64(dd->pport->guid);
+       nip->partition_cap = cpu_to_be16(hfi1_get_npkeys(dd));
+       nip->device_id = cpu_to_be16(dd->pcidev->device);
+       nip->revision = cpu_to_be32(dd->minrev);
+       nip->local_port_num = port;
+       nip->vendor_id[0] = dd->oui1;
+       nip->vendor_id[1] = dd->oui2;
+       nip->vendor_id[2] = dd->oui3;
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+static void set_link_width_enabled(struct hfi1_pportdata *ppd, u32 w)
+{
+       (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_LWID_ENB, w);
+}
+
+static void set_link_width_downgrade_enabled(struct hfi1_pportdata *ppd, u32 w)
+{
+       (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_LWID_DG_ENB, w);
+}
+
+static void set_link_speed_enabled(struct hfi1_pportdata *ppd, u32 s)
+{
+       (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_SPD_ENB, s);
+}
+
+static int check_mkey(struct hfi1_ibport *ibp, struct ib_mad_hdr *mad,
+                     int mad_flags, __be64 mkey, __be32 dr_slid,
+                     u8 return_path[], u8 hop_cnt)
+{
+       int valid_mkey = 0;
+       int ret = 0;
+
+       /* Is the mkey in the process of expiring? */
+       if (ibp->mkey_lease_timeout &&
+           time_after_eq(jiffies, ibp->mkey_lease_timeout)) {
+               /* Clear timeout and mkey protection field. */
+               ibp->mkey_lease_timeout = 0;
+               ibp->mkeyprot = 0;
+       }
+
+       if ((mad_flags & IB_MAD_IGNORE_MKEY) ||  ibp->mkey == 0 ||
+           ibp->mkey == mkey)
+               valid_mkey = 1;
+
+       /* Unset lease timeout on any valid Get/Set/TrapRepress */
+       if (valid_mkey && ibp->mkey_lease_timeout &&
+           (mad->method == IB_MGMT_METHOD_GET ||
+            mad->method == IB_MGMT_METHOD_SET ||
+            mad->method == IB_MGMT_METHOD_TRAP_REPRESS))
+               ibp->mkey_lease_timeout = 0;
+
+       if (!valid_mkey) {
+               switch (mad->method) {
+               case IB_MGMT_METHOD_GET:
+                       /* Bad mkey not a violation below level 2 */
+                       if (ibp->mkeyprot < 2)
+                               break;
+                       /* fall through */
+               case IB_MGMT_METHOD_SET:
+               case IB_MGMT_METHOD_TRAP_REPRESS:
+                       if (ibp->mkey_violations != 0xFFFF)
+                               ++ibp->mkey_violations;
+                       if (!ibp->mkey_lease_timeout && ibp->mkey_lease_period)
+                               ibp->mkey_lease_timeout = jiffies +
+                                       ibp->mkey_lease_period * HZ;
+                       /* Generate a trap notice. */
+                       bad_mkey(ibp, mad, mkey, dr_slid, return_path,
+                                hop_cnt);
+                       ret = 1;
+               }
+       }
+
+       return ret;
+}
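+
+/*
+ * Illustration of the protection levels above: with mkeyprot < 2 a Get
+ * carrying a mismatched M_Key is still answered (ret stays 0); with
+ * mkeyprot == 2 the same Get falls through to the Set/TrapRepress
+ * handling, counts an mkey_violation, may start the lease timer, sends
+ * a bad-M_Key trap and returns 1.
+ */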
+
+/*
+ * The SMA caches reads from LCB registers in case the LCB is unavailable.
+ * (The LCB is unavailable in certain link states, for example.)
+ */
+struct lcb_datum {
+       u32 off;
+       u64 val;
+};
+
+static struct lcb_datum lcb_cache[] = {
+       { DC_LCB_STS_ROUND_TRIP_LTP_CNT, 0 },
+};
+
+static int write_lcb_cache(u32 off, u64 val)
+{
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(lcb_cache); i++) {
+               if (lcb_cache[i].off == off) {
+                       lcb_cache[i].val = val;
+                       return 0;
+               }
+       }
+
+       pr_warn("%s bad offset 0x%x\n", __func__, off);
+       return -1;
+}
+
+static int read_lcb_cache(u32 off, u64 *val)
+{
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(lcb_cache); i++) {
+               if (lcb_cache[i].off == off) {
+                       *val = lcb_cache[i].val;
+                       return 0;
+               }
+       }
+
+       pr_warn("%s bad offset 0x%x\n", __func__, off);
+       return -1;
+}
+
+void read_ltp_rtt(struct hfi1_devdata *dd)
+{
+       u64 reg;
+
+       if (read_lcb_csr(dd, DC_LCB_STS_ROUND_TRIP_LTP_CNT, &reg))
+               dd_dev_err(dd, "%s: unable to read LTP RTT\n", __func__);
+       else
+               write_lcb_cache(DC_LCB_STS_ROUND_TRIP_LTP_CNT, reg);
+}
+
+static u8 __opa_porttype(struct hfi1_pportdata *ppd)
+{
+       if (qsfp_mod_present(ppd)) {
+               if (ppd->qsfp_info.cache_valid)
+                       return OPA_PORT_TYPE_STANDARD;
+               return OPA_PORT_TYPE_DISCONNECTED;
+       }
+       return OPA_PORT_TYPE_UNKNOWN;
+}
+
+static int __subn_get_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data,
+                                  struct ib_device *ibdev, u8 port,
+                                  u32 *resp_len)
+{
+       int i;
+       struct hfi1_devdata *dd;
+       struct hfi1_pportdata *ppd;
+       struct hfi1_ibport *ibp;
+       struct opa_port_info *pi = (struct opa_port_info *)data;
+       u8 mtu;
+       u8 credit_rate;
+       u32 state;
+       u32 num_ports = OPA_AM_NPORT(am);
+       u32 start_of_sm_config = OPA_AM_START_SM_CFG(am);
+       u32 buffer_units;
+       u64 tmp = 0;
+
+       if (num_ports != 1) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       dd = dd_from_ibdev(ibdev);
+       /* IB numbers ports from 1, hw from 0 */
+       ppd = dd->pport + (port - 1);
+       ibp = &ppd->ibport_data;
+
+       if (ppd->vls_supported/2 > ARRAY_SIZE(pi->neigh_mtu.pvlx_to_mtu) ||
+               ppd->vls_supported > ARRAY_SIZE(dd->vld)) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       pi->lid = cpu_to_be32(ppd->lid);
+
+       /* Only return the mkey if the protection field allows it. */
+       if (!(smp->method == IB_MGMT_METHOD_GET &&
+             ibp->mkey != smp->mkey &&
+             ibp->mkeyprot == 1))
+               pi->mkey = ibp->mkey;
+
+       pi->subnet_prefix = ibp->gid_prefix;
+       pi->sm_lid = cpu_to_be32(ibp->sm_lid);
+       pi->ib_cap_mask = cpu_to_be32(ibp->port_cap_flags);
+       pi->mkey_lease_period = cpu_to_be16(ibp->mkey_lease_period);
+       pi->sm_trap_qp = cpu_to_be32(ppd->sm_trap_qp);
+       pi->sa_qp = cpu_to_be32(ppd->sa_qp);
+
+       pi->link_width.enabled = cpu_to_be16(ppd->link_width_enabled);
+       pi->link_width.supported = cpu_to_be16(ppd->link_width_supported);
+       pi->link_width.active = cpu_to_be16(ppd->link_width_active);
+
+       pi->link_width_downgrade.supported =
+                       cpu_to_be16(ppd->link_width_downgrade_supported);
+       pi->link_width_downgrade.enabled =
+                       cpu_to_be16(ppd->link_width_downgrade_enabled);
+       pi->link_width_downgrade.tx_active =
+                       cpu_to_be16(ppd->link_width_downgrade_tx_active);
+       pi->link_width_downgrade.rx_active =
+                       cpu_to_be16(ppd->link_width_downgrade_rx_active);
+
+       pi->link_speed.supported = cpu_to_be16(ppd->link_speed_supported);
+       pi->link_speed.active = cpu_to_be16(ppd->link_speed_active);
+       pi->link_speed.enabled = cpu_to_be16(ppd->link_speed_enabled);
+
+       state = driver_lstate(ppd);
+
+       if (start_of_sm_config && (state == IB_PORT_INIT))
+               ppd->is_sm_config_started = 1;
+
+       pi->port_phys_conf = __opa_porttype(ppd) & 0xf;
+
+#if PI_LED_ENABLE_SUP
+       pi->port_states.ledenable_offlinereason = ppd->neighbor_normal << 4;
+       pi->port_states.ledenable_offlinereason |=
+               ppd->is_sm_config_started << 5;
+       pi->port_states.ledenable_offlinereason |=
+               ppd->offline_disabled_reason & OPA_PI_MASK_OFFLINE_REASON;
+#else
+       pi->port_states.offline_reason = ppd->neighbor_normal << 4;
+       pi->port_states.offline_reason |= ppd->is_sm_config_started << 5;
+       pi->port_states.offline_reason |= ppd->offline_disabled_reason &
+                                               OPA_PI_MASK_OFFLINE_REASON;
+#endif /* PI_LED_ENABLE_SUP */
+
+       pi->port_states.portphysstate_portstate =
+               (hfi1_ibphys_portstate(ppd) << 4) | state;
+
+       pi->mkeyprotect_lmc = (ibp->mkeyprot << 6) | ppd->lmc;
+
+       memset(pi->neigh_mtu.pvlx_to_mtu, 0, sizeof(pi->neigh_mtu.pvlx_to_mtu));
+       for (i = 0; i < ppd->vls_supported; i++) {
+               mtu = mtu_to_enum(dd->vld[i].mtu, HFI1_DEFAULT_ACTIVE_MTU);
+               if ((i % 2) == 0)
+                       pi->neigh_mtu.pvlx_to_mtu[i/2] |= (mtu << 4);
+               else
+                       pi->neigh_mtu.pvlx_to_mtu[i/2] |= mtu;
+       }
+       /* don't forget VL 15 */
+       mtu = mtu_to_enum(dd->vld[15].mtu, 2048);
+       pi->neigh_mtu.pvlx_to_mtu[15/2] |= mtu;
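+       /*
+        * The packing above places even-numbered VLs in the high nibble
+        * and odd-numbered VLs (including VL15) in the low nibble of
+        * pvlx_to_mtu[vl / 2].
+        */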
+       pi->smsl = ibp->sm_sl & OPA_PI_MASK_SMSL;
+       pi->operational_vls = hfi1_get_ib_cfg(ppd, HFI1_IB_CFG_OP_VLS);
+       pi->partenforce_filterraw |=
+               (ppd->linkinit_reason & OPA_PI_MASK_LINKINIT_REASON);
+       if (ppd->part_enforce & HFI1_PART_ENFORCE_IN)
+               pi->partenforce_filterraw |= OPA_PI_MASK_PARTITION_ENFORCE_IN;
+       if (ppd->part_enforce & HFI1_PART_ENFORCE_OUT)
+               pi->partenforce_filterraw |= OPA_PI_MASK_PARTITION_ENFORCE_OUT;
+       pi->mkey_violations = cpu_to_be16(ibp->mkey_violations);
+       /* P_KeyViolations are counted by hardware. */
+       pi->pkey_violations = cpu_to_be16(ibp->pkey_violations);
+       pi->qkey_violations = cpu_to_be16(ibp->qkey_violations);
+
+       pi->vl.cap = ppd->vls_supported;
+       pi->vl.high_limit = cpu_to_be16(ibp->vl_high_limit);
+       pi->vl.arb_high_cap = (u8)hfi1_get_ib_cfg(ppd, HFI1_IB_CFG_VL_HIGH_CAP);
+       pi->vl.arb_low_cap = (u8)hfi1_get_ib_cfg(ppd, HFI1_IB_CFG_VL_LOW_CAP);
+
+       pi->clientrereg_subnettimeout = ibp->subnet_timeout;
+
+       pi->port_link_mode  = cpu_to_be16(OPA_PORT_LINK_MODE_OPA << 10 |
+                                         OPA_PORT_LINK_MODE_OPA << 5 |
+                                         OPA_PORT_LINK_MODE_OPA);
+
+       pi->port_ltp_crc_mode = cpu_to_be16(ppd->port_ltp_crc_mode);
+
+       pi->port_mode = cpu_to_be16(
+                               ppd->is_active_optimize_enabled ?
+                                       OPA_PI_MASK_PORT_ACTIVE_OPTOMIZE : 0);
+
+       pi->port_packet_format.supported =
+               cpu_to_be16(OPA_PORT_PACKET_FORMAT_9B);
+       pi->port_packet_format.enabled =
+               cpu_to_be16(OPA_PORT_PACKET_FORMAT_9B);
+
+       /* flit_control.interleave is (OPA V1, version .76):
+        * bits         use
+        * ----         ---
+        * 2            res
+        * 2            DistanceSupported
+        * 2            DistanceEnabled
+        * 5            MaxNestLevelTxEnabled
+        * 5            MaxNestLevelRxSupported
+        *
+        * HFI supports only "distance mode 1" (see OPA V1, version .76,
+        * section 9.6.2), so set DistanceSupported, DistanceEnabled
+        * to 0x1.
+        */
+       pi->flit_control.interleave = cpu_to_be16(0x1400);
+
+       pi->link_down_reason = ppd->local_link_down_reason.sma;
+       pi->neigh_link_down_reason = ppd->neigh_link_down_reason.sma;
+       pi->port_error_action = cpu_to_be32(ppd->port_error_action);
+       pi->mtucap = mtu_to_enum(hfi1_max_mtu, IB_MTU_4096);
+
+       /* 32.768 usec. response time (guessing) */
+       pi->resptimevalue = 3;
+
+       pi->local_port_num = port;
+
+       /* buffer info for FM */
+       pi->overall_buffer_space = cpu_to_be16(dd->link_credits);
+
+       pi->neigh_node_guid = cpu_to_be64(ppd->neighbor_guid);
+       pi->neigh_port_num = ppd->neighbor_port_number;
+       pi->port_neigh_mode =
+               (ppd->neighbor_type & OPA_PI_MASK_NEIGH_NODE_TYPE) |
+               (ppd->mgmt_allowed ? OPA_PI_MASK_NEIGH_MGMT_ALLOWED : 0) |
+               (ppd->neighbor_fm_security ?
+                       OPA_PI_MASK_NEIGH_FW_AUTH_BYPASS : 0);
+
+       /* HFIs shall always return VL15 credits to their
+        * neighbor in a timely manner, without any credit return pacing.
+        */
+       credit_rate = 0;
+       buffer_units  = (dd->vau) & OPA_PI_MASK_BUF_UNIT_BUF_ALLOC;
+       buffer_units |= (dd->vcu << 3) & OPA_PI_MASK_BUF_UNIT_CREDIT_ACK;
+       buffer_units |= (credit_rate << 6) &
+                               OPA_PI_MASK_BUF_UNIT_VL15_CREDIT_RATE;
+       buffer_units |= (dd->vl15_init << 11) & OPA_PI_MASK_BUF_UNIT_VL15_INIT;
+       pi->buffer_units = cpu_to_be32(buffer_units);
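+       /*
+        * buffer_units as packed above (per the OPA_PI_MASK_BUF_UNIT_*
+        * fields): vAU in the low bits, vCU starting at bit 3, the VL15
+        * credit rate at bit 6 and vl15_init at bit 11.
+        */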
+
+       pi->opa_cap_mask = cpu_to_be16(OPA_CAP_MASK3_IsSharedSpaceSupported);
+
+       /* HFI supports a replay buffer 128 LTPs in size */
+       pi->replay_depth.buffer = 0x80;
+       /* read the cached value of DC_LCB_STS_ROUND_TRIP_LTP_CNT */
+       read_lcb_cache(DC_LCB_STS_ROUND_TRIP_LTP_CNT, &tmp);
+
+       /* this counter is 16 bits wide, but the replay_depth.wire
+        * variable is only 8 bits */
+       if (tmp > 0xff)
+               tmp = 0xff;
+       pi->replay_depth.wire = tmp;
+
+       if (resp_len)
+               *resp_len += sizeof(struct opa_port_info);
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+/**
+ * get_pkeys - return the PKEY table
+ * @dd: the hfi1_ib device
+ * @port: the IB port number
+ * @pkeys: the pkey table is placed here
+ */
+static int get_pkeys(struct hfi1_devdata *dd, u8 port, u16 *pkeys)
+{
+       struct hfi1_pportdata *ppd = dd->pport + port - 1;
+
+       memcpy(pkeys, ppd->pkeys, sizeof(ppd->pkeys));
+
+       return 0;
+}
+
+static int __subn_get_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data,
+                                   struct ib_device *ibdev, u8 port,
+                                   u32 *resp_len)
+{
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       u32 n_blocks_req = OPA_AM_NBLK(am);
+       u32 start_block = am & 0x7ff;
+       __be16 *p;
+       u16 *q;
+       int i;
+       u16 n_blocks_avail;
+       unsigned npkeys = hfi1_get_npkeys(dd);
+       size_t size;
+
+       if (n_blocks_req == 0) {
+               pr_warn("OPA Get PKey AM Invalid : P = %d; B = 0x%x; N = 0x%x\n",
+                       port, start_block, n_blocks_req);
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       n_blocks_avail = (u16) (npkeys/OPA_PARTITION_TABLE_BLK_SIZE) + 1;
+
+       size = (n_blocks_req * OPA_PARTITION_TABLE_BLK_SIZE) * sizeof(u16);
+
+       if (start_block + n_blocks_req > n_blocks_avail ||
+           n_blocks_req > OPA_NUM_PKEY_BLOCKS_PER_SMP) {
+               pr_warn("OPA Get PKey AM Invalid : s 0x%x; req 0x%x; "
+                       "avail 0x%x; blk/smp 0x%lx\n",
+                       start_block, n_blocks_req, n_blocks_avail,
+                       OPA_NUM_PKEY_BLOCKS_PER_SMP);
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       p = (__be16 *) data;
+       q = (u16 *)data;
+       /* get the real pkeys if we are requesting the first block */
+       if (start_block == 0) {
+               get_pkeys(dd, port, q);
+               for (i = 0; i < npkeys; i++)
+                       p[i] = cpu_to_be16(q[i]);
+               if (resp_len)
+                       *resp_len += size;
+       } else
+               smp->status |= IB_SMP_INVALID_FIELD;
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+enum {
+       HFI_TRANSITION_DISALLOWED,
+       HFI_TRANSITION_IGNORED,
+       HFI_TRANSITION_ALLOWED,
+       HFI_TRANSITION_UNDEFINED,
+};
+
+/*
+ * Use shortened names to improve readability of
+ * {logical,physical}_state_transitions
+ */
+enum {
+       __D = HFI_TRANSITION_DISALLOWED,
+       __I = HFI_TRANSITION_IGNORED,
+       __A = HFI_TRANSITION_ALLOWED,
+       __U = HFI_TRANSITION_UNDEFINED,
+};
+
+/*
+ * IB_PORTPHYSSTATE_POLLING (2) through OPA_PORTPHYSSTATE_MAX (11) are
+ * represented in physical_state_transitions.
+ */
+#define __N_PHYSTATES (OPA_PORTPHYSSTATE_MAX - IB_PORTPHYSSTATE_POLLING + 1)
+
+/*
+ * Within physical_state_transitions, rows represent "old" states,
+ * columns "new" states, and physical_state_transitions.allowed[old][new]
+ * indicates if the transition from old state to new state is legal (see
+ * OPAg1v1, Table 6-4).
+ */
+static const struct {
+       u8 allowed[__N_PHYSTATES][__N_PHYSTATES];
+} physical_state_transitions = {
+       {
+               /* 2    3    4    5    6    7    8    9   10   11 */
+       /* 2 */ { __A, __A, __D, __D, __D, __D, __D, __D, __D, __D },
+       /* 3 */ { __A, __I, __D, __D, __D, __D, __D, __D, __D, __A },
+       /* 4 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U },
+       /* 5 */ { __A, __A, __D, __I, __D, __D, __D, __D, __D, __D },
+       /* 6 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U },
+       /* 7 */ { __D, __A, __D, __D, __D, __I, __D, __D, __D, __D },
+       /* 8 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U },
+       /* 9 */ { __I, __A, __D, __D, __D, __D, __D, __I, __D, __D },
+       /*10 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U },
+       /*11 */ { __D, __A, __D, __D, __D, __D, __D, __D, __D, __I },
+       }
+};
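+
+/*
+ * Example read of the table above (illustrative): row 3, column 2 is
+ * __A, so a request to move from Disabled (3) to Polling (2) is
+ * allowed, while row 2, column 4 is __D, so Polling (2) -> Training (4)
+ * cannot be requested directly.
+ */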
+
+/*
+ * IB_PORT_DOWN (1) through IB_PORT_ACTIVE_DEFER (5) are represented
+ * logical_state_transitions
+ */
+
+#define __N_LOGICAL_STATES (IB_PORT_ACTIVE_DEFER - IB_PORT_DOWN + 1)
+
+/*
+ * Within logical_state_transitions rows represent "old" states,
+ * columns "new" states, and logical_state_transitions.allowed[old][new]
+ * indicates if the transition from old state to new state is legal (see
+ * OPAg1v1, Table 9-12).
+ */
+static const struct {
+       u8 allowed[__N_LOGICAL_STATES][__N_LOGICAL_STATES];
+} logical_state_transitions = {
+       {
+               /* 1    2    3    4    5 */
+       /* 1 */ { __I, __D, __D, __D, __U},
+       /* 2 */ { __D, __I, __A, __D, __U},
+       /* 3 */ { __D, __D, __I, __A, __U},
+       /* 4 */ { __D, __D, __I, __I, __U},
+       /* 5 */ { __U, __U, __U, __U, __U},
+       }
+};
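+
+/*
+ * Example read of the table above (illustrative): row 2, column 3 is
+ * __A (Init -> Armed is allowed), while row 2, column 4 is __D, so a
+ * request to jump from Init straight to Active is rejected.
+ */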
+
+static int logical_transition_allowed(int old, int new)
+{
+       if (old < IB_PORT_NOP || old > IB_PORT_ACTIVE_DEFER ||
+           new < IB_PORT_NOP || new > IB_PORT_ACTIVE_DEFER) {
+               pr_warn("invalid logical state(s) (old %d new %d)\n",
+                       old, new);
+               return HFI_TRANSITION_UNDEFINED;
+       }
+
+       if (new == IB_PORT_NOP)
+               return HFI_TRANSITION_ALLOWED; /* always allowed */
+
+       /* adjust states for indexing into logical_state_transitions */
+       old -= IB_PORT_DOWN;
+       new -= IB_PORT_DOWN;
+
+       if (old < 0 || new < 0)
+               return HFI_TRANSITION_UNDEFINED;
+       return logical_state_transitions.allowed[old][new];
+}
+
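+/*
+ * Look up a physical state transition in physical_state_transitions.
+ * Out-of-range states are reported and treated as undefined; a
+ * transition to IB_PORTPHYSSTATE_NOP is always allowed.
+ */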
+static int physical_transition_allowed(int old, int new)
+{
+       if (old < IB_PORTPHYSSTATE_NOP || old > OPA_PORTPHYSSTATE_MAX ||
+           new < IB_PORTPHYSSTATE_NOP || new > OPA_PORTPHYSSTATE_MAX) {
+               pr_warn("invalid physical state(s) (old %d new %d)\n",
+                       old, new);
+               return HFI_TRANSITION_UNDEFINED;
+       }
+
+       if (new == IB_PORTPHYSSTATE_NOP)
+               return HFI_TRANSITION_ALLOWED; /* always allowed */
+
+       /* adjust states for indexing into physical_state_transitions */
+       old -= IB_PORTPHYSSTATE_POLLING;
+       new -= IB_PORTPHYSSTATE_POLLING;
+
+       if (old < 0 || new < 0)
+               return HFI_TRANSITION_UNDEFINED;
+       return physical_state_transitions.allowed[old][new];
+}
+
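+/*
+ * Check the requested logical and physical state transitions against the
+ * tables above.  A disallowed or undefined result from either check is
+ * returned to the caller; if both transitions are ignored the request is
+ * ignored, otherwise it is allowed.
+ */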
+static int port_states_transition_allowed(struct hfi1_pportdata *ppd,
+                                         u32 logical_new, u32 physical_new)
+{
+       u32 physical_old = driver_physical_state(ppd);
+       u32 logical_old = driver_logical_state(ppd);
+       int ret, logical_allowed, physical_allowed;
+
+       logical_allowed = ret =
+               logical_transition_allowed(logical_old, logical_new);
+
+       if (ret == HFI_TRANSITION_DISALLOWED ||
+           ret == HFI_TRANSITION_UNDEFINED) {
+               pr_warn("invalid logical state transition %s -> %s\n",
+                       opa_lstate_name(logical_old),
+                       opa_lstate_name(logical_new));
+               return ret;
+       }
+
+       physical_allowed = ret =
+               physical_transition_allowed(physical_old, physical_new);
+
+       if (ret == HFI_TRANSITION_DISALLOWED ||
+           ret == HFI_TRANSITION_UNDEFINED) {
+               pr_warn("invalid physical state transition %s -> %s\n",
+                       opa_pstate_name(physical_old),
+                       opa_pstate_name(physical_new));
+               return ret;
+       }
+
+       if (logical_allowed == HFI_TRANSITION_IGNORED &&
+           physical_allowed == HFI_TRANSITION_IGNORED)
+               return HFI_TRANSITION_IGNORED;
+
+       /*
+        * Either physical_allowed or logical_allowed is
+        * HFI_TRANSITION_ALLOWED.
+        */
+       return HFI_TRANSITION_ALLOWED;
+}
+
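+/*
+ * Apply a SubnSet(OPA_PortInfo) logical/physical port state change.
+ * Invalid combinations set IB_SMP_INVALID_FIELD in the SMP status; when a
+ * Disable would cause the reply to be sent through the port being
+ * disabled, the MAD is consumed and no reply is sent.
+ */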
+static int set_port_states(struct hfi1_pportdata *ppd, struct opa_smp *smp,
+                          u32 logical_state, u32 phys_state,
+                          int suppress_idle_sma)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u32 link_state;
+       int ret;
+
+       ret = port_states_transition_allowed(ppd, logical_state, phys_state);
+       if (ret == HFI_TRANSITION_DISALLOWED ||
+           ret == HFI_TRANSITION_UNDEFINED) {
+               /* error message emitted above */
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return 0;
+       }
+
+       if (ret == HFI_TRANSITION_IGNORED)
+               return 0;
+
+       if ((phys_state != IB_PORTPHYSSTATE_NOP) &&
+           !(logical_state == IB_PORT_DOWN ||
+             logical_state == IB_PORT_NOP)) {
+               pr_warn("SubnSet(OPA_PortInfo) port state invalid: logical_state 0x%x physical_state 0x%x\n",
+                       logical_state, phys_state);
+               smp->status |= IB_SMP_INVALID_FIELD;
+       }
+
+       /*
+        * Logical state changes are summarized in OPAv1g1 spec.,
+        * Table 9-12; physical state changes are summarized in
+        * OPAv1g1 spec., Table 6-4.
+        */
+       switch (logical_state) {
+       case IB_PORT_NOP:
+               if (phys_state == IB_PORTPHYSSTATE_NOP)
+                       break;
+               /* FALLTHROUGH */
+       case IB_PORT_DOWN:
+               if (phys_state == IB_PORTPHYSSTATE_NOP)
+                       link_state = HLS_DN_DOWNDEF;
+               else if (phys_state == IB_PORTPHYSSTATE_POLLING) {
+                       link_state = HLS_DN_POLL;
+                       set_link_down_reason(ppd,
+                            OPA_LINKDOWN_REASON_FM_BOUNCE, 0,
+                            OPA_LINKDOWN_REASON_FM_BOUNCE);
+               } else if (phys_state == IB_PORTPHYSSTATE_DISABLED)
+                       link_state = HLS_DN_DISABLE;
+               else {
+                       pr_warn("SubnSet(OPA_PortInfo) invalid physical state 0x%x\n",
+                               phys_state);
+                       smp->status |= IB_SMP_INVALID_FIELD;
+                       break;
+               }
+
+               set_link_state(ppd, link_state);
+               if (link_state == HLS_DN_DISABLE &&
+                   (ppd->offline_disabled_reason >
+                    OPA_LINKDOWN_REASON_SMA_DISABLED ||
+                    ppd->offline_disabled_reason ==
+                    OPA_LINKDOWN_REASON_NONE))
+                       ppd->offline_disabled_reason =
+                       OPA_LINKDOWN_REASON_SMA_DISABLED;
+               /*
+                * Don't send a reply if the response would be sent
+                * through the disabled port.
+                */
+               if (link_state == HLS_DN_DISABLE && smp->hop_cnt)
+                       return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+               break;
+       case IB_PORT_ARMED:
+               ret = set_link_state(ppd, HLS_UP_ARMED);
+               if ((ret == 0) && (suppress_idle_sma == 0))
+                       send_idle_sma(dd, SMA_IDLE_ARM);
+               break;
+       case IB_PORT_ACTIVE:
+               if (ppd->neighbor_normal) {
+                       ret = set_link_state(ppd, HLS_UP_ACTIVE);
+                       if (ret == 0)
+                               send_idle_sma(dd, SMA_IDLE_ACTIVE);
+               } else {
+                       pr_warn("SubnSet(OPA_PortInfo) Cannot move to Active with NeighborNormal 0\n");
+                       smp->status |= IB_SMP_INVALID_FIELD;
+               }
+               break;
+       default:
+               pr_warn("SubnSet(OPA_PortInfo) invalid logical state 0x%x\n",
+                       logical_state);
+               smp->status |= IB_SMP_INVALID_FIELD;
+       }
+
+       return 0;
+}
+
+/**
+ * __subn_set_opa_portinfo - set port information
+ * @smp: the incoming SM packet
+ * @ibdev: the infiniband device
+ * @port: the port on the device
+ *
+ */
+static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data,
+                                  struct ib_device *ibdev, u8 port,
+                                  u32 *resp_len)
+{
+       struct opa_port_info *pi = (struct opa_port_info *)data;
+       struct ib_event event;
+       struct hfi1_devdata *dd;
+       struct hfi1_pportdata *ppd;
+       struct hfi1_ibport *ibp;
+       u8 clientrereg;
+       unsigned long flags;
+       u32 smlid, opa_lid; /* tmp vars to hold LID values */
+       u16 lid;
+       u8 ls_old, ls_new, ps_new;
+       u8 vls;
+       u8 msl;
+       u8 crc_enabled;
+       u16 lse, lwe, mtu;
+       u32 num_ports = OPA_AM_NPORT(am);
+       u32 start_of_sm_config = OPA_AM_START_SM_CFG(am);
+       int ret, i, invalid = 0, call_set_mtu = 0;
+       int call_link_downgrade_policy = 0;
+
+       if (num_ports != 1) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       opa_lid = be32_to_cpu(pi->lid);
+       if (opa_lid & 0xFFFF0000) {
+               pr_warn("OPA_PortInfo lid out of range: %X\n", opa_lid);
+               smp->status |= IB_SMP_INVALID_FIELD;
+               goto get_only;
+       }
+
+       lid = (u16)(opa_lid & 0x0000FFFF);
+
+       smlid = be32_to_cpu(pi->sm_lid);
+       if (smlid & 0xFFFF0000) {
+               pr_warn("OPA_PortInfo SM lid out of range: %X\n", smlid);
+               smp->status |= IB_SMP_INVALID_FIELD;
+               goto get_only;
+       }
+       smlid &= 0x0000FFFF;
+
+       clientrereg = (pi->clientrereg_subnettimeout &
+                       OPA_PI_MASK_CLIENT_REREGISTER);
+
+       dd = dd_from_ibdev(ibdev);
+       /* IB numbers ports from 1, hw from 0 */
+       ppd = dd->pport + (port - 1);
+       ibp = &ppd->ibport_data;
+       event.device = ibdev;
+       event.element.port_num = port;
+
+       ls_old = driver_lstate(ppd);
+
+       ibp->mkey = pi->mkey;
+       ibp->gid_prefix = pi->subnet_prefix;
+       ibp->mkey_lease_period = be16_to_cpu(pi->mkey_lease_period);
+
+       /* Must be a valid unicast LID address. */
+       if ((lid == 0 && ls_old > IB_PORT_INIT) ||
+            lid >= HFI1_MULTICAST_LID_BASE) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               pr_warn("SubnSet(OPA_PortInfo) lid invalid 0x%x\n",
+                       lid);
+       } else if (ppd->lid != lid ||
+                ppd->lmc != (pi->mkeyprotect_lmc & OPA_PI_MASK_LMC)) {
+               if (ppd->lid != lid)
+                       hfi1_set_uevent_bits(ppd, _HFI1_EVENT_LID_CHANGE_BIT);
+               if (ppd->lmc != (pi->mkeyprotect_lmc & OPA_PI_MASK_LMC))
+                       hfi1_set_uevent_bits(ppd, _HFI1_EVENT_LMC_CHANGE_BIT);
+               hfi1_set_lid(ppd, lid, pi->mkeyprotect_lmc & OPA_PI_MASK_LMC);
+               event.event = IB_EVENT_LID_CHANGE;
+               ib_dispatch_event(&event);
+       }
+
+       msl = pi->smsl & OPA_PI_MASK_SMSL;
+       if (pi->partenforce_filterraw & OPA_PI_MASK_LINKINIT_REASON)
+               ppd->linkinit_reason =
+                       (pi->partenforce_filterraw &
+                        OPA_PI_MASK_LINKINIT_REASON);
+       /* enable/disable SW pkey checking as per FM control */
+       if (pi->partenforce_filterraw & OPA_PI_MASK_PARTITION_ENFORCE_IN)
+               ppd->part_enforce |= HFI1_PART_ENFORCE_IN;
+       else
+               ppd->part_enforce &= ~HFI1_PART_ENFORCE_IN;
+
+       if (pi->partenforce_filterraw & OPA_PI_MASK_PARTITION_ENFORCE_OUT)
+               ppd->part_enforce |= HFI1_PART_ENFORCE_OUT;
+       else
+               ppd->part_enforce &= ~HFI1_PART_ENFORCE_OUT;
+
+       /* Must be a valid unicast LID address. */
+       if ((smlid == 0 && ls_old > IB_PORT_INIT) ||
+            smlid >= HFI1_MULTICAST_LID_BASE) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               pr_warn("SubnSet(OPA_PortInfo) smlid invalid 0x%x\n", smlid);
+       } else if (smlid != ibp->sm_lid || msl != ibp->sm_sl) {
+               pr_warn("SubnSet(OPA_PortInfo) smlid 0x%x\n", smlid);
+               spin_lock_irqsave(&ibp->lock, flags);
+               if (ibp->sm_ah) {
+                       if (smlid != ibp->sm_lid)
+                               ibp->sm_ah->attr.dlid = smlid;
+                       if (msl != ibp->sm_sl)
+                               ibp->sm_ah->attr.sl = msl;
+               }
+               spin_unlock_irqrestore(&ibp->lock, flags);
+               if (smlid != ibp->sm_lid)
+                       ibp->sm_lid = smlid;
+               if (msl != ibp->sm_sl)
+                       ibp->sm_sl = msl;
+               event.event = IB_EVENT_SM_CHANGE;
+               ib_dispatch_event(&event);
+       }
+
+       if (pi->link_down_reason == 0) {
+               ppd->local_link_down_reason.sma = 0;
+               ppd->local_link_down_reason.latest = 0;
+       }
+
+       if (pi->neigh_link_down_reason == 0) {
+               ppd->neigh_link_down_reason.sma = 0;
+               ppd->neigh_link_down_reason.latest = 0;
+       }
+
+       ppd->sm_trap_qp = be32_to_cpu(pi->sm_trap_qp);
+       ppd->sa_qp = be32_to_cpu(pi->sa_qp);
+
+       ppd->port_error_action = be32_to_cpu(pi->port_error_action);
+       lwe = be16_to_cpu(pi->link_width.enabled);
+       if (lwe) {
+               if (lwe == OPA_LINK_WIDTH_RESET ||
+                   lwe == OPA_LINK_WIDTH_RESET_OLD)
+                       set_link_width_enabled(ppd, ppd->link_width_supported);
+               else if ((lwe & ~ppd->link_width_supported) == 0)
+                       set_link_width_enabled(ppd, lwe);
+               else
+                       smp->status |= IB_SMP_INVALID_FIELD;
+       }
+       lwe = be16_to_cpu(pi->link_width_downgrade.enabled);
+       /* LWD.E is always applied - 0 means "disabled" */
+       if (lwe == OPA_LINK_WIDTH_RESET ||
+           lwe == OPA_LINK_WIDTH_RESET_OLD) {
+               set_link_width_downgrade_enabled(ppd,
+                               ppd->link_width_downgrade_supported);
+       } else if ((lwe & ~ppd->link_width_downgrade_supported) == 0) {
+               /* only set and apply if something changed */
+               if (lwe != ppd->link_width_downgrade_enabled) {
+                       set_link_width_downgrade_enabled(ppd, lwe);
+                       call_link_downgrade_policy = 1;
+               }
+       } else
+               smp->status |= IB_SMP_INVALID_FIELD;
+
+       lse = be16_to_cpu(pi->link_speed.enabled);
+       if (lse) {
+               if (lse & be16_to_cpu(pi->link_speed.supported))
+                       set_link_speed_enabled(ppd, lse);
+               else
+                       smp->status |= IB_SMP_INVALID_FIELD;
+       }
+
+       ibp->mkeyprot = (pi->mkeyprotect_lmc & OPA_PI_MASK_MKEY_PROT_BIT) >> 6;
+       ibp->vl_high_limit = be16_to_cpu(pi->vl.high_limit) & 0xFF;
+       (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_VL_HIGH_LIMIT,
+                                   ibp->vl_high_limit);
+
+       if (ppd->vls_supported/2 > ARRAY_SIZE(pi->neigh_mtu.pvlx_to_mtu) ||
+               ppd->vls_supported > ARRAY_SIZE(dd->vld)) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+       for (i = 0; i < ppd->vls_supported; i++) {
+               if ((i % 2) == 0)
+                       mtu = enum_to_mtu((pi->neigh_mtu.pvlx_to_mtu[i/2] >> 4)
+                                         & 0xF);
+               else
+                       mtu = enum_to_mtu(pi->neigh_mtu.pvlx_to_mtu[i/2] & 0xF);
+               if (mtu == 0xffff) {
+                       pr_warn("SubnSet(OPA_PortInfo) mtu invalid %d (0x%x)\n",
+                               mtu,
+                               (pi->neigh_mtu.pvlx_to_mtu[0] >> 4) & 0xF);
+                       smp->status |= IB_SMP_INVALID_FIELD;
+                       mtu = hfi1_max_mtu; /* use a valid MTU */
+               }
+               if (dd->vld[i].mtu != mtu) {
+                       dd_dev_info(dd,
+                               "MTU change on vl %d from %d to %d\n",
+                               i, dd->vld[i].mtu, mtu);
+                       dd->vld[i].mtu = mtu;
+                       call_set_mtu++;
+               }
+       }
+       /* As per the OPAv1 spec: VL15 must support, and be configured
+        * for, operation with an MTU of 2048 or larger.
+        */
+       mtu = enum_to_mtu(pi->neigh_mtu.pvlx_to_mtu[15/2] & 0xF);
+       if (mtu < 2048 || mtu == 0xffff)
+               mtu = 2048;
+       if (dd->vld[15].mtu != mtu) {
+               dd_dev_info(dd,
+                       "MTU change on vl 15 from %d to %d\n",
+                       dd->vld[15].mtu, mtu);
+               dd->vld[15].mtu = mtu;
+               call_set_mtu++;
+       }
+       if (call_set_mtu)
+               set_mtu(ppd);
+
+       /* Set operational VLs */
+       vls = pi->operational_vls & OPA_PI_MASK_OPERATIONAL_VL;
+       if (vls) {
+               if (vls > ppd->vls_supported) {
+                       pr_warn("SubnSet(OPA_PortInfo) VL's supported invalid %d\n",
+                               pi->operational_vls);
+                       smp->status |= IB_SMP_INVALID_FIELD;
+               } else {
+                       if (hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_OP_VLS,
+                                               vls) == -EINVAL)
+                               smp->status |= IB_SMP_INVALID_FIELD;
+               }
+       }
+
+       if (pi->mkey_violations == 0)
+               ibp->mkey_violations = 0;
+
+       if (pi->pkey_violations == 0)
+               ibp->pkey_violations = 0;
+
+       if (pi->qkey_violations == 0)
+               ibp->qkey_violations = 0;
+
+       ibp->subnet_timeout =
+               pi->clientrereg_subnettimeout & OPA_PI_MASK_SUBNET_TIMEOUT;
+
+       crc_enabled = be16_to_cpu(pi->port_ltp_crc_mode);
+       crc_enabled >>= 4;
+       crc_enabled &= 0xf;
+
+       if (crc_enabled != 0)
+               ppd->port_crc_mode_enabled = port_ltp_to_cap(crc_enabled);
+
+       ppd->is_active_optimize_enabled =
+                       !!(be16_to_cpu(pi->port_mode)
+                                       & OPA_PI_MASK_PORT_ACTIVE_OPTOMIZE);
+
+       ls_new = pi->port_states.portphysstate_portstate &
+                       OPA_PI_MASK_PORT_STATE;
+       ps_new = (pi->port_states.portphysstate_portstate &
+                       OPA_PI_MASK_PORT_PHYSICAL_STATE) >> 4;
+
+       if (ls_old == IB_PORT_INIT) {
+               if (start_of_sm_config) {
+                       if (ls_new == ls_old || (ls_new == IB_PORT_ARMED))
+                               ppd->is_sm_config_started = 1;
+               } else if (ls_new == IB_PORT_ARMED) {
+                       if (ppd->is_sm_config_started == 0)
+                               invalid = 1;
+               }
+       }
+
+       /* Handle CLIENT_REREGISTER event b/c SM asked us for it */
+       if (clientrereg) {
+               event.event = IB_EVENT_CLIENT_REREGISTER;
+               ib_dispatch_event(&event);
+       }
+
+       /*
+        * Do the port state change now that the other link parameters
+        * have been set.
+        * Changing the port physical state only makes sense if the link
+        * is down or is being set to down.
+        */
+
+       ret = set_port_states(ppd, smp, ls_new, ps_new, invalid);
+       if (ret)
+               return ret;
+
+       ret = __subn_get_opa_portinfo(smp, am, data, ibdev, port, resp_len);
+
+       /* restore re-reg bit per o14-12.2.1 */
+       pi->clientrereg_subnettimeout |= clientrereg;
+
+       /*
+        * Apply the new link downgrade policy.  This may result in a link
+        * bounce.  Do this after everything else so things are settled.
+        * Possible problem: if setting the port state above fails, then
+        * the policy change is not applied.
+        */
+       if (call_link_downgrade_policy)
+               apply_link_downgrade_policy(ppd, 0);
+
+       return ret;
+
+get_only:
+       return __subn_get_opa_portinfo(smp, am, data, ibdev, port, resp_len);
+}
+
+/**
+ * set_pkeys - set the PKEY table for ctxt 0
+ * @dd: the hfi1_ib device
+ * @port: the IB port number
+ * @pkeys: the PKEY table
+ */
+static int set_pkeys(struct hfi1_devdata *dd, u8 port, u16 *pkeys)
+{
+       struct hfi1_pportdata *ppd;
+       int i;
+       int changed = 0;
+       int update_includes_mgmt_partition = 0;
+
+       /*
+        * IB ports one/two always map to contexts zero/one, which are
+        * always kernel contexts, so no locking is needed.
+        * If we get here with ppd set up, there is no need to check
+        * that rcd is valid.
+        */
+       ppd = dd->pport + (port - 1);
+       /*
+        * If the update does not include the management pkey, don't do it.
+        */
+       for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) {
+               if (pkeys[i] == LIM_MGMT_P_KEY) {
+                       update_includes_mgmt_partition = 1;
+                       break;
+               }
+       }
+
+       if (!update_includes_mgmt_partition)
+               return 1;
+
+       for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) {
+               u16 key = pkeys[i];
+               u16 okey = ppd->pkeys[i];
+
+               if (key == okey)
+                       continue;
+               /*
+                * The SM gives us the complete PKey table. We have
+                * to ensure that we put the PKeys in the matching
+                * slots.
+                */
+               ppd->pkeys[i] = key;
+               changed = 1;
+       }
+
+       if (changed) {
+               struct ib_event event;
+
+               (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_PKEYS, 0);
+
+               event.event = IB_EVENT_PKEY_CHANGE;
+               event.device = &dd->verbs_dev.ibdev;
+               event.element.port_num = port;
+               ib_dispatch_event(&event);
+       }
+       return 0;
+}
+
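+/*
+ * Handle a SubnSet of the partition key table: validate the block count,
+ * byte-swap the incoming PKey blocks in place, and apply them through
+ * set_pkeys() when the update starts at block 0.
+ */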
+static int __subn_set_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data,
+                                   struct ib_device *ibdev, u8 port,
+                                   u32 *resp_len)
+{
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       u32 n_blocks_sent = OPA_AM_NBLK(am);
+       u32 start_block = am & 0x7ff;
+       u16 *p = (u16 *) data;
+       __be16 *q = (__be16 *)data;
+       int i;
+       u16 n_blocks_avail;
+       unsigned npkeys = hfi1_get_npkeys(dd);
+
+       if (n_blocks_sent == 0) {
+               pr_warn("OPA Get PKey AM Invalid : P = %d; B = 0x%x; N = 0x%x\n",
+                       port, start_block, n_blocks_sent);
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       n_blocks_avail = (u16)(npkeys/OPA_PARTITION_TABLE_BLK_SIZE) + 1;
+
+       if (start_block + n_blocks_sent > n_blocks_avail ||
+           n_blocks_sent > OPA_NUM_PKEY_BLOCKS_PER_SMP) {
+               pr_warn("OPA Set PKey AM Invalid : s 0x%x; req 0x%x; avail 0x%x; blk/smp 0x%lx\n",
+                       start_block, n_blocks_sent, n_blocks_avail,
+                       OPA_NUM_PKEY_BLOCKS_PER_SMP);
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       for (i = 0; i < n_blocks_sent * OPA_PARTITION_TABLE_BLK_SIZE; i++)
+               p[i] = be16_to_cpu(q[i]);
+
+       if (start_block == 0 && set_pkeys(dd, port, p) != 0) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       return __subn_get_opa_pkeytable(smp, am, data, ibdev, port, resp_len);
+}
+
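+/* Copy the four SEND_SC2VLT0-3 CSRs into the caller's buffer. */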
+static int get_sc2vlt_tables(struct hfi1_devdata *dd, void *data)
+{
+       u64 *val = (u64 *)data;
+
+       *val++ = read_csr(dd, SEND_SC2VLT0);
+       *val++ = read_csr(dd, SEND_SC2VLT1);
+       *val++ = read_csr(dd, SEND_SC2VLT2);
+       *val++ = read_csr(dd, SEND_SC2VLT3);
+       return 0;
+}
+
+#define ILLEGAL_VL 12
+/*
+ * filter_sc2vlt changes mappings to VL15 to ILLEGAL_VL (except
+ * for SC15, which must map to VL15). If we don't remap things this
+ * way it is possible for VL15 counters to increment when we try to
+ * send on a SC which is mapped to an invalid VL.
+ */
+static void filter_sc2vlt(void *data)
+{
+       int i;
+       u8 *pd = (u8 *)data;
+
+       for (i = 0; i < OPA_MAX_SCS; i++) {
+               if (i == 15)
+                       continue;
+               if ((pd[i] & 0x1f) == 0xf)
+                       pd[i] = ILLEGAL_VL;
+       }
+}
+
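+/*
+ * Write a new SC to VLt mapping: filter out illegal VL15 mappings, program
+ * the SEND_SC2VLT0-3 CSRs, and refresh the cached copy in dd->sc2vl under
+ * the sc2vl seqlock.
+ */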
+static int set_sc2vlt_tables(struct hfi1_devdata *dd, void *data)
+{
+       u64 *val = (u64 *)data;
+
+       filter_sc2vlt(data);
+
+       write_csr(dd, SEND_SC2VLT0, *val++);
+       write_csr(dd, SEND_SC2VLT1, *val++);
+       write_csr(dd, SEND_SC2VLT2, *val++);
+       write_csr(dd, SEND_SC2VLT3, *val++);
+       write_seqlock_irq(&dd->sc2vl_lock);
+       memcpy(dd->sc2vl, (u64 *)data, sizeof(dd->sc2vl));
+       write_sequnlock_irq(&dd->sc2vl_lock);
+       return 0;
+}
+
+static int __subn_get_opa_sl_to_sc(struct opa_smp *smp, u32 am, u8 *data,
+                                  struct ib_device *ibdev, u8 port,
+                                  u32 *resp_len)
+{
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       u8 *p = (u8 *)data;
+       size_t size = ARRAY_SIZE(ibp->sl_to_sc); /* == 32 */
+       unsigned i;
+
+       if (am) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       for (i = 0; i < ARRAY_SIZE(ibp->sl_to_sc); i++)
+               *p++ = ibp->sl_to_sc[i];
+
+       if (resp_len)
+               *resp_len += size;
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_set_opa_sl_to_sc(struct opa_smp *smp, u32 am, u8 *data,
+                                  struct ib_device *ibdev, u8 port,
+                                  u32 *resp_len)
+{
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       u8 *p = (u8 *)data;
+       int i;
+
+       if (am) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       for (i = 0; i <  ARRAY_SIZE(ibp->sl_to_sc); i++)
+               ibp->sl_to_sc[i] = *p++;
+
+       return __subn_get_opa_sl_to_sc(smp, am, data, ibdev, port, resp_len);
+}
+
+static int __subn_get_opa_sc_to_sl(struct opa_smp *smp, u32 am, u8 *data,
+                                  struct ib_device *ibdev, u8 port,
+                                  u32 *resp_len)
+{
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       u8 *p = (u8 *)data;
+       size_t size = ARRAY_SIZE(ibp->sc_to_sl); /* == 32 */
+       unsigned i;
+
+       if (am) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       for (i = 0; i < ARRAY_SIZE(ibp->sc_to_sl); i++)
+               *p++ = ibp->sc_to_sl[i];
+
+       if (resp_len)
+               *resp_len += size;
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_set_opa_sc_to_sl(struct opa_smp *smp, u32 am, u8 *data,
+                                  struct ib_device *ibdev, u8 port,
+                                  u32 *resp_len)
+{
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       u8 *p = (u8 *)data;
+       int i;
+
+       if (am) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       for (i = 0; i < ARRAY_SIZE(ibp->sc_to_sl); i++)
+               ibp->sc_to_sl[i] = *p++;
+
+       return __subn_get_opa_sc_to_sl(smp, am, data, ibdev, port, resp_len);
+}
+
+static int __subn_get_opa_sc_to_vlt(struct opa_smp *smp, u32 am, u8 *data,
+                                   struct ib_device *ibdev, u8 port,
+                                   u32 *resp_len)
+{
+       u32 n_blocks = OPA_AM_NBLK(am);
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       void *vp = (void *) data;
+       size_t size = 4 * sizeof(u64);
+
+       if (n_blocks != 1) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       get_sc2vlt_tables(dd, vp);
+
+       if (resp_len)
+               *resp_len += size;
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
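+/*
+ * Set the SC to VLt mapping.  Exactly one block must be supplied, async
+ * updates are not supported, and the table may not be changed while the
+ * link is Armed or Active.
+ */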
+static int __subn_set_opa_sc_to_vlt(struct opa_smp *smp, u32 am, u8 *data,
+                                   struct ib_device *ibdev, u8 port,
+                                   u32 *resp_len)
+{
+       u32 n_blocks = OPA_AM_NBLK(am);
+       int async_update = OPA_AM_ASYNC(am);
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       void *vp = (void *) data;
+       struct hfi1_pportdata *ppd;
+       int lstate;
+
+       if (n_blocks != 1 || async_update) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       /* IB numbers ports from 1, hw from 0 */
+       ppd = dd->pport + (port - 1);
+       lstate = driver_lstate(ppd);
+       /*
+        * It's known that async_update is 0 by this point, but include
+        * the explicit check for clarity.
+        */
+       if (!async_update &&
+           (lstate == IB_PORT_ARMED || lstate == IB_PORT_ACTIVE)) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       set_sc2vlt_tables(dd, vp);
+
+       return __subn_get_opa_sc_to_vlt(smp, am, data, ibdev, port, resp_len);
+}
+
+static int __subn_get_opa_sc_to_vlnt(struct opa_smp *smp, u32 am, u8 *data,
+                                    struct ib_device *ibdev, u8 port,
+                                    u32 *resp_len)
+{
+       u32 n_blocks = OPA_AM_NPORT(am);
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       struct hfi1_pportdata *ppd;
+       void *vp = (void *) data;
+       int size;
+
+       if (n_blocks != 1) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       ppd = dd->pport + (port - 1);
+
+       size = fm_get_table(ppd, FM_TBL_SC2VLNT, vp);
+
+       if (resp_len)
+               *resp_len += size;
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_set_opa_sc_to_vlnt(struct opa_smp *smp, u32 am, u8 *data,
+                                    struct ib_device *ibdev, u8 port,
+                                    u32 *resp_len)
+{
+       u32 n_blocks = OPA_AM_NPORT(am);
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       struct hfi1_pportdata *ppd;
+       void *vp = (void *) data;
+       int lstate;
+
+       if (n_blocks != 1) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       /* IB numbers ports from 1, hw from 0 */
+       ppd = dd->pport + (port - 1);
+       lstate = driver_lstate(ppd);
+       if (lstate == IB_PORT_ARMED || lstate == IB_PORT_ACTIVE) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       fm_set_table(ppd, FM_TBL_SC2VLNT, vp);
+
+       return __subn_get_opa_sc_to_vlnt(smp, am, data, ibdev, port,
+                                        resp_len);
+}
+
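+/*
+ * Report PortStateInfo for a single port: the current logical/physical
+ * state, the offline/LED reason bits, and the active link width
+ * downgrade values.
+ */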
+static int __subn_get_opa_psi(struct opa_smp *smp, u32 am, u8 *data,
+                             struct ib_device *ibdev, u8 port,
+                             u32 *resp_len)
+{
+       u32 nports = OPA_AM_NPORT(am);
+       u32 start_of_sm_config = OPA_AM_START_SM_CFG(am);
+       u32 lstate;
+       struct hfi1_ibport *ibp;
+       struct hfi1_pportdata *ppd;
+       struct opa_port_state_info *psi = (struct opa_port_state_info *) data;
+
+       if (nports != 1) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       ibp = to_iport(ibdev, port);
+       ppd = ppd_from_ibp(ibp);
+
+       lstate = driver_lstate(ppd);
+
+       if (start_of_sm_config && (lstate == IB_PORT_INIT))
+               ppd->is_sm_config_started = 1;
+
+#if PI_LED_ENABLE_SUP
+       psi->port_states.ledenable_offlinereason = ppd->neighbor_normal << 4;
+       psi->port_states.ledenable_offlinereason |=
+               ppd->is_sm_config_started << 5;
+       psi->port_states.ledenable_offlinereason |=
+               ppd->offline_disabled_reason & OPA_PI_MASK_OFFLINE_REASON;
+#else
+       psi->port_states.offline_reason = ppd->neighbor_normal << 4;
+       psi->port_states.offline_reason |= ppd->is_sm_config_started << 5;
+       psi->port_states.offline_reason |= ppd->offline_disabled_reason &
+                               OPA_PI_MASK_OFFLINE_REASON;
+#endif /* PI_LED_ENABLE_SUP */
+
+       psi->port_states.portphysstate_portstate =
+               (hfi1_ibphys_portstate(ppd) << 4) | (lstate & 0xf);
+       psi->link_width_downgrade_tx_active =
+         ppd->link_width_downgrade_tx_active;
+       psi->link_width_downgrade_rx_active =
+         ppd->link_width_downgrade_rx_active;
+       if (resp_len)
+               *resp_len += sizeof(struct opa_port_state_info);
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_set_opa_psi(struct opa_smp *smp, u32 am, u8 *data,
+                             struct ib_device *ibdev, u8 port,
+                             u32 *resp_len)
+{
+       u32 nports = OPA_AM_NPORT(am);
+       u32 start_of_sm_config = OPA_AM_START_SM_CFG(am);
+       u32 ls_old;
+       u8 ls_new, ps_new;
+       struct hfi1_ibport *ibp;
+       struct hfi1_pportdata *ppd;
+       struct opa_port_state_info *psi = (struct opa_port_state_info *) data;
+       int ret, invalid = 0;
+
+       if (nports != 1) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       ibp = to_iport(ibdev, port);
+       ppd = ppd_from_ibp(ibp);
+
+       ls_old = driver_lstate(ppd);
+
+       ls_new = port_states_to_logical_state(&psi->port_states);
+       ps_new = port_states_to_phys_state(&psi->port_states);
+
+       if (ls_old == IB_PORT_INIT) {
+               if (start_of_sm_config) {
+                       if (ls_new == ls_old || (ls_new == IB_PORT_ARMED))
+                               ppd->is_sm_config_started = 1;
+               } else if (ls_new == IB_PORT_ARMED) {
+                       if (ppd->is_sm_config_started == 0)
+                               invalid = 1;
+               }
+       }
+
+       ret = set_port_states(ppd, smp, ls_new, ps_new, invalid);
+       if (ret)
+               return ret;
+
+       if (invalid)
+               smp->status |= IB_SMP_INVALID_FIELD;
+
+       return __subn_get_opa_psi(smp, am, data, ibdev, port, resp_len);
+}
+
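+/*
+ * Return CableInfo data.  The requested range must lie below 4096 bytes
+ * and stay within a single 128-byte "page"; reads past the memory actually
+ * present on the QSFP cable still succeed but return zeroed data.
+ */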
+static int __subn_get_opa_cable_info(struct opa_smp *smp, u32 am, u8 *data,
+                                    struct ib_device *ibdev, u8 port,
+                                    u32 *resp_len)
+{
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       u32 addr = OPA_AM_CI_ADDR(am);
+       u32 len = OPA_AM_CI_LEN(am) + 1;
+       int ret;
+
+#define __CI_PAGE_SIZE (1 << 7) /* 128 bytes */
+#define __CI_PAGE_MASK ~(__CI_PAGE_SIZE - 1)
+#define __CI_PAGE_NUM(a) ((a) & __CI_PAGE_MASK)
+
+       /*
+        * Check that addr is within spec, and that addr and
+        * (addr + len - 1) are on the same "page".
+        */
+       if (addr >= 4096 ||
+               (__CI_PAGE_NUM(addr) != __CI_PAGE_NUM(addr + len - 1))) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       ret = get_cable_info(dd, port, addr, len, data);
+
+       if (ret == -ENODEV) {
+               smp->status |= IB_SMP_UNSUP_METH_ATTR;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       /* The address range for the CableInfo SMA query is wider than the
+        * memory available on the QSFP cable. We want to return a valid
+        * response, albeit zeroed out, for address ranges beyond available
+        * memory but that are within the CableInfo query spec.
+        */
+       if (ret < 0 && ret != -ERANGE) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       if (resp_len)
+               *resp_len += len;
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_get_opa_bct(struct opa_smp *smp, u32 am, u8 *data,
+                             struct ib_device *ibdev, u8 port, u32 *resp_len)
+{
+       u32 num_ports = OPA_AM_NPORT(am);
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       struct hfi1_pportdata *ppd;
+       struct buffer_control *p = (struct buffer_control *) data;
+       int size;
+
+       if (num_ports != 1) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       ppd = dd->pport + (port - 1);
+       size = fm_get_table(ppd, FM_TBL_BUFFER_CONTROL, p);
+       trace_bct_get(dd, p);
+       if (resp_len)
+               *resp_len += size;
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_set_opa_bct(struct opa_smp *smp, u32 am, u8 *data,
+                             struct ib_device *ibdev, u8 port, u32 *resp_len)
+{
+       u32 num_ports = OPA_AM_NPORT(am);
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       struct hfi1_pportdata *ppd;
+       struct buffer_control *p = (struct buffer_control *) data;
+
+       if (num_ports != 1) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+       ppd = dd->pport + (port - 1);
+       trace_bct_set(dd, p);
+       if (fm_set_table(ppd, FM_TBL_BUFFER_CONTROL, p) < 0) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       return __subn_get_opa_bct(smp, am, data, ibdev, port, resp_len);
+}
+
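+/*
+ * Get one section of the VL arbitration tables; the section is encoded in
+ * bits 16-23 of the attribute modifier.
+ */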
+static int __subn_get_opa_vl_arb(struct opa_smp *smp, u32 am, u8 *data,
+                                struct ib_device *ibdev, u8 port,
+                                u32 *resp_len)
+{
+       struct hfi1_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port));
+       u32 num_ports = OPA_AM_NPORT(am);
+       u8 section = (am & 0x00ff0000) >> 16;
+       u8 *p = data;
+       int size = 0;
+
+       if (num_ports != 1) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       switch (section) {
+       case OPA_VLARB_LOW_ELEMENTS:
+               size = fm_get_table(ppd, FM_TBL_VL_LOW_ARB, p);
+               break;
+       case OPA_VLARB_HIGH_ELEMENTS:
+               size = fm_get_table(ppd, FM_TBL_VL_HIGH_ARB, p);
+               break;
+       case OPA_VLARB_PREEMPT_ELEMENTS:
+               size = fm_get_table(ppd, FM_TBL_VL_PREEMPT_ELEMS, p);
+               break;
+       case OPA_VLARB_PREEMPT_MATRIX:
+               size = fm_get_table(ppd, FM_TBL_VL_PREEMPT_MATRIX, p);
+               break;
+       default:
+               pr_warn("OPA SubnGet(VL Arb) AM Invalid : 0x%x\n",
+                       be32_to_cpu(smp->attr_mod));
+               smp->status |= IB_SMP_INVALID_FIELD;
+               break;
+       }
+
+       if (size > 0 && resp_len)
+               *resp_len += size;
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_set_opa_vl_arb(struct opa_smp *smp, u32 am, u8 *data,
+                                struct ib_device *ibdev, u8 port,
+                                u32 *resp_len)
+{
+       struct hfi1_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port));
+       u32 num_ports = OPA_AM_NPORT(am);
+       u8 section = (am & 0x00ff0000) >> 16;
+       u8 *p = data;
+
+       if (num_ports != 1) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       switch (section) {
+       case OPA_VLARB_LOW_ELEMENTS:
+               (void) fm_set_table(ppd, FM_TBL_VL_LOW_ARB, p);
+               break;
+       case OPA_VLARB_HIGH_ELEMENTS:
+               (void) fm_set_table(ppd, FM_TBL_VL_HIGH_ARB, p);
+               break;
+       /*
+        * Neither OPA_VLARB_PREEMPT_ELEMENTS nor OPA_VLARB_PREEMPT_MATRIX
+        * can be changed from the default values.
+        */
+       case OPA_VLARB_PREEMPT_ELEMENTS:
+               /* FALLTHROUGH */
+       case OPA_VLARB_PREEMPT_MATRIX:
+               smp->status |= IB_SMP_UNSUP_METH_ATTR;
+               break;
+       default:
+               pr_warn("OPA SubnSet(VL Arb) AM Invalid : 0x%x\n",
+                       be32_to_cpu(smp->attr_mod));
+               smp->status |= IB_SMP_INVALID_FIELD;
+               break;
+       }
+
+       return __subn_get_opa_vl_arb(smp, am, data, ibdev, port, resp_len);
+}
+
+struct opa_pma_mad {
+       struct ib_mad_hdr mad_hdr;
+       u8 data[2024];
+} __packed;
+
+struct opa_class_port_info {
+       u8 base_version;
+       u8 class_version;
+       __be16 cap_mask;
+       __be32 cap_mask2_resp_time;
+
+       u8 redirect_gid[16];
+       __be32 redirect_tc_fl;
+       __be32 redirect_lid;
+       __be32 redirect_sl_qp;
+       __be32 redirect_qkey;
+
+       u8 trap_gid[16];
+       __be32 trap_tc_fl;
+       __be32 trap_lid;
+       __be32 trap_hl_qp;
+       __be32 trap_qkey;
+
+       __be16 trap_pkey;
+       __be16 redirect_pkey;
+
+       u8 trap_sl_rsvd;
+       u8 reserved[3];
+} __packed;
+
+struct opa_port_status_req {
+       __u8 port_num;
+       __u8 reserved[3];
+       __be32 vl_select_mask;
+};
+
+#define VL_MASK_ALL            0x000080ff
+
+struct opa_port_status_rsp {
+       __u8 port_num;
+       __u8 reserved[3];
+       __be32  vl_select_mask;
+
+       /* Data counters */
+       __be64 port_xmit_data;
+       __be64 port_rcv_data;
+       __be64 port_xmit_pkts;
+       __be64 port_rcv_pkts;
+       __be64 port_multicast_xmit_pkts;
+       __be64 port_multicast_rcv_pkts;
+       __be64 port_xmit_wait;
+       __be64 sw_port_congestion;
+       __be64 port_rcv_fecn;
+       __be64 port_rcv_becn;
+       __be64 port_xmit_time_cong;
+       __be64 port_xmit_wasted_bw;
+       __be64 port_xmit_wait_data;
+       __be64 port_rcv_bubble;
+       __be64 port_mark_fecn;
+       /* Error counters */
+       __be64 port_rcv_constraint_errors;
+       __be64 port_rcv_switch_relay_errors;
+       __be64 port_xmit_discards;
+       __be64 port_xmit_constraint_errors;
+       __be64 port_rcv_remote_physical_errors;
+       __be64 local_link_integrity_errors;
+       __be64 port_rcv_errors;
+       __be64 excessive_buffer_overruns;
+       __be64 fm_config_errors;
+       __be32 link_error_recovery;
+       __be32 link_downed;
+       u8 uncorrectable_errors;
+
+       u8 link_quality_indicator; /* 5res, 3bit */
+       u8 res2[6];
+       struct _vls_pctrs {
+               /* per-VL Data counters */
+               __be64 port_vl_xmit_data;
+               __be64 port_vl_rcv_data;
+               __be64 port_vl_xmit_pkts;
+               __be64 port_vl_rcv_pkts;
+               __be64 port_vl_xmit_wait;
+               __be64 sw_port_vl_congestion;
+               __be64 port_vl_rcv_fecn;
+               __be64 port_vl_rcv_becn;
+               __be64 port_xmit_time_cong;
+               __be64 port_vl_xmit_wasted_bw;
+               __be64 port_vl_xmit_wait_data;
+               __be64 port_vl_rcv_bubble;
+               __be64 port_vl_mark_fecn;
+               __be64 port_vl_xmit_discards;
+       } vls[0]; /* real array size defined by # bits set in vl_select_mask */
+};
+
+enum counter_selects {
+       CS_PORT_XMIT_DATA                       = (1 << 31),
+       CS_PORT_RCV_DATA                        = (1 << 30),
+       CS_PORT_XMIT_PKTS                       = (1 << 29),
+       CS_PORT_RCV_PKTS                        = (1 << 28),
+       CS_PORT_MCAST_XMIT_PKTS                 = (1 << 27),
+       CS_PORT_MCAST_RCV_PKTS                  = (1 << 26),
+       CS_PORT_XMIT_WAIT                       = (1 << 25),
+       CS_SW_PORT_CONGESTION                   = (1 << 24),
+       CS_PORT_RCV_FECN                        = (1 << 23),
+       CS_PORT_RCV_BECN                        = (1 << 22),
+       CS_PORT_XMIT_TIME_CONG                  = (1 << 21),
+       CS_PORT_XMIT_WASTED_BW                  = (1 << 20),
+       CS_PORT_XMIT_WAIT_DATA                  = (1 << 19),
+       CS_PORT_RCV_BUBBLE                      = (1 << 18),
+       CS_PORT_MARK_FECN                       = (1 << 17),
+       CS_PORT_RCV_CONSTRAINT_ERRORS           = (1 << 16),
+       CS_PORT_RCV_SWITCH_RELAY_ERRORS         = (1 << 15),
+       CS_PORT_XMIT_DISCARDS                   = (1 << 14),
+       CS_PORT_XMIT_CONSTRAINT_ERRORS          = (1 << 13),
+       CS_PORT_RCV_REMOTE_PHYSICAL_ERRORS      = (1 << 12),
+       CS_LOCAL_LINK_INTEGRITY_ERRORS          = (1 << 11),
+       CS_PORT_RCV_ERRORS                      = (1 << 10),
+       CS_EXCESSIVE_BUFFER_OVERRUNS            = (1 << 9),
+       CS_FM_CONFIG_ERRORS                     = (1 << 8),
+       CS_LINK_ERROR_RECOVERY                  = (1 << 7),
+       CS_LINK_DOWNED                          = (1 << 6),
+       CS_UNCORRECTABLE_ERRORS                 = (1 << 5),
+};
+
+struct opa_clear_port_status {
+       __be64 port_select_mask[4];
+       __be32 counter_select_mask;
+};
+
+struct opa_aggregate {
+       __be16 attr_id;
+       __be16 err_reqlength;   /* 1 bit, 8 res, 7 bit */
+       __be32 attr_mod;
+       u8 data[0];
+};
+
+/* Request contains first two fields, response contains those plus the rest */
+struct opa_port_data_counters_msg {
+       __be64 port_select_mask[4];
+       __be32 vl_select_mask;
+
+       /* Response fields follow */
+       __be32 reserved1;
+       struct _port_dctrs {
+               u8 port_number;
+               u8 reserved2[3];
+               __be32 link_quality_indicator; /* 29res, 3bit */
+
+               /* Data counters */
+               __be64 port_xmit_data;
+               __be64 port_rcv_data;
+               __be64 port_xmit_pkts;
+               __be64 port_rcv_pkts;
+               __be64 port_multicast_xmit_pkts;
+               __be64 port_multicast_rcv_pkts;
+               __be64 port_xmit_wait;
+               __be64 sw_port_congestion;
+               __be64 port_rcv_fecn;
+               __be64 port_rcv_becn;
+               __be64 port_xmit_time_cong;
+               __be64 port_xmit_wasted_bw;
+               __be64 port_xmit_wait_data;
+               __be64 port_rcv_bubble;
+               __be64 port_mark_fecn;
+
+               __be64 port_error_counter_summary;
+               /* Sum of error counts/port */
+
+               struct _vls_dctrs {
+                       /* per-VL Data counters */
+                       __be64 port_vl_xmit_data;
+                       __be64 port_vl_rcv_data;
+                       __be64 port_vl_xmit_pkts;
+                       __be64 port_vl_rcv_pkts;
+                       __be64 port_vl_xmit_wait;
+                       __be64 sw_port_vl_congestion;
+                       __be64 port_vl_rcv_fecn;
+                       __be64 port_vl_rcv_becn;
+                       __be64 port_xmit_time_cong;
+                       __be64 port_vl_xmit_wasted_bw;
+                       __be64 port_vl_xmit_wait_data;
+                       __be64 port_vl_rcv_bubble;
+                       __be64 port_vl_mark_fecn;
+               } vls[0];
+               /* array size defined by #bits set in vl_select_mask*/
+       } port[1]; /* array size defined by  #ports in attribute modifier */
+};
+
+struct opa_port_error_counters64_msg {
+       /*
+        * Request contains the first two fields, response contains
+        * the entire structure.
+        */
+       __be64 port_select_mask[4];
+       __be32 vl_select_mask;
+
+       /* Response-only fields follow */
+       __be32 reserved1;
+       struct _port_ectrs {
+               u8 port_number;
+               u8 reserved2[7];
+               __be64 port_rcv_constraint_errors;
+               __be64 port_rcv_switch_relay_errors;
+               __be64 port_xmit_discards;
+               __be64 port_xmit_constraint_errors;
+               __be64 port_rcv_remote_physical_errors;
+               __be64 local_link_integrity_errors;
+               __be64 port_rcv_errors;
+               __be64 excessive_buffer_overruns;
+               __be64 fm_config_errors;
+               __be32 link_error_recovery;
+               __be32 link_downed;
+               u8 uncorrectable_errors;
+               u8 reserved3[7];
+               struct _vls_ectrs {
+                       __be64 port_vl_xmit_discards;
+               } vls[0];
+               /* array size defined by #bits set in vl_select_mask */
+       } port[1]; /* array size defined by #ports in attribute modifier */
+};
+
+struct opa_port_error_info_msg {
+       __be64 port_select_mask[4];
+       __be32 error_info_select_mask;
+       __be32 reserved1;
+       struct _port_ei {
+
+               u8 port_number;
+               u8 reserved2[7];
+
+               /* PortRcvErrorInfo */
+               struct {
+                       u8 status_and_code;
+                       union {
+                               u8 raw[17];
+                               struct {
+                                       /* EI1to12 format */
+                                       u8 packet_flit1[8];
+                                       u8 packet_flit2[8];
+                                       u8 remaining_flit_bits12;
+                               } ei1to12;
+                               struct {
+                                       u8 packet_bytes[8];
+                                       u8 remaining_flit_bits;
+                               } ei13;
+                       } ei;
+                       u8 reserved3[6];
+               } __packed port_rcv_ei;
+
+               /* ExcessiveBufferOverrunInfo */
+               struct {
+                       u8 status_and_sc;
+                       u8 reserved4[7];
+               } __packed excessive_buffer_overrun_ei;
+
+               /* PortXmitConstraintErrorInfo */
+               struct {
+                       u8 status;
+                       u8 reserved5;
+                       __be16 pkey;
+                       __be32 slid;
+               } __packed port_xmit_constraint_ei;
+
+               /* PortRcvConstraintErrorInfo */
+               struct {
+                       u8 status;
+                       u8 reserved6;
+                       __be16 pkey;
+                       __be32 slid;
+               } __packed port_rcv_constraint_ei;
+
+               /* PortRcvSwitchRelayErrorInfo */
+               struct {
+                       u8 status_and_code;
+                       u8 reserved7[3];
+                       __u32 error_info;
+               } __packed port_rcv_switch_relay_ei;
+
+               /* UncorrectableErrorInfo */
+               struct {
+                       u8 status_and_code;
+                       u8 reserved8;
+               } __packed uncorrectable_ei;
+
+               /* FMConfigErrorInfo */
+               struct {
+                       u8 status_and_code;
+                       u8 error_info;
+               } __packed fm_config_ei;
+               __u32 reserved9;
+       } port[1]; /* actual array size defined by #ports in attr modifier */
+};
+
+/* opa_port_error_info_msg error_info_select_mask bit definitions */
+enum error_info_selects {
+       ES_PORT_RCV_ERROR_INFO                  = (1 << 31),
+       ES_EXCESSIVE_BUFFER_OVERRUN_INFO        = (1 << 30),
+       ES_PORT_XMIT_CONSTRAINT_ERROR_INFO      = (1 << 29),
+       ES_PORT_RCV_CONSTRAINT_ERROR_INFO       = (1 << 28),
+       ES_PORT_RCV_SWITCH_RELAY_ERROR_INFO     = (1 << 27),
+       ES_UNCORRECTABLE_ERROR_INFO             = (1 << 26),
+       ES_FM_CONFIG_ERROR_INFO                 = (1 << 25)
+};
+
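+/*
+ * PMA ClassPortInfo: report the supported base/class versions and the
+ * expected response time (4.096 usec * 2^18).
+ */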
+static int pma_get_opa_classportinfo(struct opa_pma_mad *pmp,
+                               struct ib_device *ibdev, u32 *resp_len)
+{
+       struct opa_class_port_info *p =
+               (struct opa_class_port_info *)pmp->data;
+
+       memset(pmp->data, 0, sizeof(pmp->data));
+
+       if (pmp->mad_hdr.attr_mod != 0)
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+
+       p->base_version = OPA_MGMT_BASE_VERSION;
+       p->class_version = OPA_SMI_CLASS_VERSION;
+       /*
+        * Expected response time is 4.096 usec. * 2^18 == 1.073741824 sec.
+        */
+       p->cap_mask2_resp_time = cpu_to_be32(18);
+
+       if (resp_len)
+               *resp_len += sizeof(*p);
+
+       return reply((struct ib_mad_hdr *)pmp);
+}
+
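+/*
+ * Adjust PortStatus counters on non-Bx (A0) parts: remove one false
+ * rcv_bubble increment per 32 received flits and report the largest
+ * per-VL xmit wait value as the port xmit wait.
+ */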
+static void a0_portstatus(struct hfi1_pportdata *ppd,
+                         struct opa_port_status_rsp *rsp, u32 vl_select_mask)
+{
+       if (!is_bx(ppd->dd)) {
+               unsigned long vl;
+               int vfi = 0;
+               u64 max_vl_xmit_wait = 0, tmp;
+               u32 vl_all_mask = VL_MASK_ALL;
+               u64 rcv_data, rcv_bubble;
+
+               rcv_data = be64_to_cpu(rsp->port_rcv_data);
+               rcv_bubble = be64_to_cpu(rsp->port_rcv_bubble);
+               /* In the measured time period, calculate the total number
+                * of flits that were received. Subtract out one false
+                * rcv_bubble increment for every 32 received flits but
+                * don't let the number go negative.
+                */
+               if (rcv_bubble >= (rcv_data>>5)) {
+                       rcv_bubble -= (rcv_data>>5);
+                       rsp->port_rcv_bubble = cpu_to_be64(rcv_bubble);
+               }
+               for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
+                                8 * sizeof(vl_select_mask)) {
+                       rcv_data = be64_to_cpu(rsp->vls[vfi].port_vl_rcv_data);
+                       rcv_bubble =
+                               be64_to_cpu(rsp->vls[vfi].port_vl_rcv_bubble);
+                       if (rcv_bubble >= (rcv_data>>5)) {
+                               rcv_bubble -= (rcv_data>>5);
+                               rsp->vls[vfi].port_vl_rcv_bubble =
+                                                       cpu_to_be64(rcv_bubble);
+                       }
+                       vfi++;
+               }
+
+               for_each_set_bit(vl, (unsigned long *)&(vl_all_mask),
+                                8 * sizeof(vl_all_mask)) {
+                       tmp = read_port_cntr(ppd, C_TX_WAIT_VL,
+                                            idx_from_vl(vl));
+                       if (tmp > max_vl_xmit_wait)
+                               max_vl_xmit_wait = tmp;
+               }
+               rsp->port_xmit_wait = cpu_to_be64(max_vl_xmit_wait);
+       }
+}
+
+
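+/*
+ * PMA PortStatus: build an opa_port_status_rsp from the device and port
+ * counters, followed by one _vls_pctrs block per VL selected in
+ * vl_select_mask.  The request is rejected if the response would not fit
+ * in the MAD data area.
+ */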
+static int pma_get_opa_portstatus(struct opa_pma_mad *pmp,
+                       struct ib_device *ibdev, u8 port, u32 *resp_len)
+{
+       struct opa_port_status_req *req =
+               (struct opa_port_status_req *)pmp->data;
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       struct opa_port_status_rsp *rsp;
+       u32 vl_select_mask = be32_to_cpu(req->vl_select_mask);
+       unsigned long vl;
+       size_t response_data_size;
+       u32 nports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24;
+       u8 port_num = req->port_num;
+       u8 num_vls = hweight32(vl_select_mask);
+       struct _vls_pctrs *vlinfo;
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       int vfi;
+       u64 tmp, tmp2;
+
+       response_data_size = sizeof(struct opa_port_status_rsp) +
+                               num_vls * sizeof(struct _vls_pctrs);
+       if (response_data_size > sizeof(pmp->data)) {
+               pmp->mad_hdr.status |= OPA_PM_STATUS_REQUEST_TOO_LARGE;
+               return reply((struct ib_mad_hdr *)pmp);
+       }
+
+       if (nports != 1 || (port_num && port_num != port) ||
+           num_vls > OPA_MAX_VLS || (vl_select_mask & ~VL_MASK_ALL)) {
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)pmp);
+       }
+
+       memset(pmp->data, 0, sizeof(pmp->data));
+
+       rsp = (struct opa_port_status_rsp *)pmp->data;
+       if (port_num)
+               rsp->port_num = port_num;
+       else
+               rsp->port_num = port;
+
+       rsp->port_rcv_constraint_errors =
+               cpu_to_be64(read_port_cntr(ppd, C_SW_RCV_CSTR_ERR,
+                                          CNTR_INVALID_VL));
+
+       hfi1_read_link_quality(dd, &rsp->link_quality_indicator);
+
+       rsp->vl_select_mask = cpu_to_be32(vl_select_mask);
+       rsp->port_xmit_data = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_FLITS,
+                                         CNTR_INVALID_VL));
+       rsp->port_rcv_data = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FLITS,
+                                        CNTR_INVALID_VL));
+       rsp->port_rcv_bubble =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BBL, CNTR_INVALID_VL));
+       rsp->port_xmit_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_PKTS,
+                                         CNTR_INVALID_VL));
+       rsp->port_rcv_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_PKTS,
+                                        CNTR_INVALID_VL));
+       rsp->port_multicast_xmit_pkts =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_MC_XMIT_PKTS,
+                                       CNTR_INVALID_VL));
+       rsp->port_multicast_rcv_pkts =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_MC_RCV_PKTS,
+                                         CNTR_INVALID_VL));
+       rsp->port_xmit_wait =
+               cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT, CNTR_INVALID_VL));
+       rsp->port_rcv_fecn =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN, CNTR_INVALID_VL));
+       rsp->port_rcv_becn =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN, CNTR_INVALID_VL));
+       rsp->port_xmit_discards =
+               cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_DSCD,
+                                          CNTR_INVALID_VL));
+       rsp->port_xmit_constraint_errors =
+               cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_CSTR_ERR,
+                                          CNTR_INVALID_VL));
+       rsp->port_rcv_remote_physical_errors =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_RMT_PHY_ERR,
+                                         CNTR_INVALID_VL));
+       tmp = read_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL);
+       tmp2 = tmp + read_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL);
+       if (tmp2 < tmp) {
+               /* overflow/wrapped */
+               rsp->local_link_integrity_errors = cpu_to_be64(~0);
+       } else {
+               rsp->local_link_integrity_errors = cpu_to_be64(tmp2);
+       }
+       tmp = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL);
+       tmp2 = tmp + read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT,
+                                       CNTR_INVALID_VL);
+       if (tmp2 > (u32)UINT_MAX || tmp2 < tmp) {
+               /* overflow/wrapped */
+               rsp->link_error_recovery = cpu_to_be32(~0);
+       } else {
+               rsp->link_error_recovery = cpu_to_be32(tmp2);
+       }
+       rsp->port_rcv_errors =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL));
+       rsp->excessive_buffer_overruns =
+               cpu_to_be64(read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL));
+       rsp->fm_config_errors =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_FM_CFG_ERR,
+                                         CNTR_INVALID_VL));
+       rsp->link_downed = cpu_to_be32(read_port_cntr(ppd, C_SW_LINK_DOWN,
+                                         CNTR_INVALID_VL));
+
+       /* rsp->uncorrectable_errors is 8 bits wide, and it pegs at 0xff */
+       tmp = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL);
+       rsp->uncorrectable_errors = tmp < 0x100 ? (tmp & 0xff) : 0xff;
+
+       vlinfo = &(rsp->vls[0]);
+       vfi = 0;
+       /* The vl_select_mask has been checked above, and we know
+        * that it contains only entries which represent valid VLs.
+        * So in the for_each_set_bit() loop below, we don't need
+        * any additional checks for vl.
+        */
+       for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
+                        8 * sizeof(vl_select_mask)) {
+               memset(vlinfo, 0, sizeof(*vlinfo));
+
+               tmp = read_dev_cntr(dd, C_DC_RX_FLIT_VL, idx_from_vl(vl));
+               rsp->vls[vfi].port_vl_rcv_data = cpu_to_be64(tmp);
+               rsp->vls[vfi].port_vl_rcv_bubble =
+                       cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BBL_VL,
+                                       idx_from_vl(vl)));
+
+               rsp->vls[vfi].port_vl_rcv_pkts =
+                       cpu_to_be64(read_dev_cntr(dd, C_DC_RX_PKT_VL,
+                                       idx_from_vl(vl)));
+
+               rsp->vls[vfi].port_vl_xmit_data =
+                       cpu_to_be64(read_port_cntr(ppd, C_TX_FLIT_VL,
+                                       idx_from_vl(vl)));
+
+               rsp->vls[vfi].port_vl_xmit_pkts =
+                       cpu_to_be64(read_port_cntr(ppd, C_TX_PKT_VL,
+                                       idx_from_vl(vl)));
+
+               rsp->vls[vfi].port_vl_xmit_wait =
+                       cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT_VL,
+                                       idx_from_vl(vl)));
+
+               rsp->vls[vfi].port_vl_rcv_fecn =
+                       cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN_VL,
+                                       idx_from_vl(vl)));
+
+               rsp->vls[vfi].port_vl_rcv_becn =
+                       cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN_VL,
+                                       idx_from_vl(vl)));
+
+               vlinfo++;
+               vfi++;
+       }
+
+       a0_portstatus(ppd, rsp, vl_select_mask);
+
+       if (resp_len)
+               *resp_len += response_data_size;
+
+       return reply((struct ib_mad_hdr *)pmp);
+}
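
The replay and CRC counter sums in pma_get_opa_portstatus() use a simple wrap check: if the 64-bit sum comes out smaller than one of its addends, the addition overflowed and the reported value saturates to all-ones. A minimal standalone sketch of that pattern (the helper name is ours, not the driver's):

    #include <stdint.h>

    /* Saturating 64-bit add: if a + b wraps, report the maximum value
     * instead of the wrapped result. */
    static uint64_t sat_add_u64(uint64_t a, uint64_t b)
    {
            uint64_t sum = a + b;

            return (sum < a) ? UINT64_MAX : sum;
    }
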
+
+static u64 get_error_counter_summary(struct ib_device *ibdev, u8 port)
+{
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       u64 error_counter_summary = 0, tmp;
+
+       error_counter_summary += read_port_cntr(ppd, C_SW_RCV_CSTR_ERR,
+                                               CNTR_INVALID_VL);
+       /* port_rcv_switch_relay_errors is 0 for HFIs */
+       error_counter_summary += read_port_cntr(ppd, C_SW_XMIT_DSCD,
+                                               CNTR_INVALID_VL);
+       error_counter_summary += read_port_cntr(ppd, C_SW_XMIT_CSTR_ERR,
+                                               CNTR_INVALID_VL);
+       error_counter_summary += read_dev_cntr(dd, C_DC_RMT_PHY_ERR,
+                                               CNTR_INVALID_VL);
+       error_counter_summary += read_dev_cntr(dd, C_DC_TX_REPLAY,
+                                               CNTR_INVALID_VL);
+       error_counter_summary += read_dev_cntr(dd, C_DC_RX_REPLAY,
+                                               CNTR_INVALID_VL);
+       error_counter_summary += read_dev_cntr(dd, C_DC_SEQ_CRC_CNT,
+                                               CNTR_INVALID_VL);
+       error_counter_summary += read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT,
+                                               CNTR_INVALID_VL);
+       error_counter_summary += read_dev_cntr(dd, C_DC_RCV_ERR,
+                                               CNTR_INVALID_VL);
+       error_counter_summary += read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL);
+       error_counter_summary += read_dev_cntr(dd, C_DC_FM_CFG_ERR,
+                                               CNTR_INVALID_VL);
+       /* ppd->link_downed is a 32-bit value */
+       error_counter_summary += read_port_cntr(ppd, C_SW_LINK_DOWN,
+                                               CNTR_INVALID_VL);
+       tmp = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL);
+       /* this is an 8-bit quantity */
+       error_counter_summary += tmp < 0x100 ? (tmp & 0xff) : 0xff;
+
+       return error_counter_summary;
+}
+
+static void a0_datacounters(struct hfi1_devdata *dd, struct _port_dctrs *rsp,
+                           u32 vl_select_mask)
+{
+       if (!is_bx(dd)) {
+               unsigned long vl;
+               int vfi = 0;
+               u64 rcv_data, rcv_bubble, sum_vl_xmit_wait = 0;
+
+               rcv_data = be64_to_cpu(rsp->port_rcv_data);
+               rcv_bubble = be64_to_cpu(rsp->port_rcv_bubble);
+               /* In the measured time period, calculate the total number
+                * of flits that were received. Subtract out one false
+                * rcv_bubble increment for every 32 received flits but
+                * don't let the number go negative.
+                */
+               if (rcv_bubble >= (rcv_data >> 5)) {
+                       rcv_bubble -= (rcv_data >> 5);
+                       rsp->port_rcv_bubble = cpu_to_be64(rcv_bubble);
+               }
+               for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
+                               8 * sizeof(vl_select_mask)) {
+                       rcv_data = be64_to_cpu(rsp->vls[vfi].port_vl_rcv_data);
+                       rcv_bubble =
+                               be64_to_cpu(rsp->vls[vfi].port_vl_rcv_bubble);
+                       if (rcv_bubble >= (rcv_data >> 5)) {
+                               rcv_bubble -= (rcv_data >> 5);
+                               rsp->vls[vfi].port_vl_rcv_bubble =
+                                                       cpu_to_be64(rcv_bubble);
+                       }
+                       vfi++;
+               }
+               vfi = 0;
+               for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
+                               8 * sizeof(vl_select_mask)) {
+                       u64 tmp = sum_vl_xmit_wait +
+                               be64_to_cpu(rsp->vls[vfi++].port_vl_xmit_wait);
+                       if (tmp < sum_vl_xmit_wait) {
+                               /* we wrapped */
+                               sum_vl_xmit_wait = (u64) ~0;
+                               break;
+                       }
+                       sum_vl_xmit_wait = tmp;
+               }
+               if (be64_to_cpu(rsp->port_xmit_wait) > sum_vl_xmit_wait)
+                       rsp->port_xmit_wait = cpu_to_be64(sum_vl_xmit_wait);
+       }
+}
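
Both a0_portstatus() and a0_datacounters() apply the same workaround on the !is_bx() path: the hardware records one spurious bubble per 32 received flits, so rcv_data >> 5 is subtracted from the bubble count, clamped so it never goes negative. A small worked sketch with made-up numbers:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t rcv_data = 6400;    /* flits received in the window (example) */
            uint64_t rcv_bubble = 250;   /* raw bubble count from hardware (example) */

            /* one false bubble is counted for every 32 flits received */
            uint64_t false_bubbles = rcv_data >> 5;   /* 6400 / 32 = 200 */

            if (rcv_bubble >= false_bubbles)
                    rcv_bubble -= false_bubbles;      /* 250 - 200 = 50 */

            printf("corrected bubble count: %llu\n",
                   (unsigned long long)rcv_bubble);
            return 0;
    }
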
+
+static int pma_get_opa_datacounters(struct opa_pma_mad *pmp,
+                       struct ib_device *ibdev, u8 port, u32 *resp_len)
+{
+       struct opa_port_data_counters_msg *req =
+               (struct opa_port_data_counters_msg *)pmp->data;
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       struct _port_dctrs *rsp;
+       struct _vls_dctrs *vlinfo;
+       size_t response_data_size;
+       u32 num_ports;
+       u8 num_pslm;
+       u8 lq, num_vls;
+       u64 port_mask;
+       unsigned long port_num;
+       unsigned long vl;
+       u32 vl_select_mask;
+       int vfi;
+
+       num_ports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24;
+       num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3]));
+       num_vls = hweight32(be32_to_cpu(req->vl_select_mask));
+       vl_select_mask = be32_to_cpu(req->vl_select_mask);
+
+       if (num_ports != 1 || (vl_select_mask & ~VL_MASK_ALL)) {
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)pmp);
+       }
+
+       /* Sanity check */
+       response_data_size = sizeof(struct opa_port_data_counters_msg) +
+                               num_vls * sizeof(struct _vls_dctrs);
+
+       if (response_data_size > sizeof(pmp->data)) {
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)pmp);
+       }
+
+       /*
+        * The bit set in the mask needs to be consistent with the
+        * port the request came in on.
+        */
+       port_mask = be64_to_cpu(req->port_select_mask[3]);
+       port_num = find_first_bit((unsigned long *)&port_mask,
+                                 sizeof(port_mask) * 8);
+
+       if ((u8)port_num != port) {
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)pmp);
+       }
+
+       rsp = (struct _port_dctrs *)&(req->port[0]);
+       memset(rsp, 0, sizeof(*rsp));
+
+       rsp->port_number = port;
+       /*
+        * Note that link_quality_indicator is a 32 bit quantity in
+        * 'datacounters' queries (as opposed to 'portinfo' queries,
+        * where it's a byte).
+        */
+       hfi1_read_link_quality(dd, &lq);
+       rsp->link_quality_indicator = cpu_to_be32((u32)lq);
+
+       /* rsp->sw_port_congestion is 0 for HFIs */
+       /* rsp->port_xmit_time_cong is 0 for HFIs */
+       /* rsp->port_xmit_wasted_bw ??? */
+       /* rsp->port_xmit_wait_data ??? */
+       /* rsp->port_mark_fecn is 0 for HFIs */
+
+       rsp->port_xmit_data = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_FLITS,
+                                               CNTR_INVALID_VL));
+       rsp->port_rcv_data = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FLITS,
+                                               CNTR_INVALID_VL));
+       rsp->port_rcv_bubble =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BBL, CNTR_INVALID_VL));
+       rsp->port_xmit_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_PKTS,
+                                               CNTR_INVALID_VL));
+       rsp->port_rcv_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_PKTS,
+                                               CNTR_INVALID_VL));
+       rsp->port_multicast_xmit_pkts =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_MC_XMIT_PKTS,
+                                               CNTR_INVALID_VL));
+       rsp->port_multicast_rcv_pkts =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_MC_RCV_PKTS,
+                                               CNTR_INVALID_VL));
+       rsp->port_xmit_wait =
+               cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT, CNTR_INVALID_VL));
+       rsp->port_rcv_fecn =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN, CNTR_INVALID_VL));
+       rsp->port_rcv_becn =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN, CNTR_INVALID_VL));
+
+       rsp->port_error_counter_summary =
+               cpu_to_be64(get_error_counter_summary(ibdev, port));
+
+       vlinfo = &(rsp->vls[0]);
+       vfi = 0;
+       /* The vl_select_mask has been checked above, and we know
+        * that it contains only entries which represent valid VLs.
+        * So in the for_each_set_bit() loop below, we don't need
+        * any additional checks for vl.
+        */
+       for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
+                8 * sizeof(req->vl_select_mask)) {
+               memset(vlinfo, 0, sizeof(*vlinfo));
+
+               rsp->vls[vfi].port_vl_xmit_data =
+                       cpu_to_be64(read_port_cntr(ppd, C_TX_FLIT_VL,
+                                                       idx_from_vl(vl)));
+
+               rsp->vls[vfi].port_vl_rcv_data =
+                       cpu_to_be64(read_dev_cntr(dd, C_DC_RX_FLIT_VL,
+                                                       idx_from_vl(vl)));
+               rsp->vls[vfi].port_vl_rcv_bubble =
+                       cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BBL_VL,
+                                       idx_from_vl(vl)));
+
+               rsp->vls[vfi].port_vl_xmit_pkts =
+                       cpu_to_be64(read_port_cntr(ppd, C_TX_PKT_VL,
+                                                       idx_from_vl(vl)));
+
+               rsp->vls[vfi].port_vl_rcv_pkts =
+                       cpu_to_be64(read_dev_cntr(dd, C_DC_RX_PKT_VL,
+                                                       idx_from_vl(vl)));
+
+               rsp->vls[vfi].port_vl_xmit_wait =
+                       cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT_VL,
+                                                       idx_from_vl(vl)));
+
+               rsp->vls[vfi].port_vl_rcv_fecn =
+                       cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN_VL,
+                                                       idx_from_vl(vl)));
+               rsp->vls[vfi].port_vl_rcv_becn =
+                       cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN_VL,
+                                                       idx_from_vl(vl)));
+
+               /* rsp->port_vl_xmit_time_cong is 0 for HFIs */
+               /* rsp->port_vl_xmit_wasted_bw ??? */
+               /* port_vl_xmit_wait_data - TXE (table 13-9 HFI spec) ???
+                * does this differ from rsp->vls[vfi].port_vl_xmit_wait */
+               /* rsp->vls[vfi].port_vl_mark_fecn =
+                *      cpu_to_be64(read_csr(dd, DCC_PRF_PORT_VL_MARK_FECN_CNT
+                *              + offset));
+                */
+               vlinfo++;
+               vfi++;
+       }
+
+       a0_datacounters(dd, rsp, vl_select_mask);
+
+       if (resp_len)
+               *resp_len += response_data_size;
+
+       return reply((struct ib_mad_hdr *)pmp);
+}
+
+static int pma_get_opa_porterrors(struct opa_pma_mad *pmp,
+                       struct ib_device *ibdev, u8 port, u32 *resp_len)
+{
+       size_t response_data_size;
+       struct _port_ectrs *rsp;
+       unsigned long port_num;
+       struct opa_port_error_counters64_msg *req;
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       u32 num_ports;
+       u8 num_pslm;
+       u8 num_vls;
+       struct hfi1_ibport *ibp;
+       struct hfi1_pportdata *ppd;
+       struct _vls_ectrs *vlinfo;
+       unsigned long vl;
+       u64 port_mask, tmp, tmp2;
+       u32 vl_select_mask;
+       int vfi;
+
+       req = (struct opa_port_error_counters64_msg *)pmp->data;
+
+       num_ports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24;
+
+       num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3]));
+       num_vls = hweight32(be32_to_cpu(req->vl_select_mask));
+
+       if (num_ports != 1 || num_ports != num_pslm) {
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)pmp);
+       }
+
+       response_data_size = sizeof(struct opa_port_error_counters64_msg) +
+                               num_vls * sizeof(struct _vls_ectrs);
+
+       if (response_data_size > sizeof(pmp->data)) {
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)pmp);
+       }
+       /*
+        * The bit set in the mask needs to be consistent with the
+        * port the request came in on.
+        */
+       port_mask = be64_to_cpu(req->port_select_mask[3]);
+       port_num = find_first_bit((unsigned long *)&port_mask,
+                                 sizeof(port_mask) * 8);
+
+       if ((u8)port_num != port) {
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)pmp);
+       }
+
+       rsp = (struct _port_ectrs *)&(req->port[0]);
+
+       ibp = to_iport(ibdev, port_num);
+       ppd = ppd_from_ibp(ibp);
+
+       memset(rsp, 0, sizeof(*rsp));
+       rsp->port_number = (u8)port_num;
+
+       rsp->port_rcv_constraint_errors =
+               cpu_to_be64(read_port_cntr(ppd, C_SW_RCV_CSTR_ERR,
+                                          CNTR_INVALID_VL));
+       /* port_rcv_switch_relay_errors is 0 for HFIs */
+       rsp->port_xmit_discards =
+               cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_DSCD,
+                                               CNTR_INVALID_VL));
+       rsp->port_rcv_remote_physical_errors =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_RMT_PHY_ERR,
+                                               CNTR_INVALID_VL));
+       tmp = read_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL);
+       tmp2 = tmp + read_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL);
+       if (tmp2 < tmp) {
+               /* overflow/wrapped */
+               rsp->local_link_integrity_errors = cpu_to_be64(~0);
+       } else {
+               rsp->local_link_integrity_errors = cpu_to_be64(tmp2);
+       }
+       tmp = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL);
+       tmp2 = tmp + read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT,
+                                       CNTR_INVALID_VL);
+       if (tmp2 > (u32)UINT_MAX || tmp2 < tmp) {
+               /* overflow/wrapped */
+               rsp->link_error_recovery = cpu_to_be32(~0);
+       } else {
+               rsp->link_error_recovery = cpu_to_be32(tmp2);
+       }
+       rsp->port_xmit_constraint_errors =
+               cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_CSTR_ERR,
+                                          CNTR_INVALID_VL));
+       rsp->excessive_buffer_overruns =
+               cpu_to_be64(read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL));
+       rsp->fm_config_errors =
+               cpu_to_be64(read_dev_cntr(dd, C_DC_FM_CFG_ERR,
+                                               CNTR_INVALID_VL));
+       rsp->link_downed = cpu_to_be32(read_port_cntr(ppd, C_SW_LINK_DOWN,
+                                               CNTR_INVALID_VL));
+       tmp = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL);
+       rsp->uncorrectable_errors = tmp < 0x100 ? (tmp & 0xff) : 0xff;
+
+       vlinfo = (struct _vls_ectrs *)&(rsp->vls[0]);
+       vfi = 0;
+       vl_select_mask = be32_to_cpu(req->vl_select_mask);
+       for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
+                        8 * sizeof(req->vl_select_mask)) {
+               memset(vlinfo, 0, sizeof(*vlinfo));
+               /* vlinfo->vls[vfi].port_vl_xmit_discards ??? */
+               vlinfo += 1;
+               vfi++;
+       }
+
+       if (resp_len)
+               *resp_len += response_data_size;
+
+       return reply((struct ib_mad_hdr *)pmp);
+}
+
+static int pma_get_opa_errorinfo(struct opa_pma_mad *pmp,
+                       struct ib_device *ibdev, u8 port, u32 *resp_len)
+{
+       size_t response_data_size;
+       struct _port_ei *rsp;
+       struct opa_port_error_info_msg *req;
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       u64 port_mask;
+       u32 num_ports;
+       unsigned long port_num;
+       u8 num_pslm;
+       u64 reg;
+
+       req = (struct opa_port_error_info_msg *)pmp->data;
+       rsp = (struct _port_ei *)&(req->port[0]);
+
+       num_ports = OPA_AM_NPORT(be32_to_cpu(pmp->mad_hdr.attr_mod));
+       num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3]));
+
+       memset(rsp, 0, sizeof(*rsp));
+
+       if (num_ports != 1 || num_ports != num_pslm) {
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)pmp);
+       }
+
+       /* Sanity check */
+       response_data_size = sizeof(struct opa_port_error_info_msg);
+
+       if (response_data_size > sizeof(pmp->data)) {
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)pmp);
+       }
+
+       /*
+        * The bit set in the mask needs to be consistent with the port
+        * the request came in on.
+        */
+       port_mask = be64_to_cpu(req->port_select_mask[3]);
+       port_num = find_first_bit((unsigned long *)&port_mask,
+                                 sizeof(port_mask) * 8);
+
+       if ((u8)port_num != port) {
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)pmp);
+       }
+
+       /* PortRcvErrorInfo */
+       rsp->port_rcv_ei.status_and_code =
+               dd->err_info_rcvport.status_and_code;
+       memcpy(&rsp->port_rcv_ei.ei.ei1to12.packet_flit1,
+               &dd->err_info_rcvport.packet_flit1, sizeof(u64));
+       memcpy(&rsp->port_rcv_ei.ei.ei1to12.packet_flit2,
+               &dd->err_info_rcvport.packet_flit2, sizeof(u64));
+
+       /* ExcessiveBufferOverrunInfo */
+       reg = read_csr(dd, RCV_ERR_INFO);
+       if (reg & RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK) {
+               /* if the RcvExcessBufferOverrun bit is set, save SC of
+                * first pkt that encountered an excess buffer overrun */
+               u8 tmp = (u8)reg;
+
+               tmp &= RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SC_SMASK;
+               tmp <<= 2;
+               rsp->excessive_buffer_overrun_ei.status_and_sc = tmp;
+               /* set the status bit */
+               rsp->excessive_buffer_overrun_ei.status_and_sc |= 0x80;
+       }
+
+       rsp->port_xmit_constraint_ei.status =
+               dd->err_info_xmit_constraint.status;
+       rsp->port_xmit_constraint_ei.pkey =
+               cpu_to_be16(dd->err_info_xmit_constraint.pkey);
+       rsp->port_xmit_constraint_ei.slid =
+               cpu_to_be32(dd->err_info_xmit_constraint.slid);
+
+       rsp->port_rcv_constraint_ei.status =
+               dd->err_info_rcv_constraint.status;
+       rsp->port_rcv_constraint_ei.pkey =
+               cpu_to_be16(dd->err_info_rcv_constraint.pkey);
+       rsp->port_rcv_constraint_ei.slid =
+               cpu_to_be32(dd->err_info_rcv_constraint.slid);
+
+       /* UncorrectableErrorInfo */
+       rsp->uncorrectable_ei.status_and_code = dd->err_info_uncorrectable;
+
+       /* FMConfigErrorInfo */
+       rsp->fm_config_ei.status_and_code = dd->err_info_fmconfig;
+
+       if (resp_len)
+               *resp_len += response_data_size;
+
+       return reply((struct ib_mad_hdr *)pmp);
+}
+
+static int pma_set_opa_portstatus(struct opa_pma_mad *pmp,
+                       struct ib_device *ibdev, u8 port, u32 *resp_len)
+{
+       struct opa_clear_port_status *req =
+               (struct opa_clear_port_status *)pmp->data;
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       u32 nports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24;
+       u64 portn = be64_to_cpu(req->port_select_mask[3]);
+       u32 counter_select = be32_to_cpu(req->counter_select_mask);
+       u32 vl_select_mask = VL_MASK_ALL; /* clear all per-vl cnts */
+       unsigned long vl;
+
+       if ((nports != 1) || (portn != 1 << port)) {
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)pmp);
+       }
+       /*
+        * only counters returned by pma_get_opa_portstatus() are
+        * handled, so when pma_get_opa_portstatus() gets a fix,
+        * the corresponding change should be made here as well.
+        */
+
+       if (counter_select & CS_PORT_XMIT_DATA)
+               write_dev_cntr(dd, C_DC_XMIT_FLITS, CNTR_INVALID_VL, 0);
+
+       if (counter_select & CS_PORT_RCV_DATA)
+               write_dev_cntr(dd, C_DC_RCV_FLITS, CNTR_INVALID_VL, 0);
+
+       if (counter_select & CS_PORT_XMIT_PKTS)
+               write_dev_cntr(dd, C_DC_XMIT_PKTS, CNTR_INVALID_VL, 0);
+
+       if (counter_select & CS_PORT_RCV_PKTS)
+               write_dev_cntr(dd, C_DC_RCV_PKTS, CNTR_INVALID_VL, 0);
+
+       if (counter_select & CS_PORT_MCAST_XMIT_PKTS)
+               write_dev_cntr(dd, C_DC_MC_XMIT_PKTS, CNTR_INVALID_VL, 0);
+
+       if (counter_select & CS_PORT_MCAST_RCV_PKTS)
+               write_dev_cntr(dd, C_DC_MC_RCV_PKTS, CNTR_INVALID_VL, 0);
+
+       if (counter_select & CS_PORT_XMIT_WAIT)
+               write_port_cntr(ppd, C_TX_WAIT, CNTR_INVALID_VL, 0);
+
+       /* ignore cs_sw_portCongestion for HFIs */
+
+       if (counter_select & CS_PORT_RCV_FECN)
+               write_dev_cntr(dd, C_DC_RCV_FCN, CNTR_INVALID_VL, 0);
+
+       if (counter_select & CS_PORT_RCV_BECN)
+               write_dev_cntr(dd, C_DC_RCV_BCN, CNTR_INVALID_VL, 0);
+
+       /* ignore cs_port_xmit_time_cong for HFIs */
+       /* ignore cs_port_xmit_wasted_bw for now */
+       /* ignore cs_port_xmit_wait_data for now */
+       if (counter_select & CS_PORT_RCV_BUBBLE)
+               write_dev_cntr(dd, C_DC_RCV_BBL, CNTR_INVALID_VL, 0);
+
+       /* Only applicable for switch */
+       /* if (counter_select & CS_PORT_MARK_FECN)
+        *      write_csr(dd, DCC_PRF_PORT_MARK_FECN_CNT, 0);
+        */
+
+       if (counter_select & CS_PORT_RCV_CONSTRAINT_ERRORS)
+               write_port_cntr(ppd, C_SW_RCV_CSTR_ERR, CNTR_INVALID_VL, 0);
+
+       /* ignore cs_port_rcv_switch_relay_errors for HFIs */
+       if (counter_select & CS_PORT_XMIT_DISCARDS)
+               write_port_cntr(ppd, C_SW_XMIT_DSCD, CNTR_INVALID_VL, 0);
+
+       if (counter_select & CS_PORT_XMIT_CONSTRAINT_ERRORS)
+               write_port_cntr(ppd, C_SW_XMIT_CSTR_ERR, CNTR_INVALID_VL, 0);
+
+       if (counter_select & CS_PORT_RCV_REMOTE_PHYSICAL_ERRORS)
+               write_dev_cntr(dd, C_DC_RMT_PHY_ERR, CNTR_INVALID_VL, 0);
+
+       if (counter_select & CS_LOCAL_LINK_INTEGRITY_ERRORS) {
+               write_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL, 0);
+               write_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL, 0);
+       }
+
+       if (counter_select & CS_LINK_ERROR_RECOVERY) {
+               write_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL, 0);
+               write_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT,
+                                               CNTR_INVALID_VL, 0);
+       }
+
+       if (counter_select & CS_PORT_RCV_ERRORS)
+               write_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL, 0);
+
+       if (counter_select & CS_EXCESSIVE_BUFFER_OVERRUNS) {
+               write_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL, 0);
+               dd->rcv_ovfl_cnt = 0;
+       }
+
+       if (counter_select & CS_FM_CONFIG_ERRORS)
+               write_dev_cntr(dd, C_DC_FM_CFG_ERR, CNTR_INVALID_VL, 0);
+
+       if (counter_select & CS_LINK_DOWNED)
+               write_port_cntr(ppd, C_SW_LINK_DOWN, CNTR_INVALID_VL, 0);
+
+       if (counter_select & CS_UNCORRECTABLE_ERRORS)
+               write_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL, 0);
+
+       for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
+                        8 * sizeof(vl_select_mask)) {
+
+               if (counter_select & CS_PORT_XMIT_DATA)
+                       write_port_cntr(ppd, C_TX_FLIT_VL, idx_from_vl(vl), 0);
+
+               if (counter_select & CS_PORT_RCV_DATA)
+                       write_dev_cntr(dd, C_DC_RX_FLIT_VL, idx_from_vl(vl), 0);
+
+               if (counter_select & CS_PORT_XMIT_PKTS)
+                       write_port_cntr(ppd, C_TX_PKT_VL, idx_from_vl(vl), 0);
+
+               if (counter_select & CS_PORT_RCV_PKTS)
+                       write_dev_cntr(dd, C_DC_RX_PKT_VL, idx_from_vl(vl), 0);
+
+               if (counter_select & CS_PORT_XMIT_WAIT)
+                       write_port_cntr(ppd, C_TX_WAIT_VL, idx_from_vl(vl), 0);
+
+               /* sw_port_vl_congestion is 0 for HFIs */
+               if (counter_select & CS_PORT_RCV_FECN)
+                       write_dev_cntr(dd, C_DC_RCV_FCN_VL, idx_from_vl(vl), 0);
+
+               if (counter_select & CS_PORT_RCV_BECN)
+                       write_dev_cntr(dd, C_DC_RCV_BCN_VL, idx_from_vl(vl), 0);
+
+               /* port_vl_xmit_time_cong is 0 for HFIs */
+               /* port_vl_xmit_wasted_bw ??? */
+               /* port_vl_xmit_wait_data - TXE (table 13-9 HFI spec) ??? */
+               if (counter_select & CS_PORT_RCV_BUBBLE)
+                       write_dev_cntr(dd, C_DC_RCV_BBL_VL, idx_from_vl(vl), 0);
+
+               /* if (counter_select & CS_PORT_MARK_FECN)
+                *      write_csr(dd, DCC_PRF_PORT_VL_MARK_FECN_CNT + offset, 0);
+                */
+               /* port_vl_xmit_discards ??? */
+       }
+
+       if (resp_len)
+               *resp_len += sizeof(*req);
+
+       return reply((struct ib_mad_hdr *)pmp);
+}
+
+static int pma_set_opa_errorinfo(struct opa_pma_mad *pmp,
+                       struct ib_device *ibdev, u8 port, u32 *resp_len)
+{
+       struct _port_ei *rsp;
+       struct opa_port_error_info_msg *req;
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       u64 port_mask;
+       u32 num_ports;
+       unsigned long port_num;
+       u8 num_pslm;
+       u32 error_info_select;
+
+       req = (struct opa_port_error_info_msg *)pmp->data;
+       rsp = (struct _port_ei *)&(req->port[0]);
+
+       num_ports = OPA_AM_NPORT(be32_to_cpu(pmp->mad_hdr.attr_mod));
+       num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3]));
+
+       memset(rsp, 0, sizeof(*rsp));
+
+       if (num_ports != 1 || num_ports != num_pslm) {
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)pmp);
+       }
+
+       /*
+        * The bit set in the mask needs to be consistent with the port
+        * the request came in on.
+        */
+       port_mask = be64_to_cpu(req->port_select_mask[3]);
+       port_num = find_first_bit((unsigned long *)&port_mask,
+                                 sizeof(port_mask) * 8);
+
+       if ((u8)port_num != port) {
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)pmp);
+       }
+
+       error_info_select = be32_to_cpu(req->error_info_select_mask);
+
+       /* PortRcvErrorInfo */
+       if (error_info_select & ES_PORT_RCV_ERROR_INFO)
+               /* turn off status bit */
+               dd->err_info_rcvport.status_and_code &= ~OPA_EI_STATUS_SMASK;
+
+       /* ExcessiveBufferOverrunInfo */
+       if (error_info_select & ES_EXCESSIVE_BUFFER_OVERRUN_INFO)
+               /* status bit is essentially kept in the h/w - bit 5 of
+                * RCV_ERR_INFO */
+               write_csr(dd, RCV_ERR_INFO,
+                         RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK);
+
+       if (error_info_select & ES_PORT_XMIT_CONSTRAINT_ERROR_INFO)
+               dd->err_info_xmit_constraint.status &= ~OPA_EI_STATUS_SMASK;
+
+       if (error_info_select & ES_PORT_RCV_CONSTRAINT_ERROR_INFO)
+               dd->err_info_rcv_constraint.status &= ~OPA_EI_STATUS_SMASK;
+
+       /* UncorrectableErrorInfo */
+       if (error_info_select & ES_UNCORRECTABLE_ERROR_INFO)
+               /* turn off status bit */
+               dd->err_info_uncorrectable &= ~OPA_EI_STATUS_SMASK;
+
+       /* FMConfigErrorInfo */
+       if (error_info_select & ES_FM_CONFIG_ERROR_INFO)
+               /* turn off status bit */
+               dd->err_info_fmconfig &= ~OPA_EI_STATUS_SMASK;
+
+       if (resp_len)
+               *resp_len += sizeof(*req);
+
+       return reply((struct ib_mad_hdr *)pmp);
+}
+
+struct opa_congestion_info_attr {
+       __be16 congestion_info;
+       u8 control_table_cap;   /* Multiple of 64 entry unit CCTs */
+       u8 congestion_log_length;
+} __packed;
+
+static int __subn_get_opa_cong_info(struct opa_smp *smp, u32 am, u8 *data,
+                                   struct ib_device *ibdev, u8 port,
+                                   u32 *resp_len)
+{
+       struct opa_congestion_info_attr *p =
+               (struct opa_congestion_info_attr *)data;
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+
+       p->congestion_info = 0;
+       p->control_table_cap = ppd->cc_max_table_entries;
+       p->congestion_log_length = OPA_CONG_LOG_ELEMS;
+
+       if (resp_len)
+               *resp_len += sizeof(*p);
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_get_opa_cong_setting(struct opa_smp *smp, u32 am,
+                                            u8 *data,
+                                            struct ib_device *ibdev,
+                                            u8 port, u32 *resp_len)
+{
+       int i;
+       struct opa_congestion_setting_attr *p =
+               (struct opa_congestion_setting_attr *) data;
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       struct opa_congestion_setting_entry_shadow *entries;
+       struct cc_state *cc_state;
+
+       rcu_read_lock();
+
+       cc_state = get_cc_state(ppd);
+
+       if (cc_state == NULL) {
+               rcu_read_unlock();
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       entries = cc_state->cong_setting.entries;
+       p->port_control = cpu_to_be16(cc_state->cong_setting.port_control);
+       p->control_map = cpu_to_be32(cc_state->cong_setting.control_map);
+       for (i = 0; i < OPA_MAX_SLS; i++) {
+               p->entries[i].ccti_increase = entries[i].ccti_increase;
+               p->entries[i].ccti_timer = cpu_to_be16(entries[i].ccti_timer);
+               p->entries[i].trigger_threshold =
+                       entries[i].trigger_threshold;
+               p->entries[i].ccti_min = entries[i].ccti_min;
+       }
+
+       rcu_read_unlock();
+
+       if (resp_len)
+               *resp_len += sizeof(*p);
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_set_opa_cong_setting(struct opa_smp *smp, u32 am, u8 *data,
+                                      struct ib_device *ibdev, u8 port,
+                                      u32 *resp_len)
+{
+       struct opa_congestion_setting_attr *p =
+               (struct opa_congestion_setting_attr *) data;
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       struct opa_congestion_setting_entry_shadow *entries;
+       int i;
+
+       ppd->cc_sl_control_map = be32_to_cpu(p->control_map);
+
+       entries = ppd->congestion_entries;
+       for (i = 0; i < OPA_MAX_SLS; i++) {
+               entries[i].ccti_increase = p->entries[i].ccti_increase;
+               entries[i].ccti_timer = be16_to_cpu(p->entries[i].ccti_timer);
+               entries[i].trigger_threshold =
+                       p->entries[i].trigger_threshold;
+               entries[i].ccti_min = p->entries[i].ccti_min;
+       }
+
+       return __subn_get_opa_cong_setting(smp, am, data, ibdev, port,
+                                          resp_len);
+}
+
+static int __subn_get_opa_hfi1_cong_log(struct opa_smp *smp, u32 am,
+                                       u8 *data, struct ib_device *ibdev,
+                                       u8 port, u32 *resp_len)
+{
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       struct opa_hfi1_cong_log *cong_log = (struct opa_hfi1_cong_log *)data;
+       s64 ts;
+       int i;
+
+       if (am != 0) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       spin_lock(&ppd->cc_log_lock);
+
+       cong_log->log_type = OPA_CC_LOG_TYPE_HFI;
+       cong_log->congestion_flags = 0;
+       cong_log->threshold_event_counter =
+               cpu_to_be16(ppd->threshold_event_counter);
+       memcpy(cong_log->threshold_cong_event_map,
+              ppd->threshold_cong_event_map,
+              sizeof(cong_log->threshold_cong_event_map));
+       /* keep timestamp in units of 1.024 usec */
+       ts = ktime_to_ns(ktime_get()) / 1024;
+       cong_log->current_time_stamp = cpu_to_be32(ts);
+       for (i = 0; i < OPA_CONG_LOG_ELEMS; i++) {
+               struct opa_hfi1_cong_log_event_internal *cce =
+                       &ppd->cc_events[ppd->cc_mad_idx++];
+               if (ppd->cc_mad_idx == OPA_CONG_LOG_ELEMS)
+                       ppd->cc_mad_idx = 0;
+               /*
+                * Entries which are older than twice the time
+                * required to wrap the counter are supposed to
+                * be zeroed (CA10-49 IBTA, release 1.2.1, V1).
+                */
+               if ((u64)(ts - cce->timestamp) > (2ULL * UINT_MAX))
+                       continue;
+               memcpy(cong_log->events[i].local_qp_cn_entry, &cce->lqpn, 3);
+               memcpy(cong_log->events[i].remote_qp_number_cn_entry,
+                       &cce->rqpn, 3);
+               cong_log->events[i].sl_svc_type_cn_entry =
+                       ((cce->sl & 0x1f) << 3) | (cce->svc_type & 0x7);
+               cong_log->events[i].remote_lid_cn_entry =
+                       cpu_to_be32(cce->rlid);
+               cong_log->events[i].timestamp_cn_entry =
+                       cpu_to_be32(cce->timestamp);
+       }
+
+       /*
+        * Reset threshold_cong_event_map, and threshold_event_counter
+        * to 0 when log is read.
+        */
+       memset(ppd->threshold_cong_event_map, 0x0,
+              sizeof(ppd->threshold_cong_event_map));
+       ppd->threshold_event_counter = 0;
+
+       spin_unlock(&ppd->cc_log_lock);
+
+       if (resp_len)
+               *resp_len += sizeof(struct opa_hfi1_cong_log);
+
+       return reply((struct ib_mad_hdr *)smp);
+}
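
The congestion log above keeps timestamps in 1.024 us units (nanoseconds divided by 1024) and skips events older than twice the 32-bit wrap period, per the CA10-49 note in the code. A rough sketch of those two calculations, with helper names of our own choosing:

    #include <stdint.h>
    #include <limits.h>

    /* Convert nanoseconds to the congestion log's 1.024 us tick. */
    static uint64_t ns_to_cc_ticks(uint64_t ns)
    {
            return ns / 1024;
    }

    /* An event is considered stale once it is older than two full
     * wrap periods of the 32-bit timestamp counter. */
    static int cc_event_is_stale(uint64_t now_ticks, uint64_t event_ticks)
    {
            return (now_ticks - event_ticks) > (2ULL * UINT_MAX);
    }
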
+
+static int __subn_get_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data,
+                                  struct ib_device *ibdev, u8 port,
+                                  u32 *resp_len)
+{
+       struct ib_cc_table_attr *cc_table_attr =
+               (struct ib_cc_table_attr *) data;
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       u32 start_block = OPA_AM_START_BLK(am);
+       u32 n_blocks = OPA_AM_NBLK(am);
+       struct ib_cc_table_entry_shadow *entries;
+       int i, j;
+       u32 sentry, eentry;
+       struct cc_state *cc_state;
+
+       /* sanity check n_blocks, start_block */
+       if (n_blocks == 0 ||
+           start_block + n_blocks > ppd->cc_max_table_entries) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       rcu_read_lock();
+
+       cc_state = get_cc_state(ppd);
+
+       if (cc_state == NULL) {
+               rcu_read_unlock();
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       sentry = start_block * IB_CCT_ENTRIES;
+       eentry = sentry + (IB_CCT_ENTRIES * n_blocks);
+
+       cc_table_attr->ccti_limit = cpu_to_be16(cc_state->cct.ccti_limit);
+
+       entries = cc_state->cct.entries;
+
+       /* return n_blocks, though the last block may not be full */
+       for (j = 0, i = sentry; i < eentry; j++, i++)
+               cc_table_attr->ccti_entries[j].entry =
+                       cpu_to_be16(entries[i].entry);
+
+       rcu_read_unlock();
+
+       if (resp_len)
+               *resp_len += sizeof(u16)*(IB_CCT_ENTRIES * n_blocks + 1);
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+void cc_state_reclaim(struct rcu_head *rcu)
+{
+       struct cc_state *cc_state = container_of(rcu, struct cc_state, rcu);
+
+       kfree(cc_state);
+}
+
+static int __subn_set_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data,
+                                  struct ib_device *ibdev, u8 port,
+                                  u32 *resp_len)
+{
+       struct ib_cc_table_attr *p = (struct ib_cc_table_attr *) data;
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       u32 start_block = OPA_AM_START_BLK(am);
+       u32 n_blocks = OPA_AM_NBLK(am);
+       struct ib_cc_table_entry_shadow *entries;
+       int i, j;
+       u32 sentry, eentry;
+       u16 ccti_limit;
+       struct cc_state *old_cc_state, *new_cc_state;
+
+       /* sanity check n_blocks, start_block */
+       if (n_blocks == 0 ||
+           start_block + n_blocks > ppd->cc_max_table_entries) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       sentry = start_block * IB_CCT_ENTRIES;
+       eentry = sentry + ((n_blocks - 1) * IB_CCT_ENTRIES) +
+                (be16_to_cpu(p->ccti_limit)) % IB_CCT_ENTRIES + 1;
+
+       /* sanity check ccti_limit */
+       ccti_limit = be16_to_cpu(p->ccti_limit);
+       if (ccti_limit + 1 > eentry) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       new_cc_state = kzalloc(sizeof(*new_cc_state), GFP_KERNEL);
+       if (new_cc_state == NULL)
+               goto getit;
+
+       spin_lock(&ppd->cc_state_lock);
+
+       old_cc_state = get_cc_state(ppd);
+
+       if (old_cc_state == NULL) {
+               spin_unlock(&ppd->cc_state_lock);
+               kfree(new_cc_state);
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       *new_cc_state = *old_cc_state;
+
+       new_cc_state->cct.ccti_limit = ccti_limit;
+
+       entries = ppd->ccti_entries;
+       ppd->total_cct_entry = ccti_limit + 1;
+
+       for (j = 0, i = sentry; i < eentry; j++, i++)
+               entries[i].entry = be16_to_cpu(p->ccti_entries[j].entry);
+
+       memcpy(new_cc_state->cct.entries, entries,
+              eentry * sizeof(struct ib_cc_table_entry));
+
+       new_cc_state->cong_setting.port_control = IB_CC_CCS_PC_SL_BASED;
+       new_cc_state->cong_setting.control_map = ppd->cc_sl_control_map;
+       memcpy(new_cc_state->cong_setting.entries, ppd->congestion_entries,
+              OPA_MAX_SLS * sizeof(struct opa_congestion_setting_entry));
+
+       rcu_assign_pointer(ppd->cc_state, new_cc_state);
+
+       spin_unlock(&ppd->cc_state_lock);
+
+       call_rcu(&old_cc_state->rcu, cc_state_reclaim);
+
+getit:
+       return __subn_get_opa_cc_table(smp, am, data, ibdev, port, resp_len);
+}
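
The congestion-control table is addressed in 64-entry blocks (see the "Multiple of 64 entry unit CCTs" note above), and the set path turns start_block, n_blocks and ccti_limit into a half-open range of entry indices. A worked sketch of that arithmetic, with example values only:

    #include <stdint.h>
    #include <stdio.h>

    #define IB_CCT_ENTRIES 64    /* entries per CCT block */

    int main(void)
    {
            uint32_t start_block = 1, n_blocks = 2;   /* example AM fields */
            uint16_t ccti_limit = 100;                /* example: last valid CCT index */

            uint32_t sentry = start_block * IB_CCT_ENTRIES;           /* 64 */
            uint32_t eentry = sentry + (n_blocks - 1) * IB_CCT_ENTRIES +
                              ccti_limit % IB_CCT_ENTRIES + 1;        /* 64+64+36+1 = 165 */

            printf("entries [%u, %u) are written\n",
                   (unsigned)sentry, (unsigned)eentry);
            return 0;
    }
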
+
+struct opa_led_info {
+       __be32 rsvd_led_mask;
+       __be32 rsvd;
+};
+
+#define OPA_LED_SHIFT  31
+#define OPA_LED_MASK   (1 << OPA_LED_SHIFT)
+
+static int __subn_get_opa_led_info(struct opa_smp *smp, u32 am, u8 *data,
+                                  struct ib_device *ibdev, u8 port,
+                                  u32 *resp_len)
+{
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       struct opa_led_info *p = (struct opa_led_info *) data;
+       u32 nport = OPA_AM_NPORT(am);
+       u64 reg;
+
+       if (nport != 1 || OPA_AM_PORTNUM(am)) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       reg = read_csr(dd, DCC_CFG_LED_CNTRL);
+       if ((reg & DCC_CFG_LED_CNTRL_LED_CNTRL_SMASK) &&
+               ((reg & DCC_CFG_LED_CNTRL_LED_SW_BLINK_RATE_SMASK) == 0xf))
+                       p->rsvd_led_mask = cpu_to_be32(OPA_LED_MASK);
+
+       if (resp_len)
+               *resp_len += sizeof(struct opa_led_info);
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+static int __subn_set_opa_led_info(struct opa_smp *smp, u32 am, u8 *data,
+                                  struct ib_device *ibdev, u8 port,
+                                  u32 *resp_len)
+{
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       struct opa_led_info *p = (struct opa_led_info *) data;
+       u32 nport = OPA_AM_NPORT(am);
+       int on = !!(be32_to_cpu(p->rsvd_led_mask) & OPA_LED_MASK);
+
+       if (nport != 1 || OPA_AM_PORTNUM(am)) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       setextled(dd, on);
+
+       return __subn_get_opa_led_info(smp, am, data, ibdev, port, resp_len);
+}
+
+static int subn_get_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am,
+                           u8 *data, struct ib_device *ibdev, u8 port,
+                           u32 *resp_len)
+{
+       int ret;
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+
+       switch (attr_id) {
+       case IB_SMP_ATTR_NODE_DESC:
+               ret = __subn_get_opa_nodedesc(smp, am, data, ibdev, port,
+                                             resp_len);
+               break;
+       case IB_SMP_ATTR_NODE_INFO:
+               ret = __subn_get_opa_nodeinfo(smp, am, data, ibdev, port,
+                                             resp_len);
+               break;
+       case IB_SMP_ATTR_PORT_INFO:
+               ret = __subn_get_opa_portinfo(smp, am, data, ibdev, port,
+                                             resp_len);
+               break;
+       case IB_SMP_ATTR_PKEY_TABLE:
+               ret = __subn_get_opa_pkeytable(smp, am, data, ibdev, port,
+                                              resp_len);
+               break;
+       case OPA_ATTRIB_ID_SL_TO_SC_MAP:
+               ret = __subn_get_opa_sl_to_sc(smp, am, data, ibdev, port,
+                                             resp_len);
+               break;
+       case OPA_ATTRIB_ID_SC_TO_SL_MAP:
+               ret = __subn_get_opa_sc_to_sl(smp, am, data, ibdev, port,
+                                             resp_len);
+               break;
+       case OPA_ATTRIB_ID_SC_TO_VLT_MAP:
+               ret = __subn_get_opa_sc_to_vlt(smp, am, data, ibdev, port,
+                                              resp_len);
+               break;
+       case OPA_ATTRIB_ID_SC_TO_VLNT_MAP:
+               ret = __subn_get_opa_sc_to_vlnt(smp, am, data, ibdev, port,
+                                              resp_len);
+               break;
+       case OPA_ATTRIB_ID_PORT_STATE_INFO:
+               ret = __subn_get_opa_psi(smp, am, data, ibdev, port,
+                                        resp_len);
+               break;
+       case OPA_ATTRIB_ID_BUFFER_CONTROL_TABLE:
+               ret = __subn_get_opa_bct(smp, am, data, ibdev, port,
+                                        resp_len);
+               break;
+       case OPA_ATTRIB_ID_CABLE_INFO:
+               ret = __subn_get_opa_cable_info(smp, am, data, ibdev, port,
+                                               resp_len);
+               break;
+       case IB_SMP_ATTR_VL_ARB_TABLE:
+               ret = __subn_get_opa_vl_arb(smp, am, data, ibdev, port,
+                                           resp_len);
+               break;
+       case OPA_ATTRIB_ID_CONGESTION_INFO:
+               ret = __subn_get_opa_cong_info(smp, am, data, ibdev, port,
+                                              resp_len);
+               break;
+       case OPA_ATTRIB_ID_HFI_CONGESTION_SETTING:
+               ret = __subn_get_opa_cong_setting(smp, am, data, ibdev,
+                                                 port, resp_len);
+               break;
+       case OPA_ATTRIB_ID_HFI_CONGESTION_LOG:
+               ret = __subn_get_opa_hfi1_cong_log(smp, am, data, ibdev,
+                                                  port, resp_len);
+               break;
+       case OPA_ATTRIB_ID_CONGESTION_CONTROL_TABLE:
+               ret = __subn_get_opa_cc_table(smp, am, data, ibdev, port,
+                                             resp_len);
+               break;
+       case IB_SMP_ATTR_LED_INFO:
+               ret = __subn_get_opa_led_info(smp, am, data, ibdev, port,
+                                             resp_len);
+               break;
+       case IB_SMP_ATTR_SM_INFO:
+               if (ibp->port_cap_flags & IB_PORT_SM_DISABLED)
+                       return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+               if (ibp->port_cap_flags & IB_PORT_SM)
+                       return IB_MAD_RESULT_SUCCESS;
+               /* FALLTHROUGH */
+       default:
+               smp->status |= IB_SMP_UNSUP_METH_ATTR;
+               ret = reply((struct ib_mad_hdr *)smp);
+               break;
+       }
+       return ret;
+}
+
+static int subn_set_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am,
+                           u8 *data, struct ib_device *ibdev, u8 port,
+                           u32 *resp_len)
+{
+       int ret;
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+
+       switch (attr_id) {
+       case IB_SMP_ATTR_PORT_INFO:
+               ret = __subn_set_opa_portinfo(smp, am, data, ibdev, port,
+                                             resp_len);
+               break;
+       case IB_SMP_ATTR_PKEY_TABLE:
+               ret = __subn_set_opa_pkeytable(smp, am, data, ibdev, port,
+                                              resp_len);
+               break;
+       case OPA_ATTRIB_ID_SL_TO_SC_MAP:
+               ret = __subn_set_opa_sl_to_sc(smp, am, data, ibdev, port,
+                                             resp_len);
+               break;
+       case OPA_ATTRIB_ID_SC_TO_SL_MAP:
+               ret = __subn_set_opa_sc_to_sl(smp, am, data, ibdev, port,
+                                             resp_len);
+               break;
+       case OPA_ATTRIB_ID_SC_TO_VLT_MAP:
+               ret = __subn_set_opa_sc_to_vlt(smp, am, data, ibdev, port,
+                                              resp_len);
+               break;
+       case OPA_ATTRIB_ID_SC_TO_VLNT_MAP:
+               ret = __subn_set_opa_sc_to_vlnt(smp, am, data, ibdev, port,
+                                              resp_len);
+               break;
+       case OPA_ATTRIB_ID_PORT_STATE_INFO:
+               ret = __subn_set_opa_psi(smp, am, data, ibdev, port,
+                                        resp_len);
+               break;
+       case OPA_ATTRIB_ID_BUFFER_CONTROL_TABLE:
+               ret = __subn_set_opa_bct(smp, am, data, ibdev, port,
+                                        resp_len);
+               break;
+       case IB_SMP_ATTR_VL_ARB_TABLE:
+               ret = __subn_set_opa_vl_arb(smp, am, data, ibdev, port,
+                                           resp_len);
+               break;
+       case OPA_ATTRIB_ID_HFI_CONGESTION_SETTING:
+               ret = __subn_set_opa_cong_setting(smp, am, data, ibdev,
+                                                 port, resp_len);
+               break;
+       case OPA_ATTRIB_ID_CONGESTION_CONTROL_TABLE:
+               ret = __subn_set_opa_cc_table(smp, am, data, ibdev, port,
+                                             resp_len);
+               break;
+       case IB_SMP_ATTR_LED_INFO:
+               ret = __subn_set_opa_led_info(smp, am, data, ibdev, port,
+                                             resp_len);
+               break;
+       case IB_SMP_ATTR_SM_INFO:
+               if (ibp->port_cap_flags & IB_PORT_SM_DISABLED)
+                       return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+               if (ibp->port_cap_flags & IB_PORT_SM)
+                       return IB_MAD_RESULT_SUCCESS;
+               /* FALLTHROUGH */
+       default:
+               smp->status |= IB_SMP_UNSUP_METH_ATTR;
+               ret = reply((struct ib_mad_hdr *)smp);
+               break;
+       }
+       return ret;
+}
+
+static inline void set_aggr_error(struct opa_aggregate *ag)
+{
+       ag->err_reqlength |= cpu_to_be16(0x8000);
+}
+
+static int subn_get_opa_aggregate(struct opa_smp *smp,
+                                 struct ib_device *ibdev, u8 port,
+                                 u32 *resp_len)
+{
+       int i;
+       u32 num_attr = be32_to_cpu(smp->attr_mod) & 0x000000ff;
+       u8 *next_smp = opa_get_smp_data(smp);
+
+       if (num_attr < 1 || num_attr > 117) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       for (i = 0; i < num_attr; i++) {
+               struct opa_aggregate *agg;
+               size_t agg_data_len;
+               size_t agg_size;
+               u32 am;
+
+               agg = (struct opa_aggregate *)next_smp;
+               agg_data_len = (be16_to_cpu(agg->err_reqlength) & 0x007f) * 8;
+               agg_size = sizeof(*agg) + agg_data_len;
+               am = be32_to_cpu(agg->attr_mod);
+
+               *resp_len += agg_size;
+
+               if (next_smp + agg_size > ((u8 *)smp) + sizeof(*smp)) {
+                       smp->status |= IB_SMP_INVALID_FIELD;
+                       return reply((struct ib_mad_hdr *)smp);
+               }
+
+               /* zero the payload for this segment */
+               memset(next_smp + sizeof(*agg), 0, agg_data_len);
+
+               (void) subn_get_opa_sma(agg->attr_id, smp, am, agg->data,
+                                       ibdev, port, NULL);
+               if (smp->status & ~IB_SMP_DIRECTION) {
+                       set_aggr_error(agg);
+                       return reply((struct ib_mad_hdr *)smp);
+               }
+               next_smp += agg_size;
+       }
+
+       return reply((struct ib_mad_hdr *)smp);
+}
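
Each segment in an OPA aggregate MAD carries its payload length in the low 7 bits of err_reqlength, in units of 8 bytes; the loop above advances next_smp by the segment header plus that payload. A minimal sketch of the length decoding (host byte order assumed for brevity; the helper is illustrative, not from the driver):

    #include <stddef.h>
    #include <stdint.h>

    /* Payload length, in bytes, of one aggregate segment: the low 7 bits
     * of the (host-order) err_reqlength field, counted in 8-byte units. */
    static size_t opa_aggregate_data_len(uint16_t err_reqlength)
    {
            return (size_t)(err_reqlength & 0x007f) * 8;
    }
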
+
+static int subn_set_opa_aggregate(struct opa_smp *smp,
+                                 struct ib_device *ibdev, u8 port,
+                                 u32 *resp_len)
+{
+       int i;
+       u32 num_attr = be32_to_cpu(smp->attr_mod) & 0x000000ff;
+       u8 *next_smp = opa_get_smp_data(smp);
+
+       if (num_attr < 1 || num_attr > 117) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               return reply((struct ib_mad_hdr *)smp);
+       }
+
+       for (i = 0; i < num_attr; i++) {
+               struct opa_aggregate *agg;
+               size_t agg_data_len;
+               size_t agg_size;
+               u32 am;
+
+               agg = (struct opa_aggregate *)next_smp;
+               agg_data_len = (be16_to_cpu(agg->err_reqlength) & 0x007f) * 8;
+               agg_size = sizeof(*agg) + agg_data_len;
+               am = be32_to_cpu(agg->attr_mod);
+
+               *resp_len += agg_size;
+
+               if (next_smp + agg_size > ((u8 *)smp) + sizeof(*smp)) {
+                       smp->status |= IB_SMP_INVALID_FIELD;
+                       return reply((struct ib_mad_hdr *)smp);
+               }
+
+               (void) subn_set_opa_sma(agg->attr_id, smp, am, agg->data,
+                                       ibdev, port, NULL);
+               if (smp->status & ~IB_SMP_DIRECTION) {
+                       set_aggr_error(agg);
+                       return reply((struct ib_mad_hdr *)smp);
+               }
+               next_smp += agg_size;
+       }
+
+       return reply((struct ib_mad_hdr *)smp);
+}
+
+/*
+ * OPAv1 specifies that, on the transition to link up, these counters
+ * are cleared:
+ *   PortRcvErrors [*]
+ *   LinkErrorRecovery
+ *   LocalLinkIntegrityErrors
+ *   ExcessiveBufferOverruns [*]
+ *
+ * [*] Error info associated with these counters is retained, but the
+ * error info status is reset to 0.
+ */
+void clear_linkup_counters(struct hfi1_devdata *dd)
+{
+       /* PortRcvErrors */
+       write_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL, 0);
+       dd->err_info_rcvport.status_and_code &= ~OPA_EI_STATUS_SMASK;
+       /* LinkErrorRecovery */
+       write_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL, 0);
+       write_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, CNTR_INVALID_VL, 0);
+       /* LocalLinkIntegrityErrors */
+       write_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL, 0);
+       write_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL, 0);
+       /* ExcessiveBufferOverruns */
+       write_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL, 0);
+       dd->rcv_ovfl_cnt = 0;
+       dd->err_info_xmit_constraint.status &= ~OPA_EI_STATUS_SMASK;
+}
+
+/*
+ * is_local_mad() returns 1 if 'mad' is sent from, and destined to the
+ * local node, 0 otherwise.
+ */
+static int is_local_mad(struct hfi1_ibport *ibp, const struct opa_mad *mad,
+                       const struct ib_wc *in_wc)
+{
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       const struct opa_smp *smp = (const struct opa_smp *)mad;
+
+       if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+               return (smp->hop_cnt == 0 &&
+                       smp->route.dr.dr_slid == OPA_LID_PERMISSIVE &&
+                       smp->route.dr.dr_dlid == OPA_LID_PERMISSIVE);
+       }
+
+       return (in_wc->slid == ppd->lid);
+}
+
+/*
+ * opa_local_smp_check() should only be called on MADs for which
+ * is_local_mad() returns true. It applies the SMP checks that are
+ * specific to SMPs which are sent from, and destined to this node.
+ * opa_local_smp_check() returns 0 if the SMP passes its checks, 1
+ * otherwise.
+ *
+ * SMPs which arrive from other nodes are instead checked by
+ * opa_smp_check().
+ */
+static int opa_local_smp_check(struct hfi1_ibport *ibp,
+                              const struct ib_wc *in_wc)
+{
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       u16 slid = in_wc->slid;
+       u16 pkey;
+
+       if (in_wc->pkey_index >= ARRAY_SIZE(ppd->pkeys))
+               return 1;
+
+       pkey = ppd->pkeys[in_wc->pkey_index];
+       /*
+        * We need to do the "node-local" checks specified in OPAv1,
+        * rev 0.90, section 9.10.26, which are:
+        *   - pkey is 0x7fff, or 0xffff
+        *   - Source QPN == 0 || Destination QPN == 0
+        *   - the MAD header's management class is either
+        *     IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE or
+        *     IB_MGMT_CLASS_SUBN_LID_ROUTED
+        *   - SLID != 0
+        *
+        * However, we know (and so don't need to check again) that,
+        * for local SMPs, the MAD stack passes MADs with:
+        *   - Source QPN of 0
+        *   - MAD mgmt_class is IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE
+        *   - SLID is either: OPA_LID_PERMISSIVE (0xFFFFFFFF), or
+        *     our own port's lid
+        *
+        */
+       if (pkey == LIM_MGMT_P_KEY || pkey == FULL_MGMT_P_KEY)
+               return 0;
+       ingress_pkey_table_fail(ppd, pkey, slid);
+       return 1;
+}
+
+static int process_subn_opa(struct ib_device *ibdev, int mad_flags,
+                           u8 port, const struct opa_mad *in_mad,
+                           struct opa_mad *out_mad,
+                           u32 *resp_len)
+{
+       struct opa_smp *smp = (struct opa_smp *)out_mad;
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       u8 *data;
+       u32 am;
+       __be16 attr_id;
+       int ret;
+
+       *out_mad = *in_mad;
+       data = opa_get_smp_data(smp);
+
+       am = be32_to_cpu(smp->attr_mod);
+       attr_id = smp->attr_id;
+       if (smp->class_version != OPA_SMI_CLASS_VERSION) {
+               smp->status |= IB_SMP_UNSUP_VERSION;
+               ret = reply((struct ib_mad_hdr *)smp);
+               goto bail;
+       }
+       ret = check_mkey(ibp, (struct ib_mad_hdr *)smp, mad_flags, smp->mkey,
+                        smp->route.dr.dr_slid, smp->route.dr.return_path,
+                        smp->hop_cnt);
+       if (ret) {
+               u32 port_num = be32_to_cpu(smp->attr_mod);
+
+               /*
+                * If this is a get/set portinfo, we already check the
+                * M_Key if the MAD is for another port and the M_Key
+                * is OK on the receiving port. This check is needed
+                * to increment the error counters when the M_Key
+                * fails to match on *both* ports.
+                */
+               if (attr_id == IB_SMP_ATTR_PORT_INFO &&
+                   (smp->method == IB_MGMT_METHOD_GET ||
+                    smp->method == IB_MGMT_METHOD_SET) &&
+                   port_num && port_num <= ibdev->phys_port_cnt &&
+                   port != port_num)
+                       (void) check_mkey(to_iport(ibdev, port_num),
+                                         (struct ib_mad_hdr *)smp, 0,
+                                         smp->mkey, smp->route.dr.dr_slid,
+                                         smp->route.dr.return_path,
+                                         smp->hop_cnt);
+               ret = IB_MAD_RESULT_FAILURE;
+               goto bail;
+       }
+
+       *resp_len = opa_get_smp_header_size(smp);
+
+       switch (smp->method) {
+       case IB_MGMT_METHOD_GET:
+               switch (attr_id) {
+               default:
+                       clear_opa_smp_data(smp);
+                       ret = subn_get_opa_sma(attr_id, smp, am, data,
+                                              ibdev, port, resp_len);
+                       goto bail;
+               case OPA_ATTRIB_ID_AGGREGATE:
+                       ret = subn_get_opa_aggregate(smp, ibdev, port,
+                                                    resp_len);
+                       goto bail;
+               }
+       case IB_MGMT_METHOD_SET:
+               switch (attr_id) {
+               default:
+                       ret = subn_set_opa_sma(attr_id, smp, am, data,
+                                              ibdev, port, resp_len);
+                       goto bail;
+               case OPA_ATTRIB_ID_AGGREGATE:
+                       ret = subn_set_opa_aggregate(smp, ibdev, port,
+                                                    resp_len);
+                       goto bail;
+               }
+       case IB_MGMT_METHOD_TRAP:
+       case IB_MGMT_METHOD_REPORT:
+       case IB_MGMT_METHOD_REPORT_RESP:
+       case IB_MGMT_METHOD_GET_RESP:
+               /*
+                * The ib_mad module will call us to process responses
+                * before checking for other consumers.
+                * Just tell the caller to process it normally.
+                */
+               ret = IB_MAD_RESULT_SUCCESS;
+               goto bail;
+       default:
+               smp->status |= IB_SMP_UNSUP_METHOD;
+               ret = reply((struct ib_mad_hdr *)smp);
+       }
+
+bail:
+       return ret;
+}
+
+static int process_subn(struct ib_device *ibdev, int mad_flags,
+                       u8 port, const struct ib_mad *in_mad,
+                       struct ib_mad *out_mad)
+{
+       struct ib_smp *smp = (struct ib_smp *)out_mad;
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       int ret;
+
+       *out_mad = *in_mad;
+       if (smp->class_version != 1) {
+               smp->status |= IB_SMP_UNSUP_VERSION;
+               ret = reply((struct ib_mad_hdr *)smp);
+               goto bail;
+       }
+
+       ret = check_mkey(ibp, (struct ib_mad_hdr *)smp, mad_flags,
+                        smp->mkey, (__force __be32)smp->dr_slid,
+                        smp->return_path, smp->hop_cnt);
+       if (ret) {
+               u32 port_num = be32_to_cpu(smp->attr_mod);
+
+               /*
+                * If this is a get/set portinfo, we already check the
+                * M_Key if the MAD is for another port and the M_Key
+                * is OK on the receiving port. This check is needed
+                * to increment the error counters when the M_Key
+                * fails to match on *both* ports.
+                */
+               if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO &&
+                   (smp->method == IB_MGMT_METHOD_GET ||
+                    smp->method == IB_MGMT_METHOD_SET) &&
+                   port_num && port_num <= ibdev->phys_port_cnt &&
+                   port != port_num)
+                       (void) check_mkey(to_iport(ibdev, port_num),
+                                         (struct ib_mad_hdr *)smp, 0,
+                                         smp->mkey,
+                                         (__force __be32)smp->dr_slid,
+                                         smp->return_path, smp->hop_cnt);
+               ret = IB_MAD_RESULT_FAILURE;
+               goto bail;
+       }
+
+       switch (smp->method) {
+       case IB_MGMT_METHOD_GET:
+               switch (smp->attr_id) {
+               case IB_SMP_ATTR_NODE_INFO:
+                       ret = subn_get_nodeinfo(smp, ibdev, port);
+                       goto bail;
+               default:
+                       smp->status |= IB_SMP_UNSUP_METH_ATTR;
+                       ret = reply((struct ib_mad_hdr *)smp);
+                       goto bail;
+               }
+       }
+
+bail:
+       return ret;
+}
+
+static int process_perf_opa(struct ib_device *ibdev, u8 port,
+                           const struct opa_mad *in_mad,
+                           struct opa_mad *out_mad, u32 *resp_len)
+{
+       struct opa_pma_mad *pmp = (struct opa_pma_mad *)out_mad;
+       int ret;
+
+       *out_mad = *in_mad;
+
+       if (pmp->mad_hdr.class_version != OPA_SMI_CLASS_VERSION) {
+               pmp->mad_hdr.status |= IB_SMP_UNSUP_VERSION;
+               return reply((struct ib_mad_hdr *)pmp);
+       }
+
+       *resp_len = sizeof(pmp->mad_hdr);
+
+       switch (pmp->mad_hdr.method) {
+       case IB_MGMT_METHOD_GET:
+               switch (pmp->mad_hdr.attr_id) {
+               case IB_PMA_CLASS_PORT_INFO:
+                       ret = pma_get_opa_classportinfo(pmp, ibdev, resp_len);
+                       goto bail;
+               case OPA_PM_ATTRIB_ID_PORT_STATUS:
+                       ret = pma_get_opa_portstatus(pmp, ibdev, port,
+                                                               resp_len);
+                       goto bail;
+               case OPA_PM_ATTRIB_ID_DATA_PORT_COUNTERS:
+                       ret = pma_get_opa_datacounters(pmp, ibdev, port,
+                                                               resp_len);
+                       goto bail;
+               case OPA_PM_ATTRIB_ID_ERROR_PORT_COUNTERS:
+                       ret = pma_get_opa_porterrors(pmp, ibdev, port,
+                                                               resp_len);
+                       goto bail;
+               case OPA_PM_ATTRIB_ID_ERROR_INFO:
+                       ret = pma_get_opa_errorinfo(pmp, ibdev, port,
+                                                               resp_len);
+                       goto bail;
+               default:
+                       pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
+                       ret = reply((struct ib_mad_hdr *)pmp);
+                       goto bail;
+               }
+
+       case IB_MGMT_METHOD_SET:
+               switch (pmp->mad_hdr.attr_id) {
+               case OPA_PM_ATTRIB_ID_CLEAR_PORT_STATUS:
+                       ret = pma_set_opa_portstatus(pmp, ibdev, port,
+                                                               resp_len);
+                       goto bail;
+               case OPA_PM_ATTRIB_ID_ERROR_INFO:
+                       ret = pma_set_opa_errorinfo(pmp, ibdev, port,
+                                                               resp_len);
+                       goto bail;
+               default:
+                       pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
+                       ret = reply((struct ib_mad_hdr *)pmp);
+                       goto bail;
+               }
+
+       case IB_MGMT_METHOD_TRAP:
+       case IB_MGMT_METHOD_GET_RESP:
+               /*
+                * The ib_mad module will call us to process responses
+                * before checking for other consumers.
+                * Just tell the caller to process it normally.
+                */
+               ret = IB_MAD_RESULT_SUCCESS;
+               goto bail;
+
+       default:
+               pmp->mad_hdr.status |= IB_SMP_UNSUP_METHOD;
+               ret = reply((struct ib_mad_hdr *)pmp);
+       }
+
+bail:
+       return ret;
+}
+
+static int hfi1_process_opa_mad(struct ib_device *ibdev, int mad_flags,
+                               u8 port, const struct ib_wc *in_wc,
+                               const struct ib_grh *in_grh,
+                               const struct opa_mad *in_mad,
+                               struct opa_mad *out_mad, size_t *out_mad_size,
+                               u16 *out_mad_pkey_index)
+{
+       int ret;
+       int pkey_idx;
+       u32 resp_len = 0;
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+
+       pkey_idx = hfi1_lookup_pkey_idx(ibp, LIM_MGMT_P_KEY);
+       if (pkey_idx < 0) {
+               pr_warn("failed to find limited mgmt pkey, defaulting 0x%x\n",
+                       hfi1_get_pkey(ibp, 1));
+               pkey_idx = 1;
+       }
+       *out_mad_pkey_index = (u16)pkey_idx;
+
+       switch (in_mad->mad_hdr.mgmt_class) {
+       case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
+       case IB_MGMT_CLASS_SUBN_LID_ROUTED:
+               if (is_local_mad(ibp, in_mad, in_wc)) {
+                       ret = opa_local_smp_check(ibp, in_wc);
+                       if (ret)
+                               return IB_MAD_RESULT_FAILURE;
+               }
+               ret = process_subn_opa(ibdev, mad_flags, port, in_mad,
+                                      out_mad, &resp_len);
+               goto bail;
+       case IB_MGMT_CLASS_PERF_MGMT:
+               ret = process_perf_opa(ibdev, port, in_mad, out_mad,
+                                      &resp_len);
+               goto bail;
+
+       default:
+               ret = IB_MAD_RESULT_SUCCESS;
+       }
+
+bail:
+       if (ret & IB_MAD_RESULT_REPLY)
+               *out_mad_size = round_up(resp_len, 8);
+       else if (ret & IB_MAD_RESULT_SUCCESS)
+               *out_mad_size = in_wc->byte_len - sizeof(struct ib_grh);
+
+       return ret;
+}
+
+static int hfi1_process_ib_mad(struct ib_device *ibdev, int mad_flags, u8 port,
+                              const struct ib_wc *in_wc,
+                              const struct ib_grh *in_grh,
+                              const struct ib_mad *in_mad,
+                              struct ib_mad *out_mad)
+{
+       int ret;
+
+       switch (in_mad->mad_hdr.mgmt_class) {
+       case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
+       case IB_MGMT_CLASS_SUBN_LID_ROUTED:
+               ret = process_subn(ibdev, mad_flags, port, in_mad, out_mad);
+               goto bail;
+       default:
+               ret = IB_MAD_RESULT_SUCCESS;
+       }
+
+bail:
+       return ret;
+}
+
+/**
+ * hfi1_process_mad - process an incoming MAD packet
+ * @ibdev: the infiniband device this packet came in on
+ * @mad_flags: MAD flags
+ * @port: the port number this packet came in on
+ * @in_wc: the work completion entry for this packet
+ * @in_grh: the global route header for this packet
+ * @in_mad: the incoming MAD
+ * @out_mad: any outgoing MAD reply
+ *
+ * Returns IB_MAD_RESULT_SUCCESS if this is a MAD that we are not
+ * interested in processing.
+ *
+ * Note that the verbs framework has already done the MAD sanity checks,
+ * and hop count/pointer updating for IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE
+ * MADs.
+ *
+ * This is called by the ib_mad module.
+ */
+int hfi1_process_mad(struct ib_device *ibdev, int mad_flags, u8 port,
+                    const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+                    const struct ib_mad_hdr *in_mad, size_t in_mad_size,
+                    struct ib_mad_hdr *out_mad, size_t *out_mad_size,
+                    u16 *out_mad_pkey_index)
+{
+       switch (in_mad->base_version) {
+       case OPA_MGMT_BASE_VERSION:
+               if (unlikely(in_mad_size != sizeof(struct opa_mad))) {
+                       dev_err(ibdev->dma_device, "invalid in_mad_size\n");
+                       return IB_MAD_RESULT_FAILURE;
+               }
+               return hfi1_process_opa_mad(ibdev, mad_flags, port,
+                                           in_wc, in_grh,
+                                           (struct opa_mad *)in_mad,
+                                           (struct opa_mad *)out_mad,
+                                           out_mad_size,
+                                           out_mad_pkey_index);
+       case IB_MGMT_BASE_VERSION:
+               return hfi1_process_ib_mad(ibdev, mad_flags, port,
+                                         in_wc, in_grh,
+                                         (const struct ib_mad *)in_mad,
+                                         (struct ib_mad *)out_mad);
+       default:
+               break;
+       }
+
+       return IB_MAD_RESULT_FAILURE;
+}
+
+static void send_handler(struct ib_mad_agent *agent,
+                        struct ib_mad_send_wc *mad_send_wc)
+{
+       ib_free_send_mad(mad_send_wc->send_buf);
+}
+
+int hfi1_create_agents(struct hfi1_ibdev *dev)
+{
+       struct hfi1_devdata *dd = dd_from_dev(dev);
+       struct ib_mad_agent *agent;
+       struct hfi1_ibport *ibp;
+       int p;
+       int ret;
+
+       for (p = 0; p < dd->num_pports; p++) {
+               ibp = &dd->pport[p].ibport_data;
+               agent = ib_register_mad_agent(&dev->ibdev, p + 1, IB_QPT_SMI,
+                                             NULL, 0, send_handler,
+                                             NULL, NULL, 0);
+               if (IS_ERR(agent)) {
+                       ret = PTR_ERR(agent);
+                       goto err;
+               }
+
+               ibp->send_agent = agent;
+       }
+
+       return 0;
+
+err:
+       for (p = 0; p < dd->num_pports; p++) {
+               ibp = &dd->pport[p].ibport_data;
+               if (ibp->send_agent) {
+                       agent = ibp->send_agent;
+                       ibp->send_agent = NULL;
+                       ib_unregister_mad_agent(agent);
+               }
+       }
+
+       return ret;
+}
+
+void hfi1_free_agents(struct hfi1_ibdev *dev)
+{
+       struct hfi1_devdata *dd = dd_from_dev(dev);
+       struct ib_mad_agent *agent;
+       struct hfi1_ibport *ibp;
+       int p;
+
+       for (p = 0; p < dd->num_pports; p++) {
+               ibp = &dd->pport[p].ibport_data;
+               if (ibp->send_agent) {
+                       agent = ibp->send_agent;
+                       ibp->send_agent = NULL;
+                       ib_unregister_mad_agent(agent);
+               }
+               if (ibp->sm_ah) {
+                       ib_destroy_ah(&ibp->sm_ah->ibah);
+                       ibp->sm_ah = NULL;
+               }
+       }
+}
diff --git a/drivers/staging/rdma/hfi1/mad.h b/drivers/staging/rdma/hfi1/mad.h
new file mode 100644 (file)
index 0000000..4745750
--- /dev/null
@@ -0,0 +1,325 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#ifndef _HFI1_MAD_H
+#define _HFI1_MAD_H
+
+#include <rdma/ib_pma.h>
+#define USE_PI_LED_ENABLE      1 /* use led enabled bit in struct
+                                  * opa_port_states, if available */
+#include <rdma/opa_smi.h>
+#include <rdma/opa_port_info.h>
+#ifndef PI_LED_ENABLE_SUP
+#define PI_LED_ENABLE_SUP 0
+#endif
+#include "opa_compat.h"
+
+
+
+#define IB_VLARB_LOWPRI_0_31    1
+#define IB_VLARB_LOWPRI_32_63   2
+#define IB_VLARB_HIGHPRI_0_31   3
+#define IB_VLARB_HIGHPRI_32_63  4
+
+#define OPA_MAX_PREEMPT_CAP         32
+#define OPA_VLARB_LOW_ELEMENTS       0
+#define OPA_VLARB_HIGH_ELEMENTS      1
+#define OPA_VLARB_PREEMPT_ELEMENTS   2
+#define OPA_VLARB_PREEMPT_MATRIX     3
+
+#define IB_PMA_PORT_COUNTERS_CONG       cpu_to_be16(0xFF00)
+
+struct ib_pma_portcounters_cong {
+       u8 reserved;
+       u8 reserved1;
+       __be16 port_check_rate;
+       __be16 symbol_error_counter;
+       u8 link_error_recovery_counter;
+       u8 link_downed_counter;
+       __be16 port_rcv_errors;
+       __be16 port_rcv_remphys_errors;
+       __be16 port_rcv_switch_relay_errors;
+       __be16 port_xmit_discards;
+       u8 port_xmit_constraint_errors;
+       u8 port_rcv_constraint_errors;
+       u8 reserved2;
+       u8 link_overrun_errors; /* LocalLink: 7:4, BufferOverrun: 3:0 */
+       __be16 reserved3;
+       __be16 vl15_dropped;
+       __be64 port_xmit_data;
+       __be64 port_rcv_data;
+       __be64 port_xmit_packets;
+       __be64 port_rcv_packets;
+       __be64 port_xmit_wait;
+       __be64 port_adr_events;
+} __packed;
+
+#define IB_SMP_UNSUP_VERSION    cpu_to_be16(0x0004)
+#define IB_SMP_UNSUP_METHOD     cpu_to_be16(0x0008)
+#define IB_SMP_UNSUP_METH_ATTR  cpu_to_be16(0x000C)
+#define IB_SMP_INVALID_FIELD    cpu_to_be16(0x001C)
+
+#define OPA_MAX_PREEMPT_CAP         32
+#define OPA_VLARB_LOW_ELEMENTS       0
+#define OPA_VLARB_HIGH_ELEMENTS      1
+#define OPA_VLARB_PREEMPT_ELEMENTS   2
+#define OPA_VLARB_PREEMPT_MATRIX     3
+
+#define HFI1_XMIT_RATE_UNSUPPORTED               0x0
+#define HFI1_XMIT_RATE_PICO                      0x7
+/* number of 4nsec cycles equaling 2secs */
+#define HFI1_CONG_TIMER_PSINTERVAL               0x1DCD64EC
+
+#define IB_CC_SVCTYPE_RC 0x0
+#define IB_CC_SVCTYPE_UC 0x1
+#define IB_CC_SVCTYPE_RD 0x2
+#define IB_CC_SVCTYPE_UD 0x3
+
+
+/*
+ * There should be an equivalent IB #define for the following, but
+ * I cannot find it.
+ */
+#define OPA_CC_LOG_TYPE_HFI    2
+
+struct opa_hfi1_cong_log_event_internal {
+       u32 lqpn;
+       u32 rqpn;
+       u8 sl;
+       u8 svc_type;
+       u32 rlid;
+       s64 timestamp; /* wider than 32 bits to detect 32 bit rollover */
+};
+
+struct opa_hfi1_cong_log_event {
+       u8 local_qp_cn_entry[3];
+       u8 remote_qp_number_cn_entry[3];
+       u8 sl_svc_type_cn_entry; /* 5 bits SL, 3 bits svc type */
+       u8 reserved;
+       __be32 remote_lid_cn_entry;
+       __be32 timestamp_cn_entry;
+} __packed;
+
+#define OPA_CONG_LOG_ELEMS     96
+
+struct opa_hfi1_cong_log {
+       u8 log_type;
+       u8 congestion_flags;
+       __be16 threshold_event_counter;
+       __be32 current_time_stamp;
+       u8 threshold_cong_event_map[OPA_MAX_SLS/8];
+       struct opa_hfi1_cong_log_event events[OPA_CONG_LOG_ELEMS];
+} __packed;
+
+#define IB_CC_TABLE_CAP_DEFAULT 31
+
+/* Port control flags */
+#define IB_CC_CCS_PC_SL_BASED 0x01
+
+struct opa_congestion_setting_entry {
+       u8 ccti_increase;
+       u8 reserved;
+       __be16 ccti_timer;
+       u8 trigger_threshold;
+       u8 ccti_min; /* min CCTI for cc table */
+} __packed;
+
+struct opa_congestion_setting_entry_shadow {
+       u8 ccti_increase;
+       u8 reserved;
+       u16 ccti_timer;
+       u8 trigger_threshold;
+       u8 ccti_min; /* min CCTI for cc table */
+} __packed;
+
+struct opa_congestion_setting_attr {
+       __be32 control_map;
+       __be16 port_control;
+       struct opa_congestion_setting_entry entries[OPA_MAX_SLS];
+} __packed;
+
+struct opa_congestion_setting_attr_shadow {
+       u32 control_map;
+       u16 port_control;
+       struct opa_congestion_setting_entry_shadow entries[OPA_MAX_SLS];
+} __packed;
+
+#define IB_CC_TABLE_ENTRY_INCREASE_DEFAULT 1
+#define IB_CC_TABLE_ENTRY_TIMER_DEFAULT 1
+
+/* 64 Congestion Control table entries in a single MAD */
+#define IB_CCT_ENTRIES 64
+#define IB_CCT_MIN_ENTRIES (IB_CCT_ENTRIES * 2)
+
+struct ib_cc_table_entry {
+       __be16 entry; /* shift:2, multiplier:14 */
+};
+
+struct ib_cc_table_entry_shadow {
+       u16 entry; /* shift:2, multiplier:14 */
+};
+
+struct ib_cc_table_attr {
+       __be16 ccti_limit; /* max CCTI for cc table */
+       struct ib_cc_table_entry ccti_entries[IB_CCT_ENTRIES];
+} __packed;
+
+struct ib_cc_table_attr_shadow {
+       u16 ccti_limit; /* max CCTI for cc table */
+       struct ib_cc_table_entry_shadow ccti_entries[IB_CCT_ENTRIES];
+} __packed;
+
+#define CC_TABLE_SHADOW_MAX \
+       (IB_CC_TABLE_CAP_DEFAULT * IB_CCT_ENTRIES)
+
+struct cc_table_shadow {
+       u16 ccti_limit; /* max CCTI for cc table */
+       struct ib_cc_table_entry_shadow entries[CC_TABLE_SHADOW_MAX];
+} __packed;
+
+/*
+ * struct cc_state combines the (active) per-port congestion control
+ * table, and the (active) per-SL congestion settings. cc_state data
+ * may need to be read in code paths that we want to be fast, so it
+ * is an RCU protected structure.
+ */
+struct cc_state {
+       struct rcu_head rcu;
+       struct cc_table_shadow cct;
+       struct opa_congestion_setting_attr_shadow cong_setting;
+};
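Since cc_state is published via RCU, a hot-path reader would typically pick it up under rcu_read_lock(). A minimal illustrative sketch (not part of this patch); the location of the active pointer, ppd->cc_state, is an assumption here, while the field layout follows the shadow structures above:

static u8 example_ccti_increase(struct hfi1_pportdata *ppd, u8 sl)
{
        struct cc_state *cc_state;
        u8 increase = 0;

        rcu_read_lock();
        cc_state = rcu_dereference(ppd->cc_state);      /* assumed field name */
        if (cc_state)
                increase = cc_state->cong_setting.entries[sl].ccti_increase;
        rcu_read_unlock();

        return increase;
}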
+
+/*
+ * OPA BufferControl MAD
+ */
+
+/* attribute modifier macros */
+#define OPA_AM_NPORT_SHIFT     24
+#define OPA_AM_NPORT_MASK      0xff
+#define OPA_AM_NPORT_SMASK     (OPA_AM_NPORT_MASK << OPA_AM_NPORT_SHIFT)
+#define OPA_AM_NPORT(am)       (((am) >> OPA_AM_NPORT_SHIFT) & \
+                                       OPA_AM_NPORT_MASK)
+
+#define OPA_AM_NBLK_SHIFT      24
+#define OPA_AM_NBLK_MASK       0xff
+#define OPA_AM_NBLK_SMASK      (OPA_AM_NBLK_MASK << OPA_AM_NBLK_SHIFT)
+#define OPA_AM_NBLK(am)                (((am) >> OPA_AM_NBLK_SHIFT) & \
+                                       OPA_AM_NBLK_MASK)
+
+#define OPA_AM_START_BLK_SHIFT 0
+#define OPA_AM_START_BLK_MASK  0xff
+#define OPA_AM_START_BLK_SMASK (OPA_AM_START_BLK_MASK << \
+                                       OPA_AM_START_BLK_SHIFT)
+#define OPA_AM_START_BLK(am)   (((am) >> OPA_AM_START_BLK_SHIFT) & \
+                                       OPA_AM_START_BLK_MASK)
+
+#define OPA_AM_PORTNUM_SHIFT   0
+#define OPA_AM_PORTNUM_MASK    0xff
+#define OPA_AM_PORTNUM_SMASK   (OPA_AM_PORTNUM_MASK << OPA_AM_PORTNUM_SHIFT)
+#define OPA_AM_PORTNUM(am)     (((am) >> OPA_AM_PORTNUM_SHIFT) & \
+                                       OPA_AM_PORTNUM_MASK)
+
+#define OPA_AM_ASYNC_SHIFT     12
+#define OPA_AM_ASYNC_MASK      0x1
+#define OPA_AM_ASYNC_SMASK     (OPA_AM_ASYNC_MASK << OPA_AM_ASYNC_SHIFT)
+#define OPA_AM_ASYNC(am)       (((am) >> OPA_AM_ASYNC_SHIFT) & \
+                                       OPA_AM_ASYNC_MASK)
+
+#define OPA_AM_START_SM_CFG_SHIFT      9
+#define OPA_AM_START_SM_CFG_MASK       0x1
+#define OPA_AM_START_SM_CFG_SMASK      (OPA_AM_START_SM_CFG_MASK << \
+                                               OPA_AM_START_SM_CFG_SHIFT)
+#define OPA_AM_START_SM_CFG(am)                (((am) >> OPA_AM_START_SM_CFG_SHIFT) \
+                                               & OPA_AM_START_SM_CFG_MASK)
+
+#define OPA_AM_CI_ADDR_SHIFT   19
+#define OPA_AM_CI_ADDR_MASK    0xfff
+#define OPA_AM_CI_ADDR_SMASK   (OPA_AM_CI_ADDR_MASK << OPA_AM_CI_ADDR_SHIFT)
+#define OPA_AM_CI_ADDR(am)     (((am) >> OPA_AM_CI_ADDR_SHIFT) & \
+                                       OPA_AM_CI_ADDR_MASK)
+
+#define OPA_AM_CI_LEN_SHIFT    13
+#define OPA_AM_CI_LEN_MASK     0x3f
+#define OPA_AM_CI_LEN_SMASK    (OPA_AM_CI_LEN_MASK << OPA_AM_CI_LEN_SHIFT)
+#define OPA_AM_CI_LEN(am)      (((am) >> OPA_AM_CI_LEN_SHIFT) & \
+                                       OPA_AM_CI_LEN_MASK)
+
+/* error info macros */
+#define OPA_EI_STATUS_SMASK    0x80
+#define OPA_EI_CODE_SMASK      0x0f
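The SMA/PMA handlers in mad.c convert the big-endian attribute modifier once with be32_to_cpu() and then pull individual fields out of it with the OPA_AM_* accessors defined above. A small illustrative sketch (not part of this patch):

static u32 example_decode_am(const struct opa_smp *smp)
{
        u32 am = be32_to_cpu(smp->attr_mod);
        u32 nport = OPA_AM_NPORT(am);           /* bits 31:24 of the AM */
        u32 start_blk = OPA_AM_START_BLK(am);   /* bits 7:0 of the AM */

        return (nport << 8) | start_blk;        /* packed purely for the example */
}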
+
+struct vl_limit {
+       __be16 dedicated;
+       __be16 shared;
+};
+
+struct buffer_control {
+       __be16 reserved;
+       __be16 overall_shared_limit;
+       struct vl_limit vl[OPA_MAX_VLS];
+};
+
+struct sc2vlnt {
+       u8 vlnt[32]; /* 5 bit VL, 3 bits reserved */
+};
+
+/*
+ * The PortSamplesControl.CounterMasks field is an array of 3 bit fields
+ * which specify the N'th counter's capabilities. See ch. 16.1.3.2.
+ * We support 5 counters which only count the mandatory quantities.
+ */
+#define COUNTER_MASK(q, n) ((q) << ((9 - (n)) * 3))
+#define COUNTER_MASK0_9 \
+       cpu_to_be32(COUNTER_MASK(1, 0) | \
+                   COUNTER_MASK(1, 1) | \
+                   COUNTER_MASK(1, 2) | \
+                   COUNTER_MASK(1, 3) | \
+                   COUNTER_MASK(1, 4))
+
+#endif                         /* _HFI1_MAD_H */
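Worked example for the COUNTER_MASK macros above (editorial note, not part of this patch): counter n occupies the 3-bit field starting at bit (9 - n) * 3, so with q = 1 the five supported counters land on bits 27, 24, 21, 18 and 15, and COUNTER_MASK0_9 evaluates to cpu_to_be32(0x09248000).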
diff --git a/drivers/staging/rdma/hfi1/mmap.c b/drivers/staging/rdma/hfi1/mmap.c
new file mode 100644 (file)
index 0000000..5173b1c
--- /dev/null
@@ -0,0 +1,192 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <asm/pgtable.h>
+
+#include "verbs.h"
+
+/**
+ * hfi1_release_mmap_info - free mmap info structure
+ * @ref: a pointer to the kref within struct hfi1_mmap_info
+ */
+void hfi1_release_mmap_info(struct kref *ref)
+{
+       struct hfi1_mmap_info *ip =
+               container_of(ref, struct hfi1_mmap_info, ref);
+       struct hfi1_ibdev *dev = to_idev(ip->context->device);
+
+       spin_lock_irq(&dev->pending_lock);
+       list_del(&ip->pending_mmaps);
+       spin_unlock_irq(&dev->pending_lock);
+
+       vfree(ip->obj);
+       kfree(ip);
+}
+
+/*
+ * open and close keep track of how many times the CQ is mapped,
+ * to avoid releasing it.
+ */
+static void hfi1_vma_open(struct vm_area_struct *vma)
+{
+       struct hfi1_mmap_info *ip = vma->vm_private_data;
+
+       kref_get(&ip->ref);
+}
+
+static void hfi1_vma_close(struct vm_area_struct *vma)
+{
+       struct hfi1_mmap_info *ip = vma->vm_private_data;
+
+       kref_put(&ip->ref, hfi1_release_mmap_info);
+}
+
+static struct vm_operations_struct hfi1_vm_ops = {
+       .open =     hfi1_vma_open,
+       .close =    hfi1_vma_close,
+};
+
+/**
+ * hfi1_mmap - create a new mmap region
+ * @context: the IB user context of the process making the mmap() call
+ * @vma: the VMA to be initialized
+ * Return zero if the mmap is OK. Otherwise, return an errno.
+ */
+int hfi1_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
+{
+       struct hfi1_ibdev *dev = to_idev(context->device);
+       unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
+       unsigned long size = vma->vm_end - vma->vm_start;
+       struct hfi1_mmap_info *ip, *pp;
+       int ret = -EINVAL;
+
+       /*
+        * Search the device's list of objects waiting for a mmap call.
+        * Normally, this list is very short since a call to create a
+        * CQ, QP, or SRQ is soon followed by a call to mmap().
+        */
+       spin_lock_irq(&dev->pending_lock);
+       list_for_each_entry_safe(ip, pp, &dev->pending_mmaps,
+                                pending_mmaps) {
+               /* Only the creator is allowed to mmap the object */
+               if (context != ip->context || (__u64) offset != ip->offset)
+                       continue;
+               /* Don't allow a mmap larger than the object. */
+               if (size > ip->size)
+                       break;
+
+               list_del_init(&ip->pending_mmaps);
+               spin_unlock_irq(&dev->pending_lock);
+
+               ret = remap_vmalloc_range(vma, ip->obj, 0);
+               if (ret)
+                       goto done;
+               vma->vm_ops = &hfi1_vm_ops;
+               vma->vm_private_data = ip;
+               hfi1_vma_open(vma);
+               goto done;
+       }
+       spin_unlock_irq(&dev->pending_lock);
+done:
+       return ret;
+}
+
+/*
+ * Allocate information for hfi1_mmap
+ */
+struct hfi1_mmap_info *hfi1_create_mmap_info(struct hfi1_ibdev *dev,
+                                            u32 size,
+                                            struct ib_ucontext *context,
+                                            void *obj) {
+       struct hfi1_mmap_info *ip;
+
+       ip = kmalloc(sizeof(*ip), GFP_KERNEL);
+       if (!ip)
+               goto bail;
+
+       size = PAGE_ALIGN(size);
+
+       spin_lock_irq(&dev->mmap_offset_lock);
+       if (dev->mmap_offset == 0)
+               dev->mmap_offset = PAGE_SIZE;
+       ip->offset = dev->mmap_offset;
+       dev->mmap_offset += size;
+       spin_unlock_irq(&dev->mmap_offset_lock);
+
+       INIT_LIST_HEAD(&ip->pending_mmaps);
+       ip->size = size;
+       ip->context = context;
+       ip->obj = obj;
+       kref_init(&ip->ref);
+
+bail:
+       return ip;
+}
+
+void hfi1_update_mmap_info(struct hfi1_ibdev *dev, struct hfi1_mmap_info *ip,
+                          u32 size, void *obj)
+{
+       size = PAGE_ALIGN(size);
+
+       spin_lock_irq(&dev->mmap_offset_lock);
+       if (dev->mmap_offset == 0)
+               dev->mmap_offset = PAGE_SIZE;
+       ip->offset = dev->mmap_offset;
+       dev->mmap_offset += size;
+       spin_unlock_irq(&dev->mmap_offset_lock);
+
+       ip->size = size;
+       ip->obj = obj;
+}
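The mmap support above is consumed by the object-creation paths (CQ, QP, SRQ), which live in other files of this patch. A rough sketch of that producer side (not part of this patch; queuing the object on dev->pending_mmaps is an assumption based on how hfi1_mmap() and hfi1_release_mmap_info() use that list):

static u64 example_publish_for_mmap(struct hfi1_ibdev *dev,
                                    struct ib_ucontext *context,
                                    void *obj, u32 size)
{
        struct hfi1_mmap_info *ip;

        ip = hfi1_create_mmap_info(dev, size, context, obj);
        if (!ip)
                return 0;

        /* make the object findable by a later mmap() from the same context */
        spin_lock_irq(&dev->pending_lock);
        list_add(&ip->pending_mmaps, &dev->pending_mmaps);
        spin_unlock_irq(&dev->pending_lock);

        /* userspace hands this offset back to mmap() on the verbs fd */
        return ip->offset;
}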
diff --git a/drivers/staging/rdma/hfi1/mr.c b/drivers/staging/rdma/hfi1/mr.c
new file mode 100644 (file)
index 0000000..bd64e4f
--- /dev/null
@@ -0,0 +1,551 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <rdma/ib_umem.h>
+#include <rdma/ib_smi.h>
+
+#include "hfi.h"
+
+/* Fast memory region */
+struct hfi1_fmr {
+       struct ib_fmr ibfmr;
+       struct hfi1_mregion mr;        /* must be last */
+};
+
+static inline struct hfi1_fmr *to_ifmr(struct ib_fmr *ibfmr)
+{
+       return container_of(ibfmr, struct hfi1_fmr, ibfmr);
+}
+
+static int init_mregion(struct hfi1_mregion *mr, struct ib_pd *pd,
+                       int count)
+{
+       int m, i = 0;
+       int rval = 0;
+
+       m = (count + HFI1_SEGSZ - 1) / HFI1_SEGSZ;
+       for (; i < m; i++) {
+               mr->map[i] = kzalloc(sizeof(*mr->map[0]), GFP_KERNEL);
+               if (!mr->map[i])
+                       goto bail;
+       }
+       mr->mapsz = m;
+       init_completion(&mr->comp);
+       /* count returning the ptr to user */
+       atomic_set(&mr->refcount, 1);
+       mr->pd = pd;
+       mr->max_segs = count;
+out:
+       return rval;
+bail:
+       while (i)
+               kfree(mr->map[--i]);
+       rval = -ENOMEM;
+       goto out;
+}
+
+static void deinit_mregion(struct hfi1_mregion *mr)
+{
+       int i = mr->mapsz;
+
+       mr->mapsz = 0;
+       while (i)
+               kfree(mr->map[--i]);
+}
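The region's page table is two-level, mr->map[m]->segs[n], with HFI1_SEGSZ segments per first-level chunk; that is why the registration loops below advance n and bump m every HFI1_SEGSZ entries. Equivalent indexing as an illustrative sketch (not part of this patch; the segment structure name is an assumption):

/* struct hfi1_seg is assumed to be roughly { void *vaddr; size_t length; } */
static inline struct hfi1_seg *example_seg(struct hfi1_mregion *mr, int i)
{
        return &mr->map[i / HFI1_SEGSZ]->segs[i % HFI1_SEGSZ];
}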
+
+
+/**
+ * hfi1_get_dma_mr - get a DMA memory region
+ * @pd: protection domain for this memory region
+ * @acc: access flags
+ *
+ * Returns the memory region on success, otherwise returns an errno.
+ * Note that all DMA addresses should be created via the
+ * struct ib_dma_mapping_ops functions (see dma.c).
+ */
+struct ib_mr *hfi1_get_dma_mr(struct ib_pd *pd, int acc)
+{
+       struct hfi1_mr *mr = NULL;
+       struct ib_mr *ret;
+       int rval;
+
+       if (to_ipd(pd)->user) {
+               ret = ERR_PTR(-EPERM);
+               goto bail;
+       }
+
+       mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+       if (!mr) {
+               ret = ERR_PTR(-ENOMEM);
+               goto bail;
+       }
+
+       rval = init_mregion(&mr->mr, pd, 0);
+       if (rval) {
+               ret = ERR_PTR(rval);
+               goto bail;
+       }
+
+
+       rval = hfi1_alloc_lkey(&mr->mr, 1);
+       if (rval) {
+               ret = ERR_PTR(rval);
+               goto bail_mregion;
+       }
+
+       mr->mr.access_flags = acc;
+       ret = &mr->ibmr;
+done:
+       return ret;
+
+bail_mregion:
+       deinit_mregion(&mr->mr);
+bail:
+       kfree(mr);
+       goto done;
+}
+
+static struct hfi1_mr *alloc_mr(int count, struct ib_pd *pd)
+{
+       struct hfi1_mr *mr;
+       int rval = -ENOMEM;
+       int m;
+
+       /* Allocate struct plus pointers to first level page tables. */
+       m = (count + HFI1_SEGSZ - 1) / HFI1_SEGSZ;
+       mr = kzalloc(sizeof(*mr) + m * sizeof(mr->mr.map[0]), GFP_KERNEL);
+       if (!mr)
+               goto bail;
+
+       rval = init_mregion(&mr->mr, pd, count);
+       if (rval)
+               goto bail;
+       /*
+        * ib_reg_phys_mr() will initialize mr->ibmr except for
+        * lkey and rkey.
+        */
+       rval = hfi1_alloc_lkey(&mr->mr, 0);
+       if (rval)
+               goto bail_mregion;
+       mr->ibmr.lkey = mr->mr.lkey;
+       mr->ibmr.rkey = mr->mr.lkey;
+done:
+       return mr;
+
+bail_mregion:
+       deinit_mregion(&mr->mr);
+bail:
+       kfree(mr);
+       mr = ERR_PTR(rval);
+       goto done;
+}
+
+/**
+ * hfi1_reg_phys_mr - register a physical memory region
+ * @pd: protection domain for this memory region
+ * @buffer_list: pointer to the list of physical buffers to register
+ * @num_phys_buf: the number of physical buffers to register
+ * @iova_start: the starting address passed over IB which maps to this MR
+ *
+ * Returns the memory region on success, otherwise returns an errno.
+ */
+struct ib_mr *hfi1_reg_phys_mr(struct ib_pd *pd,
+                              struct ib_phys_buf *buffer_list,
+                              int num_phys_buf, int acc, u64 *iova_start)
+{
+       struct hfi1_mr *mr;
+       int n, m, i;
+       struct ib_mr *ret;
+
+       mr = alloc_mr(num_phys_buf, pd);
+       if (IS_ERR(mr)) {
+               ret = (struct ib_mr *)mr;
+               goto bail;
+       }
+
+       mr->mr.user_base = *iova_start;
+       mr->mr.iova = *iova_start;
+       mr->mr.access_flags = acc;
+
+       m = 0;
+       n = 0;
+       for (i = 0; i < num_phys_buf; i++) {
+               mr->mr.map[m]->segs[n].vaddr = (void *) buffer_list[i].addr;
+               mr->mr.map[m]->segs[n].length = buffer_list[i].size;
+               mr->mr.length += buffer_list[i].size;
+               n++;
+               if (n == HFI1_SEGSZ) {
+                       m++;
+                       n = 0;
+               }
+       }
+
+       ret = &mr->ibmr;
+
+bail:
+       return ret;
+}
+
+/**
+ * hfi1_reg_user_mr - register a userspace memory region
+ * @pd: protection domain for this memory region
+ * @start: starting userspace address
+ * @length: length of region to register
+ * @mr_access_flags: access flags for this memory region
+ * @udata: unused by the driver
+ *
+ * Returns the memory region on success, otherwise returns an errno.
+ */
+struct ib_mr *hfi1_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+                              u64 virt_addr, int mr_access_flags,
+                              struct ib_udata *udata)
+{
+       struct hfi1_mr *mr;
+       struct ib_umem *umem;
+       struct scatterlist *sg;
+       int n, m, entry;
+       struct ib_mr *ret;
+
+       if (length == 0) {
+               ret = ERR_PTR(-EINVAL);
+               goto bail;
+       }
+
+       umem = ib_umem_get(pd->uobject->context, start, length,
+                          mr_access_flags, 0);
+       if (IS_ERR(umem))
+               return (void *) umem;
+
+       n = umem->nmap;
+
+       mr = alloc_mr(n, pd);
+       if (IS_ERR(mr)) {
+               ret = (struct ib_mr *)mr;
+               ib_umem_release(umem);
+               goto bail;
+       }
+
+       mr->mr.user_base = start;
+       mr->mr.iova = virt_addr;
+       mr->mr.length = length;
+       mr->mr.offset = ib_umem_offset(umem);
+       mr->mr.access_flags = mr_access_flags;
+       mr->umem = umem;
+
+       if (is_power_of_2(umem->page_size))
+               mr->mr.page_shift = ilog2(umem->page_size);
+       m = 0;
+       n = 0;
+       for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
+                       void *vaddr;
+
+                       vaddr = page_address(sg_page(sg));
+                       if (!vaddr) {
+                               ret = ERR_PTR(-EINVAL);
+                               goto bail;
+                       }
+                       mr->mr.map[m]->segs[n].vaddr = vaddr;
+                       mr->mr.map[m]->segs[n].length = umem->page_size;
+                       n++;
+                       if (n == HFI1_SEGSZ) {
+                               m++;
+                               n = 0;
+                       }
+       }
+       ret = &mr->ibmr;
+
+bail:
+       return ret;
+}
+
+/**
+ * hfi1_dereg_mr - unregister and free a memory region
+ * @ibmr: the memory region to free
+ *
+ * Returns 0 on success.
+ *
+ * Note that this is called to free MRs created by hfi1_get_dma_mr()
+ * or hfi1_reg_user_mr().
+ */
+int hfi1_dereg_mr(struct ib_mr *ibmr)
+{
+       struct hfi1_mr *mr = to_imr(ibmr);
+       int ret = 0;
+       unsigned long timeout;
+
+       hfi1_free_lkey(&mr->mr);
+
+       hfi1_put_mr(&mr->mr); /* will set completion if last */
+       timeout = wait_for_completion_timeout(&mr->mr.comp,
+               5 * HZ);
+       if (!timeout) {
+               dd_dev_err(
+                       dd_from_ibdev(mr->mr.pd->device),
+                       "hfi1_dereg_mr timeout mr %p pd %p refcount %u\n",
+                       mr, mr->mr.pd, atomic_read(&mr->mr.refcount));
+               hfi1_get_mr(&mr->mr);
+               ret = -EBUSY;
+               goto out;
+       }
+       deinit_mregion(&mr->mr);
+       if (mr->umem)
+               ib_umem_release(mr->umem);
+       kfree(mr);
+out:
+       return ret;
+}
+
+/*
+ * Allocate a memory region usable with the
+ * IB_WR_FAST_REG_MR send work request.
+ *
+ * Return the memory region on success, otherwise return an errno.
+ */
+struct ib_mr *hfi1_alloc_mr(struct ib_pd *pd,
+                           enum ib_mr_type mr_type,
+                           u32 max_num_sg)
+{
+       struct hfi1_mr *mr;
+
+       if (mr_type != IB_MR_TYPE_MEM_REG)
+               return ERR_PTR(-EINVAL);
+
+       mr = alloc_mr(max_num_sg, pd);
+       if (IS_ERR(mr))
+               return (struct ib_mr *)mr;
+
+       return &mr->ibmr;
+}
+
+struct ib_fast_reg_page_list *
+hfi1_alloc_fast_reg_page_list(struct ib_device *ibdev, int page_list_len)
+{
+       unsigned size = page_list_len * sizeof(u64);
+       struct ib_fast_reg_page_list *pl;
+
+       if (size > PAGE_SIZE)
+               return ERR_PTR(-EINVAL);
+
+       pl = kzalloc(sizeof(*pl), GFP_KERNEL);
+       if (!pl)
+               return ERR_PTR(-ENOMEM);
+
+       pl->page_list = kzalloc(size, GFP_KERNEL);
+       if (!pl->page_list)
+               goto err_free;
+
+       return pl;
+
+err_free:
+       kfree(pl);
+       return ERR_PTR(-ENOMEM);
+}
+
+void hfi1_free_fast_reg_page_list(struct ib_fast_reg_page_list *pl)
+{
+       kfree(pl->page_list);
+       kfree(pl);
+}
+
+/**
+ * hfi1_alloc_fmr - allocate a fast memory region
+ * @pd: the protection domain for this memory region
+ * @mr_access_flags: access flags for this memory region
+ * @fmr_attr: fast memory region attributes
+ *
+ * Returns the memory region on success, otherwise returns an errno.
+ */
+struct ib_fmr *hfi1_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
+                             struct ib_fmr_attr *fmr_attr)
+{
+       struct hfi1_fmr *fmr;
+       int m;
+       struct ib_fmr *ret;
+       int rval = -ENOMEM;
+
+       /* Allocate struct plus pointers to first level page tables. */
+       m = (fmr_attr->max_pages + HFI1_SEGSZ - 1) / HFI1_SEGSZ;
+       fmr = kzalloc(sizeof(*fmr) + m * sizeof(fmr->mr.map[0]), GFP_KERNEL);
+       if (!fmr)
+               goto bail;
+
+       rval = init_mregion(&fmr->mr, pd, fmr_attr->max_pages);
+       if (rval)
+               goto bail;
+
+       /*
+        * ib_alloc_fmr() will initialize fmr->ibfmr except for lkey &
+        * rkey.
+        */
+       rval = hfi1_alloc_lkey(&fmr->mr, 0);
+       if (rval)
+               goto bail_mregion;
+       fmr->ibfmr.rkey = fmr->mr.lkey;
+       fmr->ibfmr.lkey = fmr->mr.lkey;
+       /*
+        * Resources are allocated but no valid mapping (RKEY can't be
+        * used).
+        */
+       fmr->mr.access_flags = mr_access_flags;
+       fmr->mr.max_segs = fmr_attr->max_pages;
+       fmr->mr.page_shift = fmr_attr->page_shift;
+
+       ret = &fmr->ibfmr;
+done:
+       return ret;
+
+bail_mregion:
+       deinit_mregion(&fmr->mr);
+bail:
+       kfree(fmr);
+       ret = ERR_PTR(rval);
+       goto done;
+}
+
+/**
+ * hfi1_map_phys_fmr - set up a fast memory region
+ * @ibmfr: the fast memory region to set up
+ * @page_list: the list of pages to associate with the fast memory region
+ * @list_len: the number of pages to associate with the fast memory region
+ * @iova: the virtual address of the start of the fast memory region
+ *
+ * This may be called from interrupt context.
+ */
+
+int hfi1_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
+                     int list_len, u64 iova)
+{
+       struct hfi1_fmr *fmr = to_ifmr(ibfmr);
+       struct hfi1_lkey_table *rkt;
+       unsigned long flags;
+       int m, n, i;
+       u32 ps;
+       int ret;
+
+       i = atomic_read(&fmr->mr.refcount);
+       if (i > 2)
+               return -EBUSY;
+
+       if (list_len > fmr->mr.max_segs) {
+               ret = -EINVAL;
+               goto bail;
+       }
+       rkt = &to_idev(ibfmr->device)->lk_table;
+       spin_lock_irqsave(&rkt->lock, flags);
+       fmr->mr.user_base = iova;
+       fmr->mr.iova = iova;
+       ps = 1 << fmr->mr.page_shift;
+       fmr->mr.length = list_len * ps;
+       m = 0;
+       n = 0;
+       for (i = 0; i < list_len; i++) {
+               fmr->mr.map[m]->segs[n].vaddr = (void *) page_list[i];
+               fmr->mr.map[m]->segs[n].length = ps;
+               if (++n == HFI1_SEGSZ) {
+                       m++;
+                       n = 0;
+               }
+       }
+       spin_unlock_irqrestore(&rkt->lock, flags);
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+/**
+ * hfi1_unmap_fmr - unmap fast memory regions
+ * @fmr_list: the list of fast memory regions to unmap
+ *
+ * Returns 0 on success.
+ */
+int hfi1_unmap_fmr(struct list_head *fmr_list)
+{
+       struct hfi1_fmr *fmr;
+       struct hfi1_lkey_table *rkt;
+       unsigned long flags;
+
+       list_for_each_entry(fmr, fmr_list, ibfmr.list) {
+               rkt = &to_idev(fmr->ibfmr.device)->lk_table;
+               spin_lock_irqsave(&rkt->lock, flags);
+               fmr->mr.user_base = 0;
+               fmr->mr.iova = 0;
+               fmr->mr.length = 0;
+               spin_unlock_irqrestore(&rkt->lock, flags);
+       }
+       return 0;
+}
+
+/**
+ * hfi1_dealloc_fmr - deallocate a fast memory region
+ * @ibfmr: the fast memory region to deallocate
+ *
+ * Returns 0 on success.
+ */
+int hfi1_dealloc_fmr(struct ib_fmr *ibfmr)
+{
+       struct hfi1_fmr *fmr = to_ifmr(ibfmr);
+       int ret = 0;
+       unsigned long timeout;
+
+       hfi1_free_lkey(&fmr->mr);
+       hfi1_put_mr(&fmr->mr); /* will set completion if last */
+       timeout = wait_for_completion_timeout(&fmr->mr.comp,
+               5 * HZ);
+       if (!timeout) {
+               hfi1_get_mr(&fmr->mr);
+               ret = -EBUSY;
+               goto out;
+       }
+       deinit_mregion(&fmr->mr);
+       kfree(fmr);
+out:
+       return ret;
+}
diff --git a/drivers/staging/rdma/hfi1/opa_compat.h b/drivers/staging/rdma/hfi1/opa_compat.h
new file mode 100644 (file)
index 0000000..f64eec1
--- /dev/null
@@ -0,0 +1,129 @@
+#ifndef _HFI1_OPA_COMPAT_H
+#define _HFI1_OPA_COMPAT_H
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * This header file is for OPA-specific definitions which are
+ * required by the HFI driver, and which aren't yet in the Linux
+ * IB core. We'll collect these all here, then merge them into
+ * the kernel when that's convenient.
+ */
+
+/* OPA SMA attribute IDs */
+#define OPA_ATTRIB_ID_CONGESTION_INFO          cpu_to_be16(0x008b)
+#define OPA_ATTRIB_ID_HFI_CONGESTION_LOG       cpu_to_be16(0x008f)
+#define OPA_ATTRIB_ID_HFI_CONGESTION_SETTING   cpu_to_be16(0x0090)
+#define OPA_ATTRIB_ID_CONGESTION_CONTROL_TABLE cpu_to_be16(0x0091)
+
+/* OPA PMA attribute IDs */
+#define OPA_PM_ATTRIB_ID_PORT_STATUS           cpu_to_be16(0x0040)
+#define OPA_PM_ATTRIB_ID_CLEAR_PORT_STATUS     cpu_to_be16(0x0041)
+#define OPA_PM_ATTRIB_ID_DATA_PORT_COUNTERS    cpu_to_be16(0x0042)
+#define OPA_PM_ATTRIB_ID_ERROR_PORT_COUNTERS   cpu_to_be16(0x0043)
+#define OPA_PM_ATTRIB_ID_ERROR_INFO            cpu_to_be16(0x0044)
+
+/* OPA status codes */
+#define OPA_PM_STATUS_REQUEST_TOO_LARGE                cpu_to_be16(0x100)
+
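+/*
+ * portphysstate_portstate packs both fields into one byte: the physical
+ * state in the upper nibble and the logical port state in the low bits
+ * (OPA_PI_MASK_PORT_STATE).  For example, a value of 0x54 would decode
+ * to physical state LinkUp (5) and logical state Active (4).
+ */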
+static inline u8 port_states_to_logical_state(struct opa_port_states *ps)
+{
+       return ps->portphysstate_portstate & OPA_PI_MASK_PORT_STATE;
+}
+
+static inline u8 port_states_to_phys_state(struct opa_port_states *ps)
+{
+       return ((ps->portphysstate_portstate &
+                 OPA_PI_MASK_PORT_PHYSICAL_STATE) >> 4) & 0xf;
+}
+
+/*
+ * OPA port physical states
+ * IB Volume 1, Table 146 PortInfo/IB Volume 2 Section 5.4.2(1) PortPhysState
+ * values.
+ *
+ * When writing, only values 0-3 are valid, other values are ignored.
+ * When reading, 0 is reserved.
+ *
+ * Returned by the ibphys_portstate() routine.
+ */
+enum opa_port_phys_state {
+       IB_PORTPHYSSTATE_NOP = 0,
+       /* 1 is reserved */
+       IB_PORTPHYSSTATE_POLLING = 2,
+       IB_PORTPHYSSTATE_DISABLED = 3,
+       IB_PORTPHYSSTATE_TRAINING = 4,
+       IB_PORTPHYSSTATE_LINKUP = 5,
+       IB_PORTPHYSSTATE_LINK_ERROR_RECOVERY = 6,
+       IB_PORTPHYSSTATE_PHY_TEST = 7,
+       /* 8 is reserved */
+       OPA_PORTPHYSSTATE_OFFLINE = 9,
+       OPA_PORTPHYSSTATE_GANGED = 10,
+       OPA_PORTPHYSSTATE_TEST = 11,
+       OPA_PORTPHYSSTATE_MAX = 11,
+       /* values 12-15 are reserved/ignored */
+};
+
+/* OPA_PORT_TYPE_* definitions - these belong in opa_port_info.h */
+#define OPA_PORT_TYPE_UNKNOWN          0
+#define OPA_PORT_TYPE_DISCONNECTED     1
+/* port is not currently usable, CableInfo not available */
+#define OPA_PORT_TYPE_FIXED            2
+/* A fixed backplane port in a director class switch. All OPA ASICs */
+#define OPA_PORT_TYPE_VARIABLE         3
+/* A backplane port in a blade system, possibly mixed configuration */
+#define OPA_PORT_TYPE_STANDARD         4
+/* implies a SFF-8636 defined format for CableInfo (QSFP) */
+#define OPA_PORT_TYPE_SI_PHOTONICS      5
+/* A silicon photonics module implies a TBD-defined format for CableInfo,
+ * as defined by the Intel SFO group */
+/* 6 - 15 are reserved */
+
+#endif /* _OPA_COMPAT_H */
diff --git a/drivers/staging/rdma/hfi1/pcie.c b/drivers/staging/rdma/hfi1/pcie.c
new file mode 100644 (file)
index 0000000..ac5653c
--- /dev/null
@@ -0,0 +1,1253 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/pci.h>
+#include <linux/io.h>
+#include <linux/delay.h>
+#include <linux/vmalloc.h>
+#include <linux/aer.h>
+#include <linux/module.h>
+
+#include "hfi.h"
+#include "chip_registers.h"
+
+/* link speed vector for Gen3 speed - not in Linux headers */
+#define GEN1_SPEED_VECTOR 0x1
+#define GEN2_SPEED_VECTOR 0x2
+#define GEN3_SPEED_VECTOR 0x3
+
+/*
+ * This file contains PCIe utility routines.
+ */
+
+/*
+ * Code to adjust PCIe capabilities.
+ */
+static void tune_pcie_caps(struct hfi1_devdata *);
+
+/*
+ * Do all the common PCIe setup and initialization.
+ * devdata is not yet allocated, and is not allocated until after this
+ * routine returns success.  Therefore dd_dev_err() can't be used for error
+ * printing.
+ */
+int hfi1_pcie_init(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+       int ret;
+
+       ret = pci_enable_device(pdev);
+       if (ret) {
+               /*
+                * This can happen (in theory) iff:
+                * We did a chip reset, and then failed to reprogram the
+                * BAR, or the chip reset due to an internal error.  We then
+                * unloaded the driver and reloaded it.
+                *
+                * Both reset cases set the BAR back to initial state.  For
+                * the latter case, the AER sticky error bit at offset 0x718
+                * should be set, but the Linux kernel doesn't yet know
+                * about that, it appears.  If the original BAR was retained
+                * in the kernel data structures, this may be OK.
+                */
+               hfi1_early_err(&pdev->dev, "pci enable failed: error %d\n",
+                              -ret);
+               goto done;
+       }
+
+       ret = pci_request_regions(pdev, DRIVER_NAME);
+       if (ret) {
+               hfi1_early_err(&pdev->dev,
+                              "pci_request_regions fails: err %d\n", -ret);
+               goto bail;
+       }
+
+       ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+       if (ret) {
+               /*
+                * If the 64 bit setup fails, try 32 bit.  Some systems
+                * do not set up 64 bit maps when 2GB or less memory is
+                * installed.
+                */
+               ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+               if (ret) {
+                       hfi1_early_err(&pdev->dev,
+                                      "Unable to set DMA mask: %d\n", ret);
+                       goto bail;
+               }
+               ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
+       } else {
+               ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+       }
+       if (ret) {
+               hfi1_early_err(&pdev->dev,
+                              "Unable to set DMA consistent mask: %d\n", ret);
+               goto bail;
+       }
+
+       pci_set_master(pdev);
+       ret = pci_enable_pcie_error_reporting(pdev);
+       if (ret) {
+               hfi1_early_err(&pdev->dev,
+                              "Unable to enable pcie error reporting: %d\n",
+                             ret);
+               ret = 0;
+       }
+       goto done;
+
+bail:
+       hfi1_pcie_cleanup(pdev);
+done:
+       return ret;
+}
+
+/*
+ * Clean what was done in hfi1_pcie_init()
+ */
+void hfi1_pcie_cleanup(struct pci_dev *pdev)
+{
+       pci_disable_device(pdev);
+       /*
+        * Release regions should be called after the disable. OK to
+        * call if request regions has not been called or failed.
+        */
+       pci_release_regions(pdev);
+}
+
+/*
+ * Do remaining PCIe setup, once dd is allocated, and save away
+ * fields required to re-initialize after a chip reset, or for
+ * various other purposes
+ */
+int hfi1_pcie_ddinit(struct hfi1_devdata *dd, struct pci_dev *pdev,
+                    const struct pci_device_id *ent)
+{
+       unsigned long len;
+       resource_size_t addr;
+
+       dd->pcidev = pdev;
+       pci_set_drvdata(pdev, dd);
+
+       addr = pci_resource_start(pdev, 0);
+       len = pci_resource_len(pdev, 0);
+
+       /*
+        * The TXE PIO buffers are at the tail end of the chip space.
+        * Cut them off and map them separately.
+        */
+
+       /* sanity check vs expectations */
+       if (len != TXE_PIO_SEND + TXE_PIO_SIZE) {
+               dd_dev_err(dd, "chip PIO range does not match\n");
+               return -EINVAL;
+       }
+
+       dd->kregbase = ioremap_nocache(addr, TXE_PIO_SEND);
+       if (!dd->kregbase)
+               return -ENOMEM;
+
+       dd->piobase = ioremap_wc(addr + TXE_PIO_SEND, TXE_PIO_SIZE);
+       if (!dd->piobase) {
+               iounmap(dd->kregbase);
+               return -ENOMEM;
+       }
+
+       dd->flags |= HFI1_PRESENT;      /* now register routines work */
+
+       dd->kregend = dd->kregbase + TXE_PIO_SEND;
+       dd->physaddr = addr;        /* used for io_remap, etc. */
+
+       /*
+        * Re-map the chip's RcvArray as write-combining to allow us
+        * to write an entire cacheline worth of entries in one shot.
+        * If this re-map fails, just continue - the RcvArray programming
+        * function will handle both cases.
+        */
+       dd->chip_rcv_array_count = read_csr(dd, RCV_ARRAY_CNT);
+       dd->rcvarray_wc = ioremap_wc(addr + RCV_ARRAY,
+                                    dd->chip_rcv_array_count * 8);
+       dd_dev_info(dd, "WC Remapped RcvArray: %p\n", dd->rcvarray_wc);
+       /*
+        * Save BARs and command to rewrite after device reset.
+        */
+       dd->pcibar0 = addr;
+       dd->pcibar1 = addr >> 32;
+       pci_read_config_dword(dd->pcidev, PCI_ROM_ADDRESS, &dd->pci_rom);
+       pci_read_config_word(dd->pcidev, PCI_COMMAND, &dd->pci_command);
+       pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL, &dd->pcie_devctl);
+       pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL, &dd->pcie_lnkctl);
+       pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL2,
+                                                       &dd->pcie_devctl2);
+       pci_read_config_dword(dd->pcidev, PCI_CFG_MSIX0, &dd->pci_msix0);
+       pci_read_config_dword(dd->pcidev, PCIE_CFG_SPCIE1,
+                                                       &dd->pci_lnkctl3);
+       pci_read_config_dword(dd->pcidev, PCIE_CFG_TPH2, &dd->pci_tph2);
+
+       return 0;
+}
+
+/*
+ * Do PCIe cleanup related to dd, after chip-specific cleanup, etc.  Just prior
+ * to releasing the dd memory.
+ * Void because all of the core pcie cleanup functions are void.
+ */
+void hfi1_pcie_ddcleanup(struct hfi1_devdata *dd)
+{
+       u64 __iomem *base = (void __iomem *) dd->kregbase;
+
+       dd->flags &= ~HFI1_PRESENT;
+       dd->kregbase = NULL;
+       iounmap(base);
+       if (dd->rcvarray_wc)
+               iounmap(dd->rcvarray_wc);
+       if (dd->piobase)
+               iounmap(dd->piobase);
+
+       pci_set_drvdata(dd->pcidev, NULL);
+}
+
+/*
+ * Do a Function Level Reset (FLR) on the device.
+ * Based on static function drivers/pci/pci.c:pcie_flr().
+ */
+void hfi1_pcie_flr(struct hfi1_devdata *dd)
+{
+       int i;
+       u16 status;
+
+       /* no need to check for the capability - we know the device has it */
+
+       /* wait for Transaction Pending bit to clear, at most a few ms */
+       for (i = 0; i < 4; i++) {
+               if (i)
+                       msleep((1 << (i - 1)) * 100);
+
+               pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVSTA, &status);
+               if (!(status & PCI_EXP_DEVSTA_TRPND))
+                       goto clear;
+       }
+
+       dd_dev_err(dd, "Transaction Pending bit is not clearing, proceeding with reset anyway\n");
+
+clear:
+       pcie_capability_set_word(dd->pcidev, PCI_EXP_DEVCTL,
+                                               PCI_EXP_DEVCTL_BCR_FLR);
+       /* PCIe spec requires the function to be back within 100ms */
+       msleep(100);
+}
+
+static void msix_setup(struct hfi1_devdata *dd, int pos, u32 *msixcnt,
+                      struct hfi1_msix_entry *hfi1_msix_entry)
+{
+       int ret;
+       int nvec = *msixcnt;
+       struct msix_entry *msix_entry;
+       int i;
+
+       /* We can't pass the hfi1_msix_entry array to pci_enable_msix_range()
+        * directly, so use a temporary msix_entry array and copy the
+        * allocated vectors back into the hfi1_msix_entry array. */
+       msix_entry = kmalloc_array(nvec, sizeof(*msix_entry), GFP_KERNEL);
+       if (!msix_entry) {
+               ret = -ENOMEM;
+               goto do_intx;
+       }
+
+       for (i = 0; i < nvec; i++)
+               msix_entry[i] = hfi1_msix_entry[i].msix;
+
+       ret = pci_enable_msix_range(dd->pcidev, msix_entry, 1, nvec);
+       if (ret < 0)
+               goto free_msix_entry;
+       nvec = ret;
+
+       for (i = 0; i < nvec; i++)
+               hfi1_msix_entry[i].msix = msix_entry[i];
+
+       kfree(msix_entry);
+       *msixcnt = nvec;
+       return;
+
+free_msix_entry:
+       kfree(msix_entry);
+
+do_intx:
+       dd_dev_err(dd, "pci_enable_msix_range %d vectors failed: %d, falling back to INTx\n",
+                  nvec, ret);
+       *msixcnt = 0;
+       hfi1_enable_intx(dd->pcidev);
+
+}
+
+/* return the PCIe link speed from the given link status */
+static u32 extract_speed(u16 linkstat)
+{
+       u32 speed;
+
+       switch (linkstat & PCI_EXP_LNKSTA_CLS) {
+       default: /* not defined, assume Gen1 */
+       case PCI_EXP_LNKSTA_CLS_2_5GB:
+               speed = 2500; /* Gen 1, 2.5GHz */
+               break;
+       case PCI_EXP_LNKSTA_CLS_5_0GB:
+               speed = 5000; /* Gen 2, 5GHz */
+               break;
+       case GEN3_SPEED_VECTOR:
+               speed = 8000; /* Gen 3, 8GHz */
+               break;
+       }
+       return speed;
+}
+
+/* return the PCIe link width from the given link status */
+static u32 extract_width(u16 linkstat)
+{
+       return (linkstat & PCI_EXP_LNKSTA_NLW) >> PCI_EXP_LNKSTA_NLW_SHIFT;
+}
+
+/* read the link status and set dd->{lbus_width,lbus_speed,lbus_info} */
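+/* e.g. a Gen3 x16 link ends up as speed 8000, width 16, "PCIe,8000MHz,x16" */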
+static void update_lbus_info(struct hfi1_devdata *dd)
+{
+       u16 linkstat;
+
+       pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKSTA, &linkstat);
+       dd->lbus_width = extract_width(linkstat);
+       dd->lbus_speed = extract_speed(linkstat);
+       snprintf(dd->lbus_info, sizeof(dd->lbus_info),
+                "PCIe,%uMHz,x%u", dd->lbus_speed, dd->lbus_width);
+}
+
+/*
+ * Read in the current PCIe link width and speed.  Find if the link is
+ * Gen3 capable.
+ */
+int pcie_speeds(struct hfi1_devdata *dd)
+{
+       u32 linkcap;
+
+       if (!pci_is_pcie(dd->pcidev)) {
+               dd_dev_err(dd, "Can't find PCI Express capability!\n");
+               return -EINVAL;
+       }
+
+       /* find if our max speed is Gen3 and parent supports Gen3 speeds */
+       dd->link_gen3_capable = 1;
+
+       pcie_capability_read_dword(dd->pcidev, PCI_EXP_LNKCAP, &linkcap);
+       if ((linkcap & PCI_EXP_LNKCAP_SLS) != GEN3_SPEED_VECTOR) {
+               dd_dev_info(dd,
+                       "This HFI is not Gen3 capable, max speed 0x%x, need 0x3\n",
+                       linkcap & PCI_EXP_LNKCAP_SLS);
+               dd->link_gen3_capable = 0;
+       }
+
+       /*
+        * bus->max_bus_speed is set from the bridge's linkcap Max Link Speed
+        */
+       if (dd->pcidev->bus->max_bus_speed != PCIE_SPEED_8_0GT) {
+               dd_dev_info(dd, "Parent PCIe bridge does not support Gen3\n");
+               dd->link_gen3_capable = 0;
+       }
+
+       /* obtain the link width and current speed */
+       update_lbus_info(dd);
+
+       /* check against expected pcie width and complain if "wrong" */
+       if (dd->lbus_width < 16)
+               dd_dev_err(dd, "PCIe width %u (x16 HFI)\n", dd->lbus_width);
+
+       return 0;
+}
+
+/*
+ * Returns in *nent:
+ *     - actual number of interrupts allocated
+ *     - 0 if fell back to INTx.
+ */
+void request_msix(struct hfi1_devdata *dd, u32 *nent,
+                 struct hfi1_msix_entry *entry)
+{
+       int pos;
+
+       pos = dd->pcidev->msix_cap;
+       if (*nent && pos) {
+               msix_setup(dd, pos, nent, entry);
+               /* did it, either MSI-X or INTx */
+       } else {
+               *nent = 0;
+               hfi1_enable_intx(dd->pcidev);
+       }
+
+       tune_pcie_caps(dd);
+}
+
+/*
+ * Disable MSI-X.
+ */
+void hfi1_nomsix(struct hfi1_devdata *dd)
+{
+       pci_disable_msix(dd->pcidev);
+}
+
+void hfi1_enable_intx(struct pci_dev *pdev)
+{
+       /* first, turn on INTx */
+       pci_intx(pdev, 1);
+       /* then turn off MSI-X */
+       pci_disable_msix(pdev);
+}
+
+/* restore command and BARs after a reset has wiped them out */
+void restore_pci_variables(struct hfi1_devdata *dd)
+{
+       pci_write_config_word(dd->pcidev, PCI_COMMAND, dd->pci_command);
+       pci_write_config_dword(dd->pcidev,
+                               PCI_BASE_ADDRESS_0, dd->pcibar0);
+       pci_write_config_dword(dd->pcidev,
+                               PCI_BASE_ADDRESS_1, dd->pcibar1);
+       pci_write_config_dword(dd->pcidev,
+                               PCI_ROM_ADDRESS, dd->pci_rom);
+       pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL, dd->pcie_devctl);
+       pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL, dd->pcie_lnkctl);
+       pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL2,
+                                                       dd->pcie_devctl2);
+       pci_write_config_dword(dd->pcidev, PCI_CFG_MSIX0, dd->pci_msix0);
+       pci_write_config_dword(dd->pcidev, PCIE_CFG_SPCIE1,
+                                                       dd->pci_lnkctl3);
+       pci_write_config_dword(dd->pcidev, PCIE_CFG_TPH2, dd->pci_tph2);
+}
+
+/*
+ * BIOS may not set PCIe bus-utilization parameters for best performance.
+ * Check and optionally adjust them to maximize our throughput.
+ */
+static int hfi1_pcie_caps;
+module_param_named(pcie_caps, hfi1_pcie_caps, int, S_IRUGO);
+MODULE_PARM_DESC(pcie_caps, "Max PCIe tuning: Payload (0..3), ReadReq (4..7)");
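+/*
+ * The low 3 bits encode the Max Payload Size limit and bits 4..6 the
+ * Max Read Request Size limit, each as a code n selecting 128 << n
+ * bytes.  For example, a hypothetical pcie_caps=0x51 would cap payload
+ * at 256 bytes (code 1) and read requests at 4096 bytes (code 5).
+ */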
+
+static void tune_pcie_caps(struct hfi1_devdata *dd)
+{
+       struct pci_dev *parent;
+       u16 rc_mpss, rc_mps, ep_mpss, ep_mps;
+       u16 rc_mrrs, ep_mrrs, max_mrrs;
+
+       /* Find out supported and configured values for parent (root) */
+       parent = dd->pcidev->bus->self;
+       if (!pci_is_root_bus(parent->bus)) {
+               dd_dev_info(dd, "Parent not root\n");
+               return;
+       }
+
+       if (!pci_is_pcie(parent) || !pci_is_pcie(dd->pcidev))
+               return;
+       rc_mpss = parent->pcie_mpss;
+       rc_mps = ffs(pcie_get_mps(parent)) - 8;
+       /* Find out supported and configured values for endpoint (us) */
+       ep_mpss = dd->pcidev->pcie_mpss;
+       ep_mps = ffs(pcie_get_mps(dd->pcidev)) - 8;
+
+       /* Find max payload supported by root, endpoint */
+       if (rc_mpss > ep_mpss)
+               rc_mpss = ep_mpss;
+
+       /* If the supported value exceeds the module param limit, clamp it */
+       if (rc_mpss > (hfi1_pcie_caps & 7))
+               rc_mpss = hfi1_pcie_caps & 7;
+       /* If the root's payload is below the (allowed, supported) limit, bump it */
+       if (rc_mpss > rc_mps) {
+               rc_mps = rc_mpss;
+               pcie_set_mps(parent, 128 << rc_mps);
+       }
+       /* If the endpoint's payload is below the (allowed, supported) limit, bump it */
+       if (rc_mpss > ep_mps) {
+               ep_mps = rc_mpss;
+               pcie_set_mps(dd->pcidev, 128 << ep_mps);
+       }
+
+       /*
+        * Now the Read Request size.
+        * No field for max supported, but PCIe spec limits it to 4096,
+        * which is code '5' (log2(4096) - 7)
+        */
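+       /* i.e. code n selects a size of 128 << n bytes: 0 = 128B ... 5 = 4096B */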
+       max_mrrs = 5;
+       if (max_mrrs > ((hfi1_pcie_caps >> 4) & 7))
+               max_mrrs = (hfi1_pcie_caps >> 4) & 7;
+
+       max_mrrs = 128 << max_mrrs;
+       rc_mrrs = pcie_get_readrq(parent);
+       ep_mrrs = pcie_get_readrq(dd->pcidev);
+
+       if (max_mrrs > rc_mrrs) {
+               rc_mrrs = max_mrrs;
+               pcie_set_readrq(parent, rc_mrrs);
+       }
+       if (max_mrrs > ep_mrrs) {
+               ep_mrrs = max_mrrs;
+               pcie_set_readrq(dd->pcidev, ep_mrrs);
+       }
+}
+/* End of PCIe capability tuning */
+
+/*
+ * From here through hfi1_pci_err_handler definition is invoked via
+ * PCI error infrastructure, registered via pci
+ */
+static pci_ers_result_t
+pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
+{
+       struct hfi1_devdata *dd = pci_get_drvdata(pdev);
+       pci_ers_result_t ret = PCI_ERS_RESULT_RECOVERED;
+
+       switch (state) {
+       case pci_channel_io_normal:
+               dd_dev_info(dd, "State Normal, ignoring\n");
+               break;
+
+       case pci_channel_io_frozen:
+               dd_dev_info(dd, "State Frozen, requesting reset\n");
+               pci_disable_device(pdev);
+               ret = PCI_ERS_RESULT_NEED_RESET;
+               break;
+
+       case pci_channel_io_perm_failure:
+               if (dd) {
+                       dd_dev_info(dd, "State Permanent Failure, disabling\n");
+                       /* no more register accesses! */
+                       dd->flags &= ~HFI1_PRESENT;
+                       hfi1_disable_after_error(dd);
+               }
+               /* else early, or other problem */
+               ret = PCI_ERS_RESULT_DISCONNECT;
+               break;
+
+       default: /* shouldn't happen */
+               dd_dev_info(dd, "HFI1 PCI errors detected (state %d)\n",
+                           state);
+               break;
+       }
+       return ret;
+}
+
+static pci_ers_result_t
+pci_mmio_enabled(struct pci_dev *pdev)
+{
+       u64 words = 0U;
+       struct hfi1_devdata *dd = pci_get_drvdata(pdev);
+       pci_ers_result_t ret = PCI_ERS_RESULT_RECOVERED;
+
+       if (dd && dd->pport) {
+               words = read_port_cntr(dd->pport, C_RX_WORDS, CNTR_INVALID_VL);
+               if (words == ~0ULL)
+                       ret = PCI_ERS_RESULT_NEED_RESET;
+               dd_dev_info(dd,
+                           "HFI1 mmio_enabled function called, read words counter %llx, returning %d\n",
+                           words, ret);
+       }
+       return  ret;
+}
+
+static pci_ers_result_t
+pci_slot_reset(struct pci_dev *pdev)
+{
+       struct hfi1_devdata *dd = pci_get_drvdata(pdev);
+
+       dd_dev_info(dd, "HFI1 slot_reset function called, ignored\n");
+       return PCI_ERS_RESULT_CAN_RECOVER;
+}
+
+static pci_ers_result_t
+pci_link_reset(struct pci_dev *pdev)
+{
+       struct hfi1_devdata *dd = pci_get_drvdata(pdev);
+
+       dd_dev_info(dd, "HFI1 link_reset function called, ignored\n");
+       return PCI_ERS_RESULT_CAN_RECOVER;
+}
+
+static void
+pci_resume(struct pci_dev *pdev)
+{
+       struct hfi1_devdata *dd = pci_get_drvdata(pdev);
+
+       dd_dev_info(dd, "HFI1 resume function called\n");
+       pci_cleanup_aer_uncorrect_error_status(pdev);
+       /*
+        * Running jobs will fail, since the reset is asynchronous,
+        * unlike a sysfs-requested reset.  Still better than
+        * doing nothing.
+        */
+       hfi1_init(dd, 1); /* same as re-init after reset */
+}
+
+const struct pci_error_handlers hfi1_pci_err_handler = {
+       .error_detected = pci_error_detected,
+       .mmio_enabled = pci_mmio_enabled,
+       .link_reset = pci_link_reset,
+       .slot_reset = pci_slot_reset,
+       .resume = pci_resume,
+};
+
+/*============================================================================*/
+/* PCIe Gen3 support */
+
+/*
+ * This code is separated out because it is expected to be removed in the
+ * final shipping product.  If not, then it will be revisited and items
+ * will be moved to more standard locations.
+ */
+
+/* ASIC_PCI_SD_HOST_STATUS.FW_DNLD_STS field values */
+#define DL_STATUS_HFI0 0x1     /* hfi0 firmware download complete */
+#define DL_STATUS_HFI1 0x2     /* hfi1 firmware download complete */
+#define DL_STATUS_BOTH 0x3     /* hfi0 and hfi1 firmware download complete */
+
+/* ASIC_PCI_SD_HOST_STATUS.FW_DNLD_ERR field values */
+#define DL_ERR_NONE            0x0     /* no error */
+#define DL_ERR_SWAP_PARITY     0x1     /* parity error in SerDes interrupt */
+                                       /*   or response data */
+#define DL_ERR_DISABLED        0x2     /* hfi disabled */
+#define DL_ERR_SECURITY        0x3     /* security check failed */
+#define DL_ERR_SBUS            0x4     /* SBus status error */
+#define DL_ERR_XFR_PARITY      0x5     /* parity error during ROM transfer */
+
+/* gasket block secondary bus reset delay */
+#define SBR_DELAY_US 200000    /* 200ms */
+
+/* mask for PCIe capability register lnkctl2 target link speed */
+#define LNKCTL2_TARGET_LINK_SPEED_MASK 0xf
+
+static uint pcie_target = 3;
+module_param(pcie_target, uint, S_IRUGO);
+MODULE_PARM_DESC(pcie_target, "PCIe target speed (0 skip, 1-3 Gen1-3)");
+
+static uint pcie_force;
+module_param(pcie_force, uint, S_IRUGO);
+MODULE_PARM_DESC(pcie_force, "Force driver to do a PCIe firmware download even if already at target speed");
+
+static uint pcie_retry = 5;
+module_param(pcie_retry, uint, S_IRUGO);
+MODULE_PARM_DESC(pcie_retry, "Driver will try this many times to reach requested speed");
+
+#define UNSET_PSET 255
+#define DEFAULT_DISCRETE_PSET 2        /* discrete HFI */
+#define DEFAULT_MCP_PSET 4     /* MCP HFI */
+static uint pcie_pset = UNSET_PSET;
+module_param(pcie_pset, uint, S_IRUGO);
+MODULE_PARM_DESC(pcie_pset, "PCIe Eq Pset value to use, range is 0-10");
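+/*
+ * The selected pset n is requested by setting bit n of the
+ * Gen3EqPsetReqVec field written to PcieCfgRegPl106 below; e.g.
+ * pcie_pset = 4 requests preset P4 via bit value 0x10.
+ */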
+
+/* equalization columns */
+#define PREC 0
+#define ATTN 1
+#define POST 2
+
+/* discrete silicon preliminary equalization values */
+static const u8 discrete_preliminary_eq[11][3] = {
+       /* prec   attn   post */
+       {  0x00,  0x00,  0x12 },        /* p0 */
+       {  0x00,  0x00,  0x0c },        /* p1 */
+       {  0x00,  0x00,  0x0f },        /* p2 */
+       {  0x00,  0x00,  0x09 },        /* p3 */
+       {  0x00,  0x00,  0x00 },        /* p4 */
+       {  0x06,  0x00,  0x00 },        /* p5 */
+       {  0x09,  0x00,  0x00 },        /* p6 */
+       {  0x06,  0x00,  0x0f },        /* p7 */
+       {  0x09,  0x00,  0x09 },        /* p8 */
+       {  0x0c,  0x00,  0x00 },        /* p9 */
+       {  0x00,  0x00,  0x18 },        /* p10 */
+};
+
+/* integrated silicon preliminary equalization values */
+static const u8 integrated_preliminary_eq[11][3] = {
+       /* prec   attn   post */
+       {  0x00,  0x1e,  0x07 },        /* p0 */
+       {  0x00,  0x1e,  0x05 },        /* p1 */
+       {  0x00,  0x1e,  0x06 },        /* p2 */
+       {  0x00,  0x1e,  0x04 },        /* p3 */
+       {  0x00,  0x1e,  0x00 },        /* p4 */
+       {  0x03,  0x1e,  0x00 },        /* p5 */
+       {  0x04,  0x1e,  0x00 },        /* p6 */
+       {  0x03,  0x1e,  0x06 },        /* p7 */
+       {  0x03,  0x1e,  0x04 },        /* p8 */
+       {  0x05,  0x1e,  0x00 },        /* p9 */
+       {  0x00,  0x1e,  0x0a },        /* p10 */
+};
+
+/* helper to format the value to write to hardware */
+#define eq_value(pre, curr, post) \
+       ((((u32)(pre)) << \
+                       PCIE_CFG_REG_PL102_GEN3_EQ_PRE_CURSOR_PSET_SHIFT) \
+       | (((u32)(curr)) << PCIE_CFG_REG_PL102_GEN3_EQ_CURSOR_PSET_SHIFT) \
+       | (((u32)(post)) << \
+               PCIE_CFG_REG_PL102_GEN3_EQ_POST_CURSOR_PSET_SHIFT))
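+
+/*
+ * For example, with the discrete preliminary table above and the values
+ * used for a discrete HFI (fs = 24, div = 3), preset p0 = {0x00, 0x00, 0x12}
+ * becomes c-1 = 0, c+1 = 0x12/3 = 6 and c0 = 24 - 0 - 6 = 18 before being
+ * packed by eq_value() and written to PcieCfgRegPl102 in load_eq_table()
+ * below.
+ */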
+
+/*
+ * Load the given EQ preset table into the PCIe hardware.
+ */
+static int load_eq_table(struct hfi1_devdata *dd, const u8 eq[11][3], u8 fs,
+                        u8 div)
+{
+       struct pci_dev *pdev = dd->pcidev;
+       u32 hit_error = 0;
+       u32 violation;
+       u32 i;
+       u8 c_minus1, c0, c_plus1;
+
+       for (i = 0; i < 11; i++) {
+               /* set index */
+               pci_write_config_dword(pdev, PCIE_CFG_REG_PL103, i);
+               /* write the value */
+               c_minus1 = eq[i][PREC] / div;
+               c0 = fs - (eq[i][PREC] / div) - (eq[i][POST] / div);
+               c_plus1 = eq[i][POST] / div;
+               pci_write_config_dword(pdev, PCIE_CFG_REG_PL102,
+                       eq_value(c_minus1, c0, c_plus1));
+               /* check if these coefficients violate EQ rules */
+               pci_read_config_dword(dd->pcidev, PCIE_CFG_REG_PL105,
+                                                               &violation);
+               if (violation
+                   & PCIE_CFG_REG_PL105_GEN3_EQ_VIOLATE_COEF_RULES_SMASK) {
+                       if (hit_error == 0) {
+                               dd_dev_err(dd,
+                                       "Gen3 EQ Table Coefficient rule violations\n");
+                               dd_dev_err(dd, "         prec   attn   post\n");
+                       }
+                       dd_dev_err(dd, "   p%02d:   %02x     %02x     %02x\n",
+                               i, (u32)eq[i][0], (u32)eq[i][1], (u32)eq[i][2]);
+                       dd_dev_err(dd, "            %02x     %02x     %02x\n",
+                               (u32)c_minus1, (u32)c0, (u32)c_plus1);
+                       hit_error = 1;
+               }
+       }
+       if (hit_error)
+               return -EINVAL;
+       return 0;
+}
+
+/*
+ * Steps to be done after the PCIe firmware is downloaded and
+ * before the SBR for PCIe Gen3.
+ * The hardware mutex is already being held.
+ */
+static void pcie_post_steps(struct hfi1_devdata *dd)
+{
+       int i;
+
+       set_sbus_fast_mode(dd);
+       /*
+        * Write to the PCIe PCSes to set the G3_LOCKED_NEXT bits to 1.
+        * This avoids a spurious framing error that can otherwise be
+        * generated by the MAC layer.
+        *
+        * Use individual addresses since no broadcast is set up.
+        */
+       for (i = 0; i < NUM_PCIE_SERDES; i++) {
+               sbus_request(dd, pcie_pcs_addrs[dd->hfi1_id][i],
+                            0x03, WRITE_SBUS_RECEIVER, 0x00022132);
+       }
+
+       clear_sbus_fast_mode(dd);
+}
+
+/*
+ * Trigger a secondary bus reset (SBR) on ourselves using our parent.
+ *
+ * Based on pci_parent_bus_reset() which is not exported by the
+ * kernel core.
+ */
+static int trigger_sbr(struct hfi1_devdata *dd)
+{
+       struct pci_dev *dev = dd->pcidev;
+       struct pci_dev *pdev;
+
+       /* need a parent */
+       if (!dev->bus->self) {
+               dd_dev_err(dd, "%s: no parent device\n", __func__);
+               return -ENOTTY;
+       }
+
+       /* should not be anyone else on the bus */
+       list_for_each_entry(pdev, &dev->bus->devices, bus_list)
+               if (pdev != dev) {
+                       dd_dev_err(dd,
+                               "%s: another device is on the same bus\n",
+                               __func__);
+                       return -ENOTTY;
+               }
+
+       /*
+        * A secondary bus reset (SBR) issues a hot reset to our device.
+        * The following routine does a 1s wait after the reset is dropped
+        * per PCI Trhfa (recovery time).  PCIe 3.0 section 6.6.1 -
+        * Conventional Reset, paragraph 3, line 35 also says that a 1s
+        * delay after a reset is required.  Per spec requirements,
+        * the link is either working or not after that point.
+        */
+       pci_reset_bridge_secondary_bus(dev->bus->self);
+
+       return 0;
+}
+
+/*
+ * Write the given gasket interrupt register.
+ */
+static void write_gasket_interrupt(struct hfi1_devdata *dd, int index,
+                                  u16 code, u16 data)
+{
+       write_csr(dd, ASIC_PCIE_SD_INTRPT_LIST + (index * 8),
+           (((u64)code << ASIC_PCIE_SD_INTRPT_LIST_INTRPT_CODE_SHIFT)
+           |((u64)data << ASIC_PCIE_SD_INTRPT_LIST_INTRPT_DATA_SHIFT)));
+}
+
+/*
+ * Tell the gasket logic how to react to the reset.
+ */
+static void arm_gasket_logic(struct hfi1_devdata *dd)
+{
+       u64 reg;
+
+       reg = (((u64)1 << dd->hfi1_id)
+                       << ASIC_PCIE_SD_HOST_CMD_INTRPT_CMD_SHIFT)
+               | ((u64)pcie_serdes_broadcast[dd->hfi1_id]
+                       << ASIC_PCIE_SD_HOST_CMD_SBUS_RCVR_ADDR_SHIFT
+               | ASIC_PCIE_SD_HOST_CMD_SBR_MODE_SMASK
+               | ((u64)SBR_DELAY_US & ASIC_PCIE_SD_HOST_CMD_TIMER_MASK)
+                       << ASIC_PCIE_SD_HOST_CMD_TIMER_SHIFT
+               );
+       write_csr(dd, ASIC_PCIE_SD_HOST_CMD, reg);
+       /* read back to push the write */
+       read_csr(dd, ASIC_PCIE_SD_HOST_CMD);
+}
+
+/*
+ * Do all the steps needed to transition the PCIe link to Gen3 speed.
+ */
+int do_pcie_gen3_transition(struct hfi1_devdata *dd)
+{
+       struct pci_dev *parent;
+       u64 fw_ctrl;
+       u64 reg, therm;
+       u32 reg32, fs, lf;
+       u32 status, err;
+       int ret;
+       int do_retry, retry_count = 0;
+       uint default_pset;
+       u16 target_vector, target_speed;
+       u16 lnkctl, lnkctl2, vendor;
+       u8 nsbr = 1;
+       u8 div;
+       const u8 (*eq)[3];
+       int return_error = 0;
+
+       /* PCIe Gen3 is for the ASIC only */
+       if (dd->icode != ICODE_RTL_SILICON)
+               return 0;
+
+       if (pcie_target == 1) {                 /* target Gen1 */
+               target_vector = GEN1_SPEED_VECTOR;
+               target_speed = 2500;
+       } else if (pcie_target == 2) {          /* target Gen2 */
+               target_vector = GEN2_SPEED_VECTOR;
+               target_speed = 5000;
+       } else if (pcie_target == 3) {          /* target Gen3 */
+               target_vector = GEN3_SPEED_VECTOR;
+               target_speed = 8000;
+       } else {
+               /* off or invalid target - skip */
+               dd_dev_info(dd, "%s: Skipping PCIe transition\n", __func__);
+               return 0;
+       }
+
+       /* if already at target speed, done (unless forced) */
+       if (dd->lbus_speed == target_speed) {
+               dd_dev_info(dd, "%s: PCIe already at gen%d, %s\n", __func__,
+                       pcie_target,
+                       pcie_force ? "re-doing anyway" : "skipping");
+               if (!pcie_force)
+                       return 0;
+       }
+
+       /*
+        * A0 needs an additional SBR
+        */
+       if (is_a0(dd))
+               nsbr++;
+
+       /*
+        * Do the Gen3 transition.  Steps are those of the PCIe Gen3
+        * recipe.
+        */
+
+       /* step 1: pcie link working in gen1/gen2 */
+
+       /* step 2: if either side is not capable of Gen3, done */
+       if (pcie_target == 3 && !dd->link_gen3_capable) {
+               dd_dev_err(dd, "The PCIe link is not Gen3 capable\n");
+               ret = -ENOSYS;
+               goto done_no_mutex;
+       }
+
+       /* hold the HW mutex across the firmware download and SBR */
+       ret = acquire_hw_mutex(dd);
+       if (ret)
+               return ret;
+
+       /* make sure thermal polling is not causing interrupts */
+       therm = read_csr(dd, ASIC_CFG_THERM_POLL_EN);
+       if (therm) {
+               write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x0);
+               msleep(100);
+               dd_dev_info(dd, "%s: Disabled therm polling\n",
+                           __func__);
+       }
+
+       /* step 3: download SBus Master firmware */
+       /* step 4: download PCIe Gen3 SerDes firmware */
+retry:
+       dd_dev_info(dd, "%s: downloading firmware\n", __func__);
+       ret = load_pcie_firmware(dd);
+       if (ret)
+               goto done;
+
+       /* step 5: set up device parameter settings */
+       dd_dev_info(dd, "%s: setting PCIe registers\n", __func__);
+
+       /*
+        * PcieCfgSpcie1 - Link Control 3
+        * Leave at reset value.  No need to set PerfEq - link equalization
+        * will be performed automatically after the SBR when the target
+        * speed is 8GT/s.
+        */
+
+       /* clear all 16 per-lane error bits (PCIe: Lane Error Status) */
+       pci_write_config_dword(dd->pcidev, PCIE_CFG_SPCIE2, 0xffff);
+
+       /* step 5a: Set Synopsys Port Logic registers */
+
+       /*
+        * PcieCfgRegPl2 - Port Force Link
+        *
+        * Set the low power field to 0x10 to avoid unnecessary power
+        * management messages.  All other fields are zero.
+        */
+       reg32 = 0x10ul << PCIE_CFG_REG_PL2_LOW_PWR_ENT_CNT_SHIFT;
+       pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL2, reg32);
+
+       /*
+        * PcieCfgRegPl100 - Gen3 Control
+        *
+        * turn off PcieCfgRegPl100.Gen3ZRxDcNonCompl
+        * turn on PcieCfgRegPl100.EqEieosCnt (erratum)
+        * Everything else zero.
+        */
+       reg32 = PCIE_CFG_REG_PL100_EQ_EIEOS_CNT_SMASK;
+       pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL100, reg32);
+
+       /*
+        * PcieCfgRegPl101 - Gen3 EQ FS and LF
+        * PcieCfgRegPl102 - Gen3 EQ Presets to Coefficients Mapping
+        * PcieCfgRegPl103 - Gen3 EQ Preset Index
+        * PcieCfgRegPl105 - Gen3 EQ Status
+        *
+        * Give initial EQ settings.
+        */
+       if (dd->pcidev->device == PCI_DEVICE_ID_INTEL0) { /* discrete */
+               /* 1000mV, FS=24, LF = 8 */
+               fs = 24;
+               lf = 8;
+               div = 3;
+               eq = discrete_preliminary_eq;
+               default_pset = DEFAULT_DISCRETE_PSET;
+       } else {
+               /* 400mV, FS=29, LF = 9 */
+               fs = 29;
+               lf = 9;
+               div = 1;
+               eq = integrated_preliminary_eq;
+               default_pset = DEFAULT_MCP_PSET;
+       }
+       pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL101,
+               (fs << PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_FS_SHIFT)
+               | (lf << PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_LF_SHIFT));
+       ret = load_eq_table(dd, eq, fs, div);
+       if (ret)
+               goto done;
+
+       /*
+        * PcieCfgRegPl106 - Gen3 EQ Control
+        *
+        * Set Gen3EqPsetReqVec, leave other fields 0.
+        */
+       if (pcie_pset == UNSET_PSET)
+               pcie_pset = default_pset;
+       if (pcie_pset > 10) {   /* valid range is 0-10, inclusive */
+               dd_dev_err(dd, "%s: Invalid Eq Pset %u, setting to %d\n",
+                       __func__, pcie_pset, default_pset);
+               pcie_pset = default_pset;
+       }
+       dd_dev_info(dd, "%s: using EQ Pset %u\n", __func__, pcie_pset);
+       pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL106,
+               ((1 << pcie_pset)
+                       << PCIE_CFG_REG_PL106_GEN3_EQ_PSET_REQ_VEC_SHIFT)
+               | PCIE_CFG_REG_PL106_GEN3_EQ_EVAL2MS_DISABLE_SMASK
+               | PCIE_CFG_REG_PL106_GEN3_EQ_PHASE23_EXIT_MODE_SMASK);
+
+       /*
+        * step 5b: Do post firmware download steps via SBus
+        */
+       dd_dev_info(dd, "%s: doing pcie post steps\n", __func__);
+       pcie_post_steps(dd);
+
+       /*
+        * step 5c: Program gasket interrupts
+        */
+       /* set the Rx Bit Rate to REFCLK ratio */
+       write_gasket_interrupt(dd, 0, 0x0006, 0x0050);
+       /* disable pCal for PCIe Gen3 RX equalization */
+       write_gasket_interrupt(dd, 1, 0x0026, 0x5b01);
+       /*
+        * Enable iCal for PCIe Gen3 RX equalization, and set which
+        * evaluation of RX_EQ_EVAL will launch the iCal procedure.
+        */
+       write_gasket_interrupt(dd, 2, 0x0026, 0x5202);
+       /* terminate list */
+       write_gasket_interrupt(dd, 3, 0x0000, 0x0000);
+
+       /*
+        * step 5d: program XMT margin
+        * Right now, leave the default alone.  To change, do a
+        * read-modify-write of:
+        *      CcePcieCtrl.XmtMargin
+        *      CcePcieCtrl.XmitMarginOverwriteEnable
+        */
+
+       /* step 5e: disable active state power management (ASPM) */
+       dd_dev_info(dd, "%s: clearing ASPM\n", __func__);
+       pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL, &lnkctl);
+       lnkctl &= ~PCI_EXP_LNKCTL_ASPMC;
+       pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL, lnkctl);
+
+       /*
+        * step 5f: clear DirectSpeedChange
+        * PcieCfgRegPl67.DirectSpeedChange must be zero to prevent the
+        * change in the speed target from starting before we are ready.
+        * This field defaults to 0 and we are not changing it, so nothing
+        * needs to be done.
+        */
+
+       /* step 5g: Set target link speed */
+       /*
+        * Set target link speed to be target on both device and parent.
+        * On setting the parent: Some system BIOSs "helpfully" set the
+        * parent target speed to Gen2 to match the ASIC's initial speed.
+        * We can set the target Gen3 because we have already checked
+        * that it is Gen3 capable earlier.
+        */
+       dd_dev_info(dd, "%s: setting parent target link speed\n", __func__);
+       parent = dd->pcidev->bus->self;
+       pcie_capability_read_word(parent, PCI_EXP_LNKCTL2, &lnkctl2);
+       dd_dev_info(dd, "%s: ..old link control2: 0x%x\n", __func__,
+               (u32)lnkctl2);
+       /* only write to parent if target is not as high as ours */
+       if ((lnkctl2 & LNKCTL2_TARGET_LINK_SPEED_MASK) < target_vector) {
+               lnkctl2 &= ~LNKCTL2_TARGET_LINK_SPEED_MASK;
+               lnkctl2 |= target_vector;
+               dd_dev_info(dd, "%s: ..new link control2: 0x%x\n", __func__,
+                       (u32)lnkctl2);
+               pcie_capability_write_word(parent, PCI_EXP_LNKCTL2, lnkctl2);
+       } else {
+               dd_dev_info(dd, "%s: ..target speed is OK\n", __func__);
+       }
+
+       dd_dev_info(dd, "%s: setting target link speed\n", __func__);
+       pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL2, &lnkctl2);
+       dd_dev_info(dd, "%s: ..old link control2: 0x%x\n", __func__,
+               (u32)lnkctl2);
+       lnkctl2 &= ~LNKCTL2_TARGET_LINK_SPEED_MASK;
+       lnkctl2 |= target_vector;
+       dd_dev_info(dd, "%s: ..new link control2: 0x%x\n", __func__,
+               (u32)lnkctl2);
+       pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL2, lnkctl2);
+
+       /* step 5h: arm gasket logic */
+       /* hold DC in reset across the SBR */
+       write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_DC_RESET_SMASK);
+       (void) read_csr(dd, CCE_DC_CTRL); /* DC reset hold */
+       /* save firmware control across the SBR */
+       fw_ctrl = read_csr(dd, MISC_CFG_FW_CTRL);
+
+       dd_dev_info(dd, "%s: arming gasket logic\n", __func__);
+       arm_gasket_logic(dd);
+
+       /*
+        * step 6: quiesce PCIe link
+        * The chip has already been reset, so there will be no traffic
+        * from the chip.  Linux has no easy way to enforce that it will
+        * not try to access the device, so we just need to hope it doesn't
+        * do it while we are doing the reset.
+        */
+
+       /*
+        * step 7: initiate the secondary bus reset (SBR)
+        * step 8: hardware brings the links back up
+        * step 9: wait for link speed transition to be complete
+        */
+       dd_dev_info(dd, "%s: calling trigger_sbr\n", __func__);
+       ret = trigger_sbr(dd);
+       if (ret)
+               goto done;
+
+       /* step 10: decide what to do next */
+
+       /* check if we can read PCI space */
+       ret = pci_read_config_word(dd->pcidev, PCI_VENDOR_ID, &vendor);
+       if (ret) {
+               dd_dev_info(dd,
+                       "%s: read of VendorID failed after SBR, err %d\n",
+                       __func__, ret);
+               return_error = 1;
+               goto done;
+       }
+       if (vendor == 0xffff) {
+               dd_dev_info(dd, "%s: VendorID is all 1s after SBR\n", __func__);
+               return_error = 1;
+               ret = -EIO;
+               goto done;
+       }
+
+       /* restore PCI space registers we know were reset */
+       dd_dev_info(dd, "%s: calling restore_pci_variables\n", __func__);
+       restore_pci_variables(dd);
+       /* restore firmware control */
+       write_csr(dd, MISC_CFG_FW_CTRL, fw_ctrl);
+
+       /*
+        * Check the gasket block status.
+        *
+        * This is the first CSR read after the SBR.  If the read returns
+        * all 1s (fails), the link did not make it back.
+        *
+        * Once we're sure we can read and write, clear the DC reset after
+        * the SBR.  Then check for any per-lane errors. Then look over
+        * the status.
+        */
+       reg = read_csr(dd, ASIC_PCIE_SD_HOST_STATUS);
+       dd_dev_info(dd, "%s: gasket block status: 0x%llx\n", __func__, reg);
+       if (reg == ~0ull) {     /* PCIe read failed/timeout */
+               dd_dev_err(dd, "SBR failed - unable to read from device\n");
+               return_error = 1;
+               ret = -ENOSYS;
+               goto done;
+       }
+
+       /* clear the DC reset */
+       write_csr(dd, CCE_DC_CTRL, 0);
+       /* Set the LED off */
+       if (is_a0(dd))
+               setextled(dd, 0);
+
+       /* check for any per-lane errors */
+       pci_read_config_dword(dd->pcidev, PCIE_CFG_SPCIE2, &reg32);
+       dd_dev_info(dd, "%s: per-lane errors: 0x%x\n", __func__, reg32);
+
+       /* extract status, look for our HFI */
+       status = (reg >> ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_SHIFT)
+                       & ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_MASK;
+       if ((status & (1 << dd->hfi1_id)) == 0) {
+               dd_dev_err(dd,
+                       "%s: gasket status 0x%x, expecting 0x%x\n",
+                       __func__, status, 1 << dd->hfi1_id);
+               ret = -EIO;
+               goto done;
+       }
+
+       /* extract error */
+       err = (reg >> ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_SHIFT)
+               & ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_MASK;
+       if (err) {
+               dd_dev_err(dd, "%s: gasket error %d\n", __func__, err);
+               ret = -EIO;
+               goto done;
+       }
+
+       /* update our link information cache */
+       update_lbus_info(dd);
+       dd_dev_info(dd, "%s: new speed and width: %s\n", __func__,
+               dd->lbus_info);
+
+       if (dd->lbus_speed != target_speed) { /* not target */
+               /* maybe retry */
+               do_retry = retry_count < pcie_retry;
+               dd_dev_err(dd, "PCIe link speed did not switch to Gen%d%s\n",
+                       pcie_target, do_retry ? ", retrying" : "");
+               retry_count++;
+               if (do_retry) {
+                       msleep(100); /* allow time to settle */
+                       goto retry;
+               }
+               ret = -EIO;
+       }
+
+done:
+       if (therm) {
+               write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x1);
+               msleep(100);
+               dd_dev_info(dd, "%s: Re-enable therm polling\n",
+                           __func__);
+       }
+       release_hw_mutex(dd);
+done_no_mutex:
+       /* return no error if it is OK to be at current speed */
+       if (ret && !return_error) {
+               dd_dev_err(dd, "Proceeding at current PCIe speed\n");
+               ret = 0;
+       }
+
+       dd_dev_info(dd, "%s: done\n", __func__);
+       return ret;
+}
diff --git a/drivers/staging/rdma/hfi1/pio.c b/drivers/staging/rdma/hfi1/pio.c
new file mode 100644 (file)
index 0000000..9991814
--- /dev/null
@@ -0,0 +1,1771 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/delay.h>
+#include "hfi.h"
+#include "qp.h"
+#include "trace.h"
+
+#define SC_CTXT_PACKET_EGRESS_TIMEOUT 350 /* in chip cycles */
+
+#define SC(name) SEND_CTXT_##name
+/*
+ * Send Context functions
+ */
+static void sc_wait_for_packet_egress(struct send_context *sc, int pause);
+
+/*
+ * Set the CM reset bit and wait for it to clear.  Use the provided
+ * sendctrl register.  This routine has no locking.
+ */
+void __cm_reset(struct hfi1_devdata *dd, u64 sendctrl)
+{
+       write_csr(dd, SEND_CTRL, sendctrl | SEND_CTRL_CM_RESET_SMASK);
+       while (1) {
+               udelay(1);
+               sendctrl = read_csr(dd, SEND_CTRL);
+               if ((sendctrl & SEND_CTRL_CM_RESET_SMASK) == 0)
+                       break;
+       }
+}
+
+/* defined in header release 48 and higher */
+#ifndef SEND_CTRL_UNSUPPORTED_VL_SHIFT
+#define SEND_CTRL_UNSUPPORTED_VL_SHIFT 3
+#define SEND_CTRL_UNSUPPORTED_VL_MASK 0xffull
+#define SEND_CTRL_UNSUPPORTED_VL_SMASK (SEND_CTRL_UNSUPPORTED_VL_MASK \
+               << SEND_CTRL_UNSUPPORTED_VL_SHIFT)
+#endif
+
+/* global control of PIO send */
+void pio_send_control(struct hfi1_devdata *dd, int op)
+{
+       u64 reg, mask;
+       unsigned long flags;
+       int write = 1;  /* write sendctrl back */
+       int flush = 0;  /* re-read sendctrl to make sure it is flushed */
+
+       spin_lock_irqsave(&dd->sendctrl_lock, flags);
+
+       reg = read_csr(dd, SEND_CTRL);
+       switch (op) {
+       case PSC_GLOBAL_ENABLE:
+               reg |= SEND_CTRL_SEND_ENABLE_SMASK;
+       /* Fall through */
+       case PSC_DATA_VL_ENABLE:
+               /* Disallow sending on VLs not enabled */
+               mask = (((~0ull) << num_vls) & SEND_CTRL_UNSUPPORTED_VL_MASK) <<
+                               SEND_CTRL_UNSUPPORTED_VL_SHIFT;
+               reg = (reg & ~SEND_CTRL_UNSUPPORTED_VL_SMASK) | mask;
+               break;
+       case PSC_GLOBAL_DISABLE:
+               reg &= ~SEND_CTRL_SEND_ENABLE_SMASK;
+               break;
+       case PSC_GLOBAL_VLARB_ENABLE:
+               reg |= SEND_CTRL_VL_ARBITER_ENABLE_SMASK;
+               break;
+       case PSC_GLOBAL_VLARB_DISABLE:
+               reg &= ~SEND_CTRL_VL_ARBITER_ENABLE_SMASK;
+               break;
+       case PSC_CM_RESET:
+               __cm_reset(dd, reg);
+               write = 0; /* CSR already written (and flushed) */
+               break;
+       case PSC_DATA_VL_DISABLE:
+               reg |= SEND_CTRL_UNSUPPORTED_VL_SMASK;
+               flush = 1;
+               break;
+       default:
+               dd_dev_err(dd, "%s: invalid control %d\n", __func__, op);
+               break;
+       }
+
+       if (write) {
+               write_csr(dd, SEND_CTRL, reg);
+               if (flush)
+                       (void) read_csr(dd, SEND_CTRL); /* flush write */
+       }
+
+       spin_unlock_irqrestore(&dd->sendctrl_lock, flags);
+}
+
+/* number of send context memory pools */
+#define NUM_SC_POOLS 2
+
+/* Send Context Size (SCS) wildcards */
+#define SCS_POOL_0 -1
+#define SCS_POOL_1 -2
+/* Send Context Count (SCC) wildcards */
+#define SCC_PER_VL -1
+#define SCC_PER_CPU  -2
+
+#define SCC_PER_KRCVQ  -3
+#define SCC_ACK_CREDITS  32
+
+#define PIO_WAIT_BATCH_SIZE 5
+
+/* default send context sizes */
+static struct sc_config_sizes sc_config_sizes[SC_MAX] = {
+       [SC_KERNEL] = { .size  = SCS_POOL_0,    /* even divide, pool 0 */
+                       .count = SCC_PER_VL },/* one per NUMA */
+       [SC_ACK]    = { .size  = SCC_ACK_CREDITS,
+                       .count = SCC_PER_KRCVQ },
+       [SC_USER]   = { .size  = SCS_POOL_0,    /* even divide, pool 0 */
+                       .count = SCC_PER_CPU }, /* one per CPU */
+
+};
+
+/* send context memory pool configuration */
+struct mem_pool_config {
+       int centipercent;       /* % of memory, in 100ths of 1% */
+       int absolute_blocks;    /* absolute block count */
+};
+
+/* default memory pool configuration: 100% in pool 0 */
+static struct mem_pool_config sc_mem_pool_config[NUM_SC_POOLS] = {
+       /* centi%, abs blocks */
+       {  10000,     -1 },             /* pool 0 */
+       {      0,     -1 },             /* pool 1 */
+};
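+
+/* e.g. 10000 centipercent = 100%; 2500 would be 25% of PIO send memory */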
+
+/* memory pool information, used when calculating final sizes */
+struct mem_pool_info {
+       int centipercent;       /* 100th of 1% of memory to use, -1 if blocks
+                                  already set */
+       int count;              /* count of contexts in the pool */
+       int blocks;             /* block size of the pool */
+       int size;               /* context size, in blocks */
+};
+
+/*
+ * Convert a pool wildcard to a valid pool index.  The wildcards
+ * start at -1 and increase negatively.  Map them as:
+ *     -1 => 0
+ *     -2 => 1
+ *     etc.
+ *
+ * Return -1 on non-wildcard input, otherwise convert to a pool number.
+ */
+static int wildcard_to_pool(int wc)
+{
+       if (wc >= 0)
+               return -1;      /* non-wildcard */
+       return -wc - 1;
+}
+
+static const char *sc_type_names[SC_MAX] = {
+       "kernel",
+       "ack",
+       "user"
+};
+
+static const char *sc_type_name(int index)
+{
+       if (index < 0 || index >= SC_MAX)
+               return "unknown";
+       return sc_type_names[index];
+}
+
+/*
+ * Read the send context memory pool configuration and send context
+ * size configuration.  Replace any wildcards and come up with final
+ * counts and sizes for the send context types.
+ */
+int init_sc_pools_and_sizes(struct hfi1_devdata *dd)
+{
+       struct mem_pool_info mem_pool_info[NUM_SC_POOLS] = { { 0 } };
+       int total_blocks = (dd->chip_pio_mem_size / PIO_BLOCK_SIZE) - 1;
+       int total_contexts = 0;
+       int fixed_blocks;
+       int pool_blocks;
+       int used_blocks;
+       int cp_total;           /* centipercent total */
+       int ab_total;           /* absolute block total */
+       int extra;
+       int i;
+
+       /*
+        * Step 0:
+        *      - copy the centipercents/absolute sizes from the pool config
+        *      - sanity check these values
+        *      - add up centipercents, then later check for full value
+        *      - add up absolute blocks, then later check for over-commit
+        */
+       cp_total = 0;
+       ab_total = 0;
+       for (i = 0; i < NUM_SC_POOLS; i++) {
+               int cp = sc_mem_pool_config[i].centipercent;
+               int ab = sc_mem_pool_config[i].absolute_blocks;
+
+               /*
+                * A negative value is "unused" or "invalid".  Both *can*
+                * be valid, but centipercent wins, so check that first
+                */
+               if (cp >= 0) {                  /* centipercent valid */
+                       cp_total += cp;
+               } else if (ab >= 0) {           /* absolute blocks valid */
+                       ab_total += ab;
+               } else {                        /* neither valid */
+                       dd_dev_err(
+                               dd,
+                               "Send context memory pool %d: both the block count and centipercent are invalid\n",
+                               i);
+                       return -EINVAL;
+               }
+
+               mem_pool_info[i].centipercent = cp;
+               mem_pool_info[i].blocks = ab;
+       }
+
+       /* do not use both % and absolute blocks for different pools */
+       if (cp_total != 0 && ab_total != 0) {
+               dd_dev_err(
+                       dd,
+                       "All send context memory pools must be described as either centipercent or blocks, no mixing between pools\n");
+               return -EINVAL;
+       }
+
+       /* if any percentages are present, they must add up to 100% x 100 */
+       if (cp_total != 0 && cp_total != 10000) {
+               dd_dev_err(
+                       dd,
+                       "Send context memory pool centipercent is %d, expecting 10000\n",
+                       cp_total);
+               return -EINVAL;
+       }
+
+       /* the absolute pool total cannot be more than the mem total */
+       if (ab_total > total_blocks) {
+               dd_dev_err(
+                       dd,
+                       "Send context memory pool absolute block count %d is larger than the memory size %d\n",
+                       ab_total, total_blocks);
+               return -EINVAL;
+       }
+
+       /*
+        * Step 2:
+        *      - copy from the context size config
+        *      - replace context type wildcard counts with real values
+        *      - add up non-memory pool block sizes
+        *      - add up memory pool user counts
+        */
+       fixed_blocks = 0;
+       for (i = 0; i < SC_MAX; i++) {
+               int count = sc_config_sizes[i].count;
+               int size = sc_config_sizes[i].size;
+               int pool;
+
+               /*
+                * Sanity check count: Either a positive value or
+                * one of the expected wildcards is valid.  The positive
+                * value is checked later when we compare against total
+                * memory available.
+                */
+               if (i == SC_ACK) {
+                       count = dd->n_krcv_queues;
+               } else if (i == SC_KERNEL) {
+                       count = num_vls + 1 /* VL15 */;
+               } else if (count == SCC_PER_CPU) {
+                       count = dd->num_rcv_contexts - dd->n_krcv_queues;
+               } else if (count < 0) {
+                       dd_dev_err(
+                               dd,
+                               "%s send context invalid count wildcard %d\n",
+                               sc_type_name(i), count);
+                       return -EINVAL;
+               }
+               if (total_contexts + count > dd->chip_send_contexts)
+                       count = dd->chip_send_contexts - total_contexts;
+
+               total_contexts += count;
+
+               /*
+                * Sanity check pool: The conversion will return a pool
+                * number, or -1 for a fixed (non-negative) value.  The fixed
+                * value is checked later when we compare against
+                * total memory available.
+                */
+               pool = wildcard_to_pool(size);
+               if (pool == -1) {                       /* non-wildcard */
+                       fixed_blocks += size * count;
+               } else if (pool < NUM_SC_POOLS) {       /* valid wildcard */
+                       mem_pool_info[pool].count += count;
+               } else {                                /* invalid wildcard */
+                       dd_dev_err(
+                               dd,
+                               "%s send context invalid pool wildcard %d\n",
+                               sc_type_name(i), size);
+                       return -EINVAL;
+               }
+
+               dd->sc_sizes[i].count = count;
+               dd->sc_sizes[i].size = size;
+       }
+       if (fixed_blocks > total_blocks) {
+               dd_dev_err(
+                       dd,
+                       "Send context fixed block count, %u, larger than total block count %u\n",
+                       fixed_blocks, total_blocks);
+               return -EINVAL;
+       }
+
+       /* step 3: calculate the blocks in the pools, and pool context sizes */
+       pool_blocks = total_blocks - fixed_blocks;
+       if (ab_total > pool_blocks) {
+               dd_dev_err(
+                       dd,
+                       "Send context fixed pool sizes, %u, larger than pool block count %u\n",
+                       ab_total, pool_blocks);
+               return -EINVAL;
+       }
+       /* subtract off the fixed pool blocks */
+       pool_blocks -= ab_total;
+
+       for (i = 0; i < NUM_SC_POOLS; i++) {
+               struct mem_pool_info *pi = &mem_pool_info[i];
+
+               /* % beats absolute blocks */
+               if (pi->centipercent >= 0)
+                       pi->blocks = (pool_blocks * pi->centipercent) / 10000;
+
+               if (pi->blocks == 0 && pi->count != 0) {
+                       dd_dev_err(
+                               dd,
+                               "Send context memory pool %d has %u contexts, but no blocks\n",
+                               i, pi->count);
+                       return -EINVAL;
+               }
+               if (pi->count == 0) {
+                       /* warn about wasted blocks */
+                       if (pi->blocks != 0)
+                               dd_dev_err(
+                                       dd,
+                                       "Send context memory pool %d has %u blocks, but zero contexts\n",
+                                       i, pi->blocks);
+                       pi->size = 0;
+               } else {
+                       pi->size = pi->blocks / pi->count;
+               }
+       }
+
+       /* step 4: fill in the context type sizes from the pool sizes */
+       used_blocks = 0;
+       for (i = 0; i < SC_MAX; i++) {
+               if (dd->sc_sizes[i].size < 0) {
+                       unsigned pool = wildcard_to_pool(dd->sc_sizes[i].size);
+
+                       WARN_ON_ONCE(pool >= NUM_SC_POOLS);
+                       dd->sc_sizes[i].size = mem_pool_info[pool].size;
+               }
+               /* make sure we are not larger than what is allowed by the HW */
+#define PIO_MAX_BLOCKS 1024
+               if (dd->sc_sizes[i].size > PIO_MAX_BLOCKS)
+                       dd->sc_sizes[i].size = PIO_MAX_BLOCKS;
+
+               /* calculate our total usage */
+               used_blocks += dd->sc_sizes[i].size * dd->sc_sizes[i].count;
+       }
+       extra = total_blocks - used_blocks;
+       if (extra != 0)
+               dd_dev_info(dd, "unused send context blocks: %d\n", extra);
+
+       return total_contexts;
+}
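+
+/*
+ * Worked example of the pool sizing above: if 1000 blocks remain in
+ * pool_blocks after the fixed allocations and pool 0 is configured at
+ * 10000 centipercent (100%), pool 0 receives all 1000 blocks; with 40
+ * contexts drawing from pool 0, each is sized at 1000 / 40 = 25 blocks.
+ */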
+
+int init_send_contexts(struct hfi1_devdata *dd)
+{
+       u16 base;
+       int ret, i, j, context;
+
+       ret = init_credit_return(dd);
+       if (ret)
+               return ret;
+
+       dd->hw_to_sw = kmalloc_array(TXE_NUM_CONTEXTS, sizeof(u8),
+                                       GFP_KERNEL);
+       dd->send_contexts = kcalloc(dd->num_send_contexts,
+                                       sizeof(struct send_context_info),
+                                       GFP_KERNEL);
+       if (!dd->send_contexts || !dd->hw_to_sw) {
+               dd_dev_err(dd, "Unable to allocate send context arrays\n");
+               kfree(dd->hw_to_sw);
+               kfree(dd->send_contexts);
+               free_credit_return(dd);
+               return -ENOMEM;
+       }
+
+       /* hardware context map starts with invalid send context indices */
+       for (i = 0; i < TXE_NUM_CONTEXTS; i++)
+               dd->hw_to_sw[i] = INVALID_SCI;
+
+       /*
+        * All send contexts now have their credit sizes assigned.
+        * Allocate the credits for each context consecutively from the
+        * global PIO block space.
+        */
+       context = 0;
+       base = 1;
+       for (i = 0; i < SC_MAX; i++) {
+               struct sc_config_sizes *scs = &dd->sc_sizes[i];
+
+               for (j = 0; j < scs->count; j++) {
+                       struct send_context_info *sci =
+                                               &dd->send_contexts[context];
+                       sci->type = i;
+                       sci->base = base;
+                       sci->credits = scs->size;
+
+                       context++;
+                       base += scs->size;
+               }
+       }
+
+       return 0;
+}
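+
+/*
+ * The credit assignment above is a simple linear carve-up of PIO
+ * memory: blocks are handed out consecutively starting at block 1
+ * (consistent with init_sc_pools_and_sizes() budgeting one block less
+ * than the full PIO memory), so e.g. two 64-credit contexts occupy
+ * blocks 1..64 and 65..128.
+ */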
+
+/*
+ * Allocate a software index and hardware context of the given type.
+ *
+ * Must be called with dd->sc_lock held.
+ */
+static int sc_hw_alloc(struct hfi1_devdata *dd, int type, u32 *sw_index,
+                      u32 *hw_context)
+{
+       struct send_context_info *sci;
+       u32 index;
+       u32 context;
+
+       for (index = 0, sci = &dd->send_contexts[0];
+                       index < dd->num_send_contexts; index++, sci++) {
+               if (sci->type == type && sci->allocated == 0) {
+                       sci->allocated = 1;
+                       /* 1:1 sw/hw mapping, but reversed so the numbers differ */
+                       context = dd->chip_send_contexts - index - 1;
+                       dd->hw_to_sw[context] = index;
+                       *sw_index = index;
+                       *hw_context = context;
+                       return 0; /* success */
+               }
+       }
+       dd_dev_err(dd, "Unable to locate a free type %d send context\n", type);
+       return -ENOSPC;
+}
+
+/*
+ * Free the send context given by its software index.
+ *
+ * Must be called with dd->sc_lock held.
+ */
+static void sc_hw_free(struct hfi1_devdata *dd, u32 sw_index, u32 hw_context)
+{
+       struct send_context_info *sci;
+
+       sci = &dd->send_contexts[sw_index];
+       if (!sci->allocated) {
+               dd_dev_err(dd, "%s: sw_index %u not allocated? hw_context %u\n",
+                       __func__, sw_index, hw_context);
+       }
+       sci->allocated = 0;
+       dd->hw_to_sw[hw_context] = INVALID_SCI;
+}
+
+/* return the base context of a context in a group */
+static inline u32 group_context(u32 context, u32 group)
+{
+       return (context >> group) << group;
+}
+
+/* return the size of a group */
+static inline u32 group_size(u32 group)
+{
+       return 1 << group;
+}
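+
+/*
+ * With the current group value of 0 (set in sc_alloc()) every context
+ * is its own group of size 1 << 0 = 1.  A group value of 3 would put
+ * 1 << 3 = 8 consecutive hardware contexts in one group, e.g. contexts
+ * 8..15 all share base context group_context(11, 3) = 8 and therefore
+ * the same credit return address.
+ */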
+
+/*
+ * Obtain the credit return addresses, kernel virtual and physical, for the
+ * given sc.
+ *
+ * To understand this routine:
+ * o va and pa are arrays of struct credit_return, one entry per physical
+ *   send context, with one such array per NUMA node.
+ * o Each send context always looks in its relative location in a struct
+ *   credit_return for its credit return.
+ * o Each send context in a group must have its return address CSR programmed
+ *   with the same value.  Use the address of the first send context in the
+ *   group.
+ */
+static void cr_group_addresses(struct send_context *sc, dma_addr_t *pa)
+{
+       u32 gc = group_context(sc->hw_context, sc->group);
+       u32 index = sc->hw_context & 0x7;
+
+       sc->hw_free = &sc->dd->cr_base[sc->node].va[gc].cr[index];
+       *pa = (unsigned long)
+              &((struct credit_return *)sc->dd->cr_base[sc->node].pa)[gc];
+}
+
+/*
+ * Work queue function triggered in error interrupt routine for
+ * kernel contexts.
+ */
+static void sc_halted(struct work_struct *work)
+{
+       struct send_context *sc;
+
+       sc = container_of(work, struct send_context, halt_work);
+       sc_restart(sc);
+}
+
+/*
+ * Calculate PIO block threshold for this send context using the given MTU.
+ * Trigger a credit return when the credits for one MTU plus an
+ * optional header remain outstanding.
+ *
+ * Parameter mtu is in bytes.
+ * Parameter hdrqentsize is in DWORDs.
+ *
+ * Return value is what to write into the CSR: trigger return when
+ * unreturned credits pass this count.
+ */
+u32 sc_mtu_to_threshold(struct send_context *sc, u32 mtu, u32 hdrqentsize)
+{
+       u32 release_credits;
+       u32 threshold;
+
+       /* add in the header size, then divide by the PIO block size */
+       mtu += hdrqentsize << 2;
+       release_credits = DIV_ROUND_UP(mtu, PIO_BLOCK_SIZE);
+
+       /* check against this context's credits */
+       if (sc->credits <= release_credits)
+               threshold = 1;
+       else
+               threshold = sc->credits - release_credits;
+
+       return threshold;
+}
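+
+/*
+ * Example, assuming the 64-byte PIO block size: an 8192-byte MTU plus
+ * a 32-dword (128-byte) header entry needs DIV_ROUND_UP(8320, 64) =
+ * 130 release credits, so a context with 160 credits is programmed
+ * with a threshold of 160 - 130 = 30 outstanding credits.
+ */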
+
+/*
+ * Calculate credit threshold in terms of percent of the allocated credits.
+ * Trigger when unreturned credits equal or exceed the percentage of the whole.
+ *
+ * Return value is what to write into the CSR: trigger return when
+ * unreturned credits pass this count.
+ */
+static u32 sc_percent_to_threshold(struct send_context *sc, u32 percent)
+{
+       return (sc->credits * percent) / 100;
+}
+
+/*
+ * Set the credit return threshold.
+ */
+void sc_set_cr_threshold(struct send_context *sc, u32 new_threshold)
+{
+       unsigned long flags;
+       u32 old_threshold;
+       int force_return = 0;
+
+       spin_lock_irqsave(&sc->credit_ctrl_lock, flags);
+
+       old_threshold = (sc->credit_ctrl >>
+                               SC(CREDIT_CTRL_THRESHOLD_SHIFT))
+                        & SC(CREDIT_CTRL_THRESHOLD_MASK);
+
+       if (new_threshold != old_threshold) {
+               sc->credit_ctrl =
+                       (sc->credit_ctrl
+                               & ~SC(CREDIT_CTRL_THRESHOLD_SMASK))
+                       | ((new_threshold
+                               & SC(CREDIT_CTRL_THRESHOLD_MASK))
+                          << SC(CREDIT_CTRL_THRESHOLD_SHIFT));
+               write_kctxt_csr(sc->dd, sc->hw_context,
+                       SC(CREDIT_CTRL), sc->credit_ctrl);
+
+               /* force a credit return on change to avoid a possible stall */
+               force_return = 1;
+       }
+
+       spin_unlock_irqrestore(&sc->credit_ctrl_lock, flags);
+
+       if (force_return)
+               sc_return_credits(sc);
+}
+
+/*
+ * set_pio_integrity
+ *
+ * Set the CHECK_ENABLE register for the send context 'sc'.
+ */
+void set_pio_integrity(struct send_context *sc)
+{
+       struct hfi1_devdata *dd = sc->dd;
+       u64 reg = 0;
+       u32 hw_context = sc->hw_context;
+       int type = sc->type;
+
+       /*
+        * No integrity checks if HFI1_CAP_NO_INTEGRITY is set, or if
+        * we're snooping.
+        */
+       if (likely(!HFI1_CAP_IS_KSET(NO_INTEGRITY)) &&
+           dd->hfi1_snoop.mode_flag != HFI1_PORT_SNOOP_MODE)
+               reg = hfi1_pkt_default_send_ctxt_mask(dd, type);
+
+       write_kctxt_csr(dd, hw_context, SC(CHECK_ENABLE), reg);
+}
+
+/*
+ * Allocate a NUMA relative send context structure of the given type along
+ * with a HW context.
+ */
+struct send_context *sc_alloc(struct hfi1_devdata *dd, int type,
+                             uint hdrqentsize, int numa)
+{
+       struct send_context_info *sci;
+       struct send_context *sc;
+       dma_addr_t pa;
+       unsigned long flags;
+       u64 reg;
+       u32 thresh;
+       u32 sw_index;
+       u32 hw_context;
+       int ret;
+       u8 opval, opmask;
+
+       /* do not allocate while frozen */
+       if (dd->flags & HFI1_FROZEN)
+               return NULL;
+
+       sc = kzalloc_node(sizeof(struct send_context), GFP_KERNEL, numa);
+       if (!sc) {
+               dd_dev_err(dd, "Cannot allocate send context structure\n");
+               return NULL;
+       }
+
+       spin_lock_irqsave(&dd->sc_lock, flags);
+       ret = sc_hw_alloc(dd, type, &sw_index, &hw_context);
+       if (ret) {
+               spin_unlock_irqrestore(&dd->sc_lock, flags);
+               kfree(sc);
+               return NULL;
+       }
+
+       sci = &dd->send_contexts[sw_index];
+       sci->sc = sc;
+
+       sc->dd = dd;
+       sc->node = numa;
+       sc->type = type;
+       spin_lock_init(&sc->alloc_lock);
+       spin_lock_init(&sc->release_lock);
+       spin_lock_init(&sc->credit_ctrl_lock);
+       INIT_LIST_HEAD(&sc->piowait);
+       INIT_WORK(&sc->halt_work, sc_halted);
+       atomic_set(&sc->buffers_allocated, 0);
+       init_waitqueue_head(&sc->halt_wait);
+
+       /* grouping is always single context for now */
+       sc->group = 0;
+
+       sc->sw_index = sw_index;
+       sc->hw_context = hw_context;
+       cr_group_addresses(sc, &pa);
+       sc->credits = sci->credits;
+
+/* PIO Send Memory Address details */
+#define PIO_ADDR_CONTEXT_MASK 0xfful
+#define PIO_ADDR_CONTEXT_SHIFT 16
+       sc->base_addr = dd->piobase + ((hw_context & PIO_ADDR_CONTEXT_MASK)
+                                       << PIO_ADDR_CONTEXT_SHIFT);
+
+       /* set base and credits */
+       reg = ((sci->credits & SC(CTRL_CTXT_DEPTH_MASK))
+                                       << SC(CTRL_CTXT_DEPTH_SHIFT))
+               | ((sci->base & SC(CTRL_CTXT_BASE_MASK))
+                                       << SC(CTRL_CTXT_BASE_SHIFT));
+       write_kctxt_csr(dd, hw_context, SC(CTRL), reg);
+
+       set_pio_integrity(sc);
+
+       /* unmask all errors */
+       write_kctxt_csr(dd, hw_context, SC(ERR_MASK), (u64)-1);
+
+       /* set the default partition key */
+       write_kctxt_csr(dd, hw_context, SC(CHECK_PARTITION_KEY),
+               (DEFAULT_PKEY &
+                       SC(CHECK_PARTITION_KEY_VALUE_MASK))
+                   << SC(CHECK_PARTITION_KEY_VALUE_SHIFT));
+
+       /* per context type checks */
+       if (type == SC_USER) {
+               opval = USER_OPCODE_CHECK_VAL;
+               opmask = USER_OPCODE_CHECK_MASK;
+       } else {
+               opval = OPCODE_CHECK_VAL_DISABLED;
+               opmask = OPCODE_CHECK_MASK_DISABLED;
+       }
+
+       /* set the send context check opcode mask and value */
+       write_kctxt_csr(dd, hw_context, SC(CHECK_OPCODE),
+               ((u64)opmask << SC(CHECK_OPCODE_MASK_SHIFT)) |
+               ((u64)opval << SC(CHECK_OPCODE_VALUE_SHIFT)));
+
+       /* set up credit return */
+       reg = pa & SC(CREDIT_RETURN_ADDR_ADDRESS_SMASK);
+       write_kctxt_csr(dd, hw_context, SC(CREDIT_RETURN_ADDR), reg);
+
+       /*
+        * Calculate the initial credit return threshold.
+        *
+        * For Ack contexts, set a threshold for half the credits.
+        * For User contexts use the given percentage.  This has been
+        * sanitized on driver start-up.
+        * For Kernel contexts, use the default MTU plus a header.
+        */
+       if (type == SC_ACK) {
+               thresh = sc_percent_to_threshold(sc, 50);
+       } else if (type == SC_USER) {
+               thresh = sc_percent_to_threshold(sc,
+                               user_credit_return_threshold);
+       } else { /* kernel */
+               thresh = sc_mtu_to_threshold(sc, hfi1_max_mtu, hdrqentsize);
+       }
+       reg = thresh << SC(CREDIT_CTRL_THRESHOLD_SHIFT);
+       /* add in early return */
+       if (type == SC_USER && HFI1_CAP_IS_USET(EARLY_CREDIT_RETURN))
+               reg |= SC(CREDIT_CTRL_EARLY_RETURN_SMASK);
+       else if (HFI1_CAP_IS_KSET(EARLY_CREDIT_RETURN)) /* kernel, ack */
+               reg |= SC(CREDIT_CTRL_EARLY_RETURN_SMASK);
+
+       /* set up write-through credit_ctrl */
+       sc->credit_ctrl = reg;
+       write_kctxt_csr(dd, hw_context, SC(CREDIT_CTRL), reg);
+
+       /* User send contexts should not allow sending on VL15 */
+       if (type == SC_USER) {
+               reg = 1ULL << 15;
+               write_kctxt_csr(dd, hw_context, SC(CHECK_VL), reg);
+       }
+
+       spin_unlock_irqrestore(&dd->sc_lock, flags);
+
+       /*
+        * Allocate shadow ring to track outstanding PIO buffers _after_
+        * unlocking.  We don't know the size until the lock is held and
+        * we can't allocate while the lock is held.  No one is using
+        * the context yet, so allocate it now.
+        *
+        * User contexts do not get a shadow ring.
+        */
+       if (type != SC_USER) {
+               /*
+                * Size the shadow ring 1 larger than the number of credits
+                * so head == tail can mean empty.
+                */
+               sc->sr_size = sci->credits + 1;
+               sc->sr = kzalloc_node(sizeof(union pio_shadow_ring) *
+                               sc->sr_size, GFP_KERNEL, numa);
+               if (!sc->sr) {
+                       dd_dev_err(dd,
+                               "Cannot allocate send context shadow ring structure\n");
+                       sc_free(sc);
+                       return NULL;
+               }
+       }
+
+       dd_dev_info(dd,
+               "Send context %u(%u) %s group %u credits %u credit_ctrl 0x%llx threshold %u\n",
+               sw_index,
+               hw_context,
+               sc_type_name(type),
+               sc->group,
+               sc->credits,
+               sc->credit_ctrl,
+               thresh);
+
+       return sc;
+}
+
+/* free a per-NUMA send context structure */
+void sc_free(struct send_context *sc)
+{
+       struct hfi1_devdata *dd;
+       unsigned long flags;
+       u32 sw_index;
+       u32 hw_context;
+
+       if (!sc)
+               return;
+
+       sc->flags |= SCF_IN_FREE;       /* ensure no restarts */
+       dd = sc->dd;
+       if (!list_empty(&sc->piowait))
+               dd_dev_err(dd, "piowait list not empty!\n");
+       sw_index = sc->sw_index;
+       hw_context = sc->hw_context;
+       sc_disable(sc); /* make sure the HW is disabled */
+       flush_work(&sc->halt_work);
+
+       spin_lock_irqsave(&dd->sc_lock, flags);
+       dd->send_contexts[sw_index].sc = NULL;
+
+       /* clear/disable all registers set in sc_alloc */
+       write_kctxt_csr(dd, hw_context, SC(CTRL), 0);
+       write_kctxt_csr(dd, hw_context, SC(CHECK_ENABLE), 0);
+       write_kctxt_csr(dd, hw_context, SC(ERR_MASK), 0);
+       write_kctxt_csr(dd, hw_context, SC(CHECK_PARTITION_KEY), 0);
+       write_kctxt_csr(dd, hw_context, SC(CHECK_OPCODE), 0);
+       write_kctxt_csr(dd, hw_context, SC(CREDIT_RETURN_ADDR), 0);
+       write_kctxt_csr(dd, hw_context, SC(CREDIT_CTRL), 0);
+
+       /* release the index and context for re-use */
+       sc_hw_free(dd, sw_index, hw_context);
+       spin_unlock_irqrestore(&dd->sc_lock, flags);
+
+       kfree(sc->sr);
+       kfree(sc);
+}
+
+/* disable the context */
+void sc_disable(struct send_context *sc)
+{
+       u64 reg;
+       unsigned long flags;
+       struct pio_buf *pbuf;
+
+       if (!sc)
+               return;
+
+       /* do all steps, even if already disabled */
+       spin_lock_irqsave(&sc->alloc_lock, flags);
+       reg = read_kctxt_csr(sc->dd, sc->hw_context, SC(CTRL));
+       reg &= ~SC(CTRL_CTXT_ENABLE_SMASK);
+       sc->flags &= ~SCF_ENABLED;
+       sc_wait_for_packet_egress(sc, 1);
+       write_kctxt_csr(sc->dd, sc->hw_context, SC(CTRL), reg);
+       spin_unlock_irqrestore(&sc->alloc_lock, flags);
+
+       /*
+        * Flush any waiters.  Once the context is disabled,
+        * credit return interrupts are stopped (although there
+        * could be one in-process when the context is disabled).
+        * Wait one microsecond for any lingering interrupts, then
+        * proceed with the flush.
+        */
+       udelay(1);
+       spin_lock_irqsave(&sc->release_lock, flags);
+       if (sc->sr) {   /* this context has a shadow ring */
+               while (sc->sr_tail != sc->sr_head) {
+                       pbuf = &sc->sr[sc->sr_tail].pbuf;
+                       if (pbuf->cb)
+                               (*pbuf->cb)(pbuf->arg, PRC_SC_DISABLE);
+                       sc->sr_tail++;
+                       if (sc->sr_tail >= sc->sr_size)
+                               sc->sr_tail = 0;
+               }
+       }
+       spin_unlock_irqrestore(&sc->release_lock, flags);
+}
+
+/* return SendEgressCtxtStatus.PacketOccupancy */
+#define packet_occupancy(r) \
+       (((r) & SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SMASK)\
+       >> SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SHIFT)
+
+/* is egress halted on the context? */
+#define egress_halted(r) \
+       ((r) & SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_HALT_STATUS_SMASK)
+
+/* wait for packet egress, optionally pause for credit return  */
+static void sc_wait_for_packet_egress(struct send_context *sc, int pause)
+{
+       struct hfi1_devdata *dd = sc->dd;
+       u64 reg;
+       u32 loop = 0;
+
+       while (1) {
+               reg = read_csr(dd, sc->hw_context * 8 +
+                              SEND_EGRESS_CTXT_STATUS);
+               /* done if egress is stopped */
+               if (egress_halted(reg))
+                       break;
+               reg = packet_occupancy(reg);
+               if (reg == 0)
+                       break;
+               if (loop > 100) {
+                       dd_dev_err(dd,
+                               "%s: context %u(%u) timeout waiting for packets to egress, remaining count %u\n",
+                               __func__, sc->sw_index,
+                               sc->hw_context, (u32)reg);
+                       break;
+               }
+               loop++;
+               udelay(1);
+       }
+
+       if (pause)
+               /* Add additional delay to ensure chip returns all credits */
+               pause_for_credit_return(dd);
+}
+
+void sc_wait(struct hfi1_devdata *dd)
+{
+       int i;
+
+       for (i = 0; i < dd->num_send_contexts; i++) {
+               struct send_context *sc = dd->send_contexts[i].sc;
+
+               if (!sc)
+                       continue;
+               sc_wait_for_packet_egress(sc, 0);
+       }
+}
+
+/*
+ * Restart a context after it has been halted due to error.
+ *
+ * If the first step (waiting for the halt to be asserted) fails, return
+ * early.  For later steps, complain about timeouts but keep going.
+ *
+ * It is expected that allocations (enabled flag bit) have been shut off
+ * already (only applies to kernel contexts).
+ */
+int sc_restart(struct send_context *sc)
+{
+       struct hfi1_devdata *dd = sc->dd;
+       u64 reg;
+       u32 loop;
+       int count;
+
+       /* bounce off if not halted, or if being freed */
+       if (!(sc->flags & SCF_HALTED) || (sc->flags & SCF_IN_FREE))
+               return -EINVAL;
+
+       dd_dev_info(dd, "restarting send context %u(%u)\n", sc->sw_index,
+               sc->hw_context);
+
+       /*
+        * Step 1: Wait for the context to actually halt.
+        *
+        * The error interrupt is asynchronous to actually setting halt
+        * on the context.
+        */
+       loop = 0;
+       while (1) {
+               reg = read_kctxt_csr(dd, sc->hw_context, SC(STATUS));
+               if (reg & SC(STATUS_CTXT_HALTED_SMASK))
+                       break;
+               if (loop > 100) {
+                       dd_dev_err(dd, "%s: context %u(%u) not halting, skipping\n",
+                               __func__, sc->sw_index, sc->hw_context);
+                       return -ETIME;
+               }
+               loop++;
+               udelay(1);
+       }
+
+       /*
+        * Step 2: Ensure no users are still trying to write to PIO.
+        *
+        * For kernel contexts, we have already turned off buffer allocation.
+        * Now wait for the buffer count to go to zero.
+        *
+        * For user contexts, the user handling code has cut off write access
+        * to the context's PIO pages before calling this routine and will
+        * restore write access after this routine returns.
+        */
+       if (sc->type != SC_USER) {
+               /* kernel context */
+               loop = 0;
+               while (1) {
+                       count = atomic_read(&sc->buffers_allocated);
+                       if (count == 0)
+                               break;
+                       if (loop > 100) {
+                               dd_dev_err(dd,
+                                       "%s: context %u(%u) timeout waiting for PIO buffers to zero, remaining %d\n",
+                                       __func__, sc->sw_index,
+                                       sc->hw_context, count);
+                       }
+                       loop++;
+                       udelay(1);
+               }
+       }
+
+       /*
+        * Step 3: Wait for all packets to egress.
+        * Step 4: Disable the context.
+        *
+        * Both are done by sc_disable() below.  The disable is a
+        * superset of the halt; after the disable, the errors can be
+        * cleared.
+        */
+       sc_disable(sc);
+
+       /*
+        * Step 5: Enable the context
+        *
+        * This enable will clear the halted flag and per-send context
+        * error flags.
+        */
+       return sc_enable(sc);
+}
+
+/*
+ * PIO freeze processing.  To be called after the TXE block is fully frozen.
+ * Go through all frozen send contexts and disable them.  The contexts are
+ * already stopped by the freeze.
+ */
+void pio_freeze(struct hfi1_devdata *dd)
+{
+       struct send_context *sc;
+       int i;
+
+       for (i = 0; i < dd->num_send_contexts; i++) {
+               sc = dd->send_contexts[i].sc;
+               /*
+                * Don't disable unallocated, unfrozen, or user send contexts.
+                * User send contexts will be disabled when the process
+                * calls into the driver to reset its context.
+                */
+               if (!sc || !(sc->flags & SCF_FROZEN) || sc->type == SC_USER)
+                       continue;
+
+               /* only need to disable, the context is already stopped */
+               sc_disable(sc);
+       }
+}
+
+/*
+ * Unfreeze PIO for kernel send contexts.  The precondition for calling this
+ * is that all PIO send contexts have been disabled and the SPC freeze has
+ * been cleared.  Now perform the last step and re-enable each kernel context.
+ * User (PSM) processing will occur when PSM calls into the kernel to
+ * acknowledge the freeze.
+ */
+void pio_kernel_unfreeze(struct hfi1_devdata *dd)
+{
+       struct send_context *sc;
+       int i;
+
+       for (i = 0; i < dd->num_send_contexts; i++) {
+               sc = dd->send_contexts[i].sc;
+               if (!sc || !(sc->flags & SCF_FROZEN) || sc->type == SC_USER)
+                       continue;
+
+               sc_enable(sc);  /* will clear the sc frozen flag */
+       }
+}
+
+/*
+ * Wait for the SendPioInitCtxt.PioInitInProgress bit to clear.
+ * Returns:
+ *     0          - on success
+ *     -ETIMEDOUT - if we wait too long
+ *     -EIO       - if the init engine reported an error
+ */
+static int pio_init_wait_progress(struct hfi1_devdata *dd)
+{
+       u64 reg;
+       int max, count = 0;
+
+       /* max is the longest possible HW init time / delay */
+       max = (dd->icode == ICODE_FPGA_EMULATION) ? 120 : 5;
+       while (1) {
+               reg = read_csr(dd, SEND_PIO_INIT_CTXT);
+               if (!(reg & SEND_PIO_INIT_CTXT_PIO_INIT_IN_PROGRESS_SMASK))
+                       break;
+               if (count >= max)
+                       return -ETIMEDOUT;
+               udelay(5);
+               count++;
+       }
+
+       return reg & SEND_PIO_INIT_CTXT_PIO_INIT_ERR_SMASK ? -EIO : 0;
+}
+
+/*
+ * Reset all of the send contexts to their power-on state.  Used
+ * only during manual init - no lock against sc_enable needed.
+ */
+void pio_reset_all(struct hfi1_devdata *dd)
+{
+       int ret;
+
+       /* make sure the init engine is not busy */
+       ret = pio_init_wait_progress(dd);
+       /* ignore any timeout */
+       if (ret == -EIO) {
+               /* clear the error */
+               write_csr(dd, SEND_PIO_ERR_CLEAR,
+                       SEND_PIO_ERR_CLEAR_PIO_INIT_SM_IN_ERR_SMASK);
+       }
+
+       /* reset init all */
+       write_csr(dd, SEND_PIO_INIT_CTXT,
+                       SEND_PIO_INIT_CTXT_PIO_ALL_CTXT_INIT_SMASK);
+       udelay(2);
+       ret = pio_init_wait_progress(dd);
+       if (ret < 0) {
+               dd_dev_err(dd,
+                       "PIO send context init %s while initializing all PIO blocks\n",
+                       ret == -ETIMEDOUT ? "is stuck" : "had an error");
+       }
+}
+
+/* enable the context */
+int sc_enable(struct send_context *sc)
+{
+       u64 sc_ctrl, reg, pio;
+       struct hfi1_devdata *dd;
+       unsigned long flags;
+       int ret = 0;
+
+       if (!sc)
+               return -EINVAL;
+       dd = sc->dd;
+
+       /*
+        * Obtain the allocator lock to guard against any allocation
+        * attempts (which should not happen prior to context being
+        * enabled). On the release/disable side we don't need to
+        * worry about locking since the releaser will not do anything
+        * if the context accounting values have not changed.
+        */
+       spin_lock_irqsave(&sc->alloc_lock, flags);
+       sc_ctrl = read_kctxt_csr(dd, sc->hw_context, SC(CTRL));
+       if ((sc_ctrl & SC(CTRL_CTXT_ENABLE_SMASK)))
+               goto unlock; /* already enabled */
+
+       /* IMPORTANT: only clear free and fill if transitioning 0 -> 1 */
+
+       *sc->hw_free = 0;
+       sc->free = 0;
+       sc->alloc_free = 0;
+       sc->fill = 0;
+       sc->sr_head = 0;
+       sc->sr_tail = 0;
+       sc->flags = 0;
+       atomic_set(&sc->buffers_allocated, 0);
+
+       /*
+        * Clear all per-context errors.  Some of these will be set when
+        * we are re-enabling after a context halt.  Now that the context
+        * is disabled, the halt will not clear until after the PIO init
+        * engine runs below.
+        */
+       reg = read_kctxt_csr(dd, sc->hw_context, SC(ERR_STATUS));
+       if (reg)
+               write_kctxt_csr(dd, sc->hw_context, SC(ERR_CLEAR),
+                       reg);
+
+       /*
+        * The HW PIO initialization engine can handle only one init
+        * request at a time. Serialize access to each device's engine.
+        */
+       spin_lock(&dd->sc_init_lock);
+       /*
+        * Since access to this code block is serialized and
+        * each access waits for the initialization to complete
+        * before releasing the lock, the PIO initialization engine
+        * should not be in use, so we don't have to wait for the
+        * InProgress bit to go down.
+        */
+       pio = ((sc->hw_context & SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_MASK) <<
+              SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_SHIFT) |
+               SEND_PIO_INIT_CTXT_PIO_SINGLE_CTXT_INIT_SMASK;
+       write_csr(dd, SEND_PIO_INIT_CTXT, pio);
+       /*
+        * Wait until the engine is done.  Give the chip the required time
+        * so, hopefully, we read the register just once.
+        */
+       udelay(2);
+       ret = pio_init_wait_progress(dd);
+       spin_unlock(&dd->sc_init_lock);
+       if (ret) {
+               dd_dev_err(dd,
+                          "sctxt%u(%u): Context not enabled due to init failure %d\n",
+                          sc->sw_index, sc->hw_context, ret);
+               goto unlock;
+       }
+
+       /*
+        * All is well. Enable the context.
+        */
+       sc_ctrl |= SC(CTRL_CTXT_ENABLE_SMASK);
+       write_kctxt_csr(dd, sc->hw_context, SC(CTRL), sc_ctrl);
+       /*
+        * Read SendCtxtCtrl to force the write out and prevent a timing
+        * hazard where a PIO write may reach the context before the enable.
+        */
+       read_kctxt_csr(dd, sc->hw_context, SC(CTRL));
+       sc->flags |= SCF_ENABLED;
+
+unlock:
+       spin_unlock_irqrestore(&sc->alloc_lock, flags);
+
+       return ret;
+}
+
+/* force a credit return on the context */
+void sc_return_credits(struct send_context *sc)
+{
+       if (!sc)
+               return;
+
+       /* a 0->1 transition schedules a credit return */
+       write_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_FORCE),
+               SC(CREDIT_FORCE_FORCE_RETURN_SMASK));
+       /*
+        * Ensure that the write is flushed and the credit return is
+        * scheduled. We care more about the 0 -> 1 transition.
+        */
+       read_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_FORCE));
+       /* set back to 0 for next time */
+       write_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_FORCE), 0);
+}
+
+/* allow all in-flight packets to drain on the context */
+void sc_flush(struct send_context *sc)
+{
+       if (!sc)
+               return;
+
+       sc_wait_for_packet_egress(sc, 1);
+}
+
+/* drop all packets on the context, no waiting until they are sent */
+void sc_drop(struct send_context *sc)
+{
+       if (!sc)
+               return;
+
+       dd_dev_info(sc->dd, "%s: context %u(%u) - not implemented\n",
+                       __func__, sc->sw_index, sc->hw_context);
+}
+
+/*
+ * Start the software reaction to a context halt or SPC freeze:
+ *     - mark the context as halted or frozen
+ *     - stop buffer allocations
+ *
+ * Called from the error interrupt.  Other work is deferred until
+ * out of the interrupt.
+ */
+void sc_stop(struct send_context *sc, int flag)
+{
+       unsigned long flags;
+
+       /* mark the context */
+       sc->flags |= flag;
+
+       /* stop buffer allocations */
+       spin_lock_irqsave(&sc->alloc_lock, flags);
+       sc->flags &= ~SCF_ENABLED;
+       spin_unlock_irqrestore(&sc->alloc_lock, flags);
+       wake_up(&sc->halt_wait);
+}
+
+#define BLOCK_DWORDS (PIO_BLOCK_SIZE/sizeof(u32))
+#define dwords_to_blocks(x) DIV_ROUND_UP(x, BLOCK_DWORDS)
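+
+/*
+ * Assuming the 64-byte PIO block (16 dwords), dwords_to_blocks()
+ * rounds a request up to whole blocks, e.g. a 9-dword packet
+ * (including the PBC) still consumes one full block.
+ */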
+
+/*
+ * The send context buffer "allocator".
+ *
+ * @sc: the PIO send context we are allocating from
+ * @dw_len: length of the whole packet, including the PBC, in dwords
+ * @cb: optional callback to call when the buffer is finished sending
+ * @arg: argument for cb
+ *
+ * Return a pointer to a PIO buffer if successful, NULL if not enough room.
+ */
+struct pio_buf *sc_buffer_alloc(struct send_context *sc, u32 dw_len,
+                               pio_release_cb cb, void *arg)
+{
+       struct pio_buf *pbuf = NULL;
+       unsigned long flags;
+       unsigned long avail;
+       unsigned long blocks = dwords_to_blocks(dw_len);
+       unsigned long start_fill;
+       int trycount = 0;
+       u32 head, next;
+
+       spin_lock_irqsave(&sc->alloc_lock, flags);
+       if (!(sc->flags & SCF_ENABLED)) {
+               spin_unlock_irqrestore(&sc->alloc_lock, flags);
+               goto done;
+       }
+
+retry:
+       avail = (unsigned long)sc->credits - (sc->fill - sc->alloc_free);
+       if (blocks > avail) {
+               /* not enough room */
+               if (unlikely(trycount)) { /* already tried to get more room */
+                       spin_unlock_irqrestore(&sc->alloc_lock, flags);
+                       goto done;
+               }
+               /* re-read free from the releaser's cache line and recalculate */
+               sc->alloc_free = ACCESS_ONCE(sc->free);
+               avail =
+                       (unsigned long)sc->credits -
+                       (sc->fill - sc->alloc_free);
+               if (blocks > avail) {
+                       /* still no room, actively update */
+                       spin_unlock_irqrestore(&sc->alloc_lock, flags);
+                       sc_release_update(sc);
+                       spin_lock_irqsave(&sc->alloc_lock, flags);
+                       sc->alloc_free = ACCESS_ONCE(sc->free);
+                       trycount++;
+                       goto retry;
+               }
+       }
+
+       /* there is enough room */
+
+       atomic_inc(&sc->buffers_allocated);
+
+       /* read this once */
+       head = sc->sr_head;
+
+       /* "allocate" the buffer */
+       start_fill = sc->fill;
+       sc->fill += blocks;
+
+       /*
+        * Fill the parts that the releaser looks at before moving the head.
+        * The only necessary piece is the sent_at field.  The credits
+        * we have just allocated cannot have been returned yet, so the
+        * cb and arg will not be looked at for a "while".  Put them
+        * on this side of the memory barrier anyway.
+        */
+       pbuf = &sc->sr[head].pbuf;
+       pbuf->sent_at = sc->fill;
+       pbuf->cb = cb;
+       pbuf->arg = arg;
+       pbuf->sc = sc;  /* could be filled in at sc->sr init time */
+       /* make sure this is in memory before updating the head */
+
+       /* calculate next head index, do not store */
+       next = head + 1;
+       if (next >= sc->sr_size)
+               next = 0;
+       /*
+        * Update the head - must be last! - the releaser can look at
+        * fields in pbuf once we move the head.
+        */
+       smp_wmb();
+       sc->sr_head = next;
+       spin_unlock_irqrestore(&sc->alloc_lock, flags);
+
+       /* finish filling in the buffer outside the lock */
+       pbuf->start = sc->base_addr + ((start_fill % sc->credits)
+                                                       * PIO_BLOCK_SIZE);
+       pbuf->size = sc->credits * PIO_BLOCK_SIZE;
+       pbuf->end = sc->base_addr + pbuf->size;
+       pbuf->block_count = blocks;
+       pbuf->qw_written = 0;
+       pbuf->carry_bytes = 0;
+       pbuf->carry.val64 = 0;
+done:
+       return pbuf;
+}
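+
+/*
+ * The allocator accounting above uses free-running block counters: the
+ * allocator advances sc->fill, the releaser advances sc->free, and the
+ * available credits are sc->credits - (fill - alloc_free).  For example
+ * a 64-credit context that has filled 100 blocks and seen 90 freed has
+ * 64 - (100 - 90) = 54 blocks available.
+ */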
+
+/*
+ * There are at least two entities that can turn on credit return
+ * interrupts and they can overlap.  Avoid problems by implementing
+ * a count scheme that is enforced by a lock.  The lock is needed because
+ * the count and CSR write must be paired.
+ */
+
+/*
+ * Start credit return interrupts.  This is managed by a count.  If already
+ * on, just increment the count.
+ */
+void sc_add_credit_return_intr(struct send_context *sc)
+{
+       unsigned long flags;
+
+       /* lock must surround both the count change and the CSR update */
+       spin_lock_irqsave(&sc->credit_ctrl_lock, flags);
+       if (sc->credit_intr_count == 0) {
+               sc->credit_ctrl |= SC(CREDIT_CTRL_CREDIT_INTR_SMASK);
+               write_kctxt_csr(sc->dd, sc->hw_context,
+                       SC(CREDIT_CTRL), sc->credit_ctrl);
+       }
+       sc->credit_intr_count++;
+       spin_unlock_irqrestore(&sc->credit_ctrl_lock, flags);
+}
+
+/*
+ * Stop credit return interrupts.  This is managed by a count.  Decrement the
+ * count, if the last user, then turn the credit interrupts off.
+ */
+void sc_del_credit_return_intr(struct send_context *sc)
+{
+       unsigned long flags;
+
+       WARN_ON(sc->credit_intr_count == 0);
+
+       /* lock must surround both the count change and the CSR update */
+       spin_lock_irqsave(&sc->credit_ctrl_lock, flags);
+       sc->credit_intr_count--;
+       if (sc->credit_intr_count == 0) {
+               sc->credit_ctrl &= ~SC(CREDIT_CTRL_CREDIT_INTR_SMASK);
+               write_kctxt_csr(sc->dd, sc->hw_context,
+                       SC(CREDIT_CTRL), sc->credit_ctrl);
+       }
+       spin_unlock_irqrestore(&sc->credit_ctrl_lock, flags);
+}
+
+/*
+ * The caller must be careful when calling this.  Every call with
+ * needint set must eventually be paired with a call with needint
+ * clear, since the interrupt enable is reference counted.
+ */
+void hfi1_sc_wantpiobuf_intr(struct send_context *sc, u32 needint)
+{
+       if (needint)
+               sc_add_credit_return_intr(sc);
+       else
+               sc_del_credit_return_intr(sc);
+       trace_hfi1_wantpiointr(sc, needint, sc->credit_ctrl);
+       if (needint) {
+               mmiowb();
+               sc_return_credits(sc);
+       }
+}
+
+/**
+ * sc_piobufavail - callback when a PIO buffer is available
+ * @sc: the send context
+ *
+ * This is called from the interrupt handler when a PIO buffer is
+ * available after hfi1_verbs_send() returned an error that no buffers were
+ * available. Disable the interrupt if there are no more QPs waiting.
+ */
+static void sc_piobufavail(struct send_context *sc)
+{
+       struct hfi1_devdata *dd = sc->dd;
+       struct hfi1_ibdev *dev = &dd->verbs_dev;
+       struct list_head *list;
+       struct hfi1_qp *qps[PIO_WAIT_BATCH_SIZE];
+       struct hfi1_qp *qp;
+       unsigned long flags;
+       unsigned i, n = 0;
+
+       if (dd->send_contexts[sc->sw_index].type != SC_KERNEL)
+               return;
+       list = &sc->piowait;
+       /*
+        * Note: checking that the piowait list is empty and clearing
+        * the buffer available interrupt needs to be atomic or we
+        * could end up with QPs on the wait list with the interrupt
+        * disabled.
+        */
+       write_seqlock_irqsave(&dev->iowait_lock, flags);
+       while (!list_empty(list)) {
+               struct iowait *wait;
+
+               if (n == ARRAY_SIZE(qps))
+                       goto full;
+               wait = list_first_entry(list, struct iowait, list);
+               qp = container_of(wait, struct hfi1_qp, s_iowait);
+               list_del_init(&qp->s_iowait.list);
+               /* refcount held until actual wake up */
+               qps[n++] = qp;
+       }
+       /*
+        * Counting: only call wantpiobuf_intr() if there were waiters and they
+        * are now all gone.
+        */
+       if (n)
+               hfi1_sc_wantpiobuf_intr(sc, 0);
+full:
+       write_sequnlock_irqrestore(&dev->iowait_lock, flags);
+
+       for (i = 0; i < n; i++)
+               hfi1_qp_wakeup(qps[i], HFI1_S_WAIT_PIO);
+}
+
+/* translate a send credit update to a bit code of reasons */
+static inline int fill_code(u64 hw_free)
+{
+       int code = 0;
+
+       if (hw_free & CR_STATUS_SMASK)
+               code |= PRC_STATUS_ERR;
+       if (hw_free & CR_CREDIT_RETURN_DUE_TO_PBC_SMASK)
+               code |= PRC_PBC;
+       if (hw_free & CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SMASK)
+               code |= PRC_THRESHOLD;
+       if (hw_free & CR_CREDIT_RETURN_DUE_TO_ERR_SMASK)
+               code |= PRC_FILL_ERR;
+       if (hw_free & CR_CREDIT_RETURN_DUE_TO_FORCE_SMASK)
+               code |= PRC_SC_DISABLE;
+       return code;
+}
+
+/* use the jiffies compare to get the wrap right */
+#define sent_before(a, b) time_before(a, b)    /* a < b */
+
+/*
+ * The send context buffer "releaser".
+ */
+void sc_release_update(struct send_context *sc)
+{
+       struct pio_buf *pbuf;
+       u64 hw_free;
+       u32 head, tail;
+       unsigned long old_free;
+       unsigned long extra;
+       unsigned long flags;
+       int code;
+
+       if (!sc)
+               return;
+
+       spin_lock_irqsave(&sc->release_lock, flags);
+       /* update free */
+       hw_free = le64_to_cpu(*sc->hw_free);            /* volatile read */
+       old_free = sc->free;
+       extra = (((hw_free & CR_COUNTER_SMASK) >> CR_COUNTER_SHIFT)
+                       - (old_free & CR_COUNTER_MASK))
+                               & CR_COUNTER_MASK;
+       sc->free = old_free + extra;
+       trace_hfi1_piofree(sc, extra);
+
+       /* call sent buffer callbacks */
+       code = -1;                              /* code not yet set */
+       head = ACCESS_ONCE(sc->sr_head);        /* snapshot the head */
+       tail = sc->sr_tail;
+       while (head != tail) {
+               pbuf = &sc->sr[tail].pbuf;
+
+               if (sent_before(sc->free, pbuf->sent_at)) {
+                       /* not sent yet */
+                       break;
+               }
+               if (pbuf->cb) {
+                       if (code < 0) /* fill in code on first user */
+                               code = fill_code(hw_free);
+                       (*pbuf->cb)(pbuf->arg, code);
+               }
+
+               tail++;
+               if (tail >= sc->sr_size)
+                       tail = 0;
+       }
+       /* update tail, in case we moved it */
+       sc->sr_tail = tail;
+       spin_unlock_irqrestore(&sc->release_lock, flags);
+       sc_piobufavail(sc);
+}
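+
+/*
+ * Note on the free-count update above: the hardware counter in
+ * *sc->hw_free is only as wide as CR_COUNTER_MASK, so the number of
+ * newly returned blocks is computed modulo that mask and added to the
+ * wider software counter sc->free, which sent_before() then compares
+ * against each buffer's sent_at using wrap-safe arithmetic.
+ */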
+
+/*
+ * Send context group releaser.  Argument is the send context that caused
+ * the interrupt.  Called from the send context interrupt handler.
+ *
+ * Call release on all contexts in the group.
+ *
+ * This routine takes the sc_lock without an irqsave because it is only
+ * called from an interrupt handler.  Adjust if that changes.
+ */
+void sc_group_release_update(struct hfi1_devdata *dd, u32 hw_context)
+{
+       struct send_context *sc;
+       u32 sw_index;
+       u32 gc, gc_end;
+
+       spin_lock(&dd->sc_lock);
+       sw_index = dd->hw_to_sw[hw_context];
+       if (unlikely(sw_index >= dd->num_send_contexts)) {
+               dd_dev_err(dd, "%s: invalid hw (%u) to sw (%u) mapping\n",
+                       __func__, hw_context, sw_index);
+               goto done;
+       }
+       sc = dd->send_contexts[sw_index].sc;
+       if (unlikely(!sc))
+               goto done;
+
+       gc = group_context(hw_context, sc->group);
+       gc_end = gc + group_size(sc->group);
+       for (; gc < gc_end; gc++) {
+               sw_index = dd->hw_to_sw[gc];
+               if (unlikely(sw_index >= dd->num_send_contexts)) {
+                       dd_dev_err(dd,
+                               "%s: invalid hw (%u) to sw (%u) mapping\n",
+                               __func__, hw_context, sw_index);
+                       continue;
+               }
+               sc_release_update(dd->send_contexts[sw_index].sc);
+       }
+done:
+       spin_unlock(&dd->sc_lock);
+}
+
+int init_pervl_scs(struct hfi1_devdata *dd)
+{
+       int i;
+       u64 mask, all_vl_mask = (u64) 0x80ff; /* VLs 0-7, 15 */
+       u32 ctxt;
+
+       dd->vld[15].sc = sc_alloc(dd, SC_KERNEL,
+                                 dd->rcd[0]->rcvhdrqentsize, dd->node);
+       if (!dd->vld[15].sc)
+               goto nomem;
+       hfi1_init_ctxt(dd->vld[15].sc);
+       dd->vld[15].mtu = enum_to_mtu(OPA_MTU_2048);
+       for (i = 0; i < num_vls; i++) {
+               /*
+                * Since this function does not deal with a specific
+                * receive context but we need the RcvHdrQ entry size,
+                * use the size from rcd[0]. It is guaranteed to be
+                * valid at this point and will remain the same for all
+                * receive contexts.
+                */
+               dd->vld[i].sc = sc_alloc(dd, SC_KERNEL,
+                                        dd->rcd[0]->rcvhdrqentsize, dd->node);
+               if (!dd->vld[i].sc)
+                       goto nomem;
+
+               hfi1_init_ctxt(dd->vld[i].sc);
+
+               /* non VL15 start with the max MTU */
+               dd->vld[i].mtu = hfi1_max_mtu;
+       }
+       sc_enable(dd->vld[15].sc);
+       ctxt = dd->vld[15].sc->hw_context;
+       mask = all_vl_mask & ~(1LL << 15);
+       write_kctxt_csr(dd, ctxt, SC(CHECK_VL), mask);
+       dd_dev_info(dd,
+                   "Using send context %u(%u) for VL15\n",
+                   dd->vld[15].sc->sw_index, ctxt);
+       for (i = 0; i < num_vls; i++) {
+               sc_enable(dd->vld[i].sc);
+               ctxt = dd->vld[i].sc->hw_context;
+               mask = all_vl_mask & ~(1LL << i);
+               write_kctxt_csr(dd, ctxt, SC(CHECK_VL), mask);
+       }
+       return 0;
+nomem:
+       sc_free(dd->vld[15].sc);
+       for (i = 0; i < num_vls; i++)
+               sc_free(dd->vld[i].sc);
+       return -ENOMEM;
+}
+
+int init_credit_return(struct hfi1_devdata *dd)
+{
+       int ret;
+       int num_numa;
+       int i;
+
+       num_numa = num_online_nodes();
+       /* enforce the expectation that online NUMA node IDs are compact */
+       for (i = 0; i < num_numa; i++) {
+               if (!node_online(i)) {
+                       dd_dev_err(dd, "NUMA nodes are not compact\n");
+                       ret = -EINVAL;
+                       goto done;
+               }
+       }
+
+       dd->cr_base = kcalloc(
+               num_numa,
+               sizeof(struct credit_return_base),
+               GFP_KERNEL);
+       if (!dd->cr_base) {
+               dd_dev_err(dd, "Unable to allocate credit return base\n");
+               ret = -ENOMEM;
+               goto done;
+       }
+       for (i = 0; i < num_numa; i++) {
+               int bytes = TXE_NUM_CONTEXTS * sizeof(struct credit_return);
+
+               set_dev_node(&dd->pcidev->dev, i);
+               dd->cr_base[i].va = dma_zalloc_coherent(
+                                       &dd->pcidev->dev,
+                                       bytes,
+                                       &dd->cr_base[i].pa,
+                                       GFP_KERNEL);
+               if (dd->cr_base[i].va == NULL) {
+                       set_dev_node(&dd->pcidev->dev, dd->node);
+                       dd_dev_err(dd,
+                               "Unable to allocate credit return DMA range for NUMA %d\n",
+                               i);
+                       ret = -ENOMEM;
+                       goto done;
+               }
+       }
+       set_dev_node(&dd->pcidev->dev, dd->node);
+
+       ret = 0;
+done:
+       return ret;
+}
+
+void free_credit_return(struct hfi1_devdata *dd)
+{
+       int num_numa;
+       int i;
+
+       if (!dd->cr_base)
+               return;
+
+       num_numa = num_online_nodes();
+       for (i = 0; i < num_numa; i++) {
+               if (dd->cr_base[i].va) {
+                       dma_free_coherent(&dd->pcidev->dev,
+                               TXE_NUM_CONTEXTS
+                                       * sizeof(struct credit_return),
+                               dd->cr_base[i].va,
+                               dd->cr_base[i].pa);
+               }
+       }
+       kfree(dd->cr_base);
+       dd->cr_base = NULL;
+}
diff --git a/drivers/staging/rdma/hfi1/pio.h b/drivers/staging/rdma/hfi1/pio.h
new file mode 100644
index 0000000..0bb885c
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/pio.h
@@ -0,0 +1,224 @@
+#ifndef _PIO_H
+#define _PIO_H
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+
+/* send context types */
+#define SC_KERNEL 0
+#define SC_ACK    1
+#define SC_USER   2
+#define SC_MAX    3
+
+/* invalid send context index */
+#define INVALID_SCI 0xff
+
+/* PIO buffer release callback function */
+typedef void (*pio_release_cb)(void *arg, int code);
+
+/* PIO release codes - in bits, as more than one may apply */
+#define PRC_OK         0       /* no known error */
+#define PRC_STATUS_ERR 0x01    /* credit return due to status error */
+#define PRC_PBC                0x02    /* credit return due to PBC */
+#define PRC_THRESHOLD  0x04    /* credit return due to threshold */
+#define PRC_FILL_ERR   0x08    /* credit return due to fill error */
+#define PRC_FORCE      0x10    /* credit return due to credit force */
+#define PRC_SC_DISABLE 0x20    /* clean-up after a context disable */
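+
+/*
+ * The release codes are bit flags, so a single release callback may report
+ * more than one cause at once, e.g. a credit return triggered by both the
+ * threshold and a forced return would see code == (PRC_THRESHOLD | PRC_FORCE).
+ */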
+
+/* byte helper */
+union mix {
+       u64 val64;
+       u32 val32[2];
+       u8  val8[8];
+};
+
+/* an allocated PIO buffer */
+struct pio_buf {
+       struct send_context *sc; /* back pointer to owning send context */
+       pio_release_cb cb;      /* called when the buffer is released */
+       void *arg;              /* argument for cb */
+       void __iomem *start;    /* buffer start address */
+       void __iomem *end;      /* context end address */
+       unsigned long size;     /* context size, in bytes */
+       unsigned long sent_at;  /* buffer is sent when <= free */
+       u32 block_count;        /* size of buffer, in blocks */
+       u32 qw_written;         /* QW written so far */
+       u32 carry_bytes;        /* number of valid bytes in carry */
+       union mix carry;        /* pending unwritten bytes */
+};
+
+/* cache line aligned pio buffer array */
+union pio_shadow_ring {
+       struct pio_buf pbuf;
+       u64 unused[16];         /* cache line spacer */
+} ____cacheline_aligned;
+
+/* per-NUMA send context */
+struct send_context {
+       /* read-only after init */
+       struct hfi1_devdata *dd;                /* device */
+       void __iomem *base_addr;        /* start of PIO memory */
+       union pio_shadow_ring *sr;      /* shadow ring */
+       volatile __le64 *hw_free;       /* HW free counter */
+       struct work_struct halt_work;   /* halted context work queue entry */
+       unsigned long flags;            /* flags */
+       int node;                       /* context home node */
+       int type;                       /* context type */
+       u32 sw_index;                   /* software index number */
+       u32 hw_context;                 /* hardware context number */
+       u32 credits;                    /* number of blocks in context */
+       u32 sr_size;                    /* size of the shadow ring */
+       u32 group;                      /* credit return group */
+       /* allocator fields */
+       spinlock_t alloc_lock ____cacheline_aligned_in_smp;
+       unsigned long fill;             /* official alloc count */
+       unsigned long alloc_free;       /* copy of free (less cache thrash) */
+       u32 sr_head;                    /* shadow ring head */
+       /* releaser fields */
+       spinlock_t release_lock ____cacheline_aligned_in_smp;
+       unsigned long free;             /* official free count */
+       u32 sr_tail;                    /* shadow ring tail */
+       /* list for PIO waiters */
+       struct list_head piowait  ____cacheline_aligned_in_smp;
+       spinlock_t credit_ctrl_lock ____cacheline_aligned_in_smp;
+       u64 credit_ctrl;                /* cache for credit control */
+       u32 credit_intr_count;          /* count of credit intr users */
+       atomic_t buffers_allocated;     /* count of buffers allocated */
+       wait_queue_head_t halt_wait;    /* wait until kernel sees interrupt */
+};
+
+/* send context flags */
+#define SCF_ENABLED 0x01
+#define SCF_IN_FREE 0x02
+#define SCF_HALTED  0x04
+#define SCF_FROZEN  0x08
+
+struct send_context_info {
+       struct send_context *sc;        /* allocated working context */
+       u16 allocated;                  /* has this been allocated? */
+       u16 type;                       /* context type */
+       u16 base;                       /* base in PIO array */
+       u16 credits;                    /* size in PIO array */
+};
+
+/* DMA credit return, index is always (context & 0x7) */
+struct credit_return {
+       volatile __le64 cr[8];
+};
+
+/* NUMA indexed credit return array */
+struct credit_return_base {
+       struct credit_return *va;
+       dma_addr_t pa;
+};
+
+/* send context configuration sizes (one per type) */
+struct sc_config_sizes {
+       short int size;
+       short int count;
+};
+
+/* send context functions */
+int init_credit_return(struct hfi1_devdata *dd);
+void free_credit_return(struct hfi1_devdata *dd);
+int init_sc_pools_and_sizes(struct hfi1_devdata *dd);
+int init_send_contexts(struct hfi1_devdata *dd);
+int init_pervl_scs(struct hfi1_devdata *dd);
+struct send_context *sc_alloc(struct hfi1_devdata *dd, int type,
+                             uint hdrqentsize, int numa);
+void sc_free(struct send_context *sc);
+int sc_enable(struct send_context *sc);
+void sc_disable(struct send_context *sc);
+int sc_restart(struct send_context *sc);
+void sc_return_credits(struct send_context *sc);
+void sc_flush(struct send_context *sc);
+void sc_drop(struct send_context *sc);
+void sc_stop(struct send_context *sc, int bit);
+struct pio_buf *sc_buffer_alloc(struct send_context *sc, u32 dw_len,
+                       pio_release_cb cb, void *arg);
+void sc_release_update(struct send_context *sc);
+void sc_group_release_update(struct hfi1_devdata *dd, u32 hw_context);
+void sc_add_credit_return_intr(struct send_context *sc);
+void sc_del_credit_return_intr(struct send_context *sc);
+void sc_set_cr_threshold(struct send_context *sc, u32 new_threshold);
+u32 sc_mtu_to_threshold(struct send_context *sc, u32 mtu, u32 hdrqentsize);
+void hfi1_sc_wantpiobuf_intr(struct send_context *sc, u32 needint);
+void sc_wait(struct hfi1_devdata *dd);
+void set_pio_integrity(struct send_context *sc);
+
+/* support functions */
+void pio_reset_all(struct hfi1_devdata *dd);
+void pio_freeze(struct hfi1_devdata *dd);
+void pio_kernel_unfreeze(struct hfi1_devdata *dd);
+
+/* global PIO send control operations */
+#define PSC_GLOBAL_ENABLE 0
+#define PSC_GLOBAL_DISABLE 1
+#define PSC_GLOBAL_VLARB_ENABLE 2
+#define PSC_GLOBAL_VLARB_DISABLE 3
+#define PSC_CM_RESET 4
+#define PSC_DATA_VL_ENABLE 5
+#define PSC_DATA_VL_DISABLE 6
+
+void __cm_reset(struct hfi1_devdata *dd, u64 sendctrl);
+void pio_send_control(struct hfi1_devdata *dd, int op);
+
+
+/* PIO copy routines */
+void pio_copy(struct hfi1_devdata *dd, struct pio_buf *pbuf, u64 pbc,
+             const void *from, size_t count);
+void seg_pio_copy_start(struct pio_buf *pbuf, u64 pbc,
+                                       const void *from, size_t nbytes);
+void seg_pio_copy_mid(struct pio_buf *pbuf, const void *from, size_t nbytes);
+void seg_pio_copy_end(struct pio_buf *pbuf);
+
+#endif /* _PIO_H */
diff --git a/drivers/staging/rdma/hfi1/pio_copy.c b/drivers/staging/rdma/hfi1/pio_copy.c
new file mode 100644 (file)
index 0000000..8972bbc
--- /dev/null
@@ -0,0 +1,858 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "hfi.h"
+
+/* additive distance between non-SOP and SOP space */
+#define SOP_DISTANCE (TXE_PIO_SIZE / 2)
+#define PIO_BLOCK_MASK (PIO_BLOCK_SIZE-1)
+/* number of QUADWORDs in a block */
+#define PIO_BLOCK_QWS (PIO_BLOCK_SIZE/sizeof(u64))
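+
+/*
+ * Rough illustration of the SOP addressing used below (the size is only an
+ * example, not the real TXE_PIO_SIZE): if the PIO space were 16 MB, then
+ * SOP_DISTANCE would be 8 MB, and a buffer whose first block starts at
+ * offset 0x1000 in normal (SOP=0) space has that block written at
+ * 0x1000 + SOP_DISTANCE in SOP=1 space; the copy then subtracts
+ * SOP_DISTANCE to continue in SOP=0 space.
+ */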
+
+/**
+ * pio_copy - copy data block to MMIO space
+ * @pbuf: a number of blocks allocated within a PIO send context
+ * @pbc: PBC to send
+ * @from: source, must be 8 byte aligned
+ * @count: number of DWORD (32-bit) quantities to copy from source
+ *
+ * Copy data from source to PIO Send Buffer memory, 8 bytes at a time.
+ * Must always write full BLOCK_SIZE-byte blocks.  The first block must
+ * be written to the corresponding SOP=1 address.
+ *
+ * Known:
+ * o pbuf->start always starts on a block boundary
+ * o pbuf can wrap only at a block boundary
+ */
+void pio_copy(struct hfi1_devdata *dd, struct pio_buf *pbuf, u64 pbc,
+             const void *from, size_t count)
+{
+       void __iomem *dest = pbuf->start + SOP_DISTANCE;
+       void __iomem *send = dest + PIO_BLOCK_SIZE;
+       void __iomem *dend;                     /* 8-byte data end */
+
+       /* write the PBC */
+       writeq(pbc, dest);
+       dest += sizeof(u64);
+
+       /* calculate where the QWORD data ends - in SOP=1 space */
+       dend = dest + ((count>>1) * sizeof(u64));
+
+       if (dend < send) {
+               /* all QWORD data is within the SOP block, does *not*
+                  reach the end of the SOP block */
+
+               while (dest < dend) {
+                       writeq(*(u64 *)from, dest);
+                       from += sizeof(u64);
+                       dest += sizeof(u64);
+               }
+               /*
+                * No boundary checks are needed here:
+                * 0. We're not on the SOP block boundary
+                * 1. The possible DWORD dangle will still be within
+                *    the SOP block
+                * 2. We cannot wrap except on a block boundary.
+                */
+       } else {
+               /* QWORD data extends _to_ or beyond the SOP block */
+
+               /* write 8-byte SOP chunk data */
+               while (dest < send) {
+                       writeq(*(u64 *)from, dest);
+                       from += sizeof(u64);
+                       dest += sizeof(u64);
+               }
+               /* drop out of the SOP range */
+               dest -= SOP_DISTANCE;
+               dend -= SOP_DISTANCE;
+
+               /*
+                * If the wrap comes before or matches the data end,
+                * copy until the wrap, then wrap.
+                *
+                * If the data ends at the end of the SOP above and
+                * the buffer wraps, then pbuf->end == dend == dest
+                * and nothing will get written, but we will wrap in
+                * case there is a dangling DWORD.
+                */
+               if (pbuf->end <= dend) {
+                       while (dest < pbuf->end) {
+                               writeq(*(u64 *)from, dest);
+                               from += sizeof(u64);
+                               dest += sizeof(u64);
+                       }
+
+                       dest -= pbuf->size;
+                       dend -= pbuf->size;
+               }
+
+               /* write 8-byte non-SOP, non-wrap chunk data */
+               while (dest < dend) {
+                       writeq(*(u64 *)from, dest);
+                       from += sizeof(u64);
+                       dest += sizeof(u64);
+               }
+       }
+       /* at this point we have wrapped if we are going to wrap */
+
+       /* write dangling u32, if any */
+       if (count & 1) {
+               union mix val;
+
+               val.val64 = 0;
+               val.val32[0] = *(u32 *)from;
+               writeq(val.val64, dest);
+               dest += sizeof(u64);
+       }
+       /* fill in rest of block, no need to check pbuf->end
+          as we only wrap on a block boundary */
+       while (((unsigned long)dest & PIO_BLOCK_MASK) != 0) {
+               writeq(0, dest);
+               dest += sizeof(u64);
+       }
+
+       /* finished with this buffer */
+       atomic_dec(&pbuf->sc->buffers_allocated);
+}
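+
+/*
+ * Sizing example for pio_copy() above: count is in DWORDs, so count == 9
+ * means four full QWORDs are copied from the source, the dangling DWORD is
+ * written zero-extended to a QWORD, and the rest of the final block is
+ * zero-filled up to the next PIO_BLOCK_SIZE boundary.
+ */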
+
+/* USE_SHIFTS is faster in user-space tests on a Xeon X5570 @ 2.93GHz */
+#define USE_SHIFTS 1
+#ifdef USE_SHIFTS
+/*
+ * Handle carry bytes using shifts and masks.
+ *
+ * NOTE: the unused portion of carry is expected to always be zero.
+ */
+
+/*
+ * "zero" shift - bit shift used to zero out upper bytes.  Input is
+ * the count of LSB bytes to preserve.
+ */
+#define zshift(x) (8 * (8-(x)))
+
+/*
+ * "merge" shift - bit shift used to merge with carry bytes.  Input is
+ * the LSB byte count to move beyond.
+ */
+#define mshift(x) (8 * (x))
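+
+/*
+ * Worked example of the two shifts (assuming the little-endian byte order
+ * these helpers rely on): to pull 3 bytes that start 2 bytes into an
+ * aligned u64, read_low_bytes() computes
+ *     (val << zshift(3 + 2)) >> zshift(3)  ==  (val << 24) >> 40
+ * The left shift discards the 3 unwanted upper bytes, the right shift
+ * drops the 2 unwanted lower bytes, and the 3 wanted bytes land in the
+ * LSBs with everything above them zeroed.
+ */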
+
+/*
+ * Read nbytes bytes from "from" and return them in the LSB bytes
+ * of pbuf->carry.  Other bytes are zeroed.  Any previous value
+ * in pbuf->carry is lost.
+ *
+ * NOTES:
+ * o do not read from "from" if nbytes is zero
+ * o from may _not_ be u64 aligned
+ * o nbytes must not span a QW boundary
+ */
+static inline void read_low_bytes(struct pio_buf *pbuf, const void *from,
+                                                       unsigned int nbytes)
+{
+       unsigned long off;
+
+       if (nbytes == 0) {
+               pbuf->carry.val64 = 0;
+       } else {
+               /* align our pointer */
+               off = (unsigned long)from & 0x7;
+               from = (void *)((unsigned long)from & ~0x7l);
+               pbuf->carry.val64 = ((*(u64 *)from)
+                               << zshift(nbytes + off))/* zero upper bytes */
+                               >> zshift(nbytes);      /* place at bottom */
+       }
+       pbuf->carry_bytes = nbytes;
+}
+
+/*
+ * Read nbytes bytes from "from" and put them at the next significant bytes
+ * of pbuf->carry.  Unused bytes are zeroed.  It is expected that the extra
+ * read does not overfill carry.
+ *
+ * NOTES:
+ * o from may _not_ be u64 aligned
+ * o nbytes may span a QW boundary
+ */
+static inline void read_extra_bytes(struct pio_buf *pbuf,
+                                       const void *from, unsigned int nbytes)
+{
+       unsigned long off = (unsigned long)from & 0x7;
+       unsigned int room, xbytes;
+
+       /* align our pointer */
+       from = (void *)((unsigned long)from & ~0x7l);
+
+       /* check count first - don't read anything if count is zero */
+       while (nbytes) {
+               /* find the number of bytes in this u64 */
+               room = 8 - off; /* this u64 has room for this many bytes */
+               xbytes = nbytes > room ? room : nbytes;
+
+               /*
+                * shift down to zero lower bytes, shift up to zero upper
+                * bytes, shift back down to move into place
+                */
+               pbuf->carry.val64 |= (((*(u64 *)from)
+                                       >> mshift(off))
+                                       << zshift(xbytes))
+                                       >> zshift(xbytes+pbuf->carry_bytes);
+               off = 0;
+               pbuf->carry_bytes += xbytes;
+               nbytes -= xbytes;
+               from += sizeof(u64);
+       }
+}
+
+/*
+ * Zero extra bytes from the end of pbuf->carry.
+ *
+ * NOTES:
+ * o zbytes <= old_bytes
+ */
+static inline void zero_extra_bytes(struct pio_buf *pbuf, unsigned int zbytes)
+{
+       unsigned int remaining;
+
+       if (zbytes == 0)        /* nothing to do */
+               return;
+
+       remaining = pbuf->carry_bytes - zbytes; /* remaining bytes */
+
+       /* NOTE: zshift only guaranteed to work if remaining != 0 */
+       if (remaining)
+               pbuf->carry.val64 = (pbuf->carry.val64 << zshift(remaining))
+                                       >> zshift(remaining);
+       else
+               pbuf->carry.val64 = 0;
+       pbuf->carry_bytes = remaining;
+}
+
+/*
+ * Write a quad word using parts of pbuf->carry and the next 8 bytes of src.
+ * Put the unused part of the next 8 bytes of src into the LSB bytes of
+ * pbuf->carry with the upper bytes zeroed.
+ *
+ * NOTES:
+ * o result must keep unused bytes zeroed
+ * o src must be u64 aligned
+ */
+static inline void merge_write8(
+       struct pio_buf *pbuf,
+       void __iomem *dest,
+       const void *src)
+{
+       u64 new, temp;
+
+       new = *(u64 *)src;
+       temp = pbuf->carry.val64 | (new << mshift(pbuf->carry_bytes));
+       writeq(temp, dest);
+       pbuf->carry.val64 = new >> zshift(pbuf->carry_bytes);
+}
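+
+/*
+ * Example (little-endian, carry_bytes == 3): carry holds bytes A0 A1 A2 in
+ * its LSBs.  Given src bytes B0..B7, the QWORD written is
+ * A0 A1 A2 B0 B1 B2 B3 B4 (new << mshift(3) merged over carry), and the
+ * leftover B5 B6 B7 become the new carry via new >> zshift(3);
+ * carry_bytes stays 3.
+ */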
+
+/*
+ * Write a quad word using all bytes of carry.
+ */
+static inline void carry8_write8(union mix carry, void __iomem *dest)
+{
+       writeq(carry.val64, dest);
+}
+
+/*
+ * Write a quad word using all the valid bytes of carry.  If carry
+ * has zero valid bytes, nothing is written.
+ * Returns 0 on nothing written, non-zero on quad word written.
+ */
+static inline int carry_write8(struct pio_buf *pbuf, void __iomem *dest)
+{
+       if (pbuf->carry_bytes) {
+               /* unused bytes are always kept zeroed, so just write */
+               writeq(pbuf->carry.val64, dest);
+               return 1;
+       }
+
+       return 0;
+}
+
+#else /* USE_SHIFTS */
+/*
+ * Handle carry bytes using byte copies.
+ *
+ * NOTE: the unused portion of carry is left uninitialized.
+ */
+
+/*
+ * Jump copy - no-loop copy for < 8 bytes.
+ */
+static inline void jcopy(u8 *dest, const u8 *src, u32 n)
+{
+       switch (n) {
+       case 7:
+               *dest++ = *src++;
+       case 6:
+               *dest++ = *src++;
+       case 5:
+               *dest++ = *src++;
+       case 4:
+               *dest++ = *src++;
+       case 3:
+               *dest++ = *src++;
+       case 2:
+               *dest++ = *src++;
+       case 1:
+               *dest++ = *src++;
+       }
+}
+
+/*
+ * Read nbytes from "from" and place them in the low bytes
+ * of pbuf->carry.  Other bytes are left as-is.  Any previous
+ * value in pbuf->carry is lost.
+ *
+ * NOTES:
+ * o do not read from "from" if nbytes is zero
+ * o from may _not_ be u64 aligned.
+ */
+static inline void read_low_bytes(struct pio_buf *pbuf, const void *from,
+                                                       unsigned int nbytes)
+{
+       jcopy(&pbuf->carry.val8[0], from, nbytes);
+       pbuf->carry_bytes = nbytes;
+}
+
+/*
+ * Read nbytes bytes from "from" and put them at the end of pbuf->carry.
+ * It is expected that the extra read does not overfill carry.
+ *
+ * NOTES:
+ * o from may _not_ be u64 aligned
+ * o nbytes may span a QW boundary
+ */
+static inline void read_extra_bytes(struct pio_buf *pbuf,
+                                       const void *from, unsigned int nbytes)
+{
+       jcopy(&pbuf->carry.val8[pbuf->carry_bytes], from, nbytes);
+       pbuf->carry_bytes += nbytes;
+}
+
+/*
+ * Zero extra bytes from the end of pbuf->carry.
+ *
+ * We do not care about the value of unused bytes in carry, so just
+ * reduce the byte count.
+ *
+ * NOTES:
+ * o zbytes <= old_bytes
+ */
+static inline void zero_extra_bytes(struct pio_buf *pbuf, unsigned int zbytes)
+{
+       pbuf->carry_bytes -= zbytes;
+}
+
+/*
+ * Write a quad word using parts of pbuf->carry and the next 8 bytes of src.
+ * Put the unused part of the next 8 bytes of src into the low bytes of
+ * pbuf->carry.
+ */
+static inline void merge_write8(
+       struct pio_buf *pbuf,
+       void *dest,
+       const void *src)
+{
+       u32 remainder = 8 - pbuf->carry_bytes;
+
+       jcopy(&pbuf->carry.val8[pbuf->carry_bytes], src, remainder);
+       writeq(pbuf->carry.val64, dest);
+       jcopy(&pbuf->carry.val8[0], src+remainder, pbuf->carry_bytes);
+}
+
+/*
+ * Write a quad word using all bytes of carry.
+ */
+static inline void carry8_write8(union mix carry, void *dest)
+{
+       writeq(carry.val64, dest);
+}
+
+/*
+ * Write a quad word using all the valid bytes of carry.  If carry
+ * has zero valid bytes, nothing is written.
+ * Returns 0 on nothing written, non-zero on quad word written.
+ */
+static inline int carry_write8(struct pio_buf *pbuf, void *dest)
+{
+       if (pbuf->carry_bytes) {
+               u64 zero = 0;
+
+               jcopy(&pbuf->carry.val8[pbuf->carry_bytes], (u8 *)&zero,
+                                               8 - pbuf->carry_bytes);
+               writeq(pbuf->carry.val64, dest);
+               return 1;
+       }
+
+       return 0;
+}
+#endif /* USE_SHIFTS */
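+
+/*
+ * Expected call sequence for the segmented copy routines below (a sketch
+ * inferred from their interfaces): one seg_pio_copy_start() with the PBC
+ * and a QWORD-aligned first chunk, zero or more seg_pio_copy_mid() calls
+ * with chunks of any alignment (partial QWORDs accumulate in pbuf->carry
+ * between calls), then one seg_pio_copy_end() to flush any carry bytes
+ * and zero-fill the remainder of the last block.
+ */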
+
+/*
+ * Segmented PIO Copy - start
+ *
+ * Start a PIO copy.
+ *
+ * @pbuf: destination buffer
+ * @pbc: the PBC for the PIO buffer
+ * @from: data source, QWORD aligned
+ * @nbytes: bytes to copy
+ */
+void seg_pio_copy_start(struct pio_buf *pbuf, u64 pbc,
+                               const void *from, size_t nbytes)
+{
+       void __iomem *dest = pbuf->start + SOP_DISTANCE;
+       void __iomem *send = dest + PIO_BLOCK_SIZE;
+       void __iomem *dend;                     /* 8-byte data end */
+
+       writeq(pbc, dest);
+       dest += sizeof(u64);
+
+       /* calculate where the QWORD data ends - in SOP=1 space */
+       dend = dest + ((nbytes>>3) * sizeof(u64));
+
+       if (dend < send) {
+               /* all QWORD data is within the SOP block, does *not*
+                  reach the end of the SOP block */
+
+               while (dest < dend) {
+                       writeq(*(u64 *)from, dest);
+                       from += sizeof(u64);
+                       dest += sizeof(u64);
+               }
+               /*
+                * No boundary checks are needed here:
+                * 0. We're not on the SOP block boundary
+                * 1. The possible DWORD dangle will still be within
+                *    the SOP block
+                * 2. We cannot wrap except on a block boundary.
+                */
+       } else {
+               /* QWORD data extends _to_ or beyond the SOP block */
+
+               /* write 8-byte SOP chunk data */
+               while (dest < send) {
+                       writeq(*(u64 *)from, dest);
+                       from += sizeof(u64);
+                       dest += sizeof(u64);
+               }
+               /* drop out of the SOP range */
+               dest -= SOP_DISTANCE;
+               dend -= SOP_DISTANCE;
+
+               /*
+                * If the wrap comes before or matches the data end,
+                * copy until the wrap, then wrap.
+                *
+                * If the data ends at the end of the SOP above and
+                * the buffer wraps, then pbuf->end == dend == dest
+                * and nothing will get written, but we will wrap in
+                * case there is a dangling DWORD.
+                */
+               if (pbuf->end <= dend) {
+                       while (dest < pbuf->end) {
+                               writeq(*(u64 *)from, dest);
+                               from += sizeof(u64);
+                               dest += sizeof(u64);
+                       }
+
+                       dest -= pbuf->size;
+                       dend -= pbuf->size;
+               }
+
+               /* write 8-byte non-SOP, non-wrap chunk data */
+               while (dest < dend) {
+                       writeq(*(u64 *)from, dest);
+                       from += sizeof(u64);
+                       dest += sizeof(u64);
+               }
+       }
+       /* at this point we have wrapped if we are going to wrap */
+
+       /* ...but it doesn't matter as we're done writing */
+
+       /* save dangling bytes, if any */
+       read_low_bytes(pbuf, from, nbytes & 0x7);
+
+       pbuf->qw_written = 1 /*PBC*/ + (nbytes >> 3);
+}
+
+/*
+ * Mid copy helper, "mixed case" - source is 64-bit aligned but carry
+ * bytes are non-zero.
+ *
+ * Whole u64s must be written to the chip, so bytes must be manually merged.
+ *
+ * @pbuf: destination buffer
+ * @from: data source, is QWORD aligned.
+ * @nbytes: bytes to copy
+ *
+ * Must handle nbytes < 8.
+ */
+static void mid_copy_mix(struct pio_buf *pbuf, const void *from, size_t nbytes)
+{
+       void __iomem *dest = pbuf->start + (pbuf->qw_written * sizeof(u64));
+       void __iomem *dend;                     /* 8-byte data end */
+       unsigned long qw_to_write = (pbuf->carry_bytes + nbytes) >> 3;
+       unsigned long bytes_left = (pbuf->carry_bytes + nbytes) & 0x7;
+
+       /* calculate 8-byte data end */
+       dend = dest + (qw_to_write * sizeof(u64));
+
+       if (pbuf->qw_written < PIO_BLOCK_QWS) {
+               /*
+                * Still within SOP block.  We don't need to check for
+                * wrap because we are still in the first block and
+                * can only wrap on block boundaries.
+                */
+               void __iomem *send;             /* SOP end */
+               void __iomem *xend;
+
+               /* calculate the end of data or end of block, whichever
+                  comes first */
+               send = pbuf->start + PIO_BLOCK_SIZE;
+               xend = send < dend ? send : dend;
+
+               /* shift up to SOP=1 space */
+               dest += SOP_DISTANCE;
+               xend += SOP_DISTANCE;
+
+               /* write 8-byte chunk data */
+               while (dest < xend) {
+                       merge_write8(pbuf, dest, from);
+                       from += sizeof(u64);
+                       dest += sizeof(u64);
+               }
+
+               /* shift down to SOP=0 space */
+               dest -= SOP_DISTANCE;
+       }
+       /*
+        * At this point dest could be (either, both, or neither):
+        * - at dend
+        * - at the wrap
+        */
+
+       /*
+        * If the wrap comes before or matches the data end,
+        * copy until the wrap, then wrap.
+        *
+        * If dest is at the wrap, we will fall into the if,
+        * not do the loop, and just wrap.
+        *
+        * If the data ends at the end of the SOP above and
+        * the buffer wraps, then pbuf->end == dend == dest
+        * and nothing will get written.
+        */
+       if (pbuf->end <= dend) {
+               while (dest < pbuf->end) {
+                       merge_write8(pbuf, dest, from);
+                       from += sizeof(u64);
+                       dest += sizeof(u64);
+               }
+
+               dest -= pbuf->size;
+               dend -= pbuf->size;
+       }
+
+       /* write 8-byte non-SOP, non-wrap chunk data */
+       while (dest < dend) {
+               merge_write8(pbuf, dest, from);
+               from += sizeof(u64);
+               dest += sizeof(u64);
+       }
+
+       /* adjust carry */
+       if (pbuf->carry_bytes < bytes_left) {
+               /* need to read more */
+               read_extra_bytes(pbuf, from, bytes_left - pbuf->carry_bytes);
+       } else {
+               /* remove invalid bytes */
+               zero_extra_bytes(pbuf, pbuf->carry_bytes - bytes_left);
+       }
+
+       pbuf->qw_written += qw_to_write;
+}
+
+/*
+ * Mid copy helper, "straight case" - source pointer is 64-bit aligned
+ * with no carry bytes.
+ *
+ * @pbuf: destination buffer
+ * @from: data source, is QWORD aligned
+ * @nbytes: bytes to copy
+ *
+ * Must handle nbytes < 8.
+ */
+static void mid_copy_straight(struct pio_buf *pbuf,
+                                               const void *from, size_t nbytes)
+{
+       void __iomem *dest = pbuf->start + (pbuf->qw_written * sizeof(u64));
+       void __iomem *dend;                     /* 8-byte data end */
+
+       /* calculate 8-byte data end */
+       dend = dest + ((nbytes>>3) * sizeof(u64));
+
+       if (pbuf->qw_written < PIO_BLOCK_QWS) {
+               /*
+                * Still within SOP block.  We don't need to check for
+                * wrap because we are still in the first block and
+                * can only wrap on block boundaries.
+                */
+               void __iomem *send;             /* SOP end */
+               void __iomem *xend;
+
+               /* calculate the end of data or end of block, whichever
+                  comes first */
+               send = pbuf->start + PIO_BLOCK_SIZE;
+               xend = send < dend ? send : dend;
+
+               /* shift up to SOP=1 space */
+               dest += SOP_DISTANCE;
+               xend += SOP_DISTANCE;
+
+               /* write 8-byte chunk data */
+               while (dest < xend) {
+                       writeq(*(u64 *)from, dest);
+                       from += sizeof(u64);
+                       dest += sizeof(u64);
+               }
+
+               /* shift down to SOP=0 space */
+               dest -= SOP_DISTANCE;
+       }
+       /*
+        * At this point dest could be (either, both, or neither):
+        * - at dend
+        * - at the wrap
+        */
+
+       /*
+        * If the wrap comes before or matches the data end,
+        * copy until the wrap, then wrap.
+        *
+        * If dest is at the wrap, we will fall into the if,
+        * not do the loop, and just wrap.
+        *
+        * If the data ends at the end of the SOP above and
+        * the buffer wraps, then pbuf->end == dend == dest
+        * and nothing will get written.
+        */
+       if (pbuf->end <= dend) {
+               while (dest < pbuf->end) {
+                       writeq(*(u64 *)from, dest);
+                       from += sizeof(u64);
+                       dest += sizeof(u64);
+               }
+
+               dest -= pbuf->size;
+               dend -= pbuf->size;
+       }
+
+       /* write 8-byte non-SOP, non-wrap chunk data */
+       while (dest < dend) {
+               writeq(*(u64 *)from, dest);
+               from += sizeof(u64);
+               dest += sizeof(u64);
+       }
+
+       /* we know carry_bytes was zero on entry to this routine */
+       read_low_bytes(pbuf, from, nbytes & 0x7);
+
+       pbuf->qw_written += nbytes>>3;
+}
+
+/*
+ * Segmented PIO Copy - middle
+ *
+ * Must handle any carried tail bytes and any source alignment, with any byte count.
+ *
+ * @pbuf: a number of blocks allocated within a PIO send context
+ * @from: data source
+ * @nbytes: number of bytes to copy
+ */
+void seg_pio_copy_mid(struct pio_buf *pbuf, const void *from, size_t nbytes)
+{
+       unsigned long from_align = (unsigned long)from & 0x7;
+
+       if (pbuf->carry_bytes + nbytes < 8) {
+               /* not enough bytes to fill a QW */
+               read_extra_bytes(pbuf, from, nbytes);
+               return;
+       }
+
+       if (from_align) {
+               /* misaligned source pointer - align it */
+               unsigned long to_align;
+
+               /* bytes to read to align "from" */
+               to_align = 8 - from_align;
+
+               /*
+                * In the advance-to-alignment logic below, we do not need
+                * to check if we are using more than nbytes.  This is because
+                * if we are here, we already know that carry+nbytes will
+                * fill at least one QW.
+                */
+               if (pbuf->carry_bytes + to_align < 8) {
+                       /* not enough align bytes to fill a QW */
+                       read_extra_bytes(pbuf, from, to_align);
+                       from += to_align;
+                       nbytes -= to_align;
+               } else {
+                       /* bytes to fill carry */
+                       unsigned long to_fill = 8 - pbuf->carry_bytes;
+                       /* bytes left over to be read */
+                       unsigned long extra = to_align - to_fill;
+                       void __iomem *dest;
+
+                       /* fill carry... */
+                       read_extra_bytes(pbuf, from, to_fill);
+                       from += to_fill;
+                       nbytes -= to_fill;
+
+                       /* ...now write carry */
+                       dest = pbuf->start + (pbuf->qw_written * sizeof(u64));
+
+                       /*
+                        * The two checks immediately below cannot both be
+                        * true, hence the else.  If we have wrapped, we
+                        * cannot still be within the first block.
+                        * Conversely, if we are still in the first block, we
+                        * cannot have wrapped.  We do the wrap check first
+                        * as that is more likely.
+                        */
+                       /* adjust if we've wrapped */
+                       if (dest >= pbuf->end)
+                               dest -= pbuf->size;
+                       /* jump to SOP range if within the first block */
+                       else if (pbuf->qw_written < PIO_BLOCK_QWS)
+                               dest += SOP_DISTANCE;
+
+                       carry8_write8(pbuf->carry, dest);
+                       pbuf->qw_written++;
+
+                       /* read any extra bytes to do final alignment */
+                       /* this will overwrite anything in pbuf->carry */
+                       read_low_bytes(pbuf, from, extra);
+                       from += extra;
+                       nbytes -= extra;
+               }
+
+               /* at this point, from is QW aligned */
+       }
+
+       if (pbuf->carry_bytes)
+               mid_copy_mix(pbuf, from, nbytes);
+       else
+               mid_copy_straight(pbuf, from, nbytes);
+}
+
+/*
+ * Segmented PIO Copy - end
+ *
+ * Write any remainder (in pbuf->carry) and finish writing the whole block.
+ *
+ * @pbuf: a number of blocks allocated within a PIO send context
+ */
+void seg_pio_copy_end(struct pio_buf *pbuf)
+{
+       void __iomem *dest = pbuf->start + (pbuf->qw_written * sizeof(u64));
+
+       /*
+        * The two checks immediately below cannot both be true, hence the
+        * else.  If we have wrapped, we cannot still be within the first
+        * block.  Conversely, if we are still in the first block, we
+        * cannot have wrapped.  We do the wrap check first as that is
+        * more likely.
+        */
+       /* adjust if we have wrapped */
+       if (dest >= pbuf->end)
+               dest -= pbuf->size;
+       /* jump to the SOP range if within the first block */
+       else if (pbuf->qw_written < PIO_BLOCK_QWS)
+               dest += SOP_DISTANCE;
+
+       /* write final bytes, if any */
+       if (carry_write8(pbuf, dest)) {
+               dest += sizeof(u64);
+               /*
+                * NOTE: We do not need to recalculate whether dest needs
+                * SOP_DISTANCE or not.
+                *
+                * If we are in the first block and the dangle write
+                * keeps us in the same block, dest will need
+                * to retain SOP_DISTANCE in the loop below.
+                *
+                * If we are in the first block and the dangle write pushes
+                * us to the next block, then loop below will not run
+                * and dest is not used.  Hence we do not need to update
+                * it.
+                *
+                * If we are past the first block, then SOP_DISTANCE
+                * was never added, so there is nothing to do.
+                */
+       }
+
+       /* fill in rest of block */
+       while (((unsigned long)dest & PIO_BLOCK_MASK) != 0) {
+               writeq(0, dest);
+               dest += sizeof(u64);
+       }
+
+       /* finished with this buffer */
+       atomic_dec(&pbuf->sc->buffers_allocated);
+}
diff --git a/drivers/staging/rdma/hfi1/platform_config.h b/drivers/staging/rdma/hfi1/platform_config.h
new file mode 100644 (file)
index 0000000..8a94a83
--- /dev/null
@@ -0,0 +1,286 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#ifndef __PLATFORM_CONFIG_H
+#define __PLATFORM_CONFIG_H
+
+#define METADATA_TABLE_FIELD_START_SHIFT               0
+#define METADATA_TABLE_FIELD_START_LEN_BITS            15
+#define METADATA_TABLE_FIELD_LEN_SHIFT                 16
+#define METADATA_TABLE_FIELD_LEN_LEN_BITS              16
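+
+/*
+ * Illustrative decode of a metadata word using the shift/width values
+ * above (the variable names are examples only):
+ *     start = (meta >> METADATA_TABLE_FIELD_START_SHIFT) & 0x7fff;  (15 bits)
+ *     len   = (meta >> METADATA_TABLE_FIELD_LEN_SHIFT) & 0xffff;    (16 bits)
+ */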
+
+/* Header structure */
+#define PLATFORM_CONFIG_HEADER_RECORD_IDX_SHIFT                        0
+#define PLATFORM_CONFIG_HEADER_RECORD_IDX_LEN_BITS             6
+#define PLATFORM_CONFIG_HEADER_TABLE_LENGTH_SHIFT              16
+#define PLATFORM_CONFIG_HEADER_TABLE_LENGTH_LEN_BITS           12
+#define PLATFORM_CONFIG_HEADER_TABLE_TYPE_SHIFT                        28
+#define PLATFORM_CONFIG_HEADER_TABLE_TYPE_LEN_BITS             4
+
+enum platform_config_table_type_encoding {
+       PLATFORM_CONFIG_TABLE_RESERVED,
+       PLATFORM_CONFIG_SYSTEM_TABLE,
+       PLATFORM_CONFIG_PORT_TABLE,
+       PLATFORM_CONFIG_RX_PRESET_TABLE,
+       PLATFORM_CONFIG_TX_PRESET_TABLE,
+       PLATFORM_CONFIG_QSFP_ATTEN_TABLE,
+       PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE,
+       PLATFORM_CONFIG_TABLE_MAX
+};
+
+enum platform_config_system_table_fields {
+       SYSTEM_TABLE_RESERVED,
+       SYSTEM_TABLE_NODE_STRING,
+       SYSTEM_TABLE_SYSTEM_IMAGE_GUID,
+       SYSTEM_TABLE_NODE_GUID,
+       SYSTEM_TABLE_REVISION,
+       SYSTEM_TABLE_VENDOR_OUI,
+       SYSTEM_TABLE_META_VERSION,
+       SYSTEM_TABLE_DEVICE_ID,
+       SYSTEM_TABLE_PARTITION_ENFORCEMENT_CAP,
+       SYSTEM_TABLE_QSFP_POWER_CLASS_MAX,
+       SYSTEM_TABLE_QSFP_ATTENUATION_DEFAULT_12G,
+       SYSTEM_TABLE_QSFP_ATTENUATION_DEFAULT_25G,
+       SYSTEM_TABLE_VARIABLE_TABLE_ENTRIES_PER_PORT,
+       SYSTEM_TABLE_MAX
+};
+
+enum platform_config_port_table_fields {
+       PORT_TABLE_RESERVED,
+       PORT_TABLE_PORT_TYPE,
+       PORT_TABLE_ATTENUATION_12G,
+       PORT_TABLE_ATTENUATION_25G,
+       PORT_TABLE_LINK_SPEED_SUPPORTED,
+       PORT_TABLE_LINK_WIDTH_SUPPORTED,
+       PORT_TABLE_VL_CAP,
+       PORT_TABLE_MTU_CAP,
+       PORT_TABLE_TX_LANE_ENABLE_MASK,
+       PORT_TABLE_LOCAL_MAX_TIMEOUT,
+       PORT_TABLE_AUTO_LANE_SHEDDING_ENABLED,
+       PORT_TABLE_EXTERNAL_LOOPBACK_ALLOWED,
+       PORT_TABLE_TX_PRESET_IDX_PASSIVE_CU,
+       PORT_TABLE_TX_PRESET_IDX_ACTIVE_NO_EQ,
+       PORT_TABLE_TX_PRESET_IDX_ACTIVE_EQ,
+       PORT_TABLE_RX_PRESET_IDX,
+       PORT_TABLE_CABLE_REACH_CLASS,
+       PORT_TABLE_MAX
+};
+
+enum platform_config_rx_preset_table_fields {
+       RX_PRESET_TABLE_RESERVED,
+       RX_PRESET_TABLE_QSFP_RX_CDR_APPLY,
+       RX_PRESET_TABLE_QSFP_RX_EQ_APPLY,
+       RX_PRESET_TABLE_QSFP_RX_AMP_APPLY,
+       RX_PRESET_TABLE_QSFP_RX_CDR,
+       RX_PRESET_TABLE_QSFP_RX_EQ,
+       RX_PRESET_TABLE_QSFP_RX_AMP,
+       RX_PRESET_TABLE_MAX
+};
+
+enum platform_config_tx_preset_table_fields {
+       TX_PRESET_TABLE_RESERVED,
+       TX_PRESET_TABLE_PRECUR,
+       TX_PRESET_TABLE_ATTN,
+       TX_PRESET_TABLE_POSTCUR,
+       TX_PRESET_TABLE_QSFP_TX_CDR_APPLY,
+       TX_PRESET_TABLE_QSFP_TX_EQ_APPLY,
+       TX_PRESET_TABLE_QSFP_TX_CDR,
+       TX_PRESET_TABLE_QSFP_TX_EQ,
+       TX_PRESET_TABLE_MAX
+};
+
+enum platform_config_qsfp_attn_table_fields {
+       QSFP_ATTEN_TABLE_RESERVED,
+       QSFP_ATTEN_TABLE_TX_PRESET_IDX,
+       QSFP_ATTEN_TABLE_RX_PRESET_IDX,
+       QSFP_ATTEN_TABLE_MAX
+};
+
+enum platform_config_variable_settings_table_fields {
+       VARIABLE_SETTINGS_TABLE_RESERVED,
+       VARIABLE_SETTINGS_TABLE_TX_PRESET_IDX,
+       VARIABLE_SETTINGS_TABLE_RX_PRESET_IDX,
+       VARIABLE_SETTINGS_TABLE_MAX
+};
+
+struct platform_config_data {
+       u32 *table;
+       u32 *table_metadata;
+       u32 num_table;
+};
+
+/*
+ * This struct acts as a quick reference into the platform_data binary image
+ * and is populated by parse_platform_config(...) depending on the specific
+ * META_VERSION
+ */
+struct platform_config_cache {
+       u8  cache_valid;
+       struct platform_config_data config_tables[PLATFORM_CONFIG_TABLE_MAX];
+};
+
+static const u32 platform_config_table_limits[PLATFORM_CONFIG_TABLE_MAX] = {
+       0,
+       SYSTEM_TABLE_MAX,
+       PORT_TABLE_MAX,
+       RX_PRESET_TABLE_MAX,
+       TX_PRESET_TABLE_MAX,
+       QSFP_ATTEN_TABLE_MAX,
+       VARIABLE_SETTINGS_TABLE_MAX
+};
+
+/* This section defines default values and encodings for the
+ * fields defined for each table above
+ */
+
+/*=====================================================
+ *  System table encodings
+ *====================================================*/
+#define PLATFORM_CONFIG_MAGIC_NUM              0x3d4f5041
+#define PLATFORM_CONFIG_MAGIC_NUMBER_LEN       4
+
+/*
+ * These power classes are the same as defined in SFF 8636 spec rev 2.4
+ * describing byte 129 in table 6-16, except enumerated in a different order
+ */
+enum platform_config_qsfp_power_class_encoding {
+       QSFP_POWER_CLASS_1 = 1,
+       QSFP_POWER_CLASS_2,
+       QSFP_POWER_CLASS_3,
+       QSFP_POWER_CLASS_4,
+       QSFP_POWER_CLASS_5,
+       QSFP_POWER_CLASS_6,
+       QSFP_POWER_CLASS_7
+};
+
+
+/*=====================================================
+ *  Port table encodings
+ *==================================================== */
+enum platform_config_port_type_encoding {
+       PORT_TYPE_RESERVED,
+       PORT_TYPE_DISCONNECTED,
+       PORT_TYPE_FIXED,
+       PORT_TYPE_VARIABLE,
+       PORT_TYPE_QSFP,
+       PORT_TYPE_MAX
+};
+
+enum platform_config_link_speed_supported_encoding {
+       LINK_SPEED_SUPP_12G = 1,
+       LINK_SPEED_SUPP_25G,
+       LINK_SPEED_SUPP_12G_25G,
+       LINK_SPEED_SUPP_MAX
+};
+
+/*
+ * This is a subset (not strict) of the link downgrades
+ * supported. The link downgrades supported are expected
+ * to be supplied to the driver by another entity such as
+ * the fabric manager
+ */
+enum platform_config_link_width_supported_encoding {
+       LINK_WIDTH_SUPP_1X = 1,
+       LINK_WIDTH_SUPP_2X,
+       LINK_WIDTH_SUPP_2X_1X,
+       LINK_WIDTH_SUPP_3X,
+       LINK_WIDTH_SUPP_3X_1X,
+       LINK_WIDTH_SUPP_3X_2X,
+       LINK_WIDTH_SUPP_3X_2X_1X,
+       LINK_WIDTH_SUPP_4X,
+       LINK_WIDTH_SUPP_4X_1X,
+       LINK_WIDTH_SUPP_4X_2X,
+       LINK_WIDTH_SUPP_4X_2X_1X,
+       LINK_WIDTH_SUPP_4X_3X,
+       LINK_WIDTH_SUPP_4X_3X_1X,
+       LINK_WIDTH_SUPP_4X_3X_2X,
+       LINK_WIDTH_SUPP_4X_3X_2X_1X,
+       LINK_WIDTH_SUPP_MAX
+};
+
+enum platform_config_virtual_lane_capability_encoding {
+       VL_CAP_VL0 = 1,
+       VL_CAP_VL0_1,
+       VL_CAP_VL0_2,
+       VL_CAP_VL0_3,
+       VL_CAP_VL0_4,
+       VL_CAP_VL0_5,
+       VL_CAP_VL0_6,
+       VL_CAP_VL0_7,
+       VL_CAP_VL0_8,
+       VL_CAP_VL0_9,
+       VL_CAP_VL0_10,
+       VL_CAP_VL0_11,
+       VL_CAP_VL0_12,
+       VL_CAP_VL0_13,
+       VL_CAP_VL0_14,
+       VL_CAP_MAX
+};
+
+/* Max MTU */
+enum platform_config_mtu_capability_encoding {
+       MTU_CAP_256   = 1,
+       MTU_CAP_512   = 2,
+       MTU_CAP_1024  = 3,
+       MTU_CAP_2048  = 4,
+       MTU_CAP_4096  = 5,
+       MTU_CAP_8192  = 6,
+       MTU_CAP_10240 = 7
+};
+
+enum platform_config_local_max_timeout_encoding {
+       LOCAL_MAX_TIMEOUT_10_MS = 1,
+       LOCAL_MAX_TIMEOUT_100_MS,
+       LOCAL_MAX_TIMEOUT_1_S,
+       LOCAL_MAX_TIMEOUT_10_S,
+       LOCAL_MAX_TIMEOUT_100_S,
+       LOCAL_MAX_TIMEOUT_1000_S
+};
+
+#endif                 /*__PLATFORM_CONFIG_H*/
diff --git a/drivers/staging/rdma/hfi1/qp.c b/drivers/staging/rdma/hfi1/qp.c
new file mode 100644 (file)
index 0000000..df1fa56
--- /dev/null
@@ -0,0 +1,1687 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/err.h>
+#include <linux/vmalloc.h>
+#include <linux/hash.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/seq_file.h>
+
+#include "hfi.h"
+#include "qp.h"
+#include "trace.h"
+#include "sdma.h"
+
+#define BITS_PER_PAGE           (PAGE_SIZE*BITS_PER_BYTE)
+#define BITS_PER_PAGE_MASK      (BITS_PER_PAGE-1)
+
+static unsigned int hfi1_qp_table_size = 256;
+module_param_named(qp_table_size, hfi1_qp_table_size, uint, S_IRUGO);
+MODULE_PARM_DESC(qp_table_size, "QP table size");
+
+static void flush_tx_list(struct hfi1_qp *qp);
+static int iowait_sleep(
+       struct sdma_engine *sde,
+       struct iowait *wait,
+       struct sdma_txreq *stx,
+       unsigned seq);
+static void iowait_wakeup(struct iowait *wait, int reason);
+
+static inline unsigned mk_qpn(struct hfi1_qpn_table *qpt,
+                             struct qpn_map *map, unsigned off)
+{
+       return (map - qpt->map) * BITS_PER_PAGE + off;
+}
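+
+/*
+ * Example (assuming 4 KiB pages, so BITS_PER_PAGE == 32768): bit 5 of the
+ * second map page (map == &qpt->map[1]) corresponds to QPN
+ * 1 * 32768 + 5 == 32773.
+ */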
+
+/*
+ * Convert the AETH credit code into the number of credits.
+ */
+static const u16 credit_table[31] = {
+       0,                      /* 0 */
+       1,                      /* 1 */
+       2,                      /* 2 */
+       3,                      /* 3 */
+       4,                      /* 4 */
+       6,                      /* 5 */
+       8,                      /* 6 */
+       12,                     /* 7 */
+       16,                     /* 8 */
+       24,                     /* 9 */
+       32,                     /* A */
+       48,                     /* B */
+       64,                     /* C */
+       96,                     /* D */
+       128,                    /* E */
+       192,                    /* F */
+       256,                    /* 10 */
+       384,                    /* 11 */
+       512,                    /* 12 */
+       768,                    /* 13 */
+       1024,                   /* 14 */
+       1536,                   /* 15 */
+       2048,                   /* 16 */
+       3072,                   /* 17 */
+       4096,                   /* 18 */
+       6144,                   /* 19 */
+       8192,                   /* 1A */
+       12288,                  /* 1B */
+       16384,                  /* 1C */
+       24576,                  /* 1D */
+       32768                   /* 1E */
+};
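+
+/*
+ * Example lookup: AETH credit code 0x10 indexes credit_table[0x10] and
+ * yields 256 credits; the largest code, 0x1E, yields 32768.
+ */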
+
+static void get_map_page(struct hfi1_qpn_table *qpt, struct qpn_map *map)
+{
+       unsigned long page = get_zeroed_page(GFP_KERNEL);
+
+       /*
+        * Free the page if someone raced with us installing it.
+        */
+
+       spin_lock(&qpt->lock);
+       if (map->page)
+               free_page(page);
+       else
+               map->page = (void *)page;
+       spin_unlock(&qpt->lock);
+}
+
+/*
+ * Allocate the next available QPN or
+ * zero/one for QP type IB_QPT_SMI/IB_QPT_GSI.
+ */
+static int alloc_qpn(struct hfi1_devdata *dd, struct hfi1_qpn_table *qpt,
+                    enum ib_qp_type type, u8 port)
+{
+       u32 i, offset, max_scan, qpn;
+       struct qpn_map *map;
+       u32 ret;
+
+       if (type == IB_QPT_SMI || type == IB_QPT_GSI) {
+               unsigned n;
+
+               ret = type == IB_QPT_GSI;
+               n = 1 << (ret + 2 * (port - 1));
+               spin_lock(&qpt->lock);
+               if (qpt->flags & n)
+                       ret = -EINVAL;
+               else
+                       qpt->flags |= n;
+               spin_unlock(&qpt->lock);
+               goto bail;
+       }
+
+       qpn = qpt->last + qpt->incr;
+       if (qpn >= QPN_MAX)
+               qpn = qpt->incr | ((qpt->last & 1) ^ 1);
+       /* offset carries bit 0 */
+       offset = qpn & BITS_PER_PAGE_MASK;
+       map = &qpt->map[qpn / BITS_PER_PAGE];
+       max_scan = qpt->nmaps - !offset;
+       for (i = 0;;) {
+               if (unlikely(!map->page)) {
+                       get_map_page(qpt, map);
+                       if (unlikely(!map->page))
+                               break;
+               }
+               do {
+                       if (!test_and_set_bit(offset, map->page)) {
+                               qpt->last = qpn;
+                               ret = qpn;
+                               goto bail;
+                       }
+                       offset += qpt->incr;
+                       /*
+                        * This qpn might be bogus if offset >= BITS_PER_PAGE.
+                        * That is OK.   It gets re-assigned below
+                        */
+                       qpn = mk_qpn(qpt, map, offset);
+               } while (offset < BITS_PER_PAGE && qpn < QPN_MAX);
+               /*
+                * In order to keep the number of pages allocated to a
+                * minimum, we scan all the existing pages before increasing
+                * the size of the bitmap table.
+                */
+               if (++i > max_scan) {
+                       if (qpt->nmaps == QPNMAP_ENTRIES)
+                               break;
+                       map = &qpt->map[qpt->nmaps++];
+                       /* start at incr with current bit 0 */
+                       offset = qpt->incr | (offset & 1);
+               } else if (map < &qpt->map[qpt->nmaps]) {
+                       ++map;
+                       /* start at incr with current bit 0 */
+                       offset = qpt->incr | (offset & 1);
+               } else {
+                       map = &qpt->map[0];
+                       /* wrap to first map page, invert bit 0 */
+                       offset = qpt->incr | ((offset & 1) ^ 1);
+               }
+               /* there can be no bits at shift and below */
+               WARN_ON(offset & (dd->qos_shift - 1));
+               qpn = mk_qpn(qpt, map, offset);
+       }
+
+       ret = -ENOMEM;
+
+bail:
+       return ret;
+}
+
+static void free_qpn(struct hfi1_qpn_table *qpt, u32 qpn)
+{
+       struct qpn_map *map;
+
+       map = qpt->map + qpn / BITS_PER_PAGE;
+       if (map->page)
+               clear_bit(qpn & BITS_PER_PAGE_MASK, map->page);
+}
+
+/*
+ * Put the QP into the hash table.
+ * The hash table holds a reference to the QP.
+ */
+static void insert_qp(struct hfi1_ibdev *dev, struct hfi1_qp *qp)
+{
+       struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+       unsigned long flags;
+
+       atomic_inc(&qp->refcount);
+       spin_lock_irqsave(&dev->qp_dev->qpt_lock, flags);
+
+       if (qp->ibqp.qp_num <= 1) {
+               rcu_assign_pointer(ibp->qp[qp->ibqp.qp_num], qp);
+       } else {
+               u32 n = qpn_hash(dev->qp_dev, qp->ibqp.qp_num);
+
+               qp->next = dev->qp_dev->qp_table[n];
+               rcu_assign_pointer(dev->qp_dev->qp_table[n], qp);
+               trace_hfi1_qpinsert(qp, n);
+       }
+
+       spin_unlock_irqrestore(&dev->qp_dev->qpt_lock, flags);
+}
+
+/*
+ * Remove the QP from the table so it can't be found asynchronously by
+ * the receive interrupt routine.
+ */
+static void remove_qp(struct hfi1_ibdev *dev, struct hfi1_qp *qp)
+{
+       struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+       u32 n = qpn_hash(dev->qp_dev, qp->ibqp.qp_num);
+       unsigned long flags;
+       int removed = 1;
+
+       spin_lock_irqsave(&dev->qp_dev->qpt_lock, flags);
+
+       if (rcu_dereference_protected(ibp->qp[0],
+                       lockdep_is_held(&dev->qp_dev->qpt_lock)) == qp) {
+               RCU_INIT_POINTER(ibp->qp[0], NULL);
+       } else if (rcu_dereference_protected(ibp->qp[1],
+                       lockdep_is_held(&dev->qp_dev->qpt_lock)) == qp) {
+               RCU_INIT_POINTER(ibp->qp[1], NULL);
+       } else {
+               struct hfi1_qp *q;
+               struct hfi1_qp __rcu **qpp;
+
+               removed = 0;
+               qpp = &dev->qp_dev->qp_table[n];
+               for (; (q = rcu_dereference_protected(*qpp,
+                               lockdep_is_held(&dev->qp_dev->qpt_lock)))
+                                       != NULL;
+                               qpp = &q->next)
+                       if (q == qp) {
+                               RCU_INIT_POINTER(*qpp,
+                                rcu_dereference_protected(qp->next,
+                                lockdep_is_held(&dev->qp_dev->qpt_lock)));
+                               removed = 1;
+                               trace_hfi1_qpremove(qp, n);
+                               break;
+                       }
+       }
+
+       spin_unlock_irqrestore(&dev->qp_dev->qpt_lock, flags);
+       if (removed) {
+               synchronize_rcu();
+               if (atomic_dec_and_test(&qp->refcount))
+                       wake_up(&qp->wait);
+       }
+}
+
+/**
+ * free_all_qps - check for QPs still in use
+ * @dd: the hfi1 device data
+ *
+ * There should not be any QPs still in use.
+ * Clear the QP hash table and return the number of QPs still in use.
+ */
+static unsigned free_all_qps(struct hfi1_devdata *dd)
+{
+       struct hfi1_ibdev *dev = &dd->verbs_dev;
+       unsigned long flags;
+       struct hfi1_qp *qp;
+       unsigned n, qp_inuse = 0;
+
+       for (n = 0; n < dd->num_pports; n++) {
+               struct hfi1_ibport *ibp = &dd->pport[n].ibport_data;
+
+               if (!hfi1_mcast_tree_empty(ibp))
+                       qp_inuse++;
+               rcu_read_lock();
+               if (rcu_dereference(ibp->qp[0]))
+                       qp_inuse++;
+               if (rcu_dereference(ibp->qp[1]))
+                       qp_inuse++;
+               rcu_read_unlock();
+       }
+
+       if (!dev->qp_dev)
+               goto bail;
+       spin_lock_irqsave(&dev->qp_dev->qpt_lock, flags);
+       for (n = 0; n < dev->qp_dev->qp_table_size; n++) {
+               qp = rcu_dereference_protected(dev->qp_dev->qp_table[n],
+                       lockdep_is_held(&dev->qp_dev->qpt_lock));
+               RCU_INIT_POINTER(dev->qp_dev->qp_table[n], NULL);
+
+               for (; qp; qp = rcu_dereference_protected(qp->next,
+                               lockdep_is_held(&dev->qp_dev->qpt_lock)))
+                       qp_inuse++;
+       }
+       spin_unlock_irqrestore(&dev->qp_dev->qpt_lock, flags);
+       synchronize_rcu();
+bail:
+       return qp_inuse;
+}
+
+/**
+ * reset_qp - initialize the QP state to the reset state
+ * @qp: the QP to reset
+ * @type: the QP type
+ */
+static void reset_qp(struct hfi1_qp *qp, enum ib_qp_type type)
+{
+       qp->remote_qpn = 0;
+       qp->qkey = 0;
+       qp->qp_access_flags = 0;
+       iowait_init(
+               &qp->s_iowait,
+               1,
+               hfi1_do_send,
+               iowait_sleep,
+               iowait_wakeup);
+       qp->s_flags &= HFI1_S_SIGNAL_REQ_WR;
+       qp->s_hdrwords = 0;
+       qp->s_wqe = NULL;
+       qp->s_draining = 0;
+       qp->s_next_psn = 0;
+       qp->s_last_psn = 0;
+       qp->s_sending_psn = 0;
+       qp->s_sending_hpsn = 0;
+       qp->s_psn = 0;
+       qp->r_psn = 0;
+       qp->r_msn = 0;
+       if (type == IB_QPT_RC) {
+               qp->s_state = IB_OPCODE_RC_SEND_LAST;
+               qp->r_state = IB_OPCODE_RC_SEND_LAST;
+       } else {
+               qp->s_state = IB_OPCODE_UC_SEND_LAST;
+               qp->r_state = IB_OPCODE_UC_SEND_LAST;
+       }
+       qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE;
+       qp->r_nak_state = 0;
+       qp->r_aflags = 0;
+       qp->r_flags = 0;
+       qp->s_head = 0;
+       qp->s_tail = 0;
+       qp->s_cur = 0;
+       qp->s_acked = 0;
+       qp->s_last = 0;
+       qp->s_ssn = 1;
+       qp->s_lsn = 0;
+       clear_ahg(qp);
+       qp->s_mig_state = IB_MIG_MIGRATED;
+       memset(qp->s_ack_queue, 0, sizeof(qp->s_ack_queue));
+       qp->r_head_ack_queue = 0;
+       qp->s_tail_ack_queue = 0;
+       qp->s_num_rd_atomic = 0;
+       if (qp->r_rq.wq) {
+               qp->r_rq.wq->head = 0;
+               qp->r_rq.wq->tail = 0;
+       }
+       qp->r_sge.num_sge = 0;
+}
+
+static void clear_mr_refs(struct hfi1_qp *qp, int clr_sends)
+{
+       unsigned n;
+
+       if (test_and_clear_bit(HFI1_R_REWIND_SGE, &qp->r_aflags))
+               hfi1_put_ss(&qp->s_rdma_read_sge);
+
+       hfi1_put_ss(&qp->r_sge);
+
+       if (clr_sends) {
+               while (qp->s_last != qp->s_head) {
+                       struct hfi1_swqe *wqe = get_swqe_ptr(qp, qp->s_last);
+                       unsigned i;
+
+                       for (i = 0; i < wqe->wr.num_sge; i++) {
+                               struct hfi1_sge *sge = &wqe->sg_list[i];
+
+                               hfi1_put_mr(sge->mr);
+                       }
+                       if (qp->ibqp.qp_type == IB_QPT_UD ||
+                           qp->ibqp.qp_type == IB_QPT_SMI ||
+                           qp->ibqp.qp_type == IB_QPT_GSI)
+                               atomic_dec(&to_iah(wqe->wr.wr.ud.ah)->refcount);
+                       if (++qp->s_last >= qp->s_size)
+                               qp->s_last = 0;
+               }
+               if (qp->s_rdma_mr) {
+                       hfi1_put_mr(qp->s_rdma_mr);
+                       qp->s_rdma_mr = NULL;
+               }
+       }
+
+       if (qp->ibqp.qp_type != IB_QPT_RC)
+               return;
+
+       for (n = 0; n < ARRAY_SIZE(qp->s_ack_queue); n++) {
+               struct hfi1_ack_entry *e = &qp->s_ack_queue[n];
+
+               if (e->opcode == IB_OPCODE_RC_RDMA_READ_REQUEST &&
+                   e->rdma_sge.mr) {
+                       hfi1_put_mr(e->rdma_sge.mr);
+                       e->rdma_sge.mr = NULL;
+               }
+       }
+}
+
+/**
+ * hfi1_error_qp - put a QP into the error state
+ * @qp: the QP to put into the error state
+ * @err: the receive completion error to signal if a RWQE is active
+ *
+ * Flushes both send and receive work queues.
+ * Returns true if last WQE event should be generated.
+ * The QP r_lock and s_lock should be held and interrupts disabled.
+ * If we are already in error state, just return.
+ */
+int hfi1_error_qp(struct hfi1_qp *qp, enum ib_wc_status err)
+{
+       struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+       struct ib_wc wc;
+       int ret = 0;
+
+       if (qp->state == IB_QPS_ERR || qp->state == IB_QPS_RESET)
+               goto bail;
+
+       qp->state = IB_QPS_ERR;
+
+       if (qp->s_flags & (HFI1_S_TIMER | HFI1_S_WAIT_RNR)) {
+               qp->s_flags &= ~(HFI1_S_TIMER | HFI1_S_WAIT_RNR);
+               del_timer(&qp->s_timer);
+       }
+
+       if (qp->s_flags & HFI1_S_ANY_WAIT_SEND)
+               qp->s_flags &= ~HFI1_S_ANY_WAIT_SEND;
+
+       write_seqlock(&dev->iowait_lock);
+       if (!list_empty(&qp->s_iowait.list) && !(qp->s_flags & HFI1_S_BUSY)) {
+               qp->s_flags &= ~HFI1_S_ANY_WAIT_IO;
+               list_del_init(&qp->s_iowait.list);
+               if (atomic_dec_and_test(&qp->refcount))
+                       wake_up(&qp->wait);
+       }
+       write_sequnlock(&dev->iowait_lock);
+
+       if (!(qp->s_flags & HFI1_S_BUSY)) {
+               qp->s_hdrwords = 0;
+               if (qp->s_rdma_mr) {
+                       hfi1_put_mr(qp->s_rdma_mr);
+                       qp->s_rdma_mr = NULL;
+               }
+               flush_tx_list(qp);
+       }
+
+       /* Schedule the sending tasklet to drain the send work queue. */
+       if (qp->s_last != qp->s_head)
+               hfi1_schedule_send(qp);
+
+       clear_mr_refs(qp, 0);
+
+       memset(&wc, 0, sizeof(wc));
+       wc.qp = &qp->ibqp;
+       wc.opcode = IB_WC_RECV;
+
+       if (test_and_clear_bit(HFI1_R_WRID_VALID, &qp->r_aflags)) {
+               wc.wr_id = qp->r_wr_id;
+               wc.status = err;
+               hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
+       }
+       wc.status = IB_WC_WR_FLUSH_ERR;
+
+       if (qp->r_rq.wq) {
+               struct hfi1_rwq *wq;
+               u32 head;
+               u32 tail;
+
+               spin_lock(&qp->r_rq.lock);
+
+               /* sanity check pointers before trusting them */
+               wq = qp->r_rq.wq;
+               head = wq->head;
+               if (head >= qp->r_rq.size)
+                       head = 0;
+               tail = wq->tail;
+               if (tail >= qp->r_rq.size)
+                       tail = 0;
+               while (tail != head) {
+                       wc.wr_id = get_rwqe_ptr(&qp->r_rq, tail)->wr_id;
+                       if (++tail >= qp->r_rq.size)
+                               tail = 0;
+                       hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
+               }
+               wq->tail = tail;
+
+               spin_unlock(&qp->r_rq.lock);
+       } else if (qp->ibqp.event_handler)
+               ret = 1;
+
+bail:
+       return ret;
+}
+
+static void flush_tx_list(struct hfi1_qp *qp)
+{
+       while (!list_empty(&qp->s_iowait.tx_head)) {
+               struct sdma_txreq *tx;
+
+               tx = list_first_entry(
+                       &qp->s_iowait.tx_head,
+                       struct sdma_txreq,
+                       list);
+               list_del_init(&tx->list);
+               hfi1_put_txreq(
+                       container_of(tx, struct verbs_txreq, txreq));
+       }
+}
+
+static void flush_iowait(struct hfi1_qp *qp)
+{
+       struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+       unsigned long flags;
+
+       write_seqlock_irqsave(&dev->iowait_lock, flags);
+       if (!list_empty(&qp->s_iowait.list)) {
+               list_del_init(&qp->s_iowait.list);
+               if (atomic_dec_and_test(&qp->refcount))
+                       wake_up(&qp->wait);
+       }
+       write_sequnlock_irqrestore(&dev->iowait_lock, flags);
+}
+
+static inline int opa_mtu_enum_to_int(int mtu)
+{
+       switch (mtu) {
+       case OPA_MTU_8192:  return 8192;
+       case OPA_MTU_10240: return 10240;
+       default:            return -1;
+       }
+}
+
+/*
+ * This function is what we would push to the core layer if we wanted to be a
+ * "first class citizen".  Instead we hide this here and rely on Verbs ULPs
+ * to blindly pass the MTU enum value from the PathRecord to us.
+ *
+ * The actual flag used to determine "8k MTU" will change and is currently
+ * unknown.
+ */
+static inline int verbs_mtu_enum_to_int(struct ib_device *dev, enum ib_mtu mtu)
+{
+       int val = opa_mtu_enum_to_int((int)mtu);
+
+       if (val > 0)
+               return val;
+       return ib_mtu_enum_to_int(mtu);
+}
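verbs_mtu_enum_to_int() above tries the OPA-specific enum values first and falls back to the standard IB table. A standalone sketch of that two-level fallback; the enum values are purely illustrative (the real OPA_MTU_* constants and ib_mtu_enum_to_int() are defined elsewhere in the tree):

#include <stdio.h>

enum toy_mtu { TOY_MTU_2048 = 4, TOY_MTU_4096 = 5, TOY_MTU_8192 = 6, TOY_MTU_10240 = 7 };

/* OPA-only sizes; anything else is "not ours". */
static int toy_opa_mtu_to_int(enum toy_mtu mtu)
{
	switch (mtu) {
	case TOY_MTU_8192:  return 8192;
	case TOY_MTU_10240: return 10240;
	default:            return -1;
	}
}

/* Fallback covering the classic IB sizes. */
static int toy_ib_mtu_to_int(enum toy_mtu mtu)
{
	switch (mtu) {
	case TOY_MTU_2048: return 2048;
	case TOY_MTU_4096: return 4096;
	default:           return -1;
	}
}

static int toy_mtu_to_int(enum toy_mtu mtu)
{
	int val = toy_opa_mtu_to_int(mtu);

	return val > 0 ? val : toy_ib_mtu_to_int(mtu);
}

int main(void)
{
	printf("%d %d\n", toy_mtu_to_int(TOY_MTU_10240), toy_mtu_to_int(TOY_MTU_2048));
	return 0;
}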
+
+
+/**
+ * hfi1_modify_qp - modify the attributes of a queue pair
+ * @ibqp: the queue pair whose attributes we're modifying
+ * @attr: the new attributes
+ * @attr_mask: the mask of attributes to modify
+ * @udata: user data for libibverbs.so
+ *
+ * Returns 0 on success, otherwise returns an errno.
+ */
+int hfi1_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+                  int attr_mask, struct ib_udata *udata)
+{
+       struct hfi1_ibdev *dev = to_idev(ibqp->device);
+       struct hfi1_qp *qp = to_iqp(ibqp);
+       enum ib_qp_state cur_state, new_state;
+       struct ib_event ev;
+       int lastwqe = 0;
+       int mig = 0;
+       int ret;
+       u32 pmtu = 0; /* for gcc warning only */
+       struct hfi1_devdata *dd;
+
+       spin_lock_irq(&qp->r_lock);
+       spin_lock(&qp->s_lock);
+
+       cur_state = attr_mask & IB_QP_CUR_STATE ?
+               attr->cur_qp_state : qp->state;
+       new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
+
+       if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type,
+                               attr_mask, IB_LINK_LAYER_UNSPECIFIED))
+               goto inval;
+
+       if (attr_mask & IB_QP_AV) {
+               if (attr->ah_attr.dlid >= HFI1_MULTICAST_LID_BASE)
+                       goto inval;
+               if (hfi1_check_ah(qp->ibqp.device, &attr->ah_attr))
+                       goto inval;
+       }
+
+       if (attr_mask & IB_QP_ALT_PATH) {
+               if (attr->alt_ah_attr.dlid >= HFI1_MULTICAST_LID_BASE)
+                       goto inval;
+               if (hfi1_check_ah(qp->ibqp.device, &attr->alt_ah_attr))
+                       goto inval;
+               if (attr->alt_pkey_index >= hfi1_get_npkeys(dd_from_dev(dev)))
+                       goto inval;
+       }
+
+       if (attr_mask & IB_QP_PKEY_INDEX)
+               if (attr->pkey_index >= hfi1_get_npkeys(dd_from_dev(dev)))
+                       goto inval;
+
+       if (attr_mask & IB_QP_MIN_RNR_TIMER)
+               if (attr->min_rnr_timer > 31)
+                       goto inval;
+
+       if (attr_mask & IB_QP_PORT)
+               if (qp->ibqp.qp_type == IB_QPT_SMI ||
+                   qp->ibqp.qp_type == IB_QPT_GSI ||
+                   attr->port_num == 0 ||
+                   attr->port_num > ibqp->device->phys_port_cnt)
+                       goto inval;
+
+       if (attr_mask & IB_QP_DEST_QPN)
+               if (attr->dest_qp_num > HFI1_QPN_MASK)
+                       goto inval;
+
+       if (attr_mask & IB_QP_RETRY_CNT)
+               if (attr->retry_cnt > 7)
+                       goto inval;
+
+       if (attr_mask & IB_QP_RNR_RETRY)
+               if (attr->rnr_retry > 7)
+                       goto inval;
+
+       /*
+        * Don't allow invalid path_mtu values.  It is OK to set the MTU
+        * greater than the active MTU (or even the max_cap, if we have
+        * tuned that to a small MTU).  We'll set qp->path_mtu to the
+        * lesser of the requested attribute MTU and the active MTU,
+        * for packetizing messages.
+        * Note that the QP port has to be set in INIT and MTU in RTR.
+        */
+       if (attr_mask & IB_QP_PATH_MTU) {
+               int mtu, pidx = qp->port_num - 1;
+
+               dd = dd_from_dev(dev);
+               mtu = verbs_mtu_enum_to_int(ibqp->device, attr->path_mtu);
+               if (mtu == -1)
+                       goto inval;
+
+               if (mtu > dd->pport[pidx].ibmtu)
+                       pmtu = mtu_to_enum(dd->pport[pidx].ibmtu, IB_MTU_2048);
+               else
+                       pmtu = attr->path_mtu;
+       }
+
+       if (attr_mask & IB_QP_PATH_MIG_STATE) {
+               if (attr->path_mig_state == IB_MIG_REARM) {
+                       if (qp->s_mig_state == IB_MIG_ARMED)
+                               goto inval;
+                       if (new_state != IB_QPS_RTS)
+                               goto inval;
+               } else if (attr->path_mig_state == IB_MIG_MIGRATED) {
+                       if (qp->s_mig_state == IB_MIG_REARM)
+                               goto inval;
+                       if (new_state != IB_QPS_RTS && new_state != IB_QPS_SQD)
+                               goto inval;
+                       if (qp->s_mig_state == IB_MIG_ARMED)
+                               mig = 1;
+               } else
+                       goto inval;
+       }
+
+       if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+               if (attr->max_dest_rd_atomic > HFI1_MAX_RDMA_ATOMIC)
+                       goto inval;
+
+       switch (new_state) {
+       case IB_QPS_RESET:
+               if (qp->state != IB_QPS_RESET) {
+                       qp->state = IB_QPS_RESET;
+                       flush_iowait(qp);
+                       qp->s_flags &= ~(HFI1_S_TIMER | HFI1_S_ANY_WAIT);
+                       spin_unlock(&qp->s_lock);
+                       spin_unlock_irq(&qp->r_lock);
+                       /* Stop the sending work queue and retry timer */
+                       cancel_work_sync(&qp->s_iowait.iowork);
+                       del_timer_sync(&qp->s_timer);
+                       iowait_sdma_drain(&qp->s_iowait);
+                       flush_tx_list(qp);
+                       remove_qp(dev, qp);
+                       wait_event(qp->wait, !atomic_read(&qp->refcount));
+                       spin_lock_irq(&qp->r_lock);
+                       spin_lock(&qp->s_lock);
+                       clear_mr_refs(qp, 1);
+                       clear_ahg(qp);
+                       reset_qp(qp, ibqp->qp_type);
+               }
+               break;
+
+       case IB_QPS_RTR:
+               /* Allow event to re-trigger if QP set to RTR more than once */
+               qp->r_flags &= ~HFI1_R_COMM_EST;
+               qp->state = new_state;
+               break;
+
+       case IB_QPS_SQD:
+               qp->s_draining = qp->s_last != qp->s_cur;
+               qp->state = new_state;
+               break;
+
+       case IB_QPS_SQE:
+               if (qp->ibqp.qp_type == IB_QPT_RC)
+                       goto inval;
+               qp->state = new_state;
+               break;
+
+       case IB_QPS_ERR:
+               lastwqe = hfi1_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+               break;
+
+       default:
+               qp->state = new_state;
+               break;
+       }
+
+       if (attr_mask & IB_QP_PKEY_INDEX)
+               qp->s_pkey_index = attr->pkey_index;
+
+       if (attr_mask & IB_QP_PORT)
+               qp->port_num = attr->port_num;
+
+       if (attr_mask & IB_QP_DEST_QPN)
+               qp->remote_qpn = attr->dest_qp_num;
+
+       if (attr_mask & IB_QP_SQ_PSN) {
+               qp->s_next_psn = attr->sq_psn & PSN_MODIFY_MASK;
+               qp->s_psn = qp->s_next_psn;
+               qp->s_sending_psn = qp->s_next_psn;
+               qp->s_last_psn = qp->s_next_psn - 1;
+               qp->s_sending_hpsn = qp->s_last_psn;
+       }
+
+       if (attr_mask & IB_QP_RQ_PSN)
+               qp->r_psn = attr->rq_psn & PSN_MODIFY_MASK;
+
+       if (attr_mask & IB_QP_ACCESS_FLAGS)
+               qp->qp_access_flags = attr->qp_access_flags;
+
+       if (attr_mask & IB_QP_AV) {
+               qp->remote_ah_attr = attr->ah_attr;
+               qp->s_srate = attr->ah_attr.static_rate;
+               qp->srate_mbps = ib_rate_to_mbps(qp->s_srate);
+       }
+
+       if (attr_mask & IB_QP_ALT_PATH) {
+               qp->alt_ah_attr = attr->alt_ah_attr;
+               qp->s_alt_pkey_index = attr->alt_pkey_index;
+       }
+
+       if (attr_mask & IB_QP_PATH_MIG_STATE) {
+               qp->s_mig_state = attr->path_mig_state;
+               if (mig) {
+                       qp->remote_ah_attr = qp->alt_ah_attr;
+                       qp->port_num = qp->alt_ah_attr.port_num;
+                       qp->s_pkey_index = qp->s_alt_pkey_index;
+                       qp->s_flags |= HFI1_S_AHG_CLEAR;
+               }
+       }
+
+       if (attr_mask & IB_QP_PATH_MTU) {
+               struct hfi1_ibport *ibp;
+               u8 sc, vl;
+               u32 mtu;
+
+               dd = dd_from_dev(dev);
+               ibp = &dd->pport[qp->port_num - 1].ibport_data;
+
+               sc = ibp->sl_to_sc[qp->remote_ah_attr.sl];
+               vl = sc_to_vlt(dd, sc);
+
+               mtu = verbs_mtu_enum_to_int(ibqp->device, pmtu);
+               if (vl < PER_VL_SEND_CONTEXTS)
+                       mtu = min_t(u32, mtu, dd->vld[vl].mtu);
+               pmtu = mtu_to_enum(mtu, OPA_MTU_8192);
+
+               qp->path_mtu = pmtu;
+               qp->pmtu = mtu;
+       }
+
+       if (attr_mask & IB_QP_RETRY_CNT) {
+               qp->s_retry_cnt = attr->retry_cnt;
+               qp->s_retry = attr->retry_cnt;
+       }
+
+       if (attr_mask & IB_QP_RNR_RETRY) {
+               qp->s_rnr_retry_cnt = attr->rnr_retry;
+               qp->s_rnr_retry = attr->rnr_retry;
+       }
+
+       if (attr_mask & IB_QP_MIN_RNR_TIMER)
+               qp->r_min_rnr_timer = attr->min_rnr_timer;
+
+       if (attr_mask & IB_QP_TIMEOUT) {
+               qp->timeout = attr->timeout;
+               qp->timeout_jiffies =
+                       usecs_to_jiffies((4096UL * (1UL << qp->timeout)) /
+                               1000UL);
+       }
+
+       if (attr_mask & IB_QP_QKEY)
+               qp->qkey = attr->qkey;
+
+       if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+               qp->r_max_rd_atomic = attr->max_dest_rd_atomic;
+
+       if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC)
+               qp->s_max_rd_atomic = attr->max_rd_atomic;
+
+       spin_unlock(&qp->s_lock);
+       spin_unlock_irq(&qp->r_lock);
+
+       if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
+               insert_qp(dev, qp);
+
+       if (lastwqe) {
+               ev.device = qp->ibqp.device;
+               ev.element.qp = &qp->ibqp;
+               ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+               qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+       }
+       if (mig) {
+               ev.device = qp->ibqp.device;
+               ev.element.qp = &qp->ibqp;
+               ev.event = IB_EVENT_PATH_MIG;
+               qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+       }
+       ret = 0;
+       goto bail;
+
+inval:
+       spin_unlock(&qp->s_lock);
+       spin_unlock_irq(&qp->r_lock);
+       ret = -EINVAL;
+
+bail:
+       return ret;
+}
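The IB_QP_TIMEOUT handling in hfi1_modify_qp() encodes the IBTA local-ACK-timeout rule: a wire value t means 4.096 µs × 2^t. A small standalone check of that arithmetic, using the same expression the driver does (usecs_to_jiffies() is kernel-only, so this sketch stops at microseconds):

#include <stdio.h>

/* 4096 ns * 2^t, then / 1000 to get microseconds, exactly as in
 * hfi1_modify_qp()'s timeout_jiffies computation. */
static unsigned long timeout_usecs(unsigned int t)
{
	return (4096UL * (1UL << t)) / 1000UL;
}

int main(void)
{
	/* t = 14 is a commonly seen value: roughly 67 ms. */
	printf("t=14 -> %lu us\n", timeout_usecs(14));	/* 67108 */
	printf("t=19 -> %lu us\n", timeout_usecs(19));	/* 2147483 (~2.1 s) */
	return 0;
}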
+
+int hfi1_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+                 int attr_mask, struct ib_qp_init_attr *init_attr)
+{
+       struct hfi1_qp *qp = to_iqp(ibqp);
+
+       attr->qp_state = qp->state;
+       attr->cur_qp_state = attr->qp_state;
+       attr->path_mtu = qp->path_mtu;
+       attr->path_mig_state = qp->s_mig_state;
+       attr->qkey = qp->qkey;
+       attr->rq_psn = mask_psn(qp->r_psn);
+       attr->sq_psn = mask_psn(qp->s_next_psn);
+       attr->dest_qp_num = qp->remote_qpn;
+       attr->qp_access_flags = qp->qp_access_flags;
+       attr->cap.max_send_wr = qp->s_size - 1;
+       attr->cap.max_recv_wr = qp->ibqp.srq ? 0 : qp->r_rq.size - 1;
+       attr->cap.max_send_sge = qp->s_max_sge;
+       attr->cap.max_recv_sge = qp->r_rq.max_sge;
+       attr->cap.max_inline_data = 0;
+       attr->ah_attr = qp->remote_ah_attr;
+       attr->alt_ah_attr = qp->alt_ah_attr;
+       attr->pkey_index = qp->s_pkey_index;
+       attr->alt_pkey_index = qp->s_alt_pkey_index;
+       attr->en_sqd_async_notify = 0;
+       attr->sq_draining = qp->s_draining;
+       attr->max_rd_atomic = qp->s_max_rd_atomic;
+       attr->max_dest_rd_atomic = qp->r_max_rd_atomic;
+       attr->min_rnr_timer = qp->r_min_rnr_timer;
+       attr->port_num = qp->port_num;
+       attr->timeout = qp->timeout;
+       attr->retry_cnt = qp->s_retry_cnt;
+       attr->rnr_retry = qp->s_rnr_retry_cnt;
+       attr->alt_port_num = qp->alt_ah_attr.port_num;
+       attr->alt_timeout = qp->alt_timeout;
+
+       init_attr->event_handler = qp->ibqp.event_handler;
+       init_attr->qp_context = qp->ibqp.qp_context;
+       init_attr->send_cq = qp->ibqp.send_cq;
+       init_attr->recv_cq = qp->ibqp.recv_cq;
+       init_attr->srq = qp->ibqp.srq;
+       init_attr->cap = attr->cap;
+       if (qp->s_flags & HFI1_S_SIGNAL_REQ_WR)
+               init_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
+       else
+               init_attr->sq_sig_type = IB_SIGNAL_ALL_WR;
+       init_attr->qp_type = qp->ibqp.qp_type;
+       init_attr->port_num = qp->port_num;
+       return 0;
+}
+
+/**
+ * hfi1_compute_aeth - compute the AETH (syndrome + MSN)
+ * @qp: the queue pair to compute the AETH for
+ *
+ * Returns the AETH.
+ */
+__be32 hfi1_compute_aeth(struct hfi1_qp *qp)
+{
+       u32 aeth = qp->r_msn & HFI1_MSN_MASK;
+
+       if (qp->ibqp.srq) {
+               /*
+                * Shared receive queues don't generate credits.
+                * Set the credit field to the invalid value.
+                */
+               aeth |= HFI1_AETH_CREDIT_INVAL << HFI1_AETH_CREDIT_SHIFT;
+       } else {
+               u32 min, max, x;
+               u32 credits;
+               struct hfi1_rwq *wq = qp->r_rq.wq;
+               u32 head;
+               u32 tail;
+
+               /* sanity check pointers before trusting them */
+               head = wq->head;
+               if (head >= qp->r_rq.size)
+                       head = 0;
+               tail = wq->tail;
+               if (tail >= qp->r_rq.size)
+                       tail = 0;
+               /*
+                * Compute the number of credits available (RWQEs).
+                * There is a small chance that the pair of reads is
+                * not atomic, which is OK, since the fuzziness is
+                * resolved as further ACKs go out.
+                */
+               credits = head - tail;
+               if ((int)credits < 0)
+                       credits += qp->r_rq.size;
+               /*
+                * Binary search the credit table to find the code to
+                * use.
+                */
+               min = 0;
+               max = 31;
+               for (;;) {
+                       x = (min + max) / 2;
+                       if (credit_table[x] == credits)
+                               break;
+                       if (credit_table[x] > credits)
+                               max = x;
+                       else if (min == x)
+                               break;
+                       else
+                               min = x;
+               }
+               aeth |= x << HFI1_AETH_CREDIT_SHIFT;
+       }
+       return cpu_to_be32(aeth);
+}
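hfi1_compute_aeth() converts a raw RWQE count into one of 32 credit codes by binary-searching credit_table[], which is defined elsewhere in the driver. A standalone sketch of the same search loop against an illustrative, non-decreasing table (the real table's values differ):

#include <stdio.h>

/* Illustrative non-decreasing credit table; the driver's real table has
 * 32 entries tuned to the AETH credit encoding. */
static const unsigned int toy_credit_table[32] = {
	0, 1, 2, 3, 4, 6, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192,
	256, 384, 512, 768, 1024, 1536, 2048, 3072, 4096, 6144, 8192,
	12288, 16384, 24576, 32768, 49152
};

/* Same loop shape as hfi1_compute_aeth(): pick the credit code whose
 * table entry best matches the available credits without exceeding them. */
static unsigned int credits_to_code(unsigned int credits)
{
	unsigned int min = 0, max = 31, x;

	for (;;) {
		x = (min + max) / 2;
		if (toy_credit_table[x] == credits)
			break;
		if (toy_credit_table[x] > credits)
			max = x;
		else if (min == x)
			break;
		else
			min = x;
	}
	return x;
}

int main(void)
{
	printf("%u %u %u\n", credits_to_code(0), credits_to_code(100),
	       credits_to_code(4096));
	return 0;
}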
+
+/**
+ * hfi1_create_qp - create a queue pair for a device
+ * @ibpd: the protection domain whose device we create the queue pair for
+ * @init_attr: the attributes of the queue pair
+ * @udata: user data for libibverbs.so
+ *
+ * Returns the queue pair on success, otherwise returns an errno.
+ *
+ * Called by the ib_create_qp() core verbs function.
+ */
+struct ib_qp *hfi1_create_qp(struct ib_pd *ibpd,
+                            struct ib_qp_init_attr *init_attr,
+                            struct ib_udata *udata)
+{
+       struct hfi1_qp *qp;
+       int err;
+       struct hfi1_swqe *swq = NULL;
+       struct hfi1_ibdev *dev;
+       struct hfi1_devdata *dd;
+       size_t sz;
+       size_t sg_list_sz;
+       struct ib_qp *ret;
+
+       if (init_attr->cap.max_send_sge > hfi1_max_sges ||
+           init_attr->cap.max_send_wr > hfi1_max_qp_wrs ||
+           init_attr->create_flags) {
+               ret = ERR_PTR(-EINVAL);
+               goto bail;
+       }
+
+       /* Check receive queue parameters if no SRQ is specified. */
+       if (!init_attr->srq) {
+               if (init_attr->cap.max_recv_sge > hfi1_max_sges ||
+                   init_attr->cap.max_recv_wr > hfi1_max_qp_wrs) {
+                       ret = ERR_PTR(-EINVAL);
+                       goto bail;
+               }
+               if (init_attr->cap.max_send_sge +
+                   init_attr->cap.max_send_wr +
+                   init_attr->cap.max_recv_sge +
+                   init_attr->cap.max_recv_wr == 0) {
+                       ret = ERR_PTR(-EINVAL);
+                       goto bail;
+               }
+       }
+
+       switch (init_attr->qp_type) {
+       case IB_QPT_SMI:
+       case IB_QPT_GSI:
+               if (init_attr->port_num == 0 ||
+                   init_attr->port_num > ibpd->device->phys_port_cnt) {
+                       ret = ERR_PTR(-EINVAL);
+                       goto bail;
+               }
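+               /* FALLTHROUGH - SMI/GSI continue into the common allocation below */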
+       case IB_QPT_UC:
+       case IB_QPT_RC:
+       case IB_QPT_UD:
+               sz = sizeof(struct hfi1_sge) *
+                       init_attr->cap.max_send_sge +
+                       sizeof(struct hfi1_swqe);
+               swq = vmalloc((init_attr->cap.max_send_wr + 1) * sz);
+               if (swq == NULL) {
+                       ret = ERR_PTR(-ENOMEM);
+                       goto bail;
+               }
+               sz = sizeof(*qp);
+               sg_list_sz = 0;
+               if (init_attr->srq) {
+                       struct hfi1_srq *srq = to_isrq(init_attr->srq);
+
+                       if (srq->rq.max_sge > 1)
+                               sg_list_sz = sizeof(*qp->r_sg_list) *
+                                       (srq->rq.max_sge - 1);
+               } else if (init_attr->cap.max_recv_sge > 1)
+                       sg_list_sz = sizeof(*qp->r_sg_list) *
+                               (init_attr->cap.max_recv_sge - 1);
+               qp = kzalloc(sz + sg_list_sz, GFP_KERNEL);
+               if (!qp) {
+                       ret = ERR_PTR(-ENOMEM);
+                       goto bail_swq;
+               }
+               RCU_INIT_POINTER(qp->next, NULL);
+               qp->s_hdr = kzalloc(sizeof(*qp->s_hdr), GFP_KERNEL);
+               if (!qp->s_hdr) {
+                       ret = ERR_PTR(-ENOMEM);
+                       goto bail_qp;
+               }
+               qp->timeout_jiffies =
+                       usecs_to_jiffies((4096UL * (1UL << qp->timeout)) /
+                               1000UL);
+               if (init_attr->srq)
+                       sz = 0;
+               else {
+                       qp->r_rq.size = init_attr->cap.max_recv_wr + 1;
+                       qp->r_rq.max_sge = init_attr->cap.max_recv_sge;
+                       sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) +
+                               sizeof(struct hfi1_rwqe);
+                       qp->r_rq.wq = vmalloc_user(sizeof(struct hfi1_rwq) +
+                                                  qp->r_rq.size * sz);
+                       if (!qp->r_rq.wq) {
+                               ret = ERR_PTR(-ENOMEM);
+                               goto bail_qp;
+                       }
+               }
+
+               /*
+                * ib_create_qp() will initialize qp->ibqp
+                * except for qp->ibqp.qp_num.
+                */
+               spin_lock_init(&qp->r_lock);
+               spin_lock_init(&qp->s_lock);
+               spin_lock_init(&qp->r_rq.lock);
+               atomic_set(&qp->refcount, 0);
+               init_waitqueue_head(&qp->wait);
+               init_timer(&qp->s_timer);
+               qp->s_timer.data = (unsigned long)qp;
+               INIT_LIST_HEAD(&qp->rspwait);
+               qp->state = IB_QPS_RESET;
+               qp->s_wq = swq;
+               qp->s_size = init_attr->cap.max_send_wr + 1;
+               qp->s_max_sge = init_attr->cap.max_send_sge;
+               if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR)
+                       qp->s_flags = HFI1_S_SIGNAL_REQ_WR;
+               dev = to_idev(ibpd->device);
+               dd = dd_from_dev(dev);
+               err = alloc_qpn(dd, &dev->qp_dev->qpn_table, init_attr->qp_type,
+                               init_attr->port_num);
+               if (err < 0) {
+                       ret = ERR_PTR(err);
+                       vfree(qp->r_rq.wq);
+                       goto bail_qp;
+               }
+               qp->ibqp.qp_num = err;
+               qp->port_num = init_attr->port_num;
+               reset_qp(qp, init_attr->qp_type);
+
+               break;
+
+       default:
+               /* Don't support raw QPs */
+               ret = ERR_PTR(-ENOSYS);
+               goto bail;
+       }
+
+       init_attr->cap.max_inline_data = 0;
+
+       /*
+        * Return the address of the RWQ as the offset to mmap.
+        * See hfi1_mmap() for details.
+        */
+       if (udata && udata->outlen >= sizeof(__u64)) {
+               if (!qp->r_rq.wq) {
+                       __u64 offset = 0;
+
+                       err = ib_copy_to_udata(udata, &offset,
+                                              sizeof(offset));
+                       if (err) {
+                               ret = ERR_PTR(err);
+                               goto bail_ip;
+                       }
+               } else {
+                       u32 s = sizeof(struct hfi1_rwq) + qp->r_rq.size * sz;
+
+                       qp->ip = hfi1_create_mmap_info(dev, s,
+                                                     ibpd->uobject->context,
+                                                     qp->r_rq.wq);
+                       if (!qp->ip) {
+                               ret = ERR_PTR(-ENOMEM);
+                               goto bail_ip;
+                       }
+
+                       err = ib_copy_to_udata(udata, &(qp->ip->offset),
+                                              sizeof(qp->ip->offset));
+                       if (err) {
+                               ret = ERR_PTR(err);
+                               goto bail_ip;
+                       }
+               }
+       }
+
+       spin_lock(&dev->n_qps_lock);
+       if (dev->n_qps_allocated == hfi1_max_qps) {
+               spin_unlock(&dev->n_qps_lock);
+               ret = ERR_PTR(-ENOMEM);
+               goto bail_ip;
+       }
+
+       dev->n_qps_allocated++;
+       spin_unlock(&dev->n_qps_lock);
+
+       if (qp->ip) {
+               spin_lock_irq(&dev->pending_lock);
+               list_add(&qp->ip->pending_mmaps, &dev->pending_mmaps);
+               spin_unlock_irq(&dev->pending_lock);
+       }
+
+       ret = &qp->ibqp;
+
+       /*
+        * We have our QP and it's good; now keep track of what types of
+        * opcodes can be processed on this QP. We do this by keeping track
+        * of what the 3 high order bits of the opcode are.
+        */
+       switch (init_attr->qp_type) {
+       case IB_QPT_SMI:
+       case IB_QPT_GSI:
+       case IB_QPT_UD:
+               qp->allowed_ops = IB_OPCODE_UD_SEND_ONLY & OPCODE_QP_MASK;
+               break;
+       case IB_QPT_RC:
+               qp->allowed_ops = IB_OPCODE_RC_SEND_ONLY & OPCODE_QP_MASK;
+               break;
+       case IB_QPT_UC:
+               qp->allowed_ops = IB_OPCODE_UC_SEND_ONLY & OPCODE_QP_MASK;
+               break;
+       default:
+               ret = ERR_PTR(-EINVAL);
+               goto bail_ip;
+       }
+
+       goto bail;
+
+bail_ip:
+       if (qp->ip)
+               kref_put(&qp->ip->ref, hfi1_release_mmap_info);
+       else
+               vfree(qp->r_rq.wq);
+       free_qpn(&dev->qp_dev->qpn_table, qp->ibqp.qp_num);
+bail_qp:
+       kfree(qp->s_hdr);
+       kfree(qp);
+bail_swq:
+       vfree(swq);
+bail:
+       return ret;
+}
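The allowed_ops assignment near the end of hfi1_create_qp() relies on the IBTA opcode byte carrying the transport type in its top three bits, so a single mask suffices to reject packets of the wrong transport. A small sketch of that check; the 0xE0 mask and the sample opcode values are assumptions for illustration (the driver uses OPCODE_QP_MASK and the IB_OPCODE_* constants):

#include <stdio.h>
#include <stdint.h>

#define TOY_OPCODE_QP_MASK 0xE0	/* top 3 bits select the transport */

/* Illustrative opcodes in the usual IBTA layout: RC opcodes live in
 * 0x00-0x1f, UC in 0x20-0x3f, UD in 0x60-0x7f. */
#define TOY_RC_SEND_ONLY 0x04
#define TOY_UC_SEND_ONLY 0x24
#define TOY_UD_SEND_ONLY 0x64

static int opcode_allowed(uint8_t allowed_ops, uint8_t opcode)
{
	return (opcode & TOY_OPCODE_QP_MASK) == allowed_ops;
}

int main(void)
{
	uint8_t rc_qp_allowed = TOY_RC_SEND_ONLY & TOY_OPCODE_QP_MASK;

	printf("RC send on RC qp: %d\n", opcode_allowed(rc_qp_allowed, TOY_RC_SEND_ONLY)); /* 1 */
	printf("UD send on RC qp: %d\n", opcode_allowed(rc_qp_allowed, TOY_UD_SEND_ONLY)); /* 0 */
	return 0;
}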
+
+/**
+ * hfi1_destroy_qp - destroy a queue pair
+ * @ibqp: the queue pair to destroy
+ *
+ * Returns 0 on success.
+ *
+ * Note that this can be called while the QP is actively sending or
+ * receiving!
+ */
+int hfi1_destroy_qp(struct ib_qp *ibqp)
+{
+       struct hfi1_qp *qp = to_iqp(ibqp);
+       struct hfi1_ibdev *dev = to_idev(ibqp->device);
+
+       /* Make sure HW and driver activity is stopped. */
+       spin_lock_irq(&qp->r_lock);
+       spin_lock(&qp->s_lock);
+       if (qp->state != IB_QPS_RESET) {
+               qp->state = IB_QPS_RESET;
+               flush_iowait(qp);
+               qp->s_flags &= ~(HFI1_S_TIMER | HFI1_S_ANY_WAIT);
+               spin_unlock(&qp->s_lock);
+               spin_unlock_irq(&qp->r_lock);
+               cancel_work_sync(&qp->s_iowait.iowork);
+               del_timer_sync(&qp->s_timer);
+               iowait_sdma_drain(&qp->s_iowait);
+               flush_tx_list(qp);
+               remove_qp(dev, qp);
+               wait_event(qp->wait, !atomic_read(&qp->refcount));
+               spin_lock_irq(&qp->r_lock);
+               spin_lock(&qp->s_lock);
+               clear_mr_refs(qp, 1);
+               clear_ahg(qp);
+       }
+       spin_unlock(&qp->s_lock);
+       spin_unlock_irq(&qp->r_lock);
+
+       /* all users cleaned up, mark the QPN as available */
+       free_qpn(&dev->qp_dev->qpn_table, qp->ibqp.qp_num);
+       spin_lock(&dev->n_qps_lock);
+       dev->n_qps_allocated--;
+       spin_unlock(&dev->n_qps_lock);
+
+       if (qp->ip)
+               kref_put(&qp->ip->ref, hfi1_release_mmap_info);
+       else
+               vfree(qp->r_rq.wq);
+       vfree(qp->s_wq);
+       kfree(qp->s_hdr);
+       kfree(qp);
+       return 0;
+}
+
+/**
+ * init_qpn_table - initialize the QP number table for a device
+ * @dd: the hfi1 device data
+ * @qpt: the QPN table
+ */
+static int init_qpn_table(struct hfi1_devdata *dd, struct hfi1_qpn_table *qpt)
+{
+       u32 offset, qpn, i;
+       struct qpn_map *map;
+       int ret = 0;
+
+       spin_lock_init(&qpt->lock);
+
+       qpt->last = 0;
+       qpt->incr = 1 << dd->qos_shift;
+
+       /* ensure we don't assign QPs from the KDETH 64K window */
+       qpn = kdeth_qp << 16;
+       qpt->nmaps = qpn / BITS_PER_PAGE;
+       /* This should always be zero */
+       offset = qpn & BITS_PER_PAGE_MASK;
+       map = &qpt->map[qpt->nmaps];
+       dd_dev_info(dd, "Reserving QPNs for KDETH window from 0x%x to 0x%x\n",
+               qpn, qpn + 65535);
+       for (i = 0; i < 65536; i++) {
+               if (!map->page) {
+                       get_map_page(qpt, map);
+                       if (!map->page) {
+                               ret = -ENOMEM;
+                               break;
+                       }
+               }
+               set_bit(offset, map->page);
+               offset++;
+               if (offset == BITS_PER_PAGE) {
+                       /* next page */
+                       qpt->nmaps++;
+                       map++;
+                       offset = 0;
+               }
+       }
+       return ret;
+}
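init_qpn_table() reserves the 65536 QPNs starting at kdeth_qp << 16 by setting their bits up front. Assuming 4 KiB pages (so each map page covers 32768 QPNs), that window starts at a page boundary and spans exactly two map pages, which is why the "This should always be zero" offset check holds. A quick standalone check of the arithmetic (the page size and the kdeth_qp value are assumptions):

#include <stdio.h>

#define TOY_PAGE_SIZE 4096UL			/* assumption: 4 KiB pages */
#define TOY_BITS_PER_PAGE (TOY_PAGE_SIZE * 8)	/* 32768 QPN bits per map page */

int main(void)
{
	unsigned long kdeth_qp = 0x80;			/* illustrative prefix */
	unsigned long first_qpn = kdeth_qp << 16;	/* start of the 64K window */

	printf("first map page: %lu\n", first_qpn / TOY_BITS_PER_PAGE);
	printf("offset in page: %lu (expected 0)\n", first_qpn & (TOY_BITS_PER_PAGE - 1));
	printf("pages spanned : %lu\n", 65536UL / TOY_BITS_PER_PAGE);	/* 2 */
	return 0;
}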
+
+/**
+ * free_qpn_table - free the QP number table for a device
+ * @qpt: the QPN table
+ */
+static void free_qpn_table(struct hfi1_qpn_table *qpt)
+{
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(qpt->map); i++)
+               free_page((unsigned long) qpt->map[i].page);
+}
+
+/**
+ * hfi1_get_credit - handle the credit field of an incoming AETH
+ * @qp: the QP whose send credit to update
+ * @aeth: the Acknowledge Extended Transport Header
+ *
+ * The QP s_lock should be held.
+ */
+void hfi1_get_credit(struct hfi1_qp *qp, u32 aeth)
+{
+       u32 credit = (aeth >> HFI1_AETH_CREDIT_SHIFT) & HFI1_AETH_CREDIT_MASK;
+
+       /*
+        * If the credit is invalid, we can send
+        * as many packets as we like.  Otherwise, we have to
+        * honor the credit field.
+        */
+       if (credit == HFI1_AETH_CREDIT_INVAL) {
+               if (!(qp->s_flags & HFI1_S_UNLIMITED_CREDIT)) {
+                       qp->s_flags |= HFI1_S_UNLIMITED_CREDIT;
+                       if (qp->s_flags & HFI1_S_WAIT_SSN_CREDIT) {
+                               qp->s_flags &= ~HFI1_S_WAIT_SSN_CREDIT;
+                               hfi1_schedule_send(qp);
+                       }
+               }
+       } else if (!(qp->s_flags & HFI1_S_UNLIMITED_CREDIT)) {
+               /* Compute new LSN (i.e., MSN + credit) */
+               credit = (aeth + credit_table[credit]) & HFI1_MSN_MASK;
+               if (cmp_msn(credit, qp->s_lsn) > 0) {
+                       qp->s_lsn = credit;
+                       if (qp->s_flags & HFI1_S_WAIT_SSN_CREDIT) {
+                               qp->s_flags &= ~HFI1_S_WAIT_SSN_CREDIT;
+                               hfi1_schedule_send(qp);
+                       }
+               }
+       }
+}
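hfi1_get_credit() splits the AETH into an MSN in the low bits and a 5-bit credit code shifted into the high bits; the new limit sequence number is simply the MSN plus the decoded credit count, modulo the MSN space. A standalone sketch of that decode, using illustrative shift/mask values and a stand-in table (the real HFI1_AETH_* constants and credit_table[] are defined in the driver):

#include <stdio.h>
#include <stdint.h>

#define TOY_MSN_MASK          0x00FFFFFFu	/* 24-bit message sequence number */
#define TOY_AETH_CREDIT_SHIFT 24
#define TOY_AETH_CREDIT_MASK  0x1Fu		/* 5-bit credit code */

/* Tiny stand-in for the driver's credit_table[] lookup. */
static const uint32_t toy_credit_table[32] = {
	0, 1, 2, 3, 4, 6, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192,
	256, 384, 512, 768, 1024, 1536, 2048, 3072, 4096, 6144, 8192,
	12288, 16384, 24576, 32768, 49152
};

int main(void)
{
	uint32_t aeth = (7u << TOY_AETH_CREDIT_SHIFT) | 0x000100u; /* code 7, MSN 0x100 */
	uint32_t code = (aeth >> TOY_AETH_CREDIT_SHIFT) & TOY_AETH_CREDIT_MASK;
	uint32_t new_lsn = (aeth + toy_credit_table[code]) & TOY_MSN_MASK;

	printf("credit code %u -> %u RWQEs, new LSN 0x%x\n",
	       code, toy_credit_table[code], new_lsn);	/* code 7 -> 12, LSN 0x10c */
	return 0;
}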
+
+void hfi1_qp_wakeup(struct hfi1_qp *qp, u32 flag)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&qp->s_lock, flags);
+       if (qp->s_flags & flag) {
+               qp->s_flags &= ~flag;
+               trace_hfi1_qpwakeup(qp, flag);
+               hfi1_schedule_send(qp);
+       }
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+       /* Notify hfi1_destroy_qp() if it is waiting. */
+       if (atomic_dec_and_test(&qp->refcount))
+               wake_up(&qp->wait);
+}
+
+static int iowait_sleep(
+       struct sdma_engine *sde,
+       struct iowait *wait,
+       struct sdma_txreq *stx,
+       unsigned seq)
+{
+       struct verbs_txreq *tx = container_of(stx, struct verbs_txreq, txreq);
+       struct hfi1_qp *qp;
+       unsigned long flags;
+       int ret = 0;
+       struct hfi1_ibdev *dev;
+
+       qp = tx->qp;
+
+       spin_lock_irqsave(&qp->s_lock, flags);
+       if (ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK) {
+
+               /*
+                * If we couldn't queue the DMA request, save the info
+                * and try again later rather than destroying the
+                * buffer and undoing the side effects of the copy.
+                */
+               /* Make a common routine? */
+               dev = &sde->dd->verbs_dev;
+               list_add_tail(&stx->list, &wait->tx_head);
+               write_seqlock(&dev->iowait_lock);
+               if (sdma_progress(sde, seq, stx))
+                       goto eagain;
+               if (list_empty(&qp->s_iowait.list)) {
+                       struct hfi1_ibport *ibp =
+                               to_iport(qp->ibqp.device, qp->port_num);
+
+                       ibp->n_dmawait++;
+                       qp->s_flags |= HFI1_S_WAIT_DMA_DESC;
+                       list_add_tail(&qp->s_iowait.list, &sde->dmawait);
+                       trace_hfi1_qpsleep(qp, HFI1_S_WAIT_DMA_DESC);
+                       atomic_inc(&qp->refcount);
+               }
+               write_sequnlock(&dev->iowait_lock);
+               qp->s_flags &= ~HFI1_S_BUSY;
+               spin_unlock_irqrestore(&qp->s_lock, flags);
+               ret = -EBUSY;
+       } else {
+               spin_unlock_irqrestore(&qp->s_lock, flags);
+               hfi1_put_txreq(tx);
+       }
+       return ret;
+eagain:
+       write_sequnlock(&dev->iowait_lock);
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+       list_del_init(&stx->list);
+       return -EAGAIN;
+}
+
+static void iowait_wakeup(struct iowait *wait, int reason)
+{
+       struct hfi1_qp *qp = container_of(wait, struct hfi1_qp, s_iowait);
+
+       WARN_ON(reason != SDMA_AVAIL_REASON);
+       hfi1_qp_wakeup(qp, HFI1_S_WAIT_DMA_DESC);
+}
+
+int hfi1_qp_init(struct hfi1_ibdev *dev)
+{
+       struct hfi1_devdata *dd = dd_from_dev(dev);
+       int i;
+       int ret = -ENOMEM;
+
+       /* allocate parent object */
+       dev->qp_dev = kzalloc(sizeof(*dev->qp_dev), GFP_KERNEL);
+       if (!dev->qp_dev)
+               goto nomem;
+       /* allocate hash table */
+       dev->qp_dev->qp_table_size = hfi1_qp_table_size;
+       dev->qp_dev->qp_table_bits = ilog2(hfi1_qp_table_size);
+       dev->qp_dev->qp_table =
+               kmalloc(dev->qp_dev->qp_table_size *
+                               sizeof(*dev->qp_dev->qp_table),
+                       GFP_KERNEL);
+       if (!dev->qp_dev->qp_table)
+               goto nomem;
+       for (i = 0; i < dev->qp_dev->qp_table_size; i++)
+               RCU_INIT_POINTER(dev->qp_dev->qp_table[i], NULL);
+       spin_lock_init(&dev->qp_dev->qpt_lock);
+       /* initialize qpn map */
+       ret = init_qpn_table(dd, &dev->qp_dev->qpn_table);
+       if (ret)
+               goto nomem;
+       return ret;
+nomem:
+       if (dev->qp_dev) {
+               kfree(dev->qp_dev->qp_table);
+               free_qpn_table(&dev->qp_dev->qpn_table);
+               kfree(dev->qp_dev);
+       }
+       return ret;
+}
+
+void hfi1_qp_exit(struct hfi1_ibdev *dev)
+{
+       struct hfi1_devdata *dd = dd_from_dev(dev);
+       u32 qps_inuse;
+
+       qps_inuse = free_all_qps(dd);
+       if (qps_inuse)
+               dd_dev_err(dd, "QP memory leak! %u still in use\n",
+                          qps_inuse);
+       if (dev->qp_dev) {
+               kfree(dev->qp_dev->qp_table);
+               free_qpn_table(&dev->qp_dev->qpn_table);
+               kfree(dev->qp_dev);
+       }
+}
+
+/**
+ * qp_to_sdma_engine - map a qp to a send engine
+ * @qp: the QP
+ * @sc5: the 5 bit sc
+ *
+ * Return:
+ * A send engine for the qp or NULL for SMI type qp.
+ */
+struct sdma_engine *qp_to_sdma_engine(struct hfi1_qp *qp, u8 sc5)
+{
+       struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+       struct sdma_engine *sde;
+
+       if (!(dd->flags & HFI1_HAS_SEND_DMA))
+               return NULL;
+       switch (qp->ibqp.qp_type) {
+       case IB_QPT_UC:
+       case IB_QPT_RC:
+               break;
+       case IB_QPT_SMI:
+               return NULL;
+       default:
+               break;
+       }
+       sde = sdma_select_engine_sc(dd, qp->ibqp.qp_num >> dd->qos_shift, sc5);
+       return sde;
+}
+
+struct qp_iter {
+       struct hfi1_ibdev *dev;
+       struct hfi1_qp *qp;
+       int specials;
+       int n;
+};
+
+struct qp_iter *qp_iter_init(struct hfi1_ibdev *dev)
+{
+       struct qp_iter *iter;
+
+       iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+       if (!iter)
+               return NULL;
+
+       iter->dev = dev;
+       iter->specials = dev->ibdev.phys_port_cnt * 2;
+       if (qp_iter_next(iter)) {
+               kfree(iter);
+               return NULL;
+       }
+
+       return iter;
+}
+
+int qp_iter_next(struct qp_iter *iter)
+{
+       struct hfi1_ibdev *dev = iter->dev;
+       int n = iter->n;
+       int ret = 1;
+       struct hfi1_qp *pqp = iter->qp;
+       struct hfi1_qp *qp;
+
+       /*
+        * The approach is to consider the special qps
+        * as additional table entries before the
+        * real hash table.  Since the qp code sets
+        * the qp->next hash link to NULL, this works just fine.
+        *
+        * iter->specials is 2 * # ports
+        *
+        * n = 0..iter->specials-1 are the special qp indices
+        *
+        * n = iter->specials..dev->qp_dev->qp_table_size+iter->specials-1
+        * are the potential hash bucket entries
+        */
+       for (; n <  dev->qp_dev->qp_table_size + iter->specials; n++) {
+               if (pqp) {
+                       qp = rcu_dereference(pqp->next);
+               } else {
+                       if (n < iter->specials) {
+                               struct hfi1_pportdata *ppd;
+                               struct hfi1_ibport *ibp;
+                               int pidx;
+
+                               pidx = n % dev->ibdev.phys_port_cnt;
+                               ppd = &dd_from_dev(dev)->pport[pidx];
+                               ibp = &ppd->ibport_data;
+
+                               if (!(n & 1))
+                                       qp = rcu_dereference(ibp->qp[0]);
+                               else
+                                       qp = rcu_dereference(ibp->qp[1]);
+                       } else {
+                               qp = rcu_dereference(
+                                       dev->qp_dev->qp_table[
+                                               (n - iter->specials)]);
+                       }
+               }
+               pqp = qp;
+               if (qp) {
+                       iter->qp = qp;
+                       iter->n = n;
+                       return 0;
+               }
+       }
+       return ret;
+}
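The index math described in the comment above is easy to check in isolation: for n below iter->specials, the port is n % nports and the low bit picks QP0 vs QP1; everything past that is a plain hash-bucket index. A standalone illustration, assuming two ports and a small table:

#include <stdio.h>

int main(void)
{
	int nports = 2, specials = 2 * nports, table_size = 8;
	int n;

	for (n = 0; n < specials + table_size; n++) {
		if (n < specials)
			printf("n=%d -> port %d, qp[%d]\n", n, n % nports, n & 1);
		else
			printf("n=%d -> hash bucket %d\n", n, n - specials);
	}
	return 0;
}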
+
+static const char * const qp_type_str[] = {
+       "SMI", "GSI", "RC", "UC", "UD",
+};
+
+static int qp_idle(struct hfi1_qp *qp)
+{
+       return
+               qp->s_last == qp->s_acked &&
+               qp->s_acked == qp->s_cur &&
+               qp->s_cur == qp->s_tail &&
+               qp->s_tail == qp->s_head;
+}
+
+void qp_iter_print(struct seq_file *s, struct qp_iter *iter)
+{
+       struct hfi1_swqe *wqe;
+       struct hfi1_qp *qp = iter->qp;
+       struct sdma_engine *sde;
+
+       sde = qp_to_sdma_engine(qp, qp->s_sc);
+       wqe = get_swqe_ptr(qp, qp->s_last);
+       seq_printf(s,
+                  "N %d %s QP%u R %u %s %u %u %u f=%x %u %u %u %u %u PSN %x %x %x %x %x (%u %u %u %u %u %u) QP%u LID %x SL %u MTU %d %u %u %u SDE %p,%u\n",
+                  iter->n,
+                  qp_idle(qp) ? "I" : "B",
+                  qp->ibqp.qp_num,
+                  atomic_read(&qp->refcount),
+                  qp_type_str[qp->ibqp.qp_type],
+                  qp->state,
+                  wqe ? wqe->wr.opcode : 0,
+                  qp->s_hdrwords,
+                  qp->s_flags,
+                  atomic_read(&qp->s_iowait.sdma_busy),
+                  !list_empty(&qp->s_iowait.list),
+                  qp->timeout,
+                  wqe ? wqe->ssn : 0,
+                  qp->s_lsn,
+                  qp->s_last_psn,
+                  qp->s_psn, qp->s_next_psn,
+                  qp->s_sending_psn, qp->s_sending_hpsn,
+                  qp->s_last, qp->s_acked, qp->s_cur,
+                  qp->s_tail, qp->s_head, qp->s_size,
+                  qp->remote_qpn,
+                  qp->remote_ah_attr.dlid,
+                  qp->remote_ah_attr.sl,
+                  qp->pmtu,
+                  qp->s_retry_cnt,
+                  qp->timeout,
+                  qp->s_rnr_retry_cnt,
+                  sde,
+                  sde ? sde->this_idx : 0);
+}
+
+void qp_comm_est(struct hfi1_qp *qp)
+{
+       qp->r_flags |= HFI1_R_COMM_EST;
+       if (qp->ibqp.event_handler) {
+               struct ib_event ev;
+
+               ev.device = qp->ibqp.device;
+               ev.element.qp = &qp->ibqp;
+               ev.event = IB_EVENT_COMM_EST;
+               qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+       }
+}
diff --git a/drivers/staging/rdma/hfi1/qp.h b/drivers/staging/rdma/hfi1/qp.h
new file mode 100644 (file)
index 0000000..6b50585
--- /dev/null
@@ -0,0 +1,235 @@
+#ifndef _QP_H
+#define _QP_H
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/hash.h>
+#include "verbs.h"
+
+#define QPN_MAX                 (1 << 24)
+#define QPNMAP_ENTRIES          (QPN_MAX / PAGE_SIZE / BITS_PER_BYTE)
+
+/*
+ * QPN-map pages start out as NULL, they get allocated upon
+ * first use and are never deallocated. This way,
+ * large bitmaps are not allocated unless large numbers of QPs are used.
+ */
+struct qpn_map {
+       void *page;
+};
+
+struct hfi1_qpn_table {
+       spinlock_t lock; /* protect changes in this struct */
+       unsigned flags;         /* flags for QP0/1 allocated for each port */
+       u32 last;               /* last QP number allocated */
+       u32 nmaps;              /* size of the map table */
+       u16 limit;
+       u8  incr;
+       /* bit map of free QP numbers other than 0/1 */
+       struct qpn_map map[QPNMAP_ENTRIES];
+};
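With QPN_MAX of 2^24 and 4 KiB pages, QPNMAP_ENTRIES works out to 512, so the map[] array above costs only a few kilobytes of pointers until individual pages are actually populated. A quick check of that arithmetic (the 4 KiB page size is an assumption; PAGE_SIZE varies by architecture):

#include <stdio.h>

int main(void)
{
	unsigned long qpn_max = 1UL << 24;
	unsigned long page_size = 4096, bits_per_byte = 8;

	/* QPNMAP_ENTRIES = QPN_MAX / PAGE_SIZE / BITS_PER_BYTE */
	printf("entries: %lu\n", qpn_max / page_size / bits_per_byte);	/* 512 */
	return 0;
}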
+
+struct hfi1_qp_ibdev {
+       u32 qp_table_size;
+       u32 qp_table_bits;
+       struct hfi1_qp __rcu **qp_table;
+       spinlock_t qpt_lock;
+       struct hfi1_qpn_table qpn_table;
+};
+
+static inline u32 qpn_hash(struct hfi1_qp_ibdev *dev, u32 qpn)
+{
+       return hash_32(qpn, dev->qp_table_bits);
+}
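qpn_hash() is simply the kernel's hash_32(): a multiplicative hash that keeps the top qp_table_bits bits of the product. A standalone equivalent; the multiplier below is one of the golden-ratio primes historically used by linux/hash.h and should be treated as illustrative:

#include <stdio.h>
#include <stdint.h>

/* Multiplicative hash in the style of hash_32(val, bits). */
static uint32_t toy_hash_32(uint32_t val, unsigned int bits)
{
	return (val * 0x9e370001u) >> (32 - bits);
}

int main(void)
{
	unsigned int table_bits = 8;	/* e.g. a 256-bucket qp_table */

	printf("qpn 0x12345 -> bucket %u\n", toy_hash_32(0x12345, table_bits));
	printf("qpn 0x12346 -> bucket %u\n", toy_hash_32(0x12346, table_bits));
	return 0;
}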
+
+/**
+ * hfi1_lookup_qpn - return the QP with the given QPN
+ * @ibp: the ibport
+ * @qpn: the QP number to look up
+ *
+ * The caller must hold the rcu_read_lock(), and keep the lock until
+ * the returned qp is no longer in use.
+ */
+static inline struct hfi1_qp *hfi1_lookup_qpn(struct hfi1_ibport *ibp,
+                               u32 qpn) __must_hold(RCU)
+{
+       struct hfi1_qp *qp = NULL;
+
+       if (unlikely(qpn <= 1)) {
+               qp = rcu_dereference(ibp->qp[qpn]);
+       } else {
+               struct hfi1_ibdev *dev = &ppd_from_ibp(ibp)->dd->verbs_dev;
+               u32 n = qpn_hash(dev->qp_dev, qpn);
+
+               for (qp = rcu_dereference(dev->qp_dev->qp_table[n]); qp;
+                       qp = rcu_dereference(qp->next))
+                       if (qp->ibqp.qp_num == qpn)
+                               break;
+       }
+       return qp;
+}
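Because hfi1_lookup_qpn() only dereferences RCU-protected pointers, a caller must bracket both the lookup and every use of the returned QP with rcu_read_lock()/rcu_read_unlock(), or take its own reference before dropping the lock. A hedged usage sketch (kernel context assumed; hfi1_handle_one_packet() is a made-up name, not driver API):

/* Sketch only: shows the caller-side RCU contract for hfi1_lookup_qpn(). */
static void hfi1_handle_one_packet(struct hfi1_ibport *ibp, u32 qpn)
{
	struct hfi1_qp *qp;

	rcu_read_lock();
	qp = hfi1_lookup_qpn(ibp, qpn);
	if (qp) {
		/* qp is only guaranteed to stay around inside this read-side
		 * section, unless we take our own reference first, e.g.
		 * atomic_inc(&qp->refcount). */
		;
	}
	rcu_read_unlock();
}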
+
+/**
+ * hfi1_error_qp - put a QP into the error state
+ * @qp: the QP to put into the error state
+ * @err: the receive completion error to signal if a RWQE is active
+ *
+ * Flushes both send and receive work queues.
+ * Returns true if last WQE event should be generated.
+ * The QP r_lock and s_lock should be held and interrupts disabled.
+ * If we are already in error state, just return.
+ */
+int hfi1_error_qp(struct hfi1_qp *qp, enum ib_wc_status err);
+
+/**
+ * hfi1_modify_qp - modify the attributes of a queue pair
+ * @ibqp: the queue pair whose attributes we're modifying
+ * @attr: the new attributes
+ * @attr_mask: the mask of attributes to modify
+ * @udata: user data for libibverbs.so
+ *
+ * Returns 0 on success, otherwise returns an errno.
+ */
+int hfi1_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+                  int attr_mask, struct ib_udata *udata);
+
+int hfi1_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+                 int attr_mask, struct ib_qp_init_attr *init_attr);
+
+/**
+ * hfi1_compute_aeth - compute the AETH (syndrome + MSN)
+ * @qp: the queue pair to compute the AETH for
+ *
+ * Returns the AETH.
+ */
+__be32 hfi1_compute_aeth(struct hfi1_qp *qp);
+
+/**
+ * hfi1_create_qp - create a queue pair for a device
+ * @ibpd: the protection domain whose device we create the queue pair for
+ * @init_attr: the attributes of the queue pair
+ * @udata: user data for libibverbs.so
+ *
+ * Returns the queue pair on success, otherwise returns an errno.
+ *
+ * Called by the ib_create_qp() core verbs function.
+ */
+struct ib_qp *hfi1_create_qp(struct ib_pd *ibpd,
+                            struct ib_qp_init_attr *init_attr,
+                            struct ib_udata *udata);
+/**
+ * hfi1_destroy_qp - destroy a queue pair
+ * @ibqp: the queue pair to destroy
+ *
+ * Returns 0 on success.
+ *
+ * Note that this can be called while the QP is actively sending or
+ * receiving!
+ */
+int hfi1_destroy_qp(struct ib_qp *ibqp);
+
+/**
+ * hfi1_get_credit - handle the credit field of an incoming AETH
+ * @qp: the QP whose send credit to update
+ * @aeth: the Acknowledge Extended Transport Header
+ *
+ * The QP s_lock should be held.
+ */
+void hfi1_get_credit(struct hfi1_qp *qp, u32 aeth);
+
+/**
+ * hfi1_qp_init - allocate QP tables
+ * @dev: a pointer to the hfi1_ibdev
+ */
+int hfi1_qp_init(struct hfi1_ibdev *dev);
+
+/**
+ * hfi1_qp_exit - free the QP related structures
+ * @dev: a pointer to the hfi1_ibdev
+ */
+void hfi1_qp_exit(struct hfi1_ibdev *dev);
+
+/**
+ * hfi1_qp_wakeup - wake up a QP stalled on the indicated event
+ * @qp: the QP
+ * @flag: the flag the QP is waiting on
+ */
+void hfi1_qp_wakeup(struct hfi1_qp *qp, u32 flag);
+
+struct sdma_engine *qp_to_sdma_engine(struct hfi1_qp *qp, u8 sc5);
+
+struct qp_iter;
+
+/**
+ * qp_iter_init - allocate an iterator over the device's QPs
+ * @dev: the hfi1_ibdev
+ */
+struct qp_iter *qp_iter_init(struct hfi1_ibdev *dev);
+
+/**
+ * qp_iter_next - advance the iterator to the next QP
+ * @iter: the iterator for the qp hash list
+ */
+int qp_iter_next(struct qp_iter *iter);
+
+/**
+ * qp_iter_print - print the QP's state to the seq_file
+ * @s: the seq_file to emit the qp information on
+ * @iter: the iterator for the qp hash list
+ */
+void qp_iter_print(struct seq_file *s, struct qp_iter *iter);
+
+/**
+ * qp_comm_est - signal that communication on the QP is established
+ * @qp: the QP
+ */
+void qp_comm_est(struct hfi1_qp *qp);
+
+#endif /* _QP_H */
diff --git a/drivers/staging/rdma/hfi1/qsfp.c b/drivers/staging/rdma/hfi1/qsfp.c
new file mode 100644 (file)
index 0000000..3138936
--- /dev/null
@@ -0,0 +1,546 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/delay.h>
+#include <linux/pci.h>
+#include <linux/vmalloc.h>
+
+#include "hfi.h"
+#include "twsi.h"
+
+/*
+ * QSFP support for hfi driver, using "Two Wire Serial Interface" driver
+ * in twsi.c
+ */
+#define I2C_MAX_RETRY 4
+
+/*
+ * Unlocked i2c write.  Must hold dd->qsfp_i2c_mutex.
+ */
+static int __i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr,
+                      int offset, void *bp, int len)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       int ret, cnt;
+       u8 *buff = bp;
+
+       /* Make sure TWSI bus is in sane state. */
+       ret = hfi1_twsi_reset(dd, target);
+       if (ret) {
+               hfi1_dev_porterr(dd, ppd->port,
+                                "I2C interface Reset for write failed\n");
+               return -EIO;
+       }
+
+       cnt = 0;
+       while (cnt < len) {
+               int wlen = len - cnt;
+
+               ret = hfi1_twsi_blk_wr(dd, target, i2c_addr, offset,
+                                      buff + cnt, wlen);
+               if (ret) {
+                       /* hfi1_twsi_blk_wr() returns 1 on error, else 0 */
+                       return -EIO;
+               }
+               offset += wlen;
+               cnt += wlen;
+       }
+
+       /* Must wait min 20us between qsfp i2c transactions */
+       udelay(20);
+
+       return cnt;
+}
+
+int i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset,
+             void *bp, int len)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       int ret;
+
+       ret = mutex_lock_interruptible(&dd->qsfp_i2c_mutex);
+       if (!ret) {
+               ret = __i2c_write(ppd, target, i2c_addr, offset, bp, len);
+               mutex_unlock(&dd->qsfp_i2c_mutex);
+       }
+
+       return ret;
+}
+
+/*
+ * Unlocked i2c read.  Must hold dd->qsfp_i2c_mutex.
+ */
+static int __i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr,
+                     int offset, void *bp, int len)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       int ret, cnt, pass = 0;
+       int stuck = 0;
+       u8 *buff = bp;
+
+       /* Make sure TWSI bus is in sane state. */
+       ret = hfi1_twsi_reset(dd, target);
+       if (ret) {
+               hfi1_dev_porterr(dd, ppd->port,
+                                "I2C interface Reset for read failed\n");
+               ret = -EIO;
+               stuck = 1;
+               goto exit;
+       }
+
+       cnt = 0;
+       while (cnt < len) {
+               int rlen = len - cnt;
+
+               ret = hfi1_twsi_blk_rd(dd, target, i2c_addr, offset,
+                                      buff + cnt, rlen);
+               /* Some QSFPs fail on the first try; retry a few times */
+               if (ret && cnt == 0 && ++pass < I2C_MAX_RETRY)
+                       continue;
+               if (ret) {
+                       /* hfi1_twsi_blk_rd() returns 1 on error, else 0 */
+                       ret = -EIO;
+                       goto exit;
+               }
+               offset += rlen;
+               cnt += rlen;
+       }
+
+       ret = cnt;
+
+exit:
+       if (stuck)
+               dd_dev_err(dd, "I2C interface bus stuck non-idle\n");
+
+       if (pass >= I2C_MAX_RETRY && ret)
+               hfi1_dev_porterr(dd, ppd->port,
+                                "I2C failed even after retrying\n");
+       else if (pass)
+               hfi1_dev_porterr(dd, ppd->port, "I2C retries: %d\n", pass);
+
+       /* Must wait min 20us between qsfp i2c transactions */
+       udelay(20);
+
+       return ret;
+}
+
+int i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset,
+            void *bp, int len)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       int ret;
+
+       ret = mutex_lock_interruptible(&dd->qsfp_i2c_mutex);
+       if (!ret) {
+               ret = __i2c_read(ppd, target, i2c_addr, offset, bp, len);
+               mutex_unlock(&dd->qsfp_i2c_mutex);
+       }
+
+       return ret;
+}
+
+int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
+              int len)
+{
+       int count = 0;
+       int offset;
+       int nwrite;
+       int ret;
+       u8 page;
+
+       ret = mutex_lock_interruptible(&ppd->dd->qsfp_i2c_mutex);
+       if (ret)
+               return ret;
+
+       while (count < len) {
+               /*
+                * Set the qsfp page based on a zero-based address
+                * and a page size of QSFP_PAGESIZE bytes.
+                */
+               page = (u8)(addr / QSFP_PAGESIZE);
+
+               ret = __i2c_write(ppd, target, QSFP_DEV,
+                                       QSFP_PAGE_SELECT_BYTE_OFFS, &page, 1);
+               if (ret != 1) {
+                       hfi1_dev_porterr(
+                       ppd->dd,
+                       ppd->port,
+                       "can't write QSFP_PAGE_SELECT_BYTE: %d\n", ret);
+                       ret = -EIO;
+                       break;
+               }
+
+               /* truncate write to end of page if crossing page boundary */
+               offset = addr % QSFP_PAGESIZE;
+               nwrite = len - count;
+               if ((offset + nwrite) > QSFP_PAGESIZE)
+                       nwrite = QSFP_PAGESIZE - offset;
+
+               ret = __i2c_write(ppd, target, QSFP_DEV, offset, bp + count,
+                                       nwrite);
+               if (ret <= 0)   /* stop on error or nothing written */
+                       break;
+
+               count += ret;
+               addr += ret;
+       }
+
+       mutex_unlock(&ppd->dd->qsfp_i2c_mutex);
+
+       if (ret < 0)
+               return ret;
+       return count;
+}
+
+int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
+             int len)
+{
+       int count = 0;
+       int offset;
+       int nread;
+       int ret;
+       u8 page;
+
+       ret = mutex_lock_interruptible(&ppd->dd->qsfp_i2c_mutex);
+       if (ret)
+               return ret;
+
+       while (count < len) {
+               /*
+                * Set the qsfp page based on a zero-based address
+                * and a page size of QSFP_PAGESIZE bytes.
+                */
+               page = (u8)(addr / QSFP_PAGESIZE);
+               ret = __i2c_write(ppd, target, QSFP_DEV,
+                                       QSFP_PAGE_SELECT_BYTE_OFFS, &page, 1);
+               if (ret != 1) {
+                       hfi1_dev_porterr(
+                       ppd->dd,
+                       ppd->port,
+                       "can't write QSFP_PAGE_SELECT_BYTE: %d\n", ret);
+                       ret = -EIO;
+                       break;
+               }
+
+               /* truncate read to end of page if crossing page boundary */
+               offset = addr % QSFP_PAGESIZE;
+               nread = len - count;
+               if ((offset + nread) > QSFP_PAGESIZE)
+                       nread = QSFP_PAGESIZE - offset;
+
+               ret = __i2c_read(ppd, target, QSFP_DEV, offset, bp + count,
+                                       nread);
+               if (ret <= 0)   /* stop on error or nothing read */
+                       break;
+
+               count += ret;
+               addr += ret;
+       }
+
+       mutex_unlock(&ppd->dd->qsfp_i2c_mutex);
+
+       if (ret < 0)
+               return ret;
+       return count;
+}
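The page-select arithmetic in qsfp_write() and qsfp_read() treats addr as a flat offset in units of QSFP_PAGESIZE (256) bytes per page: the page number is written to byte 127 first, then the I2C transfer starts at addr % QSFP_PAGESIZE and is truncated at the page boundary. A hedged worked example, with values matching the refresh_qsfp_cache() call sites below:

/* Illustrative only: the split qsfp_read() performs for flat addr 384,
 * which lands on page 1 at device offset 128, i.e. the first byte of
 * upper page 01h. */
static void qsfp_addr_example(void)
{
        int addr = 384, len = 64;
        u8 page = (u8)(addr / QSFP_PAGESIZE);   /* 1, written to byte 127 first */
        int offset = addr % QSFP_PAGESIZE;      /* 128 */
        int nread = len;

        if ((offset + nread) > QSFP_PAGESIZE)   /* stay within this page */
                nread = QSFP_PAGESIZE - offset;
        (void)page;
        (void)nread;
}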
+
+/*
+ * This function caches the QSFP memory range in 128 byte chunks.
+ * As an example, the next byte after address 255 is byte 128 from
+ * upper page 01H (if existing) rather than byte 0 from lower page 00H.
+ */
+int refresh_qsfp_cache(struct hfi1_pportdata *ppd, struct qsfp_data *cp)
+{
+       u32 target = ppd->dd->hfi1_id;
+       int ret;
+       unsigned long flags;
+       u8 *cache = &cp->cache[0];
+
+       /* ensure sane contents on invalid reads, for cable swaps */
+       memset(cache, 0, (QSFP_MAX_NUM_PAGES*128));
+       dd_dev_info(ppd->dd, "%s: called\n", __func__);
+       if (!qsfp_mod_present(ppd)) {
+               ret = -ENODEV;
+               goto bail;
+       }
+
+       ret = qsfp_read(ppd, target, 0, cache, 256);
+       if (ret != 256) {
+               dd_dev_info(ppd->dd,
+                       "%s: Read of pages 00H failed, expected 256, got %d\n",
+                       __func__, ret);
+               goto bail;
+       }
+
+       if (cache[0] != 0x0C && cache[0] != 0x0D)
+               goto bail;
+
+       /* Is paging enabled? (flat-memory bit clear) */
+       if (!(cache[2] & 4)) {
+
+               /* Paging enabled, page 03 required */
+               if ((cache[195] & 0xC0) == 0xC0) {
+                       /* all */
+                       ret = qsfp_read(ppd, target, 384, cache + 256, 128);
+                       if (ret <= 0 || ret != 128) {
+                               dd_dev_info(ppd->dd, "%s: failed\n", __func__);
+                               goto bail;
+                       }
+                       ret = qsfp_read(ppd, target, 640, cache + 384, 128);
+                       if (ret <= 0 || ret != 128) {
+                               dd_dev_info(ppd->dd, "%s: failed\n", __func__);
+                               goto bail;
+                       }
+                       ret = qsfp_read(ppd, target, 896, cache + 512, 128);
+                       if (ret <= 0 || ret != 128) {
+                               dd_dev_info(ppd->dd, "%s: failed\n", __func__);
+                               goto bail;
+                       }
+               } else if ((cache[195] & 0x80) == 0x80) {
+                       /* only page 2 and 3 */
+                       ret = qsfp_read(ppd, target, 640, cache + 384, 128);
+                       if (ret <= 0 || ret != 128) {
+                               dd_dev_info(ppd->dd, "%s: failed\n", __func__);
+                               goto bail;
+                       }
+                       ret = qsfp_read(ppd, target, 896, cache + 512, 128);
+                       if (ret <= 0 || ret != 128) {
+                               dd_dev_info(ppd->dd, "%s: failed\n", __func__);
+                               goto bail;
+                       }
+               } else if ((cache[195] & 0x40) == 0x40) {
+                       /* only page 1 and 3 */
+                       ret = qsfp_read(ppd, target, 384, cache + 256, 128);
+                       if (ret <= 0 || ret != 128) {
+                               dd_dev_info(ppd->dd, "%s: failed\n", __func__);
+                               goto bail;
+                       }
+                       ret = qsfp_read(ppd, target, 896, cache + 512, 128);
+                       if (ret <= 0 || ret != 128) {
+                               dd_dev_info(ppd->dd, "%s: failed\n", __func__);
+                               goto bail;
+                       }
+               } else {
+                       /* only page 3 */
+                       ret = qsfp_read(ppd, target, 896, cache + 512, 128);
+                       if (ret <= 0 || ret != 128) {
+                               dd_dev_info(ppd->dd, "%s: failed\n", __func__);
+                               goto bail;
+                       }
+               }
+       }
+
+       spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
+       ppd->qsfp_info.cache_valid = 1;
+       ppd->qsfp_info.cache_refresh_required = 0;
+       spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, flags);
+
+       return 0;
+
+bail:
+       memset(cache, 0, (QSFP_MAX_NUM_PAGES*128));
+       return ret;
+}
+
+const char * const hfi1_qsfp_devtech[16] = {
+       "850nm VCSEL", "1310nm VCSEL", "1550nm VCSEL", "1310nm FP",
+       "1310nm DFB", "1550nm DFB", "1310nm EML", "1550nm EML",
+       "Cu Misc", "1490nm DFB", "Cu NoEq", "Cu Eq",
+       "Undef", "Cu Active BothEq", "Cu FarEq", "Cu NearEq"
+};
+
+#define QSFP_DUMP_CHUNK 16 /* Holds longest string */
+#define QSFP_DEFAULT_HDR_CNT 224
+
+static const char *pwr_codes = "1.5W2.0W2.5W3.5W";
+
+int qsfp_mod_present(struct hfi1_pportdata *ppd)
+{
+       if (HFI1_CAP_IS_KSET(QSFP_ENABLED)) {
+               struct hfi1_devdata *dd = ppd->dd;
+               u64 reg;
+
+               reg = read_csr(dd,
+                       dd->hfi1_id ? ASIC_QSFP2_IN : ASIC_QSFP1_IN);
+               return !(reg & QSFP_HFI0_MODPRST_N);
+       }
+       /* always return cable present */
+       return 1;
+}
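Put differently: because MODPRST_N is asserted low (see the pin-mask comments in qsfp.h below), qsfp_mod_present() reads the ASIC_QSFP1_IN or ASIC_QSFP2_IN CSR and reports a module as present exactly when the QSFP_HFI0_MODPRST_N bit is clear.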
+
+/*
+ * This function maps QSFP memory addresses in 128 byte chunks in the following
+ * fashion per the CableInfo SMA query definition in the IBA 1.3 spec/OPA Gen 1
+ * spec
+ * For addr 000-127, lower page 00h
+ * For addr 128-255, upper page 00h
+ * For addr 256-383, upper page 01h
+ * For addr 384-511, upper page 02h
+ * For addr 512-639, upper page 03h
+ *
+ * For addresses beyond this range, the out-of-range portion of the data
+ * buffer is returned set to 0.
+ * For optional upper pages that are not valid, the corresponding range of
+ * bytes in the data buffer is returned set to 0.
+ */
+int get_cable_info(struct hfi1_devdata *dd, u32 port_num, u32 addr, u32 len,
+                  u8 *data)
+{
+       struct hfi1_pportdata *ppd;
+       u32 excess_len = 0;
+       int ret = 0;
+
+       if (port_num > dd->num_pports || port_num < 1) {
+               dd_dev_info(dd, "%s: Invalid port number %d\n",
+                               __func__, port_num);
+               ret = -EINVAL;
+               goto set_zeroes;
+       }
+
+       ppd = dd->pport + (port_num - 1);
+       if (!qsfp_mod_present(ppd)) {
+               ret = -ENODEV;
+               goto set_zeroes;
+       }
+
+       if (!ppd->qsfp_info.cache_valid) {
+               ret = -EINVAL;
+               goto set_zeroes;
+       }
+
+       if (addr >= (QSFP_MAX_NUM_PAGES * 128)) {
+               ret = -ERANGE;
+               goto set_zeroes;
+       }
+
+       if ((addr + len) > (QSFP_MAX_NUM_PAGES * 128)) {
+               excess_len = (addr + len) - (QSFP_MAX_NUM_PAGES * 128);
+               memcpy(data, &ppd->qsfp_info.cache[addr], (len - excess_len));
+               data += (len - excess_len);
+               goto set_zeroes;
+       }
+
+       memcpy(data, &ppd->qsfp_info.cache[addr], len);
+       return 0;
+
+set_zeroes:
+       memset(data, 0, excess_len);
+       return ret;
+}
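As an illustration of the flat CableInfo address map described above, a hypothetical helper (read_qsfp_serial() is not part of this patch) that pulls the vendor serial number out of upper page 00h can pass the SFF byte offset straight through, since addresses 128-255 map to upper page 00h:

/* Illustrative only: QSFP_SN_OFFS (196) and QSFP_SN_LEN (16) are defined
 * in qsfp.h below; the caller supplies a 16-byte buffer. */
static int read_qsfp_serial(struct hfi1_devdata *dd, u32 port, u8 *sn)
{
        return get_cable_info(dd, port, QSFP_SN_OFFS, QSFP_SN_LEN, sn);
}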
+
+int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len)
+{
+       u8 *cache = &ppd->qsfp_info.cache[0];
+       u8 bin_buff[QSFP_DUMP_CHUNK];
+       char lenstr[6];
+       int sofar, ret;
+       int bidx = 0;
+       u8 *atten = &cache[QSFP_ATTEN_OFFS];
+       u8 *vendor_oui = &cache[QSFP_VOUI_OFFS];
+
+       sofar = 0;
+       lenstr[0] = ' ';
+       lenstr[1] = '\0';
+
+       if (ppd->qsfp_info.cache_valid) {
+
+               if (QSFP_IS_CU(cache[QSFP_MOD_TECH_OFFS]))
+                       sprintf(lenstr, "%dM ", cache[QSFP_MOD_LEN_OFFS]);
+
+               sofar += scnprintf(buf + sofar, len - sofar, "PWR:%.3sW\n",
+                               pwr_codes +
+                               (QSFP_PWR(cache[QSFP_MOD_PWR_OFFS]) * 4));
+
+               sofar += scnprintf(buf + sofar, len - sofar, "TECH:%s%s\n",
+                               lenstr,
+                       hfi1_qsfp_devtech[(cache[QSFP_MOD_TECH_OFFS]) >> 4]);
+
+               sofar += scnprintf(buf + sofar, len - sofar, "Vendor:%.*s\n",
+                                  QSFP_VEND_LEN, &cache[QSFP_VEND_OFFS]);
+
+               sofar += scnprintf(buf + sofar, len - sofar, "OUI:%06X\n",
+                                  QSFP_OUI(vendor_oui));
+
+               sofar += scnprintf(buf + sofar, len - sofar, "Part#:%.*s\n",
+                                  QSFP_PN_LEN, &cache[QSFP_PN_OFFS]);
+
+               sofar += scnprintf(buf + sofar, len - sofar, "Rev:%.*s\n",
+                                  QSFP_REV_LEN, &cache[QSFP_REV_OFFS]);
+
+               if (QSFP_IS_CU(cache[QSFP_MOD_TECH_OFFS]))
+                       sofar += scnprintf(buf + sofar, len - sofar,
+                               "Atten:%d, %d\n",
+                               QSFP_ATTEN_SDR(atten),
+                               QSFP_ATTEN_DDR(atten));
+
+               sofar += scnprintf(buf + sofar, len - sofar, "Serial:%.*s\n",
+                                  QSFP_SN_LEN, &cache[QSFP_SN_OFFS]);
+
+               sofar += scnprintf(buf + sofar, len - sofar, "Date:%.*s\n",
+                                  QSFP_DATE_LEN, &cache[QSFP_DATE_OFFS]);
+
+               sofar += scnprintf(buf + sofar, len - sofar, "Lot:%.*s\n",
+                                  QSFP_LOT_LEN, &cache[QSFP_LOT_OFFS]);
+
+               while (bidx < QSFP_DEFAULT_HDR_CNT) {
+                       int iidx;
+
+                       memcpy(bin_buff, &cache[bidx], QSFP_DUMP_CHUNK);
+                       for (iidx = 0; iidx < QSFP_DUMP_CHUNK; ++iidx) {
+                               sofar += scnprintf(buf + sofar, len-sofar,
+                                       " %02X", bin_buff[iidx]);
+                       }
+                       sofar += scnprintf(buf + sofar, len - sofar, "\n");
+                       bidx += QSFP_DUMP_CHUNK;
+               }
+       }
+       ret = sofar;
+       return ret;
+}
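As a worked example of the power-class decode a few lines up: if cache[QSFP_MOD_PWR_OFFS] reads 0x80, QSFP_PWR() yields (0x80 >> 6) & 3 = 2, pwr_codes + 2 * 4 points into the packed string at "2.5W...", and the "%.3s" format emits "PWR:2.5W".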
diff --git a/drivers/staging/rdma/hfi1/qsfp.h b/drivers/staging/rdma/hfi1/qsfp.h
new file mode 100644 (file)
index 0000000..d30c2a6
--- /dev/null
@@ -0,0 +1,222 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+/* QSFP support common definitions, for hfi driver */
+
+#define QSFP_DEV 0xA0
+#define QSFP_PWR_LAG_MSEC 2000
+#define QSFP_MODPRS_LAG_MSEC 20
+/* 128 byte pages, per SFF 8636 rev 2.4 */
+#define QSFP_MAX_NUM_PAGES     5
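With QSFP_MAX_NUM_PAGES set to 5, the per-port cache declared in struct qsfp_data below is 5 * 128 = 640 bytes: the 128-byte lower page 00h plus upper pages 00h through 03h, which is exactly the 256 + 3 * 128 bytes that refresh_qsfp_cache() fills.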
+
+/*
+ * Below are masks for QSFP pins.  Pins are the same for HFI0 and HFI1.
+ * _N means asserted low
+ */
+#define QSFP_HFI0_I2CCLK    (1 << 0)
+#define QSFP_HFI0_I2CDAT    (1 << 1)
+#define QSFP_HFI0_RESET_N   (1 << 2)
+#define QSFP_HFI0_INT_N            (1 << 3)
+#define QSFP_HFI0_MODPRST_N (1 << 4)
+
+/* QSFP is paged at 256 bytes */
+#define QSFP_PAGESIZE 256
+
+/* Defined fields that Intel requires of qualified cables */
+/* Byte 0 is Identifier, not checked */
+/* Byte 1 is reserved "status MSB" */
+/* Byte 2 is "status LSB" We only care that D2 "Flat Mem" is set. */
+/*
+ * Rest of first 128 not used, although 127 is reserved for page select
+ * if module is not "Flat memory".
+ */
+#define QSFP_PAGE_SELECT_BYTE_OFFS 127
+/* Byte 128 is Identifier: must be 0x0c for QSFP, or 0x0d for QSFP+ */
+#define QSFP_MOD_ID_OFFS 128
+/*
+ * Byte 129 is "Extended Identifier". We only care about D7,D6: Power class
+ *  0:1.5W, 1:2.0W, 2:2.5W, 3:3.5W
+ */
+#define QSFP_MOD_PWR_OFFS 129
+/* Byte 130 is Connector type. Not Intel req'd */
+/* Bytes 131..138 are Transceiver types, bit maps for various tech, none IB */
+/* Byte 139 is encoding. code 0x01 is 8b10b. Not Intel req'd */
+/* byte 140 is nominal bit-rate, in units of 100Mbits/sec Not Intel req'd */
+/* Byte 141 is Extended Rate Select. Not Intel req'd */
+/* Bytes 142..145 are lengths for various fiber types. Not Intel req'd */
+/* Byte 146 is length for Copper. Units of 1 meter */
+#define QSFP_MOD_LEN_OFFS 146
+/*
+ * Byte 147 is Device technology. D0..3 not Intel req'd
+ * D4..7 select from 15 choices, translated by table:
+ */
+#define QSFP_MOD_TECH_OFFS 147
+extern const char *const hfi1_qsfp_devtech[16];
+/* Active Equalization includes fiber, copper full EQ, and copper near Eq */
+#define QSFP_IS_ACTIVE(tech) ((0xA2FF >> ((tech) >> 4)) & 1)
+/* Active Equalization includes fiber, copper full EQ, and copper far Eq */
+#define QSFP_IS_ACTIVE_FAR(tech) ((0x32FF >> ((tech) >> 4)) & 1)
+/* Attenuation should be valid for copper other than full/near Eq */
+#define QSFP_HAS_ATTEN(tech) ((0x4D00 >> ((tech) >> 4)) & 1)
+/* Length is only valid if technology is "copper" */
+#define QSFP_IS_CU(tech) ((0xED00 >> ((tech) >> 4)) & 1)
+#define QSFP_TECH_1490 9
+
+#define QSFP_OUI(oui) (((unsigned)oui[0] << 16) | ((unsigned)oui[1] << 8) | \
+                       oui[2])
+#define QSFP_OUI_AMPHENOL 0x415048
+#define QSFP_OUI_FINISAR  0x009065
+#define QSFP_OUI_GORE     0x002177
+
+/* Bytes 148..163 are Vendor Name, Left-justified Blank-filled */
+#define QSFP_VEND_OFFS 148
+#define QSFP_VEND_LEN 16
+/* Byte 164 is IB Extended transceiver codes Bits D0..3 are SDR,DDR,QDR,EDR */
+#define QSFP_IBXCV_OFFS 164
+/* Bytes 165..167 are Vendor OUI number */
+#define QSFP_VOUI_OFFS 165
+#define QSFP_VOUI_LEN 3
+/* Bytes 168..183 are Vendor Part Number, string */
+#define QSFP_PN_OFFS 168
+#define QSFP_PN_LEN 16
+/* Bytes 184,185 are Vendor Rev. Left Justified, Blank-filled */
+#define QSFP_REV_OFFS 184
+#define QSFP_REV_LEN 2
+/*
+ * Bytes 186,187 are Wavelength, if Optical. Not Intel req'd
+ *  If copper, they are attenuation in dB:
+ * Byte 186 is at 2.5Gb/sec (SDR), Byte 187 at 5.0Gb/sec (DDR)
+ */
+#define QSFP_ATTEN_OFFS 186
+#define QSFP_ATTEN_LEN 2
+/* Bytes 188,189 are Wavelength tolerance, not Intel req'd */
+/* Byte 190 is Max Case Temp. Not Intel req'd */
+/* Byte 191 is LSB of sum of bytes 128..190. Not Intel req'd */
+#define QSFP_CC_OFFS 191
+/* Bytes 192..195 are Options implemented in qsfp. Not Intel req'd */
+/* Bytes 196..211 are Serial Number, String */
+#define QSFP_SN_OFFS 196
+#define QSFP_SN_LEN 16
+/* Bytes 212..217 are date-code YYMMDD (MM==1 for Jan) */
+#define QSFP_DATE_OFFS 212
+#define QSFP_DATE_LEN 6
+/* Bytes 218,219 are optional lot-code, string */
+#define QSFP_LOT_OFFS 218
+#define QSFP_LOT_LEN 2
+/* Bytes 220, 221 indicate monitoring options, Not Intel req'd */
+/* Byte 223 is LSB of sum of bytes 192..222 */
+#define QSFP_CC_EXT_OFFS 223
+
+/*
+ * Interrupt flag masks
+ */
+#define QSFP_DATA_NOT_READY            0x01
+
+#define QSFP_HIGH_TEMP_ALARM           0x80
+#define QSFP_LOW_TEMP_ALARM            0x40
+#define QSFP_HIGH_TEMP_WARNING         0x20
+#define QSFP_LOW_TEMP_WARNING          0x10
+
+#define QSFP_HIGH_VCC_ALARM            0x80
+#define QSFP_LOW_VCC_ALARM             0x40
+#define QSFP_HIGH_VCC_WARNING          0x20
+#define QSFP_LOW_VCC_WARNING           0x10
+
+#define QSFP_HIGH_POWER_ALARM          0x88
+#define QSFP_LOW_POWER_ALARM           0x44
+#define QSFP_HIGH_POWER_WARNING                0x22
+#define QSFP_LOW_POWER_WARNING         0x11
+
+#define QSFP_HIGH_BIAS_ALARM           0x88
+#define QSFP_LOW_BIAS_ALARM            0x44
+#define QSFP_HIGH_BIAS_WARNING         0x22
+#define QSFP_LOW_BIAS_WARNING          0x11
+
+/*
+ * struct qsfp_data encapsulates state of QSFP device for one port.
+ * It will be part of the port-specific data if a board supports QSFP.
+ *
+ * Since multiple board-types use QSFP, and their pport_data structs
+ * differ (in the chip-specific section), we need a pointer to its head.
+ *
+ * Avoiding premature optimization, we will have one work_struct per port,
+ * and let the qsfp_lock arbitrate access to common resources.
+ *
+ */
+
+#define QSFP_PWR(pbyte) (((pbyte) >> 6) & 3)
+#define QSFP_ATTEN_SDR(attenarray) (attenarray[0])
+#define QSFP_ATTEN_DDR(attenarray) (attenarray[1])
+
+struct qsfp_data {
+       /* Helps to find our way */
+       struct hfi1_pportdata *ppd;
+       struct work_struct qsfp_work;
+       u8 cache[QSFP_MAX_NUM_PAGES*128];
+       spinlock_t qsfp_lock;
+       u8 check_interrupt_flags;
+       u8 qsfp_interrupt_functional;
+       u8 cache_valid;
+       u8 cache_refresh_required;
+};
+
+int refresh_qsfp_cache(struct hfi1_pportdata *ppd,
+                      struct qsfp_data *cp);
+int qsfp_mod_present(struct hfi1_pportdata *ppd);
+int get_cable_info(struct hfi1_devdata *dd, u32 port_num, u32 addr,
+                  u32 len, u8 *data);
+
+int i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr,
+             int offset, void *bp, int len);
+int i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr,
+            int offset, void *bp, int len);
+int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
+              int len);
+int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
+             int len);
diff --git a/drivers/staging/rdma/hfi1/rc.c b/drivers/staging/rdma/hfi1/rc.c
new file mode 100644 (file)
index 0000000..632dd5b
--- /dev/null
@@ -0,0 +1,2426 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/io.h>
+
+#include "hfi.h"
+#include "qp.h"
+#include "sdma.h"
+#include "trace.h"
+
+/* cut down ridiculously long IB macro names */
+#define OP(x) IB_OPCODE_RC_##x
+
+static void rc_timeout(unsigned long arg);
+
+static u32 restart_sge(struct hfi1_sge_state *ss, struct hfi1_swqe *wqe,
+                      u32 psn, u32 pmtu)
+{
+       u32 len;
+
+       len = delta_psn(psn, wqe->psn) * pmtu;
+       ss->sge = wqe->sg_list[0];
+       ss->sg_list = wqe->sg_list + 1;
+       ss->num_sge = wqe->wr.num_sge;
+       ss->total_len = wqe->length;
+       hfi1_skip_sge(ss, len, 0);
+       return wqe->length - len;
+}
+
+static void start_timer(struct hfi1_qp *qp)
+{
+       qp->s_flags |= HFI1_S_TIMER;
+       qp->s_timer.function = rc_timeout;
+       /* 4.096 usec. * (1 << qp->timeout) */
+       qp->s_timer.expires = jiffies + qp->timeout_jiffies;
+       add_timer(&qp->s_timer);
+}
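The comment above is the InfiniBand local ACK timeout formula, 4.096 usec * 2^timeout; timeout = 14, for instance, works out to roughly 67 ms. A minimal sketch of the conversion to jiffies, assuming the usual usecs_to_jiffies() helper (the real timeout_jiffies value is computed in the QP modify path, which is not in this hunk):

/* Illustrative only: IB local ACK timeout in jiffies.
 * E.g. timeout = 14 -> (4096 << 14) ns = ~67 ms. */
static unsigned long rc_timeout_to_jiffies(u8 timeout)
{
        u64 usecs = div_u64(4096ULL << timeout, 1000);  /* 4.096 us * 2^timeout */

        return usecs_to_jiffies((unsigned int)usecs);
}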
+
+/**
+ * make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
+ * @dev: the device for this QP
+ * @qp: a pointer to the QP
+ * @ohdr: a pointer to the IB header being constructed
+ * @pmtu: the path MTU
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ * Note that we are in the responder's side of the QP context.
+ * Note the QP s_lock must be held.
+ */
+static int make_rc_ack(struct hfi1_ibdev *dev, struct hfi1_qp *qp,
+                      struct hfi1_other_headers *ohdr, u32 pmtu)
+{
+       struct hfi1_ack_entry *e;
+       u32 hwords;
+       u32 len;
+       u32 bth0;
+       u32 bth2;
+       int middle = 0;
+
+       /* Don't send an ACK if we aren't supposed to. */
+       if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK))
+               goto bail;
+
+       /* header size in 32-bit words LRH+BTH = (8+12)/4. */
+       hwords = 5;
+
+       switch (qp->s_ack_state) {
+       case OP(RDMA_READ_RESPONSE_LAST):
+       case OP(RDMA_READ_RESPONSE_ONLY):
+               e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+               if (e->rdma_sge.mr) {
+                       hfi1_put_mr(e->rdma_sge.mr);
+                       e->rdma_sge.mr = NULL;
+               }
+               /* FALLTHROUGH */
+       case OP(ATOMIC_ACKNOWLEDGE):
+               /*
+                * We can increment the tail pointer now that the last
+                * response has been sent instead of only being
+                * constructed.
+                */
+               if (++qp->s_tail_ack_queue > HFI1_MAX_RDMA_ATOMIC)
+                       qp->s_tail_ack_queue = 0;
+               /* FALLTHROUGH */
+       case OP(SEND_ONLY):
+       case OP(ACKNOWLEDGE):
+               /* Check for no next entry in the queue. */
+               if (qp->r_head_ack_queue == qp->s_tail_ack_queue) {
+                       if (qp->s_flags & HFI1_S_ACK_PENDING)
+                               goto normal;
+                       goto bail;
+               }
+
+               e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+               if (e->opcode == OP(RDMA_READ_REQUEST)) {
+                       /*
+                        * If an RDMA read response is being resent and
+                        * we haven't seen the duplicate request yet,
+                        * then stop sending the remaining responses the
+                        * responder has seen until the requester re-sends it.
+                        */
+                       len = e->rdma_sge.sge_length;
+                       if (len && !e->rdma_sge.mr) {
+                               qp->s_tail_ack_queue = qp->r_head_ack_queue;
+                               goto bail;
+                       }
+                       /* Copy SGE state in case we need to resend */
+                       qp->s_rdma_mr = e->rdma_sge.mr;
+                       if (qp->s_rdma_mr)
+                               hfi1_get_mr(qp->s_rdma_mr);
+                       qp->s_ack_rdma_sge.sge = e->rdma_sge;
+                       qp->s_ack_rdma_sge.num_sge = 1;
+                       qp->s_cur_sge = &qp->s_ack_rdma_sge;
+                       if (len > pmtu) {
+                               len = pmtu;
+                               qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
+                       } else {
+                               qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
+                               e->sent = 1;
+                       }
+                       ohdr->u.aeth = hfi1_compute_aeth(qp);
+                       hwords++;
+                       qp->s_ack_rdma_psn = e->psn;
+                       bth2 = mask_psn(qp->s_ack_rdma_psn++);
+               } else {
+                       /* COMPARE_SWAP or FETCH_ADD */
+                       qp->s_cur_sge = NULL;
+                       len = 0;
+                       qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
+                       ohdr->u.at.aeth = hfi1_compute_aeth(qp);
+                       ohdr->u.at.atomic_ack_eth[0] =
+                               cpu_to_be32(e->atomic_data >> 32);
+                       ohdr->u.at.atomic_ack_eth[1] =
+                               cpu_to_be32(e->atomic_data);
+                       hwords += sizeof(ohdr->u.at) / sizeof(u32);
+                       bth2 = mask_psn(e->psn);
+                       e->sent = 1;
+               }
+               bth0 = qp->s_ack_state << 24;
+               break;
+
+       case OP(RDMA_READ_RESPONSE_FIRST):
+               qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
+               /* FALLTHROUGH */
+       case OP(RDMA_READ_RESPONSE_MIDDLE):
+               qp->s_cur_sge = &qp->s_ack_rdma_sge;
+               qp->s_rdma_mr = qp->s_ack_rdma_sge.sge.mr;
+               if (qp->s_rdma_mr)
+                       hfi1_get_mr(qp->s_rdma_mr);
+               len = qp->s_ack_rdma_sge.sge.sge_length;
+               if (len > pmtu) {
+                       len = pmtu;
+                       middle = HFI1_CAP_IS_KSET(SDMA_AHG);
+               } else {
+                       ohdr->u.aeth = hfi1_compute_aeth(qp);
+                       hwords++;
+                       qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
+                       e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+                       e->sent = 1;
+               }
+               bth0 = qp->s_ack_state << 24;
+               bth2 = mask_psn(qp->s_ack_rdma_psn++);
+               break;
+
+       default:
+normal:
+               /*
+                * Send a regular ACK.
+                * Set the s_ack_state so we wait until after sending
+                * the ACK before setting s_ack_state to ACKNOWLEDGE
+                * (see above).
+                */
+               qp->s_ack_state = OP(SEND_ONLY);
+               qp->s_flags &= ~HFI1_S_ACK_PENDING;
+               qp->s_cur_sge = NULL;
+               if (qp->s_nak_state)
+                       ohdr->u.aeth =
+                               cpu_to_be32((qp->r_msn & HFI1_MSN_MASK) |
+                                           (qp->s_nak_state <<
+                                            HFI1_AETH_CREDIT_SHIFT));
+               else
+                       ohdr->u.aeth = hfi1_compute_aeth(qp);
+               hwords++;
+               len = 0;
+               bth0 = OP(ACKNOWLEDGE) << 24;
+               bth2 = mask_psn(qp->s_ack_psn);
+       }
+       qp->s_rdma_ack_cnt++;
+       qp->s_hdrwords = hwords;
+       qp->s_cur_size = len;
+       hfi1_make_ruc_header(qp, ohdr, bth0, bth2, middle);
+       return 1;
+
+bail:
+       qp->s_ack_state = OP(ACKNOWLEDGE);
+       /*
+        * Ensure s_rdma_ack_cnt changes are committed prior to resetting
+        * HFI1_S_RESP_PENDING
+        */
+       smp_wmb();
+       qp->s_flags &= ~(HFI1_S_RESP_PENDING
+                               | HFI1_S_ACK_PENDING
+                               | HFI1_S_AHG_VALID);
+       return 0;
+}
+
+/**
+ * hfi1_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
+ * @qp: a pointer to the QP
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ */
+int hfi1_make_rc_req(struct hfi1_qp *qp)
+{
+       struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+       struct hfi1_other_headers *ohdr;
+       struct hfi1_sge_state *ss;
+       struct hfi1_swqe *wqe;
+       /* header size in 32-bit words LRH+BTH = (8+12)/4. */
+       u32 hwords = 5;
+       u32 len;
+       u32 bth0 = 0;
+       u32 bth2;
+       u32 pmtu = qp->pmtu;
+       char newreq;
+       unsigned long flags;
+       int ret = 0;
+       int middle = 0;
+       int delta;
+
+       ohdr = &qp->s_hdr->ibh.u.oth;
+       if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
+               ohdr = &qp->s_hdr->ibh.u.l.oth;
+
+       /*
+        * The lock is needed to synchronize between the sending tasklet,
+        * the receive interrupt handler, and timeout re-sends.
+        */
+       spin_lock_irqsave(&qp->s_lock, flags);
+
+       /* Sending responses has higher priority than sending requests. */
+       if ((qp->s_flags & HFI1_S_RESP_PENDING) &&
+           make_rc_ack(dev, qp, ohdr, pmtu))
+               goto done;
+
+       if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_SEND_OK)) {
+               if (!(ib_hfi1_state_ops[qp->state] & HFI1_FLUSH_SEND))
+                       goto bail;
+               /* We are in the error state, flush the work request. */
+               if (qp->s_last == qp->s_head)
+                       goto bail;
+               /* If DMAs are in progress, we can't flush immediately. */
+               if (atomic_read(&qp->s_iowait.sdma_busy)) {
+                       qp->s_flags |= HFI1_S_WAIT_DMA;
+                       goto bail;
+               }
+               clear_ahg(qp);
+               wqe = get_swqe_ptr(qp, qp->s_last);
+               hfi1_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
+                       IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
+               /* will get called again */
+               goto done;
+       }
+
+       if (qp->s_flags & (HFI1_S_WAIT_RNR | HFI1_S_WAIT_ACK))
+               goto bail;
+
+       if (cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) {
+               if (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) {
+                       qp->s_flags |= HFI1_S_WAIT_PSN;
+                       goto bail;
+               }
+               qp->s_sending_psn = qp->s_psn;
+               qp->s_sending_hpsn = qp->s_psn - 1;
+       }
+
+       /* Send a request. */
+       wqe = get_swqe_ptr(qp, qp->s_cur);
+       switch (qp->s_state) {
+       default:
+               if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_NEXT_SEND_OK))
+                       goto bail;
+               /*
+                * Resend an old request or start a new one.
+                *
+                * We keep track of the current SWQE so that
+                * we don't reset the "furthest progress" state
+                * if we need to back up.
+                */
+               newreq = 0;
+               if (qp->s_cur == qp->s_tail) {
+                       /* Check if send work queue is empty. */
+                       if (qp->s_tail == qp->s_head) {
+                               clear_ahg(qp);
+                               goto bail;
+                       }
+                       /*
+                        * If a fence is requested, wait for previous
+                        * RDMA read and atomic operations to finish.
+                        */
+                       if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
+                           qp->s_num_rd_atomic) {
+                               qp->s_flags |= HFI1_S_WAIT_FENCE;
+                               goto bail;
+                       }
+                       wqe->psn = qp->s_next_psn;
+                       newreq = 1;
+               }
+               /*
+                * Note that we have to be careful not to modify the
+                * original work request since we may need to resend
+                * it.
+                */
+               len = wqe->length;
+               ss = &qp->s_sge;
+               bth2 = mask_psn(qp->s_psn);
+               switch (wqe->wr.opcode) {
+               case IB_WR_SEND:
+               case IB_WR_SEND_WITH_IMM:
+                       /* If no credit, return. */
+                       if (!(qp->s_flags & HFI1_S_UNLIMITED_CREDIT) &&
+                           cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) {
+                               qp->s_flags |= HFI1_S_WAIT_SSN_CREDIT;
+                               goto bail;
+                       }
+                       wqe->lpsn = wqe->psn;
+                       if (len > pmtu) {
+                               wqe->lpsn += (len - 1) / pmtu;
+                               qp->s_state = OP(SEND_FIRST);
+                               len = pmtu;
+                               break;
+                       }
+                       if (wqe->wr.opcode == IB_WR_SEND)
+                               qp->s_state = OP(SEND_ONLY);
+                       else {
+                               qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
+                               /* Immediate data comes after the BTH */
+                               ohdr->u.imm_data = wqe->wr.ex.imm_data;
+                               hwords += 1;
+                       }
+                       if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+                               bth0 |= IB_BTH_SOLICITED;
+                       bth2 |= IB_BTH_REQ_ACK;
+                       if (++qp->s_cur == qp->s_size)
+                               qp->s_cur = 0;
+                       break;
+
+               case IB_WR_RDMA_WRITE:
+                       if (newreq && !(qp->s_flags & HFI1_S_UNLIMITED_CREDIT))
+                               qp->s_lsn++;
+                       /* FALLTHROUGH */
+               case IB_WR_RDMA_WRITE_WITH_IMM:
+                       /* If no credit, return. */
+                       if (!(qp->s_flags & HFI1_S_UNLIMITED_CREDIT) &&
+                           cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) {
+                               qp->s_flags |= HFI1_S_WAIT_SSN_CREDIT;
+                               goto bail;
+                       }
+                       ohdr->u.rc.reth.vaddr =
+                               cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
+                       ohdr->u.rc.reth.rkey =
+                               cpu_to_be32(wqe->wr.wr.rdma.rkey);
+                       ohdr->u.rc.reth.length = cpu_to_be32(len);
+                       hwords += sizeof(struct ib_reth) / sizeof(u32);
+                       wqe->lpsn = wqe->psn;
+                       if (len > pmtu) {
+                               wqe->lpsn += (len - 1) / pmtu;
+                               qp->s_state = OP(RDMA_WRITE_FIRST);
+                               len = pmtu;
+                               break;
+                       }
+                       if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
+                               qp->s_state = OP(RDMA_WRITE_ONLY);
+                       else {
+                               qp->s_state =
+                                       OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
+                               /* Immediate data comes after RETH */
+                               ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
+                               hwords += 1;
+                               if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+                                       bth0 |= IB_BTH_SOLICITED;
+                       }
+                       bth2 |= IB_BTH_REQ_ACK;
+                       if (++qp->s_cur == qp->s_size)
+                               qp->s_cur = 0;
+                       break;
+
+               case IB_WR_RDMA_READ:
+                       /*
+                        * Don't allow more operations to be started
+                        * than the QP limits allow.
+                        */
+                       if (newreq) {
+                               if (qp->s_num_rd_atomic >=
+                                   qp->s_max_rd_atomic) {
+                                       qp->s_flags |= HFI1_S_WAIT_RDMAR;
+                                       goto bail;
+                               }
+                               qp->s_num_rd_atomic++;
+                               if (!(qp->s_flags & HFI1_S_UNLIMITED_CREDIT))
+                                       qp->s_lsn++;
+                               /*
+                                * Adjust s_next_psn to count the
+                                * expected number of responses.
+                                */
+                               if (len > pmtu)
+                                       qp->s_next_psn += (len - 1) / pmtu;
+                               wqe->lpsn = qp->s_next_psn++;
+                       }
+                       ohdr->u.rc.reth.vaddr =
+                               cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
+                       ohdr->u.rc.reth.rkey =
+                               cpu_to_be32(wqe->wr.wr.rdma.rkey);
+                       ohdr->u.rc.reth.length = cpu_to_be32(len);
+                       qp->s_state = OP(RDMA_READ_REQUEST);
+                       hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
+                       ss = NULL;
+                       len = 0;
+                       bth2 |= IB_BTH_REQ_ACK;
+                       if (++qp->s_cur == qp->s_size)
+                               qp->s_cur = 0;
+                       break;
+
+               case IB_WR_ATOMIC_CMP_AND_SWP:
+               case IB_WR_ATOMIC_FETCH_AND_ADD:
+                       /*
+                        * Don't allow more operations to be started
+                        * than the QP limits allow.
+                        */
+                       if (newreq) {
+                               if (qp->s_num_rd_atomic >=
+                                   qp->s_max_rd_atomic) {
+                                       qp->s_flags |= HFI1_S_WAIT_RDMAR;
+                                       goto bail;
+                               }
+                               qp->s_num_rd_atomic++;
+                               if (!(qp->s_flags & HFI1_S_UNLIMITED_CREDIT))
+                                       qp->s_lsn++;
+                               wqe->lpsn = wqe->psn;
+                       }
+                       if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
+                               qp->s_state = OP(COMPARE_SWAP);
+                               ohdr->u.atomic_eth.swap_data = cpu_to_be64(
+                                       wqe->wr.wr.atomic.swap);
+                               ohdr->u.atomic_eth.compare_data = cpu_to_be64(
+                                       wqe->wr.wr.atomic.compare_add);
+                       } else {
+                               qp->s_state = OP(FETCH_ADD);
+                               ohdr->u.atomic_eth.swap_data = cpu_to_be64(
+                                       wqe->wr.wr.atomic.compare_add);
+                               ohdr->u.atomic_eth.compare_data = 0;
+                       }
+                       ohdr->u.atomic_eth.vaddr[0] = cpu_to_be32(
+                               wqe->wr.wr.atomic.remote_addr >> 32);
+                       ohdr->u.atomic_eth.vaddr[1] = cpu_to_be32(
+                               wqe->wr.wr.atomic.remote_addr);
+                       ohdr->u.atomic_eth.rkey = cpu_to_be32(
+                               wqe->wr.wr.atomic.rkey);
+                       hwords += sizeof(struct ib_atomic_eth) / sizeof(u32);
+                       ss = NULL;
+                       len = 0;
+                       bth2 |= IB_BTH_REQ_ACK;
+                       if (++qp->s_cur == qp->s_size)
+                               qp->s_cur = 0;
+                       break;
+
+               default:
+                       goto bail;
+               }
+               qp->s_sge.sge = wqe->sg_list[0];
+               qp->s_sge.sg_list = wqe->sg_list + 1;
+               qp->s_sge.num_sge = wqe->wr.num_sge;
+               qp->s_sge.total_len = wqe->length;
+               qp->s_len = wqe->length;
+               if (newreq) {
+                       qp->s_tail++;
+                       if (qp->s_tail >= qp->s_size)
+                               qp->s_tail = 0;
+               }
+               if (wqe->wr.opcode == IB_WR_RDMA_READ)
+                       qp->s_psn = wqe->lpsn + 1;
+               else {
+                       qp->s_psn++;
+                       if (cmp_psn(qp->s_psn, qp->s_next_psn) > 0)
+                               qp->s_next_psn = qp->s_psn;
+               }
+               break;
+
+       case OP(RDMA_READ_RESPONSE_FIRST):
+               /*
+                * qp->s_state is normally set to the opcode of the
+                * last packet constructed for new requests and therefore
+                * is never set to RDMA read response.
+                * RDMA_READ_RESPONSE_FIRST is used by the ACK processing
+                * thread to indicate a SEND needs to be restarted from an
+                * earlier PSN without interfering with the sending thread.
+                * See restart_rc().
+                */
+               qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
+               /* FALLTHROUGH */
+       case OP(SEND_FIRST):
+               qp->s_state = OP(SEND_MIDDLE);
+               /* FALLTHROUGH */
+       case OP(SEND_MIDDLE):
+               bth2 = mask_psn(qp->s_psn++);
+               if (cmp_psn(qp->s_psn, qp->s_next_psn) > 0)
+                       qp->s_next_psn = qp->s_psn;
+               ss = &qp->s_sge;
+               len = qp->s_len;
+               if (len > pmtu) {
+                       len = pmtu;
+                       middle = HFI1_CAP_IS_KSET(SDMA_AHG);
+                       break;
+               }
+               if (wqe->wr.opcode == IB_WR_SEND)
+                       qp->s_state = OP(SEND_LAST);
+               else {
+                       qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
+                       /* Immediate data comes after the BTH */
+                       ohdr->u.imm_data = wqe->wr.ex.imm_data;
+                       hwords += 1;
+               }
+               if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+                       bth0 |= IB_BTH_SOLICITED;
+               bth2 |= IB_BTH_REQ_ACK;
+               qp->s_cur++;
+               if (qp->s_cur >= qp->s_size)
+                       qp->s_cur = 0;
+               break;
+
+       case OP(RDMA_READ_RESPONSE_LAST):
+               /*
+                * qp->s_state is normally set to the opcode of the
+                * last packet constructed for new requests and therefore
+                * is never set to RDMA read response.
+                * RDMA_READ_RESPONSE_LAST is used by the ACK processing
+                * thread to indicate an RDMA write needs to be restarted from
+                * an earlier PSN without interfering with the sending thread.
+                * See restart_rc().
+                */
+               qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
+               /* FALLTHROUGH */
+       case OP(RDMA_WRITE_FIRST):
+               qp->s_state = OP(RDMA_WRITE_MIDDLE);
+               /* FALLTHROUGH */
+       case OP(RDMA_WRITE_MIDDLE):
+               bth2 = mask_psn(qp->s_psn++);
+               if (cmp_psn(qp->s_psn, qp->s_next_psn) > 0)
+                       qp->s_next_psn = qp->s_psn;
+               ss = &qp->s_sge;
+               len = qp->s_len;
+               if (len > pmtu) {
+                       len = pmtu;
+                       middle = HFI1_CAP_IS_KSET(SDMA_AHG);
+                       break;
+               }
+               if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
+                       qp->s_state = OP(RDMA_WRITE_LAST);
+               else {
+                       qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
+                       /* Immediate data comes after the BTH */
+                       ohdr->u.imm_data = wqe->wr.ex.imm_data;
+                       hwords += 1;
+                       if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+                               bth0 |= IB_BTH_SOLICITED;
+               }
+               bth2 |= IB_BTH_REQ_ACK;
+               qp->s_cur++;
+               if (qp->s_cur >= qp->s_size)
+                       qp->s_cur = 0;
+               break;
+
+       case OP(RDMA_READ_RESPONSE_MIDDLE):
+               /*
+                * qp->s_state is normally set to the opcode of the
+                * last packet constructed for new requests and therefore
+                * is never set to RDMA read response.
+                * RDMA_READ_RESPONSE_MIDDLE is used by the ACK processing
+                * thread to indicate an RDMA read needs to be restarted from
+                * an earlier PSN without interfering with the sending thread.
+                * See restart_rc().
+                */
+               len = (delta_psn(qp->s_psn, wqe->psn)) * pmtu;
+               ohdr->u.rc.reth.vaddr =
+                       cpu_to_be64(wqe->wr.wr.rdma.remote_addr + len);
+               ohdr->u.rc.reth.rkey =
+                       cpu_to_be32(wqe->wr.wr.rdma.rkey);
+               ohdr->u.rc.reth.length = cpu_to_be32(wqe->length - len);
+               qp->s_state = OP(RDMA_READ_REQUEST);
+               hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
+               bth2 = mask_psn(qp->s_psn) | IB_BTH_REQ_ACK;
+               qp->s_psn = wqe->lpsn + 1;
+               ss = NULL;
+               len = 0;
+               qp->s_cur++;
+               if (qp->s_cur == qp->s_size)
+                       qp->s_cur = 0;
+               break;
+       }
+       qp->s_sending_hpsn = bth2;
+       delta = delta_psn(bth2, wqe->psn);
+       if (delta && delta % HFI1_PSN_CREDIT == 0)
+               bth2 |= IB_BTH_REQ_ACK;
+       if (qp->s_flags & HFI1_S_SEND_ONE) {
+               qp->s_flags &= ~HFI1_S_SEND_ONE;
+               qp->s_flags |= HFI1_S_WAIT_ACK;
+               bth2 |= IB_BTH_REQ_ACK;
+       }
+       qp->s_len -= len;
+       qp->s_hdrwords = hwords;
+       qp->s_cur_sge = ss;
+       qp->s_cur_size = len;
+       hfi1_make_ruc_header(
+               qp,
+               ohdr,
+               bth0 | (qp->s_state << 24),
+               bth2,
+               middle);
+done:
+       ret = 1;
+       goto unlock;
+
+bail:
+       qp->s_flags &= ~HFI1_S_BUSY;
+unlock:
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+       return ret;
+}
+
+/**
+ * hfi1_send_rc_ack - Construct an ACK packet and send it
+ * @rcd: the receive context
+ * @qp: a pointer to the QP
+ * @is_fecn: non-zero if the received packet had the FECN bit set
+ *
+ * This is called from hfi1_rc_rcv() and handle_receive_interrupt().
+ * Note that RDMA reads and atomics are handled in the
+ * send side QP state and tasklet.
+ */
+void hfi1_send_rc_ack(struct hfi1_ctxtdata *rcd, struct hfi1_qp *qp,
+                     int is_fecn)
+{
+       struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       u64 pbc, pbc_flags = 0;
+       u16 lrh0;
+       u16 sc5;
+       u32 bth0;
+       u32 hwords;
+       u32 vl, plen;
+       struct send_context *sc;
+       struct pio_buf *pbuf;
+       struct hfi1_ib_header hdr;
+       struct hfi1_other_headers *ohdr;
+
+       /* Don't send ACK or NAK if a RDMA read or atomic is pending. */
+       if (qp->s_flags & HFI1_S_RESP_PENDING)
+               goto queue_ack;
+
+       /* Ensure s_rdma_ack_cnt changes are committed */
+       smp_read_barrier_depends();
+       if (qp->s_rdma_ack_cnt)
+               goto queue_ack;
+
+       /* Construct the header */
+       /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4 */
+       hwords = 6;
+       if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
+               hwords += hfi1_make_grh(ibp, &hdr.u.l.grh,
+                                      &qp->remote_ah_attr.grh, hwords, 0);
+               ohdr = &hdr.u.l.oth;
+               lrh0 = HFI1_LRH_GRH;
+       } else {
+               ohdr = &hdr.u.oth;
+               lrh0 = HFI1_LRH_BTH;
+       }
+       /* read pkey_index w/o lock (it's atomic) */
+       bth0 = hfi1_get_pkey(ibp, qp->s_pkey_index) | (OP(ACKNOWLEDGE) << 24);
+       if (qp->s_mig_state == IB_MIG_MIGRATED)
+               bth0 |= IB_BTH_MIG_REQ;
+       if (qp->r_nak_state)
+               ohdr->u.aeth = cpu_to_be32((qp->r_msn & HFI1_MSN_MASK) |
+                                           (qp->r_nak_state <<
+                                            HFI1_AETH_CREDIT_SHIFT));
+       else
+               ohdr->u.aeth = hfi1_compute_aeth(qp);
+       sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
+       /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */
+       pbc_flags |= ((!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT);
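+       /* SC[3:0] goes in LRH bits 15:12, the SL in bits 7:4 */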
+       lrh0 |= (sc5 & 0xf) << 12 | (qp->remote_ah_attr.sl & 0xf) << 4;
+       hdr.lrh[0] = cpu_to_be16(lrh0);
+       hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
+       hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
+       hdr.lrh[3] = cpu_to_be16(ppd->lid | qp->remote_ah_attr.src_path_bits);
+       ohdr->bth[0] = cpu_to_be32(bth0);
+       ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
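+       /* Echo a received FECN back to the sender as a BECN in the ACK */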
+       ohdr->bth[1] |= cpu_to_be32((!!is_fecn) << HFI1_BECN_SHIFT);
+       ohdr->bth[2] = cpu_to_be32(mask_psn(qp->r_ack_psn));
+
+       /* Don't try to send ACKs if the link isn't ACTIVE */
+       if (driver_lstate(ppd) != IB_PORT_ACTIVE)
+               return;
+
+       sc = rcd->sc;
+       plen = 2 /* PBC */ + hwords;
+       vl = sc_to_vlt(ppd->dd, sc5);
+       pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
+
+       pbuf = sc_buffer_alloc(sc, plen, NULL, NULL);
+       if (!pbuf) {
+               /*
+                * We have no room to send at the moment.  Pass
+                * responsibility for sending the ACK to the send tasklet
+                * so that when enough buffer space becomes available,
+                * the ACK is sent ahead of other outgoing packets.
+                */
+               goto queue_ack;
+       }
+
+       trace_output_ibhdr(dd_from_ibdev(qp->ibqp.device), &hdr);
+
+       /* write the pbc and data */
+       ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc, &hdr, hwords);
+
+       return;
+
+queue_ack:
+       this_cpu_inc(*ibp->rc_qacks);
+       spin_lock(&qp->s_lock);
+       qp->s_flags |= HFI1_S_ACK_PENDING | HFI1_S_RESP_PENDING;
+       qp->s_nak_state = qp->r_nak_state;
+       qp->s_ack_psn = qp->r_ack_psn;
+       if (is_fecn)
+               qp->s_flags |= HFI1_S_ECN;
+
+       /* Schedule the send tasklet. */
+       hfi1_schedule_send(qp);
+       spin_unlock(&qp->s_lock);
+}
+
+/**
+ * reset_psn - reset the QP state to send starting from PSN
+ * @qp: the QP
+ * @psn: the packet sequence number to restart at
+ *
+ * This is called from hfi1_rc_rcv() to process an incoming RC ACK
+ * for the given QP.
+ * Called at interrupt level with the QP s_lock held.
+ */
+static void reset_psn(struct hfi1_qp *qp, u32 psn)
+{
+       u32 n = qp->s_acked;
+       struct hfi1_swqe *wqe = get_swqe_ptr(qp, n);
+       u32 opcode;
+
+       qp->s_cur = n;
+
+       /*
+        * If we are starting the request from the beginning,
+        * let the normal send code handle initialization.
+        */
+       if (cmp_psn(psn, wqe->psn) <= 0) {
+               qp->s_state = OP(SEND_LAST);
+               goto done;
+       }
+
+       /* Find the work request opcode corresponding to the given PSN. */
+       opcode = wqe->wr.opcode;
+       for (;;) {
+               int diff;
+
+               if (++n == qp->s_size)
+                       n = 0;
+               if (n == qp->s_tail)
+                       break;
+               wqe = get_swqe_ptr(qp, n);
+               diff = cmp_psn(psn, wqe->psn);
+               if (diff < 0)
+                       break;
+               qp->s_cur = n;
+               /*
+                * If we are starting the request from the beginning,
+                * let the normal send code handle initialization.
+                */
+               if (diff == 0) {
+                       qp->s_state = OP(SEND_LAST);
+                       goto done;
+               }
+               opcode = wqe->wr.opcode;
+       }
+
+       /*
+        * Set the state to restart in the middle of a request.
+        * Don't change the s_sge, s_cur_sge, or s_cur_size.
+        * See hfi1_make_rc_req().
+        */
+       switch (opcode) {
+       case IB_WR_SEND:
+       case IB_WR_SEND_WITH_IMM:
+               qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
+               break;
+
+       case IB_WR_RDMA_WRITE:
+       case IB_WR_RDMA_WRITE_WITH_IMM:
+               qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
+               break;
+
+       case IB_WR_RDMA_READ:
+               qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
+               break;
+
+       default:
+               /*
+                * This case shouldn't happen since it's only
+                * one PSN per req.
+                */
+               qp->s_state = OP(SEND_LAST);
+       }
+done:
+       qp->s_psn = psn;
+       /*
+        * Set HFI1_S_WAIT_PSN as hfi1_rc_send_complete() may start the timer
+        * asynchronously before the send tasklet can get scheduled.
+        * Doing it in hfi1_make_rc_req() is too late.
+        */
+       if ((cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) &&
+           (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0))
+               qp->s_flags |= HFI1_S_WAIT_PSN;
+       qp->s_flags &= ~HFI1_S_AHG_VALID;
+}
+
+/*
+ * Back up requester to resend the last un-ACKed request.
+ * The QP r_lock and s_lock should be held and interrupts disabled.
+ */
+static void restart_rc(struct hfi1_qp *qp, u32 psn, int wait)
+{
+       struct hfi1_swqe *wqe = get_swqe_ptr(qp, qp->s_acked);
+       struct hfi1_ibport *ibp;
+
+       if (qp->s_retry == 0) {
+               if (qp->s_mig_state == IB_MIG_ARMED) {
+                       hfi1_migrate_qp(qp);
+                       qp->s_retry = qp->s_retry_cnt;
+               } else if (qp->s_last == qp->s_acked) {
+                       hfi1_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
+                       hfi1_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+                       return;
+               } else /* need to handle delayed completion */
+                       return;
+       } else
+               qp->s_retry--;
+
+       ibp = to_iport(qp->ibqp.device, qp->port_num);
+       if (wqe->wr.opcode == IB_WR_RDMA_READ)
+               ibp->n_rc_resends++;
+       else
+               ibp->n_rc_resends += delta_psn(qp->s_psn, psn);
+
+       qp->s_flags &= ~(HFI1_S_WAIT_FENCE | HFI1_S_WAIT_RDMAR |
+                        HFI1_S_WAIT_SSN_CREDIT | HFI1_S_WAIT_PSN |
+                        HFI1_S_WAIT_ACK);
+       if (wait)
+               qp->s_flags |= HFI1_S_SEND_ONE;
+       reset_psn(qp, psn);
+}
+
+/*
+ * This is called from s_timer for missing responses.
+ */
+static void rc_timeout(unsigned long arg)
+{
+       struct hfi1_qp *qp = (struct hfi1_qp *)arg;
+       struct hfi1_ibport *ibp;
+       unsigned long flags;
+
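+       /* restart_rc() requires both r_lock and s_lock to be held */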
+       spin_lock_irqsave(&qp->r_lock, flags);
+       spin_lock(&qp->s_lock);
+       if (qp->s_flags & HFI1_S_TIMER) {
+               ibp = to_iport(qp->ibqp.device, qp->port_num);
+               ibp->n_rc_timeouts++;
+               qp->s_flags &= ~HFI1_S_TIMER;
+               del_timer(&qp->s_timer);
+               restart_rc(qp, qp->s_last_psn + 1, 1);
+               hfi1_schedule_send(qp);
+       }
+       spin_unlock(&qp->s_lock);
+       spin_unlock_irqrestore(&qp->r_lock, flags);
+}
+
+/*
+ * This is called from s_timer for RNR timeouts.
+ */
+void hfi1_rc_rnr_retry(unsigned long arg)
+{
+       struct hfi1_qp *qp = (struct hfi1_qp *)arg;
+       unsigned long flags;
+
+       spin_lock_irqsave(&qp->s_lock, flags);
+       if (qp->s_flags & HFI1_S_WAIT_RNR) {
+               qp->s_flags &= ~HFI1_S_WAIT_RNR;
+               del_timer(&qp->s_timer);
+               hfi1_schedule_send(qp);
+       }
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+}
+
+/*
+ * Set qp->s_sending_psn to the next PSN after the given one.
+ * This would be psn+1 except when RDMA reads are present.
+ */
+static void reset_sending_psn(struct hfi1_qp *qp, u32 psn)
+{
+       struct hfi1_swqe *wqe;
+       u32 n = qp->s_last;
+
+       /* Find the work request corresponding to the given PSN. */
+       for (;;) {
+               wqe = get_swqe_ptr(qp, n);
+               if (cmp_psn(psn, wqe->lpsn) <= 0) {
+                       if (wqe->wr.opcode == IB_WR_RDMA_READ)
+                               qp->s_sending_psn = wqe->lpsn + 1;
+                       else
+                               qp->s_sending_psn = psn + 1;
+                       break;
+               }
+               if (++n == qp->s_size)
+                       n = 0;
+               if (n == qp->s_tail)
+                       break;
+       }
+}
+
+/*
+ * This should be called with the QP s_lock held and interrupts disabled.
+ */
+void hfi1_rc_send_complete(struct hfi1_qp *qp, struct hfi1_ib_header *hdr)
+{
+       struct hfi1_other_headers *ohdr;
+       struct hfi1_swqe *wqe;
+       struct ib_wc wc;
+       unsigned i;
+       u32 opcode;
+       u32 psn;
+
+       if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_OR_FLUSH_SEND))
+               return;
+
+       /* Find out where the BTH is */
+       if ((be16_to_cpu(hdr->lrh[0]) & 3) == HFI1_LRH_BTH)
+               ohdr = &hdr->u.oth;
+       else
+               ohdr = &hdr->u.l.oth;
+
+       opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
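+       /* A completed response packet just updates the pending response count */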
+       if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
+           opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
+               WARN_ON(!qp->s_rdma_ack_cnt);
+               qp->s_rdma_ack_cnt--;
+               return;
+       }
+
+       psn = be32_to_cpu(ohdr->bth[2]);
+       reset_sending_psn(qp, psn);
+
+       /*
+        * Start timer after a packet requesting an ACK has been sent and
+        * there are still requests that haven't been acked.
+        */
+       if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail &&
+           !(qp->s_flags &
+               (HFI1_S_TIMER | HFI1_S_WAIT_RNR | HFI1_S_WAIT_PSN)) &&
+               (ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK))
+               start_timer(qp);
+
+       while (qp->s_last != qp->s_acked) {
+               wqe = get_swqe_ptr(qp, qp->s_last);
+               if (cmp_psn(wqe->lpsn, qp->s_sending_psn) >= 0 &&
+                   cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)
+                       break;
+               for (i = 0; i < wqe->wr.num_sge; i++) {
+                       struct hfi1_sge *sge = &wqe->sg_list[i];
+
+                       hfi1_put_mr(sge->mr);
+               }
+               /* Post a send completion queue entry if requested. */
+               if (!(qp->s_flags & HFI1_S_SIGNAL_REQ_WR) ||
+                   (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
+                       memset(&wc, 0, sizeof(wc));
+                       wc.wr_id = wqe->wr.wr_id;
+                       wc.status = IB_WC_SUCCESS;
+                       wc.opcode = ib_hfi1_wc_opcode[wqe->wr.opcode];
+                       wc.byte_len = wqe->length;
+                       wc.qp = &qp->ibqp;
+                       hfi1_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
+               }
+               if (++qp->s_last >= qp->s_size)
+                       qp->s_last = 0;
+       }
+       /*
+        * If we were waiting for sends to complete before re-sending,
+        * and they are now complete, restart sending.
+        */
+       trace_hfi1_rc_sendcomplete(qp, psn);
+       if (qp->s_flags & HFI1_S_WAIT_PSN &&
+           cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
+               qp->s_flags &= ~HFI1_S_WAIT_PSN;
+               qp->s_sending_psn = qp->s_psn;
+               qp->s_sending_hpsn = qp->s_psn - 1;
+               hfi1_schedule_send(qp);
+       }
+}
+
+static inline void update_last_psn(struct hfi1_qp *qp, u32 psn)
+{
+       qp->s_last_psn = psn;
+}
+
+/*
+ * Generate a SWQE completion.
+ * This is similar to hfi1_send_complete but has to check to be sure
+ * that the SGEs are not being referenced if the SWQE is being resent.
+ */
+static struct hfi1_swqe *do_rc_completion(struct hfi1_qp *qp,
+                                         struct hfi1_swqe *wqe,
+                                         struct hfi1_ibport *ibp)
+{
+       struct ib_wc wc;
+       unsigned i;
+
+       /*
+        * Don't decrement refcount and don't generate a
+        * completion if the SWQE is being resent until the send
+        * is finished.
+        */
+       if (cmp_psn(wqe->lpsn, qp->s_sending_psn) < 0 ||
+           cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
+               for (i = 0; i < wqe->wr.num_sge; i++) {
+                       struct hfi1_sge *sge = &wqe->sg_list[i];
+
+                       hfi1_put_mr(sge->mr);
+               }
+               /* Post a send completion queue entry if requested. */
+               if (!(qp->s_flags & HFI1_S_SIGNAL_REQ_WR) ||
+                   (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
+                       memset(&wc, 0, sizeof(wc));
+                       wc.wr_id = wqe->wr.wr_id;
+                       wc.status = IB_WC_SUCCESS;
+                       wc.opcode = ib_hfi1_wc_opcode[wqe->wr.opcode];
+                       wc.byte_len = wqe->length;
+                       wc.qp = &qp->ibqp;
+                       hfi1_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
+               }
+               if (++qp->s_last >= qp->s_size)
+                       qp->s_last = 0;
+       } else {
+               struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+
+               this_cpu_inc(*ibp->rc_delayed_comp);
+               /*
+                * If send progress is not running, attempt to progress
+                * the SDMA queue.
+                */
+               if (ppd->dd->flags & HFI1_HAS_SEND_DMA) {
+                       struct sdma_engine *engine;
+                       u8 sc5;
+
+                       /* For now use sc to find engine */
+                       sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
+                       engine = qp_to_sdma_engine(qp, sc5);
+                       sdma_engine_progress_schedule(engine);
+               }
+       }
+
+       qp->s_retry = qp->s_retry_cnt;
+       update_last_psn(qp, wqe->lpsn);
+
+       /*
+        * If we are completing a request which is in the process of
+        * being resent, we can stop re-sending it since we know the
+        * responder has already seen it.
+        */
+       if (qp->s_acked == qp->s_cur) {
+               if (++qp->s_cur >= qp->s_size)
+                       qp->s_cur = 0;
+               qp->s_acked = qp->s_cur;
+               wqe = get_swqe_ptr(qp, qp->s_cur);
+               if (qp->s_acked != qp->s_tail) {
+                       qp->s_state = OP(SEND_LAST);
+                       qp->s_psn = wqe->psn;
+               }
+       } else {
+               if (++qp->s_acked >= qp->s_size)
+                       qp->s_acked = 0;
+               if (qp->state == IB_QPS_SQD && qp->s_acked == qp->s_cur)
+                       qp->s_draining = 0;
+               wqe = get_swqe_ptr(qp, qp->s_acked);
+       }
+       return wqe;
+}
+
+/**
+ * do_rc_ack - process an incoming RC ACK
+ * @qp: the QP the ACK came in on
+ * @aeth: the AETH from the ACK packet
+ * @psn: the packet sequence number of the ACK
+ * @opcode: the opcode of the request that resulted in the ACK
+ * @val: the data returned in an atomic ACK
+ * @rcd: the receive context
+ *
+ * This is called from rc_rcv_resp() to process an incoming RC ACK
+ * for the given QP.
+ * Called at interrupt level with the QP s_lock held.
+ * Returns 1 if OK, 0 if current operation should be aborted (NAK).
+ */
+static int do_rc_ack(struct hfi1_qp *qp, u32 aeth, u32 psn, int opcode,
+                    u64 val, struct hfi1_ctxtdata *rcd)
+{
+       struct hfi1_ibport *ibp;
+       enum ib_wc_status status;
+       struct hfi1_swqe *wqe;
+       int ret = 0;
+       u32 ack_psn;
+       int diff;
+
+       /* Remove QP from retry timer */
+       if (qp->s_flags & (HFI1_S_TIMER | HFI1_S_WAIT_RNR)) {
+               qp->s_flags &= ~(HFI1_S_TIMER | HFI1_S_WAIT_RNR);
+               del_timer(&qp->s_timer);
+       }
+
+       /*
+        * Note that NAKs implicitly ACK outstanding SEND and RDMA write
+        * requests and implicitly NAK RDMA read and atomic requests issued
+        * before the NAK'ed request.  The MSN won't include the NAK'ed
+        * request but will include any ACK'ed requests.
+        */
+       ack_psn = psn;
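+       /* An RNR NAK or NAK means the PSN in the packet was not ACK'ed */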
+       if (aeth >> 29)
+               ack_psn--;
+       wqe = get_swqe_ptr(qp, qp->s_acked);
+       ibp = to_iport(qp->ibqp.device, qp->port_num);
+
+       /*
+        * The MSN might be for a later WQE than the PSN indicates so
+        * only complete WQEs that the PSN finishes.
+        */
+       while ((diff = delta_psn(ack_psn, wqe->lpsn)) >= 0) {
+               /*
+                * RDMA_READ_RESPONSE_ONLY is a special case since
+                * we want to generate completion events for everything
+                * before the RDMA read, copy the data, then generate
+                * the completion for the read.
+                */
+               if (wqe->wr.opcode == IB_WR_RDMA_READ &&
+                   opcode == OP(RDMA_READ_RESPONSE_ONLY) &&
+                   diff == 0) {
+                       ret = 1;
+                       goto bail;
+               }
+               /*
+                * If this request is a RDMA read or atomic, and the ACK is
+                * for a later operation, this ACK NAKs the RDMA read or
+                * atomic.  In other words, only a RDMA_READ_LAST or ONLY
+                * can ACK a RDMA read and likewise for atomic ops.  Note
+                * that the NAK case can only happen if relaxed ordering is
+                * used and requests are sent after an RDMA read or atomic
+                * is sent but before the response is received.
+                */
+               if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
+                    (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
+                   ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+                     wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
+                    (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
+                       /* Retry this request. */
+                       if (!(qp->r_flags & HFI1_R_RDMAR_SEQ)) {
+                               qp->r_flags |= HFI1_R_RDMAR_SEQ;
+                               restart_rc(qp, qp->s_last_psn + 1, 0);
+                               if (list_empty(&qp->rspwait)) {
+                                       qp->r_flags |= HFI1_R_RSP_SEND;
+                                       atomic_inc(&qp->refcount);
+                                       list_add_tail(&qp->rspwait,
+                                                     &rcd->qp_wait_list);
+                               }
+                       }
+                       /*
+                        * No need to process the ACK/NAK since we are
+                        * restarting an earlier request.
+                        */
+                       goto bail;
+               }
+               if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+                   wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
+                       u64 *vaddr = wqe->sg_list[0].vaddr;
+                       *vaddr = val;
+               }
+               if (qp->s_num_rd_atomic &&
+                   (wqe->wr.opcode == IB_WR_RDMA_READ ||
+                    wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+                    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) {
+                       qp->s_num_rd_atomic--;
+                       /* Restart sending task if fence is complete */
+                       if ((qp->s_flags & HFI1_S_WAIT_FENCE) &&
+                           !qp->s_num_rd_atomic) {
+                               qp->s_flags &= ~(HFI1_S_WAIT_FENCE |
+                                                HFI1_S_WAIT_ACK);
+                               hfi1_schedule_send(qp);
+                       } else if (qp->s_flags & HFI1_S_WAIT_RDMAR) {
+                               qp->s_flags &= ~(HFI1_S_WAIT_RDMAR |
+                                                HFI1_S_WAIT_ACK);
+                               hfi1_schedule_send(qp);
+                       }
+               }
+               wqe = do_rc_completion(qp, wqe, ibp);
+               if (qp->s_acked == qp->s_tail)
+                       break;
+       }
+
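+       /*
+        * AETH bits 31:29 give the ACK type: 0 = ACK, 1 = RNR NAK,
+        * 2 = reserved, 3 = NAK.
+        */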
+       switch (aeth >> 29) {
+       case 0:         /* ACK */
+               this_cpu_inc(*ibp->rc_acks);
+               if (qp->s_acked != qp->s_tail) {
+                       /*
+                        * We are expecting more ACKs so
+                        * reset the re-transmit timer.
+                        */
+                       start_timer(qp);
+                       /*
+                        * We can stop re-sending the earlier packets and
+                        * continue with the next packet the receiver wants.
+                        */
+                       if (cmp_psn(qp->s_psn, psn) <= 0)
+                               reset_psn(qp, psn + 1);
+               } else if (cmp_psn(qp->s_psn, psn) <= 0) {
+                       qp->s_state = OP(SEND_LAST);
+                       qp->s_psn = psn + 1;
+               }
+               if (qp->s_flags & HFI1_S_WAIT_ACK) {
+                       qp->s_flags &= ~HFI1_S_WAIT_ACK;
+                       hfi1_schedule_send(qp);
+               }
+               hfi1_get_credit(qp, aeth);
+               qp->s_rnr_retry = qp->s_rnr_retry_cnt;
+               qp->s_retry = qp->s_retry_cnt;
+               update_last_psn(qp, psn);
+               ret = 1;
+               goto bail;
+
+       case 1:         /* RNR NAK */
+               ibp->n_rnr_naks++;
+               if (qp->s_acked == qp->s_tail)
+                       goto bail;
+               if (qp->s_flags & HFI1_S_WAIT_RNR)
+                       goto bail;
+               if (qp->s_rnr_retry == 0) {
+                       status = IB_WC_RNR_RETRY_EXC_ERR;
+                       goto class_b;
+               }
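+               /* An RNR retry count of 7 means retry forever */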
+               if (qp->s_rnr_retry_cnt < 7)
+                       qp->s_rnr_retry--;
+
+               /* The last valid PSN is the previous PSN. */
+               update_last_psn(qp, psn - 1);
+
+               ibp->n_rc_resends += delta_psn(qp->s_psn, psn);
+
+               reset_psn(qp, psn);
+
+               qp->s_flags &= ~(HFI1_S_WAIT_SSN_CREDIT | HFI1_S_WAIT_ACK);
+               qp->s_flags |= HFI1_S_WAIT_RNR;
+               qp->s_timer.function = hfi1_rc_rnr_retry;
+               qp->s_timer.expires = jiffies + usecs_to_jiffies(
+                       ib_hfi1_rnr_table[(aeth >> HFI1_AETH_CREDIT_SHIFT) &
+                                          HFI1_AETH_CREDIT_MASK]);
+               add_timer(&qp->s_timer);
+               goto bail;
+
+       case 3:         /* NAK */
+               if (qp->s_acked == qp->s_tail)
+                       goto bail;
+               /* The last valid PSN is the previous PSN. */
+               update_last_psn(qp, psn - 1);
+               switch ((aeth >> HFI1_AETH_CREDIT_SHIFT) &
+                       HFI1_AETH_CREDIT_MASK) {
+               case 0: /* PSN sequence error */
+                       ibp->n_seq_naks++;
+                       /*
+                        * Back up to the responder's expected PSN.
+                        * Note that we might get a NAK in the middle of an
+                        * RDMA READ response which terminates the RDMA
+                        * READ.
+                        */
+                       restart_rc(qp, psn, 0);
+                       hfi1_schedule_send(qp);
+                       break;
+
+               case 1: /* Invalid Request */
+                       status = IB_WC_REM_INV_REQ_ERR;
+                       ibp->n_other_naks++;
+                       goto class_b;
+
+               case 2: /* Remote Access Error */
+                       status = IB_WC_REM_ACCESS_ERR;
+                       ibp->n_other_naks++;
+                       goto class_b;
+
+               case 3: /* Remote Operation Error */
+                       status = IB_WC_REM_OP_ERR;
+                       ibp->n_other_naks++;
+class_b:
+                       if (qp->s_last == qp->s_acked) {
+                               hfi1_send_complete(qp, wqe, status);
+                               hfi1_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+                       }
+                       break;
+
+               default:
+                       /* Ignore other reserved NAK error codes */
+                       goto reserved;
+               }
+               qp->s_retry = qp->s_retry_cnt;
+               qp->s_rnr_retry = qp->s_rnr_retry_cnt;
+               goto bail;
+
+       default:                /* 2: reserved */
+reserved:
+               /* Ignore reserved NAK codes. */
+               goto bail;
+       }
+
+bail:
+       return ret;
+}
+
+/*
+ * We have seen an out of sequence RDMA read middle or last packet.
+ * This ACKs SENDs and RDMA writes up to the first RDMA read or atomic SWQE.
+ */
+static void rdma_seq_err(struct hfi1_qp *qp, struct hfi1_ibport *ibp, u32 psn,
+                        struct hfi1_ctxtdata *rcd)
+{
+       struct hfi1_swqe *wqe;
+
+       /* Remove QP from retry timer */
+       if (qp->s_flags & (HFI1_S_TIMER | HFI1_S_WAIT_RNR)) {
+               qp->s_flags &= ~(HFI1_S_TIMER | HFI1_S_WAIT_RNR);
+               del_timer(&qp->s_timer);
+       }
+
+       wqe = get_swqe_ptr(qp, qp->s_acked);
+
+       while (cmp_psn(psn, wqe->lpsn) > 0) {
+               if (wqe->wr.opcode == IB_WR_RDMA_READ ||
+                   wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+                   wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
+                       break;
+               wqe = do_rc_completion(qp, wqe, ibp);
+       }
+
+       ibp->n_rdma_seq++;
+       qp->r_flags |= HFI1_R_RDMAR_SEQ;
+       restart_rc(qp, qp->s_last_psn + 1, 0);
+       if (list_empty(&qp->rspwait)) {
+               qp->r_flags |= HFI1_R_RSP_SEND;
+               atomic_inc(&qp->refcount);
+               list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+       }
+}
+
+/**
+ * rc_rcv_resp - process an incoming RC response packet
+ * @ibp: the port this packet came in on
+ * @ohdr: the other headers for this packet
+ * @data: the packet data
+ * @tlen: the packet length
+ * @qp: the QP for this packet
+ * @opcode: the opcode for this packet
+ * @psn: the packet sequence number for this packet
+ * @hdrsize: the header length
+ * @pmtu: the path MTU
+ * @rcd: the receive context
+ *
+ * This is called from hfi1_rc_rcv() to process an incoming RC response
+ * packet for the given QP.
+ * Called at interrupt level.
+ */
+static void rc_rcv_resp(struct hfi1_ibport *ibp,
+                       struct hfi1_other_headers *ohdr,
+                       void *data, u32 tlen, struct hfi1_qp *qp,
+                       u32 opcode, u32 psn, u32 hdrsize, u32 pmtu,
+                       struct hfi1_ctxtdata *rcd)
+{
+       struct hfi1_swqe *wqe;
+       enum ib_wc_status status;
+       unsigned long flags;
+       int diff;
+       u32 pad;
+       u32 aeth;
+       u64 val;
+
+       spin_lock_irqsave(&qp->s_lock, flags);
+
+       /* Ignore invalid responses. */
+       if (cmp_psn(psn, qp->s_next_psn) >= 0)
+               goto ack_done;
+
+       /* Ignore duplicate responses. */
+       diff = cmp_psn(psn, qp->s_last_psn);
+       if (unlikely(diff <= 0)) {
+               /* Update credits for "ghost" ACKs */
+               if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {
+                       aeth = be32_to_cpu(ohdr->u.aeth);
+                       if ((aeth >> 29) == 0)
+                               hfi1_get_credit(qp, aeth);
+               }
+               goto ack_done;
+       }
+
+       /*
+        * Skip everything other than the PSN we expect, if we are waiting
+        * for a reply to a restarted RDMA read or atomic op.
+        */
+       if (qp->r_flags & HFI1_R_RDMAR_SEQ) {
+               if (cmp_psn(psn, qp->s_last_psn + 1) != 0)
+                       goto ack_done;
+               qp->r_flags &= ~HFI1_R_RDMAR_SEQ;
+       }
+
+       if (unlikely(qp->s_acked == qp->s_tail))
+               goto ack_done;
+       wqe = get_swqe_ptr(qp, qp->s_acked);
+       status = IB_WC_SUCCESS;
+
+       switch (opcode) {
+       case OP(ACKNOWLEDGE):
+       case OP(ATOMIC_ACKNOWLEDGE):
+       case OP(RDMA_READ_RESPONSE_FIRST):
+               aeth = be32_to_cpu(ohdr->u.aeth);
+               if (opcode == OP(ATOMIC_ACKNOWLEDGE)) {
+                       __be32 *p = ohdr->u.at.atomic_ack_eth;
+
+                       val = ((u64) be32_to_cpu(p[0]) << 32) |
+                               be32_to_cpu(p[1]);
+               } else
+                       val = 0;
+               if (!do_rc_ack(qp, aeth, psn, opcode, val, rcd) ||
+                   opcode != OP(RDMA_READ_RESPONSE_FIRST))
+                       goto ack_done;
+               wqe = get_swqe_ptr(qp, qp->s_acked);
+               if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
+                       goto ack_op_err;
+               /*
+                * If this is a response to a resent RDMA read, we
+                * have to be careful to copy the data to the right
+                * location.
+                */
+               qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
+                                                 wqe, psn, pmtu);
+               goto read_middle;
+
+       case OP(RDMA_READ_RESPONSE_MIDDLE):
+               /* no AETH, no ACK */
+               if (unlikely(cmp_psn(psn, qp->s_last_psn + 1)))
+                       goto ack_seq_err;
+               if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
+                       goto ack_op_err;
+read_middle:
+               if (unlikely(tlen != (hdrsize + pmtu + 4)))
+                       goto ack_len_err;
+               if (unlikely(pmtu >= qp->s_rdma_read_len))
+                       goto ack_len_err;
+
+               /*
+                * We got a response so update the timeout.
+                * 4.096 usec. * (1 << qp->timeout)
+                */
+               qp->s_flags |= HFI1_S_TIMER;
+               mod_timer(&qp->s_timer, jiffies + qp->timeout_jiffies);
+               if (qp->s_flags & HFI1_S_WAIT_ACK) {
+                       qp->s_flags &= ~HFI1_S_WAIT_ACK;
+                       hfi1_schedule_send(qp);
+               }
+
+               if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE))
+                       qp->s_retry = qp->s_retry_cnt;
+
+               /*
+                * Update the RDMA receive state but do the copy w/o
+                * holding the locks and blocking interrupts.
+                */
+               qp->s_rdma_read_len -= pmtu;
+               update_last_psn(qp, psn);
+               spin_unlock_irqrestore(&qp->s_lock, flags);
+               hfi1_copy_sge(&qp->s_rdma_read_sge, data, pmtu, 0);
+               goto bail;
+
+       case OP(RDMA_READ_RESPONSE_ONLY):
+               aeth = be32_to_cpu(ohdr->u.aeth);
+               if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd))
+                       goto ack_done;
+               /* Get the number of bytes the message was padded by. */
+               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+               /*
+                * Check that the data size is >= 0 && <= pmtu.
+                * Remember to account for ICRC (4).
+                */
+               if (unlikely(tlen < (hdrsize + pad + 4)))
+                       goto ack_len_err;
+               /*
+                * If this is a response to a resent RDMA read, we
+                * have to be careful to copy the data to the right
+                * location.
+                */
+               wqe = get_swqe_ptr(qp, qp->s_acked);
+               qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
+                                                 wqe, psn, pmtu);
+               goto read_last;
+
+       case OP(RDMA_READ_RESPONSE_LAST):
+               /* ACKs READ req. */
+               if (unlikely(cmp_psn(psn, qp->s_last_psn + 1)))
+                       goto ack_seq_err;
+               if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
+                       goto ack_op_err;
+               /* Get the number of bytes the message was padded by. */
+               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+               /*
+                * Check that the data size is >= 1 && <= pmtu.
+                * Remember to account for ICRC (4).
+                */
+               if (unlikely(tlen <= (hdrsize + pad + 4)))
+                       goto ack_len_err;
+read_last:
+               tlen -= hdrsize + pad + 4;
+               if (unlikely(tlen != qp->s_rdma_read_len))
+                       goto ack_len_err;
+               aeth = be32_to_cpu(ohdr->u.aeth);
+               hfi1_copy_sge(&qp->s_rdma_read_sge, data, tlen, 0);
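+               /* The copy should have consumed the entire read SGE */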
+               WARN_ON(qp->s_rdma_read_sge.num_sge);
+               (void) do_rc_ack(qp, aeth, psn,
+                                OP(RDMA_READ_RESPONSE_LAST), 0, rcd);
+               goto ack_done;
+       }
+
+ack_op_err:
+       status = IB_WC_LOC_QP_OP_ERR;
+       goto ack_err;
+
+ack_seq_err:
+       rdma_seq_err(qp, ibp, psn, rcd);
+       goto ack_done;
+
+ack_len_err:
+       status = IB_WC_LOC_LEN_ERR;
+ack_err:
+       if (qp->s_last == qp->s_acked) {
+               hfi1_send_complete(qp, wqe, status);
+               hfi1_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+       }
+ack_done:
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+bail:
+       return;
+}
+
+/**
+ * rc_rcv_error - process an incoming duplicate or error RC packet
+ * @ohdr: the other headers for this packet
+ * @data: the packet data
+ * @qp: the QP for this packet
+ * @opcode: the opcode for this packet
+ * @psn: the packet sequence number for this packet
+ * @diff: the difference between the PSN and the expected PSN
+ * @rcd: the receive context
+ *
+ * This is called from hfi1_rc_rcv() to process an unexpected
+ * incoming RC packet for the given QP.
+ * Called at interrupt level.
+ * Return 1 if no more processing is needed; otherwise return 0 to
+ * schedule a response to be sent.
+ */
+static noinline int rc_rcv_error(struct hfi1_other_headers *ohdr, void *data,
+                       struct hfi1_qp *qp, u32 opcode, u32 psn, int diff,
+                       struct hfi1_ctxtdata *rcd)
+{
+       struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+       struct hfi1_ack_entry *e;
+       unsigned long flags;
+       u8 i, prev;
+       int old_req;
+
+       if (diff > 0) {
+               /*
+                * Packet sequence error.
+                * A NAK will ACK earlier sends and RDMA writes.
+                * Don't queue the NAK if we already sent one.
+                */
+               if (!qp->r_nak_state) {
+                       ibp->n_rc_seqnak++;
+                       qp->r_nak_state = IB_NAK_PSN_ERROR;
+                       /* Use the expected PSN. */
+                       qp->r_ack_psn = qp->r_psn;
+                       /*
+                        * Wait to send the sequence NAK until all packets
+                        * in the receive queue have been processed.
+                        * Otherwise, we end up propagating congestion.
+                        */
+                       if (list_empty(&qp->rspwait)) {
+                               qp->r_flags |= HFI1_R_RSP_NAK;
+                               atomic_inc(&qp->refcount);
+                               list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+                       }
+               }
+               goto done;
+       }
+
+       /*
+        * Handle a duplicate request.  Don't re-execute SEND, RDMA
+        * write or atomic op.  Don't NAK errors, just silently drop
+        * the duplicate request.  Note that r_sge, r_len, and
+        * r_rcv_len may be in use so don't modify them.
+        *
+        * We are supposed to ACK the earliest duplicate PSN but we
+        * can coalesce an outstanding duplicate ACK.  We have to
+        * send the earliest so that RDMA reads can be restarted at
+        * the requester's expected PSN.
+        *
+        * First, find where this duplicate PSN falls within the
+        * ACKs previously sent.
+        * old_req is true if there is an older response that is scheduled
+        * to be sent before sending this one.
+        */
+       e = NULL;
+       old_req = 1;
+       ibp->n_rc_dupreq++;
+
+       spin_lock_irqsave(&qp->s_lock, flags);
+
+       for (i = qp->r_head_ack_queue; ; i = prev) {
+               if (i == qp->s_tail_ack_queue)
+                       old_req = 0;
+               if (i)
+                       prev = i - 1;
+               else
+                       prev = HFI1_MAX_RDMA_ATOMIC;
+               if (prev == qp->r_head_ack_queue) {
+                       e = NULL;
+                       break;
+               }
+               e = &qp->s_ack_queue[prev];
+               if (!e->opcode) {
+                       e = NULL;
+                       break;
+               }
+               if (cmp_psn(psn, e->psn) >= 0) {
+                       if (prev == qp->s_tail_ack_queue &&
+                           cmp_psn(psn, e->lpsn) <= 0)
+                               old_req = 0;
+                       break;
+               }
+       }
+       switch (opcode) {
+       case OP(RDMA_READ_REQUEST): {
+               struct ib_reth *reth;
+               u32 offset;
+               u32 len;
+
+               /*
+                * If we didn't find the RDMA read request in the ack queue,
+                * we can ignore this request.
+                */
+               if (!e || e->opcode != OP(RDMA_READ_REQUEST))
+                       goto unlock_done;
+               /* RETH comes after BTH */
+               reth = &ohdr->u.rc.reth;
+               /*
+                * Address range must be a subset of the original
+                * request and start on pmtu boundaries.
+                * We reuse the old ack_queue slot since the requester
+                * should not back up and request an earlier PSN for the
+                * same request.
+                */
+               offset = delta_psn(psn, e->psn) * qp->pmtu;
+               len = be32_to_cpu(reth->length);
+               if (unlikely(offset + len != e->rdma_sge.sge_length))
+                       goto unlock_done;
+               if (e->rdma_sge.mr) {
+                       hfi1_put_mr(e->rdma_sge.mr);
+                       e->rdma_sge.mr = NULL;
+               }
+               if (len != 0) {
+                       u32 rkey = be32_to_cpu(reth->rkey);
+                       u64 vaddr = be64_to_cpu(reth->vaddr);
+                       int ok;
+
+                       ok = hfi1_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey,
+                                         IB_ACCESS_REMOTE_READ);
+                       if (unlikely(!ok))
+                               goto unlock_done;
+               } else {
+                       e->rdma_sge.vaddr = NULL;
+                       e->rdma_sge.length = 0;
+                       e->rdma_sge.sge_length = 0;
+               }
+               e->psn = psn;
+               if (old_req)
+                       goto unlock_done;
+               qp->s_tail_ack_queue = prev;
+               break;
+       }
+
+       case OP(COMPARE_SWAP):
+       case OP(FETCH_ADD): {
+               /*
+                * If we didn't find the atomic request in the ack queue
+                * or the send tasklet is already backed up to send an
+                * earlier entry, we can ignore this request.
+                */
+               if (!e || e->opcode != (u8) opcode || old_req)
+                       goto unlock_done;
+               qp->s_tail_ack_queue = prev;
+               break;
+       }
+
+       default:
+               /*
+                * Ignore this operation if it doesn't request an ACK
+                * or an earlier RDMA read or atomic is going to be resent.
+                */
+               if (!(psn & IB_BTH_REQ_ACK) || old_req)
+                       goto unlock_done;
+               /*
+                * Resend the most recent ACK if this request is
+                * after all the previous RDMA reads and atomics.
+                */
+               if (i == qp->r_head_ack_queue) {
+                       spin_unlock_irqrestore(&qp->s_lock, flags);
+                       qp->r_nak_state = 0;
+                       qp->r_ack_psn = qp->r_psn - 1;
+                       goto send_ack;
+               }
+
+               /*
+                * Resend the RDMA read or atomic op which
+                * ACKs this duplicate request.
+                */
+               qp->s_tail_ack_queue = i;
+               break;
+       }
+       qp->s_ack_state = OP(ACKNOWLEDGE);
+       qp->s_flags |= HFI1_S_RESP_PENDING;
+       qp->r_nak_state = 0;
+       hfi1_schedule_send(qp);
+
+unlock_done:
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+done:
+       return 1;
+
+send_ack:
+       return 0;
+}
+
+void hfi1_rc_error(struct hfi1_qp *qp, enum ib_wc_status err)
+{
+       unsigned long flags;
+       int lastwqe;
+
+       spin_lock_irqsave(&qp->s_lock, flags);
+       lastwqe = hfi1_error_qp(qp, err);
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+
+       if (lastwqe) {
+               struct ib_event ev;
+
+               ev.device = qp->ibqp.device;
+               ev.element.qp = &qp->ibqp;
+               ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+               qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+       }
+}
+
+static inline void update_ack_queue(struct hfi1_qp *qp, unsigned n)
+{
+       unsigned next;
+
+       next = n + 1;
+       if (next > HFI1_MAX_RDMA_ATOMIC)
+               next = 0;
+       qp->s_tail_ack_queue = next;
+       qp->s_ack_state = OP(ACKNOWLEDGE);
+}
+
+static void log_cca_event(struct hfi1_pportdata *ppd, u8 sl, u32 rlid,
+                         u32 lqpn, u32 rqpn, u8 svc_type)
+{
+       struct opa_hfi1_cong_log_event_internal *cc_event;
+
+       if (sl >= OPA_MAX_SLS)
+               return;
+
+       spin_lock(&ppd->cc_log_lock);
+
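+       /* Record this SL in the per-SL threshold congestion event bitmap */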
+       ppd->threshold_cong_event_map[sl/8] |= 1 << (sl % 8);
+       ppd->threshold_event_counter++;
+
+       cc_event = &ppd->cc_events[ppd->cc_log_idx++];
+       if (ppd->cc_log_idx == OPA_CONG_LOG_ELEMS)
+               ppd->cc_log_idx = 0;
+       cc_event->lqpn = lqpn & HFI1_QPN_MASK;
+       cc_event->rqpn = rqpn & HFI1_QPN_MASK;
+       cc_event->sl = sl;
+       cc_event->svc_type = svc_type;
+       cc_event->rlid = rlid;
+       /* keep timestamp in units of 1.024 usec */
+       cc_event->timestamp = ktime_to_ns(ktime_get()) / 1024;
+
+       spin_unlock(&ppd->cc_log_lock);
+}
+
+void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn,
+                 u32 rqpn, u8 svc_type)
+{
+       struct cca_timer *cca_timer;
+       u16 ccti, ccti_incr, ccti_timer, ccti_limit;
+       u8 trigger_threshold;
+       struct cc_state *cc_state;
+
+       if (sl >= OPA_MAX_SLS)
+               return;
+
+       cca_timer = &ppd->cca_timer[sl];
+
+       cc_state = get_cc_state(ppd);
+
+       if (cc_state == NULL)
+               return;
+
+       /*
+        * 1) increase CCTI (for this SL)
+        * 2) select IPG (i.e., call set_link_ipg())
+        * 3) start timer
+        */
+       ccti_limit = cc_state->cct.ccti_limit;
+       ccti_incr = cc_state->cong_setting.entries[sl].ccti_increase;
+       ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer;
+       trigger_threshold =
+               cc_state->cong_setting.entries[sl].trigger_threshold;
+
+       spin_lock(&ppd->cca_timer_lock);
+
+       if (cca_timer->ccti < ccti_limit) {
+               if (cca_timer->ccti + ccti_incr <= ccti_limit)
+                       cca_timer->ccti += ccti_incr;
+               else
+                       cca_timer->ccti = ccti_limit;
+               set_link_ipg(ppd);
+       }
+
+       spin_unlock(&ppd->cca_timer_lock);
+
+       ccti = cca_timer->ccti;
+
+       if (!hrtimer_active(&cca_timer->hrtimer)) {
+               /* ccti_timer is in units of 1.024 usec */
+               unsigned long nsec = 1024 * ccti_timer;
+
+               hrtimer_start(&cca_timer->hrtimer, ns_to_ktime(nsec),
+                             HRTIMER_MODE_REL);
+       }
+
+       if ((trigger_threshold != 0) && (ccti >= trigger_threshold))
+               log_cca_event(ppd, sl, rlid, lqpn, rqpn, svc_type);
+}
+
+/**
+ * hfi1_rc_rcv - process an incoming RC packet
+ * @packet: the received packet, including its header, payload,
+ *          receive context, and the QP it is for
+ *
+ * This is called from qp_rcv() to process an incoming RC packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+void hfi1_rc_rcv(struct hfi1_packet *packet)
+{
+       struct hfi1_ctxtdata *rcd = packet->rcd;
+       struct hfi1_ib_header *hdr = packet->hdr;
+       u32 rcv_flags = packet->rcv_flags;
+       void *data = packet->ebuf;
+       u32 tlen = packet->tlen;
+       struct hfi1_qp *qp = packet->qp;
+       struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       struct hfi1_other_headers *ohdr = packet->ohdr;
+       u32 bth0, opcode;
+       u32 hdrsize = packet->hlen;
+       u32 psn;
+       u32 pad;
+       struct ib_wc wc;
+       u32 pmtu = qp->pmtu;
+       int diff;
+       struct ib_reth *reth;
+       unsigned long flags;
+       u32 bth1;
+       int ret, is_fecn = 0;
+
+       bth0 = be32_to_cpu(ohdr->bth[0]);
+       if (hfi1_ruc_check_hdr(ibp, hdr, rcv_flags & HFI1_HAS_GRH, qp, bth0))
+               return;
+
+       bth1 = be32_to_cpu(ohdr->bth[1]);
+       if (unlikely(bth1 & (HFI1_BECN_SMASK | HFI1_FECN_SMASK))) {
+               if (bth1 & HFI1_BECN_SMASK) {
+                       u16 rlid = qp->remote_ah_attr.dlid;
+                       u32 lqpn, rqpn;
+
+                       lqpn = qp->ibqp.qp_num;
+                       rqpn = qp->remote_qpn;
+                       process_becn(
+                               ppd,
+                               qp->remote_ah_attr.sl,
+                               rlid, lqpn, rqpn,
+                               IB_CC_SVCTYPE_RC);
+               }
+               is_fecn = bth1 & HFI1_FECN_SMASK;
+       }
+
+       psn = be32_to_cpu(ohdr->bth[2]);
+       opcode = bth0 >> 24;
+
+       /*
+        * Process responses (ACKs) before anything else.  Note that the
+        * packet sequence number will be for something in the send work
+        * queue rather than the expected receive packet sequence number.
+        * In other words, this QP is the requester.
+        */
+       if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
+           opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
+               rc_rcv_resp(ibp, ohdr, data, tlen, qp, opcode, psn,
+                           hdrsize, pmtu, rcd);
+               if (is_fecn)
+                       goto send_ack;
+               return;
+       }
+
+       /* Compute 24 bits worth of difference. */
+       diff = delta_psn(psn, qp->r_psn);
+       if (unlikely(diff)) {
+               if (rc_rcv_error(ohdr, data, qp, opcode, psn, diff, rcd))
+                       return;
+               goto send_ack;
+       }
+
+       /* Check for opcode sequence errors. */
+       switch (qp->r_state) {
+       case OP(SEND_FIRST):
+       case OP(SEND_MIDDLE):
+               if (opcode == OP(SEND_MIDDLE) ||
+                   opcode == OP(SEND_LAST) ||
+                   opcode == OP(SEND_LAST_WITH_IMMEDIATE))
+                       break;
+               goto nack_inv;
+
+       case OP(RDMA_WRITE_FIRST):
+       case OP(RDMA_WRITE_MIDDLE):
+               if (opcode == OP(RDMA_WRITE_MIDDLE) ||
+                   opcode == OP(RDMA_WRITE_LAST) ||
+                   opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
+                       break;
+               goto nack_inv;
+
+       default:
+               if (opcode == OP(SEND_MIDDLE) ||
+                   opcode == OP(SEND_LAST) ||
+                   opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
+                   opcode == OP(RDMA_WRITE_MIDDLE) ||
+                   opcode == OP(RDMA_WRITE_LAST) ||
+                   opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
+                       goto nack_inv;
+               /*
+                * Note that it is up to the requester to not send a new
+                * RDMA read or atomic operation before receiving an ACK
+                * for the previous operation.
+                */
+               break;
+       }
+
+       if (qp->state == IB_QPS_RTR && !(qp->r_flags & HFI1_R_COMM_EST))
+               qp_comm_est(qp);
+
+       /* OK, process the packet. */
+       switch (opcode) {
+       case OP(SEND_FIRST):
+               ret = hfi1_get_rwqe(qp, 0);
+               if (ret < 0)
+                       goto nack_op_err;
+               if (!ret)
+                       goto rnr_nak;
+               qp->r_rcv_len = 0;
+               /* FALLTHROUGH */
+       case OP(SEND_MIDDLE):
+       case OP(RDMA_WRITE_MIDDLE):
+send_middle:
+               /* Check the length against the PMTU and the posted RWQE length. */
+               if (unlikely(tlen != (hdrsize + pmtu + 4)))
+                       goto nack_inv;
+               qp->r_rcv_len += pmtu;
+               if (unlikely(qp->r_rcv_len > qp->r_len))
+                       goto nack_inv;
+               hfi1_copy_sge(&qp->r_sge, data, pmtu, 1);
+               break;
+
+       case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
+               /* consume RWQE */
+               ret = hfi1_get_rwqe(qp, 1);
+               if (ret < 0)
+                       goto nack_op_err;
+               if (!ret)
+                       goto rnr_nak;
+               goto send_last_imm;
+
+       case OP(SEND_ONLY):
+       case OP(SEND_ONLY_WITH_IMMEDIATE):
+               ret = hfi1_get_rwqe(qp, 0);
+               if (ret < 0)
+                       goto nack_op_err;
+               if (!ret)
+                       goto rnr_nak;
+               qp->r_rcv_len = 0;
+               if (opcode == OP(SEND_ONLY))
+                       goto no_immediate_data;
+               /* FALLTHROUGH for SEND_ONLY_WITH_IMMEDIATE */
+       case OP(SEND_LAST_WITH_IMMEDIATE):
+send_last_imm:
+               wc.ex.imm_data = ohdr->u.imm_data;
+               wc.wc_flags = IB_WC_WITH_IMM;
+               goto send_last;
+       case OP(SEND_LAST):
+       case OP(RDMA_WRITE_LAST):
+no_immediate_data:
+               wc.wc_flags = 0;
+               wc.ex.imm_data = 0;
+send_last:
+               /* Get the number of bytes the message was padded by. */
+               pad = (bth0 >> 20) & 3;
+               /* Check for invalid length. */
+               /* LAST len should be >= 1 */
+               if (unlikely(tlen < (hdrsize + pad + 4)))
+                       goto nack_inv;
+               /* Don't count the CRC. */
+               tlen -= (hdrsize + pad + 4);
+               wc.byte_len = tlen + qp->r_rcv_len;
+               if (unlikely(wc.byte_len > qp->r_len))
+                       goto nack_inv;
+               hfi1_copy_sge(&qp->r_sge, data, tlen, 1);
+               hfi1_put_ss(&qp->r_sge);
+               qp->r_msn++;
+               if (!test_and_clear_bit(HFI1_R_WRID_VALID, &qp->r_aflags))
+                       break;
+               wc.wr_id = qp->r_wr_id;
+               wc.status = IB_WC_SUCCESS;
+               if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) ||
+                   opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
+                       wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
+               else
+                       wc.opcode = IB_WC_RECV;
+               wc.qp = &qp->ibqp;
+               wc.src_qp = qp->remote_qpn;
+               wc.slid = qp->remote_ah_attr.dlid;
+               /*
+                * It seems that IB mandates the presence of an SL in a
+                * work completion only for the UD transport (see section
+                * 11.4.2 of IBTA Vol. 1).
+                *
+                * However, the way the SL is chosen below is consistent
+                * with the way that IB/qib works and is trying to avoid
+                * introducing incompatibilities.
+                *
+                * See also OPA Vol. 1, section 9.7.6, and table 9-17.
+                */
+               wc.sl = qp->remote_ah_attr.sl;
+               /* zero fields that are N/A */
+               wc.vendor_err = 0;
+               wc.pkey_index = 0;
+               wc.dlid_path_bits = 0;
+               wc.port_num = 0;
+               /* Signal completion event if the solicited bit is set. */
+               hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+                             (bth0 & IB_BTH_SOLICITED) != 0);
+               break;
+
+       case OP(RDMA_WRITE_FIRST):
+       case OP(RDMA_WRITE_ONLY):
+       case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
+               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+                       goto nack_inv;
+               /* consume RWQE */
+               reth = &ohdr->u.rc.reth;
+               qp->r_len = be32_to_cpu(reth->length);
+               qp->r_rcv_len = 0;
+               qp->r_sge.sg_list = NULL;
+               if (qp->r_len != 0) {
+                       u32 rkey = be32_to_cpu(reth->rkey);
+                       u64 vaddr = be64_to_cpu(reth->vaddr);
+                       int ok;
+
+                       /* Check rkey & NAK */
+                       ok = hfi1_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, vaddr,
+                                         rkey, IB_ACCESS_REMOTE_WRITE);
+                       if (unlikely(!ok))
+                               goto nack_acc;
+                       qp->r_sge.num_sge = 1;
+               } else {
+                       qp->r_sge.num_sge = 0;
+                       qp->r_sge.sge.mr = NULL;
+                       qp->r_sge.sge.vaddr = NULL;
+                       qp->r_sge.sge.length = 0;
+                       qp->r_sge.sge.sge_length = 0;
+               }
+               if (opcode == OP(RDMA_WRITE_FIRST))
+                       goto send_middle;
+               else if (opcode == OP(RDMA_WRITE_ONLY))
+                       goto no_immediate_data;
+               ret = hfi1_get_rwqe(qp, 1);
+               if (ret < 0)
+                       goto nack_op_err;
+               if (!ret)
+                       goto rnr_nak;
+               wc.ex.imm_data = ohdr->u.rc.imm_data;
+               wc.wc_flags = IB_WC_WITH_IMM;
+               goto send_last;
+
+       case OP(RDMA_READ_REQUEST): {
+               struct hfi1_ack_entry *e;
+               u32 len;
+               u8 next;
+
+               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
+                       goto nack_inv;
+               next = qp->r_head_ack_queue + 1;
+               /* s_ack_queue is size HFI1_MAX_RDMA_ATOMIC+1 so use > not >= */
+               if (next > HFI1_MAX_RDMA_ATOMIC)
+                       next = 0;
+               spin_lock_irqsave(&qp->s_lock, flags);
+               if (unlikely(next == qp->s_tail_ack_queue)) {
+                       if (!qp->s_ack_queue[next].sent)
+                               goto nack_inv_unlck;
+                       update_ack_queue(qp, next);
+               }
+               e = &qp->s_ack_queue[qp->r_head_ack_queue];
+               if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
+                       hfi1_put_mr(e->rdma_sge.mr);
+                       e->rdma_sge.mr = NULL;
+               }
+               reth = &ohdr->u.rc.reth;
+               len = be32_to_cpu(reth->length);
+               if (len) {
+                       u32 rkey = be32_to_cpu(reth->rkey);
+                       u64 vaddr = be64_to_cpu(reth->vaddr);
+                       int ok;
+
+                       /* Check rkey & NAK */
+                       ok = hfi1_rkey_ok(qp, &e->rdma_sge, len, vaddr,
+                                         rkey, IB_ACCESS_REMOTE_READ);
+                       if (unlikely(!ok))
+                               goto nack_acc_unlck;
+                       /*
+                        * Update the next expected PSN.  We add 1 later
+                        * below, so only add the remainder here.
+                        */
+                       if (len > pmtu)
+                               qp->r_psn += (len - 1) / pmtu;
+               } else {
+                       e->rdma_sge.mr = NULL;
+                       e->rdma_sge.vaddr = NULL;
+                       e->rdma_sge.length = 0;
+                       e->rdma_sge.sge_length = 0;
+               }
+               e->opcode = opcode;
+               e->sent = 0;
+               e->psn = psn;
+               e->lpsn = qp->r_psn;
+               /*
+                * We need to increment the MSN here instead of when we
+                * finish sending the result since a duplicate request would
+                * increment it more than once.
+                */
+               qp->r_msn++;
+               qp->r_psn++;
+               qp->r_state = opcode;
+               qp->r_nak_state = 0;
+               qp->r_head_ack_queue = next;
+
+               /* Schedule the send tasklet. */
+               qp->s_flags |= HFI1_S_RESP_PENDING;
+               hfi1_schedule_send(qp);
+
+               spin_unlock_irqrestore(&qp->s_lock, flags);
+               if (is_fecn)
+                       goto send_ack;
+               return;
+       }
+
+       case OP(COMPARE_SWAP):
+       case OP(FETCH_ADD): {
+               struct ib_atomic_eth *ateth;
+               struct hfi1_ack_entry *e;
+               u64 vaddr;
+               atomic64_t *maddr;
+               u64 sdata;
+               u32 rkey;
+               u8 next;
+
+               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
+                       goto nack_inv;
+               next = qp->r_head_ack_queue + 1;
+               if (next > HFI1_MAX_RDMA_ATOMIC)
+                       next = 0;
+               spin_lock_irqsave(&qp->s_lock, flags);
+               if (unlikely(next == qp->s_tail_ack_queue)) {
+                       if (!qp->s_ack_queue[next].sent)
+                               goto nack_inv_unlck;
+                       update_ack_queue(qp, next);
+               }
+               e = &qp->s_ack_queue[qp->r_head_ack_queue];
+               if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
+                       hfi1_put_mr(e->rdma_sge.mr);
+                       e->rdma_sge.mr = NULL;
+               }
+               ateth = &ohdr->u.atomic_eth;
+               vaddr = ((u64) be32_to_cpu(ateth->vaddr[0]) << 32) |
+                       be32_to_cpu(ateth->vaddr[1]);
+               if (unlikely(vaddr & (sizeof(u64) - 1)))
+                       goto nack_inv_unlck;
+               rkey = be32_to_cpu(ateth->rkey);
+               /* Check rkey & NAK */
+               if (unlikely(!hfi1_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
+                                          vaddr, rkey,
+                                          IB_ACCESS_REMOTE_ATOMIC)))
+                       goto nack_acc_unlck;
+               /* Perform atomic OP and save result. */
+               maddr = (atomic64_t *) qp->r_sge.sge.vaddr;
+               sdata = be64_to_cpu(ateth->swap_data);
+               e->atomic_data = (opcode == OP(FETCH_ADD)) ?
+                       (u64) atomic64_add_return(sdata, maddr) - sdata :
+                       (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,
+                                     be64_to_cpu(ateth->compare_data),
+                                     sdata);
+               hfi1_put_mr(qp->r_sge.sge.mr);
+               qp->r_sge.num_sge = 0;
+               e->opcode = opcode;
+               e->sent = 0;
+               e->psn = psn;
+               e->lpsn = psn;
+               qp->r_msn++;
+               qp->r_psn++;
+               qp->r_state = opcode;
+               qp->r_nak_state = 0;
+               qp->r_head_ack_queue = next;
+
+               /* Schedule the send tasklet. */
+               qp->s_flags |= HFI1_S_RESP_PENDING;
+               hfi1_schedule_send(qp);
+
+               spin_unlock_irqrestore(&qp->s_lock, flags);
+               if (is_fecn)
+                       goto send_ack;
+               return;
+       }
+
+       default:
+               /* NAK unknown opcodes. */
+               goto nack_inv;
+       }
+       qp->r_psn++;
+       qp->r_state = opcode;
+       qp->r_ack_psn = psn;
+       qp->r_nak_state = 0;
+       /* Send an ACK if requested or required. */
+       if (psn & (1 << 31))
+               goto send_ack;
+       return;
+
+rnr_nak:
+       qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer;
+       qp->r_ack_psn = qp->r_psn;
+       /* Queue RNR NAK for later */
+       if (list_empty(&qp->rspwait)) {
+               qp->r_flags |= HFI1_R_RSP_NAK;
+               atomic_inc(&qp->refcount);
+               list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+       }
+       return;
+
+nack_op_err:
+       hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+       qp->r_nak_state = IB_NAK_REMOTE_OPERATIONAL_ERROR;
+       qp->r_ack_psn = qp->r_psn;
+       /* Queue NAK for later */
+       if (list_empty(&qp->rspwait)) {
+               qp->r_flags |= HFI1_R_RSP_NAK;
+               atomic_inc(&qp->refcount);
+               list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+       }
+       return;
+
+nack_inv_unlck:
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+nack_inv:
+       hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+       qp->r_nak_state = IB_NAK_INVALID_REQUEST;
+       qp->r_ack_psn = qp->r_psn;
+       /* Queue NAK for later */
+       if (list_empty(&qp->rspwait)) {
+               qp->r_flags |= HFI1_R_RSP_NAK;
+               atomic_inc(&qp->refcount);
+               list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+       }
+       return;
+
+nack_acc_unlck:
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+nack_acc:
+       hfi1_rc_error(qp, IB_WC_LOC_PROT_ERR);
+       qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
+       qp->r_ack_psn = qp->r_psn;
+send_ack:
+       hfi1_send_rc_ack(rcd, qp, is_fecn);
+}
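The COMPARE_SWAP/FETCH_ADD case above always hands the requester back the value that was at the target address before the operation: atomic64_add_return() minus the addend for FETCH_ADD, and the old value from cmpxchg() for COMPARE_SWAP (the alignment check on vaddr guarantees the 8-byte access is legal). A minimal standalone sketch of that contract, with 'target' standing in for qp->r_sge.sge.vaddr:

    /* Sketch: responder-side atomic; both flavours return the prior value. */
    static u64 remote_atomic(u64 *target, int fetch_add, u64 sdata, u64 compare)
    {
            if (fetch_add)
                    return (u64)atomic64_add_return(sdata,
                                    (atomic64_t *)target) - sdata;
            return cmpxchg(target, compare, sdata);
    }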
+
+void hfi1_rc_hdrerr(
+       struct hfi1_ctxtdata *rcd,
+       struct hfi1_ib_header *hdr,
+       u32 rcv_flags,
+       struct hfi1_qp *qp)
+{
+       int has_grh = rcv_flags & HFI1_HAS_GRH;
+       struct hfi1_other_headers *ohdr;
+       struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+       int diff;
+       u8 opcode;
+       u32 psn;
+
+       /* Check for GRH */
+       ohdr = &hdr->u.oth;
+       if (has_grh)
+               ohdr = &hdr->u.l.oth;
+
+       opcode = be32_to_cpu(ohdr->bth[0]);
+       if (hfi1_ruc_check_hdr(ibp, hdr, has_grh, qp, opcode))
+               return;
+
+       psn = be32_to_cpu(ohdr->bth[2]);
+       opcode >>= 24;
+
+       /* Only deal with RDMA Writes for now */
+       if (opcode < IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) {
+               diff = delta_psn(psn, qp->r_psn);
+               if (!qp->r_nak_state && diff >= 0) {
+                       ibp->n_rc_seqnak++;
+                       qp->r_nak_state = IB_NAK_PSN_ERROR;
+                       /* Use the expected PSN. */
+                       qp->r_ack_psn = qp->r_psn;
+                       /*
+                        * Wait to send the sequence NAK until all packets
+                        * in the receive queue have been processed.
+                        * Otherwise, we end up propagating congestion.
+                        */
+                       if (list_empty(&qp->rspwait)) {
+                               qp->r_flags |= HFI1_R_RSP_NAK;
+                               atomic_inc(&qp->refcount);
+                               list_add_tail(&qp->rspwait,
+                                             &rcd->qp_wait_list);
+                       }
+               } /* Out of sequence NAK */
+       } /* QP Request NAKs */
+}
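hfi1_rc_hdrerr() only queues a PSN-error NAK when the offending packet's PSN is at or beyond the expected r_psn (delta_psn() >= 0) and no NAK is already pending. delta_psn() is assumed here to compute a signed distance over the 24-bit PSN space; a minimal sketch of that arithmetic:

    /* Sketch (assumption): signed distance between two 24-bit PSNs.
     * Shifting the raw difference up by 8 bits and arithmetic-shifting
     * back sign-extends the wrap-around distance, so
     * psn_delta24(0x000001, 0xffffff) == 2 and the reverse is -2.
     */
    static int psn_delta24(u32 a, u32 b)
    {
            return (int)((a - b) << 8) >> 8;
    }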
diff --git a/drivers/staging/rdma/hfi1/ruc.c b/drivers/staging/rdma/hfi1/ruc.c
new file mode 100644 (file)
index 0000000..a411528
--- /dev/null
@@ -0,0 +1,948 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/spinlock.h>
+
+#include "hfi.h"
+#include "mad.h"
+#include "qp.h"
+#include "sdma.h"
+
+/*
+ * Convert the AETH RNR timeout code into the number of microseconds.
+ */
+const u32 ib_hfi1_rnr_table[32] = {
+       655360, /* 00: 655.36 */
+       10,     /* 01:    .01 */
+       20,     /* 02:    .02 */
+       30,     /* 03:    .03 */
+       40,     /* 04:    .04 */
+       60,     /* 05:    .06 */
+       80,     /* 06:    .08 */
+       120,    /* 07:    .12 */
+       160,    /* 08:    .16 */
+       240,    /* 09:    .24 */
+       320,    /* 0A:    .32 */
+       480,    /* 0B:    .48 */
+       640,    /* 0C:    .64 */
+       960,    /* 0D:    .96 */
+       1280,   /* 0E:   1.28 */
+       1920,   /* 0F:   1.92 */
+       2560,   /* 10:   2.56 */
+       3840,   /* 11:   3.84 */
+       5120,   /* 12:   5.12 */
+       7680,   /* 13:   7.68 */
+       10240,  /* 14:  10.24 */
+       15360,  /* 15:  15.36 */
+       20480,  /* 16:  20.48 */
+       30720,  /* 17:  30.72 */
+       40960,  /* 18:  40.96 */
+       61440,  /* 19:  61.44 */
+       81920,  /* 1A:  81.92 */
+       122880, /* 1B: 122.88 */
+       163840, /* 1C: 163.84 */
+       245760, /* 1D: 245.76 */
+       327680, /* 1E: 327.68 */
+       491520  /* 1F: 491.52 */
+};
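For example, RNR timer code 0x0C maps to 640 microseconds; ruc_loopback() below feeds the looked-up value through usecs_to_jiffies() before re-arming the retry timer. A minimal sketch of that lookup (hypothetical helper name):

    /* Sketch: convert an AETH RNR timer code (0..31) into a jiffies delay. */
    static unsigned long rnr_code_to_jiffies(u8 code)
    {
            return usecs_to_jiffies(ib_hfi1_rnr_table[code & 0x1f]);
    }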
+
+/*
+ * Validate a RWQE and fill in the SGE state.
+ * Return 1 if OK.
+ */
+static int init_sge(struct hfi1_qp *qp, struct hfi1_rwqe *wqe)
+{
+       int i, j, ret;
+       struct ib_wc wc;
+       struct hfi1_lkey_table *rkt;
+       struct hfi1_pd *pd;
+       struct hfi1_sge_state *ss;
+
+       rkt = &to_idev(qp->ibqp.device)->lk_table;
+       pd = to_ipd(qp->ibqp.srq ? qp->ibqp.srq->pd : qp->ibqp.pd);
+       ss = &qp->r_sge;
+       ss->sg_list = qp->r_sg_list;
+       qp->r_len = 0;
+       for (i = j = 0; i < wqe->num_sge; i++) {
+               if (wqe->sg_list[i].length == 0)
+                       continue;
+               /* Check LKEY */
+               if (!hfi1_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge,
+                                 &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE))
+                       goto bad_lkey;
+               qp->r_len += wqe->sg_list[i].length;
+               j++;
+       }
+       ss->num_sge = j;
+       ss->total_len = qp->r_len;
+       ret = 1;
+       goto bail;
+
+bad_lkey:
+       while (j) {
+               struct hfi1_sge *sge = --j ? &ss->sg_list[j - 1] : &ss->sge;
+
+               hfi1_put_mr(sge->mr);
+       }
+       ss->num_sge = 0;
+       memset(&wc, 0, sizeof(wc));
+       wc.wr_id = wqe->wr_id;
+       wc.status = IB_WC_LOC_PROT_ERR;
+       wc.opcode = IB_WC_RECV;
+       wc.qp = &qp->ibqp;
+       /* Signal solicited completion event. */
+       hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
+       ret = 0;
+bail:
+       return ret;
+}
+
+/**
+ * hfi1_get_rwqe - copy the next RWQE into the QP's receive state
+ * @qp: the QP
+ * @wr_id_only: update qp->r_wr_id only, not qp->r_sge
+ *
+ * Return -1 if there is a local error, 0 if no RWQE is available,
+ * otherwise return 1.
+ *
+ * Can be called from interrupt level.
+ */
+int hfi1_get_rwqe(struct hfi1_qp *qp, int wr_id_only)
+{
+       unsigned long flags;
+       struct hfi1_rq *rq;
+       struct hfi1_rwq *wq;
+       struct hfi1_srq *srq;
+       struct hfi1_rwqe *wqe;
+       void (*handler)(struct ib_event *, void *);
+       u32 tail;
+       int ret;
+
+       if (qp->ibqp.srq) {
+               srq = to_isrq(qp->ibqp.srq);
+               handler = srq->ibsrq.event_handler;
+               rq = &srq->rq;
+       } else {
+               srq = NULL;
+               handler = NULL;
+               rq = &qp->r_rq;
+       }
+
+       spin_lock_irqsave(&rq->lock, flags);
+       if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK)) {
+               ret = 0;
+               goto unlock;
+       }
+
+       wq = rq->wq;
+       tail = wq->tail;
+       /* Validate tail before using it since it is user writable. */
+       if (tail >= rq->size)
+               tail = 0;
+       if (unlikely(tail == wq->head)) {
+               ret = 0;
+               goto unlock;
+       }
+       /* Make sure entry is read after head index is read. */
+       smp_rmb();
+       wqe = get_rwqe_ptr(rq, tail);
+       /*
+        * Even though we update the tail index in memory, the verbs
+        * consumer is not supposed to post more entries until a
+        * completion is generated.
+        */
+       if (++tail >= rq->size)
+               tail = 0;
+       wq->tail = tail;
+       if (!wr_id_only && !init_sge(qp, wqe)) {
+               ret = -1;
+               goto unlock;
+       }
+       qp->r_wr_id = wqe->wr_id;
+
+       ret = 1;
+       set_bit(HFI1_R_WRID_VALID, &qp->r_aflags);
+       if (handler) {
+               u32 n;
+
+               /*
+                * Validate head pointer value and compute
+                * the number of remaining WQEs.
+                */
+               n = wq->head;
+               if (n >= rq->size)
+                       n = 0;
+               if (n < tail)
+                       n += rq->size - tail;
+               else
+                       n -= tail;
+               if (n < srq->limit) {
+                       struct ib_event ev;
+
+                       srq->limit = 0;
+                       spin_unlock_irqrestore(&rq->lock, flags);
+                       ev.device = qp->ibqp.device;
+                       ev.element.srq = qp->ibqp.srq;
+                       ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
+                       handler(&ev, srq->ibsrq.srq_context);
+                       goto bail;
+               }
+       }
+unlock:
+       spin_unlock_irqrestore(&rq->lock, flags);
+bail:
+       return ret;
+}
+
+/*
+ * Switch to alternate path.
+ * The QP s_lock should be held and interrupts disabled.
+ */
+void hfi1_migrate_qp(struct hfi1_qp *qp)
+{
+       struct ib_event ev;
+
+       qp->s_mig_state = IB_MIG_MIGRATED;
+       qp->remote_ah_attr = qp->alt_ah_attr;
+       qp->port_num = qp->alt_ah_attr.port_num;
+       qp->s_pkey_index = qp->s_alt_pkey_index;
+       qp->s_flags |= HFI1_S_AHG_CLEAR;
+
+       ev.device = qp->ibqp.device;
+       ev.element.qp = &qp->ibqp;
+       ev.event = IB_EVENT_PATH_MIG;
+       qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+}
+
+static __be64 get_sguid(struct hfi1_ibport *ibp, unsigned index)
+{
+       if (!index) {
+               struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+
+               return cpu_to_be64(ppd->guid);
+       }
+       return ibp->guids[index - 1];
+}
+
+static int gid_ok(union ib_gid *gid, __be64 gid_prefix, __be64 id)
+{
+       return (gid->global.interface_id == id &&
+               (gid->global.subnet_prefix == gid_prefix ||
+                gid->global.subnet_prefix == IB_DEFAULT_GID_PREFIX));
+}
+
+/*
+ * hfi1_ruc_check_hdr - check a received header against the QP's path state
+ *
+ * This should be called with the QP r_lock held.
+ *
+ * The s_lock will be acquired around the hfi1_migrate_qp() call.
+ */
+int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct hfi1_ib_header *hdr,
+                      int has_grh, struct hfi1_qp *qp, u32 bth0)
+{
+       __be64 guid;
+       unsigned long flags;
+       u8 sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
+
+       if (qp->s_mig_state == IB_MIG_ARMED && (bth0 & IB_BTH_MIG_REQ)) {
+               if (!has_grh) {
+                       if (qp->alt_ah_attr.ah_flags & IB_AH_GRH)
+                               goto err;
+               } else {
+                       if (!(qp->alt_ah_attr.ah_flags & IB_AH_GRH))
+                               goto err;
+                       guid = get_sguid(ibp, qp->alt_ah_attr.grh.sgid_index);
+                       if (!gid_ok(&hdr->u.l.grh.dgid, ibp->gid_prefix, guid))
+                               goto err;
+                       if (!gid_ok(&hdr->u.l.grh.sgid,
+                           qp->alt_ah_attr.grh.dgid.global.subnet_prefix,
+                           qp->alt_ah_attr.grh.dgid.global.interface_id))
+                               goto err;
+               }
+               if (unlikely(rcv_pkey_check(ppd_from_ibp(ibp), (u16)bth0,
+                                           sc5, be16_to_cpu(hdr->lrh[3])))) {
+                       hfi1_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_PKEY,
+                                      (u16)bth0,
+                                      (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF,
+                                      0, qp->ibqp.qp_num,
+                                      hdr->lrh[3], hdr->lrh[1]);
+                       goto err;
+               }
+               /* Validate the SLID. See Ch. 9.6.1.5 and 17.2.8 */
+               if (be16_to_cpu(hdr->lrh[3]) != qp->alt_ah_attr.dlid ||
+                   ppd_from_ibp(ibp)->port != qp->alt_ah_attr.port_num)
+                       goto err;
+               spin_lock_irqsave(&qp->s_lock, flags);
+               hfi1_migrate_qp(qp);
+               spin_unlock_irqrestore(&qp->s_lock, flags);
+       } else {
+               if (!has_grh) {
+                       if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
+                               goto err;
+               } else {
+                       if (!(qp->remote_ah_attr.ah_flags & IB_AH_GRH))
+                               goto err;
+                       guid = get_sguid(ibp,
+                                        qp->remote_ah_attr.grh.sgid_index);
+                       if (!gid_ok(&hdr->u.l.grh.dgid, ibp->gid_prefix, guid))
+                               goto err;
+                       if (!gid_ok(&hdr->u.l.grh.sgid,
+                           qp->remote_ah_attr.grh.dgid.global.subnet_prefix,
+                           qp->remote_ah_attr.grh.dgid.global.interface_id))
+                               goto err;
+               }
+               if (unlikely(rcv_pkey_check(ppd_from_ibp(ibp), (u16)bth0,
+                                           sc5, be16_to_cpu(hdr->lrh[3])))) {
+                       hfi1_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_PKEY,
+                                      (u16)bth0,
+                                      (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF,
+                                      0, qp->ibqp.qp_num,
+                                      hdr->lrh[3], hdr->lrh[1]);
+                       goto err;
+               }
+               /* Validate the SLID. See Ch. 9.6.1.5 */
+               if (be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid ||
+                   ppd_from_ibp(ibp)->port != qp->port_num)
+                       goto err;
+               if (qp->s_mig_state == IB_MIG_REARM &&
+                   !(bth0 & IB_BTH_MIG_REQ))
+                       qp->s_mig_state = IB_MIG_ARMED;
+       }
+
+       return 0;
+
+err:
+       return 1;
+}
+
+/**
+ * ruc_loopback - handle UC and RC loopback requests
+ * @sqp: the sending QP
+ *
+ * This is called from hfi1_do_send() to forward a WQE addressed to the
+ * same HFI.
+ * Note that although we are single threaded due to the tasklet, we still
+ * have to protect against post_send().  We don't have to worry about
+ * receive interrupts since this is a connected protocol and all packets
+ * will pass through here.
+ */
+static void ruc_loopback(struct hfi1_qp *sqp)
+{
+       struct hfi1_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num);
+       struct hfi1_qp *qp;
+       struct hfi1_swqe *wqe;
+       struct hfi1_sge *sge;
+       unsigned long flags;
+       struct ib_wc wc;
+       u64 sdata;
+       atomic64_t *maddr;
+       enum ib_wc_status send_status;
+       int release;
+       int ret;
+
+       rcu_read_lock();
+
+       /*
+        * Note that we check the responder QP state after
+        * checking the requester's state.
+        */
+       qp = hfi1_lookup_qpn(ibp, sqp->remote_qpn);
+
+       spin_lock_irqsave(&sqp->s_lock, flags);
+
+       /* Return if we are already busy processing a work request. */
+       if ((sqp->s_flags & (HFI1_S_BUSY | HFI1_S_ANY_WAIT)) ||
+           !(ib_hfi1_state_ops[sqp->state] & HFI1_PROCESS_OR_FLUSH_SEND))
+               goto unlock;
+
+       sqp->s_flags |= HFI1_S_BUSY;
+
+again:
+       if (sqp->s_last == sqp->s_head)
+               goto clr_busy;
+       wqe = get_swqe_ptr(sqp, sqp->s_last);
+
+       /* Return if it is not OK to start a new work request. */
+       if (!(ib_hfi1_state_ops[sqp->state] & HFI1_PROCESS_NEXT_SEND_OK)) {
+               if (!(ib_hfi1_state_ops[sqp->state] & HFI1_FLUSH_SEND))
+                       goto clr_busy;
+               /* We are in the error state, flush the work request. */
+               send_status = IB_WC_WR_FLUSH_ERR;
+               goto flush_send;
+       }
+
+       /*
+        * We can rely on the entry not changing without the s_lock
+        * being held until we update s_last.
+        * We increment s_cur to indicate s_last is in progress.
+        */
+       if (sqp->s_last == sqp->s_cur) {
+               if (++sqp->s_cur >= sqp->s_size)
+                       sqp->s_cur = 0;
+       }
+       spin_unlock_irqrestore(&sqp->s_lock, flags);
+
+       if (!qp || !(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK) ||
+           qp->ibqp.qp_type != sqp->ibqp.qp_type) {
+               ibp->n_pkt_drops++;
+               /*
+                * For RC, the requester would timeout and retry so
+                * shortcut the timeouts and just signal too many retries.
+                */
+               if (sqp->ibqp.qp_type == IB_QPT_RC)
+                       send_status = IB_WC_RETRY_EXC_ERR;
+               else
+                       send_status = IB_WC_SUCCESS;
+               goto serr;
+       }
+
+       memset(&wc, 0, sizeof(wc));
+       send_status = IB_WC_SUCCESS;
+
+       release = 1;
+       sqp->s_sge.sge = wqe->sg_list[0];
+       sqp->s_sge.sg_list = wqe->sg_list + 1;
+       sqp->s_sge.num_sge = wqe->wr.num_sge;
+       sqp->s_len = wqe->length;
+       switch (wqe->wr.opcode) {
+       case IB_WR_SEND_WITH_IMM:
+               wc.wc_flags = IB_WC_WITH_IMM;
+               wc.ex.imm_data = wqe->wr.ex.imm_data;
+               /* FALLTHROUGH */
+       case IB_WR_SEND:
+               ret = hfi1_get_rwqe(qp, 0);
+               if (ret < 0)
+                       goto op_err;
+               if (!ret)
+                       goto rnr_nak;
+               break;
+
+       case IB_WR_RDMA_WRITE_WITH_IMM:
+               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+                       goto inv_err;
+               wc.wc_flags = IB_WC_WITH_IMM;
+               wc.ex.imm_data = wqe->wr.ex.imm_data;
+               ret = hfi1_get_rwqe(qp, 1);
+               if (ret < 0)
+                       goto op_err;
+               if (!ret)
+                       goto rnr_nak;
+               /* FALLTHROUGH */
+       case IB_WR_RDMA_WRITE:
+               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+                       goto inv_err;
+               if (wqe->length == 0)
+                       break;
+               if (unlikely(!hfi1_rkey_ok(qp, &qp->r_sge.sge, wqe->length,
+                                          wqe->wr.wr.rdma.remote_addr,
+                                          wqe->wr.wr.rdma.rkey,
+                                          IB_ACCESS_REMOTE_WRITE)))
+                       goto acc_err;
+               qp->r_sge.sg_list = NULL;
+               qp->r_sge.num_sge = 1;
+               qp->r_sge.total_len = wqe->length;
+               break;
+
+       case IB_WR_RDMA_READ:
+               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
+                       goto inv_err;
+               if (unlikely(!hfi1_rkey_ok(qp, &sqp->s_sge.sge, wqe->length,
+                                          wqe->wr.wr.rdma.remote_addr,
+                                          wqe->wr.wr.rdma.rkey,
+                                          IB_ACCESS_REMOTE_READ)))
+                       goto acc_err;
+               release = 0;
+               sqp->s_sge.sg_list = NULL;
+               sqp->s_sge.num_sge = 1;
+               qp->r_sge.sge = wqe->sg_list[0];
+               qp->r_sge.sg_list = wqe->sg_list + 1;
+               qp->r_sge.num_sge = wqe->wr.num_sge;
+               qp->r_sge.total_len = wqe->length;
+               break;
+
+       case IB_WR_ATOMIC_CMP_AND_SWP:
+       case IB_WR_ATOMIC_FETCH_AND_ADD:
+               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
+                       goto inv_err;
+               if (unlikely(!hfi1_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
+                                          wqe->wr.wr.atomic.remote_addr,
+                                          wqe->wr.wr.atomic.rkey,
+                                          IB_ACCESS_REMOTE_ATOMIC)))
+                       goto acc_err;
+               /* Perform atomic OP and save result. */
+               maddr = (atomic64_t *) qp->r_sge.sge.vaddr;
+               sdata = wqe->wr.wr.atomic.compare_add;
+               *(u64 *) sqp->s_sge.sge.vaddr =
+                       (wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ?
+                       (u64) atomic64_add_return(sdata, maddr) - sdata :
+                       (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,
+                                     sdata, wqe->wr.wr.atomic.swap);
+               hfi1_put_mr(qp->r_sge.sge.mr);
+               qp->r_sge.num_sge = 0;
+               goto send_comp;
+
+       default:
+               send_status = IB_WC_LOC_QP_OP_ERR;
+               goto serr;
+       }
+
+       sge = &sqp->s_sge.sge;
+       while (sqp->s_len) {
+               u32 len = sqp->s_len;
+
+               if (len > sge->length)
+                       len = sge->length;
+               if (len > sge->sge_length)
+                       len = sge->sge_length;
+               WARN_ON_ONCE(len == 0);
+               hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, release);
+               sge->vaddr += len;
+               sge->length -= len;
+               sge->sge_length -= len;
+               if (sge->sge_length == 0) {
+                       if (!release)
+                               hfi1_put_mr(sge->mr);
+                       if (--sqp->s_sge.num_sge)
+                               *sge = *sqp->s_sge.sg_list++;
+               } else if (sge->length == 0 && sge->mr->lkey) {
+                       if (++sge->n >= HFI1_SEGSZ) {
+                               if (++sge->m >= sge->mr->mapsz)
+                                       break;
+                               sge->n = 0;
+                       }
+                       sge->vaddr =
+                               sge->mr->map[sge->m]->segs[sge->n].vaddr;
+                       sge->length =
+                               sge->mr->map[sge->m]->segs[sge->n].length;
+               }
+               sqp->s_len -= len;
+       }
+       if (release)
+               hfi1_put_ss(&qp->r_sge);
+
+       if (!test_and_clear_bit(HFI1_R_WRID_VALID, &qp->r_aflags))
+               goto send_comp;
+
+       if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM)
+               wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
+       else
+               wc.opcode = IB_WC_RECV;
+       wc.wr_id = qp->r_wr_id;
+       wc.status = IB_WC_SUCCESS;
+       wc.byte_len = wqe->length;
+       wc.qp = &qp->ibqp;
+       wc.src_qp = qp->remote_qpn;
+       wc.slid = qp->remote_ah_attr.dlid;
+       wc.sl = qp->remote_ah_attr.sl;
+       wc.port_num = 1;
+       /* Signal completion event if the solicited bit is set. */
+       hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+                     wqe->wr.send_flags & IB_SEND_SOLICITED);
+
+send_comp:
+       spin_lock_irqsave(&sqp->s_lock, flags);
+       ibp->n_loop_pkts++;
+flush_send:
+       sqp->s_rnr_retry = sqp->s_rnr_retry_cnt;
+       hfi1_send_complete(sqp, wqe, send_status);
+       goto again;
+
+rnr_nak:
+       /* Handle RNR NAK */
+       if (qp->ibqp.qp_type == IB_QPT_UC)
+               goto send_comp;
+       ibp->n_rnr_naks++;
+       /*
+        * Note: we don't need the s_lock held since the BUSY flag
+        * makes this single threaded.
+        */
+       if (sqp->s_rnr_retry == 0) {
+               send_status = IB_WC_RNR_RETRY_EXC_ERR;
+               goto serr;
+       }
+       if (sqp->s_rnr_retry_cnt < 7)
+               sqp->s_rnr_retry--;
+       spin_lock_irqsave(&sqp->s_lock, flags);
+       if (!(ib_hfi1_state_ops[sqp->state] & HFI1_PROCESS_RECV_OK))
+               goto clr_busy;
+       sqp->s_flags |= HFI1_S_WAIT_RNR;
+       sqp->s_timer.function = hfi1_rc_rnr_retry;
+       sqp->s_timer.expires = jiffies +
+               usecs_to_jiffies(ib_hfi1_rnr_table[qp->r_min_rnr_timer]);
+       add_timer(&sqp->s_timer);
+       goto clr_busy;
+
+op_err:
+       send_status = IB_WC_REM_OP_ERR;
+       wc.status = IB_WC_LOC_QP_OP_ERR;
+       goto err;
+
+inv_err:
+       send_status = IB_WC_REM_INV_REQ_ERR;
+       wc.status = IB_WC_LOC_QP_OP_ERR;
+       goto err;
+
+acc_err:
+       send_status = IB_WC_REM_ACCESS_ERR;
+       wc.status = IB_WC_LOC_PROT_ERR;
+err:
+       /* responder goes to error state */
+       hfi1_rc_error(qp, wc.status);
+
+serr:
+       spin_lock_irqsave(&sqp->s_lock, flags);
+       hfi1_send_complete(sqp, wqe, send_status);
+       if (sqp->ibqp.qp_type == IB_QPT_RC) {
+               int lastwqe = hfi1_error_qp(sqp, IB_WC_WR_FLUSH_ERR);
+
+               sqp->s_flags &= ~HFI1_S_BUSY;
+               spin_unlock_irqrestore(&sqp->s_lock, flags);
+               if (lastwqe) {
+                       struct ib_event ev;
+
+                       ev.device = sqp->ibqp.device;
+                       ev.element.qp = &sqp->ibqp;
+                       ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+                       sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context);
+               }
+               goto done;
+       }
+clr_busy:
+       sqp->s_flags &= ~HFI1_S_BUSY;
+unlock:
+       spin_unlock_irqrestore(&sqp->s_lock, flags);
+done:
+       rcu_read_unlock();
+}
+
+/**
+ * hfi1_make_grh - construct a GRH header
+ * @ibp: a pointer to the IB port
+ * @hdr: a pointer to the GRH header being constructed
+ * @grh: the global route address to send to
+ * @hwords: the number of 32 bit words of header being sent
+ * @nwords: the number of 32 bit words of data being sent
+ *
+ * Return the size of the header in 32 bit words.
+ */
+u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr,
+                 struct ib_global_route *grh, u32 hwords, u32 nwords)
+{
+       hdr->version_tclass_flow =
+               cpu_to_be32((IB_GRH_VERSION << IB_GRH_VERSION_SHIFT) |
+                           (grh->traffic_class << IB_GRH_TCLASS_SHIFT) |
+                           (grh->flow_label << IB_GRH_FLOW_SHIFT));
+       hdr->paylen = cpu_to_be16((hwords - 2 + nwords + SIZE_OF_CRC) << 2);
+       /* next_hdr is defined by C8-7 in ch. 8.4.1 */
+       hdr->next_hdr = IB_GRH_NEXT_HDR;
+       hdr->hop_limit = grh->hop_limit;
+       /* The SGID is 32-bit aligned. */
+       hdr->sgid.global.subnet_prefix = ibp->gid_prefix;
+       hdr->sgid.global.interface_id =
+               grh->sgid_index && grh->sgid_index < ARRAY_SIZE(ibp->guids) ?
+               ibp->guids[grh->sgid_index - 1] :
+                       cpu_to_be64(ppd_from_ibp(ibp)->guid);
+       hdr->dgid = grh->dgid;
+
+       /* GRH header size in 32-bit words. */
+       return sizeof(struct ib_grh) / sizeof(u32);
+}
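The paylen arithmetic works in 32-bit words until the final << 2 converts to bytes; the subtracted 2 drops the two LRH words, since hwords does not yet include the GRH when this is called from hfi1_make_ruc_header(). A small sketch of the same computation, assuming SIZE_OF_CRC counts the 4-byte ICRC as one word:

    /* Sketch: GRH PayLen in bytes (hwords excludes the GRH itself). */
    static u16 grh_paylen_bytes(u32 hwords, u32 nwords)
    {
            return (hwords - 2 + nwords + 1 /* SIZE_OF_CRC */) << 2;
    }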
+
+/*
+ * clear_ahg - clear the AHG state from the QP
+ */
+void clear_ahg(struct hfi1_qp *qp)
+{
+       qp->s_hdr->ahgcount = 0;
+       qp->s_flags &= ~(HFI1_S_AHG_VALID | HFI1_S_AHG_CLEAR);
+       if (qp->s_sde)
+               sdma_ahg_free(qp->s_sde, qp->s_ahgidx);
+       qp->s_ahgidx = -1;
+       qp->s_sde = NULL;
+}
+
+#define BTH2_OFFSET (offsetof(struct hfi1_pio_header, hdr.u.oth.bth[2]) / 4)
+
+/**
+ * build_ahg - create ahg in s_hdr
+ * @qp: a pointer to QP
+ * @npsn: the next PSN for the request/response
+ *
+ * This routine handles the AHG by allocating an AHG entry and causing the
+ * header of the first middle packet to be copied.
+ *
+ * Subsequent middles use the copied entry, editing the
+ * PSN with 1 or 2 edits.
+ */
+static inline void build_ahg(struct hfi1_qp *qp, u32 npsn)
+{
+       if (unlikely(qp->s_flags & HFI1_S_AHG_CLEAR))
+               clear_ahg(qp);
+       if (!(qp->s_flags & HFI1_S_AHG_VALID)) {
+               /* first middle that needs copy  */
+               if (qp->s_ahgidx < 0) {
+                       if (!qp->s_sde)
+                               qp->s_sde = qp_to_sdma_engine(qp, qp->s_sc);
+                       qp->s_ahgidx = sdma_ahg_alloc(qp->s_sde);
+               }
+               if (qp->s_ahgidx >= 0) {
+                       qp->s_ahgpsn = npsn;
+                       qp->s_hdr->tx_flags |= SDMA_TXREQ_F_AHG_COPY;
+                       /* save to protect a change in another thread */
+                       qp->s_hdr->sde = qp->s_sde;
+                       qp->s_hdr->ahgidx = qp->s_ahgidx;
+                       qp->s_flags |= HFI1_S_AHG_VALID;
+               }
+       } else {
+               /* subsequent middle after valid */
+               if (qp->s_ahgidx >= 0) {
+                       qp->s_hdr->tx_flags |= SDMA_TXREQ_F_USE_AHG;
+                       qp->s_hdr->ahgidx = qp->s_ahgidx;
+                       qp->s_hdr->ahgcount++;
+                       qp->s_hdr->ahgdesc[0] =
+                               sdma_build_ahg_descriptor(
+                                       (__force u16)cpu_to_be16((u16)npsn),
+                                       BTH2_OFFSET,
+                                       16,
+                                       16);
+                       if ((npsn & 0xffff0000) !=
+                                       (qp->s_ahgpsn & 0xffff0000)) {
+                               qp->s_hdr->ahgcount++;
+                               qp->s_hdr->ahgdesc[1] =
+                                       sdma_build_ahg_descriptor(
+                                               (__force u16)cpu_to_be16(
+                                                       (u16)(npsn >> 16)),
+                                               BTH2_OFFSET,
+                                               0,
+                                               16);
+                       }
+               }
+       }
+}
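build_ahg() only ever patches the BTH2 (PSN) dword of the previously copied header: the low 16 PSN bits change on every middle packet and always get one descriptor, and a second descriptor is added only when the upper 16 bits differ from the PSN captured at copy time (s_ahgpsn). The decision reduces to:

    /* Sketch: number of 16-bit AHG edits a new PSN needs. */
    static int ahg_edits_needed(u32 npsn, u32 ahgpsn)
    {
            return ((npsn ^ ahgpsn) & 0xffff0000) ? 2 : 1;
    }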
+
+void hfi1_make_ruc_header(struct hfi1_qp *qp, struct hfi1_other_headers *ohdr,
+                         u32 bth0, u32 bth2, int middle)
+{
+       struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+       u16 lrh0;
+       u32 nwords;
+       u32 extra_bytes;
+       u8 sc5;
+       u32 bth1;
+
+       /* Construct the header. */
+       extra_bytes = -qp->s_cur_size & 3;
+       nwords = (qp->s_cur_size + extra_bytes) >> 2;
+       lrh0 = HFI1_LRH_BTH;
+       if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
+               qp->s_hdrwords += hfi1_make_grh(ibp, &qp->s_hdr->ibh.u.l.grh,
+                                              &qp->remote_ah_attr.grh,
+                                              qp->s_hdrwords, nwords);
+               lrh0 = HFI1_LRH_GRH;
+               middle = 0;
+       }
+       sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
+       lrh0 |= (sc5 & 0xf) << 12 | (qp->remote_ah_attr.sl & 0xf) << 4;
+       qp->s_sc = sc5;
+       /*
+        * reset s_hdr/AHG fields
+        *
+        * This ensures that the ahgentry/ahgcount
+        * are at a non-AHG default to protect
+        * build_verbs_tx_desc() from using
+        * a stale ahgidx.
+        *
+        * build_ahg() will modify as appropriate
+        * to use the AHG feature.
+        */
+       qp->s_hdr->tx_flags = 0;
+       qp->s_hdr->ahgcount = 0;
+       qp->s_hdr->ahgidx = 0;
+       qp->s_hdr->sde = NULL;
+       if (qp->s_mig_state == IB_MIG_MIGRATED)
+               bth0 |= IB_BTH_MIG_REQ;
+       else
+               middle = 0;
+       if (middle)
+               build_ahg(qp, bth2);
+       else
+               qp->s_flags &= ~HFI1_S_AHG_VALID;
+       qp->s_hdr->ibh.lrh[0] = cpu_to_be16(lrh0);
+       qp->s_hdr->ibh.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
+       qp->s_hdr->ibh.lrh[2] =
+               cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC);
+       qp->s_hdr->ibh.lrh[3] = cpu_to_be16(ppd_from_ibp(ibp)->lid |
+                                      qp->remote_ah_attr.src_path_bits);
+       bth0 |= hfi1_get_pkey(ibp, qp->s_pkey_index);
+       bth0 |= extra_bytes << 20;
+       ohdr->bth[0] = cpu_to_be32(bth0);
+       bth1 = qp->remote_qpn;
+       if (qp->s_flags & HFI1_S_ECN) {
+               qp->s_flags &= ~HFI1_S_ECN;
+               /* we recently received a FECN, so return a BECN */
+               bth1 |= (HFI1_BECN_MASK << HFI1_BECN_SHIFT);
+       }
+       ohdr->bth[1] = cpu_to_be32(bth1);
+       ohdr->bth[2] = cpu_to_be32(bth2);
+}
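The extra_bytes computation is the usual pad-to-dword trick: -size & 3 gives the 0..3 pad bytes needed so the payload ends on a 32-bit boundary, and the same count is placed in the BTH pad-count bits by the << 20. A tiny worked sketch: a 13-byte payload needs 3 pad bytes and occupies 4 words:

    /* Sketch: padded payload size in 32-bit words; pad_to_dwords(13) == 4. */
    static u32 pad_to_dwords(u32 payload_bytes)
    {
            u32 extra = -payload_bytes & 3;

            return (payload_bytes + extra) >> 2;
    }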
+
+/**
+ * hfi1_do_send - perform a send on a QP
+ * @work: contains a pointer to the QP
+ *
+ * Process entries in the send work queue until credit or queue is
+ * exhausted.  Only allow one CPU to send a packet per QP (tasklet).
+ * Otherwise, two threads could send packets out of order.
+ */
+void hfi1_do_send(struct work_struct *work)
+{
+       struct iowait *wait = container_of(work, struct iowait, iowork);
+       struct hfi1_qp *qp = container_of(wait, struct hfi1_qp, s_iowait);
+       struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       int (*make_req)(struct hfi1_qp *qp);
+       unsigned long flags;
+
+       if ((qp->ibqp.qp_type == IB_QPT_RC ||
+            qp->ibqp.qp_type == IB_QPT_UC) &&
+           !loopback &&
+           (qp->remote_ah_attr.dlid & ~((1 << ppd->lmc) - 1)) == ppd->lid) {
+               ruc_loopback(qp);
+               return;
+       }
+
+       if (qp->ibqp.qp_type == IB_QPT_RC)
+               make_req = hfi1_make_rc_req;
+       else if (qp->ibqp.qp_type == IB_QPT_UC)
+               make_req = hfi1_make_uc_req;
+       else
+               make_req = hfi1_make_ud_req;
+
+       spin_lock_irqsave(&qp->s_lock, flags);
+
+       /* Return if we are already busy processing a work request. */
+       if (!hfi1_send_ok(qp)) {
+               spin_unlock_irqrestore(&qp->s_lock, flags);
+               return;
+       }
+
+       qp->s_flags |= HFI1_S_BUSY;
+
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+
+       do {
+               /* Check for a constructed packet to be sent. */
+               if (qp->s_hdrwords != 0) {
+                       /*
+                        * If the packet cannot be sent now, return and
+                        * the send tasklet will be woken up later.
+                        */
+                       if (hfi1_verbs_send(qp, qp->s_hdr, qp->s_hdrwords,
+                                           qp->s_cur_sge, qp->s_cur_size))
+                               break;
+                       /* Record that s_hdr is empty. */
+                       qp->s_hdrwords = 0;
+               }
+       } while (make_req(qp));
+}
+
+/*
+ * This should be called with s_lock held.
+ */
+void hfi1_send_complete(struct hfi1_qp *qp, struct hfi1_swqe *wqe,
+                       enum ib_wc_status status)
+{
+       u32 old_last, last;
+       unsigned i;
+
+       if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_OR_FLUSH_SEND))
+               return;
+
+       for (i = 0; i < wqe->wr.num_sge; i++) {
+               struct hfi1_sge *sge = &wqe->sg_list[i];
+
+               hfi1_put_mr(sge->mr);
+       }
+       if (qp->ibqp.qp_type == IB_QPT_UD ||
+           qp->ibqp.qp_type == IB_QPT_SMI ||
+           qp->ibqp.qp_type == IB_QPT_GSI)
+               atomic_dec(&to_iah(wqe->wr.wr.ud.ah)->refcount);
+
+       /* See ch. 11.2.4.1 and 10.7.3.1 */
+       if (!(qp->s_flags & HFI1_S_SIGNAL_REQ_WR) ||
+           (wqe->wr.send_flags & IB_SEND_SIGNALED) ||
+           status != IB_WC_SUCCESS) {
+               struct ib_wc wc;
+
+               memset(&wc, 0, sizeof(wc));
+               wc.wr_id = wqe->wr.wr_id;
+               wc.status = status;
+               wc.opcode = ib_hfi1_wc_opcode[wqe->wr.opcode];
+               wc.qp = &qp->ibqp;
+               if (status == IB_WC_SUCCESS)
+                       wc.byte_len = wqe->length;
+               hfi1_cq_enter(to_icq(qp->ibqp.send_cq), &wc,
+                             status != IB_WC_SUCCESS);
+       }
+
+       last = qp->s_last;
+       old_last = last;
+       if (++last >= qp->s_size)
+               last = 0;
+       qp->s_last = last;
+       if (qp->s_acked == old_last)
+               qp->s_acked = last;
+       if (qp->s_cur == old_last)
+               qp->s_cur = last;
+       if (qp->s_tail == old_last)
+               qp->s_tail = last;
+       if (qp->state == IB_QPS_SQD && last == qp->s_cur)
+               qp->s_draining = 0;
+}
diff --git a/drivers/staging/rdma/hfi1/sdma.c b/drivers/staging/rdma/hfi1/sdma.c
new file mode 100644 (file)
index 0000000..a8c903c
--- /dev/null
@@ -0,0 +1,2962 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/spinlock.h>
+#include <linux/seqlock.h>
+#include <linux/netdevice.h>
+#include <linux/moduleparam.h>
+#include <linux/bitops.h>
+#include <linux/timer.h>
+#include <linux/vmalloc.h>
+
+#include "hfi.h"
+#include "common.h"
+#include "qp.h"
+#include "sdma.h"
+#include "iowait.h"
+#include "trace.h"
+
+/* must be a power of 2, >= 64 and <= 32768 */
+#define SDMA_DESCQ_CNT 1024
+#define INVALID_TAIL 0xffff
+
+static uint sdma_descq_cnt = SDMA_DESCQ_CNT;
+module_param(sdma_descq_cnt, uint, S_IRUGO);
+MODULE_PARM_DESC(sdma_descq_cnt, "Number of SDMA descq entries");
+
+static uint sdma_idle_cnt = 250;
+module_param(sdma_idle_cnt, uint, S_IRUGO);
+MODULE_PARM_DESC(sdma_idle_cnt, "sdma interrupt idle delay (ns, default 250)");
+
+uint mod_num_sdma;
+module_param_named(num_sdma, mod_num_sdma, uint, S_IRUGO);
+MODULE_PARM_DESC(num_sdma, "Set max number SDMA engines to use");
+
+#define SDMA_WAIT_BATCH_SIZE 20
+/* max wait time for an SDMA engine to indicate it has halted */
+#define SDMA_ERR_HALT_TIMEOUT 10 /* ms */
+/* all SDMA engine errors that cause a halt */
+
+#define SD(name) SEND_DMA_##name
+#define ALL_SDMA_ENG_HALT_ERRS \
+       (SD(ENG_ERR_STATUS_SDMA_WRONG_DW_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_GEN_MISMATCH_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_TOO_LONG_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_TAIL_OUT_OF_BOUNDS_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_FIRST_DESC_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_MEM_READ_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_HALT_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_LENGTH_MISMATCH_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_PACKET_DESC_OVERFLOW_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_HEADER_SELECT_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_HEADER_ADDRESS_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_HEADER_LENGTH_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_TIMEOUT_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_DESC_TABLE_UNC_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_ASSEMBLY_UNC_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_PACKET_TRACKING_UNC_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_HEADER_STORAGE_UNC_ERR_SMASK) \
+       | SD(ENG_ERR_STATUS_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SMASK))
+
+/* sdma_sendctrl operations */
+#define SDMA_SENDCTRL_OP_ENABLE    (1U << 0)
+#define SDMA_SENDCTRL_OP_INTENABLE (1U << 1)
+#define SDMA_SENDCTRL_OP_HALT      (1U << 2)
+#define SDMA_SENDCTRL_OP_CLEANUP   (1U << 3)
+
+/* handle long defines */
+#define SDMA_EGRESS_PACKET_OCCUPANCY_SMASK \
+SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SMASK
+#define SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT \
+SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT
+
+static const char * const sdma_state_names[] = {
+       [sdma_state_s00_hw_down]                = "s00_HwDown",
+       [sdma_state_s10_hw_start_up_halt_wait]  = "s10_HwStartUpHaltWait",
+       [sdma_state_s15_hw_start_up_clean_wait] = "s15_HwStartUpCleanWait",
+       [sdma_state_s20_idle]                   = "s20_Idle",
+       [sdma_state_s30_sw_clean_up_wait]       = "s30_SwCleanUpWait",
+       [sdma_state_s40_hw_clean_up_wait]       = "s40_HwCleanUpWait",
+       [sdma_state_s50_hw_halt_wait]           = "s50_HwHaltWait",
+       [sdma_state_s60_idle_halt_wait]         = "s60_IdleHaltWait",
+       [sdma_state_s80_hw_freeze]              = "s80_HwFreeze",
+       [sdma_state_s82_freeze_sw_clean]        = "s82_FreezeSwClean",
+       [sdma_state_s99_running]                = "s99_Running",
+};
+
+static const char * const sdma_event_names[] = {
+       [sdma_event_e00_go_hw_down]   = "e00_GoHwDown",
+       [sdma_event_e10_go_hw_start]  = "e10_GoHwStart",
+       [sdma_event_e15_hw_halt_done] = "e15_HwHaltDone",
+       [sdma_event_e25_hw_clean_up_done] = "e25_HwCleanUpDone",
+       [sdma_event_e30_go_running]   = "e30_GoRunning",
+       [sdma_event_e40_sw_cleaned]   = "e40_SwCleaned",
+       [sdma_event_e50_hw_cleaned]   = "e50_HwCleaned",
+       [sdma_event_e60_hw_halted]    = "e60_HwHalted",
+       [sdma_event_e70_go_idle]      = "e70_GoIdle",
+       [sdma_event_e80_hw_freeze]    = "e80_HwFreeze",
+       [sdma_event_e81_hw_frozen]    = "e81_HwFrozen",
+       [sdma_event_e82_hw_unfreeze]  = "e82_HwUnfreeze",
+       [sdma_event_e85_link_down]    = "e85_LinkDown",
+       [sdma_event_e90_sw_halted]    = "e90_SwHalted",
+};
+
+static const struct sdma_set_state_action sdma_action_table[] = {
+       [sdma_state_s00_hw_down] = {
+               .go_s99_running_tofalse = 1,
+               .op_enable = 0,
+               .op_intenable = 0,
+               .op_halt = 0,
+               .op_cleanup = 0,
+       },
+       [sdma_state_s10_hw_start_up_halt_wait] = {
+               .op_enable = 0,
+               .op_intenable = 0,
+               .op_halt = 1,
+               .op_cleanup = 0,
+       },
+       [sdma_state_s15_hw_start_up_clean_wait] = {
+               .op_enable = 0,
+               .op_intenable = 1,
+               .op_halt = 0,
+               .op_cleanup = 1,
+       },
+       [sdma_state_s20_idle] = {
+               .op_enable = 0,
+               .op_intenable = 1,
+               .op_halt = 0,
+               .op_cleanup = 0,
+       },
+       [sdma_state_s30_sw_clean_up_wait] = {
+               .op_enable = 0,
+               .op_intenable = 0,
+               .op_halt = 0,
+               .op_cleanup = 0,
+       },
+       [sdma_state_s40_hw_clean_up_wait] = {
+               .op_enable = 0,
+               .op_intenable = 0,
+               .op_halt = 0,
+               .op_cleanup = 1,
+       },
+       [sdma_state_s50_hw_halt_wait] = {
+               .op_enable = 0,
+               .op_intenable = 0,
+               .op_halt = 0,
+               .op_cleanup = 0,
+       },
+       [sdma_state_s60_idle_halt_wait] = {
+               .go_s99_running_tofalse = 1,
+               .op_enable = 0,
+               .op_intenable = 0,
+               .op_halt = 1,
+               .op_cleanup = 0,
+       },
+       [sdma_state_s80_hw_freeze] = {
+               .op_enable = 0,
+               .op_intenable = 0,
+               .op_halt = 0,
+               .op_cleanup = 0,
+       },
+       [sdma_state_s82_freeze_sw_clean] = {
+               .op_enable = 0,
+               .op_intenable = 0,
+               .op_halt = 0,
+               .op_cleanup = 0,
+       },
+       [sdma_state_s99_running] = {
+               .op_enable = 1,
+               .op_intenable = 1,
+               .op_halt = 0,
+               .op_cleanup = 0,
+               .go_s99_running_totrue = 1,
+       },
+};
+
+#define SDMA_TAIL_UPDATE_THRESH 0x1F
+
+/* declare all statics here rather than keep sorting */
+static void sdma_complete(struct kref *);
+static void sdma_finalput(struct sdma_state *);
+static void sdma_get(struct sdma_state *);
+static void sdma_hw_clean_up_task(unsigned long);
+static void sdma_put(struct sdma_state *);
+static void sdma_set_state(struct sdma_engine *, enum sdma_states);
+static void sdma_start_hw_clean_up(struct sdma_engine *);
+static void sdma_start_sw_clean_up(struct sdma_engine *);
+static void sdma_sw_clean_up_task(unsigned long);
+static void sdma_sendctrl(struct sdma_engine *, unsigned);
+static void init_sdma_regs(struct sdma_engine *, u32, uint);
+static void sdma_process_event(
+       struct sdma_engine *sde,
+       enum sdma_events event);
+static void __sdma_process_event(
+       struct sdma_engine *sde,
+       enum sdma_events event);
+static void dump_sdma_state(struct sdma_engine *sde);
+static void sdma_make_progress(struct sdma_engine *sde, u64 status);
+static void sdma_desc_avail(struct sdma_engine *sde, unsigned avail);
+static void sdma_flush_descq(struct sdma_engine *sde);
+
+/**
+ * sdma_state_name() - return state string from enum
+ * @state: state
+ */
+static const char *sdma_state_name(enum sdma_states state)
+{
+       return sdma_state_names[state];
+}
+
+static void sdma_get(struct sdma_state *ss)
+{
+       kref_get(&ss->kref);
+}
+
+static void sdma_complete(struct kref *kref)
+{
+       struct sdma_state *ss =
+               container_of(kref, struct sdma_state, kref);
+
+       complete(&ss->comp);
+}
+
+static void sdma_put(struct sdma_state *ss)
+{
+       kref_put(&ss->kref, sdma_complete);
+}
+
+static void sdma_finalput(struct sdma_state *ss)
+{
+       sdma_put(ss);
+       wait_for_completion(&ss->comp);
+}
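sdma_get()/sdma_put()/sdma_finalput() form a reference-counted quiesce: the last kref_put() runs sdma_complete(), which completes ss->comp, and sdma_finalput() drops its own reference and then blocks until that happens. The same pattern in isolation (hypothetical names, not the driver's own):

    struct quiesce {
            struct kref kref;          /* set up with kref_init()       */
            struct completion comp;    /* set up with init_completion() */
    };

    static void quiesce_release(struct kref *kref)
    {
            struct quiesce *q = container_of(kref, struct quiesce, kref);

            complete(&q->comp);
    }

    static void quiesce_finalput(struct quiesce *q)
    {
            kref_put(&q->kref, quiesce_release);  /* drop our reference    */
            wait_for_completion(&q->comp);        /* wait for last holder  */
    }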
+
+static inline void write_sde_csr(
+       struct sdma_engine *sde,
+       u32 offset0,
+       u64 value)
+{
+       write_kctxt_csr(sde->dd, sde->this_idx, offset0, value);
+}
+
+static inline u64 read_sde_csr(
+       struct sdma_engine *sde,
+       u32 offset0)
+{
+       return read_kctxt_csr(sde->dd, sde->this_idx, offset0);
+}
+
+/*
+ * sdma_wait_for_packet_egress() - wait for the VL FIFO occupancy for
+ * sdma engine 'sde' to drop to 0.
+ */
+static void sdma_wait_for_packet_egress(struct sdma_engine *sde,
+                                       int pause)
+{
+       u64 off = 8 * sde->this_idx;
+       struct hfi1_devdata *dd = sde->dd;
+       int lcnt = 0;
+
+       while (1) {
+               u64 reg = read_csr(dd, off + SEND_EGRESS_SEND_DMA_STATUS);
+
+               reg &= SDMA_EGRESS_PACKET_OCCUPANCY_SMASK;
+               reg >>= SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT;
+               if (reg == 0)
+                       break;
+               if (lcnt++ > 100) {
+                       dd_dev_err(dd, "%s: engine %u timeout waiting for packets to egress, remaining count %u\n",
+                                 __func__, sde->this_idx, (u32)reg);
+                       break;
+               }
+               udelay(1);
+       }
+}
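+
+/*
+ * Note: the loop above polls the egress occupancy roughly once per
+ * microsecond and gives up after about 100 iterations (~100us), logging
+ * the remaining packet count rather than blocking indefinitely.
+ */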
+
+/*
+ * sdma_wait() - wait for packet egress to complete for all SDMA engines,
+ * and pause for credit return.
+ */
+void sdma_wait(struct hfi1_devdata *dd)
+{
+       int i;
+
+       for (i = 0; i < dd->num_sdma; i++) {
+               struct sdma_engine *sde = &dd->per_sdma[i];
+
+               sdma_wait_for_packet_egress(sde, 0);
+       }
+}
+
+static inline void sdma_set_desc_cnt(struct sdma_engine *sde, unsigned cnt)
+{
+       u64 reg;
+
+       if (!(sde->dd->flags & HFI1_HAS_SDMA_TIMEOUT))
+               return;
+       reg = cnt;
+       reg &= SD(DESC_CNT_CNT_MASK);
+       reg <<= SD(DESC_CNT_CNT_SHIFT);
+       write_sde_csr(sde, SD(DESC_CNT), reg);
+}
+
+/*
+ * Complete all the sdma requests with a SDMA_TXREQ_S_ABORTED status
+ *
+ * Depending on timing there can be txreqs in two places:
+ * - in the descq ring
+ * - in the flush list
+ *
+ * To avoid ordering issues the descq ring needs to be flushed
+ * first followed by the flush list.
+ *
+ * This routine is called from two places
+ * - From a work queue item
+ * - Directly from the state machine just before setting the
+ *   state to running
+ *
+ * Must be called with head_lock held
+ *
+ */
+static void sdma_flush(struct sdma_engine *sde)
+{
+       struct sdma_txreq *txp, *txp_next;
+       LIST_HEAD(flushlist);
+
+       /* flush from head to tail */
+       sdma_flush_descq(sde);
+       spin_lock(&sde->flushlist_lock);
+       /* copy flush list */
+       list_for_each_entry_safe(txp, txp_next, &sde->flushlist, list) {
+               list_del_init(&txp->list);
+               list_add_tail(&txp->list, &flushlist);
+       }
+       spin_unlock(&sde->flushlist_lock);
+       /* flush from flush list */
+       list_for_each_entry_safe(txp, txp_next, &flushlist, list) {
+               int drained = 0;
+               /* protect against complete modifying */
+               struct iowait *wait = txp->wait;
+
+               list_del_init(&txp->list);
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+               trace_hfi1_sdma_out_sn(sde, txp->sn);
+               if (WARN_ON_ONCE(sde->head_sn != txp->sn))
+                       dd_dev_err(sde->dd, "expected %llu got %llu\n",
+                               sde->head_sn, txp->sn);
+               sde->head_sn++;
+#endif
+               sdma_txclean(sde->dd, txp);
+               if (wait)
+                       drained = atomic_dec_and_test(&wait->sdma_busy);
+               if (txp->complete)
+                       (*txp->complete)(txp, SDMA_TXREQ_S_ABORTED, drained);
+               if (wait && drained)
+                       iowait_drain_wakeup(wait);
+       }
+}
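+
+/*
+ * Completion contract for the flush above: every aborted txreq has
+ * sdma_txclean() run on it and its ->complete() callback invoked with
+ * SDMA_TXREQ_S_ABORTED.  When a txreq is tied to an iowait and it was
+ * the last outstanding request (sdma_busy drops to zero), the waiter
+ * is woken via iowait_drain_wakeup().
+ */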
+
+/*
+ * Handle the deferred work request that flushes the descq ring
+ * and the flush list.
+ *
+ * If the engine has been brought back to the running state during
+ * the scheduling delay, the flush is skipped on the assumption that
+ * the transition to running already performed it.
+ *
+ */
+static void sdma_field_flush(struct work_struct *work)
+{
+       unsigned long flags;
+       struct sdma_engine *sde =
+               container_of(work, struct sdma_engine, flush_worker);
+
+       write_seqlock_irqsave(&sde->head_lock, flags);
+       if (!__sdma_running(sde))
+               sdma_flush(sde);
+       write_sequnlock_irqrestore(&sde->head_lock, flags);
+}
+
+static void sdma_err_halt_wait(struct work_struct *work)
+{
+       struct sdma_engine *sde = container_of(work, struct sdma_engine,
+                                               err_halt_worker);
+       u64 statuscsr;
+       unsigned long timeout;
+
+       timeout = jiffies + msecs_to_jiffies(SDMA_ERR_HALT_TIMEOUT);
+       while (1) {
+               statuscsr = read_sde_csr(sde, SD(STATUS));
+               statuscsr &= SD(STATUS_ENG_HALTED_SMASK);
+               if (statuscsr)
+                       break;
+               if (time_after(jiffies, timeout)) {
+                       dd_dev_err(sde->dd,
+                               "SDMA engine %d - timeout waiting for engine to halt\n",
+                               sde->this_idx);
+                       /*
+                        * Continue anyway.  This could happen if there was
+                        * an uncorrectable error in the wrong spot.
+                        */
+                       break;
+               }
+               usleep_range(80, 120);
+       }
+
+       sdma_process_event(sde, sdma_event_e15_hw_halt_done);
+}
+
+static void sdma_start_err_halt_wait(struct sdma_engine *sde)
+{
+       schedule_work(&sde->err_halt_worker);
+}
+
+static void sdma_err_progress_check_schedule(struct sdma_engine *sde)
+{
+       if (!is_bx(sde->dd) && HFI1_CAP_IS_KSET(SDMA_AHG)) {
+
+               unsigned index;
+               struct hfi1_devdata *dd = sde->dd;
+
+               for (index = 0; index < dd->num_sdma; index++) {
+                       struct sdma_engine *curr_sdma = &dd->per_sdma[index];
+
+                       if (curr_sdma != sde)
+                               curr_sdma->progress_check_head =
+                                                       curr_sdma->descq_head;
+               }
+               dd_dev_err(sde->dd,
+                          "SDMA engine %d - check scheduled\n",
+                               sde->this_idx);
+               mod_timer(&sde->err_progress_check_timer, jiffies + 10);
+       }
+}
+
+static void sdma_err_progress_check(unsigned long data)
+{
+       unsigned index;
+       struct sdma_engine *sde = (struct sdma_engine *)data;
+
+       dd_dev_err(sde->dd, "SDE progress check event\n");
+       for (index = 0; index < sde->dd->num_sdma; index++) {
+               struct sdma_engine *curr_sde = &sde->dd->per_sdma[index];
+               unsigned long flags;
+
+               /* check progress on each engine except the current one */
+               if (curr_sde == sde)
+                       continue;
+               /*
+                * We must disable interrupts when acquiring sde->lock
+                * to avoid a deadlock if an interrupt fires and spins on
+                * the same lock on the same CPU.
+                */
+               spin_lock_irqsave(&curr_sde->tail_lock, flags);
+               write_seqlock(&curr_sde->head_lock);
+
+               /* skip non-running queues */
+               if (curr_sde->state.current_state != sdma_state_s99_running) {
+                       write_sequnlock(&curr_sde->head_lock);
+                       spin_unlock_irqrestore(&curr_sde->tail_lock, flags);
+                       continue;
+               }
+
+               if ((curr_sde->descq_head != curr_sde->descq_tail) &&
+                   (curr_sde->descq_head ==
+                               curr_sde->progress_check_head))
+                       __sdma_process_event(curr_sde,
+                                            sdma_event_e90_sw_halted);
+               write_sequnlock(&curr_sde->head_lock);
+               spin_unlock_irqrestore(&curr_sde->tail_lock, flags);
+       }
+       schedule_work(&sde->err_halt_worker);
+}
+
+static void sdma_hw_clean_up_task(unsigned long opaque)
+{
+       struct sdma_engine *sde = (struct sdma_engine *) opaque;
+       u64 statuscsr;
+
+       while (1) {
+#ifdef CONFIG_SDMA_VERBOSITY
+               dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
+                          sde->this_idx, slashstrip(__FILE__), __LINE__,
+                       __func__);
+#endif
+               statuscsr = read_sde_csr(sde, SD(STATUS));
+               statuscsr &= SD(STATUS_ENG_CLEANED_UP_SMASK);
+               if (statuscsr)
+                       break;
+               udelay(10);
+       }
+
+       sdma_process_event(sde, sdma_event_e25_hw_clean_up_done);
+}
+
+static inline struct sdma_txreq *get_txhead(struct sdma_engine *sde)
+{
+       smp_read_barrier_depends(); /* see sdma_update_tail() */
+       return sde->tx_ring[sde->tx_head & sde->sdma_mask];
+}
+
+/*
+ * flush ring for recovery
+ */
+static void sdma_flush_descq(struct sdma_engine *sde)
+{
+       u16 head, tail;
+       int progress = 0;
+       struct sdma_txreq *txp = get_txhead(sde);
+
+       /* The reason for some of the complexity of this code is that
+        * not all descriptors have corresponding txps.  So, we have to
+        * be able to skip over descs until we wander into the range of
+        * the next txp on the list.
+        */
+       head = sde->descq_head & sde->sdma_mask;
+       tail = sde->descq_tail & sde->sdma_mask;
+       while (head != tail) {
+               /* advance head, wrap if needed */
+               head = ++sde->descq_head & sde->sdma_mask;
+               /* if now past this txp's descs, do the callback */
+               if (txp && txp->next_descq_idx == head) {
+                       int drained = 0;
+                       /* protect against complete modifying */
+                       struct iowait *wait = txp->wait;
+
+                       /* remove from list */
+                       sde->tx_ring[sde->tx_head++ & sde->sdma_mask] = NULL;
+                       if (wait)
+                               drained = atomic_dec_and_test(&wait->sdma_busy);
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+                       trace_hfi1_sdma_out_sn(sde, txp->sn);
+                       if (WARN_ON_ONCE(sde->head_sn != txp->sn))
+                               dd_dev_err(sde->dd, "expected %llu got %llu\n",
+                                       sde->head_sn, txp->sn);
+                       sde->head_sn++;
+#endif
+                       sdma_txclean(sde->dd, txp);
+                       trace_hfi1_sdma_progress(sde, head, tail, txp);
+                       if (txp->complete)
+                               (*txp->complete)(
+                                       txp,
+                                       SDMA_TXREQ_S_ABORTED,
+                                       drained);
+                       if (wait && drained)
+                               iowait_drain_wakeup(wait);
+                       /* see if there is another txp */
+                       txp = get_txhead(sde);
+               }
+               progress++;
+       }
+       if (progress)
+               sdma_desc_avail(sde, sdma_descq_freecnt(sde));
+}
+
+static void sdma_sw_clean_up_task(unsigned long opaque)
+{
+       struct sdma_engine *sde = (struct sdma_engine *) opaque;
+       unsigned long flags;
+
+       spin_lock_irqsave(&sde->tail_lock, flags);
+       write_seqlock(&sde->head_lock);
+
+       /*
+        * At this point, the following should always be true:
+        * - We are halted, so no more descriptors are getting retired.
+        * - We are not running, so no one is submitting new work.
+        * - Only we can send the e40_sw_cleaned, so we can't start
+        *   running again until we say so.  So, the active list and
+        *   descq are ours to play with.
+        */
+
+       /*
+        * In the error clean up sequence, software clean must be called
+        * before the hardware clean so we can use the hardware head in
+        * the progress routine.  A hardware clean or SPC unfreeze will
+        * reset the hardware head.
+        *
+        * Process all retired requests. The progress routine will use the
+        * latest physical hardware head - we are not running so speed does
+        * not matter.
+        */
+       sdma_make_progress(sde, 0);
+
+       sdma_flush(sde);
+
+       /*
+        * Reset our notion of head and tail.
+        * Note that the HW registers have been reset via an earlier
+        * clean up.
+        */
+       sde->descq_tail = 0;
+       sde->descq_head = 0;
+       sde->desc_avail = sdma_descq_freecnt(sde);
+       *sde->head_dma = 0;
+
+       __sdma_process_event(sde, sdma_event_e40_sw_cleaned);
+
+       write_sequnlock(&sde->head_lock);
+       spin_unlock_irqrestore(&sde->tail_lock, flags);
+}
+
+static void sdma_sw_tear_down(struct sdma_engine *sde)
+{
+       struct sdma_state *ss = &sde->state;
+
+       /* Releasing this reference means the state machine has stopped. */
+       sdma_put(ss);
+
+       /* stop waiting for all unfreeze events to complete */
+       atomic_set(&sde->dd->sdma_unfreeze_count, -1);
+       wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
+}
+
+static void sdma_start_hw_clean_up(struct sdma_engine *sde)
+{
+       tasklet_hi_schedule(&sde->sdma_hw_clean_up_task);
+}
+
+static void sdma_start_sw_clean_up(struct sdma_engine *sde)
+{
+       tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
+}
+
+static void sdma_set_state(struct sdma_engine *sde,
+       enum sdma_states next_state)
+{
+       struct sdma_state *ss = &sde->state;
+       const struct sdma_set_state_action *action = sdma_action_table;
+       unsigned op = 0;
+
+       trace_hfi1_sdma_state(
+               sde,
+               sdma_state_names[ss->current_state],
+               sdma_state_names[next_state]);
+
+       /* debugging bookkeeping */
+       ss->previous_state = ss->current_state;
+       ss->previous_op = ss->current_op;
+       ss->current_state = next_state;
+
+       if (ss->previous_state != sdma_state_s99_running
+               && next_state == sdma_state_s99_running)
+               sdma_flush(sde);
+
+       if (action[next_state].op_enable)
+               op |= SDMA_SENDCTRL_OP_ENABLE;
+
+       if (action[next_state].op_intenable)
+               op |= SDMA_SENDCTRL_OP_INTENABLE;
+
+       if (action[next_state].op_halt)
+               op |= SDMA_SENDCTRL_OP_HALT;
+
+       if (action[next_state].op_cleanup)
+               op |= SDMA_SENDCTRL_OP_CLEANUP;
+
+       if (action[next_state].go_s99_running_tofalse)
+               ss->go_s99_running = 0;
+
+       if (action[next_state].go_s99_running_totrue)
+               ss->go_s99_running = 1;
+
+       ss->current_op = op;
+       sdma_sendctrl(sde, ss->current_op);
+}
+
+/**
+ * sdma_get_descq_cnt() - called when device probed
+ *
+ * Return a validated descq count.
+ *
+ * This is currently only used in the verbs initialization to build the tx
+ * list.
+ *
+ * This will probably be deleted in favor of a more scalable approach to
+ * alloc tx's.
+ *
+ */
+u16 sdma_get_descq_cnt(void)
+{
+       u16 count = sdma_descq_cnt;
+
+       if (!count)
+               return SDMA_DESCQ_CNT;
+       /* count must be a power of 2 greater than 64 and less than
+        * 32768.   Otherwise return default.
+        */
+       if (!is_power_of_2(count))
+               return SDMA_DESCQ_CNT;
+       if (count < 64 || count > 32768)
+               return SDMA_DESCQ_CNT;
+       return count;
+}
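+
+/*
+ * Illustrative values for the check above: a module parameter of 2048
+ * (a power of 2 within range) is used as-is; 1000 (not a power of 2)
+ * and 32 (below the minimum) both fall back to SDMA_DESCQ_CNT.
+ */
+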
+/**
+ * sdma_select_engine_vl() - select sdma engine
+ * @dd: devdata
+ * @selector: a spreading factor
+ * @vl: this vl
+ *
+ *
+ * This function returns an engine based on the selector and a vl.  The
+ * mapping fields are protected by RCU.
+ */
+struct sdma_engine *sdma_select_engine_vl(
+       struct hfi1_devdata *dd,
+       u32 selector,
+       u8 vl)
+{
+       struct sdma_vl_map *m;
+       struct sdma_map_elem *e;
+       struct sdma_engine *rval;
+
+       if (WARN_ON(vl > 8))
+               return NULL;
+
+       rcu_read_lock();
+       m = rcu_dereference(dd->sdma_map);
+       if (unlikely(!m)) {
+               rcu_read_unlock();
+               return NULL;
+       }
+       e = m->map[vl & m->mask];
+       rval = e->sde[selector & e->mask];
+       rcu_read_unlock();
+
+       trace_hfi1_sdma_engine_select(dd, selector, vl, rval->this_idx);
+       return rval;
+}
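+
+/*
+ * Lookup sketch for the function above: dd->sdma_map is a two level,
+ * RCU protected table.  The VL (masked by m->mask) selects a
+ * struct sdma_map_elem, and the low bits of the caller supplied
+ * selector (masked by e->mask) pick one of the engines assigned to
+ * that VL, spreading traffic across them.
+ */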
+
+/**
+ * sdma_select_engine_sc() - select sdma engine
+ * @dd: devdata
+ * @selector: a spreading factor
+ * @sc5: the 5 bit sc
+ *
+ *
+ * This function returns an engine based on the selector and an sc.
+ */
+struct sdma_engine *sdma_select_engine_sc(
+       struct hfi1_devdata *dd,
+       u32 selector,
+       u8 sc5)
+{
+       u8 vl = sc_to_vlt(dd, sc5);
+
+       return sdma_select_engine_vl(dd, selector, vl);
+}
+
+/*
+ * Free the indicated map struct
+ */
+static void sdma_map_free(struct sdma_vl_map *m)
+{
+       int i;
+
+       for (i = 0; m && i < m->actual_vls; i++)
+               kfree(m->map[i]);
+       kfree(m);
+}
+
+/*
+ * Handle RCU callback
+ */
+static void sdma_map_rcu_callback(struct rcu_head *list)
+{
+       struct sdma_vl_map *m = container_of(list, struct sdma_vl_map, list);
+
+       sdma_map_free(m);
+}
+
+/**
+ * sdma_map_init - called when # vls change
+ * @dd: hfi1_devdata
+ * @port: port number
+ * @num_vls: number of vls
+ * @vl_engines: per vl engine mapping (optional)
+ *
+ * This routine changes the mapping based on the number of vls.
+ *
+ * vl_engines is used to specify a non-uniform vl/engine loading. NULL
+ * implies auto computing the loading and giving each VL a uniform
+ * distribution of engines per VL.
+ *
+ * The auto algorithm computes the sde_per_vl and the number of extra
+ * engines.  Any extra engines are added from the last VL on down.
+ *
+ * rcu locking is used here to control access to the mapping fields.
+ *
+ * If either num_vls or num_sdma is not a power of 2, the array sizes
+ * in the struct sdma_vl_map and the struct sdma_map_elem are rounded
+ * up to the next highest power of 2 and the first entry is reused
+ * in a round robin fashion.
+ *
+ * If an error occurs the map change is not done and the mapping is
+ * not changed.
+ *
+ */
+int sdma_map_init(struct hfi1_devdata *dd, u8 port, u8 num_vls, u8 *vl_engines)
+{
+       int i, j;
+       int extra, sde_per_vl;
+       int engine = 0;
+       u8 lvl_engines[OPA_MAX_VLS];
+       struct sdma_vl_map *oldmap, *newmap;
+
+       if (!(dd->flags & HFI1_HAS_SEND_DMA))
+               return 0;
+
+       if (!vl_engines) {
+               /* truncate divide */
+               sde_per_vl = dd->num_sdma / num_vls;
+               /* extras */
+               extra = dd->num_sdma % num_vls;
+               vl_engines = lvl_engines;
+               /* add extras from last vl down */
+               for (i = num_vls - 1; i >= 0; i--, extra--)
+                       vl_engines[i] = sde_per_vl + (extra > 0 ? 1 : 0);
+       }
+       /* build new map */
+       newmap = kzalloc(
+               sizeof(struct sdma_vl_map) +
+                       roundup_pow_of_two(num_vls) *
+                       sizeof(struct sdma_map_elem *),
+               GFP_KERNEL);
+       if (!newmap)
+               goto bail;
+       newmap->actual_vls = num_vls;
+       newmap->vls = roundup_pow_of_two(num_vls);
+       newmap->mask = (1 << ilog2(newmap->vls)) - 1;
+       for (i = 0; i < newmap->vls; i++) {
+               /* save for wrap around */
+               int first_engine = engine;
+
+               if (i < newmap->actual_vls) {
+                       int sz = roundup_pow_of_two(vl_engines[i]);
+
+                       /* only allocate once */
+                       newmap->map[i] = kzalloc(
+                               sizeof(struct sdma_map_elem) +
+                                       sz * sizeof(struct sdma_engine *),
+                               GFP_KERNEL);
+                       if (!newmap->map[i])
+                               goto bail;
+                       newmap->map[i]->mask = (1 << ilog2(sz)) - 1;
+                       /* assign engines */
+                       for (j = 0; j < sz; j++) {
+                               newmap->map[i]->sde[j] =
+                                       &dd->per_sdma[engine];
+                               if (++engine >= first_engine + vl_engines[i])
+                                       /* wrap back to first engine */
+                                       engine = first_engine;
+                       }
+               } else {
+                       /* just re-use entry without allocating */
+                       newmap->map[i] = newmap->map[i % num_vls];
+               }
+               engine = first_engine + vl_engines[i];
+       }
+       /* newmap in hand, save old map */
+       spin_lock_irq(&dd->sde_map_lock);
+       oldmap = rcu_dereference_protected(dd->sdma_map,
+                       lockdep_is_held(&dd->sde_map_lock));
+
+       /* publish newmap */
+       rcu_assign_pointer(dd->sdma_map, newmap);
+
+       spin_unlock_irq(&dd->sde_map_lock);
+       /* success, free any old map after grace period */
+       if (oldmap)
+               call_rcu(&oldmap->list, sdma_map_rcu_callback);
+       return 0;
+bail:
+       /* free any partial allocation */
+       sdma_map_free(newmap);
+       return -ENOMEM;
+}
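+
+/*
+ * Worked example (illustrative numbers): with num_sdma = 16,
+ * num_vls = 5 and no vl_engines override, sde_per_vl = 3 and extra = 1,
+ * so vl_engines = {3, 3, 3, 3, 4} with the spare engine given to the
+ * last VL.  newmap->vls rounds up to 8 and entries 5..7 reuse entries
+ * 0..2.  For a VL with 3 engines the per-VL element is sized to 4 and
+ * the fourth slot wraps back to that VL's first engine.
+ */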
+
+/*
+ * Clean up allocated memory.
+ *
+ * This routine can be called regardless of whether sdma_init() succeeded.
+ *
+ */
+static void sdma_clean(struct hfi1_devdata *dd, size_t num_engines)
+{
+       size_t i;
+       struct sdma_engine *sde;
+
+       if (dd->sdma_pad_dma) {
+               dma_free_coherent(&dd->pcidev->dev, 4,
+                                 (void *)dd->sdma_pad_dma,
+                                 dd->sdma_pad_phys);
+               dd->sdma_pad_dma = NULL;
+               dd->sdma_pad_phys = 0;
+       }
+       if (dd->sdma_heads_dma) {
+               dma_free_coherent(&dd->pcidev->dev, dd->sdma_heads_size,
+                                 (void *)dd->sdma_heads_dma,
+                                 dd->sdma_heads_phys);
+               dd->sdma_heads_dma = NULL;
+               dd->sdma_heads_phys = 0;
+       }
+       for (i = 0; dd->per_sdma && i < num_engines; ++i) {
+               sde = &dd->per_sdma[i];
+
+               sde->head_dma = NULL;
+               sde->head_phys = 0;
+
+               if (sde->descq) {
+                       dma_free_coherent(
+                               &dd->pcidev->dev,
+                               sde->descq_cnt * sizeof(u64[2]),
+                               sde->descq,
+                               sde->descq_phys
+                       );
+                       sde->descq = NULL;
+                       sde->descq_phys = 0;
+               }
+               if (is_vmalloc_addr(sde->tx_ring))
+                       vfree(sde->tx_ring);
+               else
+                       kfree(sde->tx_ring);
+               sde->tx_ring = NULL;
+       }
+       spin_lock_irq(&dd->sde_map_lock);
+       kfree(rcu_access_pointer(dd->sdma_map));
+       RCU_INIT_POINTER(dd->sdma_map, NULL);
+       spin_unlock_irq(&dd->sde_map_lock);
+       synchronize_rcu();
+       kfree(dd->per_sdma);
+       dd->per_sdma = NULL;
+}
+
+/**
+ * sdma_init() - called when device probed
+ * @dd: hfi1_devdata
+ * @port: port number (currently only zero)
+ *
+ * sdma_init initializes the specified number of engines.
+ *
+ * The code initializes each sde and its CSRs.  Interrupts
+ * are not required to be enabled.
+ *
+ * Returns:
+ * 0 - success, -errno on failure
+ */
+int sdma_init(struct hfi1_devdata *dd, u8 port)
+{
+       unsigned this_idx;
+       struct sdma_engine *sde;
+       u16 descq_cnt;
+       void *curr_head;
+       struct hfi1_pportdata *ppd = dd->pport + port;
+       u32 per_sdma_credits;
+       uint idle_cnt = sdma_idle_cnt;
+       size_t num_engines = dd->chip_sdma_engines;
+
+       if (!HFI1_CAP_IS_KSET(SDMA)) {
+               HFI1_CAP_CLEAR(SDMA_AHG);
+               return 0;
+       }
+       if (mod_num_sdma &&
+               /* can't exceed chip support */
+               mod_num_sdma <= dd->chip_sdma_engines &&
+               /* count must be >= vls */
+               mod_num_sdma >= num_vls)
+               num_engines = mod_num_sdma;
+
+       dd_dev_info(dd, "SDMA mod_num_sdma: %u\n", mod_num_sdma);
+       dd_dev_info(dd, "SDMA chip_sdma_engines: %u\n", dd->chip_sdma_engines);
+       dd_dev_info(dd, "SDMA chip_sdma_mem_size: %u\n",
+               dd->chip_sdma_mem_size);
+
+       per_sdma_credits =
+               dd->chip_sdma_mem_size/(num_engines * SDMA_BLOCK_SIZE);
+
+       /* set up freeze waitqueue */
+       init_waitqueue_head(&dd->sdma_unfreeze_wq);
+       atomic_set(&dd->sdma_unfreeze_count, 0);
+
+       descq_cnt = sdma_get_descq_cnt();
+       dd_dev_info(dd, "SDMA engines %zu descq_cnt %u\n",
+               num_engines, descq_cnt);
+
+       /* alloc memory for array of send engines */
+       dd->per_sdma = kcalloc(num_engines, sizeof(*dd->per_sdma), GFP_KERNEL);
+       if (!dd->per_sdma)
+               return -ENOMEM;
+
+       idle_cnt = ns_to_cclock(dd, idle_cnt);
+       /* Allocate memory for SendDMA descriptor FIFOs */
+       for (this_idx = 0; this_idx < num_engines; ++this_idx) {
+               sde = &dd->per_sdma[this_idx];
+               sde->dd = dd;
+               sde->ppd = ppd;
+               sde->this_idx = this_idx;
+               sde->descq_cnt = descq_cnt;
+               sde->desc_avail = sdma_descq_freecnt(sde);
+               sde->sdma_shift = ilog2(descq_cnt);
+               sde->sdma_mask = (1 << sde->sdma_shift) - 1;
+               sde->descq_full_count = 0;
+
+               /* Create a mask for all 3 chip interrupt sources */
+               sde->imask = (u64)1 << (0*TXE_NUM_SDMA_ENGINES + this_idx)
+                       | (u64)1 << (1*TXE_NUM_SDMA_ENGINES + this_idx)
+                       | (u64)1 << (2*TXE_NUM_SDMA_ENGINES + this_idx);
+               /* Create a mask specifically for sdma_idle */
+               sde->idle_mask =
+                       (u64)1 << (2*TXE_NUM_SDMA_ENGINES + this_idx);
+               /* Create a mask specifically for sdma_progress */
+               sde->progress_mask =
+                       (u64)1 << (TXE_NUM_SDMA_ENGINES + this_idx);
+               spin_lock_init(&sde->tail_lock);
+               seqlock_init(&sde->head_lock);
+               spin_lock_init(&sde->senddmactrl_lock);
+               spin_lock_init(&sde->flushlist_lock);
+               /* ensure there is always a zero bit */
+               sde->ahg_bits = 0xfffffffe00000000ULL;
+
+               sdma_set_state(sde, sdma_state_s00_hw_down);
+
+               /* set up reference counting */
+               kref_init(&sde->state.kref);
+               init_completion(&sde->state.comp);
+
+               INIT_LIST_HEAD(&sde->flushlist);
+               INIT_LIST_HEAD(&sde->dmawait);
+
+               sde->tail_csr =
+                       get_kctxt_csr_addr(dd, this_idx, SD(TAIL));
+
+               if (idle_cnt)
+                       dd->default_desc1 =
+                               SDMA_DESC1_HEAD_TO_HOST_FLAG;
+               else
+                       dd->default_desc1 =
+                               SDMA_DESC1_INT_REQ_FLAG;
+
+               tasklet_init(&sde->sdma_hw_clean_up_task, sdma_hw_clean_up_task,
+                       (unsigned long)sde);
+
+               tasklet_init(&sde->sdma_sw_clean_up_task, sdma_sw_clean_up_task,
+                       (unsigned long)sde);
+               INIT_WORK(&sde->err_halt_worker, sdma_err_halt_wait);
+               INIT_WORK(&sde->flush_worker, sdma_field_flush);
+
+               sde->progress_check_head = 0;
+
+               init_timer(&sde->err_progress_check_timer);
+               sde->err_progress_check_timer.function =
+                                               sdma_err_progress_check;
+               sde->err_progress_check_timer.data = (unsigned long)sde;
+
+               sde->descq = dma_zalloc_coherent(
+                       &dd->pcidev->dev,
+                       descq_cnt * sizeof(u64[2]),
+                       &sde->descq_phys,
+                       GFP_KERNEL
+               );
+               if (!sde->descq)
+                       goto bail;
+               sde->tx_ring =
+                       kcalloc(descq_cnt, sizeof(struct sdma_txreq *),
+                               GFP_KERNEL);
+               if (!sde->tx_ring)
+                       sde->tx_ring =
+                               vzalloc(
+                                       sizeof(struct sdma_txreq *) *
+                                       descq_cnt);
+               if (!sde->tx_ring)
+                       goto bail;
+       }
+
+       dd->sdma_heads_size = L1_CACHE_BYTES * num_engines;
+       /* Allocate memory for DMA of head registers to memory */
+       dd->sdma_heads_dma = dma_zalloc_coherent(
+               &dd->pcidev->dev,
+               dd->sdma_heads_size,
+               &dd->sdma_heads_phys,
+               GFP_KERNEL
+       );
+       if (!dd->sdma_heads_dma) {
+               dd_dev_err(dd, "failed to allocate SendDMA head memory\n");
+               goto bail;
+       }
+
+       /* Allocate memory for pad */
+       dd->sdma_pad_dma = dma_zalloc_coherent(
+               &dd->pcidev->dev,
+               sizeof(u32),
+               &dd->sdma_pad_phys,
+               GFP_KERNEL
+       );
+       if (!dd->sdma_pad_dma) {
+               dd_dev_err(dd, "failed to allocate SendDMA pad memory\n");
+               goto bail;
+       }
+
+       /* assign each engine to different cacheline and init registers */
+       curr_head = (void *)dd->sdma_heads_dma;
+       for (this_idx = 0; this_idx < num_engines; ++this_idx) {
+               unsigned long phys_offset;
+
+               sde = &dd->per_sdma[this_idx];
+
+               sde->head_dma = curr_head;
+               curr_head += L1_CACHE_BYTES;
+               phys_offset = (unsigned long)sde->head_dma -
+                             (unsigned long)dd->sdma_heads_dma;
+               sde->head_phys = dd->sdma_heads_phys + phys_offset;
+               init_sdma_regs(sde, per_sdma_credits, idle_cnt);
+       }
+       dd->flags |= HFI1_HAS_SEND_DMA;
+       dd->flags |= idle_cnt ? HFI1_HAS_SDMA_TIMEOUT : 0;
+       dd->num_sdma = num_engines;
+       if (sdma_map_init(dd, port, ppd->vls_operational, NULL))
+               goto bail;
+       dd_dev_info(dd, "SDMA num_sdma: %u\n", dd->num_sdma);
+       return 0;
+
+bail:
+       sdma_clean(dd, num_engines);
+       return -ENOMEM;
+}
+
+/**
+ * sdma_all_running() - called when the link goes up
+ * @dd: hfi1_devdata
+ *
+ * This routine moves all engines to the running state.
+ */
+void sdma_all_running(struct hfi1_devdata *dd)
+{
+       struct sdma_engine *sde;
+       unsigned int i;
+
+       /* move all engines to running */
+       for (i = 0; i < dd->num_sdma; ++i) {
+               sde = &dd->per_sdma[i];
+               sdma_process_event(sde, sdma_event_e30_go_running);
+       }
+}
+
+/**
+ * sdma_all_idle() - called when the link goes down
+ * @dd: hfi1_devdata
+ *
+ * This routine moves all engines to the idle state.
+ */
+void sdma_all_idle(struct hfi1_devdata *dd)
+{
+       struct sdma_engine *sde;
+       unsigned int i;
+
+       /* idle all engines */
+       for (i = 0; i < dd->num_sdma; ++i) {
+               sde = &dd->per_sdma[i];
+               sdma_process_event(sde, sdma_event_e70_go_idle);
+       }
+}
+
+/**
+ * sdma_start() - called to kick off state processing for all engines
+ * @dd: hfi1_devdata
+ *
+ * This routine is for kicking off the state processing for all required
+ * sdma engines.  Interrupts need to be working at this point.
+ *
+ */
+void sdma_start(struct hfi1_devdata *dd)
+{
+       unsigned i;
+       struct sdma_engine *sde;
+
+       /* kick off the engines state processing */
+       for (i = 0; i < dd->num_sdma; ++i) {
+               sde = &dd->per_sdma[i];
+               sdma_process_event(sde, sdma_event_e10_go_hw_start);
+       }
+}
+
+/**
+ * sdma_exit() - used when module is removed
+ * @dd: hfi1_devdata
+ */
+void sdma_exit(struct hfi1_devdata *dd)
+{
+       unsigned this_idx;
+       struct sdma_engine *sde;
+
+       for (this_idx = 0; dd->per_sdma && this_idx < dd->num_sdma;
+                       ++this_idx) {
+
+               sde = &dd->per_sdma[this_idx];
+               if (!list_empty(&sde->dmawait))
+                       dd_dev_err(dd, "sde %u: dmawait list not empty!\n",
+                               sde->this_idx);
+               sdma_process_event(sde, sdma_event_e00_go_hw_down);
+
+               del_timer_sync(&sde->err_progress_check_timer);
+
+               /*
+                * This waits for the state machine to exit so it is not
+                * necessary to kill the sdma_sw_clean_up_task to make sure
+                * it is not running.
+                */
+               sdma_finalput(&sde->state);
+       }
+       sdma_clean(dd, dd->num_sdma);
+}
+
+/*
+ * unmap the indicated descriptor
+ */
+static inline void sdma_unmap_desc(
+       struct hfi1_devdata *dd,
+       struct sdma_desc *descp)
+{
+       switch (sdma_mapping_type(descp)) {
+       case SDMA_MAP_SINGLE:
+               dma_unmap_single(
+                       &dd->pcidev->dev,
+                       sdma_mapping_addr(descp),
+                       sdma_mapping_len(descp),
+                       DMA_TO_DEVICE);
+               break;
+       case SDMA_MAP_PAGE:
+               dma_unmap_page(
+                       &dd->pcidev->dev,
+                       sdma_mapping_addr(descp),
+                       sdma_mapping_len(descp),
+                       DMA_TO_DEVICE);
+               break;
+       }
+}
+
+/*
+ * return the mode as indicated by the first
+ * descriptor in the tx.
+ */
+static inline u8 ahg_mode(struct sdma_txreq *tx)
+{
+       return (tx->descp[0].qw[1] & SDMA_DESC1_HEADER_MODE_SMASK)
+               >> SDMA_DESC1_HEADER_MODE_SHIFT;
+}
+
+/**
+ * sdma_txclean() - clean tx of mappings, descp *kmalloc's
+ * @dd: hfi1_devdata for unmapping
+ * @tx: tx request to clean
+ *
+ * This is used in the progress routine to clean the tx or
+ * by the ULP to toss an in-process tx build.
+ *
+ * The code can be called multiple times without issue.
+ *
+ */
+void sdma_txclean(
+       struct hfi1_devdata *dd,
+       struct sdma_txreq *tx)
+{
+       u16 i;
+
+       if (tx->num_desc) {
+               u8 skip = 0, mode = ahg_mode(tx);
+
+               /* unmap first */
+               sdma_unmap_desc(dd, &tx->descp[0]);
+               /* determine number of AHG descriptors to skip */
+               if (mode > SDMA_AHG_APPLY_UPDATE1)
+                       skip = mode >> 1;
+               for (i = 1 + skip; i < tx->num_desc; i++)
+                       sdma_unmap_desc(dd, &tx->descp[i]);
+               tx->num_desc = 0;
+       }
+       kfree(tx->coalesce_buf);
+       tx->coalesce_buf = NULL;
+       /* kmalloc'ed descp */
+       if (unlikely(tx->desc_limit > ARRAY_SIZE(tx->descs))) {
+               tx->desc_limit = ARRAY_SIZE(tx->descs);
+               kfree(tx->descp);
+       }
+}
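+
+/*
+ * Note on the skip logic above: the first descriptor is always
+ * unmapped.  For AHG modes above SDMA_AHG_APPLY_UPDATE1 the next
+ * (mode >> 1) descriptors carry AHG header updates rather than mapped
+ * payload, so they are skipped; unmapping resumes at index 1 + skip.
+ * submit_tx() applies the same skip when deciding which descriptors
+ * get a generation number.
+ */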
+
+static inline u16 sdma_gethead(struct sdma_engine *sde)
+{
+       struct hfi1_devdata *dd = sde->dd;
+       int use_dmahead;
+       u16 hwhead;
+
+#ifdef CONFIG_SDMA_VERBOSITY
+       dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
+                  sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
+#endif
+
+retry:
+       use_dmahead = HFI1_CAP_IS_KSET(USE_SDMA_HEAD) && __sdma_running(sde) &&
+                                       (dd->flags & HFI1_HAS_SDMA_TIMEOUT);
+       hwhead = use_dmahead ?
+               (u16) le64_to_cpu(*sde->head_dma) :
+               (u16) read_sde_csr(sde, SD(HEAD));
+
+       if (unlikely(HFI1_CAP_IS_KSET(SDMA_HEAD_CHECK))) {
+               u16 cnt;
+               u16 swtail;
+               u16 swhead;
+               int sane;
+
+               swhead = sde->descq_head & sde->sdma_mask;
+               /* this code is really bad for cache line trading */
+               swtail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask;
+               cnt = sde->descq_cnt;
+
+               if (swhead < swtail)
+                       /* not wrapped */
+                       sane = (hwhead >= swhead) && (hwhead <= swtail);
+               else if (swhead > swtail)
+                       /* wrapped around */
+                       sane = ((hwhead >= swhead) && (hwhead < cnt)) ||
+                               (hwhead <= swtail);
+               else
+                       /* empty */
+                       sane = (hwhead == swhead);
+
+               if (unlikely(!sane)) {
+                       dd_dev_err(dd, "SDMA(%u) bad head (%s) hwhd=%hu swhd=%hu swtl=%hu cnt=%hu\n",
+                               sde->this_idx,
+                               use_dmahead ? "dma" : "kreg",
+                               hwhead, swhead, swtail, cnt);
+                       if (use_dmahead) {
+                               /* try one more time, using csr */
+                               use_dmahead = 0;
+                               goto retry;
+                       }
+                       /* proceed as if no progress */
+                       hwhead = swhead;
+               }
+       }
+       return hwhead;
+}
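+
+/*
+ * Example of the sanity check above (illustrative numbers, descq_cnt
+ * = 256): if swhead = 10 and swtail = 200 (not wrapped), any hwhead in
+ * [10, 200] is sane.  If swhead = 200 and swtail = 10 (wrapped), sane
+ * values are [200, 255] or [0, 10].  With swhead == swtail (empty ring)
+ * only hwhead == swhead is sane.  An insane DMA head is retried once
+ * via the CSR before progress is treated as zero.
+ */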
+
+/*
+ * This is called when there are send DMA descriptors that might be
+ * available.
+ *
+ * This is called with head_lock held.
+ */
+static void sdma_desc_avail(struct sdma_engine *sde, unsigned avail)
+{
+       struct iowait *wait, *nw;
+       struct iowait *waits[SDMA_WAIT_BATCH_SIZE];
+       unsigned i, n = 0, seq;
+       struct sdma_txreq *stx;
+       struct hfi1_ibdev *dev = &sde->dd->verbs_dev;
+
+#ifdef CONFIG_SDMA_VERBOSITY
+       dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
+                  slashstrip(__FILE__), __LINE__, __func__);
+       dd_dev_err(sde->dd, "avail: %u\n", avail);
+#endif
+
+       do {
+               seq = read_seqbegin(&dev->iowait_lock);
+               if (!list_empty(&sde->dmawait)) {
+                       /* at least one item */
+                       write_seqlock(&dev->iowait_lock);
+                       /* Harvest waiters wanting DMA descriptors */
+                       list_for_each_entry_safe(
+                                       wait,
+                                       nw,
+                                       &sde->dmawait,
+                                       list) {
+                               u16 num_desc = 0;
+
+                               if (!wait->wakeup)
+                                       continue;
+                               if (n == ARRAY_SIZE(waits))
+                                       break;
+                               if (!list_empty(&wait->tx_head)) {
+                                       stx = list_first_entry(
+                                               &wait->tx_head,
+                                               struct sdma_txreq,
+                                               list);
+                                       num_desc = stx->num_desc;
+                               }
+                               if (num_desc > avail)
+                                       break;
+                               avail -= num_desc;
+                               list_del_init(&wait->list);
+                               waits[n++] = wait;
+                       }
+                       write_sequnlock(&dev->iowait_lock);
+                       break;
+               }
+       } while (read_seqretry(&dev->iowait_lock, seq));
+
+       for (i = 0; i < n; i++)
+               waits[i]->wakeup(waits[i], SDMA_AVAIL_REASON);
+}
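+
+/*
+ * Locking sketch for the routine above: an inexpensive seqlock read
+ * checks whether the dmawait list is empty; only when there is at
+ * least one waiter is the write side taken.  Up to SDMA_WAIT_BATCH_SIZE
+ * waiters whose first queued txreq fits in the available descriptor
+ * count are removed from the list, and their wakeup() callbacks run
+ * only after the lock is dropped.
+ */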
+
+/* head_lock must be held */
+static void sdma_make_progress(struct sdma_engine *sde, u64 status)
+{
+       struct sdma_txreq *txp = NULL;
+       int progress = 0;
+       u16 hwhead, swhead, swtail;
+       int idle_check_done = 0;
+
+       hwhead = sdma_gethead(sde);
+
+       /* The reason for some of the complexity of this code is that
+        * not all descriptors have corresponding txps.  So, we have to
+        * be able to skip over descs until we wander into the range of
+        * the next txp on the list.
+        */
+
+retry:
+       txp = get_txhead(sde);
+       swhead = sde->descq_head & sde->sdma_mask;
+       trace_hfi1_sdma_progress(sde, hwhead, swhead, txp);
+       while (swhead != hwhead) {
+               /* advance head, wrap if needed */
+               swhead = ++sde->descq_head & sde->sdma_mask;
+
+               /* if now past this txp's descs, do the callback */
+               if (txp && txp->next_descq_idx == swhead) {
+                       int drained = 0;
+                       /* protect against complete modifying */
+                       struct iowait *wait = txp->wait;
+
+                       /* remove from list */
+                       sde->tx_ring[sde->tx_head++ & sde->sdma_mask] = NULL;
+                       if (wait)
+                               drained = atomic_dec_and_test(&wait->sdma_busy);
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+                       trace_hfi1_sdma_out_sn(sde, txp->sn);
+                       if (WARN_ON_ONCE(sde->head_sn != txp->sn))
+                               dd_dev_err(sde->dd, "expected %llu got %llu\n",
+                                       sde->head_sn, txp->sn);
+                       sde->head_sn++;
+#endif
+                       sdma_txclean(sde->dd, txp);
+                       if (txp->complete)
+                               (*txp->complete)(
+                                       txp,
+                                       SDMA_TXREQ_S_OK,
+                                       drained);
+                       if (wait && drained)
+                               iowait_drain_wakeup(wait);
+                       /* see if there is another txp */
+                       txp = get_txhead(sde);
+               }
+               trace_hfi1_sdma_progress(sde, hwhead, swhead, txp);
+               progress++;
+       }
+
+       /*
+        * The SDMA idle interrupt is not guaranteed to be ordered with respect
+ * to updates to the dma_head location in host memory. The head
+        * value read might not be fully up to date. If there are pending
+        * descriptors and the SDMA idle interrupt fired then read from the
+        * CSR SDMA head instead to get the latest value from the hardware.
+ * The hardware SDMA head should be read at most once per invocation
+ * of sdma_make_progress(), which is ensured by the idle_check_done flag.
+        */
+       if ((status & sde->idle_mask) && !idle_check_done) {
+               swtail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask;
+               if (swtail != hwhead) {
+                       hwhead = (u16)read_sde_csr(sde, SD(HEAD));
+                       idle_check_done = 1;
+                       goto retry;
+               }
+       }
+
+       sde->last_status = status;
+       if (progress)
+               sdma_desc_avail(sde, sdma_descq_freecnt(sde));
+}
+
+/*
+ * sdma_engine_interrupt() - interrupt handler for engine
+ * @sde: sdma engine
+ * @status: sdma interrupt reason
+ *
+ * Status is a mask of the 3 possible interrupts for this engine.  It will
+ * contain bits _only_ for this SDMA engine.  It will contain at least one
+ * bit and may contain more.
+ */
+void sdma_engine_interrupt(struct sdma_engine *sde, u64 status)
+{
+       trace_hfi1_sdma_engine_interrupt(sde, status);
+       write_seqlock(&sde->head_lock);
+       sdma_set_desc_cnt(sde, sde->descq_cnt / 2);
+       sdma_make_progress(sde, status);
+       write_sequnlock(&sde->head_lock);
+}
+
+/**
+ * sdma_engine_error() - error handler for engine
+ * @sde: sdma engine
+ * @status: sdma interrupt reason
+ */
+void sdma_engine_error(struct sdma_engine *sde, u64 status)
+{
+       unsigned long flags;
+
+#ifdef CONFIG_SDMA_VERBOSITY
+       dd_dev_err(sde->dd, "CONFIG SDMA(%u) error status 0x%llx state %s\n",
+                  sde->this_idx,
+                  (unsigned long long)status,
+                  sdma_state_names[sde->state.current_state]);
+#endif
+       spin_lock_irqsave(&sde->tail_lock, flags);
+       write_seqlock(&sde->head_lock);
+       if (status & ALL_SDMA_ENG_HALT_ERRS)
+               __sdma_process_event(sde, sdma_event_e60_hw_halted);
+       if (status & ~SD(ENG_ERR_STATUS_SDMA_HALT_ERR_SMASK)) {
+               dd_dev_err(sde->dd,
+                       "SDMA (%u) engine error: 0x%llx state %s\n",
+                       sde->this_idx,
+                       (unsigned long long)status,
+                       sdma_state_names[sde->state.current_state]);
+               dump_sdma_state(sde);
+       }
+       write_sequnlock(&sde->head_lock);
+       spin_unlock_irqrestore(&sde->tail_lock, flags);
+}
+
+static void sdma_sendctrl(struct sdma_engine *sde, unsigned op)
+{
+       u64 set_senddmactrl = 0;
+       u64 clr_senddmactrl = 0;
+       unsigned long flags;
+
+#ifdef CONFIG_SDMA_VERBOSITY
+       dd_dev_err(sde->dd, "CONFIG SDMA(%u) senddmactrl E=%d I=%d H=%d C=%d\n",
+                  sde->this_idx,
+                  (op & SDMA_SENDCTRL_OP_ENABLE) ? 1 : 0,
+                  (op & SDMA_SENDCTRL_OP_INTENABLE) ? 1 : 0,
+                  (op & SDMA_SENDCTRL_OP_HALT) ? 1 : 0,
+                  (op & SDMA_SENDCTRL_OP_CLEANUP) ? 1 : 0);
+#endif
+
+       if (op & SDMA_SENDCTRL_OP_ENABLE)
+               set_senddmactrl |= SD(CTRL_SDMA_ENABLE_SMASK);
+       else
+               clr_senddmactrl |= SD(CTRL_SDMA_ENABLE_SMASK);
+
+       if (op & SDMA_SENDCTRL_OP_INTENABLE)
+               set_senddmactrl |= SD(CTRL_SDMA_INT_ENABLE_SMASK);
+       else
+               clr_senddmactrl |= SD(CTRL_SDMA_INT_ENABLE_SMASK);
+
+       if (op & SDMA_SENDCTRL_OP_HALT)
+               set_senddmactrl |= SD(CTRL_SDMA_HALT_SMASK);
+       else
+               clr_senddmactrl |= SD(CTRL_SDMA_HALT_SMASK);
+
+       spin_lock_irqsave(&sde->senddmactrl_lock, flags);
+
+       sde->p_senddmactrl |= set_senddmactrl;
+       sde->p_senddmactrl &= ~clr_senddmactrl;
+
+       if (op & SDMA_SENDCTRL_OP_CLEANUP)
+               write_sde_csr(sde, SD(CTRL),
+                       sde->p_senddmactrl |
+                       SD(CTRL_SDMA_CLEANUP_SMASK));
+       else
+               write_sde_csr(sde, SD(CTRL), sde->p_senddmactrl);
+
+       spin_unlock_irqrestore(&sde->senddmactrl_lock, flags);
+
+#ifdef CONFIG_SDMA_VERBOSITY
+       sdma_dumpstate(sde);
+#endif
+}
+
+static void sdma_setlengen(struct sdma_engine *sde)
+{
+#ifdef CONFIG_SDMA_VERBOSITY
+       dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
+                  sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
+#endif
+
+       /*
+        * Set SendDmaLenGen and clear-then-set the MSB of the generation
+        * count to enable generation checking and load the internal
+        * generation counter.
+        */
+       write_sde_csr(sde, SD(LEN_GEN),
+               (sde->descq_cnt/64) << SD(LEN_GEN_LENGTH_SHIFT)
+       );
+       write_sde_csr(sde, SD(LEN_GEN),
+               ((sde->descq_cnt/64) << SD(LEN_GEN_LENGTH_SHIFT))
+               | (4ULL << SD(LEN_GEN_GENERATION_SHIFT))
+       );
+}
+
+static inline void sdma_update_tail(struct sdma_engine *sde, u16 tail)
+{
+       /* Commit writes to memory and advance the tail on the chip */
+       smp_wmb(); /* see get_txhead() */
+       writeq(tail, sde->tail_csr);
+}
+
+/*
+ * This is called when changing to state s10_hw_start_up_halt_wait as
+ * a result of send buffer errors or send DMA descriptor errors.
+ */
+static void sdma_hw_start_up(struct sdma_engine *sde)
+{
+       u64 reg;
+
+#ifdef CONFIG_SDMA_VERBOSITY
+       dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
+                  sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
+#endif
+
+       sdma_setlengen(sde);
+       sdma_update_tail(sde, 0); /* Set SendDmaTail */
+       *sde->head_dma = 0;
+
+       reg = SD(ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_MASK) <<
+             SD(ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SHIFT);
+       write_sde_csr(sde, SD(ENG_ERR_CLEAR), reg);
+}
+
+#define CLEAR_STATIC_RATE_CONTROL_SMASK(r) \
+(r &= ~SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)
+
+#define SET_STATIC_RATE_CONTROL_SMASK(r) \
+(r |= SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)
+/*
+ * set_sdma_integrity
+ *
+ * Set the SEND_DMA_CHECK_ENABLE register for send DMA engine 'sde'.
+ */
+static void set_sdma_integrity(struct sdma_engine *sde)
+{
+       struct hfi1_devdata *dd = sde->dd;
+       u64 reg;
+
+       if (unlikely(HFI1_CAP_IS_KSET(NO_INTEGRITY)))
+               return;
+
+       reg = hfi1_pkt_base_sdma_integrity(dd);
+
+       if (HFI1_CAP_IS_KSET(STATIC_RATE_CTRL))
+               CLEAR_STATIC_RATE_CONTROL_SMASK(reg);
+       else
+               SET_STATIC_RATE_CONTROL_SMASK(reg);
+
+       write_sde_csr(sde, SD(CHECK_ENABLE), reg);
+}
+
+static void init_sdma_regs(
+       struct sdma_engine *sde,
+       u32 credits,
+       uint idle_cnt)
+{
+       u8 opval, opmask;
+#ifdef CONFIG_SDMA_VERBOSITY
+       struct hfi1_devdata *dd = sde->dd;
+
+       dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n",
+                  sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
+#endif
+
+       write_sde_csr(sde, SD(BASE_ADDR), sde->descq_phys);
+       sdma_setlengen(sde);
+       sdma_update_tail(sde, 0); /* Set SendDmaTail */
+       write_sde_csr(sde, SD(RELOAD_CNT), idle_cnt);
+       write_sde_csr(sde, SD(DESC_CNT), 0);
+       write_sde_csr(sde, SD(HEAD_ADDR), sde->head_phys);
+       write_sde_csr(sde, SD(MEMORY),
+               ((u64)credits <<
+                       SD(MEMORY_SDMA_MEMORY_CNT_SHIFT)) |
+               ((u64)(credits * sde->this_idx) <<
+                       SD(MEMORY_SDMA_MEMORY_INDEX_SHIFT)));
+       write_sde_csr(sde, SD(ENG_ERR_MASK), ~0ull);
+       set_sdma_integrity(sde);
+       opmask = OPCODE_CHECK_MASK_DISABLED;
+       opval = OPCODE_CHECK_VAL_DISABLED;
+       write_sde_csr(sde, SD(CHECK_OPCODE),
+               (opmask << SEND_CTXT_CHECK_OPCODE_MASK_SHIFT) |
+               (opval << SEND_CTXT_CHECK_OPCODE_VALUE_SHIFT));
+}
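+
+/*
+ * Note on the MEMORY CSR programming above: sdma_init() computes
+ * per_sdma_credits = chip_sdma_mem_size / (num_engines *
+ * SDMA_BLOCK_SIZE), so each engine is given an equal, contiguous slice
+ * of the send DMA memory starting at index credits * this_idx.
+ */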
+
+#ifdef CONFIG_SDMA_VERBOSITY
+
+#define sdma_dumpstate_helper0(reg) do { \
+               csr = read_csr(sde->dd, reg); \
+               dd_dev_err(sde->dd, "%36s     0x%016llx\n", #reg, csr); \
+       } while (0)
+
+#define sdma_dumpstate_helper(reg) do { \
+               csr = read_sde_csr(sde, reg); \
+               dd_dev_err(sde->dd, "%36s[%02u] 0x%016llx\n", \
+                       #reg, sde->this_idx, csr); \
+       } while (0)
+
+#define sdma_dumpstate_helper2(reg) do { \
+               csr = read_csr(sde->dd, reg + (8 * i)); \
+               dd_dev_err(sde->dd, "%33s_%02u     0x%016llx\n", \
+                               #reg, i, csr); \
+       } while (0)
+
+void sdma_dumpstate(struct sdma_engine *sde)
+{
+       u64 csr;
+       unsigned i;
+
+       sdma_dumpstate_helper(SD(CTRL));
+       sdma_dumpstate_helper(SD(STATUS));
+       sdma_dumpstate_helper0(SD(ERR_STATUS));
+       sdma_dumpstate_helper0(SD(ERR_MASK));
+       sdma_dumpstate_helper(SD(ENG_ERR_STATUS));
+       sdma_dumpstate_helper(SD(ENG_ERR_MASK));
+
+       for (i = 0; i < CCE_NUM_INT_CSRS; ++i) {
+               sdma_dumpstate_helper2(CCE_INT_STATUS);
+               sdma_dumpstate_helper2(CCE_INT_MASK);
+               sdma_dumpstate_helper2(CCE_INT_BLOCKED);
+       }
+
+       sdma_dumpstate_helper(SD(TAIL));
+       sdma_dumpstate_helper(SD(HEAD));
+       sdma_dumpstate_helper(SD(PRIORITY_THLD));
+       sdma_dumpstate_helper(SD(IDLE_CNT));
+       sdma_dumpstate_helper(SD(RELOAD_CNT));
+       sdma_dumpstate_helper(SD(DESC_CNT));
+       sdma_dumpstate_helper(SD(DESC_FETCHED_CNT));
+       sdma_dumpstate_helper(SD(MEMORY));
+       sdma_dumpstate_helper0(SD(ENGINES));
+       sdma_dumpstate_helper0(SD(MEM_SIZE));
+       /* sdma_dumpstate_helper(SEND_EGRESS_SEND_DMA_STATUS);  */
+       sdma_dumpstate_helper(SD(BASE_ADDR));
+       sdma_dumpstate_helper(SD(LEN_GEN));
+       sdma_dumpstate_helper(SD(HEAD_ADDR));
+       sdma_dumpstate_helper(SD(CHECK_ENABLE));
+       sdma_dumpstate_helper(SD(CHECK_VL));
+       sdma_dumpstate_helper(SD(CHECK_JOB_KEY));
+       sdma_dumpstate_helper(SD(CHECK_PARTITION_KEY));
+       sdma_dumpstate_helper(SD(CHECK_SLID));
+       sdma_dumpstate_helper(SD(CHECK_OPCODE));
+}
+#endif
+
+static void dump_sdma_state(struct sdma_engine *sde)
+{
+       struct hw_sdma_desc *descq;
+       struct hw_sdma_desc *descqp;
+       u64 desc[2];
+       u64 addr;
+       u8 gen;
+       u16 len;
+       u16 head, tail, cnt;
+
+       head = sde->descq_head & sde->sdma_mask;
+       tail = sde->descq_tail & sde->sdma_mask;
+       cnt = sdma_descq_freecnt(sde);
+       descq = sde->descq;
+
+       dd_dev_err(sde->dd,
+               "SDMA (%u) descq_head: %u descq_tail: %u freecnt: %u FLE %d\n",
+               sde->this_idx,
+               head,
+               tail,
+               cnt,
+               !list_empty(&sde->flushlist));
+
+       /* print info for each entry in the descriptor queue */
+       while (head != tail) {
+               char flags[6] = { 'x', 'x', 'x', 'x', 0 };
+
+               descqp = &sde->descq[head];
+               desc[0] = le64_to_cpu(descqp->qw[0]);
+               desc[1] = le64_to_cpu(descqp->qw[1]);
+               flags[0] = (desc[1] & SDMA_DESC1_INT_REQ_FLAG) ? 'I' : '-';
+               flags[1] = (desc[1] & SDMA_DESC1_HEAD_TO_HOST_FLAG) ?
+                               'H' : '-';
+               flags[2] = (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG) ? 'F' : '-';
+               flags[3] = (desc[0] & SDMA_DESC0_LAST_DESC_FLAG) ? 'L' : '-';
+               addr = (desc[0] >> SDMA_DESC0_PHY_ADDR_SHIFT)
+                       & SDMA_DESC0_PHY_ADDR_MASK;
+               gen = (desc[1] >> SDMA_DESC1_GENERATION_SHIFT)
+                       & SDMA_DESC1_GENERATION_MASK;
+               len = (desc[0] >> SDMA_DESC0_BYTE_COUNT_SHIFT)
+                       & SDMA_DESC0_BYTE_COUNT_MASK;
+               dd_dev_err(sde->dd,
+                       "SDMA sdmadesc[%u]: flags:%s addr:0x%016llx gen:%u len:%u bytes\n",
+                        head, flags, addr, gen, len);
+               dd_dev_err(sde->dd,
+                       "\tdesc0:0x%016llx desc1 0x%016llx\n",
+                        desc[0], desc[1]);
+               if (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG)
+                       dd_dev_err(sde->dd,
+                               "\taidx: %u amode: %u alen: %u\n",
+                               (u8)((desc[1] & SDMA_DESC1_HEADER_INDEX_SMASK)
+                                       >> SDMA_DESC1_HEADER_INDEX_SHIFT),
+                               (u8)((desc[1] & SDMA_DESC1_HEADER_MODE_SMASK)
+                                       >> SDMA_DESC1_HEADER_MODE_SHIFT),
+                               (u8)((desc[1] & SDMA_DESC1_HEADER_DWS_SMASK)
+                                       >> SDMA_DESC1_HEADER_DWS_SHIFT));
+               head++;
+               head &= sde->sdma_mask;
+       }
+}
+
+#define SDE_FMT \
+       "SDE %u STE %s C 0x%llx S 0x%016llx E 0x%llx T(HW) 0x%llx T(SW) 0x%x H(HW) 0x%llx H(SW) 0x%x H(D) 0x%llx DM 0x%llx GL 0x%llx R 0x%llx LIS 0x%llx AHGI 0x%llx TXT %u TXH %u DT %u DH %u FLNE %d DQF %u SLC 0x%llx\n"
+/**
+ * sdma_seqfile_dump_sde() - debugfs dump of sde
+ * @s: seq file
+ * @sde: send dma engine to dump
+ *
+ * This routine dumps the sde to the indicated seq file.
+ */
+void sdma_seqfile_dump_sde(struct seq_file *s, struct sdma_engine *sde)
+{
+       u16 head, tail;
+       struct hw_sdma_desc *descqp;
+       u64 desc[2];
+       u64 addr;
+       u8 gen;
+       u16 len;
+
+       head = sde->descq_head & sde->sdma_mask;
+       tail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask;
+       seq_printf(s, SDE_FMT, sde->this_idx,
+               sdma_state_name(sde->state.current_state),
+               (unsigned long long)read_sde_csr(sde, SD(CTRL)),
+               (unsigned long long)read_sde_csr(sde, SD(STATUS)),
+               (unsigned long long)read_sde_csr(sde,
+                       SD(ENG_ERR_STATUS)),
+               (unsigned long long)read_sde_csr(sde, SD(TAIL)),
+               tail,
+               (unsigned long long)read_sde_csr(sde, SD(HEAD)),
+               head,
+               (unsigned long long)le64_to_cpu(*sde->head_dma),
+               (unsigned long long)read_sde_csr(sde, SD(MEMORY)),
+               (unsigned long long)read_sde_csr(sde, SD(LEN_GEN)),
+               (unsigned long long)read_sde_csr(sde, SD(RELOAD_CNT)),
+               (unsigned long long)sde->last_status,
+               (unsigned long long)sde->ahg_bits,
+               sde->tx_tail,
+               sde->tx_head,
+               sde->descq_tail,
+               sde->descq_head,
+               !list_empty(&sde->flushlist),
+               sde->descq_full_count,
+               (unsigned long long)read_sde_csr(sde, SEND_DMA_CHECK_SLID));
+
+       /* print info for each entry in the descriptor queue */
+       while (head != tail) {
+               char flags[6] = { 'x', 'x', 'x', 'x', 0 };
+
+               descqp = &sde->descq[head];
+               desc[0] = le64_to_cpu(descqp->qw[0]);
+               desc[1] = le64_to_cpu(descqp->qw[1]);
+               flags[0] = (desc[1] & SDMA_DESC1_INT_REQ_FLAG) ? 'I' : '-';
+               flags[1] = (desc[1] & SDMA_DESC1_HEAD_TO_HOST_FLAG) ?
+                               'H' : '-';
+               flags[2] = (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG) ? 'F' : '-';
+               flags[3] = (desc[0] & SDMA_DESC0_LAST_DESC_FLAG) ? 'L' : '-';
+               addr = (desc[0] >> SDMA_DESC0_PHY_ADDR_SHIFT)
+                       & SDMA_DESC0_PHY_ADDR_MASK;
+               gen = (desc[1] >> SDMA_DESC1_GENERATION_SHIFT)
+                       & SDMA_DESC1_GENERATION_MASK;
+               len = (desc[0] >> SDMA_DESC0_BYTE_COUNT_SHIFT)
+                       & SDMA_DESC0_BYTE_COUNT_MASK;
+               seq_printf(s,
+                       "\tdesc[%u]: flags:%s addr:0x%016llx gen:%u len:%u bytes\n",
+                       head, flags, addr, gen, len);
+               if (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG)
+                       seq_printf(s, "\t\tahgidx: %u ahgmode: %u\n",
+                               (u8)((desc[1] & SDMA_DESC1_HEADER_INDEX_SMASK)
+                                       >> SDMA_DESC1_HEADER_INDEX_SHIFT),
+                               (u8)((desc[1] & SDMA_DESC1_HEADER_MODE_SMASK)
+                                       >> SDMA_DESC1_HEADER_MODE_SHIFT));
+               head = (head + 1) & sde->sdma_mask;
+       }
+}
+
+/*
+ * add the generation number into
+ * the qw1 and return
+ */
+static inline u64 add_gen(struct sdma_engine *sde, u64 qw1)
+{
+       u8 generation = (sde->descq_tail >> sde->sdma_shift) & 3;
+
+       qw1 &= ~SDMA_DESC1_GENERATION_SMASK;
+       qw1 |= ((u64)generation & SDMA_DESC1_GENERATION_MASK)
+                       << SDMA_DESC1_GENERATION_SHIFT;
+       return qw1;
+}
+
+/*
+ * This routine submits the indicated tx
+ *
+ * Space has already been guaranteed and
+ * tail side of ring is locked.
+ *
+ * The hardware tail update is done
+ * in the caller and that is facilitated
+ * by returning the new tail.
+ *
+ * There is special case logic for ahg
+ * to not add the generation number for
+ * up to 2 descriptors that follow the
+ * first descriptor.
+ *
+ */
+static inline u16 submit_tx(struct sdma_engine *sde, struct sdma_txreq *tx)
+{
+       int i;
+       u16 tail;
+       struct sdma_desc *descp = tx->descp;
+       u8 skip = 0, mode = ahg_mode(tx);
+
+       tail = sde->descq_tail & sde->sdma_mask;
+       sde->descq[tail].qw[0] = cpu_to_le64(descp->qw[0]);
+       sde->descq[tail].qw[1] = cpu_to_le64(add_gen(sde, descp->qw[1]));
+       trace_hfi1_sdma_descriptor(sde, descp->qw[0], descp->qw[1],
+                                  tail, &sde->descq[tail]);
+       tail = ++sde->descq_tail & sde->sdma_mask;
+       descp++;
+       if (mode > SDMA_AHG_APPLY_UPDATE1)
+               skip = mode >> 1;
+       for (i = 1; i < tx->num_desc; i++, descp++) {
+               u64 qw1;
+
+               sde->descq[tail].qw[0] = cpu_to_le64(descp->qw[0]);
+               if (skip) {
+                       /* edits don't have generation */
+                       qw1 = descp->qw[1];
+                       skip--;
+               } else {
+                       /* replace generation with real one for non-edits */
+                       qw1 = add_gen(sde, descp->qw[1]);
+               }
+               sde->descq[tail].qw[1] = cpu_to_le64(qw1);
+               trace_hfi1_sdma_descriptor(sde, descp->qw[0], qw1,
+                                          tail, &sde->descq[tail]);
+               tail = ++sde->descq_tail & sde->sdma_mask;
+       }
+       tx->next_descq_idx = tail;
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+       tx->sn = sde->tail_sn++;
+       trace_hfi1_sdma_in_sn(sde, tx->sn);
+       WARN_ON_ONCE(sde->tx_ring[sde->tx_tail & sde->sdma_mask]);
+#endif
+       sde->tx_ring[sde->tx_tail++ & sde->sdma_mask] = tx;
+       sde->desc_avail -= tx->num_desc;
+       return tail;
+}
+
+/*
+ * Check for progress
+ */
+static int sdma_check_progress(
+       struct sdma_engine *sde,
+       struct iowait *wait,
+       struct sdma_txreq *tx)
+{
+       int ret;
+
+       sde->desc_avail = sdma_descq_freecnt(sde);
+       if (tx->num_desc <= sde->desc_avail)
+               return -EAGAIN;
+       /* pulse the head_lock */
+       if (wait && wait->sleep) {
+               unsigned seq;
+
+               seq = raw_seqcount_begin(
+                       (const seqcount_t *)&sde->head_lock.seqcount);
+               ret = wait->sleep(sde, wait, tx, seq);
+               if (ret == -EAGAIN)
+                       sde->desc_avail = sdma_descq_freecnt(sde);
+       } else
+               ret = -EBUSY;
+       return ret;
+}
+
+/**
+ * sdma_send_txreq() - submit a tx req to ring
+ * @sde: sdma engine to use
+ * @wait: wait structure to use when full (may be NULL)
+ * @tx: sdma_txreq to submit
+ *
+ * The call submits the tx into the ring.  If an iowait structure is non-NULL,
+ * the packet will be queued to the list in wait.
+ *
+ * Return:
+ * 0 - Success, -EINVAL - sdma_txreq incomplete, -EBUSY - no space in
+ * ring (wait == NULL)
+ * -EIOCBQUEUED - tx queued to iowait, -ECOMM bad sdma state
+ */
+int sdma_send_txreq(struct sdma_engine *sde,
+                   struct iowait *wait,
+                   struct sdma_txreq *tx)
+{
+       int ret = 0;
+       u16 tail;
+       unsigned long flags;
+
+       /* user should have supplied entire packet */
+       if (unlikely(tx->tlen))
+               return -EINVAL;
+       tx->wait = wait;
+       spin_lock_irqsave(&sde->tail_lock, flags);
+retry:
+       if (unlikely(!__sdma_running(sde)))
+               goto unlock_noconn;
+       if (unlikely(tx->num_desc > sde->desc_avail))
+               goto nodesc;
+       tail = submit_tx(sde, tx);
+       if (wait)
+               atomic_inc(&wait->sdma_busy);
+       sdma_update_tail(sde, tail);
+unlock:
+       spin_unlock_irqrestore(&sde->tail_lock, flags);
+       return ret;
+unlock_noconn:
+       if (wait)
+               atomic_inc(&wait->sdma_busy);
+       tx->next_descq_idx = 0;
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+       tx->sn = sde->tail_sn++;
+       trace_hfi1_sdma_in_sn(sde, tx->sn);
+#endif
+       spin_lock(&sde->flushlist_lock);
+       list_add_tail(&tx->list, &sde->flushlist);
+       spin_unlock(&sde->flushlist_lock);
+       if (wait) {
+               wait->tx_count++;
+               wait->count += tx->num_desc;
+       }
+       schedule_work(&sde->flush_worker);
+       ret = -ECOMM;
+       goto unlock;
+nodesc:
+       ret = sdma_check_progress(sde, wait, tx);
+       if (ret == -EAGAIN) {
+               ret = 0;
+               goto retry;
+       }
+       sde->descq_full_count++;
+       goto unlock;
+}
+
+/**
+ * sdma_send_txlist() - submit a list of tx req to ring
+ * @sde: sdma engine to use
+ * @wait: wait structure to use when full (may be NULL)
+ * @tx_list: list of sdma_txreqs to submit
+ *
+ * The call submits the list into the ring.
+ *
+ * If the iowait structure is non-NULL and not equal to the iowait list,
+ * the unprocessed part of the list will be appended to the list in wait.
+ *
+ * In all cases, the tx_list will be updated so the head of the tx_list is
+ * the list of descriptors that have yet to be transmitted.
+ *
+ * The intent of this call is to provide a more efficient
+ * way of submitting multiple packets to SDMA while holding the tail
+ * side locking.
+ *
+ * Return:
+ * 0 - Success, -EINVAL - sdma_txreq incomplete, -EBUSY - no space in ring
+ * (wait == NULL)
+ * -EIOCBQUEUED - tx queued to iowait, -ECOMM bad sdma state
+ */
+int sdma_send_txlist(struct sdma_engine *sde,
+                   struct iowait *wait,
+                   struct list_head *tx_list)
+{
+       struct sdma_txreq *tx, *tx_next;
+       int ret = 0;
+       unsigned long flags;
+       u16 tail = INVALID_TAIL;
+       int count = 0;
+
+       spin_lock_irqsave(&sde->tail_lock, flags);
+retry:
+       list_for_each_entry_safe(tx, tx_next, tx_list, list) {
+               tx->wait = wait;
+               if (unlikely(!__sdma_running(sde)))
+                       goto unlock_noconn;
+               if (unlikely(tx->num_desc > sde->desc_avail))
+                       goto nodesc;
+               if (unlikely(tx->tlen)) {
+                       ret = -EINVAL;
+                       goto update_tail;
+               }
+               list_del_init(&tx->list);
+               tail = submit_tx(sde, tx);
+               count++;
+               if (tail != INVALID_TAIL &&
+                   (count & SDMA_TAIL_UPDATE_THRESH) == 0) {
+                       sdma_update_tail(sde, tail);
+                       tail = INVALID_TAIL;
+               }
+       }
+update_tail:
+       if (wait)
+               atomic_add(count, &wait->sdma_busy);
+       if (tail != INVALID_TAIL)
+               sdma_update_tail(sde, tail);
+       spin_unlock_irqrestore(&sde->tail_lock, flags);
+       return ret;
+unlock_noconn:
+       spin_lock(&sde->flushlist_lock);
+       list_for_each_entry_safe(tx, tx_next, tx_list, list) {
+               tx->wait = wait;
+               list_del_init(&tx->list);
+               if (wait)
+                       atomic_inc(&wait->sdma_busy);
+               tx->next_descq_idx = 0;
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+               tx->sn = sde->tail_sn++;
+               trace_hfi1_sdma_in_sn(sde, tx->sn);
+#endif
+               list_add_tail(&tx->list, &sde->flushlist);
+               if (wait) {
+                       wait->tx_count++;
+                       wait->count += tx->num_desc;
+               }
+       }
+       spin_unlock(&sde->flushlist_lock);
+       schedule_work(&sde->flush_worker);
+       ret = -ECOMM;
+       goto update_tail;
+nodesc:
+       ret = sdma_check_progress(sde, wait, tx);
+       if (ret == -EAGAIN) {
+               ret = 0;
+               goto retry;
+       }
+       sde->descq_full_count++;
+       goto update_tail;
+}
+
+static void sdma_process_event(struct sdma_engine *sde,
+       enum sdma_events event)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&sde->tail_lock, flags);
+       write_seqlock(&sde->head_lock);
+
+       __sdma_process_event(sde, event);
+
+       if (sde->state.current_state == sdma_state_s99_running)
+               sdma_desc_avail(sde, sdma_descq_freecnt(sde));
+
+       write_sequnlock(&sde->head_lock);
+       spin_unlock_irqrestore(&sde->tail_lock, flags);
+}
+
+static void __sdma_process_event(struct sdma_engine *sde,
+       enum sdma_events event)
+{
+       struct sdma_state *ss = &sde->state;
+       int need_progress = 0;
+
+       /* CONFIG SDMA temporary */
+#ifdef CONFIG_SDMA_VERBOSITY
+       dd_dev_err(sde->dd, "CONFIG SDMA(%u) [%s] %s\n", sde->this_idx,
+                  sdma_state_names[ss->current_state],
+                  sdma_event_names[event]);
+#endif
+
+       switch (ss->current_state) {
+       case sdma_state_s00_hw_down:
+               switch (event) {
+               case sdma_event_e00_go_hw_down:
+                       break;
+               case sdma_event_e30_go_running:
+                       /*
+                        * If down, but running is requested (usually the
+                        * result of a link up), then we need to start up.
+                        * This can happen when hw down is requested while
+                        * bringing the link up with traffic active on,
+                        * e.g., the 7220.
+                        */
+                       ss->go_s99_running = 1;
+                       /* fall through and start dma engine */
+               case sdma_event_e10_go_hw_start:
+                       /* This reference means the state machine is started */
+                       sdma_get(&sde->state);
+                       sdma_set_state(sde,
+                               sdma_state_s10_hw_start_up_halt_wait);
+                       break;
+               case sdma_event_e15_hw_halt_done:
+                       break;
+               case sdma_event_e25_hw_clean_up_done:
+                       break;
+               case sdma_event_e40_sw_cleaned:
+                       sdma_sw_tear_down(sde);
+                       break;
+               case sdma_event_e50_hw_cleaned:
+                       break;
+               case sdma_event_e60_hw_halted:
+                       break;
+               case sdma_event_e70_go_idle:
+                       break;
+               case sdma_event_e80_hw_freeze:
+                       break;
+               case sdma_event_e81_hw_frozen:
+                       break;
+               case sdma_event_e82_hw_unfreeze:
+                       break;
+               case sdma_event_e85_link_down:
+                       break;
+               case sdma_event_e90_sw_halted:
+                       break;
+               }
+               break;
+
+       case sdma_state_s10_hw_start_up_halt_wait:
+               switch (event) {
+               case sdma_event_e00_go_hw_down:
+                       sdma_set_state(sde, sdma_state_s00_hw_down);
+                       sdma_sw_tear_down(sde);
+                       break;
+               case sdma_event_e10_go_hw_start:
+                       break;
+               case sdma_event_e15_hw_halt_done:
+                       sdma_set_state(sde,
+                               sdma_state_s15_hw_start_up_clean_wait);
+                       sdma_start_hw_clean_up(sde);
+                       break;
+               case sdma_event_e25_hw_clean_up_done:
+                       break;
+               case sdma_event_e30_go_running:
+                       ss->go_s99_running = 1;
+                       break;
+               case sdma_event_e40_sw_cleaned:
+                       break;
+               case sdma_event_e50_hw_cleaned:
+                       break;
+               case sdma_event_e60_hw_halted:
+                       sdma_start_err_halt_wait(sde);
+                       break;
+               case sdma_event_e70_go_idle:
+                       ss->go_s99_running = 0;
+                       break;
+               case sdma_event_e80_hw_freeze:
+                       break;
+               case sdma_event_e81_hw_frozen:
+                       break;
+               case sdma_event_e82_hw_unfreeze:
+                       break;
+               case sdma_event_e85_link_down:
+                       break;
+               case sdma_event_e90_sw_halted:
+                       break;
+               }
+               break;
+
+       case sdma_state_s15_hw_start_up_clean_wait:
+               switch (event) {
+               case sdma_event_e00_go_hw_down:
+                       sdma_set_state(sde, sdma_state_s00_hw_down);
+                       sdma_sw_tear_down(sde);
+                       break;
+               case sdma_event_e10_go_hw_start:
+                       break;
+               case sdma_event_e15_hw_halt_done:
+                       break;
+               case sdma_event_e25_hw_clean_up_done:
+                       sdma_hw_start_up(sde);
+                       sdma_set_state(sde, ss->go_s99_running ?
+                                      sdma_state_s99_running :
+                                      sdma_state_s20_idle);
+                       break;
+               case sdma_event_e30_go_running:
+                       ss->go_s99_running = 1;
+                       break;
+               case sdma_event_e40_sw_cleaned:
+                       break;
+               case sdma_event_e50_hw_cleaned:
+                       break;
+               case sdma_event_e60_hw_halted:
+                       break;
+               case sdma_event_e70_go_idle:
+                       ss->go_s99_running = 0;
+                       break;
+               case sdma_event_e80_hw_freeze:
+                       break;
+               case sdma_event_e81_hw_frozen:
+                       break;
+               case sdma_event_e82_hw_unfreeze:
+                       break;
+               case sdma_event_e85_link_down:
+                       break;
+               case sdma_event_e90_sw_halted:
+                       break;
+               }
+               break;
+
+       case sdma_state_s20_idle:
+               switch (event) {
+               case sdma_event_e00_go_hw_down:
+                       sdma_set_state(sde, sdma_state_s00_hw_down);
+                       sdma_sw_tear_down(sde);
+                       break;
+               case sdma_event_e10_go_hw_start:
+                       break;
+               case sdma_event_e15_hw_halt_done:
+                       break;
+               case sdma_event_e25_hw_clean_up_done:
+                       break;
+               case sdma_event_e30_go_running:
+                       sdma_set_state(sde, sdma_state_s99_running);
+                       ss->go_s99_running = 1;
+                       break;
+               case sdma_event_e40_sw_cleaned:
+                       break;
+               case sdma_event_e50_hw_cleaned:
+                       break;
+               case sdma_event_e60_hw_halted:
+                       sdma_set_state(sde, sdma_state_s50_hw_halt_wait);
+                       sdma_start_err_halt_wait(sde);
+                       break;
+               case sdma_event_e70_go_idle:
+                       break;
+               case sdma_event_e85_link_down:
+                       /* fall through */
+               case sdma_event_e80_hw_freeze:
+                       sdma_set_state(sde, sdma_state_s80_hw_freeze);
+                       atomic_dec(&sde->dd->sdma_unfreeze_count);
+                       wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
+                       break;
+               case sdma_event_e81_hw_frozen:
+                       break;
+               case sdma_event_e82_hw_unfreeze:
+                       break;
+               case sdma_event_e90_sw_halted:
+                       break;
+               }
+               break;
+
+       case sdma_state_s30_sw_clean_up_wait:
+               switch (event) {
+               case sdma_event_e00_go_hw_down:
+                       sdma_set_state(sde, sdma_state_s00_hw_down);
+                       break;
+               case sdma_event_e10_go_hw_start:
+                       break;
+               case sdma_event_e15_hw_halt_done:
+                       break;
+               case sdma_event_e25_hw_clean_up_done:
+                       break;
+               case sdma_event_e30_go_running:
+                       ss->go_s99_running = 1;
+                       break;
+               case sdma_event_e40_sw_cleaned:
+                       sdma_set_state(sde, sdma_state_s40_hw_clean_up_wait);
+                       sdma_start_hw_clean_up(sde);
+                       break;
+               case sdma_event_e50_hw_cleaned:
+                       break;
+               case sdma_event_e60_hw_halted:
+                       break;
+               case sdma_event_e70_go_idle:
+                       ss->go_s99_running = 0;
+                       break;
+               case sdma_event_e80_hw_freeze:
+                       break;
+               case sdma_event_e81_hw_frozen:
+                       break;
+               case sdma_event_e82_hw_unfreeze:
+                       break;
+               case sdma_event_e85_link_down:
+                       ss->go_s99_running = 0;
+                       break;
+               case sdma_event_e90_sw_halted:
+                       break;
+               }
+               break;
+
+       case sdma_state_s40_hw_clean_up_wait:
+               switch (event) {
+               case sdma_event_e00_go_hw_down:
+                       sdma_set_state(sde, sdma_state_s00_hw_down);
+                       sdma_start_sw_clean_up(sde);
+                       break;
+               case sdma_event_e10_go_hw_start:
+                       break;
+               case sdma_event_e15_hw_halt_done:
+                       break;
+               case sdma_event_e25_hw_clean_up_done:
+                       sdma_hw_start_up(sde);
+                       sdma_set_state(sde, ss->go_s99_running ?
+                                      sdma_state_s99_running :
+                                      sdma_state_s20_idle);
+                       break;
+               case sdma_event_e30_go_running:
+                       ss->go_s99_running = 1;
+                       break;
+               case sdma_event_e40_sw_cleaned:
+                       break;
+               case sdma_event_e50_hw_cleaned:
+                       break;
+               case sdma_event_e60_hw_halted:
+                       break;
+               case sdma_event_e70_go_idle:
+                       ss->go_s99_running = 0;
+                       break;
+               case sdma_event_e80_hw_freeze:
+                       break;
+               case sdma_event_e81_hw_frozen:
+                       break;
+               case sdma_event_e82_hw_unfreeze:
+                       break;
+               case sdma_event_e85_link_down:
+                       ss->go_s99_running = 0;
+                       break;
+               case sdma_event_e90_sw_halted:
+                       break;
+               }
+               break;
+
+       case sdma_state_s50_hw_halt_wait:
+               switch (event) {
+               case sdma_event_e00_go_hw_down:
+                       sdma_set_state(sde, sdma_state_s00_hw_down);
+                       sdma_start_sw_clean_up(sde);
+                       break;
+               case sdma_event_e10_go_hw_start:
+                       break;
+               case sdma_event_e15_hw_halt_done:
+                       sdma_set_state(sde, sdma_state_s30_sw_clean_up_wait);
+                       sdma_start_sw_clean_up(sde);
+                       break;
+               case sdma_event_e25_hw_clean_up_done:
+                       break;
+               case sdma_event_e30_go_running:
+                       ss->go_s99_running = 1;
+                       break;
+               case sdma_event_e40_sw_cleaned:
+                       break;
+               case sdma_event_e50_hw_cleaned:
+                       break;
+               case sdma_event_e60_hw_halted:
+                       sdma_start_err_halt_wait(sde);
+                       break;
+               case sdma_event_e70_go_idle:
+                       ss->go_s99_running = 0;
+                       break;
+               case sdma_event_e80_hw_freeze:
+                       break;
+               case sdma_event_e81_hw_frozen:
+                       break;
+               case sdma_event_e82_hw_unfreeze:
+                       break;
+               case sdma_event_e85_link_down:
+                       ss->go_s99_running = 0;
+                       break;
+               case sdma_event_e90_sw_halted:
+                       break;
+               }
+               break;
+
+       case sdma_state_s60_idle_halt_wait:
+               switch (event) {
+               case sdma_event_e00_go_hw_down:
+                       sdma_set_state(sde, sdma_state_s00_hw_down);
+                       sdma_start_sw_clean_up(sde);
+                       break;
+               case sdma_event_e10_go_hw_start:
+                       break;
+               case sdma_event_e15_hw_halt_done:
+                       sdma_set_state(sde, sdma_state_s30_sw_clean_up_wait);
+                       sdma_start_sw_clean_up(sde);
+                       break;
+               case sdma_event_e25_hw_clean_up_done:
+                       break;
+               case sdma_event_e30_go_running:
+                       ss->go_s99_running = 1;
+                       break;
+               case sdma_event_e40_sw_cleaned:
+                       break;
+               case sdma_event_e50_hw_cleaned:
+                       break;
+               case sdma_event_e60_hw_halted:
+                       sdma_start_err_halt_wait(sde);
+                       break;
+               case sdma_event_e70_go_idle:
+                       ss->go_s99_running = 0;
+                       break;
+               case sdma_event_e80_hw_freeze:
+                       break;
+               case sdma_event_e81_hw_frozen:
+                       break;
+               case sdma_event_e82_hw_unfreeze:
+                       break;
+               case sdma_event_e85_link_down:
+                       break;
+               case sdma_event_e90_sw_halted:
+                       break;
+               }
+               break;
+
+       case sdma_state_s80_hw_freeze:
+               switch (event) {
+               case sdma_event_e00_go_hw_down:
+                       sdma_set_state(sde, sdma_state_s00_hw_down);
+                       sdma_start_sw_clean_up(sde);
+                       break;
+               case sdma_event_e10_go_hw_start:
+                       break;
+               case sdma_event_e15_hw_halt_done:
+                       break;
+               case sdma_event_e25_hw_clean_up_done:
+                       break;
+               case sdma_event_e30_go_running:
+                       ss->go_s99_running = 1;
+                       break;
+               case sdma_event_e40_sw_cleaned:
+                       break;
+               case sdma_event_e50_hw_cleaned:
+                       break;
+               case sdma_event_e60_hw_halted:
+                       break;
+               case sdma_event_e70_go_idle:
+                       ss->go_s99_running = 0;
+                       break;
+               case sdma_event_e80_hw_freeze:
+                       break;
+               case sdma_event_e81_hw_frozen:
+                       sdma_set_state(sde, sdma_state_s82_freeze_sw_clean);
+                       sdma_start_sw_clean_up(sde);
+                       break;
+               case sdma_event_e82_hw_unfreeze:
+                       break;
+               case sdma_event_e85_link_down:
+                       break;
+               case sdma_event_e90_sw_halted:
+                       break;
+               }
+               break;
+
+       case sdma_state_s82_freeze_sw_clean:
+               switch (event) {
+               case sdma_event_e00_go_hw_down:
+                       sdma_set_state(sde, sdma_state_s00_hw_down);
+                       sdma_start_sw_clean_up(sde);
+                       break;
+               case sdma_event_e10_go_hw_start:
+                       break;
+               case sdma_event_e15_hw_halt_done:
+                       break;
+               case sdma_event_e25_hw_clean_up_done:
+                       break;
+               case sdma_event_e30_go_running:
+                       ss->go_s99_running = 1;
+                       break;
+               case sdma_event_e40_sw_cleaned:
+                       /* notify caller this engine is done cleaning */
+                       atomic_dec(&sde->dd->sdma_unfreeze_count);
+                       wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
+                       break;
+               case sdma_event_e50_hw_cleaned:
+                       break;
+               case sdma_event_e60_hw_halted:
+                       break;
+               case sdma_event_e70_go_idle:
+                       ss->go_s99_running = 0;
+                       break;
+               case sdma_event_e80_hw_freeze:
+                       break;
+               case sdma_event_e81_hw_frozen:
+                       break;
+               case sdma_event_e82_hw_unfreeze:
+                       sdma_hw_start_up(sde);
+                       sdma_set_state(sde, ss->go_s99_running ?
+                                      sdma_state_s99_running :
+                                      sdma_state_s20_idle);
+                       break;
+               case sdma_event_e85_link_down:
+                       break;
+               case sdma_event_e90_sw_halted:
+                       break;
+               }
+               break;
+
+       case sdma_state_s99_running:
+               switch (event) {
+               case sdma_event_e00_go_hw_down:
+                       sdma_set_state(sde, sdma_state_s00_hw_down);
+                       sdma_start_sw_clean_up(sde);
+                       break;
+               case sdma_event_e10_go_hw_start:
+                       break;
+               case sdma_event_e15_hw_halt_done:
+                       break;
+               case sdma_event_e25_hw_clean_up_done:
+                       break;
+               case sdma_event_e30_go_running:
+                       break;
+               case sdma_event_e40_sw_cleaned:
+                       break;
+               case sdma_event_e50_hw_cleaned:
+                       break;
+               case sdma_event_e60_hw_halted:
+                       need_progress = 1;
+                       sdma_err_progress_check_schedule(sde);
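+                       /* fall through */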
+               case sdma_event_e90_sw_halted:
+                       /*
+                        * A SW-initiated halt does not perform the engine
+                        * progress check.
+                        */
+                       sdma_set_state(sde, sdma_state_s50_hw_halt_wait);
+                       sdma_start_err_halt_wait(sde);
+                       break;
+               case sdma_event_e70_go_idle:
+                       sdma_set_state(sde, sdma_state_s60_idle_halt_wait);
+                       break;
+               case sdma_event_e85_link_down:
+                       ss->go_s99_running = 0;
+                       /* fall through */
+               case sdma_event_e80_hw_freeze:
+                       sdma_set_state(sde, sdma_state_s80_hw_freeze);
+                       atomic_dec(&sde->dd->sdma_unfreeze_count);
+                       wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
+                       break;
+               case sdma_event_e81_hw_frozen:
+                       break;
+               case sdma_event_e82_hw_unfreeze:
+                       break;
+               }
+               break;
+       }
+
+       ss->last_event = event;
+       if (need_progress)
+               sdma_make_progress(sde, 0);
+}
+
+/*
+ * _extend_sdma_tx_descs() - helper to extend txreq
+ *
+ * This is called once the initial nominal allocation
+ * of descriptors in the sdma_txreq is exhausted.
+ *
+ * The code will bump the allocation up to the max
+ * of MAX_DESC (64) descriptors.  There doesn't seem to be
+ * much point in an interim step.
+ *
+ */
+int _extend_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx)
+{
+       int i;
+
+       tx->descp = kmalloc_array(
+                       MAX_DESC,
+                       sizeof(struct sdma_desc),
+                       GFP_ATOMIC);
+       if (!tx->descp)
+               return -ENOMEM;
+       tx->desc_limit = MAX_DESC;
+       /* copy ones already built */
+       for (i = 0; i < tx->num_desc; i++)
+               tx->descp[i] = tx->descs[i];
+       return 0;
+}
+
+/* Update sdes when the lmc changes */
+void sdma_update_lmc(struct hfi1_devdata *dd, u64 mask, u32 lid)
+{
+       struct sdma_engine *sde;
+       int i;
+       u64 sreg;
+
+       sreg = ((mask & SD(CHECK_SLID_MASK_MASK)) <<
+               SD(CHECK_SLID_MASK_SHIFT)) |
+               (((lid & mask) & SD(CHECK_SLID_VALUE_MASK)) <<
+               SD(CHECK_SLID_VALUE_SHIFT));
+
+       for (i = 0; i < dd->num_sdma; i++) {
+               hfi1_cdbg(LINKVERB, "SendDmaEngine[%d].SLID_CHECK = 0x%x",
+                         i, (u32)sreg);
+               sde = &dd->per_sdma[i];
+               write_sde_csr(sde, SD(CHECK_SLID), sreg);
+       }
+}
+
+/* tx not dword sized - pad */
+int _pad_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx)
+{
+       int rval = 0;
+
+       if ((unlikely(tx->num_desc == tx->desc_limit))) {
+               rval = _extend_sdma_tx_descs(dd, tx);
+               if (rval)
+                       return rval;
+       }
+       /* finish the one just added  */
+       tx->num_desc++;
+       make_tx_sdma_desc(
+               tx,
+               SDMA_MAP_NONE,
+               dd->sdma_pad_phys,
+               sizeof(u32) - (tx->packet_len & (sizeof(u32) - 1)));
+       _sdma_close_tx(dd, tx);
+       return rval;
+}
+
+/*
+ * Add ahg to the sdma_txreq
+ *
+ * The logic will consume up to 3
+ * descriptors at the beginning of
+ * sdma_txreq.
+ */
+void _sdma_txreq_ahgadd(
+       struct sdma_txreq *tx,
+       u8 num_ahg,
+       u8 ahg_entry,
+       u32 *ahg,
+       u8 ahg_hlen)
+{
+       u32 i, shift = 0, desc = 0;
+       u8 mode;
+
+       WARN_ON_ONCE(num_ahg > 9 || (ahg_hlen & 3) || ahg_hlen == 4);
+       /* compute mode */
+       if (num_ahg == 1)
+               mode = SDMA_AHG_APPLY_UPDATE1;
+       else if (num_ahg <= 5)
+               mode = SDMA_AHG_APPLY_UPDATE2;
+       else
+               mode = SDMA_AHG_APPLY_UPDATE3;
+       tx->num_desc++;
+       /* initialize the consumed descriptors to zero */
+       switch (mode) {
+       case SDMA_AHG_APPLY_UPDATE3:
+               tx->num_desc++;
+               tx->descs[2].qw[0] = 0;
+               tx->descs[2].qw[1] = 0;
+               /* FALLTHROUGH */
+       case SDMA_AHG_APPLY_UPDATE2:
+               tx->num_desc++;
+               tx->descs[1].qw[0] = 0;
+               tx->descs[1].qw[1] = 0;
+               break;
+       }
+       ahg_hlen >>= 2;
+       tx->descs[0].qw[1] |=
+               (((u64)ahg_entry & SDMA_DESC1_HEADER_INDEX_MASK)
+                       << SDMA_DESC1_HEADER_INDEX_SHIFT) |
+               (((u64)ahg_hlen & SDMA_DESC1_HEADER_DWS_MASK)
+                       << SDMA_DESC1_HEADER_DWS_SHIFT) |
+               (((u64)mode & SDMA_DESC1_HEADER_MODE_MASK)
+                       << SDMA_DESC1_HEADER_MODE_SHIFT) |
+               (((u64)ahg[0] & SDMA_DESC1_HEADER_UPDATE1_MASK)
+                       << SDMA_DESC1_HEADER_UPDATE1_SHIFT);
+       for (i = 0; i < (num_ahg - 1); i++) {
+               if (!shift && !(i & 2))
+                       desc++;
+               tx->descs[desc].qw[!!(i & 2)] |=
+                       (((u64)ahg[i + 1])
+                               << shift);
+               shift = (shift + 32) & 63;
+       }
+}
+
+/**
+ * sdma_ahg_alloc - allocate an AHG entry
+ * @sde: engine to allocate from
+ *
+ * Return:
+ * 0-31 when successful, -EOPNOTSUPP if AHG is not enabled,
+ * -ENOSPC if an entry is not available
+ */
+int sdma_ahg_alloc(struct sdma_engine *sde)
+{
+       int nr;
+       int oldbit;
+
+       if (!sde) {
+               trace_hfi1_ahg_allocate(sde, -EINVAL);
+               return -EINVAL;
+       }
+       while (1) {
+               nr = ffz(ACCESS_ONCE(sde->ahg_bits));
+               if (nr > 31) {
+                       trace_hfi1_ahg_allocate(sde, -ENOSPC);
+                       return -ENOSPC;
+               }
+               oldbit = test_and_set_bit(nr, &sde->ahg_bits);
+               if (!oldbit)
+                       break;
+               cpu_relax();
+       }
+       trace_hfi1_ahg_allocate(sde, nr);
+       return nr;
+}
+
+/**
+ * sdma_ahg_free - free an AHG entry
+ * @sde: engine to return AHG entry
+ * @ahg_index: index to free
+ *
+ * This routine frees the indicated AHG entry.
+ */
+void sdma_ahg_free(struct sdma_engine *sde, int ahg_index)
+{
+       if (!sde)
+               return;
+       trace_hfi1_ahg_deallocate(sde, ahg_index);
+       if (ahg_index < 0 || ahg_index > 31)
+               return;
+       clear_bit(ahg_index, &sde->ahg_bits);
+}
+
+/*
+ * SPC freeze handling for SDMA engines.  Called when the driver knows
+ * the SPC is going into a freeze but before the freeze is fully
+ * settled.  Generally an error interrupt.
+ *
+ * This event will pull the engine out of running so no more entries can be
+ * added to the engine's queue.
+ */
+void sdma_freeze_notify(struct hfi1_devdata *dd, int link_down)
+{
+       int i;
+       enum sdma_events event = link_down ? sdma_event_e85_link_down :
+                                            sdma_event_e80_hw_freeze;
+
+       /* set up the wait but do not wait here */
+       atomic_set(&dd->sdma_unfreeze_count, dd->num_sdma);
+
+       /* tell all engines to stop running and wait */
+       for (i = 0; i < dd->num_sdma; i++)
+               sdma_process_event(&dd->per_sdma[i], event);
+
+       /* sdma_freeze() will wait for all engines to have stopped */
+}
+
+/*
+ * SPC freeze handling for SDMA engines.  Called when the driver knows
+ * the SPC is fully frozen.
+ */
+void sdma_freeze(struct hfi1_devdata *dd)
+{
+       int i;
+       int ret;
+
+       /*
+        * Make sure all engines have moved out of the running state before
+        * continuing.
+        */
+       ret = wait_event_interruptible(dd->sdma_unfreeze_wq,
+                               atomic_read(&dd->sdma_unfreeze_count) <= 0);
+       /* interrupted or count is negative, then unloading - just exit */
+       if (ret || atomic_read(&dd->sdma_unfreeze_count) < 0)
+               return;
+
+       /* set up the count for the next wait */
+       atomic_set(&dd->sdma_unfreeze_count, dd->num_sdma);
+
+       /* tell all engines that the SPC is frozen, they can start cleaning */
+       for (i = 0; i < dd->num_sdma; i++)
+               sdma_process_event(&dd->per_sdma[i], sdma_event_e81_hw_frozen);
+
+       /*
+        * Wait for everyone to finish software clean before exiting.  The
+        * software clean will read engine CSRs, so must be completed before
+        * the next step, which will clear the engine CSRs.
+        */
+       (void) wait_event_interruptible(dd->sdma_unfreeze_wq,
+                               atomic_read(&dd->sdma_unfreeze_count) <= 0);
+       /* no need to check results - done no matter what */
+}
+
+/*
+ * SPC freeze handling for the SDMA engines.  Called after the SPC is unfrozen.
+ *
+ * The SPC freeze acts like a SDMA halt and a hardware clean combined.  All
+ * that is left is a software clean.  We could do it after the SPC is fully
+ * frozen, but then we'd have to add another state to wait for the unfreeze.
+ * Instead, just defer the software clean until the unfreeze step.
+ */
+void sdma_unfreeze(struct hfi1_devdata *dd)
+{
+       int i;
+
+       /* tell all engines to start freeze clean up */
+       for (i = 0; i < dd->num_sdma; i++)
+               sdma_process_event(&dd->per_sdma[i],
+                                       sdma_event_e82_hw_unfreeze);
+}
+
+/**
+ * _sdma_engine_progress_schedule() - schedule progress on engine
+ * @sde: sdma_engine to schedule progress
+ *
+ */
+void _sdma_engine_progress_schedule(
+       struct sdma_engine *sde)
+{
+       trace_hfi1_sdma_engine_progress(sde, sde->progress_mask);
+       /* assume we have selected a good cpu */
+       write_csr(sde->dd,
+                 CCE_INT_FORCE + (8*(IS_SDMA_START/64)), sde->progress_mask);
+}
diff --git a/drivers/staging/rdma/hfi1/sdma.h b/drivers/staging/rdma/hfi1/sdma.h
new file mode 100644 (file)
index 0000000..1e613fc
--- /dev/null
@@ -0,0 +1,1123 @@
+#ifndef _HFI1_SDMA_H
+#define _HFI1_SDMA_H
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <asm/byteorder.h>
+#include <linux/workqueue.h>
+#include <linux/rculist.h>
+
+#include "hfi.h"
+#include "verbs.h"
+
+/* increased for AHG */
+#define NUM_DESC 6
+/* Hardware limit */
+#define MAX_DESC 64
+/* Hardware limit for SDMA packet size */
+#define MAX_SDMA_PKT_SIZE ((16 * 1024) - 1)
+
+
+#define SDMA_TXREQ_S_OK        0
+#define SDMA_TXREQ_S_SENDERROR 1
+#define SDMA_TXREQ_S_ABORTED   2
+#define SDMA_TXREQ_S_SHUTDOWN  3
+
+/* flags bits */
+#define SDMA_TXREQ_F_URGENT       0x0001
+#define SDMA_TXREQ_F_AHG_COPY     0x0002
+#define SDMA_TXREQ_F_USE_AHG      0x0004
+
+#define SDMA_MAP_NONE          0
+#define SDMA_MAP_SINGLE        1
+#define SDMA_MAP_PAGE          2
+
+#define SDMA_AHG_VALUE_MASK          0xffff
+#define SDMA_AHG_VALUE_SHIFT         0
+#define SDMA_AHG_INDEX_MASK          0xf
+#define SDMA_AHG_INDEX_SHIFT         16
+#define SDMA_AHG_FIELD_LEN_MASK      0xf
+#define SDMA_AHG_FIELD_LEN_SHIFT     20
+#define SDMA_AHG_FIELD_START_MASK    0x1f
+#define SDMA_AHG_FIELD_START_SHIFT   24
+#define SDMA_AHG_UPDATE_ENABLE_MASK  0x1
+#define SDMA_AHG_UPDATE_ENABLE_SHIFT 31
+
+/* AHG modes */
+
+/*
+ * Be aware the ordering and values
+ * for SDMA_AHG_APPLY_UPDATE[123]
+ * are assumed in generating a skip
+ * count in submit_tx() in sdma.c
+ */
+#define SDMA_AHG_NO_AHG              0
+#define SDMA_AHG_COPY                1
+#define SDMA_AHG_APPLY_UPDATE1       2
+#define SDMA_AHG_APPLY_UPDATE2       3
+#define SDMA_AHG_APPLY_UPDATE3       4
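+
+/*
+ * For illustration (derived from submit_tx() in sdma.c): the skip count is
+ * computed as skip = mode >> 1, so APPLY_UPDATE2 (3) leaves one descriptor
+ * after the first without a generation and APPLY_UPDATE3 (4) leaves two.
+ */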
+
+/*
+ * Bits defined in the send DMA descriptor.
+ */
+#define SDMA_DESC0_FIRST_DESC_FLAG      (1ULL<<63)
+#define SDMA_DESC0_LAST_DESC_FLAG       (1ULL<<62)
+#define SDMA_DESC0_BYTE_COUNT_SHIFT     48
+#define SDMA_DESC0_BYTE_COUNT_WIDTH     14
+#define SDMA_DESC0_BYTE_COUNT_MASK \
+       ((1ULL<<SDMA_DESC0_BYTE_COUNT_WIDTH)-1ULL)
+#define SDMA_DESC0_BYTE_COUNT_SMASK \
+       (SDMA_DESC0_BYTE_COUNT_MASK<<SDMA_DESC0_BYTE_COUNT_SHIFT)
+#define SDMA_DESC0_PHY_ADDR_SHIFT       0
+#define SDMA_DESC0_PHY_ADDR_WIDTH       48
+#define SDMA_DESC0_PHY_ADDR_MASK \
+       ((1ULL<<SDMA_DESC0_PHY_ADDR_WIDTH)-1ULL)
+#define SDMA_DESC0_PHY_ADDR_SMASK \
+       (SDMA_DESC0_PHY_ADDR_MASK<<SDMA_DESC0_PHY_ADDR_SHIFT)
+
+#define SDMA_DESC1_HEADER_UPDATE1_SHIFT 32
+#define SDMA_DESC1_HEADER_UPDATE1_WIDTH 32
+#define SDMA_DESC1_HEADER_UPDATE1_MASK \
+       ((1ULL<<SDMA_DESC1_HEADER_UPDATE1_WIDTH)-1ULL)
+#define SDMA_DESC1_HEADER_UPDATE1_SMASK \
+       (SDMA_DESC1_HEADER_UPDATE1_MASK<<SDMA_DESC1_HEADER_UPDATE1_SHIFT)
+#define SDMA_DESC1_HEADER_MODE_SHIFT    13
+#define SDMA_DESC1_HEADER_MODE_WIDTH    3
+#define SDMA_DESC1_HEADER_MODE_MASK \
+       ((1ULL<<SDMA_DESC1_HEADER_MODE_WIDTH)-1ULL)
+#define SDMA_DESC1_HEADER_MODE_SMASK \
+       (SDMA_DESC1_HEADER_MODE_MASK<<SDMA_DESC1_HEADER_MODE_SHIFT)
+#define SDMA_DESC1_HEADER_INDEX_SHIFT   8
+#define SDMA_DESC1_HEADER_INDEX_WIDTH   5
+#define SDMA_DESC1_HEADER_INDEX_MASK \
+       ((1ULL<<SDMA_DESC1_HEADER_INDEX_WIDTH)-1ULL)
+#define SDMA_DESC1_HEADER_INDEX_SMASK \
+       (SDMA_DESC1_HEADER_INDEX_MASK<<SDMA_DESC1_HEADER_INDEX_SHIFT)
+#define SDMA_DESC1_HEADER_DWS_SHIFT     4
+#define SDMA_DESC1_HEADER_DWS_WIDTH     4
+#define SDMA_DESC1_HEADER_DWS_MASK \
+       ((1ULL<<SDMA_DESC1_HEADER_DWS_WIDTH)-1ULL)
+#define SDMA_DESC1_HEADER_DWS_SMASK \
+       (SDMA_DESC1_HEADER_DWS_MASK<<SDMA_DESC1_HEADER_DWS_SHIFT)
+#define SDMA_DESC1_GENERATION_SHIFT     2
+#define SDMA_DESC1_GENERATION_WIDTH     2
+#define SDMA_DESC1_GENERATION_MASK \
+       ((1ULL<<SDMA_DESC1_GENERATION_WIDTH)-1ULL)
+#define SDMA_DESC1_GENERATION_SMASK \
+       (SDMA_DESC1_GENERATION_MASK<<SDMA_DESC1_GENERATION_SHIFT)
+#define SDMA_DESC1_INT_REQ_FLAG         (1ULL<<1)
+#define SDMA_DESC1_HEAD_TO_HOST_FLAG    (1ULL<<0)
+
+enum sdma_states {
+       sdma_state_s00_hw_down,
+       sdma_state_s10_hw_start_up_halt_wait,
+       sdma_state_s15_hw_start_up_clean_wait,
+       sdma_state_s20_idle,
+       sdma_state_s30_sw_clean_up_wait,
+       sdma_state_s40_hw_clean_up_wait,
+       sdma_state_s50_hw_halt_wait,
+       sdma_state_s60_idle_halt_wait,
+       sdma_state_s80_hw_freeze,
+       sdma_state_s82_freeze_sw_clean,
+       sdma_state_s99_running,
+};
+
+enum sdma_events {
+       sdma_event_e00_go_hw_down,
+       sdma_event_e10_go_hw_start,
+       sdma_event_e15_hw_halt_done,
+       sdma_event_e25_hw_clean_up_done,
+       sdma_event_e30_go_running,
+       sdma_event_e40_sw_cleaned,
+       sdma_event_e50_hw_cleaned,
+       sdma_event_e60_hw_halted,
+       sdma_event_e70_go_idle,
+       sdma_event_e80_hw_freeze,
+       sdma_event_e81_hw_frozen,
+       sdma_event_e82_hw_unfreeze,
+       sdma_event_e85_link_down,
+       sdma_event_e90_sw_halted,
+};
+
+struct sdma_set_state_action {
+       unsigned op_enable:1;
+       unsigned op_intenable:1;
+       unsigned op_halt:1;
+       unsigned op_cleanup:1;
+       unsigned go_s99_running_tofalse:1;
+       unsigned go_s99_running_totrue:1;
+};
+
+struct sdma_state {
+       struct kref          kref;
+       struct completion    comp;
+       enum sdma_states current_state;
+       unsigned             current_op;
+       unsigned             go_s99_running;
+       /* debugging/development */
+       enum sdma_states previous_state;
+       unsigned             previous_op;
+       enum sdma_events last_event;
+};
+
+/**
+ * DOC: sdma exported routines
+ *
+ * These sdma routines fit into three categories:
+ * - The SDMA API for building and submitting packets
+ *   to the ring
+ *
+ * - Initialization and tear down routines to build up
+ *   and tear down SDMA
+ *
+ * - ISR entrances to handle interrupts, state changes
+ *   and errors
+ */
+
+/**
+ * DOC: sdma PSM/verbs API
+ *
+ * The sdma API is designed to be used by both PSM
+ * and verbs to supply packets to the SDMA ring.
+ *
+ * The usage of the API is as follows:
+ *
+ * Embed a struct iowait in the QP or
+ * PQ.  The iowait should be initialized with a
+ * call to iowait_init().
+ *
+ * The user of the API should create an allocation method
+ * for their version of the txreq.  Slabs, pre-allocated lists,
+ * and dma pools can be used.  Once the user's overload of
+ * the sdma_txreq has been allocated, the sdma_txreq member
+ * must be initialized with sdma_txinit() or sdma_txinit_ahg().
+ *
+ * The txreq must be declared with the sdma_txreq first.
+ *
+ * The tx request, once initialized,  is manipulated with calls to
+ * sdma_txadd_daddr(), sdma_txadd_page(), or sdma_txadd_kvaddr()
+ * for each disjoint memory location.  It is the user's responsibility
+ * to understand the packet boundaries and page boundaries to do the
+ * appropriate number of sdma_txadd_* calls.  The user
+ * must be prepared to deal with failures from these routines due to
+ * either memory allocation or dma_mapping failures.
+ *
+ * The mapping specifics for each memory location are recorded
+ * in the tx. Memory locations added with sdma_txadd_page()
+ * and sdma_txadd_kvaddr() are automatically mapped when added
+ * to the tx and unmapped as part of the progress processing in the
+ * SDMA interrupt handling.
+ *
+ * sdma_txadd_daddr() is used to add a dma_addr_t memory location to the
+ * tx.   An example of a use case would be a pre-allocated
+ * set of headers allocated via dma_pool_alloc() or
+ * dma_alloc_coherent().  For these memory locations, it
+ * is the responsibility of the user to handle that unmapping.
+ * (This would usually be at an unload or job termination.)
+ *
+ * The routine sdma_send_txreq() is used to submit
+ * a tx to the ring after the appropriate number of
+ * sdma_txadd_* have been done.
+ *
+ * If it is desired to send a burst of sdma_txreqs, sdma_send_txlist()
+ * can be used to submit a list of packets.
+ *
+ * The user is free to use the link overhead in the struct sdma_txreq as
+ * long as the tx isn't in flight.
+ *
+ * The extreme degenerate case of the number of descriptors
+ * exceeding the ring size is automatically handled as
+ * memory locations are added.  An overflow of the descriptor
+ * array that is part of the sdma_txreq is also automatically
+ * handled.
+ *
+ */
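+
+/*
+ * Illustrative sketch (not part of the driver): a minimal sender following
+ * the sequence described above.  The sdma_txinit(), sdma_txadd_kvaddr() and
+ * sdma_txclean() signatures are assumed from their declarations elsewhere
+ * in this header, and "struct example_txreq" is a hypothetical user overload
+ * that embeds the sdma_txreq as its first member.  A NULL callback is used,
+ * so no completion notification is requested.
+ *
+ *     struct example_txreq {
+ *             struct sdma_txreq txreq;
+ *             void *hdr;
+ *     };
+ *
+ *     static int example_send(struct hfi1_devdata *dd,
+ *                             struct sdma_engine *sde,
+ *                             struct iowait *wait,
+ *                             struct example_txreq *etx, u16 hdrlen)
+ *     {
+ *             int ret;
+ *
+ *             ret = sdma_txinit(&etx->txreq, 0, hdrlen, NULL);
+ *             if (ret)
+ *                     return ret;
+ *             ret = sdma_txadd_kvaddr(dd, &etx->txreq, etx->hdr, hdrlen);
+ *             if (ret) {
+ *                     sdma_txclean(dd, &etx->txreq);
+ *                     return ret;
+ *             }
+ *             return sdma_send_txreq(sde, wait, &etx->txreq);
+ *     }
+ */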
+
+/**
+ * DOC: Infrastructure calls
+ *
+ * sdma_init() is used to initialize data structures and
+ * CSRs for the desired number of SDMA engines.
+ *
+ * sdma_start() is used to kick the SDMA engines initialized
+ * with sdma_init().   Interrupts must be enabled at this
+ * point since aspects of the state machine are interrupt
+ * driven.
+ *
+ * sdma_engine_error() and sdma_engine_interrupt() are
+ * entrances for interrupts.
+ *
+ * sdma_map_init() is for the management of the mapping
+ * table when the number of vls is changed.
+ *
+ */
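+
+/*
+ * Illustrative sketch (not part of the driver): the expected ordering of the
+ * infrastructure calls above during a hypothetical bring-up and unload path,
+ * using the prototypes declared later in this header.
+ *
+ *     ret = sdma_init(dd, port);      set up engines, rings and CSRs
+ *     if (ret)
+ *             return ret;
+ *     ...                             enable SDMA interrupts here, since
+ *                                     the state machine is interrupt driven
+ *     sdma_start(dd);                 kick the initialized engines
+ *     ...                             normal operation
+ *     sdma_exit(dd);                  tear down on unload
+ */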
+
+/*
+ * struct hw_sdma_desc - raw 128 bit SDMA descriptor
+ *
+ * This is the raw descriptor in the SDMA ring
+ */
+struct hw_sdma_desc {
+       /* private:  don't use directly */
+       __le64 qw[2];
+};
+
+/*
+ * struct sdma_desc - canonical fragment descriptor
+ *
+ * This is the descriptor carried in the tx request
+ * corresponding to each fragment.
+ *
+ */
+struct sdma_desc {
+       /* private:  don't use directly */
+       u64 qw[2];
+};
+
+struct sdma_txreq;
+typedef void (*callback_t)(struct sdma_txreq *, int, int);
+
+/**
+ * struct sdma_txreq - the sdma_txreq structure (one per packet)
+ * @list: for use by user and by queuing for wait
+ *
+ * This is the representation of a packet which consists of some
+ * number of fragments.  Storage is provided within the structure
+ * for all fragments.
+ *
+ * The storage for the descriptors is automatically extended as needed
+ * when the current allocation is exceeded.
+ *
+ * The user (Verbs or PSM) may overload this structure with fields
+ * specific to their use by putting this struct first in their struct.
+ * The method of allocation of the overloaded structure is user dependent.
+ *
+ * The list is the only public field in the structure.
+ *
+ */
+
+struct sdma_txreq {
+       struct list_head list;
+       /* private: */
+       struct sdma_desc *descp;
+       /* private: */
+       void *coalesce_buf;
+       /* private: */
+       struct iowait *wait;
+       /* private: */
+       callback_t                  complete;
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+       u64 sn;
+#endif
+       /* private: - used in coalesce/pad processing */
+       u16                         packet_len;
+       /* private: - down-counted to trigger last */
+       u16                         tlen;
+       /* private: flags */
+       u16                         flags;
+       /* private: */
+       u16                         num_desc;
+       /* private: */
+       u16                         desc_limit;
+       /* private: */
+       u16                         next_descq_idx;
+       /* private: */
+       struct sdma_desc descs[NUM_DESC];
+};
+
+struct verbs_txreq {
+       struct hfi1_pio_header  phdr;
+       struct sdma_txreq       txreq;
+       struct hfi1_qp           *qp;
+       struct hfi1_swqe         *wqe;
+       struct hfi1_mregion     *mr;
+       struct hfi1_sge_state    *ss;
+       struct sdma_engine     *sde;
+       u16                     hdr_dwords;
+       u16                     hdr_inx;
+};
+
+/**
+ * struct sdma_engine - Data pertaining to each SDMA engine.
+ * @dd: a back-pointer to the device data
+ * @ppd: per port back-pointer
+ * @imask: mask for irq manipulation
+ * @idle_mask: mask for determining if an interrupt is due to sdma_idle
+ *
+ * This structure has the state for each sdma_engine.
+ *
+ * Access to non-public fields is not supported,
+ * since the private members are subject to change.
+ */
+struct sdma_engine {
+       /* read mostly */
+       struct hfi1_devdata *dd;
+       struct hfi1_pportdata *ppd;
+       /* private: */
+       void __iomem *tail_csr;
+       u64 imask;                      /* clear interrupt mask */
+       u64 idle_mask;
+       u64 progress_mask;
+       /* private: */
+       struct workqueue_struct *wq;
+       /* private: */
+       volatile __le64      *head_dma; /* DMA'ed by chip */
+       /* private: */
+       dma_addr_t            head_phys;
+       /* private: */
+       struct hw_sdma_desc *descq;
+       /* private: */
+       unsigned descq_full_count;
+       struct sdma_txreq **tx_ring;
+       /* private: */
+       dma_addr_t            descq_phys;
+       /* private */
+       u32 sdma_mask;
+       /* private */
+       struct sdma_state state;
+       /* private: */
+       u8 sdma_shift;
+       /* private: */
+       u8 this_idx; /* zero relative engine */
+       /* protect changes to senddmactrl shadow */
+       spinlock_t senddmactrl_lock;
+       /* private: */
+       u64 p_senddmactrl;              /* shadow per-engine SendDmaCtrl */
+
+       /* read/write using tail_lock */
+       spinlock_t            tail_lock ____cacheline_aligned_in_smp;
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+       /* private: */
+       u64                   tail_sn;
+#endif
+       /* private: */
+       u32                   descq_tail;
+       /* private: */
+       unsigned long         ahg_bits;
+       /* private: */
+       u16                   desc_avail;
+       /* private: */
+       u16                   tx_tail;
+       /* private: */
+       u16 descq_cnt;
+
+       /* read/write using head_lock */
+       /* private: */
+       seqlock_t            head_lock ____cacheline_aligned_in_smp;
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+       /* private: */
+       u64                   head_sn;
+#endif
+       /* private: */
+       u32                   descq_head;
+       /* private: */
+       u16                   tx_head;
+       /* private: */
+       u64                   last_status;
+
+       /* private: */
+       struct list_head      dmawait;
+
+       /* CONFIG SDMA for now, just blindly duplicate */
+       /* private: */
+       struct tasklet_struct sdma_hw_clean_up_task
+               ____cacheline_aligned_in_smp;
+
+       /* private: */
+       struct tasklet_struct sdma_sw_clean_up_task
+               ____cacheline_aligned_in_smp;
+       /* private: */
+       struct work_struct err_halt_worker;
+       /* private */
+       struct timer_list     err_progress_check_timer;
+       u32                   progress_check_head;
+       /* private: */
+       struct work_struct flush_worker;
+       spinlock_t flushlist_lock;
+       /* private: */
+       struct list_head flushlist;
+};
+
+
+int sdma_init(struct hfi1_devdata *dd, u8 port);
+void sdma_start(struct hfi1_devdata *dd);
+void sdma_exit(struct hfi1_devdata *dd);
+void sdma_all_running(struct hfi1_devdata *dd);
+void sdma_all_idle(struct hfi1_devdata *dd);
+void sdma_freeze_notify(struct hfi1_devdata *dd, int go_idle);
+void sdma_freeze(struct hfi1_devdata *dd);
+void sdma_unfreeze(struct hfi1_devdata *dd);
+void sdma_wait(struct hfi1_devdata *dd);
+
+/**
+ * sdma_empty() - idle engine test
+ * @engine: sdma engine
+ *
+ * Currently used by verbs as a latency optimization.
+ *
+ * Return:
+ * 1 - empty, 0 - non-empty
+ */
+static inline int sdma_empty(struct sdma_engine *sde)
+{
+       return sde->descq_tail == sde->descq_head;
+}
+
+static inline u16 sdma_descq_freecnt(struct sdma_engine *sde)
+{
+       return sde->descq_cnt -
+               (sde->descq_tail -
+                ACCESS_ONCE(sde->descq_head)) - 1;
+}
+
+static inline u16 sdma_descq_inprocess(struct sdma_engine *sde)
+{
+       return sde->descq_cnt - sdma_descq_freecnt(sde);
+}
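+
+/*
+ * For example: with descq_cnt = 128, descq_tail = 10 and descq_head = 5,
+ * five descriptors are outstanding, so sdma_descq_freecnt() returns
+ * 128 - (10 - 5) - 1 = 122 and sdma_descq_inprocess() returns
+ * 128 - 122 = 6 (the outstanding descriptors plus the one slot that is
+ * always kept free to distinguish a full ring from an empty one).
+ */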
+
+/*
+ * Either head_lock or tail_lock is required to see
+ * a steady state.
+ */
+static inline int __sdma_running(struct sdma_engine *engine)
+{
+       return engine->state.current_state == sdma_state_s99_running;
+}
+
+
+/**
+ * sdma_running() - state suitability test
+ * @engine: sdma engine
+ *
+ * sdma_running probes the internal state to determine if it is suitable
+ * for submitting packets.
+ *
+ * Return:
+ * 1 - ok to submit, 0 - not ok to submit
+ *
+ */
+static inline int sdma_running(struct sdma_engine *engine)
+{
+       unsigned long flags;
+       int ret;
+
+       spin_lock_irqsave(&engine->tail_lock, flags);
+       ret = __sdma_running(engine);
+       spin_unlock_irqrestore(&engine->tail_lock, flags);
+       return ret;
+}
+
+void _sdma_txreq_ahgadd(
+       struct sdma_txreq *tx,
+       u8 num_ahg,
+       u8 ahg_entry,
+       u32 *ahg,
+       u8 ahg_hlen);
+
+
+/**
+ * sdma_txinit_ahg() - initialize an sdma_txreq struct with AHG
+ * @tx: tx request to initialize
+ * @flags: flags to key last descriptor additions
+ * @tlen: total packet length (pbc + headers + data)
+ * @ahg_entry: ahg entry to use  (0 - 31)
+ * @num_ahg: number of AHG descriptors for the first descriptor (0 - 9)
+ * @ahg: array of AHG descriptors (up to 9 entries)
+ * @ahg_hlen: number of bytes from ASIC entry to use
+ * @cb: callback
+ *
+ * The allocation of the sdma_txreq and its enclosing structure is user
+ * dependent.  This routine must be called to initialize the user independent
+ * fields.
+ *
+ * The currently supported flags are SDMA_TXREQ_F_URGENT,
+ * SDMA_TXREQ_F_AHG_COPY, and SDMA_TXREQ_F_USE_AHG.
+ *
+ * SDMA_TXREQ_F_URGENT is used for latency sensitive situations where the
+ * completion is desired as soon as possible.
+ *
+ * SDMA_TXREQ_F_AHG_COPY causes the header in the first descriptor to be
+ * copied to chip entry. SDMA_TXREQ_F_USE_AHG causes the code to add in
+ * the AHG descriptors into the first 1 to 3 descriptors.
+ *
+ * Completions of submitted requests can be obtained for selected
+ * txreqs by giving a completion routine callback to sdma_txinit() or
+ * sdma_txinit_ahg().  The environment in which the callback runs
+ * can be from an ISR, a tasklet, or a thread, so no sleeping
+ * kernel routines can be used.   Aspects of the sdma ring may
+ * be locked so care should be taken with locking.
+ *
+ * The callback pointer can be NULL to avoid any callback for the packet
+ * being submitted. The callback will be provided this tx, a status, and a flag.
+ *
+ * The status will be one of SDMA_TXREQ_S_OK, SDMA_TXREQ_S_SENDERROR,
+ * SDMA_TXREQ_S_ABORTED, or SDMA_TXREQ_S_SHUTDOWN.
+ *
+ * The flag, if the iowait has been used, indicates that the iowait
+ * sdma_busy count has reached zero.
+ *
+ * The user data portion of tlen should be precise.   The sdma_txadd_* entrances
+ * will pad with a descriptor that references 1 - 3 bytes when the number of bytes
+ * specified in tlen has been supplied to the sdma_txreq.
+ *
+ * ahg_hlen is used to determine the number of on-chip entry bytes to
+ * use as the header.   This is for cases where the stored header is
+ * larger than the header to be used in a packet.  This is typical
+ * for verbs where an RDMA_WRITE_FIRST is larger than the packet in
+ * an RDMA_WRITE_MIDDLE.
+ *
+ */
+static inline int sdma_txinit_ahg(
+       struct sdma_txreq *tx,
+       u16 flags,
+       u16 tlen,
+       u8 ahg_entry,
+       u8 num_ahg,
+       u32 *ahg,
+       u8 ahg_hlen,
+       void (*cb)(struct sdma_txreq *, int, int))
+{
+       if (tlen == 0)
+               return -ENODATA;
+       if (tlen > MAX_SDMA_PKT_SIZE)
+               return -EMSGSIZE;
+       tx->desc_limit = ARRAY_SIZE(tx->descs);
+       tx->descp = &tx->descs[0];
+       INIT_LIST_HEAD(&tx->list);
+       tx->num_desc = 0;
+       tx->flags = flags;
+       tx->complete = cb;
+       tx->coalesce_buf = NULL;
+       tx->wait = NULL;
+       tx->tlen = tx->packet_len = tlen;
+       tx->descs[0].qw[0] = SDMA_DESC0_FIRST_DESC_FLAG;
+       tx->descs[0].qw[1] = 0;
+       if (flags & SDMA_TXREQ_F_AHG_COPY)
+               tx->descs[0].qw[1] |=
+                       (((u64)ahg_entry & SDMA_DESC1_HEADER_INDEX_MASK)
+                               << SDMA_DESC1_HEADER_INDEX_SHIFT) |
+                       (((u64)SDMA_AHG_COPY & SDMA_DESC1_HEADER_MODE_MASK)
+                               << SDMA_DESC1_HEADER_MODE_SHIFT);
+       else if (flags & SDMA_TXREQ_F_USE_AHG && num_ahg)
+               _sdma_txreq_ahgadd(tx, num_ahg, ahg_entry, ahg, ahg_hlen);
+       return 0;
+}
+
+/**
+ * sdma_txinit() - initialize an sdma_txreq struct (no AHG)
+ * @tx: tx request to initialize
+ * @flags: flags to key last descriptor additions
+ * @tlen: total packet length (pbc + headers + data)
+ * @cb: callback pointer
+ *
+ * The allocation of the sdma_txreq and its enclosing structure is user
+ * dependent.  This routine must be called to initialize the user
+ * independent fields.
+ *
+ * The currently supported flag is SDMA_TXREQ_F_URGENT.
+ *
+ * SDMA_TXREQ_F_URGENT is used for latency sensitive situations where the
+ * completion is desired as soon as possible.
+ *
+ * Completions of submitted requests can be obtained for selected
+ * txreqs by giving a completion routine callback to sdma_txinit() or
+ * sdma_txinit_ahg().  The environment in which the callback runs
+ * can be from an ISR, a tasklet, or a thread, so no sleeping
+ * kernel routines can be used.   Aspects of the sdma ring may
+ * be locked so care should be taken with locking.
+ *
+ * The callback pointer can be NULL to avoid any callback for the packet
+ * being submitted.
+ *
+ * The callback, if non-NULL,  will be provided this tx and a status.  The
+ * status will be one of SDMA_TXREQ_S_OK, SDMA_TXREQ_S_SENDERROR,
+ * SDMA_TXREQ_S_ABORTED, or SDMA_TXREQ_S_SHUTDOWN.
+ *
+ */
+static inline int sdma_txinit(
+       struct sdma_txreq *tx,
+       u16 flags,
+       u16 tlen,
+       void (*cb)(struct sdma_txreq *, int, int))
+{
+       return sdma_txinit_ahg(tx, flags, tlen, 0, 0, NULL, 0, cb);
+}
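+
+/*
+ * Illustrative sketch (hypothetical names, not part of the driver API):
+ * a caller that does not need AHG might set up a request as below, where
+ * my_tx and my_complete() are the caller's own txreq and callback.  A
+ * non-zero return is -ENODATA or -EMSGSIZE.
+ *
+ *     static void my_complete(struct sdma_txreq *tx, int status, int drained)
+ *     {
+ *             if (status != SDMA_TXREQ_S_OK)
+ *                     pr_debug("tx failed with status %d\n", status);
+ *     }
+ *
+ *     ret = sdma_txinit(&my_tx, SDMA_TXREQ_F_URGENT, tlen, my_complete);
+ *     if (ret)
+ *             return ret;
+ */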
+
+/* helpers - don't use */
+static inline int sdma_mapping_type(struct sdma_desc *d)
+{
+       return (d->qw[1] & SDMA_DESC1_GENERATION_SMASK)
+               >> SDMA_DESC1_GENERATION_SHIFT;
+}
+
+static inline size_t sdma_mapping_len(struct sdma_desc *d)
+{
+       return (d->qw[0] & SDMA_DESC0_BYTE_COUNT_SMASK)
+               >> SDMA_DESC0_BYTE_COUNT_SHIFT;
+}
+
+static inline dma_addr_t sdma_mapping_addr(struct sdma_desc *d)
+{
+       return (d->qw[0] & SDMA_DESC0_PHY_ADDR_SMASK)
+               >> SDMA_DESC0_PHY_ADDR_SHIFT;
+}
+
+static inline void make_tx_sdma_desc(
+       struct sdma_txreq *tx,
+       int type,
+       dma_addr_t addr,
+       size_t len)
+{
+       struct sdma_desc *desc = &tx->descp[tx->num_desc];
+
+       if (!tx->num_desc) {
+               /* first descriptor: qw[0] and the ahg mode in qw[1] were set by init; just OR in the type */
+               desc->qw[1] |= ((u64)type & SDMA_DESC1_GENERATION_MASK)
+                               << SDMA_DESC1_GENERATION_SHIFT;
+       } else {
+               desc->qw[0] = 0;
+               desc->qw[1] = ((u64)type & SDMA_DESC1_GENERATION_MASK)
+                               << SDMA_DESC1_GENERATION_SHIFT;
+       }
+       desc->qw[0] |= (((u64)addr & SDMA_DESC0_PHY_ADDR_MASK)
+                               << SDMA_DESC0_PHY_ADDR_SHIFT) |
+                       (((u64)len & SDMA_DESC0_BYTE_COUNT_MASK)
+                               << SDMA_DESC0_BYTE_COUNT_SHIFT);
+}
+
+/* helper to extend txreq */
+int _extend_sdma_tx_descs(struct hfi1_devdata *, struct sdma_txreq *);
+int _pad_sdma_tx_descs(struct hfi1_devdata *, struct sdma_txreq *);
+void sdma_txclean(struct hfi1_devdata *, struct sdma_txreq *);
+
+/* helpers used by public routines */
+static inline void _sdma_close_tx(struct hfi1_devdata *dd,
+                                 struct sdma_txreq *tx)
+{
+       tx->descp[tx->num_desc].qw[0] |=
+               SDMA_DESC0_LAST_DESC_FLAG;
+       tx->descp[tx->num_desc].qw[1] |=
+               dd->default_desc1;
+       if (tx->flags & SDMA_TXREQ_F_URGENT)
+               tx->descp[tx->num_desc].qw[1] |=
+                       (SDMA_DESC1_HEAD_TO_HOST_FLAG|
+                        SDMA_DESC1_INT_REQ_FLAG);
+}
+
+static inline int _sdma_txadd_daddr(
+       struct hfi1_devdata *dd,
+       int type,
+       struct sdma_txreq *tx,
+       dma_addr_t addr,
+       u16 len)
+{
+       int rval = 0;
+
+       if ((unlikely(tx->num_desc == tx->desc_limit))) {
+               rval = _extend_sdma_tx_descs(dd, tx);
+               if (rval)
+                       return rval;
+       }
+       make_tx_sdma_desc(
+               tx,
+               type,
+               addr, len);
+       WARN_ON(len > tx->tlen);
+       tx->tlen -= len;
+       /* special cases for last */
+       if (!tx->tlen) {
+               if (tx->packet_len & (sizeof(u32) - 1))
+                       rval = _pad_sdma_tx_descs(dd, tx);
+               else
+                       _sdma_close_tx(dd, tx);
+       }
+       tx->num_desc++;
+       return rval;
+}
+
+/**
+ * sdma_txadd_page() - add a page to the sdma_txreq
+ * @dd: the device to use for mapping
+ * @tx: tx request to which the page is added
+ * @page: page to map
+ * @offset: offset within the page
+ * @len: length in bytes
+ *
+ * This is used to add a page/offset/length descriptor.
+ *
+ * The mapping/unmapping of the page/offset/len is automatically handled.
+ *
+ * Return:
+ * 0 - success, -ENOSPC - mapping fail, -ENOMEM - couldn't
+ * extend descriptor array or couldn't allocate coalesce
+ * buffer.
+ *
+ */
+static inline int sdma_txadd_page(
+       struct hfi1_devdata *dd,
+       struct sdma_txreq *tx,
+       struct page *page,
+       unsigned long offset,
+       u16 len)
+{
+       dma_addr_t addr =
+               dma_map_page(
+                       &dd->pcidev->dev,
+                       page,
+                       offset,
+                       len,
+                       DMA_TO_DEVICE);
+       if (unlikely(dma_mapping_error(&dd->pcidev->dev, addr))) {
+               sdma_txclean(dd, tx);
+               return -ENOSPC;
+       }
+       return _sdma_txadd_daddr(
+                       dd, SDMA_MAP_PAGE, tx, addr, len);
+}
+
+/**
+ * sdma_txadd_daddr() - add a dma address to the sdma_txreq
+ * @dd: the device to use for mapping
+ * @tx: sdma_txreq to which the page is added
+ * @addr: dma address mapped by caller
+ * @len: length in bytes
+ *
+ * This is used to add a descriptor for memory that is already dma mapped.
+ *
+ * In this case, there is no unmapping as part of the progress processing for
+ * this memory location.
+ *
+ * Return:
+ * 0 - success, -ENOMEM - couldn't extend descriptor array
+ */
+
+static inline int sdma_txadd_daddr(
+       struct hfi1_devdata *dd,
+       struct sdma_txreq *tx,
+       dma_addr_t addr,
+       u16 len)
+{
+       return _sdma_txadd_daddr(dd, SDMA_MAP_NONE, tx, addr, len);
+}
+
+/**
+ * sdma_txadd_kvaddr() - add a kernel virtual address to sdma_txreq
+ * @dd: the device to use for mapping
+ * @tx: sdma_txreq to which the page is added
+ * @kvaddr: the kernel virtual address
+ * @len: length in bytes
+ *
+ * This is used to add a descriptor referenced by the indicated kvaddr and
+ * len.
+ *
+ * The mapping/unmapping of the kvaddr and len is automatically handled.
+ *
+ * Return:
+ * 0 - success, -ENOSPC - mapping fail, -ENOMEM - couldn't extend
+ * descriptor array
+ */
+static inline int sdma_txadd_kvaddr(
+       struct hfi1_devdata *dd,
+       struct sdma_txreq *tx,
+       void *kvaddr,
+       u16 len)
+{
+       dma_addr_t addr =
+               dma_map_single(
+                       &dd->pcidev->dev,
+                       kvaddr,
+                       len,
+                       DMA_TO_DEVICE);
+       if (unlikely(dma_mapping_error(&dd->pcidev->dev, addr))) {
+               sdma_txclean(dd, tx);
+               return -ENOSPC;
+       }
+       return _sdma_txadd_daddr(
+                       dd, SDMA_MAP_SINGLE, tx, addr, len);
+}
+
+struct iowait;
+
+int sdma_send_txreq(struct sdma_engine *sde,
+                   struct iowait *wait,
+                   struct sdma_txreq *tx);
+int sdma_send_txlist(struct sdma_engine *sde,
+                    struct iowait *wait,
+                    struct list_head *tx_list);
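+
+/*
+ * Illustrative flow (hypothetical names): build a request from two kernel
+ * buffers and hand it to an engine.  Passing a NULL iowait to
+ * sdma_send_txreq() assumes the caller does not want to be queued when the
+ * ring is full.  On a dma mapping failure sdma_txadd_kvaddr() has already
+ * called sdma_txclean() for the caller.
+ *
+ *     ret = sdma_txinit(&my_tx, 0, hdr_len + data_len, my_complete);
+ *     if (!ret)
+ *             ret = sdma_txadd_kvaddr(dd, &my_tx, hdr, hdr_len);
+ *     if (!ret)
+ *             ret = sdma_txadd_kvaddr(dd, &my_tx, data, data_len);
+ *     if (!ret)
+ *             ret = sdma_send_txreq(sde, NULL, &my_tx);
+ */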
+
+int sdma_ahg_alloc(struct sdma_engine *sde);
+void sdma_ahg_free(struct sdma_engine *sde, int ahg_index);
+
+/**
+ * sdma_build_ahg_descriptor() - build ahg descriptor
+ * @data: value to place in the field
+ * @dwindex: dword index of the field within the header
+ * @startbit: first bit of the field to update
+ * @bits: width of the field in bits
+ *
+ * Build and return a 32 bit descriptor.
+ */
+static inline u32 sdma_build_ahg_descriptor(
+       u16 data,
+       u8 dwindex,
+       u8 startbit,
+       u8 bits)
+{
+       return (u32)(1UL << SDMA_AHG_UPDATE_ENABLE_SHIFT |
+               ((startbit & SDMA_AHG_FIELD_START_MASK) <<
+               SDMA_AHG_FIELD_START_SHIFT) |
+               ((bits & SDMA_AHG_FIELD_LEN_MASK) <<
+               SDMA_AHG_FIELD_LEN_SHIFT) |
+               ((dwindex & SDMA_AHG_INDEX_MASK) <<
+               SDMA_AHG_INDEX_SHIFT) |
+               ((data & SDMA_AHG_VALUE_MASK) <<
+               SDMA_AHG_VALUE_SHIFT));
+}
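+
+/*
+ * For example (field meanings inferred from the mask names above):
+ * sdma_build_ahg_descriptor(0x1234, 2, 16, 16) requests an AHG update of
+ * the 16 bit field starting at bit 16 of header dword 2 with the value
+ * 0x1234, with the update-enable bit set.
+ */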
+
+/**
+ * sdma_progress - use seq number to detect head progress
+ * @sde: sdma_engine to check
+ * @seq: base seq count
+ * @tx: txreq for which we need to check descriptor availability
+ *
+ * This is used in the appropriate spot in the sleep routine
+ * to check for potential ring progress.  This routine gets the
+ * seqcount before queuing the iowait structure for progress.
+ *
+ * If the seqcount indicates that progress needs to be checked,
+ * re-submission is detected by checking whether the descriptor
+ * queue has enough descriptors for the txreq.
+ */
+static inline unsigned sdma_progress(struct sdma_engine *sde, unsigned seq,
+                                    struct sdma_txreq *tx)
+{
+       if (read_seqretry(&sde->head_lock, seq)) {
+               sde->desc_avail = sdma_descq_freecnt(sde);
+               if (tx->num_desc > sde->desc_avail)
+                       return 0;
+               return 1;
+       }
+       return 0;
+}
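+
+/*
+ * Sketch of the intended calling pattern (hypothetical surrounding code):
+ *
+ *     seq = read_seqbegin(&sde->head_lock);
+ *     ... decide to sleep and queue the iowait for this engine ...
+ *     if (sdma_progress(sde, seq, tx))
+ *             ... the head moved and there is room; retry the submit ...
+ */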
+
+/**
+ * sdma_iowait_schedule() - schedule an iowait for progress
+ * @sde: sdma_engine to schedule
+ * @wait: wait struct to schedule
+ *
+ * This function schedules the iowait
+ * structure embedded in the QP or PQ
+ * on the engine's workqueue.
+ *
+ */
+static inline void sdma_iowait_schedule(
+       struct sdma_engine *sde,
+       struct iowait *wait)
+{
+       iowait_schedule(wait, sde->wq);
+}
+
+/* for use by interrupt handling */
+void sdma_engine_error(struct sdma_engine *sde, u64 status);
+void sdma_engine_interrupt(struct sdma_engine *sde, u64 status);
+
+/*
+ *
+ * The diagram below details the relationship of the mapping structures
+ *
+ * Since the mapping now allows for non-uniform engines per vl, the
+ * number of engines for a vl is either the vl_engines[vl] or
+ * a computation based on num_sdma/num_vls:
+ *
+ * For example:
+ * nactual = vl_engines ? vl_engines[vl] : num_sdma/num_vls
+ *
+ * n = roundup to next highest power of 2 using nactual
+ *
+ * In the case where num_sdma doesn't divide evenly by num_vls,
+ * the extras are added from the last vl downward.
+ *
+ * For the case where n > nactual, the engines are assigned
+ * in a round robin fashion wrapping back to the first engine
+ * for a particular vl.
+ *
+ *               dd->sdma_map
+ *                    |                                   sdma_map_elem[0]
+ *                    |                                +--------------------+
+ *                    v                                |       mask         |
+ *               sdma_vl_map                           |--------------------|
+ *      +--------------------------+                   | sde[0] -> eng 1    |
+ *      |    list (RCU)            |                   |--------------------|
+ *      |--------------------------|                 ->| sde[1] -> eng 2    |
+ *      |    mask                  |              --/  |--------------------|
+ *      |--------------------------|            -/     |        *           |
+ *      |    actual_vls (max 8)    |          -/       |--------------------|
+ *      |--------------------------|       --/         | sde[n] -> eng n    |
+ *      |    vls (max 8)           |     -/            +--------------------+
+ *      |--------------------------|  --/
+ *      |    map[0]                |-/
+ *      |--------------------------|                   +--------------------+
+ *      |    map[1]                |---                |       mask         |
+ *      |--------------------------|   \----           |--------------------|
+ *      |           *              |        \--        | sde[0] -> eng 1+n  |
+ *      |           *              |           \----   |--------------------|
+ *      |           *              |                \->| sde[1] -> eng 2+n  |
+ *      |--------------------------|                   |--------------------|
+ *      |   map[vls - 1]           |-                  |         *          |
+ *      +--------------------------+ \-                |--------------------|
+ *                                     \-              | sde[m] -> eng m+n  |
+ *                                       \             +--------------------+
+ *                                        \-
+ *                                          \
+ *                                           \-        +--------------------+
+ *                                             \-      |       mask         |
+ *                                               \     |--------------------|
+ *                                                \-   | sde[0] -> eng 1+m+n|
+ *                                                  \- |--------------------|
+ *                                                    >| sde[1] -> eng 2+m+n|
+ *                                                     |--------------------|
+ *                                                     |         *          |
+ *                                                     |--------------------|
+ *                                                     | sde[o] -> eng o+m+n|
+ *                                                     +--------------------+
+ *
+ */
+
+/**
+ * struct sdma_map_elem - mapping for a vl
+ * @mask: selector mask
+ * @sde: array of engines for this vl
+ *
+ * The mask is used to "mod" the selector
+ * to produce an index into the trailing
+ * array of sdes.
+ */
+struct sdma_map_elem {
+       u32 mask;
+       struct sdma_engine *sde[0];
+};
+
+/**
+ * struct sdma_vl_map - mapping for a vl
+ * @list: rcu head for free callback
+ * @mask: vl mask to "mod" the vl to produce an index into the map array
+ * @actual_vls: number of vls
+ * @vls: number of vls rounded to next power of 2
+ * @map: array of sdma_map_elem entries
+ *
+ * This is the parent mapping structure.  The trailing
+ * members of the struct point to sdma_map_elem entries, which
+ * in turn point to an array of sde's for that vl.
+ */
+struct sdma_vl_map {
+       struct rcu_head list;
+       u32 mask;
+       u8 actual_vls;
+       u8 vls;
+       struct sdma_map_elem *map[0];
+};
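+
+/*
+ * For example: a vl with 3 assigned engines gets n = 4 (next power of 2),
+ * so its sdma_map_elem has mask = 3 and four sde[] entries, with the
+ * fourth entry wrapping back to the first engine (round robin).  A
+ * selector is then reduced with "selector & mask" to pick an entry.
+ */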
+
+int sdma_map_init(
+       struct hfi1_devdata *dd,
+       u8 port,
+       u8 num_vls,
+       u8 *vl_engines);
+
+/* slow path */
+void _sdma_engine_progress_schedule(struct sdma_engine *sde);
+
+/**
+ * sdma_engine_progress_schedule() - schedule progress on engine
+ * @sde: sdma_engine to schedule progress
+ *
+ * This is the fast path.
+ *
+ */
+static inline void sdma_engine_progress_schedule(
+       struct sdma_engine *sde)
+{
+       if (!sde || sdma_descq_inprocess(sde) < (sde->descq_cnt / 8))
+               return;
+       _sdma_engine_progress_schedule(sde);
+}
+
+struct sdma_engine *sdma_select_engine_sc(
+       struct hfi1_devdata *dd,
+       u32 selector,
+       u8 sc5);
+
+struct sdma_engine *sdma_select_engine_vl(
+       struct hfi1_devdata *dd,
+       u32 selector,
+       u8 vl);
+
+void sdma_seqfile_dump_sde(struct seq_file *s, struct sdma_engine *);
+
+#ifdef CONFIG_SDMA_VERBOSITY
+void sdma_dumpstate(struct sdma_engine *);
+#endif
+static inline char *slashstrip(char *s)
+{
+       char *r = s;
+
+       while (*s)
+               if (*s++ == '/')
+                       r = s;
+       return r;
+}
+
+u16 sdma_get_descq_cnt(void);
+
+extern uint mod_num_sdma;
+
+void sdma_update_lmc(struct hfi1_devdata *dd, u64 mask, u32 lid);
+
+#endif
diff --git a/drivers/staging/rdma/hfi1/srq.c b/drivers/staging/rdma/hfi1/srq.c
new file mode 100644 (file)
index 0000000..67786d4
--- /dev/null
@@ -0,0 +1,397 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include "verbs.h"
+
+/**
+ * hfi1_post_srq_receive - post a receive on a shared receive queue
+ * @ibsrq: the SRQ to post the receive on
+ * @wr: the list of work requests to post
+ * @bad_wr: A pointer to the first WR to cause a problem is put here
+ *
+ * This may be called from interrupt context.
+ */
+int hfi1_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+                         struct ib_recv_wr **bad_wr)
+{
+       struct hfi1_srq *srq = to_isrq(ibsrq);
+       struct hfi1_rwq *wq;
+       unsigned long flags;
+       int ret;
+
+       for (; wr; wr = wr->next) {
+               struct hfi1_rwqe *wqe;
+               u32 next;
+               int i;
+
+               if ((unsigned) wr->num_sge > srq->rq.max_sge) {
+                       *bad_wr = wr;
+                       ret = -EINVAL;
+                       goto bail;
+               }
+
+               spin_lock_irqsave(&srq->rq.lock, flags);
+               wq = srq->rq.wq;
+               next = wq->head + 1;
+               if (next >= srq->rq.size)
+                       next = 0;
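+               /*
+                * The ring is full when advancing head would catch up
+                * with tail; one slot is always left unused.
+                */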
+               if (next == wq->tail) {
+                       spin_unlock_irqrestore(&srq->rq.lock, flags);
+                       *bad_wr = wr;
+                       ret = -ENOMEM;
+                       goto bail;
+               }
+
+               wqe = get_rwqe_ptr(&srq->rq, wq->head);
+               wqe->wr_id = wr->wr_id;
+               wqe->num_sge = wr->num_sge;
+               for (i = 0; i < wr->num_sge; i++)
+                       wqe->sg_list[i] = wr->sg_list[i];
+               /* Make sure queue entry is written before the head index. */
+               smp_wmb();
+               wq->head = next;
+               spin_unlock_irqrestore(&srq->rq.lock, flags);
+       }
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+/**
+ * hfi1_create_srq - create a shared receive queue
+ * @ibpd: the protection domain of the SRQ to create
+ * @srq_init_attr: the attributes of the SRQ
+ * @udata: data from libibverbs when creating a user SRQ
+ */
+struct ib_srq *hfi1_create_srq(struct ib_pd *ibpd,
+                              struct ib_srq_init_attr *srq_init_attr,
+                              struct ib_udata *udata)
+{
+       struct hfi1_ibdev *dev = to_idev(ibpd->device);
+       struct hfi1_srq *srq;
+       u32 sz;
+       struct ib_srq *ret;
+
+       if (srq_init_attr->srq_type != IB_SRQT_BASIC) {
+               ret = ERR_PTR(-ENOSYS);
+               goto done;
+       }
+
+       if (srq_init_attr->attr.max_sge == 0 ||
+           srq_init_attr->attr.max_sge > hfi1_max_srq_sges ||
+           srq_init_attr->attr.max_wr == 0 ||
+           srq_init_attr->attr.max_wr > hfi1_max_srq_wrs) {
+               ret = ERR_PTR(-EINVAL);
+               goto done;
+       }
+
+       srq = kmalloc(sizeof(*srq), GFP_KERNEL);
+       if (!srq) {
+               ret = ERR_PTR(-ENOMEM);
+               goto done;
+       }
+
+       /*
+        * Need to use vmalloc() if we want to support large #s of entries.
+        */
+       srq->rq.size = srq_init_attr->attr.max_wr + 1;
+       srq->rq.max_sge = srq_init_attr->attr.max_sge;
+       sz = sizeof(struct ib_sge) * srq->rq.max_sge +
+               sizeof(struct hfi1_rwqe);
+       srq->rq.wq = vmalloc_user(sizeof(struct hfi1_rwq) + srq->rq.size * sz);
+       if (!srq->rq.wq) {
+               ret = ERR_PTR(-ENOMEM);
+               goto bail_srq;
+       }
+
+       /*
+        * Return the address of the RWQ as the offset to mmap.
+        * See hfi1_mmap() for details.
+        */
+       if (udata && udata->outlen >= sizeof(__u64)) {
+               int err;
+               u32 s = sizeof(struct hfi1_rwq) + srq->rq.size * sz;
+
+               srq->ip =
+                   hfi1_create_mmap_info(dev, s, ibpd->uobject->context,
+                                         srq->rq.wq);
+               if (!srq->ip) {
+                       ret = ERR_PTR(-ENOMEM);
+                       goto bail_wq;
+               }
+
+               err = ib_copy_to_udata(udata, &srq->ip->offset,
+                                      sizeof(srq->ip->offset));
+               if (err) {
+                       ret = ERR_PTR(err);
+                       goto bail_ip;
+               }
+       } else
+               srq->ip = NULL;
+
+       /*
+        * ib_create_srq() will initialize srq->ibsrq.
+        */
+       spin_lock_init(&srq->rq.lock);
+       srq->rq.wq->head = 0;
+       srq->rq.wq->tail = 0;
+       srq->limit = srq_init_attr->attr.srq_limit;
+
+       spin_lock(&dev->n_srqs_lock);
+       if (dev->n_srqs_allocated == hfi1_max_srqs) {
+               spin_unlock(&dev->n_srqs_lock);
+               ret = ERR_PTR(-ENOMEM);
+               goto bail_ip;
+       }
+
+       dev->n_srqs_allocated++;
+       spin_unlock(&dev->n_srqs_lock);
+
+       if (srq->ip) {
+               spin_lock_irq(&dev->pending_lock);
+               list_add(&srq->ip->pending_mmaps, &dev->pending_mmaps);
+               spin_unlock_irq(&dev->pending_lock);
+       }
+
+       ret = &srq->ibsrq;
+       goto done;
+
+bail_ip:
+       kfree(srq->ip);
+bail_wq:
+       vfree(srq->rq.wq);
+bail_srq:
+       kfree(srq);
+done:
+       return ret;
+}
+
+/**
+ * hfi1_modify_srq - modify a shared receive queue
+ * @ibsrq: the SRQ to modify
+ * @attr: the new attributes of the SRQ
+ * @attr_mask: indicates which attributes to modify
+ * @udata: user data for libibverbs.so
+ */
+int hfi1_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+                   enum ib_srq_attr_mask attr_mask,
+                   struct ib_udata *udata)
+{
+       struct hfi1_srq *srq = to_isrq(ibsrq);
+       struct hfi1_rwq *wq;
+       int ret = 0;
+
+       if (attr_mask & IB_SRQ_MAX_WR) {
+               struct hfi1_rwq *owq;
+               struct hfi1_rwqe *p;
+               u32 sz, size, n, head, tail;
+
+               /* Check that the requested sizes are below the limits. */
+               if ((attr->max_wr > hfi1_max_srq_wrs) ||
+                   ((attr_mask & IB_SRQ_LIMIT) ?
+                    attr->srq_limit : srq->limit) > attr->max_wr) {
+                       ret = -EINVAL;
+                       goto bail;
+               }
+
+               sz = sizeof(struct hfi1_rwqe) +
+                       srq->rq.max_sge * sizeof(struct ib_sge);
+               size = attr->max_wr + 1;
+               wq = vmalloc_user(sizeof(struct hfi1_rwq) + size * sz);
+               if (!wq) {
+                       ret = -ENOMEM;
+                       goto bail;
+               }
+
+               /* Check that we can write the offset to mmap. */
+               if (udata && udata->inlen >= sizeof(__u64)) {
+                       __u64 offset_addr;
+                       __u64 offset = 0;
+
+                       ret = ib_copy_from_udata(&offset_addr, udata,
+                                                sizeof(offset_addr));
+                       if (ret)
+                               goto bail_free;
+                       udata->outbuf =
+                               (void __user *) (unsigned long) offset_addr;
+                       ret = ib_copy_to_udata(udata, &offset,
+                                              sizeof(offset));
+                       if (ret)
+                               goto bail_free;
+               }
+
+               spin_lock_irq(&srq->rq.lock);
+               /*
+                * validate head and tail pointer values and compute
+                * the number of remaining WQEs.
+                */
+               owq = srq->rq.wq;
+               head = owq->head;
+               tail = owq->tail;
+               if (head >= srq->rq.size || tail >= srq->rq.size) {
+                       ret = -EINVAL;
+                       goto bail_unlock;
+               }
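+               /*
+                * n counts the WQEs currently queued in the old ring,
+                * e.g. with srq->rq.size = 8, head = 2, tail = 6:
+                * n = 2 + (8 - 6) = 4.
+                */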
+               n = head;
+               if (n < tail)
+                       n += srq->rq.size - tail;
+               else
+                       n -= tail;
+               if (size <= n) {
+                       ret = -EINVAL;
+                       goto bail_unlock;
+               }
+               n = 0;
+               p = wq->wq;
+               while (tail != head) {
+                       struct hfi1_rwqe *wqe;
+                       int i;
+
+                       wqe = get_rwqe_ptr(&srq->rq, tail);
+                       p->wr_id = wqe->wr_id;
+                       p->num_sge = wqe->num_sge;
+                       for (i = 0; i < wqe->num_sge; i++)
+                               p->sg_list[i] = wqe->sg_list[i];
+                       n++;
+                       p = (struct hfi1_rwqe *)((char *)p + sz);
+                       if (++tail >= srq->rq.size)
+                               tail = 0;
+               }
+               srq->rq.wq = wq;
+               srq->rq.size = size;
+               wq->head = n;
+               wq->tail = 0;
+               if (attr_mask & IB_SRQ_LIMIT)
+                       srq->limit = attr->srq_limit;
+               spin_unlock_irq(&srq->rq.lock);
+
+               vfree(owq);
+
+               if (srq->ip) {
+                       struct hfi1_mmap_info *ip = srq->ip;
+                       struct hfi1_ibdev *dev = to_idev(srq->ibsrq.device);
+                       u32 s = sizeof(struct hfi1_rwq) + size * sz;
+
+                       hfi1_update_mmap_info(dev, ip, s, wq);
+
+                       /*
+                        * Return the offset to mmap.
+                        * See hfi1_mmap() for details.
+                        */
+                       if (udata && udata->inlen >= sizeof(__u64)) {
+                               ret = ib_copy_to_udata(udata, &ip->offset,
+                                                      sizeof(ip->offset));
+                               if (ret)
+                                       goto bail;
+                       }
+
+                       /*
+                        * Put user mapping info onto the pending list
+                        * unless it already is on the list.
+                        */
+                       spin_lock_irq(&dev->pending_lock);
+                       if (list_empty(&ip->pending_mmaps))
+                               list_add(&ip->pending_mmaps,
+                                        &dev->pending_mmaps);
+                       spin_unlock_irq(&dev->pending_lock);
+               }
+       } else if (attr_mask & IB_SRQ_LIMIT) {
+               spin_lock_irq(&srq->rq.lock);
+               if (attr->srq_limit >= srq->rq.size)
+                       ret = -EINVAL;
+               else
+                       srq->limit = attr->srq_limit;
+               spin_unlock_irq(&srq->rq.lock);
+       }
+       goto bail;
+
+bail_unlock:
+       spin_unlock_irq(&srq->rq.lock);
+bail_free:
+       vfree(wq);
+bail:
+       return ret;
+}
+
+int hfi1_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr)
+{
+       struct hfi1_srq *srq = to_isrq(ibsrq);
+
+       attr->max_wr = srq->rq.size - 1;
+       attr->max_sge = srq->rq.max_sge;
+       attr->srq_limit = srq->limit;
+       return 0;
+}
+
+/**
+ * hfi1_destroy_srq - destroy a shared receive queue
+ * @ibsrq: the SRQ to destroy
+ */
+int hfi1_destroy_srq(struct ib_srq *ibsrq)
+{
+       struct hfi1_srq *srq = to_isrq(ibsrq);
+       struct hfi1_ibdev *dev = to_idev(ibsrq->device);
+
+       spin_lock(&dev->n_srqs_lock);
+       dev->n_srqs_allocated--;
+       spin_unlock(&dev->n_srqs_lock);
+       if (srq->ip)
+               kref_put(&srq->ip->ref, hfi1_release_mmap_info);
+       else
+               vfree(srq->rq.wq);
+       kfree(srq);
+
+       return 0;
+}
diff --git a/drivers/staging/rdma/hfi1/sysfs.c b/drivers/staging/rdma/hfi1/sysfs.c
new file mode 100644 (file)
index 0000000..b78c728
--- /dev/null
@@ -0,0 +1,739 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/ctype.h>
+
+#include "hfi.h"
+#include "mad.h"
+#include "trace.h"
+
+
+/*
+ * Start of per-port congestion control structures and support code
+ */
+
+/*
+ * Congestion control table size followed by table entries
+ */
+static ssize_t read_cc_table_bin(struct file *filp, struct kobject *kobj,
+               struct bin_attribute *bin_attr,
+               char *buf, loff_t pos, size_t count)
+{
+       int ret;
+       struct hfi1_pportdata *ppd =
+               container_of(kobj, struct hfi1_pportdata, pport_cc_kobj);
+       struct cc_state *cc_state;
+
+       ret = ppd->total_cct_entry * sizeof(struct ib_cc_table_entry_shadow)
+                + sizeof(__be16);
+
+       if (pos > ret)
+               return -EINVAL;
+
+       if (count > ret - pos)
+               count = ret - pos;
+
+       if (!count)
+               return count;
+
+       rcu_read_lock();
+       cc_state = get_cc_state(ppd);
+       if (cc_state == NULL) {
+               rcu_read_unlock();
+               return -EINVAL;
+       }
+       memcpy(buf, &cc_state->cct, count);
+       rcu_read_unlock();
+
+       return count;
+}
+
+static void port_release(struct kobject *kobj)
+{
+       /* nothing to do since memory is freed by hfi1_free_devdata() */
+}
+
+static struct kobj_type port_cc_ktype = {
+       .release = port_release,
+};
+
+static struct bin_attribute cc_table_bin_attr = {
+       .attr = {.name = "cc_table_bin", .mode = 0444},
+       .read = read_cc_table_bin,
+       .size = PAGE_SIZE,
+};
+
+/*
+ * Congestion settings: port control, control map and an array of 16
+ * entries for the congestion entries - increase, timer, event log
+ * trigger threshold and the minimum injection rate delay.
+ */
+static ssize_t read_cc_setting_bin(struct file *filp, struct kobject *kobj,
+               struct bin_attribute *bin_attr,
+               char *buf, loff_t pos, size_t count)
+{
+       int ret;
+       struct hfi1_pportdata *ppd =
+               container_of(kobj, struct hfi1_pportdata, pport_cc_kobj);
+       struct cc_state *cc_state;
+
+       ret = sizeof(struct opa_congestion_setting_attr_shadow);
+
+       if (pos > ret)
+               return -EINVAL;
+       if (count > ret - pos)
+               count = ret - pos;
+
+       if (!count)
+               return count;
+
+       rcu_read_lock();
+       cc_state = get_cc_state(ppd);
+       if (cc_state == NULL) {
+               rcu_read_unlock();
+               return -EINVAL;
+       }
+       memcpy(buf, &cc_state->cong_setting, count);
+       rcu_read_unlock();
+
+       return count;
+}
+
+static struct bin_attribute cc_setting_bin_attr = {
+       .attr = {.name = "cc_settings_bin", .mode = 0444},
+       .read = read_cc_setting_bin,
+       .size = PAGE_SIZE,
+};
+
+/* Start sc2vl */
+#define HFI1_SC2VL_ATTR(N)                                 \
+       static struct hfi1_sc2vl_attr hfi1_sc2vl_attr_##N = { \
+               .attr = { .name = __stringify(N), .mode = 0444 }, \
+               .sc = N \
+       }
+
+struct hfi1_sc2vl_attr {
+       struct attribute attr;
+       int sc;
+};
+
+HFI1_SC2VL_ATTR(0);
+HFI1_SC2VL_ATTR(1);
+HFI1_SC2VL_ATTR(2);
+HFI1_SC2VL_ATTR(3);
+HFI1_SC2VL_ATTR(4);
+HFI1_SC2VL_ATTR(5);
+HFI1_SC2VL_ATTR(6);
+HFI1_SC2VL_ATTR(7);
+HFI1_SC2VL_ATTR(8);
+HFI1_SC2VL_ATTR(9);
+HFI1_SC2VL_ATTR(10);
+HFI1_SC2VL_ATTR(11);
+HFI1_SC2VL_ATTR(12);
+HFI1_SC2VL_ATTR(13);
+HFI1_SC2VL_ATTR(14);
+HFI1_SC2VL_ATTR(15);
+HFI1_SC2VL_ATTR(16);
+HFI1_SC2VL_ATTR(17);
+HFI1_SC2VL_ATTR(18);
+HFI1_SC2VL_ATTR(19);
+HFI1_SC2VL_ATTR(20);
+HFI1_SC2VL_ATTR(21);
+HFI1_SC2VL_ATTR(22);
+HFI1_SC2VL_ATTR(23);
+HFI1_SC2VL_ATTR(24);
+HFI1_SC2VL_ATTR(25);
+HFI1_SC2VL_ATTR(26);
+HFI1_SC2VL_ATTR(27);
+HFI1_SC2VL_ATTR(28);
+HFI1_SC2VL_ATTR(29);
+HFI1_SC2VL_ATTR(30);
+HFI1_SC2VL_ATTR(31);
+
+
+static struct attribute *sc2vl_default_attributes[] = {
+       &hfi1_sc2vl_attr_0.attr,
+       &hfi1_sc2vl_attr_1.attr,
+       &hfi1_sc2vl_attr_2.attr,
+       &hfi1_sc2vl_attr_3.attr,
+       &hfi1_sc2vl_attr_4.attr,
+       &hfi1_sc2vl_attr_5.attr,
+       &hfi1_sc2vl_attr_6.attr,
+       &hfi1_sc2vl_attr_7.attr,
+       &hfi1_sc2vl_attr_8.attr,
+       &hfi1_sc2vl_attr_9.attr,
+       &hfi1_sc2vl_attr_10.attr,
+       &hfi1_sc2vl_attr_11.attr,
+       &hfi1_sc2vl_attr_12.attr,
+       &hfi1_sc2vl_attr_13.attr,
+       &hfi1_sc2vl_attr_14.attr,
+       &hfi1_sc2vl_attr_15.attr,
+       &hfi1_sc2vl_attr_16.attr,
+       &hfi1_sc2vl_attr_17.attr,
+       &hfi1_sc2vl_attr_18.attr,
+       &hfi1_sc2vl_attr_19.attr,
+       &hfi1_sc2vl_attr_20.attr,
+       &hfi1_sc2vl_attr_21.attr,
+       &hfi1_sc2vl_attr_22.attr,
+       &hfi1_sc2vl_attr_23.attr,
+       &hfi1_sc2vl_attr_24.attr,
+       &hfi1_sc2vl_attr_25.attr,
+       &hfi1_sc2vl_attr_26.attr,
+       &hfi1_sc2vl_attr_27.attr,
+       &hfi1_sc2vl_attr_28.attr,
+       &hfi1_sc2vl_attr_29.attr,
+       &hfi1_sc2vl_attr_30.attr,
+       &hfi1_sc2vl_attr_31.attr,
+       NULL
+};
+
+static ssize_t sc2vl_attr_show(struct kobject *kobj, struct attribute *attr,
+                              char *buf)
+{
+       struct hfi1_sc2vl_attr *sattr =
+               container_of(attr, struct hfi1_sc2vl_attr, attr);
+       struct hfi1_pportdata *ppd =
+               container_of(kobj, struct hfi1_pportdata, sc2vl_kobj);
+       struct hfi1_devdata *dd = ppd->dd;
+
+       return sprintf(buf, "%u\n", *((u8 *)dd->sc2vl + sattr->sc));
+}
+
+static const struct sysfs_ops hfi1_sc2vl_ops = {
+       .show = sc2vl_attr_show,
+};
+
+static struct kobj_type hfi1_sc2vl_ktype = {
+       .release = port_release,
+       .sysfs_ops = &hfi1_sc2vl_ops,
+       .default_attrs = sc2vl_default_attributes
+};
+
+/* End sc2vl */
+
+/* Start sl2sc */
+#define HFI1_SL2SC_ATTR(N)                                 \
+       static struct hfi1_sl2sc_attr hfi1_sl2sc_attr_##N = {     \
+               .attr = { .name = __stringify(N), .mode = 0444 }, \
+               .sl = N                                           \
+       }
+
+struct hfi1_sl2sc_attr {
+       struct attribute attr;
+       int sl;
+};
+
+HFI1_SL2SC_ATTR(0);
+HFI1_SL2SC_ATTR(1);
+HFI1_SL2SC_ATTR(2);
+HFI1_SL2SC_ATTR(3);
+HFI1_SL2SC_ATTR(4);
+HFI1_SL2SC_ATTR(5);
+HFI1_SL2SC_ATTR(6);
+HFI1_SL2SC_ATTR(7);
+HFI1_SL2SC_ATTR(8);
+HFI1_SL2SC_ATTR(9);
+HFI1_SL2SC_ATTR(10);
+HFI1_SL2SC_ATTR(11);
+HFI1_SL2SC_ATTR(12);
+HFI1_SL2SC_ATTR(13);
+HFI1_SL2SC_ATTR(14);
+HFI1_SL2SC_ATTR(15);
+HFI1_SL2SC_ATTR(16);
+HFI1_SL2SC_ATTR(17);
+HFI1_SL2SC_ATTR(18);
+HFI1_SL2SC_ATTR(19);
+HFI1_SL2SC_ATTR(20);
+HFI1_SL2SC_ATTR(21);
+HFI1_SL2SC_ATTR(22);
+HFI1_SL2SC_ATTR(23);
+HFI1_SL2SC_ATTR(24);
+HFI1_SL2SC_ATTR(25);
+HFI1_SL2SC_ATTR(26);
+HFI1_SL2SC_ATTR(27);
+HFI1_SL2SC_ATTR(28);
+HFI1_SL2SC_ATTR(29);
+HFI1_SL2SC_ATTR(30);
+HFI1_SL2SC_ATTR(31);
+
+
+static struct attribute *sl2sc_default_attributes[] = {
+       &hfi1_sl2sc_attr_0.attr,
+       &hfi1_sl2sc_attr_1.attr,
+       &hfi1_sl2sc_attr_2.attr,
+       &hfi1_sl2sc_attr_3.attr,
+       &hfi1_sl2sc_attr_4.attr,
+       &hfi1_sl2sc_attr_5.attr,
+       &hfi1_sl2sc_attr_6.attr,
+       &hfi1_sl2sc_attr_7.attr,
+       &hfi1_sl2sc_attr_8.attr,
+       &hfi1_sl2sc_attr_9.attr,
+       &hfi1_sl2sc_attr_10.attr,
+       &hfi1_sl2sc_attr_11.attr,
+       &hfi1_sl2sc_attr_12.attr,
+       &hfi1_sl2sc_attr_13.attr,
+       &hfi1_sl2sc_attr_14.attr,
+       &hfi1_sl2sc_attr_15.attr,
+       &hfi1_sl2sc_attr_16.attr,
+       &hfi1_sl2sc_attr_17.attr,
+       &hfi1_sl2sc_attr_18.attr,
+       &hfi1_sl2sc_attr_19.attr,
+       &hfi1_sl2sc_attr_20.attr,
+       &hfi1_sl2sc_attr_21.attr,
+       &hfi1_sl2sc_attr_22.attr,
+       &hfi1_sl2sc_attr_23.attr,
+       &hfi1_sl2sc_attr_24.attr,
+       &hfi1_sl2sc_attr_25.attr,
+       &hfi1_sl2sc_attr_26.attr,
+       &hfi1_sl2sc_attr_27.attr,
+       &hfi1_sl2sc_attr_28.attr,
+       &hfi1_sl2sc_attr_29.attr,
+       &hfi1_sl2sc_attr_30.attr,
+       &hfi1_sl2sc_attr_31.attr,
+       NULL
+};
+
+static ssize_t sl2sc_attr_show(struct kobject *kobj, struct attribute *attr,
+                              char *buf)
+{
+       struct hfi1_sl2sc_attr *sattr =
+               container_of(attr, struct hfi1_sl2sc_attr, attr);
+       struct hfi1_pportdata *ppd =
+               container_of(kobj, struct hfi1_pportdata, sl2sc_kobj);
+       struct hfi1_ibport *ibp = &ppd->ibport_data;
+
+       return sprintf(buf, "%u\n", ibp->sl_to_sc[sattr->sl]);
+}
+
+static const struct sysfs_ops hfi1_sl2sc_ops = {
+       .show = sl2sc_attr_show,
+};
+
+static struct kobj_type hfi1_sl2sc_ktype = {
+       .release = port_release,
+       .sysfs_ops = &hfi1_sl2sc_ops,
+       .default_attrs = sl2sc_default_attributes
+};
+
+/* End sl2sc */
+
+/* Start vl2mtu */
+
+#define HFI1_VL2MTU_ATTR(N) \
+       static struct hfi1_vl2mtu_attr hfi1_vl2mtu_attr_##N = { \
+               .attr = { .name = __stringify(N), .mode = 0444 }, \
+               .vl = N                                           \
+       }
+
+struct hfi1_vl2mtu_attr {
+       struct attribute attr;
+       int vl;
+};
+
+HFI1_VL2MTU_ATTR(0);
+HFI1_VL2MTU_ATTR(1);
+HFI1_VL2MTU_ATTR(2);
+HFI1_VL2MTU_ATTR(3);
+HFI1_VL2MTU_ATTR(4);
+HFI1_VL2MTU_ATTR(5);
+HFI1_VL2MTU_ATTR(6);
+HFI1_VL2MTU_ATTR(7);
+HFI1_VL2MTU_ATTR(8);
+HFI1_VL2MTU_ATTR(9);
+HFI1_VL2MTU_ATTR(10);
+HFI1_VL2MTU_ATTR(11);
+HFI1_VL2MTU_ATTR(12);
+HFI1_VL2MTU_ATTR(13);
+HFI1_VL2MTU_ATTR(14);
+HFI1_VL2MTU_ATTR(15);
+
+static struct attribute *vl2mtu_default_attributes[] = {
+       &hfi1_vl2mtu_attr_0.attr,
+       &hfi1_vl2mtu_attr_1.attr,
+       &hfi1_vl2mtu_attr_2.attr,
+       &hfi1_vl2mtu_attr_3.attr,
+       &hfi1_vl2mtu_attr_4.attr,
+       &hfi1_vl2mtu_attr_5.attr,
+       &hfi1_vl2mtu_attr_6.attr,
+       &hfi1_vl2mtu_attr_7.attr,
+       &hfi1_vl2mtu_attr_8.attr,
+       &hfi1_vl2mtu_attr_9.attr,
+       &hfi1_vl2mtu_attr_10.attr,
+       &hfi1_vl2mtu_attr_11.attr,
+       &hfi1_vl2mtu_attr_12.attr,
+       &hfi1_vl2mtu_attr_13.attr,
+       &hfi1_vl2mtu_attr_14.attr,
+       &hfi1_vl2mtu_attr_15.attr,
+       NULL
+};
+
+static ssize_t vl2mtu_attr_show(struct kobject *kobj, struct attribute *attr,
+                               char *buf)
+{
+       struct hfi1_vl2mtu_attr *vlattr =
+               container_of(attr, struct hfi1_vl2mtu_attr, attr);
+       struct hfi1_pportdata *ppd =
+               container_of(kobj, struct hfi1_pportdata, vl2mtu_kobj);
+       struct hfi1_devdata *dd = ppd->dd;
+
+       return sprintf(buf, "%u\n", dd->vld[vlattr->vl].mtu);
+}
+
+static const struct sysfs_ops hfi1_vl2mtu_ops = {
+       .show = vl2mtu_attr_show,
+};
+
+static struct kobj_type hfi1_vl2mtu_ktype = {
+       .release = port_release,
+       .sysfs_ops = &hfi1_vl2mtu_ops,
+       .default_attrs = vl2mtu_default_attributes
+};
+
+
+/* end of per-port file structures and support code */
+
+/*
+ * Start of per-unit (or driver, in some cases, but replicated
+ * per unit) functions (these get a device *)
+ */
+static ssize_t show_rev(struct device *device, struct device_attribute *attr,
+                       char *buf)
+{
+       struct hfi1_ibdev *dev =
+               container_of(device, struct hfi1_ibdev, ibdev.dev);
+
+       return sprintf(buf, "%x\n", dd_from_dev(dev)->minrev);
+}
+
+static ssize_t show_hfi(struct device *device, struct device_attribute *attr,
+                       char *buf)
+{
+       struct hfi1_ibdev *dev =
+               container_of(device, struct hfi1_ibdev, ibdev.dev);
+       struct hfi1_devdata *dd = dd_from_dev(dev);
+       int ret;
+
+       if (!dd->boardname)
+               ret = -EINVAL;
+       else
+               ret = scnprintf(buf, PAGE_SIZE, "%s\n", dd->boardname);
+       return ret;
+}
+
+static ssize_t show_boardversion(struct device *device,
+                                struct device_attribute *attr, char *buf)
+{
+       struct hfi1_ibdev *dev =
+               container_of(device, struct hfi1_ibdev, ibdev.dev);
+       struct hfi1_devdata *dd = dd_from_dev(dev);
+
+       /* The string printed here is already newline-terminated. */
+       return scnprintf(buf, PAGE_SIZE, "%s", dd->boardversion);
+}
+
+
+static ssize_t show_nctxts(struct device *device,
+                          struct device_attribute *attr, char *buf)
+{
+       struct hfi1_ibdev *dev =
+               container_of(device, struct hfi1_ibdev, ibdev.dev);
+       struct hfi1_devdata *dd = dd_from_dev(dev);
+
+       /*
+        * Return the smaller of send and receive contexts.
+        * Normally, user level applications would require both a send
+        * and a receive context, so returning the smaller of the two counts
+        * gives a more accurate picture of total contexts available.
+        */
+       return scnprintf(buf, PAGE_SIZE, "%u\n",
+                        min(dd->num_rcv_contexts - dd->first_user_ctxt,
+                            (u32)dd->sc_sizes[SC_USER].count));
+}
+
+static ssize_t show_nfreectxts(struct device *device,
+                          struct device_attribute *attr, char *buf)
+{
+       struct hfi1_ibdev *dev =
+               container_of(device, struct hfi1_ibdev, ibdev.dev);
+       struct hfi1_devdata *dd = dd_from_dev(dev);
+
+       /* Return the number of free user ports (contexts) available. */
+       return scnprintf(buf, PAGE_SIZE, "%u\n", dd->freectxts);
+}
+
+static ssize_t show_serial(struct device *device,
+                          struct device_attribute *attr, char *buf)
+{
+       struct hfi1_ibdev *dev =
+               container_of(device, struct hfi1_ibdev, ibdev.dev);
+       struct hfi1_devdata *dd = dd_from_dev(dev);
+
+       return scnprintf(buf, PAGE_SIZE, "%s", dd->serial);
+
+}
+
+static ssize_t store_chip_reset(struct device *device,
+                               struct device_attribute *attr, const char *buf,
+                               size_t count)
+{
+       struct hfi1_ibdev *dev =
+               container_of(device, struct hfi1_ibdev, ibdev.dev);
+       struct hfi1_devdata *dd = dd_from_dev(dev);
+       int ret;
+
+       if (count < 5 || memcmp(buf, "reset", 5) || !dd->diag_client) {
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       ret = hfi1_reset_device(dd->unit);
+bail:
+       return ret < 0 ? ret : count;
+}
+
+/*
+ * Convert the reported temperature from an integer (reported in
+ * units of 0.25C) to a decimal string in degrees C.
+ */
+#define temp2str(temp, buf, size, idx)                                 \
+       scnprintf((buf) + (idx), (size) - (idx), "%u.%02u ",            \
+                             ((temp) >> 2), ((temp) & 0x3) * 25)
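+
+/*
+ * e.g. a raw reading of 101 (0x65) formats as "25.25 ":
+ * 101 >> 2 = 25 whole degrees, (101 & 0x3) * 25 = 25 hundredths.
+ */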
+
+/*
+ * Dump tempsense values, in decimal, to ease shell-scripts.
+ */
+static ssize_t show_tempsense(struct device *device,
+                             struct device_attribute *attr, char *buf)
+{
+       struct hfi1_ibdev *dev =
+               container_of(device, struct hfi1_ibdev, ibdev.dev);
+       struct hfi1_devdata *dd = dd_from_dev(dev);
+       struct hfi1_temp temp;
+       int ret = -ENXIO;
+
+       ret = hfi1_tempsense_rd(dd, &temp);
+       if (!ret) {
+               int idx = 0;
+
+               idx += temp2str(temp.curr, buf, PAGE_SIZE, idx);
+               idx += temp2str(temp.lo_lim, buf, PAGE_SIZE, idx);
+               idx += temp2str(temp.hi_lim, buf, PAGE_SIZE, idx);
+               idx += temp2str(temp.crit_lim, buf, PAGE_SIZE, idx);
+               idx += scnprintf(buf + idx, PAGE_SIZE - idx,
+                               "%u %u %u\n", temp.triggers & 0x1,
+                               temp.triggers & 0x2, temp.triggers & 0x4);
+               ret = idx;
+       }
+       return ret;
+}
+
+/*
+ * end of per-unit (or driver, in some cases, but replicated
+ * per unit) functions
+ */
+
+/* start of per-unit file structures and support code */
+static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
+static DEVICE_ATTR(board_id, S_IRUGO, show_hfi, NULL);
+static DEVICE_ATTR(nctxts, S_IRUGO, show_nctxts, NULL);
+static DEVICE_ATTR(nfreectxts, S_IRUGO, show_nfreectxts, NULL);
+static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL);
+static DEVICE_ATTR(boardversion, S_IRUGO, show_boardversion, NULL);
+static DEVICE_ATTR(tempsense, S_IRUGO, show_tempsense, NULL);
+static DEVICE_ATTR(chip_reset, S_IWUSR, NULL, store_chip_reset);
+
+static struct device_attribute *hfi1_attributes[] = {
+       &dev_attr_hw_rev,
+       &dev_attr_board_id,
+       &dev_attr_nctxts,
+       &dev_attr_nfreectxts,
+       &dev_attr_serial,
+       &dev_attr_boardversion,
+       &dev_attr_tempsense,
+       &dev_attr_chip_reset,
+};
+
+int hfi1_create_port_files(struct ib_device *ibdev, u8 port_num,
+                          struct kobject *kobj)
+{
+       struct hfi1_pportdata *ppd;
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       int ret;
+
+       if (!port_num || port_num > dd->num_pports) {
+               dd_dev_err(dd,
+                       "Skipping infiniband class with invalid port %u\n",
+                       port_num);
+               return -ENODEV;
+       }
+       ppd = &dd->pport[port_num - 1];
+
+       ret = kobject_init_and_add(&ppd->sc2vl_kobj, &hfi1_sc2vl_ktype, kobj,
+                                  "sc2vl");
+       if (ret) {
+               dd_dev_err(dd,
+                          "Skipping sc2vl sysfs info, (err %d) port %u\n",
+                          ret, port_num);
+               goto bail;
+       }
+       kobject_uevent(&ppd->sc2vl_kobj, KOBJ_ADD);
+
+       ret = kobject_init_and_add(&ppd->sl2sc_kobj, &hfi1_sl2sc_ktype, kobj,
+                                  "sl2sc");
+       if (ret) {
+               dd_dev_err(dd,
+                          "Skipping sl2sc sysfs info, (err %d) port %u\n",
+                          ret, port_num);
+               goto bail_sc2vl;
+       }
+       kobject_uevent(&ppd->sl2sc_kobj, KOBJ_ADD);
+
+       ret = kobject_init_and_add(&ppd->vl2mtu_kobj, &hfi1_vl2mtu_ktype, kobj,
+                                  "vl2mtu");
+       if (ret) {
+               dd_dev_err(dd,
+                          "Skipping vl2mtu sysfs info, (err %d) port %u\n",
+                          ret, port_num);
+               goto bail_sl2sc;
+       }
+       kobject_uevent(&ppd->vl2mtu_kobj, KOBJ_ADD);
+
+
+       ret = kobject_init_and_add(&ppd->pport_cc_kobj, &port_cc_ktype,
+                                  kobj, "CCMgtA");
+       if (ret) {
+               dd_dev_err(dd,
+                "Skipping Congestion Control sysfs info, (err %d) port %u\n",
+                ret, port_num);
+               goto bail_vl2mtu;
+       }
+
+       kobject_uevent(&ppd->pport_cc_kobj, KOBJ_ADD);
+
+       ret = sysfs_create_bin_file(&ppd->pport_cc_kobj,
+                               &cc_setting_bin_attr);
+       if (ret) {
+               dd_dev_err(dd,
+                "Skipping Congestion Control setting sysfs info, (err %d) port %u\n",
+                ret, port_num);
+               goto bail_cc;
+       }
+
+       ret = sysfs_create_bin_file(&ppd->pport_cc_kobj,
+                               &cc_table_bin_attr);
+       if (ret) {
+               dd_dev_err(dd,
+                "Skipping Congestion Control table sysfs info, (err %d) port %u\n",
+                ret, port_num);
+               goto bail_cc_entry_bin;
+       }
+
+       dd_dev_info(dd,
+               "IB%u: Congestion Control Agent enabled for port %d\n",
+               dd->unit, port_num);
+
+       return 0;
+
+bail_cc_entry_bin:
+       sysfs_remove_bin_file(&ppd->pport_cc_kobj,
+                             &cc_setting_bin_attr);
+bail_cc:
+       kobject_put(&ppd->pport_cc_kobj);
+bail_vl2mtu:
+       kobject_put(&ppd->vl2mtu_kobj);
+bail_sl2sc:
+       kobject_put(&ppd->sl2sc_kobj);
+bail_sc2vl:
+       kobject_put(&ppd->sc2vl_kobj);
+bail:
+       return ret;
+}
+
+/*
+ * Register and create our files in /sys/class/infiniband.
+ */
+int hfi1_verbs_register_sysfs(struct hfi1_devdata *dd)
+{
+       struct ib_device *dev = &dd->verbs_dev.ibdev;
+       int i, ret;
+
+       for (i = 0; i < ARRAY_SIZE(hfi1_attributes); ++i) {
+               ret = device_create_file(&dev->dev, hfi1_attributes[i]);
+               if (ret)
+                       goto bail;
+       }
+
+       return 0;
+bail:
+       for (i = 0; i < ARRAY_SIZE(hfi1_attributes); ++i)
+               device_remove_file(&dev->dev, hfi1_attributes[i]);
+       return ret;
+}
+
+/*
+ * Unregister and remove our files in /sys/class/infiniband.
+ */
+void hfi1_verbs_unregister_sysfs(struct hfi1_devdata *dd)
+{
+       struct hfi1_pportdata *ppd;
+       int i;
+
+       for (i = 0; i < dd->num_pports; i++) {
+               ppd = &dd->pport[i];
+
+               sysfs_remove_bin_file(&ppd->pport_cc_kobj,
+                                     &cc_setting_bin_attr);
+               sysfs_remove_bin_file(&ppd->pport_cc_kobj,
+                                     &cc_table_bin_attr);
+               kobject_put(&ppd->pport_cc_kobj);
+               kobject_put(&ppd->vl2mtu_kobj);
+               kobject_put(&ppd->sl2sc_kobj);
+               kobject_put(&ppd->sc2vl_kobj);
+       }
+}
diff --git a/drivers/staging/rdma/hfi1/trace.c b/drivers/staging/rdma/hfi1/trace.c
new file mode 100644 (file)
index 0000000..70ad7b9
--- /dev/null
@@ -0,0 +1,221 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
+u8 ibhdr_exhdr_len(struct hfi1_ib_header *hdr)
+{
+       struct hfi1_other_headers *ohdr;
+       u8 opcode;
+       u8 lnh = (u8)(be16_to_cpu(hdr->lrh[0]) & 3);
+
+       if (lnh == HFI1_LRH_BTH)
+               ohdr = &hdr->u.oth;
+       else
+               ohdr = &hdr->u.l.oth;
+       opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
+       return hdr_len_by_opcode[opcode] == 0 ?
+              0 : hdr_len_by_opcode[opcode] - (12 + 8);
+}
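ibhdr_exhdr_len() reports only the extension-header bytes: the hdr_len_by_opcode[] table (defined elsewhere in the driver) covers LRH + BTH + extension headers, and the 8-byte LRH plus 12-byte BTH are subtracted back out. For example, assuming an opcode whose table entry is 28 bytes, the function returns 28 - (12 + 8) = 8; an opcode with no table entry returns 0.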
+
+#define IMM_PRN  "imm %d"
+#define RETH_PRN "reth vaddr 0x%.16llx rkey 0x%.8x dlen 0x%.8x"
+#define AETH_PRN "aeth syn 0x%.2x msn 0x%.8x"
+#define DETH_PRN "deth qkey 0x%.8x sqpn 0x%.6x"
+#define ATOMICACKETH_PRN "origdata %lld"
+#define ATOMICETH_PRN "vaddr 0x%llx rkey 0x%.8x sdata %lld cdata %lld"
+
+#define OP(transport, op) IB_OPCODE_## transport ## _ ## op
+
+static u64 ib_u64_get(__be32 *p)
+{
+       return ((u64)be32_to_cpu(p[0]) << 32) | be32_to_cpu(p[1]);
+}
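ib_u64_get() stitches a 64-bit wire field back together from two big-endian 32-bit words. A quick check of the arithmetic, with made-up values:

    /* illustrative only: p[0] carries the high word, p[1] the low word */
    __be32 p[2] = { cpu_to_be32(0x00000001), cpu_to_be32(0xdeadbeef) };
    /* ((u64)0x00000001 << 32) | 0xdeadbeef == 0x00000001deadbeef */
    u64 v = ib_u64_get(p);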
+
+const char *parse_everbs_hdrs(
+       struct trace_seq *p,
+       u8 opcode,
+       void *ehdrs)
+{
+       union ib_ehdrs *eh = ehdrs;
+       const char *ret = trace_seq_buffer_ptr(p);
+
+       switch (opcode) {
+       /* imm */
+       case OP(RC, SEND_LAST_WITH_IMMEDIATE):
+       case OP(UC, SEND_LAST_WITH_IMMEDIATE):
+       case OP(RC, SEND_ONLY_WITH_IMMEDIATE):
+       case OP(UC, SEND_ONLY_WITH_IMMEDIATE):
+       case OP(RC, RDMA_WRITE_LAST_WITH_IMMEDIATE):
+       case OP(UC, RDMA_WRITE_LAST_WITH_IMMEDIATE):
+               trace_seq_printf(p, IMM_PRN,
+                       be32_to_cpu(eh->imm_data));
+               break;
+       /* reth + imm */
+       case OP(RC, RDMA_WRITE_ONLY_WITH_IMMEDIATE):
+       case OP(UC, RDMA_WRITE_ONLY_WITH_IMMEDIATE):
+               trace_seq_printf(p, RETH_PRN " " IMM_PRN,
+                       (unsigned long long)ib_u64_get(
+                               (__be32 *)&eh->rc.reth.vaddr),
+                       be32_to_cpu(eh->rc.reth.rkey),
+                       be32_to_cpu(eh->rc.reth.length),
+                       be32_to_cpu(eh->rc.imm_data));
+               break;
+       /* reth */
+       case OP(RC, RDMA_READ_REQUEST):
+       case OP(RC, RDMA_WRITE_FIRST):
+       case OP(UC, RDMA_WRITE_FIRST):
+       case OP(RC, RDMA_WRITE_ONLY):
+       case OP(UC, RDMA_WRITE_ONLY):
+               trace_seq_printf(p, RETH_PRN,
+                       (unsigned long long)ib_u64_get(
+                               (__be32 *)&eh->rc.reth.vaddr),
+                       be32_to_cpu(eh->rc.reth.rkey),
+                       be32_to_cpu(eh->rc.reth.length));
+               break;
+       case OP(RC, RDMA_READ_RESPONSE_FIRST):
+       case OP(RC, RDMA_READ_RESPONSE_LAST):
+       case OP(RC, RDMA_READ_RESPONSE_ONLY):
+       case OP(RC, ACKNOWLEDGE):
+               trace_seq_printf(p, AETH_PRN,
+                       be32_to_cpu(eh->aeth) >> 24,
+                       be32_to_cpu(eh->aeth) & HFI1_QPN_MASK);
+               break;
+       /* aeth + atomicacketh */
+       case OP(RC, ATOMIC_ACKNOWLEDGE):
+               trace_seq_printf(p, AETH_PRN " " ATOMICACKETH_PRN,
+                       (be32_to_cpu(eh->at.aeth) >> 24) & 0xff,
+                       be32_to_cpu(eh->at.aeth) & HFI1_QPN_MASK,
+                       (unsigned long long)ib_u64_get(eh->at.atomic_ack_eth));
+               break;
+       /* atomiceth */
+       case OP(RC, COMPARE_SWAP):
+       case OP(RC, FETCH_ADD):
+               trace_seq_printf(p, ATOMICETH_PRN,
+                       (unsigned long long)ib_u64_get(eh->atomic_eth.vaddr),
+                       be32_to_cpu(eh->atomic_eth.rkey),
+                       (unsigned long long)ib_u64_get(
+                               (__be32 *)&eh->atomic_eth.swap_data),
+                       (unsigned long long)ib_u64_get(
+                               (__be32 *)&eh->atomic_eth.compare_data));
+               break;
+       /* deth */
+       case OP(UD, SEND_ONLY):
+       case OP(UD, SEND_ONLY_WITH_IMMEDIATE):
+               trace_seq_printf(p, DETH_PRN,
+                       be32_to_cpu(eh->ud.deth[0]),
+                       be32_to_cpu(eh->ud.deth[1]) & HFI1_QPN_MASK);
+               break;
+       }
+       trace_seq_putc(p, 0);
+       return ret;
+}
+
+const char *parse_sdma_flags(
+       struct trace_seq *p,
+       u64 desc0, u64 desc1)
+{
+       const char *ret = trace_seq_buffer_ptr(p);
+       char flags[5] = { 'x', 'x', 'x', 'x', 0 };
+
+       flags[0] = (desc1 & SDMA_DESC1_INT_REQ_FLAG) ? 'I' : '-';
+       flags[1] = (desc1 & SDMA_DESC1_HEAD_TO_HOST_FLAG) ?  'H' : '-';
+       flags[2] = (desc0 & SDMA_DESC0_FIRST_DESC_FLAG) ? 'F' : '-';
+       flags[3] = (desc0 & SDMA_DESC0_LAST_DESC_FLAG) ? 'L' : '-';
+       trace_seq_printf(p, "%s", flags);
+       if (desc0 & SDMA_DESC0_FIRST_DESC_FLAG)
+               trace_seq_printf(p, " amode:%u aidx:%u alen:%u",
+                       (u8)((desc1 >> SDMA_DESC1_HEADER_MODE_SHIFT)
+                               & SDMA_DESC1_HEADER_MODE_MASK),
+                       (u8)((desc1 >> SDMA_DESC1_HEADER_INDEX_SHIFT)
+                               & SDMA_DESC1_HEADER_INDEX_MASK),
+                       (u8)((desc1 >> SDMA_DESC1_HEADER_DWS_SHIFT)
+                               & SDMA_DESC1_HEADER_DWS_MASK));
+       return ret;
+}
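parse_sdma_flags() renders the four descriptor flag bits as a fixed-width string, one character per flag ('I'nterrupt request, 'H'ead-to-host, 'F'irst, 'L'ast, with '-' for a clear bit), and appends the AHG mode/index/length fields only for first descriptors. For instance, a first descriptor with only the interrupt-request bit set would print as "I-F-" followed by the amode/aidx/alen values.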
+
+const char *print_u32_array(
+       struct trace_seq *p,
+       u32 *arr, int len)
+{
+       int i;
+       const char *ret = trace_seq_buffer_ptr(p);
+
+       for (i = 0; i < len ; i++)
+               trace_seq_printf(p, "%s%#x", i == 0 ? "" : " ", arr[i]);
+       trace_seq_putc(p, 0);
+       return ret;
+}
+
+const char *print_u64_array(
+       struct trace_seq *p,
+       u64 *arr, int len)
+{
+       int i;
+       const char *ret = trace_seq_buffer_ptr(p);
+
+       for (i = 0; i < len; i++)
+               trace_seq_printf(p, "%s0x%016llx", i == 0 ? "" : " ", arr[i]);
+       trace_seq_putc(p, 0);
+       return ret;
+}
+
+__hfi1_trace_fn(PKT);
+__hfi1_trace_fn(PROC);
+__hfi1_trace_fn(SDMA);
+__hfi1_trace_fn(LINKVERB);
+__hfi1_trace_fn(DEBUG);
+__hfi1_trace_fn(SNOOP);
+__hfi1_trace_fn(CNTR);
+__hfi1_trace_fn(PIO);
+__hfi1_trace_fn(DC8051);
+__hfi1_trace_fn(FIRMWARE);
+__hfi1_trace_fn(RCVCTRL);
+__hfi1_trace_fn(TID);
diff --git a/drivers/staging/rdma/hfi1/trace.h b/drivers/staging/rdma/hfi1/trace.h
new file mode 100644 (file)
index 0000000..d7851c0
--- /dev/null
@@ -0,0 +1,1409 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#undef TRACE_SYSTEM_VAR
+#define TRACE_SYSTEM_VAR hfi1
+
+#if !defined(__HFI1_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __HFI1_TRACE_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+
+#include "hfi.h"
+#include "mad.h"
+#include "sdma.h"
+
+#define DD_DEV_ENTRY(dd)       __string(dev, dev_name(&(dd)->pcidev->dev))
+#define DD_DEV_ASSIGN(dd)      __assign_str(dev, dev_name(&(dd)->pcidev->dev))
+
+#define packettype_name(etype) { RHF_RCV_TYPE_##etype, #etype }
+#define show_packettype(etype)                  \
+__print_symbolic(etype,                         \
+       packettype_name(EXPECTED),              \
+       packettype_name(EAGER),                 \
+       packettype_name(IB),                    \
+       packettype_name(ERROR),                 \
+       packettype_name(BYPASS))
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_rx
+
+TRACE_EVENT(hfi1_rcvhdr,
+       TP_PROTO(struct hfi1_devdata *dd,
+                u64 eflags,
+                u32 ctxt,
+                u32 etype,
+                u32 hlen,
+                u32 tlen,
+                u32 updegr,
+                u32 etail),
+       TP_ARGS(dd, eflags, ctxt, etype, hlen, tlen, updegr, etail),
+       TP_STRUCT__entry(
+               DD_DEV_ENTRY(dd)
+               __field(u64, eflags)
+               __field(u32, ctxt)
+               __field(u32, etype)
+               __field(u32, hlen)
+               __field(u32, tlen)
+               __field(u32, updegr)
+               __field(u32, etail)
+       ),
+       TP_fast_assign(
+               DD_DEV_ASSIGN(dd);
+               __entry->eflags = eflags;
+               __entry->ctxt = ctxt;
+               __entry->etype = etype;
+               __entry->hlen = hlen;
+               __entry->tlen = tlen;
+               __entry->updegr = updegr;
+               __entry->etail = etail;
+       ),
+       TP_printk(
+"[%s] ctxt %d eflags 0x%llx etype %d,%s hlen %d tlen %d updegr %d etail %d",
+               __get_str(dev),
+               __entry->ctxt,
+               __entry->eflags,
+               __entry->etype, show_packettype(__entry->etype),
+               __entry->hlen,
+               __entry->tlen,
+               __entry->updegr,
+               __entry->etail
+       )
+);
+
+TRACE_EVENT(hfi1_receive_interrupt,
+       TP_PROTO(struct hfi1_devdata *dd, u32 ctxt),
+       TP_ARGS(dd, ctxt),
+       TP_STRUCT__entry(
+               DD_DEV_ENTRY(dd)
+               __field(u32, ctxt)
+               __field(u8, slow_path)
+               __field(u8, dma_rtail)
+       ),
+       TP_fast_assign(
+               DD_DEV_ASSIGN(dd);
+               __entry->ctxt = ctxt;
+               if (dd->rcd[ctxt]->do_interrupt ==
+                   &handle_receive_interrupt) {
+                       __entry->slow_path = 1;
+                       __entry->dma_rtail = 0xFF;
+               } else if (dd->rcd[ctxt]->do_interrupt ==
+                       &handle_receive_interrupt_dma_rtail) {
+                       __entry->dma_rtail = 1;
+                       __entry->slow_path = 0;
+               } else if (dd->rcd[ctxt]->do_interrupt ==
+                        &handle_receive_interrupt_nodma_rtail) {
+                       __entry->dma_rtail = 0;
+                       __entry->slow_path = 0;
+               }
+       ),
+       TP_printk(
+               "[%s] ctxt %d SlowPath: %d DmaRtail: %d",
+               __get_str(dev),
+               __entry->ctxt,
+               __entry->slow_path,
+               __entry->dma_rtail
+       )
+);
+
+const char *print_u64_array(struct trace_seq *, u64 *, int);
+
+TRACE_EVENT(hfi1_exp_tid_map,
+           TP_PROTO(unsigned ctxt, u16 subctxt, int dir,
+                    unsigned long *maps, u16 count),
+           TP_ARGS(ctxt, subctxt, dir, maps, count),
+           TP_STRUCT__entry(
+                   __field(unsigned, ctxt)
+                   __field(u16, subctxt)
+                   __field(int, dir)
+                   __field(u16, count)
+                   __dynamic_array(unsigned long, maps, sizeof(*maps) * count)
+                   ),
+           TP_fast_assign(
+                   __entry->ctxt = ctxt;
+                   __entry->subctxt = subctxt;
+                   __entry->dir = dir;
+                   __entry->count = count;
+                   memcpy(__get_dynamic_array(maps), maps,
+                          sizeof(*maps) * count);
+                   ),
+           TP_printk("[%3u:%02u] %s tidmaps %s",
+                     __entry->ctxt,
+                     __entry->subctxt,
+                     (__entry->dir ? ">" : "<"),
+                     print_u64_array(p, __get_dynamic_array(maps),
+                                     __entry->count)
+                   )
+       );
+
+TRACE_EVENT(hfi1_exp_rcv_set,
+           TP_PROTO(unsigned ctxt, u16 subctxt, u32 tid,
+                    unsigned long vaddr, u64 phys_addr, void *page),
+           TP_ARGS(ctxt, subctxt, tid, vaddr, phys_addr, page),
+           TP_STRUCT__entry(
+                   __field(unsigned, ctxt)
+                   __field(u16, subctxt)
+                   __field(u32, tid)
+                   __field(unsigned long, vaddr)
+                   __field(u64, phys_addr)
+                   __field(void *, page)
+                   ),
+           TP_fast_assign(
+                   __entry->ctxt = ctxt;
+                   __entry->subctxt = subctxt;
+                   __entry->tid = tid;
+                   __entry->vaddr = vaddr;
+                   __entry->phys_addr = phys_addr;
+                   __entry->page = page;
+                   ),
+           TP_printk("[%u:%u] TID %u, vaddrs 0x%lx, physaddr 0x%llx, pgp %p",
+                     __entry->ctxt,
+                     __entry->subctxt,
+                     __entry->tid,
+                     __entry->vaddr,
+                     __entry->phys_addr,
+                     __entry->page
+                   )
+       );
+
+TRACE_EVENT(hfi1_exp_rcv_free,
+           TP_PROTO(unsigned ctxt, u16 subctxt, u32 tid,
+                    unsigned long phys, void *page),
+           TP_ARGS(ctxt, subctxt, tid, phys, page),
+           TP_STRUCT__entry(
+                   __field(unsigned, ctxt)
+                   __field(u16, subctxt)
+                   __field(u32, tid)
+                   __field(unsigned long, phys)
+                   __field(void *, page)
+                   ),
+           TP_fast_assign(
+                   __entry->ctxt = ctxt;
+                   __entry->subctxt = subctxt;
+                   __entry->tid = tid;
+                   __entry->phys = phys;
+                   __entry->page = page;
+                   ),
+           TP_printk("[%u:%u] freeing TID %u, 0x%lx, pgp %p",
+                     __entry->ctxt,
+                     __entry->subctxt,
+                     __entry->tid,
+                     __entry->phys,
+                     __entry->page
+                   )
+       );
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_tx
+
+TRACE_EVENT(hfi1_piofree,
+       TP_PROTO(struct send_context *sc, int extra),
+       TP_ARGS(sc, extra),
+       TP_STRUCT__entry(
+               DD_DEV_ENTRY(sc->dd)
+               __field(u32, sw_index)
+               __field(u32, hw_context)
+               __field(int, extra)
+       ),
+       TP_fast_assign(
+               DD_DEV_ASSIGN(sc->dd);
+               __entry->sw_index = sc->sw_index;
+               __entry->hw_context = sc->hw_context;
+               __entry->extra = extra;
+       ),
+       TP_printk(
+               "[%s] ctxt %u(%u) extra %d",
+               __get_str(dev),
+               __entry->sw_index,
+               __entry->hw_context,
+               __entry->extra
+       )
+);
+
+TRACE_EVENT(hfi1_wantpiointr,
+       TP_PROTO(struct send_context *sc, u32 needint, u64 credit_ctrl),
+       TP_ARGS(sc, needint, credit_ctrl),
+       TP_STRUCT__entry(
+               DD_DEV_ENTRY(sc->dd)
+               __field(u32, sw_index)
+               __field(u32, hw_context)
+               __field(u32, needint)
+               __field(u64, credit_ctrl)
+       ),
+       TP_fast_assign(
+               DD_DEV_ASSIGN(sc->dd);
+               __entry->sw_index = sc->sw_index;
+               __entry->hw_context = sc->hw_context;
+               __entry->needint = needint;
+               __entry->credit_ctrl = credit_ctrl;
+       ),
+       TP_printk(
+               "[%s] ctxt %u(%u) on %d credit_ctrl 0x%llx",
+               __get_str(dev),
+               __entry->sw_index,
+               __entry->hw_context,
+               __entry->needint,
+               (unsigned long long)__entry->credit_ctrl
+       )
+);
+
+DECLARE_EVENT_CLASS(hfi1_qpsleepwakeup_template,
+       TP_PROTO(struct hfi1_qp *qp, u32 flags),
+       TP_ARGS(qp, flags),
+       TP_STRUCT__entry(
+               DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+               __field(u32, qpn)
+               __field(u32, flags)
+               __field(u32, s_flags)
+       ),
+       TP_fast_assign(
+               DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
+               __entry->flags = flags;
+               __entry->qpn = qp->ibqp.qp_num;
+               __entry->s_flags = qp->s_flags;
+       ),
+       TP_printk(
+               "[%s] qpn 0x%x flags 0x%x s_flags 0x%x",
+               __get_str(dev),
+               __entry->qpn,
+               __entry->flags,
+               __entry->s_flags
+       )
+);
+
+DEFINE_EVENT(hfi1_qpsleepwakeup_template, hfi1_qpwakeup,
+            TP_PROTO(struct hfi1_qp *qp, u32 flags),
+            TP_ARGS(qp, flags));
+
+DEFINE_EVENT(hfi1_qpsleepwakeup_template, hfi1_qpsleep,
+            TP_PROTO(struct hfi1_qp *qp, u32 flags),
+            TP_ARGS(qp, flags));
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_qphash
+DECLARE_EVENT_CLASS(hfi1_qphash_template,
+       TP_PROTO(struct hfi1_qp *qp, u32 bucket),
+       TP_ARGS(qp, bucket),
+       TP_STRUCT__entry(
+               DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+               __field(u32, qpn)
+               __field(u32, bucket)
+       ),
+       TP_fast_assign(
+               DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
+               __entry->qpn = qp->ibqp.qp_num;
+               __entry->bucket = bucket;
+       ),
+       TP_printk(
+               "[%s] qpn 0x%x bucket %u",
+               __get_str(dev),
+               __entry->qpn,
+               __entry->bucket
+       )
+);
+
+DEFINE_EVENT(hfi1_qphash_template, hfi1_qpinsert,
+       TP_PROTO(struct hfi1_qp *qp, u32 bucket),
+       TP_ARGS(qp, bucket));
+
+DEFINE_EVENT(hfi1_qphash_template, hfi1_qpremove,
+       TP_PROTO(struct hfi1_qp *qp, u32 bucket),
+       TP_ARGS(qp, bucket));
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_ibhdrs
+
+u8 ibhdr_exhdr_len(struct hfi1_ib_header *hdr);
+const char *parse_everbs_hdrs(
+       struct trace_seq *p,
+       u8 opcode,
+       void *ehdrs);
+
+#define __parse_ib_ehdrs(op, ehdrs) parse_everbs_hdrs(p, op, ehdrs)
+
+const char *parse_sdma_flags(
+       struct trace_seq *p,
+       u64 desc0, u64 desc1);
+
+#define __parse_sdma_flags(desc0, desc1) parse_sdma_flags(p, desc0, desc1)
+
+
+#define lrh_name(lrh) { HFI1_##lrh, #lrh }
+#define show_lnh(lrh)                    \
+__print_symbolic(lrh,                    \
+       lrh_name(LRH_BTH),               \
+       lrh_name(LRH_GRH))
+
+#define ib_opcode_name(opcode) { IB_OPCODE_##opcode, #opcode  }
+#define show_ib_opcode(opcode)                             \
+__print_symbolic(opcode,                                   \
+       ib_opcode_name(RC_SEND_FIRST),                     \
+       ib_opcode_name(RC_SEND_MIDDLE),                    \
+       ib_opcode_name(RC_SEND_LAST),                      \
+       ib_opcode_name(RC_SEND_LAST_WITH_IMMEDIATE),       \
+       ib_opcode_name(RC_SEND_ONLY),                      \
+       ib_opcode_name(RC_SEND_ONLY_WITH_IMMEDIATE),       \
+       ib_opcode_name(RC_RDMA_WRITE_FIRST),               \
+       ib_opcode_name(RC_RDMA_WRITE_MIDDLE),              \
+       ib_opcode_name(RC_RDMA_WRITE_LAST),                \
+       ib_opcode_name(RC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \
+       ib_opcode_name(RC_RDMA_WRITE_ONLY),                \
+       ib_opcode_name(RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \
+       ib_opcode_name(RC_RDMA_READ_REQUEST),              \
+       ib_opcode_name(RC_RDMA_READ_RESPONSE_FIRST),       \
+       ib_opcode_name(RC_RDMA_READ_RESPONSE_MIDDLE),      \
+       ib_opcode_name(RC_RDMA_READ_RESPONSE_LAST),        \
+       ib_opcode_name(RC_RDMA_READ_RESPONSE_ONLY),        \
+       ib_opcode_name(RC_ACKNOWLEDGE),                    \
+       ib_opcode_name(RC_ATOMIC_ACKNOWLEDGE),             \
+       ib_opcode_name(RC_COMPARE_SWAP),                   \
+       ib_opcode_name(RC_FETCH_ADD),                      \
+       ib_opcode_name(UC_SEND_FIRST),                     \
+       ib_opcode_name(UC_SEND_MIDDLE),                    \
+       ib_opcode_name(UC_SEND_LAST),                      \
+       ib_opcode_name(UC_SEND_LAST_WITH_IMMEDIATE),       \
+       ib_opcode_name(UC_SEND_ONLY),                      \
+       ib_opcode_name(UC_SEND_ONLY_WITH_IMMEDIATE),       \
+       ib_opcode_name(UC_RDMA_WRITE_FIRST),               \
+       ib_opcode_name(UC_RDMA_WRITE_MIDDLE),              \
+       ib_opcode_name(UC_RDMA_WRITE_LAST),                \
+       ib_opcode_name(UC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \
+       ib_opcode_name(UC_RDMA_WRITE_ONLY),                \
+       ib_opcode_name(UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \
+       ib_opcode_name(UD_SEND_ONLY),                      \
+       ib_opcode_name(UD_SEND_ONLY_WITH_IMMEDIATE))
+
+
+#define LRH_PRN "vl %d lver %d sl %d lnh %d,%s dlid %.4x len %d slid %.4x"
+#define BTH_PRN \
+       "op 0x%.2x,%s se %d m %d pad %d tver %d pkey 0x%.4x " \
+       "f %d b %d qpn 0x%.6x a %d psn 0x%.8x"
+#define EHDR_PRN "%s"
+
+DECLARE_EVENT_CLASS(hfi1_ibhdr_template,
+       TP_PROTO(struct hfi1_devdata *dd,
+                struct hfi1_ib_header *hdr),
+       TP_ARGS(dd, hdr),
+       TP_STRUCT__entry(
+               DD_DEV_ENTRY(dd)
+               /* LRH */
+               __field(u8, vl)
+               __field(u8, lver)
+               __field(u8, sl)
+               __field(u8, lnh)
+               __field(u16, dlid)
+               __field(u16, len)
+               __field(u16, slid)
+               /* BTH */
+               __field(u8, opcode)
+               __field(u8, se)
+               __field(u8, m)
+               __field(u8, pad)
+               __field(u8, tver)
+               __field(u16, pkey)
+               __field(u8, f)
+               __field(u8, b)
+               __field(u32, qpn)
+               __field(u8, a)
+               __field(u32, psn)
+               /* extended headers */
+               __dynamic_array(u8, ehdrs, ibhdr_exhdr_len(hdr))
+       ),
+       TP_fast_assign(
+               struct hfi1_other_headers *ohdr;
+
+               DD_DEV_ASSIGN(dd);
+               /* LRH */
+               __entry->vl =
+                       (u8)(be16_to_cpu(hdr->lrh[0]) >> 12);
+               __entry->lver =
+                       (u8)(be16_to_cpu(hdr->lrh[0]) >> 8) & 0xf;
+               __entry->sl =
+                       (u8)(be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf;
+               __entry->lnh =
+                       (u8)(be16_to_cpu(hdr->lrh[0]) & 3);
+               __entry->dlid =
+                       be16_to_cpu(hdr->lrh[1]);
+               /* allow for larger len */
+               __entry->len =
+                       be16_to_cpu(hdr->lrh[2]);
+               __entry->slid =
+                       be16_to_cpu(hdr->lrh[3]);
+               /* BTH */
+               if (__entry->lnh == HFI1_LRH_BTH)
+                       ohdr = &hdr->u.oth;
+               else
+                       ohdr = &hdr->u.l.oth;
+               __entry->opcode =
+                       (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
+               __entry->se =
+                       (be32_to_cpu(ohdr->bth[0]) >> 23) & 1;
+               __entry->m =
+                        (be32_to_cpu(ohdr->bth[0]) >> 22) & 1;
+               __entry->pad =
+                       (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+               __entry->tver =
+                       (be32_to_cpu(ohdr->bth[0]) >> 16) & 0xf;
+               __entry->pkey =
+                       be32_to_cpu(ohdr->bth[0]) & 0xffff;
+               __entry->f =
+                       (be32_to_cpu(ohdr->bth[1]) >> HFI1_FECN_SHIFT)
+                       & HFI1_FECN_MASK;
+               __entry->b =
+                       (be32_to_cpu(ohdr->bth[1]) >> HFI1_BECN_SHIFT)
+                       & HFI1_BECN_MASK;
+               __entry->qpn =
+                       be32_to_cpu(ohdr->bth[1]) & HFI1_QPN_MASK;
+               __entry->a =
+                       (be32_to_cpu(ohdr->bth[2]) >> 31) & 1;
+               /* allow for larger PSN */
+               __entry->psn =
+                       be32_to_cpu(ohdr->bth[2]) & 0x7fffffff;
+               /* extended headers */
+                memcpy(
+                       __get_dynamic_array(ehdrs),
+                       &ohdr->u,
+                       ibhdr_exhdr_len(hdr));
+       ),
+       TP_printk("[%s] " LRH_PRN " " BTH_PRN " " EHDR_PRN,
+               __get_str(dev),
+               /* LRH */
+               __entry->vl,
+               __entry->lver,
+               __entry->sl,
+               __entry->lnh, show_lnh(__entry->lnh),
+               __entry->dlid,
+               __entry->len,
+               __entry->slid,
+               /* BTH */
+               __entry->opcode, show_ib_opcode(__entry->opcode),
+               __entry->se,
+               __entry->m,
+               __entry->pad,
+               __entry->tver,
+               __entry->pkey,
+               __entry->f,
+               __entry->b,
+               __entry->qpn,
+               __entry->a,
+               __entry->psn,
+               /* extended headers */
+               __parse_ib_ehdrs(
+                       __entry->opcode,
+                       (void *)__get_dynamic_array(ehdrs))
+       )
+);
+
+DEFINE_EVENT(hfi1_ibhdr_template, input_ibhdr,
+            TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr),
+            TP_ARGS(dd, hdr));
+
+DEFINE_EVENT(hfi1_ibhdr_template, output_ibhdr,
+            TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr),
+            TP_ARGS(dd, hdr));
+
+#define SNOOP_PRN \
+       "slid %.4x dlid %.4x qpn 0x%.6x opcode 0x%.2x,%s " \
+       "svc lvl %d pkey 0x%.4x [header = %d bytes] [data = %d bytes]"
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_snoop
+
+
+TRACE_EVENT(snoop_capture,
+       TP_PROTO(struct hfi1_devdata *dd,
+                int hdr_len,
+                struct hfi1_ib_header *hdr,
+                int data_len,
+                void *data),
+       TP_ARGS(dd, hdr_len, hdr, data_len, data),
+       TP_STRUCT__entry(
+               DD_DEV_ENTRY(dd)
+               __field(u16, slid)
+               __field(u16, dlid)
+               __field(u32, qpn)
+               __field(u8, opcode)
+               __field(u8, sl)
+               __field(u16, pkey)
+               __field(u32, hdr_len)
+               __field(u32, data_len)
+               __field(u8, lnh)
+               __dynamic_array(u8, raw_hdr, hdr_len)
+               __dynamic_array(u8, raw_pkt, data_len)
+       ),
+       TP_fast_assign(
+               struct hfi1_other_headers *ohdr;
+
+               __entry->lnh = (u8)(be16_to_cpu(hdr->lrh[0]) & 3);
+               if (__entry->lnh == HFI1_LRH_BTH)
+                       ohdr = &hdr->u.oth;
+               else
+                       ohdr = &hdr->u.l.oth;
+               DD_DEV_ASSIGN(dd);
+               __entry->slid = be16_to_cpu(hdr->lrh[3]);
+               __entry->dlid = be16_to_cpu(hdr->lrh[1]);
+               __entry->qpn = be32_to_cpu(ohdr->bth[1]) & HFI1_QPN_MASK;
+               __entry->opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
+               __entry->sl = (u8)(be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf;
+               __entry->pkey = be32_to_cpu(ohdr->bth[0]) & 0xffff;
+               __entry->hdr_len = hdr_len;
+               __entry->data_len = data_len;
+               memcpy(__get_dynamic_array(raw_hdr), hdr, hdr_len);
+               memcpy(__get_dynamic_array(raw_pkt), data, data_len);
+       ),
+       TP_printk("[%s] " SNOOP_PRN,
+               __get_str(dev),
+               __entry->slid,
+               __entry->dlid,
+               __entry->qpn,
+               __entry->opcode,
+               show_ib_opcode(__entry->opcode),
+               __entry->sl,
+               __entry->pkey,
+               __entry->hdr_len,
+               __entry->data_len
+       )
+);
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_ctxts
+
+#define UCTXT_FMT \
+       "cred:%u, credaddr:0x%llx, piobase:0x%llx, rcvhdr_cnt:%u, "     \
+       "rcvbase:0x%llx, rcvegrc:%u, rcvegrb:0x%llx"
+TRACE_EVENT(hfi1_uctxtdata,
+           TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ctxtdata *uctxt),
+           TP_ARGS(dd, uctxt),
+           TP_STRUCT__entry(
+                   DD_DEV_ENTRY(dd)
+                   __field(unsigned, ctxt)
+                   __field(u32, credits)
+                   __field(u64, hw_free)
+                   __field(u64, piobase)
+                   __field(u16, rcvhdrq_cnt)
+                   __field(u64, rcvhdrq_phys)
+                   __field(u32, eager_cnt)
+                   __field(u64, rcvegr_phys)
+                   ),
+           TP_fast_assign(
+                   DD_DEV_ASSIGN(dd);
+                   __entry->ctxt = uctxt->ctxt;
+                   __entry->credits = uctxt->sc->credits;
+                   __entry->hw_free = (u64)uctxt->sc->hw_free;
+                   __entry->piobase = (u64)uctxt->sc->base_addr;
+                   __entry->rcvhdrq_cnt = uctxt->rcvhdrq_cnt;
+                   __entry->rcvhdrq_phys = uctxt->rcvhdrq_phys;
+                   __entry->eager_cnt = uctxt->egrbufs.alloced;
+                   __entry->rcvegr_phys = uctxt->egrbufs.rcvtids[0].phys;
+                   ),
+           TP_printk(
+                   "[%s] ctxt %u " UCTXT_FMT,
+                   __get_str(dev),
+                   __entry->ctxt,
+                   __entry->credits,
+                   __entry->hw_free,
+                   __entry->piobase,
+                   __entry->rcvhdrq_cnt,
+                   __entry->rcvhdrq_phys,
+                   __entry->eager_cnt,
+                   __entry->rcvegr_phys
+                   )
+       );
+
+#define CINFO_FMT \
+       "egrtids:%u, egr_size:%u, hdrq_cnt:%u, hdrq_size:%u, sdma_ring_size:%u"
+TRACE_EVENT(hfi1_ctxt_info,
+           TP_PROTO(struct hfi1_devdata *dd, unsigned ctxt, unsigned subctxt,
+                    struct hfi1_ctxt_info cinfo),
+           TP_ARGS(dd, ctxt, subctxt, cinfo),
+           TP_STRUCT__entry(
+                   DD_DEV_ENTRY(dd)
+                   __field(unsigned, ctxt)
+                   __field(unsigned, subctxt)
+                   __field(u16, egrtids)
+                   __field(u16, rcvhdrq_cnt)
+                   __field(u16, rcvhdrq_size)
+                   __field(u16, sdma_ring_size)
+                   __field(u32, rcvegr_size)
+                   ),
+           TP_fast_assign(
+                   DD_DEV_ASSIGN(dd);
+                   __entry->ctxt = ctxt;
+                   __entry->subctxt = subctxt;
+                   __entry->egrtids = cinfo.egrtids;
+                   __entry->rcvhdrq_cnt = cinfo.rcvhdrq_cnt;
+                   __entry->rcvhdrq_size = cinfo.rcvhdrq_entsize;
+                   __entry->sdma_ring_size = cinfo.sdma_ring_size;
+                   __entry->rcvegr_size = cinfo.rcvegr_size;
+                   ),
+           TP_printk(
+                   "[%s] ctxt %u:%u " CINFO_FMT,
+                   __get_str(dev),
+                   __entry->ctxt,
+                   __entry->subctxt,
+                   __entry->egrtids,
+                   __entry->rcvegr_size,
+                   __entry->rcvhdrq_cnt,
+                   __entry->rcvhdrq_size,
+                   __entry->sdma_ring_size
+                   )
+       );
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_sma
+
+#define BCT_FORMAT \
+       "shared_limit %x vls 0-7 [%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x] 15 [%x,%x]"
+
+#define BCT(field) \
+       be16_to_cpu( \
+               ((struct buffer_control *)__get_dynamic_array(bct))->field \
+       )
+
+DECLARE_EVENT_CLASS(hfi1_bct_template,
+       TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc),
+       TP_ARGS(dd, bc),
+       TP_STRUCT__entry(
+               DD_DEV_ENTRY(dd)
+               __dynamic_array(u8, bct, sizeof(*bc))
+       ),
+       TP_fast_assign(
+               DD_DEV_ASSIGN(dd);
+               memcpy(
+                       __get_dynamic_array(bct),
+                       bc,
+                       sizeof(*bc));
+       ),
+       TP_printk(BCT_FORMAT,
+               BCT(overall_shared_limit),
+
+               BCT(vl[0].dedicated),
+               BCT(vl[0].shared),
+
+               BCT(vl[1].dedicated),
+               BCT(vl[1].shared),
+
+               BCT(vl[2].dedicated),
+               BCT(vl[2].shared),
+
+               BCT(vl[3].dedicated),
+               BCT(vl[3].shared),
+
+               BCT(vl[4].dedicated),
+               BCT(vl[4].shared),
+
+               BCT(vl[5].dedicated),
+               BCT(vl[5].shared),
+
+               BCT(vl[6].dedicated),
+               BCT(vl[6].shared),
+
+               BCT(vl[7].dedicated),
+               BCT(vl[7].shared),
+
+               BCT(vl[15].dedicated),
+               BCT(vl[15].shared)
+       )
+);
+
+
+DEFINE_EVENT(hfi1_bct_template, bct_set,
+            TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc),
+            TP_ARGS(dd, bc));
+
+DEFINE_EVENT(hfi1_bct_template, bct_get,
+            TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc),
+            TP_ARGS(dd, bc));
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_sdma
+
+TRACE_EVENT(hfi1_sdma_descriptor,
+       TP_PROTO(
+               struct sdma_engine *sde,
+               u64 desc0,
+               u64 desc1,
+               u16 e,
+               void *descp),
+       TP_ARGS(sde, desc0, desc1, e, descp),
+       TP_STRUCT__entry(
+               DD_DEV_ENTRY(sde->dd)
+               __field(void *, descp)
+               __field(u64, desc0)
+               __field(u64, desc1)
+               __field(u16, e)
+               __field(u8, idx)
+       ),
+       TP_fast_assign(
+               DD_DEV_ASSIGN(sde->dd);
+               __entry->desc0 = desc0;
+               __entry->desc1 = desc1;
+               __entry->idx = sde->this_idx;
+               __entry->descp = descp;
+               __entry->e = e;
+       ),
+       TP_printk(
+               "[%s] SDE(%u) flags:%s addr:0x%016llx gen:%u len:%u d0:%016llx d1:%016llx to %p,%u",
+               __get_str(dev),
+               __entry->idx,
+               __parse_sdma_flags(__entry->desc0, __entry->desc1),
+               (__entry->desc0 >> SDMA_DESC0_PHY_ADDR_SHIFT)
+                       & SDMA_DESC0_PHY_ADDR_MASK,
+               (u8)((__entry->desc1 >> SDMA_DESC1_GENERATION_SHIFT)
+                       & SDMA_DESC1_GENERATION_MASK),
+               (u16)((__entry->desc0 >> SDMA_DESC0_BYTE_COUNT_SHIFT)
+                       & SDMA_DESC0_BYTE_COUNT_MASK),
+               __entry->desc0,
+               __entry->desc1,
+               __entry->descp,
+               __entry->e
+       )
+);
+
+TRACE_EVENT(hfi1_sdma_engine_select,
+       TP_PROTO(struct hfi1_devdata *dd, u32 sel, u8 vl, u8 idx),
+       TP_ARGS(dd, sel, vl, idx),
+       TP_STRUCT__entry(
+               DD_DEV_ENTRY(dd)
+               __field(u32, sel)
+               __field(u8, vl)
+               __field(u8, idx)
+       ),
+       TP_fast_assign(
+               DD_DEV_ASSIGN(dd);
+               __entry->sel = sel;
+               __entry->vl = vl;
+               __entry->idx = idx;
+       ),
+       TP_printk(
+               "[%s] selecting SDE %u sel 0x%x vl %u",
+               __get_str(dev),
+               __entry->idx,
+               __entry->sel,
+               __entry->vl
+       )
+);
+
+DECLARE_EVENT_CLASS(hfi1_sdma_engine_class,
+       TP_PROTO(
+               struct sdma_engine *sde,
+               u64 status
+       ),
+       TP_ARGS(sde, status),
+       TP_STRUCT__entry(
+               DD_DEV_ENTRY(sde->dd)
+               __field(u64, status)
+               __field(u8, idx)
+       ),
+       TP_fast_assign(
+               DD_DEV_ASSIGN(sde->dd);
+               __entry->status = status;
+               __entry->idx = sde->this_idx;
+       ),
+       TP_printk(
+               "[%s] SDE(%u) status %llx",
+               __get_str(dev),
+               __entry->idx,
+               (unsigned long long)__entry->status
+       )
+);
+
+DEFINE_EVENT(hfi1_sdma_engine_class, hfi1_sdma_engine_interrupt,
+       TP_PROTO(
+               struct sdma_engine *sde,
+               u64 status
+       ),
+       TP_ARGS(sde, status)
+);
+
+DEFINE_EVENT(hfi1_sdma_engine_class, hfi1_sdma_engine_progress,
+       TP_PROTO(
+               struct sdma_engine *sde,
+               u64 status
+       ),
+       TP_ARGS(sde, status)
+);
+
+DECLARE_EVENT_CLASS(hfi1_sdma_ahg_ad,
+       TP_PROTO(
+               struct sdma_engine *sde,
+               int aidx
+       ),
+       TP_ARGS(sde, aidx),
+       TP_STRUCT__entry(
+               DD_DEV_ENTRY(sde->dd)
+               __field(int, aidx)
+               __field(u8, idx)
+       ),
+       TP_fast_assign(
+               DD_DEV_ASSIGN(sde->dd);
+               __entry->idx = sde->this_idx;
+               __entry->aidx = aidx;
+       ),
+       TP_printk(
+               "[%s] SDE(%u) aidx %d",
+               __get_str(dev),
+               __entry->idx,
+               __entry->aidx
+       )
+);
+
+DEFINE_EVENT(hfi1_sdma_ahg_ad, hfi1_ahg_allocate,
+            TP_PROTO(
+               struct sdma_engine *sde,
+               int aidx
+            ),
+            TP_ARGS(sde, aidx));
+
+DEFINE_EVENT(hfi1_sdma_ahg_ad, hfi1_ahg_deallocate,
+            TP_PROTO(
+               struct sdma_engine *sde,
+               int aidx
+            ),
+            TP_ARGS(sde, aidx));
+
+#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
+TRACE_EVENT(hfi1_sdma_progress,
+       TP_PROTO(
+               struct sdma_engine *sde,
+               u16 hwhead,
+               u16 swhead,
+               struct sdma_txreq *txp
+       ),
+       TP_ARGS(sde, hwhead, swhead, txp),
+       TP_STRUCT__entry(
+               DD_DEV_ENTRY(sde->dd)
+               __field(u64, sn)
+               __field(u16, hwhead)
+               __field(u16, swhead)
+               __field(u16, txnext)
+               __field(u16, tx_tail)
+               __field(u16, tx_head)
+               __field(u8, idx)
+       ),
+       TP_fast_assign(
+               DD_DEV_ASSIGN(sde->dd);
+               __entry->hwhead = hwhead;
+               __entry->swhead = swhead;
+               __entry->tx_tail = sde->tx_tail;
+               __entry->tx_head = sde->tx_head;
+               __entry->txnext = txp ? txp->next_descq_idx : ~0;
+               __entry->idx = sde->this_idx;
+               __entry->sn = txp ? txp->sn : ~0;
+       ),
+       TP_printk(
+               "[%s] SDE(%u) sn %llu hwhead %u swhead %u next_descq_idx %u tx_head %u tx_tail %u",
+               __get_str(dev),
+               __entry->idx,
+               __entry->sn,
+               __entry->hwhead,
+               __entry->swhead,
+               __entry->txnext,
+               __entry->tx_head,
+               __entry->tx_tail
+       )
+);
+#else
+TRACE_EVENT(hfi1_sdma_progress,
+           TP_PROTO(
+               struct sdma_engine *sde,
+               u16 hwhead,
+               u16 swhead,
+               struct sdma_txreq *txp
+           ),
+       TP_ARGS(sde, hwhead, swhead, txp),
+       TP_STRUCT__entry(
+               DD_DEV_ENTRY(sde->dd)
+               __field(u16, hwhead)
+               __field(u16, swhead)
+               __field(u16, txnext)
+               __field(u16, tx_tail)
+               __field(u16, tx_head)
+               __field(u8, idx)
+       ),
+       TP_fast_assign(
+               DD_DEV_ASSIGN(sde->dd);
+               __entry->hwhead = hwhead;
+               __entry->swhead = swhead;
+               __entry->tx_tail = sde->tx_tail;
+               __entry->tx_head = sde->tx_head;
+               __entry->txnext = txp ? txp->next_descq_idx : ~0;
+               __entry->idx = sde->this_idx;
+       ),
+       TP_printk(
+               "[%s] SDE(%u) hwhead %u swhead %u next_descq_idx %u tx_head %u tx_tail %u",
+               __get_str(dev),
+               __entry->idx,
+               __entry->hwhead,
+               __entry->swhead,
+               __entry->txnext,
+               __entry->tx_head,
+               __entry->tx_tail
+       )
+);
+#endif
+
+DECLARE_EVENT_CLASS(hfi1_sdma_sn,
+       TP_PROTO(
+               struct sdma_engine *sde,
+               u64 sn
+       ),
+       TP_ARGS(sde, sn),
+       TP_STRUCT__entry(
+               DD_DEV_ENTRY(sde->dd)
+               __field(u64, sn)
+               __field(u8, idx)
+       ),
+       TP_fast_assign(
+               DD_DEV_ASSIGN(sde->dd);
+               __entry->sn = sn;
+               __entry->idx = sde->this_idx;
+       ),
+       TP_printk(
+               "[%s] SDE(%u) sn %llu",
+               __get_str(dev),
+               __entry->idx,
+               __entry->sn
+       )
+);
+
+DEFINE_EVENT(hfi1_sdma_sn, hfi1_sdma_out_sn,
+            TP_PROTO(
+               struct sdma_engine *sde,
+               u64 sn
+            ),
+            TP_ARGS(sde, sn)
+);
+
+DEFINE_EVENT(hfi1_sdma_sn, hfi1_sdma_in_sn,
+            TP_PROTO(
+               struct sdma_engine *sde,
+               u64 sn
+            ),
+            TP_ARGS(sde, sn)
+);
+
+#define USDMA_HDR_FORMAT \
+       "[%s:%u:%u:%u] PBC=(0x%x 0x%x) LRH=(0x%x 0x%x) BTH=(0x%x 0x%x 0x%x) KDETH=(0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x) TIDVal=0x%x"
+
+TRACE_EVENT(hfi1_sdma_user_header,
+           TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 req,
+                    struct hfi1_pkt_header *hdr, u32 tidval),
+           TP_ARGS(dd, ctxt, subctxt, req, hdr, tidval),
+           TP_STRUCT__entry(
+                   DD_DEV_ENTRY(dd)
+                   __field(u16, ctxt)
+                   __field(u8, subctxt)
+                   __field(u16, req)
+                   __field(__le32, pbc0)
+                   __field(__le32, pbc1)
+                   __field(__be32, lrh0)
+                   __field(__be32, lrh1)
+                   __field(__be32, bth0)
+                   __field(__be32, bth1)
+                   __field(__be32, bth2)
+                   __field(__le32, kdeth0)
+                   __field(__le32, kdeth1)
+                   __field(__le32, kdeth2)
+                   __field(__le32, kdeth3)
+                   __field(__le32, kdeth4)
+                   __field(__le32, kdeth5)
+                   __field(__le32, kdeth6)
+                   __field(__le32, kdeth7)
+                   __field(__le32, kdeth8)
+                   __field(u32, tidval)
+                   ),
+           TP_fast_assign(
+                   __le32 *pbc = (__le32 *)hdr->pbc;
+                   __be32 *lrh = (__be32 *)hdr->lrh;
+                   __be32 *bth = (__be32 *)hdr->bth;
+                   __le32 *kdeth = (__le32 *)&hdr->kdeth;
+
+                   DD_DEV_ASSIGN(dd);
+                   __entry->ctxt = ctxt;
+                   __entry->subctxt = subctxt;
+                   __entry->req = req;
+                   __entry->pbc0 = pbc[0];
+                   __entry->pbc1 = pbc[1];
+                   __entry->lrh0 = be32_to_cpu(lrh[0]);
+                   __entry->lrh1 = be32_to_cpu(lrh[1]);
+                   __entry->bth0 = be32_to_cpu(bth[0]);
+                   __entry->bth1 = be32_to_cpu(bth[1]);
+                   __entry->bth2 = be32_to_cpu(bth[2]);
+                   __entry->kdeth0 = kdeth[0];
+                   __entry->kdeth1 = kdeth[1];
+                   __entry->kdeth2 = kdeth[2];
+                   __entry->kdeth3 = kdeth[3];
+                   __entry->kdeth4 = kdeth[4];
+                   __entry->kdeth5 = kdeth[5];
+                   __entry->kdeth6 = kdeth[6];
+                   __entry->kdeth7 = kdeth[7];
+                   __entry->kdeth8 = kdeth[8];
+                   __entry->tidval = tidval;
+                   ),
+           TP_printk(USDMA_HDR_FORMAT,
+                     __get_str(dev),
+                     __entry->ctxt,
+                     __entry->subctxt,
+                     __entry->req,
+                     __entry->pbc1,
+                     __entry->pbc0,
+                     __entry->lrh0,
+                     __entry->lrh1,
+                     __entry->bth0,
+                     __entry->bth1,
+                     __entry->bth2,
+                     __entry->kdeth0,
+                     __entry->kdeth1,
+                     __entry->kdeth2,
+                     __entry->kdeth3,
+                     __entry->kdeth4,
+                     __entry->kdeth5,
+                     __entry->kdeth6,
+                     __entry->kdeth7,
+                     __entry->kdeth8,
+                     __entry->tidval
+                   )
+       );
+
+#define SDMA_UREQ_FMT \
+       "[%s:%u:%u] ver/op=0x%x, iovcnt=%u, npkts=%u, frag=%u, idx=%u"
+TRACE_EVENT(hfi1_sdma_user_reqinfo,
+           TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 *i),
+           TP_ARGS(dd, ctxt, subctxt, i),
+           TP_STRUCT__entry(
+                   DD_DEV_ENTRY(dd)
+                   __field(u16, ctxt)
+                   __field(u8, subctxt)
+                   __field(u8, ver_opcode)
+                   __field(u8, iovcnt)
+                   __field(u16, npkts)
+                   __field(u16, fragsize)
+                   __field(u16, comp_idx)
+                   ),
+           TP_fast_assign(
+                   DD_DEV_ASSIGN(dd);
+                   __entry->ctxt = ctxt;
+                   __entry->subctxt = subctxt;
+                   __entry->ver_opcode = i[0] & 0xff;
+                   __entry->iovcnt = (i[0] >> 8) & 0xff;
+                   __entry->npkts = i[1];
+                   __entry->fragsize = i[2];
+                   __entry->comp_idx = i[3];
+                   ),
+           TP_printk(SDMA_UREQ_FMT,
+                     __get_str(dev),
+                     __entry->ctxt,
+                     __entry->subctxt,
+                     __entry->ver_opcode,
+                     __entry->iovcnt,
+                     __entry->npkts,
+                     __entry->fragsize,
+                     __entry->comp_idx
+                   )
+       );
+
+#define usdma_complete_name(st) { st, #st }
+#define show_usdma_complete_state(st)                  \
+       __print_symbolic(st,                            \
+                        usdma_complete_name(FREE),     \
+                        usdma_complete_name(QUEUED),   \
+                        usdma_complete_name(COMPLETE), \
+                        usdma_complete_name(ERROR))
+
+TRACE_EVENT(hfi1_sdma_user_completion,
+           TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 idx,
+                    u8 state, int code),
+           TP_ARGS(dd, ctxt, subctxt, idx, state, code),
+           TP_STRUCT__entry(
+                   DD_DEV_ENTRY(dd)
+                   __field(u16, ctxt)
+                   __field(u8, subctxt)
+                   __field(u16, idx)
+                   __field(u8, state)
+                   __field(int, code)
+                   ),
+           TP_fast_assign(
+                   DD_DEV_ASSIGN(dd);
+                   __entry->ctxt = ctxt;
+                   __entry->subctxt = subctxt;
+                   __entry->idx = idx;
+                   __entry->state = state;
+                   __entry->code = code;
+                   ),
+           TP_printk("[%s:%u:%u:%u] SDMA completion state %s (%d)",
+                     __get_str(dev), __entry->ctxt, __entry->subctxt,
+                     __entry->idx, show_usdma_complete_state(__entry->state),
+                     __entry->code)
+       );
+
+const char *print_u32_array(struct trace_seq *, u32 *, int);
+#define __print_u32_hex(arr, len) print_u32_array(p, arr, len)
+
+TRACE_EVENT(hfi1_sdma_user_header_ahg,
+           TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 req,
+                    u8 sde, u8 ahgidx, u32 *ahg, int len, u32 tidval),
+           TP_ARGS(dd, ctxt, subctxt, req, sde, ahgidx, ahg, len, tidval),
+           TP_STRUCT__entry(
+                   DD_DEV_ENTRY(dd)
+                   __field(u16, ctxt)
+                   __field(u8, subctxt)
+                   __field(u16, req)
+                   __field(u8, sde)
+                   __field(u8, idx)
+                   __field(int, len)
+                   __field(u32, tidval)
+                   __array(u32, ahg, 10)
+                   ),
+           TP_fast_assign(
+                   DD_DEV_ASSIGN(dd);
+                   __entry->ctxt = ctxt;
+                   __entry->subctxt = subctxt;
+                   __entry->req = req;
+                   __entry->sde = sde;
+                   __entry->idx = ahgidx;
+                   __entry->len = len;
+                   __entry->tidval = tidval;
+                   memcpy(__entry->ahg, ahg, len * sizeof(u32));
+                   ),
+           TP_printk("[%s:%u:%u:%u] (SDE%u/AHG%u) ahg[0-%d]=(%s) TIDVal=0x%x",
+                     __get_str(dev),
+                     __entry->ctxt,
+                     __entry->subctxt,
+                     __entry->req,
+                     __entry->sde,
+                     __entry->idx,
+                     __entry->len - 1,
+                     __print_u32_hex(__entry->ahg, __entry->len),
+                     __entry->tidval
+                   )
+       );
+
+TRACE_EVENT(hfi1_sdma_state,
+       TP_PROTO(
+               struct sdma_engine *sde,
+               const char *cstate,
+               const char *nstate
+       ),
+       TP_ARGS(sde, cstate, nstate),
+       TP_STRUCT__entry(
+               DD_DEV_ENTRY(sde->dd)
+               __string(curstate, cstate)
+               __string(newstate, nstate)
+       ),
+       TP_fast_assign(
+               DD_DEV_ASSIGN(sde->dd);
+               __assign_str(curstate, cstate);
+               __assign_str(newstate, nstate);
+       ),
+       TP_printk("[%s] current state %s new state %s",
+               __get_str(dev),
+               __get_str(curstate),
+               __get_str(newstate)
+       )
+);
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_rc
+
+DECLARE_EVENT_CLASS(hfi1_sdma_rc,
+       TP_PROTO(struct hfi1_qp *qp, u32 psn),
+       TP_ARGS(qp, psn),
+       TP_STRUCT__entry(
+               DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+               __field(u32, qpn)
+               __field(u32, flags)
+               __field(u32, psn)
+               __field(u32, sending_psn)
+               __field(u32, sending_hpsn)
+       ),
+       TP_fast_assign(
+               DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
+               __entry->qpn = qp->ibqp.qp_num;
+               __entry->flags = qp->s_flags;
+               __entry->psn = psn;
+               __entry->sending_psn = qp->s_sending_psn;
+               __entry->sending_hpsn = qp->s_sending_hpsn;
+       ),
+       TP_printk(
+               "[%s] qpn 0x%x flags 0x%x psn 0x%x sending_psn 0x%x sending_hpsn 0x%x",
+               __get_str(dev),
+               __entry->qpn,
+               __entry->flags,
+               __entry->psn,
+               __entry->sending_psn,
+               __entry->sending_hpsn
+       )
+);
+
+DEFINE_EVENT(hfi1_sdma_rc, hfi1_rc_sendcomplete,
+            TP_PROTO(struct hfi1_qp *qp, u32 psn),
+            TP_ARGS(qp, psn)
+);
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_misc
+
+TRACE_EVENT(hfi1_interrupt,
+       TP_PROTO(struct hfi1_devdata *dd, const struct is_table *is_entry,
+                int src),
+       TP_ARGS(dd, is_entry, src),
+       TP_STRUCT__entry(
+               DD_DEV_ENTRY(dd)
+               __array(char, buf, 64)
+               __field(int, src)
+       ),
+       TP_fast_assign(
+               DD_DEV_ASSIGN(dd)
+               is_entry->is_name(__entry->buf, 64, src - is_entry->start);
+               __entry->src = src;
+       ),
+       TP_printk("[%s] source: %s [%d]", __get_str(dev), __entry->buf,
+                 __entry->src)
+);
+
+/*
+ * Note:
+ * This produces a REALLY ugly trace in the console output when the string is
+ * too long.
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_trace
+
+#define MAX_MSG_LEN 512
+
+DECLARE_EVENT_CLASS(hfi1_trace_template,
+       TP_PROTO(const char *function, struct va_format *vaf),
+       TP_ARGS(function, vaf),
+       TP_STRUCT__entry(
+               __string(function, function)
+               __dynamic_array(char, msg, MAX_MSG_LEN)
+       ),
+       TP_fast_assign(
+               __assign_str(function, function);
+               WARN_ON_ONCE(vsnprintf(__get_dynamic_array(msg),
+                    MAX_MSG_LEN, vaf->fmt,
+                    *vaf->va) >= MAX_MSG_LEN);
+       ),
+       TP_printk("(%s) %s",
+                 __get_str(function),
+                 __get_str(msg))
+);
+
+/*
+ * It may be nice to macroize the __hfi1_trace but the va_* stuff requires an
+ * actual function to work and can not be in a macro.
+ */
+#define __hfi1_trace_def(lvl) \
+void __hfi1_trace_##lvl(const char *funct, char *fmt, ...);            \
+                                                                       \
+DEFINE_EVENT(hfi1_trace_template, hfi1_ ##lvl,                         \
+       TP_PROTO(const char *function, struct va_format *vaf),          \
+       TP_ARGS(function, vaf))
+
+#define __hfi1_trace_fn(lvl) \
+void __hfi1_trace_##lvl(const char *func, char *fmt, ...)              \
+{                                                                      \
+       struct va_format vaf = {                                        \
+               .fmt = fmt,                                             \
+       };                                                              \
+       va_list args;                                                   \
+                                                                       \
+       va_start(args, fmt);                                            \
+       vaf.va = &args;                                                 \
+       trace_hfi1_ ##lvl(func, &vaf);                                  \
+       va_end(args);                                                   \
+       return;                                                         \
+}
+
+/*
+ * To create a new trace level simply define it below and as a __hfi1_trace_fn
+ * in trace.c. This will create all the hooks for calling
+ * hfi1_cdbg(LVL, fmt, ...); as well as take care of all
+ * the debugfs stuff.
+ */
+__hfi1_trace_def(PKT);
+__hfi1_trace_def(PROC);
+__hfi1_trace_def(SDMA);
+__hfi1_trace_def(LINKVERB);
+__hfi1_trace_def(DEBUG);
+__hfi1_trace_def(SNOOP);
+__hfi1_trace_def(CNTR);
+__hfi1_trace_def(PIO);
+__hfi1_trace_def(DC8051);
+__hfi1_trace_def(FIRMWARE);
+__hfi1_trace_def(RCVCTRL);
+__hfi1_trace_def(TID);
+
+#define hfi1_cdbg(which, fmt, ...) \
+       __hfi1_trace_##which(__func__, fmt, ##__VA_ARGS__)
+
+#define hfi1_dbg(fmt, ...) \
+       hfi1_cdbg(DEBUG, fmt, ##__VA_ARGS__)
+
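A minimal usage sketch of the hooks defined above (illustration only, not part of the patch); the local variables idx, state_name, and val are hypothetical:

	/* Hypothetical call sites for the trace levels defined above. */
	hfi1_cdbg(SDMA, "engine %u changed state to %s", idx, state_name);
	hfi1_dbg("default DEBUG-level message, value 0x%x", val);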
+/*
+ * Define HFI1_EARLY_DBG at compile time or here to enable early trace
+ * messages. Do not check in an enablement for this.
+ */
+
+#ifdef HFI1_EARLY_DBG
+#define hfi1_dbg_early(fmt, ...) \
+       trace_printk(fmt, ##__VA_ARGS__)
+#else
+#define hfi1_dbg_early(fmt, ...)
+#endif
+
+#endif /* __HFI1_TRACE_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace
+#include <trace/define_trace.h>
diff --git a/drivers/staging/rdma/hfi1/twsi.c b/drivers/staging/rdma/hfi1/twsi.c
new file mode 100644 (file)
index 0000000..ea54fd2
--- /dev/null
@@ -0,0 +1,518 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/delay.h>
+#include <linux/pci.h>
+#include <linux/vmalloc.h>
+
+#include "hfi.h"
+#include "twsi.h"
+
+/*
+ * "Two Wire Serial Interface" support.
+ *
+ * Originally written for a not-quite-i2c serial eeprom, which is
+ * still used on some supported boards. Later boards have added a
+ * variety of other uses, most board-specific, so the bit-boffing
+ * part has been split off to this file, while the other parts
+ * have been moved to chip-specific files.
+ *
+ * We have also dropped all pretense of a fully generic (e.g. pretend
+ * we don't know whether '1' is the higher voltage) interface, as
+ * the restrictions of the generic i2c interface (e.g. no access from
+ * driver itself) make it unsuitable for this use.
+ */
+
+#define READ_CMD 1
+#define WRITE_CMD 0
+
+/**
+ * i2c_wait_for_writes - wait for a write
+ * @dd: the hfi1_ib device
+ *
+ * We use this instead of udelay directly, so we can make sure
+ * that previous register writes have been flushed all the way
+ * to the chip.  Since we are delaying anyway, the cost doesn't
+ * hurt, and makes the bit twiddling more regular
+ */
+static void i2c_wait_for_writes(struct hfi1_devdata *dd, u32 target)
+{
+       /*
+        * implicit read of EXTStatus is as good as explicit
+        * read of scratch, if all we want to do is flush
+        * writes.
+        */
+       hfi1_gpio_mod(dd, target, 0, 0, 0);
+       rmb(); /* inlined, so prevent compiler reordering */
+}
+
+/*
+ * QSFP modules are allowed to hold SCL low for 500uSec. Allow twice that
+ * for "almost compliant" modules
+ */
+#define SCL_WAIT_USEC 1000
+
+/* BUF_WAIT is the time the bus must be free between a STOP or ACK and the
+ * next START. Should be 20, but some chips need more.
+ */
+#define TWSI_BUF_WAIT_USEC 60
+
+static void scl_out(struct hfi1_devdata *dd, u32 target, u8 bit)
+{
+       u32 mask;
+
+       udelay(1);
+
+       mask = QSFP_HFI0_I2CCLK;
+
+       /* SCL is meant to be open-drain, so never set "OUT", just DIR */
+       hfi1_gpio_mod(dd, target, 0, bit ? 0 : mask, mask);
+
+       /*
+        * Allow for slow slaves with a simple
+        * delay on the falling edge, sampling on the rise.
+        */
+       if (!bit)
+               udelay(2);
+       else {
+               int rise_usec;
+
+               for (rise_usec = SCL_WAIT_USEC; rise_usec > 0; rise_usec -= 2) {
+                       if (mask & hfi1_gpio_mod(dd, target, 0, 0, 0))
+                               break;
+                       udelay(2);
+               }
+               if (rise_usec <= 0)
+                       dd_dev_err(dd, "SCL interface stuck low > %d uSec\n",
+                                   SCL_WAIT_USEC);
+       }
+       i2c_wait_for_writes(dd, target);
+}
+
+static void sda_out(struct hfi1_devdata *dd, u32 target, u8 bit)
+{
+       u32 mask;
+
+       mask = QSFP_HFI0_I2CDAT;
+
+       /* SDA is meant to be open-drain, so never set "OUT", just DIR */
+       hfi1_gpio_mod(dd, target, 0, bit ? 0 : mask, mask);
+
+       i2c_wait_for_writes(dd, target);
+       udelay(2);
+}
+
+static u8 sda_in(struct hfi1_devdata *dd, u32 target, int wait)
+{
+       u32 read_val, mask;
+
+       mask = QSFP_HFI0_I2CDAT;
+       /* SDA is meant to be open-drain, so never set "OUT", just DIR */
+       hfi1_gpio_mod(dd, target, 0, 0, mask);
+       read_val = hfi1_gpio_mod(dd, target, 0, 0, 0);
+       if (wait)
+               i2c_wait_for_writes(dd, target);
+       return (read_val & mask) >> GPIO_SDA_NUM;
+}
+
+/**
+ * i2c_ackrcv - see if ack following write is true
+ * @dd: the hfi1_ib device
+ */
+static int i2c_ackrcv(struct hfi1_devdata *dd, u32 target)
+{
+       u8 ack_received;
+
+       /* AT ENTRY SCL = LOW */
+       /* change direction, ignore data */
+       ack_received = sda_in(dd, target, 1);
+       scl_out(dd, target, 1);
+       ack_received = sda_in(dd, target, 1) == 0;
+       scl_out(dd, target, 0);
+       return ack_received;
+}
+
+static void stop_cmd(struct hfi1_devdata *dd, u32 target);
+
+/**
+ * rd_byte - read a byte, sending STOP on last, else ACK
+ * @dd: the hfi1_ib device
+ *
+ * Returns byte shifted out of device
+ */
+static int rd_byte(struct hfi1_devdata *dd, u32 target, int last)
+{
+       int bit_cntr, data;
+
+       data = 0;
+
+       for (bit_cntr = 7; bit_cntr >= 0; --bit_cntr) {
+               data <<= 1;
+               scl_out(dd, target, 1);
+               data |= sda_in(dd, target, 0);
+               scl_out(dd, target, 0);
+       }
+       if (last) {
+               scl_out(dd, target, 1);
+               stop_cmd(dd, target);
+       } else {
+               sda_out(dd, target, 0);
+               scl_out(dd, target, 1);
+               scl_out(dd, target, 0);
+               sda_out(dd, target, 1);
+       }
+       return data;
+}
+
+/**
+ * wr_byte - write a byte, one bit at a time
+ * @dd: the hfi1_ib device
+ * @data: the byte to write
+ *
+ * Returns 0 if we got the following ack, otherwise 1
+ */
+static int wr_byte(struct hfi1_devdata *dd, u32 target, u8 data)
+{
+       int bit_cntr;
+       u8 bit;
+
+       for (bit_cntr = 7; bit_cntr >= 0; bit_cntr--) {
+               bit = (data >> bit_cntr) & 1;
+               sda_out(dd, target, bit);
+               scl_out(dd, target, 1);
+               scl_out(dd, target, 0);
+       }
+       return (!i2c_ackrcv(dd, target)) ? 1 : 0;
+}
+
+/*
+ * issue TWSI start sequence:
+ * (both clock/data high, clock high, data low while clock is high)
+ */
+static void start_seq(struct hfi1_devdata *dd, u32 target)
+{
+       sda_out(dd, target, 1);
+       scl_out(dd, target, 1);
+       sda_out(dd, target, 0);
+       udelay(1);
+       scl_out(dd, target, 0);
+}
+
+/**
+ * stop_seq - transmit the stop sequence
+ * @dd: the hfi1_ib device
+ *
+ * (both clock/data low, clock high, data high while clock is high)
+ */
+static void stop_seq(struct hfi1_devdata *dd, u32 target)
+{
+       scl_out(dd, target, 0);
+       sda_out(dd, target, 0);
+       scl_out(dd, target, 1);
+       sda_out(dd, target, 1);
+}
+
+/**
+ * stop_cmd - transmit the stop condition
+ * @dd: the hfi1_ib device
+ *
+ * (both clock/data low, clock high, data high while clock is high)
+ */
+static void stop_cmd(struct hfi1_devdata *dd, u32 target)
+{
+       stop_seq(dd, target);
+       udelay(TWSI_BUF_WAIT_USEC);
+}
+
+/**
+ * hfi1_twsi_reset - reset I2C communication
+ * @dd: the hfi1_ib device
+ */
+
+int hfi1_twsi_reset(struct hfi1_devdata *dd, u32 target)
+{
+       int clock_cycles_left = 9;
+       int was_high = 0;
+       u32 pins, mask;
+
+       /* Both SCL and SDA should be high. If not, there
+        * is something wrong.
+        */
+       mask = QSFP_HFI0_I2CCLK | QSFP_HFI0_I2CDAT;
+
+       /*
+        * Force pins to desired innocuous state.
+        * This is the default power-on state with out=0 and dir=0,
+        * so the pins are tri-stated and should float high (barring HW problems).
+        */
+       hfi1_gpio_mod(dd, target, 0, 0, mask);
+
+       /*
+        * Clock nine times to get all listeners into a sane state.
+        * If SDA does not go high at any point, we are wedged.
+        * One vendor recommends then issuing START followed by STOP.
+        * We cannot use our "normal" functions to do that, because
+        * if SCL drops between them, another vendor's part will
+        * wedge, dropping SDA and keeping it low forever, at the end of
+        * the next transaction (even if it was not the device addressed).
+        * So our START and STOP take place with SCL held high.
+        */
+       while (clock_cycles_left--) {
+               scl_out(dd, target, 0);
+               scl_out(dd, target, 1);
+               /* Note if SDA is high, but keep clocking to sync slave */
+               was_high |= sda_in(dd, target, 0);
+       }
+
+       if (was_high) {
+               /*
+                * We saw a high, which we hope means the slave is sync'd.
+                * Issue START, STOP, pause for T_BUF.
+                */
+
+               pins = hfi1_gpio_mod(dd, target, 0, 0, 0);
+               if ((pins & mask) != mask)
+                       dd_dev_err(dd, "GPIO pins not at rest: %d\n",
+                                   pins & mask);
+               /* Drop SDA to issue START */
+               udelay(1); /* Guarantee .6 uSec setup */
+               sda_out(dd, target, 0);
+               udelay(1); /* Guarantee .6 uSec hold */
+               /* At this point, SCL is high, SDA low. Raise SDA for STOP */
+               sda_out(dd, target, 1);
+               udelay(TWSI_BUF_WAIT_USEC);
+       }
+
+       return !was_high;
+}
+
+#define HFI1_TWSI_START 0x100
+#define HFI1_TWSI_STOP 0x200
+
+/* Write byte to TWSI, optionally prefixed with START or suffixed with
+ * STOP.
+ * returns 0 if OK (ACK received), else != 0
+ */
+static int twsi_wr(struct hfi1_devdata *dd, u32 target, int data, int flags)
+{
+       int ret = 1;
+
+       if (flags & HFI1_TWSI_START)
+               start_seq(dd, target);
+
+       /* Leaves SCL low (from i2c_ackrcv()) */
+       ret = wr_byte(dd, target, data);
+
+       if (flags & HFI1_TWSI_STOP)
+               stop_cmd(dd, target);
+       return ret;
+}
+
+/* Added functionality for IBA7220-based cards */
+#define HFI1_TEMP_DEV 0x98
+
+/*
+ * hfi1_twsi_blk_rd
+ * General interface for data transfer from twsi devices.
+ * One vestige of its former role is that it recognizes a device
+ * HFI1_TWSI_NO_DEV and does the correct operation for the legacy part,
+ * which responded to all TWSI device codes, interpreting them as
+ * an address within the device. On all other devices found on boards handled by
+ * this driver, the device is followed by a one-byte "address" which selects
+ * the "register" or "offset" within the device from which data should
+ * be read.
+ */
+int hfi1_twsi_blk_rd(struct hfi1_devdata *dd, u32 target, int dev, int addr,
+                    void *buffer, int len)
+{
+       int ret;
+       u8 *bp = buffer;
+
+       ret = 1;
+
+       if (dev == HFI1_TWSI_NO_DEV) {
+               /* legacy not-really-I2C */
+               addr = (addr << 1) | READ_CMD;
+               ret = twsi_wr(dd, target, addr, HFI1_TWSI_START);
+       } else {
+               /* Actual I2C */
+               ret = twsi_wr(dd, target, dev | WRITE_CMD, HFI1_TWSI_START);
+               if (ret) {
+                       stop_cmd(dd, target);
+                       ret = 1;
+                       goto bail;
+               }
+               /*
+                * SFF spec claims we do _not_ stop after the addr
+                * but simply issue a start with the "read" dev-addr.
+                * Since we are implicitly waiting for ACK here,
+                * we need t_buf (nominally 20uSec) before that start,
+                * and cannot rely on the delay built in to the STOP
+                */
+               ret = twsi_wr(dd, target, addr, 0);
+               udelay(TWSI_BUF_WAIT_USEC);
+
+               if (ret) {
+                       dd_dev_err(dd,
+                               "Failed to write interface read addr %02X\n",
+                               addr);
+                       ret = 1;
+                       goto bail;
+               }
+               ret = twsi_wr(dd, target, dev | READ_CMD, HFI1_TWSI_START);
+       }
+       if (ret) {
+               stop_cmd(dd, target);
+               ret = 1;
+               goto bail;
+       }
+
+       /*
+        * block devices keep clocking data out as long as we ack,
+        * automatically incrementing the address. Some have "pages"
+        * whose boundaries will not be crossed, but the handling
+        * of these is left to the caller, who is in a better
+        * position to know.
+        */
+       while (len-- > 0) {
+               /*
+                * Get and store data, sending ACK if length remaining,
+                * else STOP
+                */
+               *bp++ = rd_byte(dd, target, !len);
+       }
+
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+/*
+ * hfi1_twsi_blk_wr
+ * General interface for data transfer to twsi devices.
+ * One vestige of its former role is that it recognizes a device
+ * HFI1_TWSI_NO_DEV and does the correct operation for the legacy part,
+ * which responded to all TWSI device codes, interpreting them as
+ * an address within the device. On all other devices found on boards handled by
+ * this driver, the device is followed by a one-byte "address" which selects
+ * the "register" or "offset" within the device to which data should
+ * be written.
+ */
+int hfi1_twsi_blk_wr(struct hfi1_devdata *dd, u32 target, int dev, int addr,
+                    const void *buffer, int len)
+{
+       int sub_len;
+       const u8 *bp = buffer;
+       int max_wait_time, i;
+       int ret = 1;
+
+       while (len > 0) {
+               if (dev == HFI1_TWSI_NO_DEV) {
+                       if (twsi_wr(dd, target, (addr << 1) | WRITE_CMD,
+                                   HFI1_TWSI_START)) {
+                               goto failed_write;
+                       }
+               } else {
+                       /* Real I2C */
+                       if (twsi_wr(dd, target,
+                                   dev | WRITE_CMD, HFI1_TWSI_START))
+                               goto failed_write;
+                       ret = twsi_wr(dd, target, addr, 0);
+                       if (ret) {
+                               dd_dev_err(dd,
+                                       "Failed to write interface write addr %02X\n",
+                                       addr);
+                               goto failed_write;
+                       }
+               }
+
+               sub_len = min(len, 4);
+               addr += sub_len;
+               len -= sub_len;
+
+               for (i = 0; i < sub_len; i++)
+                       if (twsi_wr(dd, target, *bp++, 0))
+                               goto failed_write;
+
+               stop_cmd(dd, target);
+
+               /*
+                * Wait for write complete by waiting for a successful
+                * read (the chip replies with a zero after the write
+                * cmd completes, and before it writes to the eeprom).
+                * The startcmd for the read will fail the ack until
+                * the writes have completed.   We do this inline to avoid
+                * the debug prints that are in the real read routine
+                * if the startcmd fails.
+                * We also use the proper device address, so it doesn't matter
+                * whether we have real eeprom_dev. Legacy likes any address.
+                */
+               max_wait_time = 100;
+               while (twsi_wr(dd, target,
+                              dev | READ_CMD, HFI1_TWSI_START)) {
+                       stop_cmd(dd, target);
+                       if (!--max_wait_time)
+                               goto failed_write;
+               }
+               /* now read (and ignore) the resulting byte */
+               rd_byte(dd, target, 1);
+       }
+
+       ret = 0;
+       goto bail;
+
+failed_write:
+       stop_cmd(dd, target);
+       ret = 1;
+
+bail:
+       return ret;
+}
diff --git a/drivers/staging/rdma/hfi1/twsi.h b/drivers/staging/rdma/hfi1/twsi.h
new file mode 100644 (file)
index 0000000..5907e02
--- /dev/null
@@ -0,0 +1,68 @@
+#ifndef _TWSI_H
+#define _TWSI_H
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#define HFI1_TWSI_NO_DEV 0xFF
+
+struct hfi1_devdata;
+
+/* Bit position of SDA pin in ASIC_QSFP* registers  */
+#define  GPIO_SDA_NUM 1
+
+/* these functions must be called with qsfp_lock held */
+int hfi1_twsi_reset(struct hfi1_devdata *dd, u32 target);
+int hfi1_twsi_blk_rd(struct hfi1_devdata *dd, u32 target, int dev, int addr,
+                    void *buffer, int len);
+int hfi1_twsi_blk_wr(struct hfi1_devdata *dd, u32 target, int dev, int addr,
+                    const void *buffer, int len);
+
+
+#endif /* _TWSI_H */
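A minimal usage sketch of the block-transfer interface declared above (illustration only, not part of the patch): the target value and the 0xA0 device address are assumptions, and per the comment above, qsfp_lock must already be held by the caller.

	/* Hypothetical caller: read 16 bytes from offset 0 of an I2C device at
	 * address 0xA0 (illustrative); qsfp_lock is assumed to be held.
	 */
	u8 buf[16];

	if (hfi1_twsi_blk_rd(dd, target, 0xA0, 0, buf, sizeof(buf)))
		dd_dev_err(dd, "TWSI block read failed\n");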
diff --git a/drivers/staging/rdma/hfi1/uc.c b/drivers/staging/rdma/hfi1/uc.c
new file mode 100644 (file)
index 0000000..b536f39
--- /dev/null
@@ -0,0 +1,585 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "hfi.h"
+#include "sdma.h"
+#include "qp.h"
+
+/* cut down ridiculously long IB macro names */
+#define OP(x) IB_OPCODE_UC_##x
+
+/**
+ * hfi1_make_uc_req - construct a request packet (SEND, RDMA write)
+ * @qp: a pointer to the QP
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ */
+int hfi1_make_uc_req(struct hfi1_qp *qp)
+{
+       struct hfi1_other_headers *ohdr;
+       struct hfi1_swqe *wqe;
+       unsigned long flags;
+       u32 hwords = 5;
+       u32 bth0 = 0;
+       u32 len;
+       u32 pmtu = qp->pmtu;
+       int ret = 0;
+       int middle = 0;
+
+       spin_lock_irqsave(&qp->s_lock, flags);
+
+       if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_SEND_OK)) {
+               if (!(ib_hfi1_state_ops[qp->state] & HFI1_FLUSH_SEND))
+                       goto bail;
+               /* We are in the error state, flush the work request. */
+               if (qp->s_last == qp->s_head)
+                       goto bail;
+               /* If DMAs are in progress, we can't flush immediately. */
+               if (atomic_read(&qp->s_iowait.sdma_busy)) {
+                       qp->s_flags |= HFI1_S_WAIT_DMA;
+                       goto bail;
+               }
+               clear_ahg(qp);
+               wqe = get_swqe_ptr(qp, qp->s_last);
+               hfi1_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
+               goto done;
+       }
+
+       ohdr = &qp->s_hdr->ibh.u.oth;
+       if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
+               ohdr = &qp->s_hdr->ibh.u.l.oth;
+
+       /* Get the next send request. */
+       wqe = get_swqe_ptr(qp, qp->s_cur);
+       qp->s_wqe = NULL;
+       switch (qp->s_state) {
+       default:
+               if (!(ib_hfi1_state_ops[qp->state] &
+                   HFI1_PROCESS_NEXT_SEND_OK))
+                       goto bail;
+               /* Check if send work queue is empty. */
+               if (qp->s_cur == qp->s_head) {
+                       clear_ahg(qp);
+                       goto bail;
+               }
+               /*
+                * Start a new request.
+                */
+               wqe->psn = qp->s_next_psn;
+               qp->s_psn = qp->s_next_psn;
+               qp->s_sge.sge = wqe->sg_list[0];
+               qp->s_sge.sg_list = wqe->sg_list + 1;
+               qp->s_sge.num_sge = wqe->wr.num_sge;
+               qp->s_sge.total_len = wqe->length;
+               len = wqe->length;
+               qp->s_len = len;
+               switch (wqe->wr.opcode) {
+               case IB_WR_SEND:
+               case IB_WR_SEND_WITH_IMM:
+                       if (len > pmtu) {
+                               qp->s_state = OP(SEND_FIRST);
+                               len = pmtu;
+                               break;
+                       }
+                       if (wqe->wr.opcode == IB_WR_SEND)
+                               qp->s_state = OP(SEND_ONLY);
+                       else {
+                               qp->s_state =
+                                       OP(SEND_ONLY_WITH_IMMEDIATE);
+                               /* Immediate data comes after the BTH */
+                               ohdr->u.imm_data = wqe->wr.ex.imm_data;
+                               hwords += 1;
+                       }
+                       if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+                               bth0 |= IB_BTH_SOLICITED;
+                       qp->s_wqe = wqe;
+                       if (++qp->s_cur >= qp->s_size)
+                               qp->s_cur = 0;
+                       break;
+
+               case IB_WR_RDMA_WRITE:
+               case IB_WR_RDMA_WRITE_WITH_IMM:
+                       ohdr->u.rc.reth.vaddr =
+                               cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
+                       ohdr->u.rc.reth.rkey =
+                               cpu_to_be32(wqe->wr.wr.rdma.rkey);
+                       ohdr->u.rc.reth.length = cpu_to_be32(len);
+                       hwords += sizeof(struct ib_reth) / 4;
+                       if (len > pmtu) {
+                               qp->s_state = OP(RDMA_WRITE_FIRST);
+                               len = pmtu;
+                               break;
+                       }
+                       if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
+                               qp->s_state = OP(RDMA_WRITE_ONLY);
+                       else {
+                               qp->s_state =
+                                       OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
+                               /* Immediate data comes after the RETH */
+                               ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
+                               hwords += 1;
+                               if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+                                       bth0 |= IB_BTH_SOLICITED;
+                       }
+                       qp->s_wqe = wqe;
+                       if (++qp->s_cur >= qp->s_size)
+                               qp->s_cur = 0;
+                       break;
+
+               default:
+                       goto bail;
+               }
+               break;
+
+       case OP(SEND_FIRST):
+               qp->s_state = OP(SEND_MIDDLE);
+               /* FALLTHROUGH */
+       case OP(SEND_MIDDLE):
+               len = qp->s_len;
+               if (len > pmtu) {
+                       len = pmtu;
+                       middle = HFI1_CAP_IS_KSET(SDMA_AHG);
+                       break;
+               }
+               if (wqe->wr.opcode == IB_WR_SEND)
+                       qp->s_state = OP(SEND_LAST);
+               else {
+                       qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
+                       /* Immediate data comes after the BTH */
+                       ohdr->u.imm_data = wqe->wr.ex.imm_data;
+                       hwords += 1;
+               }
+               if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+                       bth0 |= IB_BTH_SOLICITED;
+               qp->s_wqe = wqe;
+               if (++qp->s_cur >= qp->s_size)
+                       qp->s_cur = 0;
+               break;
+
+       case OP(RDMA_WRITE_FIRST):
+               qp->s_state = OP(RDMA_WRITE_MIDDLE);
+               /* FALLTHROUGH */
+       case OP(RDMA_WRITE_MIDDLE):
+               len = qp->s_len;
+               if (len > pmtu) {
+                       len = pmtu;
+                       middle = HFI1_CAP_IS_KSET(SDMA_AHG);
+                       break;
+               }
+               if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
+                       qp->s_state = OP(RDMA_WRITE_LAST);
+               else {
+                       qp->s_state =
+                               OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
+                       /* Immediate data comes after the BTH */
+                       ohdr->u.imm_data = wqe->wr.ex.imm_data;
+                       hwords += 1;
+                       if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+                               bth0 |= IB_BTH_SOLICITED;
+               }
+               qp->s_wqe = wqe;
+               if (++qp->s_cur >= qp->s_size)
+                       qp->s_cur = 0;
+               break;
+       }
+       qp->s_len -= len;
+       qp->s_hdrwords = hwords;
+       qp->s_cur_sge = &qp->s_sge;
+       qp->s_cur_size = len;
+       hfi1_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24),
+                            mask_psn(qp->s_next_psn++), middle);
+done:
+       ret = 1;
+       goto unlock;
+
+bail:
+       qp->s_flags &= ~HFI1_S_BUSY;
+unlock:
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+       return ret;
+}
+
+/**
+ * hfi1_uc_rcv - handle an incoming UC packet
+ * @packet: the packet, carrying the header, receive flags, data, length, and QP
+ *
+ * This is called from qp_rcv() to process an incoming UC packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+void hfi1_uc_rcv(struct hfi1_packet *packet)
+{
+       struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data;
+       struct hfi1_ib_header *hdr = packet->hdr;
+       u32 rcv_flags = packet->rcv_flags;
+       void *data = packet->ebuf;
+       u32 tlen = packet->tlen;
+       struct hfi1_qp *qp = packet->qp;
+       struct hfi1_other_headers *ohdr = packet->ohdr;
+       u32 opcode;
+       u32 hdrsize = packet->hlen;
+       u32 psn;
+       u32 pad;
+       struct ib_wc wc;
+       u32 pmtu = qp->pmtu;
+       struct ib_reth *reth;
+       int has_grh = rcv_flags & HFI1_HAS_GRH;
+       int ret;
+       u32 bth1;
+       struct ib_grh *grh = NULL;
+
+       opcode = be32_to_cpu(ohdr->bth[0]);
+       if (hfi1_ruc_check_hdr(ibp, hdr, has_grh, qp, opcode))
+               return;
+
+       bth1 = be32_to_cpu(ohdr->bth[1]);
+       if (unlikely(bth1 & (HFI1_BECN_SMASK | HFI1_FECN_SMASK))) {
+               if (bth1 & HFI1_BECN_SMASK) {
+                       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+                       u32 rqpn, lqpn;
+                       u16 rlid = be16_to_cpu(hdr->lrh[3]);
+                       u8 sl, sc5;
+
+                       lqpn = bth1 & HFI1_QPN_MASK;
+                       rqpn = qp->remote_qpn;
+
+                       sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
+                       sl = ibp->sc_to_sl[sc5];
+
+                       process_becn(ppd, sl, rlid, lqpn, rqpn,
+                                       IB_CC_SVCTYPE_UC);
+               }
+
+               if (bth1 & HFI1_FECN_SMASK) {
+                       u16 pkey = (u16)be32_to_cpu(ohdr->bth[0]);
+                       u16 slid = be16_to_cpu(hdr->lrh[3]);
+                       u16 dlid = be16_to_cpu(hdr->lrh[1]);
+                       u32 src_qp = qp->remote_qpn;
+                       u8 sc5;
+
+                       sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
+
+                       return_cnp(ibp, qp, src_qp, pkey, dlid, slid, sc5, grh);
+               }
+       }
+
+       psn = be32_to_cpu(ohdr->bth[2]);
+       opcode >>= 24;
+
+       /* Compare the PSN against the expected PSN. */
+       if (unlikely(cmp_psn(psn, qp->r_psn) != 0)) {
+               /*
+                * Handle a sequence error.
+                * Silently drop any current message.
+                */
+               qp->r_psn = psn;
+inv:
+               if (qp->r_state == OP(SEND_FIRST) ||
+                   qp->r_state == OP(SEND_MIDDLE)) {
+                       set_bit(HFI1_R_REWIND_SGE, &qp->r_aflags);
+                       qp->r_sge.num_sge = 0;
+               } else
+                       hfi1_put_ss(&qp->r_sge);
+               qp->r_state = OP(SEND_LAST);
+               switch (opcode) {
+               case OP(SEND_FIRST):
+               case OP(SEND_ONLY):
+               case OP(SEND_ONLY_WITH_IMMEDIATE):
+                       goto send_first;
+
+               case OP(RDMA_WRITE_FIRST):
+               case OP(RDMA_WRITE_ONLY):
+               case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
+                       goto rdma_first;
+
+               default:
+                       goto drop;
+               }
+       }
+
+       /* Check for opcode sequence errors. */
+       switch (qp->r_state) {
+       case OP(SEND_FIRST):
+       case OP(SEND_MIDDLE):
+               if (opcode == OP(SEND_MIDDLE) ||
+                   opcode == OP(SEND_LAST) ||
+                   opcode == OP(SEND_LAST_WITH_IMMEDIATE))
+                       break;
+               goto inv;
+
+       case OP(RDMA_WRITE_FIRST):
+       case OP(RDMA_WRITE_MIDDLE):
+               if (opcode == OP(RDMA_WRITE_MIDDLE) ||
+                   opcode == OP(RDMA_WRITE_LAST) ||
+                   opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
+                       break;
+               goto inv;
+
+       default:
+               if (opcode == OP(SEND_FIRST) ||
+                   opcode == OP(SEND_ONLY) ||
+                   opcode == OP(SEND_ONLY_WITH_IMMEDIATE) ||
+                   opcode == OP(RDMA_WRITE_FIRST) ||
+                   opcode == OP(RDMA_WRITE_ONLY) ||
+                   opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
+                       break;
+               goto inv;
+       }
+
+       if (qp->state == IB_QPS_RTR && !(qp->r_flags & HFI1_R_COMM_EST))
+               qp_comm_est(qp);
+
+       /* OK, process the packet. */
+       switch (opcode) {
+       case OP(SEND_FIRST):
+       case OP(SEND_ONLY):
+       case OP(SEND_ONLY_WITH_IMMEDIATE):
+send_first:
+               if (test_and_clear_bit(HFI1_R_REWIND_SGE, &qp->r_aflags))
+                       qp->r_sge = qp->s_rdma_read_sge;
+               else {
+                       ret = hfi1_get_rwqe(qp, 0);
+                       if (ret < 0)
+                               goto op_err;
+                       if (!ret)
+                               goto drop;
+                       /*
+                        * qp->s_rdma_read_sge will be the owner
+                        * of the mr references.
+                        */
+                       qp->s_rdma_read_sge = qp->r_sge;
+               }
+               qp->r_rcv_len = 0;
+               if (opcode == OP(SEND_ONLY))
+                       goto no_immediate_data;
+               else if (opcode == OP(SEND_ONLY_WITH_IMMEDIATE))
+                       goto send_last_imm;
+               /* FALLTHROUGH */
+       case OP(SEND_MIDDLE):
+               /* Check for invalid length PMTU or posted rwqe len. */
+               if (unlikely(tlen != (hdrsize + pmtu + 4)))
+                       goto rewind;
+               qp->r_rcv_len += pmtu;
+               if (unlikely(qp->r_rcv_len > qp->r_len))
+                       goto rewind;
+               hfi1_copy_sge(&qp->r_sge, data, pmtu, 0);
+               break;
+
+       case OP(SEND_LAST_WITH_IMMEDIATE):
+send_last_imm:
+               wc.ex.imm_data = ohdr->u.imm_data;
+               wc.wc_flags = IB_WC_WITH_IMM;
+               goto send_last;
+       case OP(SEND_LAST):
+no_immediate_data:
+               wc.ex.imm_data = 0;
+               wc.wc_flags = 0;
+send_last:
+               /* Get the number of bytes the message was padded by. */
+               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+               /* Check for invalid length. */
+               /* LAST len should be >= 1 */
+               if (unlikely(tlen < (hdrsize + pad + 4)))
+                       goto rewind;
+               /* Don't count the CRC. */
+               tlen -= (hdrsize + pad + 4);
+               wc.byte_len = tlen + qp->r_rcv_len;
+               if (unlikely(wc.byte_len > qp->r_len))
+                       goto rewind;
+               wc.opcode = IB_WC_RECV;
+               hfi1_copy_sge(&qp->r_sge, data, tlen, 0);
+               hfi1_put_ss(&qp->s_rdma_read_sge);
+last_imm:
+               wc.wr_id = qp->r_wr_id;
+               wc.status = IB_WC_SUCCESS;
+               wc.qp = &qp->ibqp;
+               wc.src_qp = qp->remote_qpn;
+               wc.slid = qp->remote_ah_attr.dlid;
+               /*
+                * It seems that IB mandates the presence of an SL in a
+                * work completion only for the UD transport (see section
+                * 11.4.2 of IBTA Vol. 1).
+                *
+                * However, the way the SL is chosen below is consistent
+                * with the way that IB/qib works and tries to avoid
+                * introducing incompatibilities.
+                *
+                * See also OPA Vol. 1, section 9.7.6, and table 9-17.
+                */
+               wc.sl = qp->remote_ah_attr.sl;
+               /* zero fields that are N/A */
+               wc.vendor_err = 0;
+               wc.pkey_index = 0;
+               wc.dlid_path_bits = 0;
+               wc.port_num = 0;
+               /* Signal completion event if the solicited bit is set. */
+               hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+                             (ohdr->bth[0] &
+                               cpu_to_be32(IB_BTH_SOLICITED)) != 0);
+               break;
+
+       case OP(RDMA_WRITE_FIRST):
+       case OP(RDMA_WRITE_ONLY):
+       case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): /* consume RWQE */
+rdma_first:
+               if (unlikely(!(qp->qp_access_flags &
+                              IB_ACCESS_REMOTE_WRITE))) {
+                       goto drop;
+               }
+               reth = &ohdr->u.rc.reth;
+               qp->r_len = be32_to_cpu(reth->length);
+               qp->r_rcv_len = 0;
+               qp->r_sge.sg_list = NULL;
+               if (qp->r_len != 0) {
+                       u32 rkey = be32_to_cpu(reth->rkey);
+                       u64 vaddr = be64_to_cpu(reth->vaddr);
+                       int ok;
+
+                       /* Check rkey */
+                       ok = hfi1_rkey_ok(qp, &qp->r_sge.sge, qp->r_len,
+                                         vaddr, rkey, IB_ACCESS_REMOTE_WRITE);
+                       if (unlikely(!ok))
+                               goto drop;
+                       qp->r_sge.num_sge = 1;
+               } else {
+                       qp->r_sge.num_sge = 0;
+                       qp->r_sge.sge.mr = NULL;
+                       qp->r_sge.sge.vaddr = NULL;
+                       qp->r_sge.sge.length = 0;
+                       qp->r_sge.sge.sge_length = 0;
+               }
+               if (opcode == OP(RDMA_WRITE_ONLY))
+                       goto rdma_last;
+               else if (opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) {
+                       wc.ex.imm_data = ohdr->u.rc.imm_data;
+                       goto rdma_last_imm;
+               }
+               /* FALLTHROUGH */
+       case OP(RDMA_WRITE_MIDDLE):
+               /* Check for invalid length PMTU or posted rwqe len. */
+               if (unlikely(tlen != (hdrsize + pmtu + 4)))
+                       goto drop;
+               qp->r_rcv_len += pmtu;
+               if (unlikely(qp->r_rcv_len > qp->r_len))
+                       goto drop;
+               hfi1_copy_sge(&qp->r_sge, data, pmtu, 1);
+               break;
+
+       case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
+               wc.ex.imm_data = ohdr->u.imm_data;
+rdma_last_imm:
+               wc.wc_flags = IB_WC_WITH_IMM;
+
+               /* Get the number of bytes the message was padded by. */
+               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+               /* Check for invalid length. */
+               /* LAST len should be >= 1 */
+               if (unlikely(tlen < (hdrsize + pad + 4)))
+                       goto drop;
+               /* Don't count the CRC. */
+               tlen -= (hdrsize + pad + 4);
+               if (unlikely(tlen + qp->r_rcv_len != qp->r_len))
+                       goto drop;
+               if (test_and_clear_bit(HFI1_R_REWIND_SGE, &qp->r_aflags))
+                       hfi1_put_ss(&qp->s_rdma_read_sge);
+               else {
+                       ret = hfi1_get_rwqe(qp, 1);
+                       if (ret < 0)
+                               goto op_err;
+                       if (!ret)
+                               goto drop;
+               }
+               wc.byte_len = qp->r_len;
+               wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
+               hfi1_copy_sge(&qp->r_sge, data, tlen, 1);
+               hfi1_put_ss(&qp->r_sge);
+               goto last_imm;
+
+       case OP(RDMA_WRITE_LAST):
+rdma_last:
+               /* Get the number of bytes the message was padded by. */
+               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+               /* Check for invalid length. */
+               /* LAST len should be >= 1 */
+               if (unlikely(tlen < (hdrsize + pad + 4)))
+                       goto drop;
+               /* Don't count the CRC. */
+               tlen -= (hdrsize + pad + 4);
+               if (unlikely(tlen + qp->r_rcv_len != qp->r_len))
+                       goto drop;
+               hfi1_copy_sge(&qp->r_sge, data, tlen, 1);
+               hfi1_put_ss(&qp->r_sge);
+               break;
+
+       default:
+               /* Drop packet for unknown opcodes. */
+               goto drop;
+       }
+       qp->r_psn++;
+       qp->r_state = opcode;
+       return;
+
+rewind:
+       set_bit(HFI1_R_REWIND_SGE, &qp->r_aflags);
+       qp->r_sge.num_sge = 0;
+drop:
+       ibp->n_pkt_drops++;
+       return;
+
+op_err:
+       hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+       return;
+
+}
diff --git a/drivers/staging/rdma/hfi1/ud.c b/drivers/staging/rdma/hfi1/ud.c
new file mode 100644 (file)
index 0000000..d40d1a1
--- /dev/null
@@ -0,0 +1,885 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/net.h>
+#include <rdma/ib_smi.h>
+
+#include "hfi.h"
+#include "mad.h"
+#include "qp.h"
+
+/**
+ * ud_loopback - handle send on loopback QPs
+ * @sqp: the sending QP
+ * @swqe: the send work request
+ *
+ * This is called from hfi1_make_ud_req() to forward a WQE addressed
+ * to the same HFI.
+ * Note that the receive interrupt handler may be calling hfi1_ud_rcv()
+ * while this is being called.
+ */
+static void ud_loopback(struct hfi1_qp *sqp, struct hfi1_swqe *swqe)
+{
+       struct hfi1_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num);
+       struct hfi1_pportdata *ppd;
+       struct hfi1_qp *qp;
+       struct ib_ah_attr *ah_attr;
+       unsigned long flags;
+       struct hfi1_sge_state ssge;
+       struct hfi1_sge *sge;
+       struct ib_wc wc;
+       u32 length;
+       enum ib_qp_type sqptype, dqptype;
+
+       rcu_read_lock();
+
+       qp = hfi1_lookup_qpn(ibp, swqe->wr.wr.ud.remote_qpn);
+       if (!qp) {
+               ibp->n_pkt_drops++;
+               rcu_read_unlock();
+               return;
+       }
+
+       sqptype = sqp->ibqp.qp_type == IB_QPT_GSI ?
+                       IB_QPT_UD : sqp->ibqp.qp_type;
+       dqptype = qp->ibqp.qp_type == IB_QPT_GSI ?
+                       IB_QPT_UD : qp->ibqp.qp_type;
+
+       if (dqptype != sqptype ||
+           !(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK)) {
+               ibp->n_pkt_drops++;
+               goto drop;
+       }
+
+       ah_attr = &to_iah(swqe->wr.wr.ud.ah)->attr;
+       ppd = ppd_from_ibp(ibp);
+
+       if (qp->ibqp.qp_num > 1) {
+               u16 pkey;
+               u16 slid;
+               u8 sc5 = ibp->sl_to_sc[ah_attr->sl];
+
+               pkey = hfi1_get_pkey(ibp, sqp->s_pkey_index);
+               slid = ppd->lid | (ah_attr->src_path_bits &
+                                  ((1 << ppd->lmc) - 1));
+               if (unlikely(ingress_pkey_check(ppd, pkey, sc5,
+                                               qp->s_pkey_index, slid))) {
+                       hfi1_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_PKEY, pkey,
+                                      ah_attr->sl,
+                                      sqp->ibqp.qp_num, qp->ibqp.qp_num,
+                                      cpu_to_be16(slid),
+                                      cpu_to_be16(ah_attr->dlid));
+                       goto drop;
+               }
+       }
+
+       /*
+        * Check that the qkey matches (except for QP0, see 9.6.1.4.1).
+        * Qkeys with the high order bit set mean use the
+        * qkey from the QP context instead of the WR (see 10.2.5).
+        */
+       if (qp->ibqp.qp_num) {
+               u32 qkey;
+
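+               /* A negative value after the cast means bit 31 is set. */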
+               qkey = (int)swqe->wr.wr.ud.remote_qkey < 0 ?
+                       sqp->qkey : swqe->wr.wr.ud.remote_qkey;
+               if (unlikely(qkey != qp->qkey)) {
+                       u16 lid;
+
+                       lid = ppd->lid | (ah_attr->src_path_bits &
+                                         ((1 << ppd->lmc) - 1));
+                       hfi1_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_QKEY, qkey,
+                                      ah_attr->sl,
+                                      sqp->ibqp.qp_num, qp->ibqp.qp_num,
+                                      cpu_to_be16(lid),
+                                      cpu_to_be16(ah_attr->dlid));
+                       goto drop;
+               }
+       }
+
+       /*
+        * A GRH is expected to precede the data even if not
+        * present on the wire.
+        */
+       length = swqe->length;
+       memset(&wc, 0, sizeof(wc));
+       wc.byte_len = length + sizeof(struct ib_grh);
+
+       if (swqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
+               wc.wc_flags = IB_WC_WITH_IMM;
+               wc.ex.imm_data = swqe->wr.ex.imm_data;
+       }
+
+       spin_lock_irqsave(&qp->r_lock, flags);
+
+       /*
+        * Get the next work request entry to find where to put the data.
+        */
+       if (qp->r_flags & HFI1_R_REUSE_SGE)
+               qp->r_flags &= ~HFI1_R_REUSE_SGE;
+       else {
+               int ret;
+
+               ret = hfi1_get_rwqe(qp, 0);
+               if (ret < 0) {
+                       hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+                       goto bail_unlock;
+               }
+               if (!ret) {
+                       if (qp->ibqp.qp_num == 0)
+                               ibp->n_vl15_dropped++;
+                       goto bail_unlock;
+               }
+       }
+       /* Silently drop packets which are too big. */
+       if (unlikely(wc.byte_len > qp->r_len)) {
+               qp->r_flags |= HFI1_R_REUSE_SGE;
+               ibp->n_pkt_drops++;
+               goto bail_unlock;
+       }
+
+       if (ah_attr->ah_flags & IB_AH_GRH) {
+               hfi1_copy_sge(&qp->r_sge, &ah_attr->grh,
+                             sizeof(struct ib_grh), 1);
+               wc.wc_flags |= IB_WC_GRH;
+       } else
+               hfi1_skip_sge(&qp->r_sge, sizeof(struct ib_grh), 1);
+       ssge.sg_list = swqe->sg_list + 1;
+       ssge.sge = *swqe->sg_list;
+       ssge.num_sge = swqe->wr.num_sge;
+       sge = &ssge.sge;
+       while (length) {
+               u32 len = sge->length;
+
+               if (len > length)
+                       len = length;
+               if (len > sge->sge_length)
+                       len = sge->sge_length;
+               WARN_ON_ONCE(len == 0);
+               hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, 1);
+               sge->vaddr += len;
+               sge->length -= len;
+               sge->sge_length -= len;
+               if (sge->sge_length == 0) {
+                       if (--ssge.num_sge)
+                               *sge = *ssge.sg_list++;
+               } else if (sge->length == 0 && sge->mr->lkey) {
+                       if (++sge->n >= HFI1_SEGSZ) {
+                               if (++sge->m >= sge->mr->mapsz)
+                                       break;
+                               sge->n = 0;
+                       }
+                       sge->vaddr =
+                               sge->mr->map[sge->m]->segs[sge->n].vaddr;
+                       sge->length =
+                               sge->mr->map[sge->m]->segs[sge->n].length;
+               }
+               length -= len;
+       }
+       hfi1_put_ss(&qp->r_sge);
+       if (!test_and_clear_bit(HFI1_R_WRID_VALID, &qp->r_aflags))
+               goto bail_unlock;
+       wc.wr_id = qp->r_wr_id;
+       wc.status = IB_WC_SUCCESS;
+       wc.opcode = IB_WC_RECV;
+       wc.qp = &qp->ibqp;
+       wc.src_qp = sqp->ibqp.qp_num;
+       if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI) {
+               if (sqp->ibqp.qp_type == IB_QPT_GSI ||
+                   sqp->ibqp.qp_type == IB_QPT_SMI)
+                       wc.pkey_index = swqe->wr.wr.ud.pkey_index;
+               else
+                       wc.pkey_index = sqp->s_pkey_index;
+       } else {
+               wc.pkey_index = 0;
+       }
+       wc.slid = ppd->lid | (ah_attr->src_path_bits & ((1 << ppd->lmc) - 1));
+       /* Check for loopback when the port lid is not set */
+       if (wc.slid == 0 && sqp->ibqp.qp_type == IB_QPT_GSI)
+               wc.slid = HFI1_PERMISSIVE_LID;
+       wc.sl = ah_attr->sl;
+       wc.dlid_path_bits = ah_attr->dlid & ((1 << ppd->lmc) - 1);
+       wc.port_num = qp->port_num;
+       /* Signal completion event if the solicited bit is set. */
+       hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+                     swqe->wr.send_flags & IB_SEND_SOLICITED);
+       ibp->n_loop_pkts++;
+bail_unlock:
+       spin_unlock_irqrestore(&qp->r_lock, flags);
+drop:
+       rcu_read_unlock();
+}
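
The Q_Key rule referenced in the comments above (a Q_Key with the high-order bit set means "use the Q_Key from the QP context", IBTA 10.2.5) is worth seeing in isolation. A minimal sketch follows; qkey_select() is a hypothetical helper for illustration only and is not part of this patch:

/*
 * Illustrative sketch: casting the WR Q_Key to int makes the high-bit case
 * negative, which is exactly the test ud_loopback() performs above.
 */
static u32 qkey_select(u32 wr_qkey, u32 qp_qkey)
{
        return (int)wr_qkey < 0 ? qp_qkey : wr_qkey;
}
/* qkey_select(0x80010203, 0x1234) == 0x1234; qkey_select(0x55, 0x1234) == 0x55 */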
+
+/**
+ * hfi1_make_ud_req - construct a UD request packet
+ * @qp: the QP
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ */
+int hfi1_make_ud_req(struct hfi1_qp *qp)
+{
+       struct hfi1_other_headers *ohdr;
+       struct ib_ah_attr *ah_attr;
+       struct hfi1_pportdata *ppd;
+       struct hfi1_ibport *ibp;
+       struct hfi1_swqe *wqe;
+       unsigned long flags;
+       u32 nwords;
+       u32 extra_bytes;
+       u32 bth0;
+       u16 lrh0;
+       u16 lid;
+       int ret = 0;
+       int next_cur;
+       u8 sc5;
+
+       spin_lock_irqsave(&qp->s_lock, flags);
+
+       if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_NEXT_SEND_OK)) {
+               if (!(ib_hfi1_state_ops[qp->state] & HFI1_FLUSH_SEND))
+                       goto bail;
+               /* We are in the error state, flush the work request. */
+               if (qp->s_last == qp->s_head)
+                       goto bail;
+               /* If DMAs are in progress, we can't flush immediately. */
+               if (atomic_read(&qp->s_iowait.sdma_busy)) {
+                       qp->s_flags |= HFI1_S_WAIT_DMA;
+                       goto bail;
+               }
+               wqe = get_swqe_ptr(qp, qp->s_last);
+               hfi1_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
+               goto done;
+       }
+
+       if (qp->s_cur == qp->s_head)
+               goto bail;
+
+       wqe = get_swqe_ptr(qp, qp->s_cur);
+       next_cur = qp->s_cur + 1;
+       if (next_cur >= qp->s_size)
+               next_cur = 0;
+
+       /* Construct the header. */
+       ibp = to_iport(qp->ibqp.device, qp->port_num);
+       ppd = ppd_from_ibp(ibp);
+       ah_attr = &to_iah(wqe->wr.wr.ud.ah)->attr;
+       if (ah_attr->dlid < HFI1_MULTICAST_LID_BASE ||
+           ah_attr->dlid == HFI1_PERMISSIVE_LID) {
+               lid = ah_attr->dlid & ~((1 << ppd->lmc) - 1);
+               if (unlikely(!loopback && (lid == ppd->lid ||
+                   (lid == HFI1_PERMISSIVE_LID &&
+                    qp->ibqp.qp_type == IB_QPT_GSI)))) {
+                       /*
+                        * If DMAs are in progress, we can't generate
+                        * a completion for the loopback packet since
+                        * it would be out of order.
+                        * Instead of waiting, we could queue a
+                        * zero length descriptor so we get a callback.
+                        */
+                       if (atomic_read(&qp->s_iowait.sdma_busy)) {
+                               qp->s_flags |= HFI1_S_WAIT_DMA;
+                               goto bail;
+                       }
+                       qp->s_cur = next_cur;
+                       spin_unlock_irqrestore(&qp->s_lock, flags);
+                       ud_loopback(qp, wqe);
+                       spin_lock_irqsave(&qp->s_lock, flags);
+                       hfi1_send_complete(qp, wqe, IB_WC_SUCCESS);
+                       goto done;
+               }
+       }
+
+       qp->s_cur = next_cur;
+       extra_bytes = -wqe->length & 3;
+       nwords = (wqe->length + extra_bytes) >> 2;
+
+       /* header size in 32-bit words LRH+BTH+DETH = (8+12+8)/4. */
+       qp->s_hdrwords = 7;
+       qp->s_cur_size = wqe->length;
+       qp->s_cur_sge = &qp->s_sge;
+       qp->s_srate = ah_attr->static_rate;
+       qp->srate_mbps = ib_rate_to_mbps(qp->s_srate);
+       qp->s_wqe = wqe;
+       qp->s_sge.sge = wqe->sg_list[0];
+       qp->s_sge.sg_list = wqe->sg_list + 1;
+       qp->s_sge.num_sge = wqe->wr.num_sge;
+       qp->s_sge.total_len = wqe->length;
+
+       if (ah_attr->ah_flags & IB_AH_GRH) {
+               /* Header size in 32-bit words. */
+               qp->s_hdrwords += hfi1_make_grh(ibp, &qp->s_hdr->ibh.u.l.grh,
+                                              &ah_attr->grh,
+                                              qp->s_hdrwords, nwords);
+               lrh0 = HFI1_LRH_GRH;
+               ohdr = &qp->s_hdr->ibh.u.l.oth;
+               /*
+                * Don't worry about sending to locally attached multicast
+                * QPs; what happens in that case is left unspecified by the spec.
+                */
+       } else {
+               /* Header size in 32-bit words. */
+               lrh0 = HFI1_LRH_BTH;
+               ohdr = &qp->s_hdr->ibh.u.oth;
+       }
+       if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
+               qp->s_hdrwords++;
+               ohdr->u.ud.imm_data = wqe->wr.ex.imm_data;
+               bth0 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE << 24;
+       } else
+               bth0 = IB_OPCODE_UD_SEND_ONLY << 24;
+       sc5 = ibp->sl_to_sc[ah_attr->sl];
+       lrh0 |= (ah_attr->sl & 0xf) << 4;
+       if (qp->ibqp.qp_type == IB_QPT_SMI) {
+               lrh0 |= 0xF000; /* Set VL (see ch. 13.5.3.1) */
+               qp->s_sc = 0xf;
+       } else {
+               lrh0 |= (sc5 & 0xf) << 12;
+               qp->s_sc = sc5;
+       }
+       qp->s_hdr->ibh.lrh[0] = cpu_to_be16(lrh0);
+       qp->s_hdr->ibh.lrh[1] = cpu_to_be16(ah_attr->dlid);  /* DEST LID */
+       qp->s_hdr->ibh.lrh[2] =
+               cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC);
+       if (ah_attr->dlid == be16_to_cpu(IB_LID_PERMISSIVE))
+               qp->s_hdr->ibh.lrh[3] = IB_LID_PERMISSIVE;
+       else {
+               lid = ppd->lid;
+               if (lid) {
+                       lid |= ah_attr->src_path_bits & ((1 << ppd->lmc) - 1);
+                       qp->s_hdr->ibh.lrh[3] = cpu_to_be16(lid);
+               } else
+                       qp->s_hdr->ibh.lrh[3] = IB_LID_PERMISSIVE;
+       }
+       if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+               bth0 |= IB_BTH_SOLICITED;
+       bth0 |= extra_bytes << 20;
+       if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI)
+               bth0 |= hfi1_get_pkey(ibp, wqe->wr.wr.ud.pkey_index);
+       else
+               bth0 |= hfi1_get_pkey(ibp, qp->s_pkey_index);
+       ohdr->bth[0] = cpu_to_be32(bth0);
+       ohdr->bth[1] = cpu_to_be32(wqe->wr.wr.ud.remote_qpn);
+       ohdr->bth[2] = cpu_to_be32(mask_psn(qp->s_next_psn++));
+       /*
+        * Qkeys with the high order bit set mean use the
+        * qkey from the QP context instead of the WR (see 10.2.5).
+        */
+       ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->wr.wr.ud.remote_qkey < 0 ?
+                                        qp->qkey : wqe->wr.wr.ud.remote_qkey);
+       ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num);
+       /* disarm any ahg */
+       qp->s_hdr->ahgcount = 0;
+       qp->s_hdr->ahgidx = 0;
+       qp->s_hdr->tx_flags = 0;
+       qp->s_hdr->sde = NULL;
+
+done:
+       ret = 1;
+       goto unlock;
+
+bail:
+       qp->s_flags &= ~HFI1_S_BUSY;
+unlock:
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+       return ret;
+}
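
The padding arithmetic in hfi1_make_ud_req() above (extra_bytes = -wqe->length & 3) rounds the payload up to a 4-byte boundary before converting to 32-bit words. A small standalone check of that arithmetic, not part of the patch:

/*
 * Illustrative sketch of the pad/word computation used above.
 * For length = 13: extra_bytes = (-13) & 3 = 3, nwords = (13 + 3) >> 2 = 4.
 * For length = 16: extra_bytes = 0,           nwords = 16 >> 2       = 4.
 */
static void pad_example(u32 length)
{
        u32 extra_bytes = -length & 3;            /* bytes needed to reach a dword boundary */
        u32 nwords = (length + extra_bytes) >> 2; /* payload size in 32-bit words */

        (void)nwords;
}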
+
+/*
+ * Hardware can't check this so we do it here.
+ *
+ * This is a slightly different algorithm from the standard pkey check.  It
+ * special-cases the management keys and allows for 0x7fff and 0xffff to be in
+ * the table at the same time.
+ *
+ * @returns the index found or -1 if not found
+ */
+int hfi1_lookup_pkey_idx(struct hfi1_ibport *ibp, u16 pkey)
+{
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       unsigned i;
+
+       if (pkey == FULL_MGMT_P_KEY || pkey == LIM_MGMT_P_KEY) {
+               unsigned lim_idx = -1;
+
+               for (i = 0; i < ARRAY_SIZE(ppd->pkeys); ++i) {
+                       /* here we look for an exact match */
+                       if (ppd->pkeys[i] == pkey)
+                               return i;
+                       if (ppd->pkeys[i] == LIM_MGMT_P_KEY)
+                               lim_idx = i;
+               }
+
+               /* did not find 0xffff; return the 0x7fff idx if one was found */
+               if (pkey == FULL_MGMT_P_KEY)
+                       return lim_idx;
+
+               /* no match...  */
+               return -1;
+       }
+
+       pkey &= 0x7fff; /* remove limited/full membership bit */
+
+       for (i = 0; i < ARRAY_SIZE(ppd->pkeys); ++i)
+               if ((ppd->pkeys[i] & 0x7fff) == pkey)
+                       return i;
+
+       /*
+        * Should not get here, this means hardware failed to validate pkeys.
+        */
+       return -1;
+}
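
To make the special case above concrete: with a table that holds only the limited management key, a lookup of the full key still succeeds by falling back to the limited key's index, while any other missing key returns -1. The table contents below are made up for the example and are not part of the patch:

/* Illustrative example: assume ppd->pkeys = { 0x7fff, 0x8001, 0x0000, ... }  */
/* hfi1_lookup_pkey_idx(ibp, 0x7fff) ->  0   exact match                      */
/* hfi1_lookup_pkey_idx(ibp, 0xffff) ->  0   falls back to the 0x7fff index   */
/* hfi1_lookup_pkey_idx(ibp, 0x0001) ->  1   membership bit is ignored        */
/* hfi1_lookup_pkey_idx(ibp, 0x1234) -> -1   not in the table                 */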
+
+void return_cnp(struct hfi1_ibport *ibp, struct hfi1_qp *qp, u32 remote_qpn,
+               u32 pkey, u32 slid, u32 dlid, u8 sc5,
+               const struct ib_grh *old_grh)
+{
+       u64 pbc, pbc_flags = 0;
+       u32 bth0, plen, vl, hwords = 5;
+       u16 lrh0;
+       u8 sl = ibp->sc_to_sl[sc5];
+       struct hfi1_ib_header hdr;
+       struct hfi1_other_headers *ohdr;
+       struct pio_buf *pbuf;
+       struct send_context *ctxt = qp_to_send_context(qp, sc5);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+
+       if (old_grh) {
+               struct ib_grh *grh = &hdr.u.l.grh;
+
+               grh->version_tclass_flow = old_grh->version_tclass_flow;
+               grh->paylen = cpu_to_be16((hwords - 2 + SIZE_OF_CRC) << 2);
+               grh->hop_limit = 0xff;
+               grh->sgid = old_grh->dgid;
+               grh->dgid = old_grh->sgid;
+               ohdr = &hdr.u.l.oth;
+               lrh0 = HFI1_LRH_GRH;
+               hwords += sizeof(struct ib_grh) / sizeof(u32);
+       } else {
+               ohdr = &hdr.u.oth;
+               lrh0 = HFI1_LRH_BTH;
+       }
+
+       lrh0 |= (sc5 & 0xf) << 12 | sl << 4;
+
+       bth0 = pkey | (IB_OPCODE_CNP << 24);
+       ohdr->bth[0] = cpu_to_be32(bth0);
+
+       ohdr->bth[1] = cpu_to_be32(remote_qpn | (1 << HFI1_BECN_SHIFT));
+       ohdr->bth[2] = 0; /* PSN 0 */
+
+       hdr.lrh[0] = cpu_to_be16(lrh0);
+       hdr.lrh[1] = cpu_to_be16(dlid);
+       hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
+       hdr.lrh[3] = cpu_to_be16(slid);
+
+       plen = 2 /* PBC */ + hwords;
+       pbc_flags |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT;
+       vl = sc_to_vlt(ppd->dd, sc5);
+       pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
+       if (ctxt) {
+               pbuf = sc_buffer_alloc(ctxt, plen, NULL, NULL);
+               if (pbuf)
+                       ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc,
+                                                &hdr, hwords);
+       }
+}
+
+/*
+ * opa_smp_check() - Do the regular pkey checking, and the additional
+ * checks for SMPs specified in OPAv1 rev 0.90, section 9.10.26
+ * ("SMA Packet Checks").
+ *
+ * Note that:
+ *   - Checks are done using the pkey directly from the packet's BTH,
+ *     and specifically _not_ the pkey that we attach to the completion,
+ *     which may be different.
+ *   - These checks are specifically for "non-local" SMPs (i.e., SMPs
+ *     which originated on another node). SMPs which are sent from, and
+ *     destined to this node are checked in opa_local_smp_check().
+ *
+ * At the point where opa_smp_check() is called, we know:
+ *   - destination QP is QP0
+ *
+ * opa_smp_check() returns 0 if all checks succeed, 1 otherwise.
+ */
+static int opa_smp_check(struct hfi1_ibport *ibp, u16 pkey, u8 sc5,
+                        struct hfi1_qp *qp, u16 slid, struct opa_smp *smp)
+{
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+
+       /*
+        * I don't think it's possible for us to get here with sc != 0xf,
+        * but check it to be certain.
+        */
+       if (sc5 != 0xf)
+               return 1;
+
+       if (rcv_pkey_check(ppd, pkey, sc5, slid))
+               return 1;
+
+       /*
+        * At this point we know (and so don't need to check again) that
+        * the pkey is either LIM_MGMT_P_KEY, or FULL_MGMT_P_KEY
+        * (see ingress_pkey_check).
+        */
+       if (smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE &&
+           smp->mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED) {
+               ingress_pkey_table_fail(ppd, pkey, slid);
+               return 1;
+       }
+
+       /*
+        * SMPs fall into one of four (disjoint) categories:
+        * SMA request, SMA response, trap, or trap repress.
+        * Our response depends, in part, on which type of
+        * SMP we're processing.
+        *
+        * If this is not an SMA request, or trap repress:
+        *   - accept MAD if the port is running an SM
+        *   - pkey == FULL_MGMT_P_KEY =>
+        *       reply with unsupported method (i.e., just mark
+        *       the smp's status field here, and let it be
+        *       processed normally)
+        *   - pkey != LIM_MGMT_P_KEY =>
+        *       increment port recv constraint errors, drop MAD
+        * If this is an SMA request or trap repress:
+        *   - pkey != FULL_MGMT_P_KEY =>
+        *       increment port recv constraint errors, drop MAD
+        */
+       switch (smp->method) {
+       case IB_MGMT_METHOD_GET:
+       case IB_MGMT_METHOD_SET:
+       case IB_MGMT_METHOD_REPORT:
+       case IB_MGMT_METHOD_TRAP_REPRESS:
+               if (pkey != FULL_MGMT_P_KEY) {
+                       ingress_pkey_table_fail(ppd, pkey, slid);
+                       return 1;
+               }
+               break;
+       case IB_MGMT_METHOD_SEND:
+       case IB_MGMT_METHOD_TRAP:
+       case IB_MGMT_METHOD_GET_RESP:
+       case IB_MGMT_METHOD_REPORT_RESP:
+               if (ibp->port_cap_flags & IB_PORT_SM)
+                       return 0;
+               if (pkey == FULL_MGMT_P_KEY) {
+                       smp->status |= IB_SMP_UNSUP_METHOD;
+                       return 0;
+               }
+               if (pkey != LIM_MGMT_P_KEY) {
+                       ingress_pkey_table_fail(ppd, pkey, slid);
+                       return 1;
+               }
+               break;
+       default:
+               break;
+       }
+       return 0;
+}
+
+
+/**
+ * hfi1_ud_rcv - receive an incoming UD packet
+ * @packet: the receive packet context (header, data, length, rcv flags and QP)
+ *
+ * This is called from qp_rcv() to process an incoming UD packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+void hfi1_ud_rcv(struct hfi1_packet *packet)
+{
+       struct hfi1_other_headers *ohdr = packet->ohdr;
+       int opcode;
+       u32 hdrsize = packet->hlen;
+       u32 pad;
+       struct ib_wc wc;
+       u32 qkey;
+       u32 src_qp;
+       u16 dlid, pkey;
+       int mgmt_pkey_idx = -1;
+       struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data;
+       struct hfi1_ib_header *hdr = packet->hdr;
+       u32 rcv_flags = packet->rcv_flags;
+       void *data = packet->ebuf;
+       u32 tlen = packet->tlen;
+       struct hfi1_qp *qp = packet->qp;
+       bool has_grh = rcv_flags & HFI1_HAS_GRH;
+       bool sc4_bit = has_sc4_bit(packet);
+       u8 sc;
+       u32 bth1;
+       int is_mcast;
+       struct ib_grh *grh = NULL;
+
+       qkey = be32_to_cpu(ohdr->u.ud.deth[0]);
+       src_qp = be32_to_cpu(ohdr->u.ud.deth[1]) & HFI1_QPN_MASK;
+       dlid = be16_to_cpu(hdr->lrh[1]);
+       is_mcast = (dlid > HFI1_MULTICAST_LID_BASE) &&
+                       (dlid != HFI1_PERMISSIVE_LID);
+       bth1 = be32_to_cpu(ohdr->bth[1]);
+       if (unlikely(bth1 & HFI1_BECN_SMASK)) {
+               /*
+                * In pre-B0 h/w the CNP_OPCODE is handled via an
+                * error path (errata 291394).
+                */
+               struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+               u32 lqpn =  be32_to_cpu(ohdr->bth[1]) & HFI1_QPN_MASK;
+               u8 sl, sc5;
+
+               sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
+               sc5 |= sc4_bit;
+               sl = ibp->sc_to_sl[sc5];
+
+               process_becn(ppd, sl, 0, lqpn, 0, IB_CC_SVCTYPE_UD);
+       }
+
+       /*
+        * The opcode is in the low byte when it's in network order
+        * (top byte when in host order).
+        */
+       opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
+       opcode &= 0xff;
+
+       pkey = (u16)be32_to_cpu(ohdr->bth[0]);
+
+       if (!is_mcast && (opcode != IB_OPCODE_CNP) && bth1 & HFI1_FECN_SMASK) {
+               u16 slid = be16_to_cpu(hdr->lrh[3]);
+               u8 sc5;
+
+               sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
+               sc5 |= sc4_bit;
+
+               return_cnp(ibp, qp, src_qp, pkey, dlid, slid, sc5, grh);
+       }
+       /*
+        * Get the number of bytes the message was padded by
+        * and drop incomplete packets.
+        */
+       pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+       if (unlikely(tlen < (hdrsize + pad + 4)))
+               goto drop;
+
+       tlen -= hdrsize + pad + 4;
+
+       /*
+        * Check that the permissive LID is only used on QP0
+        * and the QKEY matches (see 9.6.1.4.1 and 9.6.1.5.1).
+        */
+       if (qp->ibqp.qp_num) {
+               if (unlikely(hdr->lrh[1] == IB_LID_PERMISSIVE ||
+                            hdr->lrh[3] == IB_LID_PERMISSIVE))
+                       goto drop;
+               if (qp->ibqp.qp_num > 1) {
+                       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+                       u16 slid;
+                       u8 sc5;
+
+                       sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
+                       sc5 |= sc4_bit;
+
+                       slid = be16_to_cpu(hdr->lrh[3]);
+                       if (unlikely(rcv_pkey_check(ppd, pkey, sc5, slid))) {
+                               /*
+                                * Traps will not be sent for packets dropped
+                                * by the HW. This is fine, as sending a trap
+                                * for invalid pkeys is optional according to
+                                * the IB spec (release 1.3, section 10.9.4).
+                                */
+                               hfi1_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_PKEY,
+                                              pkey,
+                                              (be16_to_cpu(hdr->lrh[0]) >> 4) &
+                                               0xF,
+                                              src_qp, qp->ibqp.qp_num,
+                                              hdr->lrh[3], hdr->lrh[1]);
+                               return;
+                       }
+               } else {
+                       /* GSI packet */
+                       mgmt_pkey_idx = hfi1_lookup_pkey_idx(ibp, pkey);
+                       if (mgmt_pkey_idx < 0)
+                               goto drop;
+
+               }
+               if (unlikely(qkey != qp->qkey)) {
+                       hfi1_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_QKEY, qkey,
+                                      (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF,
+                                      src_qp, qp->ibqp.qp_num,
+                                      hdr->lrh[3], hdr->lrh[1]);
+                       return;
+               }
+               /* Drop invalid MAD packets (see 13.5.3.1). */
+               if (unlikely(qp->ibqp.qp_num == 1 &&
+                            (tlen > 2048 ||
+                             (be16_to_cpu(hdr->lrh[0]) >> 12) == 15)))
+                       goto drop;
+       } else {
+               /* Received on QP0, and so by definition, this is an SMP */
+               struct opa_smp *smp = (struct opa_smp *)data;
+               u16 slid = be16_to_cpu(hdr->lrh[3]);
+               u8 sc5;
+
+               sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
+               sc5 |= sc4_bit;
+
+               if (opa_smp_check(ibp, pkey, sc5, qp, slid, smp))
+                       goto drop;
+
+               if (tlen > 2048)
+                       goto drop;
+               if ((hdr->lrh[1] == IB_LID_PERMISSIVE ||
+                    hdr->lrh[3] == IB_LID_PERMISSIVE) &&
+                   smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+                       goto drop;
+
+               /* look up SMI pkey */
+               mgmt_pkey_idx = hfi1_lookup_pkey_idx(ibp, pkey);
+               if (mgmt_pkey_idx < 0)
+                       goto drop;
+
+       }
+
+       if (qp->ibqp.qp_num > 1 &&
+           opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) {
+               wc.ex.imm_data = ohdr->u.ud.imm_data;
+               wc.wc_flags = IB_WC_WITH_IMM;
+               tlen -= sizeof(u32);
+       } else if (opcode == IB_OPCODE_UD_SEND_ONLY) {
+               wc.ex.imm_data = 0;
+               wc.wc_flags = 0;
+       } else
+               goto drop;
+
+       /*
+        * A GRH is expected to precede the data even if not
+        * present on the wire.
+        */
+       wc.byte_len = tlen + sizeof(struct ib_grh);
+
+       /*
+        * Get the next work request entry to find where to put the data.
+        */
+       if (qp->r_flags & HFI1_R_REUSE_SGE)
+               qp->r_flags &= ~HFI1_R_REUSE_SGE;
+       else {
+               int ret;
+
+               ret = hfi1_get_rwqe(qp, 0);
+               if (ret < 0) {
+                       hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+                       return;
+               }
+               if (!ret) {
+                       if (qp->ibqp.qp_num == 0)
+                               ibp->n_vl15_dropped++;
+                       return;
+               }
+       }
+       /* Silently drop packets which are too big. */
+       if (unlikely(wc.byte_len > qp->r_len)) {
+               qp->r_flags |= HFI1_R_REUSE_SGE;
+               goto drop;
+       }
+       if (has_grh) {
+               hfi1_copy_sge(&qp->r_sge, &hdr->u.l.grh,
+                             sizeof(struct ib_grh), 1);
+               wc.wc_flags |= IB_WC_GRH;
+       } else
+               hfi1_skip_sge(&qp->r_sge, sizeof(struct ib_grh), 1);
+       hfi1_copy_sge(&qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh), 1);
+       hfi1_put_ss(&qp->r_sge);
+       if (!test_and_clear_bit(HFI1_R_WRID_VALID, &qp->r_aflags))
+               return;
+       wc.wr_id = qp->r_wr_id;
+       wc.status = IB_WC_SUCCESS;
+       wc.opcode = IB_WC_RECV;
+       wc.vendor_err = 0;
+       wc.qp = &qp->ibqp;
+       wc.src_qp = src_qp;
+
+       if (qp->ibqp.qp_type == IB_QPT_GSI ||
+           qp->ibqp.qp_type == IB_QPT_SMI) {
+               if (mgmt_pkey_idx < 0) {
+                       if (net_ratelimit()) {
+                               struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+                               struct hfi1_devdata *dd = ppd->dd;
+
+                               dd_dev_err(dd, "QP type %d mgmt_pkey_idx < 0 and packet not dropped???\n",
+                                          qp->ibqp.qp_type);
+                               mgmt_pkey_idx = 0;
+                       }
+               }
+               wc.pkey_index = (unsigned)mgmt_pkey_idx;
+       } else
+               wc.pkey_index = 0;
+
+       wc.slid = be16_to_cpu(hdr->lrh[3]);
+       sc = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf;
+       sc |= sc4_bit;
+       wc.sl = ibp->sc_to_sl[sc];
+
+       /*
+        * Save the LMC lower bits if the destination LID is a unicast LID.
+        */
+       wc.dlid_path_bits = dlid >= HFI1_MULTICAST_LID_BASE ? 0 :
+               dlid & ((1 << ppd_from_ibp(ibp)->lmc) - 1);
+       wc.port_num = qp->port_num;
+       /* Signal completion event if the solicited bit is set. */
+       hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+                     (ohdr->bth[0] &
+                       cpu_to_be32(IB_BTH_SOLICITED)) != 0);
+       return;
+
+drop:
+       ibp->n_pkt_drops++;
+}
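
For reference, the first BTH dword decoded in hfi1_ud_rcv() above packs the opcode in its top byte, the pad count in bits 21:20 and the P_Key in the low 16 bits. A small decode sketch consistent with the shifts used above; it is illustrative only and not part of the patch:

/* Illustrative sketch: decoding bth[0] the same way hfi1_ud_rcv() does. */
static void decode_bth0(__be32 bth0_be)
{
        u32 bth0   = be32_to_cpu(bth0_be);
        u8  opcode = (bth0 >> 24) & 0xff;  /* e.g. IB_OPCODE_UD_SEND_ONLY */
        u8  pad    = (bth0 >> 20) & 3;     /* payload pad bytes, 0..3 */
        u16 pkey   = (u16)bth0;            /* P_Key carried in the packet */

        (void)opcode; (void)pad; (void)pkey;
}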
diff --git a/drivers/staging/rdma/hfi1/user_pages.c b/drivers/staging/rdma/hfi1/user_pages.c
new file mode 100644 (file)
index 0000000..9071afb
--- /dev/null
@@ -0,0 +1,156 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/device.h>
+
+#include "hfi.h"
+
+static void __hfi1_release_user_pages(struct page **p, size_t num_pages,
+                                     int dirty)
+{
+       size_t i;
+
+       for (i = 0; i < num_pages; i++) {
+               if (dirty)
+                       set_page_dirty_lock(p[i]);
+               put_page(p[i]);
+       }
+}
+
+/*
+ * Call with current->mm->mmap_sem held.
+ */
+static int __hfi1_get_user_pages(unsigned long start_page, size_t num_pages,
+                                struct page **p)
+{
+       unsigned long lock_limit;
+       size_t got;
+       int ret;
+
+       lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+       if (num_pages > lock_limit && !capable(CAP_IPC_LOCK)) {
+               ret = -ENOMEM;
+               goto bail;
+       }
+
+       for (got = 0; got < num_pages; got += ret) {
+               ret = get_user_pages(current, current->mm,
+                                    start_page + got * PAGE_SIZE,
+                                    num_pages - got, 1, 1,
+                                    p + got, NULL);
+               if (ret < 0)
+                       goto bail_release;
+       }
+
+       current->mm->pinned_vm += num_pages;
+
+       ret = 0;
+       goto bail;
+
+bail_release:
+       __hfi1_release_user_pages(p, got, 0);
+bail:
+       return ret;
+}
+
+/**
+ * hfi1_map_page - a safety wrapper around pci_map_page()
+ *
+ */
+dma_addr_t hfi1_map_page(struct pci_dev *hwdev, struct page *page,
+                        unsigned long offset, size_t size, int direction)
+{
+       dma_addr_t phys;
+
+       phys = pci_map_page(hwdev, page, offset, size, direction);
+
+       return phys;
+}
+
+/**
+ * hfi1_get_user_pages - lock user pages into memory
+ * @start_page: the start page
+ * @num_pages: the number of pages
+ * @p: the output page structures
+ *
+ * This function takes a given start page (page aligned user virtual
+ * address) and pins it and the following specified number of pages.  For
+ * now, num_pages is always 1, but that will probably change at some point
+ * (because the caller is doing expected sends on a single virtually contiguous
+ * buffer, so we can do all pages at once).
+ */
+int hfi1_get_user_pages(unsigned long start_page, size_t num_pages,
+                       struct page **p)
+{
+       int ret;
+
+       down_write(&current->mm->mmap_sem);
+
+       ret = __hfi1_get_user_pages(start_page, num_pages, p);
+
+       up_write(&current->mm->mmap_sem);
+
+       return ret;
+}
+
+void hfi1_release_user_pages(struct page **p, size_t num_pages)
+{
+       if (current->mm) /* during close after signal, mm can be NULL */
+               down_write(&current->mm->mmap_sem);
+
+       __hfi1_release_user_pages(p, num_pages, 1);
+
+       if (current->mm) {
+               current->mm->pinned_vm -= num_pages;
+               up_write(&current->mm->mmap_sem);
+       }
+}
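
A sketch of how a caller might pair these helpers: pin the pages, hand them to the DMA path, then release them (which also marks them dirty). The buffer address, page count and error handling here are illustrative assumptions, not code from this patch:

/* Illustrative sketch: pin, use, then unpin the user pages. */
static int pin_and_use(unsigned long uaddr, size_t npages, struct page **pages)
{
        int ret = hfi1_get_user_pages(uaddr & PAGE_MASK, npages, pages);

        if (ret)
                return ret;     /* e.g. -ENOMEM when over RLIMIT_MEMLOCK */

        /* ... hand the pinned pages to the DMA path ... */

        hfi1_release_user_pages(pages, npages);  /* also marks the pages dirty */
        return 0;
}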
diff --git a/drivers/staging/rdma/hfi1/user_sdma.c b/drivers/staging/rdma/hfi1/user_sdma.c
new file mode 100644 (file)
index 0000000..5552661
--- /dev/null
@@ -0,0 +1,1444 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/dmapool.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+#include <linux/io.h>
+#include <linux/uio.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/mmu_context.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+
+#include "hfi.h"
+#include "sdma.h"
+#include "user_sdma.h"
+#include "sdma.h"
+#include "verbs.h"  /* for the headers */
+#include "common.h" /* for struct hfi1_tid_info */
+#include "trace.h"
+
+static uint hfi1_sdma_comp_ring_size = 128;
+module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO);
+MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128");
+
+/* The maximum number of Data io vectors per message/request */
+#define MAX_VECTORS_PER_REQ 8
+/*
+ * Maximum number of packets to send from each message/request
+ * before moving to the next one.
+ */
+#define MAX_PKTS_PER_QUEUE 16
+
+#define num_pages(x) (1 + ((((x) - 1) & PAGE_MASK) >> PAGE_SHIFT))
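
The num_pages() macro above counts how many pages a byte length spans when it starts page-aligned. A quick worked check of the arithmetic, assuming a 4 KiB page size (so PAGE_MASK = ~0xfff and PAGE_SHIFT = 12); this is an illustration, not part of the patch:

/*
 * num_pages(1)    = 1 + ((0    & PAGE_MASK) >> PAGE_SHIFT) = 1
 * num_pages(4096) = 1 + ((4095 & PAGE_MASK) >> PAGE_SHIFT) = 1
 * num_pages(4097) = 1 + ((4096 & PAGE_MASK) >> PAGE_SHIFT) = 2
 */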
+
+#define req_opcode(x) \
+       (((x) >> HFI1_SDMA_REQ_OPCODE_SHIFT) & HFI1_SDMA_REQ_OPCODE_MASK)
+#define req_version(x) \
+       (((x) >> HFI1_SDMA_REQ_VERSION_SHIFT) & HFI1_SDMA_REQ_OPCODE_MASK)
+#define req_iovcnt(x) \
+       (((x) >> HFI1_SDMA_REQ_IOVCNT_SHIFT) & HFI1_SDMA_REQ_IOVCNT_MASK)
+
+/* Number of BTH.PSN bits used for sequence number in expected rcvs */
+#define BTH_SEQ_MASK 0x7ffull
+
+/*
+ * Define fields in the KDETH header so we can update the header
+ * template.
+ */
+#define KDETH_OFFSET_SHIFT        0
+#define KDETH_OFFSET_MASK         0x7fff
+#define KDETH_OM_SHIFT            15
+#define KDETH_OM_MASK             0x1
+#define KDETH_TID_SHIFT           16
+#define KDETH_TID_MASK            0x3ff
+#define KDETH_TIDCTRL_SHIFT       26
+#define KDETH_TIDCTRL_MASK        0x3
+#define KDETH_INTR_SHIFT          28
+#define KDETH_INTR_MASK           0x1
+#define KDETH_SH_SHIFT            29
+#define KDETH_SH_MASK             0x1
+#define KDETH_HCRC_UPPER_SHIFT    16
+#define KDETH_HCRC_UPPER_MASK     0xff
+#define KDETH_HCRC_LOWER_SHIFT    24
+#define KDETH_HCRC_LOWER_MASK     0xff
+
+#define PBC2LRH(x) ((((x) & 0xfff) << 2) - 4)
+#define LRH2PBC(x) ((((x) >> 2) + 1) & 0xfff)
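
Judging by the +1/-4 and the shifts, PBC2LRH()/LRH2PBC() convert between a PBC dword count and the corresponding LRH byte length, and they round-trip. A quick arithmetic check, purely for illustration and not part of the patch:

/*
 * PBC2LRH(10) = ((10 & 0xfff) << 2) - 4 = 36
 * LRH2PBC(36) = ((36 >> 2) + 1) & 0xfff = 10
 */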
+
+#define KDETH_GET(val, field)                                          \
+       (((le32_to_cpu((val))) >> KDETH_##field##_SHIFT) & KDETH_##field##_MASK)
+#define KDETH_SET(dw, field, val) do {                                 \
+               u32 dwval = le32_to_cpu(dw);                            \
+               dwval &= ~(KDETH_##field##_MASK << KDETH_##field##_SHIFT); \
+               dwval |= (((val) & KDETH_##field##_MASK) << \
+                         KDETH_##field##_SHIFT);                       \
+               dw = cpu_to_le32(dwval);                                \
+       } while (0)
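
A usage sketch of the accessors above; hdr stands for any struct hfi1_pkt_header, as used later in this file, and kdeth_example() is a hypothetical function added only to illustrate the macros:

/*
 * Illustrative sketch: reading and updating KDETH fields.
 * KDETH_SET() rewrites the little-endian dword in place, so it needs an lvalue.
 */
static void kdeth_example(struct hfi1_pkt_header *hdr, u32 new_offset)
{
        u32 tid = KDETH_GET(hdr->kdeth.ver_tid_offset, TID);      /* 10-bit TID */

        KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET, new_offset); /* 15-bit offset */
        (void)tid;
}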
+
+#define AHG_HEADER_SET(arr, idx, dw, bit, width, value)                        \
+       do {                                                            \
+               if ((idx) < ARRAY_SIZE((arr)))                          \
+                       (arr)[(idx++)] = sdma_build_ahg_descriptor(     \
+                               (__force u16)(value), (dw), (bit),      \
+                                                       (width));       \
+               else                                                    \
+                       return -ERANGE;                                 \
+       } while (0)
+
+/* KDETH OM multipliers and switch over point */
+#define KDETH_OM_SMALL     4
+#define KDETH_OM_LARGE     64
+#define KDETH_OM_MAX_SIZE  (1 << ((KDETH_OM_LARGE / KDETH_OM_SMALL) + 1))
+
+/* Last packet in the request */
+#define USER_SDMA_TXREQ_FLAGS_LAST_PKT   (1 << 0)
+
+#define SDMA_REQ_IN_USE     0
+#define SDMA_REQ_FOR_THREAD 1
+#define SDMA_REQ_SEND_DONE  2
+#define SDMA_REQ_HAVE_AHG   3
+#define SDMA_REQ_HAS_ERROR  4
+#define SDMA_REQ_DONE_ERROR 5
+
+#define SDMA_PKT_Q_INACTIVE (1 << 0)
+#define SDMA_PKT_Q_ACTIVE   (1 << 1)
+#define SDMA_PKT_Q_DEFERRED (1 << 2)
+
+/*
+ * Maximum retry attempts to submit a TX request
+ * before putting the process to sleep.
+ */
+#define MAX_DEFER_RETRY_COUNT 1
+
+static unsigned initial_pkt_count = 8;
+
+#define SDMA_IOWAIT_TIMEOUT 1000 /* in milliseconds */
+
+struct user_sdma_iovec {
+       struct iovec iov;
+       /* number of pages in this vector */
+       unsigned npages;
+       /* array of pinned pages for this vector */
+       struct page **pages;
+       /* offset into the virtual address space of the vector at
+        * which we last left off. */
+       u64 offset;
+};
+
+struct user_sdma_request {
+       struct sdma_req_info info;
+       struct hfi1_user_sdma_pkt_q *pq;
+       struct hfi1_user_sdma_comp_q *cq;
+       /* This is the original header from user space */
+       struct hfi1_pkt_header hdr;
+       /*
+        * Pointer to the SDMA engine for this request.
+        * Since different requests could be on different VLs,
+        * each request will need its own engine pointer.
+        */
+       struct sdma_engine *sde;
+       u8 ahg_idx;
+       u32 ahg[9];
+       /*
+        * KDETH.Offset (Eager) field
+        * We need to remember the initial value so the headers
+        * can be updated properly.
+        */
+       u32 koffset;
+       /*
+        * KDETH.OFFSET (TID) field
+        * The offset can cover multiple packets, depending on the
+        * size of the TID entry.
+        */
+       u32 tidoffset;
+       /*
+        * KDETH.OM
+        * Remember this because the header template always sets it
+        * to 0.
+        */
+       u8 omfactor;
+       /*
+        * pointer to the user's task_struct. We are going to
+        * get a reference to it so we can process io vectors
+        * at a later time.
+        */
+       struct task_struct *user_proc;
+       /*
+        * pointer to the user's mm_struct. We are going to
+        * get a reference to it so it doesn't get freed
+        * since we might not be in process context when we
+        * are processing the iov's.
+        * Using this mm_struct, we can get vma based on the
+        * iov's address (find_vma()).
+        */
+       struct mm_struct *user_mm;
+       /*
+        * We copy the iovs for this request (based on
+        * info.iovcnt). These are only the data vectors
+        * info.iovcnt). These are only the data vectors.
+       unsigned data_iovs;
+       /* total length of the data in the request */
+       u32 data_len;
+       /* progress index moving along the iovs array */
+       unsigned iov_idx;
+       struct user_sdma_iovec iovs[MAX_VECTORS_PER_REQ];
+       /* number of elements copied to the tids array */
+       u16 n_tids;
+       /* TID array values copied from the tid_iov vector */
+       u32 *tids;
+       u16 tididx;
+       u32 sent;
+       u64 seqnum;
+       spinlock_t list_lock;
+       struct list_head txps;
+       unsigned long flags;
+};
+
+struct user_sdma_txreq {
+       /* Packet header for the txreq */
+       struct hfi1_pkt_header hdr;
+       struct sdma_txreq txreq;
+       struct user_sdma_request *req;
+       struct user_sdma_iovec *iovec1;
+       struct user_sdma_iovec *iovec2;
+       u16 flags;
+       unsigned busycount;
+       u64 seqnum;
+};
+
+#define SDMA_DBG(req, fmt, ...)                                     \
+       hfi1_cdbg(SDMA, "[%u:%u:%u:%u] " fmt, (req)->pq->dd->unit, \
+                (req)->pq->ctxt, (req)->pq->subctxt, (req)->info.comp_idx, \
+                ##__VA_ARGS__)
+#define SDMA_Q_DBG(pq, fmt, ...)                        \
+       hfi1_cdbg(SDMA, "[%u:%u:%u] " fmt, (pq)->dd->unit, (pq)->ctxt, \
+                (pq)->subctxt, ##__VA_ARGS__)
+
+static int user_sdma_send_pkts(struct user_sdma_request *, unsigned);
+static int num_user_pages(const struct iovec *);
+static void user_sdma_txreq_cb(struct sdma_txreq *, int, int);
+static void user_sdma_free_request(struct user_sdma_request *);
+static int pin_vector_pages(struct user_sdma_request *,
+                           struct user_sdma_iovec *);
+static void unpin_vector_pages(struct user_sdma_iovec *);
+static int check_header_template(struct user_sdma_request *,
+                                struct hfi1_pkt_header *, u32, u32);
+static int set_txreq_header(struct user_sdma_request *,
+                           struct user_sdma_txreq *, u32);
+static int set_txreq_header_ahg(struct user_sdma_request *,
+                               struct user_sdma_txreq *, u32);
+static inline void set_comp_state(struct user_sdma_request *,
+                                       enum hfi1_sdma_comp_state, int);
+static inline u32 set_pkt_bth_psn(__be32, u8, u32);
+static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len);
+
+static int defer_packet_queue(
+       struct sdma_engine *,
+       struct iowait *,
+       struct sdma_txreq *,
+       unsigned seq);
+static void activate_packet_queue(struct iowait *, int);
+
+static inline int iovec_may_free(struct user_sdma_iovec *iovec,
+                                      void (*free)(struct user_sdma_iovec *))
+{
+       if (ACCESS_ONCE(iovec->offset) == iovec->iov.iov_len) {
+               free(iovec);
+               return 1;
+       }
+       return 0;
+}
+
+static inline void iovec_set_complete(struct user_sdma_iovec *iovec)
+{
+       iovec->offset = iovec->iov.iov_len;
+}
+
+static int defer_packet_queue(
+       struct sdma_engine *sde,
+       struct iowait *wait,
+       struct sdma_txreq *txreq,
+       unsigned seq)
+{
+       struct hfi1_user_sdma_pkt_q *pq =
+               container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
+       struct hfi1_ibdev *dev = &pq->dd->verbs_dev;
+       struct user_sdma_txreq *tx =
+               container_of(txreq, struct user_sdma_txreq, txreq);
+
+       if (sdma_progress(sde, seq, txreq)) {
+               if (tx->busycount++ < MAX_DEFER_RETRY_COUNT)
+                       goto eagain;
+       }
+       /*
+        * We are assuming that if the list is enqueued somewhere, it
+        * is to the dmawait list since that is the only place where
+        * it is supposed to be enqueued.
+        */
+       xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
+       write_seqlock(&dev->iowait_lock);
+       if (list_empty(&pq->busy.list))
+               list_add_tail(&pq->busy.list, &sde->dmawait);
+       write_sequnlock(&dev->iowait_lock);
+       return -EBUSY;
+eagain:
+       return -EAGAIN;
+}
+
+static void activate_packet_queue(struct iowait *wait, int reason)
+{
+       struct hfi1_user_sdma_pkt_q *pq =
+               container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
+       xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
+       wake_up(&wait->wait_dma);
+};
+
+static void sdma_kmem_cache_ctor(void *obj)
+{
+       struct user_sdma_txreq *tx = (struct user_sdma_txreq *)obj;
+
+       memset(tx, 0, sizeof(*tx));
+}
+
+int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct file *fp)
+{
+       int ret = 0;
+       unsigned memsize;
+       char buf[64];
+       struct hfi1_devdata *dd;
+       struct hfi1_user_sdma_comp_q *cq;
+       struct hfi1_user_sdma_pkt_q *pq;
+       unsigned long flags;
+
+       if (!uctxt || !fp) {
+               ret = -EBADF;
+               goto done;
+       }
+
+       if (!hfi1_sdma_comp_ring_size) {
+               ret = -EINVAL;
+               goto done;
+       }
+
+       dd = uctxt->dd;
+
+       pq = kzalloc(sizeof(*pq), GFP_KERNEL);
+       if (!pq) {
+               dd_dev_err(dd,
+                          "[%u:%u] Failed to allocate SDMA request struct\n",
+                          uctxt->ctxt, subctxt_fp(fp));
+               goto pq_nomem;
+       }
+       memsize = sizeof(*pq->reqs) * hfi1_sdma_comp_ring_size;
+       pq->reqs = kmalloc(memsize, GFP_KERNEL);
+       if (!pq->reqs) {
+               dd_dev_err(dd,
+                          "[%u:%u] Failed to allocate SDMA request queue (%u)\n",
+                          uctxt->ctxt, subctxt_fp(fp), memsize);
+               goto pq_reqs_nomem;
+       }
+       INIT_LIST_HEAD(&pq->list);
+       pq->dd = dd;
+       pq->ctxt = uctxt->ctxt;
+       pq->subctxt = subctxt_fp(fp);
+       pq->n_max_reqs = hfi1_sdma_comp_ring_size;
+       pq->state = SDMA_PKT_Q_INACTIVE;
+       atomic_set(&pq->n_reqs, 0);
+
+       iowait_init(&pq->busy, 0, NULL, defer_packet_queue,
+                   activate_packet_queue);
+       pq->reqidx = 0;
+       snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt,
+                subctxt_fp(fp));
+       pq->txreq_cache = kmem_cache_create(buf,
+                              sizeof(struct user_sdma_txreq),
+                                           L1_CACHE_BYTES,
+                                           SLAB_HWCACHE_ALIGN,
+                                           sdma_kmem_cache_ctor);
+       if (!pq->txreq_cache) {
+               dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n",
+                          uctxt->ctxt);
+               goto pq_txreq_nomem;
+       }
+       user_sdma_pkt_fp(fp) = pq;
+       cq = kzalloc(sizeof(*cq), GFP_KERNEL);
+       if (!cq) {
+               dd_dev_err(dd,
+                          "[%u:%u] Failed to allocate SDMA completion queue\n",
+                          uctxt->ctxt, subctxt_fp(fp));
+               goto cq_nomem;
+       }
+
+       memsize = ALIGN(sizeof(*cq->comps) * hfi1_sdma_comp_ring_size,
+                       PAGE_SIZE);
+       cq->comps = vmalloc_user(memsize);
+       if (!cq->comps) {
+               dd_dev_err(dd,
+                     "[%u:%u] Failed to allocate SDMA completion queue entries\n",
+                     uctxt->ctxt, subctxt_fp(fp));
+               goto cq_comps_nomem;
+       }
+       cq->nentries = hfi1_sdma_comp_ring_size;
+       user_sdma_comp_fp(fp) = cq;
+
+       spin_lock_irqsave(&uctxt->sdma_qlock, flags);
+       list_add(&pq->list, &uctxt->sdma_queues);
+       spin_unlock_irqrestore(&uctxt->sdma_qlock, flags);
+       goto done;
+
+cq_comps_nomem:
+       kfree(cq);
+cq_nomem:
+       kmem_cache_destroy(pq->txreq_cache);
+pq_txreq_nomem:
+       kfree(pq->reqs);
+pq_reqs_nomem:
+       kfree(pq);
+       user_sdma_pkt_fp(fp) = NULL;
+pq_nomem:
+       ret = -ENOMEM;
+done:
+       return ret;
+}
+
+int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd)
+{
+       struct hfi1_ctxtdata *uctxt = fd->uctxt;
+       struct hfi1_user_sdma_pkt_q *pq;
+       unsigned long flags;
+
+       hfi1_cdbg(SDMA, "[%u:%u:%u] Freeing user SDMA queues", uctxt->dd->unit,
+                 uctxt->ctxt, fd->subctxt);
+       pq = fd->pq;
+       if (pq) {
+               u16 i, j;
+
+               spin_lock_irqsave(&uctxt->sdma_qlock, flags);
+               if (!list_empty(&pq->list))
+                       list_del_init(&pq->list);
+               spin_unlock_irqrestore(&uctxt->sdma_qlock, flags);
+               iowait_sdma_drain(&pq->busy);
+               if (pq->reqs) {
+                       for (i = 0, j = 0; i < atomic_read(&pq->n_reqs) &&
+                                    j < pq->n_max_reqs; j++) {
+                               struct user_sdma_request *req = &pq->reqs[j];
+
+                               if (test_bit(SDMA_REQ_IN_USE, &req->flags)) {
+                                       set_comp_state(req, ERROR, -ECOMM);
+                                       user_sdma_free_request(req);
+                                       i++;
+                               }
+                       }
+                       kfree(pq->reqs);
+               }
+               if (pq->txreq_cache)
+                       kmem_cache_destroy(pq->txreq_cache);
+               kfree(pq);
+               fd->pq = NULL;
+       }
+       if (fd->cq) {
+               if (fd->cq->comps)
+                       vfree(fd->cq->comps);
+               kfree(fd->cq);
+               fd->cq = NULL;
+       }
+       return 0;
+}
+
+int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
+                                  unsigned long dim, unsigned long *count)
+{
+       int ret = 0, i = 0, sent;
+       struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
+       struct hfi1_user_sdma_pkt_q *pq = user_sdma_pkt_fp(fp);
+       struct hfi1_user_sdma_comp_q *cq = user_sdma_comp_fp(fp);
+       struct hfi1_devdata *dd = pq->dd;
+       unsigned long idx = 0;
+       u8 pcount = initial_pkt_count;
+       struct sdma_req_info info;
+       struct user_sdma_request *req;
+       u8 opcode, sc, vl;
+
+       if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) {
+               hfi1_cdbg(
+                  SDMA,
+                  "[%u:%u:%u] First vector not big enough for header %lu/%lu",
+                  dd->unit, uctxt->ctxt, subctxt_fp(fp),
+                  iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr));
+               ret = -EINVAL;
+               goto done;
+       }
+       ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info));
+       if (ret) {
+               hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)",
+                         dd->unit, uctxt->ctxt, subctxt_fp(fp), ret);
+               ret = -EFAULT;
+               goto done;
+       }
+       trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, subctxt_fp(fp),
+                                    (u16 *)&info);
+       if (cq->comps[info.comp_idx].status == QUEUED) {
+               hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in QUEUED state",
+                         dd->unit, uctxt->ctxt, subctxt_fp(fp),
+                         info.comp_idx);
+               ret = -EBADSLT;
+               goto done;
+       }
+       if (!info.fragsize) {
+               hfi1_cdbg(SDMA,
+                         "[%u:%u:%u:%u] Request does not specify fragsize",
+                         dd->unit, uctxt->ctxt, subctxt_fp(fp), info.comp_idx);
+               ret = -EINVAL;
+               goto done;
+       }
+       /*
+        * We've done all the safety checks that we can up to this point;
+        * "allocate" the request entry.
+        */
+       hfi1_cdbg(SDMA, "[%u:%u:%u] Using req/comp entry %u\n", dd->unit,
+                 uctxt->ctxt, subctxt_fp(fp), info.comp_idx);
+       req = pq->reqs + info.comp_idx;
+       memset(req, 0, sizeof(*req));
+       /* Mark the request as IN_USE before we start filling it in. */
+       set_bit(SDMA_REQ_IN_USE, &req->flags);
+       req->data_iovs = req_iovcnt(info.ctrl) - 1;
+       req->pq = pq;
+       req->cq = cq;
+       INIT_LIST_HEAD(&req->txps);
+       spin_lock_init(&req->list_lock);
+       memcpy(&req->info, &info, sizeof(info));
+
+       if (req_opcode(info.ctrl) == EXPECTED)
+               req->data_iovs--;
+
+       if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) {
+               SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs,
+                        MAX_VECTORS_PER_REQ);
+               ret = -EINVAL;
+               goto done;
+       }
+       /* Copy the header from the user buffer */
+       ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info),
+                            sizeof(req->hdr));
+       if (ret) {
+               SDMA_DBG(req, "Failed to copy header template (%d)", ret);
+               ret = -EFAULT;
+               goto free_req;
+       }
+
+       /* If Static rate control is not enabled, sanitize the header. */
+       if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL))
+               req->hdr.pbc[2] = 0;
+
+       /* Validate the opcode. Do not trust packets from user space blindly. */
+       opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff;
+       if ((opcode & USER_OPCODE_CHECK_MASK) !=
+            USER_OPCODE_CHECK_VAL) {
+               SDMA_DBG(req, "Invalid opcode (%d)", opcode);
+               ret = -EINVAL;
+               goto free_req;
+       }
+       /*
+        * Validate the vl. Do not trust packets from user space blindly.
+        * VL comes from PBC, SC comes from LRH, and the VL needs to
+        * match the SC lookup.
+        */
+       vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF;
+       sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) |
+             (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4));
+       if (vl >= dd->pport->vls_operational ||
+           vl != sc_to_vlt(dd, sc)) {
+               SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl);
+               ret = -EINVAL;
+               goto free_req;
+       }
+
+       /*
+        * We also need to check BTH.lnh: if it says the next header is a GRH, then
+        * the RXE parsing will be off and will land in the middle of the KDETH
+        * or miss it entirely.
+        */
+       if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) {
+               SDMA_DBG(req, "User tried to pass in a GRH");
+               ret = -EINVAL;
+               goto free_req;
+       }
+
+       req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]);
+       /* Calculate the initial TID offset based on the values of
+          KDETH.OFFSET and KDETH.OM that are passed in. */
+       req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) *
+               (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
+                KDETH_OM_LARGE : KDETH_OM_SMALL);
+       SDMA_DBG(req, "Initial TID offset %u", req->tidoffset);
+       idx++;
+
+       /* Save all the IO vector structures */
+       while (i < req->data_iovs) {
+               memcpy(&req->iovs[i].iov, iovec + idx++, sizeof(struct iovec));
+               req->iovs[i].offset = 0;
+               req->data_len += req->iovs[i++].iov.iov_len;
+       }
+       SDMA_DBG(req, "total data length %u", req->data_len);
+
+       if (pcount > req->info.npkts)
+               pcount = req->info.npkts;
+       /*
+        * Copy any TID info
+        * User space will provide the TID info only when the
+        * request type is EXPECTED. This is true even if there is
+        * only one packet in the request and the header is already
+        * set up. The reason for the singular TID case is that the
+        * driver needs to perform safety checks.
+        */
+       if (req_opcode(req->info.ctrl) == EXPECTED) {
+               u16 ntids = iovec[idx].iov_len / sizeof(*req->tids);
+
+               if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) {
+                       ret = -EINVAL;
+                       goto free_req;
+               }
+               req->tids = kcalloc(ntids, sizeof(*req->tids), GFP_KERNEL);
+               if (!req->tids) {
+                       ret = -ENOMEM;
+                       goto free_req;
+               }
+               /*
+                * We have to copy all of the tids because they may vary
+                * in size and, therefore, the TID count might not be
+                * equal to the pkt count. However, there is no way to
+                * tell at this point.
+                */
+               ret = copy_from_user(req->tids, iovec[idx].iov_base,
+                                    ntids * sizeof(*req->tids));
+               if (ret) {
+                       SDMA_DBG(req, "Failed to copy %d TIDs (%d)",
+                                ntids, ret);
+                       ret = -EFAULT;
+                       goto free_req;
+               }
+               req->n_tids = ntids;
+               idx++;
+       }
+
+       /* Have to select the engine */
+       req->sde = sdma_select_engine_vl(dd,
+                                        (u32)(uctxt->ctxt + subctxt_fp(fp)),
+                                        vl);
+       if (!req->sde || !sdma_running(req->sde)) {
+               ret = -ECOMM;
+               goto free_req;
+       }
+
+       /* We don't need an AHG entry if the request contains only one packet */
+       if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG)) {
+               int ahg = sdma_ahg_alloc(req->sde);
+
+               if (likely(ahg >= 0)) {
+                       req->ahg_idx = (u8)ahg;
+                       set_bit(SDMA_REQ_HAVE_AHG, &req->flags);
+               }
+       }
+
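+       /* Record the QUEUED state in the request's completion queue entry. */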
+       set_comp_state(req, QUEUED, 0);
+       /* Send the first N packets in the request to buy us some time */
+       sent = user_sdma_send_pkts(req, pcount);
+       if (unlikely(sent < 0)) {
+               if (sent != -EBUSY) {
+                       ret = sent;
+                       goto send_err;
+               } else {
+                       sent = 0;
+               }
+       }
+       atomic_inc(&pq->n_reqs);
+
+       if (sent < req->info.npkts) {
+               /* Take the references to the user's task and mm_struct */
+               get_task_struct(current);
+               req->user_proc = current;
+
+               /*
+                * This is a somewhat blocking send implementation.
+                * The driver will block the caller until all packets of the
+                * request have been submitted to the SDMA engine. However, it
+                * will not wait for send completions.
+                */
+               while (!test_bit(SDMA_REQ_SEND_DONE, &req->flags)) {
+                       ret = user_sdma_send_pkts(req, pcount);
+                       if (ret < 0) {
+                               if (ret != -EBUSY)
+                                       goto send_err;
+                               wait_event_interruptible_timeout(
+                                       pq->busy.wait_dma,
+                                       (pq->state == SDMA_PKT_Q_ACTIVE),
+                                       msecs_to_jiffies(
+                                               SDMA_IOWAIT_TIMEOUT));
+                       }
+               }
+       }
+       ret = 0;
+       *count += idx;
+       goto done;
+send_err:
+       set_comp_state(req, ERROR, ret);
+free_req:
+       user_sdma_free_request(req);
+done:
+       return ret;
+}
+
+static inline u32 compute_data_length(struct user_sdma_request *req,
+                                           struct user_sdma_txreq *tx)
+{
+       /*
+        * Determine the proper size of the packet data.
+        * The size of the data of the first packet is in the header
+        * template. However, it includes the header and ICRC, which need
+        * to be subtracted.
+        * The size of the remaining packets is the minimum of the frag
+        * size (MTU) or remaining data in the request.
+        */
+       u32 len;
+
+       if (!req->seqnum) {
+               len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) -
+                      (sizeof(tx->hdr) - 4));
+       } else if (req_opcode(req->info.ctrl) == EXPECTED) {
+               u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) *
+                       PAGE_SIZE;
+               /* Get the data length based on the remaining space in the
+                * TID pair. */
+               len = min(tidlen - req->tidoffset, (u32)req->info.fragsize);
+               /* If we've filled up the TID pair, move to the next one. */
+               if (unlikely(!len) && ++req->tididx < req->n_tids &&
+                   req->tids[req->tididx]) {
+                       tidlen = EXP_TID_GET(req->tids[req->tididx],
+                                            LEN) * PAGE_SIZE;
+                       req->tidoffset = 0;
+                       len = min_t(u32, tidlen, req->info.fragsize);
+               }
+               /* Since the TID pairs map entire pages, make sure that we
+                * are not going to try to send more data than we have
+                * remaining. */
+               len = min(len, req->data_len - req->sent);
+       } else
+               len = min(req->data_len - req->sent, (u32)req->info.fragsize);
+       SDMA_DBG(req, "Data Length = %u", len);
+       return len;
+}
+
+static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len)
+{
+       /* (Size of complete header - size of PBC) + 4B ICRC + data length */
+       return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len);
+}
+
+static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
+{
+       int ret = 0;
+       unsigned npkts = 0;
+       struct user_sdma_txreq *tx = NULL;
+       struct hfi1_user_sdma_pkt_q *pq = NULL;
+       struct user_sdma_iovec *iovec = NULL;
+
+       if (!req->pq) {
+               ret = -EINVAL;
+               goto done;
+       }
+
+       pq = req->pq;
+
+       /*
+        * Check if we might have sent the entire request already
+        */
+       if (unlikely(req->seqnum == req->info.npkts)) {
+               if (!list_empty(&req->txps))
+                       goto dosend;
+               goto done;
+       }
+
+       if (!maxpkts || maxpkts > req->info.npkts - req->seqnum)
+               maxpkts = req->info.npkts - req->seqnum;
+
+       while (npkts < maxpkts) {
+               u32 datalen = 0, queued = 0, data_sent = 0;
+               u64 iov_offset = 0;
+
+               /*
+                * Check whether any of the completions have come back
+                * with errors. If so, we are not going to process any
+                * more packets from this request.
+                */
+               if (test_bit(SDMA_REQ_HAS_ERROR, &req->flags)) {
+                       set_bit(SDMA_REQ_DONE_ERROR, &req->flags);
+                       ret = -EFAULT;
+                       goto done;
+               }
+
+               tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL);
+               if (!tx) {
+                       ret = -ENOMEM;
+                       goto done;
+               }
+               tx->flags = 0;
+               tx->req = req;
+               tx->busycount = 0;
+               tx->iovec1 = NULL;
+               tx->iovec2 = NULL;
+
+               if (req->seqnum == req->info.npkts - 1)
+                       tx->flags |= USER_SDMA_TXREQ_FLAGS_LAST_PKT;
+
+               /*
+                * Calculate the payload size - this is min of the fragment
+                * (MTU) size or the remaining bytes in the request but only
+                * if we have payload data.
+                */
+               if (req->data_len) {
+                       iovec = &req->iovs[req->iov_idx];
+                       if (ACCESS_ONCE(iovec->offset) == iovec->iov.iov_len) {
+                               if (++req->iov_idx == req->data_iovs) {
+                                       ret = -EFAULT;
+                                       goto free_txreq;
+                               }
+                               iovec = &req->iovs[req->iov_idx];
+                               WARN_ON(iovec->offset);
+                       }
+
+                       /*
+                        * This request might include only a header and no user
+                        * data, so pin pages only if there is data and the
+                        * pages have not been pinned already.
+                        */
+                       if (unlikely(!iovec->pages && iovec->iov.iov_len)) {
+                               ret = pin_vector_pages(req, iovec);
+                               if (ret)
+                                       goto free_tx;
+                       }
+
+                       tx->iovec1 = iovec;
+                       datalen = compute_data_length(req, tx);
+                       if (!datalen) {
+                               SDMA_DBG(req,
+                                        "Request has data but pkt len is 0");
+                               ret = -EFAULT;
+                               goto free_tx;
+                       }
+               }
+
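+
+               /*
+                * Build the packet header: use the AHG entry allocated for
+                * this request if one is available, otherwise copy and patch
+                * the full header for every packet.
+                */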
+               if (test_bit(SDMA_REQ_HAVE_AHG, &req->flags)) {
+                       if (!req->seqnum) {
+                               u16 pbclen = le16_to_cpu(req->hdr.pbc[0]);
+                               u32 lrhlen = get_lrh_len(req->hdr, datalen);
+                               /*
+                                * Copy the request header into the tx header
+                                * because the HW needs a cacheline-aligned
+                                * address.
+                                * This copy could be optimized out if the hdr
+                                * member of user_sdma_request were also
+                                * cacheline aligned.
+                                */
+                               memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr));
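+                               /*
+                                * Patch the PBC length if it disagrees with
+                                * the computed LRH length.
+                                */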
+                               if (PBC2LRH(pbclen) != lrhlen) {
+                                       pbclen = (pbclen & 0xf000) |
+                                               LRH2PBC(lrhlen);
+                                       tx->hdr.pbc[0] = cpu_to_le16(pbclen);
+                               }
+                               ret = sdma_txinit_ahg(&tx->txreq,
+                                                     SDMA_TXREQ_F_AHG_COPY,
+                                                     sizeof(tx->hdr) + datalen,
+                                                     req->ahg_idx, 0, NULL, 0,
+                                                     user_sdma_txreq_cb);
+                               if (ret)
+                                       goto free_tx;
+                               ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq,
+                                                       &tx->hdr,
+                                                       sizeof(tx->hdr));
+                               if (ret)
+                                       goto free_txreq;
+                       } else {
+                               int changes;
+
+                               changes = set_txreq_header_ahg(req, tx,
+                                                              datalen);
+                               if (changes < 0)
+                                       goto free_tx;
+                               sdma_txinit_ahg(&tx->txreq,
+                                               SDMA_TXREQ_F_USE_AHG,
+                                               datalen, req->ahg_idx, changes,
+                                               req->ahg, sizeof(req->hdr),
+                                               user_sdma_txreq_cb);
+                       }
+               } else {
+                       ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) +
+                                         datalen, user_sdma_txreq_cb);
+                       if (ret)
+                               goto free_tx;
+                       /*
+                        * Modify the header for this packet. This only needs
+                        * to be done if we are not going to use AHG. Otherwise,
+                        * the HW will do it based on the changes we gave it
+                        * during sdma_txinit_ahg().
+                        */
+                       ret = set_txreq_header(req, tx, datalen);
+                       if (ret)
+                               goto free_txreq;
+               }
+
+               /*
+                * If the request contains any data vectors, add up to
+                * fragsize bytes to the descriptor.
+                */
+               while (queued < datalen &&
+                      (req->sent + data_sent) < req->data_len) {
+                       unsigned long base, offset;
+                       unsigned pageidx, len;
+
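+
+                       /*
+                        * Find the page index and the offset within that page
+                        * for the next chunk of this vector.
+                        */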
+                       base = (unsigned long)iovec->iov.iov_base;
+                       offset = ((base + iovec->offset + iov_offset) &
+                                 ~PAGE_MASK);
+                       pageidx = (((iovec->offset + iov_offset +
+                                    base) - (base & PAGE_MASK)) >> PAGE_SHIFT);
+                       len = offset + req->info.fragsize > PAGE_SIZE ?
+                               PAGE_SIZE - offset : req->info.fragsize;
+                       len = min((datalen - queued), len);
+                       ret = sdma_txadd_page(pq->dd, &tx->txreq,
+                                             iovec->pages[pageidx],
+                                             offset, len);
+                       if (ret) {
+                               dd_dev_err(pq->dd,
+                                          "SDMA txreq add page failed %d\n",
+                                          ret);
+                               iovec_set_complete(iovec);
+                               goto free_txreq;
+                       }
+                       iov_offset += len;
+                       queued += len;
+                       data_sent += len;
+                       if (unlikely(queued < datalen &&
+                                    pageidx == iovec->npages &&
+                                    req->iov_idx < req->data_iovs - 1)) {
+                               iovec->offset += iov_offset;
+                               iovec = &req->iovs[++req->iov_idx];
+                               if (!iovec->pages) {
+                                       ret = pin_vector_pages(req, iovec);
+                                       if (ret)
+                                               goto free_txreq;
+                               }
+                               iov_offset = 0;
+                               tx->iovec2 = iovec;
+                       }
+               }
+               /*
+                * The txreq was submitted successfully so we can update
+                * the counters.
+                */
+               req->koffset += datalen;
+               if (req_opcode(req->info.ctrl) == EXPECTED)
+                       req->tidoffset += datalen;
+               req->sent += data_sent;
+               if (req->data_len) {
+                       if (tx->iovec1 && !tx->iovec2)
+                               tx->iovec1->offset += iov_offset;
+                       else if (tx->iovec2)
+                               tx->iovec2->offset += iov_offset;
+               }
+               /*
+                * It is important to increment this here as it is used to
+                * generate the BTH.PSN and, therefore, can't be bulk-updated
+                * outside of the loop.
+                */
+               tx->seqnum = req->seqnum++;
+               list_add_tail(&tx->txreq.list, &req->txps);
+               npkts++;
+       }
+dosend:
+       ret = sdma_send_txlist(req->sde, &pq->busy, &req->txps);
+       if (list_empty(&req->txps))
+               if (req->seqnum == req->info.npkts) {
+                       set_bit(SDMA_REQ_SEND_DONE, &req->flags);
+                       /*
+                        * The txreq has already been submitted to the HW queue
+                        * so we can free the AHG entry now. Corruption will not
+                        * happen due to the sequential manner in which
+                        * descriptors are processed.
+                        */
+                       if (test_bit(SDMA_REQ_HAVE_AHG, &req->flags))
+                               sdma_ahg_free(req->sde, req->ahg_idx);
+               }
+       goto done;
+free_txreq:
+       sdma_txclean(pq->dd, &tx->txreq);
+free_tx:
+       kmem_cache_free(pq->txreq_cache, tx);
+done:
+       return ret;
+}
+
+/*
+ * How many pages in this iovec element?
+ */
+static inline int num_user_pages(const struct iovec *iov)
+{
+       const unsigned long addr  = (unsigned long) iov->iov_base;
+       const unsigned long len   = iov->iov_len;
+       const unsigned long spage = addr & PAGE_MASK;
+       const unsigned long epage = (addr + len - 1) & PAGE_MASK;
+
+       return 1 + ((epage - spage) >> PAGE_SHIFT);
+}
+
+static int pin_vector_pages(struct user_sdma_request *req,
+                           struct user_sdma_iovec *iovec)
+{
+       int ret = 0;
+       unsigned pinned;
+
+       iovec->npages = num_user_pages(&iovec->iov);
+       iovec->pages = kzalloc(sizeof(*iovec->pages) *
+                              iovec->npages, GFP_KERNEL);
+       if (!iovec->pages) {
+               SDMA_DBG(req, "Failed page array alloc");
+               ret = -ENOMEM;
+               goto done;
+       }
+       /* If called by the kernel thread, use the user's mm */
+       if (current->flags & PF_KTHREAD)
+               use_mm(req->user_proc->mm);
+       pinned = get_user_pages_fast(
+               (unsigned long)iovec->iov.iov_base,
+               iovec->npages, 0, iovec->pages);
+       /* If called by the kernel thread, unuse the user's mm */
+       if (current->flags & PF_KTHREAD)
+               unuse_mm(req->user_proc->mm);
+       if (pinned != iovec->npages) {
+               SDMA_DBG(req, "Failed to pin pages (%u/%u)", pinned,
+                        iovec->npages);
+               ret = -EFAULT;
+               goto pfree;
+       }
+       goto done;
+pfree:
+       unpin_vector_pages(iovec);
+done:
+       return ret;
+}
+
+static void unpin_vector_pages(struct user_sdma_iovec *iovec)
+{
+       unsigned i;
+
+       if (ACCESS_ONCE(iovec->offset) != iovec->iov.iov_len) {
+               hfi1_cdbg(SDMA,
+                         "the complete vector has not been sent yet %llu %zu",
+                         iovec->offset, iovec->iov.iov_len);
+               return;
+       }
+       for (i = 0; i < iovec->npages; i++)
+               if (iovec->pages[i])
+                       put_page(iovec->pages[i]);
+       kfree(iovec->pages);
+       iovec->pages = NULL;
+       iovec->npages = 0;
+       iovec->offset = 0;
+}
+
+static int check_header_template(struct user_sdma_request *req,
+                                struct hfi1_pkt_header *hdr, u32 lrhlen,
+                                u32 datalen)
+{
+       /*
+        * Perform safety checks for any type of packet:
+        *    - transfer size is a multiple of 64 bytes
+        *    - packet length is a multiple of 4 bytes
+        *    - entire request length is a multiple of 4 bytes
+        *    - packet length is not larger than the MTU size
+        *
+        * These checks are only done for the first packet of the
+        * transfer since the header is "given" to us by user space.
+        * For the remainder of the packets we compute the values.
+        */
+       if (req->info.fragsize % PIO_BLOCK_SIZE ||
+           lrhlen & 0x3 || req->data_len & 0x3  ||
+           lrhlen > get_lrh_len(*hdr, req->info.fragsize))
+               return -EINVAL;
+
+       if (req_opcode(req->info.ctrl) == EXPECTED) {
+               /*
+                * The header is checked only on the first packet. Furthermore,
+                * we ensure that at least one TID entry is copied when the
+                * request is submitted. Therefore, we don't have to verify that
+                * tididx points to something sane.
+                */
+               u32 tidval = req->tids[req->tididx],
+                       tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE,
+                       tididx = EXP_TID_GET(tidval, IDX),
+                       tidctrl = EXP_TID_GET(tidval, CTRL),
+                       tidoff;
+               __le32 kval = hdr->kdeth.ver_tid_offset;
+
+               tidoff = KDETH_GET(kval, OFFSET) *
+                         (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
+                          KDETH_OM_LARGE : KDETH_OM_SMALL);
+               /*
+                * Expected receive packets have the following
+                * additional checks:
+                *     - offset is not larger than the TID size
+                *     - TIDCtrl values match between header and TID array
+                *     - TID indexes match between header and TID array
+                */
+               if ((tidoff + datalen > tidlen) ||
+                   KDETH_GET(kval, TIDCTRL) != tidctrl ||
+                   KDETH_GET(kval, TID) != tididx)
+                       return -EINVAL;
+       }
+       return 0;
+}
+
+/*
+ * Correctly set the BTH.PSN field based on the type of transfer. Eager
+ * packets can simply increment the PSN, but expected packets encode
+ * generation and sequence in the BTH.PSN field, so simply incrementing
+ * would result in errors.
+ */
+static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
+{
+       u32 val = be32_to_cpu(bthpsn),
+               mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull :
+                       0xffffffull),
+               psn = val & mask;
+       if (expct)
+               psn = (psn & ~BTH_SEQ_MASK) | ((psn + frags) & BTH_SEQ_MASK);
+       else
+               psn = psn + frags;
+       return psn & mask;
+}
+
+static int set_txreq_header(struct user_sdma_request *req,
+                           struct user_sdma_txreq *tx, u32 datalen)
+{
+       struct hfi1_user_sdma_pkt_q *pq = req->pq;
+       struct hfi1_pkt_header *hdr = &tx->hdr;
+       u16 pbclen;
+       int ret;
+       u32 tidval = 0, lrhlen = get_lrh_len(*hdr, datalen);
+
+       /* Copy the header template to the request before modification */
+       memcpy(hdr, &req->hdr, sizeof(*hdr));
+
+       /*
+        * Check if the PBC and LRH length are mismatched. If so
+        * adjust both in the header.
+        */
+       pbclen = le16_to_cpu(hdr->pbc[0]);
+       if (PBC2LRH(pbclen) != lrhlen) {
+               pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
+               hdr->pbc[0] = cpu_to_le16(pbclen);
+               hdr->lrh[2] = cpu_to_be16(lrhlen >> 2);
+               /*
+                * Third packet
+                * This is the first packet in the sequence that has
+                * a "static" size that can be used for the rest of
+                * the packets (besides the last one).
+                */
+               if (unlikely(req->seqnum == 2)) {
+                       /*
+                        * From this point on the lengths in both the
+                        * PBC and LRH are the same until the last
+                        * packet.
+                        * Adjust the template so we don't have to update
+                        * every packet
+                        */
+                       req->hdr.pbc[0] = hdr->pbc[0];
+                       req->hdr.lrh[2] = hdr->lrh[2];
+               }
+       }
+       /*
+        * We only have to modify the header if this is not the
+        * first packet in the request. Otherwise, we use the
+        * header given to us.
+        */
+       if (unlikely(!req->seqnum)) {
+               ret = check_header_template(req, hdr, lrhlen, datalen);
+               if (ret)
+                       return ret;
+               goto done;
+       }
+
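+       /* Advance the BTH.PSN for this packet according to the transfer type. */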
+       hdr->bth[2] = cpu_to_be32(
+               set_pkt_bth_psn(hdr->bth[2],
+                               (req_opcode(req->info.ctrl) == EXPECTED),
+                               req->seqnum));
+
+       /* Set ACK request on last packet */
+       if (unlikely(tx->flags & USER_SDMA_TXREQ_FLAGS_LAST_PKT))
+               hdr->bth[2] |= cpu_to_be32(1UL << 31);
+
+       /* Set the new offset */
+       hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset);
+       /* Expected packets have to fill in the new TID information */
+       if (req_opcode(req->info.ctrl) == EXPECTED) {
+               tidval = req->tids[req->tididx];
+               /*
+                * If the offset puts us at the end of the current TID,
+                * advance everything.
+                */
+               if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
+                                        PAGE_SIZE)) {
+                       req->tidoffset = 0;
+                       /* Since we don't copy all the TIDs all at once,
+                        * we have to check again. */
+                       if (++req->tididx > req->n_tids - 1 ||
+                           !req->tids[req->tididx]) {
+                               return -EINVAL;
+                       }
+                       tidval = req->tids[req->tididx];
+               }
+               req->omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >=
+                       KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE : KDETH_OM_SMALL;
+               /* Set KDETH.TIDCtrl based on value for this TID. */
+               KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL,
+                         EXP_TID_GET(tidval, CTRL));
+               /* Set KDETH.TID based on value for this TID */
+               KDETH_SET(hdr->kdeth.ver_tid_offset, TID,
+                         EXP_TID_GET(tidval, IDX));
+               /* Clear KDETH.SH only on the last packet */
+               if (unlikely(tx->flags & USER_SDMA_TXREQ_FLAGS_LAST_PKT))
+                       KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0);
+               /*
+                * Set the KDETH.OFFSET and KDETH.OM based on size of
+                * transfer.
+                */
+               SDMA_DBG(req, "TID offset %ubytes %uunits om%u",
+                        req->tidoffset, req->tidoffset / req->omfactor,
+                        !!(req->omfactor - KDETH_OM_SMALL));
+               KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET,
+                         req->tidoffset / req->omfactor);
+               KDETH_SET(hdr->kdeth.ver_tid_offset, OM,
+                         !!(req->omfactor - KDETH_OM_SMALL));
+       }
+done:
+       trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt,
+                                   req->info.comp_idx, hdr, tidval);
+       return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr));
+}
+
+static int set_txreq_header_ahg(struct user_sdma_request *req,
+                               struct user_sdma_txreq *tx, u32 len)
+{
+       int diff = 0;
+       struct hfi1_user_sdma_pkt_q *pq = req->pq;
+       struct hfi1_pkt_header *hdr = &req->hdr;
+       u16 pbclen = le16_to_cpu(hdr->pbc[0]);
+       u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, len);
+
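+       /* If the payload length changed, patch the PBC and LRH lengths via AHG. */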
+       if (PBC2LRH(pbclen) != lrhlen) {
+               /* PBC.PbcLengthDWs */
+               AHG_HEADER_SET(req->ahg, diff, 0, 0, 12,
+                              cpu_to_le16(LRH2PBC(lrhlen)));
+               /* LRH.PktLen (we need the full 16 bits due to byte swap) */
+               AHG_HEADER_SET(req->ahg, diff, 3, 0, 16,
+                              cpu_to_be16(lrhlen >> 2));
+       }
+
+       /*
+        * Do the common updates
+        */
+       /* BTH.PSN and BTH.A */
+       val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) &
+               (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff);
+       if (unlikely(tx->flags & USER_SDMA_TXREQ_FLAGS_LAST_PKT))
+               val32 |= 1UL << 31;
+       AHG_HEADER_SET(req->ahg, diff, 6, 0, 16, cpu_to_be16(val32 >> 16));
+       AHG_HEADER_SET(req->ahg, diff, 6, 16, 16, cpu_to_be16(val32 & 0xffff));
+       /* KDETH.Offset */
+       AHG_HEADER_SET(req->ahg, diff, 15, 0, 16,
+                      cpu_to_le16(req->koffset & 0xffff));
+       AHG_HEADER_SET(req->ahg, diff, 15, 16, 16,
+                      cpu_to_le16(req->koffset >> 16));
+       if (req_opcode(req->info.ctrl) == EXPECTED) {
+               __le16 val;
+
+               tidval = req->tids[req->tididx];
+
+               /*
+                * If the offset puts us at the end of the current TID,
+                * advance everything.
+                */
+               if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
+                                        PAGE_SIZE)) {
+                       req->tidoffset = 0;
+                       /* Since we don't copy all the TIDs all at once,
+                        * we have to check again. */
+                       if (++req->tididx > req->n_tids - 1 ||
+                           !req->tids[req->tididx]) {
+                               return -EINVAL;
+                       }
+                       tidval = req->tids[req->tididx];
+               }
+               req->omfactor = ((EXP_TID_GET(tidval, LEN) *
+                                 PAGE_SIZE) >=
+                                KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE :
+                       KDETH_OM_SMALL;
+               /* KDETH.OM and KDETH.OFFSET (TID) */
+               AHG_HEADER_SET(req->ahg, diff, 7, 0, 16,
+                              ((!!(req->omfactor - KDETH_OM_SMALL)) << 15 |
+                               ((req->tidoffset / req->omfactor) & 0x7fff)));
+               /* KDETH.TIDCtrl, KDETH.TID */
+               val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) |
+                                       (EXP_TID_GET(tidval, IDX) & 0x3ff));
+               /* Clear KDETH.SH on last packet */
+               if (unlikely(tx->flags & USER_SDMA_TXREQ_FLAGS_LAST_PKT)) {
+                       val |= cpu_to_le16(KDETH_GET(hdr->kdeth.ver_tid_offset,
+                                                               INTR) >> 16);
+                       val &= cpu_to_le16(~(1U << 13));
+                       AHG_HEADER_SET(req->ahg, diff, 7, 16, 14, val);
+               } else
+                       AHG_HEADER_SET(req->ahg, diff, 7, 16, 12, val);
+       }
+
+       trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt,
+                                       req->info.comp_idx, req->sde->this_idx,
+                                       req->ahg_idx, req->ahg, diff, tidval);
+       return diff;
+}
+
+static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status,
+                              int drain)
+{
+       struct user_sdma_txreq *tx =
+               container_of(txreq, struct user_sdma_txreq, txreq);
+       struct user_sdma_request *req = tx->req;
+       struct hfi1_user_sdma_pkt_q *pq = req ? req->pq : NULL;
+       u64 tx_seqnum;
+
+       if (unlikely(!req || !pq))
+               return;
+
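+       /* Allow the page pins taken for this txreq's vectors to be released. */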
+       if (tx->iovec1)
+               iovec_may_free(tx->iovec1, unpin_vector_pages);
+       if (tx->iovec2)
+               iovec_may_free(tx->iovec2, unpin_vector_pages);
+
+       tx_seqnum = tx->seqnum;
+       kmem_cache_free(pq->txreq_cache, tx);
+
+       if (status != SDMA_TXREQ_S_OK) {
+               dd_dev_err(pq->dd, "SDMA completion with error %d", status);
+               set_comp_state(req, ERROR, status);
+               set_bit(SDMA_REQ_HAS_ERROR, &req->flags);
+               /* Do not free the request until the sender loop has ack'ed
+                * the error and we've seen all txreqs. */
+               if (tx_seqnum == ACCESS_ONCE(req->seqnum) &&
+                   test_bit(SDMA_REQ_DONE_ERROR, &req->flags)) {
+                       atomic_dec(&pq->n_reqs);
+                       user_sdma_free_request(req);
+               }
+       } else {
+               if (tx_seqnum == req->info.npkts - 1) {
+                       /* We've sent and completed all packets in this
+                        * request. Signal completion to the user */
+                       atomic_dec(&pq->n_reqs);
+                       set_comp_state(req, COMPLETE, 0);
+                       user_sdma_free_request(req);
+               }
+       }
+       if (!atomic_read(&pq->n_reqs))
+               xchg(&pq->state, SDMA_PKT_Q_INACTIVE);
+}
+
+static void user_sdma_free_request(struct user_sdma_request *req)
+{
+       if (!list_empty(&req->txps)) {
+               struct sdma_txreq *t, *p;
+
+               list_for_each_entry_safe(t, p, &req->txps, list) {
+                       struct user_sdma_txreq *tx =
+                               container_of(t, struct user_sdma_txreq, txreq);
+                       list_del_init(&t->list);
+                       sdma_txclean(req->pq->dd, t);
+                       kmem_cache_free(req->pq->txreq_cache, tx);
+               }
+       }
+       if (req->data_iovs) {
+               int i;
+
+               for (i = 0; i < req->data_iovs; i++)
+                       if (req->iovs[i].npages && req->iovs[i].pages)
+                               unpin_vector_pages(&req->iovs[i]);
+       }
+       if (req->user_proc)
+               put_task_struct(req->user_proc);
+       kfree(req->tids);
+       clear_bit(SDMA_REQ_IN_USE, &req->flags);
+}
+
+static inline void set_comp_state(struct user_sdma_request *req,
+                                       enum hfi1_sdma_comp_state state,
+                                       int ret)
+{
+       SDMA_DBG(req, "Setting completion status %u %d", state, ret);
+       req->cq->comps[req->info.comp_idx].status = state;
+       if (state == ERROR)
+               req->cq->comps[req->info.comp_idx].errcode = -ret;
+       trace_hfi1_sdma_user_completion(req->pq->dd, req->pq->ctxt,
+                                       req->pq->subctxt, req->info.comp_idx,
+                                       state, ret);
+}
diff --git a/drivers/staging/rdma/hfi1/user_sdma.h b/drivers/staging/rdma/hfi1/user_sdma.h
new file mode 100644 (file)
index 0000000..fa44225
--- /dev/null
@@ -0,0 +1,89 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <linux/device.h>
+#include <linux/wait.h>
+
+#include "common.h"
+#include "iowait.h"
+
+#define EXP_TID_TIDLEN_MASK   0x7FFULL
+#define EXP_TID_TIDLEN_SHIFT  0
+#define EXP_TID_TIDCTRL_MASK  0x3ULL
+#define EXP_TID_TIDCTRL_SHIFT 20
+#define EXP_TID_TIDIDX_MASK   0x7FFULL
+#define EXP_TID_TIDIDX_SHIFT  22
+#define EXP_TID_GET(tid, field)        \
+       (((tid) >> EXP_TID_TID##field##_SHIFT) & EXP_TID_TID##field##_MASK)
+
+extern uint extended_psn;
+
+struct hfi1_user_sdma_pkt_q {
+       struct list_head list;
+       unsigned ctxt;
+       unsigned subctxt;
+       u16 n_max_reqs;
+       atomic_t n_reqs;
+       u16 reqidx;
+       struct hfi1_devdata *dd;
+       struct kmem_cache *txreq_cache;
+       struct user_sdma_request *reqs;
+       struct iowait busy;
+       unsigned state;
+};
+
+struct hfi1_user_sdma_comp_q {
+       u16 nentries;
+       struct hfi1_sdma_comp_entry *comps;
+};
+
+int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *, struct file *);
+int hfi1_user_sdma_free_queues(struct hfi1_filedata *);
+int hfi1_user_sdma_process_request(struct file *, struct iovec *, unsigned long,
+                                  unsigned long *);
diff --git a/drivers/staging/rdma/hfi1/verbs.c b/drivers/staging/rdma/hfi1/verbs.c
new file mode 100644 (file)
index 0000000..53ac214
--- /dev/null
@@ -0,0 +1,2143 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_user_verbs.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/utsname.h>
+#include <linux/rculist.h>
+#include <linux/mm.h>
+#include <linux/random.h>
+#include <linux/vmalloc.h>
+
+#include "hfi.h"
+#include "common.h"
+#include "device.h"
+#include "trace.h"
+#include "qp.h"
+#include "sdma.h"
+
+unsigned int hfi1_lkey_table_size = 16;
+module_param_named(lkey_table_size, hfi1_lkey_table_size, uint,
+                  S_IRUGO);
+MODULE_PARM_DESC(lkey_table_size,
+                "LKEY table size in bits (2^n, 1 <= n <= 23)");
+
+static unsigned int hfi1_max_pds = 0xFFFF;
+module_param_named(max_pds, hfi1_max_pds, uint, S_IRUGO);
+MODULE_PARM_DESC(max_pds,
+                "Maximum number of protection domains to support");
+
+static unsigned int hfi1_max_ahs = 0xFFFF;
+module_param_named(max_ahs, hfi1_max_ahs, uint, S_IRUGO);
+MODULE_PARM_DESC(max_ahs, "Maximum number of address handles to support");
+
+unsigned int hfi1_max_cqes = 0x2FFFF;
+module_param_named(max_cqes, hfi1_max_cqes, uint, S_IRUGO);
+MODULE_PARM_DESC(max_cqes,
+                "Maximum number of completion queue entries to support");
+
+unsigned int hfi1_max_cqs = 0x1FFFF;
+module_param_named(max_cqs, hfi1_max_cqs, uint, S_IRUGO);
+MODULE_PARM_DESC(max_cqs, "Maximum number of completion queues to support");
+
+unsigned int hfi1_max_qp_wrs = 0x3FFF;
+module_param_named(max_qp_wrs, hfi1_max_qp_wrs, uint, S_IRUGO);
+MODULE_PARM_DESC(max_qp_wrs, "Maximum number of QP WRs to support");
+
+unsigned int hfi1_max_qps = 16384;
+module_param_named(max_qps, hfi1_max_qps, uint, S_IRUGO);
+MODULE_PARM_DESC(max_qps, "Maximum number of QPs to support");
+
+unsigned int hfi1_max_sges = 0x60;
+module_param_named(max_sges, hfi1_max_sges, uint, S_IRUGO);
+MODULE_PARM_DESC(max_sges, "Maximum number of SGEs to support");
+
+unsigned int hfi1_max_mcast_grps = 16384;
+module_param_named(max_mcast_grps, hfi1_max_mcast_grps, uint, S_IRUGO);
+MODULE_PARM_DESC(max_mcast_grps,
+                "Maximum number of multicast groups to support");
+
+unsigned int hfi1_max_mcast_qp_attached = 16;
+module_param_named(max_mcast_qp_attached, hfi1_max_mcast_qp_attached,
+                  uint, S_IRUGO);
+MODULE_PARM_DESC(max_mcast_qp_attached,
+                "Maximum number of attached QPs to support");
+
+unsigned int hfi1_max_srqs = 1024;
+module_param_named(max_srqs, hfi1_max_srqs, uint, S_IRUGO);
+MODULE_PARM_DESC(max_srqs, "Maximum number of SRQs to support");
+
+unsigned int hfi1_max_srq_sges = 128;
+module_param_named(max_srq_sges, hfi1_max_srq_sges, uint, S_IRUGO);
+MODULE_PARM_DESC(max_srq_sges, "Maximum number of SRQ SGEs to support");
+
+unsigned int hfi1_max_srq_wrs = 0x1FFFF;
+module_param_named(max_srq_wrs, hfi1_max_srq_wrs, uint, S_IRUGO);
+MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs to support");
+
+static void verbs_sdma_complete(
+       struct sdma_txreq *cookie,
+       int status,
+       int drained);
+
+/*
+ * Note that it is OK to post send work requests in the SQE and ERR
+ * states; hfi1_do_send() will process them and generate error
+ * completions as per IB 1.2 C10-96.
+ */
+const int ib_hfi1_state_ops[IB_QPS_ERR + 1] = {
+       [IB_QPS_RESET] = 0,
+       [IB_QPS_INIT] = HFI1_POST_RECV_OK,
+       [IB_QPS_RTR] = HFI1_POST_RECV_OK | HFI1_PROCESS_RECV_OK,
+       [IB_QPS_RTS] = HFI1_POST_RECV_OK | HFI1_PROCESS_RECV_OK |
+           HFI1_POST_SEND_OK | HFI1_PROCESS_SEND_OK |
+           HFI1_PROCESS_NEXT_SEND_OK,
+       [IB_QPS_SQD] = HFI1_POST_RECV_OK | HFI1_PROCESS_RECV_OK |
+           HFI1_POST_SEND_OK | HFI1_PROCESS_SEND_OK,
+       [IB_QPS_SQE] = HFI1_POST_RECV_OK | HFI1_PROCESS_RECV_OK |
+           HFI1_POST_SEND_OK | HFI1_FLUSH_SEND,
+       [IB_QPS_ERR] = HFI1_POST_RECV_OK | HFI1_FLUSH_RECV |
+           HFI1_POST_SEND_OK | HFI1_FLUSH_SEND,
+};
+
+struct hfi1_ucontext {
+       struct ib_ucontext ibucontext;
+};
+
+static inline struct hfi1_ucontext *to_iucontext(struct ib_ucontext
+                                                 *ibucontext)
+{
+       return container_of(ibucontext, struct hfi1_ucontext, ibucontext);
+}
+
+/*
+ * Translate ib_wr_opcode into ib_wc_opcode.
+ */
+const enum ib_wc_opcode ib_hfi1_wc_opcode[] = {
+       [IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE,
+       [IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
+       [IB_WR_SEND] = IB_WC_SEND,
+       [IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
+       [IB_WR_RDMA_READ] = IB_WC_RDMA_READ,
+       [IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP,
+       [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD
+};
+
+/*
+ * Length of header by opcode, 0 --> not supported
+ */
+const u8 hdr_len_by_opcode[256] = {
+       /* RC */
+       [IB_OPCODE_RC_SEND_FIRST]                     = 12 + 8,
+       [IB_OPCODE_RC_SEND_MIDDLE]                    = 12 + 8,
+       [IB_OPCODE_RC_SEND_LAST]                      = 12 + 8,
+       [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE]       = 12 + 8 + 4,
+       [IB_OPCODE_RC_SEND_ONLY]                      = 12 + 8,
+       [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE]       = 12 + 8 + 4,
+       [IB_OPCODE_RC_RDMA_WRITE_FIRST]               = 12 + 8 + 16,
+       [IB_OPCODE_RC_RDMA_WRITE_MIDDLE]              = 12 + 8,
+       [IB_OPCODE_RC_RDMA_WRITE_LAST]                = 12 + 8,
+       [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = 12 + 8 + 4,
+       [IB_OPCODE_RC_RDMA_WRITE_ONLY]                = 12 + 8 + 16,
+       [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = 12 + 8 + 20,
+       [IB_OPCODE_RC_RDMA_READ_REQUEST]              = 12 + 8 + 16,
+       [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST]       = 12 + 8 + 4,
+       [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE]      = 12 + 8,
+       [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST]        = 12 + 8 + 4,
+       [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY]        = 12 + 8 + 4,
+       [IB_OPCODE_RC_ACKNOWLEDGE]                    = 12 + 8 + 4,
+       [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE]             = 12 + 8 + 4,
+       [IB_OPCODE_RC_COMPARE_SWAP]                   = 12 + 8 + 28,
+       [IB_OPCODE_RC_FETCH_ADD]                      = 12 + 8 + 28,
+       /* UC */
+       [IB_OPCODE_UC_SEND_FIRST]                     = 12 + 8,
+       [IB_OPCODE_UC_SEND_MIDDLE]                    = 12 + 8,
+       [IB_OPCODE_UC_SEND_LAST]                      = 12 + 8,
+       [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE]       = 12 + 8 + 4,
+       [IB_OPCODE_UC_SEND_ONLY]                      = 12 + 8,
+       [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE]       = 12 + 8 + 4,
+       [IB_OPCODE_UC_RDMA_WRITE_FIRST]               = 12 + 8 + 16,
+       [IB_OPCODE_UC_RDMA_WRITE_MIDDLE]              = 12 + 8,
+       [IB_OPCODE_UC_RDMA_WRITE_LAST]                = 12 + 8,
+       [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = 12 + 8 + 4,
+       [IB_OPCODE_UC_RDMA_WRITE_ONLY]                = 12 + 8 + 16,
+       [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = 12 + 8 + 20,
+       /* UD */
+       [IB_OPCODE_UD_SEND_ONLY]                      = 12 + 8 + 8,
+       [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE]       = 12 + 8 + 12
+};
+
+static const opcode_handler opcode_handler_tbl[256] = {
+       /* RC */
+       [IB_OPCODE_RC_SEND_FIRST]                     = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_SEND_MIDDLE]                    = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_SEND_LAST]                      = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE]       = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_SEND_ONLY]                      = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE]       = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_RDMA_WRITE_FIRST]               = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_RDMA_WRITE_MIDDLE]              = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_RDMA_WRITE_LAST]                = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_RDMA_WRITE_ONLY]                = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_RDMA_READ_REQUEST]              = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST]       = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE]      = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST]        = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY]        = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_ACKNOWLEDGE]                    = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE]             = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_COMPARE_SWAP]                   = &hfi1_rc_rcv,
+       [IB_OPCODE_RC_FETCH_ADD]                      = &hfi1_rc_rcv,
+       /* UC */
+       [IB_OPCODE_UC_SEND_FIRST]                     = &hfi1_uc_rcv,
+       [IB_OPCODE_UC_SEND_MIDDLE]                    = &hfi1_uc_rcv,
+       [IB_OPCODE_UC_SEND_LAST]                      = &hfi1_uc_rcv,
+       [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE]       = &hfi1_uc_rcv,
+       [IB_OPCODE_UC_SEND_ONLY]                      = &hfi1_uc_rcv,
+       [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE]       = &hfi1_uc_rcv,
+       [IB_OPCODE_UC_RDMA_WRITE_FIRST]               = &hfi1_uc_rcv,
+       [IB_OPCODE_UC_RDMA_WRITE_MIDDLE]              = &hfi1_uc_rcv,
+       [IB_OPCODE_UC_RDMA_WRITE_LAST]                = &hfi1_uc_rcv,
+       [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = &hfi1_uc_rcv,
+       [IB_OPCODE_UC_RDMA_WRITE_ONLY]                = &hfi1_uc_rcv,
+       [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = &hfi1_uc_rcv,
+       /* UD */
+       [IB_OPCODE_UD_SEND_ONLY]                      = &hfi1_ud_rcv,
+       [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE]       = &hfi1_ud_rcv,
+       /* CNP */
+       [IB_OPCODE_CNP]                               = &hfi1_cnp_rcv
+};
+
+/*
+ * System image GUID.
+ */
+__be64 ib_hfi1_sys_image_guid;
+
+/**
+ * hfi1_copy_sge - copy data to SGE memory
+ * @ss: the SGE state
+ * @data: the data to copy
+ * @length: the length of the data
+ */
+void hfi1_copy_sge(
+       struct hfi1_sge_state *ss,
+       void *data, u32 length,
+       int release)
+{
+       struct hfi1_sge *sge = &ss->sge;
+
+       while (length) {
+               u32 len = sge->length;
+
+               if (len > length)
+                       len = length;
+               if (len > sge->sge_length)
+                       len = sge->sge_length;
+               WARN_ON_ONCE(len == 0);
+               memcpy(sge->vaddr, data, len);
+               sge->vaddr += len;
+               sge->length -= len;
+               sge->sge_length -= len;
+               if (sge->sge_length == 0) {
+                       if (release)
+                               hfi1_put_mr(sge->mr);
+                       if (--ss->num_sge)
+                               *sge = *ss->sg_list++;
+               } else if (sge->length == 0 && sge->mr->lkey) {
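+                       /* Current MR segment exhausted; advance to the next segment in the MR map. */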
+                       if (++sge->n >= HFI1_SEGSZ) {
+                               if (++sge->m >= sge->mr->mapsz)
+                                       break;
+                               sge->n = 0;
+                       }
+                       sge->vaddr =
+                               sge->mr->map[sge->m]->segs[sge->n].vaddr;
+                       sge->length =
+                               sge->mr->map[sge->m]->segs[sge->n].length;
+               }
+               data += len;
+               length -= len;
+       }
+}
+
+/**
+ * hfi1_skip_sge - skip over SGE memory
+ * @ss: the SGE state
+ * @length: the number of bytes to skip
+ */
+void hfi1_skip_sge(struct hfi1_sge_state *ss, u32 length, int release)
+{
+       struct hfi1_sge *sge = &ss->sge;
+
+       while (length) {
+               u32 len = sge->length;
+
+               if (len > length)
+                       len = length;
+               if (len > sge->sge_length)
+                       len = sge->sge_length;
+               WARN_ON_ONCE(len == 0);
+               sge->vaddr += len;
+               sge->length -= len;
+               sge->sge_length -= len;
+               if (sge->sge_length == 0) {
+                       if (release)
+                               hfi1_put_mr(sge->mr);
+                       if (--ss->num_sge)
+                               *sge = *ss->sg_list++;
+               } else if (sge->length == 0 && sge->mr->lkey) {
+                       if (++sge->n >= HFI1_SEGSZ) {
+                               if (++sge->m >= sge->mr->mapsz)
+                                       break;
+                               sge->n = 0;
+                       }
+                       sge->vaddr =
+                               sge->mr->map[sge->m]->segs[sge->n].vaddr;
+                       sge->length =
+                               sge->mr->map[sge->m]->segs[sge->n].length;
+               }
+               length -= len;
+       }
+}
+
+/**
+ * post_one_send - post one RC, UC, or UD send work request
+ * @qp: the QP to post on
+ * @wr: the work request to send
+ */
+static int post_one_send(struct hfi1_qp *qp, struct ib_send_wr *wr)
+{
+       struct hfi1_swqe *wqe;
+       u32 next;
+       int i;
+       int j;
+       int acc;
+       struct hfi1_lkey_table *rkt;
+       struct hfi1_pd *pd;
+       struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+       struct hfi1_pportdata *ppd;
+       struct hfi1_ibport *ibp;
+
+       /* IB spec says that num_sge == 0 is OK. */
+       if (unlikely(wr->num_sge > qp->s_max_sge))
+               return -EINVAL;
+
+       ppd = &dd->pport[qp->port_num - 1];
+       ibp = &ppd->ibport_data;
+
+       /*
+        * Don't allow RDMA reads or atomic operations on UC QPs, or any
+        * undefined operations.
+        * Make sure the buffer is large enough to hold the result for atomics.
+        */
+       if (wr->opcode == IB_WR_FAST_REG_MR) {
+               return -EINVAL;
+       } else if (qp->ibqp.qp_type == IB_QPT_UC) {
+               if ((unsigned) wr->opcode >= IB_WR_RDMA_READ)
+                       return -EINVAL;
+       } else if (qp->ibqp.qp_type != IB_QPT_RC) {
+               /* Check IB_QPT_SMI, IB_QPT_GSI, IB_QPT_UD opcode */
+               if (wr->opcode != IB_WR_SEND &&
+                   wr->opcode != IB_WR_SEND_WITH_IMM)
+                       return -EINVAL;
+               /* Check UD destination address PD */
+               if (qp->ibqp.pd != wr->wr.ud.ah->pd)
+                       return -EINVAL;
+       } else if ((unsigned) wr->opcode > IB_WR_ATOMIC_FETCH_AND_ADD)
+               return -EINVAL;
+       else if (wr->opcode >= IB_WR_ATOMIC_CMP_AND_SWP &&
+                  (wr->num_sge == 0 ||
+                   wr->sg_list[0].length < sizeof(u64) ||
+                   wr->sg_list[0].addr & (sizeof(u64) - 1)))
+               return -EINVAL;
+       else if (wr->opcode >= IB_WR_RDMA_READ && !qp->s_max_rd_atomic)
+               return -EINVAL;
+
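+       /* The send queue is a ring; it is full when advancing the head would reach the tail. */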
+       next = qp->s_head + 1;
+       if (next >= qp->s_size)
+               next = 0;
+       if (next == qp->s_last)
+               return -ENOMEM;
+
+       rkt = &to_idev(qp->ibqp.device)->lk_table;
+       pd = to_ipd(qp->ibqp.pd);
+       wqe = get_swqe_ptr(qp, qp->s_head);
+       wqe->wr = *wr;
+       wqe->length = 0;
+       j = 0;
+       if (wr->num_sge) {
+               acc = wr->opcode >= IB_WR_RDMA_READ ?
+                       IB_ACCESS_LOCAL_WRITE : 0;
+               for (i = 0; i < wr->num_sge; i++) {
+                       u32 length = wr->sg_list[i].length;
+                       int ok;
+
+                       if (length == 0)
+                               continue;
+                       ok = hfi1_lkey_ok(rkt, pd, &wqe->sg_list[j],
+                                         &wr->sg_list[i], acc);
+                       if (!ok)
+                               goto bail_inval_free;
+                       wqe->length += length;
+                       j++;
+               }
+               wqe->wr.num_sge = j;
+       }
+       if (qp->ibqp.qp_type == IB_QPT_UC ||
+           qp->ibqp.qp_type == IB_QPT_RC) {
+               if (wqe->length > 0x80000000U)
+                       goto bail_inval_free;
+       } else {
+               struct hfi1_ah *ah = to_iah(wr->wr.ud.ah);
+
+               atomic_inc(&ah->refcount);
+       }
+       wqe->ssn = qp->s_ssn++;
+       qp->s_head = next;
+
+       return 0;
+
+bail_inval_free:
+       /* release mr holds */
+       while (j) {
+               struct hfi1_sge *sge = &wqe->sg_list[--j];
+
+               hfi1_put_mr(sge->mr);
+       }
+       return -EINVAL;
+}
+
+/**
+ * post_send - post a send on a QP
+ * @ibqp: the QP to post the send on
+ * @wr: the list of work requests to post
+ * @bad_wr: the first bad WR is put here
+ *
+ * This may be called from interrupt context.
+ */
+static int post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+                    struct ib_send_wr **bad_wr)
+{
+       struct hfi1_qp *qp = to_iqp(ibqp);
+       int err = 0;
+       int call_send;
+       unsigned long flags;
+       unsigned nreq = 0;
+
+       spin_lock_irqsave(&qp->s_lock, flags);
+
+       /* Check that state is OK to post send. */
+       if (unlikely(!(ib_hfi1_state_ops[qp->state] & HFI1_POST_SEND_OK))) {
+               spin_unlock_irqrestore(&qp->s_lock, flags);
+               return -EINVAL;
+       }
+
+       /* Call send directly if the send queue is empty and this is a single WR (no list) */
+       call_send = qp->s_head == qp->s_last && !wr->next;
+
+       for (; wr; wr = wr->next) {
+               err = post_one_send(qp, wr);
+               if (unlikely(err)) {
+                       *bad_wr = wr;
+                       goto bail;
+               }
+               nreq++;
+       }
+bail:
+       if (nreq && !call_send)
+               hfi1_schedule_send(qp);
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+       if (nreq && call_send)
+               hfi1_do_send(&qp->s_iowait.iowork);
+       return err;
+}
+
+/**
+ * post_receive - post a receive on a QP
+ * @ibqp: the QP to post the receive on
+ * @wr: the WR to post
+ * @bad_wr: the first bad WR is put here
+ *
+ * This may be called from interrupt context.
+ */
+static int post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+                       struct ib_recv_wr **bad_wr)
+{
+       struct hfi1_qp *qp = to_iqp(ibqp);
+       struct hfi1_rwq *wq = qp->r_rq.wq;
+       unsigned long flags;
+       int ret;
+
+       /* Check that state is OK to post receive. */
+       if (!(ib_hfi1_state_ops[qp->state] & HFI1_POST_RECV_OK) || !wq) {
+               *bad_wr = wr;
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       for (; wr; wr = wr->next) {
+               struct hfi1_rwqe *wqe;
+               u32 next;
+               int i;
+
+               if ((unsigned) wr->num_sge > qp->r_rq.max_sge) {
+                       *bad_wr = wr;
+                       ret = -EINVAL;
+                       goto bail;
+               }
+
+               spin_lock_irqsave(&qp->r_rq.lock, flags);
+               next = wq->head + 1;
+               if (next >= qp->r_rq.size)
+                       next = 0;
+               if (next == wq->tail) {
+                       spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+                       *bad_wr = wr;
+                       ret = -ENOMEM;
+                       goto bail;
+               }
+
+               wqe = get_rwqe_ptr(&qp->r_rq, wq->head);
+               wqe->wr_id = wr->wr_id;
+               wqe->num_sge = wr->num_sge;
+               for (i = 0; i < wr->num_sge; i++)
+                       wqe->sg_list[i] = wr->sg_list[i];
+               /* Make sure queue entry is written before the head index. */
+               smp_wmb();
+               wq->head = next;
+               spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+       }
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+/*
+ * Make sure the QP is ready and able to accept the given opcode.
+ */
+static inline int qp_ok(int opcode, struct hfi1_packet *packet)
+{
+       struct hfi1_ibport *ibp;
+
+       if (!(ib_hfi1_state_ops[packet->qp->state] & HFI1_PROCESS_RECV_OK))
+               goto dropit;
+       if (((opcode & OPCODE_QP_MASK) == packet->qp->allowed_ops) ||
+           (opcode == IB_OPCODE_CNP))
+               return 1;
+dropit:
+       ibp = &packet->rcd->ppd->ibport_data;
+       ibp->n_pkt_drops++;
+       return 0;
+}
+
+/**
+ * hfi1_ib_rcv - process an incoming packet
+ * @packet: data packet information
+ *
+ * This is called to process an incoming packet at interrupt level.
+ *
+ * Tlen is the length of the header + data + CRC in bytes.
+ */
+void hfi1_ib_rcv(struct hfi1_packet *packet)
+{
+       struct hfi1_ctxtdata *rcd = packet->rcd;
+       struct hfi1_ib_header *hdr = packet->hdr;
+       u32 tlen = packet->tlen;
+       struct hfi1_pportdata *ppd = rcd->ppd;
+       struct hfi1_ibport *ibp = &ppd->ibport_data;
+       u32 qp_num;
+       int lnh;
+       u8 opcode;
+       u16 lid;
+
+       /* Check for GRH */
+       lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+       if (lnh == HFI1_LRH_BTH)
+               packet->ohdr = &hdr->u.oth;
+       else if (lnh == HFI1_LRH_GRH) {
+               u32 vtf;
+
+               packet->ohdr = &hdr->u.l.oth;
+               if (hdr->u.l.grh.next_hdr != IB_GRH_NEXT_HDR)
+                       goto drop;
+               vtf = be32_to_cpu(hdr->u.l.grh.version_tclass_flow);
+               if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION)
+                       goto drop;
+               packet->rcv_flags |= HFI1_HAS_GRH;
+       } else
+               goto drop;
+
+       trace_input_ibhdr(rcd->dd, hdr);
+
+       opcode = (be32_to_cpu(packet->ohdr->bth[0]) >> 24);
+       inc_opstats(tlen, &rcd->opstats->stats[opcode]);
+
+       /* Get the destination QP number. */
+       qp_num = be32_to_cpu(packet->ohdr->bth[1]) & HFI1_QPN_MASK;
+       lid = be16_to_cpu(hdr->lrh[1]);
+       if (unlikely((lid >= HFI1_MULTICAST_LID_BASE) &&
+           (lid != HFI1_PERMISSIVE_LID))) {
+               struct hfi1_mcast *mcast;
+               struct hfi1_mcast_qp *p;
+
+               if (lnh != HFI1_LRH_GRH)
+                       goto drop;
+               mcast = hfi1_mcast_find(ibp, &hdr->u.l.grh.dgid);
+               if (mcast == NULL)
+                       goto drop;
+               list_for_each_entry_rcu(p, &mcast->qp_list, list) {
+                       packet->qp = p->qp;
+                       spin_lock(&packet->qp->r_lock);
+                       if (likely((qp_ok(opcode, packet))))
+                               opcode_handler_tbl[opcode](packet);
+                       spin_unlock(&packet->qp->r_lock);
+               }
+               /*
+                * Notify hfi1_multicast_detach() if it is waiting for us
+                * to finish.
+                */
+               if (atomic_dec_return(&mcast->refcount) <= 1)
+                       wake_up(&mcast->wait);
+       } else {
+               rcu_read_lock();
+               packet->qp = hfi1_lookup_qpn(ibp, qp_num);
+               if (!packet->qp) {
+                       rcu_read_unlock();
+                       goto drop;
+               }
+               spin_lock(&packet->qp->r_lock);
+               if (likely((qp_ok(opcode, packet))))
+                       opcode_handler_tbl[opcode](packet);
+               spin_unlock(&packet->qp->r_lock);
+               rcu_read_unlock();
+       }
+       return;
+
+drop:
+       ibp->n_pkt_drops++;
+}
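+
+/*
+ * Illustrative note on the dispatch above: unicast packets look up the
+ * destination QP under rcu_read_lock() with hfi1_lookup_qpn(), while a
+ * multicast LID fans the packet out to every QP attached to the group.
+ * In both cases the per-QP r_lock is held across the opcode handler,
+ * and qp_ok() drops packets the QP's state or allowed_ops forbids.
+ */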
+
+/*
+ * This is called from a timer to check for QPs
+ * which need kernel memory in order to send a packet.
+ */
+static void mem_timer(unsigned long data)
+{
+       struct hfi1_ibdev *dev = (struct hfi1_ibdev *)data;
+       struct list_head *list = &dev->memwait;
+       struct hfi1_qp *qp = NULL;
+       struct iowait *wait;
+       unsigned long flags;
+
+       write_seqlock_irqsave(&dev->iowait_lock, flags);
+       if (!list_empty(list)) {
+               wait = list_first_entry(list, struct iowait, list);
+               qp = container_of(wait, struct hfi1_qp, s_iowait);
+               list_del_init(&qp->s_iowait.list);
+               /* refcount held until actual wake up */
+               if (!list_empty(list))
+                       mod_timer(&dev->mem_timer, jiffies + 1);
+       }
+       write_sequnlock_irqrestore(&dev->iowait_lock, flags);
+
+       if (qp)
+               hfi1_qp_wakeup(qp, HFI1_S_WAIT_KMEM);
+}
+
+void update_sge(struct hfi1_sge_state *ss, u32 length)
+{
+       struct hfi1_sge *sge = &ss->sge;
+
+       sge->vaddr += length;
+       sge->length -= length;
+       sge->sge_length -= length;
+       if (sge->sge_length == 0) {
+               if (--ss->num_sge)
+                       *sge = *ss->sg_list++;
+       } else if (sge->length == 0 && sge->mr->lkey) {
+               if (++sge->n >= HFI1_SEGSZ) {
+                       if (++sge->m >= sge->mr->mapsz)
+                               return;
+                       sge->n = 0;
+               }
+               sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
+               sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
+       }
+}
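+
+/*
+ * Illustrative note: each update_sge() call consumes 'length' bytes from
+ * the current SGE only (callers clamp length to sge.length).  When the
+ * whole SGE is used up (sge_length reaches 0) the cursor moves to the
+ * next sg_list entry; when only the current MR map segment is exhausted
+ * (length reaches 0 on an lkey-backed region) it steps to the next
+ * map[m]->segs[n] segment of the same region.
+ */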
+
+static noinline struct verbs_txreq *__get_txreq(struct hfi1_ibdev *dev,
+                                               struct hfi1_qp *qp)
+{
+       struct verbs_txreq *tx;
+       unsigned long flags;
+
+       tx = kmem_cache_alloc(dev->verbs_txreq_cache, GFP_ATOMIC);
+       if (!tx) {
+               spin_lock_irqsave(&qp->s_lock, flags);
+               write_seqlock(&dev->iowait_lock);
+               if (ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK &&
+                   list_empty(&qp->s_iowait.list)) {
+                       dev->n_txwait++;
+                       qp->s_flags |= HFI1_S_WAIT_TX;
+                       list_add_tail(&qp->s_iowait.list, &dev->txwait);
+                       trace_hfi1_qpsleep(qp, HFI1_S_WAIT_TX);
+                       atomic_inc(&qp->refcount);
+               }
+               qp->s_flags &= ~HFI1_S_BUSY;
+               write_sequnlock(&dev->iowait_lock);
+               spin_unlock_irqrestore(&qp->s_lock, flags);
+               tx = ERR_PTR(-EBUSY);
+       }
+       return tx;
+}
+
+static inline struct verbs_txreq *get_txreq(struct hfi1_ibdev *dev,
+                                           struct hfi1_qp *qp)
+{
+       struct verbs_txreq *tx;
+
+       tx = kmem_cache_alloc(dev->verbs_txreq_cache, GFP_ATOMIC);
+       if (!tx)
+               /* call slow path to get the lock */
+               tx = __get_txreq(dev, qp);
+       if (tx)
+               tx->qp = qp;
+       return tx;
+}
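+
+/*
+ * Illustrative note on the two-stage allocation above: get_txreq() first
+ * tries a lockless kmem_cache_alloc().  Only when that fails does it fall
+ * into the noinline slow path __get_txreq(), which retries the allocation
+ * and, if that also fails, takes s_lock and iowait_lock to park the QP on
+ * dev->txwait and returns ERR_PTR(-EBUSY), so the common case touches no
+ * locks at all.
+ */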
+
+void hfi1_put_txreq(struct verbs_txreq *tx)
+{
+       struct hfi1_ibdev *dev;
+       struct hfi1_qp *qp;
+       unsigned long flags;
+       unsigned int seq;
+
+       qp = tx->qp;
+       dev = to_idev(qp->ibqp.device);
+
+       if (tx->mr) {
+               hfi1_put_mr(tx->mr);
+               tx->mr = NULL;
+       }
+       sdma_txclean(dd_from_dev(dev), &tx->txreq);
+
+       /* Free verbs_txreq and return to slab cache */
+       kmem_cache_free(dev->verbs_txreq_cache, tx);
+
+       do {
+               seq = read_seqbegin(&dev->iowait_lock);
+               if (!list_empty(&dev->txwait)) {
+                       struct iowait *wait;
+
+                       write_seqlock_irqsave(&dev->iowait_lock, flags);
+                       /* Wake up first QP wanting a free struct */
+                       wait = list_first_entry(&dev->txwait, struct iowait,
+                                               list);
+                       qp = container_of(wait, struct hfi1_qp, s_iowait);
+                       list_del_init(&qp->s_iowait.list);
+                       /* refcount held until actual wake up */
+                       write_sequnlock_irqrestore(&dev->iowait_lock, flags);
+                       hfi1_qp_wakeup(qp, HFI1_S_WAIT_TX);
+                       break;
+               }
+       } while (read_seqretry(&dev->iowait_lock, seq));
+}
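+
+/*
+ * Illustrative note: the loop above uses the seqlock read side
+ * (read_seqbegin()/read_seqretry()) as a cheap check for waiters and only
+ * takes the iowait write lock when dev->txwait is non-empty, so freeing a
+ * txreq while nothing is waiting for one never takes a lock.
+ */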
+
+/*
+ * This is called with progress side lock held.
+ */
+/* New API */
+static void verbs_sdma_complete(
+       struct sdma_txreq *cookie,
+       int status,
+       int drained)
+{
+       struct verbs_txreq *tx =
+               container_of(cookie, struct verbs_txreq, txreq);
+       struct hfi1_qp *qp = tx->qp;
+
+       spin_lock(&qp->s_lock);
+       if (tx->wqe)
+               hfi1_send_complete(qp, tx->wqe, IB_WC_SUCCESS);
+       else if (qp->ibqp.qp_type == IB_QPT_RC) {
+               struct hfi1_ib_header *hdr;
+
+               hdr = &tx->phdr.hdr;
+               hfi1_rc_send_complete(qp, hdr);
+       }
+       if (drained) {
+               /*
+                * This happens when the send engine notes
+                * a QP in the error state and cannot
+                * do the flush work until that QP's
+                * sdma work has finished.
+                */
+               if (qp->s_flags & HFI1_S_WAIT_DMA) {
+                       qp->s_flags &= ~HFI1_S_WAIT_DMA;
+                       hfi1_schedule_send(qp);
+               }
+       }
+       spin_unlock(&qp->s_lock);
+
+       hfi1_put_txreq(tx);
+}
+
+static int wait_kmem(struct hfi1_ibdev *dev, struct hfi1_qp *qp)
+{
+       unsigned long flags;
+       int ret = 0;
+
+       spin_lock_irqsave(&qp->s_lock, flags);
+       if (ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK) {
+               write_seqlock(&dev->iowait_lock);
+               if (list_empty(&qp->s_iowait.list)) {
+                       if (list_empty(&dev->memwait))
+                               mod_timer(&dev->mem_timer, jiffies + 1);
+                       qp->s_flags |= HFI1_S_WAIT_KMEM;
+                       list_add_tail(&qp->s_iowait.list, &dev->memwait);
+                       trace_hfi1_qpsleep(qp, HFI1_S_WAIT_KMEM);
+                       atomic_inc(&qp->refcount);
+               }
+               write_sequnlock(&dev->iowait_lock);
+               qp->s_flags &= ~HFI1_S_BUSY;
+               ret = -EBUSY;
+       }
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+
+       return ret;
+}
+
+/*
+ * This routine calls sdma_txadd_kvaddr() for each SG entry.
+ *
+ * On an add failure the sge cursor is reverted to its original position.
+ */
+static int build_verbs_ulp_payload(
+       struct sdma_engine *sde,
+       struct hfi1_sge_state *ss,
+       u32 length,
+       struct verbs_txreq *tx)
+{
+       struct hfi1_sge *sg_list = ss->sg_list;
+       struct hfi1_sge sge = ss->sge;
+       u8 num_sge = ss->num_sge;
+       u32 len;
+       int ret = 0;
+
+       while (length) {
+               len = ss->sge.length;
+               if (len > length)
+                       len = length;
+               if (len > ss->sge.sge_length)
+                       len = ss->sge.sge_length;
+               WARN_ON_ONCE(len == 0);
+               ret = sdma_txadd_kvaddr(
+                       sde->dd,
+                       &tx->txreq,
+                       ss->sge.vaddr,
+                       len);
+               if (ret)
+                       goto bail_txadd;
+               update_sge(ss, len);
+               length -= len;
+       }
+       return ret;
+bail_txadd:
+       /* unwind cursor */
+       ss->sge = sge;
+       ss->num_sge = num_sge;
+       ss->sg_list = sg_list;
+       return ret;
+}
+
+/*
+ * Build the number of DMA descriptors needed to send length bytes of data.
+ *
+ * NOTE: DMA mapping is held in the tx until completed in the ring or
+ *       the tx desc is freed without having been submitted to the ring
+ *
+ * This routine ensures all of the helper routine calls succeed.
+ */
+/* New API */
+static int build_verbs_tx_desc(
+       struct sdma_engine *sde,
+       struct hfi1_sge_state *ss,
+       u32 length,
+       struct verbs_txreq *tx,
+       struct ahg_ib_header *ahdr,
+       u64 pbc)
+{
+       int ret = 0;
+       struct hfi1_pio_header *phdr;
+       u16 hdrbytes = tx->hdr_dwords << 2;
+
+       phdr = &tx->phdr;
+       if (!ahdr->ahgcount) {
+               ret = sdma_txinit_ahg(
+                       &tx->txreq,
+                       ahdr->tx_flags,
+                       hdrbytes + length,
+                       ahdr->ahgidx,
+                       0,
+                       NULL,
+                       0,
+                       verbs_sdma_complete);
+               if (ret)
+                       goto bail_txadd;
+               phdr->pbc = cpu_to_le64(pbc);
+               memcpy(&phdr->hdr, &ahdr->ibh, hdrbytes - sizeof(phdr->pbc));
+               /* add the header */
+               ret = sdma_txadd_kvaddr(
+                       sde->dd,
+                       &tx->txreq,
+                       &tx->phdr,
+                       tx->hdr_dwords << 2);
+               if (ret)
+                       goto bail_txadd;
+       } else {
+               struct hfi1_other_headers *sohdr = &ahdr->ibh.u.oth;
+               struct hfi1_other_headers *dohdr = &phdr->hdr.u.oth;
+
+               /* needed in rc_send_complete() */
+               phdr->hdr.lrh[0] = ahdr->ibh.lrh[0];
+               if ((be16_to_cpu(phdr->hdr.lrh[0]) & 3) == HFI1_LRH_GRH) {
+                       sohdr = &ahdr->ibh.u.l.oth;
+                       dohdr = &phdr->hdr.u.l.oth;
+               }
+               /* opcode */
+               dohdr->bth[0] = sohdr->bth[0];
+               /* PSN/ACK  */
+               dohdr->bth[2] = sohdr->bth[2];
+               ret = sdma_txinit_ahg(
+                       &tx->txreq,
+                       ahdr->tx_flags,
+                       length,
+                       ahdr->ahgidx,
+                       ahdr->ahgcount,
+                       ahdr->ahgdesc,
+                       hdrbytes,
+                       verbs_sdma_complete);
+               if (ret)
+                       goto bail_txadd;
+       }
+
+       /* add the ulp payload - if any.  ss can be NULL for acks */
+       if (ss)
+               ret = build_verbs_ulp_payload(sde, ss, length, tx);
+bail_txadd:
+       return ret;
+}
+
+int hfi1_verbs_send_dma(struct hfi1_qp *qp, struct ahg_ib_header *ahdr,
+                       u32 hdrwords, struct hfi1_sge_state *ss, u32 len,
+                       u32 plen, u32 dwords, u64 pbc)
+{
+       struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+       struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       struct verbs_txreq *tx;
+       struct sdma_txreq *stx;
+       u64 pbc_flags = 0;
+       struct sdma_engine *sde;
+       u8 sc5 = qp->s_sc;
+       int ret;
+
+       if (!list_empty(&qp->s_iowait.tx_head)) {
+               stx = list_first_entry(
+                       &qp->s_iowait.tx_head,
+                       struct sdma_txreq,
+                       list);
+               list_del_init(&stx->list);
+               tx = container_of(stx, struct verbs_txreq, txreq);
+               ret = sdma_send_txreq(tx->sde, &qp->s_iowait, stx);
+               if (unlikely(ret == -ECOMM))
+                       goto bail_ecomm;
+               return ret;
+       }
+
+       tx = get_txreq(dev, qp);
+       if (IS_ERR(tx))
+               goto bail_tx;
+
+       if (!qp->s_hdr->sde) {
+               tx->sde = sde = qp_to_sdma_engine(qp, sc5);
+               if (!sde)
+                       goto bail_no_sde;
+       } else
+               tx->sde = sde = qp->s_hdr->sde;
+
+       if (likely(pbc == 0)) {
+               u32 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5);
+               /* No vl15 here */
+               /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */
+               pbc_flags |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT;
+
+               pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
+       }
+       tx->wqe = qp->s_wqe;
+       tx->mr = qp->s_rdma_mr;
+       if (qp->s_rdma_mr)
+               qp->s_rdma_mr = NULL;
+       tx->hdr_dwords = hdrwords + 2;
+       ret = build_verbs_tx_desc(sde, ss, len, tx, ahdr, pbc);
+       if (unlikely(ret))
+               goto bail_build;
+       trace_output_ibhdr(dd_from_ibdev(qp->ibqp.device), &ahdr->ibh);
+       ret = sdma_send_txreq(sde, &qp->s_iowait, &tx->txreq);
+       if (unlikely(ret == -ECOMM))
+               goto bail_ecomm;
+       return ret;
+
+bail_no_sde:
+       hfi1_put_txreq(tx);
+bail_ecomm:
+       /* The current one got "sent" */
+       return 0;
+bail_build:
+       /* kmalloc or mapping fail */
+       hfi1_put_txreq(tx);
+       return wait_kmem(dev, qp);
+bail_tx:
+       return PTR_ERR(tx);
+}
+
+/*
+ * If we are now in the error state, return zero to flush the
+ * send work request.
+ */
+static int no_bufs_available(struct hfi1_qp *qp, struct send_context *sc)
+{
+       struct hfi1_devdata *dd = sc->dd;
+       struct hfi1_ibdev *dev = &dd->verbs_dev;
+       unsigned long flags;
+       int ret = 0;
+
+       /*
+        * Note that as soon as want_buffer() is called and
+        * possibly before it returns, sc_piobufavail()
+        * could be called. Therefore, put QP on the I/O wait list before
+        * enabling the PIO avail interrupt.
+        */
+       spin_lock_irqsave(&qp->s_lock, flags);
+       if (ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK) {
+               write_seqlock(&dev->iowait_lock);
+               if (list_empty(&qp->s_iowait.list)) {
+                       struct hfi1_ibdev *dev = &dd->verbs_dev;
+                       int was_empty;
+
+                       dev->n_piowait++;
+                       qp->s_flags |= HFI1_S_WAIT_PIO;
+                       was_empty = list_empty(&sc->piowait);
+                       list_add_tail(&qp->s_iowait.list, &sc->piowait);
+                       trace_hfi1_qpsleep(qp, HFI1_S_WAIT_PIO);
+                       atomic_inc(&qp->refcount);
+                       /* counting: only call wantpiobuf_intr if first user */
+                       if (was_empty)
+                               hfi1_sc_wantpiobuf_intr(sc, 1);
+               }
+               write_sequnlock(&dev->iowait_lock);
+               qp->s_flags &= ~HFI1_S_BUSY;
+               ret = -EBUSY;
+       }
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+       return ret;
+}
+
+struct send_context *qp_to_send_context(struct hfi1_qp *qp, u8 sc5)
+{
+       struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+       struct hfi1_pportdata *ppd = dd->pport + (qp->port_num - 1);
+       u8 vl;
+
+       vl = sc_to_vlt(dd, sc5);
+       if (vl >= ppd->vls_supported && vl != 15)
+               return NULL;
+       return dd->vld[vl].sc;
+}
+
+int hfi1_verbs_send_pio(struct hfi1_qp *qp, struct ahg_ib_header *ahdr,
+                       u32 hdrwords, struct hfi1_sge_state *ss, u32 len,
+                       u32 plen, u32 dwords, u64 pbc)
+{
+       struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       u32 *hdr = (u32 *)&ahdr->ibh;
+       u64 pbc_flags = 0;
+       u32 sc5;
+       unsigned long flags = 0;
+       struct send_context *sc;
+       struct pio_buf *pbuf;
+       int wc_status = IB_WC_SUCCESS;
+
+       /* vl15 special case taken care of in ud.c */
+       sc5 = qp->s_sc;
+       sc = qp_to_send_context(qp, sc5);
+
+       if (!sc)
+               return -EINVAL;
+       if (likely(pbc == 0)) {
+               u32 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5);
+               /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */
+               pbc_flags |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT;
+               pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
+       }
+       pbuf = sc_buffer_alloc(sc, plen, NULL, NULL);
+       if (unlikely(pbuf == NULL)) {
+               if (ppd->host_link_state != HLS_UP_ACTIVE) {
+                       /*
+                        * If we have filled the PIO buffers to capacity and are
+                        * not in an active state, this request is not going to
+                        * go out, so just complete it with an error; otherwise a
+                        * ULP or the core may be stuck waiting.
+                        */
+                       hfi1_cdbg(
+                               PIO,
+                               "alloc failed. state not active, completing");
+                       wc_status = IB_WC_GENERAL_ERR;
+                       goto pio_bail;
+               } else {
+                       /*
+                        * This is a normal occurrence. The PIO buffers are full,
+                        * but we are still able to send, so continue to queue
+                        * the request.
+                        */
+                       hfi1_cdbg(PIO, "alloc failed. state active, queuing");
+                       return no_bufs_available(qp, sc);
+               }
+       }
+
+       if (len == 0) {
+               pio_copy(ppd->dd, pbuf, pbc, hdr, hdrwords);
+       } else {
+               if (ss) {
+                       seg_pio_copy_start(pbuf, pbc, hdr, hdrwords*4);
+                       while (len) {
+                               void *addr = ss->sge.vaddr;
+                               u32 slen = ss->sge.length;
+
+                               if (slen > len)
+                                       slen = len;
+                               update_sge(ss, slen);
+                               seg_pio_copy_mid(pbuf, addr, slen);
+                               len -= slen;
+                       }
+                       seg_pio_copy_end(pbuf);
+               }
+       }
+
+       trace_output_ibhdr(dd_from_ibdev(qp->ibqp.device), &ahdr->ibh);
+
+       if (qp->s_rdma_mr) {
+               hfi1_put_mr(qp->s_rdma_mr);
+               qp->s_rdma_mr = NULL;
+       }
+
+pio_bail:
+       if (qp->s_wqe) {
+               spin_lock_irqsave(&qp->s_lock, flags);
+               hfi1_send_complete(qp, qp->s_wqe, wc_status);
+               spin_unlock_irqrestore(&qp->s_lock, flags);
+       } else if (qp->ibqp.qp_type == IB_QPT_RC) {
+               spin_lock_irqsave(&qp->s_lock, flags);
+               hfi1_rc_send_complete(qp, &ahdr->ibh);
+               spin_unlock_irqrestore(&qp->s_lock, flags);
+       }
+       return 0;
+}
+
+/*
+ * egress_pkey_matches_entry - return 1 if the pkey matches ent (ent
+ * being an entry from the ingress partition key table), return 0
+ * otherwise. Use the matching criteria for egress partition keys
+ * specified in the OPAv1 spec., section 9.11.7.
+ */
+static inline int egress_pkey_matches_entry(u16 pkey, u16 ent)
+{
+       u16 mkey = pkey & PKEY_LOW_15_MASK;
+       u16 ment = ent & PKEY_LOW_15_MASK;
+
+       if (mkey == ment) {
+               /*
+                * If pkey[15] is set (full partition member),
+                * is bit 15 in the corresponding table element
+                * clear (limited member)?
+                */
+               if (pkey & PKEY_MEMBER_MASK)
+                       return !!(ent & PKEY_MEMBER_MASK);
+               return 1;
+       }
+       return 0;
+}
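+
+/*
+ * Worked example of the rule above (assuming PKEY_MEMBER_MASK selects
+ * bit 15, the full-member bit): a limited-member pkey 0x0001 matches any
+ * table entry whose low 15 bits are 0x0001 (0x0001 or 0x8001), whereas a
+ * full-member pkey 0x8001 matches the full-member entry 0x8001 but is
+ * rejected against the limited entry 0x0001.
+ */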
+
+/*
+ * egress_pkey_check - return 0 if hdr's pkey matches according to the
+ * criteria in the OPAv1 spec., section 9.11.7.
+ */
+static inline int egress_pkey_check(struct hfi1_pportdata *ppd,
+                                   struct hfi1_ib_header *hdr,
+                                   struct hfi1_qp *qp)
+{
+       struct hfi1_other_headers *ohdr;
+       struct hfi1_devdata *dd;
+       int i = 0;
+       u16 pkey;
+       u8 lnh, sc5 = qp->s_sc;
+
+       if (!(ppd->part_enforce & HFI1_PART_ENFORCE_OUT))
+               return 0;
+
+       /* locate the pkey within the headers */
+       lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+       if (lnh == HFI1_LRH_GRH)
+               ohdr = &hdr->u.l.oth;
+       else
+               ohdr = &hdr->u.oth;
+
+       pkey = (u16)be32_to_cpu(ohdr->bth[0]);
+
+       /* If SC15, pkey[0:14] must be 0x7fff */
+       if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK))
+               goto bad;
+
+       /* Is the pkey = 0x0, or 0x8000? */
+       if ((pkey & PKEY_LOW_15_MASK) == 0)
+               goto bad;
+
+       /* The most likely matching pkey has index qp->s_pkey_index */
+       if (unlikely(!egress_pkey_matches_entry(pkey,
+                                       ppd->pkeys[qp->s_pkey_index]))) {
+               /* no match - try the entire table */
+               for (; i < MAX_PKEY_VALUES; i++) {
+                       if (egress_pkey_matches_entry(pkey, ppd->pkeys[i]))
+                               break;
+               }
+       }
+
+       if (i < MAX_PKEY_VALUES)
+               return 0;
+bad:
+       incr_cntr64(&ppd->port_xmit_constraint_errors);
+       dd = ppd->dd;
+       if (!(dd->err_info_xmit_constraint.status & OPA_EI_STATUS_SMASK)) {
+               u16 slid = be16_to_cpu(hdr->lrh[3]);
+
+               dd->err_info_xmit_constraint.status |= OPA_EI_STATUS_SMASK;
+               dd->err_info_xmit_constraint.slid = slid;
+               dd->err_info_xmit_constraint.pkey = pkey;
+       }
+       return 1;
+}
+
+/**
+ * hfi1_verbs_send - send a packet
+ * @qp: the QP to send on
+ * @ahdr: the packet header
+ * @hdrwords: the number of 32-bit words in the header
+ * @ss: the SGE to send
+ * @len: the length of the packet in bytes
+ *
+ * Return zero if packet is sent or queued OK.
+ * Return non-zero and clear qp->s_flags HFI1_S_BUSY otherwise.
+ */
+int hfi1_verbs_send(struct hfi1_qp *qp, struct ahg_ib_header *ahdr,
+                   u32 hdrwords, struct hfi1_sge_state *ss, u32 len)
+{
+       struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+       u32 plen;
+       int ret;
+       int pio = 0;
+       unsigned long flags = 0;
+       u32 dwords = (len + 3) >> 2;
+
+       /*
+        * VL15 packets (IB_QPT_SMI) will always use PIO, so we
+        * can defer SDMA restart until link goes ACTIVE without
+        * worrying about just how we got there.
+        */
+       if ((qp->ibqp.qp_type == IB_QPT_SMI) ||
+           !(dd->flags & HFI1_HAS_SEND_DMA))
+               pio = 1;
+
+       ret = egress_pkey_check(dd->pport, &ahdr->ibh, qp);
+       if (unlikely(ret)) {
+               /*
+                * the verbs caller. Thus we need to complete the request with an
+                * error, otherwise the caller could be left waiting on the
+                * error otherwise the caller could be sitting waiting on the
+                * completion event. Only do this for PIO. SDMA has its own
+                * mechanism for handling the errors. So for SDMA we can just
+                * return.
+                */
+               if (pio) {
+                       hfi1_cdbg(PIO, "%s() Failed. Completing with err",
+                                 __func__);
+                       spin_lock_irqsave(&qp->s_lock, flags);
+                       hfi1_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR);
+                       spin_unlock_irqrestore(&qp->s_lock, flags);
+               }
+               return -EINVAL;
+       }
+
+       /*
+        * Calculate the send buffer trigger address.
+        * The +2 accounts for the PBC control qword.
+        */
+       plen = hdrwords + dwords + 2;
+
+       if (pio) {
+               ret = dd->process_pio_send(
+                       qp, ahdr, hdrwords, ss, len, plen, dwords, 0);
+       } else {
+#ifdef CONFIG_SDMA_VERBOSITY
+               dd_dev_err(dd, "CONFIG SDMA %s:%d %s()\n",
+                          slashstrip(__FILE__), __LINE__, __func__);
+               dd_dev_err(dd, "SDMA hdrwords = %u, len = %u\n", hdrwords, len);
+#endif
+               ret = dd->process_dma_send(
+                       qp, ahdr, hdrwords, ss, len, plen, dwords, 0);
+       }
+
+       return ret;
+}
+
+static int query_device(struct ib_device *ibdev,
+                       struct ib_device_attr *props,
+                       struct ib_udata *uhw)
+{
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       struct hfi1_ibdev *dev = to_idev(ibdev);
+
+       if (uhw->inlen || uhw->outlen)
+               return -EINVAL;
+       memset(props, 0, sizeof(*props));
+
+       props->device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR |
+               IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT |
+               IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN |
+               IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE;
+
+       props->page_size_cap = PAGE_SIZE;
+       props->vendor_id =
+               dd->oui1 << 16 | dd->oui2 << 8 | dd->oui3;
+       props->vendor_part_id = dd->pcidev->device;
+       props->hw_ver = dd->minrev;
+       props->sys_image_guid = ib_hfi1_sys_image_guid;
+       props->max_mr_size = ~0ULL;
+       props->max_qp = hfi1_max_qps;
+       props->max_qp_wr = hfi1_max_qp_wrs;
+       props->max_sge = hfi1_max_sges;
+       props->max_sge_rd = hfi1_max_sges;
+       props->max_cq = hfi1_max_cqs;
+       props->max_ah = hfi1_max_ahs;
+       props->max_cqe = hfi1_max_cqes;
+       props->max_mr = dev->lk_table.max;
+       props->max_fmr = dev->lk_table.max;
+       props->max_map_per_fmr = 32767;
+       props->max_pd = hfi1_max_pds;
+       props->max_qp_rd_atom = HFI1_MAX_RDMA_ATOMIC;
+       props->max_qp_init_rd_atom = 255;
+       /* props->max_res_rd_atom */
+       props->max_srq = hfi1_max_srqs;
+       props->max_srq_wr = hfi1_max_srq_wrs;
+       props->max_srq_sge = hfi1_max_srq_sges;
+       /* props->local_ca_ack_delay */
+       props->atomic_cap = IB_ATOMIC_GLOB;
+       props->max_pkeys = hfi1_get_npkeys(dd);
+       props->max_mcast_grp = hfi1_max_mcast_grps;
+       props->max_mcast_qp_attach = hfi1_max_mcast_qp_attached;
+       props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
+               props->max_mcast_grp;
+
+       return 0;
+}
+
+static inline u16 opa_speed_to_ib(u16 in)
+{
+       u16 out = 0;
+
+       if (in & OPA_LINK_SPEED_25G)
+               out |= IB_SPEED_EDR;
+       if (in & OPA_LINK_SPEED_12_5G)
+               out |= IB_SPEED_FDR;
+
+       return out;
+}
+
+/*
+ * Convert a single OPA link width (no multiple flags) to an IB value.
+ * A zero OPA link width means link down, which means the IB width value
+ * is a don't care.
+ */
+static inline u16 opa_width_to_ib(u16 in)
+{
+       switch (in) {
+       case OPA_LINK_WIDTH_1X:
+       /* map 2x and 3x to 1x as they don't exist in IB */
+       case OPA_LINK_WIDTH_2X:
+       case OPA_LINK_WIDTH_3X:
+               return IB_WIDTH_1X;
+       default: /* link down or unknown, return our largest width */
+       case OPA_LINK_WIDTH_4X:
+               return IB_WIDTH_4X;
+       }
+}
+
+static int query_port(struct ib_device *ibdev, u8 port,
+                     struct ib_port_attr *props)
+{
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       u16 lid = ppd->lid;
+
+       memset(props, 0, sizeof(*props));
+       props->lid = lid ? lid : 0;
+       props->lmc = ppd->lmc;
+       props->sm_lid = ibp->sm_lid;
+       props->sm_sl = ibp->sm_sl;
+       /* OPA logical states match IB logical states */
+       props->state = driver_lstate(ppd);
+       props->phys_state = hfi1_ibphys_portstate(ppd);
+       props->port_cap_flags = ibp->port_cap_flags;
+       props->gid_tbl_len = HFI1_GUIDS_PER_PORT;
+       props->max_msg_sz = 0x80000000;
+       props->pkey_tbl_len = hfi1_get_npkeys(dd);
+       props->bad_pkey_cntr = ibp->pkey_violations;
+       props->qkey_viol_cntr = ibp->qkey_violations;
+       props->active_width = (u8)opa_width_to_ib(ppd->link_width_active);
+       /* see rate_show() in ib core/sysfs.c */
+       props->active_speed = (u8)opa_speed_to_ib(ppd->link_speed_active);
+       props->max_vl_num = ppd->vls_supported;
+       props->init_type_reply = 0;
+
+       /* Once we are a "first class" citizen and have added the OPA MTUs to
+        * the core we can advertise the larger MTU enum to the ULPs, for now
+        * advertise only 4K.
+        *
+        * Those applications which are either OPA aware or pass the MTU enum
+        * from the Path Records to us will get the new 8k MTU.  Those that
+        * attempt to process the MTU enum may fail in various ways.
+        */
+       props->max_mtu = mtu_to_enum((!valid_ib_mtu(hfi1_max_mtu) ?
+                                     4096 : hfi1_max_mtu), IB_MTU_4096);
+       props->active_mtu = !valid_ib_mtu(ppd->ibmtu) ? props->max_mtu :
+               mtu_to_enum(ppd->ibmtu, IB_MTU_2048);
+       props->subnet_timeout = ibp->subnet_timeout;
+
+       return 0;
+}
+
+static int port_immutable(struct ib_device *ibdev, u8 port_num,
+                         struct ib_port_immutable *immutable)
+{
+       struct ib_port_attr attr;
+       int err;
+
+       err = query_port(ibdev, port_num, &attr);
+       if (err)
+               return err;
+
+       memset(immutable, 0, sizeof(*immutable));
+
+       immutable->pkey_tbl_len = attr.pkey_tbl_len;
+       immutable->gid_tbl_len = attr.gid_tbl_len;
+       immutable->core_cap_flags = RDMA_CORE_PORT_INTEL_OPA;
+       immutable->max_mad_size = OPA_MGMT_MAD_SIZE;
+
+       return 0;
+}
+
+static int modify_device(struct ib_device *device,
+                        int device_modify_mask,
+                        struct ib_device_modify *device_modify)
+{
+       struct hfi1_devdata *dd = dd_from_ibdev(device);
+       unsigned i;
+       int ret;
+
+       if (device_modify_mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID |
+                                  IB_DEVICE_MODIFY_NODE_DESC)) {
+               ret = -EOPNOTSUPP;
+               goto bail;
+       }
+
+       if (device_modify_mask & IB_DEVICE_MODIFY_NODE_DESC) {
+               memcpy(device->node_desc, device_modify->node_desc, 64);
+               for (i = 0; i < dd->num_pports; i++) {
+                       struct hfi1_ibport *ibp = &dd->pport[i].ibport_data;
+
+                       hfi1_node_desc_chg(ibp);
+               }
+       }
+
+       if (device_modify_mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID) {
+               ib_hfi1_sys_image_guid =
+                       cpu_to_be64(device_modify->sys_image_guid);
+               for (i = 0; i < dd->num_pports; i++) {
+                       struct hfi1_ibport *ibp = &dd->pport[i].ibport_data;
+
+                       hfi1_sys_guid_chg(ibp);
+               }
+       }
+
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+static int modify_port(struct ib_device *ibdev, u8 port,
+                      int port_modify_mask, struct ib_port_modify *props)
+{
+       struct hfi1_ibport *ibp = to_iport(ibdev, port);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       int ret = 0;
+
+       ibp->port_cap_flags |= props->set_port_cap_mask;
+       ibp->port_cap_flags &= ~props->clr_port_cap_mask;
+       if (props->set_port_cap_mask || props->clr_port_cap_mask)
+               hfi1_cap_mask_chg(ibp);
+       if (port_modify_mask & IB_PORT_SHUTDOWN) {
+               set_link_down_reason(ppd, OPA_LINKDOWN_REASON_UNKNOWN, 0,
+                 OPA_LINKDOWN_REASON_UNKNOWN);
+               ret = set_link_state(ppd, HLS_DN_DOWNDEF);
+       }
+       if (port_modify_mask & IB_PORT_RESET_QKEY_CNTR)
+               ibp->qkey_violations = 0;
+       return ret;
+}
+
+static int query_gid(struct ib_device *ibdev, u8 port,
+                    int index, union ib_gid *gid)
+{
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       int ret = 0;
+
+       if (!port || port > dd->num_pports)
+               ret = -EINVAL;
+       else {
+               struct hfi1_ibport *ibp = to_iport(ibdev, port);
+               struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+
+               gid->global.subnet_prefix = ibp->gid_prefix;
+               if (index == 0)
+                       gid->global.interface_id = cpu_to_be64(ppd->guid);
+               else if (index < HFI1_GUIDS_PER_PORT)
+                       gid->global.interface_id = ibp->guids[index - 1];
+               else
+                       ret = -EINVAL;
+       }
+
+       return ret;
+}
+
+static struct ib_pd *alloc_pd(struct ib_device *ibdev,
+                             struct ib_ucontext *context,
+                             struct ib_udata *udata)
+{
+       struct hfi1_ibdev *dev = to_idev(ibdev);
+       struct hfi1_pd *pd;
+       struct ib_pd *ret;
+
+       /*
+        * This is actually totally arbitrary.  Some correctness tests
+        * assume there's a maximum number of PDs that can be allocated.
+        * We don't actually have this limit, but we fail the test if
+        * we allow allocations of more than we report for this value.
+        */
+
+       pd = kmalloc(sizeof(*pd), GFP_KERNEL);
+       if (!pd) {
+               ret = ERR_PTR(-ENOMEM);
+               goto bail;
+       }
+
+       spin_lock(&dev->n_pds_lock);
+       if (dev->n_pds_allocated == hfi1_max_pds) {
+               spin_unlock(&dev->n_pds_lock);
+               kfree(pd);
+               ret = ERR_PTR(-ENOMEM);
+               goto bail;
+       }
+
+       dev->n_pds_allocated++;
+       spin_unlock(&dev->n_pds_lock);
+
+       /* ib_alloc_pd() will initialize pd->ibpd. */
+       pd->user = udata != NULL;
+
+       ret = &pd->ibpd;
+
+bail:
+       return ret;
+}
+
+static int dealloc_pd(struct ib_pd *ibpd)
+{
+       struct hfi1_pd *pd = to_ipd(ibpd);
+       struct hfi1_ibdev *dev = to_idev(ibpd->device);
+
+       spin_lock(&dev->n_pds_lock);
+       dev->n_pds_allocated--;
+       spin_unlock(&dev->n_pds_lock);
+
+       kfree(pd);
+
+       return 0;
+}
+
+/*
+ * Convert an AH's port number and SL into an SC.
+ */
+u8 ah_to_sc(struct ib_device *ibdev, struct ib_ah_attr *ah)
+{
+       struct hfi1_ibport *ibp = to_iport(ibdev, ah->port_num);
+
+       return ibp->sl_to_sc[ah->sl];
+}
+
+int hfi1_check_ah(struct ib_device *ibdev, struct ib_ah_attr *ah_attr)
+{
+       struct hfi1_ibport *ibp;
+       struct hfi1_pportdata *ppd;
+       struct hfi1_devdata *dd;
+       u8 sc5;
+
+       /* A multicast address requires a GRH (see ch. 8.4.1). */
+       if (ah_attr->dlid >= HFI1_MULTICAST_LID_BASE &&
+           ah_attr->dlid != HFI1_PERMISSIVE_LID &&
+           !(ah_attr->ah_flags & IB_AH_GRH))
+               goto bail;
+       if ((ah_attr->ah_flags & IB_AH_GRH) &&
+           ah_attr->grh.sgid_index >= HFI1_GUIDS_PER_PORT)
+               goto bail;
+       if (ah_attr->dlid == 0)
+               goto bail;
+       if (ah_attr->port_num < 1 ||
+           ah_attr->port_num > ibdev->phys_port_cnt)
+               goto bail;
+       if (ah_attr->static_rate != IB_RATE_PORT_CURRENT &&
+           ib_rate_to_mbps(ah_attr->static_rate) < 0)
+               goto bail;
+       if (ah_attr->sl >= OPA_MAX_SLS)
+               goto bail;
+       /* test the mapping for validity */
+       ibp = to_iport(ibdev, ah_attr->port_num);
+       ppd = ppd_from_ibp(ibp);
+       sc5 = ibp->sl_to_sc[ah_attr->sl];
+       dd = dd_from_ppd(ppd);
+       if (sc_to_vlt(dd, sc5) > num_vls && sc_to_vlt(dd, sc5) != 0xf)
+               goto bail;
+       return 0;
+bail:
+       return -EINVAL;
+}
+
+/**
+ * create_ah - create an address handle
+ * @pd: the protection domain
+ * @ah_attr: the attributes of the AH
+ *
+ * This may be called from interrupt context.
+ */
+static struct ib_ah *create_ah(struct ib_pd *pd,
+                              struct ib_ah_attr *ah_attr)
+{
+       struct hfi1_ah *ah;
+       struct ib_ah *ret;
+       struct hfi1_ibdev *dev = to_idev(pd->device);
+       unsigned long flags;
+
+       if (hfi1_check_ah(pd->device, ah_attr)) {
+               ret = ERR_PTR(-EINVAL);
+               goto bail;
+       }
+
+       ah = kmalloc(sizeof(*ah), GFP_ATOMIC);
+       if (!ah) {
+               ret = ERR_PTR(-ENOMEM);
+               goto bail;
+       }
+
+       spin_lock_irqsave(&dev->n_ahs_lock, flags);
+       if (dev->n_ahs_allocated == hfi1_max_ahs) {
+               spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
+               kfree(ah);
+               ret = ERR_PTR(-ENOMEM);
+               goto bail;
+       }
+
+       dev->n_ahs_allocated++;
+       spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
+
+       /* ib_create_ah() will initialize ah->ibah. */
+       ah->attr = *ah_attr;
+       atomic_set(&ah->refcount, 0);
+
+       ret = &ah->ibah;
+
+bail:
+       return ret;
+}
+
+struct ib_ah *hfi1_create_qp0_ah(struct hfi1_ibport *ibp, u16 dlid)
+{
+       struct ib_ah_attr attr;
+       struct ib_ah *ah = ERR_PTR(-EINVAL);
+       struct hfi1_qp *qp0;
+
+       memset(&attr, 0, sizeof(attr));
+       attr.dlid = dlid;
+       attr.port_num = ppd_from_ibp(ibp)->port;
+       rcu_read_lock();
+       qp0 = rcu_dereference(ibp->qp[0]);
+       if (qp0)
+               ah = ib_create_ah(qp0->ibqp.pd, &attr);
+       rcu_read_unlock();
+       return ah;
+}
+
+/**
+ * destroy_ah - destroy an address handle
+ * @ibah: the AH to destroy
+ *
+ * This may be called from interrupt context.
+ */
+static int destroy_ah(struct ib_ah *ibah)
+{
+       struct hfi1_ibdev *dev = to_idev(ibah->device);
+       struct hfi1_ah *ah = to_iah(ibah);
+       unsigned long flags;
+
+       if (atomic_read(&ah->refcount) != 0)
+               return -EBUSY;
+
+       spin_lock_irqsave(&dev->n_ahs_lock, flags);
+       dev->n_ahs_allocated--;
+       spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
+
+       kfree(ah);
+
+       return 0;
+}
+
+static int modify_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr)
+{
+       struct hfi1_ah *ah = to_iah(ibah);
+
+       if (hfi1_check_ah(ibah->device, ah_attr))
+               return -EINVAL;
+
+       ah->attr = *ah_attr;
+
+       return 0;
+}
+
+static int query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr)
+{
+       struct hfi1_ah *ah = to_iah(ibah);
+
+       *ah_attr = ah->attr;
+
+       return 0;
+}
+
+/**
+ * hfi1_get_npkeys - return the size of the PKEY table for context 0
+ * @dd: the hfi1_ib device
+ */
+unsigned hfi1_get_npkeys(struct hfi1_devdata *dd)
+{
+       return ARRAY_SIZE(dd->pport[0].pkeys);
+}
+
+static int query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+                     u16 *pkey)
+{
+       struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+       int ret;
+
+       if (index >= hfi1_get_npkeys(dd)) {
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       *pkey = hfi1_get_pkey(to_iport(ibdev, port), index);
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+/**
+ * alloc_ucontext - allocate a ucontext
+ * @ibdev: the infiniband device
+ * @udata: not used by the driver
+ */
+static struct ib_ucontext *alloc_ucontext(struct ib_device *ibdev,
+                                         struct ib_udata *udata)
+{
+       struct hfi1_ucontext *context;
+       struct ib_ucontext *ret;
+
+       context = kmalloc(sizeof(*context), GFP_KERNEL);
+       if (!context) {
+               ret = ERR_PTR(-ENOMEM);
+               goto bail;
+       }
+
+       ret = &context->ibucontext;
+
+bail:
+       return ret;
+}
+
+static int dealloc_ucontext(struct ib_ucontext *context)
+{
+       kfree(to_iucontext(context));
+       return 0;
+}
+
+static void init_ibport(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_ibport *ibp = &ppd->ibport_data;
+       size_t sz = ARRAY_SIZE(ibp->sl_to_sc);
+       int i;
+
+       for (i = 0; i < sz; i++) {
+               ibp->sl_to_sc[i] = i;
+               ibp->sc_to_sl[i] = i;
+       }
+
+       spin_lock_init(&ibp->lock);
+       /* Set the prefix to the default value (see ch. 4.1.1) */
+       ibp->gid_prefix = IB_DEFAULT_GID_PREFIX;
+       ibp->sm_lid = 0;
+       /* Below should only set bits defined in OPA PortInfo.CapabilityMask */
+       ibp->port_cap_flags = IB_PORT_AUTO_MIGR_SUP |
+               IB_PORT_CAP_MASK_NOTICE_SUP;
+       ibp->pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA;
+       ibp->pma_counter_select[1] = IB_PMA_PORT_RCV_DATA;
+       ibp->pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS;
+       ibp->pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS;
+       ibp->pma_counter_select[4] = IB_PMA_PORT_XMIT_WAIT;
+
+       RCU_INIT_POINTER(ibp->qp[0], NULL);
+       RCU_INIT_POINTER(ibp->qp[1], NULL);
+}
+
+static void verbs_txreq_kmem_cache_ctor(void *obj)
+{
+       struct verbs_txreq *tx = (struct verbs_txreq *)obj;
+
+       memset(tx, 0, sizeof(*tx));
+}
+
+/**
+ * hfi1_register_ib_device - register our device with the infiniband core
+ * @dd: the device data structure
+ * Return 0 if successful, errno if unsuccessful.
+ */
+int hfi1_register_ib_device(struct hfi1_devdata *dd)
+{
+       struct hfi1_ibdev *dev = &dd->verbs_dev;
+       struct ib_device *ibdev = &dev->ibdev;
+       struct hfi1_pportdata *ppd = dd->pport;
+       unsigned i, lk_tab_size;
+       int ret;
+       size_t lcpysz = IB_DEVICE_NAME_MAX;
+       u16 descq_cnt;
+
+       ret = hfi1_qp_init(dev);
+       if (ret)
+               goto err_qp_init;
+
+       for (i = 0; i < dd->num_pports; i++)
+               init_ibport(ppd + i);
+
+       /* Only need to initialize non-zero fields. */
+       spin_lock_init(&dev->n_pds_lock);
+       spin_lock_init(&dev->n_ahs_lock);
+       spin_lock_init(&dev->n_cqs_lock);
+       spin_lock_init(&dev->n_qps_lock);
+       spin_lock_init(&dev->n_srqs_lock);
+       spin_lock_init(&dev->n_mcast_grps_lock);
+       init_timer(&dev->mem_timer);
+       dev->mem_timer.function = mem_timer;
+       dev->mem_timer.data = (unsigned long) dev;
+
+       /*
+        * The top hfi1_lkey_table_size bits are used to index the
+        * table.  The lower 8 bits can be owned by the user (copied from
+        * the LKEY).  The remaining bits act as a generation number or tag.
+        */
+       spin_lock_init(&dev->lk_table.lock);
+       dev->lk_table.max = 1 << hfi1_lkey_table_size;
+       /* ensure generation is at least 4 bits (keys.c) */
+       if (hfi1_lkey_table_size > MAX_LKEY_TABLE_BITS) {
+               dd_dev_warn(dd, "lkey bits %u too large, reduced to %u\n",
+                             hfi1_lkey_table_size, MAX_LKEY_TABLE_BITS);
+               hfi1_lkey_table_size = MAX_LKEY_TABLE_BITS;
+       }
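+       /*
+        * Sizing sketch (illustrative): with hfi1_lkey_table_size = 16 the
+        * table holds 1 << 16 = 65536 RCU-protected pointers, so the
+        * lk_tab_size computed below is 64K * sizeof(pointer), about
+        * 512 KiB on a 64-bit kernel - hence the vmalloc() rather than
+        * kmalloc().
+        */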
+       lk_tab_size = dev->lk_table.max * sizeof(*dev->lk_table.table);
+       dev->lk_table.table = (struct hfi1_mregion __rcu **)
+               vmalloc(lk_tab_size);
+       if (dev->lk_table.table == NULL) {
+               ret = -ENOMEM;
+               goto err_lk;
+       }
+       RCU_INIT_POINTER(dev->dma_mr, NULL);
+       for (i = 0; i < dev->lk_table.max; i++)
+               RCU_INIT_POINTER(dev->lk_table.table[i], NULL);
+       INIT_LIST_HEAD(&dev->pending_mmaps);
+       spin_lock_init(&dev->pending_lock);
+       seqlock_init(&dev->iowait_lock);
+       dev->mmap_offset = PAGE_SIZE;
+       spin_lock_init(&dev->mmap_offset_lock);
+       INIT_LIST_HEAD(&dev->txwait);
+       INIT_LIST_HEAD(&dev->memwait);
+
+       descq_cnt = sdma_get_descq_cnt();
+
+       /* SLAB_HWCACHE_ALIGN for AHG */
+       dev->verbs_txreq_cache = kmem_cache_create("hfi1_vtxreq_cache",
+                                                  sizeof(struct verbs_txreq),
+                                                  0, SLAB_HWCACHE_ALIGN,
+                                                  verbs_txreq_kmem_cache_ctor);
+       if (!dev->verbs_txreq_cache) {
+               ret = -ENOMEM;
+               goto err_verbs_txreq;
+       }
+
+       /*
+        * The system image GUID is supposed to be the same for all
+        * HFIs in a single system but since there can be other
+        * device types in the system, we can't be sure this is unique.
+        */
+       if (!ib_hfi1_sys_image_guid)
+               ib_hfi1_sys_image_guid = cpu_to_be64(ppd->guid);
+       lcpysz = strlcpy(ibdev->name, class_name(), lcpysz);
+       strlcpy(ibdev->name + lcpysz, "_%d", IB_DEVICE_NAME_MAX - lcpysz);
+       ibdev->owner = THIS_MODULE;
+       ibdev->node_guid = cpu_to_be64(ppd->guid);
+       ibdev->uverbs_abi_ver = HFI1_UVERBS_ABI_VERSION;
+       ibdev->uverbs_cmd_mask =
+               (1ull << IB_USER_VERBS_CMD_GET_CONTEXT)         |
+               (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)        |
+               (1ull << IB_USER_VERBS_CMD_QUERY_PORT)          |
+               (1ull << IB_USER_VERBS_CMD_ALLOC_PD)            |
+               (1ull << IB_USER_VERBS_CMD_DEALLOC_PD)          |
+               (1ull << IB_USER_VERBS_CMD_CREATE_AH)           |
+               (1ull << IB_USER_VERBS_CMD_MODIFY_AH)           |
+               (1ull << IB_USER_VERBS_CMD_QUERY_AH)            |
+               (1ull << IB_USER_VERBS_CMD_DESTROY_AH)          |
+               (1ull << IB_USER_VERBS_CMD_REG_MR)              |
+               (1ull << IB_USER_VERBS_CMD_DEREG_MR)            |
+               (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+               (1ull << IB_USER_VERBS_CMD_CREATE_CQ)           |
+               (1ull << IB_USER_VERBS_CMD_RESIZE_CQ)           |
+               (1ull << IB_USER_VERBS_CMD_DESTROY_CQ)          |
+               (1ull << IB_USER_VERBS_CMD_POLL_CQ)             |
+               (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ)       |
+               (1ull << IB_USER_VERBS_CMD_CREATE_QP)           |
+               (1ull << IB_USER_VERBS_CMD_QUERY_QP)            |
+               (1ull << IB_USER_VERBS_CMD_MODIFY_QP)           |
+               (1ull << IB_USER_VERBS_CMD_DESTROY_QP)          |
+               (1ull << IB_USER_VERBS_CMD_POST_SEND)           |
+               (1ull << IB_USER_VERBS_CMD_POST_RECV)           |
+               (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST)        |
+               (1ull << IB_USER_VERBS_CMD_DETACH_MCAST)        |
+               (1ull << IB_USER_VERBS_CMD_CREATE_SRQ)          |
+               (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ)          |
+               (1ull << IB_USER_VERBS_CMD_QUERY_SRQ)           |
+               (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ)         |
+               (1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV);
+       ibdev->node_type = RDMA_NODE_IB_CA;
+       ibdev->phys_port_cnt = dd->num_pports;
+       ibdev->num_comp_vectors = 1;
+       ibdev->dma_device = &dd->pcidev->dev;
+       ibdev->query_device = query_device;
+       ibdev->modify_device = modify_device;
+       ibdev->query_port = query_port;
+       ibdev->modify_port = modify_port;
+       ibdev->query_pkey = query_pkey;
+       ibdev->query_gid = query_gid;
+       ibdev->alloc_ucontext = alloc_ucontext;
+       ibdev->dealloc_ucontext = dealloc_ucontext;
+       ibdev->alloc_pd = alloc_pd;
+       ibdev->dealloc_pd = dealloc_pd;
+       ibdev->create_ah = create_ah;
+       ibdev->destroy_ah = destroy_ah;
+       ibdev->modify_ah = modify_ah;
+       ibdev->query_ah = query_ah;
+       ibdev->create_srq = hfi1_create_srq;
+       ibdev->modify_srq = hfi1_modify_srq;
+       ibdev->query_srq = hfi1_query_srq;
+       ibdev->destroy_srq = hfi1_destroy_srq;
+       ibdev->create_qp = hfi1_create_qp;
+       ibdev->modify_qp = hfi1_modify_qp;
+       ibdev->query_qp = hfi1_query_qp;
+       ibdev->destroy_qp = hfi1_destroy_qp;
+       ibdev->post_send = post_send;
+       ibdev->post_recv = post_receive;
+       ibdev->post_srq_recv = hfi1_post_srq_receive;
+       ibdev->create_cq = hfi1_create_cq;
+       ibdev->destroy_cq = hfi1_destroy_cq;
+       ibdev->resize_cq = hfi1_resize_cq;
+       ibdev->poll_cq = hfi1_poll_cq;
+       ibdev->req_notify_cq = hfi1_req_notify_cq;
+       ibdev->get_dma_mr = hfi1_get_dma_mr;
+       ibdev->reg_phys_mr = hfi1_reg_phys_mr;
+       ibdev->reg_user_mr = hfi1_reg_user_mr;
+       ibdev->dereg_mr = hfi1_dereg_mr;
+       ibdev->alloc_mr = hfi1_alloc_mr;
+       ibdev->alloc_fast_reg_page_list = hfi1_alloc_fast_reg_page_list;
+       ibdev->free_fast_reg_page_list = hfi1_free_fast_reg_page_list;
+       ibdev->alloc_fmr = hfi1_alloc_fmr;
+       ibdev->map_phys_fmr = hfi1_map_phys_fmr;
+       ibdev->unmap_fmr = hfi1_unmap_fmr;
+       ibdev->dealloc_fmr = hfi1_dealloc_fmr;
+       ibdev->attach_mcast = hfi1_multicast_attach;
+       ibdev->detach_mcast = hfi1_multicast_detach;
+       ibdev->process_mad = hfi1_process_mad;
+       ibdev->mmap = hfi1_mmap;
+       ibdev->dma_ops = &hfi1_dma_mapping_ops;
+       ibdev->get_port_immutable = port_immutable;
+
+       strncpy(ibdev->node_desc, init_utsname()->nodename,
+               sizeof(ibdev->node_desc));
+
+       ret = ib_register_device(ibdev, hfi1_create_port_files);
+       if (ret)
+               goto err_reg;
+
+       ret = hfi1_create_agents(dev);
+       if (ret)
+               goto err_agents;
+
+       ret = hfi1_verbs_register_sysfs(dd);
+       if (ret)
+               goto err_class;
+
+       goto bail;
+
+err_class:
+       hfi1_free_agents(dev);
+err_agents:
+       ib_unregister_device(ibdev);
+err_reg:
+err_verbs_txreq:
+       kmem_cache_destroy(dev->verbs_txreq_cache);
+       vfree(dev->lk_table.table);
+err_lk:
+       hfi1_qp_exit(dev);
+err_qp_init:
+       dd_dev_err(dd, "cannot register verbs: %d!\n", -ret);
+bail:
+       return ret;
+}
+
+void hfi1_unregister_ib_device(struct hfi1_devdata *dd)
+{
+       struct hfi1_ibdev *dev = &dd->verbs_dev;
+       struct ib_device *ibdev = &dev->ibdev;
+
+       hfi1_verbs_unregister_sysfs(dd);
+
+       hfi1_free_agents(dev);
+
+       ib_unregister_device(ibdev);
+
+       if (!list_empty(&dev->txwait))
+               dd_dev_err(dd, "txwait list not empty!\n");
+       if (!list_empty(&dev->memwait))
+               dd_dev_err(dd, "memwait list not empty!\n");
+       if (dev->dma_mr)
+               dd_dev_err(dd, "DMA MR not NULL!\n");
+
+       hfi1_qp_exit(dev);
+       del_timer_sync(&dev->mem_timer);
+       kmem_cache_destroy(dev->verbs_txreq_cache);
+       vfree(dev->lk_table.table);
+}
+
+/*
+ * This must be called with s_lock held.
+ */
+void hfi1_schedule_send(struct hfi1_qp *qp)
+{
+       if (hfi1_send_ok(qp)) {
+               struct hfi1_ibport *ibp =
+                       to_iport(qp->ibqp.device, qp->port_num);
+               struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+
+               iowait_schedule(&qp->s_iowait, ppd->hfi1_wq);
+       }
+}
+
+void hfi1_cnp_rcv(struct hfi1_packet *packet)
+{
+       struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data;
+
+       if (packet->qp->ibqp.qp_type == IB_QPT_UC)
+               hfi1_uc_rcv(packet);
+       else if (packet->qp->ibqp.qp_type == IB_QPT_UD)
+               hfi1_ud_rcv(packet);
+       else
+               ibp->n_pkt_drops++;
+}
diff --git a/drivers/staging/rdma/hfi1/verbs.h b/drivers/staging/rdma/hfi1/verbs.h
new file mode 100644 (file)
index 0000000..ed903a9
--- /dev/null
@@ -0,0 +1,1151 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef HFI1_VERBS_H
+#define HFI1_VERBS_H
+
+#include <linux/types.h>
+#include <linux/seqlock.h>
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/kref.h>
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
+#include <linux/completion.h>
+#include <rdma/ib_pack.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_mad.h>
+
+struct hfi1_ctxtdata;
+struct hfi1_pportdata;
+struct hfi1_devdata;
+struct hfi1_packet;
+
+#include "iowait.h"
+
+#define HFI1_MAX_RDMA_ATOMIC     16
+#define HFI1_GUIDS_PER_PORT    5
+
+/*
+ * Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+#define HFI1_UVERBS_ABI_VERSION       2
+
+/*
+ * Define an ib_cq_notify value that is not valid so we know when CQ
+ * notifications are armed.
+ */
+#define IB_CQ_NONE      (IB_CQ_NEXT_COMP + 1)
+
+#define IB_SEQ_NAK     (3 << 29)
+
+/* AETH NAK opcode values */
+#define IB_RNR_NAK                      0x20
+#define IB_NAK_PSN_ERROR                0x60
+#define IB_NAK_INVALID_REQUEST          0x61
+#define IB_NAK_REMOTE_ACCESS_ERROR      0x62
+#define IB_NAK_REMOTE_OPERATIONAL_ERROR 0x63
+#define IB_NAK_INVALID_RD_REQUEST       0x64
+
+/* Flags for checking QP state (see ib_hfi1_state_ops[]) */
+#define HFI1_POST_SEND_OK                0x01
+#define HFI1_POST_RECV_OK                0x02
+#define HFI1_PROCESS_RECV_OK             0x04
+#define HFI1_PROCESS_SEND_OK             0x08
+#define HFI1_PROCESS_NEXT_SEND_OK        0x10
+#define HFI1_FLUSH_SEND                        0x20
+#define HFI1_FLUSH_RECV                        0x40
+#define HFI1_PROCESS_OR_FLUSH_SEND \
+       (HFI1_PROCESS_SEND_OK | HFI1_FLUSH_SEND)
+
+/* IB Performance Manager status values */
+#define IB_PMA_SAMPLE_STATUS_DONE       0x00
+#define IB_PMA_SAMPLE_STATUS_STARTED    0x01
+#define IB_PMA_SAMPLE_STATUS_RUNNING    0x02
+
+/* Mandatory IB performance counter select values. */
+#define IB_PMA_PORT_XMIT_DATA   cpu_to_be16(0x0001)
+#define IB_PMA_PORT_RCV_DATA    cpu_to_be16(0x0002)
+#define IB_PMA_PORT_XMIT_PKTS   cpu_to_be16(0x0003)
+#define IB_PMA_PORT_RCV_PKTS    cpu_to_be16(0x0004)
+#define IB_PMA_PORT_XMIT_WAIT   cpu_to_be16(0x0005)
+
+#define HFI1_VENDOR_IPG                cpu_to_be16(0xFFA0)
+
+#define IB_BTH_REQ_ACK         (1 << 31)
+#define IB_BTH_SOLICITED       (1 << 23)
+#define IB_BTH_MIG_REQ         (1 << 22)
+
+#define IB_GRH_VERSION         6
+#define IB_GRH_VERSION_MASK    0xF
+#define IB_GRH_VERSION_SHIFT   28
+#define IB_GRH_TCLASS_MASK     0xFF
+#define IB_GRH_TCLASS_SHIFT    20
+#define IB_GRH_FLOW_MASK       0xFFFFF
+#define IB_GRH_FLOW_SHIFT      0
+#define IB_GRH_NEXT_HDR                0x1B
+
+#define IB_DEFAULT_GID_PREFIX  cpu_to_be64(0xfe80000000000000ULL)
+
+/* flags passed by hfi1_ib_rcv() */
+enum {
+       HFI1_HAS_GRH = (1 << 0),
+};
+
+struct ib_reth {
+       __be64 vaddr;
+       __be32 rkey;
+       __be32 length;
+} __packed;
+
+struct ib_atomic_eth {
+       __be32 vaddr[2];        /* unaligned so access as 2 32-bit words */
+       __be32 rkey;
+       __be64 swap_data;
+       __be64 compare_data;
+} __packed;
+
+union ib_ehdrs {
+       struct {
+               __be32 deth[2];
+               __be32 imm_data;
+       } ud;
+       struct {
+               struct ib_reth reth;
+               __be32 imm_data;
+       } rc;
+       struct {
+               __be32 aeth;
+               __be32 atomic_ack_eth[2];
+       } at;
+       __be32 imm_data;
+       __be32 aeth;
+       struct ib_atomic_eth atomic_eth;
+}  __packed;
+
+struct hfi1_other_headers {
+       __be32 bth[3];
+       union ib_ehdrs u;
+} __packed;
+
+/*
+ * Note that UD packets with a GRH header are 8+40+12+8 = 68 bytes
+ * long (72 w/ imm_data).  Only the first 56 bytes of the IB header
+ * will be in the eager header buffer.  The remaining 12 or 16 bytes
+ * are in the data buffer.
+ */
+struct hfi1_ib_header {
+       __be16 lrh[4];
+       union {
+               struct {
+                       struct ib_grh grh;
+                       struct hfi1_other_headers oth;
+               } l;
+               struct hfi1_other_headers oth;
+       } u;
+} __packed;
+
+struct ahg_ib_header {
+       struct sdma_engine *sde;
+       u32 ahgdesc[2];
+       u16 tx_flags;
+       u8 ahgcount;
+       u8 ahgidx;
+       struct hfi1_ib_header ibh;
+};
+
+struct hfi1_pio_header {
+       __le64 pbc;
+       struct hfi1_ib_header hdr;
+} __packed;
+
+/*
+ * used to force cacheline alignment for AHG
+ */
+struct tx_pio_header {
+       struct hfi1_pio_header phdr;
+} ____cacheline_aligned;
+
+/*
+ * There is one struct hfi1_mcast for each multicast GID.
+ * All attached QPs are then stored as a list of
+ * struct hfi1_mcast_qp.
+ */
+struct hfi1_mcast_qp {
+       struct list_head list;
+       struct hfi1_qp *qp;
+};
+
+struct hfi1_mcast {
+       struct rb_node rb_node;
+       union ib_gid mgid;
+       struct list_head qp_list;
+       wait_queue_head_t wait;
+       atomic_t refcount;
+       int n_attached;
+};
+
+/* Protection domain */
+struct hfi1_pd {
+       struct ib_pd ibpd;
+       int user;               /* non-zero if created from user space */
+};
+
+/* Address Handle */
+struct hfi1_ah {
+       struct ib_ah ibah;
+       struct ib_ah_attr attr;
+       atomic_t refcount;
+};
+
+/*
+ * This structure is used by hfi1_mmap() to validate an offset
+ * when an mmap() request is made.  The vm_area_struct then uses
+ * this as its vm_private_data.
+ */
+struct hfi1_mmap_info {
+       struct list_head pending_mmaps;
+       struct ib_ucontext *context;
+       void *obj;
+       __u64 offset;
+       struct kref ref;
+       unsigned size;
+};
+
+/*
+ * This structure is used to contain the head pointer, tail pointer,
+ * and completion queue entries as a single memory allocation so
+ * it can be mmap'ed into user space.
+ */
+struct hfi1_cq_wc {
+       u32 head;               /* index of next entry to fill */
+       u32 tail;               /* index of next ib_poll_cq() entry */
+       union {
+               /* these are actually size ibcq.cqe + 1 */
+               struct ib_uverbs_wc uqueue[0];
+               struct ib_wc kqueue[0];
+       };
+};
+
+/*
+ * The completion queue structure.
+ */
+struct hfi1_cq {
+       struct ib_cq ibcq;
+       struct kthread_work comptask;
+       struct hfi1_devdata *dd;
+       spinlock_t lock; /* protect changes in this struct */
+       u8 notify;
+       u8 triggered;
+       struct hfi1_cq_wc *queue;
+       struct hfi1_mmap_info *ip;
+};
+
+/*
+ * A segment is a linear region of low physical memory.
+ * Used by the verbs layer.
+ */
+struct hfi1_seg {
+       void *vaddr;
+       size_t length;
+};
+
+/* The number of hfi1_segs that fit in a page. */
+#define HFI1_SEGSZ     (PAGE_SIZE / sizeof(struct hfi1_seg))
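+/* e.g. 256, assuming a 64-bit kernel with 4 KiB pages (16-byte struct hfi1_seg) */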
+
+struct hfi1_segarray {
+       struct hfi1_seg segs[HFI1_SEGSZ];
+};
+
+struct hfi1_mregion {
+       struct ib_pd *pd;       /* shares refcnt of ibmr.pd */
+       u64 user_base;          /* User's address for this region */
+       u64 iova;               /* IB start address of this region */
+       size_t length;
+       u32 lkey;
+       u32 offset;             /* offset (bytes) to start of region */
+       int access_flags;
+       u32 max_segs;           /* number of hfi1_segs in all the arrays */
+       u32 mapsz;              /* size of the map array */
+       u8  page_shift;         /* 0 - non-uniform or non-power-of-2 sizes */
+       u8  lkey_published;     /* in global table */
+       struct completion comp; /* complete when refcount goes to zero */
+       atomic_t refcount;
+       struct hfi1_segarray *map[0];    /* the segments */
+};
+
+/*
+ * These keep track of the copy progress within a memory region.
+ * Used by the verbs layer.
+ */
+struct hfi1_sge {
+       struct hfi1_mregion *mr;
+       void *vaddr;            /* kernel virtual address of segment */
+       u32 sge_length;         /* length of the SGE */
+       u32 length;             /* remaining length of the segment */
+       u16 m;                  /* current index: mr->map[m] */
+       u16 n;                  /* current index: mr->map[m]->segs[n] */
+};
+
+/* Memory region */
+struct hfi1_mr {
+       struct ib_mr ibmr;
+       struct ib_umem *umem;
+       struct hfi1_mregion mr;  /* must be last */
+};
+
+/*
+ * Send work request queue entry.
+ * The size of the sg_list is determined when the QP is created and stored
+ * in qp->s_max_sge.
+ */
+struct hfi1_swqe {
+       struct ib_send_wr wr;   /* don't use wr.sg_list */
+       u32 psn;                /* first packet sequence number */
+       u32 lpsn;               /* last packet sequence number */
+       u32 ssn;                /* send sequence number */
+       u32 length;             /* total length of data in sg_list */
+       struct hfi1_sge sg_list[0];
+};
+
+/*
+ * Receive work request queue entry.
+ * The size of the sg_list is determined when the QP (or SRQ) is created
+ * and stored in qp->r_rq.max_sge (or srq->rq.max_sge).
+ */
+struct hfi1_rwqe {
+       u64 wr_id;
+       u8 num_sge;
+       struct ib_sge sg_list[0];
+};
+
+/*
+ * This structure is used to contain the head pointer, tail pointer,
+ * and receive work queue entries as a single memory allocation so
+ * it can be mmap'ed into user space.
+ * Note that the wq array elements are variable size so you can't
+ * just index into the array to get the N'th element;
+ * use get_rwqe_ptr() instead.
+ */
+struct hfi1_rwq {
+       u32 head;               /* new work requests posted to the head */
+       u32 tail;               /* receive WQEs are pulled from here */
+       struct hfi1_rwqe wq[0];
+};
+
+struct hfi1_rq {
+       struct hfi1_rwq *wq;
+       u32 size;               /* size of RWQE array */
+       u8 max_sge;
+       /* protect changes in this struct */
+       spinlock_t lock ____cacheline_aligned_in_smp;
+};
+
+struct hfi1_srq {
+       struct ib_srq ibsrq;
+       struct hfi1_rq rq;
+       struct hfi1_mmap_info *ip;
+       /* send signal when number of RWQEs < limit */
+       u32 limit;
+};
+
+struct hfi1_sge_state {
+       struct hfi1_sge *sg_list;      /* next SGE to be used if any */
+       struct hfi1_sge sge;   /* progress state for the current SGE */
+       u32 total_len;
+       u8 num_sge;
+};
+
+/*
+ * This structure holds the information that the send tasklet needs
+ * to send a RDMA read response or atomic operation.
+ */
+struct hfi1_ack_entry {
+       u8 opcode;
+       u8 sent;
+       u32 psn;
+       u32 lpsn;
+       union {
+               struct hfi1_sge rdma_sge;
+               u64 atomic_data;
+       };
+};
+
+/*
+ * Variables prefixed with s_ are for the requester (sender).
+ * Variables prefixed with r_ are for the responder (receiver).
+ * Variables prefixed with ack_ are for responder replies.
+ *
+ * Common variables are protected by both r_rq.lock and s_lock, taken in that
+ * order, which only happens in modify_qp() or when changing the QP 'state'.
+ */
+struct hfi1_qp {
+       struct ib_qp ibqp;
+       /* read mostly fields above and below */
+       struct ib_ah_attr remote_ah_attr;
+       struct ib_ah_attr alt_ah_attr;
+       struct hfi1_qp __rcu *next;           /* link list for QPN hash table */
+       struct hfi1_swqe *s_wq;  /* send work queue */
+       struct hfi1_mmap_info *ip;
+       struct ahg_ib_header *s_hdr;     /* next packet header to send */
+       u8 s_sc;                        /* SC[0..4] for next packet */
+       unsigned long timeout_jiffies;  /* computed from timeout */
+
+       enum ib_mtu path_mtu;
+       int srate_mbps;         /* s_srate (below) converted to Mbit/s */
+       u32 remote_qpn;
+       u32 pmtu;               /* decoded from path_mtu */
+       u32 qkey;               /* QKEY for this QP (for UD or RD) */
+       u32 s_size;             /* send work queue size */
+       u32 s_rnr_timeout;      /* number of milliseconds for RNR timeout */
+       u32 s_ahgpsn;           /* set to the psn in the copy of the header */
+
+       u8 state;               /* QP state */
+       u8 allowed_ops;         /* high order bits of allowed opcodes */
+       u8 qp_access_flags;
+       u8 alt_timeout;         /* Alternate path timeout for this QP */
+       u8 timeout;             /* Timeout for this QP */
+       u8 s_srate;
+       u8 s_mig_state;
+       u8 port_num;
+       u8 s_pkey_index;        /* PKEY index to use */
+       u8 s_alt_pkey_index;    /* Alternate path PKEY index to use */
+       u8 r_max_rd_atomic;     /* max number of RDMA read/atomic to receive */
+       u8 s_max_rd_atomic;     /* max number of RDMA read/atomic to send */
+       u8 s_retry_cnt;         /* number of times to retry */
+       u8 s_rnr_retry_cnt;
+       u8 r_min_rnr_timer;     /* retry timeout value for RNR NAKs */
+       u8 s_max_sge;           /* size of s_wq->sg_list */
+       u8 s_draining;
+
+       /* start of read/write fields */
+       atomic_t refcount ____cacheline_aligned_in_smp;
+       wait_queue_head_t wait;
+
+
+       struct hfi1_ack_entry s_ack_queue[HFI1_MAX_RDMA_ATOMIC + 1]
+               ____cacheline_aligned_in_smp;
+       struct hfi1_sge_state s_rdma_read_sge;
+
+       spinlock_t r_lock ____cacheline_aligned_in_smp;      /* used for APM */
+       unsigned long r_aflags;
+       u64 r_wr_id;            /* ID for current receive WQE */
+       u32 r_ack_psn;          /* PSN for next ACK or atomic ACK */
+       u32 r_len;              /* total length of r_sge */
+       u32 r_rcv_len;          /* receive data len processed */
+       u32 r_psn;              /* expected rcv packet sequence number */
+       u32 r_msn;              /* message sequence number */
+
+       u8 r_state;             /* opcode of last packet received */
+       u8 r_flags;
+       u8 r_head_ack_queue;    /* index into s_ack_queue[] */
+
+       struct list_head rspwait;       /* link for waiting to respond */
+
+       struct hfi1_sge_state r_sge;     /* current receive data */
+       struct hfi1_rq r_rq;             /* receive work queue */
+
+       spinlock_t s_lock ____cacheline_aligned_in_smp;
+       struct hfi1_sge_state *s_cur_sge;
+       u32 s_flags;
+       struct hfi1_swqe *s_wqe;
+       struct hfi1_sge_state s_sge;     /* current send request data */
+       struct hfi1_mregion *s_rdma_mr;
+       struct sdma_engine *s_sde; /* current sde */
+       u32 s_cur_size;         /* size of send packet in bytes */
+       u32 s_len;              /* total length of s_sge */
+       u32 s_rdma_read_len;    /* total length of s_rdma_read_sge */
+       u32 s_next_psn;         /* PSN for next request */
+       u32 s_last_psn;         /* last response PSN processed */
+       u32 s_sending_psn;      /* lowest PSN that is being sent */
+       u32 s_sending_hpsn;     /* highest PSN that is being sent */
+       u32 s_psn;              /* current packet sequence number */
+       u32 s_ack_rdma_psn;     /* PSN for sending RDMA read responses */
+       u32 s_ack_psn;          /* PSN for acking sends and RDMA writes */
+       u32 s_head;             /* new entries added here */
+       u32 s_tail;             /* next entry to process */
+       u32 s_cur;              /* current work queue entry */
+       u32 s_acked;            /* last un-ACK'ed entry */
+       u32 s_last;             /* last completed entry */
+       u32 s_ssn;              /* SSN of tail entry */
+       u32 s_lsn;              /* limit sequence number (credit) */
+       u16 s_hdrwords;         /* size of s_hdr in 32 bit words */
+       u16 s_rdma_ack_cnt;
+       s8 s_ahgidx;
+       u8 s_state;             /* opcode of last packet sent */
+       u8 s_ack_state;         /* opcode of packet to ACK */
+       u8 s_nak_state;         /* non-zero if NAK is pending */
+       u8 r_nak_state;         /* non-zero if NAK is pending */
+       u8 s_retry;             /* requester retry counter */
+       u8 s_rnr_retry;         /* requester RNR retry counter */
+       u8 s_num_rd_atomic;     /* number of RDMA read/atomic pending */
+       u8 s_tail_ack_queue;    /* index into s_ack_queue[] */
+
+       struct hfi1_sge_state s_ack_rdma_sge;
+       struct timer_list s_timer;
+
+       struct iowait s_iowait;
+
+       struct hfi1_sge r_sg_list[0] /* verified SGEs */
+               ____cacheline_aligned_in_smp;
+};
+
+/*
+ * Atomic bit definitions for r_aflags.
+ */
+#define HFI1_R_WRID_VALID        0
+#define HFI1_R_REWIND_SGE        1
+
+/*
+ * Bit definitions for r_flags.
+ */
+#define HFI1_R_REUSE_SGE 0x01
+#define HFI1_R_RDMAR_SEQ 0x02
+#define HFI1_R_RSP_NAK   0x04
+#define HFI1_R_RSP_SEND  0x08
+#define HFI1_R_COMM_EST  0x10
+
+/*
+ * Bit definitions for s_flags.
+ *
+ * HFI1_S_SIGNAL_REQ_WR - set if QP send WRs contain completion signaled
+ * HFI1_S_BUSY - send tasklet is processing the QP
+ * HFI1_S_TIMER - the RC retry timer is active
+ * HFI1_S_ACK_PENDING - an ACK is waiting to be sent after RDMA read/atomics
+ * HFI1_S_WAIT_FENCE - waiting for all prior RDMA read or atomic SWQEs
+ *                         to complete before processing the next SWQE
+ * HFI1_S_WAIT_RDMAR - waiting for a RDMA read or atomic SWQE to complete
+ *                         before processing the next SWQE
+ * HFI1_S_WAIT_RNR - waiting for RNR timeout
+ * HFI1_S_WAIT_SSN_CREDIT - waiting for RC credits to process next SWQE
+ * HFI1_S_WAIT_DMA - waiting for send DMA queue to drain before generating
+ *                  next send completion entry not via send DMA
+ * HFI1_S_WAIT_PIO - waiting for a send buffer to be available
+ * HFI1_S_WAIT_TX - waiting for a struct verbs_txreq to be available
+ * HFI1_S_WAIT_DMA_DESC - waiting for DMA descriptors to be available
+ * HFI1_S_WAIT_KMEM - waiting for kernel memory to be available
+ * HFI1_S_WAIT_PSN - waiting for a packet to exit the send DMA queue
+ * HFI1_S_WAIT_ACK - waiting for an ACK packet before sending more requests
+ * HFI1_S_SEND_ONE - send one packet, request ACK, then wait for ACK
+ * HFI1_S_ECN - a BECN was queued to the send engine
+ */
+#define HFI1_S_SIGNAL_REQ_WR   0x0001
+#define HFI1_S_BUSY            0x0002
+#define HFI1_S_TIMER           0x0004
+#define HFI1_S_RESP_PENDING    0x0008
+#define HFI1_S_ACK_PENDING     0x0010
+#define HFI1_S_WAIT_FENCE      0x0020
+#define HFI1_S_WAIT_RDMAR      0x0040
+#define HFI1_S_WAIT_RNR                0x0080
+#define HFI1_S_WAIT_SSN_CREDIT 0x0100
+#define HFI1_S_WAIT_DMA                0x0200
+#define HFI1_S_WAIT_PIO                0x0400
+#define HFI1_S_WAIT_TX         0x0800
+#define HFI1_S_WAIT_DMA_DESC   0x1000
+#define HFI1_S_WAIT_KMEM               0x2000
+#define HFI1_S_WAIT_PSN                0x4000
+#define HFI1_S_WAIT_ACK                0x8000
+#define HFI1_S_SEND_ONE                0x10000
+#define HFI1_S_UNLIMITED_CREDIT        0x20000
+#define HFI1_S_AHG_VALID               0x40000
+#define HFI1_S_AHG_CLEAR               0x80000
+#define HFI1_S_ECN             0x100000
+
+/*
+ * Wait flags that would prevent any packet type from being sent.
+ */
+#define HFI1_S_ANY_WAIT_IO (HFI1_S_WAIT_PIO | HFI1_S_WAIT_TX | \
+       HFI1_S_WAIT_DMA_DESC | HFI1_S_WAIT_KMEM)
+
+/*
+ * Wait flags that would prevent send work requests from making progress.
+ */
+#define HFI1_S_ANY_WAIT_SEND (HFI1_S_WAIT_FENCE | HFI1_S_WAIT_RDMAR | \
+       HFI1_S_WAIT_RNR | HFI1_S_WAIT_SSN_CREDIT | HFI1_S_WAIT_DMA | \
+       HFI1_S_WAIT_PSN | HFI1_S_WAIT_ACK)
+
+#define HFI1_S_ANY_WAIT (HFI1_S_ANY_WAIT_IO | HFI1_S_ANY_WAIT_SEND)
+
+#define HFI1_PSN_CREDIT  16
+
+/*
+ * Since struct hfi1_swqe is not a fixed size, we can't simply index into
+ * struct hfi1_qp.s_wq.  This function does the array index computation.
+ */
+static inline struct hfi1_swqe *get_swqe_ptr(struct hfi1_qp *qp,
+                                            unsigned n)
+{
+       return (struct hfi1_swqe *)((char *)qp->s_wq +
+                                    (sizeof(struct hfi1_swqe) +
+                                     qp->s_max_sge *
+                                     sizeof(struct hfi1_sge)) * n);
+}
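+/*
+ * For example, with s_max_sge == 4 each s_wq entry occupies
+ * sizeof(struct hfi1_swqe) + 4 * sizeof(struct hfi1_sge) bytes, and
+ * get_swqe_ptr() scales the index n by that per-entry stride.
+ */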
+
+/*
+ * Since struct hfi1_rwqe is not a fixed size, we can't simply index into
+ * struct hfi1_rwq.wq.  This function does the array index computation.
+ */
+static inline struct hfi1_rwqe *get_rwqe_ptr(struct hfi1_rq *rq, unsigned n)
+{
+       return (struct hfi1_rwqe *)
+               ((char *) rq->wq->wq +
+                (sizeof(struct hfi1_rwqe) +
+                 rq->max_sge * sizeof(struct ib_sge)) * n);
+}
+
+#define MAX_LKEY_TABLE_BITS 23
+
+struct hfi1_lkey_table {
+       spinlock_t lock; /* protect changes in this struct */
+       u32 next;               /* next unused index (speeds search) */
+       u32 gen;                /* generation count */
+       u32 max;                /* size of the table */
+       struct hfi1_mregion __rcu **table;
+};
+
+struct hfi1_opcode_stats {
+       u64 n_packets;          /* number of packets */
+       u64 n_bytes;            /* total number of bytes */
+};
+
+struct hfi1_opcode_stats_perctx {
+       struct hfi1_opcode_stats stats[256];
+};
+
+static inline void inc_opstats(
+       u32 tlen,
+       struct hfi1_opcode_stats *stats)
+{
+#ifdef CONFIG_DEBUG_FS
+       stats->n_bytes += tlen;
+       stats->n_packets++;
+#endif
+}
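+/* Without CONFIG_DEBUG_FS, inc_opstats() compiles away to an empty function. */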
+
+struct hfi1_ibport {
+       struct hfi1_qp __rcu *qp[2];
+       struct ib_mad_agent *send_agent;        /* agent for SMI (traps) */
+       struct hfi1_ah *sm_ah;
+       struct hfi1_ah *smi_ah;
+       struct rb_root mcast_tree;
+       spinlock_t lock;                /* protect changes in this struct */
+
+       /* non-zero when timer is set */
+       unsigned long mkey_lease_timeout;
+       unsigned long trap_timeout;
+       __be64 gid_prefix;      /* in network order */
+       __be64 mkey;
+       __be64 guids[HFI1_GUIDS_PER_PORT - 1];  /* writable GUIDs */
+       u64 tid;                /* TID for traps */
+       u64 n_rc_resends;
+       u64 n_seq_naks;
+       u64 n_rdma_seq;
+       u64 n_rnr_naks;
+       u64 n_other_naks;
+       u64 n_loop_pkts;
+       u64 n_pkt_drops;
+       u64 n_vl15_dropped;
+       u64 n_rc_timeouts;
+       u64 n_dmawait;
+       u64 n_unaligned;
+       u64 n_rc_dupreq;
+       u64 n_rc_seqnak;
+
+       /* Hot-path per CPU counters to avoid cacheline trading to update */
+       u64 z_rc_acks;
+       u64 z_rc_qacks;
+       u64 z_rc_delayed_comp;
+       u64 __percpu *rc_acks;
+       u64 __percpu *rc_qacks;
+       u64 __percpu *rc_delayed_comp;
+
+       u32 port_cap_flags;
+       u32 pma_sample_start;
+       u32 pma_sample_interval;
+       __be16 pma_counter_select[5];
+       u16 pma_tag;
+       u16 pkey_violations;
+       u16 qkey_violations;
+       u16 mkey_violations;
+       u16 mkey_lease_period;
+       u16 sm_lid;
+       u16 repress_traps;
+       u8 sm_sl;
+       u8 mkeyprot;
+       u8 subnet_timeout;
+       u8 vl_high_limit;
+       /* the first 16 entries are sl_to_vl for !OPA */
+       u8 sl_to_sc[32];
+       u8 sc_to_sl[32];
+};
+
+
+struct hfi1_qp_ibdev;
+struct hfi1_ibdev {
+       struct ib_device ibdev;
+       struct list_head pending_mmaps;
+       spinlock_t mmap_offset_lock; /* protect mmap_offset */
+       u32 mmap_offset;
+       struct hfi1_mregion __rcu *dma_mr;
+
+       struct hfi1_qp_ibdev *qp_dev;
+
+       /* QP numbers are shared by all IB ports */
+       struct hfi1_lkey_table lk_table;
+       /* protect wait lists */
+       seqlock_t iowait_lock;
+       struct list_head txwait;        /* list for wait verbs_txreq */
+       struct list_head memwait;       /* list for wait kernel memory */
+       struct list_head txreq_free;
+       struct kmem_cache *verbs_txreq_cache;
+       struct timer_list mem_timer;
+
+       /* other waiters */
+       spinlock_t pending_lock;
+
+       u64 n_piowait;
+       u64 n_txwait;
+       u64 n_kmem_wait;
+
+       u32 n_pds_allocated;    /* number of PDs allocated for device */
+       spinlock_t n_pds_lock;
+       u32 n_ahs_allocated;    /* number of AHs allocated for device */
+       spinlock_t n_ahs_lock;
+       u32 n_cqs_allocated;    /* number of CQs allocated for device */
+       spinlock_t n_cqs_lock;
+       u32 n_qps_allocated;    /* number of QPs allocated for device */
+       spinlock_t n_qps_lock;
+       u32 n_srqs_allocated;   /* number of SRQs allocated for device */
+       spinlock_t n_srqs_lock;
+       u32 n_mcast_grps_allocated; /* number of mcast groups allocated */
+       spinlock_t n_mcast_grps_lock;
+#ifdef CONFIG_DEBUG_FS
+       /* per HFI debugfs */
+       struct dentry *hfi1_ibdev_dbg;
+       /* per HFI symlinks to above */
+       struct dentry *hfi1_ibdev_link;
+#endif
+};
+
+struct hfi1_verbs_counters {
+       u64 symbol_error_counter;
+       u64 link_error_recovery_counter;
+       u64 link_downed_counter;
+       u64 port_rcv_errors;
+       u64 port_rcv_remphys_errors;
+       u64 port_xmit_discards;
+       u64 port_xmit_data;
+       u64 port_rcv_data;
+       u64 port_xmit_packets;
+       u64 port_rcv_packets;
+       u32 local_link_integrity_errors;
+       u32 excessive_buffer_overrun_errors;
+       u32 vl15_dropped;
+};
+
+static inline struct hfi1_mr *to_imr(struct ib_mr *ibmr)
+{
+       return container_of(ibmr, struct hfi1_mr, ibmr);
+}
+
+static inline struct hfi1_pd *to_ipd(struct ib_pd *ibpd)
+{
+       return container_of(ibpd, struct hfi1_pd, ibpd);
+}
+
+static inline struct hfi1_ah *to_iah(struct ib_ah *ibah)
+{
+       return container_of(ibah, struct hfi1_ah, ibah);
+}
+
+static inline struct hfi1_cq *to_icq(struct ib_cq *ibcq)
+{
+       return container_of(ibcq, struct hfi1_cq, ibcq);
+}
+
+static inline struct hfi1_srq *to_isrq(struct ib_srq *ibsrq)
+{
+       return container_of(ibsrq, struct hfi1_srq, ibsrq);
+}
+
+static inline struct hfi1_qp *to_iqp(struct ib_qp *ibqp)
+{
+       return container_of(ibqp, struct hfi1_qp, ibqp);
+}
+
+static inline struct hfi1_ibdev *to_idev(struct ib_device *ibdev)
+{
+       return container_of(ibdev, struct hfi1_ibdev, ibdev);
+}
+
+/*
+ * Send if not busy or waiting for I/O and either
+ * a RC response is pending or we can process send work requests.
+ */
+static inline int hfi1_send_ok(struct hfi1_qp *qp)
+{
+       return !(qp->s_flags & (HFI1_S_BUSY | HFI1_S_ANY_WAIT_IO)) &&
+               (qp->s_hdrwords || (qp->s_flags & HFI1_S_RESP_PENDING) ||
+                !(qp->s_flags & HFI1_S_ANY_WAIT_SEND));
+}
+
+/*
+ * This must be called with s_lock held.
+ */
+void hfi1_schedule_send(struct hfi1_qp *qp);
+void hfi1_bad_pqkey(struct hfi1_ibport *ibp, __be16 trap_num, u32 key, u32 sl,
+                   u32 qp1, u32 qp2, __be16 lid1, __be16 lid2);
+void hfi1_cap_mask_chg(struct hfi1_ibport *ibp);
+void hfi1_sys_guid_chg(struct hfi1_ibport *ibp);
+void hfi1_node_desc_chg(struct hfi1_ibport *ibp);
+int hfi1_process_mad(struct ib_device *ibdev, int mad_flags, u8 port,
+                    const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+                    const struct ib_mad_hdr *in_mad, size_t in_mad_size,
+                    struct ib_mad_hdr *out_mad, size_t *out_mad_size,
+                    u16 *out_mad_pkey_index);
+int hfi1_create_agents(struct hfi1_ibdev *dev);
+void hfi1_free_agents(struct hfi1_ibdev *dev);
+
+/*
+ * The PSN_MASK and PSN_SHIFT allow for
+ * 1) comparing two PSNs
+ * 2) returning the PSN with any upper bits masked
+ * 3) returning the difference between two PSNs
+ *
+ * The number of significant bits in the PSN must
+ * necessarily be at least one bit less than
+ * the container holding the PSN.
+ */
+#ifndef CONFIG_HFI1_VERBS_31BIT_PSN
+#define PSN_MASK 0xFFFFFF
+#define PSN_SHIFT 8
+#else
+#define PSN_MASK 0x7FFFFFFF
+#define PSN_SHIFT 1
+#endif
+#define PSN_MODIFY_MASK 0xFFFFFF
+
+/* Number of bits to pay attention to in the opcode for checking qp type */
+#define OPCODE_QP_MASK 0xE0
+
+/*
+ * Compare the lower 24 bits of the msn values.
+ * Returns an integer <, ==, or > than zero.
+ */
+static inline int cmp_msn(u32 a, u32 b)
+{
+       return (((int) a) - ((int) b)) << 8;
+}
+
+/*
+ * Compare two PSNs
+ * Returns an integer <, ==, or > than zero.
+ */
+static inline int cmp_psn(u32 a, u32 b)
+{
+       return (((int) a) - ((int) b)) << PSN_SHIFT;
+}
+
+/*
+ * Return masked PSN
+ */
+static inline u32 mask_psn(u32 a)
+{
+       return a & PSN_MASK;
+}
+
+/*
+ * Return delta between two PSNs
+ */
+static inline u32 delta_psn(u32 a, u32 b)
+{
+       return (((int)a - (int)b) << PSN_SHIFT) >> PSN_SHIFT;
+}
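+/*
+ * The left shift in cmp_msn(), cmp_psn() and delta_psn() discards the
+ * bits above the significant width (24 bits; 31 bits for PSNs with
+ * CONFIG_HFI1_VERBS_31BIT_PSN), making the comparisons circular.
+ * For example, with 24-bit PSNs cmp_psn(0x000001, 0xfffffe) is positive
+ * and delta_psn(0x000001, 0xfffffe) is 3: PSN 1 comes after PSN
+ * 0xfffffe once the sequence wraps.
+ */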
+
+struct hfi1_mcast *hfi1_mcast_find(struct hfi1_ibport *ibp, union ib_gid *mgid);
+
+int hfi1_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
+
+int hfi1_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
+
+int hfi1_mcast_tree_empty(struct hfi1_ibport *ibp);
+
+struct verbs_txreq;
+void hfi1_put_txreq(struct verbs_txreq *tx);
+
+int hfi1_verbs_send(struct hfi1_qp *qp, struct ahg_ib_header *ahdr,
+                   u32 hdrwords, struct hfi1_sge_state *ss, u32 len);
+
+void hfi1_copy_sge(struct hfi1_sge_state *ss, void *data, u32 length,
+                  int release);
+
+void hfi1_skip_sge(struct hfi1_sge_state *ss, u32 length, int release);
+
+void hfi1_cnp_rcv(struct hfi1_packet *packet);
+
+void hfi1_uc_rcv(struct hfi1_packet *packet);
+
+void hfi1_rc_rcv(struct hfi1_packet *packet);
+
+void hfi1_rc_hdrerr(
+       struct hfi1_ctxtdata *rcd,
+       struct hfi1_ib_header *hdr,
+       u32 rcv_flags,
+       struct hfi1_qp *qp);
+
+u8 ah_to_sc(struct ib_device *ibdev, struct ib_ah_attr *ah_attr);
+
+int hfi1_check_ah(struct ib_device *ibdev, struct ib_ah_attr *ah_attr);
+
+struct ib_ah *hfi1_create_qp0_ah(struct hfi1_ibport *ibp, u16 dlid);
+
+void hfi1_rc_rnr_retry(unsigned long arg);
+
+void hfi1_rc_send_complete(struct hfi1_qp *qp, struct hfi1_ib_header *hdr);
+
+void hfi1_rc_error(struct hfi1_qp *qp, enum ib_wc_status err);
+
+void hfi1_ud_rcv(struct hfi1_packet *packet);
+
+int hfi1_lookup_pkey_idx(struct hfi1_ibport *ibp, u16 pkey);
+
+int hfi1_alloc_lkey(struct hfi1_mregion *mr, int dma_region);
+
+void hfi1_free_lkey(struct hfi1_mregion *mr);
+
+int hfi1_lkey_ok(struct hfi1_lkey_table *rkt, struct hfi1_pd *pd,
+                struct hfi1_sge *isge, struct ib_sge *sge, int acc);
+
+int hfi1_rkey_ok(struct hfi1_qp *qp, struct hfi1_sge *sge,
+                u32 len, u64 vaddr, u32 rkey, int acc);
+
+int hfi1_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+                         struct ib_recv_wr **bad_wr);
+
+struct ib_srq *hfi1_create_srq(struct ib_pd *ibpd,
+                              struct ib_srq_init_attr *srq_init_attr,
+                              struct ib_udata *udata);
+
+int hfi1_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+                   enum ib_srq_attr_mask attr_mask,
+                   struct ib_udata *udata);
+
+int hfi1_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr);
+
+int hfi1_destroy_srq(struct ib_srq *ibsrq);
+
+int hfi1_cq_init(struct hfi1_devdata *dd);
+
+void hfi1_cq_exit(struct hfi1_devdata *dd);
+
+void hfi1_cq_enter(struct hfi1_cq *cq, struct ib_wc *entry, int sig);
+
+int hfi1_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry);
+
+struct ib_cq *hfi1_create_cq(
+       struct ib_device *ibdev,
+       const struct ib_cq_init_attr *attr,
+       struct ib_ucontext *context,
+       struct ib_udata *udata);
+
+int hfi1_destroy_cq(struct ib_cq *ibcq);
+
+int hfi1_req_notify_cq(
+       struct ib_cq *ibcq,
+       enum ib_cq_notify_flags notify_flags);
+
+int hfi1_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata);
+
+struct ib_mr *hfi1_get_dma_mr(struct ib_pd *pd, int acc);
+
+struct ib_mr *hfi1_reg_phys_mr(struct ib_pd *pd,
+                              struct ib_phys_buf *buffer_list,
+                              int num_phys_buf, int acc, u64 *iova_start);
+
+struct ib_mr *hfi1_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+                              u64 virt_addr, int mr_access_flags,
+                              struct ib_udata *udata);
+
+int hfi1_dereg_mr(struct ib_mr *ibmr);
+
+struct ib_mr *hfi1_alloc_mr(struct ib_pd *pd,
+                           enum ib_mr_type mr_type,
+                           u32 max_entries);
+
+struct ib_fast_reg_page_list *hfi1_alloc_fast_reg_page_list(
+                               struct ib_device *ibdev, int page_list_len);
+
+void hfi1_free_fast_reg_page_list(struct ib_fast_reg_page_list *pl);
+
+int hfi1_fast_reg_mr(struct hfi1_qp *qp, struct ib_send_wr *wr);
+
+struct ib_fmr *hfi1_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
+                             struct ib_fmr_attr *fmr_attr);
+
+int hfi1_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
+                     int list_len, u64 iova);
+
+int hfi1_unmap_fmr(struct list_head *fmr_list);
+
+int hfi1_dealloc_fmr(struct ib_fmr *ibfmr);
+
+static inline void hfi1_get_mr(struct hfi1_mregion *mr)
+{
+       atomic_inc(&mr->refcount);
+}
+
+static inline void hfi1_put_mr(struct hfi1_mregion *mr)
+{
+       if (unlikely(atomic_dec_and_test(&mr->refcount)))
+               complete(&mr->comp);
+}
+
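+/*
+ * Drop the memory-region reference held by each remaining SGE in an
+ * SGE state, advancing through sg_list as it goes; on return num_sge
+ * is zero and the MRs may be completed/freed by their owners.
+ */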
+static inline void hfi1_put_ss(struct hfi1_sge_state *ss)
+{
+       while (ss->num_sge) {
+               hfi1_put_mr(ss->sge.mr);
+               if (--ss->num_sge)
+                       ss->sge = *ss->sg_list++;
+       }
+}
+
+void hfi1_release_mmap_info(struct kref *ref);
+
+struct hfi1_mmap_info *hfi1_create_mmap_info(struct hfi1_ibdev *dev, u32 size,
+                                            struct ib_ucontext *context,
+                                            void *obj);
+
+void hfi1_update_mmap_info(struct hfi1_ibdev *dev, struct hfi1_mmap_info *ip,
+                          u32 size, void *obj);
+
+int hfi1_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
+
+int hfi1_get_rwqe(struct hfi1_qp *qp, int wr_id_only);
+
+void hfi1_migrate_qp(struct hfi1_qp *qp);
+
+int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct hfi1_ib_header *hdr,
+                      int has_grh, struct hfi1_qp *qp, u32 bth0);
+
+u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr,
+                 struct ib_global_route *grh, u32 hwords, u32 nwords);
+
+void clear_ahg(struct hfi1_qp *qp);
+
+void hfi1_make_ruc_header(struct hfi1_qp *qp, struct hfi1_other_headers *ohdr,
+                         u32 bth0, u32 bth2, int middle);
+
+void hfi1_do_send(struct work_struct *work);
+
+void hfi1_send_complete(struct hfi1_qp *qp, struct hfi1_swqe *wqe,
+                       enum ib_wc_status status);
+
+void hfi1_send_rc_ack(struct hfi1_ctxtdata *, struct hfi1_qp *qp, int is_fecn);
+
+int hfi1_make_rc_req(struct hfi1_qp *qp);
+
+int hfi1_make_uc_req(struct hfi1_qp *qp);
+
+int hfi1_make_ud_req(struct hfi1_qp *qp);
+
+int hfi1_register_ib_device(struct hfi1_devdata *);
+
+void hfi1_unregister_ib_device(struct hfi1_devdata *);
+
+void hfi1_ib_rcv(struct hfi1_packet *packet);
+
+unsigned hfi1_get_npkeys(struct hfi1_devdata *);
+
+int hfi1_verbs_send_dma(struct hfi1_qp *qp, struct ahg_ib_header *hdr,
+                       u32 hdrwords, struct hfi1_sge_state *ss, u32 len,
+                       u32 plen, u32 dwords, u64 pbc);
+
+int hfi1_verbs_send_pio(struct hfi1_qp *qp, struct ahg_ib_header *hdr,
+                       u32 hdrwords, struct hfi1_sge_state *ss, u32 len,
+                       u32 plen, u32 dwords, u64 pbc);
+
+struct send_context *qp_to_send_context(struct hfi1_qp *qp, u8 sc5);
+
+extern const enum ib_wc_opcode ib_hfi1_wc_opcode[];
+
+extern const u8 hdr_len_by_opcode[];
+
+extern const int ib_hfi1_state_ops[];
+
+extern __be64 ib_hfi1_sys_image_guid;    /* in network order */
+
+extern unsigned int hfi1_lkey_table_size;
+
+extern unsigned int hfi1_max_cqes;
+
+extern unsigned int hfi1_max_cqs;
+
+extern unsigned int hfi1_max_qp_wrs;
+
+extern unsigned int hfi1_max_qps;
+
+extern unsigned int hfi1_max_sges;
+
+extern unsigned int hfi1_max_mcast_grps;
+
+extern unsigned int hfi1_max_mcast_qp_attached;
+
+extern unsigned int hfi1_max_srqs;
+
+extern unsigned int hfi1_max_srq_sges;
+
+extern unsigned int hfi1_max_srq_wrs;
+
+extern const u32 ib_hfi1_rnr_table[];
+
+extern struct ib_dma_mapping_ops hfi1_dma_mapping_ops;
+
+#endif                          /* HFI1_VERBS_H */
diff --git a/drivers/staging/rdma/hfi1/verbs_mcast.c b/drivers/staging/rdma/hfi1/verbs_mcast.c
new file mode 100644 (file)
index 0000000..afc6b4c
--- /dev/null
@@ -0,0 +1,385 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/rculist.h>
+
+#include "hfi.h"
+
+/**
+ * mcast_qp_alloc - alloc a struct to link a QP to mcast GID struct
+ * @qp: the QP to link
+ */
+static struct hfi1_mcast_qp *mcast_qp_alloc(struct hfi1_qp *qp)
+{
+       struct hfi1_mcast_qp *mqp;
+
+       mqp = kmalloc(sizeof(*mqp), GFP_KERNEL);
+       if (!mqp)
+               goto bail;
+
+       mqp->qp = qp;
+       atomic_inc(&qp->refcount);
+
+bail:
+       return mqp;
+}
+
+static void mcast_qp_free(struct hfi1_mcast_qp *mqp)
+{
+       struct hfi1_qp *qp = mqp->qp;
+
+       /* Notify hfi1_destroy_qp() if it is waiting. */
+       if (atomic_dec_and_test(&qp->refcount))
+               wake_up(&qp->wait);
+
+       kfree(mqp);
+}
+
+/**
+ * mcast_alloc - allocate the multicast GID structure
+ * @mgid: the multicast GID
+ *
+ * A list of QPs will be attached to this structure.
+ */
+static struct hfi1_mcast *mcast_alloc(union ib_gid *mgid)
+{
+       struct hfi1_mcast *mcast;
+
+       mcast = kmalloc(sizeof(*mcast), GFP_KERNEL);
+       if (!mcast)
+               goto bail;
+
+       mcast->mgid = *mgid;
+       INIT_LIST_HEAD(&mcast->qp_list);
+       init_waitqueue_head(&mcast->wait);
+       atomic_set(&mcast->refcount, 0);
+       mcast->n_attached = 0;
+
+bail:
+       return mcast;
+}
+
+static void mcast_free(struct hfi1_mcast *mcast)
+{
+       struct hfi1_mcast_qp *p, *tmp;
+
+       list_for_each_entry_safe(p, tmp, &mcast->qp_list, list)
+               mcast_qp_free(p);
+
+       kfree(mcast);
+}
+
+/**
+ * hfi1_mcast_find - search the global table for the given multicast GID
+ * @ibp: the IB port structure
+ * @mgid: the multicast GID to search for
+ *
+ * Returns NULL if not found.
+ *
+ * The caller is responsible for decrementing the reference count if found.
+ */
+struct hfi1_mcast *hfi1_mcast_find(struct hfi1_ibport *ibp, union ib_gid *mgid)
+{
+       struct rb_node *n;
+       unsigned long flags;
+       struct hfi1_mcast *mcast;
+
+       spin_lock_irqsave(&ibp->lock, flags);
+       n = ibp->mcast_tree.rb_node;
+       while (n) {
+               int ret;
+
+               mcast = rb_entry(n, struct hfi1_mcast, rb_node);
+
+               ret = memcmp(mgid->raw, mcast->mgid.raw,
+                            sizeof(union ib_gid));
+               if (ret < 0)
+                       n = n->rb_left;
+               else if (ret > 0)
+                       n = n->rb_right;
+               else {
+                       atomic_inc(&mcast->refcount);
+                       spin_unlock_irqrestore(&ibp->lock, flags);
+                       goto bail;
+               }
+       }
+       spin_unlock_irqrestore(&ibp->lock, flags);
+
+       mcast = NULL;
+
+bail:
+       return mcast;
+}
+
+/**
+ * mcast_add - insert mcast GID into table and attach QP struct
+ * @dev: the verbs device (for multicast group accounting)
+ * @ibp: the IB port whose mcast tree the group is inserted into
+ * @mcast: the multicast GID group to insert
+ * @mqp: the QP to attach
+ *
+ * Return zero if both were added.  Return EEXIST if the GID was already in
+ * the table but the QP was added.  Return ESRCH if the QP was already
+ * attached and neither structure was added.
+ */
+static int mcast_add(struct hfi1_ibdev *dev, struct hfi1_ibport *ibp,
+                    struct hfi1_mcast *mcast, struct hfi1_mcast_qp *mqp)
+{
+       struct rb_node **n = &ibp->mcast_tree.rb_node;
+       struct rb_node *pn = NULL;
+       int ret;
+
+       spin_lock_irq(&ibp->lock);
+
+       while (*n) {
+               struct hfi1_mcast *tmcast;
+               struct hfi1_mcast_qp *p;
+
+               pn = *n;
+               tmcast = rb_entry(pn, struct hfi1_mcast, rb_node);
+
+               ret = memcmp(mcast->mgid.raw, tmcast->mgid.raw,
+                            sizeof(union ib_gid));
+               if (ret < 0) {
+                       n = &pn->rb_left;
+                       continue;
+               }
+               if (ret > 0) {
+                       n = &pn->rb_right;
+                       continue;
+               }
+
+               /* Search the QP list to see if this is already there. */
+               list_for_each_entry_rcu(p, &tmcast->qp_list, list) {
+                       if (p->qp == mqp->qp) {
+                               ret = ESRCH;
+                               goto bail;
+                       }
+               }
+               if (tmcast->n_attached == hfi1_max_mcast_qp_attached) {
+                       ret = ENOMEM;
+                       goto bail;
+               }
+
+               tmcast->n_attached++;
+
+               list_add_tail_rcu(&mqp->list, &tmcast->qp_list);
+               ret = EEXIST;
+               goto bail;
+       }
+
+       spin_lock(&dev->n_mcast_grps_lock);
+       if (dev->n_mcast_grps_allocated == hfi1_max_mcast_grps) {
+               spin_unlock(&dev->n_mcast_grps_lock);
+               ret = ENOMEM;
+               goto bail;
+       }
+
+       dev->n_mcast_grps_allocated++;
+       spin_unlock(&dev->n_mcast_grps_lock);
+
+       mcast->n_attached++;
+
+       list_add_tail_rcu(&mqp->list, &mcast->qp_list);
+
+       atomic_inc(&mcast->refcount);
+       rb_link_node(&mcast->rb_node, pn, n);
+       rb_insert_color(&mcast->rb_node, &ibp->mcast_tree);
+
+       ret = 0;
+
+bail:
+       spin_unlock_irq(&ibp->lock);
+
+       return ret;
+}
+
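+/*
+ * Note: mcast_add() reports status as positive values (0, EEXIST, ESRCH,
+ * ENOMEM); hfi1_multicast_attach() below treats EEXIST and ESRCH as
+ * success and converts ENOMEM to -ENOMEM for the caller.
+ */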
+int hfi1_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+       struct hfi1_qp *qp = to_iqp(ibqp);
+       struct hfi1_ibdev *dev = to_idev(ibqp->device);
+       struct hfi1_ibport *ibp;
+       struct hfi1_mcast *mcast;
+       struct hfi1_mcast_qp *mqp;
+       int ret;
+
+       if (ibqp->qp_num <= 1 || qp->state == IB_QPS_RESET) {
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       /*
+        * Allocate data structures since it's better to do this outside of
+        * spin locks and it will most likely be needed.
+        */
+       mcast = mcast_alloc(gid);
+       if (mcast == NULL) {
+               ret = -ENOMEM;
+               goto bail;
+       }
+       mqp = mcast_qp_alloc(qp);
+       if (mqp == NULL) {
+               mcast_free(mcast);
+               ret = -ENOMEM;
+               goto bail;
+       }
+       ibp = to_iport(ibqp->device, qp->port_num);
+       switch (mcast_add(dev, ibp, mcast, mqp)) {
+       case ESRCH:
+               /* Neither was used: OK to attach the same QP twice. */
+               mcast_qp_free(mqp);
+               mcast_free(mcast);
+               break;
+
+       case EEXIST:            /* The mcast wasn't used */
+               mcast_free(mcast);
+               break;
+
+       case ENOMEM:
+               /* Exceeded the maximum number of mcast groups. */
+               mcast_qp_free(mqp);
+               mcast_free(mcast);
+               ret = -ENOMEM;
+               goto bail;
+
+       default:
+               break;
+       }
+
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+int hfi1_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+       struct hfi1_qp *qp = to_iqp(ibqp);
+       struct hfi1_ibdev *dev = to_idev(ibqp->device);
+       struct hfi1_ibport *ibp = to_iport(ibqp->device, qp->port_num);
+       struct hfi1_mcast *mcast = NULL;
+       struct hfi1_mcast_qp *p, *tmp;
+       struct rb_node *n;
+       int last = 0;
+       int ret;
+
+       if (ibqp->qp_num <= 1 || qp->state == IB_QPS_RESET) {
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       spin_lock_irq(&ibp->lock);
+
+       /* Find the GID in the mcast table. */
+       n = ibp->mcast_tree.rb_node;
+       while (1) {
+               if (n == NULL) {
+                       spin_unlock_irq(&ibp->lock);
+                       ret = -EINVAL;
+                       goto bail;
+               }
+
+               mcast = rb_entry(n, struct hfi1_mcast, rb_node);
+               ret = memcmp(gid->raw, mcast->mgid.raw,
+                            sizeof(union ib_gid));
+               if (ret < 0)
+                       n = n->rb_left;
+               else if (ret > 0)
+                       n = n->rb_right;
+               else
+                       break;
+       }
+
+       /* Search the QP list. */
+       list_for_each_entry_safe(p, tmp, &mcast->qp_list, list) {
+               if (p->qp != qp)
+                       continue;
+               /*
+                * We found it, so remove it, but don't poison the forward
+                * link until we are sure there are no list walkers.
+                */
+               list_del_rcu(&p->list);
+               mcast->n_attached--;
+
+               /* If this was the last attached QP, remove the GID too. */
+               if (list_empty(&mcast->qp_list)) {
+                       rb_erase(&mcast->rb_node, &ibp->mcast_tree);
+                       last = 1;
+               }
+               break;
+       }
+
+       spin_unlock_irq(&ibp->lock);
+
+       if (p) {
+               /*
+                * Wait for any list walkers to finish before freeing the
+                * list element.
+                */
+               wait_event(mcast->wait, atomic_read(&mcast->refcount) <= 1);
+               mcast_qp_free(p);
+       }
+       if (last) {
+               atomic_dec(&mcast->refcount);
+               wait_event(mcast->wait, !atomic_read(&mcast->refcount));
+               mcast_free(mcast);
+               spin_lock_irq(&dev->n_mcast_grps_lock);
+               dev->n_mcast_grps_allocated--;
+               spin_unlock_irq(&dev->n_mcast_grps_lock);
+       }
+
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+int hfi1_mcast_tree_empty(struct hfi1_ibport *ibp)
+{
+       return ibp->mcast_tree.rb_node == NULL;
+}
diff --git a/drivers/staging/rdma/ipath/Kconfig b/drivers/staging/rdma/ipath/Kconfig
new file mode 100644 (file)
index 0000000..041ce06
--- /dev/null
@@ -0,0 +1,16 @@
+config INFINIBAND_IPATH
+       tristate "QLogic HTX HCA support"
+       depends on 64BIT && NET && HT_IRQ
+       ---help---
+       This is a driver for the deprecated QLogic Hyper-Transport
+       IB host channel adapter (model QHT7140),
+       including InfiniBand verbs support.  This driver allows these
+       devices to be used with both kernel upper level protocols such
+       as IP-over-InfiniBand as well as with userspace applications
+       (in conjunction with InfiniBand userspace access).
+       For QLogic PCIe QLE based cards, use the QIB driver instead.
+
+       If you have this hardware you will need to boot with PAT disabled
+       on your x86-64 systems; use the nopat kernel parameter.
+
+       Note that this driver will soon be removed entirely from the kernel.
diff --git a/drivers/staging/rdma/ipath/Makefile b/drivers/staging/rdma/ipath/Makefile
new file mode 100644 (file)
index 0000000..4496f28
--- /dev/null
@@ -0,0 +1,37 @@
+ccflags-y := -DIPATH_IDSTR='"QLogic kernel.org driver"' \
+       -DIPATH_KERN_TYPE=0
+
+obj-$(CONFIG_INFINIBAND_IPATH) += ib_ipath.o
+
+ib_ipath-y := \
+       ipath_cq.o \
+       ipath_diag.o \
+       ipath_dma.o \
+       ipath_driver.o \
+       ipath_eeprom.o \
+       ipath_file_ops.o \
+       ipath_fs.o \
+       ipath_init_chip.o \
+       ipath_intr.o \
+       ipath_keys.o \
+       ipath_mad.o \
+       ipath_mmap.o \
+       ipath_mr.o \
+       ipath_qp.o \
+       ipath_rc.o \
+       ipath_ruc.o \
+       ipath_sdma.o \
+       ipath_srq.o \
+       ipath_stats.o \
+       ipath_sysfs.o \
+       ipath_uc.o \
+       ipath_ud.o \
+       ipath_user_pages.o \
+       ipath_user_sdma.o \
+       ipath_verbs_mcast.o \
+       ipath_verbs.o
+
+ib_ipath-$(CONFIG_HT_IRQ) += ipath_iba6110.o
+
+ib_ipath-$(CONFIG_X86_64) += ipath_wc_x86_64.o
+ib_ipath-$(CONFIG_PPC64) += ipath_wc_ppc64.o
diff --git a/drivers/staging/rdma/ipath/TODO b/drivers/staging/rdma/ipath/TODO
new file mode 100644 (file)
index 0000000..cb00158
--- /dev/null
@@ -0,0 +1,5 @@
+The ipath driver has been moved to staging in preparation for its removal in a
+few releases. The driver will be deleted during the 4.6 merge window.
+
+Contact Dennis Dalessandro <dennis.dalessandro@intel.com> and
+Cc: linux-rdma@vger.kernel.org
diff --git a/drivers/staging/rdma/ipath/ipath_common.h b/drivers/staging/rdma/ipath/ipath_common.h
new file mode 100644 (file)
index 0000000..28cfe97
--- /dev/null
@@ -0,0 +1,851 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _IPATH_COMMON_H
+#define _IPATH_COMMON_H
+
+/*
+ * This file contains defines, structures, etc. that are used
+ * to communicate between kernel and user code.
+ */
+
+
+/* This is the IEEE-assigned OUI for QLogic Inc. InfiniPath */
+#define IPATH_SRC_OUI_1 0x00
+#define IPATH_SRC_OUI_2 0x11
+#define IPATH_SRC_OUI_3 0x75
+
+/* version of protocol header (known to chip also). In the long run,
+ * we should be able to generate and accept a range of version numbers;
+ * for now we only accept one, and it's compiled in.
+ */
+#define IPS_PROTO_VERSION 2
+
+/*
+ * These are compile time constants that you may want to enable or disable
+ * if you are trying to debug problems with code or performance.
+ * IPATH_VERBOSE_TRACING define as 1 if you want additional tracing in
+ * fastpath code
+ * IPATH_TRACE_REGWRITES define as 1 if you want register writes to be
+ * traced in fastpath code
+ * _IPATH_TRACING define as 0 if you want to remove all tracing in a
+ * compilation unit
+ * _IPATH_DEBUGGING define as 0 if you want to remove debug prints
+ */
+
+/*
+ * The value in the BTH QP field that InfiniPath uses to differentiate
+ * an infinipath protocol IB packet vs standard IB transport
+ */
+#define IPATH_KD_QP 0x656b79
+
+/*
+ * valid states passed to ipath_set_linkstate() user call
+ */
+#define IPATH_IB_LINKDOWN              0
+#define IPATH_IB_LINKARM               1
+#define IPATH_IB_LINKACTIVE            2
+#define IPATH_IB_LINKDOWN_ONLY         3
+#define IPATH_IB_LINKDOWN_SLEEP                4
+#define IPATH_IB_LINKDOWN_DISABLE      5
+#define IPATH_IB_LINK_LOOPBACK 6 /* enable local loopback */
+#define IPATH_IB_LINK_EXTERNAL 7 /* normal, disable local loopback */
+#define IPATH_IB_LINK_NO_HRTBT 8 /* disable Heartbeat, e.g. for loopback */
+#define IPATH_IB_LINK_HRTBT    9 /* enable heartbeat, normal, non-loopback */
+
+/*
+ * These 3 values (SDR and DDR may be ORed for auto-speed
+ * negotiation) are used for the 3rd argument to path_f_set_ib_cfg
+ * with cmd IPATH_IB_CFG_SPD_ENB, by direct calls or via sysfs.  They
+ * are also the possible values for ipath_link_speed_enabled and active.
+ * The values were chosen to match values used within the IB spec.
+ */
+#define IPATH_IB_SDR 1
+#define IPATH_IB_DDR 2
+
+/*
+ * stats maintained by the driver.  For now, at least, this is global
+ * to all minor devices.
+ */
+struct infinipath_stats {
+       /* number of interrupts taken */
+       __u64 sps_ints;
+       /* number of interrupts for errors */
+       __u64 sps_errints;
+       /* number of errors from chip (not incl. packet errors or CRC) */
+       __u64 sps_errs;
+       /* number of packet errors from chip other than CRC */
+       __u64 sps_pkterrs;
+       /* number of packets with CRC errors (ICRC and VCRC) */
+       __u64 sps_crcerrs;
+       /* number of hardware errors reported (parity, etc.) */
+       __u64 sps_hwerrs;
+       /* number of times IB link changed state unexpectedly */
+       __u64 sps_iblink;
+       __u64 sps_unused; /* was fastrcvint, no longer implemented */
+       /* number of kernel (port0) packets received */
+       __u64 sps_port0pkts;
+       /* number of "ethernet" packets sent by driver */
+       __u64 sps_ether_spkts;
+       /* number of "ethernet" packets received by driver */
+       __u64 sps_ether_rpkts;
+       /* number of SMA packets sent by driver. Obsolete. */
+       __u64 sps_sma_spkts;
+       /* number of SMA packets received by driver. Obsolete. */
+       __u64 sps_sma_rpkts;
+       /* number of times all ports rcvhdrq was full and packet dropped */
+       __u64 sps_hdrqfull;
+       /* number of times all ports egrtid was full and packet dropped */
+       __u64 sps_etidfull;
+       /*
+        * number of times we tried to send from driver, but no pio buffers
+        * avail
+        */
+       __u64 sps_nopiobufs;
+       /* number of ports currently open */
+       __u64 sps_ports;
+       /* list of pkeys (other than default) accepted (0 means not set) */
+       __u16 sps_pkeys[4];
+       __u16 sps_unused16[4]; /* available; maintaining compatible layout */
+       /* number of user ports per chip (not IB ports) */
+       __u32 sps_nports;
+       /* not our interrupt, or already handled */
+       __u32 sps_nullintr;
+       /* max number of packets handled per receive call */
+       __u32 sps_maxpkts_call;
+       /* avg number of packets handled per receive call */
+       __u32 sps_avgpkts_call;
+       /* total number of pages locked */
+       __u64 sps_pagelocks;
+       /* total number of pages unlocked */
+       __u64 sps_pageunlocks;
+       /*
+        * Number of packets dropped in kernel other than errors (ether
+        * packets if ipath not configured, etc.)
+        */
+       __u64 sps_krdrops;
+       __u64 sps_txeparity; /* PIO buffer parity error, recovered */
+       /* pad for future growth */
+       __u64 __sps_pad[45];
+};
+
+/*
+ * These are the status bits readable (in ascii form, 64bit value)
+ * from the "status" sysfs file.
+ */
+#define IPATH_STATUS_INITTED       0x1 /* basic initialization done */
+#define IPATH_STATUS_DISABLED      0x2 /* hardware disabled */
+/* Device has been disabled via admin request */
+#define IPATH_STATUS_ADMIN_DISABLED    0x4
+/* Chip has been found and initted */
+#define IPATH_STATUS_CHIP_PRESENT 0x20
+/* IB link is at ACTIVE, usable for data traffic */
+#define IPATH_STATUS_IB_READY     0x40
+/* link is configured, LID, MTU, etc. have been set */
+#define IPATH_STATUS_IB_CONF      0x80
+/* no link established, probably no cable */
+#define IPATH_STATUS_IB_NOCABLE  0x100
+/* A Fatal hardware error has occurred. */
+#define IPATH_STATUS_HWERROR     0x200
+
+/*
+ * The list of usermode accessible registers.  Also see Reg_* later in file.
+ */
+typedef enum _ipath_ureg {
+       /* (RO)  DMA RcvHdr to be used next. */
+       ur_rcvhdrtail = 0,
+       /* (RW)  RcvHdr entry to be processed next by host. */
+       ur_rcvhdrhead = 1,
+       /* (RO)  Index of next Eager index to use. */
+       ur_rcvegrindextail = 2,
+       /* (RW)  Eager TID to be processed next */
+       ur_rcvegrindexhead = 3,
+       /* For internal use only; max register number. */
+       _IPATH_UregMax
+} ipath_ureg;
+
+/* bit values for spi_runtime_flags */
+#define IPATH_RUNTIME_HT       0x1
+#define IPATH_RUNTIME_PCIE     0x2
+#define IPATH_RUNTIME_FORCE_WC_ORDER   0x4
+#define IPATH_RUNTIME_RCVHDR_COPY      0x8
+#define IPATH_RUNTIME_MASTER   0x10
+#define IPATH_RUNTIME_NODMA_RTAIL 0x80
+#define IPATH_RUNTIME_SDMA           0x200
+#define IPATH_RUNTIME_FORCE_PIOAVAIL 0x400
+#define IPATH_RUNTIME_PIO_REGSWAPPED 0x800
+
+/*
+ * This structure is returned by ipath_userinit() immediately after
+ * open to get implementation-specific info, and info specific to this
+ * instance.
+ *
+ * This struct must have explicit pad fields where type sizes
+ * may result in different alignments between 32 and 64 bit
+ * programs, since the 64 bit kernel requires the user code
+ * to have matching offsets
+ */
+struct ipath_base_info {
+       /* version of hardware, for feature checking. */
+       __u32 spi_hw_version;
+       /* version of software, for feature checking. */
+       __u32 spi_sw_version;
+       /* InfiniPath port assigned, goes into sent packets */
+       __u16 spi_port;
+       __u16 spi_subport;
+       /*
+        * IB MTU, packets IB data must be less than this.
+        * The MTU is in bytes, and will be a multiple of 4 bytes.
+        */
+       __u32 spi_mtu;
+       /*
+        * Size of a PIO buffer.  Any given packet's total size must be less
+        * than this (in words).  Included is the starting control word, so
+        * if 513 is returned, then total pkt size is 512 words or less.
+        */
+       __u32 spi_piosize;
+       /* size of the TID cache in infinipath, in entries */
+       __u32 spi_tidcnt;
+       /* size of the TID Eager list in infinipath, in entries */
+       __u32 spi_tidegrcnt;
+       /* size of a single receive header queue entry in words. */
+       __u32 spi_rcvhdrent_size;
+       /*
+        * Count of receive header queue entries allocated.
+        * This may be less than the spu_rcvhdrcnt passed in.
+        */
+       __u32 spi_rcvhdr_cnt;
+
+       /* per-chip and other runtime features bitmap (IPATH_RUNTIME_*) */
+       __u32 spi_runtime_flags;
+
+       /* address where receive buffer queue is mapped into */
+       __u64 spi_rcvhdr_base;
+
+       /* user program. */
+
+       /* base address of eager TID receive buffers. */
+       __u64 spi_rcv_egrbufs;
+
+       /* Allocated by initialization code, not by protocol. */
+
+       /*
+        * Size of each TID buffer in host memory, starting at
+        * spi_rcv_egrbufs.  The buffers are virtually contiguous.
+        */
+       __u32 spi_rcv_egrbufsize;
+       /*
+        * The special QP (queue pair) value that identifies an infinipath
+        * protocol packet from standard IB packets.  More, probably much
+        * more, to be added.
+        */
+       __u32 spi_qpair;
+
+       /*
+        * User register base for init code, not to be used directly by
+        * protocol or applications.
+        */
+       __u64 __spi_uregbase;
+       /*
+        * Maximum buffer size in bytes that can be used in a single TID
+        * entry (assuming the buffer is aligned to this boundary).  This is
+        * the minimum of what the hardware and software support.  Guaranteed
+        * to be a power of 2.
+        */
+       __u32 spi_tid_maxsize;
+       /*
+        * alignment of each pio send buffer (byte count
+        * to add to spi_piobufbase to get to second buffer)
+        */
+       __u32 spi_pioalign;
+       /*
+        * The index of the first pio buffer available to this process;
+        * needed to do lookup in spi_pioavailaddr; not added to
+        * spi_piobufbase.
+        */
+       __u32 spi_pioindex;
+        /* number of buffers mapped for this process */
+       __u32 spi_piocnt;
+
+       /*
+        * Base address of writeonly pio buffers for this process.
+        * Each buffer has spi_piosize words, and is aligned on spi_pioalign
+        * boundaries.  spi_piocnt buffers are mapped from this address
+        */
+       __u64 spi_piobufbase;
+
+       /*
+        * Base address of readonly memory copy of the pioavail registers.
+        * There are 2 bits for each buffer.
+        */
+       __u64 spi_pioavailaddr;
+
+       /*
+        * Address where driver updates a copy of the interface and driver
+        * status (IPATH_STATUS_*) as a 64 bit value.  It's followed by a
+        * string indicating hardware error, if there was one.
+        */
+       __u64 spi_status;
+
+       /* number of chip ports available to user processes */
+       __u32 spi_nports;
+       /* unit number of chip we are using */
+       __u32 spi_unit;
+       /* num bufs in each contiguous set */
+       __u32 spi_rcv_egrperchunk;
+       /* size in bytes of each contiguous set */
+       __u32 spi_rcv_egrchunksize;
+       /* total size of mmap to cover full rcvegrbuffers */
+       __u32 spi_rcv_egrbuftotlen;
+       __u32 spi_filler_for_align;
+       /* address of readonly memory copy of the rcvhdrq tail register. */
+       __u64 spi_rcvhdr_tailaddr;
+
+       /* shared memory pages for subports if port is shared */
+       __u64 spi_subport_uregbase;
+       __u64 spi_subport_rcvegrbuf;
+       __u64 spi_subport_rcvhdr_base;
+
+       /* shared memory page for hardware port if it is shared */
+       __u64 spi_port_uregbase;
+       __u64 spi_port_rcvegrbuf;
+       __u64 spi_port_rcvhdr_base;
+       __u64 spi_port_rcvhdr_tailaddr;
+
+} __attribute__ ((aligned(8)));
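
The comment at the top of struct ipath_base_info requires explicit padding so 32-bit and 64-bit user programs see identical field offsets. A minimal sketch (not part of the patch) of the kind of build-time check that rule makes possible, assuming ipath_common.h is included:

/* Illustrative sketch (not part of the patch): build-time layout checks that
 * the "explicit pad fields" rule enables, so 32-bit and 64-bit user programs
 * agree with the kernel on offsets.  Assumes ipath_common.h is included. */
#include <stddef.h>

_Static_assert(offsetof(struct ipath_base_info, spi_rcvhdr_base) % 8 == 0,
	       "64-bit fields must land on 8-byte offsets in both ABIs");
_Static_assert(sizeof(struct ipath_base_info) % 8 == 0,
	       "struct size must match between 32- and 64-bit builds");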
+
+
+/*
+ * This version number is given to the driver by the user code during
+ * initialization in the spu_userversion field of ipath_user_info, so
+ * the driver can check for compatibility with user code.
+ *
+ * The major version changes when data structures
+ * change in an incompatible way.  The driver must be the same or higher
+ * for initialization to succeed.  In some cases, a higher version
+ * driver will not interoperate with older software, and initialization
+ * will return an error.
+ */
+#define IPATH_USER_SWMAJOR 1
+
+/*
+ * Minor version differences are always compatible
+ * within a major version, however if the user software minor version is
+ * newer than the driver's, some new features and/or structure fields
+ * may not be implemented; the user code must deal with this if it
+ * cares, or it must abort after initialization reports the difference.
+ */
+#define IPATH_USER_SWMINOR 6
+
+#define IPATH_USER_SWVERSION ((IPATH_USER_SWMAJOR<<16) | IPATH_USER_SWMINOR)
+
+#define IPATH_KERN_TYPE 0
+
+/*
+ * Similarly, this is the kernel version going back to the user.  It's
+ * slightly different, in that we want to tell if the driver was built as
+ * part of a QLogic release, or came from openfabrics.org,
+ * kernel.org, or a standard distribution, for support reasons.
+ * The high bit is 0 for non-QLogic and 1 for QLogic-built/supplied.
+ *
+ * It's returned by the driver to the user code during initialization in the
+ * spi_sw_version field of ipath_base_info, so the user code can in turn
+ * check for compatibility with the kernel.
+*/
+#define IPATH_KERN_SWVERSION ((IPATH_KERN_TYPE<<31) | IPATH_USER_SWVERSION)
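
The two version words above form a small handshake: user code passes IPATH_USER_SWVERSION in spu_userversion, and the driver returns IPATH_KERN_SWVERSION in spi_sw_version. A minimal sketch (not part of the patch; the helper name is made up, and ipath_common.h is assumed to be included) of how user code might check the result:

/* Illustrative sketch (not part of the patch): checking the driver's
 * spi_sw_version against the user library's own version word. */
#include <stdint.h>

static int example_check_sw_version(uint32_t spi_sw_version)
{
	uint32_t drv_major = (spi_sw_version >> 16) & 0x7fff;	/* bit 31 only flags a QLogic build */
	uint32_t drv_minor = spi_sw_version & 0xffff;

	if (drv_major != IPATH_USER_SWMAJOR)
		return -1;	/* incompatible data structure layouts */
	if (drv_minor < IPATH_USER_SWMINOR)
		return 1;	/* compatible, but some features may be missing */
	return 0;
}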
+
+/*
+ * This structure is passed to ipath_userinit() to tell the driver where
+ * user code buffers are, sizes, etc.   The offsets and sizes of the
+ * fields must remain unchanged, for binary compatibility.  It can
+ * be extended if spu_userversion is changed, so user code can tell, if needed.
+ */
+struct ipath_user_info {
+       /*
+        * version of user software, to detect compatibility issues.
+        * Should be set to IPATH_USER_SWVERSION.
+        */
+       __u32 spu_userversion;
+
+       /* desired number of receive header queue entries */
+       __u32 spu_rcvhdrcnt;
+
+       /* size of struct base_info to write to */
+       __u32 spu_base_info_size;
+
+       /*
+        * number of words in KD protocol header
+        * This tells InfiniPath how many words to copy to rcvhdrq.  If 0,
+        * kernel uses a default.  Once set, attempts to set any other value
+        * are an error (EAGAIN) until driver is reloaded.
+        */
+       __u32 spu_rcvhdrsize;
+
+       /*
+        * If two or more processes wish to share a port, each process
+        * must set the spu_subport_cnt and spu_subport_id to the same
+        * values.  The only restriction on the spu_subport_id is that
+        * it be unique for a given node.
+        */
+       __u16 spu_subport_cnt;
+       __u16 spu_subport_id;
+
+       __u32 spu_unused; /* kept for compatible layout */
+
+       /*
+        * address of struct base_info to write to
+        */
+       __u64 spu_base_info;
+
+} __attribute__ ((aligned(8)));
+
+/* User commands. */
+
+#define IPATH_CMD_MIN          16
+
+#define __IPATH_CMD_USER_INIT  16      /* old set up userspace (for old user code) */
+#define IPATH_CMD_PORT_INFO    17      /* find out what resources we got */
+#define IPATH_CMD_RECV_CTRL    18      /* control receipt of packets */
+#define IPATH_CMD_TID_UPDATE   19      /* update expected TID entries */
+#define IPATH_CMD_TID_FREE     20      /* free expected TID entries */
+#define IPATH_CMD_SET_PART_KEY 21      /* add partition key */
+#define __IPATH_CMD_SLAVE_INFO 22      /* return info on slave processes (for old user code) */
+#define IPATH_CMD_ASSIGN_PORT  23      /* allocate HCA and port */
+#define IPATH_CMD_USER_INIT    24      /* set up userspace */
+#define IPATH_CMD_UNUSED_1     25
+#define IPATH_CMD_UNUSED_2     26
+#define IPATH_CMD_PIOAVAILUPD  27      /* force an update of PIOAvail reg */
+#define IPATH_CMD_POLL_TYPE    28      /* set the kind of polling we want */
+#define IPATH_CMD_ARMLAUNCH_CTRL       29 /* armlaunch detection control */
+/* 30 is unused */
+#define IPATH_CMD_SDMA_INFLIGHT 31     /* sdma inflight counter request */
+#define IPATH_CMD_SDMA_COMPLETE 32     /* sdma completion counter request */
+
+/*
+ * Poll types
+ */
+#define IPATH_POLL_TYPE_URGENT  0x01
+#define IPATH_POLL_TYPE_OVERFLOW 0x02
+
+struct ipath_port_info {
+       __u32 num_active;       /* number of active units */
+       __u32 unit;             /* unit (chip) assigned to caller */
+       __u16 port;             /* port on unit assigned to caller */
+       __u16 subport;          /* subport on unit assigned to caller */
+       __u16 num_ports;        /* number of ports available on unit */
+       __u16 num_subports;     /* number of subports opened on port */
+};
+
+struct ipath_tid_info {
+       __u32 tidcnt;
+       /* make structure same size in 32 and 64 bit */
+       __u32 tid__unused;
+       /* virtual address of first page in transfer */
+       __u64 tidvaddr;
+       /* pointer (same size 32/64 bit) to __u16 tid array */
+       __u64 tidlist;
+
+       /*
+        * pointer (same size 32/64 bit) to bitmap of TIDs used
+        * for this call; checked for being large enough at open
+        */
+       __u64 tidmap;
+};
+
+struct ipath_cmd {
+       __u32 type;                     /* command type */
+       union {
+               struct ipath_tid_info tid_info;
+               struct ipath_user_info user_info;
+
+               /*
+                * address in userspace where we should put the sdma
+                * inflight counter
+                */
+               __u64 sdma_inflight;
+               /*
+                * address in userspace where we should put the sdma
+                * completion counter
+                */
+               __u64 sdma_complete;
+               /* address in userspace of struct ipath_port_info to
+                  write result to */
+               __u64 port_info;
+               /* enable/disable receipt of packets */
+               __u32 recv_ctrl;
+               /* enable/disable armlaunch errors (non-zero to enable) */
+               __u32 armlaunch_ctrl;
+               /* partition key to set */
+               __u16 part_key;
+               /* user address of __u32 bitmask of active slaves */
+               __u64 slave_mask_addr;
+               /* type of polling we want */
+               __u16 poll_type;
+       } cmd;
+};
+
+struct ipath_iovec {
+       /* Pointer to data, but same size 32 and 64 bit */
+       __u64 iov_base;
+
+       /*
+        * Length of data; don't need 64 bits, but want
+        * ipath_sendpkt to remain same size as before 32 bit changes, so...
+        */
+       __u64 iov_len;
+};
+
+/*
+ * Describes a single packet for send.  Each packet can have one or more
+ * buffers, but the total length (exclusive of IB headers) must be less
+ * than the MTU, and if using the PIO method, entire packet length,
+ * including IB headers, must be less than the ipath_piosize value (words).
+ * Use of this necessitates including sys/uio.h
+ */
+struct __ipath_sendpkt {
+       __u32 sps_flags;        /* flags for packet (TBD) */
+       __u32 sps_cnt;          /* number of entries to use in sps_iov */
+       /* array of iov's describing packet. TEMPORARY */
+       struct ipath_iovec sps_iov[4];
+};
+
+/*
+ * diagnostics can send a packet by "writing" one of the following
+ * two structs to diag data special file
+ * The first is the legacy version for backward compatibility
+ */
+struct ipath_diag_pkt {
+       __u32 unit;
+       __u64 data;
+       __u32 len;
+};
+
+/* The second diag_pkt struct is the expanded version that allows
+ * more control over the packet, specifically, by allowing a custom
+ * pbc (+ static rate) qword, so that special modes and deliberate
+ * changes to CRCs can be used. The elements were also re-ordered
+ * for better alignment and to avoid padding issues.
+ */
+struct ipath_diag_xpkt {
+       __u64 data;
+       __u64 pbc_wd;
+       __u32 unit;
+       __u32 len;
+};
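
The two structs above are the whole diagnostic-send ABI: a tool fills one of them and write()s it to the diag data special file, and the driver tells the formats apart by the write size. A hedged userspace sketch (not part of the patch; the device node name and field values are assumptions for illustration):

/* Illustrative sketch (not part of the patch): submitting a raw packet with
 * the expanded struct ipath_diag_xpkt.  The device node name is an assumed
 * example; ipath_common.h supplies the struct definition. */
#include <fcntl.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include "ipath_common.h"

static int example_send_diag_pkt(const void *payload, uint32_t len_bytes)
{
	struct ipath_diag_xpkt dp;
	ssize_t n;
	int fd;

	fd = open("/dev/ipath_diagpkt", O_WRONLY);	/* assumed node name */
	if (fd < 0)
		return -1;

	memset(&dp, 0, sizeof(dp));
	dp.unit = 0;					/* first HCA */
	dp.data = (uint64_t)(uintptr_t)payload;		/* user buffer holding the packet */
	dp.len = len_bytes;				/* must be a multiple of 4 */
	dp.pbc_wd = 0;					/* 0: let the driver build the PBC word */

	/* writing exactly sizeof(dp) bytes selects the expanded format */
	n = write(fd, &dp, sizeof(dp));
	close(fd);
	return n == (ssize_t)sizeof(dp) ? 0 : -1;
}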
+
+/*
+ * Data layout in I2C flash (for GUID, etc.)
+ * All fields are little-endian binary unless otherwise stated
+ */
+#define IPATH_FLASH_VERSION 2
+struct ipath_flash {
+       /* flash layout version (IPATH_FLASH_VERSION) */
+       __u8 if_fversion;
+       /* checksum protecting if_length bytes */
+       __u8 if_csum;
+       /*
+        * valid length (in use, protected by if_csum), including
+        * if_fversion and if_csum themselves
+        */
+       __u8 if_length;
+       /* the GUID, in network order */
+       __u8 if_guid[8];
+       /* number of GUIDs to use, starting from if_guid */
+       __u8 if_numguid;
+       /* the (last 10 characters of) board serial number, in ASCII */
+       char if_serial[12];
+       /* board mfg date (YYYYMMDD ASCII) */
+       char if_mfgdate[8];
+       /* last board rework/test date (YYYYMMDD ASCII) */
+       char if_testdate[8];
+       /* logging of error counts, TBD */
+       __u8 if_errcntp[4];
+       /* powered on hours, updated at driver unload */
+       __u8 if_powerhour[2];
+       /* ASCII free-form comment field */
+       char if_comment[32];
+       /* Backwards compatible prefix for longer QLogic Serial Numbers */
+       char if_sprefix[4];
+       /* 82 bytes used, min flash size is 128 bytes */
+       __u8 if_future[46];
+};
+
+/*
+ * These are the counters implemented in the chip, and are listed in order.
+ * The InterCaps naming is taken straight from the chip spec.
+ */
+struct infinipath_counters {
+       __u64 LBIntCnt;
+       __u64 LBFlowStallCnt;
+       __u64 TxSDmaDescCnt;    /* was Reserved1 */
+       __u64 TxUnsupVLErrCnt;
+       __u64 TxDataPktCnt;
+       __u64 TxFlowPktCnt;
+       __u64 TxDwordCnt;
+       __u64 TxLenErrCnt;
+       __u64 TxMaxMinLenErrCnt;
+       __u64 TxUnderrunCnt;
+       __u64 TxFlowStallCnt;
+       __u64 TxDroppedPktCnt;
+       __u64 RxDroppedPktCnt;
+       __u64 RxDataPktCnt;
+       __u64 RxFlowPktCnt;
+       __u64 RxDwordCnt;
+       __u64 RxLenErrCnt;
+       __u64 RxMaxMinLenErrCnt;
+       __u64 RxICRCErrCnt;
+       __u64 RxVCRCErrCnt;
+       __u64 RxFlowCtrlErrCnt;
+       __u64 RxBadFormatCnt;
+       __u64 RxLinkProblemCnt;
+       __u64 RxEBPCnt;
+       __u64 RxLPCRCErrCnt;
+       __u64 RxBufOvflCnt;
+       __u64 RxTIDFullErrCnt;
+       __u64 RxTIDValidErrCnt;
+       __u64 RxPKeyMismatchCnt;
+       __u64 RxP0HdrEgrOvflCnt;
+       __u64 RxP1HdrEgrOvflCnt;
+       __u64 RxP2HdrEgrOvflCnt;
+       __u64 RxP3HdrEgrOvflCnt;
+       __u64 RxP4HdrEgrOvflCnt;
+       __u64 RxP5HdrEgrOvflCnt;
+       __u64 RxP6HdrEgrOvflCnt;
+       __u64 RxP7HdrEgrOvflCnt;
+       __u64 RxP8HdrEgrOvflCnt;
+       __u64 RxP9HdrEgrOvflCnt;        /* was Reserved6 */
+       __u64 RxP10HdrEgrOvflCnt;       /* was Reserved7 */
+       __u64 RxP11HdrEgrOvflCnt;       /* new for IBA7220 */
+       __u64 RxP12HdrEgrOvflCnt;       /* new for IBA7220 */
+       __u64 RxP13HdrEgrOvflCnt;       /* new for IBA7220 */
+       __u64 RxP14HdrEgrOvflCnt;       /* new for IBA7220 */
+       __u64 RxP15HdrEgrOvflCnt;       /* new for IBA7220 */
+       __u64 RxP16HdrEgrOvflCnt;       /* new for IBA7220 */
+       __u64 IBStatusChangeCnt;
+       __u64 IBLinkErrRecoveryCnt;
+       __u64 IBLinkDownedCnt;
+       __u64 IBSymbolErrCnt;
+       /* The following are new for IBA7220 */
+       __u64 RxVL15DroppedPktCnt;
+       __u64 RxOtherLocalPhyErrCnt;
+       __u64 PcieRetryBufDiagQwordCnt;
+       __u64 ExcessBufferOvflCnt;
+       __u64 LocalLinkIntegrityErrCnt;
+       __u64 RxVlErrCnt;
+       __u64 RxDlidFltrCnt;
+};
+
+/*
+ * The next set of defines are for packet headers, and chip register
+ * and memory bits that are visible to and/or used by user-mode software
+ * The other bits that are used only by the driver or diags are in
+ * ipath_registers.h
+ */
+
+/* RcvHdrFlags bits */
+#define INFINIPATH_RHF_LENGTH_MASK 0x7FF
+#define INFINIPATH_RHF_LENGTH_SHIFT 0
+#define INFINIPATH_RHF_RCVTYPE_MASK 0x7
+#define INFINIPATH_RHF_RCVTYPE_SHIFT 11
+#define INFINIPATH_RHF_EGRINDEX_MASK 0xFFF
+#define INFINIPATH_RHF_EGRINDEX_SHIFT 16
+#define INFINIPATH_RHF_SEQ_MASK 0xF
+#define INFINIPATH_RHF_SEQ_SHIFT 0
+#define INFINIPATH_RHF_HDRQ_OFFSET_MASK 0x7FF
+#define INFINIPATH_RHF_HDRQ_OFFSET_SHIFT 4
+#define INFINIPATH_RHF_H_ICRCERR   0x80000000
+#define INFINIPATH_RHF_H_VCRCERR   0x40000000
+#define INFINIPATH_RHF_H_PARITYERR 0x20000000
+#define INFINIPATH_RHF_H_LENERR    0x10000000
+#define INFINIPATH_RHF_H_MTUERR    0x08000000
+#define INFINIPATH_RHF_H_IHDRERR   0x04000000
+#define INFINIPATH_RHF_H_TIDERR    0x02000000
+#define INFINIPATH_RHF_H_MKERR     0x01000000
+#define INFINIPATH_RHF_H_IBERR     0x00800000
+#define INFINIPATH_RHF_H_ERR_MASK  0xFF800000
+#define INFINIPATH_RHF_L_USE_EGR   0x80000000
+#define INFINIPATH_RHF_L_SWA       0x00008000
+#define INFINIPATH_RHF_L_SWB       0x00004000
+
+/* infinipath header fields */
+#define INFINIPATH_I_VERS_MASK 0xF
+#define INFINIPATH_I_VERS_SHIFT 28
+#define INFINIPATH_I_PORT_MASK 0xF
+#define INFINIPATH_I_PORT_SHIFT 24
+#define INFINIPATH_I_TID_MASK 0x7FF
+#define INFINIPATH_I_TID_SHIFT 13
+#define INFINIPATH_I_OFFSET_MASK 0x1FFF
+#define INFINIPATH_I_OFFSET_SHIFT 0
+
+/* K_PktFlags bits */
+#define INFINIPATH_KPF_INTR 0x1
+#define INFINIPATH_KPF_SUBPORT_MASK 0x3
+#define INFINIPATH_KPF_SUBPORT_SHIFT 1
+
+#define INFINIPATH_MAX_SUBPORT 4
+
+/* SendPIO per-buffer control */
+#define INFINIPATH_SP_TEST    0x40
+#define INFINIPATH_SP_TESTEBP 0x20
+#define INFINIPATH_SP_TRIGGER_SHIFT  15
+
+/* SendPIOAvail bits */
+#define INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT 1
+#define INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT 0
+
+/* infinipath header format */
+struct ipath_header {
+       /*
+        * Version - 4 bits, Port - 4 bits, TID - 10 bits and Offset -
+        * 14 bits before ECO change ~28 Dec 03.  After that, Vers 4,
+        * Port 4, TID 11, offset 13.
+        */
+       __le32 ver_port_tid_offset;
+       __le16 chksum;
+       __le16 pkt_flags;
+};
+
+/* infinipath user message header format.
+ * This structure contains the first 4 fields common to all protocols
+ * that employ infinipath.
+ */
+struct ipath_message_header {
+       __be16 lrh[4];
+       __be32 bth[3];
+       /* fields below this point are in host byte order */
+       struct ipath_header iph;
+       __u8 sub_opcode;
+};
+
+/* infinipath ethernet header format */
+struct ether_header {
+       __be16 lrh[4];
+       __be32 bth[3];
+       struct ipath_header iph;
+       __u8 sub_opcode;
+       __u8 cmd;
+       __be16 lid;
+       __u16 mac[3];
+       __u8 frag_num;
+       __u8 seq_num;
+       __le32 len;
+       /* MUST be of word size due to PIO write requirements */
+       __le32 csum;
+       __le16 csum_offset;
+       __le16 flags;
+       __u16 first_2_bytes;
+       __u8 unused[2];         /* currently unused */
+};
+
+
+/* IB - LRH header consts */
+#define IPATH_LRH_GRH 0x0003   /* 1. word of IB LRH - next header: GRH */
+#define IPATH_LRH_BTH 0x0002   /* 1. word of IB LRH - next header: BTH */
+
+/* misc. */
+#define SIZE_OF_CRC 1
+
+#define IPATH_DEFAULT_P_KEY 0xFFFF
+#define IPATH_PERMISSIVE_LID 0xFFFF
+#define IPATH_AETH_CREDIT_SHIFT 24
+#define IPATH_AETH_CREDIT_MASK 0x1F
+#define IPATH_AETH_CREDIT_INVAL 0x1F
+#define IPATH_PSN_MASK 0xFFFFFF
+#define IPATH_MSN_MASK 0xFFFFFF
+#define IPATH_QPN_MASK 0xFFFFFF
+#define IPATH_MULTICAST_LID_BASE 0xC000
+#define IPATH_EAGER_TID_ID INFINIPATH_I_TID_MASK
+#define IPATH_MULTICAST_QPN 0xFFFFFF
+
+/* Receive Header Queue: receive type (from infinipath) */
+#define RCVHQ_RCV_TYPE_EXPECTED  0
+#define RCVHQ_RCV_TYPE_EAGER     1
+#define RCVHQ_RCV_TYPE_NON_KD    2
+#define RCVHQ_RCV_TYPE_ERROR     3
+
+
+/* sub OpCodes - ith4x  */
+#define IPATH_ITH4X_OPCODE_ENCAP 0x81
+#define IPATH_ITH4X_OPCODE_LID_ARP 0x82
+
+#define IPATH_HEADER_QUEUE_WORDS 9
+
+/* functions for extracting fields from rcvhdrq entries for the driver.
+ */
+static inline __u32 ipath_hdrget_err_flags(const __le32 * rbuf)
+{
+       return __le32_to_cpu(rbuf[1]) & INFINIPATH_RHF_H_ERR_MASK;
+}
+
+static inline __u32 ipath_hdrget_rcv_type(const __le32 * rbuf)
+{
+       return (__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_RCVTYPE_SHIFT)
+           & INFINIPATH_RHF_RCVTYPE_MASK;
+}
+
+static inline __u32 ipath_hdrget_length_in_bytes(const __le32 * rbuf)
+{
+       return ((__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_LENGTH_SHIFT)
+               & INFINIPATH_RHF_LENGTH_MASK) << 2;
+}
+
+static inline __u32 ipath_hdrget_index(const __le32 * rbuf)
+{
+       return (__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_EGRINDEX_SHIFT)
+           & INFINIPATH_RHF_EGRINDEX_MASK;
+}
+
+static inline __u32 ipath_hdrget_seq(const __le32 *rbuf)
+{
+       return (__le32_to_cpu(rbuf[1]) >> INFINIPATH_RHF_SEQ_SHIFT)
+               & INFINIPATH_RHF_SEQ_MASK;
+}
+
+static inline __u32 ipath_hdrget_offset(const __le32 *rbuf)
+{
+       return (__le32_to_cpu(rbuf[1]) >> INFINIPATH_RHF_HDRQ_OFFSET_SHIFT)
+               & INFINIPATH_RHF_HDRQ_OFFSET_MASK;
+}
+
+static inline __u32 ipath_hdrget_use_egr_buf(const __le32 *rbuf)
+{
+       return __le32_to_cpu(rbuf[0]) & INFINIPATH_RHF_L_USE_EGR;
+}
+
+static inline __u32 ipath_hdrget_ipath_ver(__le32 hdrword)
+{
+       return (__le32_to_cpu(hdrword) >> INFINIPATH_I_VERS_SHIFT)
+           & INFINIPATH_I_VERS_MASK;
+}
+
+#endif                         /* _IPATH_COMMON_H */
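
The inline helpers above decode the two little-endian RHF words using the masks and shifts defined earlier in the file. A small worked example (not part of the patch; the word contents are invented for illustration, and the definitions from ipath_common.h are assumed to be in scope):

/* Illustrative sketch (not part of the patch): decoding a receive header
 * flags (RHF) pair with the helpers above.  The contents of rbuf are made-up
 * values; a real entry comes from the rcvhdrq. */
static void example_decode_rhf(void)
{
	__le32 rbuf[2];

	rbuf[0] = __cpu_to_le32((5u << INFINIPATH_RHF_EGRINDEX_SHIFT) |
				(RCVHQ_RCV_TYPE_EAGER << INFINIPATH_RHF_RCVTYPE_SHIFT) |
				32u);	/* 32 dwords of payload */
	rbuf[1] = __cpu_to_le32(7u << INFINIPATH_RHF_HDRQ_OFFSET_SHIFT);

	/* ipath_hdrget_length_in_bytes(rbuf) == 128 (32 dwords << 2),
	 * ipath_hdrget_rcv_type(rbuf)      == RCVHQ_RCV_TYPE_EAGER,
	 * ipath_hdrget_index(rbuf)         == 5,
	 * ipath_hdrget_offset(rbuf)        == 7,
	 * ipath_hdrget_err_flags(rbuf)     == 0 (no error bits set). */
}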
diff --git a/drivers/staging/rdma/ipath/ipath_cq.c b/drivers/staging/rdma/ipath/ipath_cq.c
new file mode 100644 (file)
index 0000000..e9dd911
--- /dev/null
@@ -0,0 +1,483 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include "ipath_verbs.h"
+
+/**
+ * ipath_cq_enter - add a new entry to the completion queue
+ * @cq: completion queue
+ * @entry: work completion entry to add
+ * @solicited: true if @entry is a solicited entry
+ *
+ * This may be called with qp->s_lock held.
+ */
+void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int solicited)
+{
+       struct ipath_cq_wc *wc;
+       unsigned long flags;
+       u32 head;
+       u32 next;
+
+       spin_lock_irqsave(&cq->lock, flags);
+
+       /*
+        * Note that the head pointer might be writable by user processes.
+        * Take care to verify it is a sane value.
+        */
+       wc = cq->queue;
+       head = wc->head;
+       if (head >= (unsigned) cq->ibcq.cqe) {
+               head = cq->ibcq.cqe;
+               next = 0;
+       } else
+               next = head + 1;
+       if (unlikely(next == wc->tail)) {
+               spin_unlock_irqrestore(&cq->lock, flags);
+               if (cq->ibcq.event_handler) {
+                       struct ib_event ev;
+
+                       ev.device = cq->ibcq.device;
+                       ev.element.cq = &cq->ibcq;
+                       ev.event = IB_EVENT_CQ_ERR;
+                       cq->ibcq.event_handler(&ev, cq->ibcq.cq_context);
+               }
+               return;
+       }
+       if (cq->ip) {
+               wc->uqueue[head].wr_id = entry->wr_id;
+               wc->uqueue[head].status = entry->status;
+               wc->uqueue[head].opcode = entry->opcode;
+               wc->uqueue[head].vendor_err = entry->vendor_err;
+               wc->uqueue[head].byte_len = entry->byte_len;
+               wc->uqueue[head].ex.imm_data = (__u32 __force) entry->ex.imm_data;
+               wc->uqueue[head].qp_num = entry->qp->qp_num;
+               wc->uqueue[head].src_qp = entry->src_qp;
+               wc->uqueue[head].wc_flags = entry->wc_flags;
+               wc->uqueue[head].pkey_index = entry->pkey_index;
+               wc->uqueue[head].slid = entry->slid;
+               wc->uqueue[head].sl = entry->sl;
+               wc->uqueue[head].dlid_path_bits = entry->dlid_path_bits;
+               wc->uqueue[head].port_num = entry->port_num;
+               /* Make sure entry is written before the head index. */
+               smp_wmb();
+       } else
+               wc->kqueue[head] = *entry;
+       wc->head = next;
+
+       if (cq->notify == IB_CQ_NEXT_COMP ||
+           (cq->notify == IB_CQ_SOLICITED && solicited)) {
+               cq->notify = IB_CQ_NONE;
+               cq->triggered++;
+               /*
+                * This will cause send_complete() to be called in
+                * another thread.
+                */
+               tasklet_hi_schedule(&cq->comptask);
+       }
+
+       spin_unlock_irqrestore(&cq->lock, flags);
+
+       if (entry->status != IB_WC_SUCCESS)
+               to_idev(cq->ibcq.device)->n_wqe_errs++;
+}
+
+/**
+ * ipath_poll_cq - poll for work completion entries
+ * @ibcq: the completion queue to poll
+ * @num_entries: the maximum number of entries to return
+ * @entry: pointer to array where work completions are placed
+ *
+ * Returns the number of completion entries polled.
+ *
+ * This may be called from interrupt context.  Also called by ib_poll_cq()
+ * in the generic verbs code.
+ */
+int ipath_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
+{
+       struct ipath_cq *cq = to_icq(ibcq);
+       struct ipath_cq_wc *wc;
+       unsigned long flags;
+       int npolled;
+       u32 tail;
+
+       /* The kernel can only poll a kernel completion queue */
+       if (cq->ip) {
+               npolled = -EINVAL;
+               goto bail;
+       }
+
+       spin_lock_irqsave(&cq->lock, flags);
+
+       wc = cq->queue;
+       tail = wc->tail;
+       if (tail > (u32) cq->ibcq.cqe)
+               tail = (u32) cq->ibcq.cqe;
+       for (npolled = 0; npolled < num_entries; ++npolled, ++entry) {
+               if (tail == wc->head)
+                       break;
+               /* The kernel doesn't need a RMB since it has the lock. */
+               *entry = wc->kqueue[tail];
+               if (tail >= cq->ibcq.cqe)
+                       tail = 0;
+               else
+                       tail++;
+       }
+       wc->tail = tail;
+
+       spin_unlock_irqrestore(&cq->lock, flags);
+
+bail:
+       return npolled;
+}
+
+static void send_complete(unsigned long data)
+{
+       struct ipath_cq *cq = (struct ipath_cq *)data;
+
+       /*
+        * The completion handler will most likely rearm the notification
+        * and poll for all pending entries.  If a new completion entry
+        * is added while we are in this routine, tasklet_hi_schedule()
+        * won't call us again until we return so we check triggered to
+        * see if we need to call the handler again.
+        */
+       for (;;) {
+               u8 triggered = cq->triggered;
+
+               cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
+
+               if (cq->triggered == triggered)
+                       return;
+       }
+}
+
+/**
+ * ipath_create_cq - create a completion queue
+ * @ibdev: the device this completion queue is attached to
+ * @attr: creation attributes
+ * @context: the user context, used when setting up the CQ mmap info
+ * @udata: user data; used to return the CQ's mmap offset to user space
+ *
+ * Returns a pointer to the completion queue or negative errno values
+ * for failure.
+ *
+ * Called by ib_create_cq() in the generic verbs code.
+ */
+struct ib_cq *ipath_create_cq(struct ib_device *ibdev,
+                             const struct ib_cq_init_attr *attr,
+                             struct ib_ucontext *context,
+                             struct ib_udata *udata)
+{
+       int entries = attr->cqe;
+       struct ipath_ibdev *dev = to_idev(ibdev);
+       struct ipath_cq *cq;
+       struct ipath_cq_wc *wc;
+       struct ib_cq *ret;
+       u32 sz;
+
+       if (attr->flags)
+               return ERR_PTR(-EINVAL);
+
+       if (entries < 1 || entries > ib_ipath_max_cqes) {
+               ret = ERR_PTR(-EINVAL);
+               goto done;
+       }
+
+       /* Allocate the completion queue structure. */
+       cq = kmalloc(sizeof(*cq), GFP_KERNEL);
+       if (!cq) {
+               ret = ERR_PTR(-ENOMEM);
+               goto done;
+       }
+
+       /*
+        * Allocate the completion queue entries and head/tail pointers.
+        * This is allocated separately so that it can be resized and
+        * also mapped into user space.
+        * We need to use vmalloc() in order to support mmap and large
+        * numbers of entries.
+        */
+       sz = sizeof(*wc);
+       if (udata && udata->outlen >= sizeof(__u64))
+               sz += sizeof(struct ib_uverbs_wc) * (entries + 1);
+       else
+               sz += sizeof(struct ib_wc) * (entries + 1);
+       wc = vmalloc_user(sz);
+       if (!wc) {
+               ret = ERR_PTR(-ENOMEM);
+               goto bail_cq;
+       }
+
+       /*
+        * Return the address of the WC as the offset to mmap.
+        * See ipath_mmap() for details.
+        */
+       if (udata && udata->outlen >= sizeof(__u64)) {
+               int err;
+
+               cq->ip = ipath_create_mmap_info(dev, sz, context, wc);
+               if (!cq->ip) {
+                       ret = ERR_PTR(-ENOMEM);
+                       goto bail_wc;
+               }
+
+               err = ib_copy_to_udata(udata, &cq->ip->offset,
+                                      sizeof(cq->ip->offset));
+               if (err) {
+                       ret = ERR_PTR(err);
+                       goto bail_ip;
+               }
+       } else
+               cq->ip = NULL;
+
+       spin_lock(&dev->n_cqs_lock);
+       if (dev->n_cqs_allocated == ib_ipath_max_cqs) {
+               spin_unlock(&dev->n_cqs_lock);
+               ret = ERR_PTR(-ENOMEM);
+               goto bail_ip;
+       }
+
+       dev->n_cqs_allocated++;
+       spin_unlock(&dev->n_cqs_lock);
+
+       if (cq->ip) {
+               spin_lock_irq(&dev->pending_lock);
+               list_add(&cq->ip->pending_mmaps, &dev->pending_mmaps);
+               spin_unlock_irq(&dev->pending_lock);
+       }
+
+       /*
+        * ib_create_cq() will initialize cq->ibcq except for cq->ibcq.cqe.
+        * The number of entries should be >= the number requested or return
+        * an error.
+        */
+       cq->ibcq.cqe = entries;
+       cq->notify = IB_CQ_NONE;
+       cq->triggered = 0;
+       spin_lock_init(&cq->lock);
+       tasklet_init(&cq->comptask, send_complete, (unsigned long)cq);
+       wc->head = 0;
+       wc->tail = 0;
+       cq->queue = wc;
+
+       ret = &cq->ibcq;
+
+       goto done;
+
+bail_ip:
+       kfree(cq->ip);
+bail_wc:
+       vfree(wc);
+bail_cq:
+       kfree(cq);
+done:
+       return ret;
+}
+
+/**
+ * ipath_destroy_cq - destroy a completion queue
+ * @ibcq: the completion queue to destroy.
+ *
+ * Returns 0 for success.
+ *
+ * Called by ib_destroy_cq() in the generic verbs code.
+ */
+int ipath_destroy_cq(struct ib_cq *ibcq)
+{
+       struct ipath_ibdev *dev = to_idev(ibcq->device);
+       struct ipath_cq *cq = to_icq(ibcq);
+
+       tasklet_kill(&cq->comptask);
+       spin_lock(&dev->n_cqs_lock);
+       dev->n_cqs_allocated--;
+       spin_unlock(&dev->n_cqs_lock);
+       if (cq->ip)
+               kref_put(&cq->ip->ref, ipath_release_mmap_info);
+       else
+               vfree(cq->queue);
+       kfree(cq);
+
+       return 0;
+}
+
+/**
+ * ipath_req_notify_cq - change the notification type for a completion queue
+ * @ibcq: the completion queue
+ * @notify_flags: the type of notification to request
+ *
+ * Returns 0 for success.
+ *
+ * This may be called from interrupt context.  Also called by
+ * ib_req_notify_cq() in the generic verbs code.
+ */
+int ipath_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags)
+{
+       struct ipath_cq *cq = to_icq(ibcq);
+       unsigned long flags;
+       int ret = 0;
+
+       spin_lock_irqsave(&cq->lock, flags);
+       /*
+        * Don't change IB_CQ_NEXT_COMP to IB_CQ_SOLICITED but allow
+        * any other transitions (see C11-31 and C11-32 in ch. 11.4.2.2).
+        */
+       if (cq->notify != IB_CQ_NEXT_COMP)
+               cq->notify = notify_flags & IB_CQ_SOLICITED_MASK;
+
+       if ((notify_flags & IB_CQ_REPORT_MISSED_EVENTS) &&
+           cq->queue->head != cq->queue->tail)
+               ret = 1;
+
+       spin_unlock_irqrestore(&cq->lock, flags);
+
+       return ret;
+}
+
+/**
+ * ipath_resize_cq - change the size of the CQ
+ * @ibcq: the completion queue
+ * @cqe: the new number of completion queue entries
+ * @udata: user data; used to return the new mmap offset to user space
+ *
+ * Returns 0 for success.
+ */
+int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
+{
+       struct ipath_cq *cq = to_icq(ibcq);
+       struct ipath_cq_wc *old_wc;
+       struct ipath_cq_wc *wc;
+       u32 head, tail, n;
+       int ret;
+       u32 sz;
+
+       if (cqe < 1 || cqe > ib_ipath_max_cqes) {
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       /*
+        * Need to use vmalloc() if we want to support large #s of entries.
+        */
+       sz = sizeof(*wc);
+       if (udata && udata->outlen >= sizeof(__u64))
+               sz += sizeof(struct ib_uverbs_wc) * (cqe + 1);
+       else
+               sz += sizeof(struct ib_wc) * (cqe + 1);
+       wc = vmalloc_user(sz);
+       if (!wc) {
+               ret = -ENOMEM;
+               goto bail;
+       }
+
+       /* Check that we can write the offset to mmap. */
+       if (udata && udata->outlen >= sizeof(__u64)) {
+               __u64 offset = 0;
+
+               ret = ib_copy_to_udata(udata, &offset, sizeof(offset));
+               if (ret)
+                       goto bail_free;
+       }
+
+       spin_lock_irq(&cq->lock);
+       /*
+        * Make sure head and tail are sane since they
+        * might be user writable.
+        */
+       old_wc = cq->queue;
+       head = old_wc->head;
+       if (head > (u32) cq->ibcq.cqe)
+               head = (u32) cq->ibcq.cqe;
+       tail = old_wc->tail;
+       if (tail > (u32) cq->ibcq.cqe)
+               tail = (u32) cq->ibcq.cqe;
+       if (head < tail)
+               n = cq->ibcq.cqe + 1 + head - tail;
+       else
+               n = head - tail;
+       if (unlikely((u32)cqe < n)) {
+               ret = -EINVAL;
+               goto bail_unlock;
+       }
+       for (n = 0; tail != head; n++) {
+               if (cq->ip)
+                       wc->uqueue[n] = old_wc->uqueue[tail];
+               else
+                       wc->kqueue[n] = old_wc->kqueue[tail];
+               if (tail == (u32) cq->ibcq.cqe)
+                       tail = 0;
+               else
+                       tail++;
+       }
+       cq->ibcq.cqe = cqe;
+       wc->head = n;
+       wc->tail = 0;
+       cq->queue = wc;
+       spin_unlock_irq(&cq->lock);
+
+       vfree(old_wc);
+
+       if (cq->ip) {
+               struct ipath_ibdev *dev = to_idev(ibcq->device);
+               struct ipath_mmap_info *ip = cq->ip;
+
+               ipath_update_mmap_info(dev, ip, sz, wc);
+
+               /*
+                * Return the offset to mmap.
+                * See ipath_mmap() for details.
+                */
+               if (udata && udata->outlen >= sizeof(__u64)) {
+                       ret = ib_copy_to_udata(udata, &ip->offset,
+                                              sizeof(ip->offset));
+                       if (ret)
+                               goto bail;
+               }
+
+               spin_lock_irq(&dev->pending_lock);
+               if (list_empty(&ip->pending_mmaps))
+                       list_add(&ip->pending_mmaps, &dev->pending_mmaps);
+               spin_unlock_irq(&dev->pending_lock);
+       }
+
+       ret = 0;
+       goto bail;
+
+bail_unlock:
+       spin_unlock_irq(&cq->lock);
+bail_free:
+       vfree(wc);
+bail:
+       return ret;
+}
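
ipath_resize_cq above copies live entries out of a ring of (cqe + 1) slots whose head and tail may have been scribbled on by user space, which is why it clamps both indices and recomputes the occupancy before copying. The wrap arithmetic it relies on, as a standalone sketch (not part of the patch):

/* Illustrative sketch (not part of the patch): occupancy of the circular
 * completion queue of (cqe + 1) slots, matching the resize path above. */
static u32 example_cq_entries_queued(u32 head, u32 tail, u32 cqe)
{
	/* head == tail means empty; the ring holds at most cqe entries */
	if (head >= tail)
		return head - tail;
	return cqe + 1 + head - tail;
}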
diff --git a/drivers/staging/rdma/ipath/ipath_debug.h b/drivers/staging/rdma/ipath/ipath_debug.h
new file mode 100644 (file)
index 0000000..65926cd
--- /dev/null
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _IPATH_DEBUG_H
+#define _IPATH_DEBUG_H
+
+#ifndef _IPATH_DEBUGGING       /* debugging enabled or not */
+#define _IPATH_DEBUGGING 1
+#endif
+
+#if _IPATH_DEBUGGING
+
+/*
+ * Mask values for debugging.  The scheme allows us to compile out any
+ * of the debug tracing stuff, and if compiled in, to enable or disable
+ * dynamically.  This can be set at modprobe time also:
+ *      modprobe infinipath.ko infinipath_debug=7
+ */
+
+#define __IPATH_INFO        0x1        /* generic low verbosity stuff */
+#define __IPATH_DBG         0x2        /* generic debug */
+#define __IPATH_TRSAMPLE    0x8        /* generate trace buffer sample entries */
+/* leave some low verbosity spots open */
+#define __IPATH_VERBDBG     0x40       /* very verbose debug */
+#define __IPATH_PKTDBG      0x80       /* print packet data */
+/* print process startup (init)/exit messages */
+#define __IPATH_PROCDBG     0x100
+/* print mmap/fault stuff, not using VDBG any more */
+#define __IPATH_MMDBG       0x200
+#define __IPATH_ERRPKTDBG   0x400
+#define __IPATH_USER_SEND   0x1000     /* use user mode send */
+#define __IPATH_KERNEL_SEND 0x2000     /* use kernel mode send */
+#define __IPATH_EPKTDBG     0x4000     /* print ethernet packet data */
+#define __IPATH_IPATHDBG    0x10000    /* Ethernet (IPATH) gen debug */
+#define __IPATH_IPATHWARN   0x20000    /* Ethernet (IPATH) warnings */
+#define __IPATH_IPATHERR    0x40000    /* Ethernet (IPATH) errors */
+#define __IPATH_IPATHPD     0x80000    /* Ethernet (IPATH) packet dump */
+#define __IPATH_IPATHTABLE  0x100000   /* Ethernet (IPATH) table dump */
+#define __IPATH_LINKVERBDBG 0x200000   /* very verbose linkchange debug */
+
+#else                          /* _IPATH_DEBUGGING */
+
+/*
+ * define all of these even with debugging off, for the few places that do
+ * if (infinipath_debug & __IPATH_xyzzy), but in a way that will make the
+ * compiler eliminate the code
+ */
+
+#define __IPATH_INFO      0x0  /* generic low verbosity stuff */
+#define __IPATH_DBG       0x0  /* generic debug */
+#define __IPATH_TRSAMPLE  0x0  /* generate trace buffer sample entries */
+#define __IPATH_VERBDBG   0x0  /* very verbose debug */
+#define __IPATH_PKTDBG    0x0  /* print packet data */
+#define __IPATH_PROCDBG   0x0  /* process startup (init)/exit messages */
+/* print mmap/fault stuff, not using VDBG any more */
+#define __IPATH_MMDBG     0x0
+#define __IPATH_EPKTDBG   0x0  /* print ethernet packet data */
+#define __IPATH_IPATHDBG  0x0  /* Ethernet (IPATH) gen debug on */
+#define __IPATH_IPATHWARN 0x0  /* Ethernet (IPATH) warnings on   */
+#define __IPATH_IPATHERR  0x0  /* Ethernet (IPATH) errors on   */
+#define __IPATH_IPATHPD   0x0  /* Ethernet (IPATH) packet dump on   */
+#define __IPATH_IPATHTABLE 0x0 /* Ethernet (IPATH) table dump on   */
+#define __IPATH_LINKVERBDBG 0x0        /* very verbose linkchange debug */
+
+#endif                         /* _IPATH_DEBUGGING */
+
+#define __IPATH_VERBOSEDBG __IPATH_VERBDBG
+
+#endif                         /* _IPATH_DEBUG_H */
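
With _IPATH_DEBUGGING off, every mask above becomes 0, so a test like if (infinipath_debug & __IPATH_DBG) is a constant false and the compiler drops the body entirely. A hedged sketch of such a call site (not part of the patch; the macro name is hypothetical, the driver's real print helpers live in ipath_kernel.h):

/* Illustrative sketch (not part of the patch): the kind of call site the
 * zero-valued masks are designed to let the compiler eliminate.  The macro
 * name is hypothetical. */
extern unsigned infinipath_debug;

#define example_ipath_dbg(fmt, ...)					  \
	do {								  \
		if (infinipath_debug & __IPATH_DBG)			  \
			printk(KERN_DEBUG "ipath: " fmt, ##__VA_ARGS__);  \
	} while (0)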
diff --git a/drivers/staging/rdma/ipath/ipath_diag.c b/drivers/staging/rdma/ipath/ipath_diag.c
new file mode 100644 (file)
index 0000000..45802e9
--- /dev/null
@@ -0,0 +1,551 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * This file contains support for diagnostic functions.  It is accessed by
+ * opening the ipath_diag device, normally minor number 129.  Diagnostic use
+ * of the InfiniPath chip may render the chip or board unusable until the
+ * driver is unloaded, or in some cases, until the system is rebooted.
+ *
+ * Accesses to the chip through this interface are not similar to going
+ * through the /sys/bus/pci resource mmap interface.
+ */
+
+#include <linux/io.h>
+#include <linux/pci.h>
+#include <linux/vmalloc.h>
+#include <linux/fs.h>
+#include <linux/export.h>
+#include <asm/uaccess.h>
+
+#include "ipath_kernel.h"
+#include "ipath_common.h"
+
+int ipath_diag_inuse;
+static int diag_set_link;
+
+static int ipath_diag_open(struct inode *in, struct file *fp);
+static int ipath_diag_release(struct inode *in, struct file *fp);
+static ssize_t ipath_diag_read(struct file *fp, char __user *data,
+                              size_t count, loff_t *off);
+static ssize_t ipath_diag_write(struct file *fp, const char __user *data,
+                               size_t count, loff_t *off);
+
+static const struct file_operations diag_file_ops = {
+       .owner = THIS_MODULE,
+       .write = ipath_diag_write,
+       .read = ipath_diag_read,
+       .open = ipath_diag_open,
+       .release = ipath_diag_release,
+       .llseek = default_llseek,
+};
+
+static ssize_t ipath_diagpkt_write(struct file *fp,
+                                  const char __user *data,
+                                  size_t count, loff_t *off);
+
+static const struct file_operations diagpkt_file_ops = {
+       .owner = THIS_MODULE,
+       .write = ipath_diagpkt_write,
+       .llseek = noop_llseek,
+};
+
+static atomic_t diagpkt_count = ATOMIC_INIT(0);
+static struct cdev *diagpkt_cdev;
+static struct device *diagpkt_dev;
+
+int ipath_diag_add(struct ipath_devdata *dd)
+{
+       char name[16];
+       int ret = 0;
+
+       if (atomic_inc_return(&diagpkt_count) == 1) {
+               ret = ipath_cdev_init(IPATH_DIAGPKT_MINOR,
+                                     "ipath_diagpkt", &diagpkt_file_ops,
+                                     &diagpkt_cdev, &diagpkt_dev);
+
+               if (ret) {
+                       ipath_dev_err(dd, "Couldn't create ipath_diagpkt "
+                                     "device: %d", ret);
+                       goto done;
+               }
+       }
+
+       snprintf(name, sizeof(name), "ipath_diag%d", dd->ipath_unit);
+
+       ret = ipath_cdev_init(IPATH_DIAG_MINOR_BASE + dd->ipath_unit, name,
+                             &diag_file_ops, &dd->diag_cdev,
+                             &dd->diag_dev);
+       if (ret)
+               ipath_dev_err(dd, "Couldn't create %s device: %d",
+                             name, ret);
+
+done:
+       return ret;
+}
+
+void ipath_diag_remove(struct ipath_devdata *dd)
+{
+       if (atomic_dec_and_test(&diagpkt_count))
+               ipath_cdev_cleanup(&diagpkt_cdev, &diagpkt_dev);
+
+       ipath_cdev_cleanup(&dd->diag_cdev, &dd->diag_dev);
+}
+
+/**
+ * ipath_read_umem64 - read a 64-bit quantity from the chip into user space
+ * @dd: the infinipath device
+ * @uaddr: the location to store the data in user memory
+ * @caddr: the source chip address (full pointer, not offset)
+ * @count: number of bytes to copy (multiple of 32 bits)
+ *
+ * This function also localizes all chip memory accesses.
+ * The copy should be written such that we read full cacheline packets
+ * from the chip.  This is usually used for a single qword
+ *
+ * NOTE:  This assumes the chip address is 64-bit aligned.
+ */
+static int ipath_read_umem64(struct ipath_devdata *dd, void __user *uaddr,
+                            const void __iomem *caddr, size_t count)
+{
+       const u64 __iomem *reg_addr = caddr;
+       const u64 __iomem *reg_end = reg_addr + (count / sizeof(u64));
+       int ret;
+
+       /* not very efficient, but it works for now */
+       if (reg_addr < dd->ipath_kregbase || reg_end > dd->ipath_kregend) {
+               ret = -EINVAL;
+               goto bail;
+       }
+       while (reg_addr < reg_end) {
+               u64 data = readq(reg_addr);
+               if (copy_to_user(uaddr, &data, sizeof(u64))) {
+                       ret = -EFAULT;
+                       goto bail;
+               }
+               reg_addr++;
+               uaddr += sizeof(u64);
+       }
+       ret = 0;
+bail:
+       return ret;
+}
+
+/**
+ * ipath_write_umem64 - write a 64-bit quantity to the chip from user space
+ * @dd: the infinipath device
+ * @caddr: the destination chip address (full pointer, not offset)
+ * @uaddr: the source of the data in user memory
+ * @count: the number of bytes to copy (multiple of 32 bits)
+ *
+ * This is usually used for a single qword
+ * NOTE:  This assumes the chip address is 64-bit aligned.
+ */
+
+static int ipath_write_umem64(struct ipath_devdata *dd, void __iomem *caddr,
+                             const void __user *uaddr, size_t count)
+{
+       u64 __iomem *reg_addr = caddr;
+       const u64 __iomem *reg_end = reg_addr + (count / sizeof(u64));
+       int ret;
+
+       /* not very efficient, but it works for now */
+       if (reg_addr < dd->ipath_kregbase || reg_end > dd->ipath_kregend) {
+               ret = -EINVAL;
+               goto bail;
+       }
+       while (reg_addr < reg_end) {
+               u64 data;
+               if (copy_from_user(&data, uaddr, sizeof(data))) {
+                       ret = -EFAULT;
+                       goto bail;
+               }
+               writeq(data, reg_addr);
+
+               reg_addr++;
+               uaddr += sizeof(u64);
+       }
+       ret = 0;
+bail:
+       return ret;
+}
+
+/**
+ * ipath_read_umem32 - read a 32-bit quantity from the chip into user space
+ * @dd: the infinipath device
+ * @uaddr: the location to store the data in user memory
+ * @caddr: the source chip address (full pointer, not offset)
+ * @count: number of bytes to copy
+ *
+ * read 32 bit values, not 64 bit; for memories that only
+ * support 32 bit reads; usually a single dword.
+ */
+static int ipath_read_umem32(struct ipath_devdata *dd, void __user *uaddr,
+                            const void __iomem *caddr, size_t count)
+{
+       const u32 __iomem *reg_addr = caddr;
+       const u32 __iomem *reg_end = reg_addr + (count / sizeof(u32));
+       int ret;
+
+       if (reg_addr < (u32 __iomem *) dd->ipath_kregbase ||
+           reg_end > (u32 __iomem *) dd->ipath_kregend) {
+               ret = -EINVAL;
+               goto bail;
+       }
+       /* not very efficient, but it works for now */
+       while (reg_addr < reg_end) {
+               u32 data = readl(reg_addr);
+               if (copy_to_user(uaddr, &data, sizeof(data))) {
+                       ret = -EFAULT;
+                       goto bail;
+               }
+
+               reg_addr++;
+               uaddr += sizeof(u32);
+
+       }
+       ret = 0;
+bail:
+       return ret;
+}
+
+/**
+ * ipath_write_umem32 - write a 32-bit quantity to the chip from user space
+ * @dd: the infinipath device
+ * @caddr: the destination chip address (full pointer, not offset)
+ * @uaddr: the source of the data in user memory
+ * @count: number of bytes to copy
+ *
+ * write 32 bit values, not 64 bit; for memories that only
+ * support 32 bit writes; usually a single dword.
+ */
+
+static int ipath_write_umem32(struct ipath_devdata *dd, void __iomem *caddr,
+                             const void __user *uaddr, size_t count)
+{
+       u32 __iomem *reg_addr = caddr;
+       const u32 __iomem *reg_end = reg_addr + (count / sizeof(u32));
+       int ret;
+
+       if (reg_addr < (u32 __iomem *) dd->ipath_kregbase ||
+           reg_end > (u32 __iomem *) dd->ipath_kregend) {
+               ret = -EINVAL;
+               goto bail;
+       }
+       while (reg_addr < reg_end) {
+               u32 data;
+               if (copy_from_user(&data, uaddr, sizeof(data))) {
+                       ret = -EFAULT;
+                       goto bail;
+               }
+               writel(data, reg_addr);
+
+               reg_addr++;
+               uaddr += sizeof(u32);
+       }
+       ret = 0;
+bail:
+       return ret;
+}
+
+static int ipath_diag_open(struct inode *in, struct file *fp)
+{
+       int unit = iminor(in) - IPATH_DIAG_MINOR_BASE;
+       struct ipath_devdata *dd;
+       int ret;
+
+       mutex_lock(&ipath_mutex);
+
+       if (ipath_diag_inuse) {
+               ret = -EBUSY;
+               goto bail;
+       }
+
+       dd = ipath_lookup(unit);
+
+       if (dd == NULL || !(dd->ipath_flags & IPATH_PRESENT) ||
+           !dd->ipath_kregbase) {
+               ret = -ENODEV;
+               goto bail;
+       }
+
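+       /*
+        * ipath_diag_inuse is a small state machine: -2 means just opened
+        * (only the initial 8-byte read at offset 0 is allowed), -1 means
+        * that read has been done (the matching 8-byte write is next), 1
+        * means fully in use, and 0 means free.  See ipath_diag_read() and
+        * ipath_diag_write() below.
+        */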
+       fp->private_data = dd;
+       ipath_diag_inuse = -2;
+       diag_set_link = 0;
+       ret = 0;
+
+       /* Only expose a way to reset the device if we
+          make it into diag mode. */
+       ipath_expose_reset(&dd->pcidev->dev);
+
+bail:
+       mutex_unlock(&ipath_mutex);
+
+       return ret;
+}
+
+/**
+ * ipath_diagpkt_write - write an IB packet
+ * @fp: the diag data device file pointer
+ * @data: ipath_diag_pkt structure saying where to get the packet
+ * @count: size of data to write
+ * @off: unused by this code
+ */
+static ssize_t ipath_diagpkt_write(struct file *fp,
+                                  const char __user *data,
+                                  size_t count, loff_t *off)
+{
+       u32 __iomem *piobuf;
+       u32 plen, pbufn, maxlen_reserve;
+       struct ipath_diag_pkt odp;
+       struct ipath_diag_xpkt dp;
+       u32 *tmpbuf = NULL;
+       struct ipath_devdata *dd;
+       ssize_t ret = 0;
+       u64 val;
+       u32 l_state, lt_state; /* LinkState, LinkTrainingState */
+
+
+       if (count == sizeof(dp)) {
+               if (copy_from_user(&dp, data, sizeof(dp))) {
+                       ret = -EFAULT;
+                       goto bail;
+               }
+       } else if (count == sizeof(odp)) {
+               if (copy_from_user(&odp, data, sizeof(odp))) {
+                       ret = -EFAULT;
+                       goto bail;
+               }
+               dp.len = odp.len;
+               dp.unit = odp.unit;
+               dp.data = odp.data;
+               dp.pbc_wd = 0;
+       } else {
+               ret = -EINVAL;
+               goto bail;
+       }
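+
+       /*
+        * The older ipath_diag_pkt layout (second branch above) has no
+        * pbc_wd field; leaving dp.pbc_wd at 0 makes the code below fill
+        * in a default PBC length word.
+        */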
+
+       /* send count must be an exact number of dwords */
+       if (dp.len & 3) {
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       plen = dp.len >> 2;
+
+       dd = ipath_lookup(dp.unit);
+       if (!dd || !(dd->ipath_flags & IPATH_PRESENT) ||
+           !dd->ipath_kregbase) {
+               ipath_cdbg(VERBOSE, "illegal unit %u for diag data send\n",
+                          dp.unit);
+               ret = -ENODEV;
+               goto bail;
+       }
+
+       if (ipath_diag_inuse && !diag_set_link &&
+           !(dd->ipath_flags & IPATH_LINKACTIVE)) {
+               diag_set_link = 1;
+               ipath_cdbg(VERBOSE, "Trying to set link active for "
+                          "diag pkt\n");
+               ipath_set_linkstate(dd, IPATH_IB_LINKARM);
+               ipath_set_linkstate(dd, IPATH_IB_LINKACTIVE);
+       }
+
+       if (!(dd->ipath_flags & IPATH_INITTED)) {
+               /* no hardware, freeze, etc. */
+               ipath_cdbg(VERBOSE, "unit %u not usable\n", dd->ipath_unit);
+               ret = -ENODEV;
+               goto bail;
+       }
+       /*
+        * Want to skip check for l_state if using custom PBC,
+        * because we might be trying to force an SM packet out.
+        * As a first cut, skip _all_ state checking in that case.
+        */
+       val = ipath_ib_state(dd, dd->ipath_lastibcstat);
+       lt_state = ipath_ib_linktrstate(dd, dd->ipath_lastibcstat);
+       l_state = ipath_ib_linkstate(dd, dd->ipath_lastibcstat);
+       if (!dp.pbc_wd && (lt_state != INFINIPATH_IBCS_LT_STATE_LINKUP ||
+           (val != dd->ib_init && val != dd->ib_arm &&
+           val != dd->ib_active))) {
+               ipath_cdbg(VERBOSE, "unit %u not ready (state %llx)\n",
+                          dd->ipath_unit, (unsigned long long) val);
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       /*
+        * need total length before first word written, plus 2 Dwords. One Dword
+        * is for padding so we get the full user data when not aligned on
+        * a word boundary. The other Dword is to make sure we have room for the
+        * ICRC which gets tacked on later.
+        */
+       maxlen_reserve = 2 * sizeof(u32);
+       if (dp.len > dd->ipath_ibmaxlen - maxlen_reserve) {
+               ipath_dbg("Pkt len 0x%x > ibmaxlen %x\n",
+                         dp.len, dd->ipath_ibmaxlen);
+               ret = -EINVAL;
+               goto bail;
+       }
+
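+       /*
+        * One extra dword beyond the payload; once converted to dwords
+        * below, plen also serves as the default PBC length when the
+        * caller left pbc_wd at 0.
+        */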
+       plen = sizeof(u32) + dp.len;
+
+       tmpbuf = vmalloc(plen);
+       if (!tmpbuf) {
+               dev_info(&dd->pcidev->dev, "Unable to allocate tmp buffer, "
+                        "failing\n");
+               ret = -ENOMEM;
+               goto bail;
+       }
+
+       if (copy_from_user(tmpbuf,
+                          (const void __user *) (unsigned long) dp.data,
+                          dp.len)) {
+               ret = -EFAULT;
+               goto bail;
+       }
+
+       plen >>= 2;             /* in dwords */
+
+       piobuf = ipath_getpiobuf(dd, plen, &pbufn);
+       if (!piobuf) {
+               ipath_cdbg(VERBOSE, "No PIO buffers avail for unit %u\n",
+                          dd->ipath_unit);
+               ret = -EBUSY;
+               goto bail;
+       }
+       /* disarm it just to be extra sure */
+       ipath_disarm_piobufs(dd, pbufn, 1);
+
+       if (ipath_debug & __IPATH_PKTDBG)
+               ipath_cdbg(VERBOSE, "unit %u 0x%x+1w pio%d\n",
+                          dd->ipath_unit, plen - 1, pbufn);
+
+       if (dp.pbc_wd == 0)
+               dp.pbc_wd = plen;
+       writeq(dp.pbc_wd, piobuf);
+       /*
+        * Copy all but the trigger word, then flush, so it's written
+        * to chip before trigger word, then write trigger word, then
+        * flush again, so packet is sent.
+        */
+       if (dd->ipath_flags & IPATH_PIO_FLUSH_WC) {
+               ipath_flush_wc();
+               __iowrite32_copy(piobuf + 2, tmpbuf, plen - 1);
+               ipath_flush_wc();
+               __raw_writel(tmpbuf[plen - 1], piobuf + plen + 1);
+       } else
+               __iowrite32_copy(piobuf + 2, tmpbuf, plen);
+
+       ipath_flush_wc();
+
+       ret = sizeof(dp);
+
+bail:
+       vfree(tmpbuf);
+       return ret;
+}
+
+static int ipath_diag_release(struct inode *in, struct file *fp)
+{
+       mutex_lock(&ipath_mutex);
+       ipath_diag_inuse = 0;
+       fp->private_data = NULL;
+       mutex_unlock(&ipath_mutex);
+       return 0;
+}
+
+static ssize_t ipath_diag_read(struct file *fp, char __user *data,
+                              size_t count, loff_t *off)
+{
+       struct ipath_devdata *dd = fp->private_data;
+       void __iomem *kreg_base;
+       ssize_t ret;
+
+       kreg_base = dd->ipath_kregbase;
+
+       if (count == 0)
+               ret = 0;
+       else if ((count % 4) || (*off % 4))
+               /* address or length is not 32-bit aligned, hence invalid */
+               ret = -EINVAL;
+       else if (ipath_diag_inuse < 1 && (*off || count != 8))
+               ret = -EINVAL;  /* prevent cat /dev/ipath_diag* */
+       else if ((count % 8) || (*off % 8))
+               /* address or length not 64-bit aligned; do 32-bit reads */
+               ret = ipath_read_umem32(dd, data, kreg_base + *off, count);
+       else
+               ret = ipath_read_umem64(dd, data, kreg_base + *off, count);
+
+       if (ret >= 0) {
+               *off += count;
+               ret = count;
+               if (ipath_diag_inuse == -2)
+                       ipath_diag_inuse++;
+       }
+
+       return ret;
+}
+
+static ssize_t ipath_diag_write(struct file *fp, const char __user *data,
+                               size_t count, loff_t *off)
+{
+       struct ipath_devdata *dd = fp->private_data;
+       void __iomem *kreg_base;
+       ssize_t ret;
+
+       kreg_base = dd->ipath_kregbase;
+
+       if (count == 0)
+               ret = 0;
+       else if ((count % 4) || (*off % 4))
+               /* address or length is not 32-bit aligned, hence invalid */
+               ret = -EINVAL;
+       else if ((ipath_diag_inuse == -1 && (*off || count != 8)) ||
+                ipath_diag_inuse == -2)  /* read qw off 0, write qw off 0 */
+               ret = -EINVAL;  /* before any other write allowed */
+       else if ((count % 8) || (*off % 8))
+               /* address or length not 64-bit aligned; do 32-bit writes */
+               ret = ipath_write_umem32(dd, kreg_base + *off, data, count);
+       else
+               ret = ipath_write_umem64(dd, kreg_base + *off, data, count);
+
+       if (ret >= 0) {
+               *off += count;
+               ret = count;
+               if (ipath_diag_inuse == -1)
+                       ipath_diag_inuse = 1; /* all read/write OK now */
+       }
+
+       return ret;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_dma.c b/drivers/staging/rdma/ipath/ipath_dma.c
new file mode 100644 (file)
index 0000000..123a8c0
--- /dev/null
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2006 QLogic, Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/scatterlist.h>
+#include <linux/gfp.h>
+#include <rdma/ib_verbs.h>
+
+#include "ipath_verbs.h"
+
+#define BAD_DMA_ADDRESS ((u64) 0)
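+/*
+ * 0 works as the "bad address" sentinel because the handles produced here
+ * are kernel virtual addresses: page_address() only returns NULL for
+ * unmapped highmem pages, which the map routines below treat as failures.
+ */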
+
+/*
+ * The following functions implement driver specific replacements
+ * for the ib_dma_*() functions.
+ *
+ * These functions return kernel virtual addresses instead of
+ * device bus addresses since the driver uses the CPU to copy
+ * data instead of using hardware DMA.
+ */
+
+static int ipath_mapping_error(struct ib_device *dev, u64 dma_addr)
+{
+       return dma_addr == BAD_DMA_ADDRESS;
+}
+
+static u64 ipath_dma_map_single(struct ib_device *dev,
+                               void *cpu_addr, size_t size,
+                               enum dma_data_direction direction)
+{
+       BUG_ON(!valid_dma_direction(direction));
+       return (u64) cpu_addr;
+}
+
+static void ipath_dma_unmap_single(struct ib_device *dev,
+                                  u64 addr, size_t size,
+                                  enum dma_data_direction direction)
+{
+       BUG_ON(!valid_dma_direction(direction));
+}
+
+static u64 ipath_dma_map_page(struct ib_device *dev,
+                             struct page *page,
+                             unsigned long offset,
+                             size_t size,
+                             enum dma_data_direction direction)
+{
+       u64 addr;
+
+       BUG_ON(!valid_dma_direction(direction));
+
+       if (offset + size > PAGE_SIZE) {
+               addr = BAD_DMA_ADDRESS;
+               goto done;
+       }
+
+       addr = (u64) page_address(page);
+       if (addr)
+               addr += offset;
+       /* TODO: handle highmem pages */
+
+done:
+       return addr;
+}
+
+static void ipath_dma_unmap_page(struct ib_device *dev,
+                                u64 addr, size_t size,
+                                enum dma_data_direction direction)
+{
+       BUG_ON(!valid_dma_direction(direction));
+}
+
+static int ipath_map_sg(struct ib_device *dev, struct scatterlist *sgl,
+                       int nents, enum dma_data_direction direction)
+{
+       struct scatterlist *sg;
+       u64 addr;
+       int i;
+       int ret = nents;
+
+       BUG_ON(!valid_dma_direction(direction));
+
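+       /*
+        * No real mapping happens here: dma_address is simply the kernel
+        * virtual address of each segment, and a NULL page_address()
+        * (unmapped highmem) fails the whole mapping.
+        */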
+       for_each_sg(sgl, sg, nents, i) {
+               addr = (u64) page_address(sg_page(sg));
+               /* TODO: handle highmem pages */
+               if (!addr) {
+                       ret = 0;
+                       break;
+               }
+               sg->dma_address = addr + sg->offset;
+#ifdef CONFIG_NEED_SG_DMA_LENGTH
+               sg->dma_length = sg->length;
+#endif
+       }
+       return ret;
+}
+
+static void ipath_unmap_sg(struct ib_device *dev,
+                          struct scatterlist *sg, int nents,
+                          enum dma_data_direction direction)
+{
+       BUG_ON(!valid_dma_direction(direction));
+}
+
+static void ipath_sync_single_for_cpu(struct ib_device *dev,
+                                     u64 addr,
+                                     size_t size,
+                                     enum dma_data_direction dir)
+{
+}
+
+static void ipath_sync_single_for_device(struct ib_device *dev,
+                                        u64 addr,
+                                        size_t size,
+                                        enum dma_data_direction dir)
+{
+}
+
+static void *ipath_dma_alloc_coherent(struct ib_device *dev, size_t size,
+                                     u64 *dma_handle, gfp_t flag)
+{
+       struct page *p;
+       void *addr = NULL;
+
+       p = alloc_pages(flag, get_order(size));
+       if (p)
+               addr = page_address(p);
+       if (dma_handle)
+               *dma_handle = (u64) addr;
+       return addr;
+}
+
+static void ipath_dma_free_coherent(struct ib_device *dev, size_t size,
+                                   void *cpu_addr, u64 dma_handle)
+{
+       free_pages((unsigned long) cpu_addr, get_order(size));
+}
+
+struct ib_dma_mapping_ops ipath_dma_mapping_ops = {
+       .mapping_error = ipath_mapping_error,
+       .map_single = ipath_dma_map_single,
+       .unmap_single = ipath_dma_unmap_single,
+       .map_page = ipath_dma_map_page,
+       .unmap_page = ipath_dma_unmap_page,
+       .map_sg = ipath_map_sg,
+       .unmap_sg = ipath_unmap_sg,
+       .sync_single_for_cpu = ipath_sync_single_for_cpu,
+       .sync_single_for_device = ipath_sync_single_for_device,
+       .alloc_coherent = ipath_dma_alloc_coherent,
+       .free_coherent = ipath_dma_free_coherent
+};
diff --git a/drivers/staging/rdma/ipath/ipath_driver.c b/drivers/staging/rdma/ipath/ipath_driver.c
new file mode 100644 (file)
index 0000000..871dbe5
--- /dev/null
@@ -0,0 +1,2789 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/idr.h>
+#include <linux/pci.h>
+#include <linux/io.h>
+#include <linux/delay.h>
+#include <linux/netdevice.h>
+#include <linux/vmalloc.h>
+#include <linux/bitmap.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#ifdef CONFIG_X86_64
+#include <asm/pat.h>
+#endif
+
+#include "ipath_kernel.h"
+#include "ipath_verbs.h"
+
+static void ipath_update_pio_bufs(struct ipath_devdata *);
+
+const char *ipath_get_unit_name(int unit)
+{
+       static char iname[16];
+       snprintf(iname, sizeof iname, "infinipath%u", unit);
+       return iname;
+}
+
+#define DRIVER_LOAD_MSG "QLogic " IPATH_DRV_NAME " loaded: "
+#define PFX IPATH_DRV_NAME ": "
+
+/*
+ * The size has to be longer than this string, so we can append
+ * board/chip information to it in the init code.
+ */
+const char ib_ipath_version[] = IPATH_IDSTR "\n";
+
+static struct idr unit_table;
+DEFINE_SPINLOCK(ipath_devs_lock);
+LIST_HEAD(ipath_dev_list);
+
+wait_queue_head_t ipath_state_wait;
+
+unsigned ipath_debug = __IPATH_INFO;
+
+module_param_named(debug, ipath_debug, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(debug, "mask for debug prints");
+EXPORT_SYMBOL_GPL(ipath_debug);
+
+unsigned ipath_mtu4096 = 1; /* max 4KB IB mtu by default, if supported */
+module_param_named(mtu4096, ipath_mtu4096, uint, S_IRUGO);
+MODULE_PARM_DESC(mtu4096, "enable MTU of 4096 bytes, if supported");
+
+static unsigned ipath_hol_timeout_ms = 13000;
+module_param_named(hol_timeout_ms, ipath_hol_timeout_ms, uint, S_IRUGO);
+MODULE_PARM_DESC(hol_timeout_ms,
+       "duration of user app suspension after link failure");
+
+unsigned ipath_linkrecovery = 1;
+module_param_named(linkrecovery, ipath_linkrecovery, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(linkrecovery, "enable workaround for link recovery issue");
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("QLogic <support@qlogic.com>");
+MODULE_DESCRIPTION("QLogic InfiniPath driver");
+
+/*
+ * Table to translate the LINKTRAININGSTATE portion of
+ * IBCStatus to a human-readable form.
+ */
+const char *ipath_ibcstatus_str[] = {
+       "Disabled",
+       "LinkUp",
+       "PollActive",
+       "PollQuiet",
+       "SleepDelay",
+       "SleepQuiet",
+       "LState6",              /* unused */
+       "LState7",              /* unused */
+       "CfgDebounce",
+       "CfgRcvfCfg",
+       "CfgWaitRmt",
+       "CfgIdle",
+       "RecovRetrain",
+       "CfgTxRevLane",         /* unused before IBA7220 */
+       "RecovWaitRmt",
+       "RecovIdle",
+       /* below were added for IBA7220 */
+       "CfgEnhanced",
+       "CfgTest",
+       "CfgWaitRmtTest",
+       "CfgWaitCfgEnhanced",
+       "SendTS_T",
+       "SendTstIdles",
+       "RcvTS_T",
+       "SendTst_TS1s",
+       "LTState18", "LTState19", "LTState1A", "LTState1B",
+       "LTState1C", "LTState1D", "LTState1E", "LTState1F"
+};
+
+static void ipath_remove_one(struct pci_dev *);
+static int ipath_init_one(struct pci_dev *, const struct pci_device_id *);
+
+/* Only needed for registration, nothing else needs this info */
+#define PCI_VENDOR_ID_PATHSCALE 0x1fc1
+#define PCI_DEVICE_ID_INFINIPATH_HT 0xd
+
+/* Number of seconds before our card status check...  */
+#define STATUS_TIMEOUT 60
+
+static const struct pci_device_id ipath_pci_tbl[] = {
+       { PCI_DEVICE(PCI_VENDOR_ID_PATHSCALE, PCI_DEVICE_ID_INFINIPATH_HT) },
+       { 0, }
+};
+
+MODULE_DEVICE_TABLE(pci, ipath_pci_tbl);
+
+static struct pci_driver ipath_driver = {
+       .name = IPATH_DRV_NAME,
+       .probe = ipath_init_one,
+       .remove = ipath_remove_one,
+       .id_table = ipath_pci_tbl,
+       .driver = {
+               .groups = ipath_driver_attr_groups,
+       },
+};
+
+static inline void read_bars(struct ipath_devdata *dd, struct pci_dev *dev,
+                            u32 *bar0, u32 *bar1)
+{
+       int ret;
+
+       ret = pci_read_config_dword(dev, PCI_BASE_ADDRESS_0, bar0);
+       if (ret)
+               ipath_dev_err(dd, "failed to read bar0 before enable: "
+                             "error %d\n", -ret);
+
+       ret = pci_read_config_dword(dev, PCI_BASE_ADDRESS_1, bar1);
+       if (ret)
+               ipath_dev_err(dd, "failed to read bar1 before enable: "
+                             "error %d\n", -ret);
+
+       ipath_dbg("Read bar0 %x bar1 %x\n", *bar0, *bar1);
+}
+
+static void ipath_free_devdata(struct pci_dev *pdev,
+                              struct ipath_devdata *dd)
+{
+       unsigned long flags;
+
+       pci_set_drvdata(pdev, NULL);
+
+       if (dd->ipath_unit != -1) {
+               spin_lock_irqsave(&ipath_devs_lock, flags);
+               idr_remove(&unit_table, dd->ipath_unit);
+               list_del(&dd->ipath_list);
+               spin_unlock_irqrestore(&ipath_devs_lock, flags);
+       }
+       vfree(dd);
+}
+
+static struct ipath_devdata *ipath_alloc_devdata(struct pci_dev *pdev)
+{
+       unsigned long flags;
+       struct ipath_devdata *dd;
+       int ret;
+
+       dd = vzalloc(sizeof(*dd));
+       if (!dd) {
+               dd = ERR_PTR(-ENOMEM);
+               goto bail;
+       }
+       dd->ipath_unit = -1;
+
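+       /*
+        * idr_preload() stocks the IDR cache outside the lock so that the
+        * idr_alloc(..., GFP_NOWAIT) below need not sleep while
+        * ipath_devs_lock is held.
+        */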
+       idr_preload(GFP_KERNEL);
+       spin_lock_irqsave(&ipath_devs_lock, flags);
+
+       ret = idr_alloc(&unit_table, dd, 0, 0, GFP_NOWAIT);
+       if (ret < 0) {
+               printk(KERN_ERR IPATH_DRV_NAME
+                      ": Could not allocate unit ID: error %d\n", -ret);
+               ipath_free_devdata(pdev, dd);
+               dd = ERR_PTR(ret);
+               goto bail_unlock;
+       }
+       dd->ipath_unit = ret;
+
+       dd->pcidev = pdev;
+       pci_set_drvdata(pdev, dd);
+
+       list_add(&dd->ipath_list, &ipath_dev_list);
+
+bail_unlock:
+       spin_unlock_irqrestore(&ipath_devs_lock, flags);
+       idr_preload_end();
+bail:
+       return dd;
+}
+
+static inline struct ipath_devdata *__ipath_lookup(int unit)
+{
+       return idr_find(&unit_table, unit);
+}
+
+struct ipath_devdata *ipath_lookup(int unit)
+{
+       struct ipath_devdata *dd;
+       unsigned long flags;
+
+       spin_lock_irqsave(&ipath_devs_lock, flags);
+       dd = __ipath_lookup(unit);
+       spin_unlock_irqrestore(&ipath_devs_lock, flags);
+
+       return dd;
+}
+
+int ipath_count_units(int *npresentp, int *nupp, int *maxportsp)
+{
+       int nunits, npresent, nup;
+       struct ipath_devdata *dd;
+       unsigned long flags;
+       int maxports;
+
+       nunits = npresent = nup = maxports = 0;
+
+       spin_lock_irqsave(&ipath_devs_lock, flags);
+
+       list_for_each_entry(dd, &ipath_dev_list, ipath_list) {
+               nunits++;
+               if ((dd->ipath_flags & IPATH_PRESENT) && dd->ipath_kregbase)
+                       npresent++;
+               if (dd->ipath_lid &&
+                   !(dd->ipath_flags & (IPATH_DISABLED | IPATH_LINKDOWN
+                                        | IPATH_LINKUNK)))
+                       nup++;
+               if (dd->ipath_cfgports > maxports)
+                       maxports = dd->ipath_cfgports;
+       }
+
+       spin_unlock_irqrestore(&ipath_devs_lock, flags);
+
+       if (npresentp)
+               *npresentp = npresent;
+       if (nupp)
+               *nupp = nup;
+       if (maxportsp)
+               *maxportsp = maxports;
+
+       return nunits;
+}
+
+/*
+ * These next two routines are placeholders in case we don't have per-arch
+ * code for controlling write combining.  If explicit control of write
+ * combining is not available, performance will probably be awful.
+ */
+
+int __attribute__((weak)) ipath_enable_wc(struct ipath_devdata *dd)
+{
+       return -EOPNOTSUPP;
+}
+
+void __attribute__((weak)) ipath_disable_wc(struct ipath_devdata *dd)
+{
+}
+
+/*
+ * Perform a PIO buffer bandwidth write test, to verify proper system
+ * configuration.  Even when all the setup calls work, occasionally
+ * BIOS or other issues can prevent write combining from working, or
+ * can cause other bandwidth problems to the chip.
+ *
+ * This test simply writes the same buffer over and over again, and
+ * measures close to the peak bandwidth to the chip (not testing
+ * data bandwidth to the wire).   On chips that use an address-based
+ * trigger to send packets to the wire, this is easy.  On chips that
+ * use a count to trigger, we want to make sure that the packet doesn't
+ * go out on the wire, or trigger flow control checks.
+ */
+static void ipath_verify_pioperf(struct ipath_devdata *dd)
+{
+       u32 pbnum, cnt, lcnt;
+       u32 __iomem *piobuf;
+       u32 *addr;
+       u64 msecs, emsecs;
+
+       piobuf = ipath_getpiobuf(dd, 0, &pbnum);
+       if (!piobuf) {
+               dev_info(&dd->pcidev->dev,
+                       "No PIObufs for checking perf, skipping\n");
+               return;
+       }
+
+       /*
+        * Enough to give us a reasonable test, less than piobuf size, and
+        * likely multiple of store buffer length.
+        */
+       cnt = 1024;
+
+       addr = vmalloc(cnt);
+       if (!addr) {
+               dev_info(&dd->pcidev->dev,
+                       "Couldn't get memory for checking PIO perf,"
+                       " skipping\n");
+               goto done;
+       }
+
+       preempt_disable();  /* we want reasonably accurate elapsed time */
+       msecs = 1 + jiffies_to_msecs(jiffies);
+       for (lcnt = 0; lcnt < 10000U; lcnt++) {
+               /* wait until we cross msec boundary */
+               if (jiffies_to_msecs(jiffies) >= msecs)
+                       break;
+               udelay(1);
+       }
+
+       ipath_disable_armlaunch(dd);
+
+       /*
+        * length 0, no dwords actually sent, and mark as VL15
+        * on chips where that may matter (due to IB flowcontrol)
+        */
+       if ((dd->ipath_flags & IPATH_HAS_PBC_CNT))
+               writeq(1UL << 63, piobuf);
+       else
+               writeq(0, piobuf);
+       ipath_flush_wc();
+
+       /*
+        * this is only roughly accurate, since even with preempt we
+        * still take interrupts that could take a while.   Running for
+        * >= 5 msec seems to get us "close enough" to accurate values
+        */
+       msecs = jiffies_to_msecs(jiffies);
+       for (emsecs = lcnt = 0; emsecs <= 5UL; lcnt++) {
+               __iowrite32_copy(piobuf + 64, addr, cnt >> 2);
+               emsecs = jiffies_to_msecs(jiffies) - msecs;
+       }
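+
+       /*
+        * Each loop iteration copies cnt = 1024 bytes, so lcnt / emsecs is
+        * roughly KiB per msec, i.e. MiB per second, which is what the
+        * messages below report.
+        */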
+
+       /* 1 GiB/sec, slightly over IB SDR line rate */
+       if (lcnt < (emsecs * 1024U))
+               ipath_dev_err(dd,
+                       "Performance problem: bandwidth to PIO buffers is "
+                       "only %u MiB/sec\n",
+                       lcnt / (u32) emsecs);
+       else
+               ipath_dbg("PIO buffer bandwidth %u MiB/sec is OK\n",
+                       lcnt / (u32) emsecs);
+
+       preempt_enable();
+
+       vfree(addr);
+
+done:
+       /* disarm piobuf, so it's available again */
+       ipath_disarm_piobufs(dd, pbnum, 1);
+       ipath_enable_armlaunch(dd);
+}
+
+static void cleanup_device(struct ipath_devdata *dd);
+
+static int ipath_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+       int ret, len, j;
+       struct ipath_devdata *dd;
+       unsigned long long addr;
+       u32 bar0 = 0, bar1 = 0;
+
+#ifdef CONFIG_X86_64
+       if (pat_enabled()) {
+               pr_warn("ipath needs PAT disabled, boot with nopat kernel parameter\n");
+               ret = -ENODEV;
+               goto bail;
+       }
+#endif
+
+       dd = ipath_alloc_devdata(pdev);
+       if (IS_ERR(dd)) {
+               ret = PTR_ERR(dd);
+               printk(KERN_ERR IPATH_DRV_NAME
+                      ": Could not allocate devdata: error %d\n", -ret);
+               goto bail;
+       }
+
+       ipath_cdbg(VERBOSE, "initializing unit #%u\n", dd->ipath_unit);
+
+       ret = pci_enable_device(pdev);
+       if (ret) {
+               /* This can happen iff:
+                *
+                * We did a chip reset, and then failed to reprogram the
+                * BAR, or the chip reset due to an internal error.  We then
+                * unloaded the driver and reloaded it.
+                *
+                * Both reset cases set the BAR back to initial state.  For
+                * the latter case, the AER sticky error bit at offset 0x718
+                * should be set, but the Linux kernel doesn't yet know
+                * about that, it appears.  If the original BAR was retained
+                * in the kernel data structures, this may be OK.
+                */
+               ipath_dev_err(dd, "enable unit %d failed: error %d\n",
+                             dd->ipath_unit, -ret);
+               goto bail_devdata;
+       }
+       addr = pci_resource_start(pdev, 0);
+       len = pci_resource_len(pdev, 0);
+       ipath_cdbg(VERBOSE, "regbase (0) %llx len %d irq %d, vend %x/%x "
+                  "driver_data %lx\n", addr, len, pdev->irq, ent->vendor,
+                  ent->device, ent->driver_data);
+
+       read_bars(dd, pdev, &bar0, &bar1);
+
+       if (!bar1 && !(bar0 & ~0xf)) {
+               if (addr) {
+                       dev_info(&pdev->dev, "BAR is 0 (probable RESET), "
+                                "rewriting as %llx\n", addr);
+                       ret = pci_write_config_dword(
+                               pdev, PCI_BASE_ADDRESS_0, addr);
+                       if (ret) {
+                               ipath_dev_err(dd, "rewrite of BAR0 "
+                                             "failed: err %d\n", -ret);
+                               goto bail_disable;
+                       }
+                       ret = pci_write_config_dword(
+                               pdev, PCI_BASE_ADDRESS_1, addr >> 32);
+                       if (ret) {
+                               ipath_dev_err(dd, "rewrite of BAR1 "
+                                             "failed: err %d\n", -ret);
+                               goto bail_disable;
+                       }
+               } else {
+                       ipath_dev_err(dd, "BAR is 0 (probable RESET), "
+                                     "not usable until reboot\n");
+                       ret = -ENODEV;
+                       goto bail_disable;
+               }
+       }
+
+       ret = pci_request_regions(pdev, IPATH_DRV_NAME);
+       if (ret) {
+               dev_info(&pdev->dev, "pci_request_regions unit %u fails: "
+                        "err %d\n", dd->ipath_unit, -ret);
+               goto bail_disable;
+       }
+
+       ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+       if (ret) {
+               /*
+                * if the 64 bit setup fails, try 32 bit.  Some systems
+                * do not setup 64 bit maps on systems with 2GB or less
+                * memory installed.
+                */
+               ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+               if (ret) {
+                       dev_info(&pdev->dev,
+                               "Unable to set DMA mask for unit %u: %d\n",
+                               dd->ipath_unit, ret);
+                       goto bail_regions;
+               } else {
+                       ipath_dbg("No 64bit DMA mask, used 32 bit mask\n");
+                       ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
+                       if (ret)
+                               dev_info(&pdev->dev,
+                                       "Unable to set DMA consistent mask "
+                                       "for unit %u: %d\n",
+                                       dd->ipath_unit, ret);
+
+               }
+       } else {
+               ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+               if (ret)
+                       dev_info(&pdev->dev,
+                               "Unable to set DMA consistent mask "
+                               "for unit %u: %d\n",
+                               dd->ipath_unit, ret);
+       }
+
+       pci_set_master(pdev);
+
+       /*
+        * Save BARs to rewrite after device reset.  Save all 64 bits of
+        * BAR, just in case.
+        */
+       dd->ipath_pcibar0 = addr;
+       dd->ipath_pcibar1 = addr >> 32;
+       dd->ipath_deviceid = ent->device;       /* save for later use */
+       dd->ipath_vendorid = ent->vendor;
+
+       /* setup the chip-specific functions, as early as possible. */
+       switch (ent->device) {
+       case PCI_DEVICE_ID_INFINIPATH_HT:
+               ipath_init_iba6110_funcs(dd);
+               break;
+
+       default:
+               ipath_dev_err(dd, "Found unknown QLogic deviceid 0x%x, "
+                             "failing\n", ent->device);
+               ret = -ENODEV;
+               goto bail_regions;
+       }
+
+       for (j = 0; j < 6; j++) {
+               if (!pdev->resource[j].start)
+                       continue;
+               ipath_cdbg(VERBOSE, "BAR %d %pR, len %llx\n",
+                          j, &pdev->resource[j],
+                          (unsigned long long)pci_resource_len(pdev, j));
+       }
+
+       if (!addr) {
+               ipath_dev_err(dd, "No valid address in BAR 0!\n");
+               ret = -ENODEV;
+               goto bail_regions;
+       }
+
+       dd->ipath_pcirev = pdev->revision;
+
+#if defined(__powerpc__)
+       /* There isn't a generic way to specify writethrough mappings */
+       dd->ipath_kregbase = __ioremap(addr, len,
+               (_PAGE_NO_CACHE|_PAGE_WRITETHRU));
+#else
+       /* XXX: split this properly to enable on PAT */
+       dd->ipath_kregbase = ioremap_nocache(addr, len);
+#endif
+
+       if (!dd->ipath_kregbase) {
+               ipath_dbg("Unable to map io addr %llx to kvirt, failing\n",
+                         addr);
+               ret = -ENOMEM;
+               goto bail_iounmap;
+       }
+       dd->ipath_kregend = (u64 __iomem *)
+               ((void __iomem *)dd->ipath_kregbase + len);
+       dd->ipath_physaddr = addr;      /* used for io_remap, etc. */
+       /* for user mmap */
+       ipath_cdbg(VERBOSE, "mapped io addr %llx to kregbase %p\n",
+                  addr, dd->ipath_kregbase);
+
+       if (dd->ipath_f_bus(dd, pdev))
+               ipath_dev_err(dd, "Failed to setup config space; "
+                             "continuing anyway\n");
+
+       /*
+        * Set up our interrupt handler; IRQF_SHARED probably isn't needed,
+        * since MSI interrupts shouldn't be shared, but it won't hurt for now.
+        * Check for a 0 irq after we return from chip-specific bus setup,
+        * since that setup can affect it.
+        */
+       if (!dd->ipath_irq)
+               ipath_dev_err(dd, "irq is 0, BIOS error?  Interrupts won't "
+                             "work\n");
+       else {
+               ret = request_irq(dd->ipath_irq, ipath_intr, IRQF_SHARED,
+                                 IPATH_DRV_NAME, dd);
+               if (ret) {
+                       ipath_dev_err(dd, "Couldn't setup irq handler, "
+                                     "irq=%d: %d\n", dd->ipath_irq, ret);
+                       goto bail_iounmap;
+               }
+       }
+
+       ret = ipath_init_chip(dd, 0);   /* do the chip-specific init */
+       if (ret)
+               goto bail_irqsetup;
+
+       ret = ipath_enable_wc(dd);
+
+       if (ret)
+               ret = 0;
+
+       ipath_verify_pioperf(dd);
+
+       ipath_device_create_group(&pdev->dev, dd);
+       ipathfs_add_device(dd);
+       ipath_user_add(dd);
+       ipath_diag_add(dd);
+       ipath_register_ib_device(dd);
+
+       goto bail;
+
+bail_irqsetup:
+       cleanup_device(dd);
+
+       if (dd->ipath_irq)
+               dd->ipath_f_free_irq(dd);
+
+       if (dd->ipath_f_cleanup)
+               dd->ipath_f_cleanup(dd);
+
+bail_iounmap:
+       iounmap((volatile void __iomem *) dd->ipath_kregbase);
+
+bail_regions:
+       pci_release_regions(pdev);
+
+bail_disable:
+       pci_disable_device(pdev);
+
+bail_devdata:
+       ipath_free_devdata(pdev, dd);
+
+bail:
+       return ret;
+}
+
+static void cleanup_device(struct ipath_devdata *dd)
+{
+       int port;
+       struct ipath_portdata **tmp;
+       unsigned long flags;
+
+       if (*dd->ipath_statusp & IPATH_STATUS_CHIP_PRESENT) {
+               /* can't do anything more with chip; needs re-init */
+               *dd->ipath_statusp &= ~IPATH_STATUS_CHIP_PRESENT;
+               if (dd->ipath_kregbase) {
+                       /*
+                        * if we haven't already cleaned up before, clear
+                        * these so any register reads/writes "fail" until
+                        * re-init
+                        */
+                       dd->ipath_kregbase = NULL;
+                       dd->ipath_uregbase = 0;
+                       dd->ipath_sregbase = 0;
+                       dd->ipath_cregbase = 0;
+                       dd->ipath_kregsize = 0;
+               }
+               ipath_disable_wc(dd);
+       }
+
+       if (dd->ipath_spectriggerhit)
+               dev_info(&dd->pcidev->dev, "%lu special trigger hits\n",
+                        dd->ipath_spectriggerhit);
+
+       if (dd->ipath_pioavailregs_dma) {
+               dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
+                                 (void *) dd->ipath_pioavailregs_dma,
+                                 dd->ipath_pioavailregs_phys);
+               dd->ipath_pioavailregs_dma = NULL;
+       }
+       if (dd->ipath_dummy_hdrq) {
+               dma_free_coherent(&dd->pcidev->dev,
+                       dd->ipath_pd[0]->port_rcvhdrq_size,
+                       dd->ipath_dummy_hdrq, dd->ipath_dummy_hdrq_phys);
+               dd->ipath_dummy_hdrq = NULL;
+       }
+
+       if (dd->ipath_pageshadow) {
+               struct page **tmpp = dd->ipath_pageshadow;
+               dma_addr_t *tmpd = dd->ipath_physshadow;
+               int i, cnt = 0;
+
+               ipath_cdbg(VERBOSE, "Unlocking any expTID pages still "
+                          "locked\n");
+               for (port = 0; port < dd->ipath_cfgports; port++) {
+                       int port_tidbase = port * dd->ipath_rcvtidcnt;
+                       int maxtid = port_tidbase + dd->ipath_rcvtidcnt;
+                       for (i = port_tidbase; i < maxtid; i++) {
+                               if (!tmpp[i])
+                                       continue;
+                               pci_unmap_page(dd->pcidev, tmpd[i],
+                                       PAGE_SIZE, PCI_DMA_FROMDEVICE);
+                               ipath_release_user_pages(&tmpp[i], 1);
+                               tmpp[i] = NULL;
+                               cnt++;
+                       }
+               }
+               if (cnt) {
+                       ipath_stats.sps_pageunlocks += cnt;
+                       ipath_cdbg(VERBOSE, "There were still %u expTID "
+                                  "entries locked\n", cnt);
+               }
+               if (ipath_stats.sps_pagelocks ||
+                   ipath_stats.sps_pageunlocks)
+                       ipath_cdbg(VERBOSE, "%llu pages locked, %llu "
+                                  "unlocked via ipath_m{un}lock\n",
+                                  (unsigned long long)
+                                  ipath_stats.sps_pagelocks,
+                                  (unsigned long long)
+                                  ipath_stats.sps_pageunlocks);
+
+               ipath_cdbg(VERBOSE, "Free shadow page tid array at %p\n",
+                          dd->ipath_pageshadow);
+               tmpp = dd->ipath_pageshadow;
+               dd->ipath_pageshadow = NULL;
+               vfree(tmpp);
+
+               dd->ipath_egrtidbase = NULL;
+       }
+
+       /*
+        * free any resources still in use (usually just kernel ports)
+        * at unload; we iterate over portcnt, because that's what we allocate.
+        * We acquire lock to be really paranoid that ipath_pd isn't being
+        * accessed from some interrupt-related code (that should not happen,
+        * but best to be sure).
+        */
+       spin_lock_irqsave(&dd->ipath_uctxt_lock, flags);
+       tmp = dd->ipath_pd;
+       dd->ipath_pd = NULL;
+       spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
+       for (port = 0; port < dd->ipath_portcnt; port++) {
+               struct ipath_portdata *pd = tmp[port];
+               tmp[port] = NULL; /* debugging paranoia */
+               ipath_free_pddata(dd, pd);
+       }
+       kfree(tmp);
+}
+
+static void ipath_remove_one(struct pci_dev *pdev)
+{
+       struct ipath_devdata *dd = pci_get_drvdata(pdev);
+
+       ipath_cdbg(VERBOSE, "removing, pdev=%p, dd=%p\n", pdev, dd);
+
+       /*
+        * disable the IB link early, to be sure no new packets arrive, which
+        * complicates the shutdown process
+        */
+       ipath_shutdown_device(dd);
+
+       flush_workqueue(ib_wq);
+
+       if (dd->verbs_dev)
+               ipath_unregister_ib_device(dd->verbs_dev);
+
+       ipath_diag_remove(dd);
+       ipath_user_remove(dd);
+       ipathfs_remove_device(dd);
+       ipath_device_remove_group(&pdev->dev, dd);
+
+       ipath_cdbg(VERBOSE, "Releasing pci memory regions, dd %p, "
+                  "unit %u\n", dd, (u32) dd->ipath_unit);
+
+       cleanup_device(dd);
+
+       /*
+        * turn off rcv, send, and interrupts for all ports, all drivers
+        * should also hard reset the chip here?
+        * free up port 0 (kernel) rcvhdr, egr bufs, and eventually tid bufs
+        * for all versions of the driver, if they were allocated
+        */
+       if (dd->ipath_irq) {
+               ipath_cdbg(VERBOSE, "unit %u free irq %d\n",
+                          dd->ipath_unit, dd->ipath_irq);
+               dd->ipath_f_free_irq(dd);
+       } else
+               ipath_dbg("irq is 0, not doing free_irq "
+                         "for unit %u\n", dd->ipath_unit);
+       /*
+        * we check for NULL here, because it's outside
+        * the kregbase check, and we need to call it
+        * after the free_irq.  Thus it's possible that
+        * the function pointers were never initialized.
+        */
+       if (dd->ipath_f_cleanup)
+               /* clean up chip-specific stuff */
+               dd->ipath_f_cleanup(dd);
+
+       ipath_cdbg(VERBOSE, "Unmapping kregbase %p\n", dd->ipath_kregbase);
+       iounmap((volatile void __iomem *) dd->ipath_kregbase);
+       pci_release_regions(pdev);
+       ipath_cdbg(VERBOSE, "calling pci_disable_device\n");
+       pci_disable_device(pdev);
+
+       ipath_free_devdata(pdev, dd);
+}
+
+/* general driver use */
+DEFINE_MUTEX(ipath_mutex);
+
+static DEFINE_SPINLOCK(ipath_pioavail_lock);
+
+/**
+ * ipath_disarm_piobufs - cancel a range of PIO buffers
+ * @dd: the infinipath device
+ * @first: the first PIO buffer to cancel
+ * @cnt: the number of PIO buffers to cancel
+ *
+ * cancel a range of PIO buffers, used when they might be armed, but
+ * not triggered.  Used at init to ensure buffer state, and also at user
+ * process close, in case it died while writing to a PIO buffer.
+ * Also used after errors.
+ */
+void ipath_disarm_piobufs(struct ipath_devdata *dd, unsigned first,
+                         unsigned cnt)
+{
+       unsigned i, last = first + cnt;
+       unsigned long flags;
+
+       ipath_cdbg(PKT, "disarm %u PIObufs first=%u\n", cnt, first);
+       for (i = first; i < last; i++) {
+               spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+               /*
+                * The disarm-related bits are write-only, so it
+                * is ok to OR them in with our copy of sendctrl
+                * while we hold the lock.
+                */
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+                       dd->ipath_sendctrl | INFINIPATH_S_DISARM |
+                       (i << INFINIPATH_S_DISARMPIOBUF_SHIFT));
+               /* can't disarm bufs back-to-back per iba7220 spec */
+               ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+               spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+       }
+       /* on some older chips, update may not happen after cancel */
+       ipath_force_pio_avail_update(dd);
+}
+
+/**
+ * ipath_wait_linkstate - wait for an IB link state change to occur
+ * @dd: the infinipath device
+ * @state: the state to wait for
+ * @msecs: the number of milliseconds to wait
+ *
+ * wait up to msecs milliseconds for an IB link state change to occur;
+ * for now, take the easy polling route.  Currently used only by
+ * ipath_set_linkstate.  Returns 0 if the state was reached, otherwise
+ * -ETIMEDOUT.  state can have multiple flags set, to allow for any of
+ * several transitions.
+ */
+int ipath_wait_linkstate(struct ipath_devdata *dd, u32 state, int msecs)
+{
+       dd->ipath_state_wanted = state;
+       wait_event_interruptible_timeout(ipath_state_wait,
+                                        (dd->ipath_flags & state),
+                                        msecs_to_jiffies(msecs));
+       dd->ipath_state_wanted = 0;
+
+       if (!(dd->ipath_flags & state)) {
+               u64 val;
+               ipath_cdbg(VERBOSE, "Didn't reach linkstate %s within %u"
+                          " ms\n",
+                          /* test INIT ahead of DOWN, both can be set */
+                          (state & IPATH_LINKINIT) ? "INIT" :
+                          ((state & IPATH_LINKDOWN) ? "DOWN" :
+                           ((state & IPATH_LINKARMED) ? "ARM" : "ACTIVE")),
+                          msecs);
+               val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus);
+               ipath_cdbg(VERBOSE, "ibcc=%llx ibcstatus=%llx (%s)\n",
+                          (unsigned long long) ipath_read_kreg64(
+                                  dd, dd->ipath_kregs->kr_ibcctrl),
+                          (unsigned long long) val,
+                          ipath_ibcstatus_str[val & dd->ibcs_lts_mask]);
+       }
+       return (dd->ipath_flags & state) ? 0 : -ETIMEDOUT;
+}
+
+static void decode_sdma_errs(struct ipath_devdata *dd, ipath_err_t err,
+       char *buf, size_t blen)
+{
+       static const struct {
+               ipath_err_t err;
+               const char *msg;
+       } errs[] = {
+               { INFINIPATH_E_SDMAGENMISMATCH, "SDmaGenMismatch" },
+               { INFINIPATH_E_SDMAOUTOFBOUND, "SDmaOutOfBound" },
+               { INFINIPATH_E_SDMATAILOUTOFBOUND, "SDmaTailOutOfBound" },
+               { INFINIPATH_E_SDMABASE, "SDmaBase" },
+               { INFINIPATH_E_SDMA1STDESC, "SDma1stDesc" },
+               { INFINIPATH_E_SDMARPYTAG, "SDmaRpyTag" },
+               { INFINIPATH_E_SDMADWEN, "SDmaDwEn" },
+               { INFINIPATH_E_SDMAMISSINGDW, "SDmaMissingDw" },
+               { INFINIPATH_E_SDMAUNEXPDATA, "SDmaUnexpData" },
+               { INFINIPATH_E_SDMADESCADDRMISALIGN, "SDmaDescAddrMisalign" },
+               { INFINIPATH_E_SENDBUFMISUSE, "SendBufMisuse" },
+               { INFINIPATH_E_SDMADISABLED, "SDmaDisabled" },
+       };
+       int i;
+       int expected;
+       size_t bidx = 0;
+
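+       /*
+        * SDmaDisabled is expected while an SDMA abort is in progress, so
+        * only report it when no abort is pending; all other bits are
+        * always reported.
+        */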
+       for (i = 0; i < ARRAY_SIZE(errs); i++) {
+               expected = (errs[i].err != INFINIPATH_E_SDMADISABLED) ? 0 :
+                       test_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status);
+               if ((err & errs[i].err) && !expected)
+                       bidx += snprintf(buf + bidx, blen - bidx,
+                                        "%s ", errs[i].msg);
+       }
+}
+
+/*
+ * Decode the error status into strings, deciding whether to always
+ * print it or not depending on "normal packet errors" vs everything
+ * else.   Return 1 if "real" errors, otherwise 0 if only packet
+ * errors, so caller can decide what to print with the string.
+ */
+int ipath_decode_err(struct ipath_devdata *dd, char *buf, size_t blen,
+       ipath_err_t err)
+{
+       int iserr = 1;
+       *buf = '\0';
+       if (err & INFINIPATH_E_PKTERRS) {
+               if (!(err & ~INFINIPATH_E_PKTERRS))
+                       iserr = 0; // if only packet errors.
+               if (ipath_debug & __IPATH_ERRPKTDBG) {
+                       if (err & INFINIPATH_E_REBP)
+                               strlcat(buf, "EBP ", blen);
+                       if (err & INFINIPATH_E_RVCRC)
+                               strlcat(buf, "VCRC ", blen);
+                       if (err & INFINIPATH_E_RICRC) {
+                               strlcat(buf, "CRC ", blen);
+                               // clear for check below, so only once
+                               err &= INFINIPATH_E_RICRC;
+                       }
+                       if (err & INFINIPATH_E_RSHORTPKTLEN)
+                               strlcat(buf, "rshortpktlen ", blen);
+                       if (err & INFINIPATH_E_SDROPPEDDATAPKT)
+                               strlcat(buf, "sdroppeddatapkt ", blen);
+                       if (err & INFINIPATH_E_SPKTLEN)
+                               strlcat(buf, "spktlen ", blen);
+               }
+               if ((err & INFINIPATH_E_RICRC) &&
+                       !(err&(INFINIPATH_E_RVCRC|INFINIPATH_E_REBP)))
+                       strlcat(buf, "CRC ", blen);
+               if (!iserr)
+                       goto done;
+       }
+       if (err & INFINIPATH_E_RHDRLEN)
+               strlcat(buf, "rhdrlen ", blen);
+       if (err & INFINIPATH_E_RBADTID)
+               strlcat(buf, "rbadtid ", blen);
+       if (err & INFINIPATH_E_RBADVERSION)
+               strlcat(buf, "rbadversion ", blen);
+       if (err & INFINIPATH_E_RHDR)
+               strlcat(buf, "rhdr ", blen);
+       if (err & INFINIPATH_E_SENDSPECIALTRIGGER)
+               strlcat(buf, "sendspecialtrigger ", blen);
+       if (err & INFINIPATH_E_RLONGPKTLEN)
+               strlcat(buf, "rlongpktlen ", blen);
+       if (err & INFINIPATH_E_RMAXPKTLEN)
+               strlcat(buf, "rmaxpktlen ", blen);
+       if (err & INFINIPATH_E_RMINPKTLEN)
+               strlcat(buf, "rminpktlen ", blen);
+       if (err & INFINIPATH_E_SMINPKTLEN)
+               strlcat(buf, "sminpktlen ", blen);
+       if (err & INFINIPATH_E_RFORMATERR)
+               strlcat(buf, "rformaterr ", blen);
+       if (err & INFINIPATH_E_RUNSUPVL)
+               strlcat(buf, "runsupvl ", blen);
+       if (err & INFINIPATH_E_RUNEXPCHAR)
+               strlcat(buf, "runexpchar ", blen);
+       if (err & INFINIPATH_E_RIBFLOW)
+               strlcat(buf, "ribflow ", blen);
+       if (err & INFINIPATH_E_SUNDERRUN)
+               strlcat(buf, "sunderrun ", blen);
+       if (err & INFINIPATH_E_SPIOARMLAUNCH)
+               strlcat(buf, "spioarmlaunch ", blen);
+       if (err & INFINIPATH_E_SUNEXPERRPKTNUM)
+               strlcat(buf, "sunexperrpktnum ", blen);
+       if (err & INFINIPATH_E_SDROPPEDSMPPKT)
+               strlcat(buf, "sdroppedsmppkt ", blen);
+       if (err & INFINIPATH_E_SMAXPKTLEN)
+               strlcat(buf, "smaxpktlen ", blen);
+       if (err & INFINIPATH_E_SUNSUPVL)
+               strlcat(buf, "sunsupVL ", blen);
+       if (err & INFINIPATH_E_INVALIDADDR)
+               strlcat(buf, "invalidaddr ", blen);
+       if (err & INFINIPATH_E_RRCVEGRFULL)
+               strlcat(buf, "rcvegrfull ", blen);
+       if (err & INFINIPATH_E_RRCVHDRFULL)
+               strlcat(buf, "rcvhdrfull ", blen);
+       if (err & INFINIPATH_E_IBSTATUSCHANGED)
+               strlcat(buf, "ibcstatuschg ", blen);
+       if (err & INFINIPATH_E_RIBLOSTLINK)
+               strlcat(buf, "riblostlink ", blen);
+       if (err & INFINIPATH_E_HARDWARE)
+               strlcat(buf, "hardware ", blen);
+       if (err & INFINIPATH_E_RESET)
+               strlcat(buf, "reset ", blen);
+       if (err & INFINIPATH_E_SDMAERRS)
+               decode_sdma_errs(dd, err, buf, blen);
+       if (err & INFINIPATH_E_INVALIDEEPCMD)
+               strlcat(buf, "invalideepromcmd ", blen);
+done:
+       return iserr;
+}
+
+/**
+ * get_rhf_errstring - decode RHF errors
+ * @err: the err number
+ * @msg: the output buffer
+ * @len: the length of the output buffer
+ *
+ * only used one place now, may want more later
+ */
+static void get_rhf_errstring(u32 err, char *msg, size_t len)
+{
+       /* start empty, so if there are no errors there's nothing to check first */
+       *msg = '\0';
+
+       if (err & INFINIPATH_RHF_H_ICRCERR)
+               strlcat(msg, "icrcerr ", len);
+       if (err & INFINIPATH_RHF_H_VCRCERR)
+               strlcat(msg, "vcrcerr ", len);
+       if (err & INFINIPATH_RHF_H_PARITYERR)
+               strlcat(msg, "parityerr ", len);
+       if (err & INFINIPATH_RHF_H_LENERR)
+               strlcat(msg, "lenerr ", len);
+       if (err & INFINIPATH_RHF_H_MTUERR)
+               strlcat(msg, "mtuerr ", len);
+       if (err & INFINIPATH_RHF_H_IHDRERR)
+               /* infinipath hdr checksum error */
+               strlcat(msg, "ipathhdrerr ", len);
+       if (err & INFINIPATH_RHF_H_TIDERR)
+               strlcat(msg, "tiderr ", len);
+       if (err & INFINIPATH_RHF_H_MKERR)
+               /* bad port, offset, etc. */
+               strlcat(msg, "invalid ipathhdr ", len);
+       if (err & INFINIPATH_RHF_H_IBERR)
+               strlcat(msg, "iberr ", len);
+       if (err & INFINIPATH_RHF_L_SWA)
+               strlcat(msg, "swA ", len);
+       if (err & INFINIPATH_RHF_L_SWB)
+               strlcat(msg, "swB ", len);
+}
+
+/**
+ * ipath_get_egrbuf - get an eager buffer
+ * @dd: the infinipath device
+ * @bufnum: the eager buffer to get
+ *
+ * must only be called if ipath_pd[port] is known to be allocated
+ */
+static inline void *ipath_get_egrbuf(struct ipath_devdata *dd, u32 bufnum)
+{
+       return dd->ipath_port0_skbinfo ?
+               (void *) dd->ipath_port0_skbinfo[bufnum].skb->data : NULL;
+}
+
+/**
+ * ipath_alloc_skb - allocate an skb and buffer with possible constraints
+ * @dd: the infinipath device
+ * @gfp_mask: the sk_buff GFP allocation mask
+ */
+struct sk_buff *ipath_alloc_skb(struct ipath_devdata *dd,
+                               gfp_t gfp_mask)
+{
+       struct sk_buff *skb;
+       u32 len;
+
+       /*
+        * The only fully supported way to handle this is to allocate lots of
+        * extra space, align as needed, and then do skb_reserve().  That wastes
+        * a lot of memory...  I'll have to hack this into infinipath_copy
+        * also.
+        */
+
+       /*
+        * We need 2 extra bytes for ipath_ether data sent in the
+        * key header.  In order to keep everything dword aligned,
+        * we'll reserve 4 bytes.
+        */
+       len = dd->ipath_ibmaxlen + 4;
+
+       if (dd->ipath_flags & IPATH_4BYTE_TID) {
+               /* We need a 2KB multiple alignment, and there is no way
+                * to do it except to allocate extra and then skb_reserve
+                * enough to bring it up to the right alignment.
+                */
+               len += 2047;
+       }
+
+       skb = __dev_alloc_skb(len, gfp_mask);
+       if (!skb) {
+               ipath_dev_err(dd, "Failed to allocate skbuff, length %u\n",
+                             len);
+               goto bail;
+       }
+
+       skb_reserve(skb, 4);
+
+       if (dd->ipath_flags & IPATH_4BYTE_TID) {
+               u32 una = (unsigned long)skb->data & 2047;
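+               /* e.g. data 100 bytes past a 2KB boundary: reserve 1948 more */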
+               if (una)
+                       skb_reserve(skb, 2048 - una);
+       }
+
+bail:
+       return skb;
+}
+
+static void ipath_rcv_hdrerr(struct ipath_devdata *dd,
+                            u32 eflags,
+                            u32 l,
+                            u32 etail,
+                            __le32 *rhf_addr,
+                            struct ipath_message_header *hdr)
+{
+       char emsg[128];
+
+       get_rhf_errstring(eflags, emsg, sizeof emsg);
+       ipath_cdbg(PKT, "RHFerrs %x hdrqtail=%x typ=%u "
+                  "tlen=%x opcode=%x egridx=%x: %s\n",
+                  eflags, l,
+                  ipath_hdrget_rcv_type(rhf_addr),
+                  ipath_hdrget_length_in_bytes(rhf_addr),
+                  be32_to_cpu(hdr->bth[0]) >> 24,
+                  etail, emsg);
+
+       /* Count local link integrity errors. */
+       if (eflags & (INFINIPATH_RHF_H_ICRCERR | INFINIPATH_RHF_H_VCRCERR)) {
+               u8 n = (dd->ipath_ibcctrl >>
+                       INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT) &
+                       INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK;
+
+               if (++dd->ipath_lli_counter > n) {
+                       dd->ipath_lli_counter = 0;
+                       dd->ipath_lli_errors++;
+               }
+       }
+}
+
+/*
+ * ipath_kreceive - receive and process packets from the receive header queue
+ * @pd: the infinipath port
+ *
+ * called from interrupt handler for errors or receive interrupt
+ */
+void ipath_kreceive(struct ipath_portdata *pd)
+{
+       struct ipath_devdata *dd = pd->port_dd;
+       __le32 *rhf_addr;
+       void *ebuf;
+       const u32 rsize = dd->ipath_rcvhdrentsize;      /* words */
+       const u32 maxcnt = dd->ipath_rcvhdrcnt * rsize; /* words */
+       u32 etail = -1, l, hdrqtail;
+       struct ipath_message_header *hdr;
+       u32 eflags, i, etype, tlen, pkttot = 0, updegr = 0, reloop = 0;
+       static u64 totcalls;    /* stats, may eventually remove */
+       int last;
+
+       l = pd->port_head;
+       rhf_addr = (__le32 *) pd->port_rcvhdrq + l + dd->ipath_rhf_offset;
+       if (dd->ipath_flags & IPATH_NODMA_RTAIL) {
+               u32 seq = ipath_hdrget_seq(rhf_addr);
+
+               if (seq != pd->port_seq_cnt)
+                       goto bail;
+               hdrqtail = 0;
+       } else {
+               hdrqtail = ipath_get_rcvhdrtail(pd);
+               if (l == hdrqtail)
+                       goto bail;
+               smp_rmb();
+       }
+
+reloop:
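+       /* i counts packets handled this pass; keep going until 'last' is set */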
+       for (last = 0, i = 1; !last; i += !last) {
+               hdr = dd->ipath_f_get_msgheader(dd, rhf_addr);
+               eflags = ipath_hdrget_err_flags(rhf_addr);
+               etype = ipath_hdrget_rcv_type(rhf_addr);
+               /* total length */
+               tlen = ipath_hdrget_length_in_bytes(rhf_addr);
+               ebuf = NULL;
+               if ((dd->ipath_flags & IPATH_NODMA_RTAIL) ?
+                   ipath_hdrget_use_egr_buf(rhf_addr) :
+                   (etype != RCVHQ_RCV_TYPE_EXPECTED)) {
+                       /*
+                        * It turns out that the chip uses an eager buffer
+                        * for all non-expected packets, whether it "needs"
+                        * one or not.  So always get the index, but don't
+                        * set ebuf (so we try to copy data) unless the
+                        * length requires it.
+                        */
+                       etail = ipath_hdrget_index(rhf_addr);
+                       updegr = 1;
+                       if (tlen > sizeof(*hdr) ||
+                           etype == RCVHQ_RCV_TYPE_NON_KD)
+                               ebuf = ipath_get_egrbuf(dd, etail);
+               }
+
+               /*
+                * both tiderr and ipathhdrerr are set for all plain IB
+                * packets; only ipathhdrerr should be set.
+                */
+
+               if (etype != RCVHQ_RCV_TYPE_NON_KD &&
+                   etype != RCVHQ_RCV_TYPE_ERROR &&
+                   ipath_hdrget_ipath_ver(hdr->iph.ver_port_tid_offset) !=
+                   IPS_PROTO_VERSION)
+                       ipath_cdbg(PKT, "Bad InfiniPath protocol version "
+                                  "%x\n", etype);
+
+               if (unlikely(eflags))
+                       ipath_rcv_hdrerr(dd, eflags, l, etail, rhf_addr, hdr);
+               else if (etype == RCVHQ_RCV_TYPE_NON_KD) {
+                       ipath_ib_rcv(dd->verbs_dev, (u32 *)hdr, ebuf, tlen);
+                       if (dd->ipath_lli_counter)
+                               dd->ipath_lli_counter--;
+               } else if (etype == RCVHQ_RCV_TYPE_EAGER) {
+                       u8 opcode = be32_to_cpu(hdr->bth[0]) >> 24;
+                       u32 qp = be32_to_cpu(hdr->bth[1]) & 0xffffff;
+                       ipath_cdbg(PKT, "typ %x, opcode %x (eager, "
+                                  "qp=%x), len %x; ignored\n",
+                                  etype, opcode, qp, tlen);
+               }
+               else if (etype == RCVHQ_RCV_TYPE_EXPECTED)
+                       ipath_dbg("Bug: Expected TID, opcode %x; ignored\n",
+                                 be32_to_cpu(hdr->bth[0]) >> 24);
+               else {
+                       /*
+                        * error packet, type of error unknown.
+                        * Probably type 3, but we don't know, so don't
+                        * even try to print the opcode, etc.
+                        * Usually caused by a "bad packet" that has no
+                        * BTH even though the LRH says it should.
+                        */
+                       ipath_cdbg(ERRPKT, "Error Pkt, but no eflags! egrbuf"
+                                 " %x, len %x hdrq+%x rhf: %Lx\n",
+                                 etail, tlen, l, (unsigned long long)
+                                 le64_to_cpu(*(__le64 *) rhf_addr));
+                       if (ipath_debug & __IPATH_ERRPKTDBG) {
+                               u32 j, *d, dw = rsize-2;
+                               if (rsize > (tlen>>2))
+                                       dw = tlen>>2;
+                               d = (u32 *)hdr;
+                               printk(KERN_DEBUG "EPkt rcvhdr(%x dw):\n",
+                                       dw);
+                               for (j = 0; j < dw; j++)
+                                       printk(KERN_DEBUG "%8x%s", d[j],
+                                               (j%8) == 7 ? "\n" : " ");
+                               printk(KERN_DEBUG ".\n");
+                       }
+               }
+               l += rsize;
+               if (l >= maxcnt)
+                       l = 0;
+               rhf_addr = (__le32 *) pd->port_rcvhdrq +
+                       l + dd->ipath_rhf_offset;
+               if (dd->ipath_flags & IPATH_NODMA_RTAIL) {
+                       u32 seq = ipath_hdrget_seq(rhf_addr);
+
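+                       /* expected sequence numbers cycle from 1 through 13 */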
+                       if (++pd->port_seq_cnt > 13)
+                               pd->port_seq_cnt = 1;
+                       if (seq != pd->port_seq_cnt)
+                               last = 1;
+               } else if (l == hdrqtail)
+                       last = 1;
+               /*
+                * Update head regs on the last packet, and every 16 packets.
+                * This reduces bus traffic while still helping to prevent
+                * rcvhdrq overflows when the queue is nearly full.
+                */
+               if (last || !(i & 0xf)) {
+                       u64 lval = l;
+
+                       /* request IBA6120 and 7220 interrupt only on last */
+                       if (last)
+                               lval |= dd->ipath_rhdrhead_intr_off;
+                       ipath_write_ureg(dd, ur_rcvhdrhead, lval,
+                               pd->port_port);
+                       if (updegr) {
+                               ipath_write_ureg(dd, ur_rcvegrindexhead,
+                                                etail, pd->port_port);
+                               updegr = 0;
+                       }
+               }
+       }
+
+       if (!dd->ipath_rhdrhead_intr_off && !reloop &&
+           !(dd->ipath_flags & IPATH_NODMA_RTAIL)) {
+               /* IBA6110 workaround: with the GPIO workaround we can race
+                * while clearing the chip interrupt with another interrupt
+                * that is about to be delivered, and clear it before it is
+                * delivered.  By doing the extra check here for the
+                * in-memory tail register having been updated while we were
+                * handling earlier packets, we "almost" guarantee we have
+                * covered that case.
+                */
+               u32 hqtail = ipath_get_rcvhdrtail(pd);
+               if (hqtail != hdrqtail) {
+                       hdrqtail = hqtail;
+                       reloop = 1; /* loop 1 extra time at most */
+                       goto reloop;
+               }
+       }
+
+       pkttot += i;
+
+       pd->port_head = l;
+
+       if (pkttot > ipath_stats.sps_maxpkts_call)
+               ipath_stats.sps_maxpkts_call = pkttot;
+       ipath_stats.sps_port0pkts += pkttot;
+       ipath_stats.sps_avgpkts_call =
+               ipath_stats.sps_port0pkts / ++totcalls;
+
+bail:;
+}
+
+/**
+ * ipath_update_pio_bufs - update shadow copy of the PIO availability map
+ * @dd: the infinipath device
+ *
+ * called whenever our local copy indicates we have run out of send buffers
+ * NOTE: This can be called from interrupt context by some code
+ * and from non-interrupt context by ipath_getpiobuf().
+ */
+
+static void ipath_update_pio_bufs(struct ipath_devdata *dd)
+{
+       unsigned long flags;
+       int i;
+       const unsigned piobregs = (unsigned)dd->ipath_pioavregs;
+
+       /* If the generation (check) bits have changed, then we update the
+        * busy bit for the corresponding PIO buffer.  This algorithm will
+        * modify positions to the value they already have in some cases
+        * (i.e., no change), but it's faster than changing only the bits
+        * that have changed.
+        *
+        * We would like to do this atomically, to avoid spinlocks in the
+        * critical send path, but that's not really possible, given the
+        * type of changes, and that this routine could be called on
+        * multiple CPUs simultaneously, so we lock in this routine only,
+        * to avoid conflicting updates; all we change is the shadow, and
+        * it's a single 64-bit memory location, so by definition the update
+        * is atomic in terms of what other CPUs can see in testing the
+        * bits.  The spin_lock overhead isn't too bad, since it only
+        * happens when all buffers are in use, so only CPU overhead, not
+        * latency or bandwidth, is affected.
+        */
+       if (!dd->ipath_pioavailregs_dma) {
+               ipath_dbg("Update shadow pioavail, but regs_dma NULL!\n");
+               return;
+       }
+       if (ipath_debug & __IPATH_VERBDBG) {
+               /* only if packet debug and verbose */
+               volatile __le64 *dma = dd->ipath_pioavailregs_dma;
+               unsigned long *shadow = dd->ipath_pioavailshadow;
+
+               ipath_cdbg(PKT, "Refill avail, dma0=%llx shad0=%lx, "
+                          "d1=%llx s1=%lx, d2=%llx s2=%lx, d3=%llx "
+                          "s3=%lx\n",
+                          (unsigned long long) le64_to_cpu(dma[0]),
+                          shadow[0],
+                          (unsigned long long) le64_to_cpu(dma[1]),
+                          shadow[1],
+                          (unsigned long long) le64_to_cpu(dma[2]),
+                          shadow[2],
+                          (unsigned long long) le64_to_cpu(dma[3]),
+                          shadow[3]);
+               if (piobregs > 4)
+                       ipath_cdbg(
+                               PKT, "2nd group, dma4=%llx shad4=%lx, "
+                               "d5=%llx s5=%lx, d6=%llx s6=%lx, "
+                               "d7=%llx s7=%lx\n",
+                               (unsigned long long) le64_to_cpu(dma[4]),
+                               shadow[4],
+                               (unsigned long long) le64_to_cpu(dma[5]),
+                               shadow[5],
+                               (unsigned long long) le64_to_cpu(dma[6]),
+                               shadow[6],
+                               (unsigned long long) le64_to_cpu(dma[7]),
+                               shadow[7]);
+       }
+       spin_lock_irqsave(&ipath_pioavail_lock, flags);
+       for (i = 0; i < piobregs; i++) {
+               u64 pchbusy, pchg, piov, pnew;
+               /*
+                * Chip Errata: bug 6641; even and odd qwords>3 are swapped
+                */
+               if (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS))
+                       piov = le64_to_cpu(dd->ipath_pioavailregs_dma[i ^ 1]);
+               else
+                       piov = le64_to_cpu(dd->ipath_pioavailregs_dma[i]);
+               pchg = dd->ipath_pioavailkernel[i] &
+                       ~(dd->ipath_pioavailshadow[i] ^ piov);
+               pchbusy = pchg << INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT;
+               if (pchg && (pchbusy & dd->ipath_pioavailshadow[i])) {
+                       pnew = dd->ipath_pioavailshadow[i] & ~pchbusy;
+                       pnew |= piov & pchbusy;
+                       dd->ipath_pioavailshadow[i] = pnew;
+               }
+       }
+       spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
+}
+
+/*
+ * Used to force an update of pioavailshadow if we can't get a pio buffer.
+ * Needed primarily when exiting freeze mode after recovering
+ * from errors.  Done lazily, because it's safer (we are known to not
+ * be writing pio buffers).
+ */
+static void ipath_reset_availshadow(struct ipath_devdata *dd)
+{
+       int i, im;
+       unsigned long flags;
+
+       spin_lock_irqsave(&ipath_pioavail_lock, flags);
+       for (i = 0; i < dd->ipath_pioavregs; i++) {
+               u64 val, oldval;
+               /* deal with 6110 chip bug on high register #s */
+               im = (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) ?
+                       i ^ 1 : i;
+               val = le64_to_cpu(dd->ipath_pioavailregs_dma[im]);
+               /*
+                * busy out the buffers not in the kernel avail list,
+                * without changing the generation bits.
+                */
+               oldval = dd->ipath_pioavailshadow[i];
+               dd->ipath_pioavailshadow[i] = val |
+                       ((~dd->ipath_pioavailkernel[i] <<
+                       INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT) &
+                       0xaaaaaaaaaaaaaaaaULL); /* All BUSY bits in qword */
+               if (oldval != dd->ipath_pioavailshadow[i])
+                       ipath_dbg("shadow[%d] was %Lx, now %lx\n",
+                               i, (unsigned long long) oldval,
+                               dd->ipath_pioavailshadow[i]);
+       }
+       spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
+}
+
+/**
+ * ipath_setrcvhdrsize - set the receive header size
+ * @dd: the infinipath device
+ * @rhdrsize: the receive header size
+ *
+ * called from user init code, and also layered driver init
+ */
+int ipath_setrcvhdrsize(struct ipath_devdata *dd, unsigned rhdrsize)
+{
+       int ret = 0;
+
+       if (dd->ipath_flags & IPATH_RCVHDRSZ_SET) {
+               if (dd->ipath_rcvhdrsize != rhdrsize) {
+                       dev_info(&dd->pcidev->dev,
+                                "Error: can't set protocol header "
+                                "size %u, already %u\n",
+                                rhdrsize, dd->ipath_rcvhdrsize);
+                       ret = -EAGAIN;
+               } else
+                       ipath_cdbg(VERBOSE, "Reuse same protocol header "
+                                  "size %u\n", dd->ipath_rcvhdrsize);
+       } else if (rhdrsize > (dd->ipath_rcvhdrentsize -
+                              (sizeof(u64) / sizeof(u32)))) {
+               ipath_dbg("Error: can't set protocol header size %u "
+                         "(> max %u)\n", rhdrsize,
+                         dd->ipath_rcvhdrentsize -
+                         (u32) (sizeof(u64) / sizeof(u32)));
+               ret = -EOVERFLOW;
+       } else {
+               dd->ipath_flags |= IPATH_RCVHDRSZ_SET;
+               dd->ipath_rcvhdrsize = rhdrsize;
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrsize,
+                                dd->ipath_rcvhdrsize);
+               ipath_cdbg(VERBOSE, "Set protocol header size to %u\n",
+                          dd->ipath_rcvhdrsize);
+       }
+       return ret;
+}
+
+/*
+ * Debugging code and stats updates used when no pio buffers are available.
+ */
+static noinline void no_pio_bufs(struct ipath_devdata *dd)
+{
+       unsigned long *shadow = dd->ipath_pioavailshadow;
+       __le64 *dma = (__le64 *)dd->ipath_pioavailregs_dma;
+
+       dd->ipath_upd_pio_shadow = 1;
+
+       /*
+        * not atomic, but if we lose a stat count in a while, that's OK
+        */
+       ipath_stats.sps_nopiobufs++;
+       if (!(++dd->ipath_consec_nopiobuf % 100000)) {
+               ipath_force_pio_avail_update(dd); /* at start */
+               ipath_dbg("%u tries no piobufavail ts%lx; dmacopy: "
+                       "%llx %llx %llx %llx\n"
+                       "ipath  shadow:  %lx %lx %lx %lx\n",
+                       dd->ipath_consec_nopiobuf,
+                       (unsigned long)get_cycles(),
+                       (unsigned long long) le64_to_cpu(dma[0]),
+                       (unsigned long long) le64_to_cpu(dma[1]),
+                       (unsigned long long) le64_to_cpu(dma[2]),
+                       (unsigned long long) le64_to_cpu(dma[3]),
+                       shadow[0], shadow[1], shadow[2], shadow[3]);
+               /*
+                * 4 buffers per byte; the 4 registers above cover the
+                * first group, and the rest are covered below
+                */
+               if ((dd->ipath_piobcnt2k + dd->ipath_piobcnt4k) >
+                   (sizeof(shadow[0]) * 4 * 4))
+                       ipath_dbg("2nd group: dmacopy: "
+                                 "%llx %llx %llx %llx\n"
+                                 "ipath  shadow:  %lx %lx %lx %lx\n",
+                                 (unsigned long long)le64_to_cpu(dma[4]),
+                                 (unsigned long long)le64_to_cpu(dma[5]),
+                                 (unsigned long long)le64_to_cpu(dma[6]),
+                                 (unsigned long long)le64_to_cpu(dma[7]),
+                                 shadow[4], shadow[5], shadow[6], shadow[7]);
+
+               /* at end, so update likely happened */
+               ipath_reset_availshadow(dd);
+       }
+}
+
+/*
+ * common code for normal driver pio buffer allocation, and reserved
+ * allocation.
+ *
+ * do appropriate marking as busy, etc.
+ * Returns a pointer to the buffer if one is found (its number is stored in
+ * *pbufnum); returns NULL if no buffer is available.
+ */
+static u32 __iomem *ipath_getpiobuf_range(struct ipath_devdata *dd,
+       u32 *pbufnum, u32 first, u32 last, u32 firsti)
+{
+       int i, j, updated = 0;
+       unsigned piobcnt;
+       unsigned long flags;
+       unsigned long *shadow = dd->ipath_pioavailshadow;
+       u32 __iomem *buf;
+
+       piobcnt = last - first;
+       if (dd->ipath_upd_pio_shadow) {
+               /*
+                * Minor optimization.  If we had no buffers on last call,
+                * start out by doing the update; continue and do scan even
+                * if no buffers were updated, to be paranoid
+                */
+               ipath_update_pio_bufs(dd);
+               updated++;
+               i = first;
+       } else
+               i = firsti;
+rescan:
+       /*
+        * while test_and_set_bit() is atomic, we do that and then the
+        * change_bit(), and the pair is not.  See if this is the cause
+        * of the remaining armlaunch errors.
+        */
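+       /*
+        * Shadow layout: for buffer n, bit 2*n is the generation (check)
+        * bit and bit 2*n + 1 is the busy bit.
+        */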
+       spin_lock_irqsave(&ipath_pioavail_lock, flags);
+       for (j = 0; j < piobcnt; j++, i++) {
+               if (i >= last)
+                       i = first;
+               if (__test_and_set_bit((2 * i) + 1, shadow))
+                       continue;
+               /* flip generation bit */
+               __change_bit(2 * i, shadow);
+               break;
+       }
+       spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
+
+       if (j == piobcnt) {
+               if (!updated) {
+                       /*
+                        * first time through; shadow exhausted, but may be
+                        * buffers available, try an update and then rescan.
+                        */
+                       ipath_update_pio_bufs(dd);
+                       updated++;
+                       i = first;
+                       goto rescan;
+               } else if (updated == 1 && piobcnt <=
+                       ((dd->ipath_sendctrl
+                       >> INFINIPATH_S_UPDTHRESH_SHIFT) &
+                       INFINIPATH_S_UPDTHRESH_MASK)) {
+                       /*
+                        * for chips supporting and using the update
+                        * threshold we need to force an update of the
+                        * in-memory copy if the count is less than the
+                        * threshold, then check one more time.
+                        */
+                       ipath_force_pio_avail_update(dd);
+                       ipath_update_pio_bufs(dd);
+                       updated++;
+                       i = first;
+                       goto rescan;
+               }
+
+               no_pio_bufs(dd);
+               buf = NULL;
+       } else {
+               if (i < dd->ipath_piobcnt2k)
+                       buf = (u32 __iomem *) (dd->ipath_pio2kbase +
+                                              i * dd->ipath_palign);
+               else
+                       buf = (u32 __iomem *)
+                               (dd->ipath_pio4kbase +
+                                (i - dd->ipath_piobcnt2k) * dd->ipath_4kalign);
+               if (pbufnum)
+                       *pbufnum = i;
+       }
+
+       return buf;
+}
+
+/**
+ * ipath_getpiobuf - find an available pio buffer
+ * @dd: the infinipath device
+ * @plen: the size of the PIO buffer needed in 32-bit words
+ * @pbufnum: the buffer number is placed here
+ */
+u32 __iomem *ipath_getpiobuf(struct ipath_devdata *dd, u32 plen, u32 *pbufnum)
+{
+       u32 __iomem *buf;
+       u32 pnum, nbufs;
+       u32 first, lasti;
+
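+       /*
+        * Requests too large for a small (2k) buffer start the search at
+        * the first 4k buffer (index ipath_piobcnt2k); smaller requests
+        * start at buffer 0.
+        */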
+       if (plen + 1 >= IPATH_SMALLBUF_DWORDS) {
+               first = dd->ipath_piobcnt2k;
+               lasti = dd->ipath_lastpioindexl;
+       } else {
+               first = 0;
+               lasti = dd->ipath_lastpioindex;
+       }
+       nbufs = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
+       buf = ipath_getpiobuf_range(dd, &pnum, first, nbufs, lasti);
+
+       if (buf) {
+               /*
+                * Set next starting place.  It's just an optimization,
+                * it doesn't matter who wins on this, so no locking
+                */
+               if (plen + 1 >= IPATH_SMALLBUF_DWORDS)
+                       dd->ipath_lastpioindexl = pnum + 1;
+               else
+                       dd->ipath_lastpioindex = pnum + 1;
+               if (dd->ipath_upd_pio_shadow)
+                       dd->ipath_upd_pio_shadow = 0;
+               if (dd->ipath_consec_nopiobuf)
+                       dd->ipath_consec_nopiobuf = 0;
+               ipath_cdbg(VERBOSE, "Return piobuf%u %uk @ %p\n",
+                          pnum, (pnum < dd->ipath_piobcnt2k) ? 2 : 4, buf);
+               if (pbufnum)
+                       *pbufnum = pnum;
+
+       }
+       return buf;
+}
+
+/**
+ * ipath_chg_pioavailkernel - change which send buffers are available for kernel
+ * @dd: the infinipath device
+ * @start: the starting send buffer number
+ * @len: the number of send buffers
+ * @avail: true if the buffers are available for kernel use, false otherwise
+ */
+void ipath_chg_pioavailkernel(struct ipath_devdata *dd, unsigned start,
+                             unsigned len, int avail)
+{
+       unsigned long flags;
+       unsigned end, cnt = 0;
+
+       /* There are two bits per send buffer (busy and generation) */
+       start *= 2;
+       end = start + len * 2;
+
+       spin_lock_irqsave(&ipath_pioavail_lock, flags);
+       /* Set or clear the busy bit in the shadow. */
+       while (start < end) {
+               if (avail) {
+                       unsigned long dma;
+                       int i, im;
+                       /*
+                        * the BUSY bit will never be set, because we disarm
+                        * the user buffers before we hand them back to the
+                        * kernel.  We do have to make sure the generation
+                        * bit is set correctly in shadow, since it could
+                        * have changed many times while allocated to user.
+                        * We can't use the bitmap functions on the full
+                        * dma array because it is always little-endian, so
+                        * we have to flip to host-order first.
+                        * BITS_PER_LONG is slightly wrong, since it's
+                        * always 64 bits per register in chip...
+                        * We only work on 64 bit kernels, so that's OK.
+                        */
+                       /* deal with 6110 chip bug on high register #s */
+                       i = start / BITS_PER_LONG;
+                       im = (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) ?
+                               i ^ 1 : i;
+                       __clear_bit(INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT
+                               + start, dd->ipath_pioavailshadow);
+                       dma = (unsigned long) le64_to_cpu(
+                               dd->ipath_pioavailregs_dma[im]);
+                       if (test_bit((INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT
+                               + start) % BITS_PER_LONG, &dma))
+                               __set_bit(INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT
+                                       + start, dd->ipath_pioavailshadow);
+                       else
+                               __clear_bit(INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT
+                                       + start, dd->ipath_pioavailshadow);
+                       __set_bit(start, dd->ipath_pioavailkernel);
+               } else {
+                       __set_bit(start + INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT,
+                               dd->ipath_pioavailshadow);
+                       __clear_bit(start, dd->ipath_pioavailkernel);
+               }
+               start += 2;
+       }
+
+       if (dd->ipath_pioupd_thresh) {
+               end = 2 * (dd->ipath_piobcnt2k + dd->ipath_piobcnt4k);
+               cnt = bitmap_weight(dd->ipath_pioavailkernel, end);
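+               /* cnt: how many send buffers are currently kernel-available */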
+       }
+       spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
+
+       /*
+        * When moving buffers from kernel to user, if number assigned to
+        * the user is less than the pio update threshold, and threshold
+        * is supported (cnt was computed > 0), drop the update threshold
+        * so we update at least once per allocated number of buffers.
+        * In any case, if the kernel buffers are less than the threshold,
+        * drop the threshold.  We don't bother increasing it, having once
+        * decreased it, since it would typically just cycle back and forth.
+        * If we don't decrease below buffers in use, we can wait a long
+        * time for an update, until some other context uses PIO buffers.
+        */
+       if (!avail && len < cnt)
+               cnt = len;
+       if (cnt < dd->ipath_pioupd_thresh) {
+               dd->ipath_pioupd_thresh = cnt;
+               ipath_dbg("Decreased pio update threshold to %u\n",
+                       dd->ipath_pioupd_thresh);
+               spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+               dd->ipath_sendctrl &= ~(INFINIPATH_S_UPDTHRESH_MASK
+                       << INFINIPATH_S_UPDTHRESH_SHIFT);
+               dd->ipath_sendctrl |= dd->ipath_pioupd_thresh
+                       << INFINIPATH_S_UPDTHRESH_SHIFT;
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+                       dd->ipath_sendctrl);
+               spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+       }
+}
+
+/**
+ * ipath_create_rcvhdrq - create a receive header queue
+ * @dd: the infinipath device
+ * @pd: the port data
+ *
+ * this must be contiguous memory (from an i/o perspective), and must be
+ * DMA'able (which means for some systems, it will go through an IOMMU,
+ * or be forced into a low address range).
+ */
+int ipath_create_rcvhdrq(struct ipath_devdata *dd,
+                        struct ipath_portdata *pd)
+{
+       int ret = 0;
+
+       if (!pd->port_rcvhdrq) {
+               dma_addr_t phys_hdrqtail;
+               gfp_t gfp_flags = GFP_USER | __GFP_COMP;
+               int amt = ALIGN(dd->ipath_rcvhdrcnt * dd->ipath_rcvhdrentsize *
+                               sizeof(u32), PAGE_SIZE);
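+               /*
+                * e.g. a queue of 64 entries of 16 dwords each would be
+                * 64 * 16 * 4 = 4096 bytes, rounded up to a whole page.
+                */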
+
+               pd->port_rcvhdrq = dma_alloc_coherent(
+                       &dd->pcidev->dev, amt, &pd->port_rcvhdrq_phys,
+                       gfp_flags);
+
+               if (!pd->port_rcvhdrq) {
+                       ipath_dev_err(dd, "attempt to allocate %d bytes "
+                                     "for port %u rcvhdrq failed\n",
+                                     amt, pd->port_port);
+                       ret = -ENOMEM;
+                       goto bail;
+               }
+
+               if (!(dd->ipath_flags & IPATH_NODMA_RTAIL)) {
+                       pd->port_rcvhdrtail_kvaddr = dma_alloc_coherent(
+                               &dd->pcidev->dev, PAGE_SIZE, &phys_hdrqtail,
+                               GFP_KERNEL);
+                       if (!pd->port_rcvhdrtail_kvaddr) {
+                               ipath_dev_err(dd, "attempt to allocate 1 page "
+                                       "for port %u rcvhdrqtailaddr "
+                                       "failed\n", pd->port_port);
+                               ret = -ENOMEM;
+                               dma_free_coherent(&dd->pcidev->dev, amt,
+                                       pd->port_rcvhdrq,
+                                       pd->port_rcvhdrq_phys);
+                               pd->port_rcvhdrq = NULL;
+                               goto bail;
+                       }
+                       pd->port_rcvhdrqtailaddr_phys = phys_hdrqtail;
+                       ipath_cdbg(VERBOSE, "port %d hdrtailaddr, %llx "
+                                  "physical\n", pd->port_port,
+                                  (unsigned long long) phys_hdrqtail);
+               }
+
+               pd->port_rcvhdrq_size = amt;
+
+               ipath_cdbg(VERBOSE, "%d pages at %p (phys %lx) size=%lu "
+                          "for port %u rcvhdr Q\n",
+                          amt >> PAGE_SHIFT, pd->port_rcvhdrq,
+                          (unsigned long) pd->port_rcvhdrq_phys,
+                          (unsigned long) pd->port_rcvhdrq_size,
+                          pd->port_port);
+       }
+       else
+               ipath_cdbg(VERBOSE, "reuse port %d rcvhdrq @%p %llx phys; "
+                          "hdrtailaddr@%p %llx physical\n",
+                          pd->port_port, pd->port_rcvhdrq,
+                          (unsigned long long) pd->port_rcvhdrq_phys,
+                          pd->port_rcvhdrtail_kvaddr, (unsigned long long)
+                          pd->port_rcvhdrqtailaddr_phys);
+
+       /* clear for security and sanity on each use */
+       memset(pd->port_rcvhdrq, 0, pd->port_rcvhdrq_size);
+       if (pd->port_rcvhdrtail_kvaddr)
+               memset(pd->port_rcvhdrtail_kvaddr, 0, PAGE_SIZE);
+
+       /*
+        * tell chip each time we init it, even if we are re-using previous
+        * memory (we zero the register at process close)
+        */
+       ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdrtailaddr,
+                             pd->port_port, pd->port_rcvhdrqtailaddr_phys);
+       ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdraddr,
+                             pd->port_port, pd->port_rcvhdrq_phys);
+
+bail:
+       return ret;
+}
+
+
+/*
+ * Flush all sends that might be in the ready to send state, as well as any
+ * that are in the process of being sent.   Used whenever we need to be
+ * sure the send side is idle.  Cleans up all buffer state by canceling
+ * all pio buffers, and issuing an abort, which cleans up anything in the
+ * launch fifo.  The cancel is superfluous on some chip versions, but
+ * it's safer to always do it.
+ * PIOAvail bits are updated by the chip as if normal send had happened.
+ */
+void ipath_cancel_sends(struct ipath_devdata *dd, int restore_sendctrl)
+{
+       unsigned long flags;
+
+       if (dd->ipath_flags & IPATH_IB_AUTONEG_INPROG) {
+               ipath_cdbg(VERBOSE, "Ignore while in autonegotiation\n");
+               goto bail;
+       }
+       /*
+        * If we have SDMA, and it's not disabled, we have to kick off the
+        * abort state machine, provided we aren't already aborting.
+        * If we are in the process of aborting SDMA (!DISABLED, but ABORTING),
+        * we skip the rest of this routine. It is already "in progress"
+        */
+       if (dd->ipath_flags & IPATH_HAS_SEND_DMA) {
+               int skip_cancel;
+               unsigned long *statp = &dd->ipath_sdma_status;
+
+               spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+               skip_cancel =
+                       test_and_set_bit(IPATH_SDMA_ABORTING, statp)
+                       && !test_bit(IPATH_SDMA_DISABLED, statp);
+               spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+               if (skip_cancel)
+                       goto bail;
+       }
+
+       ipath_dbg("Cancelling all in-progress send buffers\n");
+
+       /* skip armlaunch errs for a while */
+       dd->ipath_lastcancel = jiffies + HZ / 2;
+
+       /*
+        * The abort bit is auto-clearing.  We also don't want pioavail
+        * update happening during this, and we don't want any other
+        * sends going out, so turn those off for the duration.  We read
+        * the scratch register to be sure that cancels and the abort
+        * have taken effect in the chip.  Otherwise, the two parts are the
+        * same as in ipath_force_pio_avail_update().
+        */
+       spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+       dd->ipath_sendctrl &= ~(INFINIPATH_S_PIOBUFAVAILUPD
+               | INFINIPATH_S_PIOENABLE);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+               dd->ipath_sendctrl | INFINIPATH_S_ABORT);
+       ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+       spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+
+       /* disarm all send buffers */
+       ipath_disarm_piobufs(dd, 0,
+               dd->ipath_piobcnt2k + dd->ipath_piobcnt4k);
+
+       if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
+               set_bit(IPATH_SDMA_DISARMED, &dd->ipath_sdma_status);
+
+       if (restore_sendctrl) {
+               /* else done by caller later if needed */
+               spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+               dd->ipath_sendctrl |= INFINIPATH_S_PIOBUFAVAILUPD |
+                       INFINIPATH_S_PIOENABLE;
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+                       dd->ipath_sendctrl);
+               /* and again, be sure all have hit the chip */
+               ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+               spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+       }
+
+       if ((dd->ipath_flags & IPATH_HAS_SEND_DMA) &&
+           !test_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status) &&
+           test_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status)) {
+               spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+               /* only wait so long for intr */
+               dd->ipath_sdma_abort_intr_timeout = jiffies + HZ;
+               dd->ipath_sdma_reset_wait = 200;
+               if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
+                       tasklet_hi_schedule(&dd->ipath_sdma_abort_task);
+               spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+       }
+bail:;
+}
+
+/*
+ * Force an update of in-memory copy of the pioavail registers, when
+ * needed for any of a variety of reasons.  We read the scratch register
+ * to make it highly likely that the update will have happened by the
+ * time we return.  If already off (as in cancel_sends above), this
+ * routine is a nop, on the assumption that the caller will "do the
+ * right thing".
+ */
+void ipath_force_pio_avail_update(struct ipath_devdata *dd)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+       if (dd->ipath_sendctrl & INFINIPATH_S_PIOBUFAVAILUPD) {
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+                       dd->ipath_sendctrl & ~INFINIPATH_S_PIOBUFAVAILUPD);
+               ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+                       dd->ipath_sendctrl);
+               ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+       }
+       spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+}
+
+static void ipath_set_ib_lstate(struct ipath_devdata *dd, int linkcmd,
+                               int linitcmd)
+{
+       u64 mod_wd;
+       static const char *what[4] = {
+               [0] = "NOP",
+               [INFINIPATH_IBCC_LINKCMD_DOWN] = "DOWN",
+               [INFINIPATH_IBCC_LINKCMD_ARMED] = "ARMED",
+               [INFINIPATH_IBCC_LINKCMD_ACTIVE] = "ACTIVE"
+       };
+
+       if (linitcmd == INFINIPATH_IBCC_LINKINITCMD_DISABLE) {
+               /*
+                * If we are told to disable, note that so link-recovery
+                * code does not attempt to bring us back up.
+                */
+               preempt_disable();
+               dd->ipath_flags |= IPATH_IB_LINK_DISABLED;
+               preempt_enable();
+       } else if (linitcmd) {
+               /*
+                * Any other linkinitcmd will lead to LINKDOWN and then
+                * to INIT (if all is well), so clear flag to let
+                * link-recovery code attempt to bring us back up.
+                */
+               preempt_disable();
+               dd->ipath_flags &= ~IPATH_IB_LINK_DISABLED;
+               preempt_enable();
+       }
+
+       mod_wd = (linkcmd << dd->ibcc_lc_shift) |
+               (linitcmd << INFINIPATH_IBCC_LINKINITCMD_SHIFT);
+       ipath_cdbg(VERBOSE,
+               "Moving unit %u to %s (initcmd=0x%x), current ltstate is %s\n",
+               dd->ipath_unit, what[linkcmd], linitcmd,
+               ipath_ibcstatus_str[ipath_ib_linktrstate(dd,
+                       ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus))]);
+
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
+                        dd->ipath_ibcctrl | mod_wd);
+       /* read from chip so write is flushed */
+       (void) ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus);
+}
+
+int ipath_set_linkstate(struct ipath_devdata *dd, u8 newstate)
+{
+       u32 lstate;
+       int ret;
+
+       switch (newstate) {
+       case IPATH_IB_LINKDOWN_ONLY:
+               ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN, 0);
+               /* don't wait */
+               ret = 0;
+               goto bail;
+
+       case IPATH_IB_LINKDOWN:
+               ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN,
+                                       INFINIPATH_IBCC_LINKINITCMD_POLL);
+               /* don't wait */
+               ret = 0;
+               goto bail;
+
+       case IPATH_IB_LINKDOWN_SLEEP:
+               ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN,
+                                       INFINIPATH_IBCC_LINKINITCMD_SLEEP);
+               /* don't wait */
+               ret = 0;
+               goto bail;
+
+       case IPATH_IB_LINKDOWN_DISABLE:
+               ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN,
+                                       INFINIPATH_IBCC_LINKINITCMD_DISABLE);
+               /* don't wait */
+               ret = 0;
+               goto bail;
+
+       case IPATH_IB_LINKARM:
+               if (dd->ipath_flags & IPATH_LINKARMED) {
+                       ret = 0;
+                       goto bail;
+               }
+               if (!(dd->ipath_flags &
+                     (IPATH_LINKINIT | IPATH_LINKACTIVE))) {
+                       ret = -EINVAL;
+                       goto bail;
+               }
+               ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_ARMED, 0);
+
+               /*
+                * Since the port can transition to ACTIVE by receiving
+                * a non VL 15 packet, wait for either state.
+                */
+               lstate = IPATH_LINKARMED | IPATH_LINKACTIVE;
+               break;
+
+       case IPATH_IB_LINKACTIVE:
+               if (dd->ipath_flags & IPATH_LINKACTIVE) {
+                       ret = 0;
+                       goto bail;
+               }
+               if (!(dd->ipath_flags & IPATH_LINKARMED)) {
+                       ret = -EINVAL;
+                       goto bail;
+               }
+               ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_ACTIVE, 0);
+               lstate = IPATH_LINKACTIVE;
+               break;
+
+       case IPATH_IB_LINK_LOOPBACK:
+               dev_info(&dd->pcidev->dev, "Enabling IB local loopback\n");
+               dd->ipath_ibcctrl |= INFINIPATH_IBCC_LOOPBACK;
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
+                                dd->ipath_ibcctrl);
+
+               /* turn heartbeat off, as it causes loopback to fail */
+               dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT,
+                                      IPATH_IB_HRTBT_OFF);
+               /* don't wait */
+               ret = 0;
+               goto bail;
+
+       case IPATH_IB_LINK_EXTERNAL:
+               dev_info(&dd->pcidev->dev,
+                       "Disabling IB local loopback (normal)\n");
+               dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT,
+                                      IPATH_IB_HRTBT_ON);
+               dd->ipath_ibcctrl &= ~INFINIPATH_IBCC_LOOPBACK;
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
+                                dd->ipath_ibcctrl);
+               /* don't wait */
+               ret = 0;
+               goto bail;
+
+       /*
+        * Heartbeat can be explicitly enabled by the user via
+        * the "hrtbt_enable" file, and if disabled, trying to enable here
+        * will have no effect.  Implicit changes (heartbeat off when
+        * loopback on, and vice versa) are included to ease testing.
+        */
+       case IPATH_IB_LINK_HRTBT:
+               ret = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT,
+                       IPATH_IB_HRTBT_ON);
+               goto bail;
+
+       case IPATH_IB_LINK_NO_HRTBT:
+               ret = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT,
+                       IPATH_IB_HRTBT_OFF);
+               goto bail;
+
+       default:
+               ipath_dbg("Invalid linkstate 0x%x requested\n", newstate);
+               ret = -EINVAL;
+               goto bail;
+       }
+       ret = ipath_wait_linkstate(dd, lstate, 2000);
+
+bail:
+       return ret;
+}
+
+/**
+ * ipath_set_mtu - set the MTU
+ * @dd: the infinipath device
+ * @arg: the new MTU
+ *
+ * we can handle "any" incoming size; the issue here is whether we
+ * need to restrict our outgoing size.   For now, we don't do any
+ * sanity checking on this, and we don't deal with what happens to
+ * programs that are already running when the size changes.
+ * NOTE: changing the MTU will usually cause the IBC to go back to
+ * link INIT state...
+ */
+int ipath_set_mtu(struct ipath_devdata *dd, u16 arg)
+{
+       u32 piosize;
+       int changed = 0;
+       int ret;
+
+       /*
+        * mtu is IB data payload max.  It's the largest power of 2 less
+        * than piosize (or even larger, since it only really controls the
+        * largest we can receive; we can send the max of the mtu and
+        * piosize).  We check that it's one of the valid IB sizes.
+        */
+       if (arg != 256 && arg != 512 && arg != 1024 && arg != 2048 &&
+           (arg != 4096 || !ipath_mtu4096)) {
+               ipath_dbg("Trying to set invalid mtu %u, failing\n", arg);
+               ret = -EINVAL;
+               goto bail;
+       }
+       if (dd->ipath_ibmtu == arg) {
+               ret = 0;        /* same as current */
+               goto bail;
+       }
+
+       piosize = dd->ipath_ibmaxlen;
+       dd->ipath_ibmtu = arg;
+
+       if (arg >= (piosize - IPATH_PIO_MAXIBHDR)) {
+               /* Only if it's not the initial value (or reset to it) */
+               if (piosize != dd->ipath_init_ibmaxlen) {
+                       if (arg > piosize && arg <= dd->ipath_init_ibmaxlen)
+                               piosize = dd->ipath_init_ibmaxlen;
+                       dd->ipath_ibmaxlen = piosize;
+                       changed = 1;
+               }
+       } else if ((arg + IPATH_PIO_MAXIBHDR) != dd->ipath_ibmaxlen) {
+               piosize = arg + IPATH_PIO_MAXIBHDR;
+               ipath_cdbg(VERBOSE, "ibmaxlen was 0x%x, setting to 0x%x "
+                          "(mtu 0x%x)\n", dd->ipath_ibmaxlen, piosize,
+                          arg);
+               dd->ipath_ibmaxlen = piosize;
+               changed = 1;
+       }
+
+       if (changed) {
+               u64 ibc = dd->ipath_ibcctrl, ibdw;
+               /*
+                * update our housekeeping variables, and set IBC max
+                * size, same as init code; max IBC is max we allow in
+                * buffer, less the qword pbc, plus 1 for ICRC, in dwords
+                */
+               dd->ipath_ibmaxlen = piosize - 2 * sizeof(u32);
+               ibdw = (dd->ipath_ibmaxlen >> 2) + 1;
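+               /*
+                * e.g. piosize of 2048 bytes: ibmaxlen = 2040 (less the 8-byte
+                * pbc), so ibdw = 510 + 1 = 511 dwords written below.
+                */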
+               ibc &= ~(INFINIPATH_IBCC_MAXPKTLEN_MASK <<
+                        dd->ibcc_mpl_shift);
+               ibc |= ibdw << dd->ibcc_mpl_shift;
+               dd->ipath_ibcctrl = ibc;
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
+                                dd->ipath_ibcctrl);
+               dd->ipath_f_tidtemplate(dd);
+       }
+
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+int ipath_set_lid(struct ipath_devdata *dd, u32 lid, u8 lmc)
+{
+       dd->ipath_lid = lid;
+       dd->ipath_lmc = lmc;
+
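+       /* pack the LID in the low 16 bits and the LMC-derived mask above it */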
+       dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LIDLMC, lid |
+               (~((1U << lmc) - 1)) << 16);
+
+       dev_info(&dd->pcidev->dev, "We got a lid: 0x%x\n", lid);
+
+       return 0;
+}
+
+
+/**
+ * ipath_write_kreg_port - write a device's per-port 64-bit kernel register
+ * @dd: the infinipath device
+ * @regno: the register number to write
+ * @port: the port containing the register
+ * @value: the value to write
+ *
+ * Registers that are replicated per port (and so vary with the port
+ * number) use this routine.
+ */
+void ipath_write_kreg_port(const struct ipath_devdata *dd, ipath_kreg regno,
+                         unsigned port, u64 value)
+{
+       u16 where;
+
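+       /* per-port copies of these registers are at consecutive offsets */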
+       if (port < dd->ipath_portcnt &&
+           (regno == dd->ipath_kregs->kr_rcvhdraddr ||
+            regno == dd->ipath_kregs->kr_rcvhdrtailaddr))
+               where = regno + port;
+       else
+               where = -1;
+
+       ipath_write_kreg(dd, where, value);
+}
+
+/*
+ * The following deals with the "obviously simple" task of overriding the
+ * state of the LEDs, which normally indicate link physical and logical
+ * status.  The complications arise in dealing with different hardware
+ * mappings and the board-dependent routine being called from interrupts.
+ * And then there's the requirement to _flash_ them.
+ */
+#define LED_OVER_FREQ_SHIFT 8
+#define LED_OVER_FREQ_MASK (0xFF<<LED_OVER_FREQ_SHIFT)
+/* Below is "non-zero" to force override, but both actual LEDs are off */
+#define LED_OVER_BOTH_OFF (8)
+
+static void ipath_run_led_override(unsigned long opaque)
+{
+       struct ipath_devdata *dd = (struct ipath_devdata *)opaque;
+       int timeoff;
+       int pidx;
+       u64 lstate, ltstate, val;
+
+       if (!(dd->ipath_flags & IPATH_INITTED))
+               return;
+
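+       /* alternate between the two override values on each timer expiration */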
+       pidx = dd->ipath_led_override_phase++ & 1;
+       dd->ipath_led_override = dd->ipath_led_override_vals[pidx];
+       timeoff = dd->ipath_led_override_timeoff;
+
+       /*
+        * The code below potentially restores the LED values per the current
+        * status; it could also set up the traffic-blink register, but that
+        * is left to the per-chip functions.
+        */
+       val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus);
+       ltstate = ipath_ib_linktrstate(dd, val);
+       lstate = ipath_ib_linkstate(dd, val);
+
+       dd->ipath_f_setextled(dd, lstate, ltstate);
+       mod_timer(&dd->ipath_led_override_timer, jiffies + timeoff);
+}
+
+void ipath_set_led_override(struct ipath_devdata *dd, unsigned int val)
+{
+       int timeoff, freq;
+
+       if (!(dd->ipath_flags & IPATH_INITTED))
+               return;
+
+       /* First check if we are blinking. If not, use 1 Hz polling */
+       timeoff = HZ;
+       freq = (val & LED_OVER_FREQ_MASK) >> LED_OVER_FREQ_SHIFT;
+
+       if (freq) {
+               /* For blink, set each phase from one nybble of val */
+               dd->ipath_led_override_vals[0] = val & 0xF;
+               dd->ipath_led_override_vals[1] = (val >> 4) & 0xF;
+               timeoff = (HZ << 4)/freq;
+       } else {
+               /* Non-blink set both phases the same. */
+               dd->ipath_led_override_vals[0] = val & 0xF;
+               dd->ipath_led_override_vals[1] = val & 0xF;
+       }
+       dd->ipath_led_override_timeoff = timeoff;
+
+       /*
+        * If the timer has not already been started, do so. Use a "quick"
+        * timeout so the function will be called soon, to look at our request.
+        */
+       if (atomic_inc_return(&dd->ipath_led_override_timer_active) == 1) {
+               /* Need to start timer */
+               init_timer(&dd->ipath_led_override_timer);
+               dd->ipath_led_override_timer.function =
+                                                ipath_run_led_override;
+               dd->ipath_led_override_timer.data = (unsigned long) dd;
+               dd->ipath_led_override_timer.expires = jiffies + 1;
+               add_timer(&dd->ipath_led_override_timer);
+       } else
+               atomic_dec(&dd->ipath_led_override_timer_active);
+}
+
+/**
+ * ipath_shutdown_device - shut down a device
+ * @dd: the infinipath device
+ *
+ * This is called to make the device quiet when we are about to
+ * unload the driver, and also when the device is administratively
+ * disabled.   It does not free any data structures.
+ * Everything it does has to be set up again by ipath_init_chip(dd, 1).
+ */
+void ipath_shutdown_device(struct ipath_devdata *dd)
+{
+       unsigned long flags;
+
+       ipath_dbg("Shutting down the device\n");
+
+       ipath_hol_up(dd); /* make sure user processes aren't suspended */
+
+       dd->ipath_flags |= IPATH_LINKUNK;
+       dd->ipath_flags &= ~(IPATH_INITTED | IPATH_LINKDOWN |
+                            IPATH_LINKINIT | IPATH_LINKARMED |
+                            IPATH_LINKACTIVE);
+       *dd->ipath_statusp &= ~(IPATH_STATUS_IB_CONF |
+                               IPATH_STATUS_IB_READY);
+
+       /* mask interrupts, but not errors */
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL);
+
+       dd->ipath_rcvctrl = 0;
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+                        dd->ipath_rcvctrl);
+
+       if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
+               teardown_sdma(dd);
+
+       /*
+        * gracefully stop all sends allowing any in progress to trickle out
+        * first.
+        */
+       spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+       dd->ipath_sendctrl = 0;
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
+       /* flush it */
+       ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+       spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+
+       /*
+        * wait long enough for anything that's going to trickle out to have
+        * actually done so.
+        */
+       udelay(5);
+
+       dd->ipath_f_setextled(dd, 0, 0); /* make sure LEDs are off */
+
+       ipath_set_ib_lstate(dd, 0, INFINIPATH_IBCC_LINKINITCMD_DISABLE);
+       ipath_cancel_sends(dd, 0);
+
+       /*
+        * We are shutting down, so tell components that care.  We don't do
+        * this on just a link state change; much like ethernet, a cable
+        * unplug, etc. doesn't change driver state.
+        */
+       signal_ib_event(dd, IB_EVENT_PORT_ERR);
+
+       /* disable IBC */
+       dd->ipath_control &= ~INFINIPATH_C_LINKENABLE;
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
+                        dd->ipath_control | INFINIPATH_C_FREEZEMODE);
+
+       /*
+        * Clear SerdesEnable and turn the LEDs off; do this here because
+        * we are unloading and so can't count on interrupts to move things
+        * along.  Turn the LEDs off explicitly for the same reason.
+        */
+       dd->ipath_f_quiet_serdes(dd);
+
+       /* stop all the timers that might still be running */
+       del_timer_sync(&dd->ipath_hol_timer);
+       if (dd->ipath_stats_timer_active) {
+               del_timer_sync(&dd->ipath_stats_timer);
+               dd->ipath_stats_timer_active = 0;
+       }
+       if (dd->ipath_intrchk_timer.data) {
+               del_timer_sync(&dd->ipath_intrchk_timer);
+               dd->ipath_intrchk_timer.data = 0;
+       }
+       if (atomic_read(&dd->ipath_led_override_timer_active)) {
+               del_timer_sync(&dd->ipath_led_override_timer);
+               atomic_set(&dd->ipath_led_override_timer_active, 0);
+       }
+
+       /*
+        * clear all interrupts and errors, so that the next time the driver
+        * is loaded or device is enabled, we know that whatever is set
+        * happened while we were unloaded
+        */
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
+                        ~0ULL & ~INFINIPATH_HWE_MEMBISTFAILED);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, -1LL);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, -1LL);
+
+       ipath_cdbg(VERBOSE, "Flush time and errors to EEPROM\n");
+       ipath_update_eeprom_log(dd);
+}
+
+/**
+ * ipath_free_pddata - free a port's allocated data
+ * @dd: the infinipath device
+ * @pd: the portdata structure
+ *
+ * Free up any allocated data for a port.
+ * This should not touch anything that would affect a simultaneous
+ * re-allocation of port data, because it is called after ipath_mutex
+ * is released (and can be called from reinit as well).
+ * It should never change any chip state, or global driver state.
+ * (The only exception to global state is freeing the port0 port0_skbs.)
+ */
+void ipath_free_pddata(struct ipath_devdata *dd, struct ipath_portdata *pd)
+{
+       if (!pd)
+               return;
+
+       if (pd->port_rcvhdrq) {
+               ipath_cdbg(VERBOSE, "free closed port %d rcvhdrq @ %p "
+                          "(size=%lu)\n", pd->port_port, pd->port_rcvhdrq,
+                          (unsigned long) pd->port_rcvhdrq_size);
+               dma_free_coherent(&dd->pcidev->dev, pd->port_rcvhdrq_size,
+                                 pd->port_rcvhdrq, pd->port_rcvhdrq_phys);
+               pd->port_rcvhdrq = NULL;
+               if (pd->port_rcvhdrtail_kvaddr) {
+                       dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
+                                        pd->port_rcvhdrtail_kvaddr,
+                                        pd->port_rcvhdrqtailaddr_phys);
+                       pd->port_rcvhdrtail_kvaddr = NULL;
+               }
+       }
+       if (pd->port_port && pd->port_rcvegrbuf) {
+               unsigned e;
+
+               for (e = 0; e < pd->port_rcvegrbuf_chunks; e++) {
+                       void *base = pd->port_rcvegrbuf[e];
+                       size_t size = pd->port_rcvegrbuf_size;
+
+                       ipath_cdbg(VERBOSE, "egrbuf free(%p, %lu), "
+                                  "chunk %u/%u\n", base,
+                                  (unsigned long) size,
+                                  e, pd->port_rcvegrbuf_chunks);
+                       dma_free_coherent(&dd->pcidev->dev, size,
+                               base, pd->port_rcvegrbuf_phys[e]);
+               }
+               kfree(pd->port_rcvegrbuf);
+               pd->port_rcvegrbuf = NULL;
+               kfree(pd->port_rcvegrbuf_phys);
+               pd->port_rcvegrbuf_phys = NULL;
+               pd->port_rcvegrbuf_chunks = 0;
+       } else if (pd->port_port == 0 && dd->ipath_port0_skbinfo) {
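+               /* port 0 (kernel port) uses individually mapped skbs for eager bufs */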
+               unsigned e;
+               struct ipath_skbinfo *skbinfo = dd->ipath_port0_skbinfo;
+
+               dd->ipath_port0_skbinfo = NULL;
+               ipath_cdbg(VERBOSE, "free closed port %d "
+                          "ipath_port0_skbinfo @ %p\n", pd->port_port,
+                          skbinfo);
+               for (e = 0; e < dd->ipath_p0_rcvegrcnt; e++)
+                       if (skbinfo[e].skb) {
+                               pci_unmap_single(dd->pcidev, skbinfo[e].phys,
+                                                dd->ipath_ibmaxlen,
+                                                PCI_DMA_FROMDEVICE);
+                               dev_kfree_skb(skbinfo[e].skb);
+                       }
+               vfree(skbinfo);
+       }
+       kfree(pd->port_tid_pg_list);
+       vfree(pd->subport_uregbase);
+       vfree(pd->subport_rcvegrbuf);
+       vfree(pd->subport_rcvhdr_base);
+       kfree(pd);
+}
+
+static int __init infinipath_init(void)
+{
+       int ret;
+
+       if (ipath_debug & __IPATH_DBG)
+               printk(KERN_INFO DRIVER_LOAD_MSG "%s", ib_ipath_version);
+
+       /*
+        * These must be called before the driver is registered with
+        * the PCI subsystem.
+        */
+       idr_init(&unit_table);
+
+       ret = pci_register_driver(&ipath_driver);
+       if (ret < 0) {
+               printk(KERN_ERR IPATH_DRV_NAME
+                      ": Unable to register driver: error %d\n", -ret);
+               goto bail_unit;
+       }
+
+       ret = ipath_init_ipathfs();
+       if (ret < 0) {
+               printk(KERN_ERR IPATH_DRV_NAME ": Unable to create "
+                      "ipathfs: error %d\n", -ret);
+               goto bail_pci;
+       }
+
+       goto bail;
+
+bail_pci:
+       pci_unregister_driver(&ipath_driver);
+
+bail_unit:
+       idr_destroy(&unit_table);
+
+bail:
+       return ret;
+}
+
+static void __exit infinipath_cleanup(void)
+{
+       ipath_exit_ipathfs();
+
+       ipath_cdbg(VERBOSE, "Unregistering pci driver\n");
+       pci_unregister_driver(&ipath_driver);
+
+       idr_destroy(&unit_table);
+}
+
+/**
+ * ipath_reset_device - reset the chip if possible
+ * @unit: the device to reset
+ *
+ * Whether or not reset is successful, we attempt to re-initialize the chip
+ * (that is, much like a driver unload/reload).  We clear the INITTED flag
+ * so that the various entry points will fail until we reinitialize.  For
+ * now, we only allow this if no user ports are open that use chip resources
+ */
+int ipath_reset_device(int unit)
+{
+       int ret, i;
+       struct ipath_devdata *dd = ipath_lookup(unit);
+       unsigned long flags;
+
+       if (!dd) {
+               ret = -ENODEV;
+               goto bail;
+       }
+
+       if (atomic_read(&dd->ipath_led_override_timer_active)) {
+               /* Need to stop LED timer, _then_ shut off LEDs */
+               del_timer_sync(&dd->ipath_led_override_timer);
+               atomic_set(&dd->ipath_led_override_timer_active, 0);
+       }
+
+       /* Shut off LEDs after we are sure timer is not running */
+       dd->ipath_led_override = LED_OVER_BOTH_OFF;
+       dd->ipath_f_setextled(dd, 0, 0);
+
+       dev_info(&dd->pcidev->dev, "Reset on unit %u requested\n", unit);
+
+       if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT)) {
+               dev_info(&dd->pcidev->dev, "Invalid unit number %u or "
+                        "not initialized or not present\n", unit);
+               ret = -ENXIO;
+               goto bail;
+       }
+
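+       /* refuse to reset while any user port (context) is still open */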
+       spin_lock_irqsave(&dd->ipath_uctxt_lock, flags);
+       if (dd->ipath_pd)
+               for (i = 1; i < dd->ipath_cfgports; i++) {
+                       if (!dd->ipath_pd[i] || !dd->ipath_pd[i]->port_cnt)
+                               continue;
+                       spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
+                       ipath_dbg("unit %u port %d is in use "
+                                 "(PID %u cmd %s), can't reset\n",
+                                 unit, i,
+                                 pid_nr(dd->ipath_pd[i]->port_pid),
+                                 dd->ipath_pd[i]->port_comm);
+                       ret = -EBUSY;
+                       goto bail;
+               }
+       spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
+
+       if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
+               teardown_sdma(dd);
+
+       dd->ipath_flags &= ~IPATH_INITTED;
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL);
+       ret = dd->ipath_f_reset(dd);
+       if (ret == 1) {
+               ipath_dbg("Reinitializing unit %u after reset attempt\n",
+                         unit);
+               ret = ipath_init_chip(dd, 1);
+       } else
+               ret = -EAGAIN;
+       if (ret)
+               ipath_dev_err(dd, "Reinitialize unit %u after "
+                             "reset failed with %d\n", unit, ret);
+       else
+               dev_info(&dd->pcidev->dev, "Reinitialized unit %u after "
+                        "resetting\n", unit);
+
+bail:
+       return ret;
+}
+
+/*
+ * send a signal to all the processes that have the driver open
+ * through the normal interfaces (i.e., everything other than diags
+ * interface).  Returns number of signalled processes.
+ */
+static int ipath_signal_procs(struct ipath_devdata *dd, int sig)
+{
+       int i, sub, any = 0;
+       struct pid *pid;
+       unsigned long flags;
+
+       if (!dd->ipath_pd)
+               return 0;
+
+       spin_lock_irqsave(&dd->ipath_uctxt_lock, flags);
+       for (i = 1; i < dd->ipath_cfgports; i++) {
+               if (!dd->ipath_pd[i] || !dd->ipath_pd[i]->port_cnt)
+                       continue;
+               pid = dd->ipath_pd[i]->port_pid;
+               if (!pid)
+                       continue;
+
+               dev_info(&dd->pcidev->dev, "context %d in use "
+                         "(PID %u), sending signal %d\n",
+                         i, pid_nr(pid), sig);
+               kill_pid(pid, sig, 1);
+               any++;
+               for (sub = 0; sub < INFINIPATH_MAX_SUBPORT; sub++) {
+                       pid = dd->ipath_pd[i]->port_subpid[sub];
+                       if (!pid)
+                               continue;
+                       dev_info(&dd->pcidev->dev, "sub-context "
+                               "%d:%d in use (PID %u), sending "
+                               "signal %d\n", i, sub, pid_nr(pid), sig);
+                       kill_pid(pid, sig, 1);
+                       any++;
+               }
+       }
+       spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
+       return any;
+}
+
+static void ipath_hol_signal_down(struct ipath_devdata *dd)
+{
+       if (ipath_signal_procs(dd, SIGSTOP))
+               ipath_dbg("Stopped some processes\n");
+       ipath_cancel_sends(dd, 1);
+}
+
+
+static void ipath_hol_signal_up(struct ipath_devdata *dd)
+{
+       if (ipath_signal_procs(dd, SIGCONT))
+               ipath_dbg("Continued some processes\n");
+}
+
+/*
+ * Link is down: stop any user processes and flush pending sends to
+ * prevent HoL blocking, then start the HoL timer, which alternately
+ * continues and stops user processes so they can detect the link-down
+ * state if they want, and do something about it.
+ * Timer may already be running, so use mod_timer, not add_timer.
+ */
+void ipath_hol_down(struct ipath_devdata *dd)
+{
+       dd->ipath_hol_state = IPATH_HOL_DOWN;
+       ipath_hol_signal_down(dd);
+       dd->ipath_hol_next = IPATH_HOL_DOWNCONT;
+       dd->ipath_hol_timer.expires = jiffies +
+               msecs_to_jiffies(ipath_hol_timeout_ms);
+       mod_timer(&dd->ipath_hol_timer, dd->ipath_hol_timer.expires);
+}
+
+/*
+ * Link is up: continue any user processes, and ensure the timer is a
+ * nop if it is running.  Let the timer keep running, if set; it will
+ * nop when it sees the link is up.
+ */
+void ipath_hol_up(struct ipath_devdata *dd)
+{
+       ipath_hol_signal_up(dd);
+       dd->ipath_hol_state = IPATH_HOL_UP;
+}
+
+/*
+ * toggle the running/not running state of user processes
+ * to prevent HoL blocking on chip resources, but still allow
+ * user processes to do link down special case handling.
+ * Should only be called via the timer
+ */
+void ipath_hol_event(unsigned long opaque)
+{
+       struct ipath_devdata *dd = (struct ipath_devdata *)opaque;
+
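+       /* alternate stopping/continuing user processes on each timer tick */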
+       if (dd->ipath_hol_next == IPATH_HOL_DOWNSTOP
+               && dd->ipath_hol_state != IPATH_HOL_UP) {
+               dd->ipath_hol_next = IPATH_HOL_DOWNCONT;
+               ipath_dbg("Stopping processes\n");
+               ipath_hol_signal_down(dd);
+       } else { /* may do "extra" if also in ipath_hol_up() */
+               dd->ipath_hol_next = IPATH_HOL_DOWNSTOP;
+               ipath_dbg("Continuing processes\n");
+               ipath_hol_signal_up(dd);
+       }
+       if (dd->ipath_hol_state == IPATH_HOL_UP)
+               ipath_dbg("link's up, don't resched timer\n");
+       else {
+               dd->ipath_hol_timer.expires = jiffies +
+                       msecs_to_jiffies(ipath_hol_timeout_ms);
+               mod_timer(&dd->ipath_hol_timer,
+                       dd->ipath_hol_timer.expires);
+       }
+}
+
+int ipath_set_rx_pol_inv(struct ipath_devdata *dd, u8 new_pol_inv)
+{
+       u64 val;
+
+       if (new_pol_inv > INFINIPATH_XGXS_RX_POL_MASK)
+               return -1;
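+       /* rewrite the XGXS config register only if the invert value changes */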
+       if (dd->ipath_rx_pol_inv != new_pol_inv) {
+               dd->ipath_rx_pol_inv = new_pol_inv;
+               val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig);
+               val &= ~(INFINIPATH_XGXS_RX_POL_MASK <<
+                        INFINIPATH_XGXS_RX_POL_SHIFT);
+               val |= ((u64)dd->ipath_rx_pol_inv) <<
+                       INFINIPATH_XGXS_RX_POL_SHIFT;
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, val);
+       }
+       return 0;
+}
+
+/*
+ * Disable and enable the armlaunch error.  Used for PIO bandwidth testing on
+ * the 7220, which is count-based, rather than trigger-based.  Safe for the
+ * driver check, since it's at init.   Not completely safe when used for
+ * user-mode checking, since some error checking can be lost, but not
+ * particularly risky, and only has problematic side-effects in the face of
+ * very buggy user code.  There is no reference counting, but that's also
+ * fine, given the intended use.
+ */
+void ipath_enable_armlaunch(struct ipath_devdata *dd)
+{
+       dd->ipath_lasterror &= ~INFINIPATH_E_SPIOARMLAUNCH;
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear,
+               INFINIPATH_E_SPIOARMLAUNCH);
+       dd->ipath_errormask |= INFINIPATH_E_SPIOARMLAUNCH;
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
+               dd->ipath_errormask);
+}
+
+void ipath_disable_armlaunch(struct ipath_devdata *dd)
+{
+       /* so don't re-enable if already set */
+       dd->ipath_maskederrs &= ~INFINIPATH_E_SPIOARMLAUNCH;
+       dd->ipath_errormask &= ~INFINIPATH_E_SPIOARMLAUNCH;
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
+               dd->ipath_errormask);
+}
+
+module_init(infinipath_init);
+module_exit(infinipath_cleanup);
diff --git a/drivers/staging/rdma/ipath/ipath_eeprom.c b/drivers/staging/rdma/ipath/ipath_eeprom.c
new file mode 100644 (file)
index 0000000..fc71819
--- /dev/null
@@ -0,0 +1,1183 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/delay.h>
+#include <linux/pci.h>
+#include <linux/vmalloc.h>
+
+#include "ipath_kernel.h"
+
+/*
+ * InfiniPath I2C driver for a serial eeprom.  This is not a generic
+ * I2C interface.  For a start, the device we're using (Atmel AT24C11)
+ * doesn't work like a regular I2C device.  It looks like one
+ * electrically, but not logically.  Normal I2C devices have a single
+ * 7-bit or 10-bit I2C address that they respond to.  Valid 7-bit
+ * addresses range from 0x03 to 0x77.  Addresses 0x00 to 0x02 and 0x78
+ * to 0x7F are special reserved addresses (e.g. 0x00 is the "general
+ * call" address.)  The Atmel device, on the other hand, responds to ALL
+ * 7-bit addresses.  It's designed to be the only device on a given I2C
+ * bus.  A 7-bit address corresponds to the memory address within the
+ * Atmel device itself.
+ *
+ * Also, the timing requirements mean more than simple software
+ * bitbanging, with readbacks from chip to ensure timing (simple udelay
+ * is not enough).
+ *
+ * This all means that accessing the device is specialized enough
+ * that using the standard kernel I2C bitbanging interface would be
+ * impossible.  For example, the core I2C eeprom driver expects to find
+ * a device at one or more of a limited set of addresses only.  It doesn't
+ * allow writing to an eeprom.  It also doesn't provide any means of
+ * accessing eeprom contents from within the kernel, only via sysfs.
+ */
+
+/* Added functionality for IBA7220-based cards */
+#define IPATH_EEPROM_DEV_V1 0xA0
+#define IPATH_EEPROM_DEV_V2 0xA2
+#define IPATH_TEMP_DEV 0x98
+#define IPATH_BAD_DEV (IPATH_EEPROM_DEV_V2+2)
+#define IPATH_NO_DEV (0xFF)
+
+/*
+ * The number of I2C chains is proliferating. Table below brings
+ * some order to the madness. The basic principle is that the
+ * table is scanned from the top, and a "probe" is made to the
+ * device probe_dev. If that succeeds, the chain is considered
+ * to be of that type, and dd->i2c_chain_type is set to the index+1
+ * of the entry.
+ * The +1 is so static initialization can mean "unknown, do probe."
+ */
+static struct i2c_chain_desc {
+       u8 probe_dev;   /* If seen at probe, chain is this type */
+       u8 eeprom_dev;  /* Dev addr (if any) for EEPROM */
+       u8 temp_dev;    /* Dev Addr (if any) for Temp-sense */
+} i2c_chains[] = {
+       { IPATH_BAD_DEV, IPATH_NO_DEV, IPATH_NO_DEV }, /* pre-iba7220 bds */
+       { IPATH_EEPROM_DEV_V1, IPATH_EEPROM_DEV_V1, IPATH_TEMP_DEV}, /* V1 */
+       { IPATH_EEPROM_DEV_V2, IPATH_EEPROM_DEV_V2, IPATH_TEMP_DEV}, /* V2 */
+       { IPATH_NO_DEV }
+};
+
+enum i2c_type {
+       i2c_line_scl = 0,
+       i2c_line_sda
+};
+
+enum i2c_state {
+       i2c_line_low = 0,
+       i2c_line_high
+};
+
+#define READ_CMD 1
+#define WRITE_CMD 0
+
+/**
+ * i2c_gpio_set - set a GPIO line
+ * @dd: the infinipath device
+ * @line: the line to set
+ * @new_line_state: the state to set
+ *
+ * Returns 0 if the line was set to the new state successfully, non-zero
+ * on error.
+ */
+static int i2c_gpio_set(struct ipath_devdata *dd,
+                       enum i2c_type line,
+                       enum i2c_state new_line_state)
+{
+       u64 out_mask, dir_mask, *gpioval;
+       unsigned long flags = 0;
+
+       gpioval = &dd->ipath_gpio_out;
+
+       if (line == i2c_line_scl) {
+               dir_mask = dd->ipath_gpio_scl;
+               out_mask = (1UL << dd->ipath_gpio_scl_num);
+       } else {
+               dir_mask = dd->ipath_gpio_sda;
+               out_mask = (1UL << dd->ipath_gpio_sda_num);
+       }
+
+       spin_lock_irqsave(&dd->ipath_gpio_lock, flags);
+       if (new_line_state == i2c_line_high) {
+               /* tri-state the output rather than force high */
+               dd->ipath_extctrl &= ~dir_mask;
+       } else {
+               /* config line to be an output */
+               dd->ipath_extctrl |= dir_mask;
+       }
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, dd->ipath_extctrl);
+
+       /* set output as well (no real verify) */
+       if (new_line_state == i2c_line_high)
+               *gpioval |= out_mask;
+       else
+               *gpioval &= ~out_mask;
+
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_out, *gpioval);
+       spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags);
+
+       return 0;
+}
+
+/**
+ * i2c_gpio_get - get a GPIO line state
+ * @dd: the infinipath device
+ * @line: the line to get
+ * @curr_statep: where to put the line state
+ *
+ * Returns 0 if the line state was read successfully, non-zero
+ * on error.  curr_statep is not set on error.
+ */
+static int i2c_gpio_get(struct ipath_devdata *dd,
+                       enum i2c_type line,
+                       enum i2c_state *curr_statep)
+{
+       u64 read_val, mask;
+       int ret;
+       unsigned long flags = 0;
+
+       /* check args */
+       if (curr_statep == NULL) {
+               ret = 1;
+               goto bail;
+       }
+
+       /* config line to be an input */
+       if (line == i2c_line_scl)
+               mask = dd->ipath_gpio_scl;
+       else
+               mask = dd->ipath_gpio_sda;
+
+       spin_lock_irqsave(&dd->ipath_gpio_lock, flags);
+       dd->ipath_extctrl &= ~mask;
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, dd->ipath_extctrl);
+       /*
+        * Below is very unlikely to reflect true input state if Output
+        * Enable actually changed.
+        */
+       read_val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extstatus);
+       spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags);
+
+       if (read_val & mask)
+               *curr_statep = i2c_line_high;
+       else
+               *curr_statep = i2c_line_low;
+
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+/**
+ * i2c_wait_for_writes - wait for a write
+ * @dd: the infinipath device
+ *
+ * We use this instead of udelay directly, so we can make sure
+ * that previous register writes have been flushed all the way
+ * to the chip.  Since we are delaying anyway, the cost doesn't
+ * hurt, and makes the bit twiddling more regular
+ */
+static void i2c_wait_for_writes(struct ipath_devdata *dd)
+{
+       (void)ipath_read_kreg32(dd, dd->ipath_kregs->kr_scratch);
+       rmb();
+}
+
+static void scl_out(struct ipath_devdata *dd, u8 bit)
+{
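+       /* brief delay so SDA has settled before SCL transitions */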
+       udelay(1);
+       i2c_gpio_set(dd, i2c_line_scl, bit ? i2c_line_high : i2c_line_low);
+
+       i2c_wait_for_writes(dd);
+}
+
+static void sda_out(struct ipath_devdata *dd, u8 bit)
+{
+       i2c_gpio_set(dd, i2c_line_sda, bit ? i2c_line_high : i2c_line_low);
+
+       i2c_wait_for_writes(dd);
+}
+
+static u8 sda_in(struct ipath_devdata *dd, int wait)
+{
+       enum i2c_state bit;
+
+       if (i2c_gpio_get(dd, i2c_line_sda, &bit))
+               ipath_dbg("get bit failed!\n");
+
+       if (wait)
+               i2c_wait_for_writes(dd);
+
+       return bit == i2c_line_high ? 1U : 0;
+}
+
+/**
+ * i2c_ackrcv - see if ack following write is true
+ * @dd: the infinipath device
+ */
+static int i2c_ackrcv(struct ipath_devdata *dd)
+{
+       u8 ack_received;
+
+       /* AT ENTRY SCL = LOW */
+       /* change direction, ignore data */
+       ack_received = sda_in(dd, 1);
+       scl_out(dd, i2c_line_high);
+       ack_received = sda_in(dd, 1) == 0;
+       scl_out(dd, i2c_line_low);
+       return ack_received;
+}
+
+/**
+ * rd_byte - read a byte, leaving ACK, STOP, etc up to caller
+ * @dd: the infinipath device
+ *
+ * Returns byte shifted out of device
+ */
+static int rd_byte(struct ipath_devdata *dd)
+{
+       int bit_cntr, data;
+
+       data = 0;
+
+       for (bit_cntr = 7; bit_cntr >= 0; --bit_cntr) {
+               data <<= 1;
+               scl_out(dd, i2c_line_high);
+               data |= sda_in(dd, 0);
+               scl_out(dd, i2c_line_low);
+       }
+       return data;
+}
+
+/**
+ * wr_byte - write a byte, one bit at a time
+ * @dd: the infinipath device
+ * @data: the byte to write
+ *
+ * Returns 0 if we got the following ack, otherwise 1
+ */
+static int wr_byte(struct ipath_devdata *dd, u8 data)
+{
+       int bit_cntr;
+       u8 bit;
+
+       for (bit_cntr = 7; bit_cntr >= 0; bit_cntr--) {
+               bit = (data >> bit_cntr) & 1;
+               sda_out(dd, bit);
+               scl_out(dd, i2c_line_high);
+               scl_out(dd, i2c_line_low);
+       }
+       return (!i2c_ackrcv(dd)) ? 1 : 0;
+}
+
+static void send_ack(struct ipath_devdata *dd)
+{
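+       /* drive SDA low for one SCL pulse (the ACK), then release SDA */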
+       sda_out(dd, i2c_line_low);
+       scl_out(dd, i2c_line_high);
+       scl_out(dd, i2c_line_low);
+       sda_out(dd, i2c_line_high);
+}
+
+/**
+ * i2c_startcmd - transmit the start condition, followed by address/cmd
+ * @dd: the infinipath device
+ * @offset_dir: direction byte
+ *
+ *      (both clock/data high, clock high, data low while clock is high)
+ */
+static int i2c_startcmd(struct ipath_devdata *dd, u8 offset_dir)
+{
+       int res;
+
+       /* issue start sequence */
+       sda_out(dd, i2c_line_high);
+       scl_out(dd, i2c_line_high);
+       sda_out(dd, i2c_line_low);
+       scl_out(dd, i2c_line_low);
+
+       /* issue length and direction byte */
+       res = wr_byte(dd, offset_dir);
+
+       if (res)
+               ipath_cdbg(VERBOSE, "No ack to complete start\n");
+
+       return res;
+}
+
+/**
+ * stop_cmd - transmit the stop condition
+ * @dd: the infinipath device
+ *
+ * (both clock/data low, clock high, data high while clock is high)
+ */
+static void stop_cmd(struct ipath_devdata *dd)
+{
+       scl_out(dd, i2c_line_low);
+       sda_out(dd, i2c_line_low);
+       scl_out(dd, i2c_line_high);
+       sda_out(dd, i2c_line_high);
+       udelay(2);
+}
+
+/**
+ * eeprom_reset - reset I2C communication
+ * @dd: the infinipath device
+ */
+
+static int eeprom_reset(struct ipath_devdata *dd)
+{
+       int clock_cycles_left = 9;
+       u64 *gpioval = &dd->ipath_gpio_out;
+       int ret;
+       unsigned long flags;
+
+       spin_lock_irqsave(&dd->ipath_gpio_lock, flags);
+       /* Make sure shadows are consistent */
+       dd->ipath_extctrl = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extctrl);
+       *gpioval = ipath_read_kreg64(dd, dd->ipath_kregs->kr_gpio_out);
+       spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags);
+
+       ipath_cdbg(VERBOSE, "Resetting i2c eeprom; initial gpioout reg "
+                  "is %llx\n", (unsigned long long) *gpioval);
+
+       /*
+        * This is to get the i2c into a known state, by first going low,
+        * then tristate sda (and then tristate scl as first thing
+        * in loop)
+        */
+       scl_out(dd, i2c_line_low);
+       sda_out(dd, i2c_line_high);
+
+       /* Clock up to 9 cycles looking for SDA hi, then issue START and STOP */
+       while (clock_cycles_left--) {
+               scl_out(dd, i2c_line_high);
+
+               /* SDA seen high, issue START by dropping it while SCL high */
+               if (sda_in(dd, 0)) {
+                       sda_out(dd, i2c_line_low);
+                       scl_out(dd, i2c_line_low);
+                       /* ATMEL spec says must be followed by STOP. */
+                       scl_out(dd, i2c_line_high);
+                       sda_out(dd, i2c_line_high);
+                       ret = 0;
+                       goto bail;
+               }
+
+               scl_out(dd, i2c_line_low);
+       }
+
+       ret = 1;
+
+bail:
+       return ret;
+}
+
+/*
+ * Probe for I2C device at specified address. Returns 0 for "success"
+ * to match rest of this file.
+ * Leave bus in "reasonable" state for further commands.
+ */
+static int i2c_probe(struct ipath_devdata *dd, int devaddr)
+{
+       int ret = 0;
+
+       ret = eeprom_reset(dd);
+       if (ret) {
+               ipath_dev_err(dd, "Failed reset probing device 0x%02X\n",
+                             devaddr);
+               return ret;
+       }
+       /*
+        * Reset no longer leaves bus in start condition, so normal
+        * i2c_startcmd() will do.
+        */
+       ret = i2c_startcmd(dd, devaddr | READ_CMD);
+       if (ret)
+               ipath_cdbg(VERBOSE, "Failed startcmd for device 0x%02X\n",
+                          devaddr);
+       else {
+               /*
+                * Device did respond. Complete a single-byte read, because some
+                * devices apparently cannot handle STOP immediately after they
+                * ACK the start-cmd.
+                */
+               int data;
+               data = rd_byte(dd);
+               stop_cmd(dd);
+               ipath_cdbg(VERBOSE, "Response from device 0x%02X\n", devaddr);
+       }
+       return ret;
+}
+
+/*
+ * Returns the "i2c type". This is a pointer to a struct that describes
+ * the I2C chain on this board. To minimize impact on struct ipath_devdata,
+ * the (small integer) index into the table is actually memoized, rather
+ * than the pointer.
+ * Memoization is because the type is determined on the first call per chip.
+ * An alternative would be to move type determination to early
+ * init code.
+ */
+static struct i2c_chain_desc *ipath_i2c_type(struct ipath_devdata *dd)
+{
+       int idx;
+
+       /* Get memoized index, from previous successful probes */
+       idx = dd->ipath_i2c_chain_type - 1;
+       if (idx >= 0 && idx < (ARRAY_SIZE(i2c_chains) - 1))
+               goto done;
+
+       idx = 0;
+       while (i2c_chains[idx].probe_dev != IPATH_NO_DEV) {
+               /* if probe succeeds, this is type */
+               if (!i2c_probe(dd, i2c_chains[idx].probe_dev))
+                       break;
+               ++idx;
+       }
+
+       /*
+        * Old EEPROM (first entry) may require a reset after probe,
+        * rather than being able to "start" after "stop"
+        */
+       if (idx == 0)
+               eeprom_reset(dd);
+
+       if (i2c_chains[idx].probe_dev == IPATH_NO_DEV)
+               idx = -1;
+       else
+               dd->ipath_i2c_chain_type = idx + 1;
+done:
+       return (idx >= 0) ? i2c_chains + idx : NULL;
+}
+
+static int ipath_eeprom_internal_read(struct ipath_devdata *dd,
+                                       u8 eeprom_offset, void *buffer, int len)
+{
+       int ret;
+       struct i2c_chain_desc *icd;
+       u8 *bp = buffer;
+
+       ret = 1;
+       icd = ipath_i2c_type(dd);
+       if (!icd)
+               goto bail;
+
+       if (icd->eeprom_dev == IPATH_NO_DEV) {
+               /* legacy not-really-I2C */
+               ipath_cdbg(VERBOSE, "Start command only address\n");
+               eeprom_offset = (eeprom_offset << 1) | READ_CMD;
+               ret = i2c_startcmd(dd, eeprom_offset);
+       } else {
+               /* Actual I2C */
+               ipath_cdbg(VERBOSE, "Start command uses devaddr\n");
+               if (i2c_startcmd(dd, icd->eeprom_dev | WRITE_CMD)) {
+                       ipath_dbg("Failed EEPROM startcmd\n");
+                       stop_cmd(dd);
+                       ret = 1;
+                       goto bail;
+               }
+               ret = wr_byte(dd, eeprom_offset);
+               stop_cmd(dd);
+               if (ret) {
+                       ipath_dev_err(dd, "Failed to write EEPROM address\n");
+                       ret = 1;
+                       goto bail;
+               }
+               ret = i2c_startcmd(dd, icd->eeprom_dev | READ_CMD);
+       }
+       if (ret) {
+               ipath_dbg("Failed startcmd for dev %02X\n", icd->eeprom_dev);
+               stop_cmd(dd);
+               ret = 1;
+               goto bail;
+       }
+
+       /*
+        * eeprom keeps clocking data out as long as we ack, automatically
+        * incrementing the address.
+        */
+       while (len-- > 0) {
+               /* get and store data */
+               *bp++ = rd_byte(dd);
+               /* send ack if not the last byte */
+               if (len)
+                       send_ack(dd);
+       }
+
+       stop_cmd(dd);
+
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+static int ipath_eeprom_internal_write(struct ipath_devdata *dd, u8 eeprom_offset,
+                                      const void *buffer, int len)
+{
+       int sub_len;
+       const u8 *bp = buffer;
+       int max_wait_time, i;
+       int ret;
+       struct i2c_chain_desc *icd;
+
+       ret = 1;
+       icd = ipath_i2c_type(dd);
+       if (!icd)
+               goto bail;
+
+       while (len > 0) {
+               if (icd->eeprom_dev == IPATH_NO_DEV) {
+                       if (i2c_startcmd(dd,
+                                        (eeprom_offset << 1) | WRITE_CMD)) {
+                               ipath_dbg("Failed to start cmd offset %u\n",
+                                       eeprom_offset);
+                               goto failed_write;
+                       }
+               } else {
+                       /* Real I2C */
+                       if (i2c_startcmd(dd, icd->eeprom_dev | WRITE_CMD)) {
+                               ipath_dbg("Failed EEPROM startcmd\n");
+                               goto failed_write;
+                       }
+                       ret = wr_byte(dd, eeprom_offset);
+                       if (ret) {
+                               ipath_dev_err(dd, "Failed to write EEPROM "
+                                             "address\n");
+                               goto failed_write;
+                       }
+               }
+
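+               /* write at most 4 bytes per start/stop, then poll for completion */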
+               sub_len = min(len, 4);
+               eeprom_offset += sub_len;
+               len -= sub_len;
+
+               for (i = 0; i < sub_len; i++) {
+                       if (wr_byte(dd, *bp++)) {
+                               ipath_dbg("no ack after byte %u/%u (%u "
+                                         "total remain)\n", i, sub_len,
+                                         len + sub_len - i);
+                               goto failed_write;
+                       }
+               }
+
+               stop_cmd(dd);
+
+               /*
+                * Wait for the write to complete by waiting for a
+                * successful read (the chip replies with a zero after the
+                * write cmd completes, and before it writes to the eeprom).
+                * The startcmd for the read will fail the ack until the
+                * writes have completed.  We do this inline to avoid the
+                * debug prints that are in the real read routine if the
+                * startcmd fails.
+                * We also use the proper device address, so it doesn't
+                * matter whether we have a real eeprom_dev; legacy accepts
+                * any address.
+                */
+               max_wait_time = 100;
+               while (i2c_startcmd(dd, icd->eeprom_dev | READ_CMD)) {
+                       stop_cmd(dd);
+                       if (!--max_wait_time) {
+                               ipath_dbg("Did not get successful read to "
+                                         "complete write\n");
+                               goto failed_write;
+                       }
+               }
+               /* now read (and ignore) the resulting byte */
+               rd_byte(dd);
+               stop_cmd(dd);
+       }
+
+       ret = 0;
+       goto bail;
+
+failed_write:
+       stop_cmd(dd);
+       ret = 1;
+
+bail:
+       return ret;
+}
+
+/**
+ * ipath_eeprom_read - receives bytes from the eeprom via I2C
+ * @dd: the infinipath device
+ * @eeprom_offset: address to read from
+ * @buffer: where to store result
+ * @len: number of bytes to receive
+ */
+int ipath_eeprom_read(struct ipath_devdata *dd, u8 eeprom_offset,
+                       void *buff, int len)
+{
+       int ret;
+
+       ret = mutex_lock_interruptible(&dd->ipath_eep_lock);
+       if (!ret) {
+               ret = ipath_eeprom_internal_read(dd, eeprom_offset, buff, len);
+               mutex_unlock(&dd->ipath_eep_lock);
+       }
+
+       return ret;
+}
+
+/**
+ * ipath_eeprom_write - writes data to the eeprom via I2C
+ * @dd: the infinipath device
+ * @eeprom_offset: where to place data
+ * @buffer: data to write
+ * @len: number of bytes to write
+ */
+int ipath_eeprom_write(struct ipath_devdata *dd, u8 eeprom_offset,
+                       const void *buff, int len)
+{
+       int ret;
+
+       ret = mutex_lock_interruptible(&dd->ipath_eep_lock);
+       if (!ret) {
+               ret = ipath_eeprom_internal_write(dd, eeprom_offset, buff, len);
+               mutex_unlock(&dd->ipath_eep_lock);
+       }
+
+       return ret;
+}
+
+static u8 flash_csum(struct ipath_flash *ifp, int adjust)
+{
+       u8 *ip = (u8 *) ifp;
+       u8 csum = 0, len;
+
+       /*
+        * Limit length checksummed to max length of actual data.
+        * Checksum of erased eeprom will still be bad, but we avoid
+        * reading past the end of the buffer we were passed.
+        */
+       len = ifp->if_length;
+       if (len > sizeof(struct ipath_flash))
+               len = sizeof(struct ipath_flash);
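+       /* checksum is the complement of the byte sum, excluding if_csum itself */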
+       while (len--)
+               csum += *ip++;
+       csum -= ifp->if_csum;
+       csum = ~csum;
+       if (adjust)
+               ifp->if_csum = csum;
+
+       return csum;
+}
+
+/**
+ * ipath_get_eeprom_info - get the GUID and other info from the i2c flash
+ * @dd: the infinipath device
+ *
+ * We have the capability to use the ipath_nguid field, and get
+ * the guid from the first chip's flash, to use for all of them.
+ */
+void ipath_get_eeprom_info(struct ipath_devdata *dd)
+{
+       void *buf;
+       struct ipath_flash *ifp;
+       __be64 guid;
+       int len, eep_stat;
+       u8 csum, *bguid;
+       int t = dd->ipath_unit;
+       struct ipath_devdata *dd0 = ipath_lookup(0);
+
+       if (t && dd0 && dd0->ipath_nguid > 1 && t <= dd0->ipath_nguid) {
+               u8 oguid;
+               dd->ipath_guid = dd0->ipath_guid;
+               bguid = (u8 *) & dd->ipath_guid;
+
+               oguid = bguid[7];
+               bguid[7] += t;
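+               /* on wrap of the low octet, carry into higher octets, never the OUI */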
+               if (oguid > bguid[7]) {
+                       if (bguid[6] == 0xff) {
+                               if (bguid[5] == 0xff) {
+                                       ipath_dev_err(
+                                               dd,
+                                               "Can't set %s GUID from "
+                                               "base, wraps to OUI!\n",
+                                               ipath_get_unit_name(t));
+                                       dd->ipath_guid = 0;
+                                       goto bail;
+                               }
+                               bguid[5]++;
+                       }
+                       bguid[6]++;
+               }
+               dd->ipath_nguid = 1;
+
+               ipath_dbg("nguid %u, so adding %u to device 0 guid, "
+                         "for %llx\n",
+                         dd0->ipath_nguid, t,
+                         (unsigned long long) be64_to_cpu(dd->ipath_guid));
+               goto bail;
+       }
+
+       /*
+        * Read the full flash, not just the currently used part, since it
+        * may have been written with a newer definition.
+        */
+       len = sizeof(struct ipath_flash);
+       buf = vmalloc(len);
+       if (!buf) {
+               ipath_dev_err(dd, "Couldn't allocate memory to read %u "
+                             "bytes from eeprom for GUID\n", len);
+               goto bail;
+       }
+
+       mutex_lock(&dd->ipath_eep_lock);
+       eep_stat = ipath_eeprom_internal_read(dd, 0, buf, len);
+       mutex_unlock(&dd->ipath_eep_lock);
+
+       if (eep_stat) {
+               ipath_dev_err(dd, "Failed reading GUID from eeprom\n");
+               goto done;
+       }
+       ifp = (struct ipath_flash *)buf;
+
+       csum = flash_csum(ifp, 0);
+       if (csum != ifp->if_csum) {
+               dev_info(&dd->pcidev->dev, "Bad I2C flash checksum: "
+                        "0x%x, not 0x%x\n", csum, ifp->if_csum);
+               goto done;
+       }
+       if (*(__be64 *) ifp->if_guid == cpu_to_be64(0) ||
+           *(__be64 *) ifp->if_guid == ~cpu_to_be64(0)) {
+               ipath_dev_err(dd, "Invalid GUID %llx from flash; "
+                             "ignoring\n",
+                             *(unsigned long long *) ifp->if_guid);
+               /* don't allow GUID if all 0 or all 1's */
+               goto done;
+       }
+
+       /* complain, but allow it */
+       if (*(u64 *) ifp->if_guid == 0x100007511000000ULL)
+               dev_info(&dd->pcidev->dev, "Warning, GUID %llx is "
+                        "default, probably not correct!\n",
+                        *(unsigned long long *) ifp->if_guid);
+
+       bguid = ifp->if_guid;
+       if (!bguid[0] && !bguid[1] && !bguid[2]) {
+               /* original incorrect GUID format in flash; fix in
+                * core copy, by shifting up 2 octets; don't need to
+                * change top octet, since both it and shifted are
+                * 0.. */
+               bguid[1] = bguid[3];
+               bguid[2] = bguid[4];
+               bguid[3] = bguid[4] = 0;
+               guid = *(__be64 *) ifp->if_guid;
+               ipath_cdbg(VERBOSE, "Old GUID format in flash, top 3 zero, "
+                          "shifting 2 octets\n");
+       } else
+               guid = *(__be64 *) ifp->if_guid;
+       dd->ipath_guid = guid;
+       dd->ipath_nguid = ifp->if_numguid;
+       /*
+        * Things are slightly complicated by the desire to transparently
+        * support both the Pathscale 10-digit serial number and the QLogic
+        * 13-character version.
+        */
+       if ((ifp->if_fversion > 1) && ifp->if_sprefix[0]
+               && ((u8 *)ifp->if_sprefix)[0] != 0xFF) {
+               /* This board has a Serial-prefix, which is stored
+                * elsewhere for backward-compatibility.
+                */
+               char *snp = dd->ipath_serial;
+               memcpy(snp, ifp->if_sprefix, sizeof ifp->if_sprefix);
+               snp[sizeof ifp->if_sprefix] = '\0';
+               len = strlen(snp);
+               snp += len;
+               len = (sizeof dd->ipath_serial) - len;
+               if (len > sizeof ifp->if_serial) {
+                       len = sizeof ifp->if_serial;
+               }
+               memcpy(snp, ifp->if_serial, len);
+       } else
+               memcpy(dd->ipath_serial, ifp->if_serial,
+                      sizeof ifp->if_serial);
+       if (!strstr(ifp->if_comment, "Tested successfully"))
+               ipath_dev_err(dd, "Board SN %s did not pass functional "
+                       "test: %s\n", dd->ipath_serial,
+                       ifp->if_comment);
+
+       ipath_cdbg(VERBOSE, "Initted GUID to %llx from eeprom\n",
+                  (unsigned long long) be64_to_cpu(dd->ipath_guid));
+
+       memcpy(&dd->ipath_eep_st_errs, &ifp->if_errcntp, IPATH_EEP_LOG_CNT);
+       /*
+        * Power-on (actually "active") hours are kept as little-endian value
+        * in EEPROM, but as seconds in a (possibly as small as 24-bit)
+        * atomic_t while running.
+        */
+       atomic_set(&dd->ipath_active_time, 0);
+       dd->ipath_eep_hrs = ifp->if_powerhour[0] | (ifp->if_powerhour[1] << 8);
+
+done:
+       vfree(buf);
+
+bail:;
+}
+
+/**
+ * ipath_update_eeprom_log - copy active-time and error counters to eeprom
+ * @dd: the infinipath device
+ *
+ * Although the time is kept as seconds in the ipath_devdata struct, it is
+ * rounded to hours for re-write, as we have only 16 bits in EEPROM.
+ * First-cut code reads whole (expected) struct ipath_flash, modifies,
+ * re-writes. Future direction: read/write only what we need, assuming
+ * that the EEPROM had to have been "good enough" for driver init, and
+ * if not, we aren't making it worse.
+ *
+ */
+
+int ipath_update_eeprom_log(struct ipath_devdata *dd)
+{
+       void *buf;
+       struct ipath_flash *ifp;
+       int len, hi_water;
+       uint32_t new_time, new_hrs;
+       u8 csum;
+       int ret, idx;
+       unsigned long flags;
+
+       /* first, check if we actually need to do anything. */
+       ret = 0;
+       for (idx = 0; idx < IPATH_EEP_LOG_CNT; ++idx) {
+               if (dd->ipath_eep_st_new_errs[idx]) {
+                       ret = 1;
+                       break;
+               }
+       }
+       new_time = atomic_read(&dd->ipath_active_time);
+
+       if (ret == 0 && new_time < 3600)
+               return 0;
+
+       /*
+        * The quick check above determined that there is something worthy
+        * of logging, so get the current contents and take a more detailed
+        * look.  Read the full flash, not just the currently used part,
+        * since it may have been written with a newer definition.
+        */
+       len = sizeof(struct ipath_flash);
+       buf = vmalloc(len);
+       ret = 1;
+       if (!buf) {
+               ipath_dev_err(dd, "Couldn't allocate memory to read %u "
+                               "bytes from eeprom for logging\n", len);
+               goto bail;
+       }
+
+       /* Grab the mutex and read the current EEPROM. If we get an
+        * error, let go, but if not, keep it until we finish write.
+        */
+       ret = mutex_lock_interruptible(&dd->ipath_eep_lock);
+       if (ret) {
+               ipath_dev_err(dd, "Unable to acquire EEPROM for logging\n");
+               goto free_bail;
+       }
+       ret = ipath_eeprom_internal_read(dd, 0, buf, len);
+       if (ret) {
+               mutex_unlock(&dd->ipath_eep_lock);
+               ipath_dev_err(dd, "Unable to read EEPROM for logging\n");
+               goto free_bail;
+       }
+       ifp = (struct ipath_flash *)buf;
+
+       csum = flash_csum(ifp, 0);
+       if (csum != ifp->if_csum) {
+               mutex_unlock(&dd->ipath_eep_lock);
+               ipath_dev_err(dd, "EEPROM cks err (0x%02X, S/B 0x%02X)\n",
+                               csum, ifp->if_csum);
+               ret = 1;
+               goto free_bail;
+       }
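+       /* hi_water is the highest offset modified; only that prefix is rewritten */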
+       hi_water = 0;
+       spin_lock_irqsave(&dd->ipath_eep_st_lock, flags);
+       for (idx = 0; idx < IPATH_EEP_LOG_CNT; ++idx) {
+               int new_val = dd->ipath_eep_st_new_errs[idx];
+               if (new_val) {
+                       /*
+                        * If we have seen any errors, add to EEPROM values
+                        * We need to saturate at 0xFF (255) and we also
+                        * would need to adjust the checksum if we were
+                        * trying to minimize EEPROM traffic
+                        * Note that we add to actual current count in EEPROM,
+                        * in case it was altered while we were running.
+                        */
+                       new_val += ifp->if_errcntp[idx];
+                       if (new_val > 0xFF)
+                               new_val = 0xFF;
+                       if (ifp->if_errcntp[idx] != new_val) {
+                               ifp->if_errcntp[idx] = new_val;
+                               hi_water = offsetof(struct ipath_flash,
+                                               if_errcntp) + idx;
+                       }
+                       /*
+                        * update our shadow (used to minimize EEPROM
+                        * traffic), to match what we are about to write.
+                        */
+                       dd->ipath_eep_st_errs[idx] = new_val;
+                       dd->ipath_eep_st_new_errs[idx] = 0;
+               }
+       }
+       /*
+        * now update active-time. We would like to round to the nearest hour
+        * but unless atomic_t are sure to be proper signed ints we cannot,
+        * because we need to account for what we "transfer" to EEPROM and
+        * if we log an hour at 31 minutes, then we would need to set
+        * active_time to -29 to accurately count the _next_ hour.
+        */
+       if (new_time >= 3600) {
+               new_hrs = new_time / 3600;
+               atomic_sub((new_hrs * 3600), &dd->ipath_active_time);
+               new_hrs += dd->ipath_eep_hrs;
+               if (new_hrs > 0xFFFF)
+                       new_hrs = 0xFFFF;
+               dd->ipath_eep_hrs = new_hrs;
+               if ((new_hrs & 0xFF) != ifp->if_powerhour[0]) {
+                       ifp->if_powerhour[0] = new_hrs & 0xFF;
+                       hi_water = offsetof(struct ipath_flash, if_powerhour);
+               }
+               if ((new_hrs >> 8) != ifp->if_powerhour[1]) {
+                       ifp->if_powerhour[1] = new_hrs >> 8;
+                       hi_water = offsetof(struct ipath_flash, if_powerhour)
+                                       + 1;
+               }
+       }
+       /*
+        * There is a tiny possibility that we could somehow fail to write
+        * the EEPROM after updating our shadows, but problems from holding
+        * the spinlock too long are a much bigger issue.
+        */
+       spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags);
+       if (hi_water) {
+               /* we made some change to the data, so update checksum and write */
+               csum = flash_csum(ifp, 1);
+               ret = ipath_eeprom_internal_write(dd, 0, buf, hi_water + 1);
+       }
+       mutex_unlock(&dd->ipath_eep_lock);
+       if (ret)
+               ipath_dev_err(dd, "Failed updating EEPROM\n");
+
+free_bail:
+       vfree(buf);
+bail:
+       return ret;
+
+}
+
+/**
+ * ipath_inc_eeprom_err - increment one of the four error counters
+ * that are logged to EEPROM.
+ * @dd: the infinipath device
+ * @eidx: 0..3, the counter to increment
+ * @incr: how much to add
+ *
+ * Each counter is 8-bits, and saturates at 255 (0xFF). They
+ * are copied to the EEPROM (aka flash) whenever ipath_update_eeprom_log()
+ * is called, but it can only be called in a context that allows sleep.
+ * This function can be called even at interrupt level.
+ */
+
+void ipath_inc_eeprom_err(struct ipath_devdata *dd, u32 eidx, u32 incr)
+{
+       uint new_val;
+       unsigned long flags;
+
+       spin_lock_irqsave(&dd->ipath_eep_st_lock, flags);
+       new_val = dd->ipath_eep_st_new_errs[eidx] + incr;
+       if (new_val > 255)
+               new_val = 255;
+       dd->ipath_eep_st_new_errs[eidx] = new_val;
+       spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags);
+       return;
+}
+
+static int ipath_tempsense_internal_read(struct ipath_devdata *dd, u8 regnum)
+{
+       int ret;
+       struct i2c_chain_desc *icd;
+
+       ret = -ENOENT;
+
+       icd = ipath_i2c_type(dd);
+       if (!icd)
+               goto bail;
+
+       if (icd->temp_dev == IPATH_NO_DEV) {
+               /* tempsense only exists on new, real-I2C boards */
+               ret = -ENXIO;
+               goto bail;
+       }
+
+       if (i2c_startcmd(dd, icd->temp_dev | WRITE_CMD)) {
+               ipath_dbg("Failed tempsense startcmd\n");
+               stop_cmd(dd);
+               ret = -ENXIO;
+               goto bail;
+       }
+       ret = wr_byte(dd, regnum);
+       stop_cmd(dd);
+       if (ret) {
+               ipath_dev_err(dd, "Failed tempsense WR command %02X\n",
+                             regnum);
+               ret = -ENXIO;
+               goto bail;
+       }
+       if (i2c_startcmd(dd, icd->temp_dev | READ_CMD)) {
+               ipath_dbg("Failed tempsense RD startcmd\n");
+               stop_cmd(dd);
+               ret = -ENXIO;
+               goto bail;
+       }
+       /*
+        * We can only clock out one byte per command, sensibly
+        */
+       ret = rd_byte(dd);
+       stop_cmd(dd);
+
+bail:
+       return ret;
+}
+
+#define VALID_TS_RD_REG_MASK 0xBF
+
+/**
+ * ipath_tempsense_read - read register of temp sensor via I2C
+ * @dd: the infinipath device
+ * @regnum: register to read from
+ *
+ * returns reg contents (0..255) or < 0 for error
+ */
+int ipath_tempsense_read(struct ipath_devdata *dd, u8 regnum)
+{
+       int ret;
+
+       if (regnum > 7)
+               return -EINVAL;
+
+       /* return a bogus value for (the one) register we do not have */
+       if (!((1 << regnum) & VALID_TS_RD_REG_MASK))
+               return 0;
+
+       ret = mutex_lock_interruptible(&dd->ipath_eep_lock);
+       if (!ret) {
+               ret = ipath_tempsense_internal_read(dd, regnum);
+               mutex_unlock(&dd->ipath_eep_lock);
+       }
+
+       /*
+        * There are three possibilities here:
+        * ret is actual value (0..255)
+        * ret is -ENXIO or -EINVAL from code in this file
+        * ret is -EINTR from mutex_lock_interruptible.
+        */
+       return ret;
+}
+
+static int ipath_tempsense_internal_write(struct ipath_devdata *dd,
+                                         u8 regnum, u8 data)
+{
+       int ret = -ENOENT;
+       struct i2c_chain_desc *icd;
+
+       icd = ipath_i2c_type(dd);
+       if (!icd)
+               goto bail;
+
+       if (icd->temp_dev == IPATH_NO_DEV) {
+               /* tempsense only exists on new, real-I2C boards */
+               ret = -ENXIO;
+               goto bail;
+       }
+       if (i2c_startcmd(dd, icd->temp_dev | WRITE_CMD)) {
+               ipath_dbg("Failed tempsense startcmd\n");
+               stop_cmd(dd);
+               ret = -ENXIO;
+               goto bail;
+       }
+       ret = wr_byte(dd, regnum);
+       if (ret) {
+               stop_cmd(dd);
+               ipath_dev_err(dd, "Failed to write tempsense command %02X\n",
+                             regnum);
+               ret = -ENXIO;
+               goto bail;
+       }
+       ret = wr_byte(dd, data);
+       stop_cmd(dd);
+       ret = i2c_startcmd(dd, icd->temp_dev | READ_CMD);
+       if (ret) {
+               ipath_dev_err(dd, "Failed tempsense data write to %02X\n",
+                             regnum);
+               ret = -ENXIO;
+       }
+
+bail:
+       return ret;
+}
+
+#define VALID_TS_WR_REG_MASK ((1 << 9) | (1 << 0xB) | (1 << 0xD))
+
+/**
+ * ipath_tempsense_write - write register of temp sensor via I2C
+ * @dd: the infinipath device
+ * @regnum: register to write
+ * @data: data to write
+ *
+ * returns 0 for success or < 0 for error
+ */
+int ipath_tempsense_write(struct ipath_devdata *dd, u8 regnum, u8 data)
+{
+       int ret;
+
+       if (regnum > 15 || !((1 << regnum) & VALID_TS_WR_REG_MASK))
+               return -EINVAL;
+
+       ret = mutex_lock_interruptible(&dd->ipath_eep_lock);
+       if (!ret) {
+               ret = ipath_tempsense_internal_write(dd, regnum, data);
+               mutex_unlock(&dd->ipath_eep_lock);
+       }
+
+       /*
+        * There are three possibilities here:
+        * ret is 0 for success
+        * ret is -ENXIO or -EINVAL from code in this file
+        * ret is -EINTR from mutex_lock_interruptible.
+        */
+       return ret;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_file_ops.c b/drivers/staging/rdma/ipath/ipath_file_ops.c
new file mode 100644 (file)
index 0000000..450d159
--- /dev/null
@@ -0,0 +1,2620 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/pci.h>
+#include <linux/poll.h>
+#include <linux/cdev.h>
+#include <linux/swap.h>
+#include <linux/export.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/io.h>
+#include <linux/jiffies.h>
+#include <linux/cpu.h>
+#include <linux/uio.h>
+#include <asm/pgtable.h>
+
+#include "ipath_kernel.h"
+#include "ipath_common.h"
+#include "ipath_user_sdma.h"
+
+static int ipath_open(struct inode *, struct file *);
+static int ipath_close(struct inode *, struct file *);
+static ssize_t ipath_write(struct file *, const char __user *, size_t,
+                          loff_t *);
+static ssize_t ipath_write_iter(struct kiocb *, struct iov_iter *from);
+static unsigned int ipath_poll(struct file *, struct poll_table_struct *);
+static int ipath_mmap(struct file *, struct vm_area_struct *);
+
+/*
+ * This is really, really weird shit - write() and writev() here
+ * have completely unrelated semantics.  Sucky userland ABI,
+ * film at 11.
+ */
+static const struct file_operations ipath_file_ops = {
+       .owner = THIS_MODULE,
+       .write = ipath_write,
+       .write_iter = ipath_write_iter,
+       .open = ipath_open,
+       .release = ipath_close,
+       .poll = ipath_poll,
+       .mmap = ipath_mmap,
+       .llseek = noop_llseek,
+};
+
+/*
+ * Convert kernel virtual addresses to physical addresses so they don't
+ * potentially conflict with the chip addresses used as mmap offsets.
+ * It doesn't really matter what mmap offset we use as long as we can
+ * interpret it correctly.
+ */
+static u64 cvt_kvaddr(void *p)
+{
+       struct page *page;
+       u64 paddr = 0;
+
+       page = vmalloc_to_page(p);
+       if (page)
+               paddr = page_to_pfn(page) << PAGE_SHIFT;
+
+       return paddr;
+}
+
+static int ipath_get_base_info(struct file *fp,
+                              void __user *ubase, size_t ubase_size)
+{
+       struct ipath_portdata *pd = port_fp(fp);
+       int ret = 0;
+       struct ipath_base_info *kinfo = NULL;
+       struct ipath_devdata *dd = pd->port_dd;
+       unsigned subport_cnt;
+       int shared, master;
+       size_t sz;
+
+       subport_cnt = pd->port_subport_cnt;
+       if (!subport_cnt) {
+               shared = 0;
+               master = 0;
+               subport_cnt = 1;
+       } else {
+               shared = 1;
+               master = !subport_fp(fp);
+       }
+
+       sz = sizeof(*kinfo);
+       /* If port sharing is not requested, allow the old size structure */
+       if (!shared)
+               sz -= 7 * sizeof(u64);
+       if (ubase_size < sz) {
+               ipath_cdbg(PROC,
+                          "Base size %zu, need %zu (version mismatch?)\n",
+                          ubase_size, sz);
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       kinfo = kzalloc(sizeof(*kinfo), GFP_KERNEL);
+       if (kinfo == NULL) {
+               ret = -ENOMEM;
+               goto bail;
+       }
+
+       ret = dd->ipath_f_get_base_info(pd, kinfo);
+       if (ret < 0)
+               goto bail;
+
+       kinfo->spi_rcvhdr_cnt = dd->ipath_rcvhdrcnt;
+       kinfo->spi_rcvhdrent_size = dd->ipath_rcvhdrentsize;
+       kinfo->spi_tidegrcnt = dd->ipath_rcvegrcnt;
+       kinfo->spi_rcv_egrbufsize = dd->ipath_rcvegrbufsize;
+       /*
+        * have to mmap whole thing
+        */
+       kinfo->spi_rcv_egrbuftotlen =
+               pd->port_rcvegrbuf_chunks * pd->port_rcvegrbuf_size;
+       kinfo->spi_rcv_egrperchunk = pd->port_rcvegrbufs_perchunk;
+       kinfo->spi_rcv_egrchunksize = kinfo->spi_rcv_egrbuftotlen /
+               pd->port_rcvegrbuf_chunks;
+       kinfo->spi_tidcnt = dd->ipath_rcvtidcnt / subport_cnt;
+       if (master)
+               kinfo->spi_tidcnt += dd->ipath_rcvtidcnt % subport_cnt;
+       /*
+        * for this use, may be ipath_cfgports summed over all chips that
+        * are configured and present
+        */
+       kinfo->spi_nports = dd->ipath_cfgports;
+       /* unit (chip/board) our port is on */
+       kinfo->spi_unit = dd->ipath_unit;
+       /* for now, only a single page */
+       kinfo->spi_tid_maxsize = PAGE_SIZE;
+
+       /*
+        * Doing this per port, and based on the skip value, etc.  This has
+        * to be the actual buffer size, since the protocol code treats it
+        * as an array.
+        *
+        * These have to be set to user addresses in the user code via mmap.
+        * These values are used on return to user code for the mmap target
+        * addresses only.  For 32 bit, same 44 bit address problem, so use
+        * the physical address, not virtual.  Before 2.6.11, using the
+        * page_address() macro worked, but in 2.6.11, even that returns the
+        * full 64 bit address (upper bits all 1's).  So far, using the
+        * physical addresses (or chip offsets, for chip mapping) works, but
+        * no doubt some future kernel release will change that, and we'll be
+        * on to yet another method of dealing with this.
+        */
+       kinfo->spi_rcvhdr_base = (u64) pd->port_rcvhdrq_phys;
+       kinfo->spi_rcvhdr_tailaddr = (u64) pd->port_rcvhdrqtailaddr_phys;
+       kinfo->spi_rcv_egrbufs = (u64) pd->port_rcvegr_phys;
+       kinfo->spi_pioavailaddr = (u64) dd->ipath_pioavailregs_phys;
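+       /*
+        * The status word lives in the same DMA page as the pioavail
+        * registers; export it as the pioavail address plus the offset
+        * of the status pointer within that page.
+        */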
+       kinfo->spi_status = (u64) kinfo->spi_pioavailaddr +
+               (void *) dd->ipath_statusp -
+               (void *) dd->ipath_pioavailregs_dma;
+       if (!shared) {
+               kinfo->spi_piocnt = pd->port_piocnt;
+               kinfo->spi_piobufbase = (u64) pd->port_piobufs;
+               kinfo->__spi_uregbase = (u64) dd->ipath_uregbase +
+                       dd->ipath_ureg_align * pd->port_port;
+       } else if (master) {
+               kinfo->spi_piocnt = (pd->port_piocnt / subport_cnt) +
+                                   (pd->port_piocnt % subport_cnt);
+               /* Master's PIO buffers are after all the slaves' */
+               kinfo->spi_piobufbase = (u64) pd->port_piobufs +
+                       dd->ipath_palign *
+                       (pd->port_piocnt - kinfo->spi_piocnt);
+       } else {
+               unsigned slave = subport_fp(fp) - 1;
+
+               kinfo->spi_piocnt = pd->port_piocnt / subport_cnt;
+               kinfo->spi_piobufbase = (u64) pd->port_piobufs +
+                       dd->ipath_palign * kinfo->spi_piocnt * slave;
+       }
+
+       if (shared) {
+               kinfo->spi_port_uregbase = (u64) dd->ipath_uregbase +
+                       dd->ipath_ureg_align * pd->port_port;
+               kinfo->spi_port_rcvegrbuf = kinfo->spi_rcv_egrbufs;
+               kinfo->spi_port_rcvhdr_base = kinfo->spi_rcvhdr_base;
+               kinfo->spi_port_rcvhdr_tailaddr = kinfo->spi_rcvhdr_tailaddr;
+
+               kinfo->__spi_uregbase = cvt_kvaddr(pd->subport_uregbase +
+                       PAGE_SIZE * subport_fp(fp));
+
+               kinfo->spi_rcvhdr_base = cvt_kvaddr(pd->subport_rcvhdr_base +
+                       pd->port_rcvhdrq_size * subport_fp(fp));
+               kinfo->spi_rcvhdr_tailaddr = 0;
+               kinfo->spi_rcv_egrbufs = cvt_kvaddr(pd->subport_rcvegrbuf +
+                       pd->port_rcvegrbuf_chunks * pd->port_rcvegrbuf_size *
+                       subport_fp(fp));
+
+               kinfo->spi_subport_uregbase =
+                       cvt_kvaddr(pd->subport_uregbase);
+               kinfo->spi_subport_rcvegrbuf =
+                       cvt_kvaddr(pd->subport_rcvegrbuf);
+               kinfo->spi_subport_rcvhdr_base =
+                       cvt_kvaddr(pd->subport_rcvhdr_base);
+               ipath_cdbg(PROC, "port %u flags %x %llx %llx %llx\n",
+                       kinfo->spi_port, kinfo->spi_runtime_flags,
+                       (unsigned long long) kinfo->spi_subport_uregbase,
+                       (unsigned long long) kinfo->spi_subport_rcvegrbuf,
+                       (unsigned long long) kinfo->spi_subport_rcvhdr_base);
+       }
+
+       /*
+        * All user buffers are 2KB buffers.  If we ever support
+        * giving 4KB buffers to user processes, this will need some
+        * work.
+        */
+       kinfo->spi_pioindex = (kinfo->spi_piobufbase -
+               (dd->ipath_piobufbase & 0xffffffff)) / dd->ipath_palign;
+       kinfo->spi_pioalign = dd->ipath_palign;
+
+       kinfo->spi_qpair = IPATH_KD_QP;
+       /*
+        * user mode PIO buffers are always 2KB, even when 4KB can
+        * be received, and sent via the kernel; this is ibmaxlen
+        * for 2K MTU.
+        */
+       kinfo->spi_piosize = dd->ipath_piosize2k - 2 * sizeof(u32);
+       kinfo->spi_mtu = dd->ipath_ibmaxlen;    /* maxlen, not ibmtu */
+       kinfo->spi_port = pd->port_port;
+       kinfo->spi_subport = subport_fp(fp);
+       kinfo->spi_sw_version = IPATH_KERN_SWVERSION;
+       kinfo->spi_hw_version = dd->ipath_revision;
+
+       if (master) {
+               kinfo->spi_runtime_flags |= IPATH_RUNTIME_MASTER;
+       }
+
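+       /* copy back no more than the caller's buffer can hold */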
+       sz = (ubase_size < sizeof(*kinfo)) ? ubase_size : sizeof(*kinfo);
+       if (copy_to_user(ubase, kinfo, sz))
+               ret = -EFAULT;
+
+bail:
+       kfree(kinfo);
+       return ret;
+}
+
+/**
+ * ipath_tid_update - update a port TID
+ * @pd: the port
+ * @fp: the ipath device file
+ * @ti: the TID information
+ *
+ * The new implementation as of Oct 2004 is that the driver assigns
+ * the tid and returns it to the caller.   To make it easier to
+ * catch bugs, and to reduce search time, we keep a cursor for
+ * each port, walking the shadow tid array to find one that's not
+ * in use.
+ *
+ * For now, if we can't allocate the full list, we fail, although
+ * in the long run, we'll allocate as many as we can, and the
+ * caller will deal with that by trying the remaining pages later.
+ * That means that when we fail, we have to mark the tids as not in
+ * use again, in our shadow copy.
+ *
+ * It's up to the caller to free the tids when they are done.
+ * We'll unlock the pages as they free them.
+ *
+ * Also, right now we are locking one page at a time, but since
+ * the intended use of this routine is for a single group of
+ * virtually contiguous pages, that should change to improve
+ * performance.
+ */
+static int ipath_tid_update(struct ipath_portdata *pd, struct file *fp,
+                           const struct ipath_tid_info *ti)
+{
+       int ret = 0, ntids;
+       u32 tid, porttid, cnt, i, tidcnt, tidoff;
+       u16 *tidlist;
+       struct ipath_devdata *dd = pd->port_dd;
+       u64 physaddr;
+       unsigned long vaddr;
+       u64 __iomem *tidbase;
+       unsigned long tidmap[8];
+       struct page **pagep = NULL;
+       unsigned subport = subport_fp(fp);
+
+       if (!dd->ipath_pageshadow) {
+               ret = -ENOMEM;
+               goto done;
+       }
+
+       cnt = ti->tidcnt;
+       if (!cnt) {
+               ipath_dbg("After copyin, tidcnt 0, tidlist %llx\n",
+                         (unsigned long long) ti->tidlist);
+               /*
+                * Should we treat this as success?  Likely a bug.
+                */
+               ret = -EFAULT;
+               goto done;
+       }
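+       /*
+        * Carve the port's TID range up among subports: slaves get
+        * equal shares at the start of the range, and the master
+        * (subport 0) gets its share plus any remainder at the end.
+        */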
+       porttid = pd->port_port * dd->ipath_rcvtidcnt;
+       if (!pd->port_subport_cnt) {
+               tidcnt = dd->ipath_rcvtidcnt;
+               tid = pd->port_tidcursor;
+               tidoff = 0;
+       } else if (!subport) {
+               tidcnt = (dd->ipath_rcvtidcnt / pd->port_subport_cnt) +
+                        (dd->ipath_rcvtidcnt % pd->port_subport_cnt);
+               tidoff = dd->ipath_rcvtidcnt - tidcnt;
+               porttid += tidoff;
+               tid = tidcursor_fp(fp);
+       } else {
+               tidcnt = dd->ipath_rcvtidcnt / pd->port_subport_cnt;
+               tidoff = tidcnt * (subport - 1);
+               porttid += tidoff;
+               tid = tidcursor_fp(fp);
+       }
+       if (cnt > tidcnt) {
+               /* make sure it all fits in port_tid_pg_list */
+               dev_info(&dd->pcidev->dev, "Process tried to allocate %u "
+                        "TIDs, only trying max (%u)\n", cnt, tidcnt);
+               cnt = tidcnt;
+       }
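+       /*
+        * port_tid_pg_list holds an array of ipath_rcvtidcnt page
+        * pointers followed by an array of u16 TID indices; tidoff
+        * selects this subport's slice of each.
+        */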
+       pagep = &((struct page **) pd->port_tid_pg_list)[tidoff];
+       tidlist = &((u16 *) &pagep[dd->ipath_rcvtidcnt])[tidoff];
+
+       memset(tidmap, 0, sizeof(tidmap));
+       /* before decrement; chip actual # */
+       ntids = tidcnt;
+       tidbase = (u64 __iomem *) (((char __iomem *) dd->ipath_kregbase) +
+                                  dd->ipath_rcvtidbase +
+                                  porttid * sizeof(*tidbase));
+
+       ipath_cdbg(VERBOSE, "Port%u %u tids, cursor %u, tidbase %p\n",
+                  pd->port_port, cnt, tid, tidbase);
+
+       /* virtual address of first page in transfer */
+       vaddr = ti->tidvaddr;
+       if (!access_ok(VERIFY_WRITE, (void __user *) vaddr,
+                      cnt * PAGE_SIZE)) {
+               ipath_dbg("Fail vaddr %p, %u pages, !access_ok\n",
+                         (void *)vaddr, cnt);
+               ret = -EFAULT;
+               goto done;
+       }
+       ret = ipath_get_user_pages(vaddr, cnt, pagep);
+       if (ret) {
+               if (ret == -EBUSY) {
+                       ipath_dbg("Failed to lock addr %p, %u pages "
+                                 "(already locked)\n",
+                                 (void *) vaddr, cnt);
+                       /*
+                        * for now, continue, and see what happens but with
+                        * the new implementation, this should never happen,
+                        * unless perhaps the user has mpin'ed the pages
+                        * themselves (something we need to test)
+                        */
+                       ret = 0;
+               } else {
+                       dev_info(&dd->pcidev->dev,
+                                "Failed to lock addr %p, %u pages: "
+                                "errno %d\n", (void *) vaddr, cnt, -ret);
+                       goto done;
+               }
+       }
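+       /*
+        * For each locked page: find a free TID slot via the cursor,
+        * record the page in the shadow arrays, DMA-map it, and program
+        * the chip's expected-TID entry with the mapped address.
+        */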
+       for (i = 0; i < cnt; i++, vaddr += PAGE_SIZE) {
+               for (; ntids--; tid++) {
+                       if (tid == tidcnt)
+                               tid = 0;
+                       if (!dd->ipath_pageshadow[porttid + tid])
+                               break;
+               }
+               if (ntids < 0) {
+                       /*
+                        * oops, wrapped all the way through their TIDs,
+                        * and didn't have enough free; see comments at
+                        * start of routine
+                        */
+                       ipath_dbg("Not enough free TIDs for %u pages "
+                                 "(index %d), failing\n", cnt, i);
+                       i--;    /* last tidlist[i] not filled in */
+                       ret = -ENOMEM;
+                       break;
+               }
+               tidlist[i] = tid + tidoff;
+               ipath_cdbg(VERBOSE, "Updating idx %u to TID %u, "
+                          "vaddr %lx\n", i, tid + tidoff, vaddr);
+               /* we "know" system pages and TID pages are same size */
+               dd->ipath_pageshadow[porttid + tid] = pagep[i];
+               dd->ipath_physshadow[porttid + tid] = ipath_map_page(
+                       dd->pcidev, pagep[i], 0, PAGE_SIZE,
+                       PCI_DMA_FROMDEVICE);
+               /*
+                * don't need an atomic operation or its overhead
+                */
+               __set_bit(tid, tidmap);
+               physaddr = dd->ipath_physshadow[porttid + tid];
+               ipath_stats.sps_pagelocks++;
+               ipath_cdbg(VERBOSE,
+                          "TID %u, vaddr %lx, physaddr %llx pgp %p\n",
+                          tid, vaddr, (unsigned long long) physaddr,
+                          pagep[i]);
+               dd->ipath_f_put_tid(dd, &tidbase[tid], RCVHQ_RCV_TYPE_EXPECTED,
+                                   physaddr);
+               /*
+                * don't check this tid in ipath_portshadow, since we
+                * just filled it in; start with the next one.
+                */
+               tid++;
+       }
+
+       if (ret) {
+               u32 limit;
+       cleanup:
+               /* jump here if copy out of updated info failed... */
+               ipath_dbg("After failure (ret=%d), undo %d of %d entries\n",
+                         -ret, i, cnt);
+               /* same code that's in ipath_free_tid() */
+               limit = sizeof(tidmap) * BITS_PER_BYTE;
+               if (limit > tidcnt)
+                       /* just in case size changes in future */
+                       limit = tidcnt;
+               tid = find_first_bit((const unsigned long *)tidmap, limit);
+               for (; tid < limit; tid++) {
+                       if (!test_bit(tid, tidmap))
+                               continue;
+                       if (dd->ipath_pageshadow[porttid + tid]) {
+                               ipath_cdbg(VERBOSE, "Freeing TID %u\n",
+                                          tid);
+                               dd->ipath_f_put_tid(dd, &tidbase[tid],
+                                                   RCVHQ_RCV_TYPE_EXPECTED,
+                                                   dd->ipath_tidinvalid);
+                               pci_unmap_page(dd->pcidev,
+                                       dd->ipath_physshadow[porttid + tid],
+                                       PAGE_SIZE, PCI_DMA_FROMDEVICE);
+                               dd->ipath_pageshadow[porttid + tid] = NULL;
+                               ipath_stats.sps_pageunlocks++;
+                       }
+               }
+               ipath_release_user_pages(pagep, cnt);
+       } else {
+               /*
+                * Copy the updated array, with ipath_tid's filled in, back
+                * to user.  Since we did the copy in already, this "should
+                * never fail" If it does, we have to clean up...
+                */
+               if (copy_to_user((void __user *)
+                                (unsigned long) ti->tidlist,
+                                tidlist, cnt * sizeof(*tidlist))) {
+                       ret = -EFAULT;
+                       goto cleanup;
+               }
+               if (copy_to_user((void __user *) (unsigned long) ti->tidmap,
+                                tidmap, sizeof tidmap)) {
+                       ret = -EFAULT;
+                       goto cleanup;
+               }
+               if (tid == tidcnt)
+                       tid = 0;
+               if (!pd->port_subport_cnt)
+                       pd->port_tidcursor = tid;
+               else
+                       tidcursor_fp(fp) = tid;
+       }
+
+done:
+       if (ret)
+               ipath_dbg("Failed to map %u TID pages, failing with %d\n",
+                         ti->tidcnt, -ret);
+       return ret;
+}
+
+/**
+ * ipath_tid_free - free a port TID
+ * @pd: the port
+ * @subport: the subport
+ * @ti: the TID info
+ *
+ * Right now we are unlocking one page at a time, but since
+ * the intended use of this routine is for a single group of
+ * virtually contiguous pages, that should change to improve
+ * performance.  We check that the TID is in range for this port,
+ * but otherwise don't check validity; if the user has an error and
+ * frees the wrong tid, it's only their own data that can thereby
+ * be corrupted.  We do check that the TID was in use, for sanity.
+ * We always use our idea of the saved address, not the address that
+ * they pass in to us.
+ */
+
+static int ipath_tid_free(struct ipath_portdata *pd, unsigned subport,
+                         const struct ipath_tid_info *ti)
+{
+       int ret = 0;
+       u32 tid, porttid, cnt, limit, tidcnt;
+       struct ipath_devdata *dd = pd->port_dd;
+       u64 __iomem *tidbase;
+       unsigned long tidmap[8];
+
+       if (!dd->ipath_pageshadow) {
+               ret = -ENOMEM;
+               goto done;
+       }
+
+       if (copy_from_user(tidmap, (void __user *)(unsigned long)ti->tidmap,
+                          sizeof tidmap)) {
+               ret = -EFAULT;
+               goto done;
+       }
+
+       porttid = pd->port_port * dd->ipath_rcvtidcnt;
+       if (!pd->port_subport_cnt)
+               tidcnt = dd->ipath_rcvtidcnt;
+       else if (!subport) {
+               tidcnt = (dd->ipath_rcvtidcnt / pd->port_subport_cnt) +
+                        (dd->ipath_rcvtidcnt % pd->port_subport_cnt);
+               porttid += dd->ipath_rcvtidcnt - tidcnt;
+       } else {
+               tidcnt = dd->ipath_rcvtidcnt / pd->port_subport_cnt;
+               porttid += tidcnt * (subport - 1);
+       }
+       tidbase = (u64 __iomem *) ((char __iomem *)(dd->ipath_kregbase) +
+                                  dd->ipath_rcvtidbase +
+                                  porttid * sizeof(*tidbase));
+
+       limit = sizeof(tidmap) * BITS_PER_BYTE;
+       if (limit > tidcnt)
+               /* just in case size changes in future */
+               limit = tidcnt;
+       tid = find_first_bit(tidmap, limit);
+       ipath_cdbg(VERBOSE, "Port%u free %u tids; first bit (max=%d) "
+                  "set is %d, porttid %u\n", pd->port_port, ti->tidcnt,
+                  limit, tid, porttid);
+       for (cnt = 0; tid < limit; tid++) {
+               /*
+                * small optimization; if we detect a run of 3 or so without
+                * any set, use find_first_bit again.  That's mainly to
+                * accelerate the case where we wrapped, so we have some at
+                * the beginning, and some at the end, and a big gap
+                * in the middle.
+                */
+               if (!test_bit(tid, tidmap))
+                       continue;
+               cnt++;
+               if (dd->ipath_pageshadow[porttid + tid]) {
+                       struct page *p;
+                       p = dd->ipath_pageshadow[porttid + tid];
+                       dd->ipath_pageshadow[porttid + tid] = NULL;
+                       ipath_cdbg(VERBOSE, "PID %u freeing TID %u\n",
+                                  pid_nr(pd->port_pid), tid);
+                       dd->ipath_f_put_tid(dd, &tidbase[tid],
+                                           RCVHQ_RCV_TYPE_EXPECTED,
+                                           dd->ipath_tidinvalid);
+                       pci_unmap_page(dd->pcidev,
+                               dd->ipath_physshadow[porttid + tid],
+                               PAGE_SIZE, PCI_DMA_FROMDEVICE);
+                       ipath_release_user_pages(&p, 1);
+                       ipath_stats.sps_pageunlocks++;
+               } else
+                       ipath_dbg("Unused tid %u, ignoring\n", tid);
+       }
+       if (cnt != ti->tidcnt)
+               ipath_dbg("passed in tidcnt %d, only %d bits set in map\n",
+                         ti->tidcnt, cnt);
+done:
+       if (ret)
+               ipath_dbg("Failed to unmap %u TID pages, failing with %d\n",
+                         ti->tidcnt, -ret);
+       return ret;
+}
+
+/**
+ * ipath_set_part_key - set a partition key
+ * @pd: the port
+ * @key: the key
+ *
+ * We can have up to 4 active at a time (other than the default, which is
+ * always allowed).  This is somewhat tricky, since multiple ports may set
+ * the same key, so we reference count them, and clean up at exit.  All 4
+ * partition keys are packed into a single infinipath register.  It's an
+ * error for a process to set the same pkey multiple times.  We provide no
+ * mechanism to de-allocate a pkey at this time; we may eventually need to
+ * do that.  I've used atomic operations and no locking, and only make
+ * a single pass through what's available.  This should be more than
+ * adequate for some time. I'll think about spinlocks or the like if and as
+ * it's necessary.
+ */
+static int ipath_set_part_key(struct ipath_portdata *pd, u16 key)
+{
+       struct ipath_devdata *dd = pd->port_dd;
+       int i, any = 0, pidx = -1;
+       u16 lkey = key & 0x7FFF;
+       int ret;
+
+       if (lkey == (IPATH_DEFAULT_P_KEY & 0x7FFF)) {
+               /* nothing to do; this key always valid */
+               ret = 0;
+               goto bail;
+       }
+
+       ipath_cdbg(VERBOSE, "p%u try to set pkey %hx, current keys "
+                  "%hx:%x %hx:%x %hx:%x %hx:%x\n",
+                  pd->port_port, key, dd->ipath_pkeys[0],
+                  atomic_read(&dd->ipath_pkeyrefs[0]), dd->ipath_pkeys[1],
+                  atomic_read(&dd->ipath_pkeyrefs[1]), dd->ipath_pkeys[2],
+                  atomic_read(&dd->ipath_pkeyrefs[2]), dd->ipath_pkeys[3],
+                  atomic_read(&dd->ipath_pkeyrefs[3]));
+
+       if (!lkey) {
+               ipath_cdbg(PROC, "p%u tries to set key 0, not allowed\n",
+                          pd->port_port);
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       /*
+        * Set the full membership bit, because it has to be
+        * set in the register or the packet, and it seems
+        * cleaner to set in the register than to force all
+        * callers to set it. (see bug 4331)
+        */
+       key |= 0x8000;
+
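+       /*
+        * First pass over this port's own pkey slots: remember the first
+        * free slot and reject a pkey this process has already set.
+        */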
+       for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) {
+               if (!pd->port_pkeys[i] && pidx == -1)
+                       pidx = i;
+               if (pd->port_pkeys[i] == key) {
+                       ipath_cdbg(VERBOSE, "p%u tries to set same pkey "
+                                  "(%x) more than once\n",
+                                  pd->port_port, key);
+                       ret = -EEXIST;
+                       goto bail;
+               }
+       }
+       if (pidx == -1) {
+               ipath_dbg("All pkeys for port %u already in use, "
+                         "can't set %x\n", pd->port_port, key);
+               ret = -EBUSY;
+               goto bail;
+       }
+       for (any = i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
+               if (!dd->ipath_pkeys[i]) {
+                       any++;
+                       continue;
+               }
+               if (dd->ipath_pkeys[i] == key) {
+                       atomic_t *pkrefs = &dd->ipath_pkeyrefs[i];
+
+                       if (atomic_inc_return(pkrefs) > 1) {
+                               pd->port_pkeys[pidx] = key;
+                               ipath_cdbg(VERBOSE, "p%u set key %x "
+                                          "matches #%d, count now %d\n",
+                                          pd->port_port, key, i,
+                                          atomic_read(pkrefs));
+                               ret = 0;
+                               goto bail;
+                       } else {
+                               /*
+                                * lost race, decrement count, catch below
+                                */
+                               atomic_dec(pkrefs);
+                               ipath_cdbg(VERBOSE, "Lost race, count was "
+                                          "0, after dec, it's %d\n",
+                                          atomic_read(pkrefs));
+                               any++;
+                       }
+               }
+               if ((dd->ipath_pkeys[i] & 0x7FFF) == lkey) {
+                       /*
+                        * It makes no sense to have both the limited and
+                        * full membership PKEY set at the same time since
+                        * the unlimited one will disable the limited one.
+                        */
+                       ret = -EEXIST;
+                       goto bail;
+               }
+       }
+       if (!any) {
+               ipath_dbg("port %u, all pkeys already in use, "
+                         "can't set %x\n", pd->port_port, key);
+               ret = -EBUSY;
+               goto bail;
+       }
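+       /*
+        * Second pass over the hardware pkey table: claim the first
+        * empty slot and rewrite the chip's 4-entry partition key
+        * register.
+        */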
+       for (any = i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
+               if (!dd->ipath_pkeys[i] &&
+                   atomic_inc_return(&dd->ipath_pkeyrefs[i]) == 1) {
+                       u64 pkey;
+
+                       /* for ipathstats, etc. */
+                       ipath_stats.sps_pkeys[i] = lkey;
+                       pd->port_pkeys[pidx] = dd->ipath_pkeys[i] = key;
+                       pkey =
+                               (u64) dd->ipath_pkeys[0] |
+                               ((u64) dd->ipath_pkeys[1] << 16) |
+                               ((u64) dd->ipath_pkeys[2] << 32) |
+                               ((u64) dd->ipath_pkeys[3] << 48);
+                       ipath_cdbg(PROC, "p%u set key %x in #%d, "
+                                  "portidx %d, new pkey reg %llx\n",
+                                  pd->port_port, key, i, pidx,
+                                  (unsigned long long) pkey);
+                       ipath_write_kreg(
+                               dd, dd->ipath_kregs->kr_partitionkey, pkey);
+
+                       ret = 0;
+                       goto bail;
+               }
+       }
+       ipath_dbg("port %u, all pkeys already in use 2nd pass, "
+                 "can't set %x\n", pd->port_port, key);
+       ret = -EBUSY;
+
+bail:
+       return ret;
+}
+
+/**
+ * ipath_manage_rcvq - manage a port's receive queue
+ * @pd: the port
+ * @subport: the subport
+ * @start_stop: action to carry out
+ *
+ * start_stop == 0 disables receive on the port, for use in queue
+ * overflow conditions.  start_stop == 1 re-enables, to be used to
+ * re-init the software copy of the head register.
+ */
+static int ipath_manage_rcvq(struct ipath_portdata *pd, unsigned subport,
+                            int start_stop)
+{
+       struct ipath_devdata *dd = pd->port_dd;
+
+       ipath_cdbg(PROC, "%sabling rcv for unit %u port %u:%u\n",
+                  start_stop ? "en" : "dis", dd->ipath_unit,
+                  pd->port_port, subport);
+       if (subport)
+               goto bail;
+       /* atomically set or clear the receive-enable bit for this port */
+       if (start_stop) {
+               /*
+                * On enable, force in-memory copy of the tail register to
+                * 0, so that protocol code doesn't have to worry about
+                * whether or not the chip has yet updated the in-memory
+                * copy or not on return from the system call. The chip
+                * always resets its tail register back to 0 on a
+                * transition from disabled to enabled.  This could cause a
+                * problem if software was broken, and did the enable w/o
+                * the disable, but eventually the in-memory copy will be
+                * updated and correct itself, even in the face of software
+                * bugs.
+                */
+               if (pd->port_rcvhdrtail_kvaddr)
+                       ipath_clear_rcvhdrtail(pd);
+               set_bit(dd->ipath_r_portenable_shift + pd->port_port,
+                       &dd->ipath_rcvctrl);
+       } else
+               clear_bit(dd->ipath_r_portenable_shift + pd->port_port,
+                         &dd->ipath_rcvctrl);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+                        dd->ipath_rcvctrl);
+       /* now be sure chip saw it before we return */
+       ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+       if (start_stop) {
+               /*
+                * And try to be sure that tail reg update has happened too.
+                * This should in theory interlock with the RXE changes to
+                * the tail register.  Don't assign it to the in-memory copy of
+                * the tail register, since we could overwrite an update by the
+                * chip if we did.
+                */
+               ipath_read_ureg32(dd, ur_rcvhdrtail, pd->port_port);
+       }
+       /* always; new head should be equal to new tail; see above */
+bail:
+       return 0;
+}
+
+static void ipath_clean_part_key(struct ipath_portdata *pd,
+                                struct ipath_devdata *dd)
+{
+       int i, j, pchanged = 0;
+       u64 oldpkey;
+
+       /* for debugging only */
+       oldpkey = (u64) dd->ipath_pkeys[0] |
+               ((u64) dd->ipath_pkeys[1] << 16) |
+               ((u64) dd->ipath_pkeys[2] << 32) |
+               ((u64) dd->ipath_pkeys[3] << 48);
+
+       for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) {
+               if (!pd->port_pkeys[i])
+                       continue;
+               ipath_cdbg(VERBOSE, "look for key[%d] %hx in pkeys\n", i,
+                          pd->port_pkeys[i]);
+               for (j = 0; j < ARRAY_SIZE(dd->ipath_pkeys); j++) {
+                       /* check for match independent of the global bit */
+                       if ((dd->ipath_pkeys[j] & 0x7fff) !=
+                           (pd->port_pkeys[i] & 0x7fff))
+                               continue;
+                       if (atomic_dec_and_test(&dd->ipath_pkeyrefs[j])) {
+                               ipath_cdbg(VERBOSE, "p%u clear key "
+                                          "%x matches #%d\n",
+                                          pd->port_port,
+                                          pd->port_pkeys[i], j);
+                               ipath_stats.sps_pkeys[j] =
+                                       dd->ipath_pkeys[j] = 0;
+                               pchanged++;
+                       } else
+                               ipath_cdbg(VERBOSE,
+                                          "p%u key %x matches #%d, "
+                                          "but ref still %d\n", pd->port_port,
+                                          pd->port_pkeys[i], j,
+                                          atomic_read(&dd->ipath_pkeyrefs[j]));
+                       break;
+               }
+               pd->port_pkeys[i] = 0;
+       }
+       if (pchanged) {
+               u64 pkey = (u64) dd->ipath_pkeys[0] |
+                       ((u64) dd->ipath_pkeys[1] << 16) |
+                       ((u64) dd->ipath_pkeys[2] << 32) |
+                       ((u64) dd->ipath_pkeys[3] << 48);
+               ipath_cdbg(VERBOSE, "p%u old pkey reg %llx, "
+                          "new pkey reg %llx\n", pd->port_port,
+                          (unsigned long long) oldpkey,
+                          (unsigned long long) pkey);
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_partitionkey,
+                                pkey);
+       }
+}
+
+/*
+ * Initialize the port data with the receive buffer sizes
+ * so this can be done while the master port is locked.
+ * Otherwise, there is a race with a slave opening the port
+ * and seeing these fields uninitialized.
+ */
+static void init_user_egr_sizes(struct ipath_portdata *pd)
+{
+       struct ipath_devdata *dd = pd->port_dd;
+       unsigned egrperchunk, egrcnt, size;
+
+       /*
+        * to avoid wasting a lot of memory, we allocate 32KB chunks of
+        * physically contiguous memory, advance through it until used up
+        * and then allocate more.  Of course, we need memory to store those
+        * extra pointers, now.  Started out with 256KB, but under heavy
+        * memory pressure (creating large files and then copying them over
+        * NFS while doing lots of MPI jobs), we hit some allocation
+        * failures, even though we can sleep...  (2.6.10) Still get
+        * failures at 64K.  32K is the lowest we can go without wasting
+        * additional memory.
+        */
+       size = 0x8000;
+       egrperchunk = size / dd->ipath_rcvegrbufsize;
+       egrcnt = dd->ipath_rcvegrcnt;
+       pd->port_rcvegrbuf_chunks = (egrcnt + egrperchunk - 1) / egrperchunk;
+       pd->port_rcvegrbufs_perchunk = egrperchunk;
+       pd->port_rcvegrbuf_size = size;
+}
+
+/**
+ * ipath_create_user_egr - allocate eager TID buffers
+ * @pd: the port to allocate TID buffers for
+ *
+ * This routine is now quite different for user and kernel, because
+ * the kernel uses skb's for the accelerated network performance.
+ * This is the user port version.
+ *
+ * Allocate the eager TID buffers and program them into infinipath.
+ * They are no longer completely contiguous; we do multiple allocation
+ * calls.
+ */
+static int ipath_create_user_egr(struct ipath_portdata *pd)
+{
+       struct ipath_devdata *dd = pd->port_dd;
+       unsigned e, egrcnt, egrperchunk, chunk, egrsize, egroff;
+       size_t size;
+       int ret;
+       gfp_t gfp_flags;
+
+       /*
+        * GFP_USER, but without GFP_FS, so buffer cache can be
+        * coalesced (we hope); otherwise, even at order 4,
+        * heavy filesystem activity makes these fail, and we can
+        * use compound pages.
+        */
+       gfp_flags = __GFP_WAIT | __GFP_IO | __GFP_COMP;
+
+       egrcnt = dd->ipath_rcvegrcnt;
+       /* TID number offset for this port */
+       egroff = (pd->port_port - 1) * egrcnt + dd->ipath_p0_rcvegrcnt;
+       egrsize = dd->ipath_rcvegrbufsize;
+       ipath_cdbg(VERBOSE, "Allocating %d egr buffers, at egrtid "
+                  "offset %x, egrsize %u\n", egrcnt, egroff, egrsize);
+
+       chunk = pd->port_rcvegrbuf_chunks;
+       egrperchunk = pd->port_rcvegrbufs_perchunk;
+       size = pd->port_rcvegrbuf_size;
+       pd->port_rcvegrbuf = kmalloc(chunk * sizeof(pd->port_rcvegrbuf[0]),
+                                    GFP_KERNEL);
+       if (!pd->port_rcvegrbuf) {
+               ret = -ENOMEM;
+               goto bail;
+       }
+       pd->port_rcvegrbuf_phys =
+               kmalloc(chunk * sizeof(pd->port_rcvegrbuf_phys[0]),
+                       GFP_KERNEL);
+       if (!pd->port_rcvegrbuf_phys) {
+               ret = -ENOMEM;
+               goto bail_rcvegrbuf;
+       }
+       for (e = 0; e < pd->port_rcvegrbuf_chunks; e++) {
+
+               pd->port_rcvegrbuf[e] = dma_alloc_coherent(
+                       &dd->pcidev->dev, size, &pd->port_rcvegrbuf_phys[e],
+                       gfp_flags);
+
+               if (!pd->port_rcvegrbuf[e]) {
+                       ret = -ENOMEM;
+                       goto bail_rcvegrbuf_phys;
+               }
+       }
+
+       pd->port_rcvegr_phys = pd->port_rcvegrbuf_phys[0];
+
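+       /*
+        * Walk each DMA-coherent chunk and program consecutive eager
+        * TID entries with the physical address of each buffer in it.
+        */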
+       for (e = chunk = 0; chunk < pd->port_rcvegrbuf_chunks; chunk++) {
+               dma_addr_t pa = pd->port_rcvegrbuf_phys[chunk];
+               unsigned i;
+
+               for (i = 0; e < egrcnt && i < egrperchunk; e++, i++) {
+                       dd->ipath_f_put_tid(dd, e + egroff +
+                                           (u64 __iomem *)
+                                           ((char __iomem *)
+                                            dd->ipath_kregbase +
+                                            dd->ipath_rcvegrbase),
+                                           RCVHQ_RCV_TYPE_EAGER, pa);
+                       pa += egrsize;
+               }
+               cond_resched(); /* don't hog the cpu */
+       }
+
+       ret = 0;
+       goto bail;
+
+bail_rcvegrbuf_phys:
+       for (e = 0; e < pd->port_rcvegrbuf_chunks &&
+               pd->port_rcvegrbuf[e]; e++) {
+               dma_free_coherent(&dd->pcidev->dev, size,
+                                 pd->port_rcvegrbuf[e],
+                                 pd->port_rcvegrbuf_phys[e]);
+
+       }
+       kfree(pd->port_rcvegrbuf_phys);
+       pd->port_rcvegrbuf_phys = NULL;
+bail_rcvegrbuf:
+       kfree(pd->port_rcvegrbuf);
+       pd->port_rcvegrbuf = NULL;
+bail:
+       return ret;
+}
+
+
+/* common code for the mappings on dma_alloc_coherent mem */
+static int ipath_mmap_mem(struct vm_area_struct *vma,
+       struct ipath_portdata *pd, unsigned len, int write_ok,
+       void *kvaddr, char *what)
+{
+       struct ipath_devdata *dd = pd->port_dd;
+       unsigned long pfn;
+       int ret;
+
+       if ((vma->vm_end - vma->vm_start) > len) {
+               dev_info(&dd->pcidev->dev,
+                        "FAIL on %s: len %lx > %x\n", what,
+                        vma->vm_end - vma->vm_start, len);
+               ret = -EFAULT;
+               goto bail;
+       }
+
+       if (!write_ok) {
+               if (vma->vm_flags & VM_WRITE) {
+                       dev_info(&dd->pcidev->dev,
+                                "%s must be mapped readonly\n", what);
+                       ret = -EPERM;
+                       goto bail;
+               }
+
+               /* don't allow them to later change with mprotect */
+               vma->vm_flags &= ~VM_MAYWRITE;
+       }
+
+       pfn = virt_to_phys(kvaddr) >> PAGE_SHIFT;
+       ret = remap_pfn_range(vma, vma->vm_start, pfn,
+                             len, vma->vm_page_prot);
+       if (ret)
+               dev_info(&dd->pcidev->dev, "%s port%u mmap of %lx, %x "
+                        "bytes r%c failed: %d\n", what, pd->port_port,
+                        pfn, len, write_ok?'w':'o', ret);
+       else
+               ipath_cdbg(VERBOSE, "%s port%u mmaped %lx, %x bytes "
+                          "r%c\n", what, pd->port_port, pfn, len,
+                          write_ok?'w':'o');
+bail:
+       return ret;
+}
+
+static int mmap_ureg(struct vm_area_struct *vma, struct ipath_devdata *dd,
+                    u64 ureg)
+{
+       unsigned long phys;
+       int ret;
+
+       /*
+        * This is real hardware, so use io_remap.  This is the mechanism
+        * for the user process to update the head registers for their port
+        * in the chip.
+        */
+       if ((vma->vm_end - vma->vm_start) > PAGE_SIZE) {
+               dev_info(&dd->pcidev->dev, "FAIL mmap userreg: reqlen "
+                        "%lx > PAGE\n", vma->vm_end - vma->vm_start);
+               ret = -EFAULT;
+       } else {
+               phys = dd->ipath_physaddr + ureg;
+               vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+               vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
+               ret = io_remap_pfn_range(vma, vma->vm_start,
+                                        phys >> PAGE_SHIFT,
+                                        vma->vm_end - vma->vm_start,
+                                        vma->vm_page_prot);
+       }
+       return ret;
+}
+
+static int mmap_piobufs(struct vm_area_struct *vma,
+                       struct ipath_devdata *dd,
+                       struct ipath_portdata *pd,
+                       unsigned piobufs, unsigned piocnt)
+{
+       unsigned long phys;
+       int ret;
+
+       /*
+        * When we map the PIO buffers in the chip, we want to map them as
+        * writeonly, no read possible.   This prevents access to previous
+        * process data, and catches users who might try to read the i/o
+        * space due to a bug.
+        */
+       if ((vma->vm_end - vma->vm_start) > (piocnt * dd->ipath_palign)) {
+               dev_info(&dd->pcidev->dev, "FAIL mmap piobufs: "
+                        "reqlen %lx > PAGE\n",
+                        vma->vm_end - vma->vm_start);
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       phys = dd->ipath_physaddr + piobufs;
+
+#if defined(__powerpc__)
+       /* There isn't a generic way to specify writethrough mappings */
+       pgprot_val(vma->vm_page_prot) |= _PAGE_NO_CACHE;
+       pgprot_val(vma->vm_page_prot) |= _PAGE_WRITETHRU;
+       pgprot_val(vma->vm_page_prot) &= ~_PAGE_GUARDED;
+#endif
+
+       /*
+        * don't allow them to later change to readable with mprotect (for when
+        * not initially mapped readable, as is normally the case)
+        */
+       vma->vm_flags &= ~VM_MAYREAD;
+       vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
+
+       ret = io_remap_pfn_range(vma, vma->vm_start, phys >> PAGE_SHIFT,
+                                vma->vm_end - vma->vm_start,
+                                vma->vm_page_prot);
+bail:
+       return ret;
+}
+
+static int mmap_rcvegrbufs(struct vm_area_struct *vma,
+                          struct ipath_portdata *pd)
+{
+       struct ipath_devdata *dd = pd->port_dd;
+       unsigned long start, size;
+       size_t total_size, i;
+       unsigned long pfn;
+       int ret;
+
+       size = pd->port_rcvegrbuf_size;
+       total_size = pd->port_rcvegrbuf_chunks * size;
+       if ((vma->vm_end - vma->vm_start) > total_size) {
+               dev_info(&dd->pcidev->dev, "FAIL on egr bufs: "
+                        "reqlen %lx > actual %lx\n",
+                        vma->vm_end - vma->vm_start,
+                        (unsigned long) total_size);
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       if (vma->vm_flags & VM_WRITE) {
+               dev_info(&dd->pcidev->dev, "Can't map eager buffers as "
+                        "writable (flags=%lx)\n", vma->vm_flags);
+               ret = -EPERM;
+               goto bail;
+       }
+       /* don't allow them to later change to writeable with mprotect */
+       vma->vm_flags &= ~VM_MAYWRITE;
+
+       start = vma->vm_start;
+
+       for (i = 0; i < pd->port_rcvegrbuf_chunks; i++, start += size) {
+               pfn = virt_to_phys(pd->port_rcvegrbuf[i]) >> PAGE_SHIFT;
+               ret = remap_pfn_range(vma, start, pfn, size,
+                                     vma->vm_page_prot);
+               if (ret < 0)
+                       goto bail;
+       }
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+/*
+ * ipath_file_vma_fault - handle a VMA page fault.
+ */
+static int ipath_file_vma_fault(struct vm_area_struct *vma,
+                                       struct vm_fault *vmf)
+{
+       struct page *page;
+
+       page = vmalloc_to_page((void *)(vmf->pgoff << PAGE_SHIFT));
+       if (!page)
+               return VM_FAULT_SIGBUS;
+       get_page(page);
+       vmf->page = page;
+
+       return 0;
+}
+
+static const struct vm_operations_struct ipath_file_vm_ops = {
+       .fault = ipath_file_vma_fault,
+};
+
+static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr,
+                      struct ipath_portdata *pd, unsigned subport)
+{
+       unsigned long len;
+       struct ipath_devdata *dd;
+       void *addr;
+       size_t size;
+       int ret = 0;
+
+       /* If the port is not shared, all addresses should be physical */
+       if (!pd->port_subport_cnt)
+               goto bail;
+
+       dd = pd->port_dd;
+       size = pd->port_rcvegrbuf_chunks * pd->port_rcvegrbuf_size;
+
+       /*
+        * Each process has all the subport uregbase, rcvhdrq, and
+        * rcvegrbufs mmapped - as an array for all the processes,
+        * and also separately for this process.
+        */
+       if (pgaddr == cvt_kvaddr(pd->subport_uregbase)) {
+               addr = pd->subport_uregbase;
+               size = PAGE_SIZE * pd->port_subport_cnt;
+       } else if (pgaddr == cvt_kvaddr(pd->subport_rcvhdr_base)) {
+               addr = pd->subport_rcvhdr_base;
+               size = pd->port_rcvhdrq_size * pd->port_subport_cnt;
+       } else if (pgaddr == cvt_kvaddr(pd->subport_rcvegrbuf)) {
+               addr = pd->subport_rcvegrbuf;
+               size *= pd->port_subport_cnt;
+       } else if (pgaddr == cvt_kvaddr(pd->subport_uregbase +
+                                       PAGE_SIZE * subport)) {
+               addr = pd->subport_uregbase + PAGE_SIZE * subport;
+               size = PAGE_SIZE;
+       } else if (pgaddr == cvt_kvaddr(pd->subport_rcvhdr_base +
+                                       pd->port_rcvhdrq_size * subport)) {
+               addr = pd->subport_rcvhdr_base +
+                       pd->port_rcvhdrq_size * subport;
+               size = pd->port_rcvhdrq_size;
+       } else if (pgaddr == cvt_kvaddr(pd->subport_rcvegrbuf +
+                                       size * subport)) {
+               addr = pd->subport_rcvegrbuf + size * subport;
+               /* rcvegrbufs are read-only on the slave */
+               if (vma->vm_flags & VM_WRITE) {
+                       dev_info(&dd->pcidev->dev,
+                                "Can't map eager buffers as "
+                                "writable (flags=%lx)\n", vma->vm_flags);
+                       ret = -EPERM;
+                       goto bail;
+               }
+               /*
+                * Don't allow permission to later change to writeable
+                * with mprotect.
+                */
+               vma->vm_flags &= ~VM_MAYWRITE;
+       } else {
+               goto bail;
+       }
+       len = vma->vm_end - vma->vm_start;
+       if (len > size) {
+               ipath_cdbg(MM, "FAIL: reqlen %lx > %zx\n", len, size);
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       vma->vm_pgoff = (unsigned long) addr >> PAGE_SHIFT;
+       vma->vm_ops = &ipath_file_vm_ops;
+       vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
+       ret = 1;
+
+bail:
+       return ret;
+}
+
+/**
+ * ipath_mmap - mmap various structures into user space
+ * @fp: the file pointer
+ * @vma: the VM area
+ *
+ * We use this to have a shared buffer between the kernel and the user code
+ * for the rcvhdr queue, egr buffers, and the per-port user regs and pio
+ * buffers in the chip.  We have the open and close entries so we can bump
+ * the ref count and keep the driver from being unloaded while still mapped.
+ */
+static int ipath_mmap(struct file *fp, struct vm_area_struct *vma)
+{
+       struct ipath_portdata *pd;
+       struct ipath_devdata *dd;
+       u64 pgaddr, ureg;
+       unsigned piobufs, piocnt;
+       int ret;
+
+       pd = port_fp(fp);
+       if (!pd) {
+               ret = -EINVAL;
+               goto bail;
+       }
+       dd = pd->port_dd;
+
+       /*
+        * This is the ipath_do_user_init() code, mapping the shared buffers
+        * into the user process. The address referred to by vm_pgoff is the
+        * file offset passed via mmap().  For shared ports, this is the
+        * kernel vmalloc() address of the pages to share with the master.
+        * For non-shared or master ports, this is a physical address.
+        * We only do one mmap for each space mapped.
+        */
+       pgaddr = vma->vm_pgoff << PAGE_SHIFT;
+
+       /*
+        * Check for 0 in case one of the allocations failed, but user
+        * called mmap anyway.
+        */
+       if (!pgaddr)  {
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       ipath_cdbg(MM, "pgaddr %llx vm_start=%lx len %lx port %u:%u:%u\n",
+                  (unsigned long long) pgaddr, vma->vm_start,
+                  vma->vm_end - vma->vm_start, dd->ipath_unit,
+                  pd->port_port, subport_fp(fp));
+
+       /*
+        * Physical addresses must fit in 40 bits for our hardware.
+        * Check for kernel virtual addresses first, anything else must
+        * match a HW or memory address.
+        */
+       ret = mmap_kvaddr(vma, pgaddr, pd, subport_fp(fp));
+       if (ret) {
+               if (ret > 0)
+                       ret = 0;
+               goto bail;
+       }
+
+       ureg = dd->ipath_uregbase + dd->ipath_ureg_align * pd->port_port;
+       if (!pd->port_subport_cnt) {
+               /* port is not shared */
+               piocnt = pd->port_piocnt;
+               piobufs = pd->port_piobufs;
+       } else if (!subport_fp(fp)) {
+               /* caller is the master */
+               piocnt = (pd->port_piocnt / pd->port_subport_cnt) +
+                        (pd->port_piocnt % pd->port_subport_cnt);
+               piobufs = pd->port_piobufs +
+                       dd->ipath_palign * (pd->port_piocnt - piocnt);
+       } else {
+               unsigned slave = subport_fp(fp) - 1;
+
+               /* caller is a slave */
+               piocnt = pd->port_piocnt / pd->port_subport_cnt;
+               piobufs = pd->port_piobufs + dd->ipath_palign * piocnt * slave;
+       }
+
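+       /*
+        * Dispatch on the requested offset: per-port user registers,
+        * PIO buffers, the pioavail shadow, eager buffers, the rcvhdr
+        * queue, or the in-memory copy of its tail register.
+        */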
+       if (pgaddr == ureg)
+               ret = mmap_ureg(vma, dd, ureg);
+       else if (pgaddr == piobufs)
+               ret = mmap_piobufs(vma, dd, pd, piobufs, piocnt);
+       else if (pgaddr == dd->ipath_pioavailregs_phys)
+               /* in-memory copy of pioavail registers */
+               ret = ipath_mmap_mem(vma, pd, PAGE_SIZE, 0,
+                                    (void *) dd->ipath_pioavailregs_dma,
+                                    "pioavail registers");
+       else if (pgaddr == pd->port_rcvegr_phys)
+               ret = mmap_rcvegrbufs(vma, pd);
+       else if (pgaddr == (u64) pd->port_rcvhdrq_phys)
+               /*
+                * The rcvhdrq itself; readonly except on HT (so have
+                * to allow writable mapping), multiple pages, contiguous
+                * from an i/o perspective.
+                */
+               ret = ipath_mmap_mem(vma, pd, pd->port_rcvhdrq_size, 1,
+                                    pd->port_rcvhdrq,
+                                    "rcvhdrq");
+       else if (pgaddr == (u64) pd->port_rcvhdrqtailaddr_phys)
+               /* in-memory copy of rcvhdrq tail register */
+               ret = ipath_mmap_mem(vma, pd, PAGE_SIZE, 0,
+                                    pd->port_rcvhdrtail_kvaddr,
+                                    "rcvhdrq tail");
+       else
+               ret = -EINVAL;
+
+       vma->vm_private_data = NULL;
+
+       if (ret < 0)
+               dev_info(&dd->pcidev->dev,
+                        "Failure %d on off %llx len %lx\n",
+                        -ret, (unsigned long long)pgaddr,
+                        vma->vm_end - vma->vm_start);
+bail:
+       return ret;
+}
+
+static unsigned ipath_poll_hdrqfull(struct ipath_portdata *pd)
+{
+       unsigned pollflag = 0;
+
+       if ((pd->poll_type & IPATH_POLL_TYPE_OVERFLOW) &&
+           pd->port_hdrqfull != pd->port_hdrqfull_poll) {
+               pollflag |= POLLIN | POLLRDNORM;
+               pd->port_hdrqfull_poll = pd->port_hdrqfull;
+       }
+
+       return pollflag;
+}
+
+static unsigned int ipath_poll_urgent(struct ipath_portdata *pd,
+                                     struct file *fp,
+                                     struct poll_table_struct *pt)
+{
+       unsigned pollflag = 0;
+       struct ipath_devdata *dd;
+
+       dd = pd->port_dd;
+
+       /* variable access in ipath_poll_hdrqfull() needs this */
+       rmb();
+       pollflag = ipath_poll_hdrqfull(pd);
+
+       if (pd->port_urgent != pd->port_urgent_poll) {
+               pollflag |= POLLIN | POLLRDNORM;
+               pd->port_urgent_poll = pd->port_urgent;
+       }
+
+       if (!pollflag) {
+               /* this saves a spin_lock/unlock in interrupt handler... */
+               set_bit(IPATH_PORT_WAITING_URG, &pd->port_flag);
+               /* flush waiting flag so don't miss an event... */
+               wmb();
+               poll_wait(fp, &pd->port_wait, pt);
+       }
+
+       return pollflag;
+}
+
+static unsigned int ipath_poll_next(struct ipath_portdata *pd,
+                                   struct file *fp,
+                                   struct poll_table_struct *pt)
+{
+       u32 head;
+       u32 tail;
+       unsigned pollflag = 0;
+       struct ipath_devdata *dd;
+
+       dd = pd->port_dd;
+
+       /* variable access in ipath_poll_hdrqfull() needs this */
+       rmb();
+       pollflag = ipath_poll_hdrqfull(pd);
+
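+       /*
+        * Compare the chip's head index with the tail (from the
+        * in-memory copy when available); a difference means a packet
+        * is waiting.
+        */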
+       head = ipath_read_ureg32(dd, ur_rcvhdrhead, pd->port_port);
+       if (pd->port_rcvhdrtail_kvaddr)
+               tail = ipath_get_rcvhdrtail(pd);
+       else
+               tail = ipath_read_ureg32(dd, ur_rcvhdrtail, pd->port_port);
+
+       if (head != tail)
+               pollflag |= POLLIN | POLLRDNORM;
+       else {
+               /* this saves a spin_lock/unlock in interrupt handler */
+               set_bit(IPATH_PORT_WAITING_RCV, &pd->port_flag);
+               /* flush waiting flag so we don't miss an event */
+               wmb();
+
+               set_bit(pd->port_port + dd->ipath_r_intravail_shift,
+                       &dd->ipath_rcvctrl);
+
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+                                dd->ipath_rcvctrl);
+
+               if (dd->ipath_rhdrhead_intr_off) /* arm rcv interrupt */
+                       ipath_write_ureg(dd, ur_rcvhdrhead,
+                                        dd->ipath_rhdrhead_intr_off | head,
+                                        pd->port_port);
+
+               poll_wait(fp, &pd->port_wait, pt);
+       }
+
+       return pollflag;
+}
+
+static unsigned int ipath_poll(struct file *fp,
+                              struct poll_table_struct *pt)
+{
+       struct ipath_portdata *pd;
+       unsigned pollflag;
+
+       pd = port_fp(fp);
+       if (!pd)
+               pollflag = 0;
+       else if (pd->poll_type & IPATH_POLL_TYPE_URGENT)
+               pollflag = ipath_poll_urgent(pd, fp, pt);
+       else
+               pollflag = ipath_poll_next(pd, fp, pt);
+
+       return pollflag;
+}
+
+static int ipath_supports_subports(int user_swmajor, int user_swminor)
+{
+       /* no subport implementation prior to software version 1.3 */
+       return (user_swmajor > 1) || (user_swminor >= 3);
+}
+
+static int ipath_compatible_subports(int user_swmajor, int user_swminor)
+{
+       /* this code is written long-hand for clarity */
+       if (IPATH_USER_SWMAJOR != user_swmajor) {
+               /* no promise of compatibility if major mismatch */
+               return 0;
+       }
+       if (IPATH_USER_SWMAJOR == 1) {
+               switch (IPATH_USER_SWMINOR) {
+               case 0:
+               case 1:
+               case 2:
+                       /* no subport implementation so cannot be compatible */
+                       return 0;
+               case 3:
+                       /* 3 is only compatible with itself */
+                       return user_swminor == 3;
+               default:
+                       /* >= 4 are compatible (or are expected to be) */
+                       return user_swminor >= 4;
+               }
+       }
+       /* make no promises yet for future major versions */
+       return 0;
+}
+
+static int init_subports(struct ipath_devdata *dd,
+                        struct ipath_portdata *pd,
+                        const struct ipath_user_info *uinfo)
+{
+       int ret = 0;
+       unsigned num_subports;
+       size_t size;
+
+       /*
+        * If the user is requesting zero subports,
+        * skip the subport allocation.
+        */
+       if (uinfo->spu_subport_cnt <= 0)
+               goto bail;
+
+       /* Self-consistency check for ipath_compatible_subports() */
+       if (ipath_supports_subports(IPATH_USER_SWMAJOR, IPATH_USER_SWMINOR) &&
+           !ipath_compatible_subports(IPATH_USER_SWMAJOR,
+                                      IPATH_USER_SWMINOR)) {
+               dev_info(&dd->pcidev->dev,
+                        "Inconsistent ipath_compatible_subports()\n");
+               goto bail;
+       }
+
+       /* Check for subport compatibility */
+       if (!ipath_compatible_subports(uinfo->spu_userversion >> 16,
+                                      uinfo->spu_userversion & 0xffff)) {
+               dev_info(&dd->pcidev->dev,
+                        "Mismatched user version (%d.%d) and driver "
+                        "version (%d.%d) while port sharing. Ensure "
+                         "that driver and library are from the same "
+                         "release.\n",
+                        (int) (uinfo->spu_userversion >> 16),
+                         (int) (uinfo->spu_userversion & 0xffff),
+                        IPATH_USER_SWMAJOR,
+                        IPATH_USER_SWMINOR);
+               goto bail;
+       }
+       if (uinfo->spu_subport_cnt > INFINIPATH_MAX_SUBPORT) {
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       num_subports = uinfo->spu_subport_cnt;
+       pd->subport_uregbase = vzalloc(PAGE_SIZE * num_subports);
+       if (!pd->subport_uregbase) {
+               ret = -ENOMEM;
+               goto bail;
+       }
+       /* Note: pd->port_rcvhdrq_size isn't initialized yet. */
+       size = ALIGN(dd->ipath_rcvhdrcnt * dd->ipath_rcvhdrentsize *
+                    sizeof(u32), PAGE_SIZE) * num_subports;
+       pd->subport_rcvhdr_base = vzalloc(size);
+       if (!pd->subport_rcvhdr_base) {
+               ret = -ENOMEM;
+               goto bail_ureg;
+       }
+
+       pd->subport_rcvegrbuf = vzalloc(pd->port_rcvegrbuf_chunks *
+                                       pd->port_rcvegrbuf_size *
+                                       num_subports);
+       if (!pd->subport_rcvegrbuf) {
+               ret = -ENOMEM;
+               goto bail_rhdr;
+       }
+
+       pd->port_subport_cnt = uinfo->spu_subport_cnt;
+       pd->port_subport_id = uinfo->spu_subport_id;
+       pd->active_slaves = 1;
+       set_bit(IPATH_PORT_MASTER_UNINIT, &pd->port_flag);
+       goto bail;
+
+bail_rhdr:
+       vfree(pd->subport_rcvhdr_base);
+bail_ureg:
+       vfree(pd->subport_uregbase);
+       pd->subport_uregbase = NULL;
+bail:
+       return ret;
+}
+
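+/*
+ * Allocate (or re-use) the portdata for the given unit:port and claim it
+ * for the current process; returns -EBUSY if the port is already open.
+ */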
+static int try_alloc_port(struct ipath_devdata *dd, int port,
+                         struct file *fp,
+                         const struct ipath_user_info *uinfo)
+{
+       struct ipath_portdata *pd;
+       int ret;
+
+       if (!(pd = dd->ipath_pd[port])) {
+               void *ptmp;
+
+               pd = kzalloc(sizeof(struct ipath_portdata), GFP_KERNEL);
+
+               /*
+                * Allocate memory for use in ipath_tid_update() just once
+                * at open, not per call.  Reduces cost of expected send
+                * setup.
+                */
+               ptmp = kmalloc(dd->ipath_rcvtidcnt * sizeof(u16) +
+                              dd->ipath_rcvtidcnt * sizeof(struct page **),
+                              GFP_KERNEL);
+               if (!pd || !ptmp) {
+                       ipath_dev_err(dd, "Unable to allocate portdata "
+                                     "memory, failing open\n");
+                       ret = -ENOMEM;
+                       kfree(pd);
+                       kfree(ptmp);
+                       goto bail;
+               }
+               dd->ipath_pd[port] = pd;
+               dd->ipath_pd[port]->port_port = port;
+               dd->ipath_pd[port]->port_dd = dd;
+               dd->ipath_pd[port]->port_tid_pg_list = ptmp;
+               init_waitqueue_head(&dd->ipath_pd[port]->port_wait);
+       }
+       if (!pd->port_cnt) {
+               pd->userversion = uinfo->spu_userversion;
+               init_user_egr_sizes(pd);
+               if ((ret = init_subports(dd, pd, uinfo)) != 0)
+                       goto bail;
+               ipath_cdbg(PROC, "%s[%u] opened unit:port %u:%u\n",
+                          current->comm, current->pid, dd->ipath_unit,
+                          port);
+               pd->port_cnt = 1;
+               port_fp(fp) = pd;
+               pd->port_pid = get_pid(task_pid(current));
+               strlcpy(pd->port_comm, current->comm, sizeof(pd->port_comm));
+               ipath_stats.sps_ports++;
+               ret = 0;
+       } else
+               ret = -EBUSY;
+
+bail:
+       return ret;
+}
+
+static inline int usable(struct ipath_devdata *dd)
+{
+       return dd &&
+               (dd->ipath_flags & IPATH_PRESENT) &&
+               dd->ipath_kregbase &&
+               dd->ipath_lid &&
+               !(dd->ipath_flags & (IPATH_LINKDOWN | IPATH_DISABLED
+                                    | IPATH_LINKUNK));
+}
+
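+/* Find the first free user port on the requested unit. */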
+static int find_free_port(int unit, struct file *fp,
+                         const struct ipath_user_info *uinfo)
+{
+       struct ipath_devdata *dd = ipath_lookup(unit);
+       int ret, i;
+
+       if (!dd) {
+               ret = -ENODEV;
+               goto bail;
+       }
+
+       if (!usable(dd)) {
+               ret = -ENETDOWN;
+               goto bail;
+       }
+
+       for (i = 1; i < dd->ipath_cfgports; i++) {
+               ret = try_alloc_port(dd, i, fp, uinfo);
+               if (ret != -EBUSY)
+                       goto bail;
+       }
+       ret = -EBUSY;
+
+bail:
+       return ret;
+}
+
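+/*
+ * No specific unit was requested: pick a unit, optionally biased by the
+ * caller's CPU affinity, and allocate the first free port on it.
+ */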
+static int find_best_unit(struct file *fp,
+                         const struct ipath_user_info *uinfo)
+{
+       int ret = 0, i, prefunit = -1, devmax;
+       int maxofallports, npresent, nup;
+       int ndev;
+
+       devmax = ipath_count_units(&npresent, &nup, &maxofallports);
+
+       /*
+        * This code is present to allow a knowledgeable person to
+        * specify the layout of processes to processors before opening
+        * this driver, and then we'll assign the process to the "closest"
+        * InfiniPath chip to that processor (we assume reasonable connectivity,
+        * for now).  This code assumes that if affinity has been set
+        * before this point, at most one cpu is set; for now this
+        * is reasonable.  We check for both cpumask_empty() and cpumask_full(),
+        * in case some kernel variant sets none of the bits when no
+        * affinity is set.  2.6.11 and 12 kernels have all present
+        * cpus set.  Some day we'll have to fix it up further to handle
+        * a cpu subset.  This algorithm fails for two HT chips connected
+        * in tunnel fashion.  Eventually this needs real topology
+        * information.  There may be some issues with dual core numbering
+        * as well.  This needs more work prior to release.
+        */
+       if (!cpumask_empty(tsk_cpus_allowed(current)) &&
+           !cpumask_full(tsk_cpus_allowed(current))) {
+               int ncpus = num_online_cpus(), curcpu = -1, nset = 0;
+               get_online_cpus();
+               for_each_online_cpu(i)
+                       if (cpumask_test_cpu(i, tsk_cpus_allowed(current))) {
+                               ipath_cdbg(PROC, "%s[%u] affinity set for "
+                                          "cpu %d/%d\n", current->comm,
+                                          current->pid, i, ncpus);
+                               curcpu = i;
+                               nset++;
+                       }
+               put_online_cpus();
+               if (curcpu != -1 && nset != ncpus) {
+                       if (npresent) {
+                               prefunit = curcpu / (ncpus / npresent);
+                               ipath_cdbg(PROC,"%s[%u] %d chips, %d cpus, "
+                                         "%d cpus/chip, select unit %d\n",
+                                         current->comm, current->pid,
+                                         npresent, ncpus, ncpus / npresent,
+                                         prefunit);
+                       }
+               }
+       }
+
+       /*
+        * User ports start at 1, kernel port is 0.
+        * For now, we do round-robin access across all chips.
+        */
+
+       if (prefunit != -1)
+               devmax = prefunit + 1;
+recheck:
+       for (i = 1; i < maxofallports; i++) {
+               for (ndev = prefunit != -1 ? prefunit : 0; ndev < devmax;
+                    ndev++) {
+                       struct ipath_devdata *dd = ipath_lookup(ndev);
+
+                       if (!usable(dd))
+                               continue; /* can't use this unit */
+                       if (i >= dd->ipath_cfgports)
+                               /*
+                                * Maxed out on users of this unit. Try
+                                * next.
+                                */
+                               continue;
+                       ret = try_alloc_port(dd, i, fp, uinfo);
+                       if (!ret)
+                               goto done;
+               }
+       }
+
+       if (npresent) {
+               if (nup == 0) {
+                       ret = -ENETDOWN;
+                       ipath_dbg("No ports available (none initialized "
+                                 "and ready)\n");
+               } else {
+                       if (prefunit > 0) {
+                               /* if started above 0, retry from 0 */
+                               ipath_cdbg(PROC,
+                                          "%s[%u] no ports on prefunit "
+                                          "%d, clear and re-check\n",
+                                          current->comm, current->pid,
+                                          prefunit);
+                               devmax = ipath_count_units(NULL, NULL,
+                                                          NULL);
+                               prefunit = -1;
+                               goto recheck;
+                       }
+                       ret = -EBUSY;
+                       ipath_dbg("No ports available\n");
+               }
+       } else {
+               ret = -ENXIO;
+               ipath_dbg("No boards found\n");
+       }
+
+done:
+       return ret;
+}
+
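+/*
+ * Look for an already-open master port with a matching subport id that
+ * this process can attach to as a slave subport.
+ */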
+static int find_shared_port(struct file *fp,
+                           const struct ipath_user_info *uinfo)
+{
+       int devmax, ndev, i;
+       int ret = 0;
+
+       devmax = ipath_count_units(NULL, NULL, NULL);
+
+       for (ndev = 0; ndev < devmax; ndev++) {
+               struct ipath_devdata *dd = ipath_lookup(ndev);
+
+               if (!usable(dd))
+                       continue;
+               for (i = 1; i < dd->ipath_cfgports; i++) {
+                       struct ipath_portdata *pd = dd->ipath_pd[i];
+
+                       /* Skip ports which are not yet open */
+                       if (!pd || !pd->port_cnt)
+                               continue;
+                       /* Skip port if it doesn't match the requested one */
+                       if (pd->port_subport_id != uinfo->spu_subport_id)
+                               continue;
+                       /* Verify the sharing process matches the master */
+                       if (pd->port_subport_cnt != uinfo->spu_subport_cnt ||
+                           pd->userversion != uinfo->spu_userversion ||
+                           pd->port_cnt >= pd->port_subport_cnt) {
+                               ret = -EINVAL;
+                               goto done;
+                       }
+                       port_fp(fp) = pd;
+                       subport_fp(fp) = pd->port_cnt++;
+                       pd->port_subpid[subport_fp(fp)] =
+                               get_pid(task_pid(current));
+                       tidcursor_fp(fp) = 0;
+                       pd->active_slaves |= 1 << subport_fp(fp);
+                       ipath_cdbg(PROC,
+                                  "%s[%u] %u sharing %s[%u] unit:port %u:%u\n",
+                                  current->comm, current->pid,
+                                  subport_fp(fp),
+                                  pd->port_comm, pid_nr(pd->port_pid),
+                                  dd->ipath_unit, pd->port_port);
+                       ret = 1;
+                       goto done;
+               }
+       }
+
+done:
+       return ret;
+}
+
+static int ipath_open(struct inode *in, struct file *fp)
+{
+       /* The real work is performed later in ipath_assign_port() */
+       fp->private_data = kzalloc(sizeof(struct ipath_filedata), GFP_KERNEL);
+       return fp->private_data ? 0 : -ENOMEM;
+}
+
+/* Get port early, so we can set affinity prior to memory allocation */
+static int ipath_assign_port(struct file *fp,
+                             const struct ipath_user_info *uinfo)
+{
+       int ret;
+       int i_minor;
+       unsigned swmajor, swminor;
+
+       /* Check to be sure we haven't already initialized this file */
+       if (port_fp(fp)) {
+               ret = -EINVAL;
+               goto done;
+       }
+
+       /* for now, if major version is different, bail */
+       swmajor = uinfo->spu_userversion >> 16;
+       if (swmajor != IPATH_USER_SWMAJOR) {
+               ipath_dbg("User major version %d not same as driver "
+                         "major %d\n", uinfo->spu_userversion >> 16,
+                         IPATH_USER_SWMAJOR);
+               ret = -ENODEV;
+               goto done;
+       }
+
+       swminor = uinfo->spu_userversion & 0xffff;
+       if (swminor != IPATH_USER_SWMINOR)
+               ipath_dbg("User minor version %d not same as driver "
+                         "minor %d\n", swminor, IPATH_USER_SWMINOR);
+
+       mutex_lock(&ipath_mutex);
+
+       if (ipath_compatible_subports(swmajor, swminor) &&
+           uinfo->spu_subport_cnt &&
+           (ret = find_shared_port(fp, uinfo))) {
+               if (ret > 0)
+                       ret = 0;
+               goto done_chk_sdma;
+       }
+
+       i_minor = iminor(file_inode(fp)) - IPATH_USER_MINOR_BASE;
+       ipath_cdbg(VERBOSE, "open on dev %lx (minor %d)\n",
+                  (long)file_inode(fp)->i_rdev, i_minor);
+
+       if (i_minor)
+               ret = find_free_port(i_minor - 1, fp, uinfo);
+       else
+               ret = find_best_unit(fp, uinfo);
+
+done_chk_sdma:
+       if (!ret) {
+               struct ipath_filedata *fd = fp->private_data;
+               const struct ipath_portdata *pd = fd->pd;
+               const struct ipath_devdata *dd = pd->port_dd;
+
+               fd->pq = ipath_user_sdma_queue_create(&dd->pcidev->dev,
+                                                     dd->ipath_unit,
+                                                     pd->port_port,
+                                                     fd->subport);
+
+               if (!fd->pq)
+                       ret = -ENOMEM;
+       }
+
+       mutex_unlock(&ipath_mutex);
+
+done:
+       return ret;
+}
+
+
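+/*
+ * Second stage of open: carve out the port's PIO buffers, allocate its
+ * receive header queue and eager buffers, then enable receive on the
+ * port.  Slave subports just wait for the master to finish this.
+ */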
+static int ipath_do_user_init(struct file *fp,
+                             const struct ipath_user_info *uinfo)
+{
+       int ret;
+       struct ipath_portdata *pd = port_fp(fp);
+       struct ipath_devdata *dd;
+       u32 head32;
+
+       /* Subports don't need to initialize anything since the master did it. */
+       if (subport_fp(fp)) {
+               ret = wait_event_interruptible(pd->port_wait,
+                       !test_bit(IPATH_PORT_MASTER_UNINIT, &pd->port_flag));
+               goto done;
+       }
+
+       dd = pd->port_dd;
+
+       if (uinfo->spu_rcvhdrsize) {
+               ret = ipath_setrcvhdrsize(dd, uinfo->spu_rcvhdrsize);
+               if (ret)
+                       goto done;
+       }
+
+       /* for now we do nothing with rcvhdrcnt: uinfo->spu_rcvhdrcnt */
+
+       /* some ports may get extra buffers, calculate that here */
+       if (pd->port_port <= dd->ipath_ports_extrabuf)
+               pd->port_piocnt = dd->ipath_pbufsport + 1;
+       else
+               pd->port_piocnt = dd->ipath_pbufsport;
+
+       /* for right now, kernel piobufs are at end, so port 1 is at 0 */
+       if (pd->port_port <= dd->ipath_ports_extrabuf)
+               pd->port_pio_base = (dd->ipath_pbufsport + 1)
+                       * (pd->port_port - 1);
+       else
+               pd->port_pio_base = dd->ipath_ports_extrabuf +
+                       dd->ipath_pbufsport * (pd->port_port - 1);
+       pd->port_piobufs = dd->ipath_piobufbase +
+               pd->port_pio_base * dd->ipath_palign;
+       ipath_cdbg(VERBOSE, "piobuf base for port %u is 0x%x, piocnt %u,"
+               " first pio %u\n", pd->port_port, pd->port_piobufs,
+               pd->port_piocnt, pd->port_pio_base);
+       ipath_chg_pioavailkernel(dd, pd->port_pio_base, pd->port_piocnt, 0);
+
+       /*
+        * Now allocate the rcvhdr Q and eager TIDs; skip the TID
+        * array for the time being.  If pd->port_port > chip-supported,
+        * we will someday need extra logic here to handle overflow
+        * through port 0.
+        */
+       ret = ipath_create_rcvhdrq(dd, pd);
+       if (!ret)
+               ret = ipath_create_user_egr(pd);
+       if (ret)
+               goto done;
+
+       /*
+        * set the eager head register for this port to the current values
+        * of the tail pointers, since we don't know if they were
+        * updated on last use of the port.
+        */
+       head32 = ipath_read_ureg32(dd, ur_rcvegrindextail, pd->port_port);
+       ipath_write_ureg(dd, ur_rcvegrindexhead, head32, pd->port_port);
+       pd->port_lastrcvhdrqtail = -1;
+       ipath_cdbg(VERBOSE, "Wrote port%d egrhead %x from tail regs\n",
+               pd->port_port, head32);
+       pd->port_tidcursor = 0; /* start at beginning after open */
+
+       /* initialize poll variables... */
+       pd->port_urgent = 0;
+       pd->port_urgent_poll = 0;
+       pd->port_hdrqfull_poll = pd->port_hdrqfull;
+
+       /*
+        * Now enable the port for receive.
+        * Chips that are set to DMA the tail register to memory do so
+        * when it changes (and when the update bit transitions from 0
+        * to 1), so for those chips we turn it off and then back on.
+        * This will (very briefly) affect any other open ports, but the
+        * duration is very short, and therefore isn't an issue.  We
+        * explicitly set the in-memory tail copy to 0 beforehand, so we
+        * don't have to wait to be sure the DMA update has happened
+        * (chip resets head/tail to 0 on transition to enable).
+        */
+       set_bit(dd->ipath_r_portenable_shift + pd->port_port,
+               &dd->ipath_rcvctrl);
+       if (!(dd->ipath_flags & IPATH_NODMA_RTAIL)) {
+               if (pd->port_rcvhdrtail_kvaddr)
+                       ipath_clear_rcvhdrtail(pd);
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+                       dd->ipath_rcvctrl &
+                       ~(1ULL << dd->ipath_r_tailupd_shift));
+       }
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+                        dd->ipath_rcvctrl);
+       /* Notify any waiting slaves */
+       if (pd->port_subport_cnt) {
+               clear_bit(IPATH_PORT_MASTER_UNINIT, &pd->port_flag);
+               wake_up(&pd->port_wait);
+       }
+done:
+       return ret;
+}
+
+/**
+ * unlock_expected_tids - unlock any expected TID entries the port still had in use
+ * @pd: port
+ *
+ * We don't actually update the chip here, because we do a bulk update
+ * below, using ipath_f_clear_tids.
+ */
+static void unlock_expected_tids(struct ipath_portdata *pd)
+{
+       struct ipath_devdata *dd = pd->port_dd;
+       int port_tidbase = pd->port_port * dd->ipath_rcvtidcnt;
+       int i, cnt = 0, maxtid = port_tidbase + dd->ipath_rcvtidcnt;
+
+       ipath_cdbg(VERBOSE, "Port %u unlocking any locked expTID pages\n",
+                  pd->port_port);
+       for (i = port_tidbase; i < maxtid; i++) {
+               struct page *ps = dd->ipath_pageshadow[i];
+
+               if (!ps)
+                       continue;
+
+               dd->ipath_pageshadow[i] = NULL;
+               pci_unmap_page(dd->pcidev, dd->ipath_physshadow[i],
+                       PAGE_SIZE, PCI_DMA_FROMDEVICE);
+               ipath_release_user_pages_on_close(&ps, 1);
+               cnt++;
+               ipath_stats.sps_pageunlocks++;
+       }
+       if (cnt)
+               ipath_cdbg(VERBOSE, "Port %u unlocked %u expTID entries\n",
+                          pd->port_port, cnt);
+
+       if (ipath_stats.sps_pagelocks || ipath_stats.sps_pageunlocks)
+               ipath_cdbg(VERBOSE, "%llu pages locked, %llu unlocked\n",
+                          (unsigned long long) ipath_stats.sps_pagelocks,
+                          (unsigned long long)
+                          ipath_stats.sps_pageunlocks);
+}
+
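+/*
+ * Last close of a port: drain and destroy the user SDMA queue, disable
+ * receive on the port, disarm its PIO buffers, unlock any expected TID
+ * pages, and free the portdata.  A slave subport close only detaches
+ * from the master.
+ */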
+static int ipath_close(struct inode *in, struct file *fp)
+{
+       int ret = 0;
+       struct ipath_filedata *fd;
+       struct ipath_portdata *pd;
+       struct ipath_devdata *dd;
+       unsigned long flags;
+       unsigned port;
+       struct pid *pid;
+
+       ipath_cdbg(VERBOSE, "close on dev %lx, private data %p\n",
+                  (long)in->i_rdev, fp->private_data);
+
+       mutex_lock(&ipath_mutex);
+
+       fd = fp->private_data;
+       fp->private_data = NULL;
+       pd = fd->pd;
+       if (!pd) {
+               mutex_unlock(&ipath_mutex);
+               goto bail;
+       }
+
+       dd = pd->port_dd;
+
+       /* drain user sdma queue */
+       ipath_user_sdma_queue_drain(dd, fd->pq);
+       ipath_user_sdma_queue_destroy(fd->pq);
+
+       if (--pd->port_cnt) {
+               /*
+                * XXX If the master closes the port before the slave(s),
+                * revoke the mmap for the eager receive queue so
+                * the slave(s) don't wait for receive data forever.
+                */
+               pd->active_slaves &= ~(1 << fd->subport);
+               put_pid(pd->port_subpid[fd->subport]);
+               pd->port_subpid[fd->subport] = NULL;
+               mutex_unlock(&ipath_mutex);
+               goto bail;
+       }
+       /* early; no interrupt users after this */
+       spin_lock_irqsave(&dd->ipath_uctxt_lock, flags);
+       port = pd->port_port;
+       dd->ipath_pd[port] = NULL;
+       pid = pd->port_pid;
+       pd->port_pid = NULL;
+       spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
+
+       if (pd->port_rcvwait_to || pd->port_piowait_to
+           || pd->port_rcvnowait || pd->port_pionowait) {
+               ipath_cdbg(VERBOSE, "port%u: %u rcv, %u pio wait timeouts; "
+                          "%u rcv, %u pio nowait\n",
+                          pd->port_port, pd->port_rcvwait_to,
+                          pd->port_piowait_to, pd->port_rcvnowait,
+                          pd->port_pionowait);
+               pd->port_rcvwait_to = pd->port_piowait_to =
+                       pd->port_rcvnowait = pd->port_pionowait = 0;
+       }
+       if (pd->port_flag) {
+               ipath_cdbg(PROC, "port %u port_flag set: 0x%lx\n",
+                         pd->port_port, pd->port_flag);
+               pd->port_flag = 0;
+       }
+
+       if (dd->ipath_kregbase) {
+               /* atomically clear receive enable port and intr avail. */
+               clear_bit(dd->ipath_r_portenable_shift + port,
+                         &dd->ipath_rcvctrl);
+               clear_bit(pd->port_port + dd->ipath_r_intravail_shift,
+                         &dd->ipath_rcvctrl);
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+                       dd->ipath_rcvctrl);
+               /* and read back from chip to be sure that nothing
+                * else is in flight when we do the rest */
+               (void)ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+
+               /* clean up the pkeys for this port user */
+               ipath_clean_part_key(pd, dd);
+               /*
+                * be paranoid, and never write 0's to these, just use an
+                * unused part of the port 0 tail page.  Of course,
+                * rcvhdraddr points to a large chunk of memory, so this
+                * could still trash things, but at least it won't trash
+                * page 0, and by disabling the port, it should stop "soon",
+                * even if a packet or two is already in flight after we
+                * disabled the port.
+                */
+               ipath_write_kreg_port(dd,
+                       dd->ipath_kregs->kr_rcvhdrtailaddr, port,
+                       dd->ipath_dummy_hdrq_phys);
+               ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdraddr,
+                       pd->port_port, dd->ipath_dummy_hdrq_phys);
+
+               ipath_disarm_piobufs(dd, pd->port_pio_base, pd->port_piocnt);
+               ipath_chg_pioavailkernel(dd, pd->port_pio_base,
+                       pd->port_piocnt, 1);
+
+               dd->ipath_f_clear_tids(dd, pd->port_port);
+
+               if (dd->ipath_pageshadow)
+                       unlock_expected_tids(pd);
+               ipath_stats.sps_ports--;
+               ipath_cdbg(PROC, "%s[%u] closed port %u:%u\n",
+                          pd->port_comm, pid_nr(pid),
+                          dd->ipath_unit, port);
+       }
+
+       put_pid(pid);
+       mutex_unlock(&ipath_mutex);
+       ipath_free_pddata(dd, pd); /* after releasing the mutex */
+
+bail:
+       kfree(fd);
+       return ret;
+}
+
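+/* Copy unit, port and subport information out to the user library. */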
+static int ipath_port_info(struct ipath_portdata *pd, u16 subport,
+                          struct ipath_port_info __user *uinfo)
+{
+       struct ipath_port_info info;
+       int nup;
+       int ret;
+       size_t sz;
+
+       (void) ipath_count_units(NULL, &nup, NULL);
+       info.num_active = nup;
+       info.unit = pd->port_dd->ipath_unit;
+       info.port = pd->port_port;
+       info.subport = subport;
+       /* Don't return new fields if old library opened the port. */
+       if (ipath_supports_subports(pd->userversion >> 16,
+                                   pd->userversion & 0xffff)) {
+               /* Number of user ports available for this device. */
+               info.num_ports = pd->port_dd->ipath_cfgports - 1;
+               info.num_subports = pd->port_subport_cnt;
+               sz = sizeof(info);
+       } else
+               sz = sizeof(info) - 2 * sizeof(u16);
+
+       if (copy_to_user(uinfo, &info, sz)) {
+               ret = -EFAULT;
+               goto bail;
+       }
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+static int ipath_get_slave_info(struct ipath_portdata *pd,
+                               void __user *slave_mask_addr)
+{
+       int ret = 0;
+
+       if (copy_to_user(slave_mask_addr, &pd->active_slaves, sizeof(u32)))
+               ret = -EFAULT;
+       return ret;
+}
+
+static int ipath_sdma_get_inflight(struct ipath_user_sdma_queue *pq,
+                                  u32 __user *inflightp)
+{
+       const u32 val = ipath_user_sdma_inflight_counter(pq);
+
+       if (put_user(val, inflightp))
+               return -EFAULT;
+
+       return 0;
+}
+
+static int ipath_sdma_get_complete(struct ipath_devdata *dd,
+                                  struct ipath_user_sdma_queue *pq,
+                                  u32 __user *completep)
+{
+       u32 val;
+       int err;
+
+       err = ipath_user_sdma_make_progress(dd, pq);
+       if (err < 0)
+               return err;
+
+       val = ipath_user_sdma_complete_counter(pq);
+       if (put_user(val, completep))
+               return -EFAULT;
+
+       return 0;
+}
+
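+/*
+ * write() is the command entry point for the user library: the first
+ * bytes select an ipath_cmd type, the remainder carry the per-command
+ * payload, which is dispatched to the matching handler below.
+ */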
+static ssize_t ipath_write(struct file *fp, const char __user *data,
+                          size_t count, loff_t *off)
+{
+       const struct ipath_cmd __user *ucmd;
+       struct ipath_portdata *pd;
+       const void __user *src;
+       size_t consumed, copy;
+       struct ipath_cmd cmd;
+       ssize_t ret = 0;
+       void *dest;
+
+       if (count < sizeof(cmd.type)) {
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       ucmd = (const struct ipath_cmd __user *) data;
+
+       if (copy_from_user(&cmd.type, &ucmd->type, sizeof(cmd.type))) {
+               ret = -EFAULT;
+               goto bail;
+       }
+
+       consumed = sizeof(cmd.type);
+
+       switch (cmd.type) {
+       case IPATH_CMD_ASSIGN_PORT:
+       case __IPATH_CMD_USER_INIT:
+       case IPATH_CMD_USER_INIT:
+               copy = sizeof(cmd.cmd.user_info);
+               dest = &cmd.cmd.user_info;
+               src = &ucmd->cmd.user_info;
+               break;
+       case IPATH_CMD_RECV_CTRL:
+               copy = sizeof(cmd.cmd.recv_ctrl);
+               dest = &cmd.cmd.recv_ctrl;
+               src = &ucmd->cmd.recv_ctrl;
+               break;
+       case IPATH_CMD_PORT_INFO:
+               copy = sizeof(cmd.cmd.port_info);
+               dest = &cmd.cmd.port_info;
+               src = &ucmd->cmd.port_info;
+               break;
+       case IPATH_CMD_TID_UPDATE:
+       case IPATH_CMD_TID_FREE:
+               copy = sizeof(cmd.cmd.tid_info);
+               dest = &cmd.cmd.tid_info;
+               src = &ucmd->cmd.tid_info;
+               break;
+       case IPATH_CMD_SET_PART_KEY:
+               copy = sizeof(cmd.cmd.part_key);
+               dest = &cmd.cmd.part_key;
+               src = &ucmd->cmd.part_key;
+               break;
+       case __IPATH_CMD_SLAVE_INFO:
+               copy = sizeof(cmd.cmd.slave_mask_addr);
+               dest = &cmd.cmd.slave_mask_addr;
+               src = &ucmd->cmd.slave_mask_addr;
+               break;
+       case IPATH_CMD_PIOAVAILUPD:     /* force an update of PIOAvail reg */
+               copy = 0;
+               src = NULL;
+               dest = NULL;
+               break;
+       case IPATH_CMD_POLL_TYPE:
+               copy = sizeof(cmd.cmd.poll_type);
+               dest = &cmd.cmd.poll_type;
+               src = &ucmd->cmd.poll_type;
+               break;
+       case IPATH_CMD_ARMLAUNCH_CTRL:
+               copy = sizeof(cmd.cmd.armlaunch_ctrl);
+               dest = &cmd.cmd.armlaunch_ctrl;
+               src = &ucmd->cmd.armlaunch_ctrl;
+               break;
+       case IPATH_CMD_SDMA_INFLIGHT:
+               copy = sizeof(cmd.cmd.sdma_inflight);
+               dest = &cmd.cmd.sdma_inflight;
+               src = &ucmd->cmd.sdma_inflight;
+               break;
+       case IPATH_CMD_SDMA_COMPLETE:
+               copy = sizeof(cmd.cmd.sdma_complete);
+               dest = &cmd.cmd.sdma_complete;
+               src = &ucmd->cmd.sdma_complete;
+               break;
+       default:
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       if (copy) {
+               if ((count - consumed) < copy) {
+                       ret = -EINVAL;
+                       goto bail;
+               }
+
+               if (copy_from_user(dest, src, copy)) {
+                       ret = -EFAULT;
+                       goto bail;
+               }
+
+               consumed += copy;
+       }
+
+       pd = port_fp(fp);
+       if (!pd && cmd.type != __IPATH_CMD_USER_INIT &&
+               cmd.type != IPATH_CMD_ASSIGN_PORT) {
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       switch (cmd.type) {
+       case IPATH_CMD_ASSIGN_PORT:
+               ret = ipath_assign_port(fp, &cmd.cmd.user_info);
+               if (ret)
+                       goto bail;
+               break;
+       case __IPATH_CMD_USER_INIT:
+               /* backwards compatibility, get port first */
+               ret = ipath_assign_port(fp, &cmd.cmd.user_info);
+               if (ret)
+                       goto bail;
+               /* and fall through to current version. */
+       case IPATH_CMD_USER_INIT:
+               ret = ipath_do_user_init(fp, &cmd.cmd.user_info);
+               if (ret)
+                       goto bail;
+               ret = ipath_get_base_info(
+                       fp, (void __user *) (unsigned long)
+                       cmd.cmd.user_info.spu_base_info,
+                       cmd.cmd.user_info.spu_base_info_size);
+               break;
+       case IPATH_CMD_RECV_CTRL:
+               ret = ipath_manage_rcvq(pd, subport_fp(fp), cmd.cmd.recv_ctrl);
+               break;
+       case IPATH_CMD_PORT_INFO:
+               ret = ipath_port_info(pd, subport_fp(fp),
+                                     (struct ipath_port_info __user *)
+                                     (unsigned long) cmd.cmd.port_info);
+               break;
+       case IPATH_CMD_TID_UPDATE:
+               ret = ipath_tid_update(pd, fp, &cmd.cmd.tid_info);
+               break;
+       case IPATH_CMD_TID_FREE:
+               ret = ipath_tid_free(pd, subport_fp(fp), &cmd.cmd.tid_info);
+               break;
+       case IPATH_CMD_SET_PART_KEY:
+               ret = ipath_set_part_key(pd, cmd.cmd.part_key);
+               break;
+       case __IPATH_CMD_SLAVE_INFO:
+               ret = ipath_get_slave_info(pd,
+                                          (void __user *) (unsigned long)
+                                          cmd.cmd.slave_mask_addr);
+               break;
+       case IPATH_CMD_PIOAVAILUPD:
+               ipath_force_pio_avail_update(pd->port_dd);
+               break;
+       case IPATH_CMD_POLL_TYPE:
+               pd->poll_type = cmd.cmd.poll_type;
+               break;
+       case IPATH_CMD_ARMLAUNCH_CTRL:
+               if (cmd.cmd.armlaunch_ctrl)
+                       ipath_enable_armlaunch(pd->port_dd);
+               else
+                       ipath_disable_armlaunch(pd->port_dd);
+               break;
+       case IPATH_CMD_SDMA_INFLIGHT:
+               ret = ipath_sdma_get_inflight(user_sdma_queue_fp(fp),
+                                             (u32 __user *) (unsigned long)
+                                             cmd.cmd.sdma_inflight);
+               break;
+       case IPATH_CMD_SDMA_COMPLETE:
+               ret = ipath_sdma_get_complete(pd->port_dd,
+                                             user_sdma_queue_fp(fp),
+                                             (u32 __user *) (unsigned long)
+                                             cmd.cmd.sdma_complete);
+               break;
+       }
+
+       if (ret >= 0)
+               ret = consumed;
+
+bail:
+       return ret;
+}
+
+static ssize_t ipath_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+       struct file *filp = iocb->ki_filp;
+       struct ipath_filedata *fp = filp->private_data;
+       struct ipath_portdata *pd = port_fp(filp);
+       struct ipath_user_sdma_queue *pq = fp->pq;
+
+       if (!iter_is_iovec(from) || !from->nr_segs)
+               return -EINVAL;
+
+       return ipath_user_sdma_writev(pd->port_dd, pq, from->iov, from->nr_segs);
+}
+
+static struct class *ipath_class;
+
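+/*
+ * Allocate and register a character device for the given minor, and
+ * create the corresponding device node via the driver's class.
+ */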
+static int init_cdev(int minor, char *name, const struct file_operations *fops,
+                    struct cdev **cdevp, struct device **devp)
+{
+       const dev_t dev = MKDEV(IPATH_MAJOR, minor);
+       struct cdev *cdev = NULL;
+       struct device *device = NULL;
+       int ret;
+
+       cdev = cdev_alloc();
+       if (!cdev) {
+               printk(KERN_ERR IPATH_DRV_NAME
+                      ": Could not allocate cdev for minor %d, %s\n",
+                      minor, name);
+               ret = -ENOMEM;
+               goto done;
+       }
+
+       cdev->owner = THIS_MODULE;
+       cdev->ops = fops;
+       kobject_set_name(&cdev->kobj, name);
+
+       ret = cdev_add(cdev, dev, 1);
+       if (ret < 0) {
+               printk(KERN_ERR IPATH_DRV_NAME
+                      ": Could not add cdev for minor %d, %s (err %d)\n",
+                      minor, name, -ret);
+               goto err_cdev;
+       }
+
+       device = device_create(ipath_class, NULL, dev, NULL, name);
+
+       if (IS_ERR(device)) {
+               ret = PTR_ERR(device);
+               printk(KERN_ERR IPATH_DRV_NAME ": Could not create "
+                      "device for minor %d, %s (err %d)\n",
+                      minor, name, -ret);
+               goto err_cdev;
+       }
+
+       goto done;
+
+err_cdev:
+       cdev_del(cdev);
+       cdev = NULL;
+
+done:
+       if (ret >= 0) {
+               *cdevp = cdev;
+               *devp = device;
+       } else {
+               *cdevp = NULL;
+               *devp = NULL;
+       }
+
+       return ret;
+}
+
+int ipath_cdev_init(int minor, char *name, const struct file_operations *fops,
+                   struct cdev **cdevp, struct device **devp)
+{
+       return init_cdev(minor, name, fops, cdevp, devp);
+}
+
+static void cleanup_cdev(struct cdev **cdevp,
+                        struct device **devp)
+{
+       struct device *dev = *devp;
+
+       if (dev) {
+               device_unregister(dev);
+               *devp = NULL;
+       }
+
+       if (*cdevp) {
+               cdev_del(*cdevp);
+               *cdevp = NULL;
+       }
+}
+
+void ipath_cdev_cleanup(struct cdev **cdevp,
+                       struct device **devp)
+{
+       cleanup_cdev(cdevp, devp);
+}
+
+static struct cdev *wildcard_cdev;
+static struct device *wildcard_dev;
+
+static const dev_t dev = MKDEV(IPATH_MAJOR, 0);
+
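+/* Register the driver's chrdev region and create its device class. */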
+static int user_init(void)
+{
+       int ret;
+
+       ret = register_chrdev_region(dev, IPATH_NMINORS, IPATH_DRV_NAME);
+       if (ret < 0) {
+               printk(KERN_ERR IPATH_DRV_NAME ": Could not register "
+                      "chrdev region (err %d)\n", -ret);
+               goto done;
+       }
+
+       ipath_class = class_create(THIS_MODULE, IPATH_DRV_NAME);
+
+       if (IS_ERR(ipath_class)) {
+               ret = PTR_ERR(ipath_class);
+               printk(KERN_ERR IPATH_DRV_NAME ": Could not create "
+                      "device class (err %d)\n", -ret);
+               goto bail;
+       }
+
+       goto done;
+bail:
+       unregister_chrdev_region(dev, IPATH_NMINORS);
+done:
+       return ret;
+}
+
+static void user_cleanup(void)
+{
+       if (ipath_class) {
+               class_destroy(ipath_class);
+               ipath_class = NULL;
+       }
+
+       unregister_chrdev_region(dev, IPATH_NMINORS);
+}
+
+static atomic_t user_count = ATOMIC_INIT(0);
+static atomic_t user_setup = ATOMIC_INIT(0);
+
+int ipath_user_add(struct ipath_devdata *dd)
+{
+       char name[10];
+       int ret;
+
+       if (atomic_inc_return(&user_count) == 1) {
+               ret = user_init();
+               if (ret < 0) {
+                       ipath_dev_err(dd, "Unable to set up user support: "
+                                     "error %d\n", -ret);
+                       goto bail;
+               }
+               ret = init_cdev(0, "ipath", &ipath_file_ops, &wildcard_cdev,
+                               &wildcard_dev);
+               if (ret < 0) {
+                       ipath_dev_err(dd, "Could not create wildcard "
+                                     "minor: error %d\n", -ret);
+                       goto bail_user;
+               }
+
+               atomic_set(&user_setup, 1);
+       }
+
+       snprintf(name, sizeof(name), "ipath%d", dd->ipath_unit);
+
+       ret = init_cdev(dd->ipath_unit + 1, name, &ipath_file_ops,
+                       &dd->user_cdev, &dd->user_dev);
+       if (ret < 0)
+               ipath_dev_err(dd, "Could not create user minor %d, %s\n",
+                             dd->ipath_unit + 1, name);
+
+       goto bail;
+
+bail_user:
+       user_cleanup();
+bail:
+       return ret;
+}
+
+void ipath_user_remove(struct ipath_devdata *dd)
+{
+       cleanup_cdev(&dd->user_cdev, &dd->user_dev);
+
+       if (atomic_dec_return(&user_count) == 0) {
+               if (atomic_read(&user_setup) == 0)
+                       goto bail;
+
+               cleanup_cdev(&wildcard_cdev, &wildcard_dev);
+               user_cleanup();
+
+               atomic_set(&user_setup, 0);
+       }
+bail:
+       return;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_fs.c b/drivers/staging/rdma/ipath/ipath_fs.c
new file mode 100644 (file)
index 0000000..25422a3
--- /dev/null
@@ -0,0 +1,422 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/init.h>
+#include <linux/namei.h>
+#include <linux/slab.h>
+
+#include "ipath_kernel.h"
+
+#define IPATHFS_MAGIC 0x726a77
+
+static struct super_block *ipath_super;
+
+static int ipathfs_mknod(struct inode *dir, struct dentry *dentry,
+                        umode_t mode, const struct file_operations *fops,
+                        void *data)
+{
+       int error;
+       struct inode *inode = new_inode(dir->i_sb);
+
+       if (!inode) {
+               error = -EPERM;
+               goto bail;
+       }
+
+       inode->i_ino = get_next_ino();
+       inode->i_mode = mode;
+       inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+       inode->i_private = data;
+       if (S_ISDIR(mode)) {
+               inode->i_op = &simple_dir_inode_operations;
+               inc_nlink(inode);
+               inc_nlink(dir);
+       }
+
+       inode->i_fop = fops;
+
+       d_instantiate(dentry, inode);
+       error = 0;
+
+bail:
+       return error;
+}
+
+static int create_file(const char *name, umode_t mode,
+                      struct dentry *parent, struct dentry **dentry,
+                      const struct file_operations *fops, void *data)
+{
+       int error;
+
+       mutex_lock(&d_inode(parent)->i_mutex);
+       *dentry = lookup_one_len(name, parent, strlen(name));
+       if (!IS_ERR(*dentry))
+               error = ipathfs_mknod(d_inode(parent), *dentry,
+                                     mode, fops, data);
+       else
+               error = PTR_ERR(*dentry);
+       mutex_unlock(&d_inode(parent)->i_mutex);
+
+       return error;
+}
+
+static ssize_t atomic_stats_read(struct file *file, char __user *buf,
+                                size_t count, loff_t *ppos)
+{
+       return simple_read_from_buffer(buf, count, ppos, &ipath_stats,
+                                      sizeof ipath_stats);
+}
+
+static const struct file_operations atomic_stats_ops = {
+       .read = atomic_stats_read,
+       .llseek = default_llseek,
+};
+
+static ssize_t atomic_counters_read(struct file *file, char __user *buf,
+                                   size_t count, loff_t *ppos)
+{
+       struct infinipath_counters counters;
+       struct ipath_devdata *dd;
+
+       dd = file_inode(file)->i_private;
+       dd->ipath_f_read_counters(dd, &counters);
+
+       return simple_read_from_buffer(buf, count, ppos, &counters,
+                                      sizeof counters);
+}
+
+static const struct file_operations atomic_counters_ops = {
+       .read = atomic_counters_read,
+       .llseek = default_llseek,
+};
+
+static ssize_t flash_read(struct file *file, char __user *buf,
+                         size_t count, loff_t *ppos)
+{
+       struct ipath_devdata *dd;
+       ssize_t ret;
+       loff_t pos;
+       char *tmp;
+
+       pos = *ppos;
+
+       if (pos < 0) {
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       if (pos >= sizeof(struct ipath_flash)) {
+               ret = 0;
+               goto bail;
+       }
+
+       if (count > sizeof(struct ipath_flash) - pos)
+               count = sizeof(struct ipath_flash) - pos;
+
+       tmp = kmalloc(count, GFP_KERNEL);
+       if (!tmp) {
+               ret = -ENOMEM;
+               goto bail;
+       }
+
+       dd = file_inode(file)->i_private;
+       if (ipath_eeprom_read(dd, pos, tmp, count)) {
+               ipath_dev_err(dd, "failed to read from flash\n");
+               ret = -ENXIO;
+               goto bail_tmp;
+       }
+
+       if (copy_to_user(buf, tmp, count)) {
+               ret = -EFAULT;
+               goto bail_tmp;
+       }
+
+       *ppos = pos + count;
+       ret = count;
+
+bail_tmp:
+       kfree(tmp);
+
+bail:
+       return ret;
+}
+
+static ssize_t flash_write(struct file *file, const char __user *buf,
+                          size_t count, loff_t *ppos)
+{
+       struct ipath_devdata *dd;
+       ssize_t ret;
+       loff_t pos;
+       char *tmp;
+
+       pos = *ppos;
+
+       if (pos != 0) {
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       if (count != sizeof(struct ipath_flash)) {
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       tmp = kmalloc(count, GFP_KERNEL);
+       if (!tmp) {
+               ret = -ENOMEM;
+               goto bail;
+       }
+
+       if (copy_from_user(tmp, buf, count)) {
+               ret = -EFAULT;
+               goto bail_tmp;
+       }
+
+       dd = file_inode(file)->i_private;
+       if (ipath_eeprom_write(dd, pos, tmp, count)) {
+               ret = -ENXIO;
+               ipath_dev_err(dd, "failed to write to flash\n");
+               goto bail_tmp;
+       }
+
+       *ppos = pos + count;
+       ret = count;
+
+bail_tmp:
+       kfree(tmp);
+
+bail:
+       return ret;
+}
+
+static const struct file_operations flash_ops = {
+       .read = flash_read,
+       .write = flash_write,
+       .llseek = default_llseek,
+};
+
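+/*
+ * Create the per-device ipathfs directory with its atomic_counters and
+ * flash files.
+ */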
+static int create_device_files(struct super_block *sb,
+                              struct ipath_devdata *dd)
+{
+       struct dentry *dir, *tmp;
+       char unit[10];
+       int ret;
+
+       snprintf(unit, sizeof unit, "%02d", dd->ipath_unit);
+       ret = create_file(unit, S_IFDIR|S_IRUGO|S_IXUGO, sb->s_root, &dir,
+                         &simple_dir_operations, dd);
+       if (ret) {
+               printk(KERN_ERR "create_file(%s) failed: %d\n", unit, ret);
+               goto bail;
+       }
+
+       ret = create_file("atomic_counters", S_IFREG|S_IRUGO, dir, &tmp,
+                         &atomic_counters_ops, dd);
+       if (ret) {
+               printk(KERN_ERR "create_file(%s/atomic_counters) "
+                      "failed: %d\n", unit, ret);
+               goto bail;
+       }
+
+       ret = create_file("flash", S_IFREG|S_IWUSR|S_IRUGO, dir, &tmp,
+                         &flash_ops, dd);
+       if (ret) {
+               printk(KERN_ERR "create_file(%s/flash) "
+                      "failed: %d\n", unit, ret);
+               goto bail;
+       }
+
+bail:
+       return ret;
+}
+
+static int remove_file(struct dentry *parent, char *name)
+{
+       struct dentry *tmp;
+       int ret;
+
+       tmp = lookup_one_len(name, parent, strlen(name));
+
+       if (IS_ERR(tmp)) {
+               ret = PTR_ERR(tmp);
+               goto bail;
+       }
+
+       spin_lock(&tmp->d_lock);
+       if (simple_positive(tmp)) {
+               dget_dlock(tmp);
+               __d_drop(tmp);
+               spin_unlock(&tmp->d_lock);
+               simple_unlink(d_inode(parent), tmp);
+       } else
+               spin_unlock(&tmp->d_lock);
+
+       ret = 0;
+bail:
+       /*
+        * We don't expect clients to care about the return value, but
+        * it's there if they need it.
+        */
+       return ret;
+}
+
+static int remove_device_files(struct super_block *sb,
+                              struct ipath_devdata *dd)
+{
+       struct dentry *dir, *root;
+       char unit[10];
+       int ret;
+
+       root = dget(sb->s_root);
+       mutex_lock(&d_inode(root)->i_mutex);
+       snprintf(unit, sizeof unit, "%02d", dd->ipath_unit);
+       dir = lookup_one_len(unit, root, strlen(unit));
+
+       if (IS_ERR(dir)) {
+               ret = PTR_ERR(dir);
+               printk(KERN_ERR "Lookup of %s failed\n", unit);
+               goto bail;
+       }
+
+       remove_file(dir, "flash");
+       remove_file(dir, "atomic_counters");
+       d_delete(dir);
+       ret = simple_rmdir(d_inode(root), dir);
+
+bail:
+       mutex_unlock(&d_inode(root)->i_mutex);
+       dput(root);
+       return ret;
+}
+
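+/*
+ * Fill the ipathfs superblock: create the top-level atomic_stats file,
+ * then a directory for every device currently on ipath_dev_list.
+ */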
+static int ipathfs_fill_super(struct super_block *sb, void *data,
+                             int silent)
+{
+       struct ipath_devdata *dd, *tmp;
+       unsigned long flags;
+       int ret;
+
+       static struct tree_descr files[] = {
+               [2] = {"atomic_stats", &atomic_stats_ops, S_IRUGO},
+               {""},
+       };
+
+       ret = simple_fill_super(sb, IPATHFS_MAGIC, files);
+       if (ret) {
+               printk(KERN_ERR "simple_fill_super failed: %d\n", ret);
+               goto bail;
+       }
+
+       spin_lock_irqsave(&ipath_devs_lock, flags);
+
+       list_for_each_entry_safe(dd, tmp, &ipath_dev_list, ipath_list) {
+               spin_unlock_irqrestore(&ipath_devs_lock, flags);
+               ret = create_device_files(sb, dd);
+               if (ret)
+                       goto bail;
+               spin_lock_irqsave(&ipath_devs_lock, flags);
+       }
+
+       spin_unlock_irqrestore(&ipath_devs_lock, flags);
+
+bail:
+       return ret;
+}
+
+static struct dentry *ipathfs_mount(struct file_system_type *fs_type,
+                       int flags, const char *dev_name, void *data)
+{
+       struct dentry *ret;
+       ret = mount_single(fs_type, flags, data, ipathfs_fill_super);
+       if (!IS_ERR(ret))
+               ipath_super = ret->d_sb;
+       return ret;
+}
+
+static void ipathfs_kill_super(struct super_block *s)
+{
+       kill_litter_super(s);
+       ipath_super = NULL;
+}
+
+int ipathfs_add_device(struct ipath_devdata *dd)
+{
+       int ret;
+
+       if (ipath_super == NULL) {
+               ret = 0;
+               goto bail;
+       }
+
+       ret = create_device_files(ipath_super, dd);
+
+bail:
+       return ret;
+}
+
+int ipathfs_remove_device(struct ipath_devdata *dd)
+{
+       int ret;
+
+       if (ipath_super == NULL) {
+               ret = 0;
+               goto bail;
+       }
+
+       ret = remove_device_files(ipath_super, dd);
+
+bail:
+       return ret;
+}
+
+static struct file_system_type ipathfs_fs_type = {
+       .owner =        THIS_MODULE,
+       .name =         "ipathfs",
+       .mount =        ipathfs_mount,
+       .kill_sb =      ipathfs_kill_super,
+};
+MODULE_ALIAS_FS("ipathfs");
+
+int __init ipath_init_ipathfs(void)
+{
+       return register_filesystem(&ipathfs_fs_type);
+}
+
+void __exit ipath_exit_ipathfs(void)
+{
+       unregister_filesystem(&ipathfs_fs_type);
+}
diff --git a/drivers/staging/rdma/ipath/ipath_iba6110.c b/drivers/staging/rdma/ipath/ipath_iba6110.c
new file mode 100644 (file)
index 0000000..7cc3054
--- /dev/null
@@ -0,0 +1,1940 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * This file contains all of the code that is specific to the InfiniPath
+ * HT chip.
+ */
+
+#include <linux/vmalloc.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/htirq.h>
+#include <rdma/ib_verbs.h>
+
+#include "ipath_kernel.h"
+#include "ipath_registers.h"
+
+static void ipath_setup_ht_setextled(struct ipath_devdata *, u64, u64);
+
+
+/*
+ * This lists the InfiniPath registers, in the actual chip layout.
+ * This structure should never be directly accessed.
+ *
+ * The names are in InterCap form because they're taken straight from
+ * the chip specification.  Since they're only used in this file, they
+ * don't pollute the rest of the source.
+ */
+
+struct _infinipath_do_not_use_kernel_regs {
+       unsigned long long Revision;
+       unsigned long long Control;
+       unsigned long long PageAlign;
+       unsigned long long PortCnt;
+       unsigned long long DebugPortSelect;
+       unsigned long long DebugPort;
+       unsigned long long SendRegBase;
+       unsigned long long UserRegBase;
+       unsigned long long CounterRegBase;
+       unsigned long long Scratch;
+       unsigned long long ReservedMisc1;
+       unsigned long long InterruptConfig;
+       unsigned long long IntBlocked;
+       unsigned long long IntMask;
+       unsigned long long IntStatus;
+       unsigned long long IntClear;
+       unsigned long long ErrorMask;
+       unsigned long long ErrorStatus;
+       unsigned long long ErrorClear;
+       unsigned long long HwErrMask;
+       unsigned long long HwErrStatus;
+       unsigned long long HwErrClear;
+       unsigned long long HwDiagCtrl;
+       unsigned long long MDIO;
+       unsigned long long IBCStatus;
+       unsigned long long IBCCtrl;
+       unsigned long long ExtStatus;
+       unsigned long long ExtCtrl;
+       unsigned long long GPIOOut;
+       unsigned long long GPIOMask;
+       unsigned long long GPIOStatus;
+       unsigned long long GPIOClear;
+       unsigned long long RcvCtrl;
+       unsigned long long RcvBTHQP;
+       unsigned long long RcvHdrSize;
+       unsigned long long RcvHdrCnt;
+       unsigned long long RcvHdrEntSize;
+       unsigned long long RcvTIDBase;
+       unsigned long long RcvTIDCnt;
+       unsigned long long RcvEgrBase;
+       unsigned long long RcvEgrCnt;
+       unsigned long long RcvBufBase;
+       unsigned long long RcvBufSize;
+       unsigned long long RxIntMemBase;
+       unsigned long long RxIntMemSize;
+       unsigned long long RcvPartitionKey;
+       unsigned long long ReservedRcv[10];
+       unsigned long long SendCtrl;
+       unsigned long long SendPIOBufBase;
+       unsigned long long SendPIOSize;
+       unsigned long long SendPIOBufCnt;
+       unsigned long long SendPIOAvailAddr;
+       unsigned long long TxIntMemBase;
+       unsigned long long TxIntMemSize;
+       unsigned long long ReservedSend[9];
+       unsigned long long SendBufferError;
+       unsigned long long SendBufferErrorCONT1;
+       unsigned long long SendBufferErrorCONT2;
+       unsigned long long SendBufferErrorCONT3;
+       unsigned long long ReservedSBE[4];
+       unsigned long long RcvHdrAddr0;
+       unsigned long long RcvHdrAddr1;
+       unsigned long long RcvHdrAddr2;
+       unsigned long long RcvHdrAddr3;
+       unsigned long long RcvHdrAddr4;
+       unsigned long long RcvHdrAddr5;
+       unsigned long long RcvHdrAddr6;
+       unsigned long long RcvHdrAddr7;
+       unsigned long long RcvHdrAddr8;
+       unsigned long long ReservedRHA[7];
+       unsigned long long RcvHdrTailAddr0;
+       unsigned long long RcvHdrTailAddr1;
+       unsigned long long RcvHdrTailAddr2;
+       unsigned long long RcvHdrTailAddr3;
+       unsigned long long RcvHdrTailAddr4;
+       unsigned long long RcvHdrTailAddr5;
+       unsigned long long RcvHdrTailAddr6;
+       unsigned long long RcvHdrTailAddr7;
+       unsigned long long RcvHdrTailAddr8;
+       unsigned long long ReservedRHTA[7];
+       unsigned long long Sync;        /* Software only */
+       unsigned long long Dump;        /* Software only */
+       unsigned long long SimVer;      /* Software only */
+       unsigned long long ReservedSW[5];
+       unsigned long long SerdesConfig0;
+       unsigned long long SerdesConfig1;
+       unsigned long long SerdesStatus;
+       unsigned long long XGXSConfig;
+       unsigned long long ReservedSW2[4];
+};
+
+struct _infinipath_do_not_use_counters {
+       __u64 LBIntCnt;
+       __u64 LBFlowStallCnt;
+       __u64 Reserved1;
+       __u64 TxUnsupVLErrCnt;
+       __u64 TxDataPktCnt;
+       __u64 TxFlowPktCnt;
+       __u64 TxDwordCnt;
+       __u64 TxLenErrCnt;
+       __u64 TxMaxMinLenErrCnt;
+       __u64 TxUnderrunCnt;
+       __u64 TxFlowStallCnt;
+       __u64 TxDroppedPktCnt;
+       __u64 RxDroppedPktCnt;
+       __u64 RxDataPktCnt;
+       __u64 RxFlowPktCnt;
+       __u64 RxDwordCnt;
+       __u64 RxLenErrCnt;
+       __u64 RxMaxMinLenErrCnt;
+       __u64 RxICRCErrCnt;
+       __u64 RxVCRCErrCnt;
+       __u64 RxFlowCtrlErrCnt;
+       __u64 RxBadFormatCnt;
+       __u64 RxLinkProblemCnt;
+       __u64 RxEBPCnt;
+       __u64 RxLPCRCErrCnt;
+       __u64 RxBufOvflCnt;
+       __u64 RxTIDFullErrCnt;
+       __u64 RxTIDValidErrCnt;
+       __u64 RxPKeyMismatchCnt;
+       __u64 RxP0HdrEgrOvflCnt;
+       __u64 RxP1HdrEgrOvflCnt;
+       __u64 RxP2HdrEgrOvflCnt;
+       __u64 RxP3HdrEgrOvflCnt;
+       __u64 RxP4HdrEgrOvflCnt;
+       __u64 RxP5HdrEgrOvflCnt;
+       __u64 RxP6HdrEgrOvflCnt;
+       __u64 RxP7HdrEgrOvflCnt;
+       __u64 RxP8HdrEgrOvflCnt;
+       __u64 Reserved6;
+       __u64 Reserved7;
+       __u64 IBStatusChangeCnt;
+       __u64 IBLinkErrRecoveryCnt;
+       __u64 IBLinkDownedCnt;
+       __u64 IBSymbolErrCnt;
+};
+
+#define IPATH_KREG_OFFSET(field) (offsetof( \
+       struct _infinipath_do_not_use_kernel_regs, field) / sizeof(u64))
+#define IPATH_CREG_OFFSET(field) (offsetof( \
+       struct _infinipath_do_not_use_counters, field) / sizeof(u64))
+
+static const struct ipath_kregs ipath_ht_kregs = {
+       .kr_control = IPATH_KREG_OFFSET(Control),
+       .kr_counterregbase = IPATH_KREG_OFFSET(CounterRegBase),
+       .kr_debugport = IPATH_KREG_OFFSET(DebugPort),
+       .kr_debugportselect = IPATH_KREG_OFFSET(DebugPortSelect),
+       .kr_errorclear = IPATH_KREG_OFFSET(ErrorClear),
+       .kr_errormask = IPATH_KREG_OFFSET(ErrorMask),
+       .kr_errorstatus = IPATH_KREG_OFFSET(ErrorStatus),
+       .kr_extctrl = IPATH_KREG_OFFSET(ExtCtrl),
+       .kr_extstatus = IPATH_KREG_OFFSET(ExtStatus),
+       .kr_gpio_clear = IPATH_KREG_OFFSET(GPIOClear),
+       .kr_gpio_mask = IPATH_KREG_OFFSET(GPIOMask),
+       .kr_gpio_out = IPATH_KREG_OFFSET(GPIOOut),
+       .kr_gpio_status = IPATH_KREG_OFFSET(GPIOStatus),
+       .kr_hwdiagctrl = IPATH_KREG_OFFSET(HwDiagCtrl),
+       .kr_hwerrclear = IPATH_KREG_OFFSET(HwErrClear),
+       .kr_hwerrmask = IPATH_KREG_OFFSET(HwErrMask),
+       .kr_hwerrstatus = IPATH_KREG_OFFSET(HwErrStatus),
+       .kr_ibcctrl = IPATH_KREG_OFFSET(IBCCtrl),
+       .kr_ibcstatus = IPATH_KREG_OFFSET(IBCStatus),
+       .kr_intblocked = IPATH_KREG_OFFSET(IntBlocked),
+       .kr_intclear = IPATH_KREG_OFFSET(IntClear),
+       .kr_interruptconfig = IPATH_KREG_OFFSET(InterruptConfig),
+       .kr_intmask = IPATH_KREG_OFFSET(IntMask),
+       .kr_intstatus = IPATH_KREG_OFFSET(IntStatus),
+       .kr_mdio = IPATH_KREG_OFFSET(MDIO),
+       .kr_pagealign = IPATH_KREG_OFFSET(PageAlign),
+       .kr_partitionkey = IPATH_KREG_OFFSET(RcvPartitionKey),
+       .kr_portcnt = IPATH_KREG_OFFSET(PortCnt),
+       .kr_rcvbthqp = IPATH_KREG_OFFSET(RcvBTHQP),
+       .kr_rcvbufbase = IPATH_KREG_OFFSET(RcvBufBase),
+       .kr_rcvbufsize = IPATH_KREG_OFFSET(RcvBufSize),
+       .kr_rcvctrl = IPATH_KREG_OFFSET(RcvCtrl),
+       .kr_rcvegrbase = IPATH_KREG_OFFSET(RcvEgrBase),
+       .kr_rcvegrcnt = IPATH_KREG_OFFSET(RcvEgrCnt),
+       .kr_rcvhdrcnt = IPATH_KREG_OFFSET(RcvHdrCnt),
+       .kr_rcvhdrentsize = IPATH_KREG_OFFSET(RcvHdrEntSize),
+       .kr_rcvhdrsize = IPATH_KREG_OFFSET(RcvHdrSize),
+       .kr_rcvintmembase = IPATH_KREG_OFFSET(RxIntMemBase),
+       .kr_rcvintmemsize = IPATH_KREG_OFFSET(RxIntMemSize),
+       .kr_rcvtidbase = IPATH_KREG_OFFSET(RcvTIDBase),
+       .kr_rcvtidcnt = IPATH_KREG_OFFSET(RcvTIDCnt),
+       .kr_revision = IPATH_KREG_OFFSET(Revision),
+       .kr_scratch = IPATH_KREG_OFFSET(Scratch),
+       .kr_sendbuffererror = IPATH_KREG_OFFSET(SendBufferError),
+       .kr_sendctrl = IPATH_KREG_OFFSET(SendCtrl),
+       .kr_sendpioavailaddr = IPATH_KREG_OFFSET(SendPIOAvailAddr),
+       .kr_sendpiobufbase = IPATH_KREG_OFFSET(SendPIOBufBase),
+       .kr_sendpiobufcnt = IPATH_KREG_OFFSET(SendPIOBufCnt),
+       .kr_sendpiosize = IPATH_KREG_OFFSET(SendPIOSize),
+       .kr_sendregbase = IPATH_KREG_OFFSET(SendRegBase),
+       .kr_txintmembase = IPATH_KREG_OFFSET(TxIntMemBase),
+       .kr_txintmemsize = IPATH_KREG_OFFSET(TxIntMemSize),
+       .kr_userregbase = IPATH_KREG_OFFSET(UserRegBase),
+       .kr_serdesconfig0 = IPATH_KREG_OFFSET(SerdesConfig0),
+       .kr_serdesconfig1 = IPATH_KREG_OFFSET(SerdesConfig1),
+       .kr_serdesstatus = IPATH_KREG_OFFSET(SerdesStatus),
+       .kr_xgxsconfig = IPATH_KREG_OFFSET(XGXSConfig),
+       /*
+        * These should not be used directly via ipath_write_kreg64(),
+        * use them with ipath_write_kreg64_port().
+        */
+       .kr_rcvhdraddr = IPATH_KREG_OFFSET(RcvHdrAddr0),
+       .kr_rcvhdrtailaddr = IPATH_KREG_OFFSET(RcvHdrTailAddr0)
+};
+
+static const struct ipath_cregs ipath_ht_cregs = {
+       .cr_badformatcnt = IPATH_CREG_OFFSET(RxBadFormatCnt),
+       .cr_erricrccnt = IPATH_CREG_OFFSET(RxICRCErrCnt),
+       .cr_errlinkcnt = IPATH_CREG_OFFSET(RxLinkProblemCnt),
+       .cr_errlpcrccnt = IPATH_CREG_OFFSET(RxLPCRCErrCnt),
+       .cr_errpkey = IPATH_CREG_OFFSET(RxPKeyMismatchCnt),
+       .cr_errrcvflowctrlcnt = IPATH_CREG_OFFSET(RxFlowCtrlErrCnt),
+       .cr_err_rlencnt = IPATH_CREG_OFFSET(RxLenErrCnt),
+       .cr_errslencnt = IPATH_CREG_OFFSET(TxLenErrCnt),
+       .cr_errtidfull = IPATH_CREG_OFFSET(RxTIDFullErrCnt),
+       .cr_errtidvalid = IPATH_CREG_OFFSET(RxTIDValidErrCnt),
+       .cr_errvcrccnt = IPATH_CREG_OFFSET(RxVCRCErrCnt),
+       .cr_ibstatuschange = IPATH_CREG_OFFSET(IBStatusChangeCnt),
+       /* calc from Reg_CounterRegBase + offset */
+       .cr_intcnt = IPATH_CREG_OFFSET(LBIntCnt),
+       .cr_invalidrlencnt = IPATH_CREG_OFFSET(RxMaxMinLenErrCnt),
+       .cr_invalidslencnt = IPATH_CREG_OFFSET(TxMaxMinLenErrCnt),
+       .cr_lbflowstallcnt = IPATH_CREG_OFFSET(LBFlowStallCnt),
+       .cr_pktrcvcnt = IPATH_CREG_OFFSET(RxDataPktCnt),
+       .cr_pktrcvflowctrlcnt = IPATH_CREG_OFFSET(RxFlowPktCnt),
+       .cr_pktsendcnt = IPATH_CREG_OFFSET(TxDataPktCnt),
+       .cr_pktsendflowcnt = IPATH_CREG_OFFSET(TxFlowPktCnt),
+       .cr_portovflcnt = IPATH_CREG_OFFSET(RxP0HdrEgrOvflCnt),
+       .cr_rcvebpcnt = IPATH_CREG_OFFSET(RxEBPCnt),
+       .cr_rcvovflcnt = IPATH_CREG_OFFSET(RxBufOvflCnt),
+       .cr_senddropped = IPATH_CREG_OFFSET(TxDroppedPktCnt),
+       .cr_sendstallcnt = IPATH_CREG_OFFSET(TxFlowStallCnt),
+       .cr_sendunderruncnt = IPATH_CREG_OFFSET(TxUnderrunCnt),
+       .cr_wordrcvcnt = IPATH_CREG_OFFSET(RxDwordCnt),
+       .cr_wordsendcnt = IPATH_CREG_OFFSET(TxDwordCnt),
+       .cr_unsupvlcnt = IPATH_CREG_OFFSET(TxUnsupVLErrCnt),
+       .cr_rxdroppktcnt = IPATH_CREG_OFFSET(RxDroppedPktCnt),
+       .cr_iblinkerrrecovcnt = IPATH_CREG_OFFSET(IBLinkErrRecoveryCnt),
+       .cr_iblinkdowncnt = IPATH_CREG_OFFSET(IBLinkDownedCnt),
+       .cr_ibsymbolerrcnt = IPATH_CREG_OFFSET(IBSymbolErrCnt)
+};
+
+/* kr_intstatus, kr_intclear, kr_intmask bits */
+#define INFINIPATH_I_RCVURG_MASK ((1U<<9)-1)
+#define INFINIPATH_I_RCVURG_SHIFT 0
+#define INFINIPATH_I_RCVAVAIL_MASK ((1U<<9)-1)
+#define INFINIPATH_I_RCVAVAIL_SHIFT 12
+
+/* kr_hwerrclear, kr_hwerrmask, kr_hwerrstatus, bits */
+#define INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT 0
+#define INFINIPATH_HWE_HTCMEMPARITYERR_MASK 0x3FFFFFULL
+#define INFINIPATH_HWE_HTCLNKABYTE0CRCERR   0x0000000000800000ULL
+#define INFINIPATH_HWE_HTCLNKABYTE1CRCERR   0x0000000001000000ULL
+#define INFINIPATH_HWE_HTCLNKBBYTE0CRCERR   0x0000000002000000ULL
+#define INFINIPATH_HWE_HTCLNKBBYTE1CRCERR   0x0000000004000000ULL
+#define INFINIPATH_HWE_HTCMISCERR4          0x0000000008000000ULL
+#define INFINIPATH_HWE_HTCMISCERR5          0x0000000010000000ULL
+#define INFINIPATH_HWE_HTCMISCERR6          0x0000000020000000ULL
+#define INFINIPATH_HWE_HTCMISCERR7          0x0000000040000000ULL
+#define INFINIPATH_HWE_HTCBUSTREQPARITYERR  0x0000000080000000ULL
+#define INFINIPATH_HWE_HTCBUSTRESPPARITYERR 0x0000000100000000ULL
+#define INFINIPATH_HWE_HTCBUSIREQPARITYERR  0x0000000200000000ULL
+#define INFINIPATH_HWE_COREPLL_FBSLIP       0x0080000000000000ULL
+#define INFINIPATH_HWE_COREPLL_RFSLIP       0x0100000000000000ULL
+#define INFINIPATH_HWE_HTBPLL_FBSLIP        0x0200000000000000ULL
+#define INFINIPATH_HWE_HTBPLL_RFSLIP        0x0400000000000000ULL
+#define INFINIPATH_HWE_HTAPLL_FBSLIP        0x0800000000000000ULL
+#define INFINIPATH_HWE_HTAPLL_RFSLIP        0x1000000000000000ULL
+#define INFINIPATH_HWE_SERDESPLLFAILED      0x2000000000000000ULL
+
+#define IBA6110_IBCS_LINKTRAININGSTATE_MASK 0xf
+#define IBA6110_IBCS_LINKSTATE_SHIFT 4
+
+/* kr_extstatus bits */
+#define INFINIPATH_EXTS_FREQSEL 0x2
+#define INFINIPATH_EXTS_SERDESSEL 0x4
+#define INFINIPATH_EXTS_MEMBIST_ENDTEST     0x0000000000004000
+#define INFINIPATH_EXTS_MEMBIST_CORRECT     0x0000000000008000
+
+
+/* TID entries (memory), HT-only */
+#define INFINIPATH_RT_ADDR_MASK 0xFFFFFFFFFFULL        /* 40 bits valid */
+#define INFINIPATH_RT_VALID 0x8000000000000000ULL
+#define INFINIPATH_RT_ADDR_SHIFT 0
+#define INFINIPATH_RT_BUFSIZE_MASK 0x3FFFULL
+#define INFINIPATH_RT_BUFSIZE_SHIFT 48
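The INFINIPATH_RT_* definitions above describe how an expected-receive TID entry packs three fields into one 64-bit word: a 40-bit physical address at bit 0, a buffer-size field (in 32-bit words) at bit 48, and a valid bit at bit 63; ipath_ht_put_tid() further down writes such words.  A sketch of that packing using the masks and shifts above (the helper itself is illustrative, not driver code):

#include <stdint.h>

/* Illustrative only: compose an expected-receive TID word from a
 * 40-bit physical address and a length in 32-bit words. */
static uint64_t pack_expected_tid(uint64_t phys_addr, uint64_t len_words)
{
	uint64_t tid;

	tid  = (phys_addr & INFINIPATH_RT_ADDR_MASK) << INFINIPATH_RT_ADDR_SHIFT;
	tid |= (len_words & INFINIPATH_RT_BUFSIZE_MASK) << INFINIPATH_RT_BUFSIZE_SHIFT;
	tid |= INFINIPATH_RT_VALID;
	return tid;
}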
+
+#define INFINIPATH_R_INTRAVAIL_SHIFT 16
+#define INFINIPATH_R_TAILUPD_SHIFT 31
+
+/* kr_xgxsconfig bits */
+#define INFINIPATH_XGXS_RESET          0x7ULL
+
+/*
+ * masks and bits that are different in different chips, or present only
+ * in one
+ */
+static const ipath_err_t infinipath_hwe_htcmemparityerr_mask =
+    INFINIPATH_HWE_HTCMEMPARITYERR_MASK;
+static const ipath_err_t infinipath_hwe_htcmemparityerr_shift =
+    INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT;
+
+static const ipath_err_t infinipath_hwe_htclnkabyte0crcerr =
+    INFINIPATH_HWE_HTCLNKABYTE0CRCERR;
+static const ipath_err_t infinipath_hwe_htclnkabyte1crcerr =
+    INFINIPATH_HWE_HTCLNKABYTE1CRCERR;
+static const ipath_err_t infinipath_hwe_htclnkbbyte0crcerr =
+    INFINIPATH_HWE_HTCLNKBBYTE0CRCERR;
+static const ipath_err_t infinipath_hwe_htclnkbbyte1crcerr =
+    INFINIPATH_HWE_HTCLNKBBYTE1CRCERR;
+
+#define _IPATH_GPIO_SDA_NUM 1
+#define _IPATH_GPIO_SCL_NUM 0
+
+#define IPATH_GPIO_SDA \
+       (1ULL << (_IPATH_GPIO_SDA_NUM+INFINIPATH_EXTC_GPIOOE_SHIFT))
+#define IPATH_GPIO_SCL \
+       (1ULL << (_IPATH_GPIO_SCL_NUM+INFINIPATH_EXTC_GPIOOE_SHIFT))
+
+/* keep the code below somewhat more readable; not used elsewhere */
+#define _IPATH_HTLINK0_CRCBITS (infinipath_hwe_htclnkabyte0crcerr |    \
+                               infinipath_hwe_htclnkabyte1crcerr)
+#define _IPATH_HTLINK1_CRCBITS (infinipath_hwe_htclnkbbyte0crcerr |    \
+                               infinipath_hwe_htclnkbbyte1crcerr)
+#define _IPATH_HTLANE0_CRCBITS (infinipath_hwe_htclnkabyte0crcerr |    \
+                               infinipath_hwe_htclnkbbyte0crcerr)
+#define _IPATH_HTLANE1_CRCBITS (infinipath_hwe_htclnkabyte1crcerr |    \
+                               infinipath_hwe_htclnkbbyte1crcerr)
+
+static void hwerr_crcbits(struct ipath_devdata *dd, ipath_err_t hwerrs,
+                         char *msg, size_t msgl)
+{
+       char bitsmsg[64];
+       ipath_err_t crcbits = hwerrs &
+               (_IPATH_HTLINK0_CRCBITS | _IPATH_HTLINK1_CRCBITS);
+       /* don't check if 8bit HT */
+       if (dd->ipath_flags & IPATH_8BIT_IN_HT0)
+               crcbits &= ~infinipath_hwe_htclnkabyte1crcerr;
+       /* don't check if 8bit HT */
+       if (dd->ipath_flags & IPATH_8BIT_IN_HT1)
+               crcbits &= ~infinipath_hwe_htclnkbbyte1crcerr;
+       /*
+        * We'll eventually want to ignore link errors on a link that
+        * is not in use, if any.  For now, complain about both.
+        */
+       if (crcbits) {
+               u16 ctrl0, ctrl1;
+               snprintf(bitsmsg, sizeof bitsmsg,
+                        "[HT%s lane %s CRC (%llx); powercycle to completely clear]",
+                        !(crcbits & _IPATH_HTLINK1_CRCBITS) ?
+                        "0 (A)" : (!(crcbits & _IPATH_HTLINK0_CRCBITS)
+                                   ? "1 (B)" : "0+1 (A+B)"),
+                        !(crcbits & _IPATH_HTLANE1_CRCBITS) ? "0"
+                        : (!(crcbits & _IPATH_HTLANE0_CRCBITS) ? "1" :
+                           "0+1"), (unsigned long long) crcbits);
+               strlcat(msg, bitsmsg, msgl);
+
+               /*
+                * print extra info for debugging.  slave/primary
+                * config word 4, 8 (link control 0, 1)
+                */
+
+               if (pci_read_config_word(dd->pcidev,
+                                        dd->ipath_ht_slave_off + 0x4,
+                                        &ctrl0))
+                       dev_info(&dd->pcidev->dev, "Couldn't read "
+                                "linkctrl0 of slave/primary "
+                                "config block\n");
+               else if (!(ctrl0 & 1 << 6))
+                       /* not if EOC bit set */
+                       ipath_dbg("HT linkctrl0 0x%x%s%s\n", ctrl0,
+                                 ((ctrl0 >> 8) & 7) ? " CRC" : "",
+                                 ((ctrl0 >> 4) & 1) ? "linkfail" :
+                                 "");
+               if (pci_read_config_word(dd->pcidev,
+                                        dd->ipath_ht_slave_off + 0x8,
+                                        &ctrl1))
+                       dev_info(&dd->pcidev->dev, "Couldn't read "
+                                "linkctrl1 of slave/primary "
+                                "config block\n");
+               else if (!(ctrl1 & 1 << 6))
+                       /* not if EOC bit set */
+                       ipath_dbg("HT linkctrl1 0x%x%s%s\n", ctrl1,
+                                 ((ctrl1 >> 8) & 7) ? " CRC" : "",
+                                 ((ctrl1 >> 4) & 1) ? "linkfail" :
+                                 "");
+
+               /* disable until driver reloaded */
+               dd->ipath_hwerrmask &= ~crcbits;
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
+                                dd->ipath_hwerrmask);
+               ipath_dbg("HT crc errs: %s\n", msg);
+       } else
+               ipath_dbg("ignoring HT crc errors 0x%llx, "
+                         "not in use\n", (unsigned long long)
+                         (hwerrs & (_IPATH_HTLINK0_CRCBITS |
+                                    _IPATH_HTLINK1_CRCBITS)));
+}
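As a worked example of the grouping macros above: if only INFINIPATH_HWE_HTCLNKABYTE1CRCERR is set (and HT0 is running 16-bit, so the bit is not masked off by IPATH_8BIT_IN_HT0), it falls outside both _IPATH_HTLINK1_CRCBITS and _IPATH_HTLANE0_CRCBITS, so hwerr_crcbits() above reports it as link "0 (A)", lane "1".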
+
+/* 6110 specific hardware errors... */
+static const struct ipath_hwerror_msgs ipath_6110_hwerror_msgs[] = {
+       INFINIPATH_HWE_MSG(HTCBUSIREQPARITYERR, "HTC Ireq Parity"),
+       INFINIPATH_HWE_MSG(HTCBUSTREQPARITYERR, "HTC Treq Parity"),
+       INFINIPATH_HWE_MSG(HTCBUSTRESPPARITYERR, "HTC Tresp Parity"),
+       INFINIPATH_HWE_MSG(HTCMISCERR5, "HT core Misc5"),
+       INFINIPATH_HWE_MSG(HTCMISCERR6, "HT core Misc6"),
+       INFINIPATH_HWE_MSG(HTCMISCERR7, "HT core Misc7"),
+       INFINIPATH_HWE_MSG(RXDSYNCMEMPARITYERR, "Rx Dsync"),
+       INFINIPATH_HWE_MSG(SERDESPLLFAILED, "SerDes PLL"),
+};
+
+#define TXE_PIO_PARITY ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF | \
+                       INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC) \
+                       << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT)
+#define RXE_EAGER_PARITY (INFINIPATH_HWE_RXEMEMPARITYERR_EAGERTID \
+                         << INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT)
+
+static void ipath_ht_txe_recover(struct ipath_devdata *dd)
+{
+       ++ipath_stats.sps_txeparity;
+       dev_info(&dd->pcidev->dev,
+               "Recovering from TXE PIO parity error\n");
+}
+
+
+/**
+ * ipath_ht_handle_hwerrors - display hardware errors.
+ * @dd: the infinipath device
+ * @msg: the output buffer
+ * @msgl: the size of the output buffer
+ *
+ * Most hardware errors are catastrophic, but for right now we'll
+ * print them and continue.  We reuse the same message buffer as
+ * ipath_handle_errors() to avoid excessive stack usage.
+ */
+static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg,
+                                    size_t msgl)
+{
+       ipath_err_t hwerrs;
+       u32 bits, ctrl;
+       int isfatal = 0;
+       char bitsmsg[64];
+       int log_idx;
+
+       hwerrs = ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus);
+
+       if (!hwerrs) {
+               ipath_cdbg(VERBOSE, "Called but no hardware errors set\n");
+               /*
+                * Better to bail out than print confusing messages.
+                * This seems to be related to clearing the crc error,
+                * or the pll error, during init.
+                */
+               goto bail;
+       } else if (hwerrs == -1LL) {
+               ipath_dev_err(dd, "Read of hardware error status failed "
+                             "(all bits set); ignoring\n");
+               goto bail;
+       }
+       ipath_stats.sps_hwerrs++;
+
+       /* Always clear the error status register, except MEMBISTFAIL,
+        * regardless of whether we continue or stop using the chip.
+        * We want that set so we know it failed, even across driver reload.
+        * We'll still ignore it in the hwerrmask.  We do this partly for
+        * diagnostics, but also for support */
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
+                        hwerrs&~INFINIPATH_HWE_MEMBISTFAILED);
+
+       hwerrs &= dd->ipath_hwerrmask;
+
+       /* We log some errors to EEPROM, check if we have any of those. */
+       for (log_idx = 0; log_idx < IPATH_EEP_LOG_CNT; ++log_idx)
+               if (hwerrs & dd->ipath_eep_st_masks[log_idx].hwerrs_to_log)
+                       ipath_inc_eeprom_err(dd, log_idx, 1);
+
+       /*
+        * Make sure we get this much out, unless told to be quiet,
+        * unless it's a parity error we may recover from, or unless
+        * it has occurred within the last 5 seconds.
+        */
+       if ((hwerrs & ~(dd->ipath_lasthwerror | TXE_PIO_PARITY |
+               RXE_EAGER_PARITY)) ||
+               (ipath_debug & __IPATH_VERBDBG))
+               dev_info(&dd->pcidev->dev, "Hardware error: hwerr=0x%llx "
+                        "(cleared)\n", (unsigned long long) hwerrs);
+       dd->ipath_lasthwerror |= hwerrs;
+
+       if (hwerrs & ~dd->ipath_hwe_bitsextant)
+               ipath_dev_err(dd, "hwerror interrupt with unknown errors "
+                             "%llx set\n", (unsigned long long)
+                             (hwerrs & ~dd->ipath_hwe_bitsextant));
+
+       ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control);
+       if ((ctrl & INFINIPATH_C_FREEZEMODE) && !ipath_diag_inuse) {
+               /*
+                * parity errors in send memory are recoverable,
+                * just cancel the send (if indicated in sendbuffererror),
+                * count the occurrence, unfreeze (if no other handled
+                * hardware error bits are set), and continue. They can
+                * occur if a processor speculative read is done to the PIO
+                * buffer while we are sending a packet, for example.
+                */
+               if (hwerrs & TXE_PIO_PARITY) {
+                       ipath_ht_txe_recover(dd);
+                       hwerrs &= ~TXE_PIO_PARITY;
+               }
+
+               if (!hwerrs) {
+                       ipath_dbg("Clearing freezemode on ignored or "
+                                 "recovered hardware error\n");
+                       ipath_clear_freeze(dd);
+               }
+       }
+
+       *msg = '\0';
+
+       /*
+        * We may someday want to decode which bits correspond to
+        * which functional area, for parity errors, etc.
+        */
+       if (hwerrs & (infinipath_hwe_htcmemparityerr_mask
+                     << INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT)) {
+               bits = (u32) ((hwerrs >>
+                              INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT) &
+                             INFINIPATH_HWE_HTCMEMPARITYERR_MASK);
+               snprintf(bitsmsg, sizeof bitsmsg, "[HTC Parity Errs %x] ",
+                        bits);
+               strlcat(msg, bitsmsg, msgl);
+       }
+
+       ipath_format_hwerrors(hwerrs,
+                             ipath_6110_hwerror_msgs,
+                             ARRAY_SIZE(ipath_6110_hwerror_msgs),
+                             msg, msgl);
+
+       if (hwerrs & (_IPATH_HTLINK0_CRCBITS | _IPATH_HTLINK1_CRCBITS))
+               hwerr_crcbits(dd, hwerrs, msg, msgl);
+
+       if (hwerrs & INFINIPATH_HWE_MEMBISTFAILED) {
+               strlcat(msg, "[Memory BIST test failed, InfiniPath hardware unusable]",
+                       msgl);
+               /* ignore from now on, so disable until driver reloaded */
+               dd->ipath_hwerrmask &= ~INFINIPATH_HWE_MEMBISTFAILED;
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
+                                dd->ipath_hwerrmask);
+       }
+#define _IPATH_PLL_FAIL (INFINIPATH_HWE_COREPLL_FBSLIP |       \
+                        INFINIPATH_HWE_COREPLL_RFSLIP |        \
+                        INFINIPATH_HWE_HTBPLL_FBSLIP |         \
+                        INFINIPATH_HWE_HTBPLL_RFSLIP |         \
+                        INFINIPATH_HWE_HTAPLL_FBSLIP |         \
+                        INFINIPATH_HWE_HTAPLL_RFSLIP)
+
+       if (hwerrs & _IPATH_PLL_FAIL) {
+               snprintf(bitsmsg, sizeof bitsmsg,
+                        "[PLL failed (%llx), InfiniPath hardware unusable]",
+                        (unsigned long long) (hwerrs & _IPATH_PLL_FAIL));
+               strlcat(msg, bitsmsg, msgl);
+               /* ignore from now on, so disable until driver reloaded */
+               dd->ipath_hwerrmask &= ~(hwerrs & _IPATH_PLL_FAIL);
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
+                                dd->ipath_hwerrmask);
+       }
+
+       if (hwerrs & INFINIPATH_HWE_SERDESPLLFAILED) {
+               /*
+                * If it occurs, it is left masked since the external
+                * interface is unused
+                */
+               dd->ipath_hwerrmask &= ~INFINIPATH_HWE_SERDESPLLFAILED;
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
+                                dd->ipath_hwerrmask);
+       }
+
+       if (hwerrs) {
+               /*
+                * If any bits are set that we aren't ignoring, only
+                * make the complaint once, in case it's stuck or
+                * recurring and we get here multiple times.  Force
+                * the link down, so the switch knows and the LEDs
+                * are turned off.
+               if (dd->ipath_flags & IPATH_INITTED) {
+                       ipath_set_linkstate(dd, IPATH_IB_LINKDOWN);
+                       ipath_setup_ht_setextled(dd,
+                               INFINIPATH_IBCS_L_STATE_DOWN,
+                               INFINIPATH_IBCS_LT_STATE_DISABLED);
+                       ipath_dev_err(dd, "Fatal Hardware Error (freeze "
+                                         "mode), no longer usable, SN %.16s\n",
+                                         dd->ipath_serial);
+                       isfatal = 1;
+               }
+               *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY;
+               /* mark as having had error */
+               *dd->ipath_statusp |= IPATH_STATUS_HWERROR;
+               /*
+                * mark as not usable, at a minimum until driver
+                * is reloaded, probably until reboot, since no
+                * other reset is possible.
+                */
+               dd->ipath_flags &= ~IPATH_INITTED;
+       }
+       else
+               *msg = 0; /* recovered from all of them */
+       if (*msg)
+               ipath_dev_err(dd, "%s hardware error\n", msg);
+       if (isfatal && !ipath_diag_inuse && dd->ipath_freezemsg)
+               /*
+                * for status file; if no trailing brace is copied,
+                * we'll know it was truncated.
+                */
+               snprintf(dd->ipath_freezemsg,
+                        dd->ipath_freezelen, "{%s}", msg);
+
+bail:;
+}
+
+/**
+ * ipath_ht_boardname - fill in the board name
+ * @dd: the infinipath device
+ * @name: the output buffer
+ * @namelen: the size of the output buffer
+ *
+ * fill in the board name, based on the board revision register
+ */
+static int ipath_ht_boardname(struct ipath_devdata *dd, char *name,
+                             size_t namelen)
+{
+       char *n = NULL;
+       u8 boardrev = dd->ipath_boardrev;
+       int ret = 0;
+
+       switch (boardrev) {
+       case 5:
+               /*
+                * original production board; two production levels, with
+                * different serial number ranges.   See ipath_ht_early_init() for
+                * case where we enable IPATH_GPIO_INTR for later serial # range.
+                * Original 112* serial number is no longer supported.
+                */
+               n = "InfiniPath_QHT7040";
+               break;
+       case 7:
+               /* small form factor production board */
+               n = "InfiniPath_QHT7140";
+               break;
+       default:                /* don't know, just print the number */
+               ipath_dev_err(dd, "Don't yet know about board "
+                             "with ID %u\n", boardrev);
+               snprintf(name, namelen, "Unknown_InfiniPath_QHT7xxx_%u",
+                        boardrev);
+               break;
+       }
+       if (n)
+               snprintf(name, namelen, "%s", n);
+
+       if (ret) {
+               ipath_dev_err(dd, "Unsupported InfiniPath board %s!\n", name);
+               goto bail;
+       }
+       if (dd->ipath_majrev != 3 || (dd->ipath_minrev < 2 ||
+               dd->ipath_minrev > 4)) {
+               /*
+                * This version of the driver only supports Rev 3.2 - 3.4
+                */
+               ipath_dev_err(dd,
+                             "Unsupported InfiniPath hardware revision %u.%u!\n",
+                             dd->ipath_majrev, dd->ipath_minrev);
+               ret = 1;
+               goto bail;
+       }
+       /*
+        * pkt/word counters are 32 bit, and therefore wrap fast enough
+        * that we snapshot them from a timer, and maintain 64 bit shadow
+        * copies
+        */
+       dd->ipath_flags |= IPATH_32BITCOUNTERS;
+       dd->ipath_flags |= IPATH_GPIO_INTR;
+       if (dd->ipath_lbus_speed != 800)
+               ipath_dev_err(dd,
+                             "Incorrectly configured for HT @ %uMHz\n",
+                             dd->ipath_lbus_speed);
+
+       /*
+        * set here, not in ipath_init_*_funcs because we have to do
+        * it after we can read chip registers.
+        */
+       dd->ipath_ureg_align =
+               ipath_read_kreg32(dd, dd->ipath_kregs->kr_pagealign);
+
+bail:
+       return ret;
+}
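The comment above about 32-bit packet/word counters describes the usual shadow-counter technique: sample the hardware counter from a timer often enough that it cannot wrap more than once between samples, and fold each sample into a 64-bit software copy.  A generic sketch of that technique (the helper name and calling convention are illustrative, not the driver's actual timer code):

#include <stdint.h>

/* Illustrative only: maintain a 64-bit shadow of a 32-bit counter that
 * wraps.  Must be called at least once per wrap period of the counter. */
static uint64_t shadow_update(uint64_t shadow, uint32_t hwval)
{
	uint32_t last = (uint32_t)shadow;	/* low 32 bits from the last sample */

	if (hwval < last)			/* the counter wrapped since then */
		shadow += UINT64_C(1) << 32;
	return (shadow & ~(uint64_t)0xffffffff) | hwval;
}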
+
+static void ipath_check_htlink(struct ipath_devdata *dd)
+{
+       u8 linkerr, link_off, i;
+
+       for (i = 0; i < 2; i++) {
+               link_off = dd->ipath_ht_slave_off + i * 4 + 0xd;
+               if (pci_read_config_byte(dd->pcidev, link_off, &linkerr))
+                       dev_info(&dd->pcidev->dev, "Couldn't read "
+                                "linkerror%d of HT slave/primary block\n",
+                                i);
+               else if (linkerr & 0xf0) {
+                       ipath_cdbg(VERBOSE, "HT linkerr%d bits 0x%x set, "
+                                  "clearing\n", i, linkerr >> 4);
+                       /*
+                        * writing the linkerr bits that are set should
+                        * clear them
+                        */
+                       if (pci_write_config_byte(dd->pcidev, link_off,
+                                                 linkerr))
+                               ipath_dbg("Failed write to clear HT "
+                                         "linkerror%d\n", i);
+                       if (pci_read_config_byte(dd->pcidev, link_off,
+                                                &linkerr))
+                               dev_info(&dd->pcidev->dev,
+                                        "Couldn't reread linkerror%d of "
+                                        "HT slave/primary block\n", i);
+                       else if (linkerr & 0xf0)
+                               dev_info(&dd->pcidev->dev,
+                                        "HT linkerror%d bits 0x%x "
+                                        "couldn't be cleared\n",
+                                        i, linkerr >> 4);
+               }
+       }
+}
+
+static int ipath_setup_ht_reset(struct ipath_devdata *dd)
+{
+       ipath_dbg("No reset possible for this InfiniPath hardware\n");
+       return 0;
+}
+
+#define HT_INTR_DISC_CONFIG  0x80      /* HT interrupt and discovery cap */
+#define HT_INTR_REG_INDEX    2 /* intconfig requires indirect accesses */
+
+/*
+ * Bits 13-15 of command==0 is slave/primary block.  Clear any HT CRC
+ * errors.  We only bother to do this at load time, because it's OK if
+ * it happened before we were loaded (first time after boot/reset),
+ * but any time after that, it's fatal anyway.  Also need to not check
+ * for upper byte errors if we are in 8 bit mode, so figure out
+ * our width.  For now, at least, also complain if it's 8 bit.
+ */
+static void slave_or_pri_blk(struct ipath_devdata *dd, struct pci_dev *pdev,
+                            int pos, u8 cap_type)
+{
+       u8 linkwidth = 0, linkerr, link_a_b_off, link_off;
+       u16 linkctrl = 0;
+       int i;
+
+       dd->ipath_ht_slave_off = pos;
+       /* command word, master_host bit */
+       /* master host || slave */
+       if ((cap_type >> 2) & 1)
+               link_a_b_off = 4;
+       else
+               link_a_b_off = 0;
+       ipath_cdbg(VERBOSE, "HT%u (Link %c) connected to processor\n",
+                  link_a_b_off ? 1 : 0,
+                  link_a_b_off ? 'B' : 'A');
+
+       link_a_b_off += pos;
+
+       /*
+        * check both link control registers; clear both HT CRC sets if
+        * necessary.
+        */
+       for (i = 0; i < 2; i++) {
+               link_off = pos + i * 4 + 0x4;
+               if (pci_read_config_word(pdev, link_off, &linkctrl))
+                       ipath_dev_err(dd, "Couldn't read HT link control%d "
+                                     "register\n", i);
+               else if (linkctrl & (0xf << 8)) {
+                       ipath_cdbg(VERBOSE, "Clear linkctrl%d CRC Error "
+                                  "bits %x\n", i, linkctrl & (0xf << 8));
+                       /*
+                        * now write them back to clear the error.
+                        */
+                       pci_write_config_word(pdev, link_off,
+                                             linkctrl & (0xf << 8));
+               }
+       }
+
+       /*
+        * As with HT CRC bits, same for protocol errors that might occur
+        * during boot.
+        */
+       for (i = 0; i < 2; i++) {
+               link_off = pos + i * 4 + 0xd;
+               if (pci_read_config_byte(pdev, link_off, &linkerr))
+                       dev_info(&pdev->dev, "Couldn't read linkerror%d "
+                                "of HT slave/primary block\n", i);
+               else if (linkerr & 0xf0) {
+                       ipath_cdbg(VERBOSE, "HT linkerr%d bits 0x%x set, "
+                                  "clearing\n", i, linkerr >> 4);
+                       /*
+                        * writing the linkerr bits that are set will clear
+                        * them
+                        */
+                       if (pci_write_config_byte
+                           (pdev, link_off, linkerr))
+                               ipath_dbg("Failed write to clear HT "
+                                         "linkerror%d\n", i);
+                       if (pci_read_config_byte(pdev, link_off, &linkerr))
+                               dev_info(&pdev->dev, "Couldn't reread "
+                                        "linkerror%d of HT slave/primary "
+                                        "block\n", i);
+                       else if (linkerr & 0xf0)
+                               dev_info(&pdev->dev, "HT linkerror%d bits "
+                                        "0x%x couldn't be cleared\n",
+                                        i, linkerr >> 4);
+               }
+       }
+
+       /*
+        * this is just for our link to the host, not devices connected
+        * through tunnel.
+        */
+
+       if (pci_read_config_byte(pdev, link_a_b_off + 7, &linkwidth))
+               ipath_dev_err(dd, "Couldn't read HT link width "
+                             "config register\n");
+       else {
+               u32 width;
+               switch (linkwidth & 7) {
+               case 5:
+                       width = 4;
+                       break;
+               case 4:
+                       width = 2;
+                       break;
+               case 3:
+                       width = 32;
+                       break;
+               case 1:
+                       width = 16;
+                       break;
+               case 0:
+               default:        /* if wrong, assume 8 bit */
+                       width = 8;
+                       break;
+               }
+
+               dd->ipath_lbus_width = width;
+
+               if (linkwidth != 0x11) {
+                       ipath_dev_err(dd, "Not configured for 16 bit HT "
+                                     "(%x)\n", linkwidth);
+                       if (!(linkwidth & 0xf)) {
+                               ipath_dbg("Will ignore HT lane1 errors\n");
+                               dd->ipath_flags |= IPATH_8BIT_IN_HT0;
+                       }
+               }
+       }
+
+       /*
+        * this is just for our link to the host, not devices connected
+        * through tunnel.
+        */
+       if (pci_read_config_byte(pdev, link_a_b_off + 0xd, &linkwidth))
+               ipath_dev_err(dd, "Couldn't read HT link frequency "
+                             "config register\n");
+       else {
+               u32 speed;
+               switch (linkwidth & 0xf) {
+               case 6:
+                       speed = 1000;
+                       break;
+               case 5:
+                       speed = 800;
+                       break;
+               case 4:
+                       speed = 600;
+                       break;
+               case 3:
+                       speed = 500;
+                       break;
+               case 2:
+                       speed = 400;
+                       break;
+               case 1:
+                       speed = 300;
+                       break;
+               default:
+                       /*
+                        * assume reserved and vendor-specific are 200...
+                        */
+               case 0:
+                       speed = 200;
+                       break;
+               }
+               dd->ipath_lbus_speed = speed;
+       }
+
+       snprintf(dd->ipath_lbus_info, sizeof(dd->ipath_lbus_info),
+               "HyperTransport,%uMHz,x%u\n",
+               dd->ipath_lbus_speed,
+               dd->ipath_lbus_width);
+}
+
+static int ipath_ht_intconfig(struct ipath_devdata *dd)
+{
+       int ret;
+
+       if (dd->ipath_intconfig) {
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_interruptconfig,
+                                dd->ipath_intconfig);  /* interrupt address */
+               ret = 0;
+       } else {
+               ipath_dev_err(dd, "No interrupts enabled, couldn't setup "
+                             "interrupt address\n");
+               ret = -EINVAL;
+       }
+
+       return ret;
+}
+
+static void ipath_ht_irq_update(struct pci_dev *dev, int irq,
+                               struct ht_irq_msg *msg)
+{
+       struct ipath_devdata *dd = pci_get_drvdata(dev);
+       u64 prev_intconfig = dd->ipath_intconfig;
+
+       dd->ipath_intconfig = msg->address_lo;
+       dd->ipath_intconfig |= ((u64) msg->address_hi) << 32;
+
+       /*
+        * If the previous value of dd->ipath_intconfig is zero, we're
+        * getting configured for the first time, and must not program the
+        * intconfig register here (it will be programmed later, when the
+        * hardware is ready).  Otherwise, we should.
+        */
+       if (prev_intconfig)
+               ipath_ht_intconfig(dd);
+}
+
+/**
+ * ipath_setup_ht_config - setup the interruptconfig register
+ * @dd: the infinipath device
+ * @pdev: the PCI device
+ *
+ * setup the interruptconfig register from the HT config info.
+ * Also clear CRC errors in HT linkcontrol, if necessary.
+ * This is done only for the real hardware.  It is done before
+ * chip address space is initted, so can't touch infinipath registers
+ */
+static int ipath_setup_ht_config(struct ipath_devdata *dd,
+                                struct pci_dev *pdev)
+{
+       int pos, ret;
+
+       ret = __ht_create_irq(pdev, 0, ipath_ht_irq_update);
+       if (ret < 0) {
+               ipath_dev_err(dd, "Couldn't create interrupt handler: "
+                             "err %d\n", ret);
+               goto bail;
+       }
+       dd->ipath_irq = ret;
+       ret = 0;
+
+       /*
+        * Handle clearing CRC errors in linkctrl register if necessary.  We
+        * do this early, before we ever enable errors or hardware errors,
+        * mostly to avoid causing the chip to enter freeze mode.
+        */
+       pos = pci_find_capability(pdev, PCI_CAP_ID_HT);
+       if (!pos) {
+               ipath_dev_err(dd, "Couldn't find HyperTransport "
+                             "capability; no interrupts\n");
+               ret = -ENODEV;
+               goto bail;
+       }
+       do {
+               u8 cap_type;
+
+               /*
+                * The HT capability type byte is 3 bytes after the
+                * capability byte.
+                */
+               if (pci_read_config_byte(pdev, pos + 3, &cap_type)) {
+                       dev_info(&pdev->dev, "Couldn't read config "
+                                "command @ %d\n", pos);
+                       continue;
+               }
+               if (!(cap_type & 0xE0))
+                       slave_or_pri_blk(dd, pdev, pos, cap_type);
+       } while ((pos = pci_find_next_capability(pdev, pos,
+                                                PCI_CAP_ID_HT)));
+
+       dd->ipath_flags |= IPATH_SWAP_PIOBUFS;
+
+bail:
+       return ret;
+}
+
+/**
+ * ipath_setup_ht_cleanup - clean up any per-chip chip-specific stuff
+ * @dd: the infinipath device
+ *
+ * Called during driver unload.
+ * This is currently a nop for the HT chip, though not for all chips.
+ */
+static void ipath_setup_ht_cleanup(struct ipath_devdata *dd)
+{
+}
+
+/**
+ * ipath_setup_ht_setextled - set the state of the two external LEDs
+ * @dd: the infinipath device
+ * @lst: the L state
+ * @ltst: the LT state
+ *
+ * Set the state of the two external LEDs, to indicate physical and
+ * logical state of IB link.   For this chip (at least with recommended
+ * board pinouts), LED1 is Green (physical state), and LED2 is Yellow
+ * (logical state)
+ *
+ * Note:  We try to match the Mellanox HCA LED behavior as best
+ * we can.  Green indicates physical link state is OK (something is
+ * plugged in, and we can train).
+ * Amber indicates the link is logically up (ACTIVE).
+ * Mellanox further blinks the amber LED to indicate data packet
+ * activity, but we have no hardware support for that, so it would
+ * require waking up every 10-20 msecs and checking the counters
+ * on the chip, and then turning the LED off if appropriate.  That's
+ * visible overhead, so not something we will do.
+ *
+ */
+static void ipath_setup_ht_setextled(struct ipath_devdata *dd,
+                                    u64 lst, u64 ltst)
+{
+       u64 extctl;
+       unsigned long flags = 0;
+
+       /* the diags use the LED to indicate diag info, so we leave
+        * the external LED alone when the diags are running */
+       if (ipath_diag_inuse)
+               return;
+
+       /* Allow override of LED display, e.g. for locating the system in a rack */
+       if (dd->ipath_led_override) {
+               ltst = (dd->ipath_led_override & IPATH_LED_PHYS)
+                       ? INFINIPATH_IBCS_LT_STATE_LINKUP
+                       : INFINIPATH_IBCS_LT_STATE_DISABLED;
+               lst = (dd->ipath_led_override & IPATH_LED_LOG)
+                       ? INFINIPATH_IBCS_L_STATE_ACTIVE
+                       : INFINIPATH_IBCS_L_STATE_DOWN;
+       }
+
+       spin_lock_irqsave(&dd->ipath_gpio_lock, flags);
+       /*
+        * start by setting both LED control bits to off, then turn
+        * on the appropriate bit(s).
+        */
+       if (dd->ipath_boardrev == 8) { /* LS/X-1 uses different pins */
+               /*
+                * The major difference is that INFINIPATH_EXTC_LEDGBLERR_OFF
+                * is inverted, because it is normally used to indicate
+                * a hardware fault at reset, if there were errors.
+                */
+               extctl = (dd->ipath_extctrl & ~INFINIPATH_EXTC_LEDGBLOK_ON)
+                       | INFINIPATH_EXTC_LEDGBLERR_OFF;
+               if (ltst == INFINIPATH_IBCS_LT_STATE_LINKUP)
+                       extctl &= ~INFINIPATH_EXTC_LEDGBLERR_OFF;
+               if (lst == INFINIPATH_IBCS_L_STATE_ACTIVE)
+                       extctl |= INFINIPATH_EXTC_LEDGBLOK_ON;
+       }
+       else {
+               extctl = dd->ipath_extctrl &
+                       ~(INFINIPATH_EXTC_LED1PRIPORT_ON |
+                         INFINIPATH_EXTC_LED2PRIPORT_ON);
+               if (ltst == INFINIPATH_IBCS_LT_STATE_LINKUP)
+                       extctl |= INFINIPATH_EXTC_LED1PRIPORT_ON;
+               if (lst == INFINIPATH_IBCS_L_STATE_ACTIVE)
+                       extctl |= INFINIPATH_EXTC_LED2PRIPORT_ON;
+       }
+       dd->ipath_extctrl = extctl;
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, extctl);
+       spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags);
+}
+
+static void ipath_init_ht_variables(struct ipath_devdata *dd)
+{
+       /*
+        * setup the register offsets, since they are different for each
+        * chip
+        */
+       dd->ipath_kregs = &ipath_ht_kregs;
+       dd->ipath_cregs = &ipath_ht_cregs;
+
+       dd->ipath_gpio_sda_num = _IPATH_GPIO_SDA_NUM;
+       dd->ipath_gpio_scl_num = _IPATH_GPIO_SCL_NUM;
+       dd->ipath_gpio_sda = IPATH_GPIO_SDA;
+       dd->ipath_gpio_scl = IPATH_GPIO_SCL;
+
+       /*
+        * Fill in data for field-values that change in newer chips.
+        * We dynamically specify only the mask for LINKTRAININGSTATE
+        * and only the shift for LINKSTATE, as they are the only ones
+        * that change.  Also precalculate the 3 link states of interest
+        * and the combined mask.
+        */
+       dd->ibcs_ls_shift = IBA6110_IBCS_LINKSTATE_SHIFT;
+       dd->ibcs_lts_mask = IBA6110_IBCS_LINKTRAININGSTATE_MASK;
+       dd->ibcs_mask = (INFINIPATH_IBCS_LINKSTATE_MASK <<
+               dd->ibcs_ls_shift) | dd->ibcs_lts_mask;
+       dd->ib_init = (INFINIPATH_IBCS_LT_STATE_LINKUP <<
+               INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) |
+               (INFINIPATH_IBCS_L_STATE_INIT << dd->ibcs_ls_shift);
+       dd->ib_arm = (INFINIPATH_IBCS_LT_STATE_LINKUP <<
+               INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) |
+               (INFINIPATH_IBCS_L_STATE_ARM << dd->ibcs_ls_shift);
+       dd->ib_active = (INFINIPATH_IBCS_LT_STATE_LINKUP <<
+               INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) |
+               (INFINIPATH_IBCS_L_STATE_ACTIVE << dd->ibcs_ls_shift);
+
+       /*
+        * Fill in data for ibcc field-values that change in newer chips.
+        * We dynamically specify only the mask for LINKINITCMD
+        * and only the shift for LINKCMD and MAXPKTLEN, as they are
+        * the only ones that change.
+        */
+       dd->ibcc_lic_mask = INFINIPATH_IBCC_LINKINITCMD_MASK;
+       dd->ibcc_lc_shift = INFINIPATH_IBCC_LINKCMD_SHIFT;
+       dd->ibcc_mpl_shift = INFINIPATH_IBCC_MAXPKTLEN_SHIFT;
+
+       /* Fill in shifts for RcvCtrl. */
+       dd->ipath_r_portenable_shift = INFINIPATH_R_PORTENABLE_SHIFT;
+       dd->ipath_r_intravail_shift = INFINIPATH_R_INTRAVAIL_SHIFT;
+       dd->ipath_r_tailupd_shift = INFINIPATH_R_TAILUPD_SHIFT;
+       dd->ipath_r_portcfg_shift = 0; /* Not on IBA6110 */
+
+       dd->ipath_i_bitsextant =
+               (INFINIPATH_I_RCVURG_MASK << INFINIPATH_I_RCVURG_SHIFT) |
+               (INFINIPATH_I_RCVAVAIL_MASK <<
+                INFINIPATH_I_RCVAVAIL_SHIFT) |
+               INFINIPATH_I_ERROR | INFINIPATH_I_SPIOSENT |
+               INFINIPATH_I_SPIOBUFAVAIL | INFINIPATH_I_GPIO;
+
+       dd->ipath_e_bitsextant =
+               INFINIPATH_E_RFORMATERR | INFINIPATH_E_RVCRC |
+               INFINIPATH_E_RICRC | INFINIPATH_E_RMINPKTLEN |
+               INFINIPATH_E_RMAXPKTLEN | INFINIPATH_E_RLONGPKTLEN |
+               INFINIPATH_E_RSHORTPKTLEN | INFINIPATH_E_RUNEXPCHAR |
+               INFINIPATH_E_RUNSUPVL | INFINIPATH_E_REBP |
+               INFINIPATH_E_RIBFLOW | INFINIPATH_E_RBADVERSION |
+               INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL |
+               INFINIPATH_E_RBADTID | INFINIPATH_E_RHDRLEN |
+               INFINIPATH_E_RHDR | INFINIPATH_E_RIBLOSTLINK |
+               INFINIPATH_E_SMINPKTLEN | INFINIPATH_E_SMAXPKTLEN |
+               INFINIPATH_E_SUNDERRUN | INFINIPATH_E_SPKTLEN |
+               INFINIPATH_E_SDROPPEDSMPPKT | INFINIPATH_E_SDROPPEDDATAPKT |
+               INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SUNEXPERRPKTNUM |
+               INFINIPATH_E_SUNSUPVL | INFINIPATH_E_IBSTATUSCHANGED |
+               INFINIPATH_E_INVALIDADDR | INFINIPATH_E_RESET |
+               INFINIPATH_E_HARDWARE;
+
+       dd->ipath_hwe_bitsextant =
+               (INFINIPATH_HWE_HTCMEMPARITYERR_MASK <<
+                INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT) |
+               (INFINIPATH_HWE_TXEMEMPARITYERR_MASK <<
+                INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) |
+               (INFINIPATH_HWE_RXEMEMPARITYERR_MASK <<
+                INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) |
+               INFINIPATH_HWE_HTCLNKABYTE0CRCERR |
+               INFINIPATH_HWE_HTCLNKABYTE1CRCERR |
+               INFINIPATH_HWE_HTCLNKBBYTE0CRCERR |
+               INFINIPATH_HWE_HTCLNKBBYTE1CRCERR |
+               INFINIPATH_HWE_HTCMISCERR4 |
+               INFINIPATH_HWE_HTCMISCERR5 | INFINIPATH_HWE_HTCMISCERR6 |
+               INFINIPATH_HWE_HTCMISCERR7 |
+               INFINIPATH_HWE_HTCBUSTREQPARITYERR |
+               INFINIPATH_HWE_HTCBUSTRESPPARITYERR |
+               INFINIPATH_HWE_HTCBUSIREQPARITYERR |
+               INFINIPATH_HWE_RXDSYNCMEMPARITYERR |
+               INFINIPATH_HWE_MEMBISTFAILED |
+               INFINIPATH_HWE_COREPLL_FBSLIP |
+               INFINIPATH_HWE_COREPLL_RFSLIP |
+               INFINIPATH_HWE_HTBPLL_FBSLIP |
+               INFINIPATH_HWE_HTBPLL_RFSLIP |
+               INFINIPATH_HWE_HTAPLL_FBSLIP |
+               INFINIPATH_HWE_HTAPLL_RFSLIP |
+               INFINIPATH_HWE_SERDESPLLFAILED |
+               INFINIPATH_HWE_IBCBUSTOSPCPARITYERR |
+               INFINIPATH_HWE_IBCBUSFRSPCPARITYERR;
+
+       dd->ipath_i_rcvavail_mask = INFINIPATH_I_RCVAVAIL_MASK;
+       dd->ipath_i_rcvurg_mask = INFINIPATH_I_RCVURG_MASK;
+       dd->ipath_i_rcvavail_shift = INFINIPATH_I_RCVAVAIL_SHIFT;
+       dd->ipath_i_rcvurg_shift = INFINIPATH_I_RCVURG_SHIFT;
+
+       /*
+        * EEPROM error log 0 is TXE Parity errors. 1 is RXE Parity.
+        * 2 is Some Misc, 3 is reserved for future.
+        */
+       dd->ipath_eep_st_masks[0].hwerrs_to_log =
+               INFINIPATH_HWE_TXEMEMPARITYERR_MASK <<
+               INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT;
+
+       dd->ipath_eep_st_masks[1].hwerrs_to_log =
+               INFINIPATH_HWE_RXEMEMPARITYERR_MASK <<
+               INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT;
+
+       dd->ipath_eep_st_masks[2].errs_to_log = INFINIPATH_E_RESET;
+
+       dd->delay_mult = 2; /* SDR, 4X, can't change */
+
+       dd->ipath_link_width_supported = IB_WIDTH_1X | IB_WIDTH_4X;
+       dd->ipath_link_speed_supported = IPATH_IB_SDR;
+       dd->ipath_link_width_enabled = IB_WIDTH_4X;
+       dd->ipath_link_speed_enabled = dd->ipath_link_speed_supported;
+       /* these can't change for this chip, so set once */
+       dd->ipath_link_width_active = dd->ipath_link_width_enabled;
+       dd->ipath_link_speed_active = dd->ipath_link_speed_enabled;
+}
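The precomputed ib_init/ib_arm/ib_active values, together with ibcs_mask, are set up so that a link-state test can be a single masked compare against the IBCStatus register.  The call sites are not in this excerpt, so the helper below is only a sketch of how such a check would look:

/* Illustrative only: true if IBCStatus shows the link trained up and ACTIVE. */
static int ib_link_is_active(const struct ipath_devdata *dd, u64 ibcstatus)
{
	return (ibcstatus & dd->ibcs_mask) == dd->ib_active;
}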
+
+/**
+ * ipath_ht_init_hwerrors - enable hardware errors
+ * @dd: the infinipath device
+ *
+ * now that we have finished initializing everything that might reasonably
+ * cause a hardware error, and cleared those error bits as they occur,
+ * we can enable hardware errors in the mask (potentially enabling
+ * freeze mode), and enable hardware errors as errors (along with
+ * everything else) in errormask
+ */
+static void ipath_ht_init_hwerrors(struct ipath_devdata *dd)
+{
+       ipath_err_t val;
+       u64 extsval;
+
+       extsval = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extstatus);
+
+       if (!(extsval & INFINIPATH_EXTS_MEMBIST_ENDTEST))
+               ipath_dev_err(dd, "MemBIST did not complete!\n");
+       if (extsval & INFINIPATH_EXTS_MEMBIST_CORRECT)
+               ipath_dbg("MemBIST corrected\n");
+
+       ipath_check_htlink(dd);
+
+       /* barring bugs, all hwerrors become interrupts, which can */
+       val = -1LL;
+       /* don't look at crc lane1 if 8 bit */
+       if (dd->ipath_flags & IPATH_8BIT_IN_HT0)
+               val &= ~infinipath_hwe_htclnkabyte1crcerr;
+       /* don't look at crc lane1 if 8 bit */
+       if (dd->ipath_flags & IPATH_8BIT_IN_HT1)
+               val &= ~infinipath_hwe_htclnkbbyte1crcerr;
+
+       /*
+        * disable RXDSYNCMEMPARITY because external serdes is unused,
+        * and therefore the logic will never be used or initialized,
+        * and uninitialized state will normally result in this error
+        * being asserted.  Similarly for the external serdes pll
+        * lock signal.
+        */
+       val &= ~(INFINIPATH_HWE_SERDESPLLFAILED |
+                INFINIPATH_HWE_RXDSYNCMEMPARITYERR);
+
+       /*
+        * Disable MISCERR4 because of an inversion in the HT core
+        * logic checking for errors that cause this bit to be set.
+        * The errata can also cause the protocol error bit to be set
+        * in the HT config space linkerror register(s).
+        */
+       val &= ~INFINIPATH_HWE_HTCMISCERR4;
+
+       /*
+        * PLL ignored because unused MDIO interface has a logic problem
+        */
+       if (dd->ipath_boardrev == 4 || dd->ipath_boardrev == 9)
+               val &= ~INFINIPATH_HWE_SERDESPLLFAILED;
+       dd->ipath_hwerrmask = val;
+}
+
+
+
+
+/**
+ * ipath_ht_bringup_serdes - bring up the serdes
+ * @dd: the infinipath device
+ */
+static int ipath_ht_bringup_serdes(struct ipath_devdata *dd)
+{
+       u64 val, config1;
+       int ret = 0, change = 0;
+
+       ipath_dbg("Trying to bringup serdes\n");
+
+       if (ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus) &
+           INFINIPATH_HWE_SERDESPLLFAILED)
+       {
+               ipath_dbg("At start, serdes PLL failed bit set in "
+                         "hwerrstatus, clearing and continuing\n");
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
+                                INFINIPATH_HWE_SERDESPLLFAILED);
+       }
+
+       val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0);
+       config1 = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig1);
+
+       ipath_cdbg(VERBOSE, "Initial serdes status is config0=%llx "
+                  "config1=%llx, sstatus=%llx xgxs %llx\n",
+                  (unsigned long long) val, (unsigned long long) config1,
+                  (unsigned long long)
+                  ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesstatus),
+                  (unsigned long long)
+                  ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig));
+
+       /* force reset on */
+       val |= INFINIPATH_SERDC0_RESET_PLL
+               /* | INFINIPATH_SERDC0_RESET_MASK */
+               ;
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, val);
+       udelay(15);             /* need pll reset set at least for a bit */
+
+       if (val & INFINIPATH_SERDC0_RESET_PLL) {
+               u64 val2 = val &= ~INFINIPATH_SERDC0_RESET_PLL;
+               /* set lane resets, and tx idle, during pll reset */
+               val2 |= INFINIPATH_SERDC0_RESET_MASK |
+                       INFINIPATH_SERDC0_TXIDLE;
+               ipath_cdbg(VERBOSE, "Clearing serdes PLL reset (writing "
+                          "%llx)\n", (unsigned long long) val2);
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0,
+                                val2);
+               /*
+                * be sure chip saw it
+                */
+               val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+               /*
+                * The pll reset needs to be clear for at least 11 usec
+                * before the lane resets are cleared; give it a few more.
+                */
+               udelay(15);
+               val = val2;     /* for check below */
+       }
+
+       if (val & (INFINIPATH_SERDC0_RESET_PLL |
+                  INFINIPATH_SERDC0_RESET_MASK |
+                  INFINIPATH_SERDC0_TXIDLE)) {
+               val &= ~(INFINIPATH_SERDC0_RESET_PLL |
+                        INFINIPATH_SERDC0_RESET_MASK |
+                        INFINIPATH_SERDC0_TXIDLE);
+               /* clear them */
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0,
+                                val);
+       }
+
+       val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig);
+       if (val & INFINIPATH_XGXS_RESET) {
+               /* normally true after boot */
+               val &= ~INFINIPATH_XGXS_RESET;
+               change = 1;
+       }
+       if (((val >> INFINIPATH_XGXS_RX_POL_SHIFT) &
+            INFINIPATH_XGXS_RX_POL_MASK) != dd->ipath_rx_pol_inv ) {
+               /* need to compensate for Tx inversion in partner */
+               val &= ~(INFINIPATH_XGXS_RX_POL_MASK <<
+                        INFINIPATH_XGXS_RX_POL_SHIFT);
+               val |= dd->ipath_rx_pol_inv <<
+                       INFINIPATH_XGXS_RX_POL_SHIFT;
+               change = 1;
+       }
+       if (change)
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, val);
+
+       val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0);
+
+       /* clear current and de-emphasis bits */
+       config1 &= ~0x0ffffffff00ULL;
+       /* set current to 20ma */
+       config1 |= 0x00000000000ULL;
+       /* set de-emphasis to -5.68dB */
+       config1 |= 0x0cccc000000ULL;
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig1, config1);
+
+       ipath_cdbg(VERBOSE, "After setup: serdes status is config0=%llx "
+                  "config1=%llx, sstatus=%llx xgxs %llx\n",
+                  (unsigned long long) val, (unsigned long long) config1,
+                  (unsigned long long)
+                  ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesstatus),
+                  (unsigned long long)
+                  ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig));
+
+       return ret;             /* for now, say we always succeeded */
+}
+
+/**
+ * ipath_ht_quiet_serdes - set serdes to txidle
+ * @dd: the infinipath device
+ * Called when the driver is being unloaded.
+ */
+static void ipath_ht_quiet_serdes(struct ipath_devdata *dd)
+{
+       u64 val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0);
+
+       val |= INFINIPATH_SERDC0_TXIDLE;
+       ipath_dbg("Setting TxIdleEn on serdes (config0 = %llx)\n",
+                 (unsigned long long) val);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, val);
+}
+
+/**
+ * ipath_ht_put_tid - write a TID to the chip
+ * @dd: the infinipath device
+ * @tidptr: pointer to the expected TID (in chip) to update
+ * @type: RCVHQ_RCV_TYPE_EAGER (1) for eager, RCVHQ_RCV_TYPE_EXPECTED (0) for expected
+ * @pa: physical address of in memory buffer; ipath_tidinvalid if freeing
+ *
+ * This exists as a separate routine to allow for special locking etc.
+ * It's used for both the full cleanup on exit, as well as the normal
+ * setup and teardown.
+ */
+static void ipath_ht_put_tid(struct ipath_devdata *dd,
+                            u64 __iomem *tidptr, u32 type,
+                            unsigned long pa)
+{
+       if (!dd->ipath_kregbase)
+               return;
+
+       if (pa != dd->ipath_tidinvalid) {
+               if (unlikely((pa & ~INFINIPATH_RT_ADDR_MASK))) {
+                       dev_info(&dd->pcidev->dev,
+                                "physaddr %lx has more than "
+                                "40 bits, using only 40!!!\n", pa);
+                       pa &= INFINIPATH_RT_ADDR_MASK;
+               }
+               if (type == RCVHQ_RCV_TYPE_EAGER)
+                       pa |= dd->ipath_tidtemplate;
+               else {
+                       /* in words (fixed, full page).  */
+                       u64 lenvalid = PAGE_SIZE >> 2;
+                       lenvalid <<= INFINIPATH_RT_BUFSIZE_SHIFT;
+                       pa |= lenvalid | INFINIPATH_RT_VALID;
+               }
+       }
+
+       writeq(pa, tidptr);
+}
+
+
+/**
+ * ipath_ht_clear_tids - clear all TID entries for a port, expected and eager
+ * @dd: the infinipath device
+ * @port: the port
+ *
+ * Used from ipath_close(), and at chip initialization.
+ */
+static void ipath_ht_clear_tids(struct ipath_devdata *dd, unsigned port)
+{
+       u64 __iomem *tidbase;
+       int i;
+
+       if (!dd->ipath_kregbase)
+               return;
+
+       ipath_cdbg(VERBOSE, "Invalidate TIDs for port %u\n", port);
+
+       /*
+        * need to invalidate all of the expected TID entries for this
+        * port, so we don't have valid entries that might somehow get
+        * used (early in next use of this port, or through some bug)
+        */
+       tidbase = (u64 __iomem *) ((char __iomem *)(dd->ipath_kregbase) +
+                                  dd->ipath_rcvtidbase +
+                                  port * dd->ipath_rcvtidcnt *
+                                  sizeof(*tidbase));
+       for (i = 0; i < dd->ipath_rcvtidcnt; i++)
+               ipath_ht_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EXPECTED,
+                                dd->ipath_tidinvalid);
+
+       tidbase = (u64 __iomem *) ((char __iomem *)(dd->ipath_kregbase) +
+                                  dd->ipath_rcvegrbase +
+                                  port * dd->ipath_rcvegrcnt *
+                                  sizeof(*tidbase));
+
+       for (i = 0; i < dd->ipath_rcvegrcnt; i++)
+               ipath_ht_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EAGER,
+                                dd->ipath_tidinvalid);
+}
+
+/**
+ * ipath_ht_tidtemplate - setup constants for TID updates
+ * @dd: the infinipath device
+ *
+ * We set up values used frequently, to avoid recalculating them each time.
+ */
+static void ipath_ht_tidtemplate(struct ipath_devdata *dd)
+{
+       dd->ipath_tidtemplate = dd->ipath_ibmaxlen >> 2;
+       dd->ipath_tidtemplate <<= INFINIPATH_RT_BUFSIZE_SHIFT;
+       dd->ipath_tidtemplate |= INFINIPATH_RT_VALID;
+
+       /*
+        * work around chip errata bug 7358, by marking invalid tids
+        * as having max length
+        */
+       dd->ipath_tidinvalid = (-1LL & INFINIPATH_RT_BUFSIZE_MASK) <<
+               INFINIPATH_RT_BUFSIZE_SHIFT;
+}
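+
+/*
+ * Illustrative sketch, not part of the original driver: with a 4 KiB
+ * page, the expected-TID word written by ipath_ht_put_tid() above is
+ * composed roughly as
+ *
+ *     lenvalid = (PAGE_SIZE >> 2) << INFINIPATH_RT_BUFSIZE_SHIFT;
+ *     tidword  = (pa & INFINIPATH_RT_ADDR_MASK) | lenvalid |
+ *                INFINIPATH_RT_VALID;
+ *
+ * i.e. a 40-bit physical address plus a buffer length in dwords and a
+ * valid bit, matching the template and invalid values set up here.
+ */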
+
+static int ipath_ht_early_init(struct ipath_devdata *dd)
+{
+       u32 __iomem *piobuf;
+       u32 pioincr, val32;
+       int i;
+
+       /*
+        * one cache line; long IB headers will spill over into received
+        * buffer
+        */
+       dd->ipath_rcvhdrentsize = 16;
+       dd->ipath_rcvhdrsize = IPATH_DFLT_RCVHDRSIZE;
+
+       /*
+        * For HT, we allocate a somewhat overly large eager buffer,
+        * such that we can guarantee that we can receive the largest
+        * packet that we can send out.  To truly support a 4KB MTU,
+        * we need to bump this to a large value.  To date, other than
+        * testing, we have never encountered an HCA that can really
+        * send 4KB MTU packets, so we do not handle that (we'll get
+        * error interrupts if we ever see one).
+        */
+       dd->ipath_rcvegrbufsize = dd->ipath_piosize2k;
+
+       /*
+        * the min() check here is currently a nop, but it may not
+        * always be, depending on just how we do ipath_rcvegrbufsize
+        */
+       dd->ipath_ibmaxlen = min(dd->ipath_piosize2k,
+                                dd->ipath_rcvegrbufsize);
+       dd->ipath_init_ibmaxlen = dd->ipath_ibmaxlen;
+       ipath_ht_tidtemplate(dd);
+
+       /*
+        * zero all the TID entries at startup.  We do this for sanity,
+        * in case of a previous driver crash of some kind, and also
+        * because the chip powers up with these memories in an unknown
+        * state.  Use portcnt, not cfgports, since this is for the
+        * full chip, not for current (possibly different) configuration
+        * value.
+        * Chip Errata bug 6447
+        */
+       for (val32 = 0; val32 < dd->ipath_portcnt; val32++)
+               ipath_ht_clear_tids(dd, val32);
+
+       /*
+        * write the pbc of each buffer, to be sure it's initialized, then
+        * cancel all the buffers, and also abort any packets that might
+        * have been in flight for some reason (the latter is for driver
+        * unload/reload, but isn't a bad idea at first init).  PIO send
+        * isn't enabled at this point, so there is no danger of sending
+        * these out on the wire.
+        * Chip Errata bug 6610
+        */
+       piobuf = (u32 __iomem *) (((char __iomem *)(dd->ipath_kregbase)) +
+                                 dd->ipath_piobufbase);
+       pioincr = dd->ipath_palign / sizeof(*piobuf);
+       for (i = 0; i < dd->ipath_piobcnt2k; i++) {
+               /*
+                * reasonable word count, just to init pbc
+                */
+               writel(16, piobuf);
+               piobuf += pioincr;
+       }
+
+       ipath_get_eeprom_info(dd);
+       if (dd->ipath_boardrev == 5) {
+               /*
+                * Later production QHT7040 has same changes as QHT7140, so
+                * can use GPIO interrupts.  They have serial #'s starting
+                * with 128, rather than 112.
+                */
+               if (dd->ipath_serial[0] == '1' &&
+                   dd->ipath_serial[1] == '2' &&
+                   dd->ipath_serial[2] == '8')
+                       dd->ipath_flags |= IPATH_GPIO_INTR;
+               else {
+                       ipath_dev_err(dd, "Unsupported InfiniPath board "
+                               "(serial number %.16s)!\n",
+                               dd->ipath_serial);
+                       return 1;
+               }
+       }
+
+       if (dd->ipath_minrev >= 4) {
+               /* Rev4+ reports extra errors via internal GPIO pins */
+               dd->ipath_flags |= IPATH_GPIO_ERRINTRS;
+               dd->ipath_gpio_mask |= IPATH_GPIO_ERRINTR_MASK;
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask,
+                                dd->ipath_gpio_mask);
+       }
+
+       return 0;
+}
+
+
+/**
+ * ipath_ht_get_base_info - set chip-specific flags for user code
+ * @pd: the infinipath port data
+ * @kbase: ipath_base_info pointer
+ *
+ * We set the HT flag because the different link characteristics of
+ * HyperTransport vs PCIe can affect some user packet algorithms.
+ */
+static int ipath_ht_get_base_info(struct ipath_portdata *pd, void *kbase)
+{
+       struct ipath_base_info *kinfo = kbase;
+
+       kinfo->spi_runtime_flags |= IPATH_RUNTIME_HT |
+               IPATH_RUNTIME_PIO_REGSWAPPED;
+
+       if (pd->port_dd->ipath_minrev < 4)
+               kinfo->spi_runtime_flags |= IPATH_RUNTIME_RCVHDR_COPY;
+
+       return 0;
+}
+
+static void ipath_ht_free_irq(struct ipath_devdata *dd)
+{
+       free_irq(dd->ipath_irq, dd);
+       ht_destroy_irq(dd->ipath_irq);
+       dd->ipath_irq = 0;
+       dd->ipath_intconfig = 0;
+}
+
+static struct ipath_message_header *
+ipath_ht_get_msgheader(struct ipath_devdata *dd, __le32 *rhf_addr)
+{
+       return (struct ipath_message_header *)
+               &rhf_addr[sizeof(u64) / sizeof(u32)];
+}
+
+static void ipath_ht_config_ports(struct ipath_devdata *dd, ushort cfgports)
+{
+       dd->ipath_portcnt =
+               ipath_read_kreg32(dd, dd->ipath_kregs->kr_portcnt);
+       dd->ipath_p0_rcvegrcnt =
+               ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrcnt);
+}
+
+static void ipath_ht_read_counters(struct ipath_devdata *dd,
+                                  struct infinipath_counters *cntrs)
+{
+       cntrs->LBIntCnt =
+               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(LBIntCnt));
+       cntrs->LBFlowStallCnt =
+               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(LBFlowStallCnt));
+       cntrs->TxSDmaDescCnt = 0;
+       cntrs->TxUnsupVLErrCnt =
+               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxUnsupVLErrCnt));
+       cntrs->TxDataPktCnt =
+               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxDataPktCnt));
+       cntrs->TxFlowPktCnt =
+               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxFlowPktCnt));
+       cntrs->TxDwordCnt =
+               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxDwordCnt));
+       cntrs->TxLenErrCnt =
+               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxLenErrCnt));
+       cntrs->TxMaxMinLenErrCnt =
+               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxMaxMinLenErrCnt));
+       cntrs->TxUnderrunCnt =
+               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxUnderrunCnt));
+       cntrs->TxFlowStallCnt =
+               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxFlowStallCnt));
+       cntrs->TxDroppedPktCnt =
+               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxDroppedPktCnt));
+       cntrs->RxDroppedPktCnt =
+               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxDroppedPktCnt));
+       cntrs->RxDataPktCnt =
+               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxDataPktCnt));
+       cntrs->RxFlowPktCnt =
+               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxFlowPktCnt));
+       cntrs->RxDwordCnt =
+               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxDwordCnt));
+       cntrs->RxLenErrCnt =
+               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxLenErrCnt));
+       cntrs->RxMaxMinLenErrCnt =
+               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxMaxMinLenErrCnt));
+       cntrs->RxICRCErrCnt =
+               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxICRCErrCnt));
+       cntrs->RxVCRCErrCnt =
+               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxVCRCErrCnt));
+       cntrs->RxFlowCtrlErrCnt =
+               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxFlowCtrlErrCnt));
+       cntrs->RxBadFormatCnt =
+               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxBadFormatCnt));
+       cntrs->RxLinkProblemCnt =
+               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxLinkProblemCnt));
+       cntrs->RxEBPCnt =
+               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxEBPCnt));
+       cntrs->RxLPCRCErrCnt =
+               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxLPCRCErrCnt));
+       cntrs->RxBufOvflCnt =
+               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxBufOvflCnt));
+       cntrs->RxTIDFullErrCnt =
+               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxTIDFullErrCnt));
+       cntrs->RxTIDValidErrCnt =
+               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxTIDValidErrCnt));
+       cntrs->RxPKeyMismatchCnt =
+               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxPKeyMismatchCnt));
+       cntrs->RxP0HdrEgrOvflCnt =
+               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP0HdrEgrOvflCnt));
+       cntrs->RxP1HdrEgrOvflCnt =
+               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP1HdrEgrOvflCnt));
+       cntrs->RxP2HdrEgrOvflCnt =
+               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP2HdrEgrOvflCnt));
+       cntrs->RxP3HdrEgrOvflCnt =
+               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP3HdrEgrOvflCnt));
+       cntrs->RxP4HdrEgrOvflCnt =
+               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP4HdrEgrOvflCnt));
+       cntrs->RxP5HdrEgrOvflCnt =
+               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP5HdrEgrOvflCnt));
+       cntrs->RxP6HdrEgrOvflCnt =
+               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP6HdrEgrOvflCnt));
+       cntrs->RxP7HdrEgrOvflCnt =
+               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP7HdrEgrOvflCnt));
+       cntrs->RxP8HdrEgrOvflCnt =
+               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP8HdrEgrOvflCnt));
+       cntrs->RxP9HdrEgrOvflCnt = 0;
+       cntrs->RxP10HdrEgrOvflCnt = 0;
+       cntrs->RxP11HdrEgrOvflCnt = 0;
+       cntrs->RxP12HdrEgrOvflCnt = 0;
+       cntrs->RxP13HdrEgrOvflCnt = 0;
+       cntrs->RxP14HdrEgrOvflCnt = 0;
+       cntrs->RxP15HdrEgrOvflCnt = 0;
+       cntrs->RxP16HdrEgrOvflCnt = 0;
+       cntrs->IBStatusChangeCnt =
+               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBStatusChangeCnt));
+       cntrs->IBLinkErrRecoveryCnt =
+               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBLinkErrRecoveryCnt));
+       cntrs->IBLinkDownedCnt =
+               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBLinkDownedCnt));
+       cntrs->IBSymbolErrCnt =
+               ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBSymbolErrCnt));
+       cntrs->RxVL15DroppedPktCnt = 0;
+       cntrs->RxOtherLocalPhyErrCnt = 0;
+       cntrs->PcieRetryBufDiagQwordCnt = 0;
+       cntrs->ExcessBufferOvflCnt = dd->ipath_overrun_thresh_errs;
+       cntrs->LocalLinkIntegrityErrCnt =
+               (dd->ipath_flags & IPATH_GPIO_ERRINTRS) ?
+               dd->ipath_lli_errs : dd->ipath_lli_errors;
+       cntrs->RxVlErrCnt = 0;
+       cntrs->RxDlidFltrCnt = 0;
+}
+
+
+/* no interrupt fallback for these chips */
+static int ipath_ht_nointr_fallback(struct ipath_devdata *dd)
+{
+       return 0;
+}
+
+
+/*
+ * reset the XGXS (between serdes and IBC).  Slightly less intrusive
+ * than resetting the IBC or external link state, and useful in some
+ * cases to cause some retraining.  To do this right, we reset IBC
+ * as well.
+ */
+static void ipath_ht_xgxs_reset(struct ipath_devdata *dd)
+{
+       u64 val, prev_val;
+
+       prev_val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig);
+       val = prev_val | INFINIPATH_XGXS_RESET;
+       prev_val &= ~INFINIPATH_XGXS_RESET; /* be sure */
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
+                        dd->ipath_control & ~INFINIPATH_C_LINKENABLE);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, val);
+       ipath_read_kreg32(dd, dd->ipath_kregs->kr_scratch);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, prev_val);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
+                        dd->ipath_control);
+}
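+
+/*
+ * Summary note, not in the original source: the sequence above is
+ * link-disable, assert XGXS reset, flush via a scratch-register read,
+ * deassert the reset, then restore the control register (re-enabling
+ * the link).
+ */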
+
+
+static int ipath_ht_get_ib_cfg(struct ipath_devdata *dd, int which)
+{
+       int ret;
+
+       switch (which) {
+       case IPATH_IB_CFG_LWID:
+               ret = dd->ipath_link_width_active;
+               break;
+       case IPATH_IB_CFG_SPD:
+               ret = dd->ipath_link_speed_active;
+               break;
+       case IPATH_IB_CFG_LWID_ENB:
+               ret = dd->ipath_link_width_enabled;
+               break;
+       case IPATH_IB_CFG_SPD_ENB:
+               ret = dd->ipath_link_speed_enabled;
+               break;
+       default:
+               ret = -ENOTSUPP;
+               break;
+       }
+       return ret;
+}
+
+
+/* we assume range checking is already done, if needed */
+static int ipath_ht_set_ib_cfg(struct ipath_devdata *dd, int which, u32 val)
+{
+       int ret = 0;
+
+       if (which == IPATH_IB_CFG_LWID_ENB)
+               dd->ipath_link_width_enabled = val;
+       else if (which == IPATH_IB_CFG_SPD_ENB)
+               dd->ipath_link_speed_enabled = val;
+       else
+               ret = -ENOTSUPP;
+       return ret;
+}
+
+
+static void ipath_ht_config_jint(struct ipath_devdata *dd, u16 a, u16 b)
+{
+}
+
+
+static int ipath_ht_ib_updown(struct ipath_devdata *dd, int ibup, u64 ibcs)
+{
+       ipath_setup_ht_setextled(dd, ipath_ib_linkstate(dd, ibcs),
+               ipath_ib_linktrstate(dd, ibcs));
+       return 0;
+}
+
+
+/**
+ * ipath_init_iba6110_funcs - set up the chip-specific function pointers
+ * @dd: the infinipath device
+ *
+ * This is global, and is called directly at init to set up the
+ * chip-specific function pointers for later use.
+ */
+void ipath_init_iba6110_funcs(struct ipath_devdata *dd)
+{
+       dd->ipath_f_intrsetup = ipath_ht_intconfig;
+       dd->ipath_f_bus = ipath_setup_ht_config;
+       dd->ipath_f_reset = ipath_setup_ht_reset;
+       dd->ipath_f_get_boardname = ipath_ht_boardname;
+       dd->ipath_f_init_hwerrors = ipath_ht_init_hwerrors;
+       dd->ipath_f_early_init = ipath_ht_early_init;
+       dd->ipath_f_handle_hwerrors = ipath_ht_handle_hwerrors;
+       dd->ipath_f_quiet_serdes = ipath_ht_quiet_serdes;
+       dd->ipath_f_bringup_serdes = ipath_ht_bringup_serdes;
+       dd->ipath_f_clear_tids = ipath_ht_clear_tids;
+       dd->ipath_f_put_tid = ipath_ht_put_tid;
+       dd->ipath_f_cleanup = ipath_setup_ht_cleanup;
+       dd->ipath_f_setextled = ipath_setup_ht_setextled;
+       dd->ipath_f_get_base_info = ipath_ht_get_base_info;
+       dd->ipath_f_free_irq = ipath_ht_free_irq;
+       dd->ipath_f_tidtemplate = ipath_ht_tidtemplate;
+       dd->ipath_f_intr_fallback = ipath_ht_nointr_fallback;
+       dd->ipath_f_get_msgheader = ipath_ht_get_msgheader;
+       dd->ipath_f_config_ports = ipath_ht_config_ports;
+       dd->ipath_f_read_counters = ipath_ht_read_counters;
+       dd->ipath_f_xgxs_reset = ipath_ht_xgxs_reset;
+       dd->ipath_f_get_ib_cfg = ipath_ht_get_ib_cfg;
+       dd->ipath_f_set_ib_cfg = ipath_ht_set_ib_cfg;
+       dd->ipath_f_config_jint = ipath_ht_config_jint;
+       dd->ipath_f_ib_updown = ipath_ht_ib_updown;
+
+       /*
+        * initialize chip-specific variables
+        */
+       ipath_init_ht_variables(dd);
+}
diff --git a/drivers/staging/rdma/ipath/ipath_init_chip.c b/drivers/staging/rdma/ipath/ipath_init_chip.c
new file mode 100644 (file)
index 0000000..be2a60e
--- /dev/null
@@ -0,0 +1,1066 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/pci.h>
+#include <linux/netdevice.h>
+#include <linux/moduleparam.h>
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/vmalloc.h>
+
+#include "ipath_kernel.h"
+#include "ipath_common.h"
+
+/*
+ * min buffers we want to have per port, after driver
+ */
+#define IPATH_MIN_USER_PORT_BUFCNT 7
+
+/*
+ * Number of ports we are configured to use (to allow for more pio
+ * buffers per port, etc.)  Zero means use chip value.
+ */
+static ushort ipath_cfgports;
+
+module_param_named(cfgports, ipath_cfgports, ushort, S_IRUGO);
+MODULE_PARM_DESC(cfgports, "Set max number of ports to use");
+
+/*
+ * Number of buffers reserved for the driver (verbs and layered drivers).
+ * Initialized based on number of PIO buffers if not set via module interface.
+ * The problem with this is that it's global, but we'll use different
+ * numbers for different chip types.
+ */
+static ushort ipath_kpiobufs;
+
+static int ipath_set_kpiobufs(const char *val, struct kernel_param *kp);
+
+module_param_call(kpiobufs, ipath_set_kpiobufs, param_get_ushort,
+                 &ipath_kpiobufs, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(kpiobufs, "Set number of PIO buffers for driver");
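+
+/*
+ * Example usage, illustrative only (module name assumed to be ib_ipath):
+ *
+ *     modprobe ib_ipath cfgports=4 kpiobufs=64
+ *
+ * Leaving either parameter unset (0) falls back to the defaults
+ * described in the comments above.
+ */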
+
+/**
+ * create_port0_egr - allocate the eager TID buffers
+ * @dd: the infinipath device
+ *
+ * This code is now quite different for user and kernel, because
+ * the kernel uses skb's, for the accelerated network performance.
+ * This is the kernel (port0) version.
+ *
+ * Allocate the eager TID buffers and program them into infinipath.
+ * We use the network layer alloc_skb() allocator to allocate the
+ * memory, and either use the buffers as is for things like verbs
+ * packets, or pass the buffers up to the ipath layered driver and
+ * thence the network layer, replacing them as we do so (see
+ * ipath_rcv_layer()).
+ */
+static int create_port0_egr(struct ipath_devdata *dd)
+{
+       unsigned e, egrcnt;
+       struct ipath_skbinfo *skbinfo;
+       int ret;
+
+       egrcnt = dd->ipath_p0_rcvegrcnt;
+
+       skbinfo = vmalloc(sizeof(*dd->ipath_port0_skbinfo) * egrcnt);
+       if (skbinfo == NULL) {
+               ipath_dev_err(dd, "allocation error for eager TID "
+                             "skb array\n");
+               ret = -ENOMEM;
+               goto bail;
+       }
+       for (e = 0; e < egrcnt; e++) {
+               /*
+                * This is a bit tricky in that we allocate extra
+                * space for 2 bytes of the 14 byte ethernet header.
+                * These two bytes are passed in the ipath header so
+                * the rest of the data is word aligned.  We allocate
+                * 4 bytes so that the data buffer stays word aligned.
+                * See ipath_kreceive() for more details.
+                */
+               skbinfo[e].skb = ipath_alloc_skb(dd, GFP_KERNEL);
+               if (!skbinfo[e].skb) {
+                       ipath_dev_err(dd, "SKB allocation error for "
+                                     "eager TID %u\n", e);
+                       while (e != 0)
+                               dev_kfree_skb(skbinfo[--e].skb);
+                       vfree(skbinfo);
+                       ret = -ENOMEM;
+                       goto bail;
+               }
+       }
+       /*
+        * After loop above, so we can test non-NULL to see if ready
+        * to use at receive, etc.
+        */
+       dd->ipath_port0_skbinfo = skbinfo;
+
+       for (e = 0; e < egrcnt; e++) {
+               dd->ipath_port0_skbinfo[e].phys =
+                 ipath_map_single(dd->pcidev,
+                                  dd->ipath_port0_skbinfo[e].skb->data,
+                                  dd->ipath_ibmaxlen, PCI_DMA_FROMDEVICE);
+               dd->ipath_f_put_tid(dd, e + (u64 __iomem *)
+                                   ((char __iomem *) dd->ipath_kregbase +
+                                    dd->ipath_rcvegrbase),
+                                   RCVHQ_RCV_TYPE_EAGER,
+                                   dd->ipath_port0_skbinfo[e].phys);
+       }
+
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+static int bringup_link(struct ipath_devdata *dd)
+{
+       u64 val, ibc;
+       int ret = 0;
+
+       /* hold IBC in reset */
+       dd->ipath_control &= ~INFINIPATH_C_LINKENABLE;
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
+                        dd->ipath_control);
+
+       /*
+        * set initial max size pkt IBC will send, including ICRC; it's the
+        * PIO buffer size in dwords, less 1; also see ipath_set_mtu()
+        */
+       val = (dd->ipath_ibmaxlen >> 2) + 1;
+       ibc = val << dd->ibcc_mpl_shift;
+
+       /* flowcontrolwatermark is in units of KBytes */
+       ibc |= 0x5ULL << INFINIPATH_IBCC_FLOWCTRLWATERMARK_SHIFT;
+       /*
+        * How often flowctrl sent.  More or less in usecs; balance against
+        * watermark value, so that in theory senders always get a flow
+        * control update in time to not let the IB link go idle.
+        */
+       ibc |= 0x3ULL << INFINIPATH_IBCC_FLOWCTRLPERIOD_SHIFT;
+       /* max error tolerance */
+       ibc |= 0xfULL << INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT;
+       /* use "real" buffer space for */
+       ibc |= 4ULL << INFINIPATH_IBCC_CREDITSCALE_SHIFT;
+       /* IB credit flow control. */
+       ibc |= 0xfULL << INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT;
+       /* initially come up waiting for TS1, without sending anything. */
+       dd->ipath_ibcctrl = ibc;
+       /*
+        * Want to start out with both LINKCMD and LINKINITCMD in NOP
+        * (0 and 0).  Don't put linkinitcmd in ipath_ibcctrl, want that
+        * to stay a NOP. Flag that we are disabled, for the (unlikely)
+        * case that some recovery path is trying to bring the link up
+        * before we are ready.
+        */
+       ibc |= INFINIPATH_IBCC_LINKINITCMD_DISABLE <<
+               INFINIPATH_IBCC_LINKINITCMD_SHIFT;
+       dd->ipath_flags |= IPATH_IB_LINK_DISABLED;
+       ipath_cdbg(VERBOSE, "Writing 0x%llx to ibcctrl\n",
+                  (unsigned long long) ibc);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, ibc);
+
+       /* be sure chip saw it */
+       val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+
+       ret = dd->ipath_f_bringup_serdes(dd);
+
+       if (ret)
+               dev_info(&dd->pcidev->dev, "Could not initialize SerDes, "
+                        "not usable\n");
+       else {
+               /* enable IBC */
+               dd->ipath_control |= INFINIPATH_C_LINKENABLE;
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
+                                dd->ipath_control);
+       }
+
+       return ret;
+}
+
+static struct ipath_portdata *create_portdata0(struct ipath_devdata *dd)
+{
+       struct ipath_portdata *pd = NULL;
+
+       pd = kzalloc(sizeof(*pd), GFP_KERNEL);
+       if (pd) {
+               pd->port_dd = dd;
+               pd->port_cnt = 1;
+               /* The port 0 pkey table is used by the layer interface. */
+               pd->port_pkeys[0] = IPATH_DEFAULT_P_KEY;
+               pd->port_seq_cnt = 1;
+       }
+       return pd;
+}
+
+static int init_chip_first(struct ipath_devdata *dd)
+{
+       struct ipath_portdata *pd;
+       int ret = 0;
+       u64 val;
+
+       spin_lock_init(&dd->ipath_kernel_tid_lock);
+       spin_lock_init(&dd->ipath_user_tid_lock);
+       spin_lock_init(&dd->ipath_sendctrl_lock);
+       spin_lock_init(&dd->ipath_uctxt_lock);
+       spin_lock_init(&dd->ipath_sdma_lock);
+       spin_lock_init(&dd->ipath_gpio_lock);
+       spin_lock_init(&dd->ipath_eep_st_lock);
+       spin_lock_init(&dd->ipath_sdepb_lock);
+       mutex_init(&dd->ipath_eep_lock);
+
+       /*
+        * skip cfgports stuff because we are not allocating memory,
+        * and we don't want problems if the portcnt changed due to
+        * cfgports.  We still check for and report any difference
+        * (which should be impossible).
+        */
+       dd->ipath_f_config_ports(dd, ipath_cfgports);
+       if (!ipath_cfgports)
+               dd->ipath_cfgports = dd->ipath_portcnt;
+       else if (ipath_cfgports <= dd->ipath_portcnt) {
+               dd->ipath_cfgports = ipath_cfgports;
+               ipath_dbg("Configured to use %u ports out of %u in chip\n",
+                         dd->ipath_cfgports, ipath_read_kreg32(dd,
+                         dd->ipath_kregs->kr_portcnt));
+       } else {
+               dd->ipath_cfgports = dd->ipath_portcnt;
+               ipath_dbg("Tried to configured to use %u ports; chip "
+                         "only supports %u\n", ipath_cfgports,
+                         ipath_read_kreg32(dd,
+                                 dd->ipath_kregs->kr_portcnt));
+       }
+       /*
+        * Allocate full portcnt array, rather than just cfgports, because
+        * cleanup iterates across all possible ports.
+        */
+       dd->ipath_pd = kzalloc(sizeof(*dd->ipath_pd) * dd->ipath_portcnt,
+                              GFP_KERNEL);
+
+       if (!dd->ipath_pd) {
+               ipath_dev_err(dd, "Unable to allocate portdata array, "
+                             "failing\n");
+               ret = -ENOMEM;
+               goto done;
+       }
+
+       pd = create_portdata0(dd);
+       if (!pd) {
+               ipath_dev_err(dd, "Unable to allocate portdata for port "
+                             "0, failing\n");
+               ret = -ENOMEM;
+               goto done;
+       }
+       dd->ipath_pd[0] = pd;
+
+       dd->ipath_rcvtidcnt =
+               ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidcnt);
+       dd->ipath_rcvtidbase =
+               ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidbase);
+       dd->ipath_rcvegrcnt =
+               ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrcnt);
+       dd->ipath_rcvegrbase =
+               ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrbase);
+       dd->ipath_palign =
+               ipath_read_kreg32(dd, dd->ipath_kregs->kr_pagealign);
+       dd->ipath_piobufbase =
+               ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpiobufbase);
+       val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpiosize);
+       dd->ipath_piosize2k = val & ~0U;
+       dd->ipath_piosize4k = val >> 32;
+       if (dd->ipath_piosize4k == 0 && ipath_mtu4096)
+               ipath_mtu4096 = 0; /* 4KB not supported by this chip */
+       dd->ipath_ibmtu = ipath_mtu4096 ? 4096 : 2048;
+       val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpiobufcnt);
+       dd->ipath_piobcnt2k = val & ~0U;
+       dd->ipath_piobcnt4k = val >> 32;
+       dd->ipath_pio2kbase =
+               (u32 __iomem *) (((char __iomem *) dd->ipath_kregbase) +
+                                (dd->ipath_piobufbase & 0xffffffff));
+       if (dd->ipath_piobcnt4k) {
+               dd->ipath_pio4kbase = (u32 __iomem *)
+                       (((char __iomem *) dd->ipath_kregbase) +
+                        (dd->ipath_piobufbase >> 32));
+               /*
+                * 4K buffers take 2 pages; we use roundup just to be
+                * paranoid; we calculate it once here, rather than on
+                * every buffer allocation
+                */
+               dd->ipath_4kalign = ALIGN(dd->ipath_piosize4k,
+                                         dd->ipath_palign);
+               ipath_dbg("%u 2k(%x) piobufs @ %p, %u 4k(%x) @ %p "
+                         "(%x aligned)\n",
+                         dd->ipath_piobcnt2k, dd->ipath_piosize2k,
+                         dd->ipath_pio2kbase, dd->ipath_piobcnt4k,
+                         dd->ipath_piosize4k, dd->ipath_pio4kbase,
+                         dd->ipath_4kalign);
+       } else
+               ipath_dbg("%u 2k piobufs @ %p\n",
+                         dd->ipath_piobcnt2k, dd->ipath_pio2kbase);
+
+done:
+       return ret;
+}
+
+/**
+ * init_chip_reset - re-initialize after a reset, or enable
+ * @dd: the infinipath device
+ *
+ * Sanity check at least some of the values after reset, and
+ * ensure no receive or transmit (explicitly, in case reset
+ * failed).
+ */
+static int init_chip_reset(struct ipath_devdata *dd)
+{
+       u32 rtmp;
+       int i;
+       unsigned long flags;
+
+       /*
+        * ensure chip does no sends or receives, tail updates, or
+        * pioavail updates while we re-initialize
+        */
+       dd->ipath_rcvctrl &= ~(1ULL << dd->ipath_r_tailupd_shift);
+       for (i = 0; i < dd->ipath_portcnt; i++) {
+               clear_bit(dd->ipath_r_portenable_shift + i,
+                         &dd->ipath_rcvctrl);
+               clear_bit(dd->ipath_r_intravail_shift + i,
+                         &dd->ipath_rcvctrl);
+       }
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+               dd->ipath_rcvctrl);
+
+       spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+       dd->ipath_sendctrl = 0U; /* no sdma, etc */
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
+       ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+       spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_control, 0ULL);
+
+       rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidcnt);
+       if (rtmp != dd->ipath_rcvtidcnt)
+               dev_info(&dd->pcidev->dev, "tidcnt was %u before "
+                        "reset, now %u, using original\n",
+                        dd->ipath_rcvtidcnt, rtmp);
+       rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidbase);
+       if (rtmp != dd->ipath_rcvtidbase)
+               dev_info(&dd->pcidev->dev, "tidbase was %u before "
+                        "reset, now %u, using original\n",
+                        dd->ipath_rcvtidbase, rtmp);
+       rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrcnt);
+       if (rtmp != dd->ipath_rcvegrcnt)
+               dev_info(&dd->pcidev->dev, "egrcnt was %u before "
+                        "reset, now %u, using original\n",
+                        dd->ipath_rcvegrcnt, rtmp);
+       rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrbase);
+       if (rtmp != dd->ipath_rcvegrbase)
+               dev_info(&dd->pcidev->dev, "egrbase was %u before "
+                        "reset, now %u, using original\n",
+                        dd->ipath_rcvegrbase, rtmp);
+
+       return 0;
+}
+
+static int init_pioavailregs(struct ipath_devdata *dd)
+{
+       int ret;
+
+       dd->ipath_pioavailregs_dma = dma_alloc_coherent(
+               &dd->pcidev->dev, PAGE_SIZE, &dd->ipath_pioavailregs_phys,
+               GFP_KERNEL);
+       if (!dd->ipath_pioavailregs_dma) {
+               ipath_dev_err(dd, "failed to allocate PIOavail reg area "
+                             "in memory\n");
+               ret = -ENOMEM;
+               goto done;
+       }
+
+       /*
+        * we really want L2 cache aligned, but for current CPUs of
+        * interest, they are the same.
+        */
+       dd->ipath_statusp = (u64 *)
+               ((char *)dd->ipath_pioavailregs_dma +
+                ((2 * L1_CACHE_BYTES +
+                  dd->ipath_pioavregs * sizeof(u64)) & ~L1_CACHE_BYTES));
+       /* copy the current value now that it's really allocated */
+       *dd->ipath_statusp = dd->_ipath_status;
+       /*
+        * setup buffer to hold freeze msg, accessible to apps,
+        * following statusp
+        */
+       dd->ipath_freezemsg = (char *)&dd->ipath_statusp[1];
+       /* and its length */
+       dd->ipath_freezelen = L1_CACHE_BYTES - sizeof(dd->ipath_statusp[0]);
+
+       ret = 0;
+
+done:
+       return ret;
+}
+
+/**
+ * init_shadow_tids - allocate the shadow TID array
+ * @dd: the infinipath device
+ *
+ * allocate the shadow TID array, so we can ipath_munlock previous
+ * entries.  It may make more sense to move the pageshadow to the
+ * port data structure, so we only allocate memory for ports actually
+ * in use, since we are at 8k per port now.
+ */
+static void init_shadow_tids(struct ipath_devdata *dd)
+{
+       struct page **pages;
+       dma_addr_t *addrs;
+
+       pages = vzalloc(dd->ipath_cfgports * dd->ipath_rcvtidcnt *
+                       sizeof(struct page *));
+       if (!pages) {
+               ipath_dev_err(dd, "failed to allocate shadow page * "
+                             "array, no expected sends!\n");
+               dd->ipath_pageshadow = NULL;
+               return;
+       }
+
+       addrs = vmalloc(dd->ipath_cfgports * dd->ipath_rcvtidcnt *
+                       sizeof(dma_addr_t));
+       if (!addrs) {
+               ipath_dev_err(dd, "failed to allocate shadow dma handle "
+                             "array, no expected sends!\n");
+               vfree(pages);
+               dd->ipath_pageshadow = NULL;
+               return;
+       }
+
+       dd->ipath_pageshadow = pages;
+       dd->ipath_physshadow = addrs;
+}
+
+static void enable_chip(struct ipath_devdata *dd, int reinit)
+{
+       u32 val;
+       u64 rcvmask;
+       unsigned long flags;
+       int i;
+
+       if (!reinit)
+               init_waitqueue_head(&ipath_state_wait);
+
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+                        dd->ipath_rcvctrl);
+
+       spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+       /* Enable PIO send, and update of PIOavail regs to memory. */
+       dd->ipath_sendctrl = INFINIPATH_S_PIOENABLE |
+               INFINIPATH_S_PIOBUFAVAILUPD;
+
+       /*
+        * Set the PIO avail update threshold to host memory
+        * on chips that support it.
+        */
+       if (dd->ipath_pioupd_thresh)
+               dd->ipath_sendctrl |= dd->ipath_pioupd_thresh
+                       << INFINIPATH_S_UPDTHRESH_SHIFT;
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
+       ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+       spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+
+       /*
+        * Enable kernel ports' receive and receive interrupt.
+        * Other ports done as user opens and inits them.
+        */
+       rcvmask = 1ULL;
+       dd->ipath_rcvctrl |= (rcvmask << dd->ipath_r_portenable_shift) |
+               (rcvmask << dd->ipath_r_intravail_shift);
+       if (!(dd->ipath_flags & IPATH_NODMA_RTAIL))
+               dd->ipath_rcvctrl |= (1ULL << dd->ipath_r_tailupd_shift);
+
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+                        dd->ipath_rcvctrl);
+
+       /*
+        * now ready for use.  this should be cleared whenever we
+        * detect a reset, or initiate one.
+        */
+       dd->ipath_flags |= IPATH_INITTED;
+
+       /*
+        * Init our shadow copies of head from tail values,
+        * and write head values to match.
+        */
+       val = ipath_read_ureg32(dd, ur_rcvegrindextail, 0);
+       ipath_write_ureg(dd, ur_rcvegrindexhead, val, 0);
+
+       /* Initialize so we interrupt on next packet received */
+       ipath_write_ureg(dd, ur_rcvhdrhead,
+                        dd->ipath_rhdrhead_intr_off |
+                        dd->ipath_pd[0]->port_head, 0);
+
+       /*
+        * by now pioavail updates to memory should have occurred, so
+        * copy them into our working/shadow registers; this is in
+        * case something went wrong with abort, but mostly to get the
+        * initial values of the generation bit correct.
+        */
+       for (i = 0; i < dd->ipath_pioavregs; i++) {
+               __le64 pioavail;
+
+               /*
+                * Chip Errata bug 6641; even and odd qwords>3 are swapped.
+                */
+               if (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS))
+                       pioavail = dd->ipath_pioavailregs_dma[i ^ 1];
+               else
+                       pioavail = dd->ipath_pioavailregs_dma[i];
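+               /*
+                * Illustrative note, not in the original source: with the
+                * swap workaround active, i ^ 1 pairs adjacent qwords, so
+                * index 4 reads DMA slot 5, 5 reads 4, 6 reads 7, and so
+                * on; qwords 0-3 are never swapped.
+                */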
+               /*
+                * don't need to worry about ipath_pioavailkernel here
+                * because we will call ipath_chg_pioavailkernel() later
+                * in initialization, to busy out buffers as needed
+                */
+               dd->ipath_pioavailshadow[i] = le64_to_cpu(pioavail);
+       }
+       /* can get counters, stats, etc. */
+       dd->ipath_flags |= IPATH_PRESENT;
+}
+
+static int init_housekeeping(struct ipath_devdata *dd, int reinit)
+{
+       char boardn[40];
+       int ret = 0;
+
+       /*
+        * have to clear shadow copies of registers at init that are
+        * not otherwise set here, or all kinds of bizarre things
+        * happen with driver on chip reset
+        */
+       dd->ipath_rcvhdrsize = 0;
+
+       /*
+        * Don't clear ipath_flags as 8bit mode was set before
+        * entering this func. However, we do set the linkstate to
+        * unknown, so we can watch for a transition.
+        * PRESENT is set because we want register reads to work,
+        * and the kernel infrastructure saw it in config space;
+        * We clear it if we have failures.
+        */
+       dd->ipath_flags |= IPATH_LINKUNK | IPATH_PRESENT;
+       dd->ipath_flags &= ~(IPATH_LINKACTIVE | IPATH_LINKARMED |
+                            IPATH_LINKDOWN | IPATH_LINKINIT);
+
+       ipath_cdbg(VERBOSE, "Try to read spc chip revision\n");
+       dd->ipath_revision =
+               ipath_read_kreg64(dd, dd->ipath_kregs->kr_revision);
+
+       /*
+        * set up fundamental info we need to use the chip; we assume
+        * if the revision reg and these regs are OK, we don't need to
+        * special case the rest
+        */
+       dd->ipath_sregbase =
+               ipath_read_kreg32(dd, dd->ipath_kregs->kr_sendregbase);
+       dd->ipath_cregbase =
+               ipath_read_kreg32(dd, dd->ipath_kregs->kr_counterregbase);
+       dd->ipath_uregbase =
+               ipath_read_kreg32(dd, dd->ipath_kregs->kr_userregbase);
+       ipath_cdbg(VERBOSE, "ipath_kregbase %p, sendbase %x usrbase %x, "
+                  "cntrbase %x\n", dd->ipath_kregbase, dd->ipath_sregbase,
+                  dd->ipath_uregbase, dd->ipath_cregbase);
+       if ((dd->ipath_revision & 0xffffffff) == 0xffffffff
+           || (dd->ipath_sregbase & 0xffffffff) == 0xffffffff
+           || (dd->ipath_cregbase & 0xffffffff) == 0xffffffff
+           || (dd->ipath_uregbase & 0xffffffff) == 0xffffffff) {
+               ipath_dev_err(dd, "Register read failures from chip, "
+                             "giving up initialization\n");
+               dd->ipath_flags &= ~IPATH_PRESENT;
+               ret = -ENODEV;
+               goto done;
+       }
+
+
+       /* clear diagctrl register, in case diags were running and crashed */
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_hwdiagctrl, 0);
+
+       /* clear the initial reset flag, in case first driver load */
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear,
+                        INFINIPATH_E_RESET);
+
+       ipath_cdbg(VERBOSE, "Revision %llx (PCI %x)\n",
+                  (unsigned long long) dd->ipath_revision,
+                  dd->ipath_pcirev);
+
+       if (((dd->ipath_revision >> INFINIPATH_R_SOFTWARE_SHIFT) &
+            INFINIPATH_R_SOFTWARE_MASK) != IPATH_CHIP_SWVERSION) {
+               ipath_dev_err(dd, "Driver only handles version %d, "
+                             "chip swversion is %d (%llx), failng\n",
+                             IPATH_CHIP_SWVERSION,
+                             (int)(dd->ipath_revision >>
+                                   INFINIPATH_R_SOFTWARE_SHIFT) &
+                             INFINIPATH_R_SOFTWARE_MASK,
+                             (unsigned long long) dd->ipath_revision);
+               ret = -ENOSYS;
+               goto done;
+       }
+       dd->ipath_majrev = (u8) ((dd->ipath_revision >>
+                                 INFINIPATH_R_CHIPREVMAJOR_SHIFT) &
+                                INFINIPATH_R_CHIPREVMAJOR_MASK);
+       dd->ipath_minrev = (u8) ((dd->ipath_revision >>
+                                 INFINIPATH_R_CHIPREVMINOR_SHIFT) &
+                                INFINIPATH_R_CHIPREVMINOR_MASK);
+       dd->ipath_boardrev = (u8) ((dd->ipath_revision >>
+                                   INFINIPATH_R_BOARDID_SHIFT) &
+                                  INFINIPATH_R_BOARDID_MASK);
+
+       ret = dd->ipath_f_get_boardname(dd, boardn, sizeof boardn);
+
+       snprintf(dd->ipath_boardversion, sizeof(dd->ipath_boardversion),
+                "ChipABI %u.%u, %s, InfiniPath%u %u.%u, PCI %u, "
+                "SW Compat %u\n",
+                IPATH_CHIP_VERS_MAJ, IPATH_CHIP_VERS_MIN, boardn,
+                (unsigned)(dd->ipath_revision >> INFINIPATH_R_ARCH_SHIFT) &
+                INFINIPATH_R_ARCH_MASK,
+                dd->ipath_majrev, dd->ipath_minrev, dd->ipath_pcirev,
+                (unsigned)(dd->ipath_revision >>
+                           INFINIPATH_R_SOFTWARE_SHIFT) &
+                INFINIPATH_R_SOFTWARE_MASK);
+
+       ipath_dbg("%s", dd->ipath_boardversion);
+
+       if (ret)
+               goto done;
+
+       if (reinit)
+               ret = init_chip_reset(dd);
+       else
+               ret = init_chip_first(dd);
+
+done:
+       return ret;
+}
+
+static void verify_interrupt(unsigned long opaque)
+{
+       struct ipath_devdata *dd = (struct ipath_devdata *) opaque;
+
+       if (!dd)
+               return; /* being torn down */
+
+       /*
+        * If we don't have any interrupts, let the user know and
+        * don't bother checking again.
+        */
+       if (dd->ipath_int_counter == 0) {
+               if (!dd->ipath_f_intr_fallback(dd))
+                       dev_err(&dd->pcidev->dev, "No interrupts detected, "
+                               "not usable.\n");
+               else /* re-arm the timer to see if fallback works */
+                       mod_timer(&dd->ipath_intrchk_timer, jiffies + HZ/2);
+       } else
+               ipath_cdbg(VERBOSE, "%u interrupts at timer check\n",
+                       dd->ipath_int_counter);
+}
+
+/**
+ * ipath_init_chip - do the actual initialization sequence on the chip
+ * @dd: the infinipath device
+ * @reinit: reinitializing, so don't allocate new memory
+ *
+ * Do the actual initialization sequence on the chip.  This is done
+ * both from the init routine called from the PCI infrastructure, and
+ * when we reset the chip, or detect that it was reset internally,
+ * or it's administratively re-enabled.
+ *
+ * Memory allocation here and in called routines is only done in
+ * the first case (reinit == 0).  We have to be careful, because even
+ * without memory allocation, we need to re-write all the chip registers
+ * TIDs, etc. after the reset or enable has completed.
+ */
+int ipath_init_chip(struct ipath_devdata *dd, int reinit)
+{
+       int ret = 0;
+       u32 kpiobufs, defkbufs;
+       u32 piobufs, uports;
+       u64 val;
+       struct ipath_portdata *pd;
+       gfp_t gfp_flags = GFP_USER | __GFP_COMP;
+
+       ret = init_housekeeping(dd, reinit);
+       if (ret)
+               goto done;
+
+       /*
+        * We could bump this to allow for full rcvegrcnt + rcvtidcnt,
+        * but then it no longer nicely fits power of two, and since
+        * we now use routines that backend onto __get_free_pages, the
+        * rest would be wasted.
+        */
+       dd->ipath_rcvhdrcnt = max(dd->ipath_p0_rcvegrcnt, dd->ipath_rcvegrcnt);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrcnt,
+                        dd->ipath_rcvhdrcnt);
+
+       /*
+        * Set up the shadow copies of the piobufavail registers,
+        * which we compare against the chip registers for now, and
+        * the in memory DMA'ed copies of the registers.  This has to
+        * be done early, before we calculate lastport, etc.
+        */
+       piobufs = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
+       /*
+        * calc number of pioavail registers, and save it; we have 2
+        * bits per buffer.
+        */
+       dd->ipath_pioavregs = ALIGN(piobufs, sizeof(u64) * BITS_PER_BYTE / 2)
+               / (sizeof(u64) * BITS_PER_BYTE / 2);
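+       /*
+        * Illustrative example, not in the original source: each 64-bit
+        * register covers 32 buffers at 2 bits each, so e.g. piobufs of
+        * 120 rounds up to ALIGN(120, 32) / 32 == 4 pioavail registers.
+        */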
+       uports = dd->ipath_cfgports ? dd->ipath_cfgports - 1 : 0;
+       if (piobufs > 144)
+               defkbufs = 32 + dd->ipath_pioreserved;
+       else
+               defkbufs = 16 + dd->ipath_pioreserved;
+
+       if (ipath_kpiobufs && (ipath_kpiobufs +
+               (uports * IPATH_MIN_USER_PORT_BUFCNT)) > piobufs) {
+               int i = (int) piobufs -
+                       (int) (uports * IPATH_MIN_USER_PORT_BUFCNT);
+               if (i < 1)
+                       i = 1;
+               dev_info(&dd->pcidev->dev, "Allocating %d PIO bufs of "
+                        "%d for kernel leaves too few for %d user ports "
+                        "(%d each); using %u\n", ipath_kpiobufs,
+                        piobufs, uports, IPATH_MIN_USER_PORT_BUFCNT, i);
+               /*
+                * shouldn't change ipath_kpiobufs, because could be
+                * different for different devices...
+                */
+               kpiobufs = i;
+       } else if (ipath_kpiobufs)
+               kpiobufs = ipath_kpiobufs;
+       else
+               kpiobufs = defkbufs;
+       dd->ipath_lastport_piobuf = piobufs - kpiobufs;
+       dd->ipath_pbufsport =
+               uports ? dd->ipath_lastport_piobuf / uports : 0;
+       /* if not an even divisor, some user ports get extra buffers */
+       dd->ipath_ports_extrabuf = dd->ipath_lastport_piobuf -
+               (dd->ipath_pbufsport * uports);
+       if (dd->ipath_ports_extrabuf)
+               ipath_dbg("%u pbufs/port leaves some unused, add 1 buffer to "
+                       "ports <= %u\n", dd->ipath_pbufsport,
+                       dd->ipath_ports_extrabuf);
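+       /*
+        * Worked example, not in the original source: with piobufs = 128,
+        * kpiobufs = 32 and uports = 7, lastport_piobuf = 96, pbufsport =
+        * 96 / 7 = 13 and extrabuf = 96 - 91 = 5, so ports 1-5 each get
+        * one extra buffer.
+        */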
+       dd->ipath_lastpioindex = 0;
+       dd->ipath_lastpioindexl = dd->ipath_piobcnt2k;
+       /* ipath_pioavailshadow initialized earlier */
+       ipath_cdbg(VERBOSE, "%d PIO bufs for kernel out of %d total %u "
+                  "each for %u user ports\n", kpiobufs,
+                  piobufs, dd->ipath_pbufsport, uports);
+       ret = dd->ipath_f_early_init(dd);
+       if (ret) {
+               ipath_dev_err(dd, "Early initialization failure\n");
+               goto done;
+       }
+
+       /*
+        * Early_init sets rcvhdrentsize and rcvhdrsize, so this must be
+        * done after early_init.
+        */
+       dd->ipath_hdrqlast =
+               dd->ipath_rcvhdrentsize * (dd->ipath_rcvhdrcnt - 1);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrentsize,
+                        dd->ipath_rcvhdrentsize);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrsize,
+                        dd->ipath_rcvhdrsize);
+
+       if (!reinit) {
+               ret = init_pioavailregs(dd);
+               init_shadow_tids(dd);
+               if (ret)
+                       goto done;
+       }
+
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_sendpioavailaddr,
+                        dd->ipath_pioavailregs_phys);
+
+       /*
+        * this is to detect s/w errors, which the h/w works around by
+        * ignoring the low 6 bits of address, if it wasn't aligned.
+        */
+       val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpioavailaddr);
+       if (val != dd->ipath_pioavailregs_phys) {
+               ipath_dev_err(dd, "Catastrophic software error, "
+                             "SendPIOAvailAddr written as %lx, "
+                             "read back as %llx\n",
+                             (unsigned long) dd->ipath_pioavailregs_phys,
+                             (unsigned long long) val);
+               ret = -EINVAL;
+               goto done;
+       }
+
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvbthqp, IPATH_KD_QP);
+
+       /*
+        * make sure we are not in freeze, and PIO send enabled, so
+        * writes to pbc happen
+        */
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask, 0ULL);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
+                        ~0ULL&~INFINIPATH_HWE_MEMBISTFAILED);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_control, 0ULL);
+
+       /*
+        * before error clears, since we expect serdes pll errors during
+        * this, the first time after reset
+        */
+       if (bringup_link(dd)) {
+               dev_info(&dd->pcidev->dev, "Failed to bringup IB link\n");
+               ret = -ENETDOWN;
+               goto done;
+       }
+
+       /*
+        * clear any "expected" hwerrs from reset and/or initialization
+        * clear any that aren't enabled (at least this once), and then
+        * set the enable mask
+        */
+       dd->ipath_f_init_hwerrors(dd);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
+                        ~0ULL&~INFINIPATH_HWE_MEMBISTFAILED);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
+                        dd->ipath_hwerrmask);
+
+       /* clear all */
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, -1LL);
+       /* enable errors that are masked, at least this first time. */
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
+                        ~dd->ipath_maskederrs);
+       dd->ipath_maskederrs = 0; /* don't re-enable ignored in timer */
+       dd->ipath_errormask =
+               ipath_read_kreg64(dd, dd->ipath_kregs->kr_errormask);
+       /* clear any interrupts up to this point (ints still not enabled) */
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, -1LL);
+
+       dd->ipath_f_tidtemplate(dd);
+
+       /*
+        * Set up the port 0 (kernel) rcvhdr q and egr TIDs.  If doing
+        * re-init, the simplest way to handle this is to free
+        * existing, and re-allocate.
+        * Need to re-create rest of port 0 portdata as well.
+        */
+       pd = dd->ipath_pd[0];
+       if (reinit) {
+               struct ipath_portdata *npd;
+
+               /*
+                * Alloc and init a new ipath_portdata for port 0, then
+                * free the old pd.  Could lead to fragmentation, but also
+                * makes later support for hot-swap easier.
+                */
+               npd = create_portdata0(dd);
+               if (npd) {
+                       ipath_free_pddata(dd, pd);
+                       dd->ipath_pd[0] = npd;
+                       pd = npd;
+               } else {
+                       ipath_dev_err(dd, "Unable to allocate portdata"
+                                     " for port 0, failing\n");
+                       ret = -ENOMEM;
+                       goto done;
+               }
+       }
+       ret = ipath_create_rcvhdrq(dd, pd);
+       if (!ret)
+               ret = create_port0_egr(dd);
+       if (ret) {
+               ipath_dev_err(dd, "failed to allocate kernel port's "
+                             "rcvhdrq and/or egr bufs\n");
+               goto done;
+       } else
+               enable_chip(dd, reinit);
+
+       /* after enable_chip, so pioavailshadow setup */
+       ipath_chg_pioavailkernel(dd, 0, piobufs, 1);
+
+       /*
+        * Cancel any possible active sends from early driver load.
+        * Follows early_init because some chips have to initialize
+        * PIO buffers in early_init to avoid false parity errors.
+        * After enable and ipath_chg_pioavailkernel so we can safely
+        * enable pioavail updates and PIOENABLE; packets are now
+        * ready to go out.
+        */
+       ipath_cancel_sends(dd, 1);
+
+       if (!reinit) {
+               /*
+                * Used when we close a port, for DMA already in flight
+                * at close.
+                */
+               dd->ipath_dummy_hdrq = dma_alloc_coherent(
+                       &dd->pcidev->dev, dd->ipath_pd[0]->port_rcvhdrq_size,
+                       &dd->ipath_dummy_hdrq_phys,
+                       gfp_flags);
+               if (!dd->ipath_dummy_hdrq) {
+                       dev_info(&dd->pcidev->dev,
+                               "Couldn't allocate 0x%lx bytes for dummy hdrq\n",
+                               dd->ipath_pd[0]->port_rcvhdrq_size);
+                       /* fallback to just 0'ing */
+                       dd->ipath_dummy_hdrq_phys = 0UL;
+               }
+       }
+
+       /*
+        * cause retrigger of pending interrupts ignored during init,
+        * even if we had errors
+        */
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, 0ULL);
+
+       if (!dd->ipath_stats_timer_active) {
+               /*
+                * first init, or after an admin disable/enable:
+                * set up the stats retrieval timer, even if we had errors
+                * in the last portion of setup
+                */
+               init_timer(&dd->ipath_stats_timer);
+               dd->ipath_stats_timer.function = ipath_get_faststats;
+               dd->ipath_stats_timer.data = (unsigned long) dd;
+               /* every 5 seconds */
+               dd->ipath_stats_timer.expires = jiffies + 5 * HZ;
+               /* takes ~16 seconds to overflow at full IB 4x bandwidth */
+               add_timer(&dd->ipath_stats_timer);
+               dd->ipath_stats_timer_active = 1;
+       }
+
+       /* Set up SendDMA if chip supports it */
+       if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
+               ret = setup_sdma(dd);
+
+       /* Set up HoL state */
+       init_timer(&dd->ipath_hol_timer);
+       dd->ipath_hol_timer.function = ipath_hol_event;
+       dd->ipath_hol_timer.data = (unsigned long)dd;
+       dd->ipath_hol_state = IPATH_HOL_UP;
+
+done:
+       if (!ret) {
+               *dd->ipath_statusp |= IPATH_STATUS_CHIP_PRESENT;
+               if (!dd->ipath_f_intrsetup(dd)) {
+                       /* now we can enable all interrupts from the chip */
+                       ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask,
+                                        -1LL);
+                       /* force re-interrupt of any pending interrupts. */
+                       ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear,
+                                        0ULL);
+                       /* chip is usable; mark it as initialized */
+                       *dd->ipath_statusp |= IPATH_STATUS_INITTED;
+
+                       /*
+                        * setup to verify we get an interrupt, and fallback
+                        * to an alternate if necessary and possible
+                        */
+                       if (!reinit) {
+                               init_timer(&dd->ipath_intrchk_timer);
+                               dd->ipath_intrchk_timer.function =
+                                       verify_interrupt;
+                               dd->ipath_intrchk_timer.data =
+                                       (unsigned long) dd;
+                       }
+                       dd->ipath_intrchk_timer.expires = jiffies + HZ/2;
+                       add_timer(&dd->ipath_intrchk_timer);
+               } else
+                       ipath_dev_err(dd, "No interrupts enabled, couldn't "
+                                     "setup interrupt address\n");
+
+               if (dd->ipath_cfgports > ipath_stats.sps_nports)
+                       /*
+                        * sps_nports is a global, so, we set it to
+                        * the highest number of ports of any of the
+                        * chips we find; we never decrement it, at
+                        * least for now.  Since this might have changed
+                        * over disable/enable or prior to reset, always
+                        * do the check and potentially adjust.
+                        */
+                       ipath_stats.sps_nports = dd->ipath_cfgports;
+       } else
+               ipath_dbg("Failed (%d) to initialize chip\n", ret);
+
+       /* if ret is non-zero, we probably should do some cleanup
+          here... */
+       return ret;
+}
+
+static int ipath_set_kpiobufs(const char *str, struct kernel_param *kp)
+{
+       struct ipath_devdata *dd;
+       unsigned long flags;
+       unsigned short val;
+       int ret;
+
+       ret = ipath_parse_ushort(str, &val);
+
+       spin_lock_irqsave(&ipath_devs_lock, flags);
+
+       if (ret < 0)
+               goto bail;
+
+       if (val == 0) {
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       list_for_each_entry(dd, &ipath_dev_list, ipath_list) {
+               if (dd->ipath_kregbase)
+                       continue;
+               if (val > (dd->ipath_piobcnt2k + dd->ipath_piobcnt4k -
+                          (dd->ipath_cfgports *
+                           IPATH_MIN_USER_PORT_BUFCNT)))
+               {
+                       ipath_dev_err(
+                               dd,
+                               "Allocating %d PIO bufs for kernel leaves "
+                               "too few for %d user ports (%d each)\n",
+                               val, dd->ipath_cfgports - 1,
+                               IPATH_MIN_USER_PORT_BUFCNT);
+                       ret = -EINVAL;
+                       goto bail;
+               }
+               dd->ipath_lastport_piobuf =
+                       dd->ipath_piobcnt2k + dd->ipath_piobcnt4k - val;
+       }
+
+       ipath_kpiobufs = val;
+       ret = 0;
+bail:
+       spin_unlock_irqrestore(&ipath_devs_lock, flags);
+
+       return ret;
+}
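
For context, a setter with this (const char *, struct kernel_param *) signature is the shape module_param_call() expects; a minimal sketch of how such a setter is typically wired up (the getter, permissions and description string here are assumptions for illustration, not necessarily the driver's actual registration, which lives earlier in this file):

    /*
     * Illustrative hookup only: exposes kpiobufs as a writable module
     * parameter whose writes are validated by ipath_set_kpiobufs() above.
     */
    module_param_call(kpiobufs, ipath_set_kpiobufs, param_get_uint,
                      &ipath_kpiobufs, S_IWUSR | S_IRUGO);
    MODULE_PARM_DESC(kpiobufs, "Set number of PIO buffers reserved for the kernel");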
diff --git a/drivers/staging/rdma/ipath/ipath_intr.c b/drivers/staging/rdma/ipath/ipath_intr.c
new file mode 100644 (file)
index 0000000..01ba792
--- /dev/null
@@ -0,0 +1,1273 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+
+#include "ipath_kernel.h"
+#include "ipath_verbs.h"
+#include "ipath_common.h"
+
+
+/*
+ * Called when we might have an error that is specific to a particular
+ * PIO buffer, and may need to cancel that buffer, so it can be re-used.
+ */
+void ipath_disarm_senderrbufs(struct ipath_devdata *dd)
+{
+       u32 piobcnt;
+       unsigned long sbuf[4];
+       /*
+        * it's possible that sendbuffererror could have bits set; might
+        * have already done this as a result of hardware error handling
+        */
+       piobcnt = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
+       /* read these before writing errorclear */
+       sbuf[0] = ipath_read_kreg64(
+               dd, dd->ipath_kregs->kr_sendbuffererror);
+       sbuf[1] = ipath_read_kreg64(
+               dd, dd->ipath_kregs->kr_sendbuffererror + 1);
+       if (piobcnt > 128)
+               sbuf[2] = ipath_read_kreg64(
+                       dd, dd->ipath_kregs->kr_sendbuffererror + 2);
+       if (piobcnt > 192)
+               sbuf[3] = ipath_read_kreg64(
+                       dd, dd->ipath_kregs->kr_sendbuffererror + 3);
+       else
+               sbuf[3] = 0;
+
+       if (sbuf[0] || sbuf[1] || (piobcnt > 128 && (sbuf[2] || sbuf[3]))) {
+               int i;
+               if (ipath_debug & (__IPATH_PKTDBG|__IPATH_DBG) &&
+                       time_after(dd->ipath_lastcancel, jiffies)) {
+                       __IPATH_DBG_WHICH(__IPATH_PKTDBG|__IPATH_DBG,
+                                         "SendbufErrs %lx %lx", sbuf[0],
+                                         sbuf[1]);
+                       if (ipath_debug & __IPATH_PKTDBG && piobcnt > 128)
+                               printk(" %lx %lx ", sbuf[2], sbuf[3]);
+                       printk("\n");
+               }
+
+               for (i = 0; i < piobcnt; i++)
+                       if (test_bit(i, sbuf))
+                               ipath_disarm_piobufs(dd, i, 1);
+               /* ignore armlaunch errs for a bit */
+               dd->ipath_lastcancel = jiffies+3;
+       }
+}
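
A note on the sbuf[] scan above: the four consecutive sendbuffererror kregs are read into one array and treated as a single bitmap, which is why a plain test_bit() walk works; a sketch of the layout (assuming a 64-bit unsigned long, as on the platforms this hardware shipped with):

    /*
     * sbuf[0..3] hold four consecutive 64-bit registers, so they line up
     * with an unsigned long bitmap on a 64-bit kernel:
     *
     *   PIO buffer i in [  0.. 63] -> bit (i % 64) of sbuf[0]
     *   PIO buffer i in [ 64..127] -> bit (i % 64) of sbuf[1]
     *   ... and so on up to 256 buffers, which is exactly what
     *   test_bit(i, sbuf) indexes.
     */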
+
+
+/* These are all rcv-related errors which we want to count for stats */
+#define E_SUM_PKTERRS \
+       (INFINIPATH_E_RHDRLEN | INFINIPATH_E_RBADTID | \
+        INFINIPATH_E_RBADVERSION | INFINIPATH_E_RHDR | \
+        INFINIPATH_E_RLONGPKTLEN | INFINIPATH_E_RSHORTPKTLEN | \
+        INFINIPATH_E_RMAXPKTLEN | INFINIPATH_E_RMINPKTLEN | \
+        INFINIPATH_E_RFORMATERR | INFINIPATH_E_RUNSUPVL | \
+        INFINIPATH_E_RUNEXPCHAR | INFINIPATH_E_REBP)
+
+/* These are all send-related errors which we want to count for stats */
+#define E_SUM_ERRS \
+       (INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SUNEXPERRPKTNUM | \
+        INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_SDROPPEDSMPPKT | \
+        INFINIPATH_E_SMAXPKTLEN | INFINIPATH_E_SUNSUPVL | \
+        INFINIPATH_E_SMINPKTLEN | INFINIPATH_E_SPKTLEN | \
+        INFINIPATH_E_INVALIDADDR)
+
+/*
+ * This is similar to E_SUM_ERRS, but we can't ignore armlaunch errors and
+ * don't ignore errors unrelated to freeze and cancelling buffers.  Armlaunch
+ * can't be ignored because more of them could arrive while we are still
+ * cleaning up, and those need to be cancelled as they happen.
+ */
+#define E_SPKT_ERRS_IGNORE \
+        (INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_SDROPPEDSMPPKT | \
+        INFINIPATH_E_SMAXPKTLEN | INFINIPATH_E_SMINPKTLEN | \
+        INFINIPATH_E_SPKTLEN)
+
+/*
+ * these are errors that can occur when the link changes state while
+ * a packet is being sent or received.  This doesn't cover things
+ * like EBP or VCRC that can be the result of the sender having the
+ * link change state, so that we receive a "known bad" packet.
+ */
+#define E_SUM_LINK_PKTERRS \
+       (INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_SDROPPEDSMPPKT | \
+        INFINIPATH_E_SMINPKTLEN | INFINIPATH_E_SPKTLEN | \
+        INFINIPATH_E_RSHORTPKTLEN | INFINIPATH_E_RMINPKTLEN | \
+        INFINIPATH_E_RUNEXPCHAR)
+
+static u64 handle_e_sum_errs(struct ipath_devdata *dd, ipath_err_t errs)
+{
+       u64 ignore_this_time = 0;
+
+       ipath_disarm_senderrbufs(dd);
+       if ((errs & E_SUM_LINK_PKTERRS) &&
+           !(dd->ipath_flags & IPATH_LINKACTIVE)) {
+               /*
+                * This can happen when SMA is trying to bring the link
+                * up, but the IB link changes state at the "wrong" time.
+                * The IB logic then complains that the packet isn't
+                * valid.  We don't want to confuse people, so we just
+                * don't print them, except at debug
+                */
+               ipath_dbg("Ignoring packet errors %llx, because link not "
+                         "ACTIVE\n", (unsigned long long) errs);
+               ignore_this_time = errs & E_SUM_LINK_PKTERRS;
+       }
+
+       return ignore_this_time;
+}
+
+/* generic hw error messages... */
+#define INFINIPATH_HWE_TXEMEMPARITYERR_MSG(a) \
+       { \
+               .mask = ( INFINIPATH_HWE_TXEMEMPARITYERR_##a <<    \
+                         INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT ),   \
+               .msg = "TXE " #a " Memory Parity"            \
+       }
+#define INFINIPATH_HWE_RXEMEMPARITYERR_MSG(a) \
+       { \
+               .mask = ( INFINIPATH_HWE_RXEMEMPARITYERR_##a <<    \
+                         INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT ),   \
+               .msg = "RXE " #a " Memory Parity"            \
+       }
+
+static const struct ipath_hwerror_msgs ipath_generic_hwerror_msgs[] = {
+       INFINIPATH_HWE_MSG(IBCBUSFRSPCPARITYERR, "IPATH2IB Parity"),
+       INFINIPATH_HWE_MSG(IBCBUSTOSPCPARITYERR, "IB2IPATH Parity"),
+
+       INFINIPATH_HWE_TXEMEMPARITYERR_MSG(PIOBUF),
+       INFINIPATH_HWE_TXEMEMPARITYERR_MSG(PIOPBC),
+       INFINIPATH_HWE_TXEMEMPARITYERR_MSG(PIOLAUNCHFIFO),
+
+       INFINIPATH_HWE_RXEMEMPARITYERR_MSG(RCVBUF),
+       INFINIPATH_HWE_RXEMEMPARITYERR_MSG(LOOKUPQ),
+       INFINIPATH_HWE_RXEMEMPARITYERR_MSG(EAGERTID),
+       INFINIPATH_HWE_RXEMEMPARITYERR_MSG(EXPTID),
+       INFINIPATH_HWE_RXEMEMPARITYERR_MSG(FLAGBUF),
+       INFINIPATH_HWE_RXEMEMPARITYERR_MSG(DATAINFO),
+       INFINIPATH_HWE_RXEMEMPARITYERR_MSG(HDRINFO),
+};
+
+/**
+ * ipath_format_hwmsg - format a single hwerror message
+ * @msg: message buffer
+ * @msgl: length of message buffer
+ * @hwmsg: message to add to message buffer
+ */
+static void ipath_format_hwmsg(char *msg, size_t msgl, const char *hwmsg)
+{
+       strlcat(msg, "[", msgl);
+       strlcat(msg, hwmsg, msgl);
+       strlcat(msg, "]", msgl);
+}
+
+/**
+ * ipath_format_hwerrors - format hardware error messages for display
+ * @hwerrs: hardware errors bit vector
+ * @hwerrmsgs: hardware error descriptions
+ * @nhwerrmsgs: number of hwerrmsgs
+ * @msg: message buffer
+ * @msgl: message buffer length
+ */
+void ipath_format_hwerrors(u64 hwerrs,
+                          const struct ipath_hwerror_msgs *hwerrmsgs,
+                          size_t nhwerrmsgs,
+                          char *msg, size_t msgl)
+{
+       int i;
+       const int glen =
+           ARRAY_SIZE(ipath_generic_hwerror_msgs);
+
+       for (i = 0; i < glen; i++) {
+               if (hwerrs & ipath_generic_hwerror_msgs[i].mask) {
+                       ipath_format_hwmsg(msg, msgl,
+                                          ipath_generic_hwerror_msgs[i].msg);
+               }
+       }
+
+       for (i = 0; i < nhwerrmsgs; i++) {
+               if (hwerrs & hwerrmsgs[i].mask) {
+                       ipath_format_hwmsg(msg, msgl, hwerrmsgs[i].msg);
+               }
+       }
+}
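
A rough usage sketch for the two helpers above; the table name and buffer size are placeholders, not the driver's actual call sites (those live in the chip-specific files and pass their own hwerror tables):

    /* Hypothetical caller, e.g. a chip-specific hardware-error handler. */
    char msg[512];

    msg[0] = '\0';                      /* strlcat() needs a terminated buffer */
    ipath_format_hwerrors(hwerrs,       /* bits read from the hwerror status kreg */
                          chip_hwerror_msgs,            /* placeholder table name */
                          ARRAY_SIZE(chip_hwerror_msgs),
                          msg, sizeof(msg));
    ipath_dev_err(dd, "Hardware error: %s\n", msg);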
+
+/* return the strings for the most common link states */
+static char *ib_linkstate(struct ipath_devdata *dd, u64 ibcs)
+{
+       char *ret;
+       u32 state;
+
+       state = ipath_ib_state(dd, ibcs);
+       if (state == dd->ib_init)
+               ret = "Init";
+       else if (state == dd->ib_arm)
+               ret = "Arm";
+       else if (state == dd->ib_active)
+               ret = "Active";
+       else
+               ret = "Down";
+       return ret;
+}
+
+void signal_ib_event(struct ipath_devdata *dd, enum ib_event_type ev)
+{
+       struct ib_event event;
+
+       event.device = &dd->verbs_dev->ibdev;
+       event.element.port_num = 1;
+       event.event = ev;
+       ib_dispatch_event(&event);
+}
+
+static void handle_e_ibstatuschanged(struct ipath_devdata *dd,
+                                    ipath_err_t errs)
+{
+       u32 ltstate, lstate, ibstate, lastlstate;
+       u32 init = dd->ib_init;
+       u32 arm = dd->ib_arm;
+       u32 active = dd->ib_active;
+       const u64 ibcs = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus);
+
+       lstate = ipath_ib_linkstate(dd, ibcs); /* linkstate */
+       ibstate = ipath_ib_state(dd, ibcs);
+       /* linkstate at last interrupt */
+       lastlstate = ipath_ib_linkstate(dd, dd->ipath_lastibcstat);
+       ltstate = ipath_ib_linktrstate(dd, ibcs); /* link training state */
+
+       /*
+        * Since going into a recovery state causes the link state to go
+        * down and since recovery is transitory, it is better if we "miss"
+        * ever seeing the link training state go into recovery (i.e.,
+        * ignore this transition for link state special handling purposes)
+        * without even updating ipath_lastibcstat.
+        */
+       if ((ltstate == INFINIPATH_IBCS_LT_STATE_RECOVERRETRAIN) ||
+           (ltstate == INFINIPATH_IBCS_LT_STATE_RECOVERWAITRMT) ||
+           (ltstate == INFINIPATH_IBCS_LT_STATE_RECOVERIDLE))
+               goto done;
+
+       /*
+        * if linkstate transitions into INIT from any of the various down
+        * states, or if it transitions from any of the up (INIT or better)
+        * states into any of the down states (except link recovery), then
+        * call the chip-specific code to take appropriate actions.
+        */
+       if (lstate >= INFINIPATH_IBCS_L_STATE_INIT &&
+               lastlstate == INFINIPATH_IBCS_L_STATE_DOWN) {
+               /* transitioned to UP */
+               if (dd->ipath_f_ib_updown(dd, 1, ibcs)) {
+                       /* link came up, so we must no longer be disabled */
+                       dd->ipath_flags &= ~IPATH_IB_LINK_DISABLED;
+                       ipath_cdbg(LINKVERB, "LinkUp handled, skipped\n");
+                       goto skip_ibchange; /* chip-code handled */
+               }
+       } else if ((lastlstate >= INFINIPATH_IBCS_L_STATE_INIT ||
+               (dd->ipath_flags & IPATH_IB_FORCE_NOTIFY)) &&
+               ltstate <= INFINIPATH_IBCS_LT_STATE_CFGWAITRMT &&
+               ltstate != INFINIPATH_IBCS_LT_STATE_LINKUP) {
+               int handled;
+               handled = dd->ipath_f_ib_updown(dd, 0, ibcs);
+               dd->ipath_flags &= ~IPATH_IB_FORCE_NOTIFY;
+               if (handled) {
+                       ipath_cdbg(LINKVERB, "LinkDown handled, skipped\n");
+                       goto skip_ibchange; /* chip-code handled */
+               }
+       }
+
+       /*
+        * Significant enough to always print and get into logs, if it was
+        * unexpected.  If it was a requested state change, we'll have
+        * already cleared the flags, so we won't print this warning
+        */
+       if ((ibstate != arm && ibstate != active) &&
+           (dd->ipath_flags & (IPATH_LINKARMED | IPATH_LINKACTIVE))) {
+               dev_info(&dd->pcidev->dev, "Link state changed from %s "
+                        "to %s\n", (dd->ipath_flags & IPATH_LINKARMED) ?
+                        "ARM" : "ACTIVE", ib_linkstate(dd, ibcs));
+       }
+
+       if (ltstate == INFINIPATH_IBCS_LT_STATE_POLLACTIVE ||
+           ltstate == INFINIPATH_IBCS_LT_STATE_POLLQUIET) {
+               u32 lastlts;
+               lastlts = ipath_ib_linktrstate(dd, dd->ipath_lastibcstat);
+               /*
+                * Ignore cycling back and forth from Polling.Active to
+                * Polling.Quiet while waiting for the other end of the link
+                * to come up, except to try and decide if we are connected
+                * to a live IB device or not.  We will cycle back and
+                * forth between them if no cable is plugged in, the other
+                * device is powered off or disabled, etc.
+                */
+               if (lastlts == INFINIPATH_IBCS_LT_STATE_POLLACTIVE ||
+                   lastlts == INFINIPATH_IBCS_LT_STATE_POLLQUIET) {
+                       if (!(dd->ipath_flags & IPATH_IB_AUTONEG_INPROG) &&
+                            (++dd->ipath_ibpollcnt == 40)) {
+                               dd->ipath_flags |= IPATH_NOCABLE;
+                               *dd->ipath_statusp |=
+                                       IPATH_STATUS_IB_NOCABLE;
+                               ipath_cdbg(LINKVERB, "Set NOCABLE\n");
+                       }
+                       ipath_cdbg(LINKVERB, "POLL change to %s (%x)\n",
+                               ipath_ibcstatus_str[ltstate], ibstate);
+                       goto skip_ibchange;
+               }
+       }
+
+       dd->ipath_ibpollcnt = 0; /* no longer in a Poll* state */
+       ipath_stats.sps_iblink++;
+
+       if (ibstate != init && dd->ipath_lastlinkrecov && ipath_linkrecovery) {
+               u64 linkrecov;
+               linkrecov = ipath_snap_cntr(dd,
+                       dd->ipath_cregs->cr_iblinkerrrecovcnt);
+               if (linkrecov != dd->ipath_lastlinkrecov) {
+                       ipath_dbg("IB linkrecov up %Lx (%s %s) recov %Lu\n",
+                               (unsigned long long) ibcs,
+                               ib_linkstate(dd, ibcs),
+                               ipath_ibcstatus_str[ltstate],
+                               (unsigned long long) linkrecov);
+                       /* and no more until active again */
+                       dd->ipath_lastlinkrecov = 0;
+                       ipath_set_linkstate(dd, IPATH_IB_LINKDOWN);
+                       goto skip_ibchange;
+               }
+       }
+
+       if (ibstate == init || ibstate == arm || ibstate == active) {
+               *dd->ipath_statusp &= ~IPATH_STATUS_IB_NOCABLE;
+               if (ibstate == init || ibstate == arm) {
+                       *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY;
+                       if (dd->ipath_flags & IPATH_LINKACTIVE)
+                               signal_ib_event(dd, IB_EVENT_PORT_ERR);
+               }
+               if (ibstate == arm) {
+                       dd->ipath_flags |= IPATH_LINKARMED;
+                       dd->ipath_flags &= ~(IPATH_LINKUNK |
+                               IPATH_LINKINIT | IPATH_LINKDOWN |
+                               IPATH_LINKACTIVE | IPATH_NOCABLE);
+                       ipath_hol_down(dd);
+               } else if (ibstate == init) {
+                       /*
+                        * set INIT and DOWN.  Down is checked by
+                        * most of the other code, but INIT is
+                        * useful to know in a few places.
+                        */
+                       dd->ipath_flags |= IPATH_LINKINIT |
+                               IPATH_LINKDOWN;
+                       dd->ipath_flags &= ~(IPATH_LINKUNK |
+                               IPATH_LINKARMED | IPATH_LINKACTIVE |
+                               IPATH_NOCABLE);
+                       ipath_hol_down(dd);
+               } else {  /* active */
+                       dd->ipath_lastlinkrecov = ipath_snap_cntr(dd,
+                               dd->ipath_cregs->cr_iblinkerrrecovcnt);
+                       *dd->ipath_statusp |=
+                               IPATH_STATUS_IB_READY | IPATH_STATUS_IB_CONF;
+                       dd->ipath_flags |= IPATH_LINKACTIVE;
+                       dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKINIT
+                               | IPATH_LINKDOWN | IPATH_LINKARMED |
+                               IPATH_NOCABLE);
+                       if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
+                               ipath_restart_sdma(dd);
+                       signal_ib_event(dd, IB_EVENT_PORT_ACTIVE);
+                       /* LED active not handled in chip _f_updown */
+                       dd->ipath_f_setextled(dd, lstate, ltstate);
+                       ipath_hol_up(dd);
+               }
+
+               /*
+                * print after we've already done the work, so as not to
+                * delay the state changes and notifications, for debugging
+                */
+               if (lstate == lastlstate)
+                       ipath_cdbg(LINKVERB, "Unchanged from last: %s "
+                               "(%x)\n", ib_linkstate(dd, ibcs), ibstate);
+               else
+                       ipath_cdbg(VERBOSE, "Unit %u: link up to %s %s (%x)\n",
+                                 dd->ipath_unit, ib_linkstate(dd, ibcs),
+                                 ipath_ibcstatus_str[ltstate],  ibstate);
+       } else { /* down */
+               if (dd->ipath_flags & IPATH_LINKACTIVE)
+                       signal_ib_event(dd, IB_EVENT_PORT_ERR);
+               dd->ipath_flags |= IPATH_LINKDOWN;
+               dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKINIT
+                                    | IPATH_LINKACTIVE |
+                                    IPATH_LINKARMED);
+               *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY;
+               dd->ipath_lli_counter = 0;
+
+               if (lastlstate != INFINIPATH_IBCS_L_STATE_DOWN)
+                       ipath_cdbg(VERBOSE, "Unit %u link state down "
+                                  "(state 0x%x), from %s\n",
+                                  dd->ipath_unit, lstate,
+                                  ib_linkstate(dd, dd->ipath_lastibcstat));
+               else
+                       ipath_cdbg(LINKVERB, "Unit %u link state changed "
+                                  "to %s (0x%x) from down (%x)\n",
+                                  dd->ipath_unit,
+                                  ipath_ibcstatus_str[ltstate],
+                                  ibstate, lastlstate);
+       }
+
+skip_ibchange:
+       dd->ipath_lastibcstat = ibcs;
+done:
+       return;
+}
+
+static void handle_supp_msgs(struct ipath_devdata *dd,
+                            unsigned supp_msgs, char *msg, u32 msgsz)
+{
+       /*
+        * Print the message unless it's ibc status change only, which
+        * happens so often we never want to count it.
+        */
+       if (dd->ipath_lasterror & ~INFINIPATH_E_IBSTATUSCHANGED) {
+               int iserr;
+               ipath_err_t mask;
+               iserr = ipath_decode_err(dd, msg, msgsz,
+                                        dd->ipath_lasterror &
+                                        ~INFINIPATH_E_IBSTATUSCHANGED);
+
+               mask = INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL |
+                       INFINIPATH_E_PKTERRS | INFINIPATH_E_SDMADISABLED;
+
+               /* if we're in debug, then don't mask SDMADISABLED msgs */
+               if (ipath_debug & __IPATH_DBG)
+                       mask &= ~INFINIPATH_E_SDMADISABLED;
+
+               if (dd->ipath_lasterror & ~mask)
+                       ipath_dev_err(dd, "Suppressed %u messages for "
+                                     "fast-repeating errors (%s) (%llx)\n",
+                                     supp_msgs, msg,
+                                     (unsigned long long)
+                                     dd->ipath_lasterror);
+               else {
+                       /*
+                        * rcvegrfull and rcvhdrqfull are "normal", for some
+                        * types of processes (mostly benchmarks) that send
+                        * huge numbers of messages, while not processing
+                        * them. So only complain about these at debug
+                        * level.
+                        */
+                       if (iserr)
+                               ipath_dbg("Suppressed %u messages for %s\n",
+                                         supp_msgs, msg);
+                       else
+                               ipath_cdbg(ERRPKT,
+                                       "Suppressed %u messages for %s\n",
+                                         supp_msgs, msg);
+               }
+       }
+}
+
+static unsigned handle_frequent_errors(struct ipath_devdata *dd,
+                                      ipath_err_t errs, char *msg,
+                                      u32 msgsz, int *noprint)
+{
+       unsigned long nc;
+       static unsigned long nextmsg_time;
+       static unsigned nmsgs, supp_msgs;
+
+       /*
+        * Throttle back "fast" messages to no more than 10 per 5 seconds.
+        * This isn't perfect, but it's a reasonable heuristic. If we get
+        * more than 10, give a 6x longer delay.
+        */
+       nc = jiffies;
+       if (nmsgs > 10) {
+               if (time_before(nc, nextmsg_time)) {
+                       *noprint = 1;
+                       if (!supp_msgs++)
+                               nextmsg_time = nc + HZ * 3;
+               }
+               else if (supp_msgs) {
+                       handle_supp_msgs(dd, supp_msgs, msg, msgsz);
+                       supp_msgs = 0;
+                       nmsgs = 0;
+               }
+       }
+       else if (!nmsgs++ || time_after(nc, nextmsg_time))
+               nextmsg_time = nc + HZ / 2;
+
+       return supp_msgs;
+}
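
To make the throttle above concrete, a hand-traced timeline (timings are illustrative and depend on HZ and on when errors actually arrive):

    /*
     * t ~ 0.00s  error  #1       -> printed; nextmsg_time = now + HZ/2
     * t ~ 0.01s  errors #2..#11  -> printed; nmsgs climbs to 11
     * t ~ 0.02s  error  #12      -> nmsgs > 10 and still inside the window:
     *                               suppressed, nextmsg_time = now + 3*HZ
     * t < ~3s    further errors  -> suppressed; supp_msgs keeps counting
     * t > ~3s    next error      -> handle_supp_msgs() emits one summary
     *                               line and the counters are reset
     */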
+
+static void handle_sdma_errors(struct ipath_devdata *dd, ipath_err_t errs)
+{
+       unsigned long flags;
+       int expected;
+
+       if (ipath_debug & __IPATH_DBG) {
+               char msg[128];
+               ipath_decode_err(dd, msg, sizeof msg, errs &
+                       INFINIPATH_E_SDMAERRS);
+               ipath_dbg("errors %lx (%s)\n", (unsigned long)errs, msg);
+       }
+       if (ipath_debug & __IPATH_VERBDBG) {
+               unsigned long tl, hd, status, lengen;
+               tl = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmatail);
+               hd = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmahead);
+               status = ipath_read_kreg64(dd,
+                       dd->ipath_kregs->kr_senddmastatus);
+               lengen = ipath_read_kreg64(dd,
+                       dd->ipath_kregs->kr_senddmalengen);
+               ipath_cdbg(VERBOSE, "sdma tl 0x%lx hd 0x%lx status 0x%lx "
+                       "lengen 0x%lx\n", tl, hd, status, lengen);
+       }
+
+       spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+       __set_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status);
+       expected = test_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status);
+       spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+       if (!expected)
+               ipath_cancel_sends(dd, 1);
+}
+
+static void handle_sdma_intr(struct ipath_devdata *dd, u64 istat)
+{
+       unsigned long flags;
+       int expected;
+
+       if ((istat & INFINIPATH_I_SDMAINT) &&
+           !test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
+               ipath_sdma_intr(dd);
+
+       if (istat & INFINIPATH_I_SDMADISABLED) {
+               expected = test_bit(IPATH_SDMA_ABORTING,
+                       &dd->ipath_sdma_status);
+               ipath_dbg("%s SDmaDisabled intr\n",
+                       expected ? "expected" : "unexpected");
+               spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+               __set_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status);
+               spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+               if (!expected)
+                       ipath_cancel_sends(dd, 1);
+               if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
+                       tasklet_hi_schedule(&dd->ipath_sdma_abort_task);
+       }
+}
+
+static int handle_hdrq_full(struct ipath_devdata *dd)
+{
+       int chkerrpkts = 0;
+       u32 hd, tl;
+       u32 i;
+
+       ipath_stats.sps_hdrqfull++;
+       for (i = 0; i < dd->ipath_cfgports; i++) {
+               struct ipath_portdata *pd = dd->ipath_pd[i];
+
+               if (i == 0) {
+                       /*
+                        * For kernel receive queues, we just want to know
+                        * if there are packets in the queue that we can
+                        * process.
+                        */
+                       if (pd->port_head != ipath_get_hdrqtail(pd))
+                               chkerrpkts |= 1 << i;
+                       continue;
+               }
+
+               /* Skip if user context is not open */
+               if (!pd || !pd->port_cnt)
+                       continue;
+
+               /* Don't report the same point multiple times. */
+               if (dd->ipath_flags & IPATH_NODMA_RTAIL)
+                       tl = ipath_read_ureg32(dd, ur_rcvhdrtail, i);
+               else
+                       tl = ipath_get_rcvhdrtail(pd);
+               if (tl == pd->port_lastrcvhdrqtail)
+                       continue;
+
+               hd = ipath_read_ureg32(dd, ur_rcvhdrhead, i);
+               if (hd == (tl + 1) || (!hd && tl == dd->ipath_hdrqlast)) {
+                       pd->port_lastrcvhdrqtail = tl;
+                       pd->port_hdrqfull++;
+                       /* flush hdrqfull so that poll() sees it */
+                       wmb();
+                       wake_up_interruptible(&pd->port_wait);
+               }
+       }
+
+       return chkerrpkts;
+}
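
The full-queue test above is the usual one-slot-reserved ring convention; with purely illustrative numbers (an 8-entry queue, so the last index is 7):

    /*
     *   full when  hd == tl + 1            e.g. hd = 5, tl = 4
     *   full when  hd == 0 && tl == last   e.g. hd = 0, tl = 7 (wrapped)
     *   otherwise there is still room      e.g. hd = 5, tl = 2
     */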
+
+static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs)
+{
+       char msg[128];
+       u64 ignore_this_time = 0;
+       u64 iserr = 0;
+       int chkerrpkts = 0, noprint = 0;
+       unsigned supp_msgs;
+       int log_idx;
+
+       /*
+        * don't report errors that are masked, either at init
+        * (not set in ipath_errormask), or temporarily (set in
+        * ipath_maskederrs)
+        */
+       errs &= dd->ipath_errormask & ~dd->ipath_maskederrs;
+
+       supp_msgs = handle_frequent_errors(dd, errs, msg, (u32)sizeof msg,
+               &noprint);
+
+       /* do these first, they are most important */
+       if (errs & INFINIPATH_E_HARDWARE) {
+               /* reuse same msg buf */
+               dd->ipath_f_handle_hwerrors(dd, msg, sizeof msg);
+       } else {
+               u64 mask;
+               for (log_idx = 0; log_idx < IPATH_EEP_LOG_CNT; ++log_idx) {
+                       mask = dd->ipath_eep_st_masks[log_idx].errs_to_log;
+                       if (errs & mask)
+                               ipath_inc_eeprom_err(dd, log_idx, 1);
+               }
+       }
+
+       if (errs & INFINIPATH_E_SDMAERRS)
+               handle_sdma_errors(dd, errs);
+
+       if (!noprint && (errs & ~dd->ipath_e_bitsextant))
+               ipath_dev_err(dd, "error interrupt with unknown errors "
+                             "%llx set\n", (unsigned long long)
+                             (errs & ~dd->ipath_e_bitsextant));
+
+       if (errs & E_SUM_ERRS)
+               ignore_this_time = handle_e_sum_errs(dd, errs);
+       else if ((errs & E_SUM_LINK_PKTERRS) &&
+           !(dd->ipath_flags & IPATH_LINKACTIVE)) {
+               /*
+                * This can happen when SMA is trying to bring the link
+                * up, but the IB link changes state at the "wrong" time.
+                * The IB logic then complains that the packet isn't
+                * valid.  We don't want to confuse people, so we just
+                * don't print them, except at debug
+                */
+               ipath_dbg("Ignoring packet errors %llx, because link not "
+                         "ACTIVE\n", (unsigned long long) errs);
+               ignore_this_time = errs & E_SUM_LINK_PKTERRS;
+       }
+
+       if (supp_msgs == 250000) {
+               int s_iserr;
+               /*
+                * It's not entirely reasonable to assume that the errors set
+                * in the last clear period are all responsible for the
+                * problem, but the alternative is to assume they are the only
+                * ones on this particular interrupt, which also isn't great
+                */
+               dd->ipath_maskederrs |= dd->ipath_lasterror | errs;
+
+               dd->ipath_errormask &= ~dd->ipath_maskederrs;
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
+                                dd->ipath_errormask);
+               s_iserr = ipath_decode_err(dd, msg, sizeof msg,
+                                          dd->ipath_maskederrs);
+
+               if (dd->ipath_maskederrs &
+                   ~(INFINIPATH_E_RRCVEGRFULL |
+                     INFINIPATH_E_RRCVHDRFULL | INFINIPATH_E_PKTERRS))
+                       ipath_dev_err(dd, "Temporarily disabling "
+                           "error(s) %llx reporting; too frequent (%s)\n",
+                               (unsigned long long) dd->ipath_maskederrs,
+                               msg);
+               else {
+                       /*
+                        * rcvegrfull and rcvhdrqfull are "normal",
+                        * for some types of processes (mostly benchmarks)
+                        * that send huge numbers of messages, while not
+                        * processing them.  So only complain about
+                        * these at debug level.
+                        */
+                       if (s_iserr)
+                               ipath_dbg("Temporarily disabling reporting "
+                                   "too frequent queue full errors (%s)\n",
+                                   msg);
+                       else
+                               ipath_cdbg(ERRPKT,
+                                   "Temporarily disabling reporting too"
+                                   " frequent packet errors (%s)\n",
+                                   msg);
+               }
+
+               /*
+                * Re-enable the masked errors after around 3 minutes, in
+                * ipath_get_faststats().  If we have a series of fast
+                * repeating but different errors, the interval will keep
+                * stretching out, but that's OK, since that situation is
+                * pretty catastrophic anyway.
+                */
+               dd->ipath_unmasktime = jiffies + HZ * 180;
+       }
+
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, errs);
+       if (ignore_this_time)
+               errs &= ~ignore_this_time;
+       if (errs & ~dd->ipath_lasterror) {
+               errs &= ~dd->ipath_lasterror;
+               /* never suppress duplicate hwerrors or ibstatuschange */
+               dd->ipath_lasterror |= errs &
+                       ~(INFINIPATH_E_HARDWARE |
+                         INFINIPATH_E_IBSTATUSCHANGED);
+       }
+
+       if (errs & INFINIPATH_E_SENDSPECIALTRIGGER) {
+               dd->ipath_spectriggerhit++;
+               ipath_dbg("%lu special trigger hits\n",
+                       dd->ipath_spectriggerhit);
+       }
+
+       /* likely due to cancel; so suppress message unless verbose */
+       if ((errs & (INFINIPATH_E_SPKTLEN | INFINIPATH_E_SPIOARMLAUNCH)) &&
+               time_after(dd->ipath_lastcancel, jiffies)) {
+               /* armlaunch takes precedence; it often causes both. */
+               ipath_cdbg(VERBOSE,
+                       "Suppressed %s error (%llx) after sendbuf cancel\n",
+                       (errs &  INFINIPATH_E_SPIOARMLAUNCH) ?
+                       "armlaunch" : "sendpktlen", (unsigned long long)errs);
+               errs &= ~(INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SPKTLEN);
+       }
+
+       if (!errs)
+               return 0;
+
+       if (!noprint) {
+               ipath_err_t mask;
+               /*
+                * The ones we mask off are handled specially below
+                * or above.  Also mask SDMADISABLED by default as it
+                * is too chatty.
+                */
+               mask = INFINIPATH_E_IBSTATUSCHANGED |
+                       INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL |
+                       INFINIPATH_E_HARDWARE | INFINIPATH_E_SDMADISABLED;
+
+               /* if we're in debug, then don't mask SDMADISABLED msgs */
+               if (ipath_debug & __IPATH_DBG)
+                       mask &= ~INFINIPATH_E_SDMADISABLED;
+
+               ipath_decode_err(dd, msg, sizeof msg, errs & ~mask);
+       } else
+               /* so we don't need if (!noprint) at strlcat's below */
+               *msg = 0;
+
+       if (errs & E_SUM_PKTERRS) {
+               ipath_stats.sps_pkterrs++;
+               chkerrpkts = 1;
+       }
+       if (errs & E_SUM_ERRS)
+               ipath_stats.sps_errs++;
+
+       if (errs & (INFINIPATH_E_RICRC | INFINIPATH_E_RVCRC)) {
+               ipath_stats.sps_crcerrs++;
+               chkerrpkts = 1;
+       }
+       iserr = errs & ~(E_SUM_PKTERRS | INFINIPATH_E_PKTERRS);
+
+
+       /*
+        * We don't want to print these two as they happen, or we can make
+        * the situation even worse, because it takes so long to print
+        * messages to serial consoles.  Kernel ports get printed from
+        * fast_stats, no more than every 5 seconds, user ports get printed
+        * on close
+        */
+       if (errs & INFINIPATH_E_RRCVHDRFULL)
+               chkerrpkts |= handle_hdrq_full(dd);
+       if (errs & INFINIPATH_E_RRCVEGRFULL) {
+               struct ipath_portdata *pd = dd->ipath_pd[0];
+
+               /*
+                * since this is of less importance and not likely to
+                * happen without also getting hdrfull, only count
+                * occurrences; don't check each port (or even the kernel
+                * vs user)
+                */
+               ipath_stats.sps_etidfull++;
+               if (pd->port_head != ipath_get_hdrqtail(pd))
+                       chkerrpkts |= 1;
+       }
+
+       /*
+        * do this before IBSTATUSCHANGED, in case both bits are set in a
+        * single interrupt; we want the STATUSCHANGE to "win", so that our
+        * internal copy of the state machine is updated correctly
+        */
+       if (errs & INFINIPATH_E_RIBLOSTLINK) {
+               /*
+                * force through block below
+                */
+               errs |= INFINIPATH_E_IBSTATUSCHANGED;
+               ipath_stats.sps_iblink++;
+               dd->ipath_flags |= IPATH_LINKDOWN;
+               dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKINIT
+                                    | IPATH_LINKARMED | IPATH_LINKACTIVE);
+               *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY;
+
+               ipath_dbg("Lost link, link now down (%s)\n",
+                       ipath_ibcstatus_str[ipath_read_kreg64(dd,
+                       dd->ipath_kregs->kr_ibcstatus) & 0xf]);
+       }
+       if (errs & INFINIPATH_E_IBSTATUSCHANGED)
+               handle_e_ibstatuschanged(dd, errs);
+
+       if (errs & INFINIPATH_E_RESET) {
+               if (!noprint)
+                       ipath_dev_err(dd, "Got reset, requires re-init "
+                                     "(unload and reload driver)\n");
+               dd->ipath_flags &= ~IPATH_INITTED;      /* needs re-init */
+               /* mark as having had error */
+               *dd->ipath_statusp |= IPATH_STATUS_HWERROR;
+               *dd->ipath_statusp &= ~IPATH_STATUS_IB_CONF;
+       }
+
+       if (!noprint && *msg) {
+               if (iserr)
+                       ipath_dev_err(dd, "%s error\n", msg);
+       }
+       if (dd->ipath_state_wanted & dd->ipath_flags) {
+               ipath_cdbg(VERBOSE, "driver wanted state %x, iflags now %x, "
+                          "waking\n", dd->ipath_state_wanted,
+                          dd->ipath_flags);
+               wake_up_interruptible(&ipath_state_wait);
+       }
+
+       return chkerrpkts;
+}
+
+/*
+ * try to cleanup as much as possible for anything that might have gone
+ * wrong while in freeze mode, such as pio buffers being written by user
+ * processes (causing armlaunch), send errors due to going into freeze mode,
+ * etc., and try to avoid causing extra interrupts while doing so.
+ * Forcibly update the in-memory pioavail register copies after cleanup
+ * because the chip won't do it while in freeze mode (the register values
+ * themselves are kept correct).
+ * Make sure that we don't lose any important interrupts by using the chip
+ * feature that says that writing 0 to a bit in *clear that is set in
+ * *status will cause an interrupt to be generated again (if allowed by
+ * the *mask value).
+ */
+void ipath_clear_freeze(struct ipath_devdata *dd)
+{
+       /* disable error interrupts, to avoid confusion */
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, 0ULL);
+
+       /* also disable interrupts; errormask is sometimes overwritten */
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL);
+
+       ipath_cancel_sends(dd, 1);
+
+       /* clear the freeze, and be sure chip saw it */
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
+                        dd->ipath_control);
+       ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+
+       /* force in-memory update now we are out of freeze */
+       ipath_force_pio_avail_update(dd);
+
+       /*
+        * force new interrupt if any hwerr, error or interrupt bits are
+        * still set, and clear "safe" send packet errors related to freeze
+        * and cancelling sends.  Re-enable error interrupts before possible
+        * force of re-interrupt on pending interrupts.
+        */
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, 0ULL);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear,
+               E_SPKT_ERRS_IGNORE);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
+               dd->ipath_errormask);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, -1LL);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, 0ULL);
+}
+
+
+/* this is separate to allow for better optimization of ipath_intr() */
+
+static noinline void ipath_bad_intr(struct ipath_devdata *dd, u32 *unexpectp)
+{
+       /*
+        * This sometimes happens during driver init and unload; we don't
+        * want to process any interrupts at that point
+        */
+
+       /* this is just a bandaid, not a fix, if something goes badly
+        * wrong */
+       if (++*unexpectp > 100) {
+               if (++*unexpectp > 105) {
+                       /*
+                        * ok, we must be taking somebody else's interrupts,
+                        * due to a messed up mptable and/or PIRQ table, so
+                        * unregister the interrupt.  We've seen this during
+                        * linuxbios development work, and it may happen in
+                        * the future again.
+                        */
+                       if (dd->pcidev && dd->ipath_irq) {
+                               ipath_dev_err(dd, "Now %u unexpected "
+                                             "interrupts, unregistering "
+                                             "interrupt handler\n",
+                                             *unexpectp);
+                               ipath_dbg("free_irq of irq %d\n",
+                                         dd->ipath_irq);
+                               dd->ipath_f_free_irq(dd);
+                       }
+               }
+               if (ipath_read_ireg(dd, dd->ipath_kregs->kr_intmask)) {
+                       ipath_dev_err(dd, "%u unexpected interrupts, "
+                                     "disabling interrupts completely\n",
+                                     *unexpectp);
+                       /*
+                        * disable all interrupts, something is very wrong
+                        */
+                       ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask,
+                                        0ULL);
+               }
+       } else if (*unexpectp > 1)
+               ipath_dbg("Interrupt when not ready, should not happen, "
+                         "ignoring\n");
+}
+
+static noinline void ipath_bad_regread(struct ipath_devdata *dd)
+{
+       static int allbits;
+
+       /* separate routine, for better optimization of ipath_intr() */
+
+       /*
+        * We print the message and disable interrupts, in hope of
+        * having a better chance of debugging the problem.
+        */
+       ipath_dev_err(dd,
+                     "Read of interrupt status failed (all bits set)\n");
+       if (allbits++) {
+               /* disable all interrupts, something is very wrong */
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL);
+               if (allbits == 2) {
+                       ipath_dev_err(dd, "Still bad interrupt status, "
+                                     "unregistering interrupt\n");
+                       dd->ipath_f_free_irq(dd);
+               } else if (allbits > 2) {
+                       if ((allbits % 10000) == 0)
+                               printk(".");
+               } else
+                       ipath_dev_err(dd, "Disabling interrupts, "
+                                     "multiple errors\n");
+       }
+}
+
+static void handle_layer_pioavail(struct ipath_devdata *dd)
+{
+       unsigned long flags;
+       int ret;
+
+       ret = ipath_ib_piobufavail(dd->verbs_dev);
+       if (ret > 0)
+               goto set;
+
+       return;
+set:
+       spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+       dd->ipath_sendctrl |= INFINIPATH_S_PIOINTBUFAVAIL;
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+                        dd->ipath_sendctrl);
+       ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+       spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+}
+
+/*
+ * Handle receive interrupts for user ports; this means a user
+ * process was waiting for a packet to arrive, and didn't want
+ * to poll
+ */
+static void handle_urcv(struct ipath_devdata *dd, u64 istat)
+{
+       u64 portr;
+       int i;
+       int rcvdint = 0;
+
+       /*
+        * test_and_clear_bit(IPATH_PORT_WAITING_RCV) and
+        * test_and_clear_bit(IPATH_PORT_WAITING_URG) below
+        * would both like timely updates of the bits so that
+        * we don't pass them by unnecessarily.  the rmb()
+        * here ensures that we see them promptly -- the
+        * corresponding wmb()'s are in ipath_poll_urgent()
+        * and ipath_poll_next()...
+        */
+       rmb();
+       portr = ((istat >> dd->ipath_i_rcvavail_shift) &
+                dd->ipath_i_rcvavail_mask) |
+               ((istat >> dd->ipath_i_rcvurg_shift) &
+                dd->ipath_i_rcvurg_mask);
+       for (i = 1; i < dd->ipath_cfgports; i++) {
+               struct ipath_portdata *pd = dd->ipath_pd[i];
+
+               if (portr & (1 << i) && pd && pd->port_cnt) {
+                       if (test_and_clear_bit(IPATH_PORT_WAITING_RCV,
+                                              &pd->port_flag)) {
+                               clear_bit(i + dd->ipath_r_intravail_shift,
+                                         &dd->ipath_rcvctrl);
+                               wake_up_interruptible(&pd->port_wait);
+                               rcvdint = 1;
+                       } else if (test_and_clear_bit(IPATH_PORT_WAITING_URG,
+                                                     &pd->port_flag)) {
+                               pd->port_urgent++;
+                               wake_up_interruptible(&pd->port_wait);
+                       }
+               }
+       }
+       if (rcvdint) {
+               /* only want to take one interrupt, so turn off the rcv
+                * interrupt for all the ports for which we set rcv_waiting
+                * (but never for the kernel port)
+                */
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+                                dd->ipath_rcvctrl);
+       }
+}
+
+irqreturn_t ipath_intr(int irq, void *data)
+{
+       struct ipath_devdata *dd = data;
+       u64 istat, chk0rcv = 0;
+       ipath_err_t estat = 0;
+       irqreturn_t ret;
+       static unsigned unexpected = 0;
+       u64 kportrbits;
+
+       ipath_stats.sps_ints++;
+
+       if (dd->ipath_int_counter != (u32) -1)
+               dd->ipath_int_counter++;
+
+       if (!(dd->ipath_flags & IPATH_PRESENT)) {
+               /*
+                * This return value is not great, but we do not want the
+                * interrupt core code to remove our interrupt handler
+                * because we don't appear to be handling an interrupt
+                * during a chip reset.
+                */
+               return IRQ_HANDLED;
+       }
+
+       /*
+        * this needs to be flags & INITTED, not statusp, so we keep
+        * taking interrupts even after the link goes down, etc.
+        * Also, we *must* clear the interrupt at some point, or we won't
+        * take it again, which can be really bad for errors, etc...
+        */
+
+       if (!(dd->ipath_flags & IPATH_INITTED)) {
+               ipath_bad_intr(dd, &unexpected);
+               ret = IRQ_NONE;
+               goto bail;
+       }
+
+       istat = ipath_read_ireg(dd, dd->ipath_kregs->kr_intstatus);
+
+       if (unlikely(!istat)) {
+               ipath_stats.sps_nullintr++;
+               ret = IRQ_NONE; /* not our interrupt, or already handled */
+               goto bail;
+       }
+       if (unlikely(istat == -1)) {
+               ipath_bad_regread(dd);
+               /* don't know if it was our interrupt or not */
+               ret = IRQ_NONE;
+               goto bail;
+       }
+
+       if (unexpected)
+               unexpected = 0;
+
+       if (unlikely(istat & ~dd->ipath_i_bitsextant))
+               ipath_dev_err(dd,
+                             "interrupt with unknown interrupts %Lx set\n",
+                             (unsigned long long)
+                             istat & ~dd->ipath_i_bitsextant);
+       else if (istat & ~INFINIPATH_I_ERROR) /* errors do own printing */
+               ipath_cdbg(VERBOSE, "intr stat=0x%Lx\n",
+                       (unsigned long long) istat);
+
+       if (istat & INFINIPATH_I_ERROR) {
+               ipath_stats.sps_errints++;
+               estat = ipath_read_kreg64(dd,
+                                         dd->ipath_kregs->kr_errorstatus);
+               if (!estat)
+                       dev_info(&dd->pcidev->dev, "error interrupt (%Lx), "
+                                "but no error bits set!\n",
+                                (unsigned long long) istat);
+               else if (estat == -1LL)
+                       /*
+                        * should we try clearing all, or hope next read
+                        * works?
+                        */
+                       ipath_dev_err(dd, "Read of error status failed "
+                                     "(all bits set); ignoring\n");
+               else
+                       chk0rcv |= handle_errors(dd, estat);
+       }
+
+       if (istat & INFINIPATH_I_GPIO) {
+               /*
+                * GPIO interrupts fall in two broad classes:
+                * GPIO_2 indicates (on some HT4xx boards) that a packet
+                *        has arrived for Port 0. Checking for this
+                *        is controlled by flag IPATH_GPIO_INTR.
+                * GPIO_3..5 on IBA6120 Rev2 and IBA6110 Rev4 chips indicate
+                *        errors that we need to count. Checking for this
+                *        is controlled by flag IPATH_GPIO_ERRINTRS.
+                */
+               u32 gpiostatus;
+               u32 to_clear = 0;
+
+               gpiostatus = ipath_read_kreg32(
+                       dd, dd->ipath_kregs->kr_gpio_status);
+               /* First the error-counter case. */
+               if ((gpiostatus & IPATH_GPIO_ERRINTR_MASK) &&
+                   (dd->ipath_flags & IPATH_GPIO_ERRINTRS)) {
+                       /* want to clear the bits we see asserted. */
+                       to_clear |= (gpiostatus & IPATH_GPIO_ERRINTR_MASK);
+
+                       /*
+                        * Count appropriately, clear bits out of our copy,
+                        * as they have been "handled".
+                        */
+                       if (gpiostatus & (1 << IPATH_GPIO_RXUVL_BIT)) {
+                               ipath_dbg("FlowCtl on UnsupVL\n");
+                               dd->ipath_rxfc_unsupvl_errs++;
+                       }
+                       if (gpiostatus & (1 << IPATH_GPIO_OVRUN_BIT)) {
+                               ipath_dbg("Overrun Threshold exceeded\n");
+                               dd->ipath_overrun_thresh_errs++;
+                       }
+                       if (gpiostatus & (1 << IPATH_GPIO_LLI_BIT)) {
+                               ipath_dbg("Local Link Integrity error\n");
+                               dd->ipath_lli_errs++;
+                       }
+                       gpiostatus &= ~IPATH_GPIO_ERRINTR_MASK;
+               }
+               /* Now the Port0 Receive case */
+               if ((gpiostatus & (1 << IPATH_GPIO_PORT0_BIT)) &&
+                   (dd->ipath_flags & IPATH_GPIO_INTR)) {
+                       /*
+                        * GPIO status bit 2 is set, and we expected it.
+                        * Clear it and note it in chk0rcv.
+                        * This probably only happens if a Port0 pkt
+                        * arrives at _just_ the wrong time, and we
+                        * handle that by setting chk0rcv.
+                        */
+                       to_clear |= (1 << IPATH_GPIO_PORT0_BIT);
+                       gpiostatus &= ~(1 << IPATH_GPIO_PORT0_BIT);
+                       chk0rcv = 1;
+               }
+               if (gpiostatus) {
+                       /*
+                        * Some unexpected bits remain. If they could have
+                        * caused the interrupt, complain and clear.
+                        * To avoid repetition of this condition, also clear
+                        * the mask. It is almost certainly due to error.
+                        */
+                       const u32 mask = (u32) dd->ipath_gpio_mask;
+
+                       if (mask & gpiostatus) {
+                               ipath_dbg("Unexpected GPIO IRQ bits %x\n",
+                                 gpiostatus & mask);
+                               to_clear |= (gpiostatus & mask);
+                               dd->ipath_gpio_mask &= ~(gpiostatus & mask);
+                               ipath_write_kreg(dd,
+                                       dd->ipath_kregs->kr_gpio_mask,
+                                       dd->ipath_gpio_mask);
+                       }
+               }
+               if (to_clear) {
+                       ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_clear,
+                                       (u64) to_clear);
+               }
+       }
+
+       /*
+        * Clear the interrupt bits we found set, unless they are receive
+        * related, in which case we already cleared them above, and don't
+        * want to clear them again, because we might lose an interrupt.
+        * Clear it early, so we know the chip will have seen this by
+        * the time we process the queue, and will re-interrupt if necessary.
+        * The processor itself won't take the interrupt again until we return.
+        */
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, istat);
+
+       /*
+        * Handle kernel receive queues before checking for pio buffers
+        * available since receives can overflow; piobuf waiters can afford
+        * a few extra cycles, since they were waiting anyway, and users
+        * waiting for receive are at the bottom.
+        */
+       kportrbits = (1ULL << dd->ipath_i_rcvavail_shift) |
+               (1ULL << dd->ipath_i_rcvurg_shift);
+       if (chk0rcv || (istat & kportrbits)) {
+               istat &= ~kportrbits;
+               ipath_kreceive(dd->ipath_pd[0]);
+       }
+
+       if (istat & ((dd->ipath_i_rcvavail_mask << dd->ipath_i_rcvavail_shift) |
+                    (dd->ipath_i_rcvurg_mask << dd->ipath_i_rcvurg_shift)))
+               handle_urcv(dd, istat);
+
+       if (istat & (INFINIPATH_I_SDMAINT | INFINIPATH_I_SDMADISABLED))
+               handle_sdma_intr(dd, istat);
+
+       if (istat & INFINIPATH_I_SPIOBUFAVAIL) {
+               unsigned long flags;
+
+               spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+               dd->ipath_sendctrl &= ~INFINIPATH_S_PIOINTBUFAVAIL;
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+                                dd->ipath_sendctrl);
+               ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+               spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+
+               /* always process; sdma verbs uses PIO for acks and VL15  */
+               handle_layer_pioavail(dd);
+       }
+
+       ret = IRQ_HANDLED;
+
+bail:
+       return ret;
+}
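
The tail of the handler above separates kernel-port (port 0) receive work from user-port receive interrupts by masking istat against kportrbits before testing the wider rcvavail/rcvurg masks. A minimal standalone sketch of that bit arithmetic follows; the shift and mask values here are invented for illustration (the driver reads the real ones from the chip and keeps them in ipath_devdata).

#include <stdint.h>
#include <stdio.h>

/* Illustrative values only; the real shifts/masks live in ipath_devdata. */
#define RCVAVAIL_SHIFT 0
#define RCVAVAIL_MASK  0xffffULL
#define RCVURG_SHIFT   32
#define RCVURG_MASK    0xffffULL

int main(void)
{
	uint64_t istat = (1ULL << RCVAVAIL_SHIFT) | (5ULL << RCVURG_SHIFT);
	/* port 0 (kernel) bits: the lowest bit of each group */
	uint64_t kportrbits = (1ULL << RCVAVAIL_SHIFT) | (1ULL << RCVURG_SHIFT);

	if (istat & kportrbits) {
		istat &= ~kportrbits;      /* don't hand these to user ports */
		printf("kernel port 0 receive work pending\n");
	}
	/* any remaining avail/urg bits belong to user ports */
	if (istat & ((RCVAVAIL_MASK << RCVAVAIL_SHIFT) |
		     (RCVURG_MASK << RCVURG_SHIFT)))
		printf("user port receive interrupt, istat=0x%llx\n",
		       (unsigned long long)istat);
	return 0;
}
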
diff --git a/drivers/staging/rdma/ipath/ipath_kernel.h b/drivers/staging/rdma/ipath/ipath_kernel.h
new file mode 100644 (file)
index 0000000..f0f9471
--- /dev/null
@@ -0,0 +1,1373 @@
+#ifndef _IPATH_KERNEL_H
+#define _IPATH_KERNEL_H
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * This header file is the base header file for infinipath kernel code
+ * ipath_user.h serves a similar purpose for user code.
+ */
+
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/scatterlist.h>
+#include <asm/io.h>
+#include <rdma/ib_verbs.h>
+
+#include "ipath_common.h"
+#include "ipath_debug.h"
+#include "ipath_registers.h"
+
+/* only s/w major version of InfiniPath we can handle */
+#define IPATH_CHIP_VERS_MAJ 2U
+
+/* don't care about this except printing */
+#define IPATH_CHIP_VERS_MIN 0U
+
+/* temporary, maybe always */
+extern struct infinipath_stats ipath_stats;
+
+#define IPATH_CHIP_SWVERSION IPATH_CHIP_VERS_MAJ
+/*
+ * First-cut criterion for "device is active" is
+ * two thousand dwords combined Tx, Rx traffic per
+ * 5-second interval. SMA packets are 64 dwords,
+ * and occur "a few per second", presumably each way.
+ */
+#define IPATH_TRAFFIC_ACTIVE_THRESHOLD (2000)
+/*
+ * Struct used to indicate which errors are logged in each of the
+ * error-counters that are logged to EEPROM. A counter is incremented
+ * _once_ (saturating at 255) for each event with any bits set in
+ * the error or hwerror register masks below.
+ */
+#define IPATH_EEP_LOG_CNT (4)
+struct ipath_eep_log_mask {
+       u64 errs_to_log;
+       u64 hwerrs_to_log;
+};
+
+struct ipath_portdata {
+       void **port_rcvegrbuf;
+       dma_addr_t *port_rcvegrbuf_phys;
+       /* rcvhdrq base, needs mmap before useful */
+       void *port_rcvhdrq;
+       /* kernel virtual address where hdrqtail is updated */
+       void *port_rcvhdrtail_kvaddr;
+       /*
+        * temp buffer for expected send setup, allocated at open, instead
+        * of each setup call
+        */
+       void *port_tid_pg_list;
+       /* when waiting for rcv or pioavail */
+       wait_queue_head_t port_wait;
+       /*
+        * rcvegr bufs base, physical; must fit in 44 bits
+        * (so mmap64 of a 44 bit address works for 32 bit programs)
+        */
+       dma_addr_t port_rcvegr_phys;
+       /* mmap of hdrq, must fit in 44 bits */
+       dma_addr_t port_rcvhdrq_phys;
+       dma_addr_t port_rcvhdrqtailaddr_phys;
+       /*
+        * number of opens (including slave subports) on this instance
+        * (ignoring forks, dup, etc. for now)
+        */
+       int port_cnt;
+       /*
+        * how much space to leave at start of eager TID entries for
+        * protocol use, on each TID
+        */
+       /* our port number, kept here instead of calculating it */
+       unsigned port_port;
+       /* non-zero if port is being shared. */
+       u16 port_subport_cnt;
+       /* non-zero if port is being shared. */
+       u16 port_subport_id;
+       /* number of pio bufs for this port (all procs, if shared) */
+       u32 port_piocnt;
+       /* first pio buffer for this port */
+       u32 port_pio_base;
+       /* chip offset of PIO buffers for this port */
+       u32 port_piobufs;
+       /* how many alloc_pages() chunks in port_rcvegrbuf_pages */
+       u32 port_rcvegrbuf_chunks;
+       /* how many egrbufs per chunk */
+       u32 port_rcvegrbufs_perchunk;
+       /* order for port_rcvegrbuf_pages */
+       size_t port_rcvegrbuf_size;
+       /* rcvhdrq size (for freeing) */
+       size_t port_rcvhdrq_size;
+       /* next expected TID to check when looking for free */
+       u32 port_tidcursor;
+       /* port state flag bits (IPATH_PORT_*, defined below) */
+       unsigned long port_flag;
+       /* what happened */
+       unsigned long int_flag;
+       /* WAIT_RCV that timed out, no interrupt */
+       u32 port_rcvwait_to;
+       /* WAIT_PIO that timed out, no interrupt */
+       u32 port_piowait_to;
+       /* WAIT_RCV already happened, no wait */
+       u32 port_rcvnowait;
+       /* WAIT_PIO already happened, no wait */
+       u32 port_pionowait;
+       /* total number of rcvhdrqfull errors */
+       u32 port_hdrqfull;
+       /*
+        * Used to suppress multiple instances of same
+        * port staying stuck at same point.
+        */
+       u32 port_lastrcvhdrqtail;
+       /* saved total number of rcvhdrqfull errors for poll edge trigger */
+       u32 port_hdrqfull_poll;
+       /* total number of polled urgent packets */
+       u32 port_urgent;
+       /* saved total number of polled urgent packets for poll edge trigger */
+       u32 port_urgent_poll;
+       /* pid of process using this port */
+       struct pid *port_pid;
+       struct pid *port_subpid[INFINIPATH_MAX_SUBPORT];
+       /* same size as task_struct .comm[] */
+       char port_comm[16];
+       /* pkeys set by this use of this port */
+       u16 port_pkeys[4];
+       /* so file ops can get at unit */
+       struct ipath_devdata *port_dd;
+       /* A page of memory for rcvhdrhead, rcvegrhead, rcvegrtail * N */
+       void *subport_uregbase;
+       /* An array of pages for the eager receive buffers * N */
+       void *subport_rcvegrbuf;
+       /* An array of pages for the eager header queue entries * N */
+       void *subport_rcvhdr_base;
+       /* The version of the library which opened this port */
+       u32 userversion;
+       /* Bitmask of active slaves */
+       u32 active_slaves;
+       /* Type of packets or conditions we want to poll for */
+       u16 poll_type;
+       /* port rcvhdrq head offset */
+       u32 port_head;
+       /* receive packet sequence counter */
+       u32 port_seq_cnt;
+};
+
+struct sk_buff;
+struct ipath_sge_state;
+struct ipath_verbs_txreq;
+
+/*
+ * control information for layered drivers
+ */
+struct _ipath_layer {
+       void *l_arg;
+};
+
+struct ipath_skbinfo {
+       struct sk_buff *skb;
+       dma_addr_t phys;
+};
+
+struct ipath_sdma_txreq {
+       int                 flags;
+       int                 sg_count;
+       union {
+               struct scatterlist *sg;
+               void *map_addr;
+       };
+       void              (*callback)(void *, int);
+       void               *callback_cookie;
+       int                 callback_status;
+       u16                 start_idx;  /* sdma private */
+       u16                 next_descq_idx;  /* sdma private */
+       struct list_head    list;       /* sdma private */
+};
+
+struct ipath_sdma_desc {
+       __le64 qw[2];
+};
+
+#define IPATH_SDMA_TXREQ_F_USELARGEBUF  0x1
+#define IPATH_SDMA_TXREQ_F_HEADTOHOST   0x2
+#define IPATH_SDMA_TXREQ_F_INTREQ       0x4
+#define IPATH_SDMA_TXREQ_F_FREEBUF      0x8
+#define IPATH_SDMA_TXREQ_F_FREEDESC     0x10
+#define IPATH_SDMA_TXREQ_F_VL15         0x20
+
+#define IPATH_SDMA_TXREQ_S_OK        0
+#define IPATH_SDMA_TXREQ_S_SENDERROR 1
+#define IPATH_SDMA_TXREQ_S_ABORTED   2
+#define IPATH_SDMA_TXREQ_S_SHUTDOWN  3
+
+#define IPATH_SDMA_STATUS_SCORE_BOARD_DRAIN_IN_PROG    (1ull << 63)
+#define IPATH_SDMA_STATUS_ABORT_IN_PROG                        (1ull << 62)
+#define IPATH_SDMA_STATUS_INTERNAL_SDMA_ENABLE         (1ull << 61)
+#define IPATH_SDMA_STATUS_SCB_EMPTY                    (1ull << 30)
+
+/* max dwords in small buffer packet */
+#define IPATH_SMALLBUF_DWORDS (dd->ipath_piosize2k >> 2)
+
+/*
+ * Possible IB config parameters for ipath_f_get/set_ib_cfg()
+ */
+#define IPATH_IB_CFG_LIDLMC 0 /* Get/set LID (LS16b) and Mask (MS16b) */
+#define IPATH_IB_CFG_HRTBT 1 /* Get/set Heartbeat off/enable/auto */
+#define IPATH_IB_HRTBT_ON 3 /* Heartbeat enabled, sent every 100msec */
+#define IPATH_IB_HRTBT_OFF 0 /* Heartbeat off */
+#define IPATH_IB_CFG_LWID_ENB 2 /* Get/set allowed Link-width */
+#define IPATH_IB_CFG_LWID 3 /* Get currently active Link-width */
+#define IPATH_IB_CFG_SPD_ENB 4 /* Get/set allowed Link speeds */
+#define IPATH_IB_CFG_SPD 5 /* Get current Link spd */
+#define IPATH_IB_CFG_RXPOL_ENB 6 /* Get/set Auto-RX-polarity enable */
+#define IPATH_IB_CFG_LREV_ENB 7 /* Get/set Auto-Lane-reversal enable */
+#define IPATH_IB_CFG_LINKLATENCY 8 /* Get Link latency */
+
+
+struct ipath_devdata {
+       struct list_head ipath_list;
+
+       struct ipath_kregs const *ipath_kregs;
+       struct ipath_cregs const *ipath_cregs;
+
+       /* mem-mapped pointer to base of chip regs */
+       u64 __iomem *ipath_kregbase;
+       /* end of mem-mapped chip space; range checking */
+       u64 __iomem *ipath_kregend;
+       /* physical address of chip for io_remap, etc. */
+       unsigned long ipath_physaddr;
+       /* base of memory alloced for ipath_kregbase, for free */
+       u64 *ipath_kregalloc;
+       /* ipath_cfgports pointers */
+       struct ipath_portdata **ipath_pd;
+       /* sk_buffs used by port 0 eager receive queue */
+       struct ipath_skbinfo *ipath_port0_skbinfo;
+       /* kvirt address of 1st 2k pio buffer */
+       void __iomem *ipath_pio2kbase;
+       /* kvirt address of 1st 4k pio buffer */
+       void __iomem *ipath_pio4kbase;
+       /*
+        * points to area where PIOavail registers will be DMA'ed.
+        * Has to be on a page of its own, because the page will be
+        * mapped into user program space.  This copy is *ONLY* ever
+        * written by DMA, not by the driver!  Need a copy per device
+        * when we get to multiple devices
+        */
+       volatile __le64 *ipath_pioavailregs_dma;
+       /* physical address where updates occur */
+       dma_addr_t ipath_pioavailregs_phys;
+       struct _ipath_layer ipath_layer;
+       /* setup intr */
+       int (*ipath_f_intrsetup)(struct ipath_devdata *);
+       /* fallback to alternate interrupt type if possible */
+       int (*ipath_f_intr_fallback)(struct ipath_devdata *);
+       /* setup on-chip bus config */
+       int (*ipath_f_bus)(struct ipath_devdata *, struct pci_dev *);
+       /* hard reset chip */
+       int (*ipath_f_reset)(struct ipath_devdata *);
+       int (*ipath_f_get_boardname)(struct ipath_devdata *, char *,
+                                    size_t);
+       void (*ipath_f_init_hwerrors)(struct ipath_devdata *);
+       void (*ipath_f_handle_hwerrors)(struct ipath_devdata *, char *,
+                                       size_t);
+       void (*ipath_f_quiet_serdes)(struct ipath_devdata *);
+       int (*ipath_f_bringup_serdes)(struct ipath_devdata *);
+       int (*ipath_f_early_init)(struct ipath_devdata *);
+       void (*ipath_f_clear_tids)(struct ipath_devdata *, unsigned);
+       void (*ipath_f_put_tid)(struct ipath_devdata *, u64 __iomem*,
+                               u32, unsigned long);
+       void (*ipath_f_tidtemplate)(struct ipath_devdata *);
+       void (*ipath_f_cleanup)(struct ipath_devdata *);
+       void (*ipath_f_setextled)(struct ipath_devdata *, u64, u64);
+       /* fill out chip-specific fields */
+       int (*ipath_f_get_base_info)(struct ipath_portdata *, void *);
+       /* free irq */
+       void (*ipath_f_free_irq)(struct ipath_devdata *);
+       struct ipath_message_header *(*ipath_f_get_msgheader)
+                                       (struct ipath_devdata *, __le32 *);
+       void (*ipath_f_config_ports)(struct ipath_devdata *, ushort);
+       int (*ipath_f_get_ib_cfg)(struct ipath_devdata *, int);
+       int (*ipath_f_set_ib_cfg)(struct ipath_devdata *, int, u32);
+       void (*ipath_f_config_jint)(struct ipath_devdata *, u16 , u16);
+       void (*ipath_f_read_counters)(struct ipath_devdata *,
+                                       struct infinipath_counters *);
+       void (*ipath_f_xgxs_reset)(struct ipath_devdata *);
+       /* per chip actions needed for IB Link up/down changes */
+       int (*ipath_f_ib_updown)(struct ipath_devdata *, int, u64);
+
+       unsigned ipath_lastegr_idx;
+       struct ipath_ibdev *verbs_dev;
+       struct timer_list verbs_timer;
+       /* total dwords sent (summed from counter) */
+       u64 ipath_sword;
+       /* total dwords rcvd (summed from counter) */
+       u64 ipath_rword;
+       /* total packets sent (summed from counter) */
+       u64 ipath_spkts;
+       /* total packets rcvd (summed from counter) */
+       u64 ipath_rpkts;
+       /* ipath_statusp initially points to this. */
+       u64 _ipath_status;
+       /* GUID for this interface, in network order */
+       __be64 ipath_guid;
+       /*
+        * aggregate of error bits reported since last cleared, for
+        * limiting of error reporting
+        */
+       ipath_err_t ipath_lasterror;
+       /*
+        * aggregate of error bits reported since last cleared, for
+        * limiting of hwerror reporting
+        */
+       ipath_err_t ipath_lasthwerror;
+       /* errors masked because they occur too fast */
+       ipath_err_t ipath_maskederrs;
+       u64 ipath_lastlinkrecov; /* link recoveries at last ACTIVE */
+       /* these 5 fields are used to establish deltas for IB Symbol
+        * errors and linkrecovery errors. They can be reported on
+        * some chips during link negotiation prior to INIT, and with
+        * DDR when faking DDR negotiations with non-IBTA switches.
+        * The chip counters are adjusted at driver unload if there is
+        * a non-zero delta.
+        */
+       u64 ibdeltainprog;
+       u64 ibsymdelta;
+       u64 ibsymsnap;
+       u64 iblnkerrdelta;
+       u64 iblnkerrsnap;
+
+       /* time in jiffies at which to re-enable maskederrs */
+       unsigned long ipath_unmasktime;
+       /* count of egrfull errors, combined for all ports */
+       u64 ipath_last_tidfull;
+       /* for ipath_qcheck() */
+       u64 ipath_lastport0rcv_cnt;
+       /* template for writing TIDs  */
+       u64 ipath_tidtemplate;
+       /* value to write to free TIDs */
+       u64 ipath_tidinvalid;
+       /* IBA6120 rcv interrupt setup */
+       u64 ipath_rhdrhead_intr_off;
+
+       /* size of memory at ipath_kregbase */
+       u32 ipath_kregsize;
+       /* number of registers used for pioavail */
+       u32 ipath_pioavregs;
+       /* IPATH_POLL, etc. */
+       u32 ipath_flags;
+       /* ipath_flags driver is waiting for */
+       u32 ipath_state_wanted;
+       /* last buffer for user use, first buf for kernel use is this
+        * index. */
+       u32 ipath_lastport_piobuf;
+       /* is a stats timer active */
+       u32 ipath_stats_timer_active;
+       /* number of interrupts for this device -- saturates... */
+       u32 ipath_int_counter;
+       /* dwords sent read from counter */
+       u32 ipath_lastsword;
+       /* dwords received read from counter */
+       u32 ipath_lastrword;
+       /* sent packets read from counter */
+       u32 ipath_lastspkts;
+       /* received packets read from counter */
+       u32 ipath_lastrpkts;
+       /* pio bufs allocated per port */
+       u32 ipath_pbufsport;
+       /* if remainder on bufs/port, ports < extrabuf get 1 extra */
+       u32 ipath_ports_extrabuf;
+       u32 ipath_pioupd_thresh; /* update threshold, some chips */
+       /*
+        * maximum number of ports to configure; zero means use the number
+        * the chip supports; fewer ports gives more pio bufs/port, etc.
+        */
+       u32 ipath_cfgports;
+       /* count of port 0 hdrqfull errors */
+       u32 ipath_p0_hdrqfull;
+       /* port 0 number of receive eager buffers */
+       u32 ipath_p0_rcvegrcnt;
+
+       /*
+        * index of last piobuffer we used.  Speeds up searching, by
+        * index of last piobuffer we used.  Speeds up searching, by
+        * starting at this point.  Doesn't matter if multiple cpus use and
+        * update it; the last updater's write is what counts.  Whenever it
+        * get to multiple devices
+        */
+       u32 ipath_lastpioindex;
+       u32 ipath_lastpioindexl;
+       /* max length of freezemsg */
+       u32 ipath_freezelen;
+       /*
+        * consecutive times we wanted a PIO buffer but were unable to
+        * get one
+        */
+       u32 ipath_consec_nopiobuf;
+       /*
+        * hint that we should update ipath_pioavailshadow before
+        * looking for a PIO buffer
+        */
+       u32 ipath_upd_pio_shadow;
+       /* so we can rewrite it after a chip reset */
+       u32 ipath_pcibar0;
+       /* so we can rewrite it after a chip reset */
+       u32 ipath_pcibar1;
+       u32 ipath_x1_fix_tries;
+       u32 ipath_autoneg_tries;
+       u32 serdes_first_init_done;
+
+       struct ipath_relock {
+               atomic_t ipath_relock_timer_active;
+               struct timer_list ipath_relock_timer;
+               unsigned int ipath_relock_interval; /* in jiffies */
+       } ipath_relock_singleton;
+
+       /* interrupt number */
+       int ipath_irq;
+       /* HT/PCI Vendor ID (here for NodeInfo) */
+       u16 ipath_vendorid;
+       /* HT/PCI Device ID (here for NodeInfo) */
+       u16 ipath_deviceid;
+       /* offset in HT config space of slave/primary interface block */
+       u8 ipath_ht_slave_off;
+       /* for write combining settings */
+       int wc_cookie;
+       /* ref count for each pkey */
+       atomic_t ipath_pkeyrefs[4];
+       /* shadow copy of struct page *'s for exp tid pages */
+       struct page **ipath_pageshadow;
+       /* shadow copy of dma handles for exp tid pages */
+       dma_addr_t *ipath_physshadow;
+       u64 __iomem *ipath_egrtidbase;
+       /* lock to workaround chip bug 9437 and others */
+       spinlock_t ipath_kernel_tid_lock;
+       spinlock_t ipath_user_tid_lock;
+       spinlock_t ipath_sendctrl_lock;
+       /* around ipath_pd and (user ports) port_cnt use (intr vs free) */
+       spinlock_t ipath_uctxt_lock;
+
+       /*
+        * IPATH_STATUS_*,
+        * this address is mapped readonly into user processes so they can
+        * get status cheaply, whenever they want.
+        */
+       u64 *ipath_statusp;
+       /* freeze msg if hw error put chip in freeze */
+       char *ipath_freezemsg;
+       /* pci access data structure */
+       struct pci_dev *pcidev;
+       struct cdev *user_cdev;
+       struct cdev *diag_cdev;
+       struct device *user_dev;
+       struct device *diag_dev;
+       /* timer used to prevent stats overflow, error throttling, etc. */
+       struct timer_list ipath_stats_timer;
+       /* timer to verify interrupts work, and fallback if possible */
+       struct timer_list ipath_intrchk_timer;
+       void *ipath_dummy_hdrq; /* used after port close */
+       dma_addr_t ipath_dummy_hdrq_phys;
+
+       /* SendDMA related entries */
+       spinlock_t            ipath_sdma_lock;
+       unsigned long         ipath_sdma_status;
+       unsigned long         ipath_sdma_abort_jiffies;
+       unsigned long         ipath_sdma_abort_intr_timeout;
+       unsigned long         ipath_sdma_buf_jiffies;
+       struct ipath_sdma_desc *ipath_sdma_descq;
+       u64                   ipath_sdma_descq_added;
+       u64                   ipath_sdma_descq_removed;
+       int                   ipath_sdma_desc_nreserved;
+       u16                   ipath_sdma_descq_cnt;
+       u16                   ipath_sdma_descq_tail;
+       u16                   ipath_sdma_descq_head;
+       u16                   ipath_sdma_next_intr;
+       u16                   ipath_sdma_reset_wait;
+       u8                    ipath_sdma_generation;
+       struct tasklet_struct ipath_sdma_abort_task;
+       struct tasklet_struct ipath_sdma_notify_task;
+       struct list_head      ipath_sdma_activelist;
+       struct list_head      ipath_sdma_notifylist;
+       atomic_t              ipath_sdma_vl15_count;
+       struct timer_list     ipath_sdma_vl15_timer;
+
+       dma_addr_t       ipath_sdma_descq_phys;
+       volatile __le64 *ipath_sdma_head_dma;
+       dma_addr_t       ipath_sdma_head_phys;
+
+       unsigned long ipath_ureg_align; /* user register alignment */
+
+       struct delayed_work ipath_autoneg_work;
+       wait_queue_head_t ipath_autoneg_wait;
+
+       /* HoL blocking / user app forward-progress state */
+       unsigned          ipath_hol_state;
+       unsigned          ipath_hol_next;
+       struct timer_list ipath_hol_timer;
+
+       /*
+        * Shadow copies of registers; size indicates read access size.
+        * Most of them are readonly, but some are write-only registers,
+        * where we manipulate the bits in the shadow copy, and then write
+        * the shadow copy to infinipath.
+        *
+        * We deliberately make most of these 32 bits, since they have
+        * restricted range.  For any that we read, we want to generate 32
+        * bit accesses, since Opteron will generate 2 separate 32 bit HT
+        * transactions for a 64 bit read, and we want to avoid unnecessary
+        * HT transactions.
+        */
+
+       /* This is the 64 bit group */
+
+       /*
+        * shadow of pioavail, check to be sure it's large enough at
+        * init time.
+        */
+       unsigned long ipath_pioavailshadow[8];
+       /* bitmap of send buffers available for the kernel to use with PIO. */
+       unsigned long ipath_pioavailkernel[8];
+       /* shadow of kr_gpio_out, for rmw ops */
+       u64 ipath_gpio_out;
+       /* shadow the gpio mask register */
+       u64 ipath_gpio_mask;
+       /* shadow the gpio output enable, etc... */
+       u64 ipath_extctrl;
+       /* kr_revision shadow */
+       u64 ipath_revision;
+       /*
+        * shadow of ibcctrl, for interrupt handling of link changes,
+        * etc.
+        */
+       u64 ipath_ibcctrl;
+       /*
+        * last ibcstatus, to suppress "duplicate" status change messages,
+        * mostly from 2 to 3
+        */
+       u64 ipath_lastibcstat;
+       /* hwerrmask shadow */
+       ipath_err_t ipath_hwerrmask;
+       ipath_err_t ipath_errormask; /* errormask shadow */
+       /* interrupt config reg shadow */
+       u64 ipath_intconfig;
+       /* kr_sendpiobufbase value */
+       u64 ipath_piobufbase;
+       /* kr_ibcddrctrl shadow */
+       u64 ipath_ibcddrctrl;
+
+       /* these are the "32 bit" regs */
+
+       /*
+        * number of GUIDs in the flash for this interface; may need some
+        * rethinking for setting on other ifaces
+        */
+       u32 ipath_nguid;
+       /*
+        * the following two are 32-bit bitmasks, but {test,clear,set}_bit
+        * all expect bit fields to be "unsigned long"
+        */
+       /* shadow kr_rcvctrl */
+       unsigned long ipath_rcvctrl;
+       /* shadow kr_sendctrl */
+       unsigned long ipath_sendctrl;
+       /* to not count armlaunch after cancel */
+       unsigned long ipath_lastcancel;
+       /* count cases where special trigger was needed (double write) */
+       unsigned long ipath_spectriggerhit;
+
+       /* value we put in kr_rcvhdrcnt */
+       u32 ipath_rcvhdrcnt;
+       /* value we put in kr_rcvhdrsize */
+       u32 ipath_rcvhdrsize;
+       /* value we put in kr_rcvhdrentsize */
+       u32 ipath_rcvhdrentsize;
+       /* offset of last entry in rcvhdrq */
+       u32 ipath_hdrqlast;
+       /* kr_portcnt value */
+       u32 ipath_portcnt;
+       /* kr_pagealign value */
+       u32 ipath_palign;
+       /* number of "2KB" PIO buffers */
+       u32 ipath_piobcnt2k;
+       /* size in bytes of "2KB" PIO buffers */
+       u32 ipath_piosize2k;
+       /* number of "4KB" PIO buffers */
+       u32 ipath_piobcnt4k;
+       /* size in bytes of "4KB" PIO buffers */
+       u32 ipath_piosize4k;
+       u32 ipath_pioreserved; /* PIO bufs reserved for special in-kernel use */
+       /* kr_rcvegrbase value */
+       u32 ipath_rcvegrbase;
+       /* kr_rcvegrcnt value */
+       u32 ipath_rcvegrcnt;
+       /* kr_rcvtidbase value */
+       u32 ipath_rcvtidbase;
+       /* kr_rcvtidcnt value */
+       u32 ipath_rcvtidcnt;
+       /* kr_sendregbase */
+       u32 ipath_sregbase;
+       /* kr_userregbase */
+       u32 ipath_uregbase;
+       /* kr_counterregbase */
+       u32 ipath_cregbase;
+       /* shadow the control register contents */
+       u32 ipath_control;
+       /* PCI revision register (HTC rev on FPGA) */
+       u32 ipath_pcirev;
+
+       /* chip address space used by 4k pio buffers */
+       u32 ipath_4kalign;
+       /* The MTU programmed for this unit */
+       u32 ipath_ibmtu;
+       /*
+        * The max size IB packet, including IB headers, that we can send.
+        * Starts same as ipath_piosize, but is affected when ibmtu is
+        * changed, or by size of eager buffers
+        */
+       u32 ipath_ibmaxlen;
+       /*
+        * ibmaxlen at init time, limited by chip and by receive buffer
+        * size.  Not changed after init.
+        */
+       u32 ipath_init_ibmaxlen;
+       /* size of each rcvegrbuffer */
+       u32 ipath_rcvegrbufsize;
+       /* localbus width (1, 2, 4, 8, 16, 32) from config space */
+       u32 ipath_lbus_width;
+       /* localbus speed (HT: 200,400,800,1000; PCIe 2500) */
+       u32 ipath_lbus_speed;
+       /*
+        * number of sequential ibcstatus changes for polling active/quiet
+        * (i.e., link not coming up).
+        */
+       u32 ipath_ibpollcnt;
+       /* low and high portions of MSI capability/vector */
+       u32 ipath_msi_lo;
+       /* saved after PCIe init for restore after reset */
+       u32 ipath_msi_hi;
+       /* MSI data (vector) saved for restore */
+       u16 ipath_msi_data;
+       /* MLID programmed for this instance */
+       u16 ipath_mlid;
+       /* LID programmed for this instance */
+       u16 ipath_lid;
+       /* list of pkeys programmed; 0 if not set */
+       u16 ipath_pkeys[4];
+       /*
+        * ASCII serial number, from flash, large enough for original
+        * all digit strings, and longer QLogic serial number format
+        */
+       u8 ipath_serial[16];
+       /* human readable board version */
+       u8 ipath_boardversion[96];
+       u8 ipath_lbus_info[32]; /* human readable localbus info */
+       /* chip major rev, from ipath_revision */
+       u8 ipath_majrev;
+       /* chip minor rev, from ipath_revision */
+       u8 ipath_minrev;
+       /* board rev, from ipath_revision */
+       u8 ipath_boardrev;
+       /* saved for restore after reset */
+       u8 ipath_pci_cacheline;
+       /* LID mask control */
+       u8 ipath_lmc;
+       /* link width supported */
+       u8 ipath_link_width_supported;
+       /* link speed supported */
+       u8 ipath_link_speed_supported;
+       u8 ipath_link_width_enabled;
+       u8 ipath_link_speed_enabled;
+       u8 ipath_link_width_active;
+       u8 ipath_link_speed_active;
+       /* Rx Polarity inversion (compensate for ~tx on partner) */
+       u8 ipath_rx_pol_inv;
+
+       u8 ipath_r_portenable_shift;
+       u8 ipath_r_intravail_shift;
+       u8 ipath_r_tailupd_shift;
+       u8 ipath_r_portcfg_shift;
+
+       /* unit # of this chip, if present */
+       int ipath_unit;
+
+       /* local link integrity counter */
+       u32 ipath_lli_counter;
+       /* local link integrity errors */
+       u32 ipath_lli_errors;
+       /*
+        * Above counts only cases where _successive_ LocalLinkIntegrity
+        * errors were seen in the receive headers of kern-packets.
+        * Below are the three (monotonically increasing) counters
+        * maintained via GPIO interrupts on iba6120-rev2.
+        */
+       u32 ipath_rxfc_unsupvl_errs;
+       u32 ipath_overrun_thresh_errs;
+       u32 ipath_lli_errs;
+
+       /*
+        * Not all devices managed by a driver instance are the same
+        * type, so these fields must be per-device.
+        */
+       u64 ipath_i_bitsextant;
+       ipath_err_t ipath_e_bitsextant;
+       ipath_err_t ipath_hwe_bitsextant;
+
+       /*
+        * Below should be computable from number of ports,
+        * since they are never modified.
+        */
+       u64 ipath_i_rcvavail_mask;
+       u64 ipath_i_rcvurg_mask;
+       u16 ipath_i_rcvurg_shift;
+       u16 ipath_i_rcvavail_shift;
+
+       /*
+        * Register bits for selecting i2c direction and values, used for
+        * I2C serial flash.
+        */
+       u8 ipath_gpio_sda_num;
+       u8 ipath_gpio_scl_num;
+       u8 ipath_i2c_chain_type;
+       u64 ipath_gpio_sda;
+       u64 ipath_gpio_scl;
+
+       /* lock for doing RMW of shadows/regs for ExtCtrl and GPIO */
+       spinlock_t ipath_gpio_lock;
+
+       /*
+        * IB link and linktraining states and masks that vary per chip in
+        * some way.  Set at init, to avoid recomputing on each IB status interrupt
+        */
+       u8 ibcs_ls_shift;
+       u8 ibcs_lts_mask;
+       u32 ibcs_mask;
+       u32 ib_init;
+       u32 ib_arm;
+       u32 ib_active;
+
+       u16 ipath_rhf_offset; /* offset of RHF within receive header entry */
+
+       /*
+        * shift/mask for linkcmd, linkinitcmd, maxpktlen in ibccontrol
+        * reg. Changes for IBA7220
+        */
+       u8 ibcc_lic_mask; /* LinkInitCmd */
+       u8 ibcc_lc_shift; /* LinkCmd */
+       u8 ibcc_mpl_shift; /* Maxpktlen */
+
+       u8 delay_mult;
+
+       /* used to override LED behavior */
+       u8 ipath_led_override;  /* Substituted for normal value, if non-zero */
+       u16 ipath_led_override_timeoff; /* delta to next timer event */
+       u8 ipath_led_override_vals[2]; /* Alternates per blink-frame */
+       u8 ipath_led_override_phase; /* Just counts, LSB picks from vals[] */
+       atomic_t ipath_led_override_timer_active;
+       /* Used to flash LEDs in override mode */
+       struct timer_list ipath_led_override_timer;
+
+       /* Support (including locks) for EEPROM logging of errors and time */
+       /* control access to actual counters, timer */
+       spinlock_t ipath_eep_st_lock;
+       /* control high-level access to EEPROM */
+       struct mutex ipath_eep_lock;
+       /* Below inc'd by ipath_snap_cntrs(), locked by ipath_eep_st_lock */
+       uint64_t ipath_traffic_wds;
+       /* active time is kept in seconds, but logged in hours */
+       atomic_t ipath_active_time;
+       /* Below are nominal shadow of EEPROM, new since last EEPROM update */
+       uint8_t ipath_eep_st_errs[IPATH_EEP_LOG_CNT];
+       uint8_t ipath_eep_st_new_errs[IPATH_EEP_LOG_CNT];
+       uint16_t ipath_eep_hrs;
+       /*
+        * masks for which bits of errs, hwerrs that cause
+        * each of the counters to increment.
+        */
+       struct ipath_eep_log_mask ipath_eep_st_masks[IPATH_EEP_LOG_CNT];
+
+       /* interrupt mitigation reload register info */
+       u16 ipath_jint_idle_ticks;      /* idle clock ticks */
+       u16 ipath_jint_max_packets;     /* max packets across all ports */
+
+       /*
+        * lock for access to SerDes, and flags to sequence preset
+        * versus steady-state. 7220-only at the moment.
+        */
+       spinlock_t ipath_sdepb_lock;
+       u8 ipath_presets_needed; /* Set if presets to be restored next DOWN */
+};
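
Several of the registers shadowed in the struct above (e.g. ipath_sendctrl, ipath_gpio_mask) are effectively write-only: the driver read-modify-writes the in-memory shadow under a lock and then writes the whole shadow back to the chip, as the PIO-buffer-available and GPIO branches of the interrupt handler do. A toy sketch of that data flow, with an invented bit name and the real locking and register I/O replaced by ordinary assignments:

#include <stdint.h>
#include <stdio.h>

/* Toy stand-ins; the driver uses ipath_devdata, ipath_write_kreg() and a
 * spinlock (e.g. ipath_sendctrl_lock) around the same sequence. */
struct fake_dev {
	uint64_t sendctrl_shadow;   /* software copy of a write-only register */
	uint64_t hw_sendctrl;       /* pretend chip register */
};

#define S_PIOINTBUFAVAIL (1ULL << 4)   /* invented bit position */

static void clear_pioint(struct fake_dev *dd)
{
	/* modify the shadow, then write the whole shadow to the chip;
	 * the register can't be read back, so the shadow is authoritative */
	dd->sendctrl_shadow &= ~S_PIOINTBUFAVAIL;
	dd->hw_sendctrl = dd->sendctrl_shadow;   /* ipath_write_kreg(...) */
	/* the real driver then reads kr_scratch to flush the posted write */
}

int main(void)
{
	struct fake_dev dd = { .sendctrl_shadow = S_PIOINTBUFAVAIL | 1 };
	clear_pioint(&dd);
	printf("sendctrl is now 0x%llx\n", (unsigned long long)dd.hw_sendctrl);
	return 0;
}
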
+
+/* ipath_hol_state values (stopping/starting user proc, send flushing) */
+#define IPATH_HOL_UP       0
+#define IPATH_HOL_DOWN     1
+/* ipath_hol_next toggle values, used when hol_state IPATH_HOL_DOWN */
+#define IPATH_HOL_DOWNSTOP 0
+#define IPATH_HOL_DOWNCONT 1
+
+/* bit positions for sdma_status */
+#define IPATH_SDMA_ABORTING  0
+#define IPATH_SDMA_DISARMED  1
+#define IPATH_SDMA_DISABLED  2
+#define IPATH_SDMA_LAYERBUF  3
+#define IPATH_SDMA_RUNNING  30
+#define IPATH_SDMA_SHUTDOWN 31
+
+/* bit combinations that correspond to abort states */
+#define IPATH_SDMA_ABORT_NONE 0
+#define IPATH_SDMA_ABORT_ABORTING (1UL << IPATH_SDMA_ABORTING)
+#define IPATH_SDMA_ABORT_DISARMED ((1UL << IPATH_SDMA_ABORTING) | \
+       (1UL << IPATH_SDMA_DISARMED))
+#define IPATH_SDMA_ABORT_DISABLED ((1UL << IPATH_SDMA_ABORTING) | \
+       (1UL << IPATH_SDMA_DISABLED))
+#define IPATH_SDMA_ABORT_ABORTED ((1UL << IPATH_SDMA_ABORTING) | \
+       (1UL << IPATH_SDMA_DISARMED) | (1UL << IPATH_SDMA_DISABLED))
+#define IPATH_SDMA_ABORT_MASK ((1UL<<IPATH_SDMA_ABORTING) | \
+       (1UL << IPATH_SDMA_DISARMED) | (1UL << IPATH_SDMA_DISABLED))
+
+#define IPATH_SDMA_BUF_NONE 0
+#define IPATH_SDMA_BUF_MASK (1UL<<IPATH_SDMA_LAYERBUF)
+
+/* Private data for file operations */
+struct ipath_filedata {
+       struct ipath_portdata *pd;
+       unsigned subport;
+       unsigned tidcursor;
+       struct ipath_user_sdma_queue *pq;
+};
+extern struct list_head ipath_dev_list;
+extern spinlock_t ipath_devs_lock;
+extern struct ipath_devdata *ipath_lookup(int unit);
+
+int ipath_init_chip(struct ipath_devdata *, int);
+int ipath_enable_wc(struct ipath_devdata *dd);
+void ipath_disable_wc(struct ipath_devdata *dd);
+int ipath_count_units(int *npresentp, int *nupp, int *maxportsp);
+void ipath_shutdown_device(struct ipath_devdata *);
+void ipath_clear_freeze(struct ipath_devdata *);
+
+struct file_operations;
+int ipath_cdev_init(int minor, char *name, const struct file_operations *fops,
+                   struct cdev **cdevp, struct device **devp);
+void ipath_cdev_cleanup(struct cdev **cdevp,
+                       struct device **devp);
+
+int ipath_diag_add(struct ipath_devdata *);
+void ipath_diag_remove(struct ipath_devdata *);
+
+extern wait_queue_head_t ipath_state_wait;
+
+int ipath_user_add(struct ipath_devdata *dd);
+void ipath_user_remove(struct ipath_devdata *dd);
+
+struct sk_buff *ipath_alloc_skb(struct ipath_devdata *dd, gfp_t);
+
+extern int ipath_diag_inuse;
+
+irqreturn_t ipath_intr(int irq, void *devid);
+int ipath_decode_err(struct ipath_devdata *dd, char *buf, size_t blen,
+                    ipath_err_t err);
+#if __IPATH_INFO || __IPATH_DBG
+extern const char *ipath_ibcstatus_str[];
+#endif
+
+/* clean up any per-chip chip-specific stuff */
+void ipath_chip_cleanup(struct ipath_devdata *);
+/* clean up any chip type-specific stuff */
+void ipath_chip_done(void);
+
+void ipath_disarm_piobufs(struct ipath_devdata *, unsigned first,
+                         unsigned cnt);
+void ipath_cancel_sends(struct ipath_devdata *, int);
+
+int ipath_create_rcvhdrq(struct ipath_devdata *, struct ipath_portdata *);
+void ipath_free_pddata(struct ipath_devdata *, struct ipath_portdata *);
+
+int ipath_parse_ushort(const char *str, unsigned short *valp);
+
+void ipath_kreceive(struct ipath_portdata *);
+int ipath_setrcvhdrsize(struct ipath_devdata *, unsigned);
+int ipath_reset_device(int);
+void ipath_get_faststats(unsigned long);
+int ipath_wait_linkstate(struct ipath_devdata *, u32, int);
+int ipath_set_linkstate(struct ipath_devdata *, u8);
+int ipath_set_mtu(struct ipath_devdata *, u16);
+int ipath_set_lid(struct ipath_devdata *, u32, u8);
+int ipath_set_rx_pol_inv(struct ipath_devdata *dd, u8 new_pol_inv);
+void ipath_enable_armlaunch(struct ipath_devdata *);
+void ipath_disable_armlaunch(struct ipath_devdata *);
+void ipath_hol_down(struct ipath_devdata *);
+void ipath_hol_up(struct ipath_devdata *);
+void ipath_hol_event(unsigned long);
+void ipath_toggle_rclkrls(struct ipath_devdata *);
+void ipath_sd7220_clr_ibpar(struct ipath_devdata *);
+void ipath_set_relock_poll(struct ipath_devdata *, int);
+void ipath_shutdown_relock_poll(struct ipath_devdata *);
+
+/* for use in system calls, where we want to know device type, etc. */
+#define port_fp(fp) ((struct ipath_filedata *)(fp)->private_data)->pd
+#define subport_fp(fp) \
+       ((struct ipath_filedata *)(fp)->private_data)->subport
+#define tidcursor_fp(fp) \
+       ((struct ipath_filedata *)(fp)->private_data)->tidcursor
+#define user_sdma_queue_fp(fp) \
+       ((struct ipath_filedata *)(fp)->private_data)->pq
+
+/*
+ * values for ipath_flags
+ */
+               /* chip can report link latency (IB 1.2) */
+#define IPATH_HAS_LINK_LATENCY 0x1
+               /* The chip is up and initted */
+#define IPATH_INITTED       0x2
+               /* set if any user code has set kr_rcvhdrsize */
+#define IPATH_RCVHDRSZ_SET  0x4
+               /* The chip is present and valid for accesses */
+#define IPATH_PRESENT       0x8
+               /* HT link0 is only 8 bits wide, ignore upper byte crc
+                * errors, etc. */
+#define IPATH_8BIT_IN_HT0   0x10
+               /* HT link1 is only 8 bits wide, ignore upper byte crc
+                * errors, etc. */
+#define IPATH_8BIT_IN_HT1   0x20
+               /* The link is down */
+#define IPATH_LINKDOWN      0x40
+               /* The link level is up (0x11) */
+#define IPATH_LINKINIT      0x80
+               /* The link is in the armed (0x21) state */
+#define IPATH_LINKARMED     0x100
+               /* The link is in the active (0x31) state */
+#define IPATH_LINKACTIVE    0x200
+               /* link current state is unknown */
+#define IPATH_LINKUNK       0x400
+               /* Write combining flush needed for PIO */
+#define IPATH_PIO_FLUSH_WC  0x1000
+               /* receive tail pointer is not DMA'ed to memory */
+#define IPATH_NODMA_RTAIL   0x2000
+               /* no IB cable, or no device on IB cable */
+#define IPATH_NOCABLE       0x4000
+               /* Supports port zero per packet receive interrupts via
+                * GPIO */
+#define IPATH_GPIO_INTR     0x8000
+               /* uses the coded 4byte TID, not 8 byte */
+#define IPATH_4BYTE_TID     0x10000
+               /* packet/word counters are 32 bit, else those 4 counters
+                * are 64bit */
+#define IPATH_32BITCOUNTERS 0x20000
+               /* Interrupt register is 64 bits */
+#define IPATH_INTREG_64     0x40000
+               /* can miss port0 rx interrupts */
+#define IPATH_DISABLED      0x80000 /* administratively disabled */
+               /* Use GPIO interrupts for new counters */
+#define IPATH_GPIO_ERRINTRS 0x100000
+#define IPATH_SWAP_PIOBUFS  0x200000
+               /* Supports Send DMA */
+#define IPATH_HAS_SEND_DMA  0x400000
+               /* Supports Send Count (not just word count) in PBC */
+#define IPATH_HAS_PBC_CNT   0x800000
+               /* Suppress heartbeat, even if turning off loopback */
+#define IPATH_NO_HRTBT      0x1000000
+#define IPATH_HAS_THRESH_UPDATE 0x4000000
+#define IPATH_HAS_MULT_IB_SPEED 0x8000000
+#define IPATH_IB_AUTONEG_INPROG 0x10000000
+#define IPATH_IB_AUTONEG_FAILED 0x20000000
+               /* Linkdown-disable intentionally, Do not attempt to bring up */
+#define IPATH_IB_LINK_DISABLED 0x40000000
+#define IPATH_IB_FORCE_NOTIFY 0x80000000 /* force notify on next ib change */
+
+/* Bits in GPIO for the added interrupts */
+#define IPATH_GPIO_PORT0_BIT 2
+#define IPATH_GPIO_RXUVL_BIT 3
+#define IPATH_GPIO_OVRUN_BIT 4
+#define IPATH_GPIO_LLI_BIT 5
+#define IPATH_GPIO_ERRINTR_MASK 0x38
+
+/* portdata flag bit offsets */
+               /* waiting for a packet to arrive */
+#define IPATH_PORT_WAITING_RCV   2
+               /* master has not finished initializing */
+#define IPATH_PORT_MASTER_UNINIT 4
+               /* waiting for an urgent packet to arrive */
+#define IPATH_PORT_WAITING_URG 5
+
+/* free up any allocated data at closes */
+void ipath_free_data(struct ipath_portdata *dd);
+u32 __iomem *ipath_getpiobuf(struct ipath_devdata *, u32, u32 *);
+void ipath_chg_pioavailkernel(struct ipath_devdata *dd, unsigned start,
+                               unsigned len, int avail);
+void ipath_init_iba6110_funcs(struct ipath_devdata *);
+void ipath_get_eeprom_info(struct ipath_devdata *);
+int ipath_update_eeprom_log(struct ipath_devdata *dd);
+void ipath_inc_eeprom_err(struct ipath_devdata *dd, u32 eidx, u32 incr);
+u64 ipath_snap_cntr(struct ipath_devdata *, ipath_creg);
+void ipath_disarm_senderrbufs(struct ipath_devdata *);
+void ipath_force_pio_avail_update(struct ipath_devdata *);
+void signal_ib_event(struct ipath_devdata *dd, enum ib_event_type ev);
+
+/*
+ * Set LED override, only the two LSBs have "public" meaning, but
+ * any non-zero value substitutes them for the Link and LinkTrain
+ * LED states.
+ */
+#define IPATH_LED_PHYS 1 /* Physical (linktraining) GREEN LED */
+#define IPATH_LED_LOG 2  /* Logical (link) YELLOW LED */
+void ipath_set_led_override(struct ipath_devdata *dd, unsigned int val);
+
+/* send dma routines */
+int setup_sdma(struct ipath_devdata *);
+void teardown_sdma(struct ipath_devdata *);
+void ipath_restart_sdma(struct ipath_devdata *);
+void ipath_sdma_intr(struct ipath_devdata *);
+int ipath_sdma_verbs_send(struct ipath_devdata *, struct ipath_sge_state *,
+                         u32, struct ipath_verbs_txreq *);
+/* ipath_sdma_lock should be locked before calling this. */
+int ipath_sdma_make_progress(struct ipath_devdata *dd);
+
+/* must be called under ipath_sdma_lock */
+static inline u16 ipath_sdma_descq_freecnt(const struct ipath_devdata *dd)
+{
+       return dd->ipath_sdma_descq_cnt -
+               (dd->ipath_sdma_descq_added - dd->ipath_sdma_descq_removed) -
+               1 - dd->ipath_sdma_desc_nreserved;
+}
+
+static inline void ipath_sdma_desc_reserve(struct ipath_devdata *dd, u16 cnt)
+{
+       dd->ipath_sdma_desc_nreserved += cnt;
+}
+
+static inline void ipath_sdma_desc_unreserve(struct ipath_devdata *dd, u16 cnt)
+{
+       dd->ipath_sdma_desc_nreserved -= cnt;
+}
+
+/*
+ * number of words used for protocol header if not set by ipath_userinit();
+ */
+#define IPATH_DFLT_RCVHDRSIZE 9
+
+int ipath_get_user_pages(unsigned long, size_t, struct page **);
+void ipath_release_user_pages(struct page **, size_t);
+void ipath_release_user_pages_on_close(struct page **, size_t);
+int ipath_eeprom_read(struct ipath_devdata *, u8, void *, int);
+int ipath_eeprom_write(struct ipath_devdata *, u8, const void *, int);
+int ipath_tempsense_read(struct ipath_devdata *, u8 regnum);
+int ipath_tempsense_write(struct ipath_devdata *, u8 regnum, u8 data);
+
+/* these are used for the registers that vary with port */
+void ipath_write_kreg_port(const struct ipath_devdata *, ipath_kreg,
+                          unsigned, u64);
+
+/*
+ * We could have a single register get/put routine, that takes a group type,
+ * but this is somewhat clearer and cleaner.  It also gives us some error
+ * checking.  64 bit register reads should always work, but are inefficient
+ * on opteron (the northbridge always generates 2 separate HT 32 bit reads),
+ * so we use kreg32 wherever possible.  User register and counter register
+ * reads are always 32 bit reads, so only one form of those routines.
+ */
+
+/*
+ * At the moment, none of the s-registers are writable, so no
+ * ipath_write_sreg().
+ */
+
+/**
+ * ipath_read_ureg32 - read 32-bit virtualized per-port register
+ * @dd: device
+ * @regno: register number
+ * @port: port number
+ *
+ * Return the contents of a register that is virtualized to be per port.
+ * Returns 0 if the chip is not present or mapped (not distinguishable
+ * from valid contents at runtime; we may add a separate error variable
+ * at some point).
+ */
+static inline u32 ipath_read_ureg32(const struct ipath_devdata *dd,
+                                   ipath_ureg regno, int port)
+{
+       if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT))
+               return 0;
+
+       return readl(regno + (u64 __iomem *)
+                    (dd->ipath_uregbase +
+                     (char __iomem *)dd->ipath_kregbase +
+                     dd->ipath_ureg_align * port));
+}
+
+/**
+ * ipath_write_ureg - write 32-bit virtualized per-port register
+ * @dd: device
+ * @regno: register number
+ * @value: value
+ * @port: port
+ *
+ * Write the contents of a register that is virtualized to be per port.
+ */
+static inline void ipath_write_ureg(const struct ipath_devdata *dd,
+                                   ipath_ureg regno, u64 value, int port)
+{
+       u64 __iomem *ubase = (u64 __iomem *)
+               (dd->ipath_uregbase + (char __iomem *) dd->ipath_kregbase +
+                dd->ipath_ureg_align * port);
+       if (dd->ipath_kregbase)
+               writeq(value, &ubase[regno]);
+}
+
+static inline u32 ipath_read_kreg32(const struct ipath_devdata *dd,
+                                   ipath_kreg regno)
+{
+       if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT))
+               return -1;
+       return readl((u32 __iomem *) & dd->ipath_kregbase[regno]);
+}
+
+static inline u64 ipath_read_kreg64(const struct ipath_devdata *dd,
+                                   ipath_kreg regno)
+{
+       if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT))
+               return -1;
+
+       return readq(&dd->ipath_kregbase[regno]);
+}
+
+static inline void ipath_write_kreg(const struct ipath_devdata *dd,
+                                   ipath_kreg regno, u64 value)
+{
+       if (dd->ipath_kregbase)
+               writeq(value, &dd->ipath_kregbase[regno]);
+}
+
+static inline u64 ipath_read_creg(const struct ipath_devdata *dd,
+                                 ipath_sreg regno)
+{
+       if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT))
+               return 0;
+
+       return readq(regno + (u64 __iomem *)
+                    (dd->ipath_cregbase +
+                     (char __iomem *)dd->ipath_kregbase));
+}
+
+static inline u32 ipath_read_creg32(const struct ipath_devdata *dd,
+                                        ipath_sreg regno)
+{
+       if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT))
+               return 0;
+       return readl(regno + (u64 __iomem *)
+                    (dd->ipath_cregbase +
+                     (char __iomem *)dd->ipath_kregbase));
+}
+
+static inline void ipath_write_creg(const struct ipath_devdata *dd,
+                                   ipath_creg regno, u64 value)
+{
+       if (dd->ipath_kregbase)
+               writeq(value, regno + (u64 __iomem *)
+                      (dd->ipath_cregbase +
+                       (char __iomem *)dd->ipath_kregbase));
+}
+
+static inline void ipath_clear_rcvhdrtail(const struct ipath_portdata *pd)
+{
+       *((u64 *) pd->port_rcvhdrtail_kvaddr) = 0ULL;
+}
+
+static inline u32 ipath_get_rcvhdrtail(const struct ipath_portdata *pd)
+{
+       return (u32) le64_to_cpu(*((volatile __le64 *)
+                               pd->port_rcvhdrtail_kvaddr));
+}
+
+static inline u32 ipath_get_hdrqtail(const struct ipath_portdata *pd)
+{
+       const struct ipath_devdata *dd = pd->port_dd;
+       u32 hdrqtail;
+
+       if (dd->ipath_flags & IPATH_NODMA_RTAIL) {
+               __le32 *rhf_addr;
+               u32 seq;
+
+               rhf_addr = (__le32 *) pd->port_rcvhdrq +
+                       pd->port_head + dd->ipath_rhf_offset;
+               seq = ipath_hdrget_seq(rhf_addr);
+               hdrqtail = pd->port_head;
+               if (seq == pd->port_seq_cnt)
+                       hdrqtail++;
+       } else
+               hdrqtail = ipath_get_rcvhdrtail(pd);
+
+       return hdrqtail;
+}
+
+static inline u64 ipath_read_ireg(const struct ipath_devdata *dd, ipath_kreg r)
+{
+       return (dd->ipath_flags & IPATH_INTREG_64) ?
+               ipath_read_kreg64(dd, r) : ipath_read_kreg32(dd, r);
+}
+
+/*
+ * from contents of IBCStatus (or a saved copy), return linkstate
+ * Report ACTIVE_DEFER as ACTIVE, because we treat them the same
+ * everywhere, anyway (and should be, for almost all purposes).
+ */
+static inline u32 ipath_ib_linkstate(struct ipath_devdata *dd, u64 ibcs)
+{
+       u32 state = (u32)(ibcs >> dd->ibcs_ls_shift) &
+               INFINIPATH_IBCS_LINKSTATE_MASK;
+       if (state == INFINIPATH_IBCS_L_STATE_ACT_DEFER)
+               state = INFINIPATH_IBCS_L_STATE_ACTIVE;
+       return state;
+}
+
+/* from contents of IBCStatus (or a saved copy), return linktrainingstate */
+static inline u32 ipath_ib_linktrstate(struct ipath_devdata *dd, u64 ibcs)
+{
+       return (u32)(ibcs >> INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) &
+               dd->ibcs_lts_mask;
+}
+
+/*
+ * from contents of IBCStatus (or a saved copy), return logical link state
+ * combination of link state and linktraining state (down, active, init,
+ * arm, etc.)
+ */
+static inline u32 ipath_ib_state(struct ipath_devdata *dd, u64 ibcs)
+{
+       u32 ibs;
+       ibs = (u32)(ibcs >> INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) &
+               dd->ibcs_lts_mask;
+       ibs |= (u32)(ibcs &
+               (INFINIPATH_IBCS_LINKSTATE_MASK << dd->ibcs_ls_shift));
+       return ibs;
+}
+
+/*
+ * sysfs interface.
+ */
+
+struct device_driver;
+
+extern const char ib_ipath_version[];
+
+extern const struct attribute_group *ipath_driver_attr_groups[];
+
+int ipath_device_create_group(struct device *, struct ipath_devdata *);
+void ipath_device_remove_group(struct device *, struct ipath_devdata *);
+int ipath_expose_reset(struct device *);
+
+int ipath_init_ipathfs(void);
+void ipath_exit_ipathfs(void);
+int ipathfs_add_device(struct ipath_devdata *);
+int ipathfs_remove_device(struct ipath_devdata *);
+
+/*
+ * dma_addr wrappers - all 0's invalid for hw
+ */
+dma_addr_t ipath_map_page(struct pci_dev *, struct page *, unsigned long,
+                         size_t, int);
+dma_addr_t ipath_map_single(struct pci_dev *, void *, size_t, int);
+const char *ipath_get_unit_name(int unit);
+
+/*
+ * Flush write combining store buffers (if present) and perform a write
+ * barrier.
+ */
+#if defined(CONFIG_X86_64)
+#define ipath_flush_wc() asm volatile("sfence" ::: "memory")
+#else
+#define ipath_flush_wc() wmb()
+#endif
+
+extern unsigned ipath_debug; /* debugging bit mask */
+extern unsigned ipath_linkrecovery;
+extern unsigned ipath_mtu4096;
+extern struct mutex ipath_mutex;
+
+#define IPATH_DRV_NAME         "ib_ipath"
+#define IPATH_MAJOR            233
+#define IPATH_USER_MINOR_BASE  0
+#define IPATH_DIAGPKT_MINOR    127
+#define IPATH_DIAG_MINOR_BASE  129
+#define IPATH_NMINORS          255
+
+#define ipath_dev_err(dd,fmt,...) \
+       do { \
+               const struct ipath_devdata *__dd = (dd); \
+               if (__dd->pcidev) \
+                       dev_err(&__dd->pcidev->dev, "%s: " fmt, \
+                               ipath_get_unit_name(__dd->ipath_unit), \
+                               ##__VA_ARGS__); \
+               else \
+                       printk(KERN_ERR IPATH_DRV_NAME ": %s: " fmt, \
+                              ipath_get_unit_name(__dd->ipath_unit), \
+                              ##__VA_ARGS__); \
+       } while (0)
+
+#if _IPATH_DEBUGGING
+
+# define __IPATH_DBG_WHICH(which,fmt,...) \
+       do { \
+               if (unlikely(ipath_debug & (which))) \
+                       printk(KERN_DEBUG IPATH_DRV_NAME ": %s: " fmt, \
+                              __func__,##__VA_ARGS__); \
+       } while(0)
+
+# define ipath_dbg(fmt,...) \
+       __IPATH_DBG_WHICH(__IPATH_DBG,fmt,##__VA_ARGS__)
+# define ipath_cdbg(which,fmt,...) \
+       __IPATH_DBG_WHICH(__IPATH_##which##DBG,fmt,##__VA_ARGS__)
+
+#else /* ! _IPATH_DEBUGGING */
+
+# define ipath_dbg(fmt,...)
+# define ipath_cdbg(which,fmt,...)
+
+#endif /* _IPATH_DEBUGGING */
+
+/*
+ * this is used for formatting hw error messages...
+ */
+struct ipath_hwerror_msgs {
+       u64 mask;
+       const char *msg;
+};
+
+#define INFINIPATH_HWE_MSG(a, b) { .mask = INFINIPATH_HWE_##a, .msg = b }
+
+/* in ipath_intr.c... */
+void ipath_format_hwerrors(u64 hwerrs,
+                          const struct ipath_hwerror_msgs *hwerrmsgs,
+                          size_t nhwerrmsgs,
+                          char *msg, size_t lmsg);
+
+#endif                         /* _IPATH_KERNEL_H */
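
The SDMA descriptor accounting in this header uses a pair of monotonically increasing counters (ipath_sdma_descq_added / ipath_sdma_descq_removed) rather than wrapped ring indices, so the free count in ipath_sdma_descq_freecnt() is the ring size minus the in-flight delta, minus one held-back slot (the usual ring full-vs-empty guard), minus whatever callers have reserved. A standalone sketch of the same arithmetic with a mock structure (field names shortened for the example):

#include <stdint.h>
#include <stdio.h>

struct sdma_ring {
	uint16_t descq_cnt;       /* total descriptors in the ring */
	uint64_t descq_added;     /* descriptors ever queued */
	uint64_t descq_removed;   /* descriptors ever completed */
	int      desc_nreserved;  /* descriptors held back by callers */
};

/* Same arithmetic as ipath_sdma_descq_freecnt() above. */
static uint16_t freecnt(const struct sdma_ring *r)
{
	return r->descq_cnt
		- (uint16_t)(r->descq_added - r->descq_removed)
		- 1 - r->desc_nreserved;
}

int main(void)
{
	struct sdma_ring r = {
		.descq_cnt = 256, .descq_added = 1000,
		.descq_removed = 900, .desc_nreserved = 4,
	};
	/* 256 total - 100 in flight - 1 guard slot - 4 reserved = 151 free */
	printf("free descriptors: %u\n", (unsigned)freecnt(&r));
	return 0;
}
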
diff --git a/drivers/staging/rdma/ipath/ipath_keys.c b/drivers/staging/rdma/ipath/ipath_keys.c
new file mode 100644 (file)
index 0000000..c0e933f
--- /dev/null
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <asm/io.h>
+
+#include "ipath_verbs.h"
+#include "ipath_kernel.h"
+
+/**
+ * ipath_alloc_lkey - allocate an lkey
+ * @rkt: lkey table in which to allocate the lkey
+ * @mr: memory region that this lkey protects
+ *
+ * Returns 1 if successful, otherwise returns 0.
+ */
+
+int ipath_alloc_lkey(struct ipath_lkey_table *rkt, struct ipath_mregion *mr)
+{
+       unsigned long flags;
+       u32 r;
+       u32 n;
+       int ret;
+
+       spin_lock_irqsave(&rkt->lock, flags);
+
+       /* Find the next available LKEY */
+       r = n = rkt->next;
+       for (;;) {
+               if (rkt->table[r] == NULL)
+                       break;
+               r = (r + 1) & (rkt->max - 1);
+               if (r == n) {
+                       spin_unlock_irqrestore(&rkt->lock, flags);
+                       ipath_dbg("LKEY table full\n");
+                       ret = 0;
+                       goto bail;
+               }
+       }
+       rkt->next = (r + 1) & (rkt->max - 1);
+       /*
+        * Make sure lkey is never zero which is reserved to indicate an
+        * unrestricted LKEY.
+        */
+       rkt->gen++;
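+       /*
+        * lkey format (as constructed below): table index in the top
+        * ib_ipath_lkey_table_size bits, generation count in bits 8 and up
+        * (so a reused table slot yields a different key), low 8 bits zero.
+        */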
+       mr->lkey = (r << (32 - ib_ipath_lkey_table_size)) |
+               ((((1 << (24 - ib_ipath_lkey_table_size)) - 1) & rkt->gen)
+                << 8);
+       if (mr->lkey == 0) {
+               mr->lkey |= 1 << 8;
+               rkt->gen++;
+       }
+       rkt->table[r] = mr;
+       spin_unlock_irqrestore(&rkt->lock, flags);
+
+       ret = 1;
+
+bail:
+       return ret;
+}
+
+/**
+ * ipath_free_lkey - free an lkey
+ * @rkt: table from which to free the lkey
+ * @lkey: lkey id to free
+ */
+void ipath_free_lkey(struct ipath_lkey_table *rkt, u32 lkey)
+{
+       unsigned long flags;
+       u32 r;
+
+       if (lkey == 0)
+               return;
+       r = lkey >> (32 - ib_ipath_lkey_table_size);
+       spin_lock_irqsave(&rkt->lock, flags);
+       rkt->table[r] = NULL;
+       spin_unlock_irqrestore(&rkt->lock, flags);
+}
+
+/**
+ * ipath_lkey_ok - check IB SGE for validity and initialize
+ * @qp: queue pair from which the LKEY table is looked up
+ * @isge: outgoing internal SGE
+ * @sge: SGE to check
+ * @acc: access flags
+ *
+ * Returns 1 if valid and successful, otherwise returns 0.
+ *
+ * Check the IB SGE for validity and initialize our internal version
+ * of it.
+ */
+int ipath_lkey_ok(struct ipath_qp *qp, struct ipath_sge *isge,
+                 struct ib_sge *sge, int acc)
+{
+       struct ipath_lkey_table *rkt = &to_idev(qp->ibqp.device)->lk_table;
+       struct ipath_mregion *mr;
+       unsigned n, m;
+       size_t off;
+       int ret;
+
+       /*
+        * We use LKEY == zero for kernel virtual addresses
+        * (see ipath_get_dma_mr and ipath_dma.c).
+        */
+       if (sge->lkey == 0) {
+               /* always a kernel port, no locking needed */
+               struct ipath_pd *pd = to_ipd(qp->ibqp.pd);
+
+               if (pd->user) {
+                       ret = 0;
+                       goto bail;
+               }
+               isge->mr = NULL;
+               isge->vaddr = (void *) sge->addr;
+               isge->length = sge->length;
+               isge->sge_length = sge->length;
+               ret = 1;
+               goto bail;
+       }
+       mr = rkt->table[(sge->lkey >> (32 - ib_ipath_lkey_table_size))];
+       if (unlikely(mr == NULL || mr->lkey != sge->lkey ||
+                    qp->ibqp.pd != mr->pd)) {
+               ret = 0;
+               goto bail;
+       }
+
+       off = sge->addr - mr->user_base;
+       if (unlikely(sge->addr < mr->user_base ||
+                    off + sge->length > mr->length ||
+                    (mr->access_flags & acc) != acc)) {
+               ret = 0;
+               goto bail;
+       }
+
+       off += mr->offset;
+       m = 0;
+       n = 0;
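+       /* Walk the two-level map[m]->segs[n] array to the chunk holding 'off'. */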
+       while (off >= mr->map[m]->segs[n].length) {
+               off -= mr->map[m]->segs[n].length;
+               n++;
+               if (n >= IPATH_SEGSZ) {
+                       m++;
+                       n = 0;
+               }
+       }
+       isge->mr = mr;
+       isge->vaddr = mr->map[m]->segs[n].vaddr + off;
+       isge->length = mr->map[m]->segs[n].length - off;
+       isge->sge_length = sge->length;
+       isge->m = m;
+       isge->n = n;
+
+       ret = 1;
+
+bail:
+       return ret;
+}
+
+/**
+ * ipath_rkey_ok - check the IB virtual address, length, and RKEY
+ * @qp: queue pair from which the RKEY table is looked up
+ * @ss: SGE state
+ * @len: length of data
+ * @vaddr: virtual address to place data
+ * @rkey: rkey to check
+ * @acc: access flags
+ *
+ * Return 1 if successful, otherwise 0.
+ */
+int ipath_rkey_ok(struct ipath_qp *qp, struct ipath_sge_state *ss,
+                 u32 len, u64 vaddr, u32 rkey, int acc)
+{
+       struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+       struct ipath_lkey_table *rkt = &dev->lk_table;
+       struct ipath_sge *sge = &ss->sge;
+       struct ipath_mregion *mr;
+       unsigned n, m;
+       size_t off;
+       int ret;
+
+       /*
+        * We use RKEY == zero for kernel virtual addresses
+        * (see ipath_get_dma_mr and ipath_dma.c).
+        */
+       if (rkey == 0) {
+               /* always a kernel port, no locking needed */
+               struct ipath_pd *pd = to_ipd(qp->ibqp.pd);
+
+               if (pd->user) {
+                       ret = 0;
+                       goto bail;
+               }
+               sge->mr = NULL;
+               sge->vaddr = (void *) vaddr;
+               sge->length = len;
+               sge->sge_length = len;
+               ss->sg_list = NULL;
+               ss->num_sge = 1;
+               ret = 1;
+               goto bail;
+       }
+
+       mr = rkt->table[(rkey >> (32 - ib_ipath_lkey_table_size))];
+       if (unlikely(mr == NULL || mr->lkey != rkey ||
+                    qp->ibqp.pd != mr->pd)) {
+               ret = 0;
+               goto bail;
+       }
+
+       off = vaddr - mr->iova;
+       if (unlikely(vaddr < mr->iova || off + len > mr->length ||
+                    (mr->access_flags & acc) == 0)) {
+               ret = 0;
+               goto bail;
+       }
+
+       off += mr->offset;
+       m = 0;
+       n = 0;
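+       /* Find the memory region segment that contains the starting offset. */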
+       while (off >= mr->map[m]->segs[n].length) {
+               off -= mr->map[m]->segs[n].length;
+               n++;
+               if (n >= IPATH_SEGSZ) {
+                       m++;
+                       n = 0;
+               }
+       }
+       sge->mr = mr;
+       sge->vaddr = mr->map[m]->segs[n].vaddr + off;
+       sge->length = mr->map[m]->segs[n].length - off;
+       sge->sge_length = len;
+       sge->m = m;
+       sge->n = n;
+       ss->sg_list = NULL;
+       ss->num_sge = 1;
+
+       ret = 1;
+
+bail:
+       return ret;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_mad.c b/drivers/staging/rdma/ipath/ipath_mad.c
new file mode 100644 (file)
index 0000000..ad3a926
--- /dev/null
@@ -0,0 +1,1521 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_smi.h>
+#include <rdma/ib_pma.h>
+
+#include "ipath_kernel.h"
+#include "ipath_verbs.h"
+#include "ipath_common.h"
+
+#define IB_SMP_UNSUP_VERSION   cpu_to_be16(0x0004)
+#define IB_SMP_UNSUP_METHOD    cpu_to_be16(0x0008)
+#define IB_SMP_UNSUP_METH_ATTR cpu_to_be16(0x000C)
+#define IB_SMP_INVALID_FIELD   cpu_to_be16(0x001C)
+
+static int reply(struct ib_smp *smp)
+{
+       /*
+        * The verbs framework will handle the directed/LID route
+        * packet changes.
+        */
+       smp->method = IB_MGMT_METHOD_GET_RESP;
+       if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+               smp->status |= IB_SMP_DIRECTION;
+       return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+}
+
+static int recv_subn_get_nodedescription(struct ib_smp *smp,
+                                        struct ib_device *ibdev)
+{
+       if (smp->attr_mod)
+               smp->status |= IB_SMP_INVALID_FIELD;
+
+       memcpy(smp->data, ibdev->node_desc, sizeof(smp->data));
+
+       return reply(smp);
+}
+
+struct nodeinfo {
+       u8 base_version;
+       u8 class_version;
+       u8 node_type;
+       u8 num_ports;
+       __be64 sys_guid;
+       __be64 node_guid;
+       __be64 port_guid;
+       __be16 partition_cap;
+       __be16 device_id;
+       __be32 revision;
+       u8 local_port_num;
+       u8 vendor_id[3];
+} __attribute__ ((packed));
+
+static int recv_subn_get_nodeinfo(struct ib_smp *smp,
+                                 struct ib_device *ibdev, u8 port)
+{
+       struct nodeinfo *nip = (struct nodeinfo *)&smp->data;
+       struct ipath_devdata *dd = to_idev(ibdev)->dd;
+       u32 vendor, majrev, minrev;
+
+       /* GUID 0 is illegal */
+       if (smp->attr_mod || (dd->ipath_guid == 0))
+               smp->status |= IB_SMP_INVALID_FIELD;
+
+       nip->base_version = 1;
+       nip->class_version = 1;
+       nip->node_type = 1;     /* channel adapter */
+       /*
+        * XXX The num_ports value will need a layer function to get
+        * the value if we ever have more than one IB port on a chip.
+        * We will also need to get the GUID for the port.
+        */
+       nip->num_ports = ibdev->phys_port_cnt;
+       /* This is already in network order */
+       nip->sys_guid = to_idev(ibdev)->sys_image_guid;
+       nip->node_guid = dd->ipath_guid;
+       nip->port_guid = dd->ipath_guid;
+       nip->partition_cap = cpu_to_be16(ipath_get_npkeys(dd));
+       nip->device_id = cpu_to_be16(dd->ipath_deviceid);
+       majrev = dd->ipath_majrev;
+       minrev = dd->ipath_minrev;
+       nip->revision = cpu_to_be32((majrev << 16) | minrev);
+       nip->local_port_num = port;
+       vendor = dd->ipath_vendorid;
+       nip->vendor_id[0] = IPATH_SRC_OUI_1;
+       nip->vendor_id[1] = IPATH_SRC_OUI_2;
+       nip->vendor_id[2] = IPATH_SRC_OUI_3;
+
+       return reply(smp);
+}
+
+static int recv_subn_get_guidinfo(struct ib_smp *smp,
+                                 struct ib_device *ibdev)
+{
+       u32 startgx = 8 * be32_to_cpu(smp->attr_mod);
+       __be64 *p = (__be64 *) smp->data;
+
+       /* 32 blocks of 8 64-bit GUIDs per block */
+
+       memset(smp->data, 0, sizeof(smp->data));
+
+       /*
+        * We only support one GUID for now.  If this changes, the
+        * portinfo.guid_cap field needs to be updated too.
+        */
+       if (startgx == 0) {
+               __be64 g = to_idev(ibdev)->dd->ipath_guid;
+               if (g == 0)
+                       /* GUID 0 is illegal */
+                       smp->status |= IB_SMP_INVALID_FIELD;
+               else
+                       /* The first is a copy of the read-only HW GUID. */
+                       *p = g;
+       } else
+               smp->status |= IB_SMP_INVALID_FIELD;
+
+       return reply(smp);
+}
+
+static void set_link_width_enabled(struct ipath_devdata *dd, u32 w)
+{
+       (void) dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LWID_ENB, w);
+}
+
+static void set_link_speed_enabled(struct ipath_devdata *dd, u32 s)
+{
+       (void) dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_SPD_ENB, s);
+}
+
+static int get_overrunthreshold(struct ipath_devdata *dd)
+{
+       return (dd->ipath_ibcctrl >>
+               INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT) &
+               INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK;
+}
+
+/**
+ * set_overrunthreshold - set the overrun threshold
+ * @dd: the infinipath device
+ * @n: the new threshold
+ *
+ * Note that this will only take effect when the link state changes.
+ */
+static int set_overrunthreshold(struct ipath_devdata *dd, unsigned n)
+{
+       unsigned v;
+
+       v = (dd->ipath_ibcctrl >> INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT) &
+               INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK;
+       if (v != n) {
+               dd->ipath_ibcctrl &=
+                       ~(INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK <<
+                         INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT);
+               dd->ipath_ibcctrl |=
+                       (u64) n << INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT;
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
+                                dd->ipath_ibcctrl);
+       }
+       return 0;
+}
+
+static int get_phyerrthreshold(struct ipath_devdata *dd)
+{
+       return (dd->ipath_ibcctrl >>
+               INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT) &
+               INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK;
+}
+
+/**
+ * set_phyerrthreshold - set the physical error threshold
+ * @dd: the infinipath device
+ * @n: the new threshold
+ *
+ * Note that this will only take effect when the link state changes.
+ */
+static int set_phyerrthreshold(struct ipath_devdata *dd, unsigned n)
+{
+       unsigned v;
+
+       v = (dd->ipath_ibcctrl >> INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT) &
+               INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK;
+       if (v != n) {
+               dd->ipath_ibcctrl &=
+                       ~(INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK <<
+                         INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT);
+               dd->ipath_ibcctrl |=
+                       (u64) n << INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT;
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
+                                dd->ipath_ibcctrl);
+       }
+       return 0;
+}
+
+/**
+ * get_linkdowndefaultstate - get the default linkdown state
+ * @dd: the infinipath device
+ *
+ * Returns zero if the default is POLL, 1 if the default is SLEEP.
+ */
+static int get_linkdowndefaultstate(struct ipath_devdata *dd)
+{
+       return !!(dd->ipath_ibcctrl & INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE);
+}
+
+static int recv_subn_get_portinfo(struct ib_smp *smp,
+                                 struct ib_device *ibdev, u8 port)
+{
+       struct ipath_ibdev *dev;
+       struct ipath_devdata *dd;
+       struct ib_port_info *pip = (struct ib_port_info *)smp->data;
+       u16 lid;
+       u8 ibcstat;
+       u8 mtu;
+       int ret;
+
+       if (be32_to_cpu(smp->attr_mod) > ibdev->phys_port_cnt) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               ret = reply(smp);
+               goto bail;
+       }
+
+       dev = to_idev(ibdev);
+       dd = dev->dd;
+
+       /* Clear all fields.  Only set the non-zero fields. */
+       memset(smp->data, 0, sizeof(smp->data));
+
+       /* Only return the mkey if the protection field allows it. */
+       if (smp->method == IB_MGMT_METHOD_SET || dev->mkey == smp->mkey ||
+           dev->mkeyprot == 0)
+               pip->mkey = dev->mkey;
+       pip->gid_prefix = dev->gid_prefix;
+       lid = dd->ipath_lid;
+       pip->lid = lid ? cpu_to_be16(lid) : IB_LID_PERMISSIVE;
+       pip->sm_lid = cpu_to_be16(dev->sm_lid);
+       pip->cap_mask = cpu_to_be32(dev->port_cap_flags);
+       /* pip->diag_code; */
+       pip->mkey_lease_period = cpu_to_be16(dev->mkey_lease_period);
+       pip->local_port_num = port;
+       pip->link_width_enabled = dd->ipath_link_width_enabled;
+       pip->link_width_supported = dd->ipath_link_width_supported;
+       pip->link_width_active = dd->ipath_link_width_active;
+       pip->linkspeed_portstate = dd->ipath_link_speed_supported << 4;
+       ibcstat = dd->ipath_lastibcstat;
+       /* map LinkState to IB portinfo values.  */
+       pip->linkspeed_portstate |= ipath_ib_linkstate(dd, ibcstat) + 1;
+
+       pip->portphysstate_linkdown =
+               (ipath_cvt_physportstate[ibcstat & dd->ibcs_lts_mask] << 4) |
+               (get_linkdowndefaultstate(dd) ? 1 : 2);
+       pip->mkeyprot_resv_lmc = (dev->mkeyprot << 6) | dd->ipath_lmc;
+       pip->linkspeedactive_enabled = (dd->ipath_link_speed_active << 4) |
+               dd->ipath_link_speed_enabled;
+       switch (dd->ipath_ibmtu) {
+       case 4096:
+               mtu = IB_MTU_4096;
+               break;
+       case 2048:
+               mtu = IB_MTU_2048;
+               break;
+       case 1024:
+               mtu = IB_MTU_1024;
+               break;
+       case 512:
+               mtu = IB_MTU_512;
+               break;
+       case 256:
+               mtu = IB_MTU_256;
+               break;
+       default:                /* oops, something is wrong */
+               mtu = IB_MTU_2048;
+               break;
+       }
+       pip->neighbormtu_mastersmsl = (mtu << 4) | dev->sm_sl;
+       pip->vlcap_inittype = 0x10;     /* VLCap = VL0, InitType = 0 */
+       pip->vl_high_limit = dev->vl_high_limit;
+       /* pip->vl_arb_high_cap; // only one VL */
+       /* pip->vl_arb_low_cap; // only one VL */
+       /* InitTypeReply = 0 */
+       /* our mtu cap depends on whether 4K MTU enabled or not */
+       pip->inittypereply_mtucap = ipath_mtu4096 ? IB_MTU_4096 : IB_MTU_2048;
+       /* HCAs ignore VLStallCount and HOQLife */
+       /* pip->vlstallcnt_hoqlife; */
+       pip->operationalvl_pei_peo_fpi_fpo = 0x10;      /* OVLs = 1 */
+       pip->mkey_violations = cpu_to_be16(dev->mkey_violations);
+       /* P_KeyViolations are counted by hardware. */
+       pip->pkey_violations =
+               cpu_to_be16((ipath_get_cr_errpkey(dd) -
+                            dev->z_pkey_violations) & 0xFFFF);
+       pip->qkey_violations = cpu_to_be16(dev->qkey_violations);
+       /* Only the hardware GUID is supported for now */
+       pip->guid_cap = 1;
+       pip->clientrereg_resv_subnetto = dev->subnet_timeout;
+       /* 32.768 usec. response time (guessing) */
+       pip->resv_resptimevalue = 3;
+       pip->localphyerrors_overrunerrors =
+               (get_phyerrthreshold(dd) << 4) |
+               get_overrunthreshold(dd);
+       /* pip->max_credit_hint; */
+       if (dev->port_cap_flags & IB_PORT_LINK_LATENCY_SUP) {
+               u32 v;
+
+               v = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LINKLATENCY);
+               pip->link_roundtrip_latency[0] = v >> 16;
+               pip->link_roundtrip_latency[1] = v >> 8;
+               pip->link_roundtrip_latency[2] = v;
+       }
+
+       ret = reply(smp);
+
+bail:
+       return ret;
+}
+
+/**
+ * get_pkeys - return the PKEY table for port 0
+ * @dd: the infinipath device
+ * @pkeys: the pkey table is placed here
+ */
+static int get_pkeys(struct ipath_devdata *dd, u16 *pkeys)
+{
+       /* always a kernel port, no locking needed */
+       struct ipath_portdata *pd = dd->ipath_pd[0];
+
+       memcpy(pkeys, pd->port_pkeys, sizeof(pd->port_pkeys));
+
+       return 0;
+}
+
+static int recv_subn_get_pkeytable(struct ib_smp *smp,
+                                  struct ib_device *ibdev)
+{
+       u32 startpx = 32 * (be32_to_cpu(smp->attr_mod) & 0xffff);
+       u16 *p = (u16 *) smp->data;
+       __be16 *q = (__be16 *) smp->data;
+
+       /* 64 blocks of 32 16-bit P_Key entries */
+
+       memset(smp->data, 0, sizeof(smp->data));
+       if (startpx == 0) {
+               struct ipath_ibdev *dev = to_idev(ibdev);
+               unsigned i, n = ipath_get_npkeys(dev->dd);
+
+               get_pkeys(dev->dd, p);
+
+               for (i = 0; i < n; i++)
+                       q[i] = cpu_to_be16(p[i]);
+       } else
+               smp->status |= IB_SMP_INVALID_FIELD;
+
+       return reply(smp);
+}
+
+static int recv_subn_set_guidinfo(struct ib_smp *smp,
+                                 struct ib_device *ibdev)
+{
+       /* The only GUID we support is the first read-only entry. */
+       return recv_subn_get_guidinfo(smp, ibdev);
+}
+
+/**
+ * set_linkdowndefaultstate - set the default linkdown state
+ * @dd: the infinipath device
+ * @sleep: the new state
+ *
+ * Note that this will only take effect when the link state changes.
+ */
+static int set_linkdowndefaultstate(struct ipath_devdata *dd, int sleep)
+{
+       if (sleep)
+               dd->ipath_ibcctrl |= INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE;
+       else
+               dd->ipath_ibcctrl &= ~INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE;
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
+                        dd->ipath_ibcctrl);
+       return 0;
+}
+
+/**
+ * recv_subn_set_portinfo - set port information
+ * @smp: the incoming SM packet
+ * @ibdev: the infiniband device
+ * @port: the port on the device
+ *
+ * Set Portinfo (see ch. 14.2.5.6).
+ */
+static int recv_subn_set_portinfo(struct ib_smp *smp,
+                                 struct ib_device *ibdev, u8 port)
+{
+       struct ib_port_info *pip = (struct ib_port_info *)smp->data;
+       struct ib_event event;
+       struct ipath_ibdev *dev;
+       struct ipath_devdata *dd;
+       char clientrereg = 0;
+       u16 lid, smlid;
+       u8 lwe;
+       u8 lse;
+       u8 state;
+       u16 lstate;
+       u32 mtu;
+       int ret, ore;
+
+       if (be32_to_cpu(smp->attr_mod) > ibdev->phys_port_cnt)
+               goto err;
+
+       dev = to_idev(ibdev);
+       dd = dev->dd;
+       event.device = ibdev;
+       event.element.port_num = port;
+
+       dev->mkey = pip->mkey;
+       dev->gid_prefix = pip->gid_prefix;
+       dev->mkey_lease_period = be16_to_cpu(pip->mkey_lease_period);
+
+       lid = be16_to_cpu(pip->lid);
+       if (dd->ipath_lid != lid ||
+           dd->ipath_lmc != (pip->mkeyprot_resv_lmc & 7)) {
+               /* Must be a valid unicast LID address. */
+               if (lid == 0 || lid >= IPATH_MULTICAST_LID_BASE)
+                       goto err;
+               ipath_set_lid(dd, lid, pip->mkeyprot_resv_lmc & 7);
+               event.event = IB_EVENT_LID_CHANGE;
+               ib_dispatch_event(&event);
+       }
+
+       smlid = be16_to_cpu(pip->sm_lid);
+       if (smlid != dev->sm_lid) {
+               /* Must be a valid unicast LID address. */
+               if (smlid == 0 || smlid >= IPATH_MULTICAST_LID_BASE)
+                       goto err;
+               dev->sm_lid = smlid;
+               event.event = IB_EVENT_SM_CHANGE;
+               ib_dispatch_event(&event);
+       }
+
+       /* Allow 1x or 4x to be set (see 14.2.6.6). */
+       lwe = pip->link_width_enabled;
+       if (lwe) {
+               if (lwe == 0xFF)
+                       lwe = dd->ipath_link_width_supported;
+               else if (lwe >= 16 || (lwe & ~dd->ipath_link_width_supported))
+                       goto err;
+               set_link_width_enabled(dd, lwe);
+       }
+
+       /* Allow 2.5 or 5.0 Gb/s. */
+       lse = pip->linkspeedactive_enabled & 0xF;
+       if (lse) {
+               if (lse == 15)
+                       lse = dd->ipath_link_speed_supported;
+               else if (lse >= 8 || (lse & ~dd->ipath_link_speed_supported))
+                       goto err;
+               set_link_speed_enabled(dd, lse);
+       }
+
+       /* Set link down default state. */
+       switch (pip->portphysstate_linkdown & 0xF) {
+       case 0: /* NOP */
+               break;
+       case 1: /* SLEEP */
+               if (set_linkdowndefaultstate(dd, 1))
+                       goto err;
+               break;
+       case 2: /* POLL */
+               if (set_linkdowndefaultstate(dd, 0))
+                       goto err;
+               break;
+       default:
+               goto err;
+       }
+
+       dev->mkeyprot = pip->mkeyprot_resv_lmc >> 6;
+       dev->vl_high_limit = pip->vl_high_limit;
+
+       switch ((pip->neighbormtu_mastersmsl >> 4) & 0xF) {
+       case IB_MTU_256:
+               mtu = 256;
+               break;
+       case IB_MTU_512:
+               mtu = 512;
+               break;
+       case IB_MTU_1024:
+               mtu = 1024;
+               break;
+       case IB_MTU_2048:
+               mtu = 2048;
+               break;
+       case IB_MTU_4096:
+               if (!ipath_mtu4096)
+                       goto err;
+               mtu = 4096;
+               break;
+       default:
+               /* XXX We have already partially updated our state! */
+               goto err;
+       }
+       ipath_set_mtu(dd, mtu);
+
+       dev->sm_sl = pip->neighbormtu_mastersmsl & 0xF;
+
+       /* We only support VL0 */
+       if (((pip->operationalvl_pei_peo_fpi_fpo >> 4) & 0xF) > 1)
+               goto err;
+
+       if (pip->mkey_violations == 0)
+               dev->mkey_violations = 0;
+
+       /*
+        * Hardware counter can't be reset so snapshot and subtract
+        * later.
+        */
+       if (pip->pkey_violations == 0)
+               dev->z_pkey_violations = ipath_get_cr_errpkey(dd);
+
+       if (pip->qkey_violations == 0)
+               dev->qkey_violations = 0;
+
+       ore = pip->localphyerrors_overrunerrors;
+       if (set_phyerrthreshold(dd, (ore >> 4) & 0xF))
+               goto err;
+
+       if (set_overrunthreshold(dd, (ore & 0xF)))
+               goto err;
+
+       dev->subnet_timeout = pip->clientrereg_resv_subnetto & 0x1F;
+
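+       /* Bit 7 requests client reregistration (IB_EVENT_CLIENT_REREGISTER). */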
+       if (pip->clientrereg_resv_subnetto & 0x80) {
+               clientrereg = 1;
+               event.event = IB_EVENT_CLIENT_REREGISTER;
+               ib_dispatch_event(&event);
+       }
+
+       /*
+        * Do the port state change now that the other link parameters
+        * have been set.
+        * Changing the port physical state only makes sense if the link
+        * is down or is being set to down.
+        */
+       state = pip->linkspeed_portstate & 0xF;
+       lstate = (pip->portphysstate_linkdown >> 4) & 0xF;
+       if (lstate && !(state == IB_PORT_DOWN || state == IB_PORT_NOP))
+               goto err;
+
+       /*
+        * Only state changes of DOWN, ARM, and ACTIVE are valid
+        * and must be in the correct state to take effect (see 7.2.6).
+        */
+       switch (state) {
+       case IB_PORT_NOP:
+               if (lstate == 0)
+                       break;
+               /* FALLTHROUGH */
+       case IB_PORT_DOWN:
+               if (lstate == 0)
+                       lstate = IPATH_IB_LINKDOWN_ONLY;
+               else if (lstate == 1)
+                       lstate = IPATH_IB_LINKDOWN_SLEEP;
+               else if (lstate == 2)
+                       lstate = IPATH_IB_LINKDOWN;
+               else if (lstate == 3)
+                       lstate = IPATH_IB_LINKDOWN_DISABLE;
+               else
+                       goto err;
+               ipath_set_linkstate(dd, lstate);
+               if (lstate == IPATH_IB_LINKDOWN_DISABLE) {
+                       ret = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+                       goto done;
+               }
+               ipath_wait_linkstate(dd, IPATH_LINKINIT | IPATH_LINKARMED |
+                               IPATH_LINKACTIVE, 1000);
+               break;
+       case IB_PORT_ARMED:
+               ipath_set_linkstate(dd, IPATH_IB_LINKARM);
+               break;
+       case IB_PORT_ACTIVE:
+               ipath_set_linkstate(dd, IPATH_IB_LINKACTIVE);
+               break;
+       default:
+               /* XXX We have already partially updated our state! */
+               goto err;
+       }
+
+       ret = recv_subn_get_portinfo(smp, ibdev, port);
+
+       if (clientrereg)
+               pip->clientrereg_resv_subnetto |= 0x80;
+
+       goto done;
+
+err:
+       smp->status |= IB_SMP_INVALID_FIELD;
+       ret = recv_subn_get_portinfo(smp, ibdev, port);
+
+done:
+       return ret;
+}
+
+/**
+ * rm_pkey - decrement the reference count for the given PKEY
+ * @dd: the infinipath device
+ * @key: the PKEY index
+ *
+ * Return true if this was the last reference and the hardware table entry
+ * needs to be changed.
+ */
+static int rm_pkey(struct ipath_devdata *dd, u16 key)
+{
+       int i;
+       int ret;
+
+       for (i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
+               if (dd->ipath_pkeys[i] != key)
+                       continue;
+               if (atomic_dec_and_test(&dd->ipath_pkeyrefs[i])) {
+                       dd->ipath_pkeys[i] = 0;
+                       ret = 1;
+                       goto bail;
+               }
+               break;
+       }
+
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+/**
+ * add_pkey - add the given PKEY to the hardware table
+ * @dd: the infinipath device
+ * @key: the PKEY
+ *
+ * Return an error code if unable to add the entry, zero if no change,
+ * or 1 if the hardware PKEY register needs to be updated.
+ */
+static int add_pkey(struct ipath_devdata *dd, u16 key)
+{
+       int i;
+       u16 lkey = key & 0x7FFF;
+       int any = 0;
+       int ret;
+
+       if (lkey == 0x7FFF) {
+               ret = 0;
+               goto bail;
+       }
+
+       /* Look for an empty slot or a matching PKEY. */
+       for (i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
+               if (!dd->ipath_pkeys[i]) {
+                       any++;
+                       continue;
+               }
+               /* If it matches exactly, try to increment the ref count */
+               if (dd->ipath_pkeys[i] == key) {
+                       if (atomic_inc_return(&dd->ipath_pkeyrefs[i]) > 1) {
+                               ret = 0;
+                               goto bail;
+                       }
+                       /* Lost the race. Look for an empty slot below. */
+                       atomic_dec(&dd->ipath_pkeyrefs[i]);
+                       any++;
+               }
+               /*
+                * It makes no sense to have both the limited and unlimited
+                * PKEY set at the same time since the unlimited one will
+                * disable the limited one.
+                */
+               if ((dd->ipath_pkeys[i] & 0x7FFF) == lkey) {
+                       ret = -EEXIST;
+                       goto bail;
+               }
+       }
+       if (!any) {
+               ret = -EBUSY;
+               goto bail;
+       }
+       for (i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
+               if (!dd->ipath_pkeys[i] &&
+                   atomic_inc_return(&dd->ipath_pkeyrefs[i]) == 1) {
+                       /* for ipathstats, etc. */
+                       ipath_stats.sps_pkeys[i] = lkey;
+                       dd->ipath_pkeys[i] = key;
+                       ret = 1;
+                       goto bail;
+               }
+       }
+       ret = -EBUSY;
+
+bail:
+       return ret;
+}
+
+/**
+ * set_pkeys - set the PKEY table for port 0
+ * @dd: the infinipath device
+ * @pkeys: the PKEY table
+ * @port: the IB port number (only used for the PKEY change event)
+ */
+static int set_pkeys(struct ipath_devdata *dd, u16 *pkeys, u8 port)
+{
+       struct ipath_portdata *pd;
+       int i;
+       int changed = 0;
+
+       /* always a kernel port, no locking needed */
+       pd = dd->ipath_pd[0];
+
+       for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) {
+               u16 key = pkeys[i];
+               u16 okey = pd->port_pkeys[i];
+
+               if (key == okey)
+                       continue;
+               /*
+                * The value of this PKEY table entry is changing.
+                * Remove the old entry in the hardware's array of PKEYs.
+                */
+               if (okey & 0x7FFF)
+                       changed |= rm_pkey(dd, okey);
+               if (key & 0x7FFF) {
+                       int ret = add_pkey(dd, key);
+
+                       if (ret < 0)
+                               key = 0;
+                       else
+                               changed |= ret;
+               }
+               pd->port_pkeys[i] = key;
+       }
+       if (changed) {
+               u64 pkey;
+               struct ib_event event;
+
+               pkey = (u64) dd->ipath_pkeys[0] |
+                       ((u64) dd->ipath_pkeys[1] << 16) |
+                       ((u64) dd->ipath_pkeys[2] << 32) |
+                       ((u64) dd->ipath_pkeys[3] << 48);
+               ipath_cdbg(VERBOSE, "p0 new pkey reg %llx\n",
+                          (unsigned long long) pkey);
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_partitionkey,
+                                pkey);
+
+               event.event = IB_EVENT_PKEY_CHANGE;
+               event.device = &dd->verbs_dev->ibdev;
+               event.element.port_num = port;
+               ib_dispatch_event(&event);
+       }
+       return 0;
+}
+
+static int recv_subn_set_pkeytable(struct ib_smp *smp,
+                                  struct ib_device *ibdev, u8 port)
+{
+       u32 startpx = 32 * (be32_to_cpu(smp->attr_mod) & 0xffff);
+       __be16 *p = (__be16 *) smp->data;
+       u16 *q = (u16 *) smp->data;
+       struct ipath_ibdev *dev = to_idev(ibdev);
+       unsigned i, n = ipath_get_npkeys(dev->dd);
+
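+       /* p and q alias smp->data: byteswap the incoming table in place. */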
+       for (i = 0; i < n; i++)
+               q[i] = be16_to_cpu(p[i]);
+
+       if (startpx != 0 || set_pkeys(dev->dd, q, port) != 0)
+               smp->status |= IB_SMP_INVALID_FIELD;
+
+       return recv_subn_get_pkeytable(smp, ibdev);
+}
+
+static int recv_pma_get_classportinfo(struct ib_pma_mad *pmp)
+{
+       struct ib_class_port_info *p =
+               (struct ib_class_port_info *)pmp->data;
+
+       memset(pmp->data, 0, sizeof(pmp->data));
+
+       if (pmp->mad_hdr.attr_mod != 0)
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+
+       /* Indicate AllPortSelect is valid (only one port anyway) */
+       p->capability_mask = cpu_to_be16(1 << 8);
+       p->base_version = 1;
+       p->class_version = 1;
+       /*
+        * Expected response time is 4.096 usec. * 2^18 == 1.073741824
+        * sec.
+        */
+       p->resp_time_value = 18;
+
+       return reply((struct ib_smp *) pmp);
+}
+
+/*
+ * The PortSamplesControl.CounterMasks field is an array of 3 bit fields
+ * which specify the N'th counter's capabilities. See ch. 16.1.3.2.
+ * We support 5 counters which only count the mandatory quantities.
+ */
+#define COUNTER_MASK(q, n) (q << ((9 - n) * 3))
+#define COUNTER_MASK0_9 cpu_to_be32(COUNTER_MASK(1, 0) | \
+                                   COUNTER_MASK(1, 1) | \
+                                   COUNTER_MASK(1, 2) | \
+                                   COUNTER_MASK(1, 3) | \
+                                   COUNTER_MASK(1, 4))
+
+static int recv_pma_get_portsamplescontrol(struct ib_pma_mad *pmp,
+                                          struct ib_device *ibdev, u8 port)
+{
+       struct ib_pma_portsamplescontrol *p =
+               (struct ib_pma_portsamplescontrol *)pmp->data;
+       struct ipath_ibdev *dev = to_idev(ibdev);
+       struct ipath_cregs const *crp = dev->dd->ipath_cregs;
+       unsigned long flags;
+       u8 port_select = p->port_select;
+
+       memset(pmp->data, 0, sizeof(pmp->data));
+
+       p->port_select = port_select;
+       if (pmp->mad_hdr.attr_mod != 0 ||
+           (port_select != port && port_select != 0xFF))
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+       /*
+        * Ticks are 10x the link transfer period, which for 2.5Gb/s is 4
+        * nsec.  0 == 4 nsec., 1 == 8 nsec., ..., 255 == 1020 nsec.  Sample
+        * intervals are counted in ticks.  Since we use Linux timers, which
+        * count in jiffies, we can't sample for less than 1000 ticks if
+        * HZ == 1000 (4000 ticks if HZ is 250).  link_speed_active returns
+        * 2 for DDR and 1 for SDR, so on chips that have hardware support
+        * for delaying packets we set the tick to 1 for DDR and 0 for SDR.
+        */
+       if (crp->cr_psstat)
+               p->tick = dev->dd->ipath_link_speed_active - 1;
+       else
+               p->tick = 250;          /* 1 usec. */
+       p->counter_width = 4;   /* 32 bit counters */
+       p->counter_mask0_9 = COUNTER_MASK0_9;
+       spin_lock_irqsave(&dev->pending_lock, flags);
+       if (crp->cr_psstat)
+               p->sample_status = ipath_read_creg32(dev->dd, crp->cr_psstat);
+       else
+               p->sample_status = dev->pma_sample_status;
+       p->sample_start = cpu_to_be32(dev->pma_sample_start);
+       p->sample_interval = cpu_to_be32(dev->pma_sample_interval);
+       p->tag = cpu_to_be16(dev->pma_tag);
+       p->counter_select[0] = dev->pma_counter_select[0];
+       p->counter_select[1] = dev->pma_counter_select[1];
+       p->counter_select[2] = dev->pma_counter_select[2];
+       p->counter_select[3] = dev->pma_counter_select[3];
+       p->counter_select[4] = dev->pma_counter_select[4];
+       spin_unlock_irqrestore(&dev->pending_lock, flags);
+
+       return reply((struct ib_smp *) pmp);
+}
+
+static int recv_pma_set_portsamplescontrol(struct ib_pma_mad *pmp,
+                                          struct ib_device *ibdev, u8 port)
+{
+       struct ib_pma_portsamplescontrol *p =
+               (struct ib_pma_portsamplescontrol *)pmp->data;
+       struct ipath_ibdev *dev = to_idev(ibdev);
+       struct ipath_cregs const *crp = dev->dd->ipath_cregs;
+       unsigned long flags;
+       u8 status;
+       int ret;
+
+       if (pmp->mad_hdr.attr_mod != 0 ||
+           (p->port_select != port && p->port_select != 0xFF)) {
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+               ret = reply((struct ib_smp *) pmp);
+               goto bail;
+       }
+
+       spin_lock_irqsave(&dev->pending_lock, flags);
+       if (crp->cr_psstat)
+               status = ipath_read_creg32(dev->dd, crp->cr_psstat);
+       else
+               status = dev->pma_sample_status;
+       if (status == IB_PMA_SAMPLE_STATUS_DONE) {
+               dev->pma_sample_start = be32_to_cpu(p->sample_start);
+               dev->pma_sample_interval = be32_to_cpu(p->sample_interval);
+               dev->pma_tag = be16_to_cpu(p->tag);
+               dev->pma_counter_select[0] = p->counter_select[0];
+               dev->pma_counter_select[1] = p->counter_select[1];
+               dev->pma_counter_select[2] = p->counter_select[2];
+               dev->pma_counter_select[3] = p->counter_select[3];
+               dev->pma_counter_select[4] = p->counter_select[4];
+               if (crp->cr_psstat) {
+                       ipath_write_creg(dev->dd, crp->cr_psinterval,
+                                        dev->pma_sample_interval);
+                       ipath_write_creg(dev->dd, crp->cr_psstart,
+                                        dev->pma_sample_start);
+               } else
+                       dev->pma_sample_status = IB_PMA_SAMPLE_STATUS_STARTED;
+       }
+       spin_unlock_irqrestore(&dev->pending_lock, flags);
+
+       ret = recv_pma_get_portsamplescontrol(pmp, ibdev, port);
+
+bail:
+       return ret;
+}
+
+static u64 get_counter(struct ipath_ibdev *dev,
+                      struct ipath_cregs const *crp,
+                      __be16 sel)
+{
+       u64 ret;
+
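+       /*
+        * Use the chip's port-sample counter register when the chip has one
+        * (crp->cr_ps*); otherwise fall back to the value sampled in
+        * software and cached in the ipath_ibdev.
+        */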
+       switch (sel) {
+       case IB_PMA_PORT_XMIT_DATA:
+               ret = (crp->cr_psxmitdatacount) ?
+                       ipath_read_creg32(dev->dd, crp->cr_psxmitdatacount) :
+                       dev->ipath_sword;
+               break;
+       case IB_PMA_PORT_RCV_DATA:
+               ret = (crp->cr_psrcvdatacount) ?
+                       ipath_read_creg32(dev->dd, crp->cr_psrcvdatacount) :
+                       dev->ipath_rword;
+               break;
+       case IB_PMA_PORT_XMIT_PKTS:
+               ret = (crp->cr_psxmitpktscount) ?
+                       ipath_read_creg32(dev->dd, crp->cr_psxmitpktscount) :
+                       dev->ipath_spkts;
+               break;
+       case IB_PMA_PORT_RCV_PKTS:
+               ret = (crp->cr_psrcvpktscount) ?
+                       ipath_read_creg32(dev->dd, crp->cr_psrcvpktscount) :
+                       dev->ipath_rpkts;
+               break;
+       case IB_PMA_PORT_XMIT_WAIT:
+               ret = (crp->cr_psxmitwaitcount) ?
+                       ipath_read_creg32(dev->dd, crp->cr_psxmitwaitcount) :
+                       dev->ipath_xmit_wait;
+               break;
+       default:
+               ret = 0;
+       }
+
+       return ret;
+}
+
+static int recv_pma_get_portsamplesresult(struct ib_pma_mad *pmp,
+                                         struct ib_device *ibdev)
+{
+       struct ib_pma_portsamplesresult *p =
+               (struct ib_pma_portsamplesresult *)pmp->data;
+       struct ipath_ibdev *dev = to_idev(ibdev);
+       struct ipath_cregs const *crp = dev->dd->ipath_cregs;
+       u8 status;
+       int i;
+
+       memset(pmp->data, 0, sizeof(pmp->data));
+       p->tag = cpu_to_be16(dev->pma_tag);
+       if (crp->cr_psstat)
+               status = ipath_read_creg32(dev->dd, crp->cr_psstat);
+       else
+               status = dev->pma_sample_status;
+       p->sample_status = cpu_to_be16(status);
+       for (i = 0; i < ARRAY_SIZE(dev->pma_counter_select); i++)
+               p->counter[i] = (status != IB_PMA_SAMPLE_STATUS_DONE) ? 0 :
+                   cpu_to_be32(
+                       get_counter(dev, crp, dev->pma_counter_select[i]));
+
+       return reply((struct ib_smp *) pmp);
+}
+
+static int recv_pma_get_portsamplesresult_ext(struct ib_pma_mad *pmp,
+                                             struct ib_device *ibdev)
+{
+       struct ib_pma_portsamplesresult_ext *p =
+               (struct ib_pma_portsamplesresult_ext *)pmp->data;
+       struct ipath_ibdev *dev = to_idev(ibdev);
+       struct ipath_cregs const *crp = dev->dd->ipath_cregs;
+       u8 status;
+       int i;
+
+       memset(pmp->data, 0, sizeof(pmp->data));
+       p->tag = cpu_to_be16(dev->pma_tag);
+       if (crp->cr_psstat)
+               status = ipath_read_creg32(dev->dd, crp->cr_psstat);
+       else
+               status = dev->pma_sample_status;
+       p->sample_status = cpu_to_be16(status);
+       /* 64 bits */
+       p->extended_width = cpu_to_be32(0x80000000);
+       for (i = 0; i < ARRAY_SIZE(dev->pma_counter_select); i++)
+               p->counter[i] = (status != IB_PMA_SAMPLE_STATUS_DONE) ? 0 :
+                   cpu_to_be64(
+                       get_counter(dev, crp, dev->pma_counter_select[i]));
+
+       return reply((struct ib_smp *) pmp);
+}
+
+static int recv_pma_get_portcounters(struct ib_pma_mad *pmp,
+                                    struct ib_device *ibdev, u8 port)
+{
+       struct ib_pma_portcounters *p = (struct ib_pma_portcounters *)
+               pmp->data;
+       struct ipath_ibdev *dev = to_idev(ibdev);
+       struct ipath_verbs_counters cntrs;
+       u8 port_select = p->port_select;
+
+       ipath_get_counters(dev->dd, &cntrs);
+
+       /* Adjust counters for any resets done. */
+       cntrs.symbol_error_counter -= dev->z_symbol_error_counter;
+       cntrs.link_error_recovery_counter -=
+               dev->z_link_error_recovery_counter;
+       cntrs.link_downed_counter -= dev->z_link_downed_counter;
+       cntrs.port_rcv_errors += dev->rcv_errors;
+       cntrs.port_rcv_errors -= dev->z_port_rcv_errors;
+       cntrs.port_rcv_remphys_errors -= dev->z_port_rcv_remphys_errors;
+       cntrs.port_xmit_discards -= dev->z_port_xmit_discards;
+       cntrs.port_xmit_data -= dev->z_port_xmit_data;
+       cntrs.port_rcv_data -= dev->z_port_rcv_data;
+       cntrs.port_xmit_packets -= dev->z_port_xmit_packets;
+       cntrs.port_rcv_packets -= dev->z_port_rcv_packets;
+       cntrs.local_link_integrity_errors -=
+               dev->z_local_link_integrity_errors;
+       cntrs.excessive_buffer_overrun_errors -=
+               dev->z_excessive_buffer_overrun_errors;
+       cntrs.vl15_dropped -= dev->z_vl15_dropped;
+       cntrs.vl15_dropped += dev->n_vl15_dropped;
+
+       memset(pmp->data, 0, sizeof(pmp->data));
+
+       p->port_select = port_select;
+       if (pmp->mad_hdr.attr_mod != 0 ||
+           (port_select != port && port_select != 0xFF))
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+
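+       /* Saturate each value at the width of its PortCounters field. */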
+       if (cntrs.symbol_error_counter > 0xFFFFUL)
+               p->symbol_error_counter = cpu_to_be16(0xFFFF);
+       else
+               p->symbol_error_counter =
+                       cpu_to_be16((u16)cntrs.symbol_error_counter);
+       if (cntrs.link_error_recovery_counter > 0xFFUL)
+               p->link_error_recovery_counter = 0xFF;
+       else
+               p->link_error_recovery_counter =
+                       (u8)cntrs.link_error_recovery_counter;
+       if (cntrs.link_downed_counter > 0xFFUL)
+               p->link_downed_counter = 0xFF;
+       else
+               p->link_downed_counter = (u8)cntrs.link_downed_counter;
+       if (cntrs.port_rcv_errors > 0xFFFFUL)
+               p->port_rcv_errors = cpu_to_be16(0xFFFF);
+       else
+               p->port_rcv_errors =
+                       cpu_to_be16((u16) cntrs.port_rcv_errors);
+       if (cntrs.port_rcv_remphys_errors > 0xFFFFUL)
+               p->port_rcv_remphys_errors = cpu_to_be16(0xFFFF);
+       else
+               p->port_rcv_remphys_errors =
+                       cpu_to_be16((u16)cntrs.port_rcv_remphys_errors);
+       if (cntrs.port_xmit_discards > 0xFFFFUL)
+               p->port_xmit_discards = cpu_to_be16(0xFFFF);
+       else
+               p->port_xmit_discards =
+                       cpu_to_be16((u16)cntrs.port_xmit_discards);
+       if (cntrs.local_link_integrity_errors > 0xFUL)
+               cntrs.local_link_integrity_errors = 0xFUL;
+       if (cntrs.excessive_buffer_overrun_errors > 0xFUL)
+               cntrs.excessive_buffer_overrun_errors = 0xFUL;
+       p->link_overrun_errors = (cntrs.local_link_integrity_errors << 4) |
+               cntrs.excessive_buffer_overrun_errors;
+       if (cntrs.vl15_dropped > 0xFFFFUL)
+               p->vl15_dropped = cpu_to_be16(0xFFFF);
+       else
+               p->vl15_dropped = cpu_to_be16((u16)cntrs.vl15_dropped);
+       if (cntrs.port_xmit_data > 0xFFFFFFFFUL)
+               p->port_xmit_data = cpu_to_be32(0xFFFFFFFF);
+       else
+               p->port_xmit_data = cpu_to_be32((u32)cntrs.port_xmit_data);
+       if (cntrs.port_rcv_data > 0xFFFFFFFFUL)
+               p->port_rcv_data = cpu_to_be32(0xFFFFFFFF);
+       else
+               p->port_rcv_data = cpu_to_be32((u32)cntrs.port_rcv_data);
+       if (cntrs.port_xmit_packets > 0xFFFFFFFFUL)
+               p->port_xmit_packets = cpu_to_be32(0xFFFFFFFF);
+       else
+               p->port_xmit_packets =
+                       cpu_to_be32((u32)cntrs.port_xmit_packets);
+       if (cntrs.port_rcv_packets > 0xFFFFFFFFUL)
+               p->port_rcv_packets = cpu_to_be32(0xFFFFFFFF);
+       else
+               p->port_rcv_packets =
+                       cpu_to_be32((u32) cntrs.port_rcv_packets);
+
+       return reply((struct ib_smp *) pmp);
+}
+
+static int recv_pma_get_portcounters_ext(struct ib_pma_mad *pmp,
+                                        struct ib_device *ibdev, u8 port)
+{
+       struct ib_pma_portcounters_ext *p =
+               (struct ib_pma_portcounters_ext *)pmp->data;
+       struct ipath_ibdev *dev = to_idev(ibdev);
+       u64 swords, rwords, spkts, rpkts, xwait;
+       u8 port_select = p->port_select;
+
+       ipath_snapshot_counters(dev->dd, &swords, &rwords, &spkts,
+                               &rpkts, &xwait);
+
+       /* Adjust counters for any resets done. */
+       swords -= dev->z_port_xmit_data;
+       rwords -= dev->z_port_rcv_data;
+       spkts -= dev->z_port_xmit_packets;
+       rpkts -= dev->z_port_rcv_packets;
+
+       memset(pmp->data, 0, sizeof(pmp->data));
+
+       p->port_select = port_select;
+       if (pmp->mad_hdr.attr_mod != 0 ||
+           (port_select != port && port_select != 0xFF))
+               pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+
+       p->port_xmit_data = cpu_to_be64(swords);
+       p->port_rcv_data = cpu_to_be64(rwords);
+       p->port_xmit_packets = cpu_to_be64(spkts);
+       p->port_rcv_packets = cpu_to_be64(rpkts);
+       p->port_unicast_xmit_packets = cpu_to_be64(dev->n_unicast_xmit);
+       p->port_unicast_rcv_packets = cpu_to_be64(dev->n_unicast_rcv);
+       p->port_multicast_xmit_packets = cpu_to_be64(dev->n_multicast_xmit);
+       p->port_multicast_rcv_packets = cpu_to_be64(dev->n_multicast_rcv);
+
+       return reply((struct ib_smp *) pmp);
+}
+
+static int recv_pma_set_portcounters(struct ib_pma_mad *pmp,
+                                    struct ib_device *ibdev, u8 port)
+{
+       struct ib_pma_portcounters *p = (struct ib_pma_portcounters *)
+               pmp->data;
+       struct ipath_ibdev *dev = to_idev(ibdev);
+       struct ipath_verbs_counters cntrs;
+
+       /*
+        * Since the HW doesn't support clearing counters, we save the
+        * current count and subtract it from future responses.
+        */
+       ipath_get_counters(dev->dd, &cntrs);
+
+       if (p->counter_select & IB_PMA_SEL_SYMBOL_ERROR)
+               dev->z_symbol_error_counter = cntrs.symbol_error_counter;
+
+       if (p->counter_select & IB_PMA_SEL_LINK_ERROR_RECOVERY)
+               dev->z_link_error_recovery_counter =
+                       cntrs.link_error_recovery_counter;
+
+       if (p->counter_select & IB_PMA_SEL_LINK_DOWNED)
+               dev->z_link_downed_counter = cntrs.link_downed_counter;
+
+       if (p->counter_select & IB_PMA_SEL_PORT_RCV_ERRORS)
+               dev->z_port_rcv_errors =
+                       cntrs.port_rcv_errors + dev->rcv_errors;
+
+       if (p->counter_select & IB_PMA_SEL_PORT_RCV_REMPHYS_ERRORS)
+               dev->z_port_rcv_remphys_errors =
+                       cntrs.port_rcv_remphys_errors;
+
+       if (p->counter_select & IB_PMA_SEL_PORT_XMIT_DISCARDS)
+               dev->z_port_xmit_discards = cntrs.port_xmit_discards;
+
+       if (p->counter_select & IB_PMA_SEL_LOCAL_LINK_INTEGRITY_ERRORS)
+               dev->z_local_link_integrity_errors =
+                       cntrs.local_link_integrity_errors;
+
+       if (p->counter_select & IB_PMA_SEL_EXCESSIVE_BUFFER_OVERRUNS)
+               dev->z_excessive_buffer_overrun_errors =
+                       cntrs.excessive_buffer_overrun_errors;
+
+       if (p->counter_select & IB_PMA_SEL_PORT_VL15_DROPPED) {
+               dev->n_vl15_dropped = 0;
+               dev->z_vl15_dropped = cntrs.vl15_dropped;
+       }
+
+       if (p->counter_select & IB_PMA_SEL_PORT_XMIT_DATA)
+               dev->z_port_xmit_data = cntrs.port_xmit_data;
+
+       if (p->counter_select & IB_PMA_SEL_PORT_RCV_DATA)
+               dev->z_port_rcv_data = cntrs.port_rcv_data;
+
+       if (p->counter_select & IB_PMA_SEL_PORT_XMIT_PACKETS)
+               dev->z_port_xmit_packets = cntrs.port_xmit_packets;
+
+       if (p->counter_select & IB_PMA_SEL_PORT_RCV_PACKETS)
+               dev->z_port_rcv_packets = cntrs.port_rcv_packets;
+
+       return recv_pma_get_portcounters(pmp, ibdev, port);
+}
+
+static int recv_pma_set_portcounters_ext(struct ib_pma_mad *pmp,
+                                        struct ib_device *ibdev, u8 port)
+{
+       struct ib_pma_portcounters *p = (struct ib_pma_portcounters *)
+               pmp->data;
+       struct ipath_ibdev *dev = to_idev(ibdev);
+       u64 swords, rwords, spkts, rpkts, xwait;
+
+       ipath_snapshot_counters(dev->dd, &swords, &rwords, &spkts,
+                               &rpkts, &xwait);
+
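+       /*
+        * The data/packet counters can't be cleared in hardware, so save
+        * snapshots to subtract later; the unicast/multicast counts are
+        * software counters and can simply be zeroed.
+        */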
+       if (p->counter_select & IB_PMA_SELX_PORT_XMIT_DATA)
+               dev->z_port_xmit_data = swords;
+
+       if (p->counter_select & IB_PMA_SELX_PORT_RCV_DATA)
+               dev->z_port_rcv_data = rwords;
+
+       if (p->counter_select & IB_PMA_SELX_PORT_XMIT_PACKETS)
+               dev->z_port_xmit_packets = spkts;
+
+       if (p->counter_select & IB_PMA_SELX_PORT_RCV_PACKETS)
+               dev->z_port_rcv_packets = rpkts;
+
+       if (p->counter_select & IB_PMA_SELX_PORT_UNI_XMIT_PACKETS)
+               dev->n_unicast_xmit = 0;
+
+       if (p->counter_select & IB_PMA_SELX_PORT_UNI_RCV_PACKETS)
+               dev->n_unicast_rcv = 0;
+
+       if (p->counter_select & IB_PMA_SELX_PORT_MULTI_XMIT_PACKETS)
+               dev->n_multicast_xmit = 0;
+
+       if (p->counter_select & IB_PMA_SELX_PORT_MULTI_RCV_PACKETS)
+               dev->n_multicast_rcv = 0;
+
+       return recv_pma_get_portcounters_ext(pmp, ibdev, port);
+}
+
+static int process_subn(struct ib_device *ibdev, int mad_flags,
+                       u8 port_num, const struct ib_mad *in_mad,
+                       struct ib_mad *out_mad)
+{
+       struct ib_smp *smp = (struct ib_smp *)out_mad;
+       struct ipath_ibdev *dev = to_idev(ibdev);
+       int ret;
+
+       *out_mad = *in_mad;
+       if (smp->class_version != 1) {
+               smp->status |= IB_SMP_UNSUP_VERSION;
+               ret = reply(smp);
+               goto bail;
+       }
+
+       /* Is the mkey in the process of expiring? */
+       if (dev->mkey_lease_timeout &&
+           time_after_eq(jiffies, dev->mkey_lease_timeout)) {
+               /* Clear timeout and mkey protection field. */
+               dev->mkey_lease_timeout = 0;
+               dev->mkeyprot = 0;
+       }
+
+       /*
+        * M_Key checking depends on
+        * Portinfo:M_Key_protect_bits
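+        * (0 or 1: only Set is checked; 2 or 3: Get with a bad M_Key is
+        * rejected as well)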
+        */
+       if ((mad_flags & IB_MAD_IGNORE_MKEY) == 0 && dev->mkey != 0 &&
+           dev->mkey != smp->mkey &&
+           (smp->method == IB_MGMT_METHOD_SET ||
+            (smp->method == IB_MGMT_METHOD_GET &&
+             dev->mkeyprot >= 2))) {
+               if (dev->mkey_violations != 0xFFFF)
+                       ++dev->mkey_violations;
+               if (dev->mkey_lease_timeout ||
+                   dev->mkey_lease_period == 0) {
+                       ret = IB_MAD_RESULT_SUCCESS |
+                               IB_MAD_RESULT_CONSUMED;
+                       goto bail;
+               }
+               dev->mkey_lease_timeout = jiffies +
+                       dev->mkey_lease_period * HZ;
+               /* Future: Generate a trap notice. */
+               ret = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+               goto bail;
+       } else if (dev->mkey_lease_timeout)
+               dev->mkey_lease_timeout = 0;
+
+       switch (smp->method) {
+       case IB_MGMT_METHOD_GET:
+               switch (smp->attr_id) {
+               case IB_SMP_ATTR_NODE_DESC:
+                       ret = recv_subn_get_nodedescription(smp, ibdev);
+                       goto bail;
+               case IB_SMP_ATTR_NODE_INFO:
+                       ret = recv_subn_get_nodeinfo(smp, ibdev, port_num);
+                       goto bail;
+               case IB_SMP_ATTR_GUID_INFO:
+                       ret = recv_subn_get_guidinfo(smp, ibdev);
+                       goto bail;
+               case IB_SMP_ATTR_PORT_INFO:
+                       ret = recv_subn_get_portinfo(smp, ibdev, port_num);
+                       goto bail;
+               case IB_SMP_ATTR_PKEY_TABLE:
+                       ret = recv_subn_get_pkeytable(smp, ibdev);
+                       goto bail;
+               case IB_SMP_ATTR_SM_INFO:
+                       if (dev->port_cap_flags & IB_PORT_SM_DISABLED) {
+                               ret = IB_MAD_RESULT_SUCCESS |
+                                       IB_MAD_RESULT_CONSUMED;
+                               goto bail;
+                       }
+                       if (dev->port_cap_flags & IB_PORT_SM) {
+                               ret = IB_MAD_RESULT_SUCCESS;
+                               goto bail;
+                       }
+                       /* FALLTHROUGH */
+               default:
+                       smp->status |= IB_SMP_UNSUP_METH_ATTR;
+                       ret = reply(smp);
+                       goto bail;
+               }
+
+       case IB_MGMT_METHOD_SET:
+               switch (smp->attr_id) {
+               case IB_SMP_ATTR_GUID_INFO:
+                       ret = recv_subn_set_guidinfo(smp, ibdev);
+                       goto bail;
+               case IB_SMP_ATTR_PORT_INFO:
+                       ret = recv_subn_set_portinfo(smp, ibdev, port_num);
+                       goto bail;
+               case IB_SMP_ATTR_PKEY_TABLE:
+                       ret = recv_subn_set_pkeytable(smp, ibdev, port_num);
+                       goto bail;
+               case IB_SMP_ATTR_SM_INFO:
+                       if (dev->port_cap_flags & IB_PORT_SM_DISABLED) {
+                               ret = IB_MAD_RESULT_SUCCESS |
+                                       IB_MAD_RESULT_CONSUMED;
+                               goto bail;
+                       }
+                       if (dev->port_cap_flags & IB_PORT_SM) {
+                               ret = IB_MAD_RESULT_SUCCESS;
+                               goto bail;
+                       }
+                       /* FALLTHROUGH */
+               default:
+                       smp->status |= IB_SMP_UNSUP_METH_ATTR;
+                       ret = reply(smp);
+                       goto bail;
+               }
+
+       case IB_MGMT_METHOD_TRAP:
+       case IB_MGMT_METHOD_REPORT:
+       case IB_MGMT_METHOD_REPORT_RESP:
+       case IB_MGMT_METHOD_TRAP_REPRESS:
+       case IB_MGMT_METHOD_GET_RESP:
+               /*
+                * The ib_mad module will call us to process responses
+                * before checking for other consumers.
+                * Just tell the caller to process it normally.
+                */
+               ret = IB_MAD_RESULT_SUCCESS;
+               goto bail;
+       default:
+               smp->status |= IB_SMP_UNSUP_METHOD;
+               ret = reply(smp);
+       }
+
+bail:
+       return ret;
+}
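The M_Key block above implements a small lease policy: an expired lease clears the protection level; a mismatched key on a SET (or on a GET when mkeyprot >= 2) bumps the violation counter, arms a lease if none is running, and silently consumes the request. Below is a standalone sketch of that policy; the struct and field names are hypothetical, and a plain seconds counter stands in for jiffies/HZ.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical, simplified port state; the driver keeps this in ipath_ibdev. */
struct port_mkey {
	uint64_t mkey;            /* 0 means "no key configured"           */
	unsigned mkeyprot;        /* 0..3, how strictly GETs are guarded   */
	unsigned lease_period;    /* seconds; 0 means "lease never starts" */
	unsigned lease_expires;   /* absolute seconds; 0 means "not armed" */
	unsigned violations;      /* saturating 16-bit counter             */
};

/* Returns true if the request may proceed, false if it must be dropped. */
static bool mkey_check(struct port_mkey *p, uint64_t req_mkey, bool is_set,
		       unsigned now)
{
	/* Lease ran out: clear the timer and the protection level. */
	if (p->lease_expires && now >= p->lease_expires) {
		p->lease_expires = 0;
		p->mkeyprot = 0;
	}

	bool mismatch = p->mkey && p->mkey != req_mkey &&
			(is_set || p->mkeyprot >= 2);
	if (!mismatch) {
		p->lease_expires = 0;          /* a good key clears the lease */
		return true;
	}

	if (p->violations != 0xFFFF)
		p->violations++;
	if (!p->lease_expires && p->lease_period)
		p->lease_expires = now + p->lease_period;
	return false;                          /* consume the request silently */
}

int main(void)
{
	struct port_mkey p = { .mkey = 42, .mkeyprot = 2, .lease_period = 10 };
	printf("%d\n", mkey_check(&p, 7, false, 100));   /* 0: wrong key      */
	printf("%d\n", mkey_check(&p, 7, false, 111));   /* 1: lease expired  */
	return 0;
}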
+
+static int process_perf(struct ib_device *ibdev, u8 port_num,
+                       const struct ib_mad *in_mad,
+                       struct ib_mad *out_mad)
+{
+       struct ib_pma_mad *pmp = (struct ib_pma_mad *)out_mad;
+       int ret;
+
+       *out_mad = *in_mad;
+       if (pmp->mad_hdr.class_version != 1) {
+               pmp->mad_hdr.status |= IB_SMP_UNSUP_VERSION;
+               ret = reply((struct ib_smp *) pmp);
+               goto bail;
+       }
+
+       switch (pmp->mad_hdr.method) {
+       case IB_MGMT_METHOD_GET:
+               switch (pmp->mad_hdr.attr_id) {
+               case IB_PMA_CLASS_PORT_INFO:
+                       ret = recv_pma_get_classportinfo(pmp);
+                       goto bail;
+               case IB_PMA_PORT_SAMPLES_CONTROL:
+                       ret = recv_pma_get_portsamplescontrol(pmp, ibdev,
+                                                             port_num);
+                       goto bail;
+               case IB_PMA_PORT_SAMPLES_RESULT:
+                       ret = recv_pma_get_portsamplesresult(pmp, ibdev);
+                       goto bail;
+               case IB_PMA_PORT_SAMPLES_RESULT_EXT:
+                       ret = recv_pma_get_portsamplesresult_ext(pmp,
+                                                                ibdev);
+                       goto bail;
+               case IB_PMA_PORT_COUNTERS:
+                       ret = recv_pma_get_portcounters(pmp, ibdev,
+                                                       port_num);
+                       goto bail;
+               case IB_PMA_PORT_COUNTERS_EXT:
+                       ret = recv_pma_get_portcounters_ext(pmp, ibdev,
+                                                           port_num);
+                       goto bail;
+               default:
+                       pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
+                       ret = reply((struct ib_smp *) pmp);
+                       goto bail;
+               }
+
+       case IB_MGMT_METHOD_SET:
+               switch (pmp->mad_hdr.attr_id) {
+               case IB_PMA_PORT_SAMPLES_CONTROL:
+                       ret = recv_pma_set_portsamplescontrol(pmp, ibdev,
+                                                             port_num);
+                       goto bail;
+               case IB_PMA_PORT_COUNTERS:
+                       ret = recv_pma_set_portcounters(pmp, ibdev,
+                                                       port_num);
+                       goto bail;
+               case IB_PMA_PORT_COUNTERS_EXT:
+                       ret = recv_pma_set_portcounters_ext(pmp, ibdev,
+                                                           port_num);
+                       goto bail;
+               default:
+                       pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
+                       ret = reply((struct ib_smp *) pmp);
+                       goto bail;
+               }
+
+       case IB_MGMT_METHOD_GET_RESP:
+               /*
+                * The ib_mad module will call us to process responses
+                * before checking for other consumers.
+                * Just tell the caller to process it normally.
+                */
+               ret = IB_MAD_RESULT_SUCCESS;
+               goto bail;
+       default:
+               pmp->mad_hdr.status |= IB_SMP_UNSUP_METHOD;
+               ret = reply((struct ib_smp *) pmp);
+       }
+
+bail:
+       return ret;
+}
+
+/**
+ * ipath_process_mad - process an incoming MAD packet
+ * @ibdev: the infiniband device this packet came in on
+ * @mad_flags: MAD flags
+ * @port_num: the port number this packet came in on
+ * @in_wc: the work completion entry for this packet
+ * @in_grh: the global route header for this packet
+ * @in: the incoming MAD
+ * @in_mad_size: size of the incoming MAD
+ * @out: any outgoing MAD reply
+ * @out_mad_size: size of the outgoing MAD buffer
+ * @out_mad_pkey_index: unused by the InfiniPath driver
+ *
+ * Returns IB_MAD_RESULT_SUCCESS if this is a MAD that we are not
+ * interested in processing.
+ *
+ * Note that the verbs framework has already done the MAD sanity checks,
+ * and hop count/pointer updating for IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE
+ * MADs.
+ *
+ * This is called by the ib_mad module.
+ */
+int ipath_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+                     const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+                     const struct ib_mad_hdr *in, size_t in_mad_size,
+                     struct ib_mad_hdr *out, size_t *out_mad_size,
+                     u16 *out_mad_pkey_index)
+{
+       int ret;
+       const struct ib_mad *in_mad = (const struct ib_mad *)in;
+       struct ib_mad *out_mad = (struct ib_mad *)out;
+
+       if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) ||
+                        *out_mad_size != sizeof(*out_mad)))
+               return IB_MAD_RESULT_FAILURE;
+
+       switch (in_mad->mad_hdr.mgmt_class) {
+       case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
+       case IB_MGMT_CLASS_SUBN_LID_ROUTED:
+               ret = process_subn(ibdev, mad_flags, port_num,
+                                  in_mad, out_mad);
+               goto bail;
+       case IB_MGMT_CLASS_PERF_MGMT:
+               ret = process_perf(ibdev, port_num, in_mad, out_mad);
+               goto bail;
+       default:
+               ret = IB_MAD_RESULT_SUCCESS;
+       }
+
+bail:
+       return ret;
+}
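The dispatcher above handles only the subnet-management and performance-management classes itself; anything else is returned with IB_MAD_RESULT_SUCCESS and no reply, so the ib_mad core keeps looking for another consumer. A minimal userspace sketch of that shape follows; the MAD_RESULT_* constants, class values, and handler names are simplified stand-ins, not the kernel's definitions.

#include <stdio.h>

/* Hypothetical stand-ins for the ib_mad result bits and class IDs. */
#define MAD_RESULT_SUCCESS  0x1
#define MAD_RESULT_REPLY    0x2   /* "a reply was generated, send it" */

enum mgmt_class {
	CLS_SUBN_LID = 0x01,
	CLS_PERF     = 0x04,
	CLS_SUBN_DR  = 0x81,
	CLS_OTHER    = 0x32,
};

static int process_subn_sketch(void) { return MAD_RESULT_SUCCESS | MAD_RESULT_REPLY; }
static int process_perf_sketch(void) { return MAD_RESULT_SUCCESS | MAD_RESULT_REPLY; }

/* Dispatch on management class; unknown classes pass through untouched. */
static int process_mad_sketch(enum mgmt_class cls)
{
	switch (cls) {
	case CLS_SUBN_LID:
	case CLS_SUBN_DR:
		return process_subn_sketch();
	case CLS_PERF:
		return process_perf_sketch();
	default:
		return MAD_RESULT_SUCCESS;  /* not ours: let the core continue */
	}
}

int main(void)
{
	printf("perf  -> %#x\n", process_mad_sketch(CLS_PERF));
	printf("other -> %#x\n", process_mad_sketch(CLS_OTHER));
	return 0;
}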
diff --git a/drivers/staging/rdma/ipath/ipath_mmap.c b/drivers/staging/rdma/ipath/ipath_mmap.c
new file mode 100644 (file)
index 0000000..e732742
--- /dev/null
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <asm/pgtable.h>
+
+#include "ipath_verbs.h"
+
+/**
+ * ipath_release_mmap_info - free mmap info structure
+ * @ref: a pointer to the kref within struct ipath_mmap_info
+ */
+void ipath_release_mmap_info(struct kref *ref)
+{
+       struct ipath_mmap_info *ip =
+               container_of(ref, struct ipath_mmap_info, ref);
+       struct ipath_ibdev *dev = to_idev(ip->context->device);
+
+       spin_lock_irq(&dev->pending_lock);
+       list_del(&ip->pending_mmaps);
+       spin_unlock_irq(&dev->pending_lock);
+
+       vfree(ip->obj);
+       kfree(ip);
+}
+
+/*
+ * ipath_vma_open() and ipath_vma_close() keep track of how many times
+ * the object (CQ, QP, or SRQ) is mapped, to avoid releasing it prematurely.
+ */
+static void ipath_vma_open(struct vm_area_struct *vma)
+{
+       struct ipath_mmap_info *ip = vma->vm_private_data;
+
+       kref_get(&ip->ref);
+}
+
+static void ipath_vma_close(struct vm_area_struct *vma)
+{
+       struct ipath_mmap_info *ip = vma->vm_private_data;
+
+       kref_put(&ip->ref, ipath_release_mmap_info);
+}
+
+static const struct vm_operations_struct ipath_vm_ops = {
+       .open =     ipath_vma_open,
+       .close =    ipath_vma_close,
+};
+
+/**
+ * ipath_mmap - create a new mmap region
+ * @context: the IB user context of the process making the mmap() call
+ * @vma: the VMA to be initialized
+ * Return zero if the mmap is OK. Otherwise, return an errno.
+ */
+int ipath_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
+{
+       struct ipath_ibdev *dev = to_idev(context->device);
+       unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
+       unsigned long size = vma->vm_end - vma->vm_start;
+       struct ipath_mmap_info *ip, *pp;
+       int ret = -EINVAL;
+
+       /*
+        * Search the device's list of objects waiting for a mmap call.
+        * Normally, this list is very short since a call to create a
+        * CQ, QP, or SRQ is soon followed by a call to mmap().
+        */
+       spin_lock_irq(&dev->pending_lock);
+       list_for_each_entry_safe(ip, pp, &dev->pending_mmaps,
+                                pending_mmaps) {
+               /* Only the creator is allowed to mmap the object */
+               if (context != ip->context || (__u64) offset != ip->offset)
+                       continue;
+               /* Don't allow a mmap larger than the object. */
+               if (size > ip->size)
+                       break;
+
+               list_del_init(&ip->pending_mmaps);
+               spin_unlock_irq(&dev->pending_lock);
+
+               ret = remap_vmalloc_range(vma, ip->obj, 0);
+               if (ret)
+                       goto done;
+               vma->vm_ops = &ipath_vm_ops;
+               vma->vm_private_data = ip;
+               ipath_vma_open(vma);
+               goto done;
+       }
+       spin_unlock_irq(&dev->pending_lock);
+done:
+       return ret;
+}
+
+/*
+ * Allocate information for ipath_mmap
+ */
+struct ipath_mmap_info *ipath_create_mmap_info(struct ipath_ibdev *dev,
+                                              u32 size,
+                                              struct ib_ucontext *context,
+                                              void *obj)
+{
+       struct ipath_mmap_info *ip;
+
+       ip = kmalloc(sizeof *ip, GFP_KERNEL);
+       if (!ip)
+               goto bail;
+
+       size = PAGE_ALIGN(size);
+
+       spin_lock_irq(&dev->mmap_offset_lock);
+       if (dev->mmap_offset == 0)
+               dev->mmap_offset = PAGE_SIZE;
+       ip->offset = dev->mmap_offset;
+       dev->mmap_offset += size;
+       spin_unlock_irq(&dev->mmap_offset_lock);
+
+       INIT_LIST_HEAD(&ip->pending_mmaps);
+       ip->size = size;
+       ip->context = context;
+       ip->obj = obj;
+       kref_init(&ip->ref);
+
+bail:
+       return ip;
+}
+
+void ipath_update_mmap_info(struct ipath_ibdev *dev,
+                           struct ipath_mmap_info *ip,
+                           u32 size, void *obj)
+{
+       size = PAGE_ALIGN(size);
+
+       spin_lock_irq(&dev->mmap_offset_lock);
+       if (dev->mmap_offset == 0)
+               dev->mmap_offset = PAGE_SIZE;
+       ip->offset = dev->mmap_offset;
+       dev->mmap_offset += size;
+       spin_unlock_irq(&dev->mmap_offset_lock);
+
+       ip->size = size;
+       ip->obj = obj;
+}
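Taken together, ipath_create_mmap_info()/ipath_update_mmap_info() hand each vmalloc'ed object a unique, page-aligned offset (starting at PAGE_SIZE so 0 is never used), and ipath_mmap() later matches the offset passed by user space against the pending list, rejecting maps larger than the object. A much-simplified userspace sketch of that cookie scheme, without the locking or the per-context ownership check; all names here are illustrative.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SZ 4096UL   /* stand-in for PAGE_SIZE */

/* Hypothetical, simplified version of the pending-mmap bookkeeping. */
struct mmap_cookie {
	struct mmap_cookie *next;
	unsigned long offset;   /* value user space passes to mmap()   */
	unsigned long size;     /* page-aligned object size             */
	void *obj;              /* the vmalloc'ed object in the driver  */
};

static unsigned long next_offset = PAGE_SZ;   /* 0 is never handed out */
static struct mmap_cookie *pending;

static struct mmap_cookie *publish(void *obj, unsigned long size)
{
	struct mmap_cookie *c = calloc(1, sizeof(*c));
	if (!c)
		return NULL;
	c->size = (size + PAGE_SZ - 1) & ~(PAGE_SZ - 1);
	c->offset = next_offset;
	next_offset += c->size;
	c->obj = obj;
	c->next = pending;
	pending = c;
	return c;
}

/* What ipath_mmap() does: find the object by offset, reject oversize maps. */
static void *lookup(unsigned long offset, unsigned long size)
{
	for (struct mmap_cookie *c = pending; c; c = c->next)
		if (c->offset == offset)
			return size <= c->size ? c->obj : NULL;
	return NULL;
}

int main(void)
{
	char buf[100];
	struct mmap_cookie *c = publish(buf, sizeof(buf));
	printf("offset=%lu found=%d\n", c->offset,
	       lookup(c->offset, PAGE_SZ) == buf);
	return 0;
}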
diff --git a/drivers/staging/rdma/ipath/ipath_mr.c b/drivers/staging/rdma/ipath/ipath_mr.c
new file mode 100644 (file)
index 0000000..c7278f6
--- /dev/null
@@ -0,0 +1,425 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/slab.h>
+
+#include <rdma/ib_umem.h>
+#include <rdma/ib_pack.h>
+#include <rdma/ib_smi.h>
+
+#include "ipath_verbs.h"
+
+/* Fast memory region */
+struct ipath_fmr {
+       struct ib_fmr ibfmr;
+       u8 page_shift;
+       struct ipath_mregion mr;        /* must be last */
+};
+
+static inline struct ipath_fmr *to_ifmr(struct ib_fmr *ibfmr)
+{
+       return container_of(ibfmr, struct ipath_fmr, ibfmr);
+}
+
+/**
+ * ipath_get_dma_mr - get a DMA memory region
+ * @pd: protection domain for this memory region
+ * @acc: access flags
+ *
+ * Returns the memory region on success, otherwise returns an errno.
+ * Note that all DMA addresses should be created via the
+ * struct ib_dma_mapping_ops functions (see ipath_dma.c).
+ */
+struct ib_mr *ipath_get_dma_mr(struct ib_pd *pd, int acc)
+{
+       struct ipath_mr *mr;
+       struct ib_mr *ret;
+
+       mr = kzalloc(sizeof *mr, GFP_KERNEL);
+       if (!mr) {
+               ret = ERR_PTR(-ENOMEM);
+               goto bail;
+       }
+
+       mr->mr.access_flags = acc;
+       ret = &mr->ibmr;
+
+bail:
+       return ret;
+}
+
+static struct ipath_mr *alloc_mr(int count,
+                                struct ipath_lkey_table *lk_table)
+{
+       struct ipath_mr *mr;
+       int m, i = 0;
+
+       /* Allocate struct plus pointers to first level page tables. */
+       m = (count + IPATH_SEGSZ - 1) / IPATH_SEGSZ;
+       mr = kmalloc(sizeof *mr + m * sizeof mr->mr.map[0], GFP_KERNEL);
+       if (!mr)
+               goto done;
+
+       /* Allocate first level page tables. */
+       for (; i < m; i++) {
+               mr->mr.map[i] = kmalloc(sizeof *mr->mr.map[0], GFP_KERNEL);
+               if (!mr->mr.map[i])
+                       goto bail;
+       }
+       mr->mr.mapsz = m;
+
+       /*
+        * ib_reg_phys_mr() will initialize mr->ibmr except for
+        * lkey and rkey.
+        */
+       if (!ipath_alloc_lkey(lk_table, &mr->mr))
+               goto bail;
+       mr->ibmr.rkey = mr->ibmr.lkey = mr->mr.lkey;
+
+       goto done;
+
+bail:
+       while (i) {
+               i--;
+               kfree(mr->mr.map[i]);
+       }
+       kfree(mr);
+       mr = NULL;
+
+done:
+       return mr;
+}
+
+/**
+ * ipath_reg_phys_mr - register a physical memory region
+ * @pd: protection domain for this memory region
+ * @buffer_list: pointer to the list of physical buffers to register
+ * @num_phys_buf: the number of physical buffers to register
+ * @acc: access flags for this memory region
+ * @iova_start: the starting address passed over IB which maps to this MR
+ *
+ * Returns the memory region on success, otherwise returns an errno.
+ */
+struct ib_mr *ipath_reg_phys_mr(struct ib_pd *pd,
+                               struct ib_phys_buf *buffer_list,
+                               int num_phys_buf, int acc, u64 *iova_start)
+{
+       struct ipath_mr *mr;
+       int n, m, i;
+       struct ib_mr *ret;
+
+       mr = alloc_mr(num_phys_buf, &to_idev(pd->device)->lk_table);
+       if (mr == NULL) {
+               ret = ERR_PTR(-ENOMEM);
+               goto bail;
+       }
+
+       mr->mr.pd = pd;
+       mr->mr.user_base = *iova_start;
+       mr->mr.iova = *iova_start;
+       mr->mr.length = 0;
+       mr->mr.offset = 0;
+       mr->mr.access_flags = acc;
+       mr->mr.max_segs = num_phys_buf;
+       mr->umem = NULL;
+
+       m = 0;
+       n = 0;
+       for (i = 0; i < num_phys_buf; i++) {
+               mr->mr.map[m]->segs[n].vaddr = (void *) buffer_list[i].addr;
+               mr->mr.map[m]->segs[n].length = buffer_list[i].size;
+               mr->mr.length += buffer_list[i].size;
+               n++;
+               if (n == IPATH_SEGSZ) {
+                       m++;
+                       n = 0;
+               }
+       }
+
+       ret = &mr->ibmr;
+
+bail:
+       return ret;
+}
+
+/**
+ * ipath_reg_user_mr - register a userspace memory region
+ * @pd: protection domain for this memory region
+ * @start: starting userspace address
+ * @length: length of region to register
+ * @virt_addr: virtual address to use (from HCA's point of view)
+ * @mr_access_flags: access flags for this memory region
+ * @udata: unused by the InfiniPath driver
+ *
+ * Returns the memory region on success, otherwise returns an errno.
+ */
+struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+                               u64 virt_addr, int mr_access_flags,
+                               struct ib_udata *udata)
+{
+       struct ipath_mr *mr;
+       struct ib_umem *umem;
+       int n, m, entry;
+       struct scatterlist *sg;
+       struct ib_mr *ret;
+
+       if (length == 0) {
+               ret = ERR_PTR(-EINVAL);
+               goto bail;
+       }
+
+       umem = ib_umem_get(pd->uobject->context, start, length,
+                          mr_access_flags, 0);
+       if (IS_ERR(umem))
+               return ERR_CAST(umem);
+
+       n = umem->nmap;
+       mr = alloc_mr(n, &to_idev(pd->device)->lk_table);
+       if (!mr) {
+               ret = ERR_PTR(-ENOMEM);
+               ib_umem_release(umem);
+               goto bail;
+       }
+
+       mr->mr.pd = pd;
+       mr->mr.user_base = start;
+       mr->mr.iova = virt_addr;
+       mr->mr.length = length;
+       mr->mr.offset = ib_umem_offset(umem);
+       mr->mr.access_flags = mr_access_flags;
+       mr->mr.max_segs = n;
+       mr->umem = umem;
+
+       m = 0;
+       n = 0;
+       for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
+               void *vaddr;
+
+               vaddr = page_address(sg_page(sg));
+               if (!vaddr) {
+                       ret = ERR_PTR(-EINVAL);
+                       goto bail;
+               }
+               mr->mr.map[m]->segs[n].vaddr = vaddr;
+               mr->mr.map[m]->segs[n].length = umem->page_size;
+               n++;
+               if (n == IPATH_SEGSZ) {
+                       m++;
+                       n = 0;
+               }
+       }
+       ret = &mr->ibmr;
+
+bail:
+       return ret;
+}
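Both registration paths above fill the same two-level segment table: n walks the segs[] array within one map chunk, and m advances to the next chunk every IPATH_SEGSZ entries. A tiny standalone sketch of that fill loop; SEGSZ and the struct names are hypothetical stand-ins for the driver's types.

#include <stdio.h>

#define SEGSZ 8   /* hypothetical stand-in for IPATH_SEGSZ */

struct seg     { void *vaddr; unsigned long length; };
struct seg_map { struct seg segs[SEGSZ]; };

/* Fill a two-level segment table: n indexes within a chunk, m picks the chunk. */
static void fill_segments(struct seg_map **map, void **pages,
			  const unsigned long *lens, int count)
{
	int m = 0, n = 0;

	for (int i = 0; i < count; i++) {
		map[m]->segs[n].vaddr = pages[i];
		map[m]->segs[n].length = lens[i];
		if (++n == SEGSZ) {   /* chunk full: move to the next one */
			m++;
			n = 0;
		}
	}
}

int main(void)
{
	static char page_a[64], page_b[64];
	static struct seg_map chunk;
	void *pages[] = { page_a, page_b };
	unsigned long lens[] = { 64, 64 };
	struct seg_map *map[] = { &chunk };

	fill_segments(map, pages, lens, 2);
	printf("seg0 len=%lu\n", chunk.segs[0].length);
	return 0;
}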
+
+/**
+ * ipath_dereg_mr - unregister and free a memory region
+ * @ibmr: the memory region to free
+ *
+ * Returns 0 on success.
+ *
+ * Note that this is called to free MRs created by ipath_get_dma_mr()
+ * or ipath_reg_user_mr().
+ */
+int ipath_dereg_mr(struct ib_mr *ibmr)
+{
+       struct ipath_mr *mr = to_imr(ibmr);
+       int i;
+
+       ipath_free_lkey(&to_idev(ibmr->device)->lk_table, ibmr->lkey);
+       i = mr->mr.mapsz;
+       while (i) {
+               i--;
+               kfree(mr->mr.map[i]);
+       }
+
+       if (mr->umem)
+               ib_umem_release(mr->umem);
+
+       kfree(mr);
+       return 0;
+}
+
+/**
+ * ipath_alloc_fmr - allocate a fast memory region
+ * @pd: the protection domain for this memory region
+ * @mr_access_flags: access flags for this memory region
+ * @fmr_attr: fast memory region attributes
+ *
+ * Returns the memory region on success, otherwise returns an errno.
+ */
+struct ib_fmr *ipath_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
+                              struct ib_fmr_attr *fmr_attr)
+{
+       struct ipath_fmr *fmr;
+       int m, i = 0;
+       struct ib_fmr *ret;
+
+       /* Allocate struct plus pointers to first level page tables. */
+       m = (fmr_attr->max_pages + IPATH_SEGSZ - 1) / IPATH_SEGSZ;
+       fmr = kmalloc(sizeof *fmr + m * sizeof fmr->mr.map[0], GFP_KERNEL);
+       if (!fmr)
+               goto bail;
+
+       /* Allocate first level page tables. */
+       for (; i < m; i++) {
+               fmr->mr.map[i] = kmalloc(sizeof *fmr->mr.map[0],
+                                        GFP_KERNEL);
+               if (!fmr->mr.map[i])
+                       goto bail;
+       }
+       fmr->mr.mapsz = m;
+
+       /*
+        * ib_alloc_fmr() will initialize fmr->ibfmr except for lkey &
+        * rkey.
+        */
+       if (!ipath_alloc_lkey(&to_idev(pd->device)->lk_table, &fmr->mr))
+               goto bail;
+       fmr->ibfmr.rkey = fmr->ibfmr.lkey = fmr->mr.lkey;
+       /*
+        * Resources are allocated but no valid mapping (RKEY can't be
+        * used).
+        */
+       fmr->mr.pd = pd;
+       fmr->mr.user_base = 0;
+       fmr->mr.iova = 0;
+       fmr->mr.length = 0;
+       fmr->mr.offset = 0;
+       fmr->mr.access_flags = mr_access_flags;
+       fmr->mr.max_segs = fmr_attr->max_pages;
+       fmr->page_shift = fmr_attr->page_shift;
+
+       ret = &fmr->ibfmr;
+       goto done;
+
+bail:
+       while (i)
+               kfree(fmr->mr.map[--i]);
+       kfree(fmr);
+       ret = ERR_PTR(-ENOMEM);
+
+done:
+       return ret;
+}
+
+/**
+ * ipath_map_phys_fmr - set up a fast memory region
+ * @ibfmr: the fast memory region to set up
+ * @page_list: the list of pages to associate with the fast memory region
+ * @list_len: the number of pages to associate with the fast memory region
+ * @iova: the virtual address of the start of the fast memory region
+ *
+ * This may be called from interrupt context.
+ */
+int ipath_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
+                      int list_len, u64 iova)
+{
+       struct ipath_fmr *fmr = to_ifmr(ibfmr);
+       struct ipath_lkey_table *rkt;
+       unsigned long flags;
+       int m, n, i;
+       u32 ps;
+       int ret;
+
+       if (list_len > fmr->mr.max_segs) {
+               ret = -EINVAL;
+               goto bail;
+       }
+       rkt = &to_idev(ibfmr->device)->lk_table;
+       spin_lock_irqsave(&rkt->lock, flags);
+       fmr->mr.user_base = iova;
+       fmr->mr.iova = iova;
+       ps = 1 << fmr->page_shift;
+       fmr->mr.length = list_len * ps;
+       m = 0;
+       n = 0;
+       for (i = 0; i < list_len; i++) {
+               fmr->mr.map[m]->segs[n].vaddr = (void *) page_list[i];
+               fmr->mr.map[m]->segs[n].length = ps;
+               if (++n == IPATH_SEGSZ) {
+                       m++;
+                       n = 0;
+               }
+       }
+       spin_unlock_irqrestore(&rkt->lock, flags);
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+/**
+ * ipath_unmap_fmr - unmap fast memory regions
+ * @fmr_list: the list of fast memory regions to unmap
+ *
+ * Returns 0 on success.
+ */
+int ipath_unmap_fmr(struct list_head *fmr_list)
+{
+       struct ipath_fmr *fmr;
+       struct ipath_lkey_table *rkt;
+       unsigned long flags;
+
+       list_for_each_entry(fmr, fmr_list, ibfmr.list) {
+               rkt = &to_idev(fmr->ibfmr.device)->lk_table;
+               spin_lock_irqsave(&rkt->lock, flags);
+               fmr->mr.user_base = 0;
+               fmr->mr.iova = 0;
+               fmr->mr.length = 0;
+               spin_unlock_irqrestore(&rkt->lock, flags);
+       }
+       return 0;
+}
+
+/**
+ * ipath_dealloc_fmr - deallocate a fast memory region
+ * @ibfmr: the fast memory region to deallocate
+ *
+ * Returns 0 on success.
+ */
+int ipath_dealloc_fmr(struct ib_fmr *ibfmr)
+{
+       struct ipath_fmr *fmr = to_ifmr(ibfmr);
+       int i;
+
+       ipath_free_lkey(&to_idev(ibfmr->device)->lk_table, ibfmr->lkey);
+       i = fmr->mr.mapsz;
+       while (i)
+               kfree(fmr->mr.map[--i]);
+       kfree(fmr);
+       return 0;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_qp.c b/drivers/staging/rdma/ipath/ipath_qp.c
new file mode 100644 (file)
index 0000000..face876
--- /dev/null
@@ -0,0 +1,1080 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include "ipath_verbs.h"
+#include "ipath_kernel.h"
+
+#define BITS_PER_PAGE          (PAGE_SIZE*BITS_PER_BYTE)
+#define BITS_PER_PAGE_MASK     (BITS_PER_PAGE-1)
+#define mk_qpn(qpt, map, off)  (((map) - (qpt)->map) * BITS_PER_PAGE + \
+                                (off))
+#define find_next_offset(map, off) find_next_zero_bit((map)->page, \
+                                                     BITS_PER_PAGE, off)
+
+/*
+ * Convert the AETH credit code into the number of credits.
+ */
+static u32 credit_table[31] = {
+       0,                      /* 0 */
+       1,                      /* 1 */
+       2,                      /* 2 */
+       3,                      /* 3 */
+       4,                      /* 4 */
+       6,                      /* 5 */
+       8,                      /* 6 */
+       12,                     /* 7 */
+       16,                     /* 8 */
+       24,                     /* 9 */
+       32,                     /* A */
+       48,                     /* B */
+       64,                     /* C */
+       96,                     /* D */
+       128,                    /* E */
+       192,                    /* F */
+       256,                    /* 10 */
+       384,                    /* 11 */
+       512,                    /* 12 */
+       768,                    /* 13 */
+       1024,                   /* 14 */
+       1536,                   /* 15 */
+       2048,                   /* 16 */
+       3072,                   /* 17 */
+       4096,                   /* 18 */
+       6144,                   /* 19 */
+       8192,                   /* 1A */
+       12288,                  /* 1B */
+       16384,                  /* 1C */
+       24576,                  /* 1D */
+       32768                   /* 1E */
+};
+
+static void get_map_page(struct ipath_qp_table *qpt, struct qpn_map *map)
+{
+       unsigned long page = get_zeroed_page(GFP_KERNEL);
+       unsigned long flags;
+
+       /*
+        * Free the page if someone raced with us installing it.
+        */
+
+       spin_lock_irqsave(&qpt->lock, flags);
+       if (map->page)
+               free_page(page);
+       else
+               map->page = (void *)page;
+       spin_unlock_irqrestore(&qpt->lock, flags);
+}
+
+static int alloc_qpn(struct ipath_qp_table *qpt, enum ib_qp_type type)
+{
+       u32 i, offset, max_scan, qpn;
+       struct qpn_map *map;
+       u32 ret = -1;
+
+       if (type == IB_QPT_SMI)
+               ret = 0;
+       else if (type == IB_QPT_GSI)
+               ret = 1;
+
+       if (ret != -1) {
+               map = &qpt->map[0];
+               if (unlikely(!map->page)) {
+                       get_map_page(qpt, map);
+                       if (unlikely(!map->page)) {
+                               ret = -ENOMEM;
+                               goto bail;
+                       }
+               }
+               if (!test_and_set_bit(ret, map->page))
+                       atomic_dec(&map->n_free);
+               else
+                       ret = -EBUSY;
+               goto bail;
+       }
+
+       qpn = qpt->last + 1;
+       if (qpn >= QPN_MAX)
+               qpn = 2;
+       offset = qpn & BITS_PER_PAGE_MASK;
+       map = &qpt->map[qpn / BITS_PER_PAGE];
+       max_scan = qpt->nmaps - !offset;
+       for (i = 0;;) {
+               if (unlikely(!map->page)) {
+                       get_map_page(qpt, map);
+                       if (unlikely(!map->page))
+                               break;
+               }
+               if (likely(atomic_read(&map->n_free))) {
+                       do {
+                               if (!test_and_set_bit(offset, map->page)) {
+                                       atomic_dec(&map->n_free);
+                                       qpt->last = qpn;
+                                       ret = qpn;
+                                       goto bail;
+                               }
+                               offset = find_next_offset(map, offset);
+                               qpn = mk_qpn(qpt, map, offset);
+                               /*
+                                * This test differs from alloc_pidmap().
+                                * If find_next_offset() does find a zero
+                                * bit, we don't need to check for QPN
+                                * wrapping around past our starting QPN.
+                                * We just need to be sure we don't loop
+                                * forever.
+                                */
+                       } while (offset < BITS_PER_PAGE && qpn < QPN_MAX);
+               }
+               /*
+                * In order to keep the number of pages allocated to a
+                * minimum, we scan all the existing pages before increasing
+                * the size of the bitmap table.
+                */
+               if (++i > max_scan) {
+                       if (qpt->nmaps == QPNMAP_ENTRIES)
+                               break;
+                       map = &qpt->map[qpt->nmaps++];
+                       offset = 0;
+               } else if (map < &qpt->map[qpt->nmaps]) {
+                       ++map;
+                       offset = 0;
+               } else {
+                       map = &qpt->map[0];
+                       offset = 2;
+               }
+               qpn = mk_qpn(qpt, map, offset);
+       }
+
+       ret = -ENOMEM;
+
+bail:
+       return ret;
+}
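alloc_qpn() reserves QPN 0 and 1 for the SMI/GSI special QPs and otherwise scans a bitmap starting just after the last number handed out, wrapping back to 2 rather than 0. A simplified single-table sketch of that allocation policy (no atomics, no on-demand page growth; QPN_LIMIT and the function name are illustrative):

#include <stdbool.h>
#include <stdio.h>

#define QPN_LIMIT 1024   /* hypothetical, much smaller than the driver's QPN_MAX */

static bool in_use[QPN_LIMIT];
static unsigned last = 1;

/* 0/1 are reserved for the SMI/GSI special QPs; others round-robin from last+1. */
static int alloc_qpn_sketch(bool smi, bool gsi)
{
	if (smi || gsi) {
		unsigned qpn = smi ? 0 : 1;
		if (in_use[qpn])
			return -1;
		in_use[qpn] = true;
		return (int)qpn;
	}

	for (unsigned tries = 0; tries < QPN_LIMIT - 2; tries++) {
		/* Start at last+1 and wrap within [2, QPN_LIMIT-1]. */
		unsigned qpn = 2 + (last - 1 + tries) % (QPN_LIMIT - 2);

		if (!in_use[qpn]) {
			in_use[qpn] = true;
			last = qpn;
			return (int)qpn;
		}
	}
	return -1;   /* table full */
}

int main(void)
{
	printf("%d %d %d\n", alloc_qpn_sketch(true, false),
	       alloc_qpn_sketch(false, true), alloc_qpn_sketch(false, false));
	return 0;
}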
+
+static void free_qpn(struct ipath_qp_table *qpt, u32 qpn)
+{
+       struct qpn_map *map;
+
+       map = qpt->map + qpn / BITS_PER_PAGE;
+       if (map->page)
+               clear_bit(qpn & BITS_PER_PAGE_MASK, map->page);
+       atomic_inc(&map->n_free);
+}
+
+/**
+ * ipath_alloc_qpn - allocate a QP number
+ * @qpt: the QP table
+ * @qp: the QP
+ * @type: the QP type (IB_QPT_SMI and IB_QPT_GSI are special)
+ *
+ * Allocate the next available QPN and put the QP into the hash table.
+ * The hash table holds a reference to the QP.
+ */
+static int ipath_alloc_qpn(struct ipath_qp_table *qpt, struct ipath_qp *qp,
+                          enum ib_qp_type type)
+{
+       unsigned long flags;
+       int ret;
+
+       ret = alloc_qpn(qpt, type);
+       if (ret < 0)
+               goto bail;
+       qp->ibqp.qp_num = ret;
+
+       /* Add the QP to the hash table. */
+       spin_lock_irqsave(&qpt->lock, flags);
+
+       ret %= qpt->max;
+       qp->next = qpt->table[ret];
+       qpt->table[ret] = qp;
+       atomic_inc(&qp->refcount);
+
+       spin_unlock_irqrestore(&qpt->lock, flags);
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+/**
+ * ipath_free_qp - remove a QP from the QP table
+ * @qpt: the QP table
+ * @qp: the QP to remove
+ *
+ * Remove the QP from the table so it can't be found asynchronously by
+ * the receive interrupt routine.
+ */
+static void ipath_free_qp(struct ipath_qp_table *qpt, struct ipath_qp *qp)
+{
+       struct ipath_qp *q, **qpp;
+       unsigned long flags;
+
+       spin_lock_irqsave(&qpt->lock, flags);
+
+       /* Remove QP from the hash table. */
+       qpp = &qpt->table[qp->ibqp.qp_num % qpt->max];
+       for (; (q = *qpp) != NULL; qpp = &q->next) {
+               if (q == qp) {
+                       *qpp = qp->next;
+                       qp->next = NULL;
+                       atomic_dec(&qp->refcount);
+                       break;
+               }
+       }
+
+       spin_unlock_irqrestore(&qpt->lock, flags);
+}
+
+/**
+ * ipath_free_all_qps - check for QPs still in use
+ * @qpt: the QP table to empty
+ *
+ * There should not be any QPs still in use.
+ * Free memory for table.
+ */
+unsigned ipath_free_all_qps(struct ipath_qp_table *qpt)
+{
+       unsigned long flags;
+       struct ipath_qp *qp;
+       u32 n, qp_inuse = 0;
+
+       spin_lock_irqsave(&qpt->lock, flags);
+       for (n = 0; n < qpt->max; n++) {
+               qp = qpt->table[n];
+               qpt->table[n] = NULL;
+
+               for (; qp; qp = qp->next)
+                       qp_inuse++;
+       }
+       spin_unlock_irqrestore(&qpt->lock, flags);
+
+       for (n = 0; n < ARRAY_SIZE(qpt->map); n++)
+               if (qpt->map[n].page)
+                       free_page((unsigned long) qpt->map[n].page);
+       return qp_inuse;
+}
+
+/**
+ * ipath_lookup_qpn - return the QP with the given QPN
+ * @qpt: the QP table
+ * @qpn: the QP number to look up
+ *
+ * The caller is responsible for decrementing the QP reference count
+ * when done.
+ */
+struct ipath_qp *ipath_lookup_qpn(struct ipath_qp_table *qpt, u32 qpn)
+{
+       unsigned long flags;
+       struct ipath_qp *qp;
+
+       spin_lock_irqsave(&qpt->lock, flags);
+
+       for (qp = qpt->table[qpn % qpt->max]; qp; qp = qp->next) {
+               if (qp->ibqp.qp_num == qpn) {
+                       atomic_inc(&qp->refcount);
+                       break;
+               }
+       }
+
+       spin_unlock_irqrestore(&qpt->lock, flags);
+       return qp;
+}
+
+/**
+ * ipath_reset_qp - initialize the QP state to the reset state
+ * @qp: the QP to reset
+ * @type: the QP type
+ */
+static void ipath_reset_qp(struct ipath_qp *qp, enum ib_qp_type type)
+{
+       qp->remote_qpn = 0;
+       qp->qkey = 0;
+       qp->qp_access_flags = 0;
+       atomic_set(&qp->s_dma_busy, 0);
+       qp->s_flags &= IPATH_S_SIGNAL_REQ_WR;
+       qp->s_hdrwords = 0;
+       qp->s_wqe = NULL;
+       qp->s_pkt_delay = 0;
+       qp->s_draining = 0;
+       qp->s_psn = 0;
+       qp->r_psn = 0;
+       qp->r_msn = 0;
+       if (type == IB_QPT_RC) {
+               qp->s_state = IB_OPCODE_RC_SEND_LAST;
+               qp->r_state = IB_OPCODE_RC_SEND_LAST;
+       } else {
+               qp->s_state = IB_OPCODE_UC_SEND_LAST;
+               qp->r_state = IB_OPCODE_UC_SEND_LAST;
+       }
+       qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE;
+       qp->r_nak_state = 0;
+       qp->r_aflags = 0;
+       qp->r_flags = 0;
+       qp->s_rnr_timeout = 0;
+       qp->s_head = 0;
+       qp->s_tail = 0;
+       qp->s_cur = 0;
+       qp->s_last = 0;
+       qp->s_ssn = 1;
+       qp->s_lsn = 0;
+       memset(qp->s_ack_queue, 0, sizeof(qp->s_ack_queue));
+       qp->r_head_ack_queue = 0;
+       qp->s_tail_ack_queue = 0;
+       qp->s_num_rd_atomic = 0;
+       if (qp->r_rq.wq) {
+               qp->r_rq.wq->head = 0;
+               qp->r_rq.wq->tail = 0;
+       }
+}
+
+/**
+ * ipath_error_qp - put a QP into the error state
+ * @qp: the QP to put into the error state
+ * @err: the receive completion error to signal if a RWQE is active
+ *
+ * Flushes both send and receive work queues.
+ * Returns true if last WQE event should be generated.
+ * The QP s_lock should be held and interrupts disabled.
+ * If we are already in error state, just return.
+ */
+int ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err)
+{
+       struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+       struct ib_wc wc;
+       int ret = 0;
+
+       if (qp->state == IB_QPS_ERR)
+               goto bail;
+
+       qp->state = IB_QPS_ERR;
+
+       spin_lock(&dev->pending_lock);
+       if (!list_empty(&qp->timerwait))
+               list_del_init(&qp->timerwait);
+       if (!list_empty(&qp->piowait))
+               list_del_init(&qp->piowait);
+       spin_unlock(&dev->pending_lock);
+
+       /* Schedule the sending tasklet to drain the send work queue. */
+       if (qp->s_last != qp->s_head)
+               ipath_schedule_send(qp);
+
+       memset(&wc, 0, sizeof(wc));
+       wc.qp = &qp->ibqp;
+       wc.opcode = IB_WC_RECV;
+
+       if (test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags)) {
+               wc.wr_id = qp->r_wr_id;
+               wc.status = err;
+               ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
+       }
+       wc.status = IB_WC_WR_FLUSH_ERR;
+
+       if (qp->r_rq.wq) {
+               struct ipath_rwq *wq;
+               u32 head;
+               u32 tail;
+
+               spin_lock(&qp->r_rq.lock);
+
+               /* sanity check pointers before trusting them */
+               wq = qp->r_rq.wq;
+               head = wq->head;
+               if (head >= qp->r_rq.size)
+                       head = 0;
+               tail = wq->tail;
+               if (tail >= qp->r_rq.size)
+                       tail = 0;
+               while (tail != head) {
+                       wc.wr_id = get_rwqe_ptr(&qp->r_rq, tail)->wr_id;
+                       if (++tail >= qp->r_rq.size)
+                               tail = 0;
+                       ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
+               }
+               wq->tail = tail;
+
+               spin_unlock(&qp->r_rq.lock);
+       } else if (qp->ibqp.event_handler)
+               ret = 1;
+
+bail:
+       return ret;
+}
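The receive-queue flush above treats the queue as a ring: clamp head/tail before trusting them, report every entry from tail up to head with a flush-error completion, and park tail at head. A standalone sketch of the ring walk, with printf standing in for ipath_cq_enter() and hypothetical types throughout.

#include <stdio.h>

#define RQ_SIZE 8   /* hypothetical ring size */

struct rwqe { unsigned long wr_id; };
struct ring { struct rwqe e[RQ_SIZE]; unsigned head, tail; };

/* Walk tail..head, reporting each entry as flushed, then park tail at head. */
static void flush_ring(struct ring *rq)
{
	unsigned head = rq->head % RQ_SIZE;   /* sanity-clamp before trusting */
	unsigned tail = rq->tail % RQ_SIZE;

	while (tail != head) {
		printf("flush wr_id=%lu\n", rq->e[tail].wr_id);
		if (++tail >= RQ_SIZE)
			tail = 0;
	}
	rq->tail = tail;
}

int main(void)
{
	struct ring rq = { .head = 3, .tail = 1 };

	for (unsigned i = 0; i < RQ_SIZE; i++)
		rq.e[i].wr_id = 100 + i;
	flush_ring(&rq);   /* reports entries 1 and 2 */
	return 0;
}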
+
+/**
+ * ipath_modify_qp - modify the attributes of a queue pair
+ * @ibqp: the queue pair whose attributes we're modifying
+ * @attr: the new attributes
+ * @attr_mask: the mask of attributes to modify
+ * @udata: user data for ipathverbs.so
+ *
+ * Returns 0 on success, otherwise returns an errno.
+ */
+int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+                   int attr_mask, struct ib_udata *udata)
+{
+       struct ipath_ibdev *dev = to_idev(ibqp->device);
+       struct ipath_qp *qp = to_iqp(ibqp);
+       enum ib_qp_state cur_state, new_state;
+       int lastwqe = 0;
+       int ret;
+
+       spin_lock_irq(&qp->s_lock);
+
+       cur_state = attr_mask & IB_QP_CUR_STATE ?
+               attr->cur_qp_state : qp->state;
+       new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
+
+       if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type,
+                               attr_mask, IB_LINK_LAYER_UNSPECIFIED))
+               goto inval;
+
+       if (attr_mask & IB_QP_AV) {
+               if (attr->ah_attr.dlid == 0 ||
+                   attr->ah_attr.dlid >= IPATH_MULTICAST_LID_BASE)
+                       goto inval;
+
+               if ((attr->ah_attr.ah_flags & IB_AH_GRH) &&
+                   (attr->ah_attr.grh.sgid_index > 1))
+                       goto inval;
+       }
+
+       if (attr_mask & IB_QP_PKEY_INDEX)
+               if (attr->pkey_index >= ipath_get_npkeys(dev->dd))
+                       goto inval;
+
+       if (attr_mask & IB_QP_MIN_RNR_TIMER)
+               if (attr->min_rnr_timer > 31)
+                       goto inval;
+
+       if (attr_mask & IB_QP_PORT)
+               if (attr->port_num == 0 ||
+                   attr->port_num > ibqp->device->phys_port_cnt)
+                       goto inval;
+
+       /*
+        * Don't allow invalid path MTU values, or values greater than
+        * 2048 unless we are configured for a 4KB MTU.
+        */
+       if ((attr_mask & IB_QP_PATH_MTU) &&
+               (ib_mtu_enum_to_int(attr->path_mtu) == -1 ||
+               (attr->path_mtu > IB_MTU_2048 && !ipath_mtu4096)))
+               goto inval;
+
+       if (attr_mask & IB_QP_PATH_MIG_STATE)
+               if (attr->path_mig_state != IB_MIG_MIGRATED &&
+                   attr->path_mig_state != IB_MIG_REARM)
+                       goto inval;
+
+       if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+               if (attr->max_dest_rd_atomic > IPATH_MAX_RDMA_ATOMIC)
+                       goto inval;
+
+       switch (new_state) {
+       case IB_QPS_RESET:
+               if (qp->state != IB_QPS_RESET) {
+                       qp->state = IB_QPS_RESET;
+                       spin_lock(&dev->pending_lock);
+                       if (!list_empty(&qp->timerwait))
+                               list_del_init(&qp->timerwait);
+                       if (!list_empty(&qp->piowait))
+                               list_del_init(&qp->piowait);
+                       spin_unlock(&dev->pending_lock);
+                       qp->s_flags &= ~IPATH_S_ANY_WAIT;
+                       spin_unlock_irq(&qp->s_lock);
+                       /* Stop the sending tasklet */
+                       tasklet_kill(&qp->s_task);
+                       wait_event(qp->wait_dma, !atomic_read(&qp->s_dma_busy));
+                       spin_lock_irq(&qp->s_lock);
+               }
+               ipath_reset_qp(qp, ibqp->qp_type);
+               break;
+
+       case IB_QPS_SQD:
+               qp->s_draining = qp->s_last != qp->s_cur;
+               qp->state = new_state;
+               break;
+
+       case IB_QPS_SQE:
+               if (qp->ibqp.qp_type == IB_QPT_RC)
+                       goto inval;
+               qp->state = new_state;
+               break;
+
+       case IB_QPS_ERR:
+               lastwqe = ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+               break;
+
+       default:
+               qp->state = new_state;
+               break;
+       }
+
+       if (attr_mask & IB_QP_PKEY_INDEX)
+               qp->s_pkey_index = attr->pkey_index;
+
+       if (attr_mask & IB_QP_DEST_QPN)
+               qp->remote_qpn = attr->dest_qp_num;
+
+       if (attr_mask & IB_QP_SQ_PSN) {
+               qp->s_psn = qp->s_next_psn = attr->sq_psn;
+               qp->s_last_psn = qp->s_next_psn - 1;
+       }
+
+       if (attr_mask & IB_QP_RQ_PSN)
+               qp->r_psn = attr->rq_psn;
+
+       if (attr_mask & IB_QP_ACCESS_FLAGS)
+               qp->qp_access_flags = attr->qp_access_flags;
+
+       if (attr_mask & IB_QP_AV) {
+               qp->remote_ah_attr = attr->ah_attr;
+               qp->s_dmult = ipath_ib_rate_to_mult(attr->ah_attr.static_rate);
+       }
+
+       if (attr_mask & IB_QP_PATH_MTU)
+               qp->path_mtu = attr->path_mtu;
+
+       if (attr_mask & IB_QP_RETRY_CNT)
+               qp->s_retry = qp->s_retry_cnt = attr->retry_cnt;
+
+       if (attr_mask & IB_QP_RNR_RETRY) {
+               qp->s_rnr_retry = attr->rnr_retry;
+               if (qp->s_rnr_retry > 7)
+                       qp->s_rnr_retry = 7;
+               qp->s_rnr_retry_cnt = qp->s_rnr_retry;
+       }
+
+       if (attr_mask & IB_QP_MIN_RNR_TIMER)
+               qp->r_min_rnr_timer = attr->min_rnr_timer;
+
+       if (attr_mask & IB_QP_TIMEOUT)
+               qp->timeout = attr->timeout;
+
+       if (attr_mask & IB_QP_QKEY)
+               qp->qkey = attr->qkey;
+
+       if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+               qp->r_max_rd_atomic = attr->max_dest_rd_atomic;
+
+       if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC)
+               qp->s_max_rd_atomic = attr->max_rd_atomic;
+
+       spin_unlock_irq(&qp->s_lock);
+
+       if (lastwqe) {
+               struct ib_event ev;
+
+               ev.device = qp->ibqp.device;
+               ev.element.qp = &qp->ibqp;
+               ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+               qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+       }
+       ret = 0;
+       goto bail;
+
+inval:
+       spin_unlock_irq(&qp->s_lock);
+       ret = -EINVAL;
+
+bail:
+       return ret;
+}
+
+int ipath_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+                  int attr_mask, struct ib_qp_init_attr *init_attr)
+{
+       struct ipath_qp *qp = to_iqp(ibqp);
+
+       attr->qp_state = qp->state;
+       attr->cur_qp_state = attr->qp_state;
+       attr->path_mtu = qp->path_mtu;
+       attr->path_mig_state = 0;
+       attr->qkey = qp->qkey;
+       attr->rq_psn = qp->r_psn;
+       attr->sq_psn = qp->s_next_psn;
+       attr->dest_qp_num = qp->remote_qpn;
+       attr->qp_access_flags = qp->qp_access_flags;
+       attr->cap.max_send_wr = qp->s_size - 1;
+       attr->cap.max_recv_wr = qp->ibqp.srq ? 0 : qp->r_rq.size - 1;
+       attr->cap.max_send_sge = qp->s_max_sge;
+       attr->cap.max_recv_sge = qp->r_rq.max_sge;
+       attr->cap.max_inline_data = 0;
+       attr->ah_attr = qp->remote_ah_attr;
+       memset(&attr->alt_ah_attr, 0, sizeof(attr->alt_ah_attr));
+       attr->pkey_index = qp->s_pkey_index;
+       attr->alt_pkey_index = 0;
+       attr->en_sqd_async_notify = 0;
+       attr->sq_draining = qp->s_draining;
+       attr->max_rd_atomic = qp->s_max_rd_atomic;
+       attr->max_dest_rd_atomic = qp->r_max_rd_atomic;
+       attr->min_rnr_timer = qp->r_min_rnr_timer;
+       attr->port_num = 1;
+       attr->timeout = qp->timeout;
+       attr->retry_cnt = qp->s_retry_cnt;
+       attr->rnr_retry = qp->s_rnr_retry_cnt;
+       attr->alt_port_num = 0;
+       attr->alt_timeout = 0;
+
+       init_attr->event_handler = qp->ibqp.event_handler;
+       init_attr->qp_context = qp->ibqp.qp_context;
+       init_attr->send_cq = qp->ibqp.send_cq;
+       init_attr->recv_cq = qp->ibqp.recv_cq;
+       init_attr->srq = qp->ibqp.srq;
+       init_attr->cap = attr->cap;
+       if (qp->s_flags & IPATH_S_SIGNAL_REQ_WR)
+               init_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
+       else
+               init_attr->sq_sig_type = IB_SIGNAL_ALL_WR;
+       init_attr->qp_type = qp->ibqp.qp_type;
+       init_attr->port_num = 1;
+       return 0;
+}
+
+/**
+ * ipath_compute_aeth - compute the AETH (syndrome + MSN)
+ * @qp: the queue pair to compute the AETH for
+ *
+ * Returns the AETH.
+ */
+__be32 ipath_compute_aeth(struct ipath_qp *qp)
+{
+       u32 aeth = qp->r_msn & IPATH_MSN_MASK;
+
+       if (qp->ibqp.srq) {
+               /*
+                * Shared receive queues don't generate credits.
+                * Set the credit field to the invalid value.
+                */
+               aeth |= IPATH_AETH_CREDIT_INVAL << IPATH_AETH_CREDIT_SHIFT;
+       } else {
+               u32 min, max, x;
+               u32 credits;
+               struct ipath_rwq *wq = qp->r_rq.wq;
+               u32 head;
+               u32 tail;
+
+               /* sanity check pointers before trusting them */
+               head = wq->head;
+               if (head >= qp->r_rq.size)
+                       head = 0;
+               tail = wq->tail;
+               if (tail >= qp->r_rq.size)
+                       tail = 0;
+               /*
+                * Compute the number of credits available (RWQEs).
+                * XXX Not holding the r_rq.lock here so there is a small
+                * chance that the pair of reads are not atomic.
+                */
+               credits = head - tail;
+               if ((int)credits < 0)
+                       credits += qp->r_rq.size;
+               /*
+                * Binary search the credit table to find the code to
+                * use.
+                */
+               min = 0;
+               max = 31;
+               for (;;) {
+                       x = (min + max) / 2;
+                       if (credit_table[x] == credits)
+                               break;
+                       if (credit_table[x] > credits)
+                               max = x;
+                       else if (min == x)
+                               break;
+                       else
+                               min = x;
+               }
+               aeth |= x << IPATH_AETH_CREDIT_SHIFT;
+       }
+       return cpu_to_be32(aeth);
+}
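The credit field of the AETH is a 5-bit code, and the loop above binary-searches credit_table[] for a code whose value matches the free RWQE count, settling on the largest code that does not exceed it when there is no exact match. The same search as a standalone program, using a local copy of the table; credits_to_code() is an illustrative name, not a driver function.

#include <stdio.h>

/* Same code-to-credits table the driver uses (31 entries, codes 0x00..0x1E). */
static const unsigned credit_table[31] = {
	0, 1, 2, 3, 4, 6, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192,
	256, 384, 512, 768, 1024, 1536, 2048, 3072, 4096, 6144, 8192,
	12288, 16384, 24576, 32768
};

/* Binary search for the credit code best matching 'credits'. */
static unsigned credits_to_code(unsigned credits)
{
	unsigned min = 0, max = 31, x;

	for (;;) {
		x = (min + max) / 2;
		if (credit_table[x] == credits)
			break;
		if (credit_table[x] > credits)
			max = x;
		else if (min == x)
			break;
		else
			min = x;
	}
	return x;
}

int main(void)
{
	printf("7 credits  -> code %u (%u)\n", credits_to_code(7),
	       credit_table[credits_to_code(7)]);
	printf("40 credits -> code %u (%u)\n", credits_to_code(40),
	       credit_table[credits_to_code(40)]);
	return 0;
}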
+
+/**
+ * ipath_create_qp - create a queue pair for a device
+ * @ibpd: the protection domain whose device we create the queue pair for
+ * @init_attr: the attributes of the queue pair
+ * @udata: unused by InfiniPath
+ *
+ * Returns the queue pair on success, otherwise returns an errno.
+ *
+ * Called by the ib_create_qp() core verbs function.
+ */
+struct ib_qp *ipath_create_qp(struct ib_pd *ibpd,
+                             struct ib_qp_init_attr *init_attr,
+                             struct ib_udata *udata)
+{
+       struct ipath_qp *qp;
+       int err;
+       struct ipath_swqe *swq = NULL;
+       struct ipath_ibdev *dev;
+       size_t sz;
+       size_t sg_list_sz;
+       struct ib_qp *ret;
+
+       if (init_attr->create_flags) {
+               ret = ERR_PTR(-EINVAL);
+               goto bail;
+       }
+
+       if (init_attr->cap.max_send_sge > ib_ipath_max_sges ||
+           init_attr->cap.max_send_wr > ib_ipath_max_qp_wrs) {
+               ret = ERR_PTR(-EINVAL);
+               goto bail;
+       }
+
+       /* Check receive queue parameters if no SRQ is specified. */
+       if (!init_attr->srq) {
+               if (init_attr->cap.max_recv_sge > ib_ipath_max_sges ||
+                   init_attr->cap.max_recv_wr > ib_ipath_max_qp_wrs) {
+                       ret = ERR_PTR(-EINVAL);
+                       goto bail;
+               }
+               if (init_attr->cap.max_send_sge +
+                   init_attr->cap.max_send_wr +
+                   init_attr->cap.max_recv_sge +
+                   init_attr->cap.max_recv_wr == 0) {
+                       ret = ERR_PTR(-EINVAL);
+                       goto bail;
+               }
+       }
+
+       switch (init_attr->qp_type) {
+       case IB_QPT_UC:
+       case IB_QPT_RC:
+       case IB_QPT_UD:
+       case IB_QPT_SMI:
+       case IB_QPT_GSI:
+               sz = sizeof(struct ipath_sge) *
+                       init_attr->cap.max_send_sge +
+                       sizeof(struct ipath_swqe);
+               swq = vmalloc((init_attr->cap.max_send_wr + 1) * sz);
+               if (swq == NULL) {
+                       ret = ERR_PTR(-ENOMEM);
+                       goto bail;
+               }
+               sz = sizeof(*qp);
+               sg_list_sz = 0;
+               if (init_attr->srq) {
+                       struct ipath_srq *srq = to_isrq(init_attr->srq);
+
+                       if (srq->rq.max_sge > 1)
+                               sg_list_sz = sizeof(*qp->r_sg_list) *
+                                       (srq->rq.max_sge - 1);
+               } else if (init_attr->cap.max_recv_sge > 1)
+                       sg_list_sz = sizeof(*qp->r_sg_list) *
+                               (init_attr->cap.max_recv_sge - 1);
+               qp = kmalloc(sz + sg_list_sz, GFP_KERNEL);
+               if (!qp) {
+                       ret = ERR_PTR(-ENOMEM);
+                       goto bail_swq;
+               }
+               if (sg_list_sz && (init_attr->qp_type == IB_QPT_UD ||
+                   init_attr->qp_type == IB_QPT_SMI ||
+                   init_attr->qp_type == IB_QPT_GSI)) {
+                       qp->r_ud_sg_list = kmalloc(sg_list_sz, GFP_KERNEL);
+                       if (!qp->r_ud_sg_list) {
+                               ret = ERR_PTR(-ENOMEM);
+                               goto bail_qp;
+                       }
+               } else
+                       qp->r_ud_sg_list = NULL;
+               if (init_attr->srq) {
+                       sz = 0;
+                       qp->r_rq.size = 0;
+                       qp->r_rq.max_sge = 0;
+                       qp->r_rq.wq = NULL;
+                       init_attr->cap.max_recv_wr = 0;
+                       init_attr->cap.max_recv_sge = 0;
+               } else {
+                       qp->r_rq.size = init_attr->cap.max_recv_wr + 1;
+                       qp->r_rq.max_sge = init_attr->cap.max_recv_sge;
+                       sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) +
+                               sizeof(struct ipath_rwqe);
+                       qp->r_rq.wq = vmalloc_user(sizeof(struct ipath_rwq) +
+                                             qp->r_rq.size * sz);
+                       if (!qp->r_rq.wq) {
+                               ret = ERR_PTR(-ENOMEM);
+                               goto bail_sg_list;
+                       }
+               }
+
+               /*
+                * ib_create_qp() will initialize qp->ibqp
+                * except for qp->ibqp.qp_num.
+                */
+               spin_lock_init(&qp->s_lock);
+               spin_lock_init(&qp->r_rq.lock);
+               atomic_set(&qp->refcount, 0);
+               init_waitqueue_head(&qp->wait);
+               init_waitqueue_head(&qp->wait_dma);
+               tasklet_init(&qp->s_task, ipath_do_send, (unsigned long)qp);
+               INIT_LIST_HEAD(&qp->piowait);
+               INIT_LIST_HEAD(&qp->timerwait);
+               qp->state = IB_QPS_RESET;
+               qp->s_wq = swq;
+               qp->s_size = init_attr->cap.max_send_wr + 1;
+               qp->s_max_sge = init_attr->cap.max_send_sge;
+               if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR)
+                       qp->s_flags = IPATH_S_SIGNAL_REQ_WR;
+               else
+                       qp->s_flags = 0;
+               dev = to_idev(ibpd->device);
+               err = ipath_alloc_qpn(&dev->qp_table, qp,
+                                     init_attr->qp_type);
+               if (err) {
+                       ret = ERR_PTR(err);
+                       vfree(qp->r_rq.wq);
+                       goto bail_sg_list;
+               }
+               qp->ip = NULL;
+               qp->s_tx = NULL;
+               ipath_reset_qp(qp, init_attr->qp_type);
+               break;
+
+       default:
+               /* Don't support raw QPs */
+               ret = ERR_PTR(-ENOSYS);
+               goto bail;
+       }
+
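+       /* This driver does not support inline send data. */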
+       init_attr->cap.max_inline_data = 0;
+
+       /*
+        * Return the address of the RWQ as the offset to mmap.
+        * See ipath_mmap() for details.
+        */
+       if (udata && udata->outlen >= sizeof(__u64)) {
+               if (!qp->r_rq.wq) {
+                       __u64 offset = 0;
+
+                       err = ib_copy_to_udata(udata, &offset,
+                                              sizeof(offset));
+                       if (err) {
+                               ret = ERR_PTR(err);
+                               goto bail_ip;
+                       }
+               } else {
+                       u32 s = sizeof(struct ipath_rwq) +
+                               qp->r_rq.size * sz;
+
+                       qp->ip =
+                           ipath_create_mmap_info(dev, s,
+                                                  ibpd->uobject->context,
+                                                  qp->r_rq.wq);
+                       if (!qp->ip) {
+                               ret = ERR_PTR(-ENOMEM);
+                               goto bail_ip;
+                       }
+
+                       err = ib_copy_to_udata(udata, &(qp->ip->offset),
+                                              sizeof(qp->ip->offset));
+                       if (err) {
+                               ret = ERR_PTR(err);
+                               goto bail_ip;
+                       }
+               }
+       }
+
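+       /* Count the new QP against the ib_ipath_max_qps limit. */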
+       spin_lock(&dev->n_qps_lock);
+       if (dev->n_qps_allocated == ib_ipath_max_qps) {
+               spin_unlock(&dev->n_qps_lock);
+               ret = ERR_PTR(-ENOMEM);
+               goto bail_ip;
+       }
+
+       dev->n_qps_allocated++;
+       spin_unlock(&dev->n_qps_lock);
+
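+       /* Queue the mmap info so userspace can map the receive work queue. */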
+       if (qp->ip) {
+               spin_lock_irq(&dev->pending_lock);
+               list_add(&qp->ip->pending_mmaps, &dev->pending_mmaps);
+               spin_unlock_irq(&dev->pending_lock);
+       }
+
+       ret = &qp->ibqp;
+       goto bail;
+
+bail_ip:
+       if (qp->ip)
+               kref_put(&qp->ip->ref, ipath_release_mmap_info);
+       else
+               vfree(qp->r_rq.wq);
+       ipath_free_qp(&dev->qp_table, qp);
+       free_qpn(&dev->qp_table, qp->ibqp.qp_num);
+bail_sg_list:
+       kfree(qp->r_ud_sg_list);
+bail_qp:
+       kfree(qp);
+bail_swq:
+       vfree(swq);
+bail:
+       return ret;
+}
+
+/**
+ * ipath_destroy_qp - destroy a queue pair
+ * @ibqp: the queue pair to destroy
+ *
+ * Returns 0 on success.
+ *
+ * Note that this can be called while the QP is actively sending or
+ * receiving!
+ */
+int ipath_destroy_qp(struct ib_qp *ibqp)
+{
+       struct ipath_qp *qp = to_iqp(ibqp);
+       struct ipath_ibdev *dev = to_idev(ibqp->device);
+
+       /* Make sure HW and driver activity is stopped. */
+       spin_lock_irq(&qp->s_lock);
+       if (qp->state != IB_QPS_RESET) {
+               qp->state = IB_QPS_RESET;
+               spin_lock(&dev->pending_lock);
+               if (!list_empty(&qp->timerwait))
+                       list_del_init(&qp->timerwait);
+               if (!list_empty(&qp->piowait))
+                       list_del_init(&qp->piowait);
+               spin_unlock(&dev->pending_lock);
+               qp->s_flags &= ~IPATH_S_ANY_WAIT;
+               spin_unlock_irq(&qp->s_lock);
+               /* Stop the sending tasklet */
+               tasklet_kill(&qp->s_task);
+               wait_event(qp->wait_dma, !atomic_read(&qp->s_dma_busy));
+       } else
+               spin_unlock_irq(&qp->s_lock);
+
+       ipath_free_qp(&dev->qp_table, qp);
+
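+       /* Return any send DMA descriptor still held by the QP to the free list. */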
+       if (qp->s_tx) {
+               atomic_dec(&qp->refcount);
+               if (qp->s_tx->txreq.flags & IPATH_SDMA_TXREQ_F_FREEBUF)
+                       kfree(qp->s_tx->txreq.map_addr);
+               spin_lock_irq(&dev->pending_lock);
+               list_add(&qp->s_tx->txreq.list, &dev->txreq_free);
+               spin_unlock_irq(&dev->pending_lock);
+               qp->s_tx = NULL;
+       }
+
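+       /* Wait for the last reference to the QP to be dropped. */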
+       wait_event(qp->wait, !atomic_read(&qp->refcount));
+
+       /* all users cleaned up, mark it available */
+       free_qpn(&dev->qp_table, qp->ibqp.qp_num);
+       spin_lock(&dev->n_qps_lock);
+       dev->n_qps_allocated--;
+       spin_unlock(&dev->n_qps_lock);
+
+       if (qp->ip)
+               kref_put(&qp->ip->ref, ipath_release_mmap_info);
+       else
+               vfree(qp->r_rq.wq);
+       kfree(qp->r_ud_sg_list);
+       vfree(qp->s_wq);
+       kfree(qp);
+       return 0;
+}
+
+/**
+ * ipath_init_qp_table - initialize the QP table for a device
+ * @idev: the device whose QP table we're initializing
+ * @size: the size of the QP table
+ *
+ * Returns 0 on success, otherwise returns an errno.
+ */
+int ipath_init_qp_table(struct ipath_ibdev *idev, int size)
+{
+       int i;
+       int ret;
+
+       idev->qp_table.last = 1;        /* QPN 0 and 1 are special. */
+       idev->qp_table.max = size;
+       idev->qp_table.nmaps = 1;
+       idev->qp_table.table = kzalloc(size * sizeof(*idev->qp_table.table),
+                                      GFP_KERNEL);
+       if (idev->qp_table.table == NULL) {
+               ret = -ENOMEM;
+               goto bail;
+       }
+
+       for (i = 0; i < ARRAY_SIZE(idev->qp_table.map); i++) {
+               atomic_set(&idev->qp_table.map[i].n_free, BITS_PER_PAGE);
+               idev->qp_table.map[i].page = NULL;
+       }
+
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+/**
+ * ipath_get_credit - handle the credit field of an incoming AETH
+ * @qp: the QP the AETH arrived on
+ * @aeth: the Acknowledge Extended Transport Header
+ *
+ * The QP s_lock should be held.
+ */
+void ipath_get_credit(struct ipath_qp *qp, u32 aeth)
+{
+       u32 credit = (aeth >> IPATH_AETH_CREDIT_SHIFT) & IPATH_AETH_CREDIT_MASK;
+
+       /*
+        * If the credit is invalid, we can send
+        * as many packets as we like.  Otherwise, we have to
+        * honor the credit field.
+        */
+       if (credit == IPATH_AETH_CREDIT_INVAL)
+               qp->s_lsn = (u32) -1;
+       else if (qp->s_lsn != (u32) -1) {
+               /* Compute new LSN (i.e., MSN + credit) */
+               credit = (aeth + credit_table[credit]) & IPATH_MSN_MASK;
+               if (ipath_cmp24(credit, qp->s_lsn) > 0)
+                       qp->s_lsn = credit;
+       }
+
+       /* Restart sending if it was blocked due to lack of credits. */
+       if ((qp->s_flags & IPATH_S_WAIT_SSN_CREDIT) &&
+           qp->s_cur != qp->s_head &&
+           (qp->s_lsn == (u32) -1 ||
+            ipath_cmp24(get_swqe_ptr(qp, qp->s_cur)->ssn,
+                        qp->s_lsn + 1) <= 0))
+               ipath_schedule_send(qp);
+}
diff --git a/drivers/staging/rdma/ipath/ipath_rc.c b/drivers/staging/rdma/ipath/ipath_rc.c
new file mode 100644 (file)
index 0000000..79b3dbc
--- /dev/null
@@ -0,0 +1,1969 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/io.h>
+
+#include "ipath_verbs.h"
+#include "ipath_kernel.h"
+
+/* cut down ridiculously long IB macro names */
+#define OP(x) IB_OPCODE_RC_##x
+
+static u32 restart_sge(struct ipath_sge_state *ss, struct ipath_swqe *wqe,
+                      u32 psn, u32 pmtu)
+{
+       u32 len;
+
+       len = ((psn - wqe->psn) & IPATH_PSN_MASK) * pmtu;
+       ss->sge = wqe->sg_list[0];
+       ss->sg_list = wqe->sg_list + 1;
+       ss->num_sge = wqe->wr.num_sge;
+       ipath_skip_sge(ss, len);
+       return wqe->length - len;
+}
+
+/**
+ * ipath_init_restart - initialize the qp->s_sge after a restart
+ * @qp: the QP whose SGE we're restarting
+ * @wqe: the work queue to initialize the QP's SGE from
+ *
+ * The QP s_lock should be held and interrupts disabled.
+ */
+static void ipath_init_restart(struct ipath_qp *qp, struct ipath_swqe *wqe)
+{
+       struct ipath_ibdev *dev;
+
+       qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn,
+                               ib_mtu_enum_to_int(qp->path_mtu));
+       dev = to_idev(qp->ibqp.device);
+       spin_lock(&dev->pending_lock);
+       if (list_empty(&qp->timerwait))
+               list_add_tail(&qp->timerwait,
+                             &dev->pending[dev->pending_index]);
+       spin_unlock(&dev->pending_lock);
+}
+
+/**
+ * ipath_make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
+ * @dev: the device for this QP
+ * @qp: a pointer to the QP
+ * @ohdr: a pointer to the IB header being constructed
+ * @pmtu: the path MTU
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ * Note that we are on the responder side of the QP context and that
+ * the QP s_lock must be held.
+ */
+static int ipath_make_rc_ack(struct ipath_ibdev *dev, struct ipath_qp *qp,
+                            struct ipath_other_headers *ohdr, u32 pmtu)
+{
+       struct ipath_ack_entry *e;
+       u32 hwords;
+       u32 len;
+       u32 bth0;
+       u32 bth2;
+
+       /* Don't send an ACK if we aren't supposed to. */
+       if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
+               goto bail;
+
+       /* header size in 32-bit words LRH+BTH = (8+12)/4. */
+       hwords = 5;
+
+       switch (qp->s_ack_state) {
+       case OP(RDMA_READ_RESPONSE_LAST):
+       case OP(RDMA_READ_RESPONSE_ONLY):
+       case OP(ATOMIC_ACKNOWLEDGE):
+               /*
+                * We can increment the tail pointer now that the last
+                * response has been sent instead of only being
+                * constructed.
+                */
+               if (++qp->s_tail_ack_queue > IPATH_MAX_RDMA_ATOMIC)
+                       qp->s_tail_ack_queue = 0;
+               /* FALLTHROUGH */
+       case OP(SEND_ONLY):
+       case OP(ACKNOWLEDGE):
+               /* Check for no next entry in the queue. */
+               if (qp->r_head_ack_queue == qp->s_tail_ack_queue) {
+                       if (qp->s_flags & IPATH_S_ACK_PENDING)
+                               goto normal;
+                       qp->s_ack_state = OP(ACKNOWLEDGE);
+                       goto bail;
+               }
+
+               e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+               if (e->opcode == OP(RDMA_READ_REQUEST)) {
+                       /* Copy SGE state in case we need to resend */
+                       qp->s_ack_rdma_sge = e->rdma_sge;
+                       qp->s_cur_sge = &qp->s_ack_rdma_sge;
+                       len = e->rdma_sge.sge.sge_length;
+                       if (len > pmtu) {
+                               len = pmtu;
+                               qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
+                       } else {
+                               qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
+                               e->sent = 1;
+                       }
+                       ohdr->u.aeth = ipath_compute_aeth(qp);
+                       hwords++;
+                       qp->s_ack_rdma_psn = e->psn;
+                       bth2 = qp->s_ack_rdma_psn++ & IPATH_PSN_MASK;
+               } else {
+                       /* COMPARE_SWAP or FETCH_ADD */
+                       qp->s_cur_sge = NULL;
+                       len = 0;
+                       qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
+                       ohdr->u.at.aeth = ipath_compute_aeth(qp);
+                       ohdr->u.at.atomic_ack_eth[0] =
+                               cpu_to_be32(e->atomic_data >> 32);
+                       ohdr->u.at.atomic_ack_eth[1] =
+                               cpu_to_be32(e->atomic_data);
+                       hwords += sizeof(ohdr->u.at) / sizeof(u32);
+                       bth2 = e->psn;
+                       e->sent = 1;
+               }
+               bth0 = qp->s_ack_state << 24;
+               break;
+
+       case OP(RDMA_READ_RESPONSE_FIRST):
+               qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
+               /* FALLTHROUGH */
+       case OP(RDMA_READ_RESPONSE_MIDDLE):
+               len = qp->s_ack_rdma_sge.sge.sge_length;
+               if (len > pmtu)
+                       len = pmtu;
+               else {
+                       ohdr->u.aeth = ipath_compute_aeth(qp);
+                       hwords++;
+                       qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
+                       qp->s_ack_queue[qp->s_tail_ack_queue].sent = 1;
+               }
+               bth0 = qp->s_ack_state << 24;
+               bth2 = qp->s_ack_rdma_psn++ & IPATH_PSN_MASK;
+               break;
+
+       default:
+       normal:
+               /*
+                * Send a regular ACK.
+                * Set s_ack_state so that we only move it back to
+                * ACKNOWLEDGE after this ACK has actually been sent
+                * (see the SEND_ONLY/ACKNOWLEDGE cases above).
+                */
+               qp->s_ack_state = OP(SEND_ONLY);
+               qp->s_flags &= ~IPATH_S_ACK_PENDING;
+               qp->s_cur_sge = NULL;
+               if (qp->s_nak_state)
+                       ohdr->u.aeth =
+                               cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) |
+                                           (qp->s_nak_state <<
+                                            IPATH_AETH_CREDIT_SHIFT));
+               else
+                       ohdr->u.aeth = ipath_compute_aeth(qp);
+               hwords++;
+               len = 0;
+               bth0 = OP(ACKNOWLEDGE) << 24;
+               bth2 = qp->s_ack_psn & IPATH_PSN_MASK;
+       }
+       qp->s_hdrwords = hwords;
+       qp->s_cur_size = len;
+       ipath_make_ruc_header(dev, qp, ohdr, bth0, bth2);
+       return 1;
+
+bail:
+       return 0;
+}
+
+/**
+ * ipath_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
+ * @qp: a pointer to the QP
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ */
+int ipath_make_rc_req(struct ipath_qp *qp)
+{
+       struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+       struct ipath_other_headers *ohdr;
+       struct ipath_sge_state *ss;
+       struct ipath_swqe *wqe;
+       u32 hwords;
+       u32 len;
+       u32 bth0;
+       u32 bth2;
+       u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
+       char newreq;
+       unsigned long flags;
+       int ret = 0;
+
+       ohdr = &qp->s_hdr.u.oth;
+       if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
+               ohdr = &qp->s_hdr.u.l.oth;
+
+       /*
+        * The lock is needed to synchronize between the sending tasklet,
+        * the receive interrupt handler, and timeout resends.
+        */
+       spin_lock_irqsave(&qp->s_lock, flags);
+
+       /* Sending responses has higher priority than sending requests. */
+       if ((qp->r_head_ack_queue != qp->s_tail_ack_queue ||
+            (qp->s_flags & IPATH_S_ACK_PENDING) ||
+            qp->s_ack_state != OP(ACKNOWLEDGE)) &&
+           ipath_make_rc_ack(dev, qp, ohdr, pmtu))
+               goto done;
+
+       if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)) {
+               if (!(ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND))
+                       goto bail;
+               /* We are in the error state, flush the work request. */
+               if (qp->s_last == qp->s_head)
+                       goto bail;
+               /* If DMAs are in progress, we can't flush immediately. */
+               if (atomic_read(&qp->s_dma_busy)) {
+                       qp->s_flags |= IPATH_S_WAIT_DMA;
+                       goto bail;
+               }
+               wqe = get_swqe_ptr(qp, qp->s_last);
+               ipath_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
+               goto done;
+       }
+
+       /* Leave BUSY set until RNR timeout. */
+       if (qp->s_rnr_timeout) {
+               qp->s_flags |= IPATH_S_WAITING;
+               goto bail;
+       }
+
+       /* header size in 32-bit words LRH+BTH = (8+12)/4. */
+       hwords = 5;
+       bth0 = 1 << 22; /* Set M bit */
+
+       /* Send a request. */
+       wqe = get_swqe_ptr(qp, qp->s_cur);
+       switch (qp->s_state) {
+       default:
+               if (!(ib_ipath_state_ops[qp->state] &
+                   IPATH_PROCESS_NEXT_SEND_OK))
+                       goto bail;
+               /*
+                * Resend an old request or start a new one.
+                *
+                * We keep track of the current SWQE so that
+                * we don't reset the "furthest progress" state
+                * if we need to back up.
+                */
+               newreq = 0;
+               if (qp->s_cur == qp->s_tail) {
+                       /* Check if send work queue is empty. */
+                       if (qp->s_tail == qp->s_head)
+                               goto bail;
+                       /*
+                        * If a fence is requested, wait for previous
+                        * RDMA read and atomic operations to finish.
+                        */
+                       if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
+                           qp->s_num_rd_atomic) {
+                               qp->s_flags |= IPATH_S_FENCE_PENDING;
+                               goto bail;
+                       }
+                       wqe->psn = qp->s_next_psn;
+                       newreq = 1;
+               }
+               /*
+                * Note that we have to be careful not to modify the
+                * original work request since we may need to resend
+                * it.
+                */
+               len = wqe->length;
+               ss = &qp->s_sge;
+               bth2 = 0;
+               switch (wqe->wr.opcode) {
+               case IB_WR_SEND:
+               case IB_WR_SEND_WITH_IMM:
+                       /* If no credit, return. */
+                       if (qp->s_lsn != (u32) -1 &&
+                           ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) {
+                               qp->s_flags |= IPATH_S_WAIT_SSN_CREDIT;
+                               goto bail;
+                       }
+                       wqe->lpsn = wqe->psn;
+                       if (len > pmtu) {
+                               wqe->lpsn += (len - 1) / pmtu;
+                               qp->s_state = OP(SEND_FIRST);
+                               len = pmtu;
+                               break;
+                       }
+                       if (wqe->wr.opcode == IB_WR_SEND)
+                               qp->s_state = OP(SEND_ONLY);
+                       else {
+                               qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
+                               /* Immediate data comes after the BTH */
+                               ohdr->u.imm_data = wqe->wr.ex.imm_data;
+                               hwords += 1;
+                       }
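+                       /* Bit 23 of BTH word 0 is the Solicited Event bit. */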
+                       if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+                               bth0 |= 1 << 23;
+                       bth2 = 1 << 31; /* Request ACK. */
+                       if (++qp->s_cur == qp->s_size)
+                               qp->s_cur = 0;
+                       break;
+
+               case IB_WR_RDMA_WRITE:
+                       if (newreq && qp->s_lsn != (u32) -1)
+                               qp->s_lsn++;
+                       /* FALLTHROUGH */
+               case IB_WR_RDMA_WRITE_WITH_IMM:
+                       /* If no credit, return. */
+                       if (qp->s_lsn != (u32) -1 &&
+                           ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) {
+                               qp->s_flags |= IPATH_S_WAIT_SSN_CREDIT;
+                               goto bail;
+                       }
+                       ohdr->u.rc.reth.vaddr =
+                               cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
+                       ohdr->u.rc.reth.rkey =
+                               cpu_to_be32(wqe->wr.wr.rdma.rkey);
+                       ohdr->u.rc.reth.length = cpu_to_be32(len);
+                       hwords += sizeof(struct ib_reth) / sizeof(u32);
+                       wqe->lpsn = wqe->psn;
+                       if (len > pmtu) {
+                               wqe->lpsn += (len - 1) / pmtu;
+                               qp->s_state = OP(RDMA_WRITE_FIRST);
+                               len = pmtu;
+                               break;
+                       }
+                       if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
+                               qp->s_state = OP(RDMA_WRITE_ONLY);
+                       else {
+                               qp->s_state =
+                                       OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
+                               /* Immediate data comes after RETH */
+                               ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
+                               hwords += 1;
+                               if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+                                       bth0 |= 1 << 23;
+                       }
+                       bth2 = 1 << 31; /* Request ACK. */
+                       if (++qp->s_cur == qp->s_size)
+                               qp->s_cur = 0;
+                       break;
+
+               case IB_WR_RDMA_READ:
+                       /*
+                        * Don't allow more operations to be started
+                        * than the QP limits allow.
+                        */
+                       if (newreq) {
+                               if (qp->s_num_rd_atomic >=
+                                   qp->s_max_rd_atomic) {
+                                       qp->s_flags |= IPATH_S_RDMAR_PENDING;
+                                       goto bail;
+                               }
+                               qp->s_num_rd_atomic++;
+                               if (qp->s_lsn != (u32) -1)
+                                       qp->s_lsn++;
+                               /*
+                                * Adjust s_next_psn to count the
+                                * expected number of responses.
+                                */
+                               if (len > pmtu)
+                                       qp->s_next_psn += (len - 1) / pmtu;
+                               wqe->lpsn = qp->s_next_psn++;
+                       }
+                       ohdr->u.rc.reth.vaddr =
+                               cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
+                       ohdr->u.rc.reth.rkey =
+                               cpu_to_be32(wqe->wr.wr.rdma.rkey);
+                       ohdr->u.rc.reth.length = cpu_to_be32(len);
+                       qp->s_state = OP(RDMA_READ_REQUEST);
+                       hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
+                       ss = NULL;
+                       len = 0;
+                       if (++qp->s_cur == qp->s_size)
+                               qp->s_cur = 0;
+                       break;
+
+               case IB_WR_ATOMIC_CMP_AND_SWP:
+               case IB_WR_ATOMIC_FETCH_AND_ADD:
+                       /*
+                        * Don't allow more operations to be started
+                        * than the QP limits allow.
+                        */
+                       if (newreq) {
+                               if (qp->s_num_rd_atomic >=
+                                   qp->s_max_rd_atomic) {
+                                       qp->s_flags |= IPATH_S_RDMAR_PENDING;
+                                       goto bail;
+                               }
+                               qp->s_num_rd_atomic++;
+                               if (qp->s_lsn != (u32) -1)
+                                       qp->s_lsn++;
+                               wqe->lpsn = wqe->psn;
+                       }
+                       if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
+                               qp->s_state = OP(COMPARE_SWAP);
+                               ohdr->u.atomic_eth.swap_data = cpu_to_be64(
+                                       wqe->wr.wr.atomic.swap);
+                               ohdr->u.atomic_eth.compare_data = cpu_to_be64(
+                                       wqe->wr.wr.atomic.compare_add);
+                       } else {
+                               qp->s_state = OP(FETCH_ADD);
+                               ohdr->u.atomic_eth.swap_data = cpu_to_be64(
+                                       wqe->wr.wr.atomic.compare_add);
+                               ohdr->u.atomic_eth.compare_data = 0;
+                       }
+                       ohdr->u.atomic_eth.vaddr[0] = cpu_to_be32(
+                               wqe->wr.wr.atomic.remote_addr >> 32);
+                       ohdr->u.atomic_eth.vaddr[1] = cpu_to_be32(
+                               wqe->wr.wr.atomic.remote_addr);
+                       ohdr->u.atomic_eth.rkey = cpu_to_be32(
+                               wqe->wr.wr.atomic.rkey);
+                       hwords += sizeof(struct ib_atomic_eth) / sizeof(u32);
+                       ss = NULL;
+                       len = 0;
+                       if (++qp->s_cur == qp->s_size)
+                               qp->s_cur = 0;
+                       break;
+
+               default:
+                       goto bail;
+               }
+               qp->s_sge.sge = wqe->sg_list[0];
+               qp->s_sge.sg_list = wqe->sg_list + 1;
+               qp->s_sge.num_sge = wqe->wr.num_sge;
+               qp->s_len = wqe->length;
+               if (newreq) {
+                       qp->s_tail++;
+                       if (qp->s_tail >= qp->s_size)
+                               qp->s_tail = 0;
+               }
+               bth2 |= qp->s_psn & IPATH_PSN_MASK;
+               if (wqe->wr.opcode == IB_WR_RDMA_READ)
+                       qp->s_psn = wqe->lpsn + 1;
+               else {
+                       qp->s_psn++;
+                       if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
+                               qp->s_next_psn = qp->s_psn;
+               }
+               /*
+                * Put the QP on the pending list so lost ACKs will cause
+                * a retry.  More than one request can be pending so the
+                * QP may already be on the dev->pending list.
+                */
+               spin_lock(&dev->pending_lock);
+               if (list_empty(&qp->timerwait))
+                       list_add_tail(&qp->timerwait,
+                                     &dev->pending[dev->pending_index]);
+               spin_unlock(&dev->pending_lock);
+               break;
+
+       case OP(RDMA_READ_RESPONSE_FIRST):
+               /*
+                * This case can only happen if a send is restarted.
+                * See ipath_restart_rc().
+                */
+               ipath_init_restart(qp, wqe);
+               /* FALLTHROUGH */
+       case OP(SEND_FIRST):
+               qp->s_state = OP(SEND_MIDDLE);
+               /* FALLTHROUGH */
+       case OP(SEND_MIDDLE):
+               bth2 = qp->s_psn++ & IPATH_PSN_MASK;
+               if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
+                       qp->s_next_psn = qp->s_psn;
+               ss = &qp->s_sge;
+               len = qp->s_len;
+               if (len > pmtu) {
+                       len = pmtu;
+                       break;
+               }
+               if (wqe->wr.opcode == IB_WR_SEND)
+                       qp->s_state = OP(SEND_LAST);
+               else {
+                       qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
+                       /* Immediate data comes after the BTH */
+                       ohdr->u.imm_data = wqe->wr.ex.imm_data;
+                       hwords += 1;
+               }
+               if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+                       bth0 |= 1 << 23;
+               bth2 |= 1 << 31;        /* Request ACK. */
+               qp->s_cur++;
+               if (qp->s_cur >= qp->s_size)
+                       qp->s_cur = 0;
+               break;
+
+       case OP(RDMA_READ_RESPONSE_LAST):
+               /*
+                * This case can only happen if a RDMA write is restarted.
+                * See ipath_restart_rc().
+                */
+               ipath_init_restart(qp, wqe);
+               /* FALLTHROUGH */
+       case OP(RDMA_WRITE_FIRST):
+               qp->s_state = OP(RDMA_WRITE_MIDDLE);
+               /* FALLTHROUGH */
+       case OP(RDMA_WRITE_MIDDLE):
+               bth2 = qp->s_psn++ & IPATH_PSN_MASK;
+               if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
+                       qp->s_next_psn = qp->s_psn;
+               ss = &qp->s_sge;
+               len = qp->s_len;
+               if (len > pmtu) {
+                       len = pmtu;
+                       break;
+               }
+               if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
+                       qp->s_state = OP(RDMA_WRITE_LAST);
+               else {
+                       qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
+                       /* Immediate data comes after the BTH */
+                       ohdr->u.imm_data = wqe->wr.ex.imm_data;
+                       hwords += 1;
+                       if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+                               bth0 |= 1 << 23;
+               }
+               bth2 |= 1 << 31;        /* Request ACK. */
+               qp->s_cur++;
+               if (qp->s_cur >= qp->s_size)
+                       qp->s_cur = 0;
+               break;
+
+       case OP(RDMA_READ_RESPONSE_MIDDLE):
+               /*
+                * This case can only happen if a RDMA read is restarted.
+                * See ipath_restart_rc().
+                */
+               ipath_init_restart(qp, wqe);
+               len = ((qp->s_psn - wqe->psn) & IPATH_PSN_MASK) * pmtu;
+               ohdr->u.rc.reth.vaddr =
+                       cpu_to_be64(wqe->wr.wr.rdma.remote_addr + len);
+               ohdr->u.rc.reth.rkey =
+                       cpu_to_be32(wqe->wr.wr.rdma.rkey);
+               ohdr->u.rc.reth.length = cpu_to_be32(qp->s_len);
+               qp->s_state = OP(RDMA_READ_REQUEST);
+               hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
+               bth2 = qp->s_psn & IPATH_PSN_MASK;
+               qp->s_psn = wqe->lpsn + 1;
+               ss = NULL;
+               len = 0;
+               qp->s_cur++;
+               if (qp->s_cur == qp->s_size)
+                       qp->s_cur = 0;
+               break;
+       }
+       if (ipath_cmp24(qp->s_psn, qp->s_last_psn + IPATH_PSN_CREDIT - 1) >= 0)
+               bth2 |= 1 << 31;        /* Request ACK. */
+       qp->s_len -= len;
+       qp->s_hdrwords = hwords;
+       qp->s_cur_sge = ss;
+       qp->s_cur_size = len;
+       ipath_make_ruc_header(dev, qp, ohdr, bth0 | (qp->s_state << 24), bth2);
+done:
+       ret = 1;
+       goto unlock;
+
+bail:
+       qp->s_flags &= ~IPATH_S_BUSY;
+unlock:
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+       return ret;
+}
+
+/**
+ * send_rc_ack - Construct an ACK packet and send it
+ * @qp: a pointer to the QP
+ *
+ * This is called from ipath_rc_rcv() and only uses the receive
+ * side QP state.
+ * Note that RDMA reads and atomics are handled in the
+ * send side QP state and tasklet.
+ */
+static void send_rc_ack(struct ipath_qp *qp)
+{
+       struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+       struct ipath_devdata *dd;
+       u16 lrh0;
+       u32 bth0;
+       u32 hwords;
+       u32 __iomem *piobuf;
+       struct ipath_ib_header hdr;
+       struct ipath_other_headers *ohdr;
+       unsigned long flags;
+
+       spin_lock_irqsave(&qp->s_lock, flags);
+
+       /* Don't send ACK or NAK if a RDMA read or atomic is pending. */
+       if (qp->r_head_ack_queue != qp->s_tail_ack_queue ||
+           (qp->s_flags & IPATH_S_ACK_PENDING) ||
+           qp->s_ack_state != OP(ACKNOWLEDGE))
+               goto queue_ack;
+
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+
+       /* Don't try to send ACKs if the link isn't ACTIVE */
+       dd = dev->dd;
+       if (!(dd->ipath_flags & IPATH_LINKACTIVE))
+               goto done;
+
+       piobuf = ipath_getpiobuf(dd, 0, NULL);
+       if (!piobuf) {
+               /*
+                * We are out of PIO buffers at the moment.
+                * Pass responsibility for sending the ACK to the
+                * send tasklet so that when a PIO buffer becomes
+                * available, the ACK is sent ahead of other outgoing
+                * packets.
+                */
+               spin_lock_irqsave(&qp->s_lock, flags);
+               goto queue_ack;
+       }
+
+       /* Construct the header. */
+       ohdr = &hdr.u.oth;
+       lrh0 = IPATH_LRH_BTH;
+       /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4. */
+       hwords = 6;
+       if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
+               hwords += ipath_make_grh(dev, &hdr.u.l.grh,
+                                        &qp->remote_ah_attr.grh,
+                                        hwords, 0);
+               ohdr = &hdr.u.l.oth;
+               lrh0 = IPATH_LRH_GRH;
+       }
+       /* read pkey_index w/o lock (it's atomic) */
+       bth0 = ipath_get_pkey(dd, qp->s_pkey_index) |
+               (OP(ACKNOWLEDGE) << 24) | (1 << 22);
+       if (qp->r_nak_state)
+               ohdr->u.aeth = cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) |
+                                           (qp->r_nak_state <<
+                                            IPATH_AETH_CREDIT_SHIFT));
+       else
+               ohdr->u.aeth = ipath_compute_aeth(qp);
+       lrh0 |= qp->remote_ah_attr.sl << 4;
+       hdr.lrh[0] = cpu_to_be16(lrh0);
+       hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
+       hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
+       hdr.lrh[3] = cpu_to_be16(dd->ipath_lid |
+                                qp->remote_ah_attr.src_path_bits);
+       ohdr->bth[0] = cpu_to_be32(bth0);
+       ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
+       ohdr->bth[2] = cpu_to_be32(qp->r_ack_psn & IPATH_PSN_MASK);
+
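+       /* First qword of the PIO buffer is the control word carrying the packet length. */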
+       writeq(hwords + 1, piobuf);
+
+       if (dd->ipath_flags & IPATH_PIO_FLUSH_WC) {
+               u32 *hdrp = (u32 *) &hdr;
+
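+               /*
+                * Flush the write-combining buffers so the last header word
+                * only reaches the chip after the rest of the header is
+                * visible.
+                */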
+               ipath_flush_wc();
+               __iowrite32_copy(piobuf + 2, hdrp, hwords - 1);
+               ipath_flush_wc();
+               __raw_writel(hdrp[hwords - 1], piobuf + hwords + 1);
+       } else
+               __iowrite32_copy(piobuf + 2, (u32 *) &hdr, hwords);
+
+       ipath_flush_wc();
+
+       dev->n_unicast_xmit++;
+       goto done;
+
+queue_ack:
+       if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK) {
+               dev->n_rc_qacks++;
+               qp->s_flags |= IPATH_S_ACK_PENDING;
+               qp->s_nak_state = qp->r_nak_state;
+               qp->s_ack_psn = qp->r_ack_psn;
+
+               /* Schedule the send tasklet. */
+               ipath_schedule_send(qp);
+       }
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+done:
+       return;
+}
+
+/**
+ * reset_psn - reset the QP state to send starting from PSN
+ * @qp: the QP
+ * @psn: the packet sequence number to restart at
+ *
+ * This is called from ipath_rc_rcv() to process an incoming RC ACK
+ * for the given QP.
+ * Called at interrupt level with the QP s_lock held.
+ */
+static void reset_psn(struct ipath_qp *qp, u32 psn)
+{
+       u32 n = qp->s_last;
+       struct ipath_swqe *wqe = get_swqe_ptr(qp, n);
+       u32 opcode;
+
+       qp->s_cur = n;
+
+       /*
+        * If we are starting the request from the beginning,
+        * let the normal send code handle initialization.
+        */
+       if (ipath_cmp24(psn, wqe->psn) <= 0) {
+               qp->s_state = OP(SEND_LAST);
+               goto done;
+       }
+
+       /* Find the work request opcode corresponding to the given PSN. */
+       opcode = wqe->wr.opcode;
+       for (;;) {
+               int diff;
+
+               if (++n == qp->s_size)
+                       n = 0;
+               if (n == qp->s_tail)
+                       break;
+               wqe = get_swqe_ptr(qp, n);
+               diff = ipath_cmp24(psn, wqe->psn);
+               if (diff < 0)
+                       break;
+               qp->s_cur = n;
+               /*
+                * If we are starting the request from the beginning,
+                * let the normal send code handle initialization.
+                */
+               if (diff == 0) {
+                       qp->s_state = OP(SEND_LAST);
+                       goto done;
+               }
+               opcode = wqe->wr.opcode;
+       }
+
+       /*
+        * Set the state to restart in the middle of a request.
+        * Don't change the s_sge, s_cur_sge, or s_cur_size.
+        * See ipath_make_rc_req().
+        */
+       switch (opcode) {
+       case IB_WR_SEND:
+       case IB_WR_SEND_WITH_IMM:
+               qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
+               break;
+
+       case IB_WR_RDMA_WRITE:
+       case IB_WR_RDMA_WRITE_WITH_IMM:
+               qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
+               break;
+
+       case IB_WR_RDMA_READ:
+               qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
+               break;
+
+       default:
+               /*
+                * This case shouldn't happen since there is
+                * only one PSN per request.
+                */
+               qp->s_state = OP(SEND_LAST);
+       }
+done:
+       qp->s_psn = psn;
+}
+
+/**
+ * ipath_restart_rc - back up requester to resend the last un-ACKed request
+ * @qp: the QP to restart
+ * @psn: packet sequence number for the request
+ *
+ * The QP s_lock should be held and interrupts disabled.
+ */
+void ipath_restart_rc(struct ipath_qp *qp, u32 psn)
+{
+       struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last);
+       struct ipath_ibdev *dev;
+
+       if (qp->s_retry == 0) {
+               ipath_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
+               ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+               goto bail;
+       }
+       qp->s_retry--;
+
+       /*
+        * Remove the QP from the timeout queue.
+        * Note: it may already have been removed by ipath_ib_timer().
+        */
+       dev = to_idev(qp->ibqp.device);
+       spin_lock(&dev->pending_lock);
+       if (!list_empty(&qp->timerwait))
+               list_del_init(&qp->timerwait);
+       if (!list_empty(&qp->piowait))
+               list_del_init(&qp->piowait);
+       spin_unlock(&dev->pending_lock);
+
+       if (wqe->wr.opcode == IB_WR_RDMA_READ)
+               dev->n_rc_resends++;
+       else
+               dev->n_rc_resends += (qp->s_psn - psn) & IPATH_PSN_MASK;
+
+       reset_psn(qp, psn);
+       ipath_schedule_send(qp);
+
+bail:
+       return;
+}
+
+static inline void update_last_psn(struct ipath_qp *qp, u32 psn)
+{
+       qp->s_last_psn = psn;
+}
+
+/**
+ * do_rc_ack - process an incoming RC ACK
+ * @qp: the QP the ACK came in on
+ * @aeth: the AETH word from the incoming packet
+ * @psn: the packet sequence number of the ACK
+ * @opcode: the opcode of the request that resulted in the ACK
+ * @val: the data returned by an atomic response, if any
+ *
+ * This is called from ipath_rc_rcv_resp() to process an incoming RC ACK
+ * for the given QP.
+ * Called at interrupt level with the QP s_lock held and interrupts disabled.
+ * Returns 1 if OK, 0 if current operation should be aborted (NAK).
+ */
+static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode,
+                    u64 val)
+{
+       struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+       struct ib_wc wc;
+       enum ib_wc_status status;
+       struct ipath_swqe *wqe;
+       int ret = 0;
+       u32 ack_psn;
+       int diff;
+
+       /*
+        * Remove the QP from the timeout queue (or RNR timeout queue).
+        * If ipath_ib_timer() has already removed it,
+        * it's OK since we hold the QP s_lock and ipath_restart_rc()
+        * just won't find anything to restart if we ACK everything.
+        */
+       spin_lock(&dev->pending_lock);
+       if (!list_empty(&qp->timerwait))
+               list_del_init(&qp->timerwait);
+       spin_unlock(&dev->pending_lock);
+
+       /*
+        * Note that NAKs implicitly ACK outstanding SEND and RDMA write
+        * requests and implicitly NAK RDMA read and atomic requests issued
+        * before the NAK'ed request.  The MSN won't include the NAK'ed
+        * request but will include any ACK'ed requests.
+        */
+       ack_psn = psn;
+       if (aeth >> 29)
+               ack_psn--;
+       wqe = get_swqe_ptr(qp, qp->s_last);
+
+       /*
+        * The MSN might be for a later WQE than the PSN indicates so
+        * only complete WQEs that the PSN finishes.
+        */
+       while ((diff = ipath_cmp24(ack_psn, wqe->lpsn)) >= 0) {
+               /*
+                * RDMA_READ_RESPONSE_ONLY is a special case since
+                * we want to generate completion events for everything
+                * before the RDMA read, copy the data, then generate
+                * the completion for the read.
+                */
+               if (wqe->wr.opcode == IB_WR_RDMA_READ &&
+                   opcode == OP(RDMA_READ_RESPONSE_ONLY) &&
+                   diff == 0) {
+                       ret = 1;
+                       goto bail;
+               }
+               /*
+                * If this request is a RDMA read or atomic, and the ACK is
+                * for a later operation, this ACK NAKs the RDMA read or
+                * atomic.  In other words, only a RDMA_READ_LAST or ONLY
+                * can ACK a RDMA read and likewise for atomic ops.  Note
+                * that the NAK case can only happen if relaxed ordering is
+                * used and requests are sent after an RDMA read or atomic
+                * is sent but before the response is received.
+                */
+               if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
+                    (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
+                   ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+                     wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
+                    (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
+                       /*
+                        * The last valid PSN seen is the previous
+                        * request's.
+                        */
+                       update_last_psn(qp, wqe->psn - 1);
+                       /* Retry this request. */
+                       ipath_restart_rc(qp, wqe->psn);
+                       /*
+                        * No need to process the ACK/NAK since we are
+                        * restarting an earlier request.
+                        */
+                       goto bail;
+               }
+               if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+                   wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
+                       *(u64 *) wqe->sg_list[0].vaddr = val;
+               if (qp->s_num_rd_atomic &&
+                   (wqe->wr.opcode == IB_WR_RDMA_READ ||
+                    wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+                    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) {
+                       qp->s_num_rd_atomic--;
+                       /* Restart sending task if fence is complete */
+                       if (((qp->s_flags & IPATH_S_FENCE_PENDING) &&
+                            !qp->s_num_rd_atomic) ||
+                           qp->s_flags & IPATH_S_RDMAR_PENDING)
+                               ipath_schedule_send(qp);
+               }
+               /* Post a send completion queue entry if requested. */
+               if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) ||
+                   (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
+                       memset(&wc, 0, sizeof wc);
+                       wc.wr_id = wqe->wr.wr_id;
+                       wc.status = IB_WC_SUCCESS;
+                       wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
+                       wc.byte_len = wqe->length;
+                       wc.qp = &qp->ibqp;
+                       wc.src_qp = qp->remote_qpn;
+                       wc.slid = qp->remote_ah_attr.dlid;
+                       wc.sl = qp->remote_ah_attr.sl;
+                       ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
+               }
+               qp->s_retry = qp->s_retry_cnt;
+               /*
+                * If we are completing a request which is in the process of
+                * being resent, we can stop resending it since we know the
+                * responder has already seen it.
+                */
+               if (qp->s_last == qp->s_cur) {
+                       if (++qp->s_cur >= qp->s_size)
+                               qp->s_cur = 0;
+                       qp->s_last = qp->s_cur;
+                       if (qp->s_last == qp->s_tail)
+                               break;
+                       wqe = get_swqe_ptr(qp, qp->s_cur);
+                       qp->s_state = OP(SEND_LAST);
+                       qp->s_psn = wqe->psn;
+               } else {
+                       if (++qp->s_last >= qp->s_size)
+                               qp->s_last = 0;
+                       if (qp->state == IB_QPS_SQD && qp->s_last == qp->s_cur)
+                               qp->s_draining = 0;
+                       if (qp->s_last == qp->s_tail)
+                               break;
+                       wqe = get_swqe_ptr(qp, qp->s_last);
+               }
+       }
+
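+       /* The top three bits of the AETH select ACK (0), RNR NAK (1), or NAK (3); 2 is reserved. */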
+       switch (aeth >> 29) {
+       case 0:         /* ACK */
+               dev->n_rc_acks++;
+               /* If this is a partial ACK, reset the retransmit timer. */
+               if (qp->s_last != qp->s_tail) {
+                       spin_lock(&dev->pending_lock);
+                       if (list_empty(&qp->timerwait))
+                               list_add_tail(&qp->timerwait,
+                                       &dev->pending[dev->pending_index]);
+                       spin_unlock(&dev->pending_lock);
+                       /*
+                        * If we get a partial ACK for a resent operation,
+                        * we can stop resending the earlier packets and
+                        * continue with the next packet the receiver wants.
+                        */
+                       if (ipath_cmp24(qp->s_psn, psn) <= 0) {
+                               reset_psn(qp, psn + 1);
+                               ipath_schedule_send(qp);
+                       }
+               } else if (ipath_cmp24(qp->s_psn, psn) <= 0) {
+                       qp->s_state = OP(SEND_LAST);
+                       qp->s_psn = psn + 1;
+               }
+               ipath_get_credit(qp, aeth);
+               qp->s_rnr_retry = qp->s_rnr_retry_cnt;
+               qp->s_retry = qp->s_retry_cnt;
+               update_last_psn(qp, psn);
+               ret = 1;
+               goto bail;
+
+       case 1:         /* RNR NAK */
+               dev->n_rnr_naks++;
+               if (qp->s_last == qp->s_tail)
+                       goto bail;
+               if (qp->s_rnr_retry == 0) {
+                       status = IB_WC_RNR_RETRY_EXC_ERR;
+                       goto class_b;
+               }
+               if (qp->s_rnr_retry_cnt < 7)
+                       qp->s_rnr_retry--;
+
+               /* The last valid PSN is the previous PSN. */
+               update_last_psn(qp, psn - 1);
+
+               if (wqe->wr.opcode == IB_WR_RDMA_READ)
+                       dev->n_rc_resends++;
+               else
+                       dev->n_rc_resends +=
+                               (qp->s_psn - psn) & IPATH_PSN_MASK;
+
+               reset_psn(qp, psn);
+
+               qp->s_rnr_timeout =
+                       ib_ipath_rnr_table[(aeth >> IPATH_AETH_CREDIT_SHIFT) &
+                                          IPATH_AETH_CREDIT_MASK];
+               ipath_insert_rnr_queue(qp);
+               ipath_schedule_send(qp);
+               goto bail;
+
+       case 3:         /* NAK */
+               if (qp->s_last == qp->s_tail)
+                       goto bail;
+               /* The last valid PSN is the previous PSN. */
+               update_last_psn(qp, psn - 1);
+               switch ((aeth >> IPATH_AETH_CREDIT_SHIFT) &
+                       IPATH_AETH_CREDIT_MASK) {
+               case 0: /* PSN sequence error */
+                       dev->n_seq_naks++;
+                       /*
+                        * Back up to the responder's expected PSN.
+                        * Note that we might get a NAK in the middle of an
+                        * RDMA READ response which terminates the RDMA
+                        * READ.
+                        */
+                       ipath_restart_rc(qp, psn);
+                       break;
+
+               case 1: /* Invalid Request */
+                       status = IB_WC_REM_INV_REQ_ERR;
+                       dev->n_other_naks++;
+                       goto class_b;
+
+               case 2: /* Remote Access Error */
+                       status = IB_WC_REM_ACCESS_ERR;
+                       dev->n_other_naks++;
+                       goto class_b;
+
+               case 3: /* Remote Operation Error */
+                       status = IB_WC_REM_OP_ERR;
+                       dev->n_other_naks++;
+               class_b:
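+                       /* Complete the request in error and flush the rest of the send queue. */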
+                       ipath_send_complete(qp, wqe, status);
+                       ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+                       break;
+
+               default:
+                       /* Ignore other reserved NAK error codes */
+                       goto reserved;
+               }
+               qp->s_rnr_retry = qp->s_rnr_retry_cnt;
+               goto bail;
+
+       default:                /* 2: reserved */
+       reserved:
+               /* Ignore reserved NAK codes. */
+               goto bail;
+       }
+
+bail:
+       return ret;
+}
+
+/**
+ * ipath_rc_rcv_resp - process an incoming RC response packet
+ * @dev: the device this packet came in on
+ * @ohdr: the other headers for this packet
+ * @data: the packet data
+ * @tlen: the packet length
+ * @qp: the QP for this packet
+ * @opcode: the opcode for this packet
+ * @psn: the packet sequence number for this packet
+ * @hdrsize: the header length
+ * @pmtu: the path MTU
+ * @header_in_data: true if part of the header data is in the data buffer
+ *
+ * This is called from ipath_rc_rcv() to process an incoming RC response
+ * packet for the given QP.
+ * Called at interrupt level.
+ */
+static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev,
+                                    struct ipath_other_headers *ohdr,
+                                    void *data, u32 tlen,
+                                    struct ipath_qp *qp,
+                                    u32 opcode,
+                                    u32 psn, u32 hdrsize, u32 pmtu,
+                                    int header_in_data)
+{
+       struct ipath_swqe *wqe;
+       enum ib_wc_status status;
+       unsigned long flags;
+       int diff;
+       u32 pad;
+       u32 aeth;
+       u64 val;
+
+       spin_lock_irqsave(&qp->s_lock, flags);
+
+       /* Double check we can process this now that we hold the s_lock. */
+       if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
+               goto ack_done;
+
+       /* Ignore invalid responses. */
+       if (ipath_cmp24(psn, qp->s_next_psn) >= 0)
+               goto ack_done;
+
+       /* Ignore duplicate responses. */
+       diff = ipath_cmp24(psn, qp->s_last_psn);
+       if (unlikely(diff <= 0)) {
+               /* Update credits for "ghost" ACKs */
+               if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {
+                       if (!header_in_data)
+                               aeth = be32_to_cpu(ohdr->u.aeth);
+                       else {
+                               aeth = be32_to_cpu(((__be32 *) data)[0]);
+                               data += sizeof(__be32);
+                       }
+                       if ((aeth >> 29) == 0)
+                               ipath_get_credit(qp, aeth);
+               }
+               goto ack_done;
+       }
+
+       if (unlikely(qp->s_last == qp->s_tail))
+               goto ack_done;
+       wqe = get_swqe_ptr(qp, qp->s_last);
+       status = IB_WC_SUCCESS;
+
+       switch (opcode) {
+       case OP(ACKNOWLEDGE):
+       case OP(ATOMIC_ACKNOWLEDGE):
+       case OP(RDMA_READ_RESPONSE_FIRST):
+               if (!header_in_data)
+                       aeth = be32_to_cpu(ohdr->u.aeth);
+               else {
+                       aeth = be32_to_cpu(((__be32 *) data)[0]);
+                       data += sizeof(__be32);
+               }
+               if (opcode == OP(ATOMIC_ACKNOWLEDGE)) {
+                       if (!header_in_data) {
+                               __be32 *p = ohdr->u.at.atomic_ack_eth;
+
+                               val = ((u64) be32_to_cpu(p[0]) << 32) |
+                                       be32_to_cpu(p[1]);
+                       } else
+                               val = be64_to_cpu(((__be64 *) data)[0]);
+               } else
+                       val = 0;
+               if (!do_rc_ack(qp, aeth, psn, opcode, val) ||
+                   opcode != OP(RDMA_READ_RESPONSE_FIRST))
+                       goto ack_done;
+               hdrsize += 4;
+               wqe = get_swqe_ptr(qp, qp->s_last);
+               if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
+                       goto ack_op_err;
+               qp->r_flags &= ~IPATH_R_RDMAR_SEQ;
+               /*
+                * If this is a response to a resent RDMA read, we
+                * have to be careful to copy the data to the right
+                * location.
+                */
+               qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
+                                                 wqe, psn, pmtu);
+               goto read_middle;
+
+       case OP(RDMA_READ_RESPONSE_MIDDLE):
+               /* no AETH, no ACK */
+               if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) {
+                       dev->n_rdma_seq++;
+                       if (qp->r_flags & IPATH_R_RDMAR_SEQ)
+                               goto ack_done;
+                       qp->r_flags |= IPATH_R_RDMAR_SEQ;
+                       ipath_restart_rc(qp, qp->s_last_psn + 1);
+                       goto ack_done;
+               }
+               if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
+                       goto ack_op_err;
+       read_middle:
+               if (unlikely(tlen != (hdrsize + pmtu + 4)))
+                       goto ack_len_err;
+               if (unlikely(pmtu >= qp->s_rdma_read_len))
+                       goto ack_len_err;
+
+               /* We got a response so update the timeout. */
+               spin_lock(&dev->pending_lock);
+               if (qp->s_rnr_timeout == 0 && !list_empty(&qp->timerwait))
+                       list_move_tail(&qp->timerwait,
+                                      &dev->pending[dev->pending_index]);
+               spin_unlock(&dev->pending_lock);
+
+               if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE))
+                       qp->s_retry = qp->s_retry_cnt;
+
+               /*
+                * Update the RDMA receive state but do the copy w/o
+                * holding the locks and blocking interrupts.
+                */
+               qp->s_rdma_read_len -= pmtu;
+               update_last_psn(qp, psn);
+               spin_unlock_irqrestore(&qp->s_lock, flags);
+               ipath_copy_sge(&qp->s_rdma_read_sge, data, pmtu);
+               goto bail;
+
+       case OP(RDMA_READ_RESPONSE_ONLY):
+               if (!header_in_data)
+                       aeth = be32_to_cpu(ohdr->u.aeth);
+               else
+                       aeth = be32_to_cpu(((__be32 *) data)[0]);
+               if (!do_rc_ack(qp, aeth, psn, opcode, 0))
+                       goto ack_done;
+               /* Get the number of bytes the message was padded by. */
+               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+               /*
+                * Check that the data size is >= 0 && <= pmtu.
+                * Remember to account for the AETH header (4) and
+                * ICRC (4).
+                */
+               if (unlikely(tlen < (hdrsize + pad + 8)))
+                       goto ack_len_err;
+               /*
+                * If this is a response to a resent RDMA read, we
+                * have to be careful to copy the data to the right
+                * location.
+                */
+               wqe = get_swqe_ptr(qp, qp->s_last);
+               qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
+                                                 wqe, psn, pmtu);
+               goto read_last;
+
+       case OP(RDMA_READ_RESPONSE_LAST):
+               /* ACKs READ req. */
+               if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) {
+                       dev->n_rdma_seq++;
+                       if (qp->r_flags & IPATH_R_RDMAR_SEQ)
+                               goto ack_done;
+                       qp->r_flags |= IPATH_R_RDMAR_SEQ;
+                       ipath_restart_rc(qp, qp->s_last_psn + 1);
+                       goto ack_done;
+               }
+               if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
+                       goto ack_op_err;
+               /* Get the number of bytes the message was padded by. */
+               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+               /*
+                * Check that the data size is >= 1 && <= pmtu.
+                * Remember to account for the AETH header (4) and
+                * ICRC (4).
+                */
+               if (unlikely(tlen <= (hdrsize + pad + 8)))
+                       goto ack_len_err;
+       read_last:
+               tlen -= hdrsize + pad + 8;
+               if (unlikely(tlen != qp->s_rdma_read_len))
+                       goto ack_len_err;
+               if (!header_in_data)
+                       aeth = be32_to_cpu(ohdr->u.aeth);
+               else {
+                       aeth = be32_to_cpu(((__be32 *) data)[0]);
+                       data += sizeof(__be32);
+               }
+               ipath_copy_sge(&qp->s_rdma_read_sge, data, tlen);
+               (void) do_rc_ack(qp, aeth, psn,
+                                OP(RDMA_READ_RESPONSE_LAST), 0);
+               goto ack_done;
+       }
+
+ack_op_err:
+       status = IB_WC_LOC_QP_OP_ERR;
+       goto ack_err;
+
+ack_len_err:
+       status = IB_WC_LOC_LEN_ERR;
+ack_err:
+       ipath_send_complete(qp, wqe, status);
+       ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+ack_done:
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+bail:
+       return;
+}
+
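The ACK-handling paths above repeatedly pull the AETH out of the header (or out of the data buffer when header_in_data is set) and test (aeth >> 29) == 0 before crediting the QP. As a reminder of why that test works: an IBA AETH is a 32-bit field with an 8-bit syndrome on top of a 24-bit MSN, and the upper bits of the syndrome distinguish ACKs from the various NAKs. A small stand-alone sketch of that split (the sample values and the helper are illustrative only, not driver code; the table-driven credit-count encoding inside an ACK syndrome is omitted):

#include <stdint.h>
#include <stdio.h>

/* Illustrative AETH decode: 8-bit syndrome on top, 24-bit MSN below. */
static void decode_aeth(uint32_t aeth)
{
        uint32_t syndrome = aeth >> 24;
        uint32_t msn      = aeth & 0xFFFFFF;

        if ((aeth >> 29) == 0)
                printf("ACK, syndrome 0x%02x, msn %u\n", syndrome, msn);
        else
                printf("NAK/RNR NAK, syndrome 0x%02x, msn %u\n", syndrome, msn);
}

int main(void)
{
        decode_aeth(0x1F000010);        /* made-up ACK syndrome, MSN 16 */
        decode_aeth(0x60000010);        /* made-up non-ACK syndrome, MSN 16 */
        return 0;
}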
+/**
+ * ipath_rc_rcv_error - process an incoming duplicate or error RC packet
+ * @dev: the device this packet came in on
+ * @ohdr: the other headers for this packet
+ * @data: the packet data
+ * @qp: the QP for this packet
+ * @opcode: the opcode for this packet
+ * @psn: the packet sequence number for this packet
+ * @diff: the difference between the PSN and the expected PSN
+ * @header_in_data: true if part of the header data is in the data buffer
+ *
+ * This is called from ipath_rc_rcv() to process an unexpected
+ * incoming RC packet for the given QP.
+ * Called at interrupt level.
+ * Return 1 if no more processing is needed; otherwise return 0 to
+ * schedule a response to be sent.
+ */
+static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev,
+                                    struct ipath_other_headers *ohdr,
+                                    void *data,
+                                    struct ipath_qp *qp,
+                                    u32 opcode,
+                                    u32 psn,
+                                    int diff,
+                                    int header_in_data)
+{
+       struct ipath_ack_entry *e;
+       u8 i, prev;
+       int old_req;
+       unsigned long flags;
+
+       if (diff > 0) {
+               /*
+                * Packet sequence error.
+                * A NAK will ACK earlier sends and RDMA writes.
+                * Don't queue the NAK if we already sent one.
+                */
+               if (!qp->r_nak_state) {
+                       qp->r_nak_state = IB_NAK_PSN_ERROR;
+                       /* Use the expected PSN. */
+                       qp->r_ack_psn = qp->r_psn;
+                       goto send_ack;
+               }
+               goto done;
+       }
+
+       /*
+        * Handle a duplicate request.  Don't re-execute SEND, RDMA
+        * write or atomic op.  Don't NAK errors, just silently drop
+        * the duplicate request.  Note that r_sge, r_len, and
+        * r_rcv_len may be in use so don't modify them.
+        *
+        * We are supposed to ACK the earliest duplicate PSN but we
+        * can coalesce an outstanding duplicate ACK.  We have to
+        * send the earliest so that RDMA reads can be restarted at
+        * the requester's expected PSN.
+        *
+        * First, find where this duplicate PSN falls within the
+        * ACKs previously sent.
+        */
+       psn &= IPATH_PSN_MASK;
+       e = NULL;
+       old_req = 1;
+
+       spin_lock_irqsave(&qp->s_lock, flags);
+       /* Double check we can process this now that we hold the s_lock. */
+       if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
+               goto unlock_done;
+
+       for (i = qp->r_head_ack_queue; ; i = prev) {
+               if (i == qp->s_tail_ack_queue)
+                       old_req = 0;
+               if (i)
+                       prev = i - 1;
+               else
+                       prev = IPATH_MAX_RDMA_ATOMIC;
+               if (prev == qp->r_head_ack_queue) {
+                       e = NULL;
+                       break;
+               }
+               e = &qp->s_ack_queue[prev];
+               if (!e->opcode) {
+                       e = NULL;
+                       break;
+               }
+               if (ipath_cmp24(psn, e->psn) >= 0) {
+                       if (prev == qp->s_tail_ack_queue)
+                               old_req = 0;
+                       break;
+               }
+       }
+       switch (opcode) {
+       case OP(RDMA_READ_REQUEST): {
+               struct ib_reth *reth;
+               u32 offset;
+               u32 len;
+
+               /*
+                * If we didn't find the RDMA read request in the ack queue,
+                * or the send tasklet is already backed up to send an
+                * earlier entry, we can ignore this request.
+                */
+               if (!e || e->opcode != OP(RDMA_READ_REQUEST) || old_req)
+                       goto unlock_done;
+               /* RETH comes after BTH */
+               if (!header_in_data)
+                       reth = &ohdr->u.rc.reth;
+               else {
+                       reth = (struct ib_reth *)data;
+                       data += sizeof(*reth);
+               }
+               /*
+                * Address range must be a subset of the original
+                * request and start on pmtu boundaries.
+                * We reuse the old ack_queue slot since the requester
+                * should not back up and request an earlier PSN for the
+                * same request.
+                */
+               offset = ((psn - e->psn) & IPATH_PSN_MASK) *
+                       ib_mtu_enum_to_int(qp->path_mtu);
+               len = be32_to_cpu(reth->length);
+               if (unlikely(offset + len > e->rdma_sge.sge.sge_length))
+                       goto unlock_done;
+               if (len != 0) {
+                       u32 rkey = be32_to_cpu(reth->rkey);
+                       u64 vaddr = be64_to_cpu(reth->vaddr);
+                       int ok;
+
+                       ok = ipath_rkey_ok(qp, &e->rdma_sge,
+                                          len, vaddr, rkey,
+                                          IB_ACCESS_REMOTE_READ);
+                       if (unlikely(!ok))
+                               goto unlock_done;
+               } else {
+                       e->rdma_sge.sg_list = NULL;
+                       e->rdma_sge.num_sge = 0;
+                       e->rdma_sge.sge.mr = NULL;
+                       e->rdma_sge.sge.vaddr = NULL;
+                       e->rdma_sge.sge.length = 0;
+                       e->rdma_sge.sge.sge_length = 0;
+               }
+               e->psn = psn;
+               qp->s_ack_state = OP(ACKNOWLEDGE);
+               qp->s_tail_ack_queue = prev;
+               break;
+       }
+
+       case OP(COMPARE_SWAP):
+       case OP(FETCH_ADD): {
+               /*
+                * If we didn't find the atomic request in the ack queue
+                * or the send tasklet is already backed up to send an
+                * earlier entry, we can ignore this request.
+                */
+               if (!e || e->opcode != (u8) opcode || old_req)
+                       goto unlock_done;
+               qp->s_ack_state = OP(ACKNOWLEDGE);
+               qp->s_tail_ack_queue = prev;
+               break;
+       }
+
+       default:
+               if (old_req)
+                       goto unlock_done;
+               /*
+                * Resend the most recent ACK if this request is
+                * after all the previous RDMA reads and atomics.
+                */
+               if (i == qp->r_head_ack_queue) {
+                       spin_unlock_irqrestore(&qp->s_lock, flags);
+                       qp->r_nak_state = 0;
+                       qp->r_ack_psn = qp->r_psn - 1;
+                       goto send_ack;
+               }
+               /*
+                * Try to send a simple ACK to work around a Mellanox bug
+                * which doesn't accept a RDMA read response or atomic
+                * response as an ACK for earlier SENDs or RDMA writes.
+                */
+               if (qp->r_head_ack_queue == qp->s_tail_ack_queue &&
+                   !(qp->s_flags & IPATH_S_ACK_PENDING) &&
+                   qp->s_ack_state == OP(ACKNOWLEDGE)) {
+                       spin_unlock_irqrestore(&qp->s_lock, flags);
+                       qp->r_nak_state = 0;
+                       qp->r_ack_psn = qp->s_ack_queue[i].psn - 1;
+                       goto send_ack;
+               }
+               /*
+                * Resend the RDMA read or atomic op which
+                * ACKs this duplicate request.
+                */
+               qp->s_ack_state = OP(ACKNOWLEDGE);
+               qp->s_tail_ack_queue = i;
+               break;
+       }
+       qp->r_nak_state = 0;
+       ipath_schedule_send(qp);
+
+unlock_done:
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+done:
+       return 1;
+
+send_ack:
+       return 0;
+}
+
+void ipath_rc_error(struct ipath_qp *qp, enum ib_wc_status err)
+{
+       unsigned long flags;
+       int lastwqe;
+
+       spin_lock_irqsave(&qp->s_lock, flags);
+       lastwqe = ipath_error_qp(qp, err);
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+
+       if (lastwqe) {
+               struct ib_event ev;
+
+               ev.device = qp->ibqp.device;
+               ev.element.qp = &qp->ibqp;
+               ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+               qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+       }
+}
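ipath_rc_error() surfaces the failure to the consumer as an asynchronous IB_EVENT_QP_LAST_WQE_REACHED event, delivered through the handler the consumer supplied in ib_qp_init_attr when it created the QP. A rough sketch of the other side of that call, with a hypothetical structure, field and function names that are not part of this driver:

#include <linux/completion.h>
#include <linux/printk.h>
#include <rdma/ib_verbs.h>

/* Hypothetical per-connection state owned by a ULP. */
struct my_connection {
        struct completion last_wqe_done;
};

/* Hypothetical handler passed in ib_qp_init_attr.event_handler. */
static void my_qp_event_handler(struct ib_event *ev, void *context)
{
        struct my_connection *conn = context;

        switch (ev->event) {
        case IB_EVENT_QP_LAST_WQE_REACHED:
                /*
                 * The QP has moved to the error state and its last WQE has
                 * been reached; receive resources tied to this connection
                 * can now be cleaned up safely.
                 */
                complete(&conn->last_wqe_done);
                break;
        default:
                pr_info("QP event %d ignored\n", ev->event);
                break;
        }
}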
+
+static inline void ipath_update_ack_queue(struct ipath_qp *qp, unsigned n)
+{
+       unsigned next;
+
+       next = n + 1;
+       if (next > IPATH_MAX_RDMA_ATOMIC)
+               next = 0;
+       if (n == qp->s_tail_ack_queue) {
+               qp->s_tail_ack_queue = next;
+               qp->s_ack_state = OP(ACKNOWLEDGE);
+       }
+}
+
+/**
+ * ipath_rc_rcv - process an incoming RC packet
+ * @dev: the device this packet came in on
+ * @hdr: the header of this packet
+ * @has_grh: true if the header has a GRH
+ * @data: the packet data
+ * @tlen: the packet length
+ * @qp: the QP for this packet
+ *
+ * This is called from ipath_qp_rcv() to process an incoming RC packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
+                 int has_grh, void *data, u32 tlen, struct ipath_qp *qp)
+{
+       struct ipath_other_headers *ohdr;
+       u32 opcode;
+       u32 hdrsize;
+       u32 psn;
+       u32 pad;
+       struct ib_wc wc;
+       u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
+       int diff;
+       struct ib_reth *reth;
+       int header_in_data;
+       unsigned long flags;
+
+       /* Validate the SLID. See Ch. 9.6.1.5 */
+       if (unlikely(be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid))
+               goto done;
+
+       /* Check for GRH */
+       if (!has_grh) {
+               ohdr = &hdr->u.oth;
+               hdrsize = 8 + 12;       /* LRH + BTH */
+               psn = be32_to_cpu(ohdr->bth[2]);
+               header_in_data = 0;
+       } else {
+               ohdr = &hdr->u.l.oth;
+               hdrsize = 8 + 40 + 12;  /* LRH + GRH + BTH */
+               /*
+                * The header with GRH is 60 bytes and the core driver sets
+                * the eager header buffer size to 56 bytes so the last 4
+                * bytes of the BTH header (PSN) are in the data buffer.
+                */
+               header_in_data = dev->dd->ipath_rcvhdrentsize == 16;
+               if (header_in_data) {
+                       psn = be32_to_cpu(((__be32 *) data)[0]);
+                       data += sizeof(__be32);
+               } else
+                       psn = be32_to_cpu(ohdr->bth[2]);
+       }
+
+       /*
+        * Process responses (ACKs) before anything else.  Note that the
+        * packet sequence number will be for something in the send work
+        * queue rather than the expected receive packet sequence number.
+        * In other words, this QP is the requester.
+        */
+       opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
+       if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
+           opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
+               ipath_rc_rcv_resp(dev, ohdr, data, tlen, qp, opcode, psn,
+                                 hdrsize, pmtu, header_in_data);
+               goto done;
+       }
+
+       /* Compute 24 bits worth of difference. */
+       diff = ipath_cmp24(psn, qp->r_psn);
+       if (unlikely(diff)) {
+               if (ipath_rc_rcv_error(dev, ohdr, data, qp, opcode,
+                                      psn, diff, header_in_data))
+                       goto done;
+               goto send_ack;
+       }
+
+       /* Check for opcode sequence errors. */
+       switch (qp->r_state) {
+       case OP(SEND_FIRST):
+       case OP(SEND_MIDDLE):
+               if (opcode == OP(SEND_MIDDLE) ||
+                   opcode == OP(SEND_LAST) ||
+                   opcode == OP(SEND_LAST_WITH_IMMEDIATE))
+                       break;
+               goto nack_inv;
+
+       case OP(RDMA_WRITE_FIRST):
+       case OP(RDMA_WRITE_MIDDLE):
+               if (opcode == OP(RDMA_WRITE_MIDDLE) ||
+                   opcode == OP(RDMA_WRITE_LAST) ||
+                   opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
+                       break;
+               goto nack_inv;
+
+       default:
+               if (opcode == OP(SEND_MIDDLE) ||
+                   opcode == OP(SEND_LAST) ||
+                   opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
+                   opcode == OP(RDMA_WRITE_MIDDLE) ||
+                   opcode == OP(RDMA_WRITE_LAST) ||
+                   opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
+                       goto nack_inv;
+               /*
+                * Note that it is up to the requester to not send a new
+                * RDMA read or atomic operation before receiving an ACK
+                * for the previous operation.
+                */
+               break;
+       }
+
+       memset(&wc, 0, sizeof wc);
+
+       /* OK, process the packet. */
+       switch (opcode) {
+       case OP(SEND_FIRST):
+               if (!ipath_get_rwqe(qp, 0))
+                       goto rnr_nak;
+               qp->r_rcv_len = 0;
+               /* FALLTHROUGH */
+       case OP(SEND_MIDDLE):
+       case OP(RDMA_WRITE_MIDDLE):
+       send_middle:
+               /* Check for invalid length PMTU or posted rwqe len. */
+               if (unlikely(tlen != (hdrsize + pmtu + 4)))
+                       goto nack_inv;
+               qp->r_rcv_len += pmtu;
+               if (unlikely(qp->r_rcv_len > qp->r_len))
+                       goto nack_inv;
+               ipath_copy_sge(&qp->r_sge, data, pmtu);
+               break;
+
+       case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
+               /* consume RWQE */
+               if (!ipath_get_rwqe(qp, 1))
+                       goto rnr_nak;
+               goto send_last_imm;
+
+       case OP(SEND_ONLY):
+       case OP(SEND_ONLY_WITH_IMMEDIATE):
+               if (!ipath_get_rwqe(qp, 0))
+                       goto rnr_nak;
+               qp->r_rcv_len = 0;
+               if (opcode == OP(SEND_ONLY))
+                       goto send_last;
+               /* FALLTHROUGH */
+       case OP(SEND_LAST_WITH_IMMEDIATE):
+       send_last_imm:
+               if (header_in_data) {
+                       wc.ex.imm_data = *(__be32 *) data;
+                       data += sizeof(__be32);
+               } else {
+                       /* Immediate data comes after BTH */
+                       wc.ex.imm_data = ohdr->u.imm_data;
+               }
+               hdrsize += 4;
+               wc.wc_flags = IB_WC_WITH_IMM;
+               /* FALLTHROUGH */
+       case OP(SEND_LAST):
+       case OP(RDMA_WRITE_LAST):
+       send_last:
+               /* Get the number of bytes the message was padded by. */
+               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+               /* Check for invalid length. */
+               /* XXX LAST len should be >= 1 */
+               if (unlikely(tlen < (hdrsize + pad + 4)))
+                       goto nack_inv;
+               /* Don't count the CRC. */
+               tlen -= (hdrsize + pad + 4);
+               wc.byte_len = tlen + qp->r_rcv_len;
+               if (unlikely(wc.byte_len > qp->r_len))
+                       goto nack_inv;
+               ipath_copy_sge(&qp->r_sge, data, tlen);
+               qp->r_msn++;
+               if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags))
+                       break;
+               wc.wr_id = qp->r_wr_id;
+               wc.status = IB_WC_SUCCESS;
+               if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) ||
+                   opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
+                       wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
+               else
+                       wc.opcode = IB_WC_RECV;
+               wc.qp = &qp->ibqp;
+               wc.src_qp = qp->remote_qpn;
+               wc.slid = qp->remote_ah_attr.dlid;
+               wc.sl = qp->remote_ah_attr.sl;
+               /* Signal completion event if the solicited bit is set. */
+               ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+                              (ohdr->bth[0] &
+                               cpu_to_be32(1 << 23)) != 0);
+               break;
+
+       case OP(RDMA_WRITE_FIRST):
+       case OP(RDMA_WRITE_ONLY):
+       case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
+               if (unlikely(!(qp->qp_access_flags &
+                              IB_ACCESS_REMOTE_WRITE)))
+                       goto nack_inv;
+               /* consume RWQE */
+               /* RETH comes after BTH */
+               if (!header_in_data)
+                       reth = &ohdr->u.rc.reth;
+               else {
+                       reth = (struct ib_reth *)data;
+                       data += sizeof(*reth);
+               }
+               hdrsize += sizeof(*reth);
+               qp->r_len = be32_to_cpu(reth->length);
+               qp->r_rcv_len = 0;
+               if (qp->r_len != 0) {
+                       u32 rkey = be32_to_cpu(reth->rkey);
+                       u64 vaddr = be64_to_cpu(reth->vaddr);
+                       int ok;
+
+                       /* Check rkey & NAK */
+                       ok = ipath_rkey_ok(qp, &qp->r_sge,
+                                          qp->r_len, vaddr, rkey,
+                                          IB_ACCESS_REMOTE_WRITE);
+                       if (unlikely(!ok))
+                               goto nack_acc;
+               } else {
+                       qp->r_sge.sg_list = NULL;
+                       qp->r_sge.sge.mr = NULL;
+                       qp->r_sge.sge.vaddr = NULL;
+                       qp->r_sge.sge.length = 0;
+                       qp->r_sge.sge.sge_length = 0;
+               }
+               if (opcode == OP(RDMA_WRITE_FIRST))
+                       goto send_middle;
+               else if (opcode == OP(RDMA_WRITE_ONLY))
+                       goto send_last;
+               if (!ipath_get_rwqe(qp, 1))
+                       goto rnr_nak;
+               goto send_last_imm;
+
+       case OP(RDMA_READ_REQUEST): {
+               struct ipath_ack_entry *e;
+               u32 len;
+               u8 next;
+
+               if (unlikely(!(qp->qp_access_flags &
+                              IB_ACCESS_REMOTE_READ)))
+                       goto nack_inv;
+               next = qp->r_head_ack_queue + 1;
+               if (next > IPATH_MAX_RDMA_ATOMIC)
+                       next = 0;
+               spin_lock_irqsave(&qp->s_lock, flags);
+               /* Double check we can process this while holding the s_lock. */
+               if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
+                       goto unlock;
+               if (unlikely(next == qp->s_tail_ack_queue)) {
+                       if (!qp->s_ack_queue[next].sent)
+                               goto nack_inv_unlck;
+                       ipath_update_ack_queue(qp, next);
+               }
+               e = &qp->s_ack_queue[qp->r_head_ack_queue];
+               /* RETH comes after BTH */
+               if (!header_in_data)
+                       reth = &ohdr->u.rc.reth;
+               else {
+                       reth = (struct ib_reth *)data;
+                       data += sizeof(*reth);
+               }
+               len = be32_to_cpu(reth->length);
+               if (len) {
+                       u32 rkey = be32_to_cpu(reth->rkey);
+                       u64 vaddr = be64_to_cpu(reth->vaddr);
+                       int ok;
+
+                       /* Check rkey & NAK */
+                       ok = ipath_rkey_ok(qp, &e->rdma_sge, len, vaddr,
+                                          rkey, IB_ACCESS_REMOTE_READ);
+                       if (unlikely(!ok))
+                               goto nack_acc_unlck;
+                       /*
+                        * Update the next expected PSN.  We add 1 later
+                        * below, so only add the remainder here.
+                        */
+                       if (len > pmtu)
+                               qp->r_psn += (len - 1) / pmtu;
+               } else {
+                       e->rdma_sge.sg_list = NULL;
+                       e->rdma_sge.num_sge = 0;
+                       e->rdma_sge.sge.mr = NULL;
+                       e->rdma_sge.sge.vaddr = NULL;
+                       e->rdma_sge.sge.length = 0;
+                       e->rdma_sge.sge.sge_length = 0;
+               }
+               e->opcode = opcode;
+               e->sent = 0;
+               e->psn = psn;
+               /*
+                * We need to increment the MSN here instead of when we
+                * finish sending the result since a duplicate request would
+                * increment it more than once.
+                */
+               qp->r_msn++;
+               qp->r_psn++;
+               qp->r_state = opcode;
+               qp->r_nak_state = 0;
+               qp->r_head_ack_queue = next;
+
+               /* Schedule the send tasklet. */
+               ipath_schedule_send(qp);
+
+               goto unlock;
+       }
+
+       case OP(COMPARE_SWAP):
+       case OP(FETCH_ADD): {
+               struct ib_atomic_eth *ateth;
+               struct ipath_ack_entry *e;
+               u64 vaddr;
+               atomic64_t *maddr;
+               u64 sdata;
+               u32 rkey;
+               u8 next;
+
+               if (unlikely(!(qp->qp_access_flags &
+                              IB_ACCESS_REMOTE_ATOMIC)))
+                       goto nack_inv;
+               next = qp->r_head_ack_queue + 1;
+               if (next > IPATH_MAX_RDMA_ATOMIC)
+                       next = 0;
+               spin_lock_irqsave(&qp->s_lock, flags);
+               /* Double check we can process this while holding the s_lock. */
+               if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
+                       goto unlock;
+               if (unlikely(next == qp->s_tail_ack_queue)) {
+                       if (!qp->s_ack_queue[next].sent)
+                               goto nack_inv_unlck;
+                       ipath_update_ack_queue(qp, next);
+               }
+               if (!header_in_data)
+                       ateth = &ohdr->u.atomic_eth;
+               else
+                       ateth = (struct ib_atomic_eth *)data;
+               vaddr = ((u64) be32_to_cpu(ateth->vaddr[0]) << 32) |
+                       be32_to_cpu(ateth->vaddr[1]);
+               if (unlikely(vaddr & (sizeof(u64) - 1)))
+                       goto nack_inv_unlck;
+               rkey = be32_to_cpu(ateth->rkey);
+               /* Check rkey & NAK */
+               if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge,
+                                           sizeof(u64), vaddr, rkey,
+                                           IB_ACCESS_REMOTE_ATOMIC)))
+                       goto nack_acc_unlck;
+               /* Perform atomic OP and save result. */
+               maddr = (atomic64_t *) qp->r_sge.sge.vaddr;
+               sdata = be64_to_cpu(ateth->swap_data);
+               e = &qp->s_ack_queue[qp->r_head_ack_queue];
+               e->atomic_data = (opcode == OP(FETCH_ADD)) ?
+                       (u64) atomic64_add_return(sdata, maddr) - sdata :
+                       (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,
+                                     be64_to_cpu(ateth->compare_data),
+                                     sdata);
+               e->opcode = opcode;
+               e->sent = 0;
+               e->psn = psn & IPATH_PSN_MASK;
+               qp->r_msn++;
+               qp->r_psn++;
+               qp->r_state = opcode;
+               qp->r_nak_state = 0;
+               qp->r_head_ack_queue = next;
+
+               /* Schedule the send tasklet. */
+               ipath_schedule_send(qp);
+
+               goto unlock;
+       }
+
+       default:
+               /* NAK unknown opcodes. */
+               goto nack_inv;
+       }
+       qp->r_psn++;
+       qp->r_state = opcode;
+       qp->r_ack_psn = psn;
+       qp->r_nak_state = 0;
+       /* Send an ACK if requested or required. */
+       if (psn & (1 << 31))
+               goto send_ack;
+       goto done;
+
+rnr_nak:
+       qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer;
+       qp->r_ack_psn = qp->r_psn;
+       goto send_ack;
+
+nack_inv_unlck:
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+nack_inv:
+       ipath_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+       qp->r_nak_state = IB_NAK_INVALID_REQUEST;
+       qp->r_ack_psn = qp->r_psn;
+       goto send_ack;
+
+nack_acc_unlck:
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+nack_acc:
+       ipath_rc_error(qp, IB_WC_LOC_PROT_ERR);
+       qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
+       qp->r_ack_psn = qp->r_psn;
+send_ack:
+       send_rc_ack(qp);
+       goto done;
+
+unlock:
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+done:
+       return;
+}
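The checks in the functions above ("Ignore invalid responses", "Compute 24 bits worth of difference") all hinge on comparing packet sequence numbers modulo 2^24 so that wrap-around is handled correctly. A stand-alone sketch of that style of comparison follows; this is not the driver's ipath_cmp24(), just the usual sign-extension trick, and it assumes a two's-complement int:

#include <stdint.h>
#include <stdio.h>

#define PSN_MASK 0xFFFFFF              /* PSNs are 24-bit quantities */

/*
 * Return <0, 0 or >0 depending on whether a is before, equal to, or
 * after b in 24-bit sequence space.  Shifting the masked difference
 * into the top byte of a signed 32-bit value makes a small backward
 * distance compare as negative even across the 2^24 wrap point.
 */
static int32_t cmp24(uint32_t a, uint32_t b)
{
        return (int32_t)(((a - b) & PSN_MASK) << 8);
}

int main(void)
{
        printf("%d\n", cmp24(0x000002, 0xFFFFFE) > 0);  /* 1: 0x2 follows 0xFFFFFE */
        printf("%d\n", cmp24(0x000010, 0x000020) < 0);  /* 1: 0x10 precedes 0x20 */
        return 0;
}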
diff --git a/drivers/staging/rdma/ipath/ipath_registers.h b/drivers/staging/rdma/ipath/ipath_registers.h
new file mode 100644 (file)
index 0000000..8f44d0c
--- /dev/null
@@ -0,0 +1,512 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _IPATH_REGISTERS_H
+#define _IPATH_REGISTERS_H
+
+/*
+ * This file should only be included by kernel source, and by the diags.  It
+ * defines the registers, and their contents, for InfiniPath chips.
+ */
+
+/*
+ * These are the InfiniPath register and buffer bit definitions
+ * that are visible to software, and needed only by the kernel
+ * and diag code.  A few that are visible to protocol and user
+ * code are in ipath_common.h.  Some bits are specific
+ * to a given chip implementation, and have been moved to the
+ * chip-specific source file.
+ */
+
+/* kr_revision bits */
+#define INFINIPATH_R_CHIPREVMINOR_MASK 0xFF
+#define INFINIPATH_R_CHIPREVMINOR_SHIFT 0
+#define INFINIPATH_R_CHIPREVMAJOR_MASK 0xFF
+#define INFINIPATH_R_CHIPREVMAJOR_SHIFT 8
+#define INFINIPATH_R_ARCH_MASK 0xFF
+#define INFINIPATH_R_ARCH_SHIFT 16
+#define INFINIPATH_R_SOFTWARE_MASK 0xFF
+#define INFINIPATH_R_SOFTWARE_SHIFT 24
+#define INFINIPATH_R_BOARDID_MASK 0xFF
+#define INFINIPATH_R_BOARDID_SHIFT 32
+
+/* kr_control bits */
+#define INFINIPATH_C_FREEZEMODE 0x00000002
+#define INFINIPATH_C_LINKENABLE 0x00000004
+
+/* kr_sendctrl bits */
+#define INFINIPATH_S_DISARMPIOBUF_SHIFT 16
+#define INFINIPATH_S_UPDTHRESH_SHIFT 24
+#define INFINIPATH_S_UPDTHRESH_MASK 0x1f
+
+#define IPATH_S_ABORT          0
+#define IPATH_S_PIOINTBUFAVAIL 1
+#define IPATH_S_PIOBUFAVAILUPD 2
+#define IPATH_S_PIOENABLE      3
+#define IPATH_S_SDMAINTENABLE  9
+#define IPATH_S_SDMASINGLEDESCRIPTOR   10
+#define IPATH_S_SDMAENABLE     11
+#define IPATH_S_SDMAHALT       12
+#define IPATH_S_DISARM         31
+
+#define INFINIPATH_S_ABORT             (1U << IPATH_S_ABORT)
+#define INFINIPATH_S_PIOINTBUFAVAIL    (1U << IPATH_S_PIOINTBUFAVAIL)
+#define INFINIPATH_S_PIOBUFAVAILUPD    (1U << IPATH_S_PIOBUFAVAILUPD)
+#define INFINIPATH_S_PIOENABLE         (1U << IPATH_S_PIOENABLE)
+#define INFINIPATH_S_SDMAINTENABLE     (1U << IPATH_S_SDMAINTENABLE)
+#define INFINIPATH_S_SDMASINGLEDESCRIPTOR \
+                                       (1U << IPATH_S_SDMASINGLEDESCRIPTOR)
+#define INFINIPATH_S_SDMAENABLE                (1U << IPATH_S_SDMAENABLE)
+#define INFINIPATH_S_SDMAHALT          (1U << IPATH_S_SDMAHALT)
+#define INFINIPATH_S_DISARM            (1U << IPATH_S_DISARM)
+
+/* kr_rcvctrl bits that are the same on multiple chips */
+#define INFINIPATH_R_PORTENABLE_SHIFT 0
+#define INFINIPATH_R_QPMAP_ENABLE (1ULL << 38)
+
+/* kr_intstatus, kr_intclear, kr_intmask bits */
+#define INFINIPATH_I_SDMAINT           0x8000000000000000ULL
+#define INFINIPATH_I_SDMADISABLED      0x4000000000000000ULL
+#define INFINIPATH_I_ERROR             0x0000000080000000ULL
+#define INFINIPATH_I_SPIOSENT          0x0000000040000000ULL
+#define INFINIPATH_I_SPIOBUFAVAIL      0x0000000020000000ULL
+#define INFINIPATH_I_GPIO              0x0000000010000000ULL
+#define INFINIPATH_I_JINT              0x0000000004000000ULL
+
+/* kr_errorstatus, kr_errorclear, kr_errormask bits */
+#define INFINIPATH_E_RFORMATERR                        0x0000000000000001ULL
+#define INFINIPATH_E_RVCRC                     0x0000000000000002ULL
+#define INFINIPATH_E_RICRC                     0x0000000000000004ULL
+#define INFINIPATH_E_RMINPKTLEN                        0x0000000000000008ULL
+#define INFINIPATH_E_RMAXPKTLEN                        0x0000000000000010ULL
+#define INFINIPATH_E_RLONGPKTLEN               0x0000000000000020ULL
+#define INFINIPATH_E_RSHORTPKTLEN              0x0000000000000040ULL
+#define INFINIPATH_E_RUNEXPCHAR                        0x0000000000000080ULL
+#define INFINIPATH_E_RUNSUPVL                  0x0000000000000100ULL
+#define INFINIPATH_E_REBP                      0x0000000000000200ULL
+#define INFINIPATH_E_RIBFLOW                   0x0000000000000400ULL
+#define INFINIPATH_E_RBADVERSION               0x0000000000000800ULL
+#define INFINIPATH_E_RRCVEGRFULL               0x0000000000001000ULL
+#define INFINIPATH_E_RRCVHDRFULL               0x0000000000002000ULL
+#define INFINIPATH_E_RBADTID                   0x0000000000004000ULL
+#define INFINIPATH_E_RHDRLEN                   0x0000000000008000ULL
+#define INFINIPATH_E_RHDR                      0x0000000000010000ULL
+#define INFINIPATH_E_RIBLOSTLINK               0x0000000000020000ULL
+#define INFINIPATH_E_SENDSPECIALTRIGGER                0x0000000008000000ULL
+#define INFINIPATH_E_SDMADISABLED              0x0000000010000000ULL
+#define INFINIPATH_E_SMINPKTLEN                        0x0000000020000000ULL
+#define INFINIPATH_E_SMAXPKTLEN                        0x0000000040000000ULL
+#define INFINIPATH_E_SUNDERRUN                 0x0000000080000000ULL
+#define INFINIPATH_E_SPKTLEN                   0x0000000100000000ULL
+#define INFINIPATH_E_SDROPPEDSMPPKT            0x0000000200000000ULL
+#define INFINIPATH_E_SDROPPEDDATAPKT           0x0000000400000000ULL
+#define INFINIPATH_E_SPIOARMLAUNCH             0x0000000800000000ULL
+#define INFINIPATH_E_SUNEXPERRPKTNUM           0x0000001000000000ULL
+#define INFINIPATH_E_SUNSUPVL                  0x0000002000000000ULL
+#define INFINIPATH_E_SENDBUFMISUSE             0x0000004000000000ULL
+#define INFINIPATH_E_SDMAGENMISMATCH           0x0000008000000000ULL
+#define INFINIPATH_E_SDMAOUTOFBOUND            0x0000010000000000ULL
+#define INFINIPATH_E_SDMATAILOUTOFBOUND                0x0000020000000000ULL
+#define INFINIPATH_E_SDMABASE                  0x0000040000000000ULL
+#define INFINIPATH_E_SDMA1STDESC               0x0000080000000000ULL
+#define INFINIPATH_E_SDMARPYTAG                        0x0000100000000000ULL
+#define INFINIPATH_E_SDMADWEN                  0x0000200000000000ULL
+#define INFINIPATH_E_SDMAMISSINGDW             0x0000400000000000ULL
+#define INFINIPATH_E_SDMAUNEXPDATA             0x0000800000000000ULL
+#define INFINIPATH_E_IBSTATUSCHANGED           0x0001000000000000ULL
+#define INFINIPATH_E_INVALIDADDR               0x0002000000000000ULL
+#define INFINIPATH_E_RESET                     0x0004000000000000ULL
+#define INFINIPATH_E_HARDWARE                  0x0008000000000000ULL
+#define INFINIPATH_E_SDMADESCADDRMISALIGN      0x0010000000000000ULL
+#define INFINIPATH_E_INVALIDEEPCMD             0x0020000000000000ULL
+
+/*
+ * this is used to print "common" packet errors only when the
+ * __IPATH_ERRPKTDBG bit is set in ipath_debug.
+ */
+#define INFINIPATH_E_PKTERRS ( INFINIPATH_E_SPKTLEN \
+               | INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_RVCRC \
+               | INFINIPATH_E_RICRC | INFINIPATH_E_RSHORTPKTLEN \
+               | INFINIPATH_E_REBP )
+
+/* Convenience for decoding Send DMA errors */
+#define INFINIPATH_E_SDMAERRS ( \
+       INFINIPATH_E_SDMAGENMISMATCH | INFINIPATH_E_SDMAOUTOFBOUND | \
+       INFINIPATH_E_SDMATAILOUTOFBOUND | INFINIPATH_E_SDMABASE | \
+       INFINIPATH_E_SDMA1STDESC | INFINIPATH_E_SDMARPYTAG | \
+       INFINIPATH_E_SDMADWEN | INFINIPATH_E_SDMAMISSINGDW | \
+       INFINIPATH_E_SDMAUNEXPDATA | \
+       INFINIPATH_E_SDMADESCADDRMISALIGN | \
+       INFINIPATH_E_SDMADISABLED | \
+       INFINIPATH_E_SENDBUFMISUSE)
+
+/* kr_hwerrclear, kr_hwerrmask, kr_hwerrstatus, bits */
+/* TXEMEMPARITYERR bit 0: PIObuf, 1: PIOpbc, 2: launchfifo
+ * RXEMEMPARITYERR bit 0: rcvbuf, 1: lookupq, 2:  expTID, 3: eagerTID
+ *             bit 4: flag buffer, 5: datainfo, 6: header info */
+#define INFINIPATH_HWE_TXEMEMPARITYERR_MASK 0xFULL
+#define INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT 40
+#define INFINIPATH_HWE_RXEMEMPARITYERR_MASK 0x7FULL
+#define INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT 44
+#define INFINIPATH_HWE_IBCBUSTOSPCPARITYERR 0x4000000000000000ULL
+#define INFINIPATH_HWE_IBCBUSFRSPCPARITYERR 0x8000000000000000ULL
+/* txe mem parity errors (shift by INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) */
+#define INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF  0x1ULL
+#define INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC  0x2ULL
+#define INFINIPATH_HWE_TXEMEMPARITYERR_PIOLAUNCHFIFO 0x4ULL
+/* rxe mem parity errors (shift by INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) */
+#define INFINIPATH_HWE_RXEMEMPARITYERR_RCVBUF   0x01ULL
+#define INFINIPATH_HWE_RXEMEMPARITYERR_LOOKUPQ  0x02ULL
+#define INFINIPATH_HWE_RXEMEMPARITYERR_EXPTID   0x04ULL
+#define INFINIPATH_HWE_RXEMEMPARITYERR_EAGERTID 0x08ULL
+#define INFINIPATH_HWE_RXEMEMPARITYERR_FLAGBUF  0x10ULL
+#define INFINIPATH_HWE_RXEMEMPARITYERR_DATAINFO 0x20ULL
+#define INFINIPATH_HWE_RXEMEMPARITYERR_HDRINFO  0x40ULL
+/* waldo specific -- find the rest in ipath_6110.c */
+#define INFINIPATH_HWE_RXDSYNCMEMPARITYERR  0x0000000400000000ULL
+/* 6120/7220 specific -- find the rest in ipath_6120.c and ipath_7220.c */
+#define INFINIPATH_HWE_MEMBISTFAILED   0x0040000000000000ULL
+
+/* kr_hwdiagctrl bits */
+#define INFINIPATH_DC_FORCETXEMEMPARITYERR_MASK 0xFULL
+#define INFINIPATH_DC_FORCETXEMEMPARITYERR_SHIFT 40
+#define INFINIPATH_DC_FORCERXEMEMPARITYERR_MASK 0x7FULL
+#define INFINIPATH_DC_FORCERXEMEMPARITYERR_SHIFT 44
+#define INFINIPATH_DC_FORCERXDSYNCMEMPARITYERR  0x0000000400000000ULL
+#define INFINIPATH_DC_COUNTERDISABLE            0x1000000000000000ULL
+#define INFINIPATH_DC_COUNTERWREN               0x2000000000000000ULL
+#define INFINIPATH_DC_FORCEIBCBUSTOSPCPARITYERR 0x4000000000000000ULL
+#define INFINIPATH_DC_FORCEIBCBUSFRSPCPARITYERR 0x8000000000000000ULL
+
+/* kr_ibcctrl bits */
+#define INFINIPATH_IBCC_FLOWCTRLPERIOD_MASK 0xFFULL
+#define INFINIPATH_IBCC_FLOWCTRLPERIOD_SHIFT 0
+#define INFINIPATH_IBCC_FLOWCTRLWATERMARK_MASK 0xFFULL
+#define INFINIPATH_IBCC_FLOWCTRLWATERMARK_SHIFT 8
+#define INFINIPATH_IBCC_LINKINITCMD_MASK 0x3ULL
+#define INFINIPATH_IBCC_LINKINITCMD_DISABLE 1
+/* cycle through TS1/TS2 till OK */
+#define INFINIPATH_IBCC_LINKINITCMD_POLL 2
+/* wait for TS1, then go on */
+#define INFINIPATH_IBCC_LINKINITCMD_SLEEP 3
+#define INFINIPATH_IBCC_LINKINITCMD_SHIFT 16
+#define INFINIPATH_IBCC_LINKCMD_MASK 0x3ULL
+#define INFINIPATH_IBCC_LINKCMD_DOWN 1         /* move to 0x11 */
+#define INFINIPATH_IBCC_LINKCMD_ARMED 2                /* move to 0x21 */
+#define INFINIPATH_IBCC_LINKCMD_ACTIVE 3       /* move to 0x31 */
+#define INFINIPATH_IBCC_LINKCMD_SHIFT 18
+#define INFINIPATH_IBCC_MAXPKTLEN_MASK 0x7FFULL
+#define INFINIPATH_IBCC_MAXPKTLEN_SHIFT 20
+#define INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK 0xFULL
+#define INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT 32
+#define INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK 0xFULL
+#define INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT 36
+#define INFINIPATH_IBCC_CREDITSCALE_MASK 0x7ULL
+#define INFINIPATH_IBCC_CREDITSCALE_SHIFT 40
+#define INFINIPATH_IBCC_LOOPBACK             0x8000000000000000ULL
+#define INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE 0x4000000000000000ULL
+
+/* kr_ibcstatus bits */
+#define INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT 0
+#define INFINIPATH_IBCS_LINKSTATE_MASK 0x7
+
+#define INFINIPATH_IBCS_TXREADY       0x40000000
+#define INFINIPATH_IBCS_TXCREDITOK    0x80000000
+/* link training states (shift by
+   INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) */
+#define INFINIPATH_IBCS_LT_STATE_DISABLED      0x00
+#define INFINIPATH_IBCS_LT_STATE_LINKUP                0x01
+#define INFINIPATH_IBCS_LT_STATE_POLLACTIVE    0x02
+#define INFINIPATH_IBCS_LT_STATE_POLLQUIET     0x03
+#define INFINIPATH_IBCS_LT_STATE_SLEEPDELAY    0x04
+#define INFINIPATH_IBCS_LT_STATE_SLEEPQUIET    0x05
+#define INFINIPATH_IBCS_LT_STATE_CFGDEBOUNCE   0x08
+#define INFINIPATH_IBCS_LT_STATE_CFGRCVFCFG    0x09
+#define INFINIPATH_IBCS_LT_STATE_CFGWAITRMT    0x0a
+#define INFINIPATH_IBCS_LT_STATE_CFGIDLE       0x0b
+#define INFINIPATH_IBCS_LT_STATE_RECOVERRETRAIN        0x0c
+#define INFINIPATH_IBCS_LT_STATE_RECOVERWAITRMT        0x0e
+#define INFINIPATH_IBCS_LT_STATE_RECOVERIDLE   0x0f
+/* link state machine states (shift by ibcs_ls_shift) */
+#define INFINIPATH_IBCS_L_STATE_DOWN           0x0
+#define INFINIPATH_IBCS_L_STATE_INIT           0x1
+#define INFINIPATH_IBCS_L_STATE_ARM            0x2
+#define INFINIPATH_IBCS_L_STATE_ACTIVE         0x3
+#define INFINIPATH_IBCS_L_STATE_ACT_DEFER      0x4
+
+
+/* kr_extstatus bits */
+#define INFINIPATH_EXTS_SERDESPLLLOCK 0x1
+#define INFINIPATH_EXTS_GPIOIN_MASK 0xFFFFULL
+#define INFINIPATH_EXTS_GPIOIN_SHIFT 48
+
+/* kr_extctrl bits */
+#define INFINIPATH_EXTC_GPIOINVERT_MASK 0xFFFFULL
+#define INFINIPATH_EXTC_GPIOINVERT_SHIFT 32
+#define INFINIPATH_EXTC_GPIOOE_MASK 0xFFFFULL
+#define INFINIPATH_EXTC_GPIOOE_SHIFT 48
+#define INFINIPATH_EXTC_SERDESENABLE         0x80000000ULL
+#define INFINIPATH_EXTC_SERDESCONNECT        0x40000000ULL
+#define INFINIPATH_EXTC_SERDESENTRUNKING     0x20000000ULL
+#define INFINIPATH_EXTC_SERDESDISRXFIFO      0x10000000ULL
+#define INFINIPATH_EXTC_SERDESENPLPBK1       0x08000000ULL
+#define INFINIPATH_EXTC_SERDESENPLPBK2       0x04000000ULL
+#define INFINIPATH_EXTC_SERDESENENCDEC       0x02000000ULL
+#define INFINIPATH_EXTC_LED1SECPORT_ON       0x00000020ULL
+#define INFINIPATH_EXTC_LED2SECPORT_ON       0x00000010ULL
+#define INFINIPATH_EXTC_LED1PRIPORT_ON       0x00000008ULL
+#define INFINIPATH_EXTC_LED2PRIPORT_ON       0x00000004ULL
+#define INFINIPATH_EXTC_LEDGBLOK_ON          0x00000002ULL
+#define INFINIPATH_EXTC_LEDGBLERR_OFF        0x00000001ULL
+
+/* kr_partitionkey bits */
+#define INFINIPATH_PKEY_SIZE 16
+#define INFINIPATH_PKEY_MASK 0xFFFF
+#define INFINIPATH_PKEY_DEFAULT_PKEY 0xFFFF
+
+/* kr_serdesconfig0 bits */
+#define INFINIPATH_SERDC0_RESET_MASK  0xfULL   /* overall reset bits */
+#define INFINIPATH_SERDC0_RESET_PLL   0x10000000ULL    /* pll reset */
+/* tx idle enables (per lane) */
+#define INFINIPATH_SERDC0_TXIDLE      0xF000ULL
+/* rx detect enables (per lane) */
+#define INFINIPATH_SERDC0_RXDETECT_EN 0xF0000ULL
+/* L1 Power down; use with RXDETECT, otherwise not used on IB side */
+#define INFINIPATH_SERDC0_L1PWR_DN      0xF0ULL
+
+/* common kr_xgxsconfig bits (or safe in all, even if not implemented) */
+#define INFINIPATH_XGXS_RX_POL_SHIFT 19
+#define INFINIPATH_XGXS_RX_POL_MASK 0xfULL
+
+
+/*
+ * IPATH_PIO_MAXIBHDR is the max IB header size allowed for in our
+ * PIO send buffers.  This is well beyond anything currently
+ * defined in the InfiniBand spec.
+ */
+#define IPATH_PIO_MAXIBHDR 128
+
+typedef u64 ipath_err_t;
+
+/* The following change with the type of device, so they
+ * need to be part of the ipath_devdata struct, or
+ * we could have problems plugging in devices of
+ * different types (e.g. one HT, one PCIE)
+ * in one system, to be managed by one driver.
+ * On the other hand, this file may also be included
+ * by other code, so leave the declarations here
+ * temporarily. Minor footprint issue if common-model
+ * linker used, none if C89+ linker used.
+ */
+
+/* mask of defined bits for various registers */
+extern u64 infinipath_i_bitsextant;
+extern ipath_err_t infinipath_e_bitsextant, infinipath_hwe_bitsextant;
+
+/* masks that are different in various chips, or only exist in some chips */
+extern u32 infinipath_i_rcvavail_mask, infinipath_i_rcvurg_mask;
+
+/*
+ * These are the infinipath general register numbers (not offsets).
+ * The kernel registers are used directly, those beyond the kernel
+ * registers are calculated from one of the base registers.  The use of
+ * an integer type doesn't allow type-checking as thorough as, say,
+ * an enum but allows for better hiding of chip differences.
+ */
+typedef const u16 ipath_kreg,  /* infinipath general registers */
+ ipath_creg,                   /* infinipath counter registers */
+ ipath_sreg;                   /* kernel-only, infinipath send registers */
+
+/*
+ * These are the chip registers common to all infinipath chips, and
+ * used both by the kernel and the diagnostics or other user code.
+ * They are all implemented such that 64 bit accesses work.
+ * Some implement no more than 32 bits.  Because 64 bit reads
+ * require 2 HT cmds on opteron, we access those with 32 bit
+ * reads for efficiency (they are written as 64 bits, since
+ * the extra 32 bits are nearly free on writes, and it slightly reduces
+ * complexity).  The rest are all accessed as 64 bits.
+ */
+struct ipath_kregs {
+       /* These are the 32 bit group */
+       ipath_kreg kr_control;
+       ipath_kreg kr_counterregbase;
+       ipath_kreg kr_intmask;
+       ipath_kreg kr_intstatus;
+       ipath_kreg kr_pagealign;
+       ipath_kreg kr_portcnt;
+       ipath_kreg kr_rcvtidbase;
+       ipath_kreg kr_rcvtidcnt;
+       ipath_kreg kr_rcvegrbase;
+       ipath_kreg kr_rcvegrcnt;
+       ipath_kreg kr_scratch;
+       ipath_kreg kr_sendctrl;
+       ipath_kreg kr_sendpiobufbase;
+       ipath_kreg kr_sendpiobufcnt;
+       ipath_kreg kr_sendpiosize;
+       ipath_kreg kr_sendregbase;
+       ipath_kreg kr_userregbase;
+       /* These are the 64 bit group */
+       ipath_kreg kr_debugport;
+       ipath_kreg kr_debugportselect;
+       ipath_kreg kr_errorclear;
+       ipath_kreg kr_errormask;
+       ipath_kreg kr_errorstatus;
+       ipath_kreg kr_extctrl;
+       ipath_kreg kr_extstatus;
+       ipath_kreg kr_gpio_clear;
+       ipath_kreg kr_gpio_mask;
+       ipath_kreg kr_gpio_out;
+       ipath_kreg kr_gpio_status;
+       ipath_kreg kr_hwdiagctrl;
+       ipath_kreg kr_hwerrclear;
+       ipath_kreg kr_hwerrmask;
+       ipath_kreg kr_hwerrstatus;
+       ipath_kreg kr_ibcctrl;
+       ipath_kreg kr_ibcstatus;
+       ipath_kreg kr_intblocked;
+       ipath_kreg kr_intclear;
+       ipath_kreg kr_interruptconfig;
+       ipath_kreg kr_mdio;
+       ipath_kreg kr_partitionkey;
+       ipath_kreg kr_rcvbthqp;
+       ipath_kreg kr_rcvbufbase;
+       ipath_kreg kr_rcvbufsize;
+       ipath_kreg kr_rcvctrl;
+       ipath_kreg kr_rcvhdrcnt;
+       ipath_kreg kr_rcvhdrentsize;
+       ipath_kreg kr_rcvhdrsize;
+       ipath_kreg kr_rcvintmembase;
+       ipath_kreg kr_rcvintmemsize;
+       ipath_kreg kr_revision;
+       ipath_kreg kr_sendbuffererror;
+       ipath_kreg kr_sendpioavailaddr;
+       ipath_kreg kr_serdesconfig0;
+       ipath_kreg kr_serdesconfig1;
+       ipath_kreg kr_serdesstatus;
+       ipath_kreg kr_txintmembase;
+       ipath_kreg kr_txintmemsize;
+       ipath_kreg kr_xgxsconfig;
+       ipath_kreg kr_ibpllcfg;
+       /* use these two (and the following N ports) only with
+        * ipath_k*_kreg64_port(); not *kreg64() */
+       ipath_kreg kr_rcvhdraddr;
+       ipath_kreg kr_rcvhdrtailaddr;
+
+       /* remaining registers are not present on all types of infinipath
+          chips  */
+       ipath_kreg kr_rcvpktledcnt;
+       ipath_kreg kr_pcierbuftestreg0;
+       ipath_kreg kr_pcierbuftestreg1;
+       ipath_kreg kr_pcieq0serdesconfig0;
+       ipath_kreg kr_pcieq0serdesconfig1;
+       ipath_kreg kr_pcieq0serdesstatus;
+       ipath_kreg kr_pcieq1serdesconfig0;
+       ipath_kreg kr_pcieq1serdesconfig1;
+       ipath_kreg kr_pcieq1serdesstatus;
+       ipath_kreg kr_hrtbt_guid;
+       ipath_kreg kr_ibcddrctrl;
+       ipath_kreg kr_ibcddrstatus;
+       ipath_kreg kr_jintreload;
+
+       /* send dma related regs */
+       ipath_kreg kr_senddmabase;
+       ipath_kreg kr_senddmalengen;
+       ipath_kreg kr_senddmatail;
+       ipath_kreg kr_senddmahead;
+       ipath_kreg kr_senddmaheadaddr;
+       ipath_kreg kr_senddmabufmask0;
+       ipath_kreg kr_senddmabufmask1;
+       ipath_kreg kr_senddmabufmask2;
+       ipath_kreg kr_senddmastatus;
+
+       /* SerDes related regs (IBA7220-only) */
+       ipath_kreg kr_ibserdesctrl;
+       ipath_kreg kr_ib_epbacc;
+       ipath_kreg kr_ib_epbtrans;
+       ipath_kreg kr_pcie_epbacc;
+       ipath_kreg kr_pcie_epbtrans;
+       ipath_kreg kr_ib_ddsrxeq;
+};
+
+struct ipath_cregs {
+       ipath_creg cr_badformatcnt;
+       ipath_creg cr_erricrccnt;
+       ipath_creg cr_errlinkcnt;
+       ipath_creg cr_errlpcrccnt;
+       ipath_creg cr_errpkey;
+       ipath_creg cr_errrcvflowctrlcnt;
+       ipath_creg cr_err_rlencnt;
+       ipath_creg cr_errslencnt;
+       ipath_creg cr_errtidfull;
+       ipath_creg cr_errtidvalid;
+       ipath_creg cr_errvcrccnt;
+       ipath_creg cr_ibstatuschange;
+       ipath_creg cr_intcnt;
+       ipath_creg cr_invalidrlencnt;
+       ipath_creg cr_invalidslencnt;
+       ipath_creg cr_lbflowstallcnt;
+       ipath_creg cr_iblinkdowncnt;
+       ipath_creg cr_iblinkerrrecovcnt;
+       ipath_creg cr_ibsymbolerrcnt;
+       ipath_creg cr_pktrcvcnt;
+       ipath_creg cr_pktrcvflowctrlcnt;
+       ipath_creg cr_pktsendcnt;
+       ipath_creg cr_pktsendflowcnt;
+       ipath_creg cr_portovflcnt;
+       ipath_creg cr_rcvebpcnt;
+       ipath_creg cr_rcvovflcnt;
+       ipath_creg cr_rxdroppktcnt;
+       ipath_creg cr_senddropped;
+       ipath_creg cr_sendstallcnt;
+       ipath_creg cr_sendunderruncnt;
+       ipath_creg cr_unsupvlcnt;
+       ipath_creg cr_wordrcvcnt;
+       ipath_creg cr_wordsendcnt;
+       ipath_creg cr_vl15droppedpktcnt;
+       ipath_creg cr_rxotherlocalphyerrcnt;
+       ipath_creg cr_excessbufferovflcnt;
+       ipath_creg cr_locallinkintegrityerrcnt;
+       ipath_creg cr_rxvlerrcnt;
+       ipath_creg cr_rxdlidfltrcnt;
+       ipath_creg cr_psstat;
+       ipath_creg cr_psstart;
+       ipath_creg cr_psinterval;
+       ipath_creg cr_psrcvdatacount;
+       ipath_creg cr_psrcvpktscount;
+       ipath_creg cr_psxmitdatacount;
+       ipath_creg cr_psxmitpktscount;
+       ipath_creg cr_psxmitwaitcount;
+};
+
+#endif                         /* _IPATH_REGISTERS_H */
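Most of the register layouts in this header are expressed as MASK/SHIFT pairs, so extracting a field is always "shift right, then mask". A small stand-alone illustration using the kr_revision fields defined above (the sample register value is made up for the example):

#include <stdint.h>
#include <stdio.h>

#define INFINIPATH_R_CHIPREVMINOR_MASK  0xFF
#define INFINIPATH_R_CHIPREVMINOR_SHIFT 0
#define INFINIPATH_R_CHIPREVMAJOR_MASK  0xFF
#define INFINIPATH_R_CHIPREVMAJOR_SHIFT 8
#define INFINIPATH_R_ARCH_MASK          0xFF
#define INFINIPATH_R_ARCH_SHIFT         16

int main(void)
{
        uint64_t revision = 0x0000000000020104ULL;      /* made-up sample value */

        unsigned minor = (revision >> INFINIPATH_R_CHIPREVMINOR_SHIFT) &
                         INFINIPATH_R_CHIPREVMINOR_MASK;
        unsigned major = (revision >> INFINIPATH_R_CHIPREVMAJOR_SHIFT) &
                         INFINIPATH_R_CHIPREVMAJOR_MASK;
        unsigned arch  = (revision >> INFINIPATH_R_ARCH_SHIFT) &
                         INFINIPATH_R_ARCH_MASK;

        printf("arch %u, chip rev %u.%u\n", arch, major, minor);
        return 0;
}

The same pattern applies to the IBCC, IBCS and hardware-error fields later in the file.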
diff --git a/drivers/staging/rdma/ipath/ipath_ruc.c b/drivers/staging/rdma/ipath/ipath_ruc.c
new file mode 100644 (file)
index 0000000..1f95bba
--- /dev/null
@@ -0,0 +1,734 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+
+#include "ipath_verbs.h"
+#include "ipath_kernel.h"
+
+/*
+ * Convert the AETH RNR timeout code into the number of milliseconds.
+ */
+const u32 ib_ipath_rnr_table[32] = {
+       656,                    /* 0 */
+       1,                      /* 1 */
+       1,                      /* 2 */
+       1,                      /* 3 */
+       1,                      /* 4 */
+       1,                      /* 5 */
+       1,                      /* 6 */
+       1,                      /* 7 */
+       1,                      /* 8 */
+       1,                      /* 9 */
+       1,                      /* A */
+       1,                      /* B */
+       1,                      /* C */
+       1,                      /* D */
+       2,                      /* E */
+       2,                      /* F */
+       3,                      /* 10 */
+       4,                      /* 11 */
+       6,                      /* 12 */
+       8,                      /* 13 */
+       11,                     /* 14 */
+       16,                     /* 15 */
+       21,                     /* 16 */
+       31,                     /* 17 */
+       41,                     /* 18 */
+       62,                     /* 19 */
+       82,                     /* 1A */
+       123,                    /* 1B */
+       164,                    /* 1C */
+       246,                    /* 1D */
+       328,                    /* 1E */
+       492                     /* 1F */
+};
+
+/**
+ * ipath_insert_rnr_queue - put QP on the RNR timeout list for the device
+ * @qp: the QP
+ *
+ * Called with the QP s_lock held and interrupts disabled.
+ * XXX Use a simple list for now.  We might need a priority
+ * queue if we have lots of QPs waiting for RNR timeouts
+ * but that should be rare.
+ */
+void ipath_insert_rnr_queue(struct ipath_qp *qp)
+{
+       struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+
+       /* We already did a spin_lock_irqsave(), so just use spin_lock */
+       spin_lock(&dev->pending_lock);
+       if (list_empty(&dev->rnrwait))
+               list_add(&qp->timerwait, &dev->rnrwait);
+       else {
+               struct list_head *l = &dev->rnrwait;
+               struct ipath_qp *nqp = list_entry(l->next, struct ipath_qp,
+                                                 timerwait);
+
+               while (qp->s_rnr_timeout >= nqp->s_rnr_timeout) {
+                       qp->s_rnr_timeout -= nqp->s_rnr_timeout;
+                       l = l->next;
+                       if (l->next == &dev->rnrwait) {
+                               nqp = NULL;
+                               break;
+                       }
+                       nqp = list_entry(l->next, struct ipath_qp,
+                                        timerwait);
+               }
+               if (nqp)
+                       nqp->s_rnr_timeout -= qp->s_rnr_timeout;
+               list_add(&qp->timerwait, l);
+       }
+       spin_unlock(&dev->pending_lock);
+}
+
+/**
+ * ipath_init_sge - validate a RWQE and fill in the SGE state
+ * @qp: the QP
+ *
+ * Return 1 if OK.
+ */
+int ipath_init_sge(struct ipath_qp *qp, struct ipath_rwqe *wqe,
+                  u32 *lengthp, struct ipath_sge_state *ss)
+{
+       int i, j, ret;
+       struct ib_wc wc;
+
+       *lengthp = 0;
+       for (i = j = 0; i < wqe->num_sge; i++) {
+               if (wqe->sg_list[i].length == 0)
+                       continue;
+               /* Check LKEY */
+               if (!ipath_lkey_ok(qp, j ? &ss->sg_list[j - 1] : &ss->sge,
+                                  &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE))
+                       goto bad_lkey;
+               *lengthp += wqe->sg_list[i].length;
+               j++;
+       }
+       ss->num_sge = j;
+       ret = 1;
+       goto bail;
+
+bad_lkey:
+       memset(&wc, 0, sizeof(wc));
+       wc.wr_id = wqe->wr_id;
+       wc.status = IB_WC_LOC_PROT_ERR;
+       wc.opcode = IB_WC_RECV;
+       wc.qp = &qp->ibqp;
+       /* Signal solicited completion event. */
+       ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
+       ret = 0;
+bail:
+       return ret;
+}
+
+/**
+ * ipath_get_rwqe - copy the next RWQE into the QP's receive state
+ * @qp: the QP
+ * @wr_id_only: update qp->r_wr_id only, not qp->r_sge
+ *
+ * Return 0 if no RWQE is available, otherwise return 1.
+ *
+ * Can be called from interrupt level.
+ */
+int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only)
+{
+       unsigned long flags;
+       struct ipath_rq *rq;
+       struct ipath_rwq *wq;
+       struct ipath_srq *srq;
+       struct ipath_rwqe *wqe;
+       void (*handler)(struct ib_event *, void *);
+       u32 tail;
+       int ret;
+
+       if (qp->ibqp.srq) {
+               srq = to_isrq(qp->ibqp.srq);
+               handler = srq->ibsrq.event_handler;
+               rq = &srq->rq;
+       } else {
+               srq = NULL;
+               handler = NULL;
+               rq = &qp->r_rq;
+       }
+
+       spin_lock_irqsave(&rq->lock, flags);
+       if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) {
+               ret = 0;
+               goto unlock;
+       }
+
+       wq = rq->wq;
+       tail = wq->tail;
+       /* Validate tail before using it since it is user writable. */
+       if (tail >= rq->size)
+               tail = 0;
+       do {
+               if (unlikely(tail == wq->head)) {
+                       ret = 0;
+                       goto unlock;
+               }
+               /* Make sure entry is read after head index is read. */
+               smp_rmb();
+               wqe = get_rwqe_ptr(rq, tail);
+               if (++tail >= rq->size)
+                       tail = 0;
+               if (wr_id_only)
+                       break;
+               qp->r_sge.sg_list = qp->r_sg_list;
+       } while (!ipath_init_sge(qp, wqe, &qp->r_len, &qp->r_sge));
+       qp->r_wr_id = wqe->wr_id;
+       wq->tail = tail;
+
+       ret = 1;
+       set_bit(IPATH_R_WRID_VALID, &qp->r_aflags);
+       if (handler) {
+               u32 n;
+
+               /*
+                * validate head pointer value and compute
+                * the number of remaining WQEs.
+                */
+               n = wq->head;
+               if (n >= rq->size)
+                       n = 0;
+               if (n < tail)
+                       n += rq->size - tail;
+               else
+                       n -= tail;
+               if (n < srq->limit) {
+                       struct ib_event ev;
+
+                       srq->limit = 0;
+                       spin_unlock_irqrestore(&rq->lock, flags);
+                       ev.device = qp->ibqp.device;
+                       ev.element.srq = qp->ibqp.srq;
+                       ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
+                       handler(&ev, srq->ibsrq.srq_context);
+                       goto bail;
+               }
+       }
+unlock:
+       spin_unlock_irqrestore(&rq->lock, flags);
+bail:
+       return ret;
+}
+
+/**
+ * ipath_ruc_loopback - handle UC and RC loopback requests
+ * @sqp: the sending QP
+ *
+ * This is called from ipath_do_send() to
+ * forward a WQE addressed to the same HCA.
+ * Note that although we are single threaded due to the tasklet, we still
+ * have to protect against post_send().  We don't have to worry about
+ * receive interrupts since this is a connected protocol and all packets
+ * will pass through here.
+ */
+static void ipath_ruc_loopback(struct ipath_qp *sqp)
+{
+       struct ipath_ibdev *dev = to_idev(sqp->ibqp.device);
+       struct ipath_qp *qp;
+       struct ipath_swqe *wqe;
+       struct ipath_sge *sge;
+       unsigned long flags;
+       struct ib_wc wc;
+       u64 sdata;
+       atomic64_t *maddr;
+       enum ib_wc_status send_status;
+
+       /*
+        * Note that we check the responder QP state after
+        * checking the requester's state.
+        */
+       qp = ipath_lookup_qpn(&dev->qp_table, sqp->remote_qpn);
+
+       spin_lock_irqsave(&sqp->s_lock, flags);
+
+       /* Return if we are already busy processing a work request. */
+       if ((sqp->s_flags & (IPATH_S_BUSY | IPATH_S_ANY_WAIT)) ||
+           !(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_OR_FLUSH_SEND))
+               goto unlock;
+
+       sqp->s_flags |= IPATH_S_BUSY;
+
+again:
+       if (sqp->s_last == sqp->s_head)
+               goto clr_busy;
+       wqe = get_swqe_ptr(sqp, sqp->s_last);
+
+       /* Return if it is not OK to start a new work request. */
+       if (!(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_NEXT_SEND_OK)) {
+               if (!(ib_ipath_state_ops[sqp->state] & IPATH_FLUSH_SEND))
+                       goto clr_busy;
+               /* We are in the error state, flush the work request. */
+               send_status = IB_WC_WR_FLUSH_ERR;
+               goto flush_send;
+       }
+
+       /*
+        * We can rely on the entry not changing without the s_lock
+        * being held until we update s_last.
+        * We increment s_cur to indicate s_last is in progress.
+        */
+       if (sqp->s_last == sqp->s_cur) {
+               if (++sqp->s_cur >= sqp->s_size)
+                       sqp->s_cur = 0;
+       }
+       spin_unlock_irqrestore(&sqp->s_lock, flags);
+
+       if (!qp || !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) {
+               dev->n_pkt_drops++;
+               /*
+                * For RC, the requester would timeout and retry so
+                * shortcut the timeouts and just signal too many retries.
+                */
+               if (sqp->ibqp.qp_type == IB_QPT_RC)
+                       send_status = IB_WC_RETRY_EXC_ERR;
+               else
+                       send_status = IB_WC_SUCCESS;
+               goto serr;
+       }
+
+       memset(&wc, 0, sizeof wc);
+       send_status = IB_WC_SUCCESS;
+
+       sqp->s_sge.sge = wqe->sg_list[0];
+       sqp->s_sge.sg_list = wqe->sg_list + 1;
+       sqp->s_sge.num_sge = wqe->wr.num_sge;
+       sqp->s_len = wqe->length;
+       switch (wqe->wr.opcode) {
+       case IB_WR_SEND_WITH_IMM:
+               wc.wc_flags = IB_WC_WITH_IMM;
+               wc.ex.imm_data = wqe->wr.ex.imm_data;
+               /* FALLTHROUGH */
+       case IB_WR_SEND:
+               if (!ipath_get_rwqe(qp, 0))
+                       goto rnr_nak;
+               break;
+
+       case IB_WR_RDMA_WRITE_WITH_IMM:
+               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+                       goto inv_err;
+               wc.wc_flags = IB_WC_WITH_IMM;
+               wc.ex.imm_data = wqe->wr.ex.imm_data;
+               if (!ipath_get_rwqe(qp, 1))
+                       goto rnr_nak;
+               /* FALLTHROUGH */
+       case IB_WR_RDMA_WRITE:
+               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+                       goto inv_err;
+               if (wqe->length == 0)
+                       break;
+               if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge, wqe->length,
+                                           wqe->wr.wr.rdma.remote_addr,
+                                           wqe->wr.wr.rdma.rkey,
+                                           IB_ACCESS_REMOTE_WRITE)))
+                       goto acc_err;
+               break;
+
+       case IB_WR_RDMA_READ:
+               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
+                       goto inv_err;
+               if (unlikely(!ipath_rkey_ok(qp, &sqp->s_sge, wqe->length,
+                                           wqe->wr.wr.rdma.remote_addr,
+                                           wqe->wr.wr.rdma.rkey,
+                                           IB_ACCESS_REMOTE_READ)))
+                       goto acc_err;
+               qp->r_sge.sge = wqe->sg_list[0];
+               qp->r_sge.sg_list = wqe->sg_list + 1;
+               qp->r_sge.num_sge = wqe->wr.num_sge;
+               break;
+
+       case IB_WR_ATOMIC_CMP_AND_SWP:
+       case IB_WR_ATOMIC_FETCH_AND_ADD:
+               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
+                       goto inv_err;
+               if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge, sizeof(u64),
+                                           wqe->wr.wr.atomic.remote_addr,
+                                           wqe->wr.wr.atomic.rkey,
+                                           IB_ACCESS_REMOTE_ATOMIC)))
+                       goto acc_err;
+               /* Perform atomic OP and save result. */
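+               /* The original value at the target is returned to the requester. */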
+               maddr = (atomic64_t *) qp->r_sge.sge.vaddr;
+               sdata = wqe->wr.wr.atomic.compare_add;
+               *(u64 *) sqp->s_sge.sge.vaddr =
+                       (wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ?
+                       (u64) atomic64_add_return(sdata, maddr) - sdata :
+                       (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,
+                                     sdata, wqe->wr.wr.atomic.swap);
+               goto send_comp;
+
+       default:
+               send_status = IB_WC_LOC_QP_OP_ERR;
+               goto serr;
+       }
+
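+       /* Copy the payload from the sender's SGE list into the receiver's. */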
+       sge = &sqp->s_sge.sge;
+       while (sqp->s_len) {
+               u32 len = sqp->s_len;
+
+               if (len > sge->length)
+                       len = sge->length;
+               if (len > sge->sge_length)
+                       len = sge->sge_length;
+               BUG_ON(len == 0);
+               ipath_copy_sge(&qp->r_sge, sge->vaddr, len);
+               sge->vaddr += len;
+               sge->length -= len;
+               sge->sge_length -= len;
+               if (sge->sge_length == 0) {
+                       if (--sqp->s_sge.num_sge)
+                               *sge = *sqp->s_sge.sg_list++;
+               } else if (sge->length == 0 && sge->mr != NULL) {
+                       if (++sge->n >= IPATH_SEGSZ) {
+                               if (++sge->m >= sge->mr->mapsz)
+                                       break;
+                               sge->n = 0;
+                       }
+                       sge->vaddr =
+                               sge->mr->map[sge->m]->segs[sge->n].vaddr;
+                       sge->length =
+                               sge->mr->map[sge->m]->segs[sge->n].length;
+               }
+               sqp->s_len -= len;
+       }
+
+       if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags))
+               goto send_comp;
+
+       if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM)
+               wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
+       else
+               wc.opcode = IB_WC_RECV;
+       wc.wr_id = qp->r_wr_id;
+       wc.status = IB_WC_SUCCESS;
+       wc.byte_len = wqe->length;
+       wc.qp = &qp->ibqp;
+       wc.src_qp = qp->remote_qpn;
+       wc.slid = qp->remote_ah_attr.dlid;
+       wc.sl = qp->remote_ah_attr.sl;
+       wc.port_num = 1;
+       /* Signal completion event if the solicited bit is set. */
+       ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+                      wqe->wr.send_flags & IB_SEND_SOLICITED);
+
+send_comp:
+       spin_lock_irqsave(&sqp->s_lock, flags);
+flush_send:
+       sqp->s_rnr_retry = sqp->s_rnr_retry_cnt;
+       ipath_send_complete(sqp, wqe, send_status);
+       goto again;
+
+rnr_nak:
+       /* Handle RNR NAK */
+       if (qp->ibqp.qp_type == IB_QPT_UC)
+               goto send_comp;
+       /*
+        * Note: we don't need the s_lock held since the BUSY flag
+        * makes this single threaded.
+        */
+       if (sqp->s_rnr_retry == 0) {
+               send_status = IB_WC_RNR_RETRY_EXC_ERR;
+               goto serr;
+       }
+       if (sqp->s_rnr_retry_cnt < 7)
+               sqp->s_rnr_retry--;
+       spin_lock_irqsave(&sqp->s_lock, flags);
+       if (!(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_RECV_OK))
+               goto clr_busy;
+       sqp->s_flags |= IPATH_S_WAITING;
+       dev->n_rnr_naks++;
+       sqp->s_rnr_timeout = ib_ipath_rnr_table[qp->r_min_rnr_timer];
+       ipath_insert_rnr_queue(sqp);
+       goto clr_busy;
+
+inv_err:
+       send_status = IB_WC_REM_INV_REQ_ERR;
+       wc.status = IB_WC_LOC_QP_OP_ERR;
+       goto err;
+
+acc_err:
+       send_status = IB_WC_REM_ACCESS_ERR;
+       wc.status = IB_WC_LOC_PROT_ERR;
+err:
+       /* responder goes to error state */
+       ipath_rc_error(qp, wc.status);
+
+serr:
+       spin_lock_irqsave(&sqp->s_lock, flags);
+       ipath_send_complete(sqp, wqe, send_status);
+       if (sqp->ibqp.qp_type == IB_QPT_RC) {
+               int lastwqe = ipath_error_qp(sqp, IB_WC_WR_FLUSH_ERR);
+
+               sqp->s_flags &= ~IPATH_S_BUSY;
+               spin_unlock_irqrestore(&sqp->s_lock, flags);
+               if (lastwqe) {
+                       struct ib_event ev;
+
+                       ev.device = sqp->ibqp.device;
+                       ev.element.qp = &sqp->ibqp;
+                       ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+                       sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context);
+               }
+               goto done;
+       }
+clr_busy:
+       sqp->s_flags &= ~IPATH_S_BUSY;
+unlock:
+       spin_unlock_irqrestore(&sqp->s_lock, flags);
+done:
+       if (qp && atomic_dec_and_test(&qp->refcount))
+               wake_up(&qp->wait);
+}
+
+static void want_buffer(struct ipath_devdata *dd, struct ipath_qp *qp)
+{
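+       /*
+        * Only QPs that send with PIO buffers (no send DMA support, or
+        * the SMI QP) need the PIO-buffer-available interrupt enabled.
+        */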
+       if (!(dd->ipath_flags & IPATH_HAS_SEND_DMA) ||
+           qp->ibqp.qp_type == IB_QPT_SMI) {
+               unsigned long flags;
+
+               spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+               dd->ipath_sendctrl |= INFINIPATH_S_PIOINTBUFAVAIL;
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+                                dd->ipath_sendctrl);
+               ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+               spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+       }
+}
+
+/**
+ * ipath_no_bufs_available - tell the layer driver we need buffers
+ * @qp: the QP that caused the problem
+ * @dev: the device we ran out of buffers on
+ *
+ * Called when we run out of PIO buffers.
+ * If we are now in the error state, return zero to flush the
+ * send work request.
+ */
+static int ipath_no_bufs_available(struct ipath_qp *qp,
+                                   struct ipath_ibdev *dev)
+{
+       unsigned long flags;
+       int ret = 1;
+
+       /*
+        * Note that as soon as want_buffer() is called and
+        * possibly before it returns, ipath_ib_piobufavail()
+        * could be called. Therefore, put QP on the piowait list before
+        * enabling the PIO avail interrupt.
+        */
+       spin_lock_irqsave(&qp->s_lock, flags);
+       if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) {
+               dev->n_piowait++;
+               qp->s_flags |= IPATH_S_WAITING;
+               qp->s_flags &= ~IPATH_S_BUSY;
+               spin_lock(&dev->pending_lock);
+               if (list_empty(&qp->piowait))
+                       list_add_tail(&qp->piowait, &dev->piowait);
+               spin_unlock(&dev->pending_lock);
+       } else
+               ret = 0;
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+       if (ret)
+               want_buffer(dev->dd, qp);
+       return ret;
+}
+
+/**
+ * ipath_make_grh - construct a GRH header
+ * @dev: a pointer to the ipath device
+ * @hdr: a pointer to the GRH header being constructed
+ * @grh: the global route address to send to
+ * @hwords: the number of 32-bit words of header being sent
+ * @nwords: the number of 32-bit words of data being sent
+ *
+ * Return the size of the header in 32-bit words.
+ */
+u32 ipath_make_grh(struct ipath_ibdev *dev, struct ib_grh *hdr,
+                  struct ib_global_route *grh, u32 hwords, u32 nwords)
+{
+       hdr->version_tclass_flow =
+               cpu_to_be32((6 << 28) |
+                           (grh->traffic_class << 20) |
+                           grh->flow_label);
+       hdr->paylen = cpu_to_be16((hwords - 2 + nwords + SIZE_OF_CRC) << 2);
+       /* next_hdr is defined by C8-7 in ch. 8.4.1 */
+       hdr->next_hdr = 0x1B;
+       hdr->hop_limit = grh->hop_limit;
+       /* The SGID is 32-bit aligned. */
+       hdr->sgid.global.subnet_prefix = dev->gid_prefix;
+       hdr->sgid.global.interface_id = dev->dd->ipath_guid;
+       hdr->dgid = grh->dgid;
+
+       /* GRH header size in 32-bit words. */
+       return sizeof(struct ib_grh) / sizeof(u32);
+}
+
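+/*
+ * Build the LRH (and GRH if the remote address has one) and BTH for an
+ * RC/UC packet.  The caller is expected to have set qp->s_hdrwords and
+ * qp->s_cur_size before calling this.
+ */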
+void ipath_make_ruc_header(struct ipath_ibdev *dev, struct ipath_qp *qp,
+                          struct ipath_other_headers *ohdr,
+                          u32 bth0, u32 bth2)
+{
+       u16 lrh0;
+       u32 nwords;
+       u32 extra_bytes;
+
+       /* Construct the header. */
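+       /* Pad the payload out to a multiple of four bytes. */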
+       extra_bytes = -qp->s_cur_size & 3;
+       nwords = (qp->s_cur_size + extra_bytes) >> 2;
+       lrh0 = IPATH_LRH_BTH;
+       if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
+               qp->s_hdrwords += ipath_make_grh(dev, &qp->s_hdr.u.l.grh,
+                                                &qp->remote_ah_attr.grh,
+                                                qp->s_hdrwords, nwords);
+               lrh0 = IPATH_LRH_GRH;
+       }
+       lrh0 |= qp->remote_ah_attr.sl << 4;
+       qp->s_hdr.lrh[0] = cpu_to_be16(lrh0);
+       qp->s_hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
+       qp->s_hdr.lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC);
+       qp->s_hdr.lrh[3] = cpu_to_be16(dev->dd->ipath_lid |
+                                      qp->remote_ah_attr.src_path_bits);
+       bth0 |= ipath_get_pkey(dev->dd, qp->s_pkey_index);
+       bth0 |= extra_bytes << 20;
+       ohdr->bth[0] = cpu_to_be32(bth0 | (1 << 22));
+       ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
+       ohdr->bth[2] = cpu_to_be32(bth2);
+}
+
+/**
+ * ipath_do_send - perform a send on a QP
+ * @data: contains a pointer to the QP
+ *
+ * Process entries in the send work queue until credit or queue is
+ * exhausted.  Only allow one CPU to send a packet per QP (tasklet).
+ * Otherwise, two threads could send packets out of order.
+ */
+void ipath_do_send(unsigned long data)
+{
+       struct ipath_qp *qp = (struct ipath_qp *)data;
+       struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+       int (*make_req)(struct ipath_qp *qp);
+       unsigned long flags;
+
+       if ((qp->ibqp.qp_type == IB_QPT_RC ||
+            qp->ibqp.qp_type == IB_QPT_UC) &&
+           qp->remote_ah_attr.dlid == dev->dd->ipath_lid) {
+               ipath_ruc_loopback(qp);
+               goto bail;
+       }
+
+       if (qp->ibqp.qp_type == IB_QPT_RC)
+              make_req = ipath_make_rc_req;
+       else if (qp->ibqp.qp_type == IB_QPT_UC)
+              make_req = ipath_make_uc_req;
+       else
+              make_req = ipath_make_ud_req;
+
+       spin_lock_irqsave(&qp->s_lock, flags);
+
+       /* Return if we are already busy processing a work request. */
+       if ((qp->s_flags & (IPATH_S_BUSY | IPATH_S_ANY_WAIT)) ||
+           !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_OR_FLUSH_SEND)) {
+               spin_unlock_irqrestore(&qp->s_lock, flags);
+               goto bail;
+       }
+
+       qp->s_flags |= IPATH_S_BUSY;
+
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+
+again:
+       /* Check for a constructed packet to be sent. */
+       if (qp->s_hdrwords != 0) {
+               /*
+                * If no PIO bufs are available, return.  An interrupt will
+                * call ipath_ib_piobufavail() when one is available.
+                */
+               if (ipath_verbs_send(qp, &qp->s_hdr, qp->s_hdrwords,
+                                    qp->s_cur_sge, qp->s_cur_size)) {
+                       if (ipath_no_bufs_available(qp, dev))
+                               goto bail;
+               }
+               dev->n_unicast_xmit++;
+               /* Record that we sent the packet and s_hdr is empty. */
+               qp->s_hdrwords = 0;
+       }
+
+       if (make_req(qp))
+               goto again;
+
+bail:;
+}
+
+/*
+ * This should be called with s_lock held.
+ */
+void ipath_send_complete(struct ipath_qp *qp, struct ipath_swqe *wqe,
+                        enum ib_wc_status status)
+{
+       u32 old_last, last;
+
+       if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_OR_FLUSH_SEND))
+               return;
+
+       /* See ch. 11.2.4.1 and 10.7.3.1 */
+       if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) ||
+           (wqe->wr.send_flags & IB_SEND_SIGNALED) ||
+           status != IB_WC_SUCCESS) {
+               struct ib_wc wc;
+
+               memset(&wc, 0, sizeof wc);
+               wc.wr_id = wqe->wr.wr_id;
+               wc.status = status;
+               wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
+               wc.qp = &qp->ibqp;
+               if (status == IB_WC_SUCCESS)
+                       wc.byte_len = wqe->length;
+               ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc,
+                              status != IB_WC_SUCCESS);
+       }
+
+       old_last = last = qp->s_last;
+       if (++last >= qp->s_size)
+               last = 0;
+       qp->s_last = last;
+       if (qp->s_cur == old_last)
+               qp->s_cur = last;
+       if (qp->s_tail == old_last)
+               qp->s_tail = last;
+       if (qp->state == IB_QPS_SQD && last == qp->s_cur)
+               qp->s_draining = 0;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_sdma.c b/drivers/staging/rdma/ipath/ipath_sdma.c
new file mode 100644 (file)
index 0000000..17a5177
--- /dev/null
@@ -0,0 +1,818 @@
+/*
+ * Copyright (c) 2007, 2008 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/gfp.h>
+
+#include "ipath_kernel.h"
+#include "ipath_verbs.h"
+#include "ipath_common.h"
+
+#define SDMA_DESCQ_SZ PAGE_SIZE /* 256 entries per 4KB page */
+
+static void vl15_watchdog_enq(struct ipath_devdata *dd)
+{
+       /* ipath_sdma_lock must already be held */
+       if (atomic_inc_return(&dd->ipath_sdma_vl15_count) == 1) {
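+               /* watchdog interval is roughly 50 ms */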
+               unsigned long interval = (HZ + 19) / 20;
+               dd->ipath_sdma_vl15_timer.expires = jiffies + interval;
+               add_timer(&dd->ipath_sdma_vl15_timer);
+       }
+}
+
+static void vl15_watchdog_deq(struct ipath_devdata *dd)
+{
+       /* ipath_sdma_lock must already be held */
+       if (atomic_dec_return(&dd->ipath_sdma_vl15_count) != 0) {
+               unsigned long interval = (HZ + 19) / 20;
+               mod_timer(&dd->ipath_sdma_vl15_timer, jiffies + interval);
+       } else {
+               del_timer(&dd->ipath_sdma_vl15_timer);
+       }
+}
+
+static void vl15_watchdog_timeout(unsigned long opaque)
+{
+       struct ipath_devdata *dd = (struct ipath_devdata *)opaque;
+
+       if (atomic_read(&dd->ipath_sdma_vl15_count) != 0) {
+               ipath_dbg("vl15 watchdog timeout - clearing\n");
+               ipath_cancel_sends(dd, 1);
+               ipath_hol_down(dd);
+       } else {
+               ipath_dbg("vl15 watchdog timeout - "
+                         "condition already cleared\n");
+       }
+}
+
+static void unmap_desc(struct ipath_devdata *dd, unsigned head)
+{
+       __le64 *descqp = &dd->ipath_sdma_descq[head].qw[0];
+       u64 desc[2];
+       dma_addr_t addr;
+       size_t len;
+
+       desc[0] = le64_to_cpu(descqp[0]);
+       desc[1] = le64_to_cpu(descqp[1]);
+
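+       /*
+        * Recover the DMA address (SDmaPhyAddr[47:32] from qword 1,
+        * SDmaPhyAddr[31:0] from the top of qword 0) and turn the dword
+        * count back into a byte length for dma_unmap_single().
+        */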
+       addr = (desc[1] << 32) | (desc[0] >> 32);
+       len = (desc[0] >> 14) & (0x7ffULL << 2);
+       dma_unmap_single(&dd->pcidev->dev, addr, len, DMA_TO_DEVICE);
+}
+
+/*
+ * ipath_sdma_lock should be locked before calling this.
+ */
+int ipath_sdma_make_progress(struct ipath_devdata *dd)
+{
+       struct list_head *lp = NULL;
+       struct ipath_sdma_txreq *txp = NULL;
+       u16 dmahead;
+       u16 start_idx = 0;
+       int progress = 0;
+
+       if (!list_empty(&dd->ipath_sdma_activelist)) {
+               lp = dd->ipath_sdma_activelist.next;
+               txp = list_entry(lp, struct ipath_sdma_txreq, list);
+               start_idx = txp->start_idx;
+       }
+
+       /*
+        * Read the SDMA head register in order to know that the
+        * interrupt clear has been written to the chip.
+        * Otherwise, we may not get an interrupt for the last
+        * descriptor in the queue.
+        */
+       dmahead = (u16)ipath_read_kreg32(dd, dd->ipath_kregs->kr_senddmahead);
+       /* sanity check return value for error handling (chip reset, etc.) */
+       if (dmahead >= dd->ipath_sdma_descq_cnt)
+               goto done;
+
+       while (dd->ipath_sdma_descq_head != dmahead) {
+               if (txp && txp->flags & IPATH_SDMA_TXREQ_F_FREEDESC &&
+                   dd->ipath_sdma_descq_head == start_idx) {
+                       unmap_desc(dd, dd->ipath_sdma_descq_head);
+                       start_idx++;
+                       if (start_idx == dd->ipath_sdma_descq_cnt)
+                               start_idx = 0;
+               }
+
+               /* increment free count and head */
+               dd->ipath_sdma_descq_removed++;
+               if (++dd->ipath_sdma_descq_head == dd->ipath_sdma_descq_cnt)
+                       dd->ipath_sdma_descq_head = 0;
+
+               if (txp && txp->next_descq_idx == dd->ipath_sdma_descq_head) {
+                       /* move to notify list */
+                       if (txp->flags & IPATH_SDMA_TXREQ_F_VL15)
+                               vl15_watchdog_deq(dd);
+                       list_move_tail(lp, &dd->ipath_sdma_notifylist);
+                       if (!list_empty(&dd->ipath_sdma_activelist)) {
+                               lp = dd->ipath_sdma_activelist.next;
+                               txp = list_entry(lp, struct ipath_sdma_txreq,
+                                                list);
+                               start_idx = txp->start_idx;
+                       } else {
+                               lp = NULL;
+                               txp = NULL;
+                       }
+               }
+               progress = 1;
+       }
+
+       if (progress)
+               tasklet_hi_schedule(&dd->ipath_sdma_notify_task);
+
+done:
+       return progress;
+}
+
+static void ipath_sdma_notify(struct ipath_devdata *dd, struct list_head *list)
+{
+       struct ipath_sdma_txreq *txp, *txp_next;
+
+       list_for_each_entry_safe(txp, txp_next, list, list) {
+               list_del_init(&txp->list);
+
+               if (txp->callback)
+                       (*txp->callback)(txp->callback_cookie,
+                                        txp->callback_status);
+       }
+}
+
+static void sdma_notify_taskbody(struct ipath_devdata *dd)
+{
+       unsigned long flags;
+       struct list_head list;
+
+       INIT_LIST_HEAD(&list);
+
+       spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+
+       list_splice_init(&dd->ipath_sdma_notifylist, &list);
+
+       spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+
+       ipath_sdma_notify(dd, &list);
+
+       /*
+        * The IB verbs layer needs to see the callback before getting
+        * the call to ipath_ib_piobufavail() because the callback
+        * handles releasing resources the next send will need.
+        * Otherwise, we could do these calls in
+        * ipath_sdma_make_progress().
+        */
+       ipath_ib_piobufavail(dd->verbs_dev);
+}
+
+static void sdma_notify_task(unsigned long opaque)
+{
+       struct ipath_devdata *dd = (struct ipath_devdata *)opaque;
+
+       if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
+               sdma_notify_taskbody(dd);
+}
+
+static void dump_sdma_state(struct ipath_devdata *dd)
+{
+       unsigned long reg;
+
+       reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmastatus);
+       ipath_cdbg(VERBOSE, "kr_senddmastatus: 0x%016lx\n", reg);
+
+       reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendctrl);
+       ipath_cdbg(VERBOSE, "kr_sendctrl: 0x%016lx\n", reg);
+
+       reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmabufmask0);
+       ipath_cdbg(VERBOSE, "kr_senddmabufmask0: 0x%016lx\n", reg);
+
+       reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmabufmask1);
+       ipath_cdbg(VERBOSE, "kr_senddmabufmask1: 0x%016lx\n", reg);
+
+       reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmabufmask2);
+       ipath_cdbg(VERBOSE, "kr_senddmabufmask2: 0x%016lx\n", reg);
+
+       reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmatail);
+       ipath_cdbg(VERBOSE, "kr_senddmatail: 0x%016lx\n", reg);
+
+       reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmahead);
+       ipath_cdbg(VERBOSE, "kr_senddmahead: 0x%016lx\n", reg);
+}
+
+static void sdma_abort_task(unsigned long opaque)
+{
+       struct ipath_devdata *dd = (struct ipath_devdata *) opaque;
+       u64 status;
+       unsigned long flags;
+
+       if (test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
+               return;
+
+       spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+
+       status = dd->ipath_sdma_status & IPATH_SDMA_ABORT_MASK;
+
+       /* nothing to do */
+       if (status == IPATH_SDMA_ABORT_NONE)
+               goto unlock;
+
+       /* ipath_sdma_abort() is done, waiting for interrupt */
+       if (status == IPATH_SDMA_ABORT_DISARMED) {
+               if (time_before(jiffies, dd->ipath_sdma_abort_intr_timeout))
+                       goto resched_noprint;
+               /* give up, intr got lost somewhere */
+               ipath_dbg("give up waiting for SDMADISABLED intr\n");
+               __set_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status);
+               status = IPATH_SDMA_ABORT_ABORTED;
+       }
+
+       /* everything is stopped, time to clean up and restart */
+       if (status == IPATH_SDMA_ABORT_ABORTED) {
+               struct ipath_sdma_txreq *txp, *txpnext;
+               u64 hwstatus;
+               int notify = 0;
+
+               hwstatus = ipath_read_kreg64(dd,
+                               dd->ipath_kregs->kr_senddmastatus);
+
+               if ((hwstatus & (IPATH_SDMA_STATUS_SCORE_BOARD_DRAIN_IN_PROG |
+                                IPATH_SDMA_STATUS_ABORT_IN_PROG             |
+                                IPATH_SDMA_STATUS_INTERNAL_SDMA_ENABLE)) ||
+                   !(hwstatus & IPATH_SDMA_STATUS_SCB_EMPTY)) {
+                       if (dd->ipath_sdma_reset_wait > 0) {
+                               /* not done shutting down sdma */
+                               --dd->ipath_sdma_reset_wait;
+                               goto resched;
+                       }
+                       ipath_cdbg(VERBOSE, "gave up waiting for quiescent "
+                               "status after SDMA reset, continuing\n");
+                       dump_sdma_state(dd);
+               }
+
+               /* dequeue all "sent" requests */
+               list_for_each_entry_safe(txp, txpnext,
+                                        &dd->ipath_sdma_activelist, list) {
+                       txp->callback_status = IPATH_SDMA_TXREQ_S_ABORTED;
+                       if (txp->flags & IPATH_SDMA_TXREQ_F_VL15)
+                               vl15_watchdog_deq(dd);
+                       list_move_tail(&txp->list, &dd->ipath_sdma_notifylist);
+                       notify = 1;
+               }
+               if (notify)
+                       tasklet_hi_schedule(&dd->ipath_sdma_notify_task);
+
+               /* reset our notion of head and tail */
+               dd->ipath_sdma_descq_tail = 0;
+               dd->ipath_sdma_descq_head = 0;
+               dd->ipath_sdma_head_dma[0] = 0;
+               dd->ipath_sdma_generation = 0;
+               dd->ipath_sdma_descq_removed = dd->ipath_sdma_descq_added;
+
+               /* Reset SendDmaLenGen */
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmalengen,
+                       (u64) dd->ipath_sdma_descq_cnt | (1ULL << 18));
+
+               /* done with sdma state for a bit */
+               spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+
+               /*
+                * Don't restart sdma here (with the exception
+                * below). Wait until link is up to ACTIVE.  VL15 MADs
+                * used to bring the link up use PIO, and multiple link
+                * transitions otherwise cause the sdma engine to be
+                * stopped and started multiple times.
+                * The disable is done here, including the shadow,
+                * so the state is kept consistent.
+                * See ipath_restart_sdma() for the actual starting
+                * of sdma.
+                */
+               spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+               dd->ipath_sendctrl &= ~INFINIPATH_S_SDMAENABLE;
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+                                dd->ipath_sendctrl);
+               ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+               spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+
+               /* make sure I see next message */
+               dd->ipath_sdma_abort_jiffies = 0;
+
+               /*
+                * Not everything that takes SDMA offline is a link
+                * status change.  If the link was up, restart SDMA.
+                */
+               if (dd->ipath_flags & IPATH_LINKACTIVE)
+                       ipath_restart_sdma(dd);
+
+               goto done;
+       }
+
+resched:
+       /*
+        * For now, keep spinning.
+        * JAG - it is not ideal for the default case to simply loop here
+        * without a state change.
+        */
+       if (time_after(jiffies, dd->ipath_sdma_abort_jiffies)) {
+               ipath_dbg("looping with status 0x%08lx\n",
+                         dd->ipath_sdma_status);
+               dd->ipath_sdma_abort_jiffies = jiffies + 5 * HZ;
+       }
+resched_noprint:
+       spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+       if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
+               tasklet_hi_schedule(&dd->ipath_sdma_abort_task);
+       return;
+
+unlock:
+       spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+done:
+       return;
+}
+
+/*
+ * This is called from interrupt context.
+ */
+void ipath_sdma_intr(struct ipath_devdata *dd)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+
+       (void) ipath_sdma_make_progress(dd);
+
+       spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+}
+
+static int alloc_sdma(struct ipath_devdata *dd)
+{
+       int ret = 0;
+
+       /* Allocate memory for SendDMA descriptor FIFO */
+       dd->ipath_sdma_descq = dma_alloc_coherent(&dd->pcidev->dev,
+               SDMA_DESCQ_SZ, &dd->ipath_sdma_descq_phys, GFP_KERNEL);
+
+       if (!dd->ipath_sdma_descq) {
+               ipath_dev_err(dd, "failed to allocate SendDMA descriptor "
+                       "FIFO memory\n");
+               ret = -ENOMEM;
+               goto done;
+       }
+
+       dd->ipath_sdma_descq_cnt =
+               SDMA_DESCQ_SZ / sizeof(struct ipath_sdma_desc);
+
+       /* Allocate memory for DMA of head register to memory */
+       dd->ipath_sdma_head_dma = dma_alloc_coherent(&dd->pcidev->dev,
+               PAGE_SIZE, &dd->ipath_sdma_head_phys, GFP_KERNEL);
+       if (!dd->ipath_sdma_head_dma) {
+               ipath_dev_err(dd, "failed to allocate SendDMA head memory\n");
+               ret = -ENOMEM;
+               goto cleanup_descq;
+       }
+       dd->ipath_sdma_head_dma[0] = 0;
+
+       init_timer(&dd->ipath_sdma_vl15_timer);
+       dd->ipath_sdma_vl15_timer.function = vl15_watchdog_timeout;
+       dd->ipath_sdma_vl15_timer.data = (unsigned long)dd;
+       atomic_set(&dd->ipath_sdma_vl15_count, 0);
+
+       goto done;
+
+cleanup_descq:
+       dma_free_coherent(&dd->pcidev->dev, SDMA_DESCQ_SZ,
+               (void *)dd->ipath_sdma_descq, dd->ipath_sdma_descq_phys);
+       dd->ipath_sdma_descq = NULL;
+       dd->ipath_sdma_descq_phys = 0;
+done:
+       return ret;
+}
+
+int setup_sdma(struct ipath_devdata *dd)
+{
+       int ret = 0;
+       unsigned i, n;
+       u64 tmp64;
+       u64 senddmabufmask[3] = { 0 };
+       unsigned long flags;
+
+       ret = alloc_sdma(dd);
+       if (ret)
+               goto done;
+
+       if (!dd->ipath_sdma_descq) {
+               ipath_dev_err(dd, "SendDMA memory not allocated\n");
+               goto done;
+       }
+
+       /*
+        * Set initial status as if we had been up, then gone down.
+        * This lets initial start on transition to ACTIVE be the
+        * same as restart after link flap.
+        */
+       dd->ipath_sdma_status = IPATH_SDMA_ABORT_ABORTED;
+       dd->ipath_sdma_abort_jiffies = 0;
+       dd->ipath_sdma_generation = 0;
+       dd->ipath_sdma_descq_tail = 0;
+       dd->ipath_sdma_descq_head = 0;
+       dd->ipath_sdma_descq_removed = 0;
+       dd->ipath_sdma_descq_added = 0;
+
+       /* Set SendDmaBase */
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabase,
+                        dd->ipath_sdma_descq_phys);
+       /* Set SendDmaLenGen */
+       tmp64 = dd->ipath_sdma_descq_cnt;
+       tmp64 |= 1<<18; /* enable generation checking */
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmalengen, tmp64);
+       /* Set SendDmaTail */
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail,
+                        dd->ipath_sdma_descq_tail);
+       /* Set SendDmaHeadAddr */
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmaheadaddr,
+                        dd->ipath_sdma_head_phys);
+
+       /*
+        * Reserve all the former "kernel" piobufs, using high number range
+        * so we get as many 4K buffers as possible
+        */
+       n = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
+       i = dd->ipath_lastport_piobuf + dd->ipath_pioreserved;
+       ipath_chg_pioavailkernel(dd, i, n - i , 0);
+       for (; i < n; ++i) {
+               unsigned word = i / 64;
+               unsigned bit = i & 63;
+               BUG_ON(word >= 3);
+               senddmabufmask[word] |= 1ULL << bit;
+       }
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask0,
+                        senddmabufmask[0]);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask1,
+                        senddmabufmask[1]);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask2,
+                        senddmabufmask[2]);
+
+       INIT_LIST_HEAD(&dd->ipath_sdma_activelist);
+       INIT_LIST_HEAD(&dd->ipath_sdma_notifylist);
+
+       tasklet_init(&dd->ipath_sdma_notify_task, sdma_notify_task,
+                    (unsigned long) dd);
+       tasklet_init(&dd->ipath_sdma_abort_task, sdma_abort_task,
+                    (unsigned long) dd);
+
+       /*
+        * No use turning on SDMA here, as the link is probably not ACTIVE.
+        * Just mark it RUNNING and enable the interrupt, and let the
+        * ipath_restart_sdma() on link transition to ACTIVE actually
+        * enable it.
+        */
+       spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+       dd->ipath_sendctrl |= INFINIPATH_S_SDMAINTENABLE;
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
+       ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+       __set_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status);
+       spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+
+done:
+       return ret;
+}
+
+void teardown_sdma(struct ipath_devdata *dd)
+{
+       struct ipath_sdma_txreq *txp, *txpnext;
+       unsigned long flags;
+       dma_addr_t sdma_head_phys = 0;
+       dma_addr_t sdma_descq_phys = 0;
+       void *sdma_descq = NULL;
+       void *sdma_head_dma = NULL;
+
+       spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+       __clear_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status);
+       __set_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status);
+       __set_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status);
+       spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+
+       tasklet_kill(&dd->ipath_sdma_abort_task);
+       tasklet_kill(&dd->ipath_sdma_notify_task);
+
+       /* turn off sdma */
+       spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+       dd->ipath_sendctrl &= ~INFINIPATH_S_SDMAENABLE;
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+               dd->ipath_sendctrl);
+       ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+       spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+
+       spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+       /* dequeue all "sent" requests */
+       list_for_each_entry_safe(txp, txpnext, &dd->ipath_sdma_activelist,
+                                list) {
+               txp->callback_status = IPATH_SDMA_TXREQ_S_SHUTDOWN;
+               if (txp->flags & IPATH_SDMA_TXREQ_F_VL15)
+                       vl15_watchdog_deq(dd);
+               list_move_tail(&txp->list, &dd->ipath_sdma_notifylist);
+       }
+       spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+
+       sdma_notify_taskbody(dd);
+
+       del_timer_sync(&dd->ipath_sdma_vl15_timer);
+
+       spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+
+       dd->ipath_sdma_abort_jiffies = 0;
+
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabase, 0);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmalengen, 0);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail, 0);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmaheadaddr, 0);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask0, 0);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask1, 0);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask2, 0);
+
+       if (dd->ipath_sdma_head_dma) {
+               sdma_head_dma = (void *) dd->ipath_sdma_head_dma;
+               sdma_head_phys = dd->ipath_sdma_head_phys;
+               dd->ipath_sdma_head_dma = NULL;
+               dd->ipath_sdma_head_phys = 0;
+       }
+
+       if (dd->ipath_sdma_descq) {
+               sdma_descq = dd->ipath_sdma_descq;
+               sdma_descq_phys = dd->ipath_sdma_descq_phys;
+               dd->ipath_sdma_descq = NULL;
+               dd->ipath_sdma_descq_phys = 0;
+       }
+
+       spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+
+       if (sdma_head_dma)
+               dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
+                                 sdma_head_dma, sdma_head_phys);
+
+       if (sdma_descq)
+               dma_free_coherent(&dd->pcidev->dev, SDMA_DESCQ_SZ,
+                                 sdma_descq, sdma_descq_phys);
+}
+
+/*
+ * [Re]start SDMA, if we use it, and it's not already OK.
+ * This is called on transition to link ACTIVE, either the first or
+ * subsequent times.
+ */
+void ipath_restart_sdma(struct ipath_devdata *dd)
+{
+       unsigned long flags;
+       int needed = 1;
+
+       if (!(dd->ipath_flags & IPATH_HAS_SEND_DMA))
+               goto bail;
+
+       /*
+        * First, make sure we actually should restart, which is to say,
+        * check that we are "RUNNING" (not in teardown) and not
+        * "SHUTDOWN".
+        */
+       spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+       if (!test_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status)
+               || test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
+                       needed = 0;
+       else {
+               __clear_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status);
+               __clear_bit(IPATH_SDMA_DISARMED, &dd->ipath_sdma_status);
+               __clear_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status);
+       }
+       spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+       if (!needed) {
+               ipath_dbg("invalid attempt to restart SDMA, status 0x%08lx\n",
+                       dd->ipath_sdma_status);
+               goto bail;
+       }
+       spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+       /*
+        * Clear first, just to be safe.  The chip only acts on the
+        * enable on a 0->1 transition.
+        */
+       dd->ipath_sendctrl &= ~INFINIPATH_S_SDMAENABLE;
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
+       ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+       dd->ipath_sendctrl |= INFINIPATH_S_SDMAENABLE;
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
+       ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+       spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+
+       /* notify upper layers */
+       ipath_ib_piobufavail(dd->verbs_dev);
+
+bail:
+       return;
+}
+
+static inline void make_sdma_desc(struct ipath_devdata *dd,
+       u64 *sdmadesc, u64 addr, u64 dwlen, u64 dwoffset)
+{
+       WARN_ON(addr & 3);
+       /* SDmaPhyAddr[47:32] */
+       sdmadesc[1] = addr >> 32;
+       /* SDmaPhyAddr[31:0] */
+       sdmadesc[0] = (addr & 0xfffffffcULL) << 32;
+       /* SDmaGeneration[1:0] */
+       sdmadesc[0] |= (dd->ipath_sdma_generation & 3ULL) << 30;
+       /* SDmaDwordCount[10:0] */
+       sdmadesc[0] |= (dwlen & 0x7ffULL) << 16;
+       /* SDmaBufOffset[12:2] */
+       sdmadesc[0] |= dwoffset & 0x7ffULL;
+}
+
+/*
+ * This function queues one IB packet onto the send DMA queue per call.
+ * The caller is responsible for checking:
+ * 1) The number of send DMA descriptor entries is less than the size of
+ *    the descriptor queue.
+ * 2) The IB SGE addresses and lengths are 32-bit aligned
+ *    (except possibly the last SGE's length)
+ * 3) The SGE addresses are suitable for passing to dma_map_single().
+ */
+int ipath_sdma_verbs_send(struct ipath_devdata *dd,
+       struct ipath_sge_state *ss, u32 dwords,
+       struct ipath_verbs_txreq *tx)
+{
+
+       unsigned long flags;
+       struct ipath_sge *sge;
+       int ret = 0;
+       u16 tail;
+       __le64 *descqp;
+       u64 sdmadesc[2];
+       u32 dwoffset;
+       dma_addr_t addr;
+
+       if ((tx->map_len + (dwords<<2)) > dd->ipath_ibmaxlen) {
+               ipath_dbg("packet size %X > ibmax %X, fail\n",
+                       tx->map_len + (dwords<<2), dd->ipath_ibmaxlen);
+               ret = -EMSGSIZE;
+               goto fail;
+       }
+
+       spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+
+retry:
+       if (unlikely(test_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status))) {
+               ret = -EBUSY;
+               goto unlock;
+       }
+
+       if (tx->txreq.sg_count > ipath_sdma_descq_freecnt(dd)) {
+               if (ipath_sdma_make_progress(dd))
+                       goto retry;
+               ret = -ENOBUFS;
+               goto unlock;
+       }
+
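+       /*
+        * The first descriptor maps the packet header (tx->map_len bytes);
+        * the payload SGE descriptors follow, each offset by the dwords
+        * already queued.
+        */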
+       addr = dma_map_single(&dd->pcidev->dev, tx->txreq.map_addr,
+                             tx->map_len, DMA_TO_DEVICE);
+       if (dma_mapping_error(&dd->pcidev->dev, addr))
+               goto ioerr;
+
+       dwoffset = tx->map_len >> 2;
+       make_sdma_desc(dd, sdmadesc, (u64) addr, dwoffset, 0);
+
+       /* SDmaFirstDesc */
+       sdmadesc[0] |= 1ULL << 12;
+       if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_USELARGEBUF)
+               sdmadesc[0] |= 1ULL << 14;      /* SDmaUseLargeBuf */
+
+       /* write to the descq */
+       tail = dd->ipath_sdma_descq_tail;
+       descqp = &dd->ipath_sdma_descq[tail].qw[0];
+       *descqp++ = cpu_to_le64(sdmadesc[0]);
+       *descqp++ = cpu_to_le64(sdmadesc[1]);
+
+       if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_FREEDESC)
+               tx->txreq.start_idx = tail;
+
+       /* increment the tail */
+       if (++tail == dd->ipath_sdma_descq_cnt) {
+               tail = 0;
+               descqp = &dd->ipath_sdma_descq[0].qw[0];
+               ++dd->ipath_sdma_generation;
+       }
+
+       sge = &ss->sge;
+       while (dwords) {
+               u32 dw;
+               u32 len;
+
+               len = dwords << 2;
+               if (len > sge->length)
+                       len = sge->length;
+               if (len > sge->sge_length)
+                       len = sge->sge_length;
+               BUG_ON(len == 0);
+               dw = (len + 3) >> 2;
+               addr = dma_map_single(&dd->pcidev->dev, sge->vaddr, dw << 2,
+                                     DMA_TO_DEVICE);
+               if (dma_mapping_error(&dd->pcidev->dev, addr))
+                       goto unmap;
+               make_sdma_desc(dd, sdmadesc, (u64) addr, dw, dwoffset);
+               /* SDmaUseLargeBuf has to be set in every descriptor */
+               if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_USELARGEBUF)
+                       sdmadesc[0] |= 1ULL << 14;
+               /* write to the descq */
+               *descqp++ = cpu_to_le64(sdmadesc[0]);
+               *descqp++ = cpu_to_le64(sdmadesc[1]);
+
+               /* increment the tail */
+               if (++tail == dd->ipath_sdma_descq_cnt) {
+                       tail = 0;
+                       descqp = &dd->ipath_sdma_descq[0].qw[0];
+                       ++dd->ipath_sdma_generation;
+               }
+               sge->vaddr += len;
+               sge->length -= len;
+               sge->sge_length -= len;
+               if (sge->sge_length == 0) {
+                       if (--ss->num_sge)
+                               *sge = *ss->sg_list++;
+               } else if (sge->length == 0 && sge->mr != NULL) {
+                       if (++sge->n >= IPATH_SEGSZ) {
+                               if (++sge->m >= sge->mr->mapsz)
+                                       break;
+                               sge->n = 0;
+                       }
+                       sge->vaddr =
+                               sge->mr->map[sge->m]->segs[sge->n].vaddr;
+                       sge->length =
+                               sge->mr->map[sge->m]->segs[sge->n].length;
+               }
+
+               dwoffset += dw;
+               dwords -= dw;
+       }
+
+       if (!tail)
+               descqp = &dd->ipath_sdma_descq[dd->ipath_sdma_descq_cnt].qw[0];
+       descqp -= 2;
+       /* SDmaLastDesc */
+       descqp[0] |= cpu_to_le64(1ULL << 11);
+       if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_INTREQ) {
+               /* SDmaIntReq */
+               descqp[0] |= cpu_to_le64(1ULL << 15);
+       }
+
+       /* Commit writes to memory and advance the tail on the chip */
+       wmb();
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail, tail);
+
+       tx->txreq.next_descq_idx = tail;
+       tx->txreq.callback_status = IPATH_SDMA_TXREQ_S_OK;
+       dd->ipath_sdma_descq_tail = tail;
+       dd->ipath_sdma_descq_added += tx->txreq.sg_count;
+       list_add_tail(&tx->txreq.list, &dd->ipath_sdma_activelist);
+       if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_VL15)
+               vl15_watchdog_enq(dd);
+       goto unlock;
+
+unmap:
+       while (tail != dd->ipath_sdma_descq_tail) {
+               if (!tail)
+                       tail = dd->ipath_sdma_descq_cnt - 1;
+               else
+                       tail--;
+               unmap_desc(dd, tail);
+       }
+ioerr:
+       ret = -EIO;
+unlock:
+       spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+fail:
+       return ret;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_srq.c b/drivers/staging/rdma/ipath/ipath_srq.c
new file mode 100644 (file)
index 0000000..2627198
--- /dev/null
@@ -0,0 +1,380 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include "ipath_verbs.h"
+
+/**
+ * ipath_post_srq_receive - post a receive on a shared receive queue
+ * @ibsrq: the SRQ to post the receive on
+ * @wr: the list of work requests to post
+ * @bad_wr: the first WR to cause a problem is put here
+ *
+ * This may be called from interrupt context.
+ */
+int ipath_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+                          struct ib_recv_wr **bad_wr)
+{
+       struct ipath_srq *srq = to_isrq(ibsrq);
+       struct ipath_rwq *wq;
+       unsigned long flags;
+       int ret;
+
+       for (; wr; wr = wr->next) {
+               struct ipath_rwqe *wqe;
+               u32 next;
+               int i;
+
+               if ((unsigned) wr->num_sge > srq->rq.max_sge) {
+                       *bad_wr = wr;
+                       ret = -EINVAL;
+                       goto bail;
+               }
+
+               spin_lock_irqsave(&srq->rq.lock, flags);
+               wq = srq->rq.wq;
+               next = wq->head + 1;
+               if (next >= srq->rq.size)
+                       next = 0;
+               if (next == wq->tail) {
+                       spin_unlock_irqrestore(&srq->rq.lock, flags);
+                       *bad_wr = wr;
+                       ret = -ENOMEM;
+                       goto bail;
+               }
+
+               wqe = get_rwqe_ptr(&srq->rq, wq->head);
+               wqe->wr_id = wr->wr_id;
+               wqe->num_sge = wr->num_sge;
+               for (i = 0; i < wr->num_sge; i++)
+                       wqe->sg_list[i] = wr->sg_list[i];
+               /* Make sure queue entry is written before the head index. */
+               smp_wmb();
+               wq->head = next;
+               spin_unlock_irqrestore(&srq->rq.lock, flags);
+       }
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+/**
+ * ipath_create_srq - create a shared receive queue
+ * @ibpd: the protection domain of the SRQ to create
+ * @srq_init_attr: the attributes of the SRQ
+ * @udata: data from libipathverbs when creating a user SRQ
+ */
+struct ib_srq *ipath_create_srq(struct ib_pd *ibpd,
+                               struct ib_srq_init_attr *srq_init_attr,
+                               struct ib_udata *udata)
+{
+       struct ipath_ibdev *dev = to_idev(ibpd->device);
+       struct ipath_srq *srq;
+       u32 sz;
+       struct ib_srq *ret;
+
+       if (srq_init_attr->srq_type != IB_SRQT_BASIC) {
+               ret = ERR_PTR(-ENOSYS);
+               goto done;
+       }
+
+       if (srq_init_attr->attr.max_wr == 0) {
+               ret = ERR_PTR(-EINVAL);
+               goto done;
+       }
+
+       if ((srq_init_attr->attr.max_sge > ib_ipath_max_srq_sges) ||
+           (srq_init_attr->attr.max_wr > ib_ipath_max_srq_wrs)) {
+               ret = ERR_PTR(-EINVAL);
+               goto done;
+       }
+
+       srq = kmalloc(sizeof(*srq), GFP_KERNEL);
+       if (!srq) {
+               ret = ERR_PTR(-ENOMEM);
+               goto done;
+       }
+
+       /*
+        * Need to use vmalloc() if we want to support large #s of entries.
+        */
+       srq->rq.size = srq_init_attr->attr.max_wr + 1;
+       srq->rq.max_sge = srq_init_attr->attr.max_sge;
+       sz = sizeof(struct ib_sge) * srq->rq.max_sge +
+               sizeof(struct ipath_rwqe);
+       srq->rq.wq = vmalloc_user(sizeof(struct ipath_rwq) + srq->rq.size * sz);
+       if (!srq->rq.wq) {
+               ret = ERR_PTR(-ENOMEM);
+               goto bail_srq;
+       }
+
+       /*
+        * Return the address of the RWQ as the offset to mmap.
+        * See ipath_mmap() for details.
+        */
+       if (udata && udata->outlen >= sizeof(__u64)) {
+               int err;
+               u32 s = sizeof(struct ipath_rwq) + srq->rq.size * sz;
+
+               srq->ip =
+                   ipath_create_mmap_info(dev, s,
+                                          ibpd->uobject->context,
+                                          srq->rq.wq);
+               if (!srq->ip) {
+                       ret = ERR_PTR(-ENOMEM);
+                       goto bail_wq;
+               }
+
+               err = ib_copy_to_udata(udata, &srq->ip->offset,
+                                      sizeof(srq->ip->offset));
+               if (err) {
+                       ret = ERR_PTR(err);
+                       goto bail_ip;
+               }
+       } else
+               srq->ip = NULL;
+
+       /*
+        * ib_create_srq() will initialize srq->ibsrq.
+        */
+       spin_lock_init(&srq->rq.lock);
+       srq->rq.wq->head = 0;
+       srq->rq.wq->tail = 0;
+       srq->limit = srq_init_attr->attr.srq_limit;
+
+       spin_lock(&dev->n_srqs_lock);
+       if (dev->n_srqs_allocated == ib_ipath_max_srqs) {
+               spin_unlock(&dev->n_srqs_lock);
+               ret = ERR_PTR(-ENOMEM);
+               goto bail_ip;
+       }
+
+       dev->n_srqs_allocated++;
+       spin_unlock(&dev->n_srqs_lock);
+
+       if (srq->ip) {
+               spin_lock_irq(&dev->pending_lock);
+               list_add(&srq->ip->pending_mmaps, &dev->pending_mmaps);
+               spin_unlock_irq(&dev->pending_lock);
+       }
+
+       ret = &srq->ibsrq;
+       goto done;
+
+bail_ip:
+       kfree(srq->ip);
+bail_wq:
+       vfree(srq->rq.wq);
+bail_srq:
+       kfree(srq);
+done:
+       return ret;
+}
+
+/**
+ * ipath_modify_srq - modify a shared receive queue
+ * @ibsrq: the SRQ to modify
+ * @attr: the new attributes of the SRQ
+ * @attr_mask: indicates which attributes to modify
+ * @udata: user data for ipathverbs.so
+ */
+int ipath_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+                    enum ib_srq_attr_mask attr_mask,
+                    struct ib_udata *udata)
+{
+       struct ipath_srq *srq = to_isrq(ibsrq);
+       struct ipath_rwq *wq;
+       int ret = 0;
+
+       if (attr_mask & IB_SRQ_MAX_WR) {
+               struct ipath_rwq *owq;
+               struct ipath_rwqe *p;
+               u32 sz, size, n, head, tail;
+
+               /* Check that the requested sizes are below the limits. */
+               if ((attr->max_wr > ib_ipath_max_srq_wrs) ||
+                   ((attr_mask & IB_SRQ_LIMIT) ?
+                    attr->srq_limit : srq->limit) > attr->max_wr) {
+                       ret = -EINVAL;
+                       goto bail;
+               }
+
+               sz = sizeof(struct ipath_rwqe) +
+                       srq->rq.max_sge * sizeof(struct ib_sge);
+               size = attr->max_wr + 1;
+               wq = vmalloc_user(sizeof(struct ipath_rwq) + size * sz);
+               if (!wq) {
+                       ret = -ENOMEM;
+                       goto bail;
+               }
+
+               /* Check that we can write the offset to mmap. */
+               if (udata && udata->inlen >= sizeof(__u64)) {
+                       __u64 offset_addr;
+                       __u64 offset = 0;
+
+                       ret = ib_copy_from_udata(&offset_addr, udata,
+                                                sizeof(offset_addr));
+                       if (ret)
+                               goto bail_free;
+                       udata->outbuf =
+                               (void __user *) (unsigned long) offset_addr;
+                       ret = ib_copy_to_udata(udata, &offset,
+                                              sizeof(offset));
+                       if (ret)
+                               goto bail_free;
+               }
+
+               spin_lock_irq(&srq->rq.lock);
+               /*
+                * validate head pointer value and compute
+                * the number of remaining WQEs.
+                */
+               owq = srq->rq.wq;
+               head = owq->head;
+               if (head >= srq->rq.size)
+                       head = 0;
+               tail = owq->tail;
+               if (tail >= srq->rq.size)
+                       tail = 0;
+               n = head;
+               if (n < tail)
+                       n += srq->rq.size - tail;
+               else
+                       n -= tail;
+               if (size <= n) {
+                       ret = -EINVAL;
+                       goto bail_unlock;
+               }
+               n = 0;
+               p = wq->wq;
+               while (tail != head) {
+                       struct ipath_rwqe *wqe;
+                       int i;
+
+                       wqe = get_rwqe_ptr(&srq->rq, tail);
+                       p->wr_id = wqe->wr_id;
+                       p->num_sge = wqe->num_sge;
+                       for (i = 0; i < wqe->num_sge; i++)
+                               p->sg_list[i] = wqe->sg_list[i];
+                       n++;
+                       p = (struct ipath_rwqe *)((char *) p + sz);
+                       if (++tail >= srq->rq.size)
+                               tail = 0;
+               }
+               srq->rq.wq = wq;
+               srq->rq.size = size;
+               wq->head = n;
+               wq->tail = 0;
+               if (attr_mask & IB_SRQ_LIMIT)
+                       srq->limit = attr->srq_limit;
+               spin_unlock_irq(&srq->rq.lock);
+
+               vfree(owq);
+
+               if (srq->ip) {
+                       struct ipath_mmap_info *ip = srq->ip;
+                       struct ipath_ibdev *dev = to_idev(srq->ibsrq.device);
+                       u32 s = sizeof(struct ipath_rwq) + size * sz;
+
+                       ipath_update_mmap_info(dev, ip, s, wq);
+
+                       /*
+                        * Return the offset to mmap.
+                        * See ipath_mmap() for details.
+                        */
+                       if (udata && udata->inlen >= sizeof(__u64)) {
+                               ret = ib_copy_to_udata(udata, &ip->offset,
+                                                      sizeof(ip->offset));
+                               if (ret)
+                                       goto bail;
+                       }
+
+                       spin_lock_irq(&dev->pending_lock);
+                       if (list_empty(&ip->pending_mmaps))
+                               list_add(&ip->pending_mmaps,
+                                        &dev->pending_mmaps);
+                       spin_unlock_irq(&dev->pending_lock);
+               }
+       } else if (attr_mask & IB_SRQ_LIMIT) {
+               spin_lock_irq(&srq->rq.lock);
+               if (attr->srq_limit >= srq->rq.size)
+                       ret = -EINVAL;
+               else
+                       srq->limit = attr->srq_limit;
+               spin_unlock_irq(&srq->rq.lock);
+       }
+       goto bail;
+
+bail_unlock:
+       spin_unlock_irq(&srq->rq.lock);
+bail_free:
+       vfree(wq);
+bail:
+       return ret;
+}
+
+int ipath_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr)
+{
+       struct ipath_srq *srq = to_isrq(ibsrq);
+
+       attr->max_wr = srq->rq.size - 1;
+       attr->max_sge = srq->rq.max_sge;
+       attr->srq_limit = srq->limit;
+       return 0;
+}
+
+/**
+ * ipath_destroy_srq - destroy a shared receive queue
+ * @ibsrq: the SRQ to destroy
+ */
+int ipath_destroy_srq(struct ib_srq *ibsrq)
+{
+       struct ipath_srq *srq = to_isrq(ibsrq);
+       struct ipath_ibdev *dev = to_idev(ibsrq->device);
+
+       spin_lock(&dev->n_srqs_lock);
+       dev->n_srqs_allocated--;
+       spin_unlock(&dev->n_srqs_lock);
+       if (srq->ip)
+               kref_put(&srq->ip->ref, ipath_release_mmap_info);
+       else
+               vfree(srq->rq.wq);
+       kfree(srq);
+
+       return 0;
+}
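Both ipath_post_srq_receive() and ipath_modify_srq() above treat the receive queue as a circular buffer of max_wr + 1 slots: the queue is full when advancing head would land on tail, and the number of posted entries is head minus tail modulo the ring size. A small sketch of that arithmetic, using hypothetical helper names (rq_next, rq_full, rq_count) rather than driver functions:

	static unsigned rq_next(unsigned head, unsigned size)
	{
		unsigned next = head + 1;

		return next >= size ? 0 : next;
	}

	static int rq_full(unsigned head, unsigned tail, unsigned size)
	{
		/* one slot is deliberately left empty, which is why
		 * ipath_create_srq() allocates max_wr + 1 entries */
		return rq_next(head, size) == tail;
	}

	static unsigned rq_count(unsigned head, unsigned tail, unsigned size)
	{
		/* posted entries, as computed before resizing in
		 * ipath_modify_srq() */
		return head >= tail ? head - tail : head + size - tail;
	}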
diff --git a/drivers/staging/rdma/ipath/ipath_stats.c b/drivers/staging/rdma/ipath/ipath_stats.c
new file mode 100644 (file)
index 0000000..f63e143
--- /dev/null
@@ -0,0 +1,347 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ipath_kernel.h"
+
+struct infinipath_stats ipath_stats;
+
+/**
+ * ipath_snap_cntr - snapshot a chip counter
+ * @dd: the infinipath device
+ * @creg: the counter to snapshot
+ *
+ * called from add_timer and user counter read calls, to deal with
+ * counters that wrap in "human time".  The words sent and received, and
+ * the packets sent and received are all that we worry about.  For now,
+ * at least, we don't worry about error counters, because if they wrap
+ * that quickly, we probably don't care.  We may eventually just make this
+ * handle all the counters.  word counters can wrap in about 20 seconds
+ * of full bandwidth traffic, packet counters in a few hours.
+ */
+
+u64 ipath_snap_cntr(struct ipath_devdata *dd, ipath_creg creg)
+{
+       u32 val, reg64 = 0;
+       u64 val64;
+       unsigned long t0, t1;
+       u64 ret;
+
+       t0 = jiffies;
+       /* If fast increment counters are only 32 bits, snapshot them,
+        * and maintain them as 64-bit values in the driver */
+       if (!(dd->ipath_flags & IPATH_32BITCOUNTERS) &&
+           (creg == dd->ipath_cregs->cr_wordsendcnt ||
+            creg == dd->ipath_cregs->cr_wordrcvcnt ||
+            creg == dd->ipath_cregs->cr_pktsendcnt ||
+            creg == dd->ipath_cregs->cr_pktrcvcnt)) {
+               val64 = ipath_read_creg(dd, creg);
+               val = val64 == ~0ULL ? ~0U : 0;
+               reg64 = 1;
+       } else                  /* val64 just to keep gcc quiet... */
+               val64 = val = ipath_read_creg32(dd, creg);
+       /*
+        * See if a second has passed.  This is just a way to detect things
+        * that are quite broken.  Normally this should take just a few
+        * cycles (the check is for long enough that we don't care if we get
+        * pre-empted.)  An Opteron HT O read timeout is 4 seconds with
+        * normal NB values
+        */
+       t1 = jiffies;
+       if (time_before(t0 + HZ, t1) && val == -1) {
+               ipath_dev_err(dd, "Error!  Read counter 0x%x timed out\n",
+                             creg);
+               ret = 0ULL;
+               goto bail;
+       }
+       if (reg64) {
+               ret = val64;
+               goto bail;
+       }
+
+       if (creg == dd->ipath_cregs->cr_wordsendcnt) {
+               if (val != dd->ipath_lastsword) {
+                       dd->ipath_sword += val - dd->ipath_lastsword;
+                       dd->ipath_lastsword = val;
+               }
+               val64 = dd->ipath_sword;
+       } else if (creg == dd->ipath_cregs->cr_wordrcvcnt) {
+               if (val != dd->ipath_lastrword) {
+                       dd->ipath_rword += val - dd->ipath_lastrword;
+                       dd->ipath_lastrword = val;
+               }
+               val64 = dd->ipath_rword;
+       } else if (creg == dd->ipath_cregs->cr_pktsendcnt) {
+               if (val != dd->ipath_lastspkts) {
+                       dd->ipath_spkts += val - dd->ipath_lastspkts;
+                       dd->ipath_lastspkts = val;
+               }
+               val64 = dd->ipath_spkts;
+       } else if (creg == dd->ipath_cregs->cr_pktrcvcnt) {
+               if (val != dd->ipath_lastrpkts) {
+                       dd->ipath_rpkts += val - dd->ipath_lastrpkts;
+                       dd->ipath_lastrpkts = val;
+               }
+               val64 = dd->ipath_rpkts;
+       } else if (creg == dd->ipath_cregs->cr_ibsymbolerrcnt) {
+               if (dd->ibdeltainprog)
+                       val64 -= val64 - dd->ibsymsnap;
+               val64 -= dd->ibsymdelta;
+       } else if (creg == dd->ipath_cregs->cr_iblinkerrrecovcnt) {
+               if (dd->ibdeltainprog)
+                       val64 -= val64 - dd->iblnkerrsnap;
+               val64 -= dd->iblnkerrdelta;
+       } else
+               val64 = (u64) val;
+
+       ret = val64;
+
+bail:
+       return ret;
+}
+
+/**
+ * ipath_qcheck - print delta of egrfull/hdrqfull errors for kernel ports
+ * @dd: the infinipath device
+ *
+ * print the delta of egrfull/hdrqfull errors for kernel ports no more than
+ * every 5 seconds.  User processes are printed at close, but the kernel
+ * doesn't close, so...  Separate routine so it may be called from other
+ * places someday, and so the function name printed by _IPATH_INFO is meaningful.
+ */
+static void ipath_qcheck(struct ipath_devdata *dd)
+{
+       static u64 last_tot_hdrqfull;
+       struct ipath_portdata *pd = dd->ipath_pd[0];
+       size_t blen = 0;
+       char buf[128];
+       u32 hdrqtail;
+
+       *buf = 0;
+       if (pd->port_hdrqfull != dd->ipath_p0_hdrqfull) {
+               blen = snprintf(buf, sizeof buf, "port 0 hdrqfull %u",
+                               pd->port_hdrqfull -
+                               dd->ipath_p0_hdrqfull);
+               dd->ipath_p0_hdrqfull = pd->port_hdrqfull;
+       }
+       if (ipath_stats.sps_etidfull != dd->ipath_last_tidfull) {
+               blen += snprintf(buf + blen, sizeof buf - blen,
+                                "%srcvegrfull %llu",
+                                blen ? ", " : "",
+                                (unsigned long long)
+                                (ipath_stats.sps_etidfull -
+                                 dd->ipath_last_tidfull));
+               dd->ipath_last_tidfull = ipath_stats.sps_etidfull;
+       }
+
+       /*
+        * this is actually the number of hdrq full interrupts, not actual
+        * events, but at the moment that's mostly what I'm interested in.
+        * Actual count, etc. is in the counters, if needed.  For production
+        * users this won't ordinarily be printed.
+        */
+
+       if ((ipath_debug & (__IPATH_PKTDBG | __IPATH_DBG)) &&
+           ipath_stats.sps_hdrqfull != last_tot_hdrqfull) {
+               blen += snprintf(buf + blen, sizeof buf - blen,
+                                "%shdrqfull %llu (all ports)",
+                                blen ? ", " : "",
+                                (unsigned long long)
+                                (ipath_stats.sps_hdrqfull -
+                                 last_tot_hdrqfull));
+               last_tot_hdrqfull = ipath_stats.sps_hdrqfull;
+       }
+       if (blen)
+               ipath_dbg("%s\n", buf);
+
+       hdrqtail = ipath_get_hdrqtail(pd);
+       if (pd->port_head != hdrqtail) {
+               if (dd->ipath_lastport0rcv_cnt ==
+                   ipath_stats.sps_port0pkts) {
+                       ipath_cdbg(PKT, "missing rcv interrupts? "
+                                  "port0 hd=%x tl=%x; port0pkts %llx; write"
+                                  " hd (w/intr)\n",
+                                  pd->port_head, hdrqtail,
+                                  (unsigned long long)
+                                  ipath_stats.sps_port0pkts);
+                       ipath_write_ureg(dd, ur_rcvhdrhead, hdrqtail |
+                               dd->ipath_rhdrhead_intr_off, pd->port_port);
+               }
+               dd->ipath_lastport0rcv_cnt = ipath_stats.sps_port0pkts;
+       }
+}
+
+static void ipath_chk_errormask(struct ipath_devdata *dd)
+{
+       static u32 fixed;
+       u32 ctrl;
+       unsigned long errormask;
+       unsigned long hwerrs;
+
+       if (!dd->ipath_errormask || !(dd->ipath_flags & IPATH_INITTED))
+               return;
+
+       errormask = ipath_read_kreg64(dd, dd->ipath_kregs->kr_errormask);
+
+       if (errormask == dd->ipath_errormask)
+               return;
+       fixed++;
+
+       hwerrs = ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus);
+       ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control);
+
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
+               dd->ipath_errormask);
+
+       if ((hwerrs & dd->ipath_hwerrmask) ||
+               (ctrl & INFINIPATH_C_FREEZEMODE)) {
+               /* force re-interrupt of pending events, just in case */
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, 0ULL);
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, 0ULL);
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, 0ULL);
+               dev_info(&dd->pcidev->dev,
+                       "errormask fixed(%u) %lx -> %lx, ctrl %x hwerr %lx\n",
+                       fixed, errormask, (unsigned long)dd->ipath_errormask,
+                       ctrl, hwerrs);
+       } else
+               ipath_dbg("errormask fixed(%u) %lx -> %lx, no freeze\n",
+                       fixed, errormask,
+                       (unsigned long)dd->ipath_errormask);
+}
+
+
+/**
+ * ipath_get_faststats - get word counters from chip before they overflow
+ * @opaque - contains a pointer to the infinipath device ipath_devdata
+ *
+ * called from add_timer
+ */
+void ipath_get_faststats(unsigned long opaque)
+{
+       struct ipath_devdata *dd = (struct ipath_devdata *) opaque;
+       int i;
+       static unsigned cnt;
+       unsigned long flags;
+       u64 traffic_wds;
+
+       /*
+        * don't access the chip while running diags, or memory diags can
+        * fail
+        */
+       if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_INITTED) ||
+           ipath_diag_inuse)
+               /* but re-arm the timer for the diags case; won't hurt the others */
+               goto done;
+
+       /*
+        * We now try to maintain an "active timer", based on traffic
+        * exceeding a threshold, so we need to check the word-counts
+        * even if they are 64-bit.
+        */
+       traffic_wds = ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordsendcnt) +
+               ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordrcvcnt);
+       spin_lock_irqsave(&dd->ipath_eep_st_lock, flags);
+       traffic_wds -= dd->ipath_traffic_wds;
+       dd->ipath_traffic_wds += traffic_wds;
+       if (traffic_wds  >= IPATH_TRAFFIC_ACTIVE_THRESHOLD)
+               atomic_add(5, &dd->ipath_active_time); /* S/B #define */
+       spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags);
+
+       if (dd->ipath_flags & IPATH_32BITCOUNTERS) {
+               ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktsendcnt);
+               ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktrcvcnt);
+       }
+
+       ipath_qcheck(dd);
+
+       /*
+        * deal with repeat error suppression.  Doesn't really matter if
+        * last error was almost a full interval ago, or just a few usecs
+        * ago; still won't get more than 2 per interval.  We may want
+        * longer intervals for this eventually, could do with mod, counter
+        * or separate timer.  Also see code in ipath_handle_errors() and
+        * ipath_handle_hwerrors().
+        */
+
+       if (dd->ipath_lasterror)
+               dd->ipath_lasterror = 0;
+       if (dd->ipath_lasthwerror)
+               dd->ipath_lasthwerror = 0;
+       if (dd->ipath_maskederrs
+           && time_after(jiffies, dd->ipath_unmasktime)) {
+               char ebuf[256];
+               int iserr;
+               iserr = ipath_decode_err(dd, ebuf, sizeof ebuf,
+                                        dd->ipath_maskederrs);
+               if (dd->ipath_maskederrs &
+                   ~(INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL |
+                     INFINIPATH_E_PKTERRS))
+                       ipath_dev_err(dd, "Re-enabling masked errors "
+                                     "(%s)\n", ebuf);
+               else {
+                       /*
+                        * rcvegrfull and rcvhdrqfull are "normal", for some
+                        * types of processes (mostly benchmarks) that send
+                        * huge numbers of messages, while not processing
+                        * them.  So only complain about these at debug
+                        * level.
+                        */
+                       if (iserr)
+                               ipath_dbg(
+                                       "Re-enabling queue full errors (%s)\n",
+                                       ebuf);
+                       else
+                               ipath_cdbg(ERRPKT, "Re-enabling packet"
+                                       " problem interrupt (%s)\n", ebuf);
+               }
+
+               /* re-enable masked errors */
+               dd->ipath_errormask |= dd->ipath_maskederrs;
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
+                                dd->ipath_errormask);
+               dd->ipath_maskederrs = 0;
+       }
+
+       /* limit qfull messages to ~one per minute per port */
+       if ((++cnt & 0x10)) {
+               for (i = (int) dd->ipath_cfgports; --i >= 0; ) {
+                       struct ipath_portdata *pd = dd->ipath_pd[i];
+
+                       if (pd && pd->port_lastrcvhdrqtail != -1)
+                               pd->port_lastrcvhdrqtail = -1;
+               }
+       }
+
+       ipath_chk_errormask(dd);
+done:
+       mod_timer(&dd->ipath_stats_timer, jiffies + HZ * 5);
+}
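ipath_snap_cntr() above widens the 32-bit chip counters by adding the unsigned delta since the previous snapshot into a 64-bit software total (ipath_sword, ipath_rword, and so on). This stays correct across a single counter wrap as long as the counter is sampled often enough, hence the 5-second stats timer and the roughly 20-second word-counter wrap time noted in the comment. A self-contained sketch of that accumulation, with hypothetical names (soft_cntr, soft_cntr_snap):

	#include <stdint.h>

	struct soft_cntr {
		uint64_t total;	/* 64-bit running value kept by software */
		uint32_t last;	/* last raw 32-bit hardware reading */
	};

	static uint64_t soft_cntr_snap(struct soft_cntr *c, uint32_t hw_val)
	{
		if (hw_val != c->last) {
			/* unsigned subtraction yields the right delta even
			 * if hw_val wrapped past c->last exactly once */
			c->total += (uint32_t)(hw_val - c->last);
			c->last = hw_val;
		}
		return c->total;
	}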
diff --git a/drivers/staging/rdma/ipath/ipath_sysfs.c b/drivers/staging/rdma/ipath/ipath_sysfs.c
new file mode 100644 (file)
index 0000000..75558f3
--- /dev/null
@@ -0,0 +1,1238 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/ctype.h>
+#include <linux/stat.h>
+
+#include "ipath_kernel.h"
+#include "ipath_verbs.h"
+#include "ipath_common.h"
+
+/**
+ * ipath_parse_ushort - parse an unsigned short value in an arbitrary base
+ * @str: the string containing the number
+ * @valp: where to put the result
+ *
+ * returns the number of bytes consumed, or negative value on error
+ */
+int ipath_parse_ushort(const char *str, unsigned short *valp)
+{
+       unsigned long val;
+       char *end;
+       int ret;
+
+       if (!isdigit(str[0])) {
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       val = simple_strtoul(str, &end, 0);
+
+       if (val > 0xffff) {
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       *valp = val;
+
+       ret = end + 1 - str;
+       if (ret == 0)
+               ret = -EINVAL;
+
+bail:
+       return ret;
+}
+
+static ssize_t show_version(struct device_driver *dev, char *buf)
+{
+       /* The string printed here is already newline-terminated. */
+       return scnprintf(buf, PAGE_SIZE, "%s", ib_ipath_version);
+}
+
+static ssize_t show_num_units(struct device_driver *dev, char *buf)
+{
+       return scnprintf(buf, PAGE_SIZE, "%d\n",
+                        ipath_count_units(NULL, NULL, NULL));
+}
+
+static ssize_t show_status(struct device *dev,
+                          struct device_attribute *attr,
+                          char *buf)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       ssize_t ret;
+
+       if (!dd->ipath_statusp) {
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       ret = scnprintf(buf, PAGE_SIZE, "0x%llx\n",
+                       (unsigned long long) *(dd->ipath_statusp));
+
+bail:
+       return ret;
+}
+
+static const char *ipath_status_str[] = {
+       "Initted",
+       "Disabled",
+       "Admin_Disabled",
+       "", /* This used to be the old "OIB_SMA" status. */
+       "", /* This used to be the old "SMA" status. */
+       "Present",
+       "IB_link_up",
+       "IB_configured",
+       "NoIBcable",
+       "Fatal_Hardware_Error",
+       NULL,
+};
+
+static ssize_t show_status_str(struct device *dev,
+                              struct device_attribute *attr,
+                              char *buf)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       int i, any;
+       u64 s;
+       ssize_t ret;
+
+       if (!dd->ipath_statusp) {
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       s = *(dd->ipath_statusp);
+       *buf = '\0';
+       for (any = i = 0; s && ipath_status_str[i]; i++) {
+               if (s & 1) {
+                       if (any && strlcat(buf, " ", PAGE_SIZE) >=
+                           PAGE_SIZE)
+                               /* overflow */
+                               break;
+                       if (strlcat(buf, ipath_status_str[i],
+                                   PAGE_SIZE) >= PAGE_SIZE)
+                               break;
+                       any = 1;
+               }
+               s >>= 1;
+       }
+       if (any)
+               strlcat(buf, "\n", PAGE_SIZE);
+
+       ret = strlen(buf);
+
+bail:
+       return ret;
+}
+
+static ssize_t show_boardversion(struct device *dev,
+                              struct device_attribute *attr,
+                              char *buf)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       /* The string printed here is already newline-terminated. */
+       return scnprintf(buf, PAGE_SIZE, "%s", dd->ipath_boardversion);
+}
+
+static ssize_t show_localbus_info(struct device *dev,
+                              struct device_attribute *attr,
+                              char *buf)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       /* The string printed here is already newline-terminated. */
+       return scnprintf(buf, PAGE_SIZE, "%s", dd->ipath_lbus_info);
+}
+
+static ssize_t show_lmc(struct device *dev,
+                       struct device_attribute *attr,
+                       char *buf)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+       return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_lmc);
+}
+
+static ssize_t store_lmc(struct device *dev,
+                        struct device_attribute *attr,
+                        const char *buf,
+                        size_t count)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       u16 lmc = 0;
+       int ret;
+
+       ret = ipath_parse_ushort(buf, &lmc);
+       if (ret < 0)
+               goto invalid;
+
+       if (lmc > 7) {
+               ret = -EINVAL;
+               goto invalid;
+       }
+
+       ipath_set_lid(dd, dd->ipath_lid, lmc);
+
+       goto bail;
+invalid:
+       ipath_dev_err(dd, "attempt to set invalid LMC %u\n", lmc);
+bail:
+       return ret;
+}
+
+static ssize_t show_lid(struct device *dev,
+                       struct device_attribute *attr,
+                       char *buf)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+       return scnprintf(buf, PAGE_SIZE, "0x%x\n", dd->ipath_lid);
+}
+
+static ssize_t store_lid(struct device *dev,
+                        struct device_attribute *attr,
+                         const char *buf,
+                         size_t count)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       u16 lid = 0;
+       int ret;
+
+       ret = ipath_parse_ushort(buf, &lid);
+       if (ret < 0)
+               goto invalid;
+
+       if (lid == 0 || lid >= IPATH_MULTICAST_LID_BASE) {
+               ret = -EINVAL;
+               goto invalid;
+       }
+
+       ipath_set_lid(dd, lid, dd->ipath_lmc);
+
+       goto bail;
+invalid:
+       ipath_dev_err(dd, "attempt to set invalid LID 0x%x\n", lid);
+bail:
+       return ret;
+}
+
+static ssize_t show_mlid(struct device *dev,
+                        struct device_attribute *attr,
+                        char *buf)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+       return scnprintf(buf, PAGE_SIZE, "0x%x\n", dd->ipath_mlid);
+}
+
+static ssize_t store_mlid(struct device *dev,
+                        struct device_attribute *attr,
+                         const char *buf,
+                         size_t count)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       u16 mlid;
+       int ret;
+
+       ret = ipath_parse_ushort(buf, &mlid);
+       if (ret < 0 || mlid < IPATH_MULTICAST_LID_BASE)
+               goto invalid;
+
+       dd->ipath_mlid = mlid;
+
+       goto bail;
+invalid:
+       ipath_dev_err(dd, "attempt to set invalid MLID\n");
+bail:
+       return ret;
+}
+
+static ssize_t show_guid(struct device *dev,
+                        struct device_attribute *attr,
+                        char *buf)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       u8 *guid;
+
+       guid = (u8 *) & (dd->ipath_guid);
+
+       return scnprintf(buf, PAGE_SIZE,
+                        "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n",
+                        guid[0], guid[1], guid[2], guid[3],
+                        guid[4], guid[5], guid[6], guid[7]);
+}
+
+static ssize_t store_guid(struct device *dev,
+                        struct device_attribute *attr,
+                         const char *buf,
+                         size_t count)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       ssize_t ret;
+       unsigned short guid[8];
+       __be64 new_guid;
+       u8 *ng;
+       int i;
+
+       if (sscanf(buf, "%hx:%hx:%hx:%hx:%hx:%hx:%hx:%hx",
+                  &guid[0], &guid[1], &guid[2], &guid[3],
+                  &guid[4], &guid[5], &guid[6], &guid[7]) != 8)
+               goto invalid;
+
+       ng = (u8 *) &new_guid;
+
+       for (i = 0; i < 8; i++) {
+               if (guid[i] > 0xff)
+                       goto invalid;
+               ng[i] = guid[i];
+       }
+
+       if (new_guid == 0)
+               goto invalid;
+
+       dd->ipath_guid = new_guid;
+       dd->ipath_nguid = 1;
+       if (dd->verbs_dev)
+               dd->verbs_dev->ibdev.node_guid = new_guid;
+
+       ret = strlen(buf);
+       goto bail;
+
+invalid:
+       ipath_dev_err(dd, "attempt to set invalid GUID\n");
+       ret = -EINVAL;
+
+bail:
+       return ret;
+}
+
+static ssize_t show_nguid(struct device *dev,
+                         struct device_attribute *attr,
+                         char *buf)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+       return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_nguid);
+}
+
+static ssize_t show_nports(struct device *dev,
+                          struct device_attribute *attr,
+                          char *buf)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+       /* Return the number of user ports available. */
+       return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_cfgports - 1);
+}
+
+static ssize_t show_serial(struct device *dev,
+                          struct device_attribute *attr,
+                          char *buf)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+       buf[sizeof dd->ipath_serial] = '\0';
+       memcpy(buf, dd->ipath_serial, sizeof dd->ipath_serial);
+       strcat(buf, "\n");
+       return strlen(buf);
+}
+
+static ssize_t show_unit(struct device *dev,
+                        struct device_attribute *attr,
+                        char *buf)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+       return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_unit);
+}
+
+static ssize_t show_jint_max_packets(struct device *dev,
+                                    struct device_attribute *attr,
+                                    char *buf)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+       return scnprintf(buf, PAGE_SIZE, "%hu\n", dd->ipath_jint_max_packets);
+}
+
+static ssize_t store_jint_max_packets(struct device *dev,
+                                     struct device_attribute *attr,
+                                     const char *buf,
+                                     size_t count)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       u16 v = 0;
+       int ret;
+
+       ret = ipath_parse_ushort(buf, &v);
+       if (ret < 0)
+               ipath_dev_err(dd, "invalid jint_max_packets.\n");
+       else
+               dd->ipath_f_config_jint(dd, dd->ipath_jint_idle_ticks, v);
+
+       return ret;
+}
+
+static ssize_t show_jint_idle_ticks(struct device *dev,
+                                   struct device_attribute *attr,
+                                   char *buf)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+       return scnprintf(buf, PAGE_SIZE, "%hu\n", dd->ipath_jint_idle_ticks);
+}
+
+static ssize_t store_jint_idle_ticks(struct device *dev,
+                                    struct device_attribute *attr,
+                                    const char *buf,
+                                    size_t count)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       u16 v = 0;
+       int ret;
+
+       ret = ipath_parse_ushort(buf, &v);
+       if (ret < 0)
+               ipath_dev_err(dd, "invalid jint_idle_ticks.\n");
+       else
+               dd->ipath_f_config_jint(dd, v, dd->ipath_jint_max_packets);
+
+       return ret;
+}
+
+#define DEVICE_COUNTER(name, attr) \
+       static ssize_t show_counter_##name(struct device *dev, \
+                                          struct device_attribute *attr, \
+                                          char *buf) \
+       { \
+               struct ipath_devdata *dd = dev_get_drvdata(dev); \
+               return scnprintf(\
+                       buf, PAGE_SIZE, "%llu\n", (unsigned long long) \
+                       ipath_snap_cntr( \
+                               dd, offsetof(struct infinipath_counters, \
+                                            attr) / sizeof(u64)));     \
+       } \
+       static DEVICE_ATTR(name, S_IRUGO, show_counter_##name, NULL);
+
+DEVICE_COUNTER(ib_link_downeds, IBLinkDownedCnt);
+DEVICE_COUNTER(ib_link_err_recoveries, IBLinkErrRecoveryCnt);
+DEVICE_COUNTER(ib_status_changes, IBStatusChangeCnt);
+DEVICE_COUNTER(ib_symbol_errs, IBSymbolErrCnt);
+DEVICE_COUNTER(lb_flow_stalls, LBFlowStallCnt);
+DEVICE_COUNTER(lb_ints, LBIntCnt);
+DEVICE_COUNTER(rx_bad_formats, RxBadFormatCnt);
+DEVICE_COUNTER(rx_buf_ovfls, RxBufOvflCnt);
+DEVICE_COUNTER(rx_data_pkts, RxDataPktCnt);
+DEVICE_COUNTER(rx_dropped_pkts, RxDroppedPktCnt);
+DEVICE_COUNTER(rx_dwords, RxDwordCnt);
+DEVICE_COUNTER(rx_ebps, RxEBPCnt);
+DEVICE_COUNTER(rx_flow_ctrl_errs, RxFlowCtrlErrCnt);
+DEVICE_COUNTER(rx_flow_pkts, RxFlowPktCnt);
+DEVICE_COUNTER(rx_icrc_errs, RxICRCErrCnt);
+DEVICE_COUNTER(rx_len_errs, RxLenErrCnt);
+DEVICE_COUNTER(rx_link_problems, RxLinkProblemCnt);
+DEVICE_COUNTER(rx_lpcrc_errs, RxLPCRCErrCnt);
+DEVICE_COUNTER(rx_max_min_len_errs, RxMaxMinLenErrCnt);
+DEVICE_COUNTER(rx_p0_hdr_egr_ovfls, RxP0HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p1_hdr_egr_ovfls, RxP1HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p2_hdr_egr_ovfls, RxP2HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p3_hdr_egr_ovfls, RxP3HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p4_hdr_egr_ovfls, RxP4HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p5_hdr_egr_ovfls, RxP5HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p6_hdr_egr_ovfls, RxP6HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p7_hdr_egr_ovfls, RxP7HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p8_hdr_egr_ovfls, RxP8HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_pkey_mismatches, RxPKeyMismatchCnt);
+DEVICE_COUNTER(rx_tid_full_errs, RxTIDFullErrCnt);
+DEVICE_COUNTER(rx_tid_valid_errs, RxTIDValidErrCnt);
+DEVICE_COUNTER(rx_vcrc_errs, RxVCRCErrCnt);
+DEVICE_COUNTER(tx_data_pkts, TxDataPktCnt);
+DEVICE_COUNTER(tx_dropped_pkts, TxDroppedPktCnt);
+DEVICE_COUNTER(tx_dwords, TxDwordCnt);
+DEVICE_COUNTER(tx_flow_pkts, TxFlowPktCnt);
+DEVICE_COUNTER(tx_flow_stalls, TxFlowStallCnt);
+DEVICE_COUNTER(tx_len_errs, TxLenErrCnt);
+DEVICE_COUNTER(tx_max_min_len_errs, TxMaxMinLenErrCnt);
+DEVICE_COUNTER(tx_underruns, TxUnderrunCnt);
+DEVICE_COUNTER(tx_unsup_vl_errs, TxUnsupVLErrCnt);
+
+static struct attribute *dev_counter_attributes[] = {
+       &dev_attr_ib_link_downeds.attr,
+       &dev_attr_ib_link_err_recoveries.attr,
+       &dev_attr_ib_status_changes.attr,
+       &dev_attr_ib_symbol_errs.attr,
+       &dev_attr_lb_flow_stalls.attr,
+       &dev_attr_lb_ints.attr,
+       &dev_attr_rx_bad_formats.attr,
+       &dev_attr_rx_buf_ovfls.attr,
+       &dev_attr_rx_data_pkts.attr,
+       &dev_attr_rx_dropped_pkts.attr,
+       &dev_attr_rx_dwords.attr,
+       &dev_attr_rx_ebps.attr,
+       &dev_attr_rx_flow_ctrl_errs.attr,
+       &dev_attr_rx_flow_pkts.attr,
+       &dev_attr_rx_icrc_errs.attr,
+       &dev_attr_rx_len_errs.attr,
+       &dev_attr_rx_link_problems.attr,
+       &dev_attr_rx_lpcrc_errs.attr,
+       &dev_attr_rx_max_min_len_errs.attr,
+       &dev_attr_rx_p0_hdr_egr_ovfls.attr,
+       &dev_attr_rx_p1_hdr_egr_ovfls.attr,
+       &dev_attr_rx_p2_hdr_egr_ovfls.attr,
+       &dev_attr_rx_p3_hdr_egr_ovfls.attr,
+       &dev_attr_rx_p4_hdr_egr_ovfls.attr,
+       &dev_attr_rx_p5_hdr_egr_ovfls.attr,
+       &dev_attr_rx_p6_hdr_egr_ovfls.attr,
+       &dev_attr_rx_p7_hdr_egr_ovfls.attr,
+       &dev_attr_rx_p8_hdr_egr_ovfls.attr,
+       &dev_attr_rx_pkey_mismatches.attr,
+       &dev_attr_rx_tid_full_errs.attr,
+       &dev_attr_rx_tid_valid_errs.attr,
+       &dev_attr_rx_vcrc_errs.attr,
+       &dev_attr_tx_data_pkts.attr,
+       &dev_attr_tx_dropped_pkts.attr,
+       &dev_attr_tx_dwords.attr,
+       &dev_attr_tx_flow_pkts.attr,
+       &dev_attr_tx_flow_stalls.attr,
+       &dev_attr_tx_len_errs.attr,
+       &dev_attr_tx_max_min_len_errs.attr,
+       &dev_attr_tx_underruns.attr,
+       &dev_attr_tx_unsup_vl_errs.attr,
+       NULL
+};
+
+static struct attribute_group dev_counter_attr_group = {
+       .name = "counters",
+       .attrs = dev_counter_attributes
+};
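Each DEVICE_COUNTER() invocation above stamps out one read-only sysfs attribute whose show routine converts a field offset in struct infinipath_counters into a counter register index for ipath_snap_cntr(). Roughly what the expansion looks like for the lb_ints case; this is an illustration of the macro, not additional driver code:

	static ssize_t show_counter_lb_ints(struct device *dev,
					    /* the macro's 'attr' parameter also
					     * renames this (unused) argument */
					    struct device_attribute *LBIntCnt,
					    char *buf)
	{
		struct ipath_devdata *dd = dev_get_drvdata(dev);

		/* field offset divided by register width gives the
		 * counter register index */
		return scnprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)
			ipath_snap_cntr(dd,
				offsetof(struct infinipath_counters,
					 LBIntCnt) / sizeof(u64)));
	}
	static DEVICE_ATTR(lb_ints, S_IRUGO, show_counter_lb_ints, NULL);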
+
+static ssize_t store_reset(struct device *dev,
+                        struct device_attribute *attr,
+                         const char *buf,
+                         size_t count)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       int ret;
+
+       if (count < 5 || memcmp(buf, "reset", 5)) {
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       if (dd->ipath_flags & IPATH_DISABLED) {
+               /*
+                * Post-reset init would re-enable interrupts, etc.,
+                * so don't allow reset on disabled devices.  Not a
+                * perfect error code, but about the best choice.
+                */
+               dev_info(dev,"Unit %d is disabled, can't reset\n",
+                        dd->ipath_unit);
+               ret = -EINVAL;
+               goto bail;
+       }
+       ret = ipath_reset_device(dd->ipath_unit);
+bail:
+       return ret<0 ? ret : count;
+}
+
+static ssize_t store_link_state(struct device *dev,
+                        struct device_attribute *attr,
+                         const char *buf,
+                         size_t count)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       int ret, r;
+       u16 state;
+
+       ret = ipath_parse_ushort(buf, &state);
+       if (ret < 0)
+               goto invalid;
+
+       r = ipath_set_linkstate(dd, state);
+       if (r < 0) {
+               ret = r;
+               goto bail;
+       }
+
+       goto bail;
+invalid:
+       ipath_dev_err(dd, "attempt to set invalid link state\n");
+bail:
+       return ret;
+}
+
+static ssize_t show_mtu(struct device *dev,
+                        struct device_attribute *attr,
+                        char *buf)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_ibmtu);
+}
+
+static ssize_t store_mtu(struct device *dev,
+                        struct device_attribute *attr,
+                         const char *buf,
+                         size_t count)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       ssize_t ret;
+       u16 mtu = 0;
+       int r;
+
+       ret = ipath_parse_ushort(buf, &mtu);
+       if (ret < 0)
+               goto invalid;
+
+       r = ipath_set_mtu(dd, mtu);
+       if (r < 0)
+               ret = r;
+
+       goto bail;
+invalid:
+       ipath_dev_err(dd, "attempt to set invalid MTU\n");
+bail:
+       return ret;
+}
+
+static ssize_t show_enabled(struct device *dev,
+                        struct device_attribute *attr,
+                        char *buf)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       return scnprintf(buf, PAGE_SIZE, "%u\n",
+                        (dd->ipath_flags & IPATH_DISABLED) ? 0 : 1);
+}
+
+static ssize_t store_enabled(struct device *dev,
+                        struct device_attribute *attr,
+                         const char *buf,
+                         size_t count)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       ssize_t ret;
+       u16 enable = 0;
+
+       ret = ipath_parse_ushort(buf, &enable);
+       if (ret < 0) {
+               ipath_dev_err(dd, "attempt to use non-numeric on enable\n");
+               goto bail;
+       }
+
+       if (enable) {
+               if (!(dd->ipath_flags & IPATH_DISABLED))
+                       goto bail;
+
+               dev_info(dev, "Enabling unit %d\n", dd->ipath_unit);
+               /* same as post-reset */
+               ret = ipath_init_chip(dd, 1);
+               if (ret)
+                       ipath_dev_err(dd, "Failed to enable unit %d\n",
+                                     dd->ipath_unit);
+               else {
+                       dd->ipath_flags &= ~IPATH_DISABLED;
+                       *dd->ipath_statusp &= ~IPATH_STATUS_ADMIN_DISABLED;
+               }
+       }
+       else if (!(dd->ipath_flags & IPATH_DISABLED)) {
+               dev_info(dev, "Disabling unit %d\n", dd->ipath_unit);
+               ipath_shutdown_device(dd);
+               dd->ipath_flags |= IPATH_DISABLED;
+               *dd->ipath_statusp |= IPATH_STATUS_ADMIN_DISABLED;
+       }
+
+bail:
+       return ret;
+}
+
+static ssize_t store_rx_pol_inv(struct device *dev,
+                         struct device_attribute *attr,
+                         const char *buf,
+                         size_t count)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       int ret, r;
+       u16 val;
+
+       ret = ipath_parse_ushort(buf, &val);
+       if (ret < 0)
+               goto invalid;
+
+       r = ipath_set_rx_pol_inv(dd, val);
+       if (r < 0) {
+               ret = r;
+               goto bail;
+       }
+
+       goto bail;
+invalid:
+       ipath_dev_err(dd, "attempt to set invalid Rx Polarity invert\n");
+bail:
+       return ret;
+}
+
+static ssize_t store_led_override(struct device *dev,
+                         struct device_attribute *attr,
+                         const char *buf,
+                         size_t count)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       int ret;
+       u16 val;
+
+       ret = ipath_parse_ushort(buf, &val);
+       if (ret > 0)
+               ipath_set_led_override(dd, val);
+       else
+               ipath_dev_err(dd, "attempt to set invalid LED override\n");
+       return ret;
+}
+
+static ssize_t show_logged_errs(struct device *dev,
+                               struct device_attribute *attr,
+                               char *buf)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       int idx, count;
+
+       /* force consistency with actual EEPROM */
+       if (ipath_update_eeprom_log(dd) != 0)
+               return -ENXIO;
+
+       count = 0;
+       for (idx = 0; idx < IPATH_EEP_LOG_CNT; ++idx) {
+               count += scnprintf(buf + count, PAGE_SIZE - count, "%d%c",
+                       dd->ipath_eep_st_errs[idx],
+                       idx == (IPATH_EEP_LOG_CNT - 1) ? '\n' : ' ');
+       }
+
+       return count;
+}
+
+/*
+ * New sysfs entries to control various IB config. These all turn into
+ * accesses via ipath_f_get/set_ib_cfg.
+ *
+ * Get/Set heartbeat enable. Or of 1=enabled, 2=auto
+ */
+static ssize_t show_hrtbt_enb(struct device *dev,
+                        struct device_attribute *attr,
+                        char *buf)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       int ret;
+
+       ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_HRTBT);
+       if (ret >= 0)
+               ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+       return ret;
+}
+
+static ssize_t store_hrtbt_enb(struct device *dev,
+                         struct device_attribute *attr,
+                         const char *buf,
+                         size_t count)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       int ret, r;
+       u16 val;
+
+       ret = ipath_parse_ushort(buf, &val);
+       if (ret >= 0 && val > 3)
+               ret = -EINVAL;
+       if (ret < 0) {
+               ipath_dev_err(dd, "attempt to set invalid Heartbeat enable\n");
+               goto bail;
+       }
+
+       /*
+        * Set the "intentional" heartbeat enable per either of
+        * "Enable" and "Auto", as these are normally set together.
+        * This bit is consulted when leaving loopback mode,
+        * because entering loopback mode overrides it and automatically
+        * disables heartbeat.
+        */
+       r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT, val);
+       if (r < 0)
+               ret = r;
+       else if (val == IPATH_IB_HRTBT_OFF)
+               dd->ipath_flags |= IPATH_NO_HRTBT;
+       else
+               dd->ipath_flags &= ~IPATH_NO_HRTBT;
+
+bail:
+       return ret;
+}
+
+/*
+ * Get/Set Link-widths enabled. Or of 1=1x, 2=4x (this is human/IB centric,
+ * _not_ the particular encoding of any given chip)
+ */
+static ssize_t show_lwid_enb(struct device *dev,
+                        struct device_attribute *attr,
+                        char *buf)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       int ret;
+
+       ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LWID_ENB);
+       if (ret >= 0)
+               ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+       return ret;
+}
+
+static ssize_t store_lwid_enb(struct device *dev,
+                         struct device_attribute *attr,
+                         const char *buf,
+                         size_t count)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       int ret, r;
+       u16 val;
+
+       ret = ipath_parse_ushort(buf, &val);
+       if (ret >= 0 && (val == 0 || val > 3))
+               ret = -EINVAL;
+       if (ret < 0) {
+               ipath_dev_err(dd,
+                       "attempt to set invalid Link Width (enable)\n");
+               goto bail;
+       }
+
+       r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LWID_ENB, val);
+       if (r < 0)
+               ret = r;
+
+bail:
+       return ret;
+}
+
+/* Get current link width */
+static ssize_t show_lwid(struct device *dev,
+                        struct device_attribute *attr,
+                        char *buf)
+
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       int ret;
+
+       ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LWID);
+       if (ret >= 0)
+               ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+       return ret;
+}
+
+/*
+ * Get/Set Link-speeds enabled. Or of 1=SDR 2=DDR.
+ */
+static ssize_t show_spd_enb(struct device *dev,
+                        struct device_attribute *attr,
+                        char *buf)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       int ret;
+
+       ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_SPD_ENB);
+       if (ret >= 0)
+               ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+       return ret;
+}
+
+static ssize_t store_spd_enb(struct device *dev,
+                         struct device_attribute *attr,
+                         const char *buf,
+                         size_t count)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       int ret, r;
+       u16 val;
+
+       ret = ipath_parse_ushort(buf, &val);
+       if (ret >= 0 && (val == 0 || val > (IPATH_IB_SDR | IPATH_IB_DDR)))
+               ret = -EINVAL;
+       if (ret < 0) {
+               ipath_dev_err(dd,
+                       "attempt to set invalid Link Speed (enable)\n");
+               goto bail;
+       }
+
+       r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_SPD_ENB, val);
+       if (r < 0)
+               ret = r;
+
+bail:
+       return ret;
+}
+
+/* Get current link speed */
+static ssize_t show_spd(struct device *dev,
+                        struct device_attribute *attr,
+                        char *buf)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       int ret;
+
+       ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_SPD);
+       if (ret >= 0)
+               ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+       return ret;
+}
+
+/*
+ * Get/Set RX polarity-invert enable. 0=no, 1=yes.
+ */
+static ssize_t show_rx_polinv_enb(struct device *dev,
+                        struct device_attribute *attr,
+                        char *buf)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       int ret;
+
+       ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_RXPOL_ENB);
+       if (ret >= 0)
+               ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+       return ret;
+}
+
+static ssize_t store_rx_polinv_enb(struct device *dev,
+                         struct device_attribute *attr,
+                         const char *buf,
+                         size_t count)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       int ret, r;
+       u16 val;
+
+       ret = ipath_parse_ushort(buf, &val);
+       if (ret >= 0 && val > 1) {
+               ipath_dev_err(dd,
+                       "attempt to set invalid Rx Polarity (enable)\n");
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_RXPOL_ENB, val);
+       if (r < 0)
+               ret = r;
+
+bail:
+       return ret;
+}
+
+/*
+ * Get/Set RX lane-reversal enable. 0=no, 1=yes.
+ */
+static ssize_t show_lanerev_enb(struct device *dev,
+                        struct device_attribute *attr,
+                        char *buf)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       int ret;
+
+       ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LREV_ENB);
+       if (ret >= 0)
+               ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+       return ret;
+}
+
+static ssize_t store_lanerev_enb(struct device *dev,
+                         struct device_attribute *attr,
+                         const char *buf,
+                         size_t count)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       int ret, r;
+       u16 val;
+
+       ret = ipath_parse_ushort(buf, &val);
+       if (ret >= 0 && val > 1) {
+               ret = -EINVAL;
+               ipath_dev_err(dd,
+                       "attempt to set invalid Lane reversal (enable)\n");
+               goto bail;
+       }
+
+       r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LREV_ENB, val);
+       if (r < 0)
+               ret = r;
+
+bail:
+       return ret;
+}
+
+static DRIVER_ATTR(num_units, S_IRUGO, show_num_units, NULL);
+static DRIVER_ATTR(version, S_IRUGO, show_version, NULL);
+
+static struct attribute *driver_attributes[] = {
+       &driver_attr_num_units.attr,
+       &driver_attr_version.attr,
+       NULL
+};
+
+static struct attribute_group driver_attr_group = {
+       .attrs = driver_attributes
+};
+
+static ssize_t store_tempsense(struct device *dev,
+                              struct device_attribute *attr,
+                              const char *buf,
+                              size_t count)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       int ret, stat;
+       u16 val;
+
+       ret = ipath_parse_ushort(buf, &val);
+       if (ret <= 0) {
+               ipath_dev_err(dd, "attempt to set invalid tempsense config\n");
+               goto bail;
+       }
+       /* If anything but the highest limit, enable T_CRIT_A "interrupt" */
+       stat = ipath_tempsense_write(dd, 9, (val == 0x7f7f) ? 0x80 : 0);
+       if (stat) {
+               ipath_dev_err(dd, "Unable to set tempsense config\n");
+               ret = -1;
+               goto bail;
+       }
+       stat = ipath_tempsense_write(dd, 0xB, (u8) (val & 0xFF));
+       if (stat) {
+               ipath_dev_err(dd, "Unable to set local Tcrit\n");
+               ret = -1;
+               goto bail;
+       }
+       stat = ipath_tempsense_write(dd, 0xD, (u8) (val >> 8));
+       if (stat) {
+               ipath_dev_err(dd, "Unable to set remote Tcrit\n");
+               ret = -1;
+               goto bail;
+       }
+
+bail:
+       return ret;
+}
+
+/*
+ * Dump tempsense registers in decimal, to ease shell scripts.
+ */
+static ssize_t show_tempsense(struct device *dev,
+                             struct device_attribute *attr,
+                             char *buf)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       int ret;
+       int idx;
+       u8 regvals[8];
+
+       ret = -ENXIO;
+       for (idx = 0; idx < 8; ++idx) {
+               if (idx == 6)
+                       continue;
+               ret = ipath_tempsense_read(dd, idx);
+               if (ret < 0)
+                       break;
+               regvals[idx] = ret;
+       }
+       if (idx == 8)
+               ret = scnprintf(buf, PAGE_SIZE, "%d %d %02X %02X %d %d\n",
+                       *(signed char *)(regvals),
+                       *(signed char *)(regvals + 1),
+                       regvals[2], regvals[3],
+                       *(signed char *)(regvals + 5),
+                       *(signed char *)(regvals + 7));
+       return ret;
+}
+
+const struct attribute_group *ipath_driver_attr_groups[] = {
+       &driver_attr_group,
+       NULL,
+};
+
+static DEVICE_ATTR(guid, S_IWUSR | S_IRUGO, show_guid, store_guid);
+static DEVICE_ATTR(lmc, S_IWUSR | S_IRUGO, show_lmc, store_lmc);
+static DEVICE_ATTR(lid, S_IWUSR | S_IRUGO, show_lid, store_lid);
+static DEVICE_ATTR(link_state, S_IWUSR, NULL, store_link_state);
+static DEVICE_ATTR(mlid, S_IWUSR | S_IRUGO, show_mlid, store_mlid);
+static DEVICE_ATTR(mtu, S_IWUSR | S_IRUGO, show_mtu, store_mtu);
+static DEVICE_ATTR(enabled, S_IWUSR | S_IRUGO, show_enabled, store_enabled);
+static DEVICE_ATTR(nguid, S_IRUGO, show_nguid, NULL);
+static DEVICE_ATTR(nports, S_IRUGO, show_nports, NULL);
+static DEVICE_ATTR(reset, S_IWUSR, NULL, store_reset);
+static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL);
+static DEVICE_ATTR(status, S_IRUGO, show_status, NULL);
+static DEVICE_ATTR(status_str, S_IRUGO, show_status_str, NULL);
+static DEVICE_ATTR(boardversion, S_IRUGO, show_boardversion, NULL);
+static DEVICE_ATTR(unit, S_IRUGO, show_unit, NULL);
+static DEVICE_ATTR(rx_pol_inv, S_IWUSR, NULL, store_rx_pol_inv);
+static DEVICE_ATTR(led_override, S_IWUSR, NULL, store_led_override);
+static DEVICE_ATTR(logged_errors, S_IRUGO, show_logged_errs, NULL);
+static DEVICE_ATTR(localbus_info, S_IRUGO, show_localbus_info, NULL);
+static DEVICE_ATTR(jint_max_packets, S_IWUSR | S_IRUGO,
+                  show_jint_max_packets, store_jint_max_packets);
+static DEVICE_ATTR(jint_idle_ticks, S_IWUSR | S_IRUGO,
+                  show_jint_idle_ticks, store_jint_idle_ticks);
+static DEVICE_ATTR(tempsense, S_IWUSR | S_IRUGO,
+                  show_tempsense, store_tempsense);
+
+static struct attribute *dev_attributes[] = {
+       &dev_attr_guid.attr,
+       &dev_attr_lmc.attr,
+       &dev_attr_lid.attr,
+       &dev_attr_link_state.attr,
+       &dev_attr_mlid.attr,
+       &dev_attr_mtu.attr,
+       &dev_attr_nguid.attr,
+       &dev_attr_nports.attr,
+       &dev_attr_serial.attr,
+       &dev_attr_status.attr,
+       &dev_attr_status_str.attr,
+       &dev_attr_boardversion.attr,
+       &dev_attr_unit.attr,
+       &dev_attr_enabled.attr,
+       &dev_attr_rx_pol_inv.attr,
+       &dev_attr_led_override.attr,
+       &dev_attr_logged_errors.attr,
+       &dev_attr_tempsense.attr,
+       &dev_attr_localbus_info.attr,
+       NULL
+};
+
+static struct attribute_group dev_attr_group = {
+       .attrs = dev_attributes
+};
+
+static DEVICE_ATTR(hrtbt_enable, S_IWUSR | S_IRUGO, show_hrtbt_enb,
+                  store_hrtbt_enb);
+static DEVICE_ATTR(link_width_enable, S_IWUSR | S_IRUGO, show_lwid_enb,
+                  store_lwid_enb);
+static DEVICE_ATTR(link_width, S_IRUGO, show_lwid, NULL);
+static DEVICE_ATTR(link_speed_enable, S_IWUSR | S_IRUGO, show_spd_enb,
+                  store_spd_enb);
+static DEVICE_ATTR(link_speed, S_IRUGO, show_spd, NULL);
+static DEVICE_ATTR(rx_pol_inv_enable, S_IWUSR | S_IRUGO, show_rx_polinv_enb,
+                  store_rx_polinv_enb);
+static DEVICE_ATTR(rx_lane_rev_enable, S_IWUSR | S_IRUGO, show_lanerev_enb,
+                  store_lanerev_enb);
+
+static struct attribute *dev_ibcfg_attributes[] = {
+       &dev_attr_hrtbt_enable.attr,
+       &dev_attr_link_width_enable.attr,
+       &dev_attr_link_width.attr,
+       &dev_attr_link_speed_enable.attr,
+       &dev_attr_link_speed.attr,
+       &dev_attr_rx_pol_inv_enable.attr,
+       &dev_attr_rx_lane_rev_enable.attr,
+       NULL
+};
+
+static struct attribute_group dev_ibcfg_attr_group = {
+       .attrs = dev_ibcfg_attributes
+};
+
+/**
+ * ipath_expose_reset - create a device reset file
+ * @dev: the device structure
+ *
+ * Only expose a file that lets us reset the device after someone
+ * enters diag mode.  A device reset is quite likely to crash the
+ * machine entirely, so we don't normally want to make it
+ * available.
+ *
+ * Called with ipath_mutex held.
+ */
+int ipath_expose_reset(struct device *dev)
+{
+       static int exposed;
+       int ret;
+
+       if (!exposed) {
+               ret = device_create_file(dev, &dev_attr_reset);
+               exposed = 1;
+       }
+       else
+               ret = 0;
+
+       return ret;
+}
+
+int ipath_device_create_group(struct device *dev, struct ipath_devdata *dd)
+{
+       int ret;
+
+       ret = sysfs_create_group(&dev->kobj, &dev_attr_group);
+       if (ret)
+               goto bail;
+
+       ret = sysfs_create_group(&dev->kobj, &dev_counter_attr_group);
+       if (ret)
+               goto bail_attrs;
+
+       if (dd->ipath_flags & IPATH_HAS_MULT_IB_SPEED) {
+               ret = device_create_file(dev, &dev_attr_jint_idle_ticks);
+               if (ret)
+                       goto bail_counter;
+               ret = device_create_file(dev, &dev_attr_jint_max_packets);
+               if (ret)
+                       goto bail_idle;
+
+               ret = sysfs_create_group(&dev->kobj, &dev_ibcfg_attr_group);
+               if (ret)
+                       goto bail_max;
+       }
+
+       return 0;
+
+bail_max:
+       device_remove_file(dev, &dev_attr_jint_max_packets);
+bail_idle:
+       device_remove_file(dev, &dev_attr_jint_idle_ticks);
+bail_counter:
+       sysfs_remove_group(&dev->kobj, &dev_counter_attr_group);
+bail_attrs:
+       sysfs_remove_group(&dev->kobj, &dev_attr_group);
+bail:
+       return ret;
+}
+
+void ipath_device_remove_group(struct device *dev, struct ipath_devdata *dd)
+{
+       sysfs_remove_group(&dev->kobj, &dev_counter_attr_group);
+
+       if (dd->ipath_flags & IPATH_HAS_MULT_IB_SPEED) {
+               sysfs_remove_group(&dev->kobj, &dev_ibcfg_attr_group);
+               device_remove_file(dev, &dev_attr_jint_idle_ticks);
+               device_remove_file(dev, &dev_attr_jint_max_packets);
+       }
+
+       sysfs_remove_group(&dev->kobj, &dev_attr_group);
+
+       device_remove_file(dev, &dev_attr_reset);
+}
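The ipath_device_create_group()/ipath_device_remove_group() pair above follows the usual sysfs error-unwind pattern: groups and files are created in order, and the first failure tears down everything created so far, in reverse, before the error is returned. Below is a minimal kernel-style sketch of that pattern for a hypothetical driver; the example_* names are illustrative only and are not part of this patch.

#include <linux/device.h>
#include <linux/sysfs.h>

/* Hypothetical attribute groups standing in for dev_attr_group and
 * dev_counter_attr_group above.
 */
static struct attribute *example_base_attrs[] = { NULL };
static struct attribute *example_extra_attrs[] = { NULL };
static struct attribute_group example_base_group = { .attrs = example_base_attrs };
static struct attribute_group example_extra_group = { .attrs = example_extra_attrs };

static int example_create_groups(struct device *dev)
{
	int ret;

	ret = sysfs_create_group(&dev->kobj, &example_base_group);
	if (ret)
		goto bail;

	ret = sysfs_create_group(&dev->kobj, &example_extra_group);
	if (ret)
		goto bail_base;

	return 0;

bail_base:
	/* Unwind in reverse creation order so nothing is left half-registered. */
	sysfs_remove_group(&dev->kobj, &example_base_group);
bail:
	return ret;
}

The removal side then simply calls sysfs_remove_group() on each group in reverse, exactly as ipath_device_remove_group() does above.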
diff --git a/drivers/staging/rdma/ipath/ipath_uc.c b/drivers/staging/rdma/ipath/ipath_uc.c
new file mode 100644 (file)
index 0000000..22e6099
--- /dev/null
@@ -0,0 +1,547 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ipath_verbs.h"
+#include "ipath_kernel.h"
+
+/* cut down ridiculously long IB macro names */
+#define OP(x) IB_OPCODE_UC_##x
+
+/**
+ * ipath_make_uc_req - construct a request packet (SEND, RDMA write)
+ * @qp: a pointer to the QP
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ */
+int ipath_make_uc_req(struct ipath_qp *qp)
+{
+       struct ipath_other_headers *ohdr;
+       struct ipath_swqe *wqe;
+       unsigned long flags;
+       u32 hwords;
+       u32 bth0;
+       u32 len;
+       u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
+       int ret = 0;
+
+       spin_lock_irqsave(&qp->s_lock, flags);
+
+       if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)) {
+               if (!(ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND))
+                       goto bail;
+               /* We are in the error state, flush the work request. */
+               if (qp->s_last == qp->s_head)
+                       goto bail;
+               /* If DMAs are in progress, we can't flush immediately. */
+               if (atomic_read(&qp->s_dma_busy)) {
+                       qp->s_flags |= IPATH_S_WAIT_DMA;
+                       goto bail;
+               }
+               wqe = get_swqe_ptr(qp, qp->s_last);
+               ipath_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
+               goto done;
+       }
+
+       ohdr = &qp->s_hdr.u.oth;
+       if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
+               ohdr = &qp->s_hdr.u.l.oth;
+
+       /* header size in 32-bit words LRH+BTH = (8+12)/4. */
+       hwords = 5;
+       bth0 = 1 << 22; /* Set M bit */
+
+       /* Get the next send request. */
+       wqe = get_swqe_ptr(qp, qp->s_cur);
+       qp->s_wqe = NULL;
+       switch (qp->s_state) {
+       default:
+               if (!(ib_ipath_state_ops[qp->state] &
+                   IPATH_PROCESS_NEXT_SEND_OK))
+                       goto bail;
+               /* Check if send work queue is empty. */
+               if (qp->s_cur == qp->s_head)
+                       goto bail;
+               /*
+                * Start a new request.
+                */
+               qp->s_psn = wqe->psn = qp->s_next_psn;
+               qp->s_sge.sge = wqe->sg_list[0];
+               qp->s_sge.sg_list = wqe->sg_list + 1;
+               qp->s_sge.num_sge = wqe->wr.num_sge;
+               qp->s_len = len = wqe->length;
+               switch (wqe->wr.opcode) {
+               case IB_WR_SEND:
+               case IB_WR_SEND_WITH_IMM:
+                       if (len > pmtu) {
+                               qp->s_state = OP(SEND_FIRST);
+                               len = pmtu;
+                               break;
+                       }
+                       if (wqe->wr.opcode == IB_WR_SEND)
+                               qp->s_state = OP(SEND_ONLY);
+                       else {
+                               qp->s_state =
+                                       OP(SEND_ONLY_WITH_IMMEDIATE);
+                               /* Immediate data comes after the BTH */
+                               ohdr->u.imm_data = wqe->wr.ex.imm_data;
+                               hwords += 1;
+                       }
+                       if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+                               bth0 |= 1 << 23;
+                       qp->s_wqe = wqe;
+                       if (++qp->s_cur >= qp->s_size)
+                               qp->s_cur = 0;
+                       break;
+
+               case IB_WR_RDMA_WRITE:
+               case IB_WR_RDMA_WRITE_WITH_IMM:
+                       ohdr->u.rc.reth.vaddr =
+                               cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
+                       ohdr->u.rc.reth.rkey =
+                               cpu_to_be32(wqe->wr.wr.rdma.rkey);
+                       ohdr->u.rc.reth.length = cpu_to_be32(len);
+                       hwords += sizeof(struct ib_reth) / 4;
+                       if (len > pmtu) {
+                               qp->s_state = OP(RDMA_WRITE_FIRST);
+                               len = pmtu;
+                               break;
+                       }
+                       if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
+                               qp->s_state = OP(RDMA_WRITE_ONLY);
+                       else {
+                               qp->s_state =
+                                       OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
+                               /* Immediate data comes after the RETH */
+                               ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
+                               hwords += 1;
+                               if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+                                       bth0 |= 1 << 23;
+                       }
+                       qp->s_wqe = wqe;
+                       if (++qp->s_cur >= qp->s_size)
+                               qp->s_cur = 0;
+                       break;
+
+               default:
+                       goto bail;
+               }
+               break;
+
+       case OP(SEND_FIRST):
+               qp->s_state = OP(SEND_MIDDLE);
+               /* FALLTHROUGH */
+       case OP(SEND_MIDDLE):
+               len = qp->s_len;
+               if (len > pmtu) {
+                       len = pmtu;
+                       break;
+               }
+               if (wqe->wr.opcode == IB_WR_SEND)
+                       qp->s_state = OP(SEND_LAST);
+               else {
+                       qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
+                       /* Immediate data comes after the BTH */
+                       ohdr->u.imm_data = wqe->wr.ex.imm_data;
+                       hwords += 1;
+               }
+               if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+                       bth0 |= 1 << 23;
+               qp->s_wqe = wqe;
+               if (++qp->s_cur >= qp->s_size)
+                       qp->s_cur = 0;
+               break;
+
+       case OP(RDMA_WRITE_FIRST):
+               qp->s_state = OP(RDMA_WRITE_MIDDLE);
+               /* FALLTHROUGH */
+       case OP(RDMA_WRITE_MIDDLE):
+               len = qp->s_len;
+               if (len > pmtu) {
+                       len = pmtu;
+                       break;
+               }
+               if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
+                       qp->s_state = OP(RDMA_WRITE_LAST);
+               else {
+                       qp->s_state =
+                               OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
+                       /* Immediate data comes after the BTH */
+                       ohdr->u.imm_data = wqe->wr.ex.imm_data;
+                       hwords += 1;
+                       if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+                               bth0 |= 1 << 23;
+               }
+               qp->s_wqe = wqe;
+               if (++qp->s_cur >= qp->s_size)
+                       qp->s_cur = 0;
+               break;
+       }
+       qp->s_len -= len;
+       qp->s_hdrwords = hwords;
+       qp->s_cur_sge = &qp->s_sge;
+       qp->s_cur_size = len;
+       ipath_make_ruc_header(to_idev(qp->ibqp.device),
+                             qp, ohdr, bth0 | (qp->s_state << 24),
+                             qp->s_next_psn++ & IPATH_PSN_MASK);
+done:
+       ret = 1;
+       goto unlock;
+
+bail:
+       qp->s_flags &= ~IPATH_S_BUSY;
+unlock:
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+       return ret;
+}
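A UC (or RC) send work request larger than the path MTU goes out as a *_FIRST packet, zero or more *_MIDDLE packets of exactly one PMTU each, and a final *_LAST packet, while anything that fits in one PMTU is sent as a single *_ONLY packet, as the state machine above shows. A standalone sketch of that segmentation arithmetic, independent of the driver structures (the 2048-byte PMTU is just an example value):

#include <stdio.h>

/* Number of packets a message of 'len' bytes needs at a given PMTU,
 * mirroring the ONLY vs. FIRST/MIDDLE/LAST progression in ipath_make_uc_req().
 */
static unsigned int packets_for_message(unsigned int len, unsigned int pmtu)
{
	if (len <= pmtu)
		return 1;			/* SEND_ONLY / RDMA_WRITE_ONLY */
	return (len + pmtu - 1) / pmtu;		/* FIRST + MIDDLEs + LAST */
}

int main(void)
{
	unsigned int pmtu = 2048;		/* e.g. ib_mtu_enum_to_int(IB_MTU_2048) */
	unsigned int lens[] = { 512, 2048, 2049, 10000 };
	unsigned int i;

	for (i = 0; i < sizeof(lens) / sizeof(lens[0]); i++)
		printf("len %u -> %u packet(s)\n", lens[i],
		       packets_for_message(lens[i], pmtu));
	return 0;
}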
+
+/**
+ * ipath_uc_rcv - handle an incoming UC packet
+ * @dev: the device the packet came in on
+ * @hdr: the header of the packet
+ * @has_grh: true if the packet has a GRH
+ * @data: the packet data
+ * @tlen: the length of the packet
+ * @qp: the QP for this packet.
+ *
+ * This is called from ipath_qp_rcv() to process an incoming UC packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
+                 int has_grh, void *data, u32 tlen, struct ipath_qp *qp)
+{
+       struct ipath_other_headers *ohdr;
+       int opcode;
+       u32 hdrsize;
+       u32 psn;
+       u32 pad;
+       struct ib_wc wc;
+       u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
+       struct ib_reth *reth;
+       int header_in_data;
+
+       /* Validate the SLID. See Ch. 9.6.1.5 */
+       if (unlikely(be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid))
+               goto done;
+
+       /* Check for GRH */
+       if (!has_grh) {
+               ohdr = &hdr->u.oth;
+               hdrsize = 8 + 12;       /* LRH + BTH */
+               psn = be32_to_cpu(ohdr->bth[2]);
+               header_in_data = 0;
+       } else {
+               ohdr = &hdr->u.l.oth;
+               hdrsize = 8 + 40 + 12;  /* LRH + GRH + BTH */
+               /*
+                * The header with GRH is 60 bytes and the
+                * core driver sets the eager header buffer
+                * size to 56 bytes, so the last 4 bytes of
+                * the BTH header (the PSN) are in the data buffer.
+                */
+               header_in_data = dev->dd->ipath_rcvhdrentsize == 16;
+               if (header_in_data) {
+                       psn = be32_to_cpu(((__be32 *) data)[0]);
+                       data += sizeof(__be32);
+               } else
+                       psn = be32_to_cpu(ohdr->bth[2]);
+       }
+       /*
+        * The opcode is in the low byte when it's in network order
+        * (top byte when in host order).
+        */
+       opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
+
+       memset(&wc, 0, sizeof wc);
+
+       /* Compare the PSN versus the expected PSN. */
+       if (unlikely(ipath_cmp24(psn, qp->r_psn) != 0)) {
+               /*
+                * Handle a sequence error.
+                * Silently drop any current message.
+                */
+               qp->r_psn = psn;
+       inv:
+               qp->r_state = OP(SEND_LAST);
+               switch (opcode) {
+               case OP(SEND_FIRST):
+               case OP(SEND_ONLY):
+               case OP(SEND_ONLY_WITH_IMMEDIATE):
+                       goto send_first;
+
+               case OP(RDMA_WRITE_FIRST):
+               case OP(RDMA_WRITE_ONLY):
+               case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
+                       goto rdma_first;
+
+               default:
+                       dev->n_pkt_drops++;
+                       goto done;
+               }
+       }
+
+       /* Check for opcode sequence errors. */
+       switch (qp->r_state) {
+       case OP(SEND_FIRST):
+       case OP(SEND_MIDDLE):
+               if (opcode == OP(SEND_MIDDLE) ||
+                   opcode == OP(SEND_LAST) ||
+                   opcode == OP(SEND_LAST_WITH_IMMEDIATE))
+                       break;
+               goto inv;
+
+       case OP(RDMA_WRITE_FIRST):
+       case OP(RDMA_WRITE_MIDDLE):
+               if (opcode == OP(RDMA_WRITE_MIDDLE) ||
+                   opcode == OP(RDMA_WRITE_LAST) ||
+                   opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
+                       break;
+               goto inv;
+
+       default:
+               if (opcode == OP(SEND_FIRST) ||
+                   opcode == OP(SEND_ONLY) ||
+                   opcode == OP(SEND_ONLY_WITH_IMMEDIATE) ||
+                   opcode == OP(RDMA_WRITE_FIRST) ||
+                   opcode == OP(RDMA_WRITE_ONLY) ||
+                   opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
+                       break;
+               goto inv;
+       }
+
+       /* OK, process the packet. */
+       switch (opcode) {
+       case OP(SEND_FIRST):
+       case OP(SEND_ONLY):
+       case OP(SEND_ONLY_WITH_IMMEDIATE):
+       send_first:
+               if (qp->r_flags & IPATH_R_REUSE_SGE) {
+                       qp->r_flags &= ~IPATH_R_REUSE_SGE;
+                       qp->r_sge = qp->s_rdma_read_sge;
+               } else if (!ipath_get_rwqe(qp, 0)) {
+                       dev->n_pkt_drops++;
+                       goto done;
+               }
+               /* Save the WQE so we can reuse it in case of an error. */
+               qp->s_rdma_read_sge = qp->r_sge;
+               qp->r_rcv_len = 0;
+               if (opcode == OP(SEND_ONLY))
+                       goto send_last;
+               else if (opcode == OP(SEND_ONLY_WITH_IMMEDIATE))
+                       goto send_last_imm;
+               /* FALLTHROUGH */
+       case OP(SEND_MIDDLE):
+               /* Check for invalid length PMTU or posted rwqe len. */
+               if (unlikely(tlen != (hdrsize + pmtu + 4))) {
+                       qp->r_flags |= IPATH_R_REUSE_SGE;
+                       dev->n_pkt_drops++;
+                       goto done;
+               }
+               qp->r_rcv_len += pmtu;
+               if (unlikely(qp->r_rcv_len > qp->r_len)) {
+                       qp->r_flags |= IPATH_R_REUSE_SGE;
+                       dev->n_pkt_drops++;
+                       goto done;
+               }
+               ipath_copy_sge(&qp->r_sge, data, pmtu);
+               break;
+
+       case OP(SEND_LAST_WITH_IMMEDIATE):
+       send_last_imm:
+               if (header_in_data) {
+                       wc.ex.imm_data = *(__be32 *) data;
+                       data += sizeof(__be32);
+               } else {
+                       /* Immediate data comes after BTH */
+                       wc.ex.imm_data = ohdr->u.imm_data;
+               }
+               hdrsize += 4;
+               wc.wc_flags = IB_WC_WITH_IMM;
+               /* FALLTHROUGH */
+       case OP(SEND_LAST):
+       send_last:
+               /* Get the number of bytes the message was padded by. */
+               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+               /* Check for invalid length. */
+               /* XXX LAST len should be >= 1 */
+               if (unlikely(tlen < (hdrsize + pad + 4))) {
+                       qp->r_flags |= IPATH_R_REUSE_SGE;
+                       dev->n_pkt_drops++;
+                       goto done;
+               }
+               /* Don't count the CRC. */
+               tlen -= (hdrsize + pad + 4);
+               wc.byte_len = tlen + qp->r_rcv_len;
+               if (unlikely(wc.byte_len > qp->r_len)) {
+                       qp->r_flags |= IPATH_R_REUSE_SGE;
+                       dev->n_pkt_drops++;
+                       goto done;
+               }
+               wc.opcode = IB_WC_RECV;
+       last_imm:
+               ipath_copy_sge(&qp->r_sge, data, tlen);
+               wc.wr_id = qp->r_wr_id;
+               wc.status = IB_WC_SUCCESS;
+               wc.qp = &qp->ibqp;
+               wc.src_qp = qp->remote_qpn;
+               wc.slid = qp->remote_ah_attr.dlid;
+               wc.sl = qp->remote_ah_attr.sl;
+               /* Signal completion event if the solicited bit is set. */
+               ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+                              (ohdr->bth[0] &
+                               cpu_to_be32(1 << 23)) != 0);
+               break;
+
+       case OP(RDMA_WRITE_FIRST):
+       case OP(RDMA_WRITE_ONLY):
+       case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): /* consume RWQE */
+       rdma_first:
+               /* RETH comes after BTH */
+               if (!header_in_data)
+                       reth = &ohdr->u.rc.reth;
+               else {
+                       reth = (struct ib_reth *)data;
+                       data += sizeof(*reth);
+               }
+               hdrsize += sizeof(*reth);
+               qp->r_len = be32_to_cpu(reth->length);
+               qp->r_rcv_len = 0;
+               if (qp->r_len != 0) {
+                       u32 rkey = be32_to_cpu(reth->rkey);
+                       u64 vaddr = be64_to_cpu(reth->vaddr);
+                       int ok;
+
+                       /* Check rkey */
+                       ok = ipath_rkey_ok(qp, &qp->r_sge, qp->r_len,
+                                          vaddr, rkey,
+                                          IB_ACCESS_REMOTE_WRITE);
+                       if (unlikely(!ok)) {
+                               dev->n_pkt_drops++;
+                               goto done;
+                       }
+               } else {
+                       qp->r_sge.sg_list = NULL;
+                       qp->r_sge.sge.mr = NULL;
+                       qp->r_sge.sge.vaddr = NULL;
+                       qp->r_sge.sge.length = 0;
+                       qp->r_sge.sge.sge_length = 0;
+               }
+               if (unlikely(!(qp->qp_access_flags &
+                              IB_ACCESS_REMOTE_WRITE))) {
+                       dev->n_pkt_drops++;
+                       goto done;
+               }
+               if (opcode == OP(RDMA_WRITE_ONLY))
+                       goto rdma_last;
+               else if (opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
+                       goto rdma_last_imm;
+               /* FALLTHROUGH */
+       case OP(RDMA_WRITE_MIDDLE):
+               /* Check for invalid length PMTU or posted rwqe len. */
+               if (unlikely(tlen != (hdrsize + pmtu + 4))) {
+                       dev->n_pkt_drops++;
+                       goto done;
+               }
+               qp->r_rcv_len += pmtu;
+               if (unlikely(qp->r_rcv_len > qp->r_len)) {
+                       dev->n_pkt_drops++;
+                       goto done;
+               }
+               ipath_copy_sge(&qp->r_sge, data, pmtu);
+               break;
+
+       case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
+       rdma_last_imm:
+               if (header_in_data) {
+                       wc.ex.imm_data = *(__be32 *) data;
+                       data += sizeof(__be32);
+               } else {
+                       /* Immediate data comes after BTH */
+                       wc.ex.imm_data = ohdr->u.imm_data;
+               }
+               hdrsize += 4;
+               wc.wc_flags = IB_WC_WITH_IMM;
+
+               /* Get the number of bytes the message was padded by. */
+               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+               /* Check for invalid length. */
+               /* XXX LAST len should be >= 1 */
+               if (unlikely(tlen < (hdrsize + pad + 4))) {
+                       dev->n_pkt_drops++;
+                       goto done;
+               }
+               /* Don't count the CRC. */
+               tlen -= (hdrsize + pad + 4);
+               if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) {
+                       dev->n_pkt_drops++;
+                       goto done;
+               }
+               if (qp->r_flags & IPATH_R_REUSE_SGE)
+                       qp->r_flags &= ~IPATH_R_REUSE_SGE;
+               else if (!ipath_get_rwqe(qp, 1)) {
+                       dev->n_pkt_drops++;
+                       goto done;
+               }
+               wc.byte_len = qp->r_len;
+               wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
+               goto last_imm;
+
+       case OP(RDMA_WRITE_LAST):
+       rdma_last:
+               /* Get the number of bytes the message was padded by. */
+               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+               /* Check for invalid length. */
+               /* XXX LAST len should be >= 1 */
+               if (unlikely(tlen < (hdrsize + pad + 4))) {
+                       dev->n_pkt_drops++;
+                       goto done;
+               }
+               /* Don't count the CRC. */
+               tlen -= (hdrsize + pad + 4);
+               if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) {
+                       dev->n_pkt_drops++;
+                       goto done;
+               }
+               ipath_copy_sge(&qp->r_sge, data, tlen);
+               break;
+
+       default:
+               /* Drop packet for unknown opcodes. */
+               dev->n_pkt_drops++;
+               goto done;
+       }
+       qp->r_psn++;
+       qp->r_state = opcode;
+done:
+       return;
+}
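Both receive paths above decode the first BTH word the same way once it has been swapped to host order: the opcode sits in the top byte, the solicited-event bit is bit 23, and the pad count occupies bits 21:20. A standalone illustration of that unpacking, using ntohl() in place of the kernel's be32_to_cpu() (the example opcode 0x24 is IB_OPCODE_UC_SEND_ONLY):

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>		/* ntohl() stands in for be32_to_cpu() */

int main(void)
{
	/* bth[0] as it would appear on the wire (big-endian):
	 * opcode 0x24 (UC SEND_ONLY), solicited bit set, 2 pad bytes.
	 */
	uint32_t bth0_wire = htonl((0x24u << 24) | (1u << 23) | (2u << 20));

	uint32_t bth0 = ntohl(bth0_wire);
	unsigned int opcode    = bth0 >> 24;		/* matches "bth[0] >> 24" above */
	unsigned int solicited = (bth0 >> 23) & 1;	/* solicited-event bit */
	unsigned int pad       = (bth0 >> 20) & 3;	/* pad count, excluded from tlen */

	printf("opcode 0x%02x solicited %u pad %u\n", opcode, solicited, pad);
	return 0;
}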
diff --git a/drivers/staging/rdma/ipath/ipath_ud.c b/drivers/staging/rdma/ipath/ipath_ud.c
new file mode 100644 (file)
index 0000000..e8a2a91
--- /dev/null
@@ -0,0 +1,580 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/sched.h>
+#include <rdma/ib_smi.h>
+
+#include "ipath_verbs.h"
+#include "ipath_kernel.h"
+
+/**
+ * ipath_ud_loopback - handle send on loopback QPs
+ * @sqp: the sending QP
+ * @swqe: the send work request
+ *
+ * This is called from ipath_make_ud_req() to forward a WQE addressed
+ * to the same HCA.
+ * Note that the receive interrupt handler may be calling ipath_ud_rcv()
+ * while this is being called.
+ */
+static void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_swqe *swqe)
+{
+       struct ipath_ibdev *dev = to_idev(sqp->ibqp.device);
+       struct ipath_qp *qp;
+       struct ib_ah_attr *ah_attr;
+       unsigned long flags;
+       struct ipath_rq *rq;
+       struct ipath_srq *srq;
+       struct ipath_sge_state rsge;
+       struct ipath_sge *sge;
+       struct ipath_rwq *wq;
+       struct ipath_rwqe *wqe;
+       void (*handler)(struct ib_event *, void *);
+       struct ib_wc wc;
+       u32 tail;
+       u32 rlen;
+       u32 length;
+
+       qp = ipath_lookup_qpn(&dev->qp_table, swqe->wr.wr.ud.remote_qpn);
+       if (!qp || !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) {
+               dev->n_pkt_drops++;
+               goto done;
+       }
+
+       /*
+        * Check that the qkey matches (except for QP0, see 9.6.1.4.1).
+        * Qkeys with the high order bit set mean use the
+        * qkey from the QP context instead of the WR (see 10.2.5).
+        */
+       if (unlikely(qp->ibqp.qp_num &&
+                    ((int) swqe->wr.wr.ud.remote_qkey < 0 ?
+                     sqp->qkey : swqe->wr.wr.ud.remote_qkey) != qp->qkey)) {
+               /* XXX OK to lose a count once in a while. */
+               dev->qkey_violations++;
+               dev->n_pkt_drops++;
+               goto drop;
+       }
+
+       /*
+        * A GRH is expected to precede the data even if not
+        * present on the wire.
+        */
+       length = swqe->length;
+       memset(&wc, 0, sizeof wc);
+       wc.byte_len = length + sizeof(struct ib_grh);
+
+       if (swqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
+               wc.wc_flags = IB_WC_WITH_IMM;
+               wc.ex.imm_data = swqe->wr.ex.imm_data;
+       }
+
+       /*
+        * This would be a lot simpler if we could call ipath_get_rwqe()
+        * This would be a lot simpler if we could call ipath_get_rwqe(),
+        * but that uses state the receive interrupt handler also uses,
+        * so we would need to lock out receive interrupts while doing
+        */
+       if (qp->ibqp.srq) {
+               srq = to_isrq(qp->ibqp.srq);
+               handler = srq->ibsrq.event_handler;
+               rq = &srq->rq;
+       } else {
+               srq = NULL;
+               handler = NULL;
+               rq = &qp->r_rq;
+       }
+
+       /*
+        * Get the next work request entry to find where to put the data.
+        * Note that it is safe to drop the lock after changing rq->tail
+        * since ipath_post_receive() won't fill the empty slot.
+        */
+       spin_lock_irqsave(&rq->lock, flags);
+       wq = rq->wq;
+       tail = wq->tail;
+       /* Validate tail before using it since it is user writable. */
+       if (tail >= rq->size)
+               tail = 0;
+       if (unlikely(tail == wq->head)) {
+               spin_unlock_irqrestore(&rq->lock, flags);
+               dev->n_pkt_drops++;
+               goto drop;
+       }
+       wqe = get_rwqe_ptr(rq, tail);
+       rsge.sg_list = qp->r_ud_sg_list;
+       if (!ipath_init_sge(qp, wqe, &rlen, &rsge)) {
+               spin_unlock_irqrestore(&rq->lock, flags);
+               dev->n_pkt_drops++;
+               goto drop;
+       }
+       /* Silently drop packets which are too big. */
+       if (wc.byte_len > rlen) {
+               spin_unlock_irqrestore(&rq->lock, flags);
+               dev->n_pkt_drops++;
+               goto drop;
+       }
+       if (++tail >= rq->size)
+               tail = 0;
+       wq->tail = tail;
+       wc.wr_id = wqe->wr_id;
+       if (handler) {
+               u32 n;
+
+               /*
+                * validate head pointer value and compute
+                * the number of remaining WQEs.
+                */
+               n = wq->head;
+               if (n >= rq->size)
+                       n = 0;
+               if (n < tail)
+                       n += rq->size - tail;
+               else
+                       n -= tail;
+               if (n < srq->limit) {
+                       struct ib_event ev;
+
+                       srq->limit = 0;
+                       spin_unlock_irqrestore(&rq->lock, flags);
+                       ev.device = qp->ibqp.device;
+                       ev.element.srq = qp->ibqp.srq;
+                       ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
+                       handler(&ev, srq->ibsrq.srq_context);
+               } else
+                       spin_unlock_irqrestore(&rq->lock, flags);
+       } else
+               spin_unlock_irqrestore(&rq->lock, flags);
+
+       ah_attr = &to_iah(swqe->wr.wr.ud.ah)->attr;
+       if (ah_attr->ah_flags & IB_AH_GRH) {
+               ipath_copy_sge(&rsge, &ah_attr->grh, sizeof(struct ib_grh));
+               wc.wc_flags |= IB_WC_GRH;
+       } else
+               ipath_skip_sge(&rsge, sizeof(struct ib_grh));
+       sge = swqe->sg_list;
+       while (length) {
+               u32 len = sge->length;
+
+               if (len > length)
+                       len = length;
+               if (len > sge->sge_length)
+                       len = sge->sge_length;
+               BUG_ON(len == 0);
+               ipath_copy_sge(&rsge, sge->vaddr, len);
+               sge->vaddr += len;
+               sge->length -= len;
+               sge->sge_length -= len;
+               if (sge->sge_length == 0) {
+                       if (--swqe->wr.num_sge)
+                               sge++;
+               } else if (sge->length == 0 && sge->mr != NULL) {
+                       if (++sge->n >= IPATH_SEGSZ) {
+                               if (++sge->m >= sge->mr->mapsz)
+                                       break;
+                               sge->n = 0;
+                       }
+                       sge->vaddr =
+                               sge->mr->map[sge->m]->segs[sge->n].vaddr;
+                       sge->length =
+                               sge->mr->map[sge->m]->segs[sge->n].length;
+               }
+               length -= len;
+       }
+       wc.status = IB_WC_SUCCESS;
+       wc.opcode = IB_WC_RECV;
+       wc.qp = &qp->ibqp;
+       wc.src_qp = sqp->ibqp.qp_num;
+       /* XXX do we know which pkey matched? Only needed for GSI. */
+       wc.pkey_index = 0;
+       wc.slid = dev->dd->ipath_lid |
+               (ah_attr->src_path_bits &
+                ((1 << dev->dd->ipath_lmc) - 1));
+       wc.sl = ah_attr->sl;
+       wc.dlid_path_bits =
+               ah_attr->dlid & ((1 << dev->dd->ipath_lmc) - 1);
+       wc.port_num = 1;
+       /* Signal completion event if the solicited bit is set. */
+       ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+                      swqe->wr.send_flags & IB_SEND_SOLICITED);
+drop:
+       if (atomic_dec_and_test(&qp->refcount))
+               wake_up(&qp->wait);
+done:;
+}
+
+/**
+ * ipath_make_ud_req - construct a UD request packet
+ * @qp: the QP
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ */
+int ipath_make_ud_req(struct ipath_qp *qp)
+{
+       struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+       struct ipath_other_headers *ohdr;
+       struct ib_ah_attr *ah_attr;
+       struct ipath_swqe *wqe;
+       unsigned long flags;
+       u32 nwords;
+       u32 extra_bytes;
+       u32 bth0;
+       u16 lrh0;
+       u16 lid;
+       int ret = 0;
+       int next_cur;
+
+       spin_lock_irqsave(&qp->s_lock, flags);
+
+       if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_NEXT_SEND_OK)) {
+               if (!(ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND))
+                       goto bail;
+               /* We are in the error state, flush the work request. */
+               if (qp->s_last == qp->s_head)
+                       goto bail;
+               /* If DMAs are in progress, we can't flush immediately. */
+               if (atomic_read(&qp->s_dma_busy)) {
+                       qp->s_flags |= IPATH_S_WAIT_DMA;
+                       goto bail;
+               }
+               wqe = get_swqe_ptr(qp, qp->s_last);
+               ipath_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
+               goto done;
+       }
+
+       if (qp->s_cur == qp->s_head)
+               goto bail;
+
+       wqe = get_swqe_ptr(qp, qp->s_cur);
+       next_cur = qp->s_cur + 1;
+       if (next_cur >= qp->s_size)
+               next_cur = 0;
+
+       /* Construct the header. */
+       ah_attr = &to_iah(wqe->wr.wr.ud.ah)->attr;
+       if (ah_attr->dlid >= IPATH_MULTICAST_LID_BASE) {
+               if (ah_attr->dlid != IPATH_PERMISSIVE_LID)
+                       dev->n_multicast_xmit++;
+               else
+                       dev->n_unicast_xmit++;
+       } else {
+               dev->n_unicast_xmit++;
+               lid = ah_attr->dlid & ~((1 << dev->dd->ipath_lmc) - 1);
+               if (unlikely(lid == dev->dd->ipath_lid)) {
+                       /*
+                        * If DMAs are in progress, we can't generate
+                        * a completion for the loopback packet since
+                        * it would be out of order.
+                        * XXX Instead of waiting, we could queue a
+                        * zero length descriptor so we get a callback.
+                        */
+                       if (atomic_read(&qp->s_dma_busy)) {
+                               qp->s_flags |= IPATH_S_WAIT_DMA;
+                               goto bail;
+                       }
+                       qp->s_cur = next_cur;
+                       spin_unlock_irqrestore(&qp->s_lock, flags);
+                       ipath_ud_loopback(qp, wqe);
+                       spin_lock_irqsave(&qp->s_lock, flags);
+                       ipath_send_complete(qp, wqe, IB_WC_SUCCESS);
+                       goto done;
+               }
+       }
+
+       qp->s_cur = next_cur;
+       extra_bytes = -wqe->length & 3;
+       nwords = (wqe->length + extra_bytes) >> 2;
+
+       /* header size in 32-bit words LRH+BTH+DETH = (8+12+8)/4. */
+       qp->s_hdrwords = 7;
+       qp->s_cur_size = wqe->length;
+       qp->s_cur_sge = &qp->s_sge;
+       qp->s_dmult = ah_attr->static_rate;
+       qp->s_wqe = wqe;
+       qp->s_sge.sge = wqe->sg_list[0];
+       qp->s_sge.sg_list = wqe->sg_list + 1;
+       qp->s_sge.num_sge = wqe->wr.num_sge;
+
+       if (ah_attr->ah_flags & IB_AH_GRH) {
+               /* Header size in 32-bit words. */
+               qp->s_hdrwords += ipath_make_grh(dev, &qp->s_hdr.u.l.grh,
+                                                &ah_attr->grh,
+                                                qp->s_hdrwords, nwords);
+               lrh0 = IPATH_LRH_GRH;
+               ohdr = &qp->s_hdr.u.l.oth;
+               /*
+                * Don't worry about sending to locally attached multicast
+                * QPs.  The spec leaves what happens in that case unspecified.
+                */
+       } else {
+               /* Header size in 32-bit words. */
+               lrh0 = IPATH_LRH_BTH;
+               ohdr = &qp->s_hdr.u.oth;
+       }
+       if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
+               qp->s_hdrwords++;
+               ohdr->u.ud.imm_data = wqe->wr.ex.imm_data;
+               bth0 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE << 24;
+       } else
+               bth0 = IB_OPCODE_UD_SEND_ONLY << 24;
+       lrh0 |= ah_attr->sl << 4;
+       if (qp->ibqp.qp_type == IB_QPT_SMI)
+               lrh0 |= 0xF000; /* Set VL (see ch. 13.5.3.1) */
+       qp->s_hdr.lrh[0] = cpu_to_be16(lrh0);
+       qp->s_hdr.lrh[1] = cpu_to_be16(ah_attr->dlid);  /* DEST LID */
+       qp->s_hdr.lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords +
+                                          SIZE_OF_CRC);
+       lid = dev->dd->ipath_lid;
+       if (lid) {
+               lid |= ah_attr->src_path_bits &
+                       ((1 << dev->dd->ipath_lmc) - 1);
+               qp->s_hdr.lrh[3] = cpu_to_be16(lid);
+       } else
+               qp->s_hdr.lrh[3] = IB_LID_PERMISSIVE;
+       if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+               bth0 |= 1 << 23;
+       bth0 |= extra_bytes << 20;
+       bth0 |= qp->ibqp.qp_type == IB_QPT_SMI ? IPATH_DEFAULT_P_KEY :
+               ipath_get_pkey(dev->dd, qp->s_pkey_index);
+       ohdr->bth[0] = cpu_to_be32(bth0);
+       /*
+        * Use the multicast QP if the destination LID is a multicast LID.
+        */
+       ohdr->bth[1] = ah_attr->dlid >= IPATH_MULTICAST_LID_BASE &&
+               ah_attr->dlid != IPATH_PERMISSIVE_LID ?
+               cpu_to_be32(IPATH_MULTICAST_QPN) :
+               cpu_to_be32(wqe->wr.wr.ud.remote_qpn);
+       ohdr->bth[2] = cpu_to_be32(qp->s_next_psn++ & IPATH_PSN_MASK);
+       /*
+        * Qkeys with the high order bit set mean use the
+        * qkey from the QP context instead of the WR (see 10.2.5).
+        */
+       ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->wr.wr.ud.remote_qkey < 0 ?
+                                        qp->qkey : wqe->wr.wr.ud.remote_qkey);
+       ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num);
+
+done:
+       ret = 1;
+       goto unlock;
+
+bail:
+       qp->s_flags &= ~IPATH_S_BUSY;
+unlock:
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+       return ret;
+}
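The payload-length arithmetic above relies on a small identity: extra_bytes = -length & 3 is exactly the number of bytes needed to pad the payload to a 32-bit boundary, and nwords is then the padded length in 32-bit words; the same pad count is what ends up in bits 21:20 of bth0. A quick standalone check of that identity:

#include <stdio.h>

int main(void)
{
	unsigned int len;

	for (len = 0; len < 9; len++) {
		unsigned int extra  = -len & 3;		  /* pad bytes to a 4-byte boundary */
		unsigned int nwords = (len + extra) >> 2; /* padded payload in 32-bit words */

		printf("len %u -> pad %u, nwords %u\n", len, extra, nwords);
	}
	return 0;
}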
+
+/**
+ * ipath_ud_rcv - receive an incoming UD packet
+ * @dev: the device the packet came in on
+ * @hdr: the packet header
+ * @has_grh: true if the packet has a GRH
+ * @data: the packet data
+ * @tlen: the packet length
+ * @qp: the QP the packet came on
+ *
+ * This is called from ipath_qp_rcv() to process an incoming UD packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
+                 int has_grh, void *data, u32 tlen, struct ipath_qp *qp)
+{
+       struct ipath_other_headers *ohdr;
+       int opcode;
+       u32 hdrsize;
+       u32 pad;
+       struct ib_wc wc;
+       u32 qkey;
+       u32 src_qp;
+       u16 dlid;
+       int header_in_data;
+
+       /* Check for GRH */
+       if (!has_grh) {
+               ohdr = &hdr->u.oth;
+               hdrsize = 8 + 12 + 8;   /* LRH + BTH + DETH */
+               qkey = be32_to_cpu(ohdr->u.ud.deth[0]);
+               src_qp = be32_to_cpu(ohdr->u.ud.deth[1]);
+               header_in_data = 0;
+       } else {
+               ohdr = &hdr->u.l.oth;
+               hdrsize = 8 + 40 + 12 + 8; /* LRH + GRH + BTH + DETH */
+               /*
+                * The header with GRH is 68 bytes and the core driver sets
+                * the eager header buffer size to 56 bytes, so the last 12
+                * bytes of the IB header are in the data buffer.
+                */
+               header_in_data = dev->dd->ipath_rcvhdrentsize == 16;
+               if (header_in_data) {
+                       qkey = be32_to_cpu(((__be32 *) data)[1]);
+                       src_qp = be32_to_cpu(((__be32 *) data)[2]);
+                       data += 12;
+               } else {
+                       qkey = be32_to_cpu(ohdr->u.ud.deth[0]);
+                       src_qp = be32_to_cpu(ohdr->u.ud.deth[1]);
+               }
+       }
+       src_qp &= IPATH_QPN_MASK;
+
+       /*
+        * Check that the permissive LID is only used on QP0
+        * and the QKEY matches (see 9.6.1.4.1 and 9.6.1.5.1).
+        */
+       if (qp->ibqp.qp_num) {
+               if (unlikely(hdr->lrh[1] == IB_LID_PERMISSIVE ||
+                            hdr->lrh[3] == IB_LID_PERMISSIVE)) {
+                       dev->n_pkt_drops++;
+                       goto bail;
+               }
+               if (unlikely(qkey != qp->qkey)) {
+                       /* XXX OK to lose a count once in a while. */
+                       dev->qkey_violations++;
+                       dev->n_pkt_drops++;
+                       goto bail;
+               }
+       } else if (hdr->lrh[1] == IB_LID_PERMISSIVE ||
+                  hdr->lrh[3] == IB_LID_PERMISSIVE) {
+               struct ib_smp *smp = (struct ib_smp *) data;
+
+               if (smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+                       dev->n_pkt_drops++;
+                       goto bail;
+               }
+       }
+
+       /*
+        * The opcode is in the low byte when it's in network order
+        * (top byte when in host order).
+        */
+       opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
+       if (qp->ibqp.qp_num > 1 &&
+           opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) {
+               if (header_in_data) {
+                       wc.ex.imm_data = *(__be32 *) data;
+                       data += sizeof(__be32);
+               } else
+                       wc.ex.imm_data = ohdr->u.ud.imm_data;
+               wc.wc_flags = IB_WC_WITH_IMM;
+               hdrsize += sizeof(u32);
+       } else if (opcode == IB_OPCODE_UD_SEND_ONLY) {
+               wc.ex.imm_data = 0;
+               wc.wc_flags = 0;
+       } else {
+               dev->n_pkt_drops++;
+               goto bail;
+       }
+
+       /* Get the number of bytes the message was padded by. */
+       pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+       if (unlikely(tlen < (hdrsize + pad + 4))) {
+               /* Drop incomplete packets. */
+               dev->n_pkt_drops++;
+               goto bail;
+       }
+       tlen -= hdrsize + pad + 4;
+
+       /* Drop invalid MAD packets (see 13.5.3.1). */
+       if (unlikely((qp->ibqp.qp_num == 0 &&
+                     (tlen != 256 ||
+                      (be16_to_cpu(hdr->lrh[0]) >> 12) != 15)) ||
+                    (qp->ibqp.qp_num == 1 &&
+                     (tlen != 256 ||
+                      (be16_to_cpu(hdr->lrh[0]) >> 12) == 15)))) {
+               dev->n_pkt_drops++;
+               goto bail;
+       }
+
+       /*
+        * A GRH is expected to precede the data even if not
+        * present on the wire.
+        */
+       wc.byte_len = tlen + sizeof(struct ib_grh);
+
+       /*
+        * Get the next work request entry to find where to put the data.
+        */
+       if (qp->r_flags & IPATH_R_REUSE_SGE)
+               qp->r_flags &= ~IPATH_R_REUSE_SGE;
+       else if (!ipath_get_rwqe(qp, 0)) {
+               /*
+                * Count VL15 packets dropped due to no receive buffer.
+                * Otherwise, count them as buffer overruns, since the HW
+                * can usually receive packets even if there are no QPs
+                * with posted receive buffers.
+                */
+               if (qp->ibqp.qp_num == 0)
+                       dev->n_vl15_dropped++;
+               else
+                       dev->rcv_errors++;
+               goto bail;
+       }
+       /* Silently drop packets which are too big. */
+       if (wc.byte_len > qp->r_len) {
+               qp->r_flags |= IPATH_R_REUSE_SGE;
+               dev->n_pkt_drops++;
+               goto bail;
+       }
+       if (has_grh) {
+               ipath_copy_sge(&qp->r_sge, &hdr->u.l.grh,
+                              sizeof(struct ib_grh));
+               wc.wc_flags |= IB_WC_GRH;
+       } else
+               ipath_skip_sge(&qp->r_sge, sizeof(struct ib_grh));
+       ipath_copy_sge(&qp->r_sge, data,
+                      wc.byte_len - sizeof(struct ib_grh));
+       if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags))
+               goto bail;
+       wc.wr_id = qp->r_wr_id;
+       wc.status = IB_WC_SUCCESS;
+       wc.opcode = IB_WC_RECV;
+       wc.vendor_err = 0;
+       wc.qp = &qp->ibqp;
+       wc.src_qp = src_qp;
+       /* XXX do we know which pkey matched? Only needed for GSI. */
+       wc.pkey_index = 0;
+       wc.slid = be16_to_cpu(hdr->lrh[3]);
+       wc.sl = (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF;
+       dlid = be16_to_cpu(hdr->lrh[1]);
+       /*
+        * Save the LMC lower bits if the destination LID is a unicast LID.
+        */
+       wc.dlid_path_bits = dlid >= IPATH_MULTICAST_LID_BASE ? 0 :
+               dlid & ((1 << dev->dd->ipath_lmc) - 1);
+       wc.port_num = 1;
+       /* Signal completion event if the solicited bit is set. */
+       ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+                      (ohdr->bth[0] &
+                       cpu_to_be32(1 << 23)) != 0);
+
+bail:;
+}
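Both the loopback path and ipath_make_ud_req() apply the Q_Key selection rule described in the comments above: if the work request's Q_Key has its high-order bit set, the QP's own Q_Key is used instead, and the cast to a signed int turns that into a simple sign test. A minimal standalone version of that selection:

#include <stdio.h>
#include <stdint.h>

/* Effective Q_Key selection: a WR Q_Key with the high bit set means
 * "use the QP's Q_Key instead" (IBA 10.2.5, as referenced above).
 */
static uint32_t effective_qkey(uint32_t wr_qkey, uint32_t qp_qkey)
{
	return (int32_t)wr_qkey < 0 ? qp_qkey : wr_qkey;
}

int main(void)
{
	printf("0x%08x\n", effective_qkey(0x00001234, 0xdeadbeef)); /* 0x00001234 */
	printf("0x%08x\n", effective_qkey(0x80000000, 0xdeadbeef)); /* 0xdeadbeef */
	return 0;
}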
diff --git a/drivers/staging/rdma/ipath/ipath_user_pages.c b/drivers/staging/rdma/ipath/ipath_user_pages.c
new file mode 100644 (file)
index 0000000..1da1252
--- /dev/null
@@ -0,0 +1,229 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mm.h>
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+
+#include "ipath_kernel.h"
+
+static void __ipath_release_user_pages(struct page **p, size_t num_pages,
+                                  int dirty)
+{
+       size_t i;
+
+       for (i = 0; i < num_pages; i++) {
+               ipath_cdbg(MM, "%lu/%lu put_page %p\n", (unsigned long) i,
+                          (unsigned long) num_pages, p[i]);
+               if (dirty)
+                       set_page_dirty_lock(p[i]);
+               put_page(p[i]);
+       }
+}
+
+/* call with current->mm->mmap_sem held */
+static int __ipath_get_user_pages(unsigned long start_page, size_t num_pages,
+                                 struct page **p)
+{
+       unsigned long lock_limit;
+       size_t got;
+       int ret;
+
+       lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+       if (num_pages > lock_limit) {
+               ret = -ENOMEM;
+               goto bail;
+       }
+
+       ipath_cdbg(VERBOSE, "pin %lx pages from vaddr %lx\n",
+                  (unsigned long) num_pages, start_page);
+
+       for (got = 0; got < num_pages; got += ret) {
+               ret = get_user_pages(current, current->mm,
+                                    start_page + got * PAGE_SIZE,
+                                    num_pages - got, 1, 1,
+                                    p + got, NULL);
+               if (ret < 0)
+                       goto bail_release;
+       }
+
+       current->mm->pinned_vm += num_pages;
+
+       ret = 0;
+       goto bail;
+
+bail_release:
+       __ipath_release_user_pages(p, got, 0);
+bail:
+       return ret;
+}
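The pin path above converts the caller's RLIMIT_MEMLOCK from bytes to pages and rejects any single request for more pages than that. The same conversion can be reproduced from userspace with getrlimit() and sysconf(); this is only a sketch of the limit calculation, not of the kernel's full pinned-page accounting:

#include <stdio.h>
#include <unistd.h>
#include <sys/resource.h>

int main(void)
{
	struct rlimit rl;
	long page_size = sysconf(_SC_PAGESIZE);

	if (getrlimit(RLIMIT_MEMLOCK, &rl) != 0) {
		perror("getrlimit");
		return 1;
	}

	/* Same conversion as "rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT" above;
	 * RLIM_INFINITY simply shows up as a very large page count.
	 */
	printf("RLIMIT_MEMLOCK allows pinning up to %llu page(s) of %ld bytes\n",
	       (unsigned long long)(rl.rlim_cur / (unsigned long long)page_size),
	       page_size);
	return 0;
}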
+
+/**
+ * ipath_map_page - a safety wrapper around pci_map_page()
+ *
+ * A dma_addr of all 0's is interpreted by the chip as "disabled".
+ * Unfortunately, it can also be a valid dma_addr returned on some
+ * architectures.
+ *
+ * The powerpc iommu assigns dma_addrs in ascending order, so we don't
+ * have to bother with retries or mapping a dummy page to ensure we
+ * don't just get the same mapping again.
+ *
+ * I'm sure we won't be so lucky with other IOMMUs, so FIXME.
+ */
+dma_addr_t ipath_map_page(struct pci_dev *hwdev, struct page *page,
+       unsigned long offset, size_t size, int direction)
+{
+       dma_addr_t phys;
+
+       phys = pci_map_page(hwdev, page, offset, size, direction);
+
+       if (phys == 0) {
+               pci_unmap_page(hwdev, phys, size, direction);
+               phys = pci_map_page(hwdev, page, offset, size, direction);
+               /*
+                * FIXME: If we get 0 again, we should keep this page,
+                * map another, then free the 0 page.
+                */
+       }
+
+       return phys;
+}
+
+/**
+ * ipath_map_single - a safety wrapper around pci_map_single()
+ *
+ * Same idea as ipath_map_page().
+ */
+dma_addr_t ipath_map_single(struct pci_dev *hwdev, void *ptr, size_t size,
+       int direction)
+{
+       dma_addr_t phys;
+
+       phys = pci_map_single(hwdev, ptr, size, direction);
+
+       if (phys == 0) {
+               pci_unmap_single(hwdev, phys, size, direction);
+               phys = pci_map_single(hwdev, ptr, size, direction);
+               /*
+                * FIXME: If we get 0 again, we should keep this page,
+                * map another, then free the 0 page.
+                */
+       }
+
+       return phys;
+}
+
+/**
+ * ipath_get_user_pages - lock user pages into memory
+ * @start_page: the start page
+ * @num_pages: the number of pages
+ * @p: the output page structures
+ *
+ * This function takes a given start page (page aligned user virtual
+ * address) and pins it and the following specified number of pages.  For
+ * now, num_pages is always 1, but that will probably change at some point
+ * (because caller is doing expected sends on a single virtually contiguous
+ * buffer, so we can do all pages at once).
+ */
+int ipath_get_user_pages(unsigned long start_page, size_t num_pages,
+                        struct page **p)
+{
+       int ret;
+
+       down_write(&current->mm->mmap_sem);
+
+       ret = __ipath_get_user_pages(start_page, num_pages, p);
+
+       up_write(&current->mm->mmap_sem);
+
+       return ret;
+}
+
+void ipath_release_user_pages(struct page **p, size_t num_pages)
+{
+       down_write(&current->mm->mmap_sem);
+
+       __ipath_release_user_pages(p, num_pages, 1);
+
+       current->mm->pinned_vm -= num_pages;
+
+       up_write(&current->mm->mmap_sem);
+}
+
+struct ipath_user_pages_work {
+       struct work_struct work;
+       struct mm_struct *mm;
+       unsigned long num_pages;
+};
+
+static void user_pages_account(struct work_struct *_work)
+{
+       struct ipath_user_pages_work *work =
+               container_of(_work, struct ipath_user_pages_work, work);
+
+       down_write(&work->mm->mmap_sem);
+       work->mm->pinned_vm -= work->num_pages;
+       up_write(&work->mm->mmap_sem);
+       mmput(work->mm);
+       kfree(work);
+}
+
+void ipath_release_user_pages_on_close(struct page **p, size_t num_pages)
+{
+       struct ipath_user_pages_work *work;
+       struct mm_struct *mm;
+
+       __ipath_release_user_pages(p, num_pages, 1);
+
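+       /* defer the pinned_vm accounting to the ib_wq workqueue */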
+       mm = get_task_mm(current);
+       if (!mm)
+               return;
+
+       work = kmalloc(sizeof(*work), GFP_KERNEL);
+       if (!work)
+               goto bail_mm;
+
+       INIT_WORK(&work->work, user_pages_account);
+       work->mm = mm;
+       work->num_pages = num_pages;
+
+       queue_work(ib_wq, &work->work);
+       return;
+
+bail_mm:
+       mmput(mm);
+       return;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_user_sdma.c b/drivers/staging/rdma/ipath/ipath_user_sdma.c
new file mode 100644 (file)
index 0000000..cc04b7b
--- /dev/null
@@ -0,0 +1,875 @@
+/*
+ * Copyright (c) 2007, 2008 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/dmapool.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+#include <linux/io.h>
+#include <linux/uio.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+
+#include "ipath_kernel.h"
+#include "ipath_user_sdma.h"
+
+/* minimum size of header */
+#define IPATH_USER_SDMA_MIN_HEADER_LENGTH      64
+/* expected size of headers (for dma_pool) */
+#define IPATH_USER_SDMA_EXP_HEADER_LENGTH      64
+/* length mask in PBC (lower 11 bits) */
+#define IPATH_PBC_LENGTH_MASK                  ((1 << 11) - 1)
+
+struct ipath_user_sdma_pkt {
+       u8 naddr;               /* dimension of addr (1..3) ... */
+       u32 counter;            /* sdma pkts queued counter for this entry */
+       u64 added;              /* global descq number of entries */
+
+       struct {
+               u32 offset;                     /* offset for kvaddr, addr */
+               u32 length;                     /* length in page */
+               u8  put_page;                   /* should we put_page? */
+               u8  dma_mapped;                 /* is page dma_mapped? */
+               struct page *page;              /* may be NULL (coherent mem) */
+               void *kvaddr;                   /* FIXME: only for pio hack */
+               dma_addr_t addr;
+       } addr[4];   /* max pages, any more and we coalesce */
+       struct list_head list;  /* list element */
+};
+
+struct ipath_user_sdma_queue {
+       /*
+        * pkts sent to dma engine are queued on this
+        * list head.  the type of the elements of this
+        * list are struct ipath_user_sdma_pkt...
+        */
+       struct list_head sent;
+
+       /* headers with expected length are allocated from here... */
+       char header_cache_name[64];
+       struct dma_pool *header_cache;
+
+       /* packets are allocated from the slab cache... */
+       char pkt_slab_name[64];
+       struct kmem_cache *pkt_slab;
+
+       /* packets are counted as they are queued... */
+       u32 counter;
+       u32 sent_counter;
+
+       /* dma page table */
+       struct rb_root dma_pages_root;
+
+       /* protect everything above... */
+       struct mutex lock;
+};
+
+struct ipath_user_sdma_queue *
+ipath_user_sdma_queue_create(struct device *dev, int unit, int port, int sport)
+{
+       struct ipath_user_sdma_queue *pq =
+               kmalloc(sizeof(struct ipath_user_sdma_queue), GFP_KERNEL);
+
+       if (!pq)
+               goto done;
+
+       pq->counter = 0;
+       pq->sent_counter = 0;
+       INIT_LIST_HEAD(&pq->sent);
+
+       mutex_init(&pq->lock);
+
+       snprintf(pq->pkt_slab_name, sizeof(pq->pkt_slab_name),
+                "ipath-user-sdma-pkts-%u-%02u.%02u", unit, port, sport);
+       pq->pkt_slab = kmem_cache_create(pq->pkt_slab_name,
+                                        sizeof(struct ipath_user_sdma_pkt),
+                                        0, 0, NULL);
+
+       if (!pq->pkt_slab)
+               goto err_kfree;
+
+       snprintf(pq->header_cache_name, sizeof(pq->header_cache_name),
+                "ipath-user-sdma-headers-%u-%02u.%02u", unit, port, sport);
+       pq->header_cache = dma_pool_create(pq->header_cache_name,
+                                          dev,
+                                          IPATH_USER_SDMA_EXP_HEADER_LENGTH,
+                                          4, 0);
+       if (!pq->header_cache)
+               goto err_slab;
+
+       pq->dma_pages_root = RB_ROOT;
+
+       goto done;
+
+err_slab:
+       kmem_cache_destroy(pq->pkt_slab);
+err_kfree:
+       kfree(pq);
+       pq = NULL;
+
+done:
+       return pq;
+}
+
+static void ipath_user_sdma_init_frag(struct ipath_user_sdma_pkt *pkt,
+                                     int i, size_t offset, size_t len,
+                                     int put_page, int dma_mapped,
+                                     struct page *page,
+                                     void *kvaddr, dma_addr_t dma_addr)
+{
+       pkt->addr[i].offset = offset;
+       pkt->addr[i].length = len;
+       pkt->addr[i].put_page = put_page;
+       pkt->addr[i].dma_mapped = dma_mapped;
+       pkt->addr[i].page = page;
+       pkt->addr[i].kvaddr = kvaddr;
+       pkt->addr[i].addr = dma_addr;
+}
+
+static void ipath_user_sdma_init_header(struct ipath_user_sdma_pkt *pkt,
+                                       u32 counter, size_t offset,
+                                       size_t len, int dma_mapped,
+                                       struct page *page,
+                                       void *kvaddr, dma_addr_t dma_addr)
+{
+       pkt->naddr = 1;
+       pkt->counter = counter;
+       ipath_user_sdma_init_frag(pkt, 0, offset, len, 0, dma_mapped, page,
+                                 kvaddr, dma_addr);
+}
+
+/* we have too many pages in the iovec; coalesce them into a single page */
+static int ipath_user_sdma_coalesce(const struct ipath_devdata *dd,
+                                   struct ipath_user_sdma_pkt *pkt,
+                                   const struct iovec *iov,
+                                   unsigned long niov)
+{
+       int ret = 0;
+       struct page *page = alloc_page(GFP_KERNEL);
+       void *mpage_save;
+       char *mpage;
+       int i;
+       int len = 0;
+       dma_addr_t dma_addr;
+
+       if (!page) {
+               ret = -ENOMEM;
+               goto done;
+       }
+
+       mpage = kmap(page);
+       mpage_save = mpage;
+       for (i = 0; i < niov; i++) {
+               int cfur;
+
+               cfur = copy_from_user(mpage,
+                                     iov[i].iov_base, iov[i].iov_len);
+               if (cfur) {
+                       ret = -EFAULT;
+                       goto free_unmap;
+               }
+
+               mpage += iov[i].iov_len;
+               len += iov[i].iov_len;
+       }
+
+       dma_addr = dma_map_page(&dd->pcidev->dev, page, 0, len,
+                               DMA_TO_DEVICE);
+       if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) {
+               ret = -ENOMEM;
+               goto free_unmap;
+       }
+
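+       /* the coalesced payload becomes frag 1; frag 0 holds the packet header */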
+       ipath_user_sdma_init_frag(pkt, 1, 0, len, 0, 1, page, mpage_save,
+                                 dma_addr);
+       pkt->naddr = 2;
+
+       goto done;
+
+free_unmap:
+       kunmap(page);
+       __free_page(page);
+done:
+       return ret;
+}
+
+/* how many pages in this iovec element? */
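+/* (even an element shorter than PAGE_SIZE needs two if it straddles a boundary) */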
+static int ipath_user_sdma_num_pages(const struct iovec *iov)
+{
+       const unsigned long addr  = (unsigned long) iov->iov_base;
+       const unsigned long  len  = iov->iov_len;
+       const unsigned long spage = addr & PAGE_MASK;
+       const unsigned long epage = (addr + len - 1) & PAGE_MASK;
+
+       return 1 + ((epage - spage) >> PAGE_SHIFT);
+}
+
+/* truncate length to page boundary */
+static int ipath_user_sdma_page_length(unsigned long addr, unsigned long len)
+{
+       const unsigned long offset = addr & ~PAGE_MASK;
+
+       return ((offset + len) > PAGE_SIZE) ? (PAGE_SIZE - offset) : len;
+}
+
+static void ipath_user_sdma_free_pkt_frag(struct device *dev,
+                                         struct ipath_user_sdma_queue *pq,
+                                         struct ipath_user_sdma_pkt *pkt,
+                                         int frag)
+{
+       const int i = frag;
+
+       if (pkt->addr[i].page) {
+               if (pkt->addr[i].dma_mapped)
+                       dma_unmap_page(dev,
+                                      pkt->addr[i].addr,
+                                      pkt->addr[i].length,
+                                      DMA_TO_DEVICE);
+
+               if (pkt->addr[i].kvaddr)
+                       kunmap(pkt->addr[i].page);
+
+               if (pkt->addr[i].put_page)
+                       put_page(pkt->addr[i].page);
+               else
+                       __free_page(pkt->addr[i].page);
+       } else if (pkt->addr[i].kvaddr)
+               /* free coherent mem from cache... */
+               dma_pool_free(pq->header_cache,
+                             pkt->addr[i].kvaddr, pkt->addr[i].addr);
+}
+
+/* return number of pages pinned... */
+static int ipath_user_sdma_pin_pages(const struct ipath_devdata *dd,
+                                    struct ipath_user_sdma_pkt *pkt,
+                                    unsigned long addr, int tlen, int npages)
+{
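+       /*
+        * A payload iovec is at most PAGE_SIZE long, so it spans at most
+        * two pages.
+        */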
+       struct page *pages[2];
+       int j;
+       int ret;
+
+       ret = get_user_pages_fast(addr, npages, 0, pages);
+       if (ret != npages) {
+               int i;
+
+               for (i = 0; i < ret; i++)
+                       put_page(pages[i]);
+
+               ret = -ENOMEM;
+               goto done;
+       }
+
+       for (j = 0; j < npages; j++) {
+               /* map the pages... */
+               const int flen =
+                       ipath_user_sdma_page_length(addr, tlen);
+               dma_addr_t dma_addr =
+                       dma_map_page(&dd->pcidev->dev,
+                                    pages[j], 0, flen, DMA_TO_DEVICE);
+               unsigned long fofs = addr & ~PAGE_MASK;
+
+               if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) {
+                       ret = -ENOMEM;
+                       goto done;
+               }
+
+               ipath_user_sdma_init_frag(pkt, pkt->naddr, fofs, flen, 1, 1,
+                                         pages[j], kmap(pages[j]),
+                                         dma_addr);
+
+               pkt->naddr++;
+               addr += flen;
+               tlen -= flen;
+       }
+
+done:
+       return ret;
+}
+
+static int ipath_user_sdma_pin_pkt(const struct ipath_devdata *dd,
+                                  struct ipath_user_sdma_queue *pq,
+                                  struct ipath_user_sdma_pkt *pkt,
+                                  const struct iovec *iov,
+                                  unsigned long niov)
+{
+       int ret = 0;
+       unsigned long idx;
+
+       for (idx = 0; idx < niov; idx++) {
+               const int npages = ipath_user_sdma_num_pages(iov + idx);
+               const unsigned long addr = (unsigned long) iov[idx].iov_base;
+
+               ret = ipath_user_sdma_pin_pages(dd, pkt,
+                                               addr, iov[idx].iov_len,
+                                               npages);
+               if (ret < 0)
+                       goto free_pkt;
+       }
+
+       goto done;
+
+free_pkt:
+       for (idx = 0; idx < pkt->naddr; idx++)
+               ipath_user_sdma_free_pkt_frag(&dd->pcidev->dev, pq, pkt, idx);
+
+done:
+       return ret;
+}
+
+static int ipath_user_sdma_init_payload(const struct ipath_devdata *dd,
+                                       struct ipath_user_sdma_queue *pq,
+                                       struct ipath_user_sdma_pkt *pkt,
+                                       const struct iovec *iov,
+                                       unsigned long niov, int npages)
+{
+       int ret = 0;
+
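+       /*
+        * frag 0 already holds the header, so at most ARRAY_SIZE-1 payload
+        * fragments fit without coalescing.
+        */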
+       if (npages >= ARRAY_SIZE(pkt->addr))
+               ret = ipath_user_sdma_coalesce(dd, pkt, iov, niov);
+       else
+               ret = ipath_user_sdma_pin_pkt(dd, pq, pkt, iov, niov);
+
+       return ret;
+}
+
+/* free a packet list */
+static void ipath_user_sdma_free_pkt_list(struct device *dev,
+                                         struct ipath_user_sdma_queue *pq,
+                                         struct list_head *list)
+{
+       struct ipath_user_sdma_pkt *pkt, *pkt_next;
+
+       list_for_each_entry_safe(pkt, pkt_next, list, list) {
+               int i;
+
+               for (i = 0; i < pkt->naddr; i++)
+                       ipath_user_sdma_free_pkt_frag(dev, pq, pkt, i);
+
+               kmem_cache_free(pq->pkt_slab, pkt);
+       }
+}
+
+/*
+ * copy headers, coalesce etc -- pq->lock must be held
+ *
+ * we queue all the packets on list, returning the number
+ * of iovec entries consumed.  list must be empty initially,
+ * since we clean it up if there is an error...
+ */
+static int ipath_user_sdma_queue_pkts(const struct ipath_devdata *dd,
+                                     struct ipath_user_sdma_queue *pq,
+                                     struct list_head *list,
+                                     const struct iovec *iov,
+                                     unsigned long niov,
+                                     int maxpkts)
+{
+       unsigned long idx = 0;
+       int ret = 0;
+       int npkts = 0;
+       struct page *page = NULL;
+       __le32 *pbc;
+       dma_addr_t dma_addr;
+       struct ipath_user_sdma_pkt *pkt = NULL;
+       size_t len;
+       size_t nw;
+       u32 counter = pq->counter;
+       int dma_mapped = 0;
+
+       while (idx < niov && npkts < maxpkts) {
+               const unsigned long addr = (unsigned long) iov[idx].iov_base;
+               const unsigned long idx_save = idx;
+               unsigned pktnw;
+               unsigned pktnwc;
+               int nfrags = 0;
+               int npages = 0;
+               int cfur;
+
+               dma_mapped = 0;
+               len = iov[idx].iov_len;
+               nw = len >> 2;  /* header length in 32-bit words */
+               page = NULL;
+
+               pkt = kmem_cache_alloc(pq->pkt_slab, GFP_KERNEL);
+               if (!pkt) {
+                       ret = -ENOMEM;
+                       goto free_list;
+               }
+
+               if (len < IPATH_USER_SDMA_MIN_HEADER_LENGTH ||
+                   len > PAGE_SIZE || len & 3 || addr & 3) {
+                       ret = -EINVAL;
+                       goto free_pkt;
+               }
+
+               if (len == IPATH_USER_SDMA_EXP_HEADER_LENGTH)
+                       pbc = dma_pool_alloc(pq->header_cache, GFP_KERNEL,
+                                            &dma_addr);
+               else
+                       pbc = NULL;
+
+               if (!pbc) {
+                       page = alloc_page(GFP_KERNEL);
+                       if (!page) {
+                               ret = -ENOMEM;
+                               goto free_pkt;
+                       }
+                       pbc = kmap(page);
+               }
+
+               cfur = copy_from_user(pbc, iov[idx].iov_base, len);
+               if (cfur) {
+                       ret = -EFAULT;
+                       goto free_pbc;
+               }
+
+               /*
+                * this assignment is a bit strange.  it's because the
+                * pbc counts the number of 32 bit words in the full
+                * packet _except_ the first word of the pbc itself...
+                */
+               pktnwc = nw - 1;
+
+               /*
+                * pktnw computation yields the number of 32 bit words
+                * that the caller has indicated in the PBC.  note that
+                * this is one less than the total number of words that
+                * goes to the send DMA engine as the first 32 bit word
+                * of the PBC itself is not counted.  Armed with this count,
+                * we can verify that the packet is consistent with the
+                * iovec lengths.
+                */
+               pktnw = le32_to_cpu(*pbc) & IPATH_PBC_LENGTH_MASK;
+               if (pktnw < pktnwc || pktnw > pktnwc + (PAGE_SIZE >> 2)) {
+                       ret = -EINVAL;
+                       goto free_pbc;
+               }
+
+
+               idx++;
+               while (pktnwc < pktnw && idx < niov) {
+                       const size_t slen = iov[idx].iov_len;
+                       const unsigned long faddr =
+                               (unsigned long) iov[idx].iov_base;
+
+                       if (slen & 3 || faddr & 3 || !slen ||
+                           slen > PAGE_SIZE) {
+                               ret = -EINVAL;
+                               goto free_pbc;
+                       }
+
+                       npages++;
+                       if ((faddr & PAGE_MASK) !=
+                           ((faddr + slen - 1) & PAGE_MASK))
+                               npages++;
+
+                       pktnwc += slen >> 2;
+                       idx++;
+                       nfrags++;
+               }
+
+               if (pktnwc != pktnw) {
+                       ret = -EINVAL;
+                       goto free_pbc;
+               }
+
+               if (page) {
+                       dma_addr = dma_map_page(&dd->pcidev->dev,
+                                               page, 0, len, DMA_TO_DEVICE);
+                       if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) {
+                               ret = -ENOMEM;
+                               goto free_pbc;
+                       }
+
+                       dma_mapped = 1;
+               }
+
+               ipath_user_sdma_init_header(pkt, counter, 0, len, dma_mapped,
+                                           page, pbc, dma_addr);
+
+               if (nfrags) {
+                       ret = ipath_user_sdma_init_payload(dd, pq, pkt,
+                                                          iov + idx_save + 1,
+                                                          nfrags, npages);
+                       if (ret < 0)
+                               goto free_pbc_dma;
+               }
+
+               counter++;
+               npkts++;
+
+               list_add_tail(&pkt->list, list);
+       }
+
+       ret = idx;
+       goto done;
+
+free_pbc_dma:
+       if (dma_mapped)
+               dma_unmap_page(&dd->pcidev->dev, dma_addr, len, DMA_TO_DEVICE);
+free_pbc:
+       if (page) {
+               kunmap(page);
+               __free_page(page);
+       } else
+               dma_pool_free(pq->header_cache, pbc, dma_addr);
+free_pkt:
+       kmem_cache_free(pq->pkt_slab, pkt);
+free_list:
+       ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, list);
+done:
+       return ret;
+}
+
+static void ipath_user_sdma_set_complete_counter(struct ipath_user_sdma_queue *pq,
+                                                u32 c)
+{
+       pq->sent_counter = c;
+}
+
+/* try to clean out queue -- needs pq->lock */
+static int ipath_user_sdma_queue_clean(const struct ipath_devdata *dd,
+                                      struct ipath_user_sdma_queue *pq)
+{
+       struct list_head free_list;
+       struct ipath_user_sdma_pkt *pkt;
+       struct ipath_user_sdma_pkt *pkt_prev;
+       int ret = 0;
+
+       INIT_LIST_HEAD(&free_list);
+
+       list_for_each_entry_safe(pkt, pkt_prev, &pq->sent, list) {
+               s64 descd = dd->ipath_sdma_descq_removed - pkt->added;
+
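+               /* the hardware has not yet retired this packet's descriptors */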
+               if (descd < 0)
+                       break;
+
+               list_move_tail(&pkt->list, &free_list);
+
+               /* one more packet cleaned */
+               ret++;
+       }
+
+       if (!list_empty(&free_list)) {
+               u32 counter;
+
+               pkt = list_entry(free_list.prev,
+                                struct ipath_user_sdma_pkt, list);
+               counter = pkt->counter;
+
+               ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &free_list);
+               ipath_user_sdma_set_complete_counter(pq, counter);
+       }
+
+       return ret;
+}
+
+void ipath_user_sdma_queue_destroy(struct ipath_user_sdma_queue *pq)
+{
+       if (!pq)
+               return;
+
+       kmem_cache_destroy(pq->pkt_slab);
+       dma_pool_destroy(pq->header_cache);
+       kfree(pq);
+}
+
+/* clean descriptor queue, returns > 0 if some elements cleaned */
+static int ipath_user_sdma_hwqueue_clean(struct ipath_devdata *dd)
+{
+       int ret;
+       unsigned long flags;
+
+       spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+       ret = ipath_sdma_make_progress(dd);
+       spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+
+       return ret;
+}
+
+/* we're in close, drain packets so that we can cleanup successfully... */
+void ipath_user_sdma_queue_drain(struct ipath_devdata *dd,
+                                struct ipath_user_sdma_queue *pq)
+{
+       int i;
+
+       if (!pq)
+               return;
+
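+       /* poll for up to ~1 second (100 x 10 ms) for the sent list to drain */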
+       for (i = 0; i < 100; i++) {
+               mutex_lock(&pq->lock);
+               if (list_empty(&pq->sent)) {
+                       mutex_unlock(&pq->lock);
+                       break;
+               }
+               ipath_user_sdma_hwqueue_clean(dd);
+               ipath_user_sdma_queue_clean(dd, pq);
+               mutex_unlock(&pq->lock);
+               msleep(10);
+       }
+
+       if (!list_empty(&pq->sent)) {
+               struct list_head free_list;
+
+               printk(KERN_INFO "drain: lists not empty: forcing!\n");
+               INIT_LIST_HEAD(&free_list);
+               mutex_lock(&pq->lock);
+               list_splice_init(&pq->sent, &free_list);
+               ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &free_list);
+               mutex_unlock(&pq->lock);
+       }
+}
+
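+/*
+ * Each SDMA descriptor is two 64-bit words: desc0 carries SDmaPhyAddr[31:0],
+ * the generation, the dword count and the buffer offset (plus the first/last/
+ * dma-head flag bits set below); desc1 carries SDmaPhyAddr[47:32].
+ */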
+static inline __le64 ipath_sdma_make_desc0(struct ipath_devdata *dd,
+                                          u64 addr, u64 dwlen, u64 dwoffset)
+{
+       return cpu_to_le64(/* SDmaPhyAddr[31:0] */
+                          ((addr & 0xfffffffcULL) << 32) |
+                          /* SDmaGeneration[1:0] */
+                          ((dd->ipath_sdma_generation & 3ULL) << 30) |
+                          /* SDmaDwordCount[10:0] */
+                          ((dwlen & 0x7ffULL) << 16) |
+                          /* SDmaBufOffset[12:2] */
+                          (dwoffset & 0x7ffULL));
+}
+
+static inline __le64 ipath_sdma_make_first_desc0(__le64 descq)
+{
+       return descq | cpu_to_le64(1ULL << 12);
+}
+
+static inline __le64 ipath_sdma_make_last_desc0(__le64 descq)
+{
+                                             /* last */  /* dma head */
+       return descq | cpu_to_le64(1ULL << 11 | 1ULL << 13);
+}
+
+static inline __le64 ipath_sdma_make_desc1(u64 addr)
+{
+       /* SDmaPhyAddr[47:32] */
+       return cpu_to_le64(addr >> 32);
+}
+
+static void ipath_user_sdma_send_frag(struct ipath_devdata *dd,
+                                     struct ipath_user_sdma_pkt *pkt, int idx,
+                                     unsigned ofs, u16 tail)
+{
+       const u64 addr = (u64) pkt->addr[idx].addr +
+               (u64) pkt->addr[idx].offset;
+       const u64 dwlen = (u64) pkt->addr[idx].length / 4;
+       __le64 *descqp;
+       __le64 descq0;
+
+       descqp = &dd->ipath_sdma_descq[tail].qw[0];
+
+       descq0 = ipath_sdma_make_desc0(dd, addr, dwlen, ofs);
+       if (idx == 0)
+               descq0 = ipath_sdma_make_first_desc0(descq0);
+       if (idx == pkt->naddr - 1)
+               descq0 = ipath_sdma_make_last_desc0(descq0);
+
+       descqp[0] = descq0;
+       descqp[1] = ipath_sdma_make_desc1(addr);
+}
+
+/* pq->lock must be held, get packets on the wire... */
+static int ipath_user_sdma_push_pkts(struct ipath_devdata *dd,
+                                    struct ipath_user_sdma_queue *pq,
+                                    struct list_head *pktlist)
+{
+       int ret = 0;
+       unsigned long flags;
+       u16 tail;
+
+       if (list_empty(pktlist))
+               return 0;
+
+       if (unlikely(!(dd->ipath_flags & IPATH_LINKACTIVE)))
+               return -ECOMM;
+
+       spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+
+       if (unlikely(dd->ipath_sdma_status & IPATH_SDMA_ABORT_MASK)) {
+               ret = -ECOMM;
+               goto unlock;
+       }
+
+       tail = dd->ipath_sdma_descq_tail;
+       while (!list_empty(pktlist)) {
+               struct ipath_user_sdma_pkt *pkt =
+                       list_entry(pktlist->next, struct ipath_user_sdma_pkt,
+                                  list);
+               int i;
+               unsigned ofs = 0;
+               u16 dtail = tail;
+
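+               /* not enough free descriptors for this packet; push what we have */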
+               if (pkt->naddr > ipath_sdma_descq_freecnt(dd))
+                       goto unlock_check_tail;
+
+               for (i = 0; i < pkt->naddr; i++) {
+                       ipath_user_sdma_send_frag(dd, pkt, i, ofs, tail);
+                       ofs += pkt->addr[i].length >> 2;
+
+                       if (++tail == dd->ipath_sdma_descq_cnt) {
+                               tail = 0;
+                               ++dd->ipath_sdma_generation;
+                       }
+               }
+
+               if ((ofs<<2) > dd->ipath_ibmaxlen) {
+                       ipath_dbg("packet size %X > ibmax %X, fail\n",
+                               ofs<<2, dd->ipath_ibmaxlen);
+                       ret = -EMSGSIZE;
+                       goto unlock;
+               }
+
+               /*
+                * if the packet is >= 2KB mtu equivalent, we have to use
+                * the large buffers, and have to mark each descriptor as
+                * part of a large buffer packet.
+                */
+               if (ofs >= IPATH_SMALLBUF_DWORDS) {
+                       for (i = 0; i < pkt->naddr; i++) {
+                               dd->ipath_sdma_descq[dtail].qw[0] |=
+                                       cpu_to_le64(1ULL << 14);
+                               if (++dtail == dd->ipath_sdma_descq_cnt)
+                                       dtail = 0;
+                       }
+               }
+
+               dd->ipath_sdma_descq_added += pkt->naddr;
+               pkt->added = dd->ipath_sdma_descq_added;
+               list_move_tail(&pkt->list, &pq->sent);
+               ret++;
+       }
+
+unlock_check_tail:
+       /* advance the tail on the chip if necessary */
+       if (dd->ipath_sdma_descq_tail != tail) {
+               wmb();
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail, tail);
+               dd->ipath_sdma_descq_tail = tail;
+       }
+
+unlock:
+       spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+
+       return ret;
+}
+
+int ipath_user_sdma_writev(struct ipath_devdata *dd,
+                          struct ipath_user_sdma_queue *pq,
+                          const struct iovec *iov,
+                          unsigned long dim)
+{
+       int ret = 0;
+       struct list_head list;
+       int npkts = 0;
+
+       INIT_LIST_HEAD(&list);
+
+       mutex_lock(&pq->lock);
+
+       if (dd->ipath_sdma_descq_added != dd->ipath_sdma_descq_removed) {
+               ipath_user_sdma_hwqueue_clean(dd);
+               ipath_user_sdma_queue_clean(dd, pq);
+       }
+
+       while (dim) {
+               const int mxp = 8;      /* queue at most 8 packets per pass */
+
+               ret = ipath_user_sdma_queue_pkts(dd, pq, &list, iov, dim, mxp);
+               if (ret <= 0)
+                       goto done_unlock;
+               else {
+                       dim -= ret;
+                       iov += ret;
+               }
+
+               /* force packets onto the sdma hw queue... */
+               if (!list_empty(&list)) {
+                       /*
+                        * lazily clean hw queue.  the 4 is a guess of about
+                        * how many sdma descriptors a packet will take (it
+                        * doesn't have to be perfect).
+                        */
+                       if (ipath_sdma_descq_freecnt(dd) < ret * 4) {
+                               ipath_user_sdma_hwqueue_clean(dd);
+                               ipath_user_sdma_queue_clean(dd, pq);
+                       }
+
+                       ret = ipath_user_sdma_push_pkts(dd, pq, &list);
+                       if (ret < 0)
+                               goto done_unlock;
+                       else {
+                               npkts += ret;
+                               pq->counter += ret;
+
+                               if (!list_empty(&list))
+                                       goto done_unlock;
+                       }
+               }
+       }
+
+done_unlock:
+       if (!list_empty(&list))
+               ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &list);
+       mutex_unlock(&pq->lock);
+
+       return (ret < 0) ? ret : npkts;
+}
+
+int ipath_user_sdma_make_progress(struct ipath_devdata *dd,
+                                 struct ipath_user_sdma_queue *pq)
+{
+       int ret = 0;
+
+       mutex_lock(&pq->lock);
+       ipath_user_sdma_hwqueue_clean(dd);
+       ret = ipath_user_sdma_queue_clean(dd, pq);
+       mutex_unlock(&pq->lock);
+
+       return ret;
+}
+
+u32 ipath_user_sdma_complete_counter(const struct ipath_user_sdma_queue *pq)
+{
+       return pq->sent_counter;
+}
+
+u32 ipath_user_sdma_inflight_counter(struct ipath_user_sdma_queue *pq)
+{
+       return pq->counter;
+}
+
diff --git a/drivers/staging/rdma/ipath/ipath_user_sdma.h b/drivers/staging/rdma/ipath/ipath_user_sdma.h
new file mode 100644 (file)
index 0000000..fc76316
--- /dev/null
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2007, 2008 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/device.h>
+
+struct ipath_user_sdma_queue;
+
+struct ipath_user_sdma_queue *
+ipath_user_sdma_queue_create(struct device *dev, int unit, int port, int sport);
+void ipath_user_sdma_queue_destroy(struct ipath_user_sdma_queue *pq);
+
+int ipath_user_sdma_writev(struct ipath_devdata *dd,
+                          struct ipath_user_sdma_queue *pq,
+                          const struct iovec *iov,
+                          unsigned long dim);
+
+int ipath_user_sdma_make_progress(struct ipath_devdata *dd,
+                                 struct ipath_user_sdma_queue *pq);
+
+void ipath_user_sdma_queue_drain(struct ipath_devdata *dd,
+                                struct ipath_user_sdma_queue *pq);
+
+u32 ipath_user_sdma_complete_counter(const struct ipath_user_sdma_queue *pq);
+u32 ipath_user_sdma_inflight_counter(struct ipath_user_sdma_queue *pq);
diff --git a/drivers/staging/rdma/ipath/ipath_verbs.c b/drivers/staging/rdma/ipath/ipath_verbs.c
new file mode 100644 (file)
index 0000000..ed2bbc2
--- /dev/null
@@ -0,0 +1,2365 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_user_verbs.h>
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/utsname.h>
+#include <linux/rculist.h>
+
+#include "ipath_kernel.h"
+#include "ipath_verbs.h"
+#include "ipath_common.h"
+
+static unsigned int ib_ipath_qp_table_size = 251;
+module_param_named(qp_table_size, ib_ipath_qp_table_size, uint, S_IRUGO);
+MODULE_PARM_DESC(qp_table_size, "QP table size");
+
+unsigned int ib_ipath_lkey_table_size = 12;
+module_param_named(lkey_table_size, ib_ipath_lkey_table_size, uint,
+                  S_IRUGO);
+MODULE_PARM_DESC(lkey_table_size,
+                "LKEY table size in bits (2^n, 1 <= n <= 23)");
+
+static unsigned int ib_ipath_max_pds = 0xFFFF;
+module_param_named(max_pds, ib_ipath_max_pds, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_pds,
+                "Maximum number of protection domains to support");
+
+static unsigned int ib_ipath_max_ahs = 0xFFFF;
+module_param_named(max_ahs, ib_ipath_max_ahs, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_ahs, "Maximum number of address handles to support");
+
+unsigned int ib_ipath_max_cqes = 0x2FFFF;
+module_param_named(max_cqes, ib_ipath_max_cqes, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_cqes,
+                "Maximum number of completion queue entries to support");
+
+unsigned int ib_ipath_max_cqs = 0x1FFFF;
+module_param_named(max_cqs, ib_ipath_max_cqs, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_cqs, "Maximum number of completion queues to support");
+
+unsigned int ib_ipath_max_qp_wrs = 0x3FFF;
+module_param_named(max_qp_wrs, ib_ipath_max_qp_wrs, uint,
+                  S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_qp_wrs, "Maximum number of QP WRs to support");
+
+unsigned int ib_ipath_max_qps = 16384;
+module_param_named(max_qps, ib_ipath_max_qps, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_qps, "Maximum number of QPs to support");
+
+unsigned int ib_ipath_max_sges = 0x60;
+module_param_named(max_sges, ib_ipath_max_sges, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_sges, "Maximum number of SGEs to support");
+
+unsigned int ib_ipath_max_mcast_grps = 16384;
+module_param_named(max_mcast_grps, ib_ipath_max_mcast_grps, uint,
+                  S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_mcast_grps,
+                "Maximum number of multicast groups to support");
+
+unsigned int ib_ipath_max_mcast_qp_attached = 16;
+module_param_named(max_mcast_qp_attached, ib_ipath_max_mcast_qp_attached,
+                  uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_mcast_qp_attached,
+                "Maximum number of attached QPs to support");
+
+unsigned int ib_ipath_max_srqs = 1024;
+module_param_named(max_srqs, ib_ipath_max_srqs, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_srqs, "Maximum number of SRQs to support");
+
+unsigned int ib_ipath_max_srq_sges = 128;
+module_param_named(max_srq_sges, ib_ipath_max_srq_sges,
+                  uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_srq_sges, "Maximum number of SRQ SGEs to support");
+
+unsigned int ib_ipath_max_srq_wrs = 0x1FFFF;
+module_param_named(max_srq_wrs, ib_ipath_max_srq_wrs,
+                  uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs to support");
+
+static unsigned int ib_ipath_disable_sma;
+module_param_named(disable_sma, ib_ipath_disable_sma, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(disable_sma, "Disable the SMA");
+
+/*
+ * Note that it is OK to post send work requests in the SQE and ERR
+ * states; ipath_do_send() will process them and generate error
+ * completions as per IB 1.2 C10-96.
+ */
+const int ib_ipath_state_ops[IB_QPS_ERR + 1] = {
+       [IB_QPS_RESET] = 0,
+       [IB_QPS_INIT] = IPATH_POST_RECV_OK,
+       [IB_QPS_RTR] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK,
+       [IB_QPS_RTS] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
+           IPATH_POST_SEND_OK | IPATH_PROCESS_SEND_OK |
+           IPATH_PROCESS_NEXT_SEND_OK,
+       [IB_QPS_SQD] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
+           IPATH_POST_SEND_OK | IPATH_PROCESS_SEND_OK,
+       [IB_QPS_SQE] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
+           IPATH_POST_SEND_OK | IPATH_FLUSH_SEND,
+       [IB_QPS_ERR] = IPATH_POST_RECV_OK | IPATH_FLUSH_RECV |
+           IPATH_POST_SEND_OK | IPATH_FLUSH_SEND,
+};
+
+struct ipath_ucontext {
+       struct ib_ucontext ibucontext;
+};
+
+static inline struct ipath_ucontext *to_iucontext(struct ib_ucontext
+                                                 *ibucontext)
+{
+       return container_of(ibucontext, struct ipath_ucontext, ibucontext);
+}
+
+/*
+ * Translate ib_wr_opcode into ib_wc_opcode.
+ */
+const enum ib_wc_opcode ib_ipath_wc_opcode[] = {
+       [IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE,
+       [IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
+       [IB_WR_SEND] = IB_WC_SEND,
+       [IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
+       [IB_WR_RDMA_READ] = IB_WC_RDMA_READ,
+       [IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP,
+       [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD
+};
+
+/*
+ * System image GUID.
+ */
+static __be64 sys_image_guid;
+
+/**
+ * ipath_copy_sge - copy data to SGE memory
+ * @ss: the SGE state
+ * @data: the data to copy
+ * @length: the length of the data
+ */
+void ipath_copy_sge(struct ipath_sge_state *ss, void *data, u32 length)
+{
+       struct ipath_sge *sge = &ss->sge;
+
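+       /*
+        * Walk the SGE list: when an SGE is exhausted move on to the next
+        * one, and when a memory-region segment is exhausted step to its
+        * next mapped segment.
+        */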
+       while (length) {
+               u32 len = sge->length;
+
+               if (len > length)
+                       len = length;
+               if (len > sge->sge_length)
+                       len = sge->sge_length;
+               BUG_ON(len == 0);
+               memcpy(sge->vaddr, data, len);
+               sge->vaddr += len;
+               sge->length -= len;
+               sge->sge_length -= len;
+               if (sge->sge_length == 0) {
+                       if (--ss->num_sge)
+                               *sge = *ss->sg_list++;
+               } else if (sge->length == 0 && sge->mr != NULL) {
+                       if (++sge->n >= IPATH_SEGSZ) {
+                               if (++sge->m >= sge->mr->mapsz)
+                                       break;
+                               sge->n = 0;
+                       }
+                       sge->vaddr =
+                               sge->mr->map[sge->m]->segs[sge->n].vaddr;
+                       sge->length =
+                               sge->mr->map[sge->m]->segs[sge->n].length;
+               }
+               data += len;
+               length -= len;
+       }
+}
+
+/**
+ * ipath_skip_sge - skip over SGE memory - XXX almost dup of prev func
+ * @ss: the SGE state
+ * @length: the number of bytes to skip
+ */
+void ipath_skip_sge(struct ipath_sge_state *ss, u32 length)
+{
+       struct ipath_sge *sge = &ss->sge;
+
+       while (length) {
+               u32 len = sge->length;
+
+               if (len > length)
+                       len = length;
+               if (len > sge->sge_length)
+                       len = sge->sge_length;
+               BUG_ON(len == 0);
+               sge->vaddr += len;
+               sge->length -= len;
+               sge->sge_length -= len;
+               if (sge->sge_length == 0) {
+                       if (--ss->num_sge)
+                               *sge = *ss->sg_list++;
+               } else if (sge->length == 0 && sge->mr != NULL) {
+                       if (++sge->n >= IPATH_SEGSZ) {
+                               if (++sge->m >= sge->mr->mapsz)
+                                       break;
+                               sge->n = 0;
+                       }
+                       sge->vaddr =
+                               sge->mr->map[sge->m]->segs[sge->n].vaddr;
+                       sge->length =
+                               sge->mr->map[sge->m]->segs[sge->n].length;
+               }
+               length -= len;
+       }
+}
+
+/*
+ * Count the number of DMA descriptors needed to send length bytes of data.
+ * Don't modify the ipath_sge_state to get the count.
+ * Return zero if any of the segments is not aligned.
+ */
+static u32 ipath_count_sge(struct ipath_sge_state *ss, u32 length)
+{
+       struct ipath_sge *sg_list = ss->sg_list;
+       struct ipath_sge sge = ss->sge;
+       u8 num_sge = ss->num_sge;
+       u32 ndesc = 1;  /* count the header */
+
+       while (length) {
+               u32 len = sge.length;
+
+               if (len > length)
+                       len = length;
+               if (len > sge.sge_length)
+                       len = sge.sge_length;
+               BUG_ON(len == 0);
+               if (((long) sge.vaddr & (sizeof(u32) - 1)) ||
+                   (len != length && (len & (sizeof(u32) - 1)))) {
+                       ndesc = 0;
+                       break;
+               }
+               ndesc++;
+               sge.vaddr += len;
+               sge.length -= len;
+               sge.sge_length -= len;
+               if (sge.sge_length == 0) {
+                       if (--num_sge)
+                               sge = *sg_list++;
+               } else if (sge.length == 0 && sge.mr != NULL) {
+                       if (++sge.n >= IPATH_SEGSZ) {
+                               if (++sge.m >= sge.mr->mapsz)
+                                       break;
+                               sge.n = 0;
+                       }
+                       sge.vaddr =
+                               sge.mr->map[sge.m]->segs[sge.n].vaddr;
+                       sge.length =
+                               sge.mr->map[sge.m]->segs[sge.n].length;
+               }
+               length -= len;
+       }
+       return ndesc;
+}
+
+/*
+ * Copy from the SGEs to the data buffer.
+ */
+static void ipath_copy_from_sge(void *data, struct ipath_sge_state *ss,
+                               u32 length)
+{
+       struct ipath_sge *sge = &ss->sge;
+
+       while (length) {
+               u32 len = sge->length;
+
+               if (len > length)
+                       len = length;
+               if (len > sge->sge_length)
+                       len = sge->sge_length;
+               BUG_ON(len == 0);
+               memcpy(data, sge->vaddr, len);
+               sge->vaddr += len;
+               sge->length -= len;
+               sge->sge_length -= len;
+               if (sge->sge_length == 0) {
+                       if (--ss->num_sge)
+                               *sge = *ss->sg_list++;
+               } else if (sge->length == 0 && sge->mr != NULL) {
+                       if (++sge->n >= IPATH_SEGSZ) {
+                               if (++sge->m >= sge->mr->mapsz)
+                                       break;
+                               sge->n = 0;
+                       }
+                       sge->vaddr =
+                               sge->mr->map[sge->m]->segs[sge->n].vaddr;
+                       sge->length =
+                               sge->mr->map[sge->m]->segs[sge->n].length;
+               }
+               data += len;
+               length -= len;
+       }
+}
+
+/**
+ * ipath_post_one_send - post one RC, UC, or UD send work request
+ * @qp: the QP to post on
+ * @wr: the work request to send
+ */
+static int ipath_post_one_send(struct ipath_qp *qp, struct ib_send_wr *wr)
+{
+       struct ipath_swqe *wqe;
+       u32 next;
+       int i;
+       int j;
+       int acc;
+       int ret;
+       unsigned long flags;
+       struct ipath_devdata *dd = to_idev(qp->ibqp.device)->dd;
+
+       spin_lock_irqsave(&qp->s_lock, flags);
+
+       if (qp->ibqp.qp_type != IB_QPT_SMI &&
+           !(dd->ipath_flags & IPATH_LINKACTIVE)) {
+               ret = -ENETDOWN;
+               goto bail;
+       }
+
+       /* Check that state is OK to post send. */
+       if (unlikely(!(ib_ipath_state_ops[qp->state] & IPATH_POST_SEND_OK)))
+               goto bail_inval;
+
+       /* IB spec says that num_sge == 0 is OK. */
+       if (wr->num_sge > qp->s_max_sge)
+               goto bail_inval;
+
+       /*
+        * Don't allow RDMA reads or atomic operations on UC or
+        * undefined operations.
+        * Make sure buffer is large enough to hold the result for atomics.
+        */
+       if (qp->ibqp.qp_type == IB_QPT_UC) {
+               if ((unsigned) wr->opcode >= IB_WR_RDMA_READ)
+                       goto bail_inval;
+       } else if (qp->ibqp.qp_type == IB_QPT_UD) {
+               /* Check UD opcode */
+               if (wr->opcode != IB_WR_SEND &&
+                   wr->opcode != IB_WR_SEND_WITH_IMM)
+                       goto bail_inval;
+               /* Check UD destination address PD */
+               if (qp->ibqp.pd != wr->wr.ud.ah->pd)
+                       goto bail_inval;
+       } else if ((unsigned) wr->opcode > IB_WR_ATOMIC_FETCH_AND_ADD)
+               goto bail_inval;
+       else if (wr->opcode >= IB_WR_ATOMIC_CMP_AND_SWP &&
+                  (wr->num_sge == 0 ||
+                   wr->sg_list[0].length < sizeof(u64) ||
+                   wr->sg_list[0].addr & (sizeof(u64) - 1)))
+               goto bail_inval;
+       else if (wr->opcode >= IB_WR_RDMA_READ && !qp->s_max_rd_atomic)
+               goto bail_inval;
+
+       next = qp->s_head + 1;
+       if (next >= qp->s_size)
+               next = 0;
+       if (next == qp->s_last) {
+               ret = -ENOMEM;
+               goto bail;
+       }
+
+       wqe = get_swqe_ptr(qp, qp->s_head);
+       wqe->wr = *wr;
+       wqe->length = 0;
+       if (wr->num_sge) {
+               acc = wr->opcode >= IB_WR_RDMA_READ ?
+                       IB_ACCESS_LOCAL_WRITE : 0;
+               for (i = 0, j = 0; i < wr->num_sge; i++) {
+                       u32 length = wr->sg_list[i].length;
+                       int ok;
+
+                       if (length == 0)
+                               continue;
+                       ok = ipath_lkey_ok(qp, &wqe->sg_list[j],
+                                          &wr->sg_list[i], acc);
+                       if (!ok)
+                               goto bail_inval;
+                       wqe->length += length;
+                       j++;
+               }
+               wqe->wr.num_sge = j;
+       }
+       if (qp->ibqp.qp_type == IB_QPT_UC ||
+           qp->ibqp.qp_type == IB_QPT_RC) {
+               if (wqe->length > 0x80000000U)
+                       goto bail_inval;
+       } else if (wqe->length > to_idev(qp->ibqp.device)->dd->ipath_ibmtu)
+               goto bail_inval;
+       wqe->ssn = qp->s_ssn++;
+       qp->s_head = next;
+
+       ret = 0;
+       goto bail;
+
+bail_inval:
+       ret = -EINVAL;
+bail:
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+       return ret;
+}
+
+/**
+ * ipath_post_send - post a send on a QP
+ * @ibqp: the QP to post the send on
+ * @wr: the list of work requests to post
+ * @bad_wr: the first bad WR is put here
+ *
+ * This may be called from interrupt context.
+ */
+static int ipath_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+                          struct ib_send_wr **bad_wr)
+{
+       struct ipath_qp *qp = to_iqp(ibqp);
+       int err = 0;
+
+       for (; wr; wr = wr->next) {
+               err = ipath_post_one_send(qp, wr);
+               if (err) {
+                       *bad_wr = wr;
+                       goto bail;
+               }
+       }
+
+       /* Try to do the send work in the caller's context. */
+       ipath_do_send((unsigned long) qp);
+
+bail:
+       return err;
+}
+
+/**
+ * ipath_post_receive - post a receive on a QP
+ * @ibqp: the QP to post the receive on
+ * @wr: the WR to post
+ * @bad_wr: the first bad WR is put here
+ *
+ * This may be called from interrupt context.
+ */
+static int ipath_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+                             struct ib_recv_wr **bad_wr)
+{
+       struct ipath_qp *qp = to_iqp(ibqp);
+       struct ipath_rwq *wq = qp->r_rq.wq;
+       unsigned long flags;
+       int ret;
+
+       /* Check that state is OK to post receive. */
+       if (!(ib_ipath_state_ops[qp->state] & IPATH_POST_RECV_OK) || !wq) {
+               *bad_wr = wr;
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       for (; wr; wr = wr->next) {
+               struct ipath_rwqe *wqe;
+               u32 next;
+               int i;
+
+               if ((unsigned) wr->num_sge > qp->r_rq.max_sge) {
+                       *bad_wr = wr;
+                       ret = -EINVAL;
+                       goto bail;
+               }
+
+               spin_lock_irqsave(&qp->r_rq.lock, flags);
+               next = wq->head + 1;
+               if (next >= qp->r_rq.size)
+                       next = 0;
+               if (next == wq->tail) {
+                       spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+                       *bad_wr = wr;
+                       ret = -ENOMEM;
+                       goto bail;
+               }
+
+               wqe = get_rwqe_ptr(&qp->r_rq, wq->head);
+               wqe->wr_id = wr->wr_id;
+               wqe->num_sge = wr->num_sge;
+               for (i = 0; i < wr->num_sge; i++)
+                       wqe->sg_list[i] = wr->sg_list[i];
+               /* Make sure queue entry is written before the head index. */
+               smp_wmb();
+               wq->head = next;
+               spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+       }
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+/**
+ * ipath_qp_rcv - processing an incoming packet on a QP
+ * @dev: the device the packet came on
+ * @hdr: the packet header
+ * @has_grh: true if the packet has a GRH
+ * @data: the packet data
+ * @tlen: the packet length
+ * @qp: the QP the packet came on
+ *
+ * This is called from ipath_ib_rcv() to process an incoming packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+static void ipath_qp_rcv(struct ipath_ibdev *dev,
+                        struct ipath_ib_header *hdr, int has_grh,
+                        void *data, u32 tlen, struct ipath_qp *qp)
+{
+       /* Check for valid receive state. */
+       if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) {
+               dev->n_pkt_drops++;
+               return;
+       }
+
+       switch (qp->ibqp.qp_type) {
+       case IB_QPT_SMI:
+       case IB_QPT_GSI:
+               if (ib_ipath_disable_sma)
+                       break;
+               /* FALLTHROUGH */
+       case IB_QPT_UD:
+               ipath_ud_rcv(dev, hdr, has_grh, data, tlen, qp);
+               break;
+
+       case IB_QPT_RC:
+               ipath_rc_rcv(dev, hdr, has_grh, data, tlen, qp);
+               break;
+
+       case IB_QPT_UC:
+               ipath_uc_rcv(dev, hdr, has_grh, data, tlen, qp);
+               break;
+
+       default:
+               break;
+       }
+}
+
+/**
+ * ipath_ib_rcv - process an incoming packet
+ * @arg: the device pointer
+ * @rhdr: the header of the packet
+ * @data: the packet data
+ * @tlen: the packet length
+ *
+ * This is called from ipath_kreceive() to process an incoming packet at
+ * interrupt level. Tlen is the length of the header + data + CRC in bytes.
+ */
+void ipath_ib_rcv(struct ipath_ibdev *dev, void *rhdr, void *data,
+                 u32 tlen)
+{
+       struct ipath_ib_header *hdr = rhdr;
+       struct ipath_other_headers *ohdr;
+       struct ipath_qp *qp;
+       u32 qp_num;
+       int lnh;
+       u8 opcode;
+       u16 lid;
+
+       if (unlikely(dev == NULL))
+               goto bail;
+
+       if (unlikely(tlen < 24)) {      /* LRH+BTH+CRC */
+               dev->rcv_errors++;
+               goto bail;
+       }
+
+       /* Check for a valid destination LID (see ch. 7.11.1). */
+       lid = be16_to_cpu(hdr->lrh[1]);
+       if (lid < IPATH_MULTICAST_LID_BASE) {
+               lid &= ~((1 << dev->dd->ipath_lmc) - 1);
+               if (unlikely(lid != dev->dd->ipath_lid)) {
+                       dev->rcv_errors++;
+                       goto bail;
+               }
+       }
+
+       /* Check for GRH */
+       lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+       if (lnh == IPATH_LRH_BTH)
+               ohdr = &hdr->u.oth;
+       else if (lnh == IPATH_LRH_GRH)
+               ohdr = &hdr->u.l.oth;
+       else {
+               dev->rcv_errors++;
+               goto bail;
+       }
+
+       opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0x7f;
+       dev->opstats[opcode].n_bytes += tlen;
+       dev->opstats[opcode].n_packets++;
+
+       /* Get the destination QP number. */
+       qp_num = be32_to_cpu(ohdr->bth[1]) & IPATH_QPN_MASK;
+       if (qp_num == IPATH_MULTICAST_QPN) {
+               struct ipath_mcast *mcast;
+               struct ipath_mcast_qp *p;
+
+               if (lnh != IPATH_LRH_GRH) {
+                       dev->n_pkt_drops++;
+                       goto bail;
+               }
+               mcast = ipath_mcast_find(&hdr->u.l.grh.dgid);
+               if (mcast == NULL) {
+                       dev->n_pkt_drops++;
+                       goto bail;
+               }
+               dev->n_multicast_rcv++;
+               list_for_each_entry_rcu(p, &mcast->qp_list, list)
+                       ipath_qp_rcv(dev, hdr, 1, data, tlen, p->qp);
+               /*
+                * Notify ipath_multicast_detach() if it is waiting for us
+                * to finish.
+                */
+               if (atomic_dec_return(&mcast->refcount) <= 1)
+                       wake_up(&mcast->wait);
+       } else {
+               qp = ipath_lookup_qpn(&dev->qp_table, qp_num);
+               if (qp) {
+                       dev->n_unicast_rcv++;
+                       ipath_qp_rcv(dev, hdr, lnh == IPATH_LRH_GRH, data,
+                                    tlen, qp);
+                       /*
+                        * Notify ipath_destroy_qp() if it is waiting
+                        * for us to finish.
+                        */
+                       if (atomic_dec_and_test(&qp->refcount))
+                               wake_up(&qp->wait);
+               } else
+                       dev->n_pkt_drops++;
+       }
+
+bail:;
+}
+
+/**
+ * ipath_ib_timer - verbs timer
+ * @dev: the device pointer
+ *
+ * This is called from ipath_do_rcv_timer() at interrupt level to check for
+ * QPs which need retransmits and to collect performance numbers.
+ */
+static void ipath_ib_timer(struct ipath_ibdev *dev)
+{
+       struct ipath_qp *resend = NULL;
+       struct ipath_qp *rnr = NULL;
+       struct list_head *last;
+       struct ipath_qp *qp;
+       unsigned long flags;
+
+       if (dev == NULL)
+               return;
+
+       spin_lock_irqsave(&dev->pending_lock, flags);
+       /* Start filling the next pending queue. */
+       if (++dev->pending_index >= ARRAY_SIZE(dev->pending))
+               dev->pending_index = 0;
+       /* Save any requests still in the new queue; they have timed out. */
+       last = &dev->pending[dev->pending_index];
+       while (!list_empty(last)) {
+               qp = list_entry(last->next, struct ipath_qp, timerwait);
+               list_del_init(&qp->timerwait);
+               qp->timer_next = resend;
+               resend = qp;
+               atomic_inc(&qp->refcount);
+       }
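+       /*
+        * Only the head of the RNR wait list is ticked; once its timeout
+        * expires, collect it and any following entries whose timeouts
+        * have also reached zero.
+        */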
+       last = &dev->rnrwait;
+       if (!list_empty(last)) {
+               qp = list_entry(last->next, struct ipath_qp, timerwait);
+               if (--qp->s_rnr_timeout == 0) {
+                       do {
+                               list_del_init(&qp->timerwait);
+                               qp->timer_next = rnr;
+                               rnr = qp;
+                               atomic_inc(&qp->refcount);
+                               if (list_empty(last))
+                                       break;
+                               qp = list_entry(last->next, struct ipath_qp,
+                                               timerwait);
+                       } while (qp->s_rnr_timeout == 0);
+               }
+       }
+       /*
+        * We should only be in the started state if pma_sample_start != 0
+        */
+       if (dev->pma_sample_status == IB_PMA_SAMPLE_STATUS_STARTED &&
+           --dev->pma_sample_start == 0) {
+               dev->pma_sample_status = IB_PMA_SAMPLE_STATUS_RUNNING;
+               ipath_snapshot_counters(dev->dd, &dev->ipath_sword,
+                                       &dev->ipath_rword,
+                                       &dev->ipath_spkts,
+                                       &dev->ipath_rpkts,
+                                       &dev->ipath_xmit_wait);
+       }
+       if (dev->pma_sample_status == IB_PMA_SAMPLE_STATUS_RUNNING) {
+               if (dev->pma_sample_interval == 0) {
+                       u64 ta, tb, tc, td, te;
+
+                       dev->pma_sample_status = IB_PMA_SAMPLE_STATUS_DONE;
+                       ipath_snapshot_counters(dev->dd, &ta, &tb,
+                                               &tc, &td, &te);
+
+                       dev->ipath_sword = ta - dev->ipath_sword;
+                       dev->ipath_rword = tb - dev->ipath_rword;
+                       dev->ipath_spkts = tc - dev->ipath_spkts;
+                       dev->ipath_rpkts = td - dev->ipath_rpkts;
+                       dev->ipath_xmit_wait = te - dev->ipath_xmit_wait;
+               }
+               else
+                       dev->pma_sample_interval--;
+       }
+       spin_unlock_irqrestore(&dev->pending_lock, flags);
+
+       /* XXX What if timer fires again while this is running? */
+       while (resend != NULL) {
+               qp = resend;
+               resend = qp->timer_next;
+
+               spin_lock_irqsave(&qp->s_lock, flags);
+               if (qp->s_last != qp->s_tail &&
+                   ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) {
+                       dev->n_timeouts++;
+                       ipath_restart_rc(qp, qp->s_last_psn + 1);
+               }
+               spin_unlock_irqrestore(&qp->s_lock, flags);
+
+               /* Notify ipath_destroy_qp() if it is waiting. */
+               if (atomic_dec_and_test(&qp->refcount))
+                       wake_up(&qp->wait);
+       }
+       while (rnr != NULL) {
+               qp = rnr;
+               rnr = qp->timer_next;
+
+               spin_lock_irqsave(&qp->s_lock, flags);
+               if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)
+                       ipath_schedule_send(qp);
+               spin_unlock_irqrestore(&qp->s_lock, flags);
+
+               /* Notify ipath_destroy_qp() if it is waiting. */
+               if (atomic_dec_and_test(&qp->refcount))
+                       wake_up(&qp->wait);
+       }
+}
+
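+/*
+ * update_sge - advance the SGE state by 'length' bytes
+ *
+ * Moves on to the next SGE in the list when the current one is exhausted,
+ * or to the next segment of the memory region when the current segment
+ * runs out.
+ */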
+static void update_sge(struct ipath_sge_state *ss, u32 length)
+{
+       struct ipath_sge *sge = &ss->sge;
+
+       sge->vaddr += length;
+       sge->length -= length;
+       sge->sge_length -= length;
+       if (sge->sge_length == 0) {
+               if (--ss->num_sge)
+                       *sge = *ss->sg_list++;
+       } else if (sge->length == 0 && sge->mr != NULL) {
+               if (++sge->n >= IPATH_SEGSZ) {
+                       if (++sge->m >= sge->mr->mapsz)
+                               return;
+                       sge->n = 0;
+               }
+               sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
+               sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
+       }
+}
+
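+/*
+ * Helpers for assembling 32-bit words from unaligned payload bytes.
+ * The little- and big-endian variants shift in opposite directions so
+ * that copy_io() can accumulate partial words the same way on either
+ * byte order.
+ */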
+#ifdef __LITTLE_ENDIAN
+static inline u32 get_upper_bits(u32 data, u32 shift)
+{
+       return data >> shift;
+}
+
+static inline u32 set_upper_bits(u32 data, u32 shift)
+{
+       return data << shift;
+}
+
+static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off)
+{
+       data <<= ((sizeof(u32) - n) * BITS_PER_BYTE);
+       data >>= ((sizeof(u32) - n - off) * BITS_PER_BYTE);
+       return data;
+}
+#else
+static inline u32 get_upper_bits(u32 data, u32 shift)
+{
+       return data << shift;
+}
+
+static inline u32 set_upper_bits(u32 data, u32 shift)
+{
+       return data >> shift;
+}
+
+static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off)
+{
+       data >>= ((sizeof(u32) - n) * BITS_PER_BYTE);
+       data <<= ((sizeof(u32) - n - off) * BITS_PER_BYTE);
+       return data;
+}
+#endif
+
+static void copy_io(u32 __iomem *piobuf, struct ipath_sge_state *ss,
+                   u32 length, unsigned flush_wc)
+{
+       u32 extra = 0;
+       u32 data = 0;
+       u32 last;
+
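+       /*
+        * Walk the SGE list, accumulating partial words in 'data' (with
+        * 'extra' valid bytes) when the source is not 32-bit aligned, and
+        * defer the final word to 'last' so it can be written as the
+        * trigger word after any required write-combining flush.
+        */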
+       while (1) {
+               u32 len = ss->sge.length;
+               u32 off;
+
+               if (len > length)
+                       len = length;
+               if (len > ss->sge.sge_length)
+                       len = ss->sge.sge_length;
+               BUG_ON(len == 0);
+               /* If the source address is not aligned, try to align it. */
+               off = (unsigned long)ss->sge.vaddr & (sizeof(u32) - 1);
+               if (off) {
+                       u32 *addr = (u32 *)((unsigned long)ss->sge.vaddr &
+                                           ~(sizeof(u32) - 1));
+                       u32 v = get_upper_bits(*addr, off * BITS_PER_BYTE);
+                       u32 y;
+
+                       y = sizeof(u32) - off;
+                       if (len > y)
+                               len = y;
+                       if (len + extra >= sizeof(u32)) {
+                               data |= set_upper_bits(v, extra *
+                                                      BITS_PER_BYTE);
+                               len = sizeof(u32) - extra;
+                               if (len == length) {
+                                       last = data;
+                                       break;
+                               }
+                               __raw_writel(data, piobuf);
+                               piobuf++;
+                               extra = 0;
+                               data = 0;
+                       } else {
+                               /* Clear unused upper bytes */
+                               data |= clear_upper_bytes(v, len, extra);
+                               if (len == length) {
+                                       last = data;
+                                       break;
+                               }
+                               extra += len;
+                       }
+               } else if (extra) {
+                       /* Source address is aligned. */
+                       u32 *addr = (u32 *) ss->sge.vaddr;
+                       int shift = extra * BITS_PER_BYTE;
+                       int ushift = 32 - shift;
+                       u32 l = len;
+
+                       while (l >= sizeof(u32)) {
+                               u32 v = *addr;
+
+                               data |= set_upper_bits(v, shift);
+                               __raw_writel(data, piobuf);
+                               data = get_upper_bits(v, ushift);
+                               piobuf++;
+                               addr++;
+                               l -= sizeof(u32);
+                       }
+                       /*
+                        * 'data' still holds 'extra' buffered bytes; handle
+                        * the remaining 'l' bytes (fewer than a full dword).
+                        */
+                       if (l) {
+                               u32 v = *addr;
+
+                               if (l + extra >= sizeof(u32)) {
+                                       data |= set_upper_bits(v, shift);
+                                       len -= l + extra - sizeof(u32);
+                                       if (len == length) {
+                                               last = data;
+                                               break;
+                                       }
+                                       __raw_writel(data, piobuf);
+                                       piobuf++;
+                                       extra = 0;
+                                       data = 0;
+                               } else {
+                                       /* Clear unused upper bytes */
+                                       data |= clear_upper_bytes(v, l,
+                                                                 extra);
+                                       if (len == length) {
+                                               last = data;
+                                               break;
+                                       }
+                                       extra += l;
+                               }
+                       } else if (len == length) {
+                               last = data;
+                               break;
+                       }
+               } else if (len == length) {
+                       u32 w;
+
+                       /*
+                        * Need to round up for the last dword in the
+                        * packet.
+                        */
+                       w = (len + 3) >> 2;
+                       __iowrite32_copy(piobuf, ss->sge.vaddr, w - 1);
+                       piobuf += w - 1;
+                       last = ((u32 *) ss->sge.vaddr)[w - 1];
+                       break;
+               } else {
+                       u32 w = len >> 2;
+
+                       __iowrite32_copy(piobuf, ss->sge.vaddr, w);
+                       piobuf += w;
+
+                       extra = len & (sizeof(u32) - 1);
+                       if (extra) {
+                               u32 v = ((u32 *) ss->sge.vaddr)[w];
+
+                               /* Clear unused upper bytes */
+                               data = clear_upper_bytes(v, extra, 0);
+                       }
+               }
+               update_sge(ss, len);
+               length -= len;
+       }
+       /* Update address before sending packet. */
+       update_sge(ss, length);
+       if (flush_wc) {
+               /* must flush everything before the trigger word */
+               ipath_flush_wc();
+               __raw_writel(last, piobuf);
+               /* be sure trigger word is written */
+               ipath_flush_wc();
+       } else
+               __raw_writel(last, piobuf);
+}
+
+/*
+ * Convert IB rate to delay multiplier.
+ */
+unsigned ipath_ib_rate_to_mult(enum ib_rate rate)
+{
+       switch (rate) {
+       case IB_RATE_2_5_GBPS: return 8;
+       case IB_RATE_5_GBPS:   return 4;
+       case IB_RATE_10_GBPS:  return 2;
+       case IB_RATE_20_GBPS:  return 1;
+       default:               return 0;
+       }
+}
+
+/*
+ * Convert delay multiplier to IB rate
+ */
+static enum ib_rate ipath_mult_to_ib_rate(unsigned mult)
+{
+       switch (mult) {
+       case 8:  return IB_RATE_2_5_GBPS;
+       case 4:  return IB_RATE_5_GBPS;
+       case 2:  return IB_RATE_10_GBPS;
+       case 1:  return IB_RATE_20_GBPS;
+       default: return IB_RATE_PORT_CURRENT;
+       }
+}
+
+static inline struct ipath_verbs_txreq *get_txreq(struct ipath_ibdev *dev)
+{
+       struct ipath_verbs_txreq *tx = NULL;
+       unsigned long flags;
+
+       spin_lock_irqsave(&dev->pending_lock, flags);
+       if (!list_empty(&dev->txreq_free)) {
+               struct list_head *l = dev->txreq_free.next;
+
+               list_del(l);
+               tx = list_entry(l, struct ipath_verbs_txreq, txreq.list);
+       }
+       spin_unlock_irqrestore(&dev->pending_lock, flags);
+       return tx;
+}
+
+static inline void put_txreq(struct ipath_ibdev *dev,
+                            struct ipath_verbs_txreq *tx)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&dev->pending_lock, flags);
+       list_add(&tx->txreq.list, &dev->txreq_free);
+       spin_unlock_irqrestore(&dev->pending_lock, flags);
+}
+
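+/*
+ * sdma_complete - callback invoked when an SDMA transmit request finishes
+ *
+ * Generates the send completion for the associated work request, frees
+ * any bounce buffer, returns the txreq to the free list and wakes anyone
+ * waiting for the QP's DMA activity to drain.
+ */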
+static void sdma_complete(void *cookie, int status)
+{
+       struct ipath_verbs_txreq *tx = cookie;
+       struct ipath_qp *qp = tx->qp;
+       struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+       unsigned long flags;
+       enum ib_wc_status ibs = status == IPATH_SDMA_TXREQ_S_OK ?
+               IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR;
+
+       if (atomic_dec_and_test(&qp->s_dma_busy)) {
+               spin_lock_irqsave(&qp->s_lock, flags);
+               if (tx->wqe)
+                       ipath_send_complete(qp, tx->wqe, ibs);
+               if ((ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND &&
+                    qp->s_last != qp->s_head) ||
+                   (qp->s_flags & IPATH_S_WAIT_DMA))
+                       ipath_schedule_send(qp);
+               spin_unlock_irqrestore(&qp->s_lock, flags);
+               wake_up(&qp->wait_dma);
+       } else if (tx->wqe) {
+               spin_lock_irqsave(&qp->s_lock, flags);
+               ipath_send_complete(qp, tx->wqe, ibs);
+               spin_unlock_irqrestore(&qp->s_lock, flags);
+       }
+
+       if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_FREEBUF)
+               kfree(tx->txreq.map_addr);
+       put_txreq(dev, tx);
+
+       if (atomic_dec_and_test(&qp->refcount))
+               wake_up(&qp->wait);
+}
+
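+/*
+ * decrement_dma_busy - drop the QP's outstanding-DMA count and, when it
+ * reaches zero, reschedule the send engine or wake a waiter as needed.
+ */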
+static void decrement_dma_busy(struct ipath_qp *qp)
+{
+       unsigned long flags;
+
+       if (atomic_dec_and_test(&qp->s_dma_busy)) {
+               spin_lock_irqsave(&qp->s_lock, flags);
+               if ((ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND &&
+                    qp->s_last != qp->s_head) ||
+                   (qp->s_flags & IPATH_S_WAIT_DMA))
+                       ipath_schedule_send(qp);
+               spin_unlock_irqrestore(&qp->s_lock, flags);
+               wake_up(&qp->wait_dma);
+       }
+}
+
+/*
+ * Compute the number of clock cycles of delay before sending the next packet.
+ * The multipliers reflect the number of clocks for the fastest rate, so
+ * one tick at 4xDDR is 8 ticks at 1xSDR.
+ * If the destination port will take longer to receive a packet than
+ * the outgoing link can send it, we need to delay sending the next packet
+ * by the difference in time it takes the receiver to receive and the sender
+ * to send this packet.
+ * Note that this delay is always correct for UC and RC but not always
+ * optimal for UD. For UD, the destination HCA can be different for each
+ * packet, in which case, we could send packets to a different destination
+ * while "waiting" for the delay. The overhead for doing this without
+ * HW support is more than just paying the cost of delaying some packets
+ * unnecessarily.
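+ *
+ * For example, a packet of plen = 256 dwords sent at 4X DDR (mult 1)
+ * to a 1X SDR receiver (mult 8) is delayed by
+ * (256 * (8 - 1) + 1) >> 1 = 896 ticks.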
+ */
+static inline unsigned ipath_pkt_delay(u32 plen, u8 snd_mult, u8 rcv_mult)
+{
+       return (rcv_mult > snd_mult) ?
+               (plen * (rcv_mult - snd_mult) + 1) >> 1 : 0;
+}
+
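+/*
+ * ipath_verbs_send_dma - queue a packet on the send DMA engine
+ *
+ * Builds (or re-posts) a verbs txreq for the header and payload and hands
+ * it to the SDMA engine.  If the request cannot be queued, it is saved in
+ * qp->s_tx and retried later.
+ */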
+static int ipath_verbs_send_dma(struct ipath_qp *qp,
+                               struct ipath_ib_header *hdr, u32 hdrwords,
+                               struct ipath_sge_state *ss, u32 len,
+                               u32 plen, u32 dwords)
+{
+       struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+       struct ipath_devdata *dd = dev->dd;
+       struct ipath_verbs_txreq *tx;
+       u32 *piobuf;
+       u32 control;
+       u32 ndesc;
+       int ret;
+
+       tx = qp->s_tx;
+       if (tx) {
+               qp->s_tx = NULL;
+               /* resend previously constructed packet */
+               atomic_inc(&qp->s_dma_busy);
+               ret = ipath_sdma_verbs_send(dd, tx->ss, tx->len, tx);
+               if (ret) {
+                       qp->s_tx = tx;
+                       decrement_dma_busy(qp);
+               }
+               goto bail;
+       }
+
+       tx = get_txreq(dev);
+       if (!tx) {
+               ret = -EBUSY;
+               goto bail;
+       }
+
+       /*
+        * Get the saved delay count we computed for the previous packet
+        * and save the delay count for this packet to be used next time
+        * we get here.
+        */
+       control = qp->s_pkt_delay;
+       qp->s_pkt_delay = ipath_pkt_delay(plen, dd->delay_mult, qp->s_dmult);
+
+       tx->qp = qp;
+       atomic_inc(&qp->refcount);
+       tx->wqe = qp->s_wqe;
+       tx->txreq.callback = sdma_complete;
+       tx->txreq.callback_cookie = tx;
+       tx->txreq.flags = IPATH_SDMA_TXREQ_F_HEADTOHOST |
+               IPATH_SDMA_TXREQ_F_INTREQ | IPATH_SDMA_TXREQ_F_FREEDESC;
+       if (plen + 1 >= IPATH_SMALLBUF_DWORDS)
+               tx->txreq.flags |= IPATH_SDMA_TXREQ_F_USELARGEBUF;
+
+       /* VL15 packets bypass credit check */
+       if ((be16_to_cpu(hdr->lrh[0]) >> 12) == 15) {
+               control |= 1ULL << 31;
+               tx->txreq.flags |= IPATH_SDMA_TXREQ_F_VL15;
+       }
+
+       if (len) {
+               /*
+                * Don't try to DMA if it takes more descriptors than
+                * the queue holds.
+                */
+               ndesc = ipath_count_sge(ss, len);
+               if (ndesc >= dd->ipath_sdma_descq_cnt)
+                       ndesc = 0;
+       } else
+               ndesc = 1;
+       if (ndesc) {
+               tx->hdr.pbc[0] = cpu_to_le32(plen);
+               tx->hdr.pbc[1] = cpu_to_le32(control);
+               memcpy(&tx->hdr.hdr, hdr, hdrwords << 2);
+               tx->txreq.sg_count = ndesc;
+               tx->map_len = (hdrwords + 2) << 2;
+               tx->txreq.map_addr = &tx->hdr;
+               atomic_inc(&qp->s_dma_busy);
+               ret = ipath_sdma_verbs_send(dd, ss, dwords, tx);
+               if (ret) {
+                       /* save ss and length in dwords */
+                       tx->ss = ss;
+                       tx->len = dwords;
+                       qp->s_tx = tx;
+                       decrement_dma_busy(qp);
+               }
+               goto bail;
+       }
+
+       /* Allocate a buffer and copy the header and payload to it. */
+       tx->map_len = (plen + 1) << 2;
+       piobuf = kmalloc(tx->map_len, GFP_ATOMIC);
+       if (unlikely(piobuf == NULL)) {
+               ret = -EBUSY;
+               goto err_tx;
+       }
+       tx->txreq.map_addr = piobuf;
+       tx->txreq.flags |= IPATH_SDMA_TXREQ_F_FREEBUF;
+       tx->txreq.sg_count = 1;
+
+       *piobuf++ = (__force u32) cpu_to_le32(plen);
+       *piobuf++ = (__force u32) cpu_to_le32(control);
+       memcpy(piobuf, hdr, hdrwords << 2);
+       ipath_copy_from_sge(piobuf + hdrwords, ss, len);
+
+       atomic_inc(&qp->s_dma_busy);
+       ret = ipath_sdma_verbs_send(dd, NULL, 0, tx);
+       /*
+        * If we couldn't queue the DMA request, save the info
+        * and try again later rather than destroying the
+        * buffer and undoing the side effects of the copy.
+        */
+       if (ret) {
+               tx->ss = NULL;
+               tx->len = 0;
+               qp->s_tx = tx;
+               decrement_dma_busy(qp);
+       }
+       dev->n_unaligned++;
+       goto bail;
+
+err_tx:
+       if (atomic_dec_and_test(&qp->refcount))
+               wake_up(&qp->wait);
+       put_txreq(dev, tx);
+bail:
+       return ret;
+}
+
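+/*
+ * ipath_verbs_send_pio - send a packet by programmed I/O
+ *
+ * Copies the header and payload directly into a PIO send buffer,
+ * inserting write-combining flushes where the chip requires them.
+ */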
+static int ipath_verbs_send_pio(struct ipath_qp *qp,
+                               struct ipath_ib_header *ibhdr, u32 hdrwords,
+                               struct ipath_sge_state *ss, u32 len,
+                               u32 plen, u32 dwords)
+{
+       struct ipath_devdata *dd = to_idev(qp->ibqp.device)->dd;
+       u32 *hdr = (u32 *) ibhdr;
+       u32 __iomem *piobuf;
+       unsigned flush_wc;
+       u32 control;
+       int ret;
+       unsigned long flags;
+
+       piobuf = ipath_getpiobuf(dd, plen, NULL);
+       if (unlikely(piobuf == NULL)) {
+               ret = -EBUSY;
+               goto bail;
+       }
+
+       /*
+        * Get the saved delay count we computed for the previous packet
+        * and save the delay count for this packet to be used next time
+        * we get here.
+        */
+       control = qp->s_pkt_delay;
+       qp->s_pkt_delay = ipath_pkt_delay(plen, dd->delay_mult, qp->s_dmult);
+
+       /* VL15 packets bypass credit check */
+       if ((be16_to_cpu(ibhdr->lrh[0]) >> 12) == 15)
+               control |= 1ULL << 31;
+
+       /*
+        * Write the length to the control qword plus any needed flags.
+        * We have to flush after the PBC for correctness on some cpus,
+        * or the WC buffer can be written out of order.
+        */
+       writeq(((u64) control << 32) | plen, piobuf);
+       piobuf += 2;
+
+       flush_wc = dd->ipath_flags & IPATH_PIO_FLUSH_WC;
+       if (len == 0) {
+               /*
+                * If there is just the header portion, we must flush before
+                * writing the last word of the header for correctness, and
+                * again after the last header word (the trigger word).
+                */
+               if (flush_wc) {
+                       ipath_flush_wc();
+                       __iowrite32_copy(piobuf, hdr, hdrwords - 1);
+                       ipath_flush_wc();
+                       __raw_writel(hdr[hdrwords - 1], piobuf + hdrwords - 1);
+                       ipath_flush_wc();
+               } else
+                       __iowrite32_copy(piobuf, hdr, hdrwords);
+               goto done;
+       }
+
+       if (flush_wc)
+               ipath_flush_wc();
+       __iowrite32_copy(piobuf, hdr, hdrwords);
+       piobuf += hdrwords;
+
+       /* The common case is aligned and contained in one segment. */
+       if (likely(ss->num_sge == 1 && len <= ss->sge.length &&
+                  !((unsigned long)ss->sge.vaddr & (sizeof(u32) - 1)))) {
+               u32 *addr = (u32 *) ss->sge.vaddr;
+
+               /* Update address before sending packet. */
+               update_sge(ss, len);
+               if (flush_wc) {
+                       __iowrite32_copy(piobuf, addr, dwords - 1);
+                       /* must flush everything before the trigger word */
+                       ipath_flush_wc();
+                       __raw_writel(addr[dwords - 1], piobuf + dwords - 1);
+                       /* be sure trigger word is written */
+                       ipath_flush_wc();
+               } else
+                       __iowrite32_copy(piobuf, addr, dwords);
+               goto done;
+       }
+       copy_io(piobuf, ss, len, flush_wc);
+done:
+       if (qp->s_wqe) {
+               spin_lock_irqsave(&qp->s_lock, flags);
+               ipath_send_complete(qp, qp->s_wqe, IB_WC_SUCCESS);
+               spin_unlock_irqrestore(&qp->s_lock, flags);
+       }
+       ret = 0;
+bail:
+       return ret;
+}
+
+/**
+ * ipath_verbs_send - send a packet
+ * @qp: the QP to send on
+ * @hdr: the packet header
+ * @hdrwords: the number of 32-bit words in the header
+ * @ss: the SGE to send
+ * @len: the length of the packet in bytes
+ */
+int ipath_verbs_send(struct ipath_qp *qp, struct ipath_ib_header *hdr,
+                    u32 hdrwords, struct ipath_sge_state *ss, u32 len)
+{
+       struct ipath_devdata *dd = to_idev(qp->ibqp.device)->dd;
+       u32 plen;
+       int ret;
+       u32 dwords = (len + 3) >> 2;
+
+       /*
+        * Calculate the send buffer trigger address.
+        * The +1 accounts for the PBC control dword that follows the PBC length.
+        */
+       plen = hdrwords + dwords + 1;
+
+       /*
+        * VL15 packets (IB_QPT_SMI) will always use PIO, so we
+        * can defer SDMA restart until link goes ACTIVE without
+        * worrying about just how we got there.
+        */
+       if (qp->ibqp.qp_type == IB_QPT_SMI ||
+           !(dd->ipath_flags & IPATH_HAS_SEND_DMA))
+               ret = ipath_verbs_send_pio(qp, hdr, hdrwords, ss, len,
+                                          plen, dwords);
+       else
+               ret = ipath_verbs_send_dma(qp, hdr, hdrwords, ss, len,
+                                          plen, dwords);
+
+       return ret;
+}
+
+int ipath_snapshot_counters(struct ipath_devdata *dd, u64 *swords,
+                           u64 *rwords, u64 *spkts, u64 *rpkts,
+                           u64 *xmit_wait)
+{
+       int ret;
+
+       if (!(dd->ipath_flags & IPATH_INITTED)) {
+               /* no hardware, freeze, etc. */
+               ret = -EINVAL;
+               goto bail;
+       }
+       *swords = ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordsendcnt);
+       *rwords = ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordrcvcnt);
+       *spkts = ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktsendcnt);
+       *rpkts = ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktrcvcnt);
+       *xmit_wait = ipath_snap_cntr(dd, dd->ipath_cregs->cr_sendstallcnt);
+
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+/**
+ * ipath_get_counters - get various chip counters
+ * @dd: the infinipath device
+ * @cntrs: counters are placed here
+ *
+ * Return the counters needed by recv_pma_get_portcounters().
+ */
+int ipath_get_counters(struct ipath_devdata *dd,
+                      struct ipath_verbs_counters *cntrs)
+{
+       struct ipath_cregs const *crp = dd->ipath_cregs;
+       int ret;
+
+       if (!(dd->ipath_flags & IPATH_INITTED)) {
+               /* no hardware, freeze, etc. */
+               ret = -EINVAL;
+               goto bail;
+       }
+       cntrs->symbol_error_counter =
+               ipath_snap_cntr(dd, crp->cr_ibsymbolerrcnt);
+       cntrs->link_error_recovery_counter =
+               ipath_snap_cntr(dd, crp->cr_iblinkerrrecovcnt);
+       /*
+        * The link downed counter counts when the other side downs the
+        * connection.  We add in the number of times we downed the link
+        * due to local link integrity errors to compensate.
+        */
+       cntrs->link_downed_counter =
+               ipath_snap_cntr(dd, crp->cr_iblinkdowncnt);
+       cntrs->port_rcv_errors =
+               ipath_snap_cntr(dd, crp->cr_rxdroppktcnt) +
+               ipath_snap_cntr(dd, crp->cr_rcvovflcnt) +
+               ipath_snap_cntr(dd, crp->cr_portovflcnt) +
+               ipath_snap_cntr(dd, crp->cr_err_rlencnt) +
+               ipath_snap_cntr(dd, crp->cr_invalidrlencnt) +
+               ipath_snap_cntr(dd, crp->cr_errlinkcnt) +
+               ipath_snap_cntr(dd, crp->cr_erricrccnt) +
+               ipath_snap_cntr(dd, crp->cr_errvcrccnt) +
+               ipath_snap_cntr(dd, crp->cr_errlpcrccnt) +
+               ipath_snap_cntr(dd, crp->cr_badformatcnt) +
+               dd->ipath_rxfc_unsupvl_errs;
+       if (crp->cr_rxotherlocalphyerrcnt)
+               cntrs->port_rcv_errors +=
+                       ipath_snap_cntr(dd, crp->cr_rxotherlocalphyerrcnt);
+       if (crp->cr_rxvlerrcnt)
+               cntrs->port_rcv_errors +=
+                       ipath_snap_cntr(dd, crp->cr_rxvlerrcnt);
+       cntrs->port_rcv_remphys_errors =
+               ipath_snap_cntr(dd, crp->cr_rcvebpcnt);
+       cntrs->port_xmit_discards = ipath_snap_cntr(dd, crp->cr_unsupvlcnt);
+       cntrs->port_xmit_data = ipath_snap_cntr(dd, crp->cr_wordsendcnt);
+       cntrs->port_rcv_data = ipath_snap_cntr(dd, crp->cr_wordrcvcnt);
+       cntrs->port_xmit_packets = ipath_snap_cntr(dd, crp->cr_pktsendcnt);
+       cntrs->port_rcv_packets = ipath_snap_cntr(dd, crp->cr_pktrcvcnt);
+       cntrs->local_link_integrity_errors =
+               crp->cr_locallinkintegrityerrcnt ?
+               ipath_snap_cntr(dd, crp->cr_locallinkintegrityerrcnt) :
+               ((dd->ipath_flags & IPATH_GPIO_ERRINTRS) ?
+                dd->ipath_lli_errs : dd->ipath_lli_errors);
+       cntrs->excessive_buffer_overrun_errors =
+               crp->cr_excessbufferovflcnt ?
+               ipath_snap_cntr(dd, crp->cr_excessbufferovflcnt) :
+               dd->ipath_overrun_thresh_errs;
+       cntrs->vl15_dropped = crp->cr_vl15droppedpktcnt ?
+               ipath_snap_cntr(dd, crp->cr_vl15droppedpktcnt) : 0;
+
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+/**
+ * ipath_ib_piobufavail - callback when a PIO buffer is available
+ * @dev: the device pointer
+ *
+ * This is called from ipath_intr() at interrupt level when a PIO buffer is
+ * available after ipath_verbs_send() returned an error that no buffers were
+ * available.  Return 1 if we consumed all the PIO buffers and we still have
+ * QPs waiting for buffers (for now, just restart the send tasklet and
+ * return zero).
+ */
+int ipath_ib_piobufavail(struct ipath_ibdev *dev)
+{
+       struct list_head *list;
+       struct ipath_qp *qplist;
+       struct ipath_qp *qp;
+       unsigned long flags;
+
+       if (dev == NULL)
+               goto bail;
+
+       list = &dev->piowait;
+       qplist = NULL;
+
+       spin_lock_irqsave(&dev->pending_lock, flags);
+       while (!list_empty(list)) {
+               qp = list_entry(list->next, struct ipath_qp, piowait);
+               list_del_init(&qp->piowait);
+               qp->pio_next = qplist;
+               qplist = qp;
+               atomic_inc(&qp->refcount);
+       }
+       spin_unlock_irqrestore(&dev->pending_lock, flags);
+
+       while (qplist != NULL) {
+               qp = qplist;
+               qplist = qp->pio_next;
+
+               spin_lock_irqsave(&qp->s_lock, flags);
+               if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)
+                       ipath_schedule_send(qp);
+               spin_unlock_irqrestore(&qp->s_lock, flags);
+
+               /* Notify ipath_destroy_qp() if it is waiting. */
+               if (atomic_dec_and_test(&qp->refcount))
+                       wake_up(&qp->wait);
+       }
+
+bail:
+       return 0;
+}
+
+static int ipath_query_device(struct ib_device *ibdev, struct ib_device_attr *props,
+                             struct ib_udata *uhw)
+{
+       struct ipath_ibdev *dev = to_idev(ibdev);
+
+       if (uhw->inlen || uhw->outlen)
+               return -EINVAL;
+
+       memset(props, 0, sizeof(*props));
+
+       props->device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR |
+               IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT |
+               IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN |
+               IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE;
+       props->page_size_cap = PAGE_SIZE;
+       props->vendor_id =
+               IPATH_SRC_OUI_1 << 16 | IPATH_SRC_OUI_2 << 8 | IPATH_SRC_OUI_3;
+       props->vendor_part_id = dev->dd->ipath_deviceid;
+       props->hw_ver = dev->dd->ipath_pcirev;
+
+       props->sys_image_guid = dev->sys_image_guid;
+
+       props->max_mr_size = ~0ull;
+       props->max_qp = ib_ipath_max_qps;
+       props->max_qp_wr = ib_ipath_max_qp_wrs;
+       props->max_sge = ib_ipath_max_sges;
+       props->max_sge_rd = ib_ipath_max_sges;
+       props->max_cq = ib_ipath_max_cqs;
+       props->max_ah = ib_ipath_max_ahs;
+       props->max_cqe = ib_ipath_max_cqes;
+       props->max_mr = dev->lk_table.max;
+       props->max_fmr = dev->lk_table.max;
+       props->max_map_per_fmr = 32767;
+       props->max_pd = ib_ipath_max_pds;
+       props->max_qp_rd_atom = IPATH_MAX_RDMA_ATOMIC;
+       props->max_qp_init_rd_atom = 255;
+       /* props->max_res_rd_atom */
+       props->max_srq = ib_ipath_max_srqs;
+       props->max_srq_wr = ib_ipath_max_srq_wrs;
+       props->max_srq_sge = ib_ipath_max_srq_sges;
+       /* props->local_ca_ack_delay */
+       props->atomic_cap = IB_ATOMIC_GLOB;
+       props->max_pkeys = ipath_get_npkeys(dev->dd);
+       props->max_mcast_grp = ib_ipath_max_mcast_grps;
+       props->max_mcast_qp_attach = ib_ipath_max_mcast_qp_attached;
+       props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
+               props->max_mcast_grp;
+
+       return 0;
+}
+
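+/*
+ * Map the chip's link training state (IBCS LT state) to the IB physical
+ * port state values reported by ipath_query_port().
+ */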
+const u8 ipath_cvt_physportstate[32] = {
+       [INFINIPATH_IBCS_LT_STATE_DISABLED] = IB_PHYSPORTSTATE_DISABLED,
+       [INFINIPATH_IBCS_LT_STATE_LINKUP] = IB_PHYSPORTSTATE_LINKUP,
+       [INFINIPATH_IBCS_LT_STATE_POLLACTIVE] = IB_PHYSPORTSTATE_POLL,
+       [INFINIPATH_IBCS_LT_STATE_POLLQUIET] = IB_PHYSPORTSTATE_POLL,
+       [INFINIPATH_IBCS_LT_STATE_SLEEPDELAY] = IB_PHYSPORTSTATE_SLEEP,
+       [INFINIPATH_IBCS_LT_STATE_SLEEPQUIET] = IB_PHYSPORTSTATE_SLEEP,
+       [INFINIPATH_IBCS_LT_STATE_CFGDEBOUNCE] =
+               IB_PHYSPORTSTATE_CFG_TRAIN,
+       [INFINIPATH_IBCS_LT_STATE_CFGRCVFCFG] =
+               IB_PHYSPORTSTATE_CFG_TRAIN,
+       [INFINIPATH_IBCS_LT_STATE_CFGWAITRMT] =
+               IB_PHYSPORTSTATE_CFG_TRAIN,
+       [INFINIPATH_IBCS_LT_STATE_CFGIDLE] = IB_PHYSPORTSTATE_CFG_TRAIN,
+       [INFINIPATH_IBCS_LT_STATE_RECOVERRETRAIN] =
+               IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
+       [INFINIPATH_IBCS_LT_STATE_RECOVERWAITRMT] =
+               IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
+       [INFINIPATH_IBCS_LT_STATE_RECOVERIDLE] =
+               IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
+       [0x10] = IB_PHYSPORTSTATE_CFG_TRAIN,
+       [0x11] = IB_PHYSPORTSTATE_CFG_TRAIN,
+       [0x12] = IB_PHYSPORTSTATE_CFG_TRAIN,
+       [0x13] = IB_PHYSPORTSTATE_CFG_TRAIN,
+       [0x14] = IB_PHYSPORTSTATE_CFG_TRAIN,
+       [0x15] = IB_PHYSPORTSTATE_CFG_TRAIN,
+       [0x16] = IB_PHYSPORTSTATE_CFG_TRAIN,
+       [0x17] = IB_PHYSPORTSTATE_CFG_TRAIN
+};
+
+u32 ipath_get_cr_errpkey(struct ipath_devdata *dd)
+{
+       return ipath_read_creg32(dd, dd->ipath_cregs->cr_errpkey);
+}
+
+static int ipath_query_port(struct ib_device *ibdev,
+                           u8 port, struct ib_port_attr *props)
+{
+       struct ipath_ibdev *dev = to_idev(ibdev);
+       struct ipath_devdata *dd = dev->dd;
+       enum ib_mtu mtu;
+       u16 lid = dd->ipath_lid;
+       u64 ibcstat;
+
+       memset(props, 0, sizeof(*props));
+       props->lid = lid ? lid : be16_to_cpu(IB_LID_PERMISSIVE);
+       props->lmc = dd->ipath_lmc;
+       props->sm_lid = dev->sm_lid;
+       props->sm_sl = dev->sm_sl;
+       ibcstat = dd->ipath_lastibcstat;
+       /* map LinkState to IB portinfo values.  */
+       props->state = ipath_ib_linkstate(dd, ibcstat) + 1;
+
+       /* See phys_state_show() */
+       props->phys_state = /* MEA: assumes shift == 0 */
+               ipath_cvt_physportstate[dd->ipath_lastibcstat &
+               dd->ibcs_lts_mask];
+       props->port_cap_flags = dev->port_cap_flags;
+       props->gid_tbl_len = 1;
+       props->max_msg_sz = 0x80000000;
+       props->pkey_tbl_len = ipath_get_npkeys(dd);
+       props->bad_pkey_cntr = ipath_get_cr_errpkey(dd) -
+               dev->z_pkey_violations;
+       props->qkey_viol_cntr = dev->qkey_violations;
+       props->active_width = dd->ipath_link_width_active;
+       /* See rate_show() */
+       props->active_speed = dd->ipath_link_speed_active;
+       props->max_vl_num = 1;          /* VLCap = VL0 */
+       props->init_type_reply = 0;
+
+       props->max_mtu = ipath_mtu4096 ? IB_MTU_4096 : IB_MTU_2048;
+       switch (dd->ipath_ibmtu) {
+       case 4096:
+               mtu = IB_MTU_4096;
+               break;
+       case 2048:
+               mtu = IB_MTU_2048;
+               break;
+       case 1024:
+               mtu = IB_MTU_1024;
+               break;
+       case 512:
+               mtu = IB_MTU_512;
+               break;
+       case 256:
+               mtu = IB_MTU_256;
+               break;
+       default:
+               mtu = IB_MTU_2048;
+       }
+       props->active_mtu = mtu;
+       props->subnet_timeout = dev->subnet_timeout;
+
+       return 0;
+}
+
+static int ipath_modify_device(struct ib_device *device,
+                              int device_modify_mask,
+                              struct ib_device_modify *device_modify)
+{
+       int ret;
+
+       if (device_modify_mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID |
+                                  IB_DEVICE_MODIFY_NODE_DESC)) {
+               ret = -EOPNOTSUPP;
+               goto bail;
+       }
+
+       if (device_modify_mask & IB_DEVICE_MODIFY_NODE_DESC)
+               memcpy(device->node_desc, device_modify->node_desc, 64);
+
+       if (device_modify_mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID)
+               to_idev(device)->sys_image_guid =
+                       cpu_to_be64(device_modify->sys_image_guid);
+
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+static int ipath_modify_port(struct ib_device *ibdev,
+                            u8 port, int port_modify_mask,
+                            struct ib_port_modify *props)
+{
+       struct ipath_ibdev *dev = to_idev(ibdev);
+
+       dev->port_cap_flags |= props->set_port_cap_mask;
+       dev->port_cap_flags &= ~props->clr_port_cap_mask;
+       if (port_modify_mask & IB_PORT_SHUTDOWN)
+               ipath_set_linkstate(dev->dd, IPATH_IB_LINKDOWN);
+       if (port_modify_mask & IB_PORT_RESET_QKEY_CNTR)
+               dev->qkey_violations = 0;
+       return 0;
+}
+
+static int ipath_query_gid(struct ib_device *ibdev, u8 port,
+                          int index, union ib_gid *gid)
+{
+       struct ipath_ibdev *dev = to_idev(ibdev);
+       int ret;
+
+       if (index >= 1) {
+               ret = -EINVAL;
+               goto bail;
+       }
+       gid->global.subnet_prefix = dev->gid_prefix;
+       gid->global.interface_id = dev->dd->ipath_guid;
+
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+static struct ib_pd *ipath_alloc_pd(struct ib_device *ibdev,
+                                   struct ib_ucontext *context,
+                                   struct ib_udata *udata)
+{
+       struct ipath_ibdev *dev = to_idev(ibdev);
+       struct ipath_pd *pd;
+       struct ib_pd *ret;
+
+       /*
+        * This is actually totally arbitrary.  Some correctness tests
+        * assume there's a maximum number of PDs that can be allocated.
+        * We don't actually have this limit, but we fail the test if
+        * we allow allocations of more than we report for this value.
+        */
+
+       pd = kmalloc(sizeof *pd, GFP_KERNEL);
+       if (!pd) {
+               ret = ERR_PTR(-ENOMEM);
+               goto bail;
+       }
+
+       spin_lock(&dev->n_pds_lock);
+       if (dev->n_pds_allocated == ib_ipath_max_pds) {
+               spin_unlock(&dev->n_pds_lock);
+               kfree(pd);
+               ret = ERR_PTR(-ENOMEM);
+               goto bail;
+       }
+
+       dev->n_pds_allocated++;
+       spin_unlock(&dev->n_pds_lock);
+
+       /* ib_alloc_pd() will initialize pd->ibpd. */
+       pd->user = udata != NULL;
+
+       ret = &pd->ibpd;
+
+bail:
+       return ret;
+}
+
+static int ipath_dealloc_pd(struct ib_pd *ibpd)
+{
+       struct ipath_pd *pd = to_ipd(ibpd);
+       struct ipath_ibdev *dev = to_idev(ibpd->device);
+
+       spin_lock(&dev->n_pds_lock);
+       dev->n_pds_allocated--;
+       spin_unlock(&dev->n_pds_lock);
+
+       kfree(pd);
+
+       return 0;
+}
+
+/**
+ * ipath_create_ah - create an address handle
+ * @pd: the protection domain
+ * @ah_attr: the attributes of the AH
+ *
+ * This may be called from interrupt context.
+ */
+static struct ib_ah *ipath_create_ah(struct ib_pd *pd,
+                                    struct ib_ah_attr *ah_attr)
+{
+       struct ipath_ah *ah;
+       struct ib_ah *ret;
+       struct ipath_ibdev *dev = to_idev(pd->device);
+       unsigned long flags;
+
+       /* A multicast address requires a GRH (see ch. 8.4.1). */
+       if (ah_attr->dlid >= IPATH_MULTICAST_LID_BASE &&
+           ah_attr->dlid != IPATH_PERMISSIVE_LID &&
+           !(ah_attr->ah_flags & IB_AH_GRH)) {
+               ret = ERR_PTR(-EINVAL);
+               goto bail;
+       }
+
+       if (ah_attr->dlid == 0) {
+               ret = ERR_PTR(-EINVAL);
+               goto bail;
+       }
+
+       if (ah_attr->port_num < 1 ||
+           ah_attr->port_num > pd->device->phys_port_cnt) {
+               ret = ERR_PTR(-EINVAL);
+               goto bail;
+       }
+
+       ah = kmalloc(sizeof *ah, GFP_ATOMIC);
+       if (!ah) {
+               ret = ERR_PTR(-ENOMEM);
+               goto bail;
+       }
+
+       spin_lock_irqsave(&dev->n_ahs_lock, flags);
+       if (dev->n_ahs_allocated == ib_ipath_max_ahs) {
+               spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
+               kfree(ah);
+               ret = ERR_PTR(-ENOMEM);
+               goto bail;
+       }
+
+       dev->n_ahs_allocated++;
+       spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
+
+       /* ib_create_ah() will initialize ah->ibah. */
+       ah->attr = *ah_attr;
+       ah->attr.static_rate = ipath_ib_rate_to_mult(ah_attr->static_rate);
+
+       ret = &ah->ibah;
+
+bail:
+       return ret;
+}
+
+/**
+ * ipath_destroy_ah - destroy an address handle
+ * @ibah: the AH to destroy
+ *
+ * This may be called from interrupt context.
+ */
+static int ipath_destroy_ah(struct ib_ah *ibah)
+{
+       struct ipath_ibdev *dev = to_idev(ibah->device);
+       struct ipath_ah *ah = to_iah(ibah);
+       unsigned long flags;
+
+       spin_lock_irqsave(&dev->n_ahs_lock, flags);
+       dev->n_ahs_allocated--;
+       spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
+
+       kfree(ah);
+
+       return 0;
+}
+
+static int ipath_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr)
+{
+       struct ipath_ah *ah = to_iah(ibah);
+
+       *ah_attr = ah->attr;
+       ah_attr->static_rate = ipath_mult_to_ib_rate(ah->attr.static_rate);
+
+       return 0;
+}
+
+/**
+ * ipath_get_npkeys - return the size of the PKEY table for port 0
+ * @dd: the infinipath device
+ */
+unsigned ipath_get_npkeys(struct ipath_devdata *dd)
+{
+       return ARRAY_SIZE(dd->ipath_pd[0]->port_pkeys);
+}
+
+/**
+ * ipath_get_pkey - return the indexed PKEY from the port PKEY table
+ * @dd: the infinipath device
+ * @index: the PKEY index
+ */
+unsigned ipath_get_pkey(struct ipath_devdata *dd, unsigned index)
+{
+       unsigned ret;
+
+       /* always a kernel port, no locking needed */
+       if (index >= ARRAY_SIZE(dd->ipath_pd[0]->port_pkeys))
+               ret = 0;
+       else
+               ret = dd->ipath_pd[0]->port_pkeys[index];
+
+       return ret;
+}
+
+static int ipath_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+                           u16 *pkey)
+{
+       struct ipath_ibdev *dev = to_idev(ibdev);
+       int ret;
+
+       if (index >= ipath_get_npkeys(dev->dd)) {
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       *pkey = ipath_get_pkey(dev->dd, index);
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+/**
+ * ipath_alloc_ucontext - allocate a ucontext
+ * @ibdev: the infiniband device
+ * @udata: not used by the InfiniPath driver
+ */
+
+static struct ib_ucontext *ipath_alloc_ucontext(struct ib_device *ibdev,
+                                               struct ib_udata *udata)
+{
+       struct ipath_ucontext *context;
+       struct ib_ucontext *ret;
+
+       context = kmalloc(sizeof *context, GFP_KERNEL);
+       if (!context) {
+               ret = ERR_PTR(-ENOMEM);
+               goto bail;
+       }
+
+       ret = &context->ibucontext;
+
+bail:
+       return ret;
+}
+
+static int ipath_dealloc_ucontext(struct ib_ucontext *context)
+{
+       kfree(to_iucontext(context));
+       return 0;
+}
+
+static int ipath_verbs_register_sysfs(struct ib_device *dev);
+
+static void __verbs_timer(unsigned long arg)
+{
+       struct ipath_devdata *dd = (struct ipath_devdata *) arg;
+
+       /* Handle verbs layer timeouts. */
+       ipath_ib_timer(dd->verbs_dev);
+
+       mod_timer(&dd->verbs_timer, jiffies + 1);
+}
+
+static int enable_timer(struct ipath_devdata *dd)
+{
+       /*
+        * Early chips had a design flaw where the chip's and the kernel's
+        * idea of the tail register don't always agree, so we won't get an
+        * interrupt on the next packet received.
+        * If the board supports per packet receive interrupts, use it.
+        * Otherwise, the timer function periodically checks for packets
+        * to cover this case.
+        * Either way, the timer is needed for verbs layer related
+        * processing.
+        */
+       if (dd->ipath_flags & IPATH_GPIO_INTR) {
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_debugportselect,
+                                0x2074076542310ULL);
+               /* Enable GPIO bit 2 interrupt */
+               dd->ipath_gpio_mask |= (u64) (1 << IPATH_GPIO_PORT0_BIT);
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask,
+                                dd->ipath_gpio_mask);
+       }
+
+       init_timer(&dd->verbs_timer);
+       dd->verbs_timer.function = __verbs_timer;
+       dd->verbs_timer.data = (unsigned long)dd;
+       dd->verbs_timer.expires = jiffies + 1;
+       add_timer(&dd->verbs_timer);
+
+       return 0;
+}
+
+static int disable_timer(struct ipath_devdata *dd)
+{
+       /* Disable GPIO bit 2 interrupt */
+       if (dd->ipath_flags & IPATH_GPIO_INTR) {
+               dd->ipath_gpio_mask &= ~((u64) (1 << IPATH_GPIO_PORT0_BIT));
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask,
+                                dd->ipath_gpio_mask);
+               /*
+                * We might want to undo changes to debugportselect,
+                * but how?
+                */
+       }
+
+       del_timer_sync(&dd->verbs_timer);
+
+       return 0;
+}
+
+static int ipath_port_immutable(struct ib_device *ibdev, u8 port_num,
+                               struct ib_port_immutable *immutable)
+{
+       struct ib_port_attr attr;
+       int err;
+
+       err = ipath_query_port(ibdev, port_num, &attr);
+       if (err)
+               return err;
+
+       immutable->pkey_tbl_len = attr.pkey_tbl_len;
+       immutable->gid_tbl_len = attr.gid_tbl_len;
+       immutable->core_cap_flags = RDMA_CORE_PORT_IBA_IB;
+       immutable->max_mad_size = IB_MGMT_MAD_SIZE;
+
+       return 0;
+}
+
+/**
+ * ipath_register_ib_device - register our device with the infiniband core
+ * @dd: the device data structure
+ *
+ * Returns 0 on success or a negative errno on failure; the allocated
+ * ipath_ibdev is stored in dd->verbs_dev.
+ */
+int ipath_register_ib_device(struct ipath_devdata *dd)
+{
+       struct ipath_verbs_counters cntrs;
+       struct ipath_ibdev *idev;
+       struct ib_device *dev;
+       struct ipath_verbs_txreq *tx;
+       unsigned i;
+       int ret;
+
+       idev = (struct ipath_ibdev *)ib_alloc_device(sizeof *idev);
+       if (idev == NULL) {
+               ret = -ENOMEM;
+               goto bail;
+       }
+
+       dev = &idev->ibdev;
+
+       if (dd->ipath_sdma_descq_cnt) {
+               tx = kmalloc(dd->ipath_sdma_descq_cnt * sizeof *tx,
+                            GFP_KERNEL);
+               if (tx == NULL) {
+                       ret = -ENOMEM;
+                       goto err_tx;
+               }
+       } else
+               tx = NULL;
+       idev->txreq_bufs = tx;
+
+       /* Only need to initialize non-zero fields. */
+       spin_lock_init(&idev->n_pds_lock);
+       spin_lock_init(&idev->n_ahs_lock);
+       spin_lock_init(&idev->n_cqs_lock);
+       spin_lock_init(&idev->n_qps_lock);
+       spin_lock_init(&idev->n_srqs_lock);
+       spin_lock_init(&idev->n_mcast_grps_lock);
+
+       spin_lock_init(&idev->qp_table.lock);
+       spin_lock_init(&idev->lk_table.lock);
+       idev->sm_lid = be16_to_cpu(IB_LID_PERMISSIVE);
+       /* Set the prefix to the default value (see ch. 4.1.1) */
+       idev->gid_prefix = cpu_to_be64(0xfe80000000000000ULL);
+
+       ret = ipath_init_qp_table(idev, ib_ipath_qp_table_size);
+       if (ret)
+               goto err_qp;
+
+       /*
+        * The top ib_ipath_lkey_table_size bits are used to index the
+        * table.  The lower 8 bits can be owned by the user (copied from
+        * the LKEY).  The remaining bits act as a generation number or tag.
+        */
+       idev->lk_table.max = 1 << ib_ipath_lkey_table_size;
+       idev->lk_table.table = kzalloc(idev->lk_table.max *
+                                      sizeof(*idev->lk_table.table),
+                                      GFP_KERNEL);
+       if (idev->lk_table.table == NULL) {
+               ret = -ENOMEM;
+               goto err_lk;
+       }
+       INIT_LIST_HEAD(&idev->pending_mmaps);
+       spin_lock_init(&idev->pending_lock);
+       idev->mmap_offset = PAGE_SIZE;
+       spin_lock_init(&idev->mmap_offset_lock);
+       INIT_LIST_HEAD(&idev->pending[0]);
+       INIT_LIST_HEAD(&idev->pending[1]);
+       INIT_LIST_HEAD(&idev->pending[2]);
+       INIT_LIST_HEAD(&idev->piowait);
+       INIT_LIST_HEAD(&idev->rnrwait);
+       INIT_LIST_HEAD(&idev->txreq_free);
+       idev->pending_index = 0;
+       idev->port_cap_flags =
+               IB_PORT_SYS_IMAGE_GUID_SUP | IB_PORT_CLIENT_REG_SUP;
+       if (dd->ipath_flags & IPATH_HAS_LINK_LATENCY)
+               idev->port_cap_flags |= IB_PORT_LINK_LATENCY_SUP;
+       idev->pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA;
+       idev->pma_counter_select[1] = IB_PMA_PORT_RCV_DATA;
+       idev->pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS;
+       idev->pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS;
+       idev->pma_counter_select[4] = IB_PMA_PORT_XMIT_WAIT;
+
+       /* Snapshot current HW counters to "clear" them. */
+       ipath_get_counters(dd, &cntrs);
+       idev->z_symbol_error_counter = cntrs.symbol_error_counter;
+       idev->z_link_error_recovery_counter =
+               cntrs.link_error_recovery_counter;
+       idev->z_link_downed_counter = cntrs.link_downed_counter;
+       idev->z_port_rcv_errors = cntrs.port_rcv_errors;
+       idev->z_port_rcv_remphys_errors =
+               cntrs.port_rcv_remphys_errors;
+       idev->z_port_xmit_discards = cntrs.port_xmit_discards;
+       idev->z_port_xmit_data = cntrs.port_xmit_data;
+       idev->z_port_rcv_data = cntrs.port_rcv_data;
+       idev->z_port_xmit_packets = cntrs.port_xmit_packets;
+       idev->z_port_rcv_packets = cntrs.port_rcv_packets;
+       idev->z_local_link_integrity_errors =
+               cntrs.local_link_integrity_errors;
+       idev->z_excessive_buffer_overrun_errors =
+               cntrs.excessive_buffer_overrun_errors;
+       idev->z_vl15_dropped = cntrs.vl15_dropped;
+
+       for (i = 0; i < dd->ipath_sdma_descq_cnt; i++, tx++)
+               list_add(&tx->txreq.list, &idev->txreq_free);
+
+       /*
+        * The system image GUID is supposed to be the same for all
+        * IB HCAs in a single system but since there can be other
+        * device types in the system, we can't be sure this is unique.
+        */
+       if (!sys_image_guid)
+               sys_image_guid = dd->ipath_guid;
+       idev->sys_image_guid = sys_image_guid;
+       idev->ib_unit = dd->ipath_unit;
+       idev->dd = dd;
+
+       strlcpy(dev->name, "ipath%d", IB_DEVICE_NAME_MAX);
+       dev->owner = THIS_MODULE;
+       dev->node_guid = dd->ipath_guid;
+       dev->uverbs_abi_ver = IPATH_UVERBS_ABI_VERSION;
+       dev->uverbs_cmd_mask =
+               (1ull << IB_USER_VERBS_CMD_GET_CONTEXT)         |
+               (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)        |
+               (1ull << IB_USER_VERBS_CMD_QUERY_PORT)          |
+               (1ull << IB_USER_VERBS_CMD_ALLOC_PD)            |
+               (1ull << IB_USER_VERBS_CMD_DEALLOC_PD)          |
+               (1ull << IB_USER_VERBS_CMD_CREATE_AH)           |
+               (1ull << IB_USER_VERBS_CMD_DESTROY_AH)          |
+               (1ull << IB_USER_VERBS_CMD_QUERY_AH)            |
+               (1ull << IB_USER_VERBS_CMD_REG_MR)              |
+               (1ull << IB_USER_VERBS_CMD_DEREG_MR)            |
+               (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+               (1ull << IB_USER_VERBS_CMD_CREATE_CQ)           |
+               (1ull << IB_USER_VERBS_CMD_RESIZE_CQ)           |
+               (1ull << IB_USER_VERBS_CMD_DESTROY_CQ)          |
+               (1ull << IB_USER_VERBS_CMD_POLL_CQ)             |
+               (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ)       |
+               (1ull << IB_USER_VERBS_CMD_CREATE_QP)           |
+               (1ull << IB_USER_VERBS_CMD_QUERY_QP)            |
+               (1ull << IB_USER_VERBS_CMD_MODIFY_QP)           |
+               (1ull << IB_USER_VERBS_CMD_DESTROY_QP)          |
+               (1ull << IB_USER_VERBS_CMD_POST_SEND)           |
+               (1ull << IB_USER_VERBS_CMD_POST_RECV)           |
+               (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST)        |
+               (1ull << IB_USER_VERBS_CMD_DETACH_MCAST)        |
+               (1ull << IB_USER_VERBS_CMD_CREATE_SRQ)          |
+               (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ)          |
+               (1ull << IB_USER_VERBS_CMD_QUERY_SRQ)           |
+               (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ)         |
+               (1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV);
+       dev->node_type = RDMA_NODE_IB_CA;
+       dev->phys_port_cnt = 1;
+       dev->num_comp_vectors = 1;
+       dev->dma_device = &dd->pcidev->dev;
+       dev->query_device = ipath_query_device;
+       dev->modify_device = ipath_modify_device;
+       dev->query_port = ipath_query_port;
+       dev->modify_port = ipath_modify_port;
+       dev->query_pkey = ipath_query_pkey;
+       dev->query_gid = ipath_query_gid;
+       dev->alloc_ucontext = ipath_alloc_ucontext;
+       dev->dealloc_ucontext = ipath_dealloc_ucontext;
+       dev->alloc_pd = ipath_alloc_pd;
+       dev->dealloc_pd = ipath_dealloc_pd;
+       dev->create_ah = ipath_create_ah;
+       dev->destroy_ah = ipath_destroy_ah;
+       dev->query_ah = ipath_query_ah;
+       dev->create_srq = ipath_create_srq;
+       dev->modify_srq = ipath_modify_srq;
+       dev->query_srq = ipath_query_srq;
+       dev->destroy_srq = ipath_destroy_srq;
+       dev->create_qp = ipath_create_qp;
+       dev->modify_qp = ipath_modify_qp;
+       dev->query_qp = ipath_query_qp;
+       dev->destroy_qp = ipath_destroy_qp;
+       dev->post_send = ipath_post_send;
+       dev->post_recv = ipath_post_receive;
+       dev->post_srq_recv = ipath_post_srq_receive;
+       dev->create_cq = ipath_create_cq;
+       dev->destroy_cq = ipath_destroy_cq;
+       dev->resize_cq = ipath_resize_cq;
+       dev->poll_cq = ipath_poll_cq;
+       dev->req_notify_cq = ipath_req_notify_cq;
+       dev->get_dma_mr = ipath_get_dma_mr;
+       dev->reg_phys_mr = ipath_reg_phys_mr;
+       dev->reg_user_mr = ipath_reg_user_mr;
+       dev->dereg_mr = ipath_dereg_mr;
+       dev->alloc_fmr = ipath_alloc_fmr;
+       dev->map_phys_fmr = ipath_map_phys_fmr;
+       dev->unmap_fmr = ipath_unmap_fmr;
+       dev->dealloc_fmr = ipath_dealloc_fmr;
+       dev->attach_mcast = ipath_multicast_attach;
+       dev->detach_mcast = ipath_multicast_detach;
+       dev->process_mad = ipath_process_mad;
+       dev->mmap = ipath_mmap;
+       dev->dma_ops = &ipath_dma_mapping_ops;
+       dev->get_port_immutable = ipath_port_immutable;
+
+       snprintf(dev->node_desc, sizeof(dev->node_desc),
+                IPATH_IDSTR " %s", init_utsname()->nodename);
+
+       ret = ib_register_device(dev, NULL);
+       if (ret)
+               goto err_reg;
+
+       ret = ipath_verbs_register_sysfs(dev);
+       if (ret)
+               goto err_class;
+
+       enable_timer(dd);
+
+       goto bail;
+
+err_class:
+       ib_unregister_device(dev);
+err_reg:
+       kfree(idev->lk_table.table);
+err_lk:
+       kfree(idev->qp_table.table);
+err_qp:
+       kfree(idev->txreq_bufs);
+err_tx:
+       ib_dealloc_device(dev);
+       ipath_dev_err(dd, "cannot register verbs: %d!\n", -ret);
+       idev = NULL;
+
+bail:
+       dd->verbs_dev = idev;
+       return ret;
+}
+
+void ipath_unregister_ib_device(struct ipath_ibdev *dev)
+{
+       struct ib_device *ibdev = &dev->ibdev;
+       u32 qps_inuse;
+
+       ib_unregister_device(ibdev);
+
+       disable_timer(dev->dd);
+
+       if (!list_empty(&dev->pending[0]) ||
+           !list_empty(&dev->pending[1]) ||
+           !list_empty(&dev->pending[2]))
+               ipath_dev_err(dev->dd, "pending list not empty!\n");
+       if (!list_empty(&dev->piowait))
+               ipath_dev_err(dev->dd, "piowait list not empty!\n");
+       if (!list_empty(&dev->rnrwait))
+               ipath_dev_err(dev->dd, "rnrwait list not empty!\n");
+       if (!ipath_mcast_tree_empty())
+               ipath_dev_err(dev->dd, "multicast table memory leak!\n");
+       /*
+        * Note that ipath_unregister_ib_device() can be called before all
+        * the QPs are destroyed!
+        */
+       qps_inuse = ipath_free_all_qps(&dev->qp_table);
+       if (qps_inuse)
+               ipath_dev_err(dev->dd, "QP memory leak! %u still in use\n",
+                       qps_inuse);
+       kfree(dev->qp_table.table);
+       kfree(dev->lk_table.table);
+       kfree(dev->txreq_bufs);
+       ib_dealloc_device(ibdev);
+}
+
+static ssize_t show_rev(struct device *device, struct device_attribute *attr,
+                       char *buf)
+{
+       struct ipath_ibdev *dev =
+               container_of(device, struct ipath_ibdev, ibdev.dev);
+
+       return sprintf(buf, "%x\n", dev->dd->ipath_pcirev);
+}
+
+static ssize_t show_hca(struct device *device, struct device_attribute *attr,
+                       char *buf)
+{
+       struct ipath_ibdev *dev =
+               container_of(device, struct ipath_ibdev, ibdev.dev);
+       int ret;
+
+       ret = dev->dd->ipath_f_get_boardname(dev->dd, buf, 128);
+       if (ret < 0)
+               goto bail;
+       strcat(buf, "\n");
+       ret = strlen(buf);
+
+bail:
+       return ret;
+}
+
+static ssize_t show_stats(struct device *device, struct device_attribute *attr,
+                         char *buf)
+{
+       struct ipath_ibdev *dev =
+               container_of(device, struct ipath_ibdev, ibdev.dev);
+       int i;
+       int len;
+
+       len = sprintf(buf,
+                     "RC resends  %d\n"
+                     "RC no QACK  %d\n"
+                     "RC ACKs     %d\n"
+                     "RC SEQ NAKs %d\n"
+                     "RC RDMA seq %d\n"
+                     "RC RNR NAKs %d\n"
+                     "RC OTH NAKs %d\n"
+                     "RC timeouts %d\n"
+                     "RC RDMA dup %d\n"
+                     "piobuf wait %d\n"
+                     "unaligned   %d\n"
+                     "PKT drops   %d\n"
+                     "WQE errs    %d\n",
+                     dev->n_rc_resends, dev->n_rc_qacks, dev->n_rc_acks,
+                     dev->n_seq_naks, dev->n_rdma_seq, dev->n_rnr_naks,
+                     dev->n_other_naks, dev->n_timeouts,
+                     dev->n_rdma_dup_busy, dev->n_piowait, dev->n_unaligned,
+                     dev->n_pkt_drops, dev->n_wqe_errs);
+       for (i = 0; i < ARRAY_SIZE(dev->opstats); i++) {
+               const struct ipath_opcode_stats *si = &dev->opstats[i];
+
+               if (!si->n_packets && !si->n_bytes)
+                       continue;
+               len += sprintf(buf + len, "%02x %llu/%llu\n", i,
+                              (unsigned long long) si->n_packets,
+                              (unsigned long long) si->n_bytes);
+       }
+       return len;
+}
+
+static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
+static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
+static DEVICE_ATTR(board_id, S_IRUGO, show_hca, NULL);
+static DEVICE_ATTR(stats, S_IRUGO, show_stats, NULL);
+
+static struct device_attribute *ipath_class_attributes[] = {
+       &dev_attr_hw_rev,
+       &dev_attr_hca_type,
+       &dev_attr_board_id,
+       &dev_attr_stats
+};
+
+static int ipath_verbs_register_sysfs(struct ib_device *dev)
+{
+       int i;
+       int ret;
+
+       for (i = 0; i < ARRAY_SIZE(ipath_class_attributes); ++i) {
+               ret = device_create_file(&dev->dev,
+                                      ipath_class_attributes[i]);
+               if (ret)
+                       goto bail;
+       }
+       return 0;
+bail:
+       for (i = 0; i < ARRAY_SIZE(ipath_class_attributes); ++i)
+               device_remove_file(&dev->dev, ipath_class_attributes[i]);
+       return ret;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_verbs.h b/drivers/staging/rdma/ipath/ipath_verbs.h
new file mode 100644 (file)
index 0000000..ec167e5
--- /dev/null
@@ -0,0 +1,939 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef IPATH_VERBS_H
+#define IPATH_VERBS_H
+
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/kref.h>
+#include <rdma/ib_pack.h>
+#include <rdma/ib_user_verbs.h>
+
+#include "ipath_kernel.h"
+
+#define IPATH_MAX_RDMA_ATOMIC  4
+
+#define QPN_MAX                 (1 << 24)
+#define QPNMAP_ENTRIES          (QPN_MAX / PAGE_SIZE / BITS_PER_BYTE)
+
+/*
+ * Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+#define IPATH_UVERBS_ABI_VERSION       2
+
+/*
+ * Define an ib_cq_notify value that is not valid so we know when CQ
+ * notifications are armed.
+ */
+#define IB_CQ_NONE     (IB_CQ_NEXT_COMP + 1)
+
+/* AETH NAK opcode values */
+#define IB_RNR_NAK                     0x20
+#define IB_NAK_PSN_ERROR               0x60
+#define IB_NAK_INVALID_REQUEST         0x61
+#define IB_NAK_REMOTE_ACCESS_ERROR     0x62
+#define IB_NAK_REMOTE_OPERATIONAL_ERROR 0x63
+#define IB_NAK_INVALID_RD_REQUEST      0x64
+
+/* Flags for checking QP state (see ib_ipath_state_ops[]) */
+#define IPATH_POST_SEND_OK             0x01
+#define IPATH_POST_RECV_OK             0x02
+#define IPATH_PROCESS_RECV_OK          0x04
+#define IPATH_PROCESS_SEND_OK          0x08
+#define IPATH_PROCESS_NEXT_SEND_OK     0x10
+#define IPATH_FLUSH_SEND               0x20
+#define IPATH_FLUSH_RECV               0x40
+#define IPATH_PROCESS_OR_FLUSH_SEND \
+       (IPATH_PROCESS_SEND_OK | IPATH_FLUSH_SEND)
+
+/* IB Performance Manager status values */
+#define IB_PMA_SAMPLE_STATUS_DONE      0x00
+#define IB_PMA_SAMPLE_STATUS_STARTED   0x01
+#define IB_PMA_SAMPLE_STATUS_RUNNING   0x02
+
+/* Mandatory IB performance counter select values. */
+#define IB_PMA_PORT_XMIT_DATA  cpu_to_be16(0x0001)
+#define IB_PMA_PORT_RCV_DATA   cpu_to_be16(0x0002)
+#define IB_PMA_PORT_XMIT_PKTS  cpu_to_be16(0x0003)
+#define IB_PMA_PORT_RCV_PKTS   cpu_to_be16(0x0004)
+#define IB_PMA_PORT_XMIT_WAIT  cpu_to_be16(0x0005)
+
+struct ib_reth {
+       __be64 vaddr;
+       __be32 rkey;
+       __be32 length;
+} __attribute__ ((packed));
+
+struct ib_atomic_eth {
+       __be32 vaddr[2];        /* unaligned so access as 2 32-bit words */
+       __be32 rkey;
+       __be64 swap_data;
+       __be64 compare_data;
+} __attribute__ ((packed));
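The vaddr field above is deliberately split into two 32-bit words because the 64-bit value lands unaligned in the packet. A stand-alone sketch of how a receive path could reassemble it; get_atomic_vaddr is a hypothetical helper, ntohl() stands in for the kernel's be32_to_cpu(), and the high word is assumed to come first since the field is big-endian on the wire:

    #include <stdio.h>
    #include <stdint.h>
    #include <arpa/inet.h>          /* ntohl()/htonl() as stand-ins for be32 helpers */

    /* Rebuild the 64-bit virtual address from the two unaligned big-endian
     * words, the way a receive path would have to. */
    static uint64_t get_atomic_vaddr(const uint32_t vaddr_be[2])
    {
            return ((uint64_t)ntohl(vaddr_be[0]) << 32) | ntohl(vaddr_be[1]);
    }

    int main(void)
    {
            uint32_t wire[2] = { htonl(0x12345678u), htonl(0x9abcdef0u) };

            printf("0x%llx\n", (unsigned long long)get_atomic_vaddr(wire));
            return 0;
    }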
+
+struct ipath_other_headers {
+       __be32 bth[3];
+       union {
+               struct {
+                       __be32 deth[2];
+                       __be32 imm_data;
+               } ud;
+               struct {
+                       struct ib_reth reth;
+                       __be32 imm_data;
+               } rc;
+               struct {
+                       __be32 aeth;
+                       __be32 atomic_ack_eth[2];
+               } at;
+               __be32 imm_data;
+               __be32 aeth;
+               struct ib_atomic_eth atomic_eth;
+       } u;
+} __attribute__ ((packed));
+
+/*
+ * Note that UD packets with a GRH header are 8+40+12+8 = 68 bytes
+ * long (72 w/ imm_data).  Only the first 56 bytes of the IB header
+ * will be in the eager header buffer.  The remaining 12 or 16 bytes
+ * are in the data buffer.
+ */
+struct ipath_ib_header {
+       __be16 lrh[4];
+       union {
+               struct {
+                       struct ib_grh grh;
+                       struct ipath_other_headers oth;
+               } l;
+               struct ipath_other_headers oth;
+       } u;
+} __attribute__ ((packed));
+
+struct ipath_pio_header {
+       __le32 pbc[2];
+       struct ipath_ib_header hdr;
+} __attribute__ ((packed));
+
+/*
+ * There is one struct ipath_mcast for each multicast GID.
+ * All attached QPs are then stored as a list of
+ * struct ipath_mcast_qp.
+ */
+struct ipath_mcast_qp {
+       struct list_head list;
+       struct ipath_qp *qp;
+};
+
+struct ipath_mcast {
+       struct rb_node rb_node;
+       union ib_gid mgid;
+       struct list_head qp_list;
+       wait_queue_head_t wait;
+       atomic_t refcount;
+       int n_attached;
+};
+
+/* Protection domain */
+struct ipath_pd {
+       struct ib_pd ibpd;
+       int user;               /* non-zero if created from user space */
+};
+
+/* Address Handle */
+struct ipath_ah {
+       struct ib_ah ibah;
+       struct ib_ah_attr attr;
+};
+
+/*
+ * This structure is used by ipath_mmap() to validate an offset
+ * when an mmap() request is made.  The vm_area_struct then uses
+ * this as its vm_private_data.
+ */
+struct ipath_mmap_info {
+       struct list_head pending_mmaps;
+       struct ib_ucontext *context;
+       void *obj;
+       __u64 offset;
+       struct kref ref;
+       unsigned size;
+};
+
+/*
+ * This structure is used to contain the head pointer, tail pointer,
+ * and completion queue entries as a single memory allocation so
+ * it can be mmap'ed into user space.
+ */
+struct ipath_cq_wc {
+       u32 head;               /* index of next entry to fill */
+       u32 tail;               /* index of next ib_poll_cq() entry */
+       union {
+               /* these are actually size ibcq.cqe + 1 */
+               struct ib_uverbs_wc uqueue[0];
+               struct ib_wc kqueue[0];
+       };
+};
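The head and tail fields above describe a ring that user space sees via mmap(); per the comment, the arrays really hold ibcq.cqe + 1 slots so a full ring can be told apart from an empty one. A minimal stand-alone sketch of the consume side, with simplified stand-in types and the wrap point assumed at the last slot:

    #include <stdio.h>

    #define CQE 4                           /* ring holds CQE + 1 slots */

    struct wc { int id; };
    struct cq_ring {
            unsigned head;                  /* next entry the producer fills */
            unsigned tail;                  /* next entry the consumer reads */
            struct wc queue[CQE + 1];
    };

    /* Consume at most one completion; returns 1 if one was read. */
    static int poll_one(struct cq_ring *r, struct wc *out)
    {
            if (r->tail == r->head)
                    return 0;               /* ring is empty */
            *out = r->queue[r->tail];
            r->tail = (r->tail >= CQE) ? 0 : r->tail + 1;
            return 1;
    }

    int main(void)
    {
            struct cq_ring ring = { .head = 0, .tail = 0 };
            struct wc wc;
            int i;

            for (i = 0; i < 3; i++) {       /* produce three entries */
                    ring.queue[ring.head].id = i;
                    ring.head = (ring.head >= CQE) ? 0 : ring.head + 1;
            }
            while (poll_one(&ring, &wc))
                    printf("completion %d\n", wc.id);
            return 0;
    }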
+
+/*
+ * The completion queue structure.
+ */
+struct ipath_cq {
+       struct ib_cq ibcq;
+       struct tasklet_struct comptask;
+       spinlock_t lock;
+       u8 notify;
+       u8 triggered;
+       struct ipath_cq_wc *queue;
+       struct ipath_mmap_info *ip;
+};
+
+/*
+ * A segment is a linear region of low physical memory.
+ * XXX Maybe we should use phys addr here and kmap()/kunmap().
+ * Used by the verbs layer.
+ */
+struct ipath_seg {
+       void *vaddr;
+       size_t length;
+};
+
+/* The number of ipath_segs that fit in a page. */
+#define IPATH_SEGSZ     (PAGE_SIZE / sizeof (struct ipath_seg))
+
+struct ipath_segarray {
+       struct ipath_seg segs[IPATH_SEGSZ];
+};
+
+struct ipath_mregion {
+       struct ib_pd *pd;       /* shares refcnt of ibmr.pd */
+       u64 user_base;          /* User's address for this region */
+       u64 iova;               /* IB start address of this region */
+       size_t length;
+       u32 lkey;
+       u32 offset;             /* offset (bytes) to start of region */
+       int access_flags;
+       u32 max_segs;           /* number of ipath_segs in all the arrays */
+       u32 mapsz;              /* size of the map array */
+       struct ipath_segarray *map[0];  /* the segments */
+};
+
+/*
+ * These keep track of the copy progress within a memory region.
+ * Used by the verbs layer.
+ */
+struct ipath_sge {
+       struct ipath_mregion *mr;
+       void *vaddr;            /* kernel virtual address of segment */
+       u32 sge_length;         /* length of the SGE */
+       u32 length;             /* remaining length of the segment */
+       u16 m;                  /* current index: mr->map[m] */
+       u16 n;                  /* current index: mr->map[m]->segs[n] */
+};
+
+/* Memory region */
+struct ipath_mr {
+       struct ib_mr ibmr;
+       struct ib_umem *umem;
+       struct ipath_mregion mr;        /* must be last */
+};
+
+/*
+ * Send work request queue entry.
+ * The size of the sg_list is determined when the QP is created and stored
+ * in qp->s_max_sge.
+ */
+struct ipath_swqe {
+       struct ib_send_wr wr;   /* don't use wr.sg_list */
+       u32 psn;                /* first packet sequence number */
+       u32 lpsn;               /* last packet sequence number */
+       u32 ssn;                /* send sequence number */
+       u32 length;             /* total length of data in sg_list */
+       struct ipath_sge sg_list[0];
+};
+
+/*
+ * Receive work request queue entry.
+ * The size of the sg_list is determined when the QP (or SRQ) is created
+ * and stored in qp->r_rq.max_sge (or srq->rq.max_sge).
+ */
+struct ipath_rwqe {
+       u64 wr_id;
+       u8 num_sge;
+       struct ib_sge sg_list[0];
+};
+
+/*
+ * This structure is used to contain the head pointer, tail pointer,
+ * and receive work queue entries as a single memory allocation so
+ * it can be mmap'ed into user space.
+ * Note that the wq array elements are variable size so you can't
+ * just index into the array to get the N'th element;
+ * use get_rwqe_ptr() instead.
+ */
+struct ipath_rwq {
+       u32 head;               /* new work requests posted to the head */
+       u32 tail;               /* receives pull requests from here. */
+       struct ipath_rwqe wq[0];
+};
+
+struct ipath_rq {
+       struct ipath_rwq *wq;
+       spinlock_t lock;
+       u32 size;               /* size of RWQE array */
+       u8 max_sge;
+};
+
+struct ipath_srq {
+       struct ib_srq ibsrq;
+       struct ipath_rq rq;
+       struct ipath_mmap_info *ip;
+       /* send signal when number of RWQEs < limit */
+       u32 limit;
+};
+
+struct ipath_sge_state {
+       struct ipath_sge *sg_list;      /* next SGE to be used if any */
+       struct ipath_sge sge;   /* progress state for the current SGE */
+       u8 num_sge;
+       u8 static_rate;
+};
+
+/*
+ * This structure holds the information that the send tasklet needs
+ * to send a RDMA read response or atomic operation.
+ */
+struct ipath_ack_entry {
+       u8 opcode;
+       u8 sent;
+       u32 psn;
+       union {
+               struct ipath_sge_state rdma_sge;
+               u64 atomic_data;
+       };
+};
+
+/*
+ * Variables prefixed with s_ are for the requester (sender).
+ * Variables prefixed with r_ are for the responder (receiver).
+ * Variables prefixed with ack_ are for responder replies.
+ *
+ * Common variables are protected by both r_rq.lock and s_lock in that order
+ * which only happens in modify_qp() or changing the QP 'state'.
+ */
+struct ipath_qp {
+       struct ib_qp ibqp;
+       struct ipath_qp *next;          /* link list for QPN hash table */
+       struct ipath_qp *timer_next;    /* link list for ipath_ib_timer() */
+       struct ipath_qp *pio_next;      /* link for ipath_ib_piobufavail() */
+       struct list_head piowait;       /* link for wait PIO buf */
+       struct list_head timerwait;     /* link for waiting for timeouts */
+       struct ib_ah_attr remote_ah_attr;
+       struct ipath_ib_header s_hdr;   /* next packet header to send */
+       atomic_t refcount;
+       wait_queue_head_t wait;
+       wait_queue_head_t wait_dma;
+       struct tasklet_struct s_task;
+       struct ipath_mmap_info *ip;
+       struct ipath_sge_state *s_cur_sge;
+       struct ipath_verbs_txreq *s_tx;
+       struct ipath_sge_state s_sge;   /* current send request data */
+       struct ipath_ack_entry s_ack_queue[IPATH_MAX_RDMA_ATOMIC + 1];
+       struct ipath_sge_state s_ack_rdma_sge;
+       struct ipath_sge_state s_rdma_read_sge;
+       struct ipath_sge_state r_sge;   /* current receive data */
+       spinlock_t s_lock;
+       atomic_t s_dma_busy;
+       u16 s_pkt_delay;
+       u16 s_hdrwords;         /* size of s_hdr in 32 bit words */
+       u32 s_cur_size;         /* size of send packet in bytes */
+       u32 s_len;              /* total length of s_sge */
+       u32 s_rdma_read_len;    /* total length of s_rdma_read_sge */
+       u32 s_next_psn;         /* PSN for next request */
+       u32 s_last_psn;         /* last response PSN processed */
+       u32 s_psn;              /* current packet sequence number */
+       u32 s_ack_rdma_psn;     /* PSN for sending RDMA read responses */
+       u32 s_ack_psn;          /* PSN for acking sends and RDMA writes */
+       u32 s_rnr_timeout;      /* number of milliseconds for RNR timeout */
+       u32 r_ack_psn;          /* PSN for next ACK or atomic ACK */
+       u64 r_wr_id;            /* ID for current receive WQE */
+       unsigned long r_aflags;
+       u32 r_len;              /* total length of r_sge */
+       u32 r_rcv_len;          /* receive data len processed */
+       u32 r_psn;              /* expected rcv packet sequence number */
+       u32 r_msn;              /* message sequence number */
+       u8 state;               /* QP state */
+       u8 s_state;             /* opcode of last packet sent */
+       u8 s_ack_state;         /* opcode of packet to ACK */
+       u8 s_nak_state;         /* non-zero if NAK is pending */
+       u8 r_state;             /* opcode of last packet received */
+       u8 r_nak_state;         /* non-zero if NAK is pending */
+       u8 r_min_rnr_timer;     /* retry timeout value for RNR NAKs */
+       u8 r_flags;
+       u8 r_max_rd_atomic;     /* max number of RDMA read/atomic to receive */
+       u8 r_head_ack_queue;    /* index into s_ack_queue[] */
+       u8 qp_access_flags;
+       u8 s_max_sge;           /* size of s_wq->sg_list */
+       u8 s_retry_cnt;         /* number of times to retry */
+       u8 s_rnr_retry_cnt;
+       u8 s_retry;             /* requester retry counter */
+       u8 s_rnr_retry;         /* requester RNR retry counter */
+       u8 s_pkey_index;        /* PKEY index to use */
+       u8 s_max_rd_atomic;     /* max number of RDMA read/atomic to send */
+       u8 s_num_rd_atomic;     /* number of RDMA read/atomic pending */
+       u8 s_tail_ack_queue;    /* index into s_ack_queue[] */
+       u8 s_flags;
+       u8 s_dmult;
+       u8 s_draining;
+       u8 timeout;             /* Timeout for this QP */
+       enum ib_mtu path_mtu;
+       u32 remote_qpn;
+       u32 qkey;               /* QKEY for this QP (for UD or RD) */
+       u32 s_size;             /* send work queue size */
+       u32 s_head;             /* new entries added here */
+       u32 s_tail;             /* next entry to process */
+       u32 s_cur;              /* current work queue entry */
+       u32 s_last;             /* last un-ACK'ed entry */
+       u32 s_ssn;              /* SSN of tail entry */
+       u32 s_lsn;              /* limit sequence number (credit) */
+       struct ipath_swqe *s_wq;        /* send work queue */
+       struct ipath_swqe *s_wqe;
+       struct ipath_sge *r_ud_sg_list;
+       struct ipath_rq r_rq;           /* receive work queue */
+       struct ipath_sge r_sg_list[0];  /* verified SGEs */
+};
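To make the locking comment above concrete, code that updates fields shared by the send and receive sides would nest the two locks in the documented order. This is only an illustrative fragment, not a call site from the driver:

    spin_lock_irq(&qp->r_rq.lock);
    spin_lock(&qp->s_lock);
    /* ... change qp->state or other fields common to both sides ... */
    spin_unlock(&qp->s_lock);
    spin_unlock_irq(&qp->r_rq.lock);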
+
+/*
+ * Atomic bit definitions for r_aflags.
+ */
+#define IPATH_R_WRID_VALID     0
+
+/*
+ * Bit definitions for r_flags.
+ */
+#define IPATH_R_REUSE_SGE      0x01
+#define IPATH_R_RDMAR_SEQ      0x02
+
+/*
+ * Bit definitions for s_flags.
+ *
+ * IPATH_S_FENCE_PENDING - waiting for all prior RDMA read or atomic SWQEs
+ *                        before processing the next SWQE
+ * IPATH_S_RDMAR_PENDING - waiting for any RDMA read or atomic SWQEs
+ *                        before processing the next SWQE
+ * IPATH_S_WAITING - waiting for RNR timeout or send buffer available.
+ * IPATH_S_WAIT_SSN_CREDIT - waiting for RC credits to process next SWQE
+ * IPATH_S_WAIT_DMA - waiting for send DMA queue to drain before generating
+ *                   next send completion entry not via send DMA.
+ */
+#define IPATH_S_SIGNAL_REQ_WR  0x01
+#define IPATH_S_FENCE_PENDING  0x02
+#define IPATH_S_RDMAR_PENDING  0x04
+#define IPATH_S_ACK_PENDING    0x08
+#define IPATH_S_BUSY           0x10
+#define IPATH_S_WAITING                0x20
+#define IPATH_S_WAIT_SSN_CREDIT        0x40
+#define IPATH_S_WAIT_DMA       0x80
+
+#define IPATH_S_ANY_WAIT (IPATH_S_FENCE_PENDING | IPATH_S_RDMAR_PENDING | \
+       IPATH_S_WAITING | IPATH_S_WAIT_SSN_CREDIT | IPATH_S_WAIT_DMA)
+
+#define IPATH_PSN_CREDIT       512
+
+/*
+ * Since struct ipath_swqe is not a fixed size, we can't simply index into
+ * struct ipath_qp.s_wq.  This function does the array index computation.
+ */
+static inline struct ipath_swqe *get_swqe_ptr(struct ipath_qp *qp,
+                                             unsigned n)
+{
+       return (struct ipath_swqe *)((char *)qp->s_wq +
+                                    (sizeof(struct ipath_swqe) +
+                                     qp->s_max_sge *
+                                     sizeof(struct ipath_sge)) * n);
+}
+
+/*
+ * Since struct ipath_rwqe is not a fixed size, we can't simply index into
+ * struct ipath_rwq.wq.  This function does the array index computation.
+ */
+static inline struct ipath_rwqe *get_rwqe_ptr(struct ipath_rq *rq,
+                                             unsigned n)
+{
+       return (struct ipath_rwqe *)
+               ((char *) rq->wq->wq +
+                (sizeof(struct ipath_rwqe) +
+                 rq->max_sge * sizeof(struct ib_sge)) * n);
+}
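get_swqe_ptr() and get_rwqe_ptr() both step through arrays whose element size is only fixed when the queue is created, which is why plain indexing is ruled out. A stand-alone sketch of the same stride computation, using simplified hypothetical types:

    #include <stdio.h>
    #include <stdlib.h>

    /* Simplified stand-ins for ipath_rwqe / ib_sge: a fixed header followed
     * by a variable-length array whose size is only known at queue-create time. */
    struct sge { unsigned long addr; unsigned len; };
    struct rwqe { unsigned long wr_id; unsigned char num_sge; struct sge sg_list[]; };

    /* Same computation as get_rwqe_ptr(): element n lives at n * stride bytes. */
    static struct rwqe *rwqe_ptr(void *base, unsigned max_sge, unsigned n)
    {
            size_t stride = sizeof(struct rwqe) + max_sge * sizeof(struct sge);

            return (struct rwqe *)((char *)base + stride * n);
    }

    int main(void)
    {
            unsigned max_sge = 3, count = 4, i;
            size_t stride = sizeof(struct rwqe) + max_sge * sizeof(struct sge);
            void *wq = calloc(count, stride);

            for (i = 0; i < count; i++)
                    rwqe_ptr(wq, max_sge, i)->wr_id = 1000 + i;
            for (i = 0; i < count; i++)
                    printf("wqe %u: wr_id=%lu\n", i, rwqe_ptr(wq, max_sge, i)->wr_id);
            free(wq);
            return 0;
    }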
+
+/*
+ * QPN-map pages start out as NULL, they get allocated upon
+ * first use and are never deallocated. This way,
+ * large bitmaps are not allocated unless large numbers of QPs are used.
+ */
+struct qpn_map {
+       atomic_t n_free;
+       void *page;
+};
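For scale, assuming the common 4 KiB PAGE_SIZE (it is architecture dependent), QPNMAP_ENTRIES from the define earlier in this header works out to 2^24 / 4096 / 8 = 512 map entries, and each lazily allocated page is a bitmap covering 4096 * 8 = 32768 QP numbers.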
+
+struct ipath_qp_table {
+       spinlock_t lock;
+       u32 last;               /* last QP number allocated */
+       u32 max;                /* size of the hash table */
+       u32 nmaps;              /* size of the map table */
+       struct ipath_qp **table;
+       /* bit map of free numbers */
+       struct qpn_map map[QPNMAP_ENTRIES];
+};
+
+struct ipath_lkey_table {
+       spinlock_t lock;
+       u32 next;               /* next unused index (speeds search) */
+       u32 gen;                /* generation count */
+       u32 max;                /* size of the table */
+       struct ipath_mregion **table;
+};
+
+struct ipath_opcode_stats {
+       u64 n_packets;          /* number of packets */
+       u64 n_bytes;            /* total number of bytes */
+};
+
+struct ipath_ibdev {
+       struct ib_device ibdev;
+       struct ipath_devdata *dd;
+       struct list_head pending_mmaps;
+       spinlock_t mmap_offset_lock;
+       u32 mmap_offset;
+       int ib_unit;            /* This is the device number */
+       u16 sm_lid;             /* in host order */
+       u8 sm_sl;
+       u8 mkeyprot;
+       /* non-zero when timer is set */
+       unsigned long mkey_lease_timeout;
+
+       /* The following fields are really per port. */
+       struct ipath_qp_table qp_table;
+       struct ipath_lkey_table lk_table;
+       struct list_head pending[3];    /* FIFO of QPs waiting for ACKs */
+       struct list_head piowait;       /* list for wait PIO buf */
+       struct list_head txreq_free;
+       void *txreq_bufs;
+       /* list of QPs waiting for RNR timer */
+       struct list_head rnrwait;
+       spinlock_t pending_lock;
+       __be64 sys_image_guid;  /* in network order */
+       __be64 gid_prefix;      /* in network order */
+       __be64 mkey;
+
+       u32 n_pds_allocated;    /* number of PDs allocated for device */
+       spinlock_t n_pds_lock;
+       u32 n_ahs_allocated;    /* number of AHs allocated for device */
+       spinlock_t n_ahs_lock;
+       u32 n_cqs_allocated;    /* number of CQs allocated for device */
+       spinlock_t n_cqs_lock;
+       u32 n_qps_allocated;    /* number of QPs allocated for device */
+       spinlock_t n_qps_lock;
+       u32 n_srqs_allocated;   /* number of SRQs allocated for device */
+       spinlock_t n_srqs_lock;
+       u32 n_mcast_grps_allocated; /* number of mcast groups allocated */
+       spinlock_t n_mcast_grps_lock;
+
+       u64 ipath_sword;        /* total dwords sent (sample result) */
+       u64 ipath_rword;        /* total dwords received (sample result) */
+       u64 ipath_spkts;        /* total packets sent (sample result) */
+       u64 ipath_rpkts;        /* total packets received (sample result) */
+       /* # of ticks no data sent (sample result) */
+       u64 ipath_xmit_wait;
+       u64 rcv_errors;         /* # of packets with SW detected rcv errs */
+       u64 n_unicast_xmit;     /* total unicast packets sent */
+       u64 n_unicast_rcv;      /* total unicast packets received */
+       u64 n_multicast_xmit;   /* total multicast packets sent */
+       u64 n_multicast_rcv;    /* total multicast packets received */
+       u64 z_symbol_error_counter;             /* starting count for PMA */
+       u64 z_link_error_recovery_counter;      /* starting count for PMA */
+       u64 z_link_downed_counter;              /* starting count for PMA */
+       u64 z_port_rcv_errors;                  /* starting count for PMA */
+       u64 z_port_rcv_remphys_errors;          /* starting count for PMA */
+       u64 z_port_xmit_discards;               /* starting count for PMA */
+       u64 z_port_xmit_data;                   /* starting count for PMA */
+       u64 z_port_rcv_data;                    /* starting count for PMA */
+       u64 z_port_xmit_packets;                /* starting count for PMA */
+       u64 z_port_rcv_packets;                 /* starting count for PMA */
+       u32 z_pkey_violations;                  /* starting count for PMA */
+       u32 z_local_link_integrity_errors;      /* starting count for PMA */
+       u32 z_excessive_buffer_overrun_errors;  /* starting count for PMA */
+       u32 z_vl15_dropped;                     /* starting count for PMA */
+       u32 n_rc_resends;
+       u32 n_rc_acks;
+       u32 n_rc_qacks;
+       u32 n_seq_naks;
+       u32 n_rdma_seq;
+       u32 n_rnr_naks;
+       u32 n_other_naks;
+       u32 n_timeouts;
+       u32 n_pkt_drops;
+       u32 n_vl15_dropped;
+       u32 n_wqe_errs;
+       u32 n_rdma_dup_busy;
+       u32 n_piowait;
+       u32 n_unaligned;
+       u32 port_cap_flags;
+       u32 pma_sample_start;
+       u32 pma_sample_interval;
+       __be16 pma_counter_select[5];
+       u16 pma_tag;
+       u16 qkey_violations;
+       u16 mkey_violations;
+       u16 mkey_lease_period;
+       u16 pending_index;      /* which pending queue is active */
+       u8 pma_sample_status;
+       u8 subnet_timeout;
+       u8 vl_high_limit;
+       struct ipath_opcode_stats opstats[128];
+};
+
+struct ipath_verbs_counters {
+       u64 symbol_error_counter;
+       u64 link_error_recovery_counter;
+       u64 link_downed_counter;
+       u64 port_rcv_errors;
+       u64 port_rcv_remphys_errors;
+       u64 port_xmit_discards;
+       u64 port_xmit_data;
+       u64 port_rcv_data;
+       u64 port_xmit_packets;
+       u64 port_rcv_packets;
+       u32 local_link_integrity_errors;
+       u32 excessive_buffer_overrun_errors;
+       u32 vl15_dropped;
+};
+
+struct ipath_verbs_txreq {
+       struct ipath_qp         *qp;
+       struct ipath_swqe       *wqe;
+       u32                      map_len;
+       u32                      len;
+       struct ipath_sge_state  *ss;
+       struct ipath_pio_header  hdr;
+       struct ipath_sdma_txreq  txreq;
+};
+
+static inline struct ipath_mr *to_imr(struct ib_mr *ibmr)
+{
+       return container_of(ibmr, struct ipath_mr, ibmr);
+}
+
+static inline struct ipath_pd *to_ipd(struct ib_pd *ibpd)
+{
+       return container_of(ibpd, struct ipath_pd, ibpd);
+}
+
+static inline struct ipath_ah *to_iah(struct ib_ah *ibah)
+{
+       return container_of(ibah, struct ipath_ah, ibah);
+}
+
+static inline struct ipath_cq *to_icq(struct ib_cq *ibcq)
+{
+       return container_of(ibcq, struct ipath_cq, ibcq);
+}
+
+static inline struct ipath_srq *to_isrq(struct ib_srq *ibsrq)
+{
+       return container_of(ibsrq, struct ipath_srq, ibsrq);
+}
+
+static inline struct ipath_qp *to_iqp(struct ib_qp *ibqp)
+{
+       return container_of(ibqp, struct ipath_qp, ibqp);
+}
+
+static inline struct ipath_ibdev *to_idev(struct ib_device *ibdev)
+{
+       return container_of(ibdev, struct ipath_ibdev, ibdev);
+}
+
+/*
+ * This must be called with s_lock held.
+ */
+static inline void ipath_schedule_send(struct ipath_qp *qp)
+{
+       if (qp->s_flags & IPATH_S_ANY_WAIT)
+               qp->s_flags &= ~IPATH_S_ANY_WAIT;
+       if (!(qp->s_flags & IPATH_S_BUSY))
+               tasklet_hi_schedule(&qp->s_task);
+}
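A caller honours the "s_lock held" requirement by wrapping the call, roughly as in this illustrative fragment (not a call site copied from the driver):

    unsigned long flags;

    spin_lock_irqsave(&qp->s_lock, flags);
    ipath_schedule_send(qp);        /* s_lock is held, as required */
    spin_unlock_irqrestore(&qp->s_lock, flags);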
+
+int ipath_process_mad(struct ib_device *ibdev,
+                     int mad_flags,
+                     u8 port_num,
+                     const struct ib_wc *in_wc,
+                     const struct ib_grh *in_grh,
+                     const struct ib_mad_hdr *in, size_t in_mad_size,
+                     struct ib_mad_hdr *out, size_t *out_mad_size,
+                     u16 *out_mad_pkey_index);
+
+/*
+ * Compare the lower 24 bits of the two values.
+ * Returns an integer <, ==, or > than zero.
+ */
+static inline int ipath_cmp24(u32 a, u32 b)
+{
+       return (((int) a) - ((int) b)) << 8;
+}
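The shift by 8 works because only the low 24 bits of a PSN are meaningful: pushing the difference into the top 24 bits of a 32-bit int lets the sign bit report ordering even across wraparound. A stand-alone check; cmp24 is a hypothetical rewrite of the helper using unsigned arithmetic so the shift is well defined:

    #include <stdio.h>
    #include <stdint.h>

    /* Same idea as ipath_cmp24(): only the low 24 bits matter, and the result
     * is negative, zero, or positive like memcmp().  Shifting the difference
     * up by 8 bits puts the 24-bit difference in the sign-carrying top bits. */
    static int cmp24(uint32_t a, uint32_t b)
    {
            return (int32_t)((a - b) << 8);
    }

    int main(void)
    {
            printf("%d\n", cmp24(5, 5) == 0);               /* equal */
            printf("%d\n", cmp24(10, 3) > 0);               /* a after b */
            printf("%d\n", cmp24(5, 0xFFFFFF) > 0);         /* wrapped: 5 is 6 ahead */
            printf("%d\n", cmp24(0xFFFFFF, 5) < 0);
            return 0;
    }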
+
+struct ipath_mcast *ipath_mcast_find(union ib_gid *mgid);
+
+int ipath_snapshot_counters(struct ipath_devdata *dd, u64 *swords,
+                           u64 *rwords, u64 *spkts, u64 *rpkts,
+                           u64 *xmit_wait);
+
+int ipath_get_counters(struct ipath_devdata *dd,
+                      struct ipath_verbs_counters *cntrs);
+
+int ipath_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
+
+int ipath_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
+
+int ipath_mcast_tree_empty(void);
+
+__be32 ipath_compute_aeth(struct ipath_qp *qp);
+
+struct ipath_qp *ipath_lookup_qpn(struct ipath_qp_table *qpt, u32 qpn);
+
+struct ib_qp *ipath_create_qp(struct ib_pd *ibpd,
+                             struct ib_qp_init_attr *init_attr,
+                             struct ib_udata *udata);
+
+int ipath_destroy_qp(struct ib_qp *ibqp);
+
+int ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err);
+
+int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+                   int attr_mask, struct ib_udata *udata);
+
+int ipath_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+                  int attr_mask, struct ib_qp_init_attr *init_attr);
+
+unsigned ipath_free_all_qps(struct ipath_qp_table *qpt);
+
+int ipath_init_qp_table(struct ipath_ibdev *idev, int size);
+
+void ipath_get_credit(struct ipath_qp *qp, u32 aeth);
+
+unsigned ipath_ib_rate_to_mult(enum ib_rate rate);
+
+int ipath_verbs_send(struct ipath_qp *qp, struct ipath_ib_header *hdr,
+                    u32 hdrwords, struct ipath_sge_state *ss, u32 len);
+
+void ipath_copy_sge(struct ipath_sge_state *ss, void *data, u32 length);
+
+void ipath_skip_sge(struct ipath_sge_state *ss, u32 length);
+
+void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
+                 int has_grh, void *data, u32 tlen, struct ipath_qp *qp);
+
+void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
+                 int has_grh, void *data, u32 tlen, struct ipath_qp *qp);
+
+void ipath_restart_rc(struct ipath_qp *qp, u32 psn);
+
+void ipath_rc_error(struct ipath_qp *qp, enum ib_wc_status err);
+
+int ipath_post_ud_send(struct ipath_qp *qp, struct ib_send_wr *wr);
+
+void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
+                 int has_grh, void *data, u32 tlen, struct ipath_qp *qp);
+
+int ipath_alloc_lkey(struct ipath_lkey_table *rkt,
+                    struct ipath_mregion *mr);
+
+void ipath_free_lkey(struct ipath_lkey_table *rkt, u32 lkey);
+
+int ipath_lkey_ok(struct ipath_qp *qp, struct ipath_sge *isge,
+                 struct ib_sge *sge, int acc);
+
+int ipath_rkey_ok(struct ipath_qp *qp, struct ipath_sge_state *ss,
+                 u32 len, u64 vaddr, u32 rkey, int acc);
+
+int ipath_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+                          struct ib_recv_wr **bad_wr);
+
+struct ib_srq *ipath_create_srq(struct ib_pd *ibpd,
+                               struct ib_srq_init_attr *srq_init_attr,
+                               struct ib_udata *udata);
+
+int ipath_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+                    enum ib_srq_attr_mask attr_mask,
+                    struct ib_udata *udata);
+
+int ipath_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr);
+
+int ipath_destroy_srq(struct ib_srq *ibsrq);
+
+void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int sig);
+
+int ipath_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry);
+
+struct ib_cq *ipath_create_cq(struct ib_device *ibdev,
+                             const struct ib_cq_init_attr *attr,
+                             struct ib_ucontext *context,
+                             struct ib_udata *udata);
+
+int ipath_destroy_cq(struct ib_cq *ibcq);
+
+int ipath_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags);
+
+int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata);
+
+struct ib_mr *ipath_get_dma_mr(struct ib_pd *pd, int acc);
+
+struct ib_mr *ipath_reg_phys_mr(struct ib_pd *pd,
+                               struct ib_phys_buf *buffer_list,
+                               int num_phys_buf, int acc, u64 *iova_start);
+
+struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+                               u64 virt_addr, int mr_access_flags,
+                               struct ib_udata *udata);
+
+int ipath_dereg_mr(struct ib_mr *ibmr);
+
+struct ib_fmr *ipath_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
+                              struct ib_fmr_attr *fmr_attr);
+
+int ipath_map_phys_fmr(struct ib_fmr *ibfmr, u64 * page_list,
+                      int list_len, u64 iova);
+
+int ipath_unmap_fmr(struct list_head *fmr_list);
+
+int ipath_dealloc_fmr(struct ib_fmr *ibfmr);
+
+void ipath_release_mmap_info(struct kref *ref);
+
+struct ipath_mmap_info *ipath_create_mmap_info(struct ipath_ibdev *dev,
+                                              u32 size,
+                                              struct ib_ucontext *context,
+                                              void *obj);
+
+void ipath_update_mmap_info(struct ipath_ibdev *dev,
+                           struct ipath_mmap_info *ip,
+                           u32 size, void *obj);
+
+int ipath_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
+
+void ipath_insert_rnr_queue(struct ipath_qp *qp);
+
+int ipath_init_sge(struct ipath_qp *qp, struct ipath_rwqe *wqe,
+                  u32 *lengthp, struct ipath_sge_state *ss);
+
+int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only);
+
+u32 ipath_make_grh(struct ipath_ibdev *dev, struct ib_grh *hdr,
+                  struct ib_global_route *grh, u32 hwords, u32 nwords);
+
+void ipath_make_ruc_header(struct ipath_ibdev *dev, struct ipath_qp *qp,
+                          struct ipath_other_headers *ohdr,
+                          u32 bth0, u32 bth2);
+
+void ipath_do_send(unsigned long data);
+
+void ipath_send_complete(struct ipath_qp *qp, struct ipath_swqe *wqe,
+                        enum ib_wc_status status);
+
+int ipath_make_rc_req(struct ipath_qp *qp);
+
+int ipath_make_uc_req(struct ipath_qp *qp);
+
+int ipath_make_ud_req(struct ipath_qp *qp);
+
+int ipath_register_ib_device(struct ipath_devdata *);
+
+void ipath_unregister_ib_device(struct ipath_ibdev *);
+
+void ipath_ib_rcv(struct ipath_ibdev *, void *, void *, u32);
+
+int ipath_ib_piobufavail(struct ipath_ibdev *);
+
+unsigned ipath_get_npkeys(struct ipath_devdata *);
+
+u32 ipath_get_cr_errpkey(struct ipath_devdata *);
+
+unsigned ipath_get_pkey(struct ipath_devdata *, unsigned);
+
+extern const enum ib_wc_opcode ib_ipath_wc_opcode[];
+
+/*
+ * Below converts HCA-specific LinkTrainingState to IB PhysPortState
+ * values.
+ */
+extern const u8 ipath_cvt_physportstate[];
+#define IB_PHYSPORTSTATE_SLEEP 1
+#define IB_PHYSPORTSTATE_POLL 2
+#define IB_PHYSPORTSTATE_DISABLED 3
+#define IB_PHYSPORTSTATE_CFG_TRAIN 4
+#define IB_PHYSPORTSTATE_LINKUP 5
+#define IB_PHYSPORTSTATE_LINK_ERR_RECOVER 6
+
+extern const int ib_ipath_state_ops[];
+
+extern unsigned int ib_ipath_lkey_table_size;
+
+extern unsigned int ib_ipath_max_cqes;
+
+extern unsigned int ib_ipath_max_cqs;
+
+extern unsigned int ib_ipath_max_qp_wrs;
+
+extern unsigned int ib_ipath_max_qps;
+
+extern unsigned int ib_ipath_max_sges;
+
+extern unsigned int ib_ipath_max_mcast_grps;
+
+extern unsigned int ib_ipath_max_mcast_qp_attached;
+
+extern unsigned int ib_ipath_max_srqs;
+
+extern unsigned int ib_ipath_max_srq_sges;
+
+extern unsigned int ib_ipath_max_srq_wrs;
+
+extern const u32 ib_ipath_rnr_table[];
+
+extern struct ib_dma_mapping_ops ipath_dma_mapping_ops;
+
+#endif                         /* IPATH_VERBS_H */
diff --git a/drivers/staging/rdma/ipath/ipath_verbs_mcast.c b/drivers/staging/rdma/ipath/ipath_verbs_mcast.c
new file mode 100644 (file)
index 0000000..6216ea9
--- /dev/null
@@ -0,0 +1,364 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/rculist.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+#include "ipath_verbs.h"
+
+/*
+ * Global table of GID to attached QPs.
+ * The table is global to all ipath devices since a send from one QP/device
+ * needs to be locally routed to any locally attached QPs on the same
+ * or different device.
+ */
+static struct rb_root mcast_tree;
+static DEFINE_SPINLOCK(mcast_lock);
+
+/**
+ * ipath_mcast_qp_alloc - alloc a struct to link a QP to mcast GID struct
+ * @qp: the QP to link
+ */
+static struct ipath_mcast_qp *ipath_mcast_qp_alloc(struct ipath_qp *qp)
+{
+       struct ipath_mcast_qp *mqp;
+
+       mqp = kmalloc(sizeof *mqp, GFP_KERNEL);
+       if (!mqp)
+               goto bail;
+
+       mqp->qp = qp;
+       atomic_inc(&qp->refcount);
+
+bail:
+       return mqp;
+}
+
+static void ipath_mcast_qp_free(struct ipath_mcast_qp *mqp)
+{
+       struct ipath_qp *qp = mqp->qp;
+
+       /* Notify ipath_destroy_qp() if it is waiting. */
+       if (atomic_dec_and_test(&qp->refcount))
+               wake_up(&qp->wait);
+
+       kfree(mqp);
+}
+
+/**
+ * ipath_mcast_alloc - allocate the multicast GID structure
+ * @mgid: the multicast GID
+ *
+ * A list of QPs will be attached to this structure.
+ */
+static struct ipath_mcast *ipath_mcast_alloc(union ib_gid *mgid)
+{
+       struct ipath_mcast *mcast;
+
+       mcast = kmalloc(sizeof *mcast, GFP_KERNEL);
+       if (!mcast)
+               goto bail;
+
+       mcast->mgid = *mgid;
+       INIT_LIST_HEAD(&mcast->qp_list);
+       init_waitqueue_head(&mcast->wait);
+       atomic_set(&mcast->refcount, 0);
+       mcast->n_attached = 0;
+
+bail:
+       return mcast;
+}
+
+static void ipath_mcast_free(struct ipath_mcast *mcast)
+{
+       struct ipath_mcast_qp *p, *tmp;
+
+       list_for_each_entry_safe(p, tmp, &mcast->qp_list, list)
+               ipath_mcast_qp_free(p);
+
+       kfree(mcast);
+}
+
+/**
+ * ipath_mcast_find - search the global table for the given multicast GID
+ * @mgid: the multicast GID to search for
+ *
+ * Returns NULL if not found.
+ *
+ * The caller is responsible for decrementing the reference count if found.
+ */
+struct ipath_mcast *ipath_mcast_find(union ib_gid *mgid)
+{
+       struct rb_node *n;
+       unsigned long flags;
+       struct ipath_mcast *mcast;
+
+       spin_lock_irqsave(&mcast_lock, flags);
+       n = mcast_tree.rb_node;
+       while (n) {
+               int ret;
+
+               mcast = rb_entry(n, struct ipath_mcast, rb_node);
+
+               ret = memcmp(mgid->raw, mcast->mgid.raw,
+                            sizeof(union ib_gid));
+               if (ret < 0)
+                       n = n->rb_left;
+               else if (ret > 0)
+                       n = n->rb_right;
+               else {
+                       atomic_inc(&mcast->refcount);
+                       spin_unlock_irqrestore(&mcast_lock, flags);
+                       goto bail;
+               }
+       }
+       spin_unlock_irqrestore(&mcast_lock, flags);
+
+       mcast = NULL;
+
+bail:
+       return mcast;
+}
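Since the lookup above returns with the reference count raised, the caller has to drop it when done. The fragment below is a hedged sketch of that pattern, with the wake_up() pairing assumed from the wait_event() calls in the detach path further down:

    mcast = ipath_mcast_find(&packet_mgid);
    if (mcast) {
            /* ... hand the packet to every QP on mcast->qp_list ... */
            if (atomic_dec_return(&mcast->refcount) <= 1)
                    wake_up(&mcast->wait);
    }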
+
+/**
+ * ipath_mcast_add - insert mcast GID into table and attach QP struct
+ * @mcast: the mcast GID table
+ * @mqp: the QP to attach
+ *
+ * Return zero if both were added.  Return EEXIST if the GID was already in
+ * the table but the QP was added.  Return ESRCH if the QP was already
+ * attached and neither structure was added.  Return ENOMEM if the limit on
+ * attached QPs or allocated multicast groups would be exceeded.
+ */
+static int ipath_mcast_add(struct ipath_ibdev *dev,
+                          struct ipath_mcast *mcast,
+                          struct ipath_mcast_qp *mqp)
+{
+       struct rb_node **n = &mcast_tree.rb_node;
+       struct rb_node *pn = NULL;
+       int ret;
+
+       spin_lock_irq(&mcast_lock);
+
+       while (*n) {
+               struct ipath_mcast *tmcast;
+               struct ipath_mcast_qp *p;
+
+               pn = *n;
+               tmcast = rb_entry(pn, struct ipath_mcast, rb_node);
+
+               ret = memcmp(mcast->mgid.raw, tmcast->mgid.raw,
+                            sizeof(union ib_gid));
+               if (ret < 0) {
+                       n = &pn->rb_left;
+                       continue;
+               }
+               if (ret > 0) {
+                       n = &pn->rb_right;
+                       continue;
+               }
+
+               /* Search the QP list to see if this is already there. */
+               list_for_each_entry_rcu(p, &tmcast->qp_list, list) {
+                       if (p->qp == mqp->qp) {
+                               ret = ESRCH;
+                               goto bail;
+                       }
+               }
+               if (tmcast->n_attached == ib_ipath_max_mcast_qp_attached) {
+                       ret = ENOMEM;
+                       goto bail;
+               }
+
+               tmcast->n_attached++;
+
+               list_add_tail_rcu(&mqp->list, &tmcast->qp_list);
+               ret = EEXIST;
+               goto bail;
+       }
+
+       spin_lock(&dev->n_mcast_grps_lock);
+       if (dev->n_mcast_grps_allocated == ib_ipath_max_mcast_grps) {
+               spin_unlock(&dev->n_mcast_grps_lock);
+               ret = ENOMEM;
+               goto bail;
+       }
+
+       dev->n_mcast_grps_allocated++;
+       spin_unlock(&dev->n_mcast_grps_lock);
+
+       mcast->n_attached++;
+
+       list_add_tail_rcu(&mqp->list, &mcast->qp_list);
+
+       atomic_inc(&mcast->refcount);
+       rb_link_node(&mcast->rb_node, pn, n);
+       rb_insert_color(&mcast->rb_node, &mcast_tree);
+
+       ret = 0;
+
+bail:
+       spin_unlock_irq(&mcast_lock);
+
+       return ret;
+}
+
+int ipath_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+       struct ipath_qp *qp = to_iqp(ibqp);
+       struct ipath_ibdev *dev = to_idev(ibqp->device);
+       struct ipath_mcast *mcast;
+       struct ipath_mcast_qp *mqp;
+       int ret;
+
+       /*
+        * Allocate the data structures here since it's better to do this
+        * outside of spin locks and they will most likely be needed.
+        */
+       mcast = ipath_mcast_alloc(gid);
+       if (mcast == NULL) {
+               ret = -ENOMEM;
+               goto bail;
+       }
+       mqp = ipath_mcast_qp_alloc(qp);
+       if (mqp == NULL) {
+               ipath_mcast_free(mcast);
+               ret = -ENOMEM;
+               goto bail;
+       }
+       switch (ipath_mcast_add(dev, mcast, mqp)) {
+       case ESRCH:
+               /* Neither was used: can't attach the same QP twice. */
+               ipath_mcast_qp_free(mqp);
+               ipath_mcast_free(mcast);
+               ret = -EINVAL;
+               goto bail;
+       case EEXIST:            /* The mcast wasn't used */
+               ipath_mcast_free(mcast);
+               break;
+       case ENOMEM:
+               /* Exceeded the maximum number of mcast groups. */
+               ipath_mcast_qp_free(mqp);
+               ipath_mcast_free(mcast);
+               ret = -ENOMEM;
+               goto bail;
+       default:
+               break;
+       }
+
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+int ipath_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+       struct ipath_qp *qp = to_iqp(ibqp);
+       struct ipath_ibdev *dev = to_idev(ibqp->device);
+       struct ipath_mcast *mcast = NULL;
+       struct ipath_mcast_qp *p, *tmp;
+       struct rb_node *n;
+       int last = 0;
+       int ret;
+
+       spin_lock_irq(&mcast_lock);
+
+       /* Find the GID in the mcast table. */
+       n = mcast_tree.rb_node;
+       while (1) {
+               if (n == NULL) {
+                       spin_unlock_irq(&mcast_lock);
+                       ret = -EINVAL;
+                       goto bail;
+               }
+
+               mcast = rb_entry(n, struct ipath_mcast, rb_node);
+               ret = memcmp(gid->raw, mcast->mgid.raw,
+                            sizeof(union ib_gid));
+               if (ret < 0)
+                       n = n->rb_left;
+               else if (ret > 0)
+                       n = n->rb_right;
+               else
+                       break;
+       }
+
+       /* Search the QP list. */
+       list_for_each_entry_safe(p, tmp, &mcast->qp_list, list) {
+               if (p->qp != qp)
+                       continue;
+               /*
+                * We found it, so remove it, but don't poison the forward
+                * link until we are sure there are no list walkers.
+                */
+               list_del_rcu(&p->list);
+               mcast->n_attached--;
+
+               /* If this was the last attached QP, remove the GID too. */
+               if (list_empty(&mcast->qp_list)) {
+                       rb_erase(&mcast->rb_node, &mcast_tree);
+                       last = 1;
+               }
+               break;
+       }
+
+       spin_unlock_irq(&mcast_lock);
+
+       if (p) {
+               /*
+                * Wait for any list walkers to finish before freeing the
+                * list element.
+                */
+               wait_event(mcast->wait, atomic_read(&mcast->refcount) <= 1);
+               ipath_mcast_qp_free(p);
+       }
+       if (last) {
+               atomic_dec(&mcast->refcount);
+               wait_event(mcast->wait, !atomic_read(&mcast->refcount));
+               ipath_mcast_free(mcast);
+               spin_lock_irq(&dev->n_mcast_grps_lock);
+               dev->n_mcast_grps_allocated--;
+               spin_unlock_irq(&dev->n_mcast_grps_lock);
+       }
+
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+int ipath_mcast_tree_empty(void)
+{
+       return mcast_tree.rb_node == NULL;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_wc_ppc64.c b/drivers/staging/rdma/ipath/ipath_wc_ppc64.c
new file mode 100644 (file)
index 0000000..1a7e20a
--- /dev/null
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * This file is conditionally built on PowerPC only.  Otherwise weak symbol
+ * versions of the functions exported from here are used.
+ */
+
+#include "ipath_kernel.h"
+
+/**
+ * ipath_enable_wc - enable write combining for MMIO writes to the device
+ * @dd: infinipath device
+ *
+ * Nothing to do on PowerPC, so just return without error.
+ */
+int ipath_enable_wc(struct ipath_devdata *dd)
+{
+       return 0;
+}
diff --git a/drivers/staging/rdma/ipath/ipath_wc_x86_64.c b/drivers/staging/rdma/ipath/ipath_wc_x86_64.c
new file mode 100644 (file)
index 0000000..7b6e4c8
--- /dev/null
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * This file is conditionally built on x86_64 only.  Otherwise weak symbol
+ * versions of the functions exported from here are used.
+ */
+
+#include <linux/pci.h>
+#include <asm/processor.h>
+
+#include "ipath_kernel.h"
+
+/**
+ * ipath_enable_wc - enable write combining for MMIO writes to the device
+ * @dd: infinipath device
+ *
+ * This routine is x86_64-specific; it twiddles the CPU's MTRRs to enable
+ * write combining.
+ */
+int ipath_enable_wc(struct ipath_devdata *dd)
+{
+       int ret = 0;
+       u64 pioaddr, piolen;
+       unsigned bits;
+       const unsigned long addr = pci_resource_start(dd->pcidev, 0);
+       const size_t len = pci_resource_len(dd->pcidev, 0);
+
+       /*
+        * Set the PIO buffers to be WCCOMB, so we get HT bursts to the
+        * chip.  Linux (possibly the hardware) requires the region to start
+        * at an address aligned to its length, and the length itself has to
+        * be a power of 2.
+        * For rev1, that means the base address, for rev2, it will be just
+        * the PIO buffers themselves.
+        * For chips with two sets of buffers, the calculations are
+        * somewhat more complicated; we need to sum, and the piobufbase
+        * register has both offsets, 2K in low 32 bits, 4K in high 32 bits.
+        * The buffers are still packed, so a single range covers both.
+        */
+       if (dd->ipath_piobcnt2k && dd->ipath_piobcnt4k) { /* 2 sizes */
+               unsigned long pio2kbase, pio4kbase;
+               pio2kbase = dd->ipath_piobufbase & 0xffffffffUL;
+               pio4kbase = (dd->ipath_piobufbase >> 32) & 0xffffffffUL;
+               if (pio2kbase < pio4kbase) { /* all, for now */
+                       pioaddr = addr + pio2kbase;
+                       piolen = pio4kbase - pio2kbase +
+                               dd->ipath_piobcnt4k * dd->ipath_4kalign;
+               } else {
+                       pioaddr = addr + pio4kbase;
+                       piolen = pio2kbase - pio4kbase +
+                               dd->ipath_piobcnt2k * dd->ipath_palign;
+               }
+       } else {  /* single buffer size (2K, currently) */
+               pioaddr = addr + dd->ipath_piobufbase;
+               piolen = dd->ipath_piobcnt2k * dd->ipath_palign +
+                       dd->ipath_piobcnt4k * dd->ipath_4kalign;
+       }
+
+       for (bits = 0; !(piolen & (1ULL << bits)); bits++)
+               /* do nothing */ ;
+
+       if (piolen != (1ULL << bits)) {
+               piolen >>= bits;
+               while (piolen >>= 1)
+                       bits++;
+               piolen = 1ULL << (bits + 1);
+       }
+       if (pioaddr & (piolen - 1)) {
+               u64 atmp;
+               ipath_dbg("pioaddr %llx not on right boundary for size "
+                         "%llx, fixing\n",
+                         (unsigned long long) pioaddr,
+                         (unsigned long long) piolen);
+               atmp = pioaddr & ~(piolen - 1);
+               if (atmp < addr || (atmp + piolen) > (addr + len)) {
+                       ipath_dev_err(dd, "No way to align address/size "
+                                     "(%llx/%llx), no WC mtrr\n",
+                                     (unsigned long long) atmp,
+                                     (unsigned long long) piolen << 1);
+                       ret = -ENODEV;
+               } else {
+                       ipath_dbg("changing WC base from %llx to %llx, "
+                                 "len from %llx to %llx\n",
+                                 (unsigned long long) pioaddr,
+                                 (unsigned long long) atmp,
+                                 (unsigned long long) piolen,
+                                 (unsigned long long) piolen << 1);
+                       pioaddr = atmp;
+                       piolen <<= 1;
+               }
+       }
+
+       if (!ret) {
+               dd->wc_cookie = arch_phys_wc_add(pioaddr, piolen);
+               if (dd->wc_cookie < 0) {
+                       ipath_dev_err(dd, "Setting mtrr failed on PIO buffers\n");
+                       ret = -ENODEV;
+               } else if (dd->wc_cookie == 0)
+                       ipath_cdbg(VERBOSE, "Set mtrr for chip to WC not needed\n");
+               else
+                       ipath_cdbg(VERBOSE, "Set mtrr for chip to WC\n");
+       }
+
+       return ret;
+}
+
+/**
+ * ipath_disable_wc - disable write combining for MMIO writes to the device
+ * @dd: infinipath device
+ */
+void ipath_disable_wc(struct ipath_devdata *dd)
+{
+       arch_phys_wc_del(dd->wc_cookie);
+}
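
The block comment in ipath_enable_wc() above describes the constraint being enforced: the write-combining region must have a power-of-2 length, a base address aligned to that length, and any rounding must stay inside the BAR. A simplified standalone sketch of that sizing rule (plain C, made-up addresses; not the driver's exact fix-up path, which also doubles the size when it has to realign the base):

#include <stdint.h>
#include <stdio.h>

/* Round len up to the next power of two (len must be non-zero). */
static uint64_t roundup_pow2(uint64_t len)
{
	uint64_t p = 1;

	while (p < len)
		p <<= 1;
	return p;
}

int main(void)
{
	uint64_t bar_base = 0xfe000000, bar_len = 0x800000;	/* whole BAR   */
	uint64_t pioaddr  = 0xfe003000, piolen  = 0x5000;	/* PIO buffers */
	uint64_t size = roundup_pow2(piolen);
	uint64_t base = pioaddr & ~(size - 1);			/* align down  */

	/* The grown region must still fall inside the BAR, as in the driver. */
	if (base < bar_base || base + size > bar_base + bar_len)
		printf("cannot align 0x%llx/0x%llx for WC\n",
		       (unsigned long long)pioaddr, (unsigned long long)size);
	else
		printf("WC region: base 0x%llx, size 0x%llx\n",
		       (unsigned long long)base, (unsigned long long)size);
	return 0;
}
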
index 6da7e49a662758c7e673aea32705a86e2269cb2c..2693c46afdc0bd7b284cb11254d8d31c5987c1b0 100644 (file)
@@ -20,6 +20,7 @@
  */
 
 #include <linux/uuid.h>
+#include <linux/io.h>
 
 #include "version.h"
 #include "visorbus.h"
@@ -35,7 +36,7 @@ static const uuid_le spar_video_guid = SPAR_CONSOLEVIDEO_CHANNEL_PROTOCOL_GUID;
 struct visorchannel {
        u64 physaddr;
        ulong nbytes;
-       void __iomem *mapped;
+       void *mapped;
        bool requested;
        struct channel_header chan_hdr;
        uuid_le guid;
@@ -92,7 +93,7 @@ visorchannel_create_guts(u64 physaddr, unsigned long channel_bytes,
                }
        }
 
-       channel->mapped = ioremap_cache(physaddr, size);
+       channel->mapped = memremap(physaddr, size, MEMREMAP_WB);
        if (!channel->mapped) {
                release_mem_region(physaddr, size);
                goto cleanup;
@@ -112,7 +113,7 @@ visorchannel_create_guts(u64 physaddr, unsigned long channel_bytes,
        if (uuid_le_cmp(guid, NULL_UUID_LE) == 0)
                guid = channel->chan_hdr.chtype;
 
-       iounmap(channel->mapped);
+       memunmap(channel->mapped);
        if (channel->requested)
                release_mem_region(channel->physaddr, channel->nbytes);
        channel->mapped = NULL;
@@ -125,7 +126,8 @@ visorchannel_create_guts(u64 physaddr, unsigned long channel_bytes,
                }
        }
 
-       channel->mapped = ioremap_cache(channel->physaddr, channel_bytes);
+       channel->mapped = memremap(channel->physaddr, channel_bytes,
+                       MEMREMAP_WB);
        if (!channel->mapped) {
                release_mem_region(channel->physaddr, channel_bytes);
                goto cleanup;
@@ -166,7 +168,7 @@ visorchannel_destroy(struct visorchannel *channel)
        if (!channel)
                return;
        if (channel->mapped) {
-               iounmap(channel->mapped);
+               memunmap(channel->mapped);
                if (channel->requested)
                        release_mem_region(channel->physaddr, channel->nbytes);
        }
@@ -240,7 +242,7 @@ visorchannel_read(struct visorchannel *channel, ulong offset,
        if (offset + nbytes > channel->nbytes)
                return -EIO;
 
-       memcpy_fromio(local, channel->mapped + offset, nbytes);
+       memcpy(local, channel->mapped + offset, nbytes);
 
        return 0;
 }
@@ -262,7 +264,7 @@ visorchannel_write(struct visorchannel *channel, ulong offset,
                       local, copy_size);
        }
 
-       memcpy_toio(channel->mapped + offset, local, nbytes);
+       memcpy(channel->mapped + offset, local, nbytes);
 
        return 0;
 }
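
The hunks above swap ioremap_cache()/memcpy_fromio() for memremap(..., MEMREMAP_WB) and plain memcpy(), since the channel lives in ordinary write-back RAM rather than device MMIO and no __iomem accessors are needed. A minimal sketch of that access pattern (kernel C, illustrative function name, not visorbus code):

#include <linux/errno.h>
#include <linux/io.h>
#include <linux/string.h>
#include <linux/types.h>

/* Map a physical RAM range write-back, copy nbytes out of it, unmap. */
static int demo_read_region(phys_addr_t pa, void *dst, size_t nbytes)
{
	void *va = memremap(pa, nbytes, MEMREMAP_WB);

	if (!va)
		return -ENOMEM;

	memcpy(dst, va, nbytes);	/* ordinary pointer, ordinary memcpy */
	memunmap(va);
	return 0;
}
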
index 4b76cb441ed4eb683a1bffbc792a40aa8b2adebd..94419c36d2affdc2e3f54775fee102d701d7fe86 100644 (file)
@@ -118,7 +118,7 @@ static struct visorchannel *controlvm_channel;
 
 /* Manages the request payload in the controlvm channel */
 struct visor_controlvm_payload_info {
-       u8 __iomem *ptr;        /* pointer to base address of payload pool */
+       u8 *ptr;                /* pointer to base address of payload pool */
        u64 offset;             /* offset from beginning of controlvm
                                 * channel to beginning of payload * pool */
        u32 bytes;              /* number of bytes in payload pool */
@@ -400,21 +400,22 @@ parser_init_byte_stream(u64 addr, u32 bytes, bool local, bool *retry)
                p = __va((unsigned long) (addr));
                memcpy(ctx->data, p, bytes);
        } else {
-               void __iomem *mapping;
+               void *mapping;
 
                if (!request_mem_region(addr, bytes, "visorchipset")) {
                        rc = NULL;
                        goto cleanup;
                }
 
-               mapping = ioremap_cache(addr, bytes);
+               mapping = memremap(addr, bytes, MEMREMAP_WB);
                if (!mapping) {
                        release_mem_region(addr, bytes);
                        rc = NULL;
                        goto cleanup;
                }
-               memcpy_fromio(ctx->data, mapping, bytes);
+               memcpy(ctx->data, mapping, bytes);
                release_mem_region(addr, bytes);
+               memunmap(mapping);
        }
 
        ctx->byte_stream = true;
@@ -1327,7 +1328,7 @@ static int
 initialize_controlvm_payload_info(u64 phys_addr, u64 offset, u32 bytes,
                                  struct visor_controlvm_payload_info *info)
 {
-       u8 __iomem *payload = NULL;
+       u8 *payload = NULL;
        int rc = CONTROLVM_RESP_SUCCESS;
 
        if (!info) {
@@ -1339,7 +1340,7 @@ initialize_controlvm_payload_info(u64 phys_addr, u64 offset, u32 bytes,
                rc = -CONTROLVM_RESP_ERROR_PAYLOAD_INVALID;
                goto cleanup;
        }
-       payload = ioremap_cache(phys_addr + offset, bytes);
+       payload = memremap(phys_addr + offset, bytes, MEMREMAP_WB);
        if (!payload) {
                rc = -CONTROLVM_RESP_ERROR_IOREMAP_FAILED;
                goto cleanup;
@@ -1352,7 +1353,7 @@ initialize_controlvm_payload_info(u64 phys_addr, u64 offset, u32 bytes,
 cleanup:
        if (rc < 0) {
                if (payload) {
-                       iounmap(payload);
+                       memunmap(payload);
                        payload = NULL;
                }
        }
@@ -1363,7 +1364,7 @@ static void
 destroy_controlvm_payload_info(struct visor_controlvm_payload_info *info)
 {
        if (info->ptr) {
-               iounmap(info->ptr);
+               memunmap(info->ptr);
                info->ptr = NULL;
        }
        memset(info, 0, sizeof(struct visor_controlvm_payload_info));
index fd092909a4577a7c4a708516bf3344fee331ad84..342a07c58d89400643b26236875d6ffa6a91062e 100644 (file)
@@ -269,14 +269,14 @@ int iscsit_deaccess_np(struct iscsi_np *np, struct iscsi_portal_group *tpg,
 }
 
 bool iscsit_check_np_match(
-       struct __kernel_sockaddr_storage *sockaddr,
+       struct sockaddr_storage *sockaddr,
        struct iscsi_np *np,
        int network_transport)
 {
        struct sockaddr_in *sock_in, *sock_in_e;
        struct sockaddr_in6 *sock_in6, *sock_in6_e;
        bool ip_match = false;
-       u16 port;
+       u16 port, port_e;
 
        if (sockaddr->ss_family == AF_INET6) {
                sock_in6 = (struct sockaddr_in6 *)sockaddr;
@@ -288,6 +288,7 @@ bool iscsit_check_np_match(
                        ip_match = true;
 
                port = ntohs(sock_in6->sin6_port);
+               port_e = ntohs(sock_in6_e->sin6_port);
        } else {
                sock_in = (struct sockaddr_in *)sockaddr;
                sock_in_e = (struct sockaddr_in *)&np->np_sockaddr;
@@ -296,9 +297,10 @@ bool iscsit_check_np_match(
                        ip_match = true;
 
                port = ntohs(sock_in->sin_port);
+               port_e = ntohs(sock_in_e->sin_port);
        }
 
-       if (ip_match && (np->np_port == port) &&
+       if (ip_match && (port_e == port) &&
            (np->np_network_transport == network_transport))
                return true;
 
@@ -309,7 +311,7 @@ bool iscsit_check_np_match(
  * Called with mutex np_lock held
  */
 static struct iscsi_np *iscsit_get_np(
-       struct __kernel_sockaddr_storage *sockaddr,
+       struct sockaddr_storage *sockaddr,
        int network_transport)
 {
        struct iscsi_np *np;
@@ -340,12 +342,9 @@ static struct iscsi_np *iscsit_get_np(
 }
 
 struct iscsi_np *iscsit_add_np(
-       struct __kernel_sockaddr_storage *sockaddr,
-       char *ip_str,
+       struct sockaddr_storage *sockaddr,
        int network_transport)
 {
-       struct sockaddr_in *sock_in;
-       struct sockaddr_in6 *sock_in6;
        struct iscsi_np *np;
        int ret;
 
@@ -368,16 +367,6 @@ struct iscsi_np *iscsit_add_np(
        }
 
        np->np_flags |= NPF_IP_NETWORK;
-       if (sockaddr->ss_family == AF_INET6) {
-               sock_in6 = (struct sockaddr_in6 *)sockaddr;
-               snprintf(np->np_ip, IPV6_ADDRESS_SPACE, "%s", ip_str);
-               np->np_port = ntohs(sock_in6->sin6_port);
-       } else {
-               sock_in = (struct sockaddr_in *)sockaddr;
-               sprintf(np->np_ip, "%s", ip_str);
-               np->np_port = ntohs(sock_in->sin_port);
-       }
-
        np->np_network_transport = network_transport;
        spin_lock_init(&np->np_thread_lock);
        init_completion(&np->np_restart_comp);
@@ -411,8 +400,8 @@ struct iscsi_np *iscsit_add_np(
        list_add_tail(&np->np_list, &g_np_list);
        mutex_unlock(&np_lock);
 
-       pr_debug("CORE[0] - Added Network Portal: %s:%hu on %s\n",
-               np->np_ip, np->np_port, np->np_transport->name);
+       pr_debug("CORE[0] - Added Network Portal: %pISpc on %s\n",
+               &np->np_sockaddr, np->np_transport->name);
 
        return np;
 }
@@ -481,8 +470,8 @@ int iscsit_del_np(struct iscsi_np *np)
        list_del(&np->np_list);
        mutex_unlock(&np_lock);
 
-       pr_debug("CORE[0] - Removed Network Portal: %s:%hu on %s\n",
-               np->np_ip, np->np_port, np->np_transport->name);
+       pr_debug("CORE[0] - Removed Network Portal: %pISpc on %s\n",
+               &np->np_sockaddr, np->np_transport->name);
 
        iscsit_put_transport(np->np_transport);
        kfree(np);
@@ -1209,7 +1198,6 @@ static u32 iscsit_do_crypto_hash_sg(
        u8 *pad_bytes)
 {
        u32 data_crc;
-       u32 i;
        struct scatterlist *sg;
        unsigned int page_off;
 
@@ -1218,15 +1206,15 @@ static u32 iscsit_do_crypto_hash_sg(
        sg = cmd->first_data_sg;
        page_off = cmd->first_data_sg_off;
 
-       i = 0;
        while (data_length) {
-               u32 cur_len = min_t(u32, data_length, (sg[i].length - page_off));
+               u32 cur_len = min_t(u32, data_length, (sg->length - page_off));
 
-               crypto_hash_update(hash, &sg[i], cur_len);
+               crypto_hash_update(hash, sg, cur_len);
 
                data_length -= cur_len;
                page_off = 0;
-               i++;
+               /* iscsit_map_iovec has already checked for invalid sg pointers */
+               sg = sg_next(sg);
        }
 
        if (padding) {
@@ -2556,7 +2544,7 @@ static int iscsit_send_conn_drop_async_message(
        cmd->stat_sn            = conn->stat_sn++;
        hdr->statsn             = cpu_to_be32(cmd->stat_sn);
        hdr->exp_cmdsn          = cpu_to_be32(conn->sess->exp_cmd_sn);
-       hdr->max_cmdsn          = cpu_to_be32(conn->sess->max_cmd_sn);
+       hdr->max_cmdsn          = cpu_to_be32((u32) atomic_read(&conn->sess->max_cmd_sn));
        hdr->async_event        = ISCSI_ASYNC_MSG_DROPPING_CONNECTION;
        hdr->param1             = cpu_to_be16(cmd->logout_cid);
        hdr->param2             = cpu_to_be16(conn->sess->sess_ops->DefaultTime2Wait);
@@ -2628,7 +2616,7 @@ iscsit_build_datain_pdu(struct iscsi_cmd *cmd, struct iscsi_conn *conn,
                hdr->statsn             = cpu_to_be32(0xFFFFFFFF);
 
        hdr->exp_cmdsn          = cpu_to_be32(conn->sess->exp_cmd_sn);
-       hdr->max_cmdsn          = cpu_to_be32(conn->sess->max_cmd_sn);
+       hdr->max_cmdsn          = cpu_to_be32((u32) atomic_read(&conn->sess->max_cmd_sn));
        hdr->datasn             = cpu_to_be32(datain->data_sn);
        hdr->offset             = cpu_to_be32(datain->offset);
 
@@ -2839,7 +2827,7 @@ iscsit_build_logout_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn,
 
        iscsit_increment_maxcmdsn(cmd, conn->sess);
        hdr->exp_cmdsn          = cpu_to_be32(conn->sess->exp_cmd_sn);
-       hdr->max_cmdsn          = cpu_to_be32(conn->sess->max_cmd_sn);
+       hdr->max_cmdsn          = cpu_to_be32((u32) atomic_read(&conn->sess->max_cmd_sn));
 
        pr_debug("Built Logout Response ITT: 0x%08x StatSN:"
                " 0x%08x Response: 0x%02x CID: %hu on CID: %hu\n",
@@ -2902,7 +2890,7 @@ iscsit_build_nopin_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn,
                iscsit_increment_maxcmdsn(cmd, conn->sess);
 
        hdr->exp_cmdsn          = cpu_to_be32(conn->sess->exp_cmd_sn);
-       hdr->max_cmdsn          = cpu_to_be32(conn->sess->max_cmd_sn);
+       hdr->max_cmdsn          = cpu_to_be32((u32) atomic_read(&conn->sess->max_cmd_sn));
 
        pr_debug("Built NOPIN %s Response ITT: 0x%08x, TTT: 0x%08x,"
                " StatSN: 0x%08x, Length %u\n", (nopout_response) ?
@@ -3049,7 +3037,7 @@ static int iscsit_send_r2t(
        hdr->ttt                = cpu_to_be32(r2t->targ_xfer_tag);
        hdr->statsn             = cpu_to_be32(conn->stat_sn);
        hdr->exp_cmdsn          = cpu_to_be32(conn->sess->exp_cmd_sn);
-       hdr->max_cmdsn          = cpu_to_be32(conn->sess->max_cmd_sn);
+       hdr->max_cmdsn          = cpu_to_be32((u32) atomic_read(&conn->sess->max_cmd_sn));
        hdr->r2tsn              = cpu_to_be32(r2t->r2t_sn);
        hdr->data_offset        = cpu_to_be32(r2t->offset);
        hdr->data_length        = cpu_to_be32(r2t->xfer_len);
@@ -3202,7 +3190,7 @@ void iscsit_build_rsp_pdu(struct iscsi_cmd *cmd, struct iscsi_conn *conn,
 
        iscsit_increment_maxcmdsn(cmd, conn->sess);
        hdr->exp_cmdsn          = cpu_to_be32(conn->sess->exp_cmd_sn);
-       hdr->max_cmdsn          = cpu_to_be32(conn->sess->max_cmd_sn);
+       hdr->max_cmdsn          = cpu_to_be32((u32) atomic_read(&conn->sess->max_cmd_sn));
 
        pr_debug("Built SCSI Response, ITT: 0x%08x, StatSN: 0x%08x,"
                " Response: 0x%02x, SAM Status: 0x%02x, CID: %hu\n",
@@ -3321,7 +3309,7 @@ iscsit_build_task_mgt_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn,
 
        iscsit_increment_maxcmdsn(cmd, conn->sess);
        hdr->exp_cmdsn          = cpu_to_be32(conn->sess->exp_cmd_sn);
-       hdr->max_cmdsn          = cpu_to_be32(conn->sess->max_cmd_sn);
+       hdr->max_cmdsn          = cpu_to_be32((u32) atomic_read(&conn->sess->max_cmd_sn));
 
        pr_debug("Built Task Management Response ITT: 0x%08x,"
                " StatSN: 0x%08x, Response: 0x%02x, CID: %hu\n",
@@ -3399,6 +3387,7 @@ iscsit_build_sendtargets_response(struct iscsi_cmd *cmd,
        int target_name_printed;
        unsigned char buf[ISCSI_IQN_LEN+12]; /* iqn + "TargetName=" + \0 */
        unsigned char *text_in = cmd->text_in_ptr, *text_ptr = NULL;
+       bool active;
 
        buffer_len = min(conn->conn_ops->MaxRecvDataSegmentLength,
                         SENDTARGETS_BUF_LIMIT);
@@ -3452,19 +3441,18 @@ iscsit_build_sendtargets_response(struct iscsi_cmd *cmd,
                        }
 
                        spin_lock(&tpg->tpg_state_lock);
-                       if ((tpg->tpg_state == TPG_STATE_FREE) ||
-                           (tpg->tpg_state == TPG_STATE_INACTIVE)) {
-                               spin_unlock(&tpg->tpg_state_lock);
-                               continue;
-                       }
+                       active = (tpg->tpg_state == TPG_STATE_ACTIVE);
                        spin_unlock(&tpg->tpg_state_lock);
 
+                       if (!active && tpg->tpg_attrib.tpg_enabled_sendtargets)
+                               continue;
+
                        spin_lock(&tpg->tpg_np_lock);
                        list_for_each_entry(tpg_np, &tpg->tpg_gnp_list,
                                                tpg_np_list) {
                                struct iscsi_np *np = tpg_np->tpg_np;
                                bool inaddr_any = iscsit_check_inaddr_any(np);
-                               char *fmt_str;
+                               struct sockaddr_storage *sockaddr;
 
                                if (np->np_network_transport != network_transport)
                                        continue;
@@ -3492,15 +3480,15 @@ iscsit_build_sendtargets_response(struct iscsi_cmd *cmd,
                                        }
                                }
 
-                               if (np->np_sockaddr.ss_family == AF_INET6)
-                                       fmt_str = "TargetAddress=[%s]:%hu,%hu";
+                               if (inaddr_any)
+                                       sockaddr = &conn->local_sockaddr;
                                else
-                                       fmt_str = "TargetAddress=%s:%hu,%hu";
+                                       sockaddr = &np->np_sockaddr;
 
-                               len = sprintf(buf, fmt_str,
-                                       inaddr_any ? conn->local_ip : np->np_ip,
-                                       np->np_port,
-                                       tpg->tpgt);
+                               len = sprintf(buf, "TargetAddress="
+                                             "%pISpc,%hu",
+                                             sockaddr,
+                                             tpg->tpgt);
                                len += 1;
 
                                if ((len + payload_len) > buffer_len) {
@@ -3576,7 +3564,7 @@ iscsit_build_text_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn,
         */
        cmd->maxcmdsn_inc = 0;
        hdr->exp_cmdsn = cpu_to_be32(conn->sess->exp_cmd_sn);
-       hdr->max_cmdsn = cpu_to_be32(conn->sess->max_cmd_sn);
+       hdr->max_cmdsn = cpu_to_be32((u32) atomic_read(&conn->sess->max_cmd_sn));
 
        pr_debug("Built Text Response: ITT: 0x%08x, TTT: 0x%08x, StatSN: 0x%08x,"
                " Length: %u, CID: %hu F: %d C: %d\n", cmd->init_task_tag,
@@ -3654,7 +3642,7 @@ iscsit_build_reject(struct iscsi_cmd *cmd, struct iscsi_conn *conn,
        cmd->stat_sn            = conn->stat_sn++;
        hdr->statsn             = cpu_to_be32(cmd->stat_sn);
        hdr->exp_cmdsn          = cpu_to_be32(conn->sess->exp_cmd_sn);
-       hdr->max_cmdsn          = cpu_to_be32(conn->sess->max_cmd_sn);
+       hdr->max_cmdsn          = cpu_to_be32((u32) atomic_read(&conn->sess->max_cmd_sn));
 
 }
 EXPORT_SYMBOL(iscsit_build_reject);
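
Throughout this commit the cached np_ip/np_port strings and the per-family format strings are replaced by printing the stored sockaddr directly with the %pIS printk extension; %pISpc emits "address:port" for IPv4 and "[address]:port" for IPv6. A minimal usage sketch (kernel C, illustrative function name):

#include <linux/printk.h>
#include <linux/socket.h>

/* Log a portal address; the same specifier handles IPv4 and IPv6. */
static void demo_log_portal(struct sockaddr_storage *ss)
{
	pr_debug("Network Portal: %pISpc\n", ss);
}
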
index 7d0f9c00d9c255bb6f32856ba2dd19e8390457c0..4cf2c0f2ba2f981699499cce77726d20aeee9dc9 100644 (file)
@@ -10,10 +10,10 @@ extern int iscsit_access_np(struct iscsi_np *, struct iscsi_portal_group *);
 extern void iscsit_login_kref_put(struct kref *);
 extern int iscsit_deaccess_np(struct iscsi_np *, struct iscsi_portal_group *,
                                struct iscsi_tpg_np *);
-extern bool iscsit_check_np_match(struct __kernel_sockaddr_storage *,
+extern bool iscsit_check_np_match(struct sockaddr_storage *,
                                struct iscsi_np *, int);
-extern struct iscsi_np *iscsit_add_np(struct __kernel_sockaddr_storage *,
-                               char *, int);
+extern struct iscsi_np *iscsit_add_np(struct sockaddr_storage *,
+                               int);
 extern int iscsit_reset_np_thread(struct iscsi_np *, struct iscsi_tpg_np *,
                                struct iscsi_portal_group *, bool);
 extern int iscsit_del_np(struct iscsi_np *);
index c1898c84b3d25e3630c012d97d9cfa6270e7d414..c7461d770d3a8bff4ba0d4c64067654cf53512ca 100644 (file)
@@ -99,7 +99,7 @@ static ssize_t lio_target_np_store_sctp(
                 * Use existing np->np_sockaddr for SCTP network portal reference
                 */
                tpg_np_sctp = iscsit_tpg_add_network_portal(tpg, &np->np_sockaddr,
-                                       np->np_ip, tpg_np, ISCSI_SCTP_TCP);
+                                       tpg_np, ISCSI_SCTP_TCP);
                if (!tpg_np_sctp || IS_ERR(tpg_np_sctp))
                        goto out;
        } else {
@@ -177,7 +177,7 @@ static ssize_t lio_target_np_store_iser(
                }
 
                tpg_np_iser = iscsit_tpg_add_network_portal(tpg, &np->np_sockaddr,
-                               np->np_ip, tpg_np, ISCSI_INFINIBAND);
+                               tpg_np, ISCSI_INFINIBAND);
                if (IS_ERR(tpg_np_iser)) {
                        rc = PTR_ERR(tpg_np_iser);
                        goto out;
@@ -220,7 +220,7 @@ static struct se_tpg_np *lio_target_call_addnptotpg(
        struct iscsi_portal_group *tpg;
        struct iscsi_tpg_np *tpg_np;
        char *str, *str2, *ip_str, *port_str;
-       struct __kernel_sockaddr_storage sockaddr;
+       struct sockaddr_storage sockaddr;
        struct sockaddr_in *sock_in;
        struct sockaddr_in6 *sock_in6;
        unsigned long port;
@@ -235,7 +235,7 @@ static struct se_tpg_np *lio_target_call_addnptotpg(
        memset(buf, 0, MAX_PORTAL_LEN + 1);
        snprintf(buf, MAX_PORTAL_LEN + 1, "%s", name);
 
-       memset(&sockaddr, 0, sizeof(struct __kernel_sockaddr_storage));
+       memset(&sockaddr, 0, sizeof(struct sockaddr_storage));
 
        str = strstr(buf, "[");
        if (str) {
@@ -248,8 +248,8 @@ static struct se_tpg_np *lio_target_call_addnptotpg(
                        return ERR_PTR(-EINVAL);
                }
                str++; /* Skip over leading "[" */
-               *str2 = '\0'; /* Terminate the IPv6 address */
-               str2++; /* Skip over the "]" */
+               *str2 = '\0'; /* Terminate the unbracketed IPv6 address */
+               str2++; /* Skip over the \0 */
                port_str = strstr(str2, ":");
                if (!port_str) {
                        pr_err("Unable to locate \":port\""
@@ -267,7 +267,7 @@ static struct se_tpg_np *lio_target_call_addnptotpg(
                sock_in6 = (struct sockaddr_in6 *)&sockaddr;
                sock_in6->sin6_family = AF_INET6;
                sock_in6->sin6_port = htons((unsigned short)port);
-               ret = in6_pton(str, IPV6_ADDRESS_SPACE,
+               ret = in6_pton(str, -1,
                                (void *)&sock_in6->sin6_addr.in6_u, -1, &end);
                if (ret <= 0) {
                        pr_err("in6_pton returned: %d\n", ret);
@@ -316,7 +316,7 @@ static struct se_tpg_np *lio_target_call_addnptotpg(
         * sys/kernel/config/iscsi/$IQN/$TPG/np/$IP:$PORT/
         *
         */
-       tpg_np = iscsit_tpg_add_network_portal(tpg, &sockaddr, str, NULL,
+       tpg_np = iscsit_tpg_add_network_portal(tpg, &sockaddr, NULL,
                                ISCSI_TCP);
        if (IS_ERR(tpg_np)) {
                iscsit_put_tpg(tpg);
@@ -344,8 +344,8 @@ static void lio_target_call_delnpfromtpg(
 
        se_tpg = &tpg->tpg_se_tpg;
        pr_debug("LIO_Target_ConfigFS: DEREGISTER -> %s TPGT: %hu"
-               " PORTAL: %s:%hu\n", config_item_name(&se_tpg->se_tpg_wwn->wwn_group.cg_item),
-               tpg->tpgt, tpg_np->tpg_np->np_ip, tpg_np->tpg_np->np_port);
+               " PORTAL: %pISpc\n", config_item_name(&se_tpg->se_tpg_wwn->wwn_group.cg_item),
+               tpg->tpgt, &tpg_np->tpg_np->np_sockaddr);
 
        ret = iscsit_tpg_del_network_portal(tpg, tpg_np);
        if (ret < 0)
@@ -656,6 +656,7 @@ static ssize_t lio_target_nacl_show_info(
        struct iscsi_conn *conn;
        struct se_session *se_sess;
        ssize_t rb = 0;
+       u32 max_cmd_sn;
 
        spin_lock_bh(&se_nacl->nacl_sess_lock);
        se_sess = se_nacl->nacl_sess;
@@ -703,11 +704,12 @@ static ssize_t lio_target_nacl_show_info(
                                " Values]-----------------------\n");
                rb += sprintf(page+rb, "  CmdSN/WR  :  CmdSN/WC  :  ExpCmdSN"
                                "  :  MaxCmdSN  :     ITT    :     TTT\n");
+               max_cmd_sn = (u32) atomic_read(&sess->max_cmd_sn);
                rb += sprintf(page+rb, " 0x%08x   0x%08x   0x%08x   0x%08x"
                                "   0x%08x   0x%08x\n",
                        sess->cmdsn_window,
-                       (sess->max_cmd_sn - sess->exp_cmd_sn) + 1,
-                       sess->exp_cmd_sn, sess->max_cmd_sn,
+                       (max_cmd_sn - sess->exp_cmd_sn) + 1,
+                       sess->exp_cmd_sn, max_cmd_sn,
                        sess->init_task_tag, sess->targ_xfer_tag);
                rb += sprintf(page+rb, "----------------------[iSCSI"
                                " Connections]-------------------------\n");
@@ -751,7 +753,7 @@ static ssize_t lio_target_nacl_show_info(
                                break;
                        }
 
-                       rb += sprintf(page+rb, "   Address %s %s", conn->login_ip,
+                       rb += sprintf(page+rb, "   Address %pISc %s", &conn->login_sockaddr,
                                (conn->network_transport == ISCSI_TCP) ?
                                "TCP" : "SCTP");
                        rb += sprintf(page+rb, "  StatSN: 0x%08x\n",
@@ -1010,6 +1012,11 @@ TPG_ATTR(t10_pi, S_IRUGO | S_IWUSR);
  */
 DEF_TPG_ATTRIB(fabric_prot_type);
 TPG_ATTR(fabric_prot_type, S_IRUGO | S_IWUSR);
+/*
+ * Define iscsi_tpg_attrib_s_tpg_enabled_sendtargets
+ */
+DEF_TPG_ATTRIB(tpg_enabled_sendtargets);
+TPG_ATTR(tpg_enabled_sendtargets, S_IRUGO | S_IWUSR);
 
 static struct configfs_attribute *lio_target_tpg_attrib_attrs[] = {
        &iscsi_tpg_attrib_authentication.attr,
@@ -1024,6 +1031,7 @@ static struct configfs_attribute *lio_target_tpg_attrib_attrs[] = {
        &iscsi_tpg_attrib_default_erl.attr,
        &iscsi_tpg_attrib_t10_pi.attr,
        &iscsi_tpg_attrib_fabric_prot_type.attr,
+       &iscsi_tpg_attrib_tpg_enabled_sendtargets.attr,
        NULL,
 };
 
index 5fabcd3d623f27fe9cd1f97b9d4c311157ade876..0382fa24b53bab9365ad315019ead4d24f2f750e 100644 (file)
@@ -47,19 +47,19 @@ void iscsit_determine_maxcmdsn(struct iscsi_session *sess)
         * core_set_queue_depth_for_node().
         */
        sess->cmdsn_window = se_nacl->queue_depth;
-       sess->max_cmd_sn = (sess->max_cmd_sn + se_nacl->queue_depth) - 1;
+       atomic_add(se_nacl->queue_depth - 1, &sess->max_cmd_sn);
 }
 
 void iscsit_increment_maxcmdsn(struct iscsi_cmd *cmd, struct iscsi_session *sess)
 {
+       u32 max_cmd_sn;
+
        if (cmd->immediate_cmd || cmd->maxcmdsn_inc)
                return;
 
        cmd->maxcmdsn_inc = 1;
 
-       mutex_lock(&sess->cmdsn_mutex);
-       sess->max_cmd_sn += 1;
-       pr_debug("Updated MaxCmdSN to 0x%08x\n", sess->max_cmd_sn);
-       mutex_unlock(&sess->cmdsn_mutex);
+       max_cmd_sn = atomic_inc_return(&sess->max_cmd_sn);
+       pr_debug("Updated MaxCmdSN to 0x%08x\n", max_cmd_sn);
 }
 EXPORT_SYMBOL(iscsit_increment_maxcmdsn);
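
The two hunks above are part of a wider change in this commit: sess->max_cmd_sn stops being a plain u32 guarded by cmdsn_mutex and becomes an atomic_t, so the response-building paths can snapshot it with atomic_read() instead of taking the mutex. A minimal sketch of that pattern (kernel C, illustrative struct and function names, not the iSCSI target's):

#include <linux/atomic.h>
#include <linux/types.h>

struct demo_session {
	atomic_t max_sn;		/* was: u32 guarded by a mutex */
};

/* Open the command window by queue_depth - 1, as done at login time. */
static void demo_open_window(struct demo_session *s, int queue_depth)
{
	atomic_add(queue_depth - 1, &s->max_sn);
}

/* Advance by one and return the new value for logging. */
static u32 demo_advance(struct demo_session *s)
{
	return (u32)atomic_inc_return(&s->max_sn);
}

/* Readers take a single snapshot instead of holding the mutex. */
static u32 demo_snapshot(struct demo_session *s)
{
	return (u32)atomic_read(&s->max_sn);
}
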
index 7e8f65e5448fdbda5645e3d5a836ad9f81408efa..96e78c823d13fa2f78feb6ff024fb468518be75b 100644 (file)
@@ -331,7 +331,7 @@ static int iscsi_login_zero_tsih_s1(
         * The FFP CmdSN window values will be allocated from the TPG's
         * Initiator Node's ACL once the login has been successfully completed.
         */
-       sess->max_cmd_sn        = be32_to_cpu(pdu->cmdsn);
+       atomic_set(&sess->max_cmd_sn, be32_to_cpu(pdu->cmdsn));
 
        sess->sess_ops = kzalloc(sizeof(struct iscsi_sess_ops), GFP_KERNEL);
        if (!sess->sess_ops) {
@@ -729,9 +729,9 @@ void iscsi_post_login_handler(
                        stop_timer = 1;
                }
 
-               pr_debug("iSCSI Login successful on CID: %hu from %s to"
-                       " %s:%hu,%hu\n", conn->cid, conn->login_ip,
-                       conn->local_ip, conn->local_port, tpg->tpgt);
+               pr_debug("iSCSI Login successful on CID: %hu from %pISpc to"
+                       " %pISpc,%hu\n", conn->cid, &conn->login_sockaddr,
+                       &conn->local_sockaddr, tpg->tpgt);
 
                list_add_tail(&conn->conn_list, &sess->sess_conn_list);
                atomic_inc(&sess->nconn);
@@ -776,8 +776,8 @@ void iscsi_post_login_handler(
        pr_debug("Moving to TARG_SESS_STATE_LOGGED_IN.\n");
        sess->session_state = TARG_SESS_STATE_LOGGED_IN;
 
-       pr_debug("iSCSI Login successful on CID: %hu from %s to %s:%hu,%hu\n",
-               conn->cid, conn->login_ip, conn->local_ip, conn->local_port,
+       pr_debug("iSCSI Login successful on CID: %hu from %pISpc to %pISpc,%hu\n",
+               conn->cid, &conn->login_sockaddr, &conn->local_sockaddr,
                tpg->tpgt);
 
        spin_lock_bh(&sess->conn_lock);
@@ -823,8 +823,8 @@ static void iscsi_handle_login_thread_timeout(unsigned long data)
        struct iscsi_np *np = (struct iscsi_np *) data;
 
        spin_lock_bh(&np->np_thread_lock);
-       pr_err("iSCSI Login timeout on Network Portal %s:%hu\n",
-                       np->np_ip, np->np_port);
+       pr_err("iSCSI Login timeout on Network Portal %pISpc\n",
+                       &np->np_sockaddr);
 
        if (np->np_login_timer_flags & ISCSI_TF_STOP) {
                spin_unlock_bh(&np->np_thread_lock);
@@ -877,7 +877,7 @@ static void iscsi_stop_login_thread_timer(struct iscsi_np *np)
 
 int iscsit_setup_np(
        struct iscsi_np *np,
-       struct __kernel_sockaddr_storage *sockaddr)
+       struct sockaddr_storage *sockaddr)
 {
        struct socket *sock = NULL;
        int backlog = ISCSIT_TCP_BACKLOG, ret, opt = 0, len;
@@ -916,7 +916,7 @@ int iscsit_setup_np(
         * in iscsi_target_configfs.c code..
         */
        memcpy(&np->np_sockaddr, sockaddr,
-                       sizeof(struct __kernel_sockaddr_storage));
+                       sizeof(struct sockaddr_storage));
 
        if (sockaddr->ss_family == AF_INET6)
                len = sizeof(struct sockaddr_in6);
@@ -975,7 +975,7 @@ fail:
 
 int iscsi_target_setup_login_socket(
        struct iscsi_np *np,
-       struct __kernel_sockaddr_storage *sockaddr)
+       struct sockaddr_storage *sockaddr)
 {
        struct iscsit_transport *t;
        int rc;
@@ -1015,44 +1015,42 @@ int iscsit_accept_np(struct iscsi_np *np, struct iscsi_conn *conn)
                rc = conn->sock->ops->getname(conn->sock,
                                (struct sockaddr *)&sock_in6, &err, 1);
                if (!rc) {
-                       if (!ipv6_addr_v4mapped(&sock_in6.sin6_addr))
-                               snprintf(conn->login_ip, sizeof(conn->login_ip), "[%pI6c]",
-                                       &sock_in6.sin6_addr.in6_u);
-                       else
-                               snprintf(conn->login_ip, sizeof(conn->login_ip), "%pI4",
-                                       &sock_in6.sin6_addr.s6_addr32[3]);
-                       conn->login_port = ntohs(sock_in6.sin6_port);
+                       if (!ipv6_addr_v4mapped(&sock_in6.sin6_addr)) {
+                               memcpy(&conn->login_sockaddr, &sock_in6, sizeof(sock_in6));
+                       } else {
+                               /* Pretend to be an ipv4 socket */
+                               sock_in.sin_family = AF_INET;
+                               sock_in.sin_port = sock_in6.sin6_port;
+                               memcpy(&sock_in.sin_addr, &sock_in6.sin6_addr.s6_addr32[3], 4);
+                               memcpy(&conn->login_sockaddr, &sock_in, sizeof(sock_in));
+                       }
                }
 
                rc = conn->sock->ops->getname(conn->sock,
                                (struct sockaddr *)&sock_in6, &err, 0);
                if (!rc) {
-                       if (!ipv6_addr_v4mapped(&sock_in6.sin6_addr))
-                               snprintf(conn->local_ip, sizeof(conn->local_ip), "[%pI6c]",
-                                       &sock_in6.sin6_addr.in6_u);
-                       else
-                               snprintf(conn->local_ip, sizeof(conn->local_ip), "%pI4",
-                                       &sock_in6.sin6_addr.s6_addr32[3]);
-                       conn->local_port = ntohs(sock_in6.sin6_port);
+                       if (!ipv6_addr_v4mapped(&sock_in6.sin6_addr)) {
+                               memcpy(&conn->local_sockaddr, &sock_in6, sizeof(sock_in6));
+                       } else {
+                               /* Pretend to be an ipv4 socket */
+                               sock_in.sin_family = AF_INET;
+                               sock_in.sin_port = sock_in6.sin6_port;
+                               memcpy(&sock_in.sin_addr, &sock_in6.sin6_addr.s6_addr32[3], 4);
+                               memcpy(&conn->local_sockaddr, &sock_in, sizeof(sock_in));
+                       }
                }
        } else {
                memset(&sock_in, 0, sizeof(struct sockaddr_in));
 
                rc = conn->sock->ops->getname(conn->sock,
                                (struct sockaddr *)&sock_in, &err, 1);
-               if (!rc) {
-                       sprintf(conn->login_ip, "%pI4",
-                                       &sock_in.sin_addr.s_addr);
-                       conn->login_port = ntohs(sock_in.sin_port);
-               }
+               if (!rc)
+                       memcpy(&conn->login_sockaddr, &sock_in, sizeof(sock_in));
 
                rc = conn->sock->ops->getname(conn->sock,
                                (struct sockaddr *)&sock_in, &err, 0);
-               if (!rc) {
-                       sprintf(conn->local_ip, "%pI4",
-                                       &sock_in.sin_addr.s_addr);
-                       conn->local_port = ntohs(sock_in.sin_port);
-               }
+               if (!rc)
+                       memcpy(&conn->local_sockaddr, &sock_in, sizeof(sock_in));
        }
 
        return 0;
@@ -1302,8 +1300,8 @@ static int __iscsi_target_login_thread(struct iscsi_np *np)
        spin_lock_bh(&np->np_thread_lock);
        if (np->np_thread_state != ISCSI_NP_THREAD_ACTIVE) {
                spin_unlock_bh(&np->np_thread_lock);
-               pr_err("iSCSI Network Portal on %s:%hu currently not"
-                       " active.\n", np->np_ip, np->np_port);
+               pr_err("iSCSI Network Portal on %pISpc currently not"
+                       " active.\n", &np->np_sockaddr);
                iscsit_tx_login_rsp(conn, ISCSI_STATUS_CLS_TARGET_ERR,
                                ISCSI_LOGIN_STATUS_SVC_UNAVAILABLE);
                goto new_sess_out;
@@ -1312,9 +1310,9 @@ static int __iscsi_target_login_thread(struct iscsi_np *np)
 
        conn->network_transport = np->np_network_transport;
 
-       pr_debug("Received iSCSI login request from %s on %s Network"
-               " Portal %s:%hu\n", conn->login_ip, np->np_transport->name,
-               conn->local_ip, conn->local_port);
+       pr_debug("Received iSCSI login request from %pISpc on %s Network"
+               " Portal %pISpc\n", &conn->login_sockaddr, np->np_transport->name,
+               &conn->local_sockaddr);
 
        pr_debug("Moving to TARG_CONN_STATE_IN_LOGIN.\n");
        conn->conn_state        = TARG_CONN_STATE_IN_LOGIN;
index 57aa0d0fd820f330c271836ecdc02c5a067179b2..b597aa2c61a1c60d2794610796ac156c220e43fe 100644 (file)
@@ -5,9 +5,9 @@ extern int iscsi_login_setup_crypto(struct iscsi_conn *);
 extern int iscsi_check_for_session_reinstatement(struct iscsi_conn *);
 extern int iscsi_login_post_auth_non_zero_tsih(struct iscsi_conn *, u16, u32);
 extern int iscsit_setup_np(struct iscsi_np *,
-                               struct __kernel_sockaddr_storage *);
+                               struct sockaddr_storage *);
 extern int iscsi_target_setup_login_socket(struct iscsi_np *,
-                               struct __kernel_sockaddr_storage *);
+                               struct sockaddr_storage *);
 extern int iscsit_accept_np(struct iscsi_np *, struct iscsi_conn *);
 extern int iscsit_get_login_rx(struct iscsi_conn *, struct iscsi_login *);
 extern int iscsit_put_login_tx(struct iscsi_conn *, struct iscsi_login *, u32);
index f9cde91418367071d08c3a3ebe08dc44a1a1abe3..5c964c09c89ff25e6076b272d4afff1a0466a372 100644 (file)
@@ -341,7 +341,6 @@ static int iscsi_target_check_first_request(
 static int iscsi_target_do_tx_login_io(struct iscsi_conn *conn, struct iscsi_login *login)
 {
        u32 padding = 0;
-       struct iscsi_session *sess = conn->sess;
        struct iscsi_login_rsp *login_rsp;
 
        login_rsp = (struct iscsi_login_rsp *) login->rsp;
@@ -353,7 +352,7 @@ static int iscsi_target_do_tx_login_io(struct iscsi_conn *conn, struct iscsi_log
        login_rsp->itt                  = login->init_task_tag;
        login_rsp->statsn               = cpu_to_be32(conn->stat_sn++);
        login_rsp->exp_cmdsn            = cpu_to_be32(conn->sess->exp_cmd_sn);
-       login_rsp->max_cmdsn            = cpu_to_be32(conn->sess->max_cmd_sn);
+       login_rsp->max_cmdsn            = cpu_to_be32((u32) atomic_read(&conn->sess->max_cmd_sn));
 
        pr_debug("Sending Login Response, Flags: 0x%02x, ITT: 0x%08x,"
                " ExpCmdSN; 0x%08x, MaxCmdSN: 0x%08x, StatSN: 0x%08x, Length:"
@@ -382,10 +381,6 @@ static int iscsi_target_do_tx_login_io(struct iscsi_conn *conn, struct iscsi_log
                goto err;
 
        login->rsp_length               = 0;
-       mutex_lock(&sess->cmdsn_mutex);
-       login_rsp->exp_cmdsn            = cpu_to_be32(sess->exp_cmd_sn);
-       login_rsp->max_cmdsn            = cpu_to_be32(sess->max_cmd_sn);
-       mutex_unlock(&sess->cmdsn_mutex);
 
        return 0;
 
index 5e1349a3b1438ece26d986f31608bd6912393371..9dd94ff0b62c0e53e39dbd3685ce1e3dc272cb1b 100644 (file)
@@ -430,7 +430,7 @@ static ssize_t iscsi_stat_tgt_attr_show_attr_fail_intr_addr(
        int ret;
 
        spin_lock(&lstat->lock);
-       ret = snprintf(page, PAGE_SIZE, "%s\n", lstat->last_intr_fail_ip_addr);
+       ret = snprintf(page, PAGE_SIZE, "%pISc\n", &lstat->last_intr_fail_sockaddr);
        spin_unlock(&lstat->lock);
 
        return ret;
index cf59c397007bd0d9a48665f34a07c5bf3226e650..11320df939f7f19d5406fc857bc5e69b160f9593 100644 (file)
@@ -50,7 +50,7 @@ u8 iscsit_tmr_abort_task(
                pr_err("Unable to locate RefTaskTag: 0x%08x on CID:"
                        " %hu.\n", hdr->rtt, conn->cid);
                return (iscsi_sna_gte(be32_to_cpu(hdr->refcmdsn), conn->sess->exp_cmd_sn) &&
-                       iscsi_sna_lte(be32_to_cpu(hdr->refcmdsn), conn->sess->max_cmd_sn)) ?
+                       iscsi_sna_lte(be32_to_cpu(hdr->refcmdsn), (u32) atomic_read(&conn->sess->max_cmd_sn))) ?
                        ISCSI_TMF_RSP_COMPLETE : ISCSI_TMF_RSP_NO_TASK;
        }
        if (ref_cmd->cmd_sn != be32_to_cpu(hdr->refcmdsn)) {
index 968068ffcb1c87a7ce7d218f8faf0a900dbc517b..23c95cd14167a2705b773d0c7155567af331eb33 100644 (file)
@@ -226,6 +226,7 @@ static void iscsit_set_default_tpg_attribs(struct iscsi_portal_group *tpg)
        a->default_erl = TA_DEFAULT_ERL;
        a->t10_pi = TA_DEFAULT_T10_PI;
        a->fabric_prot_type = TA_DEFAULT_FABRIC_PROT_TYPE;
+       a->tpg_enabled_sendtargets = TA_DEFAULT_TPG_ENABLED_SENDTARGETS;
 }
 
 int iscsit_tpg_add_portal_group(struct iscsi_tiqn *tiqn, struct iscsi_portal_group *tpg)
@@ -430,7 +431,7 @@ struct iscsi_tpg_np *iscsit_tpg_locate_child_np(
 
 static bool iscsit_tpg_check_network_portal(
        struct iscsi_tiqn *tiqn,
-       struct __kernel_sockaddr_storage *sockaddr,
+       struct sockaddr_storage *sockaddr,
        int network_transport)
 {
        struct iscsi_portal_group *tpg;
@@ -459,8 +460,7 @@ static bool iscsit_tpg_check_network_portal(
 
 struct iscsi_tpg_np *iscsit_tpg_add_network_portal(
        struct iscsi_portal_group *tpg,
-       struct __kernel_sockaddr_storage *sockaddr,
-       char *ip_str,
+       struct sockaddr_storage *sockaddr,
        struct iscsi_tpg_np *tpg_np_parent,
        int network_transport)
 {
@@ -470,8 +470,8 @@ struct iscsi_tpg_np *iscsit_tpg_add_network_portal(
        if (!tpg_np_parent) {
                if (iscsit_tpg_check_network_portal(tpg->tpg_tiqn, sockaddr,
                                network_transport)) {
-                       pr_err("Network Portal: %s already exists on a"
-                               " different TPG on %s\n", ip_str,
+                       pr_err("Network Portal: %pISc already exists on a"
+                               " different TPG on %s\n", sockaddr,
                                tpg->tpg_tiqn->tiqn);
                        return ERR_PTR(-EEXIST);
                }
@@ -484,7 +484,7 @@ struct iscsi_tpg_np *iscsit_tpg_add_network_portal(
                return ERR_PTR(-ENOMEM);
        }
 
-       np = iscsit_add_np(sockaddr, ip_str, network_transport);
+       np = iscsit_add_np(sockaddr, network_transport);
        if (IS_ERR(np)) {
                kfree(tpg_np);
                return ERR_CAST(np);
@@ -514,8 +514,8 @@ struct iscsi_tpg_np *iscsit_tpg_add_network_portal(
                spin_unlock(&tpg_np_parent->tpg_np_parent_lock);
        }
 
-       pr_debug("CORE[%s] - Added Network Portal: %s:%hu,%hu on %s\n",
-               tpg->tpg_tiqn->tiqn, np->np_ip, np->np_port, tpg->tpgt,
+       pr_debug("CORE[%s] - Added Network Portal: %pISpc,%hu on %s\n",
+               tpg->tpg_tiqn->tiqn, &np->np_sockaddr, tpg->tpgt,
                np->np_transport->name);
 
        return tpg_np;
@@ -528,8 +528,8 @@ static int iscsit_tpg_release_np(
 {
        iscsit_clear_tpg_np_login_thread(tpg_np, tpg, true);
 
-       pr_debug("CORE[%s] - Removed Network Portal: %s:%hu,%hu on %s\n",
-               tpg->tpg_tiqn->tiqn, np->np_ip, np->np_port, tpg->tpgt,
+       pr_debug("CORE[%s] - Removed Network Portal: %pISpc,%hu on %s\n",
+               tpg->tpg_tiqn->tiqn, &np->np_sockaddr, tpg->tpgt,
                np->np_transport->name);
 
        tpg_np->tpg_np = NULL;
@@ -892,3 +892,21 @@ int iscsit_ta_fabric_prot_type(
 
        return 0;
 }
+
+int iscsit_ta_tpg_enabled_sendtargets(
+       struct iscsi_portal_group *tpg,
+       u32 flag)
+{
+       struct iscsi_tpg_attrib *a = &tpg->tpg_attrib;
+
+       if ((flag != 0) && (flag != 1)) {
+               pr_err("Illegal value %d\n", flag);
+               return -EINVAL;
+       }
+
+       a->tpg_enabled_sendtargets = flag;
+       pr_debug("iSCSI_TPG[%hu] - TPG enabled bit required for SendTargets:"
+               " %s\n", tpg->tpgt, (a->tpg_enabled_sendtargets) ? "ON" : "OFF");
+
+       return 0;
+}
index 95ff5bdecd719d601826feed61abb97b3eac13d4..9db32bd24cd46d65c5b0050374d0fd6118ac9d32 100644 (file)
@@ -22,7 +22,7 @@ extern struct iscsi_node_attrib *iscsit_tpg_get_node_attrib(struct iscsi_session
 extern void iscsit_tpg_del_external_nps(struct iscsi_tpg_np *);
 extern struct iscsi_tpg_np *iscsit_tpg_locate_child_np(struct iscsi_tpg_np *, int);
 extern struct iscsi_tpg_np *iscsit_tpg_add_network_portal(struct iscsi_portal_group *,
-                       struct __kernel_sockaddr_storage *, char *, struct iscsi_tpg_np *,
+                       struct sockaddr_storage *, struct iscsi_tpg_np *,
                        int);
 extern int iscsit_tpg_del_network_portal(struct iscsi_portal_group *,
                        struct iscsi_tpg_np *);
@@ -40,5 +40,6 @@ extern int iscsit_ta_demo_mode_discovery(struct iscsi_portal_group *, u32);
 extern int iscsit_ta_default_erl(struct iscsi_portal_group *, u32);
 extern int iscsit_ta_t10_pi(struct iscsi_portal_group *, u32);
 extern int iscsit_ta_fabric_prot_type(struct iscsi_portal_group *, u32);
+extern int iscsit_ta_tpg_enabled_sendtargets(struct iscsi_portal_group *, u32);
 
 #endif /* ISCSI_TARGET_TPG_H */
index a2bff0702eb25bc4d10bac935d4888b2df131c41..428b0d9e3dbab246e579a027344dbc60f6a64098 100644 (file)
@@ -233,6 +233,7 @@ struct iscsi_r2t *iscsit_get_holder_for_r2tsn(
 
 static inline int iscsit_check_received_cmdsn(struct iscsi_session *sess, u32 cmdsn)
 {
+       u32 max_cmdsn;
        int ret;
 
        /*
@@ -241,10 +242,10 @@ static inline int iscsit_check_received_cmdsn(struct iscsi_session *sess, u32 cm
         * or order CmdSNs due to multiple connection sessions and/or
         * CRC failures.
         */
-       if (iscsi_sna_gt(cmdsn, sess->max_cmd_sn)) {
+       max_cmdsn = atomic_read(&sess->max_cmd_sn);
+       if (iscsi_sna_gt(cmdsn, max_cmdsn)) {
                pr_err("Received CmdSN: 0x%08x is greater than"
-                      " MaxCmdSN: 0x%08x, ignoring.\n", cmdsn,
-                      sess->max_cmd_sn);
+                      " MaxCmdSN: 0x%08x, ignoring.\n", cmdsn, max_cmdsn);
                ret = CMDSN_MAXCMDSN_OVERRUN;
 
        } else if (cmdsn == sess->exp_cmd_sn) {
@@ -1371,6 +1372,33 @@ int tx_data(
        return iscsit_do_tx_data(conn, &c);
 }
 
+static bool sockaddr_equal(struct sockaddr_storage *x, struct sockaddr_storage *y)
+{
+       switch (x->ss_family) {
+       case AF_INET: {
+               struct sockaddr_in *sinx = (struct sockaddr_in *)x;
+               struct sockaddr_in *siny = (struct sockaddr_in *)y;
+               if (sinx->sin_addr.s_addr != siny->sin_addr.s_addr)
+                       return false;
+               if (sinx->sin_port != siny->sin_port)
+                       return false;
+               break;
+       }
+       case AF_INET6: {
+               struct sockaddr_in6 *sinx = (struct sockaddr_in6 *)x;
+               struct sockaddr_in6 *siny = (struct sockaddr_in6 *)y;
+               if (!ipv6_addr_equal(&sinx->sin6_addr, &siny->sin6_addr))
+                       return false;
+               if (sinx->sin6_port != siny->sin6_port)
+                       return false;
+               break;
+       }
+       default:
+               return false;
+       }
+       return true;
+}
+
 void iscsit_collect_login_stats(
        struct iscsi_conn *conn,
        u8 status_class,
@@ -1387,7 +1415,7 @@ void iscsit_collect_login_stats(
        ls = &tiqn->login_stats;
 
        spin_lock(&ls->lock);
-       if (!strcmp(conn->login_ip, ls->last_intr_fail_ip_addr) &&
+       if (sockaddr_equal(&conn->login_sockaddr, &ls->last_intr_fail_sockaddr) &&
            ((get_jiffies_64() - ls->last_fail_time) < 10)) {
                /* We already have the failure info for this login */
                spin_unlock(&ls->lock);
@@ -1427,8 +1455,7 @@ void iscsit_collect_login_stats(
 
                ls->last_intr_fail_ip_family = conn->login_family;
 
-               snprintf(ls->last_intr_fail_ip_addr, IPV6_ADDRESS_SPACE,
-                               "%s", conn->login_ip);
+               ls->last_intr_fail_sockaddr = conn->login_sockaddr;
                ls->last_fail_time = get_jiffies_64();
        }
 
index a556bdebd775dbc4ecbef99b1a9153493916a9f3..5bc85ffed7204f21871a31d7346b93885b437cf8 100644 (file)
@@ -526,7 +526,7 @@ static inline struct tcm_loop_tpg *tl_tpg(struct se_portal_group *se_tpg)
 static char *tcm_loop_get_endpoint_wwn(struct se_portal_group *se_tpg)
 {
        /*
-        * Return the passed NAA identifier for the SAS Target Port
+        * Return the passed NAA identifier for the Target Port
         */
        return &tl_tpg(se_tpg)->tl_hba->tl_wwn_address[0];
 }
@@ -845,7 +845,7 @@ static int tcm_loop_make_nexus(
                transport_free_session(tl_nexus->se_sess);
                goto out;
        }
-       /* Now, register the SAS I_T Nexus as active. */
+       /* Now, register the I_T Nexus as active. */
        transport_register_session(se_tpg, tl_nexus->se_sess->se_node_acl,
                        tl_nexus->se_sess, tl_nexus);
        tl_tpg->tl_nexus = tl_nexus;
@@ -884,7 +884,7 @@ static int tcm_loop_drop_nexus(
                " %s Initiator Port: %s\n", tcm_loop_dump_proto_id(tpg->tl_hba),
                tl_nexus->se_sess->se_node_acl->initiatorname);
        /*
-        * Release the SCSI I_T Nexus to the emulated SAS Target Port
+        * Release the SCSI I_T Nexus to the emulated Target Port
         */
        transport_deregister_session(tl_nexus->se_sess);
        tpg->tl_nexus = NULL;
@@ -1034,6 +1034,11 @@ static ssize_t tcm_loop_tpg_store_transport_status(
        }
        if (!strncmp(page, "offline", 7)) {
                tl_tpg->tl_transport_status = TCM_TRANSPORT_OFFLINE;
+               if (tl_tpg->tl_nexus) {
+                       struct se_session *tl_sess = tl_tpg->tl_nexus->se_sess;
+
+                       core_allocate_nexus_loss_ua(tl_sess->se_node_acl);
+               }
                return count;
        }
        return -EINVAL;
@@ -1077,7 +1082,7 @@ static struct se_portal_group *tcm_loop_make_naa_tpg(
        tl_tpg->tl_hba = tl_hba;
        tl_tpg->tl_tpgt = tpgt;
        /*
-        * Register the tl_tpg as a emulated SAS TCM Target Endpoint
+        * Register the tl_tpg as an emulated TCM Target Endpoint
         */
        ret = core_tpg_register(wwn, &tl_tpg->tl_se_tpg, tl_hba->tl_proto_id);
        if (ret < 0)
@@ -1102,11 +1107,11 @@ static void tcm_loop_drop_naa_tpg(
        tl_hba = tl_tpg->tl_hba;
        tpgt = tl_tpg->tl_tpgt;
        /*
-        * Release the I_T Nexus for the Virtual SAS link if present
+        * Release the I_T Nexus for the Virtual target link if present
         */
        tcm_loop_drop_nexus(tl_tpg);
        /*
-        * Deregister the tl_tpg as a emulated SAS TCM Target Endpoint
+        * Deregister the tl_tpg as an emulated TCM Target Endpoint
         */
        core_tpg_deregister(se_tpg);
 
@@ -1199,8 +1204,9 @@ static void tcm_loop_drop_scsi_hba(
                                struct tcm_loop_hba, tl_hba_wwn);
 
        pr_debug("TCM_Loop_ConfigFS: Deallocating emulated Target"
-               " SAS Address: %s at Linux/SCSI Host ID: %d\n",
-               tl_hba->tl_wwn_address, tl_hba->sh->host_no);
+               " %s Address: %s at Linux/SCSI Host ID: %d\n",
+               tcm_loop_dump_proto_id(tl_hba), tl_hba->tl_wwn_address,
+               tl_hba->sh->host_no);
        /*
         * Call device_unregister() on the original tl_hba->dev.
         * tcm_loop_fabric_scsi.c:tcm_loop_release_adapter() will
index 09e682b1c54953477a59057a6b3f3098c5dfbbf5..dcc424ac35d45bfd2cc38bb348c851fc4336796d 100644 (file)
@@ -620,8 +620,6 @@ struct se_lun_acl *core_dev_init_initiator_node_lun_acl(
 
        lacl->mapped_lun = mapped_lun;
        lacl->se_lun_nacl = nacl;
-       snprintf(lacl->initiatorname, TRANSPORT_IQN_LEN, "%s",
-                nacl->initiatorname);
 
        return lacl;
 }
@@ -656,7 +654,7 @@ int core_dev_add_initiator_node_lun_acl(
                " InitiatorNode: %s\n", tpg->se_tpg_tfo->get_fabric_name(),
                tpg->se_tpg_tfo->tpg_get_tag(tpg), lun->unpacked_lun, lacl->mapped_lun,
                (lun_access & TRANSPORT_LUNFLAGS_READ_WRITE) ? "RW" : "RO",
-               lacl->initiatorname);
+               nacl->initiatorname);
        /*
         * Check to see if there are any existing persistent reservation APTPL
         * pre-registrations that need to be enabled for this LUN ACL..
@@ -688,7 +686,7 @@ int core_dev_del_initiator_node_lun_acl(
                " InitiatorNode: %s Mapped LUN: %llu\n",
                tpg->se_tpg_tfo->get_fabric_name(),
                tpg->se_tpg_tfo->tpg_get_tag(tpg), lun->unpacked_lun,
-               lacl->initiatorname, lacl->mapped_lun);
+               nacl->initiatorname, lacl->mapped_lun);
 
        return 0;
 }
@@ -701,7 +699,7 @@ void core_dev_free_initiator_node_lun_acl(
                " Mapped LUN: %llu\n", tpg->se_tpg_tfo->get_fabric_name(),
                tpg->se_tpg_tfo->tpg_get_tag(tpg),
                tpg->se_tpg_tfo->get_fabric_name(),
-               lacl->initiatorname, lacl->mapped_lun);
+               lacl->se_lun_nacl->initiatorname, lacl->mapped_lun);
 
        kfree(lacl);
 }
@@ -754,7 +752,7 @@ struct se_device *target_alloc_device(struct se_hba *hba, const char *name)
        dev->dev_link_magic = SE_DEV_LINK_MAGIC;
        dev->se_hba = hba;
        dev->transport = hba->backend->ops;
-       dev->prot_length = sizeof(struct se_dif_v1_tuple);
+       dev->prot_length = sizeof(struct t10_pi_tuple);
        dev->hba_index = hba->hba_index;
 
        INIT_LIST_HEAD(&dev->dev_list);
@@ -771,7 +769,6 @@ struct se_device *target_alloc_device(struct se_hba *hba, const char *name)
        spin_lock_init(&dev->se_tmr_lock);
        spin_lock_init(&dev->qf_cmd_lock);
        sema_init(&dev->caw_sem, 1);
-       atomic_set(&dev->dev_ordered_id, 0);
        INIT_LIST_HEAD(&dev->t10_wwn.t10_vpd_list);
        spin_lock_init(&dev->t10_wwn.t10_vpd_lock);
        INIT_LIST_HEAD(&dev->t10_pr.registration_list);
index 48a36989c1a659408b5a1a58bbb97a0989576055..be42429468e2505833f0578eea2cf5a81a4f9ee0 100644 (file)
@@ -203,7 +203,7 @@ static ssize_t target_fabric_mappedlun_store_write_protect(
        pr_debug("%s_ConfigFS: Changed Initiator ACL: %s"
                " Mapped LUN: %llu Write Protect bit to %s\n",
                se_tpg->se_tpg_tfo->get_fabric_name(),
-               lacl->initiatorname, lacl->mapped_lun, (op) ? "ON" : "OFF");
+               se_nacl->initiatorname, lacl->mapped_lun, (op) ? "ON" : "OFF");
 
        return count;
 
index be9cefc07407e80ef5dd7dfcbd8a0d025faf97f6..9522960c7fddacf70c682326a38586f8562c4487 100644 (file)
@@ -184,3 +184,8 @@ core_delete_hba(struct se_hba *hba)
        kfree(hba);
        return 0;
 }
+
+bool target_sense_desc_format(struct se_device *dev)
+{
+       return dev->transport->get_blocks(dev) > U32_MAX;
+}
index e318ddbe15da05338a2af842d4975d2958a0abab..0b4b2a67d9f9ed597479f1f2c0ea356126f55006 100644 (file)
@@ -154,6 +154,38 @@ sbc_emulate_readcapacity_16(struct se_cmd *cmd)
        return 0;
 }
 
+static sense_reason_t
+sbc_emulate_startstop(struct se_cmd *cmd)
+{
+       unsigned char *cdb = cmd->t_task_cdb;
+
+       /*
+        * See sbc3r36 section 5.25
+        * Immediate bit should be set since there is nothing to complete
+        * POWER CONDITION MODIFIER 0h
+        */
+       if (!(cdb[1] & 1) || cdb[2] || cdb[3])
+               return TCM_INVALID_CDB_FIELD;
+
+       /*
+        * See sbc3r36 section 5.25
+        * POWER CONDITION 0h START_VALID - process START and LOEJ
+        */
+       if (cdb[4] >> 4 & 0xf)
+               return TCM_INVALID_CDB_FIELD;
+
+       /*
+        * See sbc3r36 section 5.25
+        * LOEJ 0h - nothing to load or unload
+        * START 1h - we are ready
+        */
+       if (!(cdb[4] & 1) || (cdb[4] & 2) || (cdb[4] & 4))
+               return TCM_INVALID_CDB_FIELD;
+
+       target_complete_cmd(cmd, SAM_STAT_GOOD);
+       return 0;
+}
+
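For reference, the START STOP UNIT fields that sbc_emulate_startstop() above accepts are IMMED=1, POWER CONDITION=0h, LOEJ=0 and START=1. A hedged stand-alone sketch of the same checks (helper name and sample CDB are invented; byte offsets follow the 6-byte CDB used in the hunk):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool startstop_acceptable(const uint8_t *cdb)
{
        if (!(cdb[1] & 0x01))           /* IMMED must be set: nothing to wait for */
                return false;
        if (cdb[2] || cdb[3])           /* reserved byte / POWER CONDITION MODIFIER */
                return false;
        if (cdb[4] >> 4)                /* POWER CONDITION must be START_VALID (0h) */
                return false;
        return (cdb[4] & 0x07) == 0x01; /* START=1, LOEJ=0, bit 2 clear */
}

int main(void)
{
        uint8_t cdb[6] = { 0x1b, 0x01, 0x00, 0x00, 0x01, 0x00 };   /* START STOP UNIT */
        printf("%s\n", startstop_acceptable(cdb) ? "GOOD" : "INVALID CDB FIELD");
        return 0;
}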
 sector_t sbc_get_write_same_sectors(struct se_cmd *cmd)
 {
        u32 num_blocks;
@@ -960,6 +992,9 @@ sbc_parse_cdb(struct se_cmd *cmd, struct sbc_ops *ops)
                               " than 1\n", sectors);
                        return TCM_INVALID_CDB_FIELD;
                }
+               if (sbc_check_dpofua(dev, cmd, cdb))
+                       return TCM_INVALID_CDB_FIELD;
+
                /*
                 * Double size because we have two buffers, note that
                 * zero is not an error..
@@ -1069,6 +1104,10 @@ sbc_parse_cdb(struct se_cmd *cmd, struct sbc_ops *ops)
                size = 0;
                cmd->execute_cmd = sbc_emulate_noop;
                break;
+       case START_STOP:
+               size = 0;
+               cmd->execute_cmd = sbc_emulate_startstop;
+               break;
        default:
                ret = spc_parse_cdb(cmd, &size);
                if (ret)
@@ -1191,7 +1230,7 @@ void
 sbc_dif_generate(struct se_cmd *cmd)
 {
        struct se_device *dev = cmd->se_dev;
-       struct se_dif_v1_tuple *sdt;
+       struct t10_pi_tuple *sdt;
        struct scatterlist *dsg = cmd->t_data_sg, *psg;
        sector_t sector = cmd->t_task_lba;
        void *daddr, *paddr;
@@ -1203,7 +1242,7 @@ sbc_dif_generate(struct se_cmd *cmd)
                daddr = kmap_atomic(sg_page(dsg)) + dsg->offset;
 
                for (j = 0; j < psg->length;
-                               j += sizeof(struct se_dif_v1_tuple)) {
+                               j += sizeof(*sdt)) {
                        __u16 crc;
                        unsigned int avail;
 
@@ -1256,7 +1295,7 @@ sbc_dif_generate(struct se_cmd *cmd)
 }
 
 static sense_reason_t
-sbc_dif_v1_verify(struct se_cmd *cmd, struct se_dif_v1_tuple *sdt,
+sbc_dif_v1_verify(struct se_cmd *cmd, struct t10_pi_tuple *sdt,
                  __u16 crc, sector_t sector, unsigned int ei_lba)
 {
        __be16 csum;
@@ -1346,7 +1385,7 @@ sbc_dif_verify(struct se_cmd *cmd, sector_t start, unsigned int sectors,
               unsigned int ei_lba, struct scatterlist *psg, int psg_off)
 {
        struct se_device *dev = cmd->se_dev;
-       struct se_dif_v1_tuple *sdt;
+       struct t10_pi_tuple *sdt;
        struct scatterlist *dsg = cmd->t_data_sg;
        sector_t sector = start;
        void *daddr, *paddr;
@@ -1361,7 +1400,7 @@ sbc_dif_verify(struct se_cmd *cmd, sector_t start, unsigned int sectors,
 
                for (i = psg_off; i < psg->length &&
                                sector < start + sectors;
-                               i += sizeof(struct se_dif_v1_tuple)) {
+                               i += sizeof(*sdt)) {
                        __u16 crc;
                        unsigned int avail;
 
index f87d4cef6d398c072e953e7eaa6b5d9d5b469d70..9413e1a949e5bf9a63eb1f418a427440b1ad6132 100644 (file)
@@ -484,8 +484,8 @@ static sense_reason_t
 spc_emulate_evpd_b0(struct se_cmd *cmd, unsigned char *buf)
 {
        struct se_device *dev = cmd->se_dev;
-       int have_tp = 0;
-       int opt, min;
+       u32 mtl = 0;
+       int have_tp = 0, opt, min;
 
        /*
         * Following spc3r22 section 6.5.3 Block Limits VPD page, when
@@ -516,8 +516,15 @@ spc_emulate_evpd_b0(struct se_cmd *cmd, unsigned char *buf)
 
        /*
         * Set MAXIMUM TRANSFER LENGTH
+        *
+        * XXX: Currently assumes single PAGE_SIZE per scatterlist for fabrics
+        * enforcing maximum HW scatter-gather-list entry limit
         */
-       put_unaligned_be32(dev->dev_attrib.hw_max_sectors, &buf[8]);
+       if (cmd->se_tfo->max_data_sg_nents) {
+               mtl = (cmd->se_tfo->max_data_sg_nents * PAGE_SIZE) /
+                      dev->dev_attrib.block_size;
+       }
+       put_unaligned_be32(min_not_zero(mtl, dev->dev_attrib.hw_max_sectors), &buf[8]);
 
        /*
         * Set OPTIMAL TRANSFER LENGTH
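A worked example of the MAXIMUM TRANSFER LENGTH reported above, assuming a 4 KiB PAGE_SIZE, a 512-byte block size, a fabric capped at 128 single-page SGL entries and a backend hw_max_sectors of 2048 (all numbers illustrative):

#include <stdint.h>
#include <stdio.h>

#define EXAMPLE_PAGE_SIZE 4096u

/* kernel-style min_not_zero(): a zero operand is ignored rather than returned */
static uint32_t min_not_zero(uint32_t a, uint32_t b)
{
        if (!a)
                return b;
        if (!b)
                return a;
        return a < b ? a : b;
}

int main(void)
{
        uint32_t max_data_sg_nents = 128;       /* fabric SGL limit */
        uint32_t block_size = 512;
        uint32_t hw_max_sectors = 2048;         /* backend limit */

        uint32_t mtl = max_data_sg_nents * EXAMPLE_PAGE_SIZE / block_size;   /* 1024 blocks */
        printf("MTL = %u blocks\n", min_not_zero(mtl, hw_max_sectors));      /* 1024 */
        return 0;
}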
@@ -768,7 +775,12 @@ static int spc_modesense_control(struct se_cmd *cmd, u8 pc, u8 *p)
        if (pc == 1)
                goto out;
 
-       p[2] = 2;
+       /* GLTSD: No implicit save of log parameters */
+       p[2] = (1 << 1);
+       if (target_sense_desc_format(dev))
+               /* D_SENSE: Descriptor format sense data for 64bit sectors */
+               p[2] |= (1 << 2);
+
        /*
         * From spc4r23, 7.4.7 Control mode page
         *
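A small sketch of how byte 2 of the Control mode page is composed in the hunk above: GLTSD is always reported, and D_SENSE is added only when the device needs descriptor-format sense data (the printed values are just the resulting bit patterns):

#include <stdbool.h>
#include <stdio.h>

static unsigned char control_page_byte2(bool desc_sense)
{
        unsigned char b = 1 << 1;       /* GLTSD: no implicit save of log parameters */

        if (desc_sense)
                b |= 1 << 2;            /* D_SENSE: descriptor-format sense data */
        return b;
}

int main(void)
{
        printf("0x%02x 0x%02x\n",
               control_page_byte2(false), control_page_byte2(true));   /* 0x02 0x06 */
        return 0;
}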
@@ -1151,6 +1163,7 @@ static sense_reason_t spc_emulate_request_sense(struct se_cmd *cmd)
        unsigned char *rbuf;
        u8 ua_asc = 0, ua_ascq = 0;
        unsigned char buf[SE_SENSE_BUF];
+       bool desc_format = target_sense_desc_format(cmd->se_dev);
 
        memset(buf, 0, SE_SENSE_BUF);
 
@@ -1164,32 +1177,11 @@ static sense_reason_t spc_emulate_request_sense(struct se_cmd *cmd)
        if (!rbuf)
                return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
 
-       if (!core_scsi3_ua_clear_for_request_sense(cmd, &ua_asc, &ua_ascq)) {
-               /*
-                * CURRENT ERROR, UNIT ATTENTION
-                */
-               buf[0] = 0x70;
-               buf[SPC_SENSE_KEY_OFFSET] = UNIT_ATTENTION;
-
-               /*
-                * The Additional Sense Code (ASC) from the UNIT ATTENTION
-                */
-               buf[SPC_ASC_KEY_OFFSET] = ua_asc;
-               buf[SPC_ASCQ_KEY_OFFSET] = ua_ascq;
-               buf[7] = 0x0A;
-       } else {
-               /*
-                * CURRENT ERROR, NO SENSE
-                */
-               buf[0] = 0x70;
-               buf[SPC_SENSE_KEY_OFFSET] = NO_SENSE;
-
-               /*
-                * NO ADDITIONAL SENSE INFORMATION
-                */
-               buf[SPC_ASC_KEY_OFFSET] = 0x00;
-               buf[7] = 0x0A;
-       }
+       if (!core_scsi3_ua_clear_for_request_sense(cmd, &ua_asc, &ua_ascq))
+               scsi_build_sense_buffer(desc_format, buf, UNIT_ATTENTION,
+                                       ua_asc, ua_ascq);
+       else
+               scsi_build_sense_buffer(desc_format, buf, NO_SENSE, 0x0, 0x0);
 
        memcpy(rbuf, buf, min_t(u32, sizeof(buf), cmd->data_length));
        transport_kunmap_data_sg(cmd);
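The REQUEST SENSE emulation above now defers to scsi_build_sense_buffer() instead of open-coding the fixed format. A simplified sketch of the two layouts it chooses between, with offsets per the SPC fixed (70h) and descriptor (72h) formats; this is an illustration, not the kernel implementation itself:

#include <stdbool.h>
#include <string.h>

static void build_sense_sketch(bool desc, unsigned char *buf,
                               unsigned char key, unsigned char asc,
                               unsigned char ascq)
{
        memset(buf, 0, 18);
        if (desc) {
                buf[0] = 0x72;          /* descriptor format, current error */
                buf[1] = key;
                buf[2] = asc;
                buf[3] = ascq;
        } else {
                buf[0] = 0x70;          /* fixed format, current error */
                buf[2] = key;
                buf[7] = 0x0a;          /* additional sense length */
                buf[12] = asc;
                buf[13] = ascq;
        }
}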
@@ -1418,9 +1410,6 @@ spc_parse_cdb(struct se_cmd *cmd, unsigned int *size)
                }
                break;
        default:
-               pr_warn("TARGET_CORE[%s]: Unsupported SCSI Opcode"
-                       " 0x%02x, sending CHECK_CONDITION.\n",
-                       cmd->se_tfo->get_fabric_name(), cdb[0]);
                return TCM_UNSUPPORTED_SCSI_OPCODE;
        }
 
index babde4ad841f18a7956c7d56b1d23ff1f075e3d3..2d0381dd105c4998f225603e80bc22e2a6245c08 100644 (file)
@@ -41,6 +41,7 @@
 #include "target_core_internal.h"
 #include "target_core_alua.h"
 #include "target_core_pr.h"
+#include "target_core_ua.h"
 
 extern struct se_device *g_lun0_dev;
 
@@ -83,6 +84,22 @@ struct se_node_acl *core_tpg_get_initiator_node_acl(
 }
 EXPORT_SYMBOL(core_tpg_get_initiator_node_acl);
 
+void core_allocate_nexus_loss_ua(
+       struct se_node_acl *nacl)
+{
+       struct se_dev_entry *deve;
+
+       if (!nacl)
+               return;
+
+       rcu_read_lock();
+       hlist_for_each_entry_rcu(deve, &nacl->lun_entry_hlist, link)
+               core_scsi3_ua_allocate(deve, 0x29,
+                       ASCQ_29H_NEXUS_LOSS_OCCURRED);
+       rcu_read_unlock();
+}
+EXPORT_SYMBOL(core_allocate_nexus_loss_ua);
+
 /*     core_tpg_add_node_to_devs():
  *
  *
index ce8574b7220ced193e969e46141c411156e7e9a7..5bacc7b5ed6d85cf54d6d8fe445dcac08ee8081b 100644 (file)
@@ -39,6 +39,7 @@
 #include <net/sock.h>
 #include <net/tcp.h>
 #include <scsi/scsi_proto.h>
+#include <scsi/scsi_common.h>
 
 #include <target/target_core_base.h>
 #include <target/target_core_backend.h>
@@ -1074,6 +1075,55 @@ transport_set_vpd_ident(struct t10_vpd *vpd, unsigned char *page_83)
 }
 EXPORT_SYMBOL(transport_set_vpd_ident);
 
+static sense_reason_t
+target_check_max_data_sg_nents(struct se_cmd *cmd, struct se_device *dev,
+                              unsigned int size)
+{
+       u32 mtl;
+
+       if (!cmd->se_tfo->max_data_sg_nents)
+               return TCM_NO_SENSE;
+       /*
+        * Check if se_cmd->data_length exceeds the fabric-enforced maximum
+        * transfer length (max_data_sg_nents single-PAGE_SIZE SGL entries
+        * per I/O descriptor).  If so, set SCF_UNDERFLOW_BIT + residual_count
+        * and reduce cmd->data_length to that maximum.
+        */
+       mtl = (cmd->se_tfo->max_data_sg_nents * PAGE_SIZE);
+       if (cmd->data_length > mtl) {
+               /*
+                * If an existing CDB overflow is present, calculate new residual
+                * based on CDB size minus fabric maximum transfer length.
+                *
+                * If an existing CDB underflow is present, calculate new residual
+                * based on original cmd->data_length minus fabric maximum transfer
+                * length.
+                *
+                * Otherwise, set the underflow residual based on cmd->data_length
+                * minus fabric maximum transfer length.
+                */
+               if (cmd->se_cmd_flags & SCF_OVERFLOW_BIT) {
+                       cmd->residual_count = (size - mtl);
+               } else if (cmd->se_cmd_flags & SCF_UNDERFLOW_BIT) {
+                       u32 orig_dl = size + cmd->residual_count;
+                       cmd->residual_count = (orig_dl - mtl);
+               } else {
+                       cmd->se_cmd_flags |= SCF_UNDERFLOW_BIT;
+                       cmd->residual_count = (cmd->data_length - mtl);
+               }
+               cmd->data_length = mtl;
+               /*
+                * Reset sbc_check_prot() calculated protection payload
+                * length based upon the new smaller MTL.
+                */
+               if (cmd->prot_length) {
+                       u32 sectors = (mtl / dev->dev_attrib.block_size);
+                       cmd->prot_length = dev->prot_length * sectors;
+               }
+       }
+       return TCM_NO_SENSE;
+}
+
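A worked example of the underflow handling in target_check_max_data_sg_nents() above, assuming a fabric limited to 32 single-page SGL entries (32 * 4096 = 131072 bytes of MTL) and a command that asked for 256 KiB with no pre-existing over/underflow (numbers illustrative):

#include <stdio.h>

int main(void)
{
        unsigned int mtl = 32 * 4096;           /* fabric maximum transfer length */
        unsigned int data_length = 262144;      /* length requested by the CDB */
        unsigned int residual_count = 0;

        if (data_length > mtl) {
                /* no prior over/underflow: report the shortfall as underflow */
                residual_count = data_length - mtl;
                data_length = mtl;
        }
        printf("data_length=%u residual=%u\n", data_length, residual_count);
        /* prints: data_length=131072 residual=131072 */
        return 0;
}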
 sense_reason_t
 target_cmd_size_check(struct se_cmd *cmd, unsigned int size)
 {
@@ -1087,9 +1137,9 @@ target_cmd_size_check(struct se_cmd *cmd, unsigned int size)
                        " 0x%02x\n", cmd->se_tfo->get_fabric_name(),
                                cmd->data_length, size, cmd->t_task_cdb[0]);
 
-               if (cmd->data_direction == DMA_TO_DEVICE) {
-                       pr_err("Rejecting underflow/overflow"
-                                       " WRITE data\n");
+               if (cmd->data_direction == DMA_TO_DEVICE &&
+                   cmd->se_cmd_flags & SCF_SCSI_DATA_CDB) {
+                       pr_err("Rejecting underflow/overflow WRITE data\n");
                        return TCM_INVALID_CDB_FIELD;
                }
                /*
@@ -1119,7 +1169,7 @@ target_cmd_size_check(struct se_cmd *cmd, unsigned int size)
                }
        }
 
-       return 0;
+       return target_check_max_data_sg_nents(cmd, dev, size);
 
 }
 
@@ -1177,14 +1227,7 @@ transport_check_alloc_task_attr(struct se_cmd *cmd)
                        " emulation is not supported\n");
                return TCM_INVALID_CDB_FIELD;
        }
-       /*
-        * Used to determine when ORDERED commands should go from
-        * Dormant to Active status.
-        */
-       cmd->se_ordered_id = atomic_inc_return(&dev->dev_ordered_id);
-       pr_debug("Allocated se_ordered_id: %u for Task Attr: 0x%02x on %s\n",
-                       cmd->se_ordered_id, cmd->sam_task_attr,
-                       dev->transport->name);
+
        return 0;
 }
 
@@ -1246,6 +1289,11 @@ target_setup_cmd_from_cdb(struct se_cmd *cmd, unsigned char *cdb)
        }
 
        ret = dev->transport->parse_cdb(cmd);
+       if (ret == TCM_UNSUPPORTED_SCSI_OPCODE)
+               pr_warn_ratelimited("%s/%s: Unsupported SCSI Opcode 0x%02x, sending CHECK_CONDITION.\n",
+                                   cmd->se_tfo->get_fabric_name(),
+                                   cmd->se_sess->se_node_acl->initiatorname,
+                                   cmd->t_task_cdb[0]);
        if (ret)
                return ret;
 
@@ -1693,8 +1741,7 @@ void transport_generic_request_failure(struct se_cmd *cmd,
 
 check_stop:
        transport_lun_remove_cmd(cmd);
-       if (!transport_cmd_check_stop_to_fabric(cmd))
-               ;
+       transport_cmd_check_stop_to_fabric(cmd);
        return;
 
 queue_full:
@@ -1767,16 +1814,14 @@ static bool target_handle_task_attr(struct se_cmd *cmd)
         */
        switch (cmd->sam_task_attr) {
        case TCM_HEAD_TAG:
-               pr_debug("Added HEAD_OF_QUEUE for CDB: 0x%02x, "
-                        "se_ordered_id: %u\n",
-                        cmd->t_task_cdb[0], cmd->se_ordered_id);
+               pr_debug("Added HEAD_OF_QUEUE for CDB: 0x%02x\n",
+                        cmd->t_task_cdb[0]);
                return false;
        case TCM_ORDERED_TAG:
                atomic_inc_mb(&dev->dev_ordered_sync);
 
-               pr_debug("Added ORDERED for CDB: 0x%02x to ordered list, "
-                        " se_ordered_id: %u\n",
-                        cmd->t_task_cdb[0], cmd->se_ordered_id);
+               pr_debug("Added ORDERED for CDB: 0x%02x to ordered list\n",
+                        cmd->t_task_cdb[0]);
 
                /*
                 * Execute an ORDERED command if no other older commands
@@ -1800,10 +1845,8 @@ static bool target_handle_task_attr(struct se_cmd *cmd)
        list_add_tail(&cmd->se_delayed_node, &dev->delayed_cmd_list);
        spin_unlock(&dev->delayed_cmd_lock);
 
-       pr_debug("Added CDB: 0x%02x Task Attr: 0x%02x to"
-               " delayed CMD list, se_ordered_id: %u\n",
-               cmd->t_task_cdb[0], cmd->sam_task_attr,
-               cmd->se_ordered_id);
+       pr_debug("Added CDB: 0x%02x Task Attr: 0x%02x to delayed CMD list\n",
+               cmd->t_task_cdb[0], cmd->sam_task_attr);
        return true;
 }
 
@@ -1888,20 +1931,18 @@ static void transport_complete_task_attr(struct se_cmd *cmd)
        if (cmd->sam_task_attr == TCM_SIMPLE_TAG) {
                atomic_dec_mb(&dev->simple_cmds);
                dev->dev_cur_ordered_id++;
-               pr_debug("Incremented dev->dev_cur_ordered_id: %u for"
-                       " SIMPLE: %u\n", dev->dev_cur_ordered_id,
-                       cmd->se_ordered_id);
+               pr_debug("Incremented dev->dev_cur_ordered_id: %u for SIMPLE\n",
+                        dev->dev_cur_ordered_id);
        } else if (cmd->sam_task_attr == TCM_HEAD_TAG) {
                dev->dev_cur_ordered_id++;
-               pr_debug("Incremented dev_cur_ordered_id: %u for"
-                       " HEAD_OF_QUEUE: %u\n", dev->dev_cur_ordered_id,
-                       cmd->se_ordered_id);
+               pr_debug("Incremented dev_cur_ordered_id: %u for HEAD_OF_QUEUE\n",
+                        dev->dev_cur_ordered_id);
        } else if (cmd->sam_task_attr == TCM_ORDERED_TAG) {
                atomic_dec_mb(&dev->dev_ordered_sync);
 
                dev->dev_cur_ordered_id++;
-               pr_debug("Incremented dev_cur_ordered_id: %u for ORDERED:"
-                       " %u\n", dev->dev_cur_ordered_id, cmd->se_ordered_id);
+               pr_debug("Incremented dev_cur_ordered_id: %u for ORDERED\n",
+                        dev->dev_cur_ordered_id);
        }
 
        target_restart_delayed_cmds(dev);
@@ -2615,37 +2656,159 @@ bool transport_wait_for_tasks(struct se_cmd *cmd)
 }
 EXPORT_SYMBOL(transport_wait_for_tasks);
 
-static int transport_get_sense_codes(
-       struct se_cmd *cmd,
-       u8 *asc,
-       u8 *ascq)
+struct sense_info {
+       u8 key;
+       u8 asc;
+       u8 ascq;
+       bool add_sector_info;
+};
+
+static const struct sense_info sense_info_table[] = {
+       [TCM_NO_SENSE] = {
+               .key = NOT_READY
+       },
+       [TCM_NON_EXISTENT_LUN] = {
+               .key = ILLEGAL_REQUEST,
+               .asc = 0x25 /* LOGICAL UNIT NOT SUPPORTED */
+       },
+       [TCM_UNSUPPORTED_SCSI_OPCODE] = {
+               .key = ILLEGAL_REQUEST,
+               .asc = 0x20, /* INVALID COMMAND OPERATION CODE */
+       },
+       [TCM_SECTOR_COUNT_TOO_MANY] = {
+               .key = ILLEGAL_REQUEST,
+               .asc = 0x20, /* INVALID COMMAND OPERATION CODE */
+       },
+       [TCM_UNKNOWN_MODE_PAGE] = {
+               .key = ILLEGAL_REQUEST,
+               .asc = 0x24, /* INVALID FIELD IN CDB */
+       },
+       [TCM_CHECK_CONDITION_ABORT_CMD] = {
+               .key = ABORTED_COMMAND,
+               .asc = 0x29, /* BUS DEVICE RESET FUNCTION OCCURRED */
+               .ascq = 0x03,
+       },
+       [TCM_INCORRECT_AMOUNT_OF_DATA] = {
+               .key = ABORTED_COMMAND,
+               .asc = 0x0c, /* WRITE ERROR */
+               .ascq = 0x0d, /* NOT ENOUGH UNSOLICITED DATA */
+       },
+       [TCM_INVALID_CDB_FIELD] = {
+               .key = ILLEGAL_REQUEST,
+               .asc = 0x24, /* INVALID FIELD IN CDB */
+       },
+       [TCM_INVALID_PARAMETER_LIST] = {
+               .key = ILLEGAL_REQUEST,
+               .asc = 0x26, /* INVALID FIELD IN PARAMETER LIST */
+       },
+       [TCM_PARAMETER_LIST_LENGTH_ERROR] = {
+               .key = ILLEGAL_REQUEST,
+               .asc = 0x1a, /* PARAMETER LIST LENGTH ERROR */
+       },
+       [TCM_UNEXPECTED_UNSOLICITED_DATA] = {
+               .key = ILLEGAL_REQUEST,
+               .asc = 0x0c, /* WRITE ERROR */
+               .ascq = 0x0c, /* UNEXPECTED_UNSOLICITED_DATA */
+       },
+       [TCM_SERVICE_CRC_ERROR] = {
+               .key = ABORTED_COMMAND,
+               .asc = 0x47, /* PROTOCOL SERVICE CRC ERROR */
+               .ascq = 0x05, /* N/A */
+       },
+       [TCM_SNACK_REJECTED] = {
+               .key = ABORTED_COMMAND,
+               .asc = 0x11, /* READ ERROR */
+               .ascq = 0x13, /* FAILED RETRANSMISSION REQUEST */
+       },
+       [TCM_WRITE_PROTECTED] = {
+               .key = DATA_PROTECT,
+               .asc = 0x27, /* WRITE PROTECTED */
+       },
+       [TCM_ADDRESS_OUT_OF_RANGE] = {
+               .key = ILLEGAL_REQUEST,
+               .asc = 0x21, /* LOGICAL BLOCK ADDRESS OUT OF RANGE */
+       },
+       [TCM_CHECK_CONDITION_UNIT_ATTENTION] = {
+               .key = UNIT_ATTENTION,
+       },
+       [TCM_CHECK_CONDITION_NOT_READY] = {
+               .key = NOT_READY,
+       },
+       [TCM_MISCOMPARE_VERIFY] = {
+               .key = MISCOMPARE,
+               .asc = 0x1d, /* MISCOMPARE DURING VERIFY OPERATION */
+               .ascq = 0x00,
+       },
+       [TCM_LOGICAL_BLOCK_GUARD_CHECK_FAILED] = {
+               .key = ABORTED_COMMAND,
+               .asc = 0x10,
+               .ascq = 0x01, /* LOGICAL BLOCK GUARD CHECK FAILED */
+               .add_sector_info = true,
+       },
+       [TCM_LOGICAL_BLOCK_APP_TAG_CHECK_FAILED] = {
+               .key = ABORTED_COMMAND,
+               .asc = 0x10,
+               .ascq = 0x02, /* LOGICAL BLOCK APPLICATION TAG CHECK FAILED */
+               .add_sector_info = true,
+       },
+       [TCM_LOGICAL_BLOCK_REF_TAG_CHECK_FAILED] = {
+               .key = ABORTED_COMMAND,
+               .asc = 0x10,
+               .ascq = 0x03, /* LOGICAL BLOCK REFERENCE TAG CHECK FAILED */
+               .add_sector_info = true,
+       },
+       [TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE] = {
+               /*
+                * Returning ILLEGAL REQUEST would cause immediate IO errors on
+                * Solaris initiators.  Returning NOT READY instead means the
+                * operations will be retried a finite number of times and we
+                * can survive intermittent errors.
+                */
+               .key = NOT_READY,
+               .asc = 0x08, /* LOGICAL UNIT COMMUNICATION FAILURE */
+       },
+};
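One detail of the table above worth noting: designated initializers leave every unlisted reason zero-filled, so a zero .key doubles as "no entry" and translate_sense_reason() below falls back to LOGICAL UNIT COMMUNICATION FAILURE. A trimmed-down stand-alone sketch of that lookup pattern (enum names and table contents abbreviated for illustration):

#include <stdio.h>

#define ARRAY_SIZE(a)   (sizeof(a) / sizeof((a)[0]))

struct sense_entry { unsigned char key, asc, ascq; };

enum { R_NO_ENTRY, R_NON_EXISTENT_LUN, R_WRITE_PROTECTED, R_MAX };

static const struct sense_entry table[] = {
        [R_NON_EXISTENT_LUN] = { .key = 0x05, .asc = 0x25 },    /* ILLEGAL REQUEST */
        [R_WRITE_PROTECTED]  = { .key = 0x07, .asc = 0x27 },    /* DATA PROTECT */
};

static const struct sense_entry fallback = { .key = 0x02, .asc = 0x08 }; /* NOT READY */

int main(void)
{
        int r = R_NO_ENTRY;     /* not populated: .key is 0, use the fallback */
        const struct sense_entry *se =
                (r < (int)ARRAY_SIZE(table) && table[r].key) ? &table[r] : &fallback;

        printf("key=0x%02x asc=0x%02x\n", se->key, se->asc);    /* key=0x02 asc=0x08 */
        return 0;
}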
+
+static int translate_sense_reason(struct se_cmd *cmd, sense_reason_t reason)
 {
-       *asc = cmd->scsi_asc;
-       *ascq = cmd->scsi_ascq;
+       const struct sense_info *si;
+       u8 *buffer = cmd->sense_buffer;
+       int r = (__force int)reason;
+       u8 asc, ascq;
+       bool desc_format = target_sense_desc_format(cmd->se_dev);
 
-       return 0;
-}
+       if (r < ARRAY_SIZE(sense_info_table) && sense_info_table[r].key)
+               si = &sense_info_table[r];
+       else
+               si = &sense_info_table[(__force int)
+                                      TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE];
 
-static
-void transport_err_sector_info(unsigned char *buffer, sector_t bad_sector)
-{
-       /* Place failed LBA in sense data information descriptor 0. */
-       buffer[SPC_ADD_SENSE_LEN_OFFSET] = 0xc;
-       buffer[SPC_DESC_TYPE_OFFSET] = 0; /* Information */
-       buffer[SPC_ADDITIONAL_DESC_LEN_OFFSET] = 0xa;
-       buffer[SPC_VALIDITY_OFFSET] = 0x80;
+       if (reason == TCM_CHECK_CONDITION_UNIT_ATTENTION) {
+               core_scsi3_ua_for_check_condition(cmd, &asc, &ascq);
+               WARN_ON_ONCE(asc == 0);
+       } else if (si->asc == 0) {
+               WARN_ON_ONCE(cmd->scsi_asc == 0);
+               asc = cmd->scsi_asc;
+               ascq = cmd->scsi_ascq;
+       } else {
+               asc = si->asc;
+               ascq = si->ascq;
+       }
+
+       scsi_build_sense_buffer(desc_format, buffer, si->key, asc, ascq);
+       if (si->add_sector_info)
+               return scsi_set_sense_information(buffer,
+                                                 cmd->scsi_sense_length,
+                                                 cmd->bad_sector);
 
-       /* Descriptor Information: failing sector */
-       put_unaligned_be64(bad_sector, &buffer[12]);
+       return 0;
 }
 
 int
 transport_send_check_condition_and_sense(struct se_cmd *cmd,
                sense_reason_t reason, int from_transport)
 {
-       unsigned char *buffer = cmd->sense_buffer;
        unsigned long flags;
-       u8 asc = 0, ascq = 0;
 
        spin_lock_irqsave(&cmd->t_state_lock, flags);
        if (cmd->se_cmd_flags & SCF_SENT_CHECK_CONDITION) {
@@ -2655,243 +2818,17 @@ transport_send_check_condition_and_sense(struct se_cmd *cmd,
        cmd->se_cmd_flags |= SCF_SENT_CHECK_CONDITION;
        spin_unlock_irqrestore(&cmd->t_state_lock, flags);
 
-       if (!reason && from_transport)
-               goto after_reason;
+       if (!from_transport) {
+               int rc;
 
-       if (!from_transport)
                cmd->se_cmd_flags |= SCF_EMULATED_TASK_SENSE;
-
-       /*
-        * Actual SENSE DATA, see SPC-3 7.23.2  SPC_SENSE_KEY_OFFSET uses
-        * SENSE KEY values from include/scsi/scsi.h
-        */
-       switch (reason) {
-       case TCM_NO_SENSE:
-               /* CURRENT ERROR */
-               buffer[0] = 0x70;
-               buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10;
-               /* Not Ready */
-               buffer[SPC_SENSE_KEY_OFFSET] = NOT_READY;
-               /* NO ADDITIONAL SENSE INFORMATION */
-               buffer[SPC_ASC_KEY_OFFSET] = 0;
-               buffer[SPC_ASCQ_KEY_OFFSET] = 0;
-               break;
-       case TCM_NON_EXISTENT_LUN:
-               /* CURRENT ERROR */
-               buffer[0] = 0x70;
-               buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10;
-               /* ILLEGAL REQUEST */
-               buffer[SPC_SENSE_KEY_OFFSET] = ILLEGAL_REQUEST;
-               /* LOGICAL UNIT NOT SUPPORTED */
-               buffer[SPC_ASC_KEY_OFFSET] = 0x25;
-               break;
-       case TCM_UNSUPPORTED_SCSI_OPCODE:
-       case TCM_SECTOR_COUNT_TOO_MANY:
-               /* CURRENT ERROR */
-               buffer[0] = 0x70;
-               buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10;
-               /* ILLEGAL REQUEST */
-               buffer[SPC_SENSE_KEY_OFFSET] = ILLEGAL_REQUEST;
-               /* INVALID COMMAND OPERATION CODE */
-               buffer[SPC_ASC_KEY_OFFSET] = 0x20;
-               break;
-       case TCM_UNKNOWN_MODE_PAGE:
-               /* CURRENT ERROR */
-               buffer[0] = 0x70;
-               buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10;
-               /* ILLEGAL REQUEST */
-               buffer[SPC_SENSE_KEY_OFFSET] = ILLEGAL_REQUEST;
-               /* INVALID FIELD IN CDB */
-               buffer[SPC_ASC_KEY_OFFSET] = 0x24;
-               break;
-       case TCM_CHECK_CONDITION_ABORT_CMD:
-               /* CURRENT ERROR */
-               buffer[0] = 0x70;
-               buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10;
-               /* ABORTED COMMAND */
-               buffer[SPC_SENSE_KEY_OFFSET] = ABORTED_COMMAND;
-               /* BUS DEVICE RESET FUNCTION OCCURRED */
-               buffer[SPC_ASC_KEY_OFFSET] = 0x29;
-               buffer[SPC_ASCQ_KEY_OFFSET] = 0x03;
-               break;
-       case TCM_INCORRECT_AMOUNT_OF_DATA:
-               /* CURRENT ERROR */
-               buffer[0] = 0x70;
-               buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10;
-               /* ABORTED COMMAND */
-               buffer[SPC_SENSE_KEY_OFFSET] = ABORTED_COMMAND;
-               /* WRITE ERROR */
-               buffer[SPC_ASC_KEY_OFFSET] = 0x0c;
-               /* NOT ENOUGH UNSOLICITED DATA */
-               buffer[SPC_ASCQ_KEY_OFFSET] = 0x0d;
-               break;
-       case TCM_INVALID_CDB_FIELD:
-               /* CURRENT ERROR */
-               buffer[0] = 0x70;
-               buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10;
-               /* ILLEGAL REQUEST */
-               buffer[SPC_SENSE_KEY_OFFSET] = ILLEGAL_REQUEST;
-               /* INVALID FIELD IN CDB */
-               buffer[SPC_ASC_KEY_OFFSET] = 0x24;
-               break;
-       case TCM_INVALID_PARAMETER_LIST:
-               /* CURRENT ERROR */
-               buffer[0] = 0x70;
-               buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10;
-               /* ILLEGAL REQUEST */
-               buffer[SPC_SENSE_KEY_OFFSET] = ILLEGAL_REQUEST;
-               /* INVALID FIELD IN PARAMETER LIST */
-               buffer[SPC_ASC_KEY_OFFSET] = 0x26;
-               break;
-       case TCM_PARAMETER_LIST_LENGTH_ERROR:
-               /* CURRENT ERROR */
-               buffer[0] = 0x70;
-               buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10;
-               /* ILLEGAL REQUEST */
-               buffer[SPC_SENSE_KEY_OFFSET] = ILLEGAL_REQUEST;
-               /* PARAMETER LIST LENGTH ERROR */
-               buffer[SPC_ASC_KEY_OFFSET] = 0x1a;
-               break;
-       case TCM_UNEXPECTED_UNSOLICITED_DATA:
-               /* CURRENT ERROR */
-               buffer[0] = 0x70;
-               buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10;
-               /* ABORTED COMMAND */
-               buffer[SPC_SENSE_KEY_OFFSET] = ABORTED_COMMAND;
-               /* WRITE ERROR */
-               buffer[SPC_ASC_KEY_OFFSET] = 0x0c;
-               /* UNEXPECTED_UNSOLICITED_DATA */
-               buffer[SPC_ASCQ_KEY_OFFSET] = 0x0c;
-               break;
-       case TCM_SERVICE_CRC_ERROR:
-               /* CURRENT ERROR */
-               buffer[0] = 0x70;
-               buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10;
-               /* ABORTED COMMAND */
-               buffer[SPC_SENSE_KEY_OFFSET] = ABORTED_COMMAND;
-               /* PROTOCOL SERVICE CRC ERROR */
-               buffer[SPC_ASC_KEY_OFFSET] = 0x47;
-               /* N/A */
-               buffer[SPC_ASCQ_KEY_OFFSET] = 0x05;
-               break;
-       case TCM_SNACK_REJECTED:
-               /* CURRENT ERROR */
-               buffer[0] = 0x70;
-               buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10;
-               /* ABORTED COMMAND */
-               buffer[SPC_SENSE_KEY_OFFSET] = ABORTED_COMMAND;
-               /* READ ERROR */
-               buffer[SPC_ASC_KEY_OFFSET] = 0x11;
-               /* FAILED RETRANSMISSION REQUEST */
-               buffer[SPC_ASCQ_KEY_OFFSET] = 0x13;
-               break;
-       case TCM_WRITE_PROTECTED:
-               /* CURRENT ERROR */
-               buffer[0] = 0x70;
-               buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10;
-               /* DATA PROTECT */
-               buffer[SPC_SENSE_KEY_OFFSET] = DATA_PROTECT;
-               /* WRITE PROTECTED */
-               buffer[SPC_ASC_KEY_OFFSET] = 0x27;
-               break;
-       case TCM_ADDRESS_OUT_OF_RANGE:
-               /* CURRENT ERROR */
-               buffer[0] = 0x70;
-               buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10;
-               /* ILLEGAL REQUEST */
-               buffer[SPC_SENSE_KEY_OFFSET] = ILLEGAL_REQUEST;
-               /* LOGICAL BLOCK ADDRESS OUT OF RANGE */
-               buffer[SPC_ASC_KEY_OFFSET] = 0x21;
-               break;
-       case TCM_CHECK_CONDITION_UNIT_ATTENTION:
-               /* CURRENT ERROR */
-               buffer[0] = 0x70;
-               buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10;
-               /* UNIT ATTENTION */
-               buffer[SPC_SENSE_KEY_OFFSET] = UNIT_ATTENTION;
-               core_scsi3_ua_for_check_condition(cmd, &asc, &ascq);
-               buffer[SPC_ASC_KEY_OFFSET] = asc;
-               buffer[SPC_ASCQ_KEY_OFFSET] = ascq;
-               break;
-       case TCM_CHECK_CONDITION_NOT_READY:
-               /* CURRENT ERROR */
-               buffer[0] = 0x70;
-               buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10;
-               /* Not Ready */
-               buffer[SPC_SENSE_KEY_OFFSET] = NOT_READY;
-               transport_get_sense_codes(cmd, &asc, &ascq);
-               buffer[SPC_ASC_KEY_OFFSET] = asc;
-               buffer[SPC_ASCQ_KEY_OFFSET] = ascq;
-               break;
-       case TCM_MISCOMPARE_VERIFY:
-               /* CURRENT ERROR */
-               buffer[0] = 0x70;
-               buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10;
-               buffer[SPC_SENSE_KEY_OFFSET] = MISCOMPARE;
-               /* MISCOMPARE DURING VERIFY OPERATION */
-               buffer[SPC_ASC_KEY_OFFSET] = 0x1d;
-               buffer[SPC_ASCQ_KEY_OFFSET] = 0x00;
-               break;
-       case TCM_LOGICAL_BLOCK_GUARD_CHECK_FAILED:
-               /* CURRENT ERROR */
-               buffer[0] = 0x70;
-               buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10;
-               /* ILLEGAL REQUEST */
-               buffer[SPC_SENSE_KEY_OFFSET] = ILLEGAL_REQUEST;
-               /* LOGICAL BLOCK GUARD CHECK FAILED */
-               buffer[SPC_ASC_KEY_OFFSET] = 0x10;
-               buffer[SPC_ASCQ_KEY_OFFSET] = 0x01;
-               transport_err_sector_info(buffer, cmd->bad_sector);
-               break;
-       case TCM_LOGICAL_BLOCK_APP_TAG_CHECK_FAILED:
-               /* CURRENT ERROR */
-               buffer[0] = 0x70;
-               buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10;
-               /* ILLEGAL REQUEST */
-               buffer[SPC_SENSE_KEY_OFFSET] = ILLEGAL_REQUEST;
-               /* LOGICAL BLOCK APPLICATION TAG CHECK FAILED */
-               buffer[SPC_ASC_KEY_OFFSET] = 0x10;
-               buffer[SPC_ASCQ_KEY_OFFSET] = 0x02;
-               transport_err_sector_info(buffer, cmd->bad_sector);
-               break;
-       case TCM_LOGICAL_BLOCK_REF_TAG_CHECK_FAILED:
-               /* CURRENT ERROR */
-               buffer[0] = 0x70;
-               buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10;
-               /* ILLEGAL REQUEST */
-               buffer[SPC_SENSE_KEY_OFFSET] = ILLEGAL_REQUEST;
-               /* LOGICAL BLOCK REFERENCE TAG CHECK FAILED */
-               buffer[SPC_ASC_KEY_OFFSET] = 0x10;
-               buffer[SPC_ASCQ_KEY_OFFSET] = 0x03;
-               transport_err_sector_info(buffer, cmd->bad_sector);
-               break;
-       case TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE:
-       default:
-               /* CURRENT ERROR */
-               buffer[0] = 0x70;
-               buffer[SPC_ADD_SENSE_LEN_OFFSET] = 10;
-               /*
-                * Returning ILLEGAL REQUEST would cause immediate IO errors on
-                * Solaris initiators.  Returning NOT READY instead means the
-                * operations will be retried a finite number of times and we
-                * can survive intermittent errors.
-                */
-               buffer[SPC_SENSE_KEY_OFFSET] = NOT_READY;
-               /* LOGICAL UNIT COMMUNICATION FAILURE */
-               buffer[SPC_ASC_KEY_OFFSET] = 0x08;
-               break;
+               cmd->scsi_status = SAM_STAT_CHECK_CONDITION;
+               cmd->scsi_sense_length  = TRANSPORT_SENSE_BUFFER;
+               rc = translate_sense_reason(cmd, reason);
+               if (rc)
+                       return rc;
        }
-       /*
-        * This code uses linux/include/scsi/scsi.h SAM status codes!
-        */
-       cmd->scsi_status = SAM_STAT_CHECK_CONDITION;
-       /*
-        * Automatically padded, this value is encoded in the fabric's
-        * data_length response PDU containing the SCSI defined sense data.
-        */
-       cmd->scsi_sense_length  = TRANSPORT_SENSE_BUFFER;
 
-after_reason:
        trace_target_cmd_complete(cmd);
        return cmd->se_tfo->queue_status(cmd);
 }
index c448ef421ce779347973579654b36fb7107043f6..937cebf7663324b53a7fa773f519403a3953b87d 100644 (file)
@@ -25,6 +25,7 @@
 #include <linux/parser.h>
 #include <linux/vmalloc.h>
 #include <linux/uio_driver.h>
+#include <linux/stringify.h>
 #include <net/genetlink.h>
 #include <scsi/scsi_common.h>
 #include <scsi/scsi_proto.h>
@@ -538,14 +539,8 @@ static void tcmu_handle_completion(struct tcmu_cmd *cmd, struct tcmu_cmd_entry *
                UPDATE_HEAD(udev->data_tail, cmd->data_length, udev->data_size);
                pr_warn("TCMU: Userspace set UNKNOWN_OP flag on se_cmd %p\n",
                        cmd->se_cmd);
-               transport_generic_request_failure(cmd->se_cmd,
-                       TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE);
-               cmd->se_cmd = NULL;
-               kmem_cache_free(tcmu_cmd_cache, cmd);
-               return;
-       }
-
-       if (entry->rsp.scsi_status == SAM_STAT_CHECK_CONDITION) {
+               entry->rsp.scsi_status = SAM_STAT_CHECK_CONDITION;
+       } else if (entry->rsp.scsi_status == SAM_STAT_CHECK_CONDITION) {
                memcpy(se_cmd->sense_buffer, entry->rsp.sense_buffer,
                               se_cmd->scsi_sense_length);
 
@@ -577,7 +572,6 @@ static void tcmu_handle_completion(struct tcmu_cmd *cmd, struct tcmu_cmd_entry *
 static unsigned int tcmu_handle_completions(struct tcmu_dev *udev)
 {
        struct tcmu_mailbox *mb;
-       LIST_HEAD(cpl_cmds);
        unsigned long flags;
        int handled = 0;
 
@@ -905,7 +899,7 @@ static int tcmu_configure_device(struct se_device *dev)
        WARN_ON(!PAGE_ALIGNED(udev->data_off));
        WARN_ON(udev->data_size % PAGE_SIZE);
 
-       info->version = xstr(TCMU_MAILBOX_VERSION);
+       info->version = __stringify(TCMU_MAILBOX_VERSION);
 
        info->mem[0].name = "tcm-user command & data buffer";
        info->mem[0].addr = (phys_addr_t) udev->mb_addr;
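The switch to __stringify() above matters because the two-level macro expands TCMU_MAILBOX_VERSION to its value before stringizing, which a plain #x would not. A minimal illustration (the version value here is only an example):

#include <stdio.h>

#define __stringify_1(x...)     #x
#define __stringify(x...)       __stringify_1(x)

#define TCMU_MAILBOX_VERSION    2

int main(void)
{
        printf("%s\n", __stringify(TCMU_MAILBOX_VERSION));     /* prints "2" */
        return 0;
}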
index 4515f52546f83c5cd0d4ade575f3a006d9ce3683..47fe94ee10b82d876fedef726308738dc62f5805 100644 (file)
@@ -450,6 +450,8 @@ int target_xcopy_setup_pt(void)
        memset(&xcopy_pt_sess, 0, sizeof(struct se_session));
        INIT_LIST_HEAD(&xcopy_pt_sess.sess_list);
        INIT_LIST_HEAD(&xcopy_pt_sess.sess_acl_list);
+       INIT_LIST_HEAD(&xcopy_pt_sess.sess_cmd_list);
+       spin_lock_init(&xcopy_pt_sess.sess_cmd_lock);
 
        xcopy_pt_nacl.se_tpg = &xcopy_pt_tpg;
        xcopy_pt_nacl.nacl_sess = &xcopy_pt_sess;
@@ -644,7 +646,7 @@ static int target_xcopy_read_source(
        pr_debug("XCOPY: Built READ_16: LBA: %llu Sectors: %u Length: %u\n",
                (unsigned long long)src_lba, src_sectors, length);
 
-       transport_init_se_cmd(se_cmd, &xcopy_pt_tfo, NULL, length,
+       transport_init_se_cmd(se_cmd, &xcopy_pt_tfo, &xcopy_pt_sess, length,
                              DMA_FROM_DEVICE, 0, &xpt_cmd->sense_buffer[0]);
        xop->src_pt_cmd = xpt_cmd;
 
@@ -704,7 +706,7 @@ static int target_xcopy_write_destination(
        pr_debug("XCOPY: Built WRITE_16: LBA: %llu Sectors: %u Length: %u\n",
                (unsigned long long)dst_lba, dst_sectors, length);
 
-       transport_init_se_cmd(se_cmd, &xcopy_pt_tfo, NULL, length,
+       transport_init_se_cmd(se_cmd, &xcopy_pt_tfo, &xcopy_pt_sess, length,
                              DMA_TO_DEVICE, 0, &xpt_cmd->sense_buffer[0]);
        xop->dst_pt_cmd = xpt_cmd;
 
index 68031723e5be33c97742acc286fb4217279cb2d4..aa3caca8bace13dc29a8cd9cb01d5b252b5c8a6c 100644 (file)
@@ -255,7 +255,7 @@ static void ft_recv_seq(struct fc_seq *sp, struct fc_frame *fp, void *arg)
        struct ft_cmd *cmd = arg;
        struct fc_frame_header *fh;
 
-       if (unlikely(IS_ERR(fp))) {
+       if (IS_ERR(fp)) {
                /* XXX need to find cmd if queued */
                cmd->seq = NULL;
                cmd->aborted = true;
index 118938ee8552ffe3a1146997140ee7bed3e699b1..039004400987366b876be8ce58bc1290844f4e08 100644 (file)
@@ -340,6 +340,14 @@ config ACPI_THERMAL_REL
        tristate
        depends on ACPI
 
+config INTEL_PCH_THERMAL
+       tristate "Intel PCH Thermal Reporting Driver"
+       depends on X86 && PCI
+       help
+         Enable this to support thermal reporting on certain Intel PCHs.
+         The thermal reporting device provides temperature readings,
+         programmable trip points and other information.
+
 menu "Texas Instruments thermal drivers"
 source "drivers/thermal/ti-soc-thermal/Kconfig"
 endmenu
index 535dfee1496fc26d90457b28dc413993f693fe42..26f160809959248e682544f3adc976599336690e 100644 (file)
@@ -41,6 +41,7 @@ obj-$(CONFIG_INTEL_SOC_DTS_THERMAL)   += intel_soc_dts_thermal.o
 obj-$(CONFIG_INTEL_QUARK_DTS_THERMAL)  += intel_quark_dts_thermal.o
 obj-$(CONFIG_TI_SOC_THERMAL)   += ti-soc-thermal/
 obj-$(CONFIG_INT340X_THERMAL)  += int340x_thermal/
+obj-$(CONFIG_INTEL_PCH_THERMAL)        += intel_pch_thermal.o
 obj-$(CONFIG_ST_THERMAL)       += st/
 obj-$(CONFIG_TEGRA_SOCTHERM)   += tegra_soctherm.o
 obj-$(CONFIG_HISI_THERMAL)     += hisi_thermal.o
index 01255fd65135949ce78b7f94fde7bccc196ac75c..26b8d326546a804d2c4f954258c0cc9839113d09 100644 (file)
@@ -155,7 +155,7 @@ static bool armada_is_valid(struct armada_thermal_priv *priv)
 }
 
 static int armada_get_temp(struct thermal_zone_device *thermal,
-                         unsigned long *temp)
+                         int *temp)
 {
        struct armada_thermal_priv *priv = thermal->devdata;
        unsigned long reg;
index 2fb273c4baa95b5583052102d793d8b9d288ad44..652acd8fbe48e2334ab887df96272859f12c11bf 100644 (file)
@@ -107,8 +107,7 @@ static int db8500_cdev_unbind(struct thermal_zone_device *thermal,
 }
 
 /* Callback to get current temperature */
-static int db8500_sys_get_temp(struct thermal_zone_device *thermal,
-               unsigned long *temp)
+static int db8500_sys_get_temp(struct thermal_zone_device *thermal, int *temp)
 {
        struct db8500_thermal_zone *pzone = thermal->devdata;
 
@@ -180,7 +179,7 @@ static int db8500_sys_get_trip_type(struct thermal_zone_device *thermal,
 
 /* Callback to get trip point temperature */
 static int db8500_sys_get_trip_temp(struct thermal_zone_device *thermal,
-               int trip, unsigned long *temp)
+               int trip, int *temp)
 {
        struct db8500_thermal_zone *pzone = thermal->devdata;
        struct db8500_thsens_platform_data *ptrips = pzone->trip_tab;
@@ -195,7 +194,7 @@ static int db8500_sys_get_trip_temp(struct thermal_zone_device *thermal,
 
 /* Callback to get critical trip point temperature */
 static int db8500_sys_get_crit_temp(struct thermal_zone_device *thermal,
-               unsigned long *temp)
+               int *temp)
 {
        struct db8500_thermal_zone *pzone = thermal->devdata;
        struct db8500_thsens_platform_data *ptrips = pzone->trip_tab;
index 09f6e304c27461969bb0440e6f06e6cbbc8e0ecc..a0bc9de42553b81038a4a07c7a35eb9916c670e3 100644 (file)
@@ -93,7 +93,7 @@ static int dove_init_sensor(const struct dove_thermal_priv *priv)
 }
 
 static int dove_get_temp(struct thermal_zone_device *thermal,
-                         unsigned long *temp)
+                         int *temp)
 {
        unsigned long reg;
        struct dove_thermal_priv *priv = thermal->devdata;
index c2c10bbe24d62c7cef116cf3c1ac2a9e5f738905..34fe36504a552cdaf112a6982483655f25dc2238 100644 (file)
@@ -34,7 +34,7 @@
 static int get_trip_level(struct thermal_zone_device *tz)
 {
        int count = 0;
-       unsigned long trip_temp;
+       int trip_temp;
        enum thermal_trip_type trip_type;
 
        if (tz->trips == 0 || !tz->ops->get_trip_temp)
index c5dd76b2ee74fb930654a7ce000f44262a40198e..70836c5b89bc411d3a1b91ebea91c3b8f92b4dba 100644 (file)
 
 static void thermal_zone_trip_update(struct thermal_zone_device *tz, int trip)
 {
-       long trip_temp;
-       unsigned long trip_hyst;
+       int trip_temp, trip_hyst;
        struct thermal_instance *instance;
 
        tz->ops->get_trip_temp(tz, trip, &trip_temp);
        tz->ops->get_trip_hyst(tz, trip, &trip_hyst);
 
-       dev_dbg(&tz->device, "Trip%d[temp=%ld]:temp=%d:hyst=%ld\n",
+       dev_dbg(&tz->device, "Trip%d[temp=%d]:temp=%d:hyst=%d\n",
                                trip, trip_temp, tz->temperature,
                                trip_hyst);
 
index b49f97c734d00ddccb50379d3131c232651e96e5..36d07295f8e3ac6724181f9dc99152c78db1eb08 100644 (file)
@@ -155,7 +155,7 @@ static void hisi_thermal_disable_sensor(struct hisi_thermal_data *data)
        mutex_unlock(&data->thermal_lock);
 }
 
-static int hisi_thermal_get_temp(void *_sensor, long *temp)
+static int hisi_thermal_get_temp(void *_sensor, int *temp)
 {
        struct hisi_thermal_sensor *sensor = _sensor;
        struct hisi_thermal_data *data = sensor->thermal;
@@ -178,7 +178,7 @@ static int hisi_thermal_get_temp(void *_sensor, long *temp)
        data->irq_bind_sensor = sensor_id;
        mutex_unlock(&data->thermal_lock);
 
-       dev_dbg(&data->pdev->dev, "id=%d, irq=%d, temp=%ld, thres=%d\n",
+       dev_dbg(&data->pdev->dev, "id=%d, irq=%d, temp=%d, thres=%d\n",
                sensor->id, data->irq_enabled, *temp, sensor->thres_temp);
        /*
         * Bind irq to sensor for two cases:
index fde4c2876d14612c2c3bef5fb0ed55fe78ea216c..4bec1d3c3d27bba4438ea37f26c9ffdf2e749fe5 100644 (file)
@@ -98,10 +98,10 @@ struct imx_thermal_data {
        enum thermal_device_mode mode;
        struct regmap *tempmon;
        u32 c1, c2; /* See formula in imx_get_sensor_data() */
-       unsigned long temp_passive;
-       unsigned long temp_critical;
-       unsigned long alarm_temp;
-       unsigned long last_temp;
+       int temp_passive;
+       int temp_critical;
+       int alarm_temp;
+       int last_temp;
        bool irq_enabled;
        int irq;
        struct clk *thermal_clk;
@@ -109,7 +109,7 @@ struct imx_thermal_data {
 };
 
 static void imx_set_panic_temp(struct imx_thermal_data *data,
-                              signed long panic_temp)
+                              int panic_temp)
 {
        struct regmap *map = data->tempmon;
        int critical_value;
@@ -121,7 +121,7 @@ static void imx_set_panic_temp(struct imx_thermal_data *data,
 }
 
 static void imx_set_alarm_temp(struct imx_thermal_data *data,
-                              signed long alarm_temp)
+                              int alarm_temp)
 {
        struct regmap *map = data->tempmon;
        int alarm_value;
@@ -133,7 +133,7 @@ static void imx_set_alarm_temp(struct imx_thermal_data *data,
                        TEMPSENSE0_ALARM_VALUE_SHIFT);
 }
 
-static int imx_get_temp(struct thermal_zone_device *tz, unsigned long *temp)
+static int imx_get_temp(struct thermal_zone_device *tz, int *temp)
 {
        struct imx_thermal_data *data = tz->devdata;
        struct regmap *map = data->tempmon;
@@ -189,13 +189,13 @@ static int imx_get_temp(struct thermal_zone_device *tz, unsigned long *temp)
                if (data->alarm_temp == data->temp_critical &&
                        *temp < data->temp_passive) {
                        imx_set_alarm_temp(data, data->temp_passive);
-                       dev_dbg(&tz->device, "thermal alarm off: T < %lu\n",
+                       dev_dbg(&tz->device, "thermal alarm off: T < %d\n",
                                data->alarm_temp / 1000);
                }
        }
 
        if (*temp != data->last_temp) {
-               dev_dbg(&tz->device, "millicelsius: %ld\n", *temp);
+               dev_dbg(&tz->device, "millicelsius: %d\n", *temp);
                data->last_temp = *temp;
        }
 
@@ -262,8 +262,7 @@ static int imx_get_trip_type(struct thermal_zone_device *tz, int trip,
        return 0;
 }
 
-static int imx_get_crit_temp(struct thermal_zone_device *tz,
-                            unsigned long *temp)
+static int imx_get_crit_temp(struct thermal_zone_device *tz, int *temp)
 {
        struct imx_thermal_data *data = tz->devdata;
 
@@ -272,7 +271,7 @@ static int imx_get_crit_temp(struct thermal_zone_device *tz,
 }
 
 static int imx_get_trip_temp(struct thermal_zone_device *tz, int trip,
-                            unsigned long *temp)
+                            int *temp)
 {
        struct imx_thermal_data *data = tz->devdata;
 
@@ -282,7 +281,7 @@ static int imx_get_trip_temp(struct thermal_zone_device *tz, int trip,
 }
 
 static int imx_set_trip_temp(struct thermal_zone_device *tz, int trip,
-                            unsigned long temp)
+                            int temp)
 {
        struct imx_thermal_data *data = tz->devdata;
 
@@ -434,7 +433,7 @@ static irqreturn_t imx_thermal_alarm_irq_thread(int irq, void *dev)
 {
        struct imx_thermal_data *data = dev;
 
-       dev_dbg(&data->tz->device, "THERMAL ALARM: T > %lu\n",
+       dev_dbg(&data->tz->device, "THERMAL ALARM: T > %d\n",
                data->alarm_temp / 1000);
 
        thermal_zone_device_update(data->tz);
index 031018e7a65bd72a4ec8ba3a452e16f455e5f3a1..5836e55544331dad4cbfcf918a29923d3025aba0 100644 (file)
@@ -186,7 +186,7 @@ static int int3400_thermal_run_osc(acpi_handle handle,
 }
 
 static int int3400_thermal_get_temp(struct thermal_zone_device *thermal,
-                       unsigned long *temp)
+                       int *temp)
 {
        *temp = 20 * 1000; /* faked temp sensor with 20C */
        return 0;
index 1e25133d35e2cbe8e7476a382bd27433a5801b0c..b9b2666aa94c93e39b1c7bd6e23c07dfe4245890 100644 (file)
@@ -20,7 +20,7 @@
 #include "int340x_thermal_zone.h"
 
 static int int340x_thermal_get_zone_temp(struct thermal_zone_device *zone,
-                                        unsigned long *temp)
+                                        int *temp)
 {
        struct int34x_thermal_zone *d = zone->devdata;
        unsigned long long tmp;
@@ -49,7 +49,7 @@ static int int340x_thermal_get_zone_temp(struct thermal_zone_device *zone,
 }
 
 static int int340x_thermal_get_trip_temp(struct thermal_zone_device *zone,
-                                        int trip, unsigned long *temp)
+                                        int trip, int *temp)
 {
        struct int34x_thermal_zone *d = zone->devdata;
        int i;
@@ -114,7 +114,7 @@ static int int340x_thermal_get_trip_type(struct thermal_zone_device *zone,
 }
 
 static int int340x_thermal_set_trip_temp(struct thermal_zone_device *zone,
-                                     int trip, unsigned long temp)
+                                     int trip, int temp)
 {
        struct int34x_thermal_zone *d = zone->devdata;
        acpi_status status;
@@ -136,7 +136,7 @@ static int int340x_thermal_set_trip_temp(struct thermal_zone_device *zone,
 
 
 static int int340x_thermal_get_trip_hyst(struct thermal_zone_device *zone,
-               int trip, unsigned long *temp)
+               int trip, int *temp)
 {
        struct int34x_thermal_zone *d = zone->devdata;
        acpi_status status;
@@ -163,7 +163,7 @@ static struct thermal_zone_device_ops int340x_thermal_zone_ops = {
 };
 
 static int int340x_thermal_get_trip_config(acpi_handle handle, char *name,
-                                     unsigned long *temp)
+                                     int *temp)
 {
        unsigned long long r;
        acpi_status status;
index 9f38ab72c4bf54f39318bd851674569eb8d757fd..aaadf724ff2ef858807715ae395b2aef9c0b740e 100644 (file)
@@ -21,7 +21,7 @@
 #define INT340X_THERMAL_MAX_ACT_TRIP_COUNT     10
 
 struct active_trip {
-       unsigned long temp;
+       int temp;
        int id;
        bool valid;
 };
@@ -31,11 +31,11 @@ struct int34x_thermal_zone {
        struct active_trip act_trips[INT340X_THERMAL_MAX_ACT_TRIP_COUNT];
        unsigned long *aux_trips;
        int aux_trip_nr;
-       unsigned long psv_temp;
+       int psv_temp;
        int psv_trip_id;
-       unsigned long crt_temp;
+       int crt_temp;
        int crt_trip_id;
-       unsigned long hot_temp;
+       int hot_temp;
        int hot_trip_id;
        struct thermal_zone_device *zone;
        struct thermal_zone_device_ops *override_ops;
index 3df3dc34b124261789469208a07ffd85604292db..ccc0ad02d06698108ec486d11f7b0dc69c7c734a 100644 (file)
@@ -145,7 +145,7 @@ static int get_tjmax(void)
        return -EINVAL;
 }
 
-static int read_temp_msr(unsigned long *temp)
+static int read_temp_msr(int *temp)
 {
        int cpu;
        u32 eax, edx;
@@ -177,7 +177,7 @@ err_ret:
 }
 
 static int proc_thermal_get_zone_temp(struct thermal_zone_device *zone,
-                                        unsigned long *temp)
+                                        int *temp)
 {
        int ret;
 
diff --git a/drivers/thermal/intel_pch_thermal.c b/drivers/thermal/intel_pch_thermal.c
new file mode 100644 (file)
index 0000000..50c7da7
--- /dev/null
@@ -0,0 +1,283 @@
+/* intel_pch_thermal.c - Intel PCH Thermal driver
+ *
+ * Copyright (c) 2015, Intel Corporation.
+ *
+ * Authors:
+ *     Tushar Dave <tushar.n.dave@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/thermal.h>
+
+/* Intel PCH thermal Device IDs */
+#define PCH_THERMAL_DID_WPT    0x9CA4 /* Wildcat Point */
+
+/* Wildcat Point-LP  PCH Thermal registers */
+#define WPT_TEMP       0x0000  /* Temperature */
+#define WPT_TSC        0x04    /* Thermal Sensor Control */
+#define WPT_TSS        0x06    /* Thermal Sensor Status */
+#define WPT_TSEL       0x08    /* Thermal Sensor Enable and Lock */
+#define WPT_TSREL      0x0A    /* Thermal Sensor Report Enable and Lock */
+#define WPT_TSMIC      0x0C    /* Thermal Sensor SMI Control */
+#define WPT_CTT        0x0010  /* Catastrophic Trip Point */
+#define WPT_TAHV       0x0014  /* Thermal Alert High Value */
+#define WPT_TALV       0x0018  /* Thermal Alert Low Value */
+#define WPT_TL         0x00000040      /* Throttle Value */
+#define WPT_PHL        0x0060  /* PCH Hot Level */
+#define WPT_PHLC       0x62    /* PHL Control */
+#define WPT_TAS        0x80    /* Thermal Alert Status */
+#define WPT_TSPIEN     0x82    /* PCI Interrupt Event Enables */
+#define WPT_TSGPEN     0x84    /* General Purpose Event Enables */
+
+/*  Wildcat Point-LP  PCH Thermal Register bit definitions */
+#define WPT_TEMP_TSR   0x00ff  /* Temp TS Reading */
+#define WPT_TSC_CPDE   0x01    /* Catastrophic Power-Down Enable */
+#define WPT_TSS_TSDSS  0x10    /* Thermal Sensor Dynamic Shutdown Status */
+#define WPT_TSS_GPES   0x08    /* GPE status */
+#define WPT_TSEL_ETS   0x01    /* Enable TS */
+#define WPT_TSEL_PLDB  0x80    /* TSEL Policy Lock-Down Bit */
+#define WPT_TL_TOL     0x000001FF      /* T0 Level */
+#define WPT_TL_T1L     0x1ff00000      /* T1 Level */
+#define WPT_TL_TTEN    0x20000000      /* TT Enable */
+
+static char driver_name[] = "Intel PCH thermal driver";
+
+struct pch_thermal_device {
+       void __iomem *hw_base;
+       const struct pch_dev_ops *ops;
+       struct pci_dev *pdev;
+       struct thermal_zone_device *tzd;
+       int crt_trip_id;
+       unsigned long crt_temp;
+       int hot_trip_id;
+       unsigned long hot_temp;
+};
+
+static int pch_wpt_init(struct pch_thermal_device *ptd, int *nr_trips)
+{
+       u8 tsel;
+       u16 trip_temp;
+
+       *nr_trips = 0;
+
+       /* Check if BIOS has already enabled thermal sensor */
+       if (WPT_TSS_TSDSS & readb(ptd->hw_base + WPT_TSS))
+               goto read_trips;
+
+       tsel = readb(ptd->hw_base + WPT_TSEL);
+       /*
+        * When TSEL's Policy Lock-Down bit is 1, TSEL becomes read-only.
+        * If so, the thermal sensor cannot be enabled. Bail out.
+        */
+       if (tsel & WPT_TSEL_PLDB) {
+               dev_err(&ptd->pdev->dev, "Sensor can't be enabled\n");
+               return -ENODEV;
+       }
+
+       writeb(tsel|WPT_TSEL_ETS, ptd->hw_base + WPT_TSEL);
+       if (!(WPT_TSS_TSDSS & readb(ptd->hw_base + WPT_TSS))) {
+               dev_err(&ptd->pdev->dev, "Sensor can't be enabled\n");
+               return -ENODEV;
+       }
+
+read_trips:
+       ptd->crt_trip_id = -1;
+       trip_temp = readw(ptd->hw_base + WPT_CTT);
+       trip_temp &= 0x1FF;
+       if (trip_temp) {
+               /* Resolution of 1/2 degree C and an offset of -50C */
+               ptd->crt_temp = trip_temp * 1000 / 2 - 50000;
+               ptd->crt_trip_id = 0;
+               ++(*nr_trips);
+       }
+
+       ptd->hot_trip_id = -1;
+       trip_temp = readw(ptd->hw_base + WPT_PHL);
+       trip_temp &= 0x1FF;
+       if (trip_temp) {
+               /* Resolution of 1/2 degree C and an offset of -50C */
+               ptd->hot_temp = trip_temp * 1000 / 2 - 50000;
+               ptd->hot_trip_id = *nr_trips;
+               ++(*nr_trips);
+       }
+
+       return 0;
+}
+
+static int pch_wpt_get_temp(struct pch_thermal_device *ptd, int *temp)
+{
+       u8 wpt_temp;
+
+       wpt_temp = WPT_TEMP_TSR & readl(ptd->hw_base + WPT_TEMP);
+
+       /* Resolution of 1/2 degree C and an offset of -50C */
+       *temp = (wpt_temp * 1000 / 2 - 50000);
+
+       return 0;
+}
+
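A worked example of the WPT temperature conversion used above: the sensor reports in 1/2 degree C steps with a -50 degree C offset, so a raw TSR reading of 0x98 (152) works out to 26000 millicelsius, i.e. 26 C (the raw value is illustrative):

#include <stdio.h>

int main(void)
{
        unsigned char wpt_temp = 0x98;                  /* TSR field of WPT_TEMP */
        int temp = wpt_temp * 1000 / 2 - 50000;         /* millicelsius */

        printf("%d millicelsius\n", temp);              /* 26000 */
        return 0;
}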
+struct pch_dev_ops {
+       int (*hw_init)(struct pch_thermal_device *ptd, int *nr_trips);
+       int (*get_temp)(struct pch_thermal_device *ptd, int *temp);
+};
+
+
+/* dev ops for Wildcat Point */
+static struct pch_dev_ops pch_dev_ops_wpt = {
+       .hw_init = pch_wpt_init,
+       .get_temp = pch_wpt_get_temp,
+};
+
+static int pch_thermal_get_temp(struct thermal_zone_device *tzd, int *temp)
+{
+       struct pch_thermal_device *ptd = tzd->devdata;
+
+       return  ptd->ops->get_temp(ptd, temp);
+}
+
+static int pch_get_trip_type(struct thermal_zone_device *tzd, int trip,
+                            enum thermal_trip_type *type)
+{
+       struct pch_thermal_device *ptd = tzd->devdata;
+
+       if (ptd->crt_trip_id == trip)
+               *type = THERMAL_TRIP_CRITICAL;
+       else if (ptd->hot_trip_id == trip)
+               *type = THERMAL_TRIP_HOT;
+       else
+               return -EINVAL;
+
+       return 0;
+}
+
+static int pch_get_trip_temp(struct thermal_zone_device *tzd, int trip, int *temp)
+{
+       struct pch_thermal_device *ptd = tzd->devdata;
+
+       if (ptd->crt_trip_id == trip)
+               *temp = ptd->crt_temp;
+       else if (ptd->hot_trip_id == trip)
+               *temp = ptd->hot_temp;
+       else
+               return -EINVAL;
+
+       return 0;
+}
+
+static struct thermal_zone_device_ops tzd_ops = {
+       .get_temp = pch_thermal_get_temp,
+       .get_trip_type = pch_get_trip_type,
+       .get_trip_temp = pch_get_trip_temp,
+};
+
+
+static int intel_pch_thermal_probe(struct pci_dev *pdev,
+                                  const struct pci_device_id *id)
+{
+       struct pch_thermal_device *ptd;
+       int err;
+       int nr_trips;
+       char *dev_name;
+
+       ptd = devm_kzalloc(&pdev->dev, sizeof(*ptd), GFP_KERNEL);
+       if (!ptd)
+               return -ENOMEM;
+
+       switch (pdev->device) {
+       case PCH_THERMAL_DID_WPT:
+               ptd->ops = &pch_dev_ops_wpt;
+               dev_name = "pch_wildcat_point";
+               break;
+       default:
+               dev_err(&pdev->dev, "unknown pch thermal device\n");
+               return -ENODEV;
+       }
+
+       pci_set_drvdata(pdev, ptd);
+       ptd->pdev = pdev;
+
+       err = pci_enable_device(pdev);
+       if (err) {
+               dev_err(&pdev->dev, "failed to enable pci device\n");
+               return err;
+       }
+
+       err = pci_request_regions(pdev, driver_name);
+       if (err) {
+               dev_err(&pdev->dev, "failed to request pci region\n");
+               goto error_disable;
+       }
+
+       ptd->hw_base = pci_ioremap_bar(pdev, 0);
+       if (!ptd->hw_base) {
+               err = -ENOMEM;
+               dev_err(&pdev->dev, "failed to map mem base\n");
+               goto error_release;
+       }
+
+       err = ptd->ops->hw_init(ptd, &nr_trips);
+       if (err)
+               goto error_cleanup;
+
+       ptd->tzd = thermal_zone_device_register(dev_name, nr_trips, 0, ptd,
+                                               &tzd_ops, NULL, 0, 0);
+       if (IS_ERR(ptd->tzd)) {
+               dev_err(&pdev->dev, "Failed to register thermal zone %s\n",
+                       dev_name);
+               err = PTR_ERR(ptd->tzd);
+               goto error_cleanup;
+       }
+
+       return 0;
+
+error_cleanup:
+       iounmap(ptd->hw_base);
+error_release:
+       pci_release_regions(pdev);
+error_disable:
+       pci_disable_device(pdev);
+       dev_err(&pdev->dev, "pci device failed to probe\n");
+       return err;
+}
+
+static void intel_pch_thermal_remove(struct pci_dev *pdev)
+{
+       struct pch_thermal_device *ptd = pci_get_drvdata(pdev);
+
+       thermal_zone_device_unregister(ptd->tzd);
+       iounmap(ptd->hw_base);
+       pci_set_drvdata(pdev, NULL);
+       pci_release_region(pdev, 0);
+       pci_disable_device(pdev);
+}
+
+static struct pci_device_id intel_pch_thermal_id[] = {
+       { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCH_THERMAL_DID_WPT) },
+       { 0, },
+};
+MODULE_DEVICE_TABLE(pci, intel_pch_thermal_id);
+
+static struct pci_driver intel_pch_thermal_driver = {
+       .name           = "intel_pch_thermal",
+       .id_table       = intel_pch_thermal_id,
+       .probe          = intel_pch_thermal_probe,
+       .remove         = intel_pch_thermal_remove,
+};
+
+module_pci_driver(intel_pch_thermal_driver);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Intel PCH Thermal driver");
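As an aside on the new driver above: the readw/readb values are converted with "* 1000 / 2 - 50000" because the Wildcat Point sensor reports temperature in 1/2 degree C steps with a -50 C offset, while the thermal core works in millidegrees Celsius. A minimal standalone sketch of that conversion (illustrative only, not part of the patch; the helper name and the sample value are made up):

#include <stdio.h>

/* Convert a raw Wildcat Point temperature/trip reading (1/2 degree C
 * resolution, -50 C offset) to millidegrees Celsius. */
static int wpt_raw_to_millicelsius(int raw)
{
        return raw * 1000 / 2 - 50000;
}

int main(void)
{
        /* raw 148 -> 148 * 500 - 50000 = 24000, i.e. 24.0 C */
        printf("%d\n", wpt_raw_to_millicelsius(148));
        return 0;
}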
index 2ac0c704bcb85b5884365d54485c57383952b81a..6c79588251d59b53e290908a7fd61705c6db3211 100644 (file)
@@ -693,11 +693,14 @@ static const struct x86_cpu_id intel_powerclamp_ids[] __initconst = {
        { X86_VENDOR_INTEL, 6, 0x3f},
        { X86_VENDOR_INTEL, 6, 0x45},
        { X86_VENDOR_INTEL, 6, 0x46},
+       { X86_VENDOR_INTEL, 6, 0x47},
        { X86_VENDOR_INTEL, 6, 0x4c},
        { X86_VENDOR_INTEL, 6, 0x4d},
+       { X86_VENDOR_INTEL, 6, 0x4e},
        { X86_VENDOR_INTEL, 6, 0x4f},
        { X86_VENDOR_INTEL, 6, 0x56},
        { X86_VENDOR_INTEL, 6, 0x57},
+       { X86_VENDOR_INTEL, 6, 0x5e},
        {}
 };
 MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);
index 4434ec812cb70b9d4273c20cb9fd7fa5eea51b83..5ed90e6c8a64337bb968a937b50f8b92d49f07dc 100644 (file)
@@ -186,7 +186,7 @@ static int soc_dts_disable(struct thermal_zone_device *tzd)
        return ret;
 }
 
-static int _get_trip_temp(int trip, unsigned long *temp)
+static int _get_trip_temp(int trip, int *temp)
 {
        int status;
        u32 out;
@@ -212,19 +212,18 @@ static int _get_trip_temp(int trip, unsigned long *temp)
 }
 
 static inline int sys_get_trip_temp(struct thermal_zone_device *tzd,
-                               int trip, unsigned long *temp)
+                               int trip, int *temp)
 {
        return _get_trip_temp(trip, temp);
 }
 
-static inline int sys_get_crit_temp(struct thermal_zone_device *tzd,
-                               unsigned long *temp)
+static inline int sys_get_crit_temp(struct thermal_zone_device *tzd, int *temp)
 {
        return _get_trip_temp(QRK_DTS_ID_TP_CRITICAL, temp);
 }
 
 static int update_trip_temp(struct soc_sensor_entry *aux_entry,
-                               int trip, unsigned long temp)
+                               int trip, int temp)
 {
        u32 out;
        u32 temp_out;
@@ -272,7 +271,7 @@ failed:
 }
 
 static inline int sys_set_trip_temp(struct thermal_zone_device *tzd, int trip,
-                               unsigned long temp)
+                               int temp)
 {
        return update_trip_temp(tzd->devdata, trip, temp);
 }
@@ -289,7 +288,7 @@ static int sys_get_trip_type(struct thermal_zone_device *thermal,
 }
 
 static int sys_get_curr_temp(struct thermal_zone_device *tzd,
-                               unsigned long *temp)
+                               int *temp)
 {
        u32 out;
        int ret;
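A recurring change in this and the following thermal driver diffs is that temperature parameters move from long/unsigned long to plain int: the values are millidegrees Celsius, which fit comfortably in a signed 32-bit int, and an unsigned type cannot represent sub-zero readings at all. A small standalone illustration (not part of the patch; the value is made up):

#include <stdio.h>

int main(void)
{
        int temp = -5500;   /* -5.5 degrees Celsius in millidegrees */

        /* A signed int plus "%d" keeps the sign; storing the same
         * reading in an unsigned long would wrap it to a huge
         * positive number. */
        printf("current temperature: %d mC\n", temp);
        return 0;
}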
index 42e4b6ac38750fd97474637550caafbbf872db04..5841d1d729966b85439f8aa27a51163e057c59ee 100644 (file)
@@ -80,7 +80,7 @@ err_ret:
 }
 
 static int sys_get_trip_temp(struct thermal_zone_device *tzd, int trip,
-                            unsigned long *temp)
+                            int *temp)
 {
        int status;
        u32 out;
@@ -106,7 +106,7 @@ static int sys_get_trip_temp(struct thermal_zone_device *tzd, int trip,
 }
 
 static int update_trip_temp(struct intel_soc_dts_sensor_entry *dts,
-                           int thres_index, unsigned long temp,
+                           int thres_index, int temp,
                            enum thermal_trip_type trip_type)
 {
        int status;
@@ -196,7 +196,7 @@ err_restore_ptps:
 }
 
 static int sys_set_trip_temp(struct thermal_zone_device *tzd, int trip,
-                            unsigned long temp)
+                            int temp)
 {
        struct intel_soc_dts_sensor_entry *dts = tzd->devdata;
        struct intel_soc_dts_sensors *sensors = dts->sensors;
@@ -226,7 +226,7 @@ static int sys_get_trip_type(struct thermal_zone_device *tzd,
 }
 
 static int sys_get_curr_temp(struct thermal_zone_device *tzd,
-                            unsigned long *temp)
+                            int *temp)
 {
        int status;
        u32 out;
index 11041fe63dc2b2ee17505561eebffea50c6331f9..892236621767d851e53b9710d47541bc3445e825 100644 (file)
@@ -33,7 +33,7 @@ struct kirkwood_thermal_priv {
 };
 
 static int kirkwood_get_temp(struct thermal_zone_device *thermal,
-                         unsigned long *temp)
+                         int *temp)
 {
        unsigned long reg;
        struct kirkwood_thermal_priv *priv = thermal->devdata;
index b295b2b6c191ea056f5d4f42dd7974ae31a9e1f6..42b7d4253b9446511150c6b599ac548f82e04f1a 100644 (file)
@@ -91,7 +91,7 @@ struct __thermal_zone {
 /***   DT thermal zone device callbacks   ***/
 
 static int of_thermal_get_temp(struct thermal_zone_device *tz,
-                              unsigned long *temp)
+                              int *temp)
 {
        struct __thermal_zone *data = tz->devdata;
 
@@ -177,7 +177,7 @@ EXPORT_SYMBOL_GPL(of_thermal_get_trip_points);
  * Return: zero on success, error code otherwise
  */
 static int of_thermal_set_emul_temp(struct thermal_zone_device *tz,
-                                   unsigned long temp)
+                                   int temp)
 {
        struct __thermal_zone *data = tz->devdata;
 
@@ -311,7 +311,7 @@ static int of_thermal_get_trip_type(struct thermal_zone_device *tz, int trip,
 }
 
 static int of_thermal_get_trip_temp(struct thermal_zone_device *tz, int trip,
-                                   unsigned long *temp)
+                                   int *temp)
 {
        struct __thermal_zone *data = tz->devdata;
 
@@ -324,7 +324,7 @@ static int of_thermal_get_trip_temp(struct thermal_zone_device *tz, int trip,
 }
 
 static int of_thermal_set_trip_temp(struct thermal_zone_device *tz, int trip,
-                                   unsigned long temp)
+                                   int temp)
 {
        struct __thermal_zone *data = tz->devdata;
 
@@ -338,7 +338,7 @@ static int of_thermal_set_trip_temp(struct thermal_zone_device *tz, int trip,
 }
 
 static int of_thermal_get_trip_hyst(struct thermal_zone_device *tz, int trip,
-                                   unsigned long *hyst)
+                                   int *hyst)
 {
        struct __thermal_zone *data = tz->devdata;
 
@@ -351,7 +351,7 @@ static int of_thermal_get_trip_hyst(struct thermal_zone_device *tz, int trip,
 }
 
 static int of_thermal_set_trip_hyst(struct thermal_zone_device *tz, int trip,
-                                   unsigned long hyst)
+                                   int hyst)
 {
        struct __thermal_zone *data = tz->devdata;
 
@@ -365,7 +365,7 @@ static int of_thermal_set_trip_hyst(struct thermal_zone_device *tz, int trip,
 }
 
 static int of_thermal_get_crit_temp(struct thermal_zone_device *tz,
-                                   unsigned long *temp)
+                                   int *temp)
 {
        struct __thermal_zone *data = tz->devdata;
        int i;
index 251676902869d673140b3684c64b0c63ea376f77..9c8a7aad0252e5de75c862213db2f001f06bf9dd 100644 (file)
@@ -92,8 +92,8 @@ struct power_allocator_params {
  * Return: The power budget for the next period.
  */
 static u32 pid_controller(struct thermal_zone_device *tz,
-                         unsigned long current_temp,
-                         unsigned long control_temp,
+                         int current_temp,
+                         int control_temp,
                          u32 max_allocatable_power)
 {
        s64 p, i, d, power_range;
@@ -102,7 +102,7 @@ static u32 pid_controller(struct thermal_zone_device *tz,
 
        max_power_frac = int_to_frac(max_allocatable_power);
 
-       err = ((s32)control_temp - (s32)current_temp);
+       err = control_temp - current_temp;
        err = int_to_frac(err);
 
        /* Calculate the proportional term */
@@ -223,8 +223,8 @@ static void divvy_up_power(u32 *req_power, u32 *max_power, int num_actors,
 }
 
 static int allocate_power(struct thermal_zone_device *tz,
-                         unsigned long current_temp,
-                         unsigned long control_temp)
+                         int current_temp,
+                         int control_temp)
 {
        struct thermal_instance *instance;
        struct power_allocator_params *params = tz->governor_data;
@@ -331,7 +331,7 @@ static int allocate_power(struct thermal_zone_device *tz,
                                      granted_power, total_granted_power,
                                      num_actors, power_range,
                                      max_allocatable_power, current_temp,
-                                     (s32)control_temp - (s32)current_temp);
+                                     control_temp - current_temp);
 
        kfree(req_power);
 unlock:
@@ -416,7 +416,7 @@ static int power_allocator_bind(struct thermal_zone_device *tz)
 {
        int ret;
        struct power_allocator_params *params;
-       unsigned long switch_on_temp, control_temp;
+       int switch_on_temp, control_temp;
        u32 temperature_threshold;
 
        if (!tz->tzp || !tz->tzp->sustainable_power) {
@@ -481,7 +481,7 @@ static void power_allocator_unbind(struct thermal_zone_device *tz)
 static int power_allocator_throttle(struct thermal_zone_device *tz, int trip)
 {
        int ret;
-       unsigned long switch_on_temp, control_temp, current_temp;
+       int switch_on_temp, control_temp, current_temp;
        struct power_allocator_params *params = tz->governor_data;
 
        /*
index c8d27b8fb9eca1c5ba017fc475c3fbd26bf85580..b677aada5b52860948d6731b0c164573aad4b9f9 100644 (file)
@@ -117,7 +117,7 @@ static int qpnp_tm_update_temp_no_adc(struct qpnp_tm_chip *chip)
        return 0;
 }
 
-static int qpnp_tm_get_temp(void *data, long *temp)
+static int qpnp_tm_get_temp(void *data, int *temp)
 {
        struct qpnp_tm_chip *chip = data;
        int ret, mili_celsius;
index fe4e767018c4cf73afa3c53852b6d48191e2a81e..5d4ae7d705e0024528c8d52d56134bd92e280ea1 100644 (file)
@@ -200,8 +200,7 @@ err_out_unlock:
        return ret;
 }
 
-static int rcar_thermal_get_temp(struct thermal_zone_device *zone,
-                                unsigned long *temp)
+static int rcar_thermal_get_temp(struct thermal_zone_device *zone, int *temp)
 {
        struct rcar_thermal_priv *priv = rcar_zone_to_priv(zone);
 
@@ -235,7 +234,7 @@ static int rcar_thermal_get_trip_type(struct thermal_zone_device *zone,
 }
 
 static int rcar_thermal_get_trip_temp(struct thermal_zone_device *zone,
-                                     int trip, unsigned long *temp)
+                                     int trip, int *temp)
 {
        struct rcar_thermal_priv *priv = rcar_zone_to_priv(zone);
        struct device *dev = rcar_priv_to_dev(priv);
@@ -299,7 +298,7 @@ static void _rcar_thermal_irq_ctrl(struct rcar_thermal_priv *priv, int enable)
 static void rcar_thermal_work(struct work_struct *work)
 {
        struct rcar_thermal_priv *priv;
-       unsigned long cctemp, nctemp;
+       int cctemp, nctemp;
 
        priv = container_of(work, struct rcar_thermal_priv, work.work);
 
index cd8f5f93b42c45aa4cde0f8c4aa346836006f6da..c89ffb26a35434e67f38c7dfef3a6800cb29b83b 100644 (file)
@@ -64,7 +64,7 @@ struct rockchip_tsadc_chip {
        void (*control)(void __iomem *reg, bool on);
 
        /* Per-sensor methods */
-       int (*get_temp)(int chn, void __iomem *reg, long *temp);
+       int (*get_temp)(int chn, void __iomem *reg, int *temp);
        void (*set_tshut_temp)(int chn, void __iomem *reg, long temp);
        void (*set_tshut_mode)(int chn, void __iomem *reg, enum tshut_mode m);
 };
@@ -191,7 +191,7 @@ static u32 rk_tsadcv2_temp_to_code(long temp)
        return 0;
 }
 
-static long rk_tsadcv2_code_to_temp(u32 code)
+static int rk_tsadcv2_code_to_temp(u32 code)
 {
        unsigned int low = 0;
        unsigned int high = ARRAY_SIZE(v2_code_table) - 1;
@@ -277,7 +277,7 @@ static void rk_tsadcv2_control(void __iomem *regs, bool enable)
        writel_relaxed(val, regs + TSADCV2_AUTO_CON);
 }
 
-static int rk_tsadcv2_get_temp(int chn, void __iomem *regs, long *temp)
+static int rk_tsadcv2_get_temp(int chn, void __iomem *regs, int *temp)
 {
        u32 val;
 
@@ -366,7 +366,7 @@ static irqreturn_t rockchip_thermal_alarm_irq_thread(int irq, void *dev)
        return IRQ_HANDLED;
 }
 
-static int rockchip_thermal_get_temp(void *_sensor, long *out_temp)
+static int rockchip_thermal_get_temp(void *_sensor, int *out_temp)
 {
        struct rockchip_thermal_sensor *sensor = _sensor;
        struct rockchip_thermal_data *thermal = sensor->thermal;
@@ -374,7 +374,7 @@ static int rockchip_thermal_get_temp(void *_sensor, long *out_temp)
        int retval;
 
        retval = tsadc->get_temp(sensor->id, thermal->regs, out_temp);
-       dev_dbg(&thermal->pdev->dev, "sensor %d - temp: %ld, retval: %d\n",
+       dev_dbg(&thermal->pdev->dev, "sensor %d - temp: %d, retval: %d\n",
                sensor->id, *out_temp, retval);
 
        return retval;
index c96ff10b869efd941bfe8c32384d48b70b1c348d..0bae8cc6c23a0be622b2addf1479830e838cb243 100644 (file)
@@ -207,8 +207,7 @@ struct exynos_tmu_data {
        int (*tmu_initialize)(struct platform_device *pdev);
        void (*tmu_control)(struct platform_device *pdev, bool on);
        int (*tmu_read)(struct exynos_tmu_data *data);
-       void (*tmu_set_emulation)(struct exynos_tmu_data *data,
-                                 unsigned long temp);
+       void (*tmu_set_emulation)(struct exynos_tmu_data *data, int temp);
        void (*tmu_clear_irqs)(struct exynos_tmu_data *data);
 };
 
@@ -216,7 +215,7 @@ static void exynos_report_trigger(struct exynos_tmu_data *p)
 {
        char data[10], *envp[] = { data, NULL };
        struct thermal_zone_device *tz = p->tzd;
-       unsigned long temp;
+       int temp;
        unsigned int i;
 
        if (!tz) {
@@ -517,7 +516,7 @@ static int exynos5433_tmu_initialize(struct platform_device *pdev)
        struct thermal_zone_device *tz = data->tzd;
        unsigned int status, trim_info;
        unsigned int rising_threshold = 0, falling_threshold = 0;
-       unsigned long temp, temp_hist;
+       int temp, temp_hist;
        int ret = 0, threshold_code, i, sensor_id, cal_type;
 
        status = readb(data->base + EXYNOS_TMU_REG_STATUS);
@@ -610,7 +609,7 @@ static int exynos5440_tmu_initialize(struct platform_device *pdev)
        struct exynos_tmu_data *data = platform_get_drvdata(pdev);
        unsigned int trim_info = 0, con, rising_threshold;
        int ret = 0, threshold_code;
-       unsigned long crit_temp = 0;
+       int crit_temp = 0;
 
        /*
         * For exynos5440 soc triminfo value is swapped between TMU0 and
@@ -663,7 +662,7 @@ static int exynos7_tmu_initialize(struct platform_device *pdev)
        unsigned int status, trim_info;
        unsigned int rising_threshold = 0, falling_threshold = 0;
        int ret = 0, threshold_code, i;
-       unsigned long temp, temp_hist;
+       int temp, temp_hist;
        unsigned int reg_off, bit_off;
 
        status = readb(data->base + EXYNOS_TMU_REG_STATUS);
@@ -876,7 +875,7 @@ static void exynos7_tmu_control(struct platform_device *pdev, bool on)
        writel(con, data->base + EXYNOS_TMU_REG_CONTROL);
 }
 
-static int exynos_get_temp(void *p, long *temp)
+static int exynos_get_temp(void *p, int *temp)
 {
        struct exynos_tmu_data *data = p;
 
@@ -896,7 +895,7 @@ static int exynos_get_temp(void *p, long *temp)
 
 #ifdef CONFIG_THERMAL_EMULATION
 static u32 get_emul_con_reg(struct exynos_tmu_data *data, unsigned int val,
-                           unsigned long temp)
+                           int temp)
 {
        if (temp) {
                temp /= MCELSIUS;
@@ -926,7 +925,7 @@ static u32 get_emul_con_reg(struct exynos_tmu_data *data, unsigned int val,
 }
 
 static void exynos4412_tmu_set_emulation(struct exynos_tmu_data *data,
-                                        unsigned long temp)
+                                        int temp)
 {
        unsigned int val;
        u32 emul_con;
@@ -946,7 +945,7 @@ static void exynos4412_tmu_set_emulation(struct exynos_tmu_data *data,
 }
 
 static void exynos5440_tmu_set_emulation(struct exynos_tmu_data *data,
-                                        unsigned long temp)
+                                        int temp)
 {
        unsigned int val;
 
@@ -955,7 +954,7 @@ static void exynos5440_tmu_set_emulation(struct exynos_tmu_data *data,
        writel(val, data->base + EXYNOS5440_TMU_S0_7_DEBUG);
 }
 
-static int exynos_tmu_set_emulation(void *drv_data, unsigned long temp)
+static int exynos_tmu_set_emulation(void *drv_data, int temp)
 {
        struct exynos_tmu_data *data = drv_data;
        int ret = -EINVAL;
@@ -978,7 +977,7 @@ out:
 #else
 #define exynos4412_tmu_set_emulation NULL
 #define exynos5440_tmu_set_emulation NULL
-static int exynos_tmu_set_emulation(void *drv_data,    unsigned long temp)
+static int exynos_tmu_set_emulation(void *drv_data, int temp)
        { return -EINVAL; }
 #endif /* CONFIG_THERMAL_EMULATION */
 
index bddb71744a6c4d9dbca8cea5e6eb4121f41b7798..534dd913666283fa13eecfaeb8823ea3d55ac0c8 100644 (file)
@@ -38,7 +38,7 @@ struct spear_thermal_dev {
 };
 
 static inline int thermal_get_temp(struct thermal_zone_device *thermal,
-                               unsigned long *temp)
+                               int *temp)
 {
        struct spear_thermal_dev *stdev = thermal->devdata;
 
index 76c515dd802b489116dd73f342520dff8326a67e..be637e6b01d217f9ab54929fbb93322fb5ca086a 100644 (file)
@@ -111,8 +111,7 @@ static int st_thermal_calibration(struct st_thermal_sensor *sensor)
 }
 
 /* Callback to get temperature from HW*/
-static int st_thermal_get_temp(struct thermal_zone_device *th,
-               unsigned long *temperature)
+static int st_thermal_get_temp(struct thermal_zone_device *th, int *temperature)
 {
        struct st_thermal_sensor *sensor = th->devdata;
        struct device *dev = sensor->dev;
@@ -159,7 +158,7 @@ static int st_thermal_get_trip_type(struct thermal_zone_device *th,
 }
 
 static int st_thermal_get_trip_temp(struct thermal_zone_device *th,
-                                   int trip, unsigned long *temp)
+                                   int trip, int *temp)
 {
        struct st_thermal_sensor *sensor = th->devdata;
        struct device *dev = sensor->dev;
@@ -214,7 +213,7 @@ int st_thermal_register(struct platform_device *pdev,
 
        sensor->ops = sensor->cdata->ops;
 
-       ret = sensor->ops->regmap_init(sensor);
+       ret = (sensor->ops->regmap_init)(sensor);
        if (ret)
                return ret;
 
index 5a0f12d08e8b81cc26b71b7e6c065091a098920f..2f9f7086ac3dd0d7a3a71015593fffacf8ee586e 100644 (file)
@@ -113,7 +113,7 @@ static void update_passive_instance(struct thermal_zone_device *tz,
 
 static void thermal_zone_trip_update(struct thermal_zone_device *tz, int trip)
 {
-       long trip_temp;
+       int trip_temp;
        enum thermal_trip_type trip_type;
        enum thermal_trend trend;
        struct thermal_instance *instance;
@@ -135,7 +135,7 @@ static void thermal_zone_trip_update(struct thermal_zone_device *tz, int trip)
                trace_thermal_zone_trip(tz, trip, trip_type);
        }
 
-       dev_dbg(&tz->device, "Trip%d[type=%d,temp=%ld]:trend=%d,throttle=%d\n",
+       dev_dbg(&tz->device, "Trip%d[type=%d,temp=%d]:trend=%d,throttle=%d\n",
                                trip, trip_type, trip_temp, trend, throttle);
 
        mutex_lock(&tz->lock);
index 9197fc05c5cc79abfaff06d9539e5cec9bf09ea5..74ea5765938bb802f5a20dccfa6e3c99d4750b74 100644 (file)
@@ -293,7 +293,7 @@ static int enable_tsensor(struct tegra_soctherm *tegra,
  * H denotes an addition of 0.5 Celsius and N denotes negation
  * of the final value.
  */
-static long translate_temp(u16 val)
+static int translate_temp(u16 val)
 {
        long t;
 
@@ -306,7 +306,7 @@ static long translate_temp(u16 val)
        return t;
 }
 
-static int tegra_thermctl_get_temp(void *data, long *out_temp)
+static int tegra_thermctl_get_temp(void *data, int *out_temp)
 {
        struct tegra_thermctl_zone *zone = data;
        u32 val;
index 4ca211be4c0f197825be94f70be386af1c2cc33d..5e5fc7015c7f8da4ba283287acdae8269733b373 100644 (file)
@@ -426,7 +426,7 @@ static void handle_non_critical_trips(struct thermal_zone_device *tz,
 static void handle_critical_trips(struct thermal_zone_device *tz,
                                int trip, enum thermal_trip_type trip_type)
 {
-       long trip_temp;
+       int trip_temp;
 
        tz->ops->get_trip_temp(tz, trip, &trip_temp);
 
@@ -465,7 +465,7 @@ static void handle_thermal_trip(struct thermal_zone_device *tz, int trip)
 }
 
 /**
- * thermal_zone_get_temp() - returns its the temperature of thermal zone
+ * thermal_zone_get_temp() - returns the temperature of a thermal zone
  * @tz: a valid pointer to a struct thermal_zone_device
  * @temp: a valid pointer to where to store the resulting temperature.
  *
@@ -474,14 +474,12 @@ static void handle_thermal_trip(struct thermal_zone_device *tz, int trip)
  *
  * Return: On success returns 0, an error code otherwise
  */
-int thermal_zone_get_temp(struct thermal_zone_device *tz, unsigned long *temp)
+int thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp)
 {
        int ret = -EINVAL;
-#ifdef CONFIG_THERMAL_EMULATION
        int count;
-       unsigned long crit_temp = -1UL;
+       int crit_temp = INT_MAX;
        enum thermal_trip_type type;
-#endif
 
        if (!tz || IS_ERR(tz) || !tz->ops->get_temp)
                goto exit;
@@ -489,25 +487,26 @@ int thermal_zone_get_temp(struct thermal_zone_device *tz, unsigned long *temp)
        mutex_lock(&tz->lock);
 
        ret = tz->ops->get_temp(tz, temp);
-#ifdef CONFIG_THERMAL_EMULATION
-       if (!tz->emul_temperature)
-               goto skip_emul;
-
-       for (count = 0; count < tz->trips; count++) {
-               ret = tz->ops->get_trip_type(tz, count, &type);
-               if (!ret && type == THERMAL_TRIP_CRITICAL) {
-                       ret = tz->ops->get_trip_temp(tz, count, &crit_temp);
-                       break;
-               }
-       }
 
-       if (ret)
-               goto skip_emul;
+       if (IS_ENABLED(CONFIG_THERMAL_EMULATION) && tz->emul_temperature) {
+               for (count = 0; count < tz->trips; count++) {
+                       ret = tz->ops->get_trip_type(tz, count, &type);
+                       if (!ret && type == THERMAL_TRIP_CRITICAL) {
+                               ret = tz->ops->get_trip_temp(tz, count,
+                                               &crit_temp);
+                               break;
+                       }
+               }
 
-       if (*temp < crit_temp)
-               *temp = tz->emul_temperature;
-skip_emul:
-#endif
+               /*
+                * Only allow emulating a temperature when the real temperature
+                * is below the critical temperature so that the emulation code
+                * cannot hide critical conditions.
+                */
+               if (!ret && *temp < crit_temp)
+                       *temp = tz->emul_temperature;
+       }
        mutex_unlock(&tz->lock);
 exit:
        return ret;
@@ -516,8 +515,7 @@ EXPORT_SYMBOL_GPL(thermal_zone_get_temp);
 
 static void update_temperature(struct thermal_zone_device *tz)
 {
-       long temp;
-       int ret;
+       int temp, ret;
 
        ret = thermal_zone_get_temp(tz, &temp);
        if (ret) {
@@ -577,15 +575,14 @@ static ssize_t
 temp_show(struct device *dev, struct device_attribute *attr, char *buf)
 {
        struct thermal_zone_device *tz = to_thermal_zone(dev);
-       long temperature;
-       int ret;
+       int temperature, ret;
 
        ret = thermal_zone_get_temp(tz, &temperature);
 
        if (ret)
                return ret;
 
-       return sprintf(buf, "%ld\n", temperature);
+       return sprintf(buf, "%d\n", temperature);
 }
 
 static ssize_t
@@ -689,7 +686,7 @@ trip_point_temp_show(struct device *dev, struct device_attribute *attr,
 {
        struct thermal_zone_device *tz = to_thermal_zone(dev);
        int trip, ret;
-       long temperature;
+       int temperature;
 
        if (!tz->ops->get_trip_temp)
                return -EPERM;
@@ -702,7 +699,7 @@ trip_point_temp_show(struct device *dev, struct device_attribute *attr,
        if (ret)
                return ret;
 
-       return sprintf(buf, "%ld\n", temperature);
+       return sprintf(buf, "%d\n", temperature);
 }
 
 static ssize_t
@@ -711,7 +708,7 @@ trip_point_hyst_store(struct device *dev, struct device_attribute *attr,
 {
        struct thermal_zone_device *tz = to_thermal_zone(dev);
        int trip, ret;
-       unsigned long temperature;
+       int temperature;
 
        if (!tz->ops->set_trip_hyst)
                return -EPERM;
@@ -719,7 +716,7 @@ trip_point_hyst_store(struct device *dev, struct device_attribute *attr,
        if (!sscanf(attr->attr.name, "trip_point_%d_hyst", &trip))
                return -EINVAL;
 
-       if (kstrtoul(buf, 10, &temperature))
+       if (kstrtoint(buf, 10, &temperature))
                return -EINVAL;
 
        /*
@@ -738,7 +735,7 @@ trip_point_hyst_show(struct device *dev, struct device_attribute *attr,
 {
        struct thermal_zone_device *tz = to_thermal_zone(dev);
        int trip, ret;
-       unsigned long temperature;
+       int temperature;
 
        if (!tz->ops->get_trip_hyst)
                return -EPERM;
@@ -748,7 +745,7 @@ trip_point_hyst_show(struct device *dev, struct device_attribute *attr,
 
        ret = tz->ops->get_trip_hyst(tz, trip, &temperature);
 
-       return ret ? ret : sprintf(buf, "%ld\n", temperature);
+       return ret ? ret : sprintf(buf, "%d\n", temperature);
 }
 
 static ssize_t
@@ -847,7 +844,27 @@ policy_show(struct device *dev, struct device_attribute *devattr, char *buf)
        return sprintf(buf, "%s\n", tz->governor->name);
 }
 
-#ifdef CONFIG_THERMAL_EMULATION
+static ssize_t
+available_policies_show(struct device *dev, struct device_attribute *devattr,
+                       char *buf)
+{
+       struct thermal_governor *pos;
+       ssize_t count = 0;
+       ssize_t size = PAGE_SIZE;
+
+       mutex_lock(&thermal_governor_lock);
+
+       list_for_each_entry(pos, &thermal_governor_list, governor_list) {
+               size = PAGE_SIZE - count;
+               count += scnprintf(buf + count, size, "%s ", pos->name);
+       }
+       count += scnprintf(buf + count, size, "\n");
+
+       mutex_unlock(&thermal_governor_lock);
+
+       return count;
+}
+
 static ssize_t
 emul_temp_store(struct device *dev, struct device_attribute *attr,
                     const char *buf, size_t count)
@@ -873,7 +890,6 @@ emul_temp_store(struct device *dev, struct device_attribute *attr,
        return ret ? ret : count;
 }
 static DEVICE_ATTR(emul_temp, S_IWUSR, NULL, emul_temp_store);
-#endif/*CONFIG_THERMAL_EMULATION*/
 
 static ssize_t
 sustainable_power_show(struct device *dev, struct device_attribute *devattr,
@@ -1032,6 +1048,7 @@ static DEVICE_ATTR(temp, 0444, temp_show, NULL);
 static DEVICE_ATTR(mode, 0644, mode_show, mode_store);
 static DEVICE_ATTR(passive, S_IRUGO | S_IWUSR, passive_show, passive_store);
 static DEVICE_ATTR(policy, S_IRUGO | S_IWUSR, policy_show, policy_store);
+static DEVICE_ATTR(available_policies, S_IRUGO, available_policies_show, NULL);
 
 /* sys I/F for cooling device */
 #define to_cooling_device(_dev)        \
@@ -1803,11 +1820,12 @@ struct thermal_zone_device *thermal_zone_device_register(const char *type,
                        goto unregister;
        }
 
-#ifdef CONFIG_THERMAL_EMULATION
-       result = device_create_file(&tz->device, &dev_attr_emul_temp);
-       if (result)
-               goto unregister;
-#endif
+       if (IS_ENABLED(CONFIG_THERMAL_EMULATION)) {
+               result = device_create_file(&tz->device, &dev_attr_emul_temp);
+               if (result)
+                       goto unregister;
+       }
+
        /* Create policy attribute */
        result = device_create_file(&tz->device, &dev_attr_policy);
        if (result)
@@ -1818,6 +1836,11 @@ struct thermal_zone_device *thermal_zone_device_register(const char *type,
        if (result)
                goto unregister;
 
+       /* Create available_policies attribute */
+       result = device_create_file(&tz->device, &dev_attr_available_policies);
+       if (result)
+               goto unregister;
+
        /* Update 'this' zone's governor information */
        mutex_lock(&thermal_governor_lock);
 
@@ -1849,9 +1872,6 @@ struct thermal_zone_device *thermal_zone_device_register(const char *type,
 
        INIT_DELAYED_WORK(&(tz->poll_queue), thermal_zone_device_check);
 
-       if (!tz->ops->get_temp)
-               thermal_zone_device_set_polling(tz, 0);
-
        thermal_zone_device_update(tz);
 
        return tz;
@@ -1918,6 +1938,7 @@ void thermal_zone_device_unregister(struct thermal_zone_device *tz)
        if (tz->ops->get_mode)
                device_remove_file(&tz->device, &dev_attr_mode);
        device_remove_file(&tz->device, &dev_attr_policy);
+       device_remove_file(&tz->device, &dev_attr_available_policies);
        remove_trip_attrs(tz);
        thermal_set_governor(tz, NULL);
 
index 1967bee4f07686de6c091e28797ea27039c2996b..06fd2ed9ef9d13bf0ab09f727020f873152b8da9 100644 (file)
@@ -69,7 +69,7 @@ static DEVICE_ATTR(name, 0444, name_show, NULL);
 static ssize_t
 temp_input_show(struct device *dev, struct device_attribute *attr, char *buf)
 {
-       long temperature;
+       int temperature;
        int ret;
        struct thermal_hwmon_attr *hwmon_attr
                        = container_of(attr, struct thermal_hwmon_attr, attr);
@@ -83,7 +83,7 @@ temp_input_show(struct device *dev, struct device_attribute *attr, char *buf)
        if (ret)
                return ret;
 
-       return sprintf(buf, "%ld\n", temperature);
+       return sprintf(buf, "%d\n", temperature);
 }
 
 static ssize_t
@@ -95,14 +95,14 @@ temp_crit_show(struct device *dev, struct device_attribute *attr, char *buf)
                        = container_of(hwmon_attr, struct thermal_hwmon_temp,
                                       temp_crit);
        struct thermal_zone_device *tz = temp->tz;
-       long temperature;
+       int temperature;
        int ret;
 
        ret = tz->ops->get_trip_temp(tz, 0, &temperature);
        if (ret)
                return ret;
 
-       return sprintf(buf, "%ld\n", temperature);
+       return sprintf(buf, "%d\n", temperature);
 }
 
 
@@ -142,7 +142,7 @@ thermal_hwmon_lookup_temp(const struct thermal_hwmon_device *hwmon,
 
 static bool thermal_zone_crit_temp_valid(struct thermal_zone_device *tz)
 {
-       unsigned long temp;
+       int temp;
        return tz->ops->get_crit_temp && !tz->ops->get_crit_temp(tz, &temp);
 }
 
index c7c5b3779dacc28bb40e0d86bb3d49eaa1e66154..b213a12222956185677e11208224514a4468217c 100644 (file)
@@ -76,14 +76,14 @@ static inline int ti_thermal_hotspot_temperature(int t, int s, int c)
 
 /* thermal zone ops */
 /* Get temperature callback function for thermal zone */
-static inline int __ti_thermal_get_temp(void *devdata, long *temp)
+static inline int __ti_thermal_get_temp(void *devdata, int *temp)
 {
        struct thermal_zone_device *pcb_tz = NULL;
        struct ti_thermal_data *data = devdata;
        struct ti_bandgap *bgp;
        const struct ti_temp_sensor *s;
        int ret, tmp, slope, constant;
-       unsigned long pcb_temp;
+       int pcb_temp;
 
        if (!data)
                return 0;
@@ -119,7 +119,7 @@ static inline int __ti_thermal_get_temp(void *devdata, long *temp)
 }
 
 static inline int ti_thermal_get_temp(struct thermal_zone_device *thermal,
-                                     unsigned long *temp)
+                                     int *temp)
 {
        struct ti_thermal_data *data = thermal->devdata;
 
@@ -229,7 +229,7 @@ static int ti_thermal_get_trip_type(struct thermal_zone_device *thermal,
 
 /* Get trip temperature callback functions for thermal zone */
 static int ti_thermal_get_trip_temp(struct thermal_zone_device *thermal,
-                                   int trip, unsigned long *temp)
+                                   int trip, int *temp)
 {
        if (!ti_thermal_is_valid_trip(trip))
                return -EINVAL;
@@ -280,7 +280,7 @@ static int ti_thermal_get_trend(struct thermal_zone_device *thermal,
 
 /* Get critical temperature callback functions for thermal zone */
 static int ti_thermal_get_crit_temp(struct thermal_zone_device *thermal,
-                                   unsigned long *temp)
+                                   int *temp)
 {
        /* shutdown zone */
        return ti_thermal_get_trip_temp(thermal, OMAP_TRIP_NUMBER - 1, temp);
index 50d1d2cb091a538b02753f81c2a5909a1f8be9d9..7fc919f7da4de1878c7617c0eb46e6a7950b06b6 100644 (file)
@@ -164,7 +164,7 @@ err_ret:
        return err;
 }
 
-static int sys_get_curr_temp(struct thermal_zone_device *tzd, unsigned long *temp)
+static int sys_get_curr_temp(struct thermal_zone_device *tzd, int *temp)
 {
        u32 eax, edx;
        struct phy_dev_entry *phy_dev_entry;
@@ -175,7 +175,7 @@ static int sys_get_curr_temp(struct thermal_zone_device *tzd, unsigned long *tem
        if (eax & 0x80000000) {
                *temp = phy_dev_entry->tj_max -
                                ((eax >> 16) & 0x7f) * 1000;
-               pr_debug("sys_get_curr_temp %ld\n", *temp);
+               pr_debug("sys_get_curr_temp %d\n", *temp);
                return 0;
        }
 
@@ -183,7 +183,7 @@ static int sys_get_curr_temp(struct thermal_zone_device *tzd, unsigned long *tem
 }
 
 static int sys_get_trip_temp(struct thermal_zone_device *tzd,
-               int trip, unsigned long *temp)
+               int trip, int *temp)
 {
        u32 eax, edx;
        struct phy_dev_entry *phy_dev_entry;
@@ -214,13 +214,13 @@ static int sys_get_trip_temp(struct thermal_zone_device *tzd,
                *temp = phy_dev_entry->tj_max - thres_reg_value * 1000;
        else
                *temp = 0;
-       pr_debug("sys_get_trip_temp %ld\n", *temp);
+       pr_debug("sys_get_trip_temp %d\n", *temp);
 
        return 0;
 }
 
 static int sys_set_trip_temp(struct thermal_zone_device *tzd, int trip,
-                                                       unsigned long temp)
+                                                       int temp)
 {
        u32 l, h;
        struct phy_dev_entry *phy_dev_entry;
index a9d837f83ce832539a442643f10ec4221d0fa117..10beb1589d8340fb0d0c5965a53cca5abf0b75f7 100644 (file)
@@ -200,7 +200,7 @@ static int xen_hvm_console_init(void)
 {
        int r;
        uint64_t v = 0;
-       unsigned long mfn;
+       unsigned long gfn;
        struct xencons_info *info;
 
        if (!xen_hvm_domain())
@@ -217,7 +217,7 @@ static int xen_hvm_console_init(void)
        }
        /*
         * If the toolstack (or the hypervisor) hasn't set these values, the
-        * default value is 0. Even though mfn = 0 and evtchn = 0 are
+        * default value is 0. Even though gfn = 0 and evtchn = 0 are
         * theoretically correct values, in practice they never are and they
         * mean that a legacy toolstack hasn't initialized the pv console correctly.
         */
@@ -229,8 +229,8 @@ static int xen_hvm_console_init(void)
        r = hvm_get_parameter(HVM_PARAM_CONSOLE_PFN, &v);
        if (r < 0 || v == 0)
                goto err;
-       mfn = v;
-       info->intf = xen_remap(mfn << PAGE_SHIFT, PAGE_SIZE);
+       gfn = v;
+       info->intf = xen_remap(gfn << PAGE_SHIFT, PAGE_SIZE);
        if (info->intf == NULL)
                goto err;
        info->vtermno = HVC_COOKIE;
@@ -265,7 +265,8 @@ static int xen_pv_console_init(void)
                return 0;
        }
        info->evtchn = xen_start_info->console.domU.evtchn;
-       info->intf = mfn_to_virt(xen_start_info->console.domU.mfn);
+       /* GFN == MFN for PV guest */
+       info->intf = gfn_to_virt(xen_start_info->console.domU.mfn);
        info->vtermno = HVC_COOKIE;
 
        spin_lock(&xencons_lock);
@@ -374,7 +375,6 @@ static int xencons_connect_backend(struct xenbus_device *dev,
        int ret, evtchn, devid, ref, irq;
        struct xenbus_transaction xbt;
        grant_ref_t gref_head;
-       unsigned long mfn;
 
        ret = xenbus_alloc_evtchn(dev, &evtchn);
        if (ret)
@@ -389,10 +389,6 @@ static int xencons_connect_backend(struct xenbus_device *dev,
                        irq, &domU_hvc_ops, 256);
        if (IS_ERR(info->hvc))
                return PTR_ERR(info->hvc);
-       if (xen_pv_domain())
-               mfn = virt_to_mfn(info->intf);
-       else
-               mfn = __pa(info->intf) >> PAGE_SHIFT;
        ret = gnttab_alloc_grant_references(1, &gref_head);
        if (ret < 0)
                return ret;
@@ -401,7 +397,7 @@ static int xencons_connect_backend(struct xenbus_device *dev,
        if (ref < 0)
                return ref;
        gnttab_grant_foreign_access_ref(ref, info->xbdev->otherend_id,
-                       mfn, 0);
+                                       virt_to_gfn(info->intf), 0);
 
  again:
        ret = xenbus_transaction_start(&xbt);
index cfbb9d728e317fae55daa324ab5d8f780cb023f0..271d121376490042e87bda6eeb06d26833d6c0d6 100644 (file)
 #include <linux/slab.h>
 #include <linux/uaccess.h>
 #include <linux/pm_runtime.h>
+#include <linux/io.h>
 #ifdef CONFIG_SPARC
 #include <linux/sunserialcore.h>
 #endif
 
-#include <asm/io.h>
 #include <asm/irq.h>
 
 #include "8250.h"
index ed299b9e63752b4ec9d319a7e989fe3307433a77..687b1ea294b79bbf999c4c86747ee2df7ba30df5 100644 (file)
@@ -47,12 +47,12 @@ config SERIAL_AMBA_PL010_CONSOLE
 
 config SERIAL_AMBA_PL011
        tristate "ARM AMBA PL011 serial port support"
-       depends on ARM_AMBA || SOC_ZX296702
+       depends on ARM_AMBA
        select SERIAL_CORE
        help
          This selects the ARM(R) AMBA(R) PrimeCell PL011 UART.  If you have
          an Integrator/PP2, Integrator/CP or Versatile platform, say Y or M
-         here. Say Y or M if you have SOC_ZX296702.
+         here.
 
          If unsure, say N.
 
index 2af09ab153b6db628c09bf248219a65a0e40387d..fd27e986b1dd3437dfd2560ec11efd8def7bf254 100644 (file)
 /* There is by now at least one vendor with differing details, so handle it */
 struct vendor_data {
        unsigned int            ifls;
-       unsigned int            fr_busy;
-       unsigned int            fr_dsr;
-       unsigned int            fr_cts;
-       unsigned int            fr_ri;
        unsigned int            lcrh_tx;
        unsigned int            lcrh_rx;
-       u16                     *reg_lut;
        bool                    oversampling;
        bool                    dma_threshold;
        bool                    cts_event_workaround;
@@ -90,48 +85,6 @@ struct vendor_data {
        unsigned int (*get_fifosize)(struct amba_device *dev);
 };
 
-/* Max address offset of register in use is 0x48 */
-#define REG_NR         (0x48 >> 2)
-#define IDX(x)         (x >> 2)
-enum reg_idx {
-       REG_DR          = IDX(UART01x_DR),
-       REG_RSR         = IDX(UART01x_RSR),
-       REG_ST_DMAWM    = IDX(ST_UART011_DMAWM),
-       REG_FR          = IDX(UART01x_FR),
-       REG_ST_LCRH_RX  = IDX(ST_UART011_LCRH_RX),
-       REG_ILPR        = IDX(UART01x_ILPR),
-       REG_IBRD        = IDX(UART011_IBRD),
-       REG_FBRD        = IDX(UART011_FBRD),
-       REG_LCRH        = IDX(UART011_LCRH),
-       REG_CR          = IDX(UART011_CR),
-       REG_IFLS        = IDX(UART011_IFLS),
-       REG_IMSC        = IDX(UART011_IMSC),
-       REG_RIS         = IDX(UART011_RIS),
-       REG_MIS         = IDX(UART011_MIS),
-       REG_ICR         = IDX(UART011_ICR),
-       REG_DMACR       = IDX(UART011_DMACR),
-};
-
-static u16 arm_reg[] = {
-       [REG_DR]                = UART01x_DR,
-       [REG_RSR]               = UART01x_RSR,
-       [REG_ST_DMAWM]          = ~0,
-       [REG_FR]                = UART01x_FR,
-       [REG_ST_LCRH_RX]        = ~0,
-       [REG_ILPR]              = UART01x_ILPR,
-       [REG_IBRD]              = UART011_IBRD,
-       [REG_FBRD]              = UART011_FBRD,
-       [REG_LCRH]              = UART011_LCRH,
-       [REG_CR]                = UART011_CR,
-       [REG_IFLS]              = UART011_IFLS,
-       [REG_IMSC]              = UART011_IMSC,
-       [REG_RIS]               = UART011_RIS,
-       [REG_MIS]               = UART011_MIS,
-       [REG_ICR]               = UART011_ICR,
-       [REG_DMACR]             = UART011_DMACR,
-};
-
-#ifdef CONFIG_ARM_AMBA
 static unsigned int get_fifosize_arm(struct amba_device *dev)
 {
        return amba_rev(dev) < 3 ? 16 : 32;
@@ -139,13 +92,8 @@ static unsigned int get_fifosize_arm(struct amba_device *dev)
 
 static struct vendor_data vendor_arm = {
        .ifls                   = UART011_IFLS_RX4_8|UART011_IFLS_TX4_8,
-       .fr_busy                = UART01x_FR_BUSY,
-       .fr_dsr                 = UART01x_FR_DSR,
-       .fr_cts                 = UART01x_FR_CTS,
-       .fr_ri                  = UART011_FR_RI,
-       .lcrh_tx                = REG_LCRH,
-       .lcrh_rx                = REG_LCRH,
-       .reg_lut                = arm_reg,
+       .lcrh_tx                = UART011_LCRH,
+       .lcrh_rx                = UART011_LCRH,
        .oversampling           = false,
        .dma_threshold          = false,
        .cts_event_workaround   = false,
@@ -153,14 +101,8 @@ static struct vendor_data vendor_arm = {
        .fixed_options          = false,
        .get_fifosize           = get_fifosize_arm,
 };
-#endif
 
 static struct vendor_data vendor_sbsa = {
-       .fr_busy                = UART01x_FR_BUSY,
-       .fr_dsr                 = UART01x_FR_DSR,
-       .fr_cts                 = UART01x_FR_CTS,
-       .fr_ri                  = UART011_FR_RI,
-       .reg_lut                = arm_reg,
        .oversampling           = false,
        .dma_threshold          = false,
        .cts_event_workaround   = false,
@@ -168,26 +110,6 @@ static struct vendor_data vendor_sbsa = {
        .fixed_options          = true,
 };
 
-#ifdef CONFIG_ARM_AMBA
-static u16 st_reg[] = {
-       [REG_DR]                = UART01x_DR,
-       [REG_RSR]               = UART01x_RSR,
-       [REG_ST_DMAWM]          = ST_UART011_DMAWM,
-       [REG_FR]                = UART01x_FR,
-       [REG_ST_LCRH_RX]        = ST_UART011_LCRH_RX,
-       [REG_ILPR]              = UART01x_ILPR,
-       [REG_IBRD]              = UART011_IBRD,
-       [REG_FBRD]              = UART011_FBRD,
-       [REG_LCRH]              = UART011_LCRH,
-       [REG_CR]                = UART011_CR,
-       [REG_IFLS]              = UART011_IFLS,
-       [REG_IMSC]              = UART011_IMSC,
-       [REG_RIS]               = UART011_RIS,
-       [REG_MIS]               = UART011_MIS,
-       [REG_ICR]               = UART011_ICR,
-       [REG_DMACR]             = UART011_DMACR,
-};
-
 static unsigned int get_fifosize_st(struct amba_device *dev)
 {
        return 64;
@@ -195,13 +117,8 @@ static unsigned int get_fifosize_st(struct amba_device *dev)
 
 static struct vendor_data vendor_st = {
        .ifls                   = UART011_IFLS_RX_HALF|UART011_IFLS_TX_HALF,
-       .fr_busy                = UART01x_FR_BUSY,
-       .fr_dsr                 = UART01x_FR_DSR,
-       .fr_cts                 = UART01x_FR_CTS,
-       .fr_ri                  = UART011_FR_RI,
-       .lcrh_tx                = REG_LCRH,
-       .lcrh_rx                = REG_ST_LCRH_RX,
-       .reg_lut                = st_reg,
+       .lcrh_tx                = ST_UART011_LCRH_TX,
+       .lcrh_rx                = ST_UART011_LCRH_RX,
        .oversampling           = true,
        .dma_threshold          = true,
        .cts_event_workaround   = true,
@@ -209,43 +126,6 @@ static struct vendor_data vendor_st = {
        .fixed_options          = false,
        .get_fifosize           = get_fifosize_st,
 };
-#endif
-
-#ifdef CONFIG_SOC_ZX296702
-static u16 zte_reg[] = {
-       [REG_DR]                = ZX_UART01x_DR,
-       [REG_RSR]               = UART01x_RSR,
-       [REG_ST_DMAWM]          = ST_UART011_DMAWM,
-       [REG_FR]                = ZX_UART01x_FR,
-       [REG_ST_LCRH_RX]        = ST_UART011_LCRH_RX,
-       [REG_ILPR]              = UART01x_ILPR,
-       [REG_IBRD]              = UART011_IBRD,
-       [REG_FBRD]              = UART011_FBRD,
-       [REG_LCRH]              = ZX_UART011_LCRH_TX,
-       [REG_CR]                = ZX_UART011_CR,
-       [REG_IFLS]              = ZX_UART011_IFLS,
-       [REG_IMSC]              = ZX_UART011_IMSC,
-       [REG_RIS]               = ZX_UART011_RIS,
-       [REG_MIS]               = ZX_UART011_MIS,
-       [REG_ICR]               = ZX_UART011_ICR,
-       [REG_DMACR]             = ZX_UART011_DMACR,
-};
-
-static struct vendor_data vendor_zte = {
-       .ifls                   = UART011_IFLS_RX4_8|UART011_IFLS_TX4_8,
-       .fr_busy                = ZX_UART01x_FR_BUSY,
-       .fr_dsr                 = ZX_UART01x_FR_DSR,
-       .fr_cts                 = ZX_UART01x_FR_CTS,
-       .fr_ri                  = ZX_UART011_FR_RI,
-       .lcrh_tx                = REG_LCRH,
-       .lcrh_rx                = REG_ST_LCRH_RX,
-       .reg_lut                = zte_reg,
-       .oversampling           = false,
-       .dma_threshold          = false,
-       .cts_event_workaround   = false,
-       .fixed_options          = false,
-};
-#endif
 
 /* Deals with DMA transactions */
 
@@ -284,15 +164,10 @@ struct uart_amba_port {
        struct uart_port        port;
        struct clk              *clk;
        const struct vendor_data *vendor;
-       u16                     *reg_lut;
        unsigned int            dmacr;          /* dma control reg */
        unsigned int            im;             /* interrupt mask */
        unsigned int            old_status;
        unsigned int            fifosize;       /* vendor-specific */
-       unsigned int            fr_busy;        /* vendor-specific */
-       unsigned int            fr_dsr;         /* vendor-specific */
-       unsigned int            fr_cts;         /* vendor-specific */
-       unsigned int            fr_ri;          /* vendor-specific */
        unsigned int            lcrh_tx;        /* vendor-specific */
        unsigned int            lcrh_rx;        /* vendor-specific */
        unsigned int            old_cr;         /* state during shutdown */
@@ -309,29 +184,6 @@ struct uart_amba_port {
 #endif
 };
 
-static bool is_implemented(struct uart_amba_port *uap, unsigned int reg)
-{
-       return uap->reg_lut[reg] != (u16)~0;
-}
-
-static unsigned int pl011_readw(struct uart_amba_port *uap, int index)
-{
-       WARN_ON(index > REG_NR);
-       return readw_relaxed(uap->port.membase + uap->reg_lut[index]);
-}
-
-static void pl011_writew(struct uart_amba_port *uap, int val, int index)
-{
-       WARN_ON(index > REG_NR);
-       writew_relaxed(val, uap->port.membase + uap->reg_lut[index]);
-}
-
-static void pl011_writeb(struct uart_amba_port *uap, u8 val, int index)
-{
-       WARN_ON(index > REG_NR);
-       writeb_relaxed(val, uap->port.membase + uap->reg_lut[index]);
-}
-
 /*
  * Reads up to 256 characters from the FIFO or until it's empty and
  * inserts them into the TTY layer. Returns the number of characters
@@ -344,12 +196,12 @@ static int pl011_fifo_to_tty(struct uart_amba_port *uap)
        int fifotaken = 0;
 
        while (max_count--) {
-               status = pl011_readw(uap, REG_FR);
+               status = readw(uap->port.membase + UART01x_FR);
                if (status & UART01x_FR_RXFE)
                        break;
 
                /* Take chars from the FIFO and update status */
-               ch = pl011_readw(uap, REG_DR) |
+               ch = readw(uap->port.membase + UART01x_DR) |
                        UART_DUMMY_DR_RX;
                flag = TTY_NORMAL;
                uap->port.icount.rx++;
@@ -432,7 +284,7 @@ static void pl011_dma_probe(struct uart_amba_port *uap)
        struct amba_pl011_data *plat = dev_get_platdata(uap->port.dev);
        struct device *dev = uap->port.dev;
        struct dma_slave_config tx_conf = {
-               .dst_addr = uap->port.mapbase + uap->reg_lut[REG_DR],
+               .dst_addr = uap->port.mapbase + UART01x_DR,
                .dst_addr_width = DMA_SLAVE_BUSWIDTH_1_BYTE,
                .direction = DMA_MEM_TO_DEV,
                .dst_maxburst = uap->fifosize >> 1,
@@ -487,7 +339,7 @@ static void pl011_dma_probe(struct uart_amba_port *uap)
 
        if (chan) {
                struct dma_slave_config rx_conf = {
-                       .src_addr = uap->port.mapbase + uap->reg_lut[REG_DR],
+                       .src_addr = uap->port.mapbase + UART01x_DR,
                        .src_addr_width = DMA_SLAVE_BUSWIDTH_1_BYTE,
                        .direction = DMA_DEV_TO_MEM,
                        .src_maxburst = uap->fifosize >> 2,
@@ -586,7 +438,7 @@ static void pl011_dma_tx_callback(void *data)
 
        dmacr = uap->dmacr;
        uap->dmacr = dmacr & ~UART011_TXDMAE;
-       pl011_writew(uap, uap->dmacr, REG_DMACR);
+       writew(uap->dmacr, uap->port.membase + UART011_DMACR);
 
        /*
         * If TX DMA was disabled, it means that we've stopped the DMA for
@@ -700,7 +552,7 @@ static int pl011_dma_tx_refill(struct uart_amba_port *uap)
        dma_dev->device_issue_pending(chan);
 
        uap->dmacr |= UART011_TXDMAE;
-       pl011_writew(uap, uap->dmacr, REG_DMACR);
+       writew(uap->dmacr, uap->port.membase + UART011_DMACR);
        uap->dmatx.queued = true;
 
        /*
@@ -736,9 +588,9 @@ static bool pl011_dma_tx_irq(struct uart_amba_port *uap)
         */
        if (uap->dmatx.queued) {
                uap->dmacr |= UART011_TXDMAE;
-               pl011_writew(uap, uap->dmacr, REG_DMACR);
+               writew(uap->dmacr, uap->port.membase + UART011_DMACR);
                uap->im &= ~UART011_TXIM;
-               pl011_writew(uap, uap->im, REG_IMSC);
+               writew(uap->im, uap->port.membase + UART011_IMSC);
                return true;
        }
 
@@ -748,7 +600,7 @@ static bool pl011_dma_tx_irq(struct uart_amba_port *uap)
         */
        if (pl011_dma_tx_refill(uap) > 0) {
                uap->im &= ~UART011_TXIM;
-               pl011_writew(uap, uap->im, REG_IMSC);
+               writew(uap->im, uap->port.membase + UART011_IMSC);
                return true;
        }
        return false;
@@ -762,7 +614,7 @@ static inline void pl011_dma_tx_stop(struct uart_amba_port *uap)
 {
        if (uap->dmatx.queued) {
                uap->dmacr &= ~UART011_TXDMAE;
-               pl011_writew(uap, uap->dmacr, REG_DMACR);
+               writew(uap->dmacr, uap->port.membase + UART011_DMACR);
        }
 }
 
@@ -788,12 +640,14 @@ static inline bool pl011_dma_tx_start(struct uart_amba_port *uap)
                if (!uap->dmatx.queued) {
                        if (pl011_dma_tx_refill(uap) > 0) {
                                uap->im &= ~UART011_TXIM;
-                               pl011_writew(uap, uap->im, REG_IMSC);
+                               writew(uap->im, uap->port.membase +
+                                      UART011_IMSC);
                        } else
                                ret = false;
                } else if (!(uap->dmacr & UART011_TXDMAE)) {
                        uap->dmacr |= UART011_TXDMAE;
-                       pl011_writew(uap, uap->dmacr, REG_DMACR);
+                       writew(uap->dmacr,
+                                      uap->port.membase + UART011_DMACR);
                }
                return ret;
        }
@@ -804,9 +658,9 @@ static inline bool pl011_dma_tx_start(struct uart_amba_port *uap)
         */
        dmacr = uap->dmacr;
        uap->dmacr &= ~UART011_TXDMAE;
-       pl011_writew(uap, uap->dmacr, REG_DMACR);
+       writew(uap->dmacr, uap->port.membase + UART011_DMACR);
 
-       if (pl011_readw(uap, REG_FR) & UART01x_FR_TXFF) {
+       if (readw(uap->port.membase + UART01x_FR) & UART01x_FR_TXFF) {
                /*
                 * No space in the FIFO, so enable the transmit interrupt
                 * so we know when there is space.  Note that once we've
@@ -815,13 +669,13 @@ static inline bool pl011_dma_tx_start(struct uart_amba_port *uap)
                return false;
        }
 
-       pl011_writew(uap, uap->port.x_char, REG_DR);
+       writew(uap->port.x_char, uap->port.membase + UART01x_DR);
        uap->port.icount.tx++;
        uap->port.x_char = 0;
 
        /* Success - restore the DMA state */
        uap->dmacr = dmacr;
-       pl011_writew(uap, dmacr, REG_DMACR);
+       writew(dmacr, uap->port.membase + UART011_DMACR);
 
        return true;
 }
@@ -849,7 +703,7 @@ __acquires(&uap->port.lock)
                             DMA_TO_DEVICE);
                uap->dmatx.queued = false;
                uap->dmacr &= ~UART011_TXDMAE;
-               pl011_writew(uap, uap->dmacr, REG_DMACR);
+               writew(uap->dmacr, uap->port.membase + UART011_DMACR);
        }
 }
 
@@ -889,11 +743,11 @@ static int pl011_dma_rx_trigger_dma(struct uart_amba_port *uap)
        dma_async_issue_pending(rxchan);
 
        uap->dmacr |= UART011_RXDMAE;
-       pl011_writew(uap, uap->dmacr, REG_DMACR);
+       writew(uap->dmacr, uap->port.membase + UART011_DMACR);
        uap->dmarx.running = true;
 
        uap->im &= ~UART011_RXIM;
-       pl011_writew(uap, uap->im, REG_IMSC);
+       writew(uap->im, uap->port.membase + UART011_IMSC);
 
        return 0;
 }
@@ -951,9 +805,8 @@ static void pl011_dma_rx_chars(struct uart_amba_port *uap,
         */
        if (dma_count == pending && readfifo) {
                /* Clear any error flags */
-               pl011_writew(uap,
-                            UART011_OEIS | UART011_BEIS | UART011_PEIS
-                            | UART011_FEIS, REG_ICR);
+               writew(UART011_OEIS | UART011_BEIS | UART011_PEIS | UART011_FEIS,
+                      uap->port.membase + UART011_ICR);
 
                /*
                 * If we read all the DMA'd characters, and we had an
@@ -1001,7 +854,7 @@ static void pl011_dma_rx_irq(struct uart_amba_port *uap)
 
        /* Disable RX DMA - incoming data will wait in the FIFO */
        uap->dmacr &= ~UART011_RXDMAE;
-       pl011_writew(uap, uap->dmacr, REG_DMACR);
+       writew(uap->dmacr, uap->port.membase + UART011_DMACR);
        uap->dmarx.running = false;
 
        pending = sgbuf->sg.length - state.residue;
@@ -1021,7 +874,7 @@ static void pl011_dma_rx_irq(struct uart_amba_port *uap)
                dev_dbg(uap->port.dev, "could not retrigger RX DMA job "
                        "fall back to interrupt mode\n");
                uap->im |= UART011_RXIM;
-               pl011_writew(uap, uap->im, REG_IMSC);
+               writew(uap->im, uap->port.membase + UART011_IMSC);
        }
 }
 
@@ -1069,7 +922,7 @@ static void pl011_dma_rx_callback(void *data)
                dev_dbg(uap->port.dev, "could not retrigger RX DMA job "
                        "fall back to interrupt mode\n");
                uap->im |= UART011_RXIM;
-               pl011_writew(uap, uap->im, REG_IMSC);
+               writew(uap->im, uap->port.membase + UART011_IMSC);
        }
 }
 
@@ -1082,7 +935,7 @@ static inline void pl011_dma_rx_stop(struct uart_amba_port *uap)
 {
        /* FIXME.  Just disable the DMA enable */
        uap->dmacr &= ~UART011_RXDMAE;
-       pl011_writew(uap, uap->dmacr, REG_DMACR);
+       writew(uap->dmacr, uap->port.membase + UART011_DMACR);
 }
 
 /*
@@ -1126,7 +979,7 @@ static void pl011_dma_rx_poll(unsigned long args)
                spin_lock_irqsave(&uap->port.lock, flags);
                pl011_dma_rx_stop(uap);
                uap->im |= UART011_RXIM;
-               pl011_writew(uap, uap->im, REG_IMSC);
+               writew(uap->im, uap->port.membase + UART011_IMSC);
                spin_unlock_irqrestore(&uap->port.lock, flags);
 
                uap->dmarx.running = false;
@@ -1188,7 +1041,7 @@ static void pl011_dma_startup(struct uart_amba_port *uap)
 skip_rx:
        /* Turn on DMA error (RX/TX will be enabled on demand) */
        uap->dmacr |= UART011_DMAONERR;
-       pl011_writew(uap, uap->dmacr, REG_DMACR);
+       writew(uap->dmacr, uap->port.membase + UART011_DMACR);
 
        /*
         * ST Micro variants has some specific dma burst threshold
@@ -1196,9 +1049,8 @@ skip_rx:
         * be issued above/below 16 bytes.
         */
        if (uap->vendor->dma_threshold)
-               pl011_writew(uap,
-                            ST_UART011_DMAWM_RX_16 | ST_UART011_DMAWM_TX_16,
-                            REG_ST_DMAWM);
+               writew(ST_UART011_DMAWM_RX_16 | ST_UART011_DMAWM_TX_16,
+                              uap->port.membase + ST_UART011_DMAWM);
 
        if (uap->using_rx_dma) {
                if (pl011_dma_rx_trigger_dma(uap))
@@ -1223,12 +1075,12 @@ static void pl011_dma_shutdown(struct uart_amba_port *uap)
                return;
 
        /* Disable RX and TX DMA */
-       while (pl011_readw(uap, REG_FR) & uap->fr_busy)
+       while (readw(uap->port.membase + UART01x_FR) & UART01x_FR_BUSY)
                barrier();
 
        spin_lock_irq(&uap->port.lock);
        uap->dmacr &= ~(UART011_DMAONERR | UART011_RXDMAE | UART011_TXDMAE);
-       pl011_writew(uap, uap->dmacr, REG_DMACR);
+       writew(uap->dmacr, uap->port.membase + UART011_DMACR);
        spin_unlock_irq(&uap->port.lock);
 
        if (uap->using_tx_dma) {
@@ -1329,7 +1181,7 @@ static void pl011_stop_tx(struct uart_port *port)
            container_of(port, struct uart_amba_port, port);
 
        uap->im &= ~UART011_TXIM;
-       pl011_writew(uap, uap->im, REG_IMSC);
+       writew(uap->im, uap->port.membase + UART011_IMSC);
        pl011_dma_tx_stop(uap);
 }
 
@@ -1339,7 +1191,7 @@ static void pl011_tx_chars(struct uart_amba_port *uap, bool from_irq);
 static void pl011_start_tx_pio(struct uart_amba_port *uap)
 {
        uap->im |= UART011_TXIM;
-       pl011_writew(uap, uap->im, REG_IMSC);
+       writew(uap->im, uap->port.membase + UART011_IMSC);
        pl011_tx_chars(uap, false);
 }
 
@@ -1359,7 +1211,7 @@ static void pl011_stop_rx(struct uart_port *port)
 
        uap->im &= ~(UART011_RXIM|UART011_RTIM|UART011_FEIM|
                     UART011_PEIM|UART011_BEIM|UART011_OEIM);
-       pl011_writew(uap, uap->im, REG_IMSC);
+       writew(uap->im, uap->port.membase + UART011_IMSC);
 
        pl011_dma_rx_stop(uap);
 }
@@ -1370,7 +1222,7 @@ static void pl011_enable_ms(struct uart_port *port)
            container_of(port, struct uart_amba_port, port);
 
        uap->im |= UART011_RIMIM|UART011_CTSMIM|UART011_DCDMIM|UART011_DSRMIM;
-       pl011_writew(uap, uap->im, REG_IMSC);
+       writew(uap->im, uap->port.membase + UART011_IMSC);
 }
 
 static void pl011_rx_chars(struct uart_amba_port *uap)
@@ -1390,7 +1242,7 @@ __acquires(&uap->port.lock)
                        dev_dbg(uap->port.dev, "could not trigger RX DMA job "
                                "fall back to interrupt mode again\n");
                        uap->im |= UART011_RXIM;
-                       pl011_writew(uap, uap->im, REG_IMSC);
+                       writew(uap->im, uap->port.membase + UART011_IMSC);
                } else {
 #ifdef CONFIG_DMA_ENGINE
                        /* Start Rx DMA poll */
@@ -1411,10 +1263,10 @@ static bool pl011_tx_char(struct uart_amba_port *uap, unsigned char c,
                          bool from_irq)
 {
        if (unlikely(!from_irq) &&
-           pl011_readw(uap, REG_FR) & UART01x_FR_TXFF)
+           readw(uap->port.membase + UART01x_FR) & UART01x_FR_TXFF)
                return false; /* unable to transmit character */
 
-       pl011_writew(uap, c, REG_DR);
+       writew(c, uap->port.membase + UART01x_DR);
        uap->port.icount.tx++;
 
        return true;
@@ -1461,7 +1313,7 @@ static void pl011_modem_status(struct uart_amba_port *uap)
 {
        unsigned int status, delta;
 
-       status = pl011_readw(uap, REG_FR) & UART01x_FR_MODEM_ANY;
+       status = readw(uap->port.membase + UART01x_FR) & UART01x_FR_MODEM_ANY;
 
        delta = status ^ uap->old_status;
        uap->old_status = status;
@@ -1472,11 +1324,11 @@ static void pl011_modem_status(struct uart_amba_port *uap)
        if (delta & UART01x_FR_DCD)
                uart_handle_dcd_change(&uap->port, status & UART01x_FR_DCD);
 
-       if (delta & uap->fr_dsr)
+       if (delta & UART01x_FR_DSR)
                uap->port.icount.dsr++;
 
-       if (delta & uap->fr_cts)
-               uart_handle_cts_change(&uap->port, status & uap->fr_cts);
+       if (delta & UART01x_FR_CTS)
+               uart_handle_cts_change(&uap->port, status & UART01x_FR_CTS);
 
        wake_up_interruptible(&uap->port.state->port.delta_msr_wait);
 }
@@ -1489,15 +1341,15 @@ static void check_apply_cts_event_workaround(struct uart_amba_port *uap)
                return;
 
        /* workaround to make sure that all bits are unlocked.. */
-       pl011_writew(uap, 0x00, REG_ICR);
+       writew(0x00, uap->port.membase + UART011_ICR);
 
        /*
         * WA: introduce 26ns(1 uart clk) delay before W1C;
         * single apb access will incur 2 pclk(133.12Mhz) delay,
         * so add 2 dummy reads
         */
-       dummy_read = pl011_readw(uap, REG_ICR);
-       dummy_read = pl011_readw(uap, REG_ICR);
+       dummy_read = readw(uap->port.membase + UART011_ICR);
+       dummy_read = readw(uap->port.membase + UART011_ICR);
 }
 
 static irqreturn_t pl011_int(int irq, void *dev_id)
@@ -1509,13 +1361,15 @@ static irqreturn_t pl011_int(int irq, void *dev_id)
        int handled = 0;
 
        spin_lock_irqsave(&uap->port.lock, flags);
-       imsc = pl011_readw(uap, REG_IMSC);
-       status = pl011_readw(uap, REG_RIS) & imsc;
+       imsc = readw(uap->port.membase + UART011_IMSC);
+       status = readw(uap->port.membase + UART011_RIS) & imsc;
        if (status) {
                do {
                        check_apply_cts_event_workaround(uap);
-                       pl011_writew(uap, status & ~(UART011_TXIS|UART011_RTIS|
-                                    UART011_RXIS), REG_ICR);
+
+                       writew(status & ~(UART011_TXIS|UART011_RTIS|
+                                         UART011_RXIS),
+                              uap->port.membase + UART011_ICR);
 
                        if (status & (UART011_RTIS|UART011_RXIS)) {
                                if (pl011_dma_rx_running(uap))
@@ -1532,7 +1386,7 @@ static irqreturn_t pl011_int(int irq, void *dev_id)
                        if (pass_counter-- == 0)
                                break;
 
-                       status = pl011_readw(uap, REG_RIS) & imsc;
+                       status = readw(uap->port.membase + UART011_RIS) & imsc;
                } while (status != 0);
                handled = 1;
        }
@@ -1546,8 +1400,8 @@ static unsigned int pl011_tx_empty(struct uart_port *port)
 {
        struct uart_amba_port *uap =
            container_of(port, struct uart_amba_port, port);
-       unsigned int status = pl011_readw(uap, REG_FR);
-       return status & (uap->fr_busy|UART01x_FR_TXFF) ? 0 : TIOCSER_TEMT;
+       unsigned int status = readw(uap->port.membase + UART01x_FR);
+       return status & (UART01x_FR_BUSY|UART01x_FR_TXFF) ? 0 : TIOCSER_TEMT;
 }
 
 static unsigned int pl011_get_mctrl(struct uart_port *port)
@@ -1555,16 +1409,16 @@ static unsigned int pl011_get_mctrl(struct uart_port *port)
        struct uart_amba_port *uap =
            container_of(port, struct uart_amba_port, port);
        unsigned int result = 0;
-       unsigned int status = pl011_readw(uap, REG_FR);
+       unsigned int status = readw(uap->port.membase + UART01x_FR);
 
 #define TIOCMBIT(uartbit, tiocmbit)    \
        if (status & uartbit)           \
                result |= tiocmbit
 
        TIOCMBIT(UART01x_FR_DCD, TIOCM_CAR);
-       TIOCMBIT(uap->fr_dsr, TIOCM_DSR);
-       TIOCMBIT(uap->fr_cts, TIOCM_CTS);
-       TIOCMBIT(uap->fr_ri, TIOCM_RNG);
+       TIOCMBIT(UART01x_FR_DSR, TIOCM_DSR);
+       TIOCMBIT(UART01x_FR_CTS, TIOCM_CTS);
+       TIOCMBIT(UART011_FR_RI, TIOCM_RNG);
 #undef TIOCMBIT
        return result;
 }
@@ -1575,7 +1429,7 @@ static void pl011_set_mctrl(struct uart_port *port, unsigned int mctrl)
            container_of(port, struct uart_amba_port, port);
        unsigned int cr;
 
-       cr = pl011_readw(uap, REG_CR);
+       cr = readw(uap->port.membase + UART011_CR);
 
 #define        TIOCMBIT(tiocmbit, uartbit)             \
        if (mctrl & tiocmbit)           \
@@ -1595,7 +1449,7 @@ static void pl011_set_mctrl(struct uart_port *port, unsigned int mctrl)
        }
 #undef TIOCMBIT
 
-       pl011_writew(uap, cr, REG_CR);
+       writew(cr, uap->port.membase + UART011_CR);
 }
 
 static void pl011_break_ctl(struct uart_port *port, int break_state)
@@ -1606,12 +1460,12 @@ static void pl011_break_ctl(struct uart_port *port, int break_state)
        unsigned int lcr_h;
 
        spin_lock_irqsave(&uap->port.lock, flags);
-       lcr_h = pl011_readw(uap, uap->lcrh_tx);
+       lcr_h = readw(uap->port.membase + uap->lcrh_tx);
        if (break_state == -1)
                lcr_h |= UART01x_LCRH_BRK;
        else
                lcr_h &= ~UART01x_LCRH_BRK;
-       pl011_writew(uap, lcr_h, uap->lcrh_tx);
+       writew(lcr_h, uap->port.membase + uap->lcrh_tx);
        spin_unlock_irqrestore(&uap->port.lock, flags);
 }
 
@@ -1621,8 +1475,9 @@ static void pl011_quiesce_irqs(struct uart_port *port)
 {
        struct uart_amba_port *uap =
            container_of(port, struct uart_amba_port, port);
+       unsigned char __iomem *regs = uap->port.membase;
 
-       pl011_writew(uap, pl011_readw(uap, REG_MIS), REG_ICR);
+       writew(readw(regs + UART011_MIS), regs + UART011_ICR);
        /*
         * There is no way to clear TXIM as this is "ready to transmit IRQ", so
         * we simply mask it. start_tx() will unmask it.
@@ -1636,7 +1491,7 @@ static void pl011_quiesce_irqs(struct uart_port *port)
         * (including tx queue), so we're also fine with start_tx()'s caller
         * side.
         */
-       pl011_writew(uap, pl011_readw(uap, REG_IMSC) & ~UART011_TXIM, REG_IMSC);
+       writew(readw(regs + UART011_IMSC) & ~UART011_TXIM, regs + UART011_IMSC);
 }
 
 static int pl011_get_poll_char(struct uart_port *port)
@@ -1651,11 +1506,11 @@ static int pl011_get_poll_char(struct uart_port *port)
         */
        pl011_quiesce_irqs(port);
 
-       status = pl011_readw(uap, REG_FR);
+       status = readw(uap->port.membase + UART01x_FR);
        if (status & UART01x_FR_RXFE)
                return NO_POLL_CHAR;
 
-       return pl011_readw(uap, REG_DR);
+       return readw(uap->port.membase + UART01x_DR);
 }
 
 static void pl011_put_poll_char(struct uart_port *port,
@@ -1664,10 +1519,10 @@ static void pl011_put_poll_char(struct uart_port *port,
        struct uart_amba_port *uap =
            container_of(port, struct uart_amba_port, port);
 
-       while (pl011_readw(uap, REG_FR) & UART01x_FR_TXFF)
+       while (readw(uap->port.membase + UART01x_FR) & UART01x_FR_TXFF)
                barrier();
 
-       pl011_writew(uap, ch, REG_DR);
+       writew(ch, uap->port.membase + UART01x_DR);
 }
 
 #endif /* CONFIG_CONSOLE_POLL */
@@ -1691,15 +1546,15 @@ static int pl011_hwinit(struct uart_port *port)
        uap->port.uartclk = clk_get_rate(uap->clk);
 
        /* Clear pending error and receive interrupts */
-       pl011_writew(uap, UART011_OEIS | UART011_BEIS | UART011_PEIS |
-                    UART011_FEIS | UART011_RTIS | UART011_RXIS, REG_ICR);
+       writew(UART011_OEIS | UART011_BEIS | UART011_PEIS | UART011_FEIS |
+              UART011_RTIS | UART011_RXIS, uap->port.membase + UART011_ICR);
 
        /*
         * Save interrupts enable mask, and enable RX interrupts in case if
         * the interrupt is used for NMI entry.
         */
-       uap->im = pl011_readw(uap, REG_IMSC);
-       pl011_writew(uap, UART011_RTIM | UART011_RXIM, REG_IMSC);
+       uap->im = readw(uap->port.membase + UART011_IMSC);
+       writew(UART011_RTIM | UART011_RXIM, uap->port.membase + UART011_IMSC);
 
        if (dev_get_platdata(uap->port.dev)) {
                struct amba_pl011_data *plat;
@@ -1713,22 +1568,22 @@ static int pl011_hwinit(struct uart_port *port)
 
 static void pl011_write_lcr_h(struct uart_amba_port *uap, unsigned int lcr_h)
 {
-       pl011_writew(uap, lcr_h, uap->lcrh_rx);
-       if (is_implemented(uap, REG_ST_LCRH_RX)) {
+       writew(lcr_h, uap->port.membase + uap->lcrh_rx);
+       if (uap->lcrh_rx != uap->lcrh_tx) {
                int i;
                /*
                 * Wait 10 PCLKs before writing LCRH_TX register,
                 * to get this delay write read only register 10 times
                 */
                for (i = 0; i < 10; ++i)
-                       pl011_writew(uap, 0xff, REG_MIS);
-               pl011_writew(uap, lcr_h, uap->lcrh_tx);
+                       writew(0xff, uap->port.membase + UART011_MIS);
+               writew(lcr_h, uap->port.membase + uap->lcrh_tx);
        }
 }
 
 static int pl011_allocate_irq(struct uart_amba_port *uap)
 {
-       pl011_writew(uap, uap->im, REG_IMSC);
+       writew(uap->im, uap->port.membase + UART011_IMSC);
 
        return request_irq(uap->port.irq, pl011_int, 0, "uart-pl011", uap);
 }
@@ -1743,11 +1598,12 @@ static void pl011_enable_interrupts(struct uart_amba_port *uap)
        spin_lock_irq(&uap->port.lock);
 
        /* Clear out any spuriously appearing RX interrupts */
-       pl011_writew(uap, UART011_RTIS | UART011_RXIS, REG_ICR);
+       writew(UART011_RTIS | UART011_RXIS,
+              uap->port.membase + UART011_ICR);
        uap->im = UART011_RTIM;
        if (!pl011_dma_rx_running(uap))
                uap->im |= UART011_RXIM;
-       pl011_writew(uap, uap->im, REG_IMSC);
+       writew(uap->im, uap->port.membase + UART011_IMSC);
        spin_unlock_irq(&uap->port.lock);
 }
 
@@ -1766,21 +1622,21 @@ static int pl011_startup(struct uart_port *port)
        if (retval)
                goto clk_dis;
 
-       pl011_writew(uap, uap->vendor->ifls, REG_IFLS);
+       writew(uap->vendor->ifls, uap->port.membase + UART011_IFLS);
 
        spin_lock_irq(&uap->port.lock);
 
        /* restore RTS and DTR */
        cr = uap->old_cr & (UART011_CR_RTS | UART011_CR_DTR);
        cr |= UART01x_CR_UARTEN | UART011_CR_RXE | UART011_CR_TXE;
-       pl011_writew(uap, cr, REG_CR);
+       writew(cr, uap->port.membase + UART011_CR);
 
        spin_unlock_irq(&uap->port.lock);
 
        /*
         * initialise the old status of the modem signals
         */
-       uap->old_status = pl011_readw(uap, REG_FR) & UART01x_FR_MODEM_ANY;
+       uap->old_status = readw(uap->port.membase + UART01x_FR) & UART01x_FR_MODEM_ANY;
 
        /* Startup DMA */
        pl011_dma_startup(uap);
@@ -1819,11 +1675,11 @@ static int sbsa_uart_startup(struct uart_port *port)
 static void pl011_shutdown_channel(struct uart_amba_port *uap,
                                        unsigned int lcrh)
 {
-       unsigned long val;
+      unsigned long val;
 
-       val = pl011_readw(uap, lcrh);
-       val &= ~(UART01x_LCRH_BRK | UART01x_LCRH_FEN);
-       pl011_writew(uap, val, lcrh);
+      val = readw(uap->port.membase + lcrh);
+      val &= ~(UART01x_LCRH_BRK | UART01x_LCRH_FEN);
+      writew(val, uap->port.membase + lcrh);
 }
 
 /*
@@ -1837,18 +1693,18 @@ static void pl011_disable_uart(struct uart_amba_port *uap)
 
        uap->autorts = false;
        spin_lock_irq(&uap->port.lock);
-       cr = pl011_readw(uap, REG_CR);
+       cr = readw(uap->port.membase + UART011_CR);
        uap->old_cr = cr;
        cr &= UART011_CR_RTS | UART011_CR_DTR;
        cr |= UART01x_CR_UARTEN | UART011_CR_TXE;
-       pl011_writew(uap, cr, REG_CR);
+       writew(cr, uap->port.membase + UART011_CR);
        spin_unlock_irq(&uap->port.lock);
 
        /*
         * disable break condition and fifos
         */
        pl011_shutdown_channel(uap, uap->lcrh_rx);
-       if (is_implemented(uap, REG_ST_LCRH_RX))
+       if (uap->lcrh_rx != uap->lcrh_tx)
                pl011_shutdown_channel(uap, uap->lcrh_tx);
 }
 
@@ -1858,8 +1714,8 @@ static void pl011_disable_interrupts(struct uart_amba_port *uap)
 
        /* mask all interrupts and clear all pending ones */
        uap->im = 0;
-       pl011_writew(uap, uap->im, REG_IMSC);
-       pl011_writew(uap, 0xffff, REG_ICR);
+       writew(uap->im, uap->port.membase + UART011_IMSC);
+       writew(0xffff, uap->port.membase + UART011_ICR);
 
        spin_unlock_irq(&uap->port.lock);
 }
@@ -2011,8 +1867,8 @@ pl011_set_termios(struct uart_port *port, struct ktermios *termios,
                pl011_enable_ms(port);
 
        /* first, disable everything */
-       old_cr = pl011_readw(uap, REG_CR);
-       pl011_writew(uap, 0, REG_CR);
+       old_cr = readw(port->membase + UART011_CR);
+       writew(0, port->membase + UART011_CR);
 
        if (termios->c_cflag & CRTSCTS) {
                if (old_cr & UART011_CR_RTS)
@@ -2045,17 +1901,17 @@ pl011_set_termios(struct uart_port *port, struct ktermios *termios,
                        quot -= 2;
        }
        /* Set baud rate */
-       pl011_writew(uap, quot & 0x3f, REG_FBRD);
-       pl011_writew(uap, quot >> 6, REG_IBRD);
+       writew(quot & 0x3f, port->membase + UART011_FBRD);
+       writew(quot >> 6, port->membase + UART011_IBRD);
 
        /*
         * ----------v----------v----------v----------v-----
         * NOTE: lcrh_tx and lcrh_rx MUST BE WRITTEN AFTER
-        * REG_FBRD & REG_IBRD.
+        * UART011_FBRD & UART011_IBRD.
         * ----------^----------^----------^----------^-----
         */
        pl011_write_lcr_h(uap, lcr_h);
-       pl011_writew(uap, old_cr, REG_CR);
+       writew(old_cr, port->membase + UART011_CR);
 
        spin_unlock_irqrestore(&port->lock, flags);
 }
@@ -2196,9 +2052,9 @@ static void pl011_console_putchar(struct uart_port *port, int ch)
        struct uart_amba_port *uap =
            container_of(port, struct uart_amba_port, port);
 
-       while (pl011_readw(uap, REG_FR) & UART01x_FR_TXFF)
+       while (readw(uap->port.membase + UART01x_FR) & UART01x_FR_TXFF)
                barrier();
-       pl011_writew(uap, ch, REG_DR);
+       writew(ch, uap->port.membase + UART01x_DR);
 }
 
 static void
@@ -2223,10 +2079,10 @@ pl011_console_write(struct console *co, const char *s, unsigned int count)
         *      First save the CR then disable the interrupts
         */
        if (!uap->vendor->always_enabled) {
-               old_cr = pl011_readw(uap, REG_CR);
+               old_cr = readw(uap->port.membase + UART011_CR);
                new_cr = old_cr & ~UART011_CR_CTSEN;
                new_cr |= UART01x_CR_UARTEN | UART011_CR_TXE;
-               pl011_writew(uap, new_cr, REG_CR);
+               writew(new_cr, uap->port.membase + UART011_CR);
        }
 
        uart_console_write(&uap->port, s, count, pl011_console_putchar);
@@ -2236,10 +2092,10 @@ pl011_console_write(struct console *co, const char *s, unsigned int count)
         *      and restore the TCR
         */
        do {
-               status = pl011_readw(uap, REG_FR);
-       } while (status & uap->fr_busy);
+               status = readw(uap->port.membase + UART01x_FR);
+       } while (status & UART01x_FR_BUSY);
        if (!uap->vendor->always_enabled)
-               pl011_writew(uap, old_cr, REG_CR);
+               writew(old_cr, uap->port.membase + UART011_CR);
 
        if (locked)
                spin_unlock(&uap->port.lock);
@@ -2252,10 +2108,10 @@ static void __init
 pl011_console_get_options(struct uart_amba_port *uap, int *baud,
                             int *parity, int *bits)
 {
-       if (pl011_readw(uap, REG_CR) & UART01x_CR_UARTEN) {
+       if (readw(uap->port.membase + UART011_CR) & UART01x_CR_UARTEN) {
                unsigned int lcr_h, ibrd, fbrd;
 
-               lcr_h = pl011_readw(uap, uap->lcrh_tx);
+               lcr_h = readw(uap->port.membase + uap->lcrh_tx);
 
                *parity = 'n';
                if (lcr_h & UART01x_LCRH_PEN) {
@@ -2270,13 +2126,13 @@ pl011_console_get_options(struct uart_amba_port *uap, int *baud,
                else
                        *bits = 8;
 
-               ibrd = pl011_readw(uap, REG_IBRD);
-               fbrd = pl011_readw(uap, REG_FBRD);
+               ibrd = readw(uap->port.membase + UART011_IBRD);
+               fbrd = readw(uap->port.membase + UART011_FBRD);
 
                *baud = uap->port.uartclk * 4 / (64 * ibrd + fbrd);
 
                if (uap->vendor->oversampling) {
-                       if (pl011_readw(uap, REG_CR)
+                       if (readw(uap->port.membase + UART011_CR)
                                  & ST_UART011_CR_OVSFACT)
                                *baud *= 2;
                }
@@ -2348,13 +2204,10 @@ static struct console amba_console = {
 
 static void pl011_putc(struct uart_port *port, int c)
 {
-       struct uart_amba_port *uap =
-           container_of(port, struct uart_amba_port, port);
-
-       while (pl011_readw(uap, REG_FR) & UART01x_FR_TXFF)
+       while (readl(port->membase + UART01x_FR) & UART01x_FR_TXFF)
                ;
-       pl011_writeb(uap, c, REG_DR);
-       while (pl011_readw(uap, REG_FR) & uap->fr_busy)
+       writeb(c, port->membase + UART01x_DR);
+       while (readl(port->membase + UART01x_FR) & UART01x_FR_BUSY)
                ;
 }
 
@@ -2481,8 +2334,8 @@ static int pl011_register_port(struct uart_amba_port *uap)
        int ret;
 
        /* Ensure interrupts from this UART are masked and cleared */
-       pl011_writew(uap, 0, REG_IMSC);
-       pl011_writew(uap, 0xffff, REG_ICR);
+       writew(0, uap->port.membase + UART011_IMSC);
+       writew(0xffff, uap->port.membase + UART011_ICR);
 
        if (!amba_reg.state) {
                ret = uart_register_driver(&amba_reg);
@@ -2500,7 +2353,6 @@ static int pl011_register_port(struct uart_amba_port *uap)
        return ret;
 }
 
-#ifdef CONFIG_ARM_AMBA
 static int pl011_probe(struct amba_device *dev, const struct amba_id *id)
 {
        struct uart_amba_port *uap;
@@ -2521,13 +2373,8 @@ static int pl011_probe(struct amba_device *dev, const struct amba_id *id)
                return PTR_ERR(uap->clk);
 
        uap->vendor = vendor;
-       uap->reg_lut = vendor->reg_lut;
        uap->lcrh_rx = vendor->lcrh_rx;
        uap->lcrh_tx = vendor->lcrh_tx;
-       uap->fr_busy = vendor->fr_busy;
-       uap->fr_dsr = vendor->fr_dsr;
-       uap->fr_cts = vendor->fr_cts;
-       uap->fr_ri = vendor->fr_ri;
        uap->fifosize = vendor->get_fifosize(dev);
        uap->port.irq = dev->irq[0];
        uap->port.ops = &amba_pl011_pops;
@@ -2551,67 +2398,6 @@ static int pl011_remove(struct amba_device *dev)
        pl011_unregister_port(uap);
        return 0;
 }
-#endif
-
-#ifdef CONFIG_SOC_ZX296702
-static int zx_uart_probe(struct platform_device *pdev)
-{
-       struct uart_amba_port *uap;
-       struct vendor_data *vendor = &vendor_zte;
-       struct resource *res;
-       int portnr, ret;
-
-       portnr = pl011_find_free_port();
-       if (portnr < 0)
-               return portnr;
-
-       uap = devm_kzalloc(&pdev->dev, sizeof(struct uart_amba_port),
-                       GFP_KERNEL);
-       if (!uap) {
-               ret = -ENOMEM;
-               goto out;
-       }
-
-       uap->clk = devm_clk_get(&pdev->dev, NULL);
-       if (IS_ERR(uap->clk)) {
-               ret = PTR_ERR(uap->clk);
-               goto out;
-       }
-
-       uap->vendor     = vendor;
-       uap->reg_lut    = vendor->reg_lut;
-       uap->lcrh_rx    = vendor->lcrh_rx;
-       uap->lcrh_tx    = vendor->lcrh_tx;
-       uap->fr_busy    = vendor->fr_busy;
-       uap->fr_dsr     = vendor->fr_dsr;
-       uap->fr_cts     = vendor->fr_cts;
-       uap->fr_ri      = vendor->fr_ri;
-       uap->fifosize   = 16;
-       uap->port.irq   = platform_get_irq(pdev, 0);
-       uap->port.ops   = &amba_pl011_pops;
-
-       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-
-       ret = pl011_setup_port(&pdev->dev, uap, res, portnr);
-       if (ret)
-               return ret;
-
-       platform_set_drvdata(pdev, uap);
-
-       return pl011_register_port(uap);
-out:
-       return ret;
-}
-
-static int zx_uart_remove(struct platform_device *pdev)
-{
-       struct uart_amba_port *uap = platform_get_drvdata(pdev);
-
-       uart_remove_one_port(&amba_reg, &uap->port);
-       pl011_unregister_port(uap);
-       return 0;
-}
-#endif
 
 #ifdef CONFIG_PM_SLEEP
 static int pl011_suspend(struct device *dev)
@@ -2668,11 +2454,6 @@ static int sbsa_uart_probe(struct platform_device *pdev)
                return -ENOMEM;
 
        uap->vendor     = &vendor_sbsa;
-       uap->reg_lut    = vendor_sbsa.reg_lut;
-       uap->fr_busy    = vendor_sbsa.fr_busy;
-       uap->fr_dsr     = vendor_sbsa.fr_dsr;
-       uap->fr_cts     = vendor_sbsa.fr_cts;
-       uap->fr_ri      = vendor_sbsa.fr_ri;
        uap->fifosize   = 32;
        uap->port.irq   = platform_get_irq(pdev, 0);
        uap->port.ops   = &sbsa_uart_pops;
@@ -2722,7 +2503,6 @@ static struct platform_driver arm_sbsa_uart_platform_driver = {
        },
 };
 
-#ifdef CONFIG_ARM_AMBA
 static struct amba_id pl011_ids[] = {
        {
                .id     = 0x00041011,
@@ -2748,57 +2528,20 @@ static struct amba_driver pl011_driver = {
        .probe          = pl011_probe,
        .remove         = pl011_remove,
 };
-#endif
-
-#ifdef CONFIG_SOC_ZX296702
-static const struct of_device_id zx_uart_dt_ids[] = {
-       { .compatible = "zte,zx296702-uart", },
-       { /* sentinel */ }
-};
-MODULE_DEVICE_TABLE(of, zx_uart_dt_ids);
-
-static struct platform_driver zx_uart_driver = {
-       .driver = {
-               .name   = "zx-uart",
-               .owner  = THIS_MODULE,
-               .pm     = &pl011_dev_pm_ops,
-               .of_match_table = zx_uart_dt_ids,
-       },
-       .probe          = zx_uart_probe,
-       .remove         = zx_uart_remove,
-};
-#endif
-
 
 static int __init pl011_init(void)
 {
-       int ret;
        printk(KERN_INFO "Serial: AMBA PL011 UART driver\n");
 
        if (platform_driver_register(&arm_sbsa_uart_platform_driver))
                pr_warn("could not register SBSA UART platform driver\n");
-
-#ifdef CONFIG_SOC_ZX296702
-       ret = platform_driver_register(&zx_uart_driver);
-       if (ret)
-               pr_warn("could not register ZX UART platform driver\n");
-#endif
-
-#ifdef CONFIG_ARM_AMBA
-       ret = amba_driver_register(&pl011_driver);
-#endif
-       return ret;
+       return amba_driver_register(&pl011_driver);
 }
 
 static void __exit pl011_exit(void)
 {
        platform_driver_unregister(&arm_sbsa_uart_platform_driver);
-#ifdef CONFIG_SOC_ZX296702
-       platform_driver_unregister(&zx_uart_driver);
-#endif
-#ifdef CONFIG_ARM_AMBA
        amba_driver_unregister(&pl011_driver);
-#endif
 }
 
 /*
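
The amba-pl011 hunks above remove the pl011_readw()/pl011_writew() lookup-table accessors and return to plain readw()/writew() on port.membase plus the fixed UART01x_*/UART011_* offsets from <linux/amba/serial.h>. A minimal sketch of the resulting access pattern, using only what the diff itself shows (struct uart_amba_port is the driver-local type; the helper names below are illustrative, not part of the patch):

#include <linux/io.h>		/* readw()/writew() MMIO accessors */
#include <linux/amba/serial.h>	/* UART01x_FR, UART011_IMSC, ... */

/* 16-bit MMIO read of the flag register at a fixed offset from membase */
static unsigned int pl011_example_read_fr(struct uart_amba_port *uap)
{
	return readw(uap->port.membase + UART01x_FR);
}

/* clear bits in the cached interrupt mask and write it back to IMSC */
static void pl011_example_mask_irqs(struct uart_amba_port *uap, unsigned int mask)
{
	uap->im &= ~mask;
	writew(uap->im, uap->port.membase + UART011_IMSC);
}

Every register access now encodes its offset directly, which is what lets the diff drop the per-vendor reg_lut and fr_* fields from struct uart_amba_port later in this patch.
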
index b5b427888b2453d88e59f7d002ab8d4e512d23e6..95b330a9ea983dcafae63956cefd03ddefe9379b 100644 (file)
@@ -353,9 +353,16 @@ static struct sysrq_key_op sysrq_term_op = {
 
 static void moom_callback(struct work_struct *ignored)
 {
+       const gfp_t gfp_mask = GFP_KERNEL;
+       struct oom_control oc = {
+               .zonelist = node_zonelist(first_memory_node, gfp_mask),
+               .nodemask = NULL,
+               .gfp_mask = gfp_mask,
+               .order = -1,
+       };
+
        mutex_lock(&oom_lock);
-       if (!out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL),
-                          GFP_KERNEL, 0, NULL, true))
+       if (!out_of_memory(&oc))
                pr_info("OOM request ignored because killer is disabled\n");
        mutex_unlock(&oom_lock);
 }
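
The sysrq hunk above gathers the OOM parameters into a struct oom_control and hands out_of_memory() a single pointer. A sketch of the resulting call shape, using only the fields visible in this hunk (any other members of the structure are left at their zero defaults):

	const gfp_t gfp_mask = GFP_KERNEL;
	struct oom_control oc = {
		.zonelist = node_zonelist(first_memory_node, gfp_mask),
		.nodemask = NULL,	/* no node restriction for a sysrq-triggered OOM */
		.gfp_mask = gfp_mask,
		.order    = -1,		/* not tied to a particular allocation order */
	};

	if (!out_of_memory(&oc))
		pr_info("OOM request ignored because killer is disabled\n");

Packing the arguments this way means a later parameter addition only touches struct oom_control and the call sites that care about the new field, not every out_of_memory() caller's prototype.
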
index 071280643db75f3f48dafc2dcbe6a385d7ac53b8..38da6e2991491064263d0010782e464e42698e85 100644 (file)
@@ -9,7 +9,7 @@ config VGA_CONSOLE
        depends on !4xx && !8xx && !SPARC && !M68K && !PARISC && !FRV && \
                !SUPERH && !BLACKFIN && !AVR32 && !MN10300 && !CRIS && \
                (!ARM || ARCH_FOOTBRIDGE || ARCH_INTEGRATOR || ARCH_NETWINDER) && \
-               !ARM64 && !ARC
+               !ARM64 && !ARC && !MICROBLAZE
        default y
        help
          Saying Y here will allow you to use Linux in text mode through a
index 811acfc6048e7f3f99b9c28c9b70a2d9a6e2a936..8b1d371b54040b3eeb04c1d95cab2d98af202dd9 100644 (file)
@@ -2464,7 +2464,7 @@ config FB_SSD1307
        tristate "Solomon SSD1307 framebuffer support"
        depends on FB && I2C
        depends on OF
-       depends on GPIOLIB
+       depends on GPIOLIB || COMPILE_TEST
        select FB_SYS_FOPS
        select FB_SYS_FILLRECT
        select FB_SYS_COPYAREA
index abadc490fa1f58cdc3b129108248c3871fe40c17..19eb42b57d8742089d2faadf5c8a64ce81cc01b3 100644 (file)
@@ -19,7 +19,6 @@
 #include <linux/backlight.h>
 #include <linux/gfp.h>
 #include <linux/module.h>
-#include <linux/platform_data/atmel.h>
 #include <linux/of.h>
 #include <linux/of_device.h>
 #include <linux/of_gpio.h>
@@ -999,7 +998,7 @@ static const char *atmel_lcdfb_wiring_modes[] = {
        [ATMEL_LCDC_WIRING_RGB] = "RGB",
 };
 
-const int atmel_lcdfb_get_of_wiring_modes(struct device_node *np)
+static int atmel_lcdfb_get_of_wiring_modes(struct device_node *np)
 {
        const char *mode;
        int err, i;
index d787533d9c8b0b0e589fe4eaa217b29debbb6845..47c3191ec313dbc617c015df22832f6e0ee522cb 100644 (file)
@@ -1072,9 +1072,9 @@ void fb_edid_add_monspecs(unsigned char *edid, struct fb_monspecs *specs)
 
        for (i = specs->modedb_len + num; i < specs->modedb_len + num + svd_n; i++) {
                int idx = svd[i - specs->modedb_len - num];
-               if (!idx || idx > 63) {
+               if (!idx || idx >= ARRAY_SIZE(cea_modes)) {
                        pr_warning("Reserved SVD code %d\n", idx);
-               } else if (idx > ARRAY_SIZE(cea_modes) || !cea_modes[idx].xres) {
+               } else if (!cea_modes[idx].xres) {
                        pr_warning("Unimplemented SVD code %d\n", idx);
                } else {
                        memcpy(&m[i], cea_modes + idx, sizeof(m[i]));
index 60c3f0a1634111c41f346b0d2af5d32f1955bd28..15755ce1d26c817587f7f600db41963c46e16c7e 100644 (file)
@@ -485,7 +485,7 @@ static ssize_t show_bl_curve(struct device *device,
 
        mutex_lock(&fb_info->bl_curve_mutex);
        for (i = 0; i < FB_BACKLIGHT_LEVELS; i += 8)
-               len += snprintf(&buf[len], PAGE_SIZE, "%8ph\n",
+               len += scnprintf(&buf[len], PAGE_SIZE - len, "%8ph\n",
                                fb_info->bl_curve + i);
        mutex_unlock(&fb_info->bl_curve_mutex);
 
index 7d07cf824b64c0839a2a1b1c92f99744a7e2f86a..2510fa728d77160326781219ec554ae50c99be4a 100644 (file)
@@ -289,7 +289,7 @@ static const struct fb_videomode modedb[] = {
 };
 
 #ifdef CONFIG_FB_MODE_HELPERS
-const struct fb_videomode cea_modes[64] = {
+const struct fb_videomode cea_modes[65] = {
        /* #1: 640x480p@59.94/60Hz */
        [1] = {
                NULL, 60, 640, 480, 39722, 48, 16, 33, 10, 96, 2, 0,
index de9819660ca09ea17e52cf1f259f001adc30ac56..c9293aea8ec3502e27ddf6125f07201c16996a7b 100644 (file)
@@ -325,7 +325,6 @@ static int ocfb_probe(struct platform_device *pdev)
                dev_err(&pdev->dev, "I/O resource request failed\n");
                return -ENXIO;
        }
-       res->flags &= ~IORESOURCE_CACHEABLE;
        fbdev->regs = devm_ioremap_resource(&pdev->dev, res);
        if (IS_ERR(fbdev->regs))
                return PTR_ERR(fbdev->regs);
index a14d993f719dddd4a80a3effc45f71566c6c8666..8c246c213e06ea1bfaff43f49c34e4a903a6193a 100644 (file)
@@ -266,7 +266,6 @@ static struct platform_driver opa362_driver = {
        .remove = __exit_p(opa362_remove),
        .driver = {
                .name   = "amplifier-opa362",
-               .owner  = THIS_MODULE,
                .of_match_table = opa362_of_match,
                .suppress_bind_attrs = true,
        },
index 4f0cbb54d4dbdf1d43eb013d8fbdcc6cb3813389..d3af01c94a58d07f45b6e55c95c91af44755b987 100644 (file)
@@ -1091,7 +1091,7 @@ static void mmap_user_close(struct vm_area_struct *vma)
        omapfb_put_mem_region(rg);
 }
 
-static struct vm_operations_struct mmap_user_ops = {
+static const struct vm_operations_struct mmap_user_ops = {
        .open = mmap_user_open,
        .close = mmap_user_close,
 };
index e209b039f55304483fc8fb5f0785830bfbb523bc..efb57c059997641baed972fe2b25fd85715cf291 100644 (file)
@@ -615,7 +615,7 @@ static int pxa168fb_probe(struct platform_device *pdev)
                return -EINVAL;
        }
 
-       clk = clk_get(&pdev->dev, "LCDCLK");
+       clk = devm_clk_get(&pdev->dev, "LCDCLK");
        if (IS_ERR(clk)) {
                dev_err(&pdev->dev, "unable to get LCDCLK");
                return PTR_ERR(clk);
@@ -624,21 +624,18 @@ static int pxa168fb_probe(struct platform_device *pdev)
        res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
        if (res == NULL) {
                dev_err(&pdev->dev, "no IO memory defined\n");
-               ret = -ENOENT;
-               goto failed_put_clk;
+               return -ENOENT;
        }
 
        irq = platform_get_irq(pdev, 0);
        if (irq < 0) {
                dev_err(&pdev->dev, "no IRQ defined\n");
-               ret = -ENOENT;
-               goto failed_put_clk;
+               return -ENOENT;
        }
 
        info = framebuffer_alloc(sizeof(struct pxa168fb_info), &pdev->dev);
        if (info == NULL) {
-               ret = -ENOMEM;
-               goto failed_put_clk;
+               return -ENOMEM;
        }
 
        /* Initialize private data */
@@ -776,8 +773,6 @@ failed_free_fbmem:
                        info->screen_base, fbi->fb_start_dma);
 failed_free_info:
        kfree(info);
-failed_put_clk:
-       clk_put(clk);
 
        dev_err(&pdev->dev, "frame buffer device init failed with %d\n", ret);
        return ret;
@@ -813,7 +808,6 @@ static int pxa168fb_remove(struct platform_device *pdev)
                                info->screen_base, info->fix.smem_start);
 
        clk_disable(fbi->clk);
-       clk_put(fbi->clk);
 
        framebuffer_release(info);
 
index 83433cb0dfba42010c4793bd7c8695955c77217f..96aa46dc696c94225be2b9a721e6cba6e619af46 100644 (file)
@@ -32,8 +32,7 @@
 #include <linux/spinlock_types.h>
 #include <linux/spinlock.h>
 #include <linux/slab.h>
-
-#include <asm/io.h>
+#include <linux/io.h>
 
 #include <video/s1d13xxxfb.h>
 
index 7e3a05fc47aa341b2b2a9a97c4031f2d37741b0f..f72dd12456f962eba79eec43fb706be9dc1cdd39 100644 (file)
@@ -1938,7 +1938,7 @@ static struct s3c_fb_driverdata s3c_fb_data_s3c2443 = {
        },
 };
 
-static struct platform_device_id s3c_fb_driver_ids[] = {
+static const struct platform_device_id s3c_fb_driver_ids[] = {
        {
                .name           = "s3c-fb",
                .driver_data    = (unsigned long)&s3c_fb_data_64xx,
index 3e153c06131ad94adc2dac710dcd1c53539b75e8..93f4c902d0f9a62ab0383c726d35115a84c20de3 100644 (file)
@@ -656,8 +656,9 @@ static int ssd1307fb_probe(struct i2c_client *client,
        bl = backlight_device_register(bl_name, &client->dev, par,
                                       &ssd1307fb_bl_ops, NULL);
        if (IS_ERR(bl)) {
-               dev_err(&client->dev, "unable to register backlight device: %ld\n",
-                       PTR_ERR(bl));
+               ret = PTR_ERR(bl);
+               dev_err(&client->dev, "unable to register backlight device: %d\n",
+                       ret);
                goto bl_init_error;
        }
 
@@ -719,7 +720,6 @@ static struct i2c_driver ssd1307fb_driver = {
        .driver = {
                .name = "ssd1307fb",
                .of_match_table = ssd1307fb_of_match,
-               .owner = THIS_MODULE,
        },
 };
 
index 735355b0e0233529ca8ed1fbbf4df40dec867bb9..7df4228e25f05fa24c6275dce7c940b3f93c63f8 100644 (file)
@@ -64,6 +64,7 @@
 #include <linux/fb.h>
 #include <linux/init.h>
 #include <linux/ioport.h>
+#include <linux/io.h>
 
 #include <asm/grfioctl.h>      /* for HP-UX compatibility */
 #include <asm/uaccess.h>
index ff2b8731a2dc6781d1aacb747609cb1a31f05223..e9c2f7ba3c8e6c34382b6b0b58a7d65e41e8292d 100644 (file)
@@ -279,7 +279,7 @@ static int dlfb_set_video_mode(struct dlfb_data *dev,
 {
        char *buf;
        char *wrptr;
-       int retval = 0;
+       int retval;
        int writesize;
        struct urb *urb;
 
@@ -1505,8 +1505,7 @@ static int dlfb_parse_vendor_descriptor(struct dlfb_data *dev,
        char *desc;
        char *buf;
        char *desc_end;
-
-       int total_len = 0;
+       int total_len;
 
        buf = kzalloc(MAX_VENDOR_DESCRIPTOR_SIZE, GFP_KERNEL);
        if (!buf)
@@ -1582,7 +1581,7 @@ static int dlfb_usb_probe(struct usb_interface *interface,
                        const struct usb_device_id *id)
 {
        struct usb_device *usbdev;
-       struct dlfb_data *dev = NULL;
+       struct dlfb_data *dev;
        int retval = -ENOMEM;
 
        /* usb initialization */
@@ -1665,7 +1664,6 @@ static void dlfb_init_framebuffer_work(struct work_struct *work)
        /* allocates framebuffer driver structure, not framebuffer memory */
        info = framebuffer_alloc(0, dev->gdev);
        if (!info) {
-               retval = -ENOMEM;
                pr_err("framebuffer_alloc failed\n");
                goto error;
        }
@@ -1912,7 +1910,7 @@ static int dlfb_alloc_urb_list(struct dlfb_data *dev, int count, size_t size)
 
 static struct urb *dlfb_get_urb(struct dlfb_data *dev)
 {
-       int ret = 0;
+       int ret;
        struct list_head *entry;
        struct urb_node *unode;
        struct urb *urb = NULL;
index 70a897b1e4588cb42bd21206de9a0ae3df55ae72..b9c2f81fb6b9f047bfcd2e144140cce80c3731c0 100644 (file)
@@ -51,7 +51,14 @@ static void *rvmalloc(unsigned long size)
        if (!mem)
                return NULL;
 
-       memset(mem, 0, size); /* Clear the ram out, no junk to the user */
+       /*
+        * VFB must clear memory to prevent kernel info
+        * leakage into userspace
+        * VGA-based drivers MUST NOT clear memory if
+        * they want to be able to take over vgacon
+        */
+
+       memset(mem, 0, size);
        adr = (unsigned long) mem;
        while (size > 0) {
                SetPageReserved(vmalloc_to_page((void *)adr));
@@ -490,14 +497,6 @@ static int vfb_probe(struct platform_device *dev)
        if (!(videomemory = rvmalloc(videomemorysize)))
                return retval;
 
-       /*
-        * VFB must clear memory to prevent kernel info
-        * leakage into userspace
-        * VGA-based drivers MUST NOT clear memory if
-        * they want to be able to take over vgacon
-        */
-       memset(videomemory, 0, videomemorysize);
-
        info = framebuffer_alloc(sizeof(u32) * 256, &dev->dev);
        if (!info)
                goto err;
index 09dc44736c1ac72160f5d9aa4d13ab1912151be9..0567d517eed34b2993a2bdf6df963ab55564297a 100644 (file)
@@ -46,7 +46,7 @@ struct xenfb_info {
        int                     nr_pages;
        int                     irq;
        struct xenfb_page       *page;
-       unsigned long           *mfns;
+       unsigned long           *gfns;
        int                     update_wanted; /* XENFB_TYPE_UPDATE wanted */
        int                     feature_resize; /* XENFB_TYPE_RESIZE ok */
        struct xenfb_resize     resize;         /* protected by resize_lock */
@@ -402,8 +402,8 @@ static int xenfb_probe(struct xenbus_device *dev,
 
        info->nr_pages = (fb_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
 
-       info->mfns = vmalloc(sizeof(unsigned long) * info->nr_pages);
-       if (!info->mfns)
+       info->gfns = vmalloc(sizeof(unsigned long) * info->nr_pages);
+       if (!info->gfns)
                goto error_nomem;
 
        /* set up shared page */
@@ -530,29 +530,29 @@ static int xenfb_remove(struct xenbus_device *dev)
                framebuffer_release(info->fb_info);
        }
        free_page((unsigned long)info->page);
-       vfree(info->mfns);
+       vfree(info->gfns);
        vfree(info->fb);
        kfree(info);
 
        return 0;
 }
 
-static unsigned long vmalloc_to_mfn(void *address)
+static unsigned long vmalloc_to_gfn(void *address)
 {
-       return pfn_to_mfn(vmalloc_to_pfn(address));
+       return xen_page_to_gfn(vmalloc_to_page(address));
 }
 
 static void xenfb_init_shared_page(struct xenfb_info *info,
                                   struct fb_info *fb_info)
 {
        int i;
-       int epd = PAGE_SIZE / sizeof(info->mfns[0]);
+       int epd = PAGE_SIZE / sizeof(info->gfns[0]);
 
        for (i = 0; i < info->nr_pages; i++)
-               info->mfns[i] = vmalloc_to_mfn(info->fb + i * PAGE_SIZE);
+               info->gfns[i] = vmalloc_to_gfn(info->fb + i * PAGE_SIZE);
 
        for (i = 0; i * epd < info->nr_pages; i++)
-               info->page->pd[i] = vmalloc_to_mfn(&info->mfns[i * epd]);
+               info->page->pd[i] = vmalloc_to_gfn(&info->gfns[i * epd]);
 
        info->page->width = fb_info->var.xres;
        info->page->height = fb_info->var.yres;
@@ -586,7 +586,7 @@ static int xenfb_connect_backend(struct xenbus_device *dev,
                goto unbind_irq;
        }
        ret = xenbus_printf(xbt, dev->nodename, "page-ref", "%lu",
-                           virt_to_mfn(info->page));
+                           virt_to_gfn(info->page));
        if (ret)
                goto error_xenbus;
        ret = xenbus_printf(xbt, dev->nodename, "event-channel", "%u",
index 82e80e034f250b88993af25bb27434c2a78f8204..7efc32945810e8fdcafa76a6328517f35d65ea3c 100644 (file)
@@ -157,7 +157,9 @@ static void fill_balloon(struct virtio_balloon *vb, size_t num)
                }
                set_page_pfns(vb->pfns + vb->num_pfns, page);
                vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE;
-               adjust_managed_page_count(page, -1);
+               if (!virtio_has_feature(vb->vdev,
+                                       VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
+                       adjust_managed_page_count(page, -1);
        }
 
        /* Did we get any? */
@@ -166,14 +168,16 @@ static void fill_balloon(struct virtio_balloon *vb, size_t num)
        mutex_unlock(&vb->balloon_lock);
 }
 
-static void release_pages_by_pfn(const u32 pfns[], unsigned int num)
+static void release_pages_balloon(struct virtio_balloon *vb)
 {
        unsigned int i;
 
        /* Find pfns pointing at start of each page, get pages and free them. */
-       for (i = 0; i < num; i += VIRTIO_BALLOON_PAGES_PER_PAGE) {
-               struct page *page = balloon_pfn_to_page(pfns[i]);
-               adjust_managed_page_count(page, 1);
+       for (i = 0; i < vb->num_pfns; i += VIRTIO_BALLOON_PAGES_PER_PAGE) {
+               struct page *page = balloon_pfn_to_page(vb->pfns[i]);
+               if (!virtio_has_feature(vb->vdev,
+                                       VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
+                       adjust_managed_page_count(page, 1);
                put_page(page); /* balloon reference */
        }
 }
@@ -206,7 +210,7 @@ static unsigned leak_balloon(struct virtio_balloon *vb, size_t num)
        if (vb->num_pfns != 0)
                tell_host(vb, vb->deflate_vq);
        mutex_unlock(&vb->balloon_lock);
-       release_pages_by_pfn(vb->pfns, vb->num_pfns);
+       release_pages_balloon(vb);
        return num_freed_pages;
 }
 
index 10189b5b627f962cb9a8e9527aae829e27b0aec2..f499d9da72373d04d115caa4b7b4c9e6585ee965 100644 (file)
@@ -58,6 +58,7 @@
 
 #define pr_fmt(fmt) "virtio-mmio: " fmt
 
+#include <linux/acpi.h>
 #include <linux/highmem.h>
 #include <linux/interrupt.h>
 #include <linux/io.h>
@@ -732,12 +733,21 @@ static struct of_device_id virtio_mmio_match[] = {
 };
 MODULE_DEVICE_TABLE(of, virtio_mmio_match);
 
+#ifdef CONFIG_ACPI
+static const struct acpi_device_id virtio_mmio_acpi_match[] = {
+       { "LNRO0005", },
+       { }
+};
+MODULE_DEVICE_TABLE(acpi, virtio_mmio_acpi_match);
+#endif
+
 static struct platform_driver virtio_mmio_driver = {
        .probe          = virtio_mmio_probe,
        .remove         = virtio_mmio_remove,
        .driver         = {
                .name   = "virtio-mmio",
                .of_match_table = virtio_mmio_match,
+               .acpi_match_table = ACPI_PTR(virtio_mmio_acpi_match),
        },
 };
 
index 55c4b5b0a3173f799c4df9b294c9f4b8dac886ba..c68edc16aa54c5e65347588e32c883ffdaf71f63 100644 (file)
@@ -188,6 +188,15 @@ config AT91SAM9X_WATCHDOG
          Watchdog timer embedded into AT91SAM9X and AT91CAP9 chips. This will
          reboot your system when the timeout is reached.
 
+config SAMA5D4_WATCHDOG
+       tristate "Atmel SAMA5D4 Watchdog Timer"
+       depends on ARCH_AT91
+       select WATCHDOG_CORE
+       help
+         Atmel SAMA5D4 watchdog timer is embedded into SAMA5D4 chips.
+         Its Watchdog Timer Mode Register can be written more than once.
+         This will reboot your system when the timeout is reached.
+
 config CADENCE_WATCHDOG
        tristate "Cadence Watchdog Timer"
        depends on HAS_IOMEM
@@ -558,6 +567,17 @@ config DIGICOLOR_WATCHDOG
          To compile this driver as a module, choose M here: the
          module will be called digicolor_wdt.
 
+config LPC18XX_WATCHDOG
+       tristate "LPC18xx/43xx Watchdog"
+       depends on ARCH_LPC18XX || COMPILE_TEST
+       select WATCHDOG_CORE
+       help
+         Say Y here to include support for the watchdog timer
+         in NXP LPC SoCs family, which includes LPC18xx/LPC43xx
+         processors.
+         To compile this driver as a module, choose M here: the
+         module will be called lpc18xx_wdt.
+
 # AVR32 Architecture
 
 config AT32AP700X_WDT
@@ -1334,7 +1354,7 @@ config MPC5200_WDT
 
 config 8xxx_WDT
        tristate "MPC8xxx Platform Watchdog Timer"
-       depends on PPC_8xx || PPC_83xx || PPC_86xx
+       depends on PPC_8xx || PPC_83xx || PPC_86xx || PPC_MPC512x
        select WATCHDOG_CORE
        help
          This driver is for a SoC level watchdog that exists on some
index 59ea9a1b8e766f64e49531873cb5b94373956962..0c616e3f67bb57ba79c6457bb9132ed2c2a7b678 100644 (file)
@@ -41,6 +41,7 @@ obj-$(CONFIG_IXP4XX_WATCHDOG) += ixp4xx_wdt.o
 obj-$(CONFIG_KS8695_WATCHDOG) += ks8695_wdt.o
 obj-$(CONFIG_S3C2410_WATCHDOG) += s3c2410_wdt.o
 obj-$(CONFIG_SA1100_WATCHDOG) += sa1100_wdt.o
+obj-$(CONFIG_SAMA5D4_WATCHDOG) += sama5d4_wdt.o
 obj-$(CONFIG_DW_WATCHDOG) += dw_wdt.o
 obj-$(CONFIG_EP93XX_WATCHDOG) += ep93xx_wdt.o
 obj-$(CONFIG_PNX4008_WATCHDOG) += pnx4008_wdt.o
@@ -66,6 +67,7 @@ obj-$(CONFIG_TEGRA_WATCHDOG) += tegra_wdt.o
 obj-$(CONFIG_MESON_WATCHDOG) += meson_wdt.o
 obj-$(CONFIG_MEDIATEK_WATCHDOG) += mtk_wdt.o
 obj-$(CONFIG_DIGICOLOR_WATCHDOG) += digicolor_wdt.o
+obj-$(CONFIG_LPC18XX_WATCHDOG) += lpc18xx_wdt.o
 
 # AVR32 Architecture
 obj-$(CONFIG_AT32AP700X_WDT) += at32ap700x_wdt.o
index 9ba1153465ae8a1ef051dca258875940feb77b74..e12a797cb82099a7e58c8167ba5a3c9401eb1bab 100644 (file)
@@ -244,7 +244,7 @@ static int at91wdt_probe(struct platform_device *pdev)
        }
 
        regmap_st = syscon_node_to_regmap(parent->of_node);
-       if (!regmap_st)
+       if (IS_ERR(regmap_st))
                return -ENODEV;
 
        res = misc_register(&at91wdt_miscdev);
index e4698f7c5f9306826836e7856b7862aa21c593d6..7e6acaf3ece495ac9056bcd74532c994feb8ba0f 100644 (file)
@@ -17,6 +17,7 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
+#include <linux/clk.h>
 #include <linux/errno.h>
 #include <linux/init.h>
 #include <linux/interrupt.h>
@@ -90,6 +91,7 @@ struct at91wdt {
        unsigned long heartbeat;        /* WDT heartbeat in jiffies */
        bool nowayout;
        unsigned int irq;
+       struct clk *sclk;
 };
 
 /* ......................................................................... */
@@ -352,15 +354,25 @@ static int __init at91wdt_probe(struct platform_device *pdev)
        if (IS_ERR(wdt->base))
                return PTR_ERR(wdt->base);
 
+       wdt->sclk = devm_clk_get(&pdev->dev, NULL);
+       if (IS_ERR(wdt->sclk))
+               return PTR_ERR(wdt->sclk);
+
+       err = clk_prepare_enable(wdt->sclk);
+       if (err) {
+               dev_err(&pdev->dev, "Could not enable slow clock\n");
+               return err;
+       }
+
        if (pdev->dev.of_node) {
                err = of_at91wdt_init(pdev->dev.of_node, wdt);
                if (err)
-                       return err;
+                       goto err_clk;
        }
 
        err = at91_wdt_init(pdev, wdt);
        if (err)
-               return err;
+               goto err_clk;
 
        platform_set_drvdata(pdev, wdt);
 
@@ -368,6 +380,11 @@ static int __init at91wdt_probe(struct platform_device *pdev)
                wdt->wdd.timeout, wdt->nowayout);
 
        return 0;
+
+err_clk:
+       clk_disable_unprepare(wdt->sclk);
+
+       return err;
 }
 
 static int __exit at91wdt_remove(struct platform_device *pdev)
@@ -377,6 +394,7 @@ static int __exit at91wdt_remove(struct platform_device *pdev)
 
        pr_warn("I quit now, hardware will probably reboot!\n");
        del_timer(&wdt->timer);
+       clk_disable_unprepare(wdt->sclk);
 
        return 0;
 }
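
The at91sam9_wdt hunks above switch the driver to devm_clk_get() and bracket probe with clk_prepare_enable()/clk_disable_unprepare(). A minimal sketch of that pattern, with example_setup() standing in for the rest of probe (both example_* names are placeholders, not taken from the driver):

#include <linux/clk.h>
#include <linux/err.h>
#include <linux/platform_device.h>

static int example_setup(struct platform_device *pdev)
{
	return 0;	/* placeholder for the of_*_init()/hardware init steps */
}

static int example_wdt_probe(struct platform_device *pdev)
{
	struct clk *sclk;
	int err;

	/* devm-managed: the reference is dropped automatically on unbind,
	 * so a failure here needs no explicit clk_put() */
	sclk = devm_clk_get(&pdev->dev, NULL);
	if (IS_ERR(sclk))
		return PTR_ERR(sclk);

	err = clk_prepare_enable(sclk);
	if (err)
		return err;

	err = example_setup(pdev);
	if (err)
		goto err_clk;	/* undo the enable on any later failure */

	return 0;

err_clk:
	clk_disable_unprepare(sclk);
	return err;
}

Only the enable needs manual unwinding; the clk reference itself is released by the devm framework when the device goes away.
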
index c6fbb2e6c41baa55d46cb054928097ec4e892eff..b79a83b467cec8c790e6fe02bf66c3776ddaa5bf 100644 (file)
 
 #define AT91_WDT_MR            0x04                    /* Watchdog Mode Register */
 #define                AT91_WDT_WDV            (0xfff << 0)            /* Counter Value */
+#define                        AT91_WDT_SET_WDV(x)     ((x) & AT91_WDT_WDV)
 #define                AT91_WDT_WDFIEN         (1     << 12)           /* Fault Interrupt Enable */
 #define                AT91_WDT_WDRSTEN        (1     << 13)           /* Reset Processor */
 #define                AT91_WDT_WDRPROC        (1     << 14)           /* Timer Restart */
 #define                AT91_WDT_WDDIS          (1     << 15)           /* Watchdog Disable */
 #define                AT91_WDT_WDD            (0xfff << 16)           /* Delta Value */
+#define                        AT91_WDT_SET_WDD(x)     (((x) << 16) & AT91_WDT_WDD)
 #define                AT91_WDT_WDDBGHLT       (1     << 28)           /* Debug Halt */
 #define                AT91_WDT_WDIDLEHLT      (1     << 29)           /* Idle Halt */
 
index 7116968dee12944ebca036127aedc136825f2dda..66c3e656a616619e02c8c523f19913e3274459ff 100644 (file)
@@ -182,6 +182,7 @@ static int bcm2835_wdt_probe(struct platform_device *pdev)
        watchdog_set_drvdata(&bcm2835_wdt_wdd, wdt);
        watchdog_init_timeout(&bcm2835_wdt_wdd, heartbeat, dev);
        watchdog_set_nowayout(&bcm2835_wdt_wdd, nowayout);
+       bcm2835_wdt_wdd.parent = &pdev->dev;
        err = watchdog_register_device(&bcm2835_wdt_wdd);
        if (err) {
                dev_err(dev, "Failed to register watchdog device");
index b28a072abf78fbe030f647ef491e8fc24bc157d4..4064a43f1360a2fff166bd96caf81e9ceabaccaf 100644 (file)
@@ -209,6 +209,7 @@ static int bcm47xx_wdt_probe(struct platform_device *pdev)
 
        wdt->wdd.info = &bcm47xx_wdt_info;
        wdt->wdd.timeout = WDT_DEFAULT_TIME;
+       wdt->wdd.parent = &pdev->dev;
        ret = wdt->wdd.ops->set_timeout(&wdt->wdd, timeout);
        if (ret)
                goto err_timer;
index 22d8ae65772a508ecb28592b3d59f7304da9a8d0..e0c98423f2c9d0c2203a14933a0061f3026b224e 100644 (file)
@@ -319,6 +319,7 @@ static int bcm_kona_wdt_probe(struct platform_device *pdev)
        spin_lock_init(&wdt->lock);
        platform_set_drvdata(pdev, wdt);
        watchdog_set_drvdata(&bcm_kona_wdt_wdd, wdt);
+       bcm_kona_wdt_wdd.parent = &pdev->dev;
 
        ret = bcm_kona_wdt_set_timeout_reg(&bcm_kona_wdt_wdd, 0);
        if (ret) {
index e96b09b135c8faed6b544e522cd58ed446aedeae..04da4b66c75e361d191a0c0faac4fd4010d9d26a 100644 (file)
@@ -186,8 +186,6 @@ static int booke_wdt_stop(struct watchdog_device *wdog)
 static int booke_wdt_set_timeout(struct watchdog_device *wdt_dev,
                                 unsigned int timeout)
 {
-       if (timeout > MAX_WDT_TIMEOUT)
-               return -EINVAL;
        wdt_dev->timeout = timeout;
        booke_wdt_set(wdt_dev);
 
@@ -211,7 +209,6 @@ static struct watchdog_device booke_wdt_dev = {
        .info = &booke_wdt_info,
        .ops = &booke_wdt_ops,
        .min_timeout = 1,
-       .max_timeout = 0xFFFF
 };
 
 static void __exit booke_wdt_exit(void)
@@ -229,6 +226,7 @@ static int __init booke_wdt_init(void)
        booke_wdt_set_timeout(&booke_wdt_dev,
                              period_to_sec(booke_wdt_period));
        watchdog_set_nowayout(&booke_wdt_dev, nowayout);
+       booke_wdt_dev.max_timeout = MAX_WDT_TIMEOUT;
        if (booke_wdt_enabled)
                booke_wdt_start(&booke_wdt_dev);
 
index ce12f437f19567fb98f7154c601ffaf9240fcd27..a099b77fc0b91a076302f6657aaf87d4be9ed159 100644 (file)
@@ -358,6 +358,7 @@ static int __init coh901327_probe(struct platform_device *pdev)
        if (ret < 0)
                coh901327_wdt.timeout = 60;
 
+       coh901327_wdt.parent = &pdev->dev;
        ret = watchdog_register_device(&coh901327_wdt);
        if (ret == 0)
                dev_info(&pdev->dev,
index 2e9589652e1eee2a88f6fd137bde5366b6af77c3..67e67977bd29a7ce3637256dd421fe4bfb76cce4 100644 (file)
@@ -195,6 +195,7 @@ static int da9052_wdt_probe(struct platform_device *pdev)
        da9052_wdt->timeout = DA9052_DEF_TIMEOUT;
        da9052_wdt->info = &da9052_wdt_info;
        da9052_wdt->ops = &da9052_wdt_ops;
+       da9052_wdt->parent = &pdev->dev;
        watchdog_set_drvdata(da9052_wdt, driver_data);
 
        kref_init(&driver_data->kref);
index 495089d8dbfeb7965142c48f2579672ed8882d6c..04d1430d93d2007742cbb1cc97515ac7d78b2063 100644 (file)
@@ -161,6 +161,7 @@ static int da9055_wdt_probe(struct platform_device *pdev)
        da9055_wdt->timeout = DA9055_DEF_TIMEOUT;
        da9055_wdt->info = &da9055_wdt_info;
        da9055_wdt->ops = &da9055_wdt_ops;
+       da9055_wdt->parent = &pdev->dev;
        watchdog_set_nowayout(da9055_wdt, nowayout);
        watchdog_set_drvdata(da9055_wdt, driver_data);
 
index b3a870ce85be7b70257b96fd472f46e2c744a957..7386111220d58480c298f1612305410485eedc2f 100644 (file)
@@ -210,6 +210,7 @@ static int da9062_wdt_probe(struct platform_device *pdev)
        wdt->wdtdev.max_timeout = DA9062_WDT_MAX_TIMEOUT;
        wdt->wdtdev.timeout = DA9062_WDG_DEFAULT_TIMEOUT;
        wdt->wdtdev.status = WATCHDOG_NOWAYOUT_INIT_STATUS;
+       wdt->wdtdev.parent = &pdev->dev;
 
        watchdog_set_drvdata(&wdt->wdtdev, wdt);
        dev_set_drvdata(&pdev->dev, wdt);
index e2fe2ebdebd4d6bb12bf41adb1f11c9654306963..6bf130bd863d5fbb298027b54ab2c59033e9d3bd 100644 (file)
@@ -175,6 +175,7 @@ static int da9063_wdt_probe(struct platform_device *pdev)
        wdt->wdtdev.min_timeout = DA9063_WDT_MIN_TIMEOUT;
        wdt->wdtdev.max_timeout = DA9063_WDT_MAX_TIMEOUT;
        wdt->wdtdev.timeout = DA9063_WDG_TIMEOUT;
+       wdt->wdtdev.parent = &pdev->dev;
 
        wdt->wdtdev.status = WATCHDOG_NOWAYOUT_INIT_STATUS;
 
index cfdf8a408aea055a6ef87a363aab425ea527d1ea..17454ca653f42c6cc5154966f0da619094c8e594 100644 (file)
@@ -179,6 +179,7 @@ static int davinci_wdt_probe(struct platform_device *pdev)
        wdd->min_timeout        = 1;
        wdd->max_timeout        = MAX_HEARTBEAT;
        wdd->timeout            = DEFAULT_HEARTBEAT;
+       wdd->parent             = &pdev->dev;
 
        watchdog_init_timeout(wdd, heartbeat, dev);
 
index 31d8e4936611e48f3d774b22cd1cef64d7e2e508..50abe1bf62a50a9a914d53e2c5cd0469fafb84fd 100644 (file)
@@ -143,6 +143,7 @@ static int dc_wdt_probe(struct platform_device *pdev)
        }
        dc_wdt_wdd.max_timeout = U32_MAX / clk_get_rate(wdt->clk);
        dc_wdt_wdd.timeout = dc_wdt_wdd.max_timeout;
+       dc_wdt_wdd.parent = &pdev->dev;
 
        spin_lock_init(&wdt->lock);
 
index 7a2cc7191c585309036a51b55116080c635b539b..0a4d7cc05d5439346ae0b77633089a08ea0464d6 100644 (file)
@@ -132,6 +132,7 @@ static int ep93xx_wdt_probe(struct platform_device *pdev)
        val = readl(mmio_base + EP93XX_WATCHDOG);
        ep93xx_wdt_wdd.bootstatus = (val & 0x01) ? WDIOF_CARDRESET : 0;
        ep93xx_wdt_wdd.timeout = timeout;
+       ep93xx_wdt_wdd.parent = &pdev->dev;
 
        watchdog_set_nowayout(&ep93xx_wdt_wdd, nowayout);
 
index 1687cc2d71223cc799fbb60bb1d3f0914d7c3ada..90d59d3f38a3320f6afa616cfd62a279132ad5e4 100644 (file)
@@ -50,12 +50,41 @@ static void gpio_wdt_disable(struct gpio_wdt_priv *priv)
                gpio_direction_input(priv->gpio);
 }
 
+static void gpio_wdt_hwping(unsigned long data)
+{
+       struct watchdog_device *wdd = (struct watchdog_device *)data;
+       struct gpio_wdt_priv *priv = watchdog_get_drvdata(wdd);
+
+       if (priv->armed && time_after(jiffies, priv->last_jiffies +
+                                     msecs_to_jiffies(wdd->timeout * 1000))) {
+               dev_crit(wdd->dev, "Timer expired. System will reboot soon!\n");
+               return;
+       }
+
+       /* Restart timer */
+       mod_timer(&priv->timer, jiffies + priv->hw_margin);
+
+       switch (priv->hw_algo) {
+       case HW_ALGO_TOGGLE:
+               /* Toggle output pin */
+               priv->state = !priv->state;
+               gpio_set_value_cansleep(priv->gpio, priv->state);
+               break;
+       case HW_ALGO_LEVEL:
+               /* Pulse */
+               gpio_set_value_cansleep(priv->gpio, !priv->active_low);
+               udelay(1);
+               gpio_set_value_cansleep(priv->gpio, priv->active_low);
+               break;
+       }
+}
+
 static void gpio_wdt_start_impl(struct gpio_wdt_priv *priv)
 {
        priv->state = priv->active_low;
        gpio_direction_output(priv->gpio, priv->state);
        priv->last_jiffies = jiffies;
-       mod_timer(&priv->timer, priv->last_jiffies + priv->hw_margin);
+       gpio_wdt_hwping((unsigned long)&priv->wdd);
 }
 
 static int gpio_wdt_start(struct watchdog_device *wdd)
@@ -97,35 +126,6 @@ static int gpio_wdt_set_timeout(struct watchdog_device *wdd, unsigned int t)
        return gpio_wdt_ping(wdd);
 }
 
-static void gpio_wdt_hwping(unsigned long data)
-{
-       struct watchdog_device *wdd = (struct watchdog_device *)data;
-       struct gpio_wdt_priv *priv = watchdog_get_drvdata(wdd);
-
-       if (priv->armed && time_after(jiffies, priv->last_jiffies +
-                                     msecs_to_jiffies(wdd->timeout * 1000))) {
-               dev_crit(wdd->dev, "Timer expired. System will reboot soon!\n");
-               return;
-       }
-
-       /* Restart timer */
-       mod_timer(&priv->timer, jiffies + priv->hw_margin);
-
-       switch (priv->hw_algo) {
-       case HW_ALGO_TOGGLE:
-               /* Toggle output pin */
-               priv->state = !priv->state;
-               gpio_set_value_cansleep(priv->gpio, priv->state);
-               break;
-       case HW_ALGO_LEVEL:
-               /* Pulse */
-               gpio_set_value_cansleep(priv->gpio, !priv->active_low);
-               udelay(1);
-               gpio_set_value_cansleep(priv->gpio, priv->active_low);
-               break;
-       }
-}
-
 static int gpio_wdt_notify_sys(struct notifier_block *nb, unsigned long code,
                               void *unused)
 {
@@ -182,10 +182,10 @@ static int gpio_wdt_probe(struct platform_device *pdev)
        ret = of_property_read_string(pdev->dev.of_node, "hw_algo", &algo);
        if (ret)
                return ret;
-       if (!strncmp(algo, "toggle", 6)) {
+       if (!strcmp(algo, "toggle")) {
                priv->hw_algo = HW_ALGO_TOGGLE;
                f = GPIOF_IN;
-       } else if (!strncmp(algo, "level", 5)) {
+       } else if (!strcmp(algo, "level")) {
                priv->hw_algo = HW_ALGO_LEVEL;
                f = priv->active_low ? GPIOF_OUT_INIT_HIGH : GPIOF_OUT_INIT_LOW;
        } else {
@@ -217,6 +217,7 @@ static int gpio_wdt_probe(struct platform_device *pdev)
        priv->wdd.ops           = &gpio_wdt_ops;
        priv->wdd.min_timeout   = SOFT_TIMEOUT_MIN;
        priv->wdd.max_timeout   = SOFT_TIMEOUT_MAX;
+       priv->wdd.parent        = &pdev->dev;
 
        if (watchdog_init_timeout(&priv->wdd, 0, &pdev->dev) < 0)
                priv->wdd.timeout = SOFT_TIMEOUT_DEF;
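The strncmp() to strcmp() change in the gpio_wdt hunk above tightens matching of the "hw_algo" device-tree property: strncmp(algo, "toggle", 6) also accepts any longer string that merely starts with "toggle", while strcmp() requires an exact match. A standalone illustration (plain C, not part of the patch):

#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *algo = "toggled";	/* hypothetical malformed property value */

	/* Old test: prefix comparison, silently accepts "toggled" */
	printf("strncmp: %s\n", !strncmp(algo, "toggle", 6) ? "match" : "no match");
	/* New test: exact comparison, rejects it */
	printf("strcmp:  %s\n", !strcmp(algo, "toggle") ? "match" : "no match");
	return 0;
}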
index 9bc39ae51624c48d1ac4b263d1989e766c60c383..78c2541f5d52d31af60e239dafd417b2cabd4cc8 100644 (file)
@@ -267,6 +267,7 @@ static int ie6xx_wdt_probe(struct platform_device *pdev)
 
        ie6xx_wdt_dev.timeout = timeout;
        watchdog_set_nowayout(&ie6xx_wdt_dev, nowayout);
+       ie6xx_wdt_dev.parent = &pdev->dev;
 
        spin_lock_init(&ie6xx_wdt_data.unlock_sequence);
 
index 0f73621827abf839f81fdfd46056652032898266..15ab07230960f7fd57a7c6032edaf8f3df99566e 100644 (file)
@@ -316,6 +316,7 @@ static int pdc_wdt_remove(struct platform_device *pdev)
 {
        struct pdc_wdt_dev *pdc_wdt = platform_get_drvdata(pdev);
 
+       unregister_restart_handler(&pdc_wdt->restart_handler);
        pdc_wdt_stop(&pdc_wdt->wdt_dev);
        watchdog_unregister_device(&pdc_wdt->wdt_dev);
        clk_disable_unprepare(pdc_wdt->wdt_clk);
index 84f6701c391fc9608c14bea990ef7b706416ace1..0a436b5d1e8444efed12ad7ce3f5b3eb5d4f0930 100644 (file)
@@ -137,6 +137,7 @@ static int mid_wdt_probe(struct platform_device *pdev)
        wdt_dev->min_timeout = MID_WDT_TIMEOUT_MIN;
        wdt_dev->max_timeout = MID_WDT_TIMEOUT_MAX;
        wdt_dev->timeout = MID_WDT_DEFAULT_TIMEOUT;
+       wdt_dev->parent = &pdev->dev;
 
        watchdog_set_drvdata(wdt_dev, &pdev->dev);
        platform_set_drvdata(pdev, wdt_dev);
index 4c2cc09c0c5780ec859c8643c65e887bfd3b0b54..6a7d5c365438120d5a31d59038f1aaea777b691c 100644 (file)
@@ -174,6 +174,7 @@ static int jz4740_wdt_probe(struct platform_device *pdev)
        jz4740_wdt->timeout = heartbeat;
        jz4740_wdt->min_timeout = 1;
        jz4740_wdt->max_timeout = MAX_HEARTBEAT;
+       jz4740_wdt->parent = &pdev->dev;
        watchdog_set_nowayout(jz4740_wdt, nowayout);
        watchdog_set_drvdata(jz4740_wdt, drvdata);
 
diff --git a/drivers/watchdog/lpc18xx_wdt.c b/drivers/watchdog/lpc18xx_wdt.c
new file mode 100644 (file)
index 0000000..ab7b8b1
--- /dev/null
@@ -0,0 +1,340 @@
+/*
+ * NXP LPC18xx Watchdog Timer (WDT)
+ *
+ * Copyright (c) 2015 Ariel D'Alessandro <ariel@vanguardiasur.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * Notes
+ * -----
+ * The Watchdog consists of a fixed divide-by-4 clock pre-scaler and a 24-bit
+ * counter which decrements on every clock cycle.
+ */
+
+#include <linux/clk.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/reboot.h>
+#include <linux/watchdog.h>
+
+/* Registers */
+#define LPC18XX_WDT_MOD                        0x00
+#define LPC18XX_WDT_MOD_WDEN           BIT(0)
+#define LPC18XX_WDT_MOD_WDRESET                BIT(1)
+
+#define LPC18XX_WDT_TC                 0x04
+#define LPC18XX_WDT_TC_MIN             0xff
+#define LPC18XX_WDT_TC_MAX             0xffffff
+
+#define LPC18XX_WDT_FEED               0x08
+#define LPC18XX_WDT_FEED_MAGIC1                0xaa
+#define LPC18XX_WDT_FEED_MAGIC2                0x55
+
+#define LPC18XX_WDT_TV                 0x0c
+
+/* Clock pre-scaler */
+#define LPC18XX_WDT_CLK_DIV            4
+
+/* Timeout values in seconds */
+#define LPC18XX_WDT_DEF_TIMEOUT                30U
+
+static int heartbeat;
+module_param(heartbeat, int, 0);
+MODULE_PARM_DESC(heartbeat, "Watchdog heartbeats in seconds (default="
+                __MODULE_STRING(LPC18XX_WDT_DEF_TIMEOUT) ")");
+
+static bool nowayout = WATCHDOG_NOWAYOUT;
+module_param(nowayout, bool, 0);
+MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default="
+                __MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
+
+struct lpc18xx_wdt_dev {
+       struct watchdog_device  wdt_dev;
+       struct clk              *reg_clk;
+       struct clk              *wdt_clk;
+       unsigned long           clk_rate;
+       void __iomem            *base;
+       struct timer_list       timer;
+       struct notifier_block   restart_handler;
+       spinlock_t              lock;
+};
+
+static int lpc18xx_wdt_feed(struct watchdog_device *wdt_dev)
+{
+       struct lpc18xx_wdt_dev *lpc18xx_wdt = watchdog_get_drvdata(wdt_dev);
+       unsigned long flags;
+
+       /*
+        * An abort condition will occur if an interrupt happens during the feed
+        * sequence.
+        */
+       spin_lock_irqsave(&lpc18xx_wdt->lock, flags);
+       writel(LPC18XX_WDT_FEED_MAGIC1, lpc18xx_wdt->base + LPC18XX_WDT_FEED);
+       writel(LPC18XX_WDT_FEED_MAGIC2, lpc18xx_wdt->base + LPC18XX_WDT_FEED);
+       spin_unlock_irqrestore(&lpc18xx_wdt->lock, flags);
+
+       return 0;
+}
+
+static void lpc18xx_wdt_timer_feed(unsigned long data)
+{
+       struct watchdog_device *wdt_dev = (struct watchdog_device *)data;
+       struct lpc18xx_wdt_dev *lpc18xx_wdt = watchdog_get_drvdata(wdt_dev);
+
+       lpc18xx_wdt_feed(wdt_dev);
+
+       /* Use safe value (1/2 of real timeout) */
+       mod_timer(&lpc18xx_wdt->timer, jiffies +
+                 msecs_to_jiffies((wdt_dev->timeout * MSEC_PER_SEC) / 2));
+}
+
+/*
+ * Since LPC18xx Watchdog cannot be disabled in hardware, we must keep feeding
+ * it with a timer until userspace watchdog software takes over.
+ */
+static int lpc18xx_wdt_stop(struct watchdog_device *wdt_dev)
+{
+       lpc18xx_wdt_timer_feed((unsigned long)wdt_dev);
+
+       return 0;
+}
+
+static void __lpc18xx_wdt_set_timeout(struct lpc18xx_wdt_dev *lpc18xx_wdt)
+{
+       unsigned int val;
+
+       val = DIV_ROUND_UP(lpc18xx_wdt->wdt_dev.timeout * lpc18xx_wdt->clk_rate,
+                          LPC18XX_WDT_CLK_DIV);
+       writel(val, lpc18xx_wdt->base + LPC18XX_WDT_TC);
+}
+
+static int lpc18xx_wdt_set_timeout(struct watchdog_device *wdt_dev,
+                                  unsigned int new_timeout)
+{
+       struct lpc18xx_wdt_dev *lpc18xx_wdt = watchdog_get_drvdata(wdt_dev);
+
+       lpc18xx_wdt->wdt_dev.timeout = new_timeout;
+       __lpc18xx_wdt_set_timeout(lpc18xx_wdt);
+
+       return 0;
+}
+
+static unsigned int lpc18xx_wdt_get_timeleft(struct watchdog_device *wdt_dev)
+{
+       struct lpc18xx_wdt_dev *lpc18xx_wdt = watchdog_get_drvdata(wdt_dev);
+       unsigned int val;
+
+       val = readl(lpc18xx_wdt->base + LPC18XX_WDT_TV);
+       return (val * LPC18XX_WDT_CLK_DIV) / lpc18xx_wdt->clk_rate;
+}
+
+static int lpc18xx_wdt_start(struct watchdog_device *wdt_dev)
+{
+       struct lpc18xx_wdt_dev *lpc18xx_wdt = watchdog_get_drvdata(wdt_dev);
+       unsigned int val;
+
+       if (timer_pending(&lpc18xx_wdt->timer))
+               del_timer(&lpc18xx_wdt->timer);
+
+       val = readl(lpc18xx_wdt->base + LPC18XX_WDT_MOD);
+       val |= LPC18XX_WDT_MOD_WDEN;
+       val |= LPC18XX_WDT_MOD_WDRESET;
+       writel(val, lpc18xx_wdt->base + LPC18XX_WDT_MOD);
+
+       /*
+        * Setting the WDEN bit in the WDMOD register is not sufficient to
+        * enable the Watchdog. A valid feed sequence must be completed after
+        * setting WDEN before the Watchdog is capable of generating a reset.
+        */
+       lpc18xx_wdt_feed(wdt_dev);
+
+       return 0;
+}
+
+static struct watchdog_info lpc18xx_wdt_info = {
+       .identity       = "NXP LPC18xx Watchdog",
+       .options        = WDIOF_SETTIMEOUT |
+                         WDIOF_KEEPALIVEPING |
+                         WDIOF_MAGICCLOSE,
+};
+
+static const struct watchdog_ops lpc18xx_wdt_ops = {
+       .owner          = THIS_MODULE,
+       .start          = lpc18xx_wdt_start,
+       .stop           = lpc18xx_wdt_stop,
+       .ping           = lpc18xx_wdt_feed,
+       .set_timeout    = lpc18xx_wdt_set_timeout,
+       .get_timeleft   = lpc18xx_wdt_get_timeleft,
+};
+
+static int lpc18xx_wdt_restart(struct notifier_block *this, unsigned long mode,
+                              void *cmd)
+{
+       struct lpc18xx_wdt_dev *lpc18xx_wdt = container_of(this,
+                               struct lpc18xx_wdt_dev, restart_handler);
+       unsigned long flags;
+       int val;
+
+       /*
+        * Incorrect feed sequence causes immediate watchdog reset if enabled.
+        */
+       spin_lock_irqsave(&lpc18xx_wdt->lock, flags);
+
+       val = readl(lpc18xx_wdt->base + LPC18XX_WDT_MOD);
+       val |= LPC18XX_WDT_MOD_WDEN;
+       val |= LPC18XX_WDT_MOD_WDRESET;
+       writel(val, lpc18xx_wdt->base + LPC18XX_WDT_MOD);
+
+       writel(LPC18XX_WDT_FEED_MAGIC1, lpc18xx_wdt->base + LPC18XX_WDT_FEED);
+       writel(LPC18XX_WDT_FEED_MAGIC2, lpc18xx_wdt->base + LPC18XX_WDT_FEED);
+
+       writel(LPC18XX_WDT_FEED_MAGIC1, lpc18xx_wdt->base + LPC18XX_WDT_FEED);
+       writel(LPC18XX_WDT_FEED_MAGIC1, lpc18xx_wdt->base + LPC18XX_WDT_FEED);
+
+       spin_unlock_irqrestore(&lpc18xx_wdt->lock, flags);
+
+       return NOTIFY_OK;
+}
+
+static int lpc18xx_wdt_probe(struct platform_device *pdev)
+{
+       struct lpc18xx_wdt_dev *lpc18xx_wdt;
+       struct device *dev = &pdev->dev;
+       struct resource *res;
+       int ret;
+
+       lpc18xx_wdt = devm_kzalloc(dev, sizeof(*lpc18xx_wdt), GFP_KERNEL);
+       if (!lpc18xx_wdt)
+               return -ENOMEM;
+
+       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       lpc18xx_wdt->base = devm_ioremap_resource(dev, res);
+       if (IS_ERR(lpc18xx_wdt->base))
+               return PTR_ERR(lpc18xx_wdt->base);
+
+       lpc18xx_wdt->reg_clk = devm_clk_get(dev, "reg");
+       if (IS_ERR(lpc18xx_wdt->reg_clk)) {
+               dev_err(dev, "failed to get the reg clock\n");
+               return PTR_ERR(lpc18xx_wdt->reg_clk);
+       }
+
+       lpc18xx_wdt->wdt_clk = devm_clk_get(dev, "wdtclk");
+       if (IS_ERR(lpc18xx_wdt->wdt_clk)) {
+               dev_err(dev, "failed to get the wdt clock\n");
+               return PTR_ERR(lpc18xx_wdt->wdt_clk);
+       }
+
+       ret = clk_prepare_enable(lpc18xx_wdt->reg_clk);
+       if (ret) {
+               dev_err(dev, "could not prepare or enable sys clock\n");
+               return ret;
+       }
+
+       ret = clk_prepare_enable(lpc18xx_wdt->wdt_clk);
+       if (ret) {
+               dev_err(dev, "could not prepare or enable wdt clock\n");
+               goto disable_reg_clk;
+       }
+
+       /* We use the clock rate to calculate timeouts */
+       lpc18xx_wdt->clk_rate = clk_get_rate(lpc18xx_wdt->wdt_clk);
+       if (lpc18xx_wdt->clk_rate == 0) {
+               dev_err(dev, "failed to get clock rate\n");
+               ret = -EINVAL;
+               goto disable_wdt_clk;
+       }
+
+       lpc18xx_wdt->wdt_dev.info = &lpc18xx_wdt_info;
+       lpc18xx_wdt->wdt_dev.ops = &lpc18xx_wdt_ops;
+
+       lpc18xx_wdt->wdt_dev.min_timeout = DIV_ROUND_UP(LPC18XX_WDT_TC_MIN *
+                               LPC18XX_WDT_CLK_DIV, lpc18xx_wdt->clk_rate);
+
+       lpc18xx_wdt->wdt_dev.max_timeout = (LPC18XX_WDT_TC_MAX *
+                               LPC18XX_WDT_CLK_DIV) / lpc18xx_wdt->clk_rate;
+
+       lpc18xx_wdt->wdt_dev.timeout = min(lpc18xx_wdt->wdt_dev.max_timeout,
+                                          LPC18XX_WDT_DEF_TIMEOUT);
+
+       spin_lock_init(&lpc18xx_wdt->lock);
+
+       lpc18xx_wdt->wdt_dev.parent = dev;
+       watchdog_set_drvdata(&lpc18xx_wdt->wdt_dev, lpc18xx_wdt);
+
+       ret = watchdog_init_timeout(&lpc18xx_wdt->wdt_dev, heartbeat, dev);
+
+       __lpc18xx_wdt_set_timeout(lpc18xx_wdt);
+
+       setup_timer(&lpc18xx_wdt->timer, lpc18xx_wdt_timer_feed,
+                   (unsigned long)&lpc18xx_wdt->wdt_dev);
+
+       watchdog_set_nowayout(&lpc18xx_wdt->wdt_dev, nowayout);
+
+       platform_set_drvdata(pdev, lpc18xx_wdt);
+
+       ret = watchdog_register_device(&lpc18xx_wdt->wdt_dev);
+       if (ret)
+               goto disable_wdt_clk;
+
+       lpc18xx_wdt->restart_handler.notifier_call = lpc18xx_wdt_restart;
+       lpc18xx_wdt->restart_handler.priority = 128;
+       ret = register_restart_handler(&lpc18xx_wdt->restart_handler);
+       if (ret)
+               dev_warn(dev, "failed to register restart handler: %d\n", ret);
+
+       return 0;
+
+disable_wdt_clk:
+       clk_disable_unprepare(lpc18xx_wdt->wdt_clk);
+disable_reg_clk:
+       clk_disable_unprepare(lpc18xx_wdt->reg_clk);
+       return ret;
+}
+
+static void lpc18xx_wdt_shutdown(struct platform_device *pdev)
+{
+       struct lpc18xx_wdt_dev *lpc18xx_wdt = platform_get_drvdata(pdev);
+
+       lpc18xx_wdt_stop(&lpc18xx_wdt->wdt_dev);
+}
+
+static int lpc18xx_wdt_remove(struct platform_device *pdev)
+{
+       struct lpc18xx_wdt_dev *lpc18xx_wdt = platform_get_drvdata(pdev);
+
+       unregister_restart_handler(&lpc18xx_wdt->restart_handler);
+
+       dev_warn(&pdev->dev, "I quit now, hardware will probably reboot!\n");
+       del_timer(&lpc18xx_wdt->timer);
+
+       watchdog_unregister_device(&lpc18xx_wdt->wdt_dev);
+       clk_disable_unprepare(lpc18xx_wdt->wdt_clk);
+       clk_disable_unprepare(lpc18xx_wdt->reg_clk);
+
+       return 0;
+}
+
+static const struct of_device_id lpc18xx_wdt_match[] = {
+       { .compatible = "nxp,lpc1850-wwdt" },
+       {}
+};
+MODULE_DEVICE_TABLE(of, lpc18xx_wdt_match);
+
+static struct platform_driver lpc18xx_wdt_driver = {
+       .driver = {
+               .name = "lpc18xx-wdt",
+               .of_match_table = lpc18xx_wdt_match,
+       },
+       .probe = lpc18xx_wdt_probe,
+       .remove = lpc18xx_wdt_remove,
+       .shutdown = lpc18xx_wdt_shutdown,
+};
+module_platform_driver(lpc18xx_wdt_driver);
+
+MODULE_AUTHOR("Ariel D'Alessandro <ariel@vanguardiasur.com.ar>");
+MODULE_DESCRIPTION("NXP LPC18xx Watchdog Timer Driver");
+MODULE_LICENSE("GPL v2");
index d193a5e79c381775ba683a5cf2153d8b0e5619c7..69013007dc4701826518c0babd6d94d258719892 100644 (file)
@@ -197,6 +197,7 @@ static int a21_wdt_probe(struct platform_device *pdev)
        watchdog_init_timeout(&a21_wdt, 30, &pdev->dev);
        watchdog_set_nowayout(&a21_wdt, nowayout);
        watchdog_set_drvdata(&a21_wdt, drv);
+       a21_wdt.parent = &pdev->dev;
 
        reset = a21_wdt_get_bootstatus(drv);
        if (reset == 2)
index 59f0913c734121c96eb288cec9ee85d5dd06e4c3..3aefddebb386184456ef009182e35a267d0cca34 100644 (file)
@@ -130,6 +130,7 @@ static int menf21bmc_wdt_probe(struct platform_device *pdev)
        drv_data->wdt.info = &menf21bmc_wdt_info;
        drv_data->wdt.min_timeout = BMC_WD_TIMEOUT_MIN;
        drv_data->wdt.max_timeout = BMC_WD_TIMEOUT_MAX;
+       drv_data->wdt.parent = &pdev->dev;
        drv_data->i2c_client = i2c_client;
 
        /*
index 689381a248871af4610660ac5c6e05ec01939a8f..5f2273aac37d5df1115e516bf5b6544967b89353 100644 (file)
@@ -50,8 +50,12 @@ struct mpc8xxx_wdt_type {
        bool hw_enabled;
 };
 
-static struct mpc8xxx_wdt __iomem *wd_base;
-static int mpc8xxx_wdt_init_late(void);
+struct mpc8xxx_wdt_ddata {
+       struct mpc8xxx_wdt __iomem *base;
+       struct watchdog_device wdd;
+       struct timer_list timer;
+       spinlock_t lock;
+};
 
 static u16 timeout = 0xffff;
 module_param(timeout, ushort, 0);
@@ -68,65 +72,59 @@ module_param(nowayout, bool, 0);
 MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started "
                 "(default=" __MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
 
-/*
- * We always prescale, but if someone really doesn't want to they can set this
- * to 0
- */
-static int prescale = 1;
-
-static DEFINE_SPINLOCK(wdt_spinlock);
-
-static void mpc8xxx_wdt_keepalive(void)
+static void mpc8xxx_wdt_keepalive(struct mpc8xxx_wdt_ddata *ddata)
 {
        /* Ping the WDT */
-       spin_lock(&wdt_spinlock);
-       out_be16(&wd_base->swsrr, 0x556c);
-       out_be16(&wd_base->swsrr, 0xaa39);
-       spin_unlock(&wdt_spinlock);
+       spin_lock(&ddata->lock);
+       out_be16(&ddata->base->swsrr, 0x556c);
+       out_be16(&ddata->base->swsrr, 0xaa39);
+       spin_unlock(&ddata->lock);
 }
 
-static struct watchdog_device mpc8xxx_wdt_dev;
-static void mpc8xxx_wdt_timer_ping(unsigned long arg);
-static DEFINE_TIMER(wdt_timer, mpc8xxx_wdt_timer_ping, 0,
-               (unsigned long)&mpc8xxx_wdt_dev);
-
 static void mpc8xxx_wdt_timer_ping(unsigned long arg)
 {
-       struct watchdog_device *w = (struct watchdog_device *)arg;
+       struct mpc8xxx_wdt_ddata *ddata = (void *)arg;
 
-       mpc8xxx_wdt_keepalive();
+       mpc8xxx_wdt_keepalive(ddata);
        /* We're pinging it twice faster than needed, just to be sure. */
-       mod_timer(&wdt_timer, jiffies + HZ * w->timeout / 2);
+       mod_timer(&ddata->timer, jiffies + HZ * ddata->wdd.timeout / 2);
 }
 
 static int mpc8xxx_wdt_start(struct watchdog_device *w)
 {
-       u32 tmp = SWCRR_SWEN;
+       struct mpc8xxx_wdt_ddata *ddata =
+               container_of(w, struct mpc8xxx_wdt_ddata, wdd);
+
+       u32 tmp = SWCRR_SWEN | SWCRR_SWPR;
 
        /* Good, fire up the show */
-       if (prescale)
-               tmp |= SWCRR_SWPR;
        if (reset)
                tmp |= SWCRR_SWRI;
 
        tmp |= timeout << 16;
 
-       out_be32(&wd_base->swcrr, tmp);
+       out_be32(&ddata->base->swcrr, tmp);
 
-       del_timer_sync(&wdt_timer);
+       del_timer_sync(&ddata->timer);
 
        return 0;
 }
 
 static int mpc8xxx_wdt_ping(struct watchdog_device *w)
 {
-       mpc8xxx_wdt_keepalive();
+       struct mpc8xxx_wdt_ddata *ddata =
+               container_of(w, struct mpc8xxx_wdt_ddata, wdd);
+
+       mpc8xxx_wdt_keepalive(ddata);
        return 0;
 }
 
 static int mpc8xxx_wdt_stop(struct watchdog_device *w)
 {
-       mod_timer(&wdt_timer, jiffies);
+       struct mpc8xxx_wdt_ddata *ddata =
+               container_of(w, struct mpc8xxx_wdt_ddata, wdd);
+
+       mod_timer(&ddata->timer, jiffies);
        return 0;
 }
 
@@ -143,53 +141,57 @@ static struct watchdog_ops mpc8xxx_wdt_ops = {
        .stop = mpc8xxx_wdt_stop,
 };
 
-static struct watchdog_device mpc8xxx_wdt_dev = {
-       .info = &mpc8xxx_wdt_info,
-       .ops = &mpc8xxx_wdt_ops,
-};
-
-static const struct of_device_id mpc8xxx_wdt_match[];
 static int mpc8xxx_wdt_probe(struct platform_device *ofdev)
 {
        int ret;
-       const struct of_device_id *match;
-       struct device_node *np = ofdev->dev.of_node;
+       struct resource *res;
        const struct mpc8xxx_wdt_type *wdt_type;
+       struct mpc8xxx_wdt_ddata *ddata;
        u32 freq = fsl_get_sys_freq();
        bool enabled;
        unsigned int timeout_sec;
 
-       match = of_match_device(mpc8xxx_wdt_match, &ofdev->dev);
-       if (!match)
+       wdt_type = of_device_get_match_data(&ofdev->dev);
+       if (!wdt_type)
                return -EINVAL;
-       wdt_type = match->data;
 
        if (!freq || freq == -1)
                return -EINVAL;
 
-       wd_base = of_iomap(np, 0);
-       if (!wd_base)
+       ddata = devm_kzalloc(&ofdev->dev, sizeof(*ddata), GFP_KERNEL);
+       if (!ddata)
                return -ENOMEM;
 
-       enabled = in_be32(&wd_base->swcrr) & SWCRR_SWEN;
+       res = platform_get_resource(ofdev, IORESOURCE_MEM, 0);
+       ddata->base = devm_ioremap_resource(&ofdev->dev, res);
+       if (IS_ERR(ddata->base))
+               return PTR_ERR(ddata->base);
+
+       enabled = in_be32(&ddata->base->swcrr) & SWCRR_SWEN;
        if (!enabled && wdt_type->hw_enabled) {
                pr_info("could not be enabled in software\n");
-               ret = -ENOSYS;
-               goto err_unmap;
+               return -ENODEV;
        }
 
+       spin_lock_init(&ddata->lock);
+       setup_timer(&ddata->timer, mpc8xxx_wdt_timer_ping,
+                   (unsigned long)ddata);
+
+       ddata->wdd.info = &mpc8xxx_wdt_info,
+       ddata->wdd.ops = &mpc8xxx_wdt_ops,
+
        /* Calculate the timeout in seconds */
-       if (prescale)
-               timeout_sec = (timeout * wdt_type->prescaler) / freq;
-       else
-               timeout_sec = timeout / freq;
-
-       mpc8xxx_wdt_dev.timeout = timeout_sec;
-#ifdef MODULE
-       ret = mpc8xxx_wdt_init_late();
-       if (ret)
-               goto err_unmap;
-#endif
+       timeout_sec = (timeout * wdt_type->prescaler) / freq;
+
+       ddata->wdd.timeout = timeout_sec;
+
+       watchdog_set_nowayout(&ddata->wdd, nowayout);
+
+       ret = watchdog_register_device(&ddata->wdd);
+       if (ret) {
+               pr_err("cannot register watchdog device (err=%d)\n", ret);
+               return ret;
+       }
 
        pr_info("WDT driver for MPC8xxx initialized. mode:%s timeout=%d (%d seconds)\n",
                reset ? "reset" : "interrupt", timeout, timeout_sec);
@@ -200,21 +202,20 @@ static int mpc8xxx_wdt_probe(struct platform_device *ofdev)
         * userspace handles it.
         */
        if (enabled)
-               mod_timer(&wdt_timer, jiffies);
+               mod_timer(&ddata->timer, jiffies);
+
+       platform_set_drvdata(ofdev, ddata);
        return 0;
-err_unmap:
-       iounmap(wd_base);
-       wd_base = NULL;
-       return ret;
 }
 
 static int mpc8xxx_wdt_remove(struct platform_device *ofdev)
 {
+       struct mpc8xxx_wdt_ddata *ddata = platform_get_drvdata(ofdev);
+
        pr_crit("Watchdog removed, expect the %s soon!\n",
                reset ? "reset" : "machine check exception");
-       del_timer_sync(&wdt_timer);
-       watchdog_unregister_device(&mpc8xxx_wdt_dev);
-       iounmap(wd_base);
+       del_timer_sync(&ddata->timer);
+       watchdog_unregister_device(&ddata->wdd);
 
        return 0;
 }
@@ -253,31 +254,6 @@ static struct platform_driver mpc8xxx_wdt_driver = {
        },
 };
 
-/*
- * We do wdt initialization in two steps: arch_initcall probes the wdt
- * very early to start pinging the watchdog (misc devices are not yet
- * available), and later module_init() just registers the misc device.
- */
-static int mpc8xxx_wdt_init_late(void)
-{
-       int ret;
-
-       if (!wd_base)
-               return -ENODEV;
-
-       watchdog_set_nowayout(&mpc8xxx_wdt_dev, nowayout);
-
-       ret = watchdog_register_device(&mpc8xxx_wdt_dev);
-       if (ret) {
-               pr_err("cannot register watchdog device (err=%d)\n", ret);
-               return ret;
-       }
-       return 0;
-}
-#ifndef MODULE
-module_init(mpc8xxx_wdt_init_late);
-#endif
-
 static int __init mpc8xxx_wdt_init(void)
 {
        return platform_driver_register(&mpc8xxx_wdt_driver);
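The mpc8xxx rework above drops the wd_base/wdt_timer globals and the two-stage init in favour of a per-device mpc8xxx_wdt_ddata that embeds the watchdog_device, recovered in each callback with container_of(). Most other drivers in this range use the watchdog core's drvdata hooks for the same purpose; a sketch of that alternative (foo_wdt names and the feed write are hypothetical):

#include <linux/io.h>
#include <linux/watchdog.h>

struct foo_wdt {
	void __iomem *base;
	struct watchdog_device wdd;
};

/* At probe time the driver calls:
 *	watchdog_set_drvdata(&wdt->wdd, wdt);
 * so each watchdog_ops callback can get its state back: */
static int foo_wdt_ping(struct watchdog_device *wdd)
{
	struct foo_wdt *wdt = watchdog_get_drvdata(wdd);

	writel(0x1, wdt->base);		/* illustrative feed register write */
	return 0;
}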
index 938b987de551bdea7615a701007d0125f9a10d8b..6ad9df948711080ca3c87464d0195bdd3c0d9feb 100644 (file)
@@ -210,6 +210,14 @@ static int mtk_wdt_probe(struct platform_device *pdev)
        return 0;
 }
 
+static void mtk_wdt_shutdown(struct platform_device *pdev)
+{
+       struct mtk_wdt_dev *mtk_wdt = platform_get_drvdata(pdev);
+
+       if (watchdog_active(&mtk_wdt->wdt_dev))
+               mtk_wdt_stop(&mtk_wdt->wdt_dev);
+}
+
 static int mtk_wdt_remove(struct platform_device *pdev)
 {
        struct mtk_wdt_dev *mtk_wdt = platform_get_drvdata(pdev);
@@ -221,17 +229,48 @@ static int mtk_wdt_remove(struct platform_device *pdev)
        return 0;
 }
 
+#ifdef CONFIG_PM_SLEEP
+static int mtk_wdt_suspend(struct device *dev)
+{
+       struct mtk_wdt_dev *mtk_wdt = dev_get_drvdata(dev);
+
+       if (watchdog_active(&mtk_wdt->wdt_dev))
+               mtk_wdt_stop(&mtk_wdt->wdt_dev);
+
+       return 0;
+}
+
+static int mtk_wdt_resume(struct device *dev)
+{
+       struct mtk_wdt_dev *mtk_wdt = dev_get_drvdata(dev);
+
+       if (watchdog_active(&mtk_wdt->wdt_dev)) {
+               mtk_wdt_start(&mtk_wdt->wdt_dev);
+               mtk_wdt_ping(&mtk_wdt->wdt_dev);
+       }
+
+       return 0;
+}
+#endif
+
 static const struct of_device_id mtk_wdt_dt_ids[] = {
        { .compatible = "mediatek,mt6589-wdt" },
        { /* sentinel */ }
 };
 MODULE_DEVICE_TABLE(of, mtk_wdt_dt_ids);
 
+static const struct dev_pm_ops mtk_wdt_pm_ops = {
+       SET_SYSTEM_SLEEP_PM_OPS(mtk_wdt_suspend,
+                               mtk_wdt_resume)
+};
+
 static struct platform_driver mtk_wdt_driver = {
        .probe          = mtk_wdt_probe,
        .remove         = mtk_wdt_remove,
+       .shutdown       = mtk_wdt_shutdown,
        .driver         = {
                .name           = DRV_NAME,
+               .pm             = &mtk_wdt_pm_ops,
                .of_match_table = mtk_wdt_dt_ids,
        },
 };
index c028454be66ce9e682db1ecb16449427ca7d1900..bd917bb757b8251139efa7661fa1a164edc90412 100644 (file)
@@ -294,6 +294,8 @@ static const struct pci_device_id tco_pci_tbl[] = {
          PCI_ANY_ID, PCI_ANY_ID, },
        { PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP55_SMBUS,
          PCI_ANY_ID, PCI_ANY_ID, },
+       { PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP79_SMBUS,
+         PCI_ANY_ID, PCI_ANY_ID, },
        { 0, },                 /* End of list */
 };
 MODULE_DEVICE_TABLE(pci, tco_pci_tbl);
index de911c7e477c2875fe3633bce5a72a6b45fb95c0..d96bee017fd3caa2fbcde961cc3ba9941b34c341 100644 (file)
@@ -253,6 +253,7 @@ static int omap_wdt_probe(struct platform_device *pdev)
        wdev->wdog.ops = &omap_wdt_ops;
        wdev->wdog.min_timeout = TIMER_MARGIN_MIN;
        wdev->wdog.max_timeout = TIMER_MARGIN_MAX;
+       wdev->wdog.parent = &pdev->dev;
 
        if (watchdog_init_timeout(&wdev->wdog, timer_margin, &pdev->dev) < 0)
                wdev->wdog.timeout = TIMER_MARGIN_DEFAULT;
index ef0c628d503797d87b223eb2cbc81061290a10bc..c6b8f4a43bdeff2df7faa71f6356819f8129c326 100644 (file)
@@ -567,6 +567,7 @@ static int orion_wdt_probe(struct platform_device *pdev)
 
        dev->wdt.timeout = wdt_max_duration;
        dev->wdt.max_timeout = wdt_max_duration;
+       dev->wdt.parent = &pdev->dev;
        watchdog_init_timeout(&dev->wdt, heartbeat, &pdev->dev);
 
        platform_set_drvdata(pdev, &dev->wdt);
index b9c6049c3e78601151218508c67e54b363d69be5..4224b3ec83a5515dc76a57507ec81122dd9316e1 100644 (file)
@@ -167,6 +167,7 @@ static int pnx4008_wdt_probe(struct platform_device *pdev)
 
        pnx4008_wdd.bootstatus = (readl(WDTIM_RES(wdt_base)) & WDOG_RESET) ?
                        WDIOF_CARDRESET : 0;
+       pnx4008_wdd.parent = &pdev->dev;
        watchdog_set_nowayout(&pnx4008_wdd, nowayout);
 
        pnx4008_wdt_stop(&pnx4008_wdd); /* disable for now */
index aa03ca8f2d9b0a0f08f85cab9d928fdc0a594388..773dcfaee7b2fb7dd61db2d781add26856e727ac 100644 (file)
@@ -171,6 +171,7 @@ static int qcom_wdt_probe(struct platform_device *pdev)
        wdt->wdd.ops = &qcom_wdt_ops;
        wdt->wdd.min_timeout = 1;
        wdt->wdd.max_timeout = 0x10000000U / wdt->rate;
+       wdt->wdd.parent = &pdev->dev;
 
        /*
         * If 'timeout-sec' unspecified in devicetree, assume a 30 second
index b7c68e275aeb357503b3e10e48b2102444348b92..39cd51df2ffc76ac8a3d795c41fb9a859b4b33dc 100644 (file)
@@ -127,6 +127,7 @@ static int retu_wdt_probe(struct platform_device *pdev)
        retu_wdt->timeout       = RETU_WDT_MAX_TIMER;
        retu_wdt->min_timeout   = 0;
        retu_wdt->max_timeout   = RETU_WDT_MAX_TIMER;
+       retu_wdt->parent        = &pdev->dev;
 
        watchdog_set_drvdata(retu_wdt, wdev);
        watchdog_set_nowayout(retu_wdt, nowayout);
index a6f7e2e29bebbdb201952e5a14fa268e3cc4b1f1..1967919ae74330454a440925689f6de95fcdb65c 100644 (file)
@@ -161,6 +161,7 @@ static int rt288x_wdt_probe(struct platform_device *pdev)
        rt288x_wdt_dev.dev = &pdev->dev;
        rt288x_wdt_dev.bootstatus = rt288x_wdt_bootcause();
        rt288x_wdt_dev.max_timeout = (0xfffful / rt288x_wdt_freq);
+       rt288x_wdt_dev.parent = &pdev->dev;
 
        watchdog_init_timeout(&rt288x_wdt_dev, rt288x_wdt_dev.max_timeout,
                              &pdev->dev);
index e89ae027c91db4baa05a588a3c4aaac79737abe5..d781000c78250144ba42966e2f7c66be288930e0 100644 (file)
@@ -607,6 +607,7 @@ static int s3c2410wdt_probe(struct platform_device *pdev)
        watchdog_set_nowayout(&wdt->wdt_device, nowayout);
 
        wdt->wdt_device.bootstatus = s3c2410wdt_get_bootstatus(wdt);
+       wdt->wdt_device.parent = &pdev->dev;
 
        ret = watchdog_register_device(&wdt->wdt_device);
        if (ret) {
diff --git a/drivers/watchdog/sama5d4_wdt.c b/drivers/watchdog/sama5d4_wdt.c
new file mode 100644 (file)
index 0000000..a49634c
--- /dev/null
@@ -0,0 +1,280 @@
+/*
+ * Driver for Atmel SAMA5D4 Watchdog Timer
+ *
+ * Copyright (C) 2015 Atmel Corporation
+ *
+ * Licensed under GPLv2.
+ */
+
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_irq.h>
+#include <linux/platform_device.h>
+#include <linux/reboot.h>
+#include <linux/watchdog.h>
+
+#include "at91sam9_wdt.h"
+
+/* minimum and maximum watchdog timeout, in seconds */
+#define MIN_WDT_TIMEOUT                1
+#define MAX_WDT_TIMEOUT                16
+#define WDT_DEFAULT_TIMEOUT    MAX_WDT_TIMEOUT
+
+#define WDT_SEC2TICKS(s)       ((s) ? (((s) << 8) - 1) : 0)
+
+struct sama5d4_wdt {
+       struct watchdog_device  wdd;
+       void __iomem            *reg_base;
+       u32     config;
+};
+
+static int wdt_timeout = WDT_DEFAULT_TIMEOUT;
+static bool nowayout = WATCHDOG_NOWAYOUT;
+
+module_param(wdt_timeout, int, 0);
+MODULE_PARM_DESC(wdt_timeout,
+       "Watchdog timeout in seconds. (default = "
+       __MODULE_STRING(WDT_DEFAULT_TIMEOUT) ")");
+
+module_param(nowayout, bool, 0);
+MODULE_PARM_DESC(nowayout,
+       "Watchdog cannot be stopped once started (default="
+       __MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
+
+#define wdt_read(wdt, field) \
+       readl_relaxed((wdt)->reg_base + (field))
+
+#define wdt_write(wtd, field, val) \
+       writel_relaxed((val), (wdt)->reg_base + (field))
+
+static int sama5d4_wdt_start(struct watchdog_device *wdd)
+{
+       struct sama5d4_wdt *wdt = watchdog_get_drvdata(wdd);
+       u32 reg;
+
+       reg = wdt_read(wdt, AT91_WDT_MR);
+       reg &= ~AT91_WDT_WDDIS;
+       wdt_write(wdt, AT91_WDT_MR, reg);
+
+       return 0;
+}
+
+static int sama5d4_wdt_stop(struct watchdog_device *wdd)
+{
+       struct sama5d4_wdt *wdt = watchdog_get_drvdata(wdd);
+       u32 reg;
+
+       reg = wdt_read(wdt, AT91_WDT_MR);
+       reg |= AT91_WDT_WDDIS;
+       wdt_write(wdt, AT91_WDT_MR, reg);
+
+       return 0;
+}
+
+static int sama5d4_wdt_ping(struct watchdog_device *wdd)
+{
+       struct sama5d4_wdt *wdt = watchdog_get_drvdata(wdd);
+
+       wdt_write(wdt, AT91_WDT_CR, AT91_WDT_KEY | AT91_WDT_WDRSTT);
+
+       return 0;
+}
+
+static int sama5d4_wdt_set_timeout(struct watchdog_device *wdd,
+                                unsigned int timeout)
+{
+       struct sama5d4_wdt *wdt = watchdog_get_drvdata(wdd);
+       u32 value = WDT_SEC2TICKS(timeout);
+       u32 reg;
+
+       reg = wdt_read(wdt, AT91_WDT_MR);
+       reg &= ~AT91_WDT_WDV;
+       reg &= ~AT91_WDT_WDD;
+       reg |= AT91_WDT_SET_WDV(value);
+       reg |= AT91_WDT_SET_WDD(value);
+       wdt_write(wdt, AT91_WDT_MR, reg);
+
+       wdd->timeout = timeout;
+
+       return 0;
+}
+
+static const struct watchdog_info sama5d4_wdt_info = {
+       .options = WDIOF_SETTIMEOUT | WDIOF_MAGICCLOSE | WDIOF_KEEPALIVEPING,
+       .identity = "Atmel SAMA5D4 Watchdog",
+};
+
+static struct watchdog_ops sama5d4_wdt_ops = {
+       .owner = THIS_MODULE,
+       .start = sama5d4_wdt_start,
+       .stop = sama5d4_wdt_stop,
+       .ping = sama5d4_wdt_ping,
+       .set_timeout = sama5d4_wdt_set_timeout,
+};
+
+static irqreturn_t sama5d4_wdt_irq_handler(int irq, void *dev_id)
+{
+       struct sama5d4_wdt *wdt = platform_get_drvdata(dev_id);
+
+       if (wdt_read(wdt, AT91_WDT_SR)) {
+               pr_crit("Atmel Watchdog Software Reset\n");
+               emergency_restart();
+               pr_crit("Reboot didn't succeed\n");
+       }
+
+       return IRQ_HANDLED;
+}
+
+static int of_sama5d4_wdt_init(struct device_node *np, struct sama5d4_wdt *wdt)
+{
+       const char *tmp;
+
+       wdt->config = AT91_WDT_WDDIS;
+
+       if (!of_property_read_string(np, "atmel,watchdog-type", &tmp) &&
+           !strcmp(tmp, "software"))
+               wdt->config |= AT91_WDT_WDFIEN;
+       else
+               wdt->config |= AT91_WDT_WDRSTEN;
+
+       if (of_property_read_bool(np, "atmel,idle-halt"))
+               wdt->config |= AT91_WDT_WDIDLEHLT;
+
+       if (of_property_read_bool(np, "atmel,dbg-halt"))
+               wdt->config |= AT91_WDT_WDDBGHLT;
+
+       return 0;
+}
+
+static int sama5d4_wdt_init(struct sama5d4_wdt *wdt)
+{
+       struct watchdog_device *wdd = &wdt->wdd;
+       u32 value = WDT_SEC2TICKS(wdd->timeout);
+       u32 reg;
+
+       /*
+        * Because the fields WDV and WDD must not be modified when the WDDIS
+        * bit is set, so clear the WDDIS bit before writing the WDT_MR.
+        */
+       reg = wdt_read(wdt, AT91_WDT_MR);
+       reg &= ~AT91_WDT_WDDIS;
+       wdt_write(wdt, AT91_WDT_MR, reg);
+
+       reg = wdt->config;
+       reg |= AT91_WDT_SET_WDD(value);
+       reg |= AT91_WDT_SET_WDV(value);
+
+       wdt_write(wdt, AT91_WDT_MR, reg);
+
+       return 0;
+}
+
+static int sama5d4_wdt_probe(struct platform_device *pdev)
+{
+       struct watchdog_device *wdd;
+       struct sama5d4_wdt *wdt;
+       struct resource *res;
+       void __iomem *regs;
+       u32 irq = 0;
+       int ret;
+
+       wdt = devm_kzalloc(&pdev->dev, sizeof(*wdt), GFP_KERNEL);
+       if (!wdt)
+               return -ENOMEM;
+
+       wdd = &wdt->wdd;
+       wdd->timeout = wdt_timeout;
+       wdd->info = &sama5d4_wdt_info;
+       wdd->ops = &sama5d4_wdt_ops;
+       wdd->min_timeout = MIN_WDT_TIMEOUT;
+       wdd->max_timeout = MAX_WDT_TIMEOUT;
+
+       watchdog_set_drvdata(wdd, wdt);
+
+       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       regs = devm_ioremap_resource(&pdev->dev, res);
+       if (IS_ERR(regs))
+               return PTR_ERR(regs);
+
+       wdt->reg_base = regs;
+
+       if (pdev->dev.of_node) {
+               irq = irq_of_parse_and_map(pdev->dev.of_node, 0);
+               if (!irq)
+                       dev_warn(&pdev->dev, "failed to get IRQ from DT\n");
+
+               ret = of_sama5d4_wdt_init(pdev->dev.of_node, wdt);
+               if (ret)
+                       return ret;
+       }
+
+       if ((wdt->config & AT91_WDT_WDFIEN) && irq) {
+               ret = devm_request_irq(&pdev->dev, irq, sama5d4_wdt_irq_handler,
+                                      IRQF_SHARED | IRQF_IRQPOLL |
+                                      IRQF_NO_SUSPEND, pdev->name, pdev);
+               if (ret) {
+                       dev_err(&pdev->dev,
+                               "cannot register interrupt handler\n");
+                       return ret;
+               }
+       }
+
+       ret = watchdog_init_timeout(wdd, wdt_timeout, &pdev->dev);
+       if (ret) {
+               dev_err(&pdev->dev, "unable to set timeout value\n");
+               return ret;
+       }
+
+       ret = sama5d4_wdt_init(wdt);
+       if (ret)
+               return ret;
+
+       watchdog_set_nowayout(wdd, nowayout);
+
+       ret = watchdog_register_device(wdd);
+       if (ret) {
+               dev_err(&pdev->dev, "failed to register watchdog device\n");
+               return ret;
+       }
+
+       platform_set_drvdata(pdev, wdt);
+
+       dev_info(&pdev->dev, "initialized (timeout = %d sec, nowayout = %d)\n",
+                wdt_timeout, nowayout);
+
+       return 0;
+}
+
+static int sama5d4_wdt_remove(struct platform_device *pdev)
+{
+       struct sama5d4_wdt *wdt = platform_get_drvdata(pdev);
+
+       sama5d4_wdt_stop(&wdt->wdd);
+
+       watchdog_unregister_device(&wdt->wdd);
+
+       return 0;
+}
+
+static const struct of_device_id sama5d4_wdt_of_match[] = {
+       { .compatible = "atmel,sama5d4-wdt", },
+       { }
+};
+MODULE_DEVICE_TABLE(of, sama5d4_wdt_of_match);
+
+static struct platform_driver sama5d4_wdt_driver = {
+       .probe          = sama5d4_wdt_probe,
+       .remove         = sama5d4_wdt_remove,
+       .driver         = {
+               .name   = "sama5d4_wdt",
+               .of_match_table = sama5d4_wdt_of_match,
+       }
+};
+module_platform_driver(sama5d4_wdt_driver);
+
+MODULE_AUTHOR("Atmel Corporation");
+MODULE_DESCRIPTION("Atmel SAMA5D4 Watchdog Timer driver");
+MODULE_LICENSE("GPL v2");
index 567458b137a67874be0195aeda205208ba03cd67..f90812170657988b2089093765967b0dd5958965 100644 (file)
@@ -252,6 +252,7 @@ static int sh_wdt_probe(struct platform_device *pdev)
 
        watchdog_set_nowayout(&sh_wdt_dev, nowayout);
        watchdog_set_drvdata(&sh_wdt_dev, wdt);
+       sh_wdt_dev.parent = &pdev->dev;
 
        spin_lock_init(&wdt->lock);
 
index 42fa5c0c518ab39c3f2a4ca7405256e2771c3a5e..d0578ab2e636dcfbcec090541507d30cedbb4e22 100644 (file)
@@ -154,6 +154,7 @@ static int sirfsoc_wdt_probe(struct platform_device *pdev)
 
        watchdog_init_timeout(&sirfsoc_wdd, timeout, &pdev->dev);
        watchdog_set_nowayout(&sirfsoc_wdd, nowayout);
+       sirfsoc_wdd.parent = &pdev->dev;
 
        ret = watchdog_register_device(&sirfsoc_wdd);
        if (ret)
index 4e7fec36f5c36d55edd220469cac43c2d56c4840..01d816251302c2491c24a70e8f7c542b61c2f15a 100644 (file)
@@ -226,6 +226,7 @@ sp805_wdt_probe(struct amba_device *adev, const struct amba_id *id)
        wdt->adev = adev;
        wdt->wdd.info = &wdt_info;
        wdt->wdd.ops = &wdt_ops;
+       wdt->wdd.parent = &adev->dev;
 
        spin_lock_init(&wdt->lock);
        watchdog_set_nowayout(&wdt->wdd, nowayout);
index 6785afdc0fcaabad1694f3e0c24b316ddb29c1bf..14e9badf2bfa37b9c8fa0d3d923ecaa188b987b8 100644 (file)
@@ -241,6 +241,7 @@ static int st_wdog_probe(struct platform_device *pdev)
                return -EINVAL;
        }
        st_wdog_dev.max_timeout = 0xFFFFFFFF / st_wdog->clkrate;
+       st_wdog_dev.parent = &pdev->dev;
 
        ret = clk_prepare_enable(clk);
        if (ret) {
index e7f0d5b60d3d4febb20759ea893dc0432d4963ee..3ee6128a540e9896248ebd5cdad6bc3065170831 100644 (file)
@@ -76,6 +76,7 @@ static int stmp3xxx_wdt_probe(struct platform_device *pdev)
        watchdog_set_drvdata(&stmp3xxx_wdd, &pdev->dev);
 
        stmp3xxx_wdd.timeout = clamp_t(unsigned, heartbeat, 1, STMP3XXX_MAX_TIMEOUT);
+       stmp3xxx_wdd.parent = &pdev->dev;
 
        ret = watchdog_register_device(&stmp3xxx_wdd);
        if (ret < 0) {
index a29afb37c48ca865f5cc6815e7a038956a519dc7..47bd8a14d01f5a3fe5cb6e3a5bcc30000e37cadf 100644 (file)
@@ -184,7 +184,7 @@ static int sunxi_wdt_start(struct watchdog_device *wdt_dev)
        /* Set system reset function */
        reg = readl(wdt_base + regs->wdt_cfg);
        reg &= ~(regs->wdt_reset_mask);
-       reg |= ~(regs->wdt_reset_val);
+       reg |= regs->wdt_reset_val;
        writel(reg, wdt_base + regs->wdt_cfg);
 
        /* Enable watchdog */
index 30451ea4690237e36d46daff376404e4a7c11017..7f97cdd53f29624f6c732b0e44c0448a856c8a5c 100644 (file)
@@ -218,6 +218,7 @@ static int tegra_wdt_probe(struct platform_device *pdev)
        wdd->ops = &tegra_wdt_ops;
        wdd->min_timeout = MIN_WDT_TIMEOUT;
        wdd->max_timeout = MAX_WDT_TIMEOUT;
+       wdd->parent = &pdev->dev;
 
        watchdog_set_drvdata(wdd, wdt);
 
index 2c1db6fa9a2724ae906f8682dd29ea176a752daf..9bf3cc0f396106c730383ab561f4457876c6af85 100644 (file)
@@ -83,6 +83,7 @@ static int twl4030_wdt_probe(struct platform_device *pdev)
        wdt->timeout            = 30;
        wdt->min_timeout        = 1;
        wdt->max_timeout        = 30;
+       wdt->parent = &pdev->dev;
 
        watchdog_set_nowayout(wdt, nowayout);
        platform_set_drvdata(pdev, wdt);
index 7f615933d31a169fc4da4e49bd3f8d8612b01d50..c2da880292bc2f326b71678634de5d2968bd38c8 100644 (file)
@@ -131,6 +131,7 @@ static int __init txx9wdt_probe(struct platform_device *dev)
        txx9wdt.timeout = timeout;
        txx9wdt.min_timeout = 1;
        txx9wdt.max_timeout = WD_MAX_TIMEOUT;
+       txx9wdt.parent = &dev->dev;
        watchdog_set_nowayout(&txx9wdt, nowayout);
 
        ret = watchdog_register_device(&txx9wdt);
index 9de09ab008380a66bf6b10d2d1ebe5a9758fdf79..37c084353cce238f4e694a1be1520727530503c7 100644 (file)
@@ -96,6 +96,7 @@ static int ux500_wdt_probe(struct platform_device *pdev)
                        ux500_wdt.max_timeout = WATCHDOG_MAX28;
        }
 
+       ux500_wdt.parent = &pdev->dev;
        watchdog_set_nowayout(&ux500_wdt, nowayout);
 
        /* disable auto off on sleep */
index 56369c4f1961d0c2001094f680a6c7ed81c54ca2..5f9cbc37520d2e4fd029ba02e594e9ac024c7e6f 100644 (file)
@@ -206,6 +206,7 @@ static int wdt_probe(struct pci_dev *pdev,
                timeout = WDT_TIMEOUT;
 
        wdt_dev.timeout = timeout;
+       wdt_dev.parent = &pdev->dev;
        watchdog_set_nowayout(&wdt_dev, nowayout);
        if (readl(wdt_mem) & VIA_WDT_FIRED)
                wdt_dev.bootstatus |= WDIOF_CARDRESET;
index 2fa17e746ff6f43dfff2042d5306934c4eab4cf8..8d1184aee932e064240eba56195b812e32229f6e 100644 (file)
@@ -215,6 +215,7 @@ static int wm831x_wdt_probe(struct platform_device *pdev)
 
        wm831x_wdt->info = &wm831x_wdt_info;
        wm831x_wdt->ops = &wm831x_wdt_ops;
+       wm831x_wdt->parent = &pdev->dev;
        watchdog_set_nowayout(wm831x_wdt, nowayout);
        watchdog_set_drvdata(wm831x_wdt, driver_data);
 
index 34d272ada23d5cb4771eeed1b3afc6471117369e..4ab4b8347d459b5745da86252c4e27bdd1cb4e8e 100644 (file)
@@ -151,6 +151,7 @@ static int wm8350_wdt_probe(struct platform_device *pdev)
 
        watchdog_set_nowayout(&wm8350_wdt, nowayout);
        watchdog_set_drvdata(&wm8350_wdt, wm8350);
+       wm8350_wdt.parent = &pdev->dev;
 
        /* Default to 4s timeout */
        wm8350_wdt_set_timeout(&wm8350_wdt, 4);
index 7cd226da15fea2d6985b6203d696a6fe8e0c400f..73708acce3ca78dfeb2da5fa78d6655e8f6554d4 100644 (file)
@@ -280,4 +280,15 @@ config XEN_ACPI
        def_bool y
        depends on X86 && ACPI
 
+config XEN_SYMS
+       bool "Xen symbols"
+       depends on X86 && XEN_DOM0 && XENFS
+       default y if KALLSYMS
+       help
+          Exports hypervisor symbols (along with their types and addresses) via
+          /proc/xen/xensyms file, similar to /proc/kallsyms
+
+config XEN_HAVE_VPMU
+       bool
+
 endmenu
index bf4a23c7c5918f6849e764a8376c3608cc591933..c79329fcfa78c722ba00a14f147aab1df2989a1c 100644 (file)
@@ -441,7 +441,7 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp)
        /* Update direct mapping, invalidate P2M, and add to balloon. */
        for (i = 0; i < nr_pages; i++) {
                pfn = frame_list[i];
-               frame_list[i] = pfn_to_mfn(pfn);
+               frame_list[i] = pfn_to_gfn(pfn);
                page = pfn_to_page(pfn);
 
 #ifdef CONFIG_XEN_HAVE_PVMMU
@@ -638,9 +638,9 @@ static int __init balloon_init(void)
         * regions (see arch/x86/xen/setup.c).
         */
        for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++)
-               if (xen_extra_mem[i].size)
-                       balloon_add_region(PFN_UP(xen_extra_mem[i].start),
-                                          PFN_DOWN(xen_extra_mem[i].size));
+               if (xen_extra_mem[i].n_pfns)
+                       balloon_add_region(xen_extra_mem[i].start_pfn,
+                                          xen_extra_mem[i].n_pfns);
 
        return 0;
 }
index 0edb91c0de6bf2b69d6de3ec41d43d38b5d823b5..8ae2fc90e1ea88a27325927e108b777e226c6f13 100644 (file)
@@ -6,10 +6,10 @@
 bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
                               const struct bio_vec *vec2)
 {
-       unsigned long mfn1 = pfn_to_mfn(page_to_pfn(vec1->bv_page));
-       unsigned long mfn2 = pfn_to_mfn(page_to_pfn(vec2->bv_page));
+       unsigned long bfn1 = pfn_to_bfn(page_to_pfn(vec1->bv_page));
+       unsigned long bfn2 = pfn_to_bfn(page_to_pfn(vec2->bv_page));
 
        return __BIOVEC_PHYS_MERGEABLE(vec1, vec2) &&
-               ((mfn1 == mfn2) || ((mfn1+1) == mfn2));
+               ((bfn1 == bfn2) || ((bfn1+1) == bfn2));
 }
 EXPORT_SYMBOL(xen_biovec_phys_mergeable);
index ed8bf1067a97aa8965003d3db701fcfa330ccecb..6cd5e65c4aff0824153cf0a2e298e85785cdcef7 100644 (file)
@@ -1301,11 +1301,7 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
        if (!VALID_EVTCHN(evtchn))
                return -1;
 
-       /*
-        * Events delivered via platform PCI interrupts are always
-        * routed to vcpu 0 and hence cannot be rebound.
-        */
-       if (xen_hvm_domain() && !xen_have_vector_callback)
+       if (!xen_support_evtchn_rebind())
                return -1;
 
        /* Send future instances of this interrupt to other vcpu. */
@@ -1692,7 +1688,7 @@ void __init xen_init_IRQ(void)
                struct physdev_pirq_eoi_gmfn eoi_gmfn;
 
                pirq_eoi_map = (void *)__get_free_page(GFP_KERNEL|__GFP_ZERO);
-               eoi_gmfn.gmfn = virt_to_mfn(pirq_eoi_map);
+               eoi_gmfn.gmfn = virt_to_gfn(pirq_eoi_map);
                rc = HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn_v2, &eoi_gmfn);
                /* TODO: No PVH support for PIRQ EOI */
                if (rc != 0) {
index ed673e1acd6159a3ca34dc10238fef8936e43249..1d4baf56c36bfc5abefb4d0701c0eb92553b7a68 100644 (file)
@@ -111,7 +111,7 @@ static int init_control_block(int cpu,
        for (i = 0; i < EVTCHN_FIFO_MAX_QUEUES; i++)
                q->head[i] = 0;
 
-       init_control.control_gfn = virt_to_mfn(control_block);
+       init_control.control_gfn = virt_to_gfn(control_block);
        init_control.offset      = 0;
        init_control.vcpu        = cpu;
 
@@ -167,7 +167,7 @@ static int evtchn_fifo_setup(struct irq_info *info)
                /* Mask all events in this page before adding it. */
                init_array_page(array_page);
 
-               expand_array.array_gfn = virt_to_mfn(array_page);
+               expand_array.array_gfn = virt_to_gfn(array_page);
 
                ret = HYPERVISOR_event_channel_op(EVTCHNOP_expand_array, &expand_array);
                if (ret < 0)
index e53fe191738cfe8ce7c7cfe6202c2651258ff952..4547a91bca67a1005c95c478aef929d121655d00 100644 (file)
@@ -142,7 +142,8 @@ static int add_grefs(struct ioctl_gntalloc_alloc_gref *op,
 
                /* Grant foreign access to the page. */
                rc = gnttab_grant_foreign_access(op->domid,
-                       pfn_to_mfn(page_to_pfn(gref->page)), readonly);
+                                                xen_page_to_gfn(gref->page),
+                                                readonly);
                if (rc < 0)
                        goto undo;
                gref_ids[i] = gref->gref_id = rc;
@@ -493,7 +494,7 @@ static void gntalloc_vma_close(struct vm_area_struct *vma)
        mutex_unlock(&gref_mutex);
 }
 
-static struct vm_operations_struct gntalloc_vmops = {
+static const struct vm_operations_struct gntalloc_vmops = {
        .open = gntalloc_vma_open,
        .close = gntalloc_vma_close,
 };
index 0dbb222daaf1c694b1f073f3e206f755f5f77cc6..2ea0b3b2a91d2585a2d37f8ead07f08f32c79826 100644 (file)
@@ -433,7 +433,7 @@ static struct page *gntdev_vma_find_special_page(struct vm_area_struct *vma,
        return map->pages[(addr - map->pages_vm_start) >> PAGE_SHIFT];
 }
 
-static struct vm_operations_struct gntdev_vmops = {
+static const struct vm_operations_struct gntdev_vmops = {
        .open = gntdev_vma_open,
        .close = gntdev_vma_close,
        .find_special_page = gntdev_vma_find_special_page,
index d10effee9b9eb16d46a0bfea9b108b5f22863be3..e12bd3635f832e7fa5330667fe77f32fe305db4b 100644 (file)
@@ -80,7 +80,7 @@ static int xen_suspend(void *data)
         * is resuming in a new domain.
         */
        si->cancelled = HYPERVISOR_suspend(xen_pv_domain()
-                                           ? virt_to_mfn(xen_start_info)
+                                           ? virt_to_gfn(xen_start_info)
                                            : 0);
 
        xen_arch_post_suspend(si->cancelled);
index 5a296161d843baa5eda5282e15b198e6a6f69feb..5e9adac928e694d6701b5c59ef5300144226a0ab 100644 (file)
@@ -193,16 +193,16 @@ static int traverse_pages_block(unsigned nelem, size_t size,
        return ret;
 }
 
-struct mmap_mfn_state {
+struct mmap_gfn_state {
        unsigned long va;
        struct vm_area_struct *vma;
        domid_t domain;
 };
 
-static int mmap_mfn_range(void *data, void *state)
+static int mmap_gfn_range(void *data, void *state)
 {
        struct privcmd_mmap_entry *msg = data;
-       struct mmap_mfn_state *st = state;
+       struct mmap_gfn_state *st = state;
        struct vm_area_struct *vma = st->vma;
        int rc;
 
@@ -216,7 +216,7 @@ static int mmap_mfn_range(void *data, void *state)
            ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end))
                return -EINVAL;
 
-       rc = xen_remap_domain_mfn_range(vma,
+       rc = xen_remap_domain_gfn_range(vma,
                                        msg->va & PAGE_MASK,
                                        msg->mfn, msg->npages,
                                        vma->vm_page_prot,
@@ -236,7 +236,7 @@ static long privcmd_ioctl_mmap(void __user *udata)
        struct vm_area_struct *vma;
        int rc;
        LIST_HEAD(pagelist);
-       struct mmap_mfn_state state;
+       struct mmap_gfn_state state;
 
        /* We only support privcmd_ioctl_mmap_batch for auto translated. */
        if (xen_feature(XENFEAT_auto_translated_physmap))
@@ -273,7 +273,7 @@ static long privcmd_ioctl_mmap(void __user *udata)
 
        rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry),
                            &pagelist,
-                           mmap_mfn_range, &state);
+                           mmap_gfn_range, &state);
 
 
 out_up:
@@ -299,18 +299,18 @@ struct mmap_batch_state {
        int global_error;
        int version;
 
-       /* User-space mfn array to store errors in the second pass for V1. */
-       xen_pfn_t __user *user_mfn;
+       /* User-space gfn array to store errors in the second pass for V1. */
+       xen_pfn_t __user *user_gfn;
        /* User-space int array to store errors in the second pass for V2. */
        int __user *user_err;
 };
 
-/* auto translated dom0 note: if domU being created is PV, then mfn is
- * mfn(addr on bus). If it's auto xlated, then mfn is pfn (input to HAP).
+/* auto translated dom0 note: if domU being created is PV, then gfn is
+ * mfn(addr on bus). If it's auto xlated, then gfn is pfn (input to HAP).
  */
 static int mmap_batch_fn(void *data, int nr, void *state)
 {
-       xen_pfn_t *mfnp = data;
+       xen_pfn_t *gfnp = data;
        struct mmap_batch_state *st = state;
        struct vm_area_struct *vma = st->vma;
        struct page **pages = vma->vm_private_data;
@@ -321,8 +321,8 @@ static int mmap_batch_fn(void *data, int nr, void *state)
                cur_pages = &pages[st->index];
 
        BUG_ON(nr < 0);
-       ret = xen_remap_domain_mfn_array(st->vma, st->va & PAGE_MASK, mfnp, nr,
-                                        (int *)mfnp, st->vma->vm_page_prot,
+       ret = xen_remap_domain_gfn_array(st->vma, st->va & PAGE_MASK, gfnp, nr,
+                                        (int *)gfnp, st->vma->vm_page_prot,
                                         st->domain, cur_pages);
 
        /* Adjust the global_error? */
@@ -347,22 +347,22 @@ static int mmap_return_error(int err, struct mmap_batch_state *st)
 
        if (st->version == 1) {
                if (err) {
-                       xen_pfn_t mfn;
+                       xen_pfn_t gfn;
 
-                       ret = get_user(mfn, st->user_mfn);
+                       ret = get_user(gfn, st->user_gfn);
                        if (ret < 0)
                                return ret;
                        /*
                         * V1 encodes the error codes in the 32bit top
-                        * nibble of the mfn (with its known
+                        * nibble of the gfn (with its known
                         * limitations vis-a-vis 64 bit callers).
                         */
-                       mfn |= (err == -ENOENT) ?
+                       gfn |= (err == -ENOENT) ?
                                PRIVCMD_MMAPBATCH_PAGED_ERROR :
                                PRIVCMD_MMAPBATCH_MFN_ERROR;
-                       return __put_user(mfn, st->user_mfn++);
+                       return __put_user(gfn, st->user_gfn++);
                } else
-                       st->user_mfn++;
+                       st->user_gfn++;
        } else { /* st->version == 2 */
                if (err)
                        return __put_user(err, st->user_err++);
@@ -388,7 +388,7 @@ static int mmap_return_errors(void *data, int nr, void *state)
        return 0;
 }
 
-/* Allocate pfns that are then mapped with gmfns from foreign domid. Update
+/* Allocate pfns that are then mapped with gfns from foreign domid. Update
  * the vma with the page info to use later.
  * Returns: 0 if success, otherwise -errno
  */
@@ -414,7 +414,7 @@ static int alloc_empty_pages(struct vm_area_struct *vma, int numpgs)
        return 0;
 }
 
-static struct vm_operations_struct privcmd_vm_ops;
+static const struct vm_operations_struct privcmd_vm_ops;
 
 static long privcmd_ioctl_mmap_batch(void __user *udata, int version)
 {
@@ -526,7 +526,7 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version)
 
        if (state.global_error) {
                /* Write back errors in second pass. */
-               state.user_mfn = (xen_pfn_t *)m.arr;
+               state.user_gfn = (xen_pfn_t *)m.arr;
                state.user_err = m.err;
                ret = traverse_pages_block(m.num, sizeof(xen_pfn_t),
                                           &pagelist, mmap_return_errors, &state);
@@ -587,7 +587,7 @@ static void privcmd_close(struct vm_area_struct *vma)
        if (!xen_feature(XENFEAT_auto_translated_physmap) || !numpgs || !pages)
                return;
 
-       rc = xen_unmap_domain_mfn_range(vma, numpgs, pages);
+       rc = xen_unmap_domain_gfn_range(vma, numpgs, pages);
        if (rc == 0)
                free_xenballooned_pages(numpgs, pages);
        else
@@ -605,7 +605,7 @@ static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        return VM_FAULT_SIGBUS;
 }
 
-static struct vm_operations_struct privcmd_vm_ops = {
+static const struct vm_operations_struct privcmd_vm_ops = {
        .close = privcmd_close,
        .fault = privcmd_fault
 };
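The mmap_return_error() hunk above still folds per-frame failures into the top nibble of each V1 gfn entry. Below is a minimal user-space sketch of how a V1 IOCTL_PRIVCMD_MMAPBATCH caller could decode those entries once the ioctl returns; the two error masks are copied from include/uapi/xen/privcmd.h, while the array/count names and sample values are illustrative only and not part of this commit.

#include <stdio.h>
#include <stdint.h>

/* Copied from include/uapi/xen/privcmd.h for illustration. */
#define PRIVCMD_MMAPBATCH_MFN_ERROR     0xf0000000U
#define PRIVCMD_MMAPBATCH_PAGED_ERROR   0x80000000U

/* Inspect the gfn array written back by a V1 PRIVCMD_MMAPBATCH call. */
static void report_v1_errors(const uint64_t *gfn_arr, int nr)
{
        for (int i = 0; i < nr; i++) {
                uint32_t err = (uint32_t)gfn_arr[i] & PRIVCMD_MMAPBATCH_MFN_ERROR;

                if (err == PRIVCMD_MMAPBATCH_PAGED_ERROR)
                        printf("frame %d: backing page was paged out, retry\n", i);
                else if (err == PRIVCMD_MMAPBATCH_MFN_ERROR)
                        printf("frame %d: could not be mapped\n", i);
        }
}

int main(void)
{
        /* Example values only: one clean entry, one paged-out, one hard error. */
        uint64_t arr[] = { 0x1234, 0x80005678, 0xf0009abc };

        report_v1_errors(arr, 3);
        return 0;
}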
index 4c549323c605d97591224050b00add55e3a833a0..79bc4933b13e05c220325bb3ad39f7f6c0e243dd 100644 (file)
@@ -82,8 +82,8 @@ static u64 start_dma_addr;
  */
 static inline dma_addr_t xen_phys_to_bus(phys_addr_t paddr)
 {
-       unsigned long mfn = pfn_to_mfn(PFN_DOWN(paddr));
-       dma_addr_t dma = (dma_addr_t)mfn << PAGE_SHIFT;
+       unsigned long bfn = pfn_to_bfn(PFN_DOWN(paddr));
+       dma_addr_t dma = (dma_addr_t)bfn << PAGE_SHIFT;
 
        dma |= paddr & ~PAGE_MASK;
 
@@ -92,7 +92,7 @@ static inline dma_addr_t xen_phys_to_bus(phys_addr_t paddr)
 
 static inline phys_addr_t xen_bus_to_phys(dma_addr_t baddr)
 {
-       unsigned long pfn = mfn_to_pfn(PFN_DOWN(baddr));
+       unsigned long pfn = bfn_to_pfn(PFN_DOWN(baddr));
        dma_addr_t dma = (dma_addr_t)pfn << PAGE_SHIFT;
        phys_addr_t paddr = dma;
 
@@ -110,15 +110,15 @@ static int check_pages_physically_contiguous(unsigned long pfn,
                                             unsigned int offset,
                                             size_t length)
 {
-       unsigned long next_mfn;
+       unsigned long next_bfn;
        int i;
        int nr_pages;
 
-       next_mfn = pfn_to_mfn(pfn);
+       next_bfn = pfn_to_bfn(pfn);
        nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT;
 
        for (i = 1; i < nr_pages; i++) {
-               if (pfn_to_mfn(++pfn) != ++next_mfn)
+               if (pfn_to_bfn(++pfn) != ++next_bfn)
                        return 0;
        }
        return 1;
@@ -138,8 +138,8 @@ static inline int range_straddles_page_boundary(phys_addr_t p, size_t size)
 
 static int is_xen_swiotlb_buffer(dma_addr_t dma_addr)
 {
-       unsigned long mfn = PFN_DOWN(dma_addr);
-       unsigned long pfn = mfn_to_local_pfn(mfn);
+       unsigned long bfn = PFN_DOWN(dma_addr);
+       unsigned long pfn = bfn_to_local_pfn(bfn);
        phys_addr_t paddr;
 
        /* If the address is outside our domain, it CAN
@@ -311,9 +311,6 @@ xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
        */
        flags &= ~(__GFP_DMA | __GFP_HIGHMEM);
 
-       if (dma_alloc_from_coherent(hwdev, size, dma_handle, &ret))
-               return ret;
-
        /* On ARM this function returns an ioremap'ped virtual address for
         * which virt_to_phys doesn't return the corresponding physical
         * address. In fact on ARM virt_to_phys only works for kernel direct
@@ -356,9 +353,6 @@ xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr,
        phys_addr_t phys;
        u64 dma_mask = DMA_BIT_MASK(32);
 
-       if (dma_release_from_coherent(hwdev, order, vaddr))
-               return;
-
        if (hwdev && hwdev->coherent_dma_mask)
                dma_mask = hwdev->coherent_dma_mask;
 
index 96453f8a85c5543f506ecc128f22e91b3d2dbdbb..b5a7342e0ba5288ddf9e4e3f4fcd4fbcced9df9e 100644 (file)
@@ -20,6 +20,9 @@
 #include <xen/xenbus.h>
 #include <xen/interface/xen.h>
 #include <xen/interface/version.h>
+#ifdef CONFIG_XEN_HAVE_VPMU
+#include <xen/interface/xenpmu.h>
+#endif
 
 #define HYPERVISOR_ATTR_RO(_name) \
 static struct hyp_sysfs_attr  _name##_attr = __ATTR_RO(_name)
@@ -368,6 +371,126 @@ static void xen_properties_destroy(void)
        sysfs_remove_group(hypervisor_kobj, &xen_properties_group);
 }
 
+#ifdef CONFIG_XEN_HAVE_VPMU
+struct pmu_mode {
+       const char *name;
+       uint32_t mode;
+};
+
+static struct pmu_mode pmu_modes[] = {
+       {"off", XENPMU_MODE_OFF},
+       {"self", XENPMU_MODE_SELF},
+       {"hv", XENPMU_MODE_HV},
+       {"all", XENPMU_MODE_ALL}
+};
+
+static ssize_t pmu_mode_store(struct hyp_sysfs_attr *attr,
+                             const char *buffer, size_t len)
+{
+       int ret;
+       struct xen_pmu_params xp;
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(pmu_modes); i++) {
+               if (strncmp(buffer, pmu_modes[i].name, len - 1) == 0) {
+                       xp.val = pmu_modes[i].mode;
+                       break;
+               }
+       }
+
+       if (i == ARRAY_SIZE(pmu_modes))
+               return -EINVAL;
+
+       xp.version.maj = XENPMU_VER_MAJ;
+       xp.version.min = XENPMU_VER_MIN;
+       ret = HYPERVISOR_xenpmu_op(XENPMU_mode_set, &xp);
+       if (ret)
+               return ret;
+
+       return len;
+}
+
+static ssize_t pmu_mode_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+       int ret;
+       struct xen_pmu_params xp;
+       int i;
+       uint32_t mode;
+
+       xp.version.maj = XENPMU_VER_MAJ;
+       xp.version.min = XENPMU_VER_MIN;
+       ret = HYPERVISOR_xenpmu_op(XENPMU_mode_get, &xp);
+       if (ret)
+               return ret;
+
+       mode = (uint32_t)xp.val;
+       for (i = 0; i < ARRAY_SIZE(pmu_modes); i++) {
+               if (mode == pmu_modes[i].mode)
+                       return sprintf(buffer, "%s\n", pmu_modes[i].name);
+       }
+
+       return -EINVAL;
+}
+HYPERVISOR_ATTR_RW(pmu_mode);
+
+static ssize_t pmu_features_store(struct hyp_sysfs_attr *attr,
+                                 const char *buffer, size_t len)
+{
+       int ret;
+       uint32_t features;
+       struct xen_pmu_params xp;
+
+       ret = kstrtou32(buffer, 0, &features);
+       if (ret)
+               return ret;
+
+       xp.val = features;
+       xp.version.maj = XENPMU_VER_MAJ;
+       xp.version.min = XENPMU_VER_MIN;
+       ret = HYPERVISOR_xenpmu_op(XENPMU_feature_set, &xp);
+       if (ret)
+               return ret;
+
+       return len;
+}
+
+static ssize_t pmu_features_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+       int ret;
+       struct xen_pmu_params xp;
+
+       xp.version.maj = XENPMU_VER_MAJ;
+       xp.version.min = XENPMU_VER_MIN;
+       ret = HYPERVISOR_xenpmu_op(XENPMU_feature_get, &xp);
+       if (ret)
+               return ret;
+
+       return sprintf(buffer, "0x%x\n", (uint32_t)xp.val);
+}
+HYPERVISOR_ATTR_RW(pmu_features);
+
+static struct attribute *xen_pmu_attrs[] = {
+       &pmu_mode_attr.attr,
+       &pmu_features_attr.attr,
+       NULL
+};
+
+static const struct attribute_group xen_pmu_group = {
+       .name = "pmu",
+       .attrs = xen_pmu_attrs,
+};
+
+static int __init xen_pmu_init(void)
+{
+       return sysfs_create_group(hypervisor_kobj, &xen_pmu_group);
+}
+
+static void xen_pmu_destroy(void)
+{
+       sysfs_remove_group(hypervisor_kobj, &xen_pmu_group);
+}
+#endif
+
 static int __init hyper_sysfs_init(void)
 {
        int ret;
@@ -390,7 +513,15 @@ static int __init hyper_sysfs_init(void)
        ret = xen_properties_init();
        if (ret)
                goto prop_out;
-
+#ifdef CONFIG_XEN_HAVE_VPMU
+       if (xen_initial_domain()) {
+               ret = xen_pmu_init();
+               if (ret) {
+                       xen_properties_destroy();
+                       goto prop_out;
+               }
+       }
+#endif
        goto out;
 
 prop_out:
@@ -407,6 +538,9 @@ out:
 
 static void __exit hyper_sysfs_exit(void)
 {
+#ifdef CONFIG_XEN_HAVE_VPMU
+       xen_pmu_destroy();
+#endif
        xen_properties_destroy();
        xen_compilation_destroy();
        xen_sysfs_uuid_destroy();
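The CONFIG_XEN_HAVE_VPMU block above registers a "pmu" attribute group on hypervisor_kobj from dom0, so the controls land under /sys/hypervisor/pmu/: pmu_mode accepts off, self, hv or all, and pmu_features takes a numeric mask that is read back as hex. A small user-space sketch follows, assuming a dom0 root process and sysfs mounted in the usual place; it is illustrative and not part of this commit.

#include <stdio.h>

int main(void)
{
        const char *path = "/sys/hypervisor/pmu/pmu_mode";
        char mode[16] = "";
        FILE *f;

        /* Read the current vPMU mode: one of off, self, hv, all. */
        f = fopen(path, "r");
        if (!f) {
                perror(path);
                return 1;
        }
        if (fgets(mode, sizeof(mode), f))
                printf("current vPMU mode: %s", mode);
        fclose(f);

        /* Switch to per-domain profiling; requires dom0 root. */
        f = fopen(path, "w");
        if (!f) {
                perror(path);
                return 1;
        }
        fputs("self\n", f);
        fclose(f);
        return 0;
}

pmu_features works the same way: the store handler parses the written string with kstrtou32() and the show handler prints the mask back as 0x%x.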
index 239738f944badfa3f12f3d61581ef5cb4d6910d4..945fc43272017cfe05bab171eb8b97e62057244c 100644 (file)
@@ -129,21 +129,17 @@ static int xen_tmem_new_pool(struct tmem_pool_uuid uuid,
 /* xen generic tmem ops */
 
 static int xen_tmem_put_page(u32 pool_id, struct tmem_oid oid,
-                            u32 index, unsigned long pfn)
+                            u32 index, struct page *page)
 {
-       unsigned long gmfn = xen_pv_domain() ? pfn_to_mfn(pfn) : pfn;
-
        return xen_tmem_op(TMEM_PUT_PAGE, pool_id, oid, index,
-               gmfn, 0, 0, 0);
+                          xen_page_to_gfn(page), 0, 0, 0);
 }
 
 static int xen_tmem_get_page(u32 pool_id, struct tmem_oid oid,
-                            u32 index, unsigned long pfn)
+                            u32 index, struct page *page)
 {
-       unsigned long gmfn = xen_pv_domain() ? pfn_to_mfn(pfn) : pfn;
-
        return xen_tmem_op(TMEM_GET_PAGE, pool_id, oid, index,
-               gmfn, 0, 0, 0);
+                          xen_page_to_gfn(page), 0, 0, 0);
 }
 
 static int xen_tmem_flush_page(u32 pool_id, struct tmem_oid oid, u32 index)
@@ -173,14 +169,13 @@ static void tmem_cleancache_put_page(int pool, struct cleancache_filekey key,
 {
        u32 ind = (u32) index;
        struct tmem_oid oid = *(struct tmem_oid *)&key;
-       unsigned long pfn = page_to_pfn(page);
 
        if (pool < 0)
                return;
        if (ind != index)
                return;
        mb(); /* ensure page is quiescent; tmem may address it with an alias */
-       (void)xen_tmem_put_page((u32)pool, oid, ind, pfn);
+       (void)xen_tmem_put_page((u32)pool, oid, ind, page);
 }
 
 static int tmem_cleancache_get_page(int pool, struct cleancache_filekey key,
@@ -188,7 +183,6 @@ static int tmem_cleancache_get_page(int pool, struct cleancache_filekey key,
 {
        u32 ind = (u32) index;
        struct tmem_oid oid = *(struct tmem_oid *)&key;
-       unsigned long pfn = page_to_pfn(page);
        int ret;
 
        /* translate return values to linux semantics */
@@ -196,7 +190,7 @@ static int tmem_cleancache_get_page(int pool, struct cleancache_filekey key,
                return -1;
        if (ind != index)
                return -1;
-       ret = xen_tmem_get_page((u32)pool, oid, ind, pfn);
+       ret = xen_tmem_get_page((u32)pool, oid, ind, page);
        if (ret == 1)
                return 0;
        else
@@ -287,7 +281,6 @@ static int tmem_frontswap_store(unsigned type, pgoff_t offset,
 {
        u64 ind64 = (u64)offset;
        u32 ind = (u32)offset;
-       unsigned long pfn = page_to_pfn(page);
        int pool = tmem_frontswap_poolid;
        int ret;
 
@@ -296,7 +289,7 @@ static int tmem_frontswap_store(unsigned type, pgoff_t offset,
        if (ind64 != ind)
                return -1;
        mb(); /* ensure page is quiescent; tmem may address it with an alias */
-       ret = xen_tmem_put_page(pool, oswiz(type, ind), iswiz(ind), pfn);
+       ret = xen_tmem_put_page(pool, oswiz(type, ind), iswiz(ind), page);
        /* translate Xen tmem return values to linux semantics */
        if (ret == 1)
                return 0;
@@ -313,7 +306,6 @@ static int tmem_frontswap_load(unsigned type, pgoff_t offset,
 {
        u64 ind64 = (u64)offset;
        u32 ind = (u32)offset;
-       unsigned long pfn = page_to_pfn(page);
        int pool = tmem_frontswap_poolid;
        int ret;
 
@@ -321,7 +313,7 @@ static int tmem_frontswap_load(unsigned type, pgoff_t offset,
                return -1;
        if (ind64 != ind)
                return -1;
-       ret = xen_tmem_get_page(pool, oswiz(type, ind), iswiz(ind), pfn);
+       ret = xen_tmem_get_page(pool, oswiz(type, ind), iswiz(ind), page);
        /* translate Xen tmem return values to linux semantics */
        if (ret == 1)
                return 0;
index e30353575d5da11f75e8c927ba53945a23b73d76..2ba09c1195c87c68b47b2b210d6d0eb32c5ae056 100644 (file)
@@ -380,7 +380,7 @@ int xenbus_grant_ring(struct xenbus_device *dev, void *vaddr,
 
        for (i = 0; i < nr_pages; i++) {
                err = gnttab_grant_foreign_access(dev->otherend_id,
-                                                 virt_to_mfn(vaddr), 0);
+                                                 virt_to_gfn(vaddr), 0);
                if (err < 0) {
                        xenbus_dev_fatal(dev, err,
                                         "granting access to ring page");
index b17707ee07d4f3057c1ab529f91dcbfd15203ff9..ee6d9efd7b768c44a40bc81172af685c03cab1ea 100644 (file)
@@ -49,7 +49,7 @@ static long xenbus_alloc(domid_t domid)
                goto out_err;
 
        gnttab_grant_foreign_access_ref(GNTTAB_RESERVED_XENSTORE, domid,
-                       virt_to_mfn(xen_store_interface), 0 /* writable */);
+                       virt_to_gfn(xen_store_interface), 0 /* writable */);
 
        arg.dom = DOMID_SELF;
        arg.remote_dom = domid;
index 4308fb3cf7c2f717ffd446035f1c30b61f8dacf8..3cbe0556de26625b75f1e62f02d8372555d59b7f 100644 (file)
@@ -75,7 +75,7 @@ EXPORT_SYMBOL_GPL(xen_store_interface);
 enum xenstore_init xen_store_domain_type;
 EXPORT_SYMBOL_GPL(xen_store_domain_type);
 
-static unsigned long xen_store_mfn;
+static unsigned long xen_store_gfn;
 
 static BLOCKING_NOTIFIER_HEAD(xenstore_chain);
 
@@ -711,9 +711,7 @@ static int __init xenstored_local_init(void)
        if (!page)
                goto out_err;
 
-       xen_store_mfn = xen_start_info->store_mfn =
-               pfn_to_mfn(virt_to_phys((void *)page) >>
-                          PAGE_SHIFT);
+       xen_store_gfn = xen_start_info->store_mfn = virt_to_gfn((void *)page);
 
        /* Next allocate a local port which xenstored can bind to */
        alloc_unbound.dom        = DOMID_SELF;
@@ -787,12 +785,12 @@ static int __init xenbus_init(void)
                err = xenstored_local_init();
                if (err)
                        goto out_error;
-               xen_store_interface = mfn_to_virt(xen_store_mfn);
+               xen_store_interface = gfn_to_virt(xen_store_gfn);
                break;
        case XS_PV:
                xen_store_evtchn = xen_start_info->store_evtchn;
-               xen_store_mfn = xen_start_info->store_mfn;
-               xen_store_interface = mfn_to_virt(xen_store_mfn);
+               xen_store_gfn = xen_start_info->store_mfn;
+               xen_store_interface = gfn_to_virt(xen_store_gfn);
                break;
        case XS_HVM:
                err = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN, &v);
@@ -802,9 +800,9 @@ static int __init xenbus_init(void)
                err = hvm_get_parameter(HVM_PARAM_STORE_PFN, &v);
                if (err)
                        goto out_error;
-               xen_store_mfn = (unsigned long)v;
+               xen_store_gfn = (unsigned long)v;
                xen_store_interface =
-                       xen_remap(xen_store_mfn << PAGE_SHIFT, PAGE_SIZE);
+                       xen_remap(xen_store_gfn << PAGE_SHIFT, PAGE_SIZE);
                break;
        default:
                pr_warn("Xenstore state unknown\n");
index b019865fcc56b779098c792bc7653cdc2537f2bc..1a83010ddffafc0f6fe816d021732129814c2d00 100644 (file)
@@ -2,3 +2,4 @@ obj-$(CONFIG_XENFS) += xenfs.o
 
 xenfs-y                          = super.o
 xenfs-$(CONFIG_XEN_DOM0) += xenstored.o
+xenfs-$(CONFIG_XEN_SYMS) += xensyms.o
index 06092e0fe8cea4d3a5dfddac4df0e9bb531c5a60..8559a71f36b1a7d6fe2bd2b57b588321ad92f26c 100644 (file)
@@ -57,6 +57,9 @@ static int xenfs_fill_super(struct super_block *sb, void *data, int silent)
                { "privcmd", &xen_privcmd_fops, S_IRUSR|S_IWUSR },
                { "xsd_kva", &xsd_kva_file_ops, S_IRUSR|S_IWUSR},
                { "xsd_port", &xsd_port_file_ops, S_IRUSR|S_IWUSR},
+#ifdef CONFIG_XEN_SYMS
+               { "xensyms", &xensyms_ops, S_IRUSR},
+#endif
                {""},
        };
 
index 6b80c7779c0217bdddca45d977e5d823df150b5a..2c5934ea9b1e5498c4f92ff2b74a40f11a973a6d 100644 (file)
@@ -3,5 +3,6 @@
 
 extern const struct file_operations xsd_kva_file_ops;
 extern const struct file_operations xsd_port_file_ops;
+extern const struct file_operations xensyms_ops;
 
 #endif /* _XENFS_XENBUS_H */
diff --git a/drivers/xen/xenfs/xensyms.c b/drivers/xen/xenfs/xensyms.c
new file mode 100644 (file)
index 0000000..f8b1285
--- /dev/null
@@ -0,0 +1,152 @@
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/seq_file.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/proc_fs.h>
+#include <linux/slab.h>
+#include <xen/interface/platform.h>
+#include <asm/xen/hypercall.h>
+#include <xen/xen-ops.h>
+#include "xenfs.h"
+
+
+#define XEN_KSYM_NAME_LEN 127 /* Hypervisor may have different name length */
+
+struct xensyms {
+       struct xen_platform_op op;
+       char *name;
+       uint32_t namelen;
+};
+
+/* Grab next output page from the hypervisor */
+static int xensyms_next_sym(struct xensyms *xs)
+{
+       int ret;
+       struct xenpf_symdata *symdata = &xs->op.u.symdata;
+       uint64_t symnum;
+
+       memset(xs->name, 0, xs->namelen);
+       symdata->namelen = xs->namelen;
+
+       symnum = symdata->symnum;
+
+       ret = HYPERVISOR_dom0_op(&xs->op);
+       if (ret < 0)
+               return ret;
+
+       /*
+        * If hypervisor's symbol didn't fit into the buffer then allocate
+        * a larger buffer and try again.
+        */
+       if (unlikely(symdata->namelen > xs->namelen)) {
+               kfree(xs->name);
+
+               xs->namelen = symdata->namelen;
+               xs->name = kzalloc(xs->namelen, GFP_KERNEL);
+               if (!xs->name)
+                       return -ENOMEM;
+
+               set_xen_guest_handle(symdata->name, xs->name);
+               symdata->symnum--; /* Rewind */
+
+               ret = HYPERVISOR_dom0_op(&xs->op);
+               if (ret < 0)
+                       return ret;
+       }
+
+       if (symdata->symnum == symnum)
+               /* End of symbols */
+               return 1;
+
+       return 0;
+}
+
+static void *xensyms_start(struct seq_file *m, loff_t *pos)
+{
+       struct xensyms *xs = (struct xensyms *)m->private;
+
+       xs->op.u.symdata.symnum = *pos;
+
+       if (xensyms_next_sym(xs))
+               return NULL;
+
+       return m->private;
+}
+
+static void *xensyms_next(struct seq_file *m, void *p, loff_t *pos)
+{
+       struct xensyms *xs = (struct xensyms *)m->private;
+
+       xs->op.u.symdata.symnum = ++(*pos);
+
+       if (xensyms_next_sym(xs))
+               return NULL;
+
+       return p;
+}
+
+static int xensyms_show(struct seq_file *m, void *p)
+{
+       struct xensyms *xs = (struct xensyms *)m->private;
+       struct xenpf_symdata *symdata = &xs->op.u.symdata;
+
+       seq_printf(m, "%016llx %c %s\n", symdata->address,
+                  symdata->type, xs->name);
+
+       return 0;
+}
+
+static void xensyms_stop(struct seq_file *m, void *p)
+{
+}
+
+static const struct seq_operations xensyms_seq_ops = {
+       .start = xensyms_start,
+       .next = xensyms_next,
+       .show = xensyms_show,
+       .stop = xensyms_stop,
+};
+
+static int xensyms_open(struct inode *inode, struct file *file)
+{
+       struct seq_file *m;
+       struct xensyms *xs;
+       int ret;
+
+       ret = seq_open_private(file, &xensyms_seq_ops,
+                              sizeof(struct xensyms));
+       if (ret)
+               return ret;
+
+       m = file->private_data;
+       xs = (struct xensyms *)m->private;
+
+       xs->namelen = XEN_KSYM_NAME_LEN + 1;
+       xs->name = kzalloc(xs->namelen, GFP_KERNEL);
+       if (!xs->name) {
+               seq_release_private(inode, file);
+               return -ENOMEM;
+       }
+       set_xen_guest_handle(xs->op.u.symdata.name, xs->name);
+       xs->op.cmd = XENPF_get_symbol;
+       xs->op.u.symdata.namelen = xs->namelen;
+
+       return 0;
+}
+
+static int xensyms_release(struct inode *inode, struct file *file)
+{
+       struct seq_file *m = file->private_data;
+       struct xensyms *xs = (struct xensyms *)m->private;
+
+       kfree(xs->name);
+       return seq_release_private(inode, file);
+}
+
+const struct file_operations xensyms_ops = {
+       .open = xensyms_open,
+       .read = seq_read,
+       .llseek = seq_lseek,
+       .release = xensyms_release
+};
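Together with the xenfs entry added in the super.c hunk above, this seq_file interface lets dom0 dump the hypervisor's symbol table from user space, one "address type name" line per symbol as emitted by xensyms_show(). A minimal reader sketch; the /proc/xen mount point for xenfs is an assumption, not something this commit establishes.

#include <stdio.h>

int main(void)
{
        char line[512];
        /* Assumes xenfs is mounted at /proc/xen (its usual location). */
        FILE *f = fopen("/proc/xen/xensyms", "r");

        if (!f) {
                perror("/proc/xen/xensyms");
                return 1;
        }
        /* Each line is "<address> <type> <name>", one hypervisor symbol per line. */
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);
        fclose(f);
        return 0;
}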
index 58a5389aec891932f18a81770a7d0a248d6c12c2..cff23872c5a965884b227655d08615ef905b91b6 100644 (file)
@@ -38,8 +38,8 @@
 #include <xen/interface/xen.h>
 #include <xen/interface/memory.h>
 
-/* map fgmfn of domid to lpfn in the current domain */
-static int map_foreign_page(unsigned long lpfn, unsigned long fgmfn,
+/* map fgfn of domid to lpfn in the current domain */
+static int map_foreign_page(unsigned long lpfn, unsigned long fgfn,
                            unsigned int domid)
 {
        int rc;
@@ -49,7 +49,7 @@ static int map_foreign_page(unsigned long lpfn, unsigned long fgmfn,
                .size = 1,
                .space = XENMAPSPACE_gmfn_foreign,
        };
-       xen_ulong_t idx = fgmfn;
+       xen_ulong_t idx = fgfn;
        xen_pfn_t gpfn = lpfn;
        int err = 0;
 
@@ -62,13 +62,13 @@ static int map_foreign_page(unsigned long lpfn, unsigned long fgmfn,
 }
 
 struct remap_data {
-       xen_pfn_t *fgmfn; /* foreign domain's gmfn */
+       xen_pfn_t *fgfn; /* foreign domain's gfn */
        pgprot_t prot;
        domid_t  domid;
        struct vm_area_struct *vma;
        int index;
        struct page **pages;
-       struct xen_remap_mfn_info *info;
+       struct xen_remap_gfn_info *info;
        int *err_ptr;
        int mapped;
 };
@@ -82,20 +82,20 @@ static int remap_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr,
        pte_t pte = pte_mkspecial(pfn_pte(pfn, info->prot));
        int rc;
 
-       rc = map_foreign_page(pfn, *info->fgmfn, info->domid);
+       rc = map_foreign_page(pfn, *info->fgfn, info->domid);
        *info->err_ptr++ = rc;
        if (!rc) {
                set_pte_at(info->vma->vm_mm, addr, ptep, pte);
                info->mapped++;
        }
-       info->fgmfn++;
+       info->fgfn++;
 
        return 0;
 }
 
 int xen_xlate_remap_gfn_array(struct vm_area_struct *vma,
                              unsigned long addr,
-                             xen_pfn_t *mfn, int nr,
+                             xen_pfn_t *gfn, int nr,
                              int *err_ptr, pgprot_t prot,
                              unsigned domid,
                              struct page **pages)
@@ -108,7 +108,7 @@ int xen_xlate_remap_gfn_array(struct vm_area_struct *vma,
           x86 PVOPS */
        BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO)));
 
-       data.fgmfn = mfn;
+       data.fgfn = gfn;
        data.prot  = prot;
        data.domid = domid;
        data.vma   = vma;
index 3f89c9e05b4077b5db9b728435713d04d5ed05ae..5b50c4ca43a7dde6ce48049f365a9988da23449b 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/writeback.h>
+#include <linux/blkdev.h>
 #include "affs.h"
 
 static int affs_statfs(struct dentry *dentry, struct kstatfs *buf);
@@ -352,18 +353,19 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
         * blocks, we will have to change it.
         */
 
-       size = sb->s_bdev->bd_inode->i_size >> 9;
+       size = i_size_read(sb->s_bdev->bd_inode) >> 9;
        pr_debug("initial blocksize=%d, #blocks=%d\n", 512, size);
 
        affs_set_blocksize(sb, PAGE_SIZE);
        /* Try to find root block. Its location depends on the block size. */
 
-       i = 512;
-       j = 4096;
+       i = bdev_logical_block_size(sb->s_bdev);
+       j = PAGE_SIZE;
        if (blocksize > 0) {
                i = j = blocksize;
                size = size / (blocksize / 512);
        }
+
        for (blocksize = i; blocksize <= j; blocksize <<= 1, size >>= 1) {
                sbi->s_root_block = root_block;
                if (root_block < 0)
index 33b813e04f7921e390c9f4edec0908d4fcb9337f..22ea424ee741ea1a967676298e7877332fc324b2 100644 (file)
@@ -28,6 +28,7 @@
 #include <linux/namei.h>
 #include <linux/log2.h>
 #include <linux/cleancache.h>
+#include <linux/dax.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
@@ -441,7 +442,7 @@ EXPORT_SYMBOL_GPL(bdev_write_page);
  * accessible at this address.
  */
 long bdev_direct_access(struct block_device *bdev, sector_t sector,
-                       void **addr, unsigned long *pfn, long size)
+                       void __pmem **addr, unsigned long *pfn, long size)
 {
        long avail;
        const struct block_device_operations *ops = bdev->bd_disk->fops;
@@ -462,7 +463,7 @@ long bdev_direct_access(struct block_device *bdev, sector_t sector,
        sector += get_start_sect(bdev);
        if (sector % (PAGE_SIZE / 512))
                return -EINVAL;
-       avail = ops->direct_access(bdev, sector, addr, pfn, size);
+       avail = ops->direct_access(bdev, sector, addr, pfn);
        if (!avail)
                return -ERANGE;
        return min(avail, size);
index 1ce06c849a86db84ca080a3d0fd0398a160b3b9a..3e36e4adc4a35539e9415951118a974db2794c5a 100644 (file)
@@ -42,8 +42,14 @@ struct __btrfs_workqueue {
 
        /* Thresholding related variants */
        atomic_t pending;
-       int max_active;
-       int current_max;
+
+       /* Upper limit of concurrent workers */
+       int limit_active;
+
+       /* Current number of concurrent workers */
+       int current_active;
+
+       /* Threshold to change current_active */
        int thresh;
        unsigned int count;
        spinlock_t thres_lock;
@@ -88,7 +94,7 @@ BTRFS_WORK_HELPER(scrubnc_helper);
 BTRFS_WORK_HELPER(scrubparity_helper);
 
 static struct __btrfs_workqueue *
-__btrfs_alloc_workqueue(const char *name, unsigned int flags, int max_active,
+__btrfs_alloc_workqueue(const char *name, unsigned int flags, int limit_active,
                         int thresh)
 {
        struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
@@ -96,26 +102,31 @@ __btrfs_alloc_workqueue(const char *name, unsigned int flags, int max_active,
        if (!ret)
                return NULL;
 
-       ret->max_active = max_active;
+       ret->limit_active = limit_active;
        atomic_set(&ret->pending, 0);
        if (thresh == 0)
                thresh = DFT_THRESHOLD;
        /* For low threshold, disabling threshold is a better choice */
        if (thresh < DFT_THRESHOLD) {
-               ret->current_max = max_active;
+               ret->current_active = limit_active;
                ret->thresh = NO_THRESHOLD;
        } else {
-               ret->current_max = 1;
+               /*
+                * For threshold-able wq, let its concurrency grow on demand.
+                * Use minimal max_active at alloc time to reduce resource
+                * usage.
+                */
+               ret->current_active = 1;
                ret->thresh = thresh;
        }
 
        if (flags & WQ_HIGHPRI)
                ret->normal_wq = alloc_workqueue("%s-%s-high", flags,
-                                                ret->max_active,
-                                                "btrfs", name);
+                                                ret->current_active, "btrfs",
+                                                name);
        else
                ret->normal_wq = alloc_workqueue("%s-%s", flags,
-                                                ret->max_active, "btrfs",
+                                                ret->current_active, "btrfs",
                                                 name);
        if (!ret->normal_wq) {
                kfree(ret);
@@ -134,7 +145,7 @@ __btrfs_destroy_workqueue(struct __btrfs_workqueue *wq);
 
 struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
                                              unsigned int flags,
-                                             int max_active,
+                                             int limit_active,
                                              int thresh)
 {
        struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
@@ -143,14 +154,14 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
                return NULL;
 
        ret->normal = __btrfs_alloc_workqueue(name, flags & ~WQ_HIGHPRI,
-                                             max_active, thresh);
+                                             limit_active, thresh);
        if (!ret->normal) {
                kfree(ret);
                return NULL;
        }
 
        if (flags & WQ_HIGHPRI) {
-               ret->high = __btrfs_alloc_workqueue(name, flags, max_active,
+               ret->high = __btrfs_alloc_workqueue(name, flags, limit_active,
                                                    thresh);
                if (!ret->high) {
                        __btrfs_destroy_workqueue(ret->normal);
@@ -180,7 +191,7 @@ static inline void thresh_queue_hook(struct __btrfs_workqueue *wq)
  */
 static inline void thresh_exec_hook(struct __btrfs_workqueue *wq)
 {
-       int new_max_active;
+       int new_current_active;
        long pending;
        int need_change = 0;
 
@@ -197,7 +208,7 @@ static inline void thresh_exec_hook(struct __btrfs_workqueue *wq)
        wq->count %= (wq->thresh / 4);
        if (!wq->count)
                goto  out;
-       new_max_active = wq->current_max;
+       new_current_active = wq->current_active;
 
        /*
         * pending may be changed later, but it's OK since we really
@@ -205,19 +216,19 @@ static inline void thresh_exec_hook(struct __btrfs_workqueue *wq)
         */
        pending = atomic_read(&wq->pending);
        if (pending > wq->thresh)
-               new_max_active++;
+               new_current_active++;
        if (pending < wq->thresh / 2)
-               new_max_active--;
-       new_max_active = clamp_val(new_max_active, 1, wq->max_active);
-       if (new_max_active != wq->current_max)  {
+               new_current_active--;
+       new_current_active = clamp_val(new_current_active, 1, wq->limit_active);
+       if (new_current_active != wq->current_active)  {
                need_change = 1;
-               wq->current_max = new_max_active;
+               wq->current_active = new_current_active;
        }
 out:
        spin_unlock(&wq->thres_lock);
 
        if (need_change) {
-               workqueue_set_max_active(wq->normal_wq, wq->current_max);
+               workqueue_set_max_active(wq->normal_wq, wq->current_active);
        }
 }
 
@@ -351,13 +362,13 @@ void btrfs_destroy_workqueue(struct btrfs_workqueue *wq)
        kfree(wq);
 }
 
-void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max)
+void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int limit_active)
 {
        if (!wq)
                return;
-       wq->normal->max_active = max;
+       wq->normal->limit_active = limit_active;
        if (wq->high)
-               wq->high->max_active = max;
+               wq->high->limit_active = limit_active;
 }
 
 void btrfs_set_work_high_priority(struct btrfs_work *work)
index b0b093b6afeca3654d44865a20d2b66305013f62..ad4d0647d1a6c03b6b3ba9d1b4aae9a1ceff6df5 100644 (file)
@@ -69,7 +69,7 @@ BTRFS_WORK_HELPER_PROTO(scrubparity_helper);
 
 struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
                                              unsigned int flags,
-                                             int max_active,
+                                             int limit_active,
                                              int thresh);
 void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t helper,
                     btrfs_func_t func,
index 564a7de17d99831083c46bc19fd859d40a5d51a2..e54dd5905cee177912e03c915a8d471762bfc0cd 100644 (file)
@@ -183,8 +183,7 @@ no_valid_dev_replace_entry_found:
        }
 
 out:
-       if (path)
-               btrfs_free_path(path);
+       btrfs_free_path(path);
        return ret;
 }
 
index 9ebd34f1c67752590beaae1288ed506f8b9fc561..0d98aee34fee8f716771e46a70cbee478ac27e2d 100644 (file)
@@ -3443,6 +3443,26 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
        return 0;
 }
 
+int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
+{
+       if ((flags & (BTRFS_BLOCK_GROUP_DUP |
+                     BTRFS_BLOCK_GROUP_RAID0 |
+                     BTRFS_AVAIL_ALLOC_BIT_SINGLE)) ||
+           ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0))
+               return 0;
+
+       if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
+                    BTRFS_BLOCK_GROUP_RAID5 |
+                    BTRFS_BLOCK_GROUP_RAID10))
+               return 1;
+
+       if (flags & BTRFS_BLOCK_GROUP_RAID6)
+               return 2;
+
+       pr_warn("BTRFS: unknown raid type: %llu\n", flags);
+       return 0;
+}
+
 int btrfs_calc_num_tolerated_disk_barrier_failures(
        struct btrfs_fs_info *fs_info)
 {
@@ -3452,13 +3472,12 @@ int btrfs_calc_num_tolerated_disk_barrier_failures(
                       BTRFS_BLOCK_GROUP_SYSTEM,
                       BTRFS_BLOCK_GROUP_METADATA,
                       BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA};
-       int num_types = 4;
        int i;
        int c;
        int num_tolerated_disk_barrier_failures =
                (int)fs_info->fs_devices->num_devices;
 
-       for (i = 0; i < num_types; i++) {
+       for (i = 0; i < ARRAY_SIZE(types); i++) {
                struct btrfs_space_info *tmp;
 
                sinfo = NULL;
@@ -3476,44 +3495,21 @@ int btrfs_calc_num_tolerated_disk_barrier_failures(
 
                down_read(&sinfo->groups_sem);
                for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
-                       if (!list_empty(&sinfo->block_groups[c])) {
-                               u64 flags;
-
-                               btrfs_get_block_group_info(
-                                       &sinfo->block_groups[c], &space);
-                               if (space.total_bytes == 0 ||
-                                   space.used_bytes == 0)
-                                       continue;
-                               flags = space.flags;
-                               /*
-                                * return
-                                * 0: if dup, single or RAID0 is configured for
-                                *    any of metadata, system or data, else
-                                * 1: if RAID5 is configured, or if RAID1 or
-                                *    RAID10 is configured and only two mirrors
-                                *    are used, else
-                                * 2: if RAID6 is configured, else
-                                * num_mirrors - 1: if RAID1 or RAID10 is
-                                *                  configured and more than
-                                *                  2 mirrors are used.
-                                */
-                               if (num_tolerated_disk_barrier_failures > 0 &&
-                                   ((flags & (BTRFS_BLOCK_GROUP_DUP |
-                                              BTRFS_BLOCK_GROUP_RAID0)) ||
-                                    ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)
-                                     == 0)))
-                                       num_tolerated_disk_barrier_failures = 0;
-                               else if (num_tolerated_disk_barrier_failures > 1) {
-                                       if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
-                                           BTRFS_BLOCK_GROUP_RAID5 |
-                                           BTRFS_BLOCK_GROUP_RAID10)) {
-                                               num_tolerated_disk_barrier_failures = 1;
-                                       } else if (flags &
-                                                  BTRFS_BLOCK_GROUP_RAID6) {
-                                               num_tolerated_disk_barrier_failures = 2;
-                                       }
-                               }
-                       }
+                       u64 flags;
+
+                       if (list_empty(&sinfo->block_groups[c]))
+                               continue;
+
+                       btrfs_get_block_group_info(&sinfo->block_groups[c],
+                                                  &space);
+                       if (space.total_bytes == 0 || space.used_bytes == 0)
+                               continue;
+                       flags = space.flags;
+
+                       num_tolerated_disk_barrier_failures = min(
+                               num_tolerated_disk_barrier_failures,
+                               btrfs_get_num_tolerated_disk_barrier_failures(
+                                       flags));
                }
                up_read(&sinfo->groups_sem);
        }
index d4cbfeeeedd42050d3b2cd61707f80781d258328..bdfb479ea85955112305d0c30a17af9c7647daed 100644 (file)
@@ -139,6 +139,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
                                     u64 objectid);
 int btree_lock_page_hook(struct page *page, void *data,
                                void (*flush_fn)(void *));
+int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags);
 int btrfs_calc_num_tolerated_disk_barrier_failures(
        struct btrfs_fs_info *fs_info);
 int __init btrfs_end_io_wq_init(void);
index 237da012f7d09e3ac0e4c4aabef8224a5d5a06f1..a0fa7253a2d77b6faa2e71366c762b331cd784b2 100644 (file)
@@ -6909,8 +6909,7 @@ out:
 
        trace_btrfs_get_extent(root, em);
 
-       if (path)
-               btrfs_free_path(path);
+       btrfs_free_path(path);
        if (trans) {
                ret = btrfs_end_transaction(trans, root);
                if (!err)
index 9a11db0c47ee7bc1b51d8f79226a9a25f7a3b8eb..a39f5d1144e8e0fe90b459f3c5528d5672be0d4a 100644 (file)
@@ -3267,13 +3267,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
                        scrub_blocked_if_needed(fs_info);
                }
 
-               /* for raid56, we skip parity stripe */
                if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
                        ret = get_raid56_logic_offset(physical, num, map,
                                                      &logical,
                                                      &stripe_logical);
                        logical += base;
                        if (ret) {
+                               /* it is a parity stripe */
                                stripe_logical += base;
                                stripe_end = stripe_logical + increment;
                                ret = scrub_raid56_parity(sctx, map, scrub_dev,
@@ -3480,7 +3480,6 @@ out:
 
 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
                                          struct btrfs_device *scrub_dev,
-                                         u64 chunk_tree, u64 chunk_objectid,
                                          u64 chunk_offset, u64 length,
                                          u64 dev_offset, int is_dev_replace)
 {
@@ -3531,8 +3530,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
        struct btrfs_root *root = sctx->dev_root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        u64 length;
-       u64 chunk_tree;
-       u64 chunk_objectid;
        u64 chunk_offset;
        int ret = 0;
        int slot;
@@ -3596,8 +3593,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
                if (found_key.offset + length <= start)
                        goto skip;
 
-               chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
-               chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
                chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
 
                /*
@@ -3630,9 +3625,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
                dev_replace->cursor_right = found_key.offset + length;
                dev_replace->cursor_left = found_key.offset;
                dev_replace->item_needs_writeback = 1;
-               ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
-                                 chunk_offset, length, found_key.offset,
-                                 is_dev_replace);
+               ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
+                                 found_key.offset, is_dev_replace);
 
                /*
                 * flush, submit all pending read and write bios, afterwards
index a4b9c8b2d35ab9d93676588ad426726788f032ce..f31db43253399e032c76d52d6520bbff41907aa6 100644 (file)
@@ -115,8 +115,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
                ret = -EAGAIN;
        }
 out:
-       if (path)
-               btrfs_free_path(path);
+       btrfs_free_path(path);
        if (ret == -EAGAIN) {
                if (root->defrag_max.objectid > root->defrag_progress.objectid)
                        goto done;
index 76201d6f6ce46371784e8fa6462bc2f5fde3879b..6fc735869c186c35fb79fa66decc7d3519ed2e93 100644 (file)
@@ -3585,23 +3585,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
        } while (read_seqretry(&fs_info->profiles_lock, seq));
 
        if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
-               int num_tolerated_disk_barrier_failures;
-               u64 target = bctl->sys.target;
-
-               num_tolerated_disk_barrier_failures =
-                       btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
-               if (num_tolerated_disk_barrier_failures > 0 &&
-                   (target &
-                    (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
-                     BTRFS_AVAIL_ALLOC_BIT_SINGLE)))
-                       num_tolerated_disk_barrier_failures = 0;
-               else if (num_tolerated_disk_barrier_failures > 1 &&
-                        (target &
-                         (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)))
-                       num_tolerated_disk_barrier_failures = 1;
-
-               fs_info->num_tolerated_disk_barrier_failures =
-                       num_tolerated_disk_barrier_failures;
+               fs_info->num_tolerated_disk_barrier_failures = min(
+                       btrfs_calc_num_tolerated_disk_barrier_failures(fs_info),
+                       btrfs_get_num_tolerated_disk_barrier_failures(
+                               bctl->sys.target));
        }
 
        ret = insert_balance_item(fs_info->tree_root, bctl);
index 890c50971a690472f6dc795b00fd0bcfbcdf1a39..9d23e788d1dfdab235d1edd0f8d1d3e065904e65 100644 (file)
@@ -276,7 +276,7 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
        for (i = 0; i < num_pages; i++) {
                struct page *page = osd_data->pages[i];
 
-               if (rc < 0)
+               if (rc < 0 && rc != ENOENT)
                        goto unlock;
                if (bytes < (int)PAGE_CACHE_SIZE) {
                        /* zero (remainder of) page */
@@ -717,8 +717,10 @@ static int ceph_writepages_start(struct address_space *mapping,
             wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
             (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
 
-       if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
+       if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
                pr_warn("writepage_start %p on forced umount\n", inode);
+               truncate_pagecache(inode, 0);
+               mapping_set_error(mapping, -EIO);
                return -EIO; /* we're in a forced umount, don't write! */
        }
        if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize)
@@ -1593,7 +1595,7 @@ out:
        return err;
 }
 
-static struct vm_operations_struct ceph_vmops = {
+static const struct vm_operations_struct ceph_vmops = {
        .fault          = ceph_filemap_fault,
        .page_mkwrite   = ceph_page_mkwrite,
 };
index ddd5e94712904501db729c51b59de72cd88ddb5a..27b566874bc1d2b0494b49ebe1769f380ccf8648 100644 (file)
@@ -2413,6 +2413,14 @@ again:
                        goto out_unlock;
                }
 
+               if (!__ceph_is_any_caps(ci) &&
+                   ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+                       dout("get_cap_refs %p forced umount\n", inode);
+                       *err = -EIO;
+                       ret = 1;
+                       goto out_unlock;
+               }
+
                dout("get_cap_refs %p have %s needed %s\n", inode,
                     ceph_cap_string(have), ceph_cap_string(need));
        }
index 8b79d87eaf4675ff91cf05c10a3fc53e70d5b313..0c62868b5c561b37fa866b68bbe34f3ffb1fd6a6 100644 (file)
@@ -136,7 +136,6 @@ int ceph_open(struct inode *inode, struct file *file)
        struct ceph_mds_client *mdsc = fsc->mdsc;
        struct ceph_mds_request *req;
        struct ceph_file_info *cf = file->private_data;
-       struct inode *parent_inode = NULL;
        int err;
        int flags, fmode, wanted;
 
@@ -210,10 +209,7 @@ int ceph_open(struct inode *inode, struct file *file)
        ihold(inode);
 
        req->r_num_caps = 1;
-       if (flags & O_CREAT)
-               parent_inode = ceph_get_dentry_parent_inode(file->f_path.dentry);
-       err = ceph_mdsc_do_request(mdsc, parent_inode, req);
-       iput(parent_inode);
+       err = ceph_mdsc_do_request(mdsc, NULL, req);
        if (!err)
                err = ceph_init_file(inode, file, req->r_fmode);
        ceph_mdsc_put_request(req);
@@ -279,7 +275,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
        if (err)
                goto out_req;
 
-       if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
+       if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
                err = ceph_handle_notrace_create(dir, dentry);
 
        if (d_unhashed(dentry)) {
@@ -956,6 +952,12 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
        /* We can write back this queue in page reclaim */
        current->backing_dev_info = inode_to_bdi(inode);
 
+       if (iocb->ki_flags & IOCB_APPEND) {
+               err = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false);
+               if (err < 0)
+                       goto out;
+       }
+
        err = generic_write_checks(iocb, from);
        if (err <= 0)
                goto out;
index 6aa07af67603ada211f49268d3845ea62b625720..51cb02da75d98979b18e05d7bdad4dee89e258d4 100644 (file)
@@ -2107,7 +2107,6 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
        msg = create_request_message(mdsc, req, mds, drop_cap_releases);
        if (IS_ERR(msg)) {
                req->r_err = PTR_ERR(msg);
-               complete_request(mdsc, req);
                return PTR_ERR(msg);
        }
        req->r_request = msg;
@@ -2135,7 +2134,7 @@ static int __do_request(struct ceph_mds_client *mdsc,
 {
        struct ceph_mds_session *session = NULL;
        int mds = -1;
-       int err = -EAGAIN;
+       int err = 0;
 
        if (req->r_err || req->r_got_result) {
                if (req->r_aborted)
@@ -2149,6 +2148,11 @@ static int __do_request(struct ceph_mds_client *mdsc,
                err = -EIO;
                goto finish;
        }
+       if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+               dout("do_request forced umount\n");
+               err = -EIO;
+               goto finish;
+       }
 
        put_request_session(req);
 
@@ -2196,13 +2200,15 @@ static int __do_request(struct ceph_mds_client *mdsc,
 
 out_session:
        ceph_put_mds_session(session);
+finish:
+       if (err) {
+               dout("__do_request early error %d\n", err);
+               req->r_err = err;
+               complete_request(mdsc, req);
+               __unregister_request(mdsc, req);
+       }
 out:
        return err;
-
-finish:
-       req->r_err = err;
-       complete_request(mdsc, req);
-       goto out;
 }
 
 /*
@@ -2289,8 +2295,6 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
 
        if (req->r_err) {
                err = req->r_err;
-               __unregister_request(mdsc, req);
-               dout("do_request early error %d\n", err);
                goto out;
        }
 
@@ -2411,7 +2415,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
                mutex_unlock(&mdsc->mutex);
                goto out;
        }
-       if (req->r_got_safe && !head->safe) {
+       if (req->r_got_safe) {
                pr_warn("got unsafe after safe on %llu from mds%d\n",
                           tid, mds);
                mutex_unlock(&mdsc->mutex);
@@ -2520,8 +2524,7 @@ out_err:
                if (err) {
                        req->r_err = err;
                } else {
-                       req->r_reply = msg;
-                       ceph_msg_get(msg);
+                       req->r_reply =  ceph_msg_get(msg);
                        req->r_got_result = true;
                }
        } else {
@@ -3555,7 +3558,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
 {
        u64 want_tid, want_flush, want_snap;
 
-       if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
+       if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
                return;
 
        dout("sync\n");
@@ -3584,7 +3587,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
  */
 static bool done_closing_sessions(struct ceph_mds_client *mdsc)
 {
-       if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
+       if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
                return true;
        return atomic_read(&mdsc->num_sessions) == 0;
 }
@@ -3643,6 +3646,34 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
        dout("stopped\n");
 }
 
+void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
+{
+       struct ceph_mds_session *session;
+       int mds;
+
+       dout("force umount\n");
+
+       mutex_lock(&mdsc->mutex);
+       for (mds = 0; mds < mdsc->max_sessions; mds++) {
+               session = __ceph_lookup_mds_session(mdsc, mds);
+               if (!session)
+                       continue;
+               mutex_unlock(&mdsc->mutex);
+               mutex_lock(&session->s_mutex);
+               __close_session(mdsc, session);
+               if (session->s_state == CEPH_MDS_SESSION_CLOSING) {
+                       cleanup_session_requests(mdsc, session);
+                       remove_session_caps(session);
+               }
+               mutex_unlock(&session->s_mutex);
+               ceph_put_mds_session(session);
+               mutex_lock(&mdsc->mutex);
+               kick_requests(mdsc, mds);
+       }
+       __wake_requests(mdsc, &mdsc->waiting_for_map);
+       mutex_unlock(&mdsc->mutex);
+}
+
 static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
 {
        dout("stop\n");
index 762757e6cebf95fff324894d1650b8271210acdd..f575eafe2261cbd5974d8d4f072879e9d5bd7a39 100644 (file)
@@ -366,6 +366,7 @@ extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
 
 extern int ceph_mdsc_init(struct ceph_fs_client *fsc);
 extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
+extern void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc);
 extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
 
 extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
index 233d906aec02b7c4508fd2488908bdb95a130aa4..4aa7122a8d38c18dd4fe7443fb64f46765b83280 100644 (file)
@@ -338,12 +338,6 @@ static int build_snap_context(struct ceph_snap_realm *realm)
                return 0;
        }
 
-       if (num == 0 && realm->seq == ceph_empty_snapc->seq) {
-               ceph_get_snap_context(ceph_empty_snapc);
-               snapc = ceph_empty_snapc;
-               goto done;
-       }
-
        /* alloc new snap context */
        err = -ENOMEM;
        if (num > (SIZE_MAX - sizeof(*snapc)) / sizeof(u64))
@@ -381,7 +375,6 @@ static int build_snap_context(struct ceph_snap_realm *realm)
             realm->ino, realm, snapc, snapc->seq,
             (unsigned int) snapc->num_snaps);
 
-done:
        ceph_put_snap_context(realm->cached_context);
        realm->cached_context = snapc;
        return 0;
index 7b6bfcbf801cac7bf5c54f4543809c1bb6c76d87..f446afada328a45c2b70f648cbb48261e11e996c 100644 (file)
@@ -708,6 +708,7 @@ static void ceph_umount_begin(struct super_block *sb)
        if (!fsc)
                return;
        fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
+       ceph_mdsc_force_umount(fsc->mdsc);
        return;
 }
 
diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h
new file mode 100644 (file)
index 0000000..0065256
--- /dev/null
@@ -0,0 +1,42 @@
+/*
+ *   fs/cifs/cifs_ioctl.h
+ *
+ *   Structure definitions for io control for cifs/smb3
+ *
+ *   Copyright (c) 2015 Steve French <steve.french@primarydata.com>
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ */
+
+struct smb_mnt_fs_info {
+       __u32   version; /* 0001 */
+       __u16   protocol_id;
+       __u16   tcon_flags;
+       __u32   vol_serial_number;
+       __u32   vol_create_time;
+       __u32   share_caps;
+       __u32   share_flags;
+       __u32   sector_flags;
+       __u32   optimal_sector_size;
+       __u32   max_bytes_chunk;
+       __u32   fs_attributes;
+       __u32   max_path_component;
+       __u32   device_type;
+       __u32   device_characteristics;
+       __u32   maximal_access;
+       __u64   cifs_posix_caps;
+} __packed;
+
+#define CIFS_IOCTL_MAGIC       0xCF
+#define CIFS_IOC_COPYCHUNK_FILE        _IOW(CIFS_IOCTL_MAGIC, 3, int)
+#define CIFS_IOC_SET_INTEGRITY  _IO(CIFS_IOCTL_MAGIC, 4)
+#define CIFS_IOC_GET_MNT_INFO _IOR(CIFS_IOCTL_MAGIC, 5, struct smb_mnt_fs_info)
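A hedged user-space sketch of issuing the new CIFS_IOC_GET_MNT_INFO ioctl against an open file on a cifs mount. Since cifs_ioctl.h is a kernel-internal header rather than an exported uapi one, the structure layout and ioctl number are repeated verbatim below; the fields printed at the end are chosen purely for illustration.

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/ioctl.h>
#include <linux/types.h>

/* Copied from fs/cifs/cifs_ioctl.h above. */
struct smb_mnt_fs_info {
        __u32   version;
        __u16   protocol_id;
        __u16   tcon_flags;
        __u32   vol_serial_number;
        __u32   vol_create_time;
        __u32   share_caps;
        __u32   share_flags;
        __u32   sector_flags;
        __u32   optimal_sector_size;
        __u32   max_bytes_chunk;
        __u32   fs_attributes;
        __u32   max_path_component;
        __u32   device_type;
        __u32   device_characteristics;
        __u32   maximal_access;
        __u64   cifs_posix_caps;
} __attribute__((packed));

#define CIFS_IOCTL_MAGIC        0xCF
#define CIFS_IOC_GET_MNT_INFO   _IOR(CIFS_IOCTL_MAGIC, 5, struct smb_mnt_fs_info)

int main(int argc, char **argv)
{
        struct smb_mnt_fs_info info = { 0 };
        int fd;

        if (argc < 2) {
                fprintf(stderr, "usage: %s <file on a cifs mount>\n", argv[0]);
                return 1;
        }
        fd = open(argv[1], O_RDONLY);
        if (fd < 0 || ioctl(fd, CIFS_IOC_GET_MNT_INFO, &info) < 0) {
                perror("CIFS_IOC_GET_MNT_INFO");
                return 1;
        }
        printf("vol serial 0x%x, device type 0x%x, max chunk %u bytes\n",
               info.vol_serial_number, info.device_type, info.max_bytes_chunk);
        close(fd);
        return 0;
}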
index 6a1119e87fbb6fb636e4d76e814574402b7dc139..e739950ca08485543db80bee53f1021384ed0b9a 100644 (file)
@@ -325,8 +325,11 @@ cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server)
 static void
 cifs_show_security(struct seq_file *s, struct cifs_ses *ses)
 {
-       if (ses->sectype == Unspecified)
+       if (ses->sectype == Unspecified) {
+               if (ses->user_name == NULL)
+                       seq_puts(s, ",sec=none");
                return;
+       }
 
        seq_puts(s, ",sec=");
 
index a782b22904e40b71387d844a6a7879bab8191a88..27aea110e92365e1e91610579369215cc54644ea 100644 (file)
@@ -136,5 +136,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 extern const struct export_operations cifs_export_ops;
 #endif /* CONFIG_CIFS_NFSD_EXPORT */
 
-#define CIFS_VERSION   "2.06"
+#define CIFS_VERSION   "2.07"
 #endif                         /* _CIFSFS_H */
index 47b030da0781e988c7bcafefd462a5e8b45ff14c..f5b87303ce46d50abab41e6017c33cecb06194bb 100644 (file)
@@ -2245,6 +2245,20 @@ typedef struct {
 #define FILE_DEVICE_VIRTUAL_DISK        0x00000024
 #define FILE_DEVICE_NETWORK_REDIRECTOR  0x00000028
 
+/* Device Characteristics */
+#define FILE_REMOVABLE_MEDIA                   0x00000001
+#define FILE_READ_ONLY_DEVICE                  0x00000002
+#define FILE_FLOPPY_DISKETTE                   0x00000004
+#define FILE_WRITE_ONCE_MEDIA                  0x00000008
+#define FILE_REMOTE_DEVICE                     0x00000010
+#define FILE_DEVICE_IS_MOUNTED                 0x00000020
+#define FILE_VIRTUAL_VOLUME                    0x00000040
+#define FILE_DEVICE_SECURE_OPEN                        0x00000100
+#define FILE_CHARACTERISTIC_TS_DEVICE          0x00001000
+#define FILE_CHARACTERISTIC_WEBDAV_DEVICE      0x00002000
+#define FILE_PORTABLE_DEVICE                   0x00004000
+#define FILE_DEVICE_ALLOW_APPCONTAINER_TRAVERSAL 0x00020000
+
 typedef struct {
        __le32 DeviceType;
        __le32 DeviceCharacteristics;
index 672ef35c9f73c59d1f4b566bec2f16b4c74e4a8f..90b4f9f7de660a261b5f93322af59282294953fd 100644 (file)
@@ -696,7 +696,9 @@ cifs_echo_callback(struct mid_q_entry *mid)
 {
        struct TCP_Server_Info *server = mid->callback_data;
 
+       mutex_lock(&server->srv_mutex);
        DeleteMidQEntry(mid);
+       mutex_unlock(&server->srv_mutex);
        add_credits(server, 1, CIFS_ECHO_OP);
 }
 
@@ -1572,7 +1574,9 @@ cifs_readv_callback(struct mid_q_entry *mid)
        }
 
        queue_work(cifsiod_wq, &rdata->work);
+       mutex_lock(&server->srv_mutex);
        DeleteMidQEntry(mid);
+       mutex_unlock(&server->srv_mutex);
        add_credits(server, 1, 0);
 }
 
@@ -2032,6 +2036,7 @@ cifs_writev_callback(struct mid_q_entry *mid)
 {
        struct cifs_writedata *wdata = mid->callback_data;
        struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
+       struct TCP_Server_Info *server = tcon->ses->server;
        unsigned int written;
        WRITE_RSP *smb = (WRITE_RSP *)mid->resp_buf;
 
@@ -2068,7 +2073,9 @@ cifs_writev_callback(struct mid_q_entry *mid)
        }
 
        queue_work(cifsiod_wq, &wdata->work);
+       mutex_lock(&server->srv_mutex);
        DeleteMidQEntry(mid);
+       mutex_unlock(&server->srv_mutex);
        add_credits(tcon->ses->server, 1, 0);
 }
 
index 3f50cee79df9d3318209e19281acef536b34af37..e2a6af1508af2aef789d0caab21fedfa91d49c60 100644 (file)
@@ -3216,7 +3216,7 @@ cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
        return VM_FAULT_LOCKED;
 }
 
-static struct vm_operations_struct cifs_file_vm_ops = {
+static const struct vm_operations_struct cifs_file_vm_ops = {
        .fault = filemap_fault,
        .map_pages = filemap_map_pages,
        .page_mkwrite = cifs_page_mkwrite,
index 49b8b6e41a188b3a832c3e32c482ed7a99091aa1..28a77bf1d55924693d27d1c701571e1b1fef2d49 100644 (file)
 #include "cifsproto.h"
 #include "cifs_debug.h"
 #include "cifsfs.h"
+#include "cifs_ioctl.h"
 #include <linux/btrfs.h>
 
-#define CIFS_IOCTL_MAGIC       0xCF
-#define CIFS_IOC_COPYCHUNK_FILE        _IOW(CIFS_IOCTL_MAGIC, 3, int)
-#define CIFS_IOC_SET_INTEGRITY  _IO(CIFS_IOCTL_MAGIC, 4)
-
 static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file,
                        unsigned long srcfd, u64 off, u64 len, u64 destoff,
                        bool dup_extents)
@@ -70,6 +67,12 @@ static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file,
                goto out_drop_write;
        }
 
+       if (src_file.file->f_op->unlocked_ioctl != cifs_ioctl) {
+               rc = -EBADF;
+               cifs_dbg(VFS, "src file seems to be from a different filesystem type\n");
+               goto out_fput;
+       }
+
        if ((!src_file.file->private_data) || (!dst_file->private_data)) {
                rc = -EBADF;
                cifs_dbg(VFS, "missing cifsFileInfo on copy range src file\n");
@@ -135,6 +138,43 @@ out_drop_write:
        return rc;
 }
 
+static long smb_mnt_get_fsinfo(unsigned int xid, struct cifs_tcon *tcon,
+                               void __user *arg)
+{
+       int rc = 0;
+       struct smb_mnt_fs_info *fsinf;
+
+       fsinf = kzalloc(sizeof(struct smb_mnt_fs_info), GFP_KERNEL);
+       if (fsinf == NULL)
+               return -ENOMEM;
+
+       fsinf->version = 1;
+       fsinf->protocol_id = tcon->ses->server->vals->protocol_id;
+       fsinf->device_characteristics =
+                       le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics);
+       fsinf->device_type = le32_to_cpu(tcon->fsDevInfo.DeviceType);
+       fsinf->fs_attributes = le32_to_cpu(tcon->fsAttrInfo.Attributes);
+       fsinf->max_path_component =
+               le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength);
+#ifdef CONFIG_CIFS_SMB2
+       fsinf->vol_serial_number = tcon->vol_serial_number;
+       fsinf->vol_create_time = le64_to_cpu(tcon->vol_create_time);
+       fsinf->share_flags = tcon->share_flags;
+       fsinf->share_caps = le32_to_cpu(tcon->capabilities);
+       fsinf->sector_flags = tcon->ss_flags;
+       fsinf->optimal_sector_size = tcon->perf_sector_size;
+       fsinf->max_bytes_chunk = tcon->max_bytes_chunk;
+       fsinf->maximal_access = tcon->maximal_access;
+#endif /* SMB2 */
+       fsinf->cifs_posix_caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
+
+       if (copy_to_user(arg, fsinf, sizeof(struct smb_mnt_fs_info)))
+               rc = -EFAULT;
+
+       kfree(fsinf);
+       return rc;
+}
+
 long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
 {
        struct inode *inode = file_inode(filep);
@@ -148,8 +188,6 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
 
        xid = get_xid();
 
-       cifs_dbg(FYI, "ioctl file %p  cmd %u  arg %lu\n", filep, command, arg);
-
        cifs_sb = CIFS_SB(inode->i_sb);
 
        switch (command) {
@@ -228,6 +266,10 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
                        else
                                rc = -EOPNOTSUPP;
                        break;
+               case CIFS_IOC_GET_MNT_INFO:
+                       tcon = tlink_tcon(pSMBFile->tlink);
+                       rc = smb_mnt_get_fsinfo(xid, tcon, (void __user *)arg);
+                       break;
                default:
                        cifs_dbg(FYI, "unsupported ioctl\n");
                        break;
index b8b4f08ee094e2f8a2b811f076ca0eb4cc3e68b9..070fb2ad85ced4483d28d88c1fa4832e92eef3ba 100644 (file)
@@ -1626,7 +1626,9 @@ smb2_echo_callback(struct mid_q_entry *mid)
        if (mid->mid_state == MID_RESPONSE_RECEIVED)
                credits_received = le16_to_cpu(smb2->hdr.CreditRequest);
 
+       mutex_lock(&server->srv_mutex);
        DeleteMidQEntry(mid);
+       mutex_unlock(&server->srv_mutex);
        add_credits(server, credits_received, CIFS_ECHO_OP);
 }
 
@@ -1810,7 +1812,9 @@ smb2_readv_callback(struct mid_q_entry *mid)
                cifs_stats_fail_inc(tcon, SMB2_READ_HE);
 
        queue_work(cifsiod_wq, &rdata->work);
+       mutex_lock(&server->srv_mutex);
        DeleteMidQEntry(mid);
+       mutex_unlock(&server->srv_mutex);
        add_credits(server, credits_received, 0);
 }
 
@@ -1938,6 +1942,7 @@ smb2_writev_callback(struct mid_q_entry *mid)
 {
        struct cifs_writedata *wdata = mid->callback_data;
        struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
+       struct TCP_Server_Info *server = tcon->ses->server;
        unsigned int written;
        struct smb2_write_rsp *rsp = (struct smb2_write_rsp *)mid->resp_buf;
        unsigned int credits_received = 1;
@@ -1977,7 +1982,9 @@ smb2_writev_callback(struct mid_q_entry *mid)
                cifs_stats_fail_inc(tcon, SMB2_WRITE_HE);
 
        queue_work(cifsiod_wq, &wdata->work);
+       mutex_lock(&server->srv_mutex);
        DeleteMidQEntry(mid);
+       mutex_unlock(&server->srv_mutex);
        add_credits(tcon->ses->server, credits_received, 0);
 }
 
index 126f46b887cc85b6ba6a0f313b918e2d8b026708..2a24c524fb9a90cedd4187460ecb90c0e5dfcf0c 100644 (file)
@@ -644,7 +644,9 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
        }
        spin_unlock(&GlobalMid_Lock);
 
+       mutex_lock(&server->srv_mutex);
        DeleteMidQEntry(mid);
+       mutex_unlock(&server->srv_mutex);
        return rc;
 }
 
index 9b1ffaa0572e5825d8c617880bfbc19be166313b..f6c6c8adbc01efd495de1d6e914dcf0f5b62c10a 100644 (file)
@@ -353,7 +353,7 @@ int venus_readlink(struct super_block *sb, struct CodaFid *fid,
         char *result;
         
        insize = max_t(unsigned int,
-                    INSIZE(readlink), OUTSIZE(readlink)+ *length + 1);
+                    INSIZE(readlink), OUTSIZE(readlink)+ *length);
        UPARG(CODA_READLINK);
 
         inp->coda_readlink.VFid = *fid;
@@ -361,8 +361,8 @@ int venus_readlink(struct super_block *sb, struct CodaFid *fid,
        error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
        if (!error) {
                retlen = outp->coda_readlink.count;
-               if ( retlen > *length )
-                       retlen = *length;
+               if (retlen >= *length)
+                       retlen = *length - 1;
                *length = retlen;
                result =  (char *)outp + (long)outp->coda_readlink.data;
                memcpy(buffer, result, retlen);
index c5ecde6f3eed975af7756c17cec4f3b1748dbc83..a8f75640ac86ec2d29cd55a253b2c3c12c7bac9b 100644 (file)
@@ -513,10 +513,10 @@ void do_coredump(const siginfo_t *siginfo)
        const struct cred *old_cred;
        struct cred *cred;
        int retval = 0;
-       int flag = 0;
        int ispipe;
        struct files_struct *displaced;
-       bool need_nonrelative = false;
+       /* require nonrelative corefile path and be extra careful */
+       bool need_suid_safe = false;
        bool core_dumped = false;
        static atomic_t core_dump_count = ATOMIC_INIT(0);
        struct coredump_params cprm = {
@@ -550,9 +550,8 @@ void do_coredump(const siginfo_t *siginfo)
         */
        if (__get_dumpable(cprm.mm_flags) == SUID_DUMP_ROOT) {
                /* Setuid core dump mode */
-               flag = O_EXCL;          /* Stop rewrite attacks */
                cred->fsuid = GLOBAL_ROOT_UID;  /* Dump root private */
-               need_nonrelative = true;
+               need_suid_safe = true;
        }
 
        retval = coredump_wait(siginfo->si_signo, &core_state);
@@ -633,7 +632,7 @@ void do_coredump(const siginfo_t *siginfo)
                if (cprm.limit < binfmt->min_coredump)
                        goto fail_unlock;
 
-               if (need_nonrelative && cn.corename[0] != '/') {
+               if (need_suid_safe && cn.corename[0] != '/') {
                        printk(KERN_WARNING "Pid %d(%s) can only dump core "\
                                "to fully qualified path!\n",
                                task_tgid_vnr(current), current->comm);
@@ -641,8 +640,35 @@ void do_coredump(const siginfo_t *siginfo)
                        goto fail_unlock;
                }
 
+               /*
+                * Unlink the file if it exists unless this is a SUID
+                * binary - in that case, we're running around with root
+                * privs and don't want to unlink another user's coredump.
+                */
+               if (!need_suid_safe) {
+                       mm_segment_t old_fs;
+
+                       old_fs = get_fs();
+                       set_fs(KERNEL_DS);
+                       /*
+                        * If it doesn't exist, that's fine. If there's some
+                        * other problem, we'll catch it at the filp_open().
+                        */
+                       (void) sys_unlink((const char __user *)cn.corename);
+                       set_fs(old_fs);
+               }
+
+               /*
+                * There is a race between unlinking and creating the
+                * file, but if that causes an EEXIST here, that's
+                * fine - another process raced with us while creating
+                * the corefile, and the other process won. To userspace,
+                * what matters is that at least one of the two processes
+                * writes its coredump successfully, not which one.
+                */
                cprm.file = filp_open(cn.corename,
-                                O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
+                                O_CREAT | 2 | O_NOFOLLOW |
+                                O_LARGEFILE | O_EXCL,
                                 0600);
                if (IS_ERR(cprm.file))
                        goto fail_unlock;
@@ -659,11 +685,15 @@ void do_coredump(const siginfo_t *siginfo)
                if (!S_ISREG(inode->i_mode))
                        goto close_fail;
                /*
-                * Dont allow local users get cute and trick others to coredump
-                * into their pre-created files.
+                * Don't dump core if the filesystem changed owner or mode
+                * of the file during file creation. This is an issue when
+                * a process dumps core while its cwd is e.g. on a vfat
+                * filesystem.
                 */
                if (!uid_eq(inode->i_uid, current_fsuid()))
                        goto close_fail;
+               if ((inode->i_mode & 0677) != 0600)
+                       goto close_fail;
                if (!(cprm.file->f_mode & FMODE_CAN_WRITE))
                        goto close_fail;
                if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
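The comments added above describe the pattern: unlink any pre-existing file when it is safe to do so, always create with O_EXCL, and tolerate losing the creation race. A small userspace analogue of the same idea, with a placeholder filename that is not taken from the patch:

/* Illustrative analogue of the unlink-then-O_EXCL pattern used above;
 * "core.example" is a placeholder name. */
#include <stdio.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

static int create_exclusive(const char *path)
{
        /* Remove a stale file; if it does not exist that is fine, and any
         * other problem will surface at the open() below. */
        unlink(path);

        /* O_EXCL means we never write into a file created by someone else
         * between the unlink() and the open(); losing that race is accepted. */
        int fd = open(path, O_CREAT | O_EXCL | O_WRONLY | O_NOFOLLOW, 0600);

        if (fd < 0 && errno == EEXIST)
                fprintf(stderr, "another writer won the race for %s\n", path);
        return fd;
}

int main(void)
{
        int fd = create_exclusive("core.example");

        if (fd >= 0)
                close(fd);
        return fd >= 0 ? 0 : 1;
}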
index a7f77e1fa18c25e62e8de5f809389d041e59266e..93bf2f990ace462b31dba2ee239084e112b1dfd0 100644 (file)
--- a/fs/dax.c
+++ b/fs/dax.c
 #include <linux/atomic.h>
 #include <linux/blkdev.h>
 #include <linux/buffer_head.h>
+#include <linux/dax.h>
 #include <linux/fs.h>
 #include <linux/genhd.h>
 #include <linux/highmem.h>
 #include <linux/memcontrol.h>
 #include <linux/mm.h>
 #include <linux/mutex.h>
+#include <linux/pmem.h>
 #include <linux/sched.h>
 #include <linux/uio.h>
 #include <linux/vmstat.h>
@@ -34,7 +36,7 @@ int dax_clear_blocks(struct inode *inode, sector_t block, long size)
 
        might_sleep();
        do {
-               void *addr;
+               void __pmem *addr;
                unsigned long pfn;
                long count;
 
@@ -46,10 +48,7 @@ int dax_clear_blocks(struct inode *inode, sector_t block, long size)
                        unsigned pgsz = PAGE_SIZE - offset_in_page(addr);
                        if (pgsz > count)
                                pgsz = count;
-                       if (pgsz < PAGE_SIZE)
-                               memset(addr, 0, pgsz);
-                       else
-                               clear_page(addr);
+                       clear_pmem(addr, pgsz);
                        addr += pgsz;
                        size -= pgsz;
                        count -= pgsz;
@@ -59,26 +58,29 @@ int dax_clear_blocks(struct inode *inode, sector_t block, long size)
                }
        } while (size);
 
+       wmb_pmem();
        return 0;
 }
 EXPORT_SYMBOL_GPL(dax_clear_blocks);
 
-static long dax_get_addr(struct buffer_head *bh, void **addr, unsigned blkbits)
+static long dax_get_addr(struct buffer_head *bh, void __pmem **addr,
+               unsigned blkbits)
 {
        unsigned long pfn;
        sector_t sector = bh->b_blocknr << (blkbits - 9);
        return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size);
 }
 
-static void dax_new_buf(void *addr, unsigned size, unsigned first, loff_t pos,
-                       loff_t end)
+/* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */
+static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first,
+               loff_t pos, loff_t end)
 {
        loff_t final = end - pos + first; /* The final byte of the buffer */
 
        if (first > 0)
-               memset(addr, 0, first);
+               clear_pmem(addr, first);
        if (final < size)
-               memset(addr + final, 0, size - final);
+               clear_pmem(addr + final, size - final);
 }
 
 static bool buffer_written(struct buffer_head *bh)
@@ -106,14 +108,15 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
        loff_t pos = start;
        loff_t max = start;
        loff_t bh_max = start;
-       void *addr;
+       void __pmem *addr;
        bool hole = false;
+       bool need_wmb = false;
 
        if (iov_iter_rw(iter) != WRITE)
                end = min(end, i_size_read(inode));
 
        while (pos < end) {
-               unsigned len;
+               size_t len;
                if (pos == max) {
                        unsigned blkbits = inode->i_blkbits;
                        sector_t block = pos >> blkbits;
@@ -145,19 +148,23 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
                                retval = dax_get_addr(bh, &addr, blkbits);
                                if (retval < 0)
                                        break;
-                               if (buffer_unwritten(bh) || buffer_new(bh))
+                               if (buffer_unwritten(bh) || buffer_new(bh)) {
                                        dax_new_buf(addr, retval, first, pos,
                                                                        end);
+                                       need_wmb = true;
+                               }
                                addr += first;
                                size = retval - first;
                        }
                        max = min(pos + size, end);
                }
 
-               if (iov_iter_rw(iter) == WRITE)
-                       len = copy_from_iter_nocache(addr, max - pos, iter);
-               else if (!hole)
-                       len = copy_to_iter(addr, max - pos, iter);
+               if (iov_iter_rw(iter) == WRITE) {
+                       len = copy_from_iter_pmem(addr, max - pos, iter);
+                       need_wmb = true;
+               } else if (!hole)
+                       len = copy_to_iter((void __force *)addr, max - pos,
+                                       iter);
                else
                        len = iov_iter_zero(max - pos, iter);
 
@@ -168,6 +175,9 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
                addr += len;
        }
 
+       if (need_wmb)
+               wmb_pmem();
+
        return (pos == start) ? retval : pos - start;
 }
 
@@ -260,11 +270,13 @@ static int dax_load_hole(struct address_space *mapping, struct page *page,
 static int copy_user_bh(struct page *to, struct buffer_head *bh,
                        unsigned blkbits, unsigned long vaddr)
 {
-       void *vfrom, *vto;
+       void __pmem *vfrom;
+       void *vto;
+
        if (dax_get_addr(bh, &vfrom, blkbits) < 0)
                return -EIO;
        vto = kmap_atomic(to);
-       copy_user_page(vto, vfrom, vaddr, to);
+       copy_user_page(vto, (void __force *)vfrom, vaddr, to);
        kunmap_atomic(vto);
        return 0;
 }
@@ -272,16 +284,13 @@ static int copy_user_bh(struct page *to, struct buffer_head *bh,
 static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
                        struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-       struct address_space *mapping = inode->i_mapping;
        sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
        unsigned long vaddr = (unsigned long)vmf->virtual_address;
-       void *addr;
+       void __pmem *addr;
        unsigned long pfn;
        pgoff_t size;
        int error;
 
-       i_mmap_lock_read(mapping);
-
        /*
         * Check truncate didn't happen while we were allocating a block.
         * If it did, this block may or may not be still allocated to the
@@ -303,14 +312,14 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
                goto out;
        }
 
-       if (buffer_unwritten(bh) || buffer_new(bh))
-               clear_page(addr);
+       if (buffer_unwritten(bh) || buffer_new(bh)) {
+               clear_pmem(addr, PAGE_SIZE);
+               wmb_pmem();
+       }
 
        error = vm_insert_mixed(vma, vaddr, pfn);
 
  out:
-       i_mmap_unlock_read(mapping);
-
        return error;
 }
 
@@ -372,15 +381,17 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
                         * from a read fault and we've raced with a truncate
                         */
                        error = -EIO;
-                       goto unlock_page;
+                       goto unlock;
                }
+       } else {
+               i_mmap_lock_write(mapping);
        }
 
        error = get_block(inode, block, &bh, 0);
        if (!error && (bh.b_size < PAGE_SIZE))
                error = -EIO;           /* fs corruption? */
        if (error)
-               goto unlock_page;
+               goto unlock;
 
        if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
                if (vmf->flags & FAULT_FLAG_WRITE) {
@@ -391,8 +402,9 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
                        if (!error && (bh.b_size < PAGE_SIZE))
                                error = -EIO;
                        if (error)
-                               goto unlock_page;
+                               goto unlock;
                } else {
+                       i_mmap_unlock_write(mapping);
                        return dax_load_hole(mapping, page, vmf);
                }
        }
@@ -404,17 +416,15 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
                else
                        clear_user_highpage(new_page, vaddr);
                if (error)
-                       goto unlock_page;
+                       goto unlock;
                vmf->page = page;
                if (!page) {
-                       i_mmap_lock_read(mapping);
                        /* Check we didn't race with truncate */
                        size = (i_size_read(inode) + PAGE_SIZE - 1) >>
                                                                PAGE_SHIFT;
                        if (vmf->pgoff >= size) {
-                               i_mmap_unlock_read(mapping);
                                error = -EIO;
-                               goto out;
+                               goto unlock;
                        }
                }
                return VM_FAULT_LOCKED;
@@ -450,6 +460,8 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
                        WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
        }
 
+       if (!page)
+               i_mmap_unlock_write(mapping);
  out:
        if (error == -ENOMEM)
                return VM_FAULT_OOM | major;
@@ -458,11 +470,14 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
                return VM_FAULT_SIGBUS | major;
        return VM_FAULT_NOPAGE | major;
 
- unlock_page:
+ unlock:
        if (page) {
                unlock_page(page);
                page_cache_release(page);
+       } else {
+               i_mmap_unlock_write(mapping);
        }
+
        goto out;
 }
 EXPORT_SYMBOL(__dax_fault);
@@ -494,6 +509,177 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 }
 EXPORT_SYMBOL_GPL(dax_fault);
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * The 'colour' (ie low bits) within a PMD of a page offset.  This comes up
+ * more often than one might expect in the below function.
+ */
+#define PG_PMD_COLOUR  ((PMD_SIZE >> PAGE_SHIFT) - 1)
+
+int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
+               pmd_t *pmd, unsigned int flags, get_block_t get_block,
+               dax_iodone_t complete_unwritten)
+{
+       struct file *file = vma->vm_file;
+       struct address_space *mapping = file->f_mapping;
+       struct inode *inode = mapping->host;
+       struct buffer_head bh;
+       unsigned blkbits = inode->i_blkbits;
+       unsigned long pmd_addr = address & PMD_MASK;
+       bool write = flags & FAULT_FLAG_WRITE;
+       long length;
+       void __pmem *kaddr;
+       pgoff_t size, pgoff;
+       sector_t block, sector;
+       unsigned long pfn;
+       int result = 0;
+
+       /* Fall back to PTEs if we're going to COW */
+       if (write && !(vma->vm_flags & VM_SHARED))
+               return VM_FAULT_FALLBACK;
+       /* If the PMD would extend outside the VMA */
+       if (pmd_addr < vma->vm_start)
+               return VM_FAULT_FALLBACK;
+       if ((pmd_addr + PMD_SIZE) > vma->vm_end)
+               return VM_FAULT_FALLBACK;
+
+       pgoff = linear_page_index(vma, pmd_addr);
+       size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+       if (pgoff >= size)
+               return VM_FAULT_SIGBUS;
+       /* If the PMD would cover blocks out of the file */
+       if ((pgoff | PG_PMD_COLOUR) >= size)
+               return VM_FAULT_FALLBACK;
+
+       memset(&bh, 0, sizeof(bh));
+       block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
+
+       bh.b_size = PMD_SIZE;
+       i_mmap_lock_write(mapping);
+       length = get_block(inode, block, &bh, write);
+       if (length)
+               return VM_FAULT_SIGBUS;
+
+       /*
+        * If the filesystem isn't willing to tell us the length of a hole,
+        * just fall back to PTEs.  Calling get_block 512 times in a loop
+        * would be silly.
+        */
+       if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE)
+               goto fallback;
+
+       if (buffer_unwritten(&bh) || buffer_new(&bh)) {
+               int i;
+               for (i = 0; i < PTRS_PER_PMD; i++)
+                       clear_pmem(kaddr + i * PAGE_SIZE, PAGE_SIZE);
+               wmb_pmem();
+               count_vm_event(PGMAJFAULT);
+               mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+               result |= VM_FAULT_MAJOR;
+       }
+
+       /*
+        * If we allocated new storage, make sure no process has any
+        * zero pages covering this hole
+        */
+       if (buffer_new(&bh)) {
+               i_mmap_unlock_write(mapping);
+               unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0);
+               i_mmap_lock_write(mapping);
+       }
+
+       /*
+        * If a truncate happened while we were allocating blocks, we may
+        * leave blocks allocated to the file that are beyond EOF.  We can't
+        * take i_mutex here, so just leave them hanging; they'll be freed
+        * when the file is deleted.
+        */
+       size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+       if (pgoff >= size) {
+               result = VM_FAULT_SIGBUS;
+               goto out;
+       }
+       if ((pgoff | PG_PMD_COLOUR) >= size)
+               goto fallback;
+
+       if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
+               spinlock_t *ptl;
+               pmd_t entry;
+               struct page *zero_page = get_huge_zero_page();
+
+               if (unlikely(!zero_page))
+                       goto fallback;
+
+               ptl = pmd_lock(vma->vm_mm, pmd);
+               if (!pmd_none(*pmd)) {
+                       spin_unlock(ptl);
+                       goto fallback;
+               }
+
+               entry = mk_pmd(zero_page, vma->vm_page_prot);
+               entry = pmd_mkhuge(entry);
+               set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
+               result = VM_FAULT_NOPAGE;
+               spin_unlock(ptl);
+       } else {
+               sector = bh.b_blocknr << (blkbits - 9);
+               length = bdev_direct_access(bh.b_bdev, sector, &kaddr, &pfn,
+                                               bh.b_size);
+               if (length < 0) {
+                       result = VM_FAULT_SIGBUS;
+                       goto out;
+               }
+               if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR))
+                       goto fallback;
+
+               result |= vmf_insert_pfn_pmd(vma, address, pmd, pfn, write);
+       }
+
+ out:
+       if (buffer_unwritten(&bh))
+               complete_unwritten(&bh, !(result & VM_FAULT_ERROR));
+
+       i_mmap_unlock_write(mapping);
+
+       return result;
+
+ fallback:
+       count_vm_event(THP_FAULT_FALLBACK);
+       result = VM_FAULT_FALLBACK;
+       goto out;
+}
+EXPORT_SYMBOL_GPL(__dax_pmd_fault);
+
+/**
+ * dax_pmd_fault - handle a PMD fault on a DAX file
+ * @vma: The virtual memory area where the fault occurred
+ * @vmf: The description of the fault
+ * @get_block: The filesystem method used to translate file offsets to blocks
+ *
+ * When a page fault occurs, filesystems may call this helper in their
+ * pmd_fault handler for DAX files.
+ */
+int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
+                       pmd_t *pmd, unsigned int flags, get_block_t get_block,
+                       dax_iodone_t complete_unwritten)
+{
+       int result;
+       struct super_block *sb = file_inode(vma->vm_file)->i_sb;
+
+       if (flags & FAULT_FLAG_WRITE) {
+               sb_start_pagefault(sb);
+               file_update_time(vma->vm_file);
+       }
+       result = __dax_pmd_fault(vma, address, pmd, flags, get_block,
+                               complete_unwritten);
+       if (flags & FAULT_FLAG_WRITE)
+               sb_end_pagefault(sb);
+
+       return result;
+}
+EXPORT_SYMBOL_GPL(dax_pmd_fault);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
 /**
  * dax_pfn_mkwrite - handle first write to DAX page
  * @vma: The virtual memory area where the fault occurred
@@ -548,11 +734,12 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
        if (err < 0)
                return err;
        if (buffer_written(&bh)) {
-               void *addr;
+               void __pmem *addr;
                err = dax_get_addr(&bh, &addr, inode->i_blkbits);
                if (err < 0)
                        return err;
-               memset(addr + offset, 0, length);
+               clear_pmem(addr + offset, length);
+               wmb_pmem();
        }
 
        return 0;
index 284f9aa0028b8dd46b9897ababc5e826c185c608..6c55ade071c39d733adc90085544b25901ebde4d 100644 (file)
@@ -435,8 +435,8 @@ struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode,
 }
 EXPORT_SYMBOL_GPL(debugfs_create_atomic_t);
 
-static ssize_t read_file_bool(struct file *file, char __user *user_buf,
-                             size_t count, loff_t *ppos)
+ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf,
+                              size_t count, loff_t *ppos)
 {
        char buf[3];
        u32 *val = file->private_data;
@@ -449,9 +449,10 @@ static ssize_t read_file_bool(struct file *file, char __user *user_buf,
        buf[2] = 0x00;
        return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
 }
+EXPORT_SYMBOL_GPL(debugfs_read_file_bool);
 
-static ssize_t write_file_bool(struct file *file, const char __user *user_buf,
-                              size_t count, loff_t *ppos)
+ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf,
+                               size_t count, loff_t *ppos)
 {
        char buf[32];
        size_t buf_size;
@@ -468,10 +469,11 @@ static ssize_t write_file_bool(struct file *file, const char __user *user_buf,
 
        return count;
 }
+EXPORT_SYMBOL_GPL(debugfs_write_file_bool);
 
 static const struct file_operations fops_bool = {
-       .read =         read_file_bool,
-       .write =        write_file_bool,
+       .read =         debugfs_read_file_bool,
+       .write =        debugfs_write_file_bool,
        .open =         simple_open,
        .llseek =       default_llseek,
 };
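With debugfs_read_file_bool()/debugfs_write_file_bool() exported, a driver can wrap them in its own file_operations instead of duplicating the parsing. A minimal sketch under that assumption; my_flag, my_write and the debugfs file name are illustrative, not taken from the patch:

/* Hypothetical driver-side wrapper around the newly exported helpers. */
#include <linux/debugfs.h>
#include <linux/fs.h>
#include <linux/module.h>
#include <linux/printk.h>

static u32 my_flag;     /* the bool helpers expect a u32 behind private_data */

static ssize_t my_write(struct file *file, const char __user *buf,
                        size_t count, loff_t *ppos)
{
        ssize_t ret = debugfs_write_file_bool(file, buf, count, ppos);

        if (ret > 0)
                pr_info("my_flag is now %u\n", my_flag);
        return ret;
}

static const struct file_operations my_flag_fops = {
        .read   = debugfs_read_file_bool,
        .write  = my_write,
        .open   = simple_open,
        .llseek = default_llseek,
};

/* Registered e.g. from module init with:
 *   debugfs_create_file("my_flag", 0644, NULL, &my_flag, &my_flag_fops);
 */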
index 97315f2f68164f45f41176abeb1d0317467280f4..80d6901493cf5e0867cd2572885cc16f66ff59c5 100644 (file)
@@ -258,8 +258,7 @@ void ecryptfs_destroy_mount_crypt_stat(
                                 &mount_crypt_stat->global_auth_tok_list,
                                 mount_crypt_stat_list) {
                list_del(&auth_tok->mount_crypt_stat_list);
-               if (auth_tok->global_auth_tok_key
-                   && !(auth_tok->flags & ECRYPTFS_AUTH_TOK_INVALID))
+               if (!(auth_tok->flags & ECRYPTFS_AUTH_TOK_INVALID))
                        key_put(auth_tok->global_auth_tok_key);
                kmem_cache_free(ecryptfs_global_auth_tok_cache, auth_tok);
        }
index 8db0b464483f9b028f78e779a8337c3d33ca1c64..63cd2c147221aae7c83d2f8992994c625fdcea67 100644 (file)
 static int ecryptfs_d_revalidate(struct dentry *dentry, unsigned int flags)
 {
        struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
-       int rc;
-
-       if (!(lower_dentry->d_flags & DCACHE_OP_REVALIDATE))
-               return 1;
+       int rc = 1;
 
        if (flags & LOOKUP_RCU)
                return -ECHILD;
 
-       rc = lower_dentry->d_op->d_revalidate(lower_dentry, flags);
+       if (lower_dentry->d_flags & DCACHE_OP_REVALIDATE)
+               rc = lower_dentry->d_op->d_revalidate(lower_dentry, flags);
+
        if (d_really_is_positive(dentry)) {
-               struct inode *lower_inode =
-                       ecryptfs_inode_to_lower(d_inode(dentry));
+               struct inode *inode = d_inode(dentry);
 
-               fsstack_copy_attr_all(d_inode(dentry), lower_inode);
+               fsstack_copy_attr_all(inode, ecryptfs_inode_to_lower(inode));
+               if (!inode->i_nlink)
+                       return 0;
        }
        return rc;
 }
index 3b57c9f83c9b9b6469b014a317b0e50c1b1f1cf4..1982c3f11aec421f871c57a701f78fc3b6488d8d 100644 (file)
@@ -20,6 +20,7 @@
 
 #include <linux/time.h>
 #include <linux/pagemap.h>
+#include <linux/dax.h>
 #include <linux/quotaops.h>
 #include "ext2.h"
 #include "xattr.h"
@@ -31,6 +32,12 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        return dax_fault(vma, vmf, ext2_get_block, NULL);
 }
 
+static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
+                                               pmd_t *pmd, unsigned int flags)
+{
+       return dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block, NULL);
+}
+
 static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
        return dax_mkwrite(vma, vmf, ext2_get_block, NULL);
@@ -38,6 +45,7 @@ static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 
 static const struct vm_operations_struct ext2_dax_vm_ops = {
        .fault          = ext2_dax_fault,
+       .pmd_fault      = ext2_dax_pmd_fault,
        .page_mkwrite   = ext2_dax_mkwrite,
        .pfn_mkwrite    = dax_pfn_mkwrite,
 };
@@ -49,7 +57,7 @@ static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
 
        file_accessed(file);
        vma->vm_ops = &ext2_dax_vm_ops;
-       vma->vm_flags |= VM_MIXEDMAP;
+       vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
        return 0;
 }
 #else
index a3a404c5df2edf3a9196333f248e53add270d643..c60a248c640cb9f23494ed3442ad161639922e7b 100644 (file)
@@ -25,6 +25,7 @@
 #include <linux/time.h>
 #include <linux/highuid.h>
 #include <linux/pagemap.h>
+#include <linux/dax.h>
 #include <linux/quotaops.h>
 #include <linux/writeback.h>
 #include <linux/buffer_head.h>
index 32071f5c1c2623b4ef242855941e3cd483429a59..fd1f28be529690898c67b5d93ef3ed69bb1e0e78 100644 (file)
@@ -2272,6 +2272,8 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
 struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
 int ext4_get_block_write(struct inode *inode, sector_t iblock,
                         struct buffer_head *bh_result, int create);
+int ext4_get_block_dax(struct inode *inode, sector_t iblock,
+                        struct buffer_head *bh_result, int create);
 int ext4_get_block(struct inode *inode, sector_t iblock,
                                struct buffer_head *bh_result, int create);
 int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
index bc313ac5d3fa024a96574549690eb40eb35612db..113837e7ba98d5cf866ee365a40a89d7fc781a71 100644 (file)
@@ -22,6 +22,7 @@
 #include <linux/fs.h>
 #include <linux/mount.h>
 #include <linux/path.h>
+#include <linux/dax.h>
 #include <linux/quotaops.h>
 #include <linux/pagevec.h>
 #include <linux/uio.h>
@@ -195,7 +196,7 @@ out:
 static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
 {
        struct inode *inode = bh->b_assoc_map->host;
-       /* XXX: breaks on 32-bit > 16GB. Is that even supported? */
+       /* XXX: breaks on 32-bit > 16TB. Is that even supported? */
        loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
        int err;
        if (!uptodate)
@@ -206,17 +207,74 @@ static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
 
 static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-       return dax_fault(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
-                                       /* Is this the right get_block? */
+       int result;
+       handle_t *handle = NULL;
+       struct super_block *sb = file_inode(vma->vm_file)->i_sb;
+       bool write = vmf->flags & FAULT_FLAG_WRITE;
+
+       if (write) {
+               sb_start_pagefault(sb);
+               file_update_time(vma->vm_file);
+               handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
+                                               EXT4_DATA_TRANS_BLOCKS(sb));
+       }
+
+       if (IS_ERR(handle))
+               result = VM_FAULT_SIGBUS;
+       else
+               result = __dax_fault(vma, vmf, ext4_get_block_dax,
+                                               ext4_end_io_unwritten);
+
+       if (write) {
+               if (!IS_ERR(handle))
+                       ext4_journal_stop(handle);
+               sb_end_pagefault(sb);
+       }
+
+       return result;
+}
+
+static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
+                                               pmd_t *pmd, unsigned int flags)
+{
+       int result;
+       handle_t *handle = NULL;
+       struct inode *inode = file_inode(vma->vm_file);
+       struct super_block *sb = inode->i_sb;
+       bool write = flags & FAULT_FLAG_WRITE;
+
+       if (write) {
+               sb_start_pagefault(sb);
+               file_update_time(vma->vm_file);
+               handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
+                               ext4_chunk_trans_blocks(inode,
+                                                       PMD_SIZE / PAGE_SIZE));
+       }
+
+       if (IS_ERR(handle))
+               result = VM_FAULT_SIGBUS;
+       else
+               result = __dax_pmd_fault(vma, addr, pmd, flags,
+                               ext4_get_block_dax, ext4_end_io_unwritten);
+
+       if (write) {
+               if (!IS_ERR(handle))
+                       ext4_journal_stop(handle);
+               sb_end_pagefault(sb);
+       }
+
+       return result;
 }
 
 static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-       return dax_mkwrite(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
+       return dax_mkwrite(vma, vmf, ext4_get_block_dax,
+                               ext4_end_io_unwritten);
 }
 
 static const struct vm_operations_struct ext4_dax_vm_ops = {
        .fault          = ext4_dax_fault,
+       .pmd_fault      = ext4_dax_pmd_fault,
        .page_mkwrite   = ext4_dax_mkwrite,
        .pfn_mkwrite    = dax_pfn_mkwrite,
 };
@@ -244,7 +302,7 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
        file_accessed(file);
        if (IS_DAX(file_inode(file))) {
                vma->vm_ops = &ext4_dax_vm_ops;
-               vma->vm_flags |= VM_MIXEDMAP;
+               vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
        } else {
                vma->vm_ops = &ext4_file_vm_ops;
        }
index 4f6ac499f09e7cd4eedde3887674e074cc9868e4..2468261748b2c53a7ee5ebafadc388a8bb2835ad 100644 (file)
@@ -22,6 +22,7 @@
 
 #include "ext4_jbd2.h"
 #include "truncate.h"
+#include <linux/dax.h>
 #include <linux/uio.h>
 
 #include <trace/events/ext4.h>
index 29f1af7c2cab9e97b514b9ea4459f36ec8e7415b..612fbcf76b5c4820ad6a18d7723f7e65db803d18 100644 (file)
@@ -22,6 +22,7 @@
 #include <linux/time.h>
 #include <linux/highuid.h>
 #include <linux/pagemap.h>
+#include <linux/dax.h>
 #include <linux/quotaops.h>
 #include <linux/string.h>
 #include <linux/buffer_head.h>
@@ -3020,6 +3021,17 @@ static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
                               EXT4_GET_BLOCKS_NO_LOCK);
 }
 
+int ext4_get_block_dax(struct inode *inode, sector_t iblock,
+                  struct buffer_head *bh_result, int create)
+{
+       int flags = EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_UNWRIT_EXT;
+       if (create)
+               flags |= EXT4_GET_BLOCKS_CREATE;
+       ext4_debug("ext4_get_block_dax: inode %lu, create flag %d\n",
+                  inode->i_ino, create);
+       return _ext4_get_block(inode, iblock, bh_result, flags);
+}
+
 static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
                            ssize_t size, void *private)
 {
index ae0f438c2ee68f59e4f8f4ebc17aeaae9c250ff9..587ac08eabb62bcd47c0a90309e0442262eafd39 100644 (file)
@@ -53,8 +53,6 @@ struct wb_writeback_work {
        unsigned int for_background:1;
        unsigned int for_sync:1;        /* sync(2) WB_SYNC_ALL writeback */
        unsigned int auto_free:1;       /* free on completion */
-       unsigned int single_wait:1;
-       unsigned int single_done:1;
        enum wb_reason reason;          /* why was writeback initiated? */
 
        struct list_head list;          /* pending work list */
@@ -178,14 +176,11 @@ static void wb_wakeup(struct bdi_writeback *wb)
 static void wb_queue_work(struct bdi_writeback *wb,
                          struct wb_writeback_work *work)
 {
-       trace_writeback_queue(wb->bdi, work);
+       trace_writeback_queue(wb, work);
 
        spin_lock_bh(&wb->work_lock);
-       if (!test_bit(WB_registered, &wb->state)) {
-               if (work->single_wait)
-                       work->single_done = 1;
+       if (!test_bit(WB_registered, &wb->state))
                goto out_unlock;
-       }
        if (work->done)
                atomic_inc(&work->done->cnt);
        list_add_tail(&work->list, &wb->work_list);
@@ -706,7 +701,7 @@ EXPORT_SYMBOL_GPL(wbc_account_io);
 
 /**
  * inode_congested - test whether an inode is congested
- * @inode: inode to test for congestion
+ * @inode: inode to test for congestion (may be NULL)
  * @cong_bits: mask of WB_[a]sync_congested bits to test
  *
  * Tests whether @inode is congested.  @cong_bits is the mask of congestion
@@ -716,6 +711,9 @@ EXPORT_SYMBOL_GPL(wbc_account_io);
  * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg
  * associated with @inode is congested; otherwise, the root wb's congestion
  * state is used.
+ *
+ * @inode is allowed to be NULL as this function is often called on
+ * mapping->host which is NULL for the swapper space.
  */
 int inode_congested(struct inode *inode, int cong_bits)
 {
@@ -737,32 +735,6 @@ int inode_congested(struct inode *inode, int cong_bits)
 }
 EXPORT_SYMBOL_GPL(inode_congested);
 
-/**
- * wb_wait_for_single_work - wait for completion of a single bdi_writeback_work
- * @bdi: bdi the work item was issued to
- * @work: work item to wait for
- *
- * Wait for the completion of @work which was issued to one of @bdi's
- * bdi_writeback's.  The caller must have set @work->single_wait before
- * issuing it.  This wait operates independently fo
- * wb_wait_for_completion() and also disables automatic freeing of @work.
- */
-static void wb_wait_for_single_work(struct backing_dev_info *bdi,
-                                   struct wb_writeback_work *work)
-{
-       if (WARN_ON_ONCE(!work->single_wait))
-               return;
-
-       wait_event(bdi->wb_waitq, work->single_done);
-
-       /*
-        * Paired with smp_wmb() in wb_do_writeback() and ensures that all
-        * modifications to @work prior to assertion of ->single_done is
-        * visible to the caller once this function returns.
-        */
-       smp_rmb();
-}
-
 /**
  * wb_split_bdi_pages - split nr_pages to write according to bandwidth
  * @wb: target bdi_writeback to split @nr_pages to
@@ -791,38 +763,6 @@ static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
                return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw);
 }
 
-/**
- * wb_clone_and_queue_work - clone a wb_writeback_work and issue it to a wb
- * @wb: target bdi_writeback
- * @base_work: source wb_writeback_work
- *
- * Try to make a clone of @base_work and issue it to @wb.  If cloning
- * succeeds, %true is returned; otherwise, @base_work is issued directly
- * and %false is returned.  In the latter case, the caller is required to
- * wait for @base_work's completion using wb_wait_for_single_work().
- *
- * A clone is auto-freed on completion.  @base_work never is.
- */
-static bool wb_clone_and_queue_work(struct bdi_writeback *wb,
-                                   struct wb_writeback_work *base_work)
-{
-       struct wb_writeback_work *work;
-
-       work = kmalloc(sizeof(*work), GFP_ATOMIC);
-       if (work) {
-               *work = *base_work;
-               work->auto_free = 1;
-               work->single_wait = 0;
-       } else {
-               work = base_work;
-               work->auto_free = 0;
-               work->single_wait = 1;
-       }
-       work->single_done = 0;
-       wb_queue_work(wb, work);
-       return work != base_work;
-}
-
 /**
  * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi
  * @bdi: target backing_dev_info
@@ -838,15 +778,19 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
                                  struct wb_writeback_work *base_work,
                                  bool skip_if_busy)
 {
-       long nr_pages = base_work->nr_pages;
-       int next_blkcg_id = 0;
+       int next_memcg_id = 0;
        struct bdi_writeback *wb;
        struct wb_iter iter;
 
        might_sleep();
 restart:
        rcu_read_lock();
-       bdi_for_each_wb(wb, bdi, &iter, next_blkcg_id) {
+       bdi_for_each_wb(wb, bdi, &iter, next_memcg_id) {
+               DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done);
+               struct wb_writeback_work fallback_work;
+               struct wb_writeback_work *work;
+               long nr_pages;
+
                /* SYNC_ALL writes out I_DIRTY_TIME too */
                if (!wb_has_dirty_io(wb) &&
                    (base_work->sync_mode == WB_SYNC_NONE ||
@@ -855,13 +799,30 @@ restart:
                if (skip_if_busy && writeback_in_progress(wb))
                        continue;
 
-               base_work->nr_pages = wb_split_bdi_pages(wb, nr_pages);
-               if (!wb_clone_and_queue_work(wb, base_work)) {
-                       next_blkcg_id = wb->blkcg_css->id + 1;
-                       rcu_read_unlock();
-                       wb_wait_for_single_work(bdi, base_work);
-                       goto restart;
+               nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages);
+
+               work = kmalloc(sizeof(*work), GFP_ATOMIC);
+               if (work) {
+                       *work = *base_work;
+                       work->nr_pages = nr_pages;
+                       work->auto_free = 1;
+                       wb_queue_work(wb, work);
+                       continue;
                }
+
+               /* alloc failed, execute synchronously using on-stack fallback */
+               work = &fallback_work;
+               *work = *base_work;
+               work->nr_pages = nr_pages;
+               work->auto_free = 0;
+               work->done = &fallback_work_done;
+
+               wb_queue_work(wb, work);
+
+               next_memcg_id = wb->memcg_css->id + 1;
+               rcu_read_unlock();
+               wb_wait_for_completion(bdi, &fallback_work_done);
+               goto restart;
        }
        rcu_read_unlock();
 }
@@ -902,8 +863,6 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
 
        if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) {
                base_work->auto_free = 0;
-               base_work->single_wait = 0;
-               base_work->single_done = 0;
                wb_queue_work(&bdi->wb, base_work);
        }
 }
@@ -924,7 +883,7 @@ void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
         */
        work = kzalloc(sizeof(*work), GFP_ATOMIC);
        if (!work) {
-               trace_writeback_nowork(wb->bdi);
+               trace_writeback_nowork(wb);
                wb_wakeup(wb);
                return;
        }
@@ -954,7 +913,7 @@ void wb_start_background_writeback(struct bdi_writeback *wb)
         * We just wake up the flusher thread. It will perform background
         * writeback as soon as there is no other work to do.
         */
-       trace_writeback_wake_background(wb->bdi);
+       trace_writeback_wake_background(wb);
        wb_wakeup(wb);
 }
 
@@ -1421,6 +1380,10 @@ static long writeback_chunk_size(struct bdi_writeback *wb,
  * Write a portion of b_io inodes which belong to @sb.
  *
  * Return the number of pages and/or inodes written.
+ *
+ * NOTE! This is called with wb->list_lock held, and will
+ * unlock and relock that for each inode it ends up doing
+ * IO for.
  */
 static long writeback_sb_inodes(struct super_block *sb,
                                struct bdi_writeback *wb,
@@ -1439,9 +1402,7 @@ static long writeback_sb_inodes(struct super_block *sb,
        unsigned long start_time = jiffies;
        long write_chunk;
        long wrote = 0;  /* count both pages and inodes */
-       struct blk_plug plug;
 
-       blk_start_plug(&plug);
        while (!list_empty(&wb->b_io)) {
                struct inode *inode = wb_inode(wb->b_io.prev);
 
@@ -1539,7 +1500,6 @@ static long writeback_sb_inodes(struct super_block *sb,
                                break;
                }
        }
-       blk_finish_plug(&plug);
        return wrote;
 }
 
@@ -1586,12 +1546,15 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
                .range_cyclic   = 1,
                .reason         = reason,
        };
+       struct blk_plug plug;
 
+       blk_start_plug(&plug);
        spin_lock(&wb->list_lock);
        if (list_empty(&wb->b_io))
                queue_io(wb, &work);
        __writeback_inodes_wb(wb, &work);
        spin_unlock(&wb->list_lock);
+       blk_finish_plug(&plug);
 
        return nr_pages - work.nr_pages;
 }
@@ -1619,10 +1582,12 @@ static long wb_writeback(struct bdi_writeback *wb,
        unsigned long oldest_jif;
        struct inode *inode;
        long progress;
+       struct blk_plug plug;
 
        oldest_jif = jiffies;
        work->older_than_this = &oldest_jif;
 
+       blk_start_plug(&plug);
        spin_lock(&wb->list_lock);
        for (;;) {
                /*
@@ -1660,14 +1625,14 @@ static long wb_writeback(struct bdi_writeback *wb,
                } else if (work->for_background)
                        oldest_jif = jiffies;
 
-               trace_writeback_start(wb->bdi, work);
+               trace_writeback_start(wb, work);
                if (list_empty(&wb->b_io))
                        queue_io(wb, work);
                if (work->sb)
                        progress = writeback_sb_inodes(work->sb, wb, work);
                else
                        progress = __writeback_inodes_wb(wb, work);
-               trace_writeback_written(wb->bdi, work);
+               trace_writeback_written(wb, work);
 
                wb_update_bandwidth(wb, wb_start);
 
@@ -1692,7 +1657,7 @@ static long wb_writeback(struct bdi_writeback *wb,
                 * we'll just busyloop.
                 */
                if (!list_empty(&wb->b_more_io))  {
-                       trace_writeback_wait(wb->bdi, work);
+                       trace_writeback_wait(wb, work);
                        inode = wb_inode(wb->b_more_io.prev);
                        spin_lock(&inode->i_lock);
                        spin_unlock(&wb->list_lock);
@@ -1702,6 +1667,7 @@ static long wb_writeback(struct bdi_writeback *wb,
                }
        }
        spin_unlock(&wb->list_lock);
+       blk_finish_plug(&plug);
 
        return nr_pages - work->nr_pages;
 }
@@ -1797,26 +1763,14 @@ static long wb_do_writeback(struct bdi_writeback *wb)
        set_bit(WB_writeback_running, &wb->state);
        while ((work = get_next_work_item(wb)) != NULL) {
                struct wb_completion *done = work->done;
-               bool need_wake_up = false;
 
-               trace_writeback_exec(wb->bdi, work);
+               trace_writeback_exec(wb, work);
 
                wrote += wb_writeback(wb, work);
 
-               if (work->single_wait) {
-                       WARN_ON_ONCE(work->auto_free);
-                       /* paired w/ rmb in wb_wait_for_single_work() */
-                       smp_wmb();
-                       work->single_done = 1;
-                       need_wake_up = true;
-               } else if (work->auto_free) {
+               if (work->auto_free)
                        kfree(work);
-               }
-
                if (done && atomic_dec_and_test(&done->cnt))
-                       need_wake_up = true;
-
-               if (need_wake_up)
                        wake_up_all(&wb->bdi->wb_waitq);
        }
 
index a38e38f7b6fc37ae2e2b9ca5af0e59d7c5652856..9bd1244caf38d42c80425d5c1a1a09b0edf22bbb 100644 (file)
@@ -34,6 +34,7 @@
 #include <linux/percpu.h>
 #include <linux/list_sort.h>
 #include <linux/lockref.h>
+#include <linux/rhashtable.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -50,9 +51,8 @@
 #include "trace_gfs2.h"
 
 struct gfs2_glock_iter {
-       int hash;                       /* hash bucket index           */
-       unsigned nhash;                 /* Index within current bucket */
        struct gfs2_sbd *sdp;           /* incore superblock           */
+       struct rhashtable_iter hti;     /* rhashtable iterator         */
        struct gfs2_glock *gl;          /* current glock struct        */
        loff_t last_pos;                /* last position               */
 };
@@ -70,44 +70,19 @@ static DEFINE_SPINLOCK(lru_lock);
 
 #define GFS2_GL_HASH_SHIFT      15
 #define GFS2_GL_HASH_SIZE       (1 << GFS2_GL_HASH_SHIFT)
-#define GFS2_GL_HASH_MASK       (GFS2_GL_HASH_SIZE - 1)
 
-static struct hlist_bl_head gl_hash_table[GFS2_GL_HASH_SIZE];
-static struct dentry *gfs2_root;
-
-/**
- * gl_hash() - Turn glock number into hash bucket number
- * @lock: The glock number
- *
- * Returns: The number of the corresponding hash bucket
- */
-
-static unsigned int gl_hash(const struct gfs2_sbd *sdp,
-                           const struct lm_lockname *name)
-{
-       unsigned int h;
-
-       h = jhash(&name->ln_number, sizeof(u64), 0);
-       h = jhash(&name->ln_type, sizeof(unsigned int), h);
-       h = jhash(&sdp, sizeof(struct gfs2_sbd *), h);
-       h &= GFS2_GL_HASH_MASK;
-
-       return h;
-}
-
-static inline void spin_lock_bucket(unsigned int hash)
-{
-       hlist_bl_lock(&gl_hash_table[hash]);
-}
+static struct rhashtable_params ht_parms = {
+       .nelem_hint = GFS2_GL_HASH_SIZE * 3 / 4,
+       .key_len = sizeof(struct lm_lockname),
+       .key_offset = offsetof(struct gfs2_glock, gl_name),
+       .head_offset = offsetof(struct gfs2_glock, gl_node),
+};
 
-static inline void spin_unlock_bucket(unsigned int hash)
-{
-       hlist_bl_unlock(&gl_hash_table[hash]);
-}
+static struct rhashtable gl_hash_table;
 
-static void gfs2_glock_dealloc(struct rcu_head *rcu)
+void gfs2_glock_free(struct gfs2_glock *gl)
 {
-       struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu);
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
 
        if (gl->gl_ops->go_flags & GLOF_ASPACE) {
                kmem_cache_free(gfs2_glock_aspace_cachep, gl);
@@ -115,13 +90,6 @@ static void gfs2_glock_dealloc(struct rcu_head *rcu)
                kfree(gl->gl_lksb.sb_lvbptr);
                kmem_cache_free(gfs2_glock_cachep, gl);
        }
-}
-
-void gfs2_glock_free(struct gfs2_glock *gl)
-{
-       struct gfs2_sbd *sdp = gl->gl_sbd;
-
-       call_rcu(&gl->gl_rcu, gfs2_glock_dealloc);
        if (atomic_dec_and_test(&sdp->sd_glock_disposal))
                wake_up(&sdp->sd_glock_wait);
 }
@@ -192,7 +160,7 @@ static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
 
 void gfs2_glock_put(struct gfs2_glock *gl)
 {
-       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        struct address_space *mapping = gfs2_glock2aspace(gl);
 
        if (lockref_put_or_lock(&gl->gl_lockref))
@@ -202,42 +170,13 @@ void gfs2_glock_put(struct gfs2_glock *gl)
 
        gfs2_glock_remove_from_lru(gl);
        spin_unlock(&gl->gl_lockref.lock);
-       spin_lock_bucket(gl->gl_hash);
-       hlist_bl_del_rcu(&gl->gl_list);
-       spin_unlock_bucket(gl->gl_hash);
+       rhashtable_remove_fast(&gl_hash_table, &gl->gl_node, ht_parms);
        GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
        GLOCK_BUG_ON(gl, mapping && mapping->nrpages);
        trace_gfs2_glock_put(gl);
        sdp->sd_lockstruct.ls_ops->lm_put_lock(gl);
 }
 
-/**
- * search_bucket() - Find struct gfs2_glock by lock number
- * @bucket: the bucket to search
- * @name: The lock name
- *
- * Returns: NULL, or the struct gfs2_glock with the requested number
- */
-
-static struct gfs2_glock *search_bucket(unsigned int hash,
-                                       const struct gfs2_sbd *sdp,
-                                       const struct lm_lockname *name)
-{
-       struct gfs2_glock *gl;
-       struct hlist_bl_node *h;
-
-       hlist_bl_for_each_entry_rcu(gl, h, &gl_hash_table[hash], gl_list) {
-               if (!lm_name_equal(&gl->gl_name, name))
-                       continue;
-               if (gl->gl_sbd != sdp)
-                       continue;
-               if (lockref_get_not_dead(&gl->gl_lockref))
-                       return gl;
-       }
-
-       return NULL;
-}
-
 /**
  * may_grant - check if its ok to grant a new lock
  * @gl: The glock
@@ -506,7 +445,7 @@ __releases(&gl->gl_spin)
 __acquires(&gl->gl_spin)
 {
        const struct gfs2_glock_operations *glops = gl->gl_ops;
-       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        unsigned int lck_flags = gh ? gh->gh_flags : 0;
        int ret;
 
@@ -628,7 +567,7 @@ out_unlock:
 static void delete_work_func(struct work_struct *work)
 {
        struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete);
-       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        struct gfs2_inode *ip;
        struct inode *inode;
        u64 no_addr = gl->gl_name.ln_number;
@@ -704,15 +643,17 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
                   struct gfs2_glock **glp)
 {
        struct super_block *s = sdp->sd_vfs;
-       struct lm_lockname name = { .ln_number = number, .ln_type = glops->go_type };
-       struct gfs2_glock *gl, *tmp;
-       unsigned int hash = gl_hash(sdp, &name);
+       struct lm_lockname name = { .ln_number = number,
+                                   .ln_type = glops->go_type,
+                                   .ln_sbd = sdp };
+       struct gfs2_glock *gl, *tmp = NULL;
        struct address_space *mapping;
        struct kmem_cache *cachep;
+       int ret, tries = 0;
 
-       rcu_read_lock();
-       gl = search_bucket(hash, sdp, &name);
-       rcu_read_unlock();
+       gl = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms);
+       if (gl && !lockref_get_not_dead(&gl->gl_lockref))
+               gl = NULL;
 
        *glp = gl;
        if (gl)
@@ -739,14 +680,13 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
        }
 
        atomic_inc(&sdp->sd_glock_disposal);
-       gl->gl_sbd = sdp;
+       gl->gl_node.next = NULL;
        gl->gl_flags = 0;
        gl->gl_name = name;
        gl->gl_lockref.count = 1;
        gl->gl_state = LM_ST_UNLOCKED;
        gl->gl_target = LM_ST_UNLOCKED;
        gl->gl_demote_state = LM_ST_EXCLUSIVE;
-       gl->gl_hash = hash;
        gl->gl_ops = glops;
        gl->gl_dstamp = ktime_set(0, 0);
        preempt_disable();
@@ -771,22 +711,34 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
                mapping->writeback_index = 0;
        }
 
-       spin_lock_bucket(hash);
-       tmp = search_bucket(hash, sdp, &name);
-       if (tmp) {
-               spin_unlock_bucket(hash);
-               kfree(gl->gl_lksb.sb_lvbptr);
-               kmem_cache_free(cachep, gl);
-               atomic_dec(&sdp->sd_glock_disposal);
-               gl = tmp;
-       } else {
-               hlist_bl_add_head_rcu(&gl->gl_list, &gl_hash_table[hash]);
-               spin_unlock_bucket(hash);
+again:
+       ret = rhashtable_lookup_insert_fast(&gl_hash_table, &gl->gl_node,
+                                           ht_parms);
+       if (ret == 0) {
+               *glp = gl;
+               return 0;
        }
 
-       *glp = gl;
+       if (ret == -EEXIST) {
+               ret = 0;
+               tmp = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms);
+               if (tmp == NULL || !lockref_get_not_dead(&tmp->gl_lockref)) {
+                       if (++tries < 100) {
+                               cond_resched();
+                               goto again;
+                       }
+                       tmp = NULL;
+                       ret = -ENOMEM;
+               }
+       } else {
+               WARN_ON_ONCE(ret);
+       }
+       kfree(gl->gl_lksb.sb_lvbptr);
+       kmem_cache_free(cachep, gl);
+       atomic_dec(&sdp->sd_glock_disposal);
+       *glp = tmp;
 
-       return 0;
+       return ret;
 }
 
 /**
@@ -928,7 +880,7 @@ __releases(&gl->gl_spin)
 __acquires(&gl->gl_spin)
 {
        struct gfs2_glock *gl = gh->gh_gl;
-       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        struct list_head *insert_pt = NULL;
        struct gfs2_holder *gh2;
        int try_futile = 0;
@@ -1006,7 +958,7 @@ trap_recursive:
 int gfs2_glock_nq(struct gfs2_holder *gh)
 {
        struct gfs2_glock *gl = gh->gh_gl;
-       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        int error = 0;
 
        if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
@@ -1313,7 +1265,7 @@ static int gfs2_should_freeze(const struct gfs2_glock *gl)
 
 void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
 {
-       struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
+       struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct;
 
        spin_lock(&gl->gl_spin);
        gl->gl_reply = ret;
@@ -1462,31 +1414,26 @@ static struct shrinker glock_shrinker = {
  *
  */
 
-static void examine_bucket(glock_examiner examiner, const struct gfs2_sbd *sdp,
-                         unsigned int hash)
+static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp)
 {
        struct gfs2_glock *gl;
-       struct hlist_bl_head *head = &gl_hash_table[hash];
-       struct hlist_bl_node *pos;
+       struct rhash_head *pos, *next;
+       const struct bucket_table *tbl;
+       int i;
 
        rcu_read_lock();
-       hlist_bl_for_each_entry_rcu(gl, pos, head, gl_list) {
-               if ((gl->gl_sbd == sdp) && lockref_get_not_dead(&gl->gl_lockref))
-                       examiner(gl);
+       tbl = rht_dereference_rcu(gl_hash_table.tbl, &gl_hash_table);
+       for (i = 0; i < tbl->size; i++) {
+               rht_for_each_entry_safe(gl, pos, next, tbl, i, gl_node) {
+                       if ((gl->gl_name.ln_sbd == sdp) &&
+                           lockref_get_not_dead(&gl->gl_lockref))
+                               examiner(gl);
+               }
        }
        rcu_read_unlock();
        cond_resched();
 }
 
-static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp)
-{
-       unsigned x;
-
-       for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
-               examine_bucket(examiner, sdp, x);
-}
-
-
 /**
  * thaw_glock - thaw out a glock which has an unprocessed reply waiting
  * @gl: The glock to thaw
@@ -1569,7 +1516,7 @@ void gfs2_glock_finish_truncate(struct gfs2_inode *ip)
        int ret;
 
        ret = gfs2_truncatei_resume(ip);
-       gfs2_assert_withdraw(gl->gl_sbd, ret == 0);
+       gfs2_assert_withdraw(gl->gl_name.ln_sbd, ret == 0);
 
        spin_lock(&gl->gl_spin);
        clear_bit(GLF_LOCK, &gl->gl_flags);
@@ -1733,17 +1680,17 @@ static int gfs2_glstats_seq_show(struct seq_file *seq, void *iter_ptr)
 {
        struct gfs2_glock *gl = iter_ptr;
 
-       seq_printf(seq, "G: n:%u/%llx rtt:%lld/%lld rttb:%lld/%lld irt:%lld/%lld dcnt: %lld qcnt: %lld\n",
+       seq_printf(seq, "G: n:%u/%llx rtt:%llu/%llu rttb:%llu/%llu irt:%llu/%llu dcnt: %llu qcnt: %llu\n",
                   gl->gl_name.ln_type,
                   (unsigned long long)gl->gl_name.ln_number,
-                  (long long)gl->gl_stats.stats[GFS2_LKS_SRTT],
-                  (long long)gl->gl_stats.stats[GFS2_LKS_SRTTVAR],
-                  (long long)gl->gl_stats.stats[GFS2_LKS_SRTTB],
-                  (long long)gl->gl_stats.stats[GFS2_LKS_SRTTVARB],
-                  (long long)gl->gl_stats.stats[GFS2_LKS_SIRT],
-                  (long long)gl->gl_stats.stats[GFS2_LKS_SIRTVAR],
-                  (long long)gl->gl_stats.stats[GFS2_LKS_DCOUNT],
-                  (long long)gl->gl_stats.stats[GFS2_LKS_QCOUNT]);
+                  (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SRTT],
+                  (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SRTTVAR],
+                  (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SRTTB],
+                  (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SRTTVARB],
+                  (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SIRT],
+                  (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SIRTVAR],
+                  (unsigned long long)gl->gl_stats.stats[GFS2_LKS_DCOUNT],
+                  (unsigned long long)gl->gl_stats.stats[GFS2_LKS_QCOUNT]);
        return 0;
 }
 
@@ -1776,11 +1723,10 @@ static const char *gfs2_stype[] = {
 
 static int gfs2_sbstats_seq_show(struct seq_file *seq, void *iter_ptr)
 {
-       struct gfs2_glock_iter *gi = seq->private;
-       struct gfs2_sbd *sdp = gi->sdp;
-       unsigned index = gi->hash >> 3;
-       unsigned subindex = gi->hash & 0x07;
-       s64 value;
+       struct gfs2_sbd *sdp = seq->private;
+       loff_t pos = *(loff_t *)iter_ptr;
+       unsigned index = pos >> 3;
+       unsigned subindex = pos & 0x07;
        int i;
 
        if (index == 0 && subindex != 0)
@@ -1791,12 +1737,12 @@ static int gfs2_sbstats_seq_show(struct seq_file *seq, void *iter_ptr)
 
        for_each_possible_cpu(i) {
                 const struct gfs2_pcpu_lkstats *lkstats = per_cpu_ptr(sdp->sd_lkstats, i);
-               if (index == 0) {
-                       value = i;
-               } else {
-                       value = lkstats->lkstats[index - 1].stats[subindex];
-               }
-               seq_printf(seq, " %15lld", (long long)value);
+
+               if (index == 0)
+                       seq_printf(seq, " %15u", i);
+               else
+                       seq_printf(seq, " %15llu", (unsigned long long)lkstats->
+                                  lkstats[index - 1].stats[subindex]);
        }
        seq_putc(seq, '\n');
        return 0;
@@ -1804,20 +1750,24 @@ static int gfs2_sbstats_seq_show(struct seq_file *seq, void *iter_ptr)
 
 int __init gfs2_glock_init(void)
 {
-       unsigned i;
-       for(i = 0; i < GFS2_GL_HASH_SIZE; i++) {
-               INIT_HLIST_BL_HEAD(&gl_hash_table[i]);
-       }
+       int ret;
+
+       ret = rhashtable_init(&gl_hash_table, &ht_parms);
+       if (ret < 0)
+               return ret;
 
        glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM |
                                          WQ_HIGHPRI | WQ_FREEZABLE, 0);
-       if (!glock_workqueue)
+       if (!glock_workqueue) {
+               rhashtable_destroy(&gl_hash_table);
                return -ENOMEM;
+       }
        gfs2_delete_workqueue = alloc_workqueue("delete_workqueue",
                                                WQ_MEM_RECLAIM | WQ_FREEZABLE,
                                                0);
        if (!gfs2_delete_workqueue) {
                destroy_workqueue(glock_workqueue);
+               rhashtable_destroy(&gl_hash_table);
                return -ENOMEM;
        }
 
@@ -1829,72 +1779,41 @@ int __init gfs2_glock_init(void)
 void gfs2_glock_exit(void)
 {
        unregister_shrinker(&glock_shrinker);
+       rhashtable_destroy(&gl_hash_table);
        destroy_workqueue(glock_workqueue);
        destroy_workqueue(gfs2_delete_workqueue);
 }
 
-static inline struct gfs2_glock *glock_hash_chain(unsigned hash)
+static void gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
 {
-       return hlist_bl_entry(hlist_bl_first_rcu(&gl_hash_table[hash]),
-                             struct gfs2_glock, gl_list);
-}
-
-static inline struct gfs2_glock *glock_hash_next(struct gfs2_glock *gl)
-{
-       return hlist_bl_entry(rcu_dereference(gl->gl_list.next),
-                             struct gfs2_glock, gl_list);
-}
-
-static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
-{
-       struct gfs2_glock *gl;
-
        do {
-               gl = gi->gl;
-               if (gl) {
-                       gi->gl = glock_hash_next(gl);
-                       gi->nhash++;
-               } else {
-                       if (gi->hash >= GFS2_GL_HASH_SIZE) {
-                               rcu_read_unlock();
-                               return 1;
-                       }
-                       gi->gl = glock_hash_chain(gi->hash);
-                       gi->nhash = 0;
-               }
-               while (gi->gl == NULL) {
-                       gi->hash++;
-                       if (gi->hash >= GFS2_GL_HASH_SIZE) {
-                               rcu_read_unlock();
-                               return 1;
-                       }
-                       gi->gl = glock_hash_chain(gi->hash);
-                       gi->nhash = 0;
+               gi->gl = rhashtable_walk_next(&gi->hti);
+               if (IS_ERR(gi->gl)) {
+                       if (PTR_ERR(gi->gl) == -EAGAIN)
+                               continue;
+                       gi->gl = NULL;
                }
        /* Skip entries for other sb and dead entries */
-       } while (gi->sdp != gi->gl->gl_sbd ||
-                __lockref_is_dead(&gi->gl->gl_lockref));
-
-       return 0;
+       } while ((gi->gl) && ((gi->sdp != gi->gl->gl_name.ln_sbd) ||
+                             __lockref_is_dead(&gi->gl->gl_lockref)));
 }
 
 static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos)
 {
        struct gfs2_glock_iter *gi = seq->private;
        loff_t n = *pos;
+       int ret;
 
        if (gi->last_pos <= *pos)
-               n = gi->nhash + (*pos - gi->last_pos);
-       else
-               gi->hash = 0;
+               n = (*pos - gi->last_pos);
 
-       gi->nhash = 0;
-       rcu_read_lock();
+       ret = rhashtable_walk_start(&gi->hti);
+       if (ret)
+               return NULL;
 
        do {
-               if (gfs2_glock_iter_next(gi))
-                       return NULL;
-       } while (n--);
+               gfs2_glock_iter_next(gi);
+       } while (gi->gl && n--);
 
        gi->last_pos = *pos;
        return gi->gl;
@@ -1907,9 +1826,7 @@ static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr,
 
        (*pos)++;
        gi->last_pos = *pos;
-       if (gfs2_glock_iter_next(gi))
-               return NULL;
-
+       gfs2_glock_iter_next(gi);
        return gi->gl;
 }
 
@@ -1917,9 +1834,8 @@ static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr)
 {
        struct gfs2_glock_iter *gi = seq->private;
 
-       if (gi->gl)
-               rcu_read_unlock();
        gi->gl = NULL;
+       rhashtable_walk_stop(&gi->hti);
 }
 
 static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr)
@@ -1930,26 +1846,19 @@ static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr)
 
 static void *gfs2_sbstats_seq_start(struct seq_file *seq, loff_t *pos)
 {
-       struct gfs2_glock_iter *gi = seq->private;
-
-       gi->hash = *pos;
+       preempt_disable();
        if (*pos >= GFS2_NR_SBSTATS)
                return NULL;
-       preempt_disable();
-       return SEQ_START_TOKEN;
+       return pos;
 }
 
 static void *gfs2_sbstats_seq_next(struct seq_file *seq, void *iter_ptr,
                                   loff_t *pos)
 {
-       struct gfs2_glock_iter *gi = seq->private;
        (*pos)++;
-       gi->hash++;
-       if (gi->hash >= GFS2_NR_SBSTATS) {
-               preempt_enable();
+       if (*pos >= GFS2_NR_SBSTATS)
                return NULL;
-       }
-       return SEQ_START_TOKEN;
+       return pos;
 }
 
 static void gfs2_sbstats_seq_stop(struct seq_file *seq, void *iter_ptr)
@@ -1987,14 +1896,28 @@ static int gfs2_glocks_open(struct inode *inode, struct file *file)
        if (ret == 0) {
                struct seq_file *seq = file->private_data;
                struct gfs2_glock_iter *gi = seq->private;
+
                gi->sdp = inode->i_private;
+               gi->last_pos = 0;
                seq->buf = kmalloc(GFS2_SEQ_GOODSIZE, GFP_KERNEL | __GFP_NOWARN);
                if (seq->buf)
                        seq->size = GFS2_SEQ_GOODSIZE;
+               gi->gl = NULL;
+               ret = rhashtable_walk_init(&gl_hash_table, &gi->hti);
        }
        return ret;
 }
 
+static int gfs2_glocks_release(struct inode *inode, struct file *file)
+{
+       struct seq_file *seq = file->private_data;
+       struct gfs2_glock_iter *gi = seq->private;
+
+       gi->gl = NULL;
+       rhashtable_walk_exit(&gi->hti);
+       return seq_release_private(inode, file);
+}
+
 static int gfs2_glstats_open(struct inode *inode, struct file *file)
 {
        int ret = seq_open_private(file, &gfs2_glstats_seq_ops,
@@ -2003,21 +1926,22 @@ static int gfs2_glstats_open(struct inode *inode, struct file *file)
                struct seq_file *seq = file->private_data;
                struct gfs2_glock_iter *gi = seq->private;
                gi->sdp = inode->i_private;
+               gi->last_pos = 0;
                seq->buf = kmalloc(GFS2_SEQ_GOODSIZE, GFP_KERNEL | __GFP_NOWARN);
                if (seq->buf)
                        seq->size = GFS2_SEQ_GOODSIZE;
+               gi->gl = NULL;
+               ret = rhashtable_walk_init(&gl_hash_table, &gi->hti);
        }
        return ret;
 }
 
 static int gfs2_sbstats_open(struct inode *inode, struct file *file)
 {
-       int ret = seq_open_private(file, &gfs2_sbstats_seq_ops,
-                                  sizeof(struct gfs2_glock_iter));
+       int ret = seq_open(file, &gfs2_sbstats_seq_ops);
        if (ret == 0) {
                struct seq_file *seq = file->private_data;
-               struct gfs2_glock_iter *gi = seq->private;
-               gi->sdp = inode->i_private;
+               seq->private = inode->i_private;  /* sdp */
        }
        return ret;
 }
@@ -2027,7 +1951,7 @@ static const struct file_operations gfs2_glocks_fops = {
        .open    = gfs2_glocks_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
-       .release = seq_release_private,
+       .release = gfs2_glocks_release,
 };
 
 static const struct file_operations gfs2_glstats_fops = {
@@ -2035,7 +1959,7 @@ static const struct file_operations gfs2_glstats_fops = {
        .open    = gfs2_glstats_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
-       .release = seq_release_private,
+       .release = gfs2_glocks_release,
 };
 
 static const struct file_operations gfs2_sbstats_fops = {
@@ -2043,7 +1967,7 @@ static const struct file_operations gfs2_sbstats_fops = {
        .open    = gfs2_sbstats_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
-       .release = seq_release_private,
+       .release = seq_release,
 };
 
 int gfs2_create_debugfs_file(struct gfs2_sbd *sdp)
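
The glock.c hunks above replace GFS2's fixed array of hash buckets (per-bucket bit spinlocks plus a hand-rolled jhash) with the kernel's resizable hash table from <linux/rhashtable.h>. The fragment below is an illustrative sketch only, not part of the patch, using hypothetical demo_* names: it shows the calling pattern the new code relies on, where the key is embedded in the hashed object, the table hashes key_len bytes at key_offset, and lookup/insert/remove all take the same params structure that described the layout at init time.

#include <linux/rhashtable.h>
#include <linux/slab.h>

struct demo_obj {                       /* hypothetical hashed object */
	u64 key;                        /* lookup key, embedded in the object */
	struct rhash_head node;         /* linkage owned by the table */
};

static const struct rhashtable_params demo_params = {
	.key_len     = sizeof(u64),
	.key_offset  = offsetof(struct demo_obj, key),
	.head_offset = offsetof(struct demo_obj, node),
};

static struct rhashtable demo_table;

static int demo_use(struct demo_obj *obj)
{
	struct demo_obj *found;
	int ret;

	ret = rhashtable_init(&demo_table, &demo_params);
	if (ret)
		return ret;

	/* -EEXIST here means another inserter won the race, which is why
	 * gfs2_glock_get() above re-does the lookup and retries. */
	ret = rhashtable_lookup_insert_fast(&demo_table, &obj->node, demo_params);
	if (ret == 0) {
		found = rhashtable_lookup_fast(&demo_table, &obj->key, demo_params);
		if (found)
			rhashtable_remove_fast(&demo_table, &found->node, demo_params);
	}

	rhashtable_destroy(&demo_table);
	return ret;
}

Because the table grows and shrinks on its own, the patch can drop GFS2_GL_HASH_MASK and the per-bucket locking helpers entirely; GFS2_GL_HASH_SIZE survives only as an nelem_hint sizing hint.
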
index fa3fa5e9455366b2ce0d2ebcb0438db7c8fdc58e..1f6c9c3fe5cbb47361ed1e4b6d2a9dae1b3290b9 100644 (file)
@@ -32,13 +32,15 @@ struct workqueue_struct *gfs2_freeze_wq;
 
 static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh)
 {
-       fs_err(gl->gl_sbd, "AIL buffer %p: blocknr %llu state 0x%08lx mapping %p page state 0x%lx\n",
+       fs_err(gl->gl_name.ln_sbd,
+              "AIL buffer %p: blocknr %llu state 0x%08lx mapping %p page "
+              "state 0x%lx\n",
               bh, (unsigned long long)bh->b_blocknr, bh->b_state,
               bh->b_page->mapping, bh->b_page->flags);
-       fs_err(gl->gl_sbd, "AIL glock %u:%llu mapping %p\n",
+       fs_err(gl->gl_name.ln_sbd, "AIL glock %u:%llu mapping %p\n",
               gl->gl_name.ln_type, gl->gl_name.ln_number,
               gfs2_glock2aspace(gl));
-       gfs2_lm_withdraw(gl->gl_sbd, "AIL error\n");
+       gfs2_lm_withdraw(gl->gl_name.ln_sbd, "AIL error\n");
 }
 
 /**
@@ -52,7 +54,7 @@ static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh)
 static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync,
                             unsigned int nr_revokes)
 {
-       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        struct list_head *head = &gl->gl_ail_list;
        struct gfs2_bufdata *bd, *tmp;
        struct buffer_head *bh;
@@ -80,7 +82,7 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync,
 
 static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
 {
-       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        struct gfs2_trans tr;
 
        memset(&tr, 0, sizeof(tr));
@@ -109,7 +111,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
 
 void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
 {
-       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        unsigned int revokes = atomic_read(&gl->gl_ail_count);
        unsigned int max_revokes = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / sizeof(u64);
        int ret;
@@ -139,7 +141,7 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
 
 static void rgrp_go_sync(struct gfs2_glock *gl)
 {
-       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        struct address_space *mapping = &sdp->sd_aspace;
        struct gfs2_rgrpd *rgd;
        int error;
@@ -179,7 +181,7 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
 
 static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
 {
-       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        struct address_space *mapping = &sdp->sd_aspace;
        struct gfs2_rgrpd *rgd = gl->gl_object;
 
@@ -218,7 +220,7 @@ static void inode_go_sync(struct gfs2_glock *gl)
 
        GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE);
 
-       gfs2_log_flush(gl->gl_sbd, gl, NORMAL_FLUSH);
+       gfs2_log_flush(gl->gl_name.ln_sbd, gl, NORMAL_FLUSH);
        filemap_fdatawrite(metamapping);
        if (ip) {
                struct address_space *mapping = ip->i_inode.i_mapping;
@@ -252,7 +254,7 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
 {
        struct gfs2_inode *ip = gl->gl_object;
 
-       gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count));
+       gfs2_assert_withdraw(gl->gl_name.ln_sbd, !atomic_read(&gl->gl_ail_count));
 
        if (flags & DIO_METADATA) {
                struct address_space *mapping = gfs2_glock2aspace(gl);
@@ -264,9 +266,9 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
                }
        }
 
-       if (ip == GFS2_I(gl->gl_sbd->sd_rindex)) {
-               gfs2_log_flush(gl->gl_sbd, NULL, NORMAL_FLUSH);
-               gl->gl_sbd->sd_rindex_uptodate = 0;
+       if (ip == GFS2_I(gl->gl_name.ln_sbd->sd_rindex)) {
+               gfs2_log_flush(gl->gl_name.ln_sbd, NULL, NORMAL_FLUSH);
+               gl->gl_name.ln_sbd->sd_rindex_uptodate = 0;
        }
        if (ip && S_ISREG(ip->i_inode.i_mode))
                truncate_inode_pages(ip->i_inode.i_mapping, 0);
@@ -281,7 +283,7 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
 
 static int inode_go_demote_ok(const struct gfs2_glock *gl)
 {
-       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        struct gfs2_holder *gh;
 
        if (sdp->sd_jindex == gl->gl_object || sdp->sd_rindex == gl->gl_object)
@@ -416,7 +418,7 @@ int gfs2_inode_refresh(struct gfs2_inode *ip)
 static int inode_go_lock(struct gfs2_holder *gh)
 {
        struct gfs2_glock *gl = gh->gh_gl;
-       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        struct gfs2_inode *ip = gl->gl_object;
        int error = 0;
 
@@ -477,7 +479,7 @@ static void inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
 static void freeze_go_sync(struct gfs2_glock *gl)
 {
        int error = 0;
-       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
 
        if (gl->gl_state == LM_ST_SHARED &&
            test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
@@ -500,7 +502,7 @@ static void freeze_go_sync(struct gfs2_glock *gl)
 
 static int freeze_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh)
 {
-       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode);
        struct gfs2_glock *j_gl = ip->i_gl;
        struct gfs2_log_header_host head;
@@ -545,7 +547,7 @@ static int freeze_go_demote_ok(const struct gfs2_glock *gl)
 static void iopen_go_callback(struct gfs2_glock *gl, bool remote)
 {
        struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object;
-       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
 
        if (!remote || (sdp->sd_vfs->s_flags & MS_RDONLY))
                return;
index a1ec7c20e498220c75809e14bae2c2d63709b9a8..121ed08d9d9f96bba5d38c954bdd9ba0a834933c 100644 (file)
@@ -22,6 +22,7 @@
 #include <linux/ktime.h>
 #include <linux/percpu.h>
 #include <linux/lockref.h>
+#include <linux/rhashtable.h>
 
 #define DIO_WAIT       0x00000010
 #define DIO_METADATA   0x00000020
@@ -203,13 +204,15 @@ enum {
 };
 
 struct lm_lockname {
+       struct gfs2_sbd *ln_sbd;
        u64 ln_number;
        unsigned int ln_type;
 };
 
 #define lm_name_equal(name1, name2) \
-        (((name1)->ln_number == (name2)->ln_number) && \
-         ((name1)->ln_type == (name2)->ln_type))
+        (((name1)->ln_number == (name2)->ln_number) && \
+        ((name1)->ln_type == (name2)->ln_type) &&      \
+        ((name1)->ln_sbd == (name2)->ln_sbd))
 
 
 struct gfs2_glock_operations {
@@ -241,7 +244,7 @@ enum {
 };
 
 struct gfs2_lkstats {
-       s64 stats[GFS2_NR_LKSTATS];
+       u64 stats[GFS2_NR_LKSTATS];
 };
 
 enum {
@@ -327,7 +330,6 @@ enum {
 
 struct gfs2_glock {
        struct hlist_bl_node gl_list;
-       struct gfs2_sbd *gl_sbd;
        unsigned long gl_flags;         /* GLF_... */
        struct lm_lockname gl_name;
 
@@ -341,7 +343,6 @@ struct gfs2_glock {
                     gl_req:2,          /* State in last dlm request */
                     gl_reply:8;        /* Last reply from the dlm */
 
-       unsigned int gl_hash;
        unsigned long gl_demote_time; /* time of first demote request */
        long gl_hold_time;
        struct list_head gl_holders;
@@ -367,7 +368,7 @@ struct gfs2_glock {
                        loff_t end;
                } gl_vm;
        };
-       struct rcu_head gl_rcu;
+       struct rhash_head gl_node;
 };
 
 #define GFS2_MIN_LVB_SIZE 32   /* Min size of LVB that gfs2 supports */
@@ -835,7 +836,7 @@ static inline void gfs2_glstats_inc(struct gfs2_glock *gl, int which)
 
 static inline void gfs2_sbstats_inc(const struct gfs2_glock *gl, int which)
 {
-       const struct gfs2_sbd *sdp = gl->gl_sbd;
+       const struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        preempt_disable();
        this_cpu_ptr(sdp->sd_lkstats)->lkstats[gl->gl_name.ln_type].stats[which]++;
        preempt_enable();
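
In incore.h the superblock pointer moves from struct gfs2_glock into struct lm_lockname, so the whole lock name, now including ln_sbd, doubles as the rhashtable key and lm_name_equal() can tell apart identically numbered locks from different mounts sharing the one global table. A tiny illustration, not from the patch, with hypothetical sdp1/sdp2 pointers and invented values:

	/* Illustrative only: two mounts, same lock type and block number. */
	struct lm_lockname a = { .ln_sbd = sdp1, .ln_number = 23, .ln_type = LM_TYPE_INODE };
	struct lm_lockname b = { .ln_sbd = sdp2, .ln_number = 23, .ln_type = LM_TYPE_INODE };

	/* Before this change lm_name_equal(&a, &b) was true; with ln_sbd in
	 * the name, the two keys stay distinct in the shared hash table. */
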
index 641383a9c1bbf5a346315ad3a4805b2ffaf788b4..284c1542783eb03dcdec9c1d5b5b91e92754ea12 100644 (file)
@@ -31,7 +31,7 @@ extern struct workqueue_struct *gfs2_control_wq;
  *
  * @delta is the difference between the current rtt sample and the
  * running average srtt. We add 1/8 of that to the srtt in order to
- * update the current srtt estimate. The varience estimate is a bit
+ * update the current srtt estimate. The variance estimate is a bit
  * more complicated. We subtract the abs value of the @delta from
  * the current variance estimate and add 1/4 of that to the running
  * total.
@@ -80,7 +80,7 @@ static inline void gfs2_update_reply_times(struct gfs2_glock *gl)
 
        preempt_disable();
        rtt = ktime_to_ns(ktime_sub(ktime_get_real(), gl->gl_dstamp));
-       lks = this_cpu_ptr(gl->gl_sbd->sd_lkstats);
+       lks = this_cpu_ptr(gl->gl_name.ln_sbd->sd_lkstats);
        gfs2_update_stats(&gl->gl_stats, index, rtt);           /* Local */
        gfs2_update_stats(&lks->lkstats[gltype], index, rtt);   /* Global */
        preempt_enable();
@@ -108,7 +108,7 @@ static inline void gfs2_update_request_times(struct gfs2_glock *gl)
        dstamp = gl->gl_dstamp;
        gl->gl_dstamp = ktime_get_real();
        irt = ktime_to_ns(ktime_sub(gl->gl_dstamp, dstamp));
-       lks = this_cpu_ptr(gl->gl_sbd->sd_lkstats);
+       lks = this_cpu_ptr(gl->gl_name.ln_sbd->sd_lkstats);
        gfs2_update_stats(&gl->gl_stats, GFS2_LKS_SIRT, irt);           /* Local */
        gfs2_update_stats(&lks->lkstats[gltype], GFS2_LKS_SIRT, irt);   /* Global */
        preempt_enable();
@@ -253,7 +253,7 @@ static void gfs2_reverse_hex(char *c, u64 value)
 static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
                     unsigned int flags)
 {
-       struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
+       struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct;
        int req;
        u32 lkf;
        char strname[GDLM_STRNAME_BYTES] = "";
@@ -281,7 +281,7 @@ static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
 
 static void gdlm_put_lock(struct gfs2_glock *gl)
 {
-       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        struct lm_lockstruct *ls = &sdp->sd_lockstruct;
        int lvb_needs_unlock = 0;
        int error;
@@ -319,7 +319,7 @@ static void gdlm_put_lock(struct gfs2_glock *gl)
 
 static void gdlm_cancel(struct gfs2_glock *gl)
 {
-       struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
+       struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct;
        dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_CANCEL, NULL, gl);
 }
 
index 92324ac5829081e694954260962685f84f329149..d5369a109781d990317cf634f3f8537c38a5dd7c 100644 (file)
@@ -70,7 +70,7 @@ static bool buffer_is_rgrp(const struct gfs2_bufdata *bd)
 static void maybe_release_space(struct gfs2_bufdata *bd)
 {
        struct gfs2_glock *gl = bd->bd_gl;
-       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        struct gfs2_rgrpd *rgd = gl->gl_object;
        unsigned int index = bd->bd_bh->b_blocknr - gl->gl_name.ln_number;
        struct gfs2_bitmap *bi = rgd->rd_bits + index;
@@ -578,7 +578,7 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
 static void gfs2_meta_sync(struct gfs2_glock *gl)
 {
        struct address_space *mapping = gfs2_glock2aspace(gl);
-       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        int error;
 
        if (mapping == NULL)
@@ -588,7 +588,7 @@ static void gfs2_meta_sync(struct gfs2_glock *gl)
        error = filemap_fdatawait(mapping);
 
        if (error)
-               gfs2_io_error(gl->gl_sbd);
+               gfs2_io_error(gl->gl_name.ln_sbd);
 }
 
 static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
index b984a6e190bc2f6b5d447fd8a4a5e7ab740d7905..0e1d4be5865a57f8484ad8e4d0fbf496f7cf6741 100644 (file)
@@ -114,7 +114,7 @@ const struct address_space_operations gfs2_rgrp_aops = {
 struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
 {
        struct address_space *mapping = gfs2_glock2aspace(gl);
-       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        struct page *page;
        struct buffer_head *bh;
        unsigned int shift;
@@ -200,7 +200,7 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
 int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
                   struct buffer_head **bhp)
 {
-       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        struct buffer_head *bh;
 
        if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
@@ -362,7 +362,7 @@ int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
 
 struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
 {
-       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        struct buffer_head *first_bh, *bh;
        u32 max_ra = gfs2_tune_get(sdp, gt_max_readahead) >>
                          sdp->sd_sb.sb_bsize_shift;
index ac5d8027d33569437f42b76808f8103903c5c3de..8ca161567a93c549470068e2f5b980eba4f7057b 100644 (file)
@@ -44,7 +44,7 @@ static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping)
 {
        struct inode *inode = mapping->host;
        if (mapping->a_ops == &gfs2_meta_aops)
-               return (((struct gfs2_glock *)mapping) - 1)->gl_sbd;
+               return (((struct gfs2_glock *)mapping) - 1)->gl_name.ln_sbd;
        else if (mapping->a_ops == &gfs2_rgrp_aops)
                return container_of(mapping, struct gfs2_sbd, sd_aspace);
        else
index 9b61f92fcfdf0c85f37210f3fe5e630715be0c8e..3a31226531ea81fb67a53564199eed7a0a8d3d1b 100644 (file)
@@ -119,7 +119,7 @@ static void gfs2_qd_dispose(struct list_head *list)
 
        while (!list_empty(list)) {
                qd = list_entry(list->next, struct gfs2_quota_data, qd_lru);
-               sdp = qd->qd_gl->gl_sbd;
+               sdp = qd->qd_gl->gl_name.ln_sbd;
 
                list_del(&qd->qd_lru);
 
@@ -302,7 +302,7 @@ static int qd_get(struct gfs2_sbd *sdp, struct kqid qid,
 
 static void qd_hold(struct gfs2_quota_data *qd)
 {
-       struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+       struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd;
        gfs2_assert(sdp, !__lockref_is_dead(&qd->qd_lockref));
        lockref_get(&qd->qd_lockref);
 }
@@ -367,7 +367,7 @@ static void slot_put(struct gfs2_quota_data *qd)
 
 static int bh_get(struct gfs2_quota_data *qd)
 {
-       struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+       struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd;
        struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
        unsigned int block, offset;
        struct buffer_head *bh;
@@ -414,7 +414,7 @@ fail:
 
 static void bh_put(struct gfs2_quota_data *qd)
 {
-       struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+       struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd;
 
        mutex_lock(&sdp->sd_quota_mutex);
        gfs2_assert(sdp, qd->qd_bh_count);
@@ -486,7 +486,7 @@ static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
 
 static void qd_unlock(struct gfs2_quota_data *qd)
 {
-       gfs2_assert_warn(qd->qd_gl->gl_sbd,
+       gfs2_assert_warn(qd->qd_gl->gl_name.ln_sbd,
                         test_bit(QDF_LOCKED, &qd->qd_flags));
        clear_bit(QDF_LOCKED, &qd->qd_flags);
        bh_put(qd);
@@ -614,7 +614,7 @@ static int sort_qd(const void *a, const void *b)
 
 static void do_qc(struct gfs2_quota_data *qd, s64 change)
 {
-       struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+       struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd;
        struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
        struct gfs2_quota_change *qc = qd->qd_bh_qc;
        s64 x;
@@ -831,7 +831,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
 
 static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
 {
-       struct gfs2_sbd *sdp = (*qda)->qd_gl->gl_sbd;
+       struct gfs2_sbd *sdp = (*qda)->qd_gl->gl_name.ln_sbd;
        struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
        struct gfs2_alloc_parms ap = { .aflags = 0, };
        unsigned int data_blocks, ind_blocks;
@@ -922,7 +922,7 @@ out:
                gfs2_glock_dq_uninit(&ghs[qx]);
        mutex_unlock(&ip->i_inode.i_mutex);
        kfree(ghs);
-       gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl, NORMAL_FLUSH);
+       gfs2_log_flush(ip->i_gl->gl_name.ln_sbd, ip->i_gl, NORMAL_FLUSH);
        return error;
 }
 
@@ -954,7 +954,7 @@ static int update_qd(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd)
 static int do_glock(struct gfs2_quota_data *qd, int force_refresh,
                    struct gfs2_holder *q_gh)
 {
-       struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+       struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd;
        struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
        struct gfs2_holder i_gh;
        int error;
@@ -1037,7 +1037,7 @@ int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
 
 static int need_sync(struct gfs2_quota_data *qd)
 {
-       struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+       struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd;
        struct gfs2_tune *gt = &sdp->sd_tune;
        s64 value;
        unsigned int num, den;
@@ -1125,7 +1125,7 @@ out:
 
 static int print_message(struct gfs2_quota_data *qd, char *type)
 {
-       struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+       struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd;
 
        fs_info(sdp, "quota %s for %s %u\n",
                type,
index c6c62321dfd6f6918bd54cbd609347ed6d4c38d3..475985d14758cc12a59c366552038928cebed348 100644 (file)
@@ -1860,13 +1860,13 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
 static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops)
 {
        const struct gfs2_glock *gl = rgd->rd_gl;
-       const struct gfs2_sbd *sdp = gl->gl_sbd;
+       const struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        struct gfs2_lkstats *st;
-       s64 r_dcount, l_dcount;
-       s64 l_srttb, a_srttb = 0;
+       u64 r_dcount, l_dcount;
+       u64 l_srttb, a_srttb = 0;
        s64 srttb_diff;
-       s64 sqr_diff;
-       s64 var;
+       u64 sqr_diff;
+       u64 var;
        int cpu, nonzero = 0;
 
        preempt_disable();
index 20c007d747ab2cd9b2d53eb5cd4bbb69ebbc361f..49ac55da4e334e2c5239e7ce81e1e04e3a016540 100644 (file)
@@ -104,7 +104,7 @@ TRACE_EVENT(gfs2_glock_state_change,
        ),
 
        TP_fast_assign(
-               __entry->dev            = gl->gl_sbd->sd_vfs->s_dev;
+               __entry->dev            = gl->gl_name.ln_sbd->sd_vfs->s_dev;
                __entry->glnum          = gl->gl_name.ln_number;
                __entry->gltype         = gl->gl_name.ln_type;
                __entry->cur_state      = glock_trace_state(gl->gl_state);
@@ -140,7 +140,7 @@ TRACE_EVENT(gfs2_glock_put,
        ),
 
        TP_fast_assign(
-               __entry->dev            = gl->gl_sbd->sd_vfs->s_dev;
+               __entry->dev            = gl->gl_name.ln_sbd->sd_vfs->s_dev;
                __entry->gltype         = gl->gl_name.ln_type;
                __entry->glnum          = gl->gl_name.ln_number;
                __entry->cur_state      = glock_trace_state(gl->gl_state);
@@ -174,7 +174,7 @@ TRACE_EVENT(gfs2_demote_rq,
        ),
 
        TP_fast_assign(
-               __entry->dev            = gl->gl_sbd->sd_vfs->s_dev;
+               __entry->dev            = gl->gl_name.ln_sbd->sd_vfs->s_dev;
                __entry->gltype         = gl->gl_name.ln_type;
                __entry->glnum          = gl->gl_name.ln_number;
                __entry->cur_state      = glock_trace_state(gl->gl_state);
@@ -209,7 +209,7 @@ TRACE_EVENT(gfs2_promote,
        ),
 
        TP_fast_assign(
-               __entry->dev    = gh->gh_gl->gl_sbd->sd_vfs->s_dev;
+               __entry->dev    = gh->gh_gl->gl_name.ln_sbd->sd_vfs->s_dev;
                __entry->glnum  = gh->gh_gl->gl_name.ln_number;
                __entry->gltype = gh->gh_gl->gl_name.ln_type;
                __entry->first  = first;
@@ -239,7 +239,7 @@ TRACE_EVENT(gfs2_glock_queue,
        ),
 
        TP_fast_assign(
-               __entry->dev    = gh->gh_gl->gl_sbd->sd_vfs->s_dev;
+               __entry->dev    = gh->gh_gl->gl_name.ln_sbd->sd_vfs->s_dev;
                __entry->glnum  = gh->gh_gl->gl_name.ln_number;
                __entry->gltype = gh->gh_gl->gl_name.ln_type;
                __entry->queue  = queue;
@@ -267,18 +267,18 @@ TRACE_EVENT(gfs2_glock_lock_time,
                __field(        int,    status          )
                __field(        char,   flags           )
                __field(        s64,    tdiff           )
-               __field(        s64,    srtt            )
-               __field(        s64,    srttvar         )
-               __field(        s64,    srttb           )
-               __field(        s64,    srttvarb        )
-               __field(        s64,    sirt            )
-               __field(        s64,    sirtvar         )
-               __field(        s64,    dcount          )
-               __field(        s64,    qcount          )
+               __field(        u64,    srtt            )
+               __field(        u64,    srttvar         )
+               __field(        u64,    srttb           )
+               __field(        u64,    srttvarb        )
+               __field(        u64,    sirt            )
+               __field(        u64,    sirtvar         )
+               __field(        u64,    dcount          )
+               __field(        u64,    qcount          )
        ),
 
        TP_fast_assign(
-               __entry->dev            = gl->gl_sbd->sd_vfs->s_dev;
+               __entry->dev            = gl->gl_name.ln_sbd->sd_vfs->s_dev;
                __entry->glnum          = gl->gl_name.ln_number;
                __entry->gltype         = gl->gl_name.ln_type;
                __entry->status         = gl->gl_lksb.sb_status;
@@ -333,7 +333,7 @@ TRACE_EVENT(gfs2_pin,
        ),
 
        TP_fast_assign(
-               __entry->dev            = bd->bd_gl->gl_sbd->sd_vfs->s_dev;
+               __entry->dev            = bd->bd_gl->gl_name.ln_sbd->sd_vfs->s_dev;
                __entry->pin            = pin;
                __entry->len            = bd->bd_bh->b_size;
                __entry->block          = bd->bd_bh->b_blocknr;
@@ -449,7 +449,7 @@ TRACE_EVENT(gfs2_bmap,
        ),
 
        TP_fast_assign(
-               __entry->dev            = ip->i_gl->gl_sbd->sd_vfs->s_dev;
+               __entry->dev            = ip->i_gl->gl_name.ln_sbd->sd_vfs->s_dev;
                __entry->lblock         = lblock;
                __entry->pblock         = buffer_mapped(bh) ?  bh->b_blocknr : 0;
                __entry->inum           = ip->i_no_addr;
@@ -489,7 +489,7 @@ TRACE_EVENT(gfs2_block_alloc,
        ),
 
        TP_fast_assign(
-               __entry->dev            = rgd->rd_gl->gl_sbd->sd_vfs->s_dev;
+               __entry->dev            = rgd->rd_gl->gl_name.ln_sbd->sd_vfs->s_dev;
                __entry->start          = block;
                __entry->inum           = ip->i_no_addr;
                __entry->len            = len;
index 88bff243066910a1ee57d08432d409ed2fabadf2..b95d0d625f32bf1d2d57b22fd3057b94e75fb27c 100644 (file)
@@ -158,7 +158,7 @@ static struct gfs2_bufdata *gfs2_alloc_bufdata(struct gfs2_glock *gl,
 void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh)
 {
        struct gfs2_trans *tr = current->journal_info;
-       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        struct address_space *mapping = bh->b_page->mapping;
        struct gfs2_inode *ip = GFS2_I(mapping->host);
        struct gfs2_bufdata *bd;
@@ -224,7 +224,7 @@ static void meta_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
 void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh)
 {
 
-       struct gfs2_sbd *sdp = gl->gl_sbd;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        struct gfs2_bufdata *bd;
 
        lock_buffer(bh);
index d3fa6bd9503e762c861debdd4fe64bef546bb78f..221719eac5de667c1d6044697605148fca6b87e8 100644 (file)
@@ -288,7 +288,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
                        page_cache_release(page);
                        goto fail;
                }
-               page_cache_release(page);
                node->page[i] = page;
        }
 
@@ -398,11 +397,11 @@ node_error:
 
 void hfs_bnode_free(struct hfs_bnode *node)
 {
-       //int i;
+       int i;
 
-       //for (i = 0; i < node->tree->pages_per_bnode; i++)
-       //      if (node->page[i])
-       //              page_cache_release(node->page[i]);
+       for (i = 0; i < node->tree->pages_per_bnode; i++)
+               if (node->page[i])
+                       page_cache_release(node->page[i]);
        kfree(node);
 }
 
index 9f4ee7f5202615ba41b41be76d12de3bbe1f5676..6fc766df04617a3f4abbdb0ba44f76ec71a468de 100644 (file)
@@ -131,13 +131,16 @@ skip:
        hfs_bnode_write(node, entry, data_off + key_len, entry_len);
        hfs_bnode_dump(node);
 
-       if (new_node) {
-               /* update parent key if we inserted a key
-                * at the start of the first node
-                */
-               if (!rec && new_node != node)
-                       hfs_brec_update_parent(fd);
+       /*
+        * update parent key if we inserted a key
+        * at the start of the node and it is not the new node
+        */
+       if (!rec && new_node != node) {
+               hfs_bnode_read_key(node, fd->search_key, data_off + size);
+               hfs_brec_update_parent(fd);
+       }
 
+       if (new_node) {
                hfs_bnode_put(fd->bnode);
                if (!new_node->parent) {
                        hfs_btree_inc_height(tree);
@@ -166,9 +169,6 @@ skip:
                goto again;
        }
 
-       if (!rec)
-               hfs_brec_update_parent(fd);
-
        return 0;
 }
 
@@ -366,6 +366,8 @@ again:
        if (IS_ERR(parent))
                return PTR_ERR(parent);
        __hfs_brec_find(parent, fd);
+       if (fd->record < 0)
+               return -ENOENT;
        hfs_bnode_dump(parent);
        rec = fd->record;
 
index 759708fd9331cc37a6775c31e068117eddbec11d..63924662aaf3efa3b80cb732e409499e5cb2f87e 100644 (file)
@@ -454,7 +454,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
                        page_cache_release(page);
                        goto fail;
                }
-               page_cache_release(page);
                node->page[i] = page;
        }
 
@@ -566,13 +565,11 @@ node_error:
 
 void hfs_bnode_free(struct hfs_bnode *node)
 {
-#if 0
        int i;
 
        for (i = 0; i < node->tree->pages_per_bnode; i++)
                if (node->page[i])
                        page_cache_release(node->page[i]);
-#endif
        kfree(node);
 }
 
index 973c24ce59ad3ef1b62ff3ce00113d7d85cedb68..316adb968b6588faca5d64cf57005f48e067bb08 100644 (file)
@@ -12,6 +12,7 @@
 #include <linux/thread_info.h>
 #include <asm/current.h>
 #include <linux/sched.h>               /* remove ASAP */
+#include <linux/falloc.h>
 #include <linux/fs.h>
 #include <linux/mount.h>
 #include <linux/file.h>
@@ -84,6 +85,29 @@ static const match_table_t tokens = {
        {Opt_err,       NULL},
 };
 
+#ifdef CONFIG_NUMA
+static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
+                                       struct inode *inode, pgoff_t index)
+{
+       vma->vm_policy = mpol_shared_policy_lookup(&HUGETLBFS_I(inode)->policy,
+                                                       index);
+}
+
+static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
+{
+       mpol_cond_put(vma->vm_policy);
+}
+#else
+static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
+                                       struct inode *inode, pgoff_t index)
+{
+}
+
+static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
+{
+}
+#endif
+
 static void huge_pagevec_release(struct pagevec *pvec)
 {
        int i;
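
The remaining hugetlbfs/inode.c hunks teach hugetlbfs to honour fallocate(2): truncation and hole punching now share remove_inode_hugepages(), and a new hugetlbfs_fallocate() preallocates huge pages under the fault mutex. As a point of reference only, and not part of the patch, the userspace fragment below shows how those paths are reached, assuming a hugetlbfs mount at /dev/hugepages and a 2 MiB huge page size.

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const off_t hpage = 2048 * 1024;        /* assumed 2 MiB huge page size */
	int fd = open("/dev/hugepages/demo", O_CREAT | O_RDWR, 0600);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Preallocate four huge pages: the default hugetlbfs_fallocate() case. */
	if (fallocate(fd, 0, 0, 4 * hpage) < 0)
		perror("fallocate(preallocate)");

	/* Punch out the second huge page: the hugetlbfs_punch_hole() path.
	 * Offsets are rounded to the huge page size by the kernel. */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      hpage, hpage) < 0)
		perror("fallocate(punch hole)");

	close(fd);
	return 0;
}
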
@@ -293,26 +317,61 @@ static int hugetlbfs_write_end(struct file *file, struct address_space *mapping,
        return -EINVAL;
 }
 
-static void truncate_huge_page(struct page *page)
+static void remove_huge_page(struct page *page)
 {
        ClearPageDirty(page);
        ClearPageUptodate(page);
        delete_from_page_cache(page);
 }
 
-static void truncate_hugepages(struct inode *inode, loff_t lstart)
+
+/*
+ * remove_inode_hugepages handles two distinct cases: truncation and hole
+ * punch.  There are subtle differences in operation for each case.
+
+ * truncation is indicated by end of range being LLONG_MAX
+ *     In this case, we first scan the range and release found pages.
+ *     After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
+ *     maps and global counts.
+ * hole punch is indicated if end is not LLONG_MAX
+ *     In the hole punch case we scan the range and release found pages.
+ *     Only when releasing a page is the associated region/reserv map
+ *     deleted.  The region/reserv map for ranges without associated
+ *     pages are not modified.
+ * Note: If the passed end of range value is beyond the end of file, but
+ * not LLONG_MAX this routine still performs a hole punch operation.
+ */
+static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
+                                  loff_t lend)
 {
        struct hstate *h = hstate_inode(inode);
        struct address_space *mapping = &inode->i_data;
        const pgoff_t start = lstart >> huge_page_shift(h);
+       const pgoff_t end = lend >> huge_page_shift(h);
+       struct vm_area_struct pseudo_vma;
        struct pagevec pvec;
        pgoff_t next;
        int i, freed = 0;
+       long lookup_nr = PAGEVEC_SIZE;
+       bool truncate_op = (lend == LLONG_MAX);
 
+       memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
+       pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
        pagevec_init(&pvec, 0);
        next = start;
-       while (1) {
-               if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+       while (next < end) {
+               /*
+                * Make sure to never grab more pages that we
+                * might possibly need.
+                */
+               if (end - next < lookup_nr)
+                       lookup_nr = end - next;
+
+               /*
+                * This pagevec_lookup() may return pages past 'end',
+                * so we must check for page->index > end.
+                */
+               if (!pagevec_lookup(&pvec, mapping, next, lookup_nr)) {
                        if (next == start)
                                break;
                        next = start;
@@ -321,26 +380,69 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart)
 
                for (i = 0; i < pagevec_count(&pvec); ++i) {
                        struct page *page = pvec.pages[i];
+                       u32 hash;
+
+                       hash = hugetlb_fault_mutex_hash(h, current->mm,
+                                                       &pseudo_vma,
+                                                       mapping, next, 0);
+                       mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
                        lock_page(page);
+                       if (page->index >= end) {
+                               unlock_page(page);
+                               mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+                               next = end;     /* we are done */
+                               break;
+                       }
+
+                       /*
+                        * If page is mapped, it was faulted in after being
+                        * unmapped.  Do nothing in this race case.  In the
+                        * normal case page is not mapped.
+                        */
+                       if (!page_mapped(page)) {
+                               bool rsv_on_error = !PagePrivate(page);
+                               /*
+                                * We must free the huge page and remove
+                                * from page cache (remove_huge_page) BEFORE
+                                * removing the region/reserve map
+                                * (hugetlb_unreserve_pages).  In rare out
+                                * of memory conditions, removal of the
+                                * region/reserve map could fail.  Before
+                                * free'ing the page, note PagePrivate which
+                                * is used in case of error.
+                                */
+                               remove_huge_page(page);
+                               freed++;
+                               if (!truncate_op) {
+                                       if (unlikely(hugetlb_unreserve_pages(
+                                                       inode, next,
+                                                       next + 1, 1)))
+                                               hugetlb_fix_reserve_counts(
+                                                       inode, rsv_on_error);
+                               }
+                       }
+
                        if (page->index > next)
                                next = page->index;
+
                        ++next;
-                       truncate_huge_page(page);
                        unlock_page(page);
-                       freed++;
+
+                       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
                }
                huge_pagevec_release(&pvec);
        }
-       BUG_ON(!lstart && mapping->nrpages);
-       hugetlb_unreserve_pages(inode, start, freed);
+
+       if (truncate_op)
+               (void)hugetlb_unreserve_pages(inode, start, LONG_MAX, freed);
 }
 
 static void hugetlbfs_evict_inode(struct inode *inode)
 {
        struct resv_map *resv_map;
 
-       truncate_hugepages(inode, 0);
+       remove_inode_hugepages(inode, 0, LLONG_MAX);
        resv_map = (struct resv_map *)inode->i_mapping->private_data;
        /* root inode doesn't have the resv_map, so we should check it */
        if (resv_map)
@@ -349,11 +451,15 @@ static void hugetlbfs_evict_inode(struct inode *inode)
 }
 
 static inline void
-hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff)
+hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
 {
        struct vm_area_struct *vma;
 
-       vma_interval_tree_foreach(vma, root, pgoff, ULONG_MAX) {
+       /*
+        * end == 0 indicates that the entire range after
+        * start should be unmapped.
+        */
+       vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
                unsigned long v_offset;
 
                /*
@@ -362,13 +468,20 @@ hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff)
                 * which overlap the truncated area starting at pgoff,
                 * and no vma on a 32-bit arch can span beyond the 4GB.
                 */
-               if (vma->vm_pgoff < pgoff)
-                       v_offset = (pgoff - vma->vm_pgoff) << PAGE_SHIFT;
+               if (vma->vm_pgoff < start)
+                       v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
                else
                        v_offset = 0;
 
-               unmap_hugepage_range(vma, vma->vm_start + v_offset,
-                                    vma->vm_end, NULL);
+               if (end) {
+                       end = ((end - start) << PAGE_SHIFT) +
+                              vma->vm_start + v_offset;
+                       if (end > vma->vm_end)
+                               end = vma->vm_end;
+               } else
+                       end = vma->vm_end;
+
+               unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL);
        }
 }
 
@@ -384,12 +497,164 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
        i_size_write(inode, offset);
        i_mmap_lock_write(mapping);
        if (!RB_EMPTY_ROOT(&mapping->i_mmap))
-               hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
+               hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
        i_mmap_unlock_write(mapping);
-       truncate_hugepages(inode, offset);
+       remove_inode_hugepages(inode, offset, LLONG_MAX);
        return 0;
 }
 
+static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
+{
+       struct hstate *h = hstate_inode(inode);
+       loff_t hpage_size = huge_page_size(h);
+       loff_t hole_start, hole_end;
+
+       /*
+        * For hole punch round up the beginning offset of the hole and
+        * round down the end.
+        */
+       hole_start = round_up(offset, hpage_size);
+       hole_end = round_down(offset + len, hpage_size);
+
+       if (hole_end > hole_start) {
+               struct address_space *mapping = inode->i_mapping;
+
+               mutex_lock(&inode->i_mutex);
+               i_mmap_lock_write(mapping);
+               if (!RB_EMPTY_ROOT(&mapping->i_mmap))
+                       hugetlb_vmdelete_list(&mapping->i_mmap,
+                                               hole_start >> PAGE_SHIFT,
+                                               hole_end  >> PAGE_SHIFT);
+               i_mmap_unlock_write(mapping);
+               remove_inode_hugepages(inode, hole_start, hole_end);
+               mutex_unlock(&inode->i_mutex);
+       }
+
+       return 0;
+}
+
+static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
+                               loff_t len)
+{
+       struct inode *inode = file_inode(file);
+       struct address_space *mapping = inode->i_mapping;
+       struct hstate *h = hstate_inode(inode);
+       struct vm_area_struct pseudo_vma;
+       struct mm_struct *mm = current->mm;
+       loff_t hpage_size = huge_page_size(h);
+       unsigned long hpage_shift = huge_page_shift(h);
+       pgoff_t start, index, end;
+       int error;
+       u32 hash;
+
+       if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+               return -EOPNOTSUPP;
+
+       if (mode & FALLOC_FL_PUNCH_HOLE)
+               return hugetlbfs_punch_hole(inode, offset, len);
+
+       /*
+        * Default preallocate case.
+        * For this range, start is rounded down and end is rounded up
+        * as well as being converted to page offsets.
+        */
+       start = offset >> hpage_shift;
+       end = (offset + len + hpage_size - 1) >> hpage_shift;
+
+       mutex_lock(&inode->i_mutex);
+
+       /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
+       error = inode_newsize_ok(inode, offset + len);
+       if (error)
+               goto out;
+
+       /*
+        * Initialize a pseudo vma as this is required by the huge page
+        * allocation routines.  If NUMA is configured, use page index
+        * as input to create an allocation policy.
+        */
+       memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
+       pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
+       pseudo_vma.vm_file = file;
+
+       for (index = start; index < end; index++) {
+               /*
+                * This is supposed to be the vaddr where the page is being
+                * faulted in, but we have no vaddr here.
+                */
+               struct page *page;
+               unsigned long addr;
+               int avoid_reserve = 0;
+
+               cond_resched();
+
+               /*
+                * fallocate(2) manpage permits EINTR; we may have been
+                * interrupted because we are using up too much memory.
+                */
+               if (signal_pending(current)) {
+                       error = -EINTR;
+                       break;
+               }
+
+               /* Set numa allocation policy based on index */
+               hugetlb_set_vma_policy(&pseudo_vma, inode, index);
+
+               /* addr is the offset within the file (zero based) */
+               addr = index * hpage_size;
+
+               /* mutex taken here, fault path and hole punch */
+               hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
+                                               index, addr);
+               mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
+               /* See if already present in mapping to avoid alloc/free */
+               page = find_get_page(mapping, index);
+               if (page) {
+                       put_page(page);
+                       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+                       hugetlb_drop_vma_policy(&pseudo_vma);
+                       continue;
+               }
+
+               /* Allocate page and add to page cache */
+               page = alloc_huge_page(&pseudo_vma, addr, avoid_reserve);
+               hugetlb_drop_vma_policy(&pseudo_vma);
+               if (IS_ERR(page)) {
+                       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+                       error = PTR_ERR(page);
+                       goto out;
+               }
+               clear_huge_page(page, addr, pages_per_huge_page(h));
+               __SetPageUptodate(page);
+               error = huge_add_to_page_cache(page, mapping, index);
+               if (unlikely(error)) {
+                       put_page(page);
+                       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+                       goto out;
+               }
+
+               mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+
+               /*
+                * put_page due to reference from alloc_huge_page()
+                * unlock_page because locked by huge_add_to_page_cache()
+                */
+               put_page(page);
+               unlock_page(page);
+       }
+
+       if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
+               i_size_write(inode, offset + len);
+       inode->i_ctime = CURRENT_TIME;
+       spin_lock(&inode->i_lock);
+       inode->i_private = NULL;
+       spin_unlock(&inode->i_lock);
+out:
+       mutex_unlock(&inode->i_mutex);
+       return error;
+}
+
 static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
 {
        struct inode *inode = d_inode(dentry);
@@ -701,7 +966,8 @@ const struct file_operations hugetlbfs_file_operations = {
        .mmap                   = hugetlbfs_file_mmap,
        .fsync                  = noop_fsync,
        .get_unmapped_area      = hugetlb_get_unmapped_area,
-       .llseek         = default_llseek,
+       .llseek                 = default_llseek,
+       .fallocate              = hugetlbfs_fallocate,
 };
 
 static const struct inode_operations hugetlbfs_dir_inode_operations = {
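The new hugetlbfs_fallocate()/hugetlbfs_punch_hole() entry points above are reached through the ordinary fallocate(2) system call. A minimal userspace sketch follows; the mount point /dev/hugepages and the 2 MiB huge page size are assumptions for illustration, not part of this commit. It preallocates four huge pages with FALLOC_FL_KEEP_SIZE and then punches a hole whose offsets the kernel rounds to huge page boundaries:

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const off_t hpage = 2 * 1024 * 1024;	/* assumed 2 MiB huge pages */
	int fd = open("/dev/hugepages/demo", O_CREAT | O_RDWR, 0600);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Preallocate four huge pages without changing i_size. */
	if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 4 * hpage) < 0)
		perror("fallocate(KEEP_SIZE)");

	/* Punch a hole over the second and third huge pages; the start is
	 * rounded up and the end rounded down to huge page boundaries. */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      hpage, 2 * hpage) < 0)
		perror("fallocate(PUNCH_HOLE)");

	close(fd);
	return 0;
}

As with other filesystems, the VFS only accepts FALLOC_FL_PUNCH_HOLE together with FALLOC_FL_KEEP_SIZE, which matches the mode check at the top of hugetlbfs_fallocate().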
index 2d48d28e164015668dcc8d04ff72148f07f28e38..91e004518237f78b0e48b58ca1a233b0dc15c75d 100644 (file)
@@ -91,6 +91,29 @@ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
        return ret;
 }
 
+/**
+ * kernfs_path_len - determine the length of the full path of a given node
+ * @kn: kernfs_node of interest
+ *
+ * The returned length doesn't include the space for the terminating '\0'.
+ */
+size_t kernfs_path_len(struct kernfs_node *kn)
+{
+       size_t len = 0;
+       unsigned long flags;
+
+       spin_lock_irqsave(&kernfs_rename_lock, flags);
+
+       do {
+               len += strlen(kn->name) + 1;
+               kn = kn->parent;
+       } while (kn && kn->parent);
+
+       spin_unlock_irqrestore(&kernfs_rename_lock, flags);
+
+       return len;
+}
+
 /**
  * kernfs_path - build full path of a given node
  * @kn: kernfs_node of interest
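A likely consumer pattern for the new kernfs_path_len() helper is sizing a buffer before calling kernfs_path(). A hedged in-kernel sketch, not taken from this commit (the wrapper name and GFP flags are illustrative):

#include <linux/kernfs.h>
#include <linux/slab.h>

/* Illustrative helper: allocate and fill a buffer holding kn's full path.
 * A concurrent rename can still grow the path between the two calls, so a
 * real caller holds a suitable lock or tolerates truncation. */
static char *example_kernfs_path_alloc(struct kernfs_node *kn)
{
	size_t len = kernfs_path_len(kn) + 1;	/* +1 for the trailing '\0' */
	char *buf = kzalloc(len, GFP_KERNEL);

	if (!buf)
		return NULL;
	kernfs_path(kn, buf, len);
	return buf;		/* caller frees with kfree() */
}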
index 29b927938b8ce9c78af99d0f9c8f5af5a294cefd..726d211db4842715f71e1911f6940c93b19fe57f 100644 (file)
@@ -2438,7 +2438,7 @@ done:
 
 /**
  * path_mountpoint - look up a path to be umounted
- * @nameidata: lookup context
+ * @nd:                lookup context
  * @flags:     lookup flags
  * @path:      pointer to container for result
  *
index 92dca9e90d8dab527d3b5d3b6891981aeb77de11..c556640dcf3bad183659eaac9ac935ca17d30114 100644 (file)
 
 struct pnfs_block_dev;
 
-enum pnfs_block_volume_type {
-       PNFS_BLOCK_VOLUME_SIMPLE        = 0,
-       PNFS_BLOCK_VOLUME_SLICE         = 1,
-       PNFS_BLOCK_VOLUME_CONCAT        = 2,
-       PNFS_BLOCK_VOLUME_STRIPE        = 3,
-};
-
 #define PNFS_BLOCK_MAX_UUIDS   4
 #define PNFS_BLOCK_MAX_DEVICES 64
 
@@ -117,13 +110,6 @@ struct pnfs_block_dev {
                        struct pnfs_block_dev_map *map);
 };
 
-enum exstate4 {
-       PNFS_BLOCK_READWRITE_DATA       = 0,
-       PNFS_BLOCK_READ_DATA            = 1,
-       PNFS_BLOCK_INVALID_DATA         = 2, /* mapped, but data is invalid */
-       PNFS_BLOCK_NONE_DATA            = 3  /* unmapped, it's a hole */
-};
-
 /* sector_t fields are all in 512-byte sectors */
 struct pnfs_block_extent {
        union {
@@ -134,15 +120,12 @@ struct pnfs_block_extent {
        sector_t        be_f_offset;    /* the starting offset in the file */
        sector_t        be_length;      /* the size of the extent */
        sector_t        be_v_offset;    /* the starting offset in the volume */
-       enum exstate4   be_state;       /* the state of this extent */
+       enum pnfs_block_extent_state be_state;  /* the state of this extent */
 #define EXTENT_WRITTEN         1
 #define EXTENT_COMMITTING      2
        unsigned int    be_tag;
 };
 
-/* on the wire size of the extent */
-#define BL_EXTENT_SIZE (7 * sizeof(__be32) + NFS4_DEVICEID4_SIZE)
-
 struct pnfs_block_layout {
        struct pnfs_layout_hdr  bl_layout;
        struct rb_root          bl_ext_rw;
index e535599a07191619c28eba93342388a59267114e..a861bbdfe5778e579ab88f2a5fa1393a441f3fbb 100644 (file)
@@ -22,7 +22,7 @@ bl_free_device(struct pnfs_block_dev *dev)
                kfree(dev->children);
        } else {
                if (dev->bdev)
-                       blkdev_put(dev->bdev, FMODE_READ);
+                       blkdev_put(dev->bdev, FMODE_READ | FMODE_WRITE);
        }
 }
 
@@ -65,6 +65,11 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
                                return -EIO;
                        p = xdr_decode_hyper(p, &b->simple.sigs[i].offset);
                        b->simple.sigs[i].sig_len = be32_to_cpup(p++);
+                       if (b->simple.sigs[i].sig_len > PNFS_BLOCK_UUID_LEN) {
+                               pr_info("signature too long: %d\n",
+                                       b->simple.sigs[i].sig_len);
+                               return -EIO;
+                       }
 
                        p = xdr_inline_decode(xdr, b->simple.sigs[i].sig_len);
                        if (!p)
@@ -195,7 +200,7 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
        if (!dev)
                return -EIO;
 
-       d->bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL);
+       d->bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL);
        if (IS_ERR(d->bdev)) {
                printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n",
                        MAJOR(dev), MINOR(dev), PTR_ERR(d->bdev));
index 31d0b5e53dfd8fc8883dd70cc9220c198882d02f..c59a59c37f3dabae1211db9efcb963442c013b36 100644 (file)
@@ -462,6 +462,12 @@ out:
        return err;
 }
 
+static size_t ext_tree_layoutupdate_size(size_t count)
+{
+       return sizeof(__be32) /* number of entries */ +
+               PNFS_BLOCK_EXTENT_SIZE * count;
+}
+
 static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
                size_t buffer_size)
 {
@@ -489,7 +495,7 @@ static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
                        continue;
 
                (*count)++;
-               if (*count * BL_EXTENT_SIZE > buffer_size) {
+               if (ext_tree_layoutupdate_size(*count) > buffer_size) {
                        /* keep counting.. */
                        ret = -ENOSPC;
                        continue;
@@ -530,7 +536,7 @@ retry:
        if (unlikely(ret)) {
                ext_tree_free_commitdata(arg, buffer_size);
 
-               buffer_size = sizeof(__be32) + BL_EXTENT_SIZE * count;
+               buffer_size = ext_tree_layoutupdate_size(count);
                count = 0;
 
                arg->layoutupdate_pages =
@@ -549,17 +555,14 @@ retry:
        }
 
        *start_p = cpu_to_be32(count);
-       arg->layoutupdate_len = sizeof(__be32) + BL_EXTENT_SIZE * count;
+       arg->layoutupdate_len = ext_tree_layoutupdate_size(count);
 
        if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
-               __be32 *p = start_p;
+               void *p = start_p, *end = p + arg->layoutupdate_len;
                int i = 0;
 
-               for (p = start_p;
-                    p < start_p + arg->layoutupdate_len;
-                    p += PAGE_SIZE) {
+               for ( ; p < end; p += PAGE_SIZE)
                        arg->layoutupdate_pages[i++] = vmalloc_to_page(p);
-               }
        }
 
        dprintk("%s found %zu ranges\n", __func__, count);
index 2c4a0b565d28e6eba828668a01123d8dce9f05e1..75f7c0a7538a247822ab3abcd6e3dfb4697269f3 100644 (file)
@@ -162,10 +162,6 @@ nfs41_callback_up(struct svc_serv *serv)
        spin_lock_init(&serv->sv_cb_lock);
        init_waitqueue_head(&serv->sv_cb_waitq);
        rqstp = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
-       if (IS_ERR(rqstp)) {
-               svc_xprt_put(serv->sv_bc_xprt);
-               serv->sv_bc_xprt = NULL;
-       }
        dprintk("--> %s return %d\n", __func__, PTR_ERR_OR_ZERO(rqstp));
        return rqstp;
 }
index 29e3c1b011b73e4661f4deb1ef200e2f8d27792b..b85cf7a30232a39fb5049a098a0bde40a9563df2 100644 (file)
@@ -40,8 +40,11 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
                rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
 
        inode = nfs_delegation_find_inode(cps->clp, &args->fh);
-       if (inode == NULL)
+       if (inode == NULL) {
+               trace_nfs4_cb_getattr(cps->clp, &args->fh, NULL,
+                               -ntohl(res->status));
                goto out;
+       }
        nfsi = NFS_I(inode);
        rcu_read_lock();
        delegation = rcu_dereference(nfsi->delegation);
@@ -60,6 +63,7 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
        res->status = 0;
 out_iput:
        rcu_read_unlock();
+       trace_nfs4_cb_getattr(cps->clp, &args->fh, inode, -ntohl(res->status));
        iput(inode);
 out:
        dprintk("%s: exit with status = %d\n", __func__, ntohl(res->status));
@@ -194,6 +198,7 @@ unlock:
        spin_unlock(&ino->i_lock);
        pnfs_free_lseg_list(&free_me_list);
        pnfs_put_layout_hdr(lo);
+       trace_nfs4_cb_layoutrecall_inode(clp, &args->cbl_fh, ino, -rv);
        iput(ino);
 out:
        return rv;
@@ -554,7 +559,7 @@ __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy,
        status = htonl(NFS4_OK);
 
        nfs41_set_target_slotid(fc_tbl, args->crsa_target_highest_slotid);
-       nfs41_server_notify_target_slotid_update(cps->clp);
+       nfs41_notify_server(cps->clp);
 out:
        dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
        return status;
index 4a90c9bb31357305ed6f38166bb0e9afabaae953..57c5a02f6213e421cb3cfcdf2ba3a67b7141c6c6 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/stat.h>
 #include <linux/errno.h>
 #include <linux/unistd.h>
+#include <linux/sunrpc/addr.h>
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/stats.h>
 #include <linux/sunrpc/metrics.h>
@@ -285,116 +286,6 @@ void nfs_put_client(struct nfs_client *clp)
 }
 EXPORT_SYMBOL_GPL(nfs_put_client);
 
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-/*
- * Test if two ip6 socket addresses refer to the same socket by
- * comparing relevant fields. The padding bytes specifically, are not
- * compared. sin6_flowinfo is not compared because it only affects QoS
- * and sin6_scope_id is only compared if the address is "link local"
- * because "link local" addresses need only be unique to a specific
- * link. Conversely, ordinary unicast addresses might have different
- * sin6_scope_id.
- *
- * The caller should ensure both socket addresses are AF_INET6.
- */
-static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
-                                     const struct sockaddr *sa2)
-{
-       const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
-       const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;
-
-       if (!ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr))
-               return 0;
-       else if (ipv6_addr_type(&sin1->sin6_addr) & IPV6_ADDR_LINKLOCAL)
-               return sin1->sin6_scope_id == sin2->sin6_scope_id;
-
-       return 1;
-}
-#else  /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */
-static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
-                                     const struct sockaddr *sa2)
-{
-       return 0;
-}
-#endif
-
-/*
- * Test if two ip4 socket addresses refer to the same socket, by
- * comparing relevant fields. The padding bytes specifically, are
- * not compared.
- *
- * The caller should ensure both socket addresses are AF_INET.
- */
-static int nfs_sockaddr_match_ipaddr4(const struct sockaddr *sa1,
-                                     const struct sockaddr *sa2)
-{
-       const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1;
-       const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2;
-
-       return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr;
-}
-
-static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1,
-                               const struct sockaddr *sa2)
-{
-       const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
-       const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;
-
-       return nfs_sockaddr_match_ipaddr6(sa1, sa2) &&
-               (sin1->sin6_port == sin2->sin6_port);
-}
-
-static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1,
-                               const struct sockaddr *sa2)
-{
-       const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1;
-       const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2;
-
-       return nfs_sockaddr_match_ipaddr4(sa1, sa2) &&
-               (sin1->sin_port == sin2->sin_port);
-}
-
-#if defined(CONFIG_NFS_V4_1)
-/*
- * Test if two socket addresses represent the same actual socket,
- * by comparing (only) relevant fields, excluding the port number.
- */
-int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
-                             const struct sockaddr *sa2)
-{
-       if (sa1->sa_family != sa2->sa_family)
-               return 0;
-
-       switch (sa1->sa_family) {
-       case AF_INET:
-               return nfs_sockaddr_match_ipaddr4(sa1, sa2);
-       case AF_INET6:
-               return nfs_sockaddr_match_ipaddr6(sa1, sa2);
-       }
-       return 0;
-}
-EXPORT_SYMBOL_GPL(nfs_sockaddr_match_ipaddr);
-#endif /* CONFIG_NFS_V4_1 */
-
-/*
- * Test if two socket addresses represent the same actual socket,
- * by comparing (only) relevant fields, including the port number.
- */
-static int nfs_sockaddr_cmp(const struct sockaddr *sa1,
-                           const struct sockaddr *sa2)
-{
-       if (sa1->sa_family != sa2->sa_family)
-               return 0;
-
-       switch (sa1->sa_family) {
-       case AF_INET:
-               return nfs_sockaddr_cmp_ip4(sa1, sa2);
-       case AF_INET6:
-               return nfs_sockaddr_cmp_ip6(sa1, sa2);
-       }
-       return 0;
-}
-
 /*
  * Find an nfs_client on the list that matches the initialisation data
  * that is supplied.
@@ -421,7 +312,7 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat
                if (clp->cl_minorversion != data->minorversion)
                        continue;
                /* Match the full socket address */
-               if (!nfs_sockaddr_cmp(sap, clap))
+               if (!rpc_cmp_addr_port(sap, clap))
                        continue;
 
                atomic_inc(&clp->cl_count);
index 029d688a969f4427e57ccf6f19b0dca9035c3d71..2714ef835bdd4261cbc301838c050f42896c24c0 100644 (file)
@@ -175,7 +175,7 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred,
                if (delegation->inode != NULL) {
                        nfs4_stateid_copy(&delegation->stateid, &res->delegation);
                        delegation->type = res->delegation_type;
-                       delegation->maxsize = res->maxsize;
+                       delegation->pagemod_limit = res->pagemod_limit;
                        oldcred = delegation->cred;
                        delegation->cred = get_rpccred(cred);
                        clear_bit(NFS_DELEGATION_NEED_RECLAIM,
@@ -337,7 +337,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
                return -ENOMEM;
        nfs4_stateid_copy(&delegation->stateid, &res->delegation);
        delegation->type = res->delegation_type;
-       delegation->maxsize = res->maxsize;
+       delegation->pagemod_limit = res->pagemod_limit;
        delegation->change_attr = inode->i_version;
        delegation->cred = get_rpccred(cred);
        delegation->inode = inode;
@@ -900,3 +900,28 @@ bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode,
        rcu_read_unlock();
        return ret;
 }
+
+/**
+ * nfs4_delegation_flush_on_close - Check if we must flush file on close
+ * @inode: inode to check
+ *
+ * This function checks the number of outstanding writes to the file
+ * against the delegation 'space_limit' field to see if
+ * the spec requires us to flush the file on close.
+ */
+bool nfs4_delegation_flush_on_close(const struct inode *inode)
+{
+       struct nfs_inode *nfsi = NFS_I(inode);
+       struct nfs_delegation *delegation;
+       bool ret = true;
+
+       rcu_read_lock();
+       delegation = rcu_dereference(nfsi->delegation);
+       if (delegation == NULL || !(delegation->type & FMODE_WRITE))
+               goto out;
+       if (nfsi->nrequests < delegation->pagemod_limit)
+               ret = false;
+out:
+       rcu_read_unlock();
+       return ret;
+}
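One plausible caller of nfs4_delegation_flush_on_close() is the NFSv4 close/flush path, which can avoid a synchronous flush while a write delegation's space_limit has not been exceeded. A rough sketch under that assumption (the function name is illustrative and the surrounding NFSv4 file is not part of this hunk; the prototype comes from fs/nfs/delegation.h):

#include <linux/fs.h>

static int example_nfs4_flush(struct file *file)
{
	struct inode *inode = file_inode(file);

	if ((file->f_mode & FMODE_WRITE) == 0)
		return 0;

	/* Delegation space_limit not exceeded: just start writeback. */
	if (!nfs4_delegation_flush_on_close(inode))
		return filemap_fdatawrite(file->f_mapping);

	/* Otherwise flush to the server and return any errors. */
	return vfs_fsync(file, 0);
}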
index e3c20a3ccc937453b678e9bb02d1a46827f11be0..a44829173e573d1ceca061a9f23b76752f7bb30e 100644 (file)
@@ -18,7 +18,7 @@ struct nfs_delegation {
        struct inode *inode;
        nfs4_stateid stateid;
        fmode_t type;
-       loff_t maxsize;
+       unsigned long pagemod_limit;
        __u64 change_attr;
        unsigned long flags;
        spinlock_t lock;
@@ -61,6 +61,7 @@ bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_
 void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
 int nfs4_have_delegation(struct inode *inode, fmode_t flags);
 int nfs4_check_delegation(struct inode *inode, fmode_t flags);
+bool nfs4_delegation_flush_on_close(const struct inode *inode);
 
 #endif
 
index 547308a5ec6f4a738006370e523c751c90927e1b..3d8e4ffa0a33a1b0449f6dde73d155ead884ee16 100644 (file)
@@ -583,26 +583,19 @@ out_nopages:
 }
 
 static
-void nfs_readdir_free_pagearray(struct page **pages, unsigned int npages)
+void nfs_readdir_free_pages(struct page **pages, unsigned int npages)
 {
        unsigned int i;
        for (i = 0; i < npages; i++)
                put_page(pages[i]);
 }
 
-static
-void nfs_readdir_free_large_page(void *ptr, struct page **pages,
-               unsigned int npages)
-{
-       nfs_readdir_free_pagearray(pages, npages);
-}
-
 /*
  * nfs_readdir_large_page will allocate pages that must be freed with a call
- * to nfs_readdir_free_large_page
+ * to nfs_readdir_free_pages
  */
 static
-int nfs_readdir_large_page(struct page **pages, unsigned int npages)
+int nfs_readdir_alloc_pages(struct page **pages, unsigned int npages)
 {
        unsigned int i;
 
@@ -615,7 +608,7 @@ int nfs_readdir_large_page(struct page **pages, unsigned int npages)
        return 0;
 
 out_freepages:
-       nfs_readdir_free_pagearray(pages, i);
+       nfs_readdir_free_pages(pages, i);
        return -ENOMEM;
 }
 
@@ -623,7 +616,6 @@ static
 int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode)
 {
        struct page *pages[NFS_MAX_READDIR_PAGES];
-       void *pages_ptr = NULL;
        struct nfs_entry entry;
        struct file     *file = desc->file;
        struct nfs_cache_array *array;
@@ -653,7 +645,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
        memset(array, 0, sizeof(struct nfs_cache_array));
        array->eof_index = -1;
 
-       status = nfs_readdir_large_page(pages, array_size);
+       status = nfs_readdir_alloc_pages(pages, array_size);
        if (status < 0)
                goto out_release_array;
        do {
@@ -671,7 +663,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
                }
        } while (array->eof_index < 0);
 
-       nfs_readdir_free_large_page(pages_ptr, pages, array_size);
+       nfs_readdir_free_pages(pages, array_size);
 out_release_array:
        nfs_readdir_release_array(page);
 out_label_free:
index cc4fa1ed61fc5bdfe04d1afcaa5f081bb3ba0470..c0f9b1ed12b9eb281909926f2befacd13db1a026 100644 (file)
@@ -82,7 +82,8 @@ nfs_file_release(struct inode *inode, struct file *filp)
        dprintk("NFS: release(%pD2)\n", filp);
 
        nfs_inc_stats(inode, NFSIOS_VFSRELEASE);
-       return nfs_release(inode, filp);
+       nfs_file_clear_open_context(filp);
+       return 0;
 }
 EXPORT_SYMBOL_GPL(nfs_file_release);
 
@@ -141,7 +142,7 @@ EXPORT_SYMBOL_GPL(nfs_file_llseek);
 /*
  * Flush all dirty pages, and check for write errors.
  */
-int
+static int
 nfs_file_flush(struct file *file, fl_owner_t id)
 {
        struct inode    *inode = file_inode(file);
@@ -152,17 +153,9 @@ nfs_file_flush(struct file *file, fl_owner_t id)
        if ((file->f_mode & FMODE_WRITE) == 0)
                return 0;
 
-       /*
-        * If we're holding a write delegation, then just start the i/o
-        * but don't wait for completion (or send a commit).
-        */
-       if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
-               return filemap_fdatawrite(file->f_mapping);
-
        /* Flush writes to the server and return any errors */
        return vfs_fsync(file, 0);
 }
-EXPORT_SYMBOL_GPL(nfs_file_flush);
 
 ssize_t
 nfs_file_read(struct kiocb *iocb, struct iov_iter *to)
@@ -644,12 +637,10 @@ static const struct vm_operations_struct nfs_file_vm_ops = {
        .page_mkwrite = nfs_vm_page_mkwrite,
 };
 
-static int nfs_need_sync_write(struct file *filp, struct inode *inode)
+static int nfs_need_check_write(struct file *filp, struct inode *inode)
 {
        struct nfs_open_context *ctx;
 
-       if (IS_SYNC(inode) || (filp->f_flags & O_DSYNC))
-               return 1;
        ctx = nfs_file_open_context(filp);
        if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags) ||
            nfs_ctx_key_to_expire(ctx))
@@ -699,8 +690,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
        if (result > 0)
                written = result;
 
-       /* Return error values for O_DSYNC and IS_SYNC() */
-       if (result >= 0 && nfs_need_sync_write(file, inode)) {
+       /* Return error values */
+       if (result >= 0 && nfs_need_check_write(file, inode)) {
                int err = vfs_fsync(file, 0);
                if (err < 0)
                        result = err;
index b3289d701eea21623f4081fee1b2807e3e2f4b3a..fbc5a56de87597dd899905d9d96b19e4fd395e88 100644 (file)
@@ -34,6 +34,7 @@ ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
        ffl = kzalloc(sizeof(*ffl), gfp_flags);
        if (ffl) {
                INIT_LIST_HEAD(&ffl->error_list);
+               INIT_LIST_HEAD(&ffl->mirrors);
                return &ffl->generic_hdr;
        } else
                return NULL;
@@ -135,6 +136,95 @@ decode_name(struct xdr_stream *xdr, u32 *id)
        return 0;
 }
 
+static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1,
+               const struct nfs4_ff_layout_mirror *m2)
+{
+       int i, j;
+
+       if (m1->fh_versions_cnt != m2->fh_versions_cnt)
+               return false;
+       for (i = 0; i < m1->fh_versions_cnt; i++) {
+               bool found_fh = false;
+               for (j = 0; j < m2->fh_versions_cnt; j++) {
+                       if (nfs_compare_fh(&m1->fh_versions[i],
+                                       &m2->fh_versions[j]) == 0) {
+                               found_fh = true;
+                               break;
+                       }
+               }
+               if (!found_fh)
+                       return false;
+       }
+       return true;
+}
+
+static struct nfs4_ff_layout_mirror *
+ff_layout_add_mirror(struct pnfs_layout_hdr *lo,
+               struct nfs4_ff_layout_mirror *mirror)
+{
+       struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo);
+       struct nfs4_ff_layout_mirror *pos;
+       struct inode *inode = lo->plh_inode;
+
+       spin_lock(&inode->i_lock);
+       list_for_each_entry(pos, &ff_layout->mirrors, mirrors) {
+               if (mirror->mirror_ds != pos->mirror_ds)
+                       continue;
+               if (!ff_mirror_match_fh(mirror, pos))
+                       continue;
+               if (atomic_inc_not_zero(&pos->ref)) {
+                       spin_unlock(&inode->i_lock);
+                       return pos;
+               }
+       }
+       list_add(&mirror->mirrors, &ff_layout->mirrors);
+       mirror->layout = lo;
+       spin_unlock(&inode->i_lock);
+       return mirror;
+}
+
+static void
+ff_layout_remove_mirror(struct nfs4_ff_layout_mirror *mirror)
+{
+       struct inode *inode;
+       if (mirror->layout == NULL)
+               return;
+       inode = mirror->layout->plh_inode;
+       spin_lock(&inode->i_lock);
+       list_del(&mirror->mirrors);
+       spin_unlock(&inode->i_lock);
+       mirror->layout = NULL;
+}
+
+static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags)
+{
+       struct nfs4_ff_layout_mirror *mirror;
+
+       mirror = kzalloc(sizeof(*mirror), gfp_flags);
+       if (mirror != NULL) {
+               spin_lock_init(&mirror->lock);
+               atomic_set(&mirror->ref, 1);
+               INIT_LIST_HEAD(&mirror->mirrors);
+       }
+       return mirror;
+}
+
+static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror)
+{
+       ff_layout_remove_mirror(mirror);
+       kfree(mirror->fh_versions);
+       if (mirror->cred)
+               put_rpccred(mirror->cred);
+       nfs4_ff_layout_put_deviceid(mirror->mirror_ds);
+       kfree(mirror);
+}
+
+static void ff_layout_put_mirror(struct nfs4_ff_layout_mirror *mirror)
+{
+       if (mirror != NULL && atomic_dec_and_test(&mirror->ref))
+               ff_layout_free_mirror(mirror);
+}
+
 static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls)
 {
        int i;
@@ -144,11 +234,7 @@ static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls)
                        /* normally mirror_ds is freed in
                         * .free_deviceid_node but we still do it here
                         * for .alloc_lseg error path */
-                       if (fls->mirror_array[i]) {
-                               kfree(fls->mirror_array[i]->fh_versions);
-                               nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds);
-                               kfree(fls->mirror_array[i]);
-                       }
+                       ff_layout_put_mirror(fls->mirror_array[i]);
                }
                kfree(fls->mirror_array);
                fls->mirror_array = NULL;
@@ -181,6 +267,65 @@ static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls)
        }
 }
 
+static bool
+ff_lseg_range_is_after(const struct pnfs_layout_range *l1,
+               const struct pnfs_layout_range *l2)
+{
+       u64 end1, end2;
+
+       if (l1->iomode != l2->iomode)
+               return l1->iomode != IOMODE_READ;
+       end1 = pnfs_calc_offset_end(l1->offset, l1->length);
+       end2 = pnfs_calc_offset_end(l2->offset, l2->length);
+       if (end1 < l2->offset)
+               return false;
+       if (end2 < l1->offset)
+               return true;
+       return l2->offset <= l1->offset;
+}
+
+static bool
+ff_lseg_merge(struct pnfs_layout_segment *new,
+               struct pnfs_layout_segment *old)
+{
+       u64 new_end, old_end;
+
+       if (new->pls_range.iomode != old->pls_range.iomode)
+               return false;
+       old_end = pnfs_calc_offset_end(old->pls_range.offset,
+                       old->pls_range.length);
+       if (old_end < new->pls_range.offset)
+               return false;
+       new_end = pnfs_calc_offset_end(new->pls_range.offset,
+                       new->pls_range.length);
+       if (new_end < old->pls_range.offset)
+               return false;
+
+       /* Mergeable: copy info from 'old' to 'new' */
+       if (new_end < old_end)
+               new_end = old_end;
+       if (new->pls_range.offset < old->pls_range.offset)
+               new->pls_range.offset = old->pls_range.offset;
+       new->pls_range.length = pnfs_calc_offset_length(new->pls_range.offset,
+                       new_end);
+       if (test_bit(NFS_LSEG_ROC, &old->pls_flags))
+               set_bit(NFS_LSEG_ROC, &new->pls_flags);
+       if (test_bit(NFS_LSEG_LAYOUTRETURN, &old->pls_flags))
+               set_bit(NFS_LSEG_LAYOUTRETURN, &new->pls_flags);
+       return true;
+}
+
+static void
+ff_layout_add_lseg(struct pnfs_layout_hdr *lo,
+               struct pnfs_layout_segment *lseg,
+               struct list_head *free_me)
+{
+       pnfs_generic_layout_insert_lseg(lo, lseg,
+                       ff_lseg_range_is_after,
+                       ff_lseg_merge,
+                       free_me);
+}
+
 static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls)
 {
        int i, j;
@@ -246,6 +391,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
                goto out_err_free;
 
        for (i = 0; i < fls->mirror_array_cnt; i++) {
+               struct nfs4_ff_layout_mirror *mirror;
                struct nfs4_deviceid devid;
                struct nfs4_deviceid_node *idnode;
                u32 ds_count;
@@ -262,17 +408,13 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
                if (ds_count != 1)
                        goto out_err_free;
 
-               fls->mirror_array[i] =
-                       kzalloc(sizeof(struct nfs4_ff_layout_mirror),
-                               gfp_flags);
+               fls->mirror_array[i] = ff_layout_alloc_mirror(gfp_flags);
                if (fls->mirror_array[i] == NULL) {
                        rc = -ENOMEM;
                        goto out_err_free;
                }
 
-               spin_lock_init(&fls->mirror_array[i]->lock);
                fls->mirror_array[i]->ds_count = ds_count;
-               fls->mirror_array[i]->lseg = &fls->generic_hdr;
 
                /* deviceid */
                rc = decode_deviceid(&stream, &devid);
@@ -338,6 +480,12 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
                if (rc)
                        goto out_err_free;
 
+               mirror = ff_layout_add_mirror(lh, fls->mirror_array[i]);
+               if (mirror != fls->mirror_array[i]) {
+                       ff_layout_free_mirror(fls->mirror_array[i]);
+                       fls->mirror_array[i] = mirror;
+               }
+
                dprintk("%s: uid %d gid %d\n", __func__,
                        fls->mirror_array[i]->uid,
                        fls->mirror_array[i]->gid);
@@ -379,21 +527,9 @@ static void
 ff_layout_free_lseg(struct pnfs_layout_segment *lseg)
 {
        struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
-       int i;
 
        dprintk("--> %s\n", __func__);
 
-       for (i = 0; i < fls->mirror_array_cnt; i++) {
-               if (fls->mirror_array[i]) {
-                       nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds);
-                       fls->mirror_array[i]->mirror_ds = NULL;
-                       if (fls->mirror_array[i]->cred) {
-                               put_rpccred(fls->mirror_array[i]->cred);
-                               fls->mirror_array[i]->cred = NULL;
-                       }
-               }
-       }
-
        if (lseg->pls_range.iomode == IOMODE_RW) {
                struct nfs4_flexfile_layout *ffl;
                struct inode *inode;
@@ -419,48 +555,44 @@ ff_layout_get_lseg_count(struct nfs4_ff_layout_segment *fls)
 }
 
 static void
-nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer)
+nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now)
 {
        /* first IO request? */
        if (atomic_inc_return(&timer->n_ops) == 1) {
-               timer->start_time = ktime_get();
+               timer->start_time = now;
        }
 }
 
 static ktime_t
-nfs4_ff_end_busy_timer(struct nfs4_ff_busy_timer *timer)
+nfs4_ff_end_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now)
 {
-       ktime_t start, now;
+       ktime_t start;
 
        if (atomic_dec_return(&timer->n_ops) < 0)
                WARN_ON_ONCE(1);
 
-       now = ktime_get();
        start = timer->start_time;
        timer->start_time = now;
        return ktime_sub(now, start);
 }
 
-static ktime_t
-nfs4_ff_layout_calc_completion_time(struct rpc_task *task)
-{
-       return ktime_sub(ktime_get(), task->tk_start);
-}
-
 static bool
 nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror,
-                           struct nfs4_ff_layoutstat *layoutstat)
+                           struct nfs4_ff_layoutstat *layoutstat,
+                           ktime_t now)
 {
        static const ktime_t notime = {0};
-       ktime_t now = ktime_get();
+       s64 report_interval = FF_LAYOUTSTATS_REPORT_INTERVAL;
 
-       nfs4_ff_start_busy_timer(&layoutstat->busy_timer);
+       nfs4_ff_start_busy_timer(&layoutstat->busy_timer, now);
        if (ktime_equal(mirror->start_time, notime))
                mirror->start_time = now;
        if (ktime_equal(mirror->last_report_time, notime))
                mirror->last_report_time = now;
+       if (layoutstats_timer != 0)
+               report_interval = (s64)layoutstats_timer * 1000LL;
        if (ktime_to_ms(ktime_sub(now, mirror->last_report_time)) >=
-                       FF_LAYOUTSTATS_REPORT_INTERVAL) {
+                       report_interval) {
                mirror->last_report_time = now;
                return true;
        }
@@ -482,35 +614,39 @@ static void
 nfs4_ff_layout_stat_io_update_completed(struct nfs4_ff_layoutstat *layoutstat,
                __u64 requested,
                __u64 completed,
-               ktime_t time_completed)
+               ktime_t time_completed,
+               ktime_t time_started)
 {
        struct nfs4_ff_io_stat *iostat = &layoutstat->io_stat;
+       ktime_t completion_time = ktime_sub(time_completed, time_started);
        ktime_t timer;
 
        iostat->ops_completed++;
        iostat->bytes_completed += completed;
        iostat->bytes_not_delivered += requested - completed;
 
-       timer = nfs4_ff_end_busy_timer(&layoutstat->busy_timer);
+       timer = nfs4_ff_end_busy_timer(&layoutstat->busy_timer, time_completed);
        iostat->total_busy_time =
                        ktime_add(iostat->total_busy_time, timer);
        iostat->aggregate_completion_time =
-                       ktime_add(iostat->aggregate_completion_time, time_completed);
+                       ktime_add(iostat->aggregate_completion_time,
+                                       completion_time);
 }
 
 static void
-nfs4_ff_layout_stat_io_start_read(struct nfs4_ff_layout_mirror *mirror,
-               __u64 requested)
+nfs4_ff_layout_stat_io_start_read(struct inode *inode,
+               struct nfs4_ff_layout_mirror *mirror,
+               __u64 requested, ktime_t now)
 {
        bool report;
 
        spin_lock(&mirror->lock);
-       report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat);
+       report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat, now);
        nfs4_ff_layout_stat_io_update_requested(&mirror->read_stat, requested);
        spin_unlock(&mirror->lock);
 
        if (report)
-               pnfs_report_layoutstat(mirror->lseg->pls_layout->plh_inode);
+               pnfs_report_layoutstat(inode, GFP_KERNEL);
 }
 
 static void
@@ -522,23 +658,24 @@ nfs4_ff_layout_stat_io_end_read(struct rpc_task *task,
        spin_lock(&mirror->lock);
        nfs4_ff_layout_stat_io_update_completed(&mirror->read_stat,
                        requested, completed,
-                       nfs4_ff_layout_calc_completion_time(task));
+                       ktime_get(), task->tk_start);
        spin_unlock(&mirror->lock);
 }
 
 static void
-nfs4_ff_layout_stat_io_start_write(struct nfs4_ff_layout_mirror *mirror,
-               __u64 requested)
+nfs4_ff_layout_stat_io_start_write(struct inode *inode,
+               struct nfs4_ff_layout_mirror *mirror,
+               __u64 requested, ktime_t now)
 {
        bool report;
 
        spin_lock(&mirror->lock);
-       report = nfs4_ff_layoutstat_start_io(mirror , &mirror->write_stat);
+       report = nfs4_ff_layoutstat_start_io(mirror , &mirror->write_stat, now);
        nfs4_ff_layout_stat_io_update_requested(&mirror->write_stat, requested);
        spin_unlock(&mirror->lock);
 
        if (report)
-               pnfs_report_layoutstat(mirror->lseg->pls_layout->plh_inode);
+               pnfs_report_layoutstat(inode, GFP_NOIO);
 }
 
 static void
@@ -553,8 +690,7 @@ nfs4_ff_layout_stat_io_end_write(struct rpc_task *task,
 
        spin_lock(&mirror->lock);
        nfs4_ff_layout_stat_io_update_completed(&mirror->write_stat,
-                       requested, completed,
-                       nfs4_ff_layout_calc_completion_time(task));
+                       requested, completed, ktime_get(), task->tk_start);
        spin_unlock(&mirror->lock);
 }
 
@@ -728,8 +864,6 @@ ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio,
                return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg);
 
        /* no lseg means that pnfs is not in use, so no mirroring here */
-       pnfs_put_lseg(pgio->pg_lseg);
-       pgio->pg_lseg = NULL;
        nfs_pageio_reset_write_mds(pgio);
        return 1;
 }
@@ -931,18 +1065,26 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task,
        if (task->tk_status >= 0)
                return 0;
 
-       if (task->tk_status != -EJUKEBOX) {
+       switch (task->tk_status) {
+       /* File access problems. Don't mark the device as unavailable */
+       case -EACCES:
+       case -ESTALE:
+       case -EISDIR:
+       case -EBADHANDLE:
+       case -ELOOP:
+       case -ENOSPC:
+               break;
+       case -EJUKEBOX:
+               nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
+               goto out_retry;
+       default:
                dprintk("%s DS connection error %d\n", __func__,
                        task->tk_status);
                nfs4_mark_deviceid_unavailable(devid);
-               if (ff_layout_has_available_ds(lseg))
-                       return -NFS4ERR_RESET_TO_PNFS;
-               else
-                       return -NFS4ERR_RESET_TO_MDS;
        }
-
-       if (task->tk_status == -EJUKEBOX)
-               nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
+       /* FIXME: Need to prevent infinite looping here. */
+       return -NFS4ERR_RESET_TO_PNFS;
+out_retry:
        task->tk_status = 0;
        rpc_restart_call(task);
        rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
@@ -972,15 +1114,41 @@ static int ff_layout_async_handle_error(struct rpc_task *task,
 
 static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
                                        int idx, u64 offset, u64 length,
-                                       u32 status, int opnum)
+                                       u32 status, int opnum, int error)
 {
        struct nfs4_ff_layout_mirror *mirror;
        int err;
 
+       if (status == 0) {
+               switch (error) {
+               case -ETIMEDOUT:
+               case -EPFNOSUPPORT:
+               case -EPROTONOSUPPORT:
+               case -EOPNOTSUPP:
+               case -ECONNREFUSED:
+               case -ECONNRESET:
+               case -EHOSTDOWN:
+               case -EHOSTUNREACH:
+               case -ENETUNREACH:
+               case -EADDRINUSE:
+               case -ENOBUFS:
+               case -EPIPE:
+               case -EPERM:
+                       status = NFS4ERR_NXIO;
+                       break;
+               case -EACCES:
+                       status = NFS4ERR_ACCESS;
+                       break;
+               default:
+                       return;
+               }
+       }
+
        mirror = FF_LAYOUT_COMP(lseg, idx);
        err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
                                       mirror, offset, length, status, opnum,
                                       GFP_NOIO);
+       pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, lseg);
        dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status);
 }
 
@@ -989,16 +1157,14 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
 static int ff_layout_read_done_cb(struct rpc_task *task,
                                struct nfs_pgio_header *hdr)
 {
-       struct inode *inode;
        int err;
 
        trace_nfs4_pnfs_read(hdr, task->tk_status);
-       if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status)
-               hdr->res.op_status = NFS4ERR_NXIO;
-       if (task->tk_status < 0 && hdr->res.op_status)
+       if (task->tk_status < 0)
                ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
                                            hdr->args.offset, hdr->args.count,
-                                           hdr->res.op_status, OP_READ);
+                                           hdr->res.op_status, OP_READ,
+                                           task->tk_status);
        err = ff_layout_async_handle_error(task, hdr->args.context->state,
                                           hdr->ds_clp, hdr->lseg,
                                           hdr->pgio_mirror_idx);
@@ -1010,8 +1176,6 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
                pnfs_read_resend_pnfs(hdr);
                return task->tk_status;
        case -NFS4ERR_RESET_TO_MDS:
-               inode = hdr->lseg->pls_layout->plh_inode;
-               pnfs_error_mark_layout_for_return(inode, hdr->lseg);
                ff_layout_reset_read(hdr);
                return task->tk_status;
        case -EAGAIN:
@@ -1061,9 +1225,10 @@ ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx)
 static int ff_layout_read_prepare_common(struct rpc_task *task,
                                         struct nfs_pgio_header *hdr)
 {
-       nfs4_ff_layout_stat_io_start_read(
+       nfs4_ff_layout_stat_io_start_read(hdr->inode,
                        FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
-                       hdr->args.count);
+                       hdr->args.count,
+                       task->tk_start);
 
        if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
                rpc_exit(task, -EIO);
@@ -1163,32 +1328,26 @@ static void ff_layout_read_count_stats(struct rpc_task *task, void *data)
 static int ff_layout_write_done_cb(struct rpc_task *task,
                                struct nfs_pgio_header *hdr)
 {
-       struct inode *inode;
        int err;
 
        trace_nfs4_pnfs_write(hdr, task->tk_status);
-       if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status)
-               hdr->res.op_status = NFS4ERR_NXIO;
-       if (task->tk_status < 0 && hdr->res.op_status)
+       if (task->tk_status < 0)
                ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
                                            hdr->args.offset, hdr->args.count,
-                                           hdr->res.op_status, OP_WRITE);
+                                           hdr->res.op_status, OP_WRITE,
+                                           task->tk_status);
        err = ff_layout_async_handle_error(task, hdr->args.context->state,
                                           hdr->ds_clp, hdr->lseg,
                                           hdr->pgio_mirror_idx);
 
        switch (err) {
        case -NFS4ERR_RESET_TO_PNFS:
+               pnfs_set_retry_layoutget(hdr->lseg->pls_layout);
+               ff_layout_reset_write(hdr, true);
+               return task->tk_status;
        case -NFS4ERR_RESET_TO_MDS:
-               inode = hdr->lseg->pls_layout->plh_inode;
-               pnfs_error_mark_layout_for_return(inode, hdr->lseg);
-               if (err == -NFS4ERR_RESET_TO_PNFS) {
-                       pnfs_set_retry_layoutget(hdr->lseg->pls_layout);
-                       ff_layout_reset_write(hdr, true);
-               } else {
-                       pnfs_clear_retry_layoutget(hdr->lseg->pls_layout);
-                       ff_layout_reset_write(hdr, false);
-               }
+               pnfs_clear_retry_layoutget(hdr->lseg->pls_layout);
+               ff_layout_reset_write(hdr, false);
                return task->tk_status;
        case -EAGAIN:
                rpc_restart_call_prepare(task);
@@ -1199,34 +1358,35 @@ static int ff_layout_write_done_cb(struct rpc_task *task,
            hdr->res.verf->committed == NFS_DATA_SYNC)
                ff_layout_set_layoutcommit(hdr);
 
+       /* zero out fattr since we don't care DS attr at all */
+       hdr->fattr.valid = 0;
+       if (task->tk_status >= 0)
+               nfs_writeback_update_inode(hdr);
+
        return 0;
 }
 
 static int ff_layout_commit_done_cb(struct rpc_task *task,
                                     struct nfs_commit_data *data)
 {
-       struct inode *inode;
        int err;
 
        trace_nfs4_pnfs_commit_ds(data, task->tk_status);
-       if (task->tk_status == -ETIMEDOUT && !data->res.op_status)
-               data->res.op_status = NFS4ERR_NXIO;
-       if (task->tk_status < 0 && data->res.op_status)
+       if (task->tk_status < 0)
                ff_layout_io_track_ds_error(data->lseg, data->ds_commit_index,
                                            data->args.offset, data->args.count,
-                                           data->res.op_status, OP_COMMIT);
+                                           data->res.op_status, OP_COMMIT,
+                                           task->tk_status);
        err = ff_layout_async_handle_error(task, NULL, data->ds_clp,
                                           data->lseg, data->ds_commit_index);
 
        switch (err) {
        case -NFS4ERR_RESET_TO_PNFS:
+               pnfs_set_retry_layoutget(data->lseg->pls_layout);
+               pnfs_generic_prepare_to_resend_writes(data);
+               return -EAGAIN;
        case -NFS4ERR_RESET_TO_MDS:
-               inode = data->lseg->pls_layout->plh_inode;
-               pnfs_error_mark_layout_for_return(inode, data->lseg);
-               if (err == -NFS4ERR_RESET_TO_PNFS)
-                       pnfs_set_retry_layoutget(data->lseg->pls_layout);
-               else
-                       pnfs_clear_retry_layoutget(data->lseg->pls_layout);
+               pnfs_clear_retry_layoutget(data->lseg->pls_layout);
                pnfs_generic_prepare_to_resend_writes(data);
                return -EAGAIN;
        case -EAGAIN:
@@ -1244,9 +1404,10 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
 static int ff_layout_write_prepare_common(struct rpc_task *task,
                                          struct nfs_pgio_header *hdr)
 {
-       nfs4_ff_layout_stat_io_start_write(
+       nfs4_ff_layout_stat_io_start_write(hdr->inode,
                        FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
-                       hdr->args.count);
+                       hdr->args.count,
+                       task->tk_start);
 
        if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
                rpc_exit(task, -EIO);
@@ -1325,9 +1486,9 @@ static void ff_layout_write_count_stats(struct rpc_task *task, void *data)
 static void ff_layout_commit_prepare_common(struct rpc_task *task,
                struct nfs_commit_data *cdata)
 {
-       nfs4_ff_layout_stat_io_start_write(
+       nfs4_ff_layout_stat_io_start_write(cdata->inode,
                        FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
-                       0);
+                       0, task->tk_start);
 }
 
 static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data)
@@ -1842,53 +2003,55 @@ ff_layout_encode_layoutstats(struct xdr_stream *xdr,
        *start = cpu_to_be32((xdr->p - start - 1) * 4);
 }
 
-static bool
+static int
 ff_layout_mirror_prepare_stats(struct nfs42_layoutstat_args *args,
-                              struct pnfs_layout_segment *pls,
-                              int *dev_count, int dev_limit)
+                              struct pnfs_layout_hdr *lo,
+                              int dev_limit)
 {
+       struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo);
        struct nfs4_ff_layout_mirror *mirror;
        struct nfs4_deviceid_node *dev;
        struct nfs42_layoutstat_devinfo *devinfo;
-       int i;
+       int i = 0;
 
-       for (i = 0; i < FF_LAYOUT_MIRROR_COUNT(pls); i++) {
-               if (*dev_count >= dev_limit)
+       list_for_each_entry(mirror, &ff_layout->mirrors, mirrors) {
+               if (i >= dev_limit)
                        break;
-               mirror = FF_LAYOUT_COMP(pls, i);
-               if (!mirror || !mirror->mirror_ds)
+               if (!mirror->mirror_ds)
+                       continue;
+               /* mirror refcount put in cleanup_layoutstats */
+               if (!atomic_inc_not_zero(&mirror->ref))
                        continue;
-               dev = FF_LAYOUT_DEVID_NODE(pls, i);
-               devinfo = &args->devinfo[*dev_count];
+               dev = &mirror->mirror_ds->id_node; 
+               devinfo = &args->devinfo[i];
                memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE);
-               devinfo->offset = pls->pls_range.offset;
-               devinfo->length = pls->pls_range.length;
-               /* well, we don't really know if IO is continuous or not! */
-               devinfo->read_count = mirror->read_stat.io_stat.bytes_completed;
+               devinfo->offset = 0;
+               devinfo->length = NFS4_MAX_UINT64;
+               devinfo->read_count = mirror->read_stat.io_stat.ops_completed;
                devinfo->read_bytes = mirror->read_stat.io_stat.bytes_completed;
-               devinfo->write_count = mirror->write_stat.io_stat.bytes_completed;
+               devinfo->write_count = mirror->write_stat.io_stat.ops_completed;
                devinfo->write_bytes = mirror->write_stat.io_stat.bytes_completed;
                devinfo->layout_type = LAYOUT_FLEX_FILES;
                devinfo->layoutstats_encode = ff_layout_encode_layoutstats;
                devinfo->layout_private = mirror;
-               /* lseg refcount put in cleanup_layoutstats */
-               pnfs_get_lseg(pls);
 
-               ++(*dev_count);
+               i++;
        }
-
-       return *dev_count < dev_limit;
+       return i;
 }
 
 static int
 ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
 {
-       struct pnfs_layout_segment *pls;
+       struct nfs4_flexfile_layout *ff_layout;
+       struct nfs4_ff_layout_mirror *mirror;
        int dev_count = 0;
 
        spin_lock(&args->inode->i_lock);
-       list_for_each_entry(pls, &NFS_I(args->inode)->layout->plh_segs, pls_list) {
-               dev_count += FF_LAYOUT_MIRROR_COUNT(pls);
+       ff_layout = FF_LAYOUT_FROM_HDR(NFS_I(args->inode)->layout);
+       list_for_each_entry(mirror, &ff_layout->mirrors, mirrors) {
+               if (atomic_read(&mirror->ref) != 0)
+                       dev_count ++;
        }
        spin_unlock(&args->inode->i_lock);
        /* For now, send at most PNFS_LAYOUTSTATS_MAXDEV statistics */
@@ -1897,20 +2060,14 @@ ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
                        __func__, dev_count, PNFS_LAYOUTSTATS_MAXDEV);
                dev_count = PNFS_LAYOUTSTATS_MAXDEV;
        }
-       args->devinfo = kmalloc(dev_count * sizeof(*args->devinfo), GFP_KERNEL);
+       args->devinfo = kmalloc_array(dev_count, sizeof(*args->devinfo), GFP_NOIO);
        if (!args->devinfo)
                return -ENOMEM;
 
-       dev_count = 0;
        spin_lock(&args->inode->i_lock);
-       list_for_each_entry(pls, &NFS_I(args->inode)->layout->plh_segs, pls_list) {
-               if (!ff_layout_mirror_prepare_stats(args, pls, &dev_count,
-                                                   PNFS_LAYOUTSTATS_MAXDEV)) {
-                       break;
-               }
-       }
+       args->num_dev = ff_layout_mirror_prepare_stats(args,
+                       &ff_layout->generic_hdr, dev_count);
        spin_unlock(&args->inode->i_lock);
-       args->num_dev = dev_count;
 
        return 0;
 }
@@ -1924,7 +2081,7 @@ ff_layout_cleanup_layoutstats(struct nfs42_layoutstat_data *data)
        for (i = 0; i < data->args.num_dev; i++) {
                mirror = data->args.devinfo[i].layout_private;
                data->args.devinfo[i].layout_private = NULL;
-               pnfs_put_lseg(mirror->lseg);
+               ff_layout_put_mirror(mirror);
        }
 }
 
@@ -1936,6 +2093,7 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = {
        .free_layout_hdr        = ff_layout_free_layout_hdr,
        .alloc_lseg             = ff_layout_alloc_lseg,
        .free_lseg              = ff_layout_free_lseg,
+       .add_lseg               = ff_layout_add_lseg,
        .pg_read_ops            = &ff_layout_pg_read_ops,
        .pg_write_ops           = &ff_layout_pg_write_ops,
        .get_ds_info            = ff_layout_get_ds_info,
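
The layoutstats rework above walks the layout's mirror list and only reports mirrors it can still pin, taking a reference with atomic_inc_not_zero() and dropping it later in ff_layout_cleanup_layoutstats(). A rough, stand-alone C sketch of that conditional-reference pattern (mirror_get_live() and the struct here are invented for illustration, not kernel APIs):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct mirror {
        atomic_int ref;         /* 0 means the mirror is being torn down */
};

/* Take a reference only if the object is still live (ref != 0). */
static bool mirror_get_live(struct mirror *m)
{
        int old = atomic_load(&m->ref);

        while (old != 0) {
                if (atomic_compare_exchange_weak(&m->ref, &old, old + 1))
                        return true;    /* reference taken */
        }
        return false;                   /* already dying, skip it */
}

static void mirror_put(struct mirror *m)
{
        atomic_fetch_sub(&m->ref, 1);
}

int main(void)
{
        struct mirror live = { .ref = 1 }, dead = { .ref = 0 };

        printf("live: %d, dead: %d\n",
               mirror_get_live(&live), mirror_get_live(&dead));
        mirror_put(&live);
        return 0;
}
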
index f92f9a0a856b3e698c8859923438549d1bffed37..68cc0d9828f9ae6f778c001d313c69b589ce98f1 100644 (file)
@@ -67,7 +67,8 @@ struct nfs4_ff_layoutstat {
 };
 
 struct nfs4_ff_layout_mirror {
-       struct pnfs_layout_segment      *lseg; /* back pointer */
+       struct pnfs_layout_hdr          *layout;
+       struct list_head                mirrors;
        u32                             ds_count;
        u32                             efficiency;
        struct nfs4_ff_layout_ds        *mirror_ds;
@@ -77,6 +78,7 @@ struct nfs4_ff_layout_mirror {
        u32                             uid;
        u32                             gid;
        struct rpc_cred                 *cred;
+       atomic_t                        ref;
        spinlock_t                      lock;
        struct nfs4_ff_layoutstat       read_stat;
        struct nfs4_ff_layoutstat       write_stat;
@@ -95,6 +97,7 @@ struct nfs4_ff_layout_segment {
 struct nfs4_flexfile_layout {
        struct pnfs_layout_hdr generic_hdr;
        struct pnfs_ds_commit_info commit_info;
+       struct list_head        mirrors;
        struct list_head        error_list; /* nfs4_ff_layout_ds_err */
 };
 
index f13e1969eedd911bf6a5d9be6af6e4ae403f6c1e..e125e55de86daebcbba9f2f907aca05fa27a0a1a 100644 (file)
@@ -172,6 +172,32 @@ out_err:
        return NULL;
 }
 
+static void ff_layout_mark_devid_invalid(struct pnfs_layout_segment *lseg,
+               struct nfs4_deviceid_node *devid)
+{
+       nfs4_mark_deviceid_unavailable(devid);
+       if (!ff_layout_has_available_ds(lseg))
+               pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode,
+                               lseg);
+}
+
+static bool ff_layout_mirror_valid(struct pnfs_layout_segment *lseg,
+               struct nfs4_ff_layout_mirror *mirror)
+{
+       if (mirror == NULL || mirror->mirror_ds == NULL) {
+               pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode,
+                                       lseg);
+               return false;
+       }
+       if (mirror->mirror_ds->ds == NULL) {
+               struct nfs4_deviceid_node *devid;
+               devid = &mirror->mirror_ds->id_node;
+               ff_layout_mark_devid_invalid(lseg, devid);
+               return false;
+       }
+       return true;
+}
+
 static u64
 end_offset(u64 start, u64 len)
 {
@@ -336,16 +362,10 @@ nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx)
 {
        struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx);
        struct nfs_fh *fh = NULL;
-       struct nfs4_deviceid_node *devid;
 
-       if (mirror == NULL || mirror->mirror_ds == NULL ||
-           mirror->mirror_ds->ds == NULL) {
-               printk(KERN_ERR "NFS: %s: No data server for mirror offset index %d\n",
+       if (!ff_layout_mirror_valid(lseg, mirror)) {
+               pr_err_ratelimited("NFS: %s: No data server for mirror offset index %d\n",
                        __func__, mirror_idx);
-               if (mirror && mirror->mirror_ds) {
-                       devid = &mirror->mirror_ds->id_node;
-                       pnfs_generic_mark_devid_invalid(devid);
-               }
                goto out;
        }
 
@@ -368,14 +388,9 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
        unsigned int max_payload;
        rpc_authflavor_t flavor;
 
-       if (mirror == NULL || mirror->mirror_ds == NULL ||
-           mirror->mirror_ds->ds == NULL) {
-               printk(KERN_ERR "NFS: %s: No data server for offset index %d\n",
+       if (!ff_layout_mirror_valid(lseg, mirror)) {
+               pr_err_ratelimited("NFS: %s: No data server for offset index %d\n",
                        __func__, ds_idx);
-               if (mirror && mirror->mirror_ds) {
-                       devid = &mirror->mirror_ds->id_node;
-                       pnfs_generic_mark_devid_invalid(devid);
-               }
                goto out;
        }
 
@@ -500,16 +515,19 @@ int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo,
                                           range->offset, range->length))
                        continue;
                /* offset(8) + length(8) + stateid(NFS4_STATEID_SIZE)
-                * + deviceid(NFS4_DEVICEID4_SIZE) + status(4) + opnum(4)
+                * + array length + deviceid(NFS4_DEVICEID4_SIZE)
+                * + status(4) + opnum(4)
                 */
                p = xdr_reserve_space(xdr,
-                               24 + NFS4_STATEID_SIZE + NFS4_DEVICEID4_SIZE);
+                               28 + NFS4_STATEID_SIZE + NFS4_DEVICEID4_SIZE);
                if (unlikely(!p))
                        return -ENOBUFS;
                p = xdr_encode_hyper(p, err->offset);
                p = xdr_encode_hyper(p, err->length);
                p = xdr_encode_opaque_fixed(p, &err->stateid,
                                            NFS4_STATEID_SIZE);
+               /* Encode 1 error */
+               *p++ = cpu_to_be32(1);
                p = xdr_encode_opaque_fixed(p, &err->deviceid,
                                            NFS4_DEVICEID4_SIZE);
                *p++ = cpu_to_be32(err->status);
@@ -525,11 +543,11 @@ int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo,
        return 0;
 }
 
-bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg)
+static bool ff_read_layout_has_available_ds(struct pnfs_layout_segment *lseg)
 {
        struct nfs4_ff_layout_mirror *mirror;
        struct nfs4_deviceid_node *devid;
-       int idx;
+       u32 idx;
 
        for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) {
                mirror = FF_LAYOUT_COMP(lseg, idx);
@@ -543,6 +561,32 @@ bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg)
        return false;
 }
 
+static bool ff_rw_layout_has_available_ds(struct pnfs_layout_segment *lseg)
+{
+       struct nfs4_ff_layout_mirror *mirror;
+       struct nfs4_deviceid_node *devid;
+       u32 idx;
+
+       for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) {
+               mirror = FF_LAYOUT_COMP(lseg, idx);
+               if (!mirror || !mirror->mirror_ds)
+                       return false;
+               devid = &mirror->mirror_ds->id_node;
+               if (ff_layout_test_devid_unavailable(devid))
+                       return false;
+       }
+
+       return FF_LAYOUT_MIRROR_COUNT(lseg) != 0;
+}
+
+bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg)
+{
+       if (lseg->pls_range.iomode == IOMODE_READ)
+               return ff_read_layout_has_available_ds(lseg);
+       /* Note: RW layout needs all mirrors available */
+       return ff_rw_layout_has_available_ds(lseg);
+}
+
 module_param(dataserver_retrans, uint, 0644);
 MODULE_PARM_DESC(dataserver_retrans, "The  number of times the NFSv4.1 client "
                        "retries a request before it attempts further "
index 0adc7d245b3dd838e32371920e23d9dda5071ee0..326d9e10d83370f56061220c51e72a42de0595b2 100644 (file)
@@ -504,7 +504,7 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
 {
        struct inode *inode = d_inode(dentry);
        struct nfs_fattr *fattr;
-       int error = -ENOMEM;
+       int error = 0;
 
        nfs_inc_stats(inode, NFSIOS_VFSSETATTR);
 
@@ -513,15 +513,14 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
                attr->ia_valid &= ~ATTR_MODE;
 
        if (attr->ia_valid & ATTR_SIZE) {
-               loff_t i_size;
-
                BUG_ON(!S_ISREG(inode->i_mode));
 
-               i_size = i_size_read(inode);
-               if (attr->ia_size == i_size)
+               error = inode_newsize_ok(inode, attr->ia_size);
+               if (error)
+                       return error;
+
+               if (attr->ia_size == i_size_read(inode))
                        attr->ia_valid &= ~ATTR_SIZE;
-               else if (attr->ia_size < i_size && IS_SWAPFILE(inode))
-                       return -ETXTBSY;
        }
 
        /* Optimization: if the end result is no change, don't RPC */
@@ -536,8 +535,11 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
                nfs_sync_inode(inode);
 
        fattr = nfs_alloc_fattr();
-       if (fattr == NULL)
+       if (fattr == NULL) {
+               error = -ENOMEM;
                goto out;
+       }
+
        /*
         * Return any delegations if we're going to change ACLs
         */
@@ -759,11 +761,13 @@ EXPORT_SYMBOL_GPL(nfs_put_lock_context);
  * @ctx: pointer to context
  * @is_sync: is this a synchronous close
  *
- * always ensure that the attributes are up to date if we're mounted
- * with close-to-open semantics
+ * Ensure that the attributes are up to date if we're mounted
+ * with close-to-open semantics and we have cached data that will
+ * need to be revalidated on open.
  */
 void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
 {
+       struct nfs_inode *nfsi;
        struct inode *inode;
        struct nfs_server *server;
 
@@ -772,7 +776,12 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
        if (!is_sync)
                return;
        inode = d_inode(ctx->dentry);
-       if (!list_empty(&NFS_I(inode)->open_files))
+       nfsi = NFS_I(inode);
+       if (inode->i_mapping->nrpages == 0)
+               return;
+       if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
+               return;
+       if (!list_empty(&nfsi->open_files))
                return;
        server = NFS_SERVER(inode);
        if (server->flags & NFS_MOUNT_NOCTO)
@@ -844,6 +853,11 @@ void put_nfs_open_context(struct nfs_open_context *ctx)
 }
 EXPORT_SYMBOL_GPL(put_nfs_open_context);
 
+static void put_nfs_open_context_sync(struct nfs_open_context *ctx)
+{
+       __put_nfs_open_context(ctx, 1);
+}
+
 /*
  * Ensure that mmap has a recent RPC credential for use when writing out
  * shared pages
@@ -888,7 +902,7 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c
        return ctx;
 }
 
-static void nfs_file_clear_open_context(struct file *filp)
+void nfs_file_clear_open_context(struct file *filp)
 {
        struct nfs_open_context *ctx = nfs_file_open_context(filp);
 
@@ -899,7 +913,7 @@ static void nfs_file_clear_open_context(struct file *filp)
                spin_lock(&inode->i_lock);
                list_move_tail(&ctx->list, &NFS_I(inode)->open_files);
                spin_unlock(&inode->i_lock);
-               __put_nfs_open_context(ctx, filp->f_flags & O_DIRECT ? 0 : 1);
+               put_nfs_open_context_sync(ctx);
        }
 }
 
@@ -919,12 +933,6 @@ int nfs_open(struct inode *inode, struct file *filp)
        return 0;
 }
 
-int nfs_release(struct inode *inode, struct file *filp)
-{
-       nfs_file_clear_open_context(filp);
-       return 0;
-}
-
 /*
  * This function is called whenever some part of NFS notices that
  * the cached attributes have to be refreshed.
@@ -1273,13 +1281,6 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
        return 0;
 }
 
-static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
-{
-       if (!(fattr->valid & NFS_ATTR_FATTR_CTIME))
-               return 0;
-       return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0;
-}
-
 static atomic_long_t nfs_attr_generation_counter;
 
 static unsigned long nfs_read_attr_generation_counter(void)
@@ -1428,7 +1429,6 @@ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct n
        const struct nfs_inode *nfsi = NFS_I(inode);
 
        return ((long)fattr->gencount - (long)nfsi->attr_gencount) > 0 ||
-               nfs_ctime_need_update(inode, fattr) ||
                ((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0);
 }
 
@@ -1491,6 +1491,13 @@ static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr
 {
        unsigned long invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
 
+       /*
+        * Don't revalidate the pagecache if we hold a delegation, but do
+        * force an attribute update
+        */
+       if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
+               invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_FORCED;
+
        if (S_ISDIR(inode->i_mode))
                invalid |= NFS_INO_INVALID_DATA;
        nfs_set_cache_invalid(inode, invalid);
index 9b372b845f6a6ff06a4a035e2f6799d7d29cd8f7..56cfde26fb9cea0100a99bb7c3fe8a2be813dc63 100644 (file)
@@ -219,10 +219,6 @@ static inline void nfs_fs_proc_exit(void)
 }
 #endif
 
-#ifdef CONFIG_NFS_V4_1
-int nfs_sockaddr_match_ipaddr(const struct sockaddr *, const struct sockaddr *);
-#endif
-
 /* callback_xdr.c */
 extern struct svc_version nfs4_callback_version1;
 extern struct svc_version nfs4_callback_version4;
@@ -364,7 +360,6 @@ int nfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *)
 /* file.c */
 int nfs_file_fsync_commit(struct file *, loff_t, loff_t, int);
 loff_t nfs_file_llseek(struct file *, loff_t, int);
-int nfs_file_flush(struct file *, fl_owner_t);
 ssize_t nfs_file_read(struct kiocb *, struct iov_iter *);
 ssize_t nfs_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *,
                             size_t, unsigned int);
@@ -490,6 +485,9 @@ void nfs_retry_commit(struct list_head *page_list,
 void nfs_commitdata_release(struct nfs_commit_data *data);
 void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
                                 struct nfs_commit_info *cinfo);
+void nfs_request_add_commit_list_locked(struct nfs_page *req,
+               struct list_head *dst,
+               struct nfs_commit_info *cinfo);
 void nfs_request_remove_commit_list(struct nfs_page *req,
                                    struct nfs_commit_info *cinfo);
 void nfs_init_cinfo(struct nfs_commit_info *cinfo,
@@ -623,13 +621,15 @@ void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize)
  * Record the page as unstable and mark its inode as dirty.
  */
 static inline
-void nfs_mark_page_unstable(struct page *page)
+void nfs_mark_page_unstable(struct page *page, struct nfs_commit_info *cinfo)
 {
-       struct inode *inode = page_file_mapping(page)->host;
+       if (!cinfo->dreq) {
+               struct inode *inode = page_file_mapping(page)->host;
 
-       inc_zone_page_state(page, NR_UNSTABLE_NFS);
-       inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE);
-        __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+               inc_zone_page_state(page, NR_UNSTABLE_NFS);
+               inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE);
+               __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+       }
 }
 
 /*
index 9b04c2e6fffc3f306f3c598b7c4557beff653c8e..267126d32ec0f6a1d5d09293ef9427585edca9c6 100644 (file)
@@ -1103,6 +1103,7 @@ static void nfs3_xdr_enc_symlink3args(struct rpc_rqst *req,
 {
        encode_diropargs3(xdr, args->fromfh, args->fromname, args->fromlen);
        encode_symlinkdata3(xdr, args);
+       xdr->buf->flags |= XDRBUF_WRITE;
 }
 
 /*
index ff66ae700b8991eeed513e210397f6639c0e5f70..814c1255f1d2c6fd8deda74dd55de60a5ad14418 100644 (file)
@@ -17,7 +17,5 @@ int nfs42_proc_deallocate(struct file *, loff_t, loff_t);
 loff_t nfs42_proc_llseek(struct file *, loff_t, int);
 int nfs42_proc_layoutstats_generic(struct nfs_server *,
                                   struct nfs42_layoutstat_data *);
-/* nfs4.2xdr.h */
-extern struct rpc_procinfo nfs4_2_procedures[];
 
 #endif /* __LINUX_FS_NFS_NFS4_2_H */
index a6bd27da6286f9fee14f0b087eddcc1ec437cdde..0eb29e14070d5890478eda0274814f007f3cf9c5 100644 (file)
@@ -238,8 +238,7 @@ out_overflow:
        return -EIO;
 }
 
-static int decode_layoutstats(struct xdr_stream *xdr,
-                             struct nfs42_layoutstat_res *res)
+static int decode_layoutstats(struct xdr_stream *xdr)
 {
        return decode_op_hdr(xdr, OP_LAYOUTSTATS);
 }
@@ -343,7 +342,7 @@ static int nfs4_xdr_dec_layoutstats(struct rpc_rqst *rqstp,
                goto out;
        WARN_ON(res->num_dev > PNFS_LAYOUTSTATS_MAXDEV);
        for (i = 0; i < res->num_dev; i++) {
-               status = decode_layoutstats(xdr, res);
+               status = decode_layoutstats(xdr);
                if (status)
                        goto out;
        }
index ea3bee919a765840a267f8fc59ccdec4ef61f676..50cfc4ca7a02a06d383c05325ecbdd166ff25b8c 100644 (file)
@@ -405,9 +405,7 @@ int nfs40_discover_server_trunking(struct nfs_client *clp,
 int nfs41_discover_server_trunking(struct nfs_client *clp,
                        struct nfs_client **, struct rpc_cred *);
 extern void nfs4_schedule_session_recovery(struct nfs4_session *, int);
-extern void nfs41_server_notify_target_slotid_update(struct nfs_client *clp);
-extern void nfs41_server_notify_highest_slotid_update(struct nfs_client *clp);
-
+extern void nfs41_notify_server(struct nfs_client *);
 #else
 static inline void nfs4_schedule_session_recovery(struct nfs4_session *session, int err)
 {
index 3aa6a9ba51136f31f30dea29d60dded106b05241..223bedda64ae49f3d94226677e652e31da4c8d84 100644 (file)
@@ -729,10 +729,7 @@ static bool nfs4_cb_match_client(const struct sockaddr *addr,
                return false;
 
        /* Match only the IP address, not the port number */
-       if (!nfs_sockaddr_match_ipaddr(addr, clap))
-               return false;
-
-       return true;
+       return rpc_cmp_addr(addr, clap);
 }
 
 /*
index dcd39d4e2efebd78eed64d4df00fd2745f747027..b0dbe0abed53631a22e938b332b675b501b9e978 100644 (file)
@@ -6,7 +6,9 @@
 #include <linux/fs.h>
 #include <linux/falloc.h>
 #include <linux/nfs_fs.h>
+#include "delegation.h"
 #include "internal.h"
+#include "iostat.h"
 #include "fscache.h"
 #include "pnfs.h"
 
@@ -27,7 +29,6 @@ nfs4_file_open(struct inode *inode, struct file *filp)
        struct inode *dir;
        unsigned openflags = filp->f_flags;
        struct iattr attr;
-       int opened = 0;
        int err;
 
        /*
@@ -66,7 +67,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
                nfs_sync_inode(inode);
        }
 
-       inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, &opened);
+       inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, NULL);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
                switch (err) {
@@ -100,6 +101,31 @@ out_drop:
        goto out_put_ctx;
 }
 
+/*
+ * Flush all dirty pages, and check for write errors.
+ */
+static int
+nfs4_file_flush(struct file *file, fl_owner_t id)
+{
+       struct inode    *inode = file_inode(file);
+
+       dprintk("NFS: flush(%pD2)\n", file);
+
+       nfs_inc_stats(inode, NFSIOS_VFSFLUSH);
+       if ((file->f_mode & FMODE_WRITE) == 0)
+               return 0;
+
+       /*
+        * If we're holding a write delegation, then check if we're required
+        * to flush the i/o on close. If not, then just start the i/o now.
+        */
+       if (!nfs4_delegation_flush_on_close(inode))
+               return filemap_fdatawrite(file->f_mapping);
+
+       /* Flush writes to the server and return any errors */
+       return vfs_fsync(file, 0);
+}
+
 static int
 nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 {
@@ -178,7 +204,7 @@ const struct file_operations nfs4_file_operations = {
        .write_iter     = nfs_file_write,
        .mmap           = nfs_file_mmap,
        .open           = nfs4_file_open,
-       .flush          = nfs_file_flush,
+       .flush          = nfs4_file_flush,
        .release        = nfs_file_release,
        .fsync          = nfs4_file_fsync,
        .lock           = nfs_lock,
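
nfs4_file_flush() above only waits for writeback on close when no write delegation allows deferring it; otherwise it merely starts the I/O asynchronously. A toy user-space model of that decision (delegation_flush_on_close(), start_writeback() and flush_and_wait() are stand-ins, not kernel functions):

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for "start writeback" vs "flush and wait for errors". */
static int start_writeback(void) { puts("async writeback started"); return 0; }
static int flush_and_wait(void)  { puts("synchronous flush");       return 0; }

/* Whether a (hypothetical) write delegation requires flushing on close. */
static bool delegation_flush_on_close(bool have_delegation)
{
        return !have_delegation;
}

static int file_flush(bool opened_for_write, bool have_delegation)
{
        if (!opened_for_write)
                return 0;       /* nothing to flush */

        /* With a delegation we may defer: just kick off the I/O. */
        if (!delegation_flush_on_close(have_delegation))
                return start_writeback();

        /* Otherwise push writes to the server and report any error now. */
        return flush_and_wait();
}

int main(void)
{
        file_flush(true, true);         /* delegated: async */
        file_flush(true, false);        /* not delegated: sync */
        return 0;
}
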
index 535dfc69c628f825cc4422339406b1365f66e8d4..2e4902203c358c46344338ee2efcfb7db7aba7f7 100644 (file)
@@ -184,7 +184,7 @@ static struct key_type key_type_id_resolver = {
        .read           = user_read,
 };
 
-static int nfs_idmap_init_keyring(void)
+int nfs_idmap_init(void)
 {
        struct cred *cred;
        struct key *keyring;
@@ -230,7 +230,7 @@ failed_put_cred:
        return ret;
 }
 
-static void nfs_idmap_quit_keyring(void)
+void nfs_idmap_quit(void)
 {
        key_revoke(id_resolver_cache->thread_keyring);
        unregister_key_type(&key_type_id_resolver);
@@ -492,16 +492,6 @@ nfs_idmap_delete(struct nfs_client *clp)
        kfree(idmap);
 }
 
-int nfs_idmap_init(void)
-{
-       return nfs_idmap_init_keyring();
-}
-
-void nfs_idmap_quit(void)
-{
-       nfs_idmap_quit_keyring();
-}
-
 static int nfs_idmap_prepare_message(char *desc, struct idmap *idmap,
                                     struct idmap_msg *im,
                                     struct rpc_pipe_msg *msg)
index 3acb1eb72930c40828bab90aeb27a3918f71138d..693b903b48bdfb78808274e90f53971eb1f21244 100644 (file)
@@ -586,7 +586,7 @@ out_unlock:
        spin_unlock(&tbl->slot_tbl_lock);
        res->sr_slot = NULL;
        if (send_new_highest_used_slotid)
-               nfs41_server_notify_highest_slotid_update(session->clp);
+               nfs41_notify_server(session->clp);
 }
 
 int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
@@ -1150,7 +1150,8 @@ out:
        return ret;
 }
 
-static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode)
+static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode,
+               enum open_claim_type4 claim)
 {
        if (delegation == NULL)
                return 0;
@@ -1158,6 +1159,16 @@ static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode)
                return 0;
        if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags))
                return 0;
+       switch (claim) {
+       case NFS4_OPEN_CLAIM_NULL:
+       case NFS4_OPEN_CLAIM_FH:
+               break;
+       case NFS4_OPEN_CLAIM_PREVIOUS:
+               if (!test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags))
+                       break;
+       default:
+               return 0;
+       }
        nfs_mark_delegation_referenced(delegation);
        return 1;
 }
@@ -1220,6 +1231,7 @@ static void nfs_resync_open_stateid_locked(struct nfs4_state *state)
 }
 
 static void nfs_clear_open_stateid_locked(struct nfs4_state *state,
+               nfs4_stateid *arg_stateid,
                nfs4_stateid *stateid, fmode_t fmode)
 {
        clear_bit(NFS_O_RDWR_STATE, &state->flags);
@@ -1238,8 +1250,9 @@ static void nfs_clear_open_stateid_locked(struct nfs4_state *state,
        if (stateid == NULL)
                return;
        /* Handle races with OPEN */
-       if (!nfs4_stateid_match_other(stateid, &state->open_stateid) ||
-           !nfs4_stateid_is_newer(stateid, &state->open_stateid)) {
+       if (!nfs4_stateid_match_other(arg_stateid, &state->open_stateid) ||
+           (nfs4_stateid_match_other(stateid, &state->open_stateid) &&
+           !nfs4_stateid_is_newer(stateid, &state->open_stateid))) {
                nfs_resync_open_stateid_locked(state);
                return;
        }
@@ -1248,10 +1261,12 @@ static void nfs_clear_open_stateid_locked(struct nfs4_state *state,
        nfs4_stateid_copy(&state->open_stateid, stateid);
 }
 
-static void nfs_clear_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)
+static void nfs_clear_open_stateid(struct nfs4_state *state,
+       nfs4_stateid *arg_stateid,
+       nfs4_stateid *stateid, fmode_t fmode)
 {
        write_seqlock(&state->seqlock);
-       nfs_clear_open_stateid_locked(state, stateid, fmode);
+       nfs_clear_open_stateid_locked(state, arg_stateid, stateid, fmode);
        write_sequnlock(&state->seqlock);
        if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags))
                nfs4_schedule_state_manager(state->owner->so_server->nfs_client);
@@ -1376,6 +1391,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
        struct nfs_delegation *delegation;
        int open_mode = opendata->o_arg.open_flags;
        fmode_t fmode = opendata->o_arg.fmode;
+       enum open_claim_type4 claim = opendata->o_arg.claim;
        nfs4_stateid stateid;
        int ret = -EAGAIN;
 
@@ -1389,7 +1405,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
                spin_unlock(&state->owner->so_lock);
                rcu_read_lock();
                delegation = rcu_dereference(nfsi->delegation);
-               if (!can_open_delegated(delegation, fmode)) {
+               if (!can_open_delegated(delegation, fmode, claim)) {
                        rcu_read_unlock();
                        break;
                }
@@ -1852,6 +1868,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
        struct nfs4_opendata *data = calldata;
        struct nfs4_state_owner *sp = data->owner;
        struct nfs_client *clp = sp->so_server->nfs_client;
+       enum open_claim_type4 claim = data->o_arg.claim;
 
        if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0)
                goto out_wait;
@@ -1866,15 +1883,15 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
                        goto out_no_action;
                rcu_read_lock();
                delegation = rcu_dereference(NFS_I(data->state->inode)->delegation);
-               if (data->o_arg.claim != NFS4_OPEN_CLAIM_DELEGATE_CUR &&
-                   data->o_arg.claim != NFS4_OPEN_CLAIM_DELEG_CUR_FH &&
-                   can_open_delegated(delegation, data->o_arg.fmode))
+               if (can_open_delegated(delegation, data->o_arg.fmode, claim))
                        goto unlock_no_action;
                rcu_read_unlock();
        }
        /* Update client id. */
        data->o_arg.clientid = clp->cl_clientid;
-       switch (data->o_arg.claim) {
+       switch (claim) {
+       default:
+               break;
        case NFS4_OPEN_CLAIM_PREVIOUS:
        case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
        case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
@@ -2294,15 +2311,25 @@ static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st
  * fields corresponding to attributes that were used to store the verifier.
  * Make sure we clobber those fields in the later setattr call
  */
-static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct iattr *sattr)
+static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata,
+                               struct iattr *sattr, struct nfs4_label **label)
 {
-       if ((opendata->o_res.attrset[1] & FATTR4_WORD1_TIME_ACCESS) &&
+       const u32 *attrset = opendata->o_res.attrset;
+
+       if ((attrset[1] & FATTR4_WORD1_TIME_ACCESS) &&
            !(sattr->ia_valid & ATTR_ATIME_SET))
                sattr->ia_valid |= ATTR_ATIME;
 
-       if ((opendata->o_res.attrset[1] & FATTR4_WORD1_TIME_MODIFY) &&
+       if ((attrset[1] & FATTR4_WORD1_TIME_MODIFY) &&
            !(sattr->ia_valid & ATTR_MTIME_SET))
                sattr->ia_valid |= ATTR_MTIME;
+
+       /* Except for MODE, setting these attributes twice seems harmless. */
+       if ((attrset[1] & FATTR4_WORD1_MODE))
+               sattr->ia_valid &= ~ATTR_MODE;
+
+       if (attrset[2] & FATTR4_WORD2_SECURITY_LABEL)
+               *label = NULL;
 }
 
 static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
@@ -2425,9 +2452,9 @@ static int _nfs4_do_open(struct inode *dir,
                goto err_free_label;
        state = ctx->state;
 
-       if ((opendata->o_arg.open_flags & O_EXCL) &&
+       if ((opendata->o_arg.open_flags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL) &&
            (opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) {
-               nfs4_exclusive_attrset(opendata, sattr);
+               nfs4_exclusive_attrset(opendata, sattr, &label);
 
                nfs_fattr_init(opendata->o_res.f_attr);
                status = nfs4_do_setattr(state->inode, cred,
@@ -2439,7 +2466,7 @@ static int _nfs4_do_open(struct inode *dir,
                        nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel);
                }
        }
-       if (opendata->file_created)
+       if (opened && opendata->file_created)
                *opened |= FILE_CREATED;
 
        if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server)) {
@@ -2661,7 +2688,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
        switch (task->tk_status) {
                case 0:
                        res_stateid = &calldata->res.stateid;
-                       if (calldata->arg.fmode == 0 && calldata->roc)
+                       if (calldata->roc)
                                pnfs_roc_set_barrier(state->inode,
                                                     calldata->roc_barrier);
                        renew_lease(server, calldata->timestamp);
@@ -2684,7 +2711,8 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
                                goto out_release;
                        }
        }
-       nfs_clear_open_stateid(state, res_stateid, calldata->arg.fmode);
+       nfs_clear_open_stateid(state, &calldata->arg.stateid,
+                       res_stateid, calldata->arg.fmode);
 out_release:
        nfs_release_seqid(calldata->arg.seqid);
        nfs_refresh_inode(calldata->inode, calldata->res.fattr);
@@ -2735,14 +2763,11 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
                goto out_no_action;
        }
 
-       if (calldata->arg.fmode == 0) {
+       if (calldata->arg.fmode == 0)
                task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE];
-               if (calldata->roc &&
-                   pnfs_roc_drain(inode, &calldata->roc_barrier, task)) {
-                       nfs_release_seqid(calldata->arg.seqid);
-                       goto out_wait;
-                   }
-       }
+       if (calldata->roc)
+               pnfs_roc_get_barrier(inode, &calldata->roc_barrier);
+
        calldata->arg.share_access =
                nfs4_map_atomic_open_share(NFS_SERVER(inode),
                                calldata->arg.fmode, 0);
@@ -2883,8 +2908,10 @@ static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
 
 static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
 {
+       u32 bitmask[3] = {}, minorversion = server->nfs_client->cl_minorversion;
        struct nfs4_server_caps_arg args = {
                .fhandle = fhandle,
+               .bitmask = bitmask,
        };
        struct nfs4_server_caps_res res = {};
        struct rpc_message msg = {
@@ -2894,10 +2921,18 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
        };
        int status;
 
+       bitmask[0] = FATTR4_WORD0_SUPPORTED_ATTRS |
+                    FATTR4_WORD0_FH_EXPIRE_TYPE |
+                    FATTR4_WORD0_LINK_SUPPORT |
+                    FATTR4_WORD0_SYMLINK_SUPPORT |
+                    FATTR4_WORD0_ACLSUPPORT;
+       if (minorversion)
+               bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT;
+
        status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
        if (status == 0) {
                /* Sanity check the server answers */
-               switch (server->nfs_client->cl_minorversion) {
+               switch (minorversion) {
                case 0:
                        res.attr_bitmask[1] &= FATTR4_WORD1_NFS40_MASK;
                        res.attr_bitmask[2] = 0;
@@ -2950,6 +2985,8 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
                server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
                server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
                server->cache_consistency_bitmask[2] = 0;
+               memcpy(server->exclcreat_bitmask, res.exclcreat_bitmask,
+                       sizeof(server->exclcreat_bitmask));
                server->acl_bitmask = res.acl_bitmask;
                server->fh_expire_type = res.fh_expire_type;
        }
@@ -3552,7 +3589,6 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
        struct nfs4_label l, *ilabel = NULL;
        struct nfs_open_context *ctx;
        struct nfs4_state *state;
-       int opened = 0;
        int status = 0;
 
        ctx = alloc_nfs_open_context(dentry, FMODE_READ);
@@ -3562,7 +3598,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
        ilabel = nfs4_label_init_security(dir, dentry, sattr, &l);
 
        sattr->ia_mode &= ~current_umask();
-       state = nfs4_do_open(dir, ctx, flags, sattr, ilabel, &opened);
+       state = nfs4_do_open(dir, ctx, flags, sattr, ilabel, NULL);
        if (IS_ERR(state)) {
                status = PTR_ERR(state);
                goto out;
@@ -4978,13 +5014,12 @@ nfs4_init_nonuniform_client_string(struct nfs_client *clp)
        int result;
        size_t len;
        char *str;
-       bool retried = false;
 
        if (clp->cl_owner_id != NULL)
                return 0;
-retry:
+
        rcu_read_lock();
-       len = 10 + strlen(clp->cl_ipaddr) + 1 +
+       len = 14 + strlen(clp->cl_ipaddr) + 1 +
                strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)) +
                1 +
                strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO)) +
@@ -5010,14 +5045,6 @@ retry:
                        rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO));
        rcu_read_unlock();
 
-       /* Did something change? */
-       if (result >= len) {
-               kfree(str);
-               if (retried)
-                       return -EINVAL;
-               retried = true;
-               goto retry;
-       }
        clp->cl_owner_id = str;
        return 0;
 }
@@ -5049,10 +5076,6 @@ nfs4_init_uniquifier_client_string(struct nfs_client *clp)
                        clp->rpc_ops->version, clp->cl_minorversion,
                        nfs4_client_id_uniquifier,
                        clp->cl_rpcclient->cl_nodename);
-       if (result >= len) {
-               kfree(str);
-               return -EINVAL;
-       }
        clp->cl_owner_id = str;
        return 0;
 }
@@ -5088,10 +5111,6 @@ nfs4_init_uniform_client_string(struct nfs_client *clp)
        result = scnprintf(str, len, "Linux NFSv%u.%u %s",
                        clp->rpc_ops->version, clp->cl_minorversion,
                        clp->cl_rpcclient->cl_nodename);
-       if (result >= len) {
-               kfree(str);
-               return -EINVAL;
-       }
        clp->cl_owner_id = str;
        return 0;
 }
@@ -5289,9 +5308,8 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)
 
        d_data = (struct nfs4_delegreturndata *)data;
 
-       if (d_data->roc &&
-           pnfs_roc_drain(d_data->inode, &d_data->roc_barrier, task))
-               return;
+       if (d_data->roc)
+               pnfs_roc_get_barrier(d_data->inode, &d_data->roc_barrier);
 
        nfs4_setup_sequence(d_data->res.server,
                        &d_data->args.seq_args,
@@ -7745,11 +7763,20 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
        switch (task->tk_status) {
        case 0:
                goto out;
+       /*
+        * NFS4ERR_BADLAYOUT means the MDS cannot return a layout of
+        * length lgp->args.minlength != 0 (see RFC5661 section 18.43.3).
+        */
+       case -NFS4ERR_BADLAYOUT:
+               goto out_overflow;
        /*
         * NFS4ERR_LAYOUTTRYLATER is a conflict with another client
-        * (or clients) writing to the same RAID stripe
+        * (or clients) writing to the same RAID stripe except when
+        * the minlength argument is 0 (see RFC5661 section 18.43.3).
         */
        case -NFS4ERR_LAYOUTTRYLATER:
+               if (lgp->args.minlength == 0)
+                       goto out_overflow;
        /*
         * NFS4ERR_RECALLCONFLICT is when conflict with self (must recall
         * existing layout before getting a new one).
@@ -7805,6 +7832,10 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
                rpc_restart_call_prepare(task);
 out:
        dprintk("<-- %s\n", __func__);
+       return;
+out_overflow:
+       task->tk_status = -EOVERFLOW;
+       goto out;
 }
 
 static size_t max_response_pages(struct nfs_server *server)
@@ -8661,6 +8692,7 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
        .reboot_recovery_ops = &nfs41_reboot_recovery_ops,
        .nograce_recovery_ops = &nfs41_nograce_recovery_ops,
        .state_renewal_ops = &nfs41_state_renewal_ops,
+       .mig_recovery_ops = &nfs41_mig_recovery_ops,
 };
 #endif
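
Among the changes above, nfs4_exclusive_attrset() and the EXCLUSIVE4_1 create path now restrict the attributes the client sends to those the server advertises in suppattr_exclcreat, so attributes already stored by the verifier are not set twice and unsupported ones are never requested. A small sketch of the three-word bitmap intersection (the example bit values are made up):

#include <stdint.h>
#include <stdio.h>

#define NWORDS 3        /* mirrors the three-word FATTR4 attribute bitmaps */

/* Keep only the attribute bits the server supports for exclusive create. */
static void mask_exclcreat(uint32_t requested[NWORDS],
                           const uint32_t server_exclcreat[NWORDS])
{
        for (int i = 0; i < NWORDS; i++)
                requested[i] &= server_exclcreat[i];
}

int main(void)
{
        /* Example values only; real bitmaps come from the server's reply. */
        uint32_t requested[NWORDS]        = { 0x0000001a, 0x00000030, 0x00010000 };
        uint32_t server_exclcreat[NWORDS] = { 0x0000001a, 0x00000010, 0x00000000 };

        mask_exclcreat(requested, server_exclcreat);
        printf("%08x:%08x:%08x\n", (unsigned)requested[0],
               (unsigned)requested[1], (unsigned)requested[2]);
        return 0;
}
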
 
index f2e2ad8944617f679a4a85934f6a276d3665229d..da73bc4432385748a5224a4fddf302ab2bb11cfa 100644 (file)
@@ -2152,23 +2152,13 @@ void nfs4_schedule_session_recovery(struct nfs4_session *session, int err)
 }
 EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery);
 
-static void nfs41_ping_server(struct nfs_client *clp)
+void nfs41_notify_server(struct nfs_client *clp)
 {
        /* Use CHECK_LEASE to ping the server with a SEQUENCE */
        set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
        nfs4_schedule_state_manager(clp);
 }
 
-void nfs41_server_notify_target_slotid_update(struct nfs_client *clp)
-{
-       nfs41_ping_server(clp);
-}
-
-void nfs41_server_notify_highest_slotid_update(struct nfs_client *clp)
-{
-       nfs41_ping_server(clp);
-}
-
 static void nfs4_reset_all_state(struct nfs_client *clp)
 {
        if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
index 470af1a78becf206c6e2de9aae89b61b35eb8452..28df12e525bac5857c0d41aba62d558db82f526a 100644 (file)
@@ -884,6 +884,66 @@ DEFINE_NFS4_GETATTR_EVENT(nfs4_getattr);
 DEFINE_NFS4_GETATTR_EVENT(nfs4_lookup_root);
 DEFINE_NFS4_GETATTR_EVENT(nfs4_fsinfo);
 
+DECLARE_EVENT_CLASS(nfs4_inode_callback_event,
+               TP_PROTO(
+                       const struct nfs_client *clp,
+                       const struct nfs_fh *fhandle,
+                       const struct inode *inode,
+                       int error
+               ),
+
+               TP_ARGS(clp, fhandle, inode, error),
+
+               TP_STRUCT__entry(
+                       __field(int, error)
+                       __field(dev_t, dev)
+                       __field(u32, fhandle)
+                       __field(u64, fileid)
+                       __string(dstaddr, clp ?
+                               rpc_peeraddr2str(clp->cl_rpcclient,
+                                       RPC_DISPLAY_ADDR) : "unknown")
+               ),
+
+               TP_fast_assign(
+                       __entry->error = error;
+                       __entry->fhandle = nfs_fhandle_hash(fhandle);
+                       if (inode != NULL) {
+                               __entry->fileid = NFS_FILEID(inode);
+                               __entry->dev = inode->i_sb->s_dev;
+                       } else {
+                               __entry->fileid = 0;
+                               __entry->dev = 0;
+                       }
+                       __assign_str(dstaddr, clp ?
+                               rpc_peeraddr2str(clp->cl_rpcclient,
+                                       RPC_DISPLAY_ADDR) : "unknown")
+               ),
+
+               TP_printk(
+                       "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+                       "dstaddr=%s",
+                       __entry->error,
+                       show_nfsv4_errors(__entry->error),
+                       MAJOR(__entry->dev), MINOR(__entry->dev),
+                       (unsigned long long)__entry->fileid,
+                       __entry->fhandle,
+                       __get_str(dstaddr)
+               )
+);
+
+#define DEFINE_NFS4_INODE_CALLBACK_EVENT(name) \
+       DEFINE_EVENT(nfs4_inode_callback_event, name, \
+                       TP_PROTO( \
+                               const struct nfs_client *clp, \
+                               const struct nfs_fh *fhandle, \
+                               const struct inode *inode, \
+                               int error \
+                       ), \
+                       TP_ARGS(clp, fhandle, inode, error))
+DEFINE_NFS4_INODE_CALLBACK_EVENT(nfs4_cb_getattr);
+DEFINE_NFS4_INODE_CALLBACK_EVENT(nfs4_cb_layoutrecall_inode);
+
+
 DECLARE_EVENT_CLASS(nfs4_idmap_event,
                TP_PROTO(
                        const char *name,
@@ -1136,6 +1196,7 @@ TRACE_EVENT(nfs4_layoutget,
 
 DEFINE_NFS4_INODE_EVENT(nfs4_layoutcommit);
 DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn);
+DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn_on_close);
 
 #endif /* CONFIG_NFS_V4_1 */
 
index 558cd65dbdb752d111b5b85649b72bae36fdf040..788adf3897c74f2cfee16e8584807dc7ddbd127d 100644 (file)
@@ -400,7 +400,8 @@ static int nfs4_stat_to_errno(int);
 #define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3)
 #define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \
                                encode_stateid_maxsz + \
-                               1 /* FIXME: opaque lrf_body always empty at the moment */)
+                               1 + \
+                               XDR_QUADLEN(NFS4_OPAQUE_LIMIT))
 #define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \
                                1 + decode_stateid_maxsz)
 #define encode_secinfo_no_name_maxsz (op_encode_hdr_maxsz + 1)
@@ -1001,7 +1002,8 @@ static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *ve
 
 static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
                                const struct nfs4_label *label,
-                               const struct nfs_server *server)
+                               const struct nfs_server *server,
+                               bool excl_check)
 {
        char owner_name[IDMAP_NAMESZ];
        char owner_group[IDMAP_NAMESZ];
@@ -1067,6 +1069,17 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
                bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET;
                len += 4;
        }
+
+       if (excl_check) {
+               const u32 *excl_bmval = server->exclcreat_bitmask;
+               bmval[0] &= excl_bmval[0];
+               bmval[1] &= excl_bmval[1];
+               bmval[2] &= excl_bmval[2];
+
+               if (!(excl_bmval[2] & FATTR4_WORD2_SECURITY_LABEL))
+                       label = NULL;
+       }
+
        if (label) {
                len += 4 + 4 + 4 + (XDR_QUADLEN(label->len) << 2);
                bmval[2] |= FATTR4_WORD2_SECURITY_LABEL;
@@ -1154,7 +1167,9 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
        case NF4LNK:
                p = reserve_space(xdr, 4);
                *p = cpu_to_be32(create->u.symlink.len);
-               xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len);
+               xdr_write_pages(xdr, create->u.symlink.pages, 0,
+                               create->u.symlink.len);
+               xdr->buf->flags |= XDRBUF_WRITE;
                break;
 
        case NF4BLK: case NF4CHR:
@@ -1168,7 +1183,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
        }
 
        encode_string(xdr, create->name->len, create->name->name);
-       encode_attrs(xdr, create->attrs, create->label, create->server);
+       encode_attrs(xdr, create->attrs, create->label, create->server, false);
 }
 
 static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr)
@@ -1382,18 +1397,17 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
 
 static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg)
 {
-       struct iattr dummy;
        __be32 *p;
 
        p = reserve_space(xdr, 4);
        switch(arg->createmode) {
        case NFS4_CREATE_UNCHECKED:
                *p = cpu_to_be32(NFS4_CREATE_UNCHECKED);
-               encode_attrs(xdr, arg->u.attrs, arg->label, arg->server);
+               encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false);
                break;
        case NFS4_CREATE_GUARDED:
                *p = cpu_to_be32(NFS4_CREATE_GUARDED);
-               encode_attrs(xdr, arg->u.attrs, arg->label, arg->server);
+               encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false);
                break;
        case NFS4_CREATE_EXCLUSIVE:
                *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE);
@@ -1402,8 +1416,7 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
        case NFS4_CREATE_EXCLUSIVE4_1:
                *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1);
                encode_nfs4_verifier(xdr, &arg->u.verifier);
-               dummy.ia_valid = 0;
-               encode_attrs(xdr, &dummy, arg->label, arg->server);
+               encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, true);
        }
 }
 
@@ -1659,7 +1672,7 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs
 {
        encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr);
        encode_nfs4_stateid(xdr, &arg->stateid);
-       encode_attrs(xdr, arg->iap, arg->label, server);
+       encode_attrs(xdr, arg->iap, arg->label, server, false);
 }
 
 static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr)
@@ -2580,6 +2593,7 @@ static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req,
                                     struct xdr_stream *xdr,
                                     struct nfs4_server_caps_arg *args)
 {
+       const u32 *bitmask = args->bitmask;
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
@@ -2587,11 +2601,7 @@ static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req,
        encode_compound_hdr(xdr, req, &hdr);
        encode_sequence(xdr, &args->seq_args, &hdr);
        encode_putfh(xdr, args->fhandle, &hdr);
-       encode_getattr_one(xdr, FATTR4_WORD0_SUPPORTED_ATTRS|
-                          FATTR4_WORD0_FH_EXPIRE_TYPE|
-                          FATTR4_WORD0_LINK_SUPPORT|
-                          FATTR4_WORD0_SYMLINK_SUPPORT|
-                          FATTR4_WORD0_ACLSUPPORT, &hdr);
+       encode_getattr_three(xdr, bitmask[0], bitmask[1], bitmask[2], &hdr);
        encode_nops(&hdr);
 }
 
@@ -3368,6 +3378,22 @@ out_overflow:
        return -EIO;
 }
 
+static int decode_attr_exclcreat_supported(struct xdr_stream *xdr,
+                                uint32_t *bitmap, uint32_t *bitmask)
+{
+       if (likely(bitmap[2] & FATTR4_WORD2_SUPPATTR_EXCLCREAT)) {
+               int ret;
+               ret = decode_attr_bitmap(xdr, bitmask);
+               if (unlikely(ret < 0))
+                       return ret;
+               bitmap[2] &= ~FATTR4_WORD2_SUPPATTR_EXCLCREAT;
+       } else
+               bitmask[0] = bitmask[1] = bitmask[2] = 0;
+       dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__,
+               bitmask[0], bitmask[1], bitmask[2]);
+       return 0;
+}
+
 static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fh *fh)
 {
        __be32 *p;
@@ -4321,6 +4347,9 @@ static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_re
                goto xdr_error;
        if ((status = decode_attr_aclsupport(xdr, bitmap, &res->acl_bitmask)) != 0)
                goto xdr_error;
+       if ((status = decode_attr_exclcreat_supported(xdr, bitmap,
+                               res->exclcreat_bitmask)) != 0)
+               goto xdr_error;
        status = verify_attr_len(xdr, savep, attrlen);
 xdr_error:
        dprintk("%s: xdr returned %d!\n", __func__, -status);
@@ -4903,24 +4932,28 @@ static int decode_lookup(struct xdr_stream *xdr)
 }
 
 /* This is too sick! */
-static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize)
+static int decode_space_limit(struct xdr_stream *xdr,
+               unsigned long *pagemod_limit)
 {
        __be32 *p;
        uint32_t limit_type, nblocks, blocksize;
+       u64 maxsize = 0;
 
        p = xdr_inline_decode(xdr, 12);
        if (unlikely(!p))
                goto out_overflow;
        limit_type = be32_to_cpup(p++);
        switch (limit_type) {
-       case 1:
-               xdr_decode_hyper(p, maxsize);
+       case NFS4_LIMIT_SIZE:
+               xdr_decode_hyper(p, &maxsize);
                break;
-       case 2:
+       case NFS4_LIMIT_BLOCKS:
                nblocks = be32_to_cpup(p++);
                blocksize = be32_to_cpup(p);
-               *maxsize = (uint64_t)nblocks * (uint64_t)blocksize;
+               maxsize = (uint64_t)nblocks * (uint64_t)blocksize;
        }
+       maxsize >>= PAGE_CACHE_SHIFT;
+       *pagemod_limit = min_t(u64, maxsize, ULONG_MAX);
        return 0;
 out_overflow:
        print_overflow_msg(__func__, xdr);
@@ -4948,7 +4981,7 @@ static int decode_rw_delegation(struct xdr_stream *xdr,
                break;
        case NFS4_OPEN_DELEGATE_WRITE:
                res->delegation_type = FMODE_WRITE|FMODE_READ;
-               if (decode_space_limit(xdr, &res->maxsize) < 0)
+               if (decode_space_limit(xdr, &res->pagemod_limit) < 0)
                                return -EIO;
        }
        return decode_ace(xdr, NULL, res->server->nfs_client);
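
decode_space_limit() above folds both forms of the delegation space limit, an absolute size or nblocks * blocksize, into a page count clamped to fit an unsigned long. The same arithmetic as a stand-alone sketch (a 4 KiB page, i.e. a page shift of 12, is assumed here purely for illustration):

#include <limits.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12   /* 4 KiB pages assumed for this sketch */

/* Convert a byte limit into a page-count limit that fits an unsigned long. */
static unsigned long bytes_to_page_limit(uint64_t maxsize)
{
        uint64_t pages = maxsize >> PAGE_SHIFT;

        return pages > ULONG_MAX ? ULONG_MAX : (unsigned long)pages;
}

int main(void)
{
        uint64_t nblocks = 1024, blocksize = 4096;

        printf("size limit:  %lu pages\n", bytes_to_page_limit(1ULL << 20));
        printf("block limit: %lu pages\n",
               bytes_to_page_limit(nblocks * blocksize));
        return 0;
}
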
index 4984bbe55ff1eed1623df2196bc0de0de41a4304..7c5718ba625e28ff661868dd2d32fb438042a7bb 100644 (file)
@@ -77,8 +77,8 @@ EXPORT_SYMBOL_GPL(nfs_pgheader_init);
 void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos)
 {
        spin_lock(&hdr->lock);
-       if (pos < hdr->io_start + hdr->good_bytes) {
-               set_bit(NFS_IOHDR_ERROR, &hdr->flags);
+       if (!test_and_set_bit(NFS_IOHDR_ERROR, &hdr->flags)
+           || pos < hdr->io_start + hdr->good_bytes) {
                clear_bit(NFS_IOHDR_EOF, &hdr->flags);
                hdr->good_bytes = pos - hdr->io_start;
                hdr->error = error;
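
The nfs_set_pgio_error() change above records an error either when it is the first one seen for the header or when it occurs at an earlier offset than the one already recorded, so good_bytes always shrinks to the shortest known-good prefix. A toy version of that rule (the header struct is heavily simplified):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct pgio_hdr {
        bool     error_set;
        int      error;
        uint64_t io_start;
        uint64_t good_bytes;    /* length of the known-good prefix */
};

static void set_pgio_error(struct pgio_hdr *hdr, int error, uint64_t pos)
{
        /* Record the first error, or one that shortens the good prefix. */
        if (!hdr->error_set || pos < hdr->io_start + hdr->good_bytes) {
                hdr->error_set  = true;
                hdr->error      = error;
                hdr->good_bytes = pos - hdr->io_start;
        }
}

int main(void)
{
        struct pgio_hdr hdr = { .io_start = 0, .good_bytes = 4096 };

        set_pgio_error(&hdr, -5, 2048); /* first error wins */
        set_pgio_error(&hdr, -7, 3072); /* later offset: ignored */
        set_pgio_error(&hdr, -9, 1024); /* earlier offset: shortens prefix */
        printf("error=%d good_bytes=%llu\n", hdr.error,
               (unsigned long long)hdr.good_bytes);
        return 0;
}
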
index 70bf706b10904e156affe9dd4bea2ec9a17776c5..ba1246433794f0b917ac84738b2d952fd782b2fd 100644 (file)
@@ -368,7 +368,6 @@ pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo)
        if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
                return false;
        lo->plh_return_iomode = 0;
-       lo->plh_block_lgets++;
        pnfs_get_layout_hdr(lo);
        clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags);
        return true;
@@ -817,25 +816,12 @@ pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo,
        return !pnfs_seqid_is_newer(seqid, lo->plh_barrier);
 }
 
-static bool
-pnfs_layout_returning(const struct pnfs_layout_hdr *lo,
-                     struct pnfs_layout_range *range)
-{
-       return test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) &&
-               (lo->plh_return_iomode == IOMODE_ANY ||
-                lo->plh_return_iomode == range->iomode);
-}
-
 /* lget is set to 1 if called from inside send_layoutget call chain */
 static bool
-pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo,
-                       struct pnfs_layout_range *range, int lget)
+pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo)
 {
        return lo->plh_block_lgets ||
-               test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
-               (list_empty(&lo->plh_segs) &&
-                (atomic_read(&lo->plh_outstanding) > lget)) ||
-               pnfs_layout_returning(lo, range);
+               test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
 }
 
 int
@@ -847,7 +833,7 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
 
        dprintk("--> %s\n", __func__);
        spin_lock(&lo->plh_inode->i_lock);
-       if (pnfs_layoutgets_blocked(lo, range, 1)) {
+       if (pnfs_layoutgets_blocked(lo)) {
                status = -EAGAIN;
        } else if (!nfs4_valid_open_stateid(open_state)) {
                status = -EBADF;
@@ -882,6 +868,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
        struct nfs_server *server = NFS_SERVER(ino);
        struct nfs4_layoutget *lgp;
        struct pnfs_layout_segment *lseg;
+       loff_t i_size;
 
        dprintk("--> %s\n", __func__);
 
@@ -889,9 +876,17 @@ send_layoutget(struct pnfs_layout_hdr *lo,
        if (lgp == NULL)
                return NULL;
 
+       i_size = i_size_read(ino);
+
        lgp->args.minlength = PAGE_CACHE_SIZE;
        if (lgp->args.minlength > range->length)
                lgp->args.minlength = range->length;
+       if (range->iomode == IOMODE_READ) {
+               if (range->offset >= i_size)
+                       lgp->args.minlength = 0;
+               else if (i_size - range->offset < lgp->args.minlength)
+                       lgp->args.minlength = i_size - range->offset;
+       }
        lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
        lgp->args.range = *range;
        lgp->args.type = server->pnfs_curr_ld->id;
@@ -956,9 +951,7 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid,
        if (unlikely(lrp == NULL)) {
                status = -ENOMEM;
                spin_lock(&ino->i_lock);
-               lo->plh_block_lgets--;
                pnfs_clear_layoutreturn_waitbit(lo);
-               rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq);
                spin_unlock(&ino->i_lock);
                pnfs_put_layout_hdr(lo);
                goto out;
@@ -1080,15 +1073,14 @@ bool pnfs_roc(struct inode *ino)
        struct pnfs_layout_segment *lseg, *tmp;
        nfs4_stateid stateid;
        LIST_HEAD(tmp_list);
-       bool found = false, layoutreturn = false;
+       bool found = false, layoutreturn = false, roc = false;
 
        spin_lock(&ino->i_lock);
        lo = nfsi->layout;
-       if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) ||
-           test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
+       if (!lo || test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
                goto out_noroc;
 
-       /* Don't return layout if we hold a delegation */
+       /* no roc if we hold a delegation */
        if (nfs4_check_delegation(ino, FMODE_READ))
                goto out_noroc;
 
@@ -1099,34 +1091,41 @@ bool pnfs_roc(struct inode *ino)
                        goto out_noroc;
        }
 
+       stateid = lo->plh_stateid;
+       /* always send layoutreturn if being marked so */
+       if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
+                                  &lo->plh_flags))
+               layoutreturn = pnfs_prepare_layoutreturn(lo);
+
        pnfs_clear_retry_layoutget(lo);
        list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
-               if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
+               /* If we are sending layoutreturn, invalidate all valid lsegs */
+               if (layoutreturn || test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
                        mark_lseg_invalid(lseg, &tmp_list);
                        found = true;
                }
-       if (!found)
-               goto out_noroc;
-       lo->plh_block_lgets++;
-       pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */
-       spin_unlock(&ino->i_lock);
-       pnfs_free_lseg_list(&tmp_list);
-       pnfs_layoutcommit_inode(ino, true);
-       return true;
+       /* pnfs_prepare_layoutreturn() grabs lo ref and it will be put
+        * in pnfs_roc_release(). We don't really send a layoutreturn but
+        * still want others to view us like we are sending one!
+        *
+        * If pnfs_prepare_layoutreturn() fails, it means someone else is doing
+        * LAYOUTRETURN, so we proceed like there are no layouts to return.
+        *
+        * ROC in three conditions:
+        * 1. there are ROC lsegs
+        * 2. we don't send layoutreturn
+        * 3. no others are sending layoutreturn
+        */
+       if (found && !layoutreturn && pnfs_prepare_layoutreturn(lo))
+               roc = true;
 
 out_noroc:
-       if (lo) {
-               stateid = lo->plh_stateid;
-               if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
-                                          &lo->plh_flags))
-                       layoutreturn = pnfs_prepare_layoutreturn(lo);
-       }
        spin_unlock(&ino->i_lock);
-       if (layoutreturn) {
-               pnfs_layoutcommit_inode(ino, true);
+       pnfs_free_lseg_list(&tmp_list);
+       pnfs_layoutcommit_inode(ino, true);
+       if (layoutreturn)
                pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true);
-       }
-       return false;
+       return roc;
 }
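A minimal sketch of the decision described in the comment above, restated as a standalone predicate; the helper below is illustrative only and not part of the patch:

	/* Illustrative only: mirrors the locals in pnfs_roc(). ROC is granted
	 * only when ROC lsegs were found, we are not sending a layoutreturn
	 * ourselves, and pnfs_prepare_layoutreturn() succeeds (i.e. nobody
	 * else is sending one). */
	static bool pnfs_roc_allowed(struct pnfs_layout_hdr *lo,
				     bool found, bool layoutreturn)
	{
		return found && !layoutreturn && pnfs_prepare_layoutreturn(lo);
	}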
 
 void pnfs_roc_release(struct inode *ino)
@@ -1135,7 +1134,7 @@ void pnfs_roc_release(struct inode *ino)
 
        spin_lock(&ino->i_lock);
        lo = NFS_I(ino)->layout;
-       lo->plh_block_lgets--;
+       pnfs_clear_layoutreturn_waitbit(lo);
        if (atomic_dec_and_test(&lo->plh_refcount)) {
                pnfs_detach_layout_hdr(lo);
                spin_unlock(&ino->i_lock);
@@ -1153,27 +1152,16 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
        if (pnfs_seqid_is_newer(barrier, lo->plh_barrier))
                lo->plh_barrier = barrier;
        spin_unlock(&ino->i_lock);
+       trace_nfs4_layoutreturn_on_close(ino, 0);
 }
 
-bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
+void pnfs_roc_get_barrier(struct inode *ino, u32 *barrier)
 {
        struct nfs_inode *nfsi = NFS_I(ino);
        struct pnfs_layout_hdr *lo;
-       struct pnfs_layout_segment *lseg;
-       nfs4_stateid stateid;
        u32 current_seqid;
-       bool layoutreturn = false;
 
        spin_lock(&ino->i_lock);
-       list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list) {
-               if (!test_bit(NFS_LSEG_ROC, &lseg->pls_flags))
-                       continue;
-               if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags))
-                       continue;
-               rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
-               spin_unlock(&ino->i_lock);
-               return true;
-       }
        lo = nfsi->layout;
        current_seqid = be32_to_cpu(lo->plh_stateid.seqid);
 
@@ -1181,19 +1169,7 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
         * a barrier, we choose the worst-case barrier.
         */
        *barrier = current_seqid + atomic_read(&lo->plh_outstanding);
-       stateid = lo->plh_stateid;
-       if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
-                                          &lo->plh_flags))
-               layoutreturn = pnfs_prepare_layoutreturn(lo);
-       if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
-               rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
-
        spin_unlock(&ino->i_lock);
-       if (layoutreturn) {
-               pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, false);
-               return true;
-       }
-       return false;
 }
 
 /*
@@ -1221,16 +1197,41 @@ pnfs_lseg_range_cmp(const struct pnfs_layout_range *l1,
        return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ);
 }
 
-static void
-pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo,
-                  struct pnfs_layout_segment *lseg)
+static bool
+pnfs_lseg_range_is_after(const struct pnfs_layout_range *l1,
+               const struct pnfs_layout_range *l2)
 {
-       struct pnfs_layout_segment *lp;
+       return pnfs_lseg_range_cmp(l1, l2) > 0;
+}
+
+static bool
+pnfs_lseg_no_merge(struct pnfs_layout_segment *lseg,
+               struct pnfs_layout_segment *old)
+{
+       return false;
+}
+
+void
+pnfs_generic_layout_insert_lseg(struct pnfs_layout_hdr *lo,
+                  struct pnfs_layout_segment *lseg,
+                  bool (*is_after)(const struct pnfs_layout_range *,
+                          const struct pnfs_layout_range *),
+                  bool (*do_merge)(struct pnfs_layout_segment *,
+                          struct pnfs_layout_segment *),
+                  struct list_head *free_me)
+{
+       struct pnfs_layout_segment *lp, *tmp;
 
        dprintk("%s:Begin\n", __func__);
 
-       list_for_each_entry(lp, &lo->plh_segs, pls_list) {
-               if (pnfs_lseg_range_cmp(&lseg->pls_range, &lp->pls_range) > 0)
+       list_for_each_entry_safe(lp, tmp, &lo->plh_segs, pls_list) {
+               if (test_bit(NFS_LSEG_VALID, &lp->pls_flags) == 0)
+                       continue;
+               if (do_merge(lseg, lp)) {
+                       mark_lseg_invalid(lp, free_me);
+                       continue;
+               }
+               if (is_after(&lseg->pls_range, &lp->pls_range))
                        continue;
                list_add_tail(&lseg->pls_list, &lp->pls_list);
                dprintk("%s: inserted lseg %p "
@@ -1252,6 +1253,24 @@ out:
 
        dprintk("%s:Return\n", __func__);
 }
+EXPORT_SYMBOL_GPL(pnfs_generic_layout_insert_lseg);
+
+static void
+pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo,
+                  struct pnfs_layout_segment *lseg,
+                  struct list_head *free_me)
+{
+       struct inode *inode = lo->plh_inode;
+       struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+
+       if (ld->add_lseg != NULL)
+               ld->add_lseg(lo, lseg, free_me);
+       else
+               pnfs_generic_layout_insert_lseg(lo, lseg,
+                               pnfs_lseg_range_is_after,
+                               pnfs_lseg_no_merge,
+                               free_me);
+}
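A sketch of how a layout driver might use the new ->add_lseg hook together with pnfs_generic_layout_insert_lseg(); the my_* names below are hypothetical, only the generic helper and the ops field come from this series:

	/* Hypothetical driver callbacks (not part of the patch). */
	static bool my_lseg_is_after(const struct pnfs_layout_range *l1,
				     const struct pnfs_layout_range *l2);
	static bool my_lseg_can_merge(struct pnfs_layout_segment *lseg,
				      struct pnfs_layout_segment *old);

	static void my_add_lseg(struct pnfs_layout_hdr *lo,
				struct pnfs_layout_segment *lseg,
				struct list_head *free_me)
	{
		/* Reuse the generic sorted insert, but with the driver's own
		 * ordering and merge policy. */
		pnfs_generic_layout_insert_lseg(lo, lseg,
				my_lseg_is_after, my_lseg_can_merge, free_me);
	}
	/* Wired up via the layout driver ops: .add_lseg = my_add_lseg */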
 
 static struct pnfs_layout_hdr *
 alloc_init_layout_hdr(struct inode *ino,
@@ -1344,8 +1363,6 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo,
                        ret = pnfs_get_lseg(lseg);
                        break;
                }
-               if (lseg->pls_range.offset > range->offset)
-                       break;
        }
 
        dprintk("%s:Return lseg %p ref %d\n",
@@ -1438,6 +1455,8 @@ static int pnfs_layoutget_retry_bit_wait(struct wait_bit_key *key)
 
 static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo)
 {
+       if (!pnfs_should_retry_layoutget(lo))
+               return false;
        /*
         * send layoutcommit as it can hold up layoutreturn due to lseg
         * reference
@@ -1484,6 +1503,9 @@ pnfs_update_layout(struct inode *ino,
        if (!pnfs_enabled_sb(NFS_SERVER(ino)))
                goto out;
 
+       if (iomode == IOMODE_READ && i_size_read(ino) == 0)
+               goto out;
+
        if (pnfs_within_mdsthreshold(ctx, ino, iomode))
                goto out;
 
@@ -1533,8 +1555,7 @@ lookup_again:
         * Because we free lsegs before sending LAYOUTRETURN, we need to wait
         * for LAYOUTRETURN even if first is true.
         */
-       if (!lseg && pnfs_should_retry_layoutget(lo) &&
-           test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
+       if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
                spin_unlock(&ino->i_lock);
                dprintk("%s wait for layoutreturn\n", __func__);
                if (pnfs_prepare_to_retry_layoutget(lo)) {
@@ -1547,7 +1568,7 @@ lookup_again:
                goto out_put_layout_hdr;
        }
 
-       if (pnfs_layoutgets_blocked(lo, &arg, 0))
+       if (pnfs_layoutgets_blocked(lo))
                goto out_unlock;
        atomic_inc(&lo->plh_outstanding);
        spin_unlock(&ino->i_lock);
@@ -1593,6 +1614,26 @@ out_unlock:
 }
 EXPORT_SYMBOL_GPL(pnfs_update_layout);
 
+static bool
+pnfs_sanity_check_layout_range(struct pnfs_layout_range *range)
+{
+       switch (range->iomode) {
+       case IOMODE_READ:
+       case IOMODE_RW:
+               break;
+       default:
+               return false;
+       }
+       if (range->offset == NFS4_MAX_UINT64)
+               return false;
+       if (range->length == 0)
+               return false;
+       if (range->length != NFS4_MAX_UINT64 &&
+           range->length > NFS4_MAX_UINT64 - range->offset)
+               return false;
+       return true;
+}
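For reference, under the checks above (values illustrative): a reply range of {IOMODE_READ, offset 0, length NFS4_MAX_UINT64} is accepted, while a zero-length range, an offset of NFS4_MAX_UINT64, or an offset/length pair that would overflow past NFS4_MAX_UINT64 is rejected and the reply is dropped with -EINVAL.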
+
 struct pnfs_layout_segment *
 pnfs_layout_process(struct nfs4_layoutget *lgp)
 {
@@ -1601,7 +1642,10 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
        struct pnfs_layout_segment *lseg;
        struct inode *ino = lo->plh_inode;
        LIST_HEAD(free_me);
-       int status = 0;
+       int status = -EINVAL;
+
+       if (!pnfs_sanity_check_layout_range(&res->range))
+               goto out;
 
        /* Inject layout blob into I/O device driver */
        lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags);
@@ -1619,12 +1663,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
        lseg->pls_range = res->range;
 
        spin_lock(&ino->i_lock);
-       if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
-               dprintk("%s forget reply due to recall\n", __func__);
-               goto out_forget_reply;
-       }
-
-       if (pnfs_layoutgets_blocked(lo, &lgp->args.range, 1)) {
+       if (pnfs_layoutgets_blocked(lo)) {
                dprintk("%s forget reply due to state\n", __func__);
                goto out_forget_reply;
        }
@@ -1651,12 +1690,10 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
        clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
 
        pnfs_get_lseg(lseg);
-       pnfs_layout_insert_lseg(lo, lseg);
+       pnfs_layout_insert_lseg(lo, lseg, &free_me);
 
-       if (res->return_on_close) {
+       if (res->return_on_close)
                set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
-               set_bit(NFS_LAYOUT_ROC, &lo->plh_flags);
-       }
 
        spin_unlock(&ino->i_lock);
        pnfs_free_lseg_list(&free_me);
@@ -1692,6 +1729,8 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
                                lseg->pls_range.length);
                        set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
                        mark_lseg_invalid(lseg, tmp_list);
+                       set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
+                                       &lo->plh_flags);
                }
 }
 
@@ -2267,7 +2306,7 @@ struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
 
 #if IS_ENABLED(CONFIG_NFS_V4_2)
 int
-pnfs_report_layoutstat(struct inode *inode)
+pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags)
 {
        struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
        struct nfs_server *server = NFS_SERVER(inode);
@@ -2294,7 +2333,7 @@ pnfs_report_layoutstat(struct inode *inode)
        pnfs_get_layout_hdr(hdr);
        spin_unlock(&inode->i_lock);
 
-       data = kzalloc(sizeof(*data), GFP_KERNEL);
+       data = kzalloc(sizeof(*data), gfp_flags);
        if (!data) {
                status = -ENOMEM;
                goto out_put;
@@ -2324,3 +2363,7 @@ out_put:
 }
 EXPORT_SYMBOL_GPL(pnfs_report_layoutstat);
 #endif
+
+unsigned int layoutstats_timer;
+module_param(layoutstats_timer, uint, 0644);
+EXPORT_SYMBOL_GPL(layoutstats_timer);
index 3e6ab7bfbabd428425227b6f9d2a94711edd371e..78c9351ff117bdf56e04bbcf1d696ccbbfb6b866 100644 (file)
@@ -94,7 +94,6 @@ enum {
        NFS_LAYOUT_RO_FAILED = 0,       /* get ro layout failed stop trying */
        NFS_LAYOUT_RW_FAILED,           /* get rw layout failed stop trying */
        NFS_LAYOUT_BULK_RECALL,         /* bulk recall affecting layout */
-       NFS_LAYOUT_ROC,                 /* some lseg had roc bit set */
        NFS_LAYOUT_RETURN,              /* Return this layout ASAP */
        NFS_LAYOUT_RETURN_BEFORE_CLOSE, /* Return this layout before close */
        NFS_LAYOUT_INVALID_STID,        /* layout stateid id is invalid */
@@ -129,6 +128,9 @@ struct pnfs_layoutdriver_type {
 
        struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);
        void (*free_lseg) (struct pnfs_layout_segment *lseg);
+       void (*add_lseg) (struct pnfs_layout_hdr *layoutid,
+                       struct pnfs_layout_segment *lseg,
+                       struct list_head *free_me);
 
        void (*return_range) (struct pnfs_layout_hdr *lo,
                              struct pnfs_layout_range *range);
@@ -184,15 +186,15 @@ struct pnfs_layoutdriver_type {
 
 struct pnfs_layout_hdr {
        atomic_t                plh_refcount;
+       atomic_t                plh_outstanding; /* number of RPCs out */
        struct list_head        plh_layouts;   /* other client layouts */
        struct list_head        plh_bulk_destroy;
        struct list_head        plh_segs;      /* layout segments list */
-       nfs4_stateid            plh_stateid;
-       atomic_t                plh_outstanding; /* number of RPCs out */
        unsigned long           plh_block_lgets; /* block LAYOUTGET if >0 */
-       u32                     plh_barrier; /* ignore lower seqids */
        unsigned long           plh_retry_timestamp;
        unsigned long           plh_flags;
+       nfs4_stateid            plh_stateid;
+       u32                     plh_barrier; /* ignore lower seqids */
        enum pnfs_iomode        plh_return_iomode;
        loff_t                  plh_lwb; /* last write byte for layoutcommit */
        struct rpc_cred         *plh_lc_cred; /* layoutcommit cred */
@@ -267,7 +269,7 @@ int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
 bool pnfs_roc(struct inode *ino);
 void pnfs_roc_release(struct inode *ino);
 void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
-bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task);
+void pnfs_roc_get_barrier(struct inode *ino, u32 *barrier);
 void pnfs_set_layoutcommit(struct inode *, struct pnfs_layout_segment *, loff_t);
 void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
 int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
@@ -286,6 +288,14 @@ struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
                                               gfp_t gfp_flags);
 void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo);
 
+void pnfs_generic_layout_insert_lseg(struct pnfs_layout_hdr *lo,
+                  struct pnfs_layout_segment *lseg,
+                  bool (*is_after)(const struct pnfs_layout_range *lseg_range,
+                          const struct pnfs_layout_range *old),
+                  bool (*do_merge)(struct pnfs_layout_segment *lseg,
+                          struct pnfs_layout_segment *old),
+                  struct list_head *free_me);
+
 void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp);
 int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *);
 int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *);
@@ -529,12 +539,31 @@ pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src,
                                        nfss->pnfs_curr_ld->id == src->l_type);
 }
 
+static inline u64
+pnfs_calc_offset_end(u64 offset, u64 len)
+{
+       if (len == NFS4_MAX_UINT64 || len >= NFS4_MAX_UINT64 - offset)
+               return NFS4_MAX_UINT64;
+       return offset + len - 1;
+}
+
+static inline u64
+pnfs_calc_offset_length(u64 offset, u64 end)
+{
+       if (end == NFS4_MAX_UINT64 || end <= offset)
+               return NFS4_MAX_UINT64;
+       return 1 + end - offset;
+}
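A small worked example of the two inclusive-range helpers above, treating NFS4_MAX_UINT64 as the "to end of file" sentinel (numbers are illustrative, not part of the patch):

	u64 end = pnfs_calc_offset_end(4096, 8192);		/* 12287: last byte covered */
	u64 len = pnfs_calc_offset_length(4096, 12287);		/* 8192: round-trips */
	u64 eof = pnfs_calc_offset_end(4096, NFS4_MAX_UINT64);	/* NFS4_MAX_UINT64: to EOF */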
+
+extern unsigned int layoutstats_timer;
+
 #ifdef NFS_DEBUG
 void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id);
 #else
 static inline void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id)
 {
 }
+
 #endif /* NFS_DEBUG */
 #else  /* CONFIG_NFS_V4_1 */
 
@@ -605,10 +634,9 @@ pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
 {
 }
 
-static inline bool
-pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
+static inline void
+pnfs_roc_get_barrier(struct inode *ino, u32 *barrier)
 {
-       return false;
 }
 
 static inline void set_pnfs_layoutdriver(struct nfs_server *s,
@@ -691,10 +719,10 @@ static inline void nfs4_pnfs_v3_ds_connect_unload(void)
 #endif /* CONFIG_NFS_V4_1 */
 
 #if IS_ENABLED(CONFIG_NFS_V4_2)
-int pnfs_report_layoutstat(struct inode *inode);
+int pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags);
 #else
 static inline int
-pnfs_report_layoutstat(struct inode *inode)
+pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags)
 {
        return 0;
 }
index f37e25b6311c83ac890508207be3e7d6cdc7bdda..24655b807d442596e14c360b29c81d4cedca9ef0 100644 (file)
@@ -124,11 +124,12 @@ pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
        if (ret) {
                cinfo->ds->nwritten -= ret;
                cinfo->ds->ncommitting += ret;
-               bucket->clseg = bucket->wlseg;
-               if (list_empty(src))
+               if (bucket->clseg == NULL)
+                       bucket->clseg = pnfs_get_lseg(bucket->wlseg);
+               if (list_empty(src)) {
+                       pnfs_put_lseg_locked(bucket->wlseg);
                        bucket->wlseg = NULL;
-               else
-                       pnfs_get_lseg(bucket->clseg);
+               }
        }
        return ret;
 }
@@ -182,19 +183,23 @@ static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx)
        struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
        struct pnfs_commit_bucket *bucket;
        struct pnfs_layout_segment *freeme;
+       LIST_HEAD(pages);
        int i;
 
+       spin_lock(cinfo->lock);
        for (i = idx; i < fl_cinfo->nbuckets; i++) {
                bucket = &fl_cinfo->buckets[i];
                if (list_empty(&bucket->committing))
                        continue;
-               nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo, i);
-               spin_lock(cinfo->lock);
                freeme = bucket->clseg;
                bucket->clseg = NULL;
+               list_splice_init(&bucket->committing, &pages);
                spin_unlock(cinfo->lock);
+               nfs_retry_commit(&pages, freeme, cinfo, i);
                pnfs_put_lseg(freeme);
+               spin_lock(cinfo->lock);
        }
+       spin_unlock(cinfo->lock);
 }
 
 static unsigned int
@@ -216,10 +221,6 @@ pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo,
                if (!data)
                        break;
                data->ds_commit_index = i;
-               spin_lock(cinfo->lock);
-               data->lseg = bucket->clseg;
-               bucket->clseg = NULL;
-               spin_unlock(cinfo->lock);
                list_add(&data->pages, list);
                nreq++;
        }
@@ -229,6 +230,22 @@ pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo,
        return nreq;
 }
 
+static inline
+void pnfs_fetch_commit_bucket_list(struct list_head *pages,
+               struct nfs_commit_data *data,
+               struct nfs_commit_info *cinfo)
+{
+       struct pnfs_commit_bucket *bucket;
+
+       bucket = &cinfo->ds->buckets[data->ds_commit_index];
+       spin_lock(cinfo->lock);
+       list_splice_init(&bucket->committing, pages);
+       data->lseg = bucket->clseg;
+       bucket->clseg = NULL;
+       spin_unlock(cinfo->lock);
+}
+
 /* This follows nfs_commit_list pretty closely */
 int
 pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
@@ -243,7 +260,7 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
        if (!list_empty(mds_pages)) {
                data = nfs_commitdata_alloc();
                if (data != NULL) {
-                       data->lseg = NULL;
+                       data->ds_commit_index = -1;
                        list_add(&data->pages, &list);
                        nreq++;
                } else {
@@ -265,19 +282,16 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
 
        list_for_each_entry_safe(data, tmp, &list, pages) {
                list_del_init(&data->pages);
-               if (!data->lseg) {
+               if (data->ds_commit_index < 0) {
                        nfs_init_commit(data, mds_pages, NULL, cinfo);
                        nfs_initiate_commit(NFS_CLIENT(inode), data,
                                            NFS_PROTO(data->inode),
                                            data->mds_ops, how, 0);
                } else {
-                       struct pnfs_commit_bucket *buckets;
+                       LIST_HEAD(pages);
 
-                       buckets = cinfo->ds->buckets;
-                       nfs_init_commit(data,
-                                       &buckets[data->ds_commit_index].committing,
-                                       data->lseg,
-                                       cinfo);
+                       pnfs_fetch_commit_bucket_list(&pages, data, cinfo);
+                       nfs_init_commit(data, &pages, data->lseg, cinfo);
                        initiate_commit(data, how);
                }
        }
@@ -359,26 +373,31 @@ same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2)
        return false;
 }
 
+/*
+ * Check whether every address in 'dsaddrs1' also appears in 'dsaddrs2'
+ * (i.e. 'dsaddrs1' is a subset of 'dsaddrs2'). If so, declare a match.
+ */
 static bool
 _same_data_server_addrs_locked(const struct list_head *dsaddrs1,
                               const struct list_head *dsaddrs2)
 {
        struct nfs4_pnfs_ds_addr *da1, *da2;
-
-       /* step through both lists, comparing as we go */
-       for (da1 = list_first_entry(dsaddrs1, typeof(*da1), da_node),
-            da2 = list_first_entry(dsaddrs2, typeof(*da2), da_node);
-            da1 != NULL && da2 != NULL;
-            da1 = list_entry(da1->da_node.next, typeof(*da1), da_node),
-            da2 = list_entry(da2->da_node.next, typeof(*da2), da_node)) {
-               if (!same_sockaddr((struct sockaddr *)&da1->da_addr,
-                                  (struct sockaddr *)&da2->da_addr))
-                       return false;
+       struct sockaddr *sa1, *sa2;
+       bool match = false;
+
+       list_for_each_entry(da1, dsaddrs1, da_node) {
+               sa1 = (struct sockaddr *)&da1->da_addr;
+               match = false;
+               list_for_each_entry(da2, dsaddrs2, da_node) {
+                       sa2 = (struct sockaddr *)&da2->da_addr;
+                       match = same_sockaddr(sa1, sa2);
+                       if (match)
+                               break;
+               }
+               if (!match)
+                       break;
        }
-       if (da1 == NULL && da2 == NULL)
-               return true;
-
-       return false;
+       return match;
 }
 
 /*
@@ -863,9 +882,10 @@ pnfs_layout_mark_request_commit(struct nfs_page *req,
        }
        set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
        cinfo->ds->nwritten++;
-       spin_unlock(cinfo->lock);
 
-       nfs_request_add_commit_list(req, list, cinfo);
+       nfs_request_add_commit_list_locked(req, list, cinfo);
+       spin_unlock(cinfo->lock);
+       nfs_mark_page_unstable(req->wb_page, cinfo);
 }
 EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit);
 
index aa62004f1706f9c685b368379ce021c1f3474c72..383a027de4528d74bd976cc5bfa237be653c5e77 100644 (file)
@@ -381,9 +381,12 @@ int __init register_nfs_fs(void)
        ret = nfs_register_sysctl();
        if (ret < 0)
                goto error_2;
-       register_shrinker(&acl_shrinker);
+       ret = register_shrinker(&acl_shrinker);
+       if (ret < 0)
+               goto error_3;
        return 0;
-
+error_3:
+       nfs_unregister_sysctl();
 error_2:
        unregister_nfs4_fs();
 error_1:
index 75a35a1afa7944d4ac54bd94994cddf1fd05ab54..388f48079c43839fa9c8222d78556282920858f2 100644 (file)
@@ -767,6 +767,28 @@ nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
        return NULL;
 }
 
+/**
+ * nfs_request_add_commit_list_locked - add request to a commit list
+ * @req: pointer to a struct nfs_page
+ * @dst: commit list head
+ * @cinfo: holds list lock and accounting info
+ *
+ * This sets the PG_CLEAN bit and updates the cinfo count of
+ * outstanding requests requiring a commit, as well as
+ * the MM page stats.
+ *
+ * The caller must hold the cinfo->lock, and the nfs_page lock.
+ */
+void
+nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst,
+                           struct nfs_commit_info *cinfo)
+{
+       set_bit(PG_CLEAN, &req->wb_flags);
+       nfs_list_add_request(req, dst);
+       cinfo->mds->ncommit++;
+}
+EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked);
+
 /**
  * nfs_request_add_commit_list - add request to a commit list
  * @req: pointer to a struct nfs_page
@@ -784,13 +806,10 @@ void
 nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
                            struct nfs_commit_info *cinfo)
 {
-       set_bit(PG_CLEAN, &(req)->wb_flags);
        spin_lock(cinfo->lock);
-       nfs_list_add_request(req, dst);
-       cinfo->mds->ncommit++;
+       nfs_request_add_commit_list_locked(req, dst, cinfo);
        spin_unlock(cinfo->lock);
-       if (!cinfo->dreq)
-               nfs_mark_page_unstable(req->wb_page);
+       nfs_mark_page_unstable(req->wb_page, cinfo);
 }
 EXPORT_SYMBOL_GPL(nfs_request_add_commit_list);
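A minimal sketch of the locked-variant contract, assuming a caller that already batches other bookkeeping under cinfo->lock (as the pNFS commit path in this series does); illustrative only:

	spin_lock(cinfo->lock);
	/* ... per-bucket accounting done under the same lock ... */
	nfs_request_add_commit_list_locked(req, dst, cinfo);
	spin_unlock(cinfo->lock);
	nfs_mark_page_unstable(req->wb_page, cinfo);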
 
@@ -1793,7 +1812,7 @@ out_mark_dirty:
        return res;
 }
 
-static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc)
+int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
        struct nfs_inode *nfsi = NFS_I(inode);
        int flags = FLUSH_SYNC;
@@ -1828,11 +1847,6 @@ out_mark_dirty:
        __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
        return ret;
 }
-
-int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
-{
-       return nfs_commit_unstable_pages(inode, wbc);
-}
 EXPORT_SYMBOL_GPL(nfs_write_inode);
 
 /*
index 9aa2796da90d9169488a625d5f80e90010971ff4..6d834dc9bbc826bf8b711fb4adfaf58c82ae5c48 100644 (file)
@@ -101,7 +101,7 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
        }
 
        nr_iomaps = be32_to_cpup(p++);
-       expected = sizeof(__be32) + nr_iomaps * NFS4_BLOCK_EXTENT_SIZE;
+       expected = sizeof(__be32) + nr_iomaps * PNFS_BLOCK_EXTENT_SIZE;
        if (len != expected) {
                dprintk("%s: extent array size mismatch: %u/%u\n",
                        __func__, len, expected);
index fdc79037c0e78108dd7d359e2d539fccbeffccff..6de925fe84991d09081dce75db8c8cd4ba12dded 100644 (file)
@@ -7,13 +7,6 @@
 struct iomap;
 struct xdr_stream;
 
-enum pnfs_block_extent_state {
-       PNFS_BLOCK_READWRITE_DATA       = 0,
-       PNFS_BLOCK_READ_DATA            = 1,
-       PNFS_BLOCK_INVALID_DATA         = 2,
-       PNFS_BLOCK_NONE_DATA            = 3,
-};
-
 struct pnfs_block_extent {
        struct nfsd4_deviceid           vol_id;
        u64                             foff;
@@ -21,14 +14,6 @@ struct pnfs_block_extent {
        u64                             soff;
        enum pnfs_block_extent_state    es;
 };
-#define NFS4_BLOCK_EXTENT_SIZE         44
-
-enum pnfs_block_volume_type {
-       PNFS_BLOCK_VOLUME_SIMPLE        = 0,
-       PNFS_BLOCK_VOLUME_SLICE         = 1,
-       PNFS_BLOCK_VOLUME_CONCAT        = 2,
-       PNFS_BLOCK_VOLUME_STRIPE        = 3,
-};
 
 /*
  * Random upper cap for the uuid length to avoid unbounded allocation.
index e4905fbf3396978b6d5f77319df6f3613f9d4ed7..8f20d6016e205d341b82e31025961e8c9f6c6963 100644 (file)
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -142,7 +142,8 @@ static int nsfs_show_path(struct seq_file *seq, struct dentry *dentry)
        struct inode *inode = d_inode(dentry);
        const struct proc_ns_operations *ns_ops = dentry->d_fsdata;
 
-       return seq_printf(seq, "%s:[%lu]", ns_ops->name, inode->i_ino);
+       seq_printf(seq, "%s:[%lu]", ns_ops->name, inode->i_ino);
+       return 0;
 }
 
 static const struct super_operations nsfs_ops = {
index d0e436dc64371713af953cb475ad530e3cd69557..ce12e0b1a31f180371e6ac010cac3e37fdfbad09 100644 (file)
@@ -1776,7 +1776,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
                                     struct dlm_migratable_lockres *mres)
 {
        struct dlm_migratable_lock *ml;
-       struct list_head *queue;
+       struct list_head *queue, *iter;
        struct list_head *tmpq = NULL;
        struct dlm_lock *newlock = NULL;
        struct dlm_lockstatus *lksb = NULL;
@@ -1821,7 +1821,9 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
                        spin_lock(&res->spinlock);
                        for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) {
                                tmpq = dlm_list_idx_to_ptr(res, j);
-                               list_for_each_entry(lock, tmpq, list) {
+                               list_for_each(iter, tmpq) {
+                                       lock = list_entry(iter,
+                                                 struct dlm_lock, list);
                                        if (lock->ml.cookie == ml->cookie)
                                                break;
                                        lock = NULL;
index aa50d1ac28fc6189a9489d1b679fcf86115e633c..b25eee4cead5398b69889c95c29480ba9862c397 100644 (file)
@@ -1230,10 +1230,9 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
                                   size_t count, loff_t *ppos)
 {
        struct inode * inode = file_inode(file);
-       char *page, *tmp;
-       ssize_t length;
        uid_t loginuid;
        kuid_t kloginuid;
+       int rv;
 
        rcu_read_lock();
        if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
@@ -1242,46 +1241,28 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
        }
        rcu_read_unlock();
 
-       if (count >= PAGE_SIZE)
-               count = PAGE_SIZE - 1;
-
        if (*ppos != 0) {
                /* No partial writes. */
                return -EINVAL;
        }
-       page = (char*)__get_free_page(GFP_TEMPORARY);
-       if (!page)
-               return -ENOMEM;
-       length = -EFAULT;
-       if (copy_from_user(page, buf, count))
-               goto out_free_page;
-
-       page[count] = '\0';
-       loginuid = simple_strtoul(page, &tmp, 10);
-       if (tmp == page) {
-               length = -EINVAL;
-               goto out_free_page;
 
-       }
+       rv = kstrtou32_from_user(buf, count, 10, &loginuid);
+       if (rv < 0)
+               return rv;
 
        /* is userspace trying to explicitly UNSET the loginuid? */
        if (loginuid == AUDIT_UID_UNSET) {
                kloginuid = INVALID_UID;
        } else {
                kloginuid = make_kuid(file->f_cred->user_ns, loginuid);
-               if (!uid_valid(kloginuid)) {
-                       length = -EINVAL;
-                       goto out_free_page;
-               }
+               if (!uid_valid(kloginuid))
+                       return -EINVAL;
        }
 
-       length = audit_set_loginuid(kloginuid);
-       if (likely(length == 0))
-               length = count;
-
-out_free_page:
-       free_page((unsigned long) page);
-       return length;
+       rv = audit_set_loginuid(kloginuid);
+       if (rv < 0)
+               return rv;
+       return count;
 }
 
 static const struct file_operations proc_loginuid_operations = {
@@ -1335,8 +1316,9 @@ static ssize_t proc_fault_inject_write(struct file * file,
                        const char __user * buf, size_t count, loff_t *ppos)
 {
        struct task_struct *task;
-       char buffer[PROC_NUMBUF], *end;
+       char buffer[PROC_NUMBUF];
        int make_it_fail;
+       int rv;
 
        if (!capable(CAP_SYS_RESOURCE))
                return -EPERM;
@@ -1345,9 +1327,9 @@ static ssize_t proc_fault_inject_write(struct file * file,
                count = sizeof(buffer) - 1;
        if (copy_from_user(buffer, buf, count))
                return -EFAULT;
-       make_it_fail = simple_strtol(strstrip(buffer), &end, 0);
-       if (*end)
-               return -EINVAL;
+       rv = kstrtoint(strstrip(buffer), 0, &make_it_fail);
+       if (rv < 0)
+               return rv;
        if (make_it_fail < 0 || make_it_fail > 1)
                return -EINVAL;
 
@@ -1836,8 +1818,6 @@ end_instantiate:
        return dir_emit(ctx, name, len, 1, DT_UNKNOWN);
 }
 
-#ifdef CONFIG_CHECKPOINT_RESTORE
-
 /*
  * dname_to_vma_addr - maps a dentry name into two unsigned longs
  * which represent vma start and end addresses.
@@ -1864,11 +1844,6 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
        if (flags & LOOKUP_RCU)
                return -ECHILD;
 
-       if (!capable(CAP_SYS_ADMIN)) {
-               status = -EPERM;
-               goto out_notask;
-       }
-
        inode = d_inode(dentry);
        task = get_proc_task(inode);
        if (!task)
@@ -1957,6 +1932,29 @@ struct map_files_info {
        unsigned char   name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
 };
 
+/*
+ * Only allow CAP_SYS_ADMIN to follow the links, due to concerns about how the
+ * symlinks may be used to bypass permissions on ancestor directories in the
+ * path to the file in question.
+ */
+static const char *
+proc_map_files_follow_link(struct dentry *dentry, void **cookie)
+{
+       if (!capable(CAP_SYS_ADMIN))
+               return ERR_PTR(-EPERM);
+
+       return proc_pid_follow_link(dentry, NULL);
+}
+
+/*
+ * Identical to proc_pid_link_inode_operations except for follow_link()
+ */
+static const struct inode_operations proc_map_files_link_inode_operations = {
+       .readlink       = proc_pid_readlink,
+       .follow_link    = proc_map_files_follow_link,
+       .setattr        = proc_setattr,
+};
+
 static int
 proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
                           struct task_struct *task, const void *ptr)
@@ -1972,7 +1970,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
        ei = PROC_I(inode);
        ei->op.proc_get_link = proc_map_files_get_link;
 
-       inode->i_op = &proc_pid_link_inode_operations;
+       inode->i_op = &proc_map_files_link_inode_operations;
        inode->i_size = 64;
        inode->i_mode = S_IFLNK;
 
@@ -1996,10 +1994,6 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
        int result;
        struct mm_struct *mm;
 
-       result = -EPERM;
-       if (!capable(CAP_SYS_ADMIN))
-               goto out;
-
        result = -ENOENT;
        task = get_proc_task(dir);
        if (!task)
@@ -2053,10 +2047,6 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
        struct map_files_info *p;
        int ret;
 
-       ret = -EPERM;
-       if (!capable(CAP_SYS_ADMIN))
-               goto out;
-
        ret = -ENOENT;
        task = get_proc_task(file_inode(file));
        if (!task)
@@ -2245,7 +2235,6 @@ static const struct file_operations proc_timers_operations = {
        .llseek         = seq_lseek,
        .release        = seq_release_private,
 };
-#endif /* CONFIG_CHECKPOINT_RESTORE */
 
 static int proc_pident_instantiate(struct inode *dir,
        struct dentry *dentry, struct task_struct *task, const void *ptr)
@@ -2481,32 +2470,20 @@ static ssize_t proc_coredump_filter_write(struct file *file,
 {
        struct task_struct *task;
        struct mm_struct *mm;
-       char buffer[PROC_NUMBUF], *end;
        unsigned int val;
        int ret;
        int i;
        unsigned long mask;
 
-       ret = -EFAULT;
-       memset(buffer, 0, sizeof(buffer));
-       if (count > sizeof(buffer) - 1)
-               count = sizeof(buffer) - 1;
-       if (copy_from_user(buffer, buf, count))
-               goto out_no_task;
-
-       ret = -EINVAL;
-       val = (unsigned int)simple_strtoul(buffer, &end, 0);
-       if (*end == '\n')
-               end++;
-       if (end - buffer == 0)
-               goto out_no_task;
+       ret = kstrtouint_from_user(buf, count, 0, &val);
+       if (ret < 0)
+               return ret;
 
        ret = -ESRCH;
        task = get_proc_task(file_inode(file));
        if (!task)
                goto out_no_task;
 
-       ret = end - buffer;
        mm = get_task_mm(task);
        if (!mm)
                goto out_no_mm;
@@ -2522,7 +2499,9 @@ static ssize_t proc_coredump_filter_write(struct file *file,
  out_no_mm:
        put_task_struct(task);
  out_no_task:
-       return ret;
+       if (ret < 0)
+               return ret;
+       return count;
 }
 
 static const struct file_operations proc_coredump_filter_operations = {
@@ -2744,9 +2723,7 @@ static const struct inode_operations proc_task_inode_operations;
 static const struct pid_entry tgid_base_stuff[] = {
        DIR("task",       S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
        DIR("fd",         S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
-#ifdef CONFIG_CHECKPOINT_RESTORE
        DIR("map_files",  S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
-#endif
        DIR("fdinfo",     S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
        DIR("ns",         S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
 #ifdef CONFIG_NET
index e5dee5c3188eb10e94742fbb57bb3b3564fa61bb..ff3ffc76a93795b6662cb80d85f2a404d8f20693 100644 (file)
@@ -26,7 +26,7 @@
 
 #include "internal.h"
 
-static DEFINE_SPINLOCK(proc_subdir_lock);
+static DEFINE_RWLOCK(proc_subdir_lock);
 
 static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de)
 {
@@ -172,9 +172,9 @@ static int xlate_proc_name(const char *name, struct proc_dir_entry **ret,
 {
        int rv;
 
-       spin_lock(&proc_subdir_lock);
+       read_lock(&proc_subdir_lock);
        rv = __xlate_proc_name(name, ret, residual);
-       spin_unlock(&proc_subdir_lock);
+       read_unlock(&proc_subdir_lock);
        return rv;
 }
 
@@ -231,11 +231,11 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
 {
        struct inode *inode;
 
-       spin_lock(&proc_subdir_lock);
+       read_lock(&proc_subdir_lock);
        de = pde_subdir_find(de, dentry->d_name.name, dentry->d_name.len);
        if (de) {
                pde_get(de);
-               spin_unlock(&proc_subdir_lock);
+               read_unlock(&proc_subdir_lock);
                inode = proc_get_inode(dir->i_sb, de);
                if (!inode)
                        return ERR_PTR(-ENOMEM);
@@ -243,7 +243,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
                d_add(dentry, inode);
                return NULL;
        }
-       spin_unlock(&proc_subdir_lock);
+       read_unlock(&proc_subdir_lock);
        return ERR_PTR(-ENOENT);
 }
 
@@ -270,12 +270,12 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
        if (!dir_emit_dots(file, ctx))
                return 0;
 
-       spin_lock(&proc_subdir_lock);
+       read_lock(&proc_subdir_lock);
        de = pde_subdir_first(de);
        i = ctx->pos - 2;
        for (;;) {
                if (!de) {
-                       spin_unlock(&proc_subdir_lock);
+                       read_unlock(&proc_subdir_lock);
                        return 0;
                }
                if (!i)
@@ -287,19 +287,19 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
        do {
                struct proc_dir_entry *next;
                pde_get(de);
-               spin_unlock(&proc_subdir_lock);
+               read_unlock(&proc_subdir_lock);
                if (!dir_emit(ctx, de->name, de->namelen,
                            de->low_ino, de->mode >> 12)) {
                        pde_put(de);
                        return 0;
                }
-               spin_lock(&proc_subdir_lock);
+               read_lock(&proc_subdir_lock);
                ctx->pos++;
                next = pde_subdir_next(de);
                pde_put(de);
                de = next;
        } while (de);
-       spin_unlock(&proc_subdir_lock);
+       read_unlock(&proc_subdir_lock);
        return 1;
 }
 
@@ -338,16 +338,16 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
        if (ret)
                return ret;
 
-       spin_lock(&proc_subdir_lock);
+       write_lock(&proc_subdir_lock);
        dp->parent = dir;
        if (pde_subdir_insert(dir, dp) == false) {
                WARN(1, "proc_dir_entry '%s/%s' already registered\n",
                     dir->name, dp->name);
-               spin_unlock(&proc_subdir_lock);
+               write_unlock(&proc_subdir_lock);
                proc_free_inum(dp->low_ino);
                return -EEXIST;
        }
-       spin_unlock(&proc_subdir_lock);
+       write_unlock(&proc_subdir_lock);
 
        return 0;
 }
@@ -549,9 +549,9 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
        const char *fn = name;
        unsigned int len;
 
-       spin_lock(&proc_subdir_lock);
+       write_lock(&proc_subdir_lock);
        if (__xlate_proc_name(name, &parent, &fn) != 0) {
-               spin_unlock(&proc_subdir_lock);
+               write_unlock(&proc_subdir_lock);
                return;
        }
        len = strlen(fn);
@@ -559,7 +559,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
        de = pde_subdir_find(parent, fn, len);
        if (de)
                rb_erase(&de->subdir_node, &parent->subdir);
-       spin_unlock(&proc_subdir_lock);
+       write_unlock(&proc_subdir_lock);
        if (!de) {
                WARN(1, "name '%s'\n", name);
                return;
@@ -583,16 +583,16 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
        const char *fn = name;
        unsigned int len;
 
-       spin_lock(&proc_subdir_lock);
+       write_lock(&proc_subdir_lock);
        if (__xlate_proc_name(name, &parent, &fn) != 0) {
-               spin_unlock(&proc_subdir_lock);
+               write_unlock(&proc_subdir_lock);
                return -ENOENT;
        }
        len = strlen(fn);
 
        root = pde_subdir_find(parent, fn, len);
        if (!root) {
-               spin_unlock(&proc_subdir_lock);
+               write_unlock(&proc_subdir_lock);
                return -ENOENT;
        }
        rb_erase(&root->subdir_node, &parent->subdir);
@@ -605,7 +605,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
                        de = next;
                        continue;
                }
-               spin_unlock(&proc_subdir_lock);
+               write_unlock(&proc_subdir_lock);
 
                proc_entry_rundown(de);
                next = de->parent;
@@ -616,7 +616,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
                        break;
                pde_put(de);
 
-               spin_lock(&proc_subdir_lock);
+               write_lock(&proc_subdir_lock);
                de = next;
        }
        pde_put(root);
index 7eee2d8b97d9786b7c05ca1078d477c4db6af5e1..93484034a03d04c38cc5ff7779fb95e7611fbd09 100644 (file)
@@ -9,12 +9,16 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/hugetlb.h>
+#include <linux/memcontrol.h>
+#include <linux/mmu_notifier.h>
+#include <linux/page_idle.h>
 #include <linux/kernel-page-flags.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
 #define KPMSIZE sizeof(u64)
 #define KPMMASK (KPMSIZE - 1)
+#define KPMBITS (KPMSIZE * BITS_PER_BYTE)
 
 /* /proc/kpagecount - an array exposing page counts
  *
@@ -54,6 +58,8 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
                pfn++;
                out++;
                count -= KPMSIZE;
+
+               cond_resched();
        }
 
        *ppos += (char __user *)out - buf;
@@ -146,6 +152,9 @@ u64 stable_page_flags(struct page *page)
        if (PageBalloon(page))
                u |= 1 << KPF_BALLOON;
 
+       if (page_is_idle(page))
+               u |= 1 << KPF_IDLE;
+
        u |= kpf_copy_bit(k, KPF_LOCKED,        PG_locked);
 
        u |= kpf_copy_bit(k, KPF_SLAB,          PG_slab);
@@ -212,6 +221,8 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
                pfn++;
                out++;
                count -= KPMSIZE;
+
+               cond_resched();
        }
 
        *ppos += (char __user *)out - buf;
@@ -225,10 +236,64 @@ static const struct file_operations proc_kpageflags_operations = {
        .read = kpageflags_read,
 };
 
+#ifdef CONFIG_MEMCG
+static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
+                               size_t count, loff_t *ppos)
+{
+       u64 __user *out = (u64 __user *)buf;
+       struct page *ppage;
+       unsigned long src = *ppos;
+       unsigned long pfn;
+       ssize_t ret = 0;
+       u64 ino;
+
+       pfn = src / KPMSIZE;
+       count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src);
+       if (src & KPMMASK || count & KPMMASK)
+               return -EINVAL;
+
+       while (count > 0) {
+               if (pfn_valid(pfn))
+                       ppage = pfn_to_page(pfn);
+               else
+                       ppage = NULL;
+
+               if (ppage)
+                       ino = page_cgroup_ino(ppage);
+               else
+                       ino = 0;
+
+               if (put_user(ino, out)) {
+                       ret = -EFAULT;
+                       break;
+               }
+
+               pfn++;
+               out++;
+               count -= KPMSIZE;
+
+               cond_resched();
+       }
+
+       *ppos += (char __user *)out - buf;
+       if (!ret)
+               ret = (char __user *)out - buf;
+       return ret;
+}
+
+static const struct file_operations proc_kpagecgroup_operations = {
+       .llseek = mem_lseek,
+       .read = kpagecgroup_read,
+};
+#endif /* CONFIG_MEMCG */
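A minimal userspace sketch of consuming the new file (assumes CONFIG_MEMCG and sufficient privileges; illustrative, not part of the patch): each 8-byte entry at offset pfn * 8 holds the inode number of the memory cgroup the page is charged to, or 0 if the pfn is invalid or uncharged.

	#include <fcntl.h>
	#include <stdint.h>
	#include <unistd.h>

	static uint64_t kpagecgroup_ino(unsigned long pfn)
	{
		uint64_t ino = 0;
		int fd = open("/proc/kpagecgroup", O_RDONLY);

		if (fd >= 0) {
			/* one u64 entry per pfn, indexed by file offset */
			pread(fd, &ino, sizeof(ino), (off_t)pfn * sizeof(ino));
			close(fd);
		}
		return ino;
	}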
+
 static int __init proc_page_init(void)
 {
        proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations);
        proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations);
+#ifdef CONFIG_MEMCG
+       proc_create("kpagecgroup", S_IRUSR, NULL, &proc_kpagecgroup_operations);
+#endif
        return 0;
 }
 fs_initcall(proc_page_init);
index 3b4d8255e8068dccaa99b158b9d8daab193de656..e2d46adb54b42a76608a0cbf938d55529bd6c851 100644 (file)
@@ -13,6 +13,7 @@
 #include <linux/swap.h>
 #include <linux/swapops.h>
 #include <linux/mmu_notifier.h>
+#include <linux/page_idle.h>
 
 #include <asm/elf.h>
 #include <asm/uaccess.h>
@@ -446,6 +447,7 @@ struct mem_size_stats {
        unsigned long anonymous_thp;
        unsigned long swap;
        u64 pss;
+       u64 swap_pss;
 };
 
 static void smaps_account(struct mem_size_stats *mss, struct page *page,
@@ -458,7 +460,7 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
 
        mss->resident += size;
        /* Accumulate the size in pages that have been accessed. */
-       if (young || PageReferenced(page))
+       if (young || page_is_young(page) || PageReferenced(page))
                mss->referenced += size;
        mapcount = page_mapcount(page);
        if (mapcount >= 2) {
@@ -492,9 +494,20 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
        } else if (is_swap_pte(*pte)) {
                swp_entry_t swpent = pte_to_swp_entry(*pte);
 
-               if (!non_swap_entry(swpent))
+               if (!non_swap_entry(swpent)) {
+                       int mapcount;
+
                        mss->swap += PAGE_SIZE;
-               else if (is_migration_entry(swpent))
+                       mapcount = swp_swapcount(swpent);
+                       if (mapcount >= 2) {
+                               u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT;
+
+                               do_div(pss_delta, mapcount);
+                               mss->swap_pss += pss_delta;
+                       } else {
+                               mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
+                       }
+               } else if (is_migration_entry(swpent))
                        page = migration_entry_to_page(swpent);
        }
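A worked example of the proportional swap accounting above (illustrative numbers): a 4 KiB page that has been swapped out and is referenced by two processes adds (4096 << PSS_SHIFT) / 2 to each process's swap_pss; since smaps prints SwapPss as swap_pss >> (10 + PSS_SHIFT), each process reports 2 kB for that page, mirroring how Pss already splits resident shared pages.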
 
@@ -640,6 +653,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
                   "Anonymous:      %8lu kB\n"
                   "AnonHugePages:  %8lu kB\n"
                   "Swap:           %8lu kB\n"
+                  "SwapPss:        %8lu kB\n"
                   "KernelPageSize: %8lu kB\n"
                   "MMUPageSize:    %8lu kB\n"
                   "Locked:         %8lu kB\n",
@@ -654,6 +668,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
                   mss.anonymous >> 10,
                   mss.anonymous_thp >> 10,
                   mss.swap >> 10,
+                  (unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)),
                   vma_kernel_pagesize(vma) >> 10,
                   vma_mmu_pagesize(vma) >> 10,
                   (vma->vm_flags & VM_LOCKED) ?
@@ -712,23 +727,6 @@ const struct file_operations proc_tid_smaps_operations = {
        .release        = proc_map_release,
 };
 
-/*
- * We do not want to have constant page-shift bits sitting in
- * pagemap entries and are about to reuse them some time soon.
- *
- * Here's the "migration strategy":
- * 1. when the system boots these bits remain what they are,
- *    but a warning about future change is printed in log;
- * 2. once anyone clears soft-dirty bits via clear_refs file,
- *    these flag is set to denote, that user is aware of the
- *    new API and those page-shift bits change their meaning.
- *    The respective warning is printed in dmesg;
- * 3. In a couple of releases we will remove all the mentions
- *    of page-shift in pagemap entries.
- */
-
-static bool soft_dirty_cleared __read_mostly;
-
 enum clear_refs_types {
        CLEAR_REFS_ALL = 1,
        CLEAR_REFS_ANON,
@@ -810,6 +808,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 
                /* Clear accessed and referenced bits. */
                pmdp_test_and_clear_young(vma, addr, pmd);
+               test_and_clear_page_young(page);
                ClearPageReferenced(page);
 out:
                spin_unlock(ptl);
@@ -837,6 +836,7 @@ out:
 
                /* Clear accessed and referenced bits. */
                ptep_test_and_clear_young(vma, addr, pte);
+               test_and_clear_page_young(page);
                ClearPageReferenced(page);
        }
        pte_unmap_unlock(pte - 1, ptl);
@@ -889,13 +889,6 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
        if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
                return -EINVAL;
 
-       if (type == CLEAR_REFS_SOFT_DIRTY) {
-               soft_dirty_cleared = true;
-               pr_warn_once("The pagemap bits 55-60 has changed their meaning!"
-                            " See the linux/Documentation/vm/pagemap.txt for "
-                            "details.\n");
-       }
-
        task = get_proc_task(file_inode(file));
        if (!task)
                return -ESRCH;
@@ -963,36 +956,26 @@ typedef struct {
 struct pagemapread {
        int pos, len;           /* units: PM_ENTRY_BYTES, not bytes */
        pagemap_entry_t *buffer;
-       bool v2;
+       bool show_pfn;
 };
 
 #define PAGEMAP_WALK_SIZE      (PMD_SIZE)
 #define PAGEMAP_WALK_MASK      (PMD_MASK)
 
-#define PM_ENTRY_BYTES      sizeof(pagemap_entry_t)
-#define PM_STATUS_BITS      3
-#define PM_STATUS_OFFSET    (64 - PM_STATUS_BITS)
-#define PM_STATUS_MASK      (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET)
-#define PM_STATUS(nr)       (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK)
-#define PM_PSHIFT_BITS      6
-#define PM_PSHIFT_OFFSET    (PM_STATUS_OFFSET - PM_PSHIFT_BITS)
-#define PM_PSHIFT_MASK      (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
-#define __PM_PSHIFT(x)      (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
-#define PM_PFRAME_MASK      ((1LL << PM_PSHIFT_OFFSET) - 1)
-#define PM_PFRAME(x)        ((x) & PM_PFRAME_MASK)
-/* in "new" pagemap pshift bits are occupied with more status bits */
-#define PM_STATUS2(v2, x)   (__PM_PSHIFT(v2 ? x : PAGE_SHIFT))
-
-#define __PM_SOFT_DIRTY      (1LL)
-#define PM_PRESENT          PM_STATUS(4LL)
-#define PM_SWAP             PM_STATUS(2LL)
-#define PM_FILE             PM_STATUS(1LL)
-#define PM_NOT_PRESENT(v2)  PM_STATUS2(v2, 0)
+#define PM_ENTRY_BYTES         sizeof(pagemap_entry_t)
+#define PM_PFRAME_BITS         55
+#define PM_PFRAME_MASK         GENMASK_ULL(PM_PFRAME_BITS - 1, 0)
+#define PM_SOFT_DIRTY          BIT_ULL(55)
+#define PM_MMAP_EXCLUSIVE      BIT_ULL(56)
+#define PM_FILE                        BIT_ULL(61)
+#define PM_SWAP                        BIT_ULL(62)
+#define PM_PRESENT             BIT_ULL(63)
+
 #define PM_END_OF_BUFFER    1
 
-static inline pagemap_entry_t make_pme(u64 val)
+static inline pagemap_entry_t make_pme(u64 frame, u64 flags)
 {
-       return (pagemap_entry_t) { .pme = val };
+       return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags };
 }
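A quick decode of the new flag layout using make_pme() (illustrative values; the helper and masks are the ones defined above): the PFN occupies bits 0-54 and the status flags bits 55-63.

	/* Illustrative only: encode and decode one entry. */
	pagemap_entry_t pme = make_pme(0x1234, PM_PRESENT | PM_MMAP_EXCLUSIVE);
	u64 pfn = pme.pme & PM_PFRAME_MASK;			/* 0x1234 */
	bool present = !!(pme.pme & PM_PRESENT);		/* bit 63 */
	bool exclusive = !!(pme.pme & PM_MMAP_EXCLUSIVE);	/* bit 56 */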
 
 static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
@@ -1013,7 +996,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
 
        while (addr < end) {
                struct vm_area_struct *vma = find_vma(walk->mm, addr);
-               pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
+               pagemap_entry_t pme = make_pme(0, 0);
                /* End of address space hole, which we mark as non-present. */
                unsigned long hole_end;
 
@@ -1033,7 +1016,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
 
                /* Addresses in the VMA. */
                if (vma->vm_flags & VM_SOFTDIRTY)
-                       pme.pme |= PM_STATUS2(pm->v2, __PM_SOFT_DIRTY);
+                       pme = make_pme(0, PM_SOFT_DIRTY);
                for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
                        err = add_to_pagemap(addr, &pme, pm);
                        if (err)
@@ -1044,67 +1027,42 @@ out:
        return err;
 }
 
-static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
+static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
                struct vm_area_struct *vma, unsigned long addr, pte_t pte)
 {
-       u64 frame, flags;
+       u64 frame = 0, flags = 0;
        struct page *page = NULL;
-       int flags2 = 0;
 
        if (pte_present(pte)) {
-               frame = pte_pfn(pte);
-               flags = PM_PRESENT;
+               if (pm->show_pfn)
+                       frame = pte_pfn(pte);
+               flags |= PM_PRESENT;
                page = vm_normal_page(vma, addr, pte);
                if (pte_soft_dirty(pte))
-                       flags2 |= __PM_SOFT_DIRTY;
+                       flags |= PM_SOFT_DIRTY;
        } else if (is_swap_pte(pte)) {
                swp_entry_t entry;
                if (pte_swp_soft_dirty(pte))
-                       flags2 |= __PM_SOFT_DIRTY;
+                       flags |= PM_SOFT_DIRTY;
                entry = pte_to_swp_entry(pte);
                frame = swp_type(entry) |
                        (swp_offset(entry) << MAX_SWAPFILES_SHIFT);
-               flags = PM_SWAP;
+               flags |= PM_SWAP;
                if (is_migration_entry(entry))
                        page = migration_entry_to_page(entry);
-       } else {
-               if (vma->vm_flags & VM_SOFTDIRTY)
-                       flags2 |= __PM_SOFT_DIRTY;
-               *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2));
-               return;
        }
 
        if (page && !PageAnon(page))
                flags |= PM_FILE;
-       if ((vma->vm_flags & VM_SOFTDIRTY))
-               flags2 |= __PM_SOFT_DIRTY;
-
-       *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags);
-}
+       if (page && page_mapcount(page) == 1)
+               flags |= PM_MMAP_EXCLUSIVE;
+       if (vma->vm_flags & VM_SOFTDIRTY)
+               flags |= PM_SOFT_DIRTY;
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
-               pmd_t pmd, int offset, int pmd_flags2)
-{
-       /*
-        * Currently pmd for thp is always present because thp can not be
-        * swapped-out, migrated, or HWPOISONed (split in such cases instead.)
-        * This if-check is just to prepare for future implementation.
-        */
-       if (pmd_present(pmd))
-               *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset)
-                               | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT);
-       else
-               *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, pmd_flags2));
+       return make_pme(frame, flags);
 }
-#else
-static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
-               pmd_t pmd, int offset, int pmd_flags2)
-{
-}
-#endif
 
-static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
                             struct mm_walk *walk)
 {
        struct vm_area_struct *vma = walk->vma;
@@ -1113,41 +1071,58 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
        pte_t *pte, *orig_pte;
        int err = 0;
 
-       if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
-               int pmd_flags2;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       if (pmd_trans_huge_lock(pmdp, vma, &ptl) == 1) {
+               u64 flags = 0, frame = 0;
+               pmd_t pmd = *pmdp;
 
-               if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd))
-                       pmd_flags2 = __PM_SOFT_DIRTY;
-               else
-                       pmd_flags2 = 0;
+               if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(pmd))
+                       flags |= PM_SOFT_DIRTY;
+
+               /*
+                * Currently pmd for thp is always present because thp
+                * can not be swapped-out, migrated, or HWPOISONed
+                * (split in such cases instead.)
+                * This if-check is just to prepare for future implementation.
+                */
+               if (pmd_present(pmd)) {
+                       struct page *page = pmd_page(pmd);
+
+                       if (page_mapcount(page) == 1)
+                               flags |= PM_MMAP_EXCLUSIVE;
+
+                       flags |= PM_PRESENT;
+                       if (pm->show_pfn)
+                               frame = pmd_pfn(pmd) +
+                                       ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+               }
 
                for (; addr != end; addr += PAGE_SIZE) {
-                       unsigned long offset;
-                       pagemap_entry_t pme;
+                       pagemap_entry_t pme = make_pme(frame, flags);
 
-                       offset = (addr & ~PAGEMAP_WALK_MASK) >>
-                                       PAGE_SHIFT;
-                       thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2);
                        err = add_to_pagemap(addr, &pme, pm);
                        if (err)
                                break;
+                       if (pm->show_pfn && (flags & PM_PRESENT))
+                               frame++;
                }
                spin_unlock(ptl);
                return err;
        }
 
-       if (pmd_trans_unstable(pmd))
+       if (pmd_trans_unstable(pmdp))
                return 0;
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
        /*
         * We can assume that @vma always points to a valid one and @end never
         * goes beyond vma->vm_end.
         */
-       orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+       orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl);
        for (; addr < end; pte++, addr += PAGE_SIZE) {
                pagemap_entry_t pme;
 
-               pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
+               pme = pte_to_pagemap_entry(pm, vma, addr, *pte);
                err = add_to_pagemap(addr, &pme, pm);
                if (err)
                        break;
@@ -1160,40 +1135,44 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 }
 
 #ifdef CONFIG_HUGETLB_PAGE
-static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
-                                       pte_t pte, int offset, int flags2)
-{
-       if (pte_present(pte))
-               *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset)        |
-                               PM_STATUS2(pm->v2, flags2)              |
-                               PM_PRESENT);
-       else
-               *pme = make_pme(PM_NOT_PRESENT(pm->v2)                  |
-                               PM_STATUS2(pm->v2, flags2));
-}
-
 /* This function walks within one hugetlb entry in the single call */
-static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
+static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
                                 unsigned long addr, unsigned long end,
                                 struct mm_walk *walk)
 {
        struct pagemapread *pm = walk->private;
        struct vm_area_struct *vma = walk->vma;
+       u64 flags = 0, frame = 0;
        int err = 0;
-       int flags2;
-       pagemap_entry_t pme;
+       pte_t pte;
 
        if (vma->vm_flags & VM_SOFTDIRTY)
-               flags2 = __PM_SOFT_DIRTY;
-       else
-               flags2 = 0;
+               flags |= PM_SOFT_DIRTY;
+
+       pte = huge_ptep_get(ptep);
+       if (pte_present(pte)) {
+               struct page *page = pte_page(pte);
+
+               if (!PageAnon(page))
+                       flags |= PM_FILE;
+
+               if (page_mapcount(page) == 1)
+                       flags |= PM_MMAP_EXCLUSIVE;
+
+               flags |= PM_PRESENT;
+               if (pm->show_pfn)
+                       frame = pte_pfn(pte) +
+                               ((addr & ~hmask) >> PAGE_SHIFT);
+       }
 
        for (; addr != end; addr += PAGE_SIZE) {
-               int offset = (addr & ~hmask) >> PAGE_SHIFT;
-               huge_pte_to_pagemap_entry(&pme, pm, *pte, offset, flags2);
+               pagemap_entry_t pme = make_pme(frame, flags);
+
                err = add_to_pagemap(addr, &pme, pm);
                if (err)
                        return err;
+               if (pm->show_pfn && (flags & PM_PRESENT))
+                       frame++;
        }
 
        cond_resched();
@@ -1211,7 +1190,9 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
  * Bits 0-54  page frame number (PFN) if present
  * Bits 0-4   swap type if swapped
  * Bits 5-54  swap offset if swapped
- * Bits 55-60 page shift (page size = 1<<page shift)
+ * Bit  55    pte is soft-dirty (see Documentation/vm/soft-dirty.txt)
+ * Bit  56    page exclusively mapped
+ * Bits 57-60 zero
  * Bit  61    page is file-page or shared-anon
  * Bit  62    page swapped
  * Bit  63    page present
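
[Editor's note: for reference, a minimal userspace sketch of decoding one 64-bit entry under the new layout documented above. The struct and function names are illustrative, not kernel identifiers; only the bit positions come from the comment block in this hunk.]

    #include <stdint.h>

    /* Decoded view of one /proc/<pid>/pagemap entry (illustrative). */
    struct pagemap_bits {
            uint64_t pfn;           /* bits 0-54, zero unless present and privileged */
            unsigned soft_dirty;    /* bit 55 */
            unsigned exclusive;     /* bit 56 */
            unsigned file_shared;   /* bit 61: file-page or shared-anon */
            unsigned swapped;       /* bit 62 */
            unsigned present;       /* bit 63 */
    };

    static struct pagemap_bits decode_pagemap(uint64_t e)
    {
            struct pagemap_bits b;

            b.pfn         = e & ((1ULL << 55) - 1);
            b.soft_dirty  = (e >> 55) & 1;
            b.exclusive   = (e >> 56) & 1;
            b.file_shared = (e >> 61) & 1;
            b.swapped     = (e >> 62) & 1;
            b.present     = (e >> 63) & 1;
            return b;
    }
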
@@ -1229,42 +1210,37 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
 static ssize_t pagemap_read(struct file *file, char __user *buf,
                            size_t count, loff_t *ppos)
 {
-       struct task_struct *task = get_proc_task(file_inode(file));
-       struct mm_struct *mm;
+       struct mm_struct *mm = file->private_data;
        struct pagemapread pm;
-       int ret = -ESRCH;
        struct mm_walk pagemap_walk = {};
        unsigned long src;
        unsigned long svpfn;
        unsigned long start_vaddr;
        unsigned long end_vaddr;
-       int copied = 0;
+       int ret = 0, copied = 0;
 
-       if (!task)
+       if (!mm || !atomic_inc_not_zero(&mm->mm_users))
                goto out;
 
        ret = -EINVAL;
        /* file position must be aligned */
        if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES))
-               goto out_task;
+               goto out_mm;
 
        ret = 0;
        if (!count)
-               goto out_task;
+               goto out_mm;
+
+       /* do not disclose physical addresses: attack vector */
+       pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN);
 
-       pm.v2 = soft_dirty_cleared;
        pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
        pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY);
        ret = -ENOMEM;
        if (!pm.buffer)
-               goto out_task;
-
-       mm = mm_access(task, PTRACE_MODE_READ);
-       ret = PTR_ERR(mm);
-       if (!mm || IS_ERR(mm))
-               goto out_free;
+               goto out_mm;
 
-       pagemap_walk.pmd_entry = pagemap_pte_range;
+       pagemap_walk.pmd_entry = pagemap_pmd_range;
        pagemap_walk.pte_hole = pagemap_pte_hole;
 #ifdef CONFIG_HUGETLB_PAGE
        pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
@@ -1275,10 +1251,10 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
        src = *ppos;
        svpfn = src / PM_ENTRY_BYTES;
        start_vaddr = svpfn << PAGE_SHIFT;
-       end_vaddr = TASK_SIZE_OF(task);
+       end_vaddr = mm->task_size;
 
        /* watch out for wraparound */
-       if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT)
+       if (svpfn > mm->task_size >> PAGE_SHIFT)
                start_vaddr = end_vaddr;
 
        /*
@@ -1305,7 +1281,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
                len = min(count, PM_ENTRY_BYTES * pm.pos);
                if (copy_to_user(buf, pm.buffer, len)) {
                        ret = -EFAULT;
-                       goto out_mm;
+                       goto out_free;
                }
                copied += len;
                buf += len;
@@ -1315,24 +1291,31 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
        if (!ret || ret == PM_END_OF_BUFFER)
                ret = copied;
 
-out_mm:
-       mmput(mm);
 out_free:
        kfree(pm.buffer);
-out_task:
-       put_task_struct(task);
+out_mm:
+       mmput(mm);
 out:
        return ret;
 }
 
 static int pagemap_open(struct inode *inode, struct file *file)
 {
-       /* do not disclose physical addresses: attack vector */
-       if (!capable(CAP_SYS_ADMIN))
-               return -EPERM;
-       pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about "
-                       "to stop being page-shift some time soon. See the "
-                       "linux/Documentation/vm/pagemap.txt for details.\n");
+       struct mm_struct *mm;
+
+       mm = proc_mem_open(inode, PTRACE_MODE_READ);
+       if (IS_ERR(mm))
+               return PTR_ERR(mm);
+       file->private_data = mm;
+       return 0;
+}
+
+static int pagemap_release(struct inode *inode, struct file *file)
+{
+       struct mm_struct *mm = file->private_data;
+
+       if (mm)
+               mmdrop(mm);
        return 0;
 }
 
@@ -1340,6 +1323,7 @@ const struct file_operations proc_pagemap_operations = {
        .llseek         = mem_lseek, /* borrow this */
        .read           = pagemap_read,
        .open           = pagemap_open,
+       .release        = pagemap_release,
 };
 #endif /* CONFIG_PROC_PAGE_MONITOR */
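
[Editor's note: a hedged usage sketch of the reworked open/read model above: opening no longer requires CAP_SYS_ADMIN, but PFN bits read back as zero for unprivileged readers. It assumes 8-byte entries and the PM_ENTRY_BYTES file-offset alignment enforced in pagemap_read(); the helper name is made up.]

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/types.h>
    #include <unistd.h>

    /* Read the pagemap entry describing the page that contains vaddr. */
    static int read_pagemap_entry(pid_t pid, uintptr_t vaddr, uint64_t *entry)
    {
            char path[64];
            long page_size = sysconf(_SC_PAGESIZE);
            /* one 8-byte entry per page, so the offset stays entry-aligned */
            off_t offset = (off_t)(vaddr / page_size) * sizeof(*entry);
            int fd, ret = -1;

            snprintf(path, sizeof(path), "/proc/%d/pagemap", (int)pid);
            fd = open(path, O_RDONLY);
            if (fd < 0)
                    return -1;
            if (pread(fd, entry, sizeof(*entry), offset) == (ssize_t)sizeof(*entry))
                    ret = 0;
            close(fd);
            return ret;
    }
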
 
index ce9e39fd5dafc768c27b2ceaa4e69a02c3ed1e6e..225586e141cac6e21a35b75e74201355f7f3f6d1 100644 (file)
@@ -12,6 +12,7 @@
 #include <linux/slab.h>
 #include <linux/cred.h>
 #include <linux/mm.h>
+#include <linux/printk.h>
 
 #include <asm/uaccess.h>
 #include <asm/page.h>
@@ -371,16 +372,16 @@ EXPORT_SYMBOL(seq_release);
  *     @esc:   set of characters that need escaping
  *
  *     Puts string into buffer, replacing each occurrence of character from
- *     @esc with usual octal escape.  Returns 0 in case of success, -1 - in
- *     case of overflow.
+ *     @esc with usual octal escape.
+ *     Use seq_has_overflowed() to check for errors.
  */
-int seq_escape(struct seq_file *m, const char *s, const char *esc)
+void seq_escape(struct seq_file *m, const char *s, const char *esc)
 {
        char *end = m->buf + m->size;
-        char *p;
+       char *p;
        char c;
 
-        for (p = m->buf + m->count; (c = *s) != '\0' && p < end; s++) {
+       for (p = m->buf + m->count; (c = *s) != '\0' && p < end; s++) {
                if (!strchr(esc, c)) {
                        *p++ = c;
                        continue;
@@ -393,14 +394,13 @@ int seq_escape(struct seq_file *m, const char *s, const char *esc)
                        continue;
                }
                seq_set_overflow(m);
-               return -1;
-        }
+               return;
+       }
        m->count = p - m->buf;
-        return 0;
 }
 EXPORT_SYMBOL(seq_escape);
 
-int seq_vprintf(struct seq_file *m, const char *f, va_list args)
+void seq_vprintf(struct seq_file *m, const char *f, va_list args)
 {
        int len;
 
@@ -408,24 +408,20 @@ int seq_vprintf(struct seq_file *m, const char *f, va_list args)
                len = vsnprintf(m->buf + m->count, m->size - m->count, f, args);
                if (m->count + len < m->size) {
                        m->count += len;
-                       return 0;
+                       return;
                }
        }
        seq_set_overflow(m);
-       return -1;
 }
 EXPORT_SYMBOL(seq_vprintf);
 
-int seq_printf(struct seq_file *m, const char *f, ...)
+void seq_printf(struct seq_file *m, const char *f, ...)
 {
-       int ret;
        va_list args;
 
        va_start(args, f);
-       ret = seq_vprintf(m, f, args);
+       seq_vprintf(m, f, args);
        va_end(args);
-
-       return ret;
 }
 EXPORT_SYMBOL(seq_printf);
 
@@ -663,26 +659,25 @@ int seq_open_private(struct file *filp, const struct seq_operations *ops,
 }
 EXPORT_SYMBOL(seq_open_private);
 
-int seq_putc(struct seq_file *m, char c)
+void seq_putc(struct seq_file *m, char c)
 {
-       if (m->count < m->size) {
-               m->buf[m->count++] = c;
-               return 0;
-       }
-       return -1;
+       if (m->count >= m->size)
+               return;
+
+       m->buf[m->count++] = c;
 }
 EXPORT_SYMBOL(seq_putc);
 
-int seq_puts(struct seq_file *m, const char *s)
+void seq_puts(struct seq_file *m, const char *s)
 {
        int len = strlen(s);
-       if (m->count + len < m->size) {
-               memcpy(m->buf + m->count, s, len);
-               m->count += len;
-               return 0;
+
+       if (m->count + len >= m->size) {
+               seq_set_overflow(m);
+               return;
        }
-       seq_set_overflow(m);
-       return -1;
+       memcpy(m->buf + m->count, s, len);
+       m->count += len;
 }
 EXPORT_SYMBOL(seq_puts);
 
@@ -693,8 +688,8 @@ EXPORT_SYMBOL(seq_puts);
  * This routine is very quick when you show lots of numbers.
  * In usual cases, it will be better to use seq_printf(). It's easier to read.
  */
-int seq_put_decimal_ull(struct seq_file *m, char delimiter,
-                       unsigned long long num)
+void seq_put_decimal_ull(struct seq_file *m, char delimiter,
+                        unsigned long long num)
 {
        int len;
 
@@ -706,35 +701,33 @@ int seq_put_decimal_ull(struct seq_file *m, char delimiter,
 
        if (num < 10) {
                m->buf[m->count++] = num + '0';
-               return 0;
+               return;
        }
 
        len = num_to_str(m->buf + m->count, m->size - m->count, num);
        if (!len)
                goto overflow;
        m->count += len;
-       return 0;
+       return;
+
 overflow:
        seq_set_overflow(m);
-       return -1;
 }
 EXPORT_SYMBOL(seq_put_decimal_ull);
 
-int seq_put_decimal_ll(struct seq_file *m, char delimiter,
-                       long long num)
+void seq_put_decimal_ll(struct seq_file *m, char delimiter, long long num)
 {
        if (num < 0) {
                if (m->count + 3 >= m->size) {
                        seq_set_overflow(m);
-                       return -1;
+                       return;
                }
                if (delimiter)
                        m->buf[m->count++] = delimiter;
                num = -num;
                delimiter = '-';
        }
-       return seq_put_decimal_ull(m, delimiter, num);
-
+       seq_put_decimal_ull(m, delimiter, num);
 }
 EXPORT_SYMBOL(seq_put_decimal_ll);
 
@@ -773,6 +766,47 @@ void seq_pad(struct seq_file *m, char c)
 }
 EXPORT_SYMBOL(seq_pad);
 
+/* A complete analogue of print_hex_dump() */
+void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type,
+                 int rowsize, int groupsize, const void *buf, size_t len,
+                 bool ascii)
+{
+       const u8 *ptr = buf;
+       int i, linelen, remaining = len;
+       int ret;
+
+       if (rowsize != 16 && rowsize != 32)
+               rowsize = 16;
+
+       for (i = 0; i < len && !seq_has_overflowed(m); i += rowsize) {
+               linelen = min(remaining, rowsize);
+               remaining -= rowsize;
+
+               switch (prefix_type) {
+               case DUMP_PREFIX_ADDRESS:
+                       seq_printf(m, "%s%p: ", prefix_str, ptr + i);
+                       break;
+               case DUMP_PREFIX_OFFSET:
+                       seq_printf(m, "%s%.8x: ", prefix_str, i);
+                       break;
+               default:
+                       seq_printf(m, "%s", prefix_str);
+                       break;
+               }
+
+               ret = hex_dump_to_buffer(ptr + i, linelen, rowsize, groupsize,
+                                        m->buf + m->count, m->size - m->count,
+                                        ascii);
+               if (ret >= m->size - m->count) {
+                       seq_set_overflow(m);
+               } else {
+                       m->count += ret;
+                       seq_putc(m, '\n');
+               }
+       }
+}
+EXPORT_SYMBOL(seq_hex_dump);
+
 struct list_head *seq_list_start(struct list_head *head, loff_t pos)
 {
        struct list_head *lh;
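
[Editor's note: with the seq_* helpers above now returning void, a show() callback no longer plumbs per-call return codes; the seq_file core simply re-runs it with a larger buffer on overflow, and seq_has_overflowed() remains available where a caller really needs to know. A minimal sketch, assuming the declarations land in <linux/seq_file.h>; struct foo is a hypothetical driver state, not from this patch.]

    #include <linux/seq_file.h>
    #include <linux/printk.h>       /* DUMP_PREFIX_OFFSET */

    /* Hypothetical driver state, for illustration only. */
    struct foo {
            const char *name;
            const void *buf;
            size_t len;
    };

    static int foo_show(struct seq_file *m, void *v)
    {
            struct foo *f = m->private;

            seq_printf(m, "name: %s\n", f->name);
            seq_puts(m, "raw:\n");
            seq_hex_dump(m, "  ", DUMP_PREFIX_OFFSET, 16, 1, f->buf, f->len, true);
            seq_putc(m, '\n');

            /* No per-call error plumbing; the core retries with a bigger buffer. */
            return 0;
    }
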
index fb8b54eb77c5dcf89ef4a78d0c4dc88bc1d46abb..dc5fae601c24b40e432907dd93c1da6361453431 100644 (file)
@@ -417,14 +417,14 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
        if (oldcount == 0) {
                result = ufs_alloc_fragments (inode, cgno, goal, count, err);
                if (result) {
+                       ufs_clear_frags(inode, result + oldcount,
+                                       newcount - oldcount, locked_page != NULL);
                        write_seqlock(&UFS_I(inode)->meta_lock);
                        ufs_cpu_to_data_ptr(sb, p, result);
                        write_sequnlock(&UFS_I(inode)->meta_lock);
                        *err = 0;
                        UFS_I(inode)->i_lastfrag =
                                max(UFS_I(inode)->i_lastfrag, fragment + count);
-                       ufs_clear_frags(inode, result + oldcount,
-                                       newcount - oldcount, locked_page != NULL);
                }
                mutex_unlock(&UFS_SB(sb)->s_lock);
                UFSD("EXIT, result %llu\n", (unsigned long long)result);
index 634e676072cb738467b61064beaa5fa4b98cf672..f9aeb40a7197475ec52ba414852ff356129b0aab 100644 (file)
@@ -1287,8 +1287,10 @@ static struct file *userfaultfd_file_create(int flags)
 
        file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, ctx,
                                  O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS));
-       if (IS_ERR(file))
+       if (IS_ERR(file)) {
+               mmput(ctx->mm);
                kmem_cache_free(userfaultfd_ctx_cachep, ctx);
+       }
 out:
        return file;
 }
index df6828570e874ae423c48309f51a91d16ce949fd..a096841bd06ca1f424249c21d5b21a908de739cc 100644 (file)
@@ -33,6 +33,7 @@ xfs-y                         += $(addprefix libxfs/, \
                                   xfs_attr.o \
                                   xfs_attr_leaf.o \
                                   xfs_attr_remote.o \
+                                  xfs_bit.o \
                                   xfs_bmap.o \
                                   xfs_bmap_btree.o \
                                   xfs_btree.o \
@@ -63,7 +64,6 @@ xfs-$(CONFIG_XFS_RT)          += $(addprefix libxfs/, \
 xfs-y                          += xfs_aops.o \
                                   xfs_attr_inactive.o \
                                   xfs_attr_list.o \
-                                  xfs_bit.o \
                                   xfs_bmap_util.o \
                                   xfs_buf.o \
                                   xfs_dir2_readdir.o \
index f9e9ffe6fb46f88691e337d56d73928bc3d6f5cb..ffad7f20342f6e328b8daafa918ffeb1ae076ab1 100644 (file)
@@ -464,7 +464,7 @@ xfs_agfl_verify(
        struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp);
        int             i;
 
-       if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_uuid))
+       if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
                return false;
        if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC)
                return false;
@@ -1937,7 +1937,7 @@ xfs_alloc_fix_freelist(
        struct xfs_alloc_arg    targs;  /* local allocation arguments */
        xfs_agblock_t           bno;    /* freelist block */
        xfs_extlen_t            need;   /* total blocks needed in freelist */
-       int                     error;
+       int                     error = 0;
 
        if (!pag->pagf_init) {
                error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp);
@@ -2260,7 +2260,7 @@ xfs_agf_verify(
        struct xfs_agf  *agf = XFS_BUF_TO_AGF(bp);
 
        if (xfs_sb_version_hascrc(&mp->m_sb) &&
-           !uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_uuid))
+           !uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid))
                        return false;
 
        if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
index 59d521c09a17b2310a5908324a9291cb247d95b1..90de071dd4c2cf2afbd7a7bde5658f1367b21bd1 100644 (file)
@@ -295,7 +295,7 @@ xfs_allocbt_verify(
        case cpu_to_be32(XFS_ABTB_CRC_MAGIC):
                if (!xfs_sb_version_hascrc(&mp->m_sb))
                        return false;
-               if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
+               if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
                        return false;
                if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
                        return false;
@@ -313,7 +313,7 @@ xfs_allocbt_verify(
        case cpu_to_be32(XFS_ABTC_CRC_MAGIC):
                if (!xfs_sb_version_hascrc(&mp->m_sb))
                        return false;
-               if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
+               if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
                        return false;
                if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
                        return false;
index 3349c9a1e84569201cb5df93ae39dd1941f72209..ff065578969f27cd40065043ebfe292c5743ab4e 100644 (file)
@@ -139,6 +139,8 @@ xfs_attr_get(
 
        args.value = value;
        args.valuelen = *valuelenp;
+       /* Entirely possible to look up a name which doesn't exist */
+       args.op_flags = XFS_DA_OP_OKNOENT;
 
        lock_mode = xfs_ilock_attr_map_shared(ip);
        if (!xfs_inode_hasattr(ip))
index e9d401ce93bb19d822a2ec9b475dae7ad5d279c1..33df52d97ec77bfeb07e4eb8bf45c33d7033a15c 100644 (file)
@@ -262,7 +262,7 @@ xfs_attr3_leaf_verify(
                if (ichdr.magic != XFS_ATTR3_LEAF_MAGIC)
                        return false;
 
-               if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid))
+               if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid))
                        return false;
                if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
                        return false;
@@ -1056,7 +1056,7 @@ xfs_attr3_leaf_create(
 
                hdr3->blkno = cpu_to_be64(bp->b_bn);
                hdr3->owner = cpu_to_be64(dp->i_ino);
-               uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
+               uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid);
 
                ichdr.freemap[0].base = sizeof(struct xfs_attr3_leaf_hdr);
        } else {
index dd714037c322d0009d8df40e73bf5adf3aa892d4..f38f9bd81557062a69e3381cb90e9af50297e9ed 100644 (file)
@@ -100,7 +100,7 @@ xfs_attr3_rmt_verify(
                return false;
        if (rmt->rm_magic != cpu_to_be32(XFS_ATTR3_RMT_MAGIC))
                return false;
-       if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_uuid))
+       if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_meta_uuid))
                return false;
        if (be64_to_cpu(rmt->rm_blkno) != bno)
                return false;
@@ -222,7 +222,7 @@ xfs_attr3_rmt_hdr_set(
        rmt->rm_magic = cpu_to_be32(XFS_ATTR3_RMT_MAGIC);
        rmt->rm_offset = cpu_to_be32(offset);
        rmt->rm_bytes = cpu_to_be32(size);
-       uuid_copy(&rmt->rm_uuid, &mp->m_sb.sb_uuid);
+       uuid_copy(&rmt->rm_uuid, &mp->m_sb.sb_meta_uuid);
        rmt->rm_owner = cpu_to_be64(ino);
        rmt->rm_blkno = cpu_to_be64(bno);
 
@@ -618,9 +618,8 @@ xfs_attr_rmtval_remove(
 
                xfs_bmap_init(args->flist, args->firstblock);
                error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
-                                   XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
-                                   1, args->firstblock, args->flist,
-                                   &done);
+                                   XFS_BMAPI_ATTRFORK, 1, args->firstblock,
+                                   args->flist, &done);
                if (!error) {
                        error = xfs_bmap_finish(&args->trans, args->flist,
                                                &committed);
diff --git a/fs/xfs/libxfs/xfs_bit.c b/fs/xfs/libxfs/xfs_bit.c
new file mode 100644 (file)
index 0000000..0e8885a
--- /dev/null
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_log_format.h"
+#include "xfs_bit.h"
+
+/*
+ * XFS bit manipulation routines, used in non-realtime code.
+ */
+
+/*
+ * Return whether bitmap is empty.
+ * Size is number of words in the bitmap, which is padded to word boundary
+ * Returns 1 for empty, 0 for non-empty.
+ */
+int
+xfs_bitmap_empty(uint *map, uint size)
+{
+       uint i;
+       uint ret = 0;
+
+       for (i = 0; i < size; i++) {
+               ret |= map[i];
+       }
+
+       return (ret == 0);
+}
+
+/*
+ * Count the number of contiguous bits set in the bitmap starting with bit
+ * start_bit.  Size is the size of the bitmap in words.
+ */
+int
+xfs_contig_bits(uint *map, uint        size, uint start_bit)
+{
+       uint * p = ((unsigned int *) map) + (start_bit >> BIT_TO_WORD_SHIFT);
+       uint result = 0;
+       uint tmp;
+
+       size <<= BIT_TO_WORD_SHIFT;
+
+       ASSERT(start_bit < size);
+       size -= start_bit & ~(NBWORD - 1);
+       start_bit &= (NBWORD - 1);
+       if (start_bit) {
+               tmp = *p++;
+               /* set to one first offset bits prior to start */
+               tmp |= (~0U >> (NBWORD-start_bit));
+               if (tmp != ~0U)
+                       goto found;
+               result += NBWORD;
+               size -= NBWORD;
+       }
+       while (size) {
+               if ((tmp = *p++) != ~0U)
+                       goto found;
+               result += NBWORD;
+               size -= NBWORD;
+       }
+       return result - start_bit;
+found:
+       return result + ffz(tmp) - start_bit;
+}
+
+/*
+ * This takes the bit number to start looking from and
+ * returns the next set bit from there.  It returns -1
+ * if there are no more bits set or the start bit is
+ * beyond the end of the bitmap.
+ *
+ * Size is the number of words, not bytes, in the bitmap.
+ */
+int xfs_next_bit(uint *map, uint size, uint start_bit)
+{
+       uint * p = ((unsigned int *) map) + (start_bit >> BIT_TO_WORD_SHIFT);
+       uint result = start_bit & ~(NBWORD - 1);
+       uint tmp;
+
+       size <<= BIT_TO_WORD_SHIFT;
+
+       if (start_bit >= size)
+               return -1;
+       size -= result;
+       start_bit &= (NBWORD - 1);
+       if (start_bit) {
+               tmp = *p++;
+               /* set to zero first offset bits prior to start */
+               tmp &= (~0U << start_bit);
+               if (tmp != 0U)
+                       goto found;
+               result += NBWORD;
+               size -= NBWORD;
+       }
+       while (size) {
+               if ((tmp = *p++) != 0U)
+                       goto found;
+               result += NBWORD;
+               size -= NBWORD;
+       }
+       return -1;
+found:
+       return result + ffs(tmp) - 1;
+}
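
[Editor's note: an illustrative loop over the runs of set bits in such a bitmap, using only the semantics documented above (xfs_next_bit() returns the next set bit or -1, xfs_contig_bits() returns the run length from a given bit; sizes are in words, using the kernel's uint typedef). This walker is a sketch, not taken from the tree.]

    /* Visit every contiguous run of set bits in a bitmap of 'nwords' words. */
    static void walk_set_runs(uint *map, uint nwords,
                              void (*fn)(uint first_bit, uint nbits))
    {
            int bit = xfs_next_bit(map, nwords, 0);

            while (bit >= 0) {
                    uint len = xfs_contig_bits(map, nwords, bit);

                    fn(bit, len);
                    bit = xfs_next_bit(map, nwords, bit + len);
            }
    }
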
index 63e05b663380d8e8997f8013d76de1e7d45572b1..8e2010d53b079556de461d6a1be1f9cbbc7e2924 100644 (file)
@@ -5945,6 +5945,7 @@ xfs_bmap_split_extent(
        return xfs_trans_commit(tp);
 
 out:
+       xfs_bmap_cancel(&free_list);
        xfs_trans_cancel(tp);
        return error;
 }
index 2c44c8e507827cba31851e1bb8248b6293da5bc6..6b0cf6546a82f7752bc8a7121d8741f3257269c7 100644 (file)
@@ -349,7 +349,8 @@ xfs_bmbt_to_bmdr(
 
        if (xfs_sb_version_hascrc(&mp->m_sb)) {
                ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_CRC_MAGIC));
-               ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid));
+               ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid,
+                      &mp->m_sb.sb_meta_uuid));
                ASSERT(rblock->bb_u.l.bb_blkno ==
                       cpu_to_be64(XFS_BUF_DADDR_NULL));
        } else
@@ -647,7 +648,7 @@ xfs_bmbt_verify(
        case cpu_to_be32(XFS_BMAP_CRC_MAGIC):
                if (!xfs_sb_version_hascrc(&mp->m_sb))
                        return false;
-               if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid))
+               if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
                        return false;
                if (be64_to_cpu(block->bb_u.l.bb_blkno) != bp->b_bn)
                        return false;
index c72283dd8d44b6327e420355864ea8571068c9f4..f7d7ee7a26072587262dfd27509ba8f502214cc7 100644 (file)
@@ -65,7 +65,8 @@ xfs_btree_check_lblock(
 
        if (xfs_sb_version_hascrc(&mp->m_sb)) {
                lblock_ok = lblock_ok &&
-                       uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid) &&
+                       uuid_equal(&block->bb_u.l.bb_uuid,
+                                  &mp->m_sb.sb_meta_uuid) &&
                        block->bb_u.l.bb_blkno == cpu_to_be64(
                                bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
        }
@@ -115,7 +116,8 @@ xfs_btree_check_sblock(
 
        if (xfs_sb_version_hascrc(&mp->m_sb)) {
                sblock_ok = sblock_ok &&
-                       uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid) &&
+                       uuid_equal(&block->bb_u.s.bb_uuid,
+                                  &mp->m_sb.sb_meta_uuid) &&
                        block->bb_u.s.bb_blkno == cpu_to_be64(
                                bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
        }
@@ -1000,7 +1002,7 @@ xfs_btree_init_block_int(
                if (flags & XFS_BTREE_CRC_BLOCKS) {
                        buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
                        buf->bb_u.l.bb_owner = cpu_to_be64(owner);
-                       uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid);
+                       uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid);
                        buf->bb_u.l.bb_pad = 0;
                        buf->bb_u.l.bb_lsn = 0;
                }
@@ -1013,7 +1015,7 @@ xfs_btree_init_block_int(
                if (flags & XFS_BTREE_CRC_BLOCKS) {
                        buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
                        buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
-                       uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid);
+                       uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid);
                        buf->bb_u.s.bb_lsn = 0;
                }
        }
index 2385f8cd08ab9f1cdf19ad3a9a8473a5e4e34eb3..be43248a5822844f642007ad40bd1e00c808d54e 100644 (file)
@@ -146,7 +146,7 @@ xfs_da3_node_verify(
                if (ichdr.magic != XFS_DA3_NODE_MAGIC)
                        return false;
 
-               if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid))
+               if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid))
                        return false;
                if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
                        return false;
@@ -233,6 +233,7 @@ xfs_da3_node_read_verify(
                        bp->b_ops->verify_read(bp);
                        return;
                default:
+                       xfs_buf_ioerror(bp, -EFSCORRUPTED);
                        break;
        }
 
@@ -324,7 +325,7 @@ xfs_da3_node_create(
                ichdr.magic = XFS_DA3_NODE_MAGIC;
                hdr3->info.blkno = cpu_to_be64(bp->b_bn);
                hdr3->info.owner = cpu_to_be64(args->dp->i_ino);
-               uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_uuid);
+               uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid);
        } else {
                ichdr.magic = XFS_DA_NODE_MAGIC;
        }
@@ -1822,6 +1823,7 @@ xfs_da3_path_shift(
        struct xfs_da_args      *args;
        struct xfs_da_node_entry *btree;
        struct xfs_da3_icnode_hdr nodehdr;
+       struct xfs_buf          *bp;
        xfs_dablk_t             blkno = 0;
        int                     level;
        int                     error;
@@ -1866,20 +1868,24 @@ xfs_da3_path_shift(
         */
        for (blk++, level++; level < path->active; blk++, level++) {
                /*
-                * Release the old block.
-                * (if it's dirty, trans won't actually let go)
+                * Read the next child block into a local buffer.
                 */
-               if (release)
-                       xfs_trans_brelse(args->trans, blk->bp);
+               error = xfs_da3_node_read(args->trans, dp, blkno, -1, &bp,
+                                         args->whichfork);
+               if (error)
+                       return error;
 
                /*
-                * Read the next child block.
+                * Release the old block (if it's dirty, the trans doesn't
+                * actually let go) and swap the local buffer into the path
+                * structure. This ensures failure of the above read doesn't set
+                * a NULL buffer in an active slot in the path.
                 */
+               if (release)
+                       xfs_trans_brelse(args->trans, blk->bp);
                blk->blkno = blkno;
-               error = xfs_da3_node_read(args->trans, dp, blkno, -1,
-                                       &blk->bp, args->whichfork);
-               if (error)
-                       return error;
+               blk->bp = bp;
+
                info = blk->bp->b_addr;
                ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
                       info->magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) ||
@@ -2351,8 +2357,8 @@ xfs_da_shrink_inode(
                 * the last block to the place we want to kill.
                 */
                error = xfs_bunmapi(tp, dp, dead_blkno, count,
-                                   xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
-                                   0, args->firstblock, args->flist, &done);
+                                   xfs_bmapi_aflag(w), 0, args->firstblock,
+                                   args->flist, &done);
                if (error == -ENOSPC) {
                        if (w != XFS_DATA_FORK)
                                break;
index 74bcbabfa52329bda4d932daeab73ae624bde7f1..b14bbd6bb05fad090571bcada4e4867e35040b87 100644 (file)
@@ -680,8 +680,15 @@ typedef struct xfs_attr_leaf_name_remote {
 typedef struct xfs_attr_leafblock {
        xfs_attr_leaf_hdr_t     hdr;    /* constant-structure header block */
        xfs_attr_leaf_entry_t   entries[1];     /* sorted on key, not name */
-       xfs_attr_leaf_name_local_t namelist;    /* grows from bottom of buf */
-       xfs_attr_leaf_name_remote_t valuelist;  /* grows from bottom of buf */
+       /*
+        * The rest of the block contains the following structures after the
+        * leaf entries, growing from the bottom up. The variables are never
+        * referenced and defining them can actually make gcc optimize away
+        * accesses to the 'entries' array above index 0 so don't do that.
+        *
+        * xfs_attr_leaf_name_local_t namelist;
+        * xfs_attr_leaf_name_remote_t valuelist;
+        */
 } xfs_attr_leafblock_t;
 
 /*
index a69fb3a1e16182ac9bd636fa9abb42150444cacb..9de401d297e5bf1972922849d221c4495edd86be 100644 (file)
@@ -362,6 +362,7 @@ xfs_dir_lookup(
        struct xfs_da_args *args;
        int             rval;
        int             v;              /* type-checking value */
+       int             lock_mode;
 
        ASSERT(S_ISDIR(dp->i_d.di_mode));
        XFS_STATS_INC(xs_dir_lookup);
@@ -387,6 +388,7 @@ xfs_dir_lookup(
        if (ci_name)
                args->op_flags |= XFS_DA_OP_CILOOKUP;
 
+       lock_mode = xfs_ilock_data_map_shared(dp);
        if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
                rval = xfs_dir2_sf_lookup(args);
                goto out_check_rval;
@@ -419,6 +421,7 @@ out_check_rval:
                }
        }
 out_free:
+       xfs_iunlock(dp, lock_mode);
        kmem_free(args);
        return rval;
 }
@@ -674,25 +677,22 @@ xfs_dir2_shrink_inode(
        mp = dp->i_mount;
        tp = args->trans;
        da = xfs_dir2_db_to_da(args->geo, db);
-       /*
-        * Unmap the fsblock(s).
-        */
-       if ((error = xfs_bunmapi(tp, dp, da, args->geo->fsbcount,
-                       XFS_BMAPI_METADATA, 0, args->firstblock, args->flist,
-                       &done))) {
+
+       /* Unmap the fsblock(s). */
+       error = xfs_bunmapi(tp, dp, da, args->geo->fsbcount, 0, 0,
+                           args->firstblock, args->flist, &done);
+       if (error) {
                /*
-                * ENOSPC actually can happen if we're in a removename with
-                * no space reservation, and the resulting block removal
-                * would cause a bmap btree split or conversion from extents
-                * to btree.  This can only happen for un-fragmented
-                * directory blocks, since you need to be punching out
-                * the middle of an extent.
-                * In this case we need to leave the block in the file,
-                * and not binval it.
-                * So the block has to be in a consistent empty state
-                * and appropriately logged.
-                * We don't free up the buffer, the caller can tell it
-                * hasn't happened since it got an error back.
+                * ENOSPC actually can happen if we're in a removename with no
+                * space reservation, and the resulting block removal would
+                * cause a bmap btree split or conversion from extents to btree.
+                * This can only happen for un-fragmented directory blocks,
+                * since you need to be punching out the middle of an extent.
+                * In this case we need to leave the block in the file, and not
+                * binval it.  So the block has to be in a consistent empty
+                * state and appropriately logged.  We don't free up the buffer,
+                * the caller can tell it hasn't happened since it got an error
+                * back.
                 */
                return error;
        }
index 9354e190b82e9e9e4f33a38e2254309159622a97..4778d1dd511afae50eee4c6a7b5d0a7220fc4c4d 100644 (file)
@@ -67,7 +67,7 @@ xfs_dir3_block_verify(
        if (xfs_sb_version_hascrc(&mp->m_sb)) {
                if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC))
                        return false;
-               if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid))
+               if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
                        return false;
                if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
                        return false;
@@ -157,7 +157,7 @@ xfs_dir3_block_init(
                hdr3->magic = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
                hdr3->blkno = cpu_to_be64(bp->b_bn);
                hdr3->owner = cpu_to_be64(dp->i_ino);
-               uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
+               uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid);
                return;
 
        }
index de1ea16f57485ccc03143196eb2aced9147aed5b..824131e71bc53017082f518355c47f963e792cd8 100644 (file)
@@ -220,7 +220,7 @@ xfs_dir3_data_verify(
        if (xfs_sb_version_hascrc(&mp->m_sb)) {
                if (hdr3->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC))
                        return false;
-               if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid))
+               if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
                        return false;
                if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
                        return false;
@@ -252,7 +252,8 @@ xfs_dir3_data_reada_verify(
                return;
        case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
        case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
-               xfs_dir3_data_verify(bp);
+               bp->b_ops = &xfs_dir3_data_buf_ops;
+               bp->b_ops->verify_read(bp);
                return;
        default:
                xfs_buf_ioerror(bp, -EFSCORRUPTED);
@@ -604,7 +605,7 @@ xfs_dir3_data_init(
                hdr3->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
                hdr3->blkno = cpu_to_be64(bp->b_bn);
                hdr3->owner = cpu_to_be64(dp->i_ino);
-               uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
+               uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid);
 
        } else
                hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
index 106119955400a6d06cc73ff93cad2055a8e4bc1e..f300240ebb8d1575191379978ba5ed722c009718 100644 (file)
@@ -160,7 +160,7 @@ xfs_dir3_leaf_verify(
 
                if (leaf3->info.hdr.magic != cpu_to_be16(magic3))
                        return false;
-               if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_uuid))
+               if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_meta_uuid))
                        return false;
                if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
                        return false;
@@ -310,7 +310,7 @@ xfs_dir3_leaf_init(
                                         : cpu_to_be16(XFS_DIR3_LEAFN_MAGIC);
                leaf3->info.blkno = cpu_to_be64(bp->b_bn);
                leaf3->info.owner = cpu_to_be64(owner);
-               uuid_copy(&leaf3->info.uuid, &mp->m_sb.sb_uuid);
+               uuid_copy(&leaf3->info.uuid, &mp->m_sb.sb_meta_uuid);
        } else {
                memset(leaf, 0, sizeof(*leaf));
                leaf->hdr.info.magic = cpu_to_be16(type);
index 41b80d3d38772b100903bdcf7c604adad6417fd4..cc28e924545b52159cb8b0201b5d52dba58f7b1d 100644 (file)
@@ -93,7 +93,7 @@ xfs_dir3_free_verify(
 
                if (hdr3->magic != cpu_to_be32(XFS_DIR3_FREE_MAGIC))
                        return false;
-               if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid))
+               if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
                        return false;
                if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
                        return false;
@@ -226,7 +226,7 @@ xfs_dir3_free_get_buf(
 
                hdr3->hdr.blkno = cpu_to_be64(bp->b_bn);
                hdr3->hdr.owner = cpu_to_be64(dp->i_ino);
-               uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_uuid);
+               uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_meta_uuid);
        } else
                hdr.magic = XFS_DIR2_FREE_MAGIC;
        dp->d_ops->free_hdr_to_disk(bp->b_addr, &hdr);
@@ -1845,8 +1845,7 @@ xfs_dir2_node_addname_int(
 
                        if (dp->d_ops->db_to_fdb(args->geo, dbno) != fbno) {
                                xfs_alert(mp,
-                       "%s: dir ino %llu needed freesp block %lld for\n"
-                       "  data block %lld, got %lld ifbno %llu lastfbno %d",
+"%s: dir ino %llu needed freesp block %lld for data block %lld, got %lld ifbno %llu lastfbno %d",
                                        __func__, (unsigned long long)dp->i_ino,
                                        (long long)dp->d_ops->db_to_fdb(
                                                                args->geo, dbno),
@@ -2132,6 +2131,7 @@ xfs_dir2_node_replace(
        int                     error;          /* error return value */
        int                     i;              /* btree level */
        xfs_ino_t               inum;           /* new inode number */
+       int                     ftype;          /* new file type */
        xfs_dir2_leaf_t         *leaf;          /* leaf structure */
        xfs_dir2_leaf_entry_t   *lep;           /* leaf entry being changed */
        int                     rval;           /* internal return value */
@@ -2145,7 +2145,14 @@ xfs_dir2_node_replace(
        state = xfs_da_state_alloc();
        state->args = args;
        state->mp = args->dp->i_mount;
+
+       /*
+        * We have to save new inode number and ftype since
+        * xfs_da3_node_lookup_int() is going to overwrite them
+        */
        inum = args->inumber;
+       ftype = args->filetype;
+
        /*
         * Lookup the entry to change in the btree.
         */
@@ -2183,7 +2190,7 @@ xfs_dir2_node_replace(
                 * Fill in the new inode number and log the entry.
                 */
                dep->inumber = cpu_to_be64(inum);
-               args->dp->d_ops->data_put_ftype(dep, args->filetype);
+               args->dp->d_ops->data_put_ftype(dep, ftype);
                xfs_dir2_data_log_entry(args, state->extrablk.bp, dep);
                rval = 0;
        }
index 6fbf2d853a54e178d0699fa90cb03ad8647a349e..5331b7f0460c7e16a72aab1b7b96f727a606a10c 100644 (file)
@@ -163,7 +163,7 @@ xfs_dqcheck(
        d->dd_diskdq.d_id = cpu_to_be32(id);
 
        if (xfs_sb_version_hascrc(&mp->m_sb)) {
-               uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid);
+               uuid_copy(&d->dd_uuid, &mp->m_sb.sb_meta_uuid);
                xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
                                 XFS_DQUOT_CRC_OFF);
        }
@@ -198,7 +198,7 @@ xfs_dquot_buf_verify_crc(
                if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk),
                                 XFS_DQUOT_CRC_OFF))
                        return false;
-               if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_uuid))
+               if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_meta_uuid))
                        return false;
        }
        return true;
index a0ae572051de78b4169d5f363ba749f4030c170d..9590a069e5569a88942e4726382db7819d7884f7 100644 (file)
@@ -100,7 +100,7 @@ typedef struct xfs_sb {
        xfs_rfsblock_t  sb_dblocks;     /* number of data blocks */
        xfs_rfsblock_t  sb_rblocks;     /* number of realtime blocks */
        xfs_rtblock_t   sb_rextents;    /* number of realtime extents */
-       uuid_t          sb_uuid;        /* file system unique id */
+       uuid_t          sb_uuid;        /* user-visible file system unique id */
        xfs_fsblock_t   sb_logstart;    /* starting block of log if internal */
        xfs_ino_t       sb_rootino;     /* root inode number */
        xfs_ino_t       sb_rbmino;      /* bitmap inode for realtime extents */
@@ -174,6 +174,7 @@ typedef struct xfs_sb {
 
        xfs_ino_t       sb_pquotino;    /* project quota inode */
        xfs_lsn_t       sb_lsn;         /* last write sequence */
+       uuid_t          sb_meta_uuid;   /* metadata file system unique id */
 
        /* must be padded to 64 bit alignment */
 } xfs_sb_t;
@@ -190,7 +191,7 @@ typedef struct xfs_dsb {
        __be64          sb_dblocks;     /* number of data blocks */
        __be64          sb_rblocks;     /* number of realtime blocks */
        __be64          sb_rextents;    /* number of realtime extents */
-       uuid_t          sb_uuid;        /* file system unique id */
+       uuid_t          sb_uuid;        /* user-visible file system unique id */
        __be64          sb_logstart;    /* starting block of log if internal */
        __be64          sb_rootino;     /* root inode number */
        __be64          sb_rbmino;      /* bitmap inode for realtime extents */
@@ -260,6 +261,7 @@ typedef struct xfs_dsb {
 
        __be64          sb_pquotino;    /* project quota inode */
        __be64          sb_lsn;         /* last write sequence */
+       uuid_t          sb_meta_uuid;   /* metadata file system unique id */
 
        /* must be padded to 64 bit alignment */
 } xfs_dsb_t;
@@ -458,9 +460,11 @@ xfs_sb_has_ro_compat_feature(
 
 #define XFS_SB_FEAT_INCOMPAT_FTYPE     (1 << 0)        /* filetype in dirent */
 #define XFS_SB_FEAT_INCOMPAT_SPINODES  (1 << 1)        /* sparse inode chunks */
+#define XFS_SB_FEAT_INCOMPAT_META_UUID (1 << 2)        /* metadata UUID */
 #define XFS_SB_FEAT_INCOMPAT_ALL \
                (XFS_SB_FEAT_INCOMPAT_FTYPE|    \
-                XFS_SB_FEAT_INCOMPAT_SPINODES)
+                XFS_SB_FEAT_INCOMPAT_SPINODES| \
+                XFS_SB_FEAT_INCOMPAT_META_UUID)
 
 #define XFS_SB_FEAT_INCOMPAT_UNKNOWN   ~XFS_SB_FEAT_INCOMPAT_ALL
 static inline bool
@@ -514,6 +518,18 @@ static inline bool xfs_sb_version_hassparseinodes(struct xfs_sb *sbp)
                xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_SPINODES);
 }
 
+/*
+ * XFS_SB_FEAT_INCOMPAT_META_UUID indicates that the metadata UUID
+ * is stored separately from the user-visible UUID; this allows the
+ * user-visible UUID to be changed on V5 filesystems which have a
+ * filesystem UUID stamped into every piece of metadata.
+ */
+static inline bool xfs_sb_version_hasmetauuid(struct xfs_sb *sbp)
+{
+       return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) &&
+               (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID);
+}
+
 /*
  * end of superblock version macros
  */
index 66efc702452a0cd45920ce3fd021c689d7bdd40b..54deb2d12ac6bdfeb55ca2e05b56c81ed169e1a5 100644 (file)
@@ -338,7 +338,8 @@ xfs_ialloc_inode_init(
                        if (version == 3) {
                                free->di_ino = cpu_to_be64(ino);
                                ino++;
-                               uuid_copy(&free->di_uuid, &mp->m_sb.sb_uuid);
+                               uuid_copy(&free->di_uuid,
+                                         &mp->m_sb.sb_meta_uuid);
                                xfs_dinode_calc_crc(mp, free);
                        } else if (tp) {
                                /* just log the inode core */
@@ -2232,7 +2233,7 @@ xfs_imap_lookup(
        }
 
        xfs_trans_brelse(tp, agbp);
-       xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+       xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
        if (error)
                return error;
 
@@ -2500,7 +2501,7 @@ xfs_agi_verify(
        struct xfs_agi  *agi = XFS_BUF_TO_AGI(bp);
 
        if (xfs_sb_version_hascrc(&mp->m_sb) &&
-           !uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_uuid))
+           !uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid))
                        return false;
        /*
         * Validate the magic number of the agi block.
index 674ad8f760be25ea8fd969b182eee8cee0af7900..f39b285beb19f659ab5a6f0774c966134c0fdf58 100644 (file)
@@ -239,7 +239,7 @@ xfs_inobt_verify(
        case cpu_to_be32(XFS_FIBT_CRC_MAGIC):
                if (!xfs_sb_version_hascrc(&mp->m_sb))
                        return false;
-               if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
+               if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
                        return false;
                if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
                        return false;
index 6526e7696184b75fd3206ba4d6c48e354a85721d..268c00f4f83af9b5eb3fe492322323c7be2f5b42 100644 (file)
@@ -304,7 +304,7 @@ xfs_dinode_verify(
                return false;
        if (be64_to_cpu(dip->di_ino) != ip->i_ino)
                return false;
-       if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_uuid))
+       if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_meta_uuid))
                return false;
        return true;
 }
@@ -366,7 +366,7 @@ xfs_iread(
                if (xfs_sb_version_hascrc(&mp->m_sb)) {
                        ip->i_d.di_version = 3;
                        ip->i_d.di_ino = ip->i_ino;
-                       uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid);
+                       uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_meta_uuid);
                } else
                        ip->i_d.di_version = 2;
                return 0;
index df9851c46b5c2e053926b9f223a318ba04ee6489..47425140f34303745bd8ff4b2b2dd0d34d0f033e 100644 (file)
@@ -131,10 +131,11 @@ xfs_mount_validate_sb(
                if (xfs_sb_has_compat_feature(sbp,
                                        XFS_SB_FEAT_COMPAT_UNKNOWN)) {
                        xfs_warn(mp,
-"Superblock has unknown compatible features (0x%x) enabled.\n"
-"Using a more recent kernel is recommended.",
+"Superblock has unknown compatible features (0x%x) enabled.",
                                (sbp->sb_features_compat &
                                                XFS_SB_FEAT_COMPAT_UNKNOWN));
+                       xfs_warn(mp,
+"Using a more recent kernel is recommended.");
                }
 
                if (xfs_sb_has_ro_compat_feature(sbp,
@@ -145,18 +146,21 @@ xfs_mount_validate_sb(
                                                XFS_SB_FEAT_RO_COMPAT_UNKNOWN));
                        if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
                                xfs_warn(mp,
-"Attempted to mount read-only compatible filesystem read-write.\n"
+"Attempted to mount read-only compatible filesystem read-write.");
+                               xfs_warn(mp,
 "Filesystem can only be safely mounted read only.");
+
                                return -EINVAL;
                        }
                }
                if (xfs_sb_has_incompat_feature(sbp,
                                        XFS_SB_FEAT_INCOMPAT_UNKNOWN)) {
                        xfs_warn(mp,
-"Superblock has unknown incompatible features (0x%x) enabled.\n"
-"Filesystem can not be safely mounted by this kernel.",
+"Superblock has unknown incompatible features (0x%x) enabled.",
                                (sbp->sb_features_incompat &
                                                XFS_SB_FEAT_INCOMPAT_UNKNOWN));
+                       xfs_warn(mp,
+"Filesystem can not be safely mounted by this kernel.");
                        return -EINVAL;
                }
        }
@@ -182,9 +186,6 @@ xfs_mount_validate_sb(
        if (xfs_sb_version_hassparseinodes(sbp)) {
                uint32_t        align;
 
-               xfs_alert(mp,
-       "EXPERIMENTAL sparse inode feature enabled. Use at your own risk!");
-
                align = XFS_INODES_PER_CHUNK * sbp->sb_inodesize
                                >> sbp->sb_blocklog;
                if (sbp->sb_inoalignmt != align) {
@@ -398,6 +399,14 @@ __xfs_sb_from_disk(
        to->sb_spino_align = be32_to_cpu(from->sb_spino_align);
        to->sb_pquotino = be64_to_cpu(from->sb_pquotino);
        to->sb_lsn = be64_to_cpu(from->sb_lsn);
+       /*
+        * sb_meta_uuid is only on disk if it differs from sb_uuid and the
+        * feature flag is set; if not set we keep it only in memory.
+        */
+       if (xfs_sb_version_hasmetauuid(to))
+               uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid);
+       else
+               uuid_copy(&to->sb_meta_uuid, &from->sb_uuid);
        /* Convert on-disk flags to in-memory flags? */
        if (convert_xquota)
                xfs_sb_quota_from_disk(to);
@@ -539,6 +548,8 @@ xfs_sb_to_disk(
                                cpu_to_be32(from->sb_features_log_incompat);
                to->sb_spino_align = cpu_to_be32(from->sb_spino_align);
                to->sb_lsn = cpu_to_be64(from->sb_lsn);
+               if (xfs_sb_version_hasmetauuid(from))
+                       uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid);
        }
 }
 
index e7e26bd6468fdd218e3390a9913487f52e9576a2..8f8af05b3f13da7acadc9ebfbc64f0e7debaf655 100644 (file)
@@ -63,7 +63,7 @@ xfs_symlink_hdr_set(
        dsl->sl_magic = cpu_to_be32(XFS_SYMLINK_MAGIC);
        dsl->sl_offset = cpu_to_be32(offset);
        dsl->sl_bytes = cpu_to_be32(size);
-       uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_uuid);
+       uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid);
        dsl->sl_owner = cpu_to_be64(ino);
        dsl->sl_blkno = cpu_to_be64(bp->b_bn);
        bp->b_ops = &xfs_symlink_buf_ops;
@@ -107,7 +107,7 @@ xfs_symlink_verify(
                return false;
        if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC))
                return false;
-       if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_uuid))
+       if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid))
                return false;
        if (bp->b_bn != be64_to_cpu(dsl->sl_blkno))
                return false;
index cc2a321f774b204de565db643483ce4c48518bbf..50ab2879b9da0ec1211abacbf6598b0a90eddd8a 100644 (file)
@@ -353,7 +353,8 @@ xfs_end_bio(
 {
        xfs_ioend_t             *ioend = bio->bi_private;
 
-       ioend->io_error = bio->bi_error;
+       if (!ioend->io_error)
+               ioend->io_error = bio->bi_error;
 
        /* Toss bio and pass work off to an xfsdatad thread */
        bio->bi_private = NULL;
diff --git a/fs/xfs/xfs_bit.c b/fs/xfs/xfs_bit.c
deleted file mode 100644 (file)
index 0e8885a..0000000
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_log_format.h"
-#include "xfs_bit.h"
-
-/*
- * XFS bit manipulation routines, used in non-realtime code.
- */
-
-/*
- * Return whether bitmap is empty.
- * Size is number of words in the bitmap, which is padded to word boundary
- * Returns 1 for empty, 0 for non-empty.
- */
-int
-xfs_bitmap_empty(uint *map, uint size)
-{
-       uint i;
-       uint ret = 0;
-
-       for (i = 0; i < size; i++) {
-               ret |= map[i];
-       }
-
-       return (ret == 0);
-}
-
-/*
- * Count the number of contiguous bits set in the bitmap starting with bit
- * start_bit.  Size is the size of the bitmap in words.
- */
-int
-xfs_contig_bits(uint *map, uint        size, uint start_bit)
-{
-       uint * p = ((unsigned int *) map) + (start_bit >> BIT_TO_WORD_SHIFT);
-       uint result = 0;
-       uint tmp;
-
-       size <<= BIT_TO_WORD_SHIFT;
-
-       ASSERT(start_bit < size);
-       size -= start_bit & ~(NBWORD - 1);
-       start_bit &= (NBWORD - 1);
-       if (start_bit) {
-               tmp = *p++;
-               /* set to one first offset bits prior to start */
-               tmp |= (~0U >> (NBWORD-start_bit));
-               if (tmp != ~0U)
-                       goto found;
-               result += NBWORD;
-               size -= NBWORD;
-       }
-       while (size) {
-               if ((tmp = *p++) != ~0U)
-                       goto found;
-               result += NBWORD;
-               size -= NBWORD;
-       }
-       return result - start_bit;
-found:
-       return result + ffz(tmp) - start_bit;
-}
-
-/*
- * This takes the bit number to start looking from and
- * returns the next set bit from there.  It returns -1
- * if there are no more bits set or the start bit is
- * beyond the end of the bitmap.
- *
- * Size is the number of words, not bytes, in the bitmap.
- */
-int xfs_next_bit(uint *map, uint size, uint start_bit)
-{
-       uint * p = ((unsigned int *) map) + (start_bit >> BIT_TO_WORD_SHIFT);
-       uint result = start_bit & ~(NBWORD - 1);
-       uint tmp;
-
-       size <<= BIT_TO_WORD_SHIFT;
-
-       if (start_bit >= size)
-               return -1;
-       size -= result;
-       start_bit &= (NBWORD - 1);
-       if (start_bit) {
-               tmp = *p++;
-               /* set to zero first offset bits prior to start */
-               tmp &= (~0U << start_bit);
-               if (tmp != 0U)
-                       goto found;
-               result += NBWORD;
-               size -= NBWORD;
-       }
-       while (size) {
-               if ((tmp = *p++) != 0U)
-                       goto found;
-               result += NBWORD;
-               size -= NBWORD;
-       }
-       return -1;
-found:
-       return result + ffs(tmp) - 1;
-}
index 0f34886cf7269b1cd67eadbc372655372253981b..3bf4ad0d19e4f3203bdb36b23f74616f9c691741 100644 (file)
@@ -67,16 +67,15 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
  */
 int                                            /* error */
 xfs_bmap_finish(
-       xfs_trans_t             **tp,           /* transaction pointer addr */
-       xfs_bmap_free_t         *flist,         /* i/o: list extents to free */
-       int                     *committed)     /* xact committed or not */
+       struct xfs_trans                **tp,   /* transaction pointer addr */
+       struct xfs_bmap_free            *flist, /* i/o: list extents to free */
+       int                             *committed)/* xact committed or not */
 {
-       xfs_efd_log_item_t      *efd;           /* extent free data */
-       xfs_efi_log_item_t      *efi;           /* extent free intention */
-       int                     error;          /* error return value */
-       xfs_bmap_free_item_t    *free;          /* free extent item */
-       xfs_mount_t             *mp;            /* filesystem mount structure */
-       xfs_bmap_free_item_t    *next;          /* next item on free list */
+       struct xfs_efd_log_item         *efd;   /* extent free data */
+       struct xfs_efi_log_item         *efi;   /* extent free intention */
+       int                             error;  /* error return value */
+       struct xfs_bmap_free_item       *free;  /* free extent item */
+       struct xfs_bmap_free_item       *next;  /* next item on free list */
 
        ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
        if (flist->xbf_count == 0) {
@@ -88,40 +87,48 @@ xfs_bmap_finish(
                xfs_trans_log_efi_extent(*tp, efi, free->xbfi_startblock,
                        free->xbfi_blockcount);
 
-       error = xfs_trans_roll(tp, NULL);
-       *committed = 1;
-       /*
-        * We have a new transaction, so we should return committed=1,
-        * even though we're returning an error.
-        */
-       if (error)
+       error = __xfs_trans_roll(tp, NULL, committed);
+       if (error) {
+               /*
+                * If the transaction was committed, drop the EFD reference
+                * since we're bailing out of here. The other reference is
+                * dropped when the EFI hits the AIL.
+                *
+                * If the transaction was not committed, the EFI is freed by the
+                * EFI item unlock handler on abort. Also, we have a new
+                * transaction so we should return committed=1 even though we're
+                * returning an error.
+                */
+               if (*committed) {
+                       xfs_efi_release(efi);
+                       xfs_force_shutdown((*tp)->t_mountp,
+                               (error == -EFSCORRUPTED) ?
+                                       SHUTDOWN_CORRUPT_INCORE :
+                                       SHUTDOWN_META_IO_ERROR);
+               } else {
+                       *committed = 1;
+               }
+
                return error;
+       }
 
+       /*
+        * Get an EFD and free each extent in the list, logging to the EFD in
+        * the process. The remaining bmap free list is cleaned up by the caller
+        * on error.
+        */
        efd = xfs_trans_get_efd(*tp, efi, flist->xbf_count);
        for (free = flist->xbf_first; free != NULL; free = next) {
                next = free->xbfi_next;
-               if ((error = xfs_free_extent(*tp, free->xbfi_startblock,
-                               free->xbfi_blockcount))) {
-                       /*
-                        * The bmap free list will be cleaned up at a
-                        * higher level.  The EFI will be canceled when
-                        * this transaction is aborted.
-                        * Need to force shutdown here to make sure it
-                        * happens, since this transaction may not be
-                        * dirty yet.
-                        */
-                       mp = (*tp)->t_mountp;
-                       if (!XFS_FORCED_SHUTDOWN(mp))
-                               xfs_force_shutdown(mp,
-                                                  (error == -EFSCORRUPTED) ?
-                                                  SHUTDOWN_CORRUPT_INCORE :
-                                                  SHUTDOWN_META_IO_ERROR);
+
+               error = xfs_trans_free_extent(*tp, efd, free->xbfi_startblock,
+                                             free->xbfi_blockcount);
+               if (error)
                        return error;
-               }
-               xfs_trans_log_efd_extent(*tp, efd, free->xbfi_startblock,
-                       free->xbfi_blockcount);
+
                xfs_bmap_del_free(flist, NULL, free);
        }
+
        return 0;
 }
 
@@ -1467,7 +1474,7 @@ xfs_shift_file_space(
                                XFS_DIOSTRAT_SPACE_RES(mp, 0), 0,
                                XFS_QMOPT_RES_REGBLKS);
                if (error)
-                       goto out;
+                       goto out_trans_cancel;
 
                xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 
@@ -1481,18 +1488,20 @@ xfs_shift_file_space(
                                &done, stop_fsb, &first_block, &free_list,
                                direction, XFS_BMAP_MAX_SHIFT_EXTENTS);
                if (error)
-                       goto out;
+                       goto out_bmap_cancel;
 
                error = xfs_bmap_finish(&tp, &free_list, &committed);
                if (error)
-                       goto out;
+                       goto out_bmap_cancel;
 
                error = xfs_trans_commit(tp);
        }
 
        return error;
 
-out:
+out_bmap_cancel:
+       xfs_bmap_cancel(&free_list);
+out_trans_cancel:
        xfs_trans_cancel(tp);
        return error;
 }
index 01bd6781974eab6ebbcaf50a294dc27a08b07f51..8ecffb35935b0c6f66cbc8d49993b740e052658d 100644 (file)
@@ -438,7 +438,6 @@ _xfs_buf_find(
        xfs_buf_flags_t         flags,
        xfs_buf_t               *new_bp)
 {
-       size_t                  numbytes;
        struct xfs_perag        *pag;
        struct rb_node          **rbp;
        struct rb_node          *parent;
@@ -450,10 +449,9 @@ _xfs_buf_find(
 
        for (i = 0; i < nmaps; i++)
                numblks += map[i].bm_len;
-       numbytes = BBTOB(numblks);
 
        /* Check for IOs smaller than the sector size / not sector aligned */
-       ASSERT(!(numbytes < btp->bt_meta_sectorsize));
+       ASSERT(!(BBTOB(numblks) < btp->bt_meta_sectorsize));
        ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_meta_sectormask));
 
        /*
@@ -1532,9 +1530,10 @@ xfs_wait_buftarg(
                        list_del_init(&bp->b_lru);
                        if (bp->b_flags & XBF_WRITE_FAIL) {
                                xfs_alert(btp->bt_mount,
-"Corruption Alert: Buffer at block 0x%llx had permanent write failures!\n"
-"Please run xfs_repair to determine the extent of the problem.",
+"Corruption Alert: Buffer at block 0x%llx had permanent write failures!",
                                        (long long)bp->b_bn);
+                               xfs_alert(btp->bt_mount,
+"Please run xfs_repair to determine the extent of the problem.");
                        }
                        xfs_buf_rele(bp);
                }
index 331c1ccf826478732aeb18bd8ee59103100e5597..c79b717d9b882f5a84d1fe07dc1ead83acac8ba2 100644 (file)
@@ -23,6 +23,7 @@
 #include <linux/spinlock.h>
 #include <linux/mm.h>
 #include <linux/fs.h>
+#include <linux/dax.h>
 #include <linux/buffer_head.h>
 #include <linux/uio.h>
 #include <linux/list_lru.h>
index 092d652bc03df3153b6bcf427ed972ffe480fd86..7e986da34f6cb40ad3aca9e9845f81a070dd2d4d 100644 (file)
@@ -647,11 +647,7 @@ xfs_buf_item_unlock(
                        xfs_buf_item_relse(bp);
                else if (aborted) {
                        ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp));
-                       if (lip->li_flags & XFS_LI_IN_AIL) {
-                               spin_lock(&lip->li_ailp->xa_lock);
-                               xfs_trans_ail_delete(lip->li_ailp, lip,
-                                                    SHUTDOWN_LOG_IO_ERROR);
-                       }
+                       xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR);
                        xfs_buf_item_relse(bp);
                }
        }
@@ -750,13 +746,13 @@ xfs_buf_item_free_format(
  * buffer (see xfs_buf_attach_iodone() below), then put the
  * buf log item at the front.
  */
-void
+int
 xfs_buf_item_init(
-       xfs_buf_t       *bp,
-       xfs_mount_t     *mp)
+       struct xfs_buf  *bp,
+       struct xfs_mount *mp)
 {
-       xfs_log_item_t          *lip = bp->b_fspriv;
-       xfs_buf_log_item_t      *bip;
+       struct xfs_log_item     *lip = bp->b_fspriv;
+       struct xfs_buf_log_item *bip;
        int                     chunks;
        int                     map_size;
        int                     error;
@@ -770,12 +766,11 @@ xfs_buf_item_init(
         */
        ASSERT(bp->b_target->bt_mount == mp);
        if (lip != NULL && lip->li_type == XFS_LI_BUF)
-               return;
+               return 0;
 
        bip = kmem_zone_zalloc(xfs_buf_item_zone, KM_SLEEP);
        xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
        bip->bli_buf = bp;
-       xfs_buf_hold(bp);
 
        /*
         * chunks is the number of XFS_BLF_CHUNK size pieces the buffer
@@ -788,6 +783,11 @@ xfs_buf_item_init(
         */
        error = xfs_buf_item_get_format(bip, bp->b_map_count);
        ASSERT(error == 0);
+       if (error) {    /* to stop gcc throwing set-but-unused warnings */
+               kmem_zone_free(xfs_buf_item_zone, bip);
+               return error;
+       }
+
 
        for (i = 0; i < bip->bli_format_count; i++) {
                chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len),
@@ -807,6 +807,8 @@ xfs_buf_item_init(
        if (bp->b_fspriv)
                bip->bli_item.li_bio_list = bp->b_fspriv;
        bp->b_fspriv = bip;
+       xfs_buf_hold(bp);
+       return 0;
 }
 
 
index 3f3455a415102de167271a7467725da410a21f24..f7eba99d19dde9404c1517633fd8f34f16a36599 100644 (file)
@@ -61,7 +61,7 @@ typedef struct xfs_buf_log_item {
        struct xfs_buf_log_format __bli_format; /* embedded in-log header */
 } xfs_buf_log_item_t;
 
-void   xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
+int    xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
 void   xfs_buf_item_relse(struct xfs_buf *);
 void   xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint);
 uint   xfs_buf_item_dirty(xfs_buf_log_item_t *);
index 098cd78fe708433fdf77306637626d6b69400582..a989a9c7edb7fe22e2a425c3efaef417bde89b72 100644 (file)
@@ -171,6 +171,7 @@ xfs_dir2_block_getdents(
        int                     wantoff;        /* starting block offset */
        xfs_off_t               cook;
        struct xfs_da_geometry  *geo = args->geo;
+       int                     lock_mode;
 
        /*
         * If the block number in the offset is out of range, we're done.
@@ -178,7 +179,9 @@ xfs_dir2_block_getdents(
        if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk)
                return 0;
 
+       lock_mode = xfs_ilock_data_map_shared(dp);
        error = xfs_dir3_block_read(NULL, dp, &bp);
+       xfs_iunlock(dp, lock_mode);
        if (error)
                return error;
 
@@ -529,9 +532,12 @@ xfs_dir2_leaf_getdents(
                 * current buffer, need to get another one.
                 */
                if (!bp || ptr >= (char *)bp->b_addr + geo->blksize) {
+                       int     lock_mode;
 
+                       lock_mode = xfs_ilock_data_map_shared(dp);
                        error = xfs_dir2_leaf_readbuf(args, bufsize, map_info,
                                                      &curoff, &bp);
+                       xfs_iunlock(dp, lock_mode);
                        if (error || !map_info->map_valid)
                                break;
 
@@ -653,7 +659,6 @@ xfs_readdir(
        struct xfs_da_args      args = { NULL };
        int                     rval;
        int                     v;
-       uint                    lock_mode;
 
        trace_xfs_readdir(dp);
 
@@ -666,7 +671,7 @@ xfs_readdir(
        args.dp = dp;
        args.geo = dp->i_mount->m_dir_geo;
 
-       lock_mode = xfs_ilock_data_map_shared(dp);
+       xfs_ilock(dp, XFS_IOLOCK_SHARED);
        if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
                rval = xfs_dir2_sf_getdents(&args, ctx);
        else if ((rval = xfs_dir2_isblock(&args, &v)))
@@ -675,7 +680,7 @@ xfs_readdir(
                rval = xfs_dir2_block_getdents(&args, ctx);
        else
                rval = xfs_dir2_leaf_getdents(&args, ctx, bufsize);
-       xfs_iunlock(dp, lock_mode);
+       xfs_iunlock(dp, XFS_IOLOCK_SHARED);
 
        return rval;
 }
index 4143dc75dca4b22b15003ebd1bce5b3561a5deb8..30cb3afb67f091e3e96f7b6a6d189d4464706b68 100644 (file)
@@ -251,7 +251,7 @@ xfs_qm_init_dquot_blk(
                d->dd_diskdq.d_id = cpu_to_be32(curid);
                d->dd_diskdq.d_flags = type;
                if (xfs_sb_version_hascrc(&mp->m_sb)) {
-                       uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid);
+                       uuid_copy(&d->dd_uuid, &mp->m_sb.sb_meta_uuid);
                        xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
                                         XFS_DQUOT_CRC_OFF);
                }
@@ -954,12 +954,8 @@ xfs_qm_dqflush(
                struct xfs_log_item     *lip = &dqp->q_logitem.qli_item;
                dqp->dq_flags &= ~XFS_DQ_DIRTY;
 
-               spin_lock(&mp->m_ail->xa_lock);
-               if (lip->li_flags & XFS_LI_IN_AIL)
-                       xfs_trans_ail_delete(mp->m_ail, lip,
-                                            SHUTDOWN_CORRUPT_INCORE);
-               else
-                       spin_unlock(&mp->m_ail->xa_lock);
+               xfs_trans_ail_remove(lip, SHUTDOWN_CORRUPT_INCORE);
+
                error = -EIO;
                goto out_unlock;
        }
index adc8f8fdd145ae4c0facfb377b177fdf3a59baeb..4aa0153214f91fb9ec03621c0b006317ca3d0e09 100644 (file)
@@ -46,28 +46,6 @@ xfs_efi_item_free(
                kmem_zone_free(xfs_efi_zone, efip);
 }
 
-/*
- * Freeing the efi requires that we remove it from the AIL if it has already
- * been placed there. However, the EFI may not yet have been placed in the AIL
- * when called by xfs_efi_release() from EFD processing due to the ordering of
- * committed vs unpin operations in bulk insert operations. Hence the reference
- * count to ensure only the last caller frees the EFI.
- */
-STATIC void
-__xfs_efi_release(
-       struct xfs_efi_log_item *efip)
-{
-       struct xfs_ail          *ailp = efip->efi_item.li_ailp;
-
-       if (atomic_dec_and_test(&efip->efi_refcount)) {
-               spin_lock(&ailp->xa_lock);
-               /* xfs_trans_ail_delete() drops the AIL lock. */
-               xfs_trans_ail_delete(ailp, &efip->efi_item,
-                                    SHUTDOWN_LOG_IO_ERROR);
-               xfs_efi_item_free(efip);
-       }
-}
-
 /*
  * This returns the number of iovecs needed to log the given efi item.
  * We only need 1 iovec for an efi item.  It just logs the efi_log_format
@@ -128,12 +106,12 @@ xfs_efi_item_pin(
 }
 
 /*
- * While EFIs cannot really be pinned, the unpin operation is the last place at
- * which the EFI is manipulated during a transaction.  If we are being asked to
- * remove the EFI it's because the transaction has been cancelled and by
- * definition that means the EFI cannot be in the AIL so remove it from the
- * transaction and free it.  Otherwise coordinate with xfs_efi_release()
- * to determine who gets to free the EFI.
+ * The unpin operation is the last place an EFI is manipulated in the log. It is
+ * either inserted in the AIL or aborted in the event of a log I/O error. In
+ * either case, the EFI transaction has been successfully committed to make it
+ * this far. Therefore, we expect whoever committed the EFI to either construct
+ * and commit the EFD or drop the EFD's reference in the event of error. Simply
+ * drop the log's EFI reference now that the log is done with it.
  */
 STATIC void
 xfs_efi_item_unpin(
@@ -141,15 +119,7 @@ xfs_efi_item_unpin(
        int                     remove)
 {
        struct xfs_efi_log_item *efip = EFI_ITEM(lip);
-
-       if (remove) {
-               ASSERT(!(lip->li_flags & XFS_LI_IN_AIL));
-               if (lip->li_desc)
-                       xfs_trans_del_item(lip);
-               xfs_efi_item_free(efip);
-               return;
-       }
-       __xfs_efi_release(efip);
+       xfs_efi_release(efip);
 }
 
 /*
@@ -167,6 +137,11 @@ xfs_efi_item_push(
        return XFS_ITEM_PINNED;
 }
 
+/*
+ * The EFI has been either committed or aborted if the transaction has been
+ * cancelled. If the transaction was cancelled, an EFD isn't going to be
+ * constructed and thus we free the EFI here directly.
+ */
 STATIC void
 xfs_efi_item_unlock(
        struct xfs_log_item     *lip)
@@ -301,23 +276,19 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
 }
 
 /*
- * This is called by the efd item code below to release references to the given
- * efi item.  Each efd calls this with the number of extents that it has
- * logged, and when the sum of these reaches the total number of extents logged
- * by this efi item we can free the efi item.
+ * Freeing the efi requires that we remove it from the AIL if it has already
+ * been placed there. However, the EFI may not yet have been placed in the AIL
+ * when called by xfs_efi_release() from EFD processing due to the ordering of
+ * committed vs unpin operations in bulk insert operations. Hence the reference
+ * count to ensure only the last caller frees the EFI.
  */
 void
-xfs_efi_release(xfs_efi_log_item_t     *efip,
-               uint                    nextents)
+xfs_efi_release(
+       struct xfs_efi_log_item *efip)
 {
-       ASSERT(atomic_read(&efip->efi_next_extent) >= nextents);
-       if (atomic_sub_and_test(nextents, &efip->efi_next_extent)) {
-               /* recovery needs us to drop the EFI reference, too */
-               if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags))
-                       __xfs_efi_release(efip);
-
-               __xfs_efi_release(efip);
-               /* efip may now have been freed, do not reference it again. */
+       if (atomic_dec_and_test(&efip->efi_refcount)) {
+               xfs_trans_ail_remove(&efip->efi_item, SHUTDOWN_LOG_IO_ERROR);
+               xfs_efi_item_free(efip);
        }
 }
 
@@ -415,20 +386,27 @@ xfs_efd_item_push(
        return XFS_ITEM_PINNED;
 }
 
+/*
+ * The EFD is either committed or aborted if the transaction is cancelled. If
+ * the transaction is cancelled, drop our reference to the EFI and free the EFD.
+ */
 STATIC void
 xfs_efd_item_unlock(
        struct xfs_log_item     *lip)
 {
-       if (lip->li_flags & XFS_LI_ABORTED)
-               xfs_efd_item_free(EFD_ITEM(lip));
+       struct xfs_efd_log_item *efdp = EFD_ITEM(lip);
+
+       if (lip->li_flags & XFS_LI_ABORTED) {
+               xfs_efi_release(efdp->efd_efip);
+               xfs_efd_item_free(efdp);
+       }
 }
 
 /*
- * When the efd item is committed to disk, all we need to do
- * is delete our reference to our partner efi item and then
- * free ourselves.  Since we're freeing ourselves we must
- * return -1 to keep the transaction code from further referencing
- * this item.
+ * When the efd item is committed to disk, all we need to do is delete our
+ * reference to our partner efi item and then free ourselves. Since we're
+ * freeing ourselves we must return -1 to keep the transaction code from further
+ * referencing this item.
  */
 STATIC xfs_lsn_t
 xfs_efd_item_committed(
@@ -438,13 +416,14 @@ xfs_efd_item_committed(
        struct xfs_efd_log_item *efdp = EFD_ITEM(lip);
 
        /*
-        * If we got a log I/O error, it's always the case that the LR with the
-        * EFI got unpinned and freed before the EFD got aborted.
+        * Drop the EFI reference regardless of whether the EFD has been
+        * aborted. Once the EFD transaction is constructed, it is the sole
+        * responsibility of the EFD to release the EFI (even if the EFI is
+        * aborted due to log I/O error).
         */
-       if (!(lip->li_flags & XFS_LI_ABORTED))
-               xfs_efi_release(efdp->efd_efip, efdp->efd_format.efd_nextents);
-
+       xfs_efi_release(efdp->efd_efip);
        xfs_efd_item_free(efdp);
+
        return (xfs_lsn_t)-1;
 }
 
index 0ffbce32d5693e05e042de8983c8274942616196..8fa8651705e1dc33bb84f1234b411aca9c73ef76 100644 (file)
@@ -39,9 +39,28 @@ struct kmem_zone;
  * "extent free done" log item described below.
  *
  * The EFI is reference counted so that it is not freed prior to both the EFI
- * and EFD being committed and unpinned. This ensures that when the last
- * reference goes away the EFI will always be in the AIL as it has been
- * unpinned, regardless of whether the EFD is processed before or after the EFI.
+ * and EFD being committed and unpinned. This ensures the EFI is inserted into
+ * the AIL even in the event of out of order EFI/EFD processing. In other words,
+ * an EFI is born with two references:
+ *
+ *     1.) an EFI held reference to track EFI AIL insertion
+ *     2.) an EFD held reference to track EFD commit
+ *
+ * On allocation, both references are the responsibility of the caller. Once the
+ * EFI is added to and dirtied in a transaction, ownership of reference one
+ * transfers to the transaction. The reference is dropped once the EFI is
+ * inserted to the AIL or in the event of failure along the way (e.g., commit
+ * failure, log I/O error, etc.). Note that the caller remains responsible for
+ * the EFD reference under all circumstances to this point. The caller has no
+ * means to detect failure once the transaction is committed, however.
+ * Therefore, an EFD is required after this point, even in the event of
+ * unrelated failure.
+ *
+ * Once an EFD is allocated and dirtied in a transaction, reference two
+ * transfers to the transaction. The EFD reference is dropped once it reaches
+ * the unpin handler. Similar to the EFI, the reference also drops in the event
+ * of commit failure or log I/O errors. Note that the EFD is not inserted in the
+ * AIL, so at this point both the EFI and EFD are freed.
  */
 typedef struct xfs_efi_log_item {
        xfs_log_item_t          efi_item;
@@ -77,5 +96,6 @@ xfs_efd_log_item_t    *xfs_efd_init(struct xfs_mount *, xfs_efi_log_item_t *,
 int                    xfs_efi_copy_format(xfs_log_iovec_t *buf,
                                            xfs_efi_log_format_t *dst_efi_fmt);
 void                   xfs_efi_item_free(xfs_efi_log_item_t *);
+void                   xfs_efi_release(struct xfs_efi_log_item *);
 
 #endif /* __XFS_EXTFREE_ITEM_H__ */
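
As the reworked comment above describes, an EFI is now born with two references: one dropped when the log is done with the EFI (AIL insertion or abort) and one dropped by the matching EFD. A small userspace sketch of that ownership model, using C11 atomics in place of the kernel's atomic_t; the names are illustrative, not the XFS API:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

/* Illustrative analogue of the EFI: freed only when the last reference drops. */
struct efi {
	atomic_int refcount;
};

static struct efi *efi_alloc(void)
{
	struct efi *efip = malloc(sizeof(*efip));

	/* Reference 1: AIL insertion/unpin; reference 2: the matching EFD. */
	atomic_init(&efip->refcount, 2);
	return efip;
}

static void efi_release(struct efi *efip)
{
	/* fetch_sub returns the old value; old value 1 means we were last. */
	if (atomic_fetch_sub(&efip->refcount, 1) == 1) {
		printf("last reference dropped, freeing EFI\n");
		free(efip);
	}
}

int main(void)
{
	struct efi *efip = efi_alloc();

	efi_release(efip);	/* log is done with the EFI (unpin or abort)   */
	efi_release(efip);	/* EFD committed (or aborted) drops its share   */
	return 0;
}
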
index db4acc1c3e73479cdf32322944a7fa1bcf34b96f..e78feb400e22b22d59228b4f959d5b845848a675 100644 (file)
@@ -317,24 +317,33 @@ xfs_file_read_iter(
                return -EIO;
 
        /*
-        * Locking is a bit tricky here. If we take an exclusive lock
-        * for direct IO, we effectively serialise all new concurrent
-        * read IO to this file and block it behind IO that is currently in
-        * progress because IO in progress holds the IO lock shared. We only
-        * need to hold the lock exclusive to blow away the page cache, so
-        * only take lock exclusively if the page cache needs invalidation.
-        * This allows the normal direct IO case of no page cache pages to
-        * proceeed concurrently without serialisation.
+        * Locking is a bit tricky here. If we take an exclusive lock for direct
+        * IO, we effectively serialise all new concurrent read IO to this file
+        * and block it behind IO that is currently in progress because IO in
+        * progress holds the IO lock shared. We only need to hold the lock
+        * exclusive to blow away the page cache, so only take lock exclusively
+        * if the page cache needs invalidation. This allows the normal direct
+        * IO case of no page cache pages to proceed concurrently without
+        * serialisation.
         */
        xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
        if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) {
                xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
                xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
 
+               /*
+                * The generic dio code only flushes the range of the particular
+                * I/O. Because we take an exclusive lock here, this whole
+                * sequence is considerably more expensive for us. This has a
+                * noticeable performance impact for any file with cached pages,
+                * even when outside of the range of the particular I/O.
+                *
+                * Hence, amortize the cost of the lock against a full file
+                * flush and reduce the chances of repeated iolock cycles going
+                * forward.
+                */
                if (inode->i_mapping->nrpages) {
-                       ret = filemap_write_and_wait_range(
-                                                       VFS_I(ip)->i_mapping,
-                                                       pos, pos + size - 1);
+                       ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
                        if (ret) {
                                xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
                                return ret;
@@ -345,9 +354,7 @@ xfs_file_read_iter(
                         * we fail to invalidate a page, but this should never
                         * happen on XFS. Warn if it does fail.
                         */
-                       ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
-                                       pos >> PAGE_CACHE_SHIFT,
-                                       (pos + size - 1) >> PAGE_CACHE_SHIFT);
+                       ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping);
                        WARN_ON_ONCE(ret);
                        ret = 0;
                }
@@ -733,19 +740,19 @@ xfs_file_dio_aio_write(
        pos = iocb->ki_pos;
        end = pos + count - 1;
 
+       /*
+        * See xfs_file_read_iter() for why we do a full-file flush here.
+        */
        if (mapping->nrpages) {
-               ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
-                                                  pos, end);
+               ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
                if (ret)
                        goto out;
                /*
-                * Invalidate whole pages. This can return an error if
-                * we fail to invalidate a page, but this should never
-                * happen on XFS. Warn if it does fail.
+                * Invalidate whole pages. This can return an error if we fail
+                * to invalidate a page, but this should never happen on XFS.
+                * Warn if it does fail.
                 */
-               ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
-                                       pos >> PAGE_CACHE_SHIFT,
-                                       end >> PAGE_CACHE_SHIFT);
+               ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping);
                WARN_ON_ONCE(ret);
                ret = 0;
        }
@@ -1539,8 +1546,36 @@ xfs_filemap_fault(
        return ret;
 }
 
+STATIC int
+xfs_filemap_pmd_fault(
+       struct vm_area_struct   *vma,
+       unsigned long           addr,
+       pmd_t                   *pmd,
+       unsigned int            flags)
+{
+       struct inode            *inode = file_inode(vma->vm_file);
+       struct xfs_inode        *ip = XFS_I(inode);
+       int                     ret;
+
+       if (!IS_DAX(inode))
+               return VM_FAULT_FALLBACK;
+
+       trace_xfs_filemap_pmd_fault(ip);
+
+       sb_start_pagefault(inode->i_sb);
+       file_update_time(vma->vm_file);
+       xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+       ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_direct,
+                                   xfs_end_io_dax_write);
+       xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+       sb_end_pagefault(inode->i_sb);
+
+       return ret;
+}
+
 static const struct vm_operations_struct xfs_file_vm_ops = {
        .fault          = xfs_filemap_fault,
+       .pmd_fault      = xfs_filemap_pmd_fault,
        .map_pages      = filemap_map_pages,
        .page_mkwrite   = xfs_filemap_page_mkwrite,
 };
@@ -1553,7 +1588,7 @@ xfs_file_mmap(
        file_accessed(filp);
        vma->vm_ops = &xfs_file_vm_ops;
        if (IS_DAX(file_inode(filp)))
-               vma->vm_flags |= VM_MIXEDMAP;
+               vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
        return 0;
 }
 
index 9b3438a7680f4251cf623d18e0dc9e78ef269ab7..ee3aaa0a53179f761ffffe6257a58b637a6b7dc8 100644 (file)
@@ -250,7 +250,7 @@ xfs_growfs_data_private(
                agf->agf_freeblks = cpu_to_be32(tmpsize);
                agf->agf_longest = cpu_to_be32(tmpsize);
                if (xfs_sb_version_hascrc(&mp->m_sb))
-                       uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_uuid);
+                       uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid);
 
                error = xfs_bwrite(bp);
                xfs_buf_relse(bp);
@@ -273,7 +273,7 @@ xfs_growfs_data_private(
                if (xfs_sb_version_hascrc(&mp->m_sb)) {
                        agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC);
                        agfl->agfl_seqno = cpu_to_be32(agno);
-                       uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_uuid);
+                       uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid);
                }
 
                agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp);
@@ -309,7 +309,7 @@ xfs_growfs_data_private(
                agi->agi_newino = cpu_to_be32(NULLAGINO);
                agi->agi_dirino = cpu_to_be32(NULLAGINO);
                if (xfs_sb_version_hascrc(&mp->m_sb))
-                       uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_uuid);
+                       uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid);
                if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
                        agi->agi_free_root = cpu_to_be32(XFS_FIBT_BLOCK(mp));
                        agi->agi_free_level = cpu_to_be32(1);
index 76a9f27832827db72fc58465389a619648a90b56..0a326bd64d4e39b3b197437cf6e964bffb4cca0a 100644 (file)
@@ -412,6 +412,8 @@ xfs_iget(
        if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
                return -EINVAL;
 
+       XFS_STATS_INC(xs_ig_attempts);
+
        /* get the perag structure and ensure that it's inode capable */
        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
        agino = XFS_INO_TO_AGINO(mp, ino);
index 3da9f4da4f3d2e6b67ffd1bd752b4b0993af9fda..dc40a6d5ae0dc909a79ef72917c5eb1551fae227 100644 (file)
@@ -164,7 +164,7 @@ xfs_ilock(
               (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
        ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
               (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
-       ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
+       ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
 
        if (lock_flags & XFS_IOLOCK_EXCL)
                mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
@@ -212,7 +212,7 @@ xfs_ilock_nowait(
               (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
        ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
               (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
-       ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
+       ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
 
        if (lock_flags & XFS_IOLOCK_EXCL) {
                if (!mrtryupdate(&ip->i_iolock))
@@ -281,7 +281,7 @@ xfs_iunlock(
               (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
        ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
               (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
-       ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
+       ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
        ASSERT(lock_flags != 0);
 
        if (lock_flags & XFS_IOLOCK_EXCL)
@@ -362,32 +362,58 @@ int xfs_lots_retries;
 int xfs_lock_delays;
 #endif
 
+/*
+ * xfs_lockdep_subclass_ok() is only used in an ASSERT, so is only called when
+ * DEBUG or XFS_WARN is set. And MAX_LOCKDEP_SUBCLASSES is then only defined
+ * when CONFIG_LOCKDEP is set. Hence the complex define below to avoid build
+ * errors and warnings.
+ */
+#if (defined(DEBUG) || defined(XFS_WARN)) && defined(CONFIG_LOCKDEP)
+static bool
+xfs_lockdep_subclass_ok(
+       int subclass)
+{
+       return subclass < MAX_LOCKDEP_SUBCLASSES;
+}
+#else
+#define xfs_lockdep_subclass_ok(subclass)      (true)
+#endif
+
 /*
  * Bump the subclass so xfs_lock_inodes() acquires each lock with a different
- * value. This shouldn't be called for page fault locking, but we also need to
- * ensure we don't overrun the number of lockdep subclasses for the iolock or
- * mmaplock as that is limited to 12 by the mmap lock lockdep annotations.
+ * value. This can be called for any type of inode lock combination, including
+ * parent locking. Care must be taken to ensure we don't overrun the subclass
+ * storage fields in the class mask we build.
  */
 static inline int
 xfs_lock_inumorder(int lock_mode, int subclass)
 {
+       int     class = 0;
+
+       ASSERT(!(lock_mode & (XFS_ILOCK_PARENT | XFS_ILOCK_RTBITMAP |
+                             XFS_ILOCK_RTSUM)));
+       ASSERT(xfs_lockdep_subclass_ok(subclass));
+
        if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
-               ASSERT(subclass + XFS_LOCK_INUMORDER <
-                       (1 << (XFS_MMAPLOCK_SHIFT - XFS_IOLOCK_SHIFT)));
-               lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
+               ASSERT(subclass <= XFS_IOLOCK_MAX_SUBCLASS);
+               ASSERT(xfs_lockdep_subclass_ok(subclass +
+                                               XFS_IOLOCK_PARENT_VAL));
+               class += subclass << XFS_IOLOCK_SHIFT;
+               if (lock_mode & XFS_IOLOCK_PARENT)
+                       class += XFS_IOLOCK_PARENT_VAL << XFS_IOLOCK_SHIFT;
        }
 
        if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) {
-               ASSERT(subclass + XFS_LOCK_INUMORDER <
-                       (1 << (XFS_ILOCK_SHIFT - XFS_MMAPLOCK_SHIFT)));
-               lock_mode |= (subclass + XFS_LOCK_INUMORDER) <<
-                                                       XFS_MMAPLOCK_SHIFT;
+               ASSERT(subclass <= XFS_MMAPLOCK_MAX_SUBCLASS);
+               class += subclass << XFS_MMAPLOCK_SHIFT;
        }
 
-       if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
-               lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;
+       if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) {
+               ASSERT(subclass <= XFS_ILOCK_MAX_SUBCLASS);
+               class += subclass << XFS_ILOCK_SHIFT;
+       }
 
-       return lock_mode;
+       return (lock_mode & ~XFS_LOCK_SUBCLASS_MASK) | class;
 }
 
 /*
@@ -399,6 +425,11 @@ xfs_lock_inumorder(int lock_mode, int subclass)
  * transaction (such as truncate). This can result in deadlock since the long
  * running trans might need to wait for the inode we just locked in order to
  * push the tail and free space in the log.
+ *
+ * xfs_lock_inodes() can only be used to lock one type of lock at a time -
+ * the iolock, the mmaplock or the ilock, but not more than one at a time. If we
+ * lock more than one at a time, lockdep will report false positives saying we
+ * have violated locking orders.
  */
 void
 xfs_lock_inodes(
@@ -409,8 +440,29 @@ xfs_lock_inodes(
        int             attempts = 0, i, j, try_lock;
        xfs_log_item_t  *lp;
 
-       /* currently supports between 2 and 5 inodes */
+       /*
+        * Currently supports between 2 and 5 inodes with exclusive locking.  We
+        * support an arbitrary depth of locking here, but absolute limits on
+        * inodes depend on the type of locking and the limits placed by
+        * lockdep annotations in xfs_lock_inumorder.  These are all checked by
+        * the asserts.
+        */
        ASSERT(ips && inodes >= 2 && inodes <= 5);
+       ASSERT(lock_mode & (XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL |
+                           XFS_ILOCK_EXCL));
+       ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED | XFS_MMAPLOCK_SHARED |
+                             XFS_ILOCK_SHARED)));
+       ASSERT(!(lock_mode & XFS_IOLOCK_EXCL) ||
+               inodes <= XFS_IOLOCK_MAX_SUBCLASS + 1);
+       ASSERT(!(lock_mode & XFS_MMAPLOCK_EXCL) ||
+               inodes <= XFS_MMAPLOCK_MAX_SUBCLASS + 1);
+       ASSERT(!(lock_mode & XFS_ILOCK_EXCL) ||
+               inodes <= XFS_ILOCK_MAX_SUBCLASS + 1);
+
+       if (lock_mode & XFS_IOLOCK_EXCL) {
+               ASSERT(!(lock_mode & (XFS_MMAPLOCK_EXCL | XFS_ILOCK_EXCL)));
+       } else if (lock_mode & XFS_MMAPLOCK_EXCL)
+               ASSERT(!(lock_mode & XFS_ILOCK_EXCL));
 
        try_lock = 0;
        i = 0;
@@ -629,30 +681,29 @@ xfs_lookup(
 {
        xfs_ino_t               inum;
        int                     error;
-       uint                    lock_mode;
 
        trace_xfs_lookup(dp, name);
 
        if (XFS_FORCED_SHUTDOWN(dp->i_mount))
                return -EIO;
 
-       lock_mode = xfs_ilock_data_map_shared(dp);
+       xfs_ilock(dp, XFS_IOLOCK_SHARED);
        error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
-       xfs_iunlock(dp, lock_mode);
-
        if (error)
-               goto out;
+               goto out_unlock;
 
        error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
        if (error)
                goto out_free_name;
 
+       xfs_iunlock(dp, XFS_IOLOCK_SHARED);
        return 0;
 
 out_free_name:
        if (ci_name)
                kmem_free(ci_name->name);
-out:
+out_unlock:
+       xfs_iunlock(dp, XFS_IOLOCK_SHARED);
        *ipp = NULL;
        return error;
 }
@@ -787,7 +838,7 @@ xfs_ialloc(
 
        if (ip->i_d.di_version == 3) {
                ASSERT(ip->i_d.di_ino == ino);
-               ASSERT(uuid_equal(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid));
+               ASSERT(uuid_equal(&ip->i_d.di_uuid, &mp->m_sb.sb_meta_uuid));
                ip->i_d.di_crc = 0;
                ip->i_d.di_changecount = 1;
                ip->i_d.di_lsn = 0;
@@ -1149,7 +1200,8 @@ xfs_create(
                goto out_trans_cancel;
 
 
-       xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
+       xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL |
+                     XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT);
        unlock_dp_on_error = true;
 
        xfs_bmap_init(&free_list, &first_block);
@@ -1175,11 +1227,8 @@ xfs_create(
         */
        error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
                               prid, resblks > 0, &ip, &committed);
-       if (error) {
-               if (error == -ENOSPC)
-                       goto out_trans_cancel;
+       if (error)
                goto out_trans_cancel;
-       }
 
        /*
         * Now we join the directory inode to the transaction.  We do not do it
@@ -1188,7 +1237,7 @@ xfs_create(
         * the transaction cancel unlocking dp so don't do it explicitly in the
         * error path.
         */
-       xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+       xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
        unlock_dp_on_error = false;
 
        error = xfs_dir_createname(tp, dp, name, ip->i_ino,
@@ -1261,7 +1310,7 @@ xfs_create(
        xfs_qm_dqrele(pdqp);
 
        if (unlock_dp_on_error)
-               xfs_iunlock(dp, XFS_ILOCK_EXCL);
+               xfs_iunlock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
        return error;
 }
 
@@ -1318,11 +1367,8 @@ xfs_create_tmpfile(
 
        error = xfs_dir_ialloc(&tp, dp, mode, 1, 0,
                                prid, resblks > 0, &ip, NULL);
-       if (error) {
-               if (error == -ENOSPC)
-                       goto out_trans_cancel;
+       if (error)
                goto out_trans_cancel;
-       }
 
        if (mp->m_flags & XFS_MOUNT_WSYNC)
                xfs_trans_set_sync(tp);
@@ -1409,10 +1455,11 @@ xfs_link(
        if (error)
                goto error_return;
 
+       xfs_ilock(tdp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
        xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
 
        xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
-       xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
+       xfs_trans_ijoin(tp, tdp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
 
        /*
         * If we are using project inheritance, we only allow hard link
@@ -1791,14 +1838,15 @@ xfs_inactive_ifree(
        xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
 
        /*
-        * Just ignore errors at this point.  There is nothing we can
-        * do except to try to keep going. Make sure it's not a silent
-        * error.
+        * Just ignore errors at this point.  There is nothing we can do except
+        * to try to keep going. Make sure it's not a silent error.
         */
        error = xfs_bmap_finish(&tp,  &free_list, &committed);
-       if (error)
+       if (error) {
                xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
                        __func__, error);
+               xfs_bmap_cancel(&free_list);
+       }
        error = xfs_trans_commit(tp);
        if (error)
                xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
@@ -2515,9 +2563,10 @@ xfs_remove(
                goto out_trans_cancel;
        }
 
+       xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
        xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
 
-       xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+       xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 
        /*
@@ -2898,6 +2947,12 @@ xfs_rename(
         * whether the target directory is the same as the source
         * directory, we can lock from 2 to 4 inodes.
         */
+       if (!new_parent)
+               xfs_ilock(src_dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
+       else
+               xfs_lock_two_inodes(src_dp, target_dp,
+                                   XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
+
        xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
 
        /*
@@ -2905,9 +2960,9 @@ xfs_rename(
         * we can rely on either trans_commit or trans_cancel to unlock
         * them.
         */
-       xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
+       xfs_trans_ijoin(tp, src_dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
        if (new_parent)
-               xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
+               xfs_trans_ijoin(tp, target_dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
        xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
        if (target_ip)
                xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
index 8f22d20368d8ff4c0232174f8bdc7c1e1bcf28eb..ca9e11989cbd4f330c6cb0d1a1bede113fd9c8b2 100644 (file)
@@ -284,9 +284,9 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
  * Flags for lockdep annotations.
  *
  * XFS_LOCK_PARENT - for directory operations that require locking a
- * parent directory inode and a child entry inode.  The parent gets locked
- * with this flag so it gets a lockdep subclass of 1 and the child entry
- * lock will have a lockdep subclass of 0.
+ * parent directory inode and a child entry inode. IOLOCK requires nesting,
+ * MMAPLOCK does not support this class, ILOCK requires a single subclass
+ * to differentiate parent from child.
  *
  * XFS_LOCK_RTBITMAP/XFS_LOCK_RTSUM - the realtime device bitmap and summary
  * inodes do not participate in the normal lock order, and thus have their
@@ -295,30 +295,63 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
  * XFS_LOCK_INUMORDER - for locking several inodes at the some time
  * with xfs_lock_inodes().  This flag is used as the starting subclass
  * and each subsequent lock acquired will increment the subclass by one.
- * So the first lock acquired will have a lockdep subclass of 4, the
- * second lock will have a lockdep subclass of 5, and so on. It is
- * the responsibility of the class builder to shift this to the correct
- * portion of the lock_mode lockdep mask.
+ * However, MAX_LOCKDEP_SUBCLASSES == 8, which means we are greatly
+ * limited to the subclasses we can represent via nesting. We need at least
+ * a nesting depth of 5 inodes for the ILOCK through rename, and we also have to support
+ * XFS_ILOCK_PARENT, which gives 6 subclasses. Then we have XFS_ILOCK_RTBITMAP
+ * and XFS_ILOCK_RTSUM, which are another 2 unique subclasses, so that's all
+ * 8 subclasses supported by lockdep.
+ *
+ * This also means we have to number the sub-classes in the lowest bits of
+ * the mask we keep, and we have to ensure we never exceed 3 bits of lockdep
+ * mask and we can't use bit-masking to build the subclasses. What a mess.
+ *
+ * Bit layout:
+ *
+ * Bit         Lock Region
+ * 16-19       XFS_IOLOCK_SHIFT dependencies
+ * 20-23       XFS_MMAPLOCK_SHIFT dependencies
+ * 24-31       XFS_ILOCK_SHIFT dependencies
+ *
+ * IOLOCK values
+ *
+ * 0-3         subclass value
+ * 4-7         PARENT subclass values
+ *
+ * MMAPLOCK values
+ *
+ * 0-3         subclass value
+ * 4-7         unused
+ *
+ * ILOCK values
+ * 0-4         subclass values
+ * 5           PARENT subclass (not nestable)
+ * 6           RTBITMAP subclass (not nestable)
+ * 7           RTSUM subclass (not nestable)
+ * 
  */
-#define XFS_LOCK_PARENT                1
-#define XFS_LOCK_RTBITMAP      2
-#define XFS_LOCK_RTSUM         3
-#define XFS_LOCK_INUMORDER     4
-
-#define XFS_IOLOCK_SHIFT       16
-#define        XFS_IOLOCK_PARENT       (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT)
-
-#define XFS_MMAPLOCK_SHIFT     20
-
-#define XFS_ILOCK_SHIFT                24
-#define        XFS_ILOCK_PARENT        (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT)
-#define        XFS_ILOCK_RTBITMAP      (XFS_LOCK_RTBITMAP << XFS_ILOCK_SHIFT)
-#define        XFS_ILOCK_RTSUM         (XFS_LOCK_RTSUM << XFS_ILOCK_SHIFT)
-
-#define XFS_IOLOCK_DEP_MASK    0x000f0000
-#define XFS_MMAPLOCK_DEP_MASK  0x00f00000
-#define XFS_ILOCK_DEP_MASK     0xff000000
-#define XFS_LOCK_DEP_MASK      (XFS_IOLOCK_DEP_MASK | \
+#define XFS_IOLOCK_SHIFT               16
+#define XFS_IOLOCK_PARENT_VAL          4
+#define XFS_IOLOCK_MAX_SUBCLASS                (XFS_IOLOCK_PARENT_VAL - 1)
+#define XFS_IOLOCK_DEP_MASK            0x000f0000
+#define        XFS_IOLOCK_PARENT               (XFS_IOLOCK_PARENT_VAL << XFS_IOLOCK_SHIFT)
+
+#define XFS_MMAPLOCK_SHIFT             20
+#define XFS_MMAPLOCK_NUMORDER          0
+#define XFS_MMAPLOCK_MAX_SUBCLASS      3
+#define XFS_MMAPLOCK_DEP_MASK          0x00f00000
+
+#define XFS_ILOCK_SHIFT                        24
+#define XFS_ILOCK_PARENT_VAL           5
+#define XFS_ILOCK_MAX_SUBCLASS         (XFS_ILOCK_PARENT_VAL - 1)
+#define XFS_ILOCK_RTBITMAP_VAL         6
+#define XFS_ILOCK_RTSUM_VAL            7
+#define XFS_ILOCK_DEP_MASK             0xff000000
+#define        XFS_ILOCK_PARENT                (XFS_ILOCK_PARENT_VAL << XFS_ILOCK_SHIFT)
+#define        XFS_ILOCK_RTBITMAP              (XFS_ILOCK_RTBITMAP_VAL << XFS_ILOCK_SHIFT)
+#define        XFS_ILOCK_RTSUM                 (XFS_ILOCK_RTSUM_VAL << XFS_ILOCK_SHIFT)
+
+#define XFS_LOCK_SUBCLASS_MASK (XFS_IOLOCK_DEP_MASK | \
                                 XFS_MMAPLOCK_DEP_MASK | \
                                 XFS_ILOCK_DEP_MASK)
 
index bf13a5a7e2f4ffd90d1ae65402a240ac81e80349..62bd80f4edd9aacb74c254c70b3e9de8618ef988 100644 (file)
@@ -703,17 +703,10 @@ xfs_iflush_abort(
        xfs_inode_log_item_t    *iip = ip->i_itemp;
 
        if (iip) {
-               struct xfs_ail  *ailp = iip->ili_item.li_ailp;
                if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
-                       spin_lock(&ailp->xa_lock);
-                       if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
-                               /* xfs_trans_ail_delete() drops the AIL lock. */
-                               xfs_trans_ail_delete(ailp, &iip->ili_item,
-                                               stale ?
-                                                    SHUTDOWN_LOG_IO_ERROR :
+                       xfs_trans_ail_remove(&iip->ili_item,
+                                            stale ? SHUTDOWN_LOG_IO_ERROR :
                                                     SHUTDOWN_CORRUPT_INCORE);
-                       } else
-                               spin_unlock(&ailp->xa_lock);
                }
                iip->ili_logged = 0;
                /*
index 766b23f86ce9f7b9423ba0d8a7f690d27f2a5d1b..8294132e6a3cdda41704a76a871ddd15d3bb271d 100644 (file)
@@ -609,7 +609,7 @@ xfs_setattr_nonsize(
        tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
        if (error)
-               goto out_dqrele;
+               goto out_trans_cancel;
 
        xfs_ilock(ip, XFS_ILOCK_EXCL);
 
@@ -640,7 +640,7 @@ xfs_setattr_nonsize(
                                                NULL, capable(CAP_FOWNER) ?
                                                XFS_QMOPT_FORCE_RES : 0);
                        if (error)      /* out of quota */
-                               goto out_trans_cancel;
+                               goto out_unlock;
                }
        }
 
@@ -729,10 +729,10 @@ xfs_setattr_nonsize(
 
        return 0;
 
+out_unlock:
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
 out_trans_cancel:
        xfs_trans_cancel(tp);
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-out_dqrele:
        xfs_qm_dqrele(udqp);
        xfs_qm_dqrele(gdqp);
        return error;
index f41b0c3fddab5558f5040c00501e16b6ef935992..930ebd86bebac3a300faf44fabe77aa28258cf60 100644 (file)
@@ -473,7 +473,8 @@ xfs_bulkstat(
                 * pending error, then we are done.
                 */
 del_cursor:
-               xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+               xfs_btree_del_cursor(cur, error ?
+                                         XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
                xfs_buf_relse(agbp);
                if (error)
                        break;
index 08d4fe46f0fae9a161678e1a074cb4e09abcf86f..aaadee0969c929022725b11dcc2cd3a66995bd29 100644 (file)
@@ -668,9 +668,9 @@ xfs_log_mount(
                        ASSERT(0);
                        goto out_free_log;
                }
+               xfs_crit(mp, "Log size out of supported range.");
                xfs_crit(mp,
-"Log size out of supported range. Continuing onwards, but if log hangs are\n"
-"experienced then please report this message in the bug report.");
+"Continuing onwards, but if log hangs are experienced then please report this message in the bug report.");
        }
 
        /*
@@ -700,6 +700,7 @@ xfs_log_mount(
                if (error) {
                        xfs_warn(mp, "log mount/recovery failed: error %d",
                                error);
+                       xlog_recover_cancel(mp->m_log);
                        goto out_destroy_ail;
                }
        }
@@ -740,18 +741,35 @@ out:
  * it.
  */
 int
-xfs_log_mount_finish(xfs_mount_t *mp)
+xfs_log_mount_finish(
+       struct xfs_mount        *mp)
 {
        int     error = 0;
 
-       if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) {
-               error = xlog_recover_finish(mp->m_log);
-               if (!error)
-                       xfs_log_work_queue(mp);
-       } else {
+       if (mp->m_flags & XFS_MOUNT_NORECOVERY) {
                ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
+               return 0;
        }
 
+       error = xlog_recover_finish(mp->m_log);
+       if (!error)
+               xfs_log_work_queue(mp);
+
+       return error;
+}
+
+/*
+ * The mount has failed. Cancel the recovery if it hasn't completed and destroy
+ * the log.
+ */
+int
+xfs_log_mount_cancel(
+       struct xfs_mount        *mp)
+{
+       int                     error;
+
+       error = xlog_recover_cancel(mp->m_log);
+       xfs_log_unmount(mp);
 
        return error;
 }
@@ -1142,11 +1160,13 @@ xlog_space_left(
                 * In this case we just want to return the size of the
                 * log as the amount of space left.
                 */
+               xfs_alert(log->l_mp, "xlog_space_left: head behind tail");
                xfs_alert(log->l_mp,
-                       "xlog_space_left: head behind tail\n"
-                       "  tail_cycle = %d, tail_bytes = %d\n"
-                       "  GH   cycle = %d, GH   bytes = %d",
-                       tail_cycle, tail_bytes, head_cycle, head_bytes);
+                         "  tail_cycle = %d, tail_bytes = %d",
+                         tail_cycle, tail_bytes);
+               xfs_alert(log->l_mp,
+                         "  GH   cycle = %d, GH   bytes = %d",
+                         head_cycle, head_bytes);
                ASSERT(0);
                free_bytes = log->l_logsize;
        }
@@ -1652,8 +1672,13 @@ xlog_cksum(
        if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
                union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead;
                int             i;
+               int             xheads;
+
+               xheads = size / XLOG_HEADER_CYCLE_SIZE;
+               if (size % XLOG_HEADER_CYCLE_SIZE)
+                       xheads++;
 
-               for (i = 1; i < log->l_iclog_heads; i++) {
+               for (i = 1; i < xheads; i++) {
                        crc = crc32c(crc, &xhdr[i].hic_xheader,
                                     sizeof(struct xlog_rec_ext_header));
                }
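
The loop above now derives the number of extended record headers from the record size rather than the in-core iclog header count, rounding the division up. A standalone sketch of that ceiling division; the 32k cycle size matches what XLOG_HEADER_CYCLE_SIZE is expected to be, but is hard-coded here purely for illustration:

#include <stdio.h>

#define CYCLE_SIZE (32 * 1024)  /* assumed stand-in for XLOG_HEADER_CYCLE_SIZE */

/* Number of header blocks needed to cover 'size' bytes, rounding up. */
static int xheads_for(int size)
{
        int xheads = size / CYCLE_SIZE;

        if (size % CYCLE_SIZE)
                xheads++;
        return xheads;
}

int main(void)
{
        printf("%d\n", xheads_for(32 * 1024));      /* 1: exact multiple */
        printf("%d\n", xheads_for(32 * 1024 + 1));  /* 2: partial cycle adds one */
        printf("%d\n", xheads_for(256 * 1024));     /* 8 */
        return 0;
}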
@@ -2028,26 +2053,24 @@ xlog_print_tic_res(
            "SWAPEXT"
        };
 
-       xfs_warn(mp,
-               "xlog_write: reservation summary:\n"
-               "  trans type  = %s (%u)\n"
-               "  unit res    = %d bytes\n"
-               "  current res = %d bytes\n"
-               "  total reg   = %u bytes (o/flow = %u bytes)\n"
-               "  ophdrs      = %u (ophdr space = %u bytes)\n"
-               "  ophdr + reg = %u bytes\n"
-               "  num regions = %u",
-               ((ticket->t_trans_type <= 0 ||
-                 ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ?
+       xfs_warn(mp, "xlog_write: reservation summary:");
+       xfs_warn(mp, "  trans type  = %s (%u)",
+                ((ticket->t_trans_type <= 0 ||
+                  ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ?
                  "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]),
-               ticket->t_trans_type,
-               ticket->t_unit_res,
-               ticket->t_curr_res,
-               ticket->t_res_arr_sum, ticket->t_res_o_flow,
-               ticket->t_res_num_ophdrs, ophdr_spc,
-               ticket->t_res_arr_sum +
-               ticket->t_res_o_flow + ophdr_spc,
-               ticket->t_res_num);
+                ticket->t_trans_type);
+       xfs_warn(mp, "  unit res    = %d bytes",
+                ticket->t_unit_res);
+       xfs_warn(mp, "  current res = %d bytes",
+                ticket->t_curr_res);
+       xfs_warn(mp, "  total reg   = %u bytes (o/flow = %u bytes)",
+                ticket->t_res_arr_sum, ticket->t_res_o_flow);
+       xfs_warn(mp, "  ophdrs      = %u (ophdr space = %u bytes)",
+                ticket->t_res_num_ophdrs, ophdr_spc);
+       xfs_warn(mp, "  ophdr + reg = %u bytes",
+                ticket->t_res_arr_sum + ticket->t_res_o_flow + ophdr_spc);
+       xfs_warn(mp, "  num regions = %u",
+                ticket->t_res_num);
 
        for (i = 0; i < ticket->t_res_num; i++) {
                uint r_type = ticket->t_res_arr[i].r_type;
index fa27aaec72cb535b872840115374c13817d9b6a2..09d91d3166cde43479366e0a3a3ad8e325350cf5 100644 (file)
@@ -147,6 +147,7 @@ int   xfs_log_mount(struct xfs_mount        *mp,
                        xfs_daddr_t             start_block,
                        int                     num_bblocks);
 int      xfs_log_mount_finish(struct xfs_mount *mp);
+int    xfs_log_mount_cancel(struct xfs_mount *);
 xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
 xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp);
 void     xfs_log_space_wake(struct xfs_mount *mp);
index abc2ccbff73918663b374d795ad15bf803fa4c5b..4e7649351f5a25ab062396818861c9d7c2ee0d61 100644 (file)
@@ -307,7 +307,13 @@ xlog_cil_insert_items(
                if (!(lidp->lid_flags & XFS_LID_DIRTY))
                        continue;
 
-               list_move_tail(&lip->li_cil, &cil->xc_cil);
+               /*
+                * Only move the item if it isn't already at the tail. This is
+                * to prevent a transient list_empty() state when reinserting
+                * an item that is already the only item in the CIL.
+                */
+               if (!list_is_last(&lip->li_cil, &cil->xc_cil))
+                       list_move_tail(&lip->li_cil, &cil->xc_cil);
        }
 
        /* account for space used by new iovec headers  */
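
The comment in the hunk above explains the guard: moving an item that is already the sole (and therefore last) entry deletes it and re-adds it, so a concurrent list_empty() check could observe an empty CIL for an instant. A small userspace sketch with a hand-rolled circular list, the move split into its delete and add halves to make that intermediate state visible:

#include <stdio.h>
#include <stdbool.h>

struct list_head { struct list_head *prev, *next; };

static void list_init(struct list_head *h) { h->prev = h->next = h; }
static bool list_empty(const struct list_head *h) { return h->next == h; }
static bool list_is_last(const struct list_head *n, const struct list_head *h)
{
        return n->next == h;
}
static void list_del(struct list_head *n)
{
        n->prev->next = n->next;
        n->next->prev = n->prev;
}
static void list_add_tail(struct list_head *n, struct list_head *h)
{
        n->prev = h->prev;
        n->next = h;
        h->prev->next = n;
        h->prev = n;
}

int main(void)
{
        struct list_head cil, item;

        list_init(&cil);
        list_add_tail(&item, &cil);

        /* Unconditional move: the list is empty between the del and the add. */
        list_del(&item);
        printf("mid-move, list_empty = %d\n", list_empty(&cil));           /* 1 */
        list_add_tail(&item, &cil);

        /* Guarded move: an item that is already last is left alone. */
        if (!list_is_last(&item, &cil)) {
                list_del(&item);
                list_add_tail(&item, &cil);
        }
        printf("after guarded move, list_empty = %d\n", list_empty(&cil)); /* 0 */
        return 0;
}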
index 1c87c8abfbed0aef749760ac794b102d217d30eb..950f3f94720c66524baa9a6a4dc0b853e41c3d1b 100644 (file)
@@ -426,6 +426,8 @@ xlog_recover(
 extern int
 xlog_recover_finish(
        struct xlog             *log);
+extern int
+xlog_recover_cancel(struct xlog *);
 
 extern __le32   xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
                            char *dp, int size);
index 480ebba8464f38dbb0608b579185ad93a0600913..512a0945d52ac4e023e2181510e70e65f3da04e8 100644 (file)
@@ -1895,15 +1895,25 @@ xlog_recover_get_buf_lsn(
                 */
                goto recover_immediately;
        case XFS_SB_MAGIC:
+               /*
+                * superblock uuids are magic. We may or may not have a
+                * sb_meta_uuid on disk, but it will be set in the in-core
+                * superblock. We set the uuid pointer for verification
+                * according to the superblock feature mask to ensure we check
+                * the relevant UUID in the superblock.
+                */
                lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn);
-               uuid = &((struct xfs_dsb *)blk)->sb_uuid;
+               if (xfs_sb_version_hasmetauuid(&mp->m_sb))
+                       uuid = &((struct xfs_dsb *)blk)->sb_meta_uuid;
+               else
+                       uuid = &((struct xfs_dsb *)blk)->sb_uuid;
                break;
        default:
                break;
        }
 
        if (lsn != (xfs_lsn_t)-1) {
-               if (!uuid_equal(&mp->m_sb.sb_uuid, uuid))
+               if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid))
                        goto recover_immediately;
                return lsn;
        }
@@ -2933,16 +2943,16 @@ xlog_recover_efi_pass2(
        struct xlog_recover_item        *item,
        xfs_lsn_t                       lsn)
 {
-       int                     error;
-       xfs_mount_t             *mp = log->l_mp;
-       xfs_efi_log_item_t      *efip;
-       xfs_efi_log_format_t    *efi_formatp;
+       int                             error;
+       struct xfs_mount                *mp = log->l_mp;
+       struct xfs_efi_log_item         *efip;
+       struct xfs_efi_log_format       *efi_formatp;
 
        efi_formatp = item->ri_buf[0].i_addr;
 
        efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
-       if ((error = xfs_efi_copy_format(&(item->ri_buf[0]),
-                                        &(efip->efi_format)))) {
+       error = xfs_efi_copy_format(&item->ri_buf[0], &efip->efi_format);
+       if (error) {
                xfs_efi_item_free(efip);
                return error;
        }
@@ -2950,20 +2960,23 @@ xlog_recover_efi_pass2(
 
        spin_lock(&log->l_ailp->xa_lock);
        /*
-        * xfs_trans_ail_update() drops the AIL lock.
+        * The EFI has two references. One for the EFD and one for EFI to ensure
+        * it makes it into the AIL. Insert the EFI into the AIL directly and
+        * drop the EFI reference. Note that xfs_trans_ail_update() drops the
+        * AIL lock.
         */
        xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn);
+       xfs_efi_release(efip);
        return 0;
 }
 
 
 /*
- * This routine is called when an efd format structure is found in
- * a committed transaction in the log.  It's purpose is to cancel
- * the corresponding efi if it was still in the log.  To do this
- * it searches the AIL for the efi with an id equal to that in the
- * efd format structure.  If we find it, we remove the efi from the
- * AIL and free it.
+ * This routine is called when an EFD format structure is found in a committed
+ * transaction in the log. Its purpose is to cancel the corresponding EFI if it
+ * was still in the log. To do this it searches the AIL for the EFI with an id
+ * equal to that in the EFD format structure. If we find it we drop the EFD
+ * reference, which removes the EFI from the AIL and frees it.
  */
 STATIC int
 xlog_recover_efd_pass2(
@@ -2985,8 +2998,8 @@ xlog_recover_efd_pass2(
        efi_id = efd_formatp->efd_efi_id;
 
        /*
-        * Search for the efi with the id in the efd format structure
-        * in the AIL.
+        * Search for the EFI with the id in the EFD format structure in the
+        * AIL.
         */
        spin_lock(&ailp->xa_lock);
        lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
@@ -2995,18 +3008,18 @@ xlog_recover_efd_pass2(
                        efip = (xfs_efi_log_item_t *)lip;
                        if (efip->efi_format.efi_id == efi_id) {
                                /*
-                                * xfs_trans_ail_delete() drops the
-                                * AIL lock.
+                                * Drop the EFD reference to the EFI. This
+                                * removes the EFI from the AIL and frees it.
                                 */
-                               xfs_trans_ail_delete(ailp, lip,
-                                                    SHUTDOWN_CORRUPT_INCORE);
-                               xfs_efi_item_free(efip);
+                               spin_unlock(&ailp->xa_lock);
+                               xfs_efi_release(efip);
                                spin_lock(&ailp->xa_lock);
                                break;
                        }
                }
                lip = xfs_trans_ail_cursor_next(ailp, &cur);
        }
+
        xfs_trans_ail_cursor_done(&cur);
        spin_unlock(&ailp->xa_lock);
 
@@ -3034,6 +3047,11 @@ xlog_recover_do_icreate_pass2(
        unsigned int            count;
        unsigned int            isize;
        xfs_agblock_t           length;
+       int                     blks_per_cluster;
+       int                     bb_per_cluster;
+       int                     cancel_count;
+       int                     nbufs;
+       int                     i;
 
        icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr;
        if (icl->icl_type != XFS_LI_ICREATE) {
@@ -3092,22 +3110,45 @@ xlog_recover_do_icreate_pass2(
        }
 
        /*
-        * Inode buffers can be freed. Do not replay the inode initialisation as
-        * we could be overwriting something written after this inode buffer was
-        * cancelled.
+        * The icreate transaction can cover multiple cluster buffers and these
+        * buffers could have been freed and reused. Check the individual
+        * buffers for cancellation so we don't overwrite anything written after
+        * a cancellation.
+        */
+       blks_per_cluster = xfs_icluster_size_fsb(mp);
+       bb_per_cluster = XFS_FSB_TO_BB(mp, blks_per_cluster);
+       nbufs = length / blks_per_cluster;
+       for (i = 0, cancel_count = 0; i < nbufs; i++) {
+               xfs_daddr_t     daddr;
+
+               daddr = XFS_AGB_TO_DADDR(mp, agno,
+                                        agbno + i * blks_per_cluster);
+               if (xlog_check_buffer_cancelled(log, daddr, bb_per_cluster, 0))
+                       cancel_count++;
+       }
+
+       /*
+        * We currently only use icreate for a single allocation at a time. This
+        * means we should expect either all or none of the buffers to be
+        * cancelled. Be conservative and skip replay if at least one buffer is
+        * cancelled, but warn the user that something is awry if the buffers
+        * are not consistent.
         *
-        * XXX: we need to iterate all buffers and only init those that are not
-        * cancelled. I think that a more fine grained factoring of
-        * xfs_ialloc_inode_init may be appropriate here to enable this to be
-        * done easily.
+        * XXX: This must be refined to only skip cancelled clusters once we use
+        * icreate for multiple chunk allocations.
         */
-       if (xlog_check_buffer_cancelled(log,
-                       XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0))
+       ASSERT(!cancel_count || cancel_count == nbufs);
+       if (cancel_count) {
+               if (cancel_count != nbufs)
+                       xfs_warn(mp,
+       "WARNING: partial inode chunk cancellation, skipped icreate.");
+               trace_xfs_log_recover_icreate_cancel(log, icl);
                return 0;
+       }
 
-       xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno, length,
-                             be32_to_cpu(icl->icl_gen));
-       return 0;
+       trace_xfs_log_recover_icreate_recover(log, icl);
+       return xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno,
+                                    length, be32_to_cpu(icl->icl_gen));
 }
 
 STATIC void
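
The replay change above walks every inode cluster buffer covered by the icreate record, counts how many were cancelled, and skips replay when any were, warning if the count is not all-or-none. A standalone sketch of that buffer-stepping arithmetic, with the cluster geometry invented for illustration rather than taken from a real mount:

#include <stdio.h>
#include <stdbool.h>

/* Illustrative geometry; real values come from the mount's inode cluster size. */
#define BLKS_PER_CLUSTER        4                       /* fs blocks per cluster buffer */
#define BB_PER_CLUSTER          (BLKS_PER_CLUSTER * 8)  /* 512B sectors per 4k block */

/* Pretend only the third cluster buffer of the chunk was cancelled. */
static bool buffer_cancelled(unsigned long daddr, int bb_len)
{
        (void)bb_len;
        return daddr == 2 * BB_PER_CLUSTER;
}

int main(void)
{
        unsigned long chunk_start_daddr = 0;    /* start of the inode chunk */
        int length = 16;                        /* fs blocks covered by the record */
        int nbufs = length / BLKS_PER_CLUSTER;
        int cancel_count = 0;

        for (int i = 0; i < nbufs; i++) {
                unsigned long daddr = chunk_start_daddr +
                                      (unsigned long)i * BB_PER_CLUSTER;

                if (buffer_cancelled(daddr, BB_PER_CLUSTER))
                        cancel_count++;
        }

        /* All-or-none is expected for single-chunk icreate records. */
        if (cancel_count && cancel_count != nbufs)
                printf("partial cancellation (%d of %d), skipping replay anyway\n",
                       cancel_count, nbufs);
        printf("nbufs=%d cancel_count=%d replay=%s\n",
               nbufs, cancel_count, cancel_count ? "no" : "yes");
        return 0;
}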
@@ -3385,14 +3426,24 @@ xlog_recover_add_to_cont_trans(
        char                    *ptr, *old_ptr;
        int                     old_len;
 
+       /*
+        * If the transaction is empty, the header was split across this and the
+        * previous record. Copy the rest of the header.
+        */
        if (list_empty(&trans->r_itemq)) {
-               /* finish copying rest of trans header */
+               ASSERT(len < sizeof(struct xfs_trans_header));
+               if (len > sizeof(struct xfs_trans_header)) {
+                       xfs_warn(log->l_mp, "%s: bad header length", __func__);
+                       return -EIO;
+               }
+
                xlog_recover_add_item(&trans->r_itemq);
                ptr = (char *)&trans->r_theader +
-                               sizeof(xfs_trans_header_t) - len;
+                               sizeof(struct xfs_trans_header) - len;
                memcpy(ptr, dp, len);
                return 0;
        }
+
        /* take the tail entry */
        item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
 
@@ -3441,7 +3492,19 @@ xlog_recover_add_to_trans(
                        ASSERT(0);
                        return -EIO;
                }
-               if (len == sizeof(xfs_trans_header_t))
+
+               if (len > sizeof(struct xfs_trans_header)) {
+                       xfs_warn(log->l_mp, "%s: bad header length", __func__);
+                       ASSERT(0);
+                       return -EIO;
+               }
+
+               /*
+                * The transaction header can be arbitrarily split across op
+                * records. If we don't have the whole thing here, copy what we
+                * do have and handle the rest in the next record.
+                */
+               if (len == sizeof(struct xfs_trans_header))
                        xlog_recover_add_item(&trans->r_itemq);
                memcpy(&trans->r_theader, dp, len);
                return 0;
@@ -3744,7 +3807,7 @@ xlog_recover_process_efi(
                         * free the memory associated with it.
                         */
                        set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
-                       xfs_efi_release(efip, efip->efi_format.efi_nextents);
+                       xfs_efi_release(efip);
                        return -EIO;
                }
        }
@@ -3757,11 +3820,11 @@ xlog_recover_process_efi(
 
        for (i = 0; i < efip->efi_format.efi_nextents; i++) {
                extp = &(efip->efi_format.efi_extents[i]);
-               error = xfs_free_extent(tp, extp->ext_start, extp->ext_len);
+               error = xfs_trans_free_extent(tp, efdp, extp->ext_start,
+                                             extp->ext_len);
                if (error)
                        goto abort_error;
-               xfs_trans_log_efd_extent(tp, efdp, extp->ext_start,
-                                        extp->ext_len);
+
        }
 
        set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
@@ -3793,10 +3856,10 @@ abort_error:
  */
 STATIC int
 xlog_recover_process_efis(
-       struct xlog     *log)
+       struct xlog             *log)
 {
-       xfs_log_item_t          *lip;
-       xfs_efi_log_item_t      *efip;
+       struct xfs_log_item     *lip;
+       struct xfs_efi_log_item *efip;
        int                     error = 0;
        struct xfs_ail_cursor   cur;
        struct xfs_ail          *ailp;
@@ -3820,7 +3883,7 @@ xlog_recover_process_efis(
                /*
                 * Skip EFIs that we've already processed.
                 */
-               efip = (xfs_efi_log_item_t *)lip;
+               efip = container_of(lip, struct xfs_efi_log_item, efi_item);
                if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) {
                        lip = xfs_trans_ail_cursor_next(ailp, &cur);
                        continue;
@@ -3839,6 +3902,50 @@ out:
        return error;
 }
 
+/*
+ * A cancel occurs when the mount has failed and we're bailing out. Release all
+ * pending EFIs so they don't pin the AIL.
+ */
+STATIC int
+xlog_recover_cancel_efis(
+       struct xlog             *log)
+{
+       struct xfs_log_item     *lip;
+       struct xfs_efi_log_item *efip;
+       int                     error = 0;
+       struct xfs_ail_cursor   cur;
+       struct xfs_ail          *ailp;
+
+       ailp = log->l_ailp;
+       spin_lock(&ailp->xa_lock);
+       lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
+       while (lip != NULL) {
+               /*
+                * We're done when we see something other than an EFI.
+                * There should be no EFIs left in the AIL now.
+                */
+               if (lip->li_type != XFS_LI_EFI) {
+#ifdef DEBUG
+                       for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
+                               ASSERT(lip->li_type != XFS_LI_EFI);
+#endif
+                       break;
+               }
+
+               efip = container_of(lip, struct xfs_efi_log_item, efi_item);
+
+               spin_unlock(&ailp->xa_lock);
+               xfs_efi_release(efip);
+               spin_lock(&ailp->xa_lock);
+
+               lip = xfs_trans_ail_cursor_next(ailp, &cur);
+       }
+
+       xfs_trans_ail_cursor_done(&cur);
+       spin_unlock(&ailp->xa_lock);
+       return error;
+}
+
 /*
  * This routine performs a transaction to null out a bad inode pointer
  * in an agi unlinked inode hash bucket.
@@ -4532,11 +4639,13 @@ xlog_recover(
                    xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb,
                                        XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) {
                        xfs_warn(log->l_mp,
-"Superblock has unknown incompatible log features (0x%x) enabled.\n"
-"The log can not be fully and/or safely recovered by this kernel.\n"
-"Please recover the log on a kernel that supports the unknown features.",
+"Superblock has unknown incompatible log features (0x%x) enabled.",
                                (log->l_mp->m_sb.sb_features_log_incompat &
                                        XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN));
+                       xfs_warn(log->l_mp,
+"The log can not be fully and/or safely recovered by this kernel.");
+                       xfs_warn(log->l_mp,
+"Please recover the log on a kernel that supports the unknown features.");
                        return -EINVAL;
                }
 
@@ -4612,6 +4721,17 @@ xlog_recover_finish(
        return 0;
 }
 
+int
+xlog_recover_cancel(
+       struct xlog     *log)
+{
+       int             error = 0;
+
+       if (log->l_flags & XLOG_RECOVERY_NEEDED)
+               error = xlog_recover_cancel_efis(log);
+
+       return error;
+}
 
 #if defined(DEBUG)
 /*
index 461e791efad71d66f1ffa50930be9fba6ee9f352..bf92e0c037c7378cadf1848a4324968170e1ae29 100644 (file)
@@ -615,14 +615,14 @@ xfs_default_resblks(xfs_mount_t *mp)
  */
 int
 xfs_mountfs(
-       xfs_mount_t     *mp)
+       struct xfs_mount        *mp)
 {
-       xfs_sb_t        *sbp = &(mp->m_sb);
-       xfs_inode_t     *rip;
-       __uint64_t      resblks;
-       uint            quotamount = 0;
-       uint            quotaflags = 0;
-       int             error = 0;
+       struct xfs_sb           *sbp = &(mp->m_sb);
+       struct xfs_inode        *rip;
+       __uint64_t              resblks;
+       uint                    quotamount = 0;
+       uint                    quotaflags = 0;
+       int                     error = 0;
 
        xfs_sb_mount_common(mp, sbp);
 
@@ -799,7 +799,9 @@ xfs_mountfs(
        }
 
        /*
-        * log's mount-time initialization. Perform 1st part recovery if needed
+        * Log's mount-time initialization. The first part of recovery can place
+        * some items on the AIL, to be handled when recovery is finished or
+        * cancelled.
         */
        error = xfs_log_mount(mp, mp->m_logdev_targp,
                              XFS_FSB_TO_DADDR(mp, sbp->sb_logstart),
@@ -910,9 +912,9 @@ xfs_mountfs(
        }
 
        /*
-        * Finish recovering the file system.  This part needed to be
-        * delayed until after the root and real-time bitmap inodes
-        * were consistently read in.
+        * Finish recovering the file system.  This part needed to be delayed
+        * until after the root and real-time bitmap inodes were consistently
+        * read in.
         */
        error = xfs_log_mount_finish(mp);
        if (error) {
@@ -955,8 +957,10 @@ xfs_mountfs(
        xfs_rtunmount_inodes(mp);
  out_rele_rip:
        IRELE(rip);
+       cancel_delayed_work_sync(&mp->m_reclaim_work);
+       xfs_reclaim_inodes(mp, SYNC_WAIT);
  out_log_dealloc:
-       xfs_log_unmount(mp);
+       xfs_log_mount_cancel(mp);
  out_fail_wait:
        if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
                xfs_wait_buftarg(mp->m_logdev_targp);
index f4e8c06eee26d0e5e4608bc40f3c571b3618d433..ab1bac6a3a1c0db4c2b04441682d85c25ee36279 100644 (file)
@@ -757,31 +757,30 @@ xfs_rtallocate_extent_size(
 /*
  * Allocate space to the bitmap or summary file, and zero it, for growfs.
  */
-STATIC int                             /* error */
+STATIC int
 xfs_growfs_rt_alloc(
-       xfs_mount_t     *mp,            /* file system mount point */
-       xfs_extlen_t    oblocks,        /* old count of blocks */
-       xfs_extlen_t    nblocks,        /* new count of blocks */
-       xfs_inode_t     *ip)            /* inode (bitmap/summary) */
+       struct xfs_mount        *mp,            /* file system mount point */
+       xfs_extlen_t            oblocks,        /* old count of blocks */
+       xfs_extlen_t            nblocks,        /* new count of blocks */
+       struct xfs_inode        *ip)            /* inode (bitmap/summary) */
 {
-       xfs_fileoff_t   bno;            /* block number in file */
-       xfs_buf_t       *bp;            /* temporary buffer for zeroing */
-       int             committed;      /* transaction committed flag */
-       xfs_daddr_t     d;              /* disk block address */
-       int             error;          /* error return value */
-       xfs_fsblock_t   firstblock;     /* first block allocated in xaction */
-       xfs_bmap_free_t flist;          /* list of freed blocks */
-       xfs_fsblock_t   fsbno;          /* filesystem block for bno */
-       xfs_bmbt_irec_t map;            /* block map output */
-       int             nmap;           /* number of block maps */
-       int             resblks;        /* space reservation */
+       xfs_fileoff_t           bno;            /* block number in file */
+       struct xfs_buf          *bp;    /* temporary buffer for zeroing */
+       int                     committed;      /* transaction committed flag */
+       xfs_daddr_t             d;              /* disk block address */
+       int                     error;          /* error return value */
+       xfs_fsblock_t           firstblock;/* first block allocated in xaction */
+       struct xfs_bmap_free    flist;          /* list of freed blocks */
+       xfs_fsblock_t           fsbno;          /* filesystem block for bno */
+       struct xfs_bmbt_irec    map;            /* block map output */
+       int                     nmap;           /* number of block maps */
+       int                     resblks;        /* space reservation */
+       struct xfs_trans        *tp;
 
        /*
         * Allocate space to the file, as necessary.
         */
        while (oblocks < nblocks) {
-               xfs_trans_t     *tp;
-
                tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC);
                resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks);
                /*
@@ -790,7 +789,7 @@ xfs_growfs_rt_alloc(
                error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtalloc,
                                          resblks, 0);
                if (error)
-                       goto error_cancel;
+                       goto out_trans_cancel;
                /*
                 * Lock the inode.
                 */
@@ -808,16 +807,16 @@ xfs_growfs_rt_alloc(
                if (!error && nmap < 1)
                        error = -ENOSPC;
                if (error)
-                       goto error_cancel;
+                       goto out_bmap_cancel;
                /*
                 * Free any blocks freed up in the transaction, then commit.
                 */
                error = xfs_bmap_finish(&tp, &flist, &committed);
                if (error)
-                       goto error_cancel;
+                       goto out_bmap_cancel;
                error = xfs_trans_commit(tp);
                if (error)
-                       goto error;
+                       return error;
                /*
                 * Now we need to clear the allocated blocks.
                 * Do this one block per transaction, to keep it simple.
@@ -832,7 +831,7 @@ xfs_growfs_rt_alloc(
                        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtzero,
                                                  0, 0);
                        if (error)
-                               goto error_cancel;
+                               goto out_trans_cancel;
                        /*
                         * Lock the bitmap inode.
                         */
@@ -846,9 +845,7 @@ xfs_growfs_rt_alloc(
                                mp->m_bsize, 0);
                        if (bp == NULL) {
                                error = -EIO;
-error_cancel:
-                               xfs_trans_cancel(tp);
-                               goto error;
+                               goto out_trans_cancel;
                        }
                        memset(bp->b_addr, 0, mp->m_sb.sb_blocksize);
                        xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1);
@@ -857,16 +854,20 @@ error_cancel:
                         */
                        error = xfs_trans_commit(tp);
                        if (error)
-                               goto error;
+                               return error;
                }
                /*
                 * Go on to the next extent, if any.
                 */
                oblocks = map.br_startoff + map.br_blockcount;
        }
+
        return 0;
 
-error:
+out_bmap_cancel:
+       xfs_bmap_cancel(&flist);
+out_trans_cancel:
+       xfs_trans_cancel(tp);
        return error;
 }
 
index bbd9b1f10ffb2d9a19995ca0f7f30ed500128a4e..904f637cfa5f3c2f40c0e7c4bb0ba6ffe3b65d1a 100644 (file)
@@ -261,16 +261,8 @@ xfs_parseargs(
                        mp->m_rtname = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
                        if (!mp->m_rtname)
                                return -ENOMEM;
-               } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) {
-                       if (!value || !*value) {
-                               xfs_warn(mp, "%s option requires an argument",
-                                       this_char);
-                               return -EINVAL;
-                       }
-                       if (kstrtoint(value, 10, &iosize))
-                               return -EINVAL;
-                       iosizelog = ffs(iosize) - 1;
-               } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) {
+               } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE) ||
+                          !strcmp(this_char, MNTOPT_BIOSIZE)) {
                        if (!value || !*value) {
                                xfs_warn(mp, "%s option requires an argument",
                                        this_char);
@@ -1528,6 +1520,10 @@ xfs_fs_fill_super(
                }
        }
 
+       if (xfs_sb_version_hassparseinodes(&mp->m_sb))
+               xfs_alert(mp,
+       "EXPERIMENTAL sparse inode feature enabled. Use at your own risk!");
+
        error = xfs_mountfs(mp);
        if (error)
                goto out_filestream_unmount;
index 4be27b0210af863f3913f94b9b7134307a7c74ff..996481eeb491363a75f804f807f8a7056f794e3d 100644 (file)
@@ -240,7 +240,8 @@ xfs_symlink(
        if (error)
                goto out_trans_cancel;
 
-       xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
+       xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL |
+                     XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT);
        unlock_dp_on_error = true;
 
        /*
@@ -288,7 +289,7 @@ xfs_symlink(
         * the transaction cancel unlocking dp so don't do it explicitly in the
         * error path.
         */
-       xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+       xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
        unlock_dp_on_error = false;
 
        /*
@@ -421,7 +422,7 @@ out_release_inode:
        xfs_qm_dqrele(pdqp);
 
        if (unlock_dp_on_error)
-               xfs_iunlock(dp, XFS_ILOCK_EXCL);
+               xfs_iunlock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
        return error;
 }
 
@@ -501,7 +502,7 @@ xfs_inactive_symlink_rmt(
        /*
         * Unmap the dead block(s) to the free_list.
         */
-       error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
+       error = xfs_bunmapi(tp, ip, 0, size, 0, nmaps,
                            &first_block, &free_list, &done);
        if (error)
                goto error_bmap_cancel;
index 8d916d33d93da78a4fb6d214fbd4282948211bc8..5ed36b1e04c1af6103122450e33aa8e1f2546c1b 100644 (file)
@@ -687,6 +687,7 @@ DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag);
 DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid);
 
 DEFINE_INODE_EVENT(xfs_filemap_fault);
+DEFINE_INODE_EVENT(xfs_filemap_pmd_fault);
 DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite);
 
 DECLARE_EVENT_CLASS(xfs_iref_class,
@@ -2089,6 +2090,40 @@ DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover);
 DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel);
 DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip);
 
+DECLARE_EVENT_CLASS(xfs_log_recover_icreate_item_class,
+       TP_PROTO(struct xlog *log, struct xfs_icreate_log *in_f),
+       TP_ARGS(log, in_f),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_agnumber_t, agno)
+               __field(xfs_agblock_t, agbno)
+               __field(unsigned int, count)
+               __field(unsigned int, isize)
+               __field(xfs_agblock_t, length)
+               __field(unsigned int, gen)
+       ),
+       TP_fast_assign(
+               __entry->dev = log->l_mp->m_super->s_dev;
+               __entry->agno = be32_to_cpu(in_f->icl_ag);
+               __entry->agbno = be32_to_cpu(in_f->icl_agbno);
+               __entry->count = be32_to_cpu(in_f->icl_count);
+               __entry->isize = be32_to_cpu(in_f->icl_isize);
+               __entry->length = be32_to_cpu(in_f->icl_length);
+               __entry->gen = be32_to_cpu(in_f->icl_gen);
+       ),
+       TP_printk("dev %d:%d agno %u agbno %u count %u isize %u length %u "
+                 "gen %u", MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->agno, __entry->agbno, __entry->count, __entry->isize,
+                 __entry->length, __entry->gen)
+)
+#define DEFINE_LOG_RECOVER_ICREATE_ITEM(name) \
+DEFINE_EVENT(xfs_log_recover_icreate_item_class, name, \
+       TP_PROTO(struct xlog *log, struct xfs_icreate_log *in_f), \
+       TP_ARGS(log, in_f))
+
+DEFINE_LOG_RECOVER_ICREATE_ITEM(xfs_log_recover_icreate_cancel);
+DEFINE_LOG_RECOVER_ICREATE_ITEM(xfs_log_recover_icreate_recover);
+
 DECLARE_EVENT_CLASS(xfs_discard_class,
        TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
                 xfs_agblock_t agbno, xfs_extlen_t len),
index 0582a27107d4ee3dcd2e57c6739405a96280d79d..a0ab1dae9c312ec0b67fe83a1d2936e7d716b339 100644 (file)
@@ -1019,9 +1019,10 @@ xfs_trans_cancel(
  * chunk we've been working on and get a new transaction to continue.
  */
 int
-xfs_trans_roll(
+__xfs_trans_roll(
        struct xfs_trans        **tpp,
-       struct xfs_inode        *dp)
+       struct xfs_inode        *dp,
+       int                     *committed)
 {
        struct xfs_trans        *trans;
        struct xfs_trans_res    tres;
@@ -1052,6 +1053,7 @@ xfs_trans_roll(
        if (error)
                return error;
 
+       *committed = 1;
        trans = *tpp;
 
        /*
@@ -1074,3 +1076,12 @@ xfs_trans_roll(
                xfs_trans_ijoin(trans, dp, 0);
        return 0;
 }
+
+int
+xfs_trans_roll(
+       struct xfs_trans        **tpp,
+       struct xfs_inode        *dp)
+{
+       int                     committed = 0;
+       return __xfs_trans_roll(tpp, dp, &committed);
+}
index 3b21b4e5e4678885f0a9197cd073684814a118a4..4643070d7cae4b814a36b101ba0a88fcd0c6287e 100644 (file)
@@ -213,7 +213,6 @@ void                xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint);
 void           xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint);
 void           xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint);
 struct xfs_efi_log_item        *xfs_trans_get_efi(xfs_trans_t *, uint);
-void           xfs_efi_release(struct xfs_efi_log_item *, uint);
 void           xfs_trans_log_efi_extent(xfs_trans_t *,
                                         struct xfs_efi_log_item *,
                                         xfs_fsblock_t,
@@ -221,11 +220,11 @@ void              xfs_trans_log_efi_extent(xfs_trans_t *,
 struct xfs_efd_log_item        *xfs_trans_get_efd(xfs_trans_t *,
                                  struct xfs_efi_log_item *,
                                  uint);
-void           xfs_trans_log_efd_extent(xfs_trans_t *,
-                                        struct xfs_efd_log_item *,
-                                        xfs_fsblock_t,
-                                        xfs_extlen_t);
+int            xfs_trans_free_extent(struct xfs_trans *,
+                                     struct xfs_efd_log_item *, xfs_fsblock_t,
+                                     xfs_extlen_t);
 int            xfs_trans_commit(struct xfs_trans *);
+int            __xfs_trans_roll(struct xfs_trans **, struct xfs_inode *, int *);
 int            xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
 void           xfs_trans_cancel(xfs_trans_t *);
 int            xfs_trans_ail_init(struct xfs_mount *);
index 284397dd7990d83cb7be11be4a0424bbd7cc1a9c..a96ae540eb629c86e15c004dc66eb60fbb6be90e 100644 (file)
@@ -25,6 +25,7 @@
 #include "xfs_trans.h"
 #include "xfs_trans_priv.h"
 #include "xfs_extfree_item.h"
+#include "xfs_alloc.h"
 
 /*
  * This routine is called to allocate an "extent free intention"
@@ -108,19 +109,30 @@ xfs_trans_get_efd(xfs_trans_t             *tp,
 }
 
 /*
- * This routine is called to indicate that the described
- * extent is to be logged as having been freed.  It should
- * be called once for each extent freed.
+ * Free an extent and log it to the EFD. Note that the transaction is marked
+ * dirty regardless of whether the extent free succeeds or fails to support the
+ * EFI/EFD lifecycle rules.
  */
-void
-xfs_trans_log_efd_extent(xfs_trans_t           *tp,
-                        xfs_efd_log_item_t     *efdp,
-                        xfs_fsblock_t          start_block,
-                        xfs_extlen_t           ext_len)
+int
+xfs_trans_free_extent(
+       struct xfs_trans        *tp,
+       struct xfs_efd_log_item *efdp,
+       xfs_fsblock_t           start_block,
+       xfs_extlen_t            ext_len)
 {
        uint                    next_extent;
-       xfs_extent_t            *extp;
+       struct xfs_extent       *extp;
+       int                     error;
 
+       error = xfs_free_extent(tp, start_block, ext_len);
+
+       /*
+        * Mark the transaction dirty, even on error. This ensures the
+        * transaction is aborted, which:
+        *
+        * 1.) releases the EFI and frees the EFD
+        * 2.) shuts down the filesystem
+        */
        tp->t_flags |= XFS_TRANS_DIRTY;
        efdp->efd_item.li_desc->lid_flags |= XFS_LID_DIRTY;
 
@@ -130,4 +142,6 @@ xfs_trans_log_efd_extent(xfs_trans_t                *tp,
        extp->ext_start = start_block;
        extp->ext_len = ext_len;
        efdp->efd_next_extent++;
+
+       return error;
 }
index 1b736294558a76f0498f7e034885ddc5ce9c1497..49931b72da8a8400f767a1cb7d368b31ce62c04b 100644 (file)
@@ -119,6 +119,21 @@ xfs_trans_ail_delete(
        xfs_trans_ail_delete_bulk(ailp, &lip, 1, shutdown_type);
 }
 
+static inline void
+xfs_trans_ail_remove(
+       struct xfs_log_item     *lip,
+       int                     shutdown_type)
+{
+       struct xfs_ail          *ailp = lip->li_ailp;
+
+       spin_lock(&ailp->xa_lock);
+       /* xfs_trans_ail_delete() drops the AIL lock */
+       if (lip->li_flags & XFS_LI_IN_AIL)
+               xfs_trans_ail_delete(ailp, lip, shutdown_type);
+       else
+               spin_unlock(&ailp->xa_lock);
+}
+
 void                   xfs_ail_push(struct xfs_ail *, xfs_lsn_t);
 void                   xfs_ail_push_all(struct xfs_ail *);
 void                   xfs_ail_push_all_sync(struct xfs_ail *);
index 940d5ec122c96e5a72173b9db2d0bcff44d39405..b1bc954eccf37438213d6744fe69ff1bc4d71365 100644 (file)
@@ -6,6 +6,7 @@
 #include <linux/scatterlist.h>
 #include <linux/dma-debug.h>
 #include <linux/dma-attrs.h>
+#include <asm-generic/dma-coherent.h>
 
 static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr,
                                              size_t size,
@@ -237,4 +238,121 @@ dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, void *cpu_addr,
 
 #define dma_get_sgtable(d, t, v, h, s) dma_get_sgtable_attrs(d, t, v, h, s, NULL)
 
+#ifndef arch_dma_alloc_attrs
+#define arch_dma_alloc_attrs(dev, flag)        (true)
+#endif
+
+static inline void *dma_alloc_attrs(struct device *dev, size_t size,
+                                      dma_addr_t *dma_handle, gfp_t flag,
+                                      struct dma_attrs *attrs)
+{
+       struct dma_map_ops *ops = get_dma_ops(dev);
+       void *cpu_addr;
+
+       BUG_ON(!ops);
+
+       if (dma_alloc_from_coherent(dev, size, dma_handle, &cpu_addr))
+               return cpu_addr;
+
+       if (!arch_dma_alloc_attrs(&dev, &flag))
+               return NULL;
+       if (!ops->alloc)
+               return NULL;
+
+       cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs);
+       debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
+       return cpu_addr;
+}
+
+static inline void dma_free_attrs(struct device *dev, size_t size,
+                                    void *cpu_addr, dma_addr_t dma_handle,
+                                    struct dma_attrs *attrs)
+{
+       struct dma_map_ops *ops = get_dma_ops(dev);
+
+       BUG_ON(!ops);
+       WARN_ON(irqs_disabled());
+
+       if (dma_release_from_coherent(dev, get_order(size), cpu_addr))
+               return;
+
+       if (!ops->free)
+               return;
+
+       debug_dma_free_coherent(dev, size, cpu_addr, dma_handle);
+       ops->free(dev, size, cpu_addr, dma_handle, attrs);
+}
+
+static inline void *dma_alloc_coherent(struct device *dev, size_t size,
+               dma_addr_t *dma_handle, gfp_t flag)
+{
+       return dma_alloc_attrs(dev, size, dma_handle, flag, NULL);
+}
+
+static inline void dma_free_coherent(struct device *dev, size_t size,
+               void *cpu_addr, dma_addr_t dma_handle)
+{
+       return dma_free_attrs(dev, size, cpu_addr, dma_handle, NULL);
+}
+
+static inline void *dma_alloc_noncoherent(struct device *dev, size_t size,
+               dma_addr_t *dma_handle, gfp_t gfp)
+{
+       DEFINE_DMA_ATTRS(attrs);
+
+       dma_set_attr(DMA_ATTR_NON_CONSISTENT, &attrs);
+       return dma_alloc_attrs(dev, size, dma_handle, gfp, &attrs);
+}
+
+static inline void dma_free_noncoherent(struct device *dev, size_t size,
+               void *cpu_addr, dma_addr_t dma_handle)
+{
+       DEFINE_DMA_ATTRS(attrs);
+
+       dma_set_attr(DMA_ATTR_NON_CONSISTENT, &attrs);
+       dma_free_attrs(dev, size, cpu_addr, dma_handle, &attrs);
+}
+
+static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+       debug_dma_mapping_error(dev, dma_addr);
+
+       if (get_dma_ops(dev)->mapping_error)
+               return get_dma_ops(dev)->mapping_error(dev, dma_addr);
+
+#ifdef DMA_ERROR_CODE
+       return dma_addr == DMA_ERROR_CODE;
+#else
+       return 0;
+#endif
+}
+
+#ifndef HAVE_ARCH_DMA_SUPPORTED
+static inline int dma_supported(struct device *dev, u64 mask)
+{
+       struct dma_map_ops *ops = get_dma_ops(dev);
+
+       if (!ops)
+               return 0;
+       if (!ops->dma_supported)
+               return 1;
+       return ops->dma_supported(dev, mask);
+}
+#endif
+
+#ifndef HAVE_ARCH_DMA_SET_MASK
+static inline int dma_set_mask(struct device *dev, u64 mask)
+{
+       struct dma_map_ops *ops = get_dma_ops(dev);
+
+       if (ops->set_dma_mask)
+               return ops->set_dma_mask(dev, mask);
+
+       if (!dev->dma_mask || !dma_supported(dev, mask))
+               return -EIO;
+       *dev->dma_mask = mask;
+       return 0;
+}
+#endif
+
 #endif
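
The common dma_alloc_attrs()/dma_free_attrs() above rely on a hook, arch_dma_alloc_attrs(), that defaults to '(true)' unless an architecture defines its own version before this header is pulled in. A toy userspace illustration of that override-by-macro pattern; the names here only loosely mirror the header:

#include <stdio.h>
#include <stdbool.h>

/*
 * "Arch" side: define the hook before the generic code is seen.
 * Remove this #define to fall back to the generic default below.
 */
#define arch_prepare_alloc(flags) arch_specific_prepare(flags)

static bool arch_specific_prepare(unsigned *flags)
{
        *flags |= 0x1;          /* e.g. force some arch-required flag */
        return true;
}

/* "Generic" side: supply a do-nothing default when no arch hook exists. */
#ifndef arch_prepare_alloc
#define arch_prepare_alloc(flags) (true)
#endif

static void *generic_alloc(unsigned *flags)
{
        static char buf[64];

        if (!arch_prepare_alloc(flags))
                return NULL;
        printf("allocating with flags 0x%x\n", *flags);
        return buf;
}

int main(void)
{
        unsigned flags = 0;

        return generic_alloc(&flags) ? 0 : 1;
}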
index a5de55c04fb2ee56b8f77a3444028aa52c3ecbb9..734ad4db388c6d922fb812391f913cbdda710f12 100644 (file)
@@ -11,6 +11,8 @@ extern void __iomem *early_ioremap(resource_size_t phys_addr,
                                   unsigned long size);
 extern void *early_memremap(resource_size_t phys_addr,
                            unsigned long size);
+extern void *early_memremap_ro(resource_size_t phys_addr,
+                              unsigned long size);
 extern void early_iounmap(void __iomem *addr, unsigned long size);
 extern void early_memunmap(void *addr, unsigned long size);
 
@@ -33,6 +35,12 @@ extern void early_ioremap_setup(void);
  */
 extern void early_ioremap_reset(void);
 
+/*
+ * Early copy from unmapped memory to kernel mapped memory.
+ */
+extern void copy_from_early_mem(void *dest, phys_addr_t src,
+                               unsigned long size);
+
 #else
 static inline void early_ioremap_init(void) { }
 static inline void early_ioremap_setup(void) { }
index f23174fb9ec4340378df59b5cc89b43ecf342bec..1cbb8338edf391bd83c4d1b0bc0dff2cbbe56e75 100644 (file)
@@ -46,6 +46,9 @@ static inline unsigned long virt_to_fix(const unsigned long vaddr)
 #ifndef FIXMAP_PAGE_NORMAL
 #define FIXMAP_PAGE_NORMAL PAGE_KERNEL
 #endif
+#if !defined(FIXMAP_PAGE_RO) && defined(PAGE_KERNEL_RO)
+#define FIXMAP_PAGE_RO PAGE_KERNEL_RO
+#endif
 #ifndef FIXMAP_PAGE_NOCACHE
 #define FIXMAP_PAGE_NOCACHE PAGE_KERNEL_NOCACHE
 #endif
index 14909b0b9cae71d879df88963a688d37a0e98a8a..f20f407ce45d29fddcf98bd8fcfe270349fd0b92 100644 (file)
 })
 #endif /* CONFIG_FLATMEM/DISCONTIGMEM/SPARSEMEM */
 
+/*
+ * Convert a physical address to a Page Frame Number and back
+ */
+#define        __phys_to_pfn(paddr)    ((unsigned long)((paddr) >> PAGE_SHIFT))
+#define        __pfn_to_phys(pfn)      ((pfn) << PAGE_SHIFT)
+
 #define page_to_pfn __page_to_pfn
 #define pfn_to_page __pfn_to_page
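
The new generic __phys_to_pfn()/__pfn_to_phys() are plain shifts by PAGE_SHIFT. A worked example assuming 4KiB pages (PAGE_SHIFT = 12): physical address 0x12345678 lies in page frame 0x12345, and that frame starts at physical 0x12345000. A trivial check:

#include <stdio.h>

#define PAGE_SHIFT 12   /* assumed: 4KiB pages */
#define __phys_to_pfn(paddr)    ((unsigned long)((paddr) >> PAGE_SHIFT))
#define __pfn_to_phys(pfn)      ((unsigned long long)(pfn) << PAGE_SHIFT)

int main(void)
{
        unsigned long long paddr = 0x12345678ULL;
        unsigned long pfn = __phys_to_pfn(paddr);

        printf("paddr 0x%llx -> pfn 0x%lx -> page start 0x%llx\n",
               paddr, pfn, __pfn_to_phys(pfn));   /* 0x12345, 0x12345000 */
        return 0;
}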
 
index 83bfb87f5bf18e92ea794dd3ca3afec1b1ba6f11..e2aadbc7151f4cd69b8745e80a0af403257f1678 100644 (file)
@@ -111,8 +111,8 @@ static inline void queued_spin_unlock_wait(struct qspinlock *lock)
                cpu_relax();
 }
 
-#ifndef virt_queued_spin_lock
-static __always_inline bool virt_queued_spin_lock(struct qspinlock *lock)
+#ifndef virt_spin_lock
+static __always_inline bool virt_spin_lock(struct qspinlock *lock)
 {
        return false;
 }
index fa86f240c874328121862d0332930abc7e053ba8..4e3b6558331eac0d478389a53ac2c91b92fd7b4d 100644 (file)
@@ -16,6 +16,9 @@
 #include <linux/rtc.h>
 #include <linux/bcd.h>
 #include <linux/delay.h>
+#ifdef CONFIG_ACPI
+#include <linux/acpi.h>
+#endif
 
 #define RTC_PIE 0x40           /* periodic interrupt enable */
 #define RTC_AIE 0x20           /* alarm interrupt enable */
@@ -46,6 +49,7 @@ static inline unsigned int __get_rtc_time(struct rtc_time *time)
 {
        unsigned char ctrl;
        unsigned long flags;
+       unsigned char century = 0;
 
 #ifdef CONFIG_MACH_DECSTATION
        unsigned int real_year;
@@ -78,6 +82,11 @@ static inline unsigned int __get_rtc_time(struct rtc_time *time)
        time->tm_year = CMOS_READ(RTC_YEAR);
 #ifdef CONFIG_MACH_DECSTATION
        real_year = CMOS_READ(RTC_DEC_YEAR);
+#endif
+#ifdef CONFIG_ACPI
+       if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
+           acpi_gbl_FADT.century)
+               century = CMOS_READ(acpi_gbl_FADT.century);
 #endif
        ctrl = CMOS_READ(RTC_CONTROL);
        spin_unlock_irqrestore(&rtc_lock, flags);
@@ -90,12 +99,16 @@ static inline unsigned int __get_rtc_time(struct rtc_time *time)
                time->tm_mday = bcd2bin(time->tm_mday);
                time->tm_mon = bcd2bin(time->tm_mon);
                time->tm_year = bcd2bin(time->tm_year);
+               century = bcd2bin(century);
        }
 
 #ifdef CONFIG_MACH_DECSTATION
        time->tm_year += real_year - 72;
 #endif
 
+       if (century)
+               time->tm_year += (century - 19) * 100;
+
        /*
         * Account for differences between how the RTC uses the values
         * and how they are defined in a struct rtc_time;
@@ -122,6 +135,7 @@ static inline int __set_rtc_time(struct rtc_time *time)
 #ifdef CONFIG_MACH_DECSTATION
        unsigned int real_yrs, leap_yr;
 #endif
+       unsigned char century = 0;
 
        yrs = time->tm_year;
        mon = time->tm_mon + 1;   /* tm_mon starts at zero */
@@ -150,6 +164,15 @@ static inline int __set_rtc_time(struct rtc_time *time)
                yrs = 73;
        }
 #endif
+
+#ifdef CONFIG_ACPI
+       if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
+           acpi_gbl_FADT.century) {
+               century = (yrs + 1900) / 100;
+               yrs %= 100;
+       }
+#endif
+
        /* These limits and adjustments are independent of
         * whether the chip is in binary mode or not.
         */
@@ -169,6 +192,7 @@ static inline int __set_rtc_time(struct rtc_time *time)
                day = bin2bcd(day);
                mon = bin2bcd(mon);
                yrs = bin2bcd(yrs);
+               century = bin2bcd(century);
        }
 
        save_control = CMOS_READ(RTC_CONTROL);
@@ -185,6 +209,11 @@ static inline int __set_rtc_time(struct rtc_time *time)
        CMOS_WRITE(hrs, RTC_HOURS);
        CMOS_WRITE(min, RTC_MINUTES);
        CMOS_WRITE(sec, RTC_SECONDS);
+#ifdef CONFIG_ACPI
+       if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
+           acpi_gbl_FADT.century)
+               CMOS_WRITE(century, acpi_gbl_FADT.century);
+#endif
 
        CMOS_WRITE(save_control, RTC_CONTROL);
        CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
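
With an ACPI FADT that advertises a century CMOS register, the write path above splits the full year into a century byte and a two-digit year, and the read path reassembles it: tm_year counts years since 1900, so a stored century C contributes (C - 19) * 100. A quick round-trip check of that arithmetic, outside the kernel:

#include <stdio.h>

int main(void)
{
        int tm_year = 115;                      /* years since 1900, i.e. 2015 */

        /* Write path: split the full year into century and 2-digit year. */
        int yrs = tm_year;
        int century = (yrs + 1900) / 100;       /* 20 */
        yrs %= 100;                             /* 15 */

        /* Read path: tm_year starts as the 2-digit year from CMOS. */
        int read_year = yrs;
        if (century)
                read_year += (century - 19) * 100;      /* back to 115 */

        printf("century=%d yrs=%d -> tm_year=%d\n", century, yrs, read_year);
        return (read_year == tm_year) ? 0 : 1;
}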
index 8bd374d3cf21fb0ee32e217f7f7eb6915c992514..1781e54ea6d30791796c450df64240dff720fbbf 100644 (file)
  * during second ld run in second ld pass when generating System.map */
 #define TEXT_TEXT                                                      \
                ALIGN_FUNCTION();                                       \
-               *(.text.hot)                                            \
-               *(.text .text.fixup)                                    \
+               *(.text.hot .text .text.fixup .text.unlikely)           \
                *(.ref.text)                                            \
        MEM_KEEP(init.text)                                             \
        MEM_KEEP(exit.text)                                             \
-               *(.text.unlikely)
 
 
 /* sched.text is aling to function alignment to secure we have same
index 691c79172a269152cf22c1a5d8a5455c736bb835..441aff9b5aa75923e66566d93fdcd544d948b00f 100644 (file)
@@ -9,6 +9,11 @@
  * 2 of the Licence, or (at your option) any later version.
  */
 
+#ifndef _CRYPTO_PKCS7_H
+#define _CRYPTO_PKCS7_H
+
+#include <crypto/public_key.h>
+
 struct key;
 struct pkcs7_message;
 
@@ -33,4 +38,10 @@ extern int pkcs7_validate_trust(struct pkcs7_message *pkcs7,
 /*
  * pkcs7_verify.c
  */
-extern int pkcs7_verify(struct pkcs7_message *pkcs7);
+extern int pkcs7_verify(struct pkcs7_message *pkcs7,
+                       enum key_being_used_for usage);
+
+extern int pkcs7_supply_detached_data(struct pkcs7_message *pkcs7,
+                                     const void *data, size_t datalen);
+
+#endif /* _CRYPTO_PKCS7_H */
index 54add206990166ff44aca927d024d4d62b8256a1..067c242b1e152d013f746424ceebd417112154a7 100644 (file)
@@ -33,11 +33,26 @@ extern const struct public_key_algorithm *pkey_algo[PKEY_ALGO__LAST];
 enum pkey_id_type {
        PKEY_ID_PGP,            /* OpenPGP generated key ID */
        PKEY_ID_X509,           /* X.509 arbitrary subjectKeyIdentifier */
+       PKEY_ID_PKCS7,          /* Signature in PKCS#7 message */
        PKEY_ID_TYPE__LAST
 };
 
 extern const char *const pkey_id_type_name[PKEY_ID_TYPE__LAST];
 
+/*
+ * The use to which an asymmetric key is being put.
+ */
+enum key_being_used_for {
+       VERIFYING_MODULE_SIGNATURE,
+       VERIFYING_FIRMWARE_SIGNATURE,
+       VERIFYING_KEXEC_PE_SIGNATURE,
+       VERIFYING_KEY_SIGNATURE,
+       VERIFYING_KEY_SELF_SIGNATURE,
+       VERIFYING_UNSPECIFIED_SIGNATURE,
+       NR__KEY_BEING_USED_FOR
+};
+extern const char *const key_being_used_for[NR__KEY_BEING_USED_FOR];
+
 /*
  * Cryptographic data for the public-key subtype of the asymmetric key type.
  *
@@ -101,7 +116,8 @@ extern int verify_signature(const struct key *key,
 
 struct asymmetric_key_id;
 extern struct key *x509_request_asymmetric_key(struct key *keyring,
-                                              const struct asymmetric_key_id *kid,
+                                              const struct asymmetric_key_id *id,
+                                              const struct asymmetric_key_id *skid,
                                               bool partial);
 
 #endif /* _LINUX_PUBLIC_KEY_H */
diff --git a/include/dt-bindings/i2c/i2c.h b/include/dt-bindings/i2c/i2c.h
new file mode 100644 (file)
index 0000000..1d5da81
--- /dev/null
@@ -0,0 +1,18 @@
+/*
+ * This header provides constants for I2C bindings
+ *
+ * Copyright (C) 2015 by Sang Engineering
+ * Copyright (C) 2015 by Renesas Electronics Corporation
+ *
+ * Wolfram Sang <wsa@sang-engineering.com>
+ *
+ * GPLv2 only
+ */
+
+#ifndef _DT_BINDINGS_I2C_I2C_H
+#define _DT_BINDINGS_I2C_I2C_H
+
+#define I2C_TEN_BIT_ADDRESS    (1 << 31)
+#define I2C_OWN_SLAVE_ADDRESS  (1 << 30)
+
+#endif
index 72665eb8069269f4e1b6726c8cd2d04374405ab8..b20cd885c1fd84b01d5b5602177777a74ea64810 100644 (file)
@@ -15,6 +15,7 @@
 #ifdef CONFIG_SYSTEM_TRUSTED_KEYRING
 
 #include <linux/key.h>
+#include <crypto/public_key.h>
 
 extern struct key *system_trusted_keyring;
 static inline struct key *get_system_trusted_keyring(void)
@@ -28,4 +29,10 @@ static inline struct key *get_system_trusted_keyring(void)
 }
 #endif
 
+#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
+extern int system_verify_data(const void *data, unsigned long len,
+                             const void *raw_pkcs7, size_t pkcs7_len,
+                             enum key_being_used_for usage);
+#endif
+
 #endif /* _KEYS_SYSTEM_KEYRING_H */
index e5966758c093483cd6027b4e14ea678904be5978..e1e4d7c38dda5b1d014c607002838bc6275f9135 100644 (file)
@@ -52,13 +52,16 @@ struct arch_timer_cpu {
 
        /* Timer IRQ */
        const struct kvm_irq_level      *irq;
+
+       /* VGIC mapping */
+       struct irq_phys_map             *map;
 };
 
 int kvm_timer_hyp_init(void);
 void kvm_timer_enable(struct kvm *kvm);
 void kvm_timer_init(struct kvm *kvm);
-void kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
-                         const struct kvm_irq_level *irq);
+int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
+                        const struct kvm_irq_level *irq);
 void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu);
 void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu);
 void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu);
index 133ea00aa83bc8926137ca8867f28a4ab9469d27..d901f1a47be6c5e4af71ea7543f7dc7163eca2f9 100644 (file)
@@ -95,11 +95,15 @@ enum vgic_type {
 #define LR_STATE_ACTIVE                (1 << 1)
 #define LR_STATE_MASK          (3 << 0)
 #define LR_EOI_INT             (1 << 2)
+#define LR_HW                  (1 << 3)
 
 struct vgic_lr {
-       u16     irq;
-       u8      source;
-       u8      state;
+       unsigned irq:10;
+       union {
+               unsigned hwirq:10;
+               unsigned source:3;
+       };
+       unsigned state:4;
 };
 
 struct vgic_vmcr {
@@ -155,6 +159,19 @@ struct vgic_io_device {
        struct kvm_io_device dev;
 };
 
+struct irq_phys_map {
+       u32                     virt_irq;
+       u32                     phys_irq;
+       u32                     irq;
+       bool                    active;
+};
+
+struct irq_phys_map_entry {
+       struct list_head        entry;
+       struct rcu_head         rcu;
+       struct irq_phys_map     map;
+};
+
 struct vgic_dist {
        spinlock_t              lock;
        bool                    in_kernel;
@@ -252,6 +269,10 @@ struct vgic_dist {
        struct vgic_vm_ops      vm_ops;
        struct vgic_io_device   dist_iodev;
        struct vgic_io_device   *redist_iodevs;
+
+       /* Virtual irq to hwirq mapping */
+       spinlock_t              irq_phys_map_lock;
+       struct list_head        irq_phys_map_list;
 };
 
 struct vgic_v2_cpu_if {
@@ -303,6 +324,9 @@ struct vgic_cpu {
                struct vgic_v2_cpu_if   vgic_v2;
                struct vgic_v3_cpu_if   vgic_v3;
        };
+
+       /* Protected by the distributor's irq_phys_map_lock */
+       struct list_head        irq_phys_map_list;
 };
 
 #define LR_EMPTY       0xff
@@ -317,16 +341,25 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write);
 int kvm_vgic_hyp_init(void);
 int kvm_vgic_map_resources(struct kvm *kvm);
 int kvm_vgic_get_max_vcpus(void);
+void kvm_vgic_early_init(struct kvm *kvm);
 int kvm_vgic_create(struct kvm *kvm, u32 type);
 void kvm_vgic_destroy(struct kvm *kvm);
+void kvm_vgic_vcpu_early_init(struct kvm_vcpu *vcpu);
 void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu);
 void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu);
 void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu);
 int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num,
                        bool level);
+int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid,
+                              struct irq_phys_map *map, bool level);
 void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg);
 int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu);
 int kvm_vgic_vcpu_active_irq(struct kvm_vcpu *vcpu);
+struct irq_phys_map *kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu,
+                                          int virt_irq, int irq);
+int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, struct irq_phys_map *map);
+bool kvm_vgic_get_phys_irq_active(struct irq_phys_map *map);
+void kvm_vgic_set_phys_irq_active(struct irq_phys_map *map, bool active);
 
 #define irqchip_in_kernel(k)   (!!((k)->arch.vgic.in_kernel))
 #define vgic_initialized(k)    (!!((k)->arch.vgic.nr_cpus))
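
struct vgic_lr above packs the list-register fields into bitfields, with hwirq and source overlaid in an anonymous union since a hardware-mapped interrupt does not also carry an SGI source. A standalone sketch of that layout (field widths copied from the hunk; the exact in-memory packing is compiler-dependent):

#include <stdio.h>

struct lr {
        unsigned irq:10;
        union {                 /* hwirq and source are not used together */
                unsigned hwirq:10;
                unsigned source:3;
        };
        unsigned state:4;
};

int main(void)
{
        struct lr lr = { .irq = 27, .state = 1 };

        lr.hwirq = 27;          /* e.g. mirror a hardware PPI number */
        printf("sizeof(struct lr) = %zu\n", sizeof(lr));
        printf("irq=%u hwirq=%u state=%u\n", lr.irq, lr.hwirq, lr.state);
        return 0;
}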
index 6a0a89ed7f81cac2b9eb4d04414874fc46243cb3..0ddb5c02ad8b6c279047c4c8c9c90e5516327ca7 100644 (file)
 #define UART01x_DR             0x00    /* Data read or written from the interface. */
 #define UART01x_RSR            0x04    /* Receive status register (Read). */
 #define UART01x_ECR            0x04    /* Error clear register (Write). */
-#define ZX_UART01x_DR          0x04    /* Data read or written from the interface. */
 #define UART010_LCRH           0x08    /* Line control register, high byte. */
 #define ST_UART011_DMAWM       0x08    /* DMA watermark configure register. */
 #define UART010_LCRM           0x0C    /* Line control register, middle byte. */
 #define ST_UART011_TIMEOUT     0x0C    /* Timeout period register. */
 #define UART010_LCRL           0x10    /* Line control register, low byte. */
 #define UART010_CR             0x14    /* Control register. */
-#define ZX_UART01x_FR          0x14    /* Flag register (Read only). */
 #define UART01x_FR             0x18    /* Flag register (Read only). */
 #define UART010_IIR            0x1C    /* Interrupt identification register (Read). */
 #define UART010_ICR            0x1C    /* Interrupt clear register (Write). */
 #define UART011_LCRH           0x2c    /* Line control register. */
 #define ST_UART011_LCRH_TX     0x2c    /* Tx Line control register. */
 #define UART011_CR             0x30    /* Control register. */
-#define ZX_UART011_LCRH_TX     0x30    /* Tx Line control register. */
 #define UART011_IFLS           0x34    /* Interrupt fifo level select. */
-#define ZX_UART011_CR          0x34    /* Control register. */
-#define ZX_UART011_IFLS                0x38    /* Interrupt fifo level select. */
 #define UART011_IMSC           0x38    /* Interrupt mask. */
 #define UART011_RIS            0x3c    /* Raw interrupt status. */
 #define UART011_MIS            0x40    /* Masked interrupt status. */
-#define ZX_UART011_IMSC                0x40    /* Interrupt mask. */
 #define UART011_ICR            0x44    /* Interrupt clear register. */
-#define ZX_UART011_RIS         0x44    /* Raw interrupt status. */
 #define UART011_DMACR          0x48    /* DMA control register. */
-#define ZX_UART011_MIS         0x48    /* Masked interrupt status. */
-#define ZX_UART011_ICR         0x4c    /* Interrupt clear register. */
 #define ST_UART011_XFCR                0x50    /* XON/XOFF control register. */
-#define ZX_UART011_DMACR       0x50    /* DMA control register. */
 #define ST_UART011_XON1                0x54    /* XON1 register. */
 #define ST_UART011_XON2                0x58    /* XON2 register. */
 #define ST_UART011_XOFF1       0x5C    /* XOFF1 register. */
 #define UART01x_RSR_PE                 0x02
 #define UART01x_RSR_FE                 0x01
 
-#define ZX_UART01x_FR_BUSY     0x300
 #define UART011_FR_RI          0x100
 #define UART011_FR_TXFE                0x080
 #define UART011_FR_RXFF                0x040
 #define UART01x_FR_TXFF                0x020
 #define UART01x_FR_RXFE                0x010
 #define UART01x_FR_BUSY                0x008
-#define ZX_UART01x_FR_DSR       0x008
 #define UART01x_FR_DCD                 0x004
 #define UART01x_FR_DSR                 0x002
-#define ZX_UART01x_FR_CTS      0x002
 #define UART01x_FR_CTS                 0x001
-#define ZX_UART011_FR_RI       0x001
 #define UART01x_FR_TMSK                (UART01x_FR_TXFF + UART01x_FR_BUSY)
 
 #define UART011_CR_CTSEN       0x8000  /* CTS hardware flow control */
index 945d44ae529c311e1fe6c3e330a8b789f6d3eb55..ab3a6c002f7b9d2e265b08061d78e7f260b13633 100644 (file)
@@ -45,23 +45,27 @@ enum asn1_opcode {
        ASN1_OP_MATCH_JUMP              = 0x04,
        ASN1_OP_MATCH_JUMP_OR_SKIP      = 0x05,
        ASN1_OP_MATCH_ANY               = 0x08,
+       ASN1_OP_MATCH_ANY_OR_SKIP       = 0x09,
        ASN1_OP_MATCH_ANY_ACT           = 0x0a,
+       ASN1_OP_MATCH_ANY_ACT_OR_SKIP   = 0x0b,
        /* Everything before here matches unconditionally */
 
        ASN1_OP_COND_MATCH_OR_SKIP      = 0x11,
        ASN1_OP_COND_MATCH_ACT_OR_SKIP  = 0x13,
        ASN1_OP_COND_MATCH_JUMP_OR_SKIP = 0x15,
        ASN1_OP_COND_MATCH_ANY          = 0x18,
+       ASN1_OP_COND_MATCH_ANY_OR_SKIP  = 0x19,
        ASN1_OP_COND_MATCH_ANY_ACT      = 0x1a,
+       ASN1_OP_COND_MATCH_ANY_ACT_OR_SKIP = 0x1b,
 
        /* Everything before here will want a tag from the data */
-#define ASN1_OP__MATCHES_TAG ASN1_OP_COND_MATCH_ANY_ACT
+#define ASN1_OP__MATCHES_TAG ASN1_OP_COND_MATCH_ANY_ACT_OR_SKIP
 
        /* These are here to help fill up space */
-       ASN1_OP_COND_FAIL               = 0x1b,
-       ASN1_OP_COMPLETE                = 0x1c,
-       ASN1_OP_ACT                     = 0x1d,
-       ASN1_OP_RETURN                  = 0x1e,
+       ASN1_OP_COND_FAIL               = 0x1c,
+       ASN1_OP_COMPLETE                = 0x1d,
+       ASN1_OP_ACT                     = 0x1e,
+       ASN1_OP_MAYBE_ACT               = 0x1f,
 
        /* The following eight have bit 0 -> SET, 1 -> OF, 2 -> ACT */
        ASN1_OP_END_SEQ                 = 0x20,
@@ -76,6 +80,8 @@ enum asn1_opcode {
 #define ASN1_OP_END__OF                          0x02
 #define ASN1_OP_END__ACT                 0x04
 
+       ASN1_OP_RETURN                  = 0x28,
+
        ASN1_OP__NR
 };
 
index c2e7e3a83965341207b64dab360a72b9cce780a6..b2abc996c25dabee1c9b55a97667e06ad436ec1e 100644 (file)
@@ -27,6 +27,9 @@
 #include <linux/ptrace.h>
 #include <uapi/linux/audit.h>
 
+#define AUDIT_INO_UNSET ((unsigned long)-1)
+#define AUDIT_DEV_UNSET ((dev_t)-1)
+
 struct audit_sig_info {
        uid_t           uid;
        pid_t           pid;
@@ -59,6 +62,7 @@ struct audit_krule {
        struct audit_field      *inode_f; /* quick access to an inode field */
        struct audit_watch      *watch; /* associated watch */
        struct audit_tree       *tree;  /* associated watched tree */
+       struct audit_fsnotify_mark      *exe;
        struct list_head        rlist;  /* entry in audit_{watch,tree}.rules list */
        struct list_head        list;   /* for AUDIT_LIST* purposes only */
        u64                     prio;
index 0fe9df983ab7410c67143ecdc7a26bac958b4597..5a5d79ee256f24aa2de8fa92778e8e85cdfcd4b2 100644 (file)
@@ -286,7 +286,7 @@ static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi
         * %current's blkcg equals the effective blkcg of its memcg.  No
         * need to use the relatively expensive cgroup_get_e_css().
         */
-       if (likely(wb && wb->blkcg_css == task_css(current, blkio_cgrp_id)))
+       if (likely(wb && wb->blkcg_css == task_css(current, io_cgrp_id)))
                return wb;
        return NULL;
 }
@@ -402,7 +402,7 @@ static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked)
 }
 
 struct wb_iter {
-       int                     start_blkcg_id;
+       int                     start_memcg_id;
        struct radix_tree_iter  tree_iter;
        void                    **slot;
 };
@@ -414,9 +414,9 @@ static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter,
 
        WARN_ON_ONCE(!rcu_read_lock_held());
 
-       if (iter->start_blkcg_id >= 0) {
-               iter->slot = radix_tree_iter_init(titer, iter->start_blkcg_id);
-               iter->start_blkcg_id = -1;
+       if (iter->start_memcg_id >= 0) {
+               iter->slot = radix_tree_iter_init(titer, iter->start_memcg_id);
+               iter->start_memcg_id = -1;
        } else {
                iter->slot = radix_tree_next_slot(iter->slot, titer, 0);
        }
@@ -430,30 +430,30 @@ static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter,
 
 static inline struct bdi_writeback *__wb_iter_init(struct wb_iter *iter,
                                                   struct backing_dev_info *bdi,
-                                                  int start_blkcg_id)
+                                                  int start_memcg_id)
 {
-       iter->start_blkcg_id = start_blkcg_id;
+       iter->start_memcg_id = start_memcg_id;
 
-       if (start_blkcg_id)
+       if (start_memcg_id)
                return __wb_iter_next(iter, bdi);
        else
                return &bdi->wb;
 }
 
 /**
- * bdi_for_each_wb - walk all wb's of a bdi in ascending blkcg ID order
+ * bdi_for_each_wb - walk all wb's of a bdi in ascending memcg ID order
  * @wb_cur: cursor struct bdi_writeback pointer
  * @bdi: bdi to walk wb's of
  * @iter: pointer to struct wb_iter to be used as iteration buffer
- * @start_blkcg_id: blkcg ID to start iteration from
+ * @start_memcg_id: memcg ID to start iteration from
  *
  * Iterate @wb_cur through the wb's (bdi_writeback's) of @bdi in ascending
- * blkcg ID order starting from @start_blkcg_id.  @iter is struct wb_iter
+ * memcg ID order starting from @start_memcg_id.  @iter is struct wb_iter
  * to be used as temp storage during iteration.  rcu_read_lock() must be
  * held throughout iteration.
  */
-#define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id)             \
-       for ((wb_cur) = __wb_iter_init(iter, bdi, start_blkcg_id);      \
+#define bdi_for_each_wb(wb_cur, bdi, iter, start_memcg_id)             \
+       for ((wb_cur) = __wb_iter_init(iter, bdi, start_memcg_id);      \
             (wb_cur); (wb_cur) = __wb_iter_next(iter, bdi))
 
 #else  /* CONFIG_CGROUP_WRITEBACK */
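
A minimal usage sketch for the renamed iterator above (now keyed by memcg ID); count_wbs() is a made-up caller and the per-wb work is a placeholder.

#include <linux/backing-dev.h>
#include <linux/rcupdate.h>

static void count_wbs(struct backing_dev_info *bdi)
{
	struct bdi_writeback *wb;
	struct wb_iter iter;
	int nr = 0;

	rcu_read_lock();			/* must be held across the whole walk */
	bdi_for_each_wb(wb, bdi, &iter, 0)	/* 0: start from the root wb */
		nr++;
	rcu_read_unlock();

	pr_info("bdi has %d writeback(s)\n", nr);
}
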
index a4cd1641e9e2d74ccfd8518d415318401e5a4da8..0a5cc7a1109b9b2c655020c6c849960506f1cbc6 100644 (file)
  */
 
 #include <linux/cgroup.h>
-#include <linux/u64_stats_sync.h>
+#include <linux/percpu_counter.h>
 #include <linux/seq_file.h>
 #include <linux/radix-tree.h>
 #include <linux/blkdev.h>
 #include <linux/atomic.h>
 
+/* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */
+#define BLKG_STAT_CPU_BATCH    (INT_MAX / 2)
+
 /* Max limits for throttle policy */
 #define THROTL_IOPS_MAX                UINT_MAX
 
@@ -45,7 +48,7 @@ struct blkcg {
        struct blkcg_gq                 *blkg_hint;
        struct hlist_head               blkg_list;
 
-       struct blkcg_policy_data        *pd[BLKCG_MAX_POLS];
+       struct blkcg_policy_data        *cpd[BLKCG_MAX_POLS];
 
        struct list_head                all_blkcgs_node;
 #ifdef CONFIG_CGROUP_WRITEBACK
@@ -53,14 +56,19 @@ struct blkcg {
 #endif
 };
 
+/*
+ * blkg_[rw]stat->aux_cnt is excluded for local stats but included for
+ * recursive.  Used to carry stats of dead children, and, for blkg_rwstat,
+ * to carry result values from read and sum operations.
+ */
 struct blkg_stat {
-       struct u64_stats_sync           syncp;
-       uint64_t                        cnt;
+       struct percpu_counter           cpu_cnt;
+       atomic64_t                      aux_cnt;
 };
 
 struct blkg_rwstat {
-       struct u64_stats_sync           syncp;
-       uint64_t                        cnt[BLKG_RWSTAT_NR];
+       struct percpu_counter           cpu_cnt[BLKG_RWSTAT_NR];
+       atomic64_t                      aux_cnt[BLKG_RWSTAT_NR];
 };
 
 /*
@@ -68,32 +76,28 @@ struct blkg_rwstat {
  * request_queue (q).  This is used by blkcg policies which need to track
  * information per blkcg - q pair.
  *
- * There can be multiple active blkcg policies and each has its private
- * data on each blkg, the size of which is determined by
- * blkcg_policy->pd_size.  blkcg core allocates and frees such areas
- * together with blkg and invokes pd_init/exit_fn() methods.
- *
- * Such private data must embed struct blkg_policy_data (pd) at the
- * beginning and pd_size can't be smaller than pd.
+ * There can be multiple active blkcg policies and each blkg:policy pair is
+ * represented by a blkg_policy_data which is allocated and freed by each
+ * policy's pd_alloc/free_fn() methods.  A policy can allocate private data
+ * area by allocating larger data structure which embeds blkg_policy_data
+ * at the beginning.
  */
 struct blkg_policy_data {
        /* the blkg and policy id this per-policy data belongs to */
        struct blkcg_gq                 *blkg;
        int                             plid;
-
-       /* used during policy activation */
-       struct list_head                alloc_node;
 };
 
 /*
- * Policies that need to keep per-blkcg data which is independent
- * from any request_queue associated to it must specify its size
- * with the cpd_size field of the blkcg_policy structure and
- * embed a blkcg_policy_data in it.  cpd_init() is invoked to let
- * each policy handle per-blkcg data.
+ * Policies that need to keep per-blkcg data which is independent from any
+ * request_queue associated to it should implement cpd_alloc/free_fn()
+ * methods.  A policy can allocate private data area by allocating larger
+ * data structure which embeds blkcg_policy_data at the beginning.
+ * cpd_init() is invoked to let each policy handle per-blkcg data.
  */
 struct blkcg_policy_data {
-       /* the policy id this per-policy data belongs to */
+       /* the blkcg and policy id this per-policy data belongs to */
+       struct blkcg                    *blkcg;
        int                             plid;
 };
 
@@ -123,40 +127,50 @@ struct blkcg_gq {
        /* is this blkg online? protected by both blkcg and q locks */
        bool                            online;
 
+       struct blkg_rwstat              stat_bytes;
+       struct blkg_rwstat              stat_ios;
+
        struct blkg_policy_data         *pd[BLKCG_MAX_POLS];
 
        struct rcu_head                 rcu_head;
 };
 
-typedef void (blkcg_pol_init_cpd_fn)(const struct blkcg *blkcg);
-typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg);
-typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg);
-typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg);
-typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg);
-typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg);
+typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp);
+typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd);
+typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd);
+typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd);
+typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, int node);
+typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd);
+typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd);
+typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd);
+typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd);
+typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd);
 
 struct blkcg_policy {
        int                             plid;
-       /* policy specific private data size */
-       size_t                          pd_size;
-       /* policy specific per-blkcg data size */
-       size_t                          cpd_size;
        /* cgroup files for the policy */
-       struct cftype                   *cftypes;
+       struct cftype                   *dfl_cftypes;
+       struct cftype                   *legacy_cftypes;
 
        /* operations */
+       blkcg_pol_alloc_cpd_fn          *cpd_alloc_fn;
        blkcg_pol_init_cpd_fn           *cpd_init_fn;
+       blkcg_pol_free_cpd_fn           *cpd_free_fn;
+       blkcg_pol_bind_cpd_fn           *cpd_bind_fn;
+
+       blkcg_pol_alloc_pd_fn           *pd_alloc_fn;
        blkcg_pol_init_pd_fn            *pd_init_fn;
        blkcg_pol_online_pd_fn          *pd_online_fn;
        blkcg_pol_offline_pd_fn         *pd_offline_fn;
-       blkcg_pol_exit_pd_fn            *pd_exit_fn;
+       blkcg_pol_free_pd_fn            *pd_free_fn;
        blkcg_pol_reset_pd_stats_fn     *pd_reset_stats_fn;
 };
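
A sketch of what a policy looks like under the new pd_alloc_fn()/pd_free_fn() scheme described above: the private data embeds blkg_policy_data at the start. struct my_blkg_data and my_pol are made-up names; plid assignment, cftypes and blkcg_policy_register() are elided.

struct my_blkg_data {
	struct blkg_policy_data	pd;	/* must be the first member */
	u64			nr_dispatched;
};

static struct blkg_policy_data *my_pd_alloc(gfp_t gfp, int node)
{
	struct my_blkg_data *d = kzalloc_node(sizeof(*d), gfp, node);

	return d ? &d->pd : NULL;
}

static void my_pd_free(struct blkg_policy_data *pd)
{
	kfree(container_of(pd, struct my_blkg_data, pd));
}

static struct blkcg_policy my_pol = {
	.pd_alloc_fn	= my_pd_alloc,
	.pd_free_fn	= my_pd_free,
	/* dfl_cftypes/legacy_cftypes and the other callbacks omitted */
};
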
 
 extern struct blkcg blkcg_root;
 extern struct cgroup_subsys_state * const blkcg_root_css;
 
-struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q);
+struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
+                                     struct request_queue *q, bool update_hint);
 struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
                                    struct request_queue *q);
 int blkcg_init_queue(struct request_queue *q);
@@ -171,6 +185,7 @@ int blkcg_activate_policy(struct request_queue *q,
 void blkcg_deactivate_policy(struct request_queue *q,
                             const struct blkcg_policy *pol);
 
+const char *blkg_dev_name(struct blkcg_gq *blkg);
 void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
                       u64 (*prfill)(struct seq_file *,
                                     struct blkg_policy_data *, int),
@@ -182,19 +197,24 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
 u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off);
 u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
                       int off);
+int blkg_print_stat_bytes(struct seq_file *sf, void *v);
+int blkg_print_stat_ios(struct seq_file *sf, void *v);
+int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v);
+int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v);
 
-u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off);
-struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd,
-                                            int off);
+u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg,
+                           struct blkcg_policy *pol, int off);
+struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
+                                            struct blkcg_policy *pol, int off);
 
 struct blkg_conf_ctx {
        struct gendisk                  *disk;
        struct blkcg_gq                 *blkg;
-       u64                             v;
+       char                            *body;
 };
 
 int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
-                  const char *input, struct blkg_conf_ctx *ctx);
+                  char *input, struct blkg_conf_ctx *ctx);
 void blkg_conf_finish(struct blkg_conf_ctx *ctx);
 
 
@@ -205,7 +225,7 @@ static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
 
 static inline struct blkcg *task_blkcg(struct task_struct *tsk)
 {
-       return css_to_blkcg(task_css(tsk, blkio_cgrp_id));
+       return css_to_blkcg(task_css(tsk, io_cgrp_id));
 }
 
 static inline struct blkcg *bio_blkcg(struct bio *bio)
@@ -218,7 +238,7 @@ static inline struct blkcg *bio_blkcg(struct bio *bio)
 static inline struct cgroup_subsys_state *
 task_get_blkcg_css(struct task_struct *task)
 {
-       return task_get_css(task, blkio_cgrp_id);
+       return task_get_css(task, io_cgrp_id);
 }
 
 /**
@@ -232,6 +252,52 @@ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg)
        return css_to_blkcg(blkcg->css.parent);
 }
 
+/**
+ * __blkg_lookup - internal version of blkg_lookup()
+ * @blkcg: blkcg of interest
+ * @q: request_queue of interest
+ * @update_hint: whether to update lookup hint with the result or not
+ *
+ * This is internal version and shouldn't be used by policy
+ * implementations.  Looks up blkgs for the @blkcg - @q pair regardless of
+ * @q's bypass state.  If @update_hint is %true, the caller should be
+ * holding @q->queue_lock and lookup hint is updated on success.
+ */
+static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
+                                            struct request_queue *q,
+                                            bool update_hint)
+{
+       struct blkcg_gq *blkg;
+
+       if (blkcg == &blkcg_root)
+               return q->root_blkg;
+
+       blkg = rcu_dereference(blkcg->blkg_hint);
+       if (blkg && blkg->q == q)
+               return blkg;
+
+       return blkg_lookup_slowpath(blkcg, q, update_hint);
+}
+
+/**
+ * blkg_lookup - lookup blkg for the specified blkcg - q pair
+ * @blkcg: blkcg of interest
+ * @q: request_queue of interest
+ *
+ * Lookup blkg for the @blkcg - @q pair.  This function should be called
+ * under RCU read lock and is guaranteed to return %NULL if @q is bypassing
+ * - see blk_queue_bypass_start() for details.
+ */
+static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
+                                          struct request_queue *q)
+{
+       WARN_ON_ONCE(!rcu_read_lock_held());
+
+       if (unlikely(blk_queue_bypass(q)))
+               return NULL;
+       return __blkg_lookup(blkcg, q, false);
+}
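
A small sketch of the lookup contract spelled out above, assuming a valid blkcg/queue pair; my_read_stat() is illustrative and only touches fields that are stable under RCU.

static u64 my_read_stat(struct blkcg *blkcg, struct request_queue *q)
{
	struct blkcg_gq *blkg;
	u64 bytes = 0;

	rcu_read_lock();
	blkg = blkg_lookup(blkcg, q);	/* NULL if @q is bypassing or no blkg yet */
	if (blkg)
		bytes = blkg_rwstat_total(&blkg->stat_bytes);
	rcu_read_unlock();

	return bytes;
}
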
+
 /**
  * blkg_to_pdata - get policy private data
  * @blkg: blkg of interest
@@ -248,7 +314,7 @@ static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
 static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg,
                                                     struct blkcg_policy *pol)
 {
-       return blkcg ? blkcg->pd[pol->plid] : NULL;
+       return blkcg ? blkcg->cpd[pol->plid] : NULL;
 }
 
 /**
@@ -262,6 +328,11 @@ static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd)
        return pd ? pd->blkg : NULL;
 }
 
+static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd)
+{
+       return cpd ? cpd->blkcg : NULL;
+}
+
 /**
  * blkg_path - format cgroup path of blkg
  * @blkg: blkg of interest
@@ -309,9 +380,6 @@ static inline void blkg_put(struct blkcg_gq *blkg)
                call_rcu(&blkg->rcu_head, __blkg_release_rcu);
 }
 
-struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
-                              bool update_hint);
-
 /**
  * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
  * @d_blkg: loop cursor pointing to the current descendant
@@ -373,8 +441,8 @@ static inline struct request_list *blk_get_rl(struct request_queue *q,
         * or if either the blkcg or queue is going away.  Fall back to
         * root_rl in such cases.
         */
-       blkg = blkg_lookup_create(blkcg, q);
-       if (IS_ERR(blkg))
+       blkg = blkg_lookup(blkcg, q);
+       if (unlikely(!blkg))
                goto root_rl;
 
        blkg_get(blkg);
@@ -394,8 +462,7 @@ root_rl:
  */
 static inline void blk_put_rl(struct request_list *rl)
 {
-       /* root_rl may not have blkg set */
-       if (rl->blkg && rl->blkg->blkcg != &blkcg_root)
+       if (rl->blkg->blkcg != &blkcg_root)
                blkg_put(rl->blkg);
 }
 
@@ -433,9 +500,21 @@ struct request_list *__blk_queue_next_rl(struct request_list *rl,
 #define blk_queue_for_each_rl(rl, q)   \
        for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q)))
 
-static inline void blkg_stat_init(struct blkg_stat *stat)
+static inline int blkg_stat_init(struct blkg_stat *stat, gfp_t gfp)
 {
-       u64_stats_init(&stat->syncp);
+       int ret;
+
+       ret = percpu_counter_init(&stat->cpu_cnt, 0, gfp);
+       if (ret)
+               return ret;
+
+       atomic64_set(&stat->aux_cnt, 0);
+       return 0;
+}
+
+static inline void blkg_stat_exit(struct blkg_stat *stat)
+{
+       percpu_counter_destroy(&stat->cpu_cnt);
 }
 
 /**
@@ -443,34 +522,21 @@ static inline void blkg_stat_init(struct blkg_stat *stat)
  * @stat: target blkg_stat
  * @val: value to add
  *
- * Add @val to @stat.  The caller is responsible for synchronizing calls to
- * this function.
+ * Add @val to @stat.  The caller must ensure that IRQ on the same CPU
+ * don't re-enter this function for the same counter.
  */
 static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val)
 {
-       u64_stats_update_begin(&stat->syncp);
-       stat->cnt += val;
-       u64_stats_update_end(&stat->syncp);
+       __percpu_counter_add(&stat->cpu_cnt, val, BLKG_STAT_CPU_BATCH);
 }
 
 /**
  * blkg_stat_read - read the current value of a blkg_stat
  * @stat: blkg_stat to read
- *
- * Read the current value of @stat.  This function can be called without
- * synchroniztion and takes care of u64 atomicity.
  */
 static inline uint64_t blkg_stat_read(struct blkg_stat *stat)
 {
-       unsigned int start;
-       uint64_t v;
-
-       do {
-               start = u64_stats_fetch_begin_irq(&stat->syncp);
-               v = stat->cnt;
-       } while (u64_stats_fetch_retry_irq(&stat->syncp, start));
-
-       return v;
+       return percpu_counter_sum_positive(&stat->cpu_cnt);
 }
 
 /**
@@ -479,24 +545,46 @@ static inline uint64_t blkg_stat_read(struct blkg_stat *stat)
  */
 static inline void blkg_stat_reset(struct blkg_stat *stat)
 {
-       stat->cnt = 0;
+       percpu_counter_set(&stat->cpu_cnt, 0);
+       atomic64_set(&stat->aux_cnt, 0);
 }
 
 /**
- * blkg_stat_merge - merge a blkg_stat into another
+ * blkg_stat_add_aux - add a blkg_stat into another's aux count
  * @to: the destination blkg_stat
  * @from: the source
  *
- * Add @from's count to @to.
+ * Add @from's count including the aux one to @to's aux count.
  */
-static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from)
+static inline void blkg_stat_add_aux(struct blkg_stat *to,
+                                    struct blkg_stat *from)
 {
-       blkg_stat_add(to, blkg_stat_read(from));
+       atomic64_add(blkg_stat_read(from) + atomic64_read(&from->aux_cnt),
+                    &to->aux_cnt);
 }
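
A lifecycle sketch for the percpu-counter-backed blkg_stat introduced above; my_stat and the values are placeholders, and real users would init/exit these from their pd_alloc_fn()/pd_free_fn().

static struct blkg_stat my_stat;

static int my_stat_demo(void)
{
	int ret = blkg_stat_init(&my_stat, GFP_KERNEL);	/* allocates the percpu counter */

	if (ret)
		return ret;

	blkg_stat_add(&my_stat, 4096);		/* hot path: batched percpu add */
	pr_info("stat=%llu\n",
		(unsigned long long)blkg_stat_read(&my_stat));	/* sums all CPUs */

	blkg_stat_exit(&my_stat);		/* frees the percpu counter */
	return 0;
}
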
 
-static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat)
+static inline int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp)
 {
-       u64_stats_init(&rwstat->syncp);
+       int i, ret;
+
+       for (i = 0; i < BLKG_RWSTAT_NR; i++) {
+               ret = percpu_counter_init(&rwstat->cpu_cnt[i], 0, gfp);
+               if (ret) {
+                       while (--i >= 0)
+                               percpu_counter_destroy(&rwstat->cpu_cnt[i]);
+                       return ret;
+               }
+               atomic64_set(&rwstat->aux_cnt[i], 0);
+       }
+       return 0;
+}
+
+static inline void blkg_rwstat_exit(struct blkg_rwstat *rwstat)
+{
+       int i;
+
+       for (i = 0; i < BLKG_RWSTAT_NR; i++)
+               percpu_counter_destroy(&rwstat->cpu_cnt[i]);
 }
 
 /**
@@ -511,39 +599,38 @@ static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat)
 static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
                                   int rw, uint64_t val)
 {
-       u64_stats_update_begin(&rwstat->syncp);
+       struct percpu_counter *cnt;
 
        if (rw & REQ_WRITE)
-               rwstat->cnt[BLKG_RWSTAT_WRITE] += val;
+               cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE];
        else
-               rwstat->cnt[BLKG_RWSTAT_READ] += val;
+               cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ];
+
+       __percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH);
+
        if (rw & REQ_SYNC)
-               rwstat->cnt[BLKG_RWSTAT_SYNC] += val;
+               cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC];
        else
-               rwstat->cnt[BLKG_RWSTAT_ASYNC] += val;
+               cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC];
 
-       u64_stats_update_end(&rwstat->syncp);
+       __percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH);
 }
 
 /**
  * blkg_rwstat_read - read the current values of a blkg_rwstat
  * @rwstat: blkg_rwstat to read
  *
- * Read the current snapshot of @rwstat and return it as the return value.
- * This function can be called without synchronization and takes care of
- * u64 atomicity.
+ * Read the current snapshot of @rwstat and return it in the aux counts.
  */
 static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat)
 {
-       unsigned int start;
-       struct blkg_rwstat tmp;
-
-       do {
-               start = u64_stats_fetch_begin_irq(&rwstat->syncp);
-               tmp = *rwstat;
-       } while (u64_stats_fetch_retry_irq(&rwstat->syncp, start));
+       struct blkg_rwstat result;
+       int i;
 
-       return tmp;
+       for (i = 0; i < BLKG_RWSTAT_NR; i++)
+               atomic64_set(&result.aux_cnt[i],
+                            percpu_counter_sum_positive(&rwstat->cpu_cnt[i]));
+       return result;
 }
 
 /**
@@ -558,7 +645,8 @@ static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat)
 {
        struct blkg_rwstat tmp = blkg_rwstat_read(rwstat);
 
-       return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE];
+       return atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
+               atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
 }
 
 /**
@@ -567,26 +655,71 @@ static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat)
  */
 static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
 {
-       memset(rwstat->cnt, 0, sizeof(rwstat->cnt));
+       int i;
+
+       for (i = 0; i < BLKG_RWSTAT_NR; i++) {
+               percpu_counter_set(&rwstat->cpu_cnt[i], 0);
+               atomic64_set(&rwstat->aux_cnt[i], 0);
+       }
 }
 
 /**
- * blkg_rwstat_merge - merge a blkg_rwstat into another
+ * blkg_rwstat_add_aux - add a blkg_rwstat into another's aux count
  * @to: the destination blkg_rwstat
  * @from: the source
  *
- * Add @from's counts to @to.
+ * Add @from's count including the aux one to @to's aux count.
  */
-static inline void blkg_rwstat_merge(struct blkg_rwstat *to,
-                                    struct blkg_rwstat *from)
+static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to,
+                                      struct blkg_rwstat *from)
 {
        struct blkg_rwstat v = blkg_rwstat_read(from);
        int i;
 
-       u64_stats_update_begin(&to->syncp);
        for (i = 0; i < BLKG_RWSTAT_NR; i++)
-               to->cnt[i] += v.cnt[i];
-       u64_stats_update_end(&to->syncp);
+               atomic64_add(atomic64_read(&v.aux_cnt[i]) +
+                            atomic64_read(&from->aux_cnt[i]),
+                            &to->aux_cnt[i]);
+}
+
+#ifdef CONFIG_BLK_DEV_THROTTLING
+extern bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
+                          struct bio *bio);
+#else
+static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
+                                 struct bio *bio) { return false; }
+#endif
+
+static inline bool blkcg_bio_issue_check(struct request_queue *q,
+                                        struct bio *bio)
+{
+       struct blkcg *blkcg;
+       struct blkcg_gq *blkg;
+       bool throtl = false;
+
+       rcu_read_lock();
+       blkcg = bio_blkcg(bio);
+
+       blkg = blkg_lookup(blkcg, q);
+       if (unlikely(!blkg)) {
+               spin_lock_irq(q->queue_lock);
+               blkg = blkg_lookup_create(blkcg, q);
+               if (IS_ERR(blkg))
+                       blkg = NULL;
+               spin_unlock_irq(q->queue_lock);
+       }
+
+       throtl = blk_throtl_bio(q, blkg, bio);
+
+       if (!throtl) {
+               blkg = blkg ?: q->root_blkg;
+               blkg_rwstat_add(&blkg->stat_bytes, bio->bi_flags,
+                               bio->bi_iter.bi_size);
+               blkg_rwstat_add(&blkg->stat_ios, bio->bi_flags, 1);
+       }
+
+       rcu_read_unlock();
+       return !throtl;
 }
 
 #else  /* CONFIG_BLK_CGROUP */
@@ -642,6 +775,9 @@ static inline void blk_put_rl(struct request_list *rl) { }
 static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { }
 static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; }
 
+static inline bool blkcg_bio_issue_check(struct request_queue *q,
+                                        struct bio *bio) { return true; }
+
 #define blk_queue_for_each_rl(rl, q)   \
        for ((rl) = &(q)->root_rl; (rl); (rl) = NULL)
 
index a622f270f09e78fce0bf06a73de65827ef4fddf7..38a5ff772a37fbbd841a1152d9f613bb1ffde84c 100644 (file)
@@ -584,7 +584,7 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
 
 #define list_entry_rq(ptr)     list_entry((ptr), struct request, queuelist)
 
-#define rq_data_dir(rq)                (((rq)->cmd_flags & 1) != 0)
+#define rq_data_dir(rq)                ((int)((rq)->cmd_flags & 1))
 
 /*
  * Driver can handle struct request, if it either has an old style
@@ -1569,8 +1569,8 @@ struct block_device_operations {
        int (*rw_page)(struct block_device *, sector_t, struct page *, int rw);
        int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
        int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
-       long (*direct_access)(struct block_device *, sector_t,
-                                       void **, unsigned long *pfn, long size);
+       long (*direct_access)(struct block_device *, sector_t, void __pmem **,
+                       unsigned long *pfn);
        unsigned int (*check_events) (struct gendisk *disk,
                                      unsigned int clearing);
        /* ->media_changed() is DEPRECATED, use ->check_events() instead */
@@ -1588,8 +1588,8 @@ extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int,
 extern int bdev_read_page(struct block_device *, sector_t, struct page *);
 extern int bdev_write_page(struct block_device *, sector_t, struct page *,
                                                struct writeback_control *);
-extern long bdev_direct_access(struct block_device *, sector_t, void **addr,
-                                               unsigned long *pfn, long size);
+extern long bdev_direct_access(struct block_device *, sector_t,
+               void __pmem **addr, unsigned long *pfn, long size);
 #else /* CONFIG_BLOCK */
 
 struct block_device;
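
A sketch of the updated bdev_direct_access() calling convention with the new __pmem annotation; map_pmem() is illustrative, and the return-value semantics (bytes available or a negative errno) are stated here as an assumption.

#include <linux/blkdev.h>

static long map_pmem(struct block_device *bdev, sector_t sector, long size)
{
	void __pmem *addr;
	unsigned long pfn;
	long avail;

	avail = bdev_direct_access(bdev, sector, &addr, &pfn, size);
	if (avail < 0)
		return avail;	/* e.g. the device does not support DAX */

	/* up to 'avail' bytes of persistent memory at 'addr' are now addressable */
	return avail;
}
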
index 4763ad64e832cb07496bbd14a35c4bcf3825a885..f89b31d45cc894814d409a19e161a29c2c41e832 100644 (file)
@@ -107,6 +107,7 @@ static inline u64 ceph_sanitize_features(u64 features)
         CEPH_FEATURE_OSDMAP_ENC |              \
         CEPH_FEATURE_CRUSH_TUNABLES3 |         \
         CEPH_FEATURE_OSD_PRIMARY_AFFINITY |    \
+        CEPH_FEATURE_MSGR_KEEPALIVE2 |         \
         CEPH_FEATURE_CRUSH_V4)
 
 #define CEPH_FEATURES_REQUIRED_DEFAULT   \
index 9ebee53d3bf586ef80690fa778c0a3e030e410f1..397c5cd09794854ebc8891fac59b4dfa50e7e141 100644 (file)
@@ -46,6 +46,7 @@ struct ceph_options {
        unsigned long mount_timeout;            /* jiffies */
        unsigned long osd_idle_ttl;             /* jiffies */
        unsigned long osd_keepalive_timeout;    /* jiffies */
+       unsigned long monc_ping_timeout;        /* jiffies */
 
        /*
         * any type that can't be simply compared or doesn't need
@@ -66,6 +67,7 @@ struct ceph_options {
 #define CEPH_MOUNT_TIMEOUT_DEFAULT     msecs_to_jiffies(60 * 1000)
 #define CEPH_OSD_KEEPALIVE_DEFAULT     msecs_to_jiffies(5 * 1000)
 #define CEPH_OSD_IDLE_TTL_DEFAULT      msecs_to_jiffies(60 * 1000)
+#define CEPH_MONC_PING_TIMEOUT_DEFAULT msecs_to_jiffies(30 * 1000)
 
 #define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
 #define CEPH_MSG_MAX_MIDDLE_LEN        (16*1024*1024)
index 37753278987ac654a5e00797e6d6b81e4b2353c0..b2371d9b51fac6ea69032576ac1c3cddfc104016 100644 (file)
@@ -238,6 +238,8 @@ struct ceph_connection {
        bool out_kvec_is_msg; /* kvec refers to out_msg */
        int out_more;        /* there is more data after the kvecs */
        __le64 out_temp_ack; /* for writing an ack */
+       struct ceph_timespec out_temp_keepalive2; /* for writing keepalive2
+                                                    stamp */
 
        /* message in temps */
        struct ceph_msg_header in_hdr;
@@ -248,6 +250,8 @@ struct ceph_connection {
        int in_base_pos;     /* bytes read */
        __le64 in_temp_ack;  /* for reading an ack */
 
+       struct timespec last_keepalive_ack; /* keepalive2 ack stamp */
+
        struct delayed_work work;           /* send|recv work */
        unsigned long       delay;          /* current delay interval */
 };
@@ -285,6 +289,8 @@ extern void ceph_msg_revoke(struct ceph_msg *msg);
 extern void ceph_msg_revoke_incoming(struct ceph_msg *msg);
 
 extern void ceph_con_keepalive(struct ceph_connection *con);
+extern bool ceph_con_keepalive_expired(struct ceph_connection *con,
+                                      unsigned long interval);
 
 extern void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages,
                                size_t length, size_t alignment);
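
A sketch of how the new keepalive2 helpers above might be used to detect a hung session; check_mon_session() is a made-up caller, and ping_timeout would come from the new monc_ping_timeout option.

#include <linux/ceph/messenger.h>

static void check_mon_session(struct ceph_connection *con,
			      unsigned long ping_timeout)
{
	ceph_con_keepalive(con);	/* queues a KEEPALIVE2 carrying a timestamp */

	if (ceph_con_keepalive_expired(con, ping_timeout)) {
		/* no KEEPALIVE2_ACK within the interval: treat the session as dead */
		pr_warn("ceph: monitor session appears hung, reconnecting\n");
	}
}
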
index 1c1887206ffa928264acbc1cf63aa3b36717b3b9..0fe2656ac415711cce5bfc53f3c9ed753bf3ed26 100644 (file)
@@ -84,10 +84,12 @@ struct ceph_entity_inst {
 #define CEPH_MSGR_TAG_MSG           7  /* message */
 #define CEPH_MSGR_TAG_ACK           8  /* message ack */
 #define CEPH_MSGR_TAG_KEEPALIVE     9  /* just a keepalive byte! */
-#define CEPH_MSGR_TAG_BADPROTOVER  10  /* bad protocol version */
+#define CEPH_MSGR_TAG_BADPROTOVER   10 /* bad protocol version */
 #define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
 #define CEPH_MSGR_TAG_FEATURES      12 /* insufficient features */
 #define CEPH_MSGR_TAG_SEQ           13 /* 64-bit int follows with seen seq number */
+#define CEPH_MSGR_TAG_KEEPALIVE2    14 /* keepalive2 byte + ceph_timespec */
+#define CEPH_MSGR_TAG_KEEPALIVE2_ACK 15 /* keepalive2 reply */
 
 
 /*
index 1f36945fd23da3b6b77e928f3d41889e8000e783..1a96fdaa33d54befd4fde55b5af79c1c51b887ac 100644 (file)
@@ -27,7 +27,7 @@ SUBSYS(cpuacct)
 #endif
 
 #if IS_ENABLED(CONFIG_BLK_CGROUP)
-SUBSYS(blkio)
+SUBSYS(io)
 #endif
 
 #if IS_ENABLED(CONFIG_MEMCG)
index 31ce435981feb962b4f1a9a9494ad97913232dcd..bdcf358dfce2a9d0f5420d9fbde6066b1e2dc73b 100644 (file)
 struct clock_event_device;
 struct module;
 
-/* Clock event mode commands for legacy ->set_mode(): OBSOLETE */
-enum clock_event_mode {
-       CLOCK_EVT_MODE_UNUSED,
-       CLOCK_EVT_MODE_SHUTDOWN,
-       CLOCK_EVT_MODE_PERIODIC,
-       CLOCK_EVT_MODE_ONESHOT,
-       CLOCK_EVT_MODE_RESUME,
-};
-
 /*
  * Possible states of a clock event device.
  *
@@ -86,16 +77,14 @@ enum clock_event_state {
  * @min_delta_ns:      minimum delta value in ns
  * @mult:              nanosecond to cycles multiplier
  * @shift:             nanoseconds to cycles divisor (power of two)
- * @mode:              operating mode, relevant only to ->set_mode(), OBSOLETE
  * @state_use_accessors:current state of the device, assigned by the core code
  * @features:          features
  * @retries:           number of forced programming retries
- * @set_mode:          legacy set mode function, only for modes <= CLOCK_EVT_MODE_RESUME.
- * @set_state_periodic:        switch state to periodic, if !set_mode
- * @set_state_oneshot: switch state to oneshot, if !set_mode
- * @set_state_oneshot_stopped: switch state to oneshot_stopped, if !set_mode
- * @set_state_shutdown:        switch state to shutdown, if !set_mode
- * @tick_resume:       resume clkevt device, if !set_mode
+ * @set_state_periodic:        switch state to periodic
+ * @set_state_oneshot: switch state to oneshot
+ * @set_state_oneshot_stopped: switch state to oneshot_stopped
+ * @set_state_shutdown:        switch state to shutdown
+ * @tick_resume:       resume clkevt device
  * @broadcast:         function to broadcast events
  * @min_delta_ticks:   minimum delta value in ticks stored for reconfiguration
  * @max_delta_ticks:   maximum delta value in ticks stored for reconfiguration
@@ -116,18 +105,10 @@ struct clock_event_device {
        u64                     min_delta_ns;
        u32                     mult;
        u32                     shift;
-       enum clock_event_mode   mode;
        enum clock_event_state  state_use_accessors;
        unsigned int            features;
        unsigned long           retries;
 
-       /*
-        * State transition callback(s): Only one of the two groups should be
-        * defined:
-        * - set_mode(), only for modes <= CLOCK_EVT_MODE_RESUME.
-        * - set_state_{shutdown|periodic|oneshot|oneshot_stopped}(), tick_resume().
-        */
-       void                    (*set_mode)(enum clock_event_mode mode, struct clock_event_device *);
        int                     (*set_state_periodic)(struct clock_event_device *);
        int                     (*set_state_oneshot)(struct clock_event_device *);
        int                     (*set_state_oneshot_stopped)(struct clock_event_device *);
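
With set_mode() gone, drivers describe transitions through the per-state callbacks listed above. A minimal sketch, with the my_timer_* names and the hardware accesses made up:

#include <linux/clockchips.h>

static int my_timer_shutdown(struct clock_event_device *evt)
{
	/* mask the hardware timer interrupt */
	return 0;
}

static int my_timer_set_periodic(struct clock_event_device *evt)
{
	/* program the reload value and unmask the interrupt */
	return 0;
}

static struct clock_event_device my_timer_clockevent = {
	.name			= "my-timer",
	.features		= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
	.set_state_shutdown	= my_timer_shutdown,
	.set_state_periodic	= my_timer_set_periodic,
	/* rating, cpumask, set_next_event etc. omitted for brevity */
};
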
diff --git a/include/linux/dax.h b/include/linux/dax.h
new file mode 100644 (file)
index 0000000..b415e52
--- /dev/null
@@ -0,0 +1,39 @@
+#ifndef _LINUX_DAX_H
+#define _LINUX_DAX_H
+
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <asm/pgtable.h>
+
+ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t,
+                 get_block_t, dio_iodone_t, int flags);
+int dax_clear_blocks(struct inode *, sector_t block, long size);
+int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
+int dax_truncate_page(struct inode *, loff_t from, get_block_t);
+int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
+               dax_iodone_t);
+int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
+               dax_iodone_t);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
+                               unsigned int flags, get_block_t, dax_iodone_t);
+int __dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
+                               unsigned int flags, get_block_t, dax_iodone_t);
+#else
+static inline int dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
+                               pmd_t *pmd, unsigned int flags, get_block_t gb,
+                               dax_iodone_t di)
+{
+       return VM_FAULT_FALLBACK;
+}
+#define __dax_pmd_fault dax_pmd_fault
+#endif
+int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
+#define dax_mkwrite(vma, vmf, gb, iod)         dax_fault(vma, vmf, gb, iod)
+#define __dax_mkwrite(vma, vmf, gb, iod)       __dax_fault(vma, vmf, gb, iod)
+
+static inline bool vma_is_dax(struct vm_area_struct *vma)
+{
+       return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host);
+}
+#endif
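
A sketch of how a DAX-capable filesystem might route faults through the helpers in this new header; my_get_block is a placeholder get_block_t, the NULL dax_iodone_t and the vm_ops wiring are assumptions, and error paths are simplified.

#include <linux/dax.h>
#include <linux/mm.h>

static int my_get_block(struct inode *inode, sector_t iblock,
			struct buffer_head *bh, int create)
{
	return 0;	/* placeholder block mapping */
}

static int my_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	return dax_fault(vma, vmf, my_get_block, NULL);
}

static int my_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
			pmd_t *pmd, unsigned int flags)
{
	/* resolves to VM_FAULT_FALLBACK when THP support is not built in */
	return dax_pmd_fault(vma, addr, pmd, flags, my_get_block, NULL);
}

static const struct vm_operations_struct my_dax_vm_ops = {
	.fault		= my_fault,
	.pmd_fault	= my_pmd_fault,
	.pfn_mkwrite	= dax_pfn_mkwrite,
};
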
index 420311bcee38c291cf75894ebfe4c2d1141da1a7..9beb636b97ebcd4713c79dcb63f8d5809b58410f 100644 (file)
@@ -116,6 +116,12 @@ struct dentry *debugfs_create_devm_seqfile(struct device *dev, const char *name,
 
 bool debugfs_initialized(void);
 
+ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf,
+                              size_t count, loff_t *ppos);
+
+ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf,
+                               size_t count, loff_t *ppos);
+
 #else
 
 #include <linux/err.h>
@@ -282,6 +288,20 @@ static inline struct dentry *debugfs_create_devm_seqfile(struct device *dev,
        return ERR_PTR(-ENODEV);
 }
 
+static inline ssize_t debugfs_read_file_bool(struct file *file,
+                                            char __user *user_buf,
+                                            size_t count, loff_t *ppos)
+{
+       return -ENODEV;
+}
+
+static inline ssize_t debugfs_write_file_bool(struct file *file,
+                                             const char __user *user_buf,
+                                             size_t count, loff_t *ppos)
+{
+       return -ENODEV;
+}
+
 #endif
 
 #endif
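
A sketch of why these helpers are exported: a driver can reuse them inside custom fops, e.g. to add a side effect on write. my_bool_write() and the flag are made up, and file->private_data must point at whatever debugfs_create_bool() expects in this kernel.

#include <linux/debugfs.h>
#include <linux/fs.h>

static ssize_t my_bool_write(struct file *file, const char __user *user_buf,
			     size_t count, loff_t *ppos)
{
	ssize_t ret = debugfs_write_file_bool(file, user_buf, count, ppos);

	if (ret > 0)
		pr_info("debug flag updated\n");	/* extra side effect */
	return ret;
}

static const struct file_operations my_bool_fops = {
	.read	= debugfs_read_file_bool,
	.write	= my_bool_write,
	.open	= simple_open,
	.llseek	= default_llseek,
};
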
index e1043f79122f827405afbb7b176d14611702098a..53ba737505df31c7673c814d8d00884f72a17cb1 100644 (file)
@@ -24,6 +24,12 @@ void dma_pool_destroy(struct dma_pool *pool);
 void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
                     dma_addr_t *handle);
 
+static inline void *dma_pool_zalloc(struct dma_pool *pool, gfp_t mem_flags,
+                                   dma_addr_t *handle)
+{
+       return dma_pool_alloc(pool, mem_flags | __GFP_ZERO, handle);
+}
+
 void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t addr);
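
A minimal usage sketch for the new dma_pool_zalloc() wrapper above; the pool name, sizes and the immediate free are placeholders.

#include <linux/dmapool.h>

static int my_alloc_desc(struct device *dev)
{
	struct dma_pool *pool;
	dma_addr_t handle;
	void *desc;

	pool = dma_pool_create("my-desc", dev, 64, 64, 0);
	if (!pool)
		return -ENOMEM;

	desc = dma_pool_zalloc(pool, GFP_KERNEL, &handle);	/* returned memory is zeroed */
	if (!desc) {
		dma_pool_destroy(pool);
		return -ENOMEM;
	}

	dma_pool_free(pool, desc, handle);
	dma_pool_destroy(pool);
	return 0;
}
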
 
 /*
index 043f3283b71c42cd72ecc99a9006eb9b862f5f75..bc9afa74ee11cb3f7c2876280aad4da50ad6df7b 100644 (file)
@@ -788,7 +788,7 @@ struct dmt_videomode {
 
 extern const char *fb_mode_option;
 extern const struct fb_videomode vesa_modes[];
-extern const struct fb_videomode cea_modes[64];
+extern const struct fb_videomode cea_modes[65];
 extern const struct dmt_videomode dmt_modes[];
 
 struct fb_modelist {
index b2f9b9c25e419255a08ece5031315b62db3ecd82..72d8a844c692b2e623b90bfb37b162d923aeb9de 100644 (file)
@@ -52,7 +52,6 @@ struct swap_info_struct;
 struct seq_file;
 struct workqueue_struct;
 struct iov_iter;
-struct vm_fault;
 
 extern void __init inode_init(void);
 extern void __init inode_init_early(void);
@@ -2678,19 +2677,6 @@ extern loff_t fixed_size_llseek(struct file *file, loff_t offset,
 extern int generic_file_open(struct inode * inode, struct file * filp);
 extern int nonseekable_open(struct inode * inode, struct file * filp);
 
-ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t,
-                 get_block_t, dio_iodone_t, int flags);
-int dax_clear_blocks(struct inode *, sector_t block, long size);
-int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
-int dax_truncate_page(struct inode *, loff_t from, get_block_t);
-int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
-               dax_iodone_t);
-int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
-               dax_iodone_t);
-int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
-#define dax_mkwrite(vma, vmf, gb, iod)         dax_fault(vma, vmf, gb, iod)
-#define __dax_mkwrite(vma, vmf, gb, iod)       __dax_fault(vma, vmf, gb, iod)
-
 #ifdef CONFIG_BLOCK
 typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode,
                            loff_t file_offset);
index ad35f300b9a46019b43eee1b49c5c0fb281ada10..f92cbd2f44507adda1333d60adc5fb4686a3f029 100644 (file)
@@ -63,7 +63,10 @@ struct vm_area_struct;
  * but it is definitely preferable to use the flag rather than opencode endless
  * loop around allocator.
  *
- * __GFP_NORETRY: The VM implementation must not retry indefinitely.
+ * __GFP_NORETRY: The VM implementation must not retry indefinitely and will
+ * return NULL when direct reclaim and memory compaction have failed to allow
+ * the allocation to succeed.  The OOM killer is not called with the current
+ * implementation.
  *
  * __GFP_MOVABLE: Flag that this page will be movable by the page migration
  * mechanism or reclaimed
@@ -300,22 +303,31 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
        return __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL);
 }
 
-static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
-                                               unsigned int order)
+/*
+ * Allocate pages, preferring the node given as nid. The node must be valid and
+ * online. For more general interface, see alloc_pages_node().
+ */
+static inline struct page *
+__alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
 {
-       /* Unknown node is current node */
-       if (nid < 0)
-               nid = numa_node_id();
+       VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
+       VM_WARN_ON(!node_online(nid));
 
        return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
 }
 
-static inline struct page *alloc_pages_exact_node(int nid, gfp_t gfp_mask,
+/*
+ * Allocate pages, preferring the node given as nid. When nid == NUMA_NO_NODE,
+ * prefer the current CPU's closest node. Otherwise node must be valid and
+ * online.
+ */
+static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
                                                unsigned int order)
 {
-       VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES || !node_online(nid));
+       if (nid == NUMA_NO_NODE)
+               nid = numa_mem_id();
 
-       return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
+       return __alloc_pages_node(nid, gfp_mask, order);
 }
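
A sketch contrasting the two node allocators as documented above; grab_page() and grab_page_on() are made-up wrappers.

#include <linux/gfp.h>
#include <linux/numa.h>

/* nid may be NUMA_NO_NODE: falls back to the current CPU's nearest node */
static struct page *grab_page(int nid, unsigned int order)
{
	return alloc_pages_node(nid, GFP_KERNEL, order);
}

/* caller must guarantee nid is a valid, online node */
static struct page *grab_page_on(int nid, unsigned int order)
{
	return __alloc_pages_node(nid, GFP_KERNEL | __GFP_THISNODE, order);
}
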
 
 #ifdef CONFIG_NUMA
@@ -354,7 +366,6 @@ extern unsigned long get_zeroed_page(gfp_t gfp_mask);
 
 void *alloc_pages_exact(size_t size, gfp_t gfp_mask);
 void free_pages_exact(void *virt, size_t size);
-/* This is different from alloc_pages_exact_node !!! */
 void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask);
 
 #define __get_free_page(gfp_mask) \
index f10b20f051599287a40188d322ead088726e2296..ecb080d6ff42077513f03b95537dc108bded9e07 100644 (file)
@@ -33,6 +33,8 @@ extern int move_huge_pmd(struct vm_area_struct *vma,
 extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                        unsigned long addr, pgprot_t newprot,
                        int prot_numa);
+int vmf_insert_pfn_pmd(struct vm_area_struct *, unsigned long addr, pmd_t *,
+                       unsigned long pfn, bool write);
 
 enum transparent_hugepage_flag {
        TRANSPARENT_HUGEPAGE_FLAG,
@@ -122,7 +124,7 @@ extern void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
 #endif
 extern int hugepage_madvise(struct vm_area_struct *vma,
                            unsigned long *vm_flags, int advice);
-extern void __vma_adjust_trans_huge(struct vm_area_struct *vma,
+extern void vma_adjust_trans_huge(struct vm_area_struct *vma,
                                    unsigned long start,
                                    unsigned long end,
                                    long adjust_next);
@@ -138,15 +140,6 @@ static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
        else
                return 0;
 }
-static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
-                                        unsigned long start,
-                                        unsigned long end,
-                                        long adjust_next)
-{
-       if (!vma->anon_vma || vma->vm_ops)
-               return;
-       __vma_adjust_trans_huge(vma, start, end, adjust_next);
-}
 static inline int hpage_nr_pages(struct page *page)
 {
        if (unlikely(PageTransHuge(page)))
@@ -164,6 +157,13 @@ static inline bool is_huge_zero_page(struct page *page)
        return ACCESS_ONCE(huge_zero_page) == page;
 }
 
+static inline bool is_huge_zero_pmd(pmd_t pmd)
+{
+       return is_huge_zero_page(pmd_page(pmd));
+}
+
+struct page *get_huge_zero_page(void);
+
 #else /* CONFIG_TRANSPARENT_HUGEPAGE */
 #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
 #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
index d891f949466ae2b11d2fd063a4493044e5dc9618..5e35379f58a53d09cf50bd693d0e8639c7b79709 100644 (file)
@@ -35,6 +35,9 @@ struct resv_map {
        struct kref refs;
        spinlock_t lock;
        struct list_head regions;
+       long adds_in_progress;
+       struct list_head region_cache;
+       long region_cache_count;
 };
 extern struct resv_map *resv_map_alloc(void);
 void resv_map_release(struct kref *ref);
@@ -80,11 +83,18 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 int hugetlb_reserve_pages(struct inode *inode, long from, long to,
                                                struct vm_area_struct *vma,
                                                vm_flags_t vm_flags);
-void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
+long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
+                                               long freed);
 int dequeue_hwpoisoned_huge_page(struct page *page);
 bool isolate_huge_page(struct page *page, struct list_head *list);
 void putback_active_hugepage(struct page *page);
 void free_huge_page(struct page *page);
+void hugetlb_fix_reserve_counts(struct inode *inode, bool restore_reserve);
+extern struct mutex *hugetlb_fault_mutex_table;
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
+                               struct vm_area_struct *vma,
+                               struct address_space *mapping,
+                               pgoff_t idx, unsigned long address);
 
 #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
 pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud);
@@ -320,9 +330,13 @@ struct huge_bootmem_page {
 #endif
 };
 
+struct page *alloc_huge_page(struct vm_area_struct *vma,
+                               unsigned long addr, int avoid_reserve);
 struct page *alloc_huge_page_node(struct hstate *h, int nid);
 struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
                                unsigned long addr, int avoid_reserve);
+int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
+                       pgoff_t idx);
 
 /* arch callback */
 int __init alloc_bootmem_huge_page(struct hstate *h);
@@ -471,6 +485,7 @@ static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
 
 #else  /* CONFIG_HUGETLB_PAGE */
 struct hstate {};
+#define alloc_huge_page(v, a, r) NULL
 #define alloc_huge_page_node(h, nid) NULL
 #define alloc_huge_page_noerr(v, a, r) NULL
 #define alloc_bootmem_huge_page(h) NULL
index e83a738a3b8741c9fe6401d6d89d8d2e02900f52..768063baafbf5efe6c122a123b0c3f9bfe3bd050 100644 (file)
@@ -121,6 +121,9 @@ extern s32 i2c_smbus_read_i2c_block_data(const struct i2c_client *client,
 extern s32 i2c_smbus_write_i2c_block_data(const struct i2c_client *client,
                                          u8 command, u8 length,
                                          const u8 *values);
+extern s32
+i2c_smbus_read_i2c_block_data_or_emulated(const struct i2c_client *client,
+                                         u8 command, u8 length, u8 *values);
 #endif /* I2C */
 
 /**
@@ -550,11 +553,12 @@ void i2c_lock_adapter(struct i2c_adapter *);
 void i2c_unlock_adapter(struct i2c_adapter *);
 
 /*flags for the client struct: */
-#define I2C_CLIENT_PEC 0x04            /* Use Packet Error Checking */
-#define I2C_CLIENT_TEN 0x10            /* we have a ten bit chip address */
+#define I2C_CLIENT_PEC         0x04    /* Use Packet Error Checking */
+#define I2C_CLIENT_TEN         0x10    /* we have a ten bit chip address */
                                        /* Must equal I2C_M_TEN below */
-#define I2C_CLIENT_WAKE        0x80            /* for board_info; true iff can wake */
-#define I2C_CLIENT_SCCB        0x9000          /* Use Omnivision SCCB protocol */
+#define I2C_CLIENT_SLAVE       0x20    /* we are the slave */
+#define I2C_CLIENT_WAKE                0x80    /* for board_info; true iff can wake */
+#define I2C_CLIENT_SCCB                0x9000  /* Use Omnivision SCCB protocol */
                                        /* Must match I2C_M_STOP|IGNORE_NAK */
 
 /* i2c adapter classes (bitmask) */
@@ -638,6 +642,8 @@ extern struct i2c_client *of_find_i2c_device_by_node(struct device_node *node);
 /* must call put_device() when done with returned i2c_adapter device */
 extern struct i2c_adapter *of_find_i2c_adapter_by_node(struct device_node *node);
 
+/* must call i2c_put_adapter() when done with returned i2c_adapter device */
+struct i2c_adapter *of_get_i2c_adapter_by_node(struct device_node *node);
 #else
 
 static inline struct i2c_client *of_find_i2c_device_by_node(struct device_node *node)
@@ -649,6 +655,11 @@ static inline struct i2c_adapter *of_find_i2c_adapter_by_node(struct device_node
 {
        return NULL;
 }
+
+static inline struct i2c_adapter *of_get_i2c_adapter_by_node(struct device_node *node)
+{
+       return NULL;
+}
 #endif /* CONFIG_OF */
 
 #endif /* _LINUX_I2C_H */
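
A sketch of the refcounting contract of the new OF helper above; my_attach_to_adapter() and the deferral policy are illustrative.

#include <linux/i2c.h>
#include <linux/of.h>

static int my_attach_to_adapter(struct device_node *np)
{
	struct i2c_adapter *adap = of_get_i2c_adapter_by_node(np);

	if (!adap)
		return -EPROBE_DEFER;

	/* ... communicate with devices behind 'adap' ... */

	i2c_put_adapter(adap);	/* drop the reference taken by the lookup */
	return 0;
}
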
index d9a366d24e3bb8736cc7acdaa4f541c9720747a6..6240063bdcac4c42da3f108e9b81c5800085c29b 100644 (file)
@@ -344,7 +344,7 @@ struct intel_iommu {
 
 #ifdef CONFIG_INTEL_IOMMU
        unsigned long   *domain_ids; /* bitmap of domains */
-       struct dmar_domain **domains; /* ptr to domains */
+       struct dmar_domain ***domains; /* ptr to domains */
        spinlock_t      lock; /* protect context, domain ids */
        struct root_entry *root_entry; /* virtual address */
 
index c27dde7215b5b291394747d35e1f4a19d9f1ac8e..e399029b68c5bbdc5ae613c681217c9c554c0f9f 100644 (file)
@@ -21,7 +21,7 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/bug.h>
-#include <asm/io.h>
+#include <linux/io.h>
 #include <asm/page.h>
 
 /*
index fb5a99800e77faf6481363fb1fe43bb156d8b213..de64c1e536125fc30296fba6678e028f9bf7d529 100644 (file)
 
 #include <linux/types.h>
 #include <linux/init.h>
+#include <linux/bug.h>
+#include <linux/err.h>
 #include <asm/io.h>
 #include <asm/page.h>
 
 struct device;
+struct resource;
 
 __visible void __iowrite32_copy(void __iomem *to, const void *from, size_t count);
 void __iowrite64_copy(void __iomem *to, const void *from, size_t count);
@@ -80,6 +83,27 @@ int check_signature(const volatile void __iomem *io_addr,
                        const unsigned char *signature, int length);
 void devm_ioremap_release(struct device *dev, void *res);
 
+void *devm_memremap(struct device *dev, resource_size_t offset,
+               size_t size, unsigned long flags);
+void devm_memunmap(struct device *dev, void *addr);
+
+void *__devm_memremap_pages(struct device *dev, struct resource *res);
+
+#ifdef CONFIG_ZONE_DEVICE
+void *devm_memremap_pages(struct device *dev, struct resource *res);
+#else
+static inline void *devm_memremap_pages(struct device *dev, struct resource *res)
+{
+       /*
+        * Fail attempts to call devm_memremap_pages() without
+        * ZONE_DEVICE support enabled; callers must fall
+        * back to plain devm_memremap() based on config.
+        */
+       WARN_ON_ONCE(1);
+       return ERR_PTR(-ENXIO);
+}
+#endif
+
 /*
  * Some systems do not have legacy ISA devices.
  * /dev/port is not a valid interface on these systems.
@@ -121,4 +145,13 @@ static inline int arch_phys_wc_index(int handle)
 #endif
 #endif
 
+enum {
+       /* See memremap() kernel-doc for usage description... */
+       MEMREMAP_WB = 1 << 0,
+       MEMREMAP_WT = 1 << 1,
+};
+
+void *memremap(resource_size_t offset, size_t size, unsigned long flags);
+void memunmap(void *addr);
+
 #endif /* _LINUX_IO_H */
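
A minimal sketch of the new memremap() interface (helper name hypothetical). It assumes, as the flag comment suggests, that more than one mapping type may be offered and the kernel picks the first one it can honour; the result is released with memunmap(), or managed automatically via devm_memremap().

#include <linux/io.h>

static void *example_map_range(resource_size_t start, size_t len)
{
	/* ask for a write-back mapping, accepting write-through as fallback */
	void *addr = memremap(start, len, MEMREMAP_WB | MEMREMAP_WT);

	/* NULL on failure; release with memunmap(addr) when done */
	return addr;
}
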
index 0b1e569f5ff5e630a5766321a422fc365470a5f0..f8cea14485ddde6f5d73bea2817b9bfa8913dc10 100644 (file)
@@ -115,6 +115,11 @@ struct ipmi_smi_handlers {
           implement it. */
        void (*set_need_watch)(void *send_info, bool enable);
 
+       /*
+        * Called when flushing all pending messages.
+        */
+       void (*flush_messages)(void *send_info);
+
        /* Called when the interface should go into "run to
           completion" mode.  If this call sets the value to true, the
           interface should make sure that all messages are flushed
@@ -207,7 +212,7 @@ static inline int ipmi_demangle_device_id(const unsigned char *data,
    upper layer until the start_processing() function in the handlers
    is called, and the lower layer must get the interface from that
    call. */
-int ipmi_register_smi(struct ipmi_smi_handlers *handlers,
+int ipmi_register_smi(const struct ipmi_smi_handlers *handlers,
                      void                     *send_info,
                      struct ipmi_device_id    *device_id,
                      struct device            *dev,
index bf982e021fbd043f8fddf0903a6926f3558a679b..9eeeb9589acfc35baed79a89d7d3f0c1be134cf3 100644 (file)
 #define GICR_SYNCR                     0x00C0
 #define GICR_MOVLPIR                   0x0100
 #define GICR_MOVALLR                   0x0110
+#define GICR_ISACTIVER                 GICD_ISACTIVER
+#define GICR_ICACTIVER                 GICD_ICACTIVER
 #define GICR_IDREGS                    GICD_IDREGS
 #define GICR_PIDR2                     GICD_PIDR2
 
 
 #define ICH_LR_EOI                     (1UL << 41)
 #define ICH_LR_GROUP                   (1UL << 60)
+#define ICH_LR_HW                      (1UL << 61)
 #define ICH_LR_STATE                   (3UL << 62)
 #define ICH_LR_PENDING_BIT             (1UL << 62)
 #define ICH_LR_ACTIVE_BIT              (1UL << 63)
+#define ICH_LR_PHYS_ID_SHIFT           32
+#define ICH_LR_PHYS_ID_MASK            (0x3ffUL << ICH_LR_PHYS_ID_SHIFT)
 
 #define ICH_MISR_EOI                   (1 << 0)
 #define ICH_MISR_U                     (1 << 1)
 #define ICH_VMCR_PMR_MASK              (0xffUL << ICH_VMCR_PMR_SHIFT)
 
 #define ICC_EOIR1_EL1                  sys_reg(3, 0, 12, 12, 1)
+#define ICC_DIR_EL1                    sys_reg(3, 0, 12, 11, 1)
 #define ICC_IAR1_EL1                   sys_reg(3, 0, 12, 12, 0)
 #define ICC_SGI1R_EL1                  sys_reg(3, 0, 12, 11, 5)
 #define ICC_PMR_EL1                    sys_reg(3, 0, 4, 6, 0)
@@ -385,6 +391,12 @@ static inline void gic_write_eoir(u64 irq)
        isb();
 }
 
+static inline void gic_write_dir(u64 irq)
+{
+       asm volatile("msr_s " __stringify(ICC_DIR_EL1) ", %0" : : "r" (irq));
+       isb();
+}
+
 struct irq_domain;
 int its_cpu_init(void);
 int its_init(struct device_node *node, struct rdists *rdists,
index 65da435d01c10218741ccddc1f99d7107acaad4d..b8901dfd9e9584ba06de5f585f14c6edc9da8fb5 100644 (file)
 #define GIC_CPU_ALIAS_BINPOINT         0x1c
 #define GIC_CPU_ACTIVEPRIO             0xd0
 #define GIC_CPU_IDENT                  0xfc
+#define GIC_CPU_DEACTIVATE             0x1000
 
 #define GICC_ENABLE                    0x1
 #define GICC_INT_PRI_THRESHOLD         0xf0
+
+#define GIC_CPU_CTRL_EOImodeNS         (1 << 9)
+
 #define GICC_IAR_INT_ID_MASK           0x3ff
 #define GICC_INT_SPURIOUS              1023
 #define GICC_DIS_BYPASS_MASK           0x1e0
 
 #define GICH_LR_VIRTUALID              (0x3ff << 0)
 #define GICH_LR_PHYSID_CPUID_SHIFT     (10)
-#define GICH_LR_PHYSID_CPUID           (7 << GICH_LR_PHYSID_CPUID_SHIFT)
+#define GICH_LR_PHYSID_CPUID           (0x3ff << GICH_LR_PHYSID_CPUID_SHIFT)
 #define GICH_LR_STATE                  (3 << 28)
 #define GICH_LR_PENDING_BIT            (1 << 28)
 #define GICH_LR_ACTIVE_BIT             (1 << 29)
 #define GICH_LR_EOI                    (1 << 19)
+#define GICH_LR_HW                     (1 << 31)
 
 #define GICH_VMCR_CTRL_SHIFT           0
 #define GICH_VMCR_CTRL_MASK            (0x21f << GICH_VMCR_CTRL_SHIFT)
index 7f653e8f66900049c358ed4907fab8121047dd91..f1094238ab2a0f0fddeb40e3c7aadde7c2a89015 100644 (file)
@@ -21,8 +21,8 @@
  *
  * DEFINE_STATIC_KEY_TRUE(key);
  * DEFINE_STATIC_KEY_FALSE(key);
- * static_key_likely()
- * statick_key_unlikely()
+ * static_branch_likely()
+ * static_branch_unlikely()
  *
  * Jump labels provide an interface to generate dynamic branches using
  * self-modifying code. Assuming toolchain and architecture support, if we
  * statement, setting the key to true requires us to patch in a jump
  * to the out-of-line of true branch.
  *
- * In addtion to static_branch_{enable,disable}, we can also reference count
+ * In addition to static_branch_{enable,disable}, we can also reference count
  * the key or branch direction via static_branch_{inc,dec}. Thus,
  * static_branch_inc() can be thought of as a 'make more true' and
- * static_branch_dec() as a 'make more false'. The inc()/dec()
- * interface is meant to be used exclusively from the inc()/dec() for a given
- * key.
+ * static_branch_dec() as a 'make more false'.
  *
  * Since this relies on modifying code, the branch modifying functions
  * must be considered absolute slow paths (machine wide synchronization etc.).
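
A small sketch of the renamed API documented above (key and function names are made up): a default-false key keeps the branch out of line until something enables it.

#include <linux/jump_label.h>

static DEFINE_STATIC_KEY_FALSE(example_feature);

static inline void example_fast_path(void)
{
	if (static_branch_unlikely(&example_feature)) {
		/* rarely-enabled code, patched in only when the key is on */
	}
}

static void example_turn_on(void)
{
	static_branch_enable(&example_feature);	/* slow path: rewrites the branch */
}
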
index 123be25ea15a6b37ae15fcc93219950d4d357380..5d4e9c4b821ddd33da4622be3c4e8f79ab1bf645 100644 (file)
@@ -266,6 +266,7 @@ static inline bool kernfs_ns_enabled(struct kernfs_node *kn)
 }
 
 int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen);
+size_t kernfs_path_len(struct kernfs_node *kn);
 char * __must_check kernfs_path(struct kernfs_node *kn, char *buf,
                                size_t buflen);
 void pr_cont_kernfs_name(struct kernfs_node *kn);
@@ -332,6 +333,9 @@ static inline bool kernfs_ns_enabled(struct kernfs_node *kn)
 static inline int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
 { return -ENOSYS; }
 
+static inline size_t kernfs_path_len(struct kernfs_node *kn)
+{ return 0; }
+
 static inline char * __must_check kernfs_path(struct kernfs_node *kn, char *buf,
                                              size_t buflen)
 { return NULL; }
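
The new kernfs_path_len() pairs naturally with kernfs_path(); a hedged sketch of sizing a buffer with it (helper name hypothetical, and the +1 for the terminating NUL is an assumption about the returned length):

#include <linux/kernfs.h>
#include <linux/slab.h>

static char *example_dup_path(struct kernfs_node *kn)
{
	size_t len = kernfs_path_len(kn) + 1;	/* assume length excludes the NUL */
	char *buf = kzalloc(len, GFP_KERNEL);

	/* kernfs_path() is __must_check; treat a NULL return as failure */
	if (buf && !kernfs_path(kn, buf, len)) {
		kfree(buf);
		buf = NULL;
	}
	return buf;
}
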
index b63218f68c4b5a2c2862b082f62a1fe6caa5d128..d140b1e9faa71791264d6439bd8429810fff3ddd 100644 (file)
@@ -16,7 +16,7 @@
 
 #include <uapi/linux/kexec.h>
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 #include <linux/list.h>
 #include <linux/linkage.h>
 #include <linux/compat.h>
@@ -318,13 +318,24 @@ int crash_shrink_memory(unsigned long new_size);
 size_t crash_get_memory_size(void);
 void crash_free_reserved_phys_range(unsigned long begin, unsigned long end);
 
-#else /* !CONFIG_KEXEC */
+int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
+                                        unsigned long buf_len);
+void * __weak arch_kexec_kernel_image_load(struct kimage *image);
+int __weak arch_kimage_file_post_load_cleanup(struct kimage *image);
+int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
+                                       unsigned long buf_len);
+int __weak arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr,
+                                       Elf_Shdr *sechdrs, unsigned int relsec);
+int __weak arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
+                                       unsigned int relsec);
+
+#else /* !CONFIG_KEXEC_CORE */
 struct pt_regs;
 struct task_struct;
 static inline void crash_kexec(struct pt_regs *regs) { }
 static inline int kexec_should_crash(struct task_struct *p) { return 0; }
 #define kexec_in_progress false
-#endif /* CONFIG_KEXEC */
+#endif /* CONFIG_KEXEC_CORE */
 
 #endif /* !defined(__ASSEMBLY__) */
 
index 0555cc66a15b27dfa7fd70c47e1a519ad49afef1..fcfd2bf14d3f0ef2dc113e5d77fd42c6dc233798 100644 (file)
@@ -85,8 +85,6 @@ enum umh_disable_depth {
        UMH_DISABLED,
 };
 
-extern void usermodehelper_init(void);
-
 extern int __usermodehelper_disable(enum umh_disable_depth depth);
 extern void __usermodehelper_set_disable_depth(enum umh_disable_depth depth);
 
index 81089cf1f0c11c025b6f448d20ddcc1117ab5367..1bef9e21e7259935d2d7e8755bbd4c5710f5a9a6 100644 (file)
@@ -242,6 +242,7 @@ struct kvm_vcpu {
        int sigset_active;
        sigset_t sigset;
        struct kvm_vcpu_stat stat;
+       unsigned int halt_poll_ns;
 
 #ifdef CONFIG_HAS_IOMEM
        int mmio_needed;
index 75e3af01ee325840e3165919ab3a8fd42d76451f..3f021dc5da8c34e59768dca61dda323035368c0f 100644 (file)
@@ -31,6 +31,9 @@ enum {
        ND_CMD_ARS_STATUS_MAX = SZ_4K,
        ND_MAX_MAPPINGS = 32,
 
+       /* region flag indicating to direct-map persistent memory by default */
+       ND_REGION_PAGEMAP = 0,
+
        /* mark newly adjusted resources as requiring a label update */
        DPA_RESOURCE_ADJUSTED = 1 << 0,
 };
@@ -91,6 +94,7 @@ struct nd_region_desc {
        void *provider_data;
        int num_lanes;
        int numa_node;
+       unsigned long flags;
 };
 
 struct nvdimm_bus;
index 1cc89e9df480cc14c9c7cf5723995c0bea470688..ffb9c9da4f39f8779a55bc832b3f6657e45f23ac 100644 (file)
@@ -40,6 +40,11 @@ struct lsm_network_audit {
        } fam;
 };
 
+struct lsm_ioctlop_audit {
+       struct path path;
+       u16 cmd;
+};
+
 /* Auxiliary data to use in generating the audit record. */
 struct common_audit_data {
        char type;
@@ -53,6 +58,7 @@ struct common_audit_data {
 #define LSM_AUDIT_DATA_KMOD    8
 #define LSM_AUDIT_DATA_INODE   9
 #define LSM_AUDIT_DATA_DENTRY  10
+#define LSM_AUDIT_DATA_IOCTL_OP        11
        union   {
                struct path path;
                struct dentry *dentry;
@@ -68,6 +74,7 @@ struct common_audit_data {
                } key_struct;
 #endif
                char *kmod_name;
+               struct lsm_ioctlop_audit *op;
        } u;
        /* this union contains LSM specific data */
        union {
index 9429f054c323961aa8bf15d91c9650ec48314f1b..ec3a6bab29de3ac499637a1d848f7cd5c6d80c32 100644 (file)
@@ -1881,8 +1881,10 @@ static inline void security_delete_hooks(struct security_hook_list *hooks,
 
 extern int __init security_module_enable(const char *module);
 extern void __init capability_add_hooks(void);
-#ifdef CONFIG_SECURITY_YAMA_STACKED
-void __init yama_add_hooks(void);
+#ifdef CONFIG_SECURITY_YAMA
+extern void __init yama_add_hooks(void);
+#else
+static inline void __init yama_add_hooks(void) { }
 #endif
 
 #endif /* ! __LINUX_LSM_HOOKS_H */
index cc4b019720600617ebb9447f60320e6bb4d2aa2a..c518eb5892603fdd89cdb6d674f1f633f0451367 100644 (file)
@@ -77,6 +77,8 @@ int memblock_remove(phys_addr_t base, phys_addr_t size);
 int memblock_free(phys_addr_t base, phys_addr_t size);
 int memblock_reserve(phys_addr_t base, phys_addr_t size);
 void memblock_trim_memory(phys_addr_t align);
+bool memblock_overlaps_region(struct memblock_type *type,
+                             phys_addr_t base, phys_addr_t size);
 int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size);
 int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size);
 int memblock_mark_mirror(phys_addr_t base, phys_addr_t size);
@@ -323,7 +325,7 @@ void memblock_enforce_memory_limit(phys_addr_t memory_limit);
 int memblock_is_memory(phys_addr_t addr);
 int memblock_is_region_memory(phys_addr_t base, phys_addr_t size);
 int memblock_is_reserved(phys_addr_t addr);
-int memblock_is_region_reserved(phys_addr_t base, phys_addr_t size);
+bool memblock_is_region_reserved(phys_addr_t base, phys_addr_t size);
 
 extern void __memblock_dump_all(void);
 
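
With memblock_is_region_reserved() now returning bool, a typical early-boot check reads naturally; a sketch with a hypothetical helper name:

#include <linux/memblock.h>

/* usable only if the range is memory and does not touch an existing reservation */
static bool __init example_range_free(phys_addr_t base, phys_addr_t size)
{
	return memblock_is_region_memory(base, size) &&
	       !memblock_is_region_reserved(base, size);
}
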
index 73b02b0a8f609ac757de6ee59b23bcf8b0e87396..ad800e62cb7a603fdd5f7fb1a752edb03417dbb4 100644 (file)
 #include <linux/vm_event_item.h>
 #include <linux/hardirq.h>
 #include <linux/jump_label.h>
+#include <linux/page_counter.h>
+#include <linux/vmpressure.h>
+#include <linux/eventfd.h>
+#include <linux/mmzone.h>
+#include <linux/writeback.h>
 
 struct mem_cgroup;
 struct page;
@@ -67,12 +72,221 @@ enum mem_cgroup_events_index {
        MEMCG_NR_EVENTS,
 };
 
+/*
+ * Per memcg event counter is incremented at every pagein/pageout. With THP,
+ * it will be incremated by the number of pages. This counter is used for
+ * for trigger some periodic events. This is straightforward and better
+ * than using jiffies etc. to handle periodic memcg event.
+ */
+enum mem_cgroup_events_target {
+       MEM_CGROUP_TARGET_THRESH,
+       MEM_CGROUP_TARGET_SOFTLIMIT,
+       MEM_CGROUP_TARGET_NUMAINFO,
+       MEM_CGROUP_NTARGETS,
+};
+
+/*
+ * Bits in struct cg_proto.flags
+ */
+enum cg_proto_flags {
+       /* Currently active and new sockets should be assigned to cgroups */
+       MEMCG_SOCK_ACTIVE,
+       /* It was ever activated; we must disarm static keys on destruction */
+       MEMCG_SOCK_ACTIVATED,
+};
+
+struct cg_proto {
+       struct page_counter     memory_allocated;       /* Current allocated memory. */
+       struct percpu_counter   sockets_allocated;      /* Current number of sockets. */
+       int                     memory_pressure;
+       long                    sysctl_mem[3];
+       unsigned long           flags;
+       /*
+        * The memcg field is used to find which memcg we belong to directly.
+        * Each memcg struct can hold more than one cg_proto, so container_of
+        * won't really cut it.
+        *
+        * The elegant solution would be having an inverse function to
+        * proto_cgroup in struct proto, but that means polluting the structure
+        * for everybody, instead of just for memcg users.
+        */
+       struct mem_cgroup       *memcg;
+};
+
 #ifdef CONFIG_MEMCG
+struct mem_cgroup_stat_cpu {
+       long count[MEM_CGROUP_STAT_NSTATS];
+       unsigned long events[MEMCG_NR_EVENTS];
+       unsigned long nr_page_events;
+       unsigned long targets[MEM_CGROUP_NTARGETS];
+};
+
+struct mem_cgroup_reclaim_iter {
+       struct mem_cgroup *position;
+       /* scan generation, increased every round-trip */
+       unsigned int generation;
+};
+
+/*
+ * per-zone information in memory controller.
+ */
+struct mem_cgroup_per_zone {
+       struct lruvec           lruvec;
+       unsigned long           lru_size[NR_LRU_LISTS];
+
+       struct mem_cgroup_reclaim_iter  iter[DEF_PRIORITY + 1];
+
+       struct rb_node          tree_node;      /* RB tree node */
+       unsigned long           usage_in_excess;/* Set to the value by which */
+                                               /* the soft limit is exceeded */
+       bool                    on_tree;
+       struct mem_cgroup       *memcg;         /* Back pointer, we cannot */
+                                               /* use container_of        */
+};
+
+struct mem_cgroup_per_node {
+       struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
+};
+
+struct mem_cgroup_threshold {
+       struct eventfd_ctx *eventfd;
+       unsigned long threshold;
+};
+
+/* For threshold */
+struct mem_cgroup_threshold_ary {
+       /* An array index points to threshold just below or equal to usage. */
+       int current_threshold;
+       /* Size of entries[] */
+       unsigned int size;
+       /* Array of thresholds */
+       struct mem_cgroup_threshold entries[0];
+};
+
+struct mem_cgroup_thresholds {
+       /* Primary thresholds array */
+       struct mem_cgroup_threshold_ary *primary;
+       /*
+        * Spare threshold array.
+        * This is needed to make mem_cgroup_unregister_event() "never fail".
+        * It must be able to store at least primary->size - 1 entries.
+        */
+       struct mem_cgroup_threshold_ary *spare;
+};
+
+/*
+ * The memory controller data structure. The memory controller controls both
+ * page cache and RSS per cgroup. We would eventually like to provide
+ * statistics based on the statistics developed by Rik Van Riel for clock-pro,
+ * to help the administrator determine what knobs to tune.
+ */
+struct mem_cgroup {
+       struct cgroup_subsys_state css;
+
+       /* Accounted resources */
+       struct page_counter memory;
+       struct page_counter memsw;
+       struct page_counter kmem;
+
+       /* Normal memory consumption range */
+       unsigned long low;
+       unsigned long high;
+
+       unsigned long soft_limit;
+
+       /* vmpressure notifications */
+       struct vmpressure vmpressure;
+
+       /* css_online() has been completed */
+       int initialized;
+
+       /*
+        * Should the accounting and control be hierarchical, per subtree?
+        */
+       bool use_hierarchy;
+
+       /* protected by memcg_oom_lock */
+       bool            oom_lock;
+       int             under_oom;
+
+       int     swappiness;
+       /* OOM-Killer disable */
+       int             oom_kill_disable;
+
+       /* protect arrays of thresholds */
+       struct mutex thresholds_lock;
+
+       /* thresholds for memory usage. RCU-protected */
+       struct mem_cgroup_thresholds thresholds;
+
+       /* thresholds for mem+swap usage. RCU-protected */
+       struct mem_cgroup_thresholds memsw_thresholds;
+
+       /* For oom notifier event fd */
+       struct list_head oom_notify;
+
+       /*
+        * Should we move charges of a task when a task is moved into this
+        * mem_cgroup ? And what type of charges should we move ?
+        */
+       unsigned long move_charge_at_immigrate;
+       /*
+        * set > 0 if pages under this cgroup are moving to other cgroup.
+        */
+       atomic_t                moving_account;
+       /* taken only while moving_account > 0 */
+       spinlock_t              move_lock;
+       struct task_struct      *move_lock_task;
+       unsigned long           move_lock_flags;
+       /*
+        * percpu counter.
+        */
+       struct mem_cgroup_stat_cpu __percpu *stat;
+       spinlock_t pcp_counter_lock;
+
+#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
+       struct cg_proto tcp_mem;
+#endif
+#if defined(CONFIG_MEMCG_KMEM)
+        /* Index in the kmem_cache->memcg_params.memcg_caches array */
+       int kmemcg_id;
+       bool kmem_acct_activated;
+       bool kmem_acct_active;
+#endif
+
+       int last_scanned_node;
+#if MAX_NUMNODES > 1
+       nodemask_t      scan_nodes;
+       atomic_t        numainfo_events;
+       atomic_t        numainfo_updating;
+#endif
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+       struct list_head cgwb_list;
+       struct wb_domain cgwb_domain;
+#endif
+
+       /* List of events which userspace want to receive */
+       struct list_head event_list;
+       spinlock_t event_list_lock;
+
+       struct mem_cgroup_per_node *nodeinfo[0];
+       /* WARNING: nodeinfo must be the last member here */
+};
 extern struct cgroup_subsys_state *mem_cgroup_root_css;
 
-void mem_cgroup_events(struct mem_cgroup *memcg,
+/**
+ * mem_cgroup_events - count memory events against a cgroup
+ * @memcg: the memory cgroup
+ * @idx: the event index
+ * @nr: the number of events to account for
+ */
+static inline void mem_cgroup_events(struct mem_cgroup *memcg,
                       enum mem_cgroup_events_index idx,
-                      unsigned int nr);
+                      unsigned int nr)
+{
+       this_cpu_add(memcg->stat->events[idx], nr);
+}
 
 bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg);
 
@@ -90,15 +304,29 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
 struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
 struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
 
-bool mem_cgroup_is_descendant(struct mem_cgroup *memcg,
-                             struct mem_cgroup *root);
 bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
+struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
+struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
 
-extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
-extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
+static inline
+struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
+       return css ? container_of(css, struct mem_cgroup, css) : NULL;
+}
 
-extern struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
-extern struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css);
+struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
+                                  struct mem_cgroup *,
+                                  struct mem_cgroup_reclaim_cookie *);
+void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
+
+static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg,
+                             struct mem_cgroup *root)
+{
+       if (root == memcg)
+               return true;
+       if (!root->use_hierarchy)
+               return false;
+       return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup);
+}
 
 static inline bool mm_match_cgroup(struct mm_struct *mm,
                                   struct mem_cgroup *memcg)
@@ -114,24 +342,68 @@ static inline bool mm_match_cgroup(struct mm_struct *mm,
        return match;
 }
 
-extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg);
-extern struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page);
+struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page);
+ino_t page_cgroup_ino(struct page *page);
 
-struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
-                                  struct mem_cgroup *,
-                                  struct mem_cgroup_reclaim_cookie *);
-void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
+static inline bool mem_cgroup_disabled(void)
+{
+       if (memory_cgrp_subsys.disabled)
+               return true;
+       return false;
+}
 
 /*
  * For memory reclaim.
  */
-int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec);
-bool mem_cgroup_lruvec_online(struct lruvec *lruvec);
 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
-unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list);
-void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int);
-extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
-                                       struct task_struct *p);
+
+void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
+               int nr_pages);
+
+static inline bool mem_cgroup_lruvec_online(struct lruvec *lruvec)
+{
+       struct mem_cgroup_per_zone *mz;
+       struct mem_cgroup *memcg;
+
+       if (mem_cgroup_disabled())
+               return true;
+
+       mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
+       memcg = mz->memcg;
+
+       return !!(memcg->css.flags & CSS_ONLINE);
+}
+
+static inline
+unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
+{
+       struct mem_cgroup_per_zone *mz;
+
+       mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
+       return mz->lru_size[lru];
+}
+
+static inline int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
+{
+       unsigned long inactive_ratio;
+       unsigned long inactive;
+       unsigned long active;
+       unsigned long gb;
+
+       inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
+       active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
+
+       gb = (inactive + active) >> (30 - PAGE_SHIFT);
+       if (gb)
+               inactive_ratio = int_sqrt(10 * gb);
+       else
+               inactive_ratio = 1;
+
+       return inactive * inactive_ratio < active;
+}
+
+void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
+                               struct task_struct *p);
 
 static inline void mem_cgroup_oom_enable(void)
 {
@@ -156,18 +428,26 @@ bool mem_cgroup_oom_synchronize(bool wait);
 extern int do_swap_account;
 #endif
 
-static inline bool mem_cgroup_disabled(void)
-{
-       if (memory_cgrp_subsys.disabled)
-               return true;
-       return false;
-}
-
 struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page);
-void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
-                                enum mem_cgroup_stat_index idx, int val);
 void mem_cgroup_end_page_stat(struct mem_cgroup *memcg);
 
+/**
+ * mem_cgroup_update_page_stat - update page state statistics
+ * @memcg: memcg to account against
+ * @idx: page state item to account
+ * @val: number of pages (positive or negative)
+ *
+ * See mem_cgroup_begin_page_stat() for locking requirements.
+ */
+static inline void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
+                                enum mem_cgroup_stat_index idx, int val)
+{
+       VM_BUG_ON(!rcu_read_lock_held());
+
+       if (memcg)
+               this_cpu_add(memcg->stat->count[idx], val);
+}
+
 static inline void mem_cgroup_inc_page_stat(struct mem_cgroup *memcg,
                                            enum mem_cgroup_stat_index idx)
 {
@@ -184,13 +464,31 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
                                                gfp_t gfp_mask,
                                                unsigned long *total_scanned);
 
-void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx);
 static inline void mem_cgroup_count_vm_event(struct mm_struct *mm,
                                             enum vm_event_item idx)
 {
+       struct mem_cgroup *memcg;
+
        if (mem_cgroup_disabled())
                return;
-       __mem_cgroup_count_vm_event(mm, idx);
+
+       rcu_read_lock();
+       memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
+       if (unlikely(!memcg))
+               goto out;
+
+       switch (idx) {
+       case PGFAULT:
+               this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
+               break;
+       case PGMAJFAULT:
+               this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
+               break;
+       default:
+               BUG();
+       }
+out:
+       rcu_read_unlock();
 }
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 void mem_cgroup_split_huge_fixup(struct page *head);
@@ -199,8 +497,6 @@ void mem_cgroup_split_huge_fixup(struct page *head);
 #else /* CONFIG_MEMCG */
 struct mem_cgroup;
 
-#define mem_cgroup_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL))
-
 static inline void mem_cgroup_events(struct mem_cgroup *memcg,
                                     enum mem_cgroup_events_index idx,
                                     unsigned int nr)
@@ -258,11 +554,6 @@ static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
        return &zone->lruvec;
 }
 
-static inline struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
-{
-       return NULL;
-}
-
 static inline bool mm_match_cgroup(struct mm_struct *mm,
                struct mem_cgroup *memcg)
 {
@@ -275,12 +566,6 @@ static inline bool task_in_mem_cgroup(struct task_struct *task,
        return true;
 }
 
-static inline struct cgroup_subsys_state
-               *mem_cgroup_css(struct mem_cgroup *memcg)
-{
-       return NULL;
-}
-
 static inline struct mem_cgroup *
 mem_cgroup_iter(struct mem_cgroup *root,
                struct mem_cgroup *prev,
@@ -428,8 +713,8 @@ static inline void sock_release_memcg(struct sock *sk)
 extern struct static_key memcg_kmem_enabled_key;
 
 extern int memcg_nr_cache_ids;
-extern void memcg_get_cache_ids(void);
-extern void memcg_put_cache_ids(void);
+void memcg_get_cache_ids(void);
+void memcg_put_cache_ids(void);
 
 /*
  * Helper macro to loop through all memcg-specific caches. Callers must still
@@ -444,7 +729,10 @@ static inline bool memcg_kmem_enabled(void)
        return static_key_false(&memcg_kmem_enabled_key);
 }
 
-bool memcg_kmem_is_active(struct mem_cgroup *memcg);
+static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg)
+{
+       return memcg->kmem_acct_active;
+}
 
 /*
  * In general, we'll do everything in our power to not incur in any overhead
@@ -463,7 +751,15 @@ void __memcg_kmem_commit_charge(struct page *page,
                                       struct mem_cgroup *memcg, int order);
 void __memcg_kmem_uncharge_pages(struct page *page, int order);
 
-int memcg_cache_id(struct mem_cgroup *memcg);
+/*
+ * helper for accessing a memcg's index. It will be used as an index in the
+ * child cache array in kmem_cache, and also to derive its name. This function
+ * will return -1 when this is not a kmem-limited memcg.
+ */
+static inline int memcg_cache_id(struct mem_cgroup *memcg)
+{
+       return memcg ? memcg->kmemcg_id : -1;
+}
 
 struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep);
 void __memcg_kmem_put_cache(struct kmem_cache *cachep);
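
As a usage sketch of the page-state accounting protocol documented in the mem_cgroup_update_page_stat() kernel-doc above (the helper name and the particular stat index are only illustrative): the begin/end pair brackets the update and satisfies its RCU locking requirement.

#include <linux/memcontrol.h>

static void example_account_mapped(struct page *page, int nr)
{
	struct mem_cgroup *memcg = mem_cgroup_begin_page_stat(page);

	mem_cgroup_update_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED, nr);
	mem_cgroup_end_page_stat(memcg);
}
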
index 6ffa0ac7f7d62a2521ba40d154f4f5248f882afd..8f60e899b33c574b58be6d0a0d928c8f094941ea 100644 (file)
@@ -266,8 +266,9 @@ static inline void remove_memory(int nid, u64 start, u64 size) {}
 extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
                void *arg, int (*func)(struct memory_block *, void *));
 extern int add_memory(int nid, u64 start, u64 size);
-extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default);
-extern int arch_add_memory(int nid, u64 start, u64 size);
+extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default,
+               bool for_device);
+extern int arch_add_memory(int nid, u64 start, u64 size, bool for_device);
 extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
 extern bool is_memblock_offlined(struct memory_block *mem);
 extern void remove_memory(int nid, u64 start, u64 size);
diff --git a/include/linux/microchipphy.h b/include/linux/microchipphy.h
new file mode 100644 (file)
index 0000000..eb492d4
--- /dev/null
@@ -0,0 +1,73 @@
+/*
+ * Copyright (C) 2015 Microchip Technology
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _MICROCHIPPHY_H
+#define _MICROCHIPPHY_H
+
+#define LAN88XX_INT_MASK                       (0x19)
+#define LAN88XX_INT_MASK_MDINTPIN_EN_          (0x8000)
+#define LAN88XX_INT_MASK_SPEED_CHANGE_         (0x4000)
+#define LAN88XX_INT_MASK_LINK_CHANGE_          (0x2000)
+#define LAN88XX_INT_MASK_FDX_CHANGE_           (0x1000)
+#define LAN88XX_INT_MASK_AUTONEG_ERR_          (0x0800)
+#define LAN88XX_INT_MASK_AUTONEG_DONE_         (0x0400)
+#define LAN88XX_INT_MASK_POE_DETECT_           (0x0200)
+#define LAN88XX_INT_MASK_SYMBOL_ERR_           (0x0100)
+#define LAN88XX_INT_MASK_FAST_LINK_FAIL_       (0x0080)
+#define LAN88XX_INT_MASK_WOL_EVENT_            (0x0040)
+#define LAN88XX_INT_MASK_EXTENDED_INT_         (0x0020)
+#define LAN88XX_INT_MASK_RESERVED_             (0x0010)
+#define LAN88XX_INT_MASK_FALSE_CARRIER_                (0x0008)
+#define LAN88XX_INT_MASK_LINK_SPEED_DS_                (0x0004)
+#define LAN88XX_INT_MASK_MASTER_SLAVE_DONE_    (0x0002)
+#define LAN88XX_INT_MASK_RX__ER_               (0x0001)
+
+#define LAN88XX_INT_STS                                (0x1A)
+#define LAN88XX_INT_STS_INT_ACTIVE_            (0x8000)
+#define LAN88XX_INT_STS_SPEED_CHANGE_          (0x4000)
+#define LAN88XX_INT_STS_LINK_CHANGE_           (0x2000)
+#define LAN88XX_INT_STS_FDX_CHANGE_            (0x1000)
+#define LAN88XX_INT_STS_AUTONEG_ERR_           (0x0800)
+#define LAN88XX_INT_STS_AUTONEG_DONE_          (0x0400)
+#define LAN88XX_INT_STS_POE_DETECT_            (0x0200)
+#define LAN88XX_INT_STS_SYMBOL_ERR_            (0x0100)
+#define LAN88XX_INT_STS_FAST_LINK_FAIL_                (0x0080)
+#define LAN88XX_INT_STS_WOL_EVENT_             (0x0040)
+#define LAN88XX_INT_STS_EXTENDED_INT_          (0x0020)
+#define LAN88XX_INT_STS_RESERVED_              (0x0010)
+#define LAN88XX_INT_STS_FALSE_CARRIER_         (0x0008)
+#define LAN88XX_INT_STS_LINK_SPEED_DS_         (0x0004)
+#define LAN88XX_INT_STS_MASTER_SLAVE_DONE_     (0x0002)
+#define LAN88XX_INT_STS_RX_ER_                 (0x0001)
+
+#define LAN88XX_EXT_PAGE_ACCESS                        (0x1F)
+#define LAN88XX_EXT_PAGE_SPACE_0               (0x0000)
+#define LAN88XX_EXT_PAGE_SPACE_1               (0x0001)
+#define LAN88XX_EXT_PAGE_SPACE_2               (0x0002)
+
+/* Extended Register Page 1 space */
+#define LAN88XX_EXT_MODE_CTRL                  (0x13)
+#define LAN88XX_EXT_MODE_CTRL_MDIX_MASK_       (0x000C)
+#define LAN88XX_EXT_MODE_CTRL_AUTO_MDIX_       (0x0000)
+#define LAN88XX_EXT_MODE_CTRL_MDI_             (0x0008)
+#define LAN88XX_EXT_MODE_CTRL_MDI_X_           (0x000C)
+
+/* MMD 3 Registers */
+#define        LAN88XX_MMD3_CHIP_ID                    (32877)
+#define        LAN88XX_MMD3_CHIP_REV                   (32878)
+
+#endif /* _MICROCHIPPHY_H */
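
These are ordinary MII-space register offsets and bit masks, so a PHY driver would program them through the usual phylib accessors; a rough sketch (function name hypothetical):

#include <linux/phy.h>
#include <linux/microchipphy.h>

/* unmask link- and speed-change interrupts on a LAN88xx PHY */
static int example_lan88xx_config_intr(struct phy_device *phydev)
{
	u16 mask = LAN88XX_INT_MASK_LINK_CHANGE_ |
		   LAN88XX_INT_MASK_SPEED_CHANGE_;

	return phy_write(phydev, LAN88XX_INT_MASK, mask);
}
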
index bcbf8c72a77bee6ef2acc96489a4559bf02467ee..baad4cb8e9b065fb76c8e2b66379d9415d0f4e47 100644 (file)
@@ -79,7 +79,8 @@ enum {
 
 enum {
        MLX4_MAX_PORTS          = 2,
-       MLX4_MAX_PORT_PKEYS     = 128
+       MLX4_MAX_PORT_PKEYS     = 128,
+       MLX4_MAX_PORT_GIDS      = 128
 };
 
 /* base qkey for use in sriov tunnel-qp/proxy-qp communication.
index 9553a73d2049e425bc72bf4fbb7c151ffc9bbe43..5a06d969338e66c01863053212f33042fb36eca5 100644 (file)
@@ -59,6 +59,7 @@ struct mlx4_interface {
        void                    (*event) (struct mlx4_dev *dev, void *context,
                                          enum mlx4_dev_event event, unsigned long param);
        void *                  (*get_dev)(struct mlx4_dev *dev, void *context, u8 port);
+       void                    (*activate)(struct mlx4_dev *dev, void *context);
        struct list_head        list;
        enum mlx4_protocol      protocol;
        int                     flags;
index 250b1ff8b48d43c0f9f2479e388b405d8238eb41..8eb3b19af2a4bc2ece866e8d07c6243115ce13d4 100644 (file)
@@ -402,6 +402,17 @@ struct mlx5_cmd_teardown_hca_mbox_out {
        u8                      rsvd[8];
 };
 
+struct mlx5_cmd_query_special_contexts_mbox_in {
+       struct mlx5_inbox_hdr   hdr;
+       u8                      rsvd[8];
+};
+
+struct mlx5_cmd_query_special_contexts_mbox_out {
+       struct mlx5_outbox_hdr  hdr;
+       __be32                  dump_fill_mkey;
+       __be32                  resd_lkey;
+};
+
 struct mlx5_cmd_layout {
        u8              type;
        u8              rsvd0[3];
index 8b6d6f2154a4eaab1cce3db487b92d8d1b36d4a9..27b53f9a24ad85a4be3928a470ee4627a20859a6 100644 (file)
@@ -845,6 +845,7 @@ void *mlx5_get_protocol_dev(struct mlx5_core_dev *mdev, int protocol);
 int mlx5_register_interface(struct mlx5_interface *intf);
 void mlx5_unregister_interface(struct mlx5_interface *intf);
 int mlx5_core_query_vendor_id(struct mlx5_core_dev *mdev, u32 *vendor_id);
+int mlx5_core_query_special_context(struct mlx5_core_dev *dev, u32 *rsvd_lkey);
 
 struct mlx5_profile {
        u64     mask;
index 8b257c43855bbc32c04698a184055bb3be56bd80..91c08f6f0dc96dbb7474d3349f62b5d3f723fe80 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/shrinker.h>
 #include <linux/resource.h>
 #include <linux/page_ext.h>
+#include <linux/err.h>
 
 struct mempolicy;
 struct anon_vma;
@@ -249,6 +250,8 @@ struct vm_operations_struct {
        void (*close)(struct vm_area_struct * area);
        int (*mremap)(struct vm_area_struct * area);
        int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
+       int (*pmd_fault)(struct vm_area_struct *, unsigned long address,
+                                               pmd_t *, unsigned int flags);
        void (*map_pages)(struct vm_area_struct *vma, struct vm_fault *vmf);
 
        /* notification that a previously read-only page is about to become
@@ -307,18 +310,6 @@ struct inode;
 #define page_private(page)             ((page)->private)
 #define set_page_private(page, v)      ((page)->private = (v))
 
-/* It's valid only if the page is free path or free_list */
-static inline void set_freepage_migratetype(struct page *page, int migratetype)
-{
-       page->index = migratetype;
-}
-
-/* It's valid only if the page is free path or free_list */
-static inline int get_freepage_migratetype(struct page *page)
-{
-       return page->index;
-}
-
 /*
  * FIXME: take this include out, include page-flags.h in
  * files which need it (119 of them)
@@ -359,20 +350,15 @@ static inline int get_page_unless_zero(struct page *page)
        return atomic_inc_not_zero(&page->_count);
 }
 
-/*
- * Try to drop a ref unless the page has a refcount of one, return false if
- * that is the case.
- * This is to make sure that the refcount won't become zero after this drop.
- * This can be called when MMU is off so it must not access
- * any of the virtual mappings.
- */
-static inline int put_page_unless_one(struct page *page)
-{
-       return atomic_add_unless(&page->_count, -1, 1);
-}
-
 extern int page_is_ram(unsigned long pfn);
-extern int region_is_ram(resource_size_t phys_addr, unsigned long size);
+
+enum {
+       REGION_INTERSECTS,
+       REGION_DISJOINT,
+       REGION_MIXED,
+};
+
+int region_intersects(resource_size_t offset, size_t size, const char *type);
 
 /* Support for virtually mapped pages */
 struct page *vmalloc_to_page(const void *addr);
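
Callers of the removed region_is_ram() move to the finer-grained region_intersects(); a hedged sketch of the common pattern (helper name hypothetical):

#include <linux/mm.h>

/* true only when the whole range lies inside "System RAM" resources;
 * REGION_MIXED would mean it only partially overlaps RAM */
static bool example_range_is_ram(resource_size_t start, size_t size)
{
	return region_intersects(start, size, "System RAM") == REGION_INTERSECTS;
}
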
@@ -1229,6 +1215,49 @@ long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
                    int write, int force, struct page **pages);
 int get_user_pages_fast(unsigned long start, int nr_pages, int write,
                        struct page **pages);
+
+/* Container for pinned pfns / pages */
+struct frame_vector {
+       unsigned int nr_allocated;      /* Number of frames we have space for */
+       unsigned int nr_frames; /* Number of frames stored in ptrs array */
+       bool got_ref;           /* Did we pin pages by getting page ref? */
+       bool is_pfns;           /* Does array contain pages or pfns? */
+       void *ptrs[0];          /* Array of pinned pfns / pages. Use
+                                * pfns_vector_pages() or pfns_vector_pfns()
+                                * for access */
+};
+
+struct frame_vector *frame_vector_create(unsigned int nr_frames);
+void frame_vector_destroy(struct frame_vector *vec);
+int get_vaddr_frames(unsigned long start, unsigned int nr_pfns,
+                    bool write, bool force, struct frame_vector *vec);
+void put_vaddr_frames(struct frame_vector *vec);
+int frame_vector_to_pages(struct frame_vector *vec);
+void frame_vector_to_pfns(struct frame_vector *vec);
+
+static inline unsigned int frame_vector_count(struct frame_vector *vec)
+{
+       return vec->nr_frames;
+}
+
+static inline struct page **frame_vector_pages(struct frame_vector *vec)
+{
+       if (vec->is_pfns) {
+               int err = frame_vector_to_pages(vec);
+
+               if (err)
+                       return ERR_PTR(err);
+       }
+       return (struct page **)(vec->ptrs);
+}
+
+static inline unsigned long *frame_vector_pfns(struct frame_vector *vec)
+{
+       if (!vec->is_pfns)
+               frame_vector_to_pfns(vec);
+       return (unsigned long *)(vec->ptrs);
+}
+
 struct kvec;
 int get_kernel_pages(const struct kvec *iov, int nr_pages, int write,
                        struct page **pages);
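
A sketch of the new frame_vector pinning API (names hypothetical, error handling trimmed): create a vector, pin the user range, then use the accessors above to get pages or pfns.

#include <linux/mm.h>
#include <linux/err.h>

static struct frame_vector *example_pin_user_range(unsigned long start,
						   unsigned int nr_frames)
{
	struct frame_vector *vec = frame_vector_create(nr_frames);
	int ret;

	if (!vec)
		return ERR_PTR(-ENOMEM);

	ret = get_vaddr_frames(start, nr_frames, true /* write */, false /* force */, vec);
	if (ret < 0) {
		frame_vector_destroy(vec);
		return ERR_PTR(ret);
	}

	/* frame_vector_pages(vec) / frame_vector_pfns(vec) give access;
	 * tear down with put_vaddr_frames(vec) then frame_vector_destroy(vec) */
	return vec;
}
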
@@ -1260,6 +1289,11 @@ static inline int vma_growsdown(struct vm_area_struct *vma, unsigned long addr)
        return vma && (vma->vm_end == addr) && (vma->vm_flags & VM_GROWSDOWN);
 }
 
+static inline bool vma_is_anonymous(struct vm_area_struct *vma)
+{
+       return !vma->vm_ops;
+}
+
 static inline int stack_guard_page_start(struct vm_area_struct *vma,
                                             unsigned long addr)
 {
@@ -1883,11 +1917,19 @@ extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned lo
 
 extern unsigned long mmap_region(struct file *file, unsigned long addr,
        unsigned long len, vm_flags_t vm_flags, unsigned long pgoff);
-extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
+extern unsigned long do_mmap(struct file *file, unsigned long addr,
        unsigned long len, unsigned long prot, unsigned long flags,
-       unsigned long pgoff, unsigned long *populate);
+       vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate);
 extern int do_munmap(struct mm_struct *, unsigned long, size_t);
 
+static inline unsigned long
+do_mmap_pgoff(struct file *file, unsigned long addr,
+       unsigned long len, unsigned long prot, unsigned long flags,
+       unsigned long pgoff, unsigned long *populate)
+{
+       return do_mmap(file, addr, len, prot, flags, 0, pgoff, populate);
+}
+
 #ifdef CONFIG_MMU
 extern int __mm_populate(unsigned long addr, unsigned long len,
                         int ignore_errors);
@@ -2186,6 +2228,7 @@ extern int memory_failure(unsigned long pfn, int trapno, int flags);
 extern void memory_failure_queue(unsigned long pfn, int trapno, int flags);
 extern int unpoison_memory(unsigned long pfn);
 extern int get_hwpoison_page(struct page *page);
+extern void put_hwpoison_page(struct page *page);
 extern int sysctl_memory_failure_early_kill;
 extern int sysctl_memory_failure_recovery;
 extern void shake_page(struct page *p, int access);
index c8d0a73d64c455f95ba9ed3a0d855bcbc4d839ce..3d6baa7d4534c68918dd7181e82674ade0b8ed98 100644 (file)
@@ -235,7 +235,7 @@ struct page_frag_cache {
        bool pfmemalloc;
 };
 
-typedef unsigned long __nocast vm_flags_t;
+typedef unsigned long vm_flags_t;
 
 /*
  * A region containing a mapping of a non-memory backed file under NOMMU
index 4d3776d25925a14d6140ce681911dd5c1a5bc133..fdd0779ccdfa52309f5e99aeeadd84b356c1a472 100644 (file)
@@ -279,10 +279,13 @@ struct mmc_card {
 #define MMC_QUIRK_LONG_READ_TIME (1<<9)                /* Data read time > CSD says */
 #define MMC_QUIRK_SEC_ERASE_TRIM_BROKEN (1<<10)        /* Skip secure for erase/trim */
 #define MMC_QUIRK_BROKEN_IRQ_POLLING   (1<<11) /* Polling SDIO_CCCR_INTx could create a fake interrupt */
+#define MMC_QUIRK_TRIM_BROKEN  (1<<12)         /* Skip trim */
+
 
        unsigned int            erase_size;     /* erase size in sectors */
        unsigned int            erase_shift;    /* if erase unit is power 2 */
        unsigned int            pref_erase;     /* in sectors */
+       unsigned int            eg_boundary;    /* don't cross erase-group boundaries */
        u8                      erased_byte;    /* value of erased bytes */
 
        u32                     raw_cid[4];     /* raw card CID */
index 5be97676f1fa029a9e0b40418bc1c37b83105d44..134c57422740944fb8d1d004c2f8126b2bdeb604 100644 (file)
@@ -98,6 +98,7 @@ struct mmc_data;
  * @irq_flags: The flags to be passed to request_irq.
  * @irq: The irq value to be passed to request_irq.
  * @sdio_id0: Number of slot0 in the SDIO interrupt registers.
+ * @dto_timer: Timer used when the data transfer over (DTO) interrupt is broken.
  *
  * Locking
  * =======
@@ -153,11 +154,7 @@ struct dw_mci {
        dma_addr_t              sg_dma;
        void                    *sg_cpu;
        const struct dw_mci_dma_ops     *dma_ops;
-#ifdef CONFIG_MMC_DW_IDMAC
        unsigned int            ring_size;
-#else
-       struct dw_mci_dma_data  *dma_data;
-#endif
        u32                     cmd_status;
        u32                     data_status;
        u32                     stop_cmdr;
@@ -204,6 +201,7 @@ struct dw_mci {
        int                     sdio_id0;
 
        struct timer_list       cmd11_timer;
+       struct timer_list       dto_timer;
 };
 
 /* DMA ops for Internal/External DMAC interface */
@@ -226,6 +224,8 @@ struct dw_mci_dma_ops {
 #define DW_MCI_QUIRK_HIGHSPEED                 BIT(2)
 /* Unreliable card detection */
 #define DW_MCI_QUIRK_BROKEN_CARD_DETECTION     BIT(3)
+/* Timer used when the data transfer over (DTO) interrupt is broken */
+#define DW_MCI_QUIRK_BROKEN_DTO                        BIT(4)
 
 struct dma_pdata;
 
@@ -259,7 +259,6 @@ struct dw_mci_board {
 
        struct dw_mci_dma_ops *dma_ops;
        struct dma_pdata *data;
-       struct block_settings *blk_settings;
 };
 
 #endif /* LINUX_MMC_DW_MMC_H */
index 1369e54faeb7e2ee8ef3d49481751249c70bbcbd..83b81fd865f3bba12e7bc7d4c0ac8091ca067c09 100644 (file)
@@ -412,7 +412,8 @@ static inline void mmc_signal_sdio_irq(struct mmc_host *host)
 {
        host->ops->enable_sdio_irq(host, 0);
        host->sdio_irq_pending = true;
-       wake_up_process(host->sdio_irq_thread);
+       if (host->sdio_irq_thread)
+               wake_up_process(host->sdio_irq_thread);
 }
 
 void sdio_run_irqs(struct mmc_host *host);
index 61cd67f4d7881cbbd8eba481729c06b31d45e9c6..a1a210d59961a855964b03ee9a7eae7c74d7c86f 100644 (file)
@@ -65,6 +65,16 @@ struct mmu_notifier_ops {
                                 unsigned long start,
                                 unsigned long end);
 
+       /*
+        * clear_young is a lightweight version of clear_flush_young. Like the
+        * latter, it is supposed to test-and-clear the young/accessed bitflag
+        * in the secondary pte, but it may omit flushing the secondary tlb.
+        */
+       int (*clear_young)(struct mmu_notifier *mn,
+                          struct mm_struct *mm,
+                          unsigned long start,
+                          unsigned long end);
+
        /*
         * test_young is called to check the young/accessed bitflag in
         * the secondary pte. This is used to know if the page is
@@ -203,6 +213,9 @@ extern void __mmu_notifier_release(struct mm_struct *mm);
 extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
                                          unsigned long start,
                                          unsigned long end);
+extern int __mmu_notifier_clear_young(struct mm_struct *mm,
+                                     unsigned long start,
+                                     unsigned long end);
 extern int __mmu_notifier_test_young(struct mm_struct *mm,
                                     unsigned long address);
 extern void __mmu_notifier_change_pte(struct mm_struct *mm,
@@ -231,6 +244,15 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
        return 0;
 }
 
+static inline int mmu_notifier_clear_young(struct mm_struct *mm,
+                                          unsigned long start,
+                                          unsigned long end)
+{
+       if (mm_has_notifiers(mm))
+               return __mmu_notifier_clear_young(mm, start, end);
+       return 0;
+}
+
 static inline int mmu_notifier_test_young(struct mm_struct *mm,
                                          unsigned long address)
 {
@@ -311,6 +333,28 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
        __young;                                                        \
 })
 
+#define ptep_clear_young_notify(__vma, __address, __ptep)              \
+({                                                                     \
+       int __young;                                                    \
+       struct vm_area_struct *___vma = __vma;                          \
+       unsigned long ___address = __address;                           \
+       __young = ptep_test_and_clear_young(___vma, ___address, __ptep);\
+       __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address,  \
+                                           ___address + PAGE_SIZE);    \
+       __young;                                                        \
+})
+
+#define pmdp_clear_young_notify(__vma, __address, __pmdp)              \
+({                                                                     \
+       int __young;                                                    \
+       struct vm_area_struct *___vma = __vma;                          \
+       unsigned long ___address = __address;                           \
+       __young = pmdp_test_and_clear_young(___vma, ___address, __pmdp);\
+       __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address,  \
+                                           ___address + PMD_SIZE);     \
+       __young;                                                        \
+})
+
 #define        ptep_clear_flush_notify(__vma, __address, __ptep)               \
 ({                                                                     \
        unsigned long ___addr = __address & PAGE_MASK;                  \
@@ -427,6 +471,8 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
 
 #define ptep_clear_flush_young_notify ptep_clear_flush_young
 #define pmdp_clear_flush_young_notify pmdp_clear_flush_young
+#define ptep_clear_young_notify ptep_test_and_clear_young
+#define pmdp_clear_young_notify pmdp_test_and_clear_young
 #define        ptep_clear_flush_notify ptep_clear_flush
 #define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush
 #define pmdp_huge_get_and_clear_notify pmdp_huge_get_and_clear
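
A sketch of how a secondary-MMU user might wire up the new lightweight aging hook (all names hypothetical); unlike clear_flush_young, the secondary TLB flush may be deferred or skipped.

#include <linux/mmu_notifier.h>

static int example_clear_young(struct mmu_notifier *mn, struct mm_struct *mm,
			       unsigned long start, unsigned long end)
{
	int young = 0;

	/* test-and-clear accessed bits in the shadow page tables for
	 * [start, end) without forcing a secondary TLB flush */
	return young;	/* non-zero if any entry was young */
}

static const struct mmu_notifier_ops example_mn_ops = {
	.clear_young	= example_clear_young,
};
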
index ac00e2050943b4352feeb0f196ff5b5b48a3204a..d943477372928c13a586cbcd78472a7e29bf65c9 100644 (file)
@@ -319,7 +319,11 @@ enum zone_type {
        ZONE_HIGHMEM,
 #endif
        ZONE_MOVABLE,
+#ifdef CONFIG_ZONE_DEVICE
+       ZONE_DEVICE,
+#endif
        __MAX_NR_ZONES
+
 };
 
 #ifndef __GENERATING_BOUNDS_H
@@ -786,6 +790,25 @@ static inline bool pgdat_is_empty(pg_data_t *pgdat)
        return !pgdat->node_start_pfn && !pgdat->node_spanned_pages;
 }
 
+static inline int zone_id(const struct zone *zone)
+{
+       struct pglist_data *pgdat = zone->zone_pgdat;
+
+       return zone - pgdat->node_zones;
+}
+
+#ifdef CONFIG_ZONE_DEVICE
+static inline bool is_dev_zone(const struct zone *zone)
+{
+       return zone_id(zone) == ZONE_DEVICE;
+}
+#else
+static inline bool is_dev_zone(const struct zone *zone)
+{
+       return false;
+}
+#endif
+
 #include <linux/memory_hotplug.h>
 
 extern struct mutex zonelists_mutex;
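
With the new helpers, checking whether a page sits in the device zone is a one-liner (example name hypothetical):

#include <linux/mm.h>

static bool example_is_device_page(struct page *page)
{
	/* always false when CONFIG_ZONE_DEVICE is not enabled */
	return is_dev_zone(page_zone(page));
}
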
index 29975c73a95347a582a98b4cf09baf0fea47c7b0..366cf77953b55bc737fcded0179df083138b3dbf 100644 (file)
@@ -27,9 +27,9 @@
 #include <linux/string.h>
 #include <linux/bug.h>
 #include <linux/kernel.h>
+#include <linux/io.h>
 
 #include <asm/unaligned.h>
-#include <asm/io.h>
 #include <asm/barrier.h>
 
 #ifdef CONFIG_MTD_MAP_BANK_WIDTH_1
index 9120edb650a068df60b0a2c390ae431ab78671cb..639e9b8b0e4d9ff2c9b10ce5b44a8a328abe2a31 100644 (file)
@@ -68,8 +68,17 @@ extern int netlink_change_ngroups(struct sock *sk, unsigned int groups);
 extern void __netlink_clear_multicast_users(struct sock *sk, unsigned int group);
 extern void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err);
 extern int netlink_has_listeners(struct sock *sk, unsigned int group);
-extern struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size,
-                                        u32 dst_portid, gfp_t gfp_mask);
+
+extern struct sk_buff *__netlink_alloc_skb(struct sock *ssk, unsigned int size,
+                                          unsigned int ldiff, u32 dst_portid,
+                                          gfp_t gfp_mask);
+static inline struct sk_buff *
+netlink_alloc_skb(struct sock *ssk, unsigned int size, u32 dst_portid,
+                 gfp_t gfp_mask)
+{
+       return __netlink_alloc_skb(ssk, size, 0, dst_portid, gfp_mask);
+}
+
 extern int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 portid, int nonblock);
 extern int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 portid,
                             __u32 group, gfp_t allocation);
index b8e72aad919cfc72ea6710ac3786dc35bbe47201..00121f298269590cabc12ce7924f61d8e8742e07 100644 (file)
@@ -547,6 +547,24 @@ enum pnfs_notify_deviceid_type4 {
        NOTIFY_DEVICEID4_DELETE = 1 << 2,
 };
 
+enum pnfs_block_volume_type {
+       PNFS_BLOCK_VOLUME_SIMPLE        = 0,
+       PNFS_BLOCK_VOLUME_SLICE         = 1,
+       PNFS_BLOCK_VOLUME_CONCAT        = 2,
+       PNFS_BLOCK_VOLUME_STRIPE        = 3,
+};
+
+enum pnfs_block_extent_state {
+       PNFS_BLOCK_READWRITE_DATA       = 0,
+       PNFS_BLOCK_READ_DATA            = 1,
+       PNFS_BLOCK_INVALID_DATA         = 2,
+       PNFS_BLOCK_NONE_DATA            = 3,
+};
+
+/* on the wire size of a block layout extent */
+#define PNFS_BLOCK_EXTENT_SIZE \
+       (7 * sizeof(__be32) + NFS4_DEVICEID4_SIZE)
+
 #define NFL4_UFLG_MASK                 0x0000003F
 #define NFL4_UFLG_DENSE                        0x00000001
 #define NFL4_UFLG_COMMIT_THRU_MDS      0x00000002
index 874b77228fb96285fb2024f07fa99d1ed7dd6dda..c0e961474a527058c8d1ac2aa72070c9b15e3db5 100644 (file)
@@ -353,7 +353,6 @@ extern void nfs_access_add_cache(struct inode *, struct nfs_access_entry *);
 extern void nfs_access_set_mask(struct nfs_access_entry *, u32);
 extern int nfs_permission(struct inode *, int);
 extern int nfs_open(struct inode *, struct file *);
-extern int nfs_release(struct inode *, struct file *);
 extern int nfs_attribute_timeout(struct inode *inode);
 extern int nfs_attribute_cache_expired(struct inode *inode);
 extern int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode);
@@ -371,6 +370,7 @@ extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, struc
 extern struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, fmode_t f_mode);
 extern void nfs_inode_attach_open_context(struct nfs_open_context *ctx);
 extern void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx);
+extern void nfs_file_clear_open_context(struct file *flip);
 extern struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx);
 extern void nfs_put_lock_context(struct nfs_lock_context *l_ctx);
 extern u64 nfs_compat_user_ino64(u64 fileid);
index 20bc8e51b16124496326274e84fc12d61996a4fd..570a7df2775b599eab0d59ad9e92b888f60dec87 100644 (file)
@@ -173,6 +173,11 @@ struct nfs_server {
                                                   set of attributes supported
                                                   on this filesystem excluding
                                                   the label support bit. */
+       u32                     exclcreat_bitmask[3];
+                                               /* V4 bitmask representing the
+                                                  set of attributes supported
+                                                  on this filesystem for the
+                                                  exclusive create. */
        u32                     cache_consistency_bitmask[3];
                                                /* V4 bitmask representing the subset
                                                   of change attribute, size, ctime
index 7bbe50504211d65cc096baa8bc6d45e2e9449125..52faf7e96c65db03f57777b63ce0428fac912e25 100644 (file)
@@ -379,7 +379,7 @@ struct nfs_openargs {
        struct stateowner_id    id;
        union {
                struct {
-                       struct iattr *  attrs;    /* UNCHECKED, GUARDED */
+                       struct iattr *  attrs;    /* UNCHECKED, GUARDED, EXCLUSIVE4_1 */
                        nfs4_verifier   verifier; /* EXCLUSIVE */
                };
                nfs4_stateid    delegation;             /* CLAIM_DELEGATE_CUR */
@@ -389,7 +389,7 @@ struct nfs_openargs {
        const struct nfs_server *server;         /* Needed for ID mapping */
        const u32 *             bitmask;
        const u32 *             open_bitmap;
-       __u32                   claim;
+       enum open_claim_type4   claim;
        enum createmode4        createmode;
        const struct nfs4_label *label;
 };
@@ -406,8 +406,8 @@ struct nfs_openres {
        const struct nfs_server *server;
        fmode_t                 delegation_type;
        nfs4_stateid            delegation;
+       unsigned long           pagemod_limit;
        __u32                   do_recall;
-       __u64                   maxsize;
        __u32                   attrset[NFS4_BITMAP_SIZE];
        struct nfs4_string      *owner;
        struct nfs4_string      *group_owner;
@@ -1057,11 +1057,13 @@ struct nfs4_statfs_res {
 struct nfs4_server_caps_arg {
        struct nfs4_sequence_args       seq_args;
        struct nfs_fh                  *fhandle;
+       const u32 *                     bitmask;
 };
 
 struct nfs4_server_caps_res {
        struct nfs4_sequence_res        seq_res;
        u32                             attr_bitmask[3];
+       u32                             exclcreat_bitmask[3];
        u32                             acl_bitmask;
        u32                             has_links;
        u32                             has_symlinks;
index a91adf6e02f2a7c47117093eea682c06651034b8..78488e099ce7a4263dff9f9a08fa2e39f60f0e49 100644 (file)
@@ -47,6 +47,12 @@ static inline bool trigger_allbutself_cpu_backtrace(void)
        arch_trigger_all_cpu_backtrace(false);
        return true;
 }
+
+/* generic implementation */
+void nmi_trigger_all_cpu_backtrace(bool include_self,
+                                  void (*raise)(cpumask_t *mask));
+bool nmi_cpu_backtrace(struct pt_regs *regs);
+
 #else
 static inline bool trigger_all_cpu_backtrace(void)
 {
index b02f72bb8e325bb5f17f7b9c2cad8f1f266379a8..f798e2afba88db5cd50b356108d2250048e17593 100644 (file)
@@ -522,10 +522,9 @@ static inline int ntb_mw_clear_trans(struct ntb_dev *ntb, int idx)
  * @speed:     OUT - The link speed expressed as PCIe generation number.
  * @width:     OUT - The link width expressed as the number of PCIe lanes.
  *
- * Set the translation of a memory window.  The peer may access local memory
- * through the window starting at the address, up to the size.  The address
- * must be aligned to the alignment specified by ntb_mw_get_range().  The size
- * must be aligned to the size alignment specified by ntb_mw_get_range().
+ * Get the current state of the ntb link.  It is recommended to query the link
+ * state once after every link event.  It is safe to query the link state in
+ * the context of the link event callback.
  *
  * Return: One if the link is up, zero if the link is down, otherwise a
  *             negative value indicating the error number.
@@ -795,7 +794,7 @@ static inline int ntb_peer_db_set(struct ntb_dev *ntb, u64 db_bits)
 }
 
 /**
- * ntb_peer_db_clear() - clear bits in the local doorbell register
+ * ntb_peer_db_clear() - clear bits in the peer doorbell register
  * @ntb:       NTB device context.
  * @db_bits:   Doorbell bits to clear.
  *
index 2862861366a5ee83fbfb813f6fd801778333a110..7243eb98a722e9f23c3106411e47c33310bedea3 100644 (file)
@@ -83,3 +83,4 @@ void *ntb_transport_rx_remove(struct ntb_transport_qp *qp, unsigned int *len);
 void ntb_transport_link_up(struct ntb_transport_qp *qp);
 void ntb_transport_link_down(struct ntb_transport_qp *qp);
 bool ntb_transport_link_query(struct ntb_transport_qp *qp);
+unsigned int ntb_transport_tx_free_entry(struct ntb_transport_qp *qp);
index c2bbf672b84eb58e767bc82882c49761b07abee6..d2fa9ca42e9a74d04d962e46406bccac58762ebe 100644 (file)
@@ -41,7 +41,7 @@ enum OID {
        OID_signed_data,                /* 1.2.840.113549.1.7.2 */
        /* PKCS#9 {iso(1) member-body(2) us(840) rsadsi(113549) pkcs(1) pkcs-9(9)} */
        OID_email_address,              /* 1.2.840.113549.1.9.1 */
-       OID_content_type,               /* 1.2.840.113549.1.9.3 */
+       OID_contentType,                /* 1.2.840.113549.1.9.3 */
        OID_messageDigest,              /* 1.2.840.113549.1.9.4 */
        OID_signingTime,                /* 1.2.840.113549.1.9.5 */
        OID_smimeCapabilites,           /* 1.2.840.113549.1.9.15 */
@@ -54,6 +54,8 @@ enum OID {
 
        /* Microsoft Authenticode & Software Publishing */
        OID_msIndirectData,             /* 1.3.6.1.4.1.311.2.1.4 */
+       OID_msStatementType,            /* 1.3.6.1.4.1.311.2.1.11 */
+       OID_msSpOpusInfo,               /* 1.3.6.1.4.1.311.2.1.12 */
        OID_msPeImageDataObjId,         /* 1.3.6.1.4.1.311.2.1.15 */
        OID_msIndividualSPKeyPurpose,   /* 1.3.6.1.4.1.311.2.1.21 */
        OID_msOutlookExpress,           /* 1.3.6.1.4.1.311.16.4 */
@@ -61,6 +63,9 @@ enum OID {
        OID_certAuthInfoAccess,         /* 1.3.6.1.5.5.7.1.1 */
        OID_sha1,                       /* 1.3.14.3.2.26 */
        OID_sha256,                     /* 2.16.840.1.101.3.4.2.1 */
+       OID_sha384,                     /* 2.16.840.1.101.3.4.2.2 */
+       OID_sha512,                     /* 2.16.840.1.101.3.4.2.3 */
+       OID_sha224,                     /* 2.16.840.1.101.3.4.2.4 */
 
        /* Distinguished Name attribute IDs [RFC 2256] */
        OID_commonName,                 /* 2.5.4.3 */
index 7deecb7bca5e3f76b480fb6fd514fbdcfc1c90b9..03e6257321f0353efcf6d134744af6f597e78a8b 100644 (file)
@@ -12,6 +12,27 @@ struct notifier_block;
 struct mem_cgroup;
 struct task_struct;
 
+/*
+ * Details of the page allocation that triggered the oom killer that are used to
+ * determine what should be killed.
+ */
+struct oom_control {
+       /* Used to determine cpuset */
+       struct zonelist *zonelist;
+
+       /* Used to determine mempolicy */
+       nodemask_t *nodemask;
+
+       /* Used to determine cpuset and node locality requirement */
+       const gfp_t gfp_mask;
+
+       /*
+        * order == -1 means the oom kill is required by sysrq, otherwise only
+        * for display purposes.
+        */
+       const int order;
+};
+
 /*
  * Types of limitations to the nodes from which allocations may occur
  */
@@ -57,21 +78,18 @@ extern unsigned long oom_badness(struct task_struct *p,
 
 extern int oom_kills_count(void);
 extern void note_oom_kill(void);
-extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
+extern void oom_kill_process(struct oom_control *oc, struct task_struct *p,
                             unsigned int points, unsigned long totalpages,
-                            struct mem_cgroup *memcg, nodemask_t *nodemask,
-                            const char *message);
+                            struct mem_cgroup *memcg, const char *message);
 
-extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
-                              int order, const nodemask_t *nodemask,
+extern void check_panic_on_oom(struct oom_control *oc,
+                              enum oom_constraint constraint,
                               struct mem_cgroup *memcg);
 
-extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
-               unsigned long totalpages, const nodemask_t *nodemask,
-               bool force_kill);
+extern enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
+               struct task_struct *task, unsigned long totalpages);
 
-extern bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
-               int order, nodemask_t *mask, bool force_kill);
+extern bool out_of_memory(struct oom_control *oc);
 
 extern void exit_oom_victim(void);
 
index 41c93844fb1d1ed5c0dbad77fe5a409557d66067..416509e26d6d16bfa0f75ef793b7e32e6b5fb090 100644 (file)
@@ -108,6 +108,10 @@ enum pageflags {
 #endif
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
        PG_compound_lock,
+#endif
+#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
+       PG_young,
+       PG_idle,
 #endif
        __NR_PAGEFLAGS,
 
@@ -289,6 +293,13 @@ PAGEFLAG_FALSE(HWPoison)
 #define __PG_HWPOISON 0
 #endif
 
+#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
+TESTPAGEFLAG(Young, young)
+SETPAGEFLAG(Young, young)
+TESTCLEARFLAG(Young, young)
+PAGEFLAG(Idle, idle)
+#endif
+
 /*
  * On an anonymous page mapped into a user virtual memory area,
  * page->mapping points to its anon_vma, not to a struct address_space;
index 2dc1e1697b451ce678781a55776a15c8934be7a5..047d64706f2a298157cf0d54305aa8e49f6f7837 100644 (file)
@@ -65,11 +65,6 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
 int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
                        bool skip_hwpoisoned_pages);
 
-/*
- * Internal functions. Changes pageblock's migrate type.
- */
-int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages);
-void unset_migratetype_isolate(struct page *page, unsigned migratetype);
 struct page *alloc_migrate_target(struct page *page, unsigned long private,
                                int **resultp);
 
index c42981cd99aae91b33d21a0deb4bc9f925a7fbd1..17f118a82854960701dac4de69dfc1af43713989 100644 (file)
@@ -26,6 +26,10 @@ enum page_ext_flags {
        PAGE_EXT_DEBUG_POISON,          /* Page is poisoned */
        PAGE_EXT_DEBUG_GUARD,
        PAGE_EXT_OWNER,
+#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT)
+       PAGE_EXT_YOUNG,
+       PAGE_EXT_IDLE,
+#endif
 };
 
 /*
diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h
new file mode 100644 (file)
index 0000000..bf268fa
--- /dev/null
@@ -0,0 +1,110 @@
+#ifndef _LINUX_MM_PAGE_IDLE_H
+#define _LINUX_MM_PAGE_IDLE_H
+
+#include <linux/bitops.h>
+#include <linux/page-flags.h>
+#include <linux/page_ext.h>
+
+#ifdef CONFIG_IDLE_PAGE_TRACKING
+
+#ifdef CONFIG_64BIT
+static inline bool page_is_young(struct page *page)
+{
+       return PageYoung(page);
+}
+
+static inline void set_page_young(struct page *page)
+{
+       SetPageYoung(page);
+}
+
+static inline bool test_and_clear_page_young(struct page *page)
+{
+       return TestClearPageYoung(page);
+}
+
+static inline bool page_is_idle(struct page *page)
+{
+       return PageIdle(page);
+}
+
+static inline void set_page_idle(struct page *page)
+{
+       SetPageIdle(page);
+}
+
+static inline void clear_page_idle(struct page *page)
+{
+       ClearPageIdle(page);
+}
+#else /* !CONFIG_64BIT */
+/*
+ * If there is not enough space to store Idle and Young bits in page flags, use
+ * page ext flags instead.
+ */
+extern struct page_ext_operations page_idle_ops;
+
+static inline bool page_is_young(struct page *page)
+{
+       return test_bit(PAGE_EXT_YOUNG, &lookup_page_ext(page)->flags);
+}
+
+static inline void set_page_young(struct page *page)
+{
+       set_bit(PAGE_EXT_YOUNG, &lookup_page_ext(page)->flags);
+}
+
+static inline bool test_and_clear_page_young(struct page *page)
+{
+       return test_and_clear_bit(PAGE_EXT_YOUNG,
+                                 &lookup_page_ext(page)->flags);
+}
+
+static inline bool page_is_idle(struct page *page)
+{
+       return test_bit(PAGE_EXT_IDLE, &lookup_page_ext(page)->flags);
+}
+
+static inline void set_page_idle(struct page *page)
+{
+       set_bit(PAGE_EXT_IDLE, &lookup_page_ext(page)->flags);
+}
+
+static inline void clear_page_idle(struct page *page)
+{
+       clear_bit(PAGE_EXT_IDLE, &lookup_page_ext(page)->flags);
+}
+#endif /* CONFIG_64BIT */
+
+#else /* !CONFIG_IDLE_PAGE_TRACKING */
+
+static inline bool page_is_young(struct page *page)
+{
+       return false;
+}
+
+static inline void set_page_young(struct page *page)
+{
+}
+
+static inline bool test_and_clear_page_young(struct page *page)
+{
+       return false;
+}
+
+static inline bool page_is_idle(struct page *page)
+{
+       return false;
+}
+
+static inline void set_page_idle(struct page *page)
+{
+}
+
+static inline void clear_page_idle(struct page *page)
+{
+}
+
+#endif /* CONFIG_IDLE_PAGE_TRACKING */
+
+#endif /* _LINUX_MM_PAGE_IDLE_H */
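
The new header provides the same six helpers whether the bits live in page flags (64-bit) or in page_ext (32-bit). A simplified sketch of the arm-and-test pattern they enable; real users also have to harvest the PTE accessed bits through an rmap walk, which is omitted here, and the function name is made up:

/* Sketch only: arm the idle bit, then check later whether the page was touched.
 * Assumes the caller already holds a reference on the page.
 */
static bool demo_page_stayed_idle(struct page *page)
{
        set_page_idle(page);                    /* arm the idle bit */

        /* ... let the workload run for the sampling interval ... */

        if (page_is_young(page) || !page_is_idle(page))
                return false;                   /* page was referenced */

        clear_page_idle(page);
        return true;
}
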
index 1a64733c48c741dff6d8bdfc34588c8c8f72f7fd..e90eb22de6286df6be1ed07ed8f37aa6ab7810b4 100644 (file)
@@ -1227,6 +1227,8 @@ int pci_set_vga_state(struct pci_dev *pdev, bool decode,
                dma_pool_create(name, &pdev->dev, size, align, allocation)
 #define        pci_pool_destroy(pool) dma_pool_destroy(pool)
 #define        pci_pool_alloc(pool, flags, handle) dma_pool_alloc(pool, flags, handle)
+#define        pci_pool_zalloc(pool, flags, handle) \
+               dma_pool_zalloc(pool, flags, handle)
 #define        pci_pool_free(pool, vaddr, addr) dma_pool_free(pool, vaddr, addr)
 
 struct msix_entry {
diff --git a/include/linux/platform_data/i2c-mux-reg.h b/include/linux/platform_data/i2c-mux-reg.h
new file mode 100644 (file)
index 0000000..c68712a
--- /dev/null
@@ -0,0 +1,44 @@
+/*
+ * I2C multiplexer using a single register
+ *
+ * Copyright 2015 Freescale Semiconductor
+ * York Sun <yorksun@freescale.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ */
+
+#ifndef __LINUX_PLATFORM_DATA_I2C_MUX_REG_H
+#define __LINUX_PLATFORM_DATA_I2C_MUX_REG_H
+
+/**
+ * struct i2c_mux_reg_platform_data - Platform-dependent data for i2c-mux-reg
+ * @parent: Parent I2C bus adapter number
+ * @base_nr: Base I2C bus number to number adapters from or zero for dynamic
+ * @values: Array of value for each channel
+ * @n_values: Number of multiplexer channels
+ * @little_endian: Indicating if the register is in little endian
+ * @write_only: Reading the register is not allowed by hardware
+ * @classes: Optional I2C auto-detection classes
+ * @idle: Value to write to mux when idle
+ * @idle_in_use: indicate if idle value is in use
+ * @reg: Virtual address of the register to switch channel
+ * @reg_size: register size in bytes
+ */
+struct i2c_mux_reg_platform_data {
+       int parent;
+       int base_nr;
+       const unsigned int *values;
+       int n_values;
+       bool little_endian;
+       bool write_only;
+       const unsigned int *classes;
+       u32 idle;
+       bool idle_in_use;
+       void __iomem *reg;
+       resource_size_t reg_size;
+};
+
+#endif /* __LINUX_PLATFORM_DATA_I2C_MUX_REG_H */
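
For reference, a hedged example of board code filling this platform data for a hypothetical four-channel mux; every name and value below is invented for illustration and does not come from this commit:

/* Sketch only: describe a register-based I2C mux behind adapter 0. */
static const unsigned int demo_mux_values[] = { 0, 1, 2, 3 };

static struct i2c_mux_reg_platform_data demo_mux_pdata = {
        .parent        = 0,                     /* upstream adapter number */
        .base_nr       = 0,                     /* let the core pick bus numbers */
        .values        = demo_mux_values,
        .n_values      = ARRAY_SIZE(demo_mux_values),
        .little_endian = true,
        .write_only    = false,
        .idle          = 0,
        .idle_in_use   = false,
        .reg           = NULL,                  /* filled in after ioremap() */
        .reg_size      = 1,                     /* one-byte select register */
};
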
index e1571efa3f2b28e01642e08f6709d7c7c8dc141a..95ccab3f454a9511a27acc7cf351eac52a5dbdb5 100644 (file)
@@ -45,5 +45,6 @@ struct esdhc_platform_data {
        int max_bus_width;
        bool support_vsel;
        unsigned int delay_line;
+       unsigned int tuning_step;       /* The delay cell steps in tuning procedure */
 };
 #endif /* __ASM_ARCH_IMX_ESDHC_H */
index cab7ba55bedb854294488a97c2bb299cde679d33..e817722ee3f018a72e9784e5c2269a3f51017b12 100644 (file)
@@ -34,6 +34,7 @@ bool dev_pm_opp_is_turbo(struct dev_pm_opp *opp);
 
 int dev_pm_opp_get_opp_count(struct device *dev);
 unsigned long dev_pm_opp_get_max_clock_latency(struct device *dev);
+struct dev_pm_opp *dev_pm_opp_get_suspend_opp(struct device *dev);
 
 struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev,
                                              unsigned long freq,
@@ -80,6 +81,11 @@ static inline unsigned long dev_pm_opp_get_max_clock_latency(struct device *dev)
        return 0;
 }
 
+static inline struct dev_pm_opp *dev_pm_opp_get_suspend_opp(struct device *dev)
+{
+       return NULL;
+}
+
 static inline struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev,
                                        unsigned long freq, bool available)
 {
index d2114045a6c43802c71b863692716dac83c126f2..85f810b339175f71367f6a06fe09a24f82f96f10 100644 (file)
 #define __PMEM_H__
 
 #include <linux/io.h>
+#include <linux/uio.h>
 
 #ifdef CONFIG_ARCH_HAS_PMEM_API
-#include <asm/cacheflush.h>
+#define ARCH_MEMREMAP_PMEM MEMREMAP_WB
+#include <asm/pmem.h>
 #else
+#define ARCH_MEMREMAP_PMEM MEMREMAP_WT
+/*
+ * These are simply here to enable compilation, all call sites gate
+ * calling these symbols with arch_has_pmem_api() and redirect to the
+ * implementation in asm/pmem.h.
+ */
+static inline bool __arch_has_wmb_pmem(void)
+{
+       return false;
+}
+
 static inline void arch_wmb_pmem(void)
 {
        BUG();
 }
 
-static inline bool __arch_has_wmb_pmem(void)
+static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src,
+               size_t n)
 {
-       return false;
+       BUG();
 }
 
-static inline void __pmem *arch_memremap_pmem(resource_size_t offset,
-               unsigned long size)
+static inline size_t arch_copy_from_iter_pmem(void __pmem *addr, size_t bytes,
+               struct iov_iter *i)
 {
-       return NULL;
+       BUG();
+       return 0;
 }
 
-static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src,
-               size_t n)
+static inline void arch_clear_pmem(void __pmem *addr, size_t size)
 {
        BUG();
 }
@@ -43,18 +57,22 @@ static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src,
 
 /*
  * Architectures that define ARCH_HAS_PMEM_API must provide
- * implementations for arch_memremap_pmem(), arch_memcpy_to_pmem(),
- * arch_wmb_pmem(), and __arch_has_wmb_pmem().
+ * implementations for arch_memcpy_to_pmem(), arch_wmb_pmem(),
+ * arch_copy_from_iter_pmem(), arch_clear_pmem() and arch_has_wmb_pmem().
  */
-
 static inline void memcpy_from_pmem(void *dst, void __pmem const *src, size_t size)
 {
        memcpy(dst, (void __force const *) src, size);
 }
 
-static inline void memunmap_pmem(void __pmem *addr)
+static inline void memunmap_pmem(struct device *dev, void __pmem *addr)
+{
+       devm_memunmap(dev, (void __force *) addr);
+}
+
+static inline bool arch_has_pmem_api(void)
 {
-       iounmap((void __force __iomem *) addr);
+       return IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API);
 }
 
 /**
@@ -68,14 +86,7 @@ static inline void memunmap_pmem(void __pmem *addr)
  */
 static inline bool arch_has_wmb_pmem(void)
 {
-       if (IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API))
-               return __arch_has_wmb_pmem();
-       return false;
-}
-
-static inline bool arch_has_pmem_api(void)
-{
-       return IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && arch_has_wmb_pmem();
+       return arch_has_pmem_api() && __arch_has_wmb_pmem();
 }
 
 /*
@@ -85,16 +96,24 @@ static inline bool arch_has_pmem_api(void)
  * default_memremap_pmem + default_memcpy_to_pmem is sufficient for
  * making data durable relative to i/o completion.
  */
-static void default_memcpy_to_pmem(void __pmem *dst, const void *src,
+static inline void default_memcpy_to_pmem(void __pmem *dst, const void *src,
                size_t size)
 {
        memcpy((void __force *) dst, src, size);
 }
 
-static void __pmem *default_memremap_pmem(resource_size_t offset,
-               unsigned long size)
+static inline size_t default_copy_from_iter_pmem(void __pmem *addr,
+               size_t bytes, struct iov_iter *i)
+{
+       return copy_from_iter_nocache((void __force *)addr, bytes, i);
+}
+
+static inline void default_clear_pmem(void __pmem *addr, size_t size)
 {
-       return (void __pmem __force *)ioremap_wt(offset, size);
+       if (size == PAGE_SIZE && ((unsigned long)addr & ~PAGE_MASK) == 0)
+               clear_page((void __force *)addr);
+       else
+               memset((void __force *)addr, 0, size);
 }
 
 /**
@@ -109,12 +128,11 @@ static void __pmem *default_memremap_pmem(resource_size_t offset,
  * wmb_pmem() arrange for the data to be written through the
  * cache to persistent media.
  */
-static inline void __pmem *memremap_pmem(resource_size_t offset,
-               unsigned long size)
+static inline void __pmem *memremap_pmem(struct device *dev,
+               resource_size_t offset, unsigned long size)
 {
-       if (arch_has_pmem_api())
-               return arch_memremap_pmem(offset, size);
-       return default_memremap_pmem(offset, size);
+       return (void __pmem *) devm_memremap(dev, offset, size,
+                       ARCH_MEMREMAP_PMEM);
 }
 
 /**
@@ -146,7 +164,42 @@ static inline void memcpy_to_pmem(void __pmem *dst, const void *src, size_t n)
  */
 static inline void wmb_pmem(void)
 {
-       if (arch_has_pmem_api())
+       if (arch_has_wmb_pmem())
                arch_wmb_pmem();
+       else
+               wmb();
+}
+
+/**
+ * copy_from_iter_pmem - copy data from an iterator to PMEM
+ * @addr:      PMEM destination address
+ * @bytes:     number of bytes to copy
+ * @i:         iterator with source data
+ *
+ * Copy data from the iterator 'i' to the PMEM buffer starting at 'addr'.
+ * This function requires explicit ordering with a wmb_pmem() call.
+ */
+static inline size_t copy_from_iter_pmem(void __pmem *addr, size_t bytes,
+               struct iov_iter *i)
+{
+       if (arch_has_pmem_api())
+               return arch_copy_from_iter_pmem(addr, bytes, i);
+       return default_copy_from_iter_pmem(addr, bytes, i);
+}
+
+/**
+ * clear_pmem - zero a PMEM memory range
+ * @addr:      virtual start address
+ * @size:      number of bytes to zero
+ *
+ * Write zeros into the memory range starting at 'addr' for 'size' bytes.
+ * This function requires explicit ordering with a wmb_pmem() call.
+ */
+static inline void clear_pmem(void __pmem *addr, size_t size)
+{
+       if (arch_has_pmem_api())
+               arch_clear_pmem(addr, size);
+       else
+               default_clear_pmem(addr, size);
 }
 #endif /* __PMEM_H__ */
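
Taken together, the reworked helpers split mapping (now devm-managed via memremap_pmem()), copying and ordering into separate steps. A small sketch of a durable write under those assumptions; error handling is trimmed and the function name is illustrative:

/* Sketch only: map a persistent-memory range, write to it, then fence. */
static int demo_pmem_store(struct device *dev, resource_size_t offset,
                           const void *buf, size_t len)
{
        void __pmem *dst = memremap_pmem(dev, offset, len);

        if (!dst)
                return -ENOMEM;

        memcpy_to_pmem(dst, buf, len);  /* arch implementation or fallback copy */
        wmb_pmem();                     /* order the writes so they are durable */

        return 0;                       /* mapping is devm-managed, no unmap here */
}
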
index 2110a81c5e2afaab47ec5cb107cf17503d731317..317e16de09e508ed64b87dae6c5006b1efcb129d 100644 (file)
@@ -19,8 +19,8 @@
  * under normal circumstances, used to verify that nobody uses
  * non-initialized list entries.
  */
-#define LIST_POISON1  ((void *) 0x00100100 + POISON_POINTER_DELTA)
-#define LIST_POISON2  ((void *) 0x00200200 + POISON_POINTER_DELTA)
+#define LIST_POISON1  ((void *) 0x100 + POISON_POINTER_DELTA)
+#define LIST_POISON2  ((void *) 0x200 + POISON_POINTER_DELTA)
 
 /********** include/linux/timer.h **********/
 /*
 #define ATM_POISON_FREE                0x12
 #define ATM_POISON             0xdeadbeef
 
-/********** net/ **********/
-#define NEIGHBOR_DEAD          0xdeadbeef
-#define NETFILTER_LINK_POISON  0xdead57ac
-
 /********** kernel/mutexes **********/
 #define MUTEX_DEBUG_INIT       0x11
 #define MUTEX_DEBUG_FREE       0x22
@@ -83,7 +79,4 @@
 /********** security/ **********/
 #define KEY_DESTROY            0xbd
 
-/********** sound/oss/ **********/
-#define OSS_POISON_FREE                0xAB
-
 #endif
index a6298b27ac99d9197ccd4dac6c04b09f2c5011da..9729565c25ff19accc05ca6419bbc6f50f8cdb53 100644 (file)
@@ -404,10 +404,10 @@ do {                                                                      \
        static DEFINE_RATELIMIT_STATE(_rs,                              \
                                      DEFAULT_RATELIMIT_INTERVAL,       \
                                      DEFAULT_RATELIMIT_BURST);         \
-       DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt);                 \
+       DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, pr_fmt(fmt));         \
        if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT) &&        \
            __ratelimit(&_rs))                                          \
-               __dynamic_pr_debug(&descriptor, fmt, ##__VA_ARGS__);    \
+               __dynamic_pr_debug(&descriptor, pr_fmt(fmt), ##__VA_ARGS__);    \
 } while (0)
 #elif defined(DEBUG)
 #define pr_debug_ratelimited(fmt, ...)                                 \
@@ -456,11 +456,17 @@ static inline void print_hex_dump_bytes(const char *prefix_str, int prefix_type,
                             groupsize, buf, len, ascii)        \
        dynamic_hex_dump(prefix_str, prefix_type, rowsize,      \
                         groupsize, buf, len, ascii)
-#else
+#elif defined(DEBUG)
 #define print_hex_dump_debug(prefix_str, prefix_type, rowsize,         \
                             groupsize, buf, len, ascii)                \
        print_hex_dump(KERN_DEBUG, prefix_str, prefix_type, rowsize,    \
                       groupsize, buf, len, ascii)
-#endif /* defined(CONFIG_DYNAMIC_DEBUG) */
+#else
+static inline void print_hex_dump_debug(const char *prefix_str, int prefix_type,
+                                       int rowsize, int groupsize,
+                                       const void *buf, size_t len, bool ascii)
+{
+}
+#endif
 
 #endif
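
The first hunk above makes the dynamic-debug branch of pr_debug_ratelimited() honour a driver's pr_fmt() prefix, matching the other branches. A minimal illustration, with a made-up prefix string:

/* Sketch only: pr_fmt must be defined before the printk headers are included. */
#define pr_fmt(fmt) "demo: " fmt

#include <linux/printk.h>

static void demo_heartbeat(void)
{
        /* With the fix, the dynamic-debug path also logs "demo: heartbeat". */
        pr_debug_ratelimited("heartbeat\n");
}
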
index 987a73a40ef8465ee8290f1ab628bf4c70ae58f8..061265f9287676afce057892572222a477a47505 100644 (file)
@@ -34,6 +34,7 @@
 #define PT_TRACE_SECCOMP       PT_EVENT_FLAG(PTRACE_EVENT_SECCOMP)
 
 #define PT_EXITKILL            (PTRACE_O_EXITKILL << PT_OPT_FLAG_SHIFT)
+#define PT_SUSPEND_SECCOMP     (PTRACE_O_SUSPEND_SECCOMP << PT_OPT_FLAG_SHIFT)
 
 /* single stepping state bits (used on ARM and PA-RISC) */
 #define PT_SINGLESTEP_BIT      31
index 36262d08a9dad8aca7eaea494e30348c0b26cb5b..d681f6875aef1a7a8b22c9453b8f9019f9fd45eb 100644 (file)
@@ -79,26 +79,43 @@ enum {
        PWMF_EXPORTED = 1 << 2,
 };
 
+/**
+ * struct pwm_device - PWM channel object
+ * @label: name of the PWM device
+ * @flags: flags associated with the PWM device
+ * @hwpwm: per-chip relative index of the PWM device
+ * @pwm: global index of the PWM device
+ * @chip: PWM chip providing this PWM device
+ * @chip_data: chip-private data associated with the PWM device
+ * @period: period of the PWM signal (in nanoseconds)
+ * @duty_cycle: duty cycle of the PWM signal (in nanoseconds)
+ * @polarity: polarity of the PWM signal
+ */
 struct pwm_device {
-       const char              *label;
-       unsigned long           flags;
-       unsigned int            hwpwm;
-       unsigned int            pwm;
-       struct pwm_chip         *chip;
-       void                    *chip_data;
-
-       unsigned int            period;         /* in nanoseconds */
-       unsigned int            duty_cycle;     /* in nanoseconds */
-       enum pwm_polarity       polarity;
+       const char *label;
+       unsigned long flags;
+       unsigned int hwpwm;
+       unsigned int pwm;
+       struct pwm_chip *chip;
+       void *chip_data;
+
+       unsigned int period;
+       unsigned int duty_cycle;
+       enum pwm_polarity polarity;
 };
 
+static inline bool pwm_is_enabled(const struct pwm_device *pwm)
+{
+       return test_bit(PWMF_ENABLED, &pwm->flags);
+}
+
 static inline void pwm_set_period(struct pwm_device *pwm, unsigned int period)
 {
        if (pwm)
                pwm->period = period;
 }
 
-static inline unsigned int pwm_get_period(struct pwm_device *pwm)
+static inline unsigned int pwm_get_period(const struct pwm_device *pwm)
 {
        return pwm ? pwm->period : 0;
 }
@@ -109,7 +126,7 @@ static inline void pwm_set_duty_cycle(struct pwm_device *pwm, unsigned int duty)
                pwm->duty_cycle = duty;
 }
 
-static inline unsigned int pwm_get_duty_cycle(struct pwm_device *pwm)
+static inline unsigned int pwm_get_duty_cycle(const struct pwm_device *pwm)
 {
        return pwm ? pwm->duty_cycle : 0;
 }
@@ -119,6 +136,11 @@ static inline unsigned int pwm_get_duty_cycle(struct pwm_device *pwm)
  */
 int pwm_set_polarity(struct pwm_device *pwm, enum pwm_polarity polarity);
 
+static inline enum pwm_polarity pwm_get_polarity(const struct pwm_device *pwm)
+{
+       return pwm ? pwm->polarity : PWM_POLARITY_NORMAL;
+}
+
 /**
  * struct pwm_ops - PWM controller operations
  * @request: optional hook for requesting a PWM
@@ -131,25 +153,18 @@ int pwm_set_polarity(struct pwm_device *pwm, enum pwm_polarity polarity);
  * @owner: helps prevent removal of modules exporting active PWMs
  */
 struct pwm_ops {
-       int                     (*request)(struct pwm_chip *chip,
-                                          struct pwm_device *pwm);
-       void                    (*free)(struct pwm_chip *chip,
-                                       struct pwm_device *pwm);
-       int                     (*config)(struct pwm_chip *chip,
-                                         struct pwm_device *pwm,
-                                         int duty_ns, int period_ns);
-       int                     (*set_polarity)(struct pwm_chip *chip,
-                                         struct pwm_device *pwm,
-                                         enum pwm_polarity polarity);
-       int                     (*enable)(struct pwm_chip *chip,
-                                         struct pwm_device *pwm);
-       void                    (*disable)(struct pwm_chip *chip,
-                                          struct pwm_device *pwm);
+       int (*request)(struct pwm_chip *chip, struct pwm_device *pwm);
+       void (*free)(struct pwm_chip *chip, struct pwm_device *pwm);
+       int (*config)(struct pwm_chip *chip, struct pwm_device *pwm,
+                     int duty_ns, int period_ns);
+       int (*set_polarity)(struct pwm_chip *chip, struct pwm_device *pwm,
+                           enum pwm_polarity polarity);
+       int (*enable)(struct pwm_chip *chip, struct pwm_device *pwm);
+       void (*disable)(struct pwm_chip *chip, struct pwm_device *pwm);
 #ifdef CONFIG_DEBUG_FS
-       void                    (*dbg_show)(struct pwm_chip *chip,
-                                           struct seq_file *s);
+       void (*dbg_show)(struct pwm_chip *chip, struct seq_file *s);
 #endif
-       struct module           *owner;
+       struct module *owner;
 };
 
 /**
@@ -160,22 +175,24 @@ struct pwm_ops {
  * @base: number of first PWM controlled by this chip
  * @npwm: number of PWMs controlled by this chip
  * @pwms: array of PWM devices allocated by the framework
+ * @of_xlate: request a PWM device given a device tree PWM specifier
+ * @of_pwm_n_cells: number of cells expected in the device tree PWM specifier
  * @can_sleep: must be true if the .config(), .enable() or .disable()
  *             operations may sleep
  */
 struct pwm_chip {
-       struct device           *dev;
-       struct list_head        list;
-       const struct pwm_ops    *ops;
-       int                     base;
-       unsigned int            npwm;
-
-       struct pwm_device       *pwms;
-
-       struct pwm_device *     (*of_xlate)(struct pwm_chip *pc,
-                                           const struct of_phandle_args *args);
-       unsigned int            of_pwm_n_cells;
-       bool                    can_sleep;
+       struct device *dev;
+       struct list_head list;
+       const struct pwm_ops *ops;
+       int base;
+       unsigned int npwm;
+
+       struct pwm_device *pwms;
+
+       struct pwm_device * (*of_xlate)(struct pwm_chip *pc,
+                                       const struct of_phandle_args *args);
+       unsigned int of_pwm_n_cells;
+       bool can_sleep;
 };
 
 #if IS_ENABLED(CONFIG_PWM)
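
The new inline accessors above let callers read back PWM state without poking at pwm->flags directly. A short usage sketch; the device is assumed to come from pwm_get() or a similar lookup, and the function is illustrative:

/* Sketch only: report a PWM device's state through the new accessors. */
static void demo_report_pwm(const struct pwm_device *pwm)
{
        pr_info("period=%uns duty=%uns polarity=%s state=%s\n",
                pwm_get_period(pwm),
                pwm_get_duty_cycle(pwm),
                pwm_get_polarity(pwm) == PWM_POLARITY_NORMAL ? "normal"
                                                             : "inversed",
                pwm_is_enabled(pwm) ? "enabled" : "disabled");
}
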
index 4a6759098769c6ad4902612bf3b2647fe0898f60..8fc0bfd8edc4434fc79fda8591323d6be41f0645 100644 (file)
@@ -17,6 +17,7 @@
 #include <linux/rbtree.h>
 #include <linux/err.h>
 #include <linux/bug.h>
+#include <linux/lockdep.h>
 
 struct module;
 struct device;
@@ -51,14 +52,17 @@ struct reg_default {
 };
 
 /**
- * Register/value pairs for sequences of writes
+ * Register/value pairs for sequences of writes with an optional delay in
+ * microseconds to be applied after each write.
  *
  * @reg: Register address.
  * @def: Register value.
+ * @delay_us: Delay to be applied after the register write in microseconds
  */
 struct reg_sequence {
        unsigned int reg;
        unsigned int def;
+       unsigned int delay_us;
 };
 
 #ifdef CONFIG_REGMAP
@@ -307,8 +311,12 @@ typedef void (*regmap_hw_free_context)(void *context);
  *                if not implemented  on a given device.
  * @async_write: Write operation which completes asynchronously, optional and
  *               must serialise with respect to non-async I/O.
+ * @reg_write: Write a single register value to the given register address. This
+ *             write operation has to complete when returning from the function.
  * @read: Read operation.  Data is returned in the buffer used to transmit
  *         data.
+ * @reg_read: Read a single register value from a given register address.
+ * @free_context: Free context.
  * @async_alloc: Allocate a regmap_async() structure.
  * @read_flag_mask: Mask to be set in the top byte of the register when doing
  *                  a read.
@@ -318,7 +326,8 @@ typedef void (*regmap_hw_free_context)(void *context);
  * @val_format_endian_default: Default endianness for formatted register
  *     values. Used when the regmap_config specifies DEFAULT. If this is
  *     DEFAULT, BIG is assumed.
- * @async_size: Size of struct used for async work.
+ * @max_raw_read: Max raw read size that can be used on the bus.
+ * @max_raw_write: Max raw write size that can be used on the bus.
  */
 struct regmap_bus {
        bool fast_io;
@@ -333,47 +342,186 @@ struct regmap_bus {
        u8 read_flag_mask;
        enum regmap_endian reg_format_endian_default;
        enum regmap_endian val_format_endian_default;
+       size_t max_raw_read;
+       size_t max_raw_write;
 };
 
-struct regmap *regmap_init(struct device *dev,
-                          const struct regmap_bus *bus,
-                          void *bus_context,
-                          const struct regmap_config *config);
+/*
+ * __regmap_init functions.
+ *
+ * These functions take a lock key and name parameter, and should not be called
+ * directly. Instead, use the regmap_init macros that generate a key and name
+ * for each call.
+ */
+struct regmap *__regmap_init(struct device *dev,
+                            const struct regmap_bus *bus,
+                            void *bus_context,
+                            const struct regmap_config *config,
+                            struct lock_class_key *lock_key,
+                            const char *lock_name);
+struct regmap *__regmap_init_i2c(struct i2c_client *i2c,
+                                const struct regmap_config *config,
+                                struct lock_class_key *lock_key,
+                                const char *lock_name);
+struct regmap *__regmap_init_spi(struct spi_device *dev,
+                                const struct regmap_config *config,
+                                struct lock_class_key *lock_key,
+                                const char *lock_name);
+struct regmap *__regmap_init_spmi_base(struct spmi_device *dev,
+                                      const struct regmap_config *config,
+                                      struct lock_class_key *lock_key,
+                                      const char *lock_name);
+struct regmap *__regmap_init_spmi_ext(struct spmi_device *dev,
+                                     const struct regmap_config *config,
+                                     struct lock_class_key *lock_key,
+                                     const char *lock_name);
+struct regmap *__regmap_init_mmio_clk(struct device *dev, const char *clk_id,
+                                     void __iomem *regs,
+                                     const struct regmap_config *config,
+                                     struct lock_class_key *lock_key,
+                                     const char *lock_name);
+struct regmap *__regmap_init_ac97(struct snd_ac97 *ac97,
+                                 const struct regmap_config *config,
+                                 struct lock_class_key *lock_key,
+                                 const char *lock_name);
+
+struct regmap *__devm_regmap_init(struct device *dev,
+                                 const struct regmap_bus *bus,
+                                 void *bus_context,
+                                 const struct regmap_config *config,
+                                 struct lock_class_key *lock_key,
+                                 const char *lock_name);
+struct regmap *__devm_regmap_init_i2c(struct i2c_client *i2c,
+                                     const struct regmap_config *config,
+                                     struct lock_class_key *lock_key,
+                                     const char *lock_name);
+struct regmap *__devm_regmap_init_spi(struct spi_device *dev,
+                                     const struct regmap_config *config,
+                                     struct lock_class_key *lock_key,
+                                     const char *lock_name);
+struct regmap *__devm_regmap_init_spmi_base(struct spmi_device *dev,
+                                           const struct regmap_config *config,
+                                           struct lock_class_key *lock_key,
+                                           const char *lock_name);
+struct regmap *__devm_regmap_init_spmi_ext(struct spmi_device *dev,
+                                          const struct regmap_config *config,
+                                          struct lock_class_key *lock_key,
+                                          const char *lock_name);
+struct regmap *__devm_regmap_init_mmio_clk(struct device *dev,
+                                          const char *clk_id,
+                                          void __iomem *regs,
+                                          const struct regmap_config *config,
+                                          struct lock_class_key *lock_key,
+                                          const char *lock_name);
+struct regmap *__devm_regmap_init_ac97(struct snd_ac97 *ac97,
+                                      const struct regmap_config *config,
+                                      struct lock_class_key *lock_key,
+                                      const char *lock_name);
+
+/*
+ * Wrapper for regmap_init macros to include a unique lockdep key and name
+ * for each call. No-op if CONFIG_LOCKDEP is not set.
+ *
+ * @fn: Real function to call (in the form __[*_]regmap_init[_*])
+ * @name: Config variable name (#config in the calling macro)
+ **/
+#ifdef CONFIG_LOCKDEP
+#define __regmap_lockdep_wrapper(fn, name, ...)                                \
+(                                                                      \
+       ({                                                              \
+               static struct lock_class_key _key;                      \
+               fn(__VA_ARGS__, &_key,                                  \
+                       KBUILD_BASENAME ":"                             \
+                       __stringify(__LINE__) ":"                       \
+                       "(" name ")->lock");                            \
+       })                                                              \
+)
+#else
+#define __regmap_lockdep_wrapper(fn, name, ...) fn(__VA_ARGS__, NULL, NULL)
+#endif
+
+/**
+ * regmap_init(): Initialise register map
+ *
+ * @dev: Device that will be interacted with
+ * @bus: Bus-specific callbacks to use with device
+ * @bus_context: Data passed to bus-specific callbacks
+ * @config: Configuration for register map
+ *
+ * The return value will be an ERR_PTR() on error or a valid pointer to
+ * a struct regmap.  This function should generally not be called
+ * directly, it should be called by bus-specific init functions.
+ */
+#define regmap_init(dev, bus, bus_context, config)                     \
+       __regmap_lockdep_wrapper(__regmap_init, #config,                \
+                               dev, bus, bus_context, config)
 int regmap_attach_dev(struct device *dev, struct regmap *map,
-                                const struct regmap_config *config);
-struct regmap *regmap_init_i2c(struct i2c_client *i2c,
-                              const struct regmap_config *config);
-struct regmap *regmap_init_spi(struct spi_device *dev,
-                              const struct regmap_config *config);
-struct regmap *regmap_init_spmi_base(struct spmi_device *dev,
-                                    const struct regmap_config *config);
-struct regmap *regmap_init_spmi_ext(struct spmi_device *dev,
-                                   const struct regmap_config *config);
-struct regmap *regmap_init_mmio_clk(struct device *dev, const char *clk_id,
-                                   void __iomem *regs,
-                                   const struct regmap_config *config);
-struct regmap *regmap_init_ac97(struct snd_ac97 *ac97,
-                               const struct regmap_config *config);
-
-struct regmap *devm_regmap_init(struct device *dev,
-                               const struct regmap_bus *bus,
-                               void *bus_context,
-                               const struct regmap_config *config);
-struct regmap *devm_regmap_init_i2c(struct i2c_client *i2c,
-                                   const struct regmap_config *config);
-struct regmap *devm_regmap_init_spi(struct spi_device *dev,
-                                   const struct regmap_config *config);
-struct regmap *devm_regmap_init_spmi_base(struct spmi_device *dev,
-                                         const struct regmap_config *config);
-struct regmap *devm_regmap_init_spmi_ext(struct spmi_device *dev,
-                                        const struct regmap_config *config);
-struct regmap *devm_regmap_init_mmio_clk(struct device *dev, const char *clk_id,
-                                        void __iomem *regs,
-                                        const struct regmap_config *config);
-struct regmap *devm_regmap_init_ac97(struct snd_ac97 *ac97,
-                                    const struct regmap_config *config);
+                     const struct regmap_config *config);
 
-bool regmap_ac97_default_volatile(struct device *dev, unsigned int reg);
+/**
+ * regmap_init_i2c(): Initialise register map
+ *
+ * @i2c: Device that will be interacted with
+ * @config: Configuration for register map
+ *
+ * The return value will be an ERR_PTR() on error or a valid pointer to
+ * a struct regmap.
+ */
+#define regmap_init_i2c(i2c, config)                                   \
+       __regmap_lockdep_wrapper(__regmap_init_i2c, #config,            \
+                               i2c, config)
+
+/**
+ * regmap_init_spi(): Initialise register map
+ *
+ * @spi: Device that will be interacted with
+ * @config: Configuration for register map
+ *
+ * The return value will be an ERR_PTR() on error or a valid pointer to
+ * a struct regmap.
+ */
+#define regmap_init_spi(dev, config)                                   \
+       __regmap_lockdep_wrapper(__regmap_init_spi, #config,            \
+                               dev, config)
+
+/**
+ * regmap_init_spmi_base(): Create regmap for the Base register space
+ * @sdev:      SPMI device that will be interacted with
+ * @config:    Configuration for register map
+ *
+ * The return value will be an ERR_PTR() on error or a valid pointer to
+ * a struct regmap.
+ */
+#define regmap_init_spmi_base(dev, config)                             \
+       __regmap_lockdep_wrapper(__regmap_init_spmi_base, #config,      \
+                               dev, config)
+
+/**
+ * regmap_init_spmi_ext(): Create regmap for Ext register space
+ * @sdev:      Device that will be interacted with
+ * @config:    Configuration for register map
+ *
+ * The return value will be an ERR_PTR() on error or a valid pointer to
+ * a struct regmap.
+ */
+#define regmap_init_spmi_ext(dev, config)                              \
+       __regmap_lockdep_wrapper(__regmap_init_spmi_ext, #config,       \
+                               dev, config)
+
+/**
+ * regmap_init_mmio_clk(): Initialise register map with register clock
+ *
+ * @dev: Device that will be interacted with
+ * @clk_id: register clock consumer ID
+ * @regs: Pointer to memory-mapped IO region
+ * @config: Configuration for register map
+ *
+ * The return value will be an ERR_PTR() on error or a valid pointer to
+ * a struct regmap.
+ */
+#define regmap_init_mmio_clk(dev, clk_id, regs, config)                        \
+       __regmap_lockdep_wrapper(__regmap_init_mmio_clk, #config,       \
+                               dev, clk_id, regs, config)
 
 /**
  * regmap_init_mmio(): Initialise register map
@@ -385,12 +533,109 @@ bool regmap_ac97_default_volatile(struct device *dev, unsigned int reg);
  * The return value will be an ERR_PTR() on error or a valid pointer to
  * a struct regmap.
  */
-static inline struct regmap *regmap_init_mmio(struct device *dev,
-                                       void __iomem *regs,
-                                       const struct regmap_config *config)
-{
-       return regmap_init_mmio_clk(dev, NULL, regs, config);
-}
+#define regmap_init_mmio(dev, regs, config)            \
+       regmap_init_mmio_clk(dev, NULL, regs, config)
+
+/**
+ * regmap_init_ac97(): Initialise AC'97 register map
+ *
+ * @ac97: Device that will be interacted with
+ * @config: Configuration for register map
+ *
+ * The return value will be an ERR_PTR() on error or a valid pointer to
+ * a struct regmap.
+ */
+#define regmap_init_ac97(ac97, config)                                 \
+       __regmap_lockdep_wrapper(__regmap_init_ac97, #config,           \
+                               ac97, config)
+bool regmap_ac97_default_volatile(struct device *dev, unsigned int reg);
+
+/**
+ * devm_regmap_init(): Initialise managed register map
+ *
+ * @dev: Device that will be interacted with
+ * @bus: Bus-specific callbacks to use with device
+ * @bus_context: Data passed to bus-specific callbacks
+ * @config: Configuration for register map
+ *
+ * The return value will be an ERR_PTR() on error or a valid pointer
+ * to a struct regmap.  This function should generally not be called
+ * directly, it should be called by bus-specific init functions.  The
+ * map will be automatically freed by the device management code.
+ */
+#define devm_regmap_init(dev, bus, bus_context, config)                        \
+       __regmap_lockdep_wrapper(__devm_regmap_init, #config,           \
+                               dev, bus, bus_context, config)
+
+/**
+ * devm_regmap_init_i2c(): Initialise managed register map
+ *
+ * @i2c: Device that will be interacted with
+ * @config: Configuration for register map
+ *
+ * The return value will be an ERR_PTR() on error or a valid pointer
+ * to a struct regmap.  The regmap will be automatically freed by the
+ * device management code.
+ */
+#define devm_regmap_init_i2c(i2c, config)                              \
+       __regmap_lockdep_wrapper(__devm_regmap_init_i2c, #config,       \
+                               i2c, config)
+
+/**
+ * devm_regmap_init_spi(): Initialise register map
+ *
+ * @spi: Device that will be interacted with
+ * @config: Configuration for register map
+ *
+ * The return value will be an ERR_PTR() on error or a valid pointer
+ * to a struct regmap.  The map will be automatically freed by the
+ * device management code.
+ */
+#define devm_regmap_init_spi(dev, config)                              \
+       __regmap_lockdep_wrapper(__devm_regmap_init_spi, #config,       \
+                               dev, config)
+
+/**
+ * devm_regmap_init_spmi_base(): Create managed regmap for Base register space
+ * @sdev:      SPMI device that will be interacted with
+ * @config:    Configuration for register map
+ *
+ * The return value will be an ERR_PTR() on error or a valid pointer
+ * to a struct regmap.  The regmap will be automatically freed by the
+ * device management code.
+ */
+#define devm_regmap_init_spmi_base(dev, config)                                \
+       __regmap_lockdep_wrapper(__devm_regmap_init_spmi_base, #config, \
+                               dev, config)
+
+/**
+ * devm_regmap_init_spmi_ext(): Create managed regmap for Ext register space
+ * @sdev:      SPMI device that will be interacted with
+ * @config:    Configuration for register map
+ *
+ * The return value will be an ERR_PTR() on error or a valid pointer
+ * to a struct regmap.  The regmap will be automatically freed by the
+ * device management code.
+ */
+#define devm_regmap_init_spmi_ext(dev, config)                         \
+       __regmap_lockdep_wrapper(__devm_regmap_init_spmi_ext, #config,  \
+                               dev, config)
+
+/**
+ * devm_regmap_init_mmio_clk(): Initialise managed register map with clock
+ *
+ * @dev: Device that will be interacted with
+ * @clk_id: register clock consumer ID
+ * @regs: Pointer to memory-mapped IO region
+ * @config: Configuration for register map
+ *
+ * The return value will be an ERR_PTR() on error or a valid pointer
+ * to a struct regmap.  The regmap will be automatically freed by the
+ * device management code.
+ */
+#define devm_regmap_init_mmio_clk(dev, clk_id, regs, config)           \
+       __regmap_lockdep_wrapper(__devm_regmap_init_mmio_clk, #config,  \
+                               dev, clk_id, regs, config)
 
 /**
  * devm_regmap_init_mmio(): Initialise managed register map
@@ -403,12 +648,22 @@ static inline struct regmap *regmap_init_mmio(struct device *dev,
  * to a struct regmap.  The regmap will be automatically freed by the
  * device management code.
  */
-static inline struct regmap *devm_regmap_init_mmio(struct device *dev,
-                                       void __iomem *regs,
-                                       const struct regmap_config *config)
-{
-       return devm_regmap_init_mmio_clk(dev, NULL, regs, config);
-}
+#define devm_regmap_init_mmio(dev, regs, config)               \
+       devm_regmap_init_mmio_clk(dev, NULL, regs, config)
+
+/**
+ * devm_regmap_init_ac97(): Initialise AC'97 register map
+ *
+ * @ac97: Device that will be interacted with
+ * @config: Configuration for register map
+ *
+ * The return value will be an ERR_PTR() on error or a valid pointer
+ * to a struct regmap.  The regmap will be automatically freed by the
+ * device management code.
+ */
+#define devm_regmap_init_ac97(ac97, config)                            \
+       __regmap_lockdep_wrapper(__devm_regmap_init_ac97, #config,      \
+                               ac97, config)
 
 void regmap_exit(struct regmap *map);
 int regmap_reinit_cache(struct regmap *map,
@@ -450,6 +705,8 @@ int regmap_get_max_register(struct regmap *map);
 int regmap_get_reg_stride(struct regmap *map);
 int regmap_async_complete(struct regmap *map);
 bool regmap_can_raw_write(struct regmap *map);
+size_t regmap_get_raw_read_max(struct regmap *map);
+size_t regmap_get_raw_write_max(struct regmap *map);
 
 int regcache_sync(struct regmap *map);
 int regcache_sync_region(struct regmap *map, unsigned int min,
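
Despite the large textual churn, call sites keep exactly the same spelling; the macros above only stamp each map with a per-callsite lockdep class and name. A hedged example for the I2C case, with illustrative config values and register addresses:

/* Sketch only: the call site looks identical before and after the rework. */
static const struct regmap_config demo_regmap_config = {
        .reg_bits     = 8,
        .val_bits     = 8,
        .max_register = 0x7f,
};

static int demo_setup(struct i2c_client *i2c)
{
        struct regmap *map = devm_regmap_init_i2c(i2c, &demo_regmap_config);

        if (IS_ERR(map))
                return PTR_ERR(map);

        return regmap_write(map, 0x00, 0x01);   /* e.g. a soft-reset register */
}
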
index da5602bd77d75996f78114b7369b409ea52f007d..7f65f9cff951033607b0beba1c1ab0df70ee60e9 100644 (file)
@@ -74,6 +74,20 @@ static inline int device_reset_optional(struct device *dev)
        return -ENOSYS;
 }
 
+static inline struct reset_control *__must_check reset_control_get(
+                                       struct device *dev, const char *id)
+{
+       WARN_ON(1);
+       return ERR_PTR(-EINVAL);
+}
+
+static inline struct reset_control *__must_check devm_reset_control_get(
+                                       struct device *dev, const char *id)
+{
+       WARN_ON(1);
+       return ERR_PTR(-EINVAL);
+}
+
 static inline struct reset_control *reset_control_get_optional(
                                        struct device *dev, const char *id)
 {
index a19ddacdac30ae8d180c8b3358b18564bfbff564..f4265039a94c8f655a6a2d340bfbda547eb97704 100644 (file)
@@ -78,7 +78,7 @@ static inline long prctl_set_seccomp(unsigned long arg2, char __user *arg3)
 
 static inline int seccomp_mode(struct seccomp *s)
 {
-       return 0;
+       return SECCOMP_MODE_DISABLED;
 }
 #endif /* CONFIG_SECCOMP */
 
index d4c7271382cb310edc3d2bf4ffd5ef997e5bef87..dde00defbaa52bd2ae380c76e7df305389aa5b8f 100644 (file)
@@ -114,13 +114,22 @@ int seq_open(struct file *, const struct seq_operations *);
 ssize_t seq_read(struct file *, char __user *, size_t, loff_t *);
 loff_t seq_lseek(struct file *, loff_t, int);
 int seq_release(struct inode *, struct file *);
-int seq_escape(struct seq_file *, const char *, const char *);
-int seq_putc(struct seq_file *m, char c);
-int seq_puts(struct seq_file *m, const char *s);
 int seq_write(struct seq_file *seq, const void *data, size_t len);
 
-__printf(2, 3) int seq_printf(struct seq_file *, const char *, ...);
-__printf(2, 0) int seq_vprintf(struct seq_file *, const char *, va_list args);
+__printf(2, 0)
+void seq_vprintf(struct seq_file *m, const char *fmt, va_list args);
+__printf(2, 3)
+void seq_printf(struct seq_file *m, const char *fmt, ...);
+void seq_putc(struct seq_file *m, char c);
+void seq_puts(struct seq_file *m, const char *s);
+void seq_put_decimal_ull(struct seq_file *m, char delimiter,
+                        unsigned long long num);
+void seq_put_decimal_ll(struct seq_file *m, char delimiter, long long num);
+void seq_escape(struct seq_file *m, const char *s, const char *esc);
+
+void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type,
+                 int rowsize, int groupsize, const void *buf, size_t len,
+                 bool ascii);
 
 int seq_path(struct seq_file *, const struct path *, const char *);
 int seq_file_path(struct seq_file *, struct file *, const char *);
@@ -134,10 +143,6 @@ int single_release(struct inode *, struct file *);
 void *__seq_open_private(struct file *, const struct seq_operations *, int);
 int seq_open_private(struct file *, const struct seq_operations *, int);
 int seq_release_private(struct inode *, struct file *);
-int seq_put_decimal_ull(struct seq_file *m, char delimiter,
-                       unsigned long long num);
-int seq_put_decimal_ll(struct seq_file *m, char delimiter,
-                       long long num);
 
 static inline struct user_namespace *seq_user_ns(struct seq_file *seq)
 {
index 71f711db450067fbcba192999c010b49dbfeb408..dabe643eb5fadcde8d00a02df9aba03faa9c2592 100644 (file)
@@ -48,24 +48,24 @@ static inline int string_unescape_any_inplace(char *buf)
 #define ESCAPE_HEX             0x20
 
 int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
-               unsigned int flags, const char *esc);
+               unsigned int flags, const char *only);
 
 static inline int string_escape_mem_any_np(const char *src, size_t isz,
-               char *dst, size_t osz, const char *esc)
+               char *dst, size_t osz, const char *only)
 {
-       return string_escape_mem(src, isz, dst, osz, ESCAPE_ANY_NP, esc);
+       return string_escape_mem(src, isz, dst, osz, ESCAPE_ANY_NP, only);
 }
 
 static inline int string_escape_str(const char *src, char *dst, size_t sz,
-               unsigned int flags, const char *esc)
+               unsigned int flags, const char *only)
 {
-       return string_escape_mem(src, strlen(src), dst, sz, flags, esc);
+       return string_escape_mem(src, strlen(src), dst, sz, flags, only);
 }
 
 static inline int string_escape_str_any_np(const char *src, char *dst,
-               size_t sz, const char *esc)
+               size_t sz, const char *only)
 {
-       return string_escape_str(src, dst, sz, ESCAPE_ANY_NP, esc);
+       return string_escape_str(src, dst, sz, ESCAPE_ANY_NP, only);
 }
 
 #endif
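
The rename from "esc" to "only" reflects what the argument does: it restricts which characters are eligible for escaping, rather than naming the escape characters themselves. A small sketch under that reading, with arbitrary strings and a made-up wrapper name:

/* Sketch only: escape just tabs and newlines, pass everything else through. */
static int demo_escape(const char *src, char *dst, size_t dst_size)
{
        return string_escape_str(src, dst, dst_size, ESCAPE_ANY_NP, "\t\n");
}
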
index 07d8e53bedfc4fbe998a231c91442104308e5bf4..5c9c6cd08d3b66f33afd21361b41d99cc56c888d 100644 (file)
@@ -46,8 +46,8 @@ static inline void rpc_set_port(struct sockaddr *sap,
 #define IPV6_SCOPE_DELIMITER           '%'
 #define IPV6_SCOPE_ID_LEN              sizeof("%nnnnnnnnnn")
 
-static inline bool __rpc_cmp_addr4(const struct sockaddr *sap1,
-                                  const struct sockaddr *sap2)
+static inline bool rpc_cmp_addr4(const struct sockaddr *sap1,
+                                const struct sockaddr *sap2)
 {
        const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sap1;
        const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sap2;
@@ -67,8 +67,8 @@ static inline bool __rpc_copy_addr4(struct sockaddr *dst,
 }
 
 #if IS_ENABLED(CONFIG_IPV6)
-static inline bool __rpc_cmp_addr6(const struct sockaddr *sap1,
-                                  const struct sockaddr *sap2)
+static inline bool rpc_cmp_addr6(const struct sockaddr *sap1,
+                                const struct sockaddr *sap2)
 {
        const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sap1;
        const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sap2;
@@ -93,7 +93,7 @@ static inline bool __rpc_copy_addr6(struct sockaddr *dst,
        return true;
 }
 #else  /* !(IS_ENABLED(CONFIG_IPV6) */
-static inline bool __rpc_cmp_addr6(const struct sockaddr *sap1,
+static inline bool rpc_cmp_addr6(const struct sockaddr *sap1,
                                   const struct sockaddr *sap2)
 {
        return false;
@@ -122,14 +122,27 @@ static inline bool rpc_cmp_addr(const struct sockaddr *sap1,
        if (sap1->sa_family == sap2->sa_family) {
                switch (sap1->sa_family) {
                case AF_INET:
-                       return __rpc_cmp_addr4(sap1, sap2);
+                       return rpc_cmp_addr4(sap1, sap2);
                case AF_INET6:
-                       return __rpc_cmp_addr6(sap1, sap2);
+                       return rpc_cmp_addr6(sap1, sap2);
                }
        }
        return false;
 }
 
+/**
+ * rpc_cmp_addr_port - compare the address and port number of two sockaddrs.
+ * @sap1: first sockaddr
+ * @sap2: second sockaddr
+ */
+static inline bool rpc_cmp_addr_port(const struct sockaddr *sap1,
+                                    const struct sockaddr *sap2)
+{
+       if (!rpc_cmp_addr(sap1, sap2))
+               return false;
+       return rpc_get_port(sap1) == rpc_get_port(sap2);
+}
+
 /**
  * rpc_copy_addr - copy the address portion of one sockaddr to another
  * @dst: destination sockaddr
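
A minimal sketch of the new port-aware comparison (the sockaddr pointers are assumed to come from the caller; rpc_cmp_addr() alone ignores the port):

	#include <linux/sunrpc/addr.h>

	/* Hypothetical check: does an incoming address match a known peer, port included? */
	static bool demo_same_endpoint(const struct sockaddr *peer,
				       const struct sockaddr *incoming)
	{
		return rpc_cmp_addr_port(peer, incoming);
	}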
index a7cbb570cc5c98cb76fe0a8733a1c791bca7419b..1ecf13e148b8be110f0175950e21e1f77833bbe9 100644 (file)
 #include <linux/atomic.h>
 #include <linux/rcupdate.h>
 #include <linux/uidgid.h>
+#include <linux/utsname.h>
 
-/* size of the nodename buffer */
-#define UNX_MAXNODENAME        32
+/*
+ * Size of the nodename buffer. RFC1831 specifies a hard limit of 255 bytes,
+ * but Linux hostnames are actually limited to __NEW_UTS_LEN bytes.
+ */
+#define UNX_MAXNODENAME        __NEW_UTS_LEN
 
 struct rpcsec_gss_info;
 
index d5ee6d8b7c5809f7d71e3966a2961d8d1b7ac021..7ccc961f33e933fc25eae43dd74fbd888920d2d7 100644 (file)
@@ -132,6 +132,7 @@ struct svcxprt_rdma {
        struct list_head     sc_accept_q;       /* Conn. waiting accept */
        int                  sc_ord;            /* RDMA read limit */
        int                  sc_max_sge;
+       int                  sc_max_sge_rd;     /* max sge for read target */
 
        int                  sc_sq_depth;       /* Depth of SQ */
        atomic_t             sc_sq_count;       /* Number of SQ WR on queue */
index b17613052cc3fd9d8827ede1944d3489bdd2285d..b7b279b545049c174bf10a46e16b013a04efa9dd 100644 (file)
@@ -49,7 +49,7 @@
  * a single chunk type per message is supported currently.
  */
 #define RPCRDMA_MIN_SLOT_TABLE (2U)
-#define RPCRDMA_DEF_SLOT_TABLE (32U)
+#define RPCRDMA_DEF_SLOT_TABLE (128U)
 #define RPCRDMA_MAX_SLOT_TABLE (256U)
 
 #define RPCRDMA_DEF_INLINE  (1024)     /* default inline max */
index 31496d201fdc0d966fd7ff6b6323219f6a1de760..7ba7dccaf0e7e1291b3489c9ece30318cb44fc6f 100644 (file)
@@ -351,7 +351,15 @@ extern void check_move_unevictable_pages(struct page **, int nr_pages);
 extern int kswapd_run(int nid);
 extern void kswapd_stop(int nid);
 #ifdef CONFIG_MEMCG
-extern int mem_cgroup_swappiness(struct mem_cgroup *mem);
+static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
+{
+       /* root ? */
+       if (mem_cgroup_disabled() || !memcg->css.parent)
+               return vm_swappiness;
+
+       return memcg->swappiness;
+}
+
 #else
 static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
 {
@@ -398,6 +406,9 @@ extern void free_pages_and_swap_cache(struct page **, int);
 extern struct page *lookup_swap_cache(swp_entry_t);
 extern struct page *read_swap_cache_async(swp_entry_t, gfp_t,
                        struct vm_area_struct *vma, unsigned long addr);
+extern struct page *__read_swap_cache_async(swp_entry_t, gfp_t,
+                       struct vm_area_struct *vma, unsigned long addr,
+                       bool *new_page_allocated);
 extern struct page *swapin_readahead(swp_entry_t, gfp_t,
                        struct vm_area_struct *vma, unsigned long addr);
 
@@ -431,6 +442,7 @@ extern unsigned int count_swap_pages(int, int);
 extern sector_t map_swap_page(struct page *, struct block_device **);
 extern sector_t swapdev_block(int, pgoff_t);
 extern int page_swapcount(struct page *);
+extern int swp_swapcount(swp_entry_t entry);
 extern struct swap_info_struct *page_swap_info(struct page *);
 extern int reuse_swap_page(struct page *);
 extern int try_to_free_swap(struct page *);
@@ -522,6 +534,11 @@ static inline int page_swapcount(struct page *page)
        return 0;
 }
 
+static inline int swp_swapcount(swp_entry_t entry)
+{
+       return 0;
+}
+
 #define reuse_swap_page(page)  (page_mapcount(page) == 1)
 
 static inline int try_to_free_swap(struct page *page)
index cedf3d3c373f167681424772c4fe2255e17b73fd..5c3a5f3e7eec66e43d255e1423866cb6e44aad54 100644 (file)
@@ -164,6 +164,9 @@ static inline int is_write_migration_entry(swp_entry_t entry)
 #endif
 
 #ifdef CONFIG_MEMORY_FAILURE
+
+extern atomic_long_t num_poisoned_pages __read_mostly;
+
 /*
  * Support for hardware poisoned pages
  */
@@ -177,6 +180,31 @@ static inline int is_hwpoison_entry(swp_entry_t entry)
 {
        return swp_type(entry) == SWP_HWPOISON;
 }
+
+static inline bool test_set_page_hwpoison(struct page *page)
+{
+       return TestSetPageHWPoison(page);
+}
+
+static inline void num_poisoned_pages_inc(void)
+{
+       atomic_long_inc(&num_poisoned_pages);
+}
+
+static inline void num_poisoned_pages_dec(void)
+{
+       atomic_long_dec(&num_poisoned_pages);
+}
+
+static inline void num_poisoned_pages_add(long num)
+{
+       atomic_long_add(num, &num_poisoned_pages);
+}
+
+static inline void num_poisoned_pages_sub(long num)
+{
+       atomic_long_sub(num, &num_poisoned_pages);
+}
 #else
 
 static inline swp_entry_t make_hwpoison_entry(struct page *page)
@@ -188,6 +216,15 @@ static inline int is_hwpoison_entry(swp_entry_t swp)
 {
        return 0;
 }
+
+static inline bool test_set_page_hwpoison(struct page *page)
+{
+       return false;
+}
+
+static inline void num_poisoned_pages_inc(void)
+{
+}
 #endif
 
 #if defined(CONFIG_MEMORY_FAILURE) || defined(CONFIG_MIGRATION)
index 08001317aee7376babb542aa9bb15936d38ef533..a460e2ef28437237d2b4bc2d09486b794290f21d 100644 (file)
@@ -885,4 +885,6 @@ asmlinkage long sys_execveat(int dfd, const char __user *filename,
                        const char __user *const __user *argv,
                        const char __user *const __user *envp, int flags);
 
+asmlinkage long sys_membarrier(int cmd, int flags);
+
 #endif
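
For reference, userspace reaches the newly wired syscall through syscall(2); a sketch, assuming the membarrier uapi header from the same series is installed (MEMBARRIER_CMD_QUERY / MEMBARRIER_CMD_SHARED):

	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/membarrier.h>

	static int membarrier(int cmd, int flags)
	{
		return syscall(__NR_membarrier, cmd, flags);
	}

	static void demo(void)
	{
		/* Query the supported commands, then issue a system-wide barrier. */
		int cmds = membarrier(MEMBARRIER_CMD_QUERY, 0);

		if (cmds >= 0 && (cmds & MEMBARRIER_CMD_SHARED))
			membarrier(MEMBARRIER_CMD_SHARED, 0);
	}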
index 037e9df2f6101bd70f2ad90e79e642599e56bc89..17292fee868659411f6c8797cb540f978921b034 100644 (file)
@@ -92,23 +92,19 @@ struct thermal_zone_device_ops {
                     struct thermal_cooling_device *);
        int (*unbind) (struct thermal_zone_device *,
                       struct thermal_cooling_device *);
-       int (*get_temp) (struct thermal_zone_device *, unsigned long *);
+       int (*get_temp) (struct thermal_zone_device *, int *);
        int (*get_mode) (struct thermal_zone_device *,
                         enum thermal_device_mode *);
        int (*set_mode) (struct thermal_zone_device *,
                enum thermal_device_mode);
        int (*get_trip_type) (struct thermal_zone_device *, int,
                enum thermal_trip_type *);
-       int (*get_trip_temp) (struct thermal_zone_device *, int,
-                             unsigned long *);
-       int (*set_trip_temp) (struct thermal_zone_device *, int,
-                             unsigned long);
-       int (*get_trip_hyst) (struct thermal_zone_device *, int,
-                             unsigned long *);
-       int (*set_trip_hyst) (struct thermal_zone_device *, int,
-                             unsigned long);
-       int (*get_crit_temp) (struct thermal_zone_device *, unsigned long *);
-       int (*set_emul_temp) (struct thermal_zone_device *, unsigned long);
+       int (*get_trip_temp) (struct thermal_zone_device *, int, int *);
+       int (*set_trip_temp) (struct thermal_zone_device *, int, int);
+       int (*get_trip_hyst) (struct thermal_zone_device *, int, int *);
+       int (*set_trip_hyst) (struct thermal_zone_device *, int, int);
+       int (*get_crit_temp) (struct thermal_zone_device *, int *);
+       int (*set_emul_temp) (struct thermal_zone_device *, int);
        int (*get_trend) (struct thermal_zone_device *, int,
                          enum thermal_trend *);
        int (*notify) (struct thermal_zone_device *, int,
@@ -332,9 +328,9 @@ struct thermal_genl_event {
  *                temperature.
  */
 struct thermal_zone_of_device_ops {
-       int (*get_temp)(void *, long *);
+       int (*get_temp)(void *, int *);
        int (*get_trend)(void *, long *);
-       int (*set_emul_temp)(void *, unsigned long);
+       int (*set_emul_temp)(void *, int);
 };
 
 /**
@@ -406,7 +402,7 @@ thermal_of_cooling_device_register(struct device_node *np, char *, void *,
                                   const struct thermal_cooling_device_ops *);
 void thermal_cooling_device_unregister(struct thermal_cooling_device *);
 struct thermal_zone_device *thermal_zone_get_zone_by_name(const char *name);
-int thermal_zone_get_temp(struct thermal_zone_device *tz, unsigned long *temp);
+int thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp);
 
 int get_tz_trend(struct thermal_zone_device *, int);
 struct thermal_instance *get_thermal_instance(struct thermal_zone_device *,
@@ -457,7 +453,7 @@ static inline struct thermal_zone_device *thermal_zone_get_zone_by_name(
                const char *name)
 { return ERR_PTR(-ENODEV); }
 static inline int thermal_zone_get_temp(
-               struct thermal_zone_device *tz, unsigned long *temp)
+               struct thermal_zone_device *tz, int *temp)
 { return -ENODEV; }
 static inline int get_tz_trend(struct thermal_zone_device *tz, int trip)
 { return -ENODEV; }
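
With temperatures now passed as plain int (millidegrees Celsius, so negative values are representable), a caller sketch looks roughly like this; the zone name is illustrative:

	#include <linux/err.h>
	#include <linux/printk.h>
	#include <linux/thermal.h>

	static void demo_read_temp(void)
	{
		struct thermal_zone_device *tz;
		int temp;	/* millidegrees Celsius, may be negative */

		tz = thermal_zone_get_zone_by_name("cpu-thermal");
		if (IS_ERR(tz))
			return;

		if (!thermal_zone_get_temp(tz, &temp))
			pr_info("cpu-thermal: %d mC\n", temp);
	}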
index 48d901f83f92e4cf78bc932d94142b4594d65d9d..e312219ff8230bb8a4b8fad60ed4fc0e40a6afd3 100644 (file)
@@ -147,11 +147,20 @@ static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask)
                cpumask_or(mask, mask, tick_nohz_full_mask);
 }
 
+static inline int housekeeping_any_cpu(void)
+{
+       return cpumask_any_and(housekeeping_mask, cpu_online_mask);
+}
+
 extern void tick_nohz_full_kick(void);
 extern void tick_nohz_full_kick_cpu(int cpu);
 extern void tick_nohz_full_kick_all(void);
 extern void __tick_nohz_task_switch(void);
 #else
+static inline int housekeeping_any_cpu(void)
+{
+       return smp_processor_id();
+}
 static inline bool tick_nohz_full_enabled(void) { return false; }
 static inline bool tick_nohz_full_cpu(int cpu) { return false; }
 static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask) { }
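
A short sketch of the new helper: code that must run deferred work somewhere, but should leave nohz_full CPUs undisturbed, can target a housekeeping CPU explicitly (the work item is hypothetical):

	#include <linux/tick.h>
	#include <linux/workqueue.h>

	/* Queue work on a CPU that is allowed to do housekeeping duties. */
	static void demo_queue(struct work_struct *work)
	{
		schedule_work_on(housekeeping_any_cpu(), work);
	}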
index ac34819214f9133c2800f79b3dff7d90562a1573..da2049b5161ce77f17eeff22eabb6c0743640433 100644 (file)
 #ifndef _LINUX_VERIFY_PEFILE_H
 #define _LINUX_VERIFY_PEFILE_H
 
+#include <crypto/public_key.h>
+
 extern int verify_pefile_signature(const void *pebuf, unsigned pelen,
-                                  struct key *trusted_keyring, bool *_trusted);
+                                  struct key *trusted_keyring,
+                                  enum key_being_used_for usage,
+                                  bool *_trusted);
 
 #endif /* _LINUX_VERIFY_PEFILE_H */
index f9d41a6e361f42f79a20bfd0c5b2f4579d61dcac..e183a0a65ac1cb176f60a4131cf5bf0419c3608b 100644 (file)
@@ -9,7 +9,7 @@ struct zbud_ops {
        int (*evict)(struct zbud_pool *pool, unsigned long handle);
 };
 
-struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops);
+struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops);
 void zbud_destroy_pool(struct zbud_pool *pool);
 int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp,
        unsigned long *handle);
index d30eff3d84d54282656e687313ca6db4ef8c8d90..42f8ec9924523aa1436ca72ef606ef6030e360c8 100644 (file)
@@ -36,8 +36,10 @@ enum zpool_mapmode {
        ZPOOL_MM_DEFAULT = ZPOOL_MM_RW
 };
 
+bool zpool_has_pool(char *type);
+
 struct zpool *zpool_create_pool(char *type, char *name,
-                       gfp_t gfp, struct zpool_ops *ops);
+                       gfp_t gfp, const struct zpool_ops *ops);
 
 char *zpool_get_type(struct zpool *pool);
 
@@ -81,7 +83,7 @@ struct zpool_driver {
        atomic_t refcount;
        struct list_head list;
 
-       void *(*create)(char *name, gfp_t gfp, struct zpool_ops *ops,
+       void *(*create)(char *name, gfp_t gfp, const struct zpool_ops *ops,
                        struct zpool *zpool);
        void (*destroy)(void *pool);
 
index 1338190b547838964c95d2cb646ea277320488f1..6398dfae53f103200a38b39c2a00cf89c4d12448 100644 (file)
@@ -34,6 +34,11 @@ enum zs_mapmode {
         */
 };
 
+struct zs_pool_stats {
+       /* How many pages were migrated (freed) */
+       unsigned long pages_compacted;
+};
+
 struct zs_pool;
 
 struct zs_pool *zs_create_pool(char *name, gfp_t flags);
@@ -49,4 +54,5 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle);
 unsigned long zs_get_total_pages(struct zs_pool *pool);
 unsigned long zs_compact(struct zs_pool *pool);
 
+void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats);
 #endif
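
The new stats hook could be consumed roughly like this (a sketch; the pool pointer is assumed to come from an earlier zs_create_pool() call):

	#include <linux/printk.h>
	#include <linux/zsmalloc.h>

	static void demo_report(struct zs_pool *pool)
	{
		struct zs_pool_stats stats;

		zs_compact(pool);		/* trigger compaction */
		zs_pool_stats(pool, &stats);	/* then read how many pages it freed */
		pr_info("zsmalloc: %lu pages compacted\n", stats.pages_compacted);
	}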
index 9f36641a67810f80a4387b2a706d61318841e2af..6513c7ec3116f1722f490fac8abf9d1f39d39ceb 100644 (file)
@@ -15,6 +15,7 @@
 #define _MEDIA_VIDEOBUF2_MEMOPS_H
 
 #include <media/videobuf2-core.h>
+#include <linux/mm.h>
 
 /**
  * struct vb2_vmarea_handler - common vma refcount tracking handler
@@ -31,11 +32,9 @@ struct vb2_vmarea_handler {
 
 extern const struct vm_operations_struct vb2_common_vm_ops;
 
-int vb2_get_contig_userptr(unsigned long vaddr, unsigned long size,
-                          struct vm_area_struct **res_vma, dma_addr_t *res_pa);
-
-struct vm_area_struct *vb2_get_vma(struct vm_area_struct *vma);
-void vb2_put_vma(struct vm_area_struct *vma);
-
+struct frame_vector *vb2_create_framevec(unsigned long start,
+                                        unsigned long length,
+                                        bool write);
+void vb2_destroy_framevec(struct frame_vector *vec);
 
 #endif
index 0c3ac5acb85f5d3ce0d4cc1520dbe399143975cd..b5474b1fcd8399e667e5f807ee99c1e6150f517a 100644 (file)
@@ -91,6 +91,37 @@ int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2);
 void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr);
 void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr);
 
+static inline int addrconf_ifid_eui48(u8 *eui, struct net_device *dev)
+{
+       if (dev->addr_len != ETH_ALEN)
+               return -1;
+       memcpy(eui, dev->dev_addr, 3);
+       memcpy(eui + 5, dev->dev_addr + 3, 3);
+
+       /*
+        * The zSeries OSA network cards can be shared among various
+        * OS instances, but the OSA cards have only one MAC address.
+        * This leads to duplicate address conflicts in conjunction
+        * with IPv6 if more than one instance uses the same card.
+        *
+        * The driver for these cards can deliver a unique 16-bit
+        * identifier for each instance sharing the same card.  It is
+        * placed instead of 0xFFFE in the interface identifier.  The
+        * "u" bit of the interface identifier is not inverted in this
+        * case.  Hence the resulting interface identifier has local
+        * scope according to RFC2373.
+        */
+       if (dev->dev_id) {
+               eui[3] = (dev->dev_id >> 8) & 0xFF;
+               eui[4] = dev->dev_id & 0xFF;
+       } else {
+               eui[3] = 0xFF;
+               eui[4] = 0xFE;
+               eui[0] ^= 2;
+       }
+       return 0;
+}
+
 static inline unsigned long addrconf_timeout_fixup(u32 timeout,
                                                   unsigned int unit)
 {
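
As a worked example of the EUI-48 to interface-identifier mapping above (values are illustrative): a MAC of 00:11:22:33:44:55 on a device with dev_id == 0 becomes 02:11:22:ff:fe:33:44:55, i.e. the ff:fe filler is inserted in bytes 3-4 and the universal/local bit of byte 0 is flipped. A small sketch of a caller:

	#include <linux/netdevice.h>
	#include <linux/printk.h>
	#include <net/addrconf.h>

	static void demo_ifid(struct net_device *dev)
	{
		u8 eui[8];

		/* Fails (returns -1) for devices without a 6-byte MAC. */
		if (!addrconf_ifid_eui48(eui, dev))
			pr_info("%s: interface id %8phC\n", dev->name, eui);
	}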
index 20defc0353d1353c02f1d778e3f53809cae6c0bb..c1740a2794a37bce9dc9d48739a766a93ff4f743 100644 (file)
@@ -310,6 +310,13 @@ static inline bool bond_uses_primary(struct bonding *bond)
        return bond_mode_uses_primary(BOND_MODE(bond));
 }
 
+static inline struct net_device *bond_option_active_slave_get_rcu(struct bonding *bond)
+{
+       struct slave *slave = rcu_dereference(bond->curr_active_slave);
+
+       return bond_uses_primary(bond) && slave ? slave->dev : NULL;
+}
+
 static inline bool bond_slave_is_up(struct slave *slave)
 {
        return netif_running(slave->dev) && netif_carrier_ok(slave->dev);
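
A sketch of the new accessor under RCU (the bonding pointer is assumed to come from netdev_priv() of a bond device):

	#include <linux/printk.h>
	#include <net/bonding.h>

	static void demo_active_slave(struct bonding *bond)
	{
		struct net_device *slave_dev;

		rcu_read_lock();
		/* NULL unless the bond mode uses a primary/active slave. */
		slave_dev = bond_option_active_slave_get_rcu(bond);
		if (slave_dev)
			pr_info("%s: active slave is %s\n",
				bond->dev->name, slave_dev->name);
		rcu_read_unlock();
	}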
index 4e8f804f45898aae228e7b9f60fdda9e3666271d..59160de702b68023c248181ab6c5fcb6e2f42452 100644 (file)
@@ -66,7 +66,6 @@ struct fib_rules_ops {
                                           struct nlattr **);
        int                     (*fill)(struct fib_rule *, struct sk_buff *,
                                        struct fib_rule_hdr *);
-       u32                     (*default_pref)(struct fib_rules_ops *ops);
        size_t                  (*nlmsg_payload)(struct fib_rule *);
 
        /* Called after modifications to the rules set, must flush
@@ -118,5 +117,4 @@ int fib_rules_lookup(struct fib_rules_ops *, struct flowi *, int flags,
                     struct fib_lookup_arg *);
 int fib_default_rule_add(struct fib_rules_ops *, u32 pref, u32 table,
                         u32 flags);
-u32 fib_default_rule_pref(struct fib_rules_ops *ops);
 #endif
index e3314e516681ed0733ec212d7464c393222428ef..bfc569498bfa793013766e6e16158d29bb84c9ef 100644 (file)
@@ -477,7 +477,9 @@ struct ieee80211_event {
  * @chandef: Channel definition for this BSS -- the hardware might be
  *     configured a higher bandwidth than this BSS uses, for example.
  * @ht_operation_mode: HT operation mode like in &struct ieee80211_ht_operation.
- *     This field is only valid when the channel type is one of the HT types.
+ *     This field is only valid when the channel is a wide HT/VHT channel.
+ *     Note that with TDLS this can be the case (channel is HT, protection must
+ *     be used from this field) even when the BSS association isn't using HT.
  * @cqm_rssi_thold: Connection quality monitor RSSI threshold, a zero value
  *     implies disabled
  * @cqm_rssi_hyst: Connection quality monitor RSSI hysteresis
index bab824bde92cabccb025a7de9030d033be1ffd7e..d4c6b5f30acd936d863b1a5a5e89ef2443ad3943 100644 (file)
@@ -59,7 +59,7 @@ static inline unsigned int
 br_nf_pre_routing_ipv6(const struct nf_hook_ops *ops, struct sk_buff *skb,
                       const struct nf_hook_state *state)
 {
-       return NF_DROP;
+       return NF_ACCEPT;
 }
 #endif
 
index f5e23c6dee8bcbcc66705a4d5cefdaef311eb98b..e8ad46834df87453e1335bccb7e80dda2ba9fbe5 100644 (file)
@@ -298,6 +298,7 @@ void init_nf_conntrack_hash_rnd(void);
 struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
                                 const struct nf_conntrack_zone *zone,
                                 gfp_t flags);
+void nf_ct_tmpl_free(struct nf_conn *tmpl);
 
 #define NF_CT_STAT_INC(net, count)       __this_cpu_inc((net)->ct.stat->count)
 #define NF_CT_STAT_INC_ATOMIC(net, count) this_cpu_inc((net)->ct.stat->count)
index 2a246680a6c38934485fbd7a540df04ad47a0253..aa8bee72c9d34288cd121ebd5ddd1e3ad7a7bf40 100644 (file)
@@ -125,7 +125,7 @@ static inline enum nft_data_types nft_dreg_to_type(enum nft_registers reg)
 
 static inline enum nft_registers nft_type_to_reg(enum nft_data_types type)
 {
-       return type == NFT_DATA_VERDICT ? NFT_REG_VERDICT : NFT_REG_1;
+       return type == NFT_DATA_VERDICT ? NFT_REG_VERDICT : NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE;
 }
 
 unsigned int nft_parse_register(const struct nlattr *attr);
index 43c6abcf06abc0a5bf0d56bb9235e6800a1fc111..7aa78440559a47db8e5ccc8ea69a34f87b90c125 100644 (file)
@@ -1042,42 +1042,9 @@ struct proto {
 #endif
 };
 
-/*
- * Bits in struct cg_proto.flags
- */
-enum cg_proto_flags {
-       /* Currently active and new sockets should be assigned to cgroups */
-       MEMCG_SOCK_ACTIVE,
-       /* It was ever activated; we must disarm static keys on destruction */
-       MEMCG_SOCK_ACTIVATED,
-};
-
-struct cg_proto {
-       struct page_counter     memory_allocated;       /* Current allocated memory. */
-       struct percpu_counter   sockets_allocated;      /* Current number of sockets. */
-       int                     memory_pressure;
-       long                    sysctl_mem[3];
-       unsigned long           flags;
-       /*
-        * memcg field is used to find which memcg we belong directly
-        * Each memcg struct can hold more than one cg_proto, so container_of
-        * won't really cut.
-        *
-        * The elegant solution would be having an inverse function to
-        * proto_cgroup in struct proto, but that means polluting the structure
-        * for everybody, instead of just for memcg users.
-        */
-       struct mem_cgroup       *memcg;
-};
-
 int proto_register(struct proto *prot, int alloc_slab);
 void proto_unregister(struct proto *prot);
 
-static inline bool memcg_proto_active(struct cg_proto *cg_proto)
-{
-       return test_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
-}
-
 #ifdef SOCK_REFCNT_DEBUG
 static inline void sk_refcnt_debug_inc(struct sock *sk)
 {
index 39ed2d2fbd51452216586b031a3e25d236099169..92a7d85917b4db3eeb73722b82c4e8335e489097 100644 (file)
@@ -105,14 +105,16 @@ enum ib_cm_data_size {
        IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE = 216,
        IB_CM_SIDR_REP_PRIVATE_DATA_SIZE = 136,
        IB_CM_SIDR_REP_INFO_LENGTH       = 72,
-       /* compare done u32 at a time */
-       IB_CM_COMPARE_SIZE               = (64 / sizeof(u32))
 };
 
 struct ib_cm_id;
 
 struct ib_cm_req_event_param {
        struct ib_cm_id         *listen_id;
+
+       /* P_Key that was used by the GMP's BTH header */
+       u16                     bth_pkey;
+
        u8                      port;
 
        struct ib_sa_path_rec   *primary_path;
@@ -223,6 +225,9 @@ struct ib_cm_apr_event_param {
 
 struct ib_cm_sidr_req_event_param {
        struct ib_cm_id         *listen_id;
+       __be64                  service_id;
+       /* P_Key that was used by the GMP's BTH header */
+       u16                     bth_pkey;
        u8                      port;
        u16                     pkey;
 };
@@ -337,11 +342,6 @@ void ib_destroy_cm_id(struct ib_cm_id *cm_id);
 #define IB_SDP_SERVICE_ID      cpu_to_be64(0x0000000000010000ULL)
 #define IB_SDP_SERVICE_ID_MASK cpu_to_be64(0xFFFFFFFFFFFF0000ULL)
 
-struct ib_cm_compare_data {
-       u32  data[IB_CM_COMPARE_SIZE];
-       u32  mask[IB_CM_COMPARE_SIZE];
-};
-
 /**
  * ib_cm_listen - Initiates listening on the specified service ID for
  *   connection and service ID resolution requests.
@@ -354,12 +354,13 @@ struct ib_cm_compare_data {
  *   range of service IDs.  If set to 0, the service ID is matched
  *   exactly.  This parameter is ignored if %service_id is set to
  *   IB_CM_ASSIGN_SERVICE_ID.
- * @compare_data: This parameter is optional.  It specifies data that must
- *   appear in the private data of a connection request for the specified
- *   listen request.
  */
-int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
-                struct ib_cm_compare_data *compare_data);
+int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id,
+                __be64 service_mask);
+
+struct ib_cm_id *ib_cm_insert_listen(struct ib_device *device,
+                                    ib_cm_handler cm_handler,
+                                    __be64 service_id);
 
 struct ib_cm_req_param {
        struct ib_sa_path_rec   *primary_path;
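
A sketch of listening with the simplified API (the compare_data argument is gone; a service_mask of 0 means an exact match), alongside the new ib_cm_insert_listen() entry point for shared listeners. The cm_id, device, handler and service ID are the caller's:

	#include <linux/err.h>
	#include <rdma/ib_cm.h>

	static int demo_listen(struct ib_cm_id *cm_id, struct ib_device *device,
			       ib_cm_handler handler, __be64 service_id)
	{
		struct ib_cm_id *shared;
		int ret;

		/* Exact-match listen on an existing cm_id. */
		ret = ib_cm_listen(cm_id, service_id, 0);
		if (ret)
			return ret;

		/* Or let the CM find/create a shared listen for this service ID. */
		shared = ib_cm_insert_listen(device, handler, service_id);
		return IS_ERR(shared) ? PTR_ERR(shared) : 0;
	}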
index c8422d5a5a91f2256bc59455e3d996b0bdc69d6c..188df91d58514a3248cab5b1b0d7ee09513295a6 100644 (file)
 #define IB_DEFAULT_PKEY_PARTIAL 0x7FFF
 #define IB_DEFAULT_PKEY_FULL   0xFFFF
 
+/*
+ * Generic trap/notice types
+ */
+#define IB_NOTICE_TYPE_FATAL   0x80
+#define IB_NOTICE_TYPE_URGENT  0x81
+#define IB_NOTICE_TYPE_SECURITY        0x82
+#define IB_NOTICE_TYPE_SM      0x83
+#define IB_NOTICE_TYPE_INFO    0x84
+
+/*
+ * Generic trap/notice producers
+ */
+#define IB_NOTICE_PROD_CA              cpu_to_be16(1)
+#define IB_NOTICE_PROD_SWITCH          cpu_to_be16(2)
+#define IB_NOTICE_PROD_ROUTER          cpu_to_be16(3)
+#define IB_NOTICE_PROD_CLASS_MGR       cpu_to_be16(4)
+
 enum {
        IB_MGMT_MAD_HDR = 24,
        IB_MGMT_MAD_DATA = 232,
@@ -240,6 +257,70 @@ struct ib_class_port_info {
        __be32                  trap_qkey;
 };
 
+struct ib_mad_notice_attr {
+       u8 generic_type;
+       u8 prod_type_msb;
+       __be16 prod_type_lsb;
+       __be16 trap_num;
+       __be16 issuer_lid;
+       __be16 toggle_count;
+
+       union {
+               struct {
+                       u8      details[54];
+               } raw_data;
+
+               struct {
+                       __be16  reserved;
+                       __be16  lid;            /* where violation happened */
+                       u8      port_num;       /* where violation happened */
+               } __packed ntc_129_131;
+
+               struct {
+                       __be16  reserved;
+                       __be16  lid;            /* LID where change occurred */
+                       u8      reserved2;
+                       u8      local_changes;  /* low bit - local changes */
+                       __be32  new_cap_mask;   /* new capability mask */
+                       u8      reserved3;
+                       u8      change_flags;   /* low 3 bits only */
+               } __packed ntc_144;
+
+               struct {
+                       __be16  reserved;
+                       __be16  lid;            /* lid where sys guid changed */
+                       __be16  reserved2;
+                       __be64  new_sys_guid;
+               } __packed ntc_145;
+
+               struct {
+                       __be16  reserved;
+                       __be16  lid;
+                       __be16  dr_slid;
+                       u8      method;
+                       u8      reserved2;
+                       __be16  attr_id;
+                       __be32  attr_mod;
+                       __be64  mkey;
+                       u8      reserved3;
+                       u8      dr_trunc_hop;
+                       u8      dr_rtn_path[30];
+               } __packed ntc_256;
+
+               struct {
+                       __be16          reserved;
+                       __be16          lid1;
+                       __be16          lid2;
+                       __be32          key;
+                       __be32          sl_qp1; /* SL: high 4 bits */
+                       __be32          qp2;    /* high 8 bits reserved */
+                       union ib_gid    gid1;
+                       union ib_gid    gid2;
+               } __packed ntc_257_258;
+
+       } details;
+};
+
 /**
  * ib_mad_send_buf - MAD data buffer and work request for sends.
  * @next: A pointer used to chain together MADs for posting.
@@ -388,7 +469,6 @@ enum {
 struct ib_mad_agent {
        struct ib_device        *device;
        struct ib_qp            *qp;
-       struct ib_mr            *mr;
        ib_mad_recv_handler     recv_handler;
        ib_mad_send_handler     send_handler;
        ib_mad_snoop_handler    snoop_handler;
index b1f7592e02e403d94285d6ff4090da865facc99f..709a5331e6b9d2ff04ba262d377b53f6819426c9 100644 (file)
@@ -76,6 +76,8 @@ enum {
        IB_OPCODE_UC                                = 0x20,
        IB_OPCODE_RD                                = 0x40,
        IB_OPCODE_UD                                = 0x60,
+       /* per IBTA 3.1 Table 38, A10.3.2 */
+       IB_OPCODE_CNP                               = 0x80,
 
        /* operations -- just used to define real constants */
        IB_OPCODE_SEND_FIRST                        = 0x00,
index 98b9086d769afcdae35a10ea8a85212fda653ce9..b439e988408e6ffc3abf454de0c88ddf5df6a3ec 100644 (file)
@@ -119,10 +119,57 @@ struct ib_port_info {
        u8 link_roundtrip_latency[3];
 };
 
+struct ib_node_info {
+       u8 base_version;
+       u8 class_version;
+       u8 node_type;
+       u8 num_ports;
+       __be64 sys_guid;
+       __be64 node_guid;
+       __be64 port_guid;
+       __be16 partition_cap;
+       __be16 device_id;
+       __be32 revision;
+       u8 local_port_num;
+       u8 vendor_id[3];
+} __packed;
+
+struct ib_vl_weight_elem {
+       u8      vl;     /* IB: VL is low 4 bits, upper 4 bits reserved */
+                        /* OPA: VL is low 5 bits, upper 3 bits reserved */
+       u8      weight;
+};
+
 static inline u8
 ib_get_smp_direction(struct ib_smp *smp)
 {
        return ((smp->status & IB_SMP_DIRECTION) == IB_SMP_DIRECTION);
 }
 
+/*
+ * SM Trap/Notice numbers
+ */
+#define IB_NOTICE_TRAP_LLI_THRESH      cpu_to_be16(129)
+#define IB_NOTICE_TRAP_EBO_THRESH      cpu_to_be16(130)
+#define IB_NOTICE_TRAP_FLOW_UPDATE     cpu_to_be16(131)
+#define IB_NOTICE_TRAP_CAP_MASK_CHG    cpu_to_be16(144)
+#define IB_NOTICE_TRAP_SYS_GUID_CHG    cpu_to_be16(145)
+#define IB_NOTICE_TRAP_BAD_MKEY                cpu_to_be16(256)
+#define IB_NOTICE_TRAP_BAD_PKEY                cpu_to_be16(257)
+#define IB_NOTICE_TRAP_BAD_QKEY                cpu_to_be16(258)
+
+/*
+ * Other local changes flags (trap 144).
+ */
+#define IB_NOTICE_TRAP_LSE_CHG         0x04    /* Link Speed Enable changed */
+#define IB_NOTICE_TRAP_LWE_CHG         0x02    /* Link Width Enable changed */
+#define IB_NOTICE_TRAP_NODE_DESC_CHG   0x01
+
+/*
+ * M_Key violation flags in dr_trunc_hop (trap 256).
+ */
+#define IB_NOTICE_TRAP_DR_NOTICE       0x80
+#define IB_NOTICE_TRAP_DR_TRUNC                0x40
+
+
 #endif /* IB_SMI_H */
index b0f898e3b2e733307100cd4cf80159bc88972b6d..7845fae6f2df1bd7c362174c104b6efbaaf62547 100644 (file)
@@ -48,6 +48,7 @@
 #include <linux/rwsem.h>
 #include <linux/scatterlist.h>
 #include <linux/workqueue.h>
+#include <linux/socket.h>
 #include <uapi/linux/if_ether.h>
 
 #include <linux/atomic.h>
@@ -64,6 +65,12 @@ union ib_gid {
        } global;
 };
 
+extern union ib_gid zgid;
+
+struct ib_gid_attr {
+       struct net_device       *ndev;
+};
+
 enum rdma_node_type {
        /* IB values map to NodeInfo:NodeType. */
        RDMA_NODE_IB_CA         = 1,
@@ -284,7 +291,7 @@ enum ib_port_cap_flags {
        IB_PORT_BOOT_MGMT_SUP                   = 1 << 23,
        IB_PORT_LINK_LATENCY_SUP                = 1 << 24,
        IB_PORT_CLIENT_REG_SUP                  = 1 << 25,
-       IB_PORT_IP_BASED_GIDS                   = 1 << 26
+       IB_PORT_IP_BASED_GIDS                   = 1 << 26,
 };
 
 enum ib_port_width {
@@ -556,20 +563,18 @@ __attribute_const__ int ib_rate_to_mult(enum ib_rate rate);
  */
 __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate);
 
-enum ib_mr_create_flags {
-       IB_MR_SIGNATURE_EN = 1,
-};
 
 /**
- * ib_mr_init_attr - Memory region init attributes passed to routine
- *     ib_create_mr.
- * @max_reg_descriptors: max number of registration descriptors that
- *     may be used with registration work requests.
- * @flags: MR creation flags bit mask.
+ * enum ib_mr_type - memory region type
+ * @IB_MR_TYPE_MEM_REG:       memory region that is used for
+ *                            normal registration
+ * @IB_MR_TYPE_SIGNATURE:     memory region that is used for
+ *                            signature operations (data-integrity
+ *                            capable regions)
  */
-struct ib_mr_init_attr {
-       int         max_reg_descriptors;
-       u32         flags;
+enum ib_mr_type {
+       IB_MR_TYPE_MEM_REG,
+       IB_MR_TYPE_SIGNATURE,
 };
 
 /**
@@ -1252,9 +1257,11 @@ struct ib_udata {
 };
 
 struct ib_pd {
+       u32                     local_dma_lkey;
        struct ib_device       *device;
        struct ib_uobject      *uobject;
        atomic_t                usecnt; /* count all resources */
+       struct ib_mr           *local_mr;
 };
 
 struct ib_xrcd {
@@ -1488,7 +1495,7 @@ struct ib_cache {
        rwlock_t                lock;
        struct ib_event_handler event_handler;
        struct ib_pkey_cache  **pkey_cache;
-       struct ib_gid_cache   **gid_cache;
+       struct ib_gid_table   **gid_cache;
        u8                     *lmc_cache;
 };
 
@@ -1550,6 +1557,8 @@ struct ib_device {
 
        spinlock_t                    client_data_lock;
        struct list_head              core_list;
+       /* Access to the client_data_list is protected by the client_data_lock
+        * spinlock and the lists_rwsem read-write semaphore */
        struct list_head              client_data_list;
 
        struct ib_cache               cache;
@@ -1572,9 +1581,47 @@ struct ib_device {
                                                 struct ib_port_attr *port_attr);
        enum rdma_link_layer       (*get_link_layer)(struct ib_device *device,
                                                     u8 port_num);
+       /* When calling get_netdev, the HW vendor's driver should return the
+        * net device of device @device at port @port_num or NULL if such
+        * a net device doesn't exist. The vendor driver should call dev_hold
+        * on this net device. The HW vendor's device driver must guarantee
+        * that this function returns NULL before the net device reaches
+        * NETDEV_UNREGISTER_FINAL state.
+        */
+       struct net_device         *(*get_netdev)(struct ib_device *device,
+                                                u8 port_num);
        int                        (*query_gid)(struct ib_device *device,
                                                u8 port_num, int index,
                                                union ib_gid *gid);
+       /* When calling add_gid, the HW vendor's driver should
+        * add the gid of device @device at gid index @index of
+        * port @port_num to be @gid. Meta-info of that gid (for example,
+        * the network device related to this gid is available
+        * at @attr. @context allows the HW vendor driver to store extra
+        * information together with a GID entry. The HW vendor may allocate
+        * memory to contain this information and store it in @context when a
+        * new GID entry is written to. Params are consistent until the next
+        * call of add_gid or delete_gid. The function should return 0 on
+        * success or error otherwise. The function could be called
+        * concurrently for different ports. This function is only called
+        * when roce_gid_table is used.
+        */
+       int                        (*add_gid)(struct ib_device *device,
+                                             u8 port_num,
+                                             unsigned int index,
+                                             const union ib_gid *gid,
+                                             const struct ib_gid_attr *attr,
+                                             void **context);
+       /* When calling del_gid, the HW vendor's driver should delete the
+        * gid of device @device at gid index @index of port @port_num.
+        * Upon the deletion of a GID entry, the HW vendor must free any
+        * allocated memory. The caller will clear @context afterwards.
+        * This function is only called when roce_gid_table is used.
+        */
+       int                        (*del_gid)(struct ib_device *device,
+                                             u8 port_num,
+                                             unsigned int index,
+                                             void **context);
        int                        (*query_pkey)(struct ib_device *device,
                                                 u8 port_num, u16 index, u16 *pkey);
        int                        (*modify_device)(struct ib_device *device,
@@ -1668,11 +1715,9 @@ struct ib_device {
        int                        (*query_mr)(struct ib_mr *mr,
                                               struct ib_mr_attr *mr_attr);
        int                        (*dereg_mr)(struct ib_mr *mr);
-       int                        (*destroy_mr)(struct ib_mr *mr);
-       struct ib_mr *             (*create_mr)(struct ib_pd *pd,
-                                               struct ib_mr_init_attr *mr_init_attr);
-       struct ib_mr *             (*alloc_fast_reg_mr)(struct ib_pd *pd,
-                                              int max_page_list_len);
+       struct ib_mr *             (*alloc_mr)(struct ib_pd *pd,
+                                              enum ib_mr_type mr_type,
+                                              u32 max_num_sg);
        struct ib_fast_reg_page_list * (*alloc_fast_reg_page_list)(struct ib_device *device,
                                                                   int page_list_len);
        void                       (*free_fast_reg_page_list)(struct ib_fast_reg_page_list *page_list);
@@ -1724,6 +1769,7 @@ struct ib_device {
        int                        (*destroy_flow)(struct ib_flow *flow_id);
        int                        (*check_mr_status)(struct ib_mr *mr, u32 check_mask,
                                                      struct ib_mr_status *mr_status);
+       void                       (*disassociate_ucontext)(struct ib_ucontext *ibcontext);
 
        struct ib_dma_mapping_ops   *dma_ops;
 
@@ -1761,8 +1807,30 @@ struct ib_device {
 struct ib_client {
        char  *name;
        void (*add)   (struct ib_device *);
-       void (*remove)(struct ib_device *);
-
+       void (*remove)(struct ib_device *, void *client_data);
+
+       /* Returns the net_dev belonging to this ib_client and matching the
+        * given parameters.
+        * @dev:         An RDMA device that the net_dev use for communication.
+        * @port:        A physical port number on the RDMA device.
+        * @pkey:        P_Key that the net_dev uses if applicable.
+        * @gid:         A GID that the net_dev uses to communicate.
+        * @addr:        An IP address the net_dev is configured with.
+        * @client_data: The device's client data set by ib_set_client_data().
+        *
+        * An ib_client that implements a net_dev on top of RDMA devices
+        * (such as IP over IB) should implement this callback, allowing the
+        * rdma_cm module to find the right net_dev for a given request.
+        *
+        * The caller is responsible for calling dev_put on the returned
+        * netdev. */
+       struct net_device *(*get_net_dev_by_params)(
+                       struct ib_device *dev,
+                       u8 port,
+                       u16 pkey,
+                       const union ib_gid *gid,
+                       const struct sockaddr *addr,
+                       void *client_data);
        struct list_head list;
 };
 
@@ -2070,34 +2138,6 @@ static inline bool rdma_cap_eth_ah(const struct ib_device *device, u8 port_num)
        return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_ETH_AH;
 }
 
-/**
- * rdma_cap_read_multi_sge - Check if the port of device has the capability
- * RDMA Read Multiple Scatter-Gather Entries.
- * @device: Device to check
- * @port_num: Port number to check
- *
- * iWARP has a restriction that RDMA READ requests may only have a single
- * Scatter/Gather Entry (SGE) in the work request.
- *
- * NOTE: although the linux kernel currently assumes all devices are either
- * single SGE RDMA READ devices or identical SGE maximums for RDMA READs and
- * WRITEs, according to Tom Talpey, this is not accurate.  There are some
- * devices out there that support more than a single SGE on RDMA READ
- * requests, but do not support the same number of SGEs as they do on
- * RDMA WRITE requests.  The linux kernel would need rearchitecting to
- * support these imbalanced READ/WRITE SGEs allowed devices.  So, for now,
- * suffice with either the device supports the same READ/WRITE SGEs, or
- * it only gets one READ sge.
- *
- * Return: true for any device that allows more than one SGE in RDMA READ
- * requests.
- */
-static inline bool rdma_cap_read_multi_sge(struct ib_device *device,
-                                          u8 port_num)
-{
-       return !(device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_IWARP);
-}
-
 /**
  * rdma_max_mad_size - Return the max MAD size required by this RDMA Port.
  *
@@ -2115,6 +2155,26 @@ static inline size_t rdma_max_mad_size(const struct ib_device *device, u8 port_n
        return device->port_immutable[port_num].max_mad_size;
 }
 
+/**
+ * rdma_cap_roce_gid_table - Check if the port of device uses roce_gid_table
+ * @device: Device to check
+ * @port_num: Port number to check
+ *
+ * RoCE GID table mechanism manages the various GIDs for a device.
+ *
+ * NOTE: if allocating the port's GID table has failed, this call will still
+ * return true, but any RoCE GID table API will fail.
+ *
+ * Return: true if the port uses RoCE GID table mechanism in order to manage
+ * its GIDs.
+ */
+static inline bool rdma_cap_roce_gid_table(const struct ib_device *device,
+                                          u8 port_num)
+{
+       return rdma_protocol_roce(device, port_num) &&
+               device->add_gid && device->del_gid;
+}
+
 int ib_query_gid(struct ib_device *device,
                 u8 port_num, int index, union ib_gid *gid);
 
@@ -2135,20 +2195,9 @@ int ib_find_gid(struct ib_device *device, union ib_gid *gid,
 int ib_find_pkey(struct ib_device *device,
                 u8 port_num, u16 pkey, u16 *index);
 
-/**
- * ib_alloc_pd - Allocates an unused protection domain.
- * @device: The device on which to allocate the protection domain.
- *
- * A protection domain object provides an association between QPs, shared
- * receive queues, address handles, memory regions, and memory windows.
- */
 struct ib_pd *ib_alloc_pd(struct ib_device *device);
 
-/**
- * ib_dealloc_pd - Deallocates a protection domain.
- * @pd: The protection domain to deallocate.
- */
-int ib_dealloc_pd(struct ib_pd *pd);
+void ib_dealloc_pd(struct ib_pd *pd);
 
 /**
  * ib_create_ah - Creates an address handle for the given address vector.
@@ -2759,52 +2808,6 @@ static inline void ib_dma_free_coherent(struct ib_device *dev,
                dma_free_coherent(dev->dma_device, size, cpu_addr, dma_handle);
 }
 
-/**
- * ib_reg_phys_mr - Prepares a virtually addressed memory region for use
- *   by an HCA.
- * @pd: The protection domain associated assigned to the registered region.
- * @phys_buf_array: Specifies a list of physical buffers to use in the
- *   memory region.
- * @num_phys_buf: Specifies the size of the phys_buf_array.
- * @mr_access_flags: Specifies the memory access rights.
- * @iova_start: The offset of the region's starting I/O virtual address.
- */
-struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd,
-                            struct ib_phys_buf *phys_buf_array,
-                            int num_phys_buf,
-                            int mr_access_flags,
-                            u64 *iova_start);
-
-/**
- * ib_rereg_phys_mr - Modifies the attributes of an existing memory region.
- *   Conceptually, this call performs the functions deregister memory region
- *   followed by register physical memory region.  Where possible,
- *   resources are reused instead of deallocated and reallocated.
- * @mr: The memory region to modify.
- * @mr_rereg_mask: A bit-mask used to indicate which of the following
- *   properties of the memory region are being modified.
- * @pd: If %IB_MR_REREG_PD is set in mr_rereg_mask, this field specifies
- *   the new protection domain to associated with the memory region,
- *   otherwise, this parameter is ignored.
- * @phys_buf_array: If %IB_MR_REREG_TRANS is set in mr_rereg_mask, this
- *   field specifies a list of physical buffers to use in the new
- *   translation, otherwise, this parameter is ignored.
- * @num_phys_buf: If %IB_MR_REREG_TRANS is set in mr_rereg_mask, this
- *   field specifies the size of the phys_buf_array, otherwise, this
- *   parameter is ignored.
- * @mr_access_flags: If %IB_MR_REREG_ACCESS is set in mr_rereg_mask, this
- *   field specifies the new memory access rights, otherwise, this
- *   parameter is ignored.
- * @iova_start: The offset of the region's starting I/O virtual address.
- */
-int ib_rereg_phys_mr(struct ib_mr *mr,
-                    int mr_rereg_mask,
-                    struct ib_pd *pd,
-                    struct ib_phys_buf *phys_buf_array,
-                    int num_phys_buf,
-                    int mr_access_flags,
-                    u64 *iova_start);
-
 /**
  * ib_query_mr - Retrieves information about a specific memory region.
  * @mr: The memory region to retrieve information about.
@@ -2821,33 +2824,9 @@ int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr);
  */
 int ib_dereg_mr(struct ib_mr *mr);
 
-
-/**
- * ib_create_mr - Allocates a memory region that may be used for
- *     signature handover operations.
- * @pd: The protection domain associated with the region.
- * @mr_init_attr: memory region init attributes.
- */
-struct ib_mr *ib_create_mr(struct ib_pd *pd,
-                          struct ib_mr_init_attr *mr_init_attr);
-
-/**
- * ib_destroy_mr - Destroys a memory region that was created using
- *     ib_create_mr and removes it from HW translation tables.
- * @mr: The memory region to destroy.
- *
- * This function can fail, if the memory region has memory windows bound to it.
- */
-int ib_destroy_mr(struct ib_mr *mr);
-
-/**
- * ib_alloc_fast_reg_mr - Allocates memory region usable with the
- *   IB_WR_FAST_REG_MR send work request.
- * @pd: The protection domain associated with the region.
- * @max_page_list_len: requested max physical buffer list length to be
- *   used with fast register work requests for this MR.
- */
-struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len);
+struct ib_mr *ib_alloc_mr(struct ib_pd *pd,
+                         enum ib_mr_type mr_type,
+                         u32 max_num_sg);
 
 /**
  * ib_alloc_fast_reg_page_list - Allocates a page list array
@@ -3040,4 +3019,8 @@ static inline int ib_check_mr_access(int flags)
 int ib_check_mr_status(struct ib_mr *mr, u32 check_mask,
                       struct ib_mr_status *mr_status);
 
+struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u8 port,
+                                           u16 pkey, const union ib_gid *gid,
+                                           const struct sockaddr *addr);
+
 #endif /* IB_VERBS_H */
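
With ib_create_mr() and ib_alloc_fast_reg_mr() folded into a single entry point, allocating a fast-registration MR becomes the following (a sketch; the pd and SG count are the caller's):

	#include <rdma/ib_verbs.h>

	static struct ib_mr *demo_alloc_frmr(struct ib_pd *pd, u32 max_num_sg)
	{
		/* IB_MR_TYPE_MEM_REG replaces the old alloc_fast_reg_mr() path;
		 * IB_MR_TYPE_SIGNATURE replaces create_mr() with IB_MR_SIGNATURE_EN. */
		return ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, max_num_sg);
	}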
diff --git a/include/rdma/opa_port_info.h b/include/rdma/opa_port_info.h
new file mode 100644 (file)
index 0000000..391dae1
--- /dev/null
@@ -0,0 +1,433 @@
+/*
+ * Copyright (c) 2014 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if !defined(OPA_PORT_INFO_H)
+#define OPA_PORT_INFO_H
+
+/* Temporary until HFI driver is updated */
+#ifndef USE_PI_LED_ENABLE
+#define USE_PI_LED_ENABLE 0
+#endif
+
+#define OPA_PORT_LINK_MODE_NOP 0               /* No change */
+#define OPA_PORT_LINK_MODE_OPA 4               /* Port mode is OPA */
+
+#define OPA_PORT_PACKET_FORMAT_NOP     0               /* No change */
+#define OPA_PORT_PACKET_FORMAT_8B      1               /* Format 8B */
+#define OPA_PORT_PACKET_FORMAT_9B      2               /* Format 9B */
+#define OPA_PORT_PACKET_FORMAT_10B     4               /* Format 10B */
+#define OPA_PORT_PACKET_FORMAT_16B     8               /* Format 16B */
+
+#define OPA_PORT_LTP_CRC_MODE_NONE     0       /* No change */
+#define OPA_PORT_LTP_CRC_MODE_14       1       /* 14-bit LTP CRC mode (optional) */
+#define OPA_PORT_LTP_CRC_MODE_16       2       /* 16-bit LTP CRC mode */
+#define OPA_PORT_LTP_CRC_MODE_48       4       /* 48-bit LTP CRC mode (optional) */
+#define OPA_PORT_LTP_CRC_MODE_PER_LANE  8      /* 12/16-bit per lane LTP CRC mode */
+
+/* Link Down / Neighbor Link Down Reason; indicated as follows: */
+#define OPA_LINKDOWN_REASON_NONE                               0       /* No specified reason */
+#define OPA_LINKDOWN_REASON_RCV_ERROR_0                                1
+#define OPA_LINKDOWN_REASON_BAD_PKT_LEN                                2
+#define OPA_LINKDOWN_REASON_PKT_TOO_LONG                       3
+#define OPA_LINKDOWN_REASON_PKT_TOO_SHORT                      4
+#define OPA_LINKDOWN_REASON_BAD_SLID                           5
+#define OPA_LINKDOWN_REASON_BAD_DLID                           6
+#define OPA_LINKDOWN_REASON_BAD_L2                             7
+#define OPA_LINKDOWN_REASON_BAD_SC                             8
+#define OPA_LINKDOWN_REASON_RCV_ERROR_8                                9
+#define OPA_LINKDOWN_REASON_BAD_MID_TAIL                       10
+#define OPA_LINKDOWN_REASON_RCV_ERROR_10                       11
+#define OPA_LINKDOWN_REASON_PREEMPT_ERROR                      12
+#define OPA_LINKDOWN_REASON_PREEMPT_VL15                       13
+#define OPA_LINKDOWN_REASON_BAD_VL_MARKER                      14
+#define OPA_LINKDOWN_REASON_RCV_ERROR_14                       15
+#define OPA_LINKDOWN_REASON_RCV_ERROR_15                       16
+#define OPA_LINKDOWN_REASON_BAD_HEAD_DIST                      17
+#define OPA_LINKDOWN_REASON_BAD_TAIL_DIST                      18
+#define OPA_LINKDOWN_REASON_BAD_CTRL_DIST                      19
+#define OPA_LINKDOWN_REASON_BAD_CREDIT_ACK                     20
+#define OPA_LINKDOWN_REASON_UNSUPPORTED_VL_MARKER              21
+#define OPA_LINKDOWN_REASON_BAD_PREEMPT                                22
+#define OPA_LINKDOWN_REASON_BAD_CONTROL_FLIT                   23
+#define OPA_LINKDOWN_REASON_EXCEED_MULTICAST_LIMIT             24
+#define OPA_LINKDOWN_REASON_RCV_ERROR_24                       25
+#define OPA_LINKDOWN_REASON_RCV_ERROR_25                       26
+#define OPA_LINKDOWN_REASON_RCV_ERROR_26                       27
+#define OPA_LINKDOWN_REASON_RCV_ERROR_27                       28
+#define OPA_LINKDOWN_REASON_RCV_ERROR_28                       29
+#define OPA_LINKDOWN_REASON_RCV_ERROR_29                       30
+#define OPA_LINKDOWN_REASON_RCV_ERROR_30                       31
+#define OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN           32
+#define OPA_LINKDOWN_REASON_UNKNOWN                            33
+/* 34 -reserved */
+#define OPA_LINKDOWN_REASON_REBOOT                             35
+#define OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN                   36
+/* 37-38 reserved */
+#define OPA_LINKDOWN_REASON_FM_BOUNCE                          39
+#define OPA_LINKDOWN_REASON_SPEED_POLICY                       40
+#define OPA_LINKDOWN_REASON_WIDTH_POLICY                       41
+/* 42-48 reserved */
+#define OPA_LINKDOWN_REASON_DISCONNECTED                       49
+#define OPA_LINKDOWN_REASONLOCAL_MEDIA_NOT_INSTALLED           50
+#define OPA_LINKDOWN_REASON_NOT_INSTALLED                      51
+#define OPA_LINKDOWN_REASON_CHASSIS_CONFIG                     52
+/* 53 reserved */
+#define OPA_LINKDOWN_REASON_END_TO_END_NOT_INSTALLED           54
+/* 55 reserved */
+#define OPA_LINKDOWN_REASON_POWER_POLICY                       56
+#define OPA_LINKDOWN_REASON_LINKSPEED_POLICY                   57
+#define OPA_LINKDOWN_REASON_LINKWIDTH_POLICY                   58
+/* 59 reserved */
+#define OPA_LINKDOWN_REASON_SWITCH_MGMT                                60
+#define OPA_LINKDOWN_REASON_SMA_DISABLED                       61
+/* 62 reserved */
+#define OPA_LINKDOWN_REASON_TRANSIENT                          63
+/* 64-255 reserved */
+
+/* OPA Link Init reason; indicated as follows: */
+/* 3-7; 11-15 reserved; 8-15 cleared on Polling->LinkUp */
+#define OPA_LINKINIT_REASON_NOP                 0
+#define OPA_LINKINIT_REASON_LINKUP              (1 << 4)
+#define OPA_LINKINIT_REASON_FLAPPING            (2 << 4)
+#define OPA_LINKINIT_REASON_CLEAR               (8 << 4)
+#define OPA_LINKINIT_OUTSIDE_POLICY             (8 << 4)
+#define OPA_LINKINIT_QUARANTINED                (9 << 4)
+#define OPA_LINKINIT_INSUFIC_CAPABILITY         (10 << 4)
+
+#define OPA_LINK_SPEED_NOP              0x0000  /*  Reserved (1-5 Gbps) */
+#define OPA_LINK_SPEED_12_5G            0x0001  /*  12.5 Gbps */
+#define OPA_LINK_SPEED_25G              0x0002  /*  25.78125?  Gbps (EDR) */
+
+#define OPA_LINK_WIDTH_1X            0x0001
+#define OPA_LINK_WIDTH_2X            0x0002
+#define OPA_LINK_WIDTH_3X            0x0004
+#define OPA_LINK_WIDTH_4X            0x0008
+
+#define OPA_CAP_MASK3_IsSnoopSupported            (1 << 7)
+#define OPA_CAP_MASK3_IsAsyncSC2VLSupported       (1 << 6)
+#define OPA_CAP_MASK3_IsAddrRangeConfigSupported  (1 << 5)
+#define OPA_CAP_MASK3_IsPassThroughSupported      (1 << 4)
+#define OPA_CAP_MASK3_IsSharedSpaceSupported      (1 << 3)
+/* reserved (1 << 2) */
+#define OPA_CAP_MASK3_IsVLMarkerSupported         (1 << 1)
+#define OPA_CAP_MASK3_IsVLrSupported              (1 << 0)
+
+/**
+ * new MTU values
+ */
+enum {
+       OPA_MTU_8192  = 6,
+       OPA_MTU_10240 = 7,
+};
+
+enum {
+       OPA_PORT_PHYS_CONF_DISCONNECTED = 0,
+       OPA_PORT_PHYS_CONF_STANDARD     = 1,
+       OPA_PORT_PHYS_CONF_FIXED        = 2,
+       OPA_PORT_PHYS_CONF_VARIABLE     = 3,
+       OPA_PORT_PHYS_CONF_SI_PHOTO     = 4
+};
+
+enum port_info_field_masks {
+       /* vl.cap */
+       OPA_PI_MASK_VL_CAP                        = 0x1F,
+       /* port_states.ledenable_offlinereason */
+       OPA_PI_MASK_OFFLINE_REASON                = 0x0F,
+       OPA_PI_MASK_LED_ENABLE                    = 0x40,
+       /* port_states.unsleepstate_downdefstate */
+       OPA_PI_MASK_UNSLEEP_STATE                 = 0xF0,
+       OPA_PI_MASK_DOWNDEF_STATE                 = 0x0F,
+       /* port_states.portphysstate_portstate */
+       OPA_PI_MASK_PORT_PHYSICAL_STATE           = 0xF0,
+       OPA_PI_MASK_PORT_STATE                    = 0x0F,
+       /* port_phys_conf */
+       OPA_PI_MASK_PORT_PHYSICAL_CONF            = 0x0F,
+       /* collectivemask_multicastmask */
+       OPA_PI_MASK_COLLECT_MASK                  = 0x38,
+       OPA_PI_MASK_MULTICAST_MASK                = 0x07,
+       /* mkeyprotect_lmc */
+       OPA_PI_MASK_MKEY_PROT_BIT                 = 0xC0,
+       OPA_PI_MASK_LMC                           = 0x0F,
+       /* smsl */
+       OPA_PI_MASK_SMSL                          = 0x1F,
+       /* partenforce_filterraw */
+       /* Filter Raw In/Out bits 1 and 2 were removed */
+       OPA_PI_MASK_LINKINIT_REASON               = 0xF0,
+       OPA_PI_MASK_PARTITION_ENFORCE_IN          = 0x08,
+       OPA_PI_MASK_PARTITION_ENFORCE_OUT         = 0x04,
+       /* operational_vls */
+       OPA_PI_MASK_OPERATIONAL_VL                = 0x1F,
+       /* sa_qp */
+       OPA_PI_MASK_SA_QP                         = 0x00FFFFFF,
+       /* sm_trap_qp */
+       OPA_PI_MASK_SM_TRAP_QP                    = 0x00FFFFFF,
+       /* localphy_overrun_errors */
+       OPA_PI_MASK_LOCAL_PHY_ERRORS              = 0xF0,
+       OPA_PI_MASK_OVERRUN_ERRORS                = 0x0F,
+       /* clientrereg_subnettimeout */
+       OPA_PI_MASK_CLIENT_REREGISTER             = 0x80,
+       OPA_PI_MASK_SUBNET_TIMEOUT                = 0x1F,
+       /* port_link_mode */
+       OPA_PI_MASK_PORT_LINK_SUPPORTED           = (0x001F << 10),
+       OPA_PI_MASK_PORT_LINK_ENABLED             = (0x001F <<  5),
+       OPA_PI_MASK_PORT_LINK_ACTIVE              = (0x001F <<  0),
+       /* port_link_crc_mode */
+       OPA_PI_MASK_PORT_LINK_CRC_SUPPORTED       = 0x0F00,
+       OPA_PI_MASK_PORT_LINK_CRC_ENABLED         = 0x00F0,
+       OPA_PI_MASK_PORT_LINK_CRC_ACTIVE          = 0x000F,
+       /* port_mode */
+       OPA_PI_MASK_PORT_MODE_SECURITY_CHECK      = 0x0001,
+       OPA_PI_MASK_PORT_MODE_16B_TRAP_QUERY      = 0x0002,
+       OPA_PI_MASK_PORT_MODE_PKEY_CONVERT        = 0x0004,
+       OPA_PI_MASK_PORT_MODE_SC2SC_MAPPING       = 0x0008,
+       OPA_PI_MASK_PORT_MODE_VL_MARKER           = 0x0010,
+       OPA_PI_MASK_PORT_PASS_THROUGH             = 0x0020,
+       OPA_PI_MASK_PORT_ACTIVE_OPTOMIZE          = 0x0040,
+       /* flit_control.interleave */
+       OPA_PI_MASK_INTERLEAVE_DIST_SUP           = (0x0003 << 12),
+       OPA_PI_MASK_INTERLEAVE_DIST_ENABLE        = (0x0003 << 10),
+       OPA_PI_MASK_INTERLEAVE_MAX_NEST_TX        = (0x001F <<  5),
+       OPA_PI_MASK_INTERLEAVE_MAX_NEST_RX        = (0x001F <<  0),
+
+       /* port_error_action */
+       OPA_PI_MASK_EX_BUFFER_OVERRUN                  = 0x80000000,
+               /* 7 bits reserved */
+       OPA_PI_MASK_FM_CFG_ERR_EXCEED_MULTICAST_LIMIT  = 0x00800000,
+       OPA_PI_MASK_FM_CFG_BAD_CONTROL_FLIT            = 0x00400000,
+       OPA_PI_MASK_FM_CFG_BAD_PREEMPT                 = 0x00200000,
+       OPA_PI_MASK_FM_CFG_UNSUPPORTED_VL_MARKER       = 0x00100000,
+       OPA_PI_MASK_FM_CFG_BAD_CRDT_ACK                = 0x00080000,
+       OPA_PI_MASK_FM_CFG_BAD_CTRL_DIST               = 0x00040000,
+       OPA_PI_MASK_FM_CFG_BAD_TAIL_DIST               = 0x00020000,
+       OPA_PI_MASK_FM_CFG_BAD_HEAD_DIST               = 0x00010000,
+               /* 2 bits reserved */
+       OPA_PI_MASK_PORT_RCV_BAD_VL_MARKER             = 0x00002000,
+       OPA_PI_MASK_PORT_RCV_PREEMPT_VL15              = 0x00001000,
+       OPA_PI_MASK_PORT_RCV_PREEMPT_ERROR             = 0x00000800,
+               /* 1 bit reserved */
+       OPA_PI_MASK_PORT_RCV_BAD_MidTail               = 0x00000200,
+               /* 1 bit reserved */
+       OPA_PI_MASK_PORT_RCV_BAD_SC                    = 0x00000080,
+       OPA_PI_MASK_PORT_RCV_BAD_L2                    = 0x00000040,
+       OPA_PI_MASK_PORT_RCV_BAD_DLID                  = 0x00000020,
+       OPA_PI_MASK_PORT_RCV_BAD_SLID                  = 0x00000010,
+       OPA_PI_MASK_PORT_RCV_PKTLEN_TOOSHORT           = 0x00000008,
+       OPA_PI_MASK_PORT_RCV_PKTLEN_TOOLONG            = 0x00000004,
+       OPA_PI_MASK_PORT_RCV_BAD_PKTLEN                = 0x00000002,
+       OPA_PI_MASK_PORT_RCV_BAD_LT                    = 0x00000001,
+
+       /* pass_through.res_drctl */
+       OPA_PI_MASK_PASS_THROUGH_DR_CONTROL       = 0x01,
+
+       /* buffer_units */
+       OPA_PI_MASK_BUF_UNIT_VL15_INIT            = (0x00000FFF  << 11),
+       OPA_PI_MASK_BUF_UNIT_VL15_CREDIT_RATE     = (0x0000001F  <<  6),
+       OPA_PI_MASK_BUF_UNIT_CREDIT_ACK           = (0x00000003  <<  3),
+       OPA_PI_MASK_BUF_UNIT_BUF_ALLOC            = (0x00000003  <<  0),
+
+       /* neigh_mtu.pvlx_to_mtu */
+       OPA_PI_MASK_NEIGH_MTU_PVL0                = 0xF0,
+       OPA_PI_MASK_NEIGH_MTU_PVL1                = 0x0F,
+
+       /* neigh_mtu.vlstall_hoq_life */
+       OPA_PI_MASK_VL_STALL                      = (0x03 << 5),
+       OPA_PI_MASK_HOQ_LIFE                      = (0x1F << 0),
+
+       /* port_neigh_mode */
+       OPA_PI_MASK_NEIGH_MGMT_ALLOWED            = (0x01 << 3),
+       OPA_PI_MASK_NEIGH_FW_AUTH_BYPASS          = (0x01 << 2),
+       OPA_PI_MASK_NEIGH_NODE_TYPE               = (0x03 << 0),
+
+       /* resptime_value */
+       OPA_PI_MASK_RESPONSE_TIME_VALUE           = 0x1F,
+
+       /* mtucap */
+       OPA_PI_MASK_MTU_CAP                       = 0x0F,
+};
+
+#if USE_PI_LED_ENABLE
+struct opa_port_states {
+       u8     reserved;
+       u8     ledenable_offlinereason;   /* 1 res, 1 bit, 6 bits */
+       u8     reserved2;
+       u8     portphysstate_portstate;   /* 4 bits, 4 bits */
+};
+#define PI_LED_ENABLE_SUP 1
+#else
+struct opa_port_states {
+       u8     reserved;
+       u8     offline_reason;            /* 2 res, 6 bits */
+       u8     reserved2;
+       u8     portphysstate_portstate;   /* 4 bits, 4 bits */
+};
+#define PI_LED_ENABLE_SUP 0
+#endif
+
+struct opa_port_state_info {
+       struct opa_port_states port_states;
+       u16 link_width_downgrade_tx_active;
+       u16 link_width_downgrade_rx_active;
+};
+
+struct opa_port_info {
+       __be32 lid;
+       __be32 flow_control_mask;
+
+       struct {
+               u8     res;                       /* was inittype */
+               u8     cap;                       /* 3 res, 5 bits */
+               __be16 high_limit;
+               __be16 preempt_limit;
+               u8     arb_high_cap;
+               u8     arb_low_cap;
+       } vl;
+
+       struct opa_port_states  port_states;
+       u8     port_phys_conf;                    /* 4 res, 4 bits */
+       u8     collectivemask_multicastmask;      /* 2 res, 3, 3 */
+       u8     mkeyprotect_lmc;                   /* 2 bits, 2 res, 4 bits */
+       u8     smsl;                              /* 3 res, 5 bits */
+
+       u8     partenforce_filterraw;             /* bit fields */
+       u8     operational_vls;                    /* 3 res, 5 bits */
+       __be16 pkey_8b;
+       __be16 pkey_10b;
+       __be16 mkey_violations;
+
+       __be16 pkey_violations;
+       __be16 qkey_violations;
+       __be32 sm_trap_qp;                        /* 8 bits, 24 bits */
+
+       __be32 sa_qp;                             /* 8 bits, 24 bits */
+       u8     neigh_port_num;
+       u8     link_down_reason;
+       u8     neigh_link_down_reason;
+       u8     clientrereg_subnettimeout;         /* 1 bit, 2 bits, 5 */
+
+       struct {
+               __be16 supported;
+               __be16 enabled;
+               __be16 active;
+       } link_speed;
+       struct {
+               __be16 supported;
+               __be16 enabled;
+               __be16 active;
+       } link_width;
+       struct {
+               __be16 supported;
+               __be16 enabled;
+               __be16 tx_active;
+               __be16 rx_active;
+       } link_width_downgrade;
+       __be16 port_link_mode;                  /* 1 res, 5 bits, 5 bits, 5 bits */
+       __be16 port_ltp_crc_mode;               /* 4 res, 4 bits, 4 bits, 4 bits */
+
+       __be16 port_mode;                       /* 9 res, bit fields */
+       struct {
+               __be16 supported;
+               __be16 enabled;
+       } port_packet_format;
+       struct {
+               __be16 interleave;  /* 2 res, 2,2,5,5 */
+               struct {
+                       __be16 min_initial;
+                       __be16 min_tail;
+                       u8     large_pkt_limit;
+                       u8     small_pkt_limit;
+                       u8     max_small_pkt_limit;
+                       u8     preemption_limit;
+               } preemption;
+       } flit_control;
+
+       __be32 reserved4;
+       __be32 port_error_action; /* bit field */
+
+       struct {
+               u8 egress_port;
+               u8 res_drctl;                    /* 7 res, 1 */
+       } pass_through;
+       __be16 mkey_lease_period;
+       __be32 buffer_units;                     /* 9 res, 12, 5, 3, 3 */
+
+       __be32 reserved5;
+       __be32 sm_lid;
+
+       __be64 mkey;
+
+       __be64 subnet_prefix;
+
+       struct {
+               u8 pvlx_to_mtu[OPA_MAX_VLS/2]; /* 4 bits, 4 bits */
+       } neigh_mtu;
+
+       struct {
+               u8 vlstall_hoqlife;             /* 3 bits, 5 bits */
+       } xmit_q[OPA_MAX_VLS];
+
+       struct {
+               u8 addr[16];
+       } ipaddr_ipv6;
+
+       struct {
+               u8 addr[4];
+       } ipaddr_ipv4;
+
+       u32    reserved6;
+       u32    reserved7;
+       u32    reserved8;
+
+       __be64 neigh_node_guid;
+
+       __be32 ib_cap_mask;
+       __be16 reserved9;                    /* was ib_cap_mask2 */
+       __be16 opa_cap_mask;
+
+       __be32 reserved10;                   /* was link_roundtrip_latency */
+       __be16 overall_buffer_space;
+       __be16 reserved11;                   /* was max_credit_hint */
+
+       __be16 diag_code;
+       struct {
+               u8 buffer;
+               u8 wire;
+       } replay_depth;
+       u8     port_neigh_mode;
+       u8     mtucap;                          /* 4 res, 4 bits */
+
+       u8     resptimevalue;                   /* 3 res, 5 bits */
+       u8     local_port_num;
+       u8     reserved12;
+       u8     reserved13;                       /* was guid_cap */
+} __attribute__ ((packed));
+
+#endif /* OPA_PORT_INFO_H */
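
The OPA_PI_MASK_* values above describe how the packed bytes in opa_port_info and opa_port_states split into fields; for instance, portphysstate_portstate carries the physical state in its high nibble and the logical port state in its low nibble. A minimal decoding sketch (the helper names are illustrative, not part of this header):

    /* Illustrative helpers only: split the packed portphysstate_portstate
     * byte using the masks defined in the header above. */
    static inline u8 opa_port_state(const struct opa_port_states *ps)
    {
            return ps->portphysstate_portstate & OPA_PI_MASK_PORT_STATE;
    }

    static inline u8 opa_port_phys_state(const struct opa_port_states *ps)
    {
            return (ps->portphysstate_portstate &
                    OPA_PI_MASK_PORT_PHYSICAL_STATE) >> 4;
    }
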
index 29063e84c253e61552e5dc62b927dc179590cd60..4a529ef479951b1a8726f5f52fcc82216a6e514e 100644 (file)
 #define OPA_SMP_DR_DATA_SIZE                   1872
 #define OPA_SMP_MAX_PATH_HOPS                  64
 
+#define OPA_MAX_VLS                            32
+#define OPA_MAX_SLS                            32
+#define OPA_MAX_SCS                            32
+
 #define OPA_SMI_CLASS_VERSION                  0x80
 
 #define OPA_LID_PERMISSIVE                     cpu_to_be32(0xFFFFFFFF)
@@ -73,6 +77,49 @@ struct opa_smp {
 } __packed;
 
 
+/* Subnet management attributes */
+/* ... */
+#define OPA_ATTRIB_ID_NODE_DESCRIPTION         cpu_to_be16(0x0010)
+#define OPA_ATTRIB_ID_NODE_INFO                        cpu_to_be16(0x0011)
+#define OPA_ATTRIB_ID_PORT_INFO                        cpu_to_be16(0x0015)
+#define OPA_ATTRIB_ID_PARTITION_TABLE          cpu_to_be16(0x0016)
+#define OPA_ATTRIB_ID_SL_TO_SC_MAP             cpu_to_be16(0x0017)
+#define OPA_ATTRIB_ID_VL_ARBITRATION           cpu_to_be16(0x0018)
+#define OPA_ATTRIB_ID_SM_INFO                  cpu_to_be16(0x0020)
+#define OPA_ATTRIB_ID_CABLE_INFO               cpu_to_be16(0x0032)
+#define OPA_ATTRIB_ID_AGGREGATE                        cpu_to_be16(0x0080)
+#define OPA_ATTRIB_ID_SC_TO_SL_MAP             cpu_to_be16(0x0082)
+#define OPA_ATTRIB_ID_SC_TO_VLR_MAP            cpu_to_be16(0x0083)
+#define OPA_ATTRIB_ID_SC_TO_VLT_MAP            cpu_to_be16(0x0084)
+#define OPA_ATTRIB_ID_SC_TO_VLNT_MAP           cpu_to_be16(0x0085)
+/* ... */
+#define OPA_ATTRIB_ID_PORT_STATE_INFO          cpu_to_be16(0x0087)
+/* ... */
+#define OPA_ATTRIB_ID_BUFFER_CONTROL_TABLE     cpu_to_be16(0x008A)
+/* ... */
+
+struct opa_node_description {
+       u8 data[64];
+} __attribute__ ((packed));
+
+struct opa_node_info {
+       u8      base_version;
+       u8      class_version;
+       u8      node_type;
+       u8      num_ports;
+       __be32  reserved;
+       __be64  system_image_guid;
+       __be64  node_guid;
+       __be64  port_guid;
+       __be16  partition_cap;
+       __be16  device_id;
+       __be32  revision;
+       u8      local_port_num;
+       u8      vendor_id[3];   /* network byte order */
+} __attribute__ ((packed));
+
+#define OPA_PARTITION_TABLE_BLK_SIZE 32
+
 static inline u8
 opa_get_smp_direction(struct opa_smp *smp)
 {
index 0790882e0c9b32443a37c44a26b3357224245d82..5852661443290d52eb8a3e719716452d752b6eaa 100644 (file)
@@ -77,4 +77,11 @@ int ibnl_unicast(struct sk_buff *skb, struct nlmsghdr *nlh,
 int ibnl_multicast(struct sk_buff *skb, struct nlmsghdr *nlh,
                        unsigned int group, gfp_t flags);
 
+/**
+ * Check if there are any listeners to the netlink group
+ * @group: the netlink group ID
+ * Returns 0 if the group has listeners, or a negative value if it has none.
+ */
+int ibnl_chk_listeners(unsigned int group);
+
 #endif /* _RDMA_NETLINK_H */
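
A rough sketch of how a producer might pair the new ibnl_chk_listeners() with ibnl_multicast(), skipping message construction when the group is empty; the group constant and error code are illustrative choices, not mandated by this header:

    /* Illustrative caller fragment (kernel context assumed). */
    static int example_notify(struct sk_buff *skb, struct nlmsghdr *nlh)
    {
            if (ibnl_chk_listeners(RDMA_NL_GROUP_LS) < 0)
                    return -ESRCH;          /* nobody joined the group */

            return ibnl_multicast(skb, nlh, RDMA_NL_GROUP_LS, GFP_KERNEL);
    }
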
index 676b03b78e57371e0fb2cd41629b647e44c9b803..11571b2a831e3e7d223e7dcc17b36146d537e457 100644 (file)
@@ -61,4 +61,9 @@ static inline bool scsi_sense_valid(const struct scsi_sense_hdr *sshdr)
 extern bool scsi_normalize_sense(const u8 *sense_buffer, int sb_len,
                                 struct scsi_sense_hdr *sshdr);
 
+extern void scsi_build_sense_buffer(int desc, u8 *buf, u8 key, u8 asc, u8 ascq);
+int scsi_set_sense_information(u8 *buf, int buf_len, u64 info);
+extern const u8 * scsi_sense_desc_find(const u8 * sense_buffer, int sb_len,
+                                      int desc_type);
+
 #endif /* _SCSI_COMMON_H_ */
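
The sense helpers gathered here keep their existing signatures; as a usage sketch, filling a fixed-format buffer with the common ILLEGAL REQUEST / INVALID FIELD IN CDB combination (desc = 0 selects the fixed format):

    /* Illustrative wrapper, not part of this header. */
    static void example_fill_sense(u8 *sense)
    {
            /* Fixed-format (desc = 0) sense data:
             * ILLEGAL REQUEST / INVALID FIELD IN CDB. */
            scsi_build_sense_buffer(0, sense, ILLEGAL_REQUEST, 0x24, 0x00);
    }
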
index 50c2a363bc8fec46d60240c5af8fe4fd2754f9f7..fe89d7cd67b9d3cfdcacddc6e17bb9366de38c0f 100644 (file)
@@ -196,34 +196,13 @@ struct scsi_device {
        struct execute_work     ew; /* used to get process context on put */
        struct work_struct      requeue_work;
 
-       struct scsi_dh_data     *scsi_dh_data;
+       struct scsi_device_handler *handler;
+       void                    *handler_data;
+
        enum scsi_device_state sdev_state;
        unsigned long           sdev_data[0];
 } __attribute__((aligned(sizeof(unsigned long))));
 
-typedef void (*activate_complete)(void *, int);
-struct scsi_device_handler {
-       /* Used by the infrastructure */
-       struct list_head list; /* list of scsi_device_handlers */
-
-       /* Filled by the hardware handler */
-       struct module *module;
-       const char *name;
-       int (*check_sense)(struct scsi_device *, struct scsi_sense_hdr *);
-       struct scsi_dh_data *(*attach)(struct scsi_device *);
-       void (*detach)(struct scsi_device *);
-       int (*activate)(struct scsi_device *, activate_complete, void *);
-       int (*prep_fn)(struct scsi_device *, struct request *);
-       int (*set_params)(struct scsi_device *, const char *);
-       bool (*match)(struct scsi_device *);
-};
-
-struct scsi_dh_data {
-       struct scsi_device_handler *scsi_dh;
-       struct scsi_device *sdev;
-       struct kref kref;
-};
-
 #define        to_scsi_device(d)       \
        container_of(d, struct scsi_device, sdev_gendev)
 #define        class_to_sdev(d)        \
index 620c723ee8ed8a741f976599f9b0d4cbc8d2914d..85d731746834d258e002c51db293457fc0a52ece 100644 (file)
@@ -55,11 +55,26 @@ enum {
        SCSI_DH_NOSYS,
        SCSI_DH_DRIVER_MAX,
 };
-#if defined(CONFIG_SCSI_DH) || defined(CONFIG_SCSI_DH_MODULE)
+
+typedef void (*activate_complete)(void *, int);
+struct scsi_device_handler {
+       /* Used by the infrastructure */
+       struct list_head list; /* list of scsi_device_handlers */
+
+       /* Filled by the hardware handler */
+       struct module *module;
+       const char *name;
+       int (*check_sense)(struct scsi_device *, struct scsi_sense_hdr *);
+       int (*attach)(struct scsi_device *);
+       void (*detach)(struct scsi_device *);
+       int (*activate)(struct scsi_device *, activate_complete, void *);
+       int (*prep_fn)(struct scsi_device *, struct request *);
+       int (*set_params)(struct scsi_device *, const char *);
+};
+
+#ifdef CONFIG_SCSI_DH
 extern int scsi_dh_activate(struct request_queue *, activate_complete, void *);
-extern int scsi_dh_handler_exist(const char *);
 extern int scsi_dh_attach(struct request_queue *, const char *);
-extern void scsi_dh_detach(struct request_queue *);
 extern const char *scsi_dh_attached_handler_name(struct request_queue *, gfp_t);
 extern int scsi_dh_set_params(struct request_queue *, const char *);
 #else
@@ -69,18 +84,10 @@ static inline int scsi_dh_activate(struct request_queue *req,
        fn(data, 0);
        return 0;
 }
-static inline int scsi_dh_handler_exist(const char *name)
-{
-       return 0;
-}
 static inline int scsi_dh_attach(struct request_queue *req, const char *name)
 {
        return SCSI_DH_NOSYS;
 }
-static inline void scsi_dh_detach(struct request_queue *q)
-{
-       return;
-}
 static inline const char *scsi_dh_attached_handler_name(struct request_queue *q,
                                                        gfp_t gfp)
 {
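
With this change a hardware handler's attach() returns a plain status instead of allocating a struct scsi_dh_data, and per-device state is expected to live behind sdev->handler_data. A hypothetical handler definition against the relocated struct (callback bodies omitted):

    /* Hypothetical handler; names and bodies are illustrative. */
    static int example_dh_attach(struct scsi_device *sdev);
    static void example_dh_detach(struct scsi_device *sdev);

    static struct scsi_device_handler example_dh = {
            .name   = "example_dh",
            .module = THIS_MODULE,
            .attach = example_dh_attach,    /* now returns an int status */
            .detach = example_dh_detach,
    };
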
index 8d1d7fa67ec48bad6872be07258066f9410eec6e..dbb8c640e26fc4fdb1762816b3c00f425df77c7c 100644 (file)
@@ -4,6 +4,7 @@
 #include <linux/scatterlist.h>
 
 #include <scsi/scsi_cmnd.h>
+#include <scsi/scsi_common.h>
 struct scsi_device;
 struct Scsi_Host;
 
@@ -21,14 +22,9 @@ static inline bool scsi_sense_is_deferred(const struct scsi_sense_hdr *sshdr)
        return ((sshdr->response_code >= 0x70) && (sshdr->response_code & 1));
 }
 
-extern const u8 * scsi_sense_desc_find(const u8 * sense_buffer, int sb_len,
-                                      int desc_type);
-
 extern int scsi_get_sense_info_fld(const u8 * sense_buffer, int sb_len,
                                   u64 * info_out);
 
-extern void scsi_build_sense_buffer(int desc, u8 *buf, u8 key, u8 asc, u8 ascq);
-
 extern int scsi_ioctl_reset(struct scsi_device *, int __user *);
 
 struct scsi_eh_save {
index 370f2909ec19d3a884aaf384549499bac4a6e085..44202ff897fd93a75e7bc7029d9733ffff2b20ec 100644 (file)
@@ -51,11 +51,6 @@ struct tegra_smmu_swgroup {
        unsigned int reg;
 };
 
-struct tegra_smmu_ops {
-       void (*flush_dcache)(struct page *page, unsigned long offset,
-                            size_t size);
-};
-
 struct tegra_smmu_soc {
        const struct tegra_mc_client *clients;
        unsigned int num_clients;
@@ -66,9 +61,8 @@ struct tegra_smmu_soc {
        bool supports_round_robin_arbitration;
        bool supports_request_limit;
 
+       unsigned int num_tlb_lines;
        unsigned int num_asids;
-
-       const struct tegra_smmu_ops *ops;
 };
 
 struct tegra_mc;
index 0aedbb2c10e0451c162118988d6c070efcd9b629..373d3342002bfefdc9911f7b3f5ac57af1826f9b 100644 (file)
@@ -62,6 +62,8 @@
 /* T10 protection information disabled by default */
 #define TA_DEFAULT_T10_PI              0
 #define TA_DEFAULT_FABRIC_PROT_TYPE    0
+/* TPG status needs to be enabled to return sendtargets discovery endpoint info */
+#define TA_DEFAULT_TPG_ENABLED_SENDTARGETS 1
 
 #define ISCSI_IOV_DATA_BUFFER          5
 
@@ -517,7 +519,6 @@ struct iscsi_conn {
        u16                     cid;
        /* Remote TCP Port */
        u16                     login_port;
-       u16                     local_port;
        int                     net_size;
        int                     login_family;
        u32                     auth_id;
@@ -527,9 +528,8 @@ struct iscsi_conn {
        u32                     exp_statsn;
        /* Per connection status sequence number */
        u32                     stat_sn;
-#define IPV6_ADDRESS_SPACE                             48
-       unsigned char           login_ip[IPV6_ADDRESS_SPACE];
-       unsigned char           local_ip[IPV6_ADDRESS_SPACE];
+       struct sockaddr_storage login_sockaddr;
+       struct sockaddr_storage local_sockaddr;
        int                     conn_usage_count;
        int                     conn_waiting_on_uc;
        atomic_t                check_immediate_queue;
@@ -636,7 +636,7 @@ struct iscsi_session {
        /* session wide counter: expected command sequence number */
        u32                     exp_cmd_sn;
        /* session wide counter: maximum allowed command sequence number */
-       u32                     max_cmd_sn;
+       atomic_t                max_cmd_sn;
        struct list_head        sess_ooo_cmdsn_list;
 
        /* LIO specific session ID */
@@ -764,6 +764,7 @@ struct iscsi_tpg_attrib {
        u32                     default_erl;
        u8                      t10_pi;
        u32                     fabric_prot_type;
+       u32                     tpg_enabled_sendtargets;
        struct iscsi_portal_group *tpg;
 };
 
@@ -776,12 +777,10 @@ struct iscsi_np {
        enum iscsi_timer_flags_table np_login_timer_flags;
        u32                     np_exports;
        enum np_flags_table     np_flags;
-       unsigned char           np_ip[IPV6_ADDRESS_SPACE];
-       u16                     np_port;
        spinlock_t              np_thread_lock;
        struct completion       np_restart_comp;
        struct socket           *np_socket;
-       struct __kernel_sockaddr_storage np_sockaddr;
+       struct sockaddr_storage np_sockaddr;
        struct task_struct      *np_thread;
        struct timer_list       np_login_timer;
        void                    *np_context;
index 3ff76b4faad3265e1fea89c1735411af6f371ddc..e615bb485d0b3a79ea43e7494db956db17805ac0 100644 (file)
@@ -50,7 +50,7 @@ struct iscsi_login_stats {
        u64             last_fail_time;         /* time stamp (jiffies) */
        u32             last_fail_type;
        int             last_intr_fail_ip_family;
-       unsigned char   last_intr_fail_ip_addr[IPV6_ADDRESS_SPACE];
+       struct sockaddr_storage last_intr_fail_sockaddr;
        char            last_intr_fail_name[224];
 } ____cacheline_aligned;
 
index e6bb166f12c212aac238d0d951594cce65851145..90e37faa2ede5d3331dbfb7c246e8060c14e44ab 100644 (file)
@@ -9,7 +9,7 @@ struct iscsit_transport {
        int priv_size;
        struct module *owner;
        struct list_head t_node;
-       int (*iscsit_setup_np)(struct iscsi_np *, struct __kernel_sockaddr_storage *);
+       int (*iscsit_setup_np)(struct iscsi_np *, struct sockaddr_storage *);
        int (*iscsit_accept_np)(struct iscsi_np *, struct iscsi_conn *);
        void (*iscsit_free_np)(struct iscsi_np *);
        void (*iscsit_wait_conn)(struct iscsi_conn *);
index 1e5c8f949bae4947b8645bd1cf9d7d8966708713..56cf8e485ef22101ac22b1120704664bf599eaef 100644 (file)
@@ -93,4 +93,6 @@ bool  target_lun_is_rdonly(struct se_cmd *);
 sense_reason_t passthrough_parse_cdb(struct se_cmd *cmd,
        sense_reason_t (*exec_cmd)(struct se_cmd *cmd));
 
+bool target_sense_desc_format(struct se_device *dev);
+
 #endif /* TARGET_CORE_BACKEND_H */
index 17ae2d6a4891e57245c16fbeb4e2a8a6462d32d0..ac9bf1c0e42d851a6059224cbea852b5d7b9bd44 100644 (file)
@@ -6,6 +6,7 @@
 #include <linux/dma-mapping.h>
 #include <linux/blkdev.h>
 #include <linux/percpu_ida.h>
+#include <linux/t10-pi.h>
 #include <net/sock.h>
 #include <net/tcp.h>
 
@@ -426,12 +427,6 @@ enum target_core_dif_check {
        TARGET_DIF_CHECK_REFTAG = 0x1 << 2,
 };
 
-struct se_dif_v1_tuple {
-       __be16                  guard_tag;
-       __be16                  app_tag;
-       __be32                  ref_tag;
-};
-
 /* for sam_task_attr */
 #define TCM_SIMPLE_TAG 0x20
 #define TCM_HEAD_TAG   0x21
@@ -444,6 +439,9 @@ struct se_cmd {
        u8                      scsi_asc;
        u8                      scsi_ascq;
        u16                     scsi_sense_length;
+       unsigned                cmd_wait_set:1;
+       unsigned                unknown_data_length:1;
+       bool                    state_active:1;
        u64                     tag; /* SAM command identifier aka task tag */
        /* Delay for ALUA Active/NonOptimized state access in milliseconds */
        int                     alua_nonop_delay;
@@ -455,11 +453,8 @@ struct se_cmd {
        unsigned int            map_tag;
        /* Transport protocol dependent state, see transport_state_table */
        enum transport_state_table t_state;
-       unsigned                cmd_wait_set:1;
-       unsigned                unknown_data_length:1;
        /* See se_cmd_flags_table */
        u32                     se_cmd_flags;
-       u32                     se_ordered_id;
        /* Total size in bytes associated with command */
        u32                     data_length;
        u32                     residual_count;
@@ -477,7 +472,6 @@ struct se_cmd {
        struct se_tmr_req       *se_tmr_req;
        struct list_head        se_cmd_list;
        struct completion       cmd_wait_comp;
-       struct kref             cmd_kref;
        const struct target_core_fabric_ops *se_tfo;
        sense_reason_t          (*execute_cmd)(struct se_cmd *);
        sense_reason_t (*transport_complete_callback)(struct se_cmd *, bool);
@@ -497,6 +491,7 @@ struct se_cmd {
 #define CMD_T_REQUEST_STOP     (1 << 8)
 #define CMD_T_BUSY             (1 << 9)
        spinlock_t              t_state_lock;
+       struct kref             cmd_kref;
        struct completion       t_transport_stop_comp;
 
        struct work_struct      work;
@@ -509,8 +504,10 @@ struct se_cmd {
        struct scatterlist      *t_bidi_data_sg;
        unsigned int            t_bidi_data_nents;
 
+       /* Used for lun->lun_ref counting */
+       int                     lun_ref_active;
+
        struct list_head        state_list;
-       bool                    state_active;
 
        /* old task stop completion, consider merging with some of the above */
        struct completion       task_stop_comp;
@@ -518,20 +515,17 @@ struct se_cmd {
        /* backend private data */
        void                    *priv;
 
-       /* Used for lun->lun_ref counting */
-       int                     lun_ref_active;
-
        /* DIF related members */
        enum target_prot_op     prot_op;
        enum target_prot_type   prot_type;
        u8                      prot_checks;
+       bool                    prot_pto;
        u32                     prot_length;
        u32                     reftag_seed;
        struct scatterlist      *t_prot_sg;
        unsigned int            t_prot_nents;
        sense_reason_t          pi_err;
        sector_t                bad_sector;
-       bool                    prot_pto;
 };
 
 struct se_ua {
@@ -598,7 +592,6 @@ struct se_ml_stat_grps {
 };
 
 struct se_lun_acl {
-       char                    initiatorname[TRANSPORT_IQN_LEN];
        u64                     mapped_lun;
        struct se_node_acl      *se_lun_nacl;
        struct se_lun           *se_lun;
@@ -685,7 +678,6 @@ struct se_lun {
 #define SE_LUN_LINK_MAGIC                      0xffff7771
        u32                     lun_link_magic;
        u32                     lun_access;
-       u32                     lun_flags;
        u32                     lun_index;
 
        /* RELATIVE TARGET PORT IDENTIFER */
@@ -751,7 +743,6 @@ struct se_device {
        atomic_long_t           write_bytes;
        /* Active commands on this virtual SE device */
        atomic_t                simple_cmds;
-       atomic_t                dev_ordered_id;
        atomic_t                dev_ordered_sync;
        atomic_t                dev_qf_count;
        u32                     export_count;
index 18afef91b447f950cb5f78b022355a1db6664af0..7fb2557a760e432ffa054f2550acd8e8a9e7085a 100644 (file)
@@ -5,6 +5,19 @@ struct target_core_fabric_ops {
        struct module *module;
        const char *name;
        size_t node_acl_size;
+       /*
+        * Limits number of scatterlist entries per SCF_SCSI_DATA_CDB payload.
+        * Setting this value tells target-core to enforce this limit, and
+        * report as INQUIRY EVPD=b0 MAXIMUM TRANSFER LENGTH.
+        *
+        * target-core will currently reset se_cmd->data_length to this
+        * maximum size, and set UNDERFLOW residual count if length exceeds
+        * this limit.
+        *
+        * XXX: Not all initiator hosts honor this block-limit EVPD
+        * XXX: Currently assumes single PAGE_SIZE per scatterlist entry
+        */
+       u32 max_data_sg_nents;
        char *(*get_fabric_name)(void);
        char *(*tpg_get_wwn)(struct se_portal_group *);
        u16 (*tpg_get_tag)(struct se_portal_group *);
@@ -152,6 +165,7 @@ int transport_generic_handle_tmr(struct se_cmd *);
 void   transport_generic_request_failure(struct se_cmd *, sense_reason_t);
 void   __target_execute_cmd(struct se_cmd *);
 int    transport_lookup_tmr_lun(struct se_cmd *, u64);
+void   core_allocate_nexus_loss_ua(struct se_node_acl *acl);
 
 struct se_node_acl *core_tpg_get_initiator_node_acl(struct se_portal_group *tpg,
                unsigned char *);
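
A hypothetical fabric definition opting in to the max_data_sg_nents cap described in the comment above; the value is an arbitrary example and the remaining mandatory callbacks are omitted:

    static char *example_get_fabric_name(void)
    {
            return "example";
    }

    /* Only the fields relevant to the new cap are shown. */
    static const struct target_core_fabric_ops example_fabric_ops = {
            .module            = THIS_MODULE,
            .name              = "example",
            .get_fabric_name   = example_get_fabric_name,
            .max_data_sg_nents = 1024,  /* clamp SCF_SCSI_DATA_CDB payloads */
    };
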
index a44062da684b575fe3c411ab24efd5795cfa9c02..d6f83222a6a1671ec819d730801398ebb88f80ab 100644 (file)
@@ -358,6 +358,36 @@ TRACE_EVENT(
 
 #endif
 
+TRACE_EVENT(kvm_halt_poll_ns,
+       TP_PROTO(bool grow, unsigned int vcpu_id, int new, int old),
+       TP_ARGS(grow, vcpu_id, new, old),
+
+       TP_STRUCT__entry(
+               __field(bool, grow)
+               __field(unsigned int, vcpu_id)
+               __field(int, new)
+               __field(int, old)
+       ),
+
+       TP_fast_assign(
+               __entry->grow           = grow;
+               __entry->vcpu_id        = vcpu_id;
+               __entry->new            = new;
+               __entry->old            = old;
+       ),
+
+       TP_printk("vcpu %u: halt_poll_ns %d (%s %d)",
+                       __entry->vcpu_id,
+                       __entry->new,
+                       __entry->grow ? "grow" : "shrink",
+                       __entry->old)
+);
+
+#define trace_kvm_halt_poll_ns_grow(vcpu_id, new, old) \
+       trace_kvm_halt_poll_ns(true, vcpu_id, new, old)
+#define trace_kvm_halt_poll_ns_shrink(vcpu_id, new, old) \
+       trace_kvm_halt_poll_ns(false, vcpu_id, new, old)
+
 #endif /* _TRACE_KVM_MAIN_H */
 
 /* This part must be outside protection */
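
The grow/shrink wrappers both funnel into the single kvm_halt_poll_ns event; a sketch of the call-site shape they expect, assuming the per-vCPU halt_poll_ns field added by the dynamic halt-polling series (the doubling policy is illustrative, not the actual kvm_main.c logic):

    static void example_grow_halt_poll_ns(struct kvm_vcpu *vcpu)
    {
            unsigned int old = vcpu->halt_poll_ns;

            /* Illustrative growth policy only. */
            vcpu->halt_poll_ns = old ? old * 2 : 10000;
            trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, vcpu->halt_poll_ns, old);
    }
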
index dee3bb1d5a6b5b28d001b98f46ea9694d7a6bb65..2cca6cd342d897f90269cf1542df40d4a1858a18 100644 (file)
@@ -46,7 +46,7 @@ TRACE_EVENT(task_rename,
        TP_fast_assign(
                __entry->pid = task->pid;
                memcpy(entry->oldcomm, task->comm, TASK_COMM_LEN);
-               memcpy(entry->newcomm, comm, TASK_COMM_LEN);
+               strlcpy(entry->newcomm, comm, TASK_COMM_LEN);
                __entry->oom_score_adj = task->signal->oom_score_adj;
        ),
 
index 12e1321c4e0c8b9f6a4fc7327c0f4bc3a0bdd8c9..5afae8fe37951dcb3027d3b80eb8f0aaf3c6afc6 100644 (file)
@@ -11,7 +11,7 @@ TRACE_EVENT(thermal_power_allocator,
                 u32 total_req_power, u32 *granted_power,
                 u32 total_granted_power, size_t num_actors,
                 u32 power_range, u32 max_allocatable_power,
-                unsigned long current_temp, s32 delta_temp),
+                int current_temp, s32 delta_temp),
        TP_ARGS(tz, req_power, total_req_power, granted_power,
                total_granted_power, num_actors, power_range,
                max_allocatable_power, current_temp, delta_temp),
@@ -24,7 +24,7 @@ TRACE_EVENT(thermal_power_allocator,
                __field(size_t,        num_actors               )
                __field(u32,           power_range              )
                __field(u32,           max_allocatable_power    )
-               __field(unsigned long, current_temp             )
+               __field(int,           current_temp             )
                __field(s32,           delta_temp               )
        ),
        TP_fast_assign(
@@ -42,7 +42,7 @@ TRACE_EVENT(thermal_power_allocator,
                __entry->delta_temp = delta_temp;
        ),
 
-       TP_printk("thermal_zone_id=%d req_power={%s} total_req_power=%u granted_power={%s} total_granted_power=%u power_range=%u max_allocatable_power=%u current_temperature=%lu delta_temperature=%d",
+       TP_printk("thermal_zone_id=%d req_power={%s} total_req_power=%u granted_power={%s} total_granted_power=%u power_range=%u max_allocatable_power=%u current_temperature=%d delta_temperature=%d",
                __entry->tz_id,
                __print_array(__get_dynamic_array(req_power),
                               __entry->num_actors, 4),
index a7aa607a4c55e51ec8ba8a6593828604e31a7aac..fff846b512e6e675043a37d3d5c7cb804b71cbe8 100644 (file)
@@ -131,6 +131,66 @@ DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode,
        TP_ARGS(inode, flags)
 );
 
+#ifdef CREATE_TRACE_POINTS
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+static inline size_t __trace_wb_cgroup_size(struct bdi_writeback *wb)
+{
+       return kernfs_path_len(wb->memcg_css->cgroup->kn) + 1;
+}
+
+static inline void __trace_wb_assign_cgroup(char *buf, struct bdi_writeback *wb)
+{
+       struct cgroup *cgrp = wb->memcg_css->cgroup;
+       char *path;
+
+       path = cgroup_path(cgrp, buf, kernfs_path_len(cgrp->kn) + 1);
+       WARN_ON_ONCE(path != buf);
+}
+
+static inline size_t __trace_wbc_cgroup_size(struct writeback_control *wbc)
+{
+       if (wbc->wb)
+               return __trace_wb_cgroup_size(wbc->wb);
+       else
+               return 2;
+}
+
+static inline void __trace_wbc_assign_cgroup(char *buf,
+                                            struct writeback_control *wbc)
+{
+       if (wbc->wb)
+               __trace_wb_assign_cgroup(buf, wbc->wb);
+       else
+               strcpy(buf, "/");
+}
+
+#else  /* CONFIG_CGROUP_WRITEBACK */
+
+static inline size_t __trace_wb_cgroup_size(struct bdi_writeback *wb)
+{
+       return 2;
+}
+
+static inline void __trace_wb_assign_cgroup(char *buf, struct bdi_writeback *wb)
+{
+       strcpy(buf, "/");
+}
+
+static inline size_t __trace_wbc_cgroup_size(struct writeback_control *wbc)
+{
+       return 2;
+}
+
+static inline void __trace_wbc_assign_cgroup(char *buf,
+                                            struct writeback_control *wbc)
+{
+       strcpy(buf, "/");
+}
+
+#endif /* CONFIG_CGROUP_WRITEBACK */
+#endif /* CREATE_TRACE_POINTS */
+
 DECLARE_EVENT_CLASS(writeback_write_inode_template,
 
        TP_PROTO(struct inode *inode, struct writeback_control *wbc),
@@ -141,6 +201,7 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template,
                __array(char, name, 32)
                __field(unsigned long, ino)
                __field(int, sync_mode)
+               __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc))
        ),
 
        TP_fast_assign(
@@ -148,12 +209,14 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template,
                        dev_name(inode_to_bdi(inode)->dev), 32);
                __entry->ino            = inode->i_ino;
                __entry->sync_mode      = wbc->sync_mode;
+               __trace_wbc_assign_cgroup(__get_str(cgroup), wbc);
        ),
 
-       TP_printk("bdi %s: ino=%lu sync_mode=%d",
+       TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup=%s",
                __entry->name,
                __entry->ino,
-               __entry->sync_mode
+               __entry->sync_mode,
+               __get_str(cgroup)
        )
 );
 
@@ -172,8 +235,8 @@ DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode,
 );
 
 DECLARE_EVENT_CLASS(writeback_work_class,
-       TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work),
-       TP_ARGS(bdi, work),
+       TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work),
+       TP_ARGS(wb, work),
        TP_STRUCT__entry(
                __array(char, name, 32)
                __field(long, nr_pages)
@@ -183,10 +246,11 @@ DECLARE_EVENT_CLASS(writeback_work_class,
                __field(int, range_cyclic)
                __field(int, for_background)
                __field(int, reason)
+               __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
        ),
        TP_fast_assign(
                strncpy(__entry->name,
-                       bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32);
+                       wb->bdi->dev ? dev_name(wb->bdi->dev) : "(unknown)", 32);
                __entry->nr_pages = work->nr_pages;
                __entry->sb_dev = work->sb ? work->sb->s_dev : 0;
                __entry->sync_mode = work->sync_mode;
@@ -194,9 +258,10 @@ DECLARE_EVENT_CLASS(writeback_work_class,
                __entry->range_cyclic = work->range_cyclic;
                __entry->for_background = work->for_background;
                __entry->reason = work->reason;
+               __trace_wb_assign_cgroup(__get_str(cgroup), wb);
        ),
        TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d "
-                 "kupdate=%d range_cyclic=%d background=%d reason=%s",
+                 "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup=%s",
                  __entry->name,
                  MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev),
                  __entry->nr_pages,
@@ -204,13 +269,14 @@ DECLARE_EVENT_CLASS(writeback_work_class,
                  __entry->for_kupdate,
                  __entry->range_cyclic,
                  __entry->for_background,
-                 __print_symbolic(__entry->reason, WB_WORK_REASON)
+                 __print_symbolic(__entry->reason, WB_WORK_REASON),
+                 __get_str(cgroup)
        )
 );
 #define DEFINE_WRITEBACK_WORK_EVENT(name) \
 DEFINE_EVENT(writeback_work_class, name, \
-       TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), \
-       TP_ARGS(bdi, work))
+       TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work), \
+       TP_ARGS(wb, work))
 DEFINE_WRITEBACK_WORK_EVENT(writeback_queue);
 DEFINE_WRITEBACK_WORK_EVENT(writeback_exec);
 DEFINE_WRITEBACK_WORK_EVENT(writeback_start);
@@ -230,26 +296,42 @@ TRACE_EVENT(writeback_pages_written,
 );
 
 DECLARE_EVENT_CLASS(writeback_class,
-       TP_PROTO(struct backing_dev_info *bdi),
-       TP_ARGS(bdi),
+       TP_PROTO(struct bdi_writeback *wb),
+       TP_ARGS(wb),
        TP_STRUCT__entry(
                __array(char, name, 32)
+               __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
        ),
        TP_fast_assign(
-               strncpy(__entry->name, dev_name(bdi->dev), 32);
+               strncpy(__entry->name, dev_name(wb->bdi->dev), 32);
+               __trace_wb_assign_cgroup(__get_str(cgroup), wb);
        ),
-       TP_printk("bdi %s",
-                 __entry->name
+       TP_printk("bdi %s: cgroup=%s",
+                 __entry->name,
+                 __get_str(cgroup)
        )
 );
 #define DEFINE_WRITEBACK_EVENT(name) \
 DEFINE_EVENT(writeback_class, name, \
-       TP_PROTO(struct backing_dev_info *bdi), \
-       TP_ARGS(bdi))
+       TP_PROTO(struct bdi_writeback *wb), \
+       TP_ARGS(wb))
 
 DEFINE_WRITEBACK_EVENT(writeback_nowork);
 DEFINE_WRITEBACK_EVENT(writeback_wake_background);
-DEFINE_WRITEBACK_EVENT(writeback_bdi_register);
+
+TRACE_EVENT(writeback_bdi_register,
+       TP_PROTO(struct backing_dev_info *bdi),
+       TP_ARGS(bdi),
+       TP_STRUCT__entry(
+               __array(char, name, 32)
+       ),
+       TP_fast_assign(
+               strncpy(__entry->name, dev_name(bdi->dev), 32);
+       ),
+       TP_printk("bdi %s",
+               __entry->name
+       )
+);
 
 DECLARE_EVENT_CLASS(wbc_class,
        TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi),
@@ -265,6 +347,7 @@ DECLARE_EVENT_CLASS(wbc_class,
                __field(int, range_cyclic)
                __field(long, range_start)
                __field(long, range_end)
+               __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc))
        ),
 
        TP_fast_assign(
@@ -278,11 +361,12 @@ DECLARE_EVENT_CLASS(wbc_class,
                __entry->range_cyclic   = wbc->range_cyclic;
                __entry->range_start    = (long)wbc->range_start;
                __entry->range_end      = (long)wbc->range_end;
+               __trace_wbc_assign_cgroup(__get_str(cgroup), wbc);
        ),
 
        TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d "
                "bgrd=%d reclm=%d cyclic=%d "
-               "start=0x%lx end=0x%lx",
+               "start=0x%lx end=0x%lx cgroup=%s",
                __entry->name,
                __entry->nr_to_write,
                __entry->pages_skipped,
@@ -292,7 +376,9 @@ DECLARE_EVENT_CLASS(wbc_class,
                __entry->for_reclaim,
                __entry->range_cyclic,
                __entry->range_start,
-               __entry->range_end)
+               __entry->range_end,
+               __get_str(cgroup)
+       )
 )
 
 #define DEFINE_WBC_EVENT(name) \
@@ -312,6 +398,7 @@ TRACE_EVENT(writeback_queue_io,
                __field(long,           age)
                __field(int,            moved)
                __field(int,            reason)
+               __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
        ),
        TP_fast_assign(
                unsigned long *older_than_this = work->older_than_this;
@@ -321,13 +408,15 @@ TRACE_EVENT(writeback_queue_io,
                                  (jiffies - *older_than_this) * 1000 / HZ : -1;
                __entry->moved  = moved;
                __entry->reason = work->reason;
+               __trace_wb_assign_cgroup(__get_str(cgroup), wb);
        ),
-       TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s",
+       TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup=%s",
                __entry->name,
                __entry->older, /* older_than_this in jiffies */
                __entry->age,   /* older_than_this in relative milliseconds */
                __entry->moved,
-               __print_symbolic(__entry->reason, WB_WORK_REASON)
+               __print_symbolic(__entry->reason, WB_WORK_REASON),
+               __get_str(cgroup)
        )
 );
 
@@ -381,11 +470,11 @@ TRACE_EVENT(global_dirty_state,
 
 TRACE_EVENT(bdi_dirty_ratelimit,
 
-       TP_PROTO(struct backing_dev_info *bdi,
+       TP_PROTO(struct bdi_writeback *wb,
                 unsigned long dirty_rate,
                 unsigned long task_ratelimit),
 
-       TP_ARGS(bdi, dirty_rate, task_ratelimit),
+       TP_ARGS(wb, dirty_rate, task_ratelimit),
 
        TP_STRUCT__entry(
                __array(char,           bdi, 32)
@@ -395,36 +484,39 @@ TRACE_EVENT(bdi_dirty_ratelimit,
                __field(unsigned long,  dirty_ratelimit)
                __field(unsigned long,  task_ratelimit)
                __field(unsigned long,  balanced_dirty_ratelimit)
+               __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
        ),
 
        TP_fast_assign(
-               strlcpy(__entry->bdi, dev_name(bdi->dev), 32);
-               __entry->write_bw       = KBps(bdi->wb.write_bandwidth);
-               __entry->avg_write_bw   = KBps(bdi->wb.avg_write_bandwidth);
+               strlcpy(__entry->bdi, dev_name(wb->bdi->dev), 32);
+               __entry->write_bw       = KBps(wb->write_bandwidth);
+               __entry->avg_write_bw   = KBps(wb->avg_write_bandwidth);
                __entry->dirty_rate     = KBps(dirty_rate);
-               __entry->dirty_ratelimit = KBps(bdi->wb.dirty_ratelimit);
+               __entry->dirty_ratelimit = KBps(wb->dirty_ratelimit);
                __entry->task_ratelimit = KBps(task_ratelimit);
                __entry->balanced_dirty_ratelimit =
-                                       KBps(bdi->wb.balanced_dirty_ratelimit);
+                                       KBps(wb->balanced_dirty_ratelimit);
+               __trace_wb_assign_cgroup(__get_str(cgroup), wb);
        ),
 
        TP_printk("bdi %s: "
                  "write_bw=%lu awrite_bw=%lu dirty_rate=%lu "
                  "dirty_ratelimit=%lu task_ratelimit=%lu "
-                 "balanced_dirty_ratelimit=%lu",
+                 "balanced_dirty_ratelimit=%lu cgroup=%s",
                  __entry->bdi,
                  __entry->write_bw,            /* write bandwidth */
                  __entry->avg_write_bw,        /* avg write bandwidth */
                  __entry->dirty_rate,          /* bdi dirty rate */
                  __entry->dirty_ratelimit,     /* base ratelimit */
                  __entry->task_ratelimit, /* ratelimit with position control */
-                 __entry->balanced_dirty_ratelimit /* the balanced ratelimit */
+                 __entry->balanced_dirty_ratelimit, /* the balanced ratelimit */
+                 __get_str(cgroup)
        )
 );
 
 TRACE_EVENT(balance_dirty_pages,
 
-       TP_PROTO(struct backing_dev_info *bdi,
+       TP_PROTO(struct bdi_writeback *wb,
                 unsigned long thresh,
                 unsigned long bg_thresh,
                 unsigned long dirty,
@@ -437,7 +529,7 @@ TRACE_EVENT(balance_dirty_pages,
                 long pause,
                 unsigned long start_time),
 
-       TP_ARGS(bdi, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty,
+       TP_ARGS(wb, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty,
                dirty_ratelimit, task_ratelimit,
                dirtied, period, pause, start_time),
 
@@ -456,11 +548,12 @@ TRACE_EVENT(balance_dirty_pages,
                __field(         long,  pause)
                __field(unsigned long,  period)
                __field(         long,  think)
+               __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
        ),
 
        TP_fast_assign(
                unsigned long freerun = (thresh + bg_thresh) / 2;
-               strlcpy(__entry->bdi, dev_name(bdi->dev), 32);
+               strlcpy(__entry->bdi, dev_name(wb->bdi->dev), 32);
 
                __entry->limit          = global_wb_domain.dirty_limit;
                __entry->setpoint       = (global_wb_domain.dirty_limit +
@@ -478,6 +571,7 @@ TRACE_EVENT(balance_dirty_pages,
                __entry->period         = period * 1000 / HZ;
                __entry->pause          = pause * 1000 / HZ;
                __entry->paused         = (jiffies - start_time) * 1000 / HZ;
+               __trace_wb_assign_cgroup(__get_str(cgroup), wb);
        ),
 
 
@@ -486,7 +580,7 @@ TRACE_EVENT(balance_dirty_pages,
                  "bdi_setpoint=%lu bdi_dirty=%lu "
                  "dirty_ratelimit=%lu task_ratelimit=%lu "
                  "dirtied=%u dirtied_pause=%u "
-                 "paused=%lu pause=%ld period=%lu think=%ld",
+                 "paused=%lu pause=%ld period=%lu think=%ld cgroup=%s",
                  __entry->bdi,
                  __entry->limit,
                  __entry->setpoint,
@@ -500,7 +594,8 @@ TRACE_EVENT(balance_dirty_pages,
                  __entry->paused,      /* ms */
                  __entry->pause,       /* ms */
                  __entry->period,      /* ms */
-                 __entry->think        /* ms */
+                 __entry->think,       /* ms */
+                 __get_str(cgroup)
          )
 );
 
@@ -514,6 +609,8 @@ TRACE_EVENT(writeback_sb_inodes_requeue,
                __field(unsigned long, ino)
                __field(unsigned long, state)
                __field(unsigned long, dirtied_when)
+               __dynamic_array(char, cgroup,
+                               __trace_wb_cgroup_size(inode_to_wb(inode)))
        ),
 
        TP_fast_assign(
@@ -522,14 +619,16 @@ TRACE_EVENT(writeback_sb_inodes_requeue,
                __entry->ino            = inode->i_ino;
                __entry->state          = inode->i_state;
                __entry->dirtied_when   = inode->dirtied_when;
+               __trace_wb_assign_cgroup(__get_str(cgroup), inode_to_wb(inode));
        ),
 
-       TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu",
+       TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup=%s",
                  __entry->name,
                  __entry->ino,
                  show_inode_state(__entry->state),
                  __entry->dirtied_when,
-                 (jiffies - __entry->dirtied_when) / HZ
+                 (jiffies - __entry->dirtied_when) / HZ,
+                 __get_str(cgroup)
        )
 );
 
@@ -585,6 +684,7 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
                __field(unsigned long, writeback_index)
                __field(long, nr_to_write)
                __field(unsigned long, wrote)
+               __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc))
        ),
 
        TP_fast_assign(
@@ -596,10 +696,11 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
                __entry->writeback_index = inode->i_mapping->writeback_index;
                __entry->nr_to_write    = nr_to_write;
                __entry->wrote          = nr_to_write - wbc->nr_to_write;
+               __trace_wbc_assign_cgroup(__get_str(cgroup), wbc);
        ),
 
        TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu "
-                 "index=%lu to_write=%ld wrote=%lu",
+                 "index=%lu to_write=%ld wrote=%lu cgroup=%s",
                  __entry->name,
                  __entry->ino,
                  show_inode_state(__entry->state),
@@ -607,7 +708,8 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
                  (jiffies - __entry->dirtied_when) / HZ,
                  __entry->writeback_index,
                  __entry->nr_to_write,
-                 __entry->wrote
+                 __entry->wrote,
+                 __get_str(cgroup)
        )
 );
 
index e016bd9b1a049686da0ee465fd561a8062156ff6..8da542a2874d6c3490860eec0092ac26998a303d 100644 (file)
@@ -709,9 +709,11 @@ __SYSCALL(__NR_memfd_create, sys_memfd_create)
 __SYSCALL(__NR_bpf, sys_bpf)
 #define __NR_execveat 281
 __SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat)
+#define __NR_membarrier 282
+__SYSCALL(__NR_membarrier, sys_membarrier)
 
 #undef __NR_syscalls
-#define __NR_syscalls 282
+#define __NR_syscalls 283
 
 /*
  * All syscalls below here should go away really,
index dbd16a2d37db6defcadf264d5f5a5b474d819be2..fd5aa47bd6892841a285dbce151afa71dac4b618 100644 (file)
@@ -358,7 +358,7 @@ typedef struct drm_i915_irq_wait {
 #define I915_PARAM_HAS_RESOURCE_STREAMER 36
 
 typedef struct drm_i915_getparam {
-       s32 param;
+       __s32 param;
        /*
         * WARNING: Using pointers instead of fixed-size u64 means we need to write
         * compat32 code. Don't repeat this mistake.
index 70ff1d9abf0ddab0055d64b1d9ec70b8f488fd9f..f7b2db44eb4b07a910d0097e63a657c1e4a37816 100644 (file)
@@ -252,6 +252,7 @@ header-y += mdio.h
 header-y += media.h
 header-y += media-bus-format.h
 header-y += mei.h
+header-y += membarrier.h
 header-y += memfd.h
 header-y += mempolicy.h
 header-y += meye.h
index 1f977dd4c370a8c031401bde254752fb1d77bb52..843540c398eb078d318a1a1c15d11b303a092110 100644 (file)
 #define AUDIT_OBJ_UID  109
 #define AUDIT_OBJ_GID  110
 #define AUDIT_FIELD_COMPARE    111
+#define AUDIT_EXE      112
 
 #define AUDIT_ARG0      200
 #define AUDIT_ARG1      (AUDIT_ARG0+1)
@@ -324,8 +325,10 @@ enum {
 
 #define AUDIT_FEATURE_BITMAP_BACKLOG_LIMIT     0x00000001
 #define AUDIT_FEATURE_BITMAP_BACKLOG_WAIT_TIME 0x00000002
+#define AUDIT_FEATURE_BITMAP_EXECUTABLE_PATH   0x00000004
 #define AUDIT_FEATURE_BITMAP_ALL (AUDIT_FEATURE_BITMAP_BACKLOG_LIMIT | \
-                                 AUDIT_FEATURE_BITMAP_BACKLOG_WAIT_TIME)
+                                 AUDIT_FEATURE_BITMAP_BACKLOG_WAIT_TIME | \
+                                 AUDIT_FEATURE_BITMAP_EXECUTABLE_PATH)
 
 /* deprecated: AUDIT_VERSION_* */
 #define AUDIT_VERSION_LATEST           AUDIT_FEATURE_BITMAP_ALL
index 3429a3ba382b64edcef8f7f299134f81cead2184..b56dfcfe922ae2becbda3aea5116c97379bfe939 100644 (file)
@@ -39,6 +39,7 @@
 #define EM_TI_C6000    140     /* TI C6X DSPs */
 #define EM_AARCH64     183     /* ARM 64 bit */
 #define EM_TILEPRO     188     /* Tilera TILEPro */
+#define EM_MICROBLAZE  189     /* Xilinx MicroBlaze */
 #define EM_TILEGX      191     /* Tilera TILE-Gx */
 #define EM_FRV         0x5441  /* Fujitsu FR-V */
 #define EM_AVR32       0x18ad  /* Atmel AVR32 */
index aa63ed023c2b96b61b42231f9dd9b34b6ae46b66..ea9221b0331adea9b748afdbc761e386d6814b4d 100644 (file)
@@ -42,6 +42,7 @@
 #define ETH_P_LOOP     0x0060          /* Ethernet Loopback packet     */
 #define ETH_P_PUP      0x0200          /* Xerox PUP packet             */
 #define ETH_P_PUPAT    0x0201          /* Xerox PUP Addr Trans packet  */
+#define ETH_P_TSN      0x22F0          /* TSN (IEEE 1722) packet       */
 #define ETH_P_IP       0x0800          /* Internet Protocol packet     */
 #define ETH_P_X25      0x0805          /* CCITT X.25                   */
 #define ETH_P_ARP      0x0806          /* Address Resolution packet    */
index a6c4962e5d4623912adb21f4c240b352de223481..5da5f8751ce7dc082a4a99cfe29e7d2eb7f6ba5a 100644 (file)
@@ -33,6 +33,7 @@
 #define KPF_THP                        22
 #define KPF_BALLOON            23
 #define KPF_ZERO_PAGE          24
+#define KPF_IDLE               25
 
 
 #endif /* _UAPILINUX_KERNEL_PAGE_FLAGS_H */
index 0d831f94f8a8f773e3ff034554d911bf8de4cc17..a9256f0331aeaf6be9af8f8507f480a54d0ca273 100644 (file)
@@ -237,6 +237,7 @@ struct kvm_run {
                        __u32 count;
                        __u64 data_offset; /* relative to kvm_run start */
                } io;
+               /* KVM_EXIT_DEBUG */
                struct {
                        struct kvm_debug_exit_arch arch;
                } debug;
@@ -285,6 +286,7 @@ struct kvm_run {
                        __u32 data;
                        __u8  is_write;
                } dcr;
+               /* KVM_EXIT_INTERNAL_ERROR */
                struct {
                        __u32 suberror;
                        /* Available with KVM_CAP_INTERNAL_ERROR_DATA: */
@@ -295,6 +297,7 @@ struct kvm_run {
                struct {
                        __u64 gprs[32];
                } osi;
+               /* KVM_EXIT_PAPR_HCALL */
                struct {
                        __u64 nr;
                        __u64 ret;
@@ -819,6 +822,8 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_DISABLE_QUIRKS 116
 #define KVM_CAP_X86_SMM 117
 #define KVM_CAP_MULTI_ADDRESS_SPACE 118
+#define KVM_CAP_GUEST_DEBUG_HW_BPS 119
+#define KVM_CAP_GUEST_DEBUG_HW_WPS 120
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h
new file mode 100644 (file)
index 0000000..e0b108b
--- /dev/null
@@ -0,0 +1,53 @@
+#ifndef _UAPI_LINUX_MEMBARRIER_H
+#define _UAPI_LINUX_MEMBARRIER_H
+
+/*
+ * linux/membarrier.h
+ *
+ * membarrier system call API
+ *
+ * Copyright (c) 2010, 2015 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/**
+ * enum membarrier_cmd - membarrier system call command
+ * @MEMBARRIER_CMD_QUERY:   Query the set of supported commands. It returns
+ *                          a bitmask of valid commands.
+ * @MEMBARRIER_CMD_SHARED:  Execute a memory barrier on all running threads.
+ *                          Upon return from system call, the caller thread
+ *                          is ensured that all running threads have passed
+ *                          through a state where all memory accesses to
+ *                          user-space addresses match program order between
+ *                          entry to and return from the system call
+ *                          (non-running threads are de facto in such a
+ *                          state). This covers threads from all processes
+ *                          running on the system. This command returns 0.
+ *
+ * Command to be passed to the membarrier system call. The commands need to
+ * be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to
+ * the value 0.
+ */
+enum membarrier_cmd {
+       MEMBARRIER_CMD_QUERY = 0,
+       MEMBARRIER_CMD_SHARED = (1 << 0),
+};
+
+#endif /* _UAPI_LINUX_MEMBARRIER_H */
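
A userspace sketch of the new system call: query the supported commands, then issue a shared barrier if available. It assumes headers that already define __NR_membarrier (282 in the generic table wired up by this merge; the number differs per architecture):

    #include <linux/membarrier.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static int membarrier(int cmd, int flags)
    {
            return syscall(__NR_membarrier, cmd, flags);
    }

    int issue_shared_barrier(void)
    {
            int mask = membarrier(MEMBARRIER_CMD_QUERY, 0);

            if (mask < 0 || !(mask & MEMBARRIER_CMD_SHARED))
                    return -1;      /* syscall or command not available */

            return membarrier(MEMBARRIER_CMD_SHARED, 0);
    }
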
index 2b94ea2287bb92fd882b92f81faaa36b28731f28..5b4a4be06e2b9301699db8da20d8df8d10bc0a89 100644 (file)
@@ -87,7 +87,7 @@ struct nd_cmd_ars_status {
                __u32 handle;
                __u32 flags;
                __u64 err_address;
-               __u64 mask;
+               __u64 length;
        } __packed records[0];
 } __packed;
 
@@ -111,6 +111,11 @@ enum {
        ND_CMD_VENDOR = 9,
 };
 
+enum {
+       ND_ARS_VOLATILE = 1,
+       ND_ARS_PERSISTENT = 2,
+};
+
 static inline const char *nvdimm_bus_cmd_name(unsigned cmd)
 {
        static const char * const names[] = {
@@ -194,4 +199,9 @@ enum nd_driver_flags {
 enum {
        ND_MIN_NAMESPACE_SIZE = 0x00400000,
 };
+
+enum ars_masks {
+       ARS_STATUS_MASK = 0x0000FFFF,
+       ARS_EXT_STATUS_SHIFT = 16,
+};
 #endif /* __NDCTL_H__ */
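
The ars_masks values suggest the ARS status word packs a 16-bit status in its low half and an extended status above it; a small decoding sketch under that assumption:

    /* Assumed packing: low 16 bits = status, high 16 bits = extended status. */
    static inline unsigned int nd_ars_status(unsigned int status)
    {
            return status & ARS_STATUS_MASK;
    }

    static inline unsigned int nd_ars_ext_status(unsigned int status)
    {
            return status >> ARS_EXT_STATUS_SHIFT;
    }
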
index 2119c7c274d710696b8dc3a38ed718be6a941d01..2b871e0858d9fb23364ab908b22a39118bf5d34e 100644 (file)
@@ -15,7 +15,7 @@
 
 #include <linux/types.h>
 
-#define NFS4_BITMAP_SIZE       2
+#define NFS4_BITMAP_SIZE       3
 #define NFS4_VERIFIER_SIZE     8
 #define NFS4_STATEID_SEQID_SIZE 4
 #define NFS4_STATEID_OTHER_SIZE 12
index cf1019e15f5bc57c0fbf6120a96deb2340465294..a7a69798661440b33f13c74e6560aabdb335ed1a 100644 (file)
@@ -89,9 +89,11 @@ struct ptrace_peeksiginfo_args {
 #define PTRACE_O_TRACESECCOMP  (1 << PTRACE_EVENT_SECCOMP)
 
 /* eventless options */
-#define PTRACE_O_EXITKILL      (1 << 20)
+#define PTRACE_O_EXITKILL              (1 << 20)
+#define PTRACE_O_SUSPEND_SECCOMP       (1 << 21)
 
-#define PTRACE_O_MASK          (0x000000ff | PTRACE_O_EXITKILL)
+#define PTRACE_O_MASK          (\
+       0x000000ff | PTRACE_O_EXITKILL | PTRACE_O_SUSPEND_SECCOMP)
 
 #include <asm/ptrace.h>
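
PTRACE_O_SUSPEND_SECCOMP lets a privileged tracer (checkpoint/restore is the motivating user) temporarily disable the tracee's seccomp filters. A hedged userspace sketch; it needs CAP_SYS_ADMIN and a tracer that is not itself seccomp-confined:

    #include <stdio.h>
    #include <sys/ptrace.h>
    #include <sys/types.h>

    /* Older libc headers may not define PTRACE_O_SUSPEND_SECCOMP yet. */
    static int suspend_tracee_seccomp(pid_t pid)
    {
            long opts = PTRACE_O_EXITKILL | PTRACE_O_SUSPEND_SECCOMP;

            if (ptrace(PTRACE_SETOPTIONS, pid, 0, opts) == -1) {
                    perror("PTRACE_SETOPTIONS");
                    return -1;
            }
            return 0;
    }
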
 
index b67f99d3c520bdc6ca6e51803eb12b1479709fdf..95c6521d8a95ffbc32327fa83b6761b9209392a7 100644 (file)
 #define TCMU_MAILBOX_VERSION 2
 #define ALIGN_SIZE 64 /* Should be enough for most CPUs */
 
-/* See https://gcc.gnu.org/onlinedocs/cpp/Stringification.html */
-#define xstr(s) str(s)
-#define str(s) #s
-
 struct tcmu_mailbox {
        __u16 version;
        __u16 flags;
index e9bef5b2f91ebf61b1d244322d911e111c794a4f..c58bf4b5bb266438c217c46a76b7eb21150bfdc4 100644 (file)
@@ -1,6 +1,7 @@
 /* toshiba.h -- Linux driver for accessing the SMM on Toshiba laptops 
  *
  * Copyright (c) 1996-2000  Jonathan A. Buzzard (jonathan@buzzard.org.uk)
+ * Copyright (c) 2015  Azael Avalos <coproscefalo@gmail.com>
  *
  * Thanks to Juergen Heinzl <juergen@monocerus.demon.co.uk> for the pointers
  * on making sure the structure is aligned and packed.
 #ifndef _UAPI_LINUX_TOSHIBA_H
 #define _UAPI_LINUX_TOSHIBA_H
 
-#define TOSH_PROC "/proc/toshiba"
-#define TOSH_DEVICE "/dev/toshiba"
-#define TOSH_SMM _IOWR('t', 0x90, int) /* broken: meant 24 bytes */
+/*
+ * Toshiba modules paths
+ */
+
+#define TOSH_PROC              "/proc/toshiba"
+#define TOSH_DEVICE            "/dev/toshiba"
+#define TOSHIBA_ACPI_PROC      "/proc/acpi/toshiba"
+#define TOSHIBA_ACPI_DEVICE    "/dev/toshiba_acpi"
+
+/*
+ * Toshiba SMM structure
+ */
 
 typedef struct {
        unsigned int eax;
@@ -33,5 +43,21 @@ typedef struct {
        unsigned int edi __attribute__ ((packed));
 } SMMRegisters;
 
+/*
+ * IOCTLs (0x90 - 0x91)
+ */
+
+#define TOSH_SMM               _IOWR('t', 0x90, SMMRegisters)
+/*
+ * Convenience toshiba_acpi command.
+ *
+ * The System Configuration Interface (SCI) is opened/closed internally
+ * to shield user-space from buggy BIOSes.
+ *
+ * The toshiba_acpi module checks whether the eax register is set with
+ * SCI_GET (0xf300) or SCI_SET (0xf400), returning -EINVAL if not.
+ */
+#define TOSHIBA_ACPI_SCI       _IOWR('t', 0x91, SMMRegisters)
+
 
 #endif /* _UAPI_LINUX_TOSHIBA_H */
index 687ae332200f9f76f6c3a557b257ecb45b0a458f..231901b08f6ce750501c7734ca4567a14caebf4b 100644 (file)
@@ -5,3 +5,4 @@ header-y += ib_user_sa.h
 header-y += ib_user_verbs.h
 header-y += rdma_netlink.h
 header-y += rdma_user_cm.h
+header-y += hfi/
diff --git a/include/uapi/rdma/hfi/Kbuild b/include/uapi/rdma/hfi/Kbuild
new file mode 100644 (file)
index 0000000..ef23c29
--- /dev/null
@@ -0,0 +1,2 @@
+# UAPI Header export list
+header-y += hfi1_user.h
diff --git a/include/uapi/rdma/hfi/hfi1_user.h b/include/uapi/rdma/hfi/hfi1_user.h
new file mode 100644 (file)
index 0000000..78c442f
--- /dev/null
@@ -0,0 +1,427 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * This file contains defines, structures, etc. that are used
+ * to communicate between kernel and user code.
+ */
+
+#ifndef _LINUX__HFI1_USER_H
+#define _LINUX__HFI1_USER_H
+
+#include <linux/types.h>
+
+/*
+ * This version number is given to the driver by the user code during
+ * initialization in the spu_userversion field of hfi1_user_info, so
+ * the driver can check for compatibility with user code.
+ *
+ * The major version changes when data structures change in an incompatible
+ * way. The driver must be the same for initialization to succeed.
+ */
+#define HFI1_USER_SWMAJOR 4
+
+/*
+ * Minor version differences are always compatible
+ * within a major version, however if user software is larger
+ * than driver software, some new features and/or structure fields
+ * may not be implemented; the user code must deal with this if it
+ * cares, or it must abort after initialization reports the difference.
+ */
+#define HFI1_USER_SWMINOR 0
+
+/*
+ * Set of HW and driver capability/feature bits.
+ * These bit values are used to configure enabled/disabled HW and
+ * driver features. The same set of bits are communicated to user
+ * space.
+ */
+#define HFI1_CAP_DMA_RTAIL        (1UL <<  0) /* Use DMA'ed RTail value */
+#define HFI1_CAP_SDMA             (1UL <<  1) /* Enable SDMA support */
+#define HFI1_CAP_SDMA_AHG         (1UL <<  2) /* Enable SDMA AHG support */
+#define HFI1_CAP_EXTENDED_PSN     (1UL <<  3) /* Enable Extended PSN support */
+#define HFI1_CAP_HDRSUPP          (1UL <<  4) /* Enable Header Suppression */
+/* 1UL << 5 reserved */
+#define HFI1_CAP_USE_SDMA_HEAD    (1UL <<  6) /* DMA Hdr Q tail vs. use CSR */
+#define HFI1_CAP_MULTI_PKT_EGR    (1UL <<  7) /* Enable multi-packet Egr buffs*/
+#define HFI1_CAP_NODROP_RHQ_FULL  (1UL <<  8) /* Don't drop on Hdr Q full */
+#define HFI1_CAP_NODROP_EGR_FULL  (1UL <<  9) /* Don't drop on EGR buffs full */
+#define HFI1_CAP_TID_UNMAP        (1UL << 10) /* Enable Expected TID caching */
+#define HFI1_CAP_PRINT_UNIMPL     (1UL << 11) /* Show for unimplemented feats */
+#define HFI1_CAP_ALLOW_PERM_JKEY  (1UL << 12) /* Allow use of permissive JKEY */
+#define HFI1_CAP_NO_INTEGRITY     (1UL << 13) /* Enable ctxt integrity checks */
+#define HFI1_CAP_PKEY_CHECK       (1UL << 14) /* Enable ctxt PKey checking */
+#define HFI1_CAP_STATIC_RATE_CTRL (1UL << 15) /* Allow PBC.StaticRateControl */
+#define HFI1_CAP_QSFP_ENABLED     (1UL << 16) /* Enable QSFP check during LNI */
+#define HFI1_CAP_SDMA_HEAD_CHECK  (1UL << 17) /* SDMA head checking */
+#define HFI1_CAP_EARLY_CREDIT_RETURN (1UL << 18) /* early credit return */
+
+#define HFI1_RCVHDR_ENTSIZE_2    (1UL << 0)
+#define HFI1_RCVHDR_ENTSIZE_16   (1UL << 1)
+#define HFI1_RCVDHR_ENTSIZE_32   (1UL << 2)
+
+/*
+ * If the unit is specified via open, HFI choice is fixed.  If port is
+ * specified, it's also fixed.  Otherwise we try to spread contexts
+ * across ports and HFIs, using different algorithms.  WITHIN is
+ * the old default, prior to this mechanism.
+ */
+#define HFI1_ALG_ACROSS 0 /* round robin contexts across HFIs, then
+                         * ports; this is the default */
+#define HFI1_ALG_WITHIN 1 /* use all contexts on an HFI (round robin
+                         * active ports within), then next HFI */
+#define HFI1_ALG_COUNT  2 /* number of algorithm choices */
+
+
+/* User commands. */
+#define HFI1_CMD_ASSIGN_CTXT     1     /* allocate HFI and context */
+#define HFI1_CMD_CTXT_INFO       2     /* find out what resources we got */
+#define HFI1_CMD_USER_INFO       3     /* set up userspace */
+#define HFI1_CMD_TID_UPDATE      4     /* update expected TID entries */
+#define HFI1_CMD_TID_FREE        5     /* free expected TID entries */
+#define HFI1_CMD_CREDIT_UPD      6     /* force an update of PIO credit */
+#define HFI1_CMD_SDMA_STATUS_UPD 7       /* force update of SDMA status ring */
+
+#define HFI1_CMD_RECV_CTRL       8     /* control receipt of packets */
+#define HFI1_CMD_POLL_TYPE       9     /* set the kind of polling we want */
+#define HFI1_CMD_ACK_EVENT       10    /* ack & clear user status bits */
+#define HFI1_CMD_SET_PKEY        11      /* set context's pkey */
+#define HFI1_CMD_CTXT_RESET      12      /* reset context's HW send context */
+/* separate EPROM commands from normal PSM commands */
+#define HFI1_CMD_EP_INFO         64      /* read EPROM device ID */
+#define HFI1_CMD_EP_ERASE_CHIP   65      /* erase whole EPROM */
+#define HFI1_CMD_EP_ERASE_P0     66      /* erase EPROM partition 0 */
+#define HFI1_CMD_EP_ERASE_P1     67      /* erase EPROM partition 1 */
+#define HFI1_CMD_EP_READ_P0      68      /* read EPROM partition 0 */
+#define HFI1_CMD_EP_READ_P1      69      /* read EPROM partition 1 */
+#define HFI1_CMD_EP_WRITE_P0     70      /* write EPROM partition 0 */
+#define HFI1_CMD_EP_WRITE_P1     71      /* write EPROM partition 1 */
+
+#define _HFI1_EVENT_FROZEN_BIT       0
+#define _HFI1_EVENT_LINKDOWN_BIT     1
+#define _HFI1_EVENT_LID_CHANGE_BIT   2
+#define _HFI1_EVENT_LMC_CHANGE_BIT   3
+#define _HFI1_EVENT_SL2VL_CHANGE_BIT 4
+#define _HFI1_MAX_EVENT_BIT _HFI1_EVENT_SL2VL_CHANGE_BIT
+
+#define HFI1_EVENT_FROZEN                (1UL << _HFI1_EVENT_FROZEN_BIT)
+#define HFI1_EVENT_LINKDOWN_BIT                (1UL << _HFI1_EVENT_LINKDOWN_BIT)
+#define HFI1_EVENT_LID_CHANGE_BIT      (1UL << _HFI1_EVENT_LID_CHANGE_BIT)
+#define HFI1_EVENT_LMC_CHANGE_BIT      (1UL << _HFI1_EVENT_LMC_CHANGE_BIT)
+#define HFI1_EVENT_SL2VL_CHANGE_BIT    (1UL << _HFI1_EVENT_SL2VL_CHANGE_BIT)
+
+/*
+ * These are the status bits readable (in ASCII form, 64bit value)
+ * from the "status" sysfs file.  For binary compatibility, values
+ * must remain as is; removed states can be reused for different
+ * purposes.
+ */
+#define HFI1_STATUS_INITTED       0x1    /* basic initialization done */
+/* Chip has been found and initialized */
+#define HFI1_STATUS_CHIP_PRESENT 0x20
+/* IB link is at ACTIVE, usable for data traffic */
+#define HFI1_STATUS_IB_READY     0x40
+/* link is configured, LID, MTU, etc. have been set */
+#define HFI1_STATUS_IB_CONF      0x80
+/* A Fatal hardware error has occurred. */
+#define HFI1_STATUS_HWERROR     0x200
+
+/*
+ * Number of supported shared contexts.
+ * This is the maximum number of software contexts that can share
+ * a hardware send/receive context.
+ */
+#define HFI1_MAX_SHARED_CTXTS 8
+
+/*
+ * Poll types
+ */
+#define HFI1_POLL_TYPE_ANYRCV     0x0
+#define HFI1_POLL_TYPE_URGENT     0x1
+
+/*
+ * This structure is passed to the driver to tell it where
+ * user code buffers are, sizes, etc.   The offsets and sizes of the
+ * fields must remain unchanged, for binary compatibility.  It can
+ * be extended, if userversion is changed so user code can tell, if needed
+ */
+struct hfi1_user_info {
+       /*
+        * version of user software, to detect compatibility issues.
+        * Should be set to HFI1_USER_SWVERSION.
+        */
+       __u32 userversion;
+       __u16 pad;
+       /* HFI selection algorithm, if unit has not selected */
+       __u16 hfi1_alg;
+       /*
+        * If two or more processes wish to share a context, each process
+        * must set the subcontext_cnt and subcontext_id to the same
+        * values.  The only restriction on the subcontext_id is that
+        * it be unique for a given node.
+        */
+       __u16 subctxt_cnt;
+       __u16 subctxt_id;
+       /* 128bit UUID passed in by PSM. */
+       __u8 uuid[16];
+};
+
+struct hfi1_ctxt_info {
+       __u64 runtime_flags;    /* chip/drv runtime flags (HFI1_CAP_*) */
+       __u32 rcvegr_size;      /* size of each eager buffer */
+       __u16 num_active;       /* number of active units */
+       __u16 unit;             /* unit (chip) assigned to caller */
+       __u16 ctxt;             /* ctxt on unit assigned to caller */
+       __u16 subctxt;          /* subctxt on unit assigned to caller */
+       __u16 rcvtids;          /* number of Rcv TIDs for this context */
+       __u16 credits;          /* number of PIO credits for this context */
+       __u16 numa_node;        /* NUMA node of the assigned device */
+       __u16 rec_cpu;          /* cpu # for affinity (0xffff if none) */
+       __u16 send_ctxt;        /* send context in use by this user context */
+       __u16 egrtids;          /* number of RcvArray entries for Eager Rcvs */
+       __u16 rcvhdrq_cnt;      /* number of RcvHdrQ entries */
+       __u16 rcvhdrq_entsize;  /* size (in bytes) for each RcvHdrQ entry */
+       __u16 sdma_ring_size;   /* number of entries in SDMA request ring */
+};
+
+struct hfi1_tid_info {
+       /* virtual address of first page in transfer */
+       __u64 vaddr;
+       /* pointer to tid array. this array is big enough */
+       __u64 tidlist;
+       /* number of tids programmed by this request */
+       __u32 tidcnt;
+       /* length of transfer buffer programmed by this request */
+       __u32 length;
+       /*
+        * pointer to bitmap of TIDs used for this call;
+        * checked for being large enough at open
+        */
+       __u64 tidmap;
+};
+
+struct hfi1_cmd {
+       __u32 type;        /* command type */
+       __u32 len;         /* length of struct pointed to by add */
+       __u64 addr;        /* pointer to user structure */
+};
+
+enum hfi1_sdma_comp_state {
+       FREE = 0,
+       QUEUED,
+       COMPLETE,
+       ERROR
+};
+
+/*
+ * SDMA completion ring entry
+ */
+struct hfi1_sdma_comp_entry {
+       __u32 status;
+       __u32 errcode;
+};
+
+/*
+ * Device status and notifications from driver to user-space.
+ */
+struct hfi1_status {
+       __u64 dev;      /* device/hw status bits */
+       __u64 port;     /* port state and status bits */
+       char freezemsg[0];
+};
+
+/*
+ * This structure is returned by the driver immediately after
+ * open to get implementation-specific info, and info specific to this
+ * instance.
+ *
+ * This struct must have explicit pad fields where type sizes
+ * may result in different alignments between 32 and 64 bit
+ * programs, since the 64 bit kernel requires the user code
+ * to have matching offsets
+ */
+struct hfi1_base_info {
+       /* version of hardware, for feature checking. */
+       __u32 hw_version;
+       /* version of software, for feature checking. */
+       __u32 sw_version;
+       /* Job key */
+       __u16 jkey;
+       __u16 padding1;
+       /*
+        * The special QP (queue pair) value that identifies PSM
+        * protocol packet from standard IB packets.
+        */
+       __u32 bthqp;
+       /* PIO credit return address, */
+       __u64 sc_credits_addr;
+       /*
+        * Base address of write-only pio buffers for this process.
+        * Each buffer has sendpio_credits*64 bytes.
+        */
+       __u64 pio_bufbase_sop;
+       /*
+        * Base address of write-only pio buffers for this process.
+        * Each buffer has sendpio_credits*64 bytes.
+        */
+       __u64 pio_bufbase;
+       /* address where receive buffer queue is mapped into */
+       __u64 rcvhdr_bufbase;
+       /* base address of Eager receive buffers. */
+       __u64 rcvegr_bufbase;
+       /* base address of SDMA completion ring */
+       __u64 sdma_comp_bufbase;
+       /*
+        * User register base for init code, not to be used directly by
+        * protocol or applications.  Always maps real chip register space.
+        * the register addresses are:
+        * ur_rcvhdrhead, ur_rcvhdrtail, ur_rcvegrhead, ur_rcvegrtail,
+        * ur_rcvtidflow
+        */
+       __u64 user_regbase;
+       /* notification events */
+       __u64 events_bufbase;
+       /* status page */
+       __u64 status_bufbase;
+       /* rcvhdrtail update */
+       __u64 rcvhdrtail_base;
+       /*
+        * shared memory pages for subctxts if ctxt is shared; these cover
+        * all the processes in the group sharing a single context.
+        * all have enough space for the num_subcontexts value on this job.
+        */
+       __u64 subctxt_uregbase;
+       __u64 subctxt_rcvegrbuf;
+       __u64 subctxt_rcvhdrbuf;
+};
+
+enum sdma_req_opcode {
+       EXPECTED = 0,
+       EAGER
+};
+
+#define HFI1_SDMA_REQ_VERSION_MASK 0xF
+#define HFI1_SDMA_REQ_VERSION_SHIFT 0x0
+#define HFI1_SDMA_REQ_OPCODE_MASK 0xF
+#define HFI1_SDMA_REQ_OPCODE_SHIFT 0x4
+#define HFI1_SDMA_REQ_IOVCNT_MASK 0xFF
+#define HFI1_SDMA_REQ_IOVCNT_SHIFT 0x8
+
+struct sdma_req_info {
+       /*
+        * bits 0-3 - version (currently unused)
+        * bits 4-7 - opcode (enum sdma_req_opcode)
+        * bits 8-15 - io vector count
+        */
+       __u16 ctrl;
+       /*
+        * Number of fragments contained in this request.
+        * User-space has already computed how many
+        * fragment-sized packets the user buffer will be
+        * split into.
+        */
+       __u16 npkts;
+       /*
+        * Size of each fragment the user buffer will be
+        * split into.
+        */
+       __u16 fragsize;
+       /*
+        * Index of the slot in the SDMA completion ring
+        * this request should be using. User-space is
+        * in charge of managing its own ring.
+        */
+       __u16 comp_idx;
+} __packed;
+
+/*
+ * SW KDETH header.
+ * swdata is SW defined portion.
+ */
+struct hfi1_kdeth_header {
+       __le32 ver_tid_offset;
+       __le16 jkey;
+       __le16 hcrc;
+       __le32 swdata[7];
+} __packed;
+
+/*
+ * Structure describing the headers that User space uses. The
+ * structure above is a subset of this one.
+ */
+struct hfi1_pkt_header {
+       __le16 pbc[4];
+       __be16 lrh[4];
+       __be32 bth[3];
+       struct hfi1_kdeth_header kdeth;
+} __packed;
+
+
+/*
+ * The list of usermode accessible registers.
+ */
+enum hfi1_ureg {
+       /* (RO)  DMA RcvHdr to be used next. */
+       ur_rcvhdrtail = 0,
+       /* (RW)  RcvHdr entry to be processed next by host. */
+       ur_rcvhdrhead = 1,
+       /* (RO)  Index of next Eager index to use. */
+       ur_rcvegrindextail = 2,
+       /* (RW)  Eager TID to be processed next */
+       ur_rcvegrindexhead = 3,
+       /* (RO)  Receive Eager Offset Tail */
+       ur_rcvegroffsettail = 4,
+       /* For internal use only; max register number. */
+       ur_maxreg,
+       /* (RW)  Receive TID flow table */
+       ur_rcvtidflowtable = 256
+};
+
+#endif /* _LINUX__HFI1_USER_H */
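As a rough illustration of the ctrl bit layout documented in sdma_req_info above, user-space could pack the field along these lines. The helper name and example values are hypothetical; only the HFI1_SDMA_REQ_* masks/shifts and the EAGER opcode come from the header.

#include <linux/types.h>

/* Hypothetical helper: pack version, opcode and iovec count into the
 * sdma_req_info.ctrl word using the masks/shifts defined above. */
static inline __u16 sdma_req_ctrl(__u16 version, __u16 opcode, __u16 iovcnt)
{
	return ((version & HFI1_SDMA_REQ_VERSION_MASK) << HFI1_SDMA_REQ_VERSION_SHIFT) |
	       ((opcode  & HFI1_SDMA_REQ_OPCODE_MASK)  << HFI1_SDMA_REQ_OPCODE_SHIFT)  |
	       ((iovcnt  & HFI1_SDMA_REQ_IOVCNT_MASK)  << HFI1_SDMA_REQ_IOVCNT_SHIFT);
}

/* e.g. an EAGER request carried in 3 iovecs: ctrl = sdma_req_ctrl(0, EAGER, 3) */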
index 6e4bb4270ca2ea1bb9eab5e49ae7049b6061e12b..c19a5dc1531af5df6f58ec8fadb41fe51c7afc3d 100644 (file)
@@ -7,12 +7,14 @@ enum {
        RDMA_NL_RDMA_CM = 1,
        RDMA_NL_NES,
        RDMA_NL_C4IW,
+       RDMA_NL_LS,     /* RDMA Local Services */
        RDMA_NL_NUM_CLIENTS
 };
 
 enum {
        RDMA_NL_GROUP_CM = 1,
        RDMA_NL_GROUP_IWPM,
+       RDMA_NL_GROUP_LS,
        RDMA_NL_NUM_GROUPS
 };
 
@@ -128,5 +130,85 @@ enum {
        IWPM_NLA_ERR_MAX
 };
 
+/*
+ * Local service operations:
+ *   RESOLVE - The client requests the local service to resolve a path.
+ *   SET_TIMEOUT - The local service requests the client to set the timeout.
+ */
+enum {
+       RDMA_NL_LS_OP_RESOLVE = 0,
+       RDMA_NL_LS_OP_SET_TIMEOUT,
+       RDMA_NL_LS_NUM_OPS
+};
+
+/* Local service netlink message flags */
+#define RDMA_NL_LS_F_ERR       0x0100  /* Failed response */
+
+/*
+ * Local service resolve operation family header.
+ * The layout for the resolve operation:
+ *    nlmsg header
+ *    family header
+ *    attributes
+ */
+
+/*
+ * Local service path use:
+ * Specify how the path(s) will be used.
+ *   ALL - For connected CM operation (6 pathrecords)
+ *   UNIDIRECTIONAL - For unidirectional UD (1 pathrecord)
+ *   GMP - For miscellaneous GMP like operation (at least 1 reversible
+ *         pathrecord)
+ */
+enum {
+       LS_RESOLVE_PATH_USE_ALL = 0,
+       LS_RESOLVE_PATH_USE_UNIDIRECTIONAL,
+       LS_RESOLVE_PATH_USE_GMP,
+       LS_RESOLVE_PATH_USE_MAX
+};
+
+#define LS_DEVICE_NAME_MAX 64
+
+struct rdma_ls_resolve_header {
+       __u8 device_name[LS_DEVICE_NAME_MAX];
+       __u8 port_num;
+       __u8 path_use;
+};
+
+/* Local service attribute type */
+#define RDMA_NLA_F_MANDATORY   (1 << 13)
+#define RDMA_NLA_TYPE_MASK     (~(NLA_F_NESTED | NLA_F_NET_BYTEORDER | \
+                                 RDMA_NLA_F_MANDATORY))
+
+/*
+ * Local service attributes:
+ *   Attr Name       Size                       Byte order
+ *   -----------------------------------------------------
+ *   PATH_RECORD     struct ib_path_rec_data
+ *   TIMEOUT         u32                        cpu
+ *   SERVICE_ID      u64                        cpu
+ *   DGID            u8[16]                     BE
+ *   SGID            u8[16]                     BE
+ *   TCLASS          u8
+ *   PKEY            u16                        cpu
+ *   QOS_CLASS       u16                        cpu
+ */
+enum {
+       LS_NLA_TYPE_UNSPEC = 0,
+       LS_NLA_TYPE_PATH_RECORD,
+       LS_NLA_TYPE_TIMEOUT,
+       LS_NLA_TYPE_SERVICE_ID,
+       LS_NLA_TYPE_DGID,
+       LS_NLA_TYPE_SGID,
+       LS_NLA_TYPE_TCLASS,
+       LS_NLA_TYPE_PKEY,
+       LS_NLA_TYPE_QOS_CLASS,
+       LS_NLA_TYPE_MAX
+};
+
+/* Local service DGID/SGID attribute: big endian */
+struct rdma_nla_ls_gid {
+       __u8            gid[16];
+};
 
 #endif /* _UAPI_RDMA_NETLINK_H */
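To make the type/flag split above concrete, a consumer of these local-service messages might decode an attribute header roughly as follows; the nlattr handling is ordinary netlink and is not defined by this header, and the helper is a sketch only.

#include <linux/netlink.h>	/* struct nlattr, NLA_F_NESTED, NLA_F_NET_BYTEORDER */

/* Sketch: separate a local-service attribute's real type (LS_NLA_TYPE_*)
 * from the RDMA_NLA_F_MANDATORY flag defined above. */
static int ls_attr_type(const struct nlattr *nla, int *mandatory)
{
	*mandatory = !!(nla->nla_type & RDMA_NLA_F_MANDATORY);
	return nla->nla_type & RDMA_NLA_TYPE_MASK;	/* e.g. LS_NLA_TYPE_TIMEOUT */
}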
index a85316811d79ad43493d21c1e35d51ed10aa257c..7ddeeda9380971b0327b09fc639eee9ebf82f1f7 100644 (file)
@@ -44,6 +44,10 @@ struct privcmd_hypercall {
 
 struct privcmd_mmap_entry {
        __u64 va;
+       /*
+        * This should be a GFN. It's not possible to change the name because
+        * it's exposed to the user-space.
+        */
        __u64 mfn;
        __u64 npages;
 };
index cac567f22e6232265b96ca361a505cbe46b2dfb1..d334e64c1c193e23d739510bd164d96944112206 100644 (file)
@@ -18,7 +18,7 @@
 #define __linux_video_vga_h__
 
 #include <linux/types.h>
-#include <asm/io.h>
+#include <linux/io.h>
 #include <asm/vga.h>
 #include <asm/byteorder.h>
 
index 7d95fdf9cf3e773f3d800194ca43a2f0a7acdd7a..88da2abaf53592c929260dd9a3c7c0f7fb346482 100644 (file)
@@ -92,7 +92,6 @@ void xen_hvm_callback_vector(void);
 #ifdef CONFIG_TRACING
 #define trace_xen_hvm_callback_vector xen_hvm_callback_vector
 #endif
-extern int xen_have_vector_callback;
 int xen_set_callback_via(uint64_t via);
 void xen_evtchn_do_upcall(struct pt_regs *regs);
 void xen_hvm_evtchn_do_upcall(void);
index 5cc49ea8d8406f1fec09609828e0b47d1c22f196..8e035871360e15e0593a026d151088a8d7482347 100644 (file)
@@ -474,6 +474,23 @@ struct xenpf_core_parking {
 };
 DEFINE_GUEST_HANDLE_STRUCT(xenpf_core_parking);
 
+#define XENPF_get_symbol      63
+struct xenpf_symdata {
+       /* IN/OUT variables */
+       uint32_t        namelen; /* size of 'name' buffer */
+
+       /* IN/OUT variables */
+       uint32_t        symnum; /* IN:  Symbol to read                       */
+                               /* OUT: Next available symbol. If same as IN */
+                               /* then  we reached the end                  */
+
+       /* OUT variables */
+       GUEST_HANDLE(char) name;
+       uint64_t        address;
+       char            type;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xenpf_symdata);
+
 struct xen_platform_op {
        uint32_t cmd;
        uint32_t interface_version; /* XENPF_INTERFACE_VERSION */
@@ -495,6 +512,7 @@ struct xen_platform_op {
                struct xenpf_cpu_hotadd        cpu_add;
                struct xenpf_mem_hotadd        mem_add;
                struct xenpf_core_parking      core_parking;
+               struct xenpf_symdata           symdata;
                uint8_t                        pad[128];
        } u;
 };
index a483789580623596e02e1b44cd06f796cc9b9064..167071c290b3d2afbb8fe949cf668dd19077367e 100644 (file)
@@ -80,6 +80,7 @@
 #define __HYPERVISOR_kexec_op             37
 #define __HYPERVISOR_tmem_op              38
 #define __HYPERVISOR_xc_reserved_op       39 /* reserved for XenClient */
+#define __HYPERVISOR_xenpmu_op            40
 
 /* Architecture-specific hypercall definitions. */
 #define __HYPERVISOR_arch_0               48
 #define VIRQ_MEM_EVENT  10 /* G. (DOM0) A memory event has occurred          */
 #define VIRQ_XC_RESERVED 11 /* G. Reserved for XenClient                     */
 #define VIRQ_ENOMEM     12 /* G. (DOM0) Low on heap memory       */
+#define VIRQ_XENPMU     13  /* PMC interrupt                                 */
 
 /* Architecture-specific VIRQ definitions. */
 #define VIRQ_ARCH_0    16
@@ -585,26 +587,29 @@ struct shared_info {
 };
 
 /*
- * Start-of-day memory layout for the initial domain (DOM0):
+ * Start-of-day memory layout
+ *
  *  1. The domain is started within contiguous virtual-memory region.
  *  2. The contiguous region begins and ends on an aligned 4MB boundary.
- *  3. The region start corresponds to the load address of the OS image.
- *     If the load address is not 4MB aligned then the address is rounded down.
- *  4. This the order of bootstrap elements in the initial virtual region:
+ *  3. This is the order of bootstrap elements in the initial virtual region:
  *      a. relocated kernel image
  *      b. initial ram disk              [mod_start, mod_len]
+ *         (may be omitted)
  *      c. list of allocated page frames [mfn_list, nr_pages]
+ *         (unless relocated due to XEN_ELFNOTE_INIT_P2M)
  *      d. start_info_t structure        [register ESI (x86)]
- *      e. bootstrap page tables         [pt_base, CR3 (x86)]
- *      f. bootstrap stack               [register ESP (x86)]
- *  5. Bootstrap elements are packed together, but each is 4kB-aligned.
- *  6. The initial ram disk may be omitted.
- *  7. The list of page frames forms a contiguous 'pseudo-physical' memory
+ *         in case of dom0 this page contains the console info, too
+ *      e. unless dom0: xenstore ring page
+ *      f. unless dom0: console ring page
+ *      g. bootstrap page tables         [pt_base, CR3 (x86)]
+ *      h. bootstrap stack               [register ESP (x86)]
+ *  4. Bootstrap elements are packed together, but each is 4kB-aligned.
+ *  5. The list of page frames forms a contiguous 'pseudo-physical' memory
  *     layout for the domain. In particular, the bootstrap virtual-memory
  *     region is a 1:1 mapping to the first section of the pseudo-physical map.
- *  8. All bootstrap elements are mapped read-writable for the guest OS. The
+ *  6. All bootstrap elements are mapped read-writable for the guest OS. The
  *     only exception is the bootstrap page table, which is mapped read-only.
- *  9. There is guaranteed to be at least 512kB padding after the final
+ *  7. There is guaranteed to be at least 512kB padding after the final
  *     bootstrap element. If necessary, the bootstrap virtual region is
  *     extended by an extra 4MB to ensure this.
  */
@@ -641,10 +646,12 @@ struct start_info {
 };
 
 /* These flags are passed in the 'flags' field of start_info_t. */
-#define SIF_PRIVILEGED    (1<<0)  /* Is the domain privileged? */
-#define SIF_INITDOMAIN    (1<<1)  /* Is this the initial control domain? */
-#define SIF_MULTIBOOT_MOD (1<<2)  /* Is mod_start a multiboot module? */
-#define SIF_MOD_START_PFN (1<<3)  /* Is mod_start a PFN? */
+#define SIF_PRIVILEGED      (1<<0)  /* Is the domain privileged? */
+#define SIF_INITDOMAIN      (1<<1)  /* Is this the initial control domain? */
+#define SIF_MULTIBOOT_MOD   (1<<2)  /* Is mod_start a multiboot module? */
+#define SIF_MOD_START_PFN   (1<<3)  /* Is mod_start a PFN? */
+#define SIF_VIRT_P2M_4TOOLS (1<<4)  /* Do Xen tools understand a virt. mapped */
+                                   /* P->M making the 3 level tree obsolete? */
 #define SIF_PM_MASK       (0xFF<<8) /* reserve 1 byte for xen-pm options */
 
 /*
diff --git a/include/xen/interface/xenpmu.h b/include/xen/interface/xenpmu.h
new file mode 100644 (file)
index 0000000..139efc9
--- /dev/null
@@ -0,0 +1,94 @@
+#ifndef __XEN_PUBLIC_XENPMU_H__
+#define __XEN_PUBLIC_XENPMU_H__
+
+#include "xen.h"
+
+#define XENPMU_VER_MAJ    0
+#define XENPMU_VER_MIN    1
+
+/*
+ * ` enum neg_errnoval
+ * ` HYPERVISOR_xenpmu_op(enum xenpmu_op cmd, struct xenpmu_params *args);
+ *
+ * @cmd  == XENPMU_* (PMU operation)
+ * @args == struct xenpmu_params
+ */
+/* ` enum xenpmu_op { */
+#define XENPMU_mode_get        0 /* Also used for getting PMU version */
+#define XENPMU_mode_set        1
+#define XENPMU_feature_get     2
+#define XENPMU_feature_set     3
+#define XENPMU_init            4
+#define XENPMU_finish          5
+#define XENPMU_lvtpc_set       6
+#define XENPMU_flush           7
+
+/* ` } */
+
+/* Parameters structure for HYPERVISOR_xenpmu_op call */
+struct xen_pmu_params {
+       /* IN/OUT parameters */
+       struct {
+               uint32_t maj;
+               uint32_t min;
+       } version;
+       uint64_t val;
+
+       /* IN parameters */
+       uint32_t vcpu;
+       uint32_t pad;
+};
+
+/* PMU modes:
+ * - XENPMU_MODE_OFF:   No PMU virtualization
+ * - XENPMU_MODE_SELF:  Guests can profile themselves
+ * - XENPMU_MODE_HV:    Guests can profile themselves, dom0 profiles
+ *                      itself and Xen
+ * - XENPMU_MODE_ALL:   Only dom0 has access to VPMU and it profiles
+ *                      everyone: itself, the hypervisor and the guests.
+ */
+#define XENPMU_MODE_OFF           0
+#define XENPMU_MODE_SELF          (1<<0)
+#define XENPMU_MODE_HV            (1<<1)
+#define XENPMU_MODE_ALL           (1<<2)
+
+/*
+ * PMU features:
+ * - XENPMU_FEATURE_INTEL_BTS: Intel BTS support (ignored on AMD)
+ */
+#define XENPMU_FEATURE_INTEL_BTS  1
+
+/*
+ * Shared PMU data between hypervisor and PV(H) domains.
+ *
+ * The hypervisor fills out this structure during PMU interrupt and sends an
+ * interrupt to appropriate VCPU.
+ * Architecture-independent fields of xen_pmu_data are WO for the hypervisor
+ * and RO for the guest but some fields in xen_pmu_arch can be writable
+ * by both the hypervisor and the guest (see arch-$arch/pmu.h).
+ */
+struct xen_pmu_data {
+       /* Interrupted VCPU */
+       uint32_t vcpu_id;
+
+       /*
+        * Physical processor on which the interrupt occurred. On non-privileged
+        * guests set to vcpu_id;
+        */
+       uint32_t pcpu_id;
+
+       /*
+        * Domain that was interrupted. On non-privileged guests set to
+        * DOMID_SELF.
+        * On privileged guests can be DOMID_SELF, DOMID_XEN, or, when in
+        * XENPMU_MODE_ALL mode, domain ID of another domain.
+        */
+       domid_t  domain_id;
+
+       uint8_t pad[6];
+
+       /* Architecture-specific information */
+       struct xen_pmu_arch pmu;
+};
+
+#endif /* __XEN_PUBLIC_XENPMU_H__ */
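As a sketch of how a guest might drive the new interface, the snippet below fills xen_pmu_params for a XENPMU_mode_set request. The HYPERVISOR_xenpmu_op() wrapper is assumed to be supplied by the arch hypercall code; it is not part of this header.

#include <xen/interface/xenpmu.h>

/* Sketch: ask Xen to let this guest profile itself (XENPMU_MODE_SELF). */
static int xenpmu_enable_self_profiling(void)
{
	struct xen_pmu_params xp = {
		.version.maj = XENPMU_VER_MAJ,
		.version.min = XENPMU_VER_MIN,
		.val         = XENPMU_MODE_SELF,	/* guest profiles itself only */
	};

	/* Assumed arch wrapper around the __HYPERVISOR_xenpmu_op hypercall. */
	return HYPERVISOR_xenpmu_op(XENPMU_mode_set, &xp);
}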
index c5ed20bb3fe96d5f309c5299a55a7ba3b7a140bd..1daae485e3360b3f8b30aea342b85b365230e705 100644 (file)
@@ -3,14 +3,14 @@
 
 #include <asm/xen/page.h>
 
-static inline unsigned long page_to_mfn(struct page *page)
+static inline unsigned long xen_page_to_gfn(struct page *page)
 {
-       return pfn_to_mfn(page_to_pfn(page));
+       return pfn_to_gfn(page_to_pfn(page));
 }
 
 struct xen_memory_region {
-       phys_addr_t start;
-       phys_addr_t size;
+       unsigned long start_pfn;
+       unsigned long n_pfns;
 };
 
 #define XEN_EXTRA_MEM_MAX_REGIONS 128 /* == E820MAX */
index 0ce4f32017ea91e6af37bc860c5633f473da17b3..e4e214a5abd531b0fa1805186ac32a5cb8576253 100644 (file)
@@ -30,7 +30,7 @@ void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order);
 struct vm_area_struct;
 
 /*
- * xen_remap_domain_mfn_array() - map an array of foreign frames
+ * xen_remap_domain_gfn_array() - map an array of foreign frames
  * @vma:     VMA to map the pages into
  * @addr:    Address at which to map the pages
  * @gfn:     Array of GFNs to map
@@ -46,14 +46,14 @@ struct vm_area_struct;
  * Returns the number of successfully mapped frames, or a -ve error
  * code.
  */
-int xen_remap_domain_mfn_array(struct vm_area_struct *vma,
+int xen_remap_domain_gfn_array(struct vm_area_struct *vma,
                               unsigned long addr,
                               xen_pfn_t *gfn, int nr,
                               int *err_ptr, pgprot_t prot,
                               unsigned domid,
                               struct page **pages);
 
-/* xen_remap_domain_mfn_range() - map a range of foreign frames
+/* xen_remap_domain_gfn_range() - map a range of foreign frames
  * @vma:     VMA to map the pages into
  * @addr:    Address at which to map the pages
  * @gfn:     First GFN to map.
@@ -65,12 +65,12 @@ int xen_remap_domain_mfn_array(struct vm_area_struct *vma,
  * Returns the number of successfully mapped frames, or a -ve error
  * code.
  */
-int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
+int xen_remap_domain_gfn_range(struct vm_area_struct *vma,
                               unsigned long addr,
                               xen_pfn_t gfn, int nr,
                               pgprot_t prot, unsigned domid,
                               struct page **pages);
-int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
+int xen_unmap_domain_gfn_range(struct vm_area_struct *vma,
                               int numpgs, struct page **pages);
 int xen_xlate_remap_gfn_array(struct vm_area_struct *vma,
                              unsigned long addr,
index 9cabd866b34b7432ce877538e6c4f50904d4150d..c24b6f767bf0f2a4d8a873388a5fbd59ba01c562 100644 (file)
@@ -1602,6 +1602,18 @@ config PCI_QUIRKS
          bugs/quirks. Disable this only if your target machine is
          unaffected by PCI quirks.
 
+config MEMBARRIER
+       bool "Enable membarrier() system call" if EXPERT
+       default y
+       help
+         Enable the membarrier() system call that allows issuing memory
+         barriers across all running threads, which can be used to distribute
+         the cost of user-space memory barriers asymmetrically by transforming
+         pairs of memory barriers into pairs consisting of membarrier() and a
+         compiler barrier.
+
+         If unsure, say Y.
+
 config EMBEDDED
        bool "Embedded system"
        option allnoconfig_y
@@ -1765,17 +1777,23 @@ config MMAP_ALLOW_UNINITIALIZED
 
          See Documentation/nommu-mmap.txt for more information.
 
-config SYSTEM_TRUSTED_KEYRING
-       bool "Provide system-wide ring of trusted keys"
-       depends on KEYS
+config SYSTEM_DATA_VERIFICATION
+       def_bool n
+       select SYSTEM_TRUSTED_KEYRING
+       select KEYS
+       select CRYPTO
+       select ASYMMETRIC_KEY_TYPE
+       select ASYMMETRIC_PUBLIC_KEY_SUBTYPE
+       select PUBLIC_KEY_ALGO_RSA
+       select ASN1
+       select OID_REGISTRY
+       select X509_CERTIFICATE_PARSER
+       select PKCS7_MESSAGE_PARSER
        help
-         Provide a system keyring to which trusted keys can be added.  Keys in
-         the keyring are considered to be trusted.  Keys may be added at will
-         by the kernel from compiled-in data and from hardware key stores, but
-         userspace may only add extra keys if those keys can be verified by
-         keys already in the keyring.
-
-         Keys in this keyring are used by module signature checking.
+         Provide PKCS#7 message verification using the contents of the system
+         trusted keyring to provide public keys.  This then can be used for
+         module verification, kexec image verification and firmware blob
+         verification.
 
 config PROFILING
        bool "Profiling support"
@@ -1885,20 +1903,16 @@ config MODULE_SRCVERSION_ALL
 config MODULE_SIG
        bool "Module signature verification"
        depends on MODULES
-       select SYSTEM_TRUSTED_KEYRING
-       select KEYS
-       select CRYPTO
-       select ASYMMETRIC_KEY_TYPE
-       select ASYMMETRIC_PUBLIC_KEY_SUBTYPE
-       select PUBLIC_KEY_ALGO_RSA
-       select ASN1
-       select OID_REGISTRY
-       select X509_CERTIFICATE_PARSER
+       select SYSTEM_DATA_VERIFICATION
        help
          Check modules for valid signatures upon load: the signature
          is simply appended to the module. For more information see
          Documentation/module-signing.txt.
 
+         Note that this option adds the OpenSSL development packages as a
+         kernel build dependency so that the signing tool can use its crypto
+         library.
+
          !!!WARNING!!!  If you enable this option, you MUST make sure that the
          module DOES NOT get stripped after being signed.  This includes the
          debuginfo strip done by some packagers (such as rpmbuild) and
index ad1bd7787bbb0c3298e2f9790b0edd5322227639..b32ad7d97ac94f52a0c50acd2a904e8a0c2f888d 100644 (file)
@@ -526,14 +526,14 @@ extern unsigned long __initramfs_size;
 
 static void __init free_initrd(void)
 {
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
        unsigned long crashk_start = (unsigned long)__va(crashk_res.start);
        unsigned long crashk_end   = (unsigned long)__va(crashk_res.end);
 #endif
        if (do_retain_initrd)
                goto skip;
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
        /*
         * If the initrd region is overlapped with crashkernel reserved region,
         * free only memory that is not part of crashkernel region.
index 56506553d4d80dff814b75f45db6db280fd0dea7..9e64d7097f1ad4d5744755c977cac583debbaf38 100644 (file)
@@ -877,7 +877,6 @@ static void __init do_initcalls(void)
 static void __init do_basic_setup(void)
 {
        cpuset_init_smp();
-       usermodehelper_init();
        shmem_init();
        driver_init();
        init_irq_proc();
index 2b491590ebab1f7fac5b6fecaf632a9a5aebada7..71f448e5e927aed0ccd8f5af24a928e82cfe616f 100644 (file)
@@ -123,7 +123,7 @@ struct msg_msg *copy_msg(struct msg_msg *src, struct msg_msg *dst)
        size_t len = src->m_ts;
        size_t alen;
 
-       BUG_ON(dst == NULL);
+       WARN_ON(dst == NULL);
        if (src->m_ts > dst->m_ts)
                return ERR_PTR(-EINVAL);
 
index 4aef24d91b633e12275cea64a380df4543fc796b..222131e8e38f334547004bf0830b26bf808cc6a2 100644 (file)
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -159,7 +159,7 @@ static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id)
         * We raced in the idr lookup or with shm_destroy().  Either way, the
         * ID is busted.
         */
-       BUG_ON(IS_ERR(ipcp));
+       WARN_ON(IS_ERR(ipcp));
 
        return container_of(ipcp, struct shmid_kernel, shm_perm);
 }
@@ -393,7 +393,7 @@ static int shm_mmap(struct file *file, struct vm_area_struct *vma)
                return ret;
        sfd->vm_ops = vma->vm_ops;
 #ifdef CONFIG_MMU
-       BUG_ON(!sfd->vm_ops->fault);
+       WARN_ON(!sfd->vm_ops->fault);
 #endif
        vma->vm_ops = &shm_vm_ops;
        shm_open(vma);
index 718fb8afab7afbd5ca5270f30a78e09797d800f6..53abf008ecb39758e1812f7a593323d65e7fd304 100644 (file)
@@ -45,12 +45,13 @@ ifneq ($(CONFIG_SMP),y)
 obj-y += up.o
 endif
 obj-$(CONFIG_UID16) += uid16.o
-obj-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += system_keyring.o system_certificates.o
 obj-$(CONFIG_MODULES) += module.o
 obj-$(CONFIG_MODULE_SIG) += module_signing.o
 obj-$(CONFIG_KALLSYMS) += kallsyms.o
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
+obj-$(CONFIG_KEXEC_CORE) += kexec_core.o
 obj-$(CONFIG_KEXEC) += kexec.o
+obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
 obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
 obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CGROUPS) += cgroup.o
@@ -65,7 +66,7 @@ obj-$(CONFIG_SMP) += stop_machine.o
 obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
 obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
 obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
-obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o
+obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o audit_fsnotify.o
 obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
 obj-$(CONFIG_GCOV_KERNEL) += gcov/
 obj-$(CONFIG_KPROBES) += kprobes.o
@@ -99,6 +100,9 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
 obj-$(CONFIG_JUMP_LABEL) += jump_label.o
 obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
 obj-$(CONFIG_TORTURE_TEST) += torture.o
+obj-$(CONFIG_MEMBARRIER) += membarrier.o
+
+obj-$(CONFIG_HAS_IOMEM) += memremap.o
 
 $(obj)/configs.o: $(obj)/config_data.h
 
@@ -112,99 +116,3 @@ $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
 targets += config_data.h
 $(obj)/config_data.h: $(obj)/config_data.gz FORCE
        $(call filechk,ikconfiggz)
-
-###############################################################################
-#
-# Roll all the X.509 certificates that we can find together and pull them into
-# the kernel so that they get loaded into the system trusted keyring during
-# boot.
-#
-# We look in the source root and the build root for all files whose name ends
-# in ".x509".  Unfortunately, this will generate duplicate filenames, so we
-# have make canonicalise the pathnames and then sort them to discard the
-# duplicates.
-#
-###############################################################################
-ifeq ($(CONFIG_SYSTEM_TRUSTED_KEYRING),y)
-X509_CERTIFICATES-y := $(wildcard *.x509) $(wildcard $(srctree)/*.x509)
-X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += $(objtree)/signing_key.x509
-X509_CERTIFICATES-raw := $(sort $(foreach CERT,$(X509_CERTIFICATES-y), \
-                               $(or $(realpath $(CERT)),$(CERT))))
-X509_CERTIFICATES := $(subst $(realpath $(objtree))/,,$(X509_CERTIFICATES-raw))
-
-ifeq ($(X509_CERTIFICATES),)
-$(warning *** No X.509 certificates found ***)
-endif
-
-ifneq ($(wildcard $(obj)/.x509.list),)
-ifneq ($(shell cat $(obj)/.x509.list),$(X509_CERTIFICATES))
-$(warning X.509 certificate list changed to "$(X509_CERTIFICATES)" from "$(shell cat $(obj)/.x509.list)")
-$(shell rm $(obj)/.x509.list)
-endif
-endif
-
-kernel/system_certificates.o: $(obj)/x509_certificate_list
-
-quiet_cmd_x509certs  = CERTS   $@
-      cmd_x509certs  = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; $(kecho) "  - Including cert $(X509)")
-
-targets += $(obj)/x509_certificate_list
-$(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list
-       $(call if_changed,x509certs)
-
-targets += $(obj)/.x509.list
-$(obj)/.x509.list:
-       @echo $(X509_CERTIFICATES) >$@
-endif
-
-clean-files := x509_certificate_list .x509.list
-
-ifeq ($(CONFIG_MODULE_SIG),y)
-###############################################################################
-#
-# If module signing is requested, say by allyesconfig, but a key has not been
-# supplied, then one will need to be generated to make sure the build does not
-# fail and that the kernel may be used afterwards.
-#
-###############################################################################
-ifndef CONFIG_MODULE_SIG_HASH
-$(error Could not determine digest type to use from kernel config)
-endif
-
-signing_key.priv signing_key.x509: x509.genkey
-       @echo "###"
-       @echo "### Now generating an X.509 key pair to be used for signing modules."
-       @echo "###"
-       @echo "### If this takes a long time, you might wish to run rngd in the"
-       @echo "### background to keep the supply of entropy topped up.  It"
-       @echo "### needs to be run as root, and uses a hardware random"
-       @echo "### number generator if one is available."
-       @echo "###"
-       openssl req -new -nodes -utf8 -$(CONFIG_MODULE_SIG_HASH) -days 36500 \
-               -batch -x509 -config x509.genkey \
-               -outform DER -out signing_key.x509 \
-               -keyout signing_key.priv 2>&1
-       @echo "###"
-       @echo "### Key pair generated."
-       @echo "###"
-
-x509.genkey:
-       @echo Generating X.509 key generation config
-       @echo  >x509.genkey "[ req ]"
-       @echo >>x509.genkey "default_bits = 4096"
-       @echo >>x509.genkey "distinguished_name = req_distinguished_name"
-       @echo >>x509.genkey "prompt = no"
-       @echo >>x509.genkey "string_mask = utf8only"
-       @echo >>x509.genkey "x509_extensions = myexts"
-       @echo >>x509.genkey
-       @echo >>x509.genkey "[ req_distinguished_name ]"
-       @echo >>x509.genkey "#O = Unspecified company"
-       @echo >>x509.genkey "CN = Build time autogenerated kernel key"
-       @echo >>x509.genkey "#emailAddress = unspecified.user@unspecified.company"
-       @echo >>x509.genkey
-       @echo >>x509.genkey "[ myexts ]"
-       @echo >>x509.genkey "basicConstraints=critical,CA:FALSE"
-       @echo >>x509.genkey "keyUsage=digitalSignature"
-       @echo >>x509.genkey "subjectKeyIdentifier=hash"
-       @echo >>x509.genkey "authorityKeyIdentifier=keyid"
-endif
index f9e6065346db1e826ddf19fff7ef7d6806aca51f..662c007635fb778bef2bca5cc213f8cb23ed364e 100644 (file)
@@ -1761,7 +1761,7 @@ void audit_log_name(struct audit_context *context, struct audit_names *n,
        } else
                audit_log_format(ab, " name=(null)");
 
-       if (n->ino != (unsigned long)-1)
+       if (n->ino != AUDIT_INO_UNSET)
                audit_log_format(ab, " inode=%lu"
                                 " dev=%02x:%02x mode=%#ho"
                                 " ouid=%u ogid=%u rdev=%02x:%02x",
index d641f9bb3ed0a3970b0b717d99ccdc82bb4acd6c..dadf86a0e59e5956d16ef741086c56c1fbf77a6c 100644 (file)
@@ -50,6 +50,7 @@ enum audit_state {
 
 /* Rule lists */
 struct audit_watch;
+struct audit_fsnotify_mark;
 struct audit_tree;
 struct audit_chunk;
 
@@ -252,6 +253,7 @@ struct audit_net {
 extern int selinux_audit_rule_update(void);
 
 extern struct mutex audit_filter_mutex;
+extern int audit_del_rule(struct audit_entry *);
 extern void audit_free_rule_rcu(struct rcu_head *);
 extern struct list_head audit_filter_list[];
 
@@ -269,6 +271,15 @@ extern int audit_add_watch(struct audit_krule *krule, struct list_head **list);
 extern void audit_remove_watch_rule(struct audit_krule *krule);
 extern char *audit_watch_path(struct audit_watch *watch);
 extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev);
+
+extern struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pathname, int len);
+extern char *audit_mark_path(struct audit_fsnotify_mark *mark);
+extern void audit_remove_mark(struct audit_fsnotify_mark *audit_mark);
+extern void audit_remove_mark_rule(struct audit_krule *krule);
+extern int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_t dev);
+extern int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old);
+extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark);
+
 #else
 #define audit_put_watch(w) {}
 #define audit_get_watch(w) {}
@@ -278,6 +289,13 @@ extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev
 #define audit_watch_path(w) ""
 #define audit_watch_compare(w, i, d) 0
 
+#define audit_alloc_mark(k, p, l) (ERR_PTR(-EINVAL))
+#define audit_mark_path(m) ""
+#define audit_remove_mark(m)
+#define audit_remove_mark_rule(k)
+#define audit_mark_compare(m, i, d) 0
+#define audit_exe_compare(t, m) (-EINVAL)
+#define audit_dupe_exe(n, o) (-EINVAL)
 #endif /* CONFIG_AUDIT_WATCH */
 
 #ifdef CONFIG_AUDIT_TREE
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
new file mode 100644 (file)
index 0000000..27c6046
--- /dev/null
@@ -0,0 +1,216 @@
+/* audit_fsnotify.c -- tracking inodes
+ *
+ * Copyright 2003-2009,2014-2015 Red Hat, Inc.
+ * Copyright 2005 Hewlett-Packard Development Company, L.P.
+ * Copyright 2005 IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/audit.h>
+#include <linux/kthread.h>
+#include <linux/mutex.h>
+#include <linux/fs.h>
+#include <linux/fsnotify_backend.h>
+#include <linux/namei.h>
+#include <linux/netlink.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/security.h>
+#include "audit.h"
+
+/*
+ * this mark lives on the parent directory of the inode in question.
+ * but dev, ino, and path are about the child
+ */
+struct audit_fsnotify_mark {
+       dev_t dev;              /* associated superblock device */
+       unsigned long ino;      /* associated inode number */
+       char *path;             /* insertion path */
+       struct fsnotify_mark mark; /* fsnotify mark on the inode */
+       struct audit_krule *rule;
+};
+
+/* fsnotify handle. */
+static struct fsnotify_group *audit_fsnotify_group;
+
+/* fsnotify events we care about. */
+#define AUDIT_FS_EVENTS (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
+                        FS_MOVE_SELF | FS_EVENT_ON_CHILD)
+
+static void audit_fsnotify_mark_free(struct audit_fsnotify_mark *audit_mark)
+{
+       kfree(audit_mark->path);
+       kfree(audit_mark);
+}
+
+static void audit_fsnotify_free_mark(struct fsnotify_mark *mark)
+{
+       struct audit_fsnotify_mark *audit_mark;
+
+       audit_mark = container_of(mark, struct audit_fsnotify_mark, mark);
+       audit_fsnotify_mark_free(audit_mark);
+}
+
+char *audit_mark_path(struct audit_fsnotify_mark *mark)
+{
+       return mark->path;
+}
+
+int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_t dev)
+{
+       if (mark->ino == AUDIT_INO_UNSET)
+               return 0;
+       return (mark->ino == ino) && (mark->dev == dev);
+}
+
+static void audit_update_mark(struct audit_fsnotify_mark *audit_mark,
+                            struct inode *inode)
+{
+       audit_mark->dev = inode ? inode->i_sb->s_dev : AUDIT_DEV_UNSET;
+       audit_mark->ino = inode ? inode->i_ino : AUDIT_INO_UNSET;
+}
+
+struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pathname, int len)
+{
+       struct audit_fsnotify_mark *audit_mark;
+       struct path path;
+       struct dentry *dentry;
+       struct inode *inode;
+       int ret;
+
+       if (pathname[0] != '/' || pathname[len-1] == '/')
+               return ERR_PTR(-EINVAL);
+
+       dentry = kern_path_locked(pathname, &path);
+       if (IS_ERR(dentry))
+               return (void *)dentry; /* returning an error */
+       inode = path.dentry->d_inode;
+       mutex_unlock(&inode->i_mutex);
+
+       audit_mark = kzalloc(sizeof(*audit_mark), GFP_KERNEL);
+       if (unlikely(!audit_mark)) {
+               audit_mark = ERR_PTR(-ENOMEM);
+               goto out;
+       }
+
+       fsnotify_init_mark(&audit_mark->mark, audit_fsnotify_free_mark);
+       audit_mark->mark.mask = AUDIT_FS_EVENTS;
+       audit_mark->path = pathname;
+       audit_update_mark(audit_mark, dentry->d_inode);
+       audit_mark->rule = krule;
+
+       ret = fsnotify_add_mark(&audit_mark->mark, audit_fsnotify_group, inode, NULL, true);
+       if (ret < 0) {
+               audit_fsnotify_mark_free(audit_mark);
+               audit_mark = ERR_PTR(ret);
+       }
+out:
+       dput(dentry);
+       path_put(&path);
+       return audit_mark;
+}
+
+static void audit_mark_log_rule_change(struct audit_fsnotify_mark *audit_mark, char *op)
+{
+       struct audit_buffer *ab;
+       struct audit_krule *rule = audit_mark->rule;
+
+       if (!audit_enabled)
+               return;
+       ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
+       if (unlikely(!ab))
+               return;
+       audit_log_format(ab, "auid=%u ses=%u op=",
+                        from_kuid(&init_user_ns, audit_get_loginuid(current)),
+                        audit_get_sessionid(current));
+       audit_log_string(ab, op);
+       audit_log_format(ab, " path=");
+       audit_log_untrustedstring(ab, audit_mark->path);
+       audit_log_key(ab, rule->filterkey);
+       audit_log_format(ab, " list=%d res=1", rule->listnr);
+       audit_log_end(ab);
+}
+
+void audit_remove_mark(struct audit_fsnotify_mark *audit_mark)
+{
+       fsnotify_destroy_mark(&audit_mark->mark, audit_fsnotify_group);
+       fsnotify_put_mark(&audit_mark->mark);
+}
+
+void audit_remove_mark_rule(struct audit_krule *krule)
+{
+       struct audit_fsnotify_mark *mark = krule->exe;
+
+       audit_remove_mark(mark);
+}
+
+static void audit_autoremove_mark_rule(struct audit_fsnotify_mark *audit_mark)
+{
+       struct audit_krule *rule = audit_mark->rule;
+       struct audit_entry *entry = container_of(rule, struct audit_entry, rule);
+
+       audit_mark_log_rule_change(audit_mark, "autoremove_rule");
+       audit_del_rule(entry);
+}
+
+/* Update mark data in audit rules based on fsnotify events. */
+static int audit_mark_handle_event(struct fsnotify_group *group,
+                                   struct inode *to_tell,
+                                   struct fsnotify_mark *inode_mark,
+                                   struct fsnotify_mark *vfsmount_mark,
+                                   u32 mask, void *data, int data_type,
+                                   const unsigned char *dname, u32 cookie)
+{
+       struct audit_fsnotify_mark *audit_mark;
+       struct inode *inode = NULL;
+
+       audit_mark = container_of(inode_mark, struct audit_fsnotify_mark, mark);
+
+       BUG_ON(group != audit_fsnotify_group);
+
+       switch (data_type) {
+       case (FSNOTIFY_EVENT_PATH):
+               inode = ((struct path *)data)->dentry->d_inode;
+               break;
+       case (FSNOTIFY_EVENT_INODE):
+               inode = (struct inode *)data;
+               break;
+       default:
+               BUG();
+               return 0;
+       };
+
+       if (mask & (FS_CREATE|FS_MOVED_TO|FS_DELETE|FS_MOVED_FROM)) {
+               if (audit_compare_dname_path(dname, audit_mark->path, AUDIT_NAME_FULL))
+                       return 0;
+               audit_update_mark(audit_mark, inode);
+       } else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF))
+               audit_autoremove_mark_rule(audit_mark);
+
+       return 0;
+}
+
+static const struct fsnotify_ops audit_mark_fsnotify_ops = {
+       .handle_event = audit_mark_handle_event,
+};
+
+static int __init audit_fsnotify_init(void)
+{
+       audit_fsnotify_group = fsnotify_alloc_group(&audit_mark_fsnotify_ops);
+       if (IS_ERR(audit_fsnotify_group)) {
+               audit_fsnotify_group = NULL;
+               audit_panic("cannot create audit fsnotify group");
+       }
+       return 0;
+}
+device_initcall(audit_fsnotify_init);
index b0f9877273fc39746fa063849f1bac51487d8606..94ecdabda8e6b31f8f2ba25fc66b992e320d9fcb 100644 (file)
@@ -479,6 +479,8 @@ static void kill_rules(struct audit_tree *tree)
                if (rule->tree) {
                        /* not a half-baked one */
                        audit_tree_log_remove_rule(rule);
+                       if (entry->rule.exe)
+                               audit_remove_mark(entry->rule.exe);
                        rule->tree = NULL;
                        list_del_rcu(&entry->list);
                        list_del(&entry->rule.list);
index 6e30024d9aac19fa7aae230d6f521f36a376ee8c..656c7e93ac0d30d3e42a8f7e0dfc7dd071360d78 100644 (file)
@@ -138,7 +138,7 @@ char *audit_watch_path(struct audit_watch *watch)
 
 int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
 {
-       return (watch->ino != (unsigned long)-1) &&
+       return (watch->ino != AUDIT_INO_UNSET) &&
                (watch->ino == ino) &&
                (watch->dev == dev);
 }
@@ -179,8 +179,8 @@ static struct audit_watch *audit_init_watch(char *path)
        INIT_LIST_HEAD(&watch->rules);
        atomic_set(&watch->count, 1);
        watch->path = path;
-       watch->dev = (dev_t)-1;
-       watch->ino = (unsigned long)-1;
+       watch->dev = AUDIT_DEV_UNSET;
+       watch->ino = AUDIT_INO_UNSET;
 
        return watch;
 }
@@ -203,7 +203,6 @@ int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
        if (IS_ERR(watch))
                return PTR_ERR(watch);
 
-       audit_get_watch(watch);
        krule->watch = watch;
 
        return 0;
@@ -313,6 +312,8 @@ static void audit_update_watch(struct audit_parent *parent,
                                list_replace(&oentry->rule.list,
                                             &nentry->rule.list);
                        }
+                       if (oentry->rule.exe)
+                               audit_remove_mark(oentry->rule.exe);
 
                        audit_watch_log_rule_change(r, owatch, "updated_rules");
 
@@ -343,6 +344,8 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
                list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
                        e = container_of(r, struct audit_entry, rule);
                        audit_watch_log_rule_change(r, w, "remove_rule");
+                       if (e->rule.exe)
+                               audit_remove_mark(e->rule.exe);
                        list_del(&r->rlist);
                        list_del(&r->list);
                        list_del_rcu(&e->list);
@@ -387,19 +390,20 @@ static void audit_add_to_parent(struct audit_krule *krule,
 
                watch_found = 1;
 
-               /* put krule's and initial refs to temporary watch */
-               audit_put_watch(watch);
+               /* put krule's ref to temporary watch */
                audit_put_watch(watch);
 
                audit_get_watch(w);
                krule->watch = watch = w;
+
+               audit_put_parent(parent);
                break;
        }
 
        if (!watch_found) {
-               audit_get_parent(parent);
                watch->parent = parent;
 
+               audit_get_watch(watch);
                list_add(&watch->wlist, &parent->watches);
        }
        list_add(&krule->rlist, &watch->rules);
@@ -437,9 +441,6 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
 
        audit_add_to_parent(krule, parent);
 
-       /* match get in audit_find_parent or audit_init_parent */
-       audit_put_parent(parent);
-
        h = audit_hash_ino((u32)watch->ino);
        *list = &audit_inode_hash[h];
 error:
@@ -496,7 +497,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group,
        if (mask & (FS_CREATE|FS_MOVED_TO) && inode)
                audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0);
        else if (mask & (FS_DELETE|FS_MOVED_FROM))
-               audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
+               audit_update_watch(parent, dname, AUDIT_DEV_UNSET, AUDIT_INO_UNSET, 1);
        else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF))
                audit_remove_parent_watches(parent);
 
@@ -517,3 +518,36 @@ static int __init audit_watch_init(void)
        return 0;
 }
 device_initcall(audit_watch_init);
+
+int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old)
+{
+       struct audit_fsnotify_mark *audit_mark;
+       char *pathname;
+
+       pathname = kstrdup(audit_mark_path(old->exe), GFP_KERNEL);
+       if (!pathname)
+               return -ENOMEM;
+
+       audit_mark = audit_alloc_mark(new, pathname, strlen(pathname));
+       if (IS_ERR(audit_mark)) {
+               kfree(pathname);
+               return PTR_ERR(audit_mark);
+       }
+       new->exe = audit_mark;
+
+       return 0;
+}
+
+int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark)
+{
+       struct file *exe_file;
+       unsigned long ino;
+       dev_t dev;
+
+       rcu_read_lock();
+       exe_file = rcu_dereference(tsk->mm->exe_file);
+       ino = exe_file->f_inode->i_ino;
+       dev = exe_file->f_inode->i_sb->s_dev;
+       rcu_read_unlock();
+       return audit_mark_compare(mark, ino, dev);
+}
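
  [Illustrative aside, not part of the patch] audit_exe_compare() above matches a task's current executable by device and inode number against the fsnotify-marked path. A rough userspace analogue of that comparison, using stat() on /proc/self/exe and a caller-chosen rule path, looks like this sketch (names here are hypothetical):

	#include <stdio.h>
	#include <sys/stat.h>

	/* Illustrative only: mirrors the dev/ino comparison performed by
	 * audit_exe_compare() in-kernel, using plain stat() from userspace. */
	static int exe_matches(const char *rule_path)
	{
		struct stat self, rule;

		if (stat("/proc/self/exe", &self) || stat(rule_path, &rule))
			return 0;
		return self.st_dev == rule.st_dev && self.st_ino == rule.st_ino;
	}

	int main(void)
	{
		printf("matches /bin/cat: %d\n", exe_matches("/bin/cat"));
		return 0;
	}

  Comparing device and inode avoids resolving the running task's executable back to a path at filter time; the fsnotify mark code earlier in this diff refreshes the stored dev/ino whenever the directory entry at the configured path changes.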
index 72e1660a79a3ab0fe96ce1dbd45e4e787d1dbd0b..7714d93edb8505fc6bbd212ad3dbcec9d05b4cdb 100644 (file)
@@ -405,6 +405,12 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
                if (f->val > AUDIT_MAX_FIELD_COMPARE)
                        return -EINVAL;
                break;
+       case AUDIT_EXE:
+               if (f->op != Audit_equal)
+                       return -EINVAL;
+               if (entry->rule.listnr != AUDIT_FILTER_EXIT)
+                       return -EINVAL;
+               break;
        };
        return 0;
 }
@@ -419,6 +425,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
        size_t remain = datasz - sizeof(struct audit_rule_data);
        int i;
        char *str;
+       struct audit_fsnotify_mark *audit_mark;
 
        entry = audit_to_entry_common(data);
        if (IS_ERR(entry))
@@ -539,6 +546,24 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
                        entry->rule.buflen += f->val;
                        entry->rule.filterkey = str;
                        break;
+               case AUDIT_EXE:
+                       if (entry->rule.exe || f->val > PATH_MAX)
+                               goto exit_free;
+                       str = audit_unpack_string(&bufp, &remain, f->val);
+                       if (IS_ERR(str)) {
+                               err = PTR_ERR(str);
+                               goto exit_free;
+                       }
+                       entry->rule.buflen += f->val;
+
+                       audit_mark = audit_alloc_mark(&entry->rule, str, f->val);
+                       if (IS_ERR(audit_mark)) {
+                               kfree(str);
+                               err = PTR_ERR(audit_mark);
+                               goto exit_free;
+                       }
+                       entry->rule.exe = audit_mark;
+                       break;
                }
        }
 
@@ -549,10 +574,10 @@ exit_nofree:
        return entry;
 
 exit_free:
-       if (entry->rule.watch)
-               audit_put_watch(entry->rule.watch); /* matches initial get */
        if (entry->rule.tree)
                audit_put_tree(entry->rule.tree); /* that's the temporary one */
+       if (entry->rule.exe)
+               audit_remove_mark(entry->rule.exe); /* that's the template one */
        audit_free_rule(entry);
        return ERR_PTR(err);
 }
@@ -617,6 +642,10 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
                        data->buflen += data->values[i] =
                                audit_pack_string(&bufp, krule->filterkey);
                        break;
+               case AUDIT_EXE:
+                       data->buflen += data->values[i] =
+                               audit_pack_string(&bufp, audit_mark_path(krule->exe));
+                       break;
                case AUDIT_LOGINUID_SET:
                        if (krule->pflags & AUDIT_LOGINUID_LEGACY && !f->val) {
                                data->fields[i] = AUDIT_LOGINUID;
@@ -680,6 +709,12 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
                        if (strcmp(a->filterkey, b->filterkey))
                                return 1;
                        break;
+               case AUDIT_EXE:
+                       /* both paths exist based on above type compare */
+                       if (strcmp(audit_mark_path(a->exe),
+                                  audit_mark_path(b->exe)))
+                               return 1;
+                       break;
                case AUDIT_UID:
                case AUDIT_EUID:
                case AUDIT_SUID:
@@ -801,8 +836,14 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old)
                                err = -ENOMEM;
                        else
                                new->filterkey = fk;
+                       break;
+               case AUDIT_EXE:
+                       err = audit_dupe_exe(new, old);
+                       break;
                }
                if (err) {
+                       if (new->exe)
+                               audit_remove_mark(new->exe);
                        audit_free_rule(entry);
                        return ERR_PTR(err);
                }
@@ -863,7 +904,7 @@ static inline int audit_add_rule(struct audit_entry *entry)
        struct audit_watch *watch = entry->rule.watch;
        struct audit_tree *tree = entry->rule.tree;
        struct list_head *list;
-       int err;
+       int err = 0;
 #ifdef CONFIG_AUDITSYSCALL
        int dont_count = 0;
 
@@ -881,7 +922,7 @@ static inline int audit_add_rule(struct audit_entry *entry)
                /* normally audit_add_tree_rule() will free it on failure */
                if (tree)
                        audit_put_tree(tree);
-               goto error;
+               return err;
        }
 
        if (watch) {
@@ -895,14 +936,14 @@ static inline int audit_add_rule(struct audit_entry *entry)
                         */
                        if (tree)
                                audit_put_tree(tree);
-                       goto error;
+                       return err;
                }
        }
        if (tree) {
                err = audit_add_tree_rule(&entry->rule);
                if (err) {
                        mutex_unlock(&audit_filter_mutex);
-                       goto error;
+                       return err;
                }
        }
 
@@ -933,19 +974,13 @@ static inline int audit_add_rule(struct audit_entry *entry)
 #endif
        mutex_unlock(&audit_filter_mutex);
 
-       return 0;
-
-error:
-       if (watch)
-               audit_put_watch(watch); /* tmp watch, matches initial get */
        return err;
 }
 
 /* Remove an existing rule from filterlist. */
-static inline int audit_del_rule(struct audit_entry *entry)
+int audit_del_rule(struct audit_entry *entry)
 {
        struct audit_entry  *e;
-       struct audit_watch *watch = entry->rule.watch;
        struct audit_tree *tree = entry->rule.tree;
        struct list_head *list;
        int ret = 0;
@@ -961,7 +996,6 @@ static inline int audit_del_rule(struct audit_entry *entry)
        mutex_lock(&audit_filter_mutex);
        e = audit_find_rule(entry, &list);
        if (!e) {
-               mutex_unlock(&audit_filter_mutex);
                ret = -ENOENT;
                goto out;
        }
@@ -972,9 +1006,8 @@ static inline int audit_del_rule(struct audit_entry *entry)
        if (e->rule.tree)
                audit_remove_tree_rule(&e->rule);
 
-       list_del_rcu(&e->list);
-       list_del(&e->rule.list);
-       call_rcu(&e->rcu, audit_free_rule_rcu);
+       if (e->rule.exe)
+               audit_remove_mark_rule(&e->rule);
 
 #ifdef CONFIG_AUDITSYSCALL
        if (!dont_count)
@@ -983,11 +1016,14 @@ static inline int audit_del_rule(struct audit_entry *entry)
        if (!audit_match_signal(entry))
                audit_signals--;
 #endif
-       mutex_unlock(&audit_filter_mutex);
+
+       list_del_rcu(&e->list);
+       list_del(&e->rule.list);
+       call_rcu(&e->rcu, audit_free_rule_rcu);
 
 out:
-       if (watch)
-               audit_put_watch(watch); /* match initial get */
+       mutex_unlock(&audit_filter_mutex);
+
        if (tree)
                audit_put_tree(tree);   /* that's the temporary one */
 
@@ -1077,8 +1113,11 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data,
                WARN_ON(1);
        }
 
-       if (err || type == AUDIT_DEL_RULE)
+       if (err || type == AUDIT_DEL_RULE) {
+               if (entry->rule.exe)
+                       audit_remove_mark(entry->rule.exe);
                audit_free_rule(entry);
+       }
 
        return err;
 }
@@ -1370,6 +1409,8 @@ static int update_lsm_rule(struct audit_krule *r)
                return 0;
 
        nentry = audit_dupe_rule(r);
+       if (entry->rule.exe)
+               audit_remove_mark(entry->rule.exe);
        if (IS_ERR(nentry)) {
                /* save the first error encountered for the
                 * return value */
index e85bdfd15fedd4c8fed46818edae6191673bff07..b86cc04959dee64ac1972a28fd35acddae85e94b 100644 (file)
@@ -180,7 +180,7 @@ static int audit_match_filetype(struct audit_context *ctx, int val)
                return 0;
 
        list_for_each_entry(n, &ctx->names_list, list) {
-               if ((n->ino != -1) &&
+               if ((n->ino != AUDIT_INO_UNSET) &&
                    ((n->mode & S_IFMT) == mode))
                        return 1;
        }
@@ -466,6 +466,9 @@ static int audit_filter_rules(struct task_struct *tsk,
                                result = audit_comparator(ctx->ppid, f->op, f->val);
                        }
                        break;
+               case AUDIT_EXE:
+                       result = audit_exe_compare(tsk, rule->exe);
+                       break;
                case AUDIT_UID:
                        result = audit_uid_comparator(cred->uid, f->op, f->uid);
                        break;
@@ -1680,7 +1683,7 @@ static struct audit_names *audit_alloc_name(struct audit_context *context,
                aname->should_free = true;
        }
 
-       aname->ino = (unsigned long)-1;
+       aname->ino = AUDIT_INO_UNSET;
        aname->type = type;
        list_add_tail(&aname->list, &context->names_list);
 
@@ -1922,7 +1925,7 @@ void __audit_inode_child(const struct inode *parent,
        if (inode)
                audit_copy_inode(found_child, dentry, inode);
        else
-               found_child->ino = (unsigned long)-1;
+               found_child->ino = AUDIT_INO_UNSET;
 }
 EXPORT_SYMBOL_GPL(__audit_inode_child);
 
index dc9b464fefa954a50c83e632c4ebea2ec464839e..35bac8e8b071ae5aa57e3837f4363d8dcd741158 100644 (file)
@@ -155,14 +155,15 @@ static int map_lookup_elem(union bpf_attr *attr)
        void __user *ukey = u64_to_ptr(attr->key);
        void __user *uvalue = u64_to_ptr(attr->value);
        int ufd = attr->map_fd;
-       struct fd f = fdget(ufd);
        struct bpf_map *map;
        void *key, *value, *ptr;
+       struct fd f;
        int err;
 
        if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
                return -EINVAL;
 
+       f = fdget(ufd);
        map = bpf_map_get(f);
        if (IS_ERR(map))
                return PTR_ERR(map);
@@ -213,14 +214,15 @@ static int map_update_elem(union bpf_attr *attr)
        void __user *ukey = u64_to_ptr(attr->key);
        void __user *uvalue = u64_to_ptr(attr->value);
        int ufd = attr->map_fd;
-       struct fd f = fdget(ufd);
        struct bpf_map *map;
        void *key, *value;
+       struct fd f;
        int err;
 
        if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
                return -EINVAL;
 
+       f = fdget(ufd);
        map = bpf_map_get(f);
        if (IS_ERR(map))
                return PTR_ERR(map);
@@ -265,14 +267,15 @@ static int map_delete_elem(union bpf_attr *attr)
 {
        void __user *ukey = u64_to_ptr(attr->key);
        int ufd = attr->map_fd;
-       struct fd f = fdget(ufd);
        struct bpf_map *map;
+       struct fd f;
        void *key;
        int err;
 
        if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
                return -EINVAL;
 
+       f = fdget(ufd);
        map = bpf_map_get(f);
        if (IS_ERR(map))
                return PTR_ERR(map);
@@ -305,14 +308,15 @@ static int map_get_next_key(union bpf_attr *attr)
        void __user *ukey = u64_to_ptr(attr->key);
        void __user *unext_key = u64_to_ptr(attr->next_key);
        int ufd = attr->map_fd;
-       struct fd f = fdget(ufd);
        struct bpf_map *map;
        void *key, *next_key;
+       struct fd f;
        int err;
 
        if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
                return -EINVAL;
 
+       f = fdget(ufd);
        map = bpf_map_get(f);
        if (IS_ERR(map))
                return PTR_ERR(map);
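
  [Illustrative aside, not part of the patch] In all four hunks above, fdget() now runs only after CHECK_ATTR() has validated the attributes, so the early -EINVAL return no longer exits while still holding the reference just taken from the file table. A minimal sketch of that acquire-after-validate ordering, with a hypothetical helper name:

	/* Illustrative only: take the fd reference only after the cheap
	 * validation that can fail, so the error path needs no fdput(). */
	static int example_map_op(union bpf_attr *attr, int ufd)
	{
		struct fd f;
		int err;

		if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))	/* may fail: nothing to undo */
			return -EINVAL;

		f = fdget(ufd);				/* reference held from here on */
		if (!f.file)
			return -EBADF;

		err = 0;
		/* ... use f.file ... */

		fdput(f);				/* released on every later path */
		return err;
	}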
index ed12e385fb75997f559e345989b178bc4c709e22..b074b23000d6e95792e7fcefe0cd29e5a649a6ea 100644 (file)
@@ -283,7 +283,7 @@ static const char *const bpf_class_string[] = {
        [BPF_ALU64] = "alu64",
 };
 
-static const char *const bpf_alu_string[] = {
+static const char *const bpf_alu_string[16] = {
        [BPF_ADD >> 4]  = "+=",
        [BPF_SUB >> 4]  = "-=",
        [BPF_MUL >> 4]  = "*=",
@@ -307,7 +307,7 @@ static const char *const bpf_ldst_string[] = {
        [BPF_DW >> 3] = "u64",
 };
 
-static const char *const bpf_jmp_string[] = {
+static const char *const bpf_jmp_string[16] = {
        [BPF_JA >> 4]   = "jmp",
        [BPF_JEQ >> 4]  = "==",
        [BPF_JGT >> 4]  = ">",
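
  [Illustrative aside, not part of the patch] Giving the opcode-name tables an explicit size of 16 means every value of the 4-bit opcode nibble indexes inside the array, and entries without a designated initializer read as NULL rather than whatever lies past the end. A small sketch of how such a table can then be consumed defensively (hypothetical helper, not the verifier's actual printing code):

	/* Illustrative only: with bpf_alu_string[16], opcode >> 4 is always a
	 * valid index for a u8, and unknown opcodes show up as NULL instead of
	 * stray memory, so the caller can print a fallback. */
	static const char *example_alu_op_name(u8 opcode)
	{
		const char *name = bpf_alu_string[opcode >> 4];	/* 0..15 */

		return name ? name : "unknown alu op";
	}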
index a8538e4437842d9cc85027acc516ed9a680d06cf..2cf0f79f1fc9014cffce5ed79969bbcdaa3b9f90 100644 (file)
@@ -1342,7 +1342,7 @@ static int cgroup_show_options(struct seq_file *seq,
        if (root != &cgrp_dfl_root)
                for_each_subsys(ss, ssid)
                        if (root->subsys_mask & (1 << ssid))
-                               seq_show_option(seq, ss->name, NULL);
+                               seq_show_option(seq, ss->legacy_name, NULL);
        if (root->flags & CGRP_ROOT_NOPREFIX)
                seq_puts(seq, ",noprefix");
        if (root->flags & CGRP_ROOT_XATTR)
index 9656a3c36503dee343813149bbf1153bb6aea05a..009cc9a17d95d601e4f6bf38f13d0f810ebbdb34 100644 (file)
@@ -180,7 +180,7 @@ EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
  * low power state that may have caused some blocks in the same power domain
  * to reset.
  *
- * Must be called after cpu_pm_exit has been called on all cpus in the power
+ * Must be called after cpu_cluster_pm_enter has been called for the power
  * domain, and before cpu_pm_exit has been called on any cpu in the power
  * domain. Notified drivers can include VFP co-processor, interrupt controller
  * and its PM extensions, local CPU timers context save/restore which
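
  [Illustrative aside, not part of the patch] A minimal sketch of the ordering the corrected comment describes, for a hypothetical platform idle path: cpu_cluster_pm_exit() sits after cpu_cluster_pm_enter() and before the per-CPU cpu_pm_exit(). Names and structure below are illustrative only:

	#include <linux/cpu_pm.h>

	/* Illustrative last-man idle sequence: the per-CPU notifiers bracket
	 * the cluster notifiers, matching the ordering documented above. */
	static void example_cluster_idle(bool last_cpu_in_cluster)
	{
		bool cluster_down = false;

		if (cpu_pm_enter())
			return;			/* a notifier vetoed low power entry */

		if (last_cpu_in_cluster && !cpu_cluster_pm_enter())
			cluster_down = true;

		/* ... enter and leave the low power state ... */

		if (cluster_down)
			cpu_cluster_pm_exit();	/* after _enter, before cpu_pm_exit */

		cpu_pm_exit();
	}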
index ec1c07667ec1d5daec2820d6b30cda2bf99818c4..71179a09c1d6a3240fd9c1a4ead58d89646da24e 100644 (file)
 #include <linux/cn_proc.h>
 
 #if 0
-#define kdebug(FMT, ...) \
-       printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
+#define kdebug(FMT, ...)                                               \
+       printk("[%-5.5s%5u] " FMT "\n",                                 \
+              current->comm, current->pid, ##__VA_ARGS__)
 #else
-#define kdebug(FMT, ...) \
-       no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
+#define kdebug(FMT, ...)                                               \
+do {                                                                   \
+       if (0)                                                          \
+               no_printk("[%-5.5s%5u] " FMT "\n",                      \
+                         current->comm, current->pid, ##__VA_ARGS__);  \
+} while (0)
 #endif
 
 static struct kmem_cache *cred_jar;
index e8183895691c61f021e9dc9a7e0aab5f6cc709fc..f548f69c4299dd1ee44bfdc1f84d79d655d0d6d7 100644 (file)
@@ -9094,7 +9094,7 @@ static void perf_event_init_cpu(int cpu)
        mutex_unlock(&swhash->hlist_mutex);
 }
 
-#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
+#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
 static void __perf_event_exit_context(void *__info)
 {
        struct remove_event re = { .detach_group = true };
index c98f926277a8de676335eba7507eebc0513e93a9..e820ccee984673e77b23b09ff42a0db254aba762 100644 (file)
@@ -18,7 +18,6 @@
 #include <linux/ftrace.h>
 #include <linux/memory.h>
 #include <linux/module.h>
-#include <linux/ftrace.h>
 #include <linux/mutex.h>
 #include <linux/init.h>
 
index a785c1015e25bf1ecacd3a6d92956e3e630e7f37..4c5edc357923a1b6198c9f8122b90b73b9a5e38f 100644 (file)
 /*
- * kexec.c - kexec system call
+ * kexec.c - kexec_load system call
  * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
  *
  * This source code is licensed under the GNU General Public License,
  * Version 2.  See the file COPYING for more details.
  */
 
-#define pr_fmt(fmt)    "kexec: " fmt
-
 #include <linux/capability.h>
 #include <linux/mm.h>
 #include <linux/file.h>
-#include <linux/slab.h>
-#include <linux/fs.h>
 #include <linux/kexec.h>
 #include <linux/mutex.h>
 #include <linux/list.h>
-#include <linux/highmem.h>
 #include <linux/syscalls.h>
-#include <linux/reboot.h>
-#include <linux/ioport.h>
-#include <linux/hardirq.h>
-#include <linux/elf.h>
-#include <linux/elfcore.h>
-#include <linux/utsname.h>
-#include <linux/numa.h>
-#include <linux/suspend.h>
-#include <linux/device.h>
-#include <linux/freezer.h>
-#include <linux/pm.h>
-#include <linux/cpu.h>
-#include <linux/console.h>
 #include <linux/vmalloc.h>
-#include <linux/swap.h>
-#include <linux/syscore_ops.h>
-#include <linux/compiler.h>
-#include <linux/hugetlb.h>
-
-#include <asm/page.h>
-#include <asm/uaccess.h>
-#include <asm/io.h>
-#include <asm/sections.h>
-
-#include <crypto/hash.h>
-#include <crypto/sha.h>
-
-/* Per cpu memory for storing cpu states in case of system crash. */
-note_buf_t __percpu *crash_notes;
-
-/* vmcoreinfo stuff */
-static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
-u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
-size_t vmcoreinfo_size;
-size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
-
-/* Flag to indicate we are going to kexec a new kernel */
-bool kexec_in_progress = false;
-
-/*
- * Declare these symbols weak so that if architecture provides a purgatory,
- * these will be overridden.
- */
-char __weak kexec_purgatory[0];
-size_t __weak kexec_purgatory_size = 0;
-
-#ifdef CONFIG_KEXEC_FILE
-static int kexec_calculate_store_digests(struct kimage *image);
-#endif
-
-/* Location of the reserved area for the crash kernel */
-struct resource crashk_res = {
-       .name  = "Crash kernel",
-       .start = 0,
-       .end   = 0,
-       .flags = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-struct resource crashk_low_res = {
-       .name  = "Crash kernel",
-       .start = 0,
-       .end   = 0,
-       .flags = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
-int kexec_should_crash(struct task_struct *p)
-{
-       /*
-        * If crash_kexec_post_notifiers is enabled, don't run
-        * crash_kexec() here yet, which must be run after panic
-        * notifiers in panic().
-        */
-       if (crash_kexec_post_notifiers)
-               return 0;
-       /*
-        * There are 4 panic() calls in do_exit() path, each of which
-        * corresponds to each of these 4 conditions.
-        */
-       if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
-               return 1;
-       return 0;
-}
-
-/*
- * When kexec transitions to the new kernel there is a one-to-one
- * mapping between physical and virtual addresses.  On processors
- * where you can disable the MMU this is trivial, and easy.  For
- * others it is still a simple predictable page table to setup.
- *
- * In that environment kexec copies the new kernel to its final
- * resting place.  This means I can only support memory whose
- * physical address can fit in an unsigned long.  In particular
- * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
- * If the assembly stub has more restrictive requirements
- * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
- * defined more restrictively in <asm/kexec.h>.
- *
- * The code for the transition from the current kernel to the
- * the new kernel is placed in the control_code_buffer, whose size
- * is given by KEXEC_CONTROL_PAGE_SIZE.  In the best case only a single
- * page of memory is necessary, but some architectures require more.
- * Because this memory must be identity mapped in the transition from
- * virtual to physical addresses it must live in the range
- * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
- * modifiable.
- *
- * The assembly stub in the control code buffer is passed a linked list
- * of descriptor pages detailing the source pages of the new kernel,
- * and the destination addresses of those source pages.  As this data
- * structure is not used in the context of the current OS, it must
- * be self-contained.
- *
- * The code has been made to work with highmem pages and will use a
- * destination page in its final resting place (if it happens
- * to allocate it).  The end product of this is that most of the
- * physical address space, and most of RAM can be used.
- *
- * Future directions include:
- *  - allocating a page table with the control code buffer identity
- *    mapped, to simplify machine_kexec and make kexec_on_panic more
- *    reliable.
- */
-
-/*
- * KIMAGE_NO_DEST is an impossible destination address..., for
- * allocating pages whose destination address we do not care about.
- */
-#define KIMAGE_NO_DEST (-1UL)
+#include <linux/slab.h>
 
-static int kimage_is_destination_range(struct kimage *image,
-                                      unsigned long start, unsigned long end);
-static struct page *kimage_alloc_page(struct kimage *image,
-                                      gfp_t gfp_mask,
-                                      unsigned long dest);
+#include "kexec_internal.h"
 
 static int copy_user_segment_list(struct kimage *image,
                                  unsigned long nr_segments,
@@ -169,125 +35,6 @@ static int copy_user_segment_list(struct kimage *image,
        return ret;
 }
 
-static int sanity_check_segment_list(struct kimage *image)
-{
-       int result, i;
-       unsigned long nr_segments = image->nr_segments;
-
-       /*
-        * Verify we have good destination addresses.  The caller is
-        * responsible for making certain we don't attempt to load
-        * the new image into invalid or reserved areas of RAM.  This
-        * just verifies it is an address we can use.
-        *
-        * Since the kernel does everything in page size chunks ensure
-        * the destination addresses are page aligned.  Too many
-        * special cases crop of when we don't do this.  The most
-        * insidious is getting overlapping destination addresses
-        * simply because addresses are changed to page size
-        * granularity.
-        */
-       result = -EADDRNOTAVAIL;
-       for (i = 0; i < nr_segments; i++) {
-               unsigned long mstart, mend;
-
-               mstart = image->segment[i].mem;
-               mend   = mstart + image->segment[i].memsz;
-               if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
-                       return result;
-               if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
-                       return result;
-       }
-
-       /* Verify our destination addresses do not overlap.
-        * If we alloed overlapping destination addresses
-        * through very weird things can happen with no
-        * easy explanation as one segment stops on another.
-        */
-       result = -EINVAL;
-       for (i = 0; i < nr_segments; i++) {
-               unsigned long mstart, mend;
-               unsigned long j;
-
-               mstart = image->segment[i].mem;
-               mend   = mstart + image->segment[i].memsz;
-               for (j = 0; j < i; j++) {
-                       unsigned long pstart, pend;
-                       pstart = image->segment[j].mem;
-                       pend   = pstart + image->segment[j].memsz;
-                       /* Do the segments overlap ? */
-                       if ((mend > pstart) && (mstart < pend))
-                               return result;
-               }
-       }
-
-       /* Ensure our buffer sizes are strictly less than
-        * our memory sizes.  This should always be the case,
-        * and it is easier to check up front than to be surprised
-        * later on.
-        */
-       result = -EINVAL;
-       for (i = 0; i < nr_segments; i++) {
-               if (image->segment[i].bufsz > image->segment[i].memsz)
-                       return result;
-       }
-
-       /*
-        * Verify we have good destination addresses.  Normally
-        * the caller is responsible for making certain we don't
-        * attempt to load the new image into invalid or reserved
-        * areas of RAM.  But crash kernels are preloaded into a
-        * reserved area of ram.  We must ensure the addresses
-        * are in the reserved area otherwise preloading the
-        * kernel could corrupt things.
-        */
-
-       if (image->type == KEXEC_TYPE_CRASH) {
-               result = -EADDRNOTAVAIL;
-               for (i = 0; i < nr_segments; i++) {
-                       unsigned long mstart, mend;
-
-                       mstart = image->segment[i].mem;
-                       mend = mstart + image->segment[i].memsz - 1;
-                       /* Ensure we are within the crash kernel limits */
-                       if ((mstart < crashk_res.start) ||
-                           (mend > crashk_res.end))
-                               return result;
-               }
-       }
-
-       return 0;
-}
-
-static struct kimage *do_kimage_alloc_init(void)
-{
-       struct kimage *image;
-
-       /* Allocate a controlling structure */
-       image = kzalloc(sizeof(*image), GFP_KERNEL);
-       if (!image)
-               return NULL;
-
-       image->head = 0;
-       image->entry = &image->head;
-       image->last_entry = &image->head;
-       image->control_page = ~0; /* By default this does not apply */
-       image->type = KEXEC_TYPE_DEFAULT;
-
-       /* Initialize the list of control pages */
-       INIT_LIST_HEAD(&image->control_pages);
-
-       /* Initialize the list of destination pages */
-       INIT_LIST_HEAD(&image->dest_pages);
-
-       /* Initialize the list of unusable pages */
-       INIT_LIST_HEAD(&image->unusable_pages);
-
-       return image;
-}
-
-static void kimage_free_page_list(struct list_head *list);
-
 static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
                             unsigned long nr_segments,
                             struct kexec_segment __user *segments,
@@ -354,2427 +101,155 @@ out_free_image:
        return ret;
 }
 
-#ifdef CONFIG_KEXEC_FILE
-static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len)
-{
-       struct fd f = fdget(fd);
-       int ret;
-       struct kstat stat;
-       loff_t pos;
-       ssize_t bytes = 0;
-
-       if (!f.file)
-               return -EBADF;
-
-       ret = vfs_getattr(&f.file->f_path, &stat);
-       if (ret)
-               goto out;
-
-       if (stat.size > INT_MAX) {
-               ret = -EFBIG;
-               goto out;
-       }
-
-       /* Don't hand 0 to vmalloc, it whines. */
-       if (stat.size == 0) {
-               ret = -EINVAL;
-               goto out;
-       }
-
-       *buf = vmalloc(stat.size);
-       if (!*buf) {
-               ret = -ENOMEM;
-               goto out;
-       }
-
-       pos = 0;
-       while (pos < stat.size) {
-               bytes = kernel_read(f.file, pos, (char *)(*buf) + pos,
-                                   stat.size - pos);
-               if (bytes < 0) {
-                       vfree(*buf);
-                       ret = bytes;
-                       goto out;
-               }
-
-               if (bytes == 0)
-                       break;
-               pos += bytes;
-       }
-
-       if (pos != stat.size) {
-               ret = -EBADF;
-               vfree(*buf);
-               goto out;
-       }
-
-       *buf_len = pos;
-out:
-       fdput(f);
-       return ret;
-}
-
-/* Architectures can provide this probe function */
-int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
-                                        unsigned long buf_len)
-{
-       return -ENOEXEC;
-}
-
-void * __weak arch_kexec_kernel_image_load(struct kimage *image)
-{
-       return ERR_PTR(-ENOEXEC);
-}
-
-void __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
-{
-}
-
-int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
-                                       unsigned long buf_len)
-{
-       return -EKEYREJECTED;
-}
-
-/* Apply relocations of type RELA */
-int __weak
-arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
-                                unsigned int relsec)
-{
-       pr_err("RELA relocation unsupported.\n");
-       return -ENOEXEC;
-}
-
-/* Apply relocations of type REL */
-int __weak
-arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
-                            unsigned int relsec)
-{
-       pr_err("REL relocation unsupported.\n");
-       return -ENOEXEC;
-}
-
 /*
- * Free up memory used by kernel, initrd, and command line. This is temporary
- * memory allocation which is not needed any more after these buffers have
- * been loaded into separate segments and have been copied elsewhere.
+ * Exec Kernel system call: for obvious reasons only root may call it.
+ *
+ * This call breaks up into three pieces.
+ * - A generic part which loads the new kernel from the current
+ *   address space, and very carefully places the data in the
+ *   allocated pages.
+ *
+ * - A generic part that interacts with the kernel and tells all of
+ *   the devices to shut down.  Preventing on-going dmas, and placing
+ *   the devices in a consistent state so a later kernel can
+ *   reinitialize them.
+ *
+ * - A machine specific part that includes the syscall number
+ *   and then copies the image to it's final destination.  And
+ *   jumps into the image at entry.
+ *
+ * kexec does not sync, or unmount filesystems so if you need
+ * that to happen you need to do that yourself.
  */
-static void kimage_file_post_load_cleanup(struct kimage *image)
-{
-       struct purgatory_info *pi = &image->purgatory_info;
-
-       vfree(image->kernel_buf);
-       image->kernel_buf = NULL;
 
-       vfree(image->initrd_buf);
-       image->initrd_buf = NULL;
-
-       kfree(image->cmdline_buf);
-       image->cmdline_buf = NULL;
+SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
+               struct kexec_segment __user *, segments, unsigned long, flags)
+{
+       struct kimage **dest_image, *image;
+       int result;
 
-       vfree(pi->purgatory_buf);
-       pi->purgatory_buf = NULL;
+       /* We only trust the superuser with rebooting the system. */
+       if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
+               return -EPERM;
 
-       vfree(pi->sechdrs);
-       pi->sechdrs = NULL;
+       /*
+        * Verify we have a legal set of flags
+        * This leaves us room for future extensions.
+        */
+       if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
+               return -EINVAL;
 
-       /* See if architecture has anything to cleanup post load */
-       arch_kimage_file_post_load_cleanup(image);
+       /* Verify we are on the appropriate architecture */
+       if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
+               ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
+               return -EINVAL;
 
-       /*
-        * Above call should have called into bootloader to free up
-        * any data stored in kimage->image_loader_data. It should
-        * be ok now to free it up.
+       /* Put an artificial cap on the number
+        * of segments passed to kexec_load.
         */
-       kfree(image->image_loader_data);
-       image->image_loader_data = NULL;
-}
+       if (nr_segments > KEXEC_SEGMENT_MAX)
+               return -EINVAL;
 
-/*
- * In file mode list of segments is prepared by kernel. Copy relevant
- * data from user space, do error checking, prepare segment list
- */
-static int
-kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
-                            const char __user *cmdline_ptr,
-                            unsigned long cmdline_len, unsigned flags)
-{
-       int ret = 0;
-       void *ldata;
+       image = NULL;
+       result = 0;
 
-       ret = copy_file_from_fd(kernel_fd, &image->kernel_buf,
-                               &image->kernel_buf_len);
-       if (ret)
-               return ret;
+       /* Because we write directly to the reserved memory
+        * region when loading crash kernels we need a mutex here to
+        * prevent multiple crash  kernels from attempting to load
+        * simultaneously, and to prevent a crash kernel from loading
+        * over the top of a in use crash kernel.
+        *
+        * KISS: always take the mutex.
+        */
+       if (!mutex_trylock(&kexec_mutex))
+               return -EBUSY;
 
-       /* Call arch image probe handlers */
-       ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
-                                           image->kernel_buf_len);
+       dest_image = &kexec_image;
+       if (flags & KEXEC_ON_CRASH)
+               dest_image = &kexec_crash_image;
+       if (nr_segments > 0) {
+               unsigned long i;
 
-       if (ret)
-               goto out;
+               if (flags & KEXEC_ON_CRASH) {
+                       /*
+                        * Loading another kernel to switch to if this one
+                        * crashes.  Free any current crash dump kernel before
+                        * we corrupt it.
+                        */
 
-#ifdef CONFIG_KEXEC_VERIFY_SIG
-       ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
-                                          image->kernel_buf_len);
-       if (ret) {
-               pr_debug("kernel signature verification failed.\n");
-               goto out;
-       }
-       pr_debug("kernel signature verification successful.\n");
-#endif
-       /* It is possible that there no initramfs is being loaded */
-       if (!(flags & KEXEC_FILE_NO_INITRAMFS)) {
-               ret = copy_file_from_fd(initrd_fd, &image->initrd_buf,
-                                       &image->initrd_buf_len);
-               if (ret)
-                       goto out;
-       }
+                       kimage_free(xchg(&kexec_crash_image, NULL));
+                       result = kimage_alloc_init(&image, entry, nr_segments,
+                                                  segments, flags);
+                       crash_map_reserved_pages();
+               } else {
+                       /* Loading another kernel to reboot into. */
 
-       if (cmdline_len) {
-               image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL);
-               if (!image->cmdline_buf) {
-                       ret = -ENOMEM;
-                       goto out;
+                       result = kimage_alloc_init(&image, entry, nr_segments,
+                                                  segments, flags);
                }
-
-               ret = copy_from_user(image->cmdline_buf, cmdline_ptr,
-                                    cmdline_len);
-               if (ret) {
-                       ret = -EFAULT;
+               if (result)
                        goto out;
-               }
-
-               image->cmdline_buf_len = cmdline_len;
 
-               /* command line should be a string with last byte null */
-               if (image->cmdline_buf[cmdline_len - 1] != '\0') {
-                       ret = -EINVAL;
+               if (flags & KEXEC_PRESERVE_CONTEXT)
+                       image->preserve_context = 1;
+               result = machine_kexec_prepare(image);
+               if (result)
                        goto out;
-               }
-       }
 
-       /* Call arch image load handlers */
-       ldata = arch_kexec_kernel_image_load(image);
-
-       if (IS_ERR(ldata)) {
-               ret = PTR_ERR(ldata);
-               goto out;
+               for (i = 0; i < nr_segments; i++) {
+                       result = kimage_load_segment(image, &image->segment[i]);
+                       if (result)
+                               goto out;
+               }
+               kimage_terminate(image);
+               if (flags & KEXEC_ON_CRASH)
+                       crash_unmap_reserved_pages();
        }
+       /* Install the new kernel, and  Uninstall the old */
+       image = xchg(dest_image, image);
 
-       image->image_loader_data = ldata;
 out:
-       /* In case of error, free up all allocated memory in this function */
-       if (ret)
-               kimage_file_post_load_cleanup(image);
-       return ret;
+       mutex_unlock(&kexec_mutex);
+       kimage_free(image);
+
+       return result;
 }
 
-static int
-kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
-                      int initrd_fd, const char __user *cmdline_ptr,
-                      unsigned long cmdline_len, unsigned long flags)
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
+                      compat_ulong_t, nr_segments,
+                      struct compat_kexec_segment __user *, segments,
+                      compat_ulong_t, flags)
 {
-       int ret;
-       struct kimage *image;
-       bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH;
-
-       image = do_kimage_alloc_init();
-       if (!image)
-               return -ENOMEM;
+       struct compat_kexec_segment in;
+       struct kexec_segment out, __user *ksegments;
+       unsigned long i, result;
 
-       image->file_mode = 1;
+       /* Don't allow clients that don't understand the native
+        * architecture to do anything.
+        */
+       if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
+               return -EINVAL;
 
-       if (kexec_on_panic) {
-               /* Enable special crash kernel control page alloc policy. */
-               image->control_page = crashk_res.start;
-               image->type = KEXEC_TYPE_CRASH;
-       }
+       if (nr_segments > KEXEC_SEGMENT_MAX)
+               return -EINVAL;
 
-       ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd,
-                                          cmdline_ptr, cmdline_len, flags);
-       if (ret)
-               goto out_free_image;
-
-       ret = sanity_check_segment_list(image);
-       if (ret)
-               goto out_free_post_load_bufs;
-
-       ret = -ENOMEM;
-       image->control_code_page = kimage_alloc_control_pages(image,
-                                          get_order(KEXEC_CONTROL_PAGE_SIZE));
-       if (!image->control_code_page) {
-               pr_err("Could not allocate control_code_buffer\n");
-               goto out_free_post_load_bufs;
-       }
-
-       if (!kexec_on_panic) {
-               image->swap_page = kimage_alloc_control_pages(image, 0);
-               if (!image->swap_page) {
-                       pr_err("Could not allocate swap buffer\n");
-                       goto out_free_control_pages;
-               }
-       }
-
-       *rimage = image;
-       return 0;
-out_free_control_pages:
-       kimage_free_page_list(&image->control_pages);
-out_free_post_load_bufs:
-       kimage_file_post_load_cleanup(image);
-out_free_image:
-       kfree(image);
-       return ret;
-}
-#else /* CONFIG_KEXEC_FILE */
-static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
-#endif /* CONFIG_KEXEC_FILE */
-
-static int kimage_is_destination_range(struct kimage *image,
-                                       unsigned long start,
-                                       unsigned long end)
-{
-       unsigned long i;
-
-       for (i = 0; i < image->nr_segments; i++) {
-               unsigned long mstart, mend;
-
-               mstart = image->segment[i].mem;
-               mend = mstart + image->segment[i].memsz;
-               if ((end > mstart) && (start < mend))
-                       return 1;
-       }
-
-       return 0;
-}
-
-static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
-{
-       struct page *pages;
-
-       pages = alloc_pages(gfp_mask, order);
-       if (pages) {
-               unsigned int count, i;
-               pages->mapping = NULL;
-               set_page_private(pages, order);
-               count = 1 << order;
-               for (i = 0; i < count; i++)
-                       SetPageReserved(pages + i);
-       }
-
-       return pages;
-}
-
-static void kimage_free_pages(struct page *page)
-{
-       unsigned int order, count, i;
-
-       order = page_private(page);
-       count = 1 << order;
-       for (i = 0; i < count; i++)
-               ClearPageReserved(page + i);
-       __free_pages(page, order);
-}
-
-static void kimage_free_page_list(struct list_head *list)
-{
-       struct list_head *pos, *next;
-
-       list_for_each_safe(pos, next, list) {
-               struct page *page;
-
-               page = list_entry(pos, struct page, lru);
-               list_del(&page->lru);
-               kimage_free_pages(page);
-       }
-}
-
-static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
-                                                       unsigned int order)
-{
-       /* Control pages are special, they are the intermediaries
-        * that are needed while we copy the rest of the pages
-        * to their final resting place.  As such they must
-        * not conflict with either the destination addresses
-        * or memory the kernel is already using.
-        *
-        * The only case where we really need more than one of
-        * these are for architectures where we cannot disable
-        * the MMU and must instead generate an identity mapped
-        * page table for all of the memory.
-        *
-        * At worst this runs in O(N) of the image size.
-        */
-       struct list_head extra_pages;
-       struct page *pages;
-       unsigned int count;
-
-       count = 1 << order;
-       INIT_LIST_HEAD(&extra_pages);
-
-       /* Loop while I can allocate a page and the page allocated
-        * is a destination page.
-        */
-       do {
-               unsigned long pfn, epfn, addr, eaddr;
-
-               pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order);
-               if (!pages)
-                       break;
-               pfn   = page_to_pfn(pages);
-               epfn  = pfn + count;
-               addr  = pfn << PAGE_SHIFT;
-               eaddr = epfn << PAGE_SHIFT;
-               if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
-                             kimage_is_destination_range(image, addr, eaddr)) {
-                       list_add(&pages->lru, &extra_pages);
-                       pages = NULL;
-               }
-       } while (!pages);
-
-       if (pages) {
-               /* Remember the allocated page... */
-               list_add(&pages->lru, &image->control_pages);
-
-               /* Because the page is already in it's destination
-                * location we will never allocate another page at
-                * that address.  Therefore kimage_alloc_pages
-                * will not return it (again) and we don't need
-                * to give it an entry in image->segment[].
-                */
-       }
-       /* Deal with the destination pages I have inadvertently allocated.
-        *
-        * Ideally I would convert multi-page allocations into single
-        * page allocations, and add everything to image->dest_pages.
-        *
-        * For now it is simpler to just free the pages.
-        */
-       kimage_free_page_list(&extra_pages);
-
-       return pages;
-}
-
-static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
-                                                     unsigned int order)
-{
-       /* Control pages are special, they are the intermediaries
-        * that are needed while we copy the rest of the pages
-        * to their final resting place.  As such they must
-        * not conflict with either the destination addresses
-        * or memory the kernel is already using.
-        *
-        * Control pages are also the only pags we must allocate
-        * when loading a crash kernel.  All of the other pages
-        * are specified by the segments and we just memcpy
-        * into them directly.
-        *
-        * The only case where we really need more than one of
-        * these are for architectures where we cannot disable
-        * the MMU and must instead generate an identity mapped
-        * page table for all of the memory.
-        *
-        * Given the low demand this implements a very simple
-        * allocator that finds the first hole of the appropriate
-        * size in the reserved memory region, and allocates all
-        * of the memory up to and including the hole.
-        */
-       unsigned long hole_start, hole_end, size;
-       struct page *pages;
-
-       pages = NULL;
-       size = (1 << order) << PAGE_SHIFT;
-       hole_start = (image->control_page + (size - 1)) & ~(size - 1);
-       hole_end   = hole_start + size - 1;
-       while (hole_end <= crashk_res.end) {
-               unsigned long i;
-
-               if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
-                       break;
-               /* See if I overlap any of the segments */
-               for (i = 0; i < image->nr_segments; i++) {
-                       unsigned long mstart, mend;
-
-                       mstart = image->segment[i].mem;
-                       mend   = mstart + image->segment[i].memsz - 1;
-                       if ((hole_end >= mstart) && (hole_start <= mend)) {
-                               /* Advance the hole to the end of the segment */
-                               hole_start = (mend + (size - 1)) & ~(size - 1);
-                               hole_end   = hole_start + size - 1;
-                               break;
-                       }
-               }
-               /* If I don't overlap any segments I have found my hole! */
-               if (i == image->nr_segments) {
-                       pages = pfn_to_page(hole_start >> PAGE_SHIFT);
-                       break;
-               }
-       }
-       if (pages)
-               image->control_page = hole_end;
-
-       return pages;
-}
-
-
-struct page *kimage_alloc_control_pages(struct kimage *image,
-                                        unsigned int order)
-{
-       struct page *pages = NULL;
-
-       switch (image->type) {
-       case KEXEC_TYPE_DEFAULT:
-               pages = kimage_alloc_normal_control_pages(image, order);
-               break;
-       case KEXEC_TYPE_CRASH:
-               pages = kimage_alloc_crash_control_pages(image, order);
-               break;
-       }
-
-       return pages;
-}
-
-static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
-{
-       if (*image->entry != 0)
-               image->entry++;
-
-       if (image->entry == image->last_entry) {
-               kimage_entry_t *ind_page;
-               struct page *page;
-
-               page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
-               if (!page)
-                       return -ENOMEM;
-
-               ind_page = page_address(page);
-               *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
-               image->entry = ind_page;
-               image->last_entry = ind_page +
-                                     ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
-       }
-       *image->entry = entry;
-       image->entry++;
-       *image->entry = 0;
-
-       return 0;
-}
-
-static int kimage_set_destination(struct kimage *image,
-                                  unsigned long destination)
-{
-       int result;
-
-       destination &= PAGE_MASK;
-       result = kimage_add_entry(image, destination | IND_DESTINATION);
-
-       return result;
-}
-
-
-static int kimage_add_page(struct kimage *image, unsigned long page)
-{
-       int result;
-
-       page &= PAGE_MASK;
-       result = kimage_add_entry(image, page | IND_SOURCE);
-
-       return result;
-}
-
-
-static void kimage_free_extra_pages(struct kimage *image)
-{
-       /* Walk through and free any extra destination pages I may have */
-       kimage_free_page_list(&image->dest_pages);
-
-       /* Walk through and free any unusable pages I have cached */
-       kimage_free_page_list(&image->unusable_pages);
-
-}
-static void kimage_terminate(struct kimage *image)
-{
-       if (*image->entry != 0)
-               image->entry++;
-
-       *image->entry = IND_DONE;
-}
-
-#define for_each_kimage_entry(image, ptr, entry) \
-       for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
-               ptr = (entry & IND_INDIRECTION) ? \
-                       phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
-
-static void kimage_free_entry(kimage_entry_t entry)
-{
-       struct page *page;
-
-       page = pfn_to_page(entry >> PAGE_SHIFT);
-       kimage_free_pages(page);
-}
-
-static void kimage_free(struct kimage *image)
-{
-       kimage_entry_t *ptr, entry;
-       kimage_entry_t ind = 0;
-
-       if (!image)
-               return;
-
-       kimage_free_extra_pages(image);
-       for_each_kimage_entry(image, ptr, entry) {
-               if (entry & IND_INDIRECTION) {
-                       /* Free the previous indirection page */
-                       if (ind & IND_INDIRECTION)
-                               kimage_free_entry(ind);
-                       /* Save this indirection page until we are
-                        * done with it.
-                        */
-                       ind = entry;
-               } else if (entry & IND_SOURCE)
-                       kimage_free_entry(entry);
-       }
-       /* Free the final indirection page */
-       if (ind & IND_INDIRECTION)
-               kimage_free_entry(ind);
-
-       /* Handle any machine specific cleanup */
-       machine_kexec_cleanup(image);
-
-       /* Free the kexec control pages... */
-       kimage_free_page_list(&image->control_pages);
-
-       /*
-        * Free up any temporary buffers allocated. This might hit if
-        * error occurred much later after buffer allocation.
-        */
-       if (image->file_mode)
-               kimage_file_post_load_cleanup(image);
-
-       kfree(image);
-}
-
-static kimage_entry_t *kimage_dst_used(struct kimage *image,
-                                       unsigned long page)
-{
-       kimage_entry_t *ptr, entry;
-       unsigned long destination = 0;
-
-       for_each_kimage_entry(image, ptr, entry) {
-               if (entry & IND_DESTINATION)
-                       destination = entry & PAGE_MASK;
-               else if (entry & IND_SOURCE) {
-                       if (page == destination)
-                               return ptr;
-                       destination += PAGE_SIZE;
-               }
-       }
-
-       return NULL;
-}
-
-static struct page *kimage_alloc_page(struct kimage *image,
-                                       gfp_t gfp_mask,
-                                       unsigned long destination)
-{
-       /*
-        * Here we implement safeguards to ensure that a source page
-        * is not copied to its destination page before the data on
-        * the destination page is no longer useful.
-        *
-        * To do this we maintain the invariant that a source page is
-        * either its own destination page, or it is not a
-        * destination page at all.
-        *
-        * That is slightly stronger than required, but the proof
-        * that no problems will not occur is trivial, and the
-        * implementation is simply to verify.
-        *
-        * When allocating all pages normally this algorithm will run
-        * in O(N) time, but in the worst case it will run in O(N^2)
-        * time.   If the runtime is a problem the data structures can
-        * be fixed.
-        */
-       struct page *page;
-       unsigned long addr;
-
-       /*
-        * Walk through the list of destination pages, and see if I
-        * have a match.
-        */
-       list_for_each_entry(page, &image->dest_pages, lru) {
-               addr = page_to_pfn(page) << PAGE_SHIFT;
-               if (addr == destination) {
-                       list_del(&page->lru);
-                       return page;
-               }
-       }
-       page = NULL;
-       while (1) {
-               kimage_entry_t *old;
-
-               /* Allocate a page, if we run out of memory give up */
-               page = kimage_alloc_pages(gfp_mask, 0);
-               if (!page)
-                       return NULL;
-               /* If the page cannot be used file it away */
-               if (page_to_pfn(page) >
-                               (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
-                       list_add(&page->lru, &image->unusable_pages);
-                       continue;
-               }
-               addr = page_to_pfn(page) << PAGE_SHIFT;
-
-               /* If it is the destination page we want use it */
-               if (addr == destination)
-                       break;
-
-               /* If the page is not a destination page use it */
-               if (!kimage_is_destination_range(image, addr,
-                                                 addr + PAGE_SIZE))
-                       break;
-
-               /*
-                * I know that the page is someones destination page.
-                * See if there is already a source page for this
-                * destination page.  And if so swap the source pages.
-                */
-               old = kimage_dst_used(image, addr);
-               if (old) {
-                       /* If so move it */
-                       unsigned long old_addr;
-                       struct page *old_page;
-
-                       old_addr = *old & PAGE_MASK;
-                       old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
-                       copy_highpage(page, old_page);
-                       *old = addr | (*old & ~PAGE_MASK);
-
-                       /* The old page I have found cannot be a
-                        * destination page, so return it if it's
-                        * gfp_flags honor the ones passed in.
-                        */
-                       if (!(gfp_mask & __GFP_HIGHMEM) &&
-                           PageHighMem(old_page)) {
-                               kimage_free_pages(old_page);
-                               continue;
-                       }
-                       addr = old_addr;
-                       page = old_page;
-                       break;
-               } else {
-                       /* Place the page on the destination list; I
-                        * will use it later.
-                        */
-                       list_add(&page->lru, &image->dest_pages);
-               }
-       }
-
-       return page;
-}
-
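Concretely, the swap case above works like this: if the freshly allocated page happens to sit at some other entry's destination address and that destination already has a source page assigned, the existing source's contents are copied into the new page, the image's page-list entry is repointed at the new page (which is now its own destination), and the old source page, no longer playing any destination role, is handed back to satisfy the current request. That is how the stated invariant is preserved without ever overwriting still-useful destination data.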
-static int kimage_load_normal_segment(struct kimage *image,
-                                        struct kexec_segment *segment)
-{
-       unsigned long maddr;
-       size_t ubytes, mbytes;
-       int result;
-       unsigned char __user *buf = NULL;
-       unsigned char *kbuf = NULL;
-
-       result = 0;
-       if (image->file_mode)
-               kbuf = segment->kbuf;
-       else
-               buf = segment->buf;
-       ubytes = segment->bufsz;
-       mbytes = segment->memsz;
-       maddr = segment->mem;
-
-       result = kimage_set_destination(image, maddr);
-       if (result < 0)
-               goto out;
-
-       while (mbytes) {
-               struct page *page;
-               char *ptr;
-               size_t uchunk, mchunk;
-
-               page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
-               if (!page) {
-                       result  = -ENOMEM;
-                       goto out;
-               }
-               result = kimage_add_page(image, page_to_pfn(page)
-                                                               << PAGE_SHIFT);
-               if (result < 0)
-                       goto out;
-
-               ptr = kmap(page);
-               /* Start with a clear page */
-               clear_page(ptr);
-               ptr += maddr & ~PAGE_MASK;
-               mchunk = min_t(size_t, mbytes,
-                               PAGE_SIZE - (maddr & ~PAGE_MASK));
-               uchunk = min(ubytes, mchunk);
-
-               /* For file based kexec, source pages are in kernel memory */
-               if (image->file_mode)
-                       memcpy(ptr, kbuf, uchunk);
-               else
-                       result = copy_from_user(ptr, buf, uchunk);
-               kunmap(page);
-               if (result) {
-                       result = -EFAULT;
-                       goto out;
-               }
-               ubytes -= uchunk;
-               maddr  += mchunk;
-               if (image->file_mode)
-                       kbuf += mchunk;
-               else
-                       buf += mchunk;
-               mbytes -= mchunk;
-       }
-out:
-       return result;
-}
-
-static int kimage_load_crash_segment(struct kimage *image,
-                                       struct kexec_segment *segment)
-{
-       /* For crash dump kernels we simply copy the data from
-        * user space to its destination.
-        * We do things a page at a time for the sake of kmap.
-        */
-       unsigned long maddr;
-       size_t ubytes, mbytes;
-       int result;
-       unsigned char __user *buf = NULL;
-       unsigned char *kbuf = NULL;
-
-       result = 0;
-       if (image->file_mode)
-               kbuf = segment->kbuf;
-       else
-               buf = segment->buf;
-       ubytes = segment->bufsz;
-       mbytes = segment->memsz;
-       maddr = segment->mem;
-       while (mbytes) {
-               struct page *page;
-               char *ptr;
-               size_t uchunk, mchunk;
-
-               page = pfn_to_page(maddr >> PAGE_SHIFT);
-               if (!page) {
-                       result  = -ENOMEM;
-                       goto out;
-               }
-               ptr = kmap(page);
-               ptr += maddr & ~PAGE_MASK;
-               mchunk = min_t(size_t, mbytes,
-                               PAGE_SIZE - (maddr & ~PAGE_MASK));
-               uchunk = min(ubytes, mchunk);
-               if (mchunk > uchunk) {
-                       /* Zero the trailing part of the page */
-                       memset(ptr + uchunk, 0, mchunk - uchunk);
-               }
-
-               /* For file based kexec, source pages are in kernel memory */
-               if (image->file_mode)
-                       memcpy(ptr, kbuf, uchunk);
-               else
-                       result = copy_from_user(ptr, buf, uchunk);
-               kexec_flush_icache_page(page);
-               kunmap(page);
-               if (result) {
-                       result = -EFAULT;
-                       goto out;
-               }
-               ubytes -= uchunk;
-               maddr  += mchunk;
-               if (image->file_mode)
-                       kbuf += mchunk;
-               else
-                       buf += mchunk;
-               mbytes -= mchunk;
-       }
-out:
-       return result;
-}
-
-static int kimage_load_segment(struct kimage *image,
-                               struct kexec_segment *segment)
-{
-       int result = -ENOMEM;
-
-       switch (image->type) {
-       case KEXEC_TYPE_DEFAULT:
-               result = kimage_load_normal_segment(image, segment);
-               break;
-       case KEXEC_TYPE_CRASH:
-               result = kimage_load_crash_segment(image, segment);
-               break;
-       }
-
-       return result;
-}
-
-/*
- * Exec Kernel system call: for obvious reasons only root may call it.
- *
- * This call breaks up into three pieces.
- * - A generic part which loads the new kernel from the current
- *   address space, and very carefully places the data in the
- *   allocated pages.
- *
- * - A generic part that interacts with the kernel and tells all of
- *   the devices to shut down, preventing ongoing DMAs and placing
- *   the devices in a consistent state so a later kernel can
- *   reinitialize them.
- *
- * - A machine specific part that includes the syscall number
- *   and then copies the image to its final destination.  And
- *   jumps into the image at entry.
- *
- * kexec does not sync, or unmount filesystems so if you need
- * that to happen you need to do that yourself.
- */
-struct kimage *kexec_image;
-struct kimage *kexec_crash_image;
-int kexec_load_disabled;
-
-static DEFINE_MUTEX(kexec_mutex);
-
-SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
-               struct kexec_segment __user *, segments, unsigned long, flags)
-{
-       struct kimage **dest_image, *image;
-       int result;
-
-       /* We only trust the superuser with rebooting the system. */
-       if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
-               return -EPERM;
-
-       /*
-        * Verify we have a legal set of flags
-        * This leaves us room for future extensions.
-        */
-       if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
-               return -EINVAL;
-
-       /* Verify we are on the appropriate architecture */
-       if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
-               ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
-               return -EINVAL;
-
-       /* Put an artificial cap on the number
-        * of segments passed to kexec_load.
-        */
-       if (nr_segments > KEXEC_SEGMENT_MAX)
-               return -EINVAL;
-
-       image = NULL;
-       result = 0;
-
-       /* Because we write directly to the reserved memory
-        * region when loading crash kernels we need a mutex here to
-        * prevent multiple crash kernels from attempting to load
-        * simultaneously, and to prevent a crash kernel from loading
-        * over the top of an in-use crash kernel.
-        *
-        * KISS: always take the mutex.
-        */
-       if (!mutex_trylock(&kexec_mutex))
-               return -EBUSY;
-
-       dest_image = &kexec_image;
-       if (flags & KEXEC_ON_CRASH)
-               dest_image = &kexec_crash_image;
-       if (nr_segments > 0) {
-               unsigned long i;
-
-               if (flags & KEXEC_ON_CRASH) {
-                       /*
-                        * Loading another kernel to switch to if this one
-                        * crashes.  Free any current crash dump kernel before
-                        * we corrupt it.
-                        */
-
-                       kimage_free(xchg(&kexec_crash_image, NULL));
-                       result = kimage_alloc_init(&image, entry, nr_segments,
-                                                  segments, flags);
-                       crash_map_reserved_pages();
-               } else {
-                       /* Loading another kernel to reboot into. */
-
-                       result = kimage_alloc_init(&image, entry, nr_segments,
-                                                  segments, flags);
-               }
-               if (result)
-                       goto out;
-
-               if (flags & KEXEC_PRESERVE_CONTEXT)
-                       image->preserve_context = 1;
-               result = machine_kexec_prepare(image);
-               if (result)
-                       goto out;
-
-               for (i = 0; i < nr_segments; i++) {
-                       result = kimage_load_segment(image, &image->segment[i]);
-                       if (result)
-                               goto out;
-               }
-               kimage_terminate(image);
-               if (flags & KEXEC_ON_CRASH)
-                       crash_unmap_reserved_pages();
-       }
-       /* Install the new kernel and uninstall the old */
-       image = xchg(dest_image, image);
-
-out:
-       mutex_unlock(&kexec_mutex);
-       kimage_free(image);
-
-       return result;
-}
-
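For orientation, a minimal userspace sketch of driving the syscall above follows; it is illustrative only and not part of this change. glibc provides no wrapper, so syscall(2) is used directly; the helper name and the 16 MiB destination address are invented for the example, and the caller is assumed to hold CAP_SYS_BOOT and to have prepared the segment contents (as kexec-tools normally would), with mem and memsz page-aligned.

  #include <sys/syscall.h>
  #include <unistd.h>
  #include <linux/kexec.h>

  /* Hypothetical helper: load one pre-built, page-aligned segment at 16 MiB
   * and use the same address as the entry point.
   */
  static int load_one_segment(void *buf, size_t bufsz, size_t memsz)
  {
          struct kexec_segment seg = {
                  .buf   = buf,               /* source in our address space */
                  .bufsz = bufsz,
                  .mem   = (void *)0x1000000, /* assumed physical destination */
                  .memsz = memsz,             /* >= bufsz, page-aligned */
          };

          /* entry, nr_segments, segments, flags */
          return syscall(SYS_kexec_load, 0x1000000UL, 1UL, &seg,
                         KEXEC_ARCH_DEFAULT);
  }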
-/*
- * Add and remove page tables for crashkernel memory
- *
- * Provide an empty default implementation here -- architecture
- * code may override this
- */
-void __weak crash_map_reserved_pages(void)
-{}
-
-void __weak crash_unmap_reserved_pages(void)
-{}
-
-#ifdef CONFIG_COMPAT
-COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
-                      compat_ulong_t, nr_segments,
-                      struct compat_kexec_segment __user *, segments,
-                      compat_ulong_t, flags)
-{
-       struct compat_kexec_segment in;
-       struct kexec_segment out, __user *ksegments;
-       unsigned long i, result;
-
-       /* Don't allow clients that don't understand the native
-        * architecture to do anything.
-        */
-       if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
-               return -EINVAL;
-
-       if (nr_segments > KEXEC_SEGMENT_MAX)
-               return -EINVAL;
-
-       ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
-       for (i = 0; i < nr_segments; i++) {
-               result = copy_from_user(&in, &segments[i], sizeof(in));
-               if (result)
-                       return -EFAULT;
-
-               out.buf   = compat_ptr(in.buf);
-               out.bufsz = in.bufsz;
-               out.mem   = in.mem;
-               out.memsz = in.memsz;
-
-               result = copy_to_user(&ksegments[i], &out, sizeof(out));
-               if (result)
-                       return -EFAULT;
-       }
-
-       return sys_kexec_load(entry, nr_segments, ksegments, flags);
-}
-#endif
-
-#ifdef CONFIG_KEXEC_FILE
-SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
-               unsigned long, cmdline_len, const char __user *, cmdline_ptr,
-               unsigned long, flags)
-{
-       int ret = 0, i;
-       struct kimage **dest_image, *image;
-
-       /* We only trust the superuser with rebooting the system. */
-       if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
-               return -EPERM;
-
-       /* Make sure we have a legal set of flags */
-       if (flags != (flags & KEXEC_FILE_FLAGS))
-               return -EINVAL;
-
-       image = NULL;
-
-       if (!mutex_trylock(&kexec_mutex))
-               return -EBUSY;
-
-       dest_image = &kexec_image;
-       if (flags & KEXEC_FILE_ON_CRASH)
-               dest_image = &kexec_crash_image;
-
-       if (flags & KEXEC_FILE_UNLOAD)
-               goto exchange;
-
-       /*
-        * In case of a crash, the new kernel gets loaded into the reserved
-        * region, which is the same memory where the old crash kernel might be
-        * loaded. Free any current crash dump kernel before we corrupt it.
-        */
-       if (flags & KEXEC_FILE_ON_CRASH)
-               kimage_free(xchg(&kexec_crash_image, NULL));
-
-       ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr,
-                                    cmdline_len, flags);
-       if (ret)
-               goto out;
-
-       ret = machine_kexec_prepare(image);
-       if (ret)
-               goto out;
-
-       ret = kexec_calculate_store_digests(image);
-       if (ret)
-               goto out;
-
-       for (i = 0; i < image->nr_segments; i++) {
-               struct kexec_segment *ksegment;
-
-               ksegment = &image->segment[i];
-               pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n",
-                        i, ksegment->buf, ksegment->bufsz, ksegment->mem,
-                        ksegment->memsz);
-
-               ret = kimage_load_segment(image, &image->segment[i]);
-               if (ret)
-                       goto out;
-       }
-
-       kimage_terminate(image);
-
-       /*
-        * Free up any temporary buffers allocated which are not needed
-        * after image has been loaded
-        */
-       kimage_file_post_load_cleanup(image);
-exchange:
-       image = xchg(dest_image, image);
-out:
-       mutex_unlock(&kexec_mutex);
-       kimage_free(image);
-       return ret;
-}
-
-#endif /* CONFIG_KEXEC_FILE */
-
-void crash_kexec(struct pt_regs *regs)
-{
-       /* Take the kexec_mutex here to prevent sys_kexec_load
-        * running on one cpu from replacing the crash kernel
-        * we are using after a panic on a different cpu.
-        *
-        * If the crash kernel was not located in a fixed area
-        * of memory the xchg(&kexec_crash_image) would be
-        * sufficient.  But since I reuse the memory...
-        */
-       if (mutex_trylock(&kexec_mutex)) {
-               if (kexec_crash_image) {
-                       struct pt_regs fixed_regs;
-
-                       crash_setup_regs(&fixed_regs, regs);
-                       crash_save_vmcoreinfo();
-                       machine_crash_shutdown(&fixed_regs);
-                       machine_kexec(kexec_crash_image);
-               }
-               mutex_unlock(&kexec_mutex);
-       }
-}
-
-size_t crash_get_memory_size(void)
-{
-       size_t size = 0;
-       mutex_lock(&kexec_mutex);
-       if (crashk_res.end != crashk_res.start)
-               size = resource_size(&crashk_res);
-       mutex_unlock(&kexec_mutex);
-       return size;
-}
-
-void __weak crash_free_reserved_phys_range(unsigned long begin,
-                                          unsigned long end)
-{
-       unsigned long addr;
-
-       for (addr = begin; addr < end; addr += PAGE_SIZE)
-               free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
-}
-
-int crash_shrink_memory(unsigned long new_size)
-{
-       int ret = 0;
-       unsigned long start, end;
-       unsigned long old_size;
-       struct resource *ram_res;
-
-       mutex_lock(&kexec_mutex);
-
-       if (kexec_crash_image) {
-               ret = -ENOENT;
-               goto unlock;
-       }
-       start = crashk_res.start;
-       end = crashk_res.end;
-       old_size = (end == 0) ? 0 : end - start + 1;
-       if (new_size >= old_size) {
-               ret = (new_size == old_size) ? 0 : -EINVAL;
-               goto unlock;
-       }
-
-       ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
-       if (!ram_res) {
-               ret = -ENOMEM;
-               goto unlock;
-       }
-
-       start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
-       end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);
-
-       crash_map_reserved_pages();
-       crash_free_reserved_phys_range(end, crashk_res.end);
-
-       if ((start == end) && (crashk_res.parent != NULL))
-               release_resource(&crashk_res);
-
-       ram_res->start = end;
-       ram_res->end = crashk_res.end;
-       ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
-       ram_res->name = "System RAM";
-
-       crashk_res.end = end - 1;
-
-       insert_resource(&iomem_resource, ram_res);
-       crash_unmap_reserved_pages();
-
-unlock:
-       mutex_unlock(&kexec_mutex);
-       return ret;
-}
-
-static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
-                           size_t data_len)
-{
-       struct elf_note note;
-
-       note.n_namesz = strlen(name) + 1;
-       note.n_descsz = data_len;
-       note.n_type   = type;
-       memcpy(buf, &note, sizeof(note));
-       buf += (sizeof(note) + 3)/4;
-       memcpy(buf, name, note.n_namesz);
-       buf += (note.n_namesz + 3)/4;
-       memcpy(buf, data, note.n_descsz);
-       buf += (note.n_descsz + 3)/4;
-
-       return buf;
-}
-
-static void final_note(u32 *buf)
-{
-       struct elf_note note;
-
-       note.n_namesz = 0;
-       note.n_descsz = 0;
-       note.n_type   = 0;
-       memcpy(buf, &note, sizeof(note));
-}
-
-void crash_save_cpu(struct pt_regs *regs, int cpu)
-{
-       struct elf_prstatus prstatus;
-       u32 *buf;
-
-       if ((cpu < 0) || (cpu >= nr_cpu_ids))
-               return;
-
-       /* Using ELF notes here is opportunistic.
-        * I need a well defined structure format
-        * for the data I pass, and I need tags
-        * on the data to indicate what information I have
-        * squirrelled away.  ELF notes happen to provide
-        * all of that, so there is no need to invent something new.
-        */
-       buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
-       if (!buf)
-               return;
-       memset(&prstatus, 0, sizeof(prstatus));
-       prstatus.pr_pid = current->pid;
-       elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
-       buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
-                             &prstatus, sizeof(prstatus));
-       final_note(buf);
-}
-
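The note buffer written above is a packed sequence of 32-bit words: the elf_note header, then the name, then the descriptor, each rounded up to a 4-byte boundary, which is what the (x + 3)/4 arithmetic implements. As a small illustrative sketch (not part of the patch), the number of words one note occupies could be computed as:

  #include <linux/elf.h>

  /* Illustrative only: 32-bit words consumed by one note entry, matching the
   * 4-byte rounding done in append_elf_note() above.
   */
  static size_t note_words(size_t namesz, size_t descsz)
  {
          return (sizeof(struct elf_note) + 3) / 4 +
                 (namesz + 3) / 4 +
                 (descsz + 3) / 4;
  }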
-static int __init crash_notes_memory_init(void)
-{
-       /* Allocate memory for saving cpu registers. */
-       crash_notes = alloc_percpu(note_buf_t);
-       if (!crash_notes) {
-               pr_warn("Kexec: Memory allocation for saving cpu register states failed\n");
-               return -ENOMEM;
-       }
-       return 0;
-}
-subsys_initcall(crash_notes_memory_init);
-
-
-/*
- * parsing the "crashkernel" commandline
- *
- * this code is intended to be called from architecture specific code
- */
-
-
-/*
- * This function parses command lines in the format
- *
- *   crashkernel=ramsize-range:size[,...][@offset]
- *
- * The function returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_mem(char *cmdline,
-                                       unsigned long long system_ram,
-                                       unsigned long long *crash_size,
-                                       unsigned long long *crash_base)
-{
-       char *cur = cmdline, *tmp;
-
-       /* for each entry of the comma-separated list */
-       do {
-               unsigned long long start, end = ULLONG_MAX, size;
-
-               /* get the start of the range */
-               start = memparse(cur, &tmp);
-               if (cur == tmp) {
-                       pr_warn("crashkernel: Memory value expected\n");
-                       return -EINVAL;
-               }
-               cur = tmp;
-               if (*cur != '-') {
-                       pr_warn("crashkernel: '-' expected\n");
-                       return -EINVAL;
-               }
-               cur++;
-
-               /* if no ':' is here, then we read the end */
-               if (*cur != ':') {
-                       end = memparse(cur, &tmp);
-                       if (cur == tmp) {
-                               pr_warn("crashkernel: Memory value expected\n");
-                               return -EINVAL;
-                       }
-                       cur = tmp;
-                       if (end <= start) {
-                               pr_warn("crashkernel: end <= start\n");
-                               return -EINVAL;
-                       }
-               }
-
-               if (*cur != ':') {
-                       pr_warn("crashkernel: ':' expected\n");
-                       return -EINVAL;
-               }
-               cur++;
-
-               size = memparse(cur, &tmp);
-               if (cur == tmp) {
-                       pr_warn("Memory value expected\n");
-                       return -EINVAL;
-               }
-               cur = tmp;
-               if (size >= system_ram) {
-                       pr_warn("crashkernel: invalid size\n");
-                       return -EINVAL;
-               }
-
-               /* match ? */
-               if (system_ram >= start && system_ram < end) {
-                       *crash_size = size;
-                       break;
-               }
-       } while (*cur++ == ',');
-
-       if (*crash_size > 0) {
-               while (*cur && *cur != ' ' && *cur != '@')
-                       cur++;
-               if (*cur == '@') {
-                       cur++;
-                       *crash_base = memparse(cur, &tmp);
-                       if (cur == tmp) {
-                               pr_warn("Memory value expected after '@'\n");
-                               return -EINVAL;
-                       }
-               }
-       }
-
-       return 0;
-}
-
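As a worked example of the range syntax parsed above, a boot line such as

  crashkernel=512M-2G:64M,2G-:128M@16M

reserves 64M of crash kernel memory when system RAM is at least 512M but below 2G, 128M when it is 2G or more (the open-ended "2G-" range), and the trailing @16M pins the reservation at physical offset 16M once a size has matched.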
-/*
- * This function parses "simple" (old) crashkernel command lines like
- *
- *     crashkernel=size[@offset]
- *
- * It returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_simple(char *cmdline,
-                                          unsigned long long *crash_size,
-                                          unsigned long long *crash_base)
-{
-       char *cur = cmdline;
-
-       *crash_size = memparse(cmdline, &cur);
-       if (cmdline == cur) {
-               pr_warn("crashkernel: memory value expected\n");
-               return -EINVAL;
-       }
-
-       if (*cur == '@')
-               *crash_base = memparse(cur+1, &cur);
-       else if (*cur != ' ' && *cur != '\0') {
-               pr_warn("crashkernel: unrecognized char\n");
-               return -EINVAL;
-       }
-
-       return 0;
-}
-
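The classic form handled here is just a size with an optional fixed base, for example

  crashkernel=128M@16M

which ends up as *crash_size = 128M and *crash_base = 16M.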
-#define SUFFIX_HIGH 0
-#define SUFFIX_LOW  1
-#define SUFFIX_NULL 2
-static __initdata char *suffix_tbl[] = {
-       [SUFFIX_HIGH] = ",high",
-       [SUFFIX_LOW]  = ",low",
-       [SUFFIX_NULL] = NULL,
-};
-
-/*
- * This function parses "suffix" crashkernel command lines like
- *
- *     crashkernel=size,[high|low]
- *
- * It returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_suffix(char *cmdline,
-                                          unsigned long long   *crash_size,
-                                          const char *suffix)
-{
-       char *cur = cmdline;
-
-       *crash_size = memparse(cmdline, &cur);
-       if (cmdline == cur) {
-               pr_warn("crashkernel: memory value expected\n");
-               return -EINVAL;
-       }
-
-       /* check with suffix */
-       if (strncmp(cur, suffix, strlen(suffix))) {
-               pr_warn("crashkernel: unrecognized char\n");
-               return -EINVAL;
-       }
-       cur += strlen(suffix);
-       if (*cur != ' ' && *cur != '\0') {
-               pr_warn("crashkernel: unrecognized char\n");
-               return -EINVAL;
-       }
-
-       return 0;
-}
-
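The suffixed form chooses where the reservation is made, for example

  crashkernel=256M,high
  crashkernel=72M,low

and is accepted here only when the caller passes the matching ",high" or ",low" suffix.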
-static __init char *get_last_crashkernel(char *cmdline,
-                            const char *name,
-                            const char *suffix)
-{
-       char *p = cmdline, *ck_cmdline = NULL;
-
-       /* find crashkernel and use the last one if there are more */
-       p = strstr(p, name);
-       while (p) {
-               char *end_p = strchr(p, ' ');
-               char *q;
-
-               if (!end_p)
-                       end_p = p + strlen(p);
-
-               if (!suffix) {
-                       int i;
-
-                       /* skip the one with any known suffix */
-                       for (i = 0; suffix_tbl[i]; i++) {
-                               q = end_p - strlen(suffix_tbl[i]);
-                               if (!strncmp(q, suffix_tbl[i],
-                                            strlen(suffix_tbl[i])))
-                                       goto next;
-                       }
-                       ck_cmdline = p;
-               } else {
-                       q = end_p - strlen(suffix);
-                       if (!strncmp(q, suffix, strlen(suffix)))
-                               ck_cmdline = p;
-               }
-next:
-               p = strstr(p+1, name);
-       }
-
-       if (!ck_cmdline)
-               return NULL;
-
-       return ck_cmdline;
-}
-
-static int __init __parse_crashkernel(char *cmdline,
-                            unsigned long long system_ram,
-                            unsigned long long *crash_size,
-                            unsigned long long *crash_base,
-                            const char *name,
-                            const char *suffix)
-{
-       char    *first_colon, *first_space;
-       char    *ck_cmdline;
-
-       BUG_ON(!crash_size || !crash_base);
-       *crash_size = 0;
-       *crash_base = 0;
-
-       ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
-
-       if (!ck_cmdline)
-               return -EINVAL;
-
-       ck_cmdline += strlen(name);
-
-       if (suffix)
-               return parse_crashkernel_suffix(ck_cmdline, crash_size,
-                               suffix);
-       /*
-        * if the commandline contains a ':', then that's the extended
-        * syntax -- if not, it must be the classic syntax
-        */
-       first_colon = strchr(ck_cmdline, ':');
-       first_space = strchr(ck_cmdline, ' ');
-       if (first_colon && (!first_space || first_colon < first_space))
-               return parse_crashkernel_mem(ck_cmdline, system_ram,
-                               crash_size, crash_base);
-
-       return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
-}
-
-/*
- * This function is the entry point for command line parsing and should be
- * called from the arch-specific code.
- */
-int __init parse_crashkernel(char *cmdline,
-                            unsigned long long system_ram,
-                            unsigned long long *crash_size,
-                            unsigned long long *crash_base)
-{
-       return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
-                                       "crashkernel=", NULL);
-}
-
-int __init parse_crashkernel_high(char *cmdline,
-                            unsigned long long system_ram,
-                            unsigned long long *crash_size,
-                            unsigned long long *crash_base)
-{
-       return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
-                               "crashkernel=", suffix_tbl[SUFFIX_HIGH]);
-}
-
-int __init parse_crashkernel_low(char *cmdline,
-                            unsigned long long system_ram,
-                            unsigned long long *crash_size,
-                            unsigned long long *crash_base)
-{
-       return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
-                               "crashkernel=", suffix_tbl[SUFFIX_LOW]);
-}
-
-static void update_vmcoreinfo_note(void)
-{
-       u32 *buf = vmcoreinfo_note;
-
-       if (!vmcoreinfo_size)
-               return;
-       buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
-                             vmcoreinfo_size);
-       final_note(buf);
-}
-
-void crash_save_vmcoreinfo(void)
-{
-       vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
-       update_vmcoreinfo_note();
-}
-
-void vmcoreinfo_append_str(const char *fmt, ...)
-{
-       va_list args;
-       char buf[0x50];
-       size_t r;
-
-       va_start(args, fmt);
-       r = vscnprintf(buf, sizeof(buf), fmt, args);
-       va_end(args);
-
-       r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
-
-       memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
-
-       vmcoreinfo_size += r;
-}
-
-/*
- * provide an empty default implementation here -- architecture
- * code may override this
- */
-void __weak arch_crash_save_vmcoreinfo(void)
-{}
-
-unsigned long __weak paddr_vmcoreinfo_note(void)
-{
-       return __pa((unsigned long)(char *)&vmcoreinfo_note);
-}
-
-static int __init crash_save_vmcoreinfo_init(void)
-{
-       VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
-       VMCOREINFO_PAGESIZE(PAGE_SIZE);
-
-       VMCOREINFO_SYMBOL(init_uts_ns);
-       VMCOREINFO_SYMBOL(node_online_map);
-#ifdef CONFIG_MMU
-       VMCOREINFO_SYMBOL(swapper_pg_dir);
-#endif
-       VMCOREINFO_SYMBOL(_stext);
-       VMCOREINFO_SYMBOL(vmap_area_list);
-
-#ifndef CONFIG_NEED_MULTIPLE_NODES
-       VMCOREINFO_SYMBOL(mem_map);
-       VMCOREINFO_SYMBOL(contig_page_data);
-#endif
-#ifdef CONFIG_SPARSEMEM
-       VMCOREINFO_SYMBOL(mem_section);
-       VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
-       VMCOREINFO_STRUCT_SIZE(mem_section);
-       VMCOREINFO_OFFSET(mem_section, section_mem_map);
-#endif
-       VMCOREINFO_STRUCT_SIZE(page);
-       VMCOREINFO_STRUCT_SIZE(pglist_data);
-       VMCOREINFO_STRUCT_SIZE(zone);
-       VMCOREINFO_STRUCT_SIZE(free_area);
-       VMCOREINFO_STRUCT_SIZE(list_head);
-       VMCOREINFO_SIZE(nodemask_t);
-       VMCOREINFO_OFFSET(page, flags);
-       VMCOREINFO_OFFSET(page, _count);
-       VMCOREINFO_OFFSET(page, mapping);
-       VMCOREINFO_OFFSET(page, lru);
-       VMCOREINFO_OFFSET(page, _mapcount);
-       VMCOREINFO_OFFSET(page, private);
-       VMCOREINFO_OFFSET(pglist_data, node_zones);
-       VMCOREINFO_OFFSET(pglist_data, nr_zones);
-#ifdef CONFIG_FLAT_NODE_MEM_MAP
-       VMCOREINFO_OFFSET(pglist_data, node_mem_map);
-#endif
-       VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
-       VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
-       VMCOREINFO_OFFSET(pglist_data, node_id);
-       VMCOREINFO_OFFSET(zone, free_area);
-       VMCOREINFO_OFFSET(zone, vm_stat);
-       VMCOREINFO_OFFSET(zone, spanned_pages);
-       VMCOREINFO_OFFSET(free_area, free_list);
-       VMCOREINFO_OFFSET(list_head, next);
-       VMCOREINFO_OFFSET(list_head, prev);
-       VMCOREINFO_OFFSET(vmap_area, va_start);
-       VMCOREINFO_OFFSET(vmap_area, list);
-       VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
-       log_buf_kexec_setup();
-       VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
-       VMCOREINFO_NUMBER(NR_FREE_PAGES);
-       VMCOREINFO_NUMBER(PG_lru);
-       VMCOREINFO_NUMBER(PG_private);
-       VMCOREINFO_NUMBER(PG_swapcache);
-       VMCOREINFO_NUMBER(PG_slab);
-#ifdef CONFIG_MEMORY_FAILURE
-       VMCOREINFO_NUMBER(PG_hwpoison);
-#endif
-       VMCOREINFO_NUMBER(PG_head_mask);
-       VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
-#ifdef CONFIG_HUGETLBFS
-       VMCOREINFO_SYMBOL(free_huge_page);
-#endif
-
-       arch_crash_save_vmcoreinfo();
-       update_vmcoreinfo_note();
-
-       return 0;
-}
-
-subsys_initcall(crash_save_vmcoreinfo_init);
-
-#ifdef CONFIG_KEXEC_FILE
-static int locate_mem_hole_top_down(unsigned long start, unsigned long end,
-                                   struct kexec_buf *kbuf)
-{
-       struct kimage *image = kbuf->image;
-       unsigned long temp_start, temp_end;
-
-       temp_end = min(end, kbuf->buf_max);
-       temp_start = temp_end - kbuf->memsz;
-
-       do {
-               /* align down start */
-               temp_start = temp_start & (~(kbuf->buf_align - 1));
-
-               if (temp_start < start || temp_start < kbuf->buf_min)
-                       return 0;
-
-               temp_end = temp_start + kbuf->memsz - 1;
-
-               /*
-                * Make sure this does not conflict with any existing
-                * segments
-                */
-               if (kimage_is_destination_range(image, temp_start, temp_end)) {
-                       temp_start = temp_start - PAGE_SIZE;
-                       continue;
-               }
-
-               /* We found a suitable memory range */
-               break;
-       } while (1);
-
-       /* If we are here, we found a suitable memory range */
-       kbuf->mem = temp_start;
-
-       /* Success, stop navigating through remaining System RAM ranges */
-       return 1;
-}
-
-static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end,
-                                    struct kexec_buf *kbuf)
-{
-       struct kimage *image = kbuf->image;
-       unsigned long temp_start, temp_end;
-
-       temp_start = max(start, kbuf->buf_min);
-
-       do {
-               temp_start = ALIGN(temp_start, kbuf->buf_align);
-               temp_end = temp_start + kbuf->memsz - 1;
-
-               if (temp_end > end || temp_end > kbuf->buf_max)
-                       return 0;
-               /*
-                * Make sure this does not conflict with any existing
-                * segments
-                */
-               if (kimage_is_destination_range(image, temp_start, temp_end)) {
-                       temp_start = temp_start + PAGE_SIZE;
-                       continue;
-               }
-
-               /* We found a suitable memory range */
-               break;
-       } while (1);
-
-       /* If we are here, we found a suitable memory range */
-       kbuf->mem = temp_start;
-
-       /* Success, stop navigating through remaining System RAM ranges */
-       return 1;
-}
-
-static int locate_mem_hole_callback(u64 start, u64 end, void *arg)
-{
-       struct kexec_buf *kbuf = (struct kexec_buf *)arg;
-       unsigned long sz = end - start + 1;
-
-       /* Returning 0 will take to next memory range */
-       if (sz < kbuf->memsz)
-               return 0;
-
-       if (end < kbuf->buf_min || start > kbuf->buf_max)
-               return 0;
-
-       /*
-        * If top_down is set, allocate memory top down within the RAM range;
-        * otherwise allocate bottom up.
-        */
-       if (kbuf->top_down)
-               return locate_mem_hole_top_down(start, end, kbuf);
-       return locate_mem_hole_bottom_up(start, end, kbuf);
-}
-
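As a worked example of the alignment arithmetic in the two helpers above: with buf_align = 0x10000 (64 KiB), a top-down candidate start of 0x1234567 is rounded down to 0x1230000 by temp_start & ~(buf_align - 1), while the bottom-up path rounds the same address up to 0x1240000 with ALIGN(); whenever the resulting range overlaps an existing destination, the search simply moves one page and tries again.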
-/*
- * Helper function for placing a buffer in a kexec segment. This assumes
- * that kexec_mutex is held.
- */
-int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz,
-                    unsigned long memsz, unsigned long buf_align,
-                    unsigned long buf_min, unsigned long buf_max,
-                    bool top_down, unsigned long *load_addr)
-{
-
-       struct kexec_segment *ksegment;
-       struct kexec_buf buf, *kbuf;
-       int ret;
-
-       /* Currently, adding a segment this way is allowed only in file mode */
-       if (!image->file_mode)
-               return -EINVAL;
-
-       if (image->nr_segments >= KEXEC_SEGMENT_MAX)
-               return -EINVAL;
-
-       /*
-        * Make sure we are not trying to add a buffer after allocating
-        * control pages. All segments need to be placed before any control
-        * pages are allocated, because the control page allocation logic
-        * goes through the list of segments to make sure there are no
-        * destination overlaps.
-        */
-       if (!list_empty(&image->control_pages)) {
-               WARN_ON(1);
-               return -EINVAL;
-       }
-
-       memset(&buf, 0, sizeof(struct kexec_buf));
-       kbuf = &buf;
-       kbuf->image = image;
-       kbuf->buffer = buffer;
-       kbuf->bufsz = bufsz;
-
-       kbuf->memsz = ALIGN(memsz, PAGE_SIZE);
-       kbuf->buf_align = max(buf_align, PAGE_SIZE);
-       kbuf->buf_min = buf_min;
-       kbuf->buf_max = buf_max;
-       kbuf->top_down = top_down;
-
-       /* Walk the RAM ranges and allocate a suitable range for the buffer */
-       if (image->type == KEXEC_TYPE_CRASH)
-               ret = walk_iomem_res("Crash kernel",
-                                    IORESOURCE_MEM | IORESOURCE_BUSY,
-                                    crashk_res.start, crashk_res.end, kbuf,
-                                    locate_mem_hole_callback);
-       else
-               ret = walk_system_ram_res(0, -1, kbuf,
-                                         locate_mem_hole_callback);
-       if (ret != 1) {
-               /* A suitable memory range could not be found for buffer */
-               return -EADDRNOTAVAIL;
-       }
-
-       /* Found a suitable memory range */
-       ksegment = &image->segment[image->nr_segments];
-       ksegment->kbuf = kbuf->buffer;
-       ksegment->bufsz = kbuf->bufsz;
-       ksegment->mem = kbuf->mem;
-       ksegment->memsz = kbuf->memsz;
-       image->nr_segments++;
-       *load_addr = ksegment->mem;
-       return 0;
-}
-
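A sketch of how an architecture's file loader might call the helper above, for instance to place an initrd; the wrapper name and the below-4G limit are invented for illustration and are not taken from this patch:

  /* Illustrative only: place an initrd anywhere below 4 GiB, page aligned,
   * searching top down, and record the chosen load address.
   */
  static int example_place_initrd(struct kimage *image, char *initrd,
                                  unsigned long initrd_len,
                                  unsigned long *load_addr)
  {
          return kexec_add_buffer(image, initrd, initrd_len, initrd_len,
                                  PAGE_SIZE, 0, 0xffffffffUL, true, load_addr);
  }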
-/* Calculate and store the digest of segments */
-static int kexec_calculate_store_digests(struct kimage *image)
-{
-       struct crypto_shash *tfm;
-       struct shash_desc *desc;
-       int ret = 0, i, j, zero_buf_sz, sha_region_sz;
-       size_t desc_size, nullsz;
-       char *digest;
-       void *zero_buf;
-       struct kexec_sha_region *sha_regions;
-       struct purgatory_info *pi = &image->purgatory_info;
-
-       zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT);
-       zero_buf_sz = PAGE_SIZE;
-
-       tfm = crypto_alloc_shash("sha256", 0, 0);
-       if (IS_ERR(tfm)) {
-               ret = PTR_ERR(tfm);
-               goto out;
-       }
-
-       desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
-       desc = kzalloc(desc_size, GFP_KERNEL);
-       if (!desc) {
-               ret = -ENOMEM;
-               goto out_free_tfm;
-       }
-
-       sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region);
-       sha_regions = vzalloc(sha_region_sz);
-       if (!sha_regions)
-               goto out_free_desc;
-
-       desc->tfm   = tfm;
-       desc->flags = 0;
-
-       ret = crypto_shash_init(desc);
-       if (ret < 0)
-               goto out_free_sha_regions;
-
-       digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL);
-       if (!digest) {
-               ret = -ENOMEM;
-               goto out_free_sha_regions;
-       }
-
-       for (j = i = 0; i < image->nr_segments; i++) {
-               struct kexec_segment *ksegment;
-
-               ksegment = &image->segment[i];
-               /*
-                * Skip purgatory as it will be modified once we put digest
-                * info in purgatory.
-                */
-               if (ksegment->kbuf == pi->purgatory_buf)
-                       continue;
-
-               ret = crypto_shash_update(desc, ksegment->kbuf,
-                                         ksegment->bufsz);
-               if (ret)
-                       break;
-
-               /*
-                * Assume rest of the buffer is filled with zero and
-                * update digest accordingly.
-                */
-               nullsz = ksegment->memsz - ksegment->bufsz;
-               while (nullsz) {
-                       unsigned long bytes = nullsz;
-
-                       if (bytes > zero_buf_sz)
-                               bytes = zero_buf_sz;
-                       ret = crypto_shash_update(desc, zero_buf, bytes);
-                       if (ret)
-                               break;
-                       nullsz -= bytes;
-               }
-
-               if (ret)
-                       break;
-
-               sha_regions[j].start = ksegment->mem;
-               sha_regions[j].len = ksegment->memsz;
-               j++;
-       }
-
-       if (!ret) {
-               ret = crypto_shash_final(desc, digest);
-               if (ret)
-                       goto out_free_digest;
-               ret = kexec_purgatory_get_set_symbol(image, "sha_regions",
-                                               sha_regions, sha_region_sz, 0);
-               if (ret)
-                       goto out_free_digest;
-
-               ret = kexec_purgatory_get_set_symbol(image, "sha256_digest",
-                                               digest, SHA256_DIGEST_SIZE, 0);
-               if (ret)
-                       goto out_free_digest;
-       }
-
-out_free_digest:
-       kfree(digest);
-out_free_sha_regions:
-       vfree(sha_regions);
-out_free_desc:
-       kfree(desc);
-out_free_tfm:
-       kfree(tfm);
-out:
-       return ret;
-}
-
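For example, in the digest loop above a segment with bufsz = 5000 and memsz = 8192 contributes its 5000 data bytes followed by 3192 zero bytes (fed from the shared zero page in page-sized chunks), so the recorded sha_region covers the full 8192-byte destination and purgatory can later verify the segment exactly as it will sit in memory.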
-/* Actually load purgatory. A lot of code is taken from kexec-tools */
-static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
-                                 unsigned long max, int top_down)
-{
-       struct purgatory_info *pi = &image->purgatory_info;
-       unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad;
-       unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset;
-       unsigned char *buf_addr, *src;
-       int i, ret = 0, entry_sidx = -1;
-       const Elf_Shdr *sechdrs_c;
-       Elf_Shdr *sechdrs = NULL;
-       void *purgatory_buf = NULL;
-
-       /*
-        * sechdrs_c points to the section headers in purgatory and is read
-        * only. No modifications are allowed.
-        */
-       sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff;
-
-       /*
-        * We cannot modify sechdrs_c[] and its fields. It is read only.
-        * Copy it over to a local copy where one can store some temporary
-        * data and free it at the end. We need to modify ->sh_addr and
-        * ->sh_offset fields to keep track of permanent and temporary
-        * locations of sections.
-        */
-       sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr));
-       if (!sechdrs)
-               return -ENOMEM;
-
-       memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr));
-
-       /*
-        * We seem to have multiple copies of sections. The first copy is the
-        * one embedded in the kernel's read-only section. Some of these sections
-        * will be copied to a temporary buffer and relocated. And these
-        * sections will finally be copied to their final destination at
-        * segment load time.
-        *
-        * Use ->sh_offset to reflect section address in memory. It will
-        * point to original read only copy if section is not allocatable.
-        * Otherwise it will point to temporary copy which will be relocated.
-        *
-        * Use ->sh_addr to contain final address of the section where it
-        * will go during execution time.
-        */
-       for (i = 0; i < pi->ehdr->e_shnum; i++) {
-               if (sechdrs[i].sh_type == SHT_NOBITS)
-                       continue;
-
-               sechdrs[i].sh_offset = (unsigned long)pi->ehdr +
-                                               sechdrs[i].sh_offset;
-       }
-
-       /*
-        * Identify entry point section and make entry relative to section
-        * start.
-        */
-       entry = pi->ehdr->e_entry;
-       for (i = 0; i < pi->ehdr->e_shnum; i++) {
-               if (!(sechdrs[i].sh_flags & SHF_ALLOC))
-                       continue;
-
-               if (!(sechdrs[i].sh_flags & SHF_EXECINSTR))
-                       continue;
-
-               /* Make entry section relative */
-               if (sechdrs[i].sh_addr <= pi->ehdr->e_entry &&
-                   ((sechdrs[i].sh_addr + sechdrs[i].sh_size) >
-                    pi->ehdr->e_entry)) {
-                       entry_sidx = i;
-                       entry -= sechdrs[i].sh_addr;
-                       break;
-               }
-       }
-
-       /* Determine how much memory is needed to load relocatable object. */
-       buf_align = 1;
-       bss_align = 1;
-       buf_sz = 0;
-       bss_sz = 0;
-
-       for (i = 0; i < pi->ehdr->e_shnum; i++) {
-               if (!(sechdrs[i].sh_flags & SHF_ALLOC))
-                       continue;
-
-               align = sechdrs[i].sh_addralign;
-               if (sechdrs[i].sh_type != SHT_NOBITS) {
-                       if (buf_align < align)
-                               buf_align = align;
-                       buf_sz = ALIGN(buf_sz, align);
-                       buf_sz += sechdrs[i].sh_size;
-               } else {
-                       /* bss section */
-                       if (bss_align < align)
-                               bss_align = align;
-                       bss_sz = ALIGN(bss_sz, align);
-                       bss_sz += sechdrs[i].sh_size;
-               }
-       }
-
-       /* Determine the bss padding required to align bss properly */
-       bss_pad = 0;
-       if (buf_sz & (bss_align - 1))
-               bss_pad = bss_align - (buf_sz & (bss_align - 1));
-
-       memsz = buf_sz + bss_pad + bss_sz;
-
-       /* Allocate buffer for purgatory */
-       purgatory_buf = vzalloc(buf_sz);
-       if (!purgatory_buf) {
-               ret = -ENOMEM;
-               goto out;
-       }
-
-       if (buf_align < bss_align)
-               buf_align = bss_align;
-
-       /* Add buffer to segment list */
-       ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz,
-                               buf_align, min, max, top_down,
-                               &pi->purgatory_load_addr);
-       if (ret)
-               goto out;
-
-       /* Load SHF_ALLOC sections */
-       buf_addr = purgatory_buf;
-       load_addr = curr_load_addr = pi->purgatory_load_addr;
-       bss_addr = load_addr + buf_sz + bss_pad;
-
-       for (i = 0; i < pi->ehdr->e_shnum; i++) {
-               if (!(sechdrs[i].sh_flags & SHF_ALLOC))
-                       continue;
-
-               align = sechdrs[i].sh_addralign;
-               if (sechdrs[i].sh_type != SHT_NOBITS) {
-                       curr_load_addr = ALIGN(curr_load_addr, align);
-                       offset = curr_load_addr - load_addr;
-                       /* We already modified ->sh_offset to keep src addr */
-                       src = (char *) sechdrs[i].sh_offset;
-                       memcpy(buf_addr + offset, src, sechdrs[i].sh_size);
-
-                       /* Store load address and source address of section */
-                       sechdrs[i].sh_addr = curr_load_addr;
-
-                       /*
-                        * This section got copied to temporary buffer. Update
-                        * ->sh_offset accordingly.
-                        */
-                       sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset);
-
-                       /* Advance to the next address */
-                       curr_load_addr += sechdrs[i].sh_size;
-               } else {
-                       bss_addr = ALIGN(bss_addr, align);
-                       sechdrs[i].sh_addr = bss_addr;
-                       bss_addr += sechdrs[i].sh_size;
-               }
-       }
-
-       /* Update entry point based on load address of text section */
-       if (entry_sidx >= 0)
-               entry += sechdrs[entry_sidx].sh_addr;
-
-       /* Make kernel jump to purgatory after shutdown */
-       image->start = entry;
-
-       /* Used later to get/set symbol values */
-       pi->sechdrs = sechdrs;
-
-       /*
-        * Used later to identify which section is purgatory and skip it
-        * from checksumming.
-        */
-       pi->purgatory_buf = purgatory_buf;
-       return ret;
-out:
-       vfree(sechdrs);
-       vfree(purgatory_buf);
-       return ret;
-}
-
-static int kexec_apply_relocations(struct kimage *image)
-{
-       int i, ret;
-       struct purgatory_info *pi = &image->purgatory_info;
-       Elf_Shdr *sechdrs = pi->sechdrs;
-
-       /* Apply relocations */
-       for (i = 0; i < pi->ehdr->e_shnum; i++) {
-               Elf_Shdr *section, *symtab;
-
-               if (sechdrs[i].sh_type != SHT_RELA &&
-                   sechdrs[i].sh_type != SHT_REL)
-                       continue;
-
-               /*
-                * For sections of type SHT_RELA/SHT_REL, ->sh_link contains
-                * the section header index of the associated symbol table,
-                * and ->sh_info contains the section header index of the
-                * section to which the relocations apply.
-                */
-               if (sechdrs[i].sh_info >= pi->ehdr->e_shnum ||
-                   sechdrs[i].sh_link >= pi->ehdr->e_shnum)
-                       return -ENOEXEC;
-
-               section = &sechdrs[sechdrs[i].sh_info];
-               symtab = &sechdrs[sechdrs[i].sh_link];
-
-               if (!(section->sh_flags & SHF_ALLOC))
-                       continue;
-
-               /*
-                * symtab->sh_link contains the section header index of the
-                * associated string table.
-                */
-               if (symtab->sh_link >= pi->ehdr->e_shnum)
-                       /* Invalid section number? */
-                       continue;
-
-               /*
-                * Respective architecture needs to provide support for applying
-                * relocations of type SHT_RELA/SHT_REL.
-                */
-               if (sechdrs[i].sh_type == SHT_RELA)
-                       ret = arch_kexec_apply_relocations_add(pi->ehdr,
-                                                              sechdrs, i);
-               else if (sechdrs[i].sh_type == SHT_REL)
-                       ret = arch_kexec_apply_relocations(pi->ehdr,
-                                                          sechdrs, i);
-               if (ret)
-                       return ret;
-       }
-
-       return 0;
-}
-
-/* Load relocatable purgatory object and relocate it appropriately */
-int kexec_load_purgatory(struct kimage *image, unsigned long min,
-                        unsigned long max, int top_down,
-                        unsigned long *load_addr)
-{
-       struct purgatory_info *pi = &image->purgatory_info;
-       int ret;
-
-       if (kexec_purgatory_size <= 0)
-               return -EINVAL;
-
-       if (kexec_purgatory_size < sizeof(Elf_Ehdr))
-               return -ENOEXEC;
-
-       pi->ehdr = (Elf_Ehdr *)kexec_purgatory;
-
-       if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0
-           || pi->ehdr->e_type != ET_REL
-           || !elf_check_arch(pi->ehdr)
-           || pi->ehdr->e_shentsize != sizeof(Elf_Shdr))
-               return -ENOEXEC;
-
-       if (pi->ehdr->e_shoff >= kexec_purgatory_size
-           || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) >
-           kexec_purgatory_size - pi->ehdr->e_shoff))
-               return -ENOEXEC;
-
-       ret = __kexec_load_purgatory(image, min, max, top_down);
-       if (ret)
-               return ret;
-
-       ret = kexec_apply_relocations(image);
-       if (ret)
-               goto out;
-
-       *load_addr = pi->purgatory_load_addr;
-       return 0;
-out:
-       vfree(pi->sechdrs);
-       vfree(pi->purgatory_buf);
-       return ret;
-}
-
-static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
-                                           const char *name)
-{
-       Elf_Sym *syms;
-       Elf_Shdr *sechdrs;
-       Elf_Ehdr *ehdr;
-       int i, k;
-       const char *strtab;
-
-       if (!pi->sechdrs || !pi->ehdr)
-               return NULL;
-
-       sechdrs = pi->sechdrs;
-       ehdr = pi->ehdr;
-
-       for (i = 0; i < ehdr->e_shnum; i++) {
-               if (sechdrs[i].sh_type != SHT_SYMTAB)
-                       continue;
-
-               if (sechdrs[i].sh_link >= ehdr->e_shnum)
-                       /* Invalid strtab section number */
-                       continue;
-               strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset;
-               syms = (Elf_Sym *)sechdrs[i].sh_offset;
-
-               /* Go through symbols for a match */
-               for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) {
-                       if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL)
-                               continue;
-
-                       if (strcmp(strtab + syms[k].st_name, name) != 0)
-                               continue;
-
-                       if (syms[k].st_shndx == SHN_UNDEF ||
-                           syms[k].st_shndx >= ehdr->e_shnum) {
-                               pr_debug("Symbol: %s has bad section index %d.\n",
-                                               name, syms[k].st_shndx);
-                               return NULL;
-                       }
-
-                       /* Found the symbol we are looking for */
-                       return &syms[k];
-               }
-       }
-
-       return NULL;
-}
-
-void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name)
-{
-       struct purgatory_info *pi = &image->purgatory_info;
-       Elf_Sym *sym;
-       Elf_Shdr *sechdr;
-
-       sym = kexec_purgatory_find_symbol(pi, name);
-       if (!sym)
-               return ERR_PTR(-EINVAL);
-
-       sechdr = &pi->sechdrs[sym->st_shndx];
-
-       /*
-        * Returns the address where symbol will finally be loaded after
-        * kexec_load_segment()
-        */
-       return (void *)(sechdr->sh_addr + sym->st_value);
-}
-
-/*
- * Get or set value of a symbol. If "get_value" is true, symbol value is
- * returned in buf otherwise symbol value is set based on value in buf.
- */
-int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
-                                  void *buf, unsigned int size, bool get_value)
-{
-       Elf_Sym *sym;
-       Elf_Shdr *sechdrs;
-       struct purgatory_info *pi = &image->purgatory_info;
-       char *sym_buf;
-
-       sym = kexec_purgatory_find_symbol(pi, name);
-       if (!sym)
-               return -EINVAL;
-
-       if (sym->st_size != size) {
-               pr_err("symbol %s size mismatch: expected %lu actual %u\n",
-                      name, (unsigned long)sym->st_size, size);
-               return -EINVAL;
-       }
+       ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
+       for (i = 0; i < nr_segments; i++) {
+               result = copy_from_user(&in, &segments[i], sizeof(in));
+               if (result)
+                       return -EFAULT;
 
-       sechdrs = pi->sechdrs;
+               out.buf   = compat_ptr(in.buf);
+               out.bufsz = in.bufsz;
+               out.mem   = in.mem;
+               out.memsz = in.memsz;
 
-       if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
-               pr_err("symbol %s is in a bss section. Cannot %s\n", name,
-                      get_value ? "get" : "set");
-               return -EINVAL;
+               result = copy_to_user(&ksegments[i], &out, sizeof(out));
+               if (result)
+                       return -EFAULT;
        }
 
-       sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset +
-                                       sym->st_value;
-
-       if (get_value)
-               memcpy((void *)buf, sym_buf, size);
-       else
-               memcpy((void *)sym_buf, buf, size);
-
-       return 0;
+       return sys_kexec_load(entry, nr_segments, ksegments, flags);
 }
-#endif /* CONFIG_KEXEC_FILE */
-
-/*
- * Move into place and start executing a preloaded standalone
- * executable.  If nothing was preloaded return an error.
- */
-int kernel_kexec(void)
-{
-       int error = 0;
-
-       if (!mutex_trylock(&kexec_mutex))
-               return -EBUSY;
-       if (!kexec_image) {
-               error = -EINVAL;
-               goto Unlock;
-       }
-
-#ifdef CONFIG_KEXEC_JUMP
-       if (kexec_image->preserve_context) {
-               lock_system_sleep();
-               pm_prepare_console();
-               error = freeze_processes();
-               if (error) {
-                       error = -EBUSY;
-                       goto Restore_console;
-               }
-               suspend_console();
-               error = dpm_suspend_start(PMSG_FREEZE);
-               if (error)
-                       goto Resume_console;
-               /* At this point, dpm_suspend_start() has been called,
-                * but *not* dpm_suspend_end(). We *must* call
-                * dpm_suspend_end() now.  Otherwise, drivers for
-                * some devices (e.g. interrupt controllers) become
-                * desynchronized with the actual state of the
-                * hardware at resume time, and evil weirdness ensues.
-                */
-               error = dpm_suspend_end(PMSG_FREEZE);
-               if (error)
-                       goto Resume_devices;
-               error = disable_nonboot_cpus();
-               if (error)
-                       goto Enable_cpus;
-               local_irq_disable();
-               error = syscore_suspend();
-               if (error)
-                       goto Enable_irqs;
-       } else
-#endif
-       {
-               kexec_in_progress = true;
-               kernel_restart_prepare(NULL);
-               migrate_to_reboot_cpu();
-
-               /*
-                * migrate_to_reboot_cpu() disables CPU hotplug assuming that
-                * no further code needs to use CPU hotplug (which is true in
-                * the reboot case). However, the kexec path depends on using
-                * CPU hotplug again; so re-enable it here.
-                */
-               cpu_hotplug_enable();
-               pr_emerg("Starting new kernel\n");
-               machine_shutdown();
-       }
-
-       machine_kexec(kexec_image);
-
-#ifdef CONFIG_KEXEC_JUMP
-       if (kexec_image->preserve_context) {
-               syscore_resume();
- Enable_irqs:
-               local_irq_enable();
- Enable_cpus:
-               enable_nonboot_cpus();
-               dpm_resume_start(PMSG_RESTORE);
- Resume_devices:
-               dpm_resume_end(PMSG_RESTORE);
- Resume_console:
-               resume_console();
-               thaw_processes();
- Restore_console:
-               pm_restore_console();
-               unlock_system_sleep();
-       }
 #endif
-
- Unlock:
-       mutex_unlock(&kexec_mutex);
-       return error;
-}
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
new file mode 100644 (file)
index 0000000..201b453
--- /dev/null
@@ -0,0 +1,1534 @@
+/*
+ * kexec.c - kexec system call core code.
+ * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
+ */
+
+#define pr_fmt(fmt)    "kexec: " fmt
+
+#include <linux/capability.h>
+#include <linux/mm.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/kexec.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+#include <linux/syscalls.h>
+#include <linux/reboot.h>
+#include <linux/ioport.h>
+#include <linux/hardirq.h>
+#include <linux/elf.h>
+#include <linux/elfcore.h>
+#include <linux/utsname.h>
+#include <linux/numa.h>
+#include <linux/suspend.h>
+#include <linux/device.h>
+#include <linux/freezer.h>
+#include <linux/pm.h>
+#include <linux/cpu.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
+#include <linux/console.h>
+#include <linux/vmalloc.h>
+#include <linux/swap.h>
+#include <linux/syscore_ops.h>
+#include <linux/compiler.h>
+#include <linux/hugetlb.h>
+
+#include <asm/page.h>
+#include <asm/sections.h>
+
+#include <crypto/hash.h>
+#include <crypto/sha.h>
+#include "kexec_internal.h"
+
+DEFINE_MUTEX(kexec_mutex);
+
+/* Per cpu memory for storing cpu states in case of system crash. */
+note_buf_t __percpu *crash_notes;
+
+/* vmcoreinfo stuff */
+static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
+u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
+size_t vmcoreinfo_size;
+size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
+
+/* Flag to indicate we are going to kexec a new kernel */
+bool kexec_in_progress = false;
+
+
+/* Location of the reserved area for the crash kernel */
+struct resource crashk_res = {
+       .name  = "Crash kernel",
+       .start = 0,
+       .end   = 0,
+       .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+struct resource crashk_low_res = {
+       .name  = "Crash kernel",
+       .start = 0,
+       .end   = 0,
+       .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+int kexec_should_crash(struct task_struct *p)
+{
+       /*
+        * If crash_kexec_post_notifiers is enabled, don't run
+        * crash_kexec() here yet, which must be run after panic
+        * notifiers in panic().
+        */
+       if (crash_kexec_post_notifiers)
+               return 0;
+       /*
+        * There are 4 panic() calls in do_exit() path, each of which
+        * corresponds to each of these 4 conditions.
+        */
+       if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
+               return 1;
+       return 0;
+}
+
+/*
+ * When kexec transitions to the new kernel there is a one-to-one
+ * mapping between physical and virtual addresses.  On processors
+ * where you can disable the MMU this is trivial, and easy.  For
+ * others it is still a simple predictable page table to setup.
+ *
+ * In that environment kexec copies the new kernel to its final
+ * resting place.  This means I can only support memory whose
+ * physical address can fit in an unsigned long.  In particular
+ * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
+ * If the assembly stub has more restrictive requirements
+ * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
+ * defined more restrictively in <asm/kexec.h>.
+ *
+ * The code for the transition from the current kernel to
+ * the new kernel is placed in the control_code_buffer, whose size
+ * is given by KEXEC_CONTROL_PAGE_SIZE.  In the best case only a single
+ * page of memory is necessary, but some architectures require more.
+ * Because this memory must be identity mapped in the transition from
+ * virtual to physical addresses it must live in the range
+ * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
+ * modifiable.
+ *
+ * The assembly stub in the control code buffer is passed a linked list
+ * of descriptor pages detailing the source pages of the new kernel,
+ * and the destination addresses of those source pages.  As this data
+ * structure is not used in the context of the current OS, it must
+ * be self-contained.
+ *
+ * The code has been made to work with highmem pages and will use a
+ * destination page in its final resting place (if it happens
+ * to allocate it).  The end product of this is that most of the
+ * physical address space, and most of RAM can be used.
+ *
+ * Future directions include:
+ *  - allocating a page table with the control code buffer identity
+ *    mapped, to simplify machine_kexec and make kexec_on_panic more
+ *    reliable.
+ */
+
+/*
+ * KIMAGE_NO_DEST is an impossible destination address, used when
+ * allocating pages whose destination address we do not care about.
+ */
+#define KIMAGE_NO_DEST (-1UL)
+
+static struct page *kimage_alloc_page(struct kimage *image,
+                                      gfp_t gfp_mask,
+                                      unsigned long dest);
+
+int sanity_check_segment_list(struct kimage *image)
+{
+       int result, i;
+       unsigned long nr_segments = image->nr_segments;
+
+       /*
+        * Verify we have good destination addresses.  The caller is
+        * responsible for making certain we don't attempt to load
+        * the new image into invalid or reserved areas of RAM.  This
+        * just verifies it is an address we can use.
+        *
+        * Since the kernel does everything in page size chunks ensure
+        * the destination addresses are page aligned.  Too many
+        * special cases crop up when we don't do this.  The most
+        * insidious is getting overlapping destination addresses
+        * simply because addresses are changed to page size
+        * granularity.
+        */
+       result = -EADDRNOTAVAIL;
+       for (i = 0; i < nr_segments; i++) {
+               unsigned long mstart, mend;
+
+               mstart = image->segment[i].mem;
+               mend   = mstart + image->segment[i].memsz;
+               if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
+                       return result;
+               if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
+                       return result;
+       }
+
+       /* Verify our destination addresses do not overlap.
+        * If we allowed overlapping destination addresses
+        * through, very weird things can happen with no
+        * easy explanation as one segment stops on another.
+        */
+       result = -EINVAL;
+       for (i = 0; i < nr_segments; i++) {
+               unsigned long mstart, mend;
+               unsigned long j;
+
+               mstart = image->segment[i].mem;
+               mend   = mstart + image->segment[i].memsz;
+               for (j = 0; j < i; j++) {
+                       unsigned long pstart, pend;
+
+                       pstart = image->segment[j].mem;
+                       pend   = pstart + image->segment[j].memsz;
+                       /* Do the segments overlap ? */
+                       if ((mend > pstart) && (mstart < pend))
+                               return result;
+               }
+       }
+
+       /* Ensure our buffer sizes are strictly less than
+        * our memory sizes.  This should always be the case,
+        * and it is easier to check up front than to be surprised
+        * later on.
+        */
+       result = -EINVAL;
+       for (i = 0; i < nr_segments; i++) {
+               if (image->segment[i].bufsz > image->segment[i].memsz)
+                       return result;
+       }
+
+       /*
+        * Verify we have good destination addresses.  Normally
+        * the caller is responsible for making certain we don't
+        * attempt to load the new image into invalid or reserved
+        * areas of RAM.  But crash kernels are preloaded into a
+        * reserved area of ram.  We must ensure the addresses
+        * are in the reserved area otherwise preloading the
+        * kernel could corrupt things.
+        */
+
+       if (image->type == KEXEC_TYPE_CRASH) {
+               result = -EADDRNOTAVAIL;
+               for (i = 0; i < nr_segments; i++) {
+                       unsigned long mstart, mend;
+
+                       mstart = image->segment[i].mem;
+                       mend = mstart + image->segment[i].memsz - 1;
+                       /* Ensure we are within the crash kernel limits */
+                       if ((mstart < crashk_res.start) ||
+                           (mend > crashk_res.end))
+                               return result;
+               }
+       }
+
+       return 0;
+}
+
+struct kimage *do_kimage_alloc_init(void)
+{
+       struct kimage *image;
+
+       /* Allocate a controlling structure */
+       image = kzalloc(sizeof(*image), GFP_KERNEL);
+       if (!image)
+               return NULL;
+
+       image->head = 0;
+       image->entry = &image->head;
+       image->last_entry = &image->head;
+       image->control_page = ~0; /* By default this does not apply */
+       image->type = KEXEC_TYPE_DEFAULT;
+
+       /* Initialize the list of control pages */
+       INIT_LIST_HEAD(&image->control_pages);
+
+       /* Initialize the list of destination pages */
+       INIT_LIST_HEAD(&image->dest_pages);
+
+       /* Initialize the list of unusable pages */
+       INIT_LIST_HEAD(&image->unusable_pages);
+
+       return image;
+}
+
+int kimage_is_destination_range(struct kimage *image,
+                                       unsigned long start,
+                                       unsigned long end)
+{
+       unsigned long i;
+
+       for (i = 0; i < image->nr_segments; i++) {
+               unsigned long mstart, mend;
+
+               mstart = image->segment[i].mem;
+               mend = mstart + image->segment[i].memsz;
+               if ((end > mstart) && (start < mend))
+                       return 1;
+       }
+
+       return 0;
+}
+
+static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
+{
+       struct page *pages;
+
+       pages = alloc_pages(gfp_mask, order);
+       if (pages) {
+               unsigned int count, i;
+
+               pages->mapping = NULL;
+               set_page_private(pages, order);
+               count = 1 << order;
+               for (i = 0; i < count; i++)
+                       SetPageReserved(pages + i);
+       }
+
+       return pages;
+}
+
+static void kimage_free_pages(struct page *page)
+{
+       unsigned int order, count, i;
+
+       order = page_private(page);
+       count = 1 << order;
+       for (i = 0; i < count; i++)
+               ClearPageReserved(page + i);
+       __free_pages(page, order);
+}
+
+void kimage_free_page_list(struct list_head *list)
+{
+       struct list_head *pos, *next;
+
+       list_for_each_safe(pos, next, list) {
+               struct page *page;
+
+               page = list_entry(pos, struct page, lru);
+               list_del(&page->lru);
+               kimage_free_pages(page);
+       }
+}
+
+static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
+                                                       unsigned int order)
+{
+       /* Control pages are special, they are the intermediaries
+        * that are needed while we copy the rest of the pages
+        * to their final resting place.  As such they must
+        * not conflict with either the destination addresses
+        * or memory the kernel is already using.
+        *
+        * The only case where we really need more than one of
+        * these is for architectures where we cannot disable
+        * the MMU and must instead generate an identity mapped
+        * page table for all of the memory.
+        *
+        * At worst this runs in O(N) of the image size.
+        */
+       struct list_head extra_pages;
+       struct page *pages;
+       unsigned int count;
+
+       count = 1 << order;
+       INIT_LIST_HEAD(&extra_pages);
+
+       /* Loop while I can allocate a page and the page allocated
+        * is a destination page.
+        */
+       do {
+               unsigned long pfn, epfn, addr, eaddr;
+
+               pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order);
+               if (!pages)
+                       break;
+               pfn   = page_to_pfn(pages);
+               epfn  = pfn + count;
+               addr  = pfn << PAGE_SHIFT;
+               eaddr = epfn << PAGE_SHIFT;
+               if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
+                             kimage_is_destination_range(image, addr, eaddr)) {
+                       list_add(&pages->lru, &extra_pages);
+                       pages = NULL;
+               }
+       } while (!pages);
+
+       if (pages) {
+               /* Remember the allocated page... */
+               list_add(&pages->lru, &image->control_pages);
+
+               /* Because the page is already in its destination
+                * location we will never allocate another page at
+                * that address.  Therefore kimage_alloc_pages
+                * will not return it (again) and we don't need
+                * to give it an entry in image->segment[].
+                */
+       }
+       /* Deal with the destination pages I have inadvertently allocated.
+        *
+        * Ideally I would convert multi-page allocations into single
+        * page allocations, and add everything to image->dest_pages.
+        *
+        * For now it is simpler to just free the pages.
+        */
+       kimage_free_page_list(&extra_pages);
+
+       return pages;
+}
+
+static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
+                                                     unsigned int order)
+{
+       /* Control pages are special, they are the intermediaries
+        * that are needed while we copy the rest of the pages
+        * to their final resting place.  As such they must
+        * not conflict with either the destination addresses
+        * or memory the kernel is already using.
+        *
+        * Control pages are also the only pages we must allocate
+        * when loading a crash kernel.  All of the other pages
+        * are specified by the segments and we just memcpy
+        * into them directly.
+        *
+        * The only case where we really need more than one of
+        * these is for architectures where we cannot disable
+        * the MMU and must instead generate an identity mapped
+        * page table for all of the memory.
+        *
+        * Given the low demand this implements a very simple
+        * allocator that finds the first hole of the appropriate
+        * size in the reserved memory region, and allocates all
+        * of the memory up to and including the hole.
+        */
+       unsigned long hole_start, hole_end, size;
+       struct page *pages;
+
+       pages = NULL;
+       size = (1 << order) << PAGE_SHIFT;
+       hole_start = (image->control_page + (size - 1)) & ~(size - 1);
+       hole_end   = hole_start + size - 1;
+       while (hole_end <= crashk_res.end) {
+               unsigned long i;
+
+               if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
+                       break;
+               /* See if I overlap any of the segments */
+               for (i = 0; i < image->nr_segments; i++) {
+                       unsigned long mstart, mend;
+
+                       mstart = image->segment[i].mem;
+                       mend   = mstart + image->segment[i].memsz - 1;
+                       if ((hole_end >= mstart) && (hole_start <= mend)) {
+                               /* Advance the hole to the end of the segment */
+                               hole_start = (mend + (size - 1)) & ~(size - 1);
+                               hole_end   = hole_start + size - 1;
+                               break;
+                       }
+               }
+               /* If I don't overlap any segments I have found my hole! */
+               if (i == image->nr_segments) {
+                       pages = pfn_to_page(hole_start >> PAGE_SHIFT);
+                       image->control_page = hole_end;
+                       break;
+               }
+       }
+
+       return pages;
+}
+
+
+struct page *kimage_alloc_control_pages(struct kimage *image,
+                                        unsigned int order)
+{
+       struct page *pages = NULL;
+
+       switch (image->type) {
+       case KEXEC_TYPE_DEFAULT:
+               pages = kimage_alloc_normal_control_pages(image, order);
+               break;
+       case KEXEC_TYPE_CRASH:
+               pages = kimage_alloc_crash_control_pages(image, order);
+               break;
+       }
+
+       return pages;
+}
+
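+/*
+ * Append one entry to the kimage entry list.  When the current indirection
+ * page fills up, a fresh page is allocated, linked in via an IND_INDIRECTION
+ * entry, and the list continues on that page.
+ */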
+static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
+{
+       if (*image->entry != 0)
+               image->entry++;
+
+       if (image->entry == image->last_entry) {
+               kimage_entry_t *ind_page;
+               struct page *page;
+
+               page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
+               if (!page)
+                       return -ENOMEM;
+
+               ind_page = page_address(page);
+               *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
+               image->entry = ind_page;
+               image->last_entry = ind_page +
+                                     ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
+       }
+       *image->entry = entry;
+       image->entry++;
+       *image->entry = 0;
+
+       return 0;
+}
+
+static int kimage_set_destination(struct kimage *image,
+                                  unsigned long destination)
+{
+       int result;
+
+       destination &= PAGE_MASK;
+       result = kimage_add_entry(image, destination | IND_DESTINATION);
+
+       return result;
+}
+
+
+static int kimage_add_page(struct kimage *image, unsigned long page)
+{
+       int result;
+
+       page &= PAGE_MASK;
+       result = kimage_add_entry(image, page | IND_SOURCE);
+
+       return result;
+}
+
+
+static void kimage_free_extra_pages(struct kimage *image)
+{
+       /* Walk through and free any extra destination pages I may have */
+       kimage_free_page_list(&image->dest_pages);
+
+       /* Walk through and free any unusable pages I have cached */
+       kimage_free_page_list(&image->unusable_pages);
+
+}
+void kimage_terminate(struct kimage *image)
+{
+       if (*image->entry != 0)
+               image->entry++;
+
+       *image->entry = IND_DONE;
+}
+
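+/*
+ * Walk every entry in the kimage entry list: ordinary entries advance one
+ * slot at a time, IND_INDIRECTION entries chain to the next indirection
+ * page, and IND_DONE terminates the walk.
+ */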
+#define for_each_kimage_entry(image, ptr, entry) \
+       for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
+               ptr = (entry & IND_INDIRECTION) ? \
+                       phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
+
+static void kimage_free_entry(kimage_entry_t entry)
+{
+       struct page *page;
+
+       page = pfn_to_page(entry >> PAGE_SHIFT);
+       kimage_free_pages(page);
+}
+
+void kimage_free(struct kimage *image)
+{
+       kimage_entry_t *ptr, entry;
+       kimage_entry_t ind = 0;
+
+       if (!image)
+               return;
+
+       kimage_free_extra_pages(image);
+       for_each_kimage_entry(image, ptr, entry) {
+               if (entry & IND_INDIRECTION) {
+                       /* Free the previous indirection page */
+                       if (ind & IND_INDIRECTION)
+                               kimage_free_entry(ind);
+                       /* Save this indirection page until we are
+                        * done with it.
+                        */
+                       ind = entry;
+               } else if (entry & IND_SOURCE)
+                       kimage_free_entry(entry);
+       }
+       /* Free the final indirection page */
+       if (ind & IND_INDIRECTION)
+               kimage_free_entry(ind);
+
+       /* Handle any machine specific cleanup */
+       machine_kexec_cleanup(image);
+
+       /* Free the kexec control pages... */
+       kimage_free_page_list(&image->control_pages);
+
+       /*
+        * Free up any temporary buffers allocated. This might hit if an
+        * error occurred much later, after buffer allocation.
+        */
+       if (image->file_mode)
+               kimage_file_post_load_cleanup(image);
+
+       kfree(image);
+}
+
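+/*
+ * Scan the entry list for the source entry whose destination is @page.
+ * IND_DESTINATION entries reset the running destination address, and each
+ * IND_SOURCE entry consumes one page of it.
+ */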
+static kimage_entry_t *kimage_dst_used(struct kimage *image,
+                                       unsigned long page)
+{
+       kimage_entry_t *ptr, entry;
+       unsigned long destination = 0;
+
+       for_each_kimage_entry(image, ptr, entry) {
+               if (entry & IND_DESTINATION)
+                       destination = entry & PAGE_MASK;
+               else if (entry & IND_SOURCE) {
+                       if (page == destination)
+                               return ptr;
+                       destination += PAGE_SIZE;
+               }
+       }
+
+       return NULL;
+}
+
+static struct page *kimage_alloc_page(struct kimage *image,
+                                       gfp_t gfp_mask,
+                                       unsigned long destination)
+{
+       /*
+        * Here we implement safeguards to ensure that a source page
+        * is not copied to its destination page before the data on
+        * the destination page is no longer useful.
+        *
+        * To do this we maintain the invariant that a source page is
+        * either its own destination page, or it is not a
+        * destination page at all.
+        *
+        * That is slightly stronger than required, but the proof
+        * that no problems will occur is trivial, and the
+        * implementation is simple to verify.
+        *
+        * When allocating all pages normally this algorithm will run
+        * in O(N) time, but in the worst case it will run in O(N^2)
+        * time.   If the runtime is a problem the data structures can
+        * be fixed.
+        */
+       struct page *page;
+       unsigned long addr;
+
+       /*
+        * Walk through the list of destination pages, and see if I
+        * have a match.
+        */
+       list_for_each_entry(page, &image->dest_pages, lru) {
+               addr = page_to_pfn(page) << PAGE_SHIFT;
+               if (addr == destination) {
+                       list_del(&page->lru);
+                       return page;
+               }
+       }
+       page = NULL;
+       while (1) {
+               kimage_entry_t *old;
+
+               /* Allocate a page, if we run out of memory give up */
+               page = kimage_alloc_pages(gfp_mask, 0);
+               if (!page)
+                       return NULL;
+               /* If the page cannot be used file it away */
+               if (page_to_pfn(page) >
+                               (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
+                       list_add(&page->lru, &image->unusable_pages);
+                       continue;
+               }
+               addr = page_to_pfn(page) << PAGE_SHIFT;
+
+               /* If it is the destination page we want, use it */
+               if (addr == destination)
+                       break;
+
+               /* If the page is not a destination page use it */
+               if (!kimage_is_destination_range(image, addr,
+                                                 addr + PAGE_SIZE))
+                       break;
+
+               /*
+                * I know that the page is someones destination page.
+                * See if there is already a source page for this
+                * destination page.  And if so swap the source pages.
+                */
+               old = kimage_dst_used(image, addr);
+               if (old) {
+                       /* If so move it */
+                       unsigned long old_addr;
+                       struct page *old_page;
+
+                       old_addr = *old & PAGE_MASK;
+                       old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
+                       copy_highpage(page, old_page);
+                       *old = addr | (*old & ~PAGE_MASK);
+
+                       /* The old page I have found cannot be a
+                        * destination page, so return it if its
+                        * gfp_flags honor the ones passed in.
+                        */
+                       if (!(gfp_mask & __GFP_HIGHMEM) &&
+                           PageHighMem(old_page)) {
+                               kimage_free_pages(old_page);
+                               continue;
+                       }
+                       addr = old_addr;
+                       page = old_page;
+                       break;
+               }
+               /* Place the page on the destination list, to be used later */
+               list_add(&page->lru, &image->dest_pages);
+       }
+
+       return page;
+}
+
+static int kimage_load_normal_segment(struct kimage *image,
+                                        struct kexec_segment *segment)
+{
+       unsigned long maddr;
+       size_t ubytes, mbytes;
+       int result;
+       unsigned char __user *buf = NULL;
+       unsigned char *kbuf = NULL;
+
+       result = 0;
+       if (image->file_mode)
+               kbuf = segment->kbuf;
+       else
+               buf = segment->buf;
+       ubytes = segment->bufsz;
+       mbytes = segment->memsz;
+       maddr = segment->mem;
+
+       result = kimage_set_destination(image, maddr);
+       if (result < 0)
+               goto out;
+
+       while (mbytes) {
+               struct page *page;
+               char *ptr;
+               size_t uchunk, mchunk;
+
+               page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
+               if (!page) {
+                       result  = -ENOMEM;
+                       goto out;
+               }
+               result = kimage_add_page(image, page_to_pfn(page)
+                                                               << PAGE_SHIFT);
+               if (result < 0)
+                       goto out;
+
+               ptr = kmap(page);
+               /* Start with a clear page */
+               clear_page(ptr);
+               ptr += maddr & ~PAGE_MASK;
+               mchunk = min_t(size_t, mbytes,
+                               PAGE_SIZE - (maddr & ~PAGE_MASK));
+               uchunk = min(ubytes, mchunk);
+
+               /* For file based kexec, source pages are in kernel memory */
+               if (image->file_mode)
+                       memcpy(ptr, kbuf, uchunk);
+               else
+                       result = copy_from_user(ptr, buf, uchunk);
+               kunmap(page);
+               if (result) {
+                       result = -EFAULT;
+                       goto out;
+               }
+               ubytes -= uchunk;
+               maddr  += mchunk;
+               if (image->file_mode)
+                       kbuf += mchunk;
+               else
+                       buf += mchunk;
+               mbytes -= mchunk;
+       }
+out:
+       return result;
+}
+
+static int kimage_load_crash_segment(struct kimage *image,
+                                       struct kexec_segment *segment)
+{
+       /* For crash dump kernels we simply copy the data from
+        * user space to its destination.
+        * We do things a page at a time for the sake of kmap.
+        */
+       unsigned long maddr;
+       size_t ubytes, mbytes;
+       int result;
+       unsigned char __user *buf = NULL;
+       unsigned char *kbuf = NULL;
+
+       result = 0;
+       if (image->file_mode)
+               kbuf = segment->kbuf;
+       else
+               buf = segment->buf;
+       ubytes = segment->bufsz;
+       mbytes = segment->memsz;
+       maddr = segment->mem;
+       while (mbytes) {
+               struct page *page;
+               char *ptr;
+               size_t uchunk, mchunk;
+
+               page = pfn_to_page(maddr >> PAGE_SHIFT);
+               if (!page) {
+                       result  = -ENOMEM;
+                       goto out;
+               }
+               ptr = kmap(page);
+               ptr += maddr & ~PAGE_MASK;
+               mchunk = min_t(size_t, mbytes,
+                               PAGE_SIZE - (maddr & ~PAGE_MASK));
+               uchunk = min(ubytes, mchunk);
+               if (mchunk > uchunk) {
+                       /* Zero the trailing part of the page */
+                       memset(ptr + uchunk, 0, mchunk - uchunk);
+               }
+
+               /* For file based kexec, source pages are in kernel memory */
+               if (image->file_mode)
+                       memcpy(ptr, kbuf, uchunk);
+               else
+                       result = copy_from_user(ptr, buf, uchunk);
+               kexec_flush_icache_page(page);
+               kunmap(page);
+               if (result) {
+                       result = -EFAULT;
+                       goto out;
+               }
+               ubytes -= uchunk;
+               maddr  += mchunk;
+               if (image->file_mode)
+                       kbuf += mchunk;
+               else
+                       buf += mchunk;
+               mbytes -= mchunk;
+       }
+out:
+       return result;
+}
+
+int kimage_load_segment(struct kimage *image,
+                               struct kexec_segment *segment)
+{
+       int result = -ENOMEM;
+
+       switch (image->type) {
+       case KEXEC_TYPE_DEFAULT:
+               result = kimage_load_normal_segment(image, segment);
+               break;
+       case KEXEC_TYPE_CRASH:
+               result = kimage_load_crash_segment(image, segment);
+               break;
+       }
+
+       return result;
+}
+
+struct kimage *kexec_image;
+struct kimage *kexec_crash_image;
+int kexec_load_disabled;
+
+void crash_kexec(struct pt_regs *regs)
+{
+       /* Take the kexec_mutex here to prevent sys_kexec_load
+        * running on one cpu from replacing the crash kernel
+        * we are using after a panic on a different cpu.
+        *
+        * If the crash kernel was not located in a fixed area
+        * of memory the xchg(&kexec_crash_image) would be
+        * sufficient.  But since I reuse the memory...
+        */
+       if (mutex_trylock(&kexec_mutex)) {
+               if (kexec_crash_image) {
+                       struct pt_regs fixed_regs;
+
+                       crash_setup_regs(&fixed_regs, regs);
+                       crash_save_vmcoreinfo();
+                       machine_crash_shutdown(&fixed_regs);
+                       machine_kexec(kexec_crash_image);
+               }
+               mutex_unlock(&kexec_mutex);
+       }
+}
+
+size_t crash_get_memory_size(void)
+{
+       size_t size = 0;
+
+       mutex_lock(&kexec_mutex);
+       if (crashk_res.end != crashk_res.start)
+               size = resource_size(&crashk_res);
+       mutex_unlock(&kexec_mutex);
+       return size;
+}
+
+void __weak crash_free_reserved_phys_range(unsigned long begin,
+                                          unsigned long end)
+{
+       unsigned long addr;
+
+       for (addr = begin; addr < end; addr += PAGE_SIZE)
+               free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
+}
+
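+/*
+ * Shrink the crashkernel reservation to new_size, handing the freed tail
+ * back to the iomem tree as "System RAM".  Fails with -ENOENT while a
+ * crash kernel is currently loaded.
+ */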
+int crash_shrink_memory(unsigned long new_size)
+{
+       int ret = 0;
+       unsigned long start, end;
+       unsigned long old_size;
+       struct resource *ram_res;
+
+       mutex_lock(&kexec_mutex);
+
+       if (kexec_crash_image) {
+               ret = -ENOENT;
+               goto unlock;
+       }
+       start = crashk_res.start;
+       end = crashk_res.end;
+       old_size = (end == 0) ? 0 : end - start + 1;
+       if (new_size >= old_size) {
+               ret = (new_size == old_size) ? 0 : -EINVAL;
+               goto unlock;
+       }
+
+       ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
+       if (!ram_res) {
+               ret = -ENOMEM;
+               goto unlock;
+       }
+
+       start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
+       end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);
+
+       crash_map_reserved_pages();
+       crash_free_reserved_phys_range(end, crashk_res.end);
+
+       if ((start == end) && (crashk_res.parent != NULL))
+               release_resource(&crashk_res);
+
+       ram_res->start = end;
+       ram_res->end = crashk_res.end;
+       ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
+       ram_res->name = "System RAM";
+
+       crashk_res.end = end - 1;
+
+       insert_resource(&iomem_resource, ram_res);
+       crash_unmap_reserved_pages();
+
+unlock:
+       mutex_unlock(&kexec_mutex);
+       return ret;
+}
+
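+/*
+ * Pack one ELF note (header, name, descriptor) into a u32 buffer.  Each
+ * (len + 3)/4 below rounds a byte length up to whole 32-bit words, which
+ * is the alignment ELF notes require.
+ */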
+static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
+                           size_t data_len)
+{
+       struct elf_note note;
+
+       note.n_namesz = strlen(name) + 1;
+       note.n_descsz = data_len;
+       note.n_type   = type;
+       memcpy(buf, &note, sizeof(note));
+       buf += (sizeof(note) + 3)/4;
+       memcpy(buf, name, note.n_namesz);
+       buf += (note.n_namesz + 3)/4;
+       memcpy(buf, data, note.n_descsz);
+       buf += (note.n_descsz + 3)/4;
+
+       return buf;
+}
+
+static void final_note(u32 *buf)
+{
+       struct elf_note note;
+
+       note.n_namesz = 0;
+       note.n_descsz = 0;
+       note.n_type   = 0;
+       memcpy(buf, &note, sizeof(note));
+}
+
+void crash_save_cpu(struct pt_regs *regs, int cpu)
+{
+       struct elf_prstatus prstatus;
+       u32 *buf;
+
+       if ((cpu < 0) || (cpu >= nr_cpu_ids))
+               return;
+
+       /* Using ELF notes here is opportunistic.
+        * I need a well defined structure format
+        * for the data I pass, and I need tags
+        * on the data to indicate what information I have
+        * squirrelled away.  ELF notes happen to provide
+        * all of that, so there is no need to invent something new.
+        */
+       buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
+       if (!buf)
+               return;
+       memset(&prstatus, 0, sizeof(prstatus));
+       prstatus.pr_pid = current->pid;
+       elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
+       buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
+                             &prstatus, sizeof(prstatus));
+       final_note(buf);
+}
+
+static int __init crash_notes_memory_init(void)
+{
+       /* Allocate memory for saving cpu registers. */
+       size_t size, align;
+
+       /*
+        * crash_notes could be allocated across 2 vmalloc pages when percpu
+        * is vmalloc based. vmalloc doesn't guarantee that 2 contiguous vmalloc
+        * pages are also on 2 contiguous physical pages. In this case the
+        * 2nd part of crash_notes in the 2nd page could be lost since only the
+        * starting address and size of crash_notes are exported through sysfs.
+        * Here round up the size of crash_notes to the nearest power of two
+        * and pass it to __alloc_percpu as align value. This can make sure
+        * crash_notes is allocated inside one physical page.
+        */
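+       /*
+        * For example, with a hypothetical 428-byte note_buf_t the align
+        * below becomes 512, so each per-cpu note sits inside a single
+        * 512-byte-aligned block and cannot straddle a page boundary.
+        */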
+       size = sizeof(note_buf_t);
+       align = min(roundup_pow_of_two(sizeof(note_buf_t)), PAGE_SIZE);
+
+       /*
+        * Break compile if size is bigger than PAGE_SIZE since crash_notes
+        * definitely will be in 2 pages with that.
+        */
+       BUILD_BUG_ON(size > PAGE_SIZE);
+
+       crash_notes = __alloc_percpu(size, align);
+       if (!crash_notes) {
+               pr_warn("Kexec: Memory allocation for saving cpu register states failed\n");
+               return -ENOMEM;
+       }
+       return 0;
+}
+subsys_initcall(crash_notes_memory_init);
+
+
+/*
+ * parsing the "crashkernel" commandline
+ *
+ * this code is intended to be called from architecture specific code
+ */
+
+
+/*
+ * This function parses command lines in the format
+ *
+ *   crashkernel=ramsize-range:size[,...][@offset]
+ *
+ * The function returns 0 on success and -EINVAL on failure.
+ */
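+/*
+ * Illustrative example (values chosen here, not mandated by this code):
+ *
+ *   crashkernel=512M-2G:64M,2G-:128M@16M
+ *
+ * reserves 64M when system RAM falls in [512M, 2G), 128M when it is 2G or
+ * more, and places the reservation at physical address 16M.
+ */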
+static int __init parse_crashkernel_mem(char *cmdline,
+                                       unsigned long long system_ram,
+                                       unsigned long long *crash_size,
+                                       unsigned long long *crash_base)
+{
+       char *cur = cmdline, *tmp;
+
+       /* for each entry of the comma-separated list */
+       do {
+               unsigned long long start, end = ULLONG_MAX, size;
+
+               /* get the start of the range */
+               start = memparse(cur, &tmp);
+               if (cur == tmp) {
+                       pr_warn("crashkernel: Memory value expected\n");
+                       return -EINVAL;
+               }
+               cur = tmp;
+               if (*cur != '-') {
+                       pr_warn("crashkernel: '-' expected\n");
+                       return -EINVAL;
+               }
+               cur++;
+
+               /* if no ':' is here, then we read the end */
+               if (*cur != ':') {
+                       end = memparse(cur, &tmp);
+                       if (cur == tmp) {
+                               pr_warn("crashkernel: Memory value expected\n");
+                               return -EINVAL;
+                       }
+                       cur = tmp;
+                       if (end <= start) {
+                               pr_warn("crashkernel: end <= start\n");
+                               return -EINVAL;
+                       }
+               }
+
+               if (*cur != ':') {
+                       pr_warn("crashkernel: ':' expected\n");
+                       return -EINVAL;
+               }
+               cur++;
+
+               size = memparse(cur, &tmp);
+               if (cur == tmp) {
+                       pr_warn("Memory value expected\n");
+                       return -EINVAL;
+               }
+               cur = tmp;
+               if (size >= system_ram) {
+                       pr_warn("crashkernel: invalid size\n");
+                       return -EINVAL;
+               }
+
+               /* match ? */
+               if (system_ram >= start && system_ram < end) {
+                       *crash_size = size;
+                       break;
+               }
+       } while (*cur++ == ',');
+
+       if (*crash_size > 0) {
+               while (*cur && *cur != ' ' && *cur != '@')
+                       cur++;
+               if (*cur == '@') {
+                       cur++;
+                       *crash_base = memparse(cur, &tmp);
+                       if (cur == tmp) {
+                               pr_warn("Memory value expected after '@'\n");
+                               return -EINVAL;
+                       }
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * This function parses "simple" (old) crashkernel command lines like
+ *
+ *     crashkernel=size[@offset]
+ *
+ * It returns 0 on success and -EINVAL on failure.
+ */
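+/* e.g. crashkernel=128M@16M (illustrative) reserves 128M at offset 16M. */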
+static int __init parse_crashkernel_simple(char *cmdline,
+                                          unsigned long long *crash_size,
+                                          unsigned long long *crash_base)
+{
+       char *cur = cmdline;
+
+       *crash_size = memparse(cmdline, &cur);
+       if (cmdline == cur) {
+               pr_warn("crashkernel: memory value expected\n");
+               return -EINVAL;
+       }
+
+       if (*cur == '@')
+               *crash_base = memparse(cur+1, &cur);
+       else if (*cur != ' ' && *cur != '\0') {
+               pr_warn("crashkernel: unrecognized char\n");
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+#define SUFFIX_HIGH 0
+#define SUFFIX_LOW  1
+#define SUFFIX_NULL 2
+static __initdata char *suffix_tbl[] = {
+       [SUFFIX_HIGH] = ",high",
+       [SUFFIX_LOW]  = ",low",
+       [SUFFIX_NULL] = NULL,
+};
+
+/*
+ * This function parses "suffix" crashkernel command lines like
+ *
+ *     crashkernel=size,[high|low]
+ *
+ * It returns 0 on success and -EINVAL on failure.
+ */
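+/* e.g. crashkernel=256M,high or crashkernel=72M,low (illustrative values). */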
+static int __init parse_crashkernel_suffix(char *cmdline,
+                                          unsigned long long   *crash_size,
+                                          const char *suffix)
+{
+       char *cur = cmdline;
+
+       *crash_size = memparse(cmdline, &cur);
+       if (cmdline == cur) {
+               pr_warn("crashkernel: memory value expected\n");
+               return -EINVAL;
+       }
+
+       /* check with suffix */
+       if (strncmp(cur, suffix, strlen(suffix))) {
+               pr_warn("crashkernel: unrecognized char\n");
+               return -EINVAL;
+       }
+       cur += strlen(suffix);
+       if (*cur != ' ' && *cur != '\0') {
+               pr_warn("crashkernel: unrecognized char\n");
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
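+/*
+ * Return the last "crashkernel=" option on the command line that matches
+ * the given suffix (or, when suffix is NULL, the last one without any known
+ * suffix).  For example, "crashkernel=64M crashkernel=128M" picks the 128M
+ * entry.
+ */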
+static __init char *get_last_crashkernel(char *cmdline,
+                            const char *name,
+                            const char *suffix)
+{
+       char *p = cmdline, *ck_cmdline = NULL;
+
+       /* find crashkernel and use the last one if there are more */
+       p = strstr(p, name);
+       while (p) {
+               char *end_p = strchr(p, ' ');
+               char *q;
+
+               if (!end_p)
+                       end_p = p + strlen(p);
+
+               if (!suffix) {
+                       int i;
+
+                       /* skip the one with any known suffix */
+                       for (i = 0; suffix_tbl[i]; i++) {
+                               q = end_p - strlen(suffix_tbl[i]);
+                               if (!strncmp(q, suffix_tbl[i],
+                                            strlen(suffix_tbl[i])))
+                                       goto next;
+                       }
+                       ck_cmdline = p;
+               } else {
+                       q = end_p - strlen(suffix);
+                       if (!strncmp(q, suffix, strlen(suffix)))
+                               ck_cmdline = p;
+               }
+next:
+               p = strstr(p+1, name);
+       }
+
+       if (!ck_cmdline)
+               return NULL;
+
+       return ck_cmdline;
+}
+
+static int __init __parse_crashkernel(char *cmdline,
+                            unsigned long long system_ram,
+                            unsigned long long *crash_size,
+                            unsigned long long *crash_base,
+                            const char *name,
+                            const char *suffix)
+{
+       char    *first_colon, *first_space;
+       char    *ck_cmdline;
+
+       BUG_ON(!crash_size || !crash_base);
+       *crash_size = 0;
+       *crash_base = 0;
+
+       ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
+
+       if (!ck_cmdline)
+               return -EINVAL;
+
+       ck_cmdline += strlen(name);
+
+       if (suffix)
+               return parse_crashkernel_suffix(ck_cmdline, crash_size,
+                               suffix);
+       /*
+        * if the commandline contains a ':', then that's the extended
+        * syntax -- if not, it must be the classic syntax
+        */
+       first_colon = strchr(ck_cmdline, ':');
+       first_space = strchr(ck_cmdline, ' ');
+       if (first_colon && (!first_space || first_colon < first_space))
+               return parse_crashkernel_mem(ck_cmdline, system_ram,
+                               crash_size, crash_base);
+
+       return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
+}
+
+/*
+ * This function is the entry point for command line parsing and should be
+ * called from the arch-specific code.
+ */
+int __init parse_crashkernel(char *cmdline,
+                            unsigned long long system_ram,
+                            unsigned long long *crash_size,
+                            unsigned long long *crash_base)
+{
+       return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+                                       "crashkernel=", NULL);
+}
+
+int __init parse_crashkernel_high(char *cmdline,
+                            unsigned long long system_ram,
+                            unsigned long long *crash_size,
+                            unsigned long long *crash_base)
+{
+       return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+                               "crashkernel=", suffix_tbl[SUFFIX_HIGH]);
+}
+
+int __init parse_crashkernel_low(char *cmdline,
+                            unsigned long long system_ram,
+                            unsigned long long *crash_size,
+                            unsigned long long *crash_base)
+{
+       return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+                               "crashkernel=", suffix_tbl[SUFFIX_LOW]);
+}
+
+static void update_vmcoreinfo_note(void)
+{
+       u32 *buf = vmcoreinfo_note;
+
+       if (!vmcoreinfo_size)
+               return;
+       buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
+                             vmcoreinfo_size);
+       final_note(buf);
+}
+
+void crash_save_vmcoreinfo(void)
+{
+       vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
+       update_vmcoreinfo_note();
+}
+
+void vmcoreinfo_append_str(const char *fmt, ...)
+{
+       va_list args;
+       char buf[0x50];
+       size_t r;
+
+       va_start(args, fmt);
+       r = vscnprintf(buf, sizeof(buf), fmt, args);
+       va_end(args);
+
+       r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
+
+       memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
+
+       vmcoreinfo_size += r;
+}
+
+/*
+ * provide an empty default implementation here -- architecture
+ * code may override this
+ */
+void __weak arch_crash_save_vmcoreinfo(void)
+{}
+
+unsigned long __weak paddr_vmcoreinfo_note(void)
+{
+       return __pa((unsigned long)(char *)&vmcoreinfo_note);
+}
+
+static int __init crash_save_vmcoreinfo_init(void)
+{
+       VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
+       VMCOREINFO_PAGESIZE(PAGE_SIZE);
+
+       VMCOREINFO_SYMBOL(init_uts_ns);
+       VMCOREINFO_SYMBOL(node_online_map);
+#ifdef CONFIG_MMU
+       VMCOREINFO_SYMBOL(swapper_pg_dir);
+#endif
+       VMCOREINFO_SYMBOL(_stext);
+       VMCOREINFO_SYMBOL(vmap_area_list);
+
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+       VMCOREINFO_SYMBOL(mem_map);
+       VMCOREINFO_SYMBOL(contig_page_data);
+#endif
+#ifdef CONFIG_SPARSEMEM
+       VMCOREINFO_SYMBOL(mem_section);
+       VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
+       VMCOREINFO_STRUCT_SIZE(mem_section);
+       VMCOREINFO_OFFSET(mem_section, section_mem_map);
+#endif
+       VMCOREINFO_STRUCT_SIZE(page);
+       VMCOREINFO_STRUCT_SIZE(pglist_data);
+       VMCOREINFO_STRUCT_SIZE(zone);
+       VMCOREINFO_STRUCT_SIZE(free_area);
+       VMCOREINFO_STRUCT_SIZE(list_head);
+       VMCOREINFO_SIZE(nodemask_t);
+       VMCOREINFO_OFFSET(page, flags);
+       VMCOREINFO_OFFSET(page, _count);
+       VMCOREINFO_OFFSET(page, mapping);
+       VMCOREINFO_OFFSET(page, lru);
+       VMCOREINFO_OFFSET(page, _mapcount);
+       VMCOREINFO_OFFSET(page, private);
+       VMCOREINFO_OFFSET(pglist_data, node_zones);
+       VMCOREINFO_OFFSET(pglist_data, nr_zones);
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
+       VMCOREINFO_OFFSET(pglist_data, node_mem_map);
+#endif
+       VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
+       VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
+       VMCOREINFO_OFFSET(pglist_data, node_id);
+       VMCOREINFO_OFFSET(zone, free_area);
+       VMCOREINFO_OFFSET(zone, vm_stat);
+       VMCOREINFO_OFFSET(zone, spanned_pages);
+       VMCOREINFO_OFFSET(free_area, free_list);
+       VMCOREINFO_OFFSET(list_head, next);
+       VMCOREINFO_OFFSET(list_head, prev);
+       VMCOREINFO_OFFSET(vmap_area, va_start);
+       VMCOREINFO_OFFSET(vmap_area, list);
+       VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
+       log_buf_kexec_setup();
+       VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
+       VMCOREINFO_NUMBER(NR_FREE_PAGES);
+       VMCOREINFO_NUMBER(PG_lru);
+       VMCOREINFO_NUMBER(PG_private);
+       VMCOREINFO_NUMBER(PG_swapcache);
+       VMCOREINFO_NUMBER(PG_slab);
+#ifdef CONFIG_MEMORY_FAILURE
+       VMCOREINFO_NUMBER(PG_hwpoison);
+#endif
+       VMCOREINFO_NUMBER(PG_head_mask);
+       VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
+#ifdef CONFIG_X86
+       VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE);
+#endif
+#ifdef CONFIG_HUGETLBFS
+       VMCOREINFO_SYMBOL(free_huge_page);
+#endif
+
+       arch_crash_save_vmcoreinfo();
+       update_vmcoreinfo_note();
+
+       return 0;
+}
+
+subsys_initcall(crash_save_vmcoreinfo_init);
+
+/*
+ * Move into place and start executing a preloaded standalone
+ * executable.  If nothing was preloaded return an error.
+ */
+int kernel_kexec(void)
+{
+       int error = 0;
+
+       if (!mutex_trylock(&kexec_mutex))
+               return -EBUSY;
+       if (!kexec_image) {
+               error = -EINVAL;
+               goto Unlock;
+       }
+
+#ifdef CONFIG_KEXEC_JUMP
+       if (kexec_image->preserve_context) {
+               lock_system_sleep();
+               pm_prepare_console();
+               error = freeze_processes();
+               if (error) {
+                       error = -EBUSY;
+                       goto Restore_console;
+               }
+               suspend_console();
+               error = dpm_suspend_start(PMSG_FREEZE);
+               if (error)
+                       goto Resume_console;
+               /* At this point, dpm_suspend_start() has been called,
+                * but *not* dpm_suspend_end(). We *must* call
+                * dpm_suspend_end() now.  Otherwise, drivers for
+                * some devices (e.g. interrupt controllers) become
+                * desynchronized with the actual state of the
+                * hardware at resume time, and evil weirdness ensues.
+                */
+               error = dpm_suspend_end(PMSG_FREEZE);
+               if (error)
+                       goto Resume_devices;
+               error = disable_nonboot_cpus();
+               if (error)
+                       goto Enable_cpus;
+               local_irq_disable();
+               error = syscore_suspend();
+               if (error)
+                       goto Enable_irqs;
+       } else
+#endif
+       {
+               kexec_in_progress = true;
+               kernel_restart_prepare(NULL);
+               migrate_to_reboot_cpu();
+
+               /*
+                * migrate_to_reboot_cpu() disables CPU hotplug assuming that
+                * no further code needs to use CPU hotplug (which is true in
+                * the reboot case). However, the kexec path depends on using
+                * CPU hotplug again; so re-enable it here.
+                */
+               cpu_hotplug_enable();
+               pr_emerg("Starting new kernel\n");
+               machine_shutdown();
+       }
+
+       machine_kexec(kexec_image);
+
+#ifdef CONFIG_KEXEC_JUMP
+       if (kexec_image->preserve_context) {
+               syscore_resume();
+ Enable_irqs:
+               local_irq_enable();
+ Enable_cpus:
+               enable_nonboot_cpus();
+               dpm_resume_start(PMSG_RESTORE);
+ Resume_devices:
+               dpm_resume_end(PMSG_RESTORE);
+ Resume_console:
+               resume_console();
+               thaw_processes();
+ Restore_console:
+               pm_restore_console();
+               unlock_system_sleep();
+       }
+#endif
+
+ Unlock:
+       mutex_unlock(&kexec_mutex);
+       return error;
+}
+
+/*
+ * Add and remove page tables for crashkernel memory
+ *
+ * Provide an empty default implementation here -- architecture
+ * code may override this
+ */
+void __weak crash_map_reserved_pages(void)
+{}
+
+void __weak crash_unmap_reserved_pages(void)
+{}
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
new file mode 100644 (file)
index 0000000..6a9a3f2
--- /dev/null
@@ -0,0 +1,1045 @@
+/*
+ * kexec: kexec_file_load system call
+ *
+ * Copyright (C) 2014 Red Hat Inc.
+ * Authors:
+ *      Vivek Goyal <vgoyal@redhat.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
+ */
+
+#include <linux/capability.h>
+#include <linux/mm.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/kexec.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <crypto/hash.h>
+#include <crypto/sha.h>
+#include <linux/syscalls.h>
+#include <linux/vmalloc.h>
+#include "kexec_internal.h"
+
+/*
+ * Declare these symbols weak so that if the architecture provides a
+ * purgatory, they will be overridden.
+ */
+char __weak kexec_purgatory[0];
+size_t __weak kexec_purgatory_size = 0;
+
+static int kexec_calculate_store_digests(struct kimage *image);
+
+static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len)
+{
+       struct fd f = fdget(fd);
+       int ret;
+       struct kstat stat;
+       loff_t pos;
+       ssize_t bytes = 0;
+
+       if (!f.file)
+               return -EBADF;
+
+       ret = vfs_getattr(&f.file->f_path, &stat);
+       if (ret)
+               goto out;
+
+       if (stat.size > INT_MAX) {
+               ret = -EFBIG;
+               goto out;
+       }
+
+       /* Don't hand 0 to vmalloc, it whines. */
+       if (stat.size == 0) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       *buf = vmalloc(stat.size);
+       if (!*buf) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       pos = 0;
+       while (pos < stat.size) {
+               bytes = kernel_read(f.file, pos, (char *)(*buf) + pos,
+                                   stat.size - pos);
+               if (bytes < 0) {
+                       vfree(*buf);
+                       ret = bytes;
+                       goto out;
+               }
+
+               if (bytes == 0)
+                       break;
+               pos += bytes;
+       }
+
+       if (pos != stat.size) {
+               ret = -EBADF;
+               vfree(*buf);
+               goto out;
+       }
+
+       *buf_len = pos;
+out:
+       fdput(f);
+       return ret;
+}
+
+/* Architectures can provide this probe function */
+int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
+                                        unsigned long buf_len)
+{
+       return -ENOEXEC;
+}
+
+void * __weak arch_kexec_kernel_image_load(struct kimage *image)
+{
+       return ERR_PTR(-ENOEXEC);
+}
+
+int __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
+{
+       return -EINVAL;
+}
+
+int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
+                                       unsigned long buf_len)
+{
+       return -EKEYREJECTED;
+}
+
+/* Apply relocations of type RELA */
+int __weak
+arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
+                                unsigned int relsec)
+{
+       pr_err("RELA relocation unsupported.\n");
+       return -ENOEXEC;
+}
+
+/* Apply relocations of type REL */
+int __weak
+arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
+                            unsigned int relsec)
+{
+       pr_err("REL relocation unsupported.\n");
+       return -ENOEXEC;
+}
+
+/*
+ * Free up memory used by the kernel, initrd, and command line. These are
+ * temporary allocations that are no longer needed once the buffers have
+ * been loaded into separate segments and copied elsewhere.
+ */
+void kimage_file_post_load_cleanup(struct kimage *image)
+{
+       struct purgatory_info *pi = &image->purgatory_info;
+
+       vfree(image->kernel_buf);
+       image->kernel_buf = NULL;
+
+       vfree(image->initrd_buf);
+       image->initrd_buf = NULL;
+
+       kfree(image->cmdline_buf);
+       image->cmdline_buf = NULL;
+
+       vfree(pi->purgatory_buf);
+       pi->purgatory_buf = NULL;
+
+       vfree(pi->sechdrs);
+       pi->sechdrs = NULL;
+
+       /* See if the architecture has anything to clean up post load */
+       arch_kimage_file_post_load_cleanup(image);
+
+       /*
+        * The above call should have given the image loader a chance to free
+        * any data it stored in kimage->image_loader_data. It is now safe
+        * to free it here.
+        */
+       kfree(image->image_loader_data);
+       image->image_loader_data = NULL;
+}
+
+/*
+ * In file mode the list of segments is prepared by the kernel. Copy the
+ * relevant data from user space, do error checking, and prepare the segment list.
+ */
+static int
+kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
+                            const char __user *cmdline_ptr,
+                            unsigned long cmdline_len, unsigned flags)
+{
+       int ret = 0;
+       void *ldata;
+
+       ret = copy_file_from_fd(kernel_fd, &image->kernel_buf,
+                               &image->kernel_buf_len);
+       if (ret)
+               return ret;
+
+       /* Call arch image probe handlers */
+       ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
+                                           image->kernel_buf_len);
+
+       if (ret)
+               goto out;
+
+#ifdef CONFIG_KEXEC_VERIFY_SIG
+       ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
+                                          image->kernel_buf_len);
+       if (ret) {
+               pr_debug("kernel signature verification failed.\n");
+               goto out;
+       }
+       pr_debug("kernel signature verification successful.\n");
+#endif
+       /* It is possible that no initramfs is being loaded */
+       if (!(flags & KEXEC_FILE_NO_INITRAMFS)) {
+               ret = copy_file_from_fd(initrd_fd, &image->initrd_buf,
+                                       &image->initrd_buf_len);
+               if (ret)
+                       goto out;
+       }
+
+       if (cmdline_len) {
+               image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL);
+               if (!image->cmdline_buf) {
+                       ret = -ENOMEM;
+                       goto out;
+               }
+
+               ret = copy_from_user(image->cmdline_buf, cmdline_ptr,
+                                    cmdline_len);
+               if (ret) {
+                       ret = -EFAULT;
+                       goto out;
+               }
+
+               image->cmdline_buf_len = cmdline_len;
+
+               /* command line should be a string with last byte null */
+               if (image->cmdline_buf[cmdline_len - 1] != '\0') {
+                       ret = -EINVAL;
+                       goto out;
+               }
+       }
+
+       /* Call arch image load handlers */
+       ldata = arch_kexec_kernel_image_load(image);
+
+       if (IS_ERR(ldata)) {
+               ret = PTR_ERR(ldata);
+               goto out;
+       }
+
+       image->image_loader_data = ldata;
+out:
+       /* In case of error, free up all allocated memory in this function */
+       if (ret)
+               kimage_file_post_load_cleanup(image);
+       return ret;
+}
+
+static int
+kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
+                      int initrd_fd, const char __user *cmdline_ptr,
+                      unsigned long cmdline_len, unsigned long flags)
+{
+       int ret;
+       struct kimage *image;
+       bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH;
+
+       image = do_kimage_alloc_init();
+       if (!image)
+               return -ENOMEM;
+
+       image->file_mode = 1;
+
+       if (kexec_on_panic) {
+               /* Enable special crash kernel control page alloc policy. */
+               image->control_page = crashk_res.start;
+               image->type = KEXEC_TYPE_CRASH;
+       }
+
+       ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd,
+                                          cmdline_ptr, cmdline_len, flags);
+       if (ret)
+               goto out_free_image;
+
+       ret = sanity_check_segment_list(image);
+       if (ret)
+               goto out_free_post_load_bufs;
+
+       ret = -ENOMEM;
+       image->control_code_page = kimage_alloc_control_pages(image,
+                                          get_order(KEXEC_CONTROL_PAGE_SIZE));
+       if (!image->control_code_page) {
+               pr_err("Could not allocate control_code_buffer\n");
+               goto out_free_post_load_bufs;
+       }
+
+       if (!kexec_on_panic) {
+               image->swap_page = kimage_alloc_control_pages(image, 0);
+               if (!image->swap_page) {
+                       pr_err("Could not allocate swap buffer\n");
+                       goto out_free_control_pages;
+               }
+       }
+
+       *rimage = image;
+       return 0;
+out_free_control_pages:
+       kimage_free_page_list(&image->control_pages);
+out_free_post_load_bufs:
+       kimage_file_post_load_cleanup(image);
+out_free_image:
+       kfree(image);
+       return ret;
+}
+
+SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
+               unsigned long, cmdline_len, const char __user *, cmdline_ptr,
+               unsigned long, flags)
+{
+       int ret = 0, i;
+       struct kimage **dest_image, *image;
+
+       /* We only trust the superuser with rebooting the system. */
+       if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
+               return -EPERM;
+
+       /* Make sure we have a legal set of flags */
+       if (flags != (flags & KEXEC_FILE_FLAGS))
+               return -EINVAL;
+
+       image = NULL;
+
+       if (!mutex_trylock(&kexec_mutex))
+               return -EBUSY;
+
+       dest_image = &kexec_image;
+       if (flags & KEXEC_FILE_ON_CRASH)
+               dest_image = &kexec_crash_image;
+
+       if (flags & KEXEC_FILE_UNLOAD)
+               goto exchange;
+
+       /*
+        * In the crash case, the new kernel gets loaded into the reserved
+        * region, the same memory where an old crash kernel might already be
+        * loaded. Free any current crash dump kernel before we corrupt it.
+        */
+       if (flags & KEXEC_FILE_ON_CRASH)
+               kimage_free(xchg(&kexec_crash_image, NULL));
+
+       ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr,
+                                    cmdline_len, flags);
+       if (ret)
+               goto out;
+
+       ret = machine_kexec_prepare(image);
+       if (ret)
+               goto out;
+
+       ret = kexec_calculate_store_digests(image);
+       if (ret)
+               goto out;
+
+       for (i = 0; i < image->nr_segments; i++) {
+               struct kexec_segment *ksegment;
+
+               ksegment = &image->segment[i];
+               pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n",
+                        i, ksegment->buf, ksegment->bufsz, ksegment->mem,
+                        ksegment->memsz);
+
+               ret = kimage_load_segment(image, &image->segment[i]);
+               if (ret)
+                       goto out;
+       }
+
+       kimage_terminate(image);
+
+       /*
+        * Free up any temporary buffers allocated that are not needed
+        * after the image has been loaded.
+        */
+       kimage_file_post_load_cleanup(image);
+exchange:
+       image = xchg(dest_image, image);
+out:
+       mutex_unlock(&kexec_mutex);
+       kimage_free(image);
+       return ret;
+}
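As a point of reference, here is a minimal userspace sketch of driving the new syscall. It assumes the C library headers expose __NR_kexec_file_load and the KEXEC_FILE_* flags from <linux/kexec.h> on the target architecture; the file paths and command line are made up for illustration and are not part of this patch.

/* Illustrative only: load a crash kernel via kexec_file_load(2). */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/kexec.h>

int main(void)
{
        const char *cmdline = "console=ttyS0 maxcpus=1";
        int kernel_fd = open("/boot/vmlinuz", O_RDONLY);
        int initrd_fd = open("/boot/initrd.img", O_RDONLY);

        if (kernel_fd < 0 || initrd_fd < 0) {
                perror("open");
                return 1;
        }

        /* cmdline_len must count the trailing NUL; the kernel checks it. */
        if (syscall(__NR_kexec_file_load, kernel_fd, initrd_fd,
                    strlen(cmdline) + 1, cmdline, KEXEC_FILE_ON_CRASH) < 0) {
                perror("kexec_file_load");
                return 1;
        }
        return 0;
}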
+
+static int locate_mem_hole_top_down(unsigned long start, unsigned long end,
+                                   struct kexec_buf *kbuf)
+{
+       struct kimage *image = kbuf->image;
+       unsigned long temp_start, temp_end;
+
+       temp_end = min(end, kbuf->buf_max);
+       temp_start = temp_end - kbuf->memsz;
+
+       do {
+               /* align down start */
+               temp_start = temp_start & (~(kbuf->buf_align - 1));
+
+               if (temp_start < start || temp_start < kbuf->buf_min)
+                       return 0;
+
+               temp_end = temp_start + kbuf->memsz - 1;
+
+               /*
+                * Make sure this does not conflict with any of the existing
+                * segments
+                */
+               if (kimage_is_destination_range(image, temp_start, temp_end)) {
+                       temp_start = temp_start - PAGE_SIZE;
+                       continue;
+               }
+
+               /* We found a suitable memory range */
+               break;
+       } while (1);
+
+       /* If we are here, we found a suitable memory range */
+       kbuf->mem = temp_start;
+
+       /* Success, stop navigating through remaining System RAM ranges */
+       return 1;
+}
+
+static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end,
+                                    struct kexec_buf *kbuf)
+{
+       struct kimage *image = kbuf->image;
+       unsigned long temp_start, temp_end;
+
+       temp_start = max(start, kbuf->buf_min);
+
+       do {
+               temp_start = ALIGN(temp_start, kbuf->buf_align);
+               temp_end = temp_start + kbuf->memsz - 1;
+
+               if (temp_end > end || temp_end > kbuf->buf_max)
+                       return 0;
+               /*
+                * Make sure this does not conflict with any of the existing
+                * segments
+                */
+               if (kimage_is_destination_range(image, temp_start, temp_end)) {
+                       temp_start = temp_start + PAGE_SIZE;
+                       continue;
+               }
+
+               /* We found a suitable memory range */
+               break;
+       } while (1);
+
+       /* If we are here, we found a suitable memory range */
+       kbuf->mem = temp_start;
+
+       /* Success, stop navigating through remaining System RAM ranges */
+       return 1;
+}
+
+static int locate_mem_hole_callback(u64 start, u64 end, void *arg)
+{
+       struct kexec_buf *kbuf = (struct kexec_buf *)arg;
+       unsigned long sz = end - start + 1;
+
+       /* Returning 0 will move on to the next memory range */
+       if (sz < kbuf->memsz)
+               return 0;
+
+       if (end < kbuf->buf_min || start > kbuf->buf_max)
+               return 0;
+
+       /*
+        * Allocate memory top-down within the RAM range if requested;
+        * otherwise allocate bottom-up.
+        */
+       if (kbuf->top_down)
+               return locate_mem_hole_top_down(start, end, kbuf);
+       return locate_mem_hole_bottom_up(start, end, kbuf);
+}
+
+/*
+ * Helper function for placing a buffer in a kexec segment. This assumes
+ * that kexec_mutex is held.
+ */
+int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz,
+                    unsigned long memsz, unsigned long buf_align,
+                    unsigned long buf_min, unsigned long buf_max,
+                    bool top_down, unsigned long *load_addr)
+{
+
+       struct kexec_segment *ksegment;
+       struct kexec_buf buf, *kbuf;
+       int ret;
+
+       /* Currently adding segment this way is allowed only in file mode */
+       if (!image->file_mode)
+               return -EINVAL;
+
+       if (image->nr_segments >= KEXEC_SEGMENT_MAX)
+               return -EINVAL;
+
+       /*
+        * Make sure we are not trying to add a buffer after control pages
+        * have been allocated. All segments need to be placed before any
+        * control pages are allocated, because the control page allocation
+        * logic walks the list of segments to make sure there are no
+        * destination overlaps.
+        */
+       if (!list_empty(&image->control_pages)) {
+               WARN_ON(1);
+               return -EINVAL;
+       }
+
+       memset(&buf, 0, sizeof(struct kexec_buf));
+       kbuf = &buf;
+       kbuf->image = image;
+       kbuf->buffer = buffer;
+       kbuf->bufsz = bufsz;
+
+       kbuf->memsz = ALIGN(memsz, PAGE_SIZE);
+       kbuf->buf_align = max(buf_align, PAGE_SIZE);
+       kbuf->buf_min = buf_min;
+       kbuf->buf_max = buf_max;
+       kbuf->top_down = top_down;
+
+       /* Walk the RAM ranges and allocate a suitable range for the buffer */
+       if (image->type == KEXEC_TYPE_CRASH)
+               ret = walk_iomem_res("Crash kernel",
+                                    IORESOURCE_MEM | IORESOURCE_BUSY,
+                                    crashk_res.start, crashk_res.end, kbuf,
+                                    locate_mem_hole_callback);
+       else
+               ret = walk_system_ram_res(0, -1, kbuf,
+                                         locate_mem_hole_callback);
+       if (ret != 1) {
+               /* A suitable memory range could not be found for buffer */
+               return -EADDRNOTAVAIL;
+       }
+
+       /* Found a suitable memory range */
+       ksegment = &image->segment[image->nr_segments];
+       ksegment->kbuf = kbuf->buffer;
+       ksegment->bufsz = kbuf->bufsz;
+       ksegment->mem = kbuf->mem;
+       ksegment->memsz = kbuf->memsz;
+       image->nr_segments++;
+       *load_addr = ksegment->mem;
+       return 0;
+}
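A hedged sketch of how an architecture's image loader might use kexec_add_buffer() to place the initrd follows; the wrapper name and the placement limits are illustrative, only the helper and the kimage fields come from this file.

/*
 * Hypothetical arch-side sketch: place the initrd anywhere in System RAM,
 * searching top-down, and record where it ended up.
 */
static int example_load_initrd(struct kimage *image)
{
        unsigned long initrd_load_addr;
        int ret;

        ret = kexec_add_buffer(image, image->initrd_buf, image->initrd_buf_len,
                               image->initrd_buf_len, PAGE_SIZE,
                               0, ULONG_MAX, true /* top_down */,
                               &initrd_load_addr);
        if (ret)
                return ret;

        pr_debug("initrd placed at 0x%lx\n", initrd_load_addr);
        return 0;
}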
+
+/* Calculate and store the digest of segments */
+static int kexec_calculate_store_digests(struct kimage *image)
+{
+       struct crypto_shash *tfm;
+       struct shash_desc *desc;
+       int ret = 0, i, j, zero_buf_sz, sha_region_sz;
+       size_t desc_size, nullsz;
+       char *digest;
+       void *zero_buf;
+       struct kexec_sha_region *sha_regions;
+       struct purgatory_info *pi = &image->purgatory_info;
+
+       zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT);
+       zero_buf_sz = PAGE_SIZE;
+
+       tfm = crypto_alloc_shash("sha256", 0, 0);
+       if (IS_ERR(tfm)) {
+               ret = PTR_ERR(tfm);
+               goto out;
+       }
+
+       desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
+       desc = kzalloc(desc_size, GFP_KERNEL);
+       if (!desc) {
+               ret = -ENOMEM;
+               goto out_free_tfm;
+       }
+
+       sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region);
+       sha_regions = vzalloc(sha_region_sz);
+       if (!sha_regions) {
+               ret = -ENOMEM;
+               goto out_free_desc;
+       }
+
+       desc->tfm   = tfm;
+       desc->flags = 0;
+
+       ret = crypto_shash_init(desc);
+       if (ret < 0)
+               goto out_free_sha_regions;
+
+       digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL);
+       if (!digest) {
+               ret = -ENOMEM;
+               goto out_free_sha_regions;
+       }
+
+       for (j = i = 0; i < image->nr_segments; i++) {
+               struct kexec_segment *ksegment;
+
+               ksegment = &image->segment[i];
+               /*
+                * Skip purgatory as it will be modified once we put digest
+                * info in purgatory.
+                */
+               if (ksegment->kbuf == pi->purgatory_buf)
+                       continue;
+
+               ret = crypto_shash_update(desc, ksegment->kbuf,
+                                         ksegment->bufsz);
+               if (ret)
+                       break;
+
+               /*
+                * Assume the rest of the buffer is filled with zeroes and
+                * update the digest accordingly.
+                */
+               nullsz = ksegment->memsz - ksegment->bufsz;
+               while (nullsz) {
+                       unsigned long bytes = nullsz;
+
+                       if (bytes > zero_buf_sz)
+                               bytes = zero_buf_sz;
+                       ret = crypto_shash_update(desc, zero_buf, bytes);
+                       if (ret)
+                               break;
+                       nullsz -= bytes;
+               }
+
+               if (ret)
+                       break;
+
+               sha_regions[j].start = ksegment->mem;
+               sha_regions[j].len = ksegment->memsz;
+               j++;
+       }
+
+       if (!ret) {
+               ret = crypto_shash_final(desc, digest);
+               if (ret)
+                       goto out_free_digest;
+               ret = kexec_purgatory_get_set_symbol(image, "sha_regions",
+                                               sha_regions, sha_region_sz, 0);
+               if (ret)
+                       goto out_free_digest;
+
+               ret = kexec_purgatory_get_set_symbol(image, "sha256_digest",
+                                               digest, SHA256_DIGEST_SIZE, 0);
+               if (ret)
+                       goto out_free_digest;
+       }
+
+out_free_digest:
+       kfree(digest);
+out_free_sha_regions:
+       vfree(sha_regions);
+out_free_desc:
+       kfree(desc);
+out_free_tfm:
+       crypto_free_shash(tfm);
+out:
+       return ret;
+}
+
+/* Actually load purgatory. Lot of code taken from kexec-tools */
+static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
+                                 unsigned long max, int top_down)
+{
+       struct purgatory_info *pi = &image->purgatory_info;
+       unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad;
+       unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset;
+       unsigned char *buf_addr, *src;
+       int i, ret = 0, entry_sidx = -1;
+       const Elf_Shdr *sechdrs_c;
+       Elf_Shdr *sechdrs = NULL;
+       void *purgatory_buf = NULL;
+
+       /*
+        * sechdrs_c points to the section headers in the purgatory image and
+        * is read-only. No modifications allowed.
+        */
+       sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff;
+
+       /*
+        * We cannot modify sechdrs_c[] and its fields; they are read-only.
+        * Copy them to a local copy where we can store some temporary data
+        * and free it at the end. We need to modify the ->sh_addr and
+        * ->sh_offset fields to keep track of the permanent and temporary
+        * locations of sections.
+        */
+       sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr));
+       if (!sechdrs)
+               return -ENOMEM;
+
+       memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr));
+
+       /*
+        * There are multiple copies of the sections. The first copy is the
+        * one embedded in the kernel in a read-only section. Some of these
+        * sections will be copied to a temporary buffer and relocated, and
+        * those sections will finally be copied to their final destination
+        * at segment load time.
+        *
+        * Use ->sh_offset to reflect the section's address in memory. It
+        * points to the original read-only copy if the section is not
+        * allocatable; otherwise it points to the temporary copy that will
+        * be relocated.
+        *
+        * Use ->sh_addr to hold the final address of the section, where it
+        * will go at execution time.
+        */
+       for (i = 0; i < pi->ehdr->e_shnum; i++) {
+               if (sechdrs[i].sh_type == SHT_NOBITS)
+                       continue;
+
+               sechdrs[i].sh_offset = (unsigned long)pi->ehdr +
+                                               sechdrs[i].sh_offset;
+       }
+
+       /*
+        * Identify entry point section and make entry relative to section
+        * start.
+        */
+       entry = pi->ehdr->e_entry;
+       for (i = 0; i < pi->ehdr->e_shnum; i++) {
+               if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+                       continue;
+
+               if (!(sechdrs[i].sh_flags & SHF_EXECINSTR))
+                       continue;
+
+               /* Make entry section relative */
+               if (sechdrs[i].sh_addr <= pi->ehdr->e_entry &&
+                   ((sechdrs[i].sh_addr + sechdrs[i].sh_size) >
+                    pi->ehdr->e_entry)) {
+                       entry_sidx = i;
+                       entry -= sechdrs[i].sh_addr;
+                       break;
+               }
+       }
+
+       /* Determine how much memory is needed to load relocatable object. */
+       buf_align = 1;
+       bss_align = 1;
+       buf_sz = 0;
+       bss_sz = 0;
+
+       for (i = 0; i < pi->ehdr->e_shnum; i++) {
+               if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+                       continue;
+
+               align = sechdrs[i].sh_addralign;
+               if (sechdrs[i].sh_type != SHT_NOBITS) {
+                       if (buf_align < align)
+                               buf_align = align;
+                       buf_sz = ALIGN(buf_sz, align);
+                       buf_sz += sechdrs[i].sh_size;
+               } else {
+                       /* bss section */
+                       if (bss_align < align)
+                               bss_align = align;
+                       bss_sz = ALIGN(bss_sz, align);
+                       bss_sz += sechdrs[i].sh_size;
+               }
+       }
+
+       /* Determine the bss padding required to align bss properly */
+       bss_pad = 0;
+       if (buf_sz & (bss_align - 1))
+               bss_pad = bss_align - (buf_sz & (bss_align - 1));
+
+       memsz = buf_sz + bss_pad + bss_sz;
+
+       /* Allocate buffer for purgatory */
+       purgatory_buf = vzalloc(buf_sz);
+       if (!purgatory_buf) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       if (buf_align < bss_align)
+               buf_align = bss_align;
+
+       /* Add buffer to segment list */
+       ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz,
+                               buf_align, min, max, top_down,
+                               &pi->purgatory_load_addr);
+       if (ret)
+               goto out;
+
+       /* Load SHF_ALLOC sections */
+       buf_addr = purgatory_buf;
+       load_addr = curr_load_addr = pi->purgatory_load_addr;
+       bss_addr = load_addr + buf_sz + bss_pad;
+
+       for (i = 0; i < pi->ehdr->e_shnum; i++) {
+               if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+                       continue;
+
+               align = sechdrs[i].sh_addralign;
+               if (sechdrs[i].sh_type != SHT_NOBITS) {
+                       curr_load_addr = ALIGN(curr_load_addr, align);
+                       offset = curr_load_addr - load_addr;
+                       /* We already modified ->sh_offset to keep the src addr */
+                       src = (char *) sechdrs[i].sh_offset;
+                       memcpy(buf_addr + offset, src, sechdrs[i].sh_size);
+
+                       /* Store load address and source address of section */
+                       sechdrs[i].sh_addr = curr_load_addr;
+
+                       /*
+                        * This section got copied to temporary buffer. Update
+                        * ->sh_offset accordingly.
+                        */
+                       sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset);
+
+                       /* Advance to the next address */
+                       curr_load_addr += sechdrs[i].sh_size;
+               } else {
+                       bss_addr = ALIGN(bss_addr, align);
+                       sechdrs[i].sh_addr = bss_addr;
+                       bss_addr += sechdrs[i].sh_size;
+               }
+       }
+
+       /* Update entry point based on load address of text section */
+       if (entry_sidx >= 0)
+               entry += sechdrs[entry_sidx].sh_addr;
+
+       /* Make kernel jump to purgatory after shutdown */
+       image->start = entry;
+
+       /* Used later to get/set symbol values */
+       pi->sechdrs = sechdrs;
+
+       /*
+        * Used later to identify which section is purgatory and skip it
+        * from checksumming.
+        */
+       pi->purgatory_buf = purgatory_buf;
+       return ret;
+out:
+       vfree(sechdrs);
+       vfree(purgatory_buf);
+       return ret;
+}
+
+static int kexec_apply_relocations(struct kimage *image)
+{
+       int i, ret;
+       struct purgatory_info *pi = &image->purgatory_info;
+       Elf_Shdr *sechdrs = pi->sechdrs;
+
+       /* Apply relocations */
+       for (i = 0; i < pi->ehdr->e_shnum; i++) {
+               Elf_Shdr *section, *symtab;
+
+               if (sechdrs[i].sh_type != SHT_RELA &&
+                   sechdrs[i].sh_type != SHT_REL)
+                       continue;
+
+               /*
+                * For a section of type SHT_RELA/SHT_REL,
+                * ->sh_link contains the section header index of the
+                * associated symbol table, and ->sh_info contains the section
+                * header index of the section to which the relocations apply.
+                */
+               if (sechdrs[i].sh_info >= pi->ehdr->e_shnum ||
+                   sechdrs[i].sh_link >= pi->ehdr->e_shnum)
+                       return -ENOEXEC;
+
+               section = &sechdrs[sechdrs[i].sh_info];
+               symtab = &sechdrs[sechdrs[i].sh_link];
+
+               if (!(section->sh_flags & SHF_ALLOC))
+                       continue;
+
+               /*
+                * symtab->sh_link contains the section header index of the
+                * associated string table.
+                */
+               if (symtab->sh_link >= pi->ehdr->e_shnum)
+                       /* Invalid section number? */
+                       continue;
+
+               /*
+                * The respective architecture needs to provide support for applying
+                * relocations of type SHT_RELA/SHT_REL.
+                */
+               if (sechdrs[i].sh_type == SHT_RELA)
+                       ret = arch_kexec_apply_relocations_add(pi->ehdr,
+                                                              sechdrs, i);
+               else if (sechdrs[i].sh_type == SHT_REL)
+                       ret = arch_kexec_apply_relocations(pi->ehdr,
+                                                          sechdrs, i);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+
+/* Load relocatable purgatory object and relocate it appropriately */
+int kexec_load_purgatory(struct kimage *image, unsigned long min,
+                        unsigned long max, int top_down,
+                        unsigned long *load_addr)
+{
+       struct purgatory_info *pi = &image->purgatory_info;
+       int ret;
+
+       if (kexec_purgatory_size <= 0)
+               return -EINVAL;
+
+       if (kexec_purgatory_size < sizeof(Elf_Ehdr))
+               return -ENOEXEC;
+
+       pi->ehdr = (Elf_Ehdr *)kexec_purgatory;
+
+       if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0
+           || pi->ehdr->e_type != ET_REL
+           || !elf_check_arch(pi->ehdr)
+           || pi->ehdr->e_shentsize != sizeof(Elf_Shdr))
+               return -ENOEXEC;
+
+       if (pi->ehdr->e_shoff >= kexec_purgatory_size
+           || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) >
+           kexec_purgatory_size - pi->ehdr->e_shoff))
+               return -ENOEXEC;
+
+       ret = __kexec_load_purgatory(image, min, max, top_down);
+       if (ret)
+               return ret;
+
+       ret = kexec_apply_relocations(image);
+       if (ret)
+               goto out;
+
+       *load_addr = pi->purgatory_load_addr;
+       return 0;
+out:
+       vfree(pi->sechdrs);
+       vfree(pi->purgatory_buf);
+       return ret;
+}
+
+static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
+                                           const char *name)
+{
+       Elf_Sym *syms;
+       Elf_Shdr *sechdrs;
+       Elf_Ehdr *ehdr;
+       int i, k;
+       const char *strtab;
+
+       if (!pi->sechdrs || !pi->ehdr)
+               return NULL;
+
+       sechdrs = pi->sechdrs;
+       ehdr = pi->ehdr;
+
+       for (i = 0; i < ehdr->e_shnum; i++) {
+               if (sechdrs[i].sh_type != SHT_SYMTAB)
+                       continue;
+
+               if (sechdrs[i].sh_link >= ehdr->e_shnum)
+                       /* Invalid strtab section number */
+                       continue;
+               strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset;
+               syms = (Elf_Sym *)sechdrs[i].sh_offset;
+
+               /* Go through symbols for a match */
+               for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) {
+                       if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL)
+                               continue;
+
+                       if (strcmp(strtab + syms[k].st_name, name) != 0)
+                               continue;
+
+                       if (syms[k].st_shndx == SHN_UNDEF ||
+                           syms[k].st_shndx >= ehdr->e_shnum) {
+                               pr_debug("Symbol: %s has bad section index %d.\n",
+                                               name, syms[k].st_shndx);
+                               return NULL;
+                       }
+
+                       /* Found the symbol we are looking for */
+                       return &syms[k];
+               }
+       }
+
+       return NULL;
+}
+
+void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name)
+{
+       struct purgatory_info *pi = &image->purgatory_info;
+       Elf_Sym *sym;
+       Elf_Shdr *sechdr;
+
+       sym = kexec_purgatory_find_symbol(pi, name);
+       if (!sym)
+               return ERR_PTR(-EINVAL);
+
+       sechdr = &pi->sechdrs[sym->st_shndx];
+
+       /*
+        * Returns the address where symbol will finally be loaded after
+        * kexec_load_segment()
+        */
+       return (void *)(sechdr->sh_addr + sym->st_value);
+}
+
+/*
+ * Get or set the value of a symbol. If "get_value" is true, the symbol value
+ * is returned in buf; otherwise the symbol value is set from the value in buf.
+ */
+int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
+                                  void *buf, unsigned int size, bool get_value)
+{
+       Elf_Sym *sym;
+       Elf_Shdr *sechdrs;
+       struct purgatory_info *pi = &image->purgatory_info;
+       char *sym_buf;
+
+       sym = kexec_purgatory_find_symbol(pi, name);
+       if (!sym)
+               return -EINVAL;
+
+       if (sym->st_size != size) {
+               pr_err("symbol %s size mismatch: expected %lu actual %u\n",
+                      name, (unsigned long)sym->st_size, size);
+               return -EINVAL;
+       }
+
+       sechdrs = pi->sechdrs;
+
+       if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
+               pr_err("symbol %s is in a bss section. Cannot %s\n", name,
+                      get_value ? "get" : "set");
+               return -EINVAL;
+       }
+
+       sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset +
+                                       sym->st_value;
+
+       if (get_value)
+               memcpy((void *)buf, sym_buf, size);
+       else
+               memcpy((void *)sym_buf, buf, size);
+
+       return 0;
+}
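A hedged sketch of how an architecture setup path might combine kexec_load_purgatory() and kexec_purgatory_get_set_symbol() follows; the wrapper and the "entry64" symbol name are illustrative, only the two helpers come from this file.

/*
 * Hypothetical arch-side sketch: load the purgatory anywhere in System RAM
 * and patch the kernel entry point into one of its global symbols.
 */
static int example_setup_purgatory(struct kimage *image, unsigned long kernel_entry)
{
        unsigned long purgatory_load_addr;
        int ret;

        ret = kexec_load_purgatory(image, 0, ULONG_MAX, 1 /* top_down */,
                                   &purgatory_load_addr);
        if (ret)
                return ret;

        /* get_value == 0: write kernel_entry into the purgatory symbol */
        return kexec_purgatory_get_set_symbol(image, "entry64", &kernel_entry,
                                              sizeof(kernel_entry), 0);
}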
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
new file mode 100644 (file)
index 0000000..e4392a6
--- /dev/null
@@ -0,0 +1,22 @@
+#ifndef LINUX_KEXEC_INTERNAL_H
+#define LINUX_KEXEC_INTERNAL_H
+
+#include <linux/kexec.h>
+
+struct kimage *do_kimage_alloc_init(void);
+int sanity_check_segment_list(struct kimage *image);
+void kimage_free_page_list(struct list_head *list);
+void kimage_free(struct kimage *image);
+int kimage_load_segment(struct kimage *image, struct kexec_segment *segment);
+void kimage_terminate(struct kimage *image);
+int kimage_is_destination_range(struct kimage *image,
+                               unsigned long start, unsigned long end);
+
+extern struct mutex kexec_mutex;
+
+#ifdef CONFIG_KEXEC_FILE
+void kimage_file_post_load_cleanup(struct kimage *image);
+#else /* CONFIG_KEXEC_FILE */
+static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
+#endif /* CONFIG_KEXEC_FILE */
+#endif /* LINUX_KEXEC_INTERNAL_H */
index 2777f40a9c7be84c60da316960a012e982d67c35..da98d0593de24206d68222d787d059a5a2b025a1 100644 (file)
@@ -45,8 +45,6 @@
 
 extern int max_threads;
 
-static struct workqueue_struct *khelper_wq;
-
 #define CAP_BSET       (void *)1
 #define CAP_PI         (void *)2
 
@@ -114,10 +112,11 @@ out:
  * @...: arguments as specified in the format string
  *
  * Load a module using the user mode module loader. The function returns
- * zero on success or a negative errno code on failure. Note that a
- * successful module load does not mean the module did not then unload
- * and exit on an error of its own. Callers must check that the service
- * they requested is now available not blindly invoke it.
+ * zero on success or a negative errno code or positive exit code from
+ * "modprobe" on failure. Note that a successful module load does not mean
+ * the module did not then unload and exit on an error of its own. Callers
+ * must check that the service they requested is now available, not blindly
+ * invoke it.
  *
  * If module auto-loading support is disabled then this function
  * becomes a no-operation.
@@ -213,7 +212,7 @@ static void umh_complete(struct subprocess_info *sub_info)
 /*
  * This is the task which runs the usermode application
  */
-static int ____call_usermodehelper(void *data)
+static int call_usermodehelper_exec_async(void *data)
 {
        struct subprocess_info *sub_info = data;
        struct cred *new;
@@ -223,12 +222,9 @@ static int ____call_usermodehelper(void *data)
        flush_signal_handlers(current, 1);
        spin_unlock_irq(&current->sighand->siglock);
 
-       /* We can run anywhere, unlike our parent keventd(). */
-       set_cpus_allowed_ptr(current, cpu_all_mask);
-
        /*
-        * Our parent is keventd, which runs with elevated scheduling priority.
-        * Avoid propagating that into the userspace child.
+        * Our parent (unbound workqueue) runs with elevated scheduling
+        * priority. Avoid propagating that into the userspace child.
         */
        set_user_nice(current, 0);
 
@@ -258,7 +254,10 @@ static int ____call_usermodehelper(void *data)
                           (const char __user *const __user *)sub_info->envp);
 out:
        sub_info->retval = retval;
-       /* wait_for_helper() will call umh_complete if UHM_WAIT_PROC. */
+       /*
+        * call_usermodehelper_exec_sync() will call umh_complete
+        * if UHM_WAIT_PROC.
+        */
        if (!(sub_info->wait & UMH_WAIT_PROC))
                umh_complete(sub_info);
        if (!retval)
@@ -266,15 +265,14 @@ out:
        do_exit(0);
 }
 
-/* Keventd can't block, but this (a child) can. */
-static int wait_for_helper(void *data)
+/* Handles UMH_WAIT_PROC.  */
+static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info)
 {
-       struct subprocess_info *sub_info = data;
        pid_t pid;
 
        /* If SIGCLD is ignored sys_wait4 won't populate the status. */
        kernel_sigaction(SIGCHLD, SIG_DFL);
-       pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD);
+       pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD);
        if (pid < 0) {
                sub_info->retval = pid;
        } else {
@@ -282,44 +280,60 @@ static int wait_for_helper(void *data)
                /*
                 * Normally it is bogus to call wait4() from in-kernel because
                 * wait4() wants to write the exit code to a userspace address.
-                * But wait_for_helper() always runs as keventd, and put_user()
-                * to a kernel address works OK for kernel threads, due to their
-                * having an mm_segment_t which spans the entire address space.
+                * But call_usermodehelper_exec_sync() always runs as kernel
+                * thread (workqueue) and put_user() to a kernel address works
+                * OK for kernel threads, due to their having an mm_segment_t
+                * which spans the entire address space.
                 *
                 * Thus the __user pointer cast is valid here.
                 */
                sys_wait4(pid, (int __user *)&ret, 0, NULL);
 
                /*
-                * If ret is 0, either ____call_usermodehelper failed and the
-                * real error code is already in sub_info->retval or
+                * If ret is 0, either call_usermodehelper_exec_async failed and
+                * the real error code is already in sub_info->retval or
                 * sub_info->retval is 0 anyway, so don't mess with it then.
                 */
                if (ret)
                        sub_info->retval = ret;
        }
 
+       /* Restore default kernel sig handler */
+       kernel_sigaction(SIGCHLD, SIG_IGN);
+
        umh_complete(sub_info);
-       do_exit(0);
 }
 
-/* This is run by khelper thread  */
-static void __call_usermodehelper(struct work_struct *work)
+/*
+ * We need to create the usermodehelper kernel threads from a task that is
+ * affine to an optimized set of CPUs (or the nohz housekeeping ones) so that
+ * they inherit the widest possible affinity, irrespective of
+ * call_usermodehelper() callers with possibly reduced affinity (e.g. per-cpu
+ * workqueues). We don't want usermodehelper targets to contend for a busy CPU.
+ *
+ * Unbound workqueues provide such wide affinity and allow blocking on
+ * UMH_WAIT_PROC requests without blocking pending requests (up to some limit).
+ *
+ * Besides, workqueues provide the privilege level that the caller might not
+ * have to perform the usermodehelper request.
+ */
+static void call_usermodehelper_exec_work(struct work_struct *work)
 {
        struct subprocess_info *sub_info =
                container_of(work, struct subprocess_info, work);
-       pid_t pid;
 
-       if (sub_info->wait & UMH_WAIT_PROC)
-               pid = kernel_thread(wait_for_helper, sub_info,
-                                   CLONE_FS | CLONE_FILES | SIGCHLD);
-       else
-               pid = kernel_thread(____call_usermodehelper, sub_info,
-                                   SIGCHLD);
+       if (sub_info->wait & UMH_WAIT_PROC) {
+               call_usermodehelper_exec_sync(sub_info);
+       } else {
+               pid_t pid;
 
-       if (pid < 0) {
-               sub_info->retval = pid;
-               umh_complete(sub_info);
+               pid = kernel_thread(call_usermodehelper_exec_async, sub_info,
+                                   SIGCHLD);
+               if (pid < 0) {
+                       sub_info->retval = pid;
+                       umh_complete(sub_info);
+               }
        }
 }
 
@@ -509,7 +523,7 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
        if (!sub_info)
                goto out;
 
-       INIT_WORK(&sub_info->work, __call_usermodehelper);
+       INIT_WORK(&sub_info->work, call_usermodehelper_exec_work);
        sub_info->path = path;
        sub_info->argv = argv;
        sub_info->envp = envp;
@@ -531,8 +545,8 @@ EXPORT_SYMBOL(call_usermodehelper_setup);
  *        from interrupt context.
  *
  * Runs a user-space application.  The application is started
- * asynchronously if wait is not set, and runs as a child of keventd.
- * (ie. it runs with full root capabilities).
+ * asynchronously if wait is not set, and runs as a child of system workqueues.
+ * (ie. it runs with full root capabilities and optimized affinity).
  */
 int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
 {
@@ -544,7 +558,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
                return -EINVAL;
        }
        helper_lock();
-       if (!khelper_wq || usermodehelper_disabled) {
+       if (usermodehelper_disabled) {
                retval = -EBUSY;
                goto out;
        }
@@ -556,7 +570,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
        sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done;
        sub_info->wait = wait;
 
-       queue_work(khelper_wq, &sub_info->work);
+       queue_work(system_unbound_wq, &sub_info->work);
        if (wait == UMH_NO_WAIT)        /* task has freed sub_info */
                goto unlock;
 
@@ -686,9 +700,3 @@ struct ctl_table usermodehelper_table[] = {
        },
        { }
 };
-
-void __init usermodehelper_init(void)
-{
-       khelper_wq = create_singlethread_workqueue("khelper");
-       BUG_ON(!khelper_wq);
-}
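The caller-side pattern is unchanged by this rework; a hedged sketch is below. The helper path, arguments, and environment are made up, only call_usermodehelper_setup()/call_usermodehelper_exec() and UMH_WAIT_PROC are the real API being moved onto system_unbound_wq.

/* Hypothetical caller: run a helper binary and wait for it to exit. */
static int example_run_helper(void)
{
        static char *argv[] = { "/sbin/example-helper", "--oneshot", NULL };
        static char *envp[] = { "HOME=/", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };
        struct subprocess_info *info;

        info = call_usermodehelper_setup(argv[0], argv, envp, GFP_KERNEL,
                                         NULL, NULL, NULL);
        if (!info)
                return -ENOMEM;

        return call_usermodehelper_exec(info, UMH_WAIT_PROC);
}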
index 6683ccef9fffb2de28b6a4d6d01393b080811d6a..e83b264640615c47c31cce539f31014dc11b0776 100644 (file)
@@ -90,7 +90,7 @@ static ssize_t profiling_store(struct kobject *kobj,
 KERNEL_ATTR_RW(profiling);
 #endif
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 static ssize_t kexec_loaded_show(struct kobject *kobj,
                                 struct kobj_attribute *attr, char *buf)
 {
@@ -134,7 +134,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj,
 }
 KERNEL_ATTR_RO(vmcoreinfo);
 
-#endif /* CONFIG_KEXEC */
+#endif /* CONFIG_KEXEC_CORE */
 
 /* whether file capabilities are enabled */
 static ssize_t fscaps_show(struct kobject *kobj,
@@ -196,7 +196,7 @@ static struct attribute * kernel_attrs[] = {
 #ifdef CONFIG_PROFILING
        &profiling_attr.attr,
 #endif
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
        &kexec_loaded_attr.attr,
        &kexec_crash_loaded_attr.attr,
        &kexec_crash_size_attr.attr,
index 337c8818541d339aac3fd1e3e6af32dac6dff4c9..87e9ce6a63c5d0e78a17977e2e9271ffaf0bb946 100644 (file)
@@ -289,7 +289,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
        if (pv_enabled())
                goto queue;
 
-       if (virt_queued_spin_lock(lock))
+       if (virt_spin_lock(lock))
                return;
 
        /*
diff --git a/kernel/membarrier.c b/kernel/membarrier.c
new file mode 100644 (file)
index 0000000..536c727
--- /dev/null
@@ -0,0 +1,66 @@
+/*
+ * Copyright (C) 2010, 2015 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ *
+ * membarrier system call
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/syscalls.h>
+#include <linux/membarrier.h>
+
+/*
+ * Bitmask made from an "or" of all commands within enum membarrier_cmd,
+ * except MEMBARRIER_CMD_QUERY.
+ */
+#define MEMBARRIER_CMD_BITMASK (MEMBARRIER_CMD_SHARED)
+
+/**
+ * sys_membarrier - issue memory barriers on a set of threads
+ * @cmd:   Takes command values defined in enum membarrier_cmd.
+ * @flags: Currently needs to be 0. For future extensions.
+ *
+ * If this system call is not implemented, -ENOSYS is returned. If the
+ * command specified does not exist, or if the command argument is invalid,
+ * this system call returns -EINVAL. For a given command, with flags argument
+ * set to 0, this system call is guaranteed to always return the same value
+ * until reboot.
+ *
+ * All memory accesses performed in program order from each targeted thread
+ * are guaranteed to be ordered with respect to sys_membarrier(). If we use
+ * the semantic "barrier()" to represent a compiler barrier forcing memory
+ * accesses to be performed in program order across the barrier, and
+ * smp_mb() to represent explicit memory barriers forcing full memory
+ * ordering across the barrier, we have the following ordering table for
+ * each pair of barrier(), sys_membarrier() and smp_mb():
+ *
+ * The pair ordering is detailed as (O: ordered, X: not ordered):
+ *
+ *                        barrier()   smp_mb() sys_membarrier()
+ *        barrier()          X           X            O
+ *        smp_mb()           X           O            O
+ *        sys_membarrier()   O           O            O
+ */
+SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
+{
+       if (unlikely(flags))
+               return -EINVAL;
+       switch (cmd) {
+       case MEMBARRIER_CMD_QUERY:
+               return MEMBARRIER_CMD_BITMASK;
+       case MEMBARRIER_CMD_SHARED:
+               if (num_online_cpus() > 1)
+                       synchronize_sched();
+               return 0;
+       default:
+               return -EINVAL;
+       }
+}
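A minimal userspace sketch of exercising the new syscall follows; it assumes __NR_membarrier and the MEMBARRIER_CMD_* constants from <linux/membarrier.h> are visible to the compiler, which may require updated headers.

/* Illustrative only: query support, then issue a global barrier. */
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/membarrier.h>

int main(void)
{
        long mask = syscall(__NR_membarrier, MEMBARRIER_CMD_QUERY, 0);

        if (mask < 0 || !(mask & MEMBARRIER_CMD_SHARED)) {
                fprintf(stderr, "membarrier not available\n");
                return 1;
        }

        /* Orders this thread's prior accesses against all other threads. */
        syscall(__NR_membarrier, MEMBARRIER_CMD_SHARED, 0);
        return 0;
}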
diff --git a/kernel/memremap.c b/kernel/memremap.c
new file mode 100644 (file)
index 0000000..72b0c66
--- /dev/null
@@ -0,0 +1,190 @@
+/*
+ * Copyright(c) 2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/device.h>
+#include <linux/types.h>
+#include <linux/io.h>
+#include <linux/mm.h>
+#include <linux/memory_hotplug.h>
+
+#ifndef ioremap_cache
+/* temporary while we convert existing ioremap_cache users to memremap */
+__weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size)
+{
+       return ioremap(offset, size);
+}
+#endif
+
+/**
+ * memremap() - remap an iomem_resource as cacheable memory
+ * @offset: iomem resource start address
+ * @size: size of remap
+ * @flags: either MEMREMAP_WB or MEMREMAP_WT
+ *
+ * memremap() is "ioremap" for cases where it is known that the resource
+ * being mapped does not have i/o side effects and the __iomem
+ * annotation is not applicable.
+ *
+ * MEMREMAP_WB - matches the default mapping for "System RAM" on
+ * the architecture.  This is usually a read-allocate write-back cache.
+ * Moreover, if MEMREMAP_WB is specified and the requested remap region is RAM,
+ * memremap() will bypass establishing a new mapping and instead return
+ * a pointer into the direct map.
+ *
+ * MEMREMAP_WT - establish a mapping whereby writes either bypass the
+ * cache or are written through to memory and never exist in a
+ * cache-dirty state with respect to program visibility.  Attempts to
+ * map "System RAM" with this mapping type will fail.
+ */
+void *memremap(resource_size_t offset, size_t size, unsigned long flags)
+{
+       int is_ram = region_intersects(offset, size, "System RAM");
+       void *addr = NULL;
+
+       if (is_ram == REGION_MIXED) {
+               WARN_ONCE(1, "memremap attempted on mixed range %pa size: %#lx\n",
+                               &offset, (unsigned long) size);
+               return NULL;
+       }
+
+       /* Try all mapping types requested until one returns non-NULL */
+       if (flags & MEMREMAP_WB) {
+               flags &= ~MEMREMAP_WB;
+               /*
+                * MEMREMAP_WB is special in that it can be satisfied
+                * from the direct map.  Some archs depend on the
+                * capability of memremap() to autodetect cases where
+                * the requested range is potentially in "System RAM"
+                */
+               if (is_ram == REGION_INTERSECTS)
+                       addr = __va(offset);
+               else
+                       addr = ioremap_cache(offset, size);
+       }
+
+       /*
+        * If we don't have a mapping yet and more request flags are
+        * pending then we will be attempting to establish a new virtual
+        * address mapping.  Enforce that this mapping is not aliasing
+        * "System RAM"
+        */
+       if (!addr && is_ram == REGION_INTERSECTS && flags) {
+               WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n",
+                               &offset, (unsigned long) size);
+               return NULL;
+       }
+
+       if (!addr && (flags & MEMREMAP_WT)) {
+               flags &= ~MEMREMAP_WT;
+               addr = ioremap_wt(offset, size);
+       }
+
+       return addr;
+}
+EXPORT_SYMBOL(memremap);
+
+void memunmap(void *addr)
+{
+       if (is_vmalloc_addr(addr))
+               iounmap((void __iomem *) addr);
+}
+EXPORT_SYMBOL(memunmap);
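A hedged driver-side sketch of the intended usage follows; the start address and length are assumed to come from firmware and are made up here, only memremap(), memunmap(), and the MEMREMAP_WB flag come from this patch.

/*
 * Hypothetical driver sketch: map a firmware-described range as ordinary
 * cacheable memory and access it with plain loads and stores.
 */
static void *example_map_pmem(resource_size_t start, size_t len)
{
        void *addr = memremap(start, len, MEMREMAP_WB);

        if (!addr)
                return NULL;

        memset(addr, 0, len);   /* no __iomem accessors needed */
        return addr;
}

static void example_unmap_pmem(void *addr)
{
        memunmap(addr);
}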
+
+static void devm_memremap_release(struct device *dev, void *res)
+{
+       memunmap(res);
+}
+
+static int devm_memremap_match(struct device *dev, void *res, void *match_data)
+{
+       return *(void **)res == match_data;
+}
+
+void *devm_memremap(struct device *dev, resource_size_t offset,
+               size_t size, unsigned long flags)
+{
+       void **ptr, *addr;
+
+       ptr = devres_alloc(devm_memremap_release, sizeof(*ptr), GFP_KERNEL);
+       if (!ptr)
+               return NULL;
+
+       addr = memremap(offset, size, flags);
+       if (addr) {
+               *ptr = addr;
+               devres_add(dev, ptr);
+       } else
+               devres_free(ptr);
+
+       return addr;
+}
+EXPORT_SYMBOL(devm_memremap);
+
+void devm_memunmap(struct device *dev, void *addr)
+{
+       WARN_ON(devres_destroy(dev, devm_memremap_release, devm_memremap_match,
+                              addr));
+       memunmap(addr);
+}
+EXPORT_SYMBOL(devm_memunmap);
+
+#ifdef CONFIG_ZONE_DEVICE
+struct page_map {
+       struct resource res;
+};
+
+static void devm_memremap_pages_release(struct device *dev, void *res)
+{
+       struct page_map *page_map = res;
+
+       /* pages are dead and unused, undo the arch mapping */
+       arch_remove_memory(page_map->res.start, resource_size(&page_map->res));
+}
+
+void *devm_memremap_pages(struct device *dev, struct resource *res)
+{
+       int is_ram = region_intersects(res->start, resource_size(res),
+                       "System RAM");
+       struct page_map *page_map;
+       int error, nid;
+
+       if (is_ram == REGION_MIXED) {
+               WARN_ONCE(1, "%s attempted on mixed region %pr\n",
+                               __func__, res);
+               return ERR_PTR(-ENXIO);
+       }
+
+       if (is_ram == REGION_INTERSECTS)
+               return __va(res->start);
+
+       page_map = devres_alloc(devm_memremap_pages_release,
+                       sizeof(*page_map), GFP_KERNEL);
+       if (!page_map)
+               return ERR_PTR(-ENOMEM);
+
+       memcpy(&page_map->res, res, sizeof(*res));
+
+       nid = dev_to_node(dev);
+       if (nid < 0)
+               nid = 0;
+
+       error = arch_add_memory(nid, res->start, resource_size(res), true);
+       if (error) {
+               devres_free(page_map);
+               return ERR_PTR(error);
+       }
+
+       devres_add(dev, page_map);
+       return __va(res->start);
+}
+EXPORT_SYMBOL(devm_memremap_pages);
+#endif /* CONFIG_ZONE_DEVICE */
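For the ZONE_DEVICE case, here is a hedged sketch of a driver handing a reserved, non-System-RAM resource to devm_memremap_pages(); the wrapper is illustrative, only the helper comes from this patch.

/* Hypothetical driver sketch: give a device-backed range struct pages. */
static void *example_map_device_pages(struct device *dev, struct resource *res)
{
        void *base = devm_memremap_pages(dev, res);

        if (IS_ERR(base))
                return base;

        dev_info(dev, "mapped %pR at %p\n", res, base);
        return base;
}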
index be5b8fac4bd0de72aba1f91674a2d0eb7a296d31..bd62f5cda74673377ef080c718163917387ddde0 100644 (file)
  */
 
 #include <linux/kernel.h>
-#include <linux/err.h>
-#include <crypto/public_key.h>
-#include <crypto/hash.h>
-#include <keys/asymmetric-type.h>
 #include <keys/system_keyring.h>
+#include <crypto/public_key.h>
 #include "module-internal.h"
 
 /*
  *     - Information block
  */
 struct module_signature {
-       u8      algo;           /* Public-key crypto algorithm [enum pkey_algo] */
-       u8      hash;           /* Digest algorithm [enum hash_algo] */
-       u8      id_type;        /* Key identifier type [enum pkey_id_type] */
-       u8      signer_len;     /* Length of signer's name */
-       u8      key_id_len;     /* Length of key identifier */
+       u8      algo;           /* Public-key crypto algorithm [0] */
+       u8      hash;           /* Digest algorithm [0] */
+       u8      id_type;        /* Key identifier type [PKEY_ID_PKCS7] */
+       u8      signer_len;     /* Length of signer's name [0] */
+       u8      key_id_len;     /* Length of key identifier [0] */
        u8      __pad[3];
        __be32  sig_len;        /* Length of signature data */
 };
 
-/*
- * Digest the module contents.
- */
-static struct public_key_signature *mod_make_digest(enum hash_algo hash,
-                                                   const void *mod,
-                                                   unsigned long modlen)
-{
-       struct public_key_signature *pks;
-       struct crypto_shash *tfm;
-       struct shash_desc *desc;
-       size_t digest_size, desc_size;
-       int ret;
-
-       pr_devel("==>%s()\n", __func__);
-       
-       /* Allocate the hashing algorithm we're going to need and find out how
-        * big the hash operational data will be.
-        */
-       tfm = crypto_alloc_shash(hash_algo_name[hash], 0, 0);
-       if (IS_ERR(tfm))
-               return (PTR_ERR(tfm) == -ENOENT) ? ERR_PTR(-ENOPKG) : ERR_CAST(tfm);
-
-       desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
-       digest_size = crypto_shash_digestsize(tfm);
-
-       /* We allocate the hash operational data storage on the end of our
-        * context data and the digest output buffer on the end of that.
-        */
-       ret = -ENOMEM;
-       pks = kzalloc(digest_size + sizeof(*pks) + desc_size, GFP_KERNEL);
-       if (!pks)
-               goto error_no_pks;
-
-       pks->pkey_hash_algo     = hash;
-       pks->digest             = (u8 *)pks + sizeof(*pks) + desc_size;
-       pks->digest_size        = digest_size;
-
-       desc = (void *)pks + sizeof(*pks);
-       desc->tfm   = tfm;
-       desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
-
-       ret = crypto_shash_init(desc);
-       if (ret < 0)
-               goto error;
-
-       ret = crypto_shash_finup(desc, mod, modlen, pks->digest);
-       if (ret < 0)
-               goto error;
-
-       crypto_free_shash(tfm);
-       pr_devel("<==%s() = ok\n", __func__);
-       return pks;
-
-error:
-       kfree(pks);
-error_no_pks:
-       crypto_free_shash(tfm);
-       pr_devel("<==%s() = %d\n", __func__, ret);
-       return ERR_PTR(ret);
-}
-
-/*
- * Extract an MPI array from the signature data.  This represents the actual
- * signature.  Each raw MPI is prefaced by a BE 2-byte value indicating the
- * size of the MPI in bytes.
- *
- * RSA signatures only have one MPI, so currently we only read one.
- */
-static int mod_extract_mpi_array(struct public_key_signature *pks,
-                                const void *data, size_t len)
-{
-       size_t nbytes;
-       MPI mpi;
-
-       if (len < 3)
-               return -EBADMSG;
-       nbytes = ((const u8 *)data)[0] << 8 | ((const u8 *)data)[1];
-       data += 2;
-       len -= 2;
-       if (len != nbytes)
-               return -EBADMSG;
-
-       mpi = mpi_read_raw_data(data, nbytes);
-       if (!mpi)
-               return -ENOMEM;
-       pks->mpi[0] = mpi;
-       pks->nr_mpi = 1;
-       return 0;
-}
-
-/*
- * Request an asymmetric key.
- */
-static struct key *request_asymmetric_key(const char *signer, size_t signer_len,
-                                         const u8 *key_id, size_t key_id_len)
-{
-       key_ref_t key;
-       size_t i;
-       char *id, *q;
-
-       pr_devel("==>%s(,%zu,,%zu)\n", __func__, signer_len, key_id_len);
-
-       /* Construct an identifier. */
-       id = kmalloc(signer_len + 2 + key_id_len * 2 + 1, GFP_KERNEL);
-       if (!id)
-               return ERR_PTR(-ENOKEY);
-
-       memcpy(id, signer, signer_len);
-
-       q = id + signer_len;
-       *q++ = ':';
-       *q++ = ' ';
-       for (i = 0; i < key_id_len; i++) {
-               *q++ = hex_asc[*key_id >> 4];
-               *q++ = hex_asc[*key_id++ & 0x0f];
-       }
-
-       *q = 0;
-
-       pr_debug("Look up: \"%s\"\n", id);
-
-       key = keyring_search(make_key_ref(system_trusted_keyring, 1),
-                            &key_type_asymmetric, id);
-       if (IS_ERR(key))
-               pr_warn("Request for unknown module key '%s' err %ld\n",
-                       id, PTR_ERR(key));
-       kfree(id);
-
-       if (IS_ERR(key)) {
-               switch (PTR_ERR(key)) {
-                       /* Hide some search errors */
-               case -EACCES:
-               case -ENOTDIR:
-               case -EAGAIN:
-                       return ERR_PTR(-ENOKEY);
-               default:
-                       return ERR_CAST(key);
-               }
-       }
-
-       pr_devel("<==%s() = 0 [%x]\n", __func__, key_serial(key_ref_to_ptr(key)));
-       return key_ref_to_ptr(key);
-}
-
 /*
  * Verify the signature on a module.
  */
 int mod_verify_sig(const void *mod, unsigned long *_modlen)
 {
-       struct public_key_signature *pks;
        struct module_signature ms;
-       struct key *key;
-       const void *sig;
        size_t modlen = *_modlen, sig_len;
-       int ret;
 
        pr_devel("==>%s(,%zu)\n", __func__, modlen);
 
@@ -205,46 +54,24 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen)
        if (sig_len >= modlen)
                return -EBADMSG;
        modlen -= sig_len;
-       if ((size_t)ms.signer_len + ms.key_id_len >= modlen)
-               return -EBADMSG;
-       modlen -= (size_t)ms.signer_len + ms.key_id_len;
-
        *_modlen = modlen;
-       sig = mod + modlen;
-
-       /* For the moment, only support RSA and X.509 identifiers */
-       if (ms.algo != PKEY_ALGO_RSA ||
-           ms.id_type != PKEY_ID_X509)
-               return -ENOPKG;
 
-       if (ms.hash >= PKEY_HASH__LAST ||
-           !hash_algo_name[ms.hash])
+       if (ms.id_type != PKEY_ID_PKCS7) {
+               pr_err("Module is not signed with expected PKCS#7 message\n");
                return -ENOPKG;
-
-       key = request_asymmetric_key(sig, ms.signer_len,
-                                    sig + ms.signer_len, ms.key_id_len);
-       if (IS_ERR(key))
-               return PTR_ERR(key);
-
-       pks = mod_make_digest(ms.hash, mod, modlen);
-       if (IS_ERR(pks)) {
-               ret = PTR_ERR(pks);
-               goto error_put_key;
        }
 
-       ret = mod_extract_mpi_array(pks, sig + ms.signer_len + ms.key_id_len,
-                                   sig_len);
-       if (ret < 0)
-               goto error_free_pks;
-
-       ret = verify_signature(key, pks);
-       pr_devel("verify_signature() = %d\n", ret);
+       if (ms.algo != 0 ||
+           ms.hash != 0 ||
+           ms.signer_len != 0 ||
+           ms.key_id_len != 0 ||
+           ms.__pad[0] != 0 ||
+           ms.__pad[1] != 0 ||
+           ms.__pad[2] != 0) {
+               pr_err("PKCS#7 signature info has unexpected non-zero params\n");
+               return -EBADMSG;
+       }
 
-error_free_pks:
-       mpi_free(pks->rsa.s);
-       kfree(pks);
-error_put_key:
-       key_put(key);
-       pr_devel("<==%s() = %d\n", __func__, ret);
-       return ret;     
+       return system_verify_data(mod, modlen, mod + modlen, sig_len,
+                                 VERIFYING_MODULE_SIGNATURE);
 }
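
The rewritten path accepts only PKCS#7-signed modules and insists that every legacy field in the signature trailer is zero before handing the blob to system_verify_data(). A standalone model of that sanity check, assuming a trailer layout like the kernel's module_signature and an illustrative PKEY_ID_PKCS7 value:

#include <stdint.h>
#include <string.h>

/* Assumed mirror of the kernel's module_signature trailer layout. */
struct mod_sig_info {
        uint8_t  algo, hash, id_type;
        uint8_t  signer_len, key_id_len;
        uint8_t  pad[3];
        uint32_t sig_len;                   /* big-endian in the real format */
};

#define PKEY_ID_PKCS7 2                     /* value assumed for illustration */

static int trailer_is_valid_pkcs7(const struct mod_sig_info *ms)
{
        static const uint8_t zero[3];

        if (ms->id_type != PKEY_ID_PKCS7)
                return 0;                   /* not a PKCS#7 signature */
        /* every legacy field must be zero for PKCS#7-signed modules */
        return ms->algo == 0 && ms->hash == 0 &&
               ms->signer_len == 0 && ms->key_id_len == 0 &&
               memcmp(ms->pad, zero, sizeof(zero)) == 0;
}
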
index cf8c24203368651af417eba7525a053e9cc8ff93..8f0324ef72ab374925badb5454aa0a79ae731c61 100644 (file)
@@ -835,7 +835,7 @@ const struct file_operations kmsg_fops = {
        .release = devkmsg_release,
 };
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 /*
  * This appends the listed symbols to /proc/vmcore
  *
index a7bcd28d6e9f5861670cc029a62ebac61047a4dc..99513e1160e518d322f6d0ce0f346e6da9fcbbf0 100644 (file)
@@ -339,7 +339,7 @@ static int profile_cpu_callback(struct notifier_block *info,
                node = cpu_to_mem(cpu);
                per_cpu(cpu_profile_flip, cpu) = 0;
                if (!per_cpu(cpu_profile_hits, cpu)[1]) {
-                       page = alloc_pages_exact_node(node,
+                       page = __alloc_pages_node(node,
                                        GFP_KERNEL | __GFP_ZERO,
                                        0);
                        if (!page)
@@ -347,7 +347,7 @@ static int profile_cpu_callback(struct notifier_block *info,
                        per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
                }
                if (!per_cpu(cpu_profile_hits, cpu)[0]) {
-                       page = alloc_pages_exact_node(node,
+                       page = __alloc_pages_node(node,
                                        GFP_KERNEL | __GFP_ZERO,
                                        0);
                        if (!page)
@@ -543,14 +543,14 @@ static int create_hash_tables(void)
                int node = cpu_to_mem(cpu);
                struct page *page;
 
-               page = alloc_pages_exact_node(node,
+               page = __alloc_pages_node(node,
                                GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
                                0);
                if (!page)
                        goto out_cleanup;
                per_cpu(cpu_profile_hits, cpu)[1]
                                = (struct profile_hit *)page_address(page);
-               page = alloc_pages_exact_node(node,
+               page = __alloc_pages_node(node,
                                GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
                                0);
                if (!page)
index c8e0e050a36afb0ccb875e13d9f2b526b0af4d29..787320de68e02425e8506363681b8875b6ee6d57 100644 (file)
@@ -556,6 +556,19 @@ static int ptrace_setoptions(struct task_struct *child, unsigned long data)
        if (data & ~(unsigned long)PTRACE_O_MASK)
                return -EINVAL;
 
+       if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) {
+               if (!config_enabled(CONFIG_CHECKPOINT_RESTORE) ||
+                   !config_enabled(CONFIG_SECCOMP))
+                       return -EINVAL;
+
+               if (!capable(CAP_SYS_ADMIN))
+                       return -EPERM;
+
+               if (seccomp_mode(&current->seccomp) != SECCOMP_MODE_DISABLED ||
+                   current->ptrace & PT_SUSPEND_SECCOMP)
+                       return -EPERM;
+       }
+
        /* Avoid intermediate state when all opts are cleared */
        flags = child->ptrace;
        flags &= ~(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT);
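
From userspace the new option is requested like any other ptrace option; it is only accepted when the kernel has CONFIG_CHECKPOINT_RESTORE and CONFIG_SECCOMP, and the tracer has CAP_SYS_ADMIN and is not itself confined by seccomp. A hedged sketch of a CRIU-style tracer (the fallback #define matches the Linux 4.3 uapi value, but treat header availability as an assumption about the build environment):

#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>

#ifndef PTRACE_O_SUSPEND_SECCOMP
#define PTRACE_O_SUSPEND_SECCOMP (1 << 21)  /* value per Linux 4.3 uapi */
#endif

/* Attach and ask the kernel to suspend seccomp filtering on the tracee. */
static int suspend_seccomp(pid_t pid)
{
        if (ptrace(PTRACE_ATTACH, pid, 0, 0) == -1)
                return -1;
        if (waitpid(pid, NULL, 0) == -1)    /* wait for the ptrace stop */
                return -1;
        /* EINVAL without CHECKPOINT_RESTORE/SECCOMP, EPERM without CAP_SYS_ADMIN */
        if (ptrace(PTRACE_SETOPTIONS, pid, 0, PTRACE_O_SUSPEND_SECCOMP) == -1) {
                perror("PTRACE_SETOPTIONS");
                ptrace(PTRACE_DETACH, pid, 0, 0);
                return -1;
        }
        return 0;
}
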
index d20c85d9f8c0d71df00a2ac7297d4ace5ed18323..bd30a973fe946b03916a1eeb873928adfe1b32b0 100644 (file)
@@ -346,7 +346,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
                kernel_restart(buffer);
                break;
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
        case LINUX_REBOOT_CMD_KEXEC:
                ret = kernel_kexec();
                break;
index fed052a1bc9f5792c7cb856336f2093e75aa2432..f150dbbe6f62d31aa07b3fbf3f07a8807fa40991 100644 (file)
@@ -492,40 +492,51 @@ int __weak page_is_ram(unsigned long pfn)
 }
 EXPORT_SYMBOL_GPL(page_is_ram);
 
-/*
- * Search for a resouce entry that fully contains the specified region.
- * If found, return 1 if it is RAM, 0 if not.
- * If not found, or region is not fully contained, return -1
+/**
+ * region_intersects() - determine intersection of region with known resources
+ * @start: region start address
+ * @size: size of region
+ * @name: name of resource (in iomem_resource)
  *
- * Used by the ioremap functions to ensure the user is not remapping RAM and is
- * a vast speed up over walking through the resource table page by page.
+ * Check if the specified region partially overlaps or fully eclipses a
+ * resource identified by @name.  Return REGION_DISJOINT if the region
+ * does not overlap @name, return REGION_MIXED if the region overlaps
+ * @name and another resource, and return REGION_INTERSECTS if the
+ * region overlaps @name and no other defined resource. Note that
+ * REGION_INTERSECTS is also returned in the case when the specified
+ * region overlaps RAM and undefined memory holes.
+ *
+ * region_intersects() is used by memory remapping functions to ensure
+ * the user is not remapping RAM and is a vast speed up over walking
+ * through the resource table page by page.
  */
-int region_is_ram(resource_size_t start, unsigned long size)
+int region_intersects(resource_size_t start, size_t size, const char *name)
 {
-       struct resource *p;
-       resource_size_t end = start + size - 1;
        unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY;
-       const char *name = "System RAM";
-       int ret = -1;
+       resource_size_t end = start + size - 1;
+       int type = 0; int other = 0;
+       struct resource *p;
 
        read_lock(&resource_lock);
        for (p = iomem_resource.child; p ; p = p->sibling) {
-               if (p->end < start)
-                       continue;
-
-               if (p->start <= start && end <= p->end) {
-                       /* resource fully contains region */
-                       if ((p->flags != flags) || strcmp(p->name, name))
-                               ret = 0;
-                       else
-                               ret = 1;
-                       break;
-               }
-               if (end < p->start)
-                       break;  /* not found */
+               bool is_type = strcmp(p->name, name) == 0 && p->flags == flags;
+
+               if (start >= p->start && start <= p->end)
+                       is_type ? type++ : other++;
+               if (end >= p->start && end <= p->end)
+                       is_type ? type++ : other++;
+               if (p->start >= start && p->end <= end)
+                       is_type ? type++ : other++;
        }
        read_unlock(&resource_lock);
-       return ret;
+
+       if (other == 0)
+               return type ? REGION_INTERSECTS : REGION_DISJOINT;
+
+       if (type)
+               return REGION_MIXED;
+
+       return REGION_DISJOINT;
 }
 
 void __weak arch_remove_reservations(struct resource *avail)
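
Restated outside the kernel, the classification above just counts how the region's endpoints and any full containment hit the named resource versus everything else. A standalone model under the same three tests and return rules (names are illustrative, not kernel symbols):

enum region_kind { REGION_INTERSECTS, REGION_DISJOINT, REGION_MIXED };

struct res { unsigned long start, end; int is_named; };

static enum region_kind classify(unsigned long start, unsigned long end,
                                 const struct res *tbl, int n)
{
        int type = 0, other = 0, i;

        for (i = 0; i < n; i++) {
                const struct res *p = &tbl[i];
                int *cnt = p->is_named ? &type : &other;

                if (start >= p->start && start <= p->end)   /* start inside */
                        (*cnt)++;
                if (end >= p->start && end <= p->end)       /* end inside */
                        (*cnt)++;
                if (p->start >= start && p->end <= end)     /* fully eclipsed */
                        (*cnt)++;
        }
        if (other == 0)
                return type ? REGION_INTERSECTS : REGION_DISJOINT;
        return type ? REGION_MIXED : REGION_DISJOINT;
}
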
index 3595403921bd5be10c3e5e591bf04916e654423d..97d276ff1edb1225f0ad894cb66b052be36b2104 100644 (file)
@@ -621,18 +621,21 @@ int get_nohz_timer_target(void)
        int i, cpu = smp_processor_id();
        struct sched_domain *sd;
 
-       if (!idle_cpu(cpu))
+       if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
                return cpu;
 
        rcu_read_lock();
        for_each_domain(cpu, sd) {
                for_each_cpu(i, sched_domain_span(sd)) {
-                       if (!idle_cpu(i)) {
+                       if (!idle_cpu(i) && is_housekeeping_cpu(cpu)) {
                                cpu = i;
                                goto unlock;
                        }
                }
        }
+
+       if (!is_housekeeping_cpu(cpu))
+               cpu = housekeeping_any_cpu();
 unlock:
        rcu_read_unlock();
        return cpu;
@@ -5178,24 +5181,47 @@ static void migrate_tasks(struct rq *dead_rq)
                        break;
 
                /*
-                * Ensure rq->lock covers the entire task selection
-                * until the migration.
+                * pick_next_task assumes pinned rq->lock.
                 */
                lockdep_pin_lock(&rq->lock);
                next = pick_next_task(rq, &fake_task);
                BUG_ON(!next);
                next->sched_class->put_prev_task(rq, next);
 
+               /*
+                * Rules for changing task_struct::cpus_allowed are holding
+                * both pi_lock and rq->lock, such that holding either
+                * stabilizes the mask.
+                *
+                * Dropping rq->lock is not quite as disastrous as it usually is
+                * because !cpu_active at this point, which means load-balance
+                * will not interfere. Also, stop-machine.
+                */
+               lockdep_unpin_lock(&rq->lock);
+               raw_spin_unlock(&rq->lock);
+               raw_spin_lock(&next->pi_lock);
+               raw_spin_lock(&rq->lock);
+
+               /*
+                * Since we're inside stop-machine, _nothing_ should have
+                * changed the task, WARN if weird stuff happened, because in
+                * that case the above rq->lock drop is a fail too.
+                */
+               if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
+                       raw_spin_unlock(&next->pi_lock);
+                       continue;
+               }
+
                /* Find suitable destination for @next, with force if needed. */
                dest_cpu = select_fallback_rq(dead_rq->cpu, next);
 
-               lockdep_unpin_lock(&rq->lock);
                rq = __migrate_task(rq, next, dest_cpu);
                if (rq != dead_rq) {
                        raw_spin_unlock(&rq->lock);
                        rq = dead_rq;
                        raw_spin_lock(&rq->lock);
                }
+               raw_spin_unlock(&next->pi_lock);
        }
 
        rq->stop = stop;
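
The lock dance in migrate_tasks() follows a common pattern: to acquire a lock that ranks above the one already held, drop the held lock, take both in the correct order, then re-validate whatever the drop may have invalidated. A standalone pthread sketch of that pattern (the struct and field names are illustrative, not the scheduler's):

#include <pthread.h>
#include <stdbool.h>

struct task { pthread_mutex_t pi_lock; int on_rq; };
struct rq   { pthread_mutex_t lock; };

/* Called with rq->lock held; returns with rq->lock held either way. */
static bool lock_task_and_rq(struct rq *rq, struct task *t)
{
        pthread_mutex_unlock(&rq->lock);    /* drop the lower-ranked lock */
        pthread_mutex_lock(&t->pi_lock);    /* take the higher-ranked lock */
        pthread_mutex_lock(&rq->lock);      /* re-take the one we dropped */

        if (!t->on_rq) {                    /* re-check: state may have changed */
                pthread_mutex_unlock(&t->pi_lock);
                return false;
        }
        return true;                        /* caller also holds t->pi_lock now */
}
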
index 245df6b32b81f8eef778a203c2edb8432a52abd6..5bd4779282df00e8831d07ec09b80f9e07b73ad7 100644 (file)
@@ -175,17 +175,16 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
  */
 static u32 seccomp_run_filters(struct seccomp_data *sd)
 {
-       struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter);
        struct seccomp_data sd_local;
        u32 ret = SECCOMP_RET_ALLOW;
+       /* Make sure cross-thread synced filter points somewhere sane. */
+       struct seccomp_filter *f =
+                       lockless_dereference(current->seccomp.filter);
 
        /* Ensure unexpected behavior doesn't result in failing open. */
        if (unlikely(WARN_ON(f == NULL)))
                return SECCOMP_RET_KILL;
 
-       /* Make sure cross-thread synced filter points somewhere sane. */
-       smp_read_barrier_depends();
-
        if (!sd) {
                populate_seccomp_data(&sd_local);
                sd = &sd_local;
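
The explicit smp_read_barrier_depends() goes away because lockless_dereference() folds the dependent-load ordering into the read itself. Roughly, and not verbatim, the macro at this point in the tree expands to:

/* sketch of what lockless_dereference(p) does, not the verbatim kernel macro */
#define lockless_dereference(p)                                         \
({                                                                      \
        typeof(p) __p = READ_ONCE(p);                                   \
        smp_read_barrier_depends();   /* order dependent loads after __p */ \
        __p;                                                            \
})
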
@@ -549,7 +548,11 @@ void secure_computing_strict(int this_syscall)
 {
        int mode = current->seccomp.mode;
 
-       if (mode == 0)
+       if (config_enabled(CONFIG_CHECKPOINT_RESTORE) &&
+           unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
+               return;
+
+       if (mode == SECCOMP_MODE_DISABLED)
                return;
        else if (mode == SECCOMP_MODE_STRICT)
                __secure_computing_strict(this_syscall);
@@ -650,6 +653,10 @@ u32 seccomp_phase1(struct seccomp_data *sd)
        int this_syscall = sd ? sd->nr :
                syscall_get_nr(current, task_pt_regs(current));
 
+       if (config_enabled(CONFIG_CHECKPOINT_RESTORE) &&
+           unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
+               return SECCOMP_PHASE1_OK;
+
        switch (mode) {
        case SECCOMP_MODE_STRICT:
                __secure_computing_strict(this_syscall);  /* may call do_exit */
index 03c3875d995898b2af39ac593379085cb250c242..a02decf155832fa03117ce6e6ebf89e6a0b2809f 100644 (file)
@@ -245,3 +245,6 @@ cond_syscall(sys_bpf);
 
 /* execveat */
 cond_syscall(sys_execveat);
+
+/* membarrier */
+cond_syscall(sys_membarrier);
index 19b62b522158acb6414cd7440b25e64bd16add35..e69201d8094eb8bed747329afc17528c4315a6b7 100644 (file)
@@ -621,7 +621,7 @@ static struct ctl_table kern_table[] = {
                .proc_handler   = proc_dointvec,
        },
 #endif
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
        {
                .procname       = "kexec_load_disabled",
                .data           = &kexec_load_disabled,
@@ -1995,7 +1995,7 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
                int val = *valp;
                if (val < 0) {
                        *negp = true;
-                       *lvalp = (unsigned long)-val;
+                       *lvalp = -(unsigned long)val;
                } else {
                        *negp = false;
                        *lvalp = (unsigned long)val;
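
The repeated change to -(unsigned long)val is about overflow: negating INT_MIN as a signed int is undefined, while converting to unsigned long first makes the negation well-defined modular arithmetic. A small standalone illustration (LP64 assumed for the printed value):

#include <limits.h>
#include <stdio.h>

int main(void)
{
        int val = INT_MIN;

        /* (unsigned long)-val would negate val as an int first: undefined */
        /* -(unsigned long)val converts first, then negates: well defined  */
        unsigned long ok = -(unsigned long)val;

        printf("%lu\n", ok);        /* prints 2147483648 on LP64 */
        return 0;
}
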
@@ -2201,7 +2201,7 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
                int val = *valp;
                if (val < 0) {
                        *negp = true;
-                       *lvalp = (unsigned long)-val;
+                       *lvalp = -(unsigned long)val;
                } else {
                        *negp = false;
                        *lvalp = (unsigned long)val;
@@ -2436,7 +2436,7 @@ static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp,
                unsigned long lval;
                if (val < 0) {
                        *negp = true;
-                       lval = (unsigned long)-val;
+                       lval = -(unsigned long)val;
                } else {
                        *negp = false;
                        lval = (unsigned long)val;
@@ -2459,7 +2459,7 @@ static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp
                unsigned long lval;
                if (val < 0) {
                        *negp = true;
-                       lval = (unsigned long)-val;
+                       lval = -(unsigned long)val;
                } else {
                        *negp = false;
                        lval = (unsigned long)val;
@@ -2484,7 +2484,7 @@ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
                unsigned long lval;
                if (val < 0) {
                        *negp = true;
-                       lval = (unsigned long)-val;
+                       lval = -(unsigned long)val;
                } else {
                        *negp = false;
                        lval = (unsigned long)val;
diff --git a/kernel/system_certificates.S b/kernel/system_certificates.S
deleted file mode 100644 (file)
index 3e9868d..0000000
+++ /dev/null
@@ -1,20 +0,0 @@
-#include <linux/export.h>
-#include <linux/init.h>
-
-       __INITRODATA
-
-       .align 8
-       .globl VMLINUX_SYMBOL(system_certificate_list)
-VMLINUX_SYMBOL(system_certificate_list):
-__cert_list_start:
-       .incbin "kernel/x509_certificate_list"
-__cert_list_end:
-
-       .align 8
-       .globl VMLINUX_SYMBOL(system_certificate_list_size)
-VMLINUX_SYMBOL(system_certificate_list_size):
-#ifdef CONFIG_64BIT
-       .quad __cert_list_end - __cert_list_start
-#else
-       .long __cert_list_end - __cert_list_start
-#endif
diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c
deleted file mode 100644 (file)
index 875f64e..0000000
+++ /dev/null
@@ -1,106 +0,0 @@
-/* System trusted keyring for trusted public keys
- *
- * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- */
-
-#include <linux/export.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/cred.h>
-#include <linux/err.h>
-#include <keys/asymmetric-type.h>
-#include <keys/system_keyring.h>
-#include "module-internal.h"
-
-struct key *system_trusted_keyring;
-EXPORT_SYMBOL_GPL(system_trusted_keyring);
-
-extern __initconst const u8 system_certificate_list[];
-extern __initconst const unsigned long system_certificate_list_size;
-
-/*
- * Load the compiled-in keys
- */
-static __init int system_trusted_keyring_init(void)
-{
-       pr_notice("Initialise system trusted keyring\n");
-
-       system_trusted_keyring =
-               keyring_alloc(".system_keyring",
-                             KUIDT_INIT(0), KGIDT_INIT(0), current_cred(),
-                             ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
-                             KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH),
-                             KEY_ALLOC_NOT_IN_QUOTA, NULL);
-       if (IS_ERR(system_trusted_keyring))
-               panic("Can't allocate system trusted keyring\n");
-
-       set_bit(KEY_FLAG_TRUSTED_ONLY, &system_trusted_keyring->flags);
-       return 0;
-}
-
-/*
- * Must be initialised before we try and load the keys into the keyring.
- */
-device_initcall(system_trusted_keyring_init);
-
-/*
- * Load the compiled-in list of X.509 certificates.
- */
-static __init int load_system_certificate_list(void)
-{
-       key_ref_t key;
-       const u8 *p, *end;
-       size_t plen;
-
-       pr_notice("Loading compiled-in X.509 certificates\n");
-
-       p = system_certificate_list;
-       end = p + system_certificate_list_size;
-       while (p < end) {
-               /* Each cert begins with an ASN.1 SEQUENCE tag and must be more
-                * than 256 bytes in size.
-                */
-               if (end - p < 4)
-                       goto dodgy_cert;
-               if (p[0] != 0x30 &&
-                   p[1] != 0x82)
-                       goto dodgy_cert;
-               plen = (p[2] << 8) | p[3];
-               plen += 4;
-               if (plen > end - p)
-                       goto dodgy_cert;
-
-               key = key_create_or_update(make_key_ref(system_trusted_keyring, 1),
-                                          "asymmetric",
-                                          NULL,
-                                          p,
-                                          plen,
-                                          ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
-                                          KEY_USR_VIEW | KEY_USR_READ),
-                                          KEY_ALLOC_NOT_IN_QUOTA |
-                                          KEY_ALLOC_TRUSTED);
-               if (IS_ERR(key)) {
-                       pr_err("Problem loading in-kernel X.509 certificate (%ld)\n",
-                              PTR_ERR(key));
-               } else {
-                       set_bit(KEY_FLAG_BUILTIN, &key_ref_to_ptr(key)->flags);
-                       pr_notice("Loaded X.509 cert '%s'\n",
-                                 key_ref_to_ptr(key)->description);
-                       key_ref_put(key);
-               }
-               p += plen;
-       }
-
-       return 0;
-
-dodgy_cert:
-       pr_err("Problem parsing in-kernel X.509 certificate list\n");
-       return 0;
-}
-late_initcall(load_system_certificate_list);
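
The removed loader walks the blob by reading each certificate's outer SEQUENCE header: tag 0x30, long-form length byte 0x82, then a 2-byte big-endian payload length, so the on-disk size is the payload plus the 4 header bytes. A standalone sketch of that step (der_cert_total_len() is an illustrative name):

#include <stddef.h>
#include <stdint.h>

/* Return the total encoded size of the next certificate, or 0 on error. */
static size_t der_cert_total_len(const uint8_t *p, size_t remaining)
{
        size_t plen;

        if (remaining < 4 || p[0] != 0x30 || p[1] != 0x82)
                return 0;                       /* not a long-form SEQUENCE */
        plen = ((size_t)p[2] << 8) | p[3];      /* big-endian payload length */
        plen += 4;                              /* include tag + length bytes */
        return plen <= remaining ? plen : 0;
}
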
index 50eb107f119877ab0762bb671a69cf9c0cd418bc..a9b76a40319e86b5d545621d274c262bc69c7c70 100644 (file)
@@ -97,20 +97,6 @@ EXPORT_SYMBOL_GPL(clockevent_delta2ns);
 static int __clockevents_switch_state(struct clock_event_device *dev,
                                      enum clock_event_state state)
 {
-       /* Transition with legacy set_mode() callback */
-       if (dev->set_mode) {
-               /* Legacy callback doesn't support new modes */
-               if (state > CLOCK_EVT_STATE_ONESHOT)
-                       return -ENOSYS;
-               /*
-                * 'clock_event_state' and 'clock_event_mode' have 1-to-1
-                * mapping until *_ONESHOT, and so a simple cast will work.
-                */
-               dev->set_mode((enum clock_event_mode)state, dev);
-               dev->mode = (enum clock_event_mode)state;
-               return 0;
-       }
-
        if (dev->features & CLOCK_EVT_FEAT_DUMMY)
                return 0;
 
@@ -204,12 +190,8 @@ int clockevents_tick_resume(struct clock_event_device *dev)
 {
        int ret = 0;
 
-       if (dev->set_mode) {
-               dev->set_mode(CLOCK_EVT_MODE_RESUME, dev);
-               dev->mode = CLOCK_EVT_MODE_RESUME;
-       } else if (dev->tick_resume) {
+       if (dev->tick_resume)
                ret = dev->tick_resume(dev);
-       }
 
        return ret;
 }
@@ -460,26 +442,6 @@ int clockevents_unbind_device(struct clock_event_device *ced, int cpu)
 }
 EXPORT_SYMBOL_GPL(clockevents_unbind_device);
 
-/* Sanity check of state transition callbacks */
-static int clockevents_sanity_check(struct clock_event_device *dev)
-{
-       /* Legacy set_mode() callback */
-       if (dev->set_mode) {
-               /* We shouldn't be supporting new modes now */
-               WARN_ON(dev->set_state_periodic || dev->set_state_oneshot ||
-                       dev->set_state_shutdown || dev->tick_resume ||
-                       dev->set_state_oneshot_stopped);
-
-               BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
-               return 0;
-       }
-
-       if (dev->features & CLOCK_EVT_FEAT_DUMMY)
-               return 0;
-
-       return 0;
-}
-
 /**
  * clockevents_register_device - register a clock event device
  * @dev:       device to register
@@ -488,8 +450,6 @@ void clockevents_register_device(struct clock_event_device *dev)
 {
        unsigned long flags;
 
-       BUG_ON(clockevents_sanity_check(dev));
-
        /* Initialize state to DETACHED */
        clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED);
 
index d11c55b6ab7db3585d449ac2a9a5c55ad4a0dd12..4fcd99e12aa01ce3ce0fa24fb219d644a68417a3 100644 (file)
@@ -398,7 +398,6 @@ void tick_shutdown(unsigned int cpu)
                 * the set mode function!
                 */
                clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED);
-               dev->mode = CLOCK_EVT_MODE_UNUSED;
                clockevents_exchange_device(dev, NULL);
                dev->event_handler = clockevents_handle_noop;
                td->evtdev = NULL;
index 3319e16f31e58ed69534ab2fa7e5cd05d7b90d8f..7c7ec45159834a1b25576fbed037c9951f3c076f 100644 (file)
@@ -290,16 +290,17 @@ static int __init tick_nohz_full_setup(char *str)
 __setup("nohz_full=", tick_nohz_full_setup);
 
 static int tick_nohz_cpu_down_callback(struct notifier_block *nfb,
-                                                unsigned long action,
-                                                void *hcpu)
+                                      unsigned long action,
+                                      void *hcpu)
 {
        unsigned int cpu = (unsigned long)hcpu;
 
        switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_DOWN_PREPARE:
                /*
-                * If we handle the timekeeping duty for full dynticks CPUs,
-                * we can't safely shutdown that CPU.
+                * The boot CPU handles housekeeping duty (unbound timers,
+                * workqueues, timekeeping, ...) on behalf of full dynticks
+                * CPUs. It must remain online when nohz full is enabled.
                 */
                if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
                        return NOTIFY_BAD;
@@ -370,6 +371,12 @@ void __init tick_nohz_init(void)
        cpu_notifier(tick_nohz_cpu_down_callback, 0);
        pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n",
                cpumask_pr_args(tick_nohz_full_mask));
+
+       /*
+        * We need at least one CPU to handle housekeeping work such
+        * as timekeeping, unbound timers, workqueues, ...
+        */
+       WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
 }
 #endif
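
In practice the housekeeping set is whatever nohz_full= leaves over. For example, on an 8-CPU machine booted with:

    nohz_full=1-7

CPUs 1-7 run in full dynticks mode and CPU 0 remains the housekeeping CPU that keeps timekeeping, unbound timers and workqueues serviced, which is exactly the situation the WARN_ON_ONCE() above guards.
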
 
index f6ee2e6b6f5dcd53a451caf2a9dc83278df481c6..3739ac6aa47355e7234cf0ee2fc60ebc3adce979 100644 (file)
@@ -1614,7 +1614,7 @@ static __always_inline void timekeeping_freqadjust(struct timekeeper *tk,
        negative = (tick_error < 0);
 
        /* Sort out the magnitude of the correction */
-       tick_error = abs(tick_error);
+       tick_error = abs64(tick_error);
        for (adj = 0; tick_error > interval; adj++)
                tick_error >>= 1;
 
index 129c96033e466cea9e804cb2136348f6eb335e7c..f75e35b6014900da71ffa438a507c34f19e36e3a 100644 (file)
@@ -225,7 +225,7 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
                   (unsigned long long) dev->min_delta_ns);
        SEQ_printf(m, " mult:           %u\n", dev->mult);
        SEQ_printf(m, " shift:          %u\n", dev->shift);
-       SEQ_printf(m, " mode:           %d\n", dev->mode);
+       SEQ_printf(m, " mode:           %d\n", clockevent_get_state(dev));
        SEQ_printf(m, " next_event:     %Ld nsecs\n",
                   (unsigned long long) ktime_to_ns(dev->next_event));
 
@@ -233,40 +233,34 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
        print_name_offset(m, dev->set_next_event);
        SEQ_printf(m, "\n");
 
-       if (dev->set_mode) {
-               SEQ_printf(m, " set_mode:       ");
-               print_name_offset(m, dev->set_mode);
+       if (dev->set_state_shutdown) {
+               SEQ_printf(m, " shutdown: ");
+               print_name_offset(m, dev->set_state_shutdown);
                SEQ_printf(m, "\n");
-       } else {
-               if (dev->set_state_shutdown) {
-                       SEQ_printf(m, " shutdown: ");
-                       print_name_offset(m, dev->set_state_shutdown);
-                       SEQ_printf(m, "\n");
-               }
+       }
 
-               if (dev->set_state_periodic) {
-                       SEQ_printf(m, " periodic: ");
-                       print_name_offset(m, dev->set_state_periodic);
-                       SEQ_printf(m, "\n");
-               }
+       if (dev->set_state_periodic) {
+               SEQ_printf(m, " periodic: ");
+               print_name_offset(m, dev->set_state_periodic);
+               SEQ_printf(m, "\n");
+       }
 
-               if (dev->set_state_oneshot) {
-                       SEQ_printf(m, " oneshot:  ");
-                       print_name_offset(m, dev->set_state_oneshot);
-                       SEQ_printf(m, "\n");
-               }
+       if (dev->set_state_oneshot) {
+               SEQ_printf(m, " oneshot:  ");
+               print_name_offset(m, dev->set_state_oneshot);
+               SEQ_printf(m, "\n");
+       }
 
-               if (dev->set_state_oneshot_stopped) {
-                       SEQ_printf(m, " oneshot stopped: ");
-                       print_name_offset(m, dev->set_state_oneshot_stopped);
-                       SEQ_printf(m, "\n");
-               }
+       if (dev->set_state_oneshot_stopped) {
+               SEQ_printf(m, " oneshot stopped: ");
+               print_name_offset(m, dev->set_state_oneshot_stopped);
+               SEQ_printf(m, "\n");
+       }
 
-               if (dev->tick_resume) {
-                       SEQ_printf(m, " resume:   ");
-                       print_name_offset(m, dev->tick_resume);
-                       SEQ_printf(m, "\n");
-               }
+       if (dev->tick_resume) {
+               SEQ_printf(m, " resume:   ");
+               print_name_offset(m, dev->tick_resume);
+               SEQ_printf(m, "\n");
        }
 
        SEQ_printf(m, " event_handler:  ");
index eb11011b5292add880af7038800560aa29c5a674..b0623ac785a22287526327a021d81fe3eaf5fafb 100644 (file)
@@ -630,13 +630,18 @@ static int function_stat_show(struct seq_file *m, void *v)
                goto out;
        }
 
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+       avg = rec->time;
+       do_div(avg, rec->counter);
+       if (tracing_thresh && (avg < tracing_thresh))
+               goto out;
+#endif
+
        kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
        seq_printf(m, "  %-30.30s  %10lu", str, rec->counter);
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
        seq_puts(m, "    ");
-       avg = rec->time;
-       do_div(avg, rec->counter);
 
        /* Sample standard deviation (s^2) */
        if (rec->counter <= 1)
index 6260717c18e3c6fb8eefa4cfcc185ab8322a6929..fc347f8b1bca24debb823cdeca09424368c16e2e 100644 (file)
@@ -399,6 +399,17 @@ struct rb_irq_work {
        bool                            wakeup_full;
 };
 
+/*
+ * Structure to hold event state and handle nested events.
+ */
+struct rb_event_info {
+       u64                     ts;
+       u64                     delta;
+       unsigned long           length;
+       struct buffer_page      *tail_page;
+       int                     add_timestamp;
+};
+
 /*
  * Used for which event context the event is in.
  *  NMI     = 0
@@ -1876,73 +1887,6 @@ rb_event_index(struct ring_buffer_event *event)
        return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE;
 }
 
-static inline int
-rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
-                  struct ring_buffer_event *event)
-{
-       unsigned long addr = (unsigned long)event;
-       unsigned long index;
-
-       index = rb_event_index(event);
-       addr &= PAGE_MASK;
-
-       return cpu_buffer->commit_page->page == (void *)addr &&
-               rb_commit_index(cpu_buffer) == index;
-}
-
-static void
-rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
-{
-       unsigned long max_count;
-
-       /*
-        * We only race with interrupts and NMIs on this CPU.
-        * If we own the commit event, then we can commit
-        * all others that interrupted us, since the interruptions
-        * are in stack format (they finish before they come
-        * back to us). This allows us to do a simple loop to
-        * assign the commit to the tail.
-        */
- again:
-       max_count = cpu_buffer->nr_pages * 100;
-
-       while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
-               if (RB_WARN_ON(cpu_buffer, !(--max_count)))
-                       return;
-               if (RB_WARN_ON(cpu_buffer,
-                              rb_is_reader_page(cpu_buffer->tail_page)))
-                       return;
-               local_set(&cpu_buffer->commit_page->page->commit,
-                         rb_page_write(cpu_buffer->commit_page));
-               rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
-               cpu_buffer->write_stamp =
-                       cpu_buffer->commit_page->page->time_stamp;
-               /* add barrier to keep gcc from optimizing too much */
-               barrier();
-       }
-       while (rb_commit_index(cpu_buffer) !=
-              rb_page_write(cpu_buffer->commit_page)) {
-
-               local_set(&cpu_buffer->commit_page->page->commit,
-                         rb_page_write(cpu_buffer->commit_page));
-               RB_WARN_ON(cpu_buffer,
-                          local_read(&cpu_buffer->commit_page->page->commit) &
-                          ~RB_WRITE_MASK);
-               barrier();
-       }
-
-       /* again, keep gcc from optimizing */
-       barrier();
-
-       /*
-        * If an interrupt came in just after the first while loop
-        * and pushed the tail page forward, we will be left with
-        * a dangling commit that will never go forward.
-        */
-       if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page))
-               goto again;
-}
-
 static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
 {
        cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp;
@@ -1968,64 +1912,6 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
        iter->head = 0;
 }
 
-/* Slow path, do not inline */
-static noinline struct ring_buffer_event *
-rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
-{
-       event->type_len = RINGBUF_TYPE_TIME_EXTEND;
-
-       /* Not the first event on the page? */
-       if (rb_event_index(event)) {
-               event->time_delta = delta & TS_MASK;
-               event->array[0] = delta >> TS_SHIFT;
-       } else {
-               /* nope, just zero it */
-               event->time_delta = 0;
-               event->array[0] = 0;
-       }
-
-       return skip_time_extend(event);
-}
-
-/**
- * rb_update_event - update event type and data
- * @event: the event to update
- * @type: the type of event
- * @length: the size of the event field in the ring buffer
- *
- * Update the type and data fields of the event. The length
- * is the actual size that is written to the ring buffer,
- * and with this, we can determine what to place into the
- * data field.
- */
-static void
-rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
-               struct ring_buffer_event *event, unsigned length,
-               int add_timestamp, u64 delta)
-{
-       /* Only a commit updates the timestamp */
-       if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
-               delta = 0;
-
-       /*
-        * If we need to add a timestamp, then we
-        * add it to the start of the resevered space.
-        */
-       if (unlikely(add_timestamp)) {
-               event = rb_add_time_stamp(event, delta);
-               length -= RB_LEN_TIME_EXTEND;
-               delta = 0;
-       }
-
-       event->time_delta = delta;
-       length -= RB_EVNT_HDR_SIZE;
-       if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) {
-               event->type_len = 0;
-               event->array[0] = length;
-       } else
-               event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
-}
-
 /*
  * rb_handle_head_page - writer hit the head page
  *
@@ -2184,29 +2070,13 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
        return 0;
 }
 
-static unsigned rb_calculate_event_length(unsigned length)
-{
-       struct ring_buffer_event event; /* Used only for sizeof array */
-
-       /* zero length can cause confusions */
-       if (!length)
-               length++;
-
-       if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
-               length += sizeof(event.array[0]);
-
-       length += RB_EVNT_HDR_SIZE;
-       length = ALIGN(length, RB_ARCH_ALIGNMENT);
-
-       return length;
-}
-
 static inline void
 rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
-             struct buffer_page *tail_page,
-             unsigned long tail, unsigned long length)
+             unsigned long tail, struct rb_event_info *info)
 {
+       struct buffer_page *tail_page = info->tail_page;
        struct ring_buffer_event *event;
+       unsigned long length = info->length;
 
        /*
         * Only the event that crossed the page boundary
@@ -2276,13 +2146,14 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
  */
 static noinline struct ring_buffer_event *
 rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
-            unsigned long length, unsigned long tail,
-            struct buffer_page *tail_page, u64 ts)
+            unsigned long tail, struct rb_event_info *info)
 {
+       struct buffer_page *tail_page = info->tail_page;
        struct buffer_page *commit_page = cpu_buffer->commit_page;
        struct ring_buffer *buffer = cpu_buffer->buffer;
        struct buffer_page *next_page;
        int ret;
+       u64 ts;
 
        next_page = tail_page;
 
@@ -2368,74 +2239,120 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
 
  out_again:
 
-       rb_reset_tail(cpu_buffer, tail_page, tail, length);
+       rb_reset_tail(cpu_buffer, tail, info);
 
        /* fail and let the caller try again */
        return ERR_PTR(-EAGAIN);
 
  out_reset:
        /* reset write */
-       rb_reset_tail(cpu_buffer, tail_page, tail, length);
+       rb_reset_tail(cpu_buffer, tail, info);
 
        return NULL;
 }
 
-static struct ring_buffer_event *
-__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
-                 unsigned long length, u64 ts,
-                 u64 delta, int add_timestamp)
+/* Slow path, do not inline */
+static noinline struct ring_buffer_event *
+rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
 {
-       struct buffer_page *tail_page;
-       struct ring_buffer_event *event;
-       unsigned long tail, write;
+       event->type_len = RINGBUF_TYPE_TIME_EXTEND;
 
-       /*
-        * If the time delta since the last event is too big to
-        * hold in the time field of the event, then we append a
-        * TIME EXTEND event ahead of the data event.
-        */
-       if (unlikely(add_timestamp))
-               length += RB_LEN_TIME_EXTEND;
+       /* Not the first event on the page? */
+       if (rb_event_index(event)) {
+               event->time_delta = delta & TS_MASK;
+               event->array[0] = delta >> TS_SHIFT;
+       } else {
+               /* nope, just zero it */
+               event->time_delta = 0;
+               event->array[0] = 0;
+       }
+
+       return skip_time_extend(event);
+}
 
-       tail_page = cpu_buffer->tail_page;
-       write = local_add_return(length, &tail_page->write);
+static inline int rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
+                                    struct ring_buffer_event *event);
 
-       /* set write to only the index of the write */
-       write &= RB_WRITE_MASK;
-       tail = write - length;
+/**
+ * rb_update_event - update event type and data
+ * @cpu_buffer: the per-cpu ring buffer the event belongs to
+ * @event: the event to update
+ * @info: the reserved-event state (length, delta, add_timestamp)
+ *
+ * Update the type and data fields of the event. The length
+ * is the actual size that is written to the ring buffer,
+ * and with this, we can determine what to place into the
+ * data field.
+ */
+static void
+rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
+               struct ring_buffer_event *event,
+               struct rb_event_info *info)
+{
+       unsigned length = info->length;
+       u64 delta = info->delta;
+
+       /* Only a commit updates the timestamp */
+       if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
+               delta = 0;
 
        /*
-        * If this is the first commit on the page, then it has the same
-        * timestamp as the page itself.
+        * If we need to add a timestamp, then we
+        * add it to the start of the reserved space.
         */
-       if (!tail)
+       if (unlikely(info->add_timestamp)) {
+               event = rb_add_time_stamp(event, delta);
+               length -= RB_LEN_TIME_EXTEND;
                delta = 0;
+       }
 
-       /* See if we shot pass the end of this buffer page */
-       if (unlikely(write > BUF_PAGE_SIZE))
-               return rb_move_tail(cpu_buffer, length, tail,
-                                   tail_page, ts);
+       event->time_delta = delta;
+       length -= RB_EVNT_HDR_SIZE;
+       if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) {
+               event->type_len = 0;
+               event->array[0] = length;
+       } else
+               event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
+}
 
-       /* We reserved something on the buffer */
+static unsigned rb_calculate_event_length(unsigned length)
+{
+       struct ring_buffer_event event; /* Used only for sizeof array */
 
-       event = __rb_page_index(tail_page, tail);
-       kmemcheck_annotate_bitfield(event, bitfield);
-       rb_update_event(cpu_buffer, event, length, add_timestamp, delta);
+       /* zero length can cause confusions */
+       if (!length)
+               length++;
 
-       local_inc(&tail_page->entries);
+       if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
+               length += sizeof(event.array[0]);
+
+       length += RB_EVNT_HDR_SIZE;
+       length = ALIGN(length, RB_ARCH_ALIGNMENT);
 
        /*
-        * If this is the first commit on the page, then update
-        * its timestamp.
+        * In case the time delta is larger than the 27 bits for it
+        * in the header, we need to add a timestamp. If another
+        * event comes in when trying to discard this one to increase
+        * the length, then the timestamp will be added in the allocated
+        * space of this event. If length is bigger than the size needed
+        * for the TIME_EXTEND, then padding has to be used. The events
+        * length must be either RB_LEN_TIME_EXTEND, or greater than or equal
+        * to RB_LEN_TIME_EXTEND + 8, as 8 is the minimum size for padding.
+        * As length is a multiple of 4, we only need to worry if it
+        * is 12 (RB_LEN_TIME_EXTEND + 4).
         */
-       if (!tail)
-               tail_page->page->time_stamp = ts;
+       if (length == RB_LEN_TIME_EXTEND + RB_ALIGNMENT)
+               length += RB_ALIGNMENT;
 
-       /* account for these added bytes */
-       local_add(length, &cpu_buffer->entries_bytes);
+       return length;
+}
 
-       return event;
+#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+static inline bool sched_clock_stable(void)
+{
+       return true;
 }
+#endif
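
Worked through with the constants the comment in rb_calculate_event_length() relies on (RB_ALIGNMENT = 4 and RB_LEN_TIME_EXTEND = 8, per its own arithmetic; the 4-byte event header and 4-byte alignment are assumptions about a typical configuration): a 7-byte payload becomes 7 + 4 = 11 and aligns up to 12, i.e. RB_LEN_TIME_EXTEND + 4. If such an event is later discarded and a time extend takes the first 8 bytes, the remaining 4 bytes cannot hold a padding event, so the reservation is bumped to 16 up front.
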
 
 static inline int
 rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
@@ -2483,22 +2400,75 @@ static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
        local_inc(&cpu_buffer->commits);
 }
 
-static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
+static void
+rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
 {
-       unsigned long commits;
-
-       if (RB_WARN_ON(cpu_buffer,
-                      !local_read(&cpu_buffer->committing)))
-               return;
+       unsigned long max_count;
 
+       /*
+        * We only race with interrupts and NMIs on this CPU.
+        * If we own the commit event, then we can commit
+        * all others that interrupted us, since the interruptions
+        * are in stack format (they finish before they come
+        * back to us). This allows us to do a simple loop to
+        * assign the commit to the tail.
+        */
  again:
-       commits = local_read(&cpu_buffer->commits);
-       /* synchronize with interrupts */
-       barrier();
-       if (local_read(&cpu_buffer->committing) == 1)
-               rb_set_commit_to_write(cpu_buffer);
-
-       local_dec(&cpu_buffer->committing);
+       max_count = cpu_buffer->nr_pages * 100;
+
+       while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
+               if (RB_WARN_ON(cpu_buffer, !(--max_count)))
+                       return;
+               if (RB_WARN_ON(cpu_buffer,
+                              rb_is_reader_page(cpu_buffer->tail_page)))
+                       return;
+               local_set(&cpu_buffer->commit_page->page->commit,
+                         rb_page_write(cpu_buffer->commit_page));
+               rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
+               cpu_buffer->write_stamp =
+                       cpu_buffer->commit_page->page->time_stamp;
+               /* add barrier to keep gcc from optimizing too much */
+               barrier();
+       }
+       while (rb_commit_index(cpu_buffer) !=
+              rb_page_write(cpu_buffer->commit_page)) {
+
+               local_set(&cpu_buffer->commit_page->page->commit,
+                         rb_page_write(cpu_buffer->commit_page));
+               RB_WARN_ON(cpu_buffer,
+                          local_read(&cpu_buffer->commit_page->page->commit) &
+                          ~RB_WRITE_MASK);
+               barrier();
+       }
+
+       /* again, keep gcc from optimizing */
+       barrier();
+
+       /*
+        * If an interrupt came in just after the first while loop
+        * and pushed the tail page forward, we will be left with
+        * a dangling commit that will never go forward.
+        */
+       if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page))
+               goto again;
+}
+
+static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
+{
+       unsigned long commits;
+
+       if (RB_WARN_ON(cpu_buffer,
+                      !local_read(&cpu_buffer->committing)))
+               return;
+
+ again:
+       commits = local_read(&cpu_buffer->commits);
+       /* synchronize with interrupts */
+       barrier();
+       if (local_read(&cpu_buffer->committing) == 1)
+               rb_set_commit_to_write(cpu_buffer);
+
+       local_dec(&cpu_buffer->committing);
 
        /* synchronize with interrupts */
        barrier();
@@ -2515,91 +2485,94 @@ static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
        }
 }
 
-static struct ring_buffer_event *
-rb_reserve_next_event(struct ring_buffer *buffer,
-                     struct ring_buffer_per_cpu *cpu_buffer,
-                     unsigned long length)
+static inline void rb_event_discard(struct ring_buffer_event *event)
 {
-       struct ring_buffer_event *event;
-       u64 ts, delta;
-       int nr_loops = 0;
-       int add_timestamp;
-       u64 diff;
+       if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
+               event = skip_time_extend(event);
 
-       rb_start_commit(cpu_buffer);
+       /* array[0] holds the actual length for the discarded event */
+       event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
+       event->type_len = RINGBUF_TYPE_PADDING;
+       /* time delta must be non zero */
+       if (!event->time_delta)
+               event->time_delta = 1;
+}
 
-#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
-       /*
-        * Due to the ability to swap a cpu buffer from a buffer
-        * it is possible it was swapped before we committed.
-        * (committing stops a swap). We check for it here and
-        * if it happened, we have to fail the write.
-        */
-       barrier();
-       if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) {
-               local_dec(&cpu_buffer->committing);
-               local_dec(&cpu_buffer->commits);
-               return NULL;
-       }
-#endif
+static inline int
+rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
+                  struct ring_buffer_event *event)
+{
+       unsigned long addr = (unsigned long)event;
+       unsigned long index;
 
-       length = rb_calculate_event_length(length);
- again:
-       add_timestamp = 0;
-       delta = 0;
+       index = rb_event_index(event);
+       addr &= PAGE_MASK;
+
+       return cpu_buffer->commit_page->page == (void *)addr &&
+               rb_commit_index(cpu_buffer) == index;
+}
+
+static void
+rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
+                     struct ring_buffer_event *event)
+{
+       u64 delta;
 
        /*
-        * We allow for interrupts to reenter here and do a trace.
-        * If one does, it will cause this original code to loop
-        * back here. Even with heavy interrupts happening, this
-        * should only happen a few times in a row. If this happens
-        * 1000 times in a row, there must be either an interrupt
-        * storm or we have something buggy.
-        * Bail!
+        * The event first in the commit queue updates the
+        * time stamp.
         */
-       if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
-               goto out_fail;
+       if (rb_event_is_commit(cpu_buffer, event)) {
+               /*
+                * A commit event that is first on a page
+                * updates the write timestamp with the page stamp
+                */
+               if (!rb_event_index(event))
+                       cpu_buffer->write_stamp =
+                               cpu_buffer->commit_page->page->time_stamp;
+               else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
+                       delta = event->array[0];
+                       delta <<= TS_SHIFT;
+                       delta += event->time_delta;
+                       cpu_buffer->write_stamp += delta;
+               } else
+                       cpu_buffer->write_stamp += event->time_delta;
+       }
+}
 
-       ts = rb_time_stamp(cpu_buffer->buffer);
-       diff = ts - cpu_buffer->write_stamp;
+static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
+                     struct ring_buffer_event *event)
+{
+       local_inc(&cpu_buffer->entries);
+       rb_update_write_stamp(cpu_buffer, event);
+       rb_end_commit(cpu_buffer);
+}
 
-       /* make sure this diff is calculated here */
-       barrier();
+static __always_inline void
+rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
+{
+       bool pagebusy;
 
-       /* Did the write stamp get updated already? */
-       if (likely(ts >= cpu_buffer->write_stamp)) {
-               delta = diff;
-               if (unlikely(test_time_stamp(delta))) {
-                       int local_clock_stable = 1;
-#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
-                       local_clock_stable = sched_clock_stable();
-#endif
-                       WARN_ONCE(delta > (1ULL << 59),
-                                 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
-                                 (unsigned long long)delta,
-                                 (unsigned long long)ts,
-                                 (unsigned long long)cpu_buffer->write_stamp,
-                                 local_clock_stable ? "" :
-                                 "If you just came from a suspend/resume,\n"
-                                 "please switch to the trace global clock:\n"
-                                 "  echo global > /sys/kernel/debug/tracing/trace_clock\n");
-                       add_timestamp = 1;
-               }
+       if (buffer->irq_work.waiters_pending) {
+               buffer->irq_work.waiters_pending = false;
+               /* irq_work_queue() supplies its own memory barriers */
+               irq_work_queue(&buffer->irq_work.work);
        }
 
-       event = __rb_reserve_next(cpu_buffer, length, ts,
-                                 delta, add_timestamp);
-       if (unlikely(PTR_ERR(event) == -EAGAIN))
-               goto again;
-
-       if (!event)
-               goto out_fail;
+       if (cpu_buffer->irq_work.waiters_pending) {
+               cpu_buffer->irq_work.waiters_pending = false;
+               /* irq_work_queue() supplies its own memory barriers */
+               irq_work_queue(&cpu_buffer->irq_work.work);
+       }
 
-       return event;
+       pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
 
- out_fail:
-       rb_end_commit(cpu_buffer);
-       return NULL;
+       if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) {
+               cpu_buffer->irq_work.wakeup_full = true;
+               cpu_buffer->irq_work.full_waiters_pending = false;
+               /* irq_work_queue() supplies its own memory barriers */
+               irq_work_queue(&cpu_buffer->irq_work.work);
+       }
 }
 
 /*
@@ -2671,6 +2644,178 @@ trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
        cpu_buffer->current_context &= cpu_buffer->current_context - 1;
 }
 
+/**
+ * ring_buffer_unlock_commit - commit a reserved event
+ * @buffer: The buffer to commit to
+ * @event: The event pointer to commit.
+ *
+ * This commits the data to the ring buffer, and releases any locks held.
+ *
+ * Must be paired with ring_buffer_lock_reserve.
+ */
+int ring_buffer_unlock_commit(struct ring_buffer *buffer,
+                             struct ring_buffer_event *event)
+{
+       struct ring_buffer_per_cpu *cpu_buffer;
+       int cpu = raw_smp_processor_id();
+
+       cpu_buffer = buffer->buffers[cpu];
+
+       rb_commit(cpu_buffer, event);
+
+       rb_wakeups(buffer, cpu_buffer);
+
+       trace_recursive_unlock(cpu_buffer);
+
+       preempt_enable_notrace();
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
+
+static noinline void
+rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
+                   struct rb_event_info *info)
+{
+       WARN_ONCE(info->delta > (1ULL << 59),
+                 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
+                 (unsigned long long)info->delta,
+                 (unsigned long long)info->ts,
+                 (unsigned long long)cpu_buffer->write_stamp,
+                 sched_clock_stable() ? "" :
+                 "If you just came from a suspend/resume,\n"
+                 "please switch to the trace global clock:\n"
+                 "  echo global > /sys/kernel/debug/tracing/trace_clock\n");
+       info->add_timestamp = 1;
+}
+
+static struct ring_buffer_event *
+__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
+                 struct rb_event_info *info)
+{
+       struct ring_buffer_event *event;
+       struct buffer_page *tail_page;
+       unsigned long tail, write;
+
+       /*
+        * If the time delta since the last event is too big to
+        * hold in the time field of the event, then we append a
+        * TIME EXTEND event ahead of the data event.
+        */
+       if (unlikely(info->add_timestamp))
+               info->length += RB_LEN_TIME_EXTEND;
+
+       tail_page = info->tail_page = cpu_buffer->tail_page;
+       write = local_add_return(info->length, &tail_page->write);
+
+       /* set write to only the index of the write */
+       write &= RB_WRITE_MASK;
+       tail = write - info->length;
+
+       /*
+        * If this is the first commit on the page, then it has the same
+        * timestamp as the page itself.
+        */
+       if (!tail)
+               info->delta = 0;
+
+       /* See if we shot pass the end of this buffer page */
+       if (unlikely(write > BUF_PAGE_SIZE))
+               return rb_move_tail(cpu_buffer, tail, info);
+
+       /* We reserved something on the buffer */
+
+       event = __rb_page_index(tail_page, tail);
+       kmemcheck_annotate_bitfield(event, bitfield);
+       rb_update_event(cpu_buffer, event, info);
+
+       local_inc(&tail_page->entries);
+
+       /*
+        * If this is the first commit on the page, then update
+        * its timestamp.
+        */
+       if (!tail)
+               tail_page->page->time_stamp = info->ts;
+
+       /* account for these added bytes */
+       local_add(info->length, &cpu_buffer->entries_bytes);
+
+       return event;
+}
+
+static struct ring_buffer_event *
+rb_reserve_next_event(struct ring_buffer *buffer,
+                     struct ring_buffer_per_cpu *cpu_buffer,
+                     unsigned long length)
+{
+       struct ring_buffer_event *event;
+       struct rb_event_info info;
+       int nr_loops = 0;
+       u64 diff;
+
+       rb_start_commit(cpu_buffer);
+
+#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
+       /*
+        * Due to the ability to swap a cpu buffer from a buffer
+        * it is possible it was swapped before we committed.
+        * (committing stops a swap). We check for it here and
+        * if it happened, we have to fail the write.
+        */
+       barrier();
+       if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) {
+               local_dec(&cpu_buffer->committing);
+               local_dec(&cpu_buffer->commits);
+               return NULL;
+       }
+#endif
+
+       info.length = rb_calculate_event_length(length);
+ again:
+       info.add_timestamp = 0;
+       info.delta = 0;
+
+       /*
+        * We allow for interrupts to reenter here and do a trace.
+        * If one does, it will cause this original code to loop
+        * back here. Even with heavy interrupts happening, this
+        * should only happen a few times in a row. If this happens
+        * 1000 times in a row, there must be either an interrupt
+        * storm or we have something buggy.
+        * Bail!
+        */
+       if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
+               goto out_fail;
+
+       info.ts = rb_time_stamp(cpu_buffer->buffer);
+       diff = info.ts - cpu_buffer->write_stamp;
+
+       /* make sure this diff is calculated here */
+       barrier();
+
+       /* Did the write stamp get updated already? */
+       if (likely(info.ts >= cpu_buffer->write_stamp)) {
+               info.delta = diff;
+               if (unlikely(test_time_stamp(info.delta)))
+                       rb_handle_timestamp(cpu_buffer, &info);
+       }
+
+       event = __rb_reserve_next(cpu_buffer, &info);
+
+       if (unlikely(PTR_ERR(event) == -EAGAIN))
+               goto again;
+
+       if (!event)
+               goto out_fail;
+
+       return event;
+
+ out_fail:
+       rb_end_commit(cpu_buffer);
+       return NULL;
+}
+
 /**
  * ring_buffer_lock_reserve - reserve a part of the buffer
  * @buffer: the ring buffer to reserve from
@@ -2729,111 +2874,6 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
 }
 EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
 
-static void
-rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
-                     struct ring_buffer_event *event)
-{
-       u64 delta;
-
-       /*
-        * The event first in the commit queue updates the
-        * time stamp.
-        */
-       if (rb_event_is_commit(cpu_buffer, event)) {
-               /*
-                * A commit event that is first on a page
-                * updates the write timestamp with the page stamp
-                */
-               if (!rb_event_index(event))
-                       cpu_buffer->write_stamp =
-                               cpu_buffer->commit_page->page->time_stamp;
-               else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
-                       delta = event->array[0];
-                       delta <<= TS_SHIFT;
-                       delta += event->time_delta;
-                       cpu_buffer->write_stamp += delta;
-               } else
-                       cpu_buffer->write_stamp += event->time_delta;
-       }
-}
-
-static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
-                     struct ring_buffer_event *event)
-{
-       local_inc(&cpu_buffer->entries);
-       rb_update_write_stamp(cpu_buffer, event);
-       rb_end_commit(cpu_buffer);
-}
-
-static __always_inline void
-rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
-{
-       bool pagebusy;
-
-       if (buffer->irq_work.waiters_pending) {
-               buffer->irq_work.waiters_pending = false;
-               /* irq_work_queue() supplies it's own memory barriers */
-               irq_work_queue(&buffer->irq_work.work);
-       }
-
-       if (cpu_buffer->irq_work.waiters_pending) {
-               cpu_buffer->irq_work.waiters_pending = false;
-               /* irq_work_queue() supplies it's own memory barriers */
-               irq_work_queue(&cpu_buffer->irq_work.work);
-       }
-
-       pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
-
-       if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) {
-               cpu_buffer->irq_work.wakeup_full = true;
-               cpu_buffer->irq_work.full_waiters_pending = false;
-               /* irq_work_queue() supplies it's own memory barriers */
-               irq_work_queue(&cpu_buffer->irq_work.work);
-       }
-}
-
-/**
- * ring_buffer_unlock_commit - commit a reserved
- * @buffer: The buffer to commit to
- * @event: The event pointer to commit.
- *
- * This commits the data to the ring buffer, and releases any locks held.
- *
- * Must be paired with ring_buffer_lock_reserve.
- */
-int ring_buffer_unlock_commit(struct ring_buffer *buffer,
-                             struct ring_buffer_event *event)
-{
-       struct ring_buffer_per_cpu *cpu_buffer;
-       int cpu = raw_smp_processor_id();
-
-       cpu_buffer = buffer->buffers[cpu];
-
-       rb_commit(cpu_buffer, event);
-
-       rb_wakeups(buffer, cpu_buffer);
-
-       trace_recursive_unlock(cpu_buffer);
-
-       preempt_enable_notrace();
-
-       return 0;
-}
-EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
-
-static inline void rb_event_discard(struct ring_buffer_event *event)
-{
-       if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
-               event = skip_time_extend(event);
-
-       /* array[0] holds the actual length for the discarded event */
-       event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
-       event->type_len = RINGBUF_TYPE_PADDING;
-       /* time delta must be non zero */
-       if (!event->time_delta)
-               event->time_delta = 1;
-}
-
 /*
  * Decrement the entries to the page that an event is on.
  * The event does not even need to exist, only the pointer
index abcbf7ff874364d22b62c0fdcf32cfbc8b0d6363..6e79408674aaa15e7f5be7da0e86488599fdfa0f 100644 (file)
@@ -3035,7 +3035,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
        if (!iter)
                return ERR_PTR(-ENOMEM);
 
-       iter->buffer_iter = kzalloc(sizeof(*iter->buffer_iter) * num_possible_cpus(),
+       iter->buffer_iter = kcalloc(nr_cpu_ids, sizeof(*iter->buffer_iter),
                                    GFP_KERNEL);
        if (!iter->buffer_iter)
                goto release;
@@ -6990,7 +6990,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
        trace_init_global_iter(&iter);
 
        for_each_tracing_cpu(cpu) {
-               atomic_inc(&per_cpu_ptr(iter.tr->trace_buffer.data, cpu)->disabled);
+               atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
        }
 
        old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ;
index 404a372ad85a94545638d043bf86d113dfe9216f..7ca09cdc20c2f920faa004e32eb248e3bc92bf61 100644 (file)
@@ -30,6 +30,7 @@
 DEFINE_MUTEX(event_mutex);
 
 LIST_HEAD(ftrace_events);
+static LIST_HEAD(ftrace_generic_fields);
 static LIST_HEAD(ftrace_common_fields);
 
 #define GFP_TRACE (GFP_KERNEL | __GFP_ZERO)
@@ -94,6 +95,10 @@ trace_find_event_field(struct trace_event_call *call, char *name)
        struct ftrace_event_field *field;
        struct list_head *head;
 
+       field = __find_event_field(&ftrace_generic_fields, name);
+       if (field)
+               return field;
+
        field = __find_event_field(&ftrace_common_fields, name);
        if (field)
                return field;
@@ -144,6 +149,13 @@ int trace_define_field(struct trace_event_call *call, const char *type,
 }
 EXPORT_SYMBOL_GPL(trace_define_field);
 
+#define __generic_field(type, item, filter_type)                       \
+       ret = __trace_define_field(&ftrace_generic_fields, #type,       \
+                                  #item, 0, 0, is_signed_type(type),   \
+                                  filter_type);                        \
+       if (ret)                                                        \
+               return ret;
+
 #define __common_field(type, item)                                     \
        ret = __trace_define_field(&ftrace_common_fields, #type,        \
                                   "common_" #item,                     \
@@ -153,6 +165,16 @@ EXPORT_SYMBOL_GPL(trace_define_field);
        if (ret)                                                        \
                return ret;
 
+static int trace_define_generic_fields(void)
+{
+       int ret;
+
+       __generic_field(int, cpu, FILTER_OTHER);
+       __generic_field(char *, comm, FILTER_PTR_STRING);
+
+       return ret;
+}
+
 static int trace_define_common_fields(void)
 {
        int ret;
@@ -2671,6 +2693,9 @@ static __init int event_trace_init(void)
        if (!entry)
                pr_warn("Could not create tracefs 'available_events' entry\n");
 
+       if (trace_define_generic_fields())
+               pr_warn("tracing: Failed to allocated generic fields");
+
        if (trace_define_common_fields())
                pr_warn("tracing: Failed to allocate common fields");
 
index d81d6f302b14b3308bbf2c96432d0a827aa61770..bd1bf184c5c98b6cc36d5c7e183bd4698702a514 100644 (file)
@@ -252,6 +252,50 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event)
        return match;
 }
 
+/* Filter predicate for CPUs. */
+static int filter_pred_cpu(struct filter_pred *pred, void *event)
+{
+       int cpu, cmp;
+       int match = 0;
+
+       cpu = raw_smp_processor_id();
+       cmp = pred->val;
+
+       switch (pred->op) {
+       case OP_EQ:
+               match = cpu == cmp;
+               break;
+       case OP_LT:
+               match = cpu < cmp;
+               break;
+       case OP_LE:
+               match = cpu <= cmp;
+               break;
+       case OP_GT:
+               match = cpu > cmp;
+               break;
+       case OP_GE:
+               match = cpu >= cmp;
+               break;
+       default:
+               break;
+       }
+
+       return !!match == !pred->not;
+}
+
+/* Filter predicate for COMM. */
+static int filter_pred_comm(struct filter_pred *pred, void *event)
+{
+       int cmp, match;
+
+       cmp = pred->regex.match(current->comm, &pred->regex,
+                               pred->regex.field_len);
+       match = cmp ^ pred->not;
+
+       return match;
+}
+
 static int filter_pred_none(struct filter_pred *pred, void *event)
 {
        return 0;
@@ -1002,7 +1046,10 @@ static int init_pred(struct filter_parse_state *ps,
        if (is_string_field(field)) {
                filter_build_regex(pred);
 
-               if (field->filter_type == FILTER_STATIC_STRING) {
+               if (!strcmp(field->name, "comm")) {
+                       fn = filter_pred_comm;
+                       pred->regex.field_len = TASK_COMM_LEN;
+               } else if (field->filter_type == FILTER_STATIC_STRING) {
                        fn = filter_pred_string;
                        pred->regex.field_len = field->size;
                } else if (field->filter_type == FILTER_DYN_STRING)
@@ -1025,7 +1072,10 @@ static int init_pred(struct filter_parse_state *ps,
                }
                pred->val = val;
 
-               fn = select_comparison_fn(pred->op, field->size,
+               if (!strcmp(field->name, "cpu"))
+                       fn = filter_pred_cpu;
+               else
+                       fn = select_comparison_fn(pred->op, field->size,
                                          field->is_signed);
                if (!fn) {
                        parse_error(ps, FILT_ERR_INVALID_OP, 0);
index 8968bf720c1259387ba2f6413dbdda026671ada8..ca98445782acaa83915a5ffb653a189bed1705b1 100644 (file)
@@ -715,13 +715,13 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
 
                snprintf(nsecs_str, slen, "%03lu", nsecs_rem);
                trace_seq_printf(s, ".%s", nsecs_str);
-               len += strlen(nsecs_str);
+               len += strlen(nsecs_str) + 1;
        }
 
        trace_seq_puts(s, " us ");
 
        /* Print remaining spaces to fit the row's width */
-       for (i = len; i < 7; i++)
+       for (i = len; i < 8; i++)
                trace_seq_putc(s, ' ');
 }
 
index dfab253727dc9ea2ce674d5c0add0c9fc59c2d08..8e481a84aeea79b8b008900f3b6f5dbf3044ba7c 100644 (file)
@@ -496,6 +496,8 @@ static const struct trace_mark {
        char                    sym;
 } mark[] = {
        MARK(1000000000ULL      , '$'), /* 1 sec */
+       MARK(100000000ULL       , '@'), /* 100 msec */
+       MARK(10000000ULL        , '*'), /* 10 msec */
        MARK(1000000ULL         , '#'), /* 1000 usecs */
        MARK(100000ULL          , '!'), /* 100 usecs */
        MARK(10000ULL           , '+'), /* 10 usecs */
@@ -508,7 +510,7 @@ char trace_find_mark(unsigned long long d)
        int size = ARRAY_SIZE(mark);
 
        for (i = 0; i < size; i++) {
-               if (d >= mark[i].val)
+               if (d > mark[i].val)
                        break;
        }
 
index 3f34496244e936c876184131d322ea58e597e563..b746399ab59c01e422da63468aa370b1b642a860 100644 (file)
 
 #define STACK_TRACE_ENTRIES 500
 
-#ifdef CC_USING_FENTRY
-# define fentry                1
-#else
-# define fentry                0
-#endif
-
 static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] =
         { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX };
 static unsigned stack_dump_index[STACK_TRACE_ENTRIES];
@@ -35,7 +29,7 @@ static unsigned stack_dump_index[STACK_TRACE_ENTRIES];
  */
 static struct stack_trace max_stack_trace = {
        .max_entries            = STACK_TRACE_ENTRIES - 1,
-       .entries                = &stack_dump_trace[1],
+       .entries                = &stack_dump_trace[0],
 };
 
 static unsigned long max_stack_size;
@@ -55,7 +49,7 @@ static inline void print_max_stack(void)
 
        pr_emerg("        Depth    Size   Location    (%d entries)\n"
                           "        -----    ----   --------\n",
-                          max_stack_trace.nr_entries - 1);
+                          max_stack_trace.nr_entries);
 
        for (i = 0; i < max_stack_trace.nr_entries; i++) {
                if (stack_dump_trace[i] == ULONG_MAX)
@@ -77,7 +71,7 @@ check_stack(unsigned long ip, unsigned long *stack)
        unsigned long this_size, flags; unsigned long *p, *top, *start;
        static int tracer_frame;
        int frame_size = ACCESS_ONCE(tracer_frame);
-       int i;
+       int i, x;
 
        this_size = ((unsigned long)stack) & (THREAD_SIZE-1);
        this_size = THREAD_SIZE - this_size;
@@ -105,26 +99,20 @@ check_stack(unsigned long ip, unsigned long *stack)
        max_stack_size = this_size;
 
        max_stack_trace.nr_entries = 0;
-
-       if (using_ftrace_ops_list_func())
-               max_stack_trace.skip = 4;
-       else
-               max_stack_trace.skip = 3;
+       max_stack_trace.skip = 3;
 
        save_stack_trace(&max_stack_trace);
 
-       /*
-        * Add the passed in ip from the function tracer.
-        * Searching for this on the stack will skip over
-        * most of the overhead from the stack tracer itself.
-        */
-       stack_dump_trace[0] = ip;
-       max_stack_trace.nr_entries++;
+       /* Skip over the overhead of the stack tracer itself */
+       for (i = 0; i < max_stack_trace.nr_entries; i++) {
+               if (stack_dump_trace[i] == ip)
+                       break;
+       }
 
        /*
         * Now find where in the stack these are.
         */
-       i = 0;
+       x = 0;
        start = stack;
        top = (unsigned long *)
                (((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE);
@@ -139,12 +127,15 @@ check_stack(unsigned long ip, unsigned long *stack)
        while (i < max_stack_trace.nr_entries) {
                int found = 0;
 
-               stack_dump_index[i] = this_size;
+               stack_dump_index[x] = this_size;
                p = start;
 
                for (; p < top && i < max_stack_trace.nr_entries; p++) {
+                       if (stack_dump_trace[i] == ULONG_MAX)
+                               break;
                        if (*p == stack_dump_trace[i]) {
-                               this_size = stack_dump_index[i++] =
+                               stack_dump_trace[x] = stack_dump_trace[i++];
+                               this_size = stack_dump_index[x++] =
                                        (top - p) * sizeof(unsigned long);
                                found = 1;
                                /* Start the search from here */
@@ -156,7 +147,7 @@ check_stack(unsigned long ip, unsigned long *stack)
                                 * out what that is, then figure it out
                                 * now.
                                 */
-                               if (unlikely(!tracer_frame) && i == 1) {
+                               if (unlikely(!tracer_frame)) {
                                        tracer_frame = (p - stack) *
                                                sizeof(unsigned long);
                                        max_stack_size -= tracer_frame;
@@ -168,6 +159,10 @@ check_stack(unsigned long ip, unsigned long *stack)
                        i++;
        }
 
+       max_stack_trace.nr_entries = x;
+       for (; x < i; x++)
+               stack_dump_trace[x] = ULONG_MAX;
+
        if (task_stack_end_corrupted(current)) {
                print_max_stack();
                BUG();
@@ -192,24 +187,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip,
        if (per_cpu(trace_active, cpu)++ != 0)
                goto out;
 
-       /*
-        * When fentry is used, the traced function does not get
-        * its stack frame set up, and we lose the parent.
-        * The ip is pretty useless because the function tracer
-        * was called before that function set up its stack frame.
-        * In this case, we use the parent ip.
-        *
-        * By adding the return address of either the parent ip
-        * or the current ip we can disregard most of the stack usage
-        * caused by the stack tracer itself.
-        *
-        * The function tracer always reports the address of where the
-        * mcount call was, but the stack will hold the return address.
-        */
-       if (fentry)
-               ip = parent_ip;
-       else
-               ip += MCOUNT_INSN_SIZE;
+       ip += MCOUNT_INSN_SIZE;
 
        check_stack(ip, &stack);
 
@@ -284,7 +262,7 @@ __next(struct seq_file *m, loff_t *pos)
 {
        long n = *pos - 1;
 
-       if (n >= max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX)
+       if (n > max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX)
                return NULL;
 
        m->private = (void *)n;
@@ -354,7 +332,7 @@ static int t_show(struct seq_file *m, void *v)
                seq_printf(m, "        Depth    Size   Location"
                           "    (%d entries)\n"
                           "        -----    ----   --------\n",
-                          max_stack_trace.nr_entries - 1);
+                          max_stack_trace.nr_entries);
 
                if (!stack_tracer_enabled && !max_stack_size)
                        print_disabled(m);
index 8a49ff9d15027af8e0c60b40c56f56e4d54f8125..2e491ac15622a559c88ba12a4067eeb5ca704115 100644 (file)
@@ -525,4 +525,7 @@ config ARCH_HAS_SG_CHAIN
 config ARCH_HAS_PMEM_API
        bool
 
+config ARCH_HAS_MMIO_FLUSH
+       bool
+
 endmenu
index f01c558bf80db603abcb53868f0e777134208a60..13a7c6ae3feca4b0e24bdd90281933a8ab09bf4f 100644 (file)
@@ -13,7 +13,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
         sha1.o md5.o irq_regs.o argv_split.o \
         proportions.o flex_proportions.o ratelimit.o show_mem.o \
         is_single_threaded.o plist.o decompress.o kobject_uevent.o \
-        earlycpio.o seq_buf.o
+        earlycpio.o seq_buf.o nmi_backtrace.o
 
 obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o
 lib-$(CONFIG_MMU) += ioremap.o
index 1a000bb050f9f91a01e50f429b20238e19d161c8..2b3f46c049d458a590d080823b344da3b3229f7c 100644 (file)
@@ -24,15 +24,20 @@ static const unsigned char asn1_op_lengths[ASN1_OP__NR] = {
        [ASN1_OP_MATCH_JUMP]                    = 1 + 1 + 1,
        [ASN1_OP_MATCH_JUMP_OR_SKIP]            = 1 + 1 + 1,
        [ASN1_OP_MATCH_ANY]                     = 1,
+       [ASN1_OP_MATCH_ANY_OR_SKIP]             = 1,
        [ASN1_OP_MATCH_ANY_ACT]                 = 1         + 1,
+       [ASN1_OP_MATCH_ANY_ACT_OR_SKIP]         = 1         + 1,
        [ASN1_OP_COND_MATCH_OR_SKIP]            = 1 + 1,
        [ASN1_OP_COND_MATCH_ACT_OR_SKIP]        = 1 + 1     + 1,
        [ASN1_OP_COND_MATCH_JUMP_OR_SKIP]       = 1 + 1 + 1,
        [ASN1_OP_COND_MATCH_ANY]                = 1,
+       [ASN1_OP_COND_MATCH_ANY_OR_SKIP]        = 1,
        [ASN1_OP_COND_MATCH_ANY_ACT]            = 1         + 1,
+       [ASN1_OP_COND_MATCH_ANY_ACT_OR_SKIP]    = 1         + 1,
        [ASN1_OP_COND_FAIL]                     = 1,
        [ASN1_OP_COMPLETE]                      = 1,
        [ASN1_OP_ACT]                           = 1         + 1,
+       [ASN1_OP_MAYBE_ACT]                     = 1         + 1,
        [ASN1_OP_RETURN]                        = 1,
        [ASN1_OP_END_SEQ]                       = 1,
        [ASN1_OP_END_SEQ_OF]                    = 1     + 1,
@@ -177,6 +182,7 @@ int asn1_ber_decoder(const struct asn1_decoder *decoder,
        unsigned char flags = 0;
 #define FLAG_INDEFINITE_LENGTH 0x01
 #define FLAG_MATCHED           0x02
+#define FLAG_LAST_MATCHED      0x04 /* Last tag matched */
 #define FLAG_CONS              0x20 /* Corresponds to CONS bit in the opcode tag
                                      * - ie. whether or not we are going to parse
                                      *   a compound type.
@@ -208,9 +214,9 @@ next_op:
                unsigned char tmp;
 
                /* Skip conditional matches if possible */
-               if ((op & ASN1_OP_MATCH__COND &&
-                    flags & FLAG_MATCHED) ||
-                   dp == datalen) {
+               if ((op & ASN1_OP_MATCH__COND && flags & FLAG_MATCHED) ||
+                   (op & ASN1_OP_MATCH__SKIP && dp == datalen)) {
+                       flags &= ~FLAG_LAST_MATCHED;
                        pc += asn1_op_lengths[op];
                        goto next_op;
                }
@@ -302,7 +308,9 @@ next_op:
        /* Decide how to handle the operation */
        switch (op) {
        case ASN1_OP_MATCH_ANY_ACT:
+       case ASN1_OP_MATCH_ANY_ACT_OR_SKIP:
        case ASN1_OP_COND_MATCH_ANY_ACT:
+       case ASN1_OP_COND_MATCH_ANY_ACT_OR_SKIP:
                ret = actions[machine[pc + 1]](context, hdr, tag, data + dp, len);
                if (ret < 0)
                        return ret;
@@ -319,8 +327,10 @@ next_op:
        case ASN1_OP_MATCH:
        case ASN1_OP_MATCH_OR_SKIP:
        case ASN1_OP_MATCH_ANY:
+       case ASN1_OP_MATCH_ANY_OR_SKIP:
        case ASN1_OP_COND_MATCH_OR_SKIP:
        case ASN1_OP_COND_MATCH_ANY:
+       case ASN1_OP_COND_MATCH_ANY_OR_SKIP:
        skip_data:
                if (!(flags & FLAG_CONS)) {
                        if (flags & FLAG_INDEFINITE_LENGTH) {
@@ -422,8 +432,15 @@ next_op:
                pc += asn1_op_lengths[op];
                goto next_op;
 
+       case ASN1_OP_MAYBE_ACT:
+               if (!(flags & FLAG_LAST_MATCHED)) {
+                       pc += asn1_op_lengths[op];
+                       goto next_op;
+               }
        case ASN1_OP_ACT:
                ret = actions[machine[pc + 1]](context, hdr, tag, data + tdp, len);
+               if (ret < 0)
+                       return ret;
                pc += asn1_op_lengths[op];
                goto next_op;
 
@@ -431,6 +448,7 @@ next_op:
                if (unlikely(jsp <= 0))
                        goto jump_stack_underflow;
                pc = jump_stack[--jsp];
+               flags |= FLAG_MATCHED | FLAG_LAST_MATCHED;
                goto next_op;
 
        default:
@@ -438,7 +456,8 @@ next_op:
        }
 
        /* Shouldn't reach here */
-       pr_err("ASN.1 decoder error: Found reserved opcode (%u)\n", op);
+       pr_err("ASN.1 decoder error: Found reserved opcode (%u) pc=%zu\n",
+              op, pc);
        return -EBADMSG;
 
 data_overrun_error:
index a578a018919977579063bb599d4fd462bae6a54b..814814397cce39b5b0a4aafa3571062a6468e8cf 100644 (file)
@@ -367,7 +367,8 @@ int __bitmap_parse(const char *buf, unsigned int buflen,
 
        nchunks = nbits = totaldigits = c = 0;
        do {
-               chunk = ndigits = 0;
+               chunk = 0;
+               ndigits = totaldigits;
 
                /* Get the next chunk of the bitmap */
                while (buflen) {
@@ -406,9 +407,9 @@ int __bitmap_parse(const char *buf, unsigned int buflen,
                                return -EOVERFLOW;
 
                        chunk = (chunk << 4) | hex_to_bin(c);
-                       ndigits++; totaldigits++;
+                       totaldigits++;
                }
-               if (ndigits == 0)
+               if (ndigits == totaldigits)
                        return -EINVAL;
                if (nchunks == 0 && chunk == 0)
                        continue;
@@ -505,7 +506,7 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen,
                int nmaskbits)
 {
        unsigned a, b;
-       int c, old_c, totaldigits;
+       int c, old_c, totaldigits, ndigits;
        const char __user __force *ubuf = (const char __user __force *)buf;
        int at_start, in_range;
 
@@ -515,6 +516,7 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen,
                at_start = 1;
                in_range = 0;
                a = b = 0;
+               ndigits = totaldigits;
 
                /* Get the next cpu# or a range of cpu#'s */
                while (buflen) {
@@ -528,23 +530,27 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen,
                        if (isspace(c))
                                continue;
 
-                       /*
-                        * If the last character was a space and the current
-                        * character isn't '\0', we've got embedded whitespace.
-                        * This is a no-no, so throw an error.
-                        */
-                       if (totaldigits && c && isspace(old_c))
-                               return -EINVAL;
-
                        /* A '\0' or a ',' signal the end of a cpu# or range */
                        if (c == '\0' || c == ',')
                                break;
+                       /*
+                        * Whitespace between digits is not allowed, but
+                        * leading or trailing whitespace is fine. When
+                        * old_c is whitespace and totaldigits == ndigits,
+                        * the whitespace is leading. Trailing whitespace
+                        * never reaches this point: c would be ',' or '\0'
+                        * and the break above would already have exited
+                        * the loop.
+                        */
+                       if ((totaldigits != ndigits) && isspace(old_c))
+                               return -EINVAL;
 
                        if (c == '-') {
                                if (at_start || in_range)
                                        return -EINVAL;
                                b = 0;
                                in_range = 1;
+                               at_start = 1;
                                continue;
                        }
 
@@ -557,15 +563,18 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen,
                        at_start = 0;
                        totaldigits++;
                }
+               if (ndigits == totaldigits)
+                       continue;
+               /* if no digit follows '-', it's invalid */
+               if (at_start && in_range)
+                       return -EINVAL;
                if (!(a <= b))
                        return -EINVAL;
                if (b >= nmaskbits)
                        return -ERANGE;
-               if (!at_start) {
-                       while (a <= b) {
-                               set_bit(a, maskp);
-                               a++;
-                       }
+               while (a <= b) {
+                       set_bit(a, maskp);
+                       a++;
                }
        } while (buflen && c == ',');
        return 0;
index 6dd0335ea61b296b5dfd77818e58e294fb2e805c..0234361b24b89ee09dc452e397bd950a43f69025 100644 (file)
@@ -743,12 +743,12 @@ exit_0:
 }
 
 #ifdef PREBOOT
-STATIC int INIT decompress(unsigned char *buf, long len,
+STATIC int INIT __decompress(unsigned char *buf, long len,
                        long (*fill)(void*, unsigned long),
                        long (*flush)(void*, unsigned long),
-                       unsigned char *outbuf,
+                       unsigned char *outbuf, long olen,
                        long *pos,
-                       void(*error)(char *x))
+                       void (*error)(char *x))
 {
        return bunzip2(buf, len - 4, fill, flush, outbuf, pos, error);
 }
index d4c7891635ecc2b1fb70b9f4fc75a216c782fe0d..555c06bf20daa83190139392597c4622a00a0e5d 100644 (file)
@@ -1,4 +1,5 @@
 #ifdef STATIC
+#define PREBOOT
 /* Pre-boot environment: included */
 
 /* prevent inclusion of _LINUX_KERNEL_H in pre-boot environment: lots
@@ -33,23 +34,23 @@ static long INIT nofill(void *buffer, unsigned long len)
 }
 
 /* Included from initramfs et al code */
-STATIC int INIT gunzip(unsigned char *buf, long len,
+STATIC int INIT __gunzip(unsigned char *buf, long len,
                       long (*fill)(void*, unsigned long),
                       long (*flush)(void*, unsigned long),
-                      unsigned char *out_buf,
+                      unsigned char *out_buf, long out_len,
                       long *pos,
                       void(*error)(char *x)) {
        u8 *zbuf;
        struct z_stream_s *strm;
        int rc;
-       size_t out_len;
 
        rc = -1;
        if (flush) {
                out_len = 0x8000; /* 32 K */
                out_buf = malloc(out_len);
        } else {
-               out_len = ((size_t)~0) - (size_t)out_buf; /* no limit */
+               if (!out_len)
+                       out_len = ((size_t)~0) - (size_t)out_buf; /* no limit */
        }
        if (!out_buf) {
                error("Out of memory while allocating output buffer");
@@ -181,4 +182,24 @@ gunzip_nomem1:
        return rc; /* returns Z_OK (0) if successful */
 }
 
-#define decompress gunzip
+#ifndef PREBOOT
+STATIC int INIT gunzip(unsigned char *buf, long len,
+                      long (*fill)(void*, unsigned long),
+                      long (*flush)(void*, unsigned long),
+                      unsigned char *out_buf,
+                      long *pos,
+                      void (*error)(char *x))
+{
+       return __gunzip(buf, len, fill, flush, out_buf, 0, pos, error);
+}
+#else
+STATIC int INIT __decompress(unsigned char *buf, long len,
+                          long (*fill)(void*, unsigned long),
+                          long (*flush)(void*, unsigned long),
+                          unsigned char *out_buf, long out_len,
+                          long *pos,
+                          void (*error)(char *x))
+{
+       return __gunzip(buf, len, fill, flush, out_buf, out_len, pos, error);
+}
+#endif
index 40f66ebe57b77a0566460a2407bdd713d6e0b3fc..036fc882cd72561a2a96b39314078676fe94ff90 100644 (file)
@@ -196,12 +196,12 @@ exit_0:
 }
 
 #ifdef PREBOOT
-STATIC int INIT decompress(unsigned char *buf, long in_len,
+STATIC int INIT __decompress(unsigned char *buf, long in_len,
                              long (*fill)(void*, unsigned long),
                              long (*flush)(void*, unsigned long),
-                             unsigned char *output,
+                             unsigned char *output, long out_len,
                              long *posp,
-                             void(*error)(char *x)
+                             void (*error)(char *x)
        )
 {
        return unlz4(buf, in_len - 4, fill, flush, output, posp, error);
index 0be83af62b884c3dbfa9f29f9630a5ecfd605e2f..ed7a1fd819f2fbc86b3ad0f238fce01739a8e07f 100644 (file)
@@ -620,7 +620,7 @@ STATIC inline int INIT unlzma(unsigned char *buf, long in_len,
 
        num_probs = LZMA_BASE_SIZE + (LZMA_LIT_SIZE << (lc + lp));
        p = (uint16_t *) large_malloc(num_probs * sizeof(*p));
-       if (p == 0)
+       if (p == NULL)
                goto exit_2;
        num_probs = LZMA_LITERAL + (LZMA_LIT_SIZE << (lc + lp));
        for (i = 0; i < num_probs; i++)
@@ -667,13 +667,12 @@ exit_0:
 }
 
 #ifdef PREBOOT
-STATIC int INIT decompress(unsigned char *buf, long in_len,
+STATIC int INIT __decompress(unsigned char *buf, long in_len,
                              long (*fill)(void*, unsigned long),
                              long (*flush)(void*, unsigned long),
-                             unsigned char *output,
+                             unsigned char *output, long out_len,
                              long *posp,
-                             void(*error)(char *x)
-       )
+                             void (*error)(char *x))
 {
        return unlzma(buf, in_len - 4, fill, flush, output, posp, error);
 }
index b94a31bdd87d15f34a7f4902eb6170f800c01206..f4c158e3a022aa1af35a6937190d5e0bc90e31ce 100644 (file)
@@ -31,6 +31,7 @@
  */
 
 #ifdef STATIC
+#define PREBOOT
 #include "lzo/lzo1x_decompress_safe.c"
 #else
 #include <linux/decompress/unlzo.h>
@@ -287,4 +288,14 @@ exit:
        return ret;
 }
 
-#define decompress unlzo
+#ifdef PREBOOT
+STATIC int INIT __decompress(unsigned char *buf, long len,
+                          long (*fill)(void*, unsigned long),
+                          long (*flush)(void*, unsigned long),
+                          unsigned char *out_buf, long olen,
+                          long *pos,
+                          void (*error)(char *x))
+{
+       return unlzo(buf, len, fill, flush, out_buf, pos, error);
+}
+#endif
index b07a78340e9d315006a97194fbcccae0c4c16509..25d59a95bd6681465d9e57af06f77c3d641b0649 100644 (file)
@@ -394,4 +394,14 @@ error_alloc_state:
  * This macro is used by architecture-specific files to decompress
  * the kernel image.
  */
-#define decompress unxz
+#ifdef XZ_PREBOOT
+STATIC int INIT __decompress(unsigned char *buf, long len,
+                          long (*fill)(void*, unsigned long),
+                          long (*flush)(void*, unsigned long),
+                          unsigned char *out_buf, long olen,
+                          long *pos,
+                          void (*error)(char *x))
+{
+       return unxz(buf, len, fill, flush, out_buf, pos, error);
+}
+#endif
index fbe2aac522e67dc16708040128337eec946cd4ad..f13a2468ff39c3403cdc4e7b13c6c35e61a0f113 100644 (file)
@@ -119,10 +119,9 @@ EXPORT_SYMBOL(devm_iounmap);
  * @dev: generic device to handle the resource for
  * @res: resource to be handled
  *
- * Checks that a resource is a valid memory region, requests the memory region
- * and ioremaps it either as cacheable or as non-cacheable memory depending on
- * the resource's flags. All operations are managed and will be undone on
- * driver detach.
+ * Checks that a resource is a valid memory region, requests the memory
+ * region and ioremaps it. All operations are managed and will be undone
+ * on driver detach.
  *
  * Returns a pointer to the remapped memory or an ERR_PTR() encoded error code
  * on failure. Usage example:
@@ -153,11 +152,7 @@ void __iomem *devm_ioremap_resource(struct device *dev, struct resource *res)
                return IOMEM_ERR_PTR(-EBUSY);
        }
 
-       if (res->flags & IORESOURCE_CACHEABLE)
-               dest_ptr = devm_ioremap(dev, res->start, size);
-       else
-               dest_ptr = devm_ioremap_nocache(dev, res->start, size);
-
+       dest_ptr = devm_ioremap(dev, res->start, size);
        if (!dest_ptr) {
                dev_err(dev, "ioremap failed for resource %pR\n", res);
                devm_release_mem_region(dev, res->start, size);
index ec8da78df9be9f4ea245ff398193bd1d90210573..94be244e844103d0fb6ed20c7bee905ca908fa12 100644 (file)
@@ -152,7 +152,7 @@ int kstrtoll(const char *s, unsigned int base, long long *res)
                rv = _kstrtoull(s + 1, base, &tmp);
                if (rv < 0)
                        return rv;
-               if ((long long)(-tmp) >= 0)
+               if ((long long)-tmp > 0)
                        return -ERANGE;
                *res = -tmp;
        } else {
diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c
new file mode 100644 (file)
index 0000000..88d3d32
--- /dev/null
@@ -0,0 +1,162 @@
+/*
+ *  NMI backtrace support
+ *
+ * Gratuitously copied from arch/x86/kernel/apic/hw_nmi.c by Russell King,
+ * with the following header:
+ *
+ *  HW NMI watchdog support
+ *
+ *  started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
+ *
+ *  Arch specific calls to support NMI watchdog
+ *
+ *  Bits copied from original nmi.c file
+ */
+#include <linux/cpumask.h>
+#include <linux/delay.h>
+#include <linux/kprobes.h>
+#include <linux/nmi.h>
+#include <linux/seq_buf.h>
+
+#ifdef arch_trigger_all_cpu_backtrace
+/* For reliability, we're prepared to waste bits here. */
+static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
+static cpumask_t printtrace_mask;
+
+#define NMI_BUF_SIZE           4096
+
+struct nmi_seq_buf {
+       unsigned char           buffer[NMI_BUF_SIZE];
+       struct seq_buf          seq;
+};
+
+/* Safe printing in NMI context */
+static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);
+
+/* "in progress" flag of arch_trigger_all_cpu_backtrace */
+static unsigned long backtrace_flag;
+
+static void print_seq_line(struct nmi_seq_buf *s, int start, int end)
+{
+       const char *buf = s->buffer + start;
+
+       printk("%.*s", (end - start) + 1, buf);
+}
+
+void nmi_trigger_all_cpu_backtrace(bool include_self,
+                                  void (*raise)(cpumask_t *mask))
+{
+       struct nmi_seq_buf *s;
+       int i, cpu, this_cpu = get_cpu();
+
+       if (test_and_set_bit(0, &backtrace_flag)) {
+               /*
+                * If there is already a trigger_all_cpu_backtrace() in progress
+                * (backtrace_flag == 1), don't output duplicate CPU dump info.
+                */
+               put_cpu();
+               return;
+       }
+
+       cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
+       if (!include_self)
+               cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask));
+
+       cpumask_copy(&printtrace_mask, to_cpumask(backtrace_mask));
+
+       /*
+        * Set up per_cpu seq_buf buffers that the NMIs running on the other
+        * CPUs will write to.
+        */
+       for_each_cpu(cpu, to_cpumask(backtrace_mask)) {
+               s = &per_cpu(nmi_print_seq, cpu);
+               seq_buf_init(&s->seq, s->buffer, NMI_BUF_SIZE);
+       }
+
+       if (!cpumask_empty(to_cpumask(backtrace_mask))) {
+               pr_info("Sending NMI to %s CPUs:\n",
+                       (include_self ? "all" : "other"));
+               raise(to_cpumask(backtrace_mask));
+       }
+
+       /* Wait for up to 10 seconds for all CPUs to do the backtrace */
+       for (i = 0; i < 10 * 1000; i++) {
+               if (cpumask_empty(to_cpumask(backtrace_mask)))
+                       break;
+               mdelay(1);
+               touch_softlockup_watchdog();
+       }
+
+       /*
+        * Now that all the NMIs have triggered, we can dump out their
+        * back traces safely to the console.
+        */
+       for_each_cpu(cpu, &printtrace_mask) {
+               int len, last_i = 0;
+
+               s = &per_cpu(nmi_print_seq, cpu);
+               len = seq_buf_used(&s->seq);
+               if (!len)
+                       continue;
+
+               /* Print line by line. */
+               for (i = 0; i < len; i++) {
+                       if (s->buffer[i] == '\n') {
+                               print_seq_line(s, last_i, i);
+                               last_i = i + 1;
+                       }
+               }
+               /* Check if there was a partial line. */
+               if (last_i < len) {
+                       print_seq_line(s, last_i, len - 1);
+                       pr_cont("\n");
+               }
+       }
+
+       clear_bit(0, &backtrace_flag);
+       smp_mb__after_atomic();
+       put_cpu();
+}
+
+/*
+ * It is not safe to call printk() directly from NMI handlers.
+ * It may be fine if the NMI detected a lockup and we have no choice
+ * but to do so, but triggering an NMI on all other CPUs to collect a
+ * backtrace can be done with sysrq-l. We don't want that to lock up,
+ * which can happen if the NMI interrupts a printk in progress.
+ *
+ * Instead, we redirect the vprintk() to this nmi_vprintk() that writes
+ * the content into a per cpu seq_buf buffer. Then when the NMIs are
+ * all done, we can safely dump the contents of the seq_buf to a printk()
+ * from a non NMI context.
+ */
+static int nmi_vprintk(const char *fmt, va_list args)
+{
+       struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
+       unsigned int len = seq_buf_used(&s->seq);
+
+       seq_buf_vprintf(&s->seq, fmt, args);
+       return seq_buf_used(&s->seq) - len;
+}
+
+bool nmi_cpu_backtrace(struct pt_regs *regs)
+{
+       int cpu = smp_processor_id();
+
+       if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
+               printk_func_t printk_func_save = this_cpu_read(printk_func);
+
+               /* Replace printk to write into the NMI seq */
+               this_cpu_write(printk_func, nmi_vprintk);
+               pr_warn("NMI backtrace for cpu %d\n", cpu);
+               show_regs(regs);
+               this_cpu_write(printk_func, printk_func_save);
+
+               cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
+               return true;
+       }
+
+       return false;
+}
+NOKPROBE_SYMBOL(nmi_cpu_backtrace);
+#endif
index 5f5d24d1d53ff15c67889f97f881488b5e1ec7d1..c10fba461454fc8efe03b9fb46719ee80dad6a47 100644 (file)
@@ -41,11 +41,8 @@ void __iomem *pci_iomap_range(struct pci_dev *dev,
                len = maxlen;
        if (flags & IORESOURCE_IO)
                return __pci_ioport_map(dev, start, len);
-       if (flags & IORESOURCE_MEM) {
-               if (flags & IORESOURCE_CACHEABLE)
-                       return ioremap(start, len);
-               return ioremap_nocache(start, len);
-       }
+       if (flags & IORESOURCE_MEM)
+               return ioremap(start, len);
        /* What? */
        return NULL;
 }
index adc98e1825ba0d6df09623595253f5d9f4af9a73..1feed6a2b12ae6abe2750b18adfa0d4c34327b96 100644 (file)
@@ -38,11 +38,9 @@ void show_mem(unsigned int filter)
 
        printk("%lu pages RAM\n", total);
        printk("%lu pages HighMem/MovableOnly\n", highmem);
+       printk("%lu pages reserved\n", reserved);
 #ifdef CONFIG_CMA
-       printk("%lu pages reserved\n", (reserved - totalcma_pages));
        printk("%lu pages cma reserved\n", totalcma_pages);
-#else
-       printk("%lu pages reserved\n", reserved);
 #endif
 #ifdef CONFIG_QUICKLIST
        printk("%lu pages in pagetable cache\n",
index c98ae818eb4eed802119133ca9d1c809d6b9f155..5939f63d90cde79fe1e09814765539a7e43a3c28 100644 (file)
@@ -59,7 +59,11 @@ void string_get_size(u64 size, u64 blk_size, const enum string_size_units units,
        }
 
        exp = divisor[units] / (u32)blk_size;
-       if (size >= exp) {
+       /*
+        * size must be strictly greater than exp here to ensure that remainder
+        * is greater than divisor[units] coming out of the if below.
+        */
+       if (size > exp) {
                remainder = do_div(size, divisor[units]);
                remainder *= blk_size;
                i++;
@@ -410,7 +414,7 @@ static bool escape_hex(unsigned char c, char **dst, char *end)
  * @dst:       destination buffer (escaped)
  * @osz:       destination buffer size
  * @flags:     combination of the flags (bitwise OR):
- *     %ESCAPE_SPACE:
+ *     %ESCAPE_SPACE: (special white space, not space itself)
  *             '\f' - form feed
  *             '\n' - new line
  *             '\r' - carriage return
@@ -432,16 +436,18 @@ static bool escape_hex(unsigned char c, char **dst, char *end)
  *             all previous together
  *     %ESCAPE_HEX:
  *             '\xHH' - byte with hexadecimal value HH (2 digits)
- * @esc:       NULL-terminated string of characters any of which, if found in
- *             the source, has to be escaped
+ * @only:      NULL-terminated string containing characters used to limit
+ *             the selected escape class. If characters are included in @only
+ *             that would not normally be escaped by the classes selected
+ *             in @flags, they will be copied to @dst unescaped.
  *
  * Description:
  * The process of escaping byte buffer includes several parts. They are applied
  * in the following sequence.
  *     1. The character is matched to the printable class, if asked, and in
  *        case of match it passes through to the output.
- *     2. The character is not matched to the one from @esc string and thus
- *        must go as is to the output.
+ *     2. The character is not matched to the one from @only string and thus
+ *        must go as-is to the output.
  *     3. The character is checked if it falls into the class given by @flags.
  *        %ESCAPE_OCTAL and %ESCAPE_HEX are going last since they cover any
  *        character. Note that they actually can't go together, otherwise
@@ -458,11 +464,11 @@ static bool escape_hex(unsigned char c, char **dst, char *end)
  * dst for a '\0' terminator if and only if ret < osz.
  */
 int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
-                     unsigned int flags, const char *esc)
+                     unsigned int flags, const char *only)
 {
        char *p = dst;
        char *end = p + osz;
-       bool is_dict = esc && *esc;
+       bool is_dict = only && *only;
 
        while (isz--) {
                unsigned char c = *src++;
@@ -471,7 +477,7 @@ int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
                 * Apply rules in the following sequence:
                 *      - the character is printable, when @flags has
                 *        %ESCAPE_NP bit set
-                *      - the @esc string is supplied and does not contain a
+                *      - the @only string is supplied and does not contain a
                 *        character under question
                 *      - the character doesn't fall into a class of symbols
                 *        defined by given @flags
@@ -479,7 +485,7 @@ int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
                 * output buffer.
                 */
                if ((flags & ESCAPE_NP && isprint(c)) ||
-                   (is_dict && !strchr(esc, c))) {
+                   (is_dict && !strchr(only, c))) {
                        /* do nothing */
                } else {
                        if (flags & ESCAPE_SPACE && escape_space(c, &p, end))
index 4137bca5f8e8e5008ca88b7fcb801096856eaf0c..f355f67169b6a32fbb7162b46691a45de4291cd2 100644 (file)
@@ -260,6 +260,7 @@ static void __init test_kstrtoll_ok(void)
                {"4294967297",  10,     4294967297LL},
                {"9223372036854775807", 10,     9223372036854775807LL},
 
+               {"-0",  10,     0LL},
                {"-1",  10,     -1LL},
                {"-2",  10,     -2LL},
                {"-9223372036854775808",        10,     LLONG_MIN},
@@ -277,11 +278,6 @@ static void __init test_kstrtoll_fail(void)
                {"-9223372036854775809",        10},
                {"-18446744073709551614",       10},
                {"-18446744073709551615",       10},
-               /* negative zero isn't an integer in Linux */
-               {"-0",  0},
-               {"-0",  8},
-               {"-0",  10},
-               {"-0",  16},
                /* sign is first character if any */
                {"-+1", 0},
                {"-+1", 8},
index 098c08eddfab715e98a0c428ef69f3012b58d5d0..c1efb1b610179013baf5d662f40f739a5f9abc60 100644 (file)
@@ -65,7 +65,7 @@ static noinline void __init kmalloc_node_oob_right(void)
        kfree(ptr);
 }
 
-static noinline void __init kmalloc_large_oob_rigth(void)
+static noinline void __init kmalloc_large_oob_right(void)
 {
        char *ptr;
        size_t size = KMALLOC_MAX_CACHE_SIZE + 10;
@@ -114,7 +114,7 @@ static noinline void __init kmalloc_oob_krealloc_less(void)
                kfree(ptr1);
                return;
        }
-       ptr2[size1] = 'x';
+       ptr2[size2] = 'x';
        kfree(ptr2);
 }
 
@@ -259,7 +259,7 @@ static int __init kmalloc_tests_init(void)
        kmalloc_oob_right();
        kmalloc_oob_left();
        kmalloc_node_oob_right();
-       kmalloc_large_oob_rigth();
+       kmalloc_large_oob_right();
        kmalloc_oob_krealloc_more();
        kmalloc_oob_krealloc_less();
        kmalloc_oob_16();
index ddf348299f244a9db0b5f430b2df3b173298b4ac..9b1756b12743fad143f4c07ae866dfed5fa5bf74 100644 (file)
@@ -35,6 +35,7 @@
 /* #include "deflate.h" */
 
 #include <linux/zutil.h>
+#include <linux/bitrev.h>
 #include "defutil.h"
 
 #ifdef DEBUG_ZLIB
@@ -146,7 +147,6 @@ static void send_all_trees (deflate_state *s, int lcodes, int dcodes,
 static void compress_block (deflate_state *s, ct_data *ltree,
                            ct_data *dtree);
 static void set_data_type  (deflate_state *s);
-static unsigned bi_reverse (unsigned value, int length);
 static void bi_windup      (deflate_state *s);
 static void bi_flush       (deflate_state *s);
 static void copy_block     (deflate_state *s, char *buf, unsigned len,
@@ -284,7 +284,7 @@ static void tr_static_init(void)
     /* The static distance tree is trivial: */
     for (n = 0; n < D_CODES; n++) {
         static_dtree[n].Len = 5;
-        static_dtree[n].Code = bi_reverse((unsigned)n, 5);
+        static_dtree[n].Code = bitrev32((u32)n) >> (32 - 5);
     }
     static_init_done = 1;
 }
@@ -520,7 +520,7 @@ static void gen_codes(
         int len = tree[n].Len;
         if (len == 0) continue;
         /* Now reverse the bits */
-        tree[n].Code = bi_reverse(next_code[len]++, len);
+        tree[n].Code = bitrev32((u32)(next_code[len]++)) >> (32 - len);
 
         Tracecv(tree != static_ltree, (stderr,"\nn %3d %c l %2d c %4x (%x) ",
              n, (isgraph(n) ? n : ' '), len, tree[n].Code, next_code[len]-1));
index b640b6402e99e3a74db809012a4138c4cef48457..a8c370897c9f4ee641373ba9245a5d833de07683 100644 (file)
@@ -292,22 +292,6 @@ void zlib_tr_stored_type_only (deflate_state *);
     put_byte(s, (uch)((ush)(w) >> 8)); \
 }
 
-/* ===========================================================================
- * Reverse the first len bits of a code, using straightforward code (a faster
- * method would use a table)
- * IN assertion: 1 <= len <= 15
- */
-static inline unsigned bi_reverse(unsigned code, /* the value to invert */
-                                 int len)       /* its bit length */
-{
-    register unsigned res = 0;
-    do {
-        res |= code & 1;
-        code >>= 1, res <<= 1;
-    } while (--len > 0);
-    return res >> 1;
-}
-
 /* ===========================================================================
  * Flush the bit buffer, keeping at most 7 bits in it.
  */
index d4e6495a720f4af4869d1263a7c94fd7d5777254..0d9fdcd01e479d87a45cb487db54e8c1fff27883 100644 (file)
@@ -648,3 +648,35 @@ config DEFERRED_STRUCT_PAGE_INIT
          when kswapd starts. This has a potential performance impact on
         processes running early in the lifetime of the system until kswapd
          finishes the initialisation.
+
+config IDLE_PAGE_TRACKING
+       bool "Enable idle page tracking"
+       depends on SYSFS && MMU
+       select PAGE_EXTENSION if !64BIT
+       help
+         This feature allows estimating the number of user pages that have
+         not been touched during a given period of time. This information can
+         be useful to tune memory cgroup limits and/or for job placement
+         within a compute cluster.
+
+         See Documentation/vm/idle_page_tracking.txt for more details.
+
+config ZONE_DEVICE
+       bool "Device memory (pmem, etc...) hotplug support" if EXPERT
+       default !ZONE_DMA
+       depends on !ZONE_DMA
+       depends on MEMORY_HOTPLUG
+       depends on MEMORY_HOTREMOVE
+       depends on X86_64 #arch_add_memory() comprehends device memory
+
+       help
+         Device memory hotplug support allows for establishing pmem,
+         or other device driver discovered memory regions, in the
+         memmap. This allows pfn_to_page() lookups of otherwise
+         "device-physical" addresses which is needed for using a DAX
+         mapping in an O_DIRECT operation, among other things.
+
+         If FS_DAX is enabled, then say Y.
+
+config FRAME_VECTOR
+       bool
index b424d5e5b6ff5b1dec8f95fdd089451dba4dd19c..2ed43191fc3bf78f46f111e88fa9d5a01b8c661a 100644 (file)
@@ -79,3 +79,5 @@ obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
 obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
 obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
 obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
+obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
+obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o
index ee8d7fd07be3993d364e4345ec843a02782d3ee8..2df8ddcb0ca0a7f7a055456de4b46a8c55bbfdf1 100644 (file)
@@ -523,7 +523,7 @@ static int cgwb_create(struct backing_dev_info *bdi,
        int ret = 0;
 
        memcg = mem_cgroup_from_css(memcg_css);
-       blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &blkio_cgrp_subsys);
+       blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
        blkcg = css_to_blkcg(blkcg_css);
        memcg_cgwb_list = mem_cgroup_cgwb_list(memcg);
        blkcg_cgwb_list = &blkcg->cgwb_list;
@@ -645,7 +645,7 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
 
                        /* see whether the blkcg association has changed */
                        blkcg_css = cgroup_get_e_css(memcg_css->cgroup,
-                                                    &blkio_cgrp_subsys);
+                                                    &io_cgrp_subsys);
                        if (unlikely(wb->blkcg_css != blkcg_css ||
                                     !wb_tryget(wb)))
                                wb = NULL;
index a23dd19346548223152a4922a0e2e7d90cb56b10..3b6380784c285938b369e0a058793011a83242b5 100644 (file)
@@ -236,6 +236,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
        count += pages;
        while (pages--)
                __free_pages_bootmem(page++, cur++, 0);
+       bdata->node_bootmem_map = NULL;
 
        bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);
 
@@ -294,6 +295,9 @@ static void __init __free(bootmem_data_t *bdata,
                sidx + bdata->node_min_pfn,
                eidx + bdata->node_min_pfn);
 
+       if (WARN_ON(bdata->node_bootmem_map == NULL))
+               return;
+
        if (bdata->hint_idx > sidx)
                bdata->hint_idx = sidx;
 
@@ -314,6 +318,9 @@ static int __init __reserve(bootmem_data_t *bdata, unsigned long sidx,
                eidx + bdata->node_min_pfn,
                flags);
 
+       if (WARN_ON(bdata->node_bootmem_map == NULL))
+               return 0;
+
        for (idx = sidx; idx < eidx; idx++)
                if (test_and_set_bit(idx, bdata->node_bootmem_map)) {
                        if (exclusive) {
index 018f08da99a2ece0aa25475c5f8f54bcc436f0eb..c5c627aae9962daf9c64d4f482c075e4cd96422a 100644 (file)
@@ -207,6 +207,13 @@ static inline bool isolation_suitable(struct compact_control *cc,
        return !get_pageblock_skip(page);
 }
 
+static void reset_cached_positions(struct zone *zone)
+{
+       zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
+       zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
+       zone->compact_cached_free_pfn = zone_end_pfn(zone);
+}
+
 /*
  * This function is called to clear all cached information on pageblocks that
  * should be skipped for page isolation when the migrate and free page scanner
@@ -218,9 +225,6 @@ static void __reset_isolation_suitable(struct zone *zone)
        unsigned long end_pfn = zone_end_pfn(zone);
        unsigned long pfn;
 
-       zone->compact_cached_migrate_pfn[0] = start_pfn;
-       zone->compact_cached_migrate_pfn[1] = start_pfn;
-       zone->compact_cached_free_pfn = end_pfn;
        zone->compact_blockskip_flush = false;
 
        /* Walk the zone and mark every pageblock as suitable for isolation */
@@ -238,6 +242,8 @@ static void __reset_isolation_suitable(struct zone *zone)
 
                clear_pageblock_skip(page);
        }
+
+       reset_cached_positions(zone);
 }
 
 void reset_isolation_suitable(pg_data_t *pgdat)
@@ -431,6 +437,24 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
 
                if (!valid_page)
                        valid_page = page;
+
+               /*
+                * For compound pages such as THP and hugetlbfs, we can save
+                * potentially a lot of iterations if we skip them at once.
+                * The check is racy, but we can consider only valid values
+                * and the only danger is skipping too much.
+                */
+               if (PageCompound(page)) {
+                       unsigned int comp_order = compound_order(page);
+
+                       if (likely(comp_order < MAX_ORDER)) {
+                               blockpfn += (1UL << comp_order) - 1;
+                               cursor += (1UL << comp_order) - 1;
+                       }
+
+                       goto isolate_fail;
+               }
+
                if (!PageBuddy(page))
                        goto isolate_fail;
 
@@ -490,6 +514,13 @@ isolate_fail:
 
        }
 
+       /*
+        * There is a tiny chance that we have read bogus compound_order(),
+        * so be careful to not go outside of the pageblock.
+        */
+       if (unlikely(blockpfn > end_pfn))
+               blockpfn = end_pfn;
+
        trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn,
                                        nr_scanned, total_isolated);
 
@@ -674,6 +705,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 
        /* Time to isolate some pages for migration */
        for (; low_pfn < end_pfn; low_pfn++) {
+               bool is_lru;
+
                /*
                 * Periodically drop the lock (if held) regardless of its
                 * contention, to give chance to IRQs. Abort async compaction
@@ -717,36 +750,35 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
                 * It's possible to migrate LRU pages and balloon pages
                 * Skip any other type of page
                 */
-               if (!PageLRU(page)) {
+               is_lru = PageLRU(page);
+               if (!is_lru) {
                        if (unlikely(balloon_page_movable(page))) {
                                if (balloon_page_isolate(page)) {
                                        /* Successfully isolated */
                                        goto isolate_success;
                                }
                        }
-                       continue;
                }
 
                /*
-                * PageLRU is set. lru_lock normally excludes isolation
-                * splitting and collapsing (collapsing has already happened
-                * if PageLRU is set) but the lock is not necessarily taken
-                * here and it is wasteful to take it just to check transhuge.
-                * Check TransHuge without lock and skip the whole pageblock if
-                * it's either a transhuge or hugetlbfs page, as calling
-                * compound_order() without preventing THP from splitting the
-                * page underneath us may return surprising results.
+                * Regardless of being on LRU, compound pages such as THP and
+                * hugetlbfs are not to be compacted. We can potentially save
+                * a lot of iterations if we skip them at once. The check is
+                * racy, but we can consider only valid values and the only
+                * danger is skipping too much.
                 */
-               if (PageTransHuge(page)) {
-                       if (!locked)
-                               low_pfn = ALIGN(low_pfn + 1,
-                                               pageblock_nr_pages) - 1;
-                       else
-                               low_pfn += (1 << compound_order(page)) - 1;
+               if (PageCompound(page)) {
+                       unsigned int comp_order = compound_order(page);
+
+                       if (likely(comp_order < MAX_ORDER))
+                               low_pfn += (1UL << comp_order) - 1;
 
                        continue;
                }
 
+               if (!is_lru)
+                       continue;
+
                /*
                 * Migration will fail if an anonymous page is pinned in memory,
                 * so avoid taking lru_lock and isolating it unnecessarily in an
@@ -763,11 +795,17 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
                        if (!locked)
                                break;
 
-                       /* Recheck PageLRU and PageTransHuge under lock */
+                       /* Recheck PageLRU and PageCompound under lock */
                        if (!PageLRU(page))
                                continue;
-                       if (PageTransHuge(page)) {
-                               low_pfn += (1 << compound_order(page)) - 1;
+
+                       /*
+                        * Page became compound since the non-locked check,
+                        * and it's on LRU. It can only be a THP so the order
+                        * is safe to read and it's 0 for tail pages.
+                        */
+                       if (unlikely(PageCompound(page))) {
+                               low_pfn += (1UL << compound_order(page)) - 1;
                                continue;
                        }
                }
@@ -778,7 +816,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
                if (__isolate_lru_page(page, isolate_mode) != 0)
                        continue;
 
-               VM_BUG_ON_PAGE(PageTransCompound(page), page);
+               VM_BUG_ON_PAGE(PageCompound(page), page);
 
                /* Successfully isolated */
                del_page_from_lru_list(page, lruvec, page_lru(page));
@@ -897,6 +935,16 @@ static bool suitable_migration_target(struct page *page)
        return false;
 }
 
+/*
+ * Test whether the free scanner has reached the same or lower pageblock than
+ * the migration scanner, and compaction should thus terminate.
+ */
+static inline bool compact_scanners_met(struct compact_control *cc)
+{
+       return (cc->free_pfn >> pageblock_order)
+               <= (cc->migrate_pfn >> pageblock_order);
+}
+
 /*
  * Based on information in the current compact_control, find blocks
  * suitable for isolating free pages from and then isolate them.
@@ -933,8 +981,7 @@ static void isolate_freepages(struct compact_control *cc)
         * pages on cc->migratepages. We stop searching if the migrate
         * and free page scanners meet or enough free pages are isolated.
         */
-       for (; block_start_pfn >= low_pfn &&
-                       cc->nr_migratepages > cc->nr_freepages;
+       for (; block_start_pfn >= low_pfn;
                                block_end_pfn = block_start_pfn,
                                block_start_pfn -= pageblock_nr_pages,
                                isolate_start_pfn = block_start_pfn) {
@@ -966,6 +1013,8 @@ static void isolate_freepages(struct compact_control *cc)
                                        block_end_pfn, freelist, false);
 
                /*
+                * If we isolated enough freepages, or aborted due to async
+                * compaction being contended, terminate the loop.
                 * Remember where the free scanner should restart next time,
                 * which is where isolate_freepages_block() left off.
                 * But if it scanned the whole pageblock, isolate_start_pfn
@@ -974,27 +1023,31 @@ static void isolate_freepages(struct compact_control *cc)
                 * In that case we will however want to restart at the start
                 * of the previous pageblock.
                 */
-               cc->free_pfn = (isolate_start_pfn < block_end_pfn) ?
-                               isolate_start_pfn :
-                               block_start_pfn - pageblock_nr_pages;
-
-               /*
-                * isolate_freepages_block() might have aborted due to async
-                * compaction being contended
-                */
-               if (cc->contended)
+               if ((cc->nr_freepages >= cc->nr_migratepages)
+                                                       || cc->contended) {
+                       if (isolate_start_pfn >= block_end_pfn)
+                               isolate_start_pfn =
+                                       block_start_pfn - pageblock_nr_pages;
                        break;
+               } else {
+                       /*
+                        * isolate_freepages_block() should not terminate
+                        * prematurely unless contended, or isolated enough
+                        */
+                       VM_BUG_ON(isolate_start_pfn < block_end_pfn);
+               }
        }
 
        /* split_free_page does not map the pages */
        map_pages(freelist);
 
        /*
-        * If we crossed the migrate scanner, we want to keep it that way
-        * so that compact_finished() may detect this
+        * Record where the free scanner will restart next time. Either we
+        * broke from the loop and set isolate_start_pfn based on the last
+        * call to isolate_freepages_block(), or we met the migration scanner
+        * and the loop terminated due to isolate_start_pfn < low_pfn
         */
-       if (block_start_pfn < low_pfn)
-               cc->free_pfn = cc->migrate_pfn;
+       cc->free_pfn = isolate_start_pfn;
 }
 
 /*
@@ -1062,6 +1115,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
                                        struct compact_control *cc)
 {
        unsigned long low_pfn, end_pfn;
+       unsigned long isolate_start_pfn;
        struct page *page;
        const isolate_mode_t isolate_mode =
                (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
@@ -1110,6 +1164,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
                        continue;
 
                /* Perform the isolation */
+               isolate_start_pfn = low_pfn;
                low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn,
                                                                isolate_mode);
 
@@ -1118,6 +1173,15 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
                        return ISOLATE_ABORT;
                }
 
+               /*
+                * Record where we could have freed pages by migration and not
+                * yet flushed them to buddy allocator.
+                * - this is the lowest page that could have been isolated and
+                * then freed by migration.
+                */
+               if (cc->nr_migratepages && !cc->last_migrated_pfn)
+                       cc->last_migrated_pfn = isolate_start_pfn;
+
                /*
                 * Either we isolated something and proceed with migration. Or
                 * we failed and compact_zone should decide if we should
@@ -1127,12 +1191,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
        }
 
        acct_isolated(zone, cc);
-       /*
-        * Record where migration scanner will be restarted. If we end up in
-        * the same pageblock as the free scanner, make the scanners fully
-        * meet so that compact_finished() terminates compaction.
-        */
-       cc->migrate_pfn = (end_pfn <= cc->free_pfn) ? low_pfn : cc->free_pfn;
+       /* Record where migration scanner will be restarted. */
+       cc->migrate_pfn = low_pfn;
 
        return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
 }
@@ -1147,11 +1207,9 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
                return COMPACT_PARTIAL;
 
        /* Compaction run completes if the migrate and free scanner meet */
-       if (cc->free_pfn <= cc->migrate_pfn) {
+       if (compact_scanners_met(cc)) {
                /* Let the next compaction start anew. */
-               zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
-               zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
-               zone->compact_cached_free_pfn = zone_end_pfn(zone);
+               reset_cached_positions(zone);
 
                /*
                 * Mark that the PG_migrate_skip information should be cleared
@@ -1295,7 +1353,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
        unsigned long end_pfn = zone_end_pfn(zone);
        const int migratetype = gfpflags_to_migratetype(cc->gfp_mask);
        const bool sync = cc->mode != MIGRATE_ASYNC;
-       unsigned long last_migrated_pfn = 0;
 
        ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
                                                        cc->classzone_idx);
@@ -1333,6 +1390,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
                zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
                zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
        }
+       cc->last_migrated_pfn = 0;
 
        trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
                                cc->free_pfn, end_pfn, sync);
@@ -1342,7 +1400,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
        while ((ret = compact_finished(zone, cc, migratetype)) ==
                                                COMPACT_CONTINUE) {
                int err;
-               unsigned long isolate_start_pfn = cc->migrate_pfn;
 
                switch (isolate_migratepages(zone, cc)) {
                case ISOLATE_ABORT:
@@ -1376,22 +1433,12 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
                         * migrate_pages() may return -ENOMEM when scanners meet
                         * and we want compact_finished() to detect it
                         */
-                       if (err == -ENOMEM && cc->free_pfn > cc->migrate_pfn) {
+                       if (err == -ENOMEM && !compact_scanners_met(cc)) {
                                ret = COMPACT_PARTIAL;
                                goto out;
                        }
                }
 
-               /*
-                * Record where we could have freed pages by migration and not
-                * yet flushed them to buddy allocator. We use the pfn that
-                * isolate_migratepages() started from in this loop iteration
-                * - this is the lowest page that could have been isolated and
-                * then freed by migration.
-                */
-               if (!last_migrated_pfn)
-                       last_migrated_pfn = isolate_start_pfn;
-
 check_drain:
                /*
                 * Has the migration scanner moved away from the previous
@@ -1400,18 +1447,18 @@ check_drain:
                 * compact_finished() can detect immediately if allocation
                 * would succeed.
                 */
-               if (cc->order > 0 && last_migrated_pfn) {
+               if (cc->order > 0 && cc->last_migrated_pfn) {
                        int cpu;
                        unsigned long current_block_start =
                                cc->migrate_pfn & ~((1UL << cc->order) - 1);
 
-                       if (last_migrated_pfn < current_block_start) {
+                       if (cc->last_migrated_pfn < current_block_start) {
                                cpu = get_cpu();
                                lru_add_drain_cpu(cpu);
                                drain_local_pages(zone);
                                put_cpu();
                                /* No more flushing until we migrate again */
-                               last_migrated_pfn = 0;
+                               cc->last_migrated_pfn = 0;
                        }
                }
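
The compaction.c hunks above fold the "scanners have met" test into compact_scanners_met(), which compares pageblock numbers rather than raw pfns, and move last_migrated_pfn from a local variable in compact_zone() into struct compact_control so isolate_migratepages() can record it. A small worked illustration of the new termination test; the pfn values and pageblock_order == 9 (512-page blocks, typical for x86-64) are assumptions for the example only:

/* Illustrative only: the free and migration scanners now count as having
 * met as soon as they sit in the same pageblock.  For example, with
 * order == 9:
 *   free_pfn    = 0x13ff  ->  pageblock 9
 *   migrate_pfn = 0x1200  ->  pageblock 9
 * so the check is true and the compaction run terminates for this zone.
 */
static inline bool scanners_met(unsigned long free_pfn,
				unsigned long migrate_pfn,
				unsigned int order)
{
	return (free_pfn >> order) <= (migrate_pfn >> order);
}
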
 
index 76089ddf99ea1cba96695010cbcbe9e5a4a90331..6c1b3ea61bfddfe4f042a6ef067e53e34f82792b 100644 (file)
@@ -48,6 +48,10 @@ static const struct trace_print_flags pageflag_names[] = {
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
        {1UL << PG_compound_lock,       "compound_lock" },
 #endif
+#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
+       {1UL << PG_young,               "young"         },
+       {1UL << PG_idle,                "idle"          },
+#endif
 };
 
 static void dump_flags(unsigned long flags,
index 59d10d16f0a5d906b18d98d1962f454075c828fe..71a8998cd03a6b8b0d2dbfe36a24ce70766047e8 100644 (file)
@@ -271,6 +271,9 @@ void dma_pool_destroy(struct dma_pool *pool)
 {
        bool empty = false;
 
+       if (unlikely(!pool))
+               return;
+
        mutex_lock(&pools_reg_lock);
        mutex_lock(&pools_lock);
        list_del(&pool->pools);
@@ -334,7 +337,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
        /* pool_alloc_page() might sleep, so temporarily drop &pool->lock */
        spin_unlock_irqrestore(&pool->lock, flags);
 
-       page = pool_alloc_page(pool, mem_flags);
+       page = pool_alloc_page(pool, mem_flags & (~__GFP_ZERO));
        if (!page)
                return NULL;
 
@@ -372,9 +375,14 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
                        break;
                }
        }
-       memset(retval, POOL_POISON_ALLOCATED, pool->size);
+       if (!(mem_flags & __GFP_ZERO))
+               memset(retval, POOL_POISON_ALLOCATED, pool->size);
 #endif
        spin_unlock_irqrestore(&pool->lock, flags);
+
+       if (mem_flags & __GFP_ZERO)
+               memset(retval, 0, pool->size);
+
        return retval;
 }
 EXPORT_SYMBOL(dma_pool_alloc);
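
The dmapool.c hunks make dma_pool_destroy() tolerate a NULL pool and teach dma_pool_alloc() to honour __GFP_ZERO: the flag is masked off before pool_alloc_page(), the poison fill is skipped, and the returned block is zeroed instead. A hedged usage sketch; the pool name, 64-byte block size and 8-byte alignment are placeholder values:

#include <linux/device.h>
#include <linux/dmapool.h>
#include <linux/gfp.h>

/* Illustrative only: create a pool and hand back one zeroed block. */
static void *alloc_zeroed_desc(struct device *dev, struct dma_pool **poolp,
			       dma_addr_t *dma)
{
	struct dma_pool *pool;
	void *desc;

	pool = dma_pool_create("example-pool", dev, 64, 8, 0);
	if (!pool)
		return NULL;

	/* With the change above, __GFP_ZERO yields an already-cleared block. */
	desc = dma_pool_alloc(pool, GFP_KERNEL | __GFP_ZERO, dma);
	if (!desc) {
		dma_pool_destroy(pool);
		return NULL;
	}

	*poolp = pool;
	return desc;
}
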
index e10ccd299d6666887ba9347308afaa112ee8816f..17ae14b5aefa2e5302a90e5b748a719e22a22f3b 100644 (file)
@@ -15,6 +15,7 @@
 #include <linux/mm.h>
 #include <linux/vmalloc.h>
 #include <asm/fixmap.h>
+#include <asm/early_ioremap.h>
 
 #ifdef CONFIG_MMU
 static int early_ioremap_debug __initdata;
@@ -217,6 +218,35 @@ early_memremap(resource_size_t phys_addr, unsigned long size)
        return (__force void *)__early_ioremap(phys_addr, size,
                                               FIXMAP_PAGE_NORMAL);
 }
+#ifdef FIXMAP_PAGE_RO
+void __init *
+early_memremap_ro(resource_size_t phys_addr, unsigned long size)
+{
+       return (__force void *)__early_ioremap(phys_addr, size, FIXMAP_PAGE_RO);
+}
+#endif
+
+#define MAX_MAP_CHUNK  (NR_FIX_BTMAPS << PAGE_SHIFT)
+
+void __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size)
+{
+       unsigned long slop, clen;
+       char *p;
+
+       while (size) {
+               slop = src & ~PAGE_MASK;
+               clen = size;
+               if (clen > MAX_MAP_CHUNK - slop)
+                       clen = MAX_MAP_CHUNK - slop;
+               p = early_memremap(src & PAGE_MASK, clen + slop);
+               memcpy(dest, p + slop, clen);
+               early_memunmap(p, clen + slop);
+               dest += clen;
+               src += clen;
+               size -= clen;
+       }
+}
+
 #else /* CONFIG_MMU */
 
 void __init __iomem *
@@ -231,6 +261,11 @@ early_memremap(resource_size_t phys_addr, unsigned long size)
 {
        return (void *)phys_addr;
 }
+void __init *
+early_memremap_ro(resource_size_t phys_addr, unsigned long size)
+{
+       return (void *)phys_addr;
+}
 
 void __init early_iounmap(void __iomem *addr, unsigned long size)
 {
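
The early_ioremap.c additions provide early_memremap_ro() (only when the architecture defines FIXMAP_PAGE_RO) and copy_from_early_mem(), which copies an arbitrarily large physical range by remapping it in MAX_MAP_CHUNK-sized pieces. A brief sketch of the intended early-boot usage; the physical address, length and magic value are hypothetical:

#include <linux/types.h>
#include <asm/early_ioremap.h>

/* Illustrative only: inspect and copy a firmware-provided blob before
 * normal ioremap() is available.  blob_phys/blob_len are placeholders.
 */
static void __init read_early_blob(phys_addr_t blob_phys,
				   unsigned long blob_len, void *dest)
{
	const u32 *hdr;

	hdr = early_memremap_ro(blob_phys, sizeof(*hdr));
	if (!hdr)
		return;
	if (*hdr == 0xfeedbeef)		/* hypothetical magic value */
		/* Copies in chunks, so blob_len may exceed the fixmap window. */
		copy_from_early_mem(dest, blob_phys, blob_len);
	early_memunmap((void *)hdr, sizeof(*hdr));
}
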
index 1283fc82545861d155c4eef7013bf8e285040fed..72940fb38666811b80c146bc085a1c84fc0e7ecc 100644 (file)
@@ -674,7 +674,7 @@ struct page *__page_cache_alloc(gfp_t gfp)
                do {
                        cpuset_mems_cookie = read_mems_allowed_begin();
                        n = cpuset_mem_spread_node();
-                       page = alloc_pages_exact_node(n, gfp, 0);
+                       page = __alloc_pages_node(n, gfp, 0);
                } while (!page && read_mems_allowed_retry(cpuset_mems_cookie));
 
                return page;
@@ -2473,21 +2473,6 @@ ssize_t generic_perform_write(struct file *file,
                                                iov_iter_count(i));
 
 again:
-               /*
-                * Bring in the user page that we will copy from _first_.
-                * Otherwise there's a nasty deadlock on copying from the
-                * same page as we're writing to, without it being marked
-                * up-to-date.
-                *
-                * Not only is this an optimisation, but it is also required
-                * to check that the address is actually valid, when atomic
-                * usercopies are used, below.
-                */
-               if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
-                       status = -EFAULT;
-                       break;
-               }
-
                status = a_ops->write_begin(file, mapping, pos, bytes, flags,
                                                &page, &fsdata);
                if (unlikely(status < 0))
@@ -2495,8 +2480,17 @@ again:
 
                if (mapping_writably_mapped(mapping))
                        flush_dcache_page(page);
-
+               /*
+                * 'page' is now locked.  If we are trying to copy from a
+                * mapping of 'page' in userspace, the copy might fault and
+                * would need PageUptodate() to complete.  But, page can not be
+                * made Uptodate without acquiring the page lock, which we hold.
+                * Deadlock.  Avoid with pagefault_disable().  Fix up below with
+                * iov_iter_fault_in_readable().
+                */
+               pagefault_disable();
                copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
+               pagefault_enable();
                flush_dcache_page(page);
 
                status = a_ops->write_end(file, mapping, pos, bytes, copied,
@@ -2519,6 +2513,14 @@ again:
                         */
                        bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
                                                iov_iter_single_seg_count(i));
+                       /*
+                        * This is the fallback to recover if the copy from
+                        * userspace above faults.
+                        */
+                       if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
+                               status = -EFAULT;
+                               break;
+                       }
                        goto again;
                }
                pos += copied;
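
The filemap.c change drops the up-front iov_iter_fault_in_readable() call: the atomic user copy now runs under pagefault_disable() while the destination page is locked, and faulting the source pages in is deferred to the "copied == 0" retry path. A condensed sketch of that pattern; copy_into_locked_page() is a made-up helper used only to show the ordering, not part of the real write path:

#include <linux/pagemap.h>
#include <linux/uaccess.h>
#include <linux/uio.h>

/* Illustrative only: copy user data into a page we hold locked without
 * deadlocking if the source happens to be a mapping of that same page.
 */
static size_t copy_into_locked_page(struct page *page, struct iov_iter *i,
				    unsigned long offset, size_t bytes)
{
	size_t copied;

	/* A fault here could need the locked page to become Uptodate, which
	 * cannot happen while we hold its lock; forbid faults and let a
	 * short copy trigger iov_iter_fault_in_readable() + retry instead.
	 */
	pagefault_disable();
	copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
	pagefault_enable();

	return copied;		/* 0 means: fault the source in, then retry */
}
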
diff --git a/mm/frame_vector.c b/mm/frame_vector.c
new file mode 100644 (file)
index 0000000..cdabcb9
--- /dev/null
@@ -0,0 +1,230 @@
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+#include <linux/sched.h>
+
+/**
+ * get_vaddr_frames() - map virtual addresses to pfns
+ * @start:     starting user address
+ * @nr_frames: number of pages / pfns from start to map
+ * @write:     whether pages will be written to by the caller
+ * @force:     whether to force write access even if user mapping is
+ *             readonly. See description of the same argument of
+ *             get_user_pages().
+ * @vec:       structure which receives pages / pfns of the addresses mapped.
+ *             It should have space for at least nr_frames entries.
+ *
+ * This function maps virtual addresses from @start and fills @vec structure
+ * with page frame numbers or page pointers to corresponding pages (choice
+ * depends on the type of the vma underlying the virtual address). If @start
+ * belongs to a normal vma, the function grabs reference to each of the pages
+ * to pin them in memory. If @start belongs to VM_IO | VM_PFNMAP vma, we don't
+ * touch page structures and the caller must make sure pfns aren't reused for
+ * anything else while he is using them.
+ *
+ * The function returns number of pages mapped which may be less than
+ * @nr_frames. In particular we stop mapping if there are more vmas of
+ * different type underlying the specified range of virtual addresses.
+ * When the function isn't able to map a single page, it returns error.
+ *
+ * This function takes care of grabbing mmap_sem as necessary.
+ */
+int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
+                    bool write, bool force, struct frame_vector *vec)
+{
+       struct mm_struct *mm = current->mm;
+       struct vm_area_struct *vma;
+       int ret = 0;
+       int err;
+       int locked;
+
+       if (nr_frames == 0)
+               return 0;
+
+       if (WARN_ON_ONCE(nr_frames > vec->nr_allocated))
+               nr_frames = vec->nr_allocated;
+
+       down_read(&mm->mmap_sem);
+       locked = 1;
+       vma = find_vma_intersection(mm, start, start + 1);
+       if (!vma) {
+               ret = -EFAULT;
+               goto out;
+       }
+       if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) {
+               vec->got_ref = true;
+               vec->is_pfns = false;
+               ret = get_user_pages_locked(current, mm, start, nr_frames,
+                       write, force, (struct page **)(vec->ptrs), &locked);
+               goto out;
+       }
+
+       vec->got_ref = false;
+       vec->is_pfns = true;
+       do {
+               unsigned long *nums = frame_vector_pfns(vec);
+
+               while (ret < nr_frames && start + PAGE_SIZE <= vma->vm_end) {
+                       err = follow_pfn(vma, start, &nums[ret]);
+                       if (err) {
+                               if (ret == 0)
+                                       ret = err;
+                               goto out;
+                       }
+                       start += PAGE_SIZE;
+                       ret++;
+               }
+               /*
+                * We stop if we have enough pages or if VMA doesn't completely
+                * cover the tail page.
+                */
+               if (ret >= nr_frames || start < vma->vm_end)
+                       break;
+               vma = find_vma_intersection(mm, start, start + 1);
+       } while (vma && vma->vm_flags & (VM_IO | VM_PFNMAP));
+out:
+       if (locked)
+               up_read(&mm->mmap_sem);
+       if (!ret)
+               ret = -EFAULT;
+       if (ret > 0)
+               vec->nr_frames = ret;
+       return ret;
+}
+EXPORT_SYMBOL(get_vaddr_frames);
+
+/**
+ * put_vaddr_frames() - drop references to pages if get_vaddr_frames() acquired
+ *                     them
+ * @vec:       frame vector to put
+ *
+ * Drop references to pages if get_vaddr_frames() acquired them. We also
+ * invalidate the frame vector so that it is prepared for the next call into
+ * get_vaddr_frames().
+ */
+void put_vaddr_frames(struct frame_vector *vec)
+{
+       int i;
+       struct page **pages;
+
+       if (!vec->got_ref)
+               goto out;
+       pages = frame_vector_pages(vec);
+       /*
+        * frame_vector_pages() might have needed to do a conversion when
+        * get_vaddr_frames() got pages but vec was later converted to pfns.
+        * But it shouldn't really fail to convert pfns back...
+        */
+       if (WARN_ON(IS_ERR(pages)))
+               goto out;
+       for (i = 0; i < vec->nr_frames; i++)
+               put_page(pages[i]);
+       vec->got_ref = false;
+out:
+       vec->nr_frames = 0;
+}
+EXPORT_SYMBOL(put_vaddr_frames);
+
+/**
+ * frame_vector_to_pages - convert frame vector to contain page pointers
+ * @vec:       frame vector to convert
+ *
+ * Convert @vec to contain array of page pointers.  If the conversion is
+ * successful, return 0. Otherwise return an error. Note that we do not grab
+ * page references for the page structures.
+ */
+int frame_vector_to_pages(struct frame_vector *vec)
+{
+       int i;
+       unsigned long *nums;
+       struct page **pages;
+
+       if (!vec->is_pfns)
+               return 0;
+       nums = frame_vector_pfns(vec);
+       for (i = 0; i < vec->nr_frames; i++)
+               if (!pfn_valid(nums[i]))
+                       return -EINVAL;
+       pages = (struct page **)nums;
+       for (i = 0; i < vec->nr_frames; i++)
+               pages[i] = pfn_to_page(nums[i]);
+       vec->is_pfns = false;
+       return 0;
+}
+EXPORT_SYMBOL(frame_vector_to_pages);
+
+/**
+ * frame_vector_to_pfns - convert frame vector to contain pfns
+ * @vec:       frame vector to convert
+ *
+ * Convert @vec to contain array of pfns.
+ */
+void frame_vector_to_pfns(struct frame_vector *vec)
+{
+       int i;
+       unsigned long *nums;
+       struct page **pages;
+
+       if (vec->is_pfns)
+               return;
+       pages = (struct page **)(vec->ptrs);
+       nums = (unsigned long *)pages;
+       for (i = 0; i < vec->nr_frames; i++)
+               nums[i] = page_to_pfn(pages[i]);
+       vec->is_pfns = true;
+}
+EXPORT_SYMBOL(frame_vector_to_pfns);
+
+/**
+ * frame_vector_create() - allocate & initialize structure for pinned pfns
+ * @nr_frames: number of pfns slots we should reserve
+ *
+ * Allocate and initialize a struct frame_vector able to hold @nr_frames
+ * pfns.
+ */
+struct frame_vector *frame_vector_create(unsigned int nr_frames)
+{
+       struct frame_vector *vec;
+       int size = sizeof(struct frame_vector) + sizeof(void *) * nr_frames;
+
+       if (WARN_ON_ONCE(nr_frames == 0))
+               return NULL;
+       /*
+        * This is absurdly high. It's here just to avoid strange effects when
+        * arithmetic overflows.
+        */
+       if (WARN_ON_ONCE(nr_frames > INT_MAX / sizeof(void *) / 2))
+               return NULL;
+       /*
+        * Avoid higher order allocations, use vmalloc instead. It should
+        * be rare anyway.
+        */
+       if (size <= PAGE_SIZE)
+               vec = kmalloc(size, GFP_KERNEL);
+       else
+               vec = vmalloc(size);
+       if (!vec)
+               return NULL;
+       vec->nr_allocated = nr_frames;
+       vec->nr_frames = 0;
+       return vec;
+}
+EXPORT_SYMBOL(frame_vector_create);
+
+/**
+ * frame_vector_destroy() - free memory allocated to carry frame vector
+ * @vec:       Frame vector to free
+ *
+ * Free structure allocated by frame_vector_create() to carry frames.
+ */
+void frame_vector_destroy(struct frame_vector *vec)
+{
+       /* Make sure put_vaddr_frames() got called properly... */
+       VM_BUG_ON(vec->nr_frames > 0);
+       kvfree(vec);
+}
+EXPORT_SYMBOL(frame_vector_destroy);
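
mm/frame_vector.c is new in this merge, so an end-to-end usage sketch of the exported API may help. The address, frame count, write/force arguments and error handling below are illustrative only; frame_vector_pages() is the accessor the file itself uses to view the vector as struct page pointers:

#include <linux/err.h>
#include <linux/mm.h>

/* Illustrative only: pin up to 16 user pages starting at uaddr and walk
 * them as struct page pointers.  uaddr is a placeholder.
 */
static int pin_user_range(unsigned long uaddr)
{
	struct frame_vector *vec;
	struct page **pages;
	int nr, ret = 0;

	vec = frame_vector_create(16);
	if (!vec)
		return -ENOMEM;

	nr = get_vaddr_frames(uaddr, 16, true, false, vec);
	if (nr < 0) {
		ret = nr;
		goto out_destroy;
	}

	/* Fails with -EINVAL if the range was VM_IO/VM_PFNMAP backed. */
	ret = frame_vector_to_pages(vec);
	if (ret)
		goto out_put;

	pages = frame_vector_pages(vec);
	/* ... use pages[0] .. pages[nr - 1] here ... */

out_put:
	put_vaddr_frames(vec);
out_destroy:
	frame_vector_destroy(vec);
	return ret;
}
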
index 279a818a39b13d76e574bf8f330c7c925b8e3a67..4b06b8db9df23c8f33406586507bbaecf7f5444c 100644 (file)
@@ -16,6 +16,7 @@
 #include <linux/swap.h>
 #include <linux/shrinker.h>
 #include <linux/mm_inline.h>
+#include <linux/dax.h>
 #include <linux/kthread.h>
 #include <linux/khugepaged.h>
 #include <linux/freezer.h>
@@ -24,6 +25,7 @@
 #include <linux/migrate.h>
 #include <linux/hashtable.h>
 #include <linux/userfaultfd_k.h>
+#include <linux/page_idle.h>
 
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
@@ -105,7 +107,7 @@ static struct khugepaged_scan khugepaged_scan = {
 };
 
 
-static int set_recommended_min_free_kbytes(void)
+static void set_recommended_min_free_kbytes(void)
 {
        struct zone *zone;
        int nr_zones = 0;
@@ -140,7 +142,6 @@ static int set_recommended_min_free_kbytes(void)
                min_free_kbytes = recommended_min;
        }
        setup_per_zone_wmarks();
-       return 0;
 }
 
 static int start_stop_khugepaged(void)
@@ -172,12 +173,7 @@ fail:
 static atomic_t huge_zero_refcount;
 struct page *huge_zero_page __read_mostly;
 
-static inline bool is_huge_zero_pmd(pmd_t pmd)
-{
-       return is_huge_zero_page(pmd_page(pmd));
-}
-
-static struct page *get_huge_zero_page(void)
+struct page *get_huge_zero_page(void)
 {
        struct page *zero_page;
 retry:
@@ -794,16 +790,19 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
 }
 
 /* Caller must hold page table lock. */
-static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
+static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
                struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
                struct page *zero_page)
 {
        pmd_t entry;
+       if (!pmd_none(*pmd))
+               return false;
        entry = mk_pmd(zero_page, vma->vm_page_prot);
        entry = pmd_mkhuge(entry);
        pgtable_trans_huge_deposit(mm, pmd, pgtable);
        set_pmd_at(mm, haddr, pmd, entry);
        atomic_long_inc(&mm->nr_ptes);
+       return true;
 }
 
 int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -870,6 +869,49 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                                            flags);
 }
 
+static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
+               pmd_t *pmd, unsigned long pfn, pgprot_t prot, bool write)
+{
+       struct mm_struct *mm = vma->vm_mm;
+       pmd_t entry;
+       spinlock_t *ptl;
+
+       ptl = pmd_lock(mm, pmd);
+       if (pmd_none(*pmd)) {
+               entry = pmd_mkhuge(pfn_pmd(pfn, prot));
+               if (write) {
+                       entry = pmd_mkyoung(pmd_mkdirty(entry));
+                       entry = maybe_pmd_mkwrite(entry, vma);
+               }
+               set_pmd_at(mm, addr, pmd, entry);
+               update_mmu_cache_pmd(vma, addr, pmd);
+       }
+       spin_unlock(ptl);
+}
+
+int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
+                       pmd_t *pmd, unsigned long pfn, bool write)
+{
+       pgprot_t pgprot = vma->vm_page_prot;
+       /*
+        * If we had pmd_special, we could avoid all these restrictions,
+        * but we need to be consistent with PTEs and architectures that
+        * can't support a 'special' bit.
+        */
+       BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
+       BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
+                                               (VM_PFNMAP|VM_MIXEDMAP));
+       BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
+       BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
+
+       if (addr < vma->vm_start || addr >= vma->vm_end)
+               return VM_FAULT_SIGBUS;
+       if (track_pfn_insert(vma, &pgprot, pfn))
+               return VM_FAULT_SIGBUS;
+       insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write);
+       return VM_FAULT_NOPAGE;
+}
+
 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
                  struct vm_area_struct *vma)
@@ -1414,41 +1456,41 @@ out:
 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                 pmd_t *pmd, unsigned long addr)
 {
+       pmd_t orig_pmd;
        spinlock_t *ptl;
-       int ret = 0;
 
-       if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
-               struct page *page;
-               pgtable_t pgtable;
-               pmd_t orig_pmd;
-               /*
-                * For architectures like ppc64 we look at deposited pgtable
-                * when calling pmdp_huge_get_and_clear. So do the
-                * pgtable_trans_huge_withdraw after finishing pmdp related
-                * operations.
-                */
-               orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
-                                                       tlb->fullmm);
-               tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
-               pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
-               if (is_huge_zero_pmd(orig_pmd)) {
-                       atomic_long_dec(&tlb->mm->nr_ptes);
-                       spin_unlock(ptl);
+       if (__pmd_trans_huge_lock(pmd, vma, &ptl) != 1)
+               return 0;
+       /*
+        * For architectures like ppc64 we look at deposited pgtable
+        * when calling pmdp_huge_get_and_clear. So do the
+        * pgtable_trans_huge_withdraw after finishing pmdp related
+        * operations.
+        */
+       orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
+                       tlb->fullmm);
+       tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+       if (vma_is_dax(vma)) {
+               spin_unlock(ptl);
+               if (is_huge_zero_pmd(orig_pmd))
                        put_huge_zero_page();
-               } else {
-                       page = pmd_page(orig_pmd);
-                       page_remove_rmap(page);
-                       VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
-                       add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
-                       VM_BUG_ON_PAGE(!PageHead(page), page);
-                       atomic_long_dec(&tlb->mm->nr_ptes);
-                       spin_unlock(ptl);
-                       tlb_remove_page(tlb, page);
-               }
-               pte_free(tlb->mm, pgtable);
-               ret = 1;
+       } else if (is_huge_zero_pmd(orig_pmd)) {
+               pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd));
+               atomic_long_dec(&tlb->mm->nr_ptes);
+               spin_unlock(ptl);
+               put_huge_zero_page();
+       } else {
+               struct page *page = pmd_page(orig_pmd);
+               page_remove_rmap(page);
+               VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
+               add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
+               VM_BUG_ON_PAGE(!PageHead(page), page);
+               pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd));
+               atomic_long_dec(&tlb->mm->nr_ptes);
+               spin_unlock(ptl);
+               tlb_remove_page(tlb, page);
        }
-       return ret;
+       return 1;
 }
 
 int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
@@ -1716,6 +1758,11 @@ static void __split_huge_page_refcount(struct page *page,
                /* clear PageTail before overwriting first_page */
                smp_wmb();
 
+               if (page_is_young(page))
+                       set_page_young(page_tail);
+               if (page_is_idle(page))
+                       set_page_idle(page_tail);
+
                /*
                 * __split_huge_page_splitting() already set the
                 * splitting bit in all pmd that could map this
@@ -2221,7 +2268,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
                VM_BUG_ON_PAGE(PageLRU(page), page);
 
                /* If there is no mapped pte young don't collapse the page */
-               if (pte_young(pteval) || PageReferenced(page) ||
+               if (pte_young(pteval) ||
+                   page_is_young(page) || PageReferenced(page) ||
                    mmu_notifier_test_young(vma->vm_mm, address))
                        referenced = true;
        }
@@ -2285,8 +2333,12 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
 
 static void khugepaged_alloc_sleep(void)
 {
-       wait_event_freezable_timeout(khugepaged_wait, false,
-                       msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
+       DEFINE_WAIT(wait);
+
+       add_wait_queue(&khugepaged_wait, &wait);
+       freezable_schedule_timeout_interruptible(
+               msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
+       remove_wait_queue(&khugepaged_wait, &wait);
 }
 
 static int khugepaged_node_load[MAX_NUMNODES];
@@ -2373,7 +2425,7 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
         */
        up_read(&mm->mmap_sem);
 
-       *hpage = alloc_pages_exact_node(node, gfp, HPAGE_PMD_ORDER);
+       *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
        if (unlikely(!*hpage)) {
                count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
                *hpage = ERR_PTR(-ENOMEM);
@@ -2648,7 +2700,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
                 */
                if (page_count(page) != 1 + !!PageSwapCache(page))
                        goto out_unmap;
-               if (pte_young(pteval) || PageReferenced(page) ||
+               if (pte_young(pteval) ||
+                   page_is_young(page) || PageReferenced(page) ||
                    mmu_notifier_test_young(vma->vm_mm, address))
                        referenced = true;
        }
@@ -2911,7 +2964,7 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
                pmd_t *pmd)
 {
        spinlock_t *ptl;
-       struct page *page;
+       struct page *page = NULL;
        struct mm_struct *mm = vma->vm_mm;
        unsigned long haddr = address & HPAGE_PMD_MASK;
        unsigned long mmun_start;       /* For mmu_notifiers */
@@ -2924,25 +2977,27 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
 again:
        mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
        ptl = pmd_lock(mm, pmd);
-       if (unlikely(!pmd_trans_huge(*pmd))) {
-               spin_unlock(ptl);
-               mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
-               return;
-       }
-       if (is_huge_zero_pmd(*pmd)) {
+       if (unlikely(!pmd_trans_huge(*pmd)))
+               goto unlock;
+       if (vma_is_dax(vma)) {
+               pmd_t _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
+               if (is_huge_zero_pmd(_pmd))
+                       put_huge_zero_page();
+       } else if (is_huge_zero_pmd(*pmd)) {
                __split_huge_zero_page_pmd(vma, haddr, pmd);
-               spin_unlock(ptl);
-               mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
-               return;
+       } else {
+               page = pmd_page(*pmd);
+               VM_BUG_ON_PAGE(!page_count(page), page);
+               get_page(page);
        }
-       page = pmd_page(*pmd);
-       VM_BUG_ON_PAGE(!page_count(page), page);
-       get_page(page);
+ unlock:
        spin_unlock(ptl);
        mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 
-       split_huge_page(page);
+       if (!page)
+               return;
 
+       split_huge_page(page);
        put_page(page);
 
        /*
@@ -2991,7 +3046,7 @@ static void split_huge_page_address(struct mm_struct *mm,
        split_huge_page_pmd_mm(mm, address, pmd);
 }
 
-void __vma_adjust_trans_huge(struct vm_area_struct *vma,
+void vma_adjust_trans_huge(struct vm_area_struct *vma,
                             unsigned long start,
                             unsigned long end,
                             long adjust_next)
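
Besides the DAX-aware zap and split paths, the huge_memory.c hunks add vmf_insert_pfn_pmd() so a driver fault handler can install a huge pfn mapping directly. A hedged sketch of a caller; mydev_base_pfn() is a placeholder for however a driver would derive the backing pfn, and whether such a handler is wired up through a ->pmd_fault hook or elsewhere is outside this diff:

#include <linux/huge_mm.h>
#include <linux/mm.h>

static unsigned long mydev_base_pfn(struct vm_area_struct *vma); /* placeholder */

/* Illustrative only: map a pmd-sized chunk of device memory on fault. */
static int mydev_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
			   pmd_t *pmd, unsigned int flags)
{
	unsigned long haddr = addr & HPAGE_PMD_MASK;
	unsigned long pfn = mydev_base_pfn(vma) +
			    ((haddr - vma->vm_start) >> PAGE_SHIFT);

	/* Addresses outside the vma (or a failed track_pfn_insert()) yield
	 * VM_FAULT_SIGBUS; an already-populated pmd is left untouched.
	 */
	return vmf_insert_pfn_pmd(vma, haddr, pmd, pfn,
				  flags & FAULT_FLAG_WRITE);
}
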
index 51ae41d0fbc0d8ba3556e2f272c90442d90b2ab2..999fb0aef8f16f9a126579e54fca79ad8e4f6487 100644 (file)
@@ -64,7 +64,7 @@ DEFINE_SPINLOCK(hugetlb_lock);
  * prevent spurious OOMs when the hugepage pool is fully utilized.
  */
 static int num_fault_mutexes;
-static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp;
+struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
 
 /* Forward declaration */
 static int hugetlb_acct_memory(struct hstate *h, long delta);
@@ -240,11 +240,14 @@ struct file_region {
 
 /*
  * Add the huge page range represented by [f, t) to the reserve
- * map.  Existing regions will be expanded to accommodate the
- * specified range.  We know only existing regions need to be
- * expanded, because region_add is only called after region_chg
- * with the same range.  If a new file_region structure must
- * be allocated, it is done in region_chg.
+ * map.  In the normal case, existing regions will be expanded
+ * to accommodate the specified range.  Sufficient regions should
+ * exist for expansion due to the previous call to region_chg
+ * with the same range.  However, it is possible that region_del
+ * could have been called after region_chg and modified the map
+ * in such a way that no region exists to be expanded.  In this
+ * case, pull a region descriptor from the cache associated with
+ * the map and use that for the new range.
  *
  * Return the number of new huge pages added to the map.  This
  * number is greater than or equal to zero.
@@ -261,6 +264,28 @@ static long region_add(struct resv_map *resv, long f, long t)
                if (f <= rg->to)
                        break;
 
+       /*
+        * If no region exists which can be expanded to include the
+        * specified range, the list must have been modified by an
+        * interleaving call to region_del().  Pull a region descriptor
+        * from the cache and use it for this range.
+        */
+       if (&rg->link == head || t < rg->from) {
+               VM_BUG_ON(resv->region_cache_count <= 0);
+
+               resv->region_cache_count--;
+               nrg = list_first_entry(&resv->region_cache, struct file_region,
+                                       link);
+               list_del(&nrg->link);
+
+               nrg->from = f;
+               nrg->to = t;
+               list_add(&nrg->link, rg->link.prev);
+
+               add += t - f;
+               goto out_locked;
+       }
+
        /* Round our left edge to the current segment if it encloses us. */
        if (f > rg->from)
                f = rg->from;
@@ -294,6 +319,8 @@ static long region_add(struct resv_map *resv, long f, long t)
        add += t - nrg->to;             /* Added to end of region */
        nrg->to = t;
 
+out_locked:
+       resv->adds_in_progress--;
        spin_unlock(&resv->lock);
        VM_BUG_ON(add < 0);
        return add;
@@ -312,11 +339,14 @@ static long region_add(struct resv_map *resv, long f, long t)
  * so that the subsequent region_add call will have all the
  * regions it needs and will not fail.
  *
- * Returns the number of huge pages that need to be added
- * to the existing reservation map for the range [f, t).
- * This number is greater or equal to zero.  -ENOMEM is
- * returned if a new file_region structure is needed and can
- * not be allocated.
+ * Upon entry, region_chg will also examine the cache of region descriptors
+ * associated with the map.  If there are not enough descriptors cached, one
+ * will be allocated for the in progress add operation.
+ *
+ * Returns the number of huge pages that need to be added to the existing
+ * reservation map for the range [f, t).  This number is greater or equal to
+ * zero.  -ENOMEM is returned if a new file_region structure or cache entry
+ * is needed and can not be allocated.
  */
 static long region_chg(struct resv_map *resv, long f, long t)
 {
@@ -326,6 +356,31 @@ static long region_chg(struct resv_map *resv, long f, long t)
 
 retry:
        spin_lock(&resv->lock);
+retry_locked:
+       resv->adds_in_progress++;
+
+       /*
+        * Check for sufficient descriptors in the cache to accommodate
+        * the number of in progress add operations.
+        */
+       if (resv->adds_in_progress > resv->region_cache_count) {
+               struct file_region *trg;
+
+               VM_BUG_ON(resv->adds_in_progress - resv->region_cache_count > 1);
+               /* Must drop lock to allocate a new descriptor. */
+               resv->adds_in_progress--;
+               spin_unlock(&resv->lock);
+
+               trg = kmalloc(sizeof(*trg), GFP_KERNEL);
+               if (!trg)
+                       return -ENOMEM;
+
+               spin_lock(&resv->lock);
+               list_add(&trg->link, &resv->region_cache);
+               resv->region_cache_count++;
+               goto retry_locked;
+       }
+
        /* Locate the region we are before or in. */
        list_for_each_entry(rg, head, link)
                if (f <= rg->to)
@@ -336,6 +391,7 @@ retry:
         * size such that we can guarantee to record the reservation. */
        if (&rg->link == head || t < rg->from) {
                if (!nrg) {
+                       resv->adds_in_progress--;
                        spin_unlock(&resv->lock);
                        nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
                        if (!nrg)
@@ -385,43 +441,131 @@ out_nrg:
 }
 
 /*
- * Truncate the reserve map at index 'end'.  Modify/truncate any
- * region which contains end.  Delete any regions past end.
- * Return the number of huge pages removed from the map.
+ * Abort the in progress add operation.  The adds_in_progress field
+ * of the resv_map keeps track of the operations in progress between
+ * calls to region_chg and region_add.  Operations are sometimes
+ * aborted after the call to region_chg.  In such cases, region_abort
+ * is called to decrement the adds_in_progress counter.
+ *
+ * NOTE: The range arguments [f, t) are not needed or used in this
+ * routine.  They are kept to make reading the calling code easier as
+ * arguments will match the associated region_chg call.
  */
-static long region_truncate(struct resv_map *resv, long end)
+static void region_abort(struct resv_map *resv, long f, long t)
+{
+       spin_lock(&resv->lock);
+       VM_BUG_ON(!resv->region_cache_count);
+       resv->adds_in_progress--;
+       spin_unlock(&resv->lock);
+}
+
+/*
+ * Delete the specified range [f, t) from the reserve map.  If the
+ * t parameter is LONG_MAX, this indicates that ALL regions after f
+ * should be deleted.  Locate the regions which intersect [f, t)
+ * and either trim, delete or split the existing regions.
+ *
+ * Returns the number of huge pages deleted from the reserve map.
+ * In the normal case, the return value is zero or more.  In the
+ * case where a region must be split, a new region descriptor must
+ * be allocated.  If the allocation fails, -ENOMEM will be returned.
+ * NOTE: If the parameter t == LONG_MAX, then we will never split
+ * a region and possibly return -ENOMEM.  Callers specifying
+ * t == LONG_MAX do not need to check for -ENOMEM error.
+ */
+static long region_del(struct resv_map *resv, long f, long t)
 {
        struct list_head *head = &resv->regions;
        struct file_region *rg, *trg;
-       long chg = 0;
+       struct file_region *nrg = NULL;
+       long del = 0;
 
+retry:
        spin_lock(&resv->lock);
-       /* Locate the region we are either in or before. */
-       list_for_each_entry(rg, head, link)
-               if (end <= rg->to)
+       list_for_each_entry_safe(rg, trg, head, link) {
+               if (rg->to <= f)
+                       continue;
+               if (rg->from >= t)
                        break;
-       if (&rg->link == head)
-               goto out;
 
-       /* If we are in the middle of a region then adjust it. */
-       if (end > rg->from) {
-               chg = rg->to - end;
-               rg->to = end;
-               rg = list_entry(rg->link.next, typeof(*rg), link);
-       }
+               if (f > rg->from && t < rg->to) { /* Must split region */
+                       /*
+                        * Check for an entry in the cache before dropping
+                        * lock and attempting allocation.
+                        */
+                       if (!nrg &&
+                           resv->region_cache_count > resv->adds_in_progress) {
+                               nrg = list_first_entry(&resv->region_cache,
+                                                       struct file_region,
+                                                       link);
+                               list_del(&nrg->link);
+                               resv->region_cache_count--;
+                       }
 
-       /* Drop any remaining regions. */
-       list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
-               if (&rg->link == head)
+                       if (!nrg) {
+                               spin_unlock(&resv->lock);
+                               nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
+                               if (!nrg)
+                                       return -ENOMEM;
+                               goto retry;
+                       }
+
+                       del += t - f;
+
+                       /* New entry for end of split region */
+                       nrg->from = t;
+                       nrg->to = rg->to;
+                       INIT_LIST_HEAD(&nrg->link);
+
+                       /* Original entry is trimmed */
+                       rg->to = f;
+
+                       list_add(&nrg->link, &rg->link);
+                       nrg = NULL;
                        break;
-               chg += rg->to - rg->from;
-               list_del(&rg->link);
-               kfree(rg);
+               }
+
+               if (f <= rg->from && t >= rg->to) { /* Remove entire region */
+                       del += rg->to - rg->from;
+                       list_del(&rg->link);
+                       kfree(rg);
+                       continue;
+               }
+
+               if (f <= rg->from) {    /* Trim beginning of region */
+                       del += t - rg->from;
+                       rg->from = t;
+               } else {                /* Trim end of region */
+                       del += rg->to - f;
+                       rg->to = f;
+               }
        }
 
-out:
        spin_unlock(&resv->lock);
-       return chg;
+       kfree(nrg);
+       return del;
+}
+
+/*
+ * A rare out of memory error was encountered which prevented removal of
+ * the reserve map region for a page.  The huge page itself was freed
+ * and removed from the page cache.  This routine will adjust the subpool
+ * usage count, and the global reserve count if needed.  By incrementing
+ * these counts, the reserve map entry which could not be deleted will
+ * appear as a "reserved" entry instead of simply dangling with incorrect
+ * counts.
+ */
+void hugetlb_fix_reserve_counts(struct inode *inode, bool restore_reserve)
+{
+       struct hugepage_subpool *spool = subpool_inode(inode);
+       long rsv_adjust;
+
+       rsv_adjust = hugepage_subpool_get_pages(spool, 1);
+       if (restore_reserve && rsv_adjust) {
+               struct hstate *h = hstate_inode(inode);
+
+               hugetlb_acct_memory(h, 1);
+       }
 }
 
 /*
@@ -544,22 +688,44 @@ static void set_vma_private_data(struct vm_area_struct *vma,
 struct resv_map *resv_map_alloc(void)
 {
        struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
-       if (!resv_map)
+       struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL);
+
+       if (!resv_map || !rg) {
+               kfree(resv_map);
+               kfree(rg);
                return NULL;
+       }
 
        kref_init(&resv_map->refs);
        spin_lock_init(&resv_map->lock);
        INIT_LIST_HEAD(&resv_map->regions);
 
+       resv_map->adds_in_progress = 0;
+
+       INIT_LIST_HEAD(&resv_map->region_cache);
+       list_add(&rg->link, &resv_map->region_cache);
+       resv_map->region_cache_count = 1;
+
        return resv_map;
 }
 
 void resv_map_release(struct kref *ref)
 {
        struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
+       struct list_head *head = &resv_map->region_cache;
+       struct file_region *rg, *trg;
 
        /* Clear out any active regions before we release the map. */
-       region_truncate(resv_map, 0);
+       region_del(resv_map, 0, LONG_MAX);
+
+       /* ... and any entries left in the cache */
+       list_for_each_entry_safe(rg, trg, head, link) {
+               list_del(&rg->link);
+               kfree(rg);
+       }
+
+       VM_BUG_ON(resv_map->adds_in_progress);
+
        kfree(resv_map);
 }
 
@@ -635,8 +801,19 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
        }
 
        /* Shared mappings always use reserves */
-       if (vma->vm_flags & VM_MAYSHARE)
-               return true;
+       if (vma->vm_flags & VM_MAYSHARE) {
+               /*
+                * We know VM_NORESERVE is not set.  Therefore, there SHOULD
+                * be a region map for all pages.  The only situation where
+                * there is no region map is if a hole was punched via
+                * fallocate.  In this case, there really are no reserves to
+                * use.  This situation is indicated if chg != 0.
+                */
+               if (chg)
+                       return false;
+               else
+                       return true;
+       }
 
        /*
         * Only the process that called mmap() has reserves for
@@ -1154,7 +1331,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
 {
        struct page *page;
 
-       page = alloc_pages_exact_node(nid,
+       page = __alloc_pages_node(nid,
                htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
                                                __GFP_REPEAT|__GFP_NOWARN,
                huge_page_order(h));
@@ -1306,7 +1483,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
                                   __GFP_REPEAT|__GFP_NOWARN,
                                   huge_page_order(h));
        else
-               page = alloc_pages_exact_node(nid,
+               page = __alloc_pages_node(nid,
                        htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
                        __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
 
@@ -1473,16 +1650,19 @@ static void return_unused_surplus_pages(struct hstate *h,
        }
 }
 
+
 /*
- * vma_needs_reservation and vma_commit_reservation are used by the huge
- * page allocation routines to manage reservations.
+ * vma_needs_reservation, vma_commit_reservation and vma_end_reservation
+ * are used by the huge page allocation routines to manage reservations.
  *
  * vma_needs_reservation is called to determine if the huge page at addr
  * within the vma has an associated reservation.  If a reservation is
  * needed, the value 1 is returned.  The caller is then responsible for
  * managing the global reservation and subpool usage counts.  After
  * the huge page has been allocated, vma_commit_reservation is called
- * to add the page to the reservation map.
+ * to add the page to the reservation map.  If the page allocation fails,
+ * the reservation must be ended instead of committed.  vma_end_reservation
+ * is called in such cases.
  *
  * In the normal case, vma_commit_reservation returns the same value
  * as the preceding vma_needs_reservation call.  The only time this
@@ -1490,9 +1670,14 @@ static void return_unused_surplus_pages(struct hstate *h,
  * is the responsibility of the caller to notice the difference and
  * take appropriate action.
  */
+enum vma_resv_mode {
+       VMA_NEEDS_RESV,
+       VMA_COMMIT_RESV,
+       VMA_END_RESV,
+};
 static long __vma_reservation_common(struct hstate *h,
                                struct vm_area_struct *vma, unsigned long addr,
-                               bool commit)
+                               enum vma_resv_mode mode)
 {
        struct resv_map *resv;
        pgoff_t idx;
@@ -1503,10 +1688,20 @@ static long __vma_reservation_common(struct hstate *h,
                return 1;
 
        idx = vma_hugecache_offset(h, vma, addr);
-       if (commit)
-               ret = region_add(resv, idx, idx + 1);
-       else
+       switch (mode) {
+       case VMA_NEEDS_RESV:
                ret = region_chg(resv, idx, idx + 1);
+               break;
+       case VMA_COMMIT_RESV:
+               ret = region_add(resv, idx, idx + 1);
+               break;
+       case VMA_END_RESV:
+               region_abort(resv, idx, idx + 1);
+               ret = 0;
+               break;
+       default:
+               BUG();
+       }
 
        if (vma->vm_flags & VM_MAYSHARE)
                return ret;
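
A small sketch of the needs/commit/end pattern described in the comment above, with the hugetlb details stripped out: one common helper dispatches on a mode enum, so the three thin wrappers cannot drift apart. The names and the array-based "reserve map" are placeholders.

    #include <stdio.h>

    enum resv_mode { NEEDS_RESV, COMMIT_RESV, END_RESV };

    static long reservation_common(long *reserved, long idx, enum resv_mode mode)
    {
            switch (mode) {
            case NEEDS_RESV:
                    return reserved[idx] ? 0 : 1;   /* 1: caller must reserve */
            case COMMIT_RESV:
                    reserved[idx] = 1;              /* record the reservation */
                    return 0;
            case END_RESV:
                    /* abort: nothing was committed, just drop the intent */
                    return 0;
            }
            return -1;
    }

    int main(void)
    {
            long reserved[4] = { 0 };

            if (reservation_common(reserved, 2, NEEDS_RESV) == 1) {
                    int alloc_failed = 0;   /* pretend the page allocation worked */

                    if (alloc_failed)
                            reservation_common(reserved, 2, END_RESV);
                    else
                            reservation_common(reserved, 2, COMMIT_RESV);
            }
            printf("slot 2 reserved: %ld\n", reserved[2]);
            return 0;
    }
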
@@ -1517,47 +1712,79 @@ static long __vma_reservation_common(struct hstate *h,
 static long vma_needs_reservation(struct hstate *h,
                        struct vm_area_struct *vma, unsigned long addr)
 {
-       return __vma_reservation_common(h, vma, addr, false);
+       return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV);
 }
 
 static long vma_commit_reservation(struct hstate *h,
                        struct vm_area_struct *vma, unsigned long addr)
 {
-       return __vma_reservation_common(h, vma, addr, true);
+       return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV);
+}
+
+static void vma_end_reservation(struct hstate *h,
+                       struct vm_area_struct *vma, unsigned long addr)
+{
+       (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV);
 }
 
-static struct page *alloc_huge_page(struct vm_area_struct *vma,
+struct page *alloc_huge_page(struct vm_area_struct *vma,
                                    unsigned long addr, int avoid_reserve)
 {
        struct hugepage_subpool *spool = subpool_vma(vma);
        struct hstate *h = hstate_vma(vma);
        struct page *page;
-       long chg, commit;
+       long map_chg, map_commit;
+       long gbl_chg;
        int ret, idx;
        struct hugetlb_cgroup *h_cg;
 
        idx = hstate_index(h);
        /*
-        * Processes that did not create the mapping will have no
-        * reserves and will not have accounted against subpool
-        * limit. Check that the subpool limit can be made before
-        * satisfying the allocation MAP_NORESERVE mappings may also
-        * need pages and subpool limit allocated allocated if no reserve
-        * mapping overlaps.
+        * Examine the region/reserve map to determine if the process
+        * has a reservation for the page to be allocated.  A return
+        * code of zero indicates a reservation exists (no change).
         */
-       chg = vma_needs_reservation(h, vma, addr);
-       if (chg < 0)
+       map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
+       if (map_chg < 0)
                return ERR_PTR(-ENOMEM);
-       if (chg || avoid_reserve)
-               if (hugepage_subpool_get_pages(spool, 1) < 0)
+
+       /*
+        * Processes that did not create the mapping will have no
+        * reserves as indicated by the region/reserve map. Check
+        * that the allocation will not exceed the subpool limit.
+        * Allocations for MAP_NORESERVE mappings also need to be
+        * checked against any subpool limit.
+        */
+       if (map_chg || avoid_reserve) {
+               gbl_chg = hugepage_subpool_get_pages(spool, 1);
+               if (gbl_chg < 0) {
+                       vma_end_reservation(h, vma, addr);
                        return ERR_PTR(-ENOSPC);
+               }
+
+               /*
+                * Even though there was no reservation in the region/reserve
+                * map, there could be reservations associated with the
+                * subpool that can be used.  This would be indicated if the
+                * return value of hugepage_subpool_get_pages() is zero.
+                * However, if avoid_reserve is specified we still avoid even
+                * the subpool reservations.
+                */
+               if (avoid_reserve)
+                       gbl_chg = 1;
+       }
 
        ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
        if (ret)
                goto out_subpool_put;
 
        spin_lock(&hugetlb_lock);
-       page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg);
+       /*
+        * gbl_chg is passed to indicate whether or not a page must be taken
+        * from the global free pool (global change).  gbl_chg == 0 indicates
+        * a reservation exists for the allocation.
+        */
+       page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
        if (!page) {
                spin_unlock(&hugetlb_lock);
                page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
@@ -1573,8 +1800,8 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 
        set_page_private(page, (unsigned long)spool);
 
-       commit = vma_commit_reservation(h, vma, addr);
-       if (unlikely(chg > commit)) {
+       map_commit = vma_commit_reservation(h, vma, addr);
+       if (unlikely(map_chg > map_commit)) {
                /*
                 * The page was added to the reservation map between
                 * vma_needs_reservation and vma_commit_reservation.
@@ -1594,8 +1821,9 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 out_uncharge_cgroup:
        hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
 out_subpool_put:
-       if (chg || avoid_reserve)
+       if (map_chg || avoid_reserve)
                hugepage_subpool_put_pages(spool, 1);
+       vma_end_reservation(h, vma, addr);
        return ERR_PTR(-ENOSPC);
 }
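
A condensed sketch, with hypothetical helpers, of the two-level accounting in alloc_huge_page(): map_chg says whether the reserve map already covers the page, gbl_chg says whether a page must come out of the global pool, and every failure path undoes exactly what was taken, ending the map reservation last.

    #include <errno.h>
    #include <stdio.h>

    /* All helpers below are stand-ins, not the kernel interfaces. */
    static long map_needs_reservation(void) { return 1; }  /* no map entry yet */
    static long subpool_get_page(void)      { return 0; }  /* subpool has room */
    static void subpool_put_page(void)      { }
    static void end_map_reservation(void)   { }

    static int dequeue_page(long gbl_chg)
    {
            /* gbl_chg == 0 would mean an existing reservation covers the page */
            return gbl_chg >= 0;                    /* pretend it worked */
    }

    static int alloc_one_page(int avoid_reserve)
    {
            long map_chg, gbl_chg;

            map_chg = gbl_chg = map_needs_reservation();
            if (map_chg < 0)
                    return -ENOMEM;

            if (map_chg || avoid_reserve) {
                    gbl_chg = subpool_get_page();
                    if (gbl_chg < 0) {
                            end_map_reservation();
                            return -ENOSPC;
                    }
                    if (avoid_reserve)
                            gbl_chg = 1;    /* skip subpool reserves as well */
            }

            if (!dequeue_page(gbl_chg)) {
                    if (map_chg || avoid_reserve)
                            subpool_put_page();
                    end_map_reservation();  /* undo the region_chg-style intent */
                    return -ENOSPC;
            }
            return 0;                       /* a commit step would follow here */
    }

    int main(void)
    {
            printf("alloc_one_page() -> %d\n", alloc_one_page(0));
            return 0;
    }
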
 
@@ -2311,7 +2539,7 @@ static void __exit hugetlb_exit(void)
        }
 
        kobject_put(hugepages_kobj);
-       kfree(htlb_fault_mutex_table);
+       kfree(hugetlb_fault_mutex_table);
 }
 module_exit(hugetlb_exit);
 
@@ -2344,12 +2572,12 @@ static int __init hugetlb_init(void)
 #else
        num_fault_mutexes = 1;
 #endif
-       htlb_fault_mutex_table =
+       hugetlb_fault_mutex_table =
                kmalloc(sizeof(struct mutex) * num_fault_mutexes, GFP_KERNEL);
-       BUG_ON(!htlb_fault_mutex_table);
+       BUG_ON(!hugetlb_fault_mutex_table);
 
        for (i = 0; i < num_fault_mutexes; i++)
-               mutex_init(&htlb_fault_mutex_table[i]);
+               mutex_init(&hugetlb_fault_mutex_table[i]);
        return 0;
 }
 module_init(hugetlb_init);
@@ -3147,6 +3375,23 @@ static bool hugetlbfs_pagecache_present(struct hstate *h,
        return page != NULL;
 }
 
+int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
+                          pgoff_t idx)
+{
+       struct inode *inode = mapping->host;
+       struct hstate *h = hstate_inode(inode);
+       int err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+
+       if (err)
+               return err;
+       ClearPagePrivate(page);
+
+       spin_lock(&inode->i_lock);
+       inode->i_blocks += blocks_per_huge_page(h);
+       spin_unlock(&inode->i_lock);
+       return 0;
+}
+
 static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
                           struct address_space *mapping, pgoff_t idx,
                           unsigned long address, pte_t *ptep, unsigned int flags)
@@ -3194,21 +3439,13 @@ retry:
                set_page_huge_active(page);
 
                if (vma->vm_flags & VM_MAYSHARE) {
-                       int err;
-                       struct inode *inode = mapping->host;
-
-                       err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+                       int err = huge_add_to_page_cache(page, mapping, idx);
                        if (err) {
                                put_page(page);
                                if (err == -EEXIST)
                                        goto retry;
                                goto out;
                        }
-                       ClearPagePrivate(page);
-
-                       spin_lock(&inode->i_lock);
-                       inode->i_blocks += blocks_per_huge_page(h);
-                       spin_unlock(&inode->i_lock);
                } else {
                        lock_page(page);
                        if (unlikely(anon_vma_prepare(vma))) {
@@ -3236,11 +3473,14 @@ retry:
         * any allocations necessary to record that reservation occur outside
         * the spinlock.
         */
-       if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
+       if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
                if (vma_needs_reservation(h, vma, address) < 0) {
                        ret = VM_FAULT_OOM;
                        goto backout_unlocked;
                }
+               /* Just decrements count, does not deallocate */
+               vma_end_reservation(h, vma, address);
+       }
 
        ptl = huge_pte_lockptr(h, mm, ptep);
        spin_lock(ptl);
@@ -3280,7 +3520,7 @@ backout_unlocked:
 }
 
 #ifdef CONFIG_SMP
-static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
                            struct vm_area_struct *vma,
                            struct address_space *mapping,
                            pgoff_t idx, unsigned long address)
@@ -3305,7 +3545,7 @@ static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
 * For uniprocessor systems we always use a single mutex, so just
  * return 0 and avoid the hashing overhead.
  */
-static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
                            struct vm_area_struct *vma,
                            struct address_space *mapping,
                            pgoff_t idx, unsigned long address)
@@ -3353,8 +3593,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         * get spurious allocation failures if two CPUs race to instantiate
         * the same page in the page cache.
         */
-       hash = fault_mutex_hash(h, mm, vma, mapping, idx, address);
-       mutex_lock(&htlb_fault_mutex_table[hash]);
+       hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, address);
+       mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
        entry = huge_ptep_get(ptep);
        if (huge_pte_none(entry)) {
@@ -3387,6 +3627,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                        ret = VM_FAULT_OOM;
                        goto out_mutex;
                }
+               /* Just decrements count, does not deallocate */
+               vma_end_reservation(h, vma, address);
 
                if (!(vma->vm_flags & VM_MAYSHARE))
                        pagecache_page = hugetlbfs_pagecache_page(h,
@@ -3437,7 +3679,7 @@ out_ptl:
                put_page(pagecache_page);
        }
 out_mutex:
-       mutex_unlock(&htlb_fault_mutex_table[hash]);
+       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
        /*
         * Generally it's safe to hold refcount during waiting page lock. But
         * here we just wait to defer the next page fault to avoid busy loop and
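
A userspace illustration (using pthreads, not the kernel's mutex API) of the fault-mutex table pattern used here: faults on the same (mapping, index) pair hash to the same mutex, so racing faults on one page serialize without a single global lock. The hash function and table size below are placeholders.

    #include <pthread.h>
    #include <stdio.h>

    #define NUM_FAULT_MUTEXES 8     /* table size is a placeholder */

    static pthread_mutex_t fault_mutex_table[NUM_FAULT_MUTEXES];

    static unsigned int fault_mutex_hash(const void *mapping, unsigned long idx)
    {
            unsigned long key = (unsigned long)mapping ^ (idx * 2654435761UL);

            return (unsigned int)(key % NUM_FAULT_MUTEXES);
    }

    static void handle_fault(const void *mapping, unsigned long idx)
    {
            unsigned int hash = fault_mutex_hash(mapping, idx);

            pthread_mutex_lock(&fault_mutex_table[hash]);
            /* ... look up or instantiate the page exactly once ... */
            pthread_mutex_unlock(&fault_mutex_table[hash]);
    }

    int main(void)
    {
            int i, dummy_mapping;

            for (i = 0; i < NUM_FAULT_MUTEXES; i++)
                    pthread_mutex_init(&fault_mutex_table[i], NULL);

            handle_fault(&dummy_mapping, 42);
            printf("index 42 serialized on mutex %u\n",
                   fault_mutex_hash(&dummy_mapping, 42));
            return 0;
    }
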
@@ -3726,12 +3968,15 @@ int hugetlb_reserve_pages(struct inode *inode,
        }
        return 0;
 out_err:
+       if (!vma || vma->vm_flags & VM_MAYSHARE)
+               region_abort(resv_map, from, to);
        if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
                kref_put(&resv_map->refs, resv_map_release);
        return ret;
 }
 
-void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
+long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
+                                                               long freed)
 {
        struct hstate *h = hstate_inode(inode);
        struct resv_map *resv_map = inode_resv_map(inode);
@@ -3739,8 +3984,17 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
        struct hugepage_subpool *spool = subpool_inode(inode);
        long gbl_reserve;
 
-       if (resv_map)
-               chg = region_truncate(resv_map, offset);
+       if (resv_map) {
+               chg = region_del(resv_map, start, end);
+               /*
+                * region_del() can fail in the rare case where a region
+                * must be split and another region descriptor can not be
+                * allocated.  If end == LONG_MAX, it will not fail.
+                */
+               if (chg < 0)
+                       return chg;
+       }
+
        spin_lock(&inode->i_lock);
        inode->i_blocks -= (blocks_per_huge_page(h) * freed);
        spin_unlock(&inode->i_lock);
@@ -3751,6 +4005,8 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
         */
        gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
        hugetlb_acct_memory(h, -gbl_reserve);
+
+       return 0;
 }
 
 #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
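
To illustrate why region_del() can now fail, here is a sketch using a single interval instead of the kernel's region list: punching a hole in the middle of a range needs a second descriptor, and if that allocation fails the error must reach the caller, which is why hugetlb_unreserve_pages() gained a return value. Names are illustrative only.

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct interval { long from, to; };

    /* Remove [del_from, del_to) from *iv; an interior hole splits it in two. */
    static long interval_del(struct interval *iv, struct interval **tail,
                             long del_from, long del_to)
    {
            *tail = NULL;

            /* clamp the hole to the interval */
            if (del_from < iv->from) del_from = iv->from;
            if (del_to > iv->to)     del_to = iv->to;
            if (del_from >= del_to)
                    return 0;                       /* nothing to remove */

            if (del_from > iv->from && del_to < iv->to) {
                    /* interior hole: the tail needs its own descriptor */
                    *tail = malloc(sizeof(**tail));
                    if (!*tail)
                            return -ENOMEM;         /* caller must handle this */
                    (*tail)->from = del_to;
                    (*tail)->to   = iv->to;
                    iv->to = del_from;
            } else if (del_from == iv->from) {
                    iv->from = del_to;              /* trim the front */
            } else {
                    iv->to = del_from;              /* trim the back */
            }
            return del_to - del_from;
    }

    int main(void)
    {
            struct interval iv = { 0, 100 }, *tail;
            long chg = interval_del(&iv, &tail, 40, 60);

            if (chg < 0)
                    return 1;       /* propagate, as the caller now must */
            printf("removed %ld pages, remaining [%ld,%ld)", chg, iv.from, iv.to);
            if (tail)
                    printf(" and [%ld,%ld)", tail->from, tail->to);
            printf("\n");
            free(tail);
            return 0;
    }
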
index bf73ac17dad424d9e46857334e5b0befb6805fff..9d26fd9fefe4a1f4ec78279455c3b27f4ddbb875 100644 (file)
@@ -45,12 +45,9 @@ static int hwpoison_inject(void *data, u64 val)
        /*
         * do a racy check with elevated page count, to make sure PG_hwpoison
         * will only be set for the targeted owner (or on a free page).
-        * We temporarily take page lock for try_get_mem_cgroup_from_page().
         * memory_failure() will redo the check reliably inside page lock.
         */
-       lock_page(hpage);
        err = hwpoison_filter(hpage);
-       unlock_page(hpage);
        if (err)
                goto put_out;
 
@@ -58,7 +55,7 @@ inject:
        pr_info("Injecting memory failure at pfn %#lx\n", pfn);
        return memory_failure(pfn, 18, MF_COUNT_INCREASED);
 put_out:
-       put_page(p);
+       put_hwpoison_page(p);
        return 0;
 }
 
@@ -126,7 +123,7 @@ static int pfn_inject_init(void)
        if (!dentry)
                goto fail;
 
-#ifdef CONFIG_MEMCG_SWAP
+#ifdef CONFIG_MEMCG
        dentry = debugfs_create_u64("corrupt-filter-memcg", 0600,
                                    hwpoison_dir, &hwpoison_filter_memcg);
        if (!dentry)
index 1195dd2d6a2b94214e9ebb11e8c2037cd0e0cea6..bc0fa9a69e463771ca1de684c686e96644bc30dd 100644 (file)
@@ -182,6 +182,7 @@ struct compact_control {
        unsigned long nr_migratepages;  /* Number of pages to migrate */
        unsigned long free_pfn;         /* isolate_freepages search base */
        unsigned long migrate_pfn;      /* isolate_migratepages search base */
+       unsigned long last_migrated_pfn;/* Not yet flushed page being freed */
        enum migrate_mode mode;         /* Async or sync migration mode */
        bool ignore_skip_hint;          /* Scan blocks even if marked skip */
        int order;                      /* order a direct compactor needs */
index 7b28e9cdf1c7686428fe49802fced44088043555..8da211411b57f1b0db116d8d67401bae30628ec8 100644 (file)
@@ -135,12 +135,11 @@ static __always_inline bool memory_is_poisoned_16(unsigned long addr)
 
        if (unlikely(*shadow_addr)) {
                u16 shadow_first_bytes = *(u16 *)shadow_addr;
-               s8 last_byte = (addr + 15) & KASAN_SHADOW_MASK;
 
                if (unlikely(shadow_first_bytes))
                        return true;
 
-               if (likely(!last_byte))
+               if (likely(IS_ALIGNED(addr, 8)))
                        return false;
 
                return memory_is_poisoned_1(addr + 15);
index cf79f110157c9122afb7e6a72ded4b090ea814af..77191eccdc6f6c372e84e2f49750f7f95e56c324 100644 (file)
@@ -302,23 +302,14 @@ static void hex_dump_object(struct seq_file *seq,
                            struct kmemleak_object *object)
 {
        const u8 *ptr = (const u8 *)object->pointer;
-       int i, len, remaining;
-       unsigned char linebuf[HEX_ROW_SIZE * 5];
+       size_t len;
 
        /* limit the number of lines to HEX_MAX_LINES */
-       remaining = len =
-               min(object->size, (size_t)(HEX_MAX_LINES * HEX_ROW_SIZE));
-
-       seq_printf(seq, "  hex dump (first %d bytes):\n", len);
-       for (i = 0; i < len; i += HEX_ROW_SIZE) {
-               int linelen = min(remaining, HEX_ROW_SIZE);
-
-               remaining -= HEX_ROW_SIZE;
-               hex_dump_to_buffer(ptr + i, linelen, HEX_ROW_SIZE,
-                                  HEX_GROUP_SIZE, linebuf, sizeof(linebuf),
-                                  HEX_ASCII);
-               seq_printf(seq, "    %s\n", linebuf);
-       }
+       len = min_t(size_t, object->size, HEX_MAX_LINES * HEX_ROW_SIZE);
+
+       seq_printf(seq, "  hex dump (first %zu bytes):\n", len);
+       seq_hex_dump(seq, "    ", DUMP_PREFIX_NONE, HEX_ROW_SIZE,
+                    HEX_GROUP_SIZE, ptr, len, HEX_ASCII);
 }
 
 /*
@@ -838,6 +829,7 @@ static void __init log_early(int op_type, const void *ptr, size_t size,
        }
 
        if (crt_early_log >= ARRAY_SIZE(early_log)) {
+               crt_early_log++;
                kmemleak_disable();
                return;
        }
@@ -1882,7 +1874,7 @@ void __init kmemleak_init(void)
        object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE);
        scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE);
 
-       if (crt_early_log >= ARRAY_SIZE(early_log))
+       if (crt_early_log > ARRAY_SIZE(early_log))
                pr_warning("Early log buffer exceeded (%d), please increase "
                           "DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n", crt_early_log);
 
index 909eca2c820e4af45f9c4d519eee744041f18a2b..e1da19fac1b3629f7aeff4815a64681f9ca9bd60 100644 (file)
@@ -99,8 +99,8 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item)
        struct list_lru_one *l;
 
        spin_lock(&nlru->lock);
-       l = list_lru_from_kmem(nlru, item);
        if (list_empty(item)) {
+               l = list_lru_from_kmem(nlru, item);
                list_add_tail(item, &l->list);
                l->nr_items++;
                spin_unlock(&nlru->lock);
@@ -118,8 +118,8 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item)
        struct list_lru_one *l;
 
        spin_lock(&nlru->lock);
-       l = list_lru_from_kmem(nlru, item);
        if (!list_empty(item)) {
+               l = list_lru_from_kmem(nlru, item);
                list_del_init(item);
                l->nr_items--;
                spin_unlock(&nlru->lock);
index ce3a4222c7e7ae4558b201704a0ee5d55518fcfc..c889fcbb530e98d8779ef75750e1fde08bf786cf 100644 (file)
@@ -301,7 +301,7 @@ static long madvise_remove(struct vm_area_struct *vma,
 
        *prev = NULL;   /* tell sys_madvise we drop mmap_sem */
 
-       if (vma->vm_flags & (VM_LOCKED | VM_HUGETLB))
+       if (vma->vm_flags & VM_LOCKED)
                return -EINVAL;
 
        f = vma->vm_file;
index 95ce68c6da8adc0b8d4c925027a5b04b824ab381..1c7b647e58971ee77e0892bb0edbad5b40ec50fd 100644 (file)
@@ -91,7 +91,7 @@ static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, p
        return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
 }
 
-static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
+bool __init_memblock memblock_overlaps_region(struct memblock_type *type,
                                        phys_addr_t base, phys_addr_t size)
 {
        unsigned long i;
@@ -103,7 +103,7 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
                        break;
        }
 
-       return (i < type->cnt) ? i : -1;
+       return i < type->cnt;
 }
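
For completeness, a tiny standalone example of the half-open overlap test this memblock change is built around; returning bool, as the patch does for memblock_overlaps_region(), makes the result of the search loop explicit.

    #include <stdbool.h>
    #include <stdio.h>

    /* Two half-open ranges intersect exactly when each starts before the other ends. */
    static bool ranges_overlap(unsigned long base1, unsigned long size1,
                               unsigned long base2, unsigned long size2)
    {
            return (base1 < base2 + size2) && (base2 < base1 + size1);
    }

    int main(void)
    {
            printf("%d\n", ranges_overlap(0x1000, 0x1000, 0x1800, 0x100)); /* 1 */
            printf("%d\n", ranges_overlap(0x1000, 0x1000, 0x3000, 0x100)); /* 0 */
            return 0;
    }
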
 
 /*
@@ -569,6 +569,7 @@ repeat:
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
                        WARN_ON(nid != memblock_get_region_node(rgn));
 #endif
+                       WARN_ON(flags != rgn->flags);
                        nr_new++;
                        if (insert)
                                memblock_insert_region(type, i++, base,
@@ -614,14 +615,14 @@ static int __init_memblock memblock_add_region(phys_addr_t base,
                                                int nid,
                                                unsigned long flags)
 {
-       struct memblock_type *_rgn = &memblock.memory;
+       struct memblock_type *type = &memblock.memory;
 
        memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n",
                     (unsigned long long)base,
                     (unsigned long long)base + size - 1,
                     flags, (void *)_RET_IP_);
 
-       return memblock_add_range(_rgn, base, size, nid, flags);
+       return memblock_add_range(type, base, size, nid, flags);
 }
 
 int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
@@ -761,7 +762,7 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
  *
  * This function isolates region [@base, @base + @size), and sets/clears flag
  *
- * Return 0 on succees, -errno on failure.
+ * Return 0 on success, -errno on failure.
  */
 static int __init_memblock memblock_setclr_flag(phys_addr_t base,
                                phys_addr_t size, int set, int flag)
@@ -788,7 +789,7 @@ static int __init_memblock memblock_setclr_flag(phys_addr_t base,
  * @base: the base phys addr of the region
  * @size: the size of the region
  *
- * Return 0 on succees, -errno on failure.
+ * Return 0 on success, -errno on failure.
  */
 int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
 {
@@ -800,7 +801,7 @@ int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
  * @base: the base phys addr of the region
  * @size: the size of the region
  *
- * Return 0 on succees, -errno on failure.
+ * Return 0 on success, -errno on failure.
  */
 int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
 {
@@ -812,7 +813,7 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
  * @base: the base phys addr of the region
  * @size: the size of the region
  *
- * Return 0 on succees, -errno on failure.
+ * Return 0 on success, -errno on failure.
  */
 int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size)
 {
@@ -834,10 +835,10 @@ void __init_memblock __next_reserved_mem_region(u64 *idx,
                                           phys_addr_t *out_start,
                                           phys_addr_t *out_end)
 {
-       struct memblock_type *rsv = &memblock.reserved;
+       struct memblock_type *type = &memblock.reserved;
 
-       if (*idx >= 0 && *idx < rsv->cnt) {
-               struct memblock_region *r = &rsv->regions[*idx];
+       if (*idx >= 0 && *idx < type->cnt) {
+               struct memblock_region *r = &type->regions[*idx];
                phys_addr_t base = r->base;
                phys_addr_t size = r->size;
 
@@ -975,7 +976,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags,
  * in type_b.
  *
  * @idx: pointer to u64 loop variable
- * @nid: nid: node selector, %NUMA_NO_NODE for all nodes
+ * @nid: node selector, %NUMA_NO_NODE for all nodes
  * @flags: pick from blocks based on memory attributes
  * @type_a: pointer to memblock_type from where the range is taken
  * @type_b: pointer to memblock_type which excludes memory from being taken
@@ -1565,12 +1566,12 @@ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size
  * Check if the region [@base, @base+@size) intersects a reserved memory block.
  *
  * RETURNS:
- * 0 if false, non-zero if true
+ * True if they intersect, false if not.
  */
-int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
+bool __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
 {
        memblock_cap_size(base, &size);
-       return memblock_overlaps_region(&memblock.reserved, base, size) >= 0;
+       return memblock_overlaps_region(&memblock.reserved, base, size);
 }
 
 void __init_memblock memblock_trim_memory(phys_addr_t align)
index 1af057575ce9e65c862dfc61f574c46d42b7cd5f..6ddaeba34e097a7553d33b8add26e26a27d3c81a 100644 (file)
@@ -111,56 +111,10 @@ static const char * const mem_cgroup_lru_names[] = {
        "unevictable",
 };
 
-/*
- * Per memcg event counter is incremented at every pagein/pageout. With THP,
- * it will be incremated by the number of pages. This counter is used for
- * for trigger some periodic events. This is straightforward and better
- * than using jiffies etc. to handle periodic memcg event.
- */
-enum mem_cgroup_events_target {
-       MEM_CGROUP_TARGET_THRESH,
-       MEM_CGROUP_TARGET_SOFTLIMIT,
-       MEM_CGROUP_TARGET_NUMAINFO,
-       MEM_CGROUP_NTARGETS,
-};
 #define THRESHOLDS_EVENTS_TARGET 128
 #define SOFTLIMIT_EVENTS_TARGET 1024
 #define NUMAINFO_EVENTS_TARGET 1024
 
-struct mem_cgroup_stat_cpu {
-       long count[MEM_CGROUP_STAT_NSTATS];
-       unsigned long events[MEMCG_NR_EVENTS];
-       unsigned long nr_page_events;
-       unsigned long targets[MEM_CGROUP_NTARGETS];
-};
-
-struct reclaim_iter {
-       struct mem_cgroup *position;
-       /* scan generation, increased every round-trip */
-       unsigned int generation;
-};
-
-/*
- * per-zone information in memory controller.
- */
-struct mem_cgroup_per_zone {
-       struct lruvec           lruvec;
-       unsigned long           lru_size[NR_LRU_LISTS];
-
-       struct reclaim_iter     iter[DEF_PRIORITY + 1];
-
-       struct rb_node          tree_node;      /* RB tree node */
-       unsigned long           usage_in_excess;/* Set to the value by which */
-                                               /* the soft limit is exceeded*/
-       bool                    on_tree;
-       struct mem_cgroup       *memcg;         /* Back pointer, we cannot */
-                                               /* use container_of        */
-};
-
-struct mem_cgroup_per_node {
-       struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
-};
-
 /*
  * Cgroups above their limits are maintained in a RB-Tree, independent of
  * their hierarchy representation
@@ -181,32 +135,6 @@ struct mem_cgroup_tree {
 
 static struct mem_cgroup_tree soft_limit_tree __read_mostly;
 
-struct mem_cgroup_threshold {
-       struct eventfd_ctx *eventfd;
-       unsigned long threshold;
-};
-
-/* For threshold */
-struct mem_cgroup_threshold_ary {
-       /* An array index points to threshold just below or equal to usage. */
-       int current_threshold;
-       /* Size of entries[] */
-       unsigned int size;
-       /* Array of thresholds */
-       struct mem_cgroup_threshold entries[0];
-};
-
-struct mem_cgroup_thresholds {
-       /* Primary thresholds array */
-       struct mem_cgroup_threshold_ary *primary;
-       /*
-        * Spare threshold array.
-        * This is needed to make mem_cgroup_unregister_event() "never fail".
-        * It must be able to store at least primary->size - 1 entries.
-        */
-       struct mem_cgroup_threshold_ary *spare;
-};
-
 /* for OOM */
 struct mem_cgroup_eventfd_list {
        struct list_head list;
@@ -256,113 +184,6 @@ struct mem_cgroup_event {
 static void mem_cgroup_threshold(struct mem_cgroup *memcg);
 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
 
-/*
- * The memory controller data structure. The memory controller controls both
- * page cache and RSS per cgroup. We would eventually like to provide
- * statistics based on the statistics developed by Rik Van Riel for clock-pro,
- * to help the administrator determine what knobs to tune.
- */
-struct mem_cgroup {
-       struct cgroup_subsys_state css;
-
-       /* Accounted resources */
-       struct page_counter memory;
-       struct page_counter memsw;
-       struct page_counter kmem;
-
-       /* Normal memory consumption range */
-       unsigned long low;
-       unsigned long high;
-
-       unsigned long soft_limit;
-
-       /* vmpressure notifications */
-       struct vmpressure vmpressure;
-
-       /* css_online() has been completed */
-       int initialized;
-
-       /*
-        * Should the accounting and control be hierarchical, per subtree?
-        */
-       bool use_hierarchy;
-
-       /* protected by memcg_oom_lock */
-       bool            oom_lock;
-       int             under_oom;
-
-       int     swappiness;
-       /* OOM-Killer disable */
-       int             oom_kill_disable;
-
-       /* protect arrays of thresholds */
-       struct mutex thresholds_lock;
-
-       /* thresholds for memory usage. RCU-protected */
-       struct mem_cgroup_thresholds thresholds;
-
-       /* thresholds for mem+swap usage. RCU-protected */
-       struct mem_cgroup_thresholds memsw_thresholds;
-
-       /* For oom notifier event fd */
-       struct list_head oom_notify;
-
-       /*
-        * Should we move charges of a task when a task is moved into this
-        * mem_cgroup ? And what type of charges should we move ?
-        */
-       unsigned long move_charge_at_immigrate;
-       /*
-        * set > 0 if pages under this cgroup are moving to other cgroup.
-        */
-       atomic_t                moving_account;
-       /* taken only while moving_account > 0 */
-       spinlock_t              move_lock;
-       struct task_struct      *move_lock_task;
-       unsigned long           move_lock_flags;
-       /*
-        * percpu counter.
-        */
-       struct mem_cgroup_stat_cpu __percpu *stat;
-       spinlock_t pcp_counter_lock;
-
-#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
-       struct cg_proto tcp_mem;
-#endif
-#if defined(CONFIG_MEMCG_KMEM)
-        /* Index in the kmem_cache->memcg_params.memcg_caches array */
-       int kmemcg_id;
-       bool kmem_acct_activated;
-       bool kmem_acct_active;
-#endif
-
-       int last_scanned_node;
-#if MAX_NUMNODES > 1
-       nodemask_t      scan_nodes;
-       atomic_t        numainfo_events;
-       atomic_t        numainfo_updating;
-#endif
-
-#ifdef CONFIG_CGROUP_WRITEBACK
-       struct list_head cgwb_list;
-       struct wb_domain cgwb_domain;
-#endif
-
-       /* List of events which userspace want to receive */
-       struct list_head event_list;
-       spinlock_t event_list_lock;
-
-       struct mem_cgroup_per_node *nodeinfo[0];
-       /* WARNING: nodeinfo must be the last member here */
-};
-
-#ifdef CONFIG_MEMCG_KMEM
-bool memcg_kmem_is_active(struct mem_cgroup *memcg)
-{
-       return memcg->kmem_acct_active;
-}
-#endif
-
 /* Stuffs for move charges at task migration. */
 /*
  * Types of charges to be moved.
@@ -423,11 +244,6 @@ enum res_type {
  */
 static DEFINE_MUTEX(memcg_create_mutex);
 
-struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
-{
-       return s ? container_of(s, struct mem_cgroup, css) : NULL;
-}
-
 /* Some nice accessors for the vmpressure. */
 struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
 {
@@ -499,8 +315,7 @@ void sock_update_memcg(struct sock *sk)
                rcu_read_lock();
                memcg = mem_cgroup_from_task(current);
                cg_proto = sk->sk_prot->proto_cgroup(memcg);
-               if (!mem_cgroup_is_root(memcg) &&
-                   memcg_proto_active(cg_proto) &&
+               if (cg_proto && test_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags) &&
                    css_tryget_online(&memcg->css)) {
                        sk->sk_cgrp = cg_proto;
                }
@@ -593,11 +408,6 @@ mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
        return &memcg->nodeinfo[nid]->zoneinfo[zid];
 }
 
-struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
-{
-       return &memcg->css;
-}
-
 /**
  * mem_cgroup_css_from_page - css of the memcg associated with a page
  * @page: page of interest
@@ -631,6 +441,34 @@ struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
        return &memcg->css;
 }
 
+/**
+ * page_cgroup_ino - return inode number of the memcg a page is charged to
+ * @page: the page
+ *
+ * Look up the closest online ancestor of the memory cgroup @page is charged to
+ * and return its inode number or 0 if @page is not charged to any cgroup. It
+ * is safe to call this function without holding a reference to @page.
+ *
+ * Note, this function is inherently racy, because there is nothing to prevent
+ * the cgroup inode from getting torn down and potentially reallocated a moment
+ * after page_cgroup_ino() returns, so it only should be used by callers that
+ * do not care (such as procfs interfaces).
+ */
+ino_t page_cgroup_ino(struct page *page)
+{
+       struct mem_cgroup *memcg;
+       unsigned long ino = 0;
+
+       rcu_read_lock();
+       memcg = READ_ONCE(page->mem_cgroup);
+       while (memcg && !(memcg->css.flags & CSS_ONLINE))
+               memcg = parent_mem_cgroup(memcg);
+       if (memcg)
+               ino = cgroup_ino(memcg->css.cgroup);
+       rcu_read_unlock();
+       return ino;
+}
+
 static struct mem_cgroup_per_zone *
 mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
 {
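
A simplified sketch of the ancestor walk page_cgroup_ino() documents above, without RCU or real cgroups: starting from the charged group, walk up until an online ancestor is found and report its identifier, or 0 if the page is not charged. Everything here is a stand-in.

    #include <stdio.h>

    struct group {
            unsigned long ino;
            int online;
            struct group *parent;
    };

    static unsigned long group_ino(const struct group *g)
    {
            while (g && !g->online)
                    g = g->parent;          /* skip offlined groups */
            return g ? g->ino : 0;
    }

    int main(void)
    {
            struct group root  = { .ino = 1,   .online = 1, .parent = NULL };
            struct group child = { .ino = 123, .online = 0, .parent = &root };

            printf("charged group resolves to ino %lu\n", group_ino(&child));
            return 0;
    }
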
@@ -876,14 +714,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
        __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
 }
 
-unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
-{
-       struct mem_cgroup_per_zone *mz;
-
-       mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
-       return mz->lru_size[lru];
-}
-
 static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
                                                  int nid,
                                                  unsigned int lru_mask)
@@ -986,6 +816,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 
        return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
 }
+EXPORT_SYMBOL(mem_cgroup_from_task);
 
 static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 {
@@ -1031,7 +862,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
                                   struct mem_cgroup *prev,
                                   struct mem_cgroup_reclaim_cookie *reclaim)
 {
-       struct reclaim_iter *uninitialized_var(iter);
+       struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
        struct cgroup_subsys_state *css = NULL;
        struct mem_cgroup *memcg = NULL;
        struct mem_cgroup *pos = NULL;
@@ -1173,30 +1004,6 @@ void mem_cgroup_iter_break(struct mem_cgroup *root,
             iter != NULL;                              \
             iter = mem_cgroup_iter(NULL, iter, NULL))
 
-void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
-{
-       struct mem_cgroup *memcg;
-
-       rcu_read_lock();
-       memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
-       if (unlikely(!memcg))
-               goto out;
-
-       switch (idx) {
-       case PGFAULT:
-               this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
-               break;
-       case PGMAJFAULT:
-               this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
-               break;
-       default:
-               BUG();
-       }
-out:
-       rcu_read_unlock();
-}
-EXPORT_SYMBOL(__mem_cgroup_count_vm_event);
-
 /**
  * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
  * @zone: zone of the wanted lruvec
@@ -1295,15 +1102,6 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
        VM_BUG_ON((long)(*lru_size) < 0);
 }
 
-bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, struct mem_cgroup *root)
-{
-       if (root == memcg)
-               return true;
-       if (!root->use_hierarchy)
-               return false;
-       return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup);
-}
-
 bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
 {
        struct mem_cgroup *task_memcg;
@@ -1330,39 +1128,6 @@ bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
        return ret;
 }
 
-int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
-{
-       unsigned long inactive_ratio;
-       unsigned long inactive;
-       unsigned long active;
-       unsigned long gb;
-
-       inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
-       active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
-
-       gb = (inactive + active) >> (30 - PAGE_SHIFT);
-       if (gb)
-               inactive_ratio = int_sqrt(10 * gb);
-       else
-               inactive_ratio = 1;
-
-       return inactive * inactive_ratio < active;
-}
-
-bool mem_cgroup_lruvec_online(struct lruvec *lruvec)
-{
-       struct mem_cgroup_per_zone *mz;
-       struct mem_cgroup *memcg;
-
-       if (mem_cgroup_disabled())
-               return true;
-
-       mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
-       memcg = mz->memcg;
-
-       return !!(memcg->css.flags & CSS_ONLINE);
-}
-
 #define mem_cgroup_from_counter(counter, member)       \
        container_of(counter, struct mem_cgroup, member)
 
@@ -1394,15 +1159,6 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
        return margin;
 }
 
-int mem_cgroup_swappiness(struct mem_cgroup *memcg)
-{
-       /* root ? */
-       if (mem_cgroup_disabled() || !memcg->css.parent)
-               return vm_swappiness;
-
-       return memcg->swappiness;
-}
-
 /*
  * A routine for checking "mem" is under move_account() or not.
  *
@@ -1545,6 +1301,12 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
 static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
                                     int order)
 {
+       struct oom_control oc = {
+               .zonelist = NULL,
+               .nodemask = NULL,
+               .gfp_mask = gfp_mask,
+               .order = order,
+       };
        struct mem_cgroup *iter;
        unsigned long chosen_points = 0;
        unsigned long totalpages;
@@ -1563,7 +1325,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
                goto unlock;
        }
 
-       check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg);
+       check_panic_on_oom(&oc, CONSTRAINT_MEMCG, memcg);
        totalpages = mem_cgroup_get_limit(memcg) ? : 1;
        for_each_mem_cgroup_tree(iter, memcg) {
                struct css_task_iter it;
@@ -1571,8 +1333,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 
                css_task_iter_start(&iter->css, &it);
                while ((task = css_task_iter_next(&it))) {
-                       switch (oom_scan_process_thread(task, totalpages, NULL,
-                                                       false)) {
+                       switch (oom_scan_process_thread(&oc, task, totalpages)) {
                        case OOM_SCAN_SELECT:
                                if (chosen)
                                        put_task_struct(chosen);
@@ -1610,8 +1371,8 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 
        if (chosen) {
                points = chosen_points * 1000 / totalpages;
-               oom_kill_process(chosen, gfp_mask, order, points, totalpages,
-                                memcg, NULL, "Memory cgroup out of memory");
+               oom_kill_process(&oc, chosen, points, totalpages, memcg,
+                                "Memory cgroup out of memory");
        }
 unlock:
        mutex_unlock(&oom_lock);
@@ -2062,23 +1823,6 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
 }
 EXPORT_SYMBOL(mem_cgroup_end_page_stat);
 
-/**
- * mem_cgroup_update_page_stat - update page state statistics
- * @memcg: memcg to account against
- * @idx: page state item to account
- * @val: number of pages (positive or negative)
- *
- * See mem_cgroup_begin_page_stat() for locking requirements.
- */
-void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
-                                enum mem_cgroup_stat_index idx, int val)
-{
-       VM_BUG_ON(!rcu_read_lock_held());
-
-       if (memcg)
-               this_cpu_add(memcg->stat->count[idx], val);
-}
-
 /*
  * size of first charge trial. "32" comes from vmscan.c's magic value.
  * TODO: maybe necessary to use big numbers in big irons.
@@ -2355,40 +2099,6 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
        css_put_many(&memcg->css, nr_pages);
 }
 
-/*
- * try_get_mem_cgroup_from_page - look up page's memcg association
- * @page: the page
- *
- * Look up, get a css reference, and return the memcg that owns @page.
- *
- * The page must be locked to prevent racing with swap-in and page
- * cache charges.  If coming from an unlocked page table, the caller
- * must ensure the page is on the LRU or this can race with charging.
- */
-struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
-{
-       struct mem_cgroup *memcg;
-       unsigned short id;
-       swp_entry_t ent;
-
-       VM_BUG_ON_PAGE(!PageLocked(page), page);
-
-       memcg = page->mem_cgroup;
-       if (memcg) {
-               if (!css_tryget_online(&memcg->css))
-                       memcg = NULL;
-       } else if (PageSwapCache(page)) {
-               ent.val = page_private(page);
-               id = lookup_swap_cgroup_id(ent);
-               rcu_read_lock();
-               memcg = mem_cgroup_from_id(id);
-               if (memcg && !css_tryget_online(&memcg->css))
-                       memcg = NULL;
-               rcu_read_unlock();
-       }
-       return memcg;
-}
-
 static void lock_page_lru(struct page *page, int *isolated)
 {
        struct zone *zone = page_zone(page);
@@ -2504,16 +2214,6 @@ void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages)
        css_put_many(&memcg->css, nr_pages);
 }
 
-/*
- * helper for acessing a memcg's index. It will be used as an index in the
- * child cache array in kmem_cache, and also to derive its name. This function
- * will return -1 when this is not a kmem-limited memcg.
- */
-int memcg_cache_id(struct mem_cgroup *memcg)
-{
-       return memcg ? memcg->kmemcg_id : -1;
-}
-
 static int memcg_alloc_cache_id(void)
 {
        int id, size;
@@ -5127,10 +4827,12 @@ static void mem_cgroup_clear_mc(void)
 static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
                                 struct cgroup_taskset *tset)
 {
-       struct task_struct *p = cgroup_taskset_first(tset);
-       int ret = 0;
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+       struct mem_cgroup *from;
+       struct task_struct *p;
+       struct mm_struct *mm;
        unsigned long move_flags;
+       int ret = 0;
 
        /*
         * We are now committed to this value whatever it is. Changes in this
@@ -5138,36 +4840,37 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
         * So we need to save it, and keep it going.
         */
        move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
-       if (move_flags) {
-               struct mm_struct *mm;
-               struct mem_cgroup *from = mem_cgroup_from_task(p);
+       if (!move_flags)
+               return 0;
 
-               VM_BUG_ON(from == memcg);
+       p = cgroup_taskset_first(tset);
+       from = mem_cgroup_from_task(p);
 
-               mm = get_task_mm(p);
-               if (!mm)
-                       return 0;
-               /* We move charges only when we move a owner of the mm */
-               if (mm->owner == p) {
-                       VM_BUG_ON(mc.from);
-                       VM_BUG_ON(mc.to);
-                       VM_BUG_ON(mc.precharge);
-                       VM_BUG_ON(mc.moved_charge);
-                       VM_BUG_ON(mc.moved_swap);
-
-                       spin_lock(&mc.lock);
-                       mc.from = from;
-                       mc.to = memcg;
-                       mc.flags = move_flags;
-                       spin_unlock(&mc.lock);
-                       /* We set mc.moving_task later */
-
-                       ret = mem_cgroup_precharge_mc(mm);
-                       if (ret)
-                               mem_cgroup_clear_mc();
-               }
-               mmput(mm);
+       VM_BUG_ON(from == memcg);
+
+       mm = get_task_mm(p);
+       if (!mm)
+               return 0;
+       /* We move charges only when we move an owner of the mm */
+       if (mm->owner == p) {
+               VM_BUG_ON(mc.from);
+               VM_BUG_ON(mc.to);
+               VM_BUG_ON(mc.precharge);
+               VM_BUG_ON(mc.moved_charge);
+               VM_BUG_ON(mc.moved_swap);
+
+               spin_lock(&mc.lock);
+               mc.from = from;
+               mc.to = memcg;
+               mc.flags = move_flags;
+               spin_unlock(&mc.lock);
+               /* We set mc.moving_task later */
+
+               ret = mem_cgroup_precharge_mc(mm);
+               if (ret)
+                       mem_cgroup_clear_mc();
        }
+       mmput(mm);
        return ret;
 }
 
@@ -5520,19 +5223,6 @@ struct cgroup_subsys memory_cgrp_subsys = {
        .early_init = 0,
 };
 
-/**
- * mem_cgroup_events - count memory events against a cgroup
- * @memcg: the memory cgroup
- * @idx: the event index
- * @nr: the number of events to account for
- */
-void mem_cgroup_events(struct mem_cgroup *memcg,
-                      enum mem_cgroup_events_index idx,
-                      unsigned int nr)
-{
-       this_cpu_add(memcg->stat->events[idx], nr);
-}
-
 /**
  * mem_cgroup_low - check if memory consumption is below the normal range
  * @root: the highest ancestor to consider
@@ -5605,8 +5295,20 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
                 * the page lock, which serializes swap cache removal, which
                 * in turn serializes uncharging.
                 */
+               VM_BUG_ON_PAGE(!PageLocked(page), page);
                if (page->mem_cgroup)
                        goto out;
+
+               if (do_swap_account) {
+                       swp_entry_t ent = { .val = page_private(page), };
+                       unsigned short id = lookup_swap_cgroup_id(ent);
+
+                       rcu_read_lock();
+                       memcg = mem_cgroup_from_id(id);
+                       if (memcg && !css_tryget_online(&memcg->css))
+                               memcg = NULL;
+                       rcu_read_unlock();
+               }
        }
 
        if (PageTransHuge(page)) {
@@ -5614,8 +5316,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
                VM_BUG_ON_PAGE(!PageTransHuge(page), page);
        }
 
-       if (do_swap_account && PageSwapCache(page))
-               memcg = try_get_mem_cgroup_from_page(page);
        if (!memcg)
                memcg = get_mem_cgroup_from_mm(mm);
 
index 1f4446a90cef07c67ee1082b83f0ca87ebfefea1..95882692e747c2a534488190287e5954fba35d39 100644 (file)
@@ -130,27 +130,15 @@ static int hwpoison_filter_flags(struct page *p)
  * can only guarantee that the page either belongs to the memcg tasks, or is
  * a freed page.
  */
-#ifdef CONFIG_MEMCG_SWAP
+#ifdef CONFIG_MEMCG
 u64 hwpoison_filter_memcg;
 EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
 static int hwpoison_filter_task(struct page *p)
 {
-       struct mem_cgroup *mem;
-       struct cgroup_subsys_state *css;
-       unsigned long ino;
-
        if (!hwpoison_filter_memcg)
                return 0;
 
-       mem = try_get_mem_cgroup_from_page(p);
-       if (!mem)
-               return -EINVAL;
-
-       css = mem_cgroup_css(mem);
-       ino = cgroup_ino(css->cgroup);
-       css_put(css);
-
-       if (ino != hwpoison_filter_memcg)
+       if (page_cgroup_ino(p) != hwpoison_filter_memcg)
                return -EINVAL;
 
        return 0;
@@ -934,6 +922,27 @@ int get_hwpoison_page(struct page *page)
 }
 EXPORT_SYMBOL_GPL(get_hwpoison_page);
 
+/**
+ * put_hwpoison_page() - Put refcount for memory error handling:
+ * @page:      raw error page (hit by memory error)
+ */
+void put_hwpoison_page(struct page *page)
+{
+       struct page *head = compound_head(page);
+
+       if (PageHuge(head)) {
+               put_page(head);
+               return;
+       }
+
+       if (PageTransHuge(head))
+               if (page != head)
+                       put_page(head);
+
+       put_page(page);
+}
+EXPORT_SYMBOL_GPL(put_hwpoison_page);
+
 /*
  * Do all that is necessary to remove user space mappings. Unmap
  * the pages and send SIGBUS to the processes if the data was dirty.
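
A userspace sketch of the reference-dropping rule put_hwpoison_page() encodes above: a hugetlb page is referenced via its compound head, a THP tail carries an extra head reference that must also be dropped, and any other page just drops its own count. The flags and types are stand-ins, not struct page.

    #include <stdio.h>

    struct fake_page {
            int refcount;
            int is_huge;            /* hugetlb page */
            int is_trans_huge;      /* transparent huge page */
            struct fake_page *head; /* compound head (== itself if not a tail) */
    };

    static void put_page(struct fake_page *p) { p->refcount--; }

    static void put_hwpoison_page(struct fake_page *page)
    {
            struct fake_page *head = page->head;

            if (head->is_huge) {            /* hugetlb: only the head is counted */
                    put_page(head);
                    return;
            }
            if (head->is_trans_huge && page != head)
                    put_page(head);         /* drop the extra head reference */

            put_page(page);
    }

    int main(void)
    {
            struct fake_page head = { .refcount = 2, .is_trans_huge = 1 };
            struct fake_page tail = { .refcount = 1, .head = &head };

            head.head = &head;
            put_hwpoison_page(&tail);
            printf("head refcount %d, tail refcount %d\n",
                   head.refcount, tail.refcount);
            return 0;
    }
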
@@ -1100,7 +1109,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
                nr_pages = 1 << compound_order(hpage);
        else /* normal page or thp */
                nr_pages = 1;
-       atomic_long_add(nr_pages, &num_poisoned_pages);
+       num_poisoned_pages_add(nr_pages);
 
        /*
         * We need/can do nothing about count=0 pages.
@@ -1128,7 +1137,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
                        if (PageHWPoison(hpage)) {
                                if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
                                    || (p != hpage && TestSetPageHWPoison(hpage))) {
-                                       atomic_long_sub(nr_pages, &num_poisoned_pages);
+                                       num_poisoned_pages_sub(nr_pages);
                                        unlock_page(hpage);
                                        return 0;
                                }
@@ -1152,10 +1161,8 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
                        else
                                pr_err("MCE: %#lx: thp split failed\n", pfn);
                        if (TestClearPageHWPoison(p))
-                               atomic_long_sub(nr_pages, &num_poisoned_pages);
-                       put_page(p);
-                       if (p != hpage)
-                               put_page(hpage);
+                               num_poisoned_pages_sub(nr_pages);
+                       put_hwpoison_page(p);
                        return -EBUSY;
                }
                VM_BUG_ON_PAGE(!page_count(p), p);
@@ -1214,16 +1221,16 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
         */
        if (!PageHWPoison(p)) {
                printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
-               atomic_long_sub(nr_pages, &num_poisoned_pages);
+               num_poisoned_pages_sub(nr_pages);
                unlock_page(hpage);
-               put_page(hpage);
+               put_hwpoison_page(hpage);
                return 0;
        }
        if (hwpoison_filter(p)) {
                if (TestClearPageHWPoison(p))
-                       atomic_long_sub(nr_pages, &num_poisoned_pages);
+                       num_poisoned_pages_sub(nr_pages);
                unlock_page(hpage);
-               put_page(hpage);
+               put_hwpoison_page(hpage);
                return 0;
        }
 
@@ -1237,7 +1244,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
        if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
                action_result(pfn, MF_MSG_POISONED_HUGE, MF_IGNORED);
                unlock_page(hpage);
-               put_page(hpage);
+               put_hwpoison_page(hpage);
                return 0;
        }
        /*
@@ -1426,6 +1433,22 @@ int unpoison_memory(unsigned long pfn)
                return 0;
        }
 
+       if (page_count(page) > 1) {
+               pr_info("MCE: Someone grabs the hwpoison page %#lx\n", pfn);
+               return 0;
+       }
+
+       if (page_mapped(page)) {
+               pr_info("MCE: Someone maps the hwpoison page %#lx\n", pfn);
+               return 0;
+       }
+
+       if (page_mapping(page)) {
+               pr_info("MCE: the hwpoison page has non-NULL mapping %#lx\n",
+                       pfn);
+               return 0;
+       }
+
        /*
         * unpoison_memory() can encounter thp only when the thp is being
         * worked by memory_failure() and the page lock is not held yet.
@@ -1450,7 +1473,7 @@ int unpoison_memory(unsigned long pfn)
                        return 0;
                }
                if (TestClearPageHWPoison(p))
-                       atomic_long_dec(&num_poisoned_pages);
+                       num_poisoned_pages_dec();
                pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
                return 0;
        }
@@ -1464,16 +1487,16 @@ int unpoison_memory(unsigned long pfn)
         */
        if (TestClearPageHWPoison(page)) {
                pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
-               atomic_long_sub(nr_pages, &num_poisoned_pages);
+               num_poisoned_pages_sub(nr_pages);
                freeit = 1;
                if (PageHuge(page))
                        clear_page_hwpoison_huge_page(page);
        }
        unlock_page(page);
 
-       put_page(page);
+       put_hwpoison_page(page);
        if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
-               put_page(page);
+               put_hwpoison_page(page);
 
        return 0;
 }
@@ -1486,7 +1509,7 @@ static struct page *new_page(struct page *p, unsigned long private, int **x)
                return alloc_huge_page_node(page_hstate(compound_head(p)),
                                                   nid);
        else
-               return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
+               return __alloc_pages_node(nid, GFP_HIGHUSER_MOVABLE, 0);
 }
 
 /*
@@ -1533,7 +1556,7 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags)
                /*
                 * Try to free it.
                 */
-               put_page(page);
+               put_hwpoison_page(page);
                shake_page(page, 1);
 
                /*
@@ -1542,7 +1565,7 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags)
                ret = __get_any_page(page, pfn, 0);
                if (!PageLRU(page)) {
                        /* Drop page reference which is from __get_any_page() */
-                       put_page(page);
+                       put_hwpoison_page(page);
                        pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
                                pfn, page->flags);
                        return -EIO;
@@ -1565,7 +1588,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
        lock_page(hpage);
        if (PageHWPoison(hpage)) {
                unlock_page(hpage);
-               put_page(hpage);
+               put_hwpoison_page(hpage);
                pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
                return -EBUSY;
        }
@@ -1576,7 +1599,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
         * get_any_page() and isolate_huge_page() each take a refcount,
         * so we need to drop one here.
         */
-       put_page(hpage);
+       put_hwpoison_page(hpage);
        if (!ret) {
                pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn);
                return -EBUSY;
@@ -1600,11 +1623,10 @@ static int soft_offline_huge_page(struct page *page, int flags)
                if (PageHuge(page)) {
                        set_page_hwpoison_huge_page(hpage);
                        dequeue_hwpoisoned_huge_page(hpage);
-                       atomic_long_add(1 << compound_order(hpage),
-                                       &num_poisoned_pages);
+                       num_poisoned_pages_add(1 << compound_order(hpage));
                } else {
                        SetPageHWPoison(page);
-                       atomic_long_inc(&num_poisoned_pages);
+                       num_poisoned_pages_inc();
                }
        }
        return ret;
@@ -1625,7 +1647,7 @@ static int __soft_offline_page(struct page *page, int flags)
        wait_on_page_writeback(page);
        if (PageHWPoison(page)) {
                unlock_page(page);
-               put_page(page);
+               put_hwpoison_page(page);
                pr_info("soft offline: %#lx page already poisoned\n", pfn);
                return -EBUSY;
        }
@@ -1640,10 +1662,10 @@ static int __soft_offline_page(struct page *page, int flags)
         * would need to fix isolation locking first.
         */
        if (ret == 1) {
-               put_page(page);
+               put_hwpoison_page(page);
                pr_info("soft_offline: %#lx: invalidated\n", pfn);
                SetPageHWPoison(page);
-               atomic_long_inc(&num_poisoned_pages);
+               num_poisoned_pages_inc();
                return 0;
        }
 
@@ -1657,14 +1679,12 @@ static int __soft_offline_page(struct page *page, int flags)
         * Drop the page reference that came from get_any_page();
         * a successful isolate_lru_page() already took another one.
         */
-       put_page(page);
+       put_hwpoison_page(page);
        if (!ret) {
                LIST_HEAD(pagelist);
                inc_zone_page_state(page, NR_ISOLATED_ANON +
                                        page_is_file_cache(page));
                list_add(&page->lru, &pagelist);
-               if (!TestSetPageHWPoison(page))
-                       atomic_long_inc(&num_poisoned_pages);
                ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
                                        MIGRATE_SYNC, MR_MEMORY_FAILURE);
                if (ret) {
@@ -1679,8 +1699,6 @@ static int __soft_offline_page(struct page *page, int flags)
                                pfn, ret, page->flags);
                        if (ret > 0)
                                ret = -EIO;
-                       if (TestClearPageHWPoison(page))
-                               atomic_long_dec(&num_poisoned_pages);
                }
        } else {
                pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
@@ -1719,12 +1737,16 @@ int soft_offline_page(struct page *page, int flags)
 
        if (PageHWPoison(page)) {
                pr_info("soft offline: %#lx page already poisoned\n", pfn);
+               if (flags & MF_COUNT_INCREASED)
+                       put_hwpoison_page(page);
                return -EBUSY;
        }
        if (!PageHuge(page) && PageTransHuge(hpage)) {
                if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
                        pr_info("soft offline: %#lx: failed to split THP\n",
                                pfn);
+                       if (flags & MF_COUNT_INCREASED)
+                               put_hwpoison_page(page);
                        return -EBUSY;
                }
        }
@@ -1742,11 +1764,10 @@ int soft_offline_page(struct page *page, int flags)
                if (PageHuge(page)) {
                        set_page_hwpoison_huge_page(hpage);
                        if (!dequeue_hwpoisoned_huge_page(hpage))
-                               atomic_long_add(1 << compound_order(hpage),
-                                       &num_poisoned_pages);
+                               num_poisoned_pages_add(1 << compound_order(hpage));
                } else {
                        if (!TestSetPageHWPoison(page))
-                               atomic_long_inc(&num_poisoned_pages);
+                               num_poisoned_pages_inc();
                }
        }
        return ret;
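
The memory-failure.c hunks above swap the open-coded atomic_long_*() updates of
num_poisoned_pages for named helpers that are introduced elsewhere in this
series. A minimal sketch of the presumed wrappers, matching the operations they
replace (assumption: thin static inlines in a shared header):

	static inline void num_poisoned_pages_inc(void)
	{
		atomic_long_inc(&num_poisoned_pages);
	}

	static inline void num_poisoned_pages_dec(void)
	{
		atomic_long_dec(&num_poisoned_pages);
	}

	static inline void num_poisoned_pages_add(long num)
	{
		atomic_long_add(num, &num_poisoned_pages);
	}

	static inline void num_poisoned_pages_sub(long num)
	{
		atomic_long_sub(num, &num_poisoned_pages);
	}
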
index bb04d8f2f86c415c24a52dadd71623a2316454d1..9cb27470fee991cb874676bb0cbc0f694b5e1d36 100644 (file)
@@ -2426,8 +2426,6 @@ void unmap_mapping_range(struct address_space *mapping,
        if (details.last_index < details.first_index)
                details.last_index = ULONG_MAX;
 
-
-       /* DAX uses i_mmap_lock to serialise file truncate vs page fault */
        i_mmap_lock_write(mapping);
        if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
                unmap_mapping_range_tree(&mapping->i_mmap, &details);
@@ -3015,9 +3013,9 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                } else {
                        /*
                         * The fault handler has no page to lock, so it holds
-                        * i_mmap_lock for read to protect against truncate.
+                        * i_mmap_lock for write to protect against truncate.
                         */
-                       i_mmap_unlock_read(vma->vm_file->f_mapping);
+                       i_mmap_unlock_write(vma->vm_file->f_mapping);
                }
                goto uncharge_out;
        }
@@ -3031,9 +3029,9 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        } else {
                /*
                 * The fault handler has no page to lock, so it holds
-                * i_mmap_lock for read to protect against truncate.
+                * i_mmap_lock for write to protect against truncate.
                 */
-               i_mmap_unlock_read(vma->vm_file->f_mapping);
+               i_mmap_unlock_write(vma->vm_file->f_mapping);
        }
        return ret;
 uncharge_out:
@@ -3232,6 +3230,27 @@ out:
        return 0;
 }
 
+static int create_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+                       unsigned long address, pmd_t *pmd, unsigned int flags)
+{
+       if (vma_is_anonymous(vma))
+               return do_huge_pmd_anonymous_page(mm, vma, address, pmd, flags);
+       if (vma->vm_ops->pmd_fault)
+               return vma->vm_ops->pmd_fault(vma, address, pmd, flags);
+       return VM_FAULT_FALLBACK;
+}
+
+static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+                       unsigned long address, pmd_t *pmd, pmd_t orig_pmd,
+                       unsigned int flags)
+{
+       if (vma_is_anonymous(vma))
+               return do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd);
+       if (vma->vm_ops->pmd_fault)
+               return vma->vm_ops->pmd_fault(vma, address, pmd, flags);
+       return VM_FAULT_FALLBACK;
+}
+
 /*
  * These routines also need to handle stuff like marking pages dirty
  * and/or accessed for architectures that don't do it in hardware (most
@@ -3267,12 +3286,12 @@ static int handle_pte_fault(struct mm_struct *mm,
        barrier();
        if (!pte_present(entry)) {
                if (pte_none(entry)) {
-                       if (vma->vm_ops)
+                       if (vma_is_anonymous(vma))
+                               return do_anonymous_page(mm, vma, address,
+                                                        pte, pmd, flags);
+                       else
                                return do_fault(mm, vma, address, pte, pmd,
                                                flags, entry);
-
-                       return do_anonymous_page(mm, vma, address, pte, pmd,
-                                       flags);
                }
                return do_swap_page(mm, vma, address,
                                        pte, pmd, flags, entry);
@@ -3334,10 +3353,7 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        if (!pmd)
                return VM_FAULT_OOM;
        if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
-               int ret = VM_FAULT_FALLBACK;
-               if (!vma->vm_ops)
-                       ret = do_huge_pmd_anonymous_page(mm, vma, address,
-                                       pmd, flags);
+               int ret = create_huge_pmd(mm, vma, address, pmd, flags);
                if (!(ret & VM_FAULT_FALLBACK))
                        return ret;
        } else {
@@ -3361,8 +3377,8 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                                                             orig_pmd, pmd);
 
                        if (dirty && !pmd_write(orig_pmd)) {
-                               ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
-                                                         orig_pmd);
+                               ret = wp_huge_pmd(mm, vma, address, pmd,
+                                                       orig_pmd, flags);
                                if (!(ret & VM_FAULT_FALLBACK))
                                        return ret;
                        } else {
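
The memory.c hunks above make the fault paths dispatch on vma_is_anonymous() and
on the new create_huge_pmd()/wp_huge_pmd() helpers, so that file-backed VMAs
whose vm_ops provide ->pmd_fault (such as DAX) can service huge-PMD faults
instead of always falling back. A minimal sketch of vma_is_anonymous(), assuming
it keeps the "no vm_ops" test that the removed code relied on:

	static inline bool vma_is_anonymous(struct vm_area_struct *vma)
	{
		return !vma->vm_ops;	/* anonymous mappings carry no vm_operations */
	}
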
index 8fd97dac538a46c4a5768273060631c7ac10a80e..aa992e2df58a42a6307a6324a3aa353e194d6f0f 100644 (file)
@@ -778,7 +778,10 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
 
        start = phys_start_pfn << PAGE_SHIFT;
        size = nr_pages * PAGE_SIZE;
-       ret = release_mem_region_adjustable(&iomem_resource, start, size);
+
+       /* in the ZONE_DEVICE case device driver owns the memory region */
+       if (!is_dev_zone(zone))
+               ret = release_mem_region_adjustable(&iomem_resource, start, size);
        if (ret) {
                resource_size_t endres = start + size - 1;
 
@@ -1215,8 +1218,13 @@ static int should_add_memory_movable(int nid, u64 start, u64 size)
        return 0;
 }
 
-int zone_for_memory(int nid, u64 start, u64 size, int zone_default)
+int zone_for_memory(int nid, u64 start, u64 size, int zone_default,
+               bool for_device)
 {
+#ifdef CONFIG_ZONE_DEVICE
+       if (for_device)
+               return ZONE_DEVICE;
+#endif
        if (should_add_memory_movable(nid, start, size))
                return ZONE_MOVABLE;
 
@@ -1265,7 +1273,7 @@ int __ref add_memory(int nid, u64 start, u64 size)
        }
 
        /* call arch's memory hotadd */
-       ret = arch_add_memory(nid, start, size);
+       ret = arch_add_memory(nid, start, size, false);
 
        if (ret < 0)
                goto error;
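
The memory_hotplug.c hunks above thread a for_device flag through
zone_for_memory() and arch_add_memory(), and skip releasing the iomem resource
on removal when the zone is ZONE_DEVICE, since the owning driver manages that
region itself. A hypothetical driver-side sketch (the wrapper name is
illustrative) of a device-memory hot-add using the new flag:

	/* Hypothetical helper: device-owned memory is added with for_device = true,
	 * so zone_for_memory() returns ZONE_DEVICE and __remove_pages() later
	 * leaves the driver's iomem resource untouched. */
	static int my_dev_add_memory(int nid, struct resource *res)
	{
		return arch_add_memory(nid, res->start, resource_size(res), true);
	}
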
index a7f1e0d1d6b8fed5d1e3d9bd1380c9cd9b1447e8..87a177917cb2e60a13b09e6a53836ccd9f9275bf 100644 (file)
@@ -608,9 +608,6 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
 
        qp->prev = vma;
 
-       if (vma->vm_flags & VM_PFNMAP)
-               return 1;
-
        if (flags & MPOL_MF_LAZY) {
                /* Similar to task_numa_work, skip inaccessible VMAs */
                if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
@@ -945,7 +942,7 @@ static struct page *new_node_page(struct page *page, unsigned long node, int **x
                return alloc_huge_page_node(page_hstate(compound_head(page)),
                                        node);
        else
-               return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE |
+               return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
                                                    __GFP_THISNODE, 0);
 }
 
@@ -2001,7 +1998,7 @@ retry_cpuset:
                nmask = policy_nodemask(gfp, pol);
                if (!nmask || node_isset(hpage_node, *nmask)) {
                        mpol_cond_put(pol);
-                       page = alloc_pages_exact_node(hpage_node,
+                       page = __alloc_pages_node(hpage_node,
                                                gfp | __GFP_THISNODE, order);
                        goto out;
                }
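
Several hunks in this merge (memory-failure.c and mempolicy.c above, migrate.c
and page_alloc.c below) replace alloc_pages_exact_node() with
__alloc_pages_node(). A sketch of the renamed wrapper, assuming it is the usual
gfp.h inline around __alloc_pages() with a node-id sanity check:

	static inline struct page *
	__alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
	{
		VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);

		return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
	}
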
index 2cc08de8b1db259c70ad8d3aa28d0691f7ada544..4c533bc51d733989f12c9f5e25cad713f647baea 100644 (file)
@@ -150,6 +150,9 @@ static void *remove_element(mempool_t *pool)
  */
 void mempool_destroy(mempool_t *pool)
 {
+       if (unlikely(!pool))
+               return;
+
        while (pool->curr_nr) {
                void *element = remove_element(pool);
                pool->free(element, pool->pool_data);
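
With the NULL check added above, mempool_destroy() now follows the kfree()
convention of being a no-op on a NULL pointer, so callers can drop their guards.
Illustrative cleanup path (the driver field names are hypothetical):

	/* before: if (dev->pool) mempool_destroy(dev->pool); */
	mempool_destroy(dev->pool);	/* safe even if the pool was never created */
	dev->pool = NULL;
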
index 0a1cc133f6d72af96a7e16df6481c821dd907a03..8eaa4c3a5f65a86bd3f465455960b573b9434273 100644 (file)
@@ -1,11 +1,6 @@
 #include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/string.h>
 #include <linux/types.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
 #include <linux/init.h>
-#include <linux/pfn.h>
 #include <linux/memblock.h>
 
 static u64 patterns[] __initdata = {
@@ -31,10 +26,8 @@ static u64 patterns[] __initdata = {
 
 static void __init reserve_bad_mem(u64 pattern, phys_addr_t start_bad, phys_addr_t end_bad)
 {
-       printk(KERN_INFO "  %016llx bad mem addr %010llx - %010llx reserved\n",
-              (unsigned long long) pattern,
-              (unsigned long long) start_bad,
-              (unsigned long long) end_bad);
+       pr_info("  %016llx bad mem addr %pa - %pa reserved\n",
+               cpu_to_be64(pattern), &start_bad, &end_bad);
        memblock_reserve(start_bad, end_bad - start_bad);
 }
 
@@ -79,26 +72,26 @@ static void __init do_one_pass(u64 pattern, phys_addr_t start, phys_addr_t end)
                this_start = clamp(this_start, start, end);
                this_end = clamp(this_end, start, end);
                if (this_start < this_end) {
-                       printk(KERN_INFO "  %010llx - %010llx pattern %016llx\n",
-                              (unsigned long long)this_start,
-                              (unsigned long long)this_end,
-                              (unsigned long long)cpu_to_be64(pattern));
+                       pr_info("  %pa - %pa pattern %016llx\n",
+                               &this_start, &this_end, cpu_to_be64(pattern));
                        memtest(pattern, this_start, this_end - this_start);
                }
        }
 }
 
 /* default is disabled */
-static int memtest_pattern __initdata;
+static unsigned int memtest_pattern __initdata;
 
 static int __init parse_memtest(char *arg)
 {
+       int ret = 0;
+
        if (arg)
-               memtest_pattern = simple_strtoul(arg, NULL, 0);
+               ret = kstrtouint(arg, 0, &memtest_pattern);
        else
                memtest_pattern = ARRAY_SIZE(patterns);
 
-       return 0;
+       return ret;
 }
 
 early_param("memtest", parse_memtest);
@@ -111,7 +104,7 @@ void __init early_memtest(phys_addr_t start, phys_addr_t end)
        if (!memtest_pattern)
                return;
 
-       printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern);
+       pr_info("early_memtest: # of tests: %u\n", memtest_pattern);
        for (i = memtest_pattern-1; i < UINT_MAX; --i) {
                idx = i % ARRAY_SIZE(patterns);
                do_one_pass(patterns[idx], start, end);
index 5c08cab5419e771d4b2d04762ae3d876cf8080eb..c3cb566af3e273a92e8353835b1cd6d03d64c7e3 100644 (file)
@@ -37,6 +37,7 @@
 #include <linux/gfp.h>
 #include <linux/balloon_compaction.h>
 #include <linux/mmu_notifier.h>
+#include <linux/page_idle.h>
 
 #include <asm/tlbflush.h>
 
@@ -524,6 +525,11 @@ void migrate_page_copy(struct page *newpage, struct page *page)
                        __set_page_dirty_nobuffers(newpage);
        }
 
+       if (page_is_young(page))
+               set_page_young(newpage);
+       if (page_is_idle(page))
+               set_page_idle(newpage);
+
        /*
         * Copy NUMA information to the new page, to prevent over-eager
         * future migrations of this same page.
@@ -880,8 +886,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
        /* Establish migration ptes or remove ptes */
        if (page_mapped(page)) {
                try_to_unmap(page,
-                       TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS|
-                       TTU_IGNORE_HWPOISON);
+                       TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
                page_was_mapped = 1;
        }
 
@@ -952,9 +957,11 @@ out:
                dec_zone_page_state(page, NR_ISOLATED_ANON +
                                page_is_file_cache(page));
                /* Soft-offlined page shouldn't go through lru cache list */
-               if (reason == MR_MEMORY_FAILURE)
+               if (reason == MR_MEMORY_FAILURE) {
                        put_page(page);
-               else
+                       if (!test_set_page_hwpoison(page))
+                               num_poisoned_pages_inc();
+               } else
                        putback_lru_page(page);
        }
 
@@ -1194,7 +1201,7 @@ static struct page *new_page_node(struct page *p, unsigned long private,
                return alloc_huge_page_node(page_hstate(compound_head(p)),
                                        pm->node);
        else
-               return alloc_pages_exact_node(pm->node,
+               return __alloc_pages_node(pm->node,
                                GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0);
 }
 
@@ -1554,7 +1561,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
        int nid = (int) data;
        struct page *newpage;
 
-       newpage = alloc_pages_exact_node(nid,
+       newpage = __alloc_pages_node(nid,
                                         (GFP_HIGHUSER_MOVABLE |
                                          __GFP_THISNODE | __GFP_NOMEMALLOC |
                                          __GFP_NORETRY | __GFP_NOWARN) &
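
The migrate.c hunks above propagate the new "young" and "idle" markers to the
destination page during migration, and set the hwpoison marker for
soft-offlined pages from within the migration completion path instead of before
migrate_pages() (dropping TTU_IGNORE_HWPOISON from try_to_unmap accordingly). A
sketch of the accessors used here, assuming the 64-bit variant maps onto new
PG_young/PG_idle page flags while 32-bit kernels fall back to page_ext storage
(as the page_ext.c hunk later in this diff suggests):

	static inline bool page_is_young(struct page *page)
	{
		return PageYoung(page);
	}

	static inline void set_page_young(struct page *page)
	{
		SetPageYoung(page);
	}

	static inline bool page_is_idle(struct page *page)
	{
		return PageIdle(page);
	}

	static inline void set_page_idle(struct page *page)
	{
		SetPageIdle(page);
	}
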
index 82db4fc0a9d34040bf7fa6750eddf7b508fb0e1a..c739d6db7193e854dc2e4495bcaf9ed4678e1e6b 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1260,14 +1260,12 @@ static inline int mlock_future_check(struct mm_struct *mm,
 /*
  * The caller must hold down_write(&current->mm->mmap_sem).
  */
-
-unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
+unsigned long do_mmap(struct file *file, unsigned long addr,
                        unsigned long len, unsigned long prot,
-                       unsigned long flags, unsigned long pgoff,
-                       unsigned long *populate)
+                       unsigned long flags, vm_flags_t vm_flags,
+                       unsigned long pgoff, unsigned long *populate)
 {
        struct mm_struct *mm = current->mm;
-       vm_flags_t vm_flags;
 
        *populate = 0;
 
@@ -1311,7 +1309,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
         * to. we assume access permissions have been handled by the open
         * of the memory object, so we don't do any here.
         */
-       vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
+       vm_flags |= calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
                        mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
 
        if (flags & MAP_LOCKED)
@@ -2455,7 +2453,7 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
              unsigned long addr, int new_below)
 {
        struct vm_area_struct *new;
-       int err = -ENOMEM;
+       int err;
 
        if (is_vm_hugetlb_page(vma) && (addr &
                                        ~(huge_page_mask(hstate_vma(vma)))))
@@ -2463,7 +2461,7 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
 
        new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
        if (!new)
-               goto out_err;
+               return -ENOMEM;
 
        /* most fields are the same, copy all, and then fixup */
        *new = *vma;
@@ -2511,7 +2509,6 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
        mpol_put(vma_policy(new));
  out_free_vma:
        kmem_cache_free(vm_area_cachep, new);
- out_err:
        return err;
 }
 
@@ -2872,6 +2869,13 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
        struct vm_area_struct *prev;
        struct rb_node **rb_link, *rb_parent;
 
+       if (find_vma_links(mm, vma->vm_start, vma->vm_end,
+                          &prev, &rb_link, &rb_parent))
+               return -ENOMEM;
+       if ((vma->vm_flags & VM_ACCOUNT) &&
+            security_vm_enough_memory_mm(mm, vma_pages(vma)))
+               return -ENOMEM;
+
        /*
         * The vm_pgoff of a purely anonymous vma should be irrelevant
         * until its first write fault, when page's anon_vma and index
@@ -2884,16 +2888,10 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
         * using the existing file pgoff checks and manipulations.
         * Similarly in do_mmap_pgoff and in do_brk.
         */
-       if (!vma->vm_file) {
+       if (vma_is_anonymous(vma)) {
                BUG_ON(vma->anon_vma);
                vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
        }
-       if (find_vma_links(mm, vma->vm_start, vma->vm_end,
-                          &prev, &rb_link, &rb_parent))
-               return -ENOMEM;
-       if ((vma->vm_flags & VM_ACCOUNT) &&
-            security_vm_enough_memory_mm(mm, vma_pages(vma)))
-               return -ENOMEM;
 
        vma_link(mm, vma, prev, rb_link, rb_parent);
        return 0;
@@ -2918,7 +2916,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
         * If anonymous vma has not yet been faulted, update new pgoff
         * to match new location, to increase its chance of merging.
         */
-       if (unlikely(!vma->vm_file && !vma->anon_vma)) {
+       if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
                pgoff = addr >> PAGE_SHIFT;
                faulted_in_anon_vma = false;
        }
@@ -2952,30 +2950,31 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
                *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
        } else {
                new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
-               if (new_vma) {
-                       *new_vma = *vma;
-                       new_vma->vm_start = addr;
-                       new_vma->vm_end = addr + len;
-                       new_vma->vm_pgoff = pgoff;
-                       if (vma_dup_policy(vma, new_vma))
-                               goto out_free_vma;
-                       INIT_LIST_HEAD(&new_vma->anon_vma_chain);
-                       if (anon_vma_clone(new_vma, vma))
-                               goto out_free_mempol;
-                       if (new_vma->vm_file)
-                               get_file(new_vma->vm_file);
-                       if (new_vma->vm_ops && new_vma->vm_ops->open)
-                               new_vma->vm_ops->open(new_vma);
-                       vma_link(mm, new_vma, prev, rb_link, rb_parent);
-                       *need_rmap_locks = false;
-               }
+               if (!new_vma)
+                       goto out;
+               *new_vma = *vma;
+               new_vma->vm_start = addr;
+               new_vma->vm_end = addr + len;
+               new_vma->vm_pgoff = pgoff;
+               if (vma_dup_policy(vma, new_vma))
+                       goto out_free_vma;
+               INIT_LIST_HEAD(&new_vma->anon_vma_chain);
+               if (anon_vma_clone(new_vma, vma))
+                       goto out_free_mempol;
+               if (new_vma->vm_file)
+                       get_file(new_vma->vm_file);
+               if (new_vma->vm_ops && new_vma->vm_ops->open)
+                       new_vma->vm_ops->open(new_vma);
+               vma_link(mm, new_vma, prev, rb_link, rb_parent);
+               *need_rmap_locks = false;
        }
        return new_vma;
 
- out_free_mempol:
+out_free_mempol:
        mpol_put(vma_policy(new_vma));
- out_free_vma:
+out_free_vma:
        kmem_cache_free(vm_area_cachep, new_vma);
+out:
        return NULL;
 }
 
@@ -3027,21 +3026,13 @@ static int special_mapping_fault(struct vm_area_struct *vma,
        pgoff_t pgoff;
        struct page **pages;
 
-       /*
-        * special mappings have no vm_file, and in that case, the mm
-        * uses vm_pgoff internally. So we have to subtract it from here.
-        * We are allowed to do this because we are the mm; do not copy
-        * this code into drivers!
-        */
-       pgoff = vmf->pgoff - vma->vm_pgoff;
-
        if (vma->vm_ops == &legacy_special_mapping_vmops)
                pages = vma->vm_private_data;
        else
                pages = ((struct vm_special_mapping *)vma->vm_private_data)->
                        pages;
 
-       for (; pgoff && *pages; ++pages)
+       for (pgoff = vmf->pgoff; pgoff && *pages; ++pages)
                pgoff--;
 
        if (*pages) {
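
The mmap.c change above renames do_mmap_pgoff() to do_mmap() and lets callers
pass in initial vm_flags that are OR-ed with the computed ones (the nommu.c
variant below gets the same treatment). A minimal sketch of how the old entry
point is presumably kept as a compatibility wrapper that passes no extra flags:

	static inline unsigned long
	do_mmap_pgoff(struct file *file, unsigned long addr,
		      unsigned long len, unsigned long prot, unsigned long flags,
		      unsigned long pgoff, unsigned long *populate)
	{
		return do_mmap(file, addr, len, prot, flags, 0, pgoff, populate);
	}
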
index 3b9b3d0741b2a1546837761d90f7eec2c0b3b18b..5fbdd367bbed9c57bc9ffd600293a5913b44e752 100644 (file)
@@ -123,6 +123,23 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
        return young;
 }
 
+int __mmu_notifier_clear_young(struct mm_struct *mm,
+                              unsigned long start,
+                              unsigned long end)
+{
+       struct mmu_notifier *mn;
+       int young = 0, id;
+
+       id = srcu_read_lock(&srcu);
+       hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
+               if (mn->ops->clear_young)
+                       young |= mn->ops->clear_young(mn, mm, start, end);
+       }
+       srcu_read_unlock(&srcu, id);
+
+       return young;
+}
+
 int __mmu_notifier_test_young(struct mm_struct *mm,
                              unsigned long address)
 {
index 1cc0709fcaa5312351ce69d548afadda5ce80097..ab14a2014dea76b62e77b0176d810037e1c76788 100644 (file)
@@ -1233,18 +1233,19 @@ enomem:
 /*
  * handle mapping creation for uClinux
  */
-unsigned long do_mmap_pgoff(struct file *file,
-                           unsigned long addr,
-                           unsigned long len,
-                           unsigned long prot,
-                           unsigned long flags,
-                           unsigned long pgoff,
-                           unsigned long *populate)
+unsigned long do_mmap(struct file *file,
+                       unsigned long addr,
+                       unsigned long len,
+                       unsigned long prot,
+                       unsigned long flags,
+                       vm_flags_t vm_flags,
+                       unsigned long pgoff,
+                       unsigned long *populate)
 {
        struct vm_area_struct *vma;
        struct vm_region *region;
        struct rb_node *rb;
-       unsigned long capabilities, vm_flags, result;
+       unsigned long capabilities, result;
        int ret;
 
        *populate = 0;
@@ -1262,7 +1263,7 @@ unsigned long do_mmap_pgoff(struct file *file,
 
        /* we've determined that we can make the mapping, now translate what we
         * now know into VMA flags */
-       vm_flags = determine_vm_flags(file, prot, flags, capabilities);
+       vm_flags |= determine_vm_flags(file, prot, flags, capabilities);
 
        /* we're going to need to record the mapping */
        region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL);
index dff991e0681e85a5308ba097533e346e621bc954..1ecc0bcaecc518458765347f8b0fa5d5eed46f75 100644 (file)
@@ -196,27 +196,26 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
  * Determine the type of allocation constraint.
  */
 #ifdef CONFIG_NUMA
-static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
-                               gfp_t gfp_mask, nodemask_t *nodemask,
-                               unsigned long *totalpages)
+static enum oom_constraint constrained_alloc(struct oom_control *oc,
+                                            unsigned long *totalpages)
 {
        struct zone *zone;
        struct zoneref *z;
-       enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+       enum zone_type high_zoneidx = gfp_zone(oc->gfp_mask);
        bool cpuset_limited = false;
        int nid;
 
        /* Default to all available memory */
        *totalpages = totalram_pages + total_swap_pages;
 
-       if (!zonelist)
+       if (!oc->zonelist)
                return CONSTRAINT_NONE;
        /*
         * We only reach here when __GFP_NOFAIL is used, so we should avoid
         * killing current; we would have to kill a random task in that case.
         * Hopefully CONSTRAINT_THISNODE... but there is no way to handle it now.
         */
-       if (gfp_mask & __GFP_THISNODE)
+       if (oc->gfp_mask & __GFP_THISNODE)
                return CONSTRAINT_NONE;
 
        /*
@@ -224,17 +223,18 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
         * the page allocator means a mempolicy is in effect.  Cpuset policy
         * is enforced in get_page_from_freelist().
         */
-       if (nodemask && !nodes_subset(node_states[N_MEMORY], *nodemask)) {
+       if (oc->nodemask &&
+           !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
                *totalpages = total_swap_pages;
-               for_each_node_mask(nid, *nodemask)
+               for_each_node_mask(nid, *oc->nodemask)
                        *totalpages += node_spanned_pages(nid);
                return CONSTRAINT_MEMORY_POLICY;
        }
 
        /* Check this allocation failure is caused by cpuset's wall function */
-       for_each_zone_zonelist_nodemask(zone, z, zonelist,
-                       high_zoneidx, nodemask)
-               if (!cpuset_zone_allowed(zone, gfp_mask))
+       for_each_zone_zonelist_nodemask(zone, z, oc->zonelist,
+                       high_zoneidx, oc->nodemask)
+               if (!cpuset_zone_allowed(zone, oc->gfp_mask))
                        cpuset_limited = true;
 
        if (cpuset_limited) {
@@ -246,20 +246,18 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
        return CONSTRAINT_NONE;
 }
 #else
-static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
-                               gfp_t gfp_mask, nodemask_t *nodemask,
-                               unsigned long *totalpages)
+static enum oom_constraint constrained_alloc(struct oom_control *oc,
+                                            unsigned long *totalpages)
 {
        *totalpages = totalram_pages + total_swap_pages;
        return CONSTRAINT_NONE;
 }
 #endif
 
-enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
-               unsigned long totalpages, const nodemask_t *nodemask,
-               bool force_kill)
+enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
+                       struct task_struct *task, unsigned long totalpages)
 {
-       if (oom_unkillable_task(task, NULL, nodemask))
+       if (oom_unkillable_task(task, NULL, oc->nodemask))
                return OOM_SCAN_CONTINUE;
 
        /*
@@ -267,7 +265,7 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
         * Don't allow any other task to have access to the reserves.
         */
        if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
-               if (!force_kill)
+               if (oc->order != -1)
                        return OOM_SCAN_ABORT;
        }
        if (!task->mm)
@@ -280,7 +278,7 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
        if (oom_task_origin(task))
                return OOM_SCAN_SELECT;
 
-       if (task_will_free_mem(task) && !force_kill)
+       if (task_will_free_mem(task) && oc->order != -1)
                return OOM_SCAN_ABORT;
 
        return OOM_SCAN_OK;
@@ -289,12 +287,9 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
 /*
  * Simple selection loop. We chose the process with the highest
  * number of 'points'.  Returns -1 on scan abort.
- *
- * (not docbooked, we don't want this one cluttering up the manual)
  */
-static struct task_struct *select_bad_process(unsigned int *ppoints,
-               unsigned long totalpages, const nodemask_t *nodemask,
-               bool force_kill)
+static struct task_struct *select_bad_process(struct oom_control *oc,
+               unsigned int *ppoints, unsigned long totalpages)
 {
        struct task_struct *g, *p;
        struct task_struct *chosen = NULL;
@@ -304,8 +299,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
        for_each_process_thread(g, p) {
                unsigned int points;
 
-               switch (oom_scan_process_thread(p, totalpages, nodemask,
-                                               force_kill)) {
+               switch (oom_scan_process_thread(oc, p, totalpages)) {
                case OOM_SCAN_SELECT:
                        chosen = p;
                        chosen_points = ULONG_MAX;
@@ -318,7 +312,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
                case OOM_SCAN_OK:
                        break;
                };
-               points = oom_badness(p, NULL, nodemask, totalpages);
+               points = oom_badness(p, NULL, oc->nodemask, totalpages);
                if (!points || points < chosen_points)
                        continue;
                /* Prefer thread group leaders for display purposes */
@@ -380,13 +374,13 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
        rcu_read_unlock();
 }
 
-static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
-                       struct mem_cgroup *memcg, const nodemask_t *nodemask)
+static void dump_header(struct oom_control *oc, struct task_struct *p,
+                       struct mem_cgroup *memcg)
 {
        task_lock(current);
        pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
                "oom_score_adj=%hd\n",
-               current->comm, gfp_mask, order,
+               current->comm, oc->gfp_mask, oc->order,
                current->signal->oom_score_adj);
        cpuset_print_task_mems_allowed(current);
        task_unlock(current);
@@ -396,7 +390,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
        else
                show_mem(SHOW_MEM_FILTER_NODES);
        if (sysctl_oom_dump_tasks)
-               dump_tasks(memcg, nodemask);
+               dump_tasks(memcg, oc->nodemask);
 }
 
 /*
@@ -487,10 +481,9 @@ void oom_killer_enable(void)
  * Must be called while holding a reference to p, which will be released upon
  * returning.
  */
-void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
+void oom_kill_process(struct oom_control *oc, struct task_struct *p,
                      unsigned int points, unsigned long totalpages,
-                     struct mem_cgroup *memcg, nodemask_t *nodemask,
-                     const char *message)
+                     struct mem_cgroup *memcg, const char *message)
 {
        struct task_struct *victim = p;
        struct task_struct *child;
@@ -514,7 +507,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
        task_unlock(p);
 
        if (__ratelimit(&oom_rs))
-               dump_header(p, gfp_mask, order, memcg, nodemask);
+               dump_header(oc, p, memcg);
 
        task_lock(p);
        pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
@@ -537,7 +530,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
                        /*
                         * oom_badness() returns 0 if the thread is unkillable
                         */
-                       child_points = oom_badness(child, memcg, nodemask,
+                       child_points = oom_badness(child, memcg, oc->nodemask,
                                                                totalpages);
                        if (child_points > victim_points) {
                                put_task_struct(victim);
@@ -600,8 +593,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 /*
  * Determines whether the kernel must panic because of the panic_on_oom sysctl.
  */
-void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
-                       int order, const nodemask_t *nodemask,
+void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint,
                        struct mem_cgroup *memcg)
 {
        if (likely(!sysctl_panic_on_oom))
@@ -615,7 +607,10 @@ void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
                if (constraint != CONSTRAINT_NONE)
                        return;
        }
-       dump_header(NULL, gfp_mask, order, memcg, nodemask);
+       /* Do not panic for oom kills triggered by sysrq */
+       if (oc->order == -1)
+               return;
+       dump_header(oc, NULL, memcg);
        panic("Out of memory: %s panic_on_oom is enabled\n",
                sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
 }
@@ -635,28 +630,21 @@ int unregister_oom_notifier(struct notifier_block *nb)
 EXPORT_SYMBOL_GPL(unregister_oom_notifier);
 
 /**
- * __out_of_memory - kill the "best" process when we run out of memory
- * @zonelist: zonelist pointer
- * @gfp_mask: memory allocation flags
- * @order: amount of memory being requested as a power of 2
- * @nodemask: nodemask passed to page allocator
- * @force_kill: true if a task must be killed, even if others are exiting
+ * out_of_memory - kill the "best" process when we run out of memory
+ * @oc: pointer to struct oom_control
  *
  * If we run out of memory, we have the choice between either
  * killing a random task (bad), letting the system crash (worse)
  * OR try to be smart about which process to kill. Note that we
  * don't have to be perfect here, we just have to be good.
  */
-bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
-                  int order, nodemask_t *nodemask, bool force_kill)
+bool out_of_memory(struct oom_control *oc)
 {
-       const nodemask_t *mpol_mask;
        struct task_struct *p;
        unsigned long totalpages;
        unsigned long freed = 0;
        unsigned int uninitialized_var(points);
        enum oom_constraint constraint = CONSTRAINT_NONE;
-       int killed = 0;
 
        if (oom_killer_disabled)
                return false;
@@ -664,7 +652,7 @@ bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
        blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
        if (freed > 0)
                /* Got some memory back in the last second. */
-               goto out;
+               return true;
 
        /*
         * If current has a pending SIGKILL or is exiting, then automatically
@@ -677,47 +665,42 @@ bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
        if (current->mm &&
            (fatal_signal_pending(current) || task_will_free_mem(current))) {
                mark_oom_victim(current);
-               goto out;
+               return true;
        }
 
        /*
         * Check if there were limitations on the allocation (only relevant for
         * NUMA) that may require different handling.
         */
-       constraint = constrained_alloc(zonelist, gfp_mask, nodemask,
-                                               &totalpages);
-       mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
-       check_panic_on_oom(constraint, gfp_mask, order, mpol_mask, NULL);
+       constraint = constrained_alloc(oc, &totalpages);
+       if (constraint != CONSTRAINT_MEMORY_POLICY)
+               oc->nodemask = NULL;
+       check_panic_on_oom(oc, constraint, NULL);
 
        if (sysctl_oom_kill_allocating_task && current->mm &&
-           !oom_unkillable_task(current, NULL, nodemask) &&
+           !oom_unkillable_task(current, NULL, oc->nodemask) &&
            current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
                get_task_struct(current);
-               oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL,
-                                nodemask,
+               oom_kill_process(oc, current, 0, totalpages, NULL,
                                 "Out of memory (oom_kill_allocating_task)");
-               goto out;
+               return true;
        }
 
-       p = select_bad_process(&points, totalpages, mpol_mask, force_kill);
+       p = select_bad_process(oc, &points, totalpages);
        /* Found nothing?!?! Either we hang forever, or we panic. */
-       if (!p) {
-               dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
+       if (!p && oc->order != -1) {
+               dump_header(oc, NULL, NULL);
                panic("Out of memory and no killable processes...\n");
        }
-       if (p != (void *)-1UL) {
-               oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
-                                nodemask, "Out of memory");
-               killed = 1;
-       }
-out:
-       /*
-        * Give the killed threads a good chance of exiting before trying to
-        * allocate memory again.
-        */
-       if (killed)
+       if (p && p != (void *)-1UL) {
+               oom_kill_process(oc, p, points, totalpages, NULL,
+                                "Out of memory");
+               /*
+                * Give the killed process a good chance to exit before trying
+                * to allocate memory again.
+                */
                schedule_timeout_killable(1);
-
+       }
        return true;
 }
 
@@ -728,13 +711,20 @@ out:
  */
 void pagefault_out_of_memory(void)
 {
+       struct oom_control oc = {
+               .zonelist = NULL,
+               .nodemask = NULL,
+               .gfp_mask = 0,
+               .order = 0,
+       };
+
        if (mem_cgroup_oom_synchronize(true))
                return;
 
        if (!mutex_trylock(&oom_lock))
                return;
 
-       if (!out_of_memory(NULL, 0, 0, NULL, false)) {
+       if (!out_of_memory(&oc)) {
                /*
                 * There shouldn't be any user tasks runnable while the
                 * OOM killer is disabled, so the current task has to
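
The oom_kill.c hunks above collapse the zonelist/gfp_mask/order/nodemask/
force_kill argument bundle into a single struct oom_control, with order == -1
taking over the role of the old force_kill flag (sysrq-triggered kills, which
must neither panic nor be aborted). A minimal sketch of the struct, assuming it
carries exactly the fields initialised in this diff:

	struct oom_control {
		struct zonelist	*zonelist;
		nodemask_t	*nodemask;
		gfp_t		gfp_mask;
		int		order;		/* -1 means a sysrq-forced kill */
	};
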
index 5cccc127ef81f1d64ca46f9ce9ad50f519d4ea9f..0a931cdd4f6baaa96cdfab2e0dd668c0abef8809 100644 (file)
@@ -1289,7 +1289,7 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
        wb->dirty_ratelimit = max(dirty_ratelimit, 1UL);
        wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
 
-       trace_bdi_dirty_ratelimit(wb->bdi, dirty_rate, task_ratelimit);
+       trace_bdi_dirty_ratelimit(wb, dirty_rate, task_ratelimit);
 }
 
 static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
@@ -1683,7 +1683,7 @@ static void balance_dirty_pages(struct address_space *mapping,
                 * do a reset, as it may be a light dirtier.
                 */
                if (pause < min_pause) {
-                       trace_balance_dirty_pages(bdi,
+                       trace_balance_dirty_pages(wb,
                                                  sdtc->thresh,
                                                  sdtc->bg_thresh,
                                                  sdtc->dirty,
@@ -1712,7 +1712,7 @@ static void balance_dirty_pages(struct address_space *mapping,
                }
 
 pause:
-               trace_balance_dirty_pages(bdi,
+               trace_balance_dirty_pages(wb,
                                          sdtc->thresh,
                                          sdtc->bg_thresh,
                                          sdtc->dirty,
index 5b5240b7f642de179efa3552245fbf9326d7a206..48aaf7b9f253e6ea68587caa1e7e3e254905936a 100644 (file)
@@ -125,6 +125,24 @@ unsigned long dirty_balance_reserve __read_mostly;
 int percpu_pagelist_fraction;
 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
 
+/*
+ * A cached value of the page's pageblock's migratetype, used when the page is
+ * put on a pcplist. Used to avoid the pageblock migratetype lookup when
+ * freeing from pcplists in most cases, at the cost of possibly becoming stale.
+ * Also the migratetype set in the page does not necessarily match the pcplist
+ * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
+ * other index - this ensures that it will be put on the correct CMA freelist.
+ */
+static inline int get_pcppage_migratetype(struct page *page)
+{
+       return page->index;
+}
+
+static inline void set_pcppage_migratetype(struct page *page, int migratetype)
+{
+       page->index = migratetype;
+}
+
 #ifdef CONFIG_PM_SLEEP
 /*
  * The following functions are used by the suspend/hibernate code to temporarily
@@ -206,6 +224,9 @@ static char * const zone_names[MAX_NR_ZONES] = {
         "HighMem",
 #endif
         "Movable",
+#ifdef CONFIG_ZONE_DEVICE
+        "Device",
+#endif
 };
 
 int min_free_kbytes = 1024;
@@ -788,7 +809,11 @@ static void free_pcppages_bulk(struct zone *zone, int count,
                        page = list_entry(list->prev, struct page, lru);
                        /* must delete as __free_one_page list manipulates */
                        list_del(&page->lru);
-                       mt = get_freepage_migratetype(page);
+
+                       mt = get_pcppage_migratetype(page);
+                       /* MIGRATE_ISOLATE page should not go to pcplists */
+                       VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
+                       /* Pageblock could have been isolated meanwhile */
                        if (unlikely(has_isolate_pageblock(zone)))
                                mt = get_pageblock_migratetype(page);
 
@@ -952,7 +977,6 @@ static void __free_pages_ok(struct page *page, unsigned int order)
        migratetype = get_pfnblock_migratetype(page, pfn);
        local_irq_save(flags);
        __count_vm_events(PGFREE, 1 << order);
-       set_freepage_migratetype(page, migratetype);
        free_one_page(page_zone(page), page, pfn, order, migratetype);
        local_irq_restore(flags);
 }
@@ -1380,7 +1404,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
                rmv_page_order(page);
                area->nr_free--;
                expand(zone, page, order, current_order, area, migratetype);
-               set_freepage_migratetype(page, migratetype);
+               set_pcppage_migratetype(page, migratetype);
                return page;
        }
 
@@ -1457,7 +1481,6 @@ int move_freepages(struct zone *zone,
                order = page_order(page);
                list_move(&page->lru,
                          &zone->free_area[order].free_list[migratetype]);
-               set_freepage_migratetype(page, migratetype);
                page += 1 << order;
                pages_moved += 1 << order;
        }
@@ -1627,14 +1650,13 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
                expand(zone, page, order, current_order, area,
                                        start_migratetype);
                /*
-                * The freepage_migratetype may differ from pageblock's
+                * The pcppage_migratetype may differ from pageblock's
                 * migratetype depending on the decisions in
-                * try_to_steal_freepages(). This is OK as long as it
-                * does not differ for MIGRATE_CMA pageblocks. For CMA
-                * we need to make sure unallocated pages flushed from
-                * pcp lists are returned to the correct freelist.
+                * find_suitable_fallback(). This is OK as long as it does not
+                * differ for MIGRATE_CMA pageblocks. Those can be used as
+                * fallback only via special __rmqueue_cma_fallback() function
                 */
-               set_freepage_migratetype(page, start_migratetype);
+               set_pcppage_migratetype(page, start_migratetype);
 
                trace_mm_page_alloc_extfrag(page, order, current_order,
                        start_migratetype, fallback_mt);
@@ -1710,7 +1732,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
                else
                        list_add_tail(&page->lru, list);
                list = &page->lru;
-               if (is_migrate_cma(get_freepage_migratetype(page)))
+               if (is_migrate_cma(get_pcppage_migratetype(page)))
                        __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
                                              -(1 << order));
        }
@@ -1907,7 +1929,7 @@ void free_hot_cold_page(struct page *page, bool cold)
                return;
 
        migratetype = get_pfnblock_migratetype(page, pfn);
-       set_freepage_migratetype(page, migratetype);
+       set_pcppage_migratetype(page, migratetype);
        local_irq_save(flags);
        __count_vm_event(PGFREE);
 
@@ -2112,7 +2134,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
                if (!page)
                        goto failed;
                __mod_zone_freepage_state(zone, -(1 << order),
-                                         get_freepage_migratetype(page));
+                                         get_pcppage_migratetype(page));
        }
 
        __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
@@ -2693,6 +2715,12 @@ static inline struct page *
 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
        const struct alloc_context *ac, unsigned long *did_some_progress)
 {
+       struct oom_control oc = {
+               .zonelist = ac->zonelist,
+               .nodemask = ac->nodemask,
+               .gfp_mask = gfp_mask,
+               .order = order,
+       };
        struct page *page;
 
        *did_some_progress = 0;
@@ -2744,8 +2772,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
                        goto out;
        }
        /* Exhausted what can be done so it's blamo time */
-       if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false)
-                       || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
+       if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
                *did_some_progress = 1;
 out:
        mutex_unlock(&oom_lock);
@@ -3487,8 +3514,6 @@ EXPORT_SYMBOL(alloc_pages_exact);
  *
  * Like alloc_pages_exact(), but try to allocate on node nid first before falling
  * back.
- * Note this is not alloc_pages_exact_node() which allocates on a specific node,
- * but is not exact.
  */
 void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
 {
@@ -5063,7 +5088,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
 {
        unsigned long zone_start_pfn, zone_end_pfn;
 
-       /* When hotadd a new node, the node should be empty */
+       /* When hotadding a new node from cpu_up(), the node should be empty */
        if (!node_start_pfn && !node_end_pfn)
                return 0;
 
@@ -5130,7 +5155,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
        unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
        unsigned long zone_start_pfn, zone_end_pfn;
 
-       /* When hotadd a new node, the node should be empty */
+       /* When hotadding a new node from cpu_up(), the node should be empty */
        if (!node_start_pfn && !node_end_pfn)
                return 0;
 
@@ -5303,8 +5328,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
  *
  * NOTE: pgdat should get zeroed by caller.
  */
-static void __paginginit free_area_init_core(struct pglist_data *pgdat,
-               unsigned long node_start_pfn, unsigned long node_end_pfn)
+static void __paginginit free_area_init_core(struct pglist_data *pgdat)
 {
        enum zone_type j;
        int nid = pgdat->node_id;
@@ -5455,7 +5479,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
        get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
        pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
-               (u64)start_pfn << PAGE_SHIFT, ((u64)end_pfn << PAGE_SHIFT) - 1);
+               (u64)start_pfn << PAGE_SHIFT,
+               end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
 #endif
        calculate_node_totalpages(pgdat, start_pfn, end_pfn,
                                  zones_size, zholes_size);
@@ -5467,7 +5492,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
                (unsigned long)pgdat->node_mem_map);
 #endif
 
-       free_area_init_core(pgdat, start_pfn, end_pfn);
+       free_area_init_core(pgdat);
 }
 
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
@@ -5478,11 +5503,9 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
  */
 void __init setup_nr_node_ids(void)
 {
-       unsigned int node;
-       unsigned int highest = 0;
+       unsigned int highest;
 
-       for_each_node_mask(node, node_possible_map)
-               highest = node;
+       highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES);
        nr_node_ids = highest + 1;
 }
 #endif
@@ -6003,7 +6026,7 @@ void __init mem_init_print_info(const char *str)
  * set_dma_reserve - set the specified number of pages reserved in the first zone
  * @new_dma_reserve: The number of pages to mark reserved
  *
- * The per-cpu batchsize and zone watermarks are determined by present_pages.
+ * The per-cpu batchsize and zone watermarks are determined by managed_pages.
  * In the DMA zone, a significant percentage may be consumed by kernel image
  * and other unfreeable allocations which can skew the watermarks badly. This
  * function may optionally be used to account for unfreeable pages in the
@@ -6056,7 +6079,7 @@ void __init page_alloc_init(void)
 }
 
 /*
- * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio
+ * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
  *     or min_free_kbytes changes.
  */
 static void calculate_totalreserve_pages(void)
@@ -6100,7 +6123,7 @@ static void calculate_totalreserve_pages(void)
 
 /*
  * setup_per_zone_lowmem_reserve - called whenever
- *     sysctl_lower_zone_reserve_ratio changes.  Ensures that each zone
+ *     sysctl_lowmem_reserve_ratio changes.  Ensures that each zone
  *     has a correct pages reserved value, so an adequate number of
  *     pages are left in the zone after a successful __alloc_pages().
  */
index d86fd2f5353fcb05d39c27887f64b81e148a2bc6..292ca7b8debd2c27c87d056e1ea4872d3094c756 100644 (file)
@@ -6,6 +6,7 @@
 #include <linux/vmalloc.h>
 #include <linux/kmemleak.h>
 #include <linux/page_owner.h>
+#include <linux/page_idle.h>
 
 /*
  * struct page extension
@@ -59,6 +60,9 @@ static struct page_ext_operations *page_ext_ops[] = {
 #ifdef CONFIG_PAGE_OWNER
        &page_owner_ops,
 #endif
+#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT)
+       &page_idle_ops,
+#endif
 };
 
 static unsigned long total_usage;
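
The page_ext hook above registers page_idle_ops only for 32-bit builds; 64-bit
kernels are expected to keep the young/idle state in page flags. The new
mm/page_idle.c added just below exposes the per-pfn idle bits as a sysfs bitmap
of u64 chunks. A hypothetical userspace sketch that counts currently idle pages
(the /sys/kernel/mm/page_idle/bitmap path is an assumption based on this
series' documented interface; the 8-byte chunk layout follows the read handler
below):

	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDONLY);
		uint64_t chunk;
		off_t pos = 0;
		unsigned long idle = 0;

		if (fd < 0)
			return 1;
		/* each u64 covers 64 consecutive pfns; a set bit means the page is idle */
		while (pread(fd, &chunk, sizeof(chunk), pos) == (ssize_t)sizeof(chunk)) {
			idle += __builtin_popcountll(chunk);
			pos += sizeof(chunk);
		}
		printf("idle pages: %lu\n", idle);
		close(fd);
		return 0;
	}
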
diff --git a/mm/page_idle.c b/mm/page_idle.c
new file mode 100644 (file)
index 0000000..d5dd790
--- /dev/null
@@ -0,0 +1,232 @@
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/fs.h>
+#include <linux/sysfs.h>
+#include <linux/kobject.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
+#include <linux/mmu_notifier.h>
+#include <linux/page_ext.h>
+#include <linux/page_idle.h>
+
+#define BITMAP_CHUNK_SIZE      sizeof(u64)
+#define BITMAP_CHUNK_BITS      (BITMAP_CHUNK_SIZE * BITS_PER_BYTE)
+
+/*
+ * Idle page tracking only considers user memory pages, for other types of
+ * pages the idle flag is always unset and an attempt to set it is silently
+ * ignored.
+ *
+ * We treat a page as a user memory page if it is on an LRU list, because it is
+ * always safe to pass such a page to rmap_walk(), which is essential for idle
+ * page tracking. With such an indicator of user pages we can skip isolated
+ * pages, but since there are not usually many of them, it will hardly affect
+ * the overall result.
+ *
+ * This function tries to get a user memory page by pfn as described above.
+ */
+static struct page *page_idle_get_page(unsigned long pfn)
+{
+       struct page *page;
+       struct zone *zone;
+
+       if (!pfn_valid(pfn))
+               return NULL;
+
+       page = pfn_to_page(pfn);
+       if (!page || !PageLRU(page) ||
+           !get_page_unless_zero(page))
+               return NULL;
+
+       zone = page_zone(page);
+       spin_lock_irq(&zone->lru_lock);
+       if (unlikely(!PageLRU(page))) {
+               put_page(page);
+               page = NULL;
+       }
+       spin_unlock_irq(&zone->lru_lock);
+       return page;
+}
+
+static int page_idle_clear_pte_refs_one(struct page *page,
+                                       struct vm_area_struct *vma,
+                                       unsigned long addr, void *arg)
+{
+       struct mm_struct *mm = vma->vm_mm;
+       spinlock_t *ptl;
+       pmd_t *pmd;
+       pte_t *pte;
+       bool referenced = false;
+
+       if (unlikely(PageTransHuge(page))) {
+               pmd = page_check_address_pmd(page, mm, addr,
+                                            PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
+               if (pmd) {
+                       referenced = pmdp_clear_young_notify(vma, addr, pmd);
+                       spin_unlock(ptl);
+               }
+       } else {
+               pte = page_check_address(page, mm, addr, &ptl, 0);
+               if (pte) {
+                       referenced = ptep_clear_young_notify(vma, addr, pte);
+                       pte_unmap_unlock(pte, ptl);
+               }
+       }
+       if (referenced) {
+               clear_page_idle(page);
+               /*
+                * We cleared the referenced bit in a mapping to this page. To
+                * avoid interference with page reclaim, mark it young so that
+                * page_referenced() will return > 0.
+                */
+               set_page_young(page);
+       }
+       return SWAP_AGAIN;
+}
+
+static void page_idle_clear_pte_refs(struct page *page)
+{
+       /*
+        * Since rwc.arg is unused, rwc is effectively immutable, so we
+        * can make it static const to save some cycles and stack.
+        */
+       static const struct rmap_walk_control rwc = {
+               .rmap_one = page_idle_clear_pte_refs_one,
+               .anon_lock = page_lock_anon_vma_read,
+       };
+       bool need_lock;
+
+       if (!page_mapped(page) ||
+           !page_rmapping(page))
+               return;
+
+       need_lock = !PageAnon(page) || PageKsm(page);
+       if (need_lock && !trylock_page(page))
+               return;
+
+       rmap_walk(page, (struct rmap_walk_control *)&rwc);
+
+       if (need_lock)
+               unlock_page(page);
+}
+
+static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
+                                    struct bin_attribute *attr, char *buf,
+                                    loff_t pos, size_t count)
+{
+       u64 *out = (u64 *)buf;
+       struct page *page;
+       unsigned long pfn, end_pfn;
+       int bit;
+
+       if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
+               return -EINVAL;
+
+       pfn = pos * BITS_PER_BYTE;
+       if (pfn >= max_pfn)
+               return 0;
+
+       end_pfn = pfn + count * BITS_PER_BYTE;
+       if (end_pfn > max_pfn)
+               end_pfn = ALIGN(max_pfn, BITMAP_CHUNK_BITS);
+
+       for (; pfn < end_pfn; pfn++) {
+               bit = pfn % BITMAP_CHUNK_BITS;
+               if (!bit)
+                       *out = 0ULL;
+               page = page_idle_get_page(pfn);
+               if (page) {
+                       if (page_is_idle(page)) {
+                               /*
+                                * The page might have been referenced via a
+                                * pte, in which case it is not idle. Clear
+                                * refs and recheck.
+                                */
+                               page_idle_clear_pte_refs(page);
+                               if (page_is_idle(page))
+                                       *out |= 1ULL << bit;
+                       }
+                       put_page(page);
+               }
+               if (bit == BITMAP_CHUNK_BITS - 1)
+                       out++;
+               cond_resched();
+       }
+       return (char *)out - buf;
+}
+
+static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj,
+                                     struct bin_attribute *attr, char *buf,
+                                     loff_t pos, size_t count)
+{
+       const u64 *in = (u64 *)buf;
+       struct page *page;
+       unsigned long pfn, end_pfn;
+       int bit;
+
+       if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
+               return -EINVAL;
+
+       pfn = pos * BITS_PER_BYTE;
+       if (pfn >= max_pfn)
+               return -ENXIO;
+
+       end_pfn = pfn + count * BITS_PER_BYTE;
+       if (end_pfn > max_pfn)
+               end_pfn = ALIGN(max_pfn, BITMAP_CHUNK_BITS);
+
+       for (; pfn < end_pfn; pfn++) {
+               bit = pfn % BITMAP_CHUNK_BITS;
+               if ((*in >> bit) & 1) {
+                       page = page_idle_get_page(pfn);
+                       if (page) {
+                               page_idle_clear_pte_refs(page);
+                               set_page_idle(page);
+                               put_page(page);
+                       }
+               }
+               if (bit == BITMAP_CHUNK_BITS - 1)
+                       in++;
+               cond_resched();
+       }
+       return (char *)in - buf;
+}
+
+static struct bin_attribute page_idle_bitmap_attr =
+               __BIN_ATTR(bitmap, S_IRUSR | S_IWUSR,
+                          page_idle_bitmap_read, page_idle_bitmap_write, 0);
+
+static struct bin_attribute *page_idle_bin_attrs[] = {
+       &page_idle_bitmap_attr,
+       NULL,
+};
+
+static struct attribute_group page_idle_attr_group = {
+       .bin_attrs = page_idle_bin_attrs,
+       .name = "page_idle",
+};
+
+#ifndef CONFIG_64BIT
+static bool need_page_idle(void)
+{
+       return true;
+}
+struct page_ext_operations page_idle_ops = {
+       .need = need_page_idle,
+};
+#endif
+
+static int __init page_idle_init(void)
+{
+       int err;
+
+       err = sysfs_create_group(mm_kobj, &page_idle_attr_group);
+       if (err) {
+               pr_err("page_idle: register sysfs failed\n");
+               return err;
+       }
+       return 0;
+}
+subsys_initcall(page_idle_init);
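
The new file registers its attribute group on mm_kobj, so it appears as /sys/kernel/mm/page_idle/bitmap: a binary file of u64 chunks in which bit N corresponds to page frame N, readable and writable only in multiples of 8 bytes, and tracking only LRU (user) pages as the comment above explains. A minimal userspace sketch (illustrative, not part of the patch; assumes root and a kernel with idle page tracking enabled) that marks the first 64 frames idle and later reads back which of them stayed idle:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	uint64_t chunk = ~0ULL;		/* one chunk covers PFNs 0..63 */
	int fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDWR);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* offset 0, 8 bytes: set the idle bit for PFNs 0..63 */
	if (pwrite(fd, &chunk, sizeof(chunk), 0) != sizeof(chunk))
		perror("pwrite");

	/* ... let the workload of interest run for a while ... */

	/* bits still set belong to frames nothing touched in the meantime */
	if (pread(fd, &chunk, sizeof(chunk), 0) == sizeof(chunk))
		printf("still-idle mask for PFNs 0-63: 0x%016llx\n",
		       (unsigned long long)chunk);
	close(fd);
	return 0;
}
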
index 303c908790efca6f7d0b30cc6d8a9db918085e10..4568fd58f70a02c472e525528db58eeb20c06ebf 100644 (file)
@@ -9,7 +9,8 @@
 #include <linux/hugetlb.h>
 #include "internal.h"
 
-int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages)
+static int set_migratetype_isolate(struct page *page,
+                               bool skip_hwpoisoned_pages)
 {
        struct zone *zone;
        unsigned long flags, pfn;
@@ -72,7 +73,7 @@ out:
        return ret;
 }
 
-void unset_migratetype_isolate(struct page *page, unsigned migratetype)
+static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
 {
        struct zone *zone;
        unsigned long flags, nr_pages;
@@ -223,34 +224,16 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
                        continue;
                }
                page = pfn_to_page(pfn);
-               if (PageBuddy(page)) {
+               if (PageBuddy(page))
                        /*
-                        * If race between isolatation and allocation happens,
-                        * some free pages could be in MIGRATE_MOVABLE list
-                        * although pageblock's migratation type of the page
-                        * is MIGRATE_ISOLATE. Catch it and move the page into
-                        * MIGRATE_ISOLATE list.
+                        * If the page is on a free list, it has to be on
+                        * the correct MIGRATE_ISOLATE freelist. There is no
+                        * simple way to verify that as VM_BUG_ON(), though.
                         */
-                       if (get_freepage_migratetype(page) != MIGRATE_ISOLATE) {
-                               struct page *end_page;
-
-                               end_page = page + (1 << page_order(page)) - 1;
-                               move_freepages(page_zone(page), page, end_page,
-                                               MIGRATE_ISOLATE);
-                       }
                        pfn += 1 << page_order(page);
-               }
-               else if (page_count(page) == 0 &&
-                       get_freepage_migratetype(page) == MIGRATE_ISOLATE)
-                       pfn += 1;
-               else if (skip_hwpoisoned_pages && PageHWPoison(page)) {
-                       /*
-                        * The HWPoisoned page may be not in buddy
-                        * system, and page_count() is not 0.
-                        */
+               else if (skip_hwpoisoned_pages && PageHWPoison(page))
+                       /* A HWPoisoned page cannot also be PageBuddy */
                        pfn++;
-                       continue;
-               }
                else
                        break;
        }
index 0db38e7d0a72b20ce63a6653ba24934ac3ce7825..f5b5c1f3dcd755ae313bba1404f2c9b079d5c18f 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -59,6 +59,7 @@
 #include <linux/migrate.h>
 #include <linux/hugetlb.h>
 #include <linux/backing-dev.h>
+#include <linux/page_idle.h>
 
 #include <asm/tlbflush.h>
 
@@ -886,6 +887,11 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
                pte_unmap_unlock(pte, ptl);
        }
 
+       if (referenced)
+               clear_page_idle(page);
+       if (test_and_clear_page_young(page))
+               referenced++;
+
        if (referenced) {
                pra->referenced++;
                pra->vm_flags |= vma->vm_flags;
index dbe0c1e8349c72ac569a58289da702a841104951..48ce82926d931bef026baf16a971add9d00c45fd 100644 (file)
@@ -542,6 +542,21 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 }
 EXPORT_SYMBOL_GPL(shmem_truncate_range);
 
+static int shmem_getattr(struct vfsmount *mnt, struct dentry *dentry,
+                        struct kstat *stat)
+{
+       struct inode *inode = dentry->d_inode;
+       struct shmem_inode_info *info = SHMEM_I(inode);
+
+       spin_lock(&info->lock);
+       shmem_recalc_inode(inode);
+       spin_unlock(&info->lock);
+
+       generic_fillattr(inode, stat);
+
+       return 0;
+}
+
 static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
 {
        struct inode *inode = d_inode(dentry);
@@ -3122,6 +3137,7 @@ static const struct file_operations shmem_file_operations = {
 };
 
 static const struct inode_operations shmem_inode_operations = {
+       .getattr        = shmem_getattr,
        .setattr        = shmem_setattr,
 #ifdef CONFIG_TMPFS_XATTR
        .setxattr       = shmem_setxattr,
index 60c936938b8486b1763c8f9477b479a5d4a54dc4..c77ebe6cc87cd3066f24fd9e3682699448689fa8 100644 (file)
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1595,7 +1595,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
        if (memcg_charge_slab(cachep, flags, cachep->gfporder))
                return NULL;
 
-       page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
+       page = __alloc_pages_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
        if (!page) {
                memcg_uncharge_slab(cachep, cachep->gfporder);
                slab_out_of_memory(cachep, flags, nodeid);
index c26829fe4e37ea0b38ce6c26fef84a7dcb528eee..5ce4faeb16fbbdfa19b16c3d551aace8c3a495c0 100644 (file)
@@ -500,7 +500,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
                             struct kmem_cache *root_cache)
 {
        static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */
-       struct cgroup_subsys_state *css = mem_cgroup_css(memcg);
+       struct cgroup_subsys_state *css = &memcg->css;
        struct memcg_cache_array *arr;
        struct kmem_cache *s = NULL;
        char *cache_name;
@@ -640,6 +640,9 @@ void kmem_cache_destroy(struct kmem_cache *s)
        bool need_rcu_barrier = false;
        bool busy = false;
 
+       if (unlikely(!s))
+               return;
+
        BUG_ON(!is_root_cache(s));
 
        get_online_cpus();
index 165bbd3cd60626e0aa8b0c98ba18c2d327af6117..0d7e5df74d1f03e7069d1d938452b9474bc9a969 100644 (file)
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -45,7 +45,7 @@
  * NUMA support in SLOB is fairly simplistic, pushing most of the real
  * logic down to the page allocator, and simply doing the node accounting
  * on the upper levels. In the event that a node id is explicitly
- * provided, alloc_pages_exact_node() with the specified node id is used
+ * provided, __alloc_pages_node() with the specified node id is used
  * instead. The common case (or when the node id isn't explicitly provided)
  * will default to the current node, as per numa_node_id().
  *
@@ -193,7 +193,7 @@ static void *slob_new_pages(gfp_t gfp, int order, int node)
 
 #ifdef CONFIG_NUMA
        if (node != NUMA_NO_NODE)
-               page = alloc_pages_exact_node(node, gfp, order);
+               page = __alloc_pages_node(node, gfp, order);
        else
 #endif
                page = alloc_pages(gfp, order);
index 084184e706c63184124bcf874cfe6702e0343950..f614b5dc396bc17b43cebacd97383243bbb03b99 100644 (file)
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1334,7 +1334,7 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s,
        if (node == NUMA_NO_NODE)
                page = alloc_pages(flags, order);
        else
-               page = alloc_pages_exact_node(node, flags, order);
+               page = __alloc_pages_node(node, flags, order);
 
        if (!page)
                memcg_uncharge_slab(s, order);
index a3a0a2f1f7c3dc48c43494b949af6aee66adcf8f..983f692a47fdfbb80505fa77f673b9af37d08739 100644 (file)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -32,6 +32,7 @@
 #include <linux/gfp.h>
 #include <linux/uio.h>
 #include <linux/hugetlb.h>
+#include <linux/page_idle.h>
 
 #include "internal.h"
 
@@ -622,6 +623,8 @@ void mark_page_accessed(struct page *page)
        } else if (!PageReferenced(page)) {
                SetPageReferenced(page);
        }
+       if (page_is_idle(page))
+               clear_page_idle(page);
 }
 EXPORT_SYMBOL(mark_page_accessed);
 
index 8bc8e66138da1baec2b1420564deb5745f19c990..d504adb7fa5f08ced98eeb2a285976c0db64a9ae 100644 (file)
@@ -288,17 +288,14 @@ struct page * lookup_swap_cache(swp_entry_t entry)
        return page;
 }
 
-/* 
- * Locate a page of swap in physical memory, reserving swap cache space
- * and reading the disk if it is not already cached.
- * A failure return means that either the page allocation failed or that
- * the swap entry is no longer in use.
- */
-struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
-                       struct vm_area_struct *vma, unsigned long addr)
+struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
+                       struct vm_area_struct *vma, unsigned long addr,
+                       bool *new_page_allocated)
 {
        struct page *found_page, *new_page = NULL;
+       struct address_space *swapper_space = swap_address_space(entry);
        int err;
+       *new_page_allocated = false;
 
        do {
                /*
@@ -306,8 +303,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
                 * called after lookup_swap_cache() failed, re-calling
                 * that would confuse statistics.
                 */
-               found_page = find_get_page(swap_address_space(entry),
-                                       entry.val);
+               found_page = find_get_page(swapper_space, entry.val);
                if (found_page)
                        break;
 
@@ -366,7 +362,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
                         * Initiate read into locked page and return.
                         */
                        lru_cache_add_anon(new_page);
-                       swap_readpage(new_page);
+                       *new_page_allocated = true;
                        return new_page;
                }
                radix_tree_preload_end();
@@ -384,6 +380,25 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
        return found_page;
 }
 
+/*
+ * Locate a page of swap in physical memory, reserving swap cache space
+ * and reading the disk if it is not already cached.
+ * A failure return means that either the page allocation failed or that
+ * the swap entry is no longer in use.
+ */
+struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
+                       struct vm_area_struct *vma, unsigned long addr)
+{
+       bool page_was_allocated;
+       struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
+                       vma, addr, &page_was_allocated);
+
+       if (page_was_allocated)
+               swap_readpage(retpage);
+
+       return retpage;
+}
+
 static unsigned long swapin_nr_pages(unsigned long offset)
 {
        static unsigned long prev_offset;
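
The split above leaves read_swap_cache_async() as a thin wrapper and lets other callers decide what to do when the page had to be freshly added to the swap cache. A hypothetical caller (sketch only, not from this patch; swapin_counted and nr_misses are made-up names) that also counts swap-cache misses could look like:

/* assumes the usual mm/ context: linux/swap.h, linux/mm_types.h */
static struct page *swapin_counted(swp_entry_t entry, gfp_t gfp,
				   struct vm_area_struct *vma,
				   unsigned long addr,
				   unsigned long *nr_misses)
{
	bool new_page_allocated;
	struct page *page;

	page = __read_swap_cache_async(entry, gfp, vma, addr,
				       &new_page_allocated);
	if (page && new_page_allocated) {
		(*nr_misses)++;
		swap_readpage(page);	/* only newly added pages need the read */
	}
	return page;
}
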
index aebc2dd6e64975e2589429a0e22929f96c4d7532..58877312cf6b94b74da00b8913bf8db64d56ab26 100644 (file)
@@ -874,6 +874,48 @@ int page_swapcount(struct page *page)
        return count;
 }
 
+/*
+ * How many references to @entry are currently swapped out?
+ * This considers COUNT_CONTINUED so it returns the exact answer.
+ */
+int swp_swapcount(swp_entry_t entry)
+{
+       int count, tmp_count, n;
+       struct swap_info_struct *p;
+       struct page *page;
+       pgoff_t offset;
+       unsigned char *map;
+
+       p = swap_info_get(entry);
+       if (!p)
+               return 0;
+
+       count = swap_count(p->swap_map[swp_offset(entry)]);
+       if (!(count & COUNT_CONTINUED))
+               goto out;
+
+       count &= ~COUNT_CONTINUED;
+       n = SWAP_MAP_MAX + 1;
+
+       offset = swp_offset(entry);
+       page = vmalloc_to_page(p->swap_map + offset);
+       offset &= ~PAGE_MASK;
+       VM_BUG_ON(page_private(page) != SWP_CONTINUED);
+
+       do {
+               page = list_entry(page->lru.next, struct page, lru);
+               map = kmap_atomic(page);
+               tmp_count = map[offset];
+               kunmap_atomic(map);
+
+               count += (tmp_count & ~COUNT_CONTINUED) * n;
+               n *= (SWAP_CONT_MAX + 1);
+       } while (tmp_count & COUNT_CONTINUED);
+out:
+       spin_unlock(&p->lock);
+       return count;
+}
+
 /*
  * We can write to an anon page without COW if there are no other references
  * to it.  And as a side-effect, free up its swap: because the old content
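
To make the continuation arithmetic in swp_swapcount() concrete: assuming the usual swap_map encoding in this file (SWAP_MAP_MAX = 0x3e, SWAP_CONT_MAX = 0x7f), the exact count it computes is

	count = base + c1 * (SWAP_MAP_MAX + 1)
	             + c2 * (SWAP_MAP_MAX + 1) * (SWAP_CONT_MAX + 1) + ...

where base is the masked swap_map byte and c1, c2, ... are the bytes read from successive continuation pages. For example, a base of 0x3e (62) with a single continuation byte of 5 gives 62 + 5 * 63 = 377 references, which page_swapcount() alone cannot report because it does not follow continuations.
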
index b1139039122a05389019aa569b82dcde442f25d7..2d978b28a410b25df1acde351630dee387efbbe5 100644 (file)
@@ -175,7 +175,7 @@ static bool sane_reclaim(struct scan_control *sc)
        if (!memcg)
                return true;
 #ifdef CONFIG_CGROUP_WRITEBACK
-       if (cgroup_on_dfl(mem_cgroup_css(memcg)->cgroup))
+       if (memcg->css.cgroup)
                return true;
 #endif
        return false;
@@ -985,7 +985,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                 *    __GFP_IO|__GFP_FS for this reason); but more thought
                 *    would probably show more reasons.
                 *
-                * 3) Legacy memcg encounters a page that is not already marked
+                * 3) Legacy memcg encounters a page that is already marked
                 *    PageReclaim. memcg does not have any dirty pages
                 *    throttling so we could easily OOM just because too many
                 *    pages are in writeback and there is nothing else to
@@ -1015,12 +1015,15 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                                 */
                                SetPageReclaim(page);
                                nr_writeback++;
-
                                goto keep_locked;
 
                        /* Case 3 above */
                        } else {
+                               unlock_page(page);
                                wait_on_page_writeback(page);
+                               /* then go back and try same page again */
+                               list_add_tail(&page->lru, page_list);
+                               continue;
                        }
                }
 
@@ -1196,7 +1199,7 @@ cull_mlocked:
                if (PageSwapCache(page))
                        try_to_free_swap(page);
                unlock_page(page);
-               putback_lru_page(page);
+               list_add(&page->lru, &ret_pages);
                continue;
 
 activate_locked:
@@ -1359,7 +1362,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
        unsigned long nr_taken = 0;
        unsigned long scan;
 
-       for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
+       for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan &&
+                                       !list_empty(src); scan++) {
                struct page *page;
                int nr_pages;
 
index f3bf6f7627d8d103bb674a99ed504092f15499a9..fa48bcdff9d5b921d6a52b6d203fa4e89d7c7cb6 100644 (file)
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -96,10 +96,10 @@ struct zbud_pool {
        struct list_head buddied;
        struct list_head lru;
        u64 pages_nr;
-       struct zbud_ops *ops;
+       const struct zbud_ops *ops;
 #ifdef CONFIG_ZPOOL
        struct zpool *zpool;
-       struct zpool_ops *zpool_ops;
+       const struct zpool_ops *zpool_ops;
 #endif
 };
 
@@ -133,12 +133,12 @@ static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle)
                return -ENOENT;
 }
 
-static struct zbud_ops zbud_zpool_ops = {
+static const struct zbud_ops zbud_zpool_ops = {
        .evict =        zbud_zpool_evict
 };
 
 static void *zbud_zpool_create(char *name, gfp_t gfp,
-                              struct zpool_ops *zpool_ops,
+                              const struct zpool_ops *zpool_ops,
                               struct zpool *zpool)
 {
        struct zbud_pool *pool;
@@ -302,7 +302,7 @@ static int num_free_chunks(struct zbud_header *zhdr)
  * Return: pointer to the new zbud pool or NULL if the metadata allocation
  * failed.
  */
-struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops)
+struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops)
 {
        struct zbud_pool *pool;
        int i;
index 722a4f60e90b29cd3626cdc4acddc1594ab7a0a5..8f670d3e87060f6277f5651a79cfaa8d27a30713 100644 (file)
@@ -22,7 +22,7 @@ struct zpool {
 
        struct zpool_driver *driver;
        void *pool;
-       struct zpool_ops *ops;
+       const struct zpool_ops *ops;
 
        struct list_head list;
 };
@@ -99,6 +99,39 @@ static void zpool_put_driver(struct zpool_driver *driver)
        module_put(driver->owner);
 }
 
+/**
+ * zpool_has_pool() - Check if the pool driver is available
+ * @type       The type of the zpool to check (e.g. zbud, zsmalloc)
+ *
+ * This checks if the @type pool driver is available.  This will try to load
+ * the requested module, if needed, but there is no guarantee the module will
+ * still be loaded and available immediately after calling.  If this returns
+ * true, the caller should assume the pool is available, but must be prepared
+ * to handle the @zpool_create_pool() returning failure.  However if this
+ * returns false, the caller should assume the requested pool type is not
+ * available; either the requested pool type module does not exist, or could
+ * not be loaded, and calling @zpool_create_pool() with the pool type will
+ * fail.
+ *
+ * Returns: true if @type pool is available, false if not
+ */
+bool zpool_has_pool(char *type)
+{
+       struct zpool_driver *driver = zpool_get_driver(type);
+
+       if (!driver) {
+               request_module("zpool-%s", type);
+               driver = zpool_get_driver(type);
+       }
+
+       if (!driver)
+               return false;
+
+       zpool_put_driver(driver);
+       return true;
+}
+EXPORT_SYMBOL(zpool_has_pool);
+
 /**
  * zpool_create_pool() - Create a new zpool
  * @type       The type of the zpool to create (e.g. zbud, zsmalloc)
@@ -115,7 +148,7 @@ static void zpool_put_driver(struct zpool_driver *driver)
  * Returns: New zpool on success, NULL on failure.
  */
 struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp,
-               struct zpool_ops *ops)
+               const struct zpool_ops *ops)
 {
        struct zpool_driver *driver;
        struct zpool *zpool;
@@ -320,20 +353,6 @@ u64 zpool_get_total_size(struct zpool *zpool)
        return zpool->driver->total_size(zpool->pool);
 }
 
-static int __init init_zpool(void)
-{
-       pr_info("loaded\n");
-       return 0;
-}
-
-static void __exit exit_zpool(void)
-{
-       pr_info("unloaded\n");
-}
-
-module_init(init_zpool);
-module_exit(exit_zpool);
-
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>");
 MODULE_DESCRIPTION("Common API for compressed memory storage");
index 0a7f81aa2249c2c9925cceadd4737ec8c88bf259..f135b1b6fcdcab49aaf0845e078c6fc299b4b28b 100644 (file)
@@ -169,14 +169,12 @@ enum zs_stat_type {
        NR_ZS_STAT_TYPE,
 };
 
-#ifdef CONFIG_ZSMALLOC_STAT
-
-static struct dentry *zs_stat_root;
-
 struct zs_size_stat {
        unsigned long objs[NR_ZS_STAT_TYPE];
 };
 
+#ifdef CONFIG_ZSMALLOC_STAT
+static struct dentry *zs_stat_root;
 #endif
 
 /*
@@ -201,6 +199,8 @@ static int zs_size_classes;
 static const int fullness_threshold_frac = 4;
 
 struct size_class {
+       spinlock_t lock;
+       struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
        /*
         * Size of objects stored in this class. Must be multiple
         * of ZS_ALIGN.
@@ -210,16 +210,10 @@ struct size_class {
 
        /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
        int pages_per_zspage;
-       /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
-       bool huge;
-
-#ifdef CONFIG_ZSMALLOC_STAT
        struct zs_size_stat stats;
-#endif
-
-       spinlock_t lock;
 
-       struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
+       /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
+       bool huge;
 };
 
 /*
@@ -251,6 +245,15 @@ struct zs_pool {
        gfp_t flags;    /* allocation flags used when growing pool */
        atomic_long_t pages_allocated;
 
+       struct zs_pool_stats stats;
+
+       /* Compact classes */
+       struct shrinker shrinker;
+       /*
+        * To signify that register_shrinker() was successful
+        * and unregister_shrinker() will not Oops.
+        */
+       bool shrinker_enabled;
 #ifdef CONFIG_ZSMALLOC_STAT
        struct dentry *stat_dentry;
 #endif
@@ -285,8 +288,7 @@ static int create_handle_cache(struct zs_pool *pool)
 
 static void destroy_handle_cache(struct zs_pool *pool)
 {
-       if (pool->handle_cachep)
-               kmem_cache_destroy(pool->handle_cachep);
+       kmem_cache_destroy(pool->handle_cachep);
 }
 
 static unsigned long alloc_handle(struct zs_pool *pool)
@@ -309,7 +311,8 @@ static void record_obj(unsigned long handle, unsigned long obj)
 
 #ifdef CONFIG_ZPOOL
 
-static void *zs_zpool_create(char *name, gfp_t gfp, struct zpool_ops *zpool_ops,
+static void *zs_zpool_create(char *name, gfp_t gfp,
+                            const struct zpool_ops *zpool_ops,
                             struct zpool *zpool)
 {
        return zs_create_pool(name, gfp);
@@ -441,8 +444,6 @@ static int get_size_class_index(int size)
        return min(zs_size_classes - 1, idx);
 }
 
-#ifdef CONFIG_ZSMALLOC_STAT
-
 static inline void zs_stat_inc(struct size_class *class,
                                enum zs_stat_type type, unsigned long cnt)
 {
@@ -461,6 +462,8 @@ static inline unsigned long zs_stat_get(struct size_class *class,
        return class->stats.objs[type];
 }
 
+#ifdef CONFIG_ZSMALLOC_STAT
+
 static int __init zs_stat_init(void)
 {
        if (!debugfs_initialized())
@@ -576,23 +579,6 @@ static void zs_pool_stat_destroy(struct zs_pool *pool)
 }
 
 #else /* CONFIG_ZSMALLOC_STAT */
-
-static inline void zs_stat_inc(struct size_class *class,
-                               enum zs_stat_type type, unsigned long cnt)
-{
-}
-
-static inline void zs_stat_dec(struct size_class *class,
-                               enum zs_stat_type type, unsigned long cnt)
-{
-}
-
-static inline unsigned long zs_stat_get(struct size_class *class,
-                               enum zs_stat_type type)
-{
-       return 0;
-}
-
 static int __init zs_stat_init(void)
 {
        return 0;
@@ -610,7 +596,6 @@ static inline int zs_pool_stat_create(char *name, struct zs_pool *pool)
 static inline void zs_pool_stat_destroy(struct zs_pool *pool)
 {
 }
-
 #endif
 
 
@@ -658,13 +643,22 @@ static void insert_zspage(struct page *page, struct size_class *class,
        if (fullness >= _ZS_NR_FULLNESS_GROUPS)
                return;
 
-       head = &class->fullness_list[fullness];
-       if (*head)
-               list_add_tail(&page->lru, &(*head)->lru);
-
-       *head = page;
        zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ?
                        CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
+
+       head = &class->fullness_list[fullness];
+       if (!*head) {
+               *head = page;
+               return;
+       }
+
+       /*
+        * We want to see more ZS_FULL pages and less almost
+        * empty/full. Put pages with higher ->inuse first.
+        */
+       list_add_tail(&page->lru, &(*head)->lru);
+       if (page->inuse >= (*head)->inuse)
+               *head = page;
 }
 
 /*
@@ -1495,7 +1489,7 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
 }
 EXPORT_SYMBOL_GPL(zs_free);
 
-static void zs_object_copy(unsigned long src, unsigned long dst,
+static void zs_object_copy(unsigned long dst, unsigned long src,
                                struct size_class *class)
 {
        struct page *s_page, *d_page;
@@ -1602,8 +1596,6 @@ struct zs_compact_control {
         /* Starting object index within @s_page which used for live object
          * in the subpage. */
        int index;
-       /* how many of objects are migrated */
-       int nr_migrated;
 };
 
 static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
@@ -1614,7 +1606,6 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
        struct page *s_page = cc->s_page;
        struct page *d_page = cc->d_page;
        unsigned long index = cc->index;
-       int nr_migrated = 0;
        int ret = 0;
 
        while (1) {
@@ -1636,23 +1627,21 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
 
                used_obj = handle_to_obj(handle);
                free_obj = obj_malloc(d_page, class, handle);
-               zs_object_copy(used_obj, free_obj, class);
+               zs_object_copy(free_obj, used_obj, class);
                index++;
                record_obj(handle, free_obj);
                unpin_tag(handle);
                obj_free(pool, class, used_obj);
-               nr_migrated++;
        }
 
        /* Remember last position in this iteration */
        cc->s_page = s_page;
        cc->index = index;
-       cc->nr_migrated = nr_migrated;
 
        return ret;
 }
 
-static struct page *alloc_target_page(struct size_class *class)
+static struct page *isolate_target_page(struct size_class *class)
 {
        int i;
        struct page *page;
@@ -1668,8 +1657,17 @@ static struct page *alloc_target_page(struct size_class *class)
        return page;
 }
 
-static void putback_zspage(struct zs_pool *pool, struct size_class *class,
-                               struct page *first_page)
+/*
+ * putback_zspage - add @first_page into the right class's fullness list
+ * @pool: target pool
+ * @class: destination class
+ * @first_page: target page
+ *
+ * Return @first_page's fullness_group
+ */
+static enum fullness_group putback_zspage(struct zs_pool *pool,
+                       struct size_class *class,
+                       struct page *first_page)
 {
        enum fullness_group fullness;
 
@@ -1687,50 +1685,72 @@ static void putback_zspage(struct zs_pool *pool, struct size_class *class,
 
                free_zspage(first_page);
        }
+
+       return fullness;
 }
 
 static struct page *isolate_source_page(struct size_class *class)
 {
-       struct page *page;
+       int i;
+       struct page *page = NULL;
 
-       page = class->fullness_list[ZS_ALMOST_EMPTY];
-       if (page)
-               remove_zspage(page, class, ZS_ALMOST_EMPTY);
+       for (i = ZS_ALMOST_EMPTY; i >= ZS_ALMOST_FULL; i--) {
+               page = class->fullness_list[i];
+               if (!page)
+                       continue;
+
+               remove_zspage(page, class, i);
+               break;
+       }
 
        return page;
 }
 
-static unsigned long __zs_compact(struct zs_pool *pool,
-                               struct size_class *class)
+/*
+ *
+ * Based on the number of unused allocated objects calculate
+ * and return the number of pages that we can free.
+ */
+static unsigned long zs_can_compact(struct size_class *class)
+{
+       unsigned long obj_wasted;
+
+       obj_wasted = zs_stat_get(class, OBJ_ALLOCATED) -
+               zs_stat_get(class, OBJ_USED);
+
+       obj_wasted /= get_maxobj_per_zspage(class->size,
+                       class->pages_per_zspage);
+
+       return obj_wasted * class->pages_per_zspage;
+}
+
+static void __zs_compact(struct zs_pool *pool, struct size_class *class)
 {
-       int nr_to_migrate;
        struct zs_compact_control cc;
        struct page *src_page;
        struct page *dst_page = NULL;
-       unsigned long nr_total_migrated = 0;
 
        spin_lock(&class->lock);
        while ((src_page = isolate_source_page(class))) {
 
                BUG_ON(!is_first_page(src_page));
 
-               /* The goal is to migrate all live objects in source page */
-               nr_to_migrate = src_page->inuse;
+               if (!zs_can_compact(class))
+                       break;
+
                cc.index = 0;
                cc.s_page = src_page;
 
-               while ((dst_page = alloc_target_page(class))) {
+               while ((dst_page = isolate_target_page(class))) {
                        cc.d_page = dst_page;
                        /*
-                        * If there is no more space in dst_page, try to
-                        * allocate another zspage.
+                        * If there is no more space in dst_page, resched
+                        * and see if anyone had allocated another zspage.
                         */
                        if (!migrate_zspage(pool, class, &cc))
                                break;
 
                        putback_zspage(pool, class, dst_page);
-                       nr_total_migrated += cc.nr_migrated;
-                       nr_to_migrate -= cc.nr_migrated;
                }
 
                /* Stop if we couldn't find slot */
@@ -1738,9 +1758,9 @@ static unsigned long __zs_compact(struct zs_pool *pool,
                        break;
 
                putback_zspage(pool, class, dst_page);
-               putback_zspage(pool, class, src_page);
+               if (putback_zspage(pool, class, src_page) == ZS_EMPTY)
+                       pool->stats.pages_compacted += class->pages_per_zspage;
                spin_unlock(&class->lock);
-               nr_total_migrated += cc.nr_migrated;
                cond_resched();
                spin_lock(&class->lock);
        }
@@ -1749,14 +1769,11 @@ static unsigned long __zs_compact(struct zs_pool *pool,
                putback_zspage(pool, class, src_page);
 
        spin_unlock(&class->lock);
-
-       return nr_total_migrated;
 }
 
 unsigned long zs_compact(struct zs_pool *pool)
 {
        int i;
-       unsigned long nr_migrated = 0;
        struct size_class *class;
 
        for (i = zs_size_classes - 1; i >= 0; i--) {
@@ -1765,13 +1782,80 @@ unsigned long zs_compact(struct zs_pool *pool)
                        continue;
                if (class->index != i)
                        continue;
-               nr_migrated += __zs_compact(pool, class);
+               __zs_compact(pool, class);
        }
 
-       return nr_migrated;
+       return pool->stats.pages_compacted;
 }
 EXPORT_SYMBOL_GPL(zs_compact);
 
+void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats)
+{
+       memcpy(stats, &pool->stats, sizeof(struct zs_pool_stats));
+}
+EXPORT_SYMBOL_GPL(zs_pool_stats);
+
+static unsigned long zs_shrinker_scan(struct shrinker *shrinker,
+               struct shrink_control *sc)
+{
+       unsigned long pages_freed;
+       struct zs_pool *pool = container_of(shrinker, struct zs_pool,
+                       shrinker);
+
+       pages_freed = pool->stats.pages_compacted;
+       /*
+        * Compact classes and calculate compaction delta.
+        * Can run concurrently with a manually triggered
+        * (by user) compaction.
+        */
+       pages_freed = zs_compact(pool) - pages_freed;
+
+       return pages_freed ? pages_freed : SHRINK_STOP;
+}
+
+static unsigned long zs_shrinker_count(struct shrinker *shrinker,
+               struct shrink_control *sc)
+{
+       int i;
+       struct size_class *class;
+       unsigned long pages_to_free = 0;
+       struct zs_pool *pool = container_of(shrinker, struct zs_pool,
+                       shrinker);
+
+       if (!pool->shrinker_enabled)
+               return 0;
+
+       for (i = zs_size_classes - 1; i >= 0; i--) {
+               class = pool->size_class[i];
+               if (!class)
+                       continue;
+               if (class->index != i)
+                       continue;
+
+               pages_to_free += zs_can_compact(class);
+       }
+
+       return pages_to_free;
+}
+
+static void zs_unregister_shrinker(struct zs_pool *pool)
+{
+       if (pool->shrinker_enabled) {
+               unregister_shrinker(&pool->shrinker);
+               pool->shrinker_enabled = false;
+       }
+}
+
+static int zs_register_shrinker(struct zs_pool *pool)
+{
+       pool->shrinker.scan_objects = zs_shrinker_scan;
+       pool->shrinker.count_objects = zs_shrinker_count;
+       pool->shrinker.batch = 0;
+       pool->shrinker.seeks = DEFAULT_SEEKS;
+
+       return register_shrinker(&pool->shrinker);
+}
+
 /**
  * zs_create_pool - Creates an allocation pool to work from.
  * @flags: allocation flags used to allocate pool metadata
@@ -1857,6 +1941,12 @@ struct zs_pool *zs_create_pool(char *name, gfp_t flags)
        if (zs_pool_stat_create(name, pool))
                goto err;
 
+       /*
+        * Not critical, we still can use the pool
+        * and user can trigger compaction manually.
+        */
+       if (zs_register_shrinker(pool) == 0)
+               pool->shrinker_enabled = true;
        return pool;
 
 err:
@@ -1869,6 +1959,7 @@ void zs_destroy_pool(struct zs_pool *pool)
 {
        int i;
 
+       zs_unregister_shrinker(pool);
        zs_pool_stat_destroy(pool);
 
        for (i = 0; i < zs_size_classes; i++) {
index 2d5727baed5988c23ca4252a601ebaf734820b04..4043df7c672fb6f5b1be298b8d510fd17a3bbf42 100644 (file)
@@ -80,85 +80,54 @@ static u64 zswap_duplicate_entry;
 static bool zswap_enabled;
 module_param_named(enabled, zswap_enabled, bool, 0644);
 
-/* Compressor to be used by zswap (fixed at boot for now) */
+/* Crypto compressor to use */
 #define ZSWAP_COMPRESSOR_DEFAULT "lzo"
-static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
-module_param_named(compressor, zswap_compressor, charp, 0444);
-
-/* The maximum percentage of memory that the compressed pool can occupy */
-static unsigned int zswap_max_pool_percent = 20;
-module_param_named(max_pool_percent,
-                       zswap_max_pool_percent, uint, 0644);
+static char zswap_compressor[CRYPTO_MAX_ALG_NAME] = ZSWAP_COMPRESSOR_DEFAULT;
+static struct kparam_string zswap_compressor_kparam = {
+       .string =       zswap_compressor,
+       .maxlen =       sizeof(zswap_compressor),
+};
+static int zswap_compressor_param_set(const char *,
+                                     const struct kernel_param *);
+static struct kernel_param_ops zswap_compressor_param_ops = {
+       .set =          zswap_compressor_param_set,
+       .get =          param_get_string,
+};
+module_param_cb(compressor, &zswap_compressor_param_ops,
+               &zswap_compressor_kparam, 0644);
 
-/* Compressed storage to use */
+/* Compressed storage zpool to use */
 #define ZSWAP_ZPOOL_DEFAULT "zbud"
-static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
-module_param_named(zpool, zswap_zpool_type, charp, 0444);
+static char zswap_zpool_type[32 /* arbitrary */] = ZSWAP_ZPOOL_DEFAULT;
+static struct kparam_string zswap_zpool_kparam = {
+       .string =       zswap_zpool_type,
+       .maxlen =       sizeof(zswap_zpool_type),
+};
+static int zswap_zpool_param_set(const char *, const struct kernel_param *);
+static struct kernel_param_ops zswap_zpool_param_ops = {
+       .set =  zswap_zpool_param_set,
+       .get =  param_get_string,
+};
+module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_kparam, 0644);
 
-/* zpool is shared by all of zswap backend  */
-static struct zpool *zswap_pool;
+/* The maximum percentage of memory that the compressed pool can occupy */
+static unsigned int zswap_max_pool_percent = 20;
+module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644);
 
 /*********************************
-* compression functions
+* data structures
 **********************************/
-/* per-cpu compression transforms */
-static struct crypto_comp * __percpu *zswap_comp_pcpu_tfms;
 
-enum comp_op {
-       ZSWAP_COMPOP_COMPRESS,
-       ZSWAP_COMPOP_DECOMPRESS
+struct zswap_pool {
+       struct zpool *zpool;
+       struct crypto_comp * __percpu *tfm;
+       struct kref kref;
+       struct list_head list;
+       struct rcu_head rcu_head;
+       struct notifier_block notifier;
+       char tfm_name[CRYPTO_MAX_ALG_NAME];
 };
 
-static int zswap_comp_op(enum comp_op op, const u8 *src, unsigned int slen,
-                               u8 *dst, unsigned int *dlen)
-{
-       struct crypto_comp *tfm;
-       int ret;
-
-       tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, get_cpu());
-       switch (op) {
-       case ZSWAP_COMPOP_COMPRESS:
-               ret = crypto_comp_compress(tfm, src, slen, dst, dlen);
-               break;
-       case ZSWAP_COMPOP_DECOMPRESS:
-               ret = crypto_comp_decompress(tfm, src, slen, dst, dlen);
-               break;
-       default:
-               ret = -EINVAL;
-       }
-
-       put_cpu();
-       return ret;
-}
-
-static int __init zswap_comp_init(void)
-{
-       if (!crypto_has_comp(zswap_compressor, 0, 0)) {
-               pr_info("%s compressor not available\n", zswap_compressor);
-               /* fall back to default compressor */
-               zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
-               if (!crypto_has_comp(zswap_compressor, 0, 0))
-                       /* can't even load the default compressor */
-                       return -ENODEV;
-       }
-       pr_info("using %s compressor\n", zswap_compressor);
-
-       /* alloc percpu transforms */
-       zswap_comp_pcpu_tfms = alloc_percpu(struct crypto_comp *);
-       if (!zswap_comp_pcpu_tfms)
-               return -ENOMEM;
-       return 0;
-}
-
-static void __init zswap_comp_exit(void)
-{
-       /* free percpu transforms */
-       free_percpu(zswap_comp_pcpu_tfms);
-}
-
-/*********************************
-* data structures
-**********************************/
 /*
  * struct zswap_entry
  *
@@ -166,22 +135,24 @@ static void __init zswap_comp_exit(void)
  * page within zswap.
  *
  * rbnode - links the entry into red-black tree for the appropriate swap type
+ * offset - the swap offset for the entry.  Index into the red-black tree.
  * refcount - the number of outstanding reference to the entry. This is needed
  *            to protect against premature freeing of the entry by code
  *            concurrent calls to load, invalidate, and writeback.  The lock
  *            for the zswap_tree structure that contains the entry must
  *            be held while changing the refcount.  Since the lock must
  *            be held, there is no reason to also make refcount atomic.
- * offset - the swap offset for the entry.  Index into the red-black tree.
- * handle - zpool allocation handle that stores the compressed page data
  * length - the length in bytes of the compressed page data.  Needed during
  *          decompression
+ * pool - the zswap_pool the entry's data is in
+ * handle - zpool allocation handle that stores the compressed page data
  */
 struct zswap_entry {
        struct rb_node rbnode;
        pgoff_t offset;
        int refcount;
        unsigned int length;
+       struct zswap_pool *pool;
        unsigned long handle;
 };
 
@@ -201,6 +172,51 @@ struct zswap_tree {
 
 static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
 
+/* RCU-protected iteration */
+static LIST_HEAD(zswap_pools);
+/* protects zswap_pools list modification */
+static DEFINE_SPINLOCK(zswap_pools_lock);
+
+/* used by param callback function */
+static bool zswap_init_started;
+
+/*********************************
+* helpers and fwd declarations
+**********************************/
+
+#define zswap_pool_debug(msg, p)                               \
+       pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,         \
+                zpool_get_type((p)->zpool))
+
+static int zswap_writeback_entry(struct zpool *pool, unsigned long handle);
+static int zswap_pool_get(struct zswap_pool *pool);
+static void zswap_pool_put(struct zswap_pool *pool);
+
+static const struct zpool_ops zswap_zpool_ops = {
+       .evict = zswap_writeback_entry
+};
+
+static bool zswap_is_full(void)
+{
+       return totalram_pages * zswap_max_pool_percent / 100 <
+               DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
+}
+
+static void zswap_update_total_size(void)
+{
+       struct zswap_pool *pool;
+       u64 total = 0;
+
+       rcu_read_lock();
+
+       list_for_each_entry_rcu(pool, &zswap_pools, list)
+               total += zpool_get_total_size(pool->zpool);
+
+       rcu_read_unlock();
+
+       zswap_pool_total_size = total;
+}
+
 /*********************************
 * zswap entry functions
 **********************************/
@@ -294,10 +310,11 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
  */
 static void zswap_free_entry(struct zswap_entry *entry)
 {
-       zpool_free(zswap_pool, entry->handle);
+       zpool_free(entry->pool->zpool, entry->handle);
+       zswap_pool_put(entry->pool);
        zswap_entry_cache_free(entry);
        atomic_dec(&zswap_stored_pages);
-       zswap_pool_total_size = zpool_get_total_size(zswap_pool);
+       zswap_update_total_size();
 }
 
 /* caller must hold the tree lock */
@@ -339,35 +356,21 @@ static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
 **********************************/
 static DEFINE_PER_CPU(u8 *, zswap_dstmem);
 
-static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu)
+static int __zswap_cpu_dstmem_notifier(unsigned long action, unsigned long cpu)
 {
-       struct crypto_comp *tfm;
        u8 *dst;
 
        switch (action) {
        case CPU_UP_PREPARE:
-               tfm = crypto_alloc_comp(zswap_compressor, 0, 0);
-               if (IS_ERR(tfm)) {
-                       pr_err("can't allocate compressor transform\n");
-                       return NOTIFY_BAD;
-               }
-               *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = tfm;
                dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
                if (!dst) {
                        pr_err("can't allocate compressor buffer\n");
-                       crypto_free_comp(tfm);
-                       *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL;
                        return NOTIFY_BAD;
                }
                per_cpu(zswap_dstmem, cpu) = dst;
                break;
        case CPU_DEAD:
        case CPU_UP_CANCELED:
-               tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu);
-               if (tfm) {
-                       crypto_free_comp(tfm);
-                       *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL;
-               }
                dst = per_cpu(zswap_dstmem, cpu);
                kfree(dst);
                per_cpu(zswap_dstmem, cpu) = NULL;
@@ -378,43 +381,398 @@ static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu)
        return NOTIFY_OK;
 }
 
-static int zswap_cpu_notifier(struct notifier_block *nb,
-                               unsigned long action, void *pcpu)
+static int zswap_cpu_dstmem_notifier(struct notifier_block *nb,
+                                    unsigned long action, void *pcpu)
 {
-       unsigned long cpu = (unsigned long)pcpu;
-       return __zswap_cpu_notifier(action, cpu);
+       return __zswap_cpu_dstmem_notifier(action, (unsigned long)pcpu);
 }
 
-static struct notifier_block zswap_cpu_notifier_block = {
-       .notifier_call = zswap_cpu_notifier
+static struct notifier_block zswap_dstmem_notifier = {
+       .notifier_call =        zswap_cpu_dstmem_notifier,
 };
 
-static int __init zswap_cpu_init(void)
+static int __init zswap_cpu_dstmem_init(void)
 {
        unsigned long cpu;
 
        cpu_notifier_register_begin();
        for_each_online_cpu(cpu)
-               if (__zswap_cpu_notifier(CPU_UP_PREPARE, cpu) != NOTIFY_OK)
+               if (__zswap_cpu_dstmem_notifier(CPU_UP_PREPARE, cpu) ==
+                   NOTIFY_BAD)
                        goto cleanup;
-       __register_cpu_notifier(&zswap_cpu_notifier_block);
+       __register_cpu_notifier(&zswap_dstmem_notifier);
        cpu_notifier_register_done();
        return 0;
 
 cleanup:
        for_each_online_cpu(cpu)
-               __zswap_cpu_notifier(CPU_UP_CANCELED, cpu);
+               __zswap_cpu_dstmem_notifier(CPU_UP_CANCELED, cpu);
        cpu_notifier_register_done();
        return -ENOMEM;
 }
 
+static void zswap_cpu_dstmem_destroy(void)
+{
+       unsigned long cpu;
+
+       cpu_notifier_register_begin();
+       for_each_online_cpu(cpu)
+               __zswap_cpu_dstmem_notifier(CPU_UP_CANCELED, cpu);
+       __unregister_cpu_notifier(&zswap_dstmem_notifier);
+       cpu_notifier_register_done();
+}
+
+static int __zswap_cpu_comp_notifier(struct zswap_pool *pool,
+                                    unsigned long action, unsigned long cpu)
+{
+       struct crypto_comp *tfm;
+
+       switch (action) {
+       case CPU_UP_PREPARE:
+               if (WARN_ON(*per_cpu_ptr(pool->tfm, cpu)))
+                       break;
+               tfm = crypto_alloc_comp(pool->tfm_name, 0, 0);
+               if (IS_ERR_OR_NULL(tfm)) {
+                       pr_err("could not alloc crypto comp %s : %ld\n",
+                              pool->tfm_name, PTR_ERR(tfm));
+                       return NOTIFY_BAD;
+               }
+               *per_cpu_ptr(pool->tfm, cpu) = tfm;
+               break;
+       case CPU_DEAD:
+       case CPU_UP_CANCELED:
+               tfm = *per_cpu_ptr(pool->tfm, cpu);
+               if (!IS_ERR_OR_NULL(tfm))
+                       crypto_free_comp(tfm);
+               *per_cpu_ptr(pool->tfm, cpu) = NULL;
+               break;
+       default:
+               break;
+       }
+       return NOTIFY_OK;
+}
+
+static int zswap_cpu_comp_notifier(struct notifier_block *nb,
+                                  unsigned long action, void *pcpu)
+{
+       unsigned long cpu = (unsigned long)pcpu;
+       struct zswap_pool *pool = container_of(nb, typeof(*pool), notifier);
+
+       return __zswap_cpu_comp_notifier(pool, action, cpu);
+}
+
+static int zswap_cpu_comp_init(struct zswap_pool *pool)
+{
+       unsigned long cpu;
+
+       memset(&pool->notifier, 0, sizeof(pool->notifier));
+       pool->notifier.notifier_call = zswap_cpu_comp_notifier;
+
+       cpu_notifier_register_begin();
+       for_each_online_cpu(cpu)
+               if (__zswap_cpu_comp_notifier(pool, CPU_UP_PREPARE, cpu) ==
+                   NOTIFY_BAD)
+                       goto cleanup;
+       __register_cpu_notifier(&pool->notifier);
+       cpu_notifier_register_done();
+       return 0;
+
+cleanup:
+       for_each_online_cpu(cpu)
+               __zswap_cpu_comp_notifier(pool, CPU_UP_CANCELED, cpu);
+       cpu_notifier_register_done();
+       return -ENOMEM;
+}
+
+static void zswap_cpu_comp_destroy(struct zswap_pool *pool)
+{
+       unsigned long cpu;
+
+       cpu_notifier_register_begin();
+       for_each_online_cpu(cpu)
+               __zswap_cpu_comp_notifier(pool, CPU_UP_CANCELED, cpu);
+       __unregister_cpu_notifier(&pool->notifier);
+       cpu_notifier_register_done();
+}
+
 /*********************************
-* helpers
+* pool functions
 **********************************/
-static bool zswap_is_full(void)
+
+static struct zswap_pool *__zswap_pool_current(void)
 {
-       return totalram_pages * zswap_max_pool_percent / 100 <
-               DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
+       struct zswap_pool *pool;
+
+       pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list);
+       WARN_ON(!pool);
+
+       return pool;
+}
+
+static struct zswap_pool *zswap_pool_current(void)
+{
+       assert_spin_locked(&zswap_pools_lock);
+
+       return __zswap_pool_current();
+}
+
+static struct zswap_pool *zswap_pool_current_get(void)
+{
+       struct zswap_pool *pool;
+
+       rcu_read_lock();
+
+       pool = __zswap_pool_current();
+       if (!pool || !zswap_pool_get(pool))
+               pool = NULL;
+
+       rcu_read_unlock();
+
+       return pool;
+}
+
+static struct zswap_pool *zswap_pool_last_get(void)
+{
+       struct zswap_pool *pool, *last = NULL;
+
+       rcu_read_lock();
+
+       list_for_each_entry_rcu(pool, &zswap_pools, list)
+               last = pool;
+       if (!WARN_ON(!last) && !zswap_pool_get(last))
+               last = NULL;
+
+       rcu_read_unlock();
+
+       return last;
+}
+
+static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
+{
+       struct zswap_pool *pool;
+
+       assert_spin_locked(&zswap_pools_lock);
+
+       list_for_each_entry_rcu(pool, &zswap_pools, list) {
+               if (strncmp(pool->tfm_name, compressor, sizeof(pool->tfm_name)))
+                       continue;
+               if (strncmp(zpool_get_type(pool->zpool), type,
+                           sizeof(zswap_zpool_type)))
+                       continue;
+               /* if we can't get it, it's about to be destroyed */
+               if (!zswap_pool_get(pool))
+                       continue;
+               return pool;
+       }
+
+       return NULL;
+}
+
+static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
+{
+       struct zswap_pool *pool;
+       gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN;
+
+       pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+       if (!pool) {
+               pr_err("pool alloc failed\n");
+               return NULL;
+       }
+
+       pool->zpool = zpool_create_pool(type, "zswap", gfp, &zswap_zpool_ops);
+       if (!pool->zpool) {
+               pr_err("%s zpool not available\n", type);
+               goto error;
+       }
+       pr_debug("using %s zpool\n", zpool_get_type(pool->zpool));
+
+       strlcpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
+       pool->tfm = alloc_percpu(struct crypto_comp *);
+       if (!pool->tfm) {
+               pr_err("percpu alloc failed\n");
+               goto error;
+       }
+
+       if (zswap_cpu_comp_init(pool))
+               goto error;
+       pr_debug("using %s compressor\n", pool->tfm_name);
+
+       /* being the current pool takes 1 ref; this func expects the
+        * caller to always add the new pool as the current pool
+        */
+       kref_init(&pool->kref);
+       INIT_LIST_HEAD(&pool->list);
+
+       zswap_pool_debug("created", pool);
+
+       return pool;
+
+error:
+       free_percpu(pool->tfm);
+       if (pool->zpool)
+               zpool_destroy_pool(pool->zpool);
+       kfree(pool);
+       return NULL;
+}
+
+static struct zswap_pool *__zswap_pool_create_fallback(void)
+{
+       if (!crypto_has_comp(zswap_compressor, 0, 0)) {
+               pr_err("compressor %s not available, using default %s\n",
+                      zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT);
+               strncpy(zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT,
+                       sizeof(zswap_compressor));
+       }
+       if (!zpool_has_pool(zswap_zpool_type)) {
+               pr_err("zpool %s not available, using default %s\n",
+                      zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT);
+               strncpy(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT,
+                       sizeof(zswap_zpool_type));
+       }
+
+       return zswap_pool_create(zswap_zpool_type, zswap_compressor);
+}
+
+static void zswap_pool_destroy(struct zswap_pool *pool)
+{
+       zswap_pool_debug("destroying", pool);
+
+       zswap_cpu_comp_destroy(pool);
+       free_percpu(pool->tfm);
+       zpool_destroy_pool(pool->zpool);
+       kfree(pool);
+}
+
+static int __must_check zswap_pool_get(struct zswap_pool *pool)
+{
+       return kref_get_unless_zero(&pool->kref);
+}
+
+static void __zswap_pool_release(struct rcu_head *head)
+{
+       struct zswap_pool *pool = container_of(head, typeof(*pool), rcu_head);
+
+       /* nobody should have been able to get a kref... */
+       WARN_ON(kref_get_unless_zero(&pool->kref));
+
+       /* pool is now off zswap_pools list and has no references. */
+       zswap_pool_destroy(pool);
+}
+
+static void __zswap_pool_empty(struct kref *kref)
+{
+       struct zswap_pool *pool;
+
+       pool = container_of(kref, typeof(*pool), kref);
+
+       spin_lock(&zswap_pools_lock);
+
+       WARN_ON(pool == zswap_pool_current());
+
+       list_del_rcu(&pool->list);
+       call_rcu(&pool->rcu_head, __zswap_pool_release);
+
+       spin_unlock(&zswap_pools_lock);
+}
+
+static void zswap_pool_put(struct zswap_pool *pool)
+{
+       kref_put(&pool->kref, __zswap_pool_empty);
+}
+
+/*********************************
+* param callbacks
+**********************************/
+
+static int __zswap_param_set(const char *val, const struct kernel_param *kp,
+                            char *type, char *compressor)
+{
+       struct zswap_pool *pool, *put_pool = NULL;
+       char str[kp->str->maxlen], *s;
+       int ret;
+
+       /*
+        * kp is either zswap_zpool_kparam or zswap_compressor_kparam, defined
+        * at the top of this file, so maxlen is CRYPTO_MAX_ALG_NAME (64) or
+        * 32 (arbitrary).
+        */
+       strlcpy(str, val, kp->str->maxlen);
+       s = strim(str);
+
+       /* if this is load-time (pre-init) param setting,
+        * don't create a pool; that's done during init.
+        */
+       if (!zswap_init_started)
+               return param_set_copystring(s, kp);
+
+       /* no change required */
+       if (!strncmp(kp->str->string, s, kp->str->maxlen))
+               return 0;
+
+       if (!type) {
+               type = s;
+               if (!zpool_has_pool(type)) {
+                       pr_err("zpool %s not available\n", type);
+                       return -ENOENT;
+               }
+       } else if (!compressor) {
+               compressor = s;
+               if (!crypto_has_comp(compressor, 0, 0)) {
+                       pr_err("compressor %s not available\n", compressor);
+                       return -ENOENT;
+               }
+       }
+
+       spin_lock(&zswap_pools_lock);
+
+       pool = zswap_pool_find_get(type, compressor);
+       if (pool) {
+               zswap_pool_debug("using existing", pool);
+               list_del_rcu(&pool->list);
+       } else {
+               spin_unlock(&zswap_pools_lock);
+               pool = zswap_pool_create(type, compressor);
+               spin_lock(&zswap_pools_lock);
+       }
+
+       if (pool)
+               ret = param_set_copystring(s, kp);
+       else
+               ret = -EINVAL;
+
+       if (!ret) {
+               put_pool = zswap_pool_current();
+               list_add_rcu(&pool->list, &zswap_pools);
+       } else if (pool) {
+               /* add the possibly pre-existing pool to the end of the pools
+                * list; if it's new (and empty) then it'll be removed and
+                * destroyed by the put after we drop the lock
+                */
+               list_add_tail_rcu(&pool->list, &zswap_pools);
+               put_pool = pool;
+       }
+
+       spin_unlock(&zswap_pools_lock);
+
+       /* drop the ref from either the old current pool,
+        * or the new pool we failed to add
+        */
+       if (put_pool)
+               zswap_pool_put(put_pool);
+
+       return ret;
+}
+
+static int zswap_compressor_param_set(const char *val,
+                                     const struct kernel_param *kp)
+{
+       return __zswap_param_set(val, kp, zswap_zpool_type, NULL);
+}
+
+static int zswap_zpool_param_set(const char *val,
+                                const struct kernel_param *kp)
+{
+       return __zswap_param_set(val, kp, NULL, zswap_compressor);
 }
 
 /*********************************
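
The pool functions added above tie each compressor/zpool pair to a kref on an RCU-protected list: zswap_pool_get() only succeeds while the kref is non-zero, and the final zswap_pool_put() unlinks the pool and destroys it via call_rcu(). A minimal userspace sketch of that "take a reference only while the object is still live" idea, using C11 atomics in place of the kernel's kref/RCU (all names below are illustrative, not kernel APIs):

/* Userspace sketch, not part of the patch. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct pool {
        atomic_int refcount;            /* plays the role of pool->kref */
};

/* Succeeds only while at least one reference is already held, so a pool
 * whose count has reached zero (and is being torn down) can never be
 * resurrected - the same guarantee kref_get_unless_zero() gives zswap.
 */
static bool pool_get_unless_zero(struct pool *p)
{
        int old = atomic_load(&p->refcount);

        while (old != 0) {
                if (atomic_compare_exchange_weak(&p->refcount, &old, old + 1))
                        return true;
        }
        return false;
}

static void pool_put(struct pool *p)
{
        if (atomic_fetch_sub(&p->refcount, 1) == 1)
                printf("last reference dropped, destroy pool\n");
}

int main(void)
{
        struct pool p = { .refcount = 1 };      /* ref held by "current" */

        if (pool_get_unless_zero(&p))
                printf("got reference, refcount now %d\n",
                       atomic_load(&p.refcount));

        pool_put(&p);   /* drop our reference */
        pool_put(&p);   /* drop the "current" reference -> destroy */
        return 0;
}
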
@@ -446,75 +804,14 @@ enum zswap_get_swap_ret {
 static int zswap_get_swap_cache_page(swp_entry_t entry,
                                struct page **retpage)
 {
-       struct page *found_page, *new_page = NULL;
-       struct address_space *swapper_space = swap_address_space(entry);
-       int err;
-
-       *retpage = NULL;
-       do {
-               /*
-                * First check the swap cache.  Since this is normally
-                * called after lookup_swap_cache() failed, re-calling
-                * that would confuse statistics.
-                */
-               found_page = find_get_page(swapper_space, entry.val);
-               if (found_page)
-                       break;
-
-               /*
-                * Get a new page to read into from swap.
-                */
-               if (!new_page) {
-                       new_page = alloc_page(GFP_KERNEL);
-                       if (!new_page)
-                               break; /* Out of memory */
-               }
-
-               /*
-                * call radix_tree_preload() while we can wait.
-                */
-               err = radix_tree_preload(GFP_KERNEL);
-               if (err)
-                       break;
-
-               /*
-                * Swap entry may have been freed since our caller observed it.
-                */
-               err = swapcache_prepare(entry);
-               if (err == -EEXIST) { /* seems racy */
-                       radix_tree_preload_end();
-                       continue;
-               }
-               if (err) { /* swp entry is obsolete ? */
-                       radix_tree_preload_end();
-                       break;
-               }
+       bool page_was_allocated;
 
-               /* May fail (-ENOMEM) if radix-tree node allocation failed. */
-               __set_page_locked(new_page);
-               SetPageSwapBacked(new_page);
-               err = __add_to_swap_cache(new_page, entry);
-               if (likely(!err)) {
-                       radix_tree_preload_end();
-                       lru_cache_add_anon(new_page);
-                       *retpage = new_page;
-                       return ZSWAP_SWAPCACHE_NEW;
-               }
-               radix_tree_preload_end();
-               ClearPageSwapBacked(new_page);
-               __clear_page_locked(new_page);
-               /*
-                * add_to_swap_cache() doesn't return -EEXIST, so we can safely
-                * clear SWAP_HAS_CACHE flag.
-                */
-               swapcache_free(entry);
-       } while (err != -ENOMEM);
-
-       if (new_page)
-               page_cache_release(new_page);
-       if (!found_page)
+       *retpage = __read_swap_cache_async(entry, GFP_KERNEL,
+                       NULL, 0, &page_was_allocated);
+       if (page_was_allocated)
+               return ZSWAP_SWAPCACHE_NEW;
+       if (!*retpage)
                return ZSWAP_SWAPCACHE_FAIL;
-       *retpage = found_page;
        return ZSWAP_SWAPCACHE_EXIST;
 }
 
@@ -538,6 +835,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
        pgoff_t offset;
        struct zswap_entry *entry;
        struct page *page;
+       struct crypto_comp *tfm;
        u8 *src, *dst;
        unsigned int dlen;
        int ret;
@@ -578,13 +876,15 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
        case ZSWAP_SWAPCACHE_NEW: /* page is locked */
                /* decompress */
                dlen = PAGE_SIZE;
-               src = (u8 *)zpool_map_handle(zswap_pool, entry->handle,
+               src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle,
                                ZPOOL_MM_RO) + sizeof(struct zswap_header);
                dst = kmap_atomic(page);
-               ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src,
-                               entry->length, dst, &dlen);
+               tfm = *get_cpu_ptr(entry->pool->tfm);
+               ret = crypto_comp_decompress(tfm, src, entry->length,
+                                            dst, &dlen);
+               put_cpu_ptr(entry->pool->tfm);
                kunmap_atomic(dst);
-               zpool_unmap_handle(zswap_pool, entry->handle);
+               zpool_unmap_handle(entry->pool->zpool, entry->handle);
                BUG_ON(ret);
                BUG_ON(dlen != PAGE_SIZE);
 
@@ -633,6 +933,22 @@ end:
        return ret;
 }
 
+static int zswap_shrink(void)
+{
+       struct zswap_pool *pool;
+       int ret;
+
+       pool = zswap_pool_last_get();
+       if (!pool)
+               return -ENOENT;
+
+       ret = zpool_shrink(pool->zpool, 1, NULL);
+
+       zswap_pool_put(pool);
+
+       return ret;
+}
+
 /*********************************
 * frontswap hooks
 **********************************/
@@ -642,6 +958,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
 {
        struct zswap_tree *tree = zswap_trees[type];
        struct zswap_entry *entry, *dupentry;
+       struct crypto_comp *tfm;
        int ret;
        unsigned int dlen = PAGE_SIZE, len;
        unsigned long handle;
@@ -657,7 +974,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
        /* reclaim space if needed */
        if (zswap_is_full()) {
                zswap_pool_limit_hit++;
-               if (zpool_shrink(zswap_pool, 1, NULL)) {
+               if (zswap_shrink()) {
                        zswap_reject_reclaim_fail++;
                        ret = -ENOMEM;
                        goto reject;
@@ -672,33 +989,42 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
                goto reject;
        }
 
+       /* if entry is successfully added, it keeps the reference */
+       entry->pool = zswap_pool_current_get();
+       if (!entry->pool) {
+               ret = -EINVAL;
+               goto freepage;
+       }
+
        /* compress */
        dst = get_cpu_var(zswap_dstmem);
+       tfm = *get_cpu_ptr(entry->pool->tfm);
        src = kmap_atomic(page);
-       ret = zswap_comp_op(ZSWAP_COMPOP_COMPRESS, src, PAGE_SIZE, dst, &dlen);
+       ret = crypto_comp_compress(tfm, src, PAGE_SIZE, dst, &dlen);
        kunmap_atomic(src);
+       put_cpu_ptr(entry->pool->tfm);
        if (ret) {
                ret = -EINVAL;
-               goto freepage;
+               goto put_dstmem;
        }
 
        /* store */
        len = dlen + sizeof(struct zswap_header);
-       ret = zpool_malloc(zswap_pool, len, __GFP_NORETRY | __GFP_NOWARN,
-               &handle);
+       ret = zpool_malloc(entry->pool->zpool, len,
+                          __GFP_NORETRY | __GFP_NOWARN, &handle);
        if (ret == -ENOSPC) {
                zswap_reject_compress_poor++;
-               goto freepage;
+               goto put_dstmem;
        }
        if (ret) {
                zswap_reject_alloc_fail++;
-               goto freepage;
+               goto put_dstmem;
        }
-       zhdr = zpool_map_handle(zswap_pool, handle, ZPOOL_MM_RW);
+       zhdr = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_RW);
        zhdr->swpentry = swp_entry(type, offset);
        buf = (u8 *)(zhdr + 1);
        memcpy(buf, dst, dlen);
-       zpool_unmap_handle(zswap_pool, handle);
+       zpool_unmap_handle(entry->pool->zpool, handle);
        put_cpu_var(zswap_dstmem);
 
        /* populate entry */
@@ -721,12 +1047,14 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
 
        /* update stats */
        atomic_inc(&zswap_stored_pages);
-       zswap_pool_total_size = zpool_get_total_size(zswap_pool);
+       zswap_update_total_size();
 
        return 0;
 
-freepage:
+put_dstmem:
        put_cpu_var(zswap_dstmem);
+       zswap_pool_put(entry->pool);
+freepage:
        zswap_entry_cache_free(entry);
 reject:
        return ret;
@@ -741,6 +1069,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
 {
        struct zswap_tree *tree = zswap_trees[type];
        struct zswap_entry *entry;
+       struct crypto_comp *tfm;
        u8 *src, *dst;
        unsigned int dlen;
        int ret;
@@ -757,13 +1086,14 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
 
        /* decompress */
        dlen = PAGE_SIZE;
-       src = (u8 *)zpool_map_handle(zswap_pool, entry->handle,
+       src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle,
                        ZPOOL_MM_RO) + sizeof(struct zswap_header);
        dst = kmap_atomic(page);
-       ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length,
-               dst, &dlen);
+       tfm = *get_cpu_ptr(entry->pool->tfm);
+       ret = crypto_comp_decompress(tfm, src, entry->length, dst, &dlen);
+       put_cpu_ptr(entry->pool->tfm);
        kunmap_atomic(dst);
-       zpool_unmap_handle(zswap_pool, entry->handle);
+       zpool_unmap_handle(entry->pool->zpool, entry->handle);
        BUG_ON(ret);
 
        spin_lock(&tree->lock);
@@ -816,10 +1146,6 @@ static void zswap_frontswap_invalidate_area(unsigned type)
        zswap_trees[type] = NULL;
 }
 
-static struct zpool_ops zswap_zpool_ops = {
-       .evict = zswap_writeback_entry
-};
-
 static void zswap_frontswap_init(unsigned type)
 {
        struct zswap_tree *tree;
@@ -900,49 +1226,40 @@ static void __exit zswap_debugfs_exit(void) { }
 **********************************/
 static int __init init_zswap(void)
 {
-       gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN;
+       struct zswap_pool *pool;
 
-       pr_info("loading zswap\n");
-
-       zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp,
-                                       &zswap_zpool_ops);
-       if (!zswap_pool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) {
-               pr_info("%s zpool not available\n", zswap_zpool_type);
-               zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
-               zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp,
-                                       &zswap_zpool_ops);
-       }
-       if (!zswap_pool) {
-               pr_err("%s zpool not available\n", zswap_zpool_type);
-               pr_err("zpool creation failed\n");
-               goto error;
-       }
-       pr_info("using %s pool\n", zswap_zpool_type);
+       zswap_init_started = true;
 
        if (zswap_entry_cache_create()) {
                pr_err("entry cache creation failed\n");
-               goto cachefail;
+               goto cache_fail;
        }
-       if (zswap_comp_init()) {
-               pr_err("compressor initialization failed\n");
-               goto compfail;
+
+       if (zswap_cpu_dstmem_init()) {
+               pr_err("dstmem alloc failed\n");
+               goto dstmem_fail;
        }
-       if (zswap_cpu_init()) {
-               pr_err("per-cpu initialization failed\n");
-               goto pcpufail;
+
+       pool = __zswap_pool_create_fallback();
+       if (!pool) {
+               pr_err("pool creation failed\n");
+               goto pool_fail;
        }
+       pr_info("loaded using pool %s/%s\n", pool->tfm_name,
+               zpool_get_type(pool->zpool));
+
+       list_add(&pool->list, &zswap_pools);
 
        frontswap_register_ops(&zswap_frontswap_ops);
        if (zswap_debugfs_init())
                pr_warn("debugfs initialization failed\n");
        return 0;
-pcpufail:
-       zswap_comp_exit();
-compfail:
+
+pool_fail:
+       zswap_cpu_dstmem_destroy();
+dstmem_fail:
        zswap_entry_cache_destroy();
-cachefail:
-       zpool_destroy_pool(zswap_pool);
-error:
+cache_fail:
        return -ENOMEM;
 }
 /* must be late so crypto has time to come up */
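
In the reworked init path, __zswap_pool_create_fallback() above checks whether the requested compressor and zpool names are actually available and falls back to the compile-time defaults before creating the first pool. A small userspace sketch of that validate-then-fall-back step (the availability check and the default name are stand-ins, not the kernel's):

/* Userspace sketch, not kernel code. */
#include <stdio.h>
#include <string.h>

#define DEFAULT_COMPRESSOR "lzo"

static int has_compressor(const char *name)
{
        /* stand-in for crypto_has_comp(); pretend only "lzo" exists */
        return strcmp(name, "lzo") == 0;
}

static const char *pick_compressor(const char *requested)
{
        if (!has_compressor(requested)) {
                fprintf(stderr, "compressor %s not available, using %s\n",
                        requested, DEFAULT_COMPRESSOR);
                return DEFAULT_COMPRESSOR;
        }
        return requested;
}

int main(void)
{
        printf("using %s\n", pick_compressor("deflate"));
        printf("using %s\n", pick_compressor("lzo"));
        return 0;
}
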
index 37a78d20c0f647a976ec58dbad1a06a957cc1771..ba1210253f5ec077dc01fbd1e8fc6bf2102253ef 100644 (file)
@@ -94,8 +94,6 @@ struct p9_trans_rdma {
        struct ib_pd *pd;
        struct ib_qp *qp;
        struct ib_cq *cq;
-       struct ib_mr *dma_mr;
-       u32 lkey;
        long timeout;
        int sq_depth;
        struct semaphore sq_sem;
@@ -382,9 +380,6 @@ static void rdma_destroy_trans(struct p9_trans_rdma *rdma)
        if (!rdma)
                return;
 
-       if (rdma->dma_mr && !IS_ERR(rdma->dma_mr))
-               ib_dereg_mr(rdma->dma_mr);
-
        if (rdma->qp && !IS_ERR(rdma->qp))
                ib_destroy_qp(rdma->qp);
 
@@ -415,7 +410,7 @@ post_recv(struct p9_client *client, struct p9_rdma_context *c)
 
        sge.addr = c->busa;
        sge.length = client->msize;
-       sge.lkey = rdma->lkey;
+       sge.lkey = rdma->pd->local_dma_lkey;
 
        wr.next = NULL;
        c->wc_op = IB_WC_RECV;
@@ -506,7 +501,7 @@ dont_need_post_recv:
 
        sge.addr = c->busa;
        sge.length = c->req->tc->size;
-       sge.lkey = rdma->lkey;
+       sge.lkey = rdma->pd->local_dma_lkey;
 
        wr.next = NULL;
        c->wc_op = IB_WC_SEND;
@@ -647,7 +642,6 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args)
        struct p9_trans_rdma *rdma;
        struct rdma_conn_param conn_param;
        struct ib_qp_init_attr qp_attr;
-       struct ib_device_attr devattr;
        struct ib_cq_init_attr cq_attr = {};
 
        /* Parse the transport specific mount options */
@@ -700,11 +694,6 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args)
        if (err || (rdma->state != P9_RDMA_ROUTE_RESOLVED))
                goto error;
 
-       /* Query the device attributes */
-       err = ib_query_device(rdma->cm_id->device, &devattr);
-       if (err)
-               goto error;
-
        /* Create the Completion Queue */
        cq_attr.cqe = opts.sq_depth + opts.rq_depth + 1;
        rdma->cq = ib_create_cq(rdma->cm_id->device, cq_comp_handler,
@@ -719,17 +708,6 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args)
        if (IS_ERR(rdma->pd))
                goto error;
 
-       /* Cache the DMA lkey in the transport */
-       rdma->dma_mr = NULL;
-       if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)
-               rdma->lkey = rdma->cm_id->device->local_dma_lkey;
-       else {
-               rdma->dma_mr = ib_get_dma_mr(rdma->pd, IB_ACCESS_LOCAL_WRITE);
-               if (IS_ERR(rdma->dma_mr))
-                       goto error;
-               rdma->lkey = rdma->dma_mr->lkey;
-       }
-
        /* Create the Queue Pair */
        memset(&qp_attr, 0, sizeof qp_attr);
        qp_attr.event_handler = qp_event_handler;
index af5e187553fd6a9d3b033d446642040585995b80..ea748c93a07f1dcd988cc18130d629d6a3556d98 100644 (file)
@@ -16,7 +16,6 @@
 #include <net/rtnetlink.h>
 #include <net/net_namespace.h>
 #include <net/sock.h>
-#include <net/switchdev.h>
 #include <uapi/linux/if_bridge.h>
 
 #include "br_private.h"
index 3cd8cc9e804b37240047421ba6c5855bb72f79d9..5f5a02b49a99617918c8f76ecf7858920152ba53 100644 (file)
@@ -117,10 +117,11 @@ out_filt:
        return err;
 }
 
-static void __vlan_vid_del(struct net_device *dev, struct net_bridge *br,
-                          u16 vid)
+static int __vlan_vid_del(struct net_device *dev, struct net_bridge *br,
+                         u16 vid)
 {
        const struct net_device_ops *ops = dev->netdev_ops;
+       int err = 0;
 
        /* If driver uses VLAN ndo ops, use 8021q to delete vid
         * on device, otherwise try switchdev ops to delete vid.
@@ -137,8 +138,12 @@ static void __vlan_vid_del(struct net_device *dev, struct net_bridge *br,
                        },
                };
 
-               switchdev_port_obj_del(dev, &vlan_obj);
+               err = switchdev_port_obj_del(dev, &vlan_obj);
+               if (err == -EOPNOTSUPP)
+                       err = 0;
        }
+
+       return err;
 }
 
 static int __vlan_del(struct net_port_vlans *v, u16 vid)
@@ -151,7 +156,11 @@ static int __vlan_del(struct net_port_vlans *v, u16 vid)
 
        if (v->port_idx) {
                struct net_bridge_port *p = v->parent.port;
-               __vlan_vid_del(p->dev, p->br, vid);
+               int err;
+
+               err = __vlan_vid_del(p->dev, p->br, vid);
+               if (err)
+                       return err;
        }
 
        clear_bit(vid, v->vlan_bitmap);
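
The bridge change above makes __vlan_vid_del() report failures from the switchdev delete while still treating -EOPNOTSUPP as success, so ports without hardware VLAN offload keep working. A tiny userspace sketch of that error-handling convention (helper names are made up):

/* Userspace sketch, not kernel code. */
#include <errno.h>
#include <stdio.h>

static int hw_vlan_del(int vid, int offload_supported)
{
        return offload_supported ? 0 : -EOPNOTSUPP;
}

static int vlan_vid_del(int vid, int offload_supported)
{
        int err = hw_vlan_del(vid, offload_supported);

        if (err == -EOPNOTSUPP)
                err = 0;        /* no hardware offload: not an error */
        return err;
}

int main(void)
{
        printf("offloaded port:     %d\n", vlan_vid_del(10, 1));
        printf("software-only port: %d\n", vlan_vid_del(10, 0));
        return 0;
}
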
index 69a4d30a9ccf44900961e0691d942acfb4262201..54a00d66509e748d47068a664bc14585166cbd97 100644 (file)
@@ -357,6 +357,7 @@ ceph_parse_options(char *options, const char *dev_name,
        opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
        opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT;
        opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;
+       opt->monc_ping_timeout = CEPH_MONC_PING_TIMEOUT_DEFAULT;
 
        /* get mon ip(s) */
        /* ip1[:port1][,ip2[:port2]...] */
index 790fe89d90c0ac49301bfcc81ba1b6633b9559cd..4440edcce0d6c0fd427fe5e5b456dd67de7e8e52 100644 (file)
@@ -79,10 +79,6 @@ int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
        return 0;
 }
 
-
-
-#define AES_KEY_SIZE 16
-
 static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
 {
        return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
index e3be1d22a2477dd2d9e271a4594d70ebdb1148e7..b9b0e3b5da49f84d9fb1775e75aca83f55e80b2d 100644 (file)
@@ -163,6 +163,7 @@ static struct kmem_cache    *ceph_msg_data_cache;
 static char tag_msg = CEPH_MSGR_TAG_MSG;
 static char tag_ack = CEPH_MSGR_TAG_ACK;
 static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
+static char tag_keepalive2 = CEPH_MSGR_TAG_KEEPALIVE2;
 
 #ifdef CONFIG_LOCKDEP
 static struct lock_class_key socket_class;
@@ -176,7 +177,7 @@ static struct lock_class_key socket_class;
 
 static void queue_con(struct ceph_connection *con);
 static void cancel_con(struct ceph_connection *con);
-static void con_work(struct work_struct *);
+static void ceph_con_workfn(struct work_struct *);
 static void con_fault(struct ceph_connection *con);
 
 /*
@@ -276,22 +277,22 @@ static void _ceph_msgr_exit(void)
                ceph_msgr_wq = NULL;
        }
 
-       ceph_msgr_slab_exit();
-
        BUG_ON(zero_page == NULL);
        page_cache_release(zero_page);
        zero_page = NULL;
+
+       ceph_msgr_slab_exit();
 }
 
 int ceph_msgr_init(void)
 {
+       if (ceph_msgr_slab_init())
+               return -ENOMEM;
+
        BUG_ON(zero_page != NULL);
        zero_page = ZERO_PAGE(0);
        page_cache_get(zero_page);
 
-       if (ceph_msgr_slab_init())
-               return -ENOMEM;
-
        /*
         * The number of active work items is limited by the number of
         * connections, so leave @max_active at default.
@@ -749,7 +750,7 @@ void ceph_con_init(struct ceph_connection *con, void *private,
        mutex_init(&con->mutex);
        INIT_LIST_HEAD(&con->out_queue);
        INIT_LIST_HEAD(&con->out_sent);
-       INIT_DELAYED_WORK(&con->work, con_work);
+       INIT_DELAYED_WORK(&con->work, ceph_con_workfn);
 
        con->state = CON_STATE_CLOSED;
 }
@@ -1351,7 +1352,16 @@ static void prepare_write_keepalive(struct ceph_connection *con)
 {
        dout("prepare_write_keepalive %p\n", con);
        con_out_kvec_reset(con);
-       con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive);
+       if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) {
+               struct timespec now = CURRENT_TIME;
+
+               con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2);
+               ceph_encode_timespec(&con->out_temp_keepalive2, &now);
+               con_out_kvec_add(con, sizeof(con->out_temp_keepalive2),
+                                &con->out_temp_keepalive2);
+       } else {
+               con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive);
+       }
        con_flag_set(con, CON_FLAG_WRITE_PENDING);
 }
 
@@ -1625,6 +1635,12 @@ static void prepare_read_tag(struct ceph_connection *con)
        con->in_tag = CEPH_MSGR_TAG_READY;
 }
 
+static void prepare_read_keepalive_ack(struct ceph_connection *con)
+{
+       dout("prepare_read_keepalive_ack %p\n", con);
+       con->in_base_pos = 0;
+}
+
 /*
  * Prepare to read a message.
  */
@@ -2322,13 +2338,6 @@ static int read_partial_message(struct ceph_connection *con)
                        return ret;
 
                BUG_ON(!con->in_msg ^ skip);
-               if (con->in_msg && data_len > con->in_msg->data_length) {
-                       pr_warn("%s skipping long message (%u > %zd)\n",
-                               __func__, data_len, con->in_msg->data_length);
-                       ceph_msg_put(con->in_msg);
-                       con->in_msg = NULL;
-                       skip = 1;
-               }
                if (skip) {
                        /* skip this message */
                        dout("alloc_msg said skip message\n");
@@ -2457,6 +2466,17 @@ static void process_message(struct ceph_connection *con)
        mutex_lock(&con->mutex);
 }
 
+static int read_keepalive_ack(struct ceph_connection *con)
+{
+       struct ceph_timespec ceph_ts;
+       size_t size = sizeof(ceph_ts);
+       int ret = read_partial(con, size, size, &ceph_ts);
+       if (ret <= 0)
+               return ret;
+       ceph_decode_timespec(&con->last_keepalive_ack, &ceph_ts);
+       prepare_read_tag(con);
+       return 1;
+}
 
 /*
  * Write something to the socket.  Called in a worker thread when the
@@ -2526,6 +2546,10 @@ more_kvec:
 
 do_next:
        if (con->state == CON_STATE_OPEN) {
+               if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) {
+                       prepare_write_keepalive(con);
+                       goto more;
+               }
                /* is anything else pending? */
                if (!list_empty(&con->out_queue)) {
                        prepare_write_message(con);
@@ -2535,10 +2559,6 @@ do_next:
                        prepare_write_ack(con);
                        goto more;
                }
-               if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) {
-                       prepare_write_keepalive(con);
-                       goto more;
-               }
        }
 
        /* Nothing to do! */
@@ -2641,6 +2661,9 @@ more:
                case CEPH_MSGR_TAG_ACK:
                        prepare_read_ack(con);
                        break;
+               case CEPH_MSGR_TAG_KEEPALIVE2_ACK:
+                       prepare_read_keepalive_ack(con);
+                       break;
                case CEPH_MSGR_TAG_CLOSE:
                        con_close_socket(con);
                        con->state = CON_STATE_CLOSED;
@@ -2684,6 +2707,12 @@ more:
                process_ack(con);
                goto more;
        }
+       if (con->in_tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) {
+               ret = read_keepalive_ack(con);
+               if (ret <= 0)
+                       goto out;
+               goto more;
+       }
 
 out:
        dout("try_read done on %p ret %d\n", con, ret);
@@ -2799,7 +2828,7 @@ static void con_fault_finish(struct ceph_connection *con)
 /*
  * Do some work on a connection.  Drop a connection ref when we're done.
  */
-static void con_work(struct work_struct *work)
+static void ceph_con_workfn(struct work_struct *work)
 {
        struct ceph_connection *con = container_of(work, struct ceph_connection,
                                                   work.work);
@@ -3101,6 +3130,20 @@ void ceph_con_keepalive(struct ceph_connection *con)
 }
 EXPORT_SYMBOL(ceph_con_keepalive);
 
+bool ceph_con_keepalive_expired(struct ceph_connection *con,
+                              unsigned long interval)
+{
+       if (interval > 0 &&
+           (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2)) {
+               struct timespec now = CURRENT_TIME;
+               struct timespec ts;
+               jiffies_to_timespec(interval, &ts);
+               ts = timespec_add(con->last_keepalive_ack, ts);
+               return timespec_compare(&now, &ts) >= 0;
+       }
+       return false;
+}
+
 static struct ceph_msg_data *ceph_msg_data_create(enum ceph_msg_data_type type)
 {
        struct ceph_msg_data *data;
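
The messenger changes above implement CEPH_MSGR_TAG_KEEPALIVE2: the client sends a timestamp with each keepalive, the peer echoes it back, and ceph_con_keepalive_expired() declares the connection stale once the last acked timestamp is older than the configured interval. A userspace sketch of that expiry test, using clock_gettime() at whole-second granularity (the names and the coarse granularity are simplifications, not kernel code):

/* Userspace sketch, not kernel code. */
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

static bool keepalive_expired(struct timespec last_ack, time_t interval_sec)
{
        struct timespec now;

        clock_gettime(CLOCK_REALTIME, &now);
        /* expired when last_ack + interval has already passed */
        return now.tv_sec - last_ack.tv_sec >= interval_sec;
}

int main(void)
{
        struct timespec last_ack;

        clock_gettime(CLOCK_REALTIME, &last_ack);
        last_ack.tv_sec -= 45;          /* pretend the last ack was 45s ago */

        printf("expired with 30s timeout: %d\n", keepalive_expired(last_ack, 30));
        printf("expired with 60s timeout: %d\n", keepalive_expired(last_ack, 60));
        return 0;
}
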
index 9d6ff1215928cb69787a85421fdbab9e2a1c8bf5..edda01626a459efbfdaeebd46fd6a0b13a037879 100644 (file)
@@ -149,6 +149,10 @@ static int __open_session(struct ceph_mon_client *monc)
                              CEPH_ENTITY_TYPE_MON, monc->cur_mon,
                              &monc->monmap->mon_inst[monc->cur_mon].addr);
 
+               /* send an initial keepalive to ensure our timestamp is
+                * valid by the time we are in an OPENED state */
+               ceph_con_keepalive(&monc->con);
+
                /* initiatiate authentication handshake */
                ret = ceph_auth_build_hello(monc->auth,
                                            monc->m_auth->front.iov_base,
@@ -170,14 +174,19 @@ static bool __sub_expired(struct ceph_mon_client *monc)
  */
 static void __schedule_delayed(struct ceph_mon_client *monc)
 {
-       unsigned int delay;
+       struct ceph_options *opt = monc->client->options;
+       unsigned long delay;
 
-       if (monc->cur_mon < 0 || __sub_expired(monc))
+       if (monc->cur_mon < 0 || __sub_expired(monc)) {
                delay = 10 * HZ;
-       else
+       } else {
                delay = 20 * HZ;
-       dout("__schedule_delayed after %u\n", delay);
-       schedule_delayed_work(&monc->delayed_work, delay);
+               if (opt->monc_ping_timeout > 0)
+                       delay = min(delay, opt->monc_ping_timeout / 3);
+       }
+       dout("__schedule_delayed after %lu\n", delay);
+       schedule_delayed_work(&monc->delayed_work,
+                             round_jiffies_relative(delay));
 }
 
 /*
@@ -743,11 +752,23 @@ static void delayed_work(struct work_struct *work)
                __close_session(monc);
                __open_session(monc);  /* continue hunting */
        } else {
-               ceph_con_keepalive(&monc->con);
+               struct ceph_options *opt = monc->client->options;
+               int is_auth = ceph_auth_is_authenticated(monc->auth);
+               if (ceph_con_keepalive_expired(&monc->con,
+                                              opt->monc_ping_timeout)) {
+                       dout("monc keepalive timeout\n");
+                       is_auth = 0;
+                       __close_session(monc);
+                       monc->hunting = true;
+                       __open_session(monc);
+               }
 
-               __validate_auth(monc);
+               if (!monc->hunting) {
+                       ceph_con_keepalive(&monc->con);
+                       __validate_auth(monc);
+               }
 
-               if (ceph_auth_is_authenticated(monc->auth))
+               if (is_auth)
                        __send_subscribe(monc);
        }
        __schedule_delayed(monc);
index 50033677c0fa5134d540fba82cc8e298ce1367a0..80b94e37c94aae115155454b9f4386a1b91021de 100644 (file)
@@ -2817,8 +2817,9 @@ out:
 }
 
 /*
- * lookup and return message for incoming reply.  set up reply message
- * pages.
+ * Lookup and return message for incoming reply.  Don't try to do
+ * anything about a larger than preallocated data portion of the
+ * message at the moment - for now, just skip the message.
  */
 static struct ceph_msg *get_reply(struct ceph_connection *con,
                                  struct ceph_msg_header *hdr,
@@ -2836,10 +2837,10 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
        mutex_lock(&osdc->request_mutex);
        req = __lookup_request(osdc, tid);
        if (!req) {
-               *skip = 1;
+               pr_warn("%s osd%d tid %llu unknown, skipping\n",
+                       __func__, osd->o_osd, tid);
                m = NULL;
-               dout("get_reply unknown tid %llu from osd%d\n", tid,
-                    osd->o_osd);
+               *skip = 1;
                goto out;
        }
 
@@ -2849,10 +2850,9 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
        ceph_msg_revoke_incoming(req->r_reply);
 
        if (front_len > req->r_reply->front_alloc_len) {
-               pr_warn("get_reply front %d > preallocated %d (%u#%llu)\n",
-                       front_len, req->r_reply->front_alloc_len,
-                       (unsigned int)con->peer_name.type,
-                       le64_to_cpu(con->peer_name.num));
+               pr_warn("%s osd%d tid %llu front %d > preallocated %d\n",
+                       __func__, osd->o_osd, req->r_tid, front_len,
+                       req->r_reply->front_alloc_len);
                m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS,
                                 false);
                if (!m)
@@ -2860,37 +2860,22 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
                ceph_msg_put(req->r_reply);
                req->r_reply = m;
        }
-       m = ceph_msg_get(req->r_reply);
-
-       if (data_len > 0) {
-               struct ceph_osd_data *osd_data;
 
-               /*
-                * XXX This is assuming there is only one op containing
-                * XXX page data.  Probably OK for reads, but this
-                * XXX ought to be done more generally.
-                */
-               osd_data = osd_req_op_extent_osd_data(req, 0);
-               if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) {
-                       if (osd_data->pages &&
-                               unlikely(osd_data->length < data_len)) {
-
-                               pr_warn("tid %lld reply has %d bytes we had only %llu bytes ready\n",
-                                       tid, data_len, osd_data->length);
-                               *skip = 1;
-                               ceph_msg_put(m);
-                               m = NULL;
-                               goto out;
-                       }
-               }
+       if (data_len > req->r_reply->data_length) {
+               pr_warn("%s osd%d tid %llu data %d > preallocated %zu, skipping\n",
+                       __func__, osd->o_osd, req->r_tid, data_len,
+                       req->r_reply->data_length);
+               m = NULL;
+               *skip = 1;
+               goto out;
        }
-       *skip = 0;
+
+       m = ceph_msg_get(req->r_reply);
        dout("get_reply tid %lld %p\n", tid, m);
 
 out:
        mutex_unlock(&osdc->request_mutex);
        return m;
-
 }
 
 static struct ceph_msg *alloc_msg(struct ceph_connection *con,
index 4a3125836b64a0e5264e005badb7d108ddf9c47b..7d8f581d9f1f7987b8d7051160c34f42ad2f5e73 100644 (file)
@@ -1300,7 +1300,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
                ceph_decode_addr(&addr);
                pr_info("osd%d up\n", osd);
                BUG_ON(osd >= map->max_osd);
-               map->osd_state[osd] |= CEPH_OSD_UP;
+               map->osd_state[osd] |= CEPH_OSD_UP | CEPH_OSD_EXISTS;
                map->osd_addr[osd] = addr;
        }
 
index ae8306e7c56f966196b570eb372a5321d146248a..bf77e3639ce0fd318822cca563c56b4376f9e8b7 100644 (file)
@@ -44,7 +44,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops,
 }
 EXPORT_SYMBOL(fib_default_rule_add);
 
-u32 fib_default_rule_pref(struct fib_rules_ops *ops)
+static u32 fib_default_rule_pref(struct fib_rules_ops *ops)
 {
        struct list_head *pos;
        struct fib_rule *rule;
@@ -60,7 +60,6 @@ u32 fib_default_rule_pref(struct fib_rules_ops *ops)
 
        return 0;
 }
-EXPORT_SYMBOL(fib_default_rule_pref);
 
 static void notify_rule_change(int event, struct fib_rule *rule,
                               struct fib_rules_ops *ops, struct nlmsghdr *nlh,
@@ -299,8 +298,8 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh)
        }
        rule->fr_net = net;
 
-       if (tb[FRA_PRIORITY])
-               rule->pref = nla_get_u32(tb[FRA_PRIORITY]);
+       rule->pref = tb[FRA_PRIORITY] ? nla_get_u32(tb[FRA_PRIORITY])
+                                     : fib_default_rule_pref(ops);
 
        if (tb[FRA_IIFNAME]) {
                struct net_device *dev;
@@ -350,9 +349,6 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh)
        else
                rule->suppress_ifgroup = -1;
 
-       if (!tb[FRA_PRIORITY] && ops->default_pref)
-               rule->pref = ops->default_pref(ops);
-
        err = -EINVAL;
        if (tb[FRA_GOTO]) {
                if (rule->action != FR_ACT_GOTO)
index 9d66a0f72f906733878de68e7f2e6bd80932c1b9..295bbd6a56f2e690435148cd557b39eb4c05fed7 100644 (file)
@@ -229,7 +229,6 @@ static const struct fib_rules_ops __net_initconst dn_fib_rules_ops_template = {
        .configure      = dn_fib_rule_configure,
        .compare        = dn_fib_rule_compare,
        .fill           = dn_fib_rule_fill,
-       .default_pref   = fib_default_rule_pref,
        .flush_cache    = dn_fib_rule_flush_cache,
        .nlgroup        = RTNLGRP_DECnet_RULE,
        .policy         = dn_fib_rule_policy,
index 18123d50f576117358e47c4d3de0f5c3dcebafaa..f2bda9e89c61b251b66ed1201c14e568bd5078b3 100644 (file)
@@ -318,7 +318,6 @@ static const struct fib_rules_ops __net_initconst fib4_rules_ops_template = {
        .delete         = fib4_rule_delete,
        .compare        = fib4_rule_compare,
        .fill           = fib4_rule_fill,
-       .default_pref   = fib_default_rule_pref,
        .nlmsg_payload  = fib4_rule_nlmsg_payload,
        .flush_cache    = fib4_rule_flush_cache,
        .nlgroup        = RTNLGRP_IPV4_RULE,
index 3a2c0162c3badeed716599e538ed06426ddc7199..866ee89f5254a4d6b401c4d9152d15640802b6b8 100644 (file)
@@ -233,7 +233,6 @@ static const struct fib_rules_ops __net_initconst ipmr_rules_ops_template = {
        .match          = ipmr_rule_match,
        .configure      = ipmr_rule_configure,
        .compare        = ipmr_rule_compare,
-       .default_pref   = fib_default_rule_pref,
        .fill           = ipmr_rule_fill,
        .nlgroup        = RTNLGRP_IPV4_RULE,
        .policy         = ipmr_rule_policy,
index 28011fb1f4a2104a34f81fc0c9fb4a4382bdadac..c6ded6b2a79fb5d8ece3f3b4b1ed0e01707124c3 100644 (file)
@@ -151,6 +151,21 @@ static void bictcp_init(struct sock *sk)
                tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
 }
 
+static void bictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event)
+{
+       if (event == CA_EVENT_TX_START) {
+               s32 delta = tcp_time_stamp - tcp_sk(sk)->lsndtime;
+               struct bictcp *ca = inet_csk_ca(sk);
+
+               /* We were application limited (idle) for a while.
+                * Shift epoch_start to keep cwnd growth to cubic curve.
+                */
+               if (ca->epoch_start && delta > 0)
+                       ca->epoch_start += delta;
+               return;
+       }
+}
+
 /* calculate the cubic root of x using a table lookup followed by one
  * Newton-Raphson iteration.
  * Avg err ~= 0.195%
@@ -450,6 +465,7 @@ static struct tcp_congestion_ops cubictcp __read_mostly = {
        .cong_avoid     = bictcp_cong_avoid,
        .set_state      = bictcp_state,
        .undo_cwnd      = bictcp_undo_cwnd,
+       .cwnd_event     = bictcp_cwnd_event,
        .pkts_acked     = bictcp_acked,
        .owner          = THIS_MODULE,
        .name           = "cubic",
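
The CUBIC fix above shifts ca->epoch_start forward by the time the connection sat application-limited, because the cubic window W(t) = C*(t - K)^3 + W_max is a function of time since the loss epoch began and would otherwise jump after an idle period. A standalone sketch of the effect, with made-up constants (not kernel code):

/* Userspace sketch, not kernel code; compile with -lm. */
#include <math.h>
#include <stdio.h>

/* W(t) = C * (t - K)^3 + Wmax, the cubic growth function */
static double cubic_window(double t, double c, double k, double wmax)
{
        return c * pow(t - k, 3.0) + wmax;
}

int main(void)
{
        double c = 0.4, wmax = 100.0;
        double k = cbrt(wmax * 0.3 / c);        /* time to regain Wmax */
        double epoch_start = 0.0, now = 2.0, idle = 5.0;

        printf("before idle:            W = %.1f\n",
               cubic_window(now - epoch_start, c, k, wmax));

        /* without the fix, idle time inflates t and the window jumps */
        now += idle;
        printf("after idle, no shift:   W = %.1f\n",
               cubic_window(now - epoch_start, c, k, wmax));

        /* with the fix, epoch_start is shifted by the idle time */
        epoch_start += idle;
        printf("after idle, with shift: W = %.1f\n",
               cubic_window(now - epoch_start, c, k, wmax));
        return 0;
}
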
index 1188e4fcf23bf87d40fb73f6e1483ee004fd74b1..f9a8a12b62ee64d954ae9a4aab75bcdce687650b 100644 (file)
@@ -164,6 +164,9 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
        struct inet_connection_sock *icsk = inet_csk(sk);
        const u32 now = tcp_time_stamp;
 
+       if (tcp_packets_in_flight(tp) == 0)
+               tcp_ca_event(sk, CA_EVENT_TX_START);
+
        tp->lsndtime = now;
 
        /* If it is a reply for ato after last received
@@ -940,9 +943,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
                                                           &md5);
        tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
 
-       if (tcp_packets_in_flight(tp) == 0)
-               tcp_ca_event(sk, CA_EVENT_TX_START);
-
        /* if no packet is in qdisc/device queue, then allow XPS to select
         * another queue. We can be called from tcp_tsq_handler()
         * which holds one reference to sk_wmem_alloc.
index 99c0f2b843f01de8ae86190b720d58b57bcf3e57..030fefdc9aed8cedbf8ba9e26864de0aca89236a 100644 (file)
@@ -1943,37 +1943,6 @@ static void addrconf_leave_anycast(struct inet6_ifaddr *ifp)
        __ipv6_dev_ac_dec(ifp->idev, &addr);
 }
 
-static int addrconf_ifid_eui48(u8 *eui, struct net_device *dev)
-{
-       if (dev->addr_len != ETH_ALEN)
-               return -1;
-       memcpy(eui, dev->dev_addr, 3);
-       memcpy(eui + 5, dev->dev_addr + 3, 3);
-
-       /*
-        * The zSeries OSA network cards can be shared among various
-        * OS instances, but the OSA cards have only one MAC address.
-        * This leads to duplicate address conflicts in conjunction
-        * with IPv6 if more than one instance uses the same card.
-        *
-        * The driver for these cards can deliver a unique 16-bit
-        * identifier for each instance sharing the same card.  It is
-        * placed instead of 0xFFFE in the interface identifier.  The
-        * "u" bit of the interface identifier is not inverted in this
-        * case.  Hence the resulting interface identifier has local
-        * scope according to RFC2373.
-        */
-       if (dev->dev_id) {
-               eui[3] = (dev->dev_id >> 8) & 0xFF;
-               eui[4] = dev->dev_id & 0xFF;
-       } else {
-               eui[3] = 0xFF;
-               eui[4] = 0xFE;
-               eui[0] ^= 2;
-       }
-       return 0;
-}
-
 static int addrconf_ifid_eui64(u8 *eui, struct net_device *dev)
 {
        if (dev->addr_len != IEEE802154_ADDR_LEN)
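
The addrconf_ifid_eui48() helper removed above built the modified EUI-64 interface identifier from a 48-bit MAC in the dev_id == 0 case: insert 0xFF,0xFE in the middle and flip the universal/local bit of the first octet. A userspace sketch of that mapping (illustrative only):

/* Userspace sketch, not kernel code. */
#include <stdint.h>
#include <stdio.h>

static void mac_to_eui64(const uint8_t mac[6], uint8_t eui[8])
{
        eui[0] = mac[0] ^ 0x02;         /* flip the universal/local bit */
        eui[1] = mac[1];
        eui[2] = mac[2];
        eui[3] = 0xFF;                  /* 0xFFFE marks an EUI-48 origin */
        eui[4] = 0xFE;
        eui[5] = mac[3];
        eui[6] = mac[4];
        eui[7] = mac[5];
}

int main(void)
{
        const uint8_t mac[6] = { 0x00, 0x16, 0x3e, 0x12, 0x34, 0x56 };
        uint8_t eui[8];
        int i;

        mac_to_eui64(mac, eui);
        for (i = 0; i < 8; i++)
                printf("%02x%s", eui[i], i == 7 ? "\n" : ":");
        return 0;
}
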
index 2367a16eae58a31e01aa0d1d676090b688102593..9f777ec59a59d24566d87643889a8c591dd52637 100644 (file)
@@ -258,11 +258,6 @@ nla_put_failure:
        return -ENOBUFS;
 }
 
-static u32 fib6_rule_default_pref(struct fib_rules_ops *ops)
-{
-       return 0x3FFF;
-}
-
 static size_t fib6_rule_nlmsg_payload(struct fib_rule *rule)
 {
        return nla_total_size(16) /* dst */
@@ -279,7 +274,6 @@ static const struct fib_rules_ops __net_initconst fib6_rules_ops_template = {
        .configure              = fib6_rule_configure,
        .compare                = fib6_rule_compare,
        .fill                   = fib6_rule_fill,
-       .default_pref           = fib6_rule_default_pref,
        .nlmsg_payload          = fib6_rule_nlmsg_payload,
        .nlgroup                = RTNLGRP_IPV6_RULE,
        .policy                 = fib6_rule_policy,
index 74ceb73c1c9a042b0f8f9f65c264e8426d65f7f3..0e004cc42a22b1593fa308f3adac735568ddf2ee 100644 (file)
@@ -217,7 +217,6 @@ static const struct fib_rules_ops __net_initconst ip6mr_rules_ops_template = {
        .match          = ip6mr_rule_match,
        .configure      = ip6mr_rule_configure,
        .compare        = ip6mr_rule_compare,
-       .default_pref   = fib_default_rule_pref,
        .fill           = ip6mr_rule_fill,
        .nlgroup        = RTNLGRP_IPV6_RULE,
        .policy         = ip6mr_rule_policy,
@@ -550,7 +549,7 @@ static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
 
        if (it->cache == &mrt->mfc6_unres_queue)
                spin_unlock_bh(&mfc_unres_lock);
-       else if (it->cache == mrt->mfc6_cache_array)
+       else if (it->cache == &mrt->mfc6_cache_array[it->ct])
                read_unlock(&mrt_lock);
 }
 
index f45cac6f83563977186732adacedcec55a6b3ce7..53617d71518850dfc26220dd15923b8027c5249a 100644 (file)
@@ -1748,7 +1748,7 @@ static int ip6_convert_metrics(struct mx6_config *mxc,
        return -EINVAL;
 }
 
-int ip6_route_add(struct fib6_config *cfg)
+int ip6_route_info_create(struct fib6_config *cfg, struct rt6_info **rt_ret)
 {
        int err;
        struct net *net = cfg->fc_nlinfo.nl_net;
@@ -1756,7 +1756,6 @@ int ip6_route_add(struct fib6_config *cfg)
        struct net_device *dev = NULL;
        struct inet6_dev *idev = NULL;
        struct fib6_table *table;
-       struct mx6_config mxc = { .mx = NULL, };
        int addr_type;
 
        if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
@@ -1981,6 +1980,32 @@ install_route:
 
        cfg->fc_nlinfo.nl_net = dev_net(dev);
 
+       *rt_ret = rt;
+
+       return 0;
+out:
+       if (dev)
+               dev_put(dev);
+       if (idev)
+               in6_dev_put(idev);
+       if (rt)
+               dst_free(&rt->dst);
+
+       *rt_ret = NULL;
+
+       return err;
+}
+
+int ip6_route_add(struct fib6_config *cfg)
+{
+       struct mx6_config mxc = { .mx = NULL, };
+       struct rt6_info *rt = NULL;
+       int err;
+
+       err = ip6_route_info_create(cfg, &rt);
+       if (err)
+               goto out;
+
        err = ip6_convert_metrics(&mxc, cfg);
        if (err)
                goto out;
@@ -1988,14 +2013,12 @@ install_route:
        err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc);
 
        kfree(mxc.mx);
+
        return err;
 out:
-       if (dev)
-               dev_put(dev);
-       if (idev)
-               in6_dev_put(idev);
        if (rt)
                dst_free(&rt->dst);
+
        return err;
 }
 
@@ -2776,19 +2799,78 @@ errout:
        return err;
 }
 
-static int ip6_route_multipath(struct fib6_config *cfg, int add)
+struct rt6_nh {
+       struct rt6_info *rt6_info;
+       struct fib6_config r_cfg;
+       struct mx6_config mxc;
+       struct list_head next;
+};
+
+static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
+{
+       struct rt6_nh *nh;
+
+       list_for_each_entry(nh, rt6_nh_list, next) {
+               pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6 nexthop %pI6 ifi %d\n",
+                       &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
+                       nh->r_cfg.fc_ifindex);
+       }
+}
+
+static int ip6_route_info_append(struct list_head *rt6_nh_list,
+                                struct rt6_info *rt, struct fib6_config *r_cfg)
+{
+       struct rt6_nh *nh;
+       struct rt6_info *rtnh;
+       int err = -EEXIST;
+
+       list_for_each_entry(nh, rt6_nh_list, next) {
+               /* check if rt6_info already exists */
+               rtnh = nh->rt6_info;
+
+               if (rtnh->dst.dev == rt->dst.dev &&
+                   rtnh->rt6i_idev == rt->rt6i_idev &&
+                   ipv6_addr_equal(&rtnh->rt6i_gateway,
+                                   &rt->rt6i_gateway))
+                       return err;
+       }
+
+       nh = kzalloc(sizeof(*nh), GFP_KERNEL);
+       if (!nh)
+               return -ENOMEM;
+       nh->rt6_info = rt;
+       err = ip6_convert_metrics(&nh->mxc, r_cfg);
+       if (err) {
+               kfree(nh);
+               return err;
+       }
+       memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
+       list_add_tail(&nh->next, rt6_nh_list);
+
+       return 0;
+}
+
+static int ip6_route_multipath_add(struct fib6_config *cfg)
 {
        struct fib6_config r_cfg;
        struct rtnexthop *rtnh;
+       struct rt6_info *rt;
+       struct rt6_nh *err_nh;
+       struct rt6_nh *nh, *nh_safe;
        int remaining;
        int attrlen;
-       int err = 0, last_err = 0;
+       int err = 1;
+       int nhn = 0;
+       int replace = (cfg->fc_nlinfo.nlh &&
+                      (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
+       LIST_HEAD(rt6_nh_list);
 
        remaining = cfg->fc_mp_len;
-beginning:
        rtnh = (struct rtnexthop *)cfg->fc_mp;
 
-       /* Parse a Multipath Entry */
+       /* Parse a Multipath Entry and build a list (rt6_nh_list) of
+        * rt6_info structs per nexthop
+        */
        while (rtnh_ok(rtnh, remaining)) {
                memcpy(&r_cfg, cfg, sizeof(*cfg));
                if (rtnh->rtnh_ifindex)
@@ -2808,22 +2890,32 @@ beginning:
                        if (nla)
                                r_cfg.fc_encap_type = nla_get_u16(nla);
                }
-               err = add ? ip6_route_add(&r_cfg) : ip6_route_del(&r_cfg);
+
+               err = ip6_route_info_create(&r_cfg, &rt);
+               if (err)
+                       goto cleanup;
+
+               err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
                if (err) {
-                       last_err = err;
-                       /* If we are trying to remove a route, do not stop the
-                        * loop when ip6_route_del() fails (because next hop is
-                        * already gone), we should try to remove all next hops.
-                        */
-                       if (add) {
-                               /* If add fails, we should try to delete all
-                                * next hops that have been already added.
-                                */
-                               add = 0;
-                               remaining = cfg->fc_mp_len - remaining;
-                               goto beginning;
-                       }
+                       dst_free(&rt->dst);
+                       goto cleanup;
+               }
+
+               rtnh = rtnh_next(rtnh, &remaining);
+       }
+
+       err_nh = NULL;
+       list_for_each_entry(nh, &rt6_nh_list, next) {
+               err = __ip6_ins_rt(nh->rt6_info, &cfg->fc_nlinfo, &nh->mxc);
+               /* nh->rt6_info is used or freed at this point, reset to NULL*/
+               nh->rt6_info = NULL;
+               if (err) {
+                       if (replace && nhn)
+                               ip6_print_replace_route_err(&rt6_nh_list);
+                       err_nh = nh;
+                       goto add_errout;
                }
+
                /* Because each route is added like a single route we remove
                 * these flags after the first nexthop: if there is a collision,
                 * we have already failed to add the first nexthop:
@@ -2833,6 +2925,62 @@ beginning:
                 */
                cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
                                                     NLM_F_REPLACE);
+               nhn++;
+       }
+
+       goto cleanup;
+
+add_errout:
+       /* Delete routes that were already added */
+       list_for_each_entry(nh, &rt6_nh_list, next) {
+               if (err_nh == nh)
+                       break;
+               ip6_route_del(&nh->r_cfg);
+       }
+
+cleanup:
+       list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
+               if (nh->rt6_info)
+                       dst_free(&nh->rt6_info->dst);
+               kfree(nh->mxc.mx);
+               list_del(&nh->next);
+               kfree(nh);
+       }
+
+       return err;
+}
+
+static int ip6_route_multipath_del(struct fib6_config *cfg)
+{
+       struct fib6_config r_cfg;
+       struct rtnexthop *rtnh;
+       int remaining;
+       int attrlen;
+       int err = 1, last_err = 0;
+
+       remaining = cfg->fc_mp_len;
+       rtnh = (struct rtnexthop *)cfg->fc_mp;
+
+       /* Parse a Multipath Entry */
+       while (rtnh_ok(rtnh, remaining)) {
+               memcpy(&r_cfg, cfg, sizeof(*cfg));
+               if (rtnh->rtnh_ifindex)
+                       r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
+
+               attrlen = rtnh_attrlen(rtnh);
+               if (attrlen > 0) {
+                       struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
+
+                       nla = nla_find(attrs, attrlen, RTA_GATEWAY);
+                       if (nla) {
+                               nla_memcpy(&r_cfg.fc_gateway, nla, 16);
+                               r_cfg.fc_flags |= RTF_GATEWAY;
+                       }
+               }
+               err = ip6_route_del(&r_cfg);
+               if (err)
+                       last_err = err;
+
                rtnh = rtnh_next(rtnh, &remaining);
        }
 
@@ -2849,7 +2997,7 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
                return err;
 
        if (cfg.fc_mp)
-               return ip6_route_multipath(&cfg, 0);
+               return ip6_route_multipath_del(&cfg);
        else
                return ip6_route_del(&cfg);
 }
@@ -2864,7 +3012,7 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
                return err;
 
        if (cfg.fc_mp)
-               return ip6_route_multipath(&cfg, 1);
+               return ip6_route_multipath_add(&cfg);
        else
                return ip6_route_add(&cfg);
 }
index 685ec13ed7c2b0a2dcdcf82d7388d1c44d041a26..17b1fe961c5d67b958346dee07e265448408cc71 100644 (file)
@@ -2468,6 +2468,10 @@ static int ieee80211_set_cqm_rssi_config(struct wiphy *wiphy,
            rssi_hyst == bss_conf->cqm_rssi_hyst)
                return 0;
 
+       if (sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER &&
+           !(sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_CQM_RSSI))
+               return -EOPNOTSUPP;
+
        bss_conf->cqm_rssi_thold = rssi_thold;
        bss_conf->cqm_rssi_hyst = rssi_hyst;
 
index 705ef1d040edfb70042fdd9cd25f050b19dab4c0..cd7e55e08a238c23f546ce7d6860759345eff371 100644 (file)
@@ -4267,6 +4267,8 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata,
        struct ieee80211_supported_band *sband;
        struct cfg80211_chan_def chandef;
        int ret;
+       u32 i;
+       bool have_80mhz;
 
        sband = local->hw.wiphy->bands[cbss->channel->band];
 
@@ -4317,6 +4319,20 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata,
                }
        }
 
+       /* Allow VHT if at least one channel on the sband supports 80 MHz */
+       have_80mhz = false;
+       for (i = 0; i < sband->n_channels; i++) {
+               if (sband->channels[i].flags & (IEEE80211_CHAN_DISABLED |
+                                               IEEE80211_CHAN_NO_80MHZ))
+                       continue;
+
+               have_80mhz = true;
+               break;
+       }
+
+       if (!have_80mhz)
+               ifmgd->flags |= IEEE80211_STA_DISABLE_VHT;
+
        ifmgd->flags |= ieee80211_determine_chantype(sdata, sband,
                                                     cbss->channel,
                                                     ht_cap, ht_oper, vht_oper,
index 9857693b91ec721ff71e3f3cd1087ccc289912e1..9ce8883d5f449ed438f0a4a4a4807ede153c6725 100644 (file)
@@ -716,7 +716,7 @@ static bool rate_control_cap_mask(struct ieee80211_sub_if_data *sdata,
 
                /* Filter out rates that the STA does not support */
                *mask &= sta->supp_rates[sband->band];
-               for (i = 0; i < sizeof(mcs_mask); i++)
+               for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++)
                        mcs_mask[i] &= sta->ht_cap.mcs.rx_mask[i];
 
                sta_vht_cap = sta->vht_cap.vht_mcs.rx_mcs_map;
index aee701a5649e59ebd03ef300f25e33eadfc280d5..4e202d0679b26a0dfa0b89c5ae189f2ce7b56883 100644 (file)
@@ -1249,6 +1249,58 @@ static void iee80211_tdls_recalc_chanctx(struct ieee80211_sub_if_data *sdata)
        mutex_unlock(&local->chanctx_mtx);
 }
 
+static int iee80211_tdls_have_ht_peers(struct ieee80211_sub_if_data *sdata)
+{
+       struct sta_info *sta;
+       bool result = false;
+
+       rcu_read_lock();
+       list_for_each_entry_rcu(sta, &sdata->local->sta_list, list) {
+               if (!sta->sta.tdls || sta->sdata != sdata || !sta->uploaded ||
+                   !test_sta_flag(sta, WLAN_STA_AUTHORIZED) ||
+                   !test_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH) ||
+                   !sta->sta.ht_cap.ht_supported)
+                       continue;
+               result = true;
+               break;
+       }
+       rcu_read_unlock();
+
+       return result;
+}
+
+static void
+iee80211_tdls_recalc_ht_protection(struct ieee80211_sub_if_data *sdata,
+                                  struct sta_info *sta)
+{
+       struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+       bool tdls_ht;
+       u16 protection = IEEE80211_HT_OP_MODE_PROTECTION_NONHT_MIXED |
+                        IEEE80211_HT_OP_MODE_NON_GF_STA_PRSNT |
+                        IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT;
+       u16 opmode;
+
+       /* Nothing to do if the BSS connection uses HT */
+       if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT))
+               return;
+
+       tdls_ht = (sta && sta->sta.ht_cap.ht_supported) ||
+                 iee80211_tdls_have_ht_peers(sdata);
+
+       opmode = sdata->vif.bss_conf.ht_operation_mode;
+
+       if (tdls_ht)
+               opmode |= protection;
+       else
+               opmode &= ~protection;
+
+       if (opmode == sdata->vif.bss_conf.ht_operation_mode)
+               return;
+
+       sdata->vif.bss_conf.ht_operation_mode = opmode;
+       ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_HT);
+}
+
 int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev,
                        const u8 *peer, enum nl80211_tdls_operation oper)
 {
@@ -1274,6 +1326,10 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev,
                return -ENOTSUPP;
        }
 
+       /* protect possible bss_conf changes and avoid concurrency in
+        * ieee80211_bss_info_change_notify()
+        */
+       sdata_lock(sdata);
        mutex_lock(&local->mtx);
        tdls_dbg(sdata, "TDLS oper %d peer %pM\n", oper, peer);
 
@@ -1287,16 +1343,18 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev,
 
                iee80211_tdls_recalc_chanctx(sdata);
 
-               rcu_read_lock();
+               mutex_lock(&local->sta_mtx);
                sta = sta_info_get(sdata, peer);
                if (!sta) {
-                       rcu_read_unlock();
+                       mutex_unlock(&local->sta_mtx);
                        ret = -ENOLINK;
                        break;
                }
 
+               iee80211_tdls_recalc_ht_protection(sdata, sta);
+
                set_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH);
-               rcu_read_unlock();
+               mutex_unlock(&local->sta_mtx);
 
                WARN_ON_ONCE(is_zero_ether_addr(sdata->u.mgd.tdls_peer) ||
                             !ether_addr_equal(sdata->u.mgd.tdls_peer, peer));
@@ -1318,6 +1376,11 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev,
                ieee80211_flush_queues(local, sdata, false);
 
                ret = sta_info_destroy_addr(sdata, peer);
+
+               mutex_lock(&local->sta_mtx);
+               iee80211_tdls_recalc_ht_protection(sdata, NULL);
+               mutex_unlock(&local->sta_mtx);
+
                iee80211_tdls_recalc_chanctx(sdata);
                break;
        default:
@@ -1335,6 +1398,7 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev,
                                     &sdata->u.mgd.request_smps_work);
 
        mutex_unlock(&local->mtx);
+       sdata_unlock(sdata);
        return ret;
 }
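An illustrative scenario for the two new helpers (not from the changelog): the station is associated to a legacy BSS, so IEEE80211_STA_DISABLE_HT is set on the managed interface, and it then brings up a TDLS link to an HT-capable peer.

/* While at least one authorized, TDLS-peer-authenticated HT peer exists,
 * the three protection bits are OR'ed into bss_conf.ht_operation_mode so
 * the driver protects HT transmissions on the direct link from the non-HT
 * BSS:
 *
 *     opmode |= NONHT_MIXED | NON_GF_STA_PRSNT | NON_HT_STA_PRSNT;
 *
 * When the last such peer is torn down the bits are cleared again, and
 * BSS_CHANGED_HT is signalled only if the value actually changed.
 */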
 
index 834ccdbc74be1ccd518aaa952ad854810d94f4ca..ff1c798921a6acc90181923456cf92bed0973f61 100644 (file)
@@ -120,6 +120,7 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata,
        struct ieee80211_sta_vht_cap *vht_cap = &sta->sta.vht_cap;
        struct ieee80211_sta_vht_cap own_cap;
        u32 cap_info, i;
+       bool have_80mhz;
 
        memset(vht_cap, 0, sizeof(*vht_cap));
 
@@ -129,6 +130,20 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata,
        if (!vht_cap_ie || !sband->vht_cap.vht_supported)
                return;
 
+       /* Allow VHT if at least one channel on the sband supports 80 MHz */
+       have_80mhz = false;
+       for (i = 0; i < sband->n_channels; i++) {
+               if (sband->channels[i].flags & (IEEE80211_CHAN_DISABLED |
+                                               IEEE80211_CHAN_NO_80MHZ))
+                       continue;
+
+               have_80mhz = true;
+               break;
+       }
+
+       if (!have_80mhz)
+               return;
+
        /*
         * A VHT STA must support 40 MHz, but if we verify that here
         * then we break a few things - some APs (e.g. Netgear R6300v2
index afe905c208af879a5bd48f473bea8597d39e84cf..691b54fcaf2a47189665454fdcaff419236b020f 100644 (file)
@@ -152,9 +152,13 @@ htable_bits(u32 hashsize)
 #define SET_HOST_MASK(family)  (family == AF_INET ? 32 : 128)
 
 #ifdef IP_SET_HASH_WITH_NET0
+/* cidr from 0 to SET_HOST_MASK() value and c = cidr + 1 */
 #define NLEN(family)           (SET_HOST_MASK(family) + 1)
+#define CIDR_POS(c)            ((c) - 1)
 #else
+/* cidr from 1 to SET_HOST_MASK() value and c = cidr + 1 */
 #define NLEN(family)           SET_HOST_MASK(family)
+#define CIDR_POS(c)            ((c) - 2)
 #endif
 
 #else
@@ -305,7 +309,7 @@ mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n)
                } else if (h->nets[i].cidr[n] < cidr) {
                        j = i;
                } else if (h->nets[i].cidr[n] == cidr) {
-                       h->nets[cidr - 1].nets[n]++;
+                       h->nets[CIDR_POS(cidr)].nets[n]++;
                        return;
                }
        }
@@ -314,7 +318,7 @@ mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n)
                        h->nets[i].cidr[n] = h->nets[i - 1].cidr[n];
        }
        h->nets[i].cidr[n] = cidr;
-       h->nets[cidr - 1].nets[n] = 1;
+       h->nets[CIDR_POS(cidr)].nets[n] = 1;
 }
 
 static void
@@ -325,8 +329,8 @@ mtype_del_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n)
        for (i = 0; i < nets_length; i++) {
                if (h->nets[i].cidr[n] != cidr)
                        continue;
-               h->nets[cidr - 1].nets[n]--;
-               if (h->nets[cidr - 1].nets[n] > 0)
+               h->nets[CIDR_POS(cidr)].nets[n]--;
+               if (h->nets[CIDR_POS(cidr)].nets[n] > 0)
                        return;
                for (j = i; j < net_end && h->nets[j].cidr[n]; j++)
                        h->nets[j].cidr[n] = h->nets[j + 1].cidr[n];
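A worked example of what CIDR_POS() fixes, using IPv4 numbers (SET_HOST_MASK(AF_INET) == 32); the value handed to these helpers is c = cidr + 1, as the comments above state:

/* with IP_SET_HASH_WITH_NET0:    cidr 0..32 -> c 1..33 -> slot c - 1 = 0..32, NLEN == 33
 * without IP_SET_HASH_WITH_NET0: cidr 1..32 -> c 2..33 -> slot c - 2 = 0..31, NLEN == 32
 *
 * The old open-coded h->nets[cidr - 1] (where the parameter already holds c)
 * indexed slots 1..32 in the second case, i.e. one element past the
 * 32-entry array.
 */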
index 3c862c0a76d1eac5fcb2d94675e4c3a7fe498d89..a93dfebffa811bcaee4be26c935e583b8ed3fc00 100644 (file)
@@ -131,6 +131,13 @@ hash_netnet4_data_next(struct hash_netnet4_elem *next,
 #define HOST_MASK      32
 #include "ip_set_hash_gen.h"
 
+static void
+hash_netnet4_init(struct hash_netnet4_elem *e)
+{
+       e->cidr[0] = HOST_MASK;
+       e->cidr[1] = HOST_MASK;
+}
+
 static int
 hash_netnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
                  const struct xt_action_param *par,
@@ -160,7 +167,7 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[],
 {
        const struct hash_netnet *h = set->data;
        ipset_adtfn adtfn = set->variant->adt[adt];
-       struct hash_netnet4_elem e = { .cidr = { HOST_MASK, HOST_MASK, }, };
+       struct hash_netnet4_elem e = { };
        struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
        u32 ip = 0, ip_to = 0, last;
        u32 ip2 = 0, ip2_from = 0, ip2_to = 0, last2;
@@ -169,6 +176,7 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[],
        if (tb[IPSET_ATTR_LINENO])
                *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
 
+       hash_netnet4_init(&e);
        if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
                     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
                return -IPSET_ERR_PROTOCOL;
@@ -357,6 +365,13 @@ hash_netnet6_data_next(struct hash_netnet4_elem *next,
 #define IP_SET_EMIT_CREATE
 #include "ip_set_hash_gen.h"
 
+static void
+hash_netnet6_init(struct hash_netnet6_elem *e)
+{
+       e->cidr[0] = HOST_MASK;
+       e->cidr[1] = HOST_MASK;
+}
+
 static int
 hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
                  const struct xt_action_param *par,
@@ -385,13 +400,14 @@ hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[],
                  enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
 {
        ipset_adtfn adtfn = set->variant->adt[adt];
-       struct hash_netnet6_elem e = { .cidr = { HOST_MASK, HOST_MASK, }, };
+       struct hash_netnet6_elem e = { };
        struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
        int ret;
 
        if (tb[IPSET_ATTR_LINENO])
                *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
 
+       hash_netnet6_init(&e);
        if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
                     !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
                return -IPSET_ERR_PROTOCOL;
index 0c68734f5cc4af2f5c23170c6369b1fad6b7ff0e..9a14c237830f4b2ccbbd999662256d88cd95662e 100644 (file)
@@ -142,6 +142,13 @@ hash_netportnet4_data_next(struct hash_netportnet4_elem *next,
 #define HOST_MASK      32
 #include "ip_set_hash_gen.h"
 
+static void
+hash_netportnet4_init(struct hash_netportnet4_elem *e)
+{
+       e->cidr[0] = HOST_MASK;
+       e->cidr[1] = HOST_MASK;
+}
+
 static int
 hash_netportnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
                      const struct xt_action_param *par,
@@ -175,7 +182,7 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
 {
        const struct hash_netportnet *h = set->data;
        ipset_adtfn adtfn = set->variant->adt[adt];
-       struct hash_netportnet4_elem e = { .cidr = { HOST_MASK, HOST_MASK, }, };
+       struct hash_netportnet4_elem e = { };
        struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
        u32 ip = 0, ip_to = 0, ip_last, p = 0, port, port_to;
        u32 ip2_from = 0, ip2_to = 0, ip2_last, ip2;
@@ -185,6 +192,7 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
        if (tb[IPSET_ATTR_LINENO])
                *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
 
+       hash_netportnet4_init(&e);
        if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
                     !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
                     !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
@@ -412,6 +420,13 @@ hash_netportnet6_data_next(struct hash_netportnet4_elem *next,
 #define IP_SET_EMIT_CREATE
 #include "ip_set_hash_gen.h"
 
+static void
+hash_netportnet6_init(struct hash_netportnet6_elem *e)
+{
+       e->cidr[0] = HOST_MASK;
+       e->cidr[1] = HOST_MASK;
+}
+
 static int
 hash_netportnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
                      const struct xt_action_param *par,
@@ -445,7 +460,7 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
 {
        const struct hash_netportnet *h = set->data;
        ipset_adtfn adtfn = set->variant->adt[adt];
-       struct hash_netportnet6_elem e = { .cidr = { HOST_MASK, HOST_MASK, }, };
+       struct hash_netportnet6_elem e = { };
        struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
        u32 port, port_to;
        bool with_ports = false;
@@ -454,6 +469,7 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
        if (tb[IPSET_ATTR_LINENO])
                *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
 
+       hash_netportnet6_init(&e);
        if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
                     !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
                     !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
index eedf0495f11f5eb93c4bce1e2ffbb1b04cc273b3..c09d6c7198f60d809b36783ca1de43646025c876 100644 (file)
@@ -313,12 +313,13 @@ out_free:
 }
 EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc);
 
-static void nf_ct_tmpl_free(struct nf_conn *tmpl)
+void nf_ct_tmpl_free(struct nf_conn *tmpl)
 {
        nf_ct_ext_destroy(tmpl);
        nf_ct_ext_free(tmpl);
        kfree(tmpl);
 }
+EXPORT_SYMBOL_GPL(nf_ct_tmpl_free);
 
 static void
 destroy_conntrack(struct nf_conntrack *nfct)
index 888b9558415eb23bedd78b80f1bb458763331e68..c8a4a48bced988a29cd19df06a00117ea026c6ad 100644 (file)
@@ -380,7 +380,7 @@ static int __net_init synproxy_net_init(struct net *net)
 err3:
        free_percpu(snet->stats);
 err2:
-       nf_conntrack_free(ct);
+       nf_ct_tmpl_free(ct);
 err1:
        return err;
 }
index 0c0e8ecf02abbb4214b18f00ef798d728234866b..70277b11f742e8f0a2756c2ba54e1565419adc95 100644 (file)
@@ -444,6 +444,7 @@ done:
 static void nfnetlink_rcv(struct sk_buff *skb)
 {
        struct nlmsghdr *nlh = nlmsg_hdr(skb);
+       u_int16_t res_id;
        int msglen;
 
        if (nlh->nlmsg_len < NLMSG_HDRLEN ||
@@ -468,7 +469,12 @@ static void nfnetlink_rcv(struct sk_buff *skb)
 
                nfgenmsg = nlmsg_data(nlh);
                skb_pull(skb, msglen);
-               nfnetlink_rcv_batch(skb, nlh, nfgenmsg->res_id);
+               /* Work around old nft using host byte order */
+               if (nfgenmsg->res_id == NFNL_SUBSYS_NFTABLES)
+                       res_id = NFNL_SUBSYS_NFTABLES;
+               else
+                       res_id = ntohs(nfgenmsg->res_id);
+               nfnetlink_rcv_batch(skb, nlh, res_id);
        } else {
                netlink_rcv_skb(skb, &nfnetlink_rcv_msg);
        }
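A short illustration of the workaround, assuming NFNL_SUBSYS_NFTABLES is 10 (its value in the uapi header at this point) and a little-endian host:

/* old nft:          res_id holds 10 in host order              -> first branch above   */
/* conforming sender: res_id holds htons(10), reads back 0x0a00 -> ntohs() recovers 10  */

Either way nfnetlink_rcv_batch() sees the subsystem id in host order; the only raw value that could be misclassified is htons() of a hypothetical subsystem 2560, far above the dozen or so subsystem ids that exist.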
index 685cc6a17163ffaf1c7649d726f98c5118066fdd..a5cd6d90b78b16ebd0c96c4d1d426ad7aea6e86b 100644 (file)
@@ -301,7 +301,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
                           __be32 **packet_id_ptr)
 {
        size_t size;
-       size_t data_len = 0, cap_len = 0;
+       size_t data_len = 0, cap_len = 0, rem_len = 0;
        unsigned int hlen = 0;
        struct sk_buff *skb;
        struct nlattr *nla;
@@ -360,6 +360,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
                hlen = min_t(unsigned int, hlen, data_len);
                size += sizeof(struct nlattr) + hlen;
                cap_len = entskb->len;
+               rem_len = data_len - hlen;
                break;
        }
 
@@ -377,7 +378,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
                        size += nla_total_size(seclen);
        }
 
-       skb = nfnetlink_alloc_skb(net, size, queue->peer_portid,
+       skb = __netlink_alloc_skb(net->nfnl, size, rem_len, queue->peer_portid,
                                  GFP_ATOMIC);
        if (!skb) {
                skb_tx_error(entskb);
index 8e524898ccea234a2b5cae3bdfaf2cd72d023238..faf32d888198a72a50c293312c014bcb63747654 100644 (file)
@@ -255,7 +255,7 @@ out:
        return 0;
 
 err3:
-       nf_conntrack_free(ct);
+       nf_ct_tmpl_free(ct);
 err2:
        nf_ct_l3proto_module_put(par->family);
 err1:
index 50889be1517d04aa3f8d118196205d737a7dfa4d..7f86d3b550601839f730d4c9f15951f5410e3fd4 100644 (file)
@@ -674,12 +674,19 @@ static unsigned int netlink_poll(struct file *file, struct socket *sock,
 
        mask = datagram_poll(file, sock, wait);
 
-       spin_lock_bh(&sk->sk_receive_queue.lock);
-       if (nlk->rx_ring.pg_vec) {
-               if (netlink_has_valid_frame(&nlk->rx_ring))
-                       mask |= POLLIN | POLLRDNORM;
+       /* We could already have received frames in the normal receive
+        * queue, that will show up as NL_MMAP_STATUS_COPY in the ring,
+        * so if mask contains pollin/etc already, there's no point
+        * walking the ring.
+        */
+       if ((mask & (POLLIN | POLLRDNORM)) != (POLLIN | POLLRDNORM)) {
+               spin_lock_bh(&sk->sk_receive_queue.lock);
+               if (nlk->rx_ring.pg_vec) {
+                       if (netlink_has_valid_frame(&nlk->rx_ring))
+                               mask |= POLLIN | POLLRDNORM;
+               }
+               spin_unlock_bh(&sk->sk_receive_queue.lock);
        }
-       spin_unlock_bh(&sk->sk_receive_queue.lock);
 
        spin_lock_bh(&sk->sk_write_queue.lock);
        if (nlk->tx_ring.pg_vec) {
@@ -1837,15 +1844,16 @@ retry:
 }
 EXPORT_SYMBOL(netlink_unicast);
 
-struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size,
-                                 u32 dst_portid, gfp_t gfp_mask)
+struct sk_buff *__netlink_alloc_skb(struct sock *ssk, unsigned int size,
+                                   unsigned int ldiff, u32 dst_portid,
+                                   gfp_t gfp_mask)
 {
 #ifdef CONFIG_NETLINK_MMAP
+       unsigned int maxlen, linear_size;
        struct sock *sk = NULL;
        struct sk_buff *skb;
        struct netlink_ring *ring;
        struct nl_mmap_hdr *hdr;
-       unsigned int maxlen;
 
        sk = netlink_getsockbyportid(ssk, dst_portid);
        if (IS_ERR(sk))
@@ -1856,7 +1864,11 @@ struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size,
        if (ring->pg_vec == NULL)
                goto out_put;
 
-       if (ring->frame_size - NL_MMAP_HDRLEN < size)
+       /* We need to account the full linear size needed as a ring
+        * slot cannot have non-linear parts.
+        */
+       linear_size = size + ldiff;
+       if (ring->frame_size - NL_MMAP_HDRLEN < linear_size)
                goto out_put;
 
        skb = alloc_skb_head(gfp_mask);
@@ -1870,13 +1882,14 @@ struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size,
 
        /* check again under lock */
        maxlen = ring->frame_size - NL_MMAP_HDRLEN;
-       if (maxlen < size)
+       if (maxlen < linear_size)
                goto out_free;
 
        netlink_forward_ring(ring);
        hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
        if (hdr == NULL)
                goto err2;
+
        netlink_ring_setup_skb(skb, sk, ring, hdr);
        netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED);
        atomic_inc(&ring->pending);
@@ -1902,7 +1915,7 @@ out:
 #endif
        return alloc_skb(size, gfp_mask);
 }
-EXPORT_SYMBOL_GPL(netlink_alloc_skb);
+EXPORT_SYMBOL_GPL(__netlink_alloc_skb);
 
 int netlink_has_listeners(struct sock *sk, unsigned int group)
 {
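The single-argument allocator used elsewhere in the tree presumably survives as a thin wrapper in include/linux/netlink.h (not shown in this hunk); a minimal sketch of that wrapper, assuming only that a zero ldiff keeps the old behaviour:

static inline struct sk_buff *
netlink_alloc_skb(struct sock *ssk, unsigned int size, u32 dst_portid,
		  gfp_t gfp_mask)
{
	/* no extra linear data beyond 'size', so the ring-slot check is unchanged */
	return __netlink_alloc_skb(ssk, size, 0, dst_portid, gfp_mask);
}

nfqnl_build_packet_message() above is the caller that needs a non-zero ldiff: it accounts only 'hlen' bytes of packet data in 'size', while up to 'rem_len' more bytes may be appended later, and a mmap'ed ring frame has to hold all of that linearly.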
index af7cdef42066bdab81286961f1667d631653da6d..2a071f470d578e135e6a04c462199f9a5cdb7b69 100644 (file)
@@ -5,6 +5,7 @@
 config OPENVSWITCH
        tristate "Open vSwitch"
        depends on INET
+       depends on (!NF_CONNTRACK || NF_CONNTRACK)
        select LIBCRC32C
        select MPLS
        select NET_MPLS_GSO
@@ -31,17 +32,6 @@ config OPENVSWITCH
 
          If unsure, say N.
 
-config OPENVSWITCH_CONNTRACK
-       bool "Open vSwitch conntrack action support"
-       depends on OPENVSWITCH
-       depends on NF_CONNTRACK
-       default OPENVSWITCH
-       ---help---
-         If you say Y here, then Open vSwitch module will be able to pass
-         packets through conntrack.
-
-         Say N to exclude this support and reduce the binary size.
-
 config OPENVSWITCH_GRE
        tristate "Open vSwitch GRE tunneling support"
        depends on OPENVSWITCH
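The `depends on (!NF_CONNTRACK || NF_CONNTRACK)` line added above looks like a tautology but is the usual Kconfig idiom for "no stronger than": under tristate arithmetic (where !m evaluates to m) it works out as follows, so openvswitch can no longer be built-in while conntrack is a module:

# NF_CONNTRACK=y:  !y || y  ->  n || y  ->  y    (OPENVSWITCH may be y or m)
# NF_CONNTRACK=m:  !m || m  ->  m || m  ->  m    (OPENVSWITCH is capped at m)
# NF_CONNTRACK=n:  !n || n  ->  y || n  ->  y    (conntrack.o simply isn't built, per the Makefile change below)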
index 5b5913b06f540e5513e5a5618a37597893aa183e..60f809085b920bfa95696542c59bf13b4d6a9510 100644 (file)
@@ -15,7 +15,9 @@ openvswitch-y := \
        vport-internal_dev.o \
        vport-netdev.o
 
-openvswitch-$(CONFIG_OPENVSWITCH_CONNTRACK) += conntrack.o
+ifneq ($(CONFIG_NF_CONNTRACK),)
+openvswitch-y += conntrack.o
+endif
 
 obj-$(CONFIG_OPENVSWITCH_VXLAN)+= vport-vxlan.o
 obj-$(CONFIG_OPENVSWITCH_GENEVE)+= vport-geneve.o
index 3cb30667a7dcb83d0009a0f870f31e8dd72fccb7..43f5dd7a55774414aeb7aad8c0560db3e0596035 100644 (file)
@@ -19,7 +19,7 @@
 struct ovs_conntrack_info;
 enum ovs_key_attr;
 
-#if defined(CONFIG_OPENVSWITCH_CONNTRACK)
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
 void ovs_ct_init(struct net *);
 void ovs_ct_exit(struct net *);
 bool ovs_ct_verify(struct net *, enum ovs_key_attr attr);
@@ -82,5 +82,5 @@ static inline int ovs_ct_put_key(const struct sw_flow_key *key,
 }
 
 static inline void ovs_ct_free_action(const struct nlattr *a) { }
-#endif
+#endif /* CONFIG_NF_CONNTRACK */
 #endif /* ovs_conntrack.h */
index a50e652eb269dce22f52900754839537678ebda8..49adeef8090caea90be3e42702276314bca1d118 100644 (file)
@@ -70,7 +70,8 @@ static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr)
 } while (0)
 
 /* rcu read lock must be held or the connection spinlock */
-static struct rds_connection *rds_conn_lookup(struct hlist_head *head,
+static struct rds_connection *rds_conn_lookup(struct net *net,
+                                             struct hlist_head *head,
                                              __be32 laddr, __be32 faddr,
                                              struct rds_transport *trans)
 {
@@ -78,7 +79,7 @@ static struct rds_connection *rds_conn_lookup(struct hlist_head *head,
 
        hlist_for_each_entry_rcu(conn, head, c_hash_node) {
                if (conn->c_faddr == faddr && conn->c_laddr == laddr &&
-                               conn->c_trans == trans) {
+                   conn->c_trans == trans && net == rds_conn_net(conn)) {
                        ret = conn;
                        break;
                }
@@ -132,7 +133,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
        if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP)
                goto new_conn;
        rcu_read_lock();
-       conn = rds_conn_lookup(head, laddr, faddr, trans);
+       conn = rds_conn_lookup(net, head, laddr, faddr, trans);
        if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport &&
            laddr == faddr && !is_outgoing) {
                /* This is a looped back IB connection, and we're
@@ -189,6 +190,12 @@ new_conn:
                }
        }
 
+       if (trans == NULL) {
+               kmem_cache_free(rds_conn_slab, conn);
+               conn = ERR_PTR(-ENODEV);
+               goto out;
+       }
+
        conn->c_trans = trans;
 
        ret = trans->conn_alloc(conn, gfp);
@@ -239,7 +246,7 @@ new_conn:
                if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP)
                        found = NULL;
                else
-                       found = rds_conn_lookup(head, laddr, faddr, trans);
+                       found = rds_conn_lookup(net, head, laddr, faddr, trans);
                if (found) {
                        trans->conn_free(conn->c_transport_data);
                        kmem_cache_free(rds_conn_slab, conn);
index d020fade312ce3608def5ecc9ed19939d349e9a5..2d3f2ab475df8dc0b4329ca79ae59285f60df136 100644 (file)
@@ -99,8 +99,6 @@ static void rds_ib_dev_free(struct work_struct *work)
 
        if (rds_ibdev->mr_pool)
                rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
-       if (rds_ibdev->mr)
-               ib_dereg_mr(rds_ibdev->mr);
        if (rds_ibdev->pd)
                ib_dealloc_pd(rds_ibdev->pd);
 
@@ -164,12 +162,6 @@ static void rds_ib_add_one(struct ib_device *device)
                goto put_dev;
        }
 
-       rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, IB_ACCESS_LOCAL_WRITE);
-       if (IS_ERR(rds_ibdev->mr)) {
-               rds_ibdev->mr = NULL;
-               goto put_dev;
-       }
-
        rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev);
        if (IS_ERR(rds_ibdev->mr_pool)) {
                rds_ibdev->mr_pool = NULL;
@@ -230,11 +222,10 @@ struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device)
  *
  * This can be called at any time and can be racing with any other RDS path.
  */
-static void rds_ib_remove_one(struct ib_device *device)
+static void rds_ib_remove_one(struct ib_device *device, void *client_data)
 {
-       struct rds_ib_device *rds_ibdev;
+       struct rds_ib_device *rds_ibdev = client_data;
 
-       rds_ibdev = ib_get_client_data(device, &rds_ib_client);
        if (!rds_ibdev)
                return;
 
index 9fc95e38659a40fdabd679ba2a914591ad774208..aae60fda77f6d26d033a8c8b1fdb808eb6effaa0 100644 (file)
@@ -100,7 +100,6 @@ struct rds_ib_connection {
        /* alphabet soup, IBTA style */
        struct rdma_cm_id       *i_cm_id;
        struct ib_pd            *i_pd;
-       struct ib_mr            *i_mr;
        struct ib_cq            *i_send_cq;
        struct ib_cq            *i_recv_cq;
 
@@ -173,7 +172,6 @@ struct rds_ib_device {
        struct list_head        conn_list;
        struct ib_device        *dev;
        struct ib_pd            *pd;
-       struct ib_mr            *mr;
        struct rds_ib_mr_pool   *mr_pool;
        unsigned int            fmr_max_remaps;
        unsigned int            max_fmrs;
index d150bb4aa3cb913510f704959e4aac4004710909..9043f5c04787216e3447813c026c284e30da2b5b 100644 (file)
@@ -269,7 +269,6 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 
        /* Protection domain and memory range */
        ic->i_pd = rds_ibdev->pd;
-       ic->i_mr = rds_ibdev->mr;
 
        cq_attr.cqe = ic->i_send_ring.w_nr + 1;
        ic->i_send_cq = ib_create_cq(dev, rds_ib_send_cq_comp_handler,
@@ -375,7 +374,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 
        rds_ib_recv_init_ack(ic);
 
-       rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr,
+       rdsdebug("conn %p pd %p cq %p %p\n", conn, ic->i_pd,
                 ic->i_send_cq, ic->i_recv_cq);
 
 out:
@@ -682,7 +681,6 @@ void rds_ib_conn_shutdown(struct rds_connection *conn)
 
                ic->i_cm_id = NULL;
                ic->i_pd = NULL;
-               ic->i_mr = NULL;
                ic->i_send_cq = NULL;
                ic->i_recv_cq = NULL;
                ic->i_send_hdrs = NULL;
index 6bbe620600606fbdd528600f74bfafce892d93af..f43831e4186a3543af1b8cd8beba803c66a233d3 100644 (file)
@@ -62,12 +62,12 @@ void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
                sge = &recv->r_sge[0];
                sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
                sge->length = sizeof(struct rds_header);
-               sge->lkey = ic->i_mr->lkey;
+               sge->lkey = ic->i_pd->local_dma_lkey;
 
                sge = &recv->r_sge[1];
                sge->addr = 0;
                sge->length = RDS_FRAG_SIZE;
-               sge->lkey = ic->i_mr->lkey;
+               sge->lkey = ic->i_pd->local_dma_lkey;
        }
 }
 
@@ -564,7 +564,7 @@ void rds_ib_recv_init_ack(struct rds_ib_connection *ic)
 
        sge->addr = ic->i_ack_dma;
        sge->length = sizeof(struct rds_header);
-       sge->lkey = ic->i_mr->lkey;
+       sge->lkey = ic->i_pd->local_dma_lkey;
 
        wr->sg_list = sge;
        wr->num_sge = 1;
index c576ebeb4115adf7cb6d5142dd2dc1684a7646a9..4e88047086b6e10c62485f95b63f397900c32b28 100644 (file)
@@ -202,9 +202,9 @@ void rds_ib_send_init_ring(struct rds_ib_connection *ic)
                sge = &send->s_sge[0];
                sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header));
                sge->length = sizeof(struct rds_header);
-               sge->lkey = ic->i_mr->lkey;
+               sge->lkey = ic->i_pd->local_dma_lkey;
 
-               send->s_sge[1].lkey = ic->i_mr->lkey;
+               send->s_sge[1].lkey = ic->i_pd->local_dma_lkey;
        }
 }
 
@@ -818,7 +818,7 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
        /* Convert our struct scatterlist to struct ib_sge */
        send->s_sge[0].addr = ib_sg_dma_address(ic->i_cm_id->device, op->op_sg);
        send->s_sge[0].length = ib_sg_dma_len(ic->i_cm_id->device, op->op_sg);
-       send->s_sge[0].lkey = ic->i_mr->lkey;
+       send->s_sge[0].lkey = ic->i_pd->local_dma_lkey;
 
        rdsdebug("rva %Lx rpa %Lx len %u\n", op->op_remote_addr,
                 send->s_sge[0].addr, send->s_sge[0].length);
@@ -932,7 +932,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
                        send->s_sge[j].addr =
                                 ib_sg_dma_address(ic->i_cm_id->device, scat);
                        send->s_sge[j].length = len;
-                       send->s_sge[j].lkey = ic->i_mr->lkey;
+                       send->s_sge[j].lkey = ic->i_pd->local_dma_lkey;
 
                        sent += len;
                        rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr);
index 5d5a9d258658258ebf5579a6c47fb1149c784ab1..3df0295c6659c305751b14e4585e2504fbb7a90f 100644 (file)
@@ -125,12 +125,11 @@ free_attr:
        kfree(dev_attr);
 }
 
-static void rds_iw_remove_one(struct ib_device *device)
+static void rds_iw_remove_one(struct ib_device *device, void *client_data)
 {
-       struct rds_iw_device *rds_iwdev;
+       struct rds_iw_device *rds_iwdev = client_data;
        struct rds_iw_cm_id *i_cm_id, *next;
 
-       rds_iwdev = ib_get_client_data(device, &rds_iw_client);
        if (!rds_iwdev)
                return;
 
@@ -149,10 +148,7 @@ static void rds_iw_remove_one(struct ib_device *device)
        if (rds_iwdev->mr)
                ib_dereg_mr(rds_iwdev->mr);
 
-       while (ib_dealloc_pd(rds_iwdev->pd)) {
-               rdsdebug("Failed to dealloc pd %p\n", rds_iwdev->pd);
-               msleep(1);
-       }
+       ib_dealloc_pd(rds_iwdev->pd);
 
        list_del(&rds_iwdev->list);
        kfree(rds_iwdev);
index dba8d0864f18046ee87a168d49cc159518fa2916..6a8fbd6e69e7cf721c26aa742cc957e3be109fbd 100644 (file)
@@ -667,11 +667,12 @@ static int rds_iw_init_fastreg(struct rds_iw_mr_pool *pool,
        struct ib_mr *mr;
        int err;
 
-       mr = ib_alloc_fast_reg_mr(rds_iwdev->pd, pool->max_message_size);
+       mr = ib_alloc_mr(rds_iwdev->pd, IB_MR_TYPE_MEM_REG,
+                        pool->max_message_size);
        if (IS_ERR(mr)) {
                err = PTR_ERR(mr);
 
-               printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_mr failed (err=%d)\n", err);
+               printk(KERN_WARNING "RDS/IW: ib_alloc_mr failed (err=%d)\n", err);
                return err;
        }
 
index 334fe98c50841fed6f448cdb05eb01a5388abaee..86152ec3b8879a2dcf6eb41f85fa1cd746f5cb00 100644 (file)
@@ -153,9 +153,10 @@ void rds_iw_send_init_ring(struct rds_iw_connection *ic)
                sge->length = sizeof(struct rds_header);
                sge->lkey = 0;
 
-               send->s_mr = ib_alloc_fast_reg_mr(ic->i_pd, fastreg_message_size);
+               send->s_mr = ib_alloc_mr(ic->i_pd, IB_MR_TYPE_MEM_REG,
+                                        fastreg_message_size);
                if (IS_ERR(send->s_mr)) {
-                       printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_mr failed\n");
+                       printk(KERN_WARNING "RDS/IW: ib_alloc_mr failed\n");
                        break;
                }
 
index f12149a29cb19b1b508b30528dc3bfd032799622..b41e9ea2ffff461847465d621a3284480f433cc5 100644 (file)
@@ -341,7 +341,15 @@ static void __rfkill_switch_all(const enum rfkill_type type, bool blocked)
 {
        struct rfkill *rfkill;
 
-       rfkill_global_states[type].cur = blocked;
+       if (type == RFKILL_TYPE_ALL) {
+               int i;
+
+               for (i = 0; i < NUM_RFKILL_TYPES; i++)
+                       rfkill_global_states[i].cur = blocked;
+       } else {
+               rfkill_global_states[type].cur = blocked;
+       }
+
        list_for_each_entry(rfkill, &rfkill_list, node) {
                if (rfkill->type != type && type != RFKILL_TYPE_ALL)
                        continue;
index 4345790ad3266c353eeac5398593c2a9ce4effda..b7143337e4fa025fdb473732fdc064503e731dd4 100644 (file)
@@ -506,14 +506,22 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
                if (IS_ERR(rt))
                        continue;
 
+               if (!dst)
+                       dst = &rt->dst;
+
                /* Ensure the src address belongs to the output
                 * interface.
                 */
                odev = __ip_dev_find(sock_net(sk), laddr->a.v4.sin_addr.s_addr,
                                     false);
-               if (!odev || odev->ifindex != fl4->flowi4_oif)
+               if (!odev || odev->ifindex != fl4->flowi4_oif) {
+                       if (&rt->dst != dst)
+                               dst_release(&rt->dst);
                        continue;
+               }
 
+               if (dst != &rt->dst)
+                       dst_release(dst);
                dst = &rt->dst;
                break;
        }
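A walk-through of the new reference handling in the loop above (an illustrative reading of the code, not the changelog):

/* 1. the first route that resolves is cached in 'dst' as a fallback
 * 2. a candidate whose source address is not on the output interface is
 *    released immediately -- unless it happens to be that cached fallback
 * 3. when a matching candidate is found, the fallback is released and the
 *    candidate becomes the result
 *
 * so the caller ends up owning exactly one route reference, with rejected
 * candidates released along the way.
 */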
index 4feda2d0a8333eb6a280521b0dc187cae84e6621..548240dd15fcf018f81134ba0717327c13eb580e 100644 (file)
@@ -23,7 +23,7 @@ struct unx_cred {
 };
 #define uc_uid                 uc_base.cr_uid
 
-#define UNX_WRITESLACK         (21 + (UNX_MAXNODENAME >> 2))
+#define UNX_WRITESLACK         (21 + XDR_QUADLEN(UNX_MAXNODENAME))
 
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 # define RPCDBG_FACILITY       RPCDBG_AUTH
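A quick arithmetic check of the change: XDR_QUADLEN(n) is ((n) + 3) >> 2 in sunrpc/xdr.h, i.e. a byte count rounded up to whole 32-bit XDR words, whereas the old expression rounded down.

/* the two agree whenever the limit is a multiple of four and differ otherwise,
 * e.g. for a hypothetical 65-byte nodename limit:
 *
 *     old:  65 >> 2          == 16 words   (last partial word not counted)
 *     new:  XDR_QUADLEN(65)  == 17 words   (covers the whole string on the wire)
 */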
index f1e8dafbd5079b3406a769ba4854ecba229edca6..cb25c89da6239154475d6c31736e328d13f19134 100644 (file)
@@ -39,6 +39,25 @@ static int
 fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
            struct rpcrdma_create_data_internal *cdata)
 {
+       struct ib_device_attr *devattr = &ia->ri_devattr;
+       struct ib_mr *mr;
+
+       /* Obtain an lkey to use for the regbufs, which are
+        * protected from remote access.
+        */
+       if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
+               ia->ri_dma_lkey = ia->ri_device->local_dma_lkey;
+       } else {
+               mr = ib_get_dma_mr(ia->ri_pd, IB_ACCESS_LOCAL_WRITE);
+               if (IS_ERR(mr)) {
+                       pr_err("%s: ib_get_dma_mr failed with %lX\n",
+                              __func__, PTR_ERR(mr));
+                       return -ENOMEM;
+               }
+               ia->ri_dma_mr = mr;
+               ia->ri_dma_lkey = ia->ri_dma_mr->lkey;
+       }
+
        return 0;
 }
 
index 04ea914201b237cc6f42ce68caa6b5dbc7b29d59..d6653f5d0830378cd08531afb61c0b766ae8b6b9 100644 (file)
@@ -117,7 +117,7 @@ __frwr_recovery_worker(struct work_struct *work)
        if (ib_dereg_mr(r->r.frmr.fr_mr))
                goto out_fail;
 
-       r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(pd, depth);
+       r->r.frmr.fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth);
        if (IS_ERR(r->r.frmr.fr_mr))
                goto out_fail;
 
@@ -148,7 +148,7 @@ __frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device,
        struct rpcrdma_frmr *f = &r->r.frmr;
        int rc;
 
-       f->fr_mr = ib_alloc_fast_reg_mr(pd, depth);
+       f->fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth);
        if (IS_ERR(f->fr_mr))
                goto out_mr_err;
        f->fr_pgl = ib_alloc_fast_reg_page_list(device, depth);
@@ -158,7 +158,7 @@ __frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device,
 
 out_mr_err:
        rc = PTR_ERR(f->fr_mr);
-       dprintk("RPC:       %s: ib_alloc_fast_reg_mr status %i\n",
+       dprintk("RPC:       %s: ib_alloc_mr status %i\n",
                __func__, rc);
        return rc;
 
@@ -189,6 +189,11 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
        struct ib_device_attr *devattr = &ia->ri_devattr;
        int depth, delta;
 
+       /* Obtain an lkey to use for the regbufs, which are
+        * protected from remote access.
+        */
+       ia->ri_dma_lkey = ia->ri_device->local_dma_lkey;
+
        ia->ri_max_frmr_depth =
                        min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
                              devattr->max_fast_reg_page_list_len);
index 41985d07fdb744b5d9523b7c34af93c30f70522d..72cf8b15bbb4e331d49f937c58abd85f7dd70862 100644 (file)
@@ -23,6 +23,29 @@ static int
 physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
                 struct rpcrdma_create_data_internal *cdata)
 {
+       struct ib_device_attr *devattr = &ia->ri_devattr;
+       struct ib_mr *mr;
+
+       /* Obtain an rkey to use for RPC data payloads.
+        */
+       mr = ib_get_dma_mr(ia->ri_pd,
+                          IB_ACCESS_LOCAL_WRITE |
+                          IB_ACCESS_REMOTE_WRITE |
+                          IB_ACCESS_REMOTE_READ);
+       if (IS_ERR(mr)) {
+               pr_err("%s: ib_get_dma_mr failed with %lX\n",
+                      __func__, PTR_ERR(mr));
+               return -ENOMEM;
+       }
+       ia->ri_dma_mr = mr;
+
+       /* Obtain an lkey to use for regbufs.
+        */
+       if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)
+               ia->ri_dma_lkey = ia->ri_device->local_dma_lkey;
+       else
+               ia->ri_dma_lkey = ia->ri_dma_mr->lkey;
+
        return 0;
 }
 
@@ -51,7 +74,7 @@ physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
        struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 
        rpcrdma_map_one(ia->ri_device, seg, rpcrdma_data_dir(writing));
-       seg->mr_rkey = ia->ri_bind_mem->rkey;
+       seg->mr_rkey = ia->ri_dma_mr->rkey;
        seg->mr_base = seg->mr_dma;
        seg->mr_nsegs = 1;
        return 1;
index 84ea37daef36b0aa885c27e5eda950dda818949a..bc8bd6577467d5d2346b452a3c0d3c65411b3639 100644 (file)
@@ -71,6 +71,67 @@ static const char transfertypes[][12] = {
 };
 #endif
 
+/* The client can send a request inline as long as the RPCRDMA header
+ * plus the RPC call fit under the transport's inline limit. If the
+ * combined call message size exceeds that limit, the client must use
+ * the read chunk list for this operation.
+ */
+static bool rpcrdma_args_inline(struct rpc_rqst *rqst)
+{
+       unsigned int callsize = RPCRDMA_HDRLEN_MIN + rqst->rq_snd_buf.len;
+
+       return callsize <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst);
+}
+
+/* The client can't know how large the actual reply will be. Thus it
+ * plans for the largest possible reply for that particular ULP
+ * operation. If the maximum combined reply message size exceeds that
+ * limit, the client must provide a write list or a reply chunk for
+ * this request.
+ */
+static bool rpcrdma_results_inline(struct rpc_rqst *rqst)
+{
+       unsigned int repsize = RPCRDMA_HDRLEN_MIN + rqst->rq_rcv_buf.buflen;
+
+       return repsize <= RPCRDMA_INLINE_READ_THRESHOLD(rqst);
+}
+
+static int
+rpcrdma_tail_pullup(struct xdr_buf *buf)
+{
+       size_t tlen = buf->tail[0].iov_len;
+       size_t skip = tlen & 3;
+
+       /* Do not include the tail if it is only an XDR pad */
+       if (tlen < 4)
+               return 0;
+
+       /* xdr_write_pages() adds a pad at the beginning of the tail
+        * if the content in "buf->pages" is unaligned. Force the
+        * tail's actual content to land at the next XDR position
+        * after the head instead.
+        */
+       if (skip) {
+               unsigned char *src, *dst;
+               unsigned int count;
+
+               src = buf->tail[0].iov_base;
+               dst = buf->head[0].iov_base;
+               dst += buf->head[0].iov_len;
+
+               src += skip;
+               tlen -= skip;
+
+               dprintk("RPC:       %s: skip=%zu, memmove(%p, %p, %zu)\n",
+                       __func__, skip, dst, src, tlen);
+
+               for (count = tlen; count; count--)
+                       *dst++ = *src++;
+       }
+
+       return tlen;
+}
+
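Two illustrative data points for the new helpers; the 1024-byte figure below is only an example, the real limits come from RPCRDMA_INLINE_WRITE_THRESHOLD()/RPCRDMA_INLINE_READ_THRESHOLD() for the connection:

/* rpcrdma_args_inline():  header + rq_snd_buf.len vs the inline write limit
 *     e.g. minimal header + 900-byte call,   limit 1024 -> sent inline
 *          minimal header + 32 KB NFS WRITE, limit 1024 -> payload goes out
 *                                                          as a read chunk list
 *
 * rpcrdma_results_inline(): sized against rq_rcv_buf.buflen, the largest
 *     possible reply, because the client cannot know the actual reply size
 *     when it marshals the call.
 */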
 /*
  * Chunk assembly from upper layer xdr_buf.
  *
@@ -122,6 +183,10 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
        if (len && n == nsegs)
                return -EIO;
 
+       /* When encoding the read list, the tail is always sent inline */
+       if (type == rpcrdma_readch)
+               return n;
+
        if (xdrbuf->tail[0].iov_len) {
                /* the rpcrdma protocol allows us to omit any trailing
                 * xdr pad bytes, saving the server an RDMA operation. */
@@ -297,8 +362,7 @@ out:
  * pre-registered memory buffer for this request. For small amounts
  * of data, this is efficient. The cutoff value is tunable.
  */
-static int
-rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
+static void rpcrdma_inline_pullup(struct rpc_rqst *rqst)
 {
        int i, npages, curlen;
        int copy_len;
@@ -310,16 +374,9 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
        destp = rqst->rq_svec[0].iov_base;
        curlen = rqst->rq_svec[0].iov_len;
        destp += curlen;
-       /*
-        * Do optional padding where it makes sense. Alignment of write
-        * payload can help the server, if our setting is accurate.
-        */
-       pad -= (curlen + 36/*sizeof(struct rpcrdma_msg_padded)*/);
-       if (pad < 0 || rqst->rq_slen - curlen < RPCRDMA_INLINE_PAD_THRESH)
-               pad = 0;        /* don't pad this request */
 
-       dprintk("RPC:       %s: pad %d destp 0x%p len %d hdrlen %d\n",
-               __func__, pad, destp, rqst->rq_slen, curlen);
+       dprintk("RPC:       %s: destp 0x%p len %d hdrlen %d\n",
+               __func__, destp, rqst->rq_slen, curlen);
 
        copy_len = rqst->rq_snd_buf.page_len;
 
@@ -355,7 +412,6 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
                page_base = 0;
        }
        /* header now contains entire send message */
-       return pad;
 }
 
 /*
@@ -380,7 +436,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
        struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
        struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
        char *base;
-       size_t rpclen, padlen;
+       size_t rpclen;
        ssize_t hdrlen;
        enum rpcrdma_chunktype rtype, wtype;
        struct rpcrdma_msg *headerp;
@@ -402,28 +458,15 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
        /*
         * Chunks needed for results?
         *
+        * o Read ops return data as write chunk(s), header as inline.
         * o If the expected result is under the inline threshold, all ops
-        *   return as inline (but see later).
+        *   return as inline.
         * o Large non-read ops return as a single reply chunk.
-        * o Large read ops return data as write chunk(s), header as inline.
-        *
-        * Note: the NFS code sending down multiple result segments implies
-        * the op is one of read, readdir[plus], readlink or NFSv4 getacl.
-        */
-
-       /*
-        * This code can handle read chunks, write chunks OR reply
-        * chunks -- only one type. If the request is too big to fit
-        * inline, then we will choose read chunks. If the request is
-        * a READ, then use write chunks to separate the file data
-        * into pages; otherwise use reply chunks.
         */
-       if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst))
-               wtype = rpcrdma_noch;
-       else if (rqst->rq_rcv_buf.page_len == 0)
-               wtype = rpcrdma_replych;
-       else if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
+       if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
                wtype = rpcrdma_writech;
+       else if (rpcrdma_results_inline(rqst))
+               wtype = rpcrdma_noch;
        else
                wtype = rpcrdma_replych;
 
@@ -432,21 +475,25 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
         *
         * o If the total request is under the inline threshold, all ops
         *   are sent as inline.
-        * o Large non-write ops are sent with the entire message as a
-        *   single read chunk (protocol 0-position special case).
         * o Large write ops transmit data as read chunk(s), header as
         *   inline.
+        * o Large non-write ops are sent with the entire message as a
+        *   single read chunk (protocol 0-position special case).
         *
-        * Note: the NFS code sending down multiple argument segments
-        * implies the op is a write.
-        * TBD check NFSv4 setacl
+        * This assumes that the upper layer does not present a request
+        * that both has a data payload, and whose non-data arguments
+        * by themselves are larger than the inline threshold.
         */
-       if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
+       if (rpcrdma_args_inline(rqst)) {
                rtype = rpcrdma_noch;
-       else if (rqst->rq_snd_buf.page_len == 0)
-               rtype = rpcrdma_areadch;
-       else
+       } else if (rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
                rtype = rpcrdma_readch;
+       } else {
+               r_xprt->rx_stats.nomsg_call_count++;
+               headerp->rm_type = htonl(RDMA_NOMSG);
+               rtype = rpcrdma_areadch;
+               rpclen = 0;
+       }
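Summarising the resulting chunk selection (a reading of the code above, not an authoritative table):

/* call direction (rtype):                    reply direction (wtype):
 *   fits inline       -> rpcrdma_noch          XDRBUF_READ payload -> rpcrdma_writech
 *   XDRBUF_WRITE set  -> rpcrdma_readch        fits inline         -> rpcrdma_noch
 *   otherwise         -> rpcrdma_areadch       otherwise           -> rpcrdma_replych
 *                        (RDMA_NOMSG, rpclen
 *                         forced to 0)
 */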
 
        /* The following simplification is not true forever */
        if (rtype != rpcrdma_noch && wtype == rpcrdma_replych)
@@ -458,7 +505,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
        }
 
        hdrlen = RPCRDMA_HDRLEN_MIN;
-       padlen = 0;
 
        /*
         * Pull up any extra send data into the preregistered buffer.
@@ -467,45 +513,15 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
         */
        if (rtype == rpcrdma_noch) {
 
-               padlen = rpcrdma_inline_pullup(rqst,
-                                               RPCRDMA_INLINE_PAD_VALUE(rqst));
-
-               if (padlen) {
-                       headerp->rm_type = rdma_msgp;
-                       headerp->rm_body.rm_padded.rm_align =
-                               cpu_to_be32(RPCRDMA_INLINE_PAD_VALUE(rqst));
-                       headerp->rm_body.rm_padded.rm_thresh =
-                               cpu_to_be32(RPCRDMA_INLINE_PAD_THRESH);
-                       headerp->rm_body.rm_padded.rm_pempty[0] = xdr_zero;
-                       headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero;
-                       headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero;
-                       hdrlen += 2 * sizeof(u32); /* extra words in padhdr */
-                       if (wtype != rpcrdma_noch) {
-                               dprintk("RPC:       %s: invalid chunk list\n",
-                                       __func__);
-                               return -EIO;
-                       }
-               } else {
-                       headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero;
-                       headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero;
-                       headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero;
-                       /* new length after pullup */
-                       rpclen = rqst->rq_svec[0].iov_len;
-                       /*
-                        * Currently we try to not actually use read inline.
-                        * Reply chunks have the desirable property that
-                        * they land, packed, directly in the target buffers
-                        * without headers, so they require no fixup. The
-                        * additional RDMA Write op sends the same amount
-                        * of data, streams on-the-wire and adds no overhead
-                        * on receive. Therefore, we request a reply chunk
-                        * for non-writes wherever feasible and efficient.
-                        */
-                       if (wtype == rpcrdma_noch)
-                               wtype = rpcrdma_replych;
-               }
-       }
+               rpcrdma_inline_pullup(rqst);
 
+               headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero;
+               headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero;
+               headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero;
+               /* new length after pullup */
+               rpclen = rqst->rq_svec[0].iov_len;
+       } else if (rtype == rpcrdma_readch)
+               rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf);
        if (rtype != rpcrdma_noch) {
                hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf,
                                               headerp, rtype);
@@ -518,9 +534,9 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
        if (hdrlen < 0)
                return hdrlen;
 
-       dprintk("RPC:       %s: %s: hdrlen %zd rpclen %zd padlen %zd"
+       dprintk("RPC:       %s: %s: hdrlen %zd rpclen %zd"
                " headerp 0x%p base 0x%p lkey 0x%x\n",
-               __func__, transfertypes[wtype], hdrlen, rpclen, padlen,
+               __func__, transfertypes[wtype], hdrlen, rpclen,
                headerp, base, rdmab_lkey(req->rl_rdmabuf));
 
        /*
@@ -534,26 +550,15 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
        req->rl_send_iov[0].length = hdrlen;
        req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf);
 
+       req->rl_niovs = 1;
+       if (rtype == rpcrdma_areadch)
+               return 0;
+
        req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf);
        req->rl_send_iov[1].length = rpclen;
        req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf);
 
        req->rl_niovs = 2;
-
-       if (padlen) {
-               struct rpcrdma_ep *ep = &r_xprt->rx_ep;
-
-               req->rl_send_iov[2].addr = rdmab_addr(ep->rep_padbuf);
-               req->rl_send_iov[2].length = padlen;
-               req->rl_send_iov[2].lkey = rdmab_lkey(ep->rep_padbuf);
-
-               req->rl_send_iov[3].addr = req->rl_send_iov[1].addr + rpclen;
-               req->rl_send_iov[3].length = rqst->rq_slen - rpclen;
-               req->rl_send_iov[3].lkey = rdmab_lkey(req->rl_sendbuf);
-
-               req->rl_niovs = 4;
-       }
-
        return 0;
 }
 
index 2e1348bde325de89d22b8e5fcbcc897d50eba98f..cb51742840740f790d24797e585e7fb520646a09 100644 (file)
@@ -115,15 +115,6 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
        rqstp->rq_arg.tail[0].iov_len = 0;
 }
 
-static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count)
-{
-       if (!rdma_cap_read_multi_sge(xprt->sc_cm_id->device,
-                                    xprt->sc_cm_id->port_num))
-               return 1;
-       else
-               return min_t(int, sge_count, xprt->sc_max_sge);
-}
-
 /* Issue an RDMA_READ using the local lkey to map the data sink */
 int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
                        struct svc_rqst *rqstp,
@@ -144,8 +135,7 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
 
        ctxt->direction = DMA_FROM_DEVICE;
        ctxt->read_hdr = head;
-       pages_needed =
-               min_t(int, pages_needed, rdma_read_max_sge(xprt, pages_needed));
+       pages_needed = min_t(int, pages_needed, xprt->sc_max_sge_rd);
        read = min_t(int, pages_needed << PAGE_SHIFT, rs_length);
 
        for (pno = 0; pno < pages_needed; pno++) {
index 21e40365042ca2c4756e87f5b8dfa09912adc072..fcc3eb80c265456d88bfe733ac9d6c1301d24e92 100644 (file)
@@ -734,17 +734,19 @@ static struct svc_rdma_fastreg_mr *rdma_alloc_frmr(struct svcxprt_rdma *xprt)
        struct ib_mr *mr;
        struct ib_fast_reg_page_list *pl;
        struct svc_rdma_fastreg_mr *frmr;
+       u32 num_sg;
 
        frmr = kmalloc(sizeof(*frmr), GFP_KERNEL);
        if (!frmr)
                goto err;
 
-       mr = ib_alloc_fast_reg_mr(xprt->sc_pd, RPCSVC_MAXPAGES);
+       num_sg = min_t(u32, RPCSVC_MAXPAGES, xprt->sc_frmr_pg_list_len);
+       mr = ib_alloc_mr(xprt->sc_pd, IB_MR_TYPE_MEM_REG, num_sg);
        if (IS_ERR(mr))
                goto err_free_frmr;
 
        pl = ib_alloc_fast_reg_page_list(xprt->sc_cm_id->device,
-                                        RPCSVC_MAXPAGES);
+                                        num_sg);
        if (IS_ERR(pl))
                goto err_free_mr;
 
@@ -873,6 +875,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
         * capabilities of this particular device */
        newxprt->sc_max_sge = min((size_t)devattr.max_sge,
                                  (size_t)RPCSVC_MAXPAGES);
+       newxprt->sc_max_sge_rd = min_t(size_t, devattr.max_sge_rd,
+                                      RPCSVC_MAXPAGES);
        newxprt->sc_max_requests = min((size_t)devattr.max_qp_wr,
                                   (size_t)svcrdma_max_requests);
        newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_max_requests;
@@ -1047,6 +1051,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
                "    remote_ip       : %pI4\n"
                "    remote_port     : %d\n"
                "    max_sge         : %d\n"
+               "    max_sge_rd      : %d\n"
                "    sq_depth        : %d\n"
                "    max_requests    : %d\n"
                "    ord             : %d\n",
@@ -1060,6 +1065,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
                ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id->
                       route.addr.dst_addr)->sin_port),
                newxprt->sc_max_sge,
+               newxprt->sc_max_sge_rd,
                newxprt->sc_sq_depth,
                newxprt->sc_max_requests,
                newxprt->sc_ord);
index 680f888a9ddd045314b305ef772385c7c6d5624e..64443eb754ad0fe7fd0b16633c3aa10cebdc3e26 100644 (file)
@@ -175,10 +175,8 @@ xprt_rdma_format_addresses6(struct rpc_xprt *xprt, struct sockaddr *sap)
 }
 
 static void
-xprt_rdma_format_addresses(struct rpc_xprt *xprt)
+xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap)
 {
-       struct sockaddr *sap = (struct sockaddr *)
-                                       &rpcx_to_rdmad(xprt).addr;
        char buf[128];
 
        switch (sap->sa_family) {
@@ -302,7 +300,7 @@ xprt_setup_rdma(struct xprt_create *args)
        struct rpc_xprt *xprt;
        struct rpcrdma_xprt *new_xprt;
        struct rpcrdma_ep *new_ep;
-       struct sockaddr_in *sin;
+       struct sockaddr *sap;
        int rc;
 
        if (args->addrlen > sizeof(xprt->addr)) {
@@ -333,26 +331,20 @@ xprt_setup_rdma(struct xprt_create *args)
         * Set up RDMA-specific connect data.
         */
 
-       /* Put server RDMA address in local cdata */
-       memcpy(&cdata.addr, args->dstaddr, args->addrlen);
+       sap = (struct sockaddr *)&cdata.addr;
+       memcpy(sap, args->dstaddr, args->addrlen);
 
        /* Ensure xprt->addr holds valid server TCP (not RDMA)
         * address, for any side protocols which peek at it */
        xprt->prot = IPPROTO_TCP;
        xprt->addrlen = args->addrlen;
-       memcpy(&xprt->addr, &cdata.addr, xprt->addrlen);
+       memcpy(&xprt->addr, sap, xprt->addrlen);
 
-       sin = (struct sockaddr_in *)&cdata.addr;
-       if (ntohs(sin->sin_port) != 0)
+       if (rpc_get_port(sap))
                xprt_set_bound(xprt);
 
-       dprintk("RPC:       %s: %pI4:%u\n",
-               __func__, &sin->sin_addr.s_addr, ntohs(sin->sin_port));
-
-       /* Set max requests */
        cdata.max_requests = xprt->max_reqs;
 
-       /* Set some length limits */
        cdata.rsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA write max */
        cdata.wsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA read max */
 
@@ -375,8 +367,7 @@ xprt_setup_rdma(struct xprt_create *args)
 
        new_xprt = rpcx_to_rdmax(xprt);
 
-       rc = rpcrdma_ia_open(new_xprt, (struct sockaddr *) &cdata.addr,
-                               xprt_rdma_memreg_strategy);
+       rc = rpcrdma_ia_open(new_xprt, sap, xprt_rdma_memreg_strategy);
        if (rc)
                goto out1;
 
@@ -409,7 +400,7 @@ xprt_setup_rdma(struct xprt_create *args)
        INIT_DELAYED_WORK(&new_xprt->rx_connect_worker,
                          xprt_rdma_connect_worker);
 
-       xprt_rdma_format_addresses(xprt);
+       xprt_rdma_format_addresses(xprt, sap);
        xprt->max_payload = new_xprt->rx_ia.ri_ops->ro_maxpages(new_xprt);
        if (xprt->max_payload == 0)
                goto out4;
@@ -420,6 +411,9 @@ xprt_setup_rdma(struct xprt_create *args)
        if (!try_module_get(THIS_MODULE))
                goto out4;
 
+       dprintk("RPC:       %s: %s:%s\n", __func__,
+               xprt->address_strings[RPC_DISPLAY_ADDR],
+               xprt->address_strings[RPC_DISPLAY_PORT]);
        return xprt;
 
 out4:
@@ -653,31 +647,30 @@ static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
        if (xprt_connected(xprt))
                idle_time = (long)(jiffies - xprt->last_used) / HZ;
 
-       seq_printf(seq,
-         "\txprt:\trdma %u %lu %lu %lu %ld %lu %lu %lu %Lu %Lu "
-         "%lu %lu %lu %Lu %Lu %Lu %Lu %lu %lu %lu\n",
-
-          0,   /* need a local port? */
-          xprt->stat.bind_count,
-          xprt->stat.connect_count,
-          xprt->stat.connect_time,
-          idle_time,
-          xprt->stat.sends,
-          xprt->stat.recvs,
-          xprt->stat.bad_xids,
-          xprt->stat.req_u,
-          xprt->stat.bklog_u,
-
-          r_xprt->rx_stats.read_chunk_count,
-          r_xprt->rx_stats.write_chunk_count,
-          r_xprt->rx_stats.reply_chunk_count,
-          r_xprt->rx_stats.total_rdma_request,
-          r_xprt->rx_stats.total_rdma_reply,
-          r_xprt->rx_stats.pullup_copy_count,
-          r_xprt->rx_stats.fixup_copy_count,
-          r_xprt->rx_stats.hardway_register_count,
-          r_xprt->rx_stats.failed_marshal_count,
-          r_xprt->rx_stats.bad_reply_count);
+       seq_puts(seq, "\txprt:\trdma ");
+       seq_printf(seq, "%u %lu %lu %lu %ld %lu %lu %lu %llu %llu ",
+                  0,   /* need a local port? */
+                  xprt->stat.bind_count,
+                  xprt->stat.connect_count,
+                  xprt->stat.connect_time,
+                  idle_time,
+                  xprt->stat.sends,
+                  xprt->stat.recvs,
+                  xprt->stat.bad_xids,
+                  xprt->stat.req_u,
+                  xprt->stat.bklog_u);
+       seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu %lu\n",
+                  r_xprt->rx_stats.read_chunk_count,
+                  r_xprt->rx_stats.write_chunk_count,
+                  r_xprt->rx_stats.reply_chunk_count,
+                  r_xprt->rx_stats.total_rdma_request,
+                  r_xprt->rx_stats.total_rdma_reply,
+                  r_xprt->rx_stats.pullup_copy_count,
+                  r_xprt->rx_stats.fixup_copy_count,
+                  r_xprt->rx_stats.hardway_register_count,
+                  r_xprt->rx_stats.failed_marshal_count,
+                  r_xprt->rx_stats.bad_reply_count,
+                  r_xprt->rx_stats.nomsg_call_count);
 }
 
 static int
index 891c4ede2c20ea8d8c6bc79ee080f353d4df13d7..682996779970c6ccae749c9de566f06a9b205c80 100644 (file)
@@ -52,6 +52,7 @@
 #include <linux/prefetch.h>
 #include <linux/sunrpc/addr.h>
 #include <asm/bitops.h>
+#include <linux/module.h> /* try_module_get()/module_put() */
 
 #include "xprt_rdma.h"
 
@@ -414,6 +415,14 @@ connected:
        return 0;
 }
 
+static void rpcrdma_destroy_id(struct rdma_cm_id *id)
+{
+       if (id) {
+               module_put(id->device->owner);
+               rdma_destroy_id(id);
+       }
+}
+
 static struct rdma_cm_id *
 rpcrdma_create_id(struct rpcrdma_xprt *xprt,
                        struct rpcrdma_ia *ia, struct sockaddr *addr)
@@ -440,6 +449,17 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
        }
        wait_for_completion_interruptible_timeout(&ia->ri_done,
                                msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
+
+       /* FIXME:
+        * Until xprtrdma supports DEVICE_REMOVAL, the provider must
+        * be pinned while there are active NFS/RDMA mounts to prevent
+        * hangs and crashes at umount time.
+        */
+       if (!ia->ri_async_rc && !try_module_get(id->device->owner)) {
+               dprintk("RPC:       %s: Failed to get device module\n",
+                       __func__);
+               ia->ri_async_rc = -ENODEV;
+       }
        rc = ia->ri_async_rc;
        if (rc)
                goto out;
@@ -449,16 +469,17 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
        if (rc) {
                dprintk("RPC:       %s: rdma_resolve_route() failed %i\n",
                        __func__, rc);
-               goto out;
+               goto put;
        }
        wait_for_completion_interruptible_timeout(&ia->ri_done,
                                msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
        rc = ia->ri_async_rc;
        if (rc)
-               goto out;
+               goto put;
 
        return id;
-
+put:
+       module_put(id->device->owner);
 out:
        rdma_destroy_id(id);
        return ERR_PTR(rc);
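The FIXME above spells out the reason for the new helper: until xprtrdma handles DEVICE_REMOVAL, the underlying verbs provider module is pinned with try_module_get() once the RDMA address resolves, and rpcrdma_destroy_id() pairs the matching module_put() with rdma_destroy_id() so every teardown path, including the new put: error label, stays balanced. A minimal user-space analogue of that pin/unpin pairing, with invented names, might look like:

    /* Illustrative user-space analogue, names invented: pin a provider
     * while a connection id references it; unpin in one destroy helper. */
    #include <stdbool.h>

    struct provider { int refcount; bool unloading; };
    struct cm_id    { struct provider *owner; };

    /* stand-ins for try_module_get()/module_put() */
    static bool provider_try_get(struct provider *p)
    {
        if (p->unloading)
            return false;
        p->refcount++;
        return true;
    }

    static void provider_put(struct provider *p)
    {
        p->refcount--;
    }

    /* mirrors rpcrdma_destroy_id(): drop the pin, then destroy the id */
    static void destroy_id(struct cm_id *id)
    {
        if (id) {
            provider_put(id->owner);
            /* rdma_destroy_id(id) would follow here */
        }
    }

    int main(void)
    {
        struct provider mlx = { 0, false };
        struct cm_id id = { &mlx };

        if (provider_try_get(id.owner)) /* pin once the address resolves */
            destroy_id(&id);            /* unpin at teardown */
        return mlx.refcount;            /* 0: pin and unpin are balanced */
    }
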
@@ -493,9 +514,11 @@ rpcrdma_clean_cq(struct ib_cq *cq)
 int
 rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
 {
-       int rc, mem_priv;
        struct rpcrdma_ia *ia = &xprt->rx_ia;
        struct ib_device_attr *devattr = &ia->ri_devattr;
+       int rc;
+
+       ia->ri_dma_mr = NULL;
 
        ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
        if (IS_ERR(ia->ri_id)) {
@@ -519,11 +542,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
                goto out3;
        }
 
-       if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
-               ia->ri_have_dma_lkey = 1;
-               ia->ri_dma_lkey = ia->ri_device->local_dma_lkey;
-       }
-
        if (memreg == RPCRDMA_FRMR) {
                /* Requires both frmr reg and local dma lkey */
                if (((devattr->device_cap_flags &
@@ -539,42 +557,19 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
                if (!ia->ri_device->alloc_fmr) {
                        dprintk("RPC:       %s: MTHCAFMR registration "
                                "not supported by HCA\n", __func__);
-                       memreg = RPCRDMA_ALLPHYSICAL;
+                       goto out3;
                }
        }
 
-       /*
-        * Optionally obtain an underlying physical identity mapping in
-        * order to do a memory window-based bind. This base registration
-        * is protected from remote access - that is enabled only by binding
-        * for the specific bytes targeted during each RPC operation, and
-        * revoked after the corresponding completion similar to a storage
-        * adapter.
-        */
        switch (memreg) {
        case RPCRDMA_FRMR:
                ia->ri_ops = &rpcrdma_frwr_memreg_ops;
                break;
        case RPCRDMA_ALLPHYSICAL:
                ia->ri_ops = &rpcrdma_physical_memreg_ops;
-               mem_priv = IB_ACCESS_LOCAL_WRITE |
-                               IB_ACCESS_REMOTE_WRITE |
-                               IB_ACCESS_REMOTE_READ;
-               goto register_setup;
+               break;
        case RPCRDMA_MTHCAFMR:
                ia->ri_ops = &rpcrdma_fmr_memreg_ops;
-               if (ia->ri_have_dma_lkey)
-                       break;
-               mem_priv = IB_ACCESS_LOCAL_WRITE;
-       register_setup:
-               ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
-               if (IS_ERR(ia->ri_bind_mem)) {
-                       printk(KERN_ALERT "%s: ib_get_dma_mr for "
-                               "phys register failed with %lX\n",
-                               __func__, PTR_ERR(ia->ri_bind_mem));
-                       rc = -ENOMEM;
-                       goto out3;
-               }
                break;
        default:
                printk(KERN_ERR "RPC: Unsupported memory "
@@ -592,7 +587,7 @@ out3:
        ib_dealloc_pd(ia->ri_pd);
        ia->ri_pd = NULL;
 out2:
-       rdma_destroy_id(ia->ri_id);
+       rpcrdma_destroy_id(ia->ri_id);
        ia->ri_id = NULL;
 out1:
        return rc;
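Two things change in rpcrdma_ia_open(): missing FMR support is now a hard failure for the MTHCAFMR strategy (the old silent fallback to RPCRDMA_ALLPHYSICAL becomes a jump to out3), and the ib_get_dma_mr() setup disappears from this function, with the new ia->ri_dma_mr field initialised to NULL here and released in the endpoint code later in this diff, presumably because the DMA MR is now handled behind the per-strategy ri_ops. The error path keeps the usual reverse-order unwind, out3 releasing the PD and out2 the CM ID via the new helper. A small stand-alone sketch of that goto-unwind idiom:

    /* Stand-alone sketch of the out3/out2/out1 unwind idiom: error paths
     * jump to the label that releases everything acquired so far. */
    #include <stdlib.h>

    struct thing { void *a, *b, *c; };

    static int thing_open(struct thing *t)
    {
        t->a = malloc(16);          /* first resource (cf. the CM ID)  */
        if (!t->a)
            goto out1;
        t->b = malloc(16);          /* second resource (cf. the PD)    */
        if (!t->b)
            goto out2;
        t->c = malloc(16);          /* any later setup step            */
        if (!t->c)
            goto out3;
        return 0;

    out3:
        free(t->b);
    out2:
        free(t->a);
    out1:
        return -1;
    }

    int main(void)
    {
        struct thing t;

        if (thing_open(&t))
            return 1;
        free(t.c);
        free(t.b);
        free(t.a);
        return 0;
    }
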
@@ -606,25 +601,17 @@ out1:
 void
 rpcrdma_ia_close(struct rpcrdma_ia *ia)
 {
-       int rc;
-
        dprintk("RPC:       %s: entering\n", __func__);
-       if (ia->ri_bind_mem != NULL) {
-               rc = ib_dereg_mr(ia->ri_bind_mem);
-               dprintk("RPC:       %s: ib_dereg_mr returned %i\n",
-                       __func__, rc);
-       }
-
        if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
                if (ia->ri_id->qp)
                        rdma_destroy_qp(ia->ri_id);
-               rdma_destroy_id(ia->ri_id);
+               rpcrdma_destroy_id(ia->ri_id);
                ia->ri_id = NULL;
        }
 
        /* If the pd is still busy, xprtrdma missed freeing a resource */
        if (ia->ri_pd && !IS_ERR(ia->ri_pd))
-               WARN_ON(ib_dealloc_pd(ia->ri_pd));
+               ib_dealloc_pd(ia->ri_pd);
 }
 
 /*
@@ -639,6 +626,12 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
        struct ib_cq_init_attr cq_attr = {};
        int rc, err;
 
+       if (devattr->max_sge < RPCRDMA_MAX_IOVS) {
+               dprintk("RPC:       %s: insufficient sge's available\n",
+                       __func__);
+               return -ENOMEM;
+       }
+
        /* check provider's send/recv wr limits */
        if (cdata->max_requests > devattr->max_qp_wr)
                cdata->max_requests = devattr->max_qp_wr;
@@ -651,21 +644,13 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
        if (rc)
                return rc;
        ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
-       ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
+       ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_IOVS;
        ep->rep_attr.cap.max_recv_sge = 1;
        ep->rep_attr.cap.max_inline_data = 0;
        ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
        ep->rep_attr.qp_type = IB_QPT_RC;
        ep->rep_attr.port_num = ~0;
 
-       if (cdata->padding) {
-               ep->rep_padbuf = rpcrdma_alloc_regbuf(ia, cdata->padding,
-                                                     GFP_KERNEL);
-               if (IS_ERR(ep->rep_padbuf))
-                       return PTR_ERR(ep->rep_padbuf);
-       } else
-               ep->rep_padbuf = NULL;
-
        dprintk("RPC:       %s: requested max: dtos: send %d recv %d; "
                "iovs: send %d recv %d\n",
                __func__,
@@ -748,7 +733,8 @@ out2:
                dprintk("RPC:       %s: ib_destroy_cq returned %i\n",
                        __func__, err);
 out1:
-       rpcrdma_free_regbuf(ia, ep->rep_padbuf);
+       if (ia->ri_dma_mr)
+               ib_dereg_mr(ia->ri_dma_mr);
        return rc;
 }
 
@@ -775,8 +761,6 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
                ia->ri_id->qp = NULL;
        }
 
-       rpcrdma_free_regbuf(ia, ep->rep_padbuf);
-
        rpcrdma_clean_cq(ep->rep_attr.recv_cq);
        rc = ib_destroy_cq(ep->rep_attr.recv_cq);
        if (rc)
@@ -788,6 +772,12 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
        if (rc)
                dprintk("RPC:       %s: ib_destroy_cq returned %i\n",
                        __func__, rc);
+
+       if (ia->ri_dma_mr) {
+               rc = ib_dereg_mr(ia->ri_dma_mr);
+               dprintk("RPC:       %s: ib_dereg_mr returned %i\n",
+                       __func__, rc);
+       }
 }
 
 /*
@@ -825,7 +815,7 @@ retry:
                if (ia->ri_device != id->device) {
                        printk("RPC:       %s: can't reconnect on "
                                "different device!\n", __func__);
-                       rdma_destroy_id(id);
+                       rpcrdma_destroy_id(id);
                        rc = -ENETUNREACH;
                        goto out;
                }
@@ -834,7 +824,7 @@ retry:
                if (rc) {
                        dprintk("RPC:       %s: rdma_create_qp failed %i\n",
                                __func__, rc);
-                       rdma_destroy_id(id);
+                       rpcrdma_destroy_id(id);
                        rc = -ENETUNREACH;
                        goto out;
                }
@@ -845,7 +835,7 @@ retry:
                write_unlock(&ia->ri_qplock);
 
                rdma_destroy_qp(old);
-               rdma_destroy_id(old);
+               rpcrdma_destroy_id(old);
        } else {
                dprintk("RPC:       %s: connecting...\n", __func__);
                rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
@@ -1229,75 +1219,6 @@ rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg)
                (unsigned long long)seg->mr_dma, seg->mr_dmalen);
 }
 
-static int
-rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
-                               struct ib_mr **mrp, struct ib_sge *iov)
-{
-       struct ib_phys_buf ipb;
-       struct ib_mr *mr;
-       int rc;
-
-       /*
-        * All memory passed here was kmalloc'ed, therefore phys-contiguous.
-        */
-       iov->addr = ib_dma_map_single(ia->ri_device,
-                       va, len, DMA_BIDIRECTIONAL);
-       if (ib_dma_mapping_error(ia->ri_device, iov->addr))
-               return -ENOMEM;
-
-       iov->length = len;
-
-       if (ia->ri_have_dma_lkey) {
-               *mrp = NULL;
-               iov->lkey = ia->ri_dma_lkey;
-               return 0;
-       } else if (ia->ri_bind_mem != NULL) {
-               *mrp = NULL;
-               iov->lkey = ia->ri_bind_mem->lkey;
-               return 0;
-       }
-
-       ipb.addr = iov->addr;
-       ipb.size = iov->length;
-       mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
-                       IB_ACCESS_LOCAL_WRITE, &iov->addr);
-
-       dprintk("RPC:       %s: phys convert: 0x%llx "
-                       "registered 0x%llx length %d\n",
-                       __func__, (unsigned long long)ipb.addr,
-                       (unsigned long long)iov->addr, len);
-
-       if (IS_ERR(mr)) {
-               *mrp = NULL;
-               rc = PTR_ERR(mr);
-               dprintk("RPC:       %s: failed with %i\n", __func__, rc);
-       } else {
-               *mrp = mr;
-               iov->lkey = mr->lkey;
-               rc = 0;
-       }
-
-       return rc;
-}
-
-static int
-rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
-                               struct ib_mr *mr, struct ib_sge *iov)
-{
-       int rc;
-
-       ib_dma_unmap_single(ia->ri_device,
-                           iov->addr, iov->length, DMA_BIDIRECTIONAL);
-
-       if (NULL == mr)
-               return 0;
-
-       rc = ib_dereg_mr(mr);
-       if (rc)
-               dprintk("RPC:       %s: ib_dereg_mr failed %i\n", __func__, rc);
-       return rc;
-}
-
 /**
  * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers
  * @ia: controlling rpcrdma_ia
@@ -1317,26 +1238,29 @@ struct rpcrdma_regbuf *
 rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags)
 {
        struct rpcrdma_regbuf *rb;
-       int rc;
+       struct ib_sge *iov;
 
-       rc = -ENOMEM;
        rb = kmalloc(sizeof(*rb) + size, flags);
        if (rb == NULL)
                goto out;
 
-       rb->rg_size = size;
-       rb->rg_owner = NULL;
-       rc = rpcrdma_register_internal(ia, rb->rg_base, size,
-                                      &rb->rg_mr, &rb->rg_iov);
-       if (rc)
+       iov = &rb->rg_iov;
+       iov->addr = ib_dma_map_single(ia->ri_device,
+                                     (void *)rb->rg_base, size,
+                                     DMA_BIDIRECTIONAL);
+       if (ib_dma_mapping_error(ia->ri_device, iov->addr))
                goto out_free;
 
+       iov->length = size;
+       iov->lkey = ia->ri_dma_lkey;
+       rb->rg_size = size;
+       rb->rg_owner = NULL;
        return rb;
 
 out_free:
        kfree(rb);
 out:
-       return ERR_PTR(rc);
+       return ERR_PTR(-ENOMEM);
 }
 
 /**
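rpcrdma_alloc_regbuf() no longer registers a memory region per buffer: it DMA-maps the kmalloc'ed buffer directly and fills the SGE with the pre-existing ri_dma_lkey, and the only failure it can report is -ENOMEM, returned through the usual ERR_PTR() encoding that callers test with IS_ERR(). A minimal user-space sketch of that ERR_PTR()/IS_ERR() convention (simplified; MAX_ERRNO matches the kernel's value):

    /* Minimal user-space sketch of the ERR_PTR()/IS_ERR() convention
     * used by rpcrdma_alloc_regbuf(). */
    #include <errno.h>
    #include <stdint.h>
    #include <stdlib.h>

    #define MAX_ERRNO 4095

    static void *ERR_PTR(long error)      { return (void *)error; }
    static long  PTR_ERR(const void *ptr) { return (long)ptr; }
    static int   IS_ERR(const void *ptr)
    {
        return (uintptr_t)ptr >= (uintptr_t)-MAX_ERRNO;
    }

    static void *alloc_regbuf(size_t size)
    {
        void *rb = malloc(size);

        return rb ? rb : ERR_PTR(-ENOMEM); /* errno encoded in the pointer */
    }

    int main(void)
    {
        void *rb = alloc_regbuf(128);

        if (IS_ERR(rb))
            return (int)-PTR_ERR(rb);      /* 12 (ENOMEM) on failure */
        free(rb);
        return 0;
    }
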
@@ -1347,10 +1271,15 @@ out:
 void
 rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
 {
-       if (rb) {
-               rpcrdma_deregister_internal(ia, rb->rg_mr, &rb->rg_iov);
-               kfree(rb);
-       }
+       struct ib_sge *iov;
+
+       if (!rb)
+               return;
+
+       iov = &rb->rg_iov;
+       ib_dma_unmap_single(ia->ri_device,
+                           iov->addr, iov->length, DMA_BIDIRECTIONAL);
+       kfree(rb);
 }
 
 /*
@@ -1363,9 +1292,11 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
                struct rpcrdma_ep *ep,
                struct rpcrdma_req *req)
 {
+       struct ib_device *device = ia->ri_device;
        struct ib_send_wr send_wr, *send_wr_fail;
        struct rpcrdma_rep *rep = req->rl_reply;
-       int rc;
+       struct ib_sge *iov = req->rl_send_iov;
+       int i, rc;
 
        if (rep) {
                rc = rpcrdma_ep_post_recv(ia, ep, rep);
@@ -1376,22 +1307,15 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
 
        send_wr.next = NULL;
        send_wr.wr_id = RPCRDMA_IGNORE_COMPLETION;
-       send_wr.sg_list = req->rl_send_iov;
+       send_wr.sg_list = iov;
        send_wr.num_sge = req->rl_niovs;
        send_wr.opcode = IB_WR_SEND;
-       if (send_wr.num_sge == 4)       /* no need to sync any pad (constant) */
-               ib_dma_sync_single_for_device(ia->ri_device,
-                                             req->rl_send_iov[3].addr,
-                                             req->rl_send_iov[3].length,
-                                             DMA_TO_DEVICE);
-       ib_dma_sync_single_for_device(ia->ri_device,
-                                     req->rl_send_iov[1].addr,
-                                     req->rl_send_iov[1].length,
-                                     DMA_TO_DEVICE);
-       ib_dma_sync_single_for_device(ia->ri_device,
-                                     req->rl_send_iov[0].addr,
-                                     req->rl_send_iov[0].length,
-                                     DMA_TO_DEVICE);
+
+       for (i = 0; i < send_wr.num_sge; i++)
+               ib_dma_sync_single_for_device(device, iov[i].addr,
+                                             iov[i].length, DMA_TO_DEVICE);
+       dprintk("RPC:       %s: posting %d s/g entries\n",
+               __func__, send_wr.num_sge);
 
        if (DECR_CQCOUNT(ep) > 0)
                send_wr.send_flags = 0;
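With the pad buffer gone, the hard-coded syncs of send SGEs 0, 1 and the optional pad entry 3 collapse into a loop that syncs every posted SGE for the device, of which there are now at most RPCRDMA_MAX_IOVS (two, presumably the RDMA header and the inline RPC message; see the header changes below). A trivial user-space sketch of that loop, with made-up addresses:

    /* Trivial user-space sketch of the per-SGE sync loop;
     * sync_for_device() stands in for ib_dma_sync_single_for_device(). */
    #include <stdio.h>

    struct sge { unsigned long addr; unsigned int length; };

    static void sync_for_device(unsigned long addr, unsigned int length)
    {
        printf("sync addr=%#lx len=%u\n", addr, length);
    }

    int main(void)
    {
        /* at most RPCRDMA_MAX_IOVS (2) entries per send in the new code */
        struct sge send_iov[2] = { { 0x1000, 64 }, { 0x2000, 512 } };
        int num_sge = 2, i;

        for (i = 0; i < num_sge; i++)
            sync_for_device(send_iov[i].addr, send_iov[i].length);
        return 0;
    }
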
index e718d0959af34207082211b5a3b0c69bc5d779eb..02512221b8bc885dde93b987c561b344f6e96722 100644 (file)
@@ -64,9 +64,8 @@ struct rpcrdma_ia {
        struct ib_device        *ri_device;
        struct rdma_cm_id       *ri_id;
        struct ib_pd            *ri_pd;
-       struct ib_mr            *ri_bind_mem;
+       struct ib_mr            *ri_dma_mr;
        u32                     ri_dma_lkey;
-       int                     ri_have_dma_lkey;
        struct completion       ri_done;
        int                     ri_async_rc;
        unsigned int            ri_max_frmr_depth;
@@ -88,7 +87,6 @@ struct rpcrdma_ep {
        int                     rep_connected;
        struct ib_qp_init_attr  rep_attr;
        wait_queue_head_t       rep_connect_wait;
-       struct rpcrdma_regbuf   *rep_padbuf;
        struct rdma_conn_param  rep_remote_cma;
        struct sockaddr_storage rep_remote_addr;
        struct delayed_work     rep_connect_worker;
@@ -118,7 +116,6 @@ struct rpcrdma_ep {
 struct rpcrdma_regbuf {
        size_t                  rg_size;
        struct rpcrdma_req      *rg_owner;
-       struct ib_mr            *rg_mr;
        struct ib_sge           rg_iov;
        __be32                  rg_base[0] __attribute__ ((aligned(256)));
 };
@@ -164,8 +161,7 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
  * struct rpcrdma_buffer. N is the max number of outstanding requests.
  */
 
-/* temporary static scatter/gather max */
-#define RPCRDMA_MAX_DATA_SEGS  (64)    /* max scatter/gather */
+#define RPCRDMA_MAX_DATA_SEGS  ((1 * 1024 * 1024) / PAGE_SIZE)
 #define RPCRDMA_MAX_SEGS       (RPCRDMA_MAX_DATA_SEGS + 2) /* head+tail = 2 */
 
 struct rpcrdma_buffer;
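The scatter/gather ceiling stops being an arbitrary constant and is instead sized for a 1 MiB transfer in page-sized segments; on 4 KiB pages that raises the limit from 64 to 256 data segments (258 once head and tail are counted). A quick check of the arithmetic, assuming PAGE_SIZE is 4096:

    /* Quick arithmetic check, assuming 4 KiB pages (PAGE_SIZE is
     * architecture-dependent in the kernel). */
    #include <stdio.h>

    #define PAGE_SIZE 4096
    #define RPCRDMA_MAX_DATA_SEGS ((1 * 1024 * 1024) / PAGE_SIZE)
    #define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 2)

    int main(void)
    {
        /* 1 MiB / 4 KiB = 256 data segments, 258 with head and tail */
        printf("%d data, %d total\n",
               RPCRDMA_MAX_DATA_SEGS, RPCRDMA_MAX_SEGS);
        return 0;
    }
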
@@ -257,16 +253,18 @@ struct rpcrdma_mr_seg {           /* chunk descriptors */
        char            *mr_offset;     /* kva if no page, else offset */
 };
 
+#define RPCRDMA_MAX_IOVS       (2)
+
 struct rpcrdma_req {
-       unsigned int    rl_niovs;       /* 0, 2 or 4 */
-       unsigned int    rl_nchunks;     /* non-zero if chunks */
-       unsigned int    rl_connect_cookie;      /* retry detection */
-       struct rpcrdma_buffer *rl_buffer; /* home base for this structure */
+       unsigned int            rl_niovs;
+       unsigned int            rl_nchunks;
+       unsigned int            rl_connect_cookie;
+       struct rpcrdma_buffer   *rl_buffer;
        struct rpcrdma_rep      *rl_reply;/* holder for reply buffer */
-       struct ib_sge   rl_send_iov[4]; /* for active requests */
-       struct rpcrdma_regbuf *rl_rdmabuf;
-       struct rpcrdma_regbuf *rl_sendbuf;
-       struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];
+       struct ib_sge           rl_send_iov[RPCRDMA_MAX_IOVS];
+       struct rpcrdma_regbuf   *rl_rdmabuf;
+       struct rpcrdma_regbuf   *rl_sendbuf;
+       struct rpcrdma_mr_seg   rl_segments[RPCRDMA_MAX_SEGS];
 };
 
 static inline struct rpcrdma_req *
@@ -341,6 +339,7 @@ struct rpcrdma_stats {
        unsigned long           hardway_register_count;
        unsigned long           failed_marshal_count;
        unsigned long           bad_reply_count;
+       unsigned long           nomsg_call_count;
 };
 
 /*
index 0030376327b77f0a08d4af887286a0616a60069e..7be90bc1a7c26c2c4b998e4b21ca319df19f9638 100644 (file)
@@ -822,6 +822,8 @@ static void xs_reset_transport(struct sock_xprt *transport)
        if (atomic_read(&transport->xprt.swapper))
                sk_clear_memalloc(sk);
 
+       kernel_sock_shutdown(sock, SHUT_RDWR);
+
        write_lock_bh(&sk->sk_callback_lock);
        transport->inet = NULL;
        transport->sock = NULL;
@@ -829,6 +831,7 @@ static void xs_reset_transport(struct sock_xprt *transport)
        sk->sk_user_data = NULL;
 
        xs_restore_old_callbacks(transport, sk);
+       xprt_clear_connected(xprt);
        write_unlock_bh(&sk->sk_callback_lock);
        xs_sock_reset_connection_flags(xprt);
 
@@ -1866,7 +1869,7 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt,
                sk->sk_data_ready = xs_local_data_ready;
                sk->sk_write_space = xs_udp_write_space;
                sk->sk_error_report = xs_error_report;
-               sk->sk_allocation = GFP_ATOMIC;
+               sk->sk_allocation = GFP_NOIO;
 
                xprt_clear_connected(xprt);
 
@@ -2051,7 +2054,7 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
                sk->sk_user_data = xprt;
                sk->sk_data_ready = xs_udp_data_ready;
                sk->sk_write_space = xs_udp_write_space;
-               sk->sk_allocation = GFP_ATOMIC;
+               sk->sk_allocation = GFP_NOIO;
 
                xprt_set_connected(xprt);
 
@@ -2153,7 +2156,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
                sk->sk_state_change = xs_tcp_state_change;
                sk->sk_write_space = xs_tcp_write_space;
                sk->sk_error_report = xs_error_report;
-               sk->sk_allocation = GFP_ATOMIC;
+               sk->sk_allocation = GFP_NOIO;
 
                /* socket options */
                sock_reset_flag(sk, SOCK_LINGER);
@@ -2279,13 +2282,14 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task)
 
        WARN_ON_ONCE(!xprt_lock_connect(xprt, task, transport));
 
-       /* Start by resetting any existing state */
-       xs_reset_transport(transport);
-
-       if (transport->sock != NULL && !RPC_IS_SOFTCONN(task)) {
+       if (transport->sock != NULL) {
                dprintk("RPC:       xs_connect delayed xprt %p for %lu "
                                "seconds\n",
                                xprt, xprt->reestablish_timeout / HZ);
+
+               /* Start by resetting any existing state */
+               xs_reset_transport(transport);
+
                queue_delayed_work(rpciod_workqueue,
                                   &transport->connect_worker,
                                   xprt->reestablish_timeout);
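On the socket side, sk_allocation moves from GFP_ATOMIC to GFP_NOIO, presumably so allocations made on behalf of the RPC socket cannot recurse into further I/O during reclaim, and xs_reset_transport() now shuts the socket down (SHUT_RDWR) and clears the connected bit under the callback lock. The xs_connect() hunk stops resetting unconditionally: only a reconnect over an existing socket tears down the old state and waits reestablish_timeout, while a first connect runs immediately. An illustrative user-space reduction of that decision:

    /* Illustrative user-space reduction of the new connect policy:
     * a first connect runs immediately; a reconnect over an existing
     * socket is reset first and then delayed by reestablish_timeout. */
    #include <stdbool.h>
    #include <stdio.h>

    struct xprt_state {
        bool have_socket;                  /* transport->sock != NULL */
        unsigned long reestablish_timeout; /* delay, in jiffies */
    };

    static unsigned long connect_delay(struct xprt_state *x)
    {
        if (!x->have_socket)
            return 0;                      /* fresh connect: no delay */

        x->have_socket = false;            /* stand-in for xs_reset_transport() */
        return x->reestablish_timeout;
    }

    int main(void)
    {
        struct xprt_state x = { true, 3000 };

        printf("%lu\n", connect_delay(&x)); /* 3000: delayed reconnect */
        printf("%lu\n", connect_delay(&x)); /* 0: immediate connect */
        return 0;
    }
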
index 16c1c43980a12dbe11832ad61777333446b69eea..fda38f830a10869713177220f8d2066c6076da0c 100644 (file)
@@ -853,12 +853,8 @@ int switchdev_port_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
                .cb = cb,
                .idx = idx,
        };
-       int err;
-
-       err = switchdev_port_obj_dump(dev, &dump.obj);
-       if (err)
-               return err;
 
+       switchdev_port_obj_dump(dev, &dump.obj);
        return dump.idx;
 }
 EXPORT_SYMBOL_GPL(switchdev_port_fdb_dump);
index 8b010c976b2f7c8eba5f6fe1cadb516b4e0f7269..41042de3ae9bcfad4504e0bcbb29d3bea4512bb5 100644 (file)
@@ -169,6 +169,30 @@ static void bclink_retransmit_pkt(struct tipc_net *tn, u32 after, u32 to)
        }
 }
 
+/**
+ * bclink_prepare_wakeup - prepare users for wakeup after congestion
+ * @bcl: broadcast link
+ * @resultq: queue for users which can be woken up
+ * Move a number of waiting users, as permitted by available space in
+ * the send queue, from link wait queue to specified queue for wakeup
+ */
+static void bclink_prepare_wakeup(struct tipc_link *bcl, struct sk_buff_head *resultq)
+{
+       int pnd[TIPC_SYSTEM_IMPORTANCE + 1] = {0,};
+       int imp, lim;
+       struct sk_buff *skb, *tmp;
+
+       skb_queue_walk_safe(&bcl->wakeupq, skb, tmp) {
+               imp = TIPC_SKB_CB(skb)->chain_imp;
+               lim = bcl->window + bcl->backlog[imp].limit;
+               pnd[imp] += TIPC_SKB_CB(skb)->chain_sz;
+               if ((pnd[imp] + bcl->backlog[imp].len) >= lim)
+                       continue;
+               skb_unlink(skb, &bcl->wakeupq);
+               skb_queue_tail(resultq, skb);
+       }
+}
+
 /**
  * tipc_bclink_wakeup_users - wake up pending users
  *
@@ -177,8 +201,12 @@ static void bclink_retransmit_pkt(struct tipc_net *tn, u32 after, u32 to)
 void tipc_bclink_wakeup_users(struct net *net)
 {
        struct tipc_net *tn = net_generic(net, tipc_net_id);
+       struct tipc_link *bcl = tn->bcl;
+       struct sk_buff_head resultq;
 
-       tipc_sk_rcv(net, &tn->bclink->link.wakeupq);
+       skb_queue_head_init(&resultq);
+       bclink_prepare_wakeup(bcl, &resultq);
+       tipc_sk_rcv(net, &resultq);
 }
 
 /**
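bclink_prepare_wakeup() admits waiting users to the wakeup queue only while the broadcast link has room: for each user it adds the chain size to a per-importance pending total and skips the user once pending plus the current backlog reaches window + that importance level's backlog limit. A stand-alone sketch of the admission rule, reduced to a single importance level:

    /* Stand-alone sketch of the admission rule, one importance level:
     * wake users only while pending + backlog stays below window + limit. */
    #include <stdio.h>

    int main(void)
    {
        int window = 50, limit = 100, backlog_len = 120;
        int chain_sz[4] = { 10, 15, 20, 25 };   /* TIPC_SKB_CB(skb)->chain_sz */
        int lim = window + limit, pnd = 0, i;

        for (i = 0; i < 4; i++) {
            pnd += chain_sz[i];
            if (pnd + backlog_len >= lim) {
                printf("user %d stays queued (pnd=%d)\n", i, pnd);
                continue;
            }
            printf("user %d woken (pnd=%d)\n", i, pnd);
        }
        /* users 0 and 1 are woken; 2 and 3 wait for the next round */
        return 0;
    }
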
index b144485946f2e5ce2ec6411cd52462fa278b185e..2510b231451ec8c7e0f0f33d523739dce3387ca3 100644 (file)
@@ -2625,7 +2625,7 @@ static void restore_regulatory_settings(bool reset_user)
         * settings, user regulatory settings takes precedence.
         */
        if (is_an_alpha2(alpha2))
-               regulatory_hint_user(user_alpha2, NL80211_USER_REG_HINT_USER);
+               regulatory_hint_user(alpha2, NL80211_USER_REG_HINT_USER);
 
        spin_lock(&reg_requests_lock);
        list_splice_tail_init(&tmp_reg_req_list, &reg_requests_list);
index 5ecfe93f2028712afa413dba6f67c72e1ce0930e..12efbbefd4d754064c4f8533351d126851449ab9 100644 (file)
@@ -10,3 +10,5 @@ recordmcount
 docproc
 sortextable
 asn1_compiler
+extract-cert
+sign-file
index d3437b82ac256cb7bca2527f0cfe43e07f66a1cb..1db6d73c8dd2bdfeec2cae452dffc8791b3ef645 100644 (file)
@@ -128,6 +128,10 @@ cc-option-align = $(subst -functions=0,,\
 cc-disable-warning = $(call try-run,\
        $(CC) $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS) -W$(strip $(1)) -c -x c /dev/null -o "$$TMP",-Wno-$(strip $(1)))
 
+# cc-name
+# Expands to either gcc or clang
+cc-name = $(shell $(CC) -v 2>&1 | grep -q "clang version" && echo clang || echo gcc)
+
 # cc-version
 cc-version = $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-version.sh $(CC))
 
@@ -303,3 +307,54 @@ why =                                                                        \
 
 echo-why = $(call escsq, $(strip $(why)))
 endif
+
+###############################################################################
+#
+# When a Kconfig string contains a filename, it is suitable for
+# passing to shell commands. It is surrounded by double-quotes, and
+# any double-quotes or backslashes within it are escaped by
+# backslashes.
+#
+# This is no use for dependencies or $(wildcard). We need to strip the
+# surrounding quotes and the escaping from quotes and backslashes, and
+# we *do* need to escape any spaces in the string. So, for example:
+#
+# Usage: $(eval $(call config_filename,FOO))
+#
+# Defines FOO_FILENAME based on the contents of the CONFIG_FOO option,
+# transformed as described above to be suitable for use within the
+# makefile.
+#
+# Also, if the filename is a relative filename and exists in the source
+# tree but not the build tree, define FOO_SRCPREFIX as $(srctree)/ to
+# be prefixed to *both* command invocation and dependencies.
+#
+# Note: We also print the filenames in the quiet_cmd_foo text, and
+# perhaps ought to have a version specially escaped for that purpose.
+# But it's only cosmetic, and $(patsubst "%",%,$(CONFIG_FOO)) is good
+# enough.  It'll strip the quotes in the common case where there's no
+# space and it's a simple filename, and it'll retain the quotes when
+# there's a space. There are some esoteric cases in which it'll print
+# the wrong thing, but we don't really care. The actual dependencies
+# and commands *do* get it right, with various combinations of single
+# and double quotes, backslashes and spaces in the filenames.
+#
+###############################################################################
+#
+space_escape := %%%SPACE%%%
+#
+define config_filename
+ifneq ($$(CONFIG_$(1)),"")
+$(1)_FILENAME := $$(subst \\,\,$$(subst \$$(quote),$$(quote),$$(subst $$(space_escape),\$$(space),$$(patsubst "%",%,$$(subst $$(space),$$(space_escape),$$(CONFIG_$(1)))))))
+ifneq ($$(patsubst /%,%,$$(firstword $$($(1)_FILENAME))),$$(firstword $$($(1)_FILENAME)))
+else
+ifeq ($$(wildcard $$($(1)_FILENAME)),)
+ifneq ($$(wildcard $$(srctree)/$$($(1)_FILENAME)),)
+$(1)_SRCPREFIX := $(srctree)/
+endif
+endif
+endif
+endif
+endef
+#
+###############################################################################
index 2016a64497ab1cca3074a73a76057ff02947b3a0..1b2661712d449a88668f470d2034b5091c5342c0 100644 (file)
@@ -16,9 +16,13 @@ hostprogs-$(CONFIG_VT)           += conmakehash
 hostprogs-$(BUILD_C_RECORDMCOUNT) += recordmcount
 hostprogs-$(CONFIG_BUILDTIME_EXTABLE_SORT) += sortextable
 hostprogs-$(CONFIG_ASN1)        += asn1_compiler
+hostprogs-$(CONFIG_MODULE_SIG)  += sign-file
+hostprogs-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += extract-cert
 
 HOSTCFLAGS_sortextable.o = -I$(srctree)/tools/include
 HOSTCFLAGS_asn1_compiler.o = -I$(srctree)/include
+HOSTLOADLIBES_sign-file = -lcrypto
+HOSTLOADLIBES_extract-cert = -lcrypto
 
 always         := $(hostprogs-y) $(hostprogs-m)
 
index f734033af219d267fd67428432feb9bacee0baae..4efedcbe4165f48ac1afbca50f3612bd312e9b70 100644 (file)
@@ -56,7 +56,7 @@ endif
 KBUILD_CFLAGS += $(warning)
 else
 
-ifeq ($(COMPILER),clang)
+ifeq ($(cc-name),clang)
 KBUILD_CFLAGS += $(call cc-disable-warning, initializer-overrides)
 KBUILD_CFLAGS += $(call cc-disable-warning, unused-value)
 KBUILD_CFLAGS += $(call cc-disable-warning, format)
index e48a4e9d88682b2cdc5834fc5e588eb0eba0e642..07650eeaaf06dd85bb5f3439959ed05c36d8a5ff 100644 (file)
@@ -22,7 +22,7 @@ quiet_cmd_modules_install = INSTALL $@
     mkdir -p $(2) ; \
     cp $@ $(2) ; \
     $(mod_strip_cmd) $(2)/$(notdir $@) ; \
-    $(mod_sign_cmd) $(2)/$(notdir $@) $(patsubst %,|| true,$(KBUILD_EXTMOD)) ; \
+    $(mod_sign_cmd) $(2)/$(notdir $@) $(patsubst %,|| true,$(KBUILD_EXTMOD)) && \
     $(mod_compress_cmd) $(2)/$(notdir $@)
 
 # Modules built outside the kernel source tree go into extra by default
index 7750e9c31483b99bdf467db98757ca16e7152847..e000f44e37b8775f45023464b765a460d92d1204 100644 (file)
@@ -13,6 +13,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdint.h>
+#include <stdbool.h>
 #include <string.h>
 #include <ctype.h>
 #include <unistd.h>
@@ -293,8 +294,8 @@ static const char *const directives[NR__DIRECTIVES] = {
 
 struct action {
        struct action   *next;
+       char            *name;
        unsigned char   index;
-       char            name[];
 };
 
 static struct action *action_list;
@@ -305,15 +306,17 @@ struct token {
        enum token_type token_type : 8;
        unsigned char   size;
        struct action   *action;
-       const char      *value;
+       char            *content;
        struct type     *type;
 };
 
 static struct token *token_list;
 static unsigned nr_tokens;
-static _Bool verbose;
+static bool verbose_opt;
+static bool debug_opt;
 
-#define debug(fmt, ...) do { if (verbose) printf(fmt, ## __VA_ARGS__); } while (0)
+#define verbose(fmt, ...) do { if (verbose_opt) printf(fmt, ## __VA_ARGS__); } while (0)
+#define debug(fmt, ...) do { if (debug_opt) printf(fmt, ## __VA_ARGS__); } while (0)
 
 static int directive_compare(const void *_key, const void *_pdir)
 {
@@ -325,11 +328,9 @@ static int directive_compare(const void *_key, const void *_pdir)
        dlen = strlen(dir);
        clen = (dlen < token->size) ? dlen : token->size;
 
-       //debug("cmp(%*.*s,%s) = ",
-       //       (int)token->size, (int)token->size, token->value,
-       //       dir);
+       //debug("cmp(%s,%s) = ", token->content, dir);
 
-       val = memcmp(token->value, dir, clen);
+       val = memcmp(token->content, dir, clen);
        if (val != 0) {
                //debug("%d [cmp]\n", val);
                return val;
@@ -349,7 +350,7 @@ static int directive_compare(const void *_key, const void *_pdir)
 static void tokenise(char *buffer, char *end)
 {
        struct token *tokens;
-       char *line, *nl, *p, *q;
+       char *line, *nl, *start, *p, *q;
        unsigned tix, lineno;
 
        /* Assume we're going to have half as many tokens as we have
@@ -408,11 +409,11 @@ static void tokenise(char *buffer, char *end)
                                break;
 
                        tokens[tix].line = lineno;
-                       tokens[tix].value = p;
+                       start = p;
 
                        /* Handle string tokens */
                        if (isalpha(*p)) {
-                               const char **dir;
+                               const char **dir, *start = p;
 
                                /* Can be a directive, type name or element
                                 * name.  Find the end of the name.
@@ -423,10 +424,18 @@ static void tokenise(char *buffer, char *end)
                                tokens[tix].size = q - p;
                                p = q;
 
+                               tokens[tix].content = malloc(tokens[tix].size + 1);
+                               if (!tokens[tix].content) {
+                                       perror(NULL);
+                                       exit(1);
+                               }
+                               memcpy(tokens[tix].content, start, tokens[tix].size);
+                               tokens[tix].content[tokens[tix].size] = 0;
+                               
                                /* If it begins with a lowercase letter then
                                 * it's an element name
                                 */
-                               if (islower(tokens[tix].value[0])) {
+                               if (islower(tokens[tix].content[0])) {
                                        tokens[tix++].token_type = TOKEN_ELEMENT_NAME;
                                        continue;
                                }
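Tokens now own a NUL-terminated copy of their text (malloc'ed size + 1 bytes, copied, then terminated), which is what lets all the later diagnostics in this patch use a plain %s instead of the "%*.*s" width/precision form needed for counted, unterminated slices. A small stand-alone illustration of the two styles:

    /* Stand-alone illustration: the old counted-slice printing versus
     * the new NUL-terminated token content. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    int main(void)
    {
        const char *start = "SubjectPublicKeyInfo ::= SEQUENCE {";
        size_t size = strlen("SubjectPublicKeyInfo");
        char *content;

        /* old style: width/precision limits an unterminated slice */
        printf("'%*.*s'\n", (int)size, (int)size, start);

        /* new style: own the bytes, terminate, print with plain %s */
        content = malloc(size + 1);
        if (!content) {
            perror(NULL);
            exit(1);
        }
        memcpy(content, start, size);
        content[size] = 0;
        printf("'%s'\n", content);
        free(content);
        return 0;
    }
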
@@ -455,6 +464,13 @@ static void tokenise(char *buffer, char *end)
                                        q++;
                                tokens[tix].size = q - p;
                                p = q;
+                               tokens[tix].content = malloc(tokens[tix].size + 1);
+                               if (!tokens[tix].content) {
+                                       perror(NULL);
+                                       exit(1);
+                               }
+                               memcpy(tokens[tix].content, start, tokens[tix].size);
+                               tokens[tix].content[tokens[tix].size] = 0;
                                tokens[tix++].token_type = TOKEN_NUMBER;
                                continue;
                        }
@@ -463,6 +479,7 @@ static void tokenise(char *buffer, char *end)
                                if (memcmp(p, "::=", 3) == 0) {
                                        p += 3;
                                        tokens[tix].size = 3;
+                                       tokens[tix].content = "::=";
                                        tokens[tix++].token_type = TOKEN_ASSIGNMENT;
                                        continue;
                                }
@@ -472,12 +489,14 @@ static void tokenise(char *buffer, char *end)
                                if (memcmp(p, "({", 2) == 0) {
                                        p += 2;
                                        tokens[tix].size = 2;
+                                       tokens[tix].content = "({";
                                        tokens[tix++].token_type = TOKEN_OPEN_ACTION;
                                        continue;
                                }
                                if (memcmp(p, "})", 2) == 0) {
                                        p += 2;
                                        tokens[tix].size = 2;
+                                       tokens[tix].content = "})";
                                        tokens[tix++].token_type = TOKEN_CLOSE_ACTION;
                                        continue;
                                }
@@ -488,22 +507,27 @@ static void tokenise(char *buffer, char *end)
                                switch (*p) {
                                case '{':
                                        p += 1;
+                                       tokens[tix].content = "{";
                                        tokens[tix++].token_type = TOKEN_OPEN_CURLY;
                                        continue;
                                case '}':
                                        p += 1;
+                                       tokens[tix].content = "}";
                                        tokens[tix++].token_type = TOKEN_CLOSE_CURLY;
                                        continue;
                                case '[':
                                        p += 1;
+                                       tokens[tix].content = "[";
                                        tokens[tix++].token_type = TOKEN_OPEN_SQUARE;
                                        continue;
                                case ']':
                                        p += 1;
+                                       tokens[tix].content = "]";
                                        tokens[tix++].token_type = TOKEN_CLOSE_SQUARE;
                                        continue;
                                case ',':
                                        p += 1;
+                                       tokens[tix].content = ",";
                                        tokens[tix++].token_type = TOKEN_COMMA;
                                        continue;
                                default:
@@ -518,22 +542,20 @@ static void tokenise(char *buffer, char *end)
        }
 
        nr_tokens = tix;
-       debug("Extracted %u tokens\n", nr_tokens);
+       verbose("Extracted %u tokens\n", nr_tokens);
 
 #if 0
        {
                int n;
                for (n = 0; n < nr_tokens; n++)
-                       debug("Token %3u: '%*.*s'\n",
-                              n,
-                              (int)token_list[n].size, (int)token_list[n].size,
-                              token_list[n].value);
+                       debug("Token %3u: '%s'\n", n, token_list[n].content);
        }
 #endif
 }
 
 static void build_type_list(void);
 static void parse(void);
+static void dump_elements(void);
 static void render(FILE *out, FILE *hdr);
 
 /*
@@ -548,16 +570,27 @@ int main(int argc, char **argv)
        char *kbuild_verbose;
        int fd;
 
+       kbuild_verbose = getenv("KBUILD_VERBOSE");
+       if (kbuild_verbose)
+               verbose_opt = atoi(kbuild_verbose);
+
+       while (argc > 4) {
+               if (strcmp(argv[1], "-v") == 0)
+                       verbose_opt = true;
+               else if (strcmp(argv[1], "-d") == 0)
+                       debug_opt = true;
+               else
+                       break;
+               memmove(&argv[1], &argv[2], (argc - 2) * sizeof(char *));
+               argc--;
+       }
+
        if (argc != 4) {
-               fprintf(stderr, "Format: %s <grammar-file> <c-file> <hdr-file>\n",
+               fprintf(stderr, "Format: %s [-v] [-d] <grammar-file> <c-file> <hdr-file>\n",
                        argv[0]);
                exit(2);
        }
 
-       kbuild_verbose = getenv("KBUILD_VERBOSE");
-       if (kbuild_verbose)
-               verbose = atoi(kbuild_verbose);
-
        filename = argv[1];
        outputname = argv[2];
        headername = argv[3];
@@ -608,6 +641,7 @@ int main(int argc, char **argv)
        tokenise(buffer, buffer + readlen);
        build_type_list();
        parse();
+       dump_elements();
 
        out = fopen(outputname, "w");
        if (!out) {
@@ -666,7 +700,7 @@ struct element {
        unsigned        flags;
 #define ELEMENT_IMPLICIT       0x0001
 #define ELEMENT_EXPLICIT       0x0002
-#define ELEMENT_MARKED         0x0004
+#define ELEMENT_TAG_SPECIFIED  0x0004
 #define ELEMENT_RENDERED       0x0008
 #define ELEMENT_SKIPPABLE      0x0010
 #define ELEMENT_CONDITIONAL    0x0020
@@ -693,7 +727,7 @@ static int type_index_compare(const void *_a, const void *_b)
        if ((*a)->name->size != (*b)->name->size)
                return (*a)->name->size - (*b)->name->size;
        else
-               return memcmp((*a)->name->value, (*b)->name->value,
+               return memcmp((*a)->name->content, (*b)->name->content,
                              (*a)->name->size);
 }
 
@@ -706,7 +740,7 @@ static int type_finder(const void *_key, const void *_ti)
        if (token->size != type->name->size)
                return token->size - type->name->size;
        else
-               return memcmp(token->value, type->name->value,
+               return memcmp(token->content, type->name->content,
                              token->size);
 }
 
@@ -756,14 +790,11 @@ static void build_type_list(void)
 
        qsort(type_index, nr, sizeof(type_index[0]), type_index_compare);
 
-       debug("Extracted %u types\n", nr_types);
+       verbose("Extracted %u types\n", nr_types);
 #if 0
        for (n = 0; n < nr_types; n++) {
                struct type *type = type_index[n];
-               debug("- %*.*s\n",
-                      (int)type->name->size,
-                      (int)type->name->size,
-                      type->name->value);
+               debug("- %*.*s\n", type->name->content);
        }
 #endif
 }
@@ -793,15 +824,14 @@ static void parse(void)
                type->element->type_def = type;
 
                if (cursor != type[1].name) {
-                       fprintf(stderr, "%s:%d: Parse error at token '%*.*s'\n",
-                               filename, cursor->line,
-                               (int)cursor->size, (int)cursor->size, cursor->value);
+                       fprintf(stderr, "%s:%d: Parse error at token '%s'\n",
+                               filename, cursor->line, cursor->content);
                        exit(1);
                }
 
        } while (type++, !(type->flags & TYPE_STOP_MARKER));
 
-       debug("Extracted %u actions\n", nr_actions);
+       verbose("Extracted %u actions\n", nr_actions);
 }
 
 static struct element *element_list;
@@ -862,33 +892,31 @@ static struct element *parse_type(struct token **_cursor, struct token *end,
                        cursor++;
                        break;
                default:
-                       fprintf(stderr, "%s:%d: Unrecognised tag class token '%*.*s'\n",
-                               filename, cursor->line,
-                               (int)cursor->size, (int)cursor->size, cursor->value);
+                       fprintf(stderr, "%s:%d: Unrecognised tag class token '%s'\n",
+                               filename, cursor->line, cursor->content);
                        exit(1);
                }
 
                if (cursor >= end)
                        goto overrun_error;
                if (cursor->token_type != TOKEN_NUMBER) {
-                       fprintf(stderr, "%s:%d: Missing tag number '%*.*s'\n",
-                               filename, cursor->line,
-                               (int)cursor->size, (int)cursor->size, cursor->value);
+                       fprintf(stderr, "%s:%d: Missing tag number '%s'\n",
+                               filename, cursor->line, cursor->content);
                        exit(1);
                }
 
                element->tag &= ~0x1f;
-               element->tag |= strtoul(cursor->value, &p, 10);
-               if (p - cursor->value != cursor->size)
+               element->tag |= strtoul(cursor->content, &p, 10);
+               element->flags |= ELEMENT_TAG_SPECIFIED;
+               if (p - cursor->content != cursor->size)
                        abort();
                cursor++;
 
                if (cursor >= end)
                        goto overrun_error;
                if (cursor->token_type != TOKEN_CLOSE_SQUARE) {
-                       fprintf(stderr, "%s:%d: Missing closing square bracket '%*.*s'\n",
-                               filename, cursor->line,
-                               (int)cursor->size, (int)cursor->size, cursor->value);
+                       fprintf(stderr, "%s:%d: Missing closing square bracket '%s'\n",
+                               filename, cursor->line, cursor->content);
                        exit(1);
                }
                cursor++;
@@ -988,9 +1016,8 @@ static struct element *parse_type(struct token **_cursor, struct token *end,
                ref = bsearch(cursor, type_index, nr_types, sizeof(type_index[0]),
                              type_finder);
                if (!ref) {
-                       fprintf(stderr, "%s:%d: Type '%*.*s' undefined\n",
-                               filename, cursor->line,
-                               (int)cursor->size, (int)cursor->size, cursor->value);
+                       fprintf(stderr, "%s:%d: Type '%s' undefined\n",
+                               filename, cursor->line, cursor->content);
                        exit(1);
                }
                cursor->type = *ref;
@@ -1039,9 +1066,8 @@ static struct element *parse_type(struct token **_cursor, struct token *end,
                break;
 
        default:
-               fprintf(stderr, "%s:%d: Token '%*.*s' does not introduce a type\n",
-                       filename, cursor->line,
-                       (int)cursor->size, (int)cursor->size, cursor->value);
+               fprintf(stderr, "%s:%d: Token '%s' does not introduce a type\n",
+                       filename, cursor->line, cursor->content);
                exit(1);
        }
 
@@ -1058,20 +1084,18 @@ static struct element *parse_type(struct token **_cursor, struct token *end,
                if (cursor >= end)
                        goto overrun_error;
                if (cursor->token_type != TOKEN_ELEMENT_NAME) {
-                       fprintf(stderr, "%s:%d: Token '%*.*s' is not an action function name\n",
-                               filename, cursor->line,
-                               (int)cursor->size, (int)cursor->size, cursor->value);
+                       fprintf(stderr, "%s:%d: Token '%s' is not an action function name\n",
+                               filename, cursor->line, cursor->content);
                        exit(1);
                }
 
-               action = malloc(sizeof(struct action) + cursor->size + 1);
+               action = malloc(sizeof(struct action));
                if (!action) {
                        perror(NULL);
                        exit(1);
                }
                action->index = 0;
-               memcpy(action->name, cursor->value, cursor->size);
-               action->name[cursor->size] = 0;
+               action->name = cursor->content;
 
                for (ppaction = &action_list;
                     *ppaction;
@@ -1101,9 +1125,8 @@ static struct element *parse_type(struct token **_cursor, struct token *end,
                if (cursor >= end)
                        goto overrun_error;
                if (cursor->token_type != TOKEN_CLOSE_ACTION) {
-                       fprintf(stderr, "%s:%d: Missing close action, got '%*.*s'\n",
-                               filename, cursor->line,
-                               (int)cursor->size, (int)cursor->size, cursor->value);
+                       fprintf(stderr, "%s:%d: Missing close action, got '%s'\n",
+                               filename, cursor->line, cursor->content);
                        exit(1);
                }
                cursor++;
@@ -1113,9 +1136,8 @@ static struct element *parse_type(struct token **_cursor, struct token *end,
        return top;
 
 parse_error:
-       fprintf(stderr, "%s:%d: Unexpected token '%*.*s'\n",
-               filename, cursor->line,
-               (int)cursor->size, (int)cursor->size, cursor->value);
+       fprintf(stderr, "%s:%d: Unexpected token '%s'\n",
+               filename, cursor->line, cursor->content);
        exit(1);
 
 overrun_error:
@@ -1133,9 +1155,8 @@ static struct element *parse_compound(struct token **_cursor, struct token *end,
        struct token *cursor = *_cursor, *name;
 
        if (cursor->token_type != TOKEN_OPEN_CURLY) {
-               fprintf(stderr, "%s:%d: Expected compound to start with brace not '%*.*s'\n",
-                       filename, cursor->line,
-                       (int)cursor->size, (int)cursor->size, cursor->value);
+               fprintf(stderr, "%s:%d: Expected compound to start with brace not '%s'\n",
+                       filename, cursor->line, cursor->content);
                exit(1);
        }
        cursor++;
@@ -1176,9 +1197,8 @@ static struct element *parse_compound(struct token **_cursor, struct token *end,
        children->flags &= ~ELEMENT_CONDITIONAL;
 
        if (cursor->token_type != TOKEN_CLOSE_CURLY) {
-               fprintf(stderr, "%s:%d: Expected compound closure, got '%*.*s'\n",
-                       filename, cursor->line,
-                       (int)cursor->size, (int)cursor->size, cursor->value);
+               fprintf(stderr, "%s:%d: Expected compound closure, got '%s'\n",
+                       filename, cursor->line, cursor->content);
                exit(1);
        }
        cursor++;
@@ -1191,6 +1211,52 @@ overrun_error:
        exit(1);
 }
 
+static void dump_element(const struct element *e, int level)
+{
+       const struct element *c;
+       const struct type *t = e->type_def;
+       const char *name = e->name ? e->name->content : ".";
+       const char *tname = t && t->name ? t->name->content : ".";
+       char tag[32];
+
+       if (e->class == 0 && e->method == 0 && e->tag == 0)
+               strcpy(tag, "<...>");
+       else if (e->class == ASN1_UNIV)
+               sprintf(tag, "%s %s %s",
+                       asn1_classes[e->class],
+                       asn1_methods[e->method],
+                       asn1_universal_tags[e->tag]);
+       else
+               sprintf(tag, "%s %s %u",
+                       asn1_classes[e->class],
+                       asn1_methods[e->method],
+                       e->tag);
+
+       printf("%c%c%c%c%c %c %*s[*] \e[33m%s\e[m %s %s \e[35m%s\e[m\n",
+              e->flags & ELEMENT_IMPLICIT ? 'I' : '-',
+              e->flags & ELEMENT_EXPLICIT ? 'E' : '-',
+              e->flags & ELEMENT_TAG_SPECIFIED ? 'T' : '-',
+              e->flags & ELEMENT_SKIPPABLE ? 'S' : '-',
+              e->flags & ELEMENT_CONDITIONAL ? 'C' : '-',
+              "-tTqQcaro"[e->compound],
+              level, "",
+              tag,
+              tname,
+              name,
+              e->action ? e->action->name : "");
+       if (e->compound == TYPE_REF)
+               dump_element(e->type->type->element, level + 3);
+       else
+               for (c = e->children; c; c = c->next)
+                       dump_element(c, level + 3);
+}
+
+static void dump_elements(void)
+{
+       if (debug_opt)
+               dump_element(type_list[0].element, 0);
+}
+
 static void render_element(FILE *out, struct element *e, struct element *tag);
 static void render_out_of_line_list(FILE *out);
 
@@ -1292,7 +1358,7 @@ static void render(FILE *out, FILE *hdr)
        }
 
        /* We do two passes - the first one calculates all the offsets */
-       debug("Pass 1\n");
+       verbose("Pass 1\n");
        nr_entries = 0;
        root = &type_list[0];
        render_element(NULL, root->element, NULL);
@@ -1303,7 +1369,7 @@ static void render(FILE *out, FILE *hdr)
                e->flags &= ~ELEMENT_RENDERED;
 
        /* And then we actually render */
-       debug("Pass 2\n");
+       verbose("Pass 2\n");
        fprintf(out, "\n");
        fprintf(out, "static const unsigned char %s_machine[] = {\n",
                grammar_name);
@@ -1376,7 +1442,7 @@ static void render_out_of_line_list(FILE *out)
  */
 static void render_element(FILE *out, struct element *e, struct element *tag)
 {
-       struct element *ec;
+       struct element *ec, *x;
        const char *cond, *act;
        int entry, skippable = 0, outofline = 0;
 
@@ -1389,9 +1455,7 @@ static void render_element(FILE *out, struct element *e, struct element *tag)
                outofline = 1;
 
        if (e->type_def && out) {
-               render_more(out, "\t// %*.*s\n",
-                           (int)e->type_def->name->size, (int)e->type_def->name->size,
-                           e->type_def->name->value);
+               render_more(out, "\t// %s\n", e->type_def->name->content);
        }
 
        /* Render the operation */
@@ -1400,11 +1464,10 @@ static void render_element(FILE *out, struct element *e, struct element *tag)
        act = e->action ? "_ACT" : "";
        switch (e->compound) {
        case ANY:
-               render_opcode(out, "ASN1_OP_%sMATCH_ANY%s,", cond, act);
+               render_opcode(out, "ASN1_OP_%sMATCH_ANY%s%s,",
+                             cond, act, skippable ? "_OR_SKIP" : "");
                if (e->name)
-                       render_more(out, "\t\t// %*.*s",
-                                   (int)e->name->size, (int)e->name->size,
-                                   e->name->value);
+                       render_more(out, "\t\t// %s", e->name->content);
                render_more(out, "\n");
                goto dont_render_tag;
 
@@ -1435,15 +1498,15 @@ static void render_element(FILE *out, struct element *e, struct element *tag)
                break;
        }
 
-       if (e->name)
-               render_more(out, "\t\t// %*.*s",
-                           (int)e->name->size, (int)e->name->size,
-                           e->name->value);
+       x = tag ?: e;
+       if (x->name)
+               render_more(out, "\t\t// %s", x->name->content);
        render_more(out, "\n");
 
        /* Render the tag */
-       if (!tag)
+       if (!tag || !(tag->flags & ELEMENT_TAG_SPECIFIED))
                tag = e;
+
        if (tag->class == ASN1_UNIV &&
            tag->tag != 14 &&
            tag->tag != 15 &&
@@ -1465,7 +1528,8 @@ dont_render_tag:
        case TYPE_REF:
                render_element(out, e->type->type->element, tag);
                if (e->action)
-                       render_opcode(out, "ASN1_OP_ACT,\n");
+                       render_opcode(out, "ASN1_OP_%sACT,\n",
+                                     skippable ? "MAYBE_" : "");
                break;
 
        case SEQUENCE:
@@ -1474,10 +1538,8 @@ dont_render_tag:
                         * skipability */
                        render_opcode(out, "_jump_target(%u),", e->entry_index);
                        if (e->type_def && e->type_def->name)
-                               render_more(out, "\t\t// --> %*.*s",
-                                           (int)e->type_def->name->size,
-                                           (int)e->type_def->name->size,
-                                           e->type_def->name->value);
+                               render_more(out, "\t\t// --> %s",
+                                           e->type_def->name->content);
                        render_more(out, "\n");
                        if (!(e->flags & ELEMENT_RENDERED)) {
                                e->flags |= ELEMENT_RENDERED;
@@ -1502,10 +1564,8 @@ dont_render_tag:
                         * skipability */
                        render_opcode(out, "_jump_target(%u),", e->entry_index);
                        if (e->type_def && e->type_def->name)
-                               render_more(out, "\t\t// --> %*.*s",
-                                           (int)e->type_def->name->size,
-                                           (int)e->type_def->name->size,
-                                           e->type_def->name->value);
+                               render_more(out, "\t\t// --> %s",
+                                           e->type_def->name->content);
                        render_more(out, "\n");
                        if (!(e->flags & ELEMENT_RENDERED)) {
                                e->flags |= ELEMENT_RENDERED;
@@ -1539,7 +1599,7 @@ dont_render_tag:
 
        case CHOICE:
                for (ec = e->children; ec; ec = ec->next)
-                       render_element(out, ec, NULL);
+                       render_element(out, ec, ec);
                if (!skippable)
                        render_opcode(out, "ASN1_OP_COND_FAIL,\n");
                if (e->action)
index b30406860b7397881e4e6ffeedcda1f5b2959cc0..c68fd61fdc42e11f7d4963c9e10a8f7462b3f63f 100644 (file)
@@ -191,23 +191,6 @@ static void define_config(const char *name, int len, unsigned int hash)
        hashtab[hash % HASHSZ] = aux;
 }
 
-/*
- * Clear the set of configuration strings.
- */
-static void clear_config(void)
-{
-       struct item *aux, *next;
-       unsigned int i;
-
-       for (i = 0; i < HASHSZ; i++) {
-               for (aux = hashtab[i]; aux; aux = next) {
-                       next = aux->next;
-                       free(aux);
-               }
-               hashtab[i] = NULL;
-       }
-}
-
 /*
  * Record the use of a CONFIG_* word.
  */
@@ -251,7 +234,8 @@ static void parse_config_file(const char *map, size_t len)
                        continue;
                if (memcmp(p, "CONFIG_", 7))
                        continue;
-               for (q = p + 7; q < map + len; q++) {
+               p += 7;
+               for (q = p; q < map + len; q++) {
                        if (!(isalnum(*q) || *q == '_'))
                                goto found;
                }
@@ -260,9 +244,9 @@ static void parse_config_file(const char *map, size_t len)
        found:
                if (!memcmp(q - 7, "_MODULE", 7))
                        q -= 7;
-               if( (q-p-7) < 0 )
+               if (q - p < 0)
                        continue;
-               use_config(p+7, q-p-7);
+               use_config(p, q - p);
        }
 }
 
@@ -324,8 +308,6 @@ static void parse_dep_file(void *map, size_t len)
        int saw_any_target = 0;
        int is_first_dep = 0;
 
-       clear_config();
-
        while (m < end) {
                /* Skip any "white space" */
                while (m < end && (*m == ' ' || *m == '\\' || *m == '\n'))
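The fixdep changes above drop the per-file clear_config() pass and simplify the CONFIG_ token scan: p is advanced past the "CONFIG_" prefix once, so the token length becomes simply q - p rather than q - p - 7. A minimal standalone sketch of the simplified scan (not the real fixdep code; use_config() here is a stand-in that just prints, and the preceding-character check of the real parser is omitted):

#include <ctype.h>
#include <stdio.h>
#include <string.h>

static void use_config(const char *name, int len)
{
        printf("uses CONFIG_%.*s\n", len, name);
}

static void scan_configs(const char *map, size_t len)
{
        const char *p, *q;

        for (p = map; p + 7 < map + len; p++) {
                if (memcmp(p, "CONFIG_", 7))
                        continue;
                p += 7;                         /* skip the prefix once */
                for (q = p; q < map + len; q++)
                        if (!(isalnum((unsigned char)*q) || *q == '_'))
                                break;
                if (q - p >= 7 && !memcmp(q - 7, "_MODULE", 7))
                        q -= 7;                 /* strip a trailing _MODULE */
                if (q > p)
                        use_config(p, q - p);   /* length is just q - p now */
                p = q;
        }
}

int main(void)
{
        const char buf[] = "#ifdef CONFIG_SMP\n#ifdef CONFIG_MODVERSIONS_MODULE\n";

        scan_configs(buf, sizeof(buf) - 1);
        return 0;
}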
index a51ca0e5beef4bab5d06de05f96a639672f69596..f2a1131b2f8baf06f28e286d6b9203d4d1a873f6 100755 (executable)
@@ -264,6 +264,7 @@ our $Sparse = qr{
                        __kernel|
                        __force|
                        __iomem|
+                       __pmem|
                        __must_check|
                        __init_refok|
                        __kprobes|
@@ -584,7 +585,7 @@ our $LvalOrFunc     = qr{((?:[\&\*]\s*)?$Lval)\s*($balanced_parens{0,1})\s*};
 our $FuncArg = qr{$Typecast{0,1}($LvalOrFunc|$Constant|$String)};
 
 our $declaration_macros = qr{(?x:
-       (?:$Storage\s+)?(?:[A-Z_][A-Z0-9]*_){0,2}(?:DEFINE|DECLARE)(?:_[A-Z0-9]+){1,2}\s*\(|
+       (?:$Storage\s+)?(?:[A-Z_][A-Z0-9]*_){0,2}(?:DEFINE|DECLARE)(?:_[A-Z0-9]+){1,6}\s*\(|
        (?:$Storage\s+)?LIST_HEAD\s*\(|
        (?:$Storage\s+)?${Type}\s+uninitialized_var\s*\(
 )};
@@ -1953,9 +1954,9 @@ sub process {
        our $clean = 1;
        my $signoff = 0;
        my $is_patch = 0;
-
        my $in_header_lines = $file ? 0 : 1;
        my $in_commit_log = 0;          #Scanning lines before patch
+       my $commit_log_possible_stack_dump = 0;
        my $commit_log_long_line = 0;
        my $commit_log_has_diff = 0;
        my $reported_maintainer_file = 0;
@@ -2166,11 +2167,15 @@ sub process {
                if ($showfile) {
                        $prefix = "$realfile:$realline: "
                } elsif ($emacs) {
-                       $prefix = "$filename:$linenr: ";
+                       if ($file) {
+                               $prefix = "$filename:$realline: ";
+                       } else {
+                               $prefix = "$filename:$linenr: ";
+                       }
                }
 
                if ($found_file) {
-                       if ($realfile =~ m@^(drivers/net/|net/)@) {
+                       if ($realfile =~ m@^(?:drivers/net/|net/|drivers/staging/)@) {
                                $check = 1;
                        } else {
                                $check = $check_orig;
@@ -2310,16 +2315,42 @@ sub process {
 
 # Check for line lengths > 75 in commit log, warn once
                if ($in_commit_log && !$commit_log_long_line &&
-                   length($line) > 75) {
+                   length($line) > 75 &&
+                   !($line =~ /^\s*[a-zA-Z0-9_\/\.]+\s+\|\s+\d+/ ||
+                                       # file delta changes
+                     $line =~ /^\s*(?:[\w\.\-]+\/)++[\w\.\-]+:/ ||
+                                       # filename then :
+                     $line =~ /^\s*(?:Fixes:|Link:)/i ||
+                                       # A Fixes: or Link: line
+                     $commit_log_possible_stack_dump)) {
                        WARN("COMMIT_LOG_LONG_LINE",
                             "Possible unwrapped commit description (prefer a maximum 75 chars per line)\n" . $herecurr);
                        $commit_log_long_line = 1;
                }
 
+# Check if the commit log is in a possible stack dump
+               if ($in_commit_log && !$commit_log_possible_stack_dump &&
+                   ($line =~ /^\s*(?:WARNING:|BUG:)/ ||
+                    $line =~ /^\s*\[\s*\d+\.\d{6,6}\s*\]/ ||
+                               # timestamp
+                    $line =~ /^\s*\[\<[0-9a-fA-F]{8,}\>\]/)) {
+                               # stack dump address
+                       $commit_log_possible_stack_dump = 1;
+               }
+
+# Reset possible stack dump if a blank line is found
+               if ($in_commit_log && $commit_log_possible_stack_dump &&
+                   $line =~ /^\s*$/) {
+                       $commit_log_possible_stack_dump = 0;
+               }
+
 # Check for git id commit length and improperly formed commit descriptions
-               if ($in_commit_log && $line =~ /\b(c)ommit\s+([0-9a-f]{5,})/i) {
-                       my $init_char = $1;
-                       my $orig_commit = lc($2);
+               if ($in_commit_log &&
+                   ($line =~ /\bcommit\s+[0-9a-f]{5,}\b/i ||
+                    ($line =~ /\b[0-9a-f]{12,40}\b/i &&
+                     $line !~ /\bfixes:\s*[0-9a-f]{12,40}/i))) {
+                       my $init_char = "c";
+                       my $orig_commit = "";
                        my $short = 1;
                        my $long = 0;
                        my $case = 1;
@@ -2330,6 +2361,13 @@ sub process {
                        my $orig_desc = "commit description";
                        my $description = "";
 
+                       if ($line =~ /\b(c)ommit\s+([0-9a-f]{5,})\b/i) {
+                               $init_char = $1;
+                               $orig_commit = lc($2);
+                       } elsif ($line =~ /\b([0-9a-f]{12,40})\b/i) {
+                               $orig_commit = lc($1);
+                       }
+
                        $short = 0 if ($line =~ /\bcommit\s+[0-9a-f]{12,40}/i);
                        $long = 1 if ($line =~ /\bcommit\s+[0-9a-f]{41,}/i);
                        $space = 0 if ($line =~ /\bcommit [0-9a-f]/i);
@@ -2738,6 +2776,8 @@ sub process {
                        }
                }
 
+# Block comment styles
+# Networking with an initial /*
                if ($realfile =~ m@^(drivers/net/|net/)@ &&
                    $prevrawline =~ /^\+[ \t]*\/\*[ \t]*$/ &&
                    $rawline =~ /^\+[ \t]*\*/ &&
@@ -2746,22 +2786,23 @@ sub process {
                             "networking block comments don't use an empty /* line, use /* Comment...\n" . $hereprev);
                }
 
-               if ($realfile =~ m@^(drivers/net/|net/)@ &&
-                   $prevrawline =~ /^\+[ \t]*\/\*/ &&          #starting /*
+# Block comments use * on subsequent lines
+               if ($prevline =~ /$;[ \t]*$/ &&                 #ends in comment
+                   $prevrawline =~ /^\+.*?\/\*/ &&             #starting /*
                    $prevrawline !~ /\*\/[ \t]*$/ &&            #no trailing */
                    $rawline =~ /^\+/ &&                        #line is new
                    $rawline !~ /^\+[ \t]*\*/) {                #no leading *
-                       WARN("NETWORKING_BLOCK_COMMENT_STYLE",
-                            "networking block comments start with * on subsequent lines\n" . $hereprev);
+                       WARN("BLOCK_COMMENT_STYLE",
+                            "Block comments use * on subsequent lines\n" . $hereprev);
                }
 
-               if ($realfile =~ m@^(drivers/net/|net/)@ &&
-                   $rawline !~ m@^\+[ \t]*\*/[ \t]*$@ &&       #trailing */
+# Block comments use */ on trailing lines
+               if ($rawline !~ m@^\+[ \t]*\*/[ \t]*$@ &&       #trailing */
                    $rawline !~ m@^\+.*/\*.*\*/[ \t]*$@ &&      #inline /*...*/
                    $rawline !~ m@^\+.*\*{2,}/[ \t]*$@ &&       #trailing **/
                    $rawline =~ m@^\+[ \t]*.+\*\/[ \t]*$@) {    #non blank */
-                       WARN("NETWORKING_BLOCK_COMMENT_STYLE",
-                            "networking block comments put the trailing */ on a separate line\n" . $herecurr);
+                       WARN("BLOCK_COMMENT_STYLE",
+                            "Block comments use a trailing */ on a separate line\n" . $herecurr);
                }
 
 # check for missing blank lines after struct/union declarations
@@ -3067,15 +3108,22 @@ sub process {
 
                        substr($s, 0, length($c), '');
 
-                       # Make sure we remove the line prefixes as we have
-                       # none on the first line, and are going to readd them
-                       # where necessary.
-                       $s =~ s/\n./\n/gs;
+                       # remove inline comments
+                       $s =~ s/$;/ /g;
+                       $c =~ s/$;/ /g;
 
                        # Find out how long the conditional actually is.
                        my @newlines = ($c =~ /\n/gs);
                        my $cond_lines = 1 + $#newlines;
 
+                       # Make sure we remove the line prefixes as we have
+                       # none on the first line, and are going to readd them
+                       # where necessary.
+                       $s =~ s/\n./\n/gs;
+                       while ($s =~ /\n\s+\\\n/) {
+                               $cond_lines += $s =~ s/\n\s+\\\n/\n/g;
+                       }
+
                        # We want to check the first line inside the block
                        # starting at the end of the conditional, so remove:
                        #  1) any blank line termination
@@ -3141,8 +3189,10 @@ sub process {
 
                        #print "line<$line> prevline<$prevline> indent<$indent> sindent<$sindent> check<$check> continuation<$continuation> s<$s> cond_lines<$cond_lines> stat_real<$stat_real> stat<$stat>\n";
 
-                       if ($check && (($sindent % 8) != 0 ||
-                           ($sindent <= $indent && $s ne ''))) {
+                       if ($check && $s ne '' &&
+                           (($sindent % 8) != 0 ||
+                            ($sindent < $indent) ||
+                            ($sindent > $indent + 8))) {
                                WARN("SUSPECT_CODE_INDENT",
                                     "suspect code indent for conditional statements ($indent, $sindent)\n" . $herecurr . "$stat_real\n");
                        }
@@ -3439,13 +3489,15 @@ sub process {
                        }
                }
 
-# # no BUG() or BUG_ON()
-#              if ($line =~ /\b(BUG|BUG_ON)\b/) {
-#                      print "Try to use WARN_ON & Recovery code rather than BUG() or BUG_ON()\n";
-#                      print "$herecurr";
-#                      $clean = 0;
-#              }
+# avoid BUG() or BUG_ON()
+               if ($line =~ /\b(?:BUG|BUG_ON)\b/) {
+                       my $msg_type = \&WARN;
+                       $msg_type = \&CHK if ($file);
+                       &{$msg_type}("AVOID_BUG",
+                                    "Avoid crashing the kernel - try using WARN_ON & recovery code rather than BUG() or BUG_ON()\n" . $herecurr);
+               }
 
+# avoid LINUX_VERSION_CODE
                if ($line =~ /\bLINUX_VERSION_CODE\b/) {
                        WARN("LINUX_VERSION_CODE",
                             "LINUX_VERSION_CODE should be avoided, code should be for the version to which it is merged\n" . $herecurr);
@@ -3520,7 +3572,7 @@ sub process {
 # function brace can't be on same line, except for #defines of do while,
 # or if closed on same line
                if (($line=~/$Type\s*$Ident\(.*\).*\s*{/) and
-                   !($line=~/\#\s*define.*do\s{/) and !($line=~/}/)) {
+                   !($line=~/\#\s*define.*do\s\{/) and !($line=~/}/)) {
                        if (ERROR("OPEN_BRACE",
                                  "open brace '{' following function declarations go on the next line\n" . $herecurr) &&
                            $fix) {
@@ -4032,8 +4084,8 @@ sub process {
 ##             }
 
 #need space before brace following if, while, etc
-               if (($line =~ /\(.*\){/ && $line !~ /\($Type\){/) ||
-                   $line =~ /do{/) {
+               if (($line =~ /\(.*\)\{/ && $line !~ /\($Type\){/) ||
+                   $line =~ /do\{/) {
                        if (ERROR("SPACING",
                                  "space required before the open brace '{'\n" . $herecurr) &&
                            $fix) {
@@ -4179,6 +4231,35 @@ sub process {
                        }
                }
 
+# comparisons with a constant or upper case identifier on the left
+#      avoid cases like "foo + BAR < baz"
+#      only fix matches surrounded by parentheses to avoid incorrect
+#      conversions like "FOO < baz() + 5" being "misfixed" to "baz() > FOO + 5"
+               if ($^V && $^V ge 5.10.0 &&
+                   $line =~ /^\+(.*)\b($Constant|[A-Z_][A-Z0-9_]*)\s*($Compare)\s*($LvalOrFunc)/) {
+                       my $lead = $1;
+                       my $const = $2;
+                       my $comp = $3;
+                       my $to = $4;
+                       my $newcomp = $comp;
+                       if ($lead !~ /$Operators\s*$/ &&
+                           $to !~ /^(?:Constant|[A-Z_][A-Z0-9_]*)$/ &&
+                           WARN("CONSTANT_COMPARISON",
+                                "Comparisons should place the constant on the right side of the test\n" . $herecurr) &&
+                           $fix) {
+                               if ($comp eq "<") {
+                                       $newcomp = ">";
+                               } elsif ($comp eq "<=") {
+                                       $newcomp = ">=";
+                               } elsif ($comp eq ">") {
+                                       $newcomp = "<";
+                               } elsif ($comp eq ">=") {
+                                       $newcomp = "<=";
+                               }
+                               $fixed[$fixlinenr] =~ s/\(\s*\Q$const\E\s*$Compare\s*\Q$to\E\s*\)/($to $newcomp $const)/;
+                       }
+               }
+
 # Return of what appears to be an errno should normally be negative
                if ($sline =~ /\breturn(?:\s*\(+\s*|\s+)(E[A-Z]+)(?:\s*\)+\s*|\s*)[;:,]/) {
                        my $name = $1;
@@ -4480,7 +4561,7 @@ sub process {
                            $dstat !~ /^for\s*$Constant$/ &&                            # for (...)
                            $dstat !~ /^for\s*$Constant\s+(?:$Ident|-?$Constant)$/ &&   # for (...) bar()
                            $dstat !~ /^do\s*{/ &&                                      # do {...
-                           $dstat !~ /^\({/ &&                                         # ({...
+                           $dstat !~ /^\(\{/ &&                                                # ({...
                            $ctx !~ /^.\s*#\s*define\s+TRACE_(?:SYSTEM|INCLUDE_FILE|INCLUDE_PATH)\b/)
                        {
                                $ctx =~ s/\n*$//;
@@ -4789,16 +4870,20 @@ sub process {
                             "Consecutive strings are generally better as a single string\n" . $herecurr);
                }
 
-# check for %L{u,d,i} in strings
+# check for %L{u,d,i} and 0x%[udi] in strings
                my $string;
                while ($line =~ /(?:^|")([X\t]*)(?:"|$)/g) {
                        $string = substr($rawline, $-[1], $+[1] - $-[1]);
                        $string =~ s/%%/__/g;
-                       if ($string =~ /(?<!%)%L[udi]/) {
+                       if ($string =~ /(?<!%)%[\*\d\.\$]*L[udi]/) {
                                WARN("PRINTF_L",
                                     "\%Ld/%Lu are not-standard C, use %lld/%llu\n" . $herecurr);
                                last;
                        }
+                       if ($string =~ /0x%[\*\d\.\$\Llzth]*[udi]/) {
+                               ERROR("PRINTF_0xDECIMAL",
+                                     "Prefixing 0x with decimal output is defective\n" . $herecurr);
+                       }
                }
 
 # check for line continuations in quoted strings with odd counts of "
@@ -4816,10 +4901,34 @@ sub process {
 
 # check for needless "if (<foo>) fn(<foo>)" uses
                if ($prevline =~ /\bif\s*\(\s*($Lval)\s*\)/) {
-                       my $expr = '\s*\(\s*' . quotemeta($1) . '\s*\)\s*;';
-                       if ($line =~ /\b(kfree|usb_free_urb|debugfs_remove(?:_recursive)?)$expr/) {
-                               WARN('NEEDLESS_IF',
-                                    "$1(NULL) is safe and this check is probably not required\n" . $hereprev);
+                       my $tested = quotemeta($1);
+                       my $expr = '\s*\(\s*' . $tested . '\s*\)\s*;';
+                       if ($line =~ /\b(kfree|usb_free_urb|debugfs_remove(?:_recursive)?|(?:kmem_cache|mempool|dma_pool)_destroy)$expr/) {
+                               my $func = $1;
+                               if (WARN('NEEDLESS_IF',
+                                        "$func(NULL) is safe and this check is probably not required\n" . $hereprev) &&
+                                   $fix) {
+                                       my $do_fix = 1;
+                                       my $leading_tabs = "";
+                                       my $new_leading_tabs = "";
+                                       if ($lines[$linenr - 2] =~ /^\+(\t*)if\s*\(\s*$tested\s*\)\s*$/) {
+                                               $leading_tabs = $1;
+                                       } else {
+                                               $do_fix = 0;
+                                       }
+                                       if ($lines[$linenr - 1] =~ /^\+(\t+)$func\s*\(\s*$tested\s*\)\s*;\s*$/) {
+                                               $new_leading_tabs = $1;
+                                               if (length($leading_tabs) + 1 ne length($new_leading_tabs)) {
+                                                       $do_fix = 0;
+                                               }
+                                       } else {
+                                               $do_fix = 0;
+                                       }
+                                       if ($do_fix) {
+                                               fix_delete_line($fixlinenr - 1, $prevrawline);
+                                               $fixed[$fixlinenr] =~ s/^\+$new_leading_tabs/\+$leading_tabs/;
+                                       }
+                               }
                        }
                }
 
diff --git a/scripts/coccinelle/api/alloc/pool_zalloc-simple.cocci b/scripts/coccinelle/api/alloc/pool_zalloc-simple.cocci
new file mode 100644 (file)
index 0000000..9b7eb32
--- /dev/null
@@ -0,0 +1,84 @@
+///
+/// Use *_pool_zalloc rather than *_pool_alloc followed by memset with 0
+///
+// Copyright: (C) 2015 Intel Corp.  GPLv2.
+// Options: --no-includes --include-headers
+//
+// Keywords: dma_pool_zalloc, pci_pool_zalloc
+//
+
+virtual context
+virtual patch
+virtual org
+virtual report
+
+//----------------------------------------------------------
+//  For context mode
+//----------------------------------------------------------
+
+@depends on context@
+expression x;
+statement S;
+@@
+
+* x = \(dma_pool_alloc\|pci_pool_alloc\)(...);
+  if ((x==NULL) || ...) S
+* memset(x,0, ...);
+
+//----------------------------------------------------------
+//  For patch mode
+//----------------------------------------------------------
+
+@depends on patch@
+expression x;
+expression a,b,c;
+statement S;
+@@
+
+- x = dma_pool_alloc(a,b,c);
++ x = dma_pool_zalloc(a,b,c);
+  if ((x==NULL) || ...) S
+- memset(x,0,...);
+
+@depends on patch@
+expression x;
+expression a,b,c;
+statement S;
+@@
+
+- x = pci_pool_alloc(a,b,c);
++ x = pci_pool_zalloc(a,b,c);
+  if ((x==NULL) || ...) S
+- memset(x,0,...);
+
+//----------------------------------------------------------
+//  For org and report mode
+//----------------------------------------------------------
+
+@r depends on org || report@
+expression x;
+expression a,b,c;
+statement S;
+position p;
+@@
+
+ x = @p\(dma_pool_alloc\|pci_pool_alloc\)(a,b,c);
+ if ((x==NULL) || ...) S
+ memset(x,0, ...);
+
+@script:python depends on org@
+p << r.p;
+x << r.x;
+@@
+
+msg="%s" % (x)
+msg_safe=msg.replace("[","@(").replace("]",")")
+coccilib.org.print_todo(p[0], msg_safe)
+
+@script:python depends on report@
+p << r.p;
+x << r.x;
+@@
+
+msg="WARNING: *_pool_zalloc should be used for %s, instead of *_pool_alloc/memset" % (x)
+coccilib.report.print_report(p[0], msg)
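In driver code, the patch-mode rules above amount to the following kind of rewrite (an illustrative fragment, not taken from the patch; buf, pool, handle and size are hypothetical, and dma_pool_zalloc() takes the same arguments as dma_pool_alloc()):

        /* before */
        buf = dma_pool_alloc(pool, GFP_KERNEL, &handle);
        if (buf == NULL)
                return -ENOMEM;
        memset(buf, 0, size);

        /* after */
        buf = dma_pool_zalloc(pool, GFP_KERNEL, &handle);
        if (buf == NULL)
                return -ENOMEM;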
index e065b9e714fcdff408b24f8cb1380d9fc931e0aa..c5e3f73f20546513fa18e94ca2fd3f6290265e38 100644 (file)
@@ -9,11 +9,14 @@ virtual org
 virtual report
 
 @match1@
+declarer name module_i2c_driver;
 declarer name module_platform_driver;
 declarer name module_platform_driver_probe;
 identifier __driver;
 @@
 (
+       module_i2c_driver(__driver);
+|
        module_platform_driver(__driver);
 |
        module_platform_driver_probe(__driver, ...);
@@ -28,6 +31,15 @@ identifier match1.__driver;
                }
        };
 
+@fix1_i2c depends on match1 && patch && !context && !org && !report@
+identifier match1.__driver;
+@@
+       static struct i2c_driver __driver = {
+               .driver = {
+-                      .owner = THIS_MODULE,
+               }
+       };
+
 @match2@
 identifier __driver;
 @@
@@ -37,6 +49,8 @@ identifier __driver;
        platform_driver_probe(&__driver, ...)
 |
        platform_create_bundle(&__driver, ...)
+|
+       i2c_add_driver(&__driver)
 )
 
 @fix2 depends on match2 && patch && !context && !org && !report@
@@ -48,6 +62,15 @@ identifier match2.__driver;
                }
        };
 
+@fix2_i2c depends on match2 && patch && !context && !org && !report@
+identifier match2.__driver;
+@@
+       static struct i2c_driver __driver = {
+               .driver = {
+-                      .owner = THIS_MODULE,
+               }
+       };
+
 // ----------------------------------------------------------------------------
 
 @fix1_context depends on match1 && !patch && (context || org || report)@
@@ -61,6 +84,17 @@ position j0;
                }
        };
 
+@fix1_i2c_context depends on match1 && !patch && (context || org || report)@
+identifier match1.__driver;
+position j0;
+@@
+
+       static struct i2c_driver __driver = {
+               .driver = {
+*                      .owner@j0 = THIS_MODULE,
+               }
+       };
+
 @fix2_context depends on match2 && !patch && (context || org || report)@
 identifier match2.__driver;
 position j0;
@@ -72,6 +106,17 @@ position j0;
                }
        };
 
+@fix2_i2c_context depends on match2 && !patch && (context || org || report)@
+identifier match2.__driver;
+position j0;
+@@
+
+       static struct i2c_driver __driver = {
+               .driver = {
+*                      .owner@j0 = THIS_MODULE,
+               }
+       };
+
 // ----------------------------------------------------------------------------
 
 @script:python fix1_org depends on org@
@@ -81,6 +126,13 @@ j0 << fix1_context.j0;
 msg = "No need to set .owner here. The core will do it."
 coccilib.org.print_todo(j0[0], msg)
 
+@script:python fix1_i2c_org depends on org@
+j0 << fix1_i2c_context.j0;
+@@
+
+msg = "No need to set .owner here. The core will do it."
+coccilib.org.print_todo(j0[0], msg)
+
 @script:python fix2_org depends on org@
 j0 << fix2_context.j0;
 @@
@@ -88,6 +140,13 @@ j0 << fix2_context.j0;
 msg = "No need to set .owner here. The core will do it."
 coccilib.org.print_todo(j0[0], msg)
 
+@script:python fix2_i2c_org depends on org@
+j0 << fix2_i2c_context.j0;
+@@
+
+msg = "No need to set .owner here. The core will do it."
+coccilib.org.print_todo(j0[0], msg)
+
 // ----------------------------------------------------------------------------
 
 @script:python fix1_report depends on report@
@@ -97,6 +156,13 @@ j0 << fix1_context.j0;
 msg = "No need to set .owner here. The core will do it."
 coccilib.report.print_report(j0[0], msg)
 
+@script:python fix1_i2c_report depends on report@
+j0 << fix1_i2c_context.j0;
+@@
+
+msg = "No need to set .owner here. The core will do it."
+coccilib.report.print_report(j0[0], msg)
+
 @script:python fix2_report depends on report@
 j0 << fix2_context.j0;
 @@
@@ -104,3 +170,10 @@ j0 << fix2_context.j0;
 msg = "No need to set .owner here. The core will do it."
 coccilib.report.print_report(j0[0], msg)
 
+@script:python fix2_i2c_report depends on report@
+j0 << fix2_i2c_context.j0;
+@@
+
+msg = "No need to set .owner here. The core will do it."
+coccilib.report.print_report(j0[0], msg)
+
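The added i2c rules mirror the existing platform_driver ones: when a driver is registered through module_i2c_driver() or i2c_add_driver(), the i2c core fills in the module owner itself, so an explicit assignment is redundant. An illustrative fragment of what the rule strips (driver name and callbacks are hypothetical):

static struct i2c_driver foo_driver = {
        .driver = {
                .name   = "foo",
                .owner  = THIS_MODULE,  /* removed: the core sets this */
        },
        .probe          = foo_probe,
        .remove         = foo_remove,
};
module_i2c_driver(foo_driver);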
index f01789e967ec4bc10b5cb5aae8f83fc3e6ab2408..b7042d074078cbb2883f560df119b53fc4e39cae 100644 (file)
@@ -1,5 +1,5 @@
 /// Make sure pm_runtime_* calls does not use unnecessary IS_ERR_VALUE
-//
+///
 // Keywords: pm_runtime
 // Confidence: Medium
 // Copyright (C) 2013 Texas Instruments Incorporated - GPLv2.
index b67e174f3d95effd048f0bf15a412aa536e2820e..bd1a2a4ee106d04ff847ec5775a52c644e68c4d8 100644 (file)
@@ -1,5 +1,5 @@
-/// This removes an open coded simple_open() function
-/// and replaces file operations references to the function
+/// Remove an open coded simple_open() function
+/// and replace file operations references to the function
 /// with simple_open() instead.
 ///
 // Confidence: High
diff --git a/scripts/coccinelle/api/vma_pages.cocci b/scripts/coccinelle/api/vma_pages.cocci
new file mode 100644 (file)
index 0000000..3e52e11
--- /dev/null
@@ -0,0 +1,60 @@
+///
+/// Use vma_pages function on vma object instead of explicit computation.
+///
+//  Confidence: High
+//  Keywords: vma_pages vma
+//  Comment: Based on resource_size.cocci
+
+virtual context
+virtual patch
+virtual org
+virtual report
+
+//----------------------------------------------------------
+//  For context mode
+//----------------------------------------------------------
+
+@r_context depends on context && !patch && !org && !report@
+struct vm_area_struct *vma;
+@@
+
+* (vma->vm_end - vma->vm_start) >> PAGE_SHIFT
+
+//----------------------------------------------------------
+//  For patch mode
+//----------------------------------------------------------
+
+@r_patch depends on !context && patch && !org && !report@
+struct vm_area_struct *vma;
+@@
+
+- ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)
++ vma_pages(vma)
+
+//----------------------------------------------------------
+//  For org mode
+//----------------------------------------------------------
+
+@r_org depends on !context && !patch && (org || report)@
+struct vm_area_struct *vma;
+position p;
+@@
+
+  (vma->vm_end@p - vma->vm_start) >> PAGE_SHIFT
+
+@script:python depends on report@
+p << r_org.p;
+x << r_org.vma;
+@@
+
+msg="WARNING: Consider using vma_pages helper on %s" % (x)
+coccilib.report.print_report(p[0], msg)
+
+@script:python depends on org@
+p << r_org.p;
+x << r_org.vma;
+@@
+
+msg="WARNING: Consider using vma_pages helper on %s" % (x)
+msg_safe=msg.replace("[","@(").replace("]",")")
+coccilib.org.print_todo(p[0], msg_safe)
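vma_pages() is a small inline helper in <linux/mm.h> that returns (vma->vm_end - vma->vm_start) >> PAGE_SHIFT, so the patch-mode rule above boils down to this rewrite (illustrative fragment; vma and npages are hypothetical):

        /* before */
        unsigned long npages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;

        /* after */
        unsigned long npages = vma_pages(vma);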
index 8aebd1875e75264016ed2ff37a7a21a047b7feae..c2663c677ac1c96ebf72a42a241714fe1231f9c0 100644 (file)
@@ -1,5 +1,4 @@
-/// the address of a variable or field is non-zero is likely always to bo
-/// non-zero
+/// The address of a variable or field is likely always to be non-zero.
 ///
 // Confidence: High
 // Copyright: (C) 2012 Julia Lawall, INRIA/LIP6.  GPLv2.
index a24a754ae1d7283f39e09541d41384b9c88773a2..b421150a2effcd925e83758bbf609d038e67ac1b 100644 (file)
@@ -1,5 +1,8 @@
-/// Make sure threaded IRQs without a primary handler are always request with
-/// IRQF_ONESHOT
+/// Since commit 1c6c69525b40 ("genirq: Reject bogus threaded irq requests")
+/// threaded IRQs without a primary handler need to be requested with
+/// IRQF_ONESHOT, otherwise the request will fail.
+///
+/// So pass the IRQF_ONESHOT flag in this case.
 ///
 //
 // Confidence: Good
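Since the commit cited in the updated description, a request_threaded_irq() call with a NULL primary handler is rejected unless IRQF_ONESHOT is set, because the core must keep the line masked until the handler thread has run. The fix the script suggests looks like this in driver code (illustrative fragment; everything except request_threaded_irq() and the IRQF_* flags is hypothetical):

        ret = request_threaded_irq(irq, NULL, foo_thread_fn,
                                   IRQF_TRIGGER_LOW | IRQF_ONESHOT,
                                   "foo", dev);
        if (ret)
                return ret;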
index 605955a91c44937990cdbcf8aaeb76a966ecf7eb..d8286ef5307fcaf7f97302effdcb995698addc89 100644 (file)
@@ -1,5 +1,5 @@
 ///
-/// Removes unneeded variable used to store return value.
+/// Remove unneeded variable used to store return value.
 ///
 // Confidence: Moderate
 // Copyright: (C) 2012 Peter Senna Tschudin, INRIA/LIP6.  GPLv2.
index a47eba2edc9e1ee315f3a4b13f0c56e195763a2a..6740c659a2b38484188a773e41b71a8af6c1c4bd 100644 (file)
@@ -1,5 +1,5 @@
 ///
-/// Removes unneeded semicolon.
+/// Remove unneeded semicolon.
 ///
 // Confidence: Moderate
 // Copyright: (C) 2012 Peter Senna Tschudin, INRIA/LIP6.  GPLv2.
index 47f7084b6360a0fa8aa378e7e630dea33f002a88..e8b6313b116f57d1d27b66efac786efe92fcd6ec 100644 (file)
@@ -1,6 +1,6 @@
 /// Simplify a trivial if-return sequence.  Possibly combine with a
 /// preceding function call.
-//
+///
 // Confidence: High
 // Copyright: (C) 2014 Julia Lawall, INRIA/LIP6.  GPLv2.
 // Copyright: (C) 2014 Gilles Muller, INRIA/LiP6.  GPLv2.
diff --git a/scripts/extract-cert.c b/scripts/extract-cert.c
new file mode 100644 (file)
index 0000000..6ce5945
--- /dev/null
@@ -0,0 +1,166 @@
+/* Extract X.509 certificate in DER form from PKCS#11 or PEM.
+ *
+ * Copyright © 2014-2015 Red Hat, Inc. All Rights Reserved.
+ * Copyright © 2015      Intel Corporation.
+ *
+ * Authors: David Howells <dhowells@redhat.com>
+ *          David Woodhouse <dwmw2@infradead.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the licence, or (at your option) any later version.
+ */
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <string.h>
+#include <getopt.h>
+#include <err.h>
+#include <arpa/inet.h>
+#include <openssl/bio.h>
+#include <openssl/evp.h>
+#include <openssl/pem.h>
+#include <openssl/pkcs7.h>
+#include <openssl/err.h>
+#include <openssl/engine.h>
+
+#define PKEY_ID_PKCS7 2
+
+static __attribute__((noreturn))
+void format(void)
+{
+       fprintf(stderr,
+               "Usage: scripts/extract-cert <source> <dest>\n");
+       exit(2);
+}
+
+static void display_openssl_errors(int l)
+{
+       const char *file;
+       char buf[120];
+       int e, line;
+
+       if (ERR_peek_error() == 0)
+               return;
+       fprintf(stderr, "At main.c:%d:\n", l);
+
+       while ((e = ERR_get_error_line(&file, &line))) {
+               ERR_error_string(e, buf);
+               fprintf(stderr, "- SSL %s: %s:%d\n", buf, file, line);
+       }
+}
+
+static void drain_openssl_errors(void)
+{
+       const char *file;
+       int line;
+
+       if (ERR_peek_error() == 0)
+               return;
+       while (ERR_get_error_line(&file, &line)) {}
+}
+
+#define ERR(cond, fmt, ...)                            \
+       do {                                            \
+               bool __cond = (cond);                   \
+               display_openssl_errors(__LINE__);       \
+               if (__cond) {                           \
+                       err(1, fmt, ## __VA_ARGS__);    \
+               }                                       \
+       } while(0)
+
+static const char *key_pass;
+static BIO *wb;
+static char *cert_dst;
+int kbuild_verbose;
+
+static void write_cert(X509 *x509)
+{
+       char buf[200];
+
+       if (!wb) {
+               wb = BIO_new_file(cert_dst, "wb");
+               ERR(!wb, "%s", cert_dst);
+       }
+       X509_NAME_oneline(X509_get_subject_name(x509), buf, sizeof(buf));
+       ERR(!i2d_X509_bio(wb, x509), "%s", cert_dst);
+       if (kbuild_verbose)
+               fprintf(stderr, "Extracted cert: %s\n", buf);
+}
+
+int main(int argc, char **argv)
+{
+       char *cert_src;
+
+       OpenSSL_add_all_algorithms();
+       ERR_load_crypto_strings();
+       ERR_clear_error();
+
+       kbuild_verbose = atoi(getenv("KBUILD_VERBOSE")?:"0");
+
+        key_pass = getenv("KBUILD_SIGN_PIN");
+
+       if (argc != 3)
+               format();
+
+       cert_src = argv[1];
+       cert_dst = argv[2];
+
+       if (!cert_src[0]) {
+               /* Invoked with no input; create empty file */
+               FILE *f = fopen(cert_dst, "wb");
+               ERR(!f, "%s", cert_dst);
+               fclose(f);
+               exit(0);
+       } else if (!strncmp(cert_src, "pkcs11:", 7)) {
+               ENGINE *e;
+               struct {
+                       const char *cert_id;
+                       X509 *cert;
+               } parms;
+
+               parms.cert_id = cert_src;
+               parms.cert = NULL;
+
+               ENGINE_load_builtin_engines();
+               drain_openssl_errors();
+               e = ENGINE_by_id("pkcs11");
+               ERR(!e, "Load PKCS#11 ENGINE");
+               if (ENGINE_init(e))
+                       drain_openssl_errors();
+               else
+                       ERR(1, "ENGINE_init");
+               if (key_pass)
+                       ERR(!ENGINE_ctrl_cmd_string(e, "PIN", key_pass, 0), "Set PKCS#11 PIN");
+               ENGINE_ctrl_cmd(e, "LOAD_CERT_CTRL", 0, &parms, NULL, 1);
+               ERR(!parms.cert, "Get X.509 from PKCS#11");
+               write_cert(parms.cert);
+       } else {
+               BIO *b;
+               X509 *x509;
+
+               b = BIO_new_file(cert_src, "rb");
+               ERR(!b, "%s", cert_src);
+
+               while (1) {
+                       x509 = PEM_read_bio_X509(b, NULL, NULL, NULL);
+                       if (wb && !x509) {
+                               unsigned long err = ERR_peek_last_error();
+                               if (ERR_GET_LIB(err) == ERR_LIB_PEM &&
+                                   ERR_GET_REASON(err) == PEM_R_NO_START_LINE) {
+                                       ERR_clear_error();
+                                       break;
+                               }
+                       }
+                       ERR(!x509, "%s", cert_src);
+                       write_cert(x509);
+               }
+       }
+
+       BIO_free(wb);
+
+       return 0;
+}
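As its usage message states, the new helper is invoked as "scripts/extract-cert <source> <dest>". An empty <source> produces an empty destination file, a source beginning with "pkcs11:" is loaded through the OpenSSL pkcs11 engine (with KBUILD_SIGN_PIN supplying the PIN when set), and any other source is read as a PEM file whose certificates are written to <dest> in DER form, one after another.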
index c9f0f0ce82ff73b2aca4944dc3d63ca688c727af..99950b5afb0dd68e80ae00136a37545282be48e0 100644 (file)
@@ -1,4 +1,4 @@
-/* A Bison parser, made by GNU Bison 2.5.1.  */
+/* A Bison parser, made by GNU Bison 2.7.  */
 
 /* Bison implementation for Yacc-like parsers in C
    
@@ -44,7 +44,7 @@
 #define YYBISON 1
 
 /* Bison version.  */
-#define YYBISON_VERSION "2.5.1"
+#define YYBISON_VERSION "2.7"
 
 /* Skeleton name.  */
 #define YYSKELETON_NAME "yacc.c"
@@ -58,8 +58,6 @@
 /* Pull parsers.  */
 #define YYPULL 1
 
-/* Using locations.  */
-#define YYLSP_NEEDED 0
 
 
 
@@ -125,11 +123,6 @@ static void record_compound(struct string_list **keyw,
 #  endif
 # endif
 
-/* Enabling traces.  */
-#ifndef YYDEBUG
-# define YYDEBUG 1
-#endif
-
 /* Enabling verbose error messages.  */
 #ifdef YYERROR_VERBOSE
 # undef YYERROR_VERBOSE
@@ -138,11 +131,14 @@ static void record_compound(struct string_list **keyw,
 # define YYERROR_VERBOSE 0
 #endif
 
-/* Enabling the token table.  */
-#ifndef YYTOKEN_TABLE
-# define YYTOKEN_TABLE 0
-#endif
 
+/* Enabling traces.  */
+#ifndef YYDEBUG
+# define YYDEBUG 1
+#endif
+#if YYDEBUG
+extern int yydebug;
+#endif
 
 /* Tokens.  */
 #ifndef YYTOKENTYPE
@@ -196,7 +192,6 @@ static void record_compound(struct string_list **keyw,
 #endif
 
 
-
 #if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
 typedef int YYSTYPE;
 # define YYSTYPE_IS_TRIVIAL 1
@@ -204,6 +199,23 @@ typedef int YYSTYPE;
 # define YYSTYPE_IS_DECLARED 1
 #endif
 
+extern YYSTYPE yylval;
+
+#ifdef YYPARSE_PARAM
+#if defined __STDC__ || defined __cplusplus
+int yyparse (void *YYPARSE_PARAM);
+#else
+int yyparse ();
+#endif
+#else /* ! YYPARSE_PARAM */
+#if defined __STDC__ || defined __cplusplus
+int yyparse (void);
+#else
+int yyparse ();
+#endif
+#endif /* ! YYPARSE_PARAM */
+
+
 
 /* Copy the second part of user declarations.  */
 
@@ -260,24 +272,24 @@ typedef short int yytype_int16;
 # if defined YYENABLE_NLS && YYENABLE_NLS
 #  if ENABLE_NLS
 #   include <libintl.h> /* INFRINGES ON USER NAME SPACE */
-#   define YY_(msgid) dgettext ("bison-runtime", msgid)
+#   define YY_(Msgid) dgettext ("bison-runtime", Msgid)
 #  endif
 # endif
 # ifndef YY_
-#  define YY_(msgid) msgid
+#  define YY_(Msgid) Msgid
 # endif
 #endif
 
 /* Suppress unused-variable warnings by "using" E.  */
 #if ! defined lint || defined __GNUC__
-# define YYUSE(e) ((void) (e))
+# define YYUSE(E) ((void) (E))
 #else
-# define YYUSE(e) /* empty */
+# define YYUSE(E) /* empty */
 #endif
 
 /* Identity function, used to suppress warnings about constant conditions.  */
 #ifndef lint
-# define YYID(n) (n)
+# define YYID(N) (N)
 #else
 #if (defined __STDC__ || defined __C99__FUNC__ \
      || defined __cplusplus || defined _MSC_VER)
@@ -427,16 +439,16 @@ union yyalloc
 /* YYFINAL -- State number of the termination state.  */
 #define YYFINAL  4
 /* YYLAST -- Last index in YYTABLE.  */
-#define YYLAST   514
+#define YYLAST   515
 
 /* YYNTOKENS -- Number of terminals.  */
 #define YYNTOKENS  54
 /* YYNNTS -- Number of nonterminals.  */
 #define YYNNTS  49
 /* YYNRULES -- Number of rules.  */
-#define YYNRULES  132
+#define YYNRULES  133
 /* YYNRULES -- Number of states.  */
-#define YYNSTATES  187
+#define YYNSTATES  188
 
 /* YYTRANSLATE(YYLEX) -- Bison symbol number corresponding to YYLEX.  */
 #define YYUNDEFTOK  2
@@ -492,13 +504,13 @@ static const yytype_uint16 yyprhs[] =
       97,   101,   105,   109,   112,   115,   118,   120,   122,   124,
      126,   128,   130,   132,   134,   136,   138,   140,   143,   144,
      146,   148,   151,   153,   155,   157,   159,   162,   164,   166,
-     171,   176,   179,   183,   187,   190,   192,   194,   196,   201,
-     206,   209,   213,   217,   220,   222,   226,   227,   229,   231,
-     235,   238,   241,   243,   244,   246,   248,   253,   258,   261,
-     265,   269,   273,   274,   276,   279,   283,   287,   288,   290,
-     292,   295,   299,   302,   303,   305,   307,   311,   314,   317,
-     319,   322,   323,   326,   330,   335,   337,   341,   343,   347,
-     350,   351,   353
+     168,   173,   178,   181,   185,   189,   192,   194,   196,   198,
+     203,   208,   211,   215,   219,   222,   224,   228,   229,   231,
+     233,   237,   240,   243,   245,   246,   248,   250,   255,   260,
+     263,   267,   271,   275,   276,   278,   281,   285,   289,   290,
+     292,   294,   297,   301,   304,   305,   307,   309,   313,   316,
+     319,   321,   324,   325,   328,   332,   337,   339,   343,   345,
+     349,   352,   353,   355
 };
 
 /* YYRHS -- A `-1'-separated list of the rules' RHS.  */
@@ -520,26 +532,27 @@ static const yytype_int8 yyrhs[] =
       13,    -1,     9,    -1,    26,    -1,     6,    -1,    42,    -1,
       50,    72,    -1,    -1,    73,    -1,    74,    -1,    73,    74,
       -1,     8,    -1,    27,    -1,    31,    -1,    18,    -1,    71,
-      75,    -1,    76,    -1,    38,    -1,    76,    48,    79,    49,
-      -1,    76,    48,     1,    49,    -1,    76,    34,    -1,    48,
-      75,    49,    -1,    48,     1,    49,    -1,    71,    77,    -1,
-      78,    -1,    38,    -1,    42,    -1,    78,    48,    79,    49,
-      -1,    78,    48,     1,    49,    -1,    78,    34,    -1,    48,
-      77,    49,    -1,    48,     1,    49,    -1,    80,    37,    -1,
-      80,    -1,    81,    47,    37,    -1,    -1,    81,    -1,    82,
-      -1,    81,    47,    82,    -1,    66,    83,    -1,    71,    83,
-      -1,    84,    -1,    -1,    38,    -1,    42,    -1,    84,    48,
-      79,    49,    -1,    84,    48,     1,    49,    -1,    84,    34,
-      -1,    48,    83,    49,    -1,    48,     1,    49,    -1,    65,
-      75,    33,    -1,    -1,    87,    -1,    51,    35,    -1,    52,
-      89,    46,    -1,    52,     1,    46,    -1,    -1,    90,    -1,
-      91,    -1,    90,    91,    -1,    65,    92,    45,    -1,     1,
-      45,    -1,    -1,    93,    -1,    94,    -1,    93,    47,    94,
-      -1,    77,    96,    -1,    38,    95,    -1,    95,    -1,    53,
-      35,    -1,    -1,    96,    31,    -1,    52,    98,    46,    -1,
-      52,    98,    47,    46,    -1,    99,    -1,    98,    47,    99,
-      -1,    38,    -1,    38,    51,    35,    -1,    30,    45,    -1,
-      -1,    30,    -1,    29,    48,    38,    49,    45,    -1
+      75,    -1,    76,    -1,    38,    -1,    42,    -1,    76,    48,
+      79,    49,    -1,    76,    48,     1,    49,    -1,    76,    34,
+      -1,    48,    75,    49,    -1,    48,     1,    49,    -1,    71,
+      77,    -1,    78,    -1,    38,    -1,    42,    -1,    78,    48,
+      79,    49,    -1,    78,    48,     1,    49,    -1,    78,    34,
+      -1,    48,    77,    49,    -1,    48,     1,    49,    -1,    80,
+      37,    -1,    80,    -1,    81,    47,    37,    -1,    -1,    81,
+      -1,    82,    -1,    81,    47,    82,    -1,    66,    83,    -1,
+      71,    83,    -1,    84,    -1,    -1,    38,    -1,    42,    -1,
+      84,    48,    79,    49,    -1,    84,    48,     1,    49,    -1,
+      84,    34,    -1,    48,    83,    49,    -1,    48,     1,    49,
+      -1,    65,    75,    33,    -1,    -1,    87,    -1,    51,    35,
+      -1,    52,    89,    46,    -1,    52,     1,    46,    -1,    -1,
+      90,    -1,    91,    -1,    90,    91,    -1,    65,    92,    45,
+      -1,     1,    45,    -1,    -1,    93,    -1,    94,    -1,    93,
+      47,    94,    -1,    77,    96,    -1,    38,    95,    -1,    95,
+      -1,    53,    35,    -1,    -1,    96,    31,    -1,    52,    98,
+      46,    -1,    52,    98,    47,    46,    -1,    99,    -1,    98,
+      47,    99,    -1,    38,    -1,    38,    51,    35,    -1,    30,
+      45,    -1,    -1,    30,    -1,    29,    48,    38,    49,    45,
+      -1
 };
 
 /* YYRLINE[YYN] -- source line where rule number YYN was defined.  */
@@ -552,17 +565,17 @@ static const yytype_uint16 yyrline[] =
      237,   239,   241,   246,   249,   250,   254,   255,   256,   257,
      258,   259,   260,   261,   262,   263,   264,   268,   273,   274,
      278,   279,   283,   283,   283,   284,   292,   293,   297,   306,
-     308,   310,   312,   314,   321,   322,   326,   327,   328,   330,
-     332,   334,   336,   341,   342,   343,   347,   348,   352,   353,
-     358,   363,   365,   369,   370,   378,   382,   384,   386,   388,
-     390,   395,   404,   405,   410,   415,   416,   420,   421,   425,
-     426,   430,   432,   437,   438,   442,   443,   447,   448,   449,
-     453,   457,   458,   462,   463,   467,   468,   471,   476,   484,
-     488,   489,   493
+     315,   317,   319,   321,   323,   330,   331,   335,   336,   337,
+     339,   341,   343,   345,   350,   351,   352,   356,   357,   361,
+     362,   367,   372,   374,   378,   379,   387,   391,   393,   395,
+     397,   399,   404,   413,   414,   419,   424,   425,   429,   430,
+     434,   435,   439,   441,   446,   447,   451,   452,   456,   457,
+     458,   462,   466,   467,   471,   472,   476,   477,   480,   485,
+     493,   497,   498,   502
 };
 #endif
 
-#if YYDEBUG || YYERROR_VERBOSE || YYTOKEN_TABLE
+#if YYDEBUG || YYERROR_VERBOSE || 0
 /* YYTNAME[SYMBOL-NUM] -- String name of the symbol SYMBOL-NUM.
    First, the terminals, then, starting at YYNTOKENS, nonterminals.  */
 static const char *const yytname[] =
@@ -621,13 +634,13 @@ static const yytype_uint8 yyr1[] =
       69,    69,    69,    69,    69,    69,    70,    70,    70,    70,
       70,    70,    70,    70,    70,    70,    70,    71,    72,    72,
       73,    73,    74,    74,    74,    74,    75,    75,    76,    76,
-      76,    76,    76,    76,    77,    77,    78,    78,    78,    78,
-      78,    78,    78,    79,    79,    79,    80,    80,    81,    81,
-      82,    83,    83,    84,    84,    84,    84,    84,    84,    84,
-      84,    85,    86,    86,    87,    88,    88,    89,    89,    90,
-      90,    91,    91,    92,    92,    93,    93,    94,    94,    94,
-      95,    96,    96,    97,    97,    98,    98,    99,    99,   100,
-     101,   101,   102
+      76,    76,    76,    76,    76,    77,    77,    78,    78,    78,
+      78,    78,    78,    78,    79,    79,    79,    80,    80,    81,
+      81,    82,    83,    83,    84,    84,    84,    84,    84,    84,
+      84,    84,    85,    86,    86,    87,    88,    88,    89,    89,
+      90,    90,    91,    91,    92,    92,    93,    93,    94,    94,
+      94,    95,    96,    96,    97,    97,    98,    98,    99,    99,
+     100,   101,   101,   102
 };
 
 /* YYR2[YYN] -- Number of symbols composing right hand side of rule YYN.  */
@@ -639,14 +652,14 @@ static const yytype_uint8 yyr2[] =
        1,     1,     1,     1,     1,     4,     1,     2,     2,     2,
        3,     3,     3,     2,     2,     2,     1,     1,     1,     1,
        1,     1,     1,     1,     1,     1,     1,     2,     0,     1,
-       1,     2,     1,     1,     1,     1,     2,     1,     1,     4,
-       4,     2,     3,     3,     2,     1,     1,     1,     4,     4,
-       2,     3,     3,     2,     1,     3,     0,     1,     1,     3,
-       2,     2,     1,     0,     1,     1,     4,     4,     2,     3,
-       3,     3,     0,     1,     2,     3,     3,     0,     1,     1,
-       2,     3,     2,     0,     1,     1,     3,     2,     2,     1,
-       2,     0,     2,     3,     4,     1,     3,     1,     3,     2,
-       0,     1,     5
+       1,     2,     1,     1,     1,     1,     2,     1,     1,     1,
+       4,     4,     2,     3,     3,     2,     1,     1,     1,     4,
+       4,     2,     3,     3,     2,     1,     3,     0,     1,     1,
+       3,     2,     2,     1,     0,     1,     1,     4,     4,     2,
+       3,     3,     3,     0,     1,     2,     3,     3,     0,     1,
+       1,     2,     3,     2,     0,     1,     1,     3,     2,     2,
+       1,     2,     0,     2,     3,     4,     1,     3,     1,     3,
+       2,     0,     1,     5
 };
 
 /* YYDEFACT[STATE-NAME] -- Default reduction number in state STATE-NUM.
@@ -660,187 +673,187 @@ static const yytype_uint8 yydefact[] =
        0,     0,     0,    64,    36,    56,     5,    10,    17,    23,
       24,    26,    27,    33,    34,    11,    12,    13,    14,    15,
       39,     0,    43,     6,    37,     0,    44,    22,    38,    45,
-       0,     0,   129,    68,     0,    58,     0,    18,    19,     0,
-     130,    67,    25,    42,   127,     0,   125,    22,    40,     0,
-     113,     0,     0,   109,     9,    17,    41,    93,     0,     0,
-       0,     0,    57,    59,    60,    16,     0,    66,   131,   101,
-     121,    71,     0,     0,   123,     0,     7,   112,   106,    76,
-      77,     0,     0,     0,   121,    75,     0,   114,   115,   119,
-     105,     0,   110,   130,    94,    56,     0,    93,    90,    92,
-      35,     0,    73,    72,    61,    20,   102,     0,     0,    84,
-      87,    88,   128,   124,   126,   118,     0,    76,     0,   120,
-      74,   117,    80,     0,   111,     0,     0,    95,     0,    91,
-      98,     0,   132,   122,     0,    21,   103,    70,    69,    83,
-       0,    82,    81,     0,     0,   116,   100,    99,     0,     0,
-     104,    85,    89,    79,    78,    97,    96
+       0,     0,   130,    68,    69,     0,    58,     0,    18,    19,
+       0,   131,    67,    25,    42,   128,     0,   126,    22,    40,
+       0,   114,     0,     0,   110,     9,    17,    41,    94,     0,
+       0,     0,     0,    57,    59,    60,    16,     0,    66,   132,
+     102,   122,    72,     0,     0,   124,     0,     7,   113,   107,
+      77,    78,     0,     0,     0,   122,    76,     0,   115,   116,
+     120,   106,     0,   111,   131,    95,    56,     0,    94,    91,
+      93,    35,     0,    74,    73,    61,    20,   103,     0,     0,
+      85,    88,    89,   129,   125,   127,   119,     0,    77,     0,
+     121,    75,   118,    81,     0,   112,     0,     0,    96,     0,
+      92,    99,     0,   133,   123,     0,    21,   104,    71,    70,
+      84,     0,    83,    82,     0,     0,   117,   101,   100,     0,
+       0,   105,    86,    90,    80,    79,    98,    97
 };
 
 /* YYDEFGOTO[NTERM-NUM].  */
 static const yytype_int16 yydefgoto[] =
 {
-      -1,     1,     2,     3,    36,    77,    57,    37,    66,    67,
-      68,    80,    39,    40,    41,    42,    43,    69,    92,    93,
-      44,   123,    71,   114,   115,   138,   139,   140,   141,   128,
-     129,    45,   165,   166,    56,    81,    82,    83,   116,   117,
-     118,   119,   136,    52,    75,    76,    46,   100,    47
+      -1,     1,     2,     3,    36,    78,    57,    37,    67,    68,
+      69,    81,    39,    40,    41,    42,    43,    70,    93,    94,
+      44,   124,    72,   115,   116,   139,   140,   141,   142,   129,
+     130,    45,   166,   167,    56,    82,    83,    84,   117,   118,
+     119,   120,   137,    52,    76,    77,    46,   101,    47
 };
 
 /* YYPACT[STATE-NUM] -- Index in YYTABLE of the portion describing
    STATE-NUM.  */
-#define YYPACT_NINF -140
+#define YYPACT_NINF -92
 static const yytype_int16 yypact[] =
 {
-    -140,    29,  -140,   207,  -140,  -140,    40,  -140,  -140,  -140,
-    -140,  -140,   -27,  -140,    44,  -140,  -140,  -140,  -140,  -140,
-    -140,  -140,  -140,  -140,   -22,  -140,   -18,  -140,  -140,  -140,
-      -9,    22,    28,  -140,  -140,  -140,  -140,  -140,    42,   472,
-    -140,  -140,  -140,  -140,  -140,  -140,  -140,  -140,  -140,  -140,
-      46,    43,  -140,  -140,    47,   107,  -140,   472,    47,  -140,
-     472,    62,  -140,  -140,    16,    -3,    57,    56,  -140,    42,
-      35,   -11,  -140,  -140,    53,    48,  -140,   472,  -140,    51,
-      21,    59,   157,  -140,  -140,    42,  -140,   388,    58,    60,
-      70,    81,  -140,    -3,  -140,  -140,    42,  -140,  -140,  -140,
-    -140,  -140,   253,    71,  -140,   -20,  -140,  -140,  -140,    83,
-    -140,     5,   102,    34,  -140,    12,    95,    94,  -140,  -140,
-    -140,    97,  -140,   113,  -140,  -140,     2,    41,  -140,    27,
-    -140,    99,  -140,  -140,  -140,  -140,   -24,    98,   101,   109,
-     104,  -140,  -140,  -140,  -140,  -140,   105,  -140,   110,  -140,
-    -140,   117,  -140,   298,  -140,    21,   112,  -140,   120,  -140,
-    -140,   343,  -140,  -140,   121,  -140,  -140,  -140,  -140,  -140,
-     434,  -140,  -140,   131,   137,  -140,  -140,  -140,   138,   141,
-    -140,  -140,  -140,  -140,  -140,  -140,  -140
+     -92,    19,   -92,   208,   -92,   -92,    39,   -92,   -92,   -92,
+     -92,   -92,   -27,   -92,    23,   -92,   -92,   -92,   -92,   -92,
+     -92,   -92,   -92,   -92,   -22,   -92,     9,   -92,   -92,   -92,
+      -6,    16,    25,   -92,   -92,   -92,   -92,   -92,    31,   473,
+     -92,   -92,   -92,   -92,   -92,   -92,   -92,   -92,   -92,   -92,
+      49,    37,   -92,   -92,    51,   108,   -92,   473,    51,   -92,
+     473,    59,   -92,   -92,   -92,    12,    -3,    60,    57,   -92,
+      31,    -7,    24,   -92,   -92,    55,    42,   -92,   473,   -92,
+      46,   -21,    61,   158,   -92,   -92,    31,   -92,   389,    71,
+      82,    88,    89,   -92,    -3,   -92,   -92,    31,   -92,   -92,
+     -92,   -92,   -92,   254,    73,   -92,   -24,   -92,   -92,   -92,
+      90,   -92,    17,    75,    45,   -92,    32,    96,    95,   -92,
+     -92,   -92,    99,   -92,   115,   -92,   -92,     3,    48,   -92,
+      34,   -92,   102,   -92,   -92,   -92,   -92,   -11,   100,   103,
+     111,   104,   -92,   -92,   -92,   -92,   -92,   106,   -92,   113,
+     -92,   -92,   126,   -92,   299,   -92,   -21,   121,   -92,   132,
+     -92,   -92,   344,   -92,   -92,   125,   -92,   -92,   -92,   -92,
+     -92,   435,   -92,   -92,   138,   139,   -92,   -92,   -92,   142,
+     143,   -92,   -92,   -92,   -92,   -92,   -92,   -92
 };
 
 /* YYPGOTO[NTERM-NUM].  */
 static const yytype_int16 yypgoto[] =
 {
-    -140,  -140,   190,  -140,  -140,  -140,  -140,   -45,  -140,  -140,
-      96,     1,   -60,   -31,  -140,  -140,  -140,   -78,  -140,  -140,
-     -55,    -7,  -140,   -92,  -140,  -139,  -140,  -140,   -59,   -39,
-    -140,  -140,  -140,  -140,   -13,  -140,  -140,   111,  -140,  -140,
-      39,    87,    84,   147,  -140,   106,  -140,  -140,  -140
+     -92,   -92,   192,   -92,   -92,   -92,   -92,   -47,   -92,   -92,
+      97,     0,   -60,   -32,   -92,   -92,   -92,   -79,   -92,   -92,
+     -58,   -26,   -92,   -38,   -92,   -91,   -92,   -92,   -59,   -28,
+     -92,   -92,   -92,   -92,   -20,   -92,   -92,   112,   -92,   -92,
+      41,    91,    83,   149,   -92,   101,   -92,   -92,   -92
 };
 
 /* YYTABLE[YYPACT[STATE-NUM]].  What to do in state STATE-NUM.  If
    positive, shift that token.  If negative, reduce the rule which
    number is the opposite.  If YYTABLE_NINF, syntax error.  */
-#define YYTABLE_NINF -109
+#define YYTABLE_NINF -110
 static const yytype_int16 yytable[] =
 {
-      87,    88,   113,   156,    38,    10,   146,   163,    72,   127,
-      94,    50,    84,    59,   174,    20,    54,    90,    74,   148,
-      58,   150,   179,   101,    29,    51,   143,   164,    33,     4,
-      55,    70,   106,   113,    55,   113,   -93,   102,   134,    60,
-     124,    78,    87,   147,   157,    86,   152,   110,   127,   127,
-     126,   -93,    65,   111,    63,    65,    72,    91,    85,   109,
-     153,   160,    97,   110,    64,    98,    65,    53,    99,   111,
-      61,    65,   147,    62,   112,   161,   110,   113,    85,   124,
-      63,    74,   111,   157,    65,    48,    49,   158,   159,   126,
-      64,    65,    65,    87,   104,   105,   107,   108,    51,    55,
-      89,    87,    95,    96,   103,   120,   142,   130,    79,   131,
-      87,   182,     7,     8,     9,    10,    11,    12,    13,   132,
-      15,    16,    17,    18,    19,    20,    21,    22,    23,    24,
-     133,    26,    27,    28,    29,    30,   112,   149,    33,    34,
-     154,   155,   107,    98,   162,   -22,   169,   167,   163,    35,
-     168,   170,   -22,  -107,   171,   -22,   180,   -22,   121,   172,
-     -22,   176,     7,     8,     9,    10,    11,    12,    13,   177,
-      15,    16,    17,    18,    19,    20,    21,    22,    23,    24,
-     183,    26,    27,    28,    29,    30,   184,   185,    33,    34,
-     186,     5,   135,   122,   175,   -22,   145,    73,   151,    35,
-       0,     0,   -22,  -108,     0,   -22,     0,   -22,     6,     0,
-     -22,   144,     7,     8,     9,    10,    11,    12,    13,    14,
-      15,    16,    17,    18,    19,    20,    21,    22,    23,    24,
-      25,    26,    27,    28,    29,    30,    31,    32,    33,    34,
-       0,     0,     0,     0,     0,   -22,     0,     0,     0,    35,
-       0,     0,   -22,     0,   137,   -22,     0,   -22,     7,     8,
-       9,    10,    11,    12,    13,     0,    15,    16,    17,    18,
-      19,    20,    21,    22,    23,    24,     0,    26,    27,    28,
-      29,    30,     0,     0,    33,    34,     0,     0,     0,     0,
-     -86,     0,     0,     0,     0,    35,     0,     0,     0,   173,
-       0,     0,   -86,     7,     8,     9,    10,    11,    12,    13,
-       0,    15,    16,    17,    18,    19,    20,    21,    22,    23,
-      24,     0,    26,    27,    28,    29,    30,     0,     0,    33,
-      34,     0,     0,     0,     0,   -86,     0,     0,     0,     0,
-      35,     0,     0,     0,   178,     0,     0,   -86,     7,     8,
-       9,    10,    11,    12,    13,     0,    15,    16,    17,    18,
-      19,    20,    21,    22,    23,    24,     0,    26,    27,    28,
-      29,    30,     0,     0,    33,    34,     0,     0,     0,     0,
-     -86,     0,     0,     0,     0,    35,     0,     0,     0,     0,
-       0,     0,   -86,     7,     8,     9,    10,    11,    12,    13,
-       0,    15,    16,    17,    18,    19,    20,    21,    22,    23,
-      24,     0,    26,    27,    28,    29,    30,     0,     0,    33,
-      34,     0,     0,     0,     0,     0,   124,     0,     0,     0,
-     125,     0,     0,     0,     0,     0,   126,     0,    65,     7,
+      88,    89,   114,    38,   157,    10,    59,    73,    95,   128,
+      85,    50,    71,    91,    75,    20,    54,   110,   147,     4,
+     164,   111,   144,    99,    29,    51,   100,   112,    33,    66,
+      55,   107,   113,   114,    79,   114,   135,   -94,    87,    92,
+     165,   125,    60,    88,    98,   158,    53,    58,   128,   128,
+      63,   127,   -94,    66,    64,   148,    73,    86,   102,   111,
+      65,    55,    66,   175,    61,   112,   153,    66,   161,    63,
+      62,   180,   103,    64,   149,    75,   151,   114,    86,    65,
+     154,    66,   162,   148,    48,    49,   125,   111,   105,   106,
+     158,   108,   109,   112,    88,    66,   127,    90,    66,   159,
+     160,    51,    88,    55,    97,    96,   104,   121,   143,    80,
+     150,    88,   183,     7,     8,     9,    10,    11,    12,    13,
+     131,    15,    16,    17,    18,    19,    20,    21,    22,    23,
+      24,   132,    26,    27,    28,    29,    30,   133,   134,    33,
+      34,   155,   156,   113,   108,    99,   -22,   163,   170,   168,
+      35,   171,   169,   -22,  -108,   172,   -22,   164,   -22,   122,
+     181,   -22,   173,     7,     8,     9,    10,    11,    12,    13,
+     177,    15,    16,    17,    18,    19,    20,    21,    22,    23,
+      24,   178,    26,    27,    28,    29,    30,   184,   185,    33,
+      34,   186,   187,     5,   136,   123,   -22,   176,   152,    74,
+      35,   146,     0,   -22,  -109,     0,   -22,   145,   -22,     6,
+       0,   -22,     0,     7,     8,     9,    10,    11,    12,    13,
+      14,    15,    16,    17,    18,    19,    20,    21,    22,    23,
+      24,    25,    26,    27,    28,    29,    30,    31,    32,    33,
+      34,     0,     0,     0,     0,     0,   -22,     0,     0,     0,
+      35,     0,     0,   -22,     0,   138,   -22,     0,   -22,     7,
+       8,     9,    10,    11,    12,    13,     0,    15,    16,    17,
+      18,    19,    20,    21,    22,    23,    24,     0,    26,    27,
+      28,    29,    30,     0,     0,    33,    34,     0,     0,     0,
+       0,   -87,     0,     0,     0,     0,    35,     0,     0,     0,
+     174,     0,     0,   -87,     7,     8,     9,    10,    11,    12,
+      13,     0,    15,    16,    17,    18,    19,    20,    21,    22,
+      23,    24,     0,    26,    27,    28,    29,    30,     0,     0,
+      33,    34,     0,     0,     0,     0,   -87,     0,     0,     0,
+       0,    35,     0,     0,     0,   179,     0,     0,   -87,     7,
        8,     9,    10,    11,    12,    13,     0,    15,    16,    17,
       18,    19,    20,    21,    22,    23,    24,     0,    26,    27,
       28,    29,    30,     0,     0,    33,    34,     0,     0,     0,
-       0,   181,     0,     0,     0,     0,    35,     7,     8,     9,
-      10,    11,    12,    13,     0,    15,    16,    17,    18,    19,
-      20,    21,    22,    23,    24,     0,    26,    27,    28,    29,
-      30,     0,     0,    33,    34,     0,     0,     0,     0,     0,
-       0,     0,     0,     0,    35
+       0,   -87,     0,     0,     0,     0,    35,     0,     0,     0,
+       0,     0,     0,   -87,     7,     8,     9,    10,    11,    12,
+      13,     0,    15,    16,    17,    18,    19,    20,    21,    22,
+      23,    24,     0,    26,    27,    28,    29,    30,     0,     0,
+      33,    34,     0,     0,     0,     0,     0,   125,     0,     0,
+       0,   126,     0,     0,     0,     0,     0,   127,     0,    66,
+       7,     8,     9,    10,    11,    12,    13,     0,    15,    16,
+      17,    18,    19,    20,    21,    22,    23,    24,     0,    26,
+      27,    28,    29,    30,     0,     0,    33,    34,     0,     0,
+       0,     0,   182,     0,     0,     0,     0,    35,     7,     8,
+       9,    10,    11,    12,    13,     0,    15,    16,    17,    18,
+      19,    20,    21,    22,    23,    24,     0,    26,    27,    28,
+      29,    30,     0,     0,    33,    34,     0,     0,     0,     0,
+       0,     0,     0,     0,     0,    35
 };
 
-#define yypact_value_is_default(yystate) \
-  ((yystate) == (-140))
+#define yypact_value_is_default(Yystate) \
+  (!!((Yystate) == (-92)))
 
-#define yytable_value_is_error(yytable_value) \
+#define yytable_value_is_error(Yytable_value) \
   YYID (0)
 
 static const yytype_int16 yycheck[] =
 {
-      60,    60,    80,     1,     3,     8,     1,    31,    39,    87,
-      65,    38,    57,    26,   153,    18,    38,     1,    38,   111,
-      38,   113,   161,    34,    27,    52,    46,    51,    31,     0,
-      52,    38,    77,   111,    52,   113,    34,    48,    93,    48,
-      38,    54,   102,    38,    42,    58,    34,    42,   126,   127,
-      48,    49,    50,    48,    38,    50,    87,    64,    57,    38,
-      48,    34,    69,    42,    48,    30,    50,    23,    33,    48,
-      48,    50,    38,    45,    53,    48,    42,   155,    77,    38,
-      38,    38,    48,    42,    50,    45,    46,   126,   127,    48,
-      48,    50,    50,   153,    46,    47,    45,    46,    52,    52,
-      38,   161,    45,    47,    51,    46,    35,    49,     1,    49,
-     170,   170,     5,     6,     7,     8,     9,    10,    11,    49,
-      13,    14,    15,    16,    17,    18,    19,    20,    21,    22,
-      49,    24,    25,    26,    27,    28,    53,    35,    31,    32,
-      45,    47,    45,    30,    45,    38,    37,    49,    31,    42,
-      49,    47,    45,    46,    49,    48,    35,    50,     1,    49,
-      53,    49,     5,     6,     7,     8,     9,    10,    11,    49,
-      13,    14,    15,    16,    17,    18,    19,    20,    21,    22,
-      49,    24,    25,    26,    27,    28,    49,    49,    31,    32,
-      49,     1,    96,    82,   155,    38,   109,    50,   114,    42,
-      -1,    -1,    45,    46,    -1,    48,    -1,    50,     1,    -1,
-      53,   105,     5,     6,     7,     8,     9,    10,    11,    12,
-      13,    14,    15,    16,    17,    18,    19,    20,    21,    22,
-      23,    24,    25,    26,    27,    28,    29,    30,    31,    32,
-      -1,    -1,    -1,    -1,    -1,    38,    -1,    -1,    -1,    42,
-      -1,    -1,    45,    -1,     1,    48,    -1,    50,     5,     6,
-       7,     8,     9,    10,    11,    -1,    13,    14,    15,    16,
-      17,    18,    19,    20,    21,    22,    -1,    24,    25,    26,
-      27,    28,    -1,    -1,    31,    32,    -1,    -1,    -1,    -1,
-      37,    -1,    -1,    -1,    -1,    42,    -1,    -1,    -1,     1,
-      -1,    -1,    49,     5,     6,     7,     8,     9,    10,    11,
-      -1,    13,    14,    15,    16,    17,    18,    19,    20,    21,
-      22,    -1,    24,    25,    26,    27,    28,    -1,    -1,    31,
-      32,    -1,    -1,    -1,    -1,    37,    -1,    -1,    -1,    -1,
-      42,    -1,    -1,    -1,     1,    -1,    -1,    49,     5,     6,
-       7,     8,     9,    10,    11,    -1,    13,    14,    15,    16,
-      17,    18,    19,    20,    21,    22,    -1,    24,    25,    26,
-      27,    28,    -1,    -1,    31,    32,    -1,    -1,    -1,    -1,
-      37,    -1,    -1,    -1,    -1,    42,    -1,    -1,    -1,    -1,
-      -1,    -1,    49,     5,     6,     7,     8,     9,    10,    11,
-      -1,    13,    14,    15,    16,    17,    18,    19,    20,    21,
-      22,    -1,    24,    25,    26,    27,    28,    -1,    -1,    31,
+      60,    60,    81,     3,     1,     8,    26,    39,    66,    88,
+      57,    38,    38,     1,    38,    18,    38,    38,     1,     0,
+      31,    42,    46,    30,    27,    52,    33,    48,    31,    50,
+      52,    78,    53,   112,    54,   114,    94,    34,    58,    65,
+      51,    38,    48,   103,    70,    42,    23,    38,   127,   128,
+      38,    48,    49,    50,    42,    38,    88,    57,    34,    42,
+      48,    52,    50,   154,    48,    48,    34,    50,    34,    38,
+      45,   162,    48,    42,   112,    38,   114,   156,    78,    48,
+      48,    50,    48,    38,    45,    46,    38,    42,    46,    47,
+      42,    45,    46,    48,   154,    50,    48,    38,    50,   127,
+     128,    52,   162,    52,    47,    45,    51,    46,    35,     1,
+      35,   171,   171,     5,     6,     7,     8,     9,    10,    11,
+      49,    13,    14,    15,    16,    17,    18,    19,    20,    21,
+      22,    49,    24,    25,    26,    27,    28,    49,    49,    31,
+      32,    45,    47,    53,    45,    30,    38,    45,    37,    49,
+      42,    47,    49,    45,    46,    49,    48,    31,    50,     1,
+      35,    53,    49,     5,     6,     7,     8,     9,    10,    11,
+      49,    13,    14,    15,    16,    17,    18,    19,    20,    21,
+      22,    49,    24,    25,    26,    27,    28,    49,    49,    31,
+      32,    49,    49,     1,    97,    83,    38,   156,   115,    50,
+      42,   110,    -1,    45,    46,    -1,    48,   106,    50,     1,
+      -1,    53,    -1,     5,     6,     7,     8,     9,    10,    11,
+      12,    13,    14,    15,    16,    17,    18,    19,    20,    21,
+      22,    23,    24,    25,    26,    27,    28,    29,    30,    31,
       32,    -1,    -1,    -1,    -1,    -1,    38,    -1,    -1,    -1,
-      42,    -1,    -1,    -1,    -1,    -1,    48,    -1,    50,     5,
+      42,    -1,    -1,    45,    -1,     1,    48,    -1,    50,     5,
        6,     7,     8,     9,    10,    11,    -1,    13,    14,    15,
       16,    17,    18,    19,    20,    21,    22,    -1,    24,    25,
       26,    27,    28,    -1,    -1,    31,    32,    -1,    -1,    -1,
-      -1,    37,    -1,    -1,    -1,    -1,    42,     5,     6,     7,
-       8,     9,    10,    11,    -1,    13,    14,    15,    16,    17,
-      18,    19,    20,    21,    22,    -1,    24,    25,    26,    27,
-      28,    -1,    -1,    31,    32,    -1,    -1,    -1,    -1,    -1,
-      -1,    -1,    -1,    -1,    42
+      -1,    37,    -1,    -1,    -1,    -1,    42,    -1,    -1,    -1,
+       1,    -1,    -1,    49,     5,     6,     7,     8,     9,    10,
+      11,    -1,    13,    14,    15,    16,    17,    18,    19,    20,
+      21,    22,    -1,    24,    25,    26,    27,    28,    -1,    -1,
+      31,    32,    -1,    -1,    -1,    -1,    37,    -1,    -1,    -1,
+      -1,    42,    -1,    -1,    -1,     1,    -1,    -1,    49,     5,
+       6,     7,     8,     9,    10,    11,    -1,    13,    14,    15,
+      16,    17,    18,    19,    20,    21,    22,    -1,    24,    25,
+      26,    27,    28,    -1,    -1,    31,    32,    -1,    -1,    -1,
+      -1,    37,    -1,    -1,    -1,    -1,    42,    -1,    -1,    -1,
+      -1,    -1,    -1,    49,     5,     6,     7,     8,     9,    10,
+      11,    -1,    13,    14,    15,    16,    17,    18,    19,    20,
+      21,    22,    -1,    24,    25,    26,    27,    28,    -1,    -1,
+      31,    32,    -1,    -1,    -1,    -1,    -1,    38,    -1,    -1,
+      -1,    42,    -1,    -1,    -1,    -1,    -1,    48,    -1,    50,
+       5,     6,     7,     8,     9,    10,    11,    -1,    13,    14,
+      15,    16,    17,    18,    19,    20,    21,    22,    -1,    24,
+      25,    26,    27,    28,    -1,    -1,    31,    32,    -1,    -1,
+      -1,    -1,    37,    -1,    -1,    -1,    -1,    42,     5,     6,
+       7,     8,     9,    10,    11,    -1,    13,    14,    15,    16,
+      17,    18,    19,    20,    21,    22,    -1,    24,    25,    26,
+      27,    28,    -1,    -1,    31,    32,    -1,    -1,    -1,    -1,
+      -1,    -1,    -1,    -1,    -1,    42
 };
 
 /* YYSTOS[STATE-NUM] -- The (internal number of the) accessing
@@ -853,19 +866,19 @@ static const yytype_uint8 yystos[] =
       28,    29,    30,    31,    32,    42,    58,    61,    65,    66,
       67,    68,    69,    70,    74,    85,   100,   102,    45,    46,
       38,    52,    97,    23,    38,    52,    88,    60,    38,    88,
-      48,    48,    45,    38,    48,    50,    62,    63,    64,    71,
-      75,    76,    67,    97,    38,    98,    99,    59,    88,     1,
-      65,    89,    90,    91,    61,    65,    88,    66,    82,    38,
-       1,    75,    72,    73,    74,    45,    47,    75,    30,    33,
-     101,    34,    48,    51,    46,    47,    61,    45,    46,    38,
-      42,    48,    53,    71,    77,    78,    92,    93,    94,    95,
-      46,     1,    91,    75,    38,    42,    48,    71,    83,    84,
-      49,    49,    49,    49,    74,    64,    96,     1,    79,    80,
-      81,    82,    35,    46,    99,    95,     1,    38,    77,    35,
-      77,    96,    34,    48,    45,    47,     1,    42,    83,    83,
-      34,    48,    45,    31,    51,    86,    87,    49,    49,    37,
-      47,    49,    49,     1,    79,    94,    49,    49,     1,    79,
-      35,    37,    82,    49,    49,    49,    49
+      48,    48,    45,    38,    42,    48,    50,    62,    63,    64,
+      71,    75,    76,    67,    97,    38,    98,    99,    59,    88,
+       1,    65,    89,    90,    91,    61,    65,    88,    66,    82,
+      38,     1,    75,    72,    73,    74,    45,    47,    75,    30,
+      33,   101,    34,    48,    51,    46,    47,    61,    45,    46,
+      38,    42,    48,    53,    71,    77,    78,    92,    93,    94,
+      95,    46,     1,    91,    75,    38,    42,    48,    71,    83,
+      84,    49,    49,    49,    49,    74,    64,    96,     1,    79,
+      80,    81,    82,    35,    46,    99,    95,     1,    38,    77,
+      35,    77,    96,    34,    48,    45,    47,     1,    42,    83,
+      83,    34,    48,    45,    31,    51,    86,    87,    49,    49,
+      37,    47,    49,    49,     1,    79,    94,    49,    49,     1,
+      79,    35,    37,    82,    49,    49,    49,    49
 };
 
 #define yyerrok                (yyerrstatus = 0)
@@ -912,46 +925,18 @@ do                                                              \
     }                                                          \
 while (YYID (0))
 
-
+/* Error token number */
 #define YYTERROR       1
 #define YYERRCODE      256
 
 
-/* YYLLOC_DEFAULT -- Set CURRENT to span from RHS[1] to RHS[N].
-   If N is 0, then set CURRENT to the empty location which ends
-   the previous symbol: RHS[0] (always defined).  */
-
-#define YYRHSLOC(Rhs, K) ((Rhs)[K])
-#ifndef YYLLOC_DEFAULT
-# define YYLLOC_DEFAULT(Current, Rhs, N)                               \
-    do                                                                 \
-      if (YYID (N))                                                    \
-       {                                                               \
-         (Current).first_line   = YYRHSLOC (Rhs, 1).first_line;        \
-         (Current).first_column = YYRHSLOC (Rhs, 1).first_column;      \
-         (Current).last_line    = YYRHSLOC (Rhs, N).last_line;         \
-         (Current).last_column  = YYRHSLOC (Rhs, N).last_column;       \
-       }                                                               \
-      else                                                             \
-       {                                                               \
-         (Current).first_line   = (Current).last_line   =              \
-           YYRHSLOC (Rhs, 0).last_line;                                \
-         (Current).first_column = (Current).last_column =              \
-           YYRHSLOC (Rhs, 0).last_column;                              \
-       }                                                               \
-    while (YYID (0))
-#endif
-
-
 /* This macro is provided for backward compatibility. */
-
 #ifndef YY_LOCATION_PRINT
 # define YY_LOCATION_PRINT(File, Loc) ((void) 0)
 #endif
 
 
 /* YYLEX -- calling `yylex' with the right arguments.  */
-
 #ifdef YYLEX_PARAM
 # define YYLEX yylex (YYLEX_PARAM)
 #else
@@ -1014,7 +999,7 @@ yy_symbol_value_print (yyoutput, yytype, yyvaluep)
   switch (yytype)
     {
       default:
-       break;
+        break;
     }
 }
 
@@ -1256,7 +1241,6 @@ yysyntax_error (YYSIZE_T *yymsg_alloc, char **yymsg,
 {
   YYSIZE_T yysize0 = yytnamerr (YY_NULL, yytname[yytoken]);
   YYSIZE_T yysize = yysize0;
-  YYSIZE_T yysize1;
   enum { YYERROR_VERBOSE_ARGS_MAXIMUM = 5 };
   /* Internationalized format string. */
   const char *yyformat = YY_NULL;
@@ -1319,11 +1303,13 @@ yysyntax_error (YYSIZE_T *yymsg_alloc, char **yymsg,
                     break;
                   }
                 yyarg[yycount++] = yytname[yyx];
-                yysize1 = yysize + yytnamerr (YY_NULL, yytname[yyx]);
-                if (! (yysize <= yysize1
-                       && yysize1 <= YYSTACK_ALLOC_MAXIMUM))
-                  return 2;
-                yysize = yysize1;
+                {
+                  YYSIZE_T yysize1 = yysize + yytnamerr (YY_NULL, yytname[yyx]);
+                  if (! (yysize <= yysize1
+                         && yysize1 <= YYSTACK_ALLOC_MAXIMUM))
+                    return 2;
+                  yysize = yysize1;
+                }
               }
         }
     }
@@ -1343,10 +1329,12 @@ yysyntax_error (YYSIZE_T *yymsg_alloc, char **yymsg,
 # undef YYCASE_
     }
 
-  yysize1 = yysize + yystrlen (yyformat);
-  if (! (yysize <= yysize1 && yysize1 <= YYSTACK_ALLOC_MAXIMUM))
-    return 2;
-  yysize = yysize1;
+  {
+    YYSIZE_T yysize1 = yysize + yystrlen (yyformat);
+    if (! (yysize <= yysize1 && yysize1 <= YYSTACK_ALLOC_MAXIMUM))
+      return 2;
+    yysize = yysize1;
+  }
 
   if (*yymsg_alloc < yysize)
     {
@@ -1406,32 +1394,27 @@ yydestruct (yymsg, yytype, yyvaluep)
     {
 
       default:
-       break;
+        break;
     }
 }
 
 
-/* Prevent warnings from -Wmissing-prototypes.  */
-#ifdef YYPARSE_PARAM
-#if defined __STDC__ || defined __cplusplus
-int yyparse (void *YYPARSE_PARAM);
-#else
-int yyparse ();
-#endif
-#else /* ! YYPARSE_PARAM */
-#if defined __STDC__ || defined __cplusplus
-int yyparse (void);
-#else
-int yyparse ();
-#endif
-#endif /* ! YYPARSE_PARAM */
 
 
 /* The lookahead symbol.  */
 int yychar;
 
+
+#ifndef YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN
+# define YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN
+# define YY_IGNORE_MAYBE_UNINITIALIZED_END
+#endif
+#ifndef YY_INITIAL_VALUE
+# define YY_INITIAL_VALUE(Value) /* Nothing. */
+#endif
+
 /* The semantic value of the lookahead symbol.  */
-YYSTYPE yylval;
+YYSTYPE yylval YY_INITIAL_VALUE(yyval_default);
 
 /* Number of syntax errors so far.  */
 int yynerrs;
@@ -1489,7 +1472,7 @@ yyparse ()
   int yyn;
   int yyresult;
   /* Lookahead token as an internal (translated) token number.  */
-  int yytoken;
+  int yytoken = 0;
   /* The variables used to return semantic value and location from the
      action routines.  */
   YYSTYPE yyval;
@@ -1507,9 +1490,8 @@ yyparse ()
      Keep to zero when no symbol should be popped.  */
   int yylen = 0;
 
-  yytoken = 0;
-  yyss = yyssa;
-  yyvs = yyvsa;
+  yyssp = yyss = yyssa;
+  yyvsp = yyvs = yyvsa;
   yystacksize = YYINITDEPTH;
 
   YYDPRINTF ((stderr, "Starting parse\n"));
@@ -1518,14 +1500,6 @@ yyparse ()
   yyerrstatus = 0;
   yynerrs = 0;
   yychar = YYEMPTY; /* Cause a token to be read.  */
-
-  /* Initialize stack pointers.
-     Waste one element of value and location stack
-     so that they stay on the same level as the state stack.
-     The wasted elements are never initialized.  */
-  yyssp = yyss;
-  yyvsp = yyvs;
-
   goto yysetstate;
 
 /*------------------------------------------------------------.
@@ -1666,7 +1640,9 @@ yybackup:
   yychar = YYEMPTY;
 
   yystate = yyn;
+  YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN
   *++yyvsp = yylval;
+  YY_IGNORE_MAYBE_UNINITIALIZED_END
 
   goto yynewstate;
 
@@ -1916,7 +1892,14 @@ yyreduce:
 
   case 69:
 
-    { (yyval) = (yyvsp[(4) - (4)]); }
+    { if (current_name != NULL) {
+                   error_with_pos("unexpected second declaration name");
+                   YYERROR;
+                 } else {
+                   current_name = (*(yyvsp[(1) - (1)]))->string;
+                   (yyval) = (yyvsp[(1) - (1)]);
+                 }
+               }
     break;
 
   case 70:
@@ -1926,12 +1909,12 @@ yyreduce:
 
   case 71:
 
-    { (yyval) = (yyvsp[(2) - (2)]); }
+    { (yyval) = (yyvsp[(4) - (4)]); }
     break;
 
   case 72:
 
-    { (yyval) = (yyvsp[(3) - (3)]); }
+    { (yyval) = (yyvsp[(2) - (2)]); }
     break;
 
   case 73:
@@ -1941,12 +1924,12 @@ yyreduce:
 
   case 74:
 
-    { (yyval) = (yyvsp[(2) - (2)]); }
+    { (yyval) = (yyvsp[(3) - (3)]); }
     break;
 
-  case 78:
+  case 75:
 
-    { (yyval) = (yyvsp[(4) - (4)]); }
+    { (yyval) = (yyvsp[(2) - (2)]); }
     break;
 
   case 79:
@@ -1956,12 +1939,12 @@ yyreduce:
 
   case 80:
 
-    { (yyval) = (yyvsp[(2) - (2)]); }
+    { (yyval) = (yyvsp[(4) - (4)]); }
     break;
 
   case 81:
 
-    { (yyval) = (yyvsp[(3) - (3)]); }
+    { (yyval) = (yyvsp[(2) - (2)]); }
     break;
 
   case 82:
@@ -1971,40 +1954,45 @@ yyreduce:
 
   case 83:
 
+    { (yyval) = (yyvsp[(3) - (3)]); }
+    break;
+
+  case 84:
+
     { (yyval) = (yyvsp[(2) - (2)]); }
     break;
 
-  case 85:
+  case 86:
 
     { (yyval) = (yyvsp[(3) - (3)]); }
     break;
 
-  case 86:
+  case 87:
 
     { (yyval) = NULL; }
     break;
 
-  case 89:
+  case 90:
 
     { (yyval) = (yyvsp[(3) - (3)]); }
     break;
 
-  case 90:
+  case 91:
 
     { (yyval) = (yyvsp[(2) - (2)]) ? (yyvsp[(2) - (2)]) : (yyvsp[(1) - (2)]); }
     break;
 
-  case 91:
+  case 92:
 
     { (yyval) = (yyvsp[(2) - (2)]) ? (yyvsp[(2) - (2)]) : (yyvsp[(1) - (2)]); }
     break;
 
-  case 93:
+  case 94:
 
     { (yyval) = NULL; }
     break;
 
-  case 94:
+  case 95:
 
     { /* For version 2 checksums, we don't want to remember
                     private parameter names.  */
@@ -2013,39 +2001,39 @@ yyreduce:
                }
     break;
 
-  case 95:
+  case 96:
 
     { remove_node((yyvsp[(1) - (1)]));
                  (yyval) = (yyvsp[(1) - (1)]);
                }
     break;
 
-  case 96:
+  case 97:
 
     { (yyval) = (yyvsp[(4) - (4)]); }
     break;
 
-  case 97:
+  case 98:
 
     { (yyval) = (yyvsp[(4) - (4)]); }
     break;
 
-  case 98:
+  case 99:
 
     { (yyval) = (yyvsp[(2) - (2)]); }
     break;
 
-  case 99:
+  case 100:
 
     { (yyval) = (yyvsp[(3) - (3)]); }
     break;
 
-  case 100:
+  case 101:
 
     { (yyval) = (yyvsp[(3) - (3)]); }
     break;
 
-  case 101:
+  case 102:
 
     { struct string_list *decl = *(yyvsp[(2) - (3)]);
                  *(yyvsp[(2) - (3)]) = NULL;
@@ -2054,87 +2042,87 @@ yyreduce:
                }
     break;
 
-  case 102:
+  case 103:
 
     { (yyval) = NULL; }
     break;
 
-  case 104:
+  case 105:
 
     { remove_list((yyvsp[(2) - (2)]), &(*(yyvsp[(1) - (2)]))->next); (yyval) = (yyvsp[(2) - (2)]); }
     break;
 
-  case 105:
+  case 106:
 
     { (yyval) = (yyvsp[(3) - (3)]); }
     break;
 
-  case 106:
+  case 107:
 
     { (yyval) = (yyvsp[(3) - (3)]); }
     break;
 
-  case 107:
+  case 108:
 
     { (yyval) = NULL; }
     break;
 
-  case 110:
+  case 111:
 
     { (yyval) = (yyvsp[(2) - (2)]); }
     break;
 
-  case 111:
+  case 112:
 
     { (yyval) = (yyvsp[(3) - (3)]); }
     break;
 
-  case 112:
+  case 113:
 
     { (yyval) = (yyvsp[(2) - (2)]); }
     break;
 
-  case 113:
+  case 114:
 
     { (yyval) = NULL; }
     break;
 
-  case 116:
+  case 117:
 
     { (yyval) = (yyvsp[(3) - (3)]); }
     break;
 
-  case 117:
+  case 118:
 
     { (yyval) = (yyvsp[(2) - (2)]) ? (yyvsp[(2) - (2)]) : (yyvsp[(1) - (2)]); }
     break;
 
-  case 118:
+  case 119:
 
     { (yyval) = (yyvsp[(2) - (2)]); }
     break;
 
-  case 120:
+  case 121:
 
     { (yyval) = (yyvsp[(2) - (2)]); }
     break;
 
-  case 121:
+  case 122:
 
     { (yyval) = NULL; }
     break;
 
-  case 123:
+  case 124:
 
     { (yyval) = (yyvsp[(3) - (3)]); }
     break;
 
-  case 124:
+  case 125:
 
     { (yyval) = (yyvsp[(4) - (4)]); }
     break;
 
-  case 127:
+  case 128:
 
     {
                        const char *name = strdup((*(yyvsp[(1) - (1)]))->string);
@@ -2142,7 +2130,7 @@ yyreduce:
                }
     break;
 
-  case 128:
+  case 129:
 
     {
                        const char *name = strdup((*(yyvsp[(1) - (3)]))->string);
@@ -2151,17 +2139,17 @@ yyreduce:
                }
     break;
 
-  case 129:
+  case 130:
 
     { (yyval) = (yyvsp[(2) - (2)]); }
     break;
 
-  case 130:
+  case 131:
 
     { (yyval) = NULL; }
     break;
 
-  case 132:
+  case 133:
 
     { export_symbol((*(yyvsp[(3) - (5)]))->string); (yyval) = (yyvsp[(5) - (5)]); }
     break;
@@ -2330,7 +2318,9 @@ yyerrlab1:
       YY_STACK_PRINT (yyss, yyssp);
     }
 
+  YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN
   *++yyvsp = yylval;
+  YY_IGNORE_MAYBE_UNINITIALIZED_END
 
 
   /* Shift the error token.  */
@@ -2404,4 +2394,3 @@ yyerror(const char *e)
 {
   error_with_pos("%s", e);
 }
-
index a4737dec45329c687ef6923fba9d6c2ee7d60805..4c00cef6d71dddd64999b6bef21f0c6e58f19756 100644 (file)
@@ -1,4 +1,4 @@
-/* A Bison parser, made by GNU Bison 2.5.1.  */
+/* A Bison parser, made by GNU Bison 2.7.  */
 
 /* Bison interface for Yacc-like parsers in C
    
    This special exception was added by the Free Software Foundation in
    version 2.2 of Bison.  */
 
+#ifndef YY_YY_SCRIPTS_GENKSYMS_PARSE_TAB_H_SHIPPED_INCLUDED
+# define YY_YY_SCRIPTS_GENKSYMS_PARSE_TAB_H_SHIPPED_INCLUDED
+/* Enabling traces.  */
+#ifndef YYDEBUG
+# define YYDEBUG 1
+#endif
+#if YYDEBUG
+extern int yydebug;
+#endif
 
 /* Tokens.  */
 #ifndef YYTOKENTYPE
@@ -83,7 +92,6 @@
 #endif
 
 
-
 #if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
 typedef int YYSTYPE;
 # define YYSTYPE_IS_TRIVIAL 1
@@ -93,4 +101,18 @@ typedef int YYSTYPE;
 
 extern YYSTYPE yylval;
 
+#ifdef YYPARSE_PARAM
+#if defined __STDC__ || defined __cplusplus
+int yyparse (void *YYPARSE_PARAM);
+#else
+int yyparse ();
+#endif
+#else /* ! YYPARSE_PARAM */
+#if defined __STDC__ || defined __cplusplus
+int yyparse (void);
+#else
+int yyparse ();
+#endif
+#endif /* ! YYPARSE_PARAM */
 
+#endif /* !YY_YY_SCRIPTS_GENKSYMS_PARSE_TAB_H_SHIPPED_INCLUDED  */
index b9f4cf202302d46d71e3899fc49f565371975c83..723ab30fe9d46951b6106c38bc81b1460c5df342 100644 (file)
@@ -303,6 +303,15 @@ direct_declarator:
                    $$ = $1;
                  }
                }
+       | TYPE
+               { if (current_name != NULL) {
+                   error_with_pos("unexpected second declaration name");
+                   YYERROR;
+                 } else {
+                   current_name = (*$1)->string;
+                   $$ = $1;
+                 }
+               }
        | direct_declarator '(' parameter_declaration_clause ')'
                { $$ = $4; }
        | direct_declarator '(' error ')'
index c814f57672fc03fb8720127d1527d17264454438..0b7dc2fd7bac0e2985a0b16c023b98cca23325e7 100644 (file)
@@ -268,8 +268,7 @@ int conf_read_simple(const char *name, int def)
                        goto load;
                sym_add_change_count(1);
                if (!sym_defconfig_list) {
-                       if (modules_sym)
-                               sym_calc_value(modules_sym);
+                       sym_calc_value(modules_sym);
                        return 1;
                }
 
@@ -404,9 +403,7 @@ setsym:
        }
        free(line);
        fclose(in);
-
-       if (modules_sym)
-               sym_calc_value(modules_sym);
+       sym_calc_value(modules_sym);
        return 0;
 }
 
index ec8e20350a648ca0f47f152f9650f6fa6e421a0b..0d883b37882a145b588a45f53f3be373849e7410 100755 (executable)
@@ -100,6 +100,10 @@ cat $INITFILE > $TMP_FILE
 # Merge files, printing warnings on overridden values
 for MERGE_FILE in $MERGE_LIST ; do
        echo "Merging $MERGE_FILE"
+       if [ ! -r "$MERGE_FILE" ]; then
+               echo "The merge file '$MERGE_FILE' does not exist.  Exit." >&2
+               exit 1
+       fi
        CFG_LIST=$(sed -n "$SED_CONFIG_EXP" $MERGE_FILE)
 
        for CFG in $CFG_LIST ; do
index 70c5ee189dce7c7d573c044117d3f63e4450cbcb..50878dc025a5746d51316c6f1bd4c1e1b8a41707 100644 (file)
@@ -467,8 +467,7 @@ void sym_clear_all_valid(void)
        for_all_symbols(i, sym)
                sym->flags &= ~SYMBOL_VALID;
        sym_add_change_count(1);
-       if (modules_sym)
-               sym_calc_value(modules_sym);
+       sym_calc_value(modules_sym);
 }
 
 bool sym_tristate_within_range(struct symbol *sym, tristate val)
index b6ac02d604f1fbcca8ecc04cf09ac038bcb81fa8..ac498f01b449f43476bd9a97586c359d94eaa9bb 100644 (file)
@@ -22,6 +22,7 @@ comment,      T_COMMENT,      TF_COMMAND
 config,                T_CONFIG,       TF_COMMAND
 menuconfig,    T_MENUCONFIG,   TF_COMMAND
 help,          T_HELP,         TF_COMMAND
+---help---,    T_HELP,         TF_COMMAND
 if,            T_IF,           TF_COMMAND|TF_PARAM
 endif,         T_ENDIF,        TF_COMMAND
 depends,       T_DEPENDS,      TF_COMMAND
index c77a8eff1ef21ef6d5a05d588d732a892f50d2cb..360a62df2b5e1c378fb9845e637f88dcde95fd69 100644 (file)
@@ -50,7 +50,7 @@ kconf_id_hash (register const char *str, register unsigned int len)
       73, 73, 73, 73, 73, 73, 73, 73, 73, 73,
       73, 73, 73, 73, 73, 73, 73, 73, 73, 73,
       73, 73, 73, 73, 73, 73, 73, 73, 73, 73,
-      73, 73, 73, 73, 73, 73, 73, 73, 73, 73,
+      73, 73, 73, 73, 73,  0, 73, 73, 73, 73,
       73, 73, 73, 73, 73, 73, 73, 73, 73, 73,
       73, 73, 73, 73, 73, 73, 73, 73, 73, 73,
       73, 73, 73, 73, 73, 73, 73, 73, 73, 73,
@@ -96,6 +96,7 @@ struct kconf_id_strings_t
     char kconf_id_strings_str7[sizeof("default")];
     char kconf_id_strings_str8[sizeof("tristate")];
     char kconf_id_strings_str9[sizeof("endchoice")];
+    char kconf_id_strings_str10[sizeof("---help---")];
     char kconf_id_strings_str12[sizeof("def_tristate")];
     char kconf_id_strings_str13[sizeof("def_bool")];
     char kconf_id_strings_str14[sizeof("defconfig_list")];
@@ -132,6 +133,7 @@ static const struct kconf_id_strings_t kconf_id_strings_contents =
     "default",
     "tristate",
     "endchoice",
+    "---help---",
     "def_tristate",
     "def_bool",
     "defconfig_list",
@@ -172,7 +174,7 @@ kconf_id_lookup (register const char *str, register unsigned int len)
 {
   enum
     {
-      TOTAL_KEYWORDS = 33,
+      TOTAL_KEYWORDS = 34,
       MIN_WORD_LENGTH = 2,
       MAX_WORD_LENGTH = 14,
       MIN_HASH_VALUE = 2,
@@ -182,34 +184,36 @@ kconf_id_lookup (register const char *str, register unsigned int len)
   static const struct kconf_id wordlist[] =
     {
       {-1}, {-1},
-#line 25 "scripts/kconfig/zconf.gperf"
+#line 26 "scripts/kconfig/zconf.gperf"
       {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str2,            T_IF,           TF_COMMAND|TF_PARAM},
-#line 36 "scripts/kconfig/zconf.gperf"
+#line 37 "scripts/kconfig/zconf.gperf"
       {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str3,            T_TYPE,         TF_COMMAND, S_INT},
       {-1},
-#line 26 "scripts/kconfig/zconf.gperf"
+#line 27 "scripts/kconfig/zconf.gperf"
       {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str5,            T_ENDIF,        TF_COMMAND},
       {-1},
-#line 29 "scripts/kconfig/zconf.gperf"
+#line 30 "scripts/kconfig/zconf.gperf"
       {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str7,    T_DEFAULT,      TF_COMMAND, S_UNKNOWN},
-#line 31 "scripts/kconfig/zconf.gperf"
+#line 32 "scripts/kconfig/zconf.gperf"
       {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str8,    T_TYPE,         TF_COMMAND, S_TRISTATE},
 #line 20 "scripts/kconfig/zconf.gperf"
       {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str9,    T_ENDCHOICE,    TF_COMMAND},
-      {-1}, {-1},
-#line 32 "scripts/kconfig/zconf.gperf"
+#line 25 "scripts/kconfig/zconf.gperf"
+      {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str10,   T_HELP,         TF_COMMAND},
+      {-1},
+#line 33 "scripts/kconfig/zconf.gperf"
       {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str12,   T_DEFAULT,      TF_COMMAND, S_TRISTATE},
-#line 35 "scripts/kconfig/zconf.gperf"
+#line 36 "scripts/kconfig/zconf.gperf"
       {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str13,   T_DEFAULT,      TF_COMMAND, S_BOOLEAN},
-#line 45 "scripts/kconfig/zconf.gperf"
+#line 46 "scripts/kconfig/zconf.gperf"
       {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str14,   T_OPT_DEFCONFIG_LIST,TF_OPTION},
       {-1}, {-1},
-#line 43 "scripts/kconfig/zconf.gperf"
+#line 44 "scripts/kconfig/zconf.gperf"
       {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str17,           T_ON,           TF_PARAM},
-#line 28 "scripts/kconfig/zconf.gperf"
+#line 29 "scripts/kconfig/zconf.gperf"
       {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str18,   T_OPTIONAL,     TF_COMMAND},
       {-1}, {-1},
-#line 42 "scripts/kconfig/zconf.gperf"
+#line 43 "scripts/kconfig/zconf.gperf"
       {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str21,           T_OPTION,       TF_COMMAND},
 #line 17 "scripts/kconfig/zconf.gperf"
       {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str22,   T_ENDMENU,      TF_COMMAND},
@@ -219,51 +223,51 @@ kconf_id_lookup (register const char *str, register unsigned int len)
 #line 23 "scripts/kconfig/zconf.gperf"
       {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str25,   T_MENUCONFIG,   TF_COMMAND},
       {-1},
-#line 44 "scripts/kconfig/zconf.gperf"
+#line 45 "scripts/kconfig/zconf.gperf"
       {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str27,   T_OPT_MODULES,  TF_OPTION},
-#line 47 "scripts/kconfig/zconf.gperf"
+#line 48 "scripts/kconfig/zconf.gperf"
       {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str28,   T_OPT_ALLNOCONFIG_Y,TF_OPTION},
 #line 16 "scripts/kconfig/zconf.gperf"
       {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str29,           T_MENU,         TF_COMMAND},
       {-1},
-#line 39 "scripts/kconfig/zconf.gperf"
+#line 40 "scripts/kconfig/zconf.gperf"
       {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str31,           T_SELECT,       TF_COMMAND},
 #line 21 "scripts/kconfig/zconf.gperf"
       {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str32,   T_COMMENT,      TF_COMMAND},
-#line 46 "scripts/kconfig/zconf.gperf"
+#line 47 "scripts/kconfig/zconf.gperf"
       {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str33,           T_OPT_ENV,      TF_OPTION},
       {-1},
-#line 40 "scripts/kconfig/zconf.gperf"
+#line 41 "scripts/kconfig/zconf.gperf"
       {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str35,           T_RANGE,        TF_COMMAND},
 #line 19 "scripts/kconfig/zconf.gperf"
       {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str36,           T_CHOICE,       TF_COMMAND},
       {-1}, {-1},
-#line 33 "scripts/kconfig/zconf.gperf"
+#line 34 "scripts/kconfig/zconf.gperf"
       {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str39,           T_TYPE,         TF_COMMAND, S_BOOLEAN},
       {-1},
 #line 18 "scripts/kconfig/zconf.gperf"
       {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str41,           T_SOURCE,       TF_COMMAND},
-#line 41 "scripts/kconfig/zconf.gperf"
+#line 42 "scripts/kconfig/zconf.gperf"
       {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str42,   T_VISIBLE,      TF_COMMAND},
-#line 37 "scripts/kconfig/zconf.gperf"
+#line 38 "scripts/kconfig/zconf.gperf"
       {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str43,           T_TYPE,         TF_COMMAND, S_HEX},
       {-1}, {-1},
 #line 22 "scripts/kconfig/zconf.gperf"
       {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str46,           T_CONFIG,       TF_COMMAND},
-#line 34 "scripts/kconfig/zconf.gperf"
+#line 35 "scripts/kconfig/zconf.gperf"
       {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str47,   T_TYPE,         TF_COMMAND, S_BOOLEAN},
       {-1}, {-1}, {-1},
-#line 38 "scripts/kconfig/zconf.gperf"
+#line 39 "scripts/kconfig/zconf.gperf"
       {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str51,           T_TYPE,         TF_COMMAND, S_STRING},
       {-1}, {-1},
 #line 24 "scripts/kconfig/zconf.gperf"
       {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str54,           T_HELP,         TF_COMMAND},
       {-1},
-#line 30 "scripts/kconfig/zconf.gperf"
+#line 31 "scripts/kconfig/zconf.gperf"
       {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str56,           T_PROMPT,       TF_COMMAND},
       {-1}, {-1}, {-1}, {-1}, {-1}, {-1}, {-1}, {-1}, {-1},
       {-1}, {-1}, {-1}, {-1}, {-1}, {-1},
-#line 27 "scripts/kconfig/zconf.gperf"
+#line 28 "scripts/kconfig/zconf.gperf"
       {(int)(long)&((struct kconf_id_strings_t *)0)->kconf_id_strings_str72,   T_DEPENDS,      TF_COMMAND}
     };
 
@@ -285,5 +289,5 @@ kconf_id_lookup (register const char *str, register unsigned int len)
     }
   return 0;
 }
-#line 48 "scripts/kconfig/zconf.gperf"
+#line 49 "scripts/kconfig/zconf.gperf"
 
index 200a3fe3009153bba22742e862f6114f9e7f5fc8..c410d257da0602dc9b4de92dac457e237c097399 100644 (file)
@@ -66,9 +66,16 @@ static void alloc_string(const char *str, int size)
        memcpy(text, str, size);
        text[size] = 0;
 }
+
+static void warn_ignored_character(char chr)
+{
+       fprintf(stderr,
+               "%s:%d:warning: ignoring unsupported character '%c'\n",
+               zconf_curname(), zconf_lineno(), chr);
+}
 %}
 
-n      [A-Za-z0-9_]
+n      [A-Za-z0-9_-]
 
 %%
        int str = 0;
@@ -106,7 +113,7 @@ n   [A-Za-z0-9_]
                zconflval.string = text;
                return T_WORD;
        }
-       .
+       .       warn_ignored_character(*yytext);
        \n      {
                BEGIN(INITIAL);
                current_file->lineno++;
@@ -132,8 +139,7 @@ n   [A-Za-z0-9_]
                BEGIN(STRING);
        }
        \n      BEGIN(INITIAL); current_file->lineno++; return T_EOL;
-       ---     /* ignore */
-       ({n}|[-/.])+    {
+       ({n}|[/.])+     {
                const struct kconf_id *id = kconf_id_lookup(yytext, yyleng);
                if (id && id->flags & TF_PARAM) {
                        zconflval.id = id;
@@ -146,11 +152,7 @@ n  [A-Za-z0-9_]
        #.*     /* comment */
        \\\n    current_file->lineno++;
        [[:blank:]]+
-       .       {
-               fprintf(stderr,
-                       "%s:%d:warning: ignoring unsupported character '%c'\n",
-                       zconf_curname(), zconf_lineno(), *yytext);
-       }
+       .       warn_ignored_character(*yytext);
        <<EOF>> {
                BEGIN(INITIAL);
        }
index dd4e86c825210775cd9282eae71f489f80adec07..37fdf612350586d5a670fc7054aa06c1111d8a8e 100644 (file)
@@ -72,7 +72,6 @@ typedef int flex_int32_t;
 typedef unsigned char flex_uint8_t; 
 typedef unsigned short int flex_uint16_t;
 typedef unsigned int flex_uint32_t;
-#endif /* ! C99 */
 
 /* Limits of integral types. */
 #ifndef INT8_MIN
@@ -103,6 +102,8 @@ typedef unsigned int flex_uint32_t;
 #define UINT32_MAX             (4294967295U)
 #endif
 
+#endif /* ! C99 */
+
 #endif /* ! FLEXINT_H */
 
 #ifdef __cplusplus
@@ -159,7 +160,15 @@ typedef unsigned int flex_uint32_t;
 
 /* Size of default input buffer. */
 #ifndef YY_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k.
+ * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case.
+ * Ditto for the __ia64__ case accordingly.
+ */
+#define YY_BUF_SIZE 32768
+#else
 #define YY_BUF_SIZE 16384
+#endif /* __ia64__ */
 #endif
 
 /* The state buf must be large enough to hold one state per character in the main buffer.
@@ -365,354 +374,338 @@ int zconflineno = 1;
 
 extern char *zconftext;
 #define yytext_ptr zconftext
-static yyconst flex_int16_t yy_nxt[][19] =
+static yyconst flex_int16_t yy_nxt[][18] =
     {
     {
         0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
-        0,    0,    0,    0,    0,    0,    0,    0,    0
+        0,    0,    0,    0,    0,    0,    0,    0
     },
 
     {
        11,   12,   13,   14,   12,   12,   15,   12,   12,   12,
-       12,   12,   12,   12,   12,   12,   12,   12,   12
+       12,   12,   12,   12,   12,   12,   12,   12
     },
 
     {
        11,   12,   13,   14,   12,   12,   15,   12,   12,   12,
-       12,   12,   12,   12,   12,   12,   12,   12,   12
+       12,   12,   12,   12,   12,   12,   12,   12
     },
 
     {
        11,   16,   16,   17,   16,   16,   16,   16,   16,   16,
-       16,   16,   16,   18,   16,   16,   16,   16,   16
+       16,   18,   16,   16,   16,   16,   16,   16
     },
 
     {
        11,   16,   16,   17,   16,   16,   16,   16,   16,   16,
-       16,   16,   16,   18,   16,   16,   16,   16,   16
+       16,   18,   16,   16,   16,   16,   16,   16
 
     },
 
     {
        11,   19,   20,   21,   19,   19,   19,   19,   19,   19,
-       19,   19,   19,   19,   19,   19,   19,   19,   19
+       19,   19,   19,   19,   19,   19,   19,   19
     },
 
     {
        11,   19,   20,   21,   19,   19,   19,   19,   19,   19,
-       19,   19,   19,   19,   19,   19,   19,   19,   19
+       19,   19,   19,   19,   19,   19,   19,   19
     },
 
     {
        11,   22,   22,   23,   22,   24,   22,   22,   24,   22,
-       22,   22,   22,   22,   22,   22,   22,   25,   22
+       22,   22,   22,   22,   22,   22,   25,   22
     },
 
     {
        11,   22,   22,   23,   22,   24,   22,   22,   24,   22,
-       22,   22,   22,   22,   22,   22,   22,   25,   22
+       22,   22,   22,   22,   22,   22,   25,   22
     },
 
     {
        11,   26,   27,   28,   29,   30,   31,   32,   30,   33,
-       34,   35,   36,   36,   37,   38,   39,   40,   41
+       34,   35,   35,   36,   37,   38,   39,   40
 
     },
 
     {
        11,   26,   27,   28,   29,   30,   31,   32,   30,   33,
-       34,   35,   36,   36,   37,   38,   39,   40,   41
+       34,   35,   35,   36,   37,   38,   39,   40
     },
 
     {
       -11,  -11,  -11,  -11,  -11,  -11,  -11,  -11,  -11,  -11,
-      -11,  -11,  -11,  -11,  -11,  -11,  -11,  -11,  -11
+      -11,  -11,  -11,  -11,  -11,  -11,  -11,  -11
     },
 
     {
        11,  -12,  -12,  -12,  -12,  -12,  -12,  -12,  -12,  -12,
-      -12,  -12,  -12,  -12,  -12,  -12,  -12,  -12,  -12
+      -12,  -12,  -12,  -12,  -12,  -12,  -12,  -12
     },
 
     {
-       11,  -13,   42,   43,  -13,  -13,   44,  -13,  -13,  -13,
-      -13,  -13,  -13,  -13,  -13,  -13,  -13,  -13,  -13
+       11,  -13,   41,   42,  -13,  -13,   43,  -13,  -13,  -13,
+      -13,  -13,  -13,  -13,  -13,  -13,  -13,  -13
     },
 
     {
        11,  -14,  -14,  -14,  -14,  -14,  -14,  -14,  -14,  -14,
-      -14,  -14,  -14,  -14,  -14,  -14,  -14,  -14,  -14
+      -14,  -14,  -14,  -14,  -14,  -14,  -14,  -14
 
     },
 
     {
-       11,   45,   45,   46,   45,   45,   45,   45,   45,   45,
-       45,   45,   45,   45,   45,   45,   45,   45,   45
+       11,   44,   44,   45,   44,   44,   44,   44,   44,   44,
+       44,   44,   44,   44,   44,   44,   44,   44
     },
 
     {
        11,  -16,  -16,  -16,  -16,  -16,  -16,  -16,  -16,  -16,
-      -16,  -16,  -16,  -16,  -16,  -16,  -16,  -16,  -16
+      -16,  -16,  -16,  -16,  -16,  -16,  -16,  -16
     },
 
     {
        11,  -17,  -17,  -17,  -17,  -17,  -17,  -17,  -17,  -17,
-      -17,  -17,  -17,  -17,  -17,  -17,  -17,  -17,  -17
+      -17,  -17,  -17,  -17,  -17,  -17,  -17,  -17
     },
 
     {
        11,  -18,  -18,  -18,  -18,  -18,  -18,  -18,  -18,  -18,
-      -18,  -18,  -18,   47,  -18,  -18,  -18,  -18,  -18
+      -18,   46,  -18,  -18,  -18,  -18,  -18,  -18
     },
 
     {
-       11,   48,   48,  -19,   48,   48,   48,   48,   48,   48,
-       48,   48,   48,   48,   48,   48,   48,   48,   48
+       11,   47,   47,  -19,   47,   47,   47,   47,   47,   47,
+       47,   47,   47,   47,   47,   47,   47,   47
 
     },
 
     {
-       11,  -20,   49,   50,  -20,  -20,  -20,  -20,  -20,  -20,
-      -20,  -20,  -20,  -20,  -20,  -20,  -20,  -20,  -20
+       11,  -20,   48,   49,  -20,  -20,  -20,  -20,  -20,  -20,
+      -20,  -20,  -20,  -20,  -20,  -20,  -20,  -20
     },
 
     {
-       11,   51,  -21,  -21,   51,   51,   51,   51,   51,   51,
-       51,   51,   51,   51,   51,   51,   51,   51,   51
+       11,   50,  -21,  -21,   50,   50,   50,   50,   50,   50,
+       50,   50,   50,   50,   50,   50,   50,   50
     },
 
     {
-       11,   52,   52,   53,   52,  -22,   52,   52,  -22,   52,
-       52,   52,   52,   52,   52,   52,   52,  -22,   52
+       11,   51,   51,   52,   51,  -22,   51,   51,  -22,   51,
+       51,   51,   51,   51,   51,   51,  -22,   51
     },
 
     {
        11,  -23,  -23,  -23,  -23,  -23,  -23,  -23,  -23,  -23,
-      -23,  -23,  -23,  -23,  -23,  -23,  -23,  -23,  -23
+      -23,  -23,  -23,  -23,  -23,  -23,  -23,  -23
     },
 
     {
        11,  -24,  -24,  -24,  -24,  -24,  -24,  -24,  -24,  -24,
-      -24,  -24,  -24,  -24,  -24,  -24,  -24,  -24,  -24
+      -24,  -24,  -24,  -24,  -24,  -24,  -24,  -24
 
     },
 
     {
-       11,   54,   54,   55,   54,   54,   54,   54,   54,   54,
-       54,   54,   54,   54,   54,   54,   54,   54,   54
+       11,   53,   53,   54,   53,   53,   53,   53,   53,   53,
+       53,   53,   53,   53,   53,   53,   53,   53
     },
 
     {
        11,  -26,  -26,  -26,  -26,  -26,  -26,  -26,  -26,  -26,
-      -26,  -26,  -26,  -26,  -26,  -26,  -26,  -26,  -26
+      -26,  -26,  -26,  -26,  -26,  -26,  -26,  -26
     },
 
     {
-       11,  -27,   56,  -27,  -27,  -27,  -27,  -27,  -27,  -27,
-      -27,  -27,  -27,  -27,  -27,  -27,  -27,  -27,  -27
+       11,  -27,   55,  -27,  -27,  -27,  -27,  -27,  -27,  -27,
+      -27,  -27,  -27,  -27,  -27,  -27,  -27,  -27
     },
 
     {
        11,  -28,  -28,  -28,  -28,  -28,  -28,  -28,  -28,  -28,
-      -28,  -28,  -28,  -28,  -28,  -28,  -28,  -28,  -28
+      -28,  -28,  -28,  -28,  -28,  -28,  -28,  -28
     },
 
     {
        11,  -29,  -29,  -29,  -29,  -29,  -29,  -29,  -29,  -29,
-      -29,  -29,  -29,  -29,  -29,   57,  -29,  -29,  -29
+      -29,  -29,  -29,  -29,   56,  -29,  -29,  -29
 
     },
 
     {
        11,  -30,  -30,  -30,  -30,  -30,  -30,  -30,  -30,  -30,
-      -30,  -30,  -30,  -30,  -30,  -30,  -30,  -30,  -30
+      -30,  -30,  -30,  -30,  -30,  -30,  -30,  -30
     },
 
     {
-       11,   58,   58,  -31,   58,   58,   58,   58,   58,   58,
-       58,   58,   58,   58,   58,   58,   58,   58,   58
+       11,   57,   57,  -31,   57,   57,   57,   57,   57,   57,
+       57,   57,   57,   57,   57,   57,   57,   57
     },
 
     {
-       11,  -32,  -32,  -32,  -32,  -32,  -32,   59,  -32,  -32,
-      -32,  -32,  -32,  -32,  -32,  -32,  -32,  -32,  -32
+       11,  -32,  -32,  -32,  -32,  -32,  -32,   58,  -32,  -32,
+      -32,  -32,  -32,  -32,  -32,  -32,  -32,  -32
     },
 
     {
        11,  -33,  -33,  -33,  -33,  -33,  -33,  -33,  -33,  -33,
-      -33,  -33,  -33,  -33,  -33,  -33,  -33,  -33,  -33
+      -33,  -33,  -33,  -33,  -33,  -33,  -33,  -33
     },
 
     {
        11,  -34,  -34,  -34,  -34,  -34,  -34,  -34,  -34,  -34,
-      -34,  -34,  -34,  -34,  -34,  -34,  -34,  -34,  -34
+      -34,  -34,  -34,  -34,  -34,  -34,  -34,  -34
 
     },
 
     {
        11,  -35,  -35,  -35,  -35,  -35,  -35,  -35,  -35,  -35,
-      -35,   60,   61,   61,  -35,  -35,  -35,  -35,  -35
+      -35,   59,   59,  -35,  -35,  -35,  -35,  -35
     },
 
     {
        11,  -36,  -36,  -36,  -36,  -36,  -36,  -36,  -36,  -36,
-      -36,   61,   61,   61,  -36,  -36,  -36,  -36,  -36
+      -36,  -36,  -36,  -36,   60,  -36,  -36,  -36
     },
 
     {
        11,  -37,  -37,  -37,  -37,  -37,  -37,  -37,  -37,  -37,
-      -37,  -37,  -37,  -37,  -37,   62,  -37,  -37,  -37
+      -37,  -37,  -37,  -37,  -37,  -37,  -37,  -37
     },
 
     {
        11,  -38,  -38,  -38,  -38,  -38,  -38,  -38,  -38,  -38,
-      -38,  -38,  -38,  -38,  -38,  -38,  -38,  -38,  -38
+      -38,  -38,  -38,  -38,   61,  -38,  -38,  -38
     },
 
     {
-       11,  -39,  -39,  -39,  -39,  -39,  -39,  -39,  -39,  -39,
-      -39,  -39,  -39,  -39,  -39,   63,  -39,  -39,  -39
+       11,  -39,  -39,   62,  -39,  -39,  -39,  -39,  -39,  -39,
+      -39,  -39,  -39,  -39,  -39,  -39,  -39,  -39
 
     },
 
     {
-       11,  -40,  -40,   64,  -40,  -40,  -40,  -40,  -40,  -40,
-      -40,  -40,  -40,  -40,  -40,  -40,  -40,  -40,  -40
+       11,  -40,  -40,  -40,  -40,  -40,  -40,  -40,  -40,  -40,
+      -40,  -40,  -40,  -40,  -40,  -40,  -40,   63
     },
 
     {
-       11,  -41,  -41,  -41,  -41,  -41,  -41,  -41,  -41,  -41,
-      -41,  -41,  -41,  -41,  -41,  -41,  -41,  -41,   65
+       11,  -41,   41,   42,  -41,  -41,   43,  -41,  -41,  -41,
+      -41,  -41,  -41,  -41,  -41,  -41,  -41,  -41
     },
 
     {
-       11,  -42,   42,   43,  -42,  -42,   44,  -42,  -42,  -42,
-      -42,  -42,  -42,  -42,  -42,  -42,  -42,  -42,  -42
+       11,  -42,  -42,  -42,  -42,  -42,  -42,  -42,  -42,  -42,
+      -42,  -42,  -42,  -42,  -42,  -42,  -42,  -42
     },
 
     {
-       11,  -43,  -43,  -43,  -43,  -43,  -43,  -43,  -43,  -43,
-      -43,  -43,  -43,  -43,  -43,  -43,  -43,  -43,  -43
+       11,   44,   44,   45,   44,   44,   44,   44,   44,   44,
+       44,   44,   44,   44,   44,   44,   44,   44
     },
 
     {
-       11,   45,   45,   46,   45,   45,   45,   45,   45,   45,
-       45,   45,   45,   45,   45,   45,   45,   45,   45
+       11,   44,   44,   45,   44,   44,   44,   44,   44,   44,
+       44,   44,   44,   44,   44,   44,   44,   44
 
     },
 
     {
-       11,   45,   45,   46,   45,   45,   45,   45,   45,   45,
-       45,   45,   45,   45,   45,   45,   45,   45,   45
+       11,  -45,  -45,  -45,  -45,  -45,  -45,  -45,  -45,  -45,
+      -45,  -45,  -45,  -45,  -45,  -45,  -45,  -45
     },
 
     {
        11,  -46,  -46,  -46,  -46,  -46,  -46,  -46,  -46,  -46,
-      -46,  -46,  -46,  -46,  -46,  -46,  -46,  -46,  -46
+      -46,   46,  -46,  -46,  -46,  -46,  -46,  -46
     },
 
     {
-       11,  -47,  -47,  -47,  -47,  -47,  -47,  -47,  -47,  -47,
-      -47,  -47,  -47,   47,  -47,  -47,  -47,  -47,  -47
+       11,   47,   47,  -47,   47,   47,   47,   47,   47,   47,
+       47,   47,   47,   47,   47,   47,   47,   47
     },
 
     {
-       11,   48,   48,  -48,   48,   48,   48,   48,   48,   48,
-       48,   48,   48,   48,   48,   48,   48,   48,   48
+       11,  -48,   48,   49,  -48,  -48,  -48,  -48,  -48,  -48,
+      -48,  -48,  -48,  -48,  -48,  -48,  -48,  -48
     },
 
     {
-       11,  -49,   49,   50,  -49,  -49,  -49,  -49,  -49,  -49,
-      -49,  -49,  -49,  -49,  -49,  -49,  -49,  -49,  -49
+       11,   50,  -49,  -49,   50,   50,   50,   50,   50,   50,
+       50,   50,   50,   50,   50,   50,   50,   50
 
     },
 
     {
-       11,   51,  -50,  -50,   51,   51,   51,   51,   51,   51,
-       51,   51,   51,   51,   51,   51,   51,   51,   51
+       11,  -50,  -50,  -50,  -50,  -50,  -50,  -50,  -50,  -50,
+      -50,  -50,  -50,  -50,  -50,  -50,  -50,  -50
     },
 
     {
-       11,  -51,  -51,  -51,  -51,  -51,  -51,  -51,  -51,  -51,
-      -51,  -51,  -51,  -51,  -51,  -51,  -51,  -51,  -51
+       11,   51,   51,   52,   51,  -51,   51,   51,  -51,   51,
+       51,   51,   51,   51,   51,   51,  -51,   51
     },
 
     {
-       11,   52,   52,   53,   52,  -52,   52,   52,  -52,   52,
-       52,   52,   52,   52,   52,   52,   52,  -52,   52
+       11,  -52,  -52,  -52,  -52,  -52,  -52,  -52,  -52,  -52,
+      -52,  -52,  -52,  -52,  -52,  -52,  -52,  -52
     },
 
     {
-       11,  -53,  -53,  -53,  -53,  -53,  -53,  -53,  -53,  -53,
-      -53,  -53,  -53,  -53,  -53,  -53,  -53,  -53,  -53
+       11,  -53,  -53,   54,  -53,  -53,  -53,  -53,  -53,  -53,
+      -53,  -53,  -53,  -53,  -53,  -53,  -53,  -53
     },
 
     {
-       11,  -54,  -54,   55,  -54,  -54,  -54,  -54,  -54,  -54,
-      -54,  -54,  -54,  -54,  -54,  -54,  -54,  -54,  -54
+       11,  -54,  -54,  -54,  -54,  -54,  -54,  -54,  -54,  -54,
+      -54,  -54,  -54,  -54,  -54,  -54,  -54,  -54
 
     },
 
     {
-       11,  -55,  -55,  -55,  -55,  -55,  -55,  -55,  -55,  -55,
-      -55,  -55,  -55,  -55,  -55,  -55,  -55,  -55,  -55
+       11,  -55,   55,  -55,  -55,  -55,  -55,  -55,  -55,  -55,
+      -55,  -55,  -55,  -55,  -55,  -55,  -55,  -55
     },
 
     {
-       11,  -56,   56,  -56,  -56,  -56,  -56,  -56,  -56,  -56,
-      -56,  -56,  -56,  -56,  -56,  -56,  -56,  -56,  -56
+       11,  -56,  -56,  -56,  -56,  -56,  -56,  -56,  -56,  -56,
+      -56,  -56,  -56,  -56,  -56,  -56,  -56,  -56
     },
 
     {
-       11,  -57,  -57,  -57,  -57,  -57,  -57,  -57,  -57,  -57,
-      -57,  -57,  -57,  -57,  -57,  -57,  -57,  -57,  -57
+       11,   57,   57,  -57,   57,   57,   57,   57,   57,   57,
+       57,   57,   57,   57,   57,   57,   57,   57
     },
 
     {
-       11,   58,   58,  -58,   58,   58,   58,   58,   58,   58,
-       58,   58,   58,   58,   58,   58,   58,   58,   58
+       11,  -58,  -58,  -58,  -58,  -58,  -58,  -58,  -58,  -58,
+      -58,  -58,  -58,  -58,  -58,  -58,  -58,  -58
     },
 
     {
        11,  -59,  -59,  -59,  -59,  -59,  -59,  -59,  -59,  -59,
-      -59,  -59,  -59,  -59,  -59,  -59,  -59,  -59,  -59
+      -59,   59,   59,  -59,  -59,  -59,  -59,  -59
 
     },
 
     {
        11,  -60,  -60,  -60,  -60,  -60,  -60,  -60,  -60,  -60,
-      -60,   66,   61,   61,  -60,  -60,  -60,  -60,  -60
+      -60,  -60,  -60,  -60,  -60,  -60,  -60,  -60
     },
 
     {
        11,  -61,  -61,  -61,  -61,  -61,  -61,  -61,  -61,  -61,
-      -61,   61,   61,   61,  -61,  -61,  -61,  -61,  -61
+      -61,  -61,  -61,  -61,  -61,  -61,  -61,  -61
     },
 
     {
        11,  -62,  -62,  -62,  -62,  -62,  -62,  -62,  -62,  -62,
-      -62,  -62,  -62,  -62,  -62,  -62,  -62,  -62,  -62
+      -62,  -62,  -62,  -62,  -62,  -62,  -62,  -62
     },
 
     {
        11,  -63,  -63,  -63,  -63,  -63,  -63,  -63,  -63,  -63,
-      -63,  -63,  -63,  -63,  -63,  -63,  -63,  -63,  -63
-    },
-
-    {
-       11,  -64,  -64,  -64,  -64,  -64,  -64,  -64,  -64,  -64,
-      -64,  -64,  -64,  -64,  -64,  -64,  -64,  -64,  -64
-
-    },
-
-    {
-       11,  -65,  -65,  -65,  -65,  -65,  -65,  -65,  -65,  -65,
-      -65,  -65,  -65,  -65,  -65,  -65,  -65,  -65,  -65
-    },
-
-    {
-       11,  -66,  -66,  -66,  -66,  -66,  -66,  -66,  -66,  -66,
-      -66,   61,   61,   61,  -66,  -66,  -66,  -66,  -66
+      -63,  -63,  -63,  -63,  -63,  -63,  -63,  -63
     },
 
     } ;
@@ -732,8 +725,8 @@ static void yy_fatal_error (yyconst char msg[]  );
        *yy_cp = '\0'; \
        (yy_c_buf_p) = yy_cp;
 
-#define YY_NUM_RULES 38
-#define YY_END_OF_BUFFER 39
+#define YY_NUM_RULES 37
+#define YY_END_OF_BUFFER 38
 /* This struct is not used in this scanner,
    but its presence is necessary. */
 struct yy_trans_info
@@ -741,15 +734,15 @@ struct yy_trans_info
        flex_int32_t yy_verify;
        flex_int32_t yy_nxt;
        };
-static yyconst flex_int16_t yy_accept[67] =
+static yyconst flex_int16_t yy_accept[64] =
     {   0,
         0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
-       39,    5,    4,    2,    3,    7,    8,    6,   37,   34,
-       36,   29,   33,   32,   31,   27,   26,   21,   13,   20,
-       24,   27,   11,   12,   23,   23,   18,   14,   19,   27,
-       27,    4,    2,    3,    3,    1,    6,   37,   34,   36,
-       35,   29,   28,   31,   30,   26,   15,   24,    9,   23,
-       23,   16,   17,   25,   10,   22
+       38,    5,    4,    2,    3,    7,    8,    6,   36,   33,
+       35,   28,   32,   31,   30,   26,   25,   21,   13,   20,
+       23,   26,   11,   12,   22,   18,   14,   19,   26,   26,
+        4,    2,    3,    3,    1,    6,   36,   33,   35,   34,
+       28,   27,   30,   29,   25,   15,   23,    9,   22,   16,
+       17,   24,   10
     } ;
 
 static yyconst flex_int32_t yy_ec[256] =
@@ -758,16 +751,16 @@ static yyconst flex_int32_t yy_ec[256] =
         1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
         1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
         1,    2,    4,    5,    6,    1,    1,    7,    8,    9,
-       10,    1,    1,    1,   11,   12,   12,   13,   13,   13,
-       13,   13,   13,   13,   13,   13,   13,    1,    1,   14,
-       15,   16,    1,    1,   13,   13,   13,   13,   13,   13,
-       13,   13,   13,   13,   13,   13,   13,   13,   13,   13,
-       13,   13,   13,   13,   13,   13,   13,   13,   13,   13,
-        1,   17,    1,    1,   13,    1,   13,   13,   13,   13,
-
-       13,   13,   13,   13,   13,   13,   13,   13,   13,   13,
-       13,   13,   13,   13,   13,   13,   13,   13,   13,   13,
-       13,   13,    1,   18,    1,    1,    1,    1,    1,    1,
+       10,    1,    1,    1,   11,   12,   12,   11,   11,   11,
+       11,   11,   11,   11,   11,   11,   11,    1,    1,   13,
+       14,   15,    1,    1,   11,   11,   11,   11,   11,   11,
+       11,   11,   11,   11,   11,   11,   11,   11,   11,   11,
+       11,   11,   11,   11,   11,   11,   11,   11,   11,   11,
+        1,   16,    1,    1,   11,    1,   11,   11,   11,   11,
+
+       11,   11,   11,   11,   11,   11,   11,   11,   11,   11,
+       11,   11,   11,   11,   11,   11,   11,   11,   11,   11,
+       11,   11,    1,   17,    1,    1,    1,    1,    1,    1,
         1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
         1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
         1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
@@ -861,6 +854,13 @@ static void alloc_string(const char *str, int size)
        text[size] = 0;
 }
 
+static void warn_ignored_character(char chr)
+{
+       fprintf(stderr,
+               "%s:%d:warning: ignoring unsupported character '%c'\n",
+               zconf_curname(), zconf_lineno(), chr);
+}
+
 #define INITIAL 0
 #define COMMAND 1
 #define HELP 2
@@ -944,7 +944,12 @@ static int input (void );
 
 /* Amount of stuff to slurp up with each read. */
 #ifndef YY_READ_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k */
+#define YY_READ_BUF_SIZE 16384
+#else
 #define YY_READ_BUF_SIZE 8192
+#endif /* __ia64__ */
 #endif
 
 /* Copy whatever the last rule matched to the standard output. */
@@ -952,7 +957,7 @@ static int input (void );
 /* This used to be an fputs(), but since the string might contain NUL's,
  * we now use fwrite().
  */
-#define ECHO fwrite( zconftext, zconfleng, 1, zconfout )
+#define ECHO do { if (fwrite( zconftext, zconfleng, 1, zconfout )) {} } while (0)
 #endif
 
 /* Gets input and stuffs it into "buf".  number of characters read, or YY_NULL,
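The ECHO rewrite above exists to keep the generated lexer warning-clean: on toolchains where fwrite() is declared warn_unused_result, the bare call trips -Wunused-result, while the do { if (...) {} } while (0) form consumes the return value and still behaves as a single statement. A stand-alone sketch of the idiom (illustrative only, not part of the generated scanner):

#include <stdio.h>

/* Consume fwrite()'s return value so -Wunused-result stays quiet,
 * while keeping the macro usable as one statement in an unbraced if/else. */
#define ECHO_DEMO(buf, len, stream) \
        do { if (fwrite((buf), (len), 1, (stream))) {} } while (0)

int main(void)
{
        const char msg[] = "hello\n";

        ECHO_DEMO(msg, sizeof(msg) - 1, stdout);
        return 0;
}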
@@ -1132,7 +1137,7 @@ YY_RULE_SETUP
        YY_BREAK
 case 7:
 YY_RULE_SETUP
-
+warn_ignored_character(*zconftext);
        YY_BREAK
 case 8:
 /* rule 8 can match eol */
@@ -1203,10 +1208,6 @@ BEGIN(INITIAL); current_file->lineno++; return T_EOL;
        YY_BREAK
 case 22:
 YY_RULE_SETUP
-/* ignore */
-       YY_BREAK
-case 23:
-YY_RULE_SETUP
 {
                const struct kconf_id *id = kconf_id_lookup(zconftext, zconfleng);
                if (id && id->flags & TF_PARAM) {
@@ -1218,26 +1219,22 @@ YY_RULE_SETUP
                return T_WORD;
        }
        YY_BREAK
-case 24:
+case 23:
 YY_RULE_SETUP
 /* comment */
        YY_BREAK
-case 25:
-/* rule 25 can match eol */
+case 24:
+/* rule 24 can match eol */
 YY_RULE_SETUP
 current_file->lineno++;
        YY_BREAK
-case 26:
+case 25:
 YY_RULE_SETUP
 
        YY_BREAK
-case 27:
+case 26:
 YY_RULE_SETUP
-{
-               fprintf(stderr,
-                       "%s:%d:warning: ignoring unsupported character '%c'\n",
-                       zconf_curname(), zconf_lineno(), *zconftext);
-       }
+warn_ignored_character(*zconftext);
        YY_BREAK
 case YY_STATE_EOF(PARAM):
 {
@@ -1245,8 +1242,8 @@ case YY_STATE_EOF(PARAM):
        }
        YY_BREAK
 
-case 28:
-/* rule 28 can match eol */
+case 27:
+/* rule 27 can match eol */
 *yy_cp = (yy_hold_char); /* undo effects of setting up zconftext */
 (yy_c_buf_p) = yy_cp -= 1;
 YY_DO_BEFORE_ACTION; /* set up zconftext again */
@@ -1257,14 +1254,14 @@ YY_RULE_SETUP
                return T_WORD_QUOTE;
        }
        YY_BREAK
-case 29:
+case 28:
 YY_RULE_SETUP
 {
                append_string(zconftext, zconfleng);
        }
        YY_BREAK
-case 30:
-/* rule 30 can match eol */
+case 29:
+/* rule 29 can match eol */
 *yy_cp = (yy_hold_char); /* undo effects of setting up zconftext */
 (yy_c_buf_p) = yy_cp -= 1;
 YY_DO_BEFORE_ACTION; /* set up zconftext again */
@@ -1275,13 +1272,13 @@ YY_RULE_SETUP
                return T_WORD_QUOTE;
        }
        YY_BREAK
-case 31:
+case 30:
 YY_RULE_SETUP
 {
                append_string(zconftext + 1, zconfleng - 1);
        }
        YY_BREAK
-case 32:
+case 31:
 YY_RULE_SETUP
 {
                if (str == zconftext[0]) {
@@ -1292,8 +1289,8 @@ YY_RULE_SETUP
                        append_string(zconftext, 1);
        }
        YY_BREAK
-case 33:
-/* rule 33 can match eol */
+case 32:
+/* rule 32 can match eol */
 YY_RULE_SETUP
 {
                printf("%s:%d:warning: multi-line strings not supported\n", zconf_curname(), zconf_lineno());
@@ -1308,7 +1305,7 @@ case YY_STATE_EOF(STRING):
        }
        YY_BREAK
 
-case 34:
+case 33:
 YY_RULE_SETUP
 {
                ts = 0;
@@ -1333,8 +1330,8 @@ YY_RULE_SETUP
                }
        }
        YY_BREAK
-case 35:
-/* rule 35 can match eol */
+case 34:
+/* rule 34 can match eol */
 *yy_cp = (yy_hold_char); /* undo effects of setting up zconftext */
 (yy_c_buf_p) = yy_cp -= 1;
 YY_DO_BEFORE_ACTION; /* set up zconftext again */
@@ -1345,15 +1342,15 @@ YY_RULE_SETUP
                return T_HELPTEXT;
        }
        YY_BREAK
-case 36:
-/* rule 36 can match eol */
+case 35:
+/* rule 35 can match eol */
 YY_RULE_SETUP
 {
                current_file->lineno++;
                append_string("\n", 1);
        }
        YY_BREAK
-case 37:
+case 36:
 YY_RULE_SETUP
 {
                while (zconfleng) {
@@ -1384,7 +1381,7 @@ case YY_STATE_EOF(COMMAND):
        yyterminate();
 }
        YY_BREAK
-case 38:
+case 37:
 YY_RULE_SETUP
 YY_FATAL_ERROR( "flex scanner jammed" );
        YY_BREAK
@@ -2114,8 +2111,8 @@ YY_BUFFER_STATE zconf_scan_string (yyconst char * yystr )
 
 /** Setup the input buffer state to scan the given bytes. The next call to zconflex() will
  * scan from a @e copy of @a bytes.
- * @param bytes the byte buffer to scan
- * @param len the number of bytes in the buffer pointed to by @a bytes.
+ * @param yybytes the byte buffer to scan
+ * @param _yybytes_len the number of bytes in the buffer pointed to by @a bytes.
  * 
  * @return the newly allocated buffer state object.
  */
index 168b43dc0a59b6be4edea0fc5768fb2d63cfa440..6a5e1515123b3b0e38a2036726e890e76d44cf48 100644 (file)
 
 #include "elfconfig.h"
 
+/* On BSD-alike OSes elf.h defines these according to host's word size */
+#undef ELF_ST_BIND
+#undef ELF_ST_TYPE
+#undef ELF_R_SYM
+#undef ELF_R_TYPE
+
 #if KERNEL_ELFCLASS == ELFCLASS32
 
 #define Elf_Ehdr    Elf32_Ehdr
index 99ca6e76eb0a532ffafd0918dca8fa217c88f571..1aca224e8597c838a78ce18c64d847e66d816091 100644 (file)
 # Note that the rpm-pkg target cannot be used with KBUILD_OUTPUT,
 # but the binrpm-pkg target can; for some reason O= gets ignored.
 
-# Do we have rpmbuild, otherwise fall back to the older rpm
-RPM := $(shell if [ -x "/usr/bin/rpmbuild" ]; then echo rpmbuild; \
-                  else echo rpm; fi)
-
 # Remove hyphens since they have special meaning in RPM filenames
 KERNELPATH := kernel-$(subst -,_,$(KERNELRELEASE))
+KDEB_SOURCENAME ?= linux-$(KERNELRELEASE)
+export KDEB_SOURCENAME
 # Include only those top-level files that are needed by make, plus the GPL copy
-TAR_CONTENT := $(KBUILD_ALLDIRS) kernel.spec .config .scmversion Makefile \
+TAR_CONTENT := $(KBUILD_ALLDIRS) .config .scmversion Makefile \
                Kbuild Kconfig COPYING $(wildcard localversion*)
-TAR_CONTENT := $(addprefix $(KERNELPATH)/,$(TAR_CONTENT))
 MKSPEC     := $(srctree)/scripts/package/mkspec
 
+quiet_cmd_src_tar = TAR     $(2).tar.gz
+      cmd_src_tar = \
+if test "$(objtree)" != "$(srctree)"; then \
+       echo "Building source tarball is not possible outside the"; \
+       echo "kernel source tree. Don't set KBUILD_OUTPUT, or use the"; \
+       echo "binrpm-pkg or bindeb-pkg target instead."; \
+       false; \
+fi ; \
+$(srctree)/scripts/setlocalversion --save-scmversion; \
+ln -sf $(srctree) $(2); \
+tar -cz $(RCS_TAR_IGNORE) -f $(2).tar.gz \
+       $(addprefix $(2)/,$(TAR_CONTENT) $(3)); \
+rm -f $(2) $(objtree)/.scmversion
+
 # rpm-pkg
 # ---------------------------------------------------------------------------
 rpm-pkg rpm: FORCE
-       @if test "$(objtree)" != "$(srctree)"; then \
-               echo "Building source + binary RPM is not possible outside the"; \
-               echo "kernel source tree. Don't set KBUILD_OUTPUT, or use the"; \
-               echo "binrpm-pkg target instead."; \
-               false; \
-       fi
        $(MAKE) clean
-       ln -sf $(srctree) $(KERNELPATH)
        $(CONFIG_SHELL) $(MKSPEC) >$(objtree)/kernel.spec
-       $(CONFIG_SHELL) $(srctree)/scripts/setlocalversion --save-scmversion
-       tar -cz $(RCS_TAR_IGNORE) -f $(KERNELPATH).tar.gz $(TAR_CONTENT)
-       rm $(KERNELPATH)
-       rm -f $(objtree)/.scmversion
+       $(call cmd,src_tar,$(KERNELPATH),kernel.spec)
        $(CONFIG_SHELL) $(srctree)/scripts/mkversion > $(objtree)/.tmp_version
        mv -f $(objtree)/.tmp_version $(objtree)/.version
-       $(RPM) $(RPMOPTS) --target $(UTS_MACHINE) -ta $(KERNELPATH).tar.gz
+       rpmbuild --target $(UTS_MACHINE) -ta $(KERNELPATH).tar.gz
        rm $(KERNELPATH).tar.gz kernel.spec
 
 # binrpm-pkg
@@ -62,7 +63,7 @@ binrpm-pkg: FORCE
        $(CONFIG_SHELL) $(srctree)/scripts/mkversion > $(objtree)/.tmp_version
        mv -f $(objtree)/.tmp_version $(objtree)/.version
 
-       $(RPM) $(RPMOPTS) --define "_builddir $(objtree)" --target \
+       rpmbuild --define "_builddir $(objtree)" --target \
                $(UTS_MACHINE) -bb $(objtree)/binkernel.spec
        rm binkernel.spec
 
@@ -84,11 +85,17 @@ quiet_cmd_builddeb = BUILDDEB
        } && \
        \
        $$KBUILD_PKG_ROOTCMD $(CONFIG_SHELL) \
-               $(srctree)/scripts/package/builddeb
+               $(srctree)/scripts/package/builddeb $@
 
 deb-pkg: FORCE
+       $(MAKE) clean
+       $(call cmd,src_tar,$(KDEB_SOURCENAME))
+       $(MAKE) KBUILD_SRC=
+       +$(call cmd,builddeb)
+
+bindeb-pkg: FORCE
        $(MAKE) KBUILD_SRC=
-       $(call cmd,builddeb)
+       +$(call cmd,builddeb)
 
 clean-dirs += $(objtree)/debian/
 
@@ -133,8 +140,9 @@ perf-%pkg: FORCE
 # ---------------------------------------------------------------------------
 help: FORCE
        @echo '  rpm-pkg             - Build both source and binary RPM kernel packages'
-       @echo '  binrpm-pkg          - Build only the binary kernel package'
-       @echo '  deb-pkg             - Build the kernel as a deb package'
+       @echo '  binrpm-pkg          - Build only the binary kernel RPM package'
+       @echo '  deb-pkg             - Build both source and binary deb kernel packages'
+       @echo '  bindeb-pkg          - Build only the binary kernel deb package'
        @echo '  tar-pkg             - Build the kernel as an uncompressed tarball'
        @echo '  targz-pkg           - Build the kernel as a gzip compressed tarball'
        @echo '  tarbz2-pkg          - Build the kernel as a bzip2 compressed tarball'
index 88dbf23b697082aa899c6b414b65cf0e4ecbf67d..0cd46e129920e8ad114eb335ae92ce85230d068d 100755 (executable)
@@ -15,6 +15,8 @@ set -e
 create_package() {
        local pname="$1" pdir="$2"
 
+       mkdir -m 755 -p "$pdir/DEBIAN"
+       mkdir -p "$pdir/usr/share/doc/$pname"
        cp debian/copyright "$pdir/usr/share/doc/$pname/"
        cp debian/changelog "$pdir/usr/share/doc/$pname/changelog.Debian"
        gzip -9 "$pdir/usr/share/doc/$pname/changelog.Debian"
@@ -25,8 +27,13 @@ create_package() {
        chown -R root:root "$pdir"
        chmod -R go-w "$pdir"
 
+       # Create the package
+       dpkg-gencontrol $forcearch -Vkernel:debarch="${debarch}" -p$pname -P"$pdir"
+       dpkg --build "$pdir" ..
+}
+
+set_debarch() {
        # Attempt to find the correct Debian architecture
-       local forcearch="" debarch=""
        case "$UTS_MACHINE" in
        i386|ia64|alpha)
                debarch="$UTS_MACHINE" ;;
@@ -47,6 +54,7 @@ create_package() {
        arm*)
                debarch=arm$(grep -q CONFIG_AEABI=y $KCONFIG_CONFIG && echo el || true) ;;
        *)
+               debarch=$(dpkg --print-architecture)
                echo "" >&2
                echo "** ** **  WARNING  ** ** **" >&2
                echo "" >&2
@@ -59,13 +67,8 @@ create_package() {
        if [ -n "$KBUILD_DEBARCH" ] ; then
                debarch="$KBUILD_DEBARCH"
        fi
-       if [ -n "$debarch" ] ; then
-               forcearch="-DArchitecture=$debarch"
-       fi
+       forcearch="-DArchitecture=$debarch"
 
-       # Create the package
-       dpkg-gencontrol $forcearch -Vkernel:debarch="${debarch:-$(dpkg --print-architecture)}" -p$pname -P"$pdir"
-       dpkg --build "$pdir" ..
 }
 
 # Some variables and settings used throughout the script
@@ -76,6 +79,7 @@ if [ -n "$KDEB_PKGVERSION" ]; then
 else
        packageversion=$version-$revision
 fi
+sourcename=$KDEB_SOURCENAME
 tmpdir="$objtree/debian/tmp"
 fwdir="$objtree/debian/fwtmp"
 kernel_headers_dir="$objtree/debian/hdrtmp"
@@ -86,6 +90,9 @@ fwpackagename=linux-firmware-image-$version
 kernel_headers_packagename=linux-headers-$version
 libc_headers_packagename=linux-libc-dev
 dbg_packagename=$packagename-dbg
+debarch=
+forcearch=
+set_debarch
 
 if [ "$ARCH" = "um" ] ; then
        packagename=user-mode-linux-$version
@@ -110,24 +117,13 @@ BUILD_DEBUG="$(grep -s '^CONFIG_DEBUG_INFO=y' $KCONFIG_CONFIG || true)"
 # Setup the directory structure
 rm -rf "$tmpdir" "$fwdir" "$kernel_headers_dir" "$libc_headers_dir" "$dbg_dir"
 mkdir -m 755 -p "$tmpdir/DEBIAN"
-mkdir -p  "$tmpdir/lib" "$tmpdir/boot" "$tmpdir/usr/share/doc/$packagename"
-mkdir -m 755 -p "$fwdir/DEBIAN"
-mkdir -p "$fwdir/lib/firmware/$version/" "$fwdir/usr/share/doc/$fwpackagename"
-mkdir -m 755 -p "$libc_headers_dir/DEBIAN"
-mkdir -p "$libc_headers_dir/usr/share/doc/$libc_headers_packagename"
-mkdir -m 755 -p "$kernel_headers_dir/DEBIAN"
-mkdir -p "$kernel_headers_dir/usr/share/doc/$kernel_headers_packagename"
+mkdir -p "$tmpdir/lib" "$tmpdir/boot"
+mkdir -p "$fwdir/lib/firmware/$version/"
 mkdir -p "$kernel_headers_dir/lib/modules/$version/"
-if [ "$ARCH" = "um" ] ; then
-       mkdir -p "$tmpdir/usr/lib/uml/modules/$version" "$tmpdir/usr/bin"
-fi
-if [ -n "$BUILD_DEBUG" ] ; then
-       mkdir -p "$dbg_dir/usr/share/doc/$dbg_packagename"
-       mkdir -m 755 -p "$dbg_dir/DEBIAN"
-fi
 
 # Build and install the kernel
 if [ "$ARCH" = "um" ] ; then
+       mkdir -p "$tmpdir/usr/lib/uml/modules/$version" "$tmpdir/usr/bin" "$tmpdir/usr/share/doc/$packagename"
        $MAKE linux
        cp System.map "$tmpdir/usr/lib/uml/modules/$version/System.map"
        cp $KCONFIG_CONFIG "$tmpdir/usr/share/doc/$packagename/config"
@@ -143,6 +139,13 @@ else
        cp arch/$ARCH/boot/$KBUILD_IMAGE "$tmpdir/$installed_image_path"
 fi
 
+if grep -q "^CONFIG_OF=y" $KCONFIG_CONFIG ; then
+       # Only some architectures with OF support have this target
+       if grep -q dtbs_install "${srctree}/arch/$SRCARCH/Makefile"; then
+               $MAKE KBUILD_SRC= INSTALL_DTBS_PATH="$tmpdir/usr/lib/$packagename" dtbs_install
+       fi
+fi
+
 if grep -q '^CONFIG_MODULES=y' $KCONFIG_CONFIG ; then
        INSTALL_MOD_PATH="$tmpdir" $MAKE KBUILD_SRC= modules_install
        rm -f "$tmpdir/lib/modules/$version/build"
@@ -162,6 +165,12 @@ if grep -q '^CONFIG_MODULES=y' $KCONFIG_CONFIG ; then
                        # then add a link to those
                        $OBJCOPY --add-gnu-debuglink=$dbg_dir/usr/lib/debug/$module $tmpdir/$module
                done
+
+               # resign stripped modules
+               MODULE_SIG_ALL="$(grep -s '^CONFIG_MODULE_SIG_ALL=y' $KCONFIG_CONFIG || true)"
+               if [ -n "$MODULE_SIG_ALL" ]; then
+                       INSTALL_MOD_PATH="$tmpdir" $MAKE KBUILD_SRC= modules_sign
+               fi
        fi
 fi
 
@@ -206,7 +215,7 @@ if [ -n "$DEBEMAIL" ]; then
 elif [ -n "$EMAIL" ]; then
        email=$EMAIL
 else
-       email=$(id -nu)@$(hostname -f)
+       email=$(id -nu)@$(hostname -f 2>/dev/null || hostname)
 fi
 if [ -n "$DEBFULLNAME" ]; then
        name=$DEBFULLNAME
@@ -230,7 +239,7 @@ fi
 
 # Generate a simple changelog template
 cat <<EOF > debian/changelog
-linux-upstream ($packageversion) $distribution; urgency=low
+$sourcename ($packageversion) $distribution; urgency=low
 
   * Custom built Linux kernel.
 
@@ -257,12 +266,16 @@ On Debian GNU/Linux systems, the complete text of the GNU General Public
 License version 2 can be found in \`/usr/share/common-licenses/GPL-2'.
 EOF
 
+
+build_depends="bc, kmod, cpio "
+
 # Generate a control file
 cat <<EOF > debian/control
-Source: linux-upstream
+Source: $sourcename
 Section: kernel
 Priority: optional
 Maintainer: $maintainer
+Build-Depends: $build_depends
 Standards-Version: 3.8.4
 Homepage: http://www.kernel.org/
 EOF
@@ -383,4 +396,33 @@ EOF
        create_package "$dbg_packagename" "$dbg_dir"
 fi
 
+if [ "x$1" = "xdeb-pkg" ]
+then
+    cat <<EOF > debian/rules
+#!/usr/bin/make -f
+
+build:
+       \$(MAKE)
+
+binary-arch:
+       \$(MAKE) KDEB_SOURCENAME=${sourcename} KDEB_PKGVERSION=${packageversion} bindeb-pkg
+
+clean:
+       rm -rf debian/*tmp
+       mv debian/ debian.backup # debian/ might be cleaned away
+       \$(MAKE) clean
+       mv debian.backup debian
+
+binary: binary-arch
+EOF
+       mv ${sourcename}.tar.gz ../${sourcename}_${version}.orig.tar.gz
+       tar caf ../${sourcename}_${packageversion}.debian.tar.gz debian/{copyright,rules,changelog,control}
+       dpkg-source -cdebian/control -ldebian/changelog --format="3.0 (custom)" --target-format="3.0 (quilt)" \
+               -b / ../${sourcename}_${version}.orig.tar.gz  ../${sourcename}_${packageversion}.debian.tar.gz
+       mv ${sourcename}_${packageversion}*dsc ..
+       dpkg-genchanges > ../${sourcename}_${packageversion}_${debarch}.changes
+else
+       dpkg-genchanges -b > ../${sourcename}_${packageversion}_${debarch}.changes
+fi
+
 exit 0
index d9ab94b17de0bc119a6fbf3958886d1735085fdf..71004daefe31b6fc557453d1655005c81f7fa9cc 100755 (executable)
@@ -111,10 +111,8 @@ echo 'cp System.map $RPM_BUILD_ROOT'"/boot/System.map-$KERNELRELEASE"
 echo 'cp .config $RPM_BUILD_ROOT'"/boot/config-$KERNELRELEASE"
 
 echo "%ifnarch ppc64"
-echo 'cp vmlinux vmlinux.orig'
-echo 'bzip2 -9 vmlinux'
+echo 'bzip2 -9 --keep vmlinux'
 echo 'mv vmlinux.bz2 $RPM_BUILD_ROOT'"/boot/vmlinux-$KERNELRELEASE.bz2"
-echo 'mv vmlinux.orig vmlinux'
 echo "%endif"
 
 if ! $PREBUILT; then
@@ -142,7 +140,6 @@ echo "fi"
 echo ""
 echo "%files"
 echo '%defattr (-, root, root)'
-echo "%dir /lib/modules"
 echo "/lib/modules/$KERNELRELEASE"
 echo "%exclude /lib/modules/$KERNELRELEASE/build"
 echo "%exclude /lib/modules/$KERNELRELEASE/source"
index 62b34ce1f50dd16a0aed513ad40a31baead586ac..e10beb11b696e4f6d289e3c74a7dddf970b1b66b 100644 (file)
@@ -98,6 +98,7 @@ int main(int argc, char *argv[])
 
        /* types, roles, and allows */
        fprintf(fout, "type base_t;\n");
+       fprintf(fout, "role base_r;\n");
        fprintf(fout, "role base_r types { base_t };\n");
        for (i = 0; secclass_map[i].name; i++)
                fprintf(fout, "allow base_t base_t:%s *;\n",
diff --git a/scripts/sign-file b/scripts/sign-file
deleted file mode 100755 (executable)
index 3906ee1..0000000
+++ /dev/null
@@ -1,421 +0,0 @@
-#!/usr/bin/perl -w
-#
-# Sign a module file using the given key.
-#
-
-my $USAGE =
-"Usage: scripts/sign-file [-v] <hash algo> <key> <x509> <module> [<dest>]\n" .
-"       scripts/sign-file [-v] -s <raw sig> <hash algo> <x509> <module> [<dest>]\n";
-
-use strict;
-use FileHandle;
-use IPC::Open2;
-use Getopt::Std;
-
-my %opts;
-getopts('vs:', \%opts) or die $USAGE;
-my $verbose = $opts{'v'};
-my $signature_file = $opts{'s'};
-
-die $USAGE if ($#ARGV > 4);
-die $USAGE if (!$signature_file && $#ARGV < 3 || $signature_file && $#ARGV < 2);
-
-my $dgst = shift @ARGV;
-my $private_key;
-if (!$signature_file) {
-       $private_key = shift @ARGV;
-}
-my $x509 = shift @ARGV;
-my $module = shift @ARGV;
-my ($dest, $keep_orig);
-if (@ARGV) {
-       $dest = $ARGV[0];
-       $keep_orig = 1;
-} else {
-       $dest = $module . "~";
-}
-
-die "Can't read private key\n" if (!$signature_file && !-r $private_key);
-die "Can't read signature file\n" if ($signature_file && !-r $signature_file);
-die "Can't read X.509 certificate\n" unless (-r $x509);
-die "Can't read module\n" unless (-r $module);
-
-#
-# Function to read the contents of a file into a variable.
-#
-sub read_file($)
-{
-    my ($file) = @_;
-    my $contents;
-    my $len;
-
-    open(FD, "<$file") || die $file;
-    binmode FD;
-    my @st = stat(FD);
-    die $file if (!@st);
-    $len = read(FD, $contents, $st[7]) || die $file;
-    close(FD) || die $file;
-    die "$file: Wanted length ", $st[7], ", got ", $len, "\n"
-       if ($len != $st[7]);
-    return $contents;
-}
-
-###############################################################################
-#
-# First of all, we have to parse the X.509 certificate to find certain details
-# about it.
-#
-# We read the DER-encoded X509 certificate and parse it to extract the Subject
-# name and Subject Key Identifier.  This provides the data we need to build
-# the certificate identifier.
-#
-# The signer's name part of the identifier is fabricated from the commonName,
-# the organizationName or the emailAddress components of the X.509 subject
-# name.
-#
-# The subject key ID is used to select which of that signer's certificates
-# we're intending to use to sign the module.
-#
-###############################################################################
-my $x509_certificate = read_file($x509);
-
-my $UNIV = 0 << 6;
-my $APPL = 1 << 6;
-my $CONT = 2 << 6;
-my $PRIV = 3 << 6;
-
-my $CONS = 0x20;
-
-my $BOOLEAN    = 0x01;
-my $INTEGER    = 0x02;
-my $BIT_STRING = 0x03;
-my $OCTET_STRING = 0x04;
-my $NULL       = 0x05;
-my $OBJ_ID     = 0x06;
-my $UTF8String = 0x0c;
-my $SEQUENCE   = 0x10;
-my $SET                = 0x11;
-my $UTCTime    = 0x17;
-my $GeneralizedTime = 0x18;
-
-my %OIDs = (
-    pack("CCC", 85, 4, 3)      => "commonName",
-    pack("CCC", 85, 4, 6)      => "countryName",
-    pack("CCC", 85, 4, 10)     => "organizationName",
-    pack("CCC", 85, 4, 11)     => "organizationUnitName",
-    pack("CCCCCCCCC", 42, 134, 72, 134, 247, 13, 1, 1, 1) => "rsaEncryption",
-    pack("CCCCCCCCC", 42, 134, 72, 134, 247, 13, 1, 1, 5) => "sha1WithRSAEncryption",
-    pack("CCCCCCCCC", 42, 134, 72, 134, 247, 13, 1, 9, 1) => "emailAddress",
-    pack("CCC", 85, 29, 35)    => "authorityKeyIdentifier",
-    pack("CCC", 85, 29, 14)    => "subjectKeyIdentifier",
-    pack("CCC", 85, 29, 19)    => "basicConstraints"
-);
-
-###############################################################################
-#
-# Extract an ASN.1 element from a string and return information about it.
-#
-###############################################################################
-sub asn1_extract($$@)
-{
-    my ($cursor, $expected_tag, $optional) = @_;
-
-    return [ -1 ]
-       if ($cursor->[1] == 0 && $optional);
-
-    die $x509, ": ", $cursor->[0], ": ASN.1 data underrun (elem ", $cursor->[1], ")\n"
-       if ($cursor->[1] < 2);
-
-    my ($tag, $len) = unpack("CC", substr(${$cursor->[2]}, $cursor->[0], 2));
-
-    if ($expected_tag != -1 && $tag != $expected_tag) {
-       return [ -1 ]
-           if ($optional);
-       die $x509, ": ", $cursor->[0], ": ASN.1 unexpected tag (", $tag,
-       " not ", $expected_tag, ")\n";
-    }
-
-    $cursor->[0] += 2;
-    $cursor->[1] -= 2;
-
-    die $x509, ": ", $cursor->[0], ": ASN.1 long tag\n"
-       if (($tag & 0x1f) == 0x1f);
-    die $x509, ": ", $cursor->[0], ": ASN.1 indefinite length\n"
-       if ($len == 0x80);
-
-    if ($len > 0x80) {
-       my $l = $len - 0x80;
-       die $x509, ": ", $cursor->[0], ": ASN.1 data underrun (len len $l)\n"
-           if ($cursor->[1] < $l);
-
-       if ($l == 0x1) {
-           $len = unpack("C", substr(${$cursor->[2]}, $cursor->[0], 1));
-       } elsif ($l == 0x2) {
-           $len = unpack("n", substr(${$cursor->[2]}, $cursor->[0], 2));
-       } elsif ($l == 0x3) {
-           $len = unpack("C", substr(${$cursor->[2]}, $cursor->[0], 1)) << 16;
-           $len = unpack("n", substr(${$cursor->[2]}, $cursor->[0] + 1, 2));
-       } elsif ($l == 0x4) {
-           $len = unpack("N", substr(${$cursor->[2]}, $cursor->[0], 4));
-       } else {
-           die $x509, ": ", $cursor->[0], ": ASN.1 element too long (", $l, ")\n";
-       }
-
-       $cursor->[0] += $l;
-       $cursor->[1] -= $l;
-    }
-
-    die $x509, ": ", $cursor->[0], ": ASN.1 data underrun (", $len, ")\n"
-       if ($cursor->[1] < $len);
-
-    my $ret = [ $tag, [ $cursor->[0], $len, $cursor->[2] ] ];
-    $cursor->[0] += $len;
-    $cursor->[1] -= $len;
-
-    return $ret;
-}
-
-###############################################################################
-#
-# Retrieve the data referred to by a cursor
-#
-###############################################################################
-sub asn1_retrieve($)
-{
-    my ($cursor) = @_;
-    my ($offset, $len, $data) = @$cursor;
-    return substr($$data, $offset, $len);
-}
-
-###############################################################################
-#
-# Roughly parse the X.509 certificate
-#
-###############################################################################
-my $cursor = [ 0, length($x509_certificate), \$x509_certificate ];
-
-my $cert = asn1_extract($cursor, $UNIV | $CONS | $SEQUENCE);
-my $tbs = asn1_extract($cert->[1], $UNIV | $CONS | $SEQUENCE);
-my $version = asn1_extract($tbs->[1], $CONT | $CONS | 0, 1);
-my $serial_number = asn1_extract($tbs->[1], $UNIV | $INTEGER);
-my $sig_type = asn1_extract($tbs->[1], $UNIV | $CONS | $SEQUENCE);
-my $issuer = asn1_extract($tbs->[1], $UNIV | $CONS | $SEQUENCE);
-my $validity = asn1_extract($tbs->[1], $UNIV | $CONS | $SEQUENCE);
-my $subject = asn1_extract($tbs->[1], $UNIV | $CONS | $SEQUENCE);
-my $key = asn1_extract($tbs->[1], $UNIV | $CONS | $SEQUENCE);
-my $issuer_uid = asn1_extract($tbs->[1], $CONT | $CONS | 1, 1);
-my $subject_uid = asn1_extract($tbs->[1], $CONT | $CONS | 2, 1);
-my $extension_list = asn1_extract($tbs->[1], $CONT | $CONS | 3, 1);
-
-my $subject_key_id = ();
-my $authority_key_id = ();
-
-#
-# Parse the extension list
-#
-if ($extension_list->[0] != -1) {
-    my $extensions = asn1_extract($extension_list->[1], $UNIV | $CONS | $SEQUENCE);
-
-    while ($extensions->[1]->[1] > 0) {
-       my $ext = asn1_extract($extensions->[1], $UNIV | $CONS | $SEQUENCE);
-       my $x_oid = asn1_extract($ext->[1], $UNIV | $OBJ_ID);
-       my $x_crit = asn1_extract($ext->[1], $UNIV | $BOOLEAN, 1);
-       my $x_val = asn1_extract($ext->[1], $UNIV | $OCTET_STRING);
-
-       my $raw_oid = asn1_retrieve($x_oid->[1]);
-       next if (!exists($OIDs{$raw_oid}));
-       my $x_type = $OIDs{$raw_oid};
-
-       my $raw_value = asn1_retrieve($x_val->[1]);
-
-       if ($x_type eq "subjectKeyIdentifier") {
-           my $vcursor = [ 0, length($raw_value), \$raw_value ];
-
-           $subject_key_id = asn1_extract($vcursor, $UNIV | $OCTET_STRING);
-       }
-    }
-}
-
-###############################################################################
-#
-# Determine what we're going to use as the signer's name.  In order of
-# preference, take one of: commonName, organizationName or emailAddress.
-#
-###############################################################################
-my $org = "";
-my $cn = "";
-my $email = "";
-
-while ($subject->[1]->[1] > 0) {
-    my $rdn = asn1_extract($subject->[1], $UNIV | $CONS | $SET);
-    my $attr = asn1_extract($rdn->[1], $UNIV | $CONS | $SEQUENCE);
-    my $n_oid = asn1_extract($attr->[1], $UNIV | $OBJ_ID);
-    my $n_val = asn1_extract($attr->[1], -1);
-
-    my $raw_oid = asn1_retrieve($n_oid->[1]);
-    next if (!exists($OIDs{$raw_oid}));
-    my $n_type = $OIDs{$raw_oid};
-
-    my $raw_value = asn1_retrieve($n_val->[1]);
-
-    if ($n_type eq "organizationName") {
-       $org = $raw_value;
-    } elsif ($n_type eq "commonName") {
-       $cn = $raw_value;
-    } elsif ($n_type eq "emailAddress") {
-       $email = $raw_value;
-    }
-}
-
-my $signers_name = $email;
-
-if ($org && $cn) {
-    # Don't use the organizationName if the commonName repeats it
-    if (length($org) <= length($cn) &&
-       substr($cn, 0, length($org)) eq $org) {
-       $signers_name = $cn;
-       goto got_id_name;
-    }
-
-    # Or a significant chunk of it
-    if (length($org) >= 7 &&
-       length($cn) >= 7 &&
-       substr($cn, 0, 7) eq substr($org, 0, 7)) {
-       $signers_name = $cn;
-       goto got_id_name;
-    }
-
-    $signers_name = $org . ": " . $cn;
-} elsif ($org) {
-    $signers_name = $org;
-} elsif ($cn) {
-    $signers_name = $cn;
-}
-
-got_id_name:
-
-die $x509, ": ", "X.509: Couldn't find the Subject Key Identifier extension\n"
-    if (!$subject_key_id);
-
-my $key_identifier = asn1_retrieve($subject_key_id->[1]);
-
-###############################################################################
-#
-# Create and attach the module signature
-#
-###############################################################################
-
-#
-# Signature parameters
-#
-my $algo = 1;          # Public-key crypto algorithm: RSA
-my $hash = 0;          # Digest algorithm
-my $id_type = 1;       # Identifier type: X.509
-
-#
-# Digest the data
-#
-my $prologue;
-if ($dgst eq "sha1") {
-    $prologue = pack("C*",
-                    0x30, 0x21, 0x30, 0x09, 0x06, 0x05,
-                    0x2B, 0x0E, 0x03, 0x02, 0x1A,
-                    0x05, 0x00, 0x04, 0x14);
-    $hash = 2;
-} elsif ($dgst eq "sha224") {
-    $prologue = pack("C*",
-                    0x30, 0x2d, 0x30, 0x0d, 0x06, 0x09,
-                    0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x04,
-                    0x05, 0x00, 0x04, 0x1C);
-    $hash = 7;
-} elsif ($dgst eq "sha256") {
-    $prologue = pack("C*",
-                    0x30, 0x31, 0x30, 0x0d, 0x06, 0x09,
-                    0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x01,
-                    0x05, 0x00, 0x04, 0x20);
-    $hash = 4;
-} elsif ($dgst eq "sha384") {
-    $prologue = pack("C*",
-                    0x30, 0x41, 0x30, 0x0d, 0x06, 0x09,
-                    0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x02,
-                    0x05, 0x00, 0x04, 0x30);
-    $hash = 5;
-} elsif ($dgst eq "sha512") {
-    $prologue = pack("C*",
-                    0x30, 0x51, 0x30, 0x0d, 0x06, 0x09,
-                    0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x03,
-                    0x05, 0x00, 0x04, 0x40);
-    $hash = 6;
-} else {
-    die "Unknown hash algorithm: $dgst\n";
-}
-
-my $signature;
-if ($signature_file) {
-       $signature = read_file($signature_file);
-} else {
-       #
-       # Generate the digest and read from openssl's stdout
-       #
-       my $digest;
-       $digest = readpipe("openssl dgst -$dgst -binary $module") || die "openssl dgst";
-
-       #
-       # Generate the binary signature, which will be just the integer that
-       # comprises the signature with no metadata attached.
-       #
-       my $pid;
-       $pid = open2(*read_from, *write_to,
-                    "openssl rsautl -sign -inkey $private_key -keyform PEM") ||
-           die "openssl rsautl";
-       binmode write_to;
-       print write_to $prologue . $digest || die "pipe to openssl rsautl";
-       close(write_to) || die "pipe to openssl rsautl";
-
-       binmode read_from;
-       read(read_from, $signature, 4096) || die "pipe from openssl rsautl";
-       close(read_from) || die "pipe from openssl rsautl";
-       waitpid($pid, 0) || die;
-       die "openssl rsautl died: $?" if ($? >> 8);
-}
-$signature = pack("n", length($signature)) . $signature,
-
-#
-# Build the signed binary
-#
-my $unsigned_module = read_file($module);
-
-my $magic_number = "~Module signature appended~\n";
-
-my $info = pack("CCCCCxxxN",
-               $algo, $hash, $id_type,
-               length($signers_name),
-               length($key_identifier),
-               length($signature));
-
-if ($verbose) {
-    print "Size of unsigned module: ", length($unsigned_module), "\n";
-    print "Size of signer's name  : ", length($signers_name), "\n";
-    print "Size of key identifier : ", length($key_identifier), "\n";
-    print "Size of signature      : ", length($signature), "\n";
-    print "Size of information    : ", length($info), "\n";
-    print "Size of magic number   : ", length($magic_number), "\n";
-    print "Signer's name          : '", $signers_name, "'\n";
-    print "Digest                 : $dgst\n";
-}
-
-open(FD, ">$dest") || die $dest;
-binmode FD;
-print FD
-    $unsigned_module,
-    $signers_name,
-    $key_identifier,
-    $signature,
-    $info,
-    $magic_number
-    ;
-close FD || die $dest;
-
-if (!$keep_orig) {
-    rename($dest, $module) || die $module;
-}
diff --git a/scripts/sign-file.c b/scripts/sign-file.c
new file mode 100755 (executable)
index 0000000..c3899ca
--- /dev/null
@@ -0,0 +1,263 @@
+/* Sign a module file using the given key.
+ *
+ * Copyright © 2014-2015 Red Hat, Inc. All Rights Reserved.
+ * Copyright © 2015      Intel Corporation.
+ *
+ * Authors: David Howells <dhowells@redhat.com>
+ *          David Woodhouse <dwmw2@infradead.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the licence, or (at your option) any later version.
+ */
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <string.h>
+#include <getopt.h>
+#include <err.h>
+#include <arpa/inet.h>
+#include <openssl/bio.h>
+#include <openssl/evp.h>
+#include <openssl/pem.h>
+#include <openssl/cms.h>
+#include <openssl/err.h>
+#include <openssl/engine.h>
+
+struct module_signature {
+       uint8_t         algo;           /* Public-key crypto algorithm [0] */
+       uint8_t         hash;           /* Digest algorithm [0] */
+       uint8_t         id_type;        /* Key identifier type [PKEY_ID_PKCS7] */
+       uint8_t         signer_len;     /* Length of signer's name [0] */
+       uint8_t         key_id_len;     /* Length of key identifier [0] */
+       uint8_t         __pad[3];
+       uint32_t        sig_len;        /* Length of signature data */
+};
+
+#define PKEY_ID_PKCS7 2
+
+static char magic_number[] = "~Module signature appended~\n";
+
+static __attribute__((noreturn))
+void format(void)
+{
+       fprintf(stderr,
+               "Usage: scripts/sign-file [-dp] <hash algo> <key> <x509> <module> [<dest>]\n");
+       exit(2);
+}
+
+static void display_openssl_errors(int l)
+{
+       const char *file;
+       char buf[120];
+       int e, line;
+
+       if (ERR_peek_error() == 0)
+               return;
+       fprintf(stderr, "At main.c:%d:\n", l);
+
+       while ((e = ERR_get_error_line(&file, &line))) {
+               ERR_error_string(e, buf);
+               fprintf(stderr, "- SSL %s: %s:%d\n", buf, file, line);
+       }
+}
+
+static void drain_openssl_errors(void)
+{
+       const char *file;
+       int line;
+
+       if (ERR_peek_error() == 0)
+               return;
+       while (ERR_get_error_line(&file, &line)) {}
+}
+
+#define ERR(cond, fmt, ...)                            \
+       do {                                            \
+               bool __cond = (cond);                   \
+               display_openssl_errors(__LINE__);       \
+               if (__cond) {                           \
+                       err(1, fmt, ## __VA_ARGS__);    \
+               }                                       \
+       } while(0)
+
+static const char *key_pass;
+
+static int pem_pw_cb(char *buf, int len, int w, void *v)
+{
+       int pwlen;
+
+       if (!key_pass)
+               return -1;
+
+       pwlen = strlen(key_pass);
+       if (pwlen >= len)
+               return -1;
+
+       strcpy(buf, key_pass);
+
+       /* If it's wrong, don't keep trying it. */
+       key_pass = NULL;
+
+       return pwlen;
+}
+
+int main(int argc, char **argv)
+{
+       struct module_signature sig_info = { .id_type = PKEY_ID_PKCS7 };
+       char *hash_algo = NULL;
+       char *private_key_name, *x509_name, *module_name, *dest_name;
+       bool save_cms = false, replace_orig;
+       bool sign_only = false;
+       unsigned char buf[4096];
+       unsigned long module_size, cms_size;
+       unsigned int use_keyid = 0, use_signed_attrs = CMS_NOATTR;
+       const EVP_MD *digest_algo;
+       EVP_PKEY *private_key;
+       CMS_ContentInfo *cms;
+       X509 *x509;
+       BIO *b, *bd = NULL, *bm;
+       int opt, n;
+
+       OpenSSL_add_all_algorithms();
+       ERR_load_crypto_strings();
+       ERR_clear_error();
+
+       key_pass = getenv("KBUILD_SIGN_PIN");
+
+       do {
+               opt = getopt(argc, argv, "dpk");
+               switch (opt) {
+               case 'p': save_cms = true; break;
+               case 'd': sign_only = true; save_cms = true; break;
+               case 'k': use_keyid = CMS_USE_KEYID; break;
+               case -1: break;
+               default: format();
+               }
+       } while (opt != -1);
+
+       argc -= optind;
+       argv += optind;
+       if (argc < 4 || argc > 5)
+               format();
+
+       hash_algo = argv[0];
+       private_key_name = argv[1];
+       x509_name = argv[2];
+       module_name = argv[3];
+       if (argc == 5) {
+               dest_name = argv[4];
+               replace_orig = false;
+       } else {
+               ERR(asprintf(&dest_name, "%s.~signed~", module_name) < 0,
+                   "asprintf");
+               replace_orig = true;
+       }
+
+       /* Read the private key and the X.509 cert the PKCS#7 message
+        * will point to.
+        */
+       if (!strncmp(private_key_name, "pkcs11:", 7)) {
+               ENGINE *e;
+
+               ENGINE_load_builtin_engines();
+               drain_openssl_errors();
+               e = ENGINE_by_id("pkcs11");
+               ERR(!e, "Load PKCS#11 ENGINE");
+               if (ENGINE_init(e))
+                       drain_openssl_errors();
+               else
+                       ERR(1, "ENGINE_init");
+               if (key_pass)
+                       ERR(!ENGINE_ctrl_cmd_string(e, "PIN", key_pass, 0), "Set PKCS#11 PIN");
+               private_key = ENGINE_load_private_key(e, private_key_name, NULL,
+                                                     NULL);
+               ERR(!private_key, "%s", private_key_name);
+       } else {
+               b = BIO_new_file(private_key_name, "rb");
+               ERR(!b, "%s", private_key_name);
+               private_key = PEM_read_bio_PrivateKey(b, NULL, pem_pw_cb, NULL);
+               ERR(!private_key, "%s", private_key_name);
+               BIO_free(b);
+       }
+
+       b = BIO_new_file(x509_name, "rb");
+       ERR(!b, "%s", x509_name);
+       x509 = d2i_X509_bio(b, NULL); /* Binary encoded X.509 */
+       if (!x509) {
+               ERR(BIO_reset(b) != 1, "%s", x509_name);
+               x509 = PEM_read_bio_X509(b, NULL, NULL, NULL); /* PEM encoded X.509 */
+               if (x509)
+                       drain_openssl_errors();
+       }
+       BIO_free(b);
+       ERR(!x509, "%s", x509_name);
+
+       /* Open the destination file now so that we can shovel the module data
+        * across as we read it.
+        */
+       if (!sign_only) {
+               bd = BIO_new_file(dest_name, "wb");
+               ERR(!bd, "%s", dest_name);
+       }
+
+       /* Digest the module data. */
+       OpenSSL_add_all_digests();
+       display_openssl_errors(__LINE__);
+       digest_algo = EVP_get_digestbyname(hash_algo);
+       ERR(!digest_algo, "EVP_get_digestbyname");
+
+       bm = BIO_new_file(module_name, "rb");
+       ERR(!bm, "%s", module_name);
+
+       /* Load the CMS message from the digest buffer. */
+       cms = CMS_sign(NULL, NULL, NULL, NULL,
+                      CMS_NOCERTS | CMS_PARTIAL | CMS_BINARY | CMS_DETACHED | CMS_STREAM);
+       ERR(!cms, "CMS_sign");
+
+       ERR(!CMS_add1_signer(cms, x509, private_key, digest_algo,
+                            CMS_NOCERTS | CMS_BINARY | CMS_NOSMIMECAP |
+                            use_keyid | use_signed_attrs),
+           "CMS_sign_add_signer");
+       ERR(CMS_final(cms, bm, NULL, CMS_NOCERTS | CMS_BINARY) < 0,
+           "CMS_final");
+
+       if (save_cms) {
+               char *cms_name;
+
+               ERR(asprintf(&cms_name, "%s.p7s", module_name) < 0, "asprintf");
+               b = BIO_new_file(cms_name, "wb");
+               ERR(!b, "%s", cms_name);
+               ERR(i2d_CMS_bio_stream(b, cms, NULL, 0) < 0, "%s", cms_name);
+               BIO_free(b);
+       }
+
+       if (sign_only)
+               return 0;
+
+       /* Append the marker and the PKCS#7 message to the destination file */
+       ERR(BIO_reset(bm) < 0, "%s", module_name);
+       while ((n = BIO_read(bm, buf, sizeof(buf))),
+              n > 0) {
+               ERR(BIO_write(bd, buf, n) < 0, "%s", dest_name);
+       }
+       ERR(n < 0, "%s", module_name);
+       module_size = BIO_number_written(bd);
+
+       ERR(i2d_CMS_bio_stream(bd, cms, NULL, 0) < 0, "%s", dest_name);
+       cms_size = BIO_number_written(bd) - module_size;
+       sig_info.sig_len = htonl(cms_size);
+       ERR(BIO_write(bd, &sig_info, sizeof(sig_info)) < 0, "%s", dest_name);
+       ERR(BIO_write(bd, magic_number, sizeof(magic_number) - 1) < 0, "%s", dest_name);
+
+       ERR(BIO_free(bd) < 0, "%s", dest_name);
+
+       /* Finally, if we're signing in place, replace the original. */
+       if (replace_orig)
+               ERR(rename(dest_name, module_name) < 0, "%s", dest_name);
+
+       return 0;
+}
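The new sign-file.c writes the module image, then the DER-encoded CMS message, then the struct module_signature shown above (sig_len stored big-endian via htonl()), and finally the 28-byte magic string. A hedged userspace sketch that only checks for that trailer and reports sig_len; the file and variable names are illustrative, and a real verifier would also have to parse and check the CMS blob itself:

/* check_modsig.c - report whether a file carries an appended module signature. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

struct module_signature {
        uint8_t  algo, hash, id_type, signer_len, key_id_len;
        uint8_t  pad[3];
        uint32_t sig_len;               /* big-endian on disk */
};

static const char magic[] = "~Module signature appended~\n";

int main(int argc, char **argv)
{
        struct module_signature info;
        char tail[sizeof(magic) - 1];
        long off = -(long)sizeof(tail);
        FILE *f;

        if (argc != 2 || !(f = fopen(argv[1], "rb")))
                return 2;

        /* The magic string occupies the last 28 bytes of a signed module. */
        if (fseek(f, off, SEEK_END) || fread(tail, sizeof(tail), 1, f) != 1 ||
            memcmp(tail, magic, sizeof(tail)) != 0) {
                printf("%s: no appended signature\n", argv[1]);
                return 1;
        }

        /* struct module_signature sits immediately before the magic. */
        off -= (long)sizeof(info);
        if (fseek(f, off, SEEK_END) || fread(&info, sizeof(info), 1, f) != 1)
                return 2;
        printf("%s: id_type=%u sig_len=%u bytes\n", argv[1],
               (unsigned)info.id_type, (unsigned)ntohl(info.sig_len));
        fclose(f);
        return 0;
}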
diff --git a/scripts/stackdelta b/scripts/stackdelta
new file mode 100755 (executable)
index 0000000..48eabf2
--- /dev/null
@@ -0,0 +1,59 @@
+#!/usr/bin/perl
+
+# Read two files produced by the stackusage script, and show the
+# delta between them.
+#
+# Currently, only shows changes for functions listed in both files. We
+# could add an option to show also functions which have vanished or
+# appeared (which would often be due to gcc making other inlining
+# decisions).
+#
+# Another possible option would be a minimum absolute value for the
+# delta.
+#
+# A third possibility is for sorting by delta, but that can be
+# achieved by piping to sort -k5,5g.
+
+sub read_stack_usage_file {
+    my %su;
+    my $f = shift;
+    open(my $fh, '<', $f)
+       or die "cannot open $f: $!";
+    while (<$fh>) {
+       chomp;
+       my ($file, $func, $size, $type) = split;
+       # Old versions of gcc (at least 4.7) have an annoying quirk in
+       # that a (static) function whose name has been changed into
+       # for example ext4_find_unwritten_pgoff.isra.11 will show up
+       # in the .su file with a name of just "11". Since such a
+       # numeric suffix is likely to change across different
+       # commits/compilers/.configs or whatever else we're trying to
+       # tweak, we can't really track those functions, so we just
+       # silently skip them.
+       #
+       # Newer gcc (at least 5.0) report the full name, so again,
+       # since the suffix is likely to change, we strip it.
+       next if $func =~ m/^[0-9]+$/;
+       $func =~ s/\..*$//;
+       # Line numbers are likely to change; strip those.
+       $file =~ s/:[0-9]+$//;
+       $su{"${file}\t${func}"} = {size => $size, type => $type};
+    }
+    close($fh);
+    return \%su;
+}
+
+@ARGV == 2
+    or die "usage: $0 <old> <new>";
+
+my $old = read_stack_usage_file($ARGV[0]);
+my $new = read_stack_usage_file($ARGV[1]);
+my @common = sort grep {exists $new->{$_}} keys %$old;
+for (@common) {
+    my $x = $old->{$_}{size};
+    my $y = $new->{$_}{size};
+    my $delta = $y - $x;
+    if ($delta) {
+       printf "%s\t%d\t%d\t%+d\n", $_, $x, $y, $delta;
+    }
+}
diff --git a/scripts/stackusage b/scripts/stackusage
new file mode 100755 (executable)
index 0000000..8cf2664
--- /dev/null
@@ -0,0 +1,33 @@
+#!/bin/sh
+
+outfile=""
+now=`date +%s`
+
+while [ $# -gt 0 ]
+do
+    case "$1" in
+        -o)
+           outfile="$2"
+           shift 2;;
+       -h)
+           echo "usage: $0 [-o outfile] <make options/args>"
+           exit 0;;
+       *)  break;;
+    esac
+done
+
+if [ -z "$outfile" ]
+then
+    outfile=`mktemp --tmpdir stackusage.$$.XXXX`
+fi
+
+KCFLAGS="${KCFLAGS} -fstack-usage" make "$@"
+
+# Prepend directory name to file names, remove column information,
+# make file:line/function/size/type properly tab-separated.
+find . -name '*.su' -newermt "@${now}" -print |                     \
+    xargs perl -MFile::Basename -pe                                 \
+        '$d = dirname($ARGV); s#([^:]+:[0-9]+):[0-9]+:#$d/$1\t#;' | \
+    sort -k3,3nr > "${outfile}"
+
+echo "$0: output written to ${outfile}"
index c0a932dff3290c4675688f5ce865c1f933a88b0d..8e5aee6d9da2be1d3a4c255947d07663c14c29d3 100755 (executable)
@@ -170,7 +170,9 @@ exuberant()
        --regex-c='/^SYSCALL_DEFINE[[:digit:]]?\(([^,)]*).*/sys_\1/' \
        --regex-c='/^COMPAT_SYSCALL_DEFINE[[:digit:]]?\(([^,)]*).*/compat_sys_\1/' \
        --regex-c++='/^TRACE_EVENT\(([^,)]*).*/trace_\1/'               \
+       --regex-c++='/^TRACE_EVENT\(([^,)]*).*/trace_\1_rcuidle/'       \
        --regex-c++='/^DEFINE_EVENT\([^,)]*, *([^,)]*).*/trace_\1/'     \
+       --regex-c++='/^DEFINE_EVENT\([^,)]*, *([^,)]*).*/trace_\1_rcuidle/' \
        --regex-c++='/PAGEFLAG\(([^,)]*).*/Page\1/'                     \
        --regex-c++='/PAGEFLAG\(([^,)]*).*/SetPage\1/'                  \
        --regex-c++='/PAGEFLAG\(([^,)]*).*/ClearPage\1/'                \
@@ -233,7 +235,9 @@ emacs()
        --regex='/^SYSCALL_DEFINE[0-9]?(\([^,)]*\).*/sys_\1/'   \
        --regex='/^COMPAT_SYSCALL_DEFINE[0-9]?(\([^,)]*\).*/compat_sys_\1/' \
        --regex='/^TRACE_EVENT(\([^,)]*\).*/trace_\1/'          \
+       --regex='/^TRACE_EVENT(\([^,)]*\).*/trace_\1_rcuidle/'  \
        --regex='/^DEFINE_EVENT([^,)]*, *\([^,)]*\).*/trace_\1/' \
+       --regex='/^DEFINE_EVENT([^,)]*, *\([^,)]*\).*/trace_\1_rcuidle/' \
        --regex='/PAGEFLAG(\([^,)]*\).*/Page\1/'                        \
        --regex='/PAGEFLAG(\([^,)]*\).*/SetPage\1/'             \
        --regex='/PAGEFLAG(\([^,)]*\).*/ClearPage\1/'           \
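The two added --regex entries (one in the exuberant branch, one in the emacs branch) index the _rcuidle flavour that the tracepoint macros generate alongside the plain trace_<name>() caller, so a tags lookup can land on either symbol. A compressed illustration of why a single definition yields two tag names; the real TRACE_EVENT/DEFINE_EVENT machinery is far more involved, and this only mimics the naming:

#include <stdio.h>

/* One macro invocation emits both entry points, mirroring how the kernel's
 * tracepoint macros emit trace_<name>() and trace_<name>_rcuidle(). */
#define DEFINE_DEMO_EVENT(name)                                 \
        static void trace_##name(void)                          \
        { printf("trace_" #name "\n"); }                        \
        static void trace_##name##_rcuidle(void)                \
        { printf("trace_" #name "_rcuidle\n"); }

DEFINE_DEMO_EVENT(sched_switch_demo)

int main(void)
{
        trace_sched_switch_demo();
        trace_sched_switch_demo_rcuidle();
        return 0;
}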
index bf4ec46474b6319ec6ca035b44d21439f98ea461..e45237897b435f8fbf64950f580b9730f79eb1f4 100644 (file)
@@ -132,7 +132,6 @@ choice
        default DEFAULT_SECURITY_SMACK if SECURITY_SMACK
        default DEFAULT_SECURITY_TOMOYO if SECURITY_TOMOYO
        default DEFAULT_SECURITY_APPARMOR if SECURITY_APPARMOR
-       default DEFAULT_SECURITY_YAMA if SECURITY_YAMA
        default DEFAULT_SECURITY_DAC
 
        help
@@ -151,9 +150,6 @@ choice
        config DEFAULT_SECURITY_APPARMOR
                bool "AppArmor" if SECURITY_APPARMOR=y
 
-       config DEFAULT_SECURITY_YAMA
-               bool "Yama" if SECURITY_YAMA=y
-
        config DEFAULT_SECURITY_DAC
                bool "Unix Discretionary Access Controls"
 
@@ -165,7 +161,6 @@ config DEFAULT_SECURITY
        default "smack" if DEFAULT_SECURITY_SMACK
        default "tomoyo" if DEFAULT_SECURITY_TOMOYO
        default "apparmor" if DEFAULT_SECURITY_APPARMOR
-       default "yama" if DEFAULT_SECURITY_YAMA
        default "" if DEFAULT_SECURITY_DAC
 
 endmenu
index 73455089feef3a11af0d88880fa6423070c50ea5..03c1652c9a1f593669b6d9e3a0bd2e65e47da239 100644 (file)
@@ -401,7 +401,7 @@ static bool verify_new_ex(struct dev_cgroup *dev_cgroup,
        bool match = false;
 
        RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&
-                        lockdep_is_held(&devcgroup_mutex),
+                        !lockdep_is_held(&devcgroup_mutex),
                         "device_cgroup:verify_new_ex called without proper synchronization");
 
        if (dev_cgroup->behavior == DEVCG_DEFAULT_ALLOW) {
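RCU_LOCKDEP_WARN() complains when its condition evaluates true, so the corrected expression only fires when neither rcu_read_lock() nor devcgroup_mutex is held; the previous form negated only the RCU test and therefore warned precisely on callers that legitimately held the mutex. A plain userspace truth-table sketch of the intended predicate (names are illustrative, not kernel API):

#include <stdio.h>

/* Walking the exception list is safe under either rcu_read_lock() or
 * devcgroup_mutex; only the case where both are absent should warn. */
static int should_warn(int rcu_held, int mutex_held)
{
        return !rcu_held && !mutex_held;
}

int main(void)
{
        for (int rcu = 0; rcu <= 1; rcu++)
                for (int mtx = 0; mtx <= 1; mtx++)
                        printf("rcu=%d mutex=%d -> warn=%d\n",
                               rcu, mtx, should_warn(rcu, mtx));
        return 0;
}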
index 4ed98107ace31c3bfef3dc87f6c58e0e91a581cf..cccbf3068cdca800eb53e49caf1d29de9d93b035 100644 (file)
@@ -245,6 +245,21 @@ static void dump_common_audit_data(struct audit_buffer *ab,
                }
                break;
        }
+       case LSM_AUDIT_DATA_IOCTL_OP: {
+               struct inode *inode;
+
+               audit_log_d_path(ab, " path=", &a->u.op->path);
+
+               inode = a->u.op->path.dentry->d_inode;
+               if (inode) {
+                       audit_log_format(ab, " dev=");
+                       audit_log_untrustedstring(ab, inode->i_sb->s_id);
+                       audit_log_format(ab, " ino=%lu", inode->i_ino);
+               }
+
+               audit_log_format(ab, " ioctlcmd=%hx", a->u.op->cmd);
+               break;
+       }
        case LSM_AUDIT_DATA_DENTRY: {
                struct inode *inode;
 
index 75b85fdc4e9789bde2fe59b56a29a89ee4020c28..46f405ce6b0fbab600ea542a6921bea9227b0ef5 100644 (file)
@@ -56,18 +56,13 @@ int __init security_init(void)
        pr_info("Security Framework initialized\n");
 
        /*
-        * Always load the capability module.
+        * Load minor LSMs, with the capability module always first.
         */
        capability_add_hooks();
-#ifdef CONFIG_SECURITY_YAMA_STACKED
-       /*
-        * If Yama is configured for stacking load it next.
-        */
        yama_add_hooks();
-#endif
+
        /*
-        * Load the chosen module if there is one.
-        * This will also find yama if it is stacking
+        * Load all the remaining security modules.
         */
        do_security_initcalls();
 
index 0b122b1421a9dcc7dfd26ac6f80d00d1c6a0d55e..e60c79de13e1c74ea6129cfb5431d5d2415cdc2d 100644 (file)
@@ -22,6 +22,7 @@
 #include <linux/init.h>
 #include <linux/skbuff.h>
 #include <linux/percpu.h>
+#include <linux/list.h>
 #include <net/sock.h>
 #include <linux/un.h>
 #include <net/af_unix.h>
@@ -48,6 +49,7 @@ struct avc_entry {
        u32                     tsid;
        u16                     tclass;
        struct av_decision      avd;
+       struct avc_xperms_node  *xp_node;
 };
 
 struct avc_node {
@@ -56,6 +58,16 @@ struct avc_node {
        struct rcu_head         rhead;
 };
 
+struct avc_xperms_decision_node {
+       struct extended_perms_decision xpd;
+       struct list_head xpd_list; /* list of extended_perms_decision */
+};
+
+struct avc_xperms_node {
+       struct extended_perms xp;
+       struct list_head xpd_head; /* list head of extended_perms_decision */
+};
+
 struct avc_cache {
        struct hlist_head       slots[AVC_CACHE_SLOTS]; /* head for avc_node->list */
        spinlock_t              slots_lock[AVC_CACHE_SLOTS]; /* lock for writes */
@@ -80,6 +92,9 @@ DEFINE_PER_CPU(struct avc_cache_stats, avc_cache_stats) = { 0 };
 static struct avc_cache avc_cache;
 static struct avc_callback_node *avc_callbacks;
 static struct kmem_cache *avc_node_cachep;
+static struct kmem_cache *avc_xperms_data_cachep;
+static struct kmem_cache *avc_xperms_decision_cachep;
+static struct kmem_cache *avc_xperms_cachep;
 
 static inline int avc_hash(u32 ssid, u32 tsid, u16 tclass)
 {
@@ -101,6 +116,7 @@ static void avc_dump_av(struct audit_buffer *ab, u16 tclass, u32 av)
                return;
        }
 
+       BUG_ON(!tclass || tclass >= ARRAY_SIZE(secclass_map));
        perms = secclass_map[tclass-1].perms;
 
        audit_log_format(ab, " {");
@@ -149,7 +165,7 @@ static void avc_dump_query(struct audit_buffer *ab, u32 ssid, u32 tsid, u16 tcla
                kfree(scontext);
        }
 
-       BUG_ON(tclass >= ARRAY_SIZE(secclass_map));
+       BUG_ON(!tclass || tclass >= ARRAY_SIZE(secclass_map));
        audit_log_format(ab, " tclass=%s", secclass_map[tclass-1].name);
 }
 
@@ -170,7 +186,17 @@ void __init avc_init(void)
        atomic_set(&avc_cache.lru_hint, 0);
 
        avc_node_cachep = kmem_cache_create("avc_node", sizeof(struct avc_node),
-                                            0, SLAB_PANIC, NULL);
+                                       0, SLAB_PANIC, NULL);
+       avc_xperms_cachep = kmem_cache_create("avc_xperms_node",
+                                       sizeof(struct avc_xperms_node),
+                                       0, SLAB_PANIC, NULL);
+       avc_xperms_decision_cachep = kmem_cache_create(
+                                       "avc_xperms_decision_node",
+                                       sizeof(struct avc_xperms_decision_node),
+                                       0, SLAB_PANIC, NULL);
+       avc_xperms_data_cachep = kmem_cache_create("avc_xperms_data",
+                                       sizeof(struct extended_perms_data),
+                                       0, SLAB_PANIC, NULL);
 
        audit_log(current->audit_context, GFP_KERNEL, AUDIT_KERNEL, "AVC INITIALIZED\n");
 }
@@ -205,9 +231,261 @@ int avc_get_hash_stats(char *page)
                         slots_used, AVC_CACHE_SLOTS, max_chain_len);
 }
 
+/*
+ * using a linked list for extended_perms_decision lookup because the list is
+ * always small. i.e. less than 5, typically 1
+ */
+static struct extended_perms_decision *avc_xperms_decision_lookup(u8 driver,
+                                       struct avc_xperms_node *xp_node)
+{
+       struct avc_xperms_decision_node *xpd_node;
+
+       list_for_each_entry(xpd_node, &xp_node->xpd_head, xpd_list) {
+               if (xpd_node->xpd.driver == driver)
+                       return &xpd_node->xpd;
+       }
+       return NULL;
+}
+
+static inline unsigned int
+avc_xperms_has_perm(struct extended_perms_decision *xpd,
+                                       u8 perm, u8 which)
+{
+       unsigned int rc = 0;
+
+       if ((which == XPERMS_ALLOWED) &&
+                       (xpd->used & XPERMS_ALLOWED))
+               rc = security_xperm_test(xpd->allowed->p, perm);
+       else if ((which == XPERMS_AUDITALLOW) &&
+                       (xpd->used & XPERMS_AUDITALLOW))
+               rc = security_xperm_test(xpd->auditallow->p, perm);
+       else if ((which == XPERMS_DONTAUDIT) &&
+                       (xpd->used & XPERMS_DONTAUDIT))
+               rc = security_xperm_test(xpd->dontaudit->p, perm);
+       return rc;
+}
+
+static void avc_xperms_allow_perm(struct avc_xperms_node *xp_node,
+                               u8 driver, u8 perm)
+{
+       struct extended_perms_decision *xpd;
+       security_xperm_set(xp_node->xp.drivers.p, driver);
+       xpd = avc_xperms_decision_lookup(driver, xp_node);
+       if (xpd && xpd->allowed)
+               security_xperm_set(xpd->allowed->p, perm);
+}
+
+static void avc_xperms_decision_free(struct avc_xperms_decision_node *xpd_node)
+{
+       struct extended_perms_decision *xpd;
+
+       xpd = &xpd_node->xpd;
+       if (xpd->allowed)
+               kmem_cache_free(avc_xperms_data_cachep, xpd->allowed);
+       if (xpd->auditallow)
+               kmem_cache_free(avc_xperms_data_cachep, xpd->auditallow);
+       if (xpd->dontaudit)
+               kmem_cache_free(avc_xperms_data_cachep, xpd->dontaudit);
+       kmem_cache_free(avc_xperms_decision_cachep, xpd_node);
+}
+
+static void avc_xperms_free(struct avc_xperms_node *xp_node)
+{
+       struct avc_xperms_decision_node *xpd_node, *tmp;
+
+       if (!xp_node)
+               return;
+
+       list_for_each_entry_safe(xpd_node, tmp, &xp_node->xpd_head, xpd_list) {
+               list_del(&xpd_node->xpd_list);
+               avc_xperms_decision_free(xpd_node);
+       }
+       kmem_cache_free(avc_xperms_cachep, xp_node);
+}
+
+static void avc_copy_xperms_decision(struct extended_perms_decision *dest,
+                                       struct extended_perms_decision *src)
+{
+       dest->driver = src->driver;
+       dest->used = src->used;
+       if (dest->used & XPERMS_ALLOWED)
+               memcpy(dest->allowed->p, src->allowed->p,
+                               sizeof(src->allowed->p));
+       if (dest->used & XPERMS_AUDITALLOW)
+               memcpy(dest->auditallow->p, src->auditallow->p,
+                               sizeof(src->auditallow->p));
+       if (dest->used & XPERMS_DONTAUDIT)
+               memcpy(dest->dontaudit->p, src->dontaudit->p,
+                               sizeof(src->dontaudit->p));
+}
+
+/*
+ * similar to avc_copy_xperms_decision, but only copy decision
+ * information relevant to this perm
+ */
+static inline void avc_quick_copy_xperms_decision(u8 perm,
+                       struct extended_perms_decision *dest,
+                       struct extended_perms_decision *src)
+{
+       /*
+        * compute index of the u32 of the 256 bits (8 u32s) that contain this
+        * command permission
+        */
+       u8 i = perm >> 5;
+
+       dest->used = src->used;
+       if (dest->used & XPERMS_ALLOWED)
+               dest->allowed->p[i] = src->allowed->p[i];
+       if (dest->used & XPERMS_AUDITALLOW)
+               dest->auditallow->p[i] = src->auditallow->p[i];
+       if (dest->used & XPERMS_DONTAUDIT)
+               dest->dontaudit->p[i] = src->dontaudit->p[i];
+}
+
+static struct avc_xperms_decision_node
+               *avc_xperms_decision_alloc(u8 which)
+{
+       struct avc_xperms_decision_node *xpd_node;
+       struct extended_perms_decision *xpd;
+
+       xpd_node = kmem_cache_zalloc(avc_xperms_decision_cachep,
+                               GFP_ATOMIC | __GFP_NOMEMALLOC);
+       if (!xpd_node)
+               return NULL;
+
+       xpd = &xpd_node->xpd;
+       if (which & XPERMS_ALLOWED) {
+               xpd->allowed = kmem_cache_zalloc(avc_xperms_data_cachep,
+                                               GFP_ATOMIC | __GFP_NOMEMALLOC);
+               if (!xpd->allowed)
+                       goto error;
+       }
+       if (which & XPERMS_AUDITALLOW) {
+               xpd->auditallow = kmem_cache_zalloc(avc_xperms_data_cachep,
+                                               GFP_ATOMIC | __GFP_NOMEMALLOC);
+               if (!xpd->auditallow)
+                       goto error;
+       }
+       if (which & XPERMS_DONTAUDIT) {
+               xpd->dontaudit = kmem_cache_zalloc(avc_xperms_data_cachep,
+                                               GFP_ATOMIC | __GFP_NOMEMALLOC);
+               if (!xpd->dontaudit)
+                       goto error;
+       }
+       return xpd_node;
+error:
+       avc_xperms_decision_free(xpd_node);
+       return NULL;
+}
+
+static int avc_add_xperms_decision(struct avc_node *node,
+                       struct extended_perms_decision *src)
+{
+       struct avc_xperms_decision_node *dest_xpd;
+
+       node->ae.xp_node->xp.len++;
+       dest_xpd = avc_xperms_decision_alloc(src->used);
+       if (!dest_xpd)
+               return -ENOMEM;
+       avc_copy_xperms_decision(&dest_xpd->xpd, src);
+       list_add(&dest_xpd->xpd_list, &node->ae.xp_node->xpd_head);
+       return 0;
+}
+
+static struct avc_xperms_node *avc_xperms_alloc(void)
+{
+       struct avc_xperms_node *xp_node;
+
+       xp_node = kmem_cache_zalloc(avc_xperms_cachep,
+                               GFP_ATOMIC|__GFP_NOMEMALLOC);
+       if (!xp_node)
+               return xp_node;
+       INIT_LIST_HEAD(&xp_node->xpd_head);
+       return xp_node;
+}
+
+static int avc_xperms_populate(struct avc_node *node,
+                               struct avc_xperms_node *src)
+{
+       struct avc_xperms_node *dest;
+       struct avc_xperms_decision_node *dest_xpd;
+       struct avc_xperms_decision_node *src_xpd;
+
+       if (src->xp.len == 0)
+               return 0;
+       dest = avc_xperms_alloc();
+       if (!dest)
+               return -ENOMEM;
+
+       memcpy(dest->xp.drivers.p, src->xp.drivers.p, sizeof(dest->xp.drivers.p));
+       dest->xp.len = src->xp.len;
+
+       /* for each source xpd allocate a destination xpd and copy */
+       list_for_each_entry(src_xpd, &src->xpd_head, xpd_list) {
+               dest_xpd = avc_xperms_decision_alloc(src_xpd->xpd.used);
+               if (!dest_xpd)
+                       goto error;
+               avc_copy_xperms_decision(&dest_xpd->xpd, &src_xpd->xpd);
+               list_add(&dest_xpd->xpd_list, &dest->xpd_head);
+       }
+       node->ae.xp_node = dest;
+       return 0;
+error:
+       avc_xperms_free(dest);
+       return -ENOMEM;
+
+}
+
+static inline u32 avc_xperms_audit_required(u32 requested,
+                                       struct av_decision *avd,
+                                       struct extended_perms_decision *xpd,
+                                       u8 perm,
+                                       int result,
+                                       u32 *deniedp)
+{
+       u32 denied, audited;
+
+       denied = requested & ~avd->allowed;
+       if (unlikely(denied)) {
+               audited = denied & avd->auditdeny;
+               if (audited && xpd) {
+                       if (avc_xperms_has_perm(xpd, perm, XPERMS_DONTAUDIT))
+                               audited &= ~requested;
+               }
+       } else if (result) {
+               audited = denied = requested;
+       } else {
+               audited = requested & avd->auditallow;
+               if (audited && xpd) {
+                       if (!avc_xperms_has_perm(xpd, perm, XPERMS_AUDITALLOW))
+                               audited &= ~requested;
+               }
+       }
+
+       *deniedp = denied;
+       return audited;
+}
+
+static inline int avc_xperms_audit(u32 ssid, u32 tsid, u16 tclass,
+                               u32 requested, struct av_decision *avd,
+                               struct extended_perms_decision *xpd,
+                               u8 perm, int result,
+                               struct common_audit_data *ad)
+{
+       u32 audited, denied;
+
+       audited = avc_xperms_audit_required(
+                       requested, avd, xpd, perm, result, &denied);
+       if (likely(!audited))
+               return 0;
+       return slow_avc_audit(ssid, tsid, tclass, requested,
+                       audited, denied, result, ad, 0);
+}
+
 static void avc_node_free(struct rcu_head *rhead)
 {
        struct avc_node *node = container_of(rhead, struct avc_node, rhead);
+       avc_xperms_free(node->ae.xp_node);
        kmem_cache_free(avc_node_cachep, node);
        avc_cache_stats_incr(frees);
 }
@@ -221,6 +499,7 @@ static void avc_node_delete(struct avc_node *node)
 
 static void avc_node_kill(struct avc_node *node)
 {
+       avc_xperms_free(node->ae.xp_node);
        kmem_cache_free(avc_node_cachep, node);
        avc_cache_stats_incr(frees);
        atomic_dec(&avc_cache.active_nodes);
@@ -367,6 +646,7 @@ static int avc_latest_notif_update(int seqno, int is_insert)
  * @tsid: target security identifier
  * @tclass: target security class
  * @avd: resulting av decision
+ * @xp_node: resulting extended permissions
  *
  * Insert an AVC entry for the SID pair
  * (@ssid, @tsid) and class @tclass.
@@ -378,7 +658,9 @@ static int avc_latest_notif_update(int seqno, int is_insert)
  * the access vectors into a cache entry, returns
  * avc_node inserted. Otherwise, this function returns NULL.
  */
-static struct avc_node *avc_insert(u32 ssid, u32 tsid, u16 tclass, struct av_decision *avd)
+static struct avc_node *avc_insert(u32 ssid, u32 tsid, u16 tclass,
+                               struct av_decision *avd,
+                               struct avc_xperms_node *xp_node)
 {
        struct avc_node *pos, *node = NULL;
        int hvalue;
@@ -391,10 +673,15 @@ static struct avc_node *avc_insert(u32 ssid, u32 tsid, u16 tclass, struct av_dec
        if (node) {
                struct hlist_head *head;
                spinlock_t *lock;
+               int rc = 0;
 
                hvalue = avc_hash(ssid, tsid, tclass);
                avc_node_populate(node, ssid, tsid, tclass, avd);
-
+               rc = avc_xperms_populate(node, xp_node);
+               if (rc) {
+                       kmem_cache_free(avc_node_cachep, node);
+                       return NULL;
+               }
                head = &avc_cache.slots[hvalue];
                lock = &avc_cache.slots_lock[hvalue];
 
@@ -523,14 +810,17 @@ out:
  * @perms : Permission mask bits
  * @ssid,@tsid,@tclass : identifier of an AVC entry
  * @seqno : sequence number when decision was made
+ * @xpd: extended_perms_decision to be added to the node
  *
  * If a valid AVC entry doesn't exist, this function returns -ENOENT.
  * If an internal kmalloc() returns NULL, this function returns -ENOMEM.
  * Otherwise, this function updates the AVC entry. The original AVC entry
  * object will be released later by RCU.
  */
-static int avc_update_node(u32 event, u32 perms, u32 ssid, u32 tsid, u16 tclass,
-                          u32 seqno)
+static int avc_update_node(u32 event, u32 perms, u8 driver, u8 xperm, u32 ssid,
+                       u32 tsid, u16 tclass, u32 seqno,
+                       struct extended_perms_decision *xpd,
+                       u32 flags)
 {
        int hvalue, rc = 0;
        unsigned long flag;
@@ -574,9 +864,19 @@ static int avc_update_node(u32 event, u32 perms, u32 ssid, u32 tsid, u16 tclass,
 
        avc_node_populate(node, ssid, tsid, tclass, &orig->ae.avd);
 
+       if (orig->ae.xp_node) {
+               rc = avc_xperms_populate(node, orig->ae.xp_node);
+               if (rc) {
+                       kmem_cache_free(avc_node_cachep, node);
+                       goto out_unlock;
+               }
+       }
+
        switch (event) {
        case AVC_CALLBACK_GRANT:
                node->ae.avd.allowed |= perms;
+               if (node->ae.xp_node && (flags & AVC_EXTENDED_PERMS))
+                       avc_xperms_allow_perm(node->ae.xp_node, driver, xperm);
                break;
        case AVC_CALLBACK_TRY_REVOKE:
        case AVC_CALLBACK_REVOKE:
@@ -594,6 +894,9 @@ static int avc_update_node(u32 event, u32 perms, u32 ssid, u32 tsid, u16 tclass,
        case AVC_CALLBACK_AUDITDENY_DISABLE:
                node->ae.avd.auditdeny &= ~perms;
                break;
+       case AVC_CALLBACK_ADD_XPERMS:
+               avc_add_xperms_decision(node, xpd);
+               break;
        }
        avc_node_replace(node, orig);
 out_unlock:
@@ -665,18 +968,20 @@ int avc_ss_reset(u32 seqno)
  * results in a bigger stack frame.
  */
 static noinline struct avc_node *avc_compute_av(u32 ssid, u32 tsid,
-                        u16 tclass, struct av_decision *avd)
+                        u16 tclass, struct av_decision *avd,
+                        struct avc_xperms_node *xp_node)
 {
        rcu_read_unlock();
-       security_compute_av(ssid, tsid, tclass, avd);
+       INIT_LIST_HEAD(&xp_node->xpd_head);
+       security_compute_av(ssid, tsid, tclass, avd, &xp_node->xp);
        rcu_read_lock();
-       return avc_insert(ssid, tsid, tclass, avd);
+       return avc_insert(ssid, tsid, tclass, avd, xp_node);
 }
 
 static noinline int avc_denied(u32 ssid, u32 tsid,
-                        u16 tclass, u32 requested,
-                        unsigned flags,
-                        struct av_decision *avd)
+                               u16 tclass, u32 requested,
+                               u8 driver, u8 xperm, unsigned flags,
+                               struct av_decision *avd)
 {
        if (flags & AVC_STRICT)
                return -EACCES;
@@ -684,11 +989,91 @@ static noinline int avc_denied(u32 ssid, u32 tsid,
        if (selinux_enforcing && !(avd->flags & AVD_FLAGS_PERMISSIVE))
                return -EACCES;
 
-       avc_update_node(AVC_CALLBACK_GRANT, requested, ssid,
-                               tsid, tclass, avd->seqno);
+       avc_update_node(AVC_CALLBACK_GRANT, requested, driver, xperm, ssid,
+                               tsid, tclass, avd->seqno, NULL, flags);
        return 0;
 }
 
+/*
+ * The avc extended permissions logic adds an additional 256 bits of
+ * permissions to an avc node when extended permissions for that node are
+ * specified in the avtab. If the additional 256 permissions are not adequate,
+ * as is the case with ioctls, then multiple nodes may be chained together and
+ * the driver field is used to specify which set contains the permission.
+ */
+int avc_has_extended_perms(u32 ssid, u32 tsid, u16 tclass, u32 requested,
+                       u8 driver, u8 xperm, struct common_audit_data *ad)
+{
+       struct avc_node *node;
+       struct av_decision avd;
+       u32 denied;
+       struct extended_perms_decision local_xpd;
+       struct extended_perms_decision *xpd = NULL;
+       struct extended_perms_data allowed;
+       struct extended_perms_data auditallow;
+       struct extended_perms_data dontaudit;
+       struct avc_xperms_node local_xp_node;
+       struct avc_xperms_node *xp_node;
+       int rc = 0, rc2;
+
+       xp_node = &local_xp_node;
+       BUG_ON(!requested);
+
+       rcu_read_lock();
+
+       node = avc_lookup(ssid, tsid, tclass);
+       if (unlikely(!node)) {
+               node = avc_compute_av(ssid, tsid, tclass, &avd, xp_node);
+       } else {
+               memcpy(&avd, &node->ae.avd, sizeof(avd));
+               xp_node = node->ae.xp_node;
+       }
+       /* if extended permissions are not defined, only consider av_decision */
+       if (!xp_node || !xp_node->xp.len)
+               goto decision;
+
+       local_xpd.allowed = &allowed;
+       local_xpd.auditallow = &auditallow;
+       local_xpd.dontaudit = &dontaudit;
+
+       xpd = avc_xperms_decision_lookup(driver, xp_node);
+       if (unlikely(!xpd)) {
+               /*
+                * Compute the extended_perms_decision only if the driver
+                * is flagged
+                */
+               if (!security_xperm_test(xp_node->xp.drivers.p, driver)) {
+                       avd.allowed &= ~requested;
+                       goto decision;
+               }
+               rcu_read_unlock();
+               security_compute_xperms_decision(ssid, tsid, tclass, driver,
+                                               &local_xpd);
+               rcu_read_lock();
+               avc_update_node(AVC_CALLBACK_ADD_XPERMS, requested, driver, xperm,
+                               ssid, tsid, tclass, avd.seqno, &local_xpd, 0);
+       } else {
+               avc_quick_copy_xperms_decision(xperm, &local_xpd, xpd);
+       }
+       xpd = &local_xpd;
+
+       if (!avc_xperms_has_perm(xpd, xperm, XPERMS_ALLOWED))
+               avd.allowed &= ~requested;
+
+decision:
+       denied = requested & ~(avd.allowed);
+       if (unlikely(denied))
+               rc = avc_denied(ssid, tsid, tclass, requested, driver, xperm,
+                               AVC_EXTENDED_PERMS, &avd);
+
+       rcu_read_unlock();
+
+       rc2 = avc_xperms_audit(ssid, tsid, tclass, requested,
+                       &avd, xpd, xperm, rc, ad);
+       if (rc2)
+               return rc2;
+       return rc;
+}
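
Editorial aside (not part of the patch): the key effect of the xperm lookup above is that a command whose bit is not in the allowed set has the requested bits stripped from avd.allowed, so it is denied through the ordinary path. A minimal sketch with hypothetical values (the FILE__IOCTL bit shown is made up for illustration):

#include <stdio.h>

int main(void)
{
        unsigned int requested = 0x0800;   /* hypothetical FILE__IOCTL bit */
        unsigned int allowed   = 0x0800;   /* base allow rule grants ioctl */
        int xperm_allowed      = 0;        /* the specific command is not whitelisted */

        if (!xperm_allowed)
                allowed &= ~requested;     /* mirrors "avd.allowed &= ~requested" above */

        printf("denied = %#x\n", requested & ~allowed);   /* 0x800: request is denied */
        return 0;
}
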
 
 /**
  * avc_has_perm_noaudit - Check permissions but perform no auditing.
@@ -716,6 +1101,7 @@ inline int avc_has_perm_noaudit(u32 ssid, u32 tsid,
                         struct av_decision *avd)
 {
        struct avc_node *node;
+       struct avc_xperms_node xp_node;
        int rc = 0;
        u32 denied;
 
@@ -725,13 +1111,13 @@ inline int avc_has_perm_noaudit(u32 ssid, u32 tsid,
 
        node = avc_lookup(ssid, tsid, tclass);
        if (unlikely(!node))
-               node = avc_compute_av(ssid, tsid, tclass, avd);
+               node = avc_compute_av(ssid, tsid, tclass, avd, &xp_node);
        else
                memcpy(avd, &node->ae.avd, sizeof(*avd));
 
        denied = requested & ~(avd->allowed);
        if (unlikely(denied))
-               rc = avc_denied(ssid, tsid, tclass, requested, flags, avd);
+               rc = avc_denied(ssid, tsid, tclass, requested, 0, 0, flags, avd);
 
        rcu_read_unlock();
        return rc;
index cdf4c589a3914bbc7d315b1e55fb1264f4be0309..e4369d86e5885d9b00e97cd7cd901810f2c1c237 100644 (file)
@@ -254,10 +254,21 @@ static void inode_free_security(struct inode *inode)
        struct inode_security_struct *isec = inode->i_security;
        struct superblock_security_struct *sbsec = inode->i_sb->s_security;
 
-       spin_lock(&sbsec->isec_lock);
-       if (!list_empty(&isec->list))
+       /*
+        * As not all inode security structures are in a list, we check for
+        * an empty list outside the lock so that we don't waste time taking
+        * a lock for nothing.
+        *
+        * The list_del_init() function can be safely called more than once.
+        * It should not be possible for this function to be called with
+        * concurrent list_add(), but for better safety against future changes
+        * in the code, we use list_empty_careful() here.
+        */
+       if (!list_empty_careful(&isec->list)) {
+               spin_lock(&sbsec->isec_lock);
                list_del_init(&isec->list);
-       spin_unlock(&sbsec->isec_lock);
+               spin_unlock(&sbsec->isec_lock);
+       }
 
        /*
         * The inode may still be referenced in a path walk and
@@ -1698,6 +1709,32 @@ out:
        return rc;
 }
 
+/*
+ * Determine the label for an inode that might be unioned.
+ */
+static int selinux_determine_inode_label(const struct inode *dir,
+                                        const struct qstr *name,
+                                        u16 tclass,
+                                        u32 *_new_isid)
+{
+       const struct superblock_security_struct *sbsec = dir->i_sb->s_security;
+       const struct inode_security_struct *dsec = dir->i_security;
+       const struct task_security_struct *tsec = current_security();
+
+       if ((sbsec->flags & SE_SBINITIALIZED) &&
+           (sbsec->behavior == SECURITY_FS_USE_MNTPOINT)) {
+               *_new_isid = sbsec->mntpoint_sid;
+       } else if ((sbsec->flags & SBLABEL_MNT) &&
+                  tsec->create_sid) {
+               *_new_isid = tsec->create_sid;
+       } else {
+               return security_transition_sid(tsec->sid, dsec->sid, tclass,
+                                              name, _new_isid);
+       }
+
+       return 0;
+}
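
Editorial aside (not part of the patch): the helper above encodes a fixed precedence for choosing a new inode label: a mountpoint label first, then a caller-supplied create SID on a filesystem that supports labeling, then the computed type transition. A minimal sketch with hypothetical SID values:

#include <stdio.h>

static unsigned int pick_label(int use_mntpoint, unsigned int mntpoint_sid,
                               int sb_labelable, unsigned int create_sid,
                               unsigned int transition_sid)
{
        if (use_mntpoint)                  /* SECURITY_FS_USE_MNTPOINT */
                return mntpoint_sid;
        if (sb_labelable && create_sid)    /* SBLABEL_MNT && tsec->create_sid */
                return create_sid;
        return transition_sid;             /* security_transition_sid() result */
}

int main(void)
{
        /* no mountpoint label, no create SID: fall back to the transition SID */
        printf("new sid = %u\n", pick_label(0, 100, 1, 0, 300));
        return 0;
}
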
+
 /* Check whether a task can create a file. */
 static int may_create(struct inode *dir,
                      struct dentry *dentry,
@@ -1714,7 +1751,6 @@ static int may_create(struct inode *dir,
        sbsec = dir->i_sb->s_security;
 
        sid = tsec->sid;
-       newsid = tsec->create_sid;
 
        ad.type = LSM_AUDIT_DATA_DENTRY;
        ad.u.dentry = dentry;
@@ -1725,12 +1761,10 @@ static int may_create(struct inode *dir,
        if (rc)
                return rc;
 
-       if (!newsid || !(sbsec->flags & SBLABEL_MNT)) {
-               rc = security_transition_sid(sid, dsec->sid, tclass,
-                                            &dentry->d_name, &newsid);
-               if (rc)
-                       return rc;
-       }
+       rc = selinux_determine_inode_label(dir, &dentry->d_name, tclass,
+                                          &newsid);
+       if (rc)
+               return rc;
 
        rc = avc_has_perm(sid, newsid, tclass, FILE__CREATE, &ad);
        if (rc)
@@ -2704,32 +2738,14 @@ static int selinux_dentry_init_security(struct dentry *dentry, int mode,
                                        struct qstr *name, void **ctx,
                                        u32 *ctxlen)
 {
-       const struct cred *cred = current_cred();
-       struct task_security_struct *tsec;
-       struct inode_security_struct *dsec;
-       struct superblock_security_struct *sbsec;
-       struct inode *dir = d_backing_inode(dentry->d_parent);
        u32 newsid;
        int rc;
 
-       tsec = cred->security;
-       dsec = dir->i_security;
-       sbsec = dir->i_sb->s_security;
-
-       if (tsec->create_sid && sbsec->behavior != SECURITY_FS_USE_MNTPOINT) {
-               newsid = tsec->create_sid;
-       } else {
-               rc = security_transition_sid(tsec->sid, dsec->sid,
-                                            inode_mode_to_security_class(mode),
-                                            name,
-                                            &newsid);
-               if (rc) {
-                       printk(KERN_WARNING
-                               "%s: security_transition_sid failed, rc=%d\n",
-                              __func__, -rc);
-                       return rc;
-               }
-       }
+       rc = selinux_determine_inode_label(d_inode(dentry->d_parent), name,
+                                          inode_mode_to_security_class(mode),
+                                          &newsid);
+       if (rc)
+               return rc;
 
        return security_sid_to_context(newsid, (char **)ctx, ctxlen);
 }
@@ -2752,22 +2768,12 @@ static int selinux_inode_init_security(struct inode *inode, struct inode *dir,
        sid = tsec->sid;
        newsid = tsec->create_sid;
 
-       if ((sbsec->flags & SE_SBINITIALIZED) &&
-           (sbsec->behavior == SECURITY_FS_USE_MNTPOINT))
-               newsid = sbsec->mntpoint_sid;
-       else if (!newsid || !(sbsec->flags & SBLABEL_MNT)) {
-               rc = security_transition_sid(sid, dsec->sid,
-                                            inode_mode_to_security_class(inode->i_mode),
-                                            qstr, &newsid);
-               if (rc) {
-                       printk(KERN_WARNING "%s:  "
-                              "security_transition_sid failed, rc=%d (dev=%s "
-                              "ino=%ld)\n",
-                              __func__,
-                              -rc, inode->i_sb->s_id, inode->i_ino);
-                       return rc;
-               }
-       }
+       rc = selinux_determine_inode_label(
+               dir, qstr,
+               inode_mode_to_security_class(inode->i_mode),
+               &newsid);
+       if (rc)
+               return rc;
 
        /* Possibly defer initialization to selinux_complete_init. */
        if (sbsec->flags & SE_SBINITIALIZED) {
@@ -3228,6 +3234,46 @@ static void selinux_file_free_security(struct file *file)
        file_free_security(file);
 }
 
+/*
+ * Check whether a task has both the ioctl permission and the
+ * specific cmd operation on an inode.
+ */
+int ioctl_has_perm(const struct cred *cred, struct file *file,
+               u32 requested, u16 cmd)
+{
+       struct common_audit_data ad;
+       struct file_security_struct *fsec = file->f_security;
+       struct inode *inode = file_inode(file);
+       struct inode_security_struct *isec = inode->i_security;
+       struct lsm_ioctlop_audit ioctl;
+       u32 ssid = cred_sid(cred);
+       int rc;
+       u8 driver = cmd >> 8;
+       u8 xperm = cmd & 0xff;
+
+       ad.type = LSM_AUDIT_DATA_IOCTL_OP;
+       ad.u.op = &ioctl;
+       ad.u.op->cmd = cmd;
+       ad.u.op->path = file->f_path;
+
+       if (ssid != fsec->sid) {
+               rc = avc_has_perm(ssid, fsec->sid,
+                               SECCLASS_FD,
+                               FD__USE,
+                               &ad);
+               if (rc)
+                       goto out;
+       }
+
+       if (unlikely(IS_PRIVATE(inode)))
+               return 0;
+
+       rc = avc_has_extended_perms(ssid, isec->sid, isec->sclass,
+                       requested, driver, xperm, &ad);
+out:
+       return rc;
+}
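
Editorial aside (not part of the patch): the driver/xperm pair is simply the ioctl command truncated to 16 bits and split into its type and function bytes, which is what userspace allowxperm ioctl rules are keyed on. A minimal sketch (TCGETS == 0x5401 assumes the asm-generic ioctl numbering):

#include <stdio.h>

int main(void)
{
        unsigned int ioctl_cmd = 0x5401;                 /* TCGETS with asm-generic numbering */
        unsigned short cmd = (unsigned short)ioctl_cmd;  /* selinux_file_ioctl() passes (u16) cmd */
        unsigned char driver = cmd >> 8;                 /* 0x54: ioctl type/magic byte */
        unsigned char xperm  = cmd & 0xff;               /* 0x01: ioctl function number */

        printf("driver = 0x%02x, xperm = 0x%02x\n", driver, xperm);
        return 0;
}
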
+
 static int selinux_file_ioctl(struct file *file, unsigned int cmd,
                              unsigned long arg)
 {
@@ -3270,7 +3316,7 @@ static int selinux_file_ioctl(struct file *file, unsigned int cmd,
         * to the file's ioctl() function.
         */
        default:
-               error = file_has_perm(cred, file, FILE__IOCTL);
+               error = ioctl_has_perm(cred, file, FILE__IOCTL, (u16) cmd);
        }
        return error;
 }
@@ -4520,6 +4566,7 @@ static int selinux_sk_alloc_security(struct sock *sk, int family, gfp_t priority
 
        sksec->peer_sid = SECINITSID_UNLABELED;
        sksec->sid = SECINITSID_UNLABELED;
+       sksec->sclass = SECCLASS_SOCKET;
        selinux_netlbl_sk_security_reset(sksec);
        sk->sk_security = sksec;
 
index 5973c327c54e712edba1034808defd01afa8a8a0..0999df03af8bff2fe54c084e7459ed19da5a20c2 100644 (file)
@@ -143,6 +143,7 @@ static inline int avc_audit(u32 ssid, u32 tsid,
 }
 
 #define AVC_STRICT 1 /* Ignore permissive mode. */
+#define AVC_EXTENDED_PERMS 2   /* update extended permissions */
 int avc_has_perm_noaudit(u32 ssid, u32 tsid,
                         u16 tclass, u32 requested,
                         unsigned flags,
@@ -156,6 +157,10 @@ int avc_has_perm_flags(u32 ssid, u32 tsid,
                       struct common_audit_data *auditdata,
                       int flags);
 
+int avc_has_extended_perms(u32 ssid, u32 tsid, u16 tclass, u32 requested,
+               u8 driver, u8 perm, struct common_audit_data *ad);
+
+
 u32 avc_policy_seqno(void);
 
 #define AVC_CALLBACK_GRANT             1
@@ -166,6 +171,7 @@ u32 avc_policy_seqno(void);
 #define AVC_CALLBACK_AUDITALLOW_DISABLE        32
 #define AVC_CALLBACK_AUDITDENY_ENABLE  64
 #define AVC_CALLBACK_AUDITDENY_DISABLE 128
+#define AVC_CALLBACK_ADD_XPERMS                256
 
 int avc_add_callback(int (*callback)(u32 event), u32 events);
 
index 36993ad1c067a8b32172b88eaac92fa884b6a072..6a681d26bf20a609aacbc65f508a85ce8d517861 100644 (file)
 #define POLICYDB_VERSION_NEW_OBJECT_DEFAULTS   27
 #define POLICYDB_VERSION_DEFAULT_TYPE  28
 #define POLICYDB_VERSION_CONSTRAINT_NAMES      29
+#define POLICYDB_VERSION_XPERMS_IOCTL  30
 
 /* Range of policy versions we understand */
 #define POLICYDB_VERSION_MIN   POLICYDB_VERSION_BASE
 #ifdef CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX
 #define POLICYDB_VERSION_MAX   CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX_VALUE
 #else
-#define POLICYDB_VERSION_MAX   POLICYDB_VERSION_CONSTRAINT_NAMES
+#define POLICYDB_VERSION_MAX   POLICYDB_VERSION_XPERMS_IOCTL
 #endif
 
 /* Mask for just the mount related flags */
@@ -109,11 +110,38 @@ struct av_decision {
        u32 flags;
 };
 
+#define XPERMS_ALLOWED 1
+#define XPERMS_AUDITALLOW 2
+#define XPERMS_DONTAUDIT 4
+
+#define security_xperm_set(perms, x) (perms[x >> 5] |= 1 << (x & 0x1f))
+#define security_xperm_test(perms, x) (1 & (perms[x >> 5] >> (x & 0x1f)))
+struct extended_perms_data {
+       u32 p[8];
+};
+
+struct extended_perms_decision {
+       u8 used;
+       u8 driver;
+       struct extended_perms_data *allowed;
+       struct extended_perms_data *auditallow;
+       struct extended_perms_data *dontaudit;
+};
+
+struct extended_perms {
+       u16 len;        /* length of the associated decision chain */
+       struct extended_perms_data drivers; /* flag drivers that are used */
+};
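
Editorial aside (not part of the patch): a standalone sketch of how the two helper macros above address the 256-bit array; the macros are copied locally so the snippet builds outside the kernel.

#include <stdio.h>

#define security_xperm_set(perms, x)  (perms[x >> 5] |= 1 << (x & 0x1f))
#define security_xperm_test(perms, x) (1 & (perms[x >> 5] >> (x & 0x1f)))

int main(void)
{
        unsigned int p[8] = { 0 };         /* stands in for extended_perms_data.p */

        security_xperm_set(p, 0x42);       /* sets bit 2 of p[2] */
        printf("0x42 set? %u\n", security_xperm_test(p, 0x42));  /* 1 */
        printf("0x43 set? %u\n", security_xperm_test(p, 0x43));  /* 0 */
        printf("p[2] = %#x\n", p[2]);                            /* 0x4 */
        return 0;
}
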
+
 /* definitions of av_decision.flags */
 #define AVD_FLAGS_PERMISSIVE   0x0001
 
 void security_compute_av(u32 ssid, u32 tsid,
-                        u16 tclass, struct av_decision *avd);
+                        u16 tclass, struct av_decision *avd,
+                        struct extended_perms *xperms);
+
+void security_compute_xperms_decision(u32 ssid, u32 tsid, u16 tclass,
+                        u8 driver, struct extended_perms_decision *xpermd);
 
 void security_compute_av_user(u32 ssid, u32 tsid,
                             u16 tclass, struct av_decision *avd);
index 3d22014130289d2904dee4fb3c831223437864d2..5bed7716f8ab61b6f13a97ec8548c9559ad42b96 100644 (file)
@@ -472,7 +472,7 @@ static int sel_mmap_policy_fault(struct vm_area_struct *vma,
        return 0;
 }
 
-static struct vm_operations_struct sel_mmap_policy_ops = {
+static const struct vm_operations_struct sel_mmap_policy_ops = {
        .fault = sel_mmap_policy_fault,
        .page_mkwrite = sel_mmap_policy_fault,
 };
index b64f2772b030194d6ff3ca55e4cc86007ec9f193..3628d3a868b669c9aa9267808533aaa5dc8c0d1c 100644 (file)
@@ -24,6 +24,7 @@
 #include "policydb.h"
 
 static struct kmem_cache *avtab_node_cachep;
+static struct kmem_cache *avtab_xperms_cachep;
 
 /* Based on MurmurHash3, written by Austin Appleby and placed in the
  * public domain.
@@ -70,11 +71,24 @@ avtab_insert_node(struct avtab *h, int hvalue,
                  struct avtab_key *key, struct avtab_datum *datum)
 {
        struct avtab_node *newnode;
+       struct avtab_extended_perms *xperms;
        newnode = kmem_cache_zalloc(avtab_node_cachep, GFP_KERNEL);
        if (newnode == NULL)
                return NULL;
        newnode->key = *key;
-       newnode->datum = *datum;
+
+       if (key->specified & AVTAB_XPERMS) {
+               xperms = kmem_cache_zalloc(avtab_xperms_cachep, GFP_KERNEL);
+               if (xperms == NULL) {
+                       kmem_cache_free(avtab_node_cachep, newnode);
+                       return NULL;
+               }
+               *xperms = *(datum->u.xperms);
+               newnode->datum.u.xperms = xperms;
+       } else {
+               newnode->datum.u.data = datum->u.data;
+       }
+
        if (prev) {
                newnode->next = prev->next;
                prev->next = newnode;
@@ -107,8 +121,12 @@ static int avtab_insert(struct avtab *h, struct avtab_key *key, struct avtab_dat
                if (key->source_type == cur->key.source_type &&
                    key->target_type == cur->key.target_type &&
                    key->target_class == cur->key.target_class &&
-                   (specified & cur->key.specified))
+                   (specified & cur->key.specified)) {
+                       /* extended perms may not be unique */
+                       if (specified & AVTAB_XPERMS)
+                               break;
                        return -EEXIST;
+               }
                if (key->source_type < cur->key.source_type)
                        break;
                if (key->source_type == cur->key.source_type &&
@@ -271,6 +289,9 @@ void avtab_destroy(struct avtab *h)
                while (cur) {
                        temp = cur;
                        cur = cur->next;
+                       if (temp->key.specified & AVTAB_XPERMS)
+                               kmem_cache_free(avtab_xperms_cachep,
+                                               temp->datum.u.xperms);
                        kmem_cache_free(avtab_node_cachep, temp);
                }
        }
@@ -359,7 +380,10 @@ static uint16_t spec_order[] = {
        AVTAB_AUDITALLOW,
        AVTAB_TRANSITION,
        AVTAB_CHANGE,
-       AVTAB_MEMBER
+       AVTAB_MEMBER,
+       AVTAB_XPERMS_ALLOWED,
+       AVTAB_XPERMS_AUDITALLOW,
+       AVTAB_XPERMS_DONTAUDIT
 };
 
 int avtab_read_item(struct avtab *a, void *fp, struct policydb *pol,
@@ -369,10 +393,11 @@ int avtab_read_item(struct avtab *a, void *fp, struct policydb *pol,
 {
        __le16 buf16[4];
        u16 enabled;
-       __le32 buf32[7];
        u32 items, items2, val, vers = pol->policyvers;
        struct avtab_key key;
        struct avtab_datum datum;
+       struct avtab_extended_perms xperms;
+       __le32 buf32[ARRAY_SIZE(xperms.perms.p)];
        int i, rc;
        unsigned set;
 
@@ -429,11 +454,15 @@ int avtab_read_item(struct avtab *a, void *fp, struct policydb *pol,
                        printk(KERN_ERR "SELinux: avtab: entry has both access vectors and types\n");
                        return -EINVAL;
                }
+               if (val & AVTAB_XPERMS) {
+                       printk(KERN_ERR "SELinux: avtab: entry has extended permissions\n");
+                       return -EINVAL;
+               }
 
                for (i = 0; i < ARRAY_SIZE(spec_order); i++) {
                        if (val & spec_order[i]) {
                                key.specified = spec_order[i] | enabled;
-                               datum.data = le32_to_cpu(buf32[items++]);
+                               datum.u.data = le32_to_cpu(buf32[items++]);
                                rc = insertf(a, &key, &datum, p);
                                if (rc)
                                        return rc;
@@ -476,14 +505,42 @@ int avtab_read_item(struct avtab *a, void *fp, struct policydb *pol,
                return -EINVAL;
        }
 
-       rc = next_entry(buf32, fp, sizeof(u32));
-       if (rc) {
-               printk(KERN_ERR "SELinux: avtab: truncated entry\n");
-               return rc;
+       if ((vers < POLICYDB_VERSION_XPERMS_IOCTL) &&
+                       (key.specified & AVTAB_XPERMS)) {
+               printk(KERN_ERR "SELinux:  avtab:  policy version %u does not "
+                               "support extended permissions rules and one "
+                               "was specified\n", vers);
+               return -EINVAL;
+       } else if (key.specified & AVTAB_XPERMS) {
+               memset(&xperms, 0, sizeof(struct avtab_extended_perms));
+               rc = next_entry(&xperms.specified, fp, sizeof(u8));
+               if (rc) {
+                       printk(KERN_ERR "SELinux: avtab: truncated entry\n");
+                       return rc;
+               }
+               rc = next_entry(&xperms.driver, fp, sizeof(u8));
+               if (rc) {
+                       printk(KERN_ERR "SELinux: avtab: truncated entry\n");
+                       return rc;
+               }
+               rc = next_entry(buf32, fp, sizeof(u32)*ARRAY_SIZE(xperms.perms.p));
+               if (rc) {
+                       printk(KERN_ERR "SELinux: avtab: truncated entry\n");
+                       return rc;
+               }
+               for (i = 0; i < ARRAY_SIZE(xperms.perms.p); i++)
+                       xperms.perms.p[i] = le32_to_cpu(buf32[i]);
+               datum.u.xperms = &xperms;
+       } else {
+               rc = next_entry(buf32, fp, sizeof(u32));
+               if (rc) {
+                       printk(KERN_ERR "SELinux: avtab: truncated entry\n");
+                       return rc;
+               }
+               datum.u.data = le32_to_cpu(*buf32);
        }
-       datum.data = le32_to_cpu(*buf32);
        if ((key.specified & AVTAB_TYPE) &&
-           !policydb_type_isvalid(pol, datum.data)) {
+           !policydb_type_isvalid(pol, datum.u.data)) {
                printk(KERN_ERR "SELinux: avtab: invalid type\n");
                return -EINVAL;
        }
@@ -543,8 +600,9 @@ bad:
 int avtab_write_item(struct policydb *p, struct avtab_node *cur, void *fp)
 {
        __le16 buf16[4];
-       __le32 buf32[1];
+       __le32 buf32[ARRAY_SIZE(cur->datum.u.xperms->perms.p)];
        int rc;
+       unsigned int i;
 
        buf16[0] = cpu_to_le16(cur->key.source_type);
        buf16[1] = cpu_to_le16(cur->key.target_type);
@@ -553,8 +611,22 @@ int avtab_write_item(struct policydb *p, struct avtab_node *cur, void *fp)
        rc = put_entry(buf16, sizeof(u16), 4, fp);
        if (rc)
                return rc;
-       buf32[0] = cpu_to_le32(cur->datum.data);
-       rc = put_entry(buf32, sizeof(u32), 1, fp);
+
+       if (cur->key.specified & AVTAB_XPERMS) {
+               rc = put_entry(&cur->datum.u.xperms->specified, sizeof(u8), 1, fp);
+               if (rc)
+                       return rc;
+               rc = put_entry(&cur->datum.u.xperms->driver, sizeof(u8), 1, fp);
+               if (rc)
+                       return rc;
+               for (i = 0; i < ARRAY_SIZE(cur->datum.u.xperms->perms.p); i++)
+                       buf32[i] = cpu_to_le32(cur->datum.u.xperms->perms.p[i]);
+               rc = put_entry(buf32, sizeof(u32),
+                               ARRAY_SIZE(cur->datum.u.xperms->perms.p), fp);
+       } else {
+               buf32[0] = cpu_to_le32(cur->datum.u.data);
+               rc = put_entry(buf32, sizeof(u32), 1, fp);
+       }
        if (rc)
                return rc;
        return 0;
@@ -588,9 +660,13 @@ void avtab_cache_init(void)
        avtab_node_cachep = kmem_cache_create("avtab_node",
                                              sizeof(struct avtab_node),
                                              0, SLAB_PANIC, NULL);
+       avtab_xperms_cachep = kmem_cache_create("avtab_extended_perms",
+                                               sizeof(struct avtab_extended_perms),
+                                               0, SLAB_PANIC, NULL);
 }
 
 void avtab_cache_destroy(void)
 {
        kmem_cache_destroy(avtab_node_cachep);
+       kmem_cache_destroy(avtab_xperms_cachep);
 }
index adb451cd44f9d3175ba84e1686bfeeae06a803d2..d946c9dc3c9ca6b2569ecd7624c3bd12f7789c77 100644 (file)
@@ -23,6 +23,7 @@
 #ifndef _SS_AVTAB_H_
 #define _SS_AVTAB_H_
 
+#include "security.h"
 #include <linux/flex_array.h>
 
 struct avtab_key {
@@ -37,13 +38,43 @@ struct avtab_key {
 #define AVTAB_MEMBER           0x0020
 #define AVTAB_CHANGE           0x0040
 #define AVTAB_TYPE             (AVTAB_TRANSITION | AVTAB_MEMBER | AVTAB_CHANGE)
+/* extended permissions */
+#define AVTAB_XPERMS_ALLOWED   0x0100
+#define AVTAB_XPERMS_AUDITALLOW        0x0200
+#define AVTAB_XPERMS_DONTAUDIT 0x0400
+#define AVTAB_XPERMS           (AVTAB_XPERMS_ALLOWED | \
+                               AVTAB_XPERMS_AUDITALLOW | \
+                               AVTAB_XPERMS_DONTAUDIT)
 #define AVTAB_ENABLED_OLD   0x80000000 /* reserved for use in cond_avtab */
 #define AVTAB_ENABLED          0x8000 /* reserved for use in cond_avtab */
        u16 specified;  /* what field is specified */
 };
 
+/*
+ * For operations that require more than the 32 permissions provided by the avc,
+ * extended permissions may be used to provide 256 bits of permissions.
+ */
+struct avtab_extended_perms {
+/* These are not flags. All 256 values may be used */
+#define AVTAB_XPERMS_IOCTLFUNCTION     0x01
+#define AVTAB_XPERMS_IOCTLDRIVER       0x02
+       /* extension of the avtab_key specified */
+       u8 specified; /* ioctl, netfilter, ... */
+       /*
+        * if 256 bits are not adequate, as is often the case with ioctls, then
+        * multiple extended perms may be used and the driver field
+        * specifies which permissions are included.
+        */
+       u8 driver;
+       /* 256 bits of permissions */
+       struct extended_perms_data perms;
+};
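
Editorial aside (not part of the patch): the two "specified" encodings can be read as follows. An AVTAB_XPERMS_IOCTLFUNCTION entry names a single driver byte and uses the 256-bit map for individual function numbers within that driver; an AVTAB_XPERMS_IOCTLDRIVER entry uses the map to flag whole driver bytes at once. A standalone sketch with local stand-in types:

#include <stdio.h>

#define XPERMS_IOCTLFUNCTION 0x01
#define XPERMS_IOCTLDRIVER   0x02
#define XPERM_SET(p, x) ((p)[(x) >> 5] |= 1u << ((x) & 0x1f))

struct xperms_entry {
        unsigned char specified;    /* function-within-driver or whole-driver */
        unsigned char driver;       /* only meaningful for IOCTLFUNCTION */
        unsigned int  perms[8];     /* 256-bit map: function numbers or driver bytes */
};

int main(void)
{
        /* allow only function 0x01 (e.g. the TCGETS byte) within driver 0x54 */
        struct xperms_entry fn = { .specified = XPERMS_IOCTLFUNCTION, .driver = 0x54 };
        XPERM_SET(fn.perms, 0x01);

        /* allow every command whose driver byte is 0x89 (socket ioctls) */
        struct xperms_entry drv = { .specified = XPERMS_IOCTLDRIVER };
        XPERM_SET(drv.perms, 0x89);

        printf("fn:  driver=0x%02x perms[0]=%#x\n", fn.driver, fn.perms[0]);
        printf("drv: perms[4]=%#x\n", drv.perms[4]);
        return 0;
}
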
+
 struct avtab_datum {
-       u32 data; /* access vector or type value */
+       union {
+               u32 data; /* access vector or type value */
+               struct avtab_extended_perms *xperms;
+       } u;
 };
 
 struct avtab_node {
index 62c6773be0b75f69f4f9c6b579d0205aa602ed5c..18643bf9894d5e393bdb51b0d7b0740f34960e75 100644 (file)
@@ -15,6 +15,7 @@
 
 #include "security.h"
 #include "conditional.h"
+#include "services.h"
 
 /*
  * cond_evaluate_expr evaluates a conditional expr
@@ -612,21 +613,39 @@ int cond_write_list(struct policydb *p, struct cond_node *list, void *fp)
 
        return 0;
 }
+
+void cond_compute_xperms(struct avtab *ctab, struct avtab_key *key,
+               struct extended_perms_decision *xpermd)
+{
+       struct avtab_node *node;
+
+       if (!ctab || !key || !xpermd)
+               return;
+
+       for (node = avtab_search_node(ctab, key); node;
+                       node = avtab_search_node_next(node, key->specified)) {
+               if (node->key.specified & AVTAB_ENABLED)
+                       services_compute_xperms_decision(xpermd, node);
+       }
+       return;
+
+}
 /* Determine whether additional permissions are granted by the conditional
  * av table, and if so, add them to the result
  */
-void cond_compute_av(struct avtab *ctab, struct avtab_key *key, struct av_decision *avd)
+void cond_compute_av(struct avtab *ctab, struct avtab_key *key,
+               struct av_decision *avd, struct extended_perms *xperms)
 {
        struct avtab_node *node;
 
-       if (!ctab || !key || !avd)
+       if (!ctab || !key || !avd || !xperms)
                return;
 
        for (node = avtab_search_node(ctab, key); node;
                                node = avtab_search_node_next(node, key->specified)) {
                if ((u16)(AVTAB_ALLOWED|AVTAB_ENABLED) ==
                    (node->key.specified & (AVTAB_ALLOWED|AVTAB_ENABLED)))
-                       avd->allowed |= node->datum.data;
+                       avd->allowed |= node->datum.u.data;
                if ((u16)(AVTAB_AUDITDENY|AVTAB_ENABLED) ==
                    (node->key.specified & (AVTAB_AUDITDENY|AVTAB_ENABLED)))
                        /* Since a '0' in an auditdeny mask represents a
@@ -634,10 +653,13 @@ void cond_compute_av(struct avtab *ctab, struct avtab_key *key, struct av_decisi
                         * the '&' operand to ensure that all '0's in the mask
                         * are retained (much unlike the allow and auditallow cases).
                         */
-                       avd->auditdeny &= node->datum.data;
+                       avd->auditdeny &= node->datum.u.data;
                if ((u16)(AVTAB_AUDITALLOW|AVTAB_ENABLED) ==
                    (node->key.specified & (AVTAB_AUDITALLOW|AVTAB_ENABLED)))
-                       avd->auditallow |= node->datum.data;
+                       avd->auditallow |= node->datum.u.data;
+               if ((node->key.specified & AVTAB_ENABLED) &&
+                               (node->key.specified & AVTAB_XPERMS))
+                       services_compute_xperms_drivers(xperms, node);
        }
        return;
 }
index 4d1f87466508f7adf60f2eb7f5ae89ec3c49aa29..ddb43e7e1c756d83be6a5efa6482307a63089bc2 100644 (file)
@@ -73,8 +73,10 @@ int cond_read_list(struct policydb *p, void *fp);
 int cond_write_bool(void *key, void *datum, void *ptr);
 int cond_write_list(struct policydb *p, struct cond_node *list, void *fp);
 
-void cond_compute_av(struct avtab *ctab, struct avtab_key *key, struct av_decision *avd);
-
+void cond_compute_av(struct avtab *ctab, struct avtab_key *key,
+               struct av_decision *avd, struct extended_perms *xperms);
+void cond_compute_xperms(struct avtab *ctab, struct avtab_key *key,
+               struct extended_perms_decision *xpermd);
 int evaluate_cond_node(struct policydb *p, struct cond_node *node);
 
 #endif /* _CONDITIONAL_H_ */
index 74aa224267c11fd31262dc45dc91c9c726ef6702..992a315308258724099f05f9fbca0e9e7f12cc89 100644 (file)
@@ -148,6 +148,11 @@ static struct policydb_compat_info policydb_compat[] = {
                .sym_num        = SYM_NUM,
                .ocon_num       = OCON_NUM,
        },
+       {
+               .version        = POLICYDB_VERSION_XPERMS_IOCTL,
+               .sym_num        = SYM_NUM,
+               .ocon_num       = OCON_NUM,
+       },
 };
 
 static struct policydb_compat_info *policydb_lookup_compat(int version)
index 9e2d82070915366333e193b99aca5badf16af72c..b7df12ba61d839c45789f762e6138aae1cc15ca9 100644 (file)
@@ -93,9 +93,10 @@ static int context_struct_to_string(struct context *context, char **scontext,
                                    u32 *scontext_len);
 
 static void context_struct_compute_av(struct context *scontext,
-                                     struct context *tcontext,
-                                     u16 tclass,
-                                     struct av_decision *avd);
+                                       struct context *tcontext,
+                                       u16 tclass,
+                                       struct av_decision *avd,
+                                       struct extended_perms *xperms);
 
 struct selinux_mapping {
        u16 value; /* policy value */
@@ -565,7 +566,8 @@ static void type_attribute_bounds_av(struct context *scontext,
                context_struct_compute_av(&lo_scontext,
                                          tcontext,
                                          tclass,
-                                         &lo_avd);
+                                         &lo_avd,
+                                         NULL);
                if ((lo_avd.allowed & avd->allowed) == avd->allowed)
                        return;         /* no masked permission */
                masked = ~lo_avd.allowed & avd->allowed;
@@ -580,7 +582,8 @@ static void type_attribute_bounds_av(struct context *scontext,
                context_struct_compute_av(scontext,
                                          &lo_tcontext,
                                          tclass,
-                                         &lo_avd);
+                                         &lo_avd,
+                                         NULL);
                if ((lo_avd.allowed & avd->allowed) == avd->allowed)
                        return;         /* no masked permission */
                masked = ~lo_avd.allowed & avd->allowed;
@@ -596,7 +599,8 @@ static void type_attribute_bounds_av(struct context *scontext,
                context_struct_compute_av(&lo_scontext,
                                          &lo_tcontext,
                                          tclass,
-                                         &lo_avd);
+                                         &lo_avd,
+                                         NULL);
                if ((lo_avd.allowed & avd->allowed) == avd->allowed)
                        return;         /* no masked permission */
                masked = ~lo_avd.allowed & avd->allowed;
@@ -613,13 +617,39 @@ static void type_attribute_bounds_av(struct context *scontext,
 }
 
 /*
- * Compute access vectors based on a context structure pair for
- * the permissions in a particular class.
+ * Flag which drivers have permissions; we are only looking for
+ * ioctl-based extended permissions here.
+ */
+void services_compute_xperms_drivers(
+               struct extended_perms *xperms,
+               struct avtab_node *node)
+{
+       unsigned int i;
+
+       if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLDRIVER) {
+               /* if one or more drivers have all permissions allowed */
+               for (i = 0; i < ARRAY_SIZE(xperms->drivers.p); i++)
+                       xperms->drivers.p[i] |= node->datum.u.xperms->perms.p[i];
+       } else if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLFUNCTION) {
+               /* if allowing permissions within a driver */
+               security_xperm_set(xperms->drivers.p,
+                                       node->datum.u.xperms->driver);
+       }
+
+       /* If no ioctl commands are allowed, ignore auditallow and auditdeny */
+       if (node->key.specified & AVTAB_XPERMS_ALLOWED)
+               xperms->len = 1;
+}
+
+/*
+ * Compute access vectors and extended permissions based on a context
+ * structure pair for the permissions in a particular class.
  */
 static void context_struct_compute_av(struct context *scontext,
-                                     struct context *tcontext,
-                                     u16 tclass,
-                                     struct av_decision *avd)
+                                       struct context *tcontext,
+                                       u16 tclass,
+                                       struct av_decision *avd,
+                                       struct extended_perms *xperms)
 {
        struct constraint_node *constraint;
        struct role_allow *ra;
@@ -633,6 +663,10 @@ static void context_struct_compute_av(struct context *scontext,
        avd->allowed = 0;
        avd->auditallow = 0;
        avd->auditdeny = 0xffffffff;
+       if (xperms) {
+               memset(&xperms->drivers, 0, sizeof(xperms->drivers));
+               xperms->len = 0;
+       }
 
        if (unlikely(!tclass || tclass > policydb.p_classes.nprim)) {
                if (printk_ratelimit())
@@ -647,7 +681,7 @@ static void context_struct_compute_av(struct context *scontext,
         * this permission check, then use it.
         */
        avkey.target_class = tclass;
-       avkey.specified = AVTAB_AV;
+       avkey.specified = AVTAB_AV | AVTAB_XPERMS;
        sattr = flex_array_get(policydb.type_attr_map_array, scontext->type - 1);
        BUG_ON(!sattr);
        tattr = flex_array_get(policydb.type_attr_map_array, tcontext->type - 1);
@@ -660,15 +694,18 @@ static void context_struct_compute_av(struct context *scontext,
                             node;
                             node = avtab_search_node_next(node, avkey.specified)) {
                                if (node->key.specified == AVTAB_ALLOWED)
-                                       avd->allowed |= node->datum.data;
+                                       avd->allowed |= node->datum.u.data;
                                else if (node->key.specified == AVTAB_AUDITALLOW)
-                                       avd->auditallow |= node->datum.data;
+                                       avd->auditallow |= node->datum.u.data;
                                else if (node->key.specified == AVTAB_AUDITDENY)
-                                       avd->auditdeny &= node->datum.data;
+                                       avd->auditdeny &= node->datum.u.data;
+                               else if (xperms && (node->key.specified & AVTAB_XPERMS))
+                                       services_compute_xperms_drivers(xperms, node);
                        }
 
                        /* Check conditional av table for additional permissions */
-                       cond_compute_av(&policydb.te_cond_avtab, &avkey, avd);
+                       cond_compute_av(&policydb.te_cond_avtab, &avkey,
+                                       avd, xperms);
 
                }
        }
@@ -899,6 +936,139 @@ static void avd_init(struct av_decision *avd)
        avd->flags = 0;
 }
 
+void services_compute_xperms_decision(struct extended_perms_decision *xpermd,
+                                       struct avtab_node *node)
+{
+       unsigned int i;
+
+       if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLFUNCTION) {
+               if (xpermd->driver != node->datum.u.xperms->driver)
+                       return;
+       } else if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLDRIVER) {
+               if (!security_xperm_test(node->datum.u.xperms->perms.p,
+                                       xpermd->driver))
+                       return;
+       } else {
+               BUG();
+       }
+
+       if (node->key.specified == AVTAB_XPERMS_ALLOWED) {
+               xpermd->used |= XPERMS_ALLOWED;
+               if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLDRIVER) {
+                       memset(xpermd->allowed->p, 0xff,
+                                       sizeof(xpermd->allowed->p));
+               }
+               if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLFUNCTION) {
+                       for (i = 0; i < ARRAY_SIZE(xpermd->allowed->p); i++)
+                               xpermd->allowed->p[i] |=
+                                       node->datum.u.xperms->perms.p[i];
+               }
+       } else if (node->key.specified == AVTAB_XPERMS_AUDITALLOW) {
+               xpermd->used |= XPERMS_AUDITALLOW;
+               if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLDRIVER) {
+                       memset(xpermd->auditallow->p, 0xff,
+                                       sizeof(xpermd->auditallow->p));
+               }
+               if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLFUNCTION) {
+                       for (i = 0; i < ARRAY_SIZE(xpermd->auditallow->p); i++)
+                               xpermd->auditallow->p[i] |=
+                                       node->datum.u.xperms->perms.p[i];
+               }
+       } else if (node->key.specified == AVTAB_XPERMS_DONTAUDIT) {
+               xpermd->used |= XPERMS_DONTAUDIT;
+               if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLDRIVER) {
+                       memset(xpermd->dontaudit->p, 0xff,
+                                       sizeof(xpermd->dontaudit->p));
+               }
+               if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLFUNCTION) {
+                       for (i = 0; i < ARRAY_SIZE(xpermd->dontaudit->p); i++)
+                               xpermd->dontaudit->p[i] |=
+                                       node->datum.u.xperms->perms.p[i];
+               }
+       } else {
+               BUG();
+       }
+}
+
+void security_compute_xperms_decision(u32 ssid,
+                               u32 tsid,
+                               u16 orig_tclass,
+                               u8 driver,
+                               struct extended_perms_decision *xpermd)
+{
+       u16 tclass;
+       struct context *scontext, *tcontext;
+       struct avtab_key avkey;
+       struct avtab_node *node;
+       struct ebitmap *sattr, *tattr;
+       struct ebitmap_node *snode, *tnode;
+       unsigned int i, j;
+
+       xpermd->driver = driver;
+       xpermd->used = 0;
+       memset(xpermd->allowed->p, 0, sizeof(xpermd->allowed->p));
+       memset(xpermd->auditallow->p, 0, sizeof(xpermd->auditallow->p));
+       memset(xpermd->dontaudit->p, 0, sizeof(xpermd->dontaudit->p));
+
+       read_lock(&policy_rwlock);
+       if (!ss_initialized)
+               goto allow;
+
+       scontext = sidtab_search(&sidtab, ssid);
+       if (!scontext) {
+               printk(KERN_ERR "SELinux: %s:  unrecognized SID %d\n",
+                      __func__, ssid);
+               goto out;
+       }
+
+       tcontext = sidtab_search(&sidtab, tsid);
+       if (!tcontext) {
+               printk(KERN_ERR "SELinux: %s:  unrecognized SID %d\n",
+                      __func__, tsid);
+               goto out;
+       }
+
+       tclass = unmap_class(orig_tclass);
+       if (unlikely(orig_tclass && !tclass)) {
+               if (policydb.allow_unknown)
+                       goto allow;
+               goto out;
+       }
+
+
+       if (unlikely(!tclass || tclass > policydb.p_classes.nprim)) {
+               pr_warn_ratelimited("SELinux:  Invalid class %hu\n", tclass);
+               goto out;
+       }
+
+       avkey.target_class = tclass;
+       avkey.specified = AVTAB_XPERMS;
+       sattr = flex_array_get(policydb.type_attr_map_array,
+                               scontext->type - 1);
+       BUG_ON(!sattr);
+       tattr = flex_array_get(policydb.type_attr_map_array,
+                               tcontext->type - 1);
+       BUG_ON(!tattr);
+       ebitmap_for_each_positive_bit(sattr, snode, i) {
+               ebitmap_for_each_positive_bit(tattr, tnode, j) {
+                       avkey.source_type = i + 1;
+                       avkey.target_type = j + 1;
+                       for (node = avtab_search_node(&policydb.te_avtab, &avkey);
+                            node;
+                            node = avtab_search_node_next(node, avkey.specified))
+                               services_compute_xperms_decision(xpermd, node);
+
+                       cond_compute_xperms(&policydb.te_cond_avtab,
+                                               &avkey, xpermd);
+               }
+       }
+out:
+       read_unlock(&policy_rwlock);
+       return;
+allow:
+       memset(xpermd->allowed->p, 0xff, sizeof(xpermd->allowed->p));
+       goto out;
+}
 
 /**
  * security_compute_av - Compute access vector decisions.
@@ -906,6 +1076,7 @@ static void avd_init(struct av_decision *avd)
  * @tsid: target security identifier
  * @tclass: target security class
  * @avd: access vector decisions
+ * @xperms: extended permissions
  *
  * Compute a set of access vector decisions based on the
  * SID pair (@ssid, @tsid) for the permissions in @tclass.
@@ -913,13 +1084,15 @@ static void avd_init(struct av_decision *avd)
 void security_compute_av(u32 ssid,
                         u32 tsid,
                         u16 orig_tclass,
-                        struct av_decision *avd)
+                        struct av_decision *avd,
+                        struct extended_perms *xperms)
 {
        u16 tclass;
        struct context *scontext = NULL, *tcontext = NULL;
 
        read_lock(&policy_rwlock);
        avd_init(avd);
+       xperms->len = 0;
        if (!ss_initialized)
                goto allow;
 
@@ -947,7 +1120,7 @@ void security_compute_av(u32 ssid,
                        goto allow;
                goto out;
        }
-       context_struct_compute_av(scontext, tcontext, tclass, avd);
+       context_struct_compute_av(scontext, tcontext, tclass, avd, xperms);
        map_decision(orig_tclass, avd, policydb.allow_unknown);
 out:
        read_unlock(&policy_rwlock);
@@ -993,7 +1166,7 @@ void security_compute_av_user(u32 ssid,
                goto out;
        }
 
-       context_struct_compute_av(scontext, tcontext, tclass, avd);
+       context_struct_compute_av(scontext, tcontext, tclass, avd, NULL);
  out:
        read_unlock(&policy_rwlock);
        return;
@@ -1515,7 +1688,7 @@ static int security_compute_sid(u32 ssid,
 
        if (avdatum) {
                /* Use the type from the type transition/member/change rule. */
-               newcontext.type = avdatum->data;
+               newcontext.type = avdatum->u.data;
        }
 
        /* if we have a objname this is a file trans check so check those rules */
index e8d907e903cdb1e05cf9f3a0bd28805f905a1214..6abcd8729ec3a6c7605ab394a7d108fe0192020a 100644 (file)
 
 extern struct policydb policydb;
 
+void services_compute_xperms_drivers(struct extended_perms *xperms,
+                               struct avtab_node *node);
+
+void services_compute_xperms_decision(struct extended_perms_decision *xpermd,
+                                       struct avtab_node *node);
+
 #endif /* _SS_SERVICES_H_ */
 
index 244e035e5a99f303297536d62a41f62547233b4f..fff0c612bbb77be9f84254753c6b11a7572e4626 100644 (file)
 #include <linux/spinlock.h>
 #include <linux/lsm_hooks.h>
 #include <linux/in.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <linux/in6.h>
+#endif /* CONFIG_IPV6 */
 #include <net/netlabel.h>
 #include <linux/list.h>
 #include <linux/rculist.h>
 #include <linux/lsm_audit.h>
 
+/*
+ * Use IPv6 port labeling if IPv6 is enabled and secmarks
+ * are not being used.
+ */
+#if IS_ENABLED(CONFIG_IPV6) && !defined(CONFIG_SECURITY_SMACK_NETFILTER)
+#define SMACK_IPV6_PORT_LABELING 1
+#endif
+
+#if IS_ENABLED(CONFIG_IPV6) && defined(CONFIG_SECURITY_SMACK_NETFILTER)
+#define SMACK_IPV6_SECMARK_LABELING 1
+#endif
+
 /*
  * Smack labels were limited to 23 characters for a long time.
  */
@@ -118,15 +133,30 @@ struct smack_rule {
 };
 
 /*
- * An entry in the table identifying hosts.
+ * An entry in the table identifying IPv4 hosts.
  */
-struct smk_netlbladdr {
+struct smk_net4addr {
        struct list_head        list;
-       struct sockaddr_in      smk_host;       /* network address */
+       struct in_addr          smk_host;       /* network address */
        struct in_addr          smk_mask;       /* network mask */
+       int                     smk_masks;      /* mask size */
+       struct smack_known      *smk_label;     /* label */
+};
+
+#if IS_ENABLED(CONFIG_IPV6)
+/*
+ * An entry in the table identifying IPv6 hosts.
+ */
+struct smk_net6addr {
+       struct list_head        list;
+       struct in6_addr         smk_host;       /* network address */
+       struct in6_addr         smk_mask;       /* network mask */
+       int                     smk_masks;      /* mask size */
        struct smack_known      *smk_label;     /* label */
 };
+#endif /* CONFIG_IPV6 */
 
+#ifdef SMACK_IPV6_PORT_LABELING
 /*
  * An entry in the table identifying ports.
  */
@@ -137,12 +167,31 @@ struct smk_port_label {
        struct smack_known      *smk_in;        /* inbound label */
        struct smack_known      *smk_out;       /* outgoing label */
 };
+#endif /* SMACK_IPV6_PORT_LABELING */
 
 struct smack_onlycap {
        struct list_head        list;
        struct smack_known      *smk_label;
 };
 
+/* Super block security struct flags for mount options */
+#define FSDEFAULT_MNT  0x01
+#define FSFLOOR_MNT    0x02
+#define FSHAT_MNT      0x04
+#define FSROOT_MNT     0x08
+#define FSTRANS_MNT    0x10
+
+#define NUM_SMK_MNT_OPTS       5
+
+enum {
+       Opt_error = -1,
+       Opt_fsdefault = 1,
+       Opt_fsfloor = 2,
+       Opt_fshat = 3,
+       Opt_fsroot = 4,
+       Opt_fstransmute = 5,
+};
+
 /*
  * Mount options
  */
@@ -152,6 +201,7 @@ struct smack_onlycap {
 #define SMK_FSROOT     "smackfsroot="
 #define SMK_FSTRANS    "smackfstransmute="
 
+#define SMACK_DELETE_OPTION    "-DELETE"
 #define SMACK_CIPSO_OPTION     "-CIPSO"
 
 /*
@@ -234,10 +284,6 @@ struct smk_audit_info {
        struct smack_audit_data sad;
 #endif
 };
-/*
- * These functions are in smack_lsm.c
- */
-struct inode_smack *new_inode_smack(struct smack_known *);
 
 /*
  * These functions are in smack_access.c
@@ -267,7 +313,6 @@ extern struct smack_known *smack_syslog_label;
 #ifdef CONFIG_SECURITY_SMACK_BRINGUP
 extern struct smack_known *smack_unconfined;
 #endif
-extern struct smack_known smack_cipso_option;
 extern int smack_ptrace_rule;
 
 extern struct smack_known smack_known_floor;
@@ -279,7 +324,10 @@ extern struct smack_known smack_known_web;
 
 extern struct mutex    smack_known_lock;
 extern struct list_head smack_known_list;
-extern struct list_head smk_netlbladdr_list;
+extern struct list_head smk_net4addr_list;
+#if IS_ENABLED(CONFIG_IPV6)
+extern struct list_head smk_net6addr_list;
+#endif /* CONFIG_IPV6 */
 
 extern struct mutex     smack_onlycap_lock;
 extern struct list_head smack_onlycap_list;
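
With smk_net4addr storing a pre-masked network address plus the prefix length, matching a peer against an entry reduces to one AND and one compare, and keeping the list sorted longest-prefix-first makes the first hit the most specific match. A user-space flavoured sketch of that test (the standalone function is illustrative):

    #include <netinet/in.h>

    /* Nonzero when addr falls inside the entry's network. */
    static int net4_match(struct in_addr addr,
                          struct in_addr host, struct in_addr mask)
    {
            return (addr.s_addr & mask.s_addr) == host.s_addr;
    }
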
index 00f6b38bffbde4546c51c929f1ad7ffb8c669f28..bc1053fb5d1d062a165b0b91c6553428aab653cf 100644 (file)
@@ -639,6 +639,12 @@ int smack_privileged(int cap)
        struct smack_known *skp = smk_of_current();
        struct smack_onlycap *sop;
 
+       /*
+        * All kernel tasks are privileged
+        */
+       if (unlikely(current->flags & PF_KTHREAD))
+               return 1;
+
        if (!capable(cap))
                return 0;
 
index a143328f75ebb0ec20e06a523ee1480403670b4d..996c889564383a4725e57a769b24abb53225cd50 100644 (file)
@@ -41,6 +41,7 @@
 #include <linux/msg.h>
 #include <linux/shm.h>
 #include <linux/binfmts.h>
+#include <linux/parser.h>
 #include "smack.h"
 
 #define TRANS_TRUE     "TRUE"
 #define SMK_RECEIVING  1
 #define SMK_SENDING    2
 
-#if IS_ENABLED(CONFIG_IPV6) && !defined(CONFIG_SECURITY_SMACK_NETFILTER)
+#ifdef SMACK_IPV6_PORT_LABELING
 LIST_HEAD(smk_ipv6_port_list);
-#endif /* CONFIG_IPV6 && !CONFIG_SECURITY_SMACK_NETFILTER */
+#endif
 static struct kmem_cache *smack_inode_cache;
 int smack_enabled;
 
+static const match_table_t smk_mount_tokens = {
+       {Opt_fsdefault, SMK_FSDEFAULT "%s"},
+       {Opt_fsfloor, SMK_FSFLOOR "%s"},
+       {Opt_fshat, SMK_FSHAT "%s"},
+       {Opt_fsroot, SMK_FSROOT "%s"},
+       {Opt_fstransmute, SMK_FSTRANS "%s"},
+       {Opt_error, NULL},
+};
+
 #ifdef CONFIG_SECURITY_SMACK_BRINGUP
 static char *smk_bu_mess[] = {
        "Bringup Error",        /* Unused */
@@ -281,7 +291,7 @@ static struct smack_known *smk_fetch(const char *name, struct inode *ip,
  *
  * Returns the new blob or NULL if there's no memory available
  */
-struct inode_smack *new_inode_smack(struct smack_known *skp)
+static struct inode_smack *new_inode_smack(struct smack_known *skp)
 {
        struct inode_smack *isp;
 
@@ -577,76 +587,197 @@ static int smack_sb_copy_data(char *orig, char *smackopts)
 }
 
 /**
- * smack_sb_kern_mount - Smack specific mount processing
+ * smack_parse_opts_str - parse Smack specific mount options
+ * @options: mount options string
+ * @opts: where to store converted mount opts
+ *
+ * Returns 0 on success or -ENOMEM on error.
+ *
+ * converts Smack specific mount options to generic security option format
+ */
+static int smack_parse_opts_str(char *options,
+               struct security_mnt_opts *opts)
+{
+       char *p;
+       char *fsdefault = NULL;
+       char *fsfloor = NULL;
+       char *fshat = NULL;
+       char *fsroot = NULL;
+       char *fstransmute = NULL;
+       int rc = -ENOMEM;
+       int num_mnt_opts = 0;
+       int token;
+
+       opts->num_mnt_opts = 0;
+
+       if (!options)
+               return 0;
+
+       while ((p = strsep(&options, ",")) != NULL) {
+               substring_t args[MAX_OPT_ARGS];
+
+               if (!*p)
+                       continue;
+
+               token = match_token(p, smk_mount_tokens, args);
+
+               switch (token) {
+               case Opt_fsdefault:
+                       if (fsdefault)
+                               goto out_opt_err;
+                       fsdefault = match_strdup(&args[0]);
+                       if (!fsdefault)
+                               goto out_err;
+                       break;
+               case Opt_fsfloor:
+                       if (fsfloor)
+                               goto out_opt_err;
+                       fsfloor = match_strdup(&args[0]);
+                       if (!fsfloor)
+                               goto out_err;
+                       break;
+               case Opt_fshat:
+                       if (fshat)
+                               goto out_opt_err;
+                       fshat = match_strdup(&args[0]);
+                       if (!fshat)
+                               goto out_err;
+                       break;
+               case Opt_fsroot:
+                       if (fsroot)
+                               goto out_opt_err;
+                       fsroot = match_strdup(&args[0]);
+                       if (!fsroot)
+                               goto out_err;
+                       break;
+               case Opt_fstransmute:
+                       if (fstransmute)
+                               goto out_opt_err;
+                       fstransmute = match_strdup(&args[0]);
+                       if (!fstransmute)
+                               goto out_err;
+                       break;
+               default:
+                       rc = -EINVAL;
+                       pr_warn("Smack:  unknown mount option\n");
+                       goto out_err;
+               }
+       }
+
+       opts->mnt_opts = kcalloc(NUM_SMK_MNT_OPTS, sizeof(char *), GFP_ATOMIC);
+       if (!opts->mnt_opts)
+               goto out_err;
+
+       opts->mnt_opts_flags = kcalloc(NUM_SMK_MNT_OPTS, sizeof(int),
+                       GFP_ATOMIC);
+       if (!opts->mnt_opts_flags) {
+               kfree(opts->mnt_opts);
+               goto out_err;
+       }
+
+       if (fsdefault) {
+               opts->mnt_opts[num_mnt_opts] = fsdefault;
+               opts->mnt_opts_flags[num_mnt_opts++] = FSDEFAULT_MNT;
+       }
+       if (fsfloor) {
+               opts->mnt_opts[num_mnt_opts] = fsfloor;
+               opts->mnt_opts_flags[num_mnt_opts++] = FSFLOOR_MNT;
+       }
+       if (fshat) {
+               opts->mnt_opts[num_mnt_opts] = fshat;
+               opts->mnt_opts_flags[num_mnt_opts++] = FSHAT_MNT;
+       }
+       if (fsroot) {
+               opts->mnt_opts[num_mnt_opts] = fsroot;
+               opts->mnt_opts_flags[num_mnt_opts++] = FSROOT_MNT;
+       }
+       if (fstransmute) {
+               opts->mnt_opts[num_mnt_opts] = fstransmute;
+               opts->mnt_opts_flags[num_mnt_opts++] = FSTRANS_MNT;
+       }
+
+       opts->num_mnt_opts = num_mnt_opts;
+       return 0;
+
+out_opt_err:
+       rc = -EINVAL;
+       pr_warn("Smack: duplicate mount options\n");
+
+out_err:
+       kfree(fsdefault);
+       kfree(fsfloor);
+       kfree(fshat);
+       kfree(fsroot);
+       kfree(fstransmute);
+       return rc;
+}
+
+/**
+ * smack_set_mnt_opts - set Smack specific mount options
  * @sb: the file system superblock
- * @flags: the mount flags
- * @data: the smack mount options
+ * @opts: Smack mount options
+ * @kern_flags: mount option from kernel space or user space
+ * @set_kern_flags: where to store converted mount opts
  *
  * Returns 0 on success, an error code on failure
+ *
+ * Allow filesystems with binary mount data to explicitly set Smack mount
+ * labels.
  */
-static int smack_sb_kern_mount(struct super_block *sb, int flags, void *data)
+static int smack_set_mnt_opts(struct super_block *sb,
+               struct security_mnt_opts *opts,
+               unsigned long kern_flags,
+               unsigned long *set_kern_flags)
 {
        struct dentry *root = sb->s_root;
        struct inode *inode = d_backing_inode(root);
        struct superblock_smack *sp = sb->s_security;
        struct inode_smack *isp;
        struct smack_known *skp;
-       char *op;
-       char *commap;
+       int i;
+       int num_opts = opts->num_mnt_opts;
        int transmute = 0;
-       int specified = 0;
 
        if (sp->smk_initialized)
                return 0;
 
        sp->smk_initialized = 1;
 
-       for (op = data; op != NULL; op = commap) {
-               commap = strchr(op, ',');
-               if (commap != NULL)
-                       *commap++ = '\0';
-
-               if (strncmp(op, SMK_FSHAT, strlen(SMK_FSHAT)) == 0) {
-                       op += strlen(SMK_FSHAT);
-                       skp = smk_import_entry(op, 0);
+       for (i = 0; i < num_opts; i++) {
+               switch (opts->mnt_opts_flags[i]) {
+               case FSDEFAULT_MNT:
+                       skp = smk_import_entry(opts->mnt_opts[i], 0);
                        if (IS_ERR(skp))
                                return PTR_ERR(skp);
-                       sp->smk_hat = skp;
-                       specified = 1;
-
-               } else if (strncmp(op, SMK_FSFLOOR, strlen(SMK_FSFLOOR)) == 0) {
-                       op += strlen(SMK_FSFLOOR);
-                       skp = smk_import_entry(op, 0);
+                       sp->smk_default = skp;
+                       break;
+               case FSFLOOR_MNT:
+                       skp = smk_import_entry(opts->mnt_opts[i], 0);
                        if (IS_ERR(skp))
                                return PTR_ERR(skp);
                        sp->smk_floor = skp;
-                       specified = 1;
-
-               } else if (strncmp(op, SMK_FSDEFAULT,
-                                  strlen(SMK_FSDEFAULT)) == 0) {
-                       op += strlen(SMK_FSDEFAULT);
-                       skp = smk_import_entry(op, 0);
+                       break;
+               case FSHAT_MNT:
+                       skp = smk_import_entry(opts->mnt_opts[i], 0);
                        if (IS_ERR(skp))
                                return PTR_ERR(skp);
-                       sp->smk_default = skp;
-                       specified = 1;
-
-               } else if (strncmp(op, SMK_FSROOT, strlen(SMK_FSROOT)) == 0) {
-                       op += strlen(SMK_FSROOT);
-                       skp = smk_import_entry(op, 0);
+                       sp->smk_hat = skp;
+                       break;
+               case FSROOT_MNT:
+                       skp = smk_import_entry(opts->mnt_opts[i], 0);
                        if (IS_ERR(skp))
                                return PTR_ERR(skp);
                        sp->smk_root = skp;
-                       specified = 1;
-
-               } else if (strncmp(op, SMK_FSTRANS, strlen(SMK_FSTRANS)) == 0) {
-                       op += strlen(SMK_FSTRANS);
-                       skp = smk_import_entry(op, 0);
+                       break;
+               case FSTRANS_MNT:
+                       skp = smk_import_entry(opts->mnt_opts[i], 0);
                        if (IS_ERR(skp))
                                return PTR_ERR(skp);
                        sp->smk_root = skp;
                        transmute = 1;
-                       specified = 1;
+                       break;
+               default:
+                       break;
                }
        }
 
@@ -654,7 +785,7 @@ static int smack_sb_kern_mount(struct super_block *sb, int flags, void *data)
                /*
                 * Unprivileged mounts don't get to specify Smack values.
                 */
-               if (specified)
+               if (num_opts)
                        return -EPERM;
                /*
                 * Unprivileged mounts get root and default from the caller.
@@ -663,6 +794,7 @@ static int smack_sb_kern_mount(struct super_block *sb, int flags, void *data)
                sp->smk_root = skp;
                sp->smk_default = skp;
        }
+
        /*
         * Initialize the root inode.
         */
@@ -681,6 +813,37 @@ static int smack_sb_kern_mount(struct super_block *sb, int flags, void *data)
        return 0;
 }
 
+/**
+ * smack_sb_kern_mount - Smack specific mount processing
+ * @sb: the file system superblock
+ * @flags: the mount flags
+ * @data: the smack mount options
+ *
+ * Returns 0 on success, an error code on failure
+ */
+static int smack_sb_kern_mount(struct super_block *sb, int flags, void *data)
+{
+       int rc = 0;
+       char *options = data;
+       struct security_mnt_opts opts;
+
+       security_init_mnt_opts(&opts);
+
+       if (!options)
+               goto out;
+
+       rc = smack_parse_opts_str(options, &opts);
+       if (rc)
+               goto out_err;
+
+out:
+       rc = smack_set_mnt_opts(sb, &opts, 0, NULL);
+
+out_err:
+       security_free_mnt_opts(&opts);
+       return rc;
+}
+
 /**
  * smack_sb_statfs - Smack check on statfs
  * @dentry: identifies the file system in question
@@ -2113,7 +2276,7 @@ static void smack_sk_free_security(struct sock *sk)
 }
 
 /**
-* smack_host_label - check host based restrictions
+* smack_ipv4host_label - check host based restrictions
 * @sip: the object end
 *
 * looks for host based access restrictions
@@ -2124,30 +2287,96 @@ static void smack_sk_free_security(struct sock *sk)
 *
 * Returns the label of the far end or NULL if it's not special.
 */
-static struct smack_known *smack_host_label(struct sockaddr_in *sip)
+static struct smack_known *smack_ipv4host_label(struct sockaddr_in *sip)
 {
-       struct smk_netlbladdr *snp;
+       struct smk_net4addr *snp;
        struct in_addr *siap = &sip->sin_addr;
 
        if (siap->s_addr == 0)
                return NULL;
 
-       list_for_each_entry_rcu(snp, &smk_netlbladdr_list, list)
+       list_for_each_entry_rcu(snp, &smk_net4addr_list, list)
+               /*
+                * we break after finding the first match because
+                * the list is sorted from longest to shortest mask
+                * so we have found the most specific match
+                */
+               if (snp->smk_host.s_addr ==
+                   (siap->s_addr & snp->smk_mask.s_addr))
+                       return snp->smk_label;
+
+       return NULL;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+/*
+ * smk_ipv6_localhost - Check for local ipv6 host address
+ * @sip: the address
+ *
+ * Returns boolean true if this is the localhost address
+ */
+static bool smk_ipv6_localhost(struct sockaddr_in6 *sip)
+{
+       __be16 *be16p = (__be16 *)&sip->sin6_addr;
+       __be32 *be32p = (__be32 *)&sip->sin6_addr;
+
+       if (be32p[0] == 0 && be32p[1] == 0 && be32p[2] == 0 && be16p[6] == 0 &&
+           ntohs(be16p[7]) == 1)
+               return true;
+       return false;
+}
+
+/**
+* smack_ipv6host_label - check host based restrictions
+* @sip: the object end
+*
+* looks for host based access restrictions
+*
+* This version will only be appropriate for really small sets of single label
+* hosts.  The caller is responsible for ensuring that the RCU read lock is
+* taken before calling this function.
+*
+* Returns the label of the far end or NULL if it's not special.
+*/
+static struct smack_known *smack_ipv6host_label(struct sockaddr_in6 *sip)
+{
+       struct smk_net6addr *snp;
+       struct in6_addr *sap = &sip->sin6_addr;
+       int i;
+       int found = 0;
+
+       /*
+        * It's local. Don't look for a host label.
+        */
+       if (smk_ipv6_localhost(sip))
+               return NULL;
+
+       list_for_each_entry_rcu(snp, &smk_net6addr_list, list) {
                /*
                * we break after finding the first match because
                * the list is sorted from longest to shortest mask
                * so we have found the most specific match
                */
-               if ((&snp->smk_host.sin_addr)->s_addr ==
-                   (siap->s_addr & (&snp->smk_mask)->s_addr)) {
-                       /* we have found the special CIPSO option */
-                       if (snp->smk_label == &smack_cipso_option)
-                               return NULL;
-                       return snp->smk_label;
+               for (found = 1, i = 0; i < 8; i++) {
+                       /*
+                        * If the label is NULL the entry has
+                        * been renounced. Ignore it.
+                        */
+                       if (snp->smk_label == NULL)
+                               continue;
+                       if ((sap->s6_addr16[i] & snp->smk_mask.s6_addr16[i]) !=
+                           snp->smk_host.s6_addr16[i]) {
+                               found = 0;
+                               break;
+                       }
                }
+               if (found)
+                       return snp->smk_label;
+       }
 
        return NULL;
 }
+#endif /* CONFIG_IPV6 */
 
 /**
  * smack_netlabel - Set the secattr on a socket
@@ -2211,7 +2440,7 @@ static int smack_netlabel_send(struct sock *sk, struct sockaddr_in *sap)
        struct smk_audit_info ad;
 
        rcu_read_lock();
-       hkp = smack_host_label(sap);
+       hkp = smack_ipv4host_label(sap);
        if (hkp != NULL) {
 #ifdef CONFIG_AUDIT
                struct lsm_network_audit net;
@@ -2236,7 +2465,42 @@ static int smack_netlabel_send(struct sock *sk, struct sockaddr_in *sap)
        return smack_netlabel(sk, sk_lbl);
 }
 
-#if IS_ENABLED(CONFIG_IPV6) && !defined(CONFIG_SECURITY_SMACK_NETFILTER)
+#if IS_ENABLED(CONFIG_IPV6)
+/**
+ * smk_ipv6_check - check Smack access
+ * @subject: subject Smack label
+ * @object: object Smack label
+ * @address: address
+ * @act: the action being taken
+ *
+ * Check an IPv6 access
+ */
+static int smk_ipv6_check(struct smack_known *subject,
+                               struct smack_known *object,
+                               struct sockaddr_in6 *address, int act)
+{
+#ifdef CONFIG_AUDIT
+       struct lsm_network_audit net;
+#endif
+       struct smk_audit_info ad;
+       int rc;
+
+#ifdef CONFIG_AUDIT
+       smk_ad_init_net(&ad, __func__, LSM_AUDIT_DATA_NET, &net);
+       ad.a.u.net->family = PF_INET6;
+       ad.a.u.net->dport = ntohs(address->sin6_port);
+       if (act == SMK_RECEIVING)
+               ad.a.u.net->v6info.saddr = address->sin6_addr;
+       else
+               ad.a.u.net->v6info.daddr = address->sin6_addr;
+#endif
+       rc = smk_access(subject, object, MAY_WRITE, &ad);
+       rc = smk_bu_note("IPv6 check", subject, object, MAY_WRITE, rc);
+       return rc;
+}
+#endif /* CONFIG_IPV6 */
+
+#ifdef SMACK_IPV6_PORT_LABELING
 /**
  * smk_ipv6_port_label - Smack port access table management
  * @sock: socket
@@ -2320,48 +2584,43 @@ static void smk_ipv6_port_label(struct socket *sock, struct sockaddr *address)
 static int smk_ipv6_port_check(struct sock *sk, struct sockaddr_in6 *address,
                                int act)
 {
-       __be16 *bep;
-       __be32 *be32p;
        struct smk_port_label *spp;
        struct socket_smack *ssp = sk->sk_security;
-       struct smack_known *skp;
-       unsigned short port = 0;
+       struct smack_known *skp = NULL;
+       unsigned short port;
        struct smack_known *object;
-       struct smk_audit_info ad;
-       int rc;
-#ifdef CONFIG_AUDIT
-       struct lsm_network_audit net;
-#endif
 
        if (act == SMK_RECEIVING) {
-               skp = smack_net_ambient;
+               skp = smack_ipv6host_label(address);
                object = ssp->smk_in;
        } else {
                skp = ssp->smk_out;
-               object = smack_net_ambient;
+               object = smack_ipv6host_label(address);
        }
 
        /*
-        * Get the IP address and port from the address.
+        * The other end is a single label host.
         */
-       port = ntohs(address->sin6_port);
-       bep = (__be16 *)(&address->sin6_addr);
-       be32p = (__be32 *)(&address->sin6_addr);
+       if (skp != NULL && object != NULL)
+               return smk_ipv6_check(skp, object, address, act);
+       if (skp == NULL)
+               skp = smack_net_ambient;
+       if (object == NULL)
+               object = smack_net_ambient;
 
        /*
         * It's remote, so port lookup does no good.
         */
-       if (be32p[0] || be32p[1] || be32p[2] || bep[6] || ntohs(bep[7]) != 1)
-               goto auditout;
+       if (!smk_ipv6_localhost(address))
+               return smk_ipv6_check(skp, object, address, act);
 
        /*
         * It's local so the send check has to have passed.
         */
-       if (act == SMK_RECEIVING) {
-               skp = &smack_known_web;
-               goto auditout;
-       }
+       if (act == SMK_RECEIVING)
+               return 0;
 
+       port = ntohs(address->sin6_port);
        list_for_each_entry(spp, &smk_ipv6_port_list, list) {
                if (spp->smk_port != port)
                        continue;
@@ -2371,22 +2630,9 @@ static int smk_ipv6_port_check(struct sock *sk, struct sockaddr_in6 *address,
                break;
        }
 
-auditout:
-
-#ifdef CONFIG_AUDIT
-       smk_ad_init_net(&ad, __func__, LSM_AUDIT_DATA_NET, &net);
-       ad.a.u.net->family = sk->sk_family;
-       ad.a.u.net->dport = port;
-       if (act == SMK_RECEIVING)
-               ad.a.u.net->v6info.saddr = address->sin6_addr;
-       else
-               ad.a.u.net->v6info.daddr = address->sin6_addr;
-#endif
-       rc = smk_access(skp, object, MAY_WRITE, &ad);
-       rc = smk_bu_note("IPv6 port check", skp, object, MAY_WRITE, rc);
-       return rc;
+       return smk_ipv6_check(skp, object, address, act);
 }
-#endif /* CONFIG_IPV6 && !CONFIG_SECURITY_SMACK_NETFILTER */
+#endif /* SMACK_IPV6_PORT_LABELING */
 
 /**
  * smack_inode_setsecurity - set smack xattrs
@@ -2447,10 +2693,10 @@ static int smack_inode_setsecurity(struct inode *inode, const char *name,
        } else
                return -EOPNOTSUPP;
 
-#if IS_ENABLED(CONFIG_IPV6) && !defined(CONFIG_SECURITY_SMACK_NETFILTER)
+#ifdef SMACK_IPV6_PORT_LABELING
        if (sock->sk->sk_family == PF_INET6)
                smk_ipv6_port_label(sock, NULL);
-#endif /* CONFIG_IPV6 && !CONFIG_SECURITY_SMACK_NETFILTER */
+#endif
 
        return 0;
 }
@@ -2492,7 +2738,7 @@ static int smack_socket_post_create(struct socket *sock, int family,
        return smack_netlabel(sock->sk, SMACK_CIPSO_SOCKET);
 }
 
-#ifndef CONFIG_SECURITY_SMACK_NETFILTER
+#ifdef SMACK_IPV6_PORT_LABELING
 /**
  * smack_socket_bind - record port binding information.
  * @sock: the socket
@@ -2506,14 +2752,11 @@ static int smack_socket_post_create(struct socket *sock, int family,
 static int smack_socket_bind(struct socket *sock, struct sockaddr *address,
                                int addrlen)
 {
-#if IS_ENABLED(CONFIG_IPV6)
        if (sock->sk != NULL && sock->sk->sk_family == PF_INET6)
                smk_ipv6_port_label(sock, address);
-#endif
-
        return 0;
 }
-#endif /* !CONFIG_SECURITY_SMACK_NETFILTER */
+#endif /* SMACK_IPV6_PORT_LABELING */
 
 /**
  * smack_socket_connect - connect access check
@@ -2529,6 +2772,13 @@ static int smack_socket_connect(struct socket *sock, struct sockaddr *sap,
                                int addrlen)
 {
        int rc = 0;
+#if IS_ENABLED(CONFIG_IPV6)
+       struct sockaddr_in6 *sip = (struct sockaddr_in6 *)sap;
+#endif
+#ifdef SMACK_IPV6_SECMARK_LABELING
+       struct smack_known *rsp;
+       struct socket_smack *ssp = sock->sk->sk_security;
+#endif
 
        if (sock->sk == NULL)
                return 0;
@@ -2542,10 +2792,15 @@ static int smack_socket_connect(struct socket *sock, struct sockaddr *sap,
        case PF_INET6:
                if (addrlen < sizeof(struct sockaddr_in6))
                        return -EINVAL;
-#if IS_ENABLED(CONFIG_IPV6) && !defined(CONFIG_SECURITY_SMACK_NETFILTER)
-               rc = smk_ipv6_port_check(sock->sk, (struct sockaddr_in6 *)sap,
+#ifdef SMACK_IPV6_SECMARK_LABELING
+               rsp = smack_ipv6host_label(sip);
+               if (rsp != NULL)
+                       rc = smk_ipv6_check(ssp->smk_out, rsp, sip,
                                                SMK_CONNECTING);
-#endif /* CONFIG_IPV6 && !CONFIG_SECURITY_SMACK_NETFILTER */
+#endif
+#ifdef SMACK_IPV6_PORT_LABELING
+               rc = smk_ipv6_port_check(sock->sk, sip, SMK_CONNECTING);
+#endif
                break;
        }
        return rc;
@@ -3431,9 +3686,13 @@ static int smack_socket_sendmsg(struct socket *sock, struct msghdr *msg,
                                int size)
 {
        struct sockaddr_in *sip = (struct sockaddr_in *) msg->msg_name;
-#if IS_ENABLED(CONFIG_IPV6) && !defined(CONFIG_SECURITY_SMACK_NETFILTER)
+#if IS_ENABLED(CONFIG_IPV6)
        struct sockaddr_in6 *sap = (struct sockaddr_in6 *) msg->msg_name;
-#endif /* CONFIG_IPV6 && !CONFIG_SECURITY_SMACK_NETFILTER */
+#endif
+#ifdef SMACK_IPV6_SECMARK_LABELING
+       struct socket_smack *ssp = sock->sk->sk_security;
+       struct smack_known *rsp;
+#endif
        int rc = 0;
 
        /*
@@ -3447,9 +3706,15 @@ static int smack_socket_sendmsg(struct socket *sock, struct msghdr *msg,
                rc = smack_netlabel_send(sock->sk, sip);
                break;
        case AF_INET6:
-#if IS_ENABLED(CONFIG_IPV6) && !defined(CONFIG_SECURITY_SMACK_NETFILTER)
+#ifdef SMACK_IPV6_SECMARK_LABELING
+               rsp = smack_ipv6host_label(sap);
+               if (rsp != NULL)
+                       rc = smk_ipv6_check(ssp->smk_out, rsp, sap,
+                                               SMK_CONNECTING);
+#endif
+#ifdef SMACK_IPV6_PORT_LABELING
                rc = smk_ipv6_port_check(sock->sk, sap, SMK_SENDING);
-#endif /* CONFIG_IPV6 && !CONFIG_SECURITY_SMACK_NETFILTER */
+#endif
                break;
        }
        return rc;
@@ -3663,10 +3928,12 @@ access_check:
                proto = smk_skb_to_addr_ipv6(skb, &sadd);
                if (proto != IPPROTO_UDP && proto != IPPROTO_TCP)
                        break;
-#ifdef CONFIG_SECURITY_SMACK_NETFILTER
+#ifdef SMACK_IPV6_SECMARK_LABELING
                if (skb && skb->secmark != 0)
                        skp = smack_from_secid(skb->secmark);
                else
+                       skp = smack_ipv6host_label(&sadd);
+               if (skp == NULL)
                        skp = smack_net_ambient;
 #ifdef CONFIG_AUDIT
                smk_ad_init_net(&ad, __func__, LSM_AUDIT_DATA_NET, &net);
@@ -3677,9 +3944,10 @@ access_check:
                rc = smk_access(skp, ssp->smk_in, MAY_WRITE, &ad);
                rc = smk_bu_note("IPv6 delivery", skp, ssp->smk_in,
                                        MAY_WRITE, rc);
-#else /* CONFIG_SECURITY_SMACK_NETFILTER */
+#endif /* SMACK_IPV6_SECMARK_LABELING */
+#ifdef SMACK_IPV6_PORT_LABELING
                rc = smk_ipv6_port_check(sk, &sadd, SMK_RECEIVING);
-#endif /* CONFIG_SECURITY_SMACK_NETFILTER */
+#endif /* SMACK_IPV6_PORT_LABELING */
                break;
 #endif /* CONFIG_IPV6 */
        }
@@ -3777,13 +4045,11 @@ static int smack_socket_getpeersec_dgram(struct socket *sock,
                }
                netlbl_secattr_destroy(&secattr);
                break;
-#if IS_ENABLED(CONFIG_IPV6)
        case PF_INET6:
-#ifdef CONFIG_SECURITY_SMACK_NETFILTER
+#ifdef SMACK_IPV6_SECMARK_LABELING
                s = skb->secmark;
-#endif /* CONFIG_SECURITY_SMACK_NETFILTER */
+#endif
                break;
-#endif /* CONFIG_IPV6 */
        }
        *secid = s;
        if (s == 0)
@@ -3906,7 +4172,7 @@ access_check:
        hdr = ip_hdr(skb);
        addr.sin_addr.s_addr = hdr->saddr;
        rcu_read_lock();
-       hskp = smack_host_label(&addr);
+       hskp = smack_ipv4host_label(&addr);
        rcu_read_unlock();
 
        if (hskp == NULL)
@@ -4254,7 +4520,7 @@ static int smack_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen)
        return 0;
 }
 
-struct security_hook_list smack_hooks[] = {
+static struct security_hook_list smack_hooks[] = {
        LSM_HOOK_INIT(ptrace_access_check, smack_ptrace_access_check),
        LSM_HOOK_INIT(ptrace_traceme, smack_ptrace_traceme),
        LSM_HOOK_INIT(syslog, smack_syslog),
@@ -4264,6 +4530,8 @@ struct security_hook_list smack_hooks[] = {
        LSM_HOOK_INIT(sb_copy_data, smack_sb_copy_data),
        LSM_HOOK_INIT(sb_kern_mount, smack_sb_kern_mount),
        LSM_HOOK_INIT(sb_statfs, smack_sb_statfs),
+       LSM_HOOK_INIT(sb_set_mnt_opts, smack_set_mnt_opts),
+       LSM_HOOK_INIT(sb_parse_opts_str, smack_parse_opts_str),
 
        LSM_HOOK_INIT(bprm_set_creds, smack_bprm_set_creds),
        LSM_HOOK_INIT(bprm_committing_creds, smack_bprm_committing_creds),
@@ -4356,9 +4624,9 @@ struct security_hook_list smack_hooks[] = {
        LSM_HOOK_INIT(unix_may_send, smack_unix_may_send),
 
        LSM_HOOK_INIT(socket_post_create, smack_socket_post_create),
-#ifndef CONFIG_SECURITY_SMACK_NETFILTER
+#ifdef SMACK_IPV6_PORT_LABELING
        LSM_HOOK_INIT(socket_bind, smack_socket_bind),
-#endif /* CONFIG_SECURITY_SMACK_NETFILTER */
+#endif
        LSM_HOOK_INIT(socket_connect, smack_socket_connect),
        LSM_HOOK_INIT(socket_sendmsg, smack_socket_sendmsg),
        LSM_HOOK_INIT(socket_sock_rcv_skb, smack_socket_sock_rcv_skb),
@@ -4453,7 +4721,16 @@ static __init int smack_init(void)
                return -ENOMEM;
        }
 
-       printk(KERN_INFO "Smack:  Initializing.\n");
+       pr_info("Smack:  Initializing.\n");
+#ifdef CONFIG_SECURITY_SMACK_NETFILTER
+       pr_info("Smack:  Netfilter enabled.\n");
+#endif
+#ifdef SMACK_IPV6_PORT_LABELING
+       pr_info("Smack:  IPv6 port labeling enabled.\n");
+#endif
+#ifdef SMACK_IPV6_SECMARK_LABELING
+       pr_info("Smack:  IPv6 Netfilter enabled.\n");
+#endif
 
        /*
         * Set the security state for the initial task.
index 2716d02119f3e80634aedcc39d48feed197c0bee..c20b154a33f22f9eba932268d733b41110b5a038 100644 (file)
@@ -29,6 +29,7 @@
 #include <linux/magic.h>
 #include "smack.h"
 
+#define BEBITS (sizeof(__be32) * 8)
 /*
  * smackfs pseudo filesystem.
  */
@@ -40,7 +41,7 @@ enum smk_inos {
        SMK_DOI         = 5,    /* CIPSO DOI */
        SMK_DIRECT      = 6,    /* CIPSO level indicating direct label */
        SMK_AMBIENT     = 7,    /* internet ambient label */
-       SMK_NETLBLADDR  = 8,    /* single label hosts */
+       SMK_NET4ADDR    = 8,    /* single label hosts */
        SMK_ONLYCAP     = 9,    /* the only "capable" label */
        SMK_LOGGING     = 10,   /* logging */
        SMK_LOAD_SELF   = 11,   /* task specific rules */
@@ -57,6 +58,9 @@ enum smk_inos {
 #ifdef CONFIG_SECURITY_SMACK_BRINGUP
        SMK_UNCONFINED  = 22,   /* define an unconfined label */
 #endif
+#if IS_ENABLED(CONFIG_IPV6)
+       SMK_NET6ADDR    = 23,   /* single label IPv6 hosts */
+#endif /* CONFIG_IPV6 */
 };
 
 /*
@@ -64,7 +68,10 @@ enum smk_inos {
  */
 static DEFINE_MUTEX(smack_cipso_lock);
 static DEFINE_MUTEX(smack_ambient_lock);
-static DEFINE_MUTEX(smk_netlbladdr_lock);
+static DEFINE_MUTEX(smk_net4addr_lock);
+#if IS_ENABLED(CONFIG_IPV6)
+static DEFINE_MUTEX(smk_net6addr_lock);
+#endif /* CONFIG_IPV6 */
 
 /*
  * This is the "ambient" label for network traffic.
@@ -118,7 +125,10 @@ int smack_ptrace_rule = SMACK_PTRACE_DEFAULT;
  * can write to the specified label.
  */
 
-LIST_HEAD(smk_netlbladdr_list);
+LIST_HEAD(smk_net4addr_list);
+#if IS_ENABLED(CONFIG_IPV6)
+LIST_HEAD(smk_net6addr_list);
+#endif /* CONFIG_IPV6 */
 
 /*
  * Rule lists are maintained for each label.
@@ -129,7 +139,7 @@ struct smack_master_list {
        struct smack_rule       *smk_rule;
 };
 
-LIST_HEAD(smack_rule_list);
+static LIST_HEAD(smack_rule_list);
 
 struct smack_parsed_rule {
        struct smack_known      *smk_subject;
@@ -140,11 +150,6 @@ struct smack_parsed_rule {
 
 static int smk_cipso_doi_value = SMACK_CIPSO_DOI_DEFAULT;
 
-struct smack_known smack_cipso_option = {
-       .smk_known      = SMACK_CIPSO_OPTION,
-       .smk_secid      = 0,
-};
-
 /*
  * Values for parsing cipso rules
  * SMK_DIGITLEN: Length of a digit field in a rule.
@@ -1047,92 +1052,90 @@ static const struct file_operations smk_cipso2_ops = {
  * Seq_file read operations for /smack/netlabel
  */
 
-static void *netlbladdr_seq_start(struct seq_file *s, loff_t *pos)
+static void *net4addr_seq_start(struct seq_file *s, loff_t *pos)
 {
-       return smk_seq_start(s, pos, &smk_netlbladdr_list);
+       return smk_seq_start(s, pos, &smk_net4addr_list);
 }
 
-static void *netlbladdr_seq_next(struct seq_file *s, void *v, loff_t *pos)
+static void *net4addr_seq_next(struct seq_file *s, void *v, loff_t *pos)
 {
-       return smk_seq_next(s, v, pos, &smk_netlbladdr_list);
+       return smk_seq_next(s, v, pos, &smk_net4addr_list);
 }
-#define BEBITS (sizeof(__be32) * 8)
 
 /*
  * Print host/label pairs
  */
-static int netlbladdr_seq_show(struct seq_file *s, void *v)
+static int net4addr_seq_show(struct seq_file *s, void *v)
 {
        struct list_head *list = v;
-       struct smk_netlbladdr *skp =
-                       list_entry_rcu(list, struct smk_netlbladdr, list);
-       unsigned char *hp = (char *) &skp->smk_host.sin_addr.s_addr;
-       int maskn;
-       u32 temp_mask = be32_to_cpu(skp->smk_mask.s_addr);
-
-       for (maskn = 0; temp_mask; temp_mask <<= 1, maskn++);
+       struct smk_net4addr *skp =
+                       list_entry_rcu(list, struct smk_net4addr, list);
+       char *kp = SMACK_CIPSO_OPTION;
 
-       seq_printf(s, "%u.%u.%u.%u/%d %s\n",
-               hp[0], hp[1], hp[2], hp[3], maskn, skp->smk_label->smk_known);
+       if (skp->smk_label != NULL)
+               kp = skp->smk_label->smk_known;
+       seq_printf(s, "%pI4/%d %s\n", &skp->smk_host.s_addr,
+                       skp->smk_masks, kp);
 
        return 0;
 }
 
-static const struct seq_operations netlbladdr_seq_ops = {
-       .start = netlbladdr_seq_start,
-       .next  = netlbladdr_seq_next,
-       .show  = netlbladdr_seq_show,
+static const struct seq_operations net4addr_seq_ops = {
+       .start = net4addr_seq_start,
+       .next  = net4addr_seq_next,
+       .show  = net4addr_seq_show,
        .stop  = smk_seq_stop,
 };
 
 /**
- * smk_open_netlbladdr - open() for /smack/netlabel
+ * smk_open_net4addr - open() for /smack/netlabel
  * @inode: inode structure representing file
  * @file: "netlabel" file pointer
  *
- * Connect our netlbladdr_seq_* operations with /smack/netlabel
+ * Connect our net4addr_seq_* operations with /smack/netlabel
  * file_operations
  */
-static int smk_open_netlbladdr(struct inode *inode, struct file *file)
+static int smk_open_net4addr(struct inode *inode, struct file *file)
 {
-       return seq_open(file, &netlbladdr_seq_ops);
+       return seq_open(file, &net4addr_seq_ops);
 }
 
 /**
- * smk_netlbladdr_insert
+ * smk_net4addr_insert
  * @new : netlabel to insert
  *
- * This helper insert netlabel in the smack_netlbladdrs list
+ * This helper insert netlabel in the smack_net4addrs list
  * sorted by netmask length (longest to smallest)
- * locked by &smk_netlbladdr_lock in smk_write_netlbladdr
+ * locked by &smk_net4addr_lock in smk_write_net4addr
  *
  */
-static void smk_netlbladdr_insert(struct smk_netlbladdr *new)
+static void smk_net4addr_insert(struct smk_net4addr *new)
 {
-       struct smk_netlbladdr *m, *m_next;
+       struct smk_net4addr *m;
+       struct smk_net4addr *m_next;
 
-       if (list_empty(&smk_netlbladdr_list)) {
-               list_add_rcu(&new->list, &smk_netlbladdr_list);
+       if (list_empty(&smk_net4addr_list)) {
+               list_add_rcu(&new->list, &smk_net4addr_list);
                return;
        }
 
-       m = list_entry_rcu(smk_netlbladdr_list.next,
-                          struct smk_netlbladdr, list);
+       m = list_entry_rcu(smk_net4addr_list.next,
+                          struct smk_net4addr, list);
 
        /* the comparison '>' is a bit hacky, but works */
-       if (new->smk_mask.s_addr > m->smk_mask.s_addr) {
-               list_add_rcu(&new->list, &smk_netlbladdr_list);
+       if (new->smk_masks > m->smk_masks) {
+               list_add_rcu(&new->list, &smk_net4addr_list);
                return;
        }
 
-       list_for_each_entry_rcu(m, &smk_netlbladdr_list, list) {
-               if (list_is_last(&m->list, &smk_netlbladdr_list)) {
+       list_for_each_entry_rcu(m, &smk_net4addr_list, list) {
+               if (list_is_last(&m->list, &smk_net4addr_list)) {
                        list_add_rcu(&new->list, &m->list);
                        return;
                }
                m_next = list_entry_rcu(m->list.next,
-                                       struct smk_netlbladdr, list);
-               if (new->smk_mask.s_addr > m_next->smk_mask.s_addr) {
+                                       struct smk_net4addr, list);
+               if (new->smk_masks > m_next->smk_masks) {
                        list_add_rcu(&new->list, &m->list);
                        return;
                }
@@ -1141,28 +1144,29 @@ static void smk_netlbladdr_insert(struct smk_netlbladdr *new)
 
 
 /**
- * smk_write_netlbladdr - write() for /smack/netlabel
+ * smk_write_net4addr - write() for /smack/netlabel
  * @file: file pointer, not actually used
  * @buf: where to get the data from
  * @count: bytes sent
  * @ppos: where to start
  *
- * Accepts only one netlbladdr per write call.
+ * Accepts only one net4addr per write call.
  * Returns number of bytes written or error code, as appropriate
  */
-static ssize_t smk_write_netlbladdr(struct file *file, const char __user *buf,
+static ssize_t smk_write_net4addr(struct file *file, const char __user *buf,
                                size_t count, loff_t *ppos)
 {
-       struct smk_netlbladdr *snp;
+       struct smk_net4addr *snp;
        struct sockaddr_in newname;
        char *smack;
-       struct smack_known *skp;
+       struct smack_known *skp = NULL;
        char *data;
        char *host = (char *)&newname.sin_addr.s_addr;
        int rc;
        struct netlbl_audit audit_info;
        struct in_addr mask;
        unsigned int m;
+       unsigned int masks;
        int found;
        u32 mask_bits = (1<<31);
        __be32 nsa;
@@ -1200,7 +1204,7 @@ static ssize_t smk_write_netlbladdr(struct file *file, const char __user *buf,
        data[count] = '\0';
 
        rc = sscanf(data, "%hhd.%hhd.%hhd.%hhd/%u %s",
-               &host[0], &host[1], &host[2], &host[3], &m, smack);
+               &host[0], &host[1], &host[2], &host[3], &masks, smack);
        if (rc != 6) {
                rc = sscanf(data, "%hhd.%hhd.%hhd.%hhd %s",
                        &host[0], &host[1], &host[2], &host[3], smack);
@@ -1209,8 +1213,9 @@ static ssize_t smk_write_netlbladdr(struct file *file, const char __user *buf,
                        goto free_out;
                }
                m = BEBITS;
+               masks = 32;
        }
-       if (m > BEBITS) {
+       if (masks > BEBITS) {
                rc = -EINVAL;
                goto free_out;
        }
@@ -1225,16 +1230,16 @@ static ssize_t smk_write_netlbladdr(struct file *file, const char __user *buf,
                        goto free_out;
                }
        } else {
-               /* check known options */
-               if (strcmp(smack, smack_cipso_option.smk_known) == 0)
-                       skp = &smack_cipso_option;
-               else {
+               /*
+                * Only the -CIPSO option is supported for IPv4
+                */
+               if (strcmp(smack, SMACK_CIPSO_OPTION) != 0) {
                        rc = -EINVAL;
                        goto free_out;
                }
        }
 
-       for (temp_mask = 0; m > 0; m--) {
+       for (m = masks, temp_mask = 0; m > 0; m--) {
                temp_mask |= mask_bits;
                mask_bits >>= 1;
        }
@@ -1245,14 +1250,13 @@ static ssize_t smk_write_netlbladdr(struct file *file, const char __user *buf,
         * Only allow one writer at a time. Writes should be
         * quite rare and small in any case.
         */
-       mutex_lock(&smk_netlbladdr_lock);
+       mutex_lock(&smk_net4addr_lock);
 
        nsa = newname.sin_addr.s_addr;
        /* try to find if the prefix is already in the list */
        found = 0;
-       list_for_each_entry_rcu(snp, &smk_netlbladdr_list, list) {
-               if (snp->smk_host.sin_addr.s_addr == nsa &&
-                   snp->smk_mask.s_addr == mask.s_addr) {
+       list_for_each_entry_rcu(snp, &smk_net4addr_list, list) {
+               if (snp->smk_host.s_addr == nsa && snp->smk_masks == masks) {
                        found = 1;
                        break;
                }
@@ -1265,17 +1269,20 @@ static ssize_t smk_write_netlbladdr(struct file *file, const char __user *buf,
                        rc = -ENOMEM;
                else {
                        rc = 0;
-                       snp->smk_host.sin_addr.s_addr = newname.sin_addr.s_addr;
+                       snp->smk_host.s_addr = newname.sin_addr.s_addr;
                        snp->smk_mask.s_addr = mask.s_addr;
                        snp->smk_label = skp;
-                       smk_netlbladdr_insert(snp);
+                       snp->smk_masks = masks;
+                       smk_net4addr_insert(snp);
                }
        } else {
-               /* we delete the unlabeled entry, only if the previous label
-                * wasn't the special CIPSO option */
-               if (snp->smk_label != &smack_cipso_option)
+               /*
+                * Delete the unlabeled entry, only if the previous label
+                * wasn't the special CIPSO option
+                */
+               if (snp->smk_label != NULL)
                        rc = netlbl_cfg_unlbl_static_del(&init_net, NULL,
-                                       &snp->smk_host.sin_addr, &snp->smk_mask,
+                                       &snp->smk_host, &snp->smk_mask,
                                        PF_INET, &audit_info);
                else
                        rc = 0;
@@ -1287,15 +1294,15 @@ static ssize_t smk_write_netlbladdr(struct file *file, const char __user *buf,
         * this host so that incoming packets get labeled.
         * but only if we didn't get the special CIPSO option
         */
-       if (rc == 0 && skp != &smack_cipso_option)
+       if (rc == 0 && skp != NULL)
                rc = netlbl_cfg_unlbl_static_add(&init_net, NULL,
-                       &snp->smk_host.sin_addr, &snp->smk_mask, PF_INET,
+                       &snp->smk_host, &snp->smk_mask, PF_INET,
                        snp->smk_label->smk_secid, &audit_info);
 
        if (rc == 0)
                rc = count;
 
-       mutex_unlock(&smk_netlbladdr_lock);
+       mutex_unlock(&smk_net4addr_lock);
 
 free_out:
        kfree(smack);
@@ -1305,14 +1312,279 @@ free_data_out:
        return rc;
 }
 
-static const struct file_operations smk_netlbladdr_ops = {
-       .open           = smk_open_netlbladdr,
+static const struct file_operations smk_net4addr_ops = {
+       .open           = smk_open_net4addr,
        .read           = seq_read,
        .llseek         = seq_lseek,
-       .write          = smk_write_netlbladdr,
+       .write          = smk_write_net4addr,
        .release        = seq_release,
 };
 
+#if IS_ENABLED(CONFIG_IPV6)
+/*
+ * Seq_file read operations for /smack/netlabel6
+ */
+
+static void *net6addr_seq_start(struct seq_file *s, loff_t *pos)
+{
+       return smk_seq_start(s, pos, &smk_net6addr_list);
+}
+
+static void *net6addr_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+       return smk_seq_next(s, v, pos, &smk_net6addr_list);
+}
+
+/*
+ * Print host/label pairs
+ */
+static int net6addr_seq_show(struct seq_file *s, void *v)
+{
+       struct list_head *list = v;
+       struct smk_net6addr *skp =
+                        list_entry(list, struct smk_net6addr, list);
+
+       if (skp->smk_label != NULL)
+               seq_printf(s, "%pI6/%d %s\n", &skp->smk_host, skp->smk_masks,
+                               skp->smk_label->smk_known);
+
+       return 0;
+}
+
+static const struct seq_operations net6addr_seq_ops = {
+       .start = net6addr_seq_start,
+       .next  = net6addr_seq_next,
+       .show  = net6addr_seq_show,
+       .stop  = smk_seq_stop,
+};
+
+/**
+ * smk_open_net6addr - open() for /smack/netlabel
+ * @inode: inode structure representing file
+ * @file: "netlabel" file pointer
+ *
+ * Connect our net6addr_seq_* operations with /smack/netlabel
+ * file_operations
+ */
+static int smk_open_net6addr(struct inode *inode, struct file *file)
+{
+       return seq_open(file, &net6addr_seq_ops);
+}
+
+/**
+ * smk_net6addr_insert
+ * @new : entry to insert
+ *
+ * This inserts an entry in the smack_net6addrs list
+ * sorted by netmask length (longest to smallest)
+ * locked by &smk_net6addr_lock in smk_write_net6addr
+ *
+ */
+static void smk_net6addr_insert(struct smk_net6addr *new)
+{
+       struct smk_net6addr *m_next;
+       struct smk_net6addr *m;
+
+       if (list_empty(&smk_net6addr_list)) {
+               list_add_rcu(&new->list, &smk_net6addr_list);
+               return;
+       }
+
+       m = list_entry_rcu(smk_net6addr_list.next,
+                          struct smk_net6addr, list);
+
+       if (new->smk_masks > m->smk_masks) {
+               list_add_rcu(&new->list, &smk_net6addr_list);
+               return;
+       }
+
+       list_for_each_entry_rcu(m, &smk_net6addr_list, list) {
+               if (list_is_last(&m->list, &smk_net6addr_list)) {
+                       list_add_rcu(&new->list, &m->list);
+                       return;
+               }
+               m_next = list_entry_rcu(m->list.next,
+                                       struct smk_net6addr, list);
+               if (new->smk_masks > m_next->smk_masks) {
+                       list_add_rcu(&new->list, &m->list);
+                       return;
+               }
+       }
+}
+
+
+/**
+ * smk_write_net6addr - write() for /smack/netlabel
+ * @file: file pointer, not actually used
+ * @buf: where to get the data from
+ * @count: bytes sent
+ * @ppos: where to start
+ *
+ * Accepts only one net6addr per write call.
+ * Returns number of bytes written or error code, as appropriate
+ */
+static ssize_t smk_write_net6addr(struct file *file, const char __user *buf,
+                               size_t count, loff_t *ppos)
+{
+       struct smk_net6addr *snp;
+       struct in6_addr newname;
+       struct in6_addr fullmask;
+       struct smack_known *skp = NULL;
+       char *smack;
+       char *data;
+       int rc = 0;
+       int found = 0;
+       int i;
+       unsigned int scanned[8];
+       unsigned int m;
+       unsigned int mask = 128;
+
+       /*
+        * Must have privilege.
+        * No partial writes.
+        * Enough data must be present.
+        * "<addr/mask, as a:b:c:d:e:f:g:h/e><space><label>"
+        * "<addr, as a:b:c:d:e:f:g:h><space><label>"
+        */
+       if (!smack_privileged(CAP_MAC_ADMIN))
+               return -EPERM;
+       if (*ppos != 0)
+               return -EINVAL;
+       if (count < SMK_NETLBLADDRMIN)
+               return -EINVAL;
+
+       data = kzalloc(count + 1, GFP_KERNEL);
+       if (data == NULL)
+               return -ENOMEM;
+
+       if (copy_from_user(data, buf, count) != 0) {
+               rc = -EFAULT;
+               goto free_data_out;
+       }
+
+       smack = kzalloc(count + 1, GFP_KERNEL);
+       if (smack == NULL) {
+               rc = -ENOMEM;
+               goto free_data_out;
+       }
+
+       data[count] = '\0';
+
+       i = sscanf(data, "%x:%x:%x:%x:%x:%x:%x:%x/%u %s",
+                       &scanned[0], &scanned[1], &scanned[2], &scanned[3],
+                       &scanned[4], &scanned[5], &scanned[6], &scanned[7],
+                       &mask, smack);
+       if (i != 10) {
+               i = sscanf(data, "%x:%x:%x:%x:%x:%x:%x:%x %s",
+                               &scanned[0], &scanned[1], &scanned[2],
+                               &scanned[3], &scanned[4], &scanned[5],
+                               &scanned[6], &scanned[7], smack);
+               if (i != 9) {
+                       rc = -EINVAL;
+                       goto free_out;
+               }
+       }
+       if (mask > 128) {
+               rc = -EINVAL;
+               goto free_out;
+       }
+       for (i = 0; i < 8; i++) {
+               if (scanned[i] > 0xffff) {
+                       rc = -EINVAL;
+                       goto free_out;
+               }
+               newname.s6_addr16[i] = htons(scanned[i]);
+       }
+
+       /*
+        * If smack begins with '-', it is an option, don't import it
+        */
+       if (smack[0] != '-') {
+               skp = smk_import_entry(smack, 0);
+               if (skp == NULL) {
+                       rc = -EINVAL;
+                       goto free_out;
+               }
+       } else {
+               /*
+                * Only -DELETE is supported for IPv6
+                */
+               if (strcmp(smack, SMACK_DELETE_OPTION) != 0) {
+                       rc = -EINVAL;
+                       goto free_out;
+               }
+       }
+
+       for (i = 0, m = mask; i < 8; i++) {
+               if (m >= 16) {
+                       fullmask.s6_addr16[i] = 0xffff;
+                       m -= 16;
+               } else if (m > 0) {
+                       fullmask.s6_addr16[i] = (1 << m) - 1;
+                       m = 0;
+               } else
+                       fullmask.s6_addr16[i] = 0;
+               newname.s6_addr16[i] &= fullmask.s6_addr16[i];
+       }
+
+       /*
+        * Only allow one writer at a time. Writes should be
+        * quite rare and small in any case.
+        */
+       mutex_lock(&smk_net6addr_lock);
+       /*
+        * Try to find the prefix in the list
+        */
+       list_for_each_entry_rcu(snp, &smk_net6addr_list, list) {
+               if (mask != snp->smk_masks)
+                       continue;
+               for (found = 1, i = 0; i < 8; i++) {
+                       if (newname.s6_addr16[i] !=
+                           snp->smk_host.s6_addr16[i]) {
+                               found = 0;
+                               break;
+                       }
+               }
+               if (found == 1)
+                       break;
+       }
+       if (found == 0) {
+               snp = kzalloc(sizeof(*snp), GFP_KERNEL);
+               if (snp == NULL)
+                       rc = -ENOMEM;
+               else {
+                       snp->smk_host = newname;
+                       snp->smk_mask = fullmask;
+                       snp->smk_masks = mask;
+                       snp->smk_label = skp;
+                       smk_net6addr_insert(snp);
+               }
+       } else {
+               snp->smk_label = skp;
+       }
+
+       if (rc == 0)
+               rc = count;
+
+       mutex_unlock(&smk_net6addr_lock);
+
+free_out:
+       kfree(smack);
+free_data_out:
+       kfree(data);
+
+       return rc;
+}
+
+static const struct file_operations smk_net6addr_ops = {
+       .open           = smk_open_net6addr,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .write          = smk_write_net6addr,
+       .release        = seq_release,
+};
+#endif /* CONFIG_IPV6 */
+
 /**
  * smk_read_doi - read() for /smack/doi
  * @filp: file pointer, not actually used
@@ -2320,11 +2592,7 @@ static const struct file_operations smk_revoke_subj_ops = {
  */
 static int smk_init_sysfs(void)
 {
-       int err;
-       err = sysfs_create_mount_point(fs_kobj, "smackfs");
-       if (err)
-               return err;
-       return 0;
+       return sysfs_create_mount_point(fs_kobj, "smackfs");
 }
 
 /**
@@ -2519,8 +2787,8 @@ static int smk_fill_super(struct super_block *sb, void *data, int silent)
                        "direct", &smk_direct_ops, S_IRUGO|S_IWUSR},
                [SMK_AMBIENT] = {
                        "ambient", &smk_ambient_ops, S_IRUGO|S_IWUSR},
-               [SMK_NETLBLADDR] = {
-                       "netlabel", &smk_netlbladdr_ops, S_IRUGO|S_IWUSR},
+               [SMK_NET4ADDR] = {
+                       "netlabel", &smk_net4addr_ops, S_IRUGO|S_IWUSR},
                [SMK_ONLYCAP] = {
                        "onlycap", &smk_onlycap_ops, S_IRUGO|S_IWUSR},
                [SMK_LOGGING] = {
@@ -2552,6 +2820,10 @@ static int smk_fill_super(struct super_block *sb, void *data, int silent)
                [SMK_UNCONFINED] = {
                        "unconfined", &smk_unconfined_ops, S_IRUGO|S_IWUSR},
 #endif
+#if IS_ENABLED(CONFIG_IPV6)
+               [SMK_NET6ADDR] = {
+                       "ipv6host", &smk_net6addr_ops, S_IRUGO|S_IWUSR},
+#endif /* CONFIG_IPV6 */
                /* last one */
                        {""}
        };
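
smk_write_net6addr() above expands a prefix length (0..128) into a full 128-bit mask sixteen bits at a time and then stores the already-masked address. The sketch below shows the same expansion in host byte order with the partial word filled from the high bit down, as a conventional prefix mask; it is a simplified illustration, not a copy of the kernel loop, which operates on the big-endian s6_addr16 words:

    #include <stdint.h>

    /* 36 -> ffff ffff f000 0000 0000 0000 0000 0000 (host byte order) */
    static void prefix_to_mask16(unsigned int prefix, uint16_t mask[8])
    {
            int i;

            for (i = 0; i < 8; i++) {
                    if (prefix >= 16) {
                            mask[i] = 0xffff;
                            prefix -= 16;
                    } else {
                            mask[i] = (uint16_t)(0xffffu << (16 - prefix));
                            prefix = 0;
                    }
            }
    }

A /smack/ipv6host write then consists of an address, an optional /prefix, and a label, per the format comment in the function; note that the parser above expects all eight hextets spelled out rather than "::" compression.
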
index 3123e1da2fedb037a55077884d84461ac5971b1b..90c605eea8921546a4e58f55bccb1d19fbdec88f 100644 (file)
@@ -6,14 +6,7 @@ config SECURITY_YAMA
          This selects Yama, which extends DAC support with additional
          system-wide security settings beyond regular Linux discretionary
          access controls. Currently available is ptrace scope restriction.
+         Like capabilities, this security module stacks with other LSMs.
          Further information can be found in Documentation/security/Yama.txt.
 
          If you are unsure how to answer this question, answer N.
-
-config SECURITY_YAMA_STACKED
-       bool "Yama stacked with other LSMs"
-       depends on SECURITY_YAMA
-       default n
-       help
-         When Yama is built into the kernel, force it to stack with the
-         selected primary LSM.
index 5ebb8968793670d6a23ee6e6d2b30df6243a63ca..d3c19c970a06bf35789cd8444971e7421a00c8ca 100644 (file)
@@ -353,11 +353,6 @@ static struct security_hook_list yama_hooks[] = {
        LSM_HOOK_INIT(task_free, yama_task_free),
 };
 
-void __init yama_add_hooks(void)
-{
-       security_add_hooks(yama_hooks, ARRAY_SIZE(yama_hooks));
-}
-
 #ifdef CONFIG_SYSCTL
 static int yama_dointvec_minmax(struct ctl_table *table, int write,
                                void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -396,26 +391,18 @@ static struct ctl_table yama_sysctl_table[] = {
        },
        { }
 };
-#endif /* CONFIG_SYSCTL */
-
-static __init int yama_init(void)
+static void __init yama_init_sysctl(void)
 {
-#ifndef CONFIG_SECURITY_YAMA_STACKED
-       /*
-        * If yama is being stacked this is already taken care of.
-        */
-       if (!security_module_enable("yama"))
-               return 0;
-       yama_add_hooks();
-#endif
-       pr_info("Yama: becoming mindful.\n");
-
-#ifdef CONFIG_SYSCTL
        if (!register_sysctl_paths(yama_sysctl_path, yama_sysctl_table))
                panic("Yama: sysctl registration failed.\n");
-#endif
-
-       return 0;
 }
+#else
+static inline void yama_init_sysctl(void) { }
+#endif /* CONFIG_SYSCTL */
 
-security_initcall(yama_init);
+void __init yama_add_hooks(void)
+{
+       pr_info("Yama: becoming mindful.\n");
+       security_add_hooks(yama_hooks, ARRAY_SIZE(yama_hooks));
+       yama_init_sysctl();
+}
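With SECURITY_YAMA_STACKED gone, Yama always registers its hooks from yama_add_hooks(), and the sysctl setup is folded behind a stub so the caller needs no #ifdef. A small sketch of that real-function-or-empty-stub idiom, with illustrative names rather than the kernel's:

#include <stdio.h>

/* #define CONFIG_SYSCTL */     /* uncomment to compile the real body in */

#ifdef CONFIG_SYSCTL
static void init_sysctl(void)
{
        printf("registering sysctl table\n");
}
#else
static inline void init_sysctl(void) { }        /* no-op stub keeps the caller unconditional */
#endif

int main(void)
{
        init_sysctl();          /* no #ifdef needed at the call site */
        return 0;
}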
index 4e6b0907f908ce75f8aa6a1baf971b4456e85430..a75b5611d1e40121a5947e7a8f86e828307b7c9a 100644 (file)
@@ -1135,7 +1135,7 @@ static const struct hda_fixup alc880_fixups[] = {
                /* override all pins as BIOS on old Amilo is broken */
                .type = HDA_FIXUP_PINS,
                .v.pins = (const struct hda_pintbl[]) {
-                       { 0x14, 0x0121411f }, /* HP */
+                       { 0x14, 0x0121401f }, /* HP */
                        { 0x15, 0x99030120 }, /* speaker */
                        { 0x16, 0x99030130 }, /* bass speaker */
                        { 0x17, 0x411111f0 }, /* N/A */
@@ -1155,7 +1155,7 @@ static const struct hda_fixup alc880_fixups[] = {
                /* almost compatible with FUJITSU, but no bass and SPDIF */
                .type = HDA_FIXUP_PINS,
                .v.pins = (const struct hda_pintbl[]) {
-                       { 0x14, 0x0121411f }, /* HP */
+                       { 0x14, 0x0121401f }, /* HP */
                        { 0x15, 0x99030120 }, /* speaker */
                        { 0x16, 0x411111f0 }, /* N/A */
                        { 0x17, 0x411111f0 }, /* N/A */
@@ -1364,7 +1364,7 @@ static const struct snd_pci_quirk alc880_fixup_tbl[] = {
        SND_PCI_QUIRK(0x161f, 0x203d, "W810", ALC880_FIXUP_W810),
        SND_PCI_QUIRK(0x161f, 0x205d, "Medion Rim 2150", ALC880_FIXUP_MEDION_RIM),
        SND_PCI_QUIRK(0x1631, 0xe011, "PB 13201056", ALC880_FIXUP_6ST_AUTOMUTE),
-       SND_PCI_QUIRK(0x1734, 0x107c, "FSC F1734", ALC880_FIXUP_F1734),
+       SND_PCI_QUIRK(0x1734, 0x107c, "FSC Amilo M1437", ALC880_FIXUP_FUJITSU),
        SND_PCI_QUIRK(0x1734, 0x1094, "FSC Amilo M1451G", ALC880_FIXUP_FUJITSU),
        SND_PCI_QUIRK(0x1734, 0x10ac, "FSC AMILO Xi 1526", ALC880_FIXUP_F1734),
        SND_PCI_QUIRK(0x1734, 0x10b0, "FSC Amilo Pi1556", ALC880_FIXUP_FUJITSU),
@@ -5189,8 +5189,11 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = {
        SND_PCI_QUIRK(0x1028, 0x06c7, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE),
        SND_PCI_QUIRK(0x1028, 0x06d9, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE),
        SND_PCI_QUIRK(0x1028, 0x06da, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE),
-       SND_PCI_QUIRK(0x1028, 0x06de, "Dell", ALC292_FIXUP_DISABLE_AAMIX),
        SND_PCI_QUIRK(0x1028, 0x06db, "Dell", ALC292_FIXUP_DISABLE_AAMIX),
+       SND_PCI_QUIRK(0x1028, 0x06dd, "Dell", ALC292_FIXUP_DISABLE_AAMIX),
+       SND_PCI_QUIRK(0x1028, 0x06de, "Dell", ALC292_FIXUP_DISABLE_AAMIX),
+       SND_PCI_QUIRK(0x1028, 0x06df, "Dell", ALC292_FIXUP_DISABLE_AAMIX),
+       SND_PCI_QUIRK(0x1028, 0x06e0, "Dell", ALC292_FIXUP_DISABLE_AAMIX),
        SND_PCI_QUIRK(0x1028, 0x164a, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE),
        SND_PCI_QUIRK(0x1028, 0x164b, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE),
        SND_PCI_QUIRK(0x103c, 0x1586, "HP", ALC269_FIXUP_HP_MUTE_LED_MIC2),
@@ -6381,6 +6384,7 @@ static const struct snd_pci_quirk alc662_fixup_tbl[] = {
        SND_PCI_QUIRK(0x1028, 0x05db, "Dell", ALC668_FIXUP_DELL_MIC_NO_PRESENCE),
        SND_PCI_QUIRK(0x1028, 0x05fe, "Dell XPS 15", ALC668_FIXUP_DELL_XPS13),
        SND_PCI_QUIRK(0x1028, 0x060a, "Dell XPS 13", ALC668_FIXUP_DELL_XPS13),
+       SND_PCI_QUIRK(0x1028, 0x060d, "Dell M3800", ALC668_FIXUP_DELL_XPS13),
        SND_PCI_QUIRK(0x1028, 0x0625, "Dell", ALC668_FIXUP_DELL_MIC_NO_PRESENCE),
        SND_PCI_QUIRK(0x1028, 0x0626, "Dell", ALC668_FIXUP_DELL_MIC_NO_PRESENCE),
        SND_PCI_QUIRK(0x1028, 0x0696, "Dell", ALC668_FIXUP_DELL_MIC_NO_PRESENCE),
index 784ceb85b2d9fe7cbe989d0c8443863f1d03df34..35c1f6ae773f051e159c896d4565901f69258f68 100644 (file)
@@ -1064,6 +1064,7 @@ static const struct of_device_id amd7930_match[] = {
        },
        {},
 };
+MODULE_DEVICE_TABLE(of, amd7930_match);
 
 static struct platform_driver amd7930_sbus_driver = {
        .driver = {
index 310a3822d2b72b51586a9ee06954ef08667bbeb8..970086015cded9f0a75f4d66a1471b1e99d9bfd4 100644 (file)
@@ -377,7 +377,15 @@ int snd_usb_add_audio_stream(struct snd_usb_audio *chip,
 
        snd_usb_init_substream(as, stream, fp);
 
-       list_add(&as->list, &chip->pcm_list);
+       /*
+        * Keep using head insertion for M-Audio Audiophile USB (tm) which has a
+        * fix to swap capture stream order in conf/cards/USB-audio.conf
+        */
+       if (chip->usb_id == USB_ID(0x0763, 0x2003))
+               list_add(&as->list, &chip->pcm_list);
+       else
+               list_add_tail(&as->list, &chip->pcm_list);
+
        chip->pcm_devs++;
 
        snd_usb_proc_pcm_format_add(as);
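The stream.c change switches to list_add_tail() so PCM substreams keep the order in which the interface descriptors are parsed, while the M-Audio Audiophile USB (0x0763:0x2003) keeps head insertion because its alsa-lib config already compensates for the reversed order. A minimal userspace sketch (not the kernel's list.h) of why tail insertion preserves registration order:

#include <stdio.h>

struct node {
        int id;
        struct node *next;
};

/* head insertion: iteration sees the newest entry first (what list_add() gives you) */
static void add_head(struct node **head, struct node *n)
{
        n->next = *head;
        *head = n;
}

/* tail insertion: iteration preserves registration order (list_add_tail()) */
static void add_tail(struct node **head, struct node *n)
{
        struct node **p = head;

        while (*p)
                p = &(*p)->next;
        n->next = NULL;
        *p = n;
}

int main(void)
{
        struct node a = { 1 }, b = { 2 }, c = { 3 };
        struct node *streams = NULL;

        add_tail(&streams, &a);
        add_tail(&streams, &b);
        add_tail(&streams, &c);
        for (struct node *n = streams; n; n = n->next)
                printf("stream %d\n", n->id);   /* prints 1 2 3: descriptor order kept */
        (void)add_head;                          /* head insertion would print 3 2 1 */
        return 0;
}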
index eb51325e8ad99adefa7671a7b511626b7c136d1d..284a76e046284983cd769da89cb45604b114a41a 100644 (file)
@@ -768,8 +768,8 @@ static int process_exit_event(struct perf_tool *tool,
        if (!evsel->attr.sample_id_all) {
                sample->cpu = 0;
                sample->time = 0;
-               sample->tid = event->comm.tid;
-               sample->pid = event->comm.pid;
+               sample->tid = event->fork.tid;
+               sample->pid = event->fork.pid;
        }
        print_sample_start(sample, thread, evsel);
        perf_event__fprintf(event, stdout);
index 1aa21c90731b3eca400d91c2fec5d2b55a127f03..5b83f56a3b6f25928337d3954d2924322d3154ef 100644 (file)
@@ -34,6 +34,8 @@ static int __test__sw_clock_freq(enum perf_sw_ids clock_id)
                .disabled = 1,
                .freq = 1,
        };
+       struct cpu_map *cpus;
+       struct thread_map *threads;
 
        attr.sample_freq = 500;
 
@@ -50,14 +52,19 @@ static int __test__sw_clock_freq(enum perf_sw_ids clock_id)
        }
        perf_evlist__add(evlist, evsel);
 
-       evlist->cpus = cpu_map__dummy_new();
-       evlist->threads = thread_map__new_by_tid(getpid());
-       if (!evlist->cpus || !evlist->threads) {
+       cpus = cpu_map__dummy_new();
+       threads = thread_map__new_by_tid(getpid());
+       if (!cpus || !threads) {
                err = -ENOMEM;
                pr_debug("Not enough memory to create thread/cpu maps\n");
-               goto out_delete_evlist;
+               goto out_free_maps;
        }
 
+       perf_evlist__set_maps(evlist, cpus, threads);
+
+       cpus    = NULL;
+       threads = NULL;
+
        if (perf_evlist__open(evlist)) {
                const char *knob = "/proc/sys/kernel/perf_event_max_sample_rate";
 
@@ -107,6 +114,9 @@ next_event:
                err = -1;
        }
 
+out_free_maps:
+       cpu_map__put(cpus);
+       thread_map__put(threads);
 out_delete_evlist:
        perf_evlist__delete(evlist);
        return err;
index 3a8fedef83bc086e5328dea03dcf6107d3ff1e39..add16385f13e5bbb750cec83da3f1fe0d28889af 100644 (file)
@@ -43,6 +43,8 @@ int test__task_exit(void)
        };
        const char *argv[] = { "true", NULL };
        char sbuf[STRERR_BUFSIZE];
+       struct cpu_map *cpus;
+       struct thread_map *threads;
 
        signal(SIGCHLD, sig_handler);
 
@@ -58,14 +60,19 @@ int test__task_exit(void)
         * perf_evlist__prepare_workload we'll fill in the only thread
         * we're monitoring, the one forked there.
         */
-       evlist->cpus = cpu_map__dummy_new();
-       evlist->threads = thread_map__new_by_tid(-1);
-       if (!evlist->cpus || !evlist->threads) {
+       cpus = cpu_map__dummy_new();
+       threads = thread_map__new_by_tid(-1);
+       if (!cpus || !threads) {
                err = -ENOMEM;
                pr_debug("Not enough memory to create thread/cpu maps\n");
-               goto out_delete_evlist;
+               goto out_free_maps;
        }
 
+       perf_evlist__set_maps(evlist, cpus, threads);
+
+       cpus    = NULL;
+       threads = NULL;
+
        err = perf_evlist__prepare_workload(evlist, &target, argv, false,
                                            workload_exec_failed_signal);
        if (err < 0) {
@@ -114,6 +121,9 @@ retry:
                err = -1;
        }
 
+out_free_maps:
+       cpu_map__put(cpus);
+       thread_map__put(threads);
 out_delete_evlist:
        perf_evlist__delete(evlist);
        return err;
index cf86f2d3a5e725cb625505283af7ce80ccc13ff6..c04c60d4863ce71a08014c396aa7342c8af2fa16 100644 (file)
@@ -1968,7 +1968,8 @@ skip_annotation:
                                          &options[nr_options], dso);
                nr_options += add_map_opt(browser, &actions[nr_options],
                                          &options[nr_options],
-                                         browser->selection->map);
+                                         browser->selection ?
+                                               browser->selection->map : NULL);
 
                /* perf script support */
                if (browser->he_selection) {
@@ -1976,6 +1977,15 @@ skip_annotation:
                                                     &actions[nr_options],
                                                     &options[nr_options],
                                                     thread, NULL);
+                       /*
+                        * Note that browser->selection != NULL
+                        * when browser->he_selection is not NULL,
+                        * so we don't need to check browser->selection
+                        * before fetching browser->selection->sym like what
+                        * we do before fetching browser->selection->map.
+                        *
+                        * See hist_browser__show_entry.
+                        */
                        nr_options += add_script_opt(browser,
                                                     &actions[nr_options],
                                                     &options[nr_options],
index d51a5200c8af77b76e98c85d55f8acee8db304f0..c8fc8a258f4265c42c636d045b195a612cfae3a4 100644 (file)
@@ -124,6 +124,33 @@ void perf_evlist__delete(struct perf_evlist *evlist)
        free(evlist);
 }
 
+static void __perf_evlist__propagate_maps(struct perf_evlist *evlist,
+                                         struct perf_evsel *evsel)
+{
+       /*
+        * We already have cpus for evsel (via PMU sysfs) so
+        * keep it, if there's no target cpu list defined.
+        */
+       if (!evsel->own_cpus || evlist->has_user_cpus) {
+               cpu_map__put(evsel->cpus);
+               evsel->cpus = cpu_map__get(evlist->cpus);
+       } else if (evsel->cpus != evsel->own_cpus) {
+               cpu_map__put(evsel->cpus);
+               evsel->cpus = cpu_map__get(evsel->own_cpus);
+       }
+
+       thread_map__put(evsel->threads);
+       evsel->threads = thread_map__get(evlist->threads);
+}
+
+static void perf_evlist__propagate_maps(struct perf_evlist *evlist)
+{
+       struct perf_evsel *evsel;
+
+       evlist__for_each(evlist, evsel)
+               __perf_evlist__propagate_maps(evlist, evsel);
+}
+
 void perf_evlist__add(struct perf_evlist *evlist, struct perf_evsel *entry)
 {
        entry->evlist = evlist;
@@ -133,18 +160,19 @@ void perf_evlist__add(struct perf_evlist *evlist, struct perf_evsel *entry)
 
        if (!evlist->nr_entries++)
                perf_evlist__set_id_pos(evlist);
+
+       __perf_evlist__propagate_maps(evlist, entry);
 }
 
 void perf_evlist__splice_list_tail(struct perf_evlist *evlist,
-                                  struct list_head *list,
-                                  int nr_entries)
+                                  struct list_head *list)
 {
-       bool set_id_pos = !evlist->nr_entries;
+       struct perf_evsel *evsel, *temp;
 
-       list_splice_tail(list, &evlist->entries);
-       evlist->nr_entries += nr_entries;
-       if (set_id_pos)
-               perf_evlist__set_id_pos(evlist);
+       __evlist__for_each_safe(list, temp, evsel) {
+               list_del_init(&evsel->node);
+               perf_evlist__add(evlist, evsel);
+       }
 }
 
 void __perf_evlist__set_leader(struct list_head *list)
@@ -210,7 +238,7 @@ static int perf_evlist__add_attrs(struct perf_evlist *evlist,
                list_add_tail(&evsel->node, &head);
        }
 
-       perf_evlist__splice_list_tail(evlist, &head, nr_attrs);
+       perf_evlist__splice_list_tail(evlist, &head);
 
        return 0;
 
@@ -1103,71 +1131,56 @@ int perf_evlist__mmap(struct perf_evlist *evlist, unsigned int pages,
        return perf_evlist__mmap_ex(evlist, pages, overwrite, 0, false);
 }
 
-static int perf_evlist__propagate_maps(struct perf_evlist *evlist,
-                                      bool has_user_cpus)
-{
-       struct perf_evsel *evsel;
-
-       evlist__for_each(evlist, evsel) {
-               /*
-                * We already have cpus for evsel (via PMU sysfs) so
-                * keep it, if there's no target cpu list defined.
-                */
-               if (evsel->cpus && has_user_cpus)
-                       cpu_map__put(evsel->cpus);
-
-               if (!evsel->cpus || has_user_cpus)
-                       evsel->cpus = cpu_map__get(evlist->cpus);
-
-               evsel->threads = thread_map__get(evlist->threads);
-
-               if ((evlist->cpus && !evsel->cpus) ||
-                   (evlist->threads && !evsel->threads))
-                       return -ENOMEM;
-       }
-
-       return 0;
-}
-
 int perf_evlist__create_maps(struct perf_evlist *evlist, struct target *target)
 {
-       evlist->threads = thread_map__new_str(target->pid, target->tid,
-                                             target->uid);
+       struct cpu_map *cpus;
+       struct thread_map *threads;
+
+       threads = thread_map__new_str(target->pid, target->tid, target->uid);
 
-       if (evlist->threads == NULL)
+       if (!threads)
                return -1;
 
        if (target__uses_dummy_map(target))
-               evlist->cpus = cpu_map__dummy_new();
+               cpus = cpu_map__dummy_new();
        else
-               evlist->cpus = cpu_map__new(target->cpu_list);
+               cpus = cpu_map__new(target->cpu_list);
 
-       if (evlist->cpus == NULL)
+       if (!cpus)
                goto out_delete_threads;
 
-       return perf_evlist__propagate_maps(evlist, !!target->cpu_list);
+       evlist->has_user_cpus = !!target->cpu_list;
+
+       perf_evlist__set_maps(evlist, cpus, threads);
+
+       return 0;
 
 out_delete_threads:
-       thread_map__put(evlist->threads);
-       evlist->threads = NULL;
+       thread_map__put(threads);
        return -1;
 }
 
-int perf_evlist__set_maps(struct perf_evlist *evlist,
-                         struct cpu_map *cpus,
-                         struct thread_map *threads)
+void perf_evlist__set_maps(struct perf_evlist *evlist, struct cpu_map *cpus,
+                          struct thread_map *threads)
 {
-       if (evlist->cpus)
+       /*
+        * Allow for the possibility that one or another of the maps isn't being
+        * changed i.e. don't put it.  Note we are assuming the maps that are
+        * being applied are brand new and evlist is taking ownership of the
+        * original reference count of 1.  If that is not the case it is up to
+        * the caller to increase the reference count.
+        */
+       if (cpus != evlist->cpus) {
                cpu_map__put(evlist->cpus);
+               evlist->cpus = cpus;
+       }
 
-       evlist->cpus = cpus;
-
-       if (evlist->threads)
+       if (threads != evlist->threads) {
                thread_map__put(evlist->threads);
+               evlist->threads = threads;
+       }
 
-       evlist->threads = threads;
-
-       return perf_evlist__propagate_maps(evlist, false);
+       perf_evlist__propagate_maps(evlist);
 }
 
 int perf_evlist__apply_filters(struct perf_evlist *evlist, struct perf_evsel **err_evsel)
@@ -1387,6 +1400,8 @@ void perf_evlist__close(struct perf_evlist *evlist)
 
 static int perf_evlist__create_syswide_maps(struct perf_evlist *evlist)
 {
+       struct cpu_map    *cpus;
+       struct thread_map *threads;
        int err = -ENOMEM;
 
        /*
@@ -1398,20 +1413,19 @@ static int perf_evlist__create_syswide_maps(struct perf_evlist *evlist)
         * error, and we may not want to do that fallback to a
         * default cpu identity map :-\
         */
-       evlist->cpus = cpu_map__new(NULL);
-       if (evlist->cpus == NULL)
+       cpus = cpu_map__new(NULL);
+       if (!cpus)
                goto out;
 
-       evlist->threads = thread_map__new_dummy();
-       if (evlist->threads == NULL)
-               goto out_free_cpus;
+       threads = thread_map__new_dummy();
+       if (!threads)
+               goto out_put;
 
-       err = 0;
+       perf_evlist__set_maps(evlist, cpus, threads);
 out:
        return err;
-out_free_cpus:
-       cpu_map__put(evlist->cpus);
-       evlist->cpus = NULL;
+out_put:
+       cpu_map__put(cpus);
        goto out;
 }
 
index b39a6198f4ac00ae4c30cdfe659667cc84db7780..115d8b53c6010a5a1d466b04535245d9fd8f8579 100644 (file)
@@ -42,6 +42,7 @@ struct perf_evlist {
        int              nr_mmaps;
        bool             overwrite;
        bool             enabled;
+       bool             has_user_cpus;
        size_t           mmap_len;
        int              id_pos;
        int              is_pos;
@@ -155,9 +156,8 @@ int perf_evlist__enable_event_idx(struct perf_evlist *evlist,
 void perf_evlist__set_selected(struct perf_evlist *evlist,
                               struct perf_evsel *evsel);
 
-int perf_evlist__set_maps(struct perf_evlist *evlist,
-                         struct cpu_map *cpus,
-                         struct thread_map *threads);
+void perf_evlist__set_maps(struct perf_evlist *evlist, struct cpu_map *cpus,
+                          struct thread_map *threads);
 int perf_evlist__create_maps(struct perf_evlist *evlist, struct target *target);
 int perf_evlist__apply_filters(struct perf_evlist *evlist, struct perf_evsel **err_evsel);
 
@@ -179,8 +179,7 @@ bool perf_evlist__valid_sample_id_all(struct perf_evlist *evlist);
 bool perf_evlist__valid_read_format(struct perf_evlist *evlist);
 
 void perf_evlist__splice_list_tail(struct perf_evlist *evlist,
-                                  struct list_head *list,
-                                  int nr_entries);
+                                  struct list_head *list);
 
 static inline struct perf_evsel *perf_evlist__first(struct perf_evlist *evlist)
 {
index c53f79123b37f4ab506f3a914263d59c3122cfb6..5410483d52198c5909ec5a502c61567c5119c8ef 100644 (file)
@@ -1033,6 +1033,7 @@ void perf_evsel__exit(struct perf_evsel *evsel)
        perf_evsel__free_config_terms(evsel);
        close_cgroup(evsel->cgrp);
        cpu_map__put(evsel->cpus);
+       cpu_map__put(evsel->own_cpus);
        thread_map__put(evsel->threads);
        zfree(&evsel->group_name);
        zfree(&evsel->name);
index 298e6bbca200bd4740bddc4e6bb1fce7414d5bf4..ef8925f7211a4a311c927e6919d83817745eb1cf 100644 (file)
@@ -98,6 +98,7 @@ struct perf_evsel {
        struct cgroup_sel       *cgrp;
        void                    *handler;
        struct cpu_map          *cpus;
+       struct cpu_map          *own_cpus;
        struct thread_map       *threads;
        unsigned int            sample_size;
        int                     id_pos;
index 41814547da159a14ac00eca8c774f3a46ae20558..fce6634aebe25d19d126120ec1e6e4dd323be46a 100644 (file)
@@ -1438,7 +1438,7 @@ static int process_nrcpus(struct perf_file_section *section __maybe_unused,
        if (ph->needs_swap)
                nr = bswap_32(nr);
 
-       ph->env.nr_cpus_online = nr;
+       ph->env.nr_cpus_avail = nr;
 
        ret = readn(fd, &nr, sizeof(nr));
        if (ret != sizeof(nr))
@@ -1447,7 +1447,7 @@ static int process_nrcpus(struct perf_file_section *section __maybe_unused,
        if (ph->needs_swap)
                nr = bswap_32(nr);
 
-       ph->env.nr_cpus_avail = nr;
+       ph->env.nr_cpus_online = nr;
        return 0;
 }
 
index ea768625ab5b39eaf8ce59059e10349eb7ecd34a..eb0e7f8bf5158c98c95216d4fb9d3eb96568c50b 100644 (file)
@@ -623,7 +623,7 @@ static int intel_bts_process_event(struct perf_session *session,
        if (err)
                return err;
        if (event->header.type == PERF_RECORD_EXIT) {
-               err = intel_bts_process_tid_exit(bts, event->comm.tid);
+               err = intel_bts_process_tid_exit(bts, event->fork.tid);
                if (err)
                        return err;
        }
index bb41c20e6005e1a2d986656fc94eead62de7a4ee..535d86f8e4d17b802a4c0473a7bf7ff937a10b15 100644 (file)
@@ -1494,7 +1494,7 @@ static int intel_pt_process_event(struct perf_session *session,
        if (pt->timeless_decoding) {
                if (event->header.type == PERF_RECORD_EXIT) {
                        err = intel_pt_process_timeless_queues(pt,
-                                                              event->comm.tid,
+                                                              event->fork.tid,
                                                               sample->time);
                }
        } else if (timestamp) {
index d826e6f515db12a3f75517bf93437eb9d0043829..21ed6ee63da9747d1215ac638f936357c8df2e8c 100644 (file)
@@ -287,8 +287,8 @@ __add_event(struct list_head *list, int *idx,
        if (!evsel)
                return NULL;
 
-       if (cpus)
-               evsel->cpus = cpu_map__get(cpus);
+       evsel->cpus     = cpu_map__get(cpus);
+       evsel->own_cpus = cpu_map__get(cpus);
 
        if (name)
                evsel->name = strdup(name);
@@ -1140,10 +1140,9 @@ int parse_events(struct perf_evlist *evlist, const char *str,
        ret = parse_events__scanner(str, &data, PE_START_EVENTS);
        perf_pmu__parse_cleanup();
        if (!ret) {
-               int entries = data.idx - evlist->nr_entries;
                struct perf_evsel *last;
 
-               perf_evlist__splice_list_tail(evlist, &data.list, entries);
+               perf_evlist__splice_list_tail(evlist, &data.list);
                evlist->nr_groups += data.nr_groups;
                last = perf_evlist__last(evlist);
                last->cmdline_group_boundary = true;
index 591905a02b926b6029447a372f2e5f7b7d34864a..9cd70819c7950e2de1aaae29a17b0f951b593bda 100644 (file)
@@ -255,7 +255,7 @@ PE_PMU_EVENT_PRE '-' PE_PMU_EVENT_SUF sep_dc
        list_add_tail(&term->list, head);
 
        ALLOC_LIST(list);
-       ABORT_ON(parse_events_add_pmu(list, &data->idx, "cpu", head));
+       ABORT_ON(parse_events_add_pmu(data, list, "cpu", head));
        parse_events__free_terms(head);
        $$ = list;
 }
index f56914c7929b80169efc6f3079cb1b1fae8cd269..38b00ecb2ed55a6313f54e6cadd08f67a564fe26 100644 (file)
@@ -1,9 +1,12 @@
-ldflags-y += --wrap=ioremap_wt
 ldflags-y += --wrap=ioremap_wc
+ldflags-y += --wrap=memremap
 ldflags-y += --wrap=devm_ioremap_nocache
-ldflags-y += --wrap=ioremap_cache
+ldflags-y += --wrap=devm_memremap
+ldflags-y += --wrap=devm_memunmap
 ldflags-y += --wrap=ioremap_nocache
 ldflags-y += --wrap=iounmap
+ldflags-y += --wrap=memunmap
+ldflags-y += --wrap=__devm_request_region
 ldflags-y += --wrap=__request_region
 ldflags-y += --wrap=__release_region
 
@@ -15,6 +18,7 @@ obj-$(CONFIG_LIBNVDIMM) += libnvdimm.o
 obj-$(CONFIG_BLK_DEV_PMEM) += nd_pmem.o
 obj-$(CONFIG_ND_BTT) += nd_btt.o
 obj-$(CONFIG_ND_BLK) += nd_blk.o
+obj-$(CONFIG_X86_PMEM_LEGACY) += nd_e820.o
 obj-$(CONFIG_ACPI_NFIT) += nfit.o
 
 nfit-y := $(ACPI_SRC)/nfit.o
@@ -29,6 +33,9 @@ nd_btt-y += config_check.o
 nd_blk-y := $(NVDIMM_SRC)/blk.o
 nd_blk-y += config_check.o
 
+nd_e820-y := $(NVDIMM_SRC)/e820.o
+nd_e820-y += config_check.o
+
 libnvdimm-y := $(NVDIMM_SRC)/core.o
 libnvdimm-y += $(NVDIMM_SRC)/bus.o
 libnvdimm-y += $(NVDIMM_SRC)/dimm_devs.o
@@ -37,7 +44,9 @@ libnvdimm-y += $(NVDIMM_SRC)/region_devs.o
 libnvdimm-y += $(NVDIMM_SRC)/region.o
 libnvdimm-y += $(NVDIMM_SRC)/namespace_devs.o
 libnvdimm-y += $(NVDIMM_SRC)/label.o
+libnvdimm-$(CONFIG_ND_CLAIM) += $(NVDIMM_SRC)/claim.o
 libnvdimm-$(CONFIG_BTT) += $(NVDIMM_SRC)/btt_devs.o
+libnvdimm-$(CONFIG_NVDIMM_PFN) += $(NVDIMM_SRC)/pfn_devs.o
 libnvdimm-y += config_check.o
 
 obj-m += test/
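The ldflags-y += --wrap=<symbol> lines rely on the GNU linker's symbol wrapping: every reference to <symbol> in the linked objects is redirected to __wrap_<symbol>, and the wrapper can still reach the original through __real_<symbol>. A tiny stand-alone illustration, assuming a build like gcc demo.c -Wl,--wrap=malloc (malloc is just a convenient example symbol, not one the nvdimm tests wrap):

#include <stdio.h>
#include <stdlib.h>

void *__real_malloc(size_t size);       /* resolved by the linker when --wrap=malloc is given */

void *__wrap_malloc(size_t size)
{
        printf("intercepted malloc(%zu)\n", size);
        return __real_malloc(size);      /* fall through to the real allocator */
}

int main(void)
{
        free(malloc(32));                /* this call is routed to __wrap_malloc() */
        return 0;
}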
index 64bfaa50831ccd99f999ebb08fbb35eb0248820c..b7251314bbc038d598457b119e26e5cd3c2e3378 100644 (file)
@@ -80,23 +80,52 @@ void __iomem *__wrap_devm_ioremap_nocache(struct device *dev,
 }
 EXPORT_SYMBOL(__wrap_devm_ioremap_nocache);
 
-void __iomem *__wrap_ioremap_cache(resource_size_t offset, unsigned long size)
+void *__wrap_devm_memremap(struct device *dev, resource_size_t offset,
+               size_t size, unsigned long flags)
 {
-       return __nfit_test_ioremap(offset, size, ioremap_cache);
+       struct nfit_test_resource *nfit_res;
+
+       rcu_read_lock();
+       nfit_res = get_nfit_res(offset);
+       rcu_read_unlock();
+       if (nfit_res)
+               return nfit_res->buf + offset - nfit_res->res->start;
+       return devm_memremap(dev, offset, size, flags);
 }
-EXPORT_SYMBOL(__wrap_ioremap_cache);
+EXPORT_SYMBOL(__wrap_devm_memremap);
 
-void __iomem *__wrap_ioremap_nocache(resource_size_t offset, unsigned long size)
+void *__wrap_memremap(resource_size_t offset, size_t size,
+               unsigned long flags)
 {
-       return __nfit_test_ioremap(offset, size, ioremap_nocache);
+       struct nfit_test_resource *nfit_res;
+
+       rcu_read_lock();
+       nfit_res = get_nfit_res(offset);
+       rcu_read_unlock();
+       if (nfit_res)
+               return nfit_res->buf + offset - nfit_res->res->start;
+       return memremap(offset, size, flags);
 }
-EXPORT_SYMBOL(__wrap_ioremap_nocache);
+EXPORT_SYMBOL(__wrap_memremap);
+
+void __wrap_devm_memunmap(struct device *dev, void *addr)
+{
+       struct nfit_test_resource *nfit_res;
+
+       rcu_read_lock();
+       nfit_res = get_nfit_res((unsigned long) addr);
+       rcu_read_unlock();
+       if (nfit_res)
+               return;
+       return devm_memunmap(dev, addr);
+}
+EXPORT_SYMBOL(__wrap_devm_memunmap);
 
-void __iomem *__wrap_ioremap_wt(resource_size_t offset, unsigned long size)
+void __iomem *__wrap_ioremap_nocache(resource_size_t offset, unsigned long size)
 {
-       return __nfit_test_ioremap(offset, size, ioremap_wt);
+       return __nfit_test_ioremap(offset, size, ioremap_nocache);
 }
-EXPORT_SYMBOL(__wrap_ioremap_wt);
+EXPORT_SYMBOL(__wrap_ioremap_nocache);
 
 void __iomem *__wrap_ioremap_wc(resource_size_t offset, unsigned long size)
 {
@@ -117,9 +146,22 @@ void __wrap_iounmap(volatile void __iomem *addr)
 }
 EXPORT_SYMBOL(__wrap_iounmap);
 
-struct resource *__wrap___request_region(struct resource *parent,
-               resource_size_t start, resource_size_t n, const char *name,
-               int flags)
+void __wrap_memunmap(void *addr)
+{
+       struct nfit_test_resource *nfit_res;
+
+       rcu_read_lock();
+       nfit_res = get_nfit_res((unsigned long) addr);
+       rcu_read_unlock();
+       if (nfit_res)
+               return;
+       return memunmap(addr);
+}
+EXPORT_SYMBOL(__wrap_memunmap);
+
+static struct resource *nfit_test_request_region(struct device *dev,
+               struct resource *parent, resource_size_t start,
+               resource_size_t n, const char *name, int flags)
 {
        struct nfit_test_resource *nfit_res;
 
@@ -147,10 +189,29 @@ struct resource *__wrap___request_region(struct resource *parent,
                        return res;
                }
        }
+       if (dev)
+               return __devm_request_region(dev, parent, start, n, name);
        return __request_region(parent, start, n, name, flags);
 }
+
+struct resource *__wrap___request_region(struct resource *parent,
+               resource_size_t start, resource_size_t n, const char *name,
+               int flags)
+{
+       return nfit_test_request_region(NULL, parent, start, n, name, flags);
+}
 EXPORT_SYMBOL(__wrap___request_region);
 
+struct resource *__wrap___devm_request_region(struct device *dev,
+               struct resource *parent, resource_size_t start,
+               resource_size_t n, const char *name)
+{
+       if (!dev)
+               return NULL;
+       return nfit_test_request_region(dev, parent, start, n, name, 0);
+}
+EXPORT_SYMBOL(__wrap___devm_request_region);
+
 void __wrap___release_region(struct resource *parent, resource_size_t start,
                                resource_size_t n)
 {
index d0bdae40ccc9033aae0e9695943d4ce5367bde33..021e6f97f33e7af2a7e570ba46cd29b72523b130 100644 (file)
@@ -147,75 +147,153 @@ static struct nfit_test *to_nfit_test(struct device *dev)
        return container_of(pdev, struct nfit_test, pdev);
 }
 
+static int nfit_test_cmd_get_config_size(struct nd_cmd_get_config_size *nd_cmd,
+               unsigned int buf_len)
+{
+       if (buf_len < sizeof(*nd_cmd))
+               return -EINVAL;
+
+       nd_cmd->status = 0;
+       nd_cmd->config_size = LABEL_SIZE;
+       nd_cmd->max_xfer = SZ_4K;
+
+       return 0;
+}
+
+static int nfit_test_cmd_get_config_data(struct nd_cmd_get_config_data_hdr
+               *nd_cmd, unsigned int buf_len, void *label)
+{
+       unsigned int len, offset = nd_cmd->in_offset;
+       int rc;
+
+       if (buf_len < sizeof(*nd_cmd))
+               return -EINVAL;
+       if (offset >= LABEL_SIZE)
+               return -EINVAL;
+       if (nd_cmd->in_length + sizeof(*nd_cmd) > buf_len)
+               return -EINVAL;
+
+       nd_cmd->status = 0;
+       len = min(nd_cmd->in_length, LABEL_SIZE - offset);
+       memcpy(nd_cmd->out_buf, label + offset, len);
+       rc = buf_len - sizeof(*nd_cmd) - len;
+
+       return rc;
+}
+
+static int nfit_test_cmd_set_config_data(struct nd_cmd_set_config_hdr *nd_cmd,
+               unsigned int buf_len, void *label)
+{
+       unsigned int len, offset = nd_cmd->in_offset;
+       u32 *status;
+       int rc;
+
+       if (buf_len < sizeof(*nd_cmd))
+               return -EINVAL;
+       if (offset >= LABEL_SIZE)
+               return -EINVAL;
+       if (nd_cmd->in_length + sizeof(*nd_cmd) + 4 > buf_len)
+               return -EINVAL;
+
+       status = (void *)nd_cmd + nd_cmd->in_length + sizeof(*nd_cmd);
+       *status = 0;
+       len = min(nd_cmd->in_length, LABEL_SIZE - offset);
+       memcpy(label + offset, nd_cmd->in_buf, len);
+       rc = buf_len - sizeof(*nd_cmd) - (len + 4);
+
+       return rc;
+}
+
+static int nfit_test_cmd_ars_cap(struct nd_cmd_ars_cap *nd_cmd,
+               unsigned int buf_len)
+{
+       if (buf_len < sizeof(*nd_cmd))
+               return -EINVAL;
+
+       nd_cmd->max_ars_out = 256;
+       nd_cmd->status = (ND_ARS_PERSISTENT | ND_ARS_VOLATILE) << 16;
+
+       return 0;
+}
+
+static int nfit_test_cmd_ars_start(struct nd_cmd_ars_start *nd_cmd,
+               unsigned int buf_len)
+{
+       if (buf_len < sizeof(*nd_cmd))
+               return -EINVAL;
+
+       nd_cmd->status = 0;
+
+       return 0;
+}
+
+static int nfit_test_cmd_ars_status(struct nd_cmd_ars_status *nd_cmd,
+               unsigned int buf_len)
+{
+       if (buf_len < sizeof(*nd_cmd))
+               return -EINVAL;
+
+       nd_cmd->out_length = 256;
+       nd_cmd->num_records = 0;
+       nd_cmd->status = 0;
+
+       return 0;
+}
+
 static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc,
                struct nvdimm *nvdimm, unsigned int cmd, void *buf,
                unsigned int buf_len)
 {
        struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc);
        struct nfit_test *t = container_of(acpi_desc, typeof(*t), acpi_desc);
-       struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
-       int i, rc;
+       int i, rc = 0;
 
-       if (!nfit_mem || !test_bit(cmd, &nfit_mem->dsm_mask))
-               return -ENOTTY;
+       if (nvdimm) {
+               struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
 
-       /* lookup label space for the given dimm */
-       for (i = 0; i < ARRAY_SIZE(handle); i++)
-               if (__to_nfit_memdev(nfit_mem)->device_handle == handle[i])
+               if (!nfit_mem || !test_bit(cmd, &nfit_mem->dsm_mask))
+                       return -ENOTTY;
+
+               /* lookup label space for the given dimm */
+               for (i = 0; i < ARRAY_SIZE(handle); i++)
+                       if (__to_nfit_memdev(nfit_mem)->device_handle ==
+                                       handle[i])
+                               break;
+               if (i >= ARRAY_SIZE(handle))
+                       return -ENXIO;
+
+               switch (cmd) {
+               case ND_CMD_GET_CONFIG_SIZE:
+                       rc = nfit_test_cmd_get_config_size(buf, buf_len);
                        break;
-       if (i >= ARRAY_SIZE(handle))
-               return -ENXIO;
+               case ND_CMD_GET_CONFIG_DATA:
+                       rc = nfit_test_cmd_get_config_data(buf, buf_len,
+                               t->label[i]);
+                       break;
+               case ND_CMD_SET_CONFIG_DATA:
+                       rc = nfit_test_cmd_set_config_data(buf, buf_len,
+                               t->label[i]);
+                       break;
+               default:
+                       return -ENOTTY;
+               }
+       } else {
+               if (!nd_desc || !test_bit(cmd, &nd_desc->dsm_mask))
+                       return -ENOTTY;
 
-       switch (cmd) {
-       case ND_CMD_GET_CONFIG_SIZE: {
-               struct nd_cmd_get_config_size *nd_cmd = buf;
-
-               if (buf_len < sizeof(*nd_cmd))
-                       return -EINVAL;
-               nd_cmd->status = 0;
-               nd_cmd->config_size = LABEL_SIZE;
-               nd_cmd->max_xfer = SZ_4K;
-               rc = 0;
-               break;
-       }
-       case ND_CMD_GET_CONFIG_DATA: {
-               struct nd_cmd_get_config_data_hdr *nd_cmd = buf;
-               unsigned int len, offset = nd_cmd->in_offset;
-
-               if (buf_len < sizeof(*nd_cmd))
-                       return -EINVAL;
-               if (offset >= LABEL_SIZE)
-                       return -EINVAL;
-               if (nd_cmd->in_length + sizeof(*nd_cmd) > buf_len)
-                       return -EINVAL;
-
-               nd_cmd->status = 0;
-               len = min(nd_cmd->in_length, LABEL_SIZE - offset);
-               memcpy(nd_cmd->out_buf, t->label[i] + offset, len);
-               rc = buf_len - sizeof(*nd_cmd) - len;
-               break;
-       }
-       case ND_CMD_SET_CONFIG_DATA: {
-               struct nd_cmd_set_config_hdr *nd_cmd = buf;
-               unsigned int len, offset = nd_cmd->in_offset;
-               u32 *status;
-
-               if (buf_len < sizeof(*nd_cmd))
-                       return -EINVAL;
-               if (offset >= LABEL_SIZE)
-                       return -EINVAL;
-               if (nd_cmd->in_length + sizeof(*nd_cmd) + 4 > buf_len)
-                       return -EINVAL;
-
-               status = buf + nd_cmd->in_length + sizeof(*nd_cmd);
-               *status = 0;
-               len = min(nd_cmd->in_length, LABEL_SIZE - offset);
-               memcpy(t->label[i] + offset, nd_cmd->in_buf, len);
-               rc = buf_len - sizeof(*nd_cmd) - (len + 4);
-               break;
-       }
-       default:
-               return -ENOTTY;
+               switch (cmd) {
+               case ND_CMD_ARS_CAP:
+                       rc = nfit_test_cmd_ars_cap(buf, buf_len);
+                       break;
+               case ND_CMD_ARS_START:
+                       rc = nfit_test_cmd_ars_start(buf, buf_len);
+                       break;
+               case ND_CMD_ARS_STATUS:
+                       rc = nfit_test_cmd_ars_status(buf, buf_len);
+                       break;
+               default:
+                       return -ENOTTY;
+               }
        }
 
        return rc;
@@ -876,6 +954,9 @@ static void nfit_test0_setup(struct nfit_test *t)
        set_bit(ND_CMD_GET_CONFIG_SIZE, &acpi_desc->dimm_dsm_force_en);
        set_bit(ND_CMD_GET_CONFIG_DATA, &acpi_desc->dimm_dsm_force_en);
        set_bit(ND_CMD_SET_CONFIG_DATA, &acpi_desc->dimm_dsm_force_en);
+       set_bit(ND_CMD_ARS_CAP, &acpi_desc->bus_dsm_force_en);
+       set_bit(ND_CMD_ARS_START, &acpi_desc->bus_dsm_force_en);
+       set_bit(ND_CMD_ARS_STATUS, &acpi_desc->bus_dsm_force_en);
        nd_desc = &acpi_desc->nd_desc;
        nd_desc->ndctl = nfit_test_ctl;
 }
@@ -948,9 +1029,13 @@ static int nfit_test_blk_do_io(struct nd_blk_region *ndbr, resource_size_t dpa,
 
        lane = nd_region_acquire_lane(nd_region);
        if (rw)
-               memcpy(mmio->base + dpa, iobuf, len);
-       else
-               memcpy(iobuf, mmio->base + dpa, len);
+               memcpy(mmio->addr.base + dpa, iobuf, len);
+       else {
+               memcpy(iobuf, mmio->addr.base + dpa, len);
+
+               /* give us some coverage of the mmio_flush_range() API */
+               mmio_flush_range(mmio->addr.base + dpa, len);
+       }
        nd_region_release_lane(nd_region, lane);
 
        return 0;
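The nfit_test_ctl() rework above splits the old monolithic switch into one helper per DSM command, each of which validates buf_len against its header before touching the payload, and adds bus-level ARS handlers for the nvdimm == NULL case. A minimal sketch of that validate-then-fill helper shape; the struct and values are hypothetical, not the libnvdimm ABI:

#include <stdio.h>
#include <errno.h>

/* hypothetical command payload, not an NFIT/libnvdimm structure */
struct cmd_get_size {
        unsigned int status;
        unsigned int size;
};

static int cmd_get_size(void *buf, unsigned int buf_len)
{
        struct cmd_get_size *cmd = buf;

        if (buf_len < sizeof(*cmd))      /* validate before touching the payload */
                return -EINVAL;
        cmd->status = 0;
        cmd->size = 128 * 1024;          /* illustrative value */
        return 0;
}

int main(void)
{
        struct cmd_get_size out;
        int rc = cmd_get_size(&out, sizeof(out));

        printf("rc=%d size=%u\n", rc, out.size);
        return 0;
}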
index b8f12e0897e6b1debdd3165e4300ccdbf4e26f44..89b05e2222c913ff47e12673e9850616f585dbf9 100644 (file)
@@ -6,6 +6,7 @@ TARGETS += firmware
 TARGETS += ftrace
 TARGETS += futex
 TARGETS += kcmp
+TARGETS += membarrier
 TARGETS += memfd
 TARGETS += memory-hotplug
 TARGETS += mount
@@ -23,6 +24,7 @@ TARGETS += user
 TARGETS += jumplabel
 TARGETS += vm
 TARGETS += x86
+TARGETS += zram
 #Please keep the TARGETS list alphabetically sorted
 # Run "make quicktest=1 run_tests" or
 # "make quicktest=1 kselftest from top level Makefile
@@ -72,7 +74,6 @@ ifdef INSTALL_PATH
        @# Ask all targets to install their files
        mkdir -p $(INSTALL_PATH)
        for TARGET in $(TARGETS); do \
-               mkdir -p $(INSTALL_PATH)/$$TARGET ; \
                make -C $$TARGET INSTALL_PATH=$(INSTALL_PATH)/$$TARGET install; \
        done;
 
index 1822356402090df03ee6311cb99dd7e2c7db94d7..d27108b4f2081fe2d250e4de8c45bf02f7c73d4f 100644 (file)
@@ -1,22 +1,12 @@
 # Taken from perf makefile
 uname_M := $(shell uname -m 2>/dev/null || echo not)
-ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/)
-ifeq ($(ARCH),i386)
-        ARCH := x86
-endif
-ifeq ($(ARCH),x86_64)
-       ARCH := x86
-endif
+ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/)
 
-
-all:
 ifeq ($(ARCH),x86)
-       gcc breakpoint_test.c -o breakpoint_test
-else
-       echo "Not an x86 target, can't build breakpoints selftests"
+TEST_PROGS := breakpoint_test
 endif
 
-TEST_PROGS := breakpoint_test
+all:
 
 include ../lib.mk
 
index ee412bab7ed4bd29c264af117398be85b1999c4d..97f1c6742066352c6fd0a32599ecff93b3896499 100644 (file)
@@ -12,11 +12,14 @@ run_tests: all
        $(RUN_TESTS)
 
 define INSTALL_RULE
-       mkdir -p $(INSTALL_PATH)
-       @for TEST_DIR in $(TEST_DIRS); do\
-               cp -r $$TEST_DIR $(INSTALL_PATH); \
-       done;
-       install -t $(INSTALL_PATH) $(TEST_PROGS) $(TEST_PROGS_EXTENDED) $(TEST_FILES)
+       @if [ "X$(TEST_PROGS)$(TEST_PROGS_EXTENDED)$(TEST_FILES)" != "X" ]; then                        \
+               mkdir -p $(INSTALL_PATH);                                                               \
+               for TEST_DIR in $(TEST_DIRS); do                                                        \
+                       cp -r $$TEST_DIR $(INSTALL_PATH);                                               \
+               done;                                                                                   \
+               echo "install -t $(INSTALL_PATH) $(TEST_PROGS) $(TEST_PROGS_EXTENDED) $(TEST_FILES)";   \
+               install -t $(INSTALL_PATH) $(TEST_PROGS) $(TEST_PROGS_EXTENDED) $(TEST_FILES);          \
+       fi
 endef
 
 install: all
diff --git a/tools/testing/selftests/membarrier/.gitignore b/tools/testing/selftests/membarrier/.gitignore
new file mode 100644 (file)
index 0000000..020c44f
--- /dev/null
@@ -0,0 +1 @@
+membarrier_test
diff --git a/tools/testing/selftests/membarrier/Makefile b/tools/testing/selftests/membarrier/Makefile
new file mode 100644 (file)
index 0000000..877a503
--- /dev/null
@@ -0,0 +1,11 @@
+CFLAGS += -g -I../../../../usr/include/
+
+all:
+       $(CC) $(CFLAGS) membarrier_test.c -o membarrier_test
+
+TEST_PROGS := membarrier_test
+
+include ../lib.mk
+
+clean:
+       $(RM) membarrier_test
diff --git a/tools/testing/selftests/membarrier/membarrier_test.c b/tools/testing/selftests/membarrier/membarrier_test.c
new file mode 100644 (file)
index 0000000..dde3125
--- /dev/null
@@ -0,0 +1,121 @@
+#define _GNU_SOURCE
+#define __EXPORTED_HEADERS__
+
+#include <linux/membarrier.h>
+#include <asm-generic/unistd.h>
+#include <sys/syscall.h>
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+
+#include "../kselftest.h"
+
+enum test_membarrier_status {
+       TEST_MEMBARRIER_PASS = 0,
+       TEST_MEMBARRIER_FAIL,
+       TEST_MEMBARRIER_SKIP,
+};
+
+static int sys_membarrier(int cmd, int flags)
+{
+       return syscall(__NR_membarrier, cmd, flags);
+}
+
+static enum test_membarrier_status test_membarrier_cmd_fail(void)
+{
+       int cmd = -1, flags = 0;
+
+       if (sys_membarrier(cmd, flags) != -1) {
+               printf("membarrier: Wrong command should fail but passed.\n");
+               return TEST_MEMBARRIER_FAIL;
+       }
+       return TEST_MEMBARRIER_PASS;
+}
+
+static enum test_membarrier_status test_membarrier_flags_fail(void)
+{
+       int cmd = MEMBARRIER_CMD_QUERY, flags = 1;
+
+       if (sys_membarrier(cmd, flags) != -1) {
+               printf("membarrier: Wrong flags should fail but passed.\n");
+               return TEST_MEMBARRIER_FAIL;
+       }
+       return TEST_MEMBARRIER_PASS;
+}
+
+static enum test_membarrier_status test_membarrier_success(void)
+{
+       int cmd = MEMBARRIER_CMD_SHARED, flags = 0;
+
+       if (sys_membarrier(cmd, flags) != 0) {
+               printf("membarrier: Executing MEMBARRIER_CMD_SHARED failed. %s.\n",
+                               strerror(errno));
+               return TEST_MEMBARRIER_FAIL;
+       }
+
+       printf("membarrier: MEMBARRIER_CMD_SHARED success.\n");
+       return TEST_MEMBARRIER_PASS;
+}
+
+static enum test_membarrier_status test_membarrier(void)
+{
+       enum test_membarrier_status status;
+
+       status = test_membarrier_cmd_fail();
+       if (status)
+               return status;
+       status = test_membarrier_flags_fail();
+       if (status)
+               return status;
+       status = test_membarrier_success();
+       if (status)
+               return status;
+       return TEST_MEMBARRIER_PASS;
+}
+
+static enum test_membarrier_status test_membarrier_query(void)
+{
+       int flags = 0, ret;
+
+       printf("membarrier MEMBARRIER_CMD_QUERY ");
+       ret = sys_membarrier(MEMBARRIER_CMD_QUERY, flags);
+       if (ret < 0) {
+               printf("failed. %s.\n", strerror(errno));
+               switch (errno) {
+               case ENOSYS:
+                       /*
+                        * It is valid to build a kernel with
+                        * CONFIG_MEMBARRIER=n. However, this skips the tests.
+                        */
+                       return TEST_MEMBARRIER_SKIP;
+               case EINVAL:
+               default:
+                       return TEST_MEMBARRIER_FAIL;
+               }
+       }
+       if (!(ret & MEMBARRIER_CMD_SHARED)) {
+               printf("command MEMBARRIER_CMD_SHARED is not supported.\n");
+               return TEST_MEMBARRIER_FAIL;
+       }
+       printf("syscall available.\n");
+       return TEST_MEMBARRIER_PASS;
+}
+
+int main(int argc, char **argv)
+{
+       switch (test_membarrier_query()) {
+       case TEST_MEMBARRIER_FAIL:
+               return ksft_exit_fail();
+       case TEST_MEMBARRIER_SKIP:
+               return ksft_exit_skip();
+       }
+       switch (test_membarrier()) {
+       case TEST_MEMBARRIER_FAIL:
+               return ksft_exit_fail();
+       case TEST_MEMBARRIER_SKIP:
+               return ksft_exit_skip();
+       }
+
+       printf("membarrier: tests done!\n");
+       return ksft_exit_pass();
+}
index 0d6854744b37307fe446e350219b6678fe215a49..d36fab7d8ebd90c1de5a6878171dca75ea518998 100644 (file)
@@ -4,7 +4,6 @@ CFLAGS = -Wall
 BINARIES = compaction_test
 BINARIES += hugepage-mmap
 BINARIES += hugepage-shm
-BINARIES += hugetlbfstest
 BINARIES += map_hugetlb
 BINARIES += thuge-gen
 BINARIES += transhuge-stress
diff --git a/tools/testing/selftests/vm/hugetlbfstest.c b/tools/testing/selftests/vm/hugetlbfstest.c
deleted file mode 100644 (file)
index 02e1072..0000000
+++ /dev/null
@@ -1,86 +0,0 @@
-#define _GNU_SOURCE
-#include <assert.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <unistd.h>
-
-typedef unsigned long long u64;
-
-static size_t length = 1 << 24;
-
-static u64 read_rss(void)
-{
-       char buf[4096], *s = buf;
-       int i, fd;
-       u64 rss;
-
-       fd = open("/proc/self/statm", O_RDONLY);
-       assert(fd > 2);
-       memset(buf, 0, sizeof(buf));
-       read(fd, buf, sizeof(buf) - 1);
-       for (i = 0; i < 1; i++)
-               s = strchr(s, ' ') + 1;
-       rss = strtoull(s, NULL, 10);
-       return rss << 12; /* assumes 4k pagesize */
-}
-
-static void do_mmap(int fd, int extra_flags, int unmap)
-{
-       int *p;
-       int flags = MAP_PRIVATE | MAP_POPULATE | extra_flags;
-       u64 before, after;
-       int ret;
-
-       before = read_rss();
-       p = mmap(NULL, length, PROT_READ | PROT_WRITE, flags, fd, 0);
-       assert(p != MAP_FAILED ||
-                       !"mmap returned an unexpected error");
-       after = read_rss();
-       assert(llabs(after - before - length) < 0x40000 ||
-                       !"rss didn't grow as expected");
-       if (!unmap)
-               return;
-       ret = munmap(p, length);
-       assert(!ret || !"munmap returned an unexpected error");
-       after = read_rss();
-       assert(llabs(after - before) < 0x40000 ||
-                       !"rss didn't shrink as expected");
-}
-
-static int open_file(const char *path)
-{
-       int fd, err;
-
-       unlink(path);
-       fd = open(path, O_CREAT | O_RDWR | O_TRUNC | O_EXCL
-                       | O_LARGEFILE | O_CLOEXEC, 0600);
-       assert(fd > 2);
-       unlink(path);
-       err = ftruncate(fd, length);
-       assert(!err);
-       return fd;
-}
-
-int main(void)
-{
-       int hugefd, fd;
-
-       fd = open_file("/dev/shm/hugetlbhog");
-       hugefd = open_file("/hugepages/hugetlbhog");
-
-       system("echo 100 > /proc/sys/vm/nr_hugepages");
-       do_mmap(-1, MAP_ANONYMOUS, 1);
-       do_mmap(fd, 0, 1);
-       do_mmap(-1, MAP_ANONYMOUS | MAP_HUGETLB, 1);
-       do_mmap(hugefd, 0, 1);
-       do_mmap(hugefd, MAP_HUGETLB, 1);
-       /* Leak the last one to test do_exit() */
-       do_mmap(-1, MAP_ANONYMOUS | MAP_HUGETLB, 0);
-       printf("oll korrekt.\n");
-       return 0;
-}
index 831adeb5fc552b3c889510c24308d744c734c267..9179ce8df485d7b125b7e3619f7d82ee4df5ea64 100755 (executable)
@@ -75,16 +75,9 @@ else
        echo "[PASS]"
 fi
 
-echo "--------------------"
-echo "running hugetlbfstest"
-echo "--------------------"
-./hugetlbfstest
-if [ $? -ne 0 ]; then
-       echo "[FAIL]"
-       exitcode=1
-else
-       echo "[PASS]"
-fi
+echo "NOTE: The above hugetlb tests provide minimal coverage.  Use"
+echo "      https://github.com/libhugetlbfs/libhugetlbfs.git for"
+echo "      hugetlb regression testing."
 
 echo "--------------------"
 echo "running userfaultfd"
index 0c0b8395335261b952d10465f8d9d8e874e3dcc2..2c7cca6f26a45c215b43b44a7f5c9dddceee301c 100644 (file)
@@ -69,7 +69,7 @@
 #ifdef __x86_64__
 #define __NR_userfaultfd 323
 #elif defined(__i386__)
-#define __NR_userfaultfd 359
+#define __NR_userfaultfd 374
 #elif defined(__powewrpc__)
 #define __NR_userfaultfd 364
 #else
@@ -147,7 +147,8 @@ static void *locking_thread(void *arg)
                        if (sizeof(page_nr) > sizeof(rand_nr)) {
                                if (random_r(&rand, &rand_nr))
                                        fprintf(stderr, "random_r 2 error\n"), exit(1);
-                               page_nr |= ((unsigned long) rand_nr) << 32;
+                               page_nr |= (((unsigned long) rand_nr) << 16) <<
+                                          16;
                        }
                } else
                        page_nr += 1;
@@ -290,7 +291,8 @@ static void *uffd_poll_thread(void *arg)
                                msg.event), exit(1);
                if (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
                        fprintf(stderr, "unexpected write fault\n"), exit(1);
-               offset = (char *)msg.arg.pagefault.address - area_dst;
+               offset = (char *)(unsigned long)msg.arg.pagefault.address -
+                        area_dst;
                offset &= ~(page_size-1);
                if (copy_page(offset))
                        userfaults++;
@@ -327,7 +329,8 @@ static void *uffd_read_thread(void *arg)
                if (bounces & BOUNCE_VERIFY &&
                    msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
                        fprintf(stderr, "unexpected write fault\n"), exit(1);
-               offset = (char *)msg.arg.pagefault.address - area_dst;
+               offset = (char *)(unsigned long)msg.arg.pagefault.address -
+                        area_dst;
                offset &= ~(page_size-1);
                if (copy_page(offset))
                        (*this_cpu_userfaults)++;
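Two 32-bit portability fixes in the userfaultfd test: msg.arg.pagefault.address is a 64-bit field, so it is narrowed through unsigned long before the pointer arithmetic, and a 32-bit unsigned long cannot be shifted by 32 in a single expression (the compiler warns and the shift is undefined), hence the split << 16 << 16. A small sketch of the shift issue, with illustrative values:

#include <stdio.h>

int main(void)
{
        unsigned long page_nr = 0x1234;
        unsigned int rand_nr = 0x5678;

        if (sizeof(page_nr) > sizeof(rand_nr)) {
                /*
                 * A direct << 32 would not even compile cleanly on 32-bit,
                 * where unsigned long is 32 bits wide, even though this
                 * branch is dead there; two 16-bit shifts are always valid
                 * and fold into one shift on 64-bit targets.
                 */
                page_nr |= ((unsigned long)rand_nr << 16) << 16;
        }
        printf("page_nr = %#lx\n", page_nr);
        return 0;
}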
index 9a43a59a9bb46b2944b20b26f1af6e8e405b5764..421c607a8856887d0f9f6048c2ab26bd9367efdd 100644 (file)
@@ -116,8 +116,9 @@ static bool do_test(struct vm86plus_struct *v86, unsigned long eip,
        v86->regs.eip = eip;
        ret = vm86(VM86_ENTER, v86);
 
-       if (ret == -1 && errno == ENOSYS) {
-               printf("[SKIP]\tvm86 not supported\n");
+       if (ret == -1 && (errno == ENOSYS || errno == EPERM)) {
+               printf("[SKIP]\tvm86 %s\n",
+                      errno == ENOSYS ? "not supported" : "not allowed");
                return false;
        }
 
diff --git a/tools/testing/selftests/zram/Makefile b/tools/testing/selftests/zram/Makefile
new file mode 100644 (file)
index 0000000..29d8034
--- /dev/null
@@ -0,0 +1,9 @@
+all:
+
+TEST_PROGS := zram.sh
+TEST_FILES := zram01.sh zram02.sh zram_lib.sh
+
+include ../lib.mk
+
+clean:
+       $(RM) err.log
diff --git a/tools/testing/selftests/zram/README b/tools/testing/selftests/zram/README
new file mode 100644 (file)
index 0000000..eb17917
--- /dev/null
@@ -0,0 +1,40 @@
+zram: Compressed RAM based block devices
+----------------------------------------
+* Introduction
+
+The zram module creates RAM based block devices named /dev/zram<id>
+(<id> = 0, 1, ...). Pages written to these disks are compressed and stored
+in memory itself. These disks allow very fast I/O and compression provides
+good amounts of memory savings. Some of the usecases include /tmp storage,
+use as swap disks, various caches under /var and maybe many more :)
+
+Statistics for individual zram devices are exported through sysfs nodes at
+/sys/block/zram<id>/
+
+Kconfig required:
+CONFIG_ZRAM=y
+CONFIG_ZRAM_LZ4_COMPRESS=y
+CONFIG_ZPOOL=y
+CONFIG_ZSMALLOC=y
+
+ZRAM Testcases
+--------------
+zram_lib.sh: create library with initialization/cleanup functions
+zram.sh: For sanity check of CONFIG_ZRAM and to run zram01 and zram02
+
+Two functional tests: zram01 and zram02:
+zram01.sh: creates general purpose ram disks with ext4 filesystems
+zram02.sh: creates block device for swap
+
+Commands required for testing:
+ - bc
+ - dd
+ - free
+ - awk
+ - mkswap
+ - swapon
+ - swapoff
+ - mkfs/ mkfs.ext4
+
+For more information please refer:
+kernel-source-tree/Documentation/blockdev/zram.txt
diff --git a/tools/testing/selftests/zram/zram.sh b/tools/testing/selftests/zram/zram.sh
new file mode 100755 (executable)
index 0000000..20de9a7
--- /dev/null
@@ -0,0 +1,35 @@
+#!/bin/bash
+TCID="zram.sh"
+
+check_prereqs()
+{
+       local msg="skip all tests:"
+
+       if [ $UID != 0 ]; then
+               echo $msg must be run as root >&2
+               exit 0
+       fi
+}
+
+run_zram () {
+echo "--------------------"
+echo "running zram tests"
+echo "--------------------"
+./zram01.sh
+echo ""
+./zram02.sh
+}
+
+check_prereqs
+
+# check zram module exists
+MODULE_PATH=/lib/modules/`uname -r`/kernel/drivers/block/zram/zram.ko
+if [ -f $MODULE_PATH ]; then
+       run_zram
+elif [ -b /dev/zram0 ]; then
+       run_zram
+else
+       echo "$TCID : No zram.ko module and no /dev/zram0 device file found"
+       echo "$TCID : CONFIG_ZRAM is not set"
+       exit 1
+fi
diff --git a/tools/testing/selftests/zram/zram01.sh b/tools/testing/selftests/zram/zram01.sh
new file mode 100755 (executable)
index 0000000..b9566a6
--- /dev/null
@@ -0,0 +1,99 @@
+#!/bin/bash
+# Copyright (c) 2015 Oracle and/or its affiliates. All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation; either version 2 of
+# the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# Test creates several zram devices with different filesystems on them.
+# It fills each device with zeros and checks that compression works.
+#
+# Author: Alexey Kodanev <alexey.kodanev@oracle.com>
+# Modified: Naresh Kamboju <naresh.kamboju@linaro.org>
+
+TCID="zram01"
+ERR_CODE=0
+
+. ./zram_lib.sh
+
+# Test will create the following number of zram devices:
+dev_num=1
+# This is a list of parameters for zram devices.
+# Number of items must be equal to 'dev_num' parameter.
+zram_max_streams="2"
+
+# The zram sysfs node 'disksize' value can be either in bytes,
+# or you can use mem suffixes. But in some old kernels, mem
+# suffixes are not supported, for example, in RHEL6.6GA's kernel
+# layer, it uses strict_strtoull() to parse disksize which does
+# not support mem suffixes, in some newer kernels, they use
+# memparse() which supports mem suffixes. So here we just use
+# bytes to make sure everything works correctly.
+zram_sizes="2097152" # 2MB
+zram_mem_limits="2M"
+zram_filesystems="ext4"
+zram_algs="lzo"
+
+zram_fill_fs()
+{
+       local mem_free0=$(free -m | awk 'NR==2 {print $4}')
+
+       for i in $(seq 0 $(($dev_num - 1))); do
+               echo "fill zram$i..."
+               local b=0
+               while [ true ]; do
+                       dd conv=notrunc if=/dev/zero of=zram${i}/file \
+                               oflag=append count=1 bs=1024 status=none \
+                               > /dev/null 2>&1 || break
+                       b=$(($b + 1))
+               done
+               echo "zram$i can be filled with '$b' KB"
+       done
+
+       local mem_free1=$(free -m | awk 'NR==2 {print $4}')
+       local used_mem=$(($mem_free0 - $mem_free1))
+
+       local total_size=0
+       for sm in $zram_sizes; do
+               local s=$(echo $sm | sed 's/M//')
+               total_size=$(($total_size + $s))
+       done
+
+       echo "zram used ${used_mem}M, zram disk sizes ${total_size}M"
+
+       local v=$((100 * $total_size / $used_mem))
+
+       if [ "$v" -lt 100 ]; then
+               echo "FAIL compression ratio: 0.$v:1"
+               ERR_CODE=-1
+               zram_cleanup
+               return
+       fi
+
+       echo "zram compression ratio: $(echo "scale=2; $v / 100 " | bc):1: OK"
+}
+
+check_prereqs
+zram_load
+zram_max_streams
+zram_compress_alg
+zram_set_disksizes
+zram_set_memlimit
+zram_makefs
+zram_mount
+
+zram_fill_fs
+zram_cleanup
+zram_unload
+
+if [ $ERR_CODE -ne 0 ]; then
+       echo "$TCID : [FAIL]"
+else
+       echo "$TCID : [PASS]"
+fi
diff --git a/tools/testing/selftests/zram/zram02.sh b/tools/testing/selftests/zram/zram02.sh
new file mode 100755 (executable)
index 0000000..74569b8
--- /dev/null
@@ -0,0 +1,54 @@
+#!/bin/bash
+# Copyright (c) 2015 Oracle and/or its affiliates. All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation; either version 2 of
+# the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# Test checks that we can create swap zram device.
+#
+# Author: Alexey Kodanev <alexey.kodanev@oracle.com>
+# Modified: Naresh Kamboju <naresh.kamboju@linaro.org>
+
+TCID="zram02"
+ERR_CODE=0
+
+. ./zram_lib.sh
+
+# Test will create the following number of zram devices:
+dev_num=1
+# This is a list of parameters for zram devices.
+# Number of items must be equal to 'dev_num' parameter.
+zram_max_streams="2"
+
+# The zram sysfs node 'disksize' accepts either a plain byte count
+# or a value with a memory suffix such as "1M". Some older kernels
+# (for example RHEL 6.6 GA) parse disksize with strict_strtoull(),
+# which does not understand suffixes, while newer kernels use
+# memparse(), which does. Plain bytes are used here so the test
+# works on both.
+zram_sizes="1048576" # 1M
+zram_mem_limits="1M"
+
+check_prereqs
+zram_load
+zram_max_streams
+zram_set_disksizes
+zram_set_memlimit
+zram_makeswap
+zram_swapoff
+zram_cleanup
+zram_unload
+
+if [ $ERR_CODE -ne 0 ]; then
+       echo "$TCID : [FAIL]"
+else
+       echo "$TCID : [PASS]"
+fi
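
zram02.sh drives the same sysfs interface that zram_lib.sh (next file) wraps. Done by hand, the swap path it exercises looks roughly like the sketch below; the device name, size and module handling are illustrative and assume zram is built as a module:

#!/bin/sh
# Create one zram device and give it a 1M disksize (plain bytes, as above).
modprobe zram num_devices=1
echo 1048576 > /sys/block/zram0/disksize
# Use it as swap, then undo everything.
mkswap /dev/zram0
swapon /dev/zram0
swapoff /dev/zram0
echo 1 > /sys/block/zram0/reset
rmmod zram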
diff --git a/tools/testing/selftests/zram/zram_lib.sh b/tools/testing/selftests/zram/zram_lib.sh
new file mode 100755 (executable)
index 0000000..424e68e
--- /dev/null
@@ -0,0 +1,232 @@
+#!/bin/sh
+# Copyright (c) 2015 Oracle and/or its affiliates. All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation; either version 2 of
+# the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# Author: Alexey Kodanev <alexey.kodanev@oracle.com>
+# Modified: Naresh Kamboju <naresh.kamboju@linaro.org>
+
+MODULE=0
+dev_makeswap=-1
+dev_mounted=-1
+
+trap INT
+
+check_prereqs()
+{
+       local msg="skip all tests:"
+
+       if [ "$(id -u)" -ne 0 ]; then
+               echo "$msg must be run as root" >&2
+               exit 0
+       fi
+}
+
+zram_cleanup()
+{
+       echo "zram cleanup"
+       local i=
+       for i in $(seq 0 $dev_makeswap); do
+               swapoff /dev/zram$i
+       done
+
+       for i in $(seq 0 $dev_mounted); do
+               umount /dev/zram$i
+       done
+
+       for i in $(seq 0 $(($dev_num - 1))); do
+               echo 1 > /sys/block/zram${i}/reset
+               rm -rf zram$i
+       done
+
+}
+
+zram_unload()
+{
+       if [ $MODULE -ne 0 ] ; then
+               echo "zram rmmod zram"
+               rmmod zram > /dev/null 2>&1
+       fi
+}
+
+zram_load()
+{
+       # check zram module exists
+       MODULE_PATH=/lib/modules/$(uname -r)/kernel/drivers/block/zram/zram.ko
+       if [ -f $MODULE_PATH ]; then
+               MODULE=1
+               echo "create '$dev_num' zram device(s)"
+               modprobe zram num_devices=$dev_num
+               if [ $? -ne 0 ]; then
+                       echo "failed to insert zram module"
+                       exit 1
+               fi
+
+               dev_num_created=$(ls /dev/zram* | wc -w)
+
+               if [ "$dev_num_created" -ne "$dev_num" ]; then
+                       echo "unexpected num of devices: $dev_num_created"
+                       ERR_CODE=-1
+               else
+                       echo "zram module loaded successfully"
+               fi
+       elif [ -b /dev/zram0 ]; then
+               echo "/dev/zram0 device file found: OK"
+       else
+               echo "ERROR: No zram.ko module or no /dev/zram0 device found"
+               echo "$TCID : CONFIG_ZRAM is not set"
+               exit 1
+       fi
+}
+
+zram_max_streams()
+{
+       echo "set max_comp_streams to zram device(s)"
+
+       local i=0
+       for max_s in $zram_max_streams; do
+               local sys_path="/sys/block/zram${i}/max_comp_streams"
+               echo $max_s > $sys_path || \
+                       echo "FAIL failed to set '$max_s' to $sys_path"
+               sleep 1
+               local max_streams=$(cat $sys_path)
+
+               [ "$max_s" -ne "$max_streams" ] && \
+                       echo "FAIL can't set max_streams '$max_s', got $max_streams"
+
+               i=$(($i + 1))
+               echo "$sys_path = '$max_streams' ($i/$dev_num)"
+       done
+
+       echo "zram max streams: OK"
+}
+
+zram_compress_alg()
+{
+       echo "test that we can set compression algorithm"
+
+       local algs=$(cat /sys/block/zram0/comp_algorithm)
+       echo "supported algs: $algs"
+       local i=0
+       for alg in $zram_algs; do
+               local sys_path="/sys/block/zram${i}/comp_algorithm"
+               echo "$alg" >   $sys_path || \
+                       echo "FAIL can't set '$alg' to $sys_path"
+               i=$(($i + 1))
+               echo "$sys_path = '$alg' ($i/$dev_num)"
+       done
+
+       echo "zram set compression algorithm: OK"
+}
+
+zram_set_disksizes()
+{
+       echo "set disk size to zram device(s)"
+       local i=0
+       for ds in $zram_sizes; do
+               local sys_path="/sys/block/zram${i}/disksize"
+               echo "$ds" >    $sys_path || \
+                       echo "FAIL can't set '$ds' to $sys_path"
+
+               i=$(($i + 1))
+               echo "$sys_path = '$ds' ($i/$dev_num)"
+       done
+
+       echo "zram set disksizes: OK"
+}
+
+zram_set_memlimit()
+{
+       echo "set memory limit to zram device(s)"
+
+       local i=0
+       for ds in $zram_mem_limits; do
+               local sys_path="/sys/block/zram${i}/mem_limit"
+               echo "$ds" >    $sys_path || \
+                       echo "FAIL can't set '$ds' to $sys_path"
+
+               i=$(($i + 1))
+               echo "$sys_path = '$ds' ($i/$dev_num)"
+       done
+
+       echo "zram set memory limit: OK"
+}
+
+zram_makeswap()
+{
+       echo "make swap with zram device(s)"
+       local i=0
+       for i in $(seq 0 $(($dev_num - 1))); do
+               mkswap /dev/zram$i > err.log 2>&1
+               if [ $? -ne 0 ]; then
+                       cat err.log
+                       echo "FAIL mkswap /dev/zram$i failed"
+               fi
+
+               swapon /dev/zram$i > err.log 2>&1
+               if [ $? -ne 0 ]; then
+                       cat err.log
+                       echo "FAIL swapon /dev/zram$i failed"
+               fi
+
+               echo "done with /dev/zram$i"
+               dev_makeswap=$i
+       done
+
+       echo "zram mkswap and swapon: OK"
+}
+
+zram_swapoff()
+{
+       local i=
+       for i in $(seq 0 $dev_makeswap); do
+               swapoff /dev/zram$i > err.log 2>&1
+               if [ $? -ne 0 ]; then
+                       cat err.log
+                       echo "FAIL swapoff /dev/zram$i failed"
+               fi
+       done
+       dev_makeswap=-1
+
+       echo "zram swapoff: OK"
+}
+
+zram_makefs()
+{
+       local i=0
+       for fs in $zram_filesystems; do
+               # if the requested fs is not supported, fall back to ext2
+               which mkfs.$fs > /dev/null 2>&1 || fs=ext2
+
+               echo "make $fs filesystem on /dev/zram$i"
+               mkfs.$fs /dev/zram$i > err.log 2>&1
+               if [ $? -ne 0 ]; then
+                       cat err.log
+                       echo "FAIL failed to make $fs on /dev/zram$i"
+               fi
+               i=$(($i + 1))
+               echo "zram mkfs.$fs: OK"
+       done
+}
+
+zram_mount()
+{
+       local i=0
+       for i in $(seq 0 $(($dev_num - 1))); do
+               echo "mount /dev/zram$i"
+               mkdir zram$i
+               mount /dev/zram$i zram$i > /dev/null || \
+                       echo "FAIL mount /dev/zram$i failed"
+               dev_mounted=$i
+       done
+
+       echo "zram mount of zram device(s): OK"
+}
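
For reference, the per-device attributes these helpers drive can also be poked directly; a rough interactive equivalent of zram_max_streams, zram_compress_alg, zram_set_disksizes and zram_set_memlimit on zram0 (values are examples, and attribute availability depends on the kernel's zram version):

#!/bin/sh
# Streams and algorithm must be chosen while the device is still
# unconfigured, i.e. before disksize is written.
echo 2   > /sys/block/zram0/max_comp_streams
echo lzo > /sys/block/zram0/comp_algorithm
echo 2M  > /sys/block/zram0/disksize     # or a plain byte count on older kernels
echo 2M  > /sys/block/zram0/mem_limit
cat /sys/block/zram0/comp_algorithm      # the active algorithm is shown in [brackets]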
index 8bdf16b8ba6047a2420998bde0adfd81489e117d..7f73fa32a590b5f5177affb4b2e6df93dc3e2f2f 100644 (file)
  * pagemap kernel ABI bits
  */
 
-#define PM_ENTRY_BYTES      sizeof(uint64_t)
-#define PM_STATUS_BITS      3
-#define PM_STATUS_OFFSET    (64 - PM_STATUS_BITS)
-#define PM_STATUS_MASK      (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET)
-#define PM_STATUS(nr)       (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK)
-#define PM_PSHIFT_BITS      6
-#define PM_PSHIFT_OFFSET    (PM_STATUS_OFFSET - PM_PSHIFT_BITS)
-#define PM_PSHIFT_MASK      (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
-#define __PM_PSHIFT(x)      (((uint64_t) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
-#define PM_PFRAME_MASK      ((1LL << PM_PSHIFT_OFFSET) - 1)
-#define PM_PFRAME(x)        ((x) & PM_PFRAME_MASK)
-
-#define __PM_SOFT_DIRTY      (1LL)
-#define PM_PRESENT          PM_STATUS(4LL)
-#define PM_SWAP             PM_STATUS(2LL)
-#define PM_SOFT_DIRTY       __PM_PSHIFT(__PM_SOFT_DIRTY)
-
+#define PM_ENTRY_BYTES         8
+#define PM_PFRAME_BITS         55
+#define PM_PFRAME_MASK         ((1LL << PM_PFRAME_BITS) - 1)
+#define PM_PFRAME(x)           ((x) & PM_PFRAME_MASK)
+#define PM_SOFT_DIRTY          (1ULL << 55)
+#define PM_MMAP_EXCLUSIVE      (1ULL << 56)
+#define PM_FILE                        (1ULL << 61)
+#define PM_SWAP                        (1ULL << 62)
+#define PM_PRESENT             (1ULL << 63)
 
 /*
  * kernel page flags
 #define KPF_SLOB_FREE          49
 #define KPF_SLUB_FROZEN                50
 #define KPF_SLUB_DEBUG         51
+#define KPF_FILE               62
+#define KPF_MMAP_EXCLUSIVE     63
 
 #define KPF_ALL_BITS           ((uint64_t)~0ULL)
 #define KPF_HACKERS_BITS       (0xffffULL << 32)
@@ -149,6 +143,9 @@ static const char * const page_flag_names[] = {
        [KPF_SLOB_FREE]         = "P:slob_free",
        [KPF_SLUB_FROZEN]       = "A:slub_frozen",
        [KPF_SLUB_DEBUG]        = "E:slub_debug",
+
+       [KPF_FILE]              = "F:file",
+       [KPF_MMAP_EXCLUSIVE]    = "1:mmap_exclusive",
 };
 
 
@@ -452,6 +449,10 @@ static uint64_t expand_overloaded_flags(uint64_t flags, uint64_t pme)
 
        if (pme & PM_SOFT_DIRTY)
                flags |= BIT(SOFTDIRTY);
+       if (pme & PM_FILE)
+               flags |= BIT(FILE);
+       if (pme & PM_MMAP_EXCLUSIVE)
+               flags |= BIT(MMAP_EXCLUSIVE);
 
        return flags;
 }
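
The new layout above can be spot-checked from userspace without page-types: each /proc/<pid>/pagemap entry is one 64-bit word per virtual page, with the PFN in bits 0-54 and the flag bits as defined here. A rough sketch of decoding a single entry for the current shell (the address is a placeholder, and recent kernels hide the PFN from unprivileged readers):

#!/bin/bash
vaddr=$((0x400000))                          # placeholder virtual address
page=$((vaddr / $(getconf PAGESIZE)))
# Pull out the 8-byte entry and print it as one hex word.
entry=0x$(dd if=/proc/$$/pagemap bs=8 skip=$page count=1 2>/dev/null | od -An -tx8 | tr -d ' ')
echo "present:     $(( (entry >> 63) & 1 ))"
echo "swap:        $(( (entry >> 62) & 1 ))"
echo "file/shared: $(( (entry >> 61) & 1 ))"
echo "exclusive:   $(( (entry >> 56) & 1 ))"
echo "soft-dirty:  $(( (entry >> 55) & 1 ))"
echo "pfn:         $(( entry & ((1 << 55) - 1) ))"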
index 98c95f2fcba4a63912fb81fbafd3854b08835e00..76e38d231e9959d085673b4eda7e4065f4a7fbb7 100644 (file)
@@ -64,10 +64,10 @@ static void kvm_timer_inject_irq(struct kvm_vcpu *vcpu)
        int ret;
        struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
 
-       timer->cntv_ctl |= ARCH_TIMER_CTRL_IT_MASK;
-       ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id,
-                                 timer->irq->irq,
-                                 timer->irq->level);
+       kvm_vgic_set_phys_irq_active(timer->map, true);
+       ret = kvm_vgic_inject_mapped_irq(vcpu->kvm, vcpu->vcpu_id,
+                                        timer->map,
+                                        timer->irq->level);
        WARN_ON(ret);
 }
 
@@ -117,7 +117,8 @@ bool kvm_timer_should_fire(struct kvm_vcpu *vcpu)
        cycle_t cval, now;
 
        if ((timer->cntv_ctl & ARCH_TIMER_CTRL_IT_MASK) ||
-               !(timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE))
+           !(timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE) ||
+           kvm_vgic_get_phys_irq_active(timer->map))
                return false;
 
        cval = timer->cntv_cval;
@@ -184,10 +185,11 @@ void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu)
        timer_arm(timer, ns);
 }
 
-void kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
-                         const struct kvm_irq_level *irq)
+int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
+                        const struct kvm_irq_level *irq)
 {
        struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+       struct irq_phys_map *map;
 
        /*
         * The vcpu timer irq number cannot be determined in
@@ -196,6 +198,17 @@ void kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
         * vcpu timer irq number when the vcpu is reset.
         */
        timer->irq = irq;
+
+       /*
+        * Tell the VGIC that the virtual interrupt is tied to a
+        * physical interrupt. We do that once per VCPU.
+        */
+       map = kvm_vgic_map_phys_irq(vcpu, irq->irq, host_vtimer_irq);
+       if (WARN_ON(IS_ERR(map)))
+               return PTR_ERR(map);
+
+       timer->map = map;
+       return 0;
 }
 
 void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu)
@@ -335,6 +348,8 @@ void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu)
        struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
 
        timer_disarm(timer);
+       if (timer->map)
+               kvm_vgic_unmap_phys_irq(vcpu, timer->map);
 }
 
 void kvm_timer_enable(struct kvm *kvm)
index f9b9c7c5137214cb56bcc0cf7455d2d712dc6848..8d7b04db8471877fc2d28df790c6316ab51f6399 100644 (file)
@@ -48,6 +48,10 @@ static struct vgic_lr vgic_v2_get_lr(const struct kvm_vcpu *vcpu, int lr)
                lr_desc.state |= LR_STATE_ACTIVE;
        if (val & GICH_LR_EOI)
                lr_desc.state |= LR_EOI_INT;
+       if (val & GICH_LR_HW) {
+               lr_desc.state |= LR_HW;
+               lr_desc.hwirq = (val & GICH_LR_PHYSID_CPUID) >> GICH_LR_PHYSID_CPUID_SHIFT;
+       }
 
        return lr_desc;
 }
@@ -55,7 +59,9 @@ static struct vgic_lr vgic_v2_get_lr(const struct kvm_vcpu *vcpu, int lr)
 static void vgic_v2_set_lr(struct kvm_vcpu *vcpu, int lr,
                           struct vgic_lr lr_desc)
 {
-       u32 lr_val = (lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT) | lr_desc.irq;
+       u32 lr_val;
+
+       lr_val = lr_desc.irq;
 
        if (lr_desc.state & LR_STATE_PENDING)
                lr_val |= GICH_LR_PENDING_BIT;
@@ -64,6 +70,14 @@ static void vgic_v2_set_lr(struct kvm_vcpu *vcpu, int lr,
        if (lr_desc.state & LR_EOI_INT)
                lr_val |= GICH_LR_EOI;
 
+       if (lr_desc.state & LR_HW) {
+               lr_val |= GICH_LR_HW;
+               lr_val |= (u32)lr_desc.hwirq << GICH_LR_PHYSID_CPUID_SHIFT;
+       }
+
+       if (lr_desc.irq < VGIC_NR_SGIS)
+               lr_val |= (lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT);
+
        vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = lr_val;
 }
 
index dff06021e74855a2d6cb9b8830fd30818a63c927..afbf925b00f4ff079ea28925174e1998e54c44d1 100644 (file)
@@ -67,6 +67,10 @@ static struct vgic_lr vgic_v3_get_lr(const struct kvm_vcpu *vcpu, int lr)
                lr_desc.state |= LR_STATE_ACTIVE;
        if (val & ICH_LR_EOI)
                lr_desc.state |= LR_EOI_INT;
+       if (val & ICH_LR_HW) {
+               lr_desc.state |= LR_HW;
+               lr_desc.hwirq = (val >> ICH_LR_PHYS_ID_SHIFT) & GENMASK(9, 0);
+       }
 
        return lr_desc;
 }
@@ -84,10 +88,17 @@ static void vgic_v3_set_lr(struct kvm_vcpu *vcpu, int lr,
         * Eventually we want to make this configurable, so we may revisit
         * this in the future.
         */
-       if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)
+       switch (vcpu->kvm->arch.vgic.vgic_model) {
+       case KVM_DEV_TYPE_ARM_VGIC_V3:
                lr_val |= ICH_LR_GROUP;
-       else
-               lr_val |= (u32)lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT;
+               break;
+       case  KVM_DEV_TYPE_ARM_VGIC_V2:
+               if (lr_desc.irq < VGIC_NR_SGIS)
+                       lr_val |= (u32)lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT;
+               break;
+       default:
+               BUG();
+       }
 
        if (lr_desc.state & LR_STATE_PENDING)
                lr_val |= ICH_LR_PENDING_BIT;
@@ -95,6 +106,10 @@ static void vgic_v3_set_lr(struct kvm_vcpu *vcpu, int lr,
                lr_val |= ICH_LR_ACTIVE_BIT;
        if (lr_desc.state & LR_EOI_INT)
                lr_val |= ICH_LR_EOI;
+       if (lr_desc.state & LR_HW) {
+               lr_val |= ICH_LR_HW;
+               lr_val |= ((u64)lr_desc.hwirq) << ICH_LR_PHYS_ID_SHIFT;
+       }
 
        vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[LR_INDEX(lr)] = lr_val;
 }
index bc40137a022d51e451913f3011639e196ddd135b..9eb489a2c94c2b5ef07146a01ec9ac943034065c 100644 (file)
@@ -24,6 +24,7 @@
 #include <linux/of.h>
 #include <linux/of_address.h>
 #include <linux/of_irq.h>
+#include <linux/rculist.h>
 #include <linux/uaccess.h>
 
 #include <asm/kvm_emulate.h>
  *   cause the interrupt to become inactive in such a situation.
  *   Conversely, writes to GICD_ICPENDRn do not cause the interrupt to become
  *   inactive as long as the external input line is held high.
+ *
+ *
+ * Initialization rules: there are multiple stages to the vgic
+ * initialization, both for the distributor and the CPU interfaces.
+ *
+ * Distributor:
+ *
+ * - kvm_vgic_early_init(): initialization of static data that doesn't
+ *   depend on any sizing information or emulation type. No allocation
+ *   is allowed there.
+ *
+ * - vgic_init(): allocation and initialization of the generic data
+ *   structures that depend on sizing information (number of CPUs,
+ *   number of interrupts). Also initializes the vcpu specific data
+ *   structures. Can be executed lazily for GICv2.
+ *   [to be renamed to kvm_vgic_init??]
+ *
+ * CPU Interface:
+ *
+ * - kvm_vgic_cpu_early_init(): initialization of static data that
+ *   doesn't depend on any sizing information or emulation type. No
+ *   allocation is allowed there.
  */
 
 #include "vgic.h"
@@ -82,6 +105,8 @@ static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu);
 static void vgic_retire_lr(int lr_nr, int irq, struct kvm_vcpu *vcpu);
 static struct vgic_lr vgic_get_lr(const struct kvm_vcpu *vcpu, int lr);
 static void vgic_set_lr(struct kvm_vcpu *vcpu, int lr, struct vgic_lr lr_desc);
+static struct irq_phys_map *vgic_irq_map_search(struct kvm_vcpu *vcpu,
+                                               int virt_irq);
 
 static const struct vgic_ops *vgic_ops;
 static const struct vgic_params *vgic;
@@ -375,7 +400,7 @@ void vgic_cpu_irq_clear(struct kvm_vcpu *vcpu, int irq)
 
 static bool vgic_can_sample_irq(struct kvm_vcpu *vcpu, int irq)
 {
-       return vgic_irq_is_edge(vcpu, irq) || !vgic_irq_is_queued(vcpu, irq);
+       return !vgic_irq_is_queued(vcpu, irq);
 }
 
 /**
@@ -1115,6 +1140,39 @@ static void vgic_queue_irq_to_lr(struct kvm_vcpu *vcpu, int irq,
        if (!vgic_irq_is_edge(vcpu, irq))
                vlr.state |= LR_EOI_INT;
 
+       if (vlr.irq >= VGIC_NR_SGIS) {
+               struct irq_phys_map *map;
+               map = vgic_irq_map_search(vcpu, irq);
+
+               /*
+                * If we have a mapping, and the virtual interrupt is
+                * being injected, then we must set the state to
+                * active in the physical world. Otherwise the
+                * physical interrupt will fire and the guest will
+                * exit before processing the virtual interrupt.
+                */
+               if (map) {
+                       int ret;
+
+                       BUG_ON(!map->active);
+                       vlr.hwirq = map->phys_irq;
+                       vlr.state |= LR_HW;
+                       vlr.state &= ~LR_EOI_INT;
+
+                       ret = irq_set_irqchip_state(map->irq,
+                                                   IRQCHIP_STATE_ACTIVE,
+                                                   true);
+                       WARN_ON(ret);
+
+                       /*
+                        * Make sure we're not going to sample this
+                        * again, as a HW-backed interrupt cannot be
+                        * in the PENDING_ACTIVE stage.
+                        */
+                       vgic_irq_set_queued(vcpu, irq);
+               }
+       }
+
        vgic_set_lr(vcpu, lr_nr, vlr);
        vgic_sync_lr_elrsr(vcpu, lr_nr, vlr);
 }
@@ -1339,6 +1397,39 @@ static bool vgic_process_maintenance(struct kvm_vcpu *vcpu)
        return level_pending;
 }
 
+/*
+ * Save the physical active state, and reset it to inactive.
+ *
+ * Return 1 if the HW interrupt went from active to inactive, and 0 otherwise.
+ */
+static int vgic_sync_hwirq(struct kvm_vcpu *vcpu, struct vgic_lr vlr)
+{
+       struct irq_phys_map *map;
+       int ret;
+
+       if (!(vlr.state & LR_HW))
+               return 0;
+
+       map = vgic_irq_map_search(vcpu, vlr.irq);
+       BUG_ON(!map || !map->active);
+
+       ret = irq_get_irqchip_state(map->irq,
+                                   IRQCHIP_STATE_ACTIVE,
+                                   &map->active);
+
+       WARN_ON(ret);
+
+       if (map->active) {
+               ret = irq_set_irqchip_state(map->irq,
+                                           IRQCHIP_STATE_ACTIVE,
+                                           false);
+               WARN_ON(ret);
+               return 0;
+       }
+
+       return 1;
+}
+
 /* Sync back the VGIC state after a guest run */
 static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
 {
@@ -1353,14 +1444,31 @@ static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
        elrsr = vgic_get_elrsr(vcpu);
        elrsr_ptr = u64_to_bitmask(&elrsr);
 
-       /* Clear mappings for empty LRs */
-       for_each_set_bit(lr, elrsr_ptr, vgic->nr_lr) {
+       /* Deal with HW interrupts, and clear mappings for empty LRs */
+       for (lr = 0; lr < vgic->nr_lr; lr++) {
                struct vgic_lr vlr;
 
-               if (!test_and_clear_bit(lr, vgic_cpu->lr_used))
+               if (!test_bit(lr, vgic_cpu->lr_used))
                        continue;
 
                vlr = vgic_get_lr(vcpu, lr);
+               if (vgic_sync_hwirq(vcpu, vlr)) {
+                       /*
+                        * So this is a HW interrupt that the guest
+                        * EOI-ed. Clean the LR state and allow the
+                        * interrupt to be sampled again.
+                        */
+                       vlr.state = 0;
+                       vlr.hwirq = 0;
+                       vgic_set_lr(vcpu, lr, vlr);
+                       vgic_irq_clear_queued(vcpu, vlr.irq);
+                       set_bit(lr, elrsr_ptr);
+               }
+
+               if (!test_bit(lr, elrsr_ptr))
+                       continue;
+
+               clear_bit(lr, vgic_cpu->lr_used);
 
                BUG_ON(vlr.irq >= dist->nr_irqs);
                vgic_cpu->vgic_irq_lr_map[vlr.irq] = LR_EMPTY;
@@ -1447,7 +1555,8 @@ static int vgic_validate_injection(struct kvm_vcpu *vcpu, int irq, int level)
 }
 
 static int vgic_update_irq_pending(struct kvm *kvm, int cpuid,
-                                 unsigned int irq_num, bool level)
+                                  struct irq_phys_map *map,
+                                  unsigned int irq_num, bool level)
 {
        struct vgic_dist *dist = &kvm->arch.vgic;
        struct kvm_vcpu *vcpu;
@@ -1455,6 +1564,9 @@ static int vgic_update_irq_pending(struct kvm *kvm, int cpuid,
        int enabled;
        bool ret = true, can_inject = true;
 
+       if (irq_num >= min(kvm->arch.vgic.nr_irqs, 1020))
+               return -EINVAL;
+
        spin_lock(&dist->lock);
 
        vcpu = kvm_get_vcpu(kvm, cpuid);
@@ -1517,18 +1629,46 @@ static int vgic_update_irq_pending(struct kvm *kvm, int cpuid,
 out:
        spin_unlock(&dist->lock);
 
-       return ret ? cpuid : -EINVAL;
+       if (ret) {
+               /* kick the specified vcpu */
+               kvm_vcpu_kick(kvm_get_vcpu(kvm, cpuid));
+       }
+
+       return 0;
+}
+
+static int vgic_lazy_init(struct kvm *kvm)
+{
+       int ret = 0;
+
+       if (unlikely(!vgic_initialized(kvm))) {
+               /*
+                * We only provide the automatic initialization of the VGIC
+                * for the legacy case of a GICv2. Any other type must
+                * be explicitly initialized once set up with the respective
+                * KVM device call.
+                */
+               if (kvm->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V2)
+                       return -EBUSY;
+
+               mutex_lock(&kvm->lock);
+               ret = vgic_init(kvm);
+               mutex_unlock(&kvm->lock);
+       }
+
+       return ret;
 }
 
 /**
  * kvm_vgic_inject_irq - Inject an IRQ from a device to the vgic
  * @kvm:     The VM structure pointer
  * @cpuid:   The CPU for PPIs
- * @irq_num: The IRQ number that is assigned to the device
+ * @irq_num: The IRQ number that is assigned to the device. This IRQ
+ *           must not be mapped to a HW interrupt.
  * @level:   Edge-triggered:  true:  to trigger the interrupt
  *                           false: to ignore the call
- *          Level-sensitive  true:  activates an interrupt
- *                           false: deactivates an interrupt
+ *          Level-sensitive  true:  raise the input signal
+ *                           false: lower the input signal
  *
  * The GIC is not concerned with devices being active-LOW or active-HIGH for
  * level-sensitive interrupts.  You can think of the level parameter as 1
@@ -1537,39 +1677,44 @@ out:
 int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num,
                        bool level)
 {
-       int ret = 0;
-       int vcpu_id;
-
-       if (unlikely(!vgic_initialized(kvm))) {
-               /*
-                * We only provide the automatic initialization of the VGIC
-                * for the legacy case of a GICv2. Any other type must
-                * be explicitly initialized once setup with the respective
-                * KVM device call.
-                */
-               if (kvm->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V2) {
-                       ret = -EBUSY;
-                       goto out;
-               }
-               mutex_lock(&kvm->lock);
-               ret = vgic_init(kvm);
-               mutex_unlock(&kvm->lock);
+       struct irq_phys_map *map;
+       int ret;
 
-               if (ret)
-                       goto out;
-       }
+       ret = vgic_lazy_init(kvm);
+       if (ret)
+               return ret;
 
-       if (irq_num >= min(kvm->arch.vgic.nr_irqs, 1020))
+       map = vgic_irq_map_search(kvm_get_vcpu(kvm, cpuid), irq_num);
+       if (map)
                return -EINVAL;
 
-       vcpu_id = vgic_update_irq_pending(kvm, cpuid, irq_num, level);
-       if (vcpu_id >= 0) {
-               /* kick the specified vcpu */
-               kvm_vcpu_kick(kvm_get_vcpu(kvm, vcpu_id));
-       }
+       return vgic_update_irq_pending(kvm, cpuid, NULL, irq_num, level);
+}
 
-out:
-       return ret;
+/**
+ * kvm_vgic_inject_mapped_irq - Inject a physically mapped IRQ to the vgic
+ * @kvm:     The VM structure pointer
+ * @cpuid:   The CPU for PPIs
+ * @map:     Pointer to an irq_phys_map structure describing the mapping
+ * @level:   Edge-triggered:  true:  to trigger the interrupt
+ *                           false: to ignore the call
+ *          Level-sensitive  true:  raise the input signal
+ *                           false: lower the input signal
+ *
+ * The GIC is not concerned with devices being active-LOW or active-HIGH for
+ * level-sensitive interrupts.  You can think of the level parameter as 1
+ * being HIGH and 0 being LOW and all devices being active-HIGH.
+ */
+int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid,
+                              struct irq_phys_map *map, bool level)
+{
+       int ret;
+
+       ret = vgic_lazy_init(kvm);
+       if (ret)
+               return ret;
+
+       return vgic_update_irq_pending(kvm, cpuid, map, map->virt_irq, level);
 }
 
 static irqreturn_t vgic_maintenance_handler(int irq, void *data)
@@ -1583,6 +1728,188 @@ static irqreturn_t vgic_maintenance_handler(int irq, void *data)
        return IRQ_HANDLED;
 }
 
+static struct list_head *vgic_get_irq_phys_map_list(struct kvm_vcpu *vcpu,
+                                                   int virt_irq)
+{
+       if (virt_irq < VGIC_NR_PRIVATE_IRQS)
+               return &vcpu->arch.vgic_cpu.irq_phys_map_list;
+       else
+               return &vcpu->kvm->arch.vgic.irq_phys_map_list;
+}
+
+/**
+ * kvm_vgic_map_phys_irq - map a virtual IRQ to a physical IRQ
+ * @vcpu: The VCPU pointer
+ * @virt_irq: The virtual irq number
+ * @irq: The Linux IRQ number
+ *
+ * Establish a mapping between a guest visible irq (@virt_irq) and a
+ * Linux irq (@irq). On injection, @virt_irq will be associated with
+ * the physical interrupt represented by @irq. This mapping can be
+ * established multiple times as long as the parameters are the same.
+ *
+ * Returns a valid pointer on success, and an error pointer otherwise
+ */
+struct irq_phys_map *kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu,
+                                          int virt_irq, int irq)
+{
+       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+       struct list_head *root = vgic_get_irq_phys_map_list(vcpu, virt_irq);
+       struct irq_phys_map *map;
+       struct irq_phys_map_entry *entry;
+       struct irq_desc *desc;
+       struct irq_data *data;
+       int phys_irq;
+
+       desc = irq_to_desc(irq);
+       if (!desc) {
+               kvm_err("%s: no interrupt descriptor\n", __func__);
+               return ERR_PTR(-EINVAL);
+       }
+
+       data = irq_desc_get_irq_data(desc);
+       while (data->parent_data)
+               data = data->parent_data;
+
+       phys_irq = data->hwirq;
+
+       /* Create a new mapping */
+       entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+       if (!entry)
+               return ERR_PTR(-ENOMEM);
+
+       spin_lock(&dist->irq_phys_map_lock);
+
+       /* Try to match an existing mapping */
+       map = vgic_irq_map_search(vcpu, virt_irq);
+       if (map) {
+               /* Make sure this mapping matches */
+               if (map->phys_irq != phys_irq   ||
+                   map->irq      != irq)
+                       map = ERR_PTR(-EINVAL);
+
+               /* Found an existing, valid mapping */
+               goto out;
+       }
+
+       map           = &entry->map;
+       map->virt_irq = virt_irq;
+       map->phys_irq = phys_irq;
+       map->irq      = irq;
+
+       list_add_tail_rcu(&entry->entry, root);
+
+out:
+       spin_unlock(&dist->irq_phys_map_lock);
+       /* If we've found a hit in the existing list, free the useless
+        * entry */
+       if (IS_ERR(map) || map != &entry->map)
+               kfree(entry);
+       return map;
+}
+
+static struct irq_phys_map *vgic_irq_map_search(struct kvm_vcpu *vcpu,
+                                               int virt_irq)
+{
+       struct list_head *root = vgic_get_irq_phys_map_list(vcpu, virt_irq);
+       struct irq_phys_map_entry *entry;
+       struct irq_phys_map *map;
+
+       rcu_read_lock();
+
+       list_for_each_entry_rcu(entry, root, entry) {
+               map = &entry->map;
+               if (map->virt_irq == virt_irq) {
+                       rcu_read_unlock();
+                       return map;
+               }
+       }
+
+       rcu_read_unlock();
+
+       return NULL;
+}
+
+static void vgic_free_phys_irq_map_rcu(struct rcu_head *rcu)
+{
+       struct irq_phys_map_entry *entry;
+
+       entry = container_of(rcu, struct irq_phys_map_entry, rcu);
+       kfree(entry);
+}
+
+/**
+ * kvm_vgic_get_phys_irq_active - Return the active state of a mapped IRQ
+ *
+ * Return the logical active state of a mapped interrupt. This doesn't
+ * necessarily reflect the current HW state.
+ */
+bool kvm_vgic_get_phys_irq_active(struct irq_phys_map *map)
+{
+       BUG_ON(!map);
+       return map->active;
+}
+
+/**
+ * kvm_vgic_set_phys_irq_active - Set the active state of a mapped IRQ
+ *
+ * Set the logical active state of a mapped interrupt. This doesn't
+ * immediately affect the HW state.
+ */
+void kvm_vgic_set_phys_irq_active(struct irq_phys_map *map, bool active)
+{
+       BUG_ON(!map);
+       map->active = active;
+}
+
+/**
+ * kvm_vgic_unmap_phys_irq - Remove a virtual to physical IRQ mapping
+ * @vcpu: The VCPU pointer
+ * @map: The pointer to a mapping obtained through kvm_vgic_map_phys_irq
+ *
+ * Remove an existing mapping between virtual and physical interrupts.
+ */
+int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, struct irq_phys_map *map)
+{
+       struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+       struct irq_phys_map_entry *entry;
+       struct list_head *root;
+
+       if (!map)
+               return -EINVAL;
+
+       root = vgic_get_irq_phys_map_list(vcpu, map->virt_irq);
+
+       spin_lock(&dist->irq_phys_map_lock);
+
+       list_for_each_entry(entry, root, entry) {
+               if (&entry->map == map) {
+                       list_del_rcu(&entry->entry);
+                       call_rcu(&entry->rcu, vgic_free_phys_irq_map_rcu);
+                       break;
+               }
+       }
+
+       spin_unlock(&dist->irq_phys_map_lock);
+
+       return 0;
+}
+
+static void vgic_destroy_irq_phys_map(struct kvm *kvm, struct list_head *root)
+{
+       struct vgic_dist *dist = &kvm->arch.vgic;
+       struct irq_phys_map_entry *entry;
+
+       spin_lock(&dist->irq_phys_map_lock);
+
+       list_for_each_entry(entry, root, entry) {
+               list_del_rcu(&entry->entry);
+               call_rcu(&entry->rcu, vgic_free_phys_irq_map_rcu);
+       }
+
+       spin_unlock(&dist->irq_phys_map_lock);
+}
+
 void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
        struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
@@ -1591,6 +1918,7 @@ void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
        kfree(vgic_cpu->active_shared);
        kfree(vgic_cpu->pend_act_shared);
        kfree(vgic_cpu->vgic_irq_lr_map);
+       vgic_destroy_irq_phys_map(vcpu->kvm, &vgic_cpu->irq_phys_map_list);
        vgic_cpu->pending_shared = NULL;
        vgic_cpu->active_shared = NULL;
        vgic_cpu->pend_act_shared = NULL;
@@ -1627,6 +1955,17 @@ static int vgic_vcpu_init_maps(struct kvm_vcpu *vcpu, int nr_irqs)
        return 0;
 }
 
+/**
+ * kvm_vgic_vcpu_early_init - Earliest possible per-vcpu vgic init stage
+ *
+ * No memory allocation should be performed here, only static init.
+ */
+void kvm_vgic_vcpu_early_init(struct kvm_vcpu *vcpu)
+{
+       struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+       INIT_LIST_HEAD(&vgic_cpu->irq_phys_map_list);
+}
+
 /**
  * kvm_vgic_get_max_vcpus - Get the maximum number of VCPUs allowed by HW
  *
@@ -1664,6 +2003,7 @@ void kvm_vgic_destroy(struct kvm *kvm)
        kfree(dist->irq_spi_target);
        kfree(dist->irq_pending_on_cpu);
        kfree(dist->irq_active_on_cpu);
+       vgic_destroy_irq_phys_map(kvm, &dist->irq_phys_map_list);
        dist->irq_sgi_sources = NULL;
        dist->irq_spi_cpu = NULL;
        dist->irq_spi_target = NULL;
@@ -1787,6 +2127,18 @@ static int init_vgic_model(struct kvm *kvm, int type)
        return 0;
 }
 
+/**
+ * kvm_vgic_early_init - Earliest possible vgic initialization stage
+ *
+ * No memory allocation should be performed here, only static init.
+ */
+void kvm_vgic_early_init(struct kvm *kvm)
+{
+       spin_lock_init(&kvm->arch.vgic.lock);
+       spin_lock_init(&kvm->arch.vgic.irq_phys_map_lock);
+       INIT_LIST_HEAD(&kvm->arch.vgic.irq_phys_map_list);
+}
+
 int kvm_vgic_create(struct kvm *kvm, u32 type)
 {
        int i, vcpu_lock_idx = -1, ret;
@@ -1832,7 +2184,6 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
        if (ret)
                goto out_unlock;
 
-       spin_lock_init(&kvm->arch.vgic.lock);
        kvm->arch.vgic.in_kernel = true;
        kvm->arch.vgic.vgic_model = type;
        kvm->arch.vgic.vctrl_base = vgic->vctrl_base;
index 21c14244f4c4fd1c3c8ffade7f3265df91c89efb..d7ea8e20dae4ee9b353441e5e342a6279d1485d6 100644 (file)
@@ -213,11 +213,15 @@ int kvm_set_irq_routing(struct kvm *kvm,
                        goto out;
 
                r = -EINVAL;
-               if (ue->flags)
+               if (ue->flags) {
+                       kfree(e);
                        goto out;
+               }
                r = setup_routing_entry(new, e, ue);
-               if (r)
+               if (r) {
+                       kfree(e);
                        goto out;
+               }
                ++ue;
        }
 
index d8db2f8fce9c7ab727fb5dfa957d5f7ba828fa42..a25a73147f714458dd6c55fe7426649f9dd5baa2 100644 (file)
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
-static unsigned int halt_poll_ns;
+/* halt polling only reduces halt latency by 5-7 us, 500us is enough */
+static unsigned int halt_poll_ns = 500000;
 module_param(halt_poll_ns, uint, S_IRUGO | S_IWUSR);
 
+/* Default doubles per-vcpu halt_poll_ns. */
+static unsigned int halt_poll_ns_grow = 2;
+module_param(halt_poll_ns_grow, uint, S_IRUGO);
+
+/* Default of 0 resets per-vcpu halt_poll_ns on a long halt. */
+static unsigned int halt_poll_ns_shrink;
+module_param(halt_poll_ns_shrink, uint, S_IRUGO);
+
 /*
  * Ordering of locks:
  *
@@ -217,6 +226,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
        vcpu->kvm = kvm;
        vcpu->vcpu_id = id;
        vcpu->pid = NULL;
+       vcpu->halt_poll_ns = 0;
        init_waitqueue_head(&vcpu->wq);
        kvm_async_pf_vcpu_init(vcpu);
 
@@ -387,6 +397,36 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
        return young;
 }
 
+static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
+                                       struct mm_struct *mm,
+                                       unsigned long start,
+                                       unsigned long end)
+{
+       struct kvm *kvm = mmu_notifier_to_kvm(mn);
+       int young, idx;
+
+       idx = srcu_read_lock(&kvm->srcu);
+       spin_lock(&kvm->mmu_lock);
+       /*
+        * Even though we do not flush TLB, this will still adversely
+        * affect performance on pre-Haswell Intel EPT, where there is
+        * no EPT Access Bit to clear so that we have to tear down EPT
+        * tables instead. If we find this unacceptable, we can always
+        * add a parameter to kvm_age_hva so that it effectively doesn't
+        * do anything on clear_young.
+        *
+        * Also note that currently we never issue secondary TLB flushes
+        * from clear_young, leaving this job up to the regular system
+        * cadence. If we find this inaccurate, we might come up with a
+        * more sophisticated heuristic later.
+        */
+       young = kvm_age_hva(kvm, start, end);
+       spin_unlock(&kvm->mmu_lock);
+       srcu_read_unlock(&kvm->srcu, idx);
+
+       return young;
+}
+
 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
                                       struct mm_struct *mm,
                                       unsigned long address)
@@ -419,6 +459,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
        .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
        .invalidate_range_end   = kvm_mmu_notifier_invalidate_range_end,
        .clear_flush_young      = kvm_mmu_notifier_clear_flush_young,
+       .clear_young            = kvm_mmu_notifier_clear_young,
        .test_young             = kvm_mmu_notifier_test_young,
        .change_pte             = kvm_mmu_notifier_change_pte,
        .release                = kvm_mmu_notifier_release,
@@ -1906,6 +1947,35 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
 
+static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
+{
+       int old, val;
+
+       old = val = vcpu->halt_poll_ns;
+       /* 10us base */
+       if (val == 0 && halt_poll_ns_grow)
+               val = 10000;
+       else
+               val *= halt_poll_ns_grow;
+
+       vcpu->halt_poll_ns = val;
+       trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
+}
+
+static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
+{
+       int old, val;
+
+       old = val = vcpu->halt_poll_ns;
+       if (halt_poll_ns_shrink == 0)
+               val = 0;
+       else
+               val /= halt_poll_ns_shrink;
+
+       vcpu->halt_poll_ns = val;
+       trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
+}
+
 static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
 {
        if (kvm_arch_vcpu_runnable(vcpu)) {
@@ -1928,10 +1998,11 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
        ktime_t start, cur;
        DEFINE_WAIT(wait);
        bool waited = false;
+       u64 block_ns;
 
        start = cur = ktime_get();
-       if (halt_poll_ns) {
-               ktime_t stop = ktime_add_ns(ktime_get(), halt_poll_ns);
+       if (vcpu->halt_poll_ns) {
+               ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns);
 
                do {
                        /*
@@ -1960,7 +2031,21 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
        cur = ktime_get();
 
 out:
-       trace_kvm_vcpu_wakeup(ktime_to_ns(cur) - ktime_to_ns(start), waited);
+       block_ns = ktime_to_ns(cur) - ktime_to_ns(start);
+
+       if (halt_poll_ns) {
+               if (block_ns <= vcpu->halt_poll_ns)
+                       ;
+               /* we had a long block, shrink polling */
+               else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns)
+                       shrink_halt_poll_ns(vcpu);
+               /* we had a short halt and our poll time is too small */
+               else if (vcpu->halt_poll_ns < halt_poll_ns &&
+                       block_ns < halt_poll_ns)
+                       grow_halt_poll_ns(vcpu);
+       }
+
+       trace_kvm_vcpu_wakeup(block_ns, waited);
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_block);
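
With the defaults above, a vcpu's polling window grows 0 -> 10us -> 20us -> ... -> 640us after short halts (doubling via halt_poll_ns_grow, starting from the 10us floor) and, because halt_poll_ns_shrink defaults to 0, any halt longer than halt_poll_ns resets it to 0. The module parameters appear under /sys/module/kvm/parameters/, and only halt_poll_ns is declared writable (S_IRUGO | S_IWUSR), so the cap can be inspected or retuned at runtime roughly as follows (assuming KVM is loaded as the kvm module):

#!/bin/sh
# Show the three halt-polling knobs and their current values.
grep . /sys/module/kvm/parameters/halt_poll_ns*
# Disable halt polling entirely ...
echo 0 > /sys/module/kvm/parameters/halt_poll_ns
# ... or raise the cap to 1ms.
echo 1000000 > /sys/module/kvm/parameters/halt_poll_ns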